Manipulate string in column of a dataframe - r

I have a data frame
a = data.frame("a" = c("aaa|abbb", "bbb|aaa", "bbb|aaa|ccc"), "b" = c(1,2,3))
a b
aaa|abbb 1
bbb|aaa 2
bbb|aaa|ccc 3
I want to split the column value by "|", sort the pieces, and merge them back together so the result looks like this
a b
aaa|abbb 1
aaa|bbb 2
aaa|bbb|ccc 3
I tried to use following
paste(sort(ignore.case(unlist(strsplit(as.character(a$a), "\\|")))),collapse = ", ")
but that just combines everything together. How can I apply it to each value of column a and get the result as a data frame? I tried to use lapply but still got the same result: one combined list.

We could use separate_rows to split the values in 'a', then grouped by 'b', sort 'a' and paste the elements together
library(tidyverse)
a %>%
separate_rows(a) %>%
group_by(b) %>%
summarise(a = paste(sort(a), collapse="|")) %>%
select(names(a))
# A tibble: 3 x 2
# a b
# <chr> <dbl>
#1 aaa|abbb 1
#2 aaa|bbb 2
#3 aaa|bbb|ccc 3

An idea via base R,
sapply(strsplit(as.character(a$a), '|', fixed = TRUE), function(i) paste(sort(i), collapse = '|'))
#[1] "aaa|abbb" "aaa|bbb" "aaa|bbb|ccc"
So to update your column a, just assign it back to it, i.e.
a$a <- sapply(strsplit(as.character(a$a), '|', fixed = TRUE), function(i) paste(sort(i), collapse = '|'))

Similar to Sotos's answer:
a$clean <- sapply(as.character(a$a), function(i) paste(sort(tolower(unlist(strsplit(i, split = "|", fixed = TRUE)))), collapse = "|"))
# a b clean
# 1 aaa|abbb 1 aaa|abbb
# 2 bbb|aaa 2 aaa|bbb
# 3 bbb|aaa|ccc 3 aaa|bbb|ccc

if you want to do it with data.table
library(data.table)
dat <- fread("a b
aaa|abbb 1
bbb|aaa 2
bbb|aaa|ccc 3")
dat[,a_sorted :=sapply(lapply(strsplit(a, "\\|"), sort),paste,collapse="|") ]

Related

If string has a certain character, fill an empty cell in the same row with a certain value

Say I have the following data frame:
# S/N a b
# 1 L1-S2 <blank>
# 2 T1-T3 <blank>
# 3 T1-L2 <blank>
How do I turn the above data frame into this:
# S/N a b
# 1 L1-S2 LS
# 2 T1-T3 T
# 3 T1-L2 TL
I am thinking of writing a loop, where
For x in column a,
If first character in x == L AND 4th character in x == S,
fill the corresponding cell in b with LS
and so on...
However, I am not sure how to implement it, or if there is a more elegant way of doing this.
We can extract the upper case letters and remove the repeated ones
library(stringr)
library(dplyr)
df1 %>%
mutate(b = str_replace(str_replace(a, "^([A-Z])\\d+-([A-Z])\\d+",
"\\1\\2"), "(.)\\1+", "\\1"))
-output
# S_N a b
#1 1 L1-S2 LS
#2 2 T1-T3 T
#3 3 T1-L2 TL
Or another option is str_extract_all to extract the upper case letters, loop over the list with map, paste the unique elements
library(purrr)
df1 %>%
mutate(b = str_extract_all(a, "[A-Z]") %>%
map_chr(~ str_c(unique(.x), collapse="")))
Or using a corresponding base R option for the first tidyverse option
df1$b <- sub("(.)\\1+", "\\1", gsub("[0-9-]+", "", df1$a))
Or with strsplit
df1$b <- sapply(strsplit(df1$a, "[0-9-]+"),
function(x) paste(unique(x), collapse=""))
data
df1 <- structure(list(S_N = 1:3, a = c("L1-S2", "T1-T3", "T1-L2"),
b = c(NA,
NA, NA)), class = "data.frame", row.names = c(NA, -3L))

Replace Dataframe Column Values with User Defined Function in R

I have a grouped set of values in a column that I am trying to replace with a single value
col1
a
a;a;b;c
c;b;a
NA
b;b;b
I want to replace each value with either Mixed or, when only one distinct value is present, that single value; for example, a;a;a;a becomes a
Expected Output
col1
a
Mixed
Mixed
NA
b
Code
# grouping(): collapse one ";"-separated string into a single label.
# Returns the first piece when every piece equals it, otherwise 'mixed'.
# NOTE: only the first element of the strsplit() result is inspected
# (the [[1]] below), so when x is a whole column the answer is computed
# from x[1] alone and is a single value.
grouping = function(x){
# Split on ";" and keep only the pieces of the first string in x.
y = as.list(strsplit(x, ";")[[1]])
# Compare every piece against the first one.
z = ""
for (i in 1:length(y)){
if (as.character(y[1]) != as.character(y[i])) {
# First mismatch found: label as 'mixed' and stop scanning.
z = 'mixed'
break
} else {
# Still uniform so far: the label is the first piece.
z = as.character(y[1])
}
}
return(z)
}
db %>%
select(col1) %>%
mutate(
test = grouping(col1)
)
I have tried it a few different ways and either end up with it not working at all or giving the value a for everything
A base R option via defining a user function f
# f(): label one ";"-separated string.
#   x: a single character string (may be NA).
# Returns "Mixed" when x contains more than one distinct piece, the piece
# itself when there is exactly one, and NA when there is none (x is "").
f <- function(x) {
  u <- unique(unlist(strsplit(x, ";")))
  # The test is a scalar, so plain if/else is used instead of ifelse().
  if (length(u) > 1) "Mixed" else if (length(u) == 1) u else NA_character_
}
such that
> transform(df, col1 = Vectorize(f)(col1))
col1
1 a
2 Mixed
3 Mixed
4 <NA>
5 b
You can also consider this for your function and use base R:
#Function
# myfun(): label one ";"-separated string.
#   x: a single character string (may be NA).
# Returns the piece itself when all pieces are identical, otherwise 'Mixed'.
# The value is returned visibly, so calling myfun() at the console prints it.
myfun <- function(x) {
  pieces <- unique(unlist(strsplit(x, ";")))
  if (length(pieces) == 1) pieces else "Mixed"
}
#Apply
df$New <- apply(df,1,myfun)
Output:
df
col1 New
1 a a
2 a;a;b;c Mixed
3 c;b;a Mixed
4 <NA> <NA>
5 b;b;b b
Some data used:
#Data
df <- structure(list(col1 = c("a", "a;a;b;c", "c;b;a", NA, "b;b;b")), class = "data.frame", row.names = c(NA,
-5L))
We can extract the substrings from 'col1' that are letters, check the number of distinct elements with n_distinct, and use case_when to change those which have more than one unique element to 'Mixed'
library(dplyr)
library(stringr)
library(purrr)
# Label each row of col1: "Mixed" when it holds more than one distinct
# lower-case letter, NA when it is missing, otherwise its first character.
df1 %>%
  mutate(col1 = case_when(
    map_dbl(str_extract_all(col1, "[a-z]"), n_distinct) > 1 ~ "Mixed",
    is.na(col1) ~ NA_character_,  # test the column itself, not base::col
    TRUE ~ substr(col1, 1, 1)
  ))
-output
# col1
#1 a
#2 Mixed
#3 Mixed
#4 <NA>
#5 b
Or another option is to split the column by the delimiter with separate_rows, and do a group by row_number to summarise elements having more than one row (after the distinct) to be 'Mixed'
library(tidyr)
df1 %>%
mutate(rn = row_number()) %>%
separate_rows(col1) %>%
distinct() %>%
group_by(rn) %>%
summarise(col1 = case_when(n() > 1 ~ 'Mixed', TRUE ~ first(col1)),
.groups = 'drop') %>%
select(-rn)
-output
# A tibble: 5 x 1
# col1
# <chr>
#1 a
#2 Mixed
#3 Mixed
#4 <NA>
#5 b
Or using base R with a compact option
v1 <- gsub("([a-z])\\1+", "\\1", gsub(";", "", df1$col1))
replace(v1, nchar(v1) > 1, "Mixed")
#[1] "a" "Mixed" "Mixed" NA "b"
The issue in the OP's function is that it is extracting only the first [[1]] list element
as.list(strsplit(x, ";")[[1]])
as strsplit returns a list with length equal to the number of rows of the initial data. So, basically by selecting only the first, it is recycled
data
df1 <- structure(list(col1 = c("a", "a;a;b;c", "c;b;a", NA, "b;b;b")),
class = "data.frame", row.names = c(NA,
-5L))
You can write the grouping function as :
# grouping(): label every element of a character vector of ";"-separated
# values.
#   x: character vector (NA allowed).
# Returns a character vector the same length as x: the single distinct
# piece of each element, or 'Mixed' when the pieces are not all identical.
grouping <- function(x) {
  # vapply() guarantees a character vector, including character(0) for
  # zero-length input.
  vapply(strsplit(x, ";"), function(pieces) {
    distinct <- unique(pieces)
    if (length(distinct) == 1) distinct else "Mixed"
  }, character(1))
}
db$test <- grouping(db$col1)
db
# col1 test
#1 a a
#2 a;a;b;c Mixed
#3 c;b;a Mixed
#4 <NA> <NA>
#5 b;b;b b

rename columns of dataframe

I have one dataframe that basically looks like this (contains data):
# Example data: two integer columns, x1 and x2, that are to be renamed.
t <- data.frame(x1 = 1:5, x2 = 1:5, stringsAsFactors = FALSE)
I have another dataframe that contains the original column names and a replacement for each
n <- data.frame(abb = c("x1", "x2"), erf = c("XX1", "XX2"), stringsAsFactors = FALSE)
What I would like to do is rename columns in dataframe t according to the specification in dataframe n. My problem is I can't figure out how to do that with map. Why is the following wrong:
map2_dfr(n$abb, n$erf, function(x, y) rename(t, !!y := x))
We can use rename_at
library(dplyr)
t %>%
rename_at(n$abb, ~ n$erf)
Here is a one-liner in base R using match,
names(t) <- n$erf[match(names(t), n$abb)]
t
# XX1 XX2
#1 1 1
#2 2 2
#3 3 3
#4 4 4
#5 5 5

integer type split into two columns

I have a column with two alphanumeric strings separated by '->' and I'm trying to split them into two columns.
Df:
column e
1. asd1->ref2
2. fde4 ->fre4
3. dfgt-fgr ->frt5
4. ftr5 -> lkh-oiut
5. rey6->usre-lynng->usre-lkiujh->kiuj-bunny
6. dge1->fgt4->okiuj-dfet
Desired output
col 1 col 2
1. asd1 ref2
2. fde4 fre4
3. frt5
4. ftr5
5. rey6
6. dge1 fgt4
I tried using out <- strsplit(as.character(Df$column e),'_->_') with no output and used str_extract(m1$column e,"(?<=\\[)[[:alnum:]]")->m1$column f, also strsplit(as.character(Df$column e),' -> 'fixed=T)[[1]][[1]] but not getting the desired output.
The column is of integer type and all are capital letters (I'm not sure if this is important).
Here is one way with tidyverse
library(tidyverse)
df1 %>%
separate(columne, into = c('col1', 'col2'), sep = "->", extra = 'drop') %>%
mutate_all(funs(replace(., str_detect(., '-'), "")))
# col1 col2
#1 asd1 ref2
#2 fde4 fre4
#3 frt5
#4 ftr5
#5 rey6
#6 dge1 fgt4
A base R solution as well, though a fair bit less concise than @akrun's tidyverse one:
# split as appropriate
out <- strsplit( as.character( Df$column.e ), '->' )
out <- lapply( out, function(x) {
# I assume you don't want the white space
y <- trimws( x )
# take the first two "columns"
y <- y[1:2]
# remove any items containing a hyphen
y[ grepl( "-", y ) ] <- ""
y
}
)
# then bind it all rowwise
out <- do.call( rbind, out )
data.frame( out )
X1 X2
1 asd1 ref2
2 fde4 fre4
3 frt5
4 ftr5
5 rey6
6 dge1 fgt4

Select the last n characters in a string

I have the following dataset
df <-data.frame(fact=c("a,bo,v", "c,b,v,d", "c"))
I wish to select the last two items for each row. So, Ideally I wish to have this output:
fact
1 bo,v
2 v,d
3 c
I've tried to split the rows and then choose the last two items:
spl <- strsplit(as.character(df$fact), split = ",")
tail(spl[[1]], n=2)
But this does not give me the correct results
You can do this:
lapply(lapply(strsplit(as.character(df$fact), split = ','), function(x) x[c(length(x)-1,length(x))]), paste, collapse = ',')
You split the col and then extract the n and n-1 index. Then paste them together.
You can generalise this by doing:
# Keep the last n + 1 pieces of each value. A logical mask (rather than an
# index range) also works when a value has fewer pieces than that.
lapply(strsplit(as.character(df$fact), split = ','), function(x) x[seq_along(x) >= length(x) - n])
where n is the number of backward steps you want to take.
Using tail is even simpler.
lapply(strsplit(as.character(df$fact), split = ','), tail, n=2)
We can use sapply to loop over every element of fact, split it on basis of , and then select the last n elements using tail
# Number of trailing items to keep from each value.
n <- 2
# simplify = FALSE keeps the result a list (one character vector per row)
# even when every row yields the same number of items; without it sapply()
# would collapse such a result into a matrix or vector.
sapply(as.character(df$fact), function(x) {
  pieces <- unlist(strsplit(x, ","))
  tail(pieces, n)
}, simplify = FALSE, USE.NAMES = FALSE)
#[[1]]
#[1] "bo" "v"
#[[2]]
#[1] "v" "d"
#[[3]]
#[1] "c"
A better option with dplyr I feel using rowwise
library(dplyr)
df %>%
rowwise() %>%
mutate(last_two = paste0(tail(unlist(strsplit(as.character(fact),",")), n),
collapse = ","))
# fact last_two
# <fctr> <chr>
#1 a,bo,v bo,v
#2 c,b,v,d v,d
#3 c c

Resources