How to concatenate column values from each group - r

I have a dataframe df
df <- structure(list(GENE = c("TNFRSF4", "TNFRSF4", "VWA1", "VWA1",
"PEX10", "CEP104"), KEY.varID = c("chr1:1213738:G:A", "chr1:1232280:T:C",
"chr1:1435798:T:TGGCGCGGAGC", "chr1:1437401:C:G", "chr1:2406791:C:CT",
"chr1:3844977:G:A")), row.names = c(NA, -6L), class = "data.frame")
Code I tried:
library(dplyr)
df %>% group_by(GENE) %>%
mutate(all_variants = paste(KEY.varID, collapse = ","))
Result I want:
GENE KEY.varID
TNFRSF4 chr1:1213738:G:A, chr1:1232280:T:C
VWA1 chr1:1435798:T:TGGCGCGGAGC, chr1:1437401:C:G
PEX10 chr1:2406791:C:CT
CEP104 chr1:3844977:G:A

The `df` above is a plain data.frame, so first convert it to a data.table (e.g. with `as.data.table(df)` or `setDT(df)`); then you can simply paste/collapse, and use by
library(data.table)
# `df` is a plain data.frame, so convert it before using data.table's
# `[i, j, by]` syntax; then collapse each gene's variants into one string.
as.data.table(df)[, .(KEY.varID = paste(KEY.varID, collapse = ",")), by = GENE]
Output:
GENE KEY.varID
1: TNFRSF4 chr1:1213738:G:A,chr1:1232280:T:C
2: VWA1 chr1:1435798:T:TGGCGCGGAGC,chr1:1437401:C:G
3: PEX10 chr1:2406791:C:CT
4: CEP104 chr1:3844977:G:A

Or using dplyr:
library(tidyverse)
library(data.table)
df %>%
group_by(GENE) %>%
summarise(KEY.varID = str_c(KEY.varID, collapse = ", ")) %>%
as.data.table
#> GENE KEY.varID
#> 1: CEP104 chr1:3844977:G:A
#> 2: PEX10 chr1:2406791:C:CT
#> 3: TNFRSF4 chr1:1213738:G:A, chr1:1232280:T:C
#> 4: VWA1 chr1:1435798:T:TGGCGCGGAGC, chr1:1437401:C:G

A base R solution is
tapply(df$KEY.varID, df$GENE, paste, collapse = ",") |>
(\(x) data.frame(GENE = names(x), KEY.varID = unname(x)))()
#R> GENE KEY.varID
#R> 1 CEP104 chr1:3844977:G:A
#R> 2 PEX10 chr1:2406791:C:CT
#R> 3 TNFRSF4 chr1:1213738:G:A,chr1:1232280:T:C
#R> 4 VWA1 chr1:1435798:T:TGGCGCGGAGC,chr1:1437401:C:G

Related

R how to combine two data frame based on the compare

I have two dataframe like this
```
v1. v2
1 a,b,c 1,2,3
2 d,e,f,g 4,6
3 h,k,v,x 9,0
```
```
v1 v2
1 a AA
2 c CC
3 d DD
```
after combine
```
v1 v2 v3
1 a,b,c 1,2,3 AA,CC
2. d,e,f,g 4,6 DD
3 h,k,v,x 9,0
```
I don't know how to do this; any comments would be appreciated.
library(tidyverse)
library(fuzzyjoin)
# Match every pattern in df2$v1 against the comma-separated strings in df1$v1,
# then collapse the matched df2$v2 values back onto one row per df1 row.
# NOTE(review): the join is a regex (substring) match, so a pattern such as
# "a" would also match an element like "ab" - confirm this suits your data.
df1 %>%
  regex_left_join(df2, by = c(v1 = "v1")) %>%
  group_by(v1 = v1.x, v2 = v2.x) %>%
  # Drop the NA produced for unmatched rows before pasting; otherwise v3 holds
  # the literal string "NA". `.groups = "drop"` returns an ungrouped result.
  summarise(v3 = paste0(na.omit(v2.y), collapse = ","), .groups = "drop")
#   v1      v2    v3
#   <chr>   <chr> <chr>
# 1 a,b,c   1,2,3 "AA,CC"
# 2 d,e,f,g 4,6   "DD"
# 3 h,k,v,x 9,0   ""
Sample data used
df1 <- read.table(text = "v1 v2
a,b,c 1,2,3
d,e,f,g 4,6
h,k,v,x 9,0", header = TRUE)
df2 <- read.table(text = "v1 v2
a AA
c CC
d DD", header = TRUE)
Here is a very long tidyverse pipe. There should be simpler solutions.
library(dplyr)
library(tidyr)
df1 %>%
mutate(id = row_number()) %>%
separate(v1, into = c("v1a", "v1b", "v1c", "v1d"), fill = "right") %>%
pivot_longer(
cols = starts_with("v1"),
names_to = "v1_col",
values_to = "v1"
) %>%
na.omit() %>%
separate(v2, into = c("v2a", "v2b", "v2c"), fill = "right") %>%
pivot_longer(
cols = starts_with("v2"),
names_to = "v2_col",
values_to = "v2_value"
) %>%
na.omit() %>%
select(-ends_with("col")) %>%
left_join(df2, by = "v1") %>%
group_by(id, v1, v2) %>%
summarise(v2_value = paste(v2_value, collapse = ","),
.groups = "drop") %>%
group_by(id, v2_value) %>%
summarise(v1 = paste(v1, collapse = ","),
v3 = paste(na.omit(v2), collapse = ","),
.groups = "drop") %>%
ungroup() %>%
select(-id) %>%
rename(v2 = v2_value) %>%
relocate(v2, .after = v1)
## A tibble: 3 x 3
# v1 v2 v3
# <chr> <chr> <chr>
#1 a,b,c 1,2,3 "AA,CC"
#2 d,e,f,g 4,6 "DD"
#3 h,k,v,x 9,0 ""

Collapsing Columns in R using tidyverse with mutate, replace, and unite. Writing a function to reuse?

Data:
ID
B
C
1
NA
x
2
x
NA
3
x
x
Results:
ID
Unified
1
C
2
B
3
B_C
I'm trying to combine columns B and C using mutate and unite, but how would I scale this up so that I can reuse it for many columns (think 100+) instead of having to write out the variables each time? Or is there a built-in function that already does this?
My current solution is this:
library(tidyverse)
Data %>%
mutate(B = replace(B, B == 'x', 'B'), C = replace(C, C == 'x', 'C')) %>%
unite("Unified", B:C, na.rm = TRUE, remove= TRUE)
We may use across to loop over the column, replace the value that corresponds to 'x' with column name (cur_column())
library(dplyr)
library(tidyr)
Data %>%
mutate(across(B:C, ~ replace(., .== 'x', cur_column()))) %>%
unite(Unified, B:C, na.rm = TRUE, remove = TRUE)
-output
ID Unified
1 1 C
2 2 B
3 3 B_C
data
Data <- structure(list(ID = 1:3, B = c(NA, "x", "x"), C = c("x", NA,
"x")), class = "data.frame", row.names = c(NA, -3L))
Here are a couple of options.
Using dplyr -
library(dplyr)
# Every column except the first (ID) is a candidate for the unified label.
cols <- names(Data)[-1]
Data <- Data %>%
  rowwise() %>%
  # Select through `cols` instead of a hard-coded B:C range so the same code
  # works for any number of columns; keep the names of the non-NA ones.
  mutate(
    Unified = paste0(cols[!is.na(c_across(all_of(cols)))], collapse = "_")
  ) %>%
  ungroup()
Data
# ID B C Unified
# <int> <chr> <chr> <chr>
#1 1 NA x C
#2 2 x NA B
#3 3 x x B_C
Base R
Data$Unified <- apply(Data[cols], 1, function(x)
paste0(cols[!is.na(x)], collapse = '_'))

Sub setting a column into multiple values in r

I have the following data,
col <- c('Data1,Data2','a,b,c','d')
df <- data.frame(col)
I want to split the data where the elements are more than 2 in a cell. So "a,b,c" should be split into "a,b" , "b,c" and "c,a". See attached for reference.
We create a row identifier (row_number()), split the 'col' by the delimiter (separate_rows), grouped by 'rn', summarise on those groups where the number of rows is greater than 1 to get the combn of 'col' and paste them together
library(stringr)
library(dplyr)
library(tidyr)
df %>%
  mutate(rn = row_number()) %>%
  separate_rows(col) %>%
  group_by(rn) %>%
  # reframe() (dplyr >= 1.1.0) is the verb intended for results with more than
  # one row per group; returning several rows from summarise() is deprecated.
  # The result is already ungrouped.
  reframe(
    col = if (n() > 1) combn(col, 2, FUN = str_c, collapse = ",") else col
  ) %>%
  select(-rn)
-output
# A tibble: 5 x 1
# col
# <chr>
#1 Data1,Data2
#2 a,b
#3 a,c
#4 b,c
#5 d
Here is a base R option using combn
# lapply() always returns a list, so unlist() yields one flat character vector.
# sapply() would simplify to a matrix whenever every row produced the same
# number (> 1) of pairs, and data.frame(col = <matrix>) then creates several
# columns instead of one.
data.frame(col = unlist(lapply(
  strsplit(df$col, ","),
  function(parts) {
    if (length(parts) == 1) {
      # a single element has no pairs to form; keep it unchanged
      parts
    } else {
      combn(parts, 2, paste0, collapse = ",")
    }
  }
)))
which gives
col
1 Data1,Data2
2 a,b
3 a,c
4 b,c
5 d
library(tidyverse)
df %>%
  rowwise() %>%
  # Rows with fewer than three elements are already a single pair (or a single
  # value) and are kept unchanged. For longer rows, join each pair with ","
  # (toString() would insert ", ") so all rows share the input's separator.
  mutate(col = list(
    if (str_count(col, ",") > 1) {
      combn(strsplit(col, ",")[[1]], 2, paste, collapse = ",")
    } else {
      col
    }
  )) %>%
  unnest(col)
# A tibble: 5 x 1
#   col
#   <chr>
# 1 Data1,Data2
# 2 a,b
# 3 a,c
# 4 b,c
# 5 d

dplyr::mutate_at() relying on multiple columns with a given prefix/suffix

dplyr::mutate_at() can be used to apply the same function to multiple columns. It also allows you to set the results in new columns using a named list.
However, what if I have many columns in pairs (say, data1_a, data1_b, data2_a, data2_b, ...) and I want to multiply those pairs together? Is that possible?
By hand, that would look like
suppressPackageStartupMessages({
library(dplyr)
})
data.frame(data1_a = 1:3, data1_b = 2:4,
data2_a = 3:5, data2_b = 4:6) %>%
mutate(
data1 = data1_a * data1_b,
data2 = data2_a * data2_b
)
#> data1_a data1_b data2_a data2_b data1 data2
#> 1 1 2 3 4 2 12
#> 2 2 3 4 5 6 20
#> 3 3 4 5 6 12 30
My current solution is to write a function which takes the unsuffixed variable name (i.e. "data1"), creates the suffixed names and then performs a simple mutate() on that variable using get(). I then call that function for each output:
# Multiply the paired columns `<name>_a` and `<name>_b` of `df` and store the
# product in a new column called `name`.
#
# df:   a data frame containing both suffixed columns.
# name: the unsuffixed column name as a string (e.g. "data1").
# Returns `df` with the product column added.
foo <- function(df, name) {
  a <- paste0(name, "_a")
  b <- paste0(name, "_b")
  # .data[[...]] only looks at columns of `df`; get() would silently fall back
  # to a same-named object in an enclosing environment if a column is missing.
  mutate(df, !!name := .data[[a]] * .data[[b]])
}
data.frame(data1_a = 1:3, data1_b = 2:4,
data2_a = 3:5, data2_b = 4:6) %>%
foo("data1") %>%
foo("data2")
#> data1_a data1_b data2_a data2_b data1 data2
#> 1 1 2 3 4 2 12
#> 2 2 3 4 5 6 20
#> 3 3 4 5 6 12 30
(or write a loop over all the variable names if there were more of them)
But if it's possible to use mutate_at or something of the sort, that'd be much cleaner.
We can use pivot_longer/pivot_wider
library(dplyr)
library(tidyr)
df1 %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, names_to = c('grp', '.value'),
names_sep = "_") %>%
group_by(grp) %>%
transmute(rn, new = a * b) %>%
pivot_wider(names_from = grp, values_from = new) %>%
select(-rn) %>%
bind_cols(df1, .)
# A tibble: 3 x 6
# data1_a data1_b data2_a data2_b data1 data2
# <int> <int> <int> <int> <int> <int>
#1 1 2 3 4 2 12
#2 2 3 4 5 6 20
#3 3 4 5 6 12 30
Or another option is to split into a list based on the column names and then do the *
library(purrr)
library(stringr)
df1 %>%
split.default(str_remove(names(.), "_.*")) %>%
map_dfr(reduce, `*`) %>%
bind_cols(df1, .)
# A tibble: 3 x 6
# data1_a data1_b data2_a data2_b data1 data2
# <int> <int> <int> <int> <int> <int>
#1 1 2 3 4 2 12
#2 2 3 4 5 6 20
#3 3 4 5 6 12 30
With mutate, it is possible, but it would be more manual
df1 %>%
mutate(data1 = select(., starts_with('data1')) %>%
reduce(`*`),
data2 = select(., starts_with('data2')) %>%
reduce(`*`))
data
df1 <- data.frame(data1_a = 1:3, data1_b = 2:4,
data2_a = 3:5, data2_b = 4:6)
After adopting @akrun's elegant solution, I noticed it was unfortunately very inefficient (since it has to recreate two dataframes), taking almost a second on a dataset with 20,000 rows and 11 "groups".
So a while ago I developed the following function (with a bit of help from @user12728748... sorry for not posting here sooner), which takes the names of the groups ("data1", "data2", etc.) and a formula using the prefixes, allowing for bquote-style quoting for constant names:
suppressPackageStartupMessages(library(dplyr))
# Apply one formula to several groups of similarly named columns, adding one
# new column per entry of `colNames`.
#
# Arguments:
#   df        - data frame to modify.
#   colNames  - character vector of group names (e.g. "data1", "data2").
#   formula   - formula whose variables are the per-group name parts (e.g.
#               `~ a + b`). Names wrapped in `.()` are left out of the renaming
#               and used verbatim. An optional left-hand side supplies the
#               name part of the output column.
#   isPrefix  - if TRUE, columns are named <variable><separator><group>;
#               otherwise <group><separator><variable>.
#   separator - string placed between the two name parts.
#
# Returns `df` with the new column(s) added.
# NOTE(review): needs rlang and wrapr installed, and presumably `%>%` to be
# available where the generated code is evaluated - confirm.
mutateSet <- function(df, colNames, formula,
isPrefix = TRUE,
separator = "_") {
# every variable name appearing in the formula
vars <- all.vars(formula)
# extracts names wrapped in `.()`
escapedNames <- function (expr)
{
# recursively walk the expression; for a call whose head is `.`, return its
# first argument as text
unquote <- function(e) {
if (is.pairlist(e) || length(e) <= 1L) NULL
else if (e[[1L]] == as.name(".")) deparse(e[[2L]])
else unlist(sapply(e, unquote))
}
unquote(substitute(expr))
}
# inject the formula itself into the call so substitute() sees its expression
escapedVars <- eval(rlang::expr(escapedNames(!!formula)))
# remove escaped names from mapping variables
vars <- setdiff(vars, escapedVars)
# get output prefix/suffix as string
# (character(0) for a one-sided formula; handled inside the loop)
lhs <- rlang::f_lhs(formula) %>%
all.vars()
# get operation as string
# deparse() can have line breaks; paste0() brings it back to one line
rhs <- rlang::f_rhs(formula) %>%
deparse() %>%
paste0(collapse = "")
# dummy function to cover for bquote escaping
# (identity, so `.(static)` in the formula evaluates to `static`)
. <- function(x) x
for (i in colNames) {
# build the real column names for this group and the output column name
if (isPrefix) {
aliases <- paste0(vars, separator, i)
newCol <- paste0(lhs, separator, i)
} else {
aliases <- paste0(i, separator, vars)
newCol <- paste0(i, separator, lhs)
}
# without a left-hand side the new column is named after the group itself
if (length(lhs) == 0) newCol <- i
# formula variable -> actual column name, in the form passed to wrapr::let()
mapping <- rlang::list2(!!!aliases)
names(mapping) <- vars
mapping <- do.call(wrapr::qc, mapping)
# Build the mutate() call with a textual placeholder, splice the formula's
# right-hand side in as text, then parse and evaluate the resulting code.
# NOTE(review): the gsub() pattern "...RHS..." is a regex, so each "." matches
# any character; and eval(parse(text = ...)) runs generated code - only pass
# trusted formulas.
df <- rlang::expr(wrapr::let(
mapping,
df %>% dplyr::mutate(!!newCol := ...RHS...)
)) %>%
deparse() %>%
gsub(
pattern = "...RHS...",
replacement = rhs
) %>%
{eval(parse(text = .))}
}
return(df)
}
df <- data.frame(a_data1 = 1:3, b_data1 = 2:4,
a_data2 = 3:5, b_data2 = 4:6,
static = 5:7)
mutateSet(df, "data1", ~ a + b)
#> a_data1 b_data1 a_data2 b_data2 static data1
#> 1 1 2 3 4 5 3
#> 2 2 3 4 5 6 5
#> 3 3 4 5 6 7 7
mutateSet(df, c("data1", "data2"), x ~ sqrt(a) + b)
#> a_data1 b_data1 a_data2 b_data2 static x_data1 x_data2
#> 1 1 2 3 4 5 3.000000 5.732051
#> 2 2 3 4 5 6 4.414214 7.000000
#> 3 3 4 5 6 7 5.732051 8.236068
mutateSet(df, c("data1", "data2"), ~ a + b + .(static))
#> a_data1 b_data1 a_data2 b_data2 static data1 data2
#> 1 1 2 3 4 5 8 12
#> 2 2 3 4 5 6 11 15
#> 3 3 4 5 6 7 14 18
Created on 2020-04-28 by the reprex package (v0.3.0)
This can probably be cleaned up (especially that heinous for-loop), but it works for now.
Repeating @user12728748's performance test, we see this is ~100x faster:
suppressPackageStartupMessages({
  # library() stops with an error when a package is missing; require() only
  # returns FALSE, and that result was being discarded by invisible(lapply()).
  library(dplyr)
  library(tidyr)
  library(rlang)
  library(wrapr)
  library(microbenchmark)
})
# Pivot-based reference implementation used in the benchmark: reshape the
# "<variable>_<group>" columns to long form, evaluate the right-hand side of
# `formula` once per group, reshape back to wide form and append the results
# to `df`.
polymutate <- function(df, formula) {
  form <- rlang::f_rhs(formula)

  # one row per (original row, group); one column per variable part
  long <- pivot_longer(
    mutate(df, rn = row_number()),
    cols = -rn,
    names_to = c(".value", "grp"),
    names_sep = "_"
  )

  # evaluate the formula's right-hand side within each group
  per_group <- transmute(group_by(long, grp), rn, new = eval(form))

  # back to one column per group, without the helper row id
  wide <- pivot_wider(per_group, names_from = grp, values_from = new)
  bind_cols(df, select(wide, -rn))
}
set.seed(1)
df <- setNames(data.frame(matrix(sample(1:12, 6E6, replace=TRUE), ncol=6)),
c("a_data1", "b_data1", "a_data2", "b_data2", "a_data3", "b_data3"))
pd <- polymutate(df, ~ a + b)
pd2 <- mutateSet(df, c("data1", "data2", "data3"), ~ a + b)
all.equal(pd, pd2)
#> [1] TRUE
microbenchmark(polymutate(df, ~ a + b),
mutateSet(df, c("data1", "data2", "data3"), ~ a + b),
times=10L)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> polymutate 1612.306 1628.9776 1690.78586 1670.15600 1741.3490 1806.1412 10
#> mutateSet 8.757 9.6302 13.27135 10.45965 19.2976 20.4657 10
This is now possible using the cur_column() function within across().
library(tidyverse)
dat <- data.frame(
data1_a = 1:3,
data1_b = 2:4,
data2_a = 3:5,
data2_b = 4:6
)
mutate(
dat,
across(ends_with("a"), ~ . * dat[[str_replace(cur_column(), "a$", "b")]],
.names = "updated_{col}")
)
Returns:
data1_a data1_b data2_a data2_b updated_data1_a updated_data2_a
1 1 2 3 4 2 12
2 2 3 4 5 6 20
3 3 4 5 6 12 30
Where updated_data1_a and updated_data2_a contain the desired output variables.

How do I remove NAs with the tidyr::unite function?

After combining several columns with tidyr::unite(), NAs from missing data remain in my character vector, which I do not want.
I have a series of medical diagnoses per row (1 per column) and would like to benchmark searching for a series of codes via. %in% and grepl().
There is an open issue on Github on this problem, is there any movement - or work arounds? I would like to keep the vector comma-separated.
Here is a representative example:
library(dplyr)
library(tidyr)
df <- data_frame(a = paste0("A.", rep(1, 3)), b = " ", c = c("C.1", "C.3", " "), d = "D.4", e = "E.5")
cols <- letters[2:4]
df[, cols] <- gsub(" ", NA_character_, as.matrix(df[, cols]))
tidyr::unite(df, new, cols, sep = ",")
Current output:
# # A tibble: 3 x 3
# a new e
# <chr> <chr> <chr>
# 1 A.1 NA,C.1,D.4 E.5
# 2 A.1 NA,C.3,D.4 E.5
# 3 A.1 NA,NA,D.4 E.5
Desired output:
# # A tibble: 3 x 3
# a new e
# <chr> <chr> <chr>
# 1 A.1 C.1,D.4 E.5
# 2 A.1 C.3,D.4 E.5
# 3 A.1 D.4 E.5
In the new tidyr , you can now use na.rm parameter to remove NA values.
library(tidyr)
library(dplyr)
# all_of() marks `cols` as an external character vector of column names;
# passing a bare external vector to a tidyselect argument is deprecated.
df %>% unite(new, all_of(cols), sep = ",", na.rm = TRUE)
# a new e
# <chr> <chr> <chr>
#1 A.1 C.1,D.4 E.5
#2 A.1 C.3,D.4 E.5
#3 A.1 D.4 E.5
However, NAs would not be removed if the columns are factors. We need to change them to character before using unite.
df %>%
  # across(everything(), ...) replaces the superseded mutate_all()
  mutate(across(everything(), as.character)) %>%
  unite(new, all_of(cols), sep = ",", na.rm = TRUE)
You could also use base R apply method for the same.
apply(df[cols], 1, function(x) toString(na.omit(x)))
#[1] "C.1, D.4" "C.3, D.4" "D.4"
data
# tibble() replaces the deprecated data_frame() constructor
df <- tibble(
  a = c("A.1", "A.1", "A.1"),
  b = c(NA_character_, NA_character_, NA_character_),
  c = c("C.1", "C.3", NA),
  d = c("D.4", "D.4", "D.4"),
  e = c("E.5", "E.5", "E.5")
)
# the columns to be united
cols <- letters[2:4]
You could use regex to remove the NAs after they are created:
library(dplyr)
library(tidyr)
df <- data_frame(a = paste0("A.", rep(1, 3)),
b = " ",
c = c("C.1", "C.3", " "),
d = "D.4", e = "E.5")
cols <- letters[2:4]
df[, cols] <- gsub(" ", NA_character_, as.matrix(df[, cols]))
tidyr::unite(df, new, cols, sep = ",") %>%
  dplyr::mutate(
    new = new %>%
      # Blank out only whole "NA" tokens (bounded by a comma or a string edge),
      # so a value that merely contains "NA" is left alone ...
      stringr::str_replace_all("(?<![^,])NA(?![^,])", "") %>%
      # ... then squeeze the resulting runs of commas and trim the ends, which
      # also handles an NA in the last column.
      stringr::str_replace_all(",{2,}", ",") %>%
      stringr::str_remove_all("^,|,$")
  )
Output:
# A tibble: 3 x 3
a new e
<chr> <chr> <chr>
1 A.1 C.1,D.4 E.5
2 A.1 C.3,D.4 E.5
3 A.1 D.4 E.5
You can avoid inserting them by iterating over the rows:
library(tidyverse)
df <- data_frame(
a = c("A.1", "A.1", "A.1"),
b = c(NA_character_, NA_character_, NA_character_),
c = c("C.1", "C.3", NA),
d = c("D.4", "D.4", "D.4"),
e = c("E.5", "E.5", "E.5")
)
cols <- letters[2:4]
df %>% mutate(x = pmap_chr(.[cols], ~paste(na.omit(c(...)), collapse = ',')))
#> # A tibble: 3 x 6
#> a b c d e x
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 A.1 <NA> C.1 D.4 E.5 C.1,D.4
#> 2 A.1 <NA> C.3 D.4 E.5 C.3,D.4
#> 3 A.1 <NA> <NA> D.4 E.5 D.4
or using tidyr's underlying stringi package,
df %>% mutate(x = pmap_chr(.[cols], ~stringi::stri_flatten(
c(...), collapse = ",",
na_empty = TRUE, omit_empty = TRUE
)))
#> # A tibble: 3 x 6
#> a b c d e x
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 A.1 <NA> C.1 D.4 E.5 C.1,D.4
#> 2 A.1 <NA> C.3 D.4 E.5 C.3,D.4
#> 3 A.1 <NA> <NA> D.4 E.5 D.4
The problem is that iterating over rows usually entails making a lot of calls, and can therefore be quite slow at scale. Unfortunately, there doesn't appear to be a great vectorized alternative for removing NAs before joining the strings.
Thanks all, I've put together a summary of the solutions and bench-marked on my data:
library(microbenchmark)
library(dplyr)
library(stringr)
library(tidyr)
library(biometrics) # has my helper function for column selection
cols <- biometrics::variables(c("diagnosis", "dagger", "ediag"), 20)
system.time({
df <- dat[, cols]
df <- gsub(" ", NA_character_, as.matrix(df)) %>% tbl_df()
})
# Compare ways of searching the diagnosis columns for the A37.x codes.
# `dat` holds the raw data (blanks as " "); `df` holds the same columns with
# blanks replaced by NA. TRUE is spelled out (T can be reassigned) and
# pmap_chr() is namespaced because purrr is not attached in this snippet.
microbenchmark(
  ## search by base R `match()` function
  # original search (match)
  match_spaces = apply(
    dat, 1,
    function(x) any(c("A37.0", "A37.1", "A37.8", "A37.9") %in% x[cols])
  ),
  # matching with " " replaced by NAs with gsub
  match_NAs = apply(
    df, 1,
    function(x) any(c("A37.0", "A37.1", "A37.8", "A37.9") %in% x[cols])
  ),
  ## search by base R 'grep()' function - the same regex is used in each case
  # grepl search with NAs removed with `stringr::str_replace_all()`
  regex_str_replace_all = tidyr::unite(df, new, all_of(cols), sep = ",") %>%
    mutate(new = str_replace_all(new, "NA,?", "")) %>%
    apply(1, function(x) grepl("A37.*", x, ignore.case = TRUE)),
  # grepl search with NAs removed with `apply()` & `toString()`
  regex_toString = tidyr::unite(df, new, all_of(cols), sep = ",") %>%
    mutate(new = apply(df[cols], 1, function(x) toString(na.omit(x)))) %>%
    apply(1, function(x) grepl("A37.*", x, ignore.case = TRUE)),
  # grepl search after iterating over rows
  regex_row_iteration = df %>%
    mutate(
      new = purrr::pmap_chr(.[cols], ~paste(na.omit(c(...)), collapse = ','))
    ) %>%
    select(new) %>%
    apply(1, function(x) grepl("A37.*", x, ignore.case = TRUE)),
  # grepl after stringi
  regex_stringi = df %>%
    mutate(new = purrr::pmap_chr(.[cols], ~stringi::stri_flatten(
      c(...), collapse = ",",
      na_empty = TRUE, omit_empty = TRUE
    ))) %>%
    select(new) %>%
    apply(1, function(x) grepl("A37.*", x, ignore.case = TRUE)),
  times = 10L
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# match_spaces 14820.2076 15060.045 15558.092 15573.885 15901.015 16521.855 10
# match_NAs 998.3184 1061.973 1191.691 1203.849 1301.511 1378.314 10
# regex_str_replace_all 1464.4502 1487.473 1637.832 1596.522 1701.718 2114.055 10
# regex_toString 4324.0914 4341.725 4631.998 4487.373 4977.603 5439.026 10
# regex_row_iteration 5794.5994 6107.475 6458.339 6436.273 6720.185 7256.980 10
# regex_stringi 4772.3859 5267.456 5466.510 5436.804 5806.272 6011.713 10
It looks like %in% is the winner - after replacing empty values (" ") with NAs. If I go with regular expressions, then removing NAs with stringr::str_replace_all() is the quickest.
You might get some errors if you remove them while you use the unite function. I would just remove them from the column after the fact.
# tibble() replaces the deprecated data_frame() constructor
df <- tibble::tibble(
  a = paste0("A.", rep(1, 3)), b = " ", c = c("C.1", "C.3", " "),
  d = "D.4", e = "E.5"
)
cols <- letters[2:4]
# turn the blank placeholders into real missing values
df[, cols] <- gsub(" ", NA_character_, as.matrix(df[, cols]))
df <- tidyr::unite(df, new, tidyr::all_of(cols), sep = ",")
# Blank out only whole "NA" tokens (bounded by a comma or a string edge) so
# values that merely contain "NA" survive, then squeeze repeated commas and
# trim the ends; this also handles an NA in the last united column.
df$new <- gsub("(?<![^,])NA(?![^,])", "", df$new, perl = TRUE)
df$new <- gsub("^,+|,+$", "", gsub(",{2,}", ",", df$new))

Resources