R: how to combine two data frames based on a comparison - r

I have two data frames like this:
```
v1 v2
1 a,b,c 1,2,3
2 d,e,f,g 4,6
3 h,k,v,x 9,0
```
```
v1 v2
1 a AA
2 c CC
3 d DD
```
after combine
```
v1 v2 v3
1 a,b,c 1,2,3 AA,CC
2 d,e,f,g 4,6 DD
3 h,k,v,x 9,0
```
I don't know how to do this; any comments would be appreciated.

library(tidyverse)
library(fuzzyjoin)
# Attach to every df1 row the df2 rows whose v1 matches df1's v1
# (regex_left_join treats the df2 key as the pattern), keeping unmatched rows.
df1 %>%
regex_left_join(., df2, by = c(v1 = "v1")) %>%
# Group on the original df1 columns, which carry the .x suffix after the join.
group_by(v1 = v1.x, v2 = v2.x) %>%
# Collapse the matched df2 values (v2.y) into one comma-separated string.
summarise(v3 = paste0(v2.y, collapse = ","))
# v1 v2 v3
# <chr> <chr> <chr>
# 1 a,b,c 1,2,3 AA,CC
# 2 d,e,f,g 4,6 DD
# 3 h,k,v,x 9,0 NA
Sample data used
# Sample inputs parsed from inline text; header = TRUE takes the first line
# of each text block as the column names (v1, v2).
df1 <- read.table(text = "v1 v2
a,b,c 1,2,3
d,e,f,g 4,6
h,k,v,x 9,0", header = TRUE)
# Lookup table: single v1 keys and the label (v2) to attach for each.
df2 <- read.table(text = "v1 v2
a AA
c CC
d DD", header = TRUE)

Here is a very long tidyverse pipe. There should be simpler solutions.
library(dplyr)
library(tidyr)
# Explode the comma-separated cells of df1 into one row per item, look each v1
# item up in df2, then collapse everything back to one row per original df1 row.
df1 %>%
# Remember which original row every exploded row came from.
mutate(id = row_number()) %>%
# Split v1 into at most four pieces (hard-coded); short cells are padded with NA.
separate(v1, into = c("v1a", "v1b", "v1c", "v1d"), fill = "right") %>%
pivot_longer(
cols = starts_with("v1"),
names_to = "v1_col",
values_to = "v1"
) %>%
# Drop the NA padding rows introduced by fill = "right".
na.omit() %>%
# Same treatment for v2, with at most three pieces (hard-coded).
separate(v2, into = c("v2a", "v2b", "v2c"), fill = "right") %>%
pivot_longer(
cols = starts_with("v2"),
names_to = "v2_col",
values_to = "v2_value"
) %>%
na.omit() %>%
# The helper columns v1_col / v2_col are no longer needed.
select(-ends_with("col")) %>%
# Bring in df2's v2 for each single v1 item; unmatched items get NA.
left_join(df2, by = "v1") %>%
# First collapse: rebuild the original v2 string for every (id, v1 item).
group_by(id, v1, v2) %>%
summarise(v2_value = paste(v2_value, collapse = ","),
.groups = "drop") %>%
# Second collapse: rebuild the original v1 string and gather the matched
# df2 labels (NAs removed) into v3.
group_by(id, v2_value) %>%
summarise(v1 = paste(v1, collapse = ","),
v3 = paste(na.omit(v2), collapse = ","),
.groups = "drop") %>%
ungroup() %>%
select(-id) %>%
# Restore the original column name and order: v1, v2, v3.
rename(v2 = v2_value) %>%
relocate(v2, .after = v1)
## A tibble: 3 x 3
# v1 v2 v3
# <chr> <chr> <chr>
#1 a,b,c 1,2,3 "AA,CC"
#2 d,e,f,g 4,6 "DD"
#3 h,k,v,x 9,0 ""

Related

How to concatenate column values from each group

I have a dataframe df
# Example input: one row per (gene, variant ID) pair, stored as a plain
# data.frame with six rows.
df <- structure(list(GENE = c("TNFRSF4", "TNFRSF4", "VWA1", "VWA1",
"PEX10", "CEP104"), KEY.varID = c("chr1:1213738:G:A", "chr1:1232280:T:C",
"chr1:1435798:T:TGGCGCGGAGC", "chr1:1437401:C:G", "chr1:2406791:C:CT",
"chr1:3844977:G:A")), row.names = c(NA, -6L), class = "data.frame")
Code I tried:
library(dplyr)
# Attempt from the question: mutate() adds the collapsed string as a new
# column (all_variants) but keeps every original row, so genes are not
# reduced to one row each.
df %>% group_by(GENE) %>%
mutate(all_variants = paste(KEY.varID, collapse = ","))
Result I want:
GENE KEY.varID
TNFRSF4 chr1:1213738:G:A, chr1:1232280:T:C
VWA1 chr1:1435798:T:TGGCGCGGAGC, chr1:1437401:C:G
PEX10 chr1:2406791:C:CT
CEP104 chr1:3844977:G:A
If your data is a data.table (the structure() call above builds a plain data.frame, so convert it first with setDT(df)), you can simply paste/collapse, and use by
library(data.table)
# df is created with class "data.frame", and the `.()` / `by =` syntax below
# only works on a data.table. setDT() converts by reference and is harmless
# if df already is a data.table.
setDT(df)
# Collapse all variant IDs of each gene into one comma-separated string.
df[, .(KEY.varID = paste(KEY.varID, collapse = ",")), by = GENE]
Output:
GENE KEY.varID
1: TNFRSF4 chr1:1213738:G:A,chr1:1232280:T:C
2: VWA1 chr1:1435798:T:TGGCGCGGAGC,chr1:1437401:C:G
3: PEX10 chr1:2406791:C:CT
4: CEP104 chr1:3844977:G:A
Or using dplyr:
library(tidyverse)
library(data.table)
# One row per gene with its variant IDs joined by ", ", returned as a
# data.table.
df |>
  group_by(GENE) |>
  summarise(KEY.varID = str_c(KEY.varID, collapse = ", ")) |>
  as.data.table()
#> GENE KEY.varID
#> 1: CEP104 chr1:3844977:G:A
#> 2: PEX10 chr1:2406791:C:CT
#> 3: TNFRSF4 chr1:1213738:G:A, chr1:1232280:T:C
#> 4: VWA1 chr1:1435798:T:TGGCGCGGAGC, chr1:1437401:C:G
A base R solution is
# Collapse the variant IDs per gene with tapply(), then lay the named result
# out as a two-column data frame (gene name, collapsed IDs). local() keeps the
# intermediate out of the calling environment.
local({
  ids_by_gene <- tapply(df$KEY.varID, df$GENE, paste, collapse = ",")
  data.frame(GENE = names(ids_by_gene), KEY.varID = unname(ids_by_gene))
})
#R> GENE KEY.varID
#R> 1 CEP104 chr1:3844977:G:A
#R> 2 PEX10 chr1:2406791:C:CT
#R> 3 TNFRSF4 chr1:1213738:G:A,chr1:1232280:T:C
#R> 4 VWA1 chr1:1435798:T:TGGCGCGGAGC,chr1:1437401:C:G

pivot_wider a dataframe with complex names R

so I have a dataframe that looks like this:
# Wide input: for each id there are three fields (opt, optI, sel) repeated for
# group 0 and group 1, with the group index embedded in the middle of the
# column name (c.<group>.<field>).
datInput <- tibble(id = 1:2,
c.0.opt = c("a,b", "c,d"),
c.0.optI = c("1,2", "3,4"),
c.0.sel = c("a", "c"),
c.1.opt = c("e,f", "g,h"),
c.1.optI = c("5,6", "7,8"),
c.1.sel = c("e", "g"))
datInput
# id c.0.opt c.0.optI c.0.sel c.1.opt c.1.optI c.1.sel
#1 1 a,b 1,2 a e,f 5,6 e
#2 2 c,d 3,4 c g,h 7,8 g
And I need it to look like this:
# Desired long result: one row per (id, group), with the group index dropped
# from the column names.
datOutput <- tibble(id = c(1,1,2,2),
c_opt = c("a,b", "e,f", "c,d", "g,h"),
c_optI = c("1,2", "5,6", "3,4", "7,8"),
c_sel = c("a", "e", "c", "g"))
# id c_opt c_optI c_sel
#1 1 a,b 1,2 a
#2 1 e,f 5,6 e
#3 2 c,d 3,4 c
#4 2 g,h 7,8 g
I usually use tidyr::pivot_longer for this kind of task, but I don't know how to do it with those complicated column names, where the row identifier is in the middle. Is there a way to do this?
Thanks
We can use pivot_longer as well with names_sep as regex lookaround to match the . in column names that succeeds a digit
library(dplyr)
library(tidyr)
library(stringr)
# Split every non-id column name at the dot that follows a digit: the left
# part ("c.0", "c.1") becomes the grouping key, the right part names the
# output column (.value). The key is then dropped and "c_" is prefixed to
# every column except id.
datInput %>%
  pivot_longer(
    cols = -id,
    names_to = c("grp", ".value"),
    names_sep = "(?<=\\d)\\."
  ) %>%
  select(-grp) %>%
  rename_with(\(nm) str_c('c_', nm), -id)
# A tibble: 4 x 4
# id c_opt c_optI c_sel
# <int> <chr> <chr> <chr>
#1 1 a,b 1,2 a
#2 1 e,f 5,6 e
#3 2 c,d 3,4 c
#4 2 g,h 7,8 g
# Go fully long, strip the ".<digit>." group index out of the column names,
# then widen again. Because several values now share one (id, name) cell they
# are collected into list columns and expanded back into rows by unnest().
# pivot_longer() replaces the superseded gather(); -1 still means "every
# column except the first (id)".
datInput %>%
  pivot_longer(cols = -1, names_to = "colname", values_to = "val") %>%
  mutate(colname = gsub("\\.\\d\\.", "_", colname)) %>%
  pivot_wider(
    id_cols = id,
    names_from = colname,
    values_from = val,
    values_fn = list
  ) %>%
  unnest(cols = c(colnames(.)))
# A tibble: 4 x 4
id c_opt c_optI c_sel
<int> <chr> <chr> <chr>
1 1 a,b 1,2 a
2 1 e,f 5,6 e
3 2 c,d 3,4 c
4 2 g,h 7,8 g
I modified Akrun's answer with comments from zimia like this:
# Reshape datInput to one row per (id, group) and store it as datOutput.
datOutput <- datInput %>%
# One row per (id, original column name).
pivot_longer(-id, names_to = "colname", values_to = "val") %>%
# "c.0.opt" / "c.1.opt" -> "c_opt": remove the group index from the name.
mutate(colname = gsub("\\.\\d\\.","_",colname)) %>%
# Values that now share a name are collected into list columns ...
pivot_wider(id_cols = id, names_from = colname, values_from = val, values_fn = list) %>%
# ... and expanded back into separate rows.
unnest(cols = c(colnames(.)))
It works perfectly. Thank you both.

Sub setting a column into multiple values in r

I have the following data,
# Example input: one column of comma-separated strings with 2, 3 and 1 items.
col <- c('Data1,Data2','a,b,c','d')
df <- data.frame(col)
I want to split the data where the elements are more than 2 in a cell. So "a,b,c" should be split into "a,b" , "b,c" and "c,a". See attached for reference.
We create a row identifier (row_number()), split the 'col' by the delimiter (separate_rows), grouped by 'rn', summarise on those groups where the number of rows is greater than 1 to get the combn of 'col' and paste them together
library(stringr)
library(dplyr)
library(tidyr)
# For every original row, replace the comma-separated cell by all pairwise
# combinations of its items (single-item cells are kept unchanged).
df %>%
  # Remember which original row every item came from.
  mutate(rn = row_number()) %>%
  # One row per item; the separator is given explicitly so items that contain
  # spaces or other punctuation are not split further.
  separate_rows(col, sep = ",") %>%
  group_by(rn) %>%
  # reframe() (dplyr >= 1.1.0) is the supported way to return a variable
  # number of rows per group; doing so with summarise() is deprecated.
  # The result is always ungrouped.
  reframe(
    col = if (n() > 1) combn(col, 2, FUN = str_c, collapse = ",") else col
  ) %>%
  select(-rn)
-output
# A tibble: 5 x 1
# col
# <chr>
#1 Data1,Data2
#2 a,b
#3 a,c
#4 b,c
#5 d
Here is a base R option using combn
# Base R: split each cell on commas and replace it by all pairwise
# combinations of its items; cells with fewer than two items are kept as-is.
#
# lapply() is used instead of sapply(): when every cell yields the same number
# of combinations, sapply() simplifies to a matrix, unlist() leaves that
# matrix untouched and data.frame() then spreads it over several columns.
# lapply() always returns a list, so the result is always a single column.
data.frame(col = unlist(lapply(
  strsplit(df$col, ","),
  function(items) {
    # combn(x, 2) fails for fewer than two items (this includes empty cells).
    if (length(items) < 2) {
      items
    } else {
      combn(items, 2, paste0, collapse = ",")
    }
  }
)))
which gives
col
1 Data1,Data2
2 a,b
3 a,c
4 b,c
5 d
library(tidyverse)
# Row by row: cells containing more than one comma (three or more items) are
# replaced by all pairwise combinations; other cells are kept unchanged.
df %>%
rowwise()%>%
# toString() joins each pair with ", " (comma + space), so the generated
# pairs look different from the untouched "Data1,Data2" cell.
mutate(col = list(if(str_count(col, ",")>1) combn(strsplit(col, ",")[[1]], 2, toString) else col))%>%
# Expand the list column so every combination gets its own row.
unnest(col)
# A tibble: 5 x 1
col
<chr>
1 Data1,Data2
2 a, b
3 a, c
4 b, c
5 d

Add more rows based on a grouping variable R

I'd like to add more rows to my dataset based on a grouping variable. Right now, my data has 2 rows but I would like 3 rows and the var app to be repeated for the third row.
This is what my data currently looks like:
# Wide input: a first (type, code) pair per app plus an optional second pair
# (type_2, code_2) that is NA when absent.
my_data <- data.frame(app = c('a','b'), type = c('blue','red'), code = c(1:2), type_2 = c(NA, 'blue'), code_2 = c(NA, 3))
app type code type_2 code_2
a blue 1 NA NA
b red 2 blue 3
I would like the data to look like this:
app type code
a blue 1
b red 2
b blue 3
library(data.table)
setDT(my_data)
# Melt the two families of columns (type*, code*) at the same time so that
# each (type, code) pair becomes its own row, then drop the rows whose type is
# NA and keep only app, type and code.
res <-
melt(
my_data,
id.vars = "app",
# One regular expression per output column; columns matching the same
# pattern are stacked together.
measure.vars = patterns(c("^type", "^code")),
value.name = c("type", "code")
)[!is.na(type), .(app, type, code)]
Using tidyverse
library(dplyr)
library(stringr)
library(tidyr)
# Give the first (type, code) pair an explicit "_1" suffix so that all four
# columns follow the <field>_<group> pattern, then pivot: the part before "_"
# names the output column (.value), the part after it is the group index.
# Rows whose values are all NA are dropped, and the group index is discarded.
# rename_with() replaces the superseded rename_at()/vars() pair.
my_data %>%
  rename_with(~ str_c(., "_1"), c(type, code)) %>%
  pivot_longer(
    cols = -app,
    names_to = c(".value", "grp"),
    names_sep = "_",
    values_drop_na = TRUE
  ) %>%
  select(-grp)
# A tibble: 3 x 3
# app type code
# <chr> <chr> <dbl>
#1 a blue 1
#2 b red 2
#3 b blue 3

dplyr: group over several variables in a function

I want to have two lists of grouping variables. let's say list1 = c("var2","var3","var4") and list2 = c("var2","var3")
# Example input: var1 is the value to average; var2, var3 and var4 are the
# candidate grouping columns.
dta = data.frame(var1 = c(1:8),
var2 = c(rep("AA",4),rep("BB",4)),
var3 = rep(c("C","D"),4),
var4 = c(1,1,0,0,0,0,1,1))
# Hard-coded version of the desired computation: mean of var1 within the fine
# grouping (var2, var3, var4), then the mean of those means within the coarse
# grouping (var2, var3).
dta %>% group_by(var2,var3,var4) %>% summarise(mv1 = mean(var1)) %>%
group_by(var2,var3) %>% summarise(mv1_2 = mean(mv1))
How can I create a function like this
# Asker's sketch of a reusable version: list1 and list2 are meant to be
# character vectors of column names to group by (fine and coarse grouping).
# NOTE(review): the vectors are handed to group_by() as-is instead of being
# resolved to the columns they name; this is the part the question asks how
# to get right.
sample_fun = function(dta, list1, list2){
dta %>% group_by(list1) %>% summarise(mv1 = mean(var1)) %>%
group_by(list2) %>% summarise(mv1_2 = mean(mv1))
}
Here are two ways to do this -
Pure dplyr solution using across :
library(dplyr)
library(rlang)
# Two-stage grouped mean.
#   dta   data frame with a numeric column `var1`
#   list1 character vector of column names for the first (fine) grouping
#   list2 character vector of column names for the second (coarse) grouping
# Returns one row per list2 combination with `mv1_2`, the mean of the
# per-list1-group means of var1.
sample_fun <- function(dta, list1, list2) {
  # Stage 1: mean of var1 inside every list1 group.
  fine_means <- dta %>%
    group_by(across(all_of(list1))) %>%
    summarise(mv1 = mean(var1))

  # Stage 2: discard the leftover grouping, regroup by list2 and average the
  # stage-1 means.
  fine_means %>%
    ungroup() %>%
    group_by(across(all_of(list2))) %>%
    summarise(mv1_2 = mean(mv1))
}
sample_fun(dta, list1, list2)
# var2 var3 mv1_2
# <chr> <chr> <dbl>
#1 AA C 2
#2 AA D 3
#3 BB C 6
#4 BB D 7
Using non-standard evaluation with syms :
# Two-stage grouped mean, using rlang symbols instead of across().
#   dta   data frame with a numeric column `var1`
#   list1 character vector of column names for the first (fine) grouping
#   list2 character vector of column names for the second (coarse) grouping
# Returns one row per list2 combination with `mv1_2`, the mean of the
# per-list1-group means of var1.
sample_fun <- function(dta, list1, list2) {
  dta %>%
    # syms() turns the column names into symbols; !!! splices them into
    # group_by() as if they had been typed out.
    group_by(!!!syms(list1)) %>%
    summarise(mv1 = mean(var1)) %>%
    ungroup() %>%
    # all_of() is a tidyselect helper and is only meaningful inside a
    # selection context; syms() takes the plain character vector, exactly as
    # in the first group_by() above.
    group_by(!!!syms(list2)) %>%
    summarise(mv1_2 = mean(mv1))
}
sample_fun(dta, list1, list2)
# var2 var3 mv1_2
# <chr> <chr> <dbl>
#1 AA C 2
#2 AA D 3
#3 BB C 6
#4 BB D 7

Resources