Nesting duplicate variables when joining with dplyr in R - r

I'm joining data frames (tibbles) that have duplicated columns that I do not want to join. Example below is what I would usually do (joining by i, but not a or b):
library(dplyr)
df1 <- tibble(i = letters[1:3], a = 1:3, b = 4:6)
df2 <- tibble(i = letters[1:3], a = 11:13, b = 14:16)
d <- full_join(df1, df2, by ="i")
d
#> # A tibble: 3 × 5
#> i a.x b.x a.y b.y
#> <chr> <int> <int> <int> <int>
#> 1 a 1 4 11 14
#> 2 b 2 5 12 15
#> 3 c 3 6 13 16
I want these duplicated variables to be returned as nested lists such as the output created below:
tibble(
i = letters[1:3],
a = list(c(1, 11), c(2, 12), c(3, 13)),
b = list(c(4, 14), c(5, 15), c(6, 16))
)
#> # A tibble: 3 × 3
#> i a b
#> <chr> <list> <list>
#> 1 a <dbl [2]> <dbl [2]>
#> 2 b <dbl [2]> <dbl [2]>
#> 3 c <dbl [2]> <dbl [2]>
Is there a simple way to do such a thing?
Aside, I've been playing around (unsuccessfully) with various stringr and tidyr methods. Here's an example that throws an error:
library(stringr)
library(tidyr)
# Find any variables with .x or .y
dup_var <- d %>% select(matches("\\.[xy]")) %>% names()
# Condense to the stems (original names) of these variables
dup_var_stems <- dup_var %>% str_replace("(\\.[x|y])+", "") %>% unique()
# For each stem, try to nest relevant data into a single variable
for (stem in dup_var_stems) {
d <- d %>% nest_(key_col = stem, nest_cols = names(d)[str_detect(names(d), paste0(stem, "[$|\\.]"))])
}
UPDATE
After answers from #Sotos and #conor, I'll mention that the solution needs to generalise to multiple joining and duplicated columns over many data frames. Below is an example where joining is done on five data frames by two columns (i and j). This creates five duplicated versions of columns a and b, with plenty of unique columns too c:g. One problem is that duplicating over so many data frames results in duplicated versions having no suffix, .x, .x.x, and so on. So simple regex match for .x|.y will miss the no-suffix version of the column.
library(dplyr)
library(purrr)
id_cols <- tibble(i = c("x", "x", "y", "y"),
j = c(1, 2, 1, 2))
df1 <- id_cols %>% cbind(tibble(a = 1:4, b = 5:8, c = 21:24))
df2 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, d = 31:34))
df3 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, e = 31:34))
df4 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, f = 31:34))
df5 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, g = 31:34))
datalist <- list(df1, df2, df3, df4, df5)
d <- reduce(datalist, full_join, by = c("i", "j"))
d
#> i j a.x b.x c a.y b.y d a.x.x b.x.x e a.y.y b.y.y f a b g
#> 1 x 1 1 5 21 2 6 31 2 6 31 2 6 31 2 6 31
#> 2 x 2 2 6 22 3 7 32 3 7 32 3 7 32 3 7 32
#> 3 y 1 3 7 23 4 8 33 4 8 33 4 8 33 4 8 33
#> 4 y 2 4 8 24 5 9 34 5 9 34 5 9 34 5 9 34

Here is one attempt,
library(dplyr)
library(tidyr)
melt(d, id.vars = 'i') %>%
group_by(a = sub('\\..*', '', variable), i) %>%
summarise(new = list(value)) %>%
spread(a, new)
# A tibble: 3 × 3
# i a b
#* <chr> <list> <list>
#1 a <int [2]> <int [2]>
#2 b <int [2]> <int [2]>
#3 c <int [2]> <int [2]>
#With structure
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 3 obs. of 3 variables:
$ i: chr "a" "b" "c"
$ a:List of 3
..$ : int 1 11
..$ : int 2 12
..$ : int 3 13
$ b:List of 3
..$ : int 4 14
..$ : int 5 15
..$ : int 6 16
#Or via reshape2 package
library(dplyr)
library(reshape2)
d1 <- melt(d, id.vars = 'i') %>%
group_by(a = sub('\\..*', '', variable), i) %>%
summarise(new = list(value))
d2 <- dcast(d1, i ~ a, value.var = 'new')
#d2
# i a b
#1 a 1, 11 4, 14
#2 b 2, 12 5, 15
#3 c 3, 13 6, 16
#with structure:
str(d2)
'data.frame': 3 obs. of 3 variables:
$ i: chr "a" "b" "c"
$ a:List of 3
..$ : int 1 11
..$ : int 2 12
..$ : int 3 13
$ b:List of 3
..$ : int 4 14
..$ : int 5 15
..$ : int 6 16
EDIT
To follow your thought,
library(dplyr)
library(reshape2)
library(purrr)
library(tidyr)
df <- melt(d, id.vars = c(names(d)[!grepl('a|b', names(d))]))
dots <- names(df)[!grepl('value', names(df))] %>% map(as.symbol)
df %>% mutate(variable = sub('\\..*', '', variable)) %>%
group_by_(.dots = dots) %>%
summarise(new = list(value)) %>%
spread(variable, new) %>%
ungroup()
# A tibble: 4 × 9
# i j c d e f g a b
#* <chr> <dbl> <int> <int> <int> <int> <int> <list> <list>
#1 x 1 21 31 31 31 31 <int [5]> <int [5]>
#2 x 2 22 32 32 32 32 <int [5]> <int [5]>
#3 y 1 23 33 33 33 33 <int [5]> <int [5]>
#4 y 2 24 34 34 34 34 <int [5]> <int [5]>

Slightly more verbose than Sotos answer, but this will also work.
library(dplyr)
library(tidyr)
library(stringr)
d_tidy <- gather(d, col, val, a.x:b.y, -i)
d_tidy$col <- str_replace(d_tidy$col, ".x|.y", "")
d_tidy %>% group_by(i, col) %>%
summarise(val = list(val)) %>%
spread(col, val) %>%
ungroup()
i a b
<fctr> <list> <list>
1 a <int [2]> <int [2]>
2 b <int [2]> <int [2]>
3 c <int [2]> <int [2]>
If you want to use nest to create lists of dataframes you can do this instead
d_tidy <- gather(d, col, val, a.x:b.y, -i)
d_tidy$col <- str_replace(d_tidy$col, ".x|.y", "")
d_tidy %>%
group_by(i, col) %>%
nest(col) %>%
spread(col, data)
i a b
<fctr> <list> <list>
1 a <tbl_df [2,0]> <tbl_df [2,0]>
2 b <tbl_df [2,0]> <tbl_df [2,0]>
3 c <tbl_df [2,0]> <tbl_df [2,0]>

After updating the question, I arrived at the following based on the melt() solution provided by #Sotos (so please upvote that solution too if you think this works).
The following is a function that should take a data frame like the ones described, and nest duplicated columns. See comments throughout for explanation.
Create the problem data frame:
library(dplyr)
library(purrr)
id_cols <- tibble(i = c("x", "x", "y", "y"),
j = c(1, 2, 1, 2))
df1 <- id_cols %>% cbind(tibble(a = 1:4, b = 5:8, c = 21:24))
df2 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, d = 31:34))
df3 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, e = 31:34))
df4 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, f = 31:34))
df5 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, g = 31:34))
datalist <- list(df1, df2, df3, df4, df5)
d <- reduce(datalist, full_join, by = c("i", "j"))
d
#> i j a.x b.x c a.y b.y d a.x.x b.x.x e a.y.y b.y.y f a b g
#> 1 x 1 1 5 21 2 6 31 2 6 31 2 6 31 2 6 31
#> 2 x 2 2 6 22 3 7 32 3 7 32 3 7 32 3 7 32
#> 3 y 1 3 7 23 4 8 33 4 8 33 4 8 33 4 8 33
#> 4 y 2 4 8 24 5 9 34 5 9 34 5 9 34 5 9 34
Create function nest_duplicates()
# Function to nest duplicated columns after joining multiple data frames
#
# Args:
# df Data frame of joined data frames with duplicated columns.
# suffixes Character string to match suffixes. E.g., the default "\\.[xy]"
# finds any columns ending with .x or .y
#
# Depends on: dplyr, tidyr, purrr, stringr
nest_duplicated <- function(df, suffixes = "\\.[xy]") {
# Search string to match any duplicated variables
search_string <- df %>%
dplyr::select(dplyr::matches(suffixes)) %>%
names() %>%
stringr::str_replace_all(suffixes, "") %>%
unique() %>%
stringr::str_c(collapse = "|") %>%
stringr::str_c("(", ., ")($|", suffixes, ")")
# Gather duplicated variables and convert names to stems
df <- df %>%
tidyr::gather(variable, value, dplyr::matches(search_string)) %>%
dplyr::mutate(variable = stringr::str_replace_all(variable, suffixes, ""))
# Group by all columns except value to convert duplicated rows into list, then
# spread by variable (var)
dots <- names(df)[!stringr::str_detect(names(df), "value")] %>% purrr::map(as.symbol)
df %>%
dplyr::group_by_(.dots = dots) %>%
dplyr::summarise(new = list(value)) %>%
tidyr::spread(variable, new) %>%
dplyr::ungroup()
}
Apply nest_duplicates():
nest_duplicated(d)
#> # A tibble: 4 × 9
#> i j c d e f g a b
#> * <chr> <dbl> <int> <int> <int> <int> <int> <list> <list>
#> 1 x 1 21 31 31 31 31 <int [5]> <int [5]>
#> 2 x 2 22 32 32 32 32 <int [5]> <int [5]>
#> 3 y 1 23 33 33 33 33 <int [5]> <int [5]>
#> 4 y 2 24 34 34 34 34 <int [5]> <int [5]>
Updates/improvements welcome!

Related

How to join dataframes by a list column and column containing single values

I am trying to join two dataframes together based on values in one column being present in the list of values in the column of the dataframe to join with
df1 <- tibble(Group = c("Group_1", "Group_2", "Group_3"),
Members = list(letters[1:3],
letters[4:6],
letters[7:12]))
df2 <- tibble(Letters = c("a","g","f","b"),
Value = 1:4)
The final data frame would look like:
df3 <- tibble(Letters = c("a","g","f","b"),
Value = 1:4,
Group = c("Group_1", "Group_3", "Group_2", "Group_1"),
Members = list(letters[1:3],
letters[7:12],
letters[4:6],
letters[1:3]))
Ideally, this would be done using dplyr or another tidyverse package
You could use
library(dplyr)
library(tidyr)
df1 %>%
unnest(Members) %>%
rename(Letters = Members) %>%
# left_join(df2, by = "Letters") %>%
# drop_na() %>%
inner_join(df2, by = "Letters") %>% # (c) by akrun
right_join(df1, by = "Group")
to get
# A tibble: 4 x 4
Group Letters Value Members
<chr> <chr> <int> <list>
1 Group_1 a 1 <chr [3]>
2 Group_1 b 4 <chr [3]>
3 Group_2 f 3 <chr [3]>
4 Group_3 g 2 <chr [6]>
Or using base R
cbind(df2, df1[sort(unlist(lapply(df1$Members, function(x) match(df2$Letters, x)))),])
Letters Value Group Members
1 a 1 Group_1 a, b, c
2 g 2 Group_1 a, b, c
3 f 3 Group_2 d, e, f
4 b 4 Group_3 g, h, i, j, k, l
An alternative with a single join:
library(tidyr)
library(dplyr)
library(purrr)
df1 %>%
mutate(Letters = map(Members, ~ .x[.x %in% df2$Letters])) %>%
unnest(Letters) %>%
left_join(df2)
Joining, by = "Letters"
# A tibble: 4 x 4
Group Members Letters Value
<chr> <list> <chr> <int>
1 Group_1 <chr [3]> a 1
2 Group_1 <chr [3]> b 4
3 Group_2 <chr [3]> f 3
4 Group_3 <chr [6]> g 2

How to get the pairwise difference of all values within uneven categories in R

I found solutions for simple vectors, but is there a way to make all pairwise differences using dplyr or base R for all the elements in a category?
library(tidyverse)
x = 1:10
y = rep(letters[1:5],each=2)
z = rep(1:2,length.out =10)
df = data.frame(x,y, z)
df = rbind(df,c(11,"e",3))
df$verif = paste0(df$y,df$z)
df$x = as.numeric(df$x)
df %>%
group_by(y) %>%
summarise(Diff = abs(x - lag(x)))
gives:
`summarise()` regrouping output by 'y' (override with `.groups` argument)
# A tibble: 11 x 2
# Groups: y [5]
y Diff
<chr> <dbl>
1 a NA
2 a 1
3 b NA
4 b 1
5 c NA
6 c 1
7 d NA
8 d 1
9 e NA
10 e 1
11 e 1
In this example, it's only using the previous value in the data frame, therefore missing pairwise differences (look at 9, 10 and 11 for group "e" ).
Is there a way to get all the pairwise differences in each category? Keeping track of the pairwise differences would be useful as well (e.g., e1 with e2 = 1, e2 with e3 is = 1 and e1 with e3 is =2)
I tired the outer() function but wasn't able to make it work as well as the dist() function.
I continued to try and found this:
my.df=df %>%
group_by(y) %>%
summarise(Diff = combn(x,2,diff))
my.df
# A tibble: 7 x 2
# Groups: y [5]
y Diff
<chr> <dbl>
1 a 1
2 b 1
3 c 1
4 d 1
5 e 1
6 e 2
7 e 1
I just now need to get which pairwise difference was calculated...
Continued again and got this mess:
my.df=df %>%
group_by(y) %>%
summarise(Diff = combn(x,2,diff),
test = combn(verif,2,paste, simplify = FALSE)) %>%
mutate(test2 = paste0(test, collapse = "-"))
my.df
> my.df
# A tibble: 7 x 4
# Groups: y [5]
y Diff test test2
<chr> <dbl> <list> <chr>
1 a 1 <chr [2]> "c(\"a1\", \"a2\")"
2 b 1 <chr [2]> "c(\"b1\", \"b2\")"
3 c 1 <chr [2]> "c(\"c1\", \"c2\")"
4 d 1 <chr [2]> "c(\"d1\", \"d2\")"
5 e 1 <chr [2]> "c(\"e1\", \"e2\")-c(\"e1\", \"e3\")-c(\"e2\", \"e3\")"
6 e 2 <chr [2]> "c(\"e1\", \"e2\")-c(\"e1\", \"e3\")-c(\"e2\", \"e3\")"
7 e 1 <chr [2]> "c(\"e1\", \"e2\")-c(\"e1\", \"e3\")-c(\"e2\", \"e3\")"
Got it:
library(tidyverse)
x = 1:10
y = rep(letters[1:5],each=2)
z = rep(1:2,length.out =10)
df = data.frame(x,y, z)
df = rbind(df,c(11,"e",3))
df$verif = paste0(df$y,df$z)
df$x = as.numeric(df$x)
my.df=df %>%
group_by(y) %>%
summarise(Diff = combn(x,2,diff),
test = combn(verif,2,paste, simplify = FALSE)) %>%
mutate(test2 = unlist(lapply(test, function(x)paste(x,collapse="-")))) %>%
select(-test)
Here is the output
my.df
# A tibble: 7 x 3
# Groups: y [5]
y Diff test2
<chr> <dbl> <chr>
1 a 1 a1-a2
2 b 1 b1-b2
3 c 1 c1-c2
4 d 1 d1-d2
5 e 1 e1-e2
6 e 2 e1-e3
7 e 1 e2-e3
You could do:
library(tidyverse)
df %>%
group_by(y) %>%
summarise(result = combn(seq_along(x), 2, function(i)
list(test1 = diff(x[i]), #The difference
test2 = paste0(verif[i], collapse = '-')), # The pairs
simplify = FALSE),
.groups = 'drop') %>%
unnest_wider(result)
# A tibble: 7 x 3
y test1 test2
<chr> <dbl> <chr>
1 a 1 a1-a2
2 b 1 b1-b2
3 c 1 c1-c2
4 d 1 d1-d2
5 e 1 e1-e2
6 e 2 e1-e3
7 e 1 e2-e3

R unnest multiple columns

Any functional approach to unnest multiple columns of different sizes?
Example:
library(tidyr)
library(dplyr)
my_list <- list(year = 2018:2020, period = 1, id = c(17,35))
expand_grid(my_list) %>%
pivot_wider(
names_from = my_list,
values_from = my_list
) %>%
rename_at(., names(.), ~ names(my_list))
# A tibble: 1 x 3
year period id
<named list> <named list> <named list>
1 <int [3]> <dbl [1]> <dbl [2]>
expand_grid(my_list) %>%
pivot_wider(
names_from = my_list,
values_from = my_list
) %>%
rename_at(., names(.), ~ names(my_list)) %>%
unnest(cols = names(my_list))
Erro: Incompatible lengths: 3, 2.
unnest requires column names, is it possible for a string vector?
Expected:
# A tibble: 1 x 3
year period id
<int> <int> <int>
1 2018 1 17
2 2019 1 17
3 2020 1 17
4 2018 1 35
5 2019 1 35
6 2020 1 35
We can use cross_df from purrr :
purrr::cross_df(my_list)
# year period id
# <int> <dbl> <dbl>
#1 2018 1 17
#2 2019 1 17
#3 2020 1 17
#4 2018 1 35
#5 2019 1 35
#6 2020 1 35
Or in base R use expand.grid with do.call :
do.call(expand.grid, my_list)

R collapse column to form numeric list

In R hoe do I collapse column to form another column with numeric lists types.
like we define numeric list as l = c(1,2,3)
df <- read.table(text = "X Y
a 26
a 3
a 24
b 8
b 1
b 4
", header = TRUE)
I am trying this with dplyr but it gives me character list column
> df %>% group_by(X) %>% summarise(lst= paste0(Y, collapse = ","))
# A tibble: 2 x 2
X lst
<fct> <chr>
1 a 26,3,24
2 b 8,1,4
group by X then summarise Y as list
library(dplyr)
out <- df %>%
group_by(X) %>%
summarise(Y = list(Y))
out
# A tibble: 2 x 2
# X Y
# <fct> <list>
#1 a <int [3]>
#2 b <int [3]>
The Y column now looks like this
out$Y
#[[1]]
#[1] 26 3 24
#
#[[2]]
#[1] 8 1 4
nest seems to be another option but this would result in a list column of tibbles (not what you want I think)
df %>%
group_by(X) %>%
nest()
# A tibble: 2 x 2
# X data
# <fct> <list>
#1 a <tibble [3 × 1]>
#2 b <tibble [3 × 1]>
A data.table solution:
library(data.table)
dt <- as.data.table(df)[, list(Y=list(Y)), by="X"]
> dt
X Y
1: a 26, 3,24
2: b 8,1,4
> dt$Y
[[1]]
[1] 26 3 24
[[2]]
[1] 8 1 4

Within tidyverse, create seq() column based on existing variables

Goal: Within tidyverse, create a sequence column called my_seq. Each seq() number should use existing columns for "from" (x column) and "to" (y column).
Bonus points for self-referential "dot" combo (and explanation of dot grammar).
boo <- tribble(
~ x, ~y,
5, 20,
6, 10,
2, 20)
# Desired results should reflect these results in new column:
seq(5, 20, by = 2)
#> [1] 5 7 9 11 13 15 17 19
seq(6, 10, by = 2)
#> [1] 6 8 10
seq(2, 20, by = 2)
#> [1] 2 4 6 8 10 12 14 16 18 20
# These straightforward solutions do not work
boo %>%
mutate(my_seq = seq(x, y, by = 2))
boo %>%
mutate(my_seq = seq(boo$x, boo$y, by = 2))
# The grammar of self-referential dots is super arcane, but
# here are some additional tries. All fail.
boo %>%
mutate(my_seq = map_int(boo, ~seq(.$x, .$y, by = 2)))
boo %>%
mutate(my_seq = seq(.$x, .$y, by = 2))
With purrr, you can use map2 to loop through x and y in parallel, which is similar to Map/mapply in base R but different syntax:
boo %>% mutate(my_seq = map2(x, y, seq, by=2))
# A tibble: 3 x 3
# x y my_seq
# <dbl> <dbl> <list>
#1 5 20 <dbl [8]>
#2 6 10 <dbl [3]>
#3 2 20 <dbl [10]>
my_seq is a column of list type, we can pull the column out to see its content:
boo %>% mutate(my_seq = map2(x, y, seq, by=2)) %>% pull(my_seq)
#[[1]]
#[1] 5 7 9 11 13 15 17 19
#[[2]]
#[1] 6 8 10
#[[3]]
# [1] 2 4 6 8 10 12 14 16 18 20
In general, when there are multiple arguments, pmap can be used as well
library(dplyr)
library(purrr)
res <- boo %>%
mutate(my_seq = pmap(., .f = ~seq(..1, ..2, by = 2)))
res
# A tibble: 3 x 3
# x y my_seq
# <dbl> <dbl> <list>
#1 5.00 20.0 <dbl [8]>
#2 6.00 10.0 <dbl [3]>
#3 2.00 20.0 <dbl [10]>
res$my_seq
#[[1]]
#[1] 5 7 9 11 13 15 17 19
#[[2]]
#[1] 6 8 10
#[[3]]
#[1] 2 4 6 8 10 12 14 16 18 20

Resources