Within tidyverse, create seq() column based on existing variables - r

Goal: Within tidyverse, create a sequence column called my_seq. Each seq() number should use existing columns for "from" (x column) and "to" (y column).
Bonus points for self-referential "dot" combo (and explanation of dot grammar).
boo <- tribble(
~ x, ~y,
5, 20,
6, 10,
2, 20)
# Desired results should reflect these results in new column:
seq(5, 20, by = 2)
#> [1] 5 7 9 11 13 15 17 19
seq(6, 10, by = 2)
#> [1] 6 8 10
seq(2, 20, by = 2)
#> [1] 2 4 6 8 10 12 14 16 18 20
# These straightforward solutions do not work
boo %>%
mutate(my_seq = seq(x, y, by = 2))
boo %>%
mutate(my_seq = seq(boo$x, boo$y, by = 2))
# The grammar of self-referential dots is super arcane, but
# here are some additional tries. All fail.
boo %>%
mutate(my_seq = map_int(boo, ~seq(.$x, .$y, by = 2)))
boo %>%
mutate(my_seq = seq(.$x, .$y, by = 2))

With purrr, you can use map2 to loop through x and y in parallel, which is similar to Map/mapply in base R but different syntax:
boo %>% mutate(my_seq = map2(x, y, seq, by=2))
# A tibble: 3 x 3
# x y my_seq
# <dbl> <dbl> <list>
#1 5 20 <dbl [8]>
#2 6 10 <dbl [3]>
#3 2 20 <dbl [10]>
my_seq is a column of list type, we can pull the column out to see its content:
boo %>% mutate(my_seq = map2(x, y, seq, by=2)) %>% pull(my_seq)
#[[1]]
#[1] 5 7 9 11 13 15 17 19
#[[2]]
#[1] 6 8 10
#[[3]]
# [1] 2 4 6 8 10 12 14 16 18 20

In general, when there are multiple arguments, pmap can be used as well
library(dplyr)
library(purrr)
res <- boo %>%
mutate(my_seq = pmap(., .f = ~seq(..1, ..2, by = 2)))
res
# A tibble: 3 x 3
# x y my_seq
# <dbl> <dbl> <list>
#1 5.00 20.0 <dbl [8]>
#2 6.00 10.0 <dbl [3]>
#3 2.00 20.0 <dbl [10]>
res$my_seq
#[[1]]
#[1] 5 7 9 11 13 15 17 19
#[[2]]
#[1] 6 8 10
#[[3]]
#[1] 2 4 6 8 10 12 14 16 18 20

Related

How to mutate to input the mean as a column after using nest?

I'm trying to learn how to use nest(), and I'm trying to nest by once of 3 time periods participants could be in and I want to add two columns. The first column is the overall mean, which I have figured out. Then, I want to nest by the time variable and create 3 datasets (which I have figured out) and then compute the group mean. I read that you should create a function (here, section 6.3.1), but my function keeps failing. How would I do this?
Also, please use nest or nest_by in the solution. I know I could use group_by(), like someone else did here, but in my actual data, I need these to be 3 separate datasets due to other computations that I need to do.
#Here's my setup and sample data
library(dplyr)
library(purrr)
library(tidyr)
set.seed(1414)
test <- tibble(id = c(1:100),
condition = c(rep(c("pre", "post"), 50)),
time = c(case_when(condition == "pre" ~ 0,
condition == "post" ~ sample(c(1, 2), size = c(100), replace = TRUE))),
score = case_when(time == 0 ~ 1,
time == 1 ~ 10,
time == 2 ~ 100))
#Here's what I tried
#Nesting the data (works)
nested_test <- test %>%
unite(col = "all_combos", c(condition, time)) %>%
mutate(score2 = mean(score)) %>%
nest_by(all_combos)
#Make mean function and map it (doesn't work)
my_mean <- function(data) {
mean(score, na.rm = T)
}
nested_test %>%
mutate(score3 = map(data, my_mean))
We may need to ungroup as there is rowwise attribute and then loop over the data with map and create the column with mutate on the nested data
library(dplyr)
library(purrr)
nested_test_new <- nested_test %>%
ungroup %>%
mutate(data = map(data, ~ .x %>%
mutate(score3 = mean(score, na.rm = TRUE))))
-output
nested_test_new
# A tibble: 3 × 2
all_combos data
<chr> <list>
1 post_1 <tibble [19 × 4]>
2 post_2 <tibble [31 × 4]>
3 pre_0 <tibble [50 × 4]>
> nested_test_new$data
[[1]]
# A tibble: 19 × 4
id score score2 score3
<int> <dbl> <dbl> <dbl>
1 2 10 33.4 10
2 4 10 33.4 10
3 14 10 33.4 10
4 16 10 33.4 10
5 18 10 33.4 10
6 28 10 33.4 10
7 30 10 33.4 10
8 32 10 33.4 10
9 38 10 33.4 10
10 44 10 33.4 10
11 48 10 33.4 10
12 60 10 33.4 10
13 64 10 33.4 10
14 78 10 33.4 10
15 80 10 33.4 10
16 86 10 33.4 10
17 92 10 33.4 10
18 96 10 33.4 10
19 100 10 33.4 10
[[2]]
# A tibble: 31 × 4
id score score2 score3
<int> <dbl> <dbl> <dbl>
1 6 100 33.4 100
2 8 100 33.4 100
3 10 100 33.4 100
4 12 100 33.4 100
...
Or another option is nest_mutate from nplyr
library(nplyr)
test %>%
unite(col = "all_combos", c(condition, time)) %>%
mutate(score2 = mean(score)) %>%
nest(data = -all_combos) %>%
nest_mutate(data, score3 = mean(score, na.rm = TRUE))
-output
# A tibble: 3 × 2
all_combos data
<chr> <list>
1 pre_0 <tibble [50 × 4]>
2 post_1 <tibble [19 × 4]>
3 post_2 <tibble [31 × 4]>

Concisely assign vector output of a function to multiple variables in dplyr

I am trying to assign the vector output (i.e. greater than length 1) of a function to multiple columns in a single operation (or at least as concisely as possible).
Take the range() function for example which returns as output a numeric vector of length 2 denoting the minimum and maximum, respectively. Let's say I want to compute the range() per group and assign the output to two columns min and max.
My current approach is combining summarize followed by manually adding a key and then re-shaping to wide format:
library(magrittr)
# create data
df <- dplyr::tibble(group = rep(letters[1:3], each = 3),
x = rpois(9, 10))
df
#> # A tibble: 9 x 2
#> group x
#> <chr> <int>
#> 1 a 8
#> 2 a 12
#> 3 a 8
#> 4 b 9
#> 5 b 14
#> 6 b 9
#> 7 c 11
#> 8 c 6
#> 9 c 12
# summarize gives two lines per group
range_df <- df %>%
dplyr::group_by(group) %>%
dplyr::summarize(range = range(x)) %>%
dplyr::ungroup()
range_df
#> # A tibble: 6 x 2
#> group range
#> <chr> <int>
#> 1 a 8
#> 2 a 12
#> 3 b 9
#> 4 b 14
#> 5 c 6
#> 6 c 12
# add key and reshape
range_df %>%
dplyr::mutate(key = rep(c("min", "max"), 3)) %>%
tidyr::pivot_wider(names_from = key, values_from = range)
#> # A tibble: 3 x 3
#> group min max
#> <chr> <int> <int>
#> 1 a 8 12
#> 2 b 9 14
#> 3 c 6 12
Is there a more elegant / concise alternative to this?
Edit:
Ideally the alternative solution could handle an arbitrary number of outputs (e.g. if the function returns an output with length 3 then 3 variables should be created).
# Writw a small function that does the job:
library(tidyverse)
f <- function(x){
setNames(data.frame(t(range(x))), c('min', 'max'))
}
df %>%
summarise(across(x, f, .unpack = TRUE), .by=group)
#> # A tibble: 3 × 3
#> group x_min x_max
#> <chr> <int> <int>
#> 1 a 10 13
#> 2 b 7 10
#> 3 c 10 12
If you are using older version of dplyr
df %>%
group_by(group)%>%
summarise(across(x, f))%>%
unpack(x)
#> # A tibble: 3 × 3
#> group min max
#> <chr> <int> <int>
#> 1 a 6 9
#> 2 b 7 12
#> 3 c 6 10
Based on onyambu's answer, I build a small generic function for this. There probably will be some edge cases, where this will not work.
out2col <- function(x, fun, out_names = c(), add_args = list()) {
tmp <- do.call(what = fun, args = c(list(x), add_args))
out <- data.frame(t(tmp))
if (length(out_names) != 0) {
if (length(tmp) != length(out_names)) {
stop("provided names did not match the number of outputs")
}
out <- setNames(object = out, nm = out_names)
}
return(out)
}
Examples without any additional parameters:
df %>%
summarise(across(x, out2col, .unpack = TRUE, fun = range),
.by=group)
Output:
# A tibble: 3 × 3
group x_X1 x_X2
<chr> <int> <int>
1 a 7 10
2 b 11 14
3 c 9 14
Examples with additional parameters:
df %>%
summarise(across(x, out2col, .unpack = TRUE, fun = quantile,
out_names = c("min", "max", "Q25"),
add_args = list(probs = c(0, 1, 0.25))
),
.by=group)
Output:
# A tibble: 3 × 4
group x_min x_max x_Q25
<chr> <dbl> <dbl> <dbl>
1 a 7 10 7.5
2 b 11 14 11.5
3 c 9 14 10
set.seed(1)
df <- dplyr::tibble(group = rep(letters[1:3], each = 3),
x = rpois(9, 10))
function
g <- function(x){
data.frame(min = min(x), max = max(x))
}
calling g:
df %>%
group_by(group) %>%
summarise(across(x, g, .unpack = TRUE))

Mutate column using array names

Context:
My data analysis involves manipulating ~100 different trials separately, and each trial has >1000 rows. Eventually, one step requires me to combine each trial with a column value from a different dataset. I plan to combine this dataset with each trial within an array using left_join() and "ID" as the key.
Dilemma
I want to mutate() the trial name into a new column labeled "ID". I feel like this should be a simple task, but I'm still a novice when working with lists and arrays.
Working Code
I don't know how to share .csv files, but you can save the example datasets as .csv files within a practice folder named "data".
library(tidyverse)
# Create practice dataset
df1 <- tibble(Time = seq(1, 5, by = 1),
Point = seq(6, 10, by = 1)) %>% print()
# A tibble: 5 x 2
Time Point
<dbl> <dbl>
1 1 6
2 2 7
3 3 8
4 4 9
5 5 10
df2 <- tibble(Time = seq(6, 10, by = 1),
Point = seq(1, 5, by = 1)) %>% print()
# A tibble: 5 x 2
Time Point
<dbl> <dbl>
1 6 1
2 7 2
3 8 3
4 9 4
5 10 5
write_csv(df1, file.path("data", "21May27_CtYJ10.csv")
write_csv(df2, file.path("data", "21May27_HrOW07.csv"))
This is the code I have working right now:
# Isolate .csv files from directory into a list
rawFiles_List <- list.files("data", pattern = ".csv", full = TRUE) %>% print()
# Naming scheme for files w/n list
trialDate <- list(str_sub(rawFiles_List, 13, 26)) %>%
print() # Adjust the substring to include date and trial
[[1]]
[1] "21May27_CtYJ10" "21May27_HrOW07"
trial <- list(str_sub(rawFiles_List, 21, 26)) %>% print() # Only include trial
[[1]]
[1] "CtYJ10" "HrOW07"
# Combine the list and list names into an array
rawFiles <- array(map(rawFiles_List, read_csv), dimnames = trialDate) %>% print()
Parsed with column specification:
cols(
Time = col_double(),
Point = col_double()
)
Parsed with column specification:
cols(
Time = col_double(),
Point = col_double()
)
$`21May27_CtYJ10`
# A tibble: 5 x 2
Time Point
<dbl> <dbl>
1 1 6
2 2 7
3 3 8
4 4 9
5 5 10
$`21May27_HrOW07`
# A tibble: 5 x 2
Time Point
<dbl> <dbl>
1 6 1
2 7 2
3 8 3
4 9 4
5 10 5
This partially does what I want:
map(rawFiles, ~ data.frame(.) %>% # Convert to dataframe
# Create a new column with trial name
mutate(ID = map(trial, paste)) %>% # Pastes the list, not the respective value
as_tibble(.)) # Convert back to tibble
$`21May27_CtYJ10`
# A tibble: 5 x 3
Time Point MouseID
<dbl> <dbl> <list>
1 1 6 <chr [2]>
2 2 7 <chr [2]>
3 3 8 <chr [2]>
4 4 9 <chr [2]>
5 5 10 <chr [2]>
$`21May27_HrOW07`
# A tibble: 5 x 3
Time Point MouseID
<dbl> <dbl> <list>
1 6 1 <chr [2]>
2 7 2 <chr [2]>
3 8 3 <chr [2]>
4 9 4 <chr [2]>
5 10 5 <chr [2]>
Question:
Can you please help me make a new column filled with their respective trial IDs? I am trying to use mostly tidyverse functions, but I'm open to Base-R functions, too. If you are able to give some explanation as how you match the list elements to the array elements or refer me to a helpful resource, that would be much appreciated.
Bonus Question:
I am working on how to save each file after all manipulations, but I'm not sure if I'm writing my for loop correctly. Could you provide some guidance as how I should edit my for loop? I'm using previous code as a guide, but I'm willing to scrap it if I'm over-complicating things. The following is what I have written so far:
SaveDate <- format(Sys.Date(), format = "%y%b%d")
for (i in 1:length(combFiles)) { # Dataset combing array of trials manipulated
filename <- vector("list", length(rawFiles)) # Vector to fill
filename[[i]] <- paste( # Fill vector with respective filenames
as.data.frame(trial)[[1]][i], "_mod_", SaveDate, ".csv", sep = "")
write.csv(file = filename[[i]],
modFiles[[i]], # Array of trials manipulated
sep = ",", row.names = FALSE, col.names = TRUE)
}
library(tidyverse)
# Create practice dataset
df1 <- tibble(Time = seq(1, 5, by = 1),
Point = seq(6, 10, by = 1)) %>% print()
#> # A tibble: 5 x 2
#> Time Point
#> <dbl> <dbl>
#> 1 1 6
#> 2 2 7
#> 3 3 8
#> 4 4 9
#> 5 5 10
df2 <- tibble(Time = seq(6, 10, by = 1),
Point = seq(1, 5, by = 1)) %>% print()
#> # A tibble: 5 x 2
#> Time Point
#> <dbl> <dbl>
#> 1 6 1
#> 2 7 2
#> 3 8 3
#> 4 9 4
#> 5 10 5
write_csv(df1, "21May27_CtYJ10.csv")
write_csv(df2, "21May27_HrOW07.csv")
rm(df1, df2)
The easiest is to use imap_*. This will automatically loop on all the files in your list and combine them if needed. For this to work, the file list must have names.
# Prepare raw file list with names equal to the values
rawFiles_List <- list.files(pattern = "^21May27") %>%
set_names()
rawFiles_List
#> 21May27_CtYJ10.csv 21May27_HrOW07.csv
#> "21May27_CtYJ10.csv" "21May27_HrOW07.csv"
imap_dfr(rawFiles_List,
~ read_csv(.x, col_types = "dd") %>%
add_column(source_file = .y))
#> # A tibble: 10 x 3
#> Time Point source_file
#> <dbl> <dbl> <chr>
#> 1 1 6 21May27_CtYJ10.csv
#> 2 2 7 21May27_CtYJ10.csv
#> 3 3 8 21May27_CtYJ10.csv
#> 4 4 9 21May27_CtYJ10.csv
#> 5 5 10 21May27_CtYJ10.csv
#> 6 6 1 21May27_HrOW07.csv
#> 7 7 2 21May27_HrOW07.csv
#> 8 8 3 21May27_HrOW07.csv
#> 9 9 4 21May27_HrOW07.csv
#> 10 10 5 21May27_HrOW07.csv
If you prefer to stay with a list of data frames and just add a column in each, use imap():
imap(rawFiles_List,
~ read_csv(.x, col_types = "dd") %>%
add_column(source_file = .y))
#> $`21May27_CtYJ10.csv`
#> # A tibble: 5 x 3
#> Time Point source_file
#> <dbl> <dbl> <chr>
#> 1 1 6 21May27_CtYJ10.csv
#> 2 2 7 21May27_CtYJ10.csv
#> 3 3 8 21May27_CtYJ10.csv
#> 4 4 9 21May27_CtYJ10.csv
#> 5 5 10 21May27_CtYJ10.csv
#>
#> $`21May27_HrOW07.csv`
#> # A tibble: 5 x 3
#> Time Point source_file
#> <dbl> <dbl> <chr>
#> 1 6 1 21May27_HrOW07.csv
#> 2 7 2 21May27_HrOW07.csv
#> 3 8 3 21May27_HrOW07.csv
#> 4 9 4 21May27_HrOW07.csv
#> 5 10 5 21May27_HrOW07.csv
Of course, if you manipulate the names of the filelist before running the map command, you can make sure the correct value is inserted in the column:
rawFiles_List <- list.files(pattern = "^21May27") %>%
set_names(str_sub(., 21L, 26L))
As for saving, I suggest you use iwalk(). I think your for loop is not doing what you want (you are reinitializing filename at each pass, erasing its previous content, probably not what you want).

Replace values of a column based on another column having as input a dataframe

With the base::replace function I want to change some values of column z based on the values of column y
library(tidyverse)
(df <- tibble(y = 10:13, z = 20:23))
#> # A tibble: 4 x 2
#> y z
#> <int> <int>
#> 1 10 20
#> 2 11 21
#> 3 12 22
#> 4 13 23
I have the data.frame val where column a the value to be used as condition and column b will be the replacement value.
(val <- tibble(a = c(10, 12), b = c(100, 200)))
#> # A tibble: 2 x 2
#> a b
#> <dbl> <dbl>
#> 1 10 100
#> 2 12 200
Using the following approach it is possible to get the desired result, but it only works if all the values inside val are inside df
df %>% mutate(z = replace(z, y %in% val$a, val$b))
#> # A tibble: 4 x 2
#> y z
#> <int> <dbl>
#> 1 10 100
#> 2 11 21
#> 3 12 200
#> 4 13 23
For example, if I update val to have values that are not in df, then:
(val <- tibble(a = c(1, 10, 12), b = c(1, 100, 200)))
#> # A tibble: 3 x 2
#> a b
#> <dbl> <dbl>
#> 1 1 1
#> 2 10 100
#> 3 12 200
and I run the code again ...
df %>% mutate(z = replace(z, y %in% val$a, val$b))
#> Warning in x[list] <- values: number of items to replace is not a multiple of
#> replacement length
#> # A tibble: 4 x 2
#> y z
#> <int> <dbl>
#> 1 10 1
#> 2 11 21
#> 3 12 100
#> 4 13 23
There are errors ... How can I fix this?
Created on 2021-02-19 by the reprex package (v1.0.0)
One possibility might be:
df %>%
mutate(z = coalesce(b[match(y, a)], z))
y z
<int> <dbl>
1 10 100
2 11 200
3 12 22
We don't need any mapas 'a', 'b' and the number of rows of 'df' have the same length. So, an elementwise comparison with == can be done. Instead of replace, it may be better with ifelse/case_when etc as replace values should be of the same length as the list condition TRUE elements
library(dpyr)
df %>%
mutate(z = case_when(a == y ~ as.integer(b), TRUE ~ z))
-output
# A tibble: 3 x 2
# y z
# <int> <int>
#1 10 100
#2 11 200
#3 12 22
Or using base R
df$z <- with(df, ifelse(a == y, b, z))
In the OP's code, there is a difference in length when we do
replace(x = z, y == .x, values = .y)
where 'z' will be the full column length, .x , .y will be each row element
Update
Based on the updated data, we could a join and then use coalesce
df %>%
left_join(val, by = c('y' = 'a')) %>%
transmute(y, z = coalesce(b, z))
# A tibble: 4 x 2
# y z
# <dbl> <dbl>
#1 10 100
#2 11 21
#3 12 200
#4 13 23
A base R option with replace
transform(
df,
z = replace(z, na.omit(match(y, a)), b[na.omit(match(y, a))])
)
gives
y z
1 10 100
2 11 200
3 12 22

Nesting duplicate variables when joining with dplyr in R

I'm joining data frames (tibbles) that have duplicated columns that I do not want to join. Example below is what I would usually do (joining by i, but not a or b):
library(dplyr)
df1 <- tibble(i = letters[1:3], a = 1:3, b = 4:6)
df2 <- tibble(i = letters[1:3], a = 11:13, b = 14:16)
d <- full_join(df1, df2, by ="i")
d
#> # A tibble: 3 × 5
#> i a.x b.x a.y b.y
#> <chr> <int> <int> <int> <int>
#> 1 a 1 4 11 14
#> 2 b 2 5 12 15
#> 3 c 3 6 13 16
I want these duplicated variables to be returned as nested lists such as the output created below:
tibble(
i = letters[1:3],
a = list(c(1, 11), c(2, 12), c(3, 13)),
b = list(c(4, 14), c(5, 15), c(6, 16))
)
#> # A tibble: 3 × 3
#> i a b
#> <chr> <list> <list>
#> 1 a <dbl [2]> <dbl [2]>
#> 2 b <dbl [2]> <dbl [2]>
#> 3 c <dbl [2]> <dbl [2]>
Is there a simple way to do such a thing?
Aside, I've been playing around (unsuccessfully) with various stringr and tidyr methods. Here's an example that throws an error:
library(stringr)
library(tidyr)
# Find any variables with .x or .y
dup_var <- d %>% select(matches("\\.[xy]")) %>% names()
# Condense to the stems (original names) of these variables
dup_var_stems <- dup_var %>% str_replace("(\\.[x|y])+", "") %>% unique()
# For each stem, try to nest relevant data into a single variable
for (stem in dup_var_stems) {
d <- d %>% nest_(key_col = stem, nest_cols = names(d)[str_detect(names(d), paste0(stem, "[$|\\.]"))])
}
UPDATE
After answers from #Sotos and #conor, I'll mention that the solution needs to generalise to multiple joining and duplicated columns over many data frames. Below is an example where joining is done on five data frames by two columns (i and j). This creates five duplicated versions of columns a and b, with plenty of unique columns too c:g. One problem is that duplicating over so many data frames results in duplicated versions having no suffix, .x, .x.x, and so on. So simple regex match for .x|.y will miss the no-suffix version of the column.
library(dplyr)
library(purrr)
id_cols <- tibble(i = c("x", "x", "y", "y"),
j = c(1, 2, 1, 2))
df1 <- id_cols %>% cbind(tibble(a = 1:4, b = 5:8, c = 21:24))
df2 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, d = 31:34))
df3 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, e = 31:34))
df4 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, f = 31:34))
df5 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, g = 31:34))
datalist <- list(df1, df2, df3, df4, df5)
d <- reduce(datalist, full_join, by = c("i", "j"))
d
#> i j a.x b.x c a.y b.y d a.x.x b.x.x e a.y.y b.y.y f a b g
#> 1 x 1 1 5 21 2 6 31 2 6 31 2 6 31 2 6 31
#> 2 x 2 2 6 22 3 7 32 3 7 32 3 7 32 3 7 32
#> 3 y 1 3 7 23 4 8 33 4 8 33 4 8 33 4 8 33
#> 4 y 2 4 8 24 5 9 34 5 9 34 5 9 34 5 9 34
Here is one attempt,
library(dplyr)
library(tidyr)
melt(d, id.vars = 'i') %>%
group_by(a = sub('\\..*', '', variable), i) %>%
summarise(new = list(value)) %>%
spread(a, new)
# A tibble: 3 × 3
# i a b
#* <chr> <list> <list>
#1 a <int [2]> <int [2]>
#2 b <int [2]> <int [2]>
#3 c <int [2]> <int [2]>
#With structure
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 3 obs. of 3 variables:
$ i: chr "a" "b" "c"
$ a:List of 3
..$ : int 1 11
..$ : int 2 12
..$ : int 3 13
$ b:List of 3
..$ : int 4 14
..$ : int 5 15
..$ : int 6 16
#Or via reshape2 package
library(dplyr)
library(reshape2)
d1 <- melt(d, id.vars = 'i') %>%
group_by(a = sub('\\..*', '', variable), i) %>%
summarise(new = list(value))
d2 <- dcast(d1, i ~ a, value.var = 'new')
#d2
# i a b
#1 a 1, 11 4, 14
#2 b 2, 12 5, 15
#3 c 3, 13 6, 16
#with structure:
str(d2)
'data.frame': 3 obs. of 3 variables:
$ i: chr "a" "b" "c"
$ a:List of 3
..$ : int 1 11
..$ : int 2 12
..$ : int 3 13
$ b:List of 3
..$ : int 4 14
..$ : int 5 15
..$ : int 6 16
EDIT
To follow your thought,
library(dplyr)
library(reshape2)
library(purrr)
library(tidyr)
df <- melt(d, id.vars = c(names(d)[!grepl('a|b', names(d))]))
dots <- names(df)[!grepl('value', names(df))] %>% map(as.symbol)
df %>% mutate(variable = sub('\\..*', '', variable)) %>%
group_by_(.dots = dots) %>%
summarise(new = list(value)) %>%
spread(variable, new) %>%
ungroup()
# A tibble: 4 × 9
# i j c d e f g a b
#* <chr> <dbl> <int> <int> <int> <int> <int> <list> <list>
#1 x 1 21 31 31 31 31 <int [5]> <int [5]>
#2 x 2 22 32 32 32 32 <int [5]> <int [5]>
#3 y 1 23 33 33 33 33 <int [5]> <int [5]>
#4 y 2 24 34 34 34 34 <int [5]> <int [5]>
Slightly more verbose than Sotos answer, but this will also work.
library(dplyr)
library(tidyr)
library(stringr)
d_tidy <- gather(d, col, val, a.x:b.y, -i)
d_tidy$col <- str_replace(d_tidy$col, ".x|.y", "")
d_tidy %>% group_by(i, col) %>%
summarise(val = list(val)) %>%
spread(col, val) %>%
ungroup()
i a b
<fctr> <list> <list>
1 a <int [2]> <int [2]>
2 b <int [2]> <int [2]>
3 c <int [2]> <int [2]>
If you want to use nest to create lists of dataframes you can do this instead
d_tidy <- gather(d, col, val, a.x:b.y, -i)
d_tidy$col <- str_replace(d_tidy$col, ".x|.y", "")
d_tidy %>%
group_by(i, col) %>%
nest(col) %>%
spread(col, data)
i a b
<fctr> <list> <list>
1 a <tbl_df [2,0]> <tbl_df [2,0]>
2 b <tbl_df [2,0]> <tbl_df [2,0]>
3 c <tbl_df [2,0]> <tbl_df [2,0]>
After updating the question, I arrived at the following based on the melt() solution provided by #Sotos (so please upvote that solution too if you think this works).
The following is a function that should take a data frame like the ones described, and nest duplicated columns. See comments throughout for explanation.
Create the problem data frame:
library(dplyr)
library(purrr)
id_cols <- tibble(i = c("x", "x", "y", "y"),
j = c(1, 2, 1, 2))
df1 <- id_cols %>% cbind(tibble(a = 1:4, b = 5:8, c = 21:24))
df2 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, d = 31:34))
df3 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, e = 31:34))
df4 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, f = 31:34))
df5 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, g = 31:34))
datalist <- list(df1, df2, df3, df4, df5)
d <- reduce(datalist, full_join, by = c("i", "j"))
d
#> i j a.x b.x c a.y b.y d a.x.x b.x.x e a.y.y b.y.y f a b g
#> 1 x 1 1 5 21 2 6 31 2 6 31 2 6 31 2 6 31
#> 2 x 2 2 6 22 3 7 32 3 7 32 3 7 32 3 7 32
#> 3 y 1 3 7 23 4 8 33 4 8 33 4 8 33 4 8 33
#> 4 y 2 4 8 24 5 9 34 5 9 34 5 9 34 5 9 34
Create function nest_duplicates()
# Function to nest duplicated columns after joining multiple data frames
#
# Args:
# df Data frame of joined data frames with duplicated columns.
# suffixes Character string to match suffixes. E.g., the default "\\.[xy]"
# finds any columns ending with .x or .y
#
# Depends on: dplyr, tidyr, purrr, stringr
nest_duplicated <- function(df, suffixes = "\\.[xy]") {
# Search string to match any duplicated variables
search_string <- df %>%
dplyr::select(dplyr::matches(suffixes)) %>%
names() %>%
stringr::str_replace_all(suffixes, "") %>%
unique() %>%
stringr::str_c(collapse = "|") %>%
stringr::str_c("(", ., ")($|", suffixes, ")")
# Gather duplicated variables and convert names to stems
df <- df %>%
tidyr::gather(variable, value, dplyr::matches(search_string)) %>%
dplyr::mutate(variable = stringr::str_replace_all(variable, suffixes, ""))
# Group by all columns except value to convert duplicated rows into list, then
# spread by variable (var)
dots <- names(df)[!stringr::str_detect(names(df), "value")] %>% purrr::map(as.symbol)
df %>%
dplyr::group_by_(.dots = dots) %>%
dplyr::summarise(new = list(value)) %>%
tidyr::spread(variable, new) %>%
dplyr::ungroup()
}
Apply nest_duplicates():
nest_duplicated(d)
#> # A tibble: 4 × 9
#> i j c d e f g a b
#> * <chr> <dbl> <int> <int> <int> <int> <int> <list> <list>
#> 1 x 1 21 31 31 31 31 <int [5]> <int [5]>
#> 2 x 2 22 32 32 32 32 <int [5]> <int [5]>
#> 3 y 1 23 33 33 33 33 <int [5]> <int [5]>
#> 4 y 2 24 34 34 34 34 <int [5]> <int [5]>
Updates/improvements welcome!

Resources