left_join on a nested list - r

I have a nested df x and an unnested df y.
How can I join these two together so that the final output is a a single row with the id and val columns from x and a new column for each of the respective num values in order of appearance, num_1, num_2 ...?
library(tidyverse)
x <- tibble(id = list(letters[1:6]), val = 13)
x
#> # A tibble: 1 x 2
#> id val
#> <list> <dbl>
#> 1 <chr [6]> 13
y <- tibble(id = letters[1:6], num = rnorm(6))
y
#> # A tibble: 6 x 2
#> id num
#> <chr> <dbl>
#> 1 a 0.532
#> 2 b -0.106
#> 3 c -0.105
#> 4 d 0.973
#> 5 e -0.825
#> 6 f -0.951
map2(x, y, left_join, by = 'id')
Error in UseMethod("left_join"): no applicable method for 'left_join' applied to an object of class "list"
Created on 2020-08-14 by the reprex package (v0.3.0)
Edit: I'm looking for something loosely like this while still maintaining the ID column.
x %>%
unnest(id) %>%
left_join(y) %>%
mutate(n = row_number()) %>%
pivot_wider(id_cols = -id,
values_from = num,
names_from = n)
#> Joining, by = "id"
#> # A tibble: 1 x 7
#> val `1` `2` `3` `4` `5` `6`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 13 1.28 -0.387 -0.438 -0.0826 0.164 -1.24

Continuing with your work, you can try the following.
library(dplyr)
library(tidyr)
x %>%
unnest(id) %>%
left_join(y, by = "id") %>%
mutate(name = row_number(), id = list(id)) %>%
pivot_wider(values_from = num, names_glue = "num_{name}")
# # A tibble: 1 x 8
# id val num_1 num_2 num_3 num_4 num_5 num_6
# <list> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 <chr [6]> 13 1.28 -0.387 -0.438 -0.0826 0.164 -1.24
or
x %>%
mutate(num = map(id, ~ tibble::deframe(y) %>% .[match(names(.), .x)] %>% unname)) %>%
unnest_wider(num, names_sep = "_")
# # A tibble: 1 x 8
# id val num_1 num_2 num_3 num_4 num_5 num_6
# <list> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 <chr [6]> 13 1.28 -0.387 -0.438 -0.0826 0.164 -1.24
Description of the second solution
deframe() in tibble transforms a two-column data.frame to a named vector, the first column is converted to vector names and the second one is converted to vector values. deframe(y) %>% .[match(names(.), .x)] is equivalent to deframe(y)[match(names(deframe(y)), .x)]. The deframe(y) part appears twice, so I move it to the front of a pipe and use the . symbol to represent it behind the pipe. This line is to match the position of id columns of both data and reorder num column of y.

Based on your y you're not going to have multiple columns but adjusting the example a little, is this what you were aiming for?
x <- tibble(id = list(letters[1:6]), val = 13)
y <- tibble(id = rep(letters[1:6],2), num = rnorm(12),
name = paste0("num_", rep(1:2, each = 6)))
map_dfr(x$id[[1]], ~tibble(id = .x, val = x$val)) %>%
left_join(
pivot_wider(y, names_from = name, values_from = num)
)
#> Joining, by = "id"
#> # A tibble: 6 x 4
#> id val num_1 num_2
#> <chr> <dbl> <dbl> <dbl>
#> 1 a 13 0.609 1.97
#> 2 b 13 0.956 -1.84
#> 3 c 13 0.425 0.297
#> 4 d 13 0.0379 -0.784
#> 5 e 13 -0.532 -0.769
#> 6 f 13 0.538 -1.10

Related

Trying to isolate all columns (representing years) via conditional operator and renaming them in R

library(readr)
d <- read.csv("per_capita.csv")
rc <- d[,-2:-3]
df <- data.frame(rc)
draw <- df$X1994[df$Country.Name == "India"]
format(draw, scientific = F, big.marks = ",")
library(dplyr)
df %>%
filter(Country.Name == "India") %>%
select(names(.)[-1][readr::parse_integer(names(.)[-1] > 1994])
I tried this code and its giving me an error in the last line. Also, how should I rename these columns in the CSV file without using a dataframe?
The column names are: X1994, X1995..... and so on.
Thank You!
If you want to select columns that have numbers greater than a value, you could do this:
library(tidyverse)
#example
set.seed(24)
df <- tibble(country = rep(c("India", "Canada"), each = 3),
X1990 = runif(6),
X1991 = runif(6),
X1992 = runif(6))
df |>
filter(country == "India") |>
select(!!!vars(colnames(df)[-1][which(parse_number(colnames(df)[-1]) > 1990)]))
#> # A tibble: 3 x 2
#> X1991 X1992
#> <dbl> <dbl>
#> 1 0.280 0.672
#> 2 0.764 0.673
#> 3 0.802 0.320
Although that is pretty complicated. It might be better to go long, filter, then go wide:
df |>
filter(country == "India") |>
mutate(id = row_number()) |>
pivot_longer(contains("X")) |>
mutate(name = parse_number(name))|>
filter(name > 1990) |>
pivot_wider(names_from = name, values_from = value)|>
select(-c(id, country))
#> # A tibble: 3 x 2
#> `1991` `1992`
#> <dbl> <dbl>
#> 1 0.280 0.672
#> 2 0.764 0.673
#> 3 0.802 0.320
We can see that this answer is pretty long and cumbersome. Maybe we stick in base R:
cols <- which(as.numeric(sub("^.*?(\\d+).*$", "\\1", colnames(df)[-1])) > 1990) +1
rows <- df$country == "India"
df[rows,cols]
#> # A tibble: 3 x 2
#> X1991 X1992
#> <dbl> <dbl>
#> 1 0.280 0.672
#> 2 0.764 0.673
#> 3 0.802 0.320
Or actually, maybe we can make the tidyverse version cleaner if we just look for strings that have values higher than the target year:
all_years <- 1990:1995
df |>
filter(country == "India") |>
select(contains(paste0("X", all_years[all_years > 1990])))
#> # A tibble: 3 x 2
#> X1991 X1992
#> <dbl> <dbl>
#> 1 0.280 0.672
#> 2 0.764 0.673
#> 3 0.802 0.320
Using the same logic, we can also do a partial string match with base R:
all_years <- 1990:1995
cols <- grepl(paste(all_years[all_years>1990], collapse = "|"), colnames(df))
rows <- df$country == "India"
df[rows,cols]
#> # A tibble: 3 x 2
#> X1991 X1992
#> <dbl> <dbl>
#> 1 0.280 0.672
#> 2 0.764 0.673
#> 3 0.802 0.320
Hopefully one of these helps and strikes your fancy. Lots of options out there for whatever flavor your in the mood for.

how to use a variable as a parameter of the dplyr::slice_max() function in R

Given a data.frame:
tibble(group = c(rep("A", 4), rep("B", 4), rep("C", 4)),
value = runif(12),
n_slice = c(rep(2, 4), rep(1, 4), rep(3, 4)) )
# A tibble: 12 x 3
group value n_slice
<chr> <dbl> <dbl>
1 A 0.853 2
2 A 0.726 2
3 A 0.783 2
4 A 0.0426 2
5 B 0.320 1
6 B 0.683 1
7 B 0.732 1
8 B 0.319 1
9 C 0.118 3
10 C 0.0259 3
11 C 0.818 3
12 C 0.635 3
I'd like to slice by group with diferent number of rows in each group
I tried the code below but I get notified that "n" must be a constant
re %>%
group_by(group) %>%
slice_max(value, n = n_slice)
Erro: `n` must be a constant in `slice_max()`.
Expected output:
group value n_slice
<chr> <dbl> <dbl>
1 A 0.853 2
2 A 0.783 2
3 B 0.732 1
4 C 0.818 3
5 C 0.635 3
6 C 0.118 3
In this case, an option is with group_modify
library(dplyr)
re %>%
group_by(group) %>%
group_modify(~ .x %>%
slice_max(value, n = first(.x$n_slice))) %>%
ungroup
-output
# A tibble: 6 × 3
group value n_slice
<chr> <dbl> <dbl>
1 A 0.931 2
2 A 0.931 2
3 B 0.722 1
4 C 0.591 3
5 C 0.519 3
6 C 0.494 3
Or another option is to summarise using cur_data() and then unnest
library(tidyr)
re %>%
group_by(group) %>%
summarise(out = list(cur_data() %>%
slice_max(value, n = first(n_slice)))) %>%
unnest(out)
-output
# A tibble: 6 × 3
group value n_slice
<chr> <dbl> <dbl>
1 A 0.931 2
2 A 0.931 2
3 B 0.722 1
4 C 0.591 3
5 C 0.519 3
6 C 0.494 3
I don't think slice_max supports this, perhaps because it's not hard to imagine data where n_slice is not constant within a group (that action is ambiguous). Try using filter:
set.seed(42)
re <- tibble(group = c(rep("A", 4), rep("B", 4), rep("C", 4)),
value = runif(12),
n_slice = c(rep(2, 4), rep(1, 4), rep(3, 4)) )
re
# # A tibble: 12 x 3
# group value n_slice
# <chr> <dbl> <dbl>
# 1 A 0.915 2
# 2 A 0.937 2
# 3 A 0.286 2
# 4 A 0.830 2
# 5 B 0.642 1
# 6 B 0.519 1
# 7 B 0.737 1
# 8 B 0.135 1
# 9 C 0.657 3
# 10 C 0.705 3
# 11 C 0.458 3
# 12 C 0.719 3
re %>%
group_by(group) %>%
filter(rank(-value) <= n_slice[1])
# # A tibble: 6 x 3
# # Groups: group [3]
# group value n_slice
# <chr> <dbl> <dbl>
# 1 A 0.915 2
# 2 A 0.937 2
# 3 B 0.737 1
# 4 C 0.657 3
# 5 C 0.705 3
# 6 C 0.719 3
Notes:
Because of the potential for ties in the data, it might be useful to use rank(., ties.method = ...) (see ?rank) or dplyr::dense_rank.
If the column you are slicing on does not support negation (e.g., Date or POSIXt), then you can change rank(-value) to n() - rank(value) + 1L <= n_slice[1] for the same effect (or more simply n() - rank(value) < n_slice[1]). Another option is rank(desc(value)), thanks to #IceCreamToucan for the suggestion.
If performance is an issue (you have lots more rows), then using just filter per group is the fastest of the answers so far :-)
bench::mark(
akrun1 = re %>% group_by(group) %>% group_modify(~ .x %>% slice_max(value, n = first(.x$n_slice))) %>% ungroup,
akrun2 = re %>% group_by(group) %>% summarise(out = list(cur_data() %>% slice_max(value, n = first(n_slice)))) %>% tidyr::unnest(out),
r2evans = re %>% group_by(group) %>% filter(rank(-value) <= n_slice[1]) %>% ungroup() %>% arrange(group, -value)
)
# # A tibble: 3 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:t> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 akrun1 7.2ms 9.05ms 108. 7.55KB 13.8 47 6 436ms <tibble [~ <Rprofmem[,~ <bch:tm~ <tibble ~
# 2 akrun2 9.56ms 11.32ms 87.0 20KB 12.1 36 5 414ms <tibble [~ <Rprofmem[,~ <bch:tm~ <tibble ~
# 3 r2evans 4.21ms 5.27ms 186. 4.87KB 14.3 78 6 420ms <tibble [~ <Rprofmem[,~ <bch:tm~ <tibble ~
(Adding the arrange(.) at the end of mine to mimic the output exactly.)
If you don't have many more rows, or even if you do, readability is frankly more important. All answers produce the same results, so whichever makes more sense to the user (and the future self when you look back 6 months on old code), a little penalty in runtime is usually worth it.
Not that this question is in need of yet another way to replicate slice_max, but just for fun, you can use arrange followed by slice
library(dplyr, warn.conflicts = F)
set.seed(42)
re <- tibble(group = c(rep("A", 4), rep("B", 4), rep("C", 4)),
value = runif(12),
n_slice = c(rep(2, 4), rep(1, 4), rep(3, 4)) )
re %>%
group_by(group) %>%
arrange(desc(value)) %>%
slice(seq(first(n_slice))) %>%
ungroup
#> # A tibble: 6 × 3
#> group value n_slice
#> <chr> <dbl> <dbl>
#> 1 A 0.937 2
#> 2 A 0.915 2
#> 3 B 0.737 1
#> 4 C 0.719 3
#> 5 C 0.705 3
#> 6 C 0.657 3
Created on 2021-12-17 by the reprex package (v2.0.1)
This, surprisingly, seems a little faster
library(bench)
library(dplyr, warn.conflicts = F)
set.seed(42)
n <- 1e5
re <- tibble(group = c(rep("A", n), rep("B", n), rep("C", n)),
value = runif(n*3),
n_slice = c(rep(sample(n, 1), n), rep(sample(n, 1), n), rep(sample(n, 1), n)) )
bench::mark(
akrun1 = re %>% group_by(group) %>% group_modify(~ .x %>% slice_max(value, n = first(.x$n_slice))) %>% ungroup,
akrun2 = re %>% group_by(group) %>% summarise(out = list(cur_data() %>% slice_max(value, n = first(n_slice)))) %>% tidyr::unnest(out),
r2evans = re %>% group_by(group) %>% filter(rank(-value) <= n_slice[1]) %>% ungroup() %>% arrange(group, -value),
arrange = re %>% group_by(group) %>% arrange(desc(value)) %>% slice(seq(first(n_slice))) %>% ungroup %>% arrange(group, -value)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 akrun1 167.7ms 169.9ms 5.71 31MB 5.71
#> 2 akrun2 166.2ms 172.4ms 5.82 29.5MB 9.70
#> 3 r2evans 173ms 175.2ms 5.67 31.8MB 3.78
#> 4 arrange 66.7ms 75.2ms 11.9 29.5MB 17.9
Created on 2021-12-17 by the reprex package (v2.0.1)

Reshape long data table to list of wide data tables

My question is an expansion of the question posed here
How to reshape data from long to wide format so I will phrase it in a similar way.
The difference is that I want to rearrange one long data table into a list of wide data tables.
dat <- data.table(
sim = rep(c(1,2), each=4),
time = rep(1:4, 2),
value1 = rnorm(8),
value2 = rnorm(8)
)
dat
sim time value1 value2
1 1 1 0.3407 0.5167
2 1 2 -0.7033 0.8416
3 1 3 -0.3795 -0.4717
4 1 4 -0.7460 0.8479
5 2 1 0.8981 -0.7163
6 2 2 -0.3347 -0.6849
7 2 3 0.5013 0.8941
8 2 4 -0.1745 0.0795
I want to reshape it so that I have a list of wide data tables named value1, value2 ... value99 etc...
l = list()
l[["value1"]]
sim 1 2 3 4
1 1 0.3407 -0.7033 -0.3795 -0.7460
5 2 -0.8981 -0.3347 -0.5013 -0.1745
l[["value2"]]
sim 1 2 3 4
1 1 0.5167 0.8416 -0.4717 0.8479
5 2 -0.7163 -0.6849 0.8941 0.0795
Two variants.
data.table
library(data.table)
tmp <- dcast(melt(as.data.table(dat), id = c("sim", "time")), sim + variable ~ time)
tmp <- split(tmp, tmp$variable)
tmp <- lapply(tmp, set, i = NULL, j = "variable", value = NULL)
tmp
# $value1
# sim 1 2 3 4
# <num> <num> <num> <num> <num>
# 1: 1 1.0458737762 -0.4845954 0.1891288 0.05100633
# 2: 2 -0.0002406689 1.8093820 -0.8253280 1.14547045
# $value2
# sim 1 2 3 4
# <num> <num> <num> <num> <num>
# 1: 1 0.03157319 -0.8352058 -0.06876365 0.7467717
# 2: 2 -0.42551873 -0.7720822 0.15276411 0.9885968
I often use magrittr::%>% with data.table as well, so that can be converted into
library(data.table)
library(magrittr) # if %>% is not already available
as.data.table(dat) %>%
melt(., id = c("sim", "time")) %>%
dcast(., sim + variable ~ time) %>%
split(., .$variable) %>%
lapply(., set, i = NULL, j = "variable", value = NULL)
# $value1
# sim 1 2 3 4
# <num> <num> <num> <num> <num>
# 1: 1 1.0458737762 -0.4845954 0.1891288 0.05100633
# 2: 2 -0.0002406689 1.8093820 -0.8253280 1.14547045
# $value2
# sim 1 2 3 4
# <num> <num> <num> <num> <num>
# 1: 1 0.03157319 -0.8352058 -0.06876365 0.7467717
# 2: 2 -0.42551873 -0.7720822 0.15276411 0.9885968
tidyverse
library(dplyr)
library(tidyr) # pivot_longer, pivot_wider
dat %>%
pivot_longer(., -c(sim, time)) %>%
pivot_wider(., names_from = time, values_from = value) %>%
split(., .$name) %>%
lapply(., select, -name)
# $value1
# # A tibble: 2 x 5
# sim `1` `2` `3` `4`
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1.05 -0.485 0.189 0.0510
# 2 2 -0.000241 1.81 -0.825 1.15
# $value2
# # A tibble: 2 x 5
# sim `1` `2` `3` `4`
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 0.0316 -0.835 -0.0688 0.747
# 2 2 -0.426 -0.772 0.153 0.989
My solution to this issue would be to create a nested datafrae of the results. I have provided a brief description of the method followed by a reprex.
I would do this by using pivot_wider() and pivot_longer() to reshape the data. pivot_longer is used first to make each row only contain 1 value with a label for the time, simulation and whether it is value one or two. Then using pivot_wider each row will contain the values at each time with a label for the simulation and which set of values they are. (value1 or value2).
Finally we nest the dataframe using nest which stores all the data for each set of values in a dataframe. This can be accessed as an array of dataframes by nested_vals$data if necessary where nested_vals is the object we assigned the nested dataframe to.
library(tidyverse)
#Setup data
dat <- data.frame(
sim = rep(c(1,2), each=4),
time = rep(1:4, 2),
value1 = rnorm(8),
value2 = rnorm(8)
)
# Construct nested dataframe
nested_vals <- dat %>%
# Format dataset in tidy format
pivot_longer(cols = c(value1, value2)) %>%
# Move the name of the data to the beginning of the dataframe
relocate(name) %>%
# Pivot to matrix form as requested (i.e. times as columns, sims as rows)
pivot_wider(id_cols = c(name, sim), names_from = time, values_from = value) %>%
# Nest results by name
nest(-name)
#> Warning: All elements of `...` must be named.
#> Did you want `data = c(sim, `1`, `2`, `3`, `4`)`?
nested_vals
#> # A tibble: 2 x 2
#> name data
#> <chr> <list>
#> 1 value1 <tibble[,5] [2 x 5]>
#> 2 value2 <tibble[,5] [2 x 5]>
nested_vals$data[[2]]
#> # A tibble: 2 x 5
#> sim `1` `2` `3` `4`
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.0639 0.250 -1.28 0.850
#> 2 2 -1.90 0.000421 0.704 -0.164
Created on 2021-04-07 by the reprex package (v2.0.0)
One more way, with a single pipe syntax
library(tidyverse)
dat %>% pivot_longer(c(value1, value2)) %>%
group_split(name) %>% setNames(map(., ~.x[[3]][1])) %>%
map(~ .x %>% pivot_wider(id_cols = sim, names_from = time, values_from = value))
$value1
# A tibble: 2 x 5
sim `1` `2` `3` `4`
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 -0.851 -0.0484 -0.656 -0.121
2 2 -0.645 1.59 -0.274 0.445
$value2
# A tibble: 2 x 5
sim `1` `2` `3` `4`
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1.46 -1.62 -0.672 1.43
2 2 1.65 0.790 0.495 0.162
Another approach:
library(dplyr)
library(tidyr)
wide_dat <- dat %>% pivot_wider(id_cols = sim, names_from = time, values_from = starts_with('value'))
lapply(lapply(split.default(wide_dat[-1], sub('_\\d','',names(wide_dat[-1]))), function(x) cbind(wide_dat[1],x)), setNames, c('sim', 1:4))
$value1
sim 1 2 3 4
1 1 -0.1704969 0.2820143 1.181898 2.2377396
2 2 2.1920534 0.8214070 0.421177 0.7601796
$value2
sim 1 2 3 4
1 1 0.1760887 0.3440053 -0.8435849 0.6729751
2 2 -0.1714095 1.5125986 -0.5739871 -0.9648294
A tidyverse solution could be:
library(dplyr)
library(purrr)
library(tidyr)
dat_longer <- dat %>%
tidyr::pivot_longer(starts_with("value"), names_to="col_name", values_to="values")
list_wide <- purrr::map(unique(dat_longer[["col_name"]]),
~dat_longer %>%
dplyr::filter(col_name==.x) %>%
tidyr::pivot_wider(values_from = "values", names_from="time") %>%
select(-col_name)) %>%
purrr::set_names(unique(dat_longer[["col_name"]]))
$value1
# A tibble: 2 x 5
sim `1` `2` `3` `4`
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 -0.710 -0.334 -0.370 0.777
2 2 0.130 0.877 1.24 -0.202
$value2
# A tibble: 2 x 5
sim `1` `2` `3` `4`
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 -0.719 -0.909 0.0821 -0.158
2 2 -0.706 1.51 0.234 1.09

What is the best way to tidily append many columns?

I'm looking to append 30 columns which give values for gamma distributions by using the tidyverse. Here's an example of the data:
data.frame('rank'=1:3,'shape'=c(16,0.2,4),'rate'=c(13,0.4,0.2))
I'd like to use dgamma(1:30,shape,rate) to append 30 columns to the existing dataframe.
You can use map2() in purrr and unnest_wider() in tidyr.
library(tidyverse)
df %>%
mutate(density = map2(shape, rate, dgamma, x = 1:30)) %>%
unnest_wider(density, names_sep = "_")
Or use rowwise() at first and then mutate() with list().
df %>%
rowwise() %>%
mutate(density = list(dgamma(1:30, shape, rate))) %>%
unnest_wider(density, names_sep = "_")
Both of them give
# # A tibble: 3 x 33
# rank shape rate density_1 density_2 density_3 density_4 density_5 density_6 density_7
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 16 13 1.15 0.0852 0.0000843 1.43e-8 9.16e-13 3.19e-17 7.28e-22
# 2 2 0.2 0.4 0.122 0.0468 0.0227 1.21e-2 6.77e- 3 3.92e- 3 2.32e- 3
# 3 3 4 0.2 0.000218 0.00143 0.00395 7.67e-3 1.23e- 2 1.73e- 2 2.26e- 2
# # … with 23 more variables: density_8 <dbl>, density_9 <dbl>, density_10 <dbl>, ..., density_30 <dbl>

Specify a vector of x and y variables for purrr::map() in conjunction with dplyr::summarise_at()

I have a similar dataset but with many more r and v variables.
set.seed(1000)
tb <- tibble(grp = c(rep("A",4),rep("B",4)),
v1 = rnorm(8),
v2 = rnorm(8),
v3 = rnorm(8),
r1 = rnorm(8),
r2 = rnorm(8))
For each v variable, I would like to create a lm() with r variables.
This is what I have so far:
lm_fun <- function(x,y) coef(lm(x ~ y))[2]
tb %>%
nest(-grp) %>%
mutate(lm_list = map(data, ~ .x %>%
summarise_at(colnames(tb)[c(2:4)], funs(r1=lm_fun), .$r1)),
lm_list2= map(data, ~ .x %>%
summarise_at(colnames(tb)[c(2:4)], funs(r2=lm_fun), .$r2)),) %>%
select(grp,lm_list,lm_list2) %>%
unnest()
which gives me the intended output:
# A tibble: 2 x 7
grp v1_r1 v2_r1 v3_r1 v1_r2 v2_r2 v3_r2
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A -0.188 -0.0972 0.858 0.130 0.136 1.21
2 B 0.208 0.935 -1.33 -0.339 0.0580 -0.840
However, how can I specify the r variables in a vector (in a similar way of specifying the v variables as colnames(tb)[...]. I don't want to copy-pasta the code for every r variable I have in my full data. Also, would it be possible to solve this with another method?
Note that it is not important that the function is performing lm(), could be any function that involves two variables.
An option would be to loop through the 'r' columns inside map. This simplifies the code as we are using the same data but different 'r' columns
library(tidyverse)
tb %>%
nest(-grp) %>%
mutate(lm_list = map(data, function(x)
map(paste0('r', 1:2), function(y)
x %>%
summarise_at(vars(names(.)[1:3]), funs(lm_fun), .[[y]]) %>%
rename_all(~ paste(., y, sep="_")) ) %>%
bind_cols)) %>%
select(-data) %>%
unnest
# A tibble: 2 x 7
# grp v1_r1 v2_r1 v3_r1 v1_r2 v2_r2 v3_r2
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 A -0.188 -0.0972 0.858 0.130 0.136 1.21
#2 B 0.208 0.935 -1.33 -0.339 0.0580 -0.840
Another option would be to gather the levels of r before mutate/map:
tb %>%
gather(r, value, starts_with('r')) %>%
nest(-r, -grp) %>%
mutate(lm_list = map(
data, ~ .x %>%
summarise_at(colnames(tb)[c(2:4)], funs(lm_fun), .$value)
)) %>%
unnest(lm_list, .drop = T)
grp r v1 v2 v3
<chr> <chr> <dbl> <dbl> <dbl>
1 A r1 -0.188 -0.0972 0.858
2 B r1 0.208 0.935 -1.33
3 A r2 0.130 0.136 1.21
4 B r2 -0.339 0.0580 -0.840

Resources