How to use rowwise with dtplyr - r

I have the following data frame:
df <- tibble(x = runif(6), y = runif(6), z = runif(6))
And for the operation, I'd like to do it has to use dplyr::rowwise().
library(dplyr)
df <- tibble(x = runif(6), y = runif(6), z = runif(6))
df %>%
rowwise() %>%
mutate(m = mean(c(x, y, z)))
#> # A tibble: 6 × 4
#> # Rowwise:
#> x y z m
#> <dbl> <dbl> <dbl> <dbl>
#> 1 0.606 0.452 0.799 0.619
#> 2 0.760 0.168 0.666 0.531
#> 3 0.125 0.792 0.105 0.341
#> 4 0.431 0.714 0.178 0.441
#> 5 0.430 0.115 0.676 0.407
#> 6 0.290 0.830 0.0335 0.385
What I'd like to do next is to use dtplyr to speed up the process.
But I found an error:
library(dtplyr)
library(dplyr)
df <- tibble(x = runif(6), y = runif(6), z = runif(6))
df.dt <- lazy_dt(df)
df.dt %>%
rowwise() %>%
mutate(m = mean(c(x, y, z)))
#> Error in UseMethod("rowwise"): no applicable method for 'rowwise' applied to an object of class "c('dtplyr_step_first', 'dtplyr_step')"
What's the right way to do it?

Related

How to scale up a transmute in tidyverse?

I have a tibble with many variables organised this way:
tibble(
A = rep("A",10),
xyz1 = rnorm(10),
xyz2 = rnorm(10),
xyz3 = rnorm(10),
abc1 = rnorm(10),
abc2 = rnorm(10),
abb3 = rnorm(10),
acc4 = rnorm(10)
)
where xyz, abc, etc. are placeholder. After the placeholder there is a number. Assume it can be any integer.
In my tibble, that number can be any integer.
I want to trasmute it according to the formula
xyzn = xyzn - 'xyzn-1', where n is the symbol for the counted integer.
Whereas 'xyzn-1' does not exist, the result can be ignored and not join the transmute.
Schematic output:
tibble(
A = A
xyz2 = xyz2 - xyz1,
xyz3 = xyz3 - xyz2,
abc2 = abc2 - abc1
)
Perhaps this helps
library(dplyr)
library(tidyr)
df1 %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -c(A, rn), names_to = c(".value", "ind"),
names_pattern = "(\\D+)(\\d+)",
names_transform = list(ind = as.integer)) %>%
arrange(A, rn, ind) %>%
group_by(A, rn) %>%
mutate(across(-ind, ~ c(NA, diff(.x)))) %>%
ungroup %>%
pivot_wider(names_from = ind, values_from = xyz:acc,
names_sep = "") %>%
select(-rn) %>%
select(where(~ any(complete.cases(.x))))
-output
# A tibble: 10 × 4
A xyz2 xyz3 abc2
<chr> <dbl> <dbl> <dbl>
1 A -1.60 1.75 -1.53
2 A 2.89 -3.81 0.0701
3 A -0.657 0.920 -0.912
4 A 0.305 0.395 -0.477
5 A -0.289 1.39 1.38
6 A -0.103 0.426 -1.38
7 A -2.16 1.44 -0.913
8 A -0.260 -0.249 -1.45
9 A -1.15 1.14 -1.42
10 A -0.306 0.198 -0.118
Or may be
cbind(df1[1], do.call(cbind, unname(Filter(nrow, lapply(split.default(df1[-1],
sub("\\d+$", "", names(df1)[-1])),
\(x) {
i1 <- order(as.integer(sub("\\D+", "", names(x))))
x <- x[i1]
x[-1]- x[-ncol(x)]
})))
))
-output
A abc2 xyz2 xyz3
1 A -1.52667071 -1.5985160 1.7533450
2 A 0.07013841 2.8939503 -3.8113492
3 A -0.91213998 -0.6573093 0.9197824
4 A -0.47712113 0.3049918 0.3945995
5 A 1.37871603 -0.2886773 1.3933839
6 A -1.37608993 -0.1031296 0.4264927
7 A -0.91313982 -2.1630265 1.4407289
8 A -1.45439105 -0.2598476 -0.2493127
9 A -1.41590040 -1.1490018 1.1383060
10 A -0.11775196 -0.3061306 0.1984115
Usually the {dplyover} package can help with this kind of problems (disclaimer: I'm the maintainer). However, in your specific case the problem is not that easy to solve due to he specific conditions for variable selection.
In the approach below we first construct the variable names that we want to subtract from each other myvars1 and myvars2.
After that we can use dplyover::across2() together with all_of().
See the code comments for what we do in each step:
library(dplyr)
library(stringr)
library(dplyover) # https://timteafan.github.io/dplyover/
# get all variable stems
all_stems <- dplyover::cut_names("[0-9]$", .vars = names(df1))
# exlcude stems which don't start with 1
use_stems <- all_stems[paste0(all_stems, 1) %in% names(df1)]
# construct regex pattern to select all vars with > 1
patrn <- paste0("(", paste(use_stems, collapse = "|"), ")[^1]$")
# select vars with > 1
myvars1 <- grep(patrn, names(df1), value = TRUE)
# select vars to substract from `myvars1`
myvars2 <- str_replace(myvars1, "\\d$", ~ as.numeric(.x) - 1)
# use `dplyover::across2()` with `all_of()`
df1 %>%
transmute(
A = A, # dplyover doesn't support the `.keep` argument so we need a workaround
across2(all_of(myvars1),
all_of(myvars2),
~ .x - .y,
.names = "{xcol}")
)
#> # A tibble: 10 × 4
#> A xyz2 xyz3 abc2
#> <chr> <dbl> <dbl> <dbl>
#> 1 A 0.847 -1.19 0.413
#> 2 A 1.00 0.946 -3.26
#> 3 A 0.856 -1.11 -2.62
#> 4 A -0.325 1.47 1.11
#> 5 A -1.18 0.0830 2.78
#> 6 A -2.65 -0.520 -0.337
#> 7 A 0.197 -0.447 0.347
#> 8 A -0.484 1.18 -0.717
#> 9 A -1.94 1.81 1.05
#> 10 A -3.80 1.36 0.202
The from OP:
df1 <- tibble(
A = rep("A",10),
xyz1 = rnorm(10), # 2
xyz2 = rnorm(10), # 1 2
xyz3 = rnorm(10), # 1
abc1 = rnorm(10), # 2
abc2 = rnorm(10), # 1
abb3 = rnorm(10), #
acc4 = rnorm(10) #
)
Created on 2023-01-07 with reprex v2.0.2

How to use dplyr::summarize multiple times in a single command in R dplyr/ tidyr?

I have a community of species 1,2,3, and 4. I am trying to compute the covariance between species i and combined abundances of reciprocal species using dplyr. I want to do this for each species combination. The dplyr works fine for just one species, but not when I try to do multiple summarise together. Any suggestions?
set.seed(111)
month <- rep(c("J","J","J","F","F","F"), time = 3)
site <- rep(c(1,2,3), each = 6)
quadrant <- rep(c(1,2,3), times = 6)
sp1 <- sample(0:20, 18, replace = TRUE)
sp2 <- sample(0:15, 18, replace = TRUE)
sp3 <- sample(0:10, 18, replace = TRUE)
sp4 <- sample(0:4, 18, replace = TRUE)
df <- data.frame(month, site, quadrant, sp1, sp2, sp3, sp4)
df$sp2.3.4 <- df$sp2 + df$sp3 + df$sp4 #no sp1
df$sp3.4.1 <- df$sp3 + df$sp4 + df$sp1 #no sp2
df$sp1.2.4 <- df$sp1 + df$sp2 + df$sp4 #no sp3
df$sp1.2.3 <- df$sp1 + df$sp2 + df$sp3 #no sp4
library(tidyr)
df.long <- gather(df,
key = "species",
value = "abundance",
sp1, sp2, sp3, sp4)
df.long <- gather(df.long,
key = "species.covar",
value = "abundance.covar",
sp2.3.4, sp3.4.1, sp1.2.4, sp1.2.3)
df.long$species <- as.factor(as.character(df.long$species))
df.long$species.covar <- as.factor(as.character(df.long$species.covar))
library(dplyr)
agg.cov <- df.long%>%
group_by(month,site)%>%
dplyr::summarise(covar.species1 = cor(abundance[species=="sp1"],abundance.covar[species.covar=="sp2.3.4"]))%>%
as.data.frame()
agg.cov <- df.long%>%
group_by(month,site)%>%
dplyr::summarise(covar.species1 = cor(abundance[species=="sp1"],abundance.covar[species.covar=="sp2.3.4"]))%>%
dplyr::summarise(covar.species2 = cor(abundance[species=="sp2"],abundance.covar[species.covar=="sp3.4.1"]))%>%
dplyr::summarise(covar.species3 = cor(abundance[species=="sp3"],abundance.covar[species.covar=="sp1.2.4"]))%>%
dplyr::summarise(covar.species4 = cor(abundance[species=="sp4"],abundance.covar[species.covar=="sp1.2.3"]))%>%
as.data.frame()
Error: Error: Problem with `summarise()` column `covar.species2`.
ℹ `covar.species2 = cor(...)`.
x object 'abundance.covar' not found
ℹ The error occurred in group 1: month = "F".
There are three method below that should work
map - based
library(dplyr)
library(stringr)
library(purrr)
nm1 <- names(df)[startsWith(names(df), "sp")]
map(nm1, ~ df %>%
group_by(month, site) %>%
summarise(!!str_c("covar_species", "_", .x) :=
cor(!! rlang::sym(.x), rowSums(select(cur_data(), nm1, - !!.x)) ),
.groups = 'drop')) %>%
reduce(left_join)
-output
# A tibble: 6 x 6
month site covar_species_sp1 covar_species_sp2 covar_species_sp3 covar_species_sp4
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 F 1 0.479 0.987 -0.170 -0.980
2 F 2 -0.858 -0.454 -0.160 0.359
3 F 3 -0.999 -1.00 -0.933 NA
4 J 1 -0.945 -0.963 NA 0.596
5 J 2 -0.516 -0.148 -0.792 0.629
6 J 3 0.277 -0.591 -0.702 0.277
Reshaping with pivot_longer
library(tidyr)
df %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = starts_with('sp'), names_to = "sp") %>%
group_by(rn) %>%
mutate(newvalue = sum(value) - value) %>%
group_by(month, site, sp = str_c('covar_species_', sp)) %>%
summarise(value = cor(value, newvalue), .groups = 'drop') %>%
pivot_wider(names_from = sp, values_from = value)
-output
# A tibble: 6 x 6
month site covar_species_sp1 covar_species_sp2 covar_species_sp3 covar_species_sp4
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 F 1 0.479 0.987 -0.170 -0.980
2 F 2 -0.858 -0.454 -0.160 0.359
3 F 3 -0.999 -1.00 -0.933 NA
4 J 1 -0.945 -0.963 NA 0.596
5 J 2 -0.516 -0.148 -0.792 0.629
6 J 3 0.277 -0.591 -0.702 0.277
Using across
df %>%
mutate(Sum = select(cur_data(), starts_with('sp')) %>%
rowSums) %>%
group_by(month, site) %>%
summarise(across(starts_with('sp'),
~ cor(., Sum - .), .names = "covar_species_{.col}"), .groups = 'drop')
-output
# A tibble: 6 x 6
month site covar_species_sp1 covar_species_sp2 covar_species_sp3 covar_species_sp4
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 F 1 0.479 0.987 -0.170 -0.980
2 F 2 -0.858 -0.454 -0.160 0.359
3 F 3 -0.999 -1.00 -0.933 NA
4 J 1 -0.945 -0.963 NA 0.596
5 J 2 -0.516 -0.148 -0.792 0.629
6 J 3 0.277 -0.591 -0.702 0.277
data
set.seed(111)
month <- rep(c("J","J","J","F","F","F"), time = 3)
site <- rep(c(1,2,3), each = 6)
quadrant <- rep(c(1,2,3), times = 6)
sp1 <- sample(0:20, 18, replace = TRUE)
sp2 <- sample(0:15, 18, replace = TRUE)
sp3 <- sample(0:10, 18, replace = TRUE)
sp4 <- sample(0:4, 18, replace = TRUE)
df <- data.frame(month, site, quadrant, sp1, sp2, sp3, sp4)
I wasn't able to get your example to work, but a potential solution (I'm guessing) is:
agg.cov <- df.long %>%
group_by(month,site)%>%
dplyr::summarise(covar.species1 = cor(abundance[species=="sp1"],abundance.covar[species.covar=="sp2.3.4"]),
covar.species2 = cor(abundance[species=="sp2"],abundance.covar[species.covar=="sp3.4.1"]),
covar.species3 = cor(abundance[species=="sp3"],abundance.covar[species.covar=="sp1.2.4"]),
covar.species4 = cor(abundance[species=="sp4"],abundance.covar[species.covar=="sp1.2.3"]))%>%
as.data.frame()

Rolling window with slide_dbl() on grouped data

This is an extension to following question: Rolling window slider::slide() with grouped data
I want to mutate a column of my grouped tibble with slide_dbl(), i.e. applying slide_dbl() on all groups, but only within them, not across them.
When running the solution of linked question I receive following error message:
Error: Problem with `mutate()` input `rollreg`.
x Inapplicable method for 'mutate_' applied to object of class "c('double', 'numeric')".
My tibble has following structure:
tibble [450,343 x 3] (S3: grouped_df/tbl_df/tbl/data.frame)
$ company: num [1:450343] 1 1 1 1 1 ...
$ date: Date[1:450343], format: "2011-11-30" "2011-12-31" "2012-01-31" "2012-02-29" ...
$ result: num [1:450343] NA NA NA 12.5981 -2.9023 ...
- attr(*, "groups")= tibble [3,339 x 2] (S3: tbl_df/tbl/data.frame)
..$ company: num [1:3339] 1 2 3 4 5 ...
..$ .rows : list<int> [1:3339]
To complete, this is the code I ran according to the linked solution:
testtest <- data %>%
group_by(company) %>% nest() %>%
mutate(rollreg = map(data, ~ .x %>% mutate(result_2 = slide_dbl(.x = .$result, .f = ~prod(1+.)-1, .before = 11, .after = -1, complete=TRUE)))) %>%
select(-data) %>% unnest(rollreg)
Here, above mentioned error message occurs. I guess it's because of the data structure. Yet, I can't figure any solution (also not with similar functions like group_map() or group_modify()). Can anyone help? Thanks in advance!
An option is group_split by the grouping column (in the example, using 'case', loop over the list of datasets with map, create new column in mutate by applying the slide_dbl
library(dplyr)
library(tidyr)
library(purrr)
data %>%
group_split(case) %>%
map_dfr(~ .x %>%
mutate(out = slide_dbl(r1, .f = ~ prod(1 + .x) - 1,
.before = 5, .after = -1, complete = TRUE)))
-output
# A tibble: 30 x 6
# t case r1 r2 r3 out
# <int> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 1 a -0.294 -0.164 1.33 0
# 2 2 a 0.761 1.01 0.115 -0.294
# 3 3 a -0.781 -0.499 0.290 0.243
# 4 4 a -0.0732 -0.110 0.289 -0.728
# 5 5 a -0.528 0.707 0.181 -0.748
# 6 6 a -1.35 -0.411 -1.47 -0.881
# 7 7 a -0.397 -1.28 0.172 -1.06
# 8 8 a 1.68 0.956 -2.81 -1.02
# 9 9 a -0.0167 -0.0727 -1.08 -1.24
#10 10 a 1.25 -0.326 1.61 -1.26
## … with 20 more rows
Or if we need to use the nest_by, it creates an attribute rowwise, so, it is better to ungroup before applying
out1 <- data %>%
select(-t) %>%
nest_by(case) %>%
ungroup %>%
mutate(data = map(data, ~ .x %>%
mutate(out = slide_dbl(r1, .f = ~ prod(1 + .x) - 1,
.before = 5, .after = -1, complete = TRUE))))
-output
out1
# A tibble: 3 x 2
# case data
# <chr> <list>
#1 a <tibble [10 × 4]>
#2 b <tibble [10 × 4]>
#3 c <tibble [10 × 4]>
Now, we unnest the structure
out1 %>%
unnest(data)
# A tibble: 30 x 5
# case r1 r2 r3 out
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 a -0.294 -0.164 1.33 0
# 2 a 0.761 1.01 0.115 -0.294
# 3 a -0.781 -0.499 0.290 0.243
# 4 a -0.0732 -0.110 0.289 -0.728
# 5 a -0.528 0.707 0.181 -0.748
# 6 a -1.35 -0.411 -1.47 -0.881
# 7 a -0.397 -1.28 0.172 -1.06
# 8 a 1.68 0.956 -2.81 -1.02
# 9 a -0.0167 -0.0727 -1.08 -1.24
#10 a 1.25 -0.326 1.61 -1.26
# … with 20 more rows
data
data <- tibble(t = rep(1:10, 3),
case = c(rep("a", 10), rep("b", 10), rep("c", 10)),
r1 = rnorm(30),
r2 = rnorm(30),
r3 = rnorm(30))
i also got a question regarding the slide_Dbl function. I would like to check out other rollingregressions. My data is already fixed with an 8 weak week, but if i would like to look at for example 16 or 24 weeks, should i change the (before= ) from 8 to 16? The reason why i am asking is that i dont have the original dataset, but its already fixed with 8 weeks, so if i add the (before= ) with an additional 8 will it be 16?
new8 <- new%>%mutate( across(
where(is.double),
function(x) slide_dbl(x, mean, na.rm = TRUE, .before = 8L) %>% lag()))
Or should i put
new16 <- new%>%mutate(across(
where(is.double),
function(x) slide_dbl(x, mean, na.rm = TRUE, .before = 16L) %>% lag()))

In R, use nonstandard evaluation to select specific variables from data.frames

I've got several large-ish data.frames set up like a relational database, and I'd like to make a single function to look for whatever variable I need and grab it from that particular data.frame and add it to the data.frame I'm currently working on. I've got a way to do this that works, but it requires temporarily making a list of all the data.frames, which seems inefficient. I suspect that nonstandard evaluation would solve this problem for me, but I'm not sure how to do it.
Here's what works but seems inefficient:
Table1 <- data.frame(ID = LETTERS[1:10], ColA = rnorm(10), ColB = rnorm(10),
ColC = rnorm(10))
Table2 <- data.frame(ID = LETTERS[1:10], ColD = rnorm(10), ColE = rnorm(10),
ColF = rnorm(10))
Table3 <- data.frame(ID = LETTERS[1:10], ColG = rnorm(10), ColH = rnorm(10),
ColI = rnorm(10))
Key <- data.frame(Table = rep(c("Table1", "Table2", "Table3"), each = 4),
ColumnName = c("ID", paste0("Col", LETTERS[1:3]),
"ID", paste0("Col", LETTERS[4:6]),
"ID", paste0("Col", LETTERS[7:9])))
# function for grabbing info from other tables
grab <- function(StartDF, ColNames){
AllDFs <- list(Table1, Table2, Table3)
names(AllDFs) <- c("Table1", "Table2", "Table3")
# Determine which data.frames have that column
WhichDF <- Key %>% filter(ColumnName %in% ColNames) %>%
select(Table)
TempDF <- StartDF
for(i in 1:length(ColNames)){
ToAdd <- AllDFs[WhichDF[i, 1]]
ToAdd <- ToAdd[[1]] %>%
select(c(ColNames[i], ID))
TempDF <- TempDF %>% left_join(ToAdd)
rm(ToAdd)
}
return(TempDF)
}
grab(Table1, c("ColE", "ColH"))
What would be great instead would be something like this:
grab <- function(StartDF, ColNames){
# Some function that returns the column names of all the data.frames
# without me creating a new object that is a list of them
# Some function that left_joins the correct data.frame plus the column
# "ID" to my starting data.frame, again without needing to create that list
# of all the data.frames
}
Instead of creating the list manually, we can directly get the values of the objects returned from the 'Table' column of 'Key' dataset with mget
library(dplyr)
library(purrr)
grab <- function(StartDF, ColNames){
# filter the rows of Key based on the ColNames input
# pull the Table column as a vector
# column was factor, so convert to character class
# return the value of the objects with mget in a list
Tables <- Key %>%
filter(ColumnName %in% ColNames) %>%
pull(Table) %>%
as.character %>%
mget(envir = .GlobalEnv)
TempDF <- StartDF
# use the same left_joins in a loop after selecting only the
# ID and corresponding columns from 'ColNames'
for(i in seq_along(ColNames)){
ToAdd <- Tables[[i]] %>%
select(ColNames[i], ID)
TempDF <- TempDF %>%
left_join(ToAdd)
rm(ToAdd)
}
TempDF
}
grab(Table1, c("ColE", "ColH"))
Or another option is reduce
grab <- function(StartDF, ColNames) {
#only change is that instead of a for loop
# use reduce with left_join after selecting the corresponding columns
# with map
Key %>%
filter(ColumnName %in% ColNames) %>%
pull(Table) %>%
as.character %>%
mget(envir = .GlobalEnv) %>%
map2(ColNames, ~ .x %>%
select(ID, .y)) %>%
append(list(Table1), .) %>%
reduce(left_join)
}
grab(Table1, c("ColE", "ColH"))
# ID ColA ColB ColC ColE ColH
#1 A -0.9490093 0.5177143 -1.91015491 0.07777086 1.86277670
#2 B -0.7182786 -1.1019146 -0.70802738 -0.73965230 0.18375660
#3 C 0.5064516 -1.6904354 1.11106206 2.04315508 -0.65365228
#4 D 0.9362477 0.5260682 -0.03419651 -0.51628310 -1.17104181
#5 E 0.5636047 -0.9470895 0.43303304 -2.95928629 1.86425049
#6 F 1.0598531 0.4144901 0.10239896 1.57681703 -0.05382603
#7 G 1.1335047 -0.8282173 -0.28327898 2.02917831 0.50768462
#8 H 0.2941341 0.3261185 -0.15528127 -0.46470035 -0.86561320
#9 I -2.1434905 0.6567689 0.02298549 0.90822132 0.64360337
#10 J 0.4291258 1.3410147 0.67544567 0.12466251 0.75989623
There is a serious bug in the accepted solution. If you're not careful with the ordering in the ColNames argument, then the function won't work. Also, I redefined your data to use tibbles instead. They're basically the same as data frames, but their default settings are nicer (e.g. you don't need StringsAsFactors = FALSE)
library(tidyverse)
Table1 <- tibble(
ID = LETTERS[1:10], ColA = rnorm(10), ColB = rnorm(10), ColC = rnorm(10)
)
Table2 <- tibble(
ID = LETTERS[1:10], ColD = rnorm(10), ColE = rnorm(10), ColF = rnorm(10)
)
Table3 <- tibble(
ID = LETTERS[1:10], ColG = rnorm(10), ColH = rnorm(10), ColI = rnorm(10)
)
Key <- tibble(
Table = rep(c("Table1", "Table2", "Table3"), each = 4),
ColumnName = c("ID", paste0("Col", LETTERS[1:3]),
"ID", paste0("Col", LETTERS[4:6]),
"ID", paste0("Col", LETTERS[7:9]))
)
grab_akrun <- function(StartDF, ColNames) {
#only change is that instead of a for loop
# use reduce with left_join after selecting the corresponding columns
# with map
Key %>%
filter(ColumnName %in% ColNames) %>%
pull(Table) %>%
as.character %>%
mget(envir = .GlobalEnv) %>%
map2(ColNames, ~ .x %>%
select(ID, .y)) %>%
append(list(Table1), .) %>%
reduce(left_join)
}
grab_akrun(Table1, c("ColE", "ColH"))
#> Joining, by = "ID"Joining, by = "ID"
#> # A tibble: 10 x 6
#> ID ColA ColB ColC ColE ColH
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A -0.658 -0.613 0.689 -0.850 -0.795
#> 2 B 0.143 0.732 -0.212 -1.74 1.99
#> 3 C -0.966 -0.570 -0.354 0.559 -1.11
#> 4 D -1.05 0.269 -0.856 -0.370 -1.35
#> 5 E 0.255 -0.349 0.329 1.39 0.421
#> 6 F 1.51 1.38 0.707 -0.639 0.289
#> 7 G -1.28 1.44 -1.35 1.94 -1.04
#> 8 H -1.56 -0.434 0.231 0.467 0.656
#> 9 I -0.553 -1.64 -0.761 0.133 0.249
#> 10 J -0.950 0.418 -0.843 0.593 0.343
This works, but if you change the order:
grab_akrun(Table1, c("ColH", "ColE"))
#> Error: Unknown column `ColH`
Instead, you should approach it like this:
grab_new <- function(StartDF, ColNames) {
Key %>%
filter(ColumnName %in% ColNames) %>%
pluck("Table") %>%
mget(inherits = TRUE) %>%
map(~select(.x, ID, intersect(colnames(.x), ColNames))) %>%
reduce(left_join, .init = StartDF)
}
grab_new(Table1, c("ColE", "ColH"))
#> Joining, by = "ID"Joining, by = "ID"
#> # A tibble: 10 x 6
#> ID ColA ColB ColC ColE ColH
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A -0.658 -0.613 0.689 -0.850 -0.795
#> 2 B 0.143 0.732 -0.212 -1.74 1.99
#> 3 C -0.966 -0.570 -0.354 0.559 -1.11
#> 4 D -1.05 0.269 -0.856 -0.370 -1.35
#> 5 E 0.255 -0.349 0.329 1.39 0.421
#> 6 F 1.51 1.38 0.707 -0.639 0.289
#> 7 G -1.28 1.44 -1.35 1.94 -1.04
#> 8 H -1.56 -0.434 0.231 0.467 0.656
#> 9 I -0.553 -1.64 -0.761 0.133 0.249
#> 10 J -0.950 0.418 -0.843 0.593 0.343
grab_new(Table1, c("ColH", "ColE"))
#> Joining, by = "ID"Joining, by = "ID"
#> # A tibble: 10 x 6
#> ID ColA ColB ColC ColE ColH
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A -0.658 -0.613 0.689 -0.850 -0.795
#> 2 B 0.143 0.732 -0.212 -1.74 1.99
#> 3 C -0.966 -0.570 -0.354 0.559 -1.11
#> 4 D -1.05 0.269 -0.856 -0.370 -1.35
#> 5 E 0.255 -0.349 0.329 1.39 0.421
#> 6 F 1.51 1.38 0.707 -0.639 0.289
#> 7 G -1.28 1.44 -1.35 1.94 -1.04
#> 8 H -1.56 -0.434 0.231 0.467 0.656
#> 9 I -0.553 -1.64 -0.761 0.133 0.249
#> 10 J -0.950 0.418 -0.843 0.593 0.343
Which works as expected.
Created on 2020-01-21 by the reprex package (v0.3.0)

Specify a vector of x and y variables for purrr::map() in conjunction with dplyr::summarise_at()

I have a similar dataset but with many more r and v variables.
set.seed(1000)
tb <- tibble(grp = c(rep("A",4),rep("B",4)),
v1 = rnorm(8),
v2 = rnorm(8),
v3 = rnorm(8),
r1 = rnorm(8),
r2 = rnorm(8))
For each v variable, I would like to create a lm() with r variables.
This is what I have so far:
lm_fun <- function(x,y) coef(lm(x ~ y))[2]
tb %>%
nest(-grp) %>%
mutate(lm_list = map(data, ~ .x %>%
summarise_at(colnames(tb)[c(2:4)], funs(r1=lm_fun), .$r1)),
lm_list2= map(data, ~ .x %>%
summarise_at(colnames(tb)[c(2:4)], funs(r2=lm_fun), .$r2)),) %>%
select(grp,lm_list,lm_list2) %>%
unnest()
which gives me the intended output:
# A tibble: 2 x 7
grp v1_r1 v2_r1 v3_r1 v1_r2 v2_r2 v3_r2
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A -0.188 -0.0972 0.858 0.130 0.136 1.21
2 B 0.208 0.935 -1.33 -0.339 0.0580 -0.840
However, how can I specify the r variables in a vector (in a similar way of specifying the v variables as colnames(tb)[...]. I don't want to copy-pasta the code for every r variable I have in my full data. Also, would it be possible to solve this with another method?
Note that it is not important that the function is performing lm(), could be any function that involves two variables.
An option would be to loop through the 'r' columns inside map. This simplifies the code as we are using the same data but different 'r' columns
library(tidyverse)
tb %>%
nest(-grp) %>%
mutate(lm_list = map(data, function(x)
map(paste0('r', 1:2), function(y)
x %>%
summarise_at(vars(names(.)[1:3]), funs(lm_fun), .[[y]]) %>%
rename_all(~ paste(., y, sep="_")) ) %>%
bind_cols)) %>%
select(-data) %>%
unnest
# A tibble: 2 x 7
# grp v1_r1 v2_r1 v3_r1 v1_r2 v2_r2 v3_r2
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 A -0.188 -0.0972 0.858 0.130 0.136 1.21
#2 B 0.208 0.935 -1.33 -0.339 0.0580 -0.840
Another option would be to gather the levels of r before mutate/map:
tb %>%
gather(r, value, starts_with('r')) %>%
nest(-r, -grp) %>%
mutate(lm_list = map(
data, ~ .x %>%
summarise_at(colnames(tb)[c(2:4)], funs(lm_fun), .$value)
)) %>%
unnest(lm_list, .drop = T)
grp r v1 v2 v3
<chr> <chr> <dbl> <dbl> <dbl>
1 A r1 -0.188 -0.0972 0.858
2 B r1 0.208 0.935 -1.33
3 A r2 0.130 0.136 1.21
4 B r2 -0.339 0.0580 -0.840

Resources