How to scale up a transmute in tidyverse?

I have a tibble with many variables organised this way:
tibble(
  A = rep("A", 10),
  xyz1 = rnorm(10),
  xyz2 = rnorm(10),
  xyz3 = rnorm(10),
  abc1 = rnorm(10),
  abc2 = rnorm(10),
  abb3 = rnorm(10),
  acc4 = rnorm(10)
)
where xyz, abc, etc. are placeholders, each followed by a number. In my tibble, that number can be any integer.
I want to transmute it according to the formula
xyzn = xyzn - xyz(n-1), where n stands for the trailing integer.
Where xyz(n-1) does not exist, the result should be ignored and left out of the transmute.
Schematic output:
tibble(
  A = A,
  xyz2 = xyz2 - xyz1,
  xyz3 = xyz3 - xyz2,
  abc2 = abc2 - abc1
)
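Written out by hand for this small example, the call to be scaled up would look like this (a sketch based on the schematic above, not part of the original question; abb3 and acc4 drop out because abb2 and acc3 don't exist):
library(dplyr)
df1 %>%
  transmute(
    A = A,
    xyz2 = xyz2 - xyz1,
    xyz3 = xyz3 - xyz2,
    abc2 = abc2 - abc1
  )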

Perhaps this helps
library(dplyr)
library(tidyr)
df1 %>%
  mutate(rn = row_number()) %>%
  pivot_longer(cols = -c(A, rn), names_to = c(".value", "ind"),
               names_pattern = "(\\D+)(\\d+)",
               names_transform = list(ind = as.integer)) %>%
  arrange(A, rn, ind) %>%
  group_by(A, rn) %>%
  mutate(across(-ind, ~ c(NA, diff(.x)))) %>%
  ungroup() %>%
  pivot_wider(names_from = ind, values_from = xyz:acc,
              names_sep = "") %>%
  select(-rn) %>%
  select(where(~ any(complete.cases(.x))))
-output
# A tibble: 10 × 4
A xyz2 xyz3 abc2
<chr> <dbl> <dbl> <dbl>
1 A -1.60 1.75 -1.53
2 A 2.89 -3.81 0.0701
3 A -0.657 0.920 -0.912
4 A 0.305 0.395 -0.477
5 A -0.289 1.39 1.38
6 A -0.103 0.426 -1.38
7 A -2.16 1.44 -0.913
8 A -0.260 -0.249 -1.45
9 A -1.15 1.14 -1.42
10 A -0.306 0.198 -0.118
Or maybe, with base R
# split the non-ID columns by their name stem, order each group by the
# trailing number, take successive differences and bind everything back
cbind(df1[1], do.call(cbind, unname(Filter(nrow,
  lapply(split.default(df1[-1], sub("\\d+$", "", names(df1)[-1])),
    \(x) {
      i1 <- order(as.integer(sub("\\D+", "", names(x))))
      x <- x[i1]
      x[-1] - x[-ncol(x)]
    })))))
-output
A abc2 xyz2 xyz3
1 A -1.52667071 -1.5985160 1.7533450
2 A 0.07013841 2.8939503 -3.8113492
3 A -0.91213998 -0.6573093 0.9197824
4 A -0.47712113 0.3049918 0.3945995
5 A 1.37871603 -0.2886773 1.3933839
6 A -1.37608993 -0.1031296 0.4264927
7 A -0.91313982 -2.1630265 1.4407289
8 A -1.45439105 -0.2598476 -0.2493127
9 A -1.41590040 -1.1490018 1.1383060
10 A -0.11775196 -0.3061306 0.1984115

Usually the {dplyover} package can help with this kind of problem (disclaimer: I'm the maintainer). However, in your specific case the problem is not that easy to solve due to the specific conditions for variable selection.
In the approach below we first construct the variable names that we want to subtract from each other, myvars1 and myvars2.
After that we can use dplyover::across2() together with all_of().
See the code comments for what we do in each step:
library(dplyr)
library(stringr)
library(dplyover) # https://timteafan.github.io/dplyover/
# get all variable stems
all_stems <- dplyover::cut_names("[0-9]$", .vars = names(df1))
# exclude stems whose numbering doesn't start at 1
use_stems <- all_stems[paste0(all_stems, 1) %in% names(df1)]
# construct regex pattern to select all vars with > 1
patrn <- paste0("(", paste(use_stems, collapse = "|"), ")[^1]$")
# select vars with > 1
myvars1 <- grep(patrn, names(df1), value = TRUE)
# select vars to subtract from `myvars1`
myvars2 <- str_replace(myvars1, "\\d$", ~ as.numeric(.x) - 1)
# use `dplyover::across2()` with `all_of()`
df1 %>%
  transmute(
    A = A, # dplyover doesn't support the `.keep` argument so we need a workaround
    across2(all_of(myvars1),
            all_of(myvars2),
            ~ .x - .y,
            .names = "{xcol}")
  )
#> # A tibble: 10 × 4
#> A xyz2 xyz3 abc2
#> <chr> <dbl> <dbl> <dbl>
#> 1 A 0.847 -1.19 0.413
#> 2 A 1.00 0.946 -3.26
#> 3 A 0.856 -1.11 -2.62
#> 4 A -0.325 1.47 1.11
#> 5 A -1.18 0.0830 2.78
#> 6 A -2.65 -0.520 -0.337
#> 7 A 0.197 -0.447 0.347
#> 8 A -0.484 1.18 -0.717
#> 9 A -1.94 1.81 1.05
#> 10 A -3.80 1.36 0.202
The data from the OP:
df1 <- tibble(
  A = rep("A", 10),
  xyz1 = rnorm(10),
  xyz2 = rnorm(10),
  xyz3 = rnorm(10),
  abc1 = rnorm(10),
  abc2 = rnorm(10),
  abb3 = rnorm(10),
  acc4 = rnorm(10)
)
Created on 2023-01-07 with reprex v2.0.2

Related

How to use rowwise with dtplyr

I have the following data frame:
df <- tibble(x = runif(6), y = runif(6), z = runif(6))
And the operation I'd like to do has to use dplyr::rowwise().
library(dplyr)
df <- tibble(x = runif(6), y = runif(6), z = runif(6))
df %>%
  rowwise() %>%
  mutate(m = mean(c(x, y, z)))
#> # A tibble: 6 × 4
#> # Rowwise:
#> x y z m
#> <dbl> <dbl> <dbl> <dbl>
#> 1 0.606 0.452 0.799 0.619
#> 2 0.760 0.168 0.666 0.531
#> 3 0.125 0.792 0.105 0.341
#> 4 0.431 0.714 0.178 0.441
#> 5 0.430 0.115 0.676 0.407
#> 6 0.290 0.830 0.0335 0.385
What I'd like to do next is to use dtplyr to speed up the process.
But I run into an error:
library(dtplyr)
library(dplyr)
df <- tibble(x = runif(6), y = runif(6), z = runif(6))
df.dt <- lazy_dt(df)
df.dt %>%
  rowwise() %>%
  mutate(m = mean(c(x, y, z)))
#> Error in UseMethod("rowwise"): no applicable method for 'rowwise' applied to an object of class "c('dtplyr_step_first', 'dtplyr_step')"
What's the right way to do it?
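One possible workaround (a sketch, not an answer from the original thread): rowwise() is not implemented for lazy_dt objects, so for a simple row-wise mean you can replace it with a vectorized expression that dtplyr can translate to data.table:
library(dtplyr)
library(dplyr)
df <- tibble(x = runif(6), y = runif(6), z = runif(6))
df.dt <- lazy_dt(df)
# no rowwise(): a plain vectorized expression translates cleanly to data.table
df.dt %>%
  mutate(m = (x + y + z) / 3) %>%
  as_tibble()
For genuinely row-wise operations that can't be vectorized, another option is to collect the result back with as_tibble() first and then use dplyr's rowwise() on the regular tibble.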

How to use dplyr::summarize multiple times in a single command in R dplyr/tidyr?

I have a community of species 1, 2, 3, and 4. I am trying to compute the covariance between species i and the combined abundances of the remaining (reciprocal) species using dplyr. I want to do this for each species combination. The dplyr code works fine for just one species, but not when I try to chain multiple summarise calls together. Any suggestions?
set.seed(111)
month <- rep(c("J","J","J","F","F","F"), time = 3)
site <- rep(c(1,2,3), each = 6)
quadrant <- rep(c(1,2,3), times = 6)
sp1 <- sample(0:20, 18, replace = TRUE)
sp2 <- sample(0:15, 18, replace = TRUE)
sp3 <- sample(0:10, 18, replace = TRUE)
sp4 <- sample(0:4, 18, replace = TRUE)
df <- data.frame(month, site, quadrant, sp1, sp2, sp3, sp4)
df$sp2.3.4 <- df$sp2 + df$sp3 + df$sp4 #no sp1
df$sp3.4.1 <- df$sp3 + df$sp4 + df$sp1 #no sp2
df$sp1.2.4 <- df$sp1 + df$sp2 + df$sp4 #no sp3
df$sp1.2.3 <- df$sp1 + df$sp2 + df$sp3 #no sp4
library(tidyr)
df.long <- gather(df,
                  key = "species",
                  value = "abundance",
                  sp1, sp2, sp3, sp4)
df.long <- gather(df.long,
                  key = "species.covar",
                  value = "abundance.covar",
                  sp2.3.4, sp3.4.1, sp1.2.4, sp1.2.3)
df.long$species <- as.factor(as.character(df.long$species))
df.long$species.covar <- as.factor(as.character(df.long$species.covar))
library(dplyr)
agg.cov <- df.long %>%
  group_by(month, site) %>%
  dplyr::summarise(covar.species1 = cor(abundance[species == "sp1"], abundance.covar[species.covar == "sp2.3.4"])) %>%
  as.data.frame()
agg.cov <- df.long %>%
  group_by(month, site) %>%
  dplyr::summarise(covar.species1 = cor(abundance[species == "sp1"], abundance.covar[species.covar == "sp2.3.4"])) %>%
  dplyr::summarise(covar.species2 = cor(abundance[species == "sp2"], abundance.covar[species.covar == "sp3.4.1"])) %>%
  dplyr::summarise(covar.species3 = cor(abundance[species == "sp3"], abundance.covar[species.covar == "sp1.2.4"])) %>%
  dplyr::summarise(covar.species4 = cor(abundance[species == "sp4"], abundance.covar[species.covar == "sp1.2.3"])) %>%
  as.data.frame()
Error: Error: Problem with `summarise()` column `covar.species2`.
ℹ `covar.species2 = cor(...)`.
x object 'abundance.covar' not found
ℹ The error occurred in group 1: month = "F".
There are three methods below that should work. (The chained summarise() calls fail because each summarise() returns a new, already-summarised data frame, so columns like abundance and abundance.covar no longer exist for the second call.)
map-based
library(dplyr)
library(stringr)
library(purrr)
nm1 <- names(df)[startsWith(names(df), "sp")]
map(nm1, ~ df %>%
      group_by(month, site) %>%
      summarise(!!str_c("covar_species", "_", .x) :=
                  cor(!!rlang::sym(.x), rowSums(select(cur_data(), nm1, -!!.x))),
                .groups = 'drop')) %>%
  reduce(left_join)
-output
# A tibble: 6 x 6
month site covar_species_sp1 covar_species_sp2 covar_species_sp3 covar_species_sp4
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 F 1 0.479 0.987 -0.170 -0.980
2 F 2 -0.858 -0.454 -0.160 0.359
3 F 3 -0.999 -1.00 -0.933 NA
4 J 1 -0.945 -0.963 NA 0.596
5 J 2 -0.516 -0.148 -0.792 0.629
6 J 3 0.277 -0.591 -0.702 0.277
Reshaping with pivot_longer
library(tidyr)
df %>%
  mutate(rn = row_number()) %>%
  pivot_longer(cols = starts_with('sp'), names_to = "sp") %>%
  group_by(rn) %>%
  mutate(newvalue = sum(value) - value) %>%
  group_by(month, site, sp = str_c('covar_species_', sp)) %>%
  summarise(value = cor(value, newvalue), .groups = 'drop') %>%
  pivot_wider(names_from = sp, values_from = value)
-output
# A tibble: 6 x 6
month site covar_species_sp1 covar_species_sp2 covar_species_sp3 covar_species_sp4
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 F 1 0.479 0.987 -0.170 -0.980
2 F 2 -0.858 -0.454 -0.160 0.359
3 F 3 -0.999 -1.00 -0.933 NA
4 J 1 -0.945 -0.963 NA 0.596
5 J 2 -0.516 -0.148 -0.792 0.629
6 J 3 0.277 -0.591 -0.702 0.277
Using across
df %>%
  mutate(Sum = select(cur_data(), starts_with('sp')) %>%
           rowSums) %>%
  group_by(month, site) %>%
  summarise(across(starts_with('sp'),
                   ~ cor(., Sum - .), .names = "covar_species_{.col}"), .groups = 'drop')
-output
# A tibble: 6 x 6
month site covar_species_sp1 covar_species_sp2 covar_species_sp3 covar_species_sp4
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 F 1 0.479 0.987 -0.170 -0.980
2 F 2 -0.858 -0.454 -0.160 0.359
3 F 3 -0.999 -1.00 -0.933 NA
4 J 1 -0.945 -0.963 NA 0.596
5 J 2 -0.516 -0.148 -0.792 0.629
6 J 3 0.277 -0.591 -0.702 0.277
data
set.seed(111)
month <- rep(c("J","J","J","F","F","F"), time = 3)
site <- rep(c(1,2,3), each = 6)
quadrant <- rep(c(1,2,3), times = 6)
sp1 <- sample(0:20, 18, replace = TRUE)
sp2 <- sample(0:15, 18, replace = TRUE)
sp3 <- sample(0:10, 18, replace = TRUE)
sp4 <- sample(0:4, 18, replace = TRUE)
df <- data.frame(month, site, quadrant, sp1, sp2, sp3, sp4)
I wasn't able to get your example to work, but a potential solution (I'm guessing) is:
agg.cov <- df.long %>%
  group_by(month, site) %>%
  dplyr::summarise(covar.species1 = cor(abundance[species == "sp1"], abundance.covar[species.covar == "sp2.3.4"]),
                   covar.species2 = cor(abundance[species == "sp2"], abundance.covar[species.covar == "sp3.4.1"]),
                   covar.species3 = cor(abundance[species == "sp3"], abundance.covar[species.covar == "sp1.2.4"]),
                   covar.species4 = cor(abundance[species == "sp4"], abundance.covar[species.covar == "sp1.2.3"])) %>%
  as.data.frame()

t-test of one group versus many groups in tidyverse

I have the following tibble
test_tbl <- tibble(name = rep(c("John", "Allan", "George", "Peter", "Paul"), each = 12),
                   category = rep(rep(LETTERS[1:4], each = 3), 5),
                   replicate = rep(1:3, 20),
                   value = sample.int(n = 1e5, size = 60, replace = T))
# A tibble: 60 x 4
name category replicate value
<chr> <chr> <int> <int>
1 John A 1 71257
2 John A 2 98887
3 John A 3 87354
4 John B 1 25352
5 John B 2 69913
6 John B 3 43086
7 John C 1 24957
8 John C 2 33928
9 John C 3 79854
10 John D 1 32842
11 John D 2 19156
12 John D 3 50283
13 Allan A 1 98188
14 Allan A 2 26208
15 Allan A 3 69329
16 Allan B 1 32696
17 Allan B 2 81240
18 Allan B 3 54689
19 Allan C 1 77044
20 Allan C 2 97776
# … with 40 more rows
I want to group_by(name, category) and perform 3 t.test calls, comparing category B, C and D with category A.
I would like to store the estimate and p.value from the output. The expected result is something like this:
# A tibble: 5 x 7
name B_vs_A_estimate B_vs_A_p_value C_vs_A_estimate C_vs_A_p_value D_vs_A_estimate D_vs_A_p_value
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 John -0.578 0.486 0.198 0.309 0.631 0.171
2 Allan 0.140 0.644 0.728 0.283 0.980 0.485
3 George -0.778 0.320 -0.424 0.391 -0.154 0.589
4 Peter -0.435 0.470 -0.156 0.722 0.315 0.0140
5 Paul 0.590 0.0150 -0.473 0.475 0.681 0.407
I would prefer a solution using tidyverse and/or broom.
There are many ways to achieve the desired output, but maybe this one is the most intuitive and the easiest to debug (you can put a browser() anywhere):
test_tbl %>%
  group_by(name) %>%
  do({
    sub_tbl <- .
    expand.grid(g1 = "A", g2 = c("B", "C", "D"), stringsAsFactors = FALSE) %>%
      mutate(test = as.character(glue::glue("{g1}_vs_{g2}"))) %>%
      rowwise() %>%
      do({
        gs <- .
        t_res <- t.test(sub_tbl %>% filter(category == gs$g1) %>% pull(value),
                        sub_tbl %>% filter(category == gs$g2) %>% pull(value))
        data.frame(test = gs$test, estimate = t_res$statistic, p_value = t_res$p.value,
                   stringsAsFactors = FALSE)
      })
  }) %>%
  ungroup() %>%
  gather(key = "statistic", value = "val", -name, -test) %>%
  mutate(test_statistic = paste(test, statistic, sep = "_")) %>%
  select(-test, -statistic) %>%
  spread(key = "test_statistic", value = "val")
Result
# A tibble: 5 x 7
name A_vs_B_estimate A_vs_B_p_value A_vs_C_estimate A_vs_C_p_value A_vs_D_estimate A_vs_D_p_value
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Allan -0.270 0.803 -1.03 0.396 1.55 0.250
2 George 0.201 0.855 0.221 0.838 1.07 0.380
3 John -1.59 0.249 0.0218 0.984 -0.410 0.704
4 Paul 0.116 0.918 -1.62 0.215 -1.53 0.212
5 Peter 0.471 0.664 0.551 0.611 0.466 0.680
It groups the records by name, then applies a function (do #1). That function saves the sub-dataframe in sub_tbl, expands all the test cases (expand.grid) and creates a test name from the two letters combined. Then, for each combination, it applies the function that runs the t-tests (do #2). That anonymous function performs the test between group 1 (g1) and group 2 (g2) and returns a data frame with the results.
The second part basically rearranges the columns to have the final output.
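As a side note, the gather()/spread() rearrangement in the second part can also be written with the newer tidyr verbs; a sketch, assuming res holds the ungrouped result of the do() pipeline above (columns name, test, estimate and p_value):
library(tidyr)
res %>%
  pivot_wider(names_from = test, values_from = c(estimate, p_value),
              names_glue = "{test}_{.value}")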
test_tbl %>%
  dplyr::group_by(name) %>%
  dplyr::summarise(
    estimate_AB = t.test(value[category == "A" | category == "B"] ~ category[category == "A" | category == "B"]) %>%
      (function(x) {x$estimate[1] - x$estimate[2]}),
    pvalue_AB = t.test(value[category == "A" | category == "B"] ~ category[category == "A" | category == "B"]) %>%
      (function(x) {x$p.value})
  )
Here is what I did to test A against B by group. I think you could extend my approach, or try to incorporate the code from the first solution.
EDIT: cleaner code
map(unique(test_tbl$name), function(nm) {test_tbl %>% filter(name == nm)}) %>%
  map2(unique(test_tbl$name), function(dat, nm) {
    map(LETTERS[2:4], function(cat) {
      dat %>%
        filter(category == "A") %>%
        pull %>%
        t.test(dat %>% filter(category == cat) %>% pull)
    }) %>%
      map_dfr(broom::glance) %>%
      select(statistic, p.value) %>%
      mutate(
        name = nm,
        cross_cat = paste0(LETTERS[2:4], " versus A")
      )
  }) %>%
  {do.call(rbind, .)}
We can use
library(dplyr)
library(purrr)
library(stringr)
library(tidyr)
test_tbl %>%
  split(.$name) %>%
  map_dfr(~ {
    Avalue <- .x$value[.x$category == 'A']
    .x %>%
      filter(category != 'A') %>%
      group_by(category) %>%
      summarise(out = t.test(value, Avalue)$p.value) %>%
      mutate(category = str_c(category, '_vs_A_p_value'))}, .id = 'name') %>%
  pivot_wider(names_from = category, values_from = out)
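The above collects only the p-values. If the estimate column from the expected output is needed as well, the same split()/map_dfr() idea can be extended with broom::tidy(); a sketch (column order may differ from the expected output):
library(dplyr)
library(purrr)
library(tidyr)
library(broom)
test_tbl %>%
  split(.$name) %>%
  map_dfr(~ {
    Avalue <- .x$value[.x$category == "A"]
    .x %>%
      filter(category != "A") %>%
      group_by(category) %>%
      # tidy() returns estimate, statistic, p.value, ... for each t-test
      summarise(res = list(tidy(t.test(value, Avalue))), .groups = "drop") %>%
      unnest(res) %>%
      select(category, estimate, p.value)
  }, .id = "name") %>%
  pivot_wider(names_from = category, values_from = c(estimate, p.value),
              names_glue = "{category}_vs_A_{.value}")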

In R, use nonstandard evaluation to select specific variables from data.frames

I've got several large-ish data.frames set up like a relational database, and I'd like to make a single function to look for whatever variable I need and grab it from that particular data.frame and add it to the data.frame I'm currently working on. I've got a way to do this that works, but it requires temporarily making a list of all the data.frames, which seems inefficient. I suspect that nonstandard evaluation would solve this problem for me, but I'm not sure how to do it.
Here's what works but seems inefficient:
Table1 <- data.frame(ID = LETTERS[1:10], ColA = rnorm(10), ColB = rnorm(10),
                     ColC = rnorm(10))
Table2 <- data.frame(ID = LETTERS[1:10], ColD = rnorm(10), ColE = rnorm(10),
                     ColF = rnorm(10))
Table3 <- data.frame(ID = LETTERS[1:10], ColG = rnorm(10), ColH = rnorm(10),
                     ColI = rnorm(10))
Key <- data.frame(Table = rep(c("Table1", "Table2", "Table3"), each = 4),
                  ColumnName = c("ID", paste0("Col", LETTERS[1:3]),
                                 "ID", paste0("Col", LETTERS[4:6]),
                                 "ID", paste0("Col", LETTERS[7:9])))
# function for grabbing info from other tables
grab <- function(StartDF, ColNames){
AllDFs <- list(Table1, Table2, Table3)
names(AllDFs) <- c("Table1", "Table2", "Table3")
# Determine which data.frames have that column
WhichDF <- Key %>% filter(ColumnName %in% ColNames) %>%
select(Table)
TempDF <- StartDF
for(i in 1:length(ColNames)){
ToAdd <- AllDFs[WhichDF[i, 1]]
ToAdd <- ToAdd[[1]] %>%
select(c(ColNames[i], ID))
TempDF <- TempDF %>% left_join(ToAdd)
rm(ToAdd)
}
return(TempDF)
}
grab(Table1, c("ColE", "ColH"))
What would be great instead would be something like this:
grab <- function(StartDF, ColNames){
  # Some function that returns the column names of all the data.frames
  # without me creating a new object that is a list of them
  # Some function that left_joins the correct data.frame plus the column
  # "ID" to my starting data.frame, again without needing to create that list
  # of all the data.frames
}
Instead of creating the list manually, we can directly get the values of the objects whose names are stored in the 'Table' column of the 'Key' dataset with mget:
library(dplyr)
library(purrr)
grab <- function(StartDF, ColNames){
  # filter the rows of Key based on the ColNames input
  # pull the Table column as a vector
  # column was factor, so convert to character class
  # return the value of the objects with mget in a list
  Tables <- Key %>%
    filter(ColumnName %in% ColNames) %>%
    pull(Table) %>%
    as.character %>%
    mget(envir = .GlobalEnv)
  TempDF <- StartDF
  # use the same left_joins in a loop after selecting only the
  # ID and corresponding columns from 'ColNames'
  for(i in seq_along(ColNames)){
    ToAdd <- Tables[[i]] %>%
      select(ColNames[i], ID)
    TempDF <- TempDF %>%
      left_join(ToAdd)
    rm(ToAdd)
  }
  TempDF
}
grab(Table1, c("ColE", "ColH"))
Or another option is reduce
grab <- function(StartDF, ColNames) {
  # only change is that instead of a for loop
  # use reduce with left_join after selecting the corresponding columns
  # with map
  Key %>%
    filter(ColumnName %in% ColNames) %>%
    pull(Table) %>%
    as.character %>%
    mget(envir = .GlobalEnv) %>%
    map2(ColNames, ~ .x %>%
           select(ID, .y)) %>%
    append(list(Table1), .) %>%
    reduce(left_join)
}
grab(Table1, c("ColE", "ColH"))
# ID ColA ColB ColC ColE ColH
#1 A -0.9490093 0.5177143 -1.91015491 0.07777086 1.86277670
#2 B -0.7182786 -1.1019146 -0.70802738 -0.73965230 0.18375660
#3 C 0.5064516 -1.6904354 1.11106206 2.04315508 -0.65365228
#4 D 0.9362477 0.5260682 -0.03419651 -0.51628310 -1.17104181
#5 E 0.5636047 -0.9470895 0.43303304 -2.95928629 1.86425049
#6 F 1.0598531 0.4144901 0.10239896 1.57681703 -0.05382603
#7 G 1.1335047 -0.8282173 -0.28327898 2.02917831 0.50768462
#8 H 0.2941341 0.3261185 -0.15528127 -0.46470035 -0.86561320
#9 I -2.1434905 0.6567689 0.02298549 0.90822132 0.64360337
#10 J 0.4291258 1.3410147 0.67544567 0.12466251 0.75989623
There is a serious bug in the accepted solution: the tables returned by mget() come back in the order they appear in Key, while map2() pairs them positionally with ColNames in the order the user supplied, so select() can end up looking for a column in the wrong table and the function fails. Also, I redefined your data to use tibbles instead. They're basically the same as data frames, but their default settings are nicer (e.g. you don't need stringsAsFactors = FALSE).
library(tidyverse)
Table1 <- tibble(
ID = LETTERS[1:10], ColA = rnorm(10), ColB = rnorm(10), ColC = rnorm(10)
)
Table2 <- tibble(
ID = LETTERS[1:10], ColD = rnorm(10), ColE = rnorm(10), ColF = rnorm(10)
)
Table3 <- tibble(
ID = LETTERS[1:10], ColG = rnorm(10), ColH = rnorm(10), ColI = rnorm(10)
)
Key <- tibble(
Table = rep(c("Table1", "Table2", "Table3"), each = 4),
ColumnName = c("ID", paste0("Col", LETTERS[1:3]),
"ID", paste0("Col", LETTERS[4:6]),
"ID", paste0("Col", LETTERS[7:9]))
)
grab_akrun <- function(StartDF, ColNames) {
  # only change is that instead of a for loop
  # use reduce with left_join after selecting the corresponding columns
  # with map
  Key %>%
    filter(ColumnName %in% ColNames) %>%
    pull(Table) %>%
    as.character %>%
    mget(envir = .GlobalEnv) %>%
    map2(ColNames, ~ .x %>%
           select(ID, .y)) %>%
    append(list(Table1), .) %>%
    reduce(left_join)
}
grab_akrun(Table1, c("ColE", "ColH"))
#> Joining, by = "ID"
#> Joining, by = "ID"
#> # A tibble: 10 x 6
#> ID ColA ColB ColC ColE ColH
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A -0.658 -0.613 0.689 -0.850 -0.795
#> 2 B 0.143 0.732 -0.212 -1.74 1.99
#> 3 C -0.966 -0.570 -0.354 0.559 -1.11
#> 4 D -1.05 0.269 -0.856 -0.370 -1.35
#> 5 E 0.255 -0.349 0.329 1.39 0.421
#> 6 F 1.51 1.38 0.707 -0.639 0.289
#> 7 G -1.28 1.44 -1.35 1.94 -1.04
#> 8 H -1.56 -0.434 0.231 0.467 0.656
#> 9 I -0.553 -1.64 -0.761 0.133 0.249
#> 10 J -0.950 0.418 -0.843 0.593 0.343
This works, but if you change the order:
grab_akrun(Table1, c("ColH", "ColE"))
#> Error: Unknown column `ColH`
Instead, you should approach it like this, selecting from each table only the requested columns it actually contains (via intersect()), so the result no longer depends on the order of ColNames:
grab_new <- function(StartDF, ColNames) {
  Key %>%
    filter(ColumnName %in% ColNames) %>%
    pluck("Table") %>%
    mget(inherits = TRUE) %>%
    map(~select(.x, ID, intersect(colnames(.x), ColNames))) %>%
    reduce(left_join, .init = StartDF)
}
grab_new(Table1, c("ColE", "ColH"))
#> Joining, by = "ID"
#> Joining, by = "ID"
#> # A tibble: 10 x 6
#> ID ColA ColB ColC ColE ColH
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A -0.658 -0.613 0.689 -0.850 -0.795
#> 2 B 0.143 0.732 -0.212 -1.74 1.99
#> 3 C -0.966 -0.570 -0.354 0.559 -1.11
#> 4 D -1.05 0.269 -0.856 -0.370 -1.35
#> 5 E 0.255 -0.349 0.329 1.39 0.421
#> 6 F 1.51 1.38 0.707 -0.639 0.289
#> 7 G -1.28 1.44 -1.35 1.94 -1.04
#> 8 H -1.56 -0.434 0.231 0.467 0.656
#> 9 I -0.553 -1.64 -0.761 0.133 0.249
#> 10 J -0.950 0.418 -0.843 0.593 0.343
grab_new(Table1, c("ColH", "ColE"))
#> Joining, by = "ID"
#> Joining, by = "ID"
#> # A tibble: 10 x 6
#> ID ColA ColB ColC ColE ColH
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A -0.658 -0.613 0.689 -0.850 -0.795
#> 2 B 0.143 0.732 -0.212 -1.74 1.99
#> 3 C -0.966 -0.570 -0.354 0.559 -1.11
#> 4 D -1.05 0.269 -0.856 -0.370 -1.35
#> 5 E 0.255 -0.349 0.329 1.39 0.421
#> 6 F 1.51 1.38 0.707 -0.639 0.289
#> 7 G -1.28 1.44 -1.35 1.94 -1.04
#> 8 H -1.56 -0.434 0.231 0.467 0.656
#> 9 I -0.553 -1.64 -0.761 0.133 0.249
#> 10 J -0.950 0.418 -0.843 0.593 0.343
Which works as expected.
Created on 2020-01-21 by the reprex package (v0.3.0)

Order data frame by the last column with dplyr

library(dplyr)
df <- tibble(
  a = rnorm(10),
  b = rnorm(10),
  c = rnorm(10),
  d = rnorm(10)
)
df %>%
  arrange(colnames(df) %>% tail(1) %>% desc())
I am looping over a list of data frames. There are different columns in the data frames and the last column of each may have a different name.
I need to arrange every data frame by its last column. The simple case looks like the above code.
Using arrange_at and ncol:
df %>% arrange_at(ncol(.), desc)
As arrange_at will be deprecated in the future, you could also use:
# option 1
df %>% arrange(desc(.[ncol(.)]))
# option 2
df %>% arrange(across(ncol(.), desc))
If we need to arrange by the last column name, either use the name string
df %>%
arrange_at(vars(last(names(.))), desc)
Or specify the index
df %>%
arrange_at(ncol(.), desc)
The new dplyr way (I guess from 1.0.0 on) would be using across(last_col()):
library(dplyr)
df <- tibble(
  a = rnorm(10),
  b = rnorm(10),
  c = rnorm(10),
  d = rnorm(10)
)
df %>%
  arrange(across(last_col(), desc))
#> # A tibble: 10 x 4
#> a b c d
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.283 0.443 1.30 0.910
#> 2 0.797 -0.0819 -0.936 0.828
#> 3 0.0717 -0.858 -0.355 0.671
#> 4 -1.38 -1.08 -0.472 0.426
#> 5 1.52 1.43 -0.0593 0.249
#> 6 0.827 -1.28 1.86 0.0824
#> 7 -0.448 0.0558 -1.48 -0.143
#> 8 0.377 -0.601 0.238 -0.918
#> 9 0.770 1.93 1.23 -1.43
#> 10 0.0532 -0.0934 -1.14 -2.08
> packageVersion("dplyr")
#> [1] ‘1.0.4’
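For completeness, the same ordering without dplyr is a one-liner in base R (a sketch, not from the answers above):
# order rows by the last column, descending; works for data frames and tibbles
df[order(df[[ncol(df)]], decreasing = TRUE), ]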
