Order data frame by the last column with dplyr - r

library(dplyr)
df <- tibble(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
df %>%
arrange(colnames(df) %>% tail(1) %>% desc())
I am looping over a list of data frames. There are different columns in the data frames and the last column of each may have a different name.
I need to arrange every data frame by its last column. The simple case looks like the above code.

Using arrange_at and ncol:
df %>% arrange_at(ncol(.), desc)
As arrange_at will be depricated in the future, you could also use:
# option 1
df %>% arrange(desc(.[ncol(.)]))
# option 2
df %>% arrange(across(ncol(.), desc))

If we need to arrange by the last column name, either use the name string
df %>%
arrange_at(vars(last(names(.))), desc)
Or specify the index
df %>%
arrange_at(ncol(.), desc)

The new dplyr way (I guess from 1.0.0 on) would be using across(last_col()):
library(dplyr)
df <- tibble(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
df %>%
arrange(across(last_col(), desc))
#> # A tibble: 10 x 4
#> a b c d
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.283 0.443 1.30 0.910
#> 2 0.797 -0.0819 -0.936 0.828
#> 3 0.0717 -0.858 -0.355 0.671
#> 4 -1.38 -1.08 -0.472 0.426
#> 5 1.52 1.43 -0.0593 0.249
#> 6 0.827 -1.28 1.86 0.0824
#> 7 -0.448 0.0558 -1.48 -0.143
#> 8 0.377 -0.601 0.238 -0.918
#> 9 0.770 1.93 1.23 -1.43
#> 10 0.0532 -0.0934 -1.14 -2.08
> packageVersion("dplyr")
#> [1] ‘1.0.4’

Related

How to scale up a transmute in tidyverse?

I have a tibble with many variables organised this way:
tibble(
A = rep("A",10),
xyz1 = rnorm(10),
xyz2 = rnorm(10),
xyz3 = rnorm(10),
abc1 = rnorm(10),
abc2 = rnorm(10),
abb3 = rnorm(10),
acc4 = rnorm(10)
)
where xyz, abc, etc. are placeholder. After the placeholder there is a number. Assume it can be any integer.
In my tibble, that number can be any integer.
I want to trasmute it according to the formula
xyzn = xyzn - 'xyzn-1', where n is the symbol for the counted integer.
Whereas 'xyzn-1' does not exist, the result can be ignored and not join the transmute.
Schematic output:
tibble(
A = A
xyz2 = xyz2 - xyz1,
xyz3 = xyz3 - xyz2,
abc2 = abc2 - abc1
)
Perhaps this helps
library(dplyr)
library(tidyr)
df1 %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -c(A, rn), names_to = c(".value", "ind"),
names_pattern = "(\\D+)(\\d+)",
names_transform = list(ind = as.integer)) %>%
arrange(A, rn, ind) %>%
group_by(A, rn) %>%
mutate(across(-ind, ~ c(NA, diff(.x)))) %>%
ungroup %>%
pivot_wider(names_from = ind, values_from = xyz:acc,
names_sep = "") %>%
select(-rn) %>%
select(where(~ any(complete.cases(.x))))
-output
# A tibble: 10 × 4
A xyz2 xyz3 abc2
<chr> <dbl> <dbl> <dbl>
1 A -1.60 1.75 -1.53
2 A 2.89 -3.81 0.0701
3 A -0.657 0.920 -0.912
4 A 0.305 0.395 -0.477
5 A -0.289 1.39 1.38
6 A -0.103 0.426 -1.38
7 A -2.16 1.44 -0.913
8 A -0.260 -0.249 -1.45
9 A -1.15 1.14 -1.42
10 A -0.306 0.198 -0.118
Or may be
cbind(df1[1], do.call(cbind, unname(Filter(nrow, lapply(split.default(df1[-1],
sub("\\d+$", "", names(df1)[-1])),
\(x) {
i1 <- order(as.integer(sub("\\D+", "", names(x))))
x <- x[i1]
x[-1]- x[-ncol(x)]
})))
))
-output
A abc2 xyz2 xyz3
1 A -1.52667071 -1.5985160 1.7533450
2 A 0.07013841 2.8939503 -3.8113492
3 A -0.91213998 -0.6573093 0.9197824
4 A -0.47712113 0.3049918 0.3945995
5 A 1.37871603 -0.2886773 1.3933839
6 A -1.37608993 -0.1031296 0.4264927
7 A -0.91313982 -2.1630265 1.4407289
8 A -1.45439105 -0.2598476 -0.2493127
9 A -1.41590040 -1.1490018 1.1383060
10 A -0.11775196 -0.3061306 0.1984115
Usually the {dplyover} package can help with this kind of problems (disclaimer: I'm the maintainer). However, in your specific case the problem is not that easy to solve due to he specific conditions for variable selection.
In the approach below we first construct the variable names that we want to subtract from each other myvars1 and myvars2.
After that we can use dplyover::across2() together with all_of().
See the code comments for what we do in each step:
library(dplyr)
library(stringr)
library(dplyover) # https://timteafan.github.io/dplyover/
# get all variable stems
all_stems <- dplyover::cut_names("[0-9]$", .vars = names(df1))
# exlcude stems which don't start with 1
use_stems <- all_stems[paste0(all_stems, 1) %in% names(df1)]
# construct regex pattern to select all vars with > 1
patrn <- paste0("(", paste(use_stems, collapse = "|"), ")[^1]$")
# select vars with > 1
myvars1 <- grep(patrn, names(df1), value = TRUE)
# select vars to substract from `myvars1`
myvars2 <- str_replace(myvars1, "\\d$", ~ as.numeric(.x) - 1)
# use `dplyover::across2()` with `all_of()`
df1 %>%
transmute(
A = A, # dplyover doesn't support the `.keep` argument so we need a workaround
across2(all_of(myvars1),
all_of(myvars2),
~ .x - .y,
.names = "{xcol}")
)
#> # A tibble: 10 × 4
#> A xyz2 xyz3 abc2
#> <chr> <dbl> <dbl> <dbl>
#> 1 A 0.847 -1.19 0.413
#> 2 A 1.00 0.946 -3.26
#> 3 A 0.856 -1.11 -2.62
#> 4 A -0.325 1.47 1.11
#> 5 A -1.18 0.0830 2.78
#> 6 A -2.65 -0.520 -0.337
#> 7 A 0.197 -0.447 0.347
#> 8 A -0.484 1.18 -0.717
#> 9 A -1.94 1.81 1.05
#> 10 A -3.80 1.36 0.202
The from OP:
df1 <- tibble(
A = rep("A",10),
xyz1 = rnorm(10), # 2
xyz2 = rnorm(10), # 1 2
xyz3 = rnorm(10), # 1
abc1 = rnorm(10), # 2
abc2 = rnorm(10), # 1
abb3 = rnorm(10), #
acc4 = rnorm(10) #
)
Created on 2023-01-07 with reprex v2.0.2

Calculate variable from data.frame using dplyr

I'm sure this is obvious, but I can't figure it out.
I have a data.frame, and want to calculate a variable from several types.
df = data.frame(time = rep(seq(10),each=2),Type=rep(c("A","B"),times=10),value = runif(20))
I want a new data.frame, with A / B for each time point.
I've tried:
df2 <- df |> group_by(time) |> mutate(new_value= value[Type=="A"] / value[Type=="B"],.keep="none")
But I still have a new_value twice for each time.
A better option may be to reshape to 'wide' format with pivot_wider and then create the column
library(dplyr)
library(tidyr)
df %>%
pivot_wider(names_from = Type, values_from = value) %>%
mutate(new_value = A/B)
-output
# A tibble: 10 × 4
time A B new_value
<int> <dbl> <dbl> <dbl>
1 1 0.565 0.913 0.618
2 2 0.902 0.274 3.29
3 3 0.321 0.986 0.326
4 4 0.620 0.937 0.661
5 5 0.467 0.407 1.15
6 6 0.659 0.152 4.33
7 7 0.573 0.239 2.40
8 8 0.962 0.601 1.60
9 9 0.515 0.403 1.28
10 10 0.880 0.364 2.42
mutate creates or modifies a column in the original dataset, thus it returns the same number of rows. Instead, it may be better to use summarise if we want unique values (but here the 'Type' will be lost)
df |>
group_by(time) |>
summarise(new_value= value[Type=="A"] / value[Type=="B"])
In addition, this works only when the count of 'A', 'B' elements per 'time' is the same

How to pass tibble of variable names and function calls to tibble

I'm trying to go from a tibble of variable names and functions like this:
N <- 100
dat <-
tibble(
variable_name = c("a", "b"),
variable_value = c("rnorm(N)", "rnorm(N)")
)
to a tibble with two variables a and b of length N
dat2 <-
tibble(
a = rnorm(N),
b = rnorm(N)
)
is there a !!! or rlang-y way to accomplish this?
We can evalutate the string
library(dplyr)
library(purrr)
library(tibble)
deframe(dat) %>%
map_dfc(~ eval(rlang::parse_expr(.x)))
-output
# A tibble: 100 x 2
a b
<dbl> <dbl>
1 0.0750 2.55
2 -1.65 -1.48
3 1.77 -0.627
4 0.766 -0.0411
5 0.832 0.200
6 -1.91 -0.533
7 -0.0208 -0.266
8 -0.409 1.08
9 -1.38 -0.181
10 0.727 0.252
# … with 90 more rows
Here is a base way with a pipe and a as_tibble call.
Map(function(x) eval(str2lang(x)), setNames(dat$variable_value, dat$variable_name)) %>%
as_tibble

Using group_modify with selected columns (retaining whole data frame and order)

I have run out of R power on this one. I appreciate any help, it is probably quite simple for someone with more experience.
I have a data frame (tibble) with some numerical columns, a group column, and some other columns with other information. I want to do operations on the numerical columns, by group, but still retain all the columns.
I've put an example below: I am replacing the NAs with the group mean, for each column. The columns to replace the NAs are specified by the df_names variable.
It basically works, except it removes all columns except the numerical ones, AND reorders everything. Which makes it hard to reassemble. I could work around this, but I have a feeling there must be a simpler way to direct group_apply to specified columns, while retaining the other columns, and keeping the order.
Can anyone help? Thanks so much in advance!
Will
library("tidyverse")
# create tibble
df <- tibble(
name=letters[1:10],
csize=c("L","S","S","L","L","S","L","S","L","S"),
v1=rnorm(10),
v2=rnorm(10),
v3=rnorm(10)
)
# introduce some missing data
df$v1[3] <- NA
df$v1[6] <- NA
df$v1[7] <- NA
df$v3[2] <- NA
# these are the cols where I want to replace the NAs
df_names <- c("v1","v2","v3")
# this is the grouping variable (has to be stored as a string, since it is an input to the function)
groupvar <- "csize"
# now I want to replace the NAs with column means, restricted to their group
# the following line works, but the problem is that it removes the name column, and reorders the rows...
df_imp <- df %>% group_by(.dots=groupvar) %>% select(df_names) %>% group_modify( ~{replace_na(.x,as.list(colMeans(.x, na.rm=TRUE)))})
group_modify is overkill in this case; mutate(across()) is your friend here:
df %>% group_by(.dots = groupvar) %>%
mutate(across(all_of(df_names), ~if_else(is.na(.x), mean(.x, na.rm = TRUE), .x)))
Result:
> df
# A tibble: 10 x 5
# Groups: csize [2]
name csize v1 v2 v3
<chr> <chr> <dbl> <dbl> <dbl>
1 a L -1.22 1.48 -0.628
2 b S -1.17 0.0890 -0.130
3 c S -0.422 -0.0956 -0.0271
4 d L -0.265 0.180 -0.786
5 e L -0.491 0.509 -0.359
6 f S -0.422 -0.712 0.232
7 g L -0.400 -1.13 1.13
8 h S -0.538 -0.0785 0.690
9 i L 0.373 0.308 0.252
10 j S 0.445 0.743 -1.41
Does this work:
> library(dplyr)
> df %>% group_by(csize) %>% mutate(across(v1:v3, ~ replace_na(., mean(., na.rm = T))))
# A tibble: 10 x 5
# Groups: csize [2]
name csize v1 v2 v3
<chr> <chr> <dbl> <dbl> <dbl>
1 a L 1.57 0.310 -1.76
2 b S -0.705 0.0655 0.577
3 c S -1.05 1.28 1.82
4 d L 0.958 -2.09 -0.371
5 e L -0.712 0.247 -1.13
6 f S -1.05 -0.516 -0.107
7 g L 0.403 1.79 0.128
8 h S -0.793 1.52 1.07
9 i L -0.206 -0.369 -1.77
10 j S -1.65 -0.992 -0.476

In R, use nonstandard evaluation to select specific variables from data.frames

I've got several large-ish data.frames set up like a relational database, and I'd like to make a single function to look for whatever variable I need and grab it from that particular data.frame and add it to the data.frame I'm currently working on. I've got a way to do this that works, but it requires temporarily making a list of all the data.frames, which seems inefficient. I suspect that nonstandard evaluation would solve this problem for me, but I'm not sure how to do it.
Here's what works but seems inefficient:
Table1 <- data.frame(ID = LETTERS[1:10], ColA = rnorm(10), ColB = rnorm(10),
ColC = rnorm(10))
Table2 <- data.frame(ID = LETTERS[1:10], ColD = rnorm(10), ColE = rnorm(10),
ColF = rnorm(10))
Table3 <- data.frame(ID = LETTERS[1:10], ColG = rnorm(10), ColH = rnorm(10),
ColI = rnorm(10))
Key <- data.frame(Table = rep(c("Table1", "Table2", "Table3"), each = 4),
ColumnName = c("ID", paste0("Col", LETTERS[1:3]),
"ID", paste0("Col", LETTERS[4:6]),
"ID", paste0("Col", LETTERS[7:9])))
# function for grabbing info from other tables
grab <- function(StartDF, ColNames){
AllDFs <- list(Table1, Table2, Table3)
names(AllDFs) <- c("Table1", "Table2", "Table3")
# Determine which data.frames have that column
WhichDF <- Key %>% filter(ColumnName %in% ColNames) %>%
select(Table)
TempDF <- StartDF
for(i in 1:length(ColNames)){
ToAdd <- AllDFs[WhichDF[i, 1]]
ToAdd <- ToAdd[[1]] %>%
select(c(ColNames[i], ID))
TempDF <- TempDF %>% left_join(ToAdd)
rm(ToAdd)
}
return(TempDF)
}
grab(Table1, c("ColE", "ColH"))
What would be great instead would be something like this:
grab <- function(StartDF, ColNames){
# Some function that returns the column names of all the data.frames
# without me creating a new object that is a list of them
# Some function that left_joins the correct data.frame plus the column
# "ID" to my starting data.frame, again without needing to create that list
# of all the data.frames
}
Instead of creating the list manually, we can directly get the values of the objects returned from the 'Table' column of 'Key' dataset with mget
library(dplyr)
library(purrr)
grab <- function(StartDF, ColNames){
# filter the rows of Key based on the ColNames input
# pull the Table column as a vector
# column was factor, so convert to character class
# return the value of the objects with mget in a list
Tables <- Key %>%
filter(ColumnName %in% ColNames) %>%
pull(Table) %>%
as.character %>%
mget(envir = .GlobalEnv)
TempDF <- StartDF
# use the same left_joins in a loop after selecting only the
# ID and corresponding columns from 'ColNames'
for(i in seq_along(ColNames)){
ToAdd <- Tables[[i]] %>%
select(ColNames[i], ID)
TempDF <- TempDF %>%
left_join(ToAdd)
rm(ToAdd)
}
TempDF
}
grab(Table1, c("ColE", "ColH"))
Or another option is reduce
grab <- function(StartDF, ColNames) {
#only change is that instead of a for loop
# use reduce with left_join after selecting the corresponding columns
# with map
Key %>%
filter(ColumnName %in% ColNames) %>%
pull(Table) %>%
as.character %>%
mget(envir = .GlobalEnv) %>%
map2(ColNames, ~ .x %>%
select(ID, .y)) %>%
append(list(Table1), .) %>%
reduce(left_join)
}
grab(Table1, c("ColE", "ColH"))
# ID ColA ColB ColC ColE ColH
#1 A -0.9490093 0.5177143 -1.91015491 0.07777086 1.86277670
#2 B -0.7182786 -1.1019146 -0.70802738 -0.73965230 0.18375660
#3 C 0.5064516 -1.6904354 1.11106206 2.04315508 -0.65365228
#4 D 0.9362477 0.5260682 -0.03419651 -0.51628310 -1.17104181
#5 E 0.5636047 -0.9470895 0.43303304 -2.95928629 1.86425049
#6 F 1.0598531 0.4144901 0.10239896 1.57681703 -0.05382603
#7 G 1.1335047 -0.8282173 -0.28327898 2.02917831 0.50768462
#8 H 0.2941341 0.3261185 -0.15528127 -0.46470035 -0.86561320
#9 I -2.1434905 0.6567689 0.02298549 0.90822132 0.64360337
#10 J 0.4291258 1.3410147 0.67544567 0.12466251 0.75989623
There is a serious bug in the accepted solution. If you're not careful with the ordering in the ColNames argument, then the function won't work. Also, I redefined your data to use tibbles instead. They're basically the same as data frames, but their default settings are nicer (e.g. you don't need StringsAsFactors = FALSE)
library(tidyverse)
Table1 <- tibble(
ID = LETTERS[1:10], ColA = rnorm(10), ColB = rnorm(10), ColC = rnorm(10)
)
Table2 <- tibble(
ID = LETTERS[1:10], ColD = rnorm(10), ColE = rnorm(10), ColF = rnorm(10)
)
Table3 <- tibble(
ID = LETTERS[1:10], ColG = rnorm(10), ColH = rnorm(10), ColI = rnorm(10)
)
Key <- tibble(
Table = rep(c("Table1", "Table2", "Table3"), each = 4),
ColumnName = c("ID", paste0("Col", LETTERS[1:3]),
"ID", paste0("Col", LETTERS[4:6]),
"ID", paste0("Col", LETTERS[7:9]))
)
grab_akrun <- function(StartDF, ColNames) {
#only change is that instead of a for loop
# use reduce with left_join after selecting the corresponding columns
# with map
Key %>%
filter(ColumnName %in% ColNames) %>%
pull(Table) %>%
as.character %>%
mget(envir = .GlobalEnv) %>%
map2(ColNames, ~ .x %>%
select(ID, .y)) %>%
append(list(Table1), .) %>%
reduce(left_join)
}
grab_akrun(Table1, c("ColE", "ColH"))
#> Joining, by = "ID"Joining, by = "ID"
#> # A tibble: 10 x 6
#> ID ColA ColB ColC ColE ColH
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A -0.658 -0.613 0.689 -0.850 -0.795
#> 2 B 0.143 0.732 -0.212 -1.74 1.99
#> 3 C -0.966 -0.570 -0.354 0.559 -1.11
#> 4 D -1.05 0.269 -0.856 -0.370 -1.35
#> 5 E 0.255 -0.349 0.329 1.39 0.421
#> 6 F 1.51 1.38 0.707 -0.639 0.289
#> 7 G -1.28 1.44 -1.35 1.94 -1.04
#> 8 H -1.56 -0.434 0.231 0.467 0.656
#> 9 I -0.553 -1.64 -0.761 0.133 0.249
#> 10 J -0.950 0.418 -0.843 0.593 0.343
This works, but if you change the order:
grab_akrun(Table1, c("ColH", "ColE"))
#> Error: Unknown column `ColH`
Instead, you should approach it like this:
grab_new <- function(StartDF, ColNames) {
Key %>%
filter(ColumnName %in% ColNames) %>%
pluck("Table") %>%
mget(inherits = TRUE) %>%
map(~select(.x, ID, intersect(colnames(.x), ColNames))) %>%
reduce(left_join, .init = StartDF)
}
grab_new(Table1, c("ColE", "ColH"))
#> Joining, by = "ID"Joining, by = "ID"
#> # A tibble: 10 x 6
#> ID ColA ColB ColC ColE ColH
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A -0.658 -0.613 0.689 -0.850 -0.795
#> 2 B 0.143 0.732 -0.212 -1.74 1.99
#> 3 C -0.966 -0.570 -0.354 0.559 -1.11
#> 4 D -1.05 0.269 -0.856 -0.370 -1.35
#> 5 E 0.255 -0.349 0.329 1.39 0.421
#> 6 F 1.51 1.38 0.707 -0.639 0.289
#> 7 G -1.28 1.44 -1.35 1.94 -1.04
#> 8 H -1.56 -0.434 0.231 0.467 0.656
#> 9 I -0.553 -1.64 -0.761 0.133 0.249
#> 10 J -0.950 0.418 -0.843 0.593 0.343
grab_new(Table1, c("ColH", "ColE"))
#> Joining, by = "ID"Joining, by = "ID"
#> # A tibble: 10 x 6
#> ID ColA ColB ColC ColE ColH
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A -0.658 -0.613 0.689 -0.850 -0.795
#> 2 B 0.143 0.732 -0.212 -1.74 1.99
#> 3 C -0.966 -0.570 -0.354 0.559 -1.11
#> 4 D -1.05 0.269 -0.856 -0.370 -1.35
#> 5 E 0.255 -0.349 0.329 1.39 0.421
#> 6 F 1.51 1.38 0.707 -0.639 0.289
#> 7 G -1.28 1.44 -1.35 1.94 -1.04
#> 8 H -1.56 -0.434 0.231 0.467 0.656
#> 9 I -0.553 -1.64 -0.761 0.133 0.249
#> 10 J -0.950 0.418 -0.843 0.593 0.343
Which works as expected.
Created on 2020-01-21 by the reprex package (v0.3.0)

Resources