subtract columns in the datatable in pairs - r

I have a data. table
h1 <- c(rnorm(50, mean = 50, sd = 1),
rnorm(50, mean = 60, sd = 1),
rnorm(50, mean = 70, sd = 1),
rnorm(50, mean = 80, sd = 1))
w1 <- c(rnorm(150, mean = 150, sd = 1),
rnorm(150, mean = 160, sd = 1),
rnorm(150, mean = 170, sd = 1),
rnorm(150, mean = 180, sd = 1))
e1 <- c(rnorm(150, mean = 150, sd = 1),
rnorm(150, mean = 160, sd = 1),
rnorm(150, mean = 170, sd = 1),
rnorm(150, mean = 180, sd = 1))
h2 <- c(rnorm(50, mean = 50, sd = 1),
rnorm(50, mean = 60, sd = 1),
rnorm(50, mean = 70, sd = 1),
rnorm(50, mean = 80, sd = 1))
w2 <- c(rnorm(150, mean = 150, sd = 1),
rnorm(150, mean = 160, sd = 1),
rnorm(150, mean = 170, sd = 1),
rnorm(150, mean = 180, sd = 1))
e2 <- c(rnorm(150, mean = 150, sd = 1),
rnorm(150, mean = 160, sd = 1),
rnorm(150, mean = 170, sd = 1),
rnorm(150, mean = 180, sd = 1))
df <- data.frame(h1,w1,e1,h2,w2,e2)
> df
h1 w1 e1 h2 w2 e2
1 49.85148 148.6694 149.4619 49.05355 151.1857 147.7629
2 49.81708 149.7126 149.1840 50.75627 150.4471 149.2853
I would like to find the difference between the columns in data. table?
what I want to get:
h1
w1
e1
h2
w2
e2
h2-h1
w2-w1
e2-e1
49.85148
148.6694
149.4619
49.05355
151.1857
147.7629
-0,79793
2,5163
-1,699

You can divide the dataframe in half and subtract the second part with the first one and assign new columns names.
n <- ncol(df)
col1 <- 1:(n/2)
col2 <- (n/2 + 1):n
new_col_name <- paste(names(df)[col2], names(df)[col1], sep = '-')
df[new_col_name] <- df[col2] - df[col1]
head(df)
# h1 w1 e1 h2 w2 e2 h2-h1 w2-w1 e2-e1
#1 49.43 149.6 150.2 49.39 149.4 150.1 -0.03665 -0.193458 -0.09741
#2 50.10 149.7 150.8 49.03 149.6 149.6 -1.07812 -0.053813 -1.25975
#3 50.05 149.8 150.7 48.42 149.8 151.0 -1.62448 -0.007319 0.32304
#4 49.77 149.7 148.8 49.92 148.7 149.1 0.15132 -1.005730 0.23139
#5 49.44 149.9 151.0 48.39 150.9 150.0 -1.04673 0.977863 -0.97748
#6 49.58 148.8 151.1 50.41 150.6 148.6 0.83088 1.800697 -2.52930

Perhaps this:
library(tidyverse)
str_sub(names(df), 1, 1) %>%
unique() %>%
map_dfc(~ select(df,starts_with(.x)) %>% transmute('{.x}1-{.x}2' := .[, 1] - .[, 2])) %>%
{bind_cols(df, .)} %>% as.tibble()
#> Warning: `as.tibble()` was deprecated in tibble 2.0.0.
#> Please use `as_tibble()` instead.
#> The signature and semantics have changed, see `?as_tibble`.
#> # A tibble: 600 x 9
#> h1 w1 e1 h2 w2 e2 `h1-h2` `w1-w2` `e1-e2`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 49.6 152. 151. 48.4 151. 148. 1.22 1.13 2.90
#> 2 50.1 150. 150. 51.6 150. 150. -1.50 -0.301 0.552
#> 3 50.9 150. 150. 50.4 151. 148. 0.474 -1.28 1.64
#> 4 51.6 149. 152. 51.0 150. 149. 0.599 -0.613 2.78
#> 5 50.2 149. 150. 51.4 150. 152. -1.27 -1.24 -2.44
#> 6 50.9 149. 152. 50.2 151. 151. 0.682 -2.18 0.815
#> 7 49.4 149. 149. 49.0 151. 149. 0.315 -2.57 -0.858
#> 8 50.3 151. 152. 50.4 148. 149. -0.0959 2.38 2.70
#> 9 50.3 151. 151. 49.0 150. 150. 1.30 0.611 0.749
#> 10 51.5 149. 150. 50.0 150. 149. 1.47 -1.21 0.366
#> # … with 590 more rows
Created on 2021-06-25 by the reprex package (v2.0.0)

Variation on the idea of splitting names up and subtracting:
sel <- names(df)[endsWith(names(df), "1")]
df[sprintf("%1$s1-%1$s2", sub("[12]$", "", sel))] <- df[sel] - df[sub("1", "2", sel)]
head(df)
# h1 w1 e1 h2 w2 e2 h1-h2 w1-w2 e1-e2
#1 49.93223 150.9997 149.2362 50.34892 150.3622 148.6164 -0.4166869 0.6375487 0.6198452
#2 48.83462 149.7938 150.9722 49.24049 150.0979 149.2758 -0.4058752 -0.3040331 1.6964756
#3 49.55578 146.8567 149.4173 48.61832 150.7250 148.2298 0.9374638 -3.8682714 1.1875108

Related

Referring to a dynamic name inside a for loop

I'm having trouble with referring to a dynamic name inside a for loop. I have the following dataframe:
library("tidyverse")
set.seed(10)
df <- data.frame(group = rep(LETTERS[1:3], each = 100),
measure1 = runif(300, min = 20, max = 30),
measure2 = runif(300, min = 10, max = 20),
risk = rbinom(n=300, size=1, prob=0.05))
df[c(20,21,103),2] <- NA
df[c(44,80,201),3] <- NA
df[c(61,98,207),4] <- NA
in which i calculate limits:
df %>% group_by(group)%>%
mutate(LLm1 = quantile(measure1[risk == 0], prob = c(0.05), na.rm = TRUE),
ULm2 = quantile(measure2[risk == 0], prob = c(0.95), na.rm = TRUE))%>% ungroup -> df
Now I would like to calculate which rows are outside a certain interval, and i would like to vary this interval according to some predefined additions or subtractions
for (k in seq(-2, 2, length.out = 5)){
for (i in seq(0.7, 1.0, length.out = 4)){
df %>%
group_by(group) %>%
mutate(
!!paste0("new", format(i, nsmall=1), "var", format(k, nsmall=0)) := ifelse(measure1 < (LLm1 - k) & measure2 >= (ULm2 - i), 1, ifelse(is.na(measure1) | is.na(measure2), NA, 0)),
) %>% ungroup -> df
}
}
So far this works fine, i'd like to however make another variable based on the dynamic variables im making, like this:
for (k in seq(-2, 2, length.out = 5)){
for (i in seq(0.7, 1.0, length.out = 4)){
df %>%
group_by(group) %>%
mutate(
!!paste0("var", format(i, nsmall=1), "_", format(k, nsmall=0)) := ifelse(measure1 < (LLm1 - k) & measure2 >= (ULm2 - i), 1, ifelse(is.na(measure1) | is.na(measure2), NA, 0)),
!!paste0("new", format(i, nsmall=1), "_", format(k, nsmall=0)) := ifelse((!!paste0("var", format(i, nsmall=1), "_", format(k, nsmall=0))) == 1 & risk == 1, 1, ifelse(is.na(measure1) | is.na(measure2), NA, 0)),
) %>% ungroup -> df
}
}
Unfortunately im not getting the desired results, as a row who has 1 in the new variable 'var' and a 1 in 'risk' does not get a 1 but it gets a 0. I've tried some alternatives with brackets and eval() but the result stays the same. Can anyone show me were I'm wrong in the syntax or help me explain how to refer to a dynamic name inside the for loop?
You have to wrap the string with the variable name after the double-bang operator !! in sym() to make sure it is treated as a name.
Further, as I pointed out in my comment, the condition risk == 1 in ifelse is never met, so it seems like its not working, so for the example at hand, I dropped that condition.
for (k in seq(-2, 2, length.out = 5)){
for (i in seq(0.7, 1.0, length.out = 4)){
df %>%
group_by(group) %>%
mutate(
!!paste0("var", format(i, nsmall=1), "_", format(k, nsmall=0)) := ifelse(measure1 < (LLm1 - k) & measure2 >= (ULm2 - i), 1, ifelse(is.na(measure1) | is.na(measure2), NA, 0)),
!!paste0("new", format(i, nsmall=1), "_", format(k, nsmall=0)) := ifelse((!! sym(paste0("var", format(i, nsmall=1), "_", format(k, nsmall=0)))) == 1, 1, ifelse(is.na(measure1) | is.na(measure2), NA, 0)),
) %>% ungroup -> df
}
}
df %>% filter(if_any(starts_with("new"), ~ .x != 0))
#> # A tibble: 23 x 46
#> group measure1 measure2 risk LLm1 ULm2 `var0.7_-2` `new0.7_-2` `var0.8_-2`
#> <chr> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 22.6 19.7 0 20.6 19.5 1 1 1
#> 2 A 21.0 19.3 0 20.6 19.5 1 1 1
#> 3 A 21.8 19.8 0 20.6 19.5 1 1 1
#> 4 A 21.9 19.0 0 20.6 19.5 1 1 1
#> 5 A 22.2 18.9 1 20.6 19.5 1 1 1
#> 6 A 21.7 19.5 0 20.6 19.5 1 1 1
#> 7 B 22.1 19.4 0 20.6 19.4 1 1 1
#> 8 B 22.1 18.6 0 20.6 19.4 0 0 0
#> 9 B 20.6 19.6 0 20.6 19.4 1 1 1
#> 10 B 22.5 18.7 0 20.6 19.4 0 0 1
#> # ... with 13 more rows, and 37 more variables: new0.8_-2 <dbl>,
#> # var0.9_-2 <dbl>, new0.9_-2 <dbl>, var1.0_-2 <dbl>, new1.0_-2 <dbl>,
#> # var0.7_-1 <dbl>, new0.7_-1 <dbl>, var0.8_-1 <dbl>, new0.8_-1 <dbl>,
#> # var0.9_-1 <dbl>, new0.9_-1 <dbl>, var1.0_-1 <dbl>, new1.0_-1 <dbl>,
#> # var0.7_0 <dbl>, new0.7_0 <dbl>, var0.8_0 <dbl>, new0.8_0 <dbl>,
#> # var0.9_0 <dbl>, new0.9_0 <dbl>, var1.0_0 <dbl>, new1.0_0 <dbl>,
#> # var0.7_1 <dbl>, new0.7_1 <dbl>, var0.8_1 <dbl>, new0.8_1 <dbl>, ...
Another way to approach the problem is to use the dplyover package (disclaimer: I'm the maintainer), and here the funciton dplyover::over2x() which generates columns in a nested loop style based on the input objects.
After dplyover::over2x() we can just use a regular call to across() and target all variables that start_with("var").
library(dplyover)
df %>%
group_by(group) %>%
mutate(
over2x(seq(-2, 2, length.out = 5),
seq(0.7, 1.0, length.out = 4),
~ ifelse(measure1 < (LLm1 - .x) & measure2 >= (ULm2 - .y), 1, ifelse(is.na(measure1) | is.na(measure2), NA, 0)),
.names = "var{y}_{x}"
),
across(starts_with("var"),
~ ifelse(.x == 1, 1,
ifelse(is.na(measure1) | is.na(measure2),
NA, 0)),
.names = "{gsub('var', 'new', {.col})}")
) %>%
ungroup()
#> # A tibble: 300 x 46
#> group measure1 measure2 risk LLm1 ULm2 `var0.7_-2` `var0.8_-2` `var0.9_-2`
#> <chr> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 27.2 12.6 0 20.8 19.3 0 0 0
#> 2 A 29.0 14.1 0 20.8 19.3 0 0 0
#> 3 A 29.3 14.6 0 20.8 19.3 0 0 0
#> 4 A 25.4 14.4 0 20.8 19.3 0 0 0
#> 5 A 25.2 12.3 0 20.8 19.3 0 0 0
#> 6 A 25.6 10.0 0 20.8 19.3 0 0 0
#> 7 A 28.2 10.3 0 20.8 19.3 0 0 0
#> 8 A 27.4 19.7 0 20.8 19.3 0 0 0
#> 9 A 24.8 19.3 0 20.8 19.3 0 0 0
#> 10 A 22.7 18.0 0 20.8 19.3 0 0 0
#> # ... with 290 more rows, and 37 more variables: var1_-2 <dbl>,
#> # var0.7_-1 <dbl>, var0.8_-1 <dbl>, var0.9_-1 <dbl>, var1_-1 <dbl>,
#> # var0.7_0 <dbl>, var0.8_0 <dbl>, var0.9_0 <dbl>, var1_0 <dbl>,
#> # var0.7_1 <dbl>, var0.8_1 <dbl>, var0.9_1 <dbl>, var1_1 <dbl>,
#> # var0.7_2 <dbl>, var0.8_2 <dbl>, var0.9_2 <dbl>, var1_2 <dbl>,
#> # new0.7_-2 <dbl>, new0.8_-2 <dbl>, new0.9_-2 <dbl>, new1_-2 <dbl>,
#> # new0.7_-1 <dbl>, new0.8_-1 <dbl>, new0.9_-1 <dbl>, new1_-1 <dbl>, ...
The data
df <- data.frame(group = rep(LETTERS[1:3], each = 100),
measure1 = runif(300, min = 20, max = 30),
measure2 = runif(300, min = 10, max = 20),
risk = rbinom(n=300, size=1, prob=0.05))
df[c(20,21,103),2] <- NA
df[c(44,80,201),3] <- NA
df[c(61,98,207),4] <- NA
library(dplyr)
df %>% group_by(group)%>%
mutate(LLm1 = quantile(measure1[risk == 0], prob = c(0.05), na.rm = TRUE),
ULm2 = quantile(measure2[risk == 0], prob = c(0.95), na.rm = TRUE))%>% ungroup -> df
Created on 2022-11-25 by the reprex package (v2.0.1)

Compute multiple column to column differences R

I would like to compute differences among several columns, per identifiers (see script below for reproducible example and target data frame).
This question is somehow similar, but only for pairs of identifiers. I can't think on how to adapt it.
I could also have several data frame, one per identifier, but I also don't know in that case how to compute multiple columns differences.
The code below allows to create a sample dataset, and has the code I currently use. It gives me what I want, I'd just like to know if there is a way not to spell out all the differences I want to compute (in my dataset, I have more parameters and depths than in that sample data).
Thanks in advance for your help!
library(tidyverse)
# sample data
create.dt <- function(t = 0) {
data.frame(parameter = rep(c("temperature","oxygen"), each = 3),
date = rep(c(Sys.Date()+t), each = 6),
depth = rep(1:3, times = 2),
value = c(data.frame(x = rnorm(3, 16, 2)) %>%
arrange(-x) %>% pull,
data.frame(x = rnorm(3, 7, 1)) %>%
arrange(-x) %>% pull
))
}
# Multi-site dataset
dt <- rbind(
cbind(site = "A", create.dt(t = c(-3:0))),
cbind(site = "B", create.dt(t = c(-3:0))),
cbind(site = "C", create.dt(t = c(-3:0))),
cbind(site = "D", create.dt(t = c(-3:0))),
cbind(site = "E", create.dt(t = c(-3:0))))
# Reshape the data and compute differences
dt %>% pivot_wider(id_cols = c(site,date), names_from = c(parameter,depth), values_from = value, names_sep = "_") %>%
# do the difference, depth to depth, parameter by parameter
# What I would like is not have to write manually each differences pair
mutate(temperature_1_2 = temperature_1 - temperature_2,
temperature_1_3 = temperature_1 - temperature_3,
temperature_2_3 = temperature_2 - temperature_3,
oxygen_1_2 = oxygen_1 - oxygen_2,
oxygen_1_3 = oxygen_1 - oxygen_3,
oxygen_2_3 = oxygen_2 - oxygen_3)
library(tidyverse)
library(rlang)
create.dt <- function(t = 0) {
data.frame(parameter = rep(c("temperature","oxygen"), each = 3),
date = rep(c(Sys.Date()+t), each = 6),
depth = rep(1:3, times = 2),
value = c(data.frame(x = rnorm(3, 16, 2)) %>%
arrange(-x) %>% pull,
data.frame(x = rnorm(3, 7, 1)) %>%
arrange(-x) %>% pull
))
}
# Multi-site dataset
dt <- rbind(
cbind(site = "A", create.dt(t = c(-3:0))),
cbind(site = "B", create.dt(t = c(-3:0))),
cbind(site = "C", create.dt(t = c(-3:0))),
cbind(site = "D", create.dt(t = c(-3:0))),
cbind(site = "E", create.dt(t = c(-3:0))))
# result
temperature <- str_c("temperature_", 1:3)
oxygen <- str_c("oxygen_", 1:3)
temperature_frml <- combn(temperature, m = 2, FUN = function(x) str_c(x, collapse = " - "))
oxygen_frml <- combn(oxygen, m = 2, FUN = function(x) str_c(x, collapse = " - "))
all_frml <- c(temperature_frml, oxygen_frml)
df_wider <- dt %>% pivot_wider(
id_cols = c(site, date),
names_from = c(parameter, depth),
values_from = value,
names_sep = "_"
)
bind_cols(df_wider,
map_dfc(
.x = all_frml,
.f = ~ transmute(.data = df_wider,!!.x := eval(parse_expr(.x)))
))
#> # A tibble: 20 x 14
#> site date temperature_1 temperature_2 temperature_3 oxygen_1 oxygen_2
#> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 2021-12-11 17.6 17.1 12.9 7.34 6.86
#> 2 A 2021-12-12 17.6 17.1 12.9 7.34 6.86
#> 3 A 2021-12-13 17.6 17.1 12.9 7.34 6.86
#> 4 A 2021-12-14 17.6 17.1 12.9 7.34 6.86
#> 5 B 2021-12-11 17.1 15.6 13.7 8.52 7.58
#> 6 B 2021-12-12 17.1 15.6 13.7 8.52 7.58
#> 7 B 2021-12-13 17.1 15.6 13.7 8.52 7.58
#> 8 B 2021-12-14 17.1 15.6 13.7 8.52 7.58
#> 9 C 2021-12-11 17.7 15.5 13.6 7.66 7.31
#> 10 C 2021-12-12 17.7 15.5 13.6 7.66 7.31
#> 11 C 2021-12-13 17.7 15.5 13.6 7.66 7.31
#> 12 C 2021-12-14 17.7 15.5 13.6 7.66 7.31
#> 13 D 2021-12-11 16.5 16.4 14.5 7.50 7.27
#> 14 D 2021-12-12 16.5 16.4 14.5 7.50 7.27
#> 15 D 2021-12-13 16.5 16.4 14.5 7.50 7.27
#> 16 D 2021-12-14 16.5 16.4 14.5 7.50 7.27
#> 17 E 2021-12-11 16.7 16.1 15.7 7.52 7.51
#> 18 E 2021-12-12 16.7 16.1 15.7 7.52 7.51
#> 19 E 2021-12-13 16.7 16.1 15.7 7.52 7.51
#> 20 E 2021-12-14 16.7 16.1 15.7 7.52 7.51
#> # ... with 7 more variables: oxygen_3 <dbl>,
#> # temperature_1 - temperature_2 <dbl>, temperature_1 - temperature_3 <dbl>,
#> # temperature_2 - temperature_3 <dbl>, oxygen_1 - oxygen_2 <dbl>,
#> # oxygen_1 - oxygen_3 <dbl>, oxygen_2 - oxygen_3 <dbl>
Created on 2021-12-14 by the reprex package (v2.0.1)

Divide all values by reference row

Although this seems similar to this, I'm looking for a "tidy" solution...
Let's look at the following data (it's rocks compositions for some chemical elements, if you are curious):
# A tibble: 4 x 15
Rock La Ce Pr Nd Sm Eu Gd Tb Dy Ho Er Tm Yb Lu
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Upper CC 31 63 7.1 27 4.7 1 4 0.7 3.9 0.83 2.3 0.3 1.96 0.31
2 Middle CC 24 53 5.8 25 4.6 1.4 4 0.7 3.8 0.82 2.3 0.32 2.2 0.4
3 Lower CC 8 20 2.4 11 2.8 1.1 3.1 0.48 3.1 0.68 1.9 0.24 1.5 0.25
4 chondrite 0.235 0.603 0.0891 0.452 0.147 0.056 0.197 0.0363 0.243 0.0556 0.159 0.0242 0.162 0.0243
(see at the end for the dput)
This is made of three samples and a reference value (chondrite). I want to normalize the value of each element by the chondrite, for each sample, i.e. get something like that:
# A tibble: 4 x 15
Rock La Ce Pr Nd Sm Eu Gd Tb Dy Ho Er Tm Yb Lu
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Upper CC 132. 104. 79.7 59.7 32.0 17.9 20.3 19.3 16.0 14.9 14.5 12.4 12.1 12.8
2 Middle CC 102. 87.9 65.1 55.3 31.3 25 20.3 19.3 15.6 14.8 14.5 13.2 13.6 16.5
3 Lower CC 34.0 33.2 26.9 24.3 19.0 19.6 15.7 13.2 12.8 12.2 12.0 9.92 9.26 10.3
4 chondrite 1 1 1 1 1 1 1 1 1 1 1 1 1 1
In which, of course, the first 132 for df["Upper CC","La"] comes from 31 / 0.235, i.e. df["Upper CC","La"] / df["chondrite","La"]
This is trivial in excel, and can be done in plain R with something along the lines of
apply(df[,-1],1,FUN=function(z){return(z/df[4,-1])})
Give or take some unlist() and other niceties.
But how do I do this in tidyverse idiom ? I started constructing
df %>% mutate(across( where(is.numeric), ... ? .... ) )
... but could not go further.
Generalize/related question: instead of normalizing by df[4,], normalize by an arbitrary named vector.
dput(df)
structure(list(Rock = c("Upper CC", "Middle CC", "Lower CC",
"chondrite"), La = c(31, 24, 8, 0.2347), Ce = c(63, 53, 20, 0.6032
), Pr = c(7.1, 5.8, 2.4, 0.0891), Nd = c(27, 25, 11, 0.4524),
Sm = c(4.7, 4.6, 2.8, 0.1471), Eu = c(1, 1.4, 1.1, 0.056),
Gd = c(4, 4, 3.1, 0.1966), Tb = c(0.7, 0.7, 0.48, 0.0363),
Dy = c(3.9, 3.8, 3.1, 0.2427), Ho = c(0.83, 0.82, 0.68, 0.0556
), Er = c(2.3, 2.3, 1.9, 0.1589), Tm = c(0.3, 0.32, 0.24,
0.0242), Yb = c(1.96, 2.2, 1.5, 0.1625), Lu = c(0.31, 0.4,
0.25, 0.0243)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
You can use :
library(dplyr)
df %>% mutate(across(where(is.numeric), ~./.[Rock == "chondrite"]))
# Rock La Ce Pr Nd Sm Eu Gd Tb Dy
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Upper … 132. 104. 79.7 59.7 32.0 17.9 20.3 19.3 16.1
#2 Middle… 102. 87.9 65.1 55.3 31.3 25.0 20.3 19.3 15.7
#3 Lower … 34.1 33.2 26.9 24.3 19.0 19.6 15.8 13.2 12.8
#4 chondr… 1 1 1 1 1 1 1 1 1
# … with 5 more variables: Ho <dbl>, Er <dbl>, Tm <dbl>,
# Yb <dbl>, Lu <dbl>
Using matrix calculations.
m <- t(dat[-1])
dat[-1] <- t(m / m[,4])
# Rock La Ce Pr Nd Sm Eu Gd Tb Dy Ho Er Tm Yb Lu
# 1 Upper CC 131.91489 104.47761 79.68575 59.73451 31.97279 17.85714 20.30457 19.28375 16.04938 14.92806 14.46541 12.396694 12.098765 12.75720
# 2 Middle CC 102.12766 87.89386 65.09540 55.30973 31.29252 25.00000 20.30457 19.28375 15.63786 14.74820 14.46541 13.223140 13.580247 16.46091
# 3 Lower CC 34.04255 33.16750 26.93603 24.33628 19.04762 19.64286 15.73604 13.22314 12.75720 12.23022 11.94969 9.917355 9.259259 10.28807
# 4 chondrite 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.000000 1.000000 1.00000
Data
dat <- structure(list(Rock = c("Upper CC", "Middle CC", "Lower CC",
"chondrite"), La = c(31, 24, 8, 0.235), Ce = c(63, 53, 20, 0.603
), Pr = c(7.1, 5.8, 2.4, 0.0891), Nd = c(27, 25, 11, 0.452),
Sm = c(4.7, 4.6, 2.8, 0.147), Eu = c(1, 1.4, 1.1, 0.056),
Gd = c(4, 4, 3.1, 0.197), Tb = c(0.7, 0.7, 0.48, 0.0363),
Dy = c(3.9, 3.8, 3.1, 0.243), Ho = c(0.83, 0.82, 0.68, 0.0556
), Er = c(2.3, 2.3, 1.9, 0.159), Tm = c(0.3, 0.32, 0.24,
0.0242), Yb = c(1.96, 2.2, 1.5, 0.162), Lu = c(0.31, 0.4,
0.25, 0.0243)), class = "data.frame", row.names = c("1",
"2", "3", "4"))
Using data.table
library(data.table)
setDT(df1)[, (names(df1)[-1]) := lapply(.SD, function(x)
x/x[match( "chondrite", Rock)]), .SDcols = -1]

Using dplyr summarise() for specific columns within purrr map() with grouped data

I have a problem I'm trying to solve, and I can't seem to find a succinct solution. There are a few similar questions on SO, but nothing that quite fits.
Take some sample data:
library(dplyr)
dat <- tibble(
group1 = factor(sample(c("one", "two"), 10, replace = T)),
group2 = factor(sample(c("alpha", "beta"), 10, replace = T)),
var1 = rnorm(10, 20, 2),
var2 = rnorm(10, 20, 2),
var3 = rnorm(10, 20, 2),
other1 = sample(c("a", "b", "c"), 10, replace = T),
other2 = sample(c("a", "b", "c"), 10, replace = T),
)
I would like to summarise just the numeric variables (i.e. ignoring other1 and other2), but have the output grouped by group1 and group2.
I have tried something like this, but it returns an error as it attempts to apply my summarise() functions to the grouping variables too.
dat %>%
group_by(group1, group2) %>%
select(where(is.numeric)) %>%
map(~ .x %>%
filter(!is.na(.x)) %>%
summarise(mean = mean(.x),
sd = sd(.x),
median = median(.x),
q1 = quantile(.x, p = .25),
q3 = quantile(.x, p = .75))
)
My expected output would be something like
group1 group2 mean sd median q1 q3
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
1 one alpha ? ? ? ? ?
2 one beta ? ? ? ? ?
3 two alpha ? ? ? ? ?
4 two beta ? ? ? ? ?
Any solutions would be greatly appreciated.
Thanks,
Sam
Try:
dat %>% group_by(group1,group2) %>%
summarize(across(is.numeric,c(sd = sd,
mean = mean,
median =median,
q1 = function(x) quantile(x,.25),
q3 = function(x) quantile(x,.75))))
group1 group2 var1_sd var1_mean var1_median var1_q1 var1_q3 var2_sd var2_mean var2_median var2_q1 var2_q3 var3_sd
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 one alpha 4.06 20.6 19.3 18.3 22.2 1.12 17.9 17.3 17.2 18.2 1.09
2 one beta 0.726 18.7 18.7 18.4 18.9 0.348 18.8 18.8 18.7 18.9 0.604
3 two alpha 1.31 19.9 20.0 19.3 20.6 1.10 17.8 18.3 17.4 18.5 0.624
4 two beta 0.777 21.2 21.2 21.0 21.5 1.13 19.6 19.6 19.2 20.0 0.0161
You can also pass the columns to the functions in summarise:
dat %>%
group_by(group1, group2) %>%
summarise(mean = mean(var1:var3),
sd = sd(var1:var3),
median = median(var1:var3),
q1 = quantile(var1:var3, p = .25),
q3 = quantile(var1:var3, p = .75))
dat
# A tibble: 4 x 7
# Groups: group1 [2]
# group1 group2 mean sd median q1 q3
# <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 one alpha 19.1 0.707 19.1 18.8 19.3
# 2 one beta 17.5 1.29 17.5 16.8 18.3
# 3 two alpha 17.1 NA 17.1 17.1 17.1
# 4 two beta 19.9 NA 19.9 19.9 19.9

dplyr summarise_all with quantile and other functions

I have a dataframe PatientA
Height Weight Age BMI
<dbl> <dbl> <dbl> <dbl>
1 161 72.2 27 27.9
2 164 61.0 21 22.8
3 171 72.0 30 24.6
4 169. 63.9 25 22.9
5 174. 64.4 27 21.1
6 160 50.9 22 19.9
7 172 77.5 22 26.3
8 165 54.5 22 20
9 173 82.4 29 27.5
10 169 76.6 22 26.9
and I would like to get some statistics for each column. I have the next working code which deals only with quantiles
genStat <- PatientsA %>%
summarise_all(funs(list(quantile(., probs = c(0.25, 0.5, 0.75))))) %>%
unnest %>%
transpose %>%
setNames(., c('25%', '50%', '75%')) %>%
map_df(unlist) %>%
bind_cols(data.frame(vars = names(PatientsA)), .)
and I need to add mean and sd to summarise_all like this
genStat <- PatientsA %>%
summarise_all(funs(mean,sd,list(quantile(., probs = c(0.25, 0.5, 0.75))))) %>%
unnest %>%
transpose %>%
setNames(., c('mean','sd','25%', '50%', '75%')) %>%
map_df(unlist) %>%
bind_cols(data.frame(vars = names(PatientsA)), .)
This straightforward approach fails returning the next error:
Error in names(object) <- nm : 'names' attribute [5] must be the
same length as the vector [3]
I'm a newbie in R, so what is the right syntax for completing this task?
This is what I would suggest. There is a little repetition in the code (calling quantile three times) but overall I think it is easier to understand and debug.
library(tidyverse)
PatientsA %>%
gather("variable", "value") %>%
group_by(variable) %>%
summarize(mean_val = mean(value),
sd_val = sd(value),
q25 = quantile(value, probs = .25),
q50 = quantile(value, probs = .5),
q75 = quantile(value, probs = .75))
## A tibble: 4 x 6
# variable mean_val sd_val q25 q50 q75
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Age 24.7 3.33 22 23.5 27
#2 BMI 24.0 3.08 21.5 23.8 26.7
#3 Height 168. 5.01 164. 169 172.
#4 Weight 67.5 10.3 61.7 68.2 75.5
We could also place the quantile output in a list and then unnest
library(tidyverse)
PatientsA %>%
gather %>%
group_by(key) %>%
summarise_at(vars('value'),
funs(mean,
sd,
quantile = list(as.tibble(as.list(quantile(.,
probs = c(0.25, 0.5, 0.75))))))) %>%
unnest
# A tibble: 4 x 6
# key mean sd `25%` `50%` `75%`
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Age 24.7 3.33 22 23.5 27
#2 BMI 24.0 3.08 21.5 23.8 26.7
#3 Height 168. 5.01 164. 169 172.
#4 Weight 67.5 10.3 61.7 68.2 75.5
Or using pivot_longer
PatientsA %>%
pivot_longer(cols = everything()) %>%
group_by(name) %>%
summarise(across(value, list(mean= ~ mean(., na.rm = TRUE),
sd = ~ sd(., na.rm = TRUE),
quantile = ~ list(as_tibble(as.list(quantile(.,
probs = c(0.25, 0.5, 0.75)))))))) %>%
unnest(c(value_quantile))
# A tibble: 4 x 6
name value_mean value_sd `25%` `50%` `75%`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Age 24.7 3.33 22 23.5 27
2 BMI 24.0 3.08 21.5 23.8 26.7
3 Height 168. 5.01 164. 169 172.
4 Weight 67.5 10.3 61.7 68.2 75.5
###data
PatientsA <- structure(list(Height = c(161, 164, 171, 169, 174, 160, 172,
165, 173, 169), Weight = c(72.2, 61, 72, 63.9, 64.4, 50.9, 77.5,
54.5, 82.4, 76.6), Age = c(27L, 21L, 30L, 25L, 27L, 22L, 22L,
22L, 29L, 22L), BMI = c(27.9, 22.8, 24.6, 22.9, 21.1, 19.9, 26.3,
20, 27.5, 26.9)), class = "data.frame", row.names = c("1", "2",
"3", "4", "5", "6", "7", "8", "9", "10"))

Resources