I'm having trouble with referring to a dynamic name inside a for loop. I have the following dataframe:
library("tidyverse")
set.seed(10)
df <- data.frame(group = rep(LETTERS[1:3], each = 100),
measure1 = runif(300, min = 20, max = 30),
measure2 = runif(300, min = 10, max = 20),
risk = rbinom(n=300, size=1, prob=0.05))
df[c(20,21,103),2] <- NA
df[c(44,80,201),3] <- NA
df[c(61,98,207),4] <- NA
in which i calculate limits:
df %>% group_by(group)%>%
mutate(LLm1 = quantile(measure1[risk == 0], prob = c(0.05), na.rm = TRUE),
ULm2 = quantile(measure2[risk == 0], prob = c(0.95), na.rm = TRUE))%>% ungroup -> df
Now I would like to calculate which rows are outside a certain interval, and i would like to vary this interval according to some predefined additions or subtractions
for (k in seq(-2, 2, length.out = 5)){
for (i in seq(0.7, 1.0, length.out = 4)){
df %>%
group_by(group) %>%
mutate(
!!paste0("new", format(i, nsmall=1), "var", format(k, nsmall=0)) := ifelse(measure1 < (LLm1 - k) & measure2 >= (ULm2 - i), 1, ifelse(is.na(measure1) | is.na(measure2), NA, 0)),
) %>% ungroup -> df
}
}
So far this works fine, i'd like to however make another variable based on the dynamic variables im making, like this:
for (k in seq(-2, 2, length.out = 5)){
for (i in seq(0.7, 1.0, length.out = 4)){
df %>%
group_by(group) %>%
mutate(
!!paste0("var", format(i, nsmall=1), "_", format(k, nsmall=0)) := ifelse(measure1 < (LLm1 - k) & measure2 >= (ULm2 - i), 1, ifelse(is.na(measure1) | is.na(measure2), NA, 0)),
!!paste0("new", format(i, nsmall=1), "_", format(k, nsmall=0)) := ifelse((!!paste0("var", format(i, nsmall=1), "_", format(k, nsmall=0))) == 1 & risk == 1, 1, ifelse(is.na(measure1) | is.na(measure2), NA, 0)),
) %>% ungroup -> df
}
}
Unfortunately im not getting the desired results, as a row who has 1 in the new variable 'var' and a 1 in 'risk' does not get a 1 but it gets a 0. I've tried some alternatives with brackets and eval() but the result stays the same. Can anyone show me were I'm wrong in the syntax or help me explain how to refer to a dynamic name inside the for loop?
You have to wrap the string with the variable name after the double-bang operator !! in sym() to make sure it is treated as a name.
Further, as I pointed out in my comment, the condition risk == 1 in ifelse is never met, so it seems like its not working, so for the example at hand, I dropped that condition.
for (k in seq(-2, 2, length.out = 5)){
for (i in seq(0.7, 1.0, length.out = 4)){
df %>%
group_by(group) %>%
mutate(
!!paste0("var", format(i, nsmall=1), "_", format(k, nsmall=0)) := ifelse(measure1 < (LLm1 - k) & measure2 >= (ULm2 - i), 1, ifelse(is.na(measure1) | is.na(measure2), NA, 0)),
!!paste0("new", format(i, nsmall=1), "_", format(k, nsmall=0)) := ifelse((!! sym(paste0("var", format(i, nsmall=1), "_", format(k, nsmall=0)))) == 1, 1, ifelse(is.na(measure1) | is.na(measure2), NA, 0)),
) %>% ungroup -> df
}
}
df %>% filter(if_any(starts_with("new"), ~ .x != 0))
#> # A tibble: 23 x 46
#> group measure1 measure2 risk LLm1 ULm2 `var0.7_-2` `new0.7_-2` `var0.8_-2`
#> <chr> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 22.6 19.7 0 20.6 19.5 1 1 1
#> 2 A 21.0 19.3 0 20.6 19.5 1 1 1
#> 3 A 21.8 19.8 0 20.6 19.5 1 1 1
#> 4 A 21.9 19.0 0 20.6 19.5 1 1 1
#> 5 A 22.2 18.9 1 20.6 19.5 1 1 1
#> 6 A 21.7 19.5 0 20.6 19.5 1 1 1
#> 7 B 22.1 19.4 0 20.6 19.4 1 1 1
#> 8 B 22.1 18.6 0 20.6 19.4 0 0 0
#> 9 B 20.6 19.6 0 20.6 19.4 1 1 1
#> 10 B 22.5 18.7 0 20.6 19.4 0 0 1
#> # ... with 13 more rows, and 37 more variables: new0.8_-2 <dbl>,
#> # var0.9_-2 <dbl>, new0.9_-2 <dbl>, var1.0_-2 <dbl>, new1.0_-2 <dbl>,
#> # var0.7_-1 <dbl>, new0.7_-1 <dbl>, var0.8_-1 <dbl>, new0.8_-1 <dbl>,
#> # var0.9_-1 <dbl>, new0.9_-1 <dbl>, var1.0_-1 <dbl>, new1.0_-1 <dbl>,
#> # var0.7_0 <dbl>, new0.7_0 <dbl>, var0.8_0 <dbl>, new0.8_0 <dbl>,
#> # var0.9_0 <dbl>, new0.9_0 <dbl>, var1.0_0 <dbl>, new1.0_0 <dbl>,
#> # var0.7_1 <dbl>, new0.7_1 <dbl>, var0.8_1 <dbl>, new0.8_1 <dbl>, ...
Another way to approach the problem is to use the dplyover package (disclaimer: I'm the maintainer), and here the funciton dplyover::over2x() which generates columns in a nested loop style based on the input objects.
After dplyover::over2x() we can just use a regular call to across() and target all variables that start_with("var").
library(dplyover)
df %>%
group_by(group) %>%
mutate(
over2x(seq(-2, 2, length.out = 5),
seq(0.7, 1.0, length.out = 4),
~ ifelse(measure1 < (LLm1 - .x) & measure2 >= (ULm2 - .y), 1, ifelse(is.na(measure1) | is.na(measure2), NA, 0)),
.names = "var{y}_{x}"
),
across(starts_with("var"),
~ ifelse(.x == 1, 1,
ifelse(is.na(measure1) | is.na(measure2),
NA, 0)),
.names = "{gsub('var', 'new', {.col})}")
) %>%
ungroup()
#> # A tibble: 300 x 46
#> group measure1 measure2 risk LLm1 ULm2 `var0.7_-2` `var0.8_-2` `var0.9_-2`
#> <chr> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 27.2 12.6 0 20.8 19.3 0 0 0
#> 2 A 29.0 14.1 0 20.8 19.3 0 0 0
#> 3 A 29.3 14.6 0 20.8 19.3 0 0 0
#> 4 A 25.4 14.4 0 20.8 19.3 0 0 0
#> 5 A 25.2 12.3 0 20.8 19.3 0 0 0
#> 6 A 25.6 10.0 0 20.8 19.3 0 0 0
#> 7 A 28.2 10.3 0 20.8 19.3 0 0 0
#> 8 A 27.4 19.7 0 20.8 19.3 0 0 0
#> 9 A 24.8 19.3 0 20.8 19.3 0 0 0
#> 10 A 22.7 18.0 0 20.8 19.3 0 0 0
#> # ... with 290 more rows, and 37 more variables: var1_-2 <dbl>,
#> # var0.7_-1 <dbl>, var0.8_-1 <dbl>, var0.9_-1 <dbl>, var1_-1 <dbl>,
#> # var0.7_0 <dbl>, var0.8_0 <dbl>, var0.9_0 <dbl>, var1_0 <dbl>,
#> # var0.7_1 <dbl>, var0.8_1 <dbl>, var0.9_1 <dbl>, var1_1 <dbl>,
#> # var0.7_2 <dbl>, var0.8_2 <dbl>, var0.9_2 <dbl>, var1_2 <dbl>,
#> # new0.7_-2 <dbl>, new0.8_-2 <dbl>, new0.9_-2 <dbl>, new1_-2 <dbl>,
#> # new0.7_-1 <dbl>, new0.8_-1 <dbl>, new0.9_-1 <dbl>, new1_-1 <dbl>, ...
The data
df <- data.frame(group = rep(LETTERS[1:3], each = 100),
measure1 = runif(300, min = 20, max = 30),
measure2 = runif(300, min = 10, max = 20),
risk = rbinom(n=300, size=1, prob=0.05))
df[c(20,21,103),2] <- NA
df[c(44,80,201),3] <- NA
df[c(61,98,207),4] <- NA
library(dplyr)
df %>% group_by(group)%>%
mutate(LLm1 = quantile(measure1[risk == 0], prob = c(0.05), na.rm = TRUE),
ULm2 = quantile(measure2[risk == 0], prob = c(0.95), na.rm = TRUE))%>% ungroup -> df
Created on 2022-11-25 by the reprex package (v2.0.1)
Related
I'm trying to perform a forloop to apply a custom summarise function to all the numeric columns in the dataframe. The forloop output seems to ignore the grouping factor- however, if I perform the function alone on a single column (without the for loop), it provides the correct output.
#sample df
structure(list(participant = c("pt04", "pt75", "pt21", "pt73",
"pt27", "pt39", "pt43", "pt52", "pt69", "pt49", "pt50", "pt56",
"pt62", "pt68", "pt22", "pt64", "pt54", "pt79", "pt36", "pt26",
"pt65", "pt38"), group = structure(c(1L, 2L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L
), .Label = c("c", "e"), class = "factor"), sex = structure(c(2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 1L, 1L, 2L, 1L), .Label = c("m", "f"), class = "factor"),
fm_bdc3 = c(18.535199635968, 23.52996574649, 17.276246451976,
11.526088555461, 23.805048656112, 23.08597823716, 28.691020942436,
28.968097858499, 23.378093165331, 22.491725344661, 14.609015054932,
19.734914019306, 31.947412973684, 25.152298171274, 12.007356801787,
20.836128108938, 22.322230884349, 14.777652101515, 21.389572717608,
16.992853675086, 14.138189878472, 17.777235203826), fm_rec3 = c(18.545007190636,
23.017181869742, 17.031403417007, 11.227201061887, 23.581434653208,
21.571120542136, 28.919246372213, 28.138632765662, 22.990408911436,
22.274932676852, 14.012586350504, 19.066675709151, 30.897705534847,
24.491614222412, 11.670939246332, 20.306494543464, 22.052263684182,
14.252973638341, 21.028701096846, 17.207104923059, 13.172159777361,
17.610831079442), fm_chg = c(0.00980755466799721, -0.512783876747999,
-0.244843034968998, -0.298887493573998, -0.223614002904,
-1.514857695024, 0.228225429777002, -0.829465092836998, -0.387684253894999,
-0.216792667809003, -0.596428704428, -0.668238310155001,
-1.049707438837, -0.660683948862001, -0.336417555455, -0.529633565474001,
-0.269967200167002, -0.524678463173998, -0.360871620761998,
0.214251247972999, -0.966030101111, -0.166404124383998),
fm_percchg = c(0.00052913132097943, -0.0217928016671462,
-0.0141722355981437, -0.0259313896588437, -0.00939355370091154,
-0.0656180855522784, 0.00795459423472242, -0.0286337438132355,
-0.0165832282022865, -0.00963877445980213, -0.0408260722701251,
-0.0338607155572751, -0.0328573534170568, -0.0262673392452288,
-0.028017619615079, -0.025419001203338, -0.0120940958619099,
-0.0355048596062299, -0.0168713805332318, 0.0126083147698213,
-0.0683277073949869, -0.00936051767758492)), row.names = c(NA,
-22L), class = "data.frame")
#my function:
summbygrp <- function(x) {
group_by(dexadf, group) %>%
summarise(
count = n(),
mean = mean({{x}}, na.rm = TRUE),
sd = sd({{x}}, na.rm = TRUE)
) %>%
mutate(se = sd / sqrt(11),
lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
upper.ci = mean + qt(1 - (0.05 / 2), 11 - 1) * se
)
}
#apply function to all numeric columns and print column names before output
coln = 1
for (col in dexadf) {
print(colnames(dexadf)[coln])
coln = coln + 1
if(is.numeric(col)) {
print(summbygrp(col))
} else {next}
}
#output:
[1] "fm_bdc3"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 20.6 5.48 1.65 16.9 24.3
2 e 11 20.6 5.48 1.65 16.9 24.3
[1] "fm_rec3"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 20.1 5.41 1.63 16.5 23.8
2 e 11 20.1 5.41 1.63 16.5 23.8
[1] "fm_chg"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 -0.450 0.406 0.122 -0.723 -0.178
2 e 11 -0.450 0.406 0.122 -0.723 -0.178
[1] "fm_percchg"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 -0.0227 0.0198 0.00598 -0.0360 -0.00942
2 e 11 -0.0227 0.0198 0.00598 -0.0360 -0.00942
As you can see, all the means for both groups are the same, and I know this shouldn't be true. Could someone identify the error in the code? Thank you!
So instead of using for-loops you can do better,
library(dplyr)
library(rlang)
library(purrr)
library(tibble)
dexadf <- data.frame(
stringsAsFactors = FALSE,
participant = c("pt04","pt75","pt21","pt73",
"pt27","pt39","pt43","pt52","pt69","pt49","pt50",
"pt56","pt62","pt68","pt22","pt64","pt54","pt79",
"pt36","pt26","pt65","pt38"),
fm_bdc3 = c(18.535199635968,23.52996574649,
17.276246451976,11.526088555461,23.805048656112,
23.08597823716,28.691020942436,28.968097858499,
23.378093165331,22.491725344661,14.609015054932,19.734914019306,
31.947412973684,25.152298171274,12.007356801787,
20.836128108938,22.322230884349,14.777652101515,
21.389572717608,16.992853675086,14.138189878472,17.777235203826),
fm_rec3 = c(18.545007190636,
23.017181869742,17.031403417007,11.227201061887,23.581434653208,
21.571120542136,28.919246372213,28.138632765662,
22.990408911436,22.274932676852,14.012586350504,19.066675709151,
30.897705534847,24.491614222412,11.670939246332,
20.306494543464,22.052263684182,14.252973638341,
21.028701096846,17.207104923059,13.172159777361,17.610831079442),
fm_chg = c(0.00980755466799721,
-0.512783876747999,-0.244843034968998,-0.298887493573998,
-0.223614002904,-1.514857695024,0.228225429777002,
-0.829465092836998,-0.387684253894999,-0.216792667809003,
-0.596428704428,-0.668238310155001,-1.049707438837,
-0.660683948862001,-0.336417555455,-0.529633565474001,
-0.269967200167002,-0.524678463173998,-0.360871620761998,
0.214251247972999,-0.966030101111,-0.166404124383998),
fm_percchg = c(0.00052913132097943,
-0.0217928016671462,-0.0141722355981437,-0.0259313896588437,
-0.00939355370091154,-0.0656180855522784,
0.00795459423472242,-0.0286337438132355,-0.0165832282022865,
-0.00963877445980213,-0.0408260722701251,-0.0338607155572751,
-0.0328573534170568,-0.0262673392452288,-0.028017619615079,
-0.025419001203338,-0.0120940958619099,
-0.0355048596062299,-0.0168713805332318,0.0126083147698213,
-0.0683277073949869,-0.00936051767758492),
group = as.factor(c("c","e",
"e","c","c","e","c","e","c","e","e","c",
"e","c","c","e","e","c","e","c","e",
"c")),
sex = as.factor(c("f","m",
"m","m","m","m","m","f","m","f","f","f",
"f","f","f","f","m","f","m","m","f",
"m"))
)
dexadf <- as_tibble(dexadf)
# Note the use of .data pronoun, since columns will passed to this function as characters
summbygrp <- function(df, x) {
df %>%
group_by(group) %>%
summarise(
count = n(),
mean = mean(.data[[x]], na.rm = TRUE), # use of .data
sd = sd(.data[[x]], na.rm = TRUE) # use of .data
) %>%
mutate(se = sd / sqrt(11),
lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
upper.ci = mean + qt(1 - (0.05 / 2), 11 - 1) * se
)
}
# Here we extract the numerical columns of the dataset
cols <- dexadf %>%
select(where(is.numeric)) %>% colnames(.)
cols
#> [1] "fm_bdc3" "fm_rec3" "fm_chg" "fm_percchg"
# Then instead of for loops we can simply use this map function
map(.x = cols, ~ summbygrp(dexadf, .x))
#> [[1]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 19.3 5.49 1.66 15.6 23.0
#> 2 e 11 21.9 5.40 1.63 18.2 25.5
#>
#> [[2]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 19.1 5.54 1.67 15.3 22.8
#> 2 e 11 21.2 5.31 1.60 17.7 24.8
#>
#> [[3]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 -0.256 0.311 0.0938 -0.465 -0.0470
#> 2 e 11 -0.645 0.407 0.123 -0.918 -0.371
#>
#> [[4]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
#> 2 e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
# -------------------------------------------------------------------
# we can also bind all the output results (dataframes) in a single dataframe
map_dfr(.x = cols, ~ summbygrp(dexadf, .x), .id = "vars")
#> # A tibble: 8 × 8
#> vars group count mean sd se lower.ci upper.ci
#> <chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 c 11 19.3 5.49 1.66 15.6 23.0
#> 2 1 e 11 21.9 5.40 1.63 18.2 25.5
#> 3 2 c 11 19.1 5.54 1.67 15.3 22.8
#> 4 2 e 11 21.2 5.31 1.60 17.7 24.8
#> 5 3 c 11 -0.256 0.311 0.0938 -0.465 -0.0470
#> 6 3 e 11 -0.645 0.407 0.123 -0.918 -0.371
#> 7 4 c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
#> 8 4 e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
Created on 2022-07-09 by the reprex package (v2.0.1)
out <- df %>%
pivot_longer(starts_with('fm')) %>%
group_by(name, group) %>%
summarise(
count = n(),
mean = mean(value, na.rm = TRUE),
sd = sd(value, na.rm = TRUE),
.groups = 'drop'
) %>%
mutate(se = sd / sqrt(11),
lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
upper.ci = mean + qt(1 - (0.05 / 2), 11 - 1) * se
)
out
# A tibble: 8 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_bdc3 c 11 19.3 5.49 1.66 15.6 23.0
2 fm_bdc3 e 11 21.9 5.40 1.63 18.2 25.5
3 fm_chg c 11 -0.256 0.311 0.0938 -0.465 -0.0470
4 fm_chg e 11 -0.645 0.407 0.123 -0.918 -0.371
5 fm_percchg c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
6 fm_percchg e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
7 fm_rec3 c 11 19.1 5.54 1.67 15.3 22.8
8 fm_rec3 e 11 21.2 5.31 1.60 17.7 24.8
if you need the list, just split it:
split(out, ~name)
$fm_bdc3
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_bdc3 c 11 19.3 5.49 1.66 15.6 23.0
2 fm_bdc3 e 11 21.9 5.40 1.63 18.2 25.5
$fm_chg
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_chg c 11 -0.256 0.311 0.0938 -0.465 -0.0470
2 fm_chg e 11 -0.645 0.407 0.123 -0.918 -0.371
$fm_percchg
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_percchg c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
2 fm_percchg e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
$fm_rec3
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_rec3 c 11 19.1 5.54 1.67 15.3 22.8
2 fm_rec3 e 11 21.2 5.31 1.60 17.7 24.8
A similar answer to the above, but combining across and summarise:
df |>
group_by(group) |>
summarise(
across(
where(is.numeric),
list(
mean = ~mean(.x, na.rm = TRUE),
sd = ~sd(.x, na.rm = TRUE),
n = ~n()
),
.names = "{.col}.{.fn}"
)
) |>
pivot_longer(
-group,
names_to = c("measure", "stat"),
names_sep = "\\."
) |>
pivot_wider(
names_from = stat,
values_from = value
) |>
mutate(
se = sd / sqrt(n),
lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
upper.ci = mean + qt(1 - (0.05 / 2), 11 - 1) * se
) |>
arrange(measure)
I would like to compute differences among several columns, per identifiers (see script below for reproducible example and target data frame).
This question is somehow similar, but only for pairs of identifiers. I can't think on how to adapt it.
I could also have several data frame, one per identifier, but I also don't know in that case how to compute multiple columns differences.
The code below allows to create a sample dataset, and has the code I currently use. It gives me what I want, I'd just like to know if there is a way not to spell out all the differences I want to compute (in my dataset, I have more parameters and depths than in that sample data).
Thanks in advance for your help!
library(tidyverse)
# sample data
create.dt <- function(t = 0) {
data.frame(parameter = rep(c("temperature","oxygen"), each = 3),
date = rep(c(Sys.Date()+t), each = 6),
depth = rep(1:3, times = 2),
value = c(data.frame(x = rnorm(3, 16, 2)) %>%
arrange(-x) %>% pull,
data.frame(x = rnorm(3, 7, 1)) %>%
arrange(-x) %>% pull
))
}
# Multi-site dataset
dt <- rbind(
cbind(site = "A", create.dt(t = c(-3:0))),
cbind(site = "B", create.dt(t = c(-3:0))),
cbind(site = "C", create.dt(t = c(-3:0))),
cbind(site = "D", create.dt(t = c(-3:0))),
cbind(site = "E", create.dt(t = c(-3:0))))
# Reshape the data and compute differences
dt %>% pivot_wider(id_cols = c(site,date), names_from = c(parameter,depth), values_from = value, names_sep = "_") %>%
# do the difference, depth to depth, parameter by parameter
# What I would like is not have to write manually each differences pair
mutate(temperature_1_2 = temperature_1 - temperature_2,
temperature_1_3 = temperature_1 - temperature_3,
temperature_2_3 = temperature_2 - temperature_3,
oxygen_1_2 = oxygen_1 - oxygen_2,
oxygen_1_3 = oxygen_1 - oxygen_3,
oxygen_2_3 = oxygen_2 - oxygen_3)
library(tidyverse)
library(rlang)
create.dt <- function(t = 0) {
data.frame(parameter = rep(c("temperature","oxygen"), each = 3),
date = rep(c(Sys.Date()+t), each = 6),
depth = rep(1:3, times = 2),
value = c(data.frame(x = rnorm(3, 16, 2)) %>%
arrange(-x) %>% pull,
data.frame(x = rnorm(3, 7, 1)) %>%
arrange(-x) %>% pull
))
}
# Multi-site dataset
dt <- rbind(
cbind(site = "A", create.dt(t = c(-3:0))),
cbind(site = "B", create.dt(t = c(-3:0))),
cbind(site = "C", create.dt(t = c(-3:0))),
cbind(site = "D", create.dt(t = c(-3:0))),
cbind(site = "E", create.dt(t = c(-3:0))))
# result
temperature <- str_c("temperature_", 1:3)
oxygen <- str_c("oxygen_", 1:3)
temperature_frml <- combn(temperature, m = 2, FUN = function(x) str_c(x, collapse = " - "))
oxygen_frml <- combn(oxygen, m = 2, FUN = function(x) str_c(x, collapse = " - "))
all_frml <- c(temperature_frml, oxygen_frml)
df_wider <- dt %>% pivot_wider(
id_cols = c(site, date),
names_from = c(parameter, depth),
values_from = value,
names_sep = "_"
)
bind_cols(df_wider,
map_dfc(
.x = all_frml,
.f = ~ transmute(.data = df_wider,!!.x := eval(parse_expr(.x)))
))
#> # A tibble: 20 x 14
#> site date temperature_1 temperature_2 temperature_3 oxygen_1 oxygen_2
#> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 2021-12-11 17.6 17.1 12.9 7.34 6.86
#> 2 A 2021-12-12 17.6 17.1 12.9 7.34 6.86
#> 3 A 2021-12-13 17.6 17.1 12.9 7.34 6.86
#> 4 A 2021-12-14 17.6 17.1 12.9 7.34 6.86
#> 5 B 2021-12-11 17.1 15.6 13.7 8.52 7.58
#> 6 B 2021-12-12 17.1 15.6 13.7 8.52 7.58
#> 7 B 2021-12-13 17.1 15.6 13.7 8.52 7.58
#> 8 B 2021-12-14 17.1 15.6 13.7 8.52 7.58
#> 9 C 2021-12-11 17.7 15.5 13.6 7.66 7.31
#> 10 C 2021-12-12 17.7 15.5 13.6 7.66 7.31
#> 11 C 2021-12-13 17.7 15.5 13.6 7.66 7.31
#> 12 C 2021-12-14 17.7 15.5 13.6 7.66 7.31
#> 13 D 2021-12-11 16.5 16.4 14.5 7.50 7.27
#> 14 D 2021-12-12 16.5 16.4 14.5 7.50 7.27
#> 15 D 2021-12-13 16.5 16.4 14.5 7.50 7.27
#> 16 D 2021-12-14 16.5 16.4 14.5 7.50 7.27
#> 17 E 2021-12-11 16.7 16.1 15.7 7.52 7.51
#> 18 E 2021-12-12 16.7 16.1 15.7 7.52 7.51
#> 19 E 2021-12-13 16.7 16.1 15.7 7.52 7.51
#> 20 E 2021-12-14 16.7 16.1 15.7 7.52 7.51
#> # ... with 7 more variables: oxygen_3 <dbl>,
#> # temperature_1 - temperature_2 <dbl>, temperature_1 - temperature_3 <dbl>,
#> # temperature_2 - temperature_3 <dbl>, oxygen_1 - oxygen_2 <dbl>,
#> # oxygen_1 - oxygen_3 <dbl>, oxygen_2 - oxygen_3 <dbl>
Created on 2021-12-14 by the reprex package (v2.0.1)
I have a problem I'm trying to solve, and I can't seem to find a succinct solution. There are a few similar questions on SO, but nothing that quite fits.
Take some sample data:
library(dplyr)
dat <- tibble(
group1 = factor(sample(c("one", "two"), 10, replace = T)),
group2 = factor(sample(c("alpha", "beta"), 10, replace = T)),
var1 = rnorm(10, 20, 2),
var2 = rnorm(10, 20, 2),
var3 = rnorm(10, 20, 2),
other1 = sample(c("a", "b", "c"), 10, replace = T),
other2 = sample(c("a", "b", "c"), 10, replace = T),
)
I would like to summarise just the numeric variables (i.e. ignoring other1 and other2), but have the output grouped by group1 and group2.
I have tried something like this, but it returns an error as it attempts to apply my summarise() functions to the grouping variables too.
dat %>%
group_by(group1, group2) %>%
select(where(is.numeric)) %>%
map(~ .x %>%
filter(!is.na(.x)) %>%
summarise(mean = mean(.x),
sd = sd(.x),
median = median(.x),
q1 = quantile(.x, p = .25),
q3 = quantile(.x, p = .75))
)
My expected output would be something like
group1 group2 mean sd median q1 q3
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
1 one alpha ? ? ? ? ?
2 one beta ? ? ? ? ?
3 two alpha ? ? ? ? ?
4 two beta ? ? ? ? ?
Any solutions would be greatly appreciated.
Thanks,
Sam
Try:
dat %>% group_by(group1,group2) %>%
summarize(across(is.numeric,c(sd = sd,
mean = mean,
median =median,
q1 = function(x) quantile(x,.25),
q3 = function(x) quantile(x,.75))))
group1 group2 var1_sd var1_mean var1_median var1_q1 var1_q3 var2_sd var2_mean var2_median var2_q1 var2_q3 var3_sd
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 one alpha 4.06 20.6 19.3 18.3 22.2 1.12 17.9 17.3 17.2 18.2 1.09
2 one beta 0.726 18.7 18.7 18.4 18.9 0.348 18.8 18.8 18.7 18.9 0.604
3 two alpha 1.31 19.9 20.0 19.3 20.6 1.10 17.8 18.3 17.4 18.5 0.624
4 two beta 0.777 21.2 21.2 21.0 21.5 1.13 19.6 19.6 19.2 20.0 0.0161
You can also pass the columns to the functions in summarise:
dat %>%
group_by(group1, group2) %>%
summarise(mean = mean(var1:var3),
sd = sd(var1:var3),
median = median(var1:var3),
q1 = quantile(var1:var3, p = .25),
q3 = quantile(var1:var3, p = .75))
dat
# A tibble: 4 x 7
# Groups: group1 [2]
# group1 group2 mean sd median q1 q3
# <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 one alpha 19.1 0.707 19.1 18.8 19.3
# 2 one beta 17.5 1.29 17.5 16.8 18.3
# 3 two alpha 17.1 NA 17.1 17.1 17.1
# 4 two beta 19.9 NA 19.9 19.9 19.9
I have a data frame, each of the data point has a structure like: ID, measure, timemark
ID measure timemark
001 12 15
003 3 13
004 365 0
003 1 13
ID is a unique study ID for a person, while measure is the number of days the person using a service at that time, and timemark is a number range from 0 to 51 which indicate the 52 weeks in a year x
Now I want to create dataframe of 52 columns, each of them consists of the number of days they spent in the service that week (so maximal number of days should be 7 each week). For each person, they can have more than one entry in a time point. In this sense, the total days should be the sum of the two rows.
So I want to make it like:
ID ... week13 week14 week15 week 16
001 ... 0 0 7 5
003 ... 4 0 0 0
004 ... 7 7 7 7
I was struggling with the logic inside and guess it would be related to the quotient and remainder of measure, but I couldn't make the way through. Can anyone help?
We can first create one row per ID and timemark and sum the measure values. We create a list dividing measure into steps of 7 along with it's remainder. Using unnest_longer we get the data in long format and create timemark column appending the week number and finally spread the data in wide format.
library(dplyr)
library(tidyr)
df %>%
group_by(ID, timemark) %>%
summarise(measure = sum(measure)) %>%
mutate(measure = list(c(rep(7, floor(measure/7)), measure %% 7))) %>%
unnest_longer(measure) %>%
mutate(timemark = paste0('week', first(timemark) + 0:(n() - 1))) %>%
slice(1:pmin(n(), 52)) %>%
mutate(timemark = factor(timemark, levels = paste0('week', 0:51))) %>%
spread(timemark, measure)
#Or using pivot_wider in new tidyr
#pivot_wider(names_from = timemark, values_from = measure)
# A tibble: 3 x 53
# Groups: ID [3]
# ID week0 week1 week2 week3 week4 week5 week6 week7 week8 week9 week10 week11 week12 week13 week14 week15 week16
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 7 5
#2 3 NA NA NA NA NA NA NA NA NA NA NA NA NA 4 NA NA NA
#3 4 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
# … with 35 more variables: week17 <dbl>, week18 <dbl>, week19 <dbl>, week20 <dbl>, week21 <dbl>, week22 <dbl>,
# week23 <dbl>, week24 <dbl>, week25 <dbl>, week26 <dbl>, week27 <dbl>, week28 <dbl>, week29 <dbl>, week30 <dbl>,
# week31 <dbl>, week32 <dbl>, week33 <dbl>, week34 <dbl>, week35 <dbl>, week36 <dbl>, week37 <dbl>, week38 <dbl>,
# week39 <dbl>, week40 <dbl>, week41 <dbl>, week42 <dbl>, week43 <dbl>, week44 <dbl>, week45 <dbl>, week46 <dbl>,
# week47 <dbl>, week48 <dbl>, week49 <dbl>, week50 <dbl>, week51 <dbl>
data
df <- structure(list(ID = c(1L, 3L, 4L, 3L), measure = c(12L, 3L, 365L,
1L), timemark = c(15L, 13L, 0L, 13L)), class = "data.frame", row.names = c(NA, -4L))
I want to leave what I tried for you. First, I created a master data frame that contains all combination of ID and timemark for each ID using expand(). Then, I created result in the following way. I defined groups by ID and timemark and summed up measure. Then, I identified how many weeks (rows) I needed in order to expand the result in the first mutate(). Then, I expanded the data frame using expandRows() in the splitstackshape package. Then, I updated the numbers in timemark to have correct week numbers in the 2nd mutate(). Then, I handled some calculation to assign the number of days in each week. lag(measure - 7 * row_number(), default = 7) creates a vector that contains how many days are still left in measure. I needed to replace some of the numbers using logical conditions. For each group, when the number of row is 1, assign the value in measure. When res is larger than 7, assign 7 to res. (Any number larger than 7 is 7 since each week (row) can take up to 7 days.) Otherwise, keep the original value in res.
library(dplyr)
library(tidyr)
library(splitstackshape)
master <- expand(mydf, timemark = 0:51, ID)
group_by(mydf, ID, timemark) %>%
summarize(measure = sum(measure)) %>%
ungroup %>%
group_by(group = 1:n()) %>%
mutate(nrow = as.integer(measure / 7) + 1) %>%
expandRows(count = "nrow") %>%
mutate(timemark = first(timemark):(first(timemark) + n() - 1),
res = lag(measure - 7 * row_number(), default = 7),
res = case_when(n() == 1 ~ as.numeric(measure),
res > 7 ~ 7,
TRUE ~ res)) -> result
The final step was to join result to master. I dropped unnecessary columns, made the data frame wide, and updated column names.
left_join(master, result, by = c("ID", "timemark"))%>%
select(-c(measure, group)) %>%
spread(key = timemark, value = res, fill = 0) %>%
rename_at(vars(-ID),
.funs = list(~paste("week", ., sep = "")))
ID week0 week1 week2 week3 week4 week5 week6 week7 week8 week9 week10 week11 week12 week13 week14 week15 week16
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 5
2 3 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0
3 4 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
# … with 35 more variables: week17 <dbl>, week18 <dbl>, week19 <dbl>, week20 <dbl>, week21 <dbl>, week22 <dbl>,
# week23 <dbl>, week24 <dbl>, week25 <dbl>, week26 <dbl>, week27 <dbl>, week28 <dbl>, week29 <dbl>, week30 <dbl>,
# week31 <dbl>, week32 <dbl>, week33 <dbl>, week34 <dbl>, week35 <dbl>, week36 <dbl>, week37 <dbl>, week38 <dbl>,
# week39 <dbl>, week40 <dbl>, week41 <dbl>, week42 <dbl>, week43 <dbl>, week44 <dbl>, week45 <dbl>, week46 <dbl>,
# week47 <dbl>, week48 <dbl>, week49 <dbl>, week50 <dbl>, week51 <dbl>
DATA
mydf <- structure(list(ID = c(1L, 3L, 4L, 3L), measure = c(12L, 3L, 365L,
1L), timemark = c(15L, 13L, 0L, 13L)), class = "data.frame", row.names = c(NA,
-4L))
I've got a data like below:
df <- structure(list(x1 = c(0.544341260178568, 0.412555423655238, -0.013600925280521,
-0.947831642260442, -0.705819557090343, -0.440052278478676, 0.583360907624305,
-0.548217106316072, -0.381271093402877, 1.66078031000975), x2 = c(-2.17595468838955,
3.73045998213455, 7.88166053118859, 0.295257601073637, -0.503260811313588,
0.118118179398699, 3.77037347523743, 2.92758197923041, 3.40618904087335,
1.45012335878481), x3 = c(14.1085074738418, 9.46630939737492,
7.30026032988652, 10.1473062197382, 11.0336174184083, 7.09744336163716,
16.6871358354018, 13.5030856142587, 14.8384334167838, 1.82381360524456
), x4 = c(-2.78166486821977, -3.14368874900826, -3.70425316743753,
-4.34268218961615, -3.03557313652054, -2.74059520574829, -4.10826186695405,
-1.97243713944283, -3.88803755426516, -2.56315085425652), x5 = c(-0.279614449281486,
-0.480466773938402, -1.43353886424161, 0.286937906279445, 0.701999608919316,
0.591932833840325, 0.994266002713824, 1.03424778687263, 0.462618513817936,
-3.08491622131441)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
Now I want to create columns that are sums, products and differences of each pair of column. With sums it went easy:
combn(df, 2, function(x) {
x %>% transmute(!!paste(names(.), collapse = '+') := rowSums(.))},
simplify = FALSE)
But now I need to calculate products and differences. As there are no equivalent to rowSums for difference or product my approach fails here. I was thinking about something like
combn(df, 2, function(x) {
x %>% transmute(!!paste(names(.), collapse = '-') := apply(., 1, `-`)},
simplify = FALSE)
but it doesn't work.
Here's one "tidy" approach. It relies on converting the data to a long format where each row in your original df gets assigned an id and the columns are gathered.
This allows us to do a full join of the data frame with itself. That way you get all pairwise combinations of your columns. Once in this format, applying the sums, products and differences becomes really easy.
Update: Reformat output
library(tidyverse)
df <-
structure(
list(
x1 = c(
0.544341260178568,
0.412555423655238,
-0.013600925280521,-0.947831642260442,
-0.705819557090343,
-0.440052278478676,
0.583360907624305,-0.548217106316072,
-0.381271093402877,
1.66078031000975
),
x2 = c(
-2.17595468838955,
3.73045998213455,
7.88166053118859,
0.295257601073637,
-0.503260811313588,
0.118118179398699,
3.77037347523743,
2.92758197923041,
3.40618904087335,
1.45012335878481
),
x3 = c(
14.1085074738418,
9.46630939737492,
7.30026032988652,
10.1473062197382,
11.0336174184083,
7.09744336163716,
16.6871358354018,
13.5030856142587,
14.8384334167838,
1.82381360524456
),
x4 = c(
-2.78166486821977,
-3.14368874900826,
-3.70425316743753,-4.34268218961615,
-3.03557313652054,
-2.74059520574829,
-4.10826186695405,-1.97243713944283,
-3.88803755426516,
-2.56315085425652
),
x5 = c(
-0.279614449281486,-0.480466773938402,
-1.43353886424161,
0.286937906279445,
0.701999608919316,
0.591932833840325,
0.994266002713824,
1.03424778687263,
0.462618513817936,-3.08491622131441
)
),
row.names = c(NA,-10L),
class = c("tbl_df",
"tbl", "data.frame")
)
# Add an id for each observation and covert to long format
df_wrangled <- df %>%
mutate(id = 1:n()) %>%
gather(col, val, -id)
pairs <- full_join(df_wrangled, df_wrangled, by = "id") %>%
mutate(
sum = val.x + val.y,
prod = val.x * val.y,
diff = val.x - val.y
)
head(pairs)
#> # A tibble: 6 x 8
#> id col.x val.x col.y val.y sum prod diff
#> <int> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 1 x1 0.544 x1 0.544 1.09 0.296 0
#> 2 1 x1 0.544 x2 -2.18 -1.63 -1.18 2.72
#> 3 1 x1 0.544 x3 14.1 14.7 7.68 -13.6
#> 4 1 x1 0.544 x4 -2.78 -2.24 -1.51 3.33
#> 5 1 x1 0.544 x5 -0.280 0.265 -0.152 0.824
#> 6 2 x1 0.413 x1 0.413 0.825 0.170 0
pairs_wrangled <- pairs %>%
filter(col.x != col.y) %>%
gather(operation, val, sum, prod, diff) %>%
mutate(
label = paste0(
col.x,
case_when(operation == "sum" ~ "+", operation == "diff" ~ "-", operation == "prod" ~ "*"),
col.y
)
) %>%
select(id, label, val) %>%
spread(label, val)
head(pairs_wrangled)
#> # A tibble: 6 x 61
#> id `x1-x2` `x1-x3` `x1-x4` `x1-x5` `x1*x2` `x1*x3` `x1*x4` `x1*x5`
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2.72 -13.6 3.33 0.824 -1.18 7.68 -1.51 -0.152
#> 2 2 -3.32 -9.05 3.56 0.893 1.54 3.91 -1.30 -0.198
#> 3 3 -7.90 -7.31 3.69 1.42 -0.107 -0.0993 0.0504 0.0195
#> 4 4 -1.24 -11.1 3.39 -1.23 -0.280 -9.62 4.12 -0.272
#> 5 5 -0.203 -11.7 2.33 -1.41 0.355 -7.79 2.14 -0.495
#> 6 6 -0.558 -7.54 2.30 -1.03 -0.0520 -3.12 1.21 -0.260
#> # … with 52 more variables: `x1+x2` <dbl>, `x1+x3` <dbl>, `x1+x4` <dbl>,
#> # `x1+x5` <dbl>, `x2-x1` <dbl>, `x2-x3` <dbl>, `x2-x4` <dbl>,
#> # `x2-x5` <dbl>, `x2*x1` <dbl>, `x2*x3` <dbl>, `x2*x4` <dbl>,
#> # `x2*x5` <dbl>, `x2+x1` <dbl>, `x2+x3` <dbl>, `x2+x4` <dbl>,
#> # `x2+x5` <dbl>, `x3-x1` <dbl>, `x3-x2` <dbl>, `x3-x4` <dbl>,
#> # `x3-x5` <dbl>, `x3*x1` <dbl>, `x3*x2` <dbl>, `x3*x4` <dbl>,
#> # `x3*x5` <dbl>, `x3+x1` <dbl>, `x3+x2` <dbl>, `x3+x4` <dbl>,
#> # `x3+x5` <dbl>, `x4-x1` <dbl>, `x4-x2` <dbl>, `x4-x3` <dbl>,
#> # `x4-x5` <dbl>, `x4*x1` <dbl>, `x4*x2` <dbl>, `x4*x3` <dbl>,
#> # `x4*x5` <dbl>, `x4+x1` <dbl>, `x4+x2` <dbl>, `x4+x3` <dbl>,
#> # `x4+x5` <dbl>, `x5-x1` <dbl>, `x5-x2` <dbl>, `x5-x3` <dbl>,
#> # `x5-x4` <dbl>, `x5*x1` <dbl>, `x5*x2` <dbl>, `x5*x3` <dbl>,
#> # `x5*x4` <dbl>, `x5+x1` <dbl>, `x5+x2` <dbl>, `x5+x3` <dbl>,
#> # `x5+x4` <dbl>
Created on 2019-04-02 by the reprex package (v0.2.1)