I have a data.frame like this.
library(tidyverse)
df <- tibble(
  name = rep(c("a", "b"), each = 100),
  value = runif(100 * 2),
  date = rep(Sys.Date() + days(1:100), 2)
)
I would like to do something very similar to the code below. Is there a way to create these 10 columns in one go? Basically, I am trying to find out how much the 99th percentile changes if we remove one observation, then 2, then 3, and so on.
df %>%
  nest_by(name) %>%
  mutate(
    q99_lag_0 = data %>% pull(value) %>% quantile(.99),
    q99_lag_1 = data %>% pull(value) %>% tail(-1) %>% quantile(.99),
    q99_lag_2 = data %>% pull(value) %>% tail(-2) %>% quantile(.99),
    q99_lag_3 = data %>% pull(value) %>% tail(-3) %>% quantile(.99),
    q99_lag_4 = data %>% pull(value) %>% tail(-4) %>% quantile(.99),
    q99_lag_5 = data %>% pull(value) %>% tail(-5) %>% quantile(.99),
    q99_lag_6 = data %>% pull(value) %>% tail(-6) %>% quantile(.99),
    q99_lag_7 = data %>% pull(value) %>% tail(-7) %>% quantile(.99),
    q99_lag_8 = data %>% pull(value) %>% tail(-8) %>% quantile(.99),
    q99_lag_9 = data %>% pull(value) %>% tail(-9) %>% quantile(.99),
    q99_lag_10 = data %>% pull(value) %>% tail(-10) %>% quantile(.99)
  )
First, reproducible random data:
library(dplyr)
library(purrr) # map_dfc
set.seed(42)
df <- tibble(
  name = rep(c("a", "b"), each = 100),
  value = runif(100 * 2),
  date = rep(Sys.Date() + 1:100, 2)
)
head(df)
# # A tibble: 6 x 3
# name value date
# <chr> <dbl> <date>
# 1 a 0.915 2021-12-14
# 2 a 0.937 2021-12-15
# 3 a 0.286 2021-12-16
# 4 a 0.830 2021-12-17
# 5 a 0.642 2021-12-18
# 6 a 0.519 2021-12-19
Then the call:
df %>%
  nest_by(name) %>%
  mutate(
    q99_lag_0 = quantile(data$value, 0.99),
    map_dfc(-1:-10, ~ tibble("q99_lag_{-.x}" := quantile(tail(data$value, .x), 0.99)))
  ) %>%
  ungroup()
# # A tibble: 2 x 13
# name data q99_lag_0 q99_lag_1 q99_lag_2 q99_lag_3 q99_lag_4 q99_lag_5 q99_lag_6 q99_lag_7 q99_lag_8 q99_lag_9 q99_lag_10
# <chr> <list<tbl_df[,2]>> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 a [100 x 2] 0.983 0.983 0.983 0.983 0.983 0.983 0.983 0.983 0.983 0.983 0.983
# 2 b [100 x 2] 0.963 0.963 0.963 0.963 0.963 0.963 0.946 0.946 0.946 0.947 0.947
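Since the question is really about how much the 99th percentile changes, a hedged follow-up sketch: pivot the wide result above to long form and difference against the lag-0 quantile. Here wide_res is a hypothetical name for the tibble produced by the pipeline above; the column names follow that output.
library(tidyr)
wide_res %>%                                      # hypothetical name for the wide result above
  select(-data) %>%                               # drop the nested list-column
  pivot_longer(starts_with("q99_lag_"),
               names_to = "lag",
               names_prefix = "q99_lag_",
               names_transform = list(lag = as.integer),
               values_to = "q99") %>%
  group_by(name) %>%
  mutate(change_vs_full = q99 - first(q99)) %>%   # difference to the lag-0 (full-sample) quantile
  ungroup()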
I would like to perform multiple pairwise t-tests on a dataset containing about 400 different column variables and 3 subject groups, and extract p-values for every comparison. A shorter representative example of the data, using only 2 variables, could be the following:
df <- tibble(var1 = rnorm(90, 1, 1), var2 = rnorm(90, 1.5, 1), group = rep(1:3, each = 30))
Ideally, the end result will be a summarised data frame containing four columns: one for the variable being tested (var1, var2, etc.), two for the groups being tested each time, and a final one for the p-value.
I've tried duplicating the group column in the long form, and doing a double group_by in order to do the comparisons, but with no result:
result <- df %>%
  pivot_longer(var1:var2, "var", "value") %>%
  rename(group_a = group) %>%
  mutate(group_b = group_a) %>%
  group_by(group_a, group_b) %>%
  summarise(n = n())
We can reshape the data into 'long' format with pivot_longer, then, grouped by 'group', apply pairwise.t.test, convert the output to a tibble with tidy (from broom), and unnest the list column:
library(dplyr)
library(tidyr)
library(broom)
df %>%
  pivot_longer(cols = -group, names_to = 'grp') %>%
  group_by(group) %>%
  summarise(out = list(pairwise.t.test(value, grp) %>% tidy)) %>%
  unnest(c(out))
Output:
# A tibble: 3 x 4
group group1 group2 p.value
<int> <chr> <chr> <dbl>
1 1 var2 var1 0.0760
2 2 var2 var1 0.0233
3 3 var2 var1 0.000244
In case you end up wanting more information about the t-tests, here is an approach that also lets you extract the degrees of freedom and the value of the test statistic:
library(dplyr)
library(tidyr)
library(purrr)
library(broom)
df <- tibble(
  var1 = rnorm(90, 1, 1),
  var2 = rnorm(90, 1.5, 1),
  group = rep(1:3, each = 30)
)
df %>%
  select(-group) %>%
  names() %>%
  map_dfr(~ {
    y <- .
    combn(3, 2) %>%
      t() %>%
      as.data.frame() %>%
      pmap_dfr(function(V1, V2) {
        df %>%
          select(group, all_of(y)) %>%
          filter(group %in% c(V1, V2)) %>%
          t.test(as.formula(sprintf("%s ~ group", y)), ., var.equal = TRUE) %>%
          tidy() %>%
          transmute(
            y = y,
            group_1 = V1,
            group_2 = V2,
            df = parameter,
            t_value = statistic,
            p_value = p.value
          )
      })
  })
#> # A tibble: 6 x 6
#> y group_1 group_2 df t_value p_value
#> <chr> <int> <int> <dbl> <dbl> <dbl>
#> 1 var1 1 2 58 -0.337 0.737
#> 2 var1 1 3 58 -1.35 0.183
#> 3 var1 2 3 58 -1.06 0.295
#> 4 var2 1 2 58 -0.152 0.879
#> 5 var2 1 3 58 1.72 0.0908
#> 6 var2 2 3 58 1.67 0.100
And here is @akrun's answer tweaked to give the same p-values as the approach above. Note the p.adjust.method = "none", which gives unadjusted, independent t-tests and will therefore inflate your Type I error rate.
df %>%
  pivot_longer(
    cols = -group,
    names_to = "y"
  ) %>%
  group_by(y) %>%
  summarise(
    out = list(
      tidy(
        pairwise.t.test(
          value,
          group,
          p.adjust.method = "none",
          pool.sd = FALSE
        )
      )
    )
  ) %>%
  unnest(c(out))
#> # A tibble: 6 x 4
#> y group1 group2 p.value
#> <chr> <chr> <chr> <dbl>
#> 1 var1 2 1 0.737
#> 2 var1 3 1 0.183
#> 3 var1 3 2 0.295
#> 4 var2 2 1 0.879
#> 5 var2 3 1 0.0909
#> 6 var2 3 2 0.100
Created on 2021-07-30 by the reprex package (v1.0.0)
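A hedged follow-up sketch: if you would rather control the family-wise error rate than run unadjusted tests, the same pipeline can let pairwise.t.test() apply a correction through p.adjust.method, for example Holm (its default) or Bonferroni.
df %>%
  pivot_longer(cols = -group, names_to = "y") %>%
  group_by(y) %>%
  summarise(
    out = list(
      tidy(
        pairwise.t.test(value, group,
                        p.adjust.method = "holm",  # or "bonferroni"
                        pool.sd = FALSE)
      )
    )
  ) %>%
  unnest(c(out))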
I have the following tibble
test_tbl <- tibble(
  name = rep(c("John", "Allan", "George", "Peter", "Paul"), each = 12),
  category = rep(rep(LETTERS[1:4], each = 3), 5),
  replicate = rep(1:3, 20),
  value = sample.int(n = 1e5, size = 60, replace = TRUE)
)
# A tibble: 60 x 4
name category replicate value
<chr> <chr> <int> <int>
1 John A 1 71257
2 John A 2 98887
3 John A 3 87354
4 John B 1 25352
5 John B 2 69913
6 John B 3 43086
7 John C 1 24957
8 John C 2 33928
9 John C 3 79854
10 John D 1 32842
11 John D 2 19156
12 John D 3 50283
13 Allan A 1 98188
14 Allan A 2 26208
15 Allan A 3 69329
16 Allan B 1 32696
17 Allan B 2 81240
18 Allan B 3 54689
19 Allan C 1 77044
20 Allan C 2 97776
# … with 40 more rows
I want to group_by(name, category) and perform 3 t.test calls, comparing categories B, C, and D with category A.
I would like to store the estimate and p.value from the output. The expected result is something like this:
# A tibble: 5 x 7
name B_vs_A_estimate B_vs_A_p_value C_vs_A_estimate C_vs_A_p_value D_vs_A_estimate D_vs_A_p_value
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 John -0.578 0.486 0.198 0.309 0.631 0.171
2 Allan 0.140 0.644 0.728 0.283 0.980 0.485
3 George -0.778 0.320 -0.424 0.391 -0.154 0.589
4 Peter -0.435 0.470 -0.156 0.722 0.315 0.0140
5 Paul 0.590 0.0150 -0.473 0.475 0.681 0.407
I would prefer a solution using tidyverse and/or broom.
There are many ways to achieve the desired output, but maybe this one is the most intuitive and the easiest to debug (you can put a browser() anywhere):
test_tbl %>%
  group_by(name) %>%
  do({
    sub_tbl <- .
    expand.grid(g1 = "A", g2 = c("B", "C", "D"), stringsAsFactors = FALSE) %>%
      mutate(test = as.character(glue::glue("{g1}_vs_{g2}"))) %>%
      rowwise() %>%
      do({
        gs <- .
        t_res <- t.test(sub_tbl %>% filter(category == gs$g1) %>% pull(value),
                        sub_tbl %>% filter(category == gs$g2) %>% pull(value))
        data.frame(test = gs$test, estimate = t_res$statistic, p_value = t_res$p.value,
                   stringsAsFactors = FALSE)
      })
  }) %>%
  ungroup() %>%
  gather(key = "statistic", value = "val", -name, -test) %>%
  mutate(test_statistic = paste(test, statistic, sep = "_")) %>%
  select(-test, -statistic) %>%
  spread(key = "test_statistic", value = "val")
Result
# A tibble: 5 x 7
name A_vs_B_estimate A_vs_B_p_value A_vs_C_estimate A_vs_C_p_value A_vs_D_estimate A_vs_D_p_value
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Allan -0.270 0.803 -1.03 0.396 1.55 0.250
2 George 0.201 0.855 0.221 0.838 1.07 0.380
3 John -1.59 0.249 0.0218 0.984 -0.410 0.704
4 Paul 0.116 0.918 -1.62 0.215 -1.53 0.212
5 Peter 0.471 0.664 0.551 0.611 0.466 0.680
It groups the records by name, then applies a function (do #1). That function saves the sub data frame in sub_tbl, expands all the test cases (expand.grid), and creates a test name from the two letters combined. Then, for each combination, it applies the function that runs the t-tests (do #2). That anonymous function performs the test between group 1 (g1) and group 2 (g2) and returns a data frame with the results.
The second part basically rearranges the columns to have the final output.
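A hedged alternative for that second reshaping part: tidyr's newer pivot functions can replace the gather()/spread() pair. Here t_res_long is a hypothetical name for the ungrouped output of the outer do() block (columns name, test, estimate, p_value).
library(tidyr)
t_res_long %>%   # hypothetical name for the ungroup()-ed do() result
  pivot_longer(c(estimate, p_value), names_to = "statistic", values_to = "val") %>%
  pivot_wider(names_from = c(test, statistic), values_from = val, names_sep = "_")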
test_tbl %>%
  dplyr::group_by(name) %>%
  dplyr::summarise(
    estimate_AB = t.test(value[category == "A" | category == "B"] ~
                           category[category == "A" | category == "B"]) %>%
      (function(x) { x$estimate[1] - x$estimate[2] }),
    pvalue_AB = t.test(value[category == "A" | category == "B"] ~
                         category[category == "A" | category == "B"]) %>%
      (function(x) { x$p.value })
  )
Here is what I did for testing A against B by group. I think you could extend my approach, or try to incorporate the code from the first solution.
EDIT: cleaner code
map(unique(test_tbl$name), function(nm) { test_tbl %>% filter(name == nm) }) %>%
  map2(unique(test_tbl$name), function(dat, nm) {
    map(LETTERS[2:4], function(cat) {
      dat %>%
        filter(category == "A") %>%
        pull %>%
        t.test(dat %>% filter(category == cat) %>% pull)
    }) %>%
      map_dfr(broom::glance) %>%
      select(statistic, p.value) %>%
      mutate(
        name = nm,
        cross_cat = paste0(LETTERS[2:4], " versus A")
      )
  }) %>%
  {do.call(rbind, .)}
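A hedged follow-up to get the wide layout shown in the question: the long result of the pipeline above (call it res, a hypothetical name, with columns statistic, p.value, name and cross_cat) can be reshaped with pivot_wider(); note the new column names inherit the "B versus A" labels.
library(tidyr)
res %>%   # hypothetical name for the long result above
  pivot_wider(names_from = cross_cat,
              values_from = c(statistic, p.value),
              names_sep = "_")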
We can use
library(dplyr)
library(purrr)
library(stringr)
library(tidyr)
test_tbl %>%
  split(.$name) %>%
  map_dfr(~ {
    Avalue <- .x$value[.x$category == 'A']
    .x %>%
      filter(category != 'A') %>%
      group_by(category) %>%
      summarise(out = t.test(value, Avalue)$p.value) %>%
      mutate(category = str_c(category, '_vs_A_p_value'))
  }, .id = 'name') %>%
  pivot_wider(names_from = category, values_from = out)
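A hedged extension of the same idea, since the question also asks for an estimate column: keep both the test statistic (used as the estimate, as in the first answer) and the p-value per comparison, then spread wide. The column naming only approximates the layout requested in the question.
test_tbl %>%
  split(.$name) %>%
  map_dfr(~ {
    Avalue <- .x$value[.x$category == 'A']
    .x %>%
      filter(category != 'A') %>%
      group_by(category) %>%
      summarise(estimate = t.test(value, Avalue)$statistic,  # t statistic, as in the first answer
                p_value = t.test(value, Avalue)$p.value) %>%
      mutate(category = str_c(category, '_vs_A'))
  }, .id = 'name') %>%
  pivot_wider(names_from = category, values_from = c(estimate, p_value))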
I have a data frame similar to the data created below:
ID <- data.frame(ID = rep(c(12, 122, 242, 329, 595, 130, 145, 245, 654, 878), each = 5))
Var <- data.frame(Variable = c("Copper", "Iron", "Lead", "Zinc", "CaCO"))
n <- 10
Variable <- do.call("rbind", replicate(n, Var, simplify = FALSE))
Location <- rep(c("Alpha", "Beta", "Gamma"), times = c(20, 20, 10))
Location <- data.frame(Location)
set.seed(1)
FirstPt <- data.frame(FirstPt = sample(1:100, 50, replace = TRUE))
LastPt  <- data.frame(LastPt  = sample(1:100, 50, replace = TRUE))
First3  <- data.frame(First3  = sample(1:100, 50, replace = TRUE))
First5  <- data.frame(First5  = sample(1:100, 50, replace = TRUE))
First7  <- data.frame(First7  = sample(1:100, 50, replace = TRUE))
First10 <- data.frame(First10 = sample(1:100, 50, replace = TRUE))
Last3   <- data.frame(Last3   = sample(1:100, 50, replace = TRUE))
Last5   <- data.frame(Last5   = sample(1:100, 50, replace = TRUE))
Last7   <- data.frame(Last7   = sample(1:100, 50, replace = TRUE))
Last10  <- data.frame(Last10  = sample(1:100, 50, replace = TRUE))
data <- cbind(ID, Location, Variable, FirstPt, LastPt, First3, First5, First7,
              First10, Last3, Last5, Last7, Last10)
This may be a two-part question, but I want to write a function that groups all Variables that are the same (for instance, all the observations that are Copper) and conducts a paired t-test between all possible combinations of the numeric columns (FirstPt:Last10). I want it to return the p-values in a data frame like this:
Test P-Value
FirstPt.vs.LastPt …
FirstPt.vs.First3 …
etc. …
This will likely be a second function, but I also want to do this after the observations are grouped by Location so that the output data frame will look like this:
Test P-Value
FirstPt.vs.LastPt.InAlpha
FirstPt.vs.LastPt.InBeta
etc.
You can do both of these with one function:
library(tidyverse)
t.test.by.group.combos <- function(.data, groups){
  by <- gsub(x = rlang::quo_get_expr(enquo(groups)), pattern = "\\((.*)?\\)", replacement = "\\1")[-1]
  .data %>%
    group_by(!!!groups) %>%
    select_if(is.integer) %>%
    group_split() %>%
    map(.,
        ~ pivot_longer(., cols = (FirstPt:Last10), names_to = "name", values_to = "val") %>%
          nest(data = val) %>%
          full_join(., ., by = by) %>%
          filter(name.x != name.y) %>%
          mutate(test = paste(name.x, "vs", name.y, !!!groups, sep = "."),
                 p.value = map2_dbl(data.x, data.y, ~ t.test(unlist(.x), unlist(.y))$p.value)) %>%
          select(test, p.value) %>%
          filter(!duplicated(p.value))
    ) %>%
    bind_rows()
}
t.test.by.group.combos(data, vars(Variable))
#> # A tibble: 225 x 2
#> test p.value
#> <chr> <dbl>
#> 1 FirstPt.vs.LastPt.CaCO 0.511
#> 2 FirstPt.vs.First3.CaCO 0.184
#> 3 FirstPt.vs.First5.CaCO 0.494
#> 4 FirstPt.vs.First7.CaCO 0.354
#> 5 FirstPt.vs.First10.CaCO 0.893
#> 6 FirstPt.vs.Last3.CaCO 0.496
#> 7 FirstPt.vs.Last5.CaCO 0.909
#> 8 FirstPt.vs.Last7.CaCO 0.439
#> 9 FirstPt.vs.Last10.CaCO 0.146
#> 10 LastPt.vs.First3.CaCO 0.578
#> # … with 215 more rows
t.test.by.group.combos(data, vars(Variable, Location))
#> # A tibble: 674 x 2
#> test p.value
#> <chr> <dbl>
#> 1 FirstPt.vs.LastPt.CaCO.Alpha 0.850
#> 2 FirstPt.vs.First3.CaCO.Alpha 0.822
#> 3 FirstPt.vs.First5.CaCO.Alpha 0.895
#> 4 FirstPt.vs.First7.CaCO.Alpha 0.810
#> 5 FirstPt.vs.First10.CaCO.Alpha 0.645
#> 6 FirstPt.vs.Last3.CaCO.Alpha 0.870
#> 7 FirstPt.vs.Last5.CaCO.Alpha 0.465
#> 8 FirstPt.vs.Last7.CaCO.Alpha 0.115
#> 9 FirstPt.vs.Last10.CaCO.Alpha 0.474
#> 10 LastPt.vs.First3.CaCO.Alpha 0.991
#> # … with 664 more rows
This is kind of a lengthy function, but in general we group by the groups argument, select the grouping and integer columns, and split the data frame by group. After that, we map over all combinations of variables and perform a t.test for each combination. Lastly, we bind all the groups back into one data frame.
I think this is what you want. The key was to use group_by and do from tidyverse.
df <- NULL
for (i in (4:(ncol(data) - 1))) {
  for (j in ((i + 1):ncol(data))) {
    df <- rbind(df, data %>%
                  group_by(Location) %>%
                  do(data.frame(pval = t.test(.[[i]], .[[j]], data = .)$p.value)) %>%
                  ungroup() %>%
                  mutate(Test = paste0(colnames(data)[i], '.vs.', colnames(data)[j]))
    )
  }
}
df$Test <- paste0(df$Test, '.In', df$Location)
Probably, you can achieve what you want using the code below:
library(dplyr)
library(tidyr)
data %>%
  pivot_longer(cols = FirstPt:Last10) %>%
  group_by(Variable) %>%
  summarise(p_value = list(combn(name, 2, function(x)
              t.test(value[name == x[1]], value[name == x[2]])$p.value)),
            test = list(combn(name, 2, paste, collapse = "_"))) %>%
  unnest(cols = c(test, p_value))
# Variable p_value test
# <fct> <dbl> <chr>
# 1 CaCO 0.915 FirstPt_LastPt
# 2 CaCO 0.529 FirstPt_First3
# 3 CaCO 0.337 FirstPt_First5
# 4 CaCO 0.350 FirstPt_First7
# 5 CaCO 0.395 FirstPt_First10
# 6 CaCO 0.765 FirstPt_Last3
# 7 CaCO 0.204 FirstPt_Last5
# 8 CaCO 0.873 FirstPt_Last7
# 9 CaCO 0.479 FirstPt_Last10
#10 CaCO 1 FirstPt_FirstPt
# … with 24,740 more rows
To do it grouped by Location as well, you can add Location to the group_by call and keep the rest of the code as it is; a sketch follows below.
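A hedged sketch of that Location-grouped variant; only the group_by() call changes (the .groups = "drop" argument is added here just to silence the grouping message):
data %>%
  pivot_longer(cols = FirstPt:Last10) %>%
  group_by(Variable, Location) %>%
  summarise(p_value = list(combn(name, 2, function(x)
              t.test(value[name == x[1]], value[name == x[2]])$p.value)),
            test = list(combn(name, 2, paste, collapse = "_")),
            .groups = "drop") %>%
  unnest(cols = c(test, p_value))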
I have some data which looks similar to the following:
Data:
rank_A <- sample(x = 1:5, size = 100, replace = TRUE)
rank_B <- sample(x = 1:5, size = 100, replace = TRUE)
Y_A <- rnorm(n = 100)
Y_B <- rnorm(n = 100)
X <- rnorm(n = 100)
df <- data.frame(rank_A, rank_B, Y_A, Y_B, X)
Which looks like:
> tibble(df)
# A tibble: 100 x 1
df$rank_A $rank_B $Y_A $Y_B $X
<int> <int> <dbl> <dbl> <dbl>
1 5 1 0.128 -0.833 1.15
2 2 1 0.165 0.325 0.225
3 1 3 0.525 -0.632 0.390
4 5 3 -1.32 0.718 -0.377
5 4 2 -0.900 -0.364 -0.259
6 5 3 -1.17 0.556 -0.0702
7 4 1 -0.560 -1.66 -1.64
8 5 1 -2.01 -0.898 0.306
9 1 1 -2.56 0.693 1.34
10 4 2 0.338 -0.733 -0.481
# ... with 90 more rows
So I have two rank columns and a corresponding Y for each rank; A and B here are just two categories. I want to run two sets of regressions, Y_A on X and Y_B on X. I have the following code, which works for rank_A.
df_regs <- df %>%
  group_by(rank_A) %>%
  nest() %>%
  mutate(
    Reg_A = map(data, ~ lm(Y_A ~ X, data = .)),
    Reg_A_summary = map(Reg_A, ~ summary(.))
  )
df_regs$Reg_A_summary
I would like to construct it such that I do not have to write a new piece of code for rank_B. I have many different regressions (different X variables I will give to each Y_A and Y_B). I thought about doing something like the following (which doesn't work).
df %>%
  mutate(
    Class_A = group_by(rank_A) %>%
      nest() %>%
      mutate(Reg_A = map(data, ~ lm(Y_a ~ X, data = .))),
    Class_B = group_by(rank_B) %>%
      nest() %>%
      mutate(Reg_B = map(data, ~ lm(Y_b ~ X, data = .)))
  )
If you know of a cleaner method, please also let me know your thoughts as I will have many regressions.
Here is one approach, using the new pivot functions from the latest release of tidyr...
df2 <- df %>%
  pivot_longer(cols = -X,
               names_to = c(".value", "Class"),
               names_sep = "_") %>%
  group_by(Class, rank) %>%
  summarise(Reg = list(lm(Y ~ X)))
This creates a tibble with columns Class, rank, and the list-column Reg holding the lm model for Y ~ X for each combination. You can then map through it to extract the summary, coefficients, or whatever; see the sketch below.
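For instance, a hedged sketch of that extraction step with broom::tidy(), assuming the df2 object created above:
library(broom)
library(purrr)
library(tidyr)
df2 %>%
  mutate(coefs = map(Reg, tidy)) %>%   # one tidy coefficient table per stored model
  select(Class, rank, coefs) %>%
  unnest(coefs)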
We can do this with map2 over the corresponding group_by and dependent columns:
library(dplyr)
library(purrr)
library(stringr)
library(broom)
map2(c('rank_A', 'rank_B'), c("Y_A", "Y_B"), ~ {
  grp <- .x
  predCol <- .y
  newCol <- str_replace(predCol, "Y", "Reg")
  df %>%
    group_by_at(grp) %>%
    nest %>%
    mutate(!! newCol := map(data,
                            ~ lm(reformulate('X', response = predCol), data = .x)))
})
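A hedged usage note: the map2() call returns a list of two nested tibbles, one per rank/Y pair. Calling that list res (a hypothetical name), the coefficients of, say, the rank_A models can be pulled out with broom::tidy():
library(tidyr)
res[[1]] %>%                                  # the rank_A element of the map2() result
  mutate(coefs = map(Reg_A, broom::tidy)) %>%
  select(rank_A, coefs) %>%
  unnest(coefs)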
I have a similar dataset but with many more r and v variables.
set.seed(1000)
tb <- tibble(grp = c(rep("A", 4), rep("B", 4)),
             v1 = rnorm(8),
             v2 = rnorm(8),
             v3 = rnorm(8),
             r1 = rnorm(8),
             r2 = rnorm(8))
For each v variable, I would like to fit an lm() with the r variables.
This is what I have so far:
lm_fun <- function(x, y) coef(lm(x ~ y))[2]
tb %>%
  nest(-grp) %>%
  mutate(lm_list = map(data, ~ .x %>%
           summarise_at(colnames(tb)[c(2:4)], funs(r1 = lm_fun), .$r1)),
         lm_list2 = map(data, ~ .x %>%
           summarise_at(colnames(tb)[c(2:4)], funs(r2 = lm_fun), .$r2))) %>%
  select(grp, lm_list, lm_list2) %>%
  unnest()
which gives me the intended output:
# A tibble: 2 x 7
grp v1_r1 v2_r1 v3_r1 v1_r2 v2_r2 v3_r2
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A -0.188 -0.0972 0.858 0.130 0.136 1.21
2 B 0.208 0.935 -1.33 -0.339 0.0580 -0.840
However, how can I specify the r variables in a vector (in a similar way to specifying the v variables as colnames(tb)[...])? I don't want to copy-paste the code for every r variable I have in my full data. Also, would it be possible to solve this with another method?
Note that it is not important that the function performs lm(); it could be any function that involves two variables.
An option would be to loop through the 'r' columns inside map. This simplifies the code, as we are using the same data but different 'r' columns:
library(tidyverse)
tb %>%
  nest(-grp) %>%
  mutate(lm_list = map(data, function(x)
    map(paste0('r', 1:2), function(y)
      x %>%
        summarise_at(vars(names(.)[1:3]), funs(lm_fun), .[[y]]) %>%
        rename_all(~ paste(., y, sep = "_"))) %>%
      bind_cols)) %>%
  select(-data) %>%
  unnest
# A tibble: 2 x 7
# grp v1_r1 v2_r1 v3_r1 v1_r2 v2_r2 v3_r2
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 A -0.188 -0.0972 0.858 0.130 0.136 1.21
#2 B 0.208 0.935 -1.33 -0.339 0.0580 -0.840
Another option would be to gather the levels of r before mutate/map:
tb %>%
  gather(r, value, starts_with('r')) %>%
  nest(-r, -grp) %>%
  mutate(lm_list = map(data, ~ .x %>%
    summarise_at(colnames(tb)[c(2:4)], funs(lm_fun), .$value))) %>%
  unnest(lm_list, .drop = TRUE)
grp r v1 v2 v3
<chr> <chr> <dbl> <dbl> <dbl>
1 A r1 -0.188 -0.0972 0.858
2 B r1 0.208 0.935 -1.33
3 A r2 0.130 0.136 1.21
4 B r2 -0.339 0.0580 -0.840
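A hedged follow-up: if the original wide layout (v1_r1 ... v3_r2) is preferred, the long result above (call it long_res, a hypothetical name) can be reshaped with tidyr's pivot_wider().
long_res %>%   # hypothetical name for the result of the gather()-based pipeline
  pivot_wider(names_from = r, values_from = v1:v3, names_sep = "_")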