Create parameterized summaries of a column - r

I have a tibble and I want create several summaries of the same column, specifically the first, second and third quartiles.
To do it, I create a named list of functions and that works fine.
library("tidyverse")
set.seed(1234)
df <- tibble(x = rnorm(100))
df %>%
summarise(
across(x,
list(
Q1 = ~ quantile(., 1 / 4),
Q2 = ~ quantile(., 2 / 4),
Q3 = ~ quantile(., 3 / 4)
),
.names = "{.fn}"
)
)
#> # A tibble: 1 × 3
#> Q1 Q2 Q3
#> <dbl> <dbl> <dbl>
#> 1 -0.895 -0.385 0.471
Can I achieve this by specifying the list of probabilities to pass to quantile? So that I save myself typing and more importantly avoid hard-coding the arguments to pass to the aggregating function.
The following doesn't work because it creates one row per probability rather than one column.
df %>%
summarise(
across(x, quantile, 1:3 / 4)
)
#> # A tibble: 3 × 1
#> x
#> <dbl>
#> 1 -0.895
#> 2 -0.385
#> 3 0.471

you're almost here
df <- tibble(x = rnorm(100))
df %>%
summarise(
across(x,
map(1:3, ~partial(quantile, probs=./4)),
.names = "Q{.fn}"
)
)
# A tibble: 1 x 3
Q1 Q2 Q3
<dbl> <dbl> <dbl>
1 -0.579 0.0815 0.475

If you define the quantiles like this:
Q <- c(0.25, 0.5, 0.75)
Then the following code will produce columns of the appropriate quantiles with sensible labels:
df %>%
summarise(
across(x,
setNames( lapply(Q,
function(x) { f <- ~quantile(., b); f[2][[1]][[3]] <- x; f }),
paste("Q", round(100 * Q), sep = "_")),
.names = "{.fn}"
)
)
#> # A tibble: 1 x 3
#> Q_25 Q_50 Q_75
#> <dbl> <dbl> <dbl>
#> 1 -0.895 -0.385 0.471
Created on 2022-06-29 by the reprex package (v2.0.1)

Related

Remove unused contrasts when making multiple linear models using R map

I am making linear models across a large dataset which is unbalanced (not all contrasts are present for all groupings). Is there an efficient way to ignore groupings where there are less than 2 contrasts? In the examples below testData1 represents a balanced dataset where the workflow works correctly. testData2 represents an unbalanced dataset which throws a contrast error.
aovFxn <- function(dat){
lm(outcomeVar ~ predVar1, data = dat) %>%
broom::tidy()
}
testData1 <- data.frame(
groupVar = rep(c('a', 'b'), each = 12),
predVar1 = c(rep(c('x', 'y', 'z'), each = 4, times = 2)),
outcomeVar = sample(1:100, 24)
)
testData2 <- data.frame(
groupVar = rep(c('a', 'b'), each = 12),
predVar1 = c(rep(c('x', 'y', 'z'), each = 4),
rep('x', 12)),
outcomeVar = sample(1:100, 24)
)
testStats1 <- testData1 %>%
nest(groupData = -groupVar) %>%
mutate(df = purrr::map(groupData, aovFxn)) %>%
unnest_legacy(df)
testStats2 <- testData2 %>%
nest(groupData = -groupVar) %>%
mutate(df = purrr::map(groupData, aovFxn)) %>%
unnest_legacy(df)
We may use either tryCatch or purrr::possibly to return a desired value when there is an error
library(dplyr)
library(purrr)
paovFxn <- possibly(aovFxn, otherwise = NULL)
testData2 %>%
nest(groupData = -groupVar) %>%
mutate(df = purrr::map(groupData, paovFxn)) %>%
unnest(df)%>%
select(-groupData)
-output
A tibble: 3 × 6
groupVar term estimate std.error statistic p.value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 a (Intercept) 42.5 17.3 2.45 0.0367
2 a predVar1y 19.7 24.5 0.805 0.441
3 a predVar1z 2.25 24.5 0.0917 0.929
Another option is to create an if condition
testData2 %>%
nest(groupData = -groupVar) %>%
mutate(df = map(groupData, ~ if(n_distinct(.x$predVar1) > 1) aovFxn(.x)) ) %>%
unnest(df, keep_empty = TRUE) %>%
select(-groupData)
-output
# A tibble: 4 × 6
groupVar term estimate std.error statistic p.value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 a (Intercept) 42.5 17.3 2.45 0.0367
2 a predVar1y 19.7 24.5 0.805 0.441
3 a predVar1z 2.25 24.5 0.0917 0.929
4 b <NA> NA NA NA NA
NOTE: If we don't use keep_empty = TRUE, it will be FALSE by default and the 'groupVar' 'b' row will not be there in the output

Passing multiple arguments into map() with prop.test() with dplyr

could anybody help my to change the rowwise() + prop.test() to map2?
The issue is, that map2 takes .x and .y arguments, but I need to pass 4 columns: 2xsucces and 2xtries into the prop.test() function. I want to extract the p-value into new column p.
This code runs OK'isch:
library(tidyverse)
cc <- tribble(
~n1, ~x1, ~n2, ~x2,
1000,100,900,85,
1000,100,100,10,
1000,100,10,10
)
cc %>%
rowwise()%>%
mutate(p = prop.test(x=c(x1, x2),
n=c(n1, n2),
conf.level=0.95)$p.value)%>%
mutate(p=round(p,5))
#> Warning in prop.test(x = c(x1, x2), n = c(n1, n2), conf.level = 0.95): Chi-
#> squared approximation may be incorrect
#> # A tibble: 3 × 5
#> # Rowwise:
#> n1 x1 n2 x2 p
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1000 100 900 85 0.741
#> 2 1000 100 100 10 1
#> 3 1000 100 10 10 0
But I noticed that I can gain speed with a for loop:
cc <- tribble(
~n1, ~x1, ~n2, ~x2,
1000,100,900,85,
1000,100,100,10,
1000,100,10,10
)
cc %>%
rowwise()%>%
mutate(p = prop.test(x=c(x1, x2),
n=c(n1, n2),
conf.level=0.95)$p.value)%>%
mutate(p=round(p,5))
cc$p <-999
for (i in 1:nrow(cc)){
result <- prop.test(x=c(cc$x1[[i]], cc$x2[[i]]),
n = c(cc$n1[[i]], cc$n2[[i]]),
conf.level=0.95)
cc$p[[i]] <- result$p.value
}
cc %>%
mutate(p=round(p,5))
}
But I ask myself it there is more elegant way to use map function from dplyr in order to write less code and achieve the same results?
Thanks in advance
There is pmap:
library(tidyverse)
cc <- tribble(
~n1, ~x1, ~n2, ~x2,
1000, 100, 900, 85,
1000, 100, 100, 10,
1000, 100, 10, 10
)
cc %>%
mutate(
signif = list(x1, x2, n1, n2) %>% pmap_dbl(~ {
prop.test(
x = c(..1, ..2),
n = c(..3, ..4),
conf.level = 0.95
)$p.value
})
)
#> Warning in prop.test(x = c(..1, ..2), n = c(..3, ..4), conf.level = 0.95): Chi-
#> squared approximation may be incorrect
#> # A tibble: 3 x 5
#> n1 x1 n2 x2 signif
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1000 100 900 85 7.41e- 1
#> 2 1000 100 100 10 1 e+ 0
#> 3 1000 100 10 10 9.46e-18
Created on 2022-02-22 by the reprex package (v2.0.0)
For loops are usually very slow, and the code is hard to read and often unnecessary when it comes to the scenario of just apply a function for each element.
The thing which slows down your dplyr code is rowwise, which is not needed anymore using the map function.

Using a function in R with multiple outcomes to create multiple columns in mutate

I am using a self declared function that runs a regression analysis. I want to run this for thousands of companies for multiple years, thus speed is essential. My function creates three outputs (a coefficient, the p value and r-squared). The function runs fine individually, however when I use mutate() to let it run through the whole dataset, it only gives the same values for all rows. The weirdest thing is that I can't reproduce those particular values by running the function individually. I made an reproducible example below. I have used lapply successfully before with this data, but I would like to keep it in mutate and above all I would like to know what's exactly happening here.
So my question is: how can I make this function work for each individual row for the companies dataset using mutate?
library(tidyverse)
companies <- data.frame(comp_id = 1:5)
individuals <- data.frame(id = 1:100,
comp_id = sample(1:5, 100, replace = T),
age = sample(18:67, 100, replace = T),
wage = sample(1700:10000, 100, replace = T))
regger <- function(x){
df <- individuals %>% filter(comp_id == x)
formula <- wage ~ age
regression <- lm(formula, df)
res <- list(coeff = summary(regression)$coefficient[2,1],
p = summary(regression)$coefficients[2,4],
r2 = summary(regression)$r.squared)
return(res)
}
companies %>%
mutate(data = list(regger(comp_id))) %>%
unnest_wider(data)
output:
# A tibble: 5 x 4
comp_id coeff p r2
<int> <dbl> <dbl> <dbl>
1 1 -4.92 0.916 0.000666
2 2 -4.92 0.916 0.000666
3 3 -4.92 0.916 0.000666
4 4 -4.92 0.916 0.000666
5 5 -4.92 0.916 0.000666
Use map from the purrr package if a function is not vectorized:
library(tidyverse)
set.seed(1337)
companies <- data.frame(comp_id = 1:5)
individuals <- data.frame(
id = 1:100,
comp_id = sample(1:5, 100, replace = T),
age = sample(18:67, 100, replace = T),
wage = sample(1700:10000, 100, replace = T)
)
regger <- function(x) {
df <- individuals %>% filter(comp_id == x)
formula <- wage ~ age
regression <- lm(formula, df)
res <- list(
coeff = summary(regression)$coefficient[2, 1],
p = summary(regression)$coefficients[2, 4],
r2 = summary(regression)$r.squared
)
return(res)
}
companies %>%
mutate(data = comp_id %>% map(regger)) %>%
unnest_wider(data)
#> # A tibble: 5 x 4
#> comp_id coeff p r2
#> <int> <dbl> <dbl> <dbl>
#> 1 1 67.1 0.108 0.218
#> 2 2 23.7 0.466 0.0337
#> 3 3 31.2 0.292 0.0462
#> 4 4 18.4 0.582 0.0134
#> 5 5 0.407 0.994 0.00000371
Created on 2021-09-09 by the reprex package (v2.0.1)
I'm not sure what the output should look like, but could it be that you need to work on a row-by-row basis?
companies %>%
rowwise() %>%
mutate(data = list(regger(comp_id))) %>%
unnest_wider(data)
comp_id coeff p r2
<int> <dbl> <dbl> <dbl>
1 1 21.6 0.470 0.0264
2 2 13.5 0.782 0.00390
3 3 0.593 0.984 0.0000175
4 4 -9.33 0.824 0.00394
5 5 64.9 0.145 0.156

Perform multiple two-sample t-test using dplyr in R

I would like to perform multiple pairwise t-tests on a dataset containing about 400 different column variables and 3 subject groups, and extract p-values for every comparison. A shorter representative example of the data, using only 2 variables could be the following;
df <- tibble(var1 = rnorm(90, 1, 1), var2 = rnorm(90, 1.5, 1), group = rep(1:3, each = 30))
Ideally the end result will be a summarised data frame containing four columns; one for the variable being tested (var1, var2 etc.), two for the groups being tested every time and a final one for the p-value.
I've tried duplicating the group column in the long form, and doing a double group_by in order to do the comparisons but with no result
result <- df %>%
pivot_longer(var1:var2, "var", "value") %>%
rename(group_a = group) %>%
mutate(group_b = group_a) %>%
group_by(group_a, group_b) %>%
summarise(n = n())
We can reshape the data into 'long' format with pivot_longer, then grouped by 'group', apply the pairwise.t.test, extract the list elements and transform into tibble with tidy (from broom) and unnest the list column
library(dplyr)
library(tidyr)
library(broom)
df %>%
pivot_longer(cols = -group, names_to = 'grp') %>%
group_by(group) %>%
summarise(out = list(pairwise.t.test(value, grp
) %>%
tidy)) %>%
unnest(c(out))
-output
# A tibble: 3 x 4
group group1 group2 p.value
<int> <chr> <chr> <dbl>
1 1 var2 var1 0.0760
2 2 var2 var1 0.0233
3 3 var2 var1 0.000244
In case you end up wanting more information about the t-tests, here is an approach that will allow you to extract more information such as the degrees of freedom and value of the test statistic:
library(dplyr)
library(tidyr)
library(purrr)
library(broom)
df <- tibble(
var1 = rnorm(90, 1, 1),
var2 = rnorm(90, 1.5, 1),
group = rep(1:3, each = 30)
)
df %>%
select(-group) %>%
names() %>%
map_dfr(~ {
y <- .
combn(3, 2) %>%
t() %>%
as.data.frame() %>%
pmap_dfr(function(V1, V2) {
df %>%
select(group, all_of(y)) %>%
filter(group %in% c(V1, V2)) %>%
t.test(as.formula(sprintf("%s ~ group", y)), ., var.equal = TRUE) %>%
tidy() %>%
transmute(y = y,
group_1 = V1,
group_2 = V2,
df = parameter,
t_value = statistic,
p_value = p.value
)
})
})
#> # A tibble: 6 x 6
#> y group_1 group_2 df t_value p_value
#> <chr> <int> <int> <dbl> <dbl> <dbl>
#> 1 var1 1 2 58 -0.337 0.737
#> 2 var1 1 3 58 -1.35 0.183
#> 3 var1 2 3 58 -1.06 0.295
#> 4 var2 1 2 58 -0.152 0.879
#> 5 var2 1 3 58 1.72 0.0908
#> 6 var2 2 3 58 1.67 0.100
And here is #akrun's answer tweaked to give the same p-values as the above approach. Note the p.adjust.method = "none" which gives independent t-tests which will inflate your Type I error rate.
df %>%
pivot_longer(
cols = -group,
names_to = "y"
) %>%
group_by(y) %>%
summarise(
out = list(
tidy(
pairwise.t.test(
value,
group,
p.adjust.method = "none",
pool.sd = FALSE
)
)
)
) %>%
unnest(c(out))
#> # A tibble: 6 x 4
#> y group1 group2 p.value
#> <chr> <chr> <chr> <dbl>
#> 1 var1 2 1 0.737
#> 2 var1 3 1 0.183
#> 3 var1 3 2 0.295
#> 4 var2 2 1 0.879
#> 5 var2 3 1 0.0909
#> 6 var2 3 2 0.100
Created on 2021-07-30 by the reprex package (v1.0.0)

Speed up the applications of the function "cummean" on 4 vectors

I have 2 vectors with the same length x,y. Then x^2,y^2 are square (element-wise) of x,y respectively. In each iteration, I need to apply function cummean on x,y,x^2,y^2.
I would like to ask if I can speed up the process someway rather than running 4 separate operations.
library(dplyr)
x <- c(1, 2, 3)
y <- c(5, 5, 6)
dplyr::cummean(x)
dplyr::cummean(y)
dplyr::cummean(x^2)
dplyr::cummean(y^2)
Thank you so much for your suggestion!
I guess you could do something like:
tibble(x, y) %>%
mutate(across(1:2, ~.x^2, .names = c("{col}^2"))) %>%
mutate(across(1:4, cummean, .names = "cummean_{col}"))
#> # A tibble: 3 x 8
#> x y `x^2` `y^2` cummean_x cummean_y `cummean_x^2` `cummean_y^2`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 5 1 25 1 5 1 25
#> 2 2 5 4 25 1 5 1 25
#> 3 3 6 9 36 1.33 5 2 25
And if you want the variables in the global environment rather than in a tibble you could do:
tibble(x, y) %>%
mutate(across(1:2, ~.x^2, .names = c("{col}^2"))) %>%
mutate(across(1:4, cummean, .names = "cummean_{col}")) %>%
as.list() %>%
list2env(envir = globalenv())
Or in a function if you had to do this a lot you could do:
func <- function(x, y)
{
tibble(x, y) %>%
mutate(across(1:2, ~.x^2, .names = c("{col}^2"))) %>%
mutate(across(1:4, cummean, .names = "cummean_{col}")) %>%
as.list() %>%
list2env(envir = parent.frame())
}

Resources