Summary Tables using Nested Tibbles - r

I am trying to generate a table of summary statistics using purrr/tibble methods. I am able to calculate group-wise mean (sd) and counts using the following:
library(dplyr)
library(tidyr)
library(purrr)
library(tibble)
mtcars %>%
gather(variable, value, -vs, -am) %>%
group_by(vs, am, variable) %>%
nest() %>%
filter(variable %in% c("mpg", "hp")) %>%
mutate(
mean = map_dbl(data, ~mean(.$value, na.rm = TRUE)),
sd = map_dbl(data, ~sd(.$value, na.rm = TRUE)),
n = map_dbl(data, ~sum(!is.na(.$value)))
) %>%
select(vs:variable, mean:n) %>%
mutate_at(vars(mean, sd), round, 3) %>%
mutate(mean_sd = paste0(mean, " (", sd, ")"),
var_group = paste(vs, am, variable, sep = "_")) %>%
select(n:var_group) %>%
nest(n, mean_sd, .key = "summary") %>%
spread(key = var_group, value = summary) %>%
unnest()
My immediate question is, how do I retain the column names as seen in spread(key = var_group, value = summary) in the unnest()-ed output?
edit: Thanks to all for the responses.
https://stackoverflow.com/a/55912326/5745045 has the advantages of being easier to read and not storing a temporary variable. A disadvantage is the change of numeric to character in the n columns.
The final goal is to replace the column names with formatted text within the context of a grouped kable table.

By storing the "nested" tibble as a temporary variable1 and using its colnames2, we can achieve what you desire. Look below;
mtcars %>%
gather(variable, value, -vs, -am) %>%
group_by(vs, am, variable) %>%
nest() %>%
filter(variable %in% c("mpg", "hp")) %>%
mutate(
mean = map_dbl(data, ~mean(.$value, na.rm = TRUE)),
sd = map_dbl(data, ~sd(.$value, na.rm = TRUE)),
n = map_dbl(data, ~sum(!is.na(.$value)))
) %>%
select(vs:variable, mean:n) %>%
mutate_at(vars(mean, sd), round, 3) %>%
mutate(mean_sd = paste0(mean, " (", sd, ")"),
var_group = paste(vs, am, variable, sep = "_")) %>%
select(n:var_group) %>%
nest(n, mean_sd, .key = "summary") %>%
spread(key = var_group, value = summary) %>%
#1: storing the temporary nested variable
{. ->> temptibble} %>%
unnest() %>%
#2: renaming the columns of unnested output and removing temporary variable
rename_all(funs(paste0(., "_", rep(colnames(temptibble), each=2)))); rm(temptibble)
# # A tibble: 1 x 16
# n_0_0_hp mean_sd_0_0_hp n1_0_0_mpg mean_sd1_0_0_mpg n2_0_1_hp mean_sd2_0_1_hp n3_0_1_mpg mean_sd3_0_1_mpg
# <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr>
# 1 12 194.167 (33.36) 12 15.05 (2.774) 6 180.833 (98.816) 6 19.75 (4.009)
# n4_1_0_hp mean_sd4_1_0_hp n5_1_0_mpg mean_sd5_1_0_mpg n6_1_1_hp mean_sd6_1_1_hp n7_1_1_mpg mean_sd7_1_1_mpg
# <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr>
# 1 7 102.143 (20.932) 7 20.743 (2.471) 7 80.571 (24.144) 7 28.371 (4.758)

Here's another method that doesn't require creating a temporary variable. Instead of nesting the data at the end, I used gather() and unite() to restructure the data so that it ends up as one key and value pair.
library(tidyverse)
#> Registered S3 methods overwritten by 'ggplot2':
#> method from
#> [.quosures rlang
#> c.quosures rlang
#> print.quosures rlang
#> Registered S3 method overwritten by 'rvest':
#> method from
#> read_xml.response xml2
mtcars %>%
gather(variable, value, -vs, -am) %>%
group_by(vs, am, variable) %>%
nest() %>%
filter(variable %in% c("mpg", "hp")) %>%
mutate(
mean = map_dbl(data, ~mean(.$value, na.rm = TRUE)),
sd = map_dbl(data, ~sd(.$value, na.rm = TRUE)),
n = map_dbl(data, ~sum(!is.na(.$value)))
) %>%
select(vs:variable, mean:n) %>%
mutate_at(vars(mean, sd), round, 3) %>%
mutate(mean_sd = paste0(mean, " (", sd, ")"),
var_group = paste(vs, am, variable, sep = "_")) %>%
select(n:var_group) %>%
gather(key, value, -var_group) %>%
unite(var_group_key, var_group, key) %>%
spread(var_group_key, value)
#> # A tibble: 1 x 16
#> `0_0_hp_mean_sd` `0_0_hp_n` `0_0_mpg_mean_s… `0_0_mpg_n` `0_1_hp_mean_sd`
#> <chr> <chr> <chr> <chr> <chr>
#> 1 194.167 (33.36) 12 15.05 (2.774) 12 180.833 (98.816)
#> # … with 11 more variables: `0_1_hp_n` <chr>, `0_1_mpg_mean_sd` <chr>,
#> # `0_1_mpg_n` <chr>, `1_0_hp_mean_sd` <chr>, `1_0_hp_n` <chr>,
#> # `1_0_mpg_mean_sd` <chr>, `1_0_mpg_n` <chr>, `1_1_hp_mean_sd` <chr>,
#> # `1_1_hp_n` <chr>, `1_1_mpg_mean_sd` <chr>, `1_1_mpg_n` <chr>
Created on 2019-04-29 by the reprex package (v0.2.1)

Related

Using dplyr::summarise with dplyr::across and purrr::map to sum across columns with the same prefix

I have a data frame where I want to sum column values with the same prefix to produce a new column. My current problem is that it's not taking into account my group_by variable and returning identical values. Is part of the problem the .cols variable I'm selecting in the across function?
Sample data
library(dplyr)
library(purrr)
set.seed(10)
dat <- data.frame(id = rep(1:2, 5),
var1.pre = rnorm(10),
var1.post = rnorm(10),
var2.pre = rnorm(10),
var2.post = rnorm(10)
) %>%
mutate(index = id)
var_names = c("var1", "var2")
What I've tried
sumfunction <- map(
var_names,
~function(.){
sum(dat[glue("{.x}.pre")], dat[glue("{.x}.post")], na.rm = TRUE)
}
) %>%
setNames(var_names)
dat %>%
group_by(id) %>%
summarise(
across(
.cols = index,
.fns = sumfunction,
.names = "{.fn}"
)
) %>%
ungroup
Desired output
For this and similar problems I made the 'dplyover' package (it is not on CRAN). Here we can use dplyover::across2() to loop over two series of columns, first, all columns ending with "pre" and second all columns ending with "post". To get the names correct we can use .names = "{pre}" to get the common prefix of both series of columns.
library(dplyr)
library(dplyover) # https://timteafan.github.io/dplyover/
dat %>%
group_by(id) %>%
summarise(across2(ends_with("pre"),
ends_with("post"),
~ sum(c(.x, .y)),
.names = "{pre}"
)
)
#> # A tibble: 2 × 3
#> id var1 var2
#> <int> <dbl> <dbl>
#> 1 1 -2.32 -5.55
#> 2 2 1.11 -9.54
Created on 2022-12-14 with reprex v2.0.2
Whenever operations across multiple columns get complicated, we could pivot:
library(dplyr)
library(tidyr)
dat %>%
pivot_longer(-c(id, index),
names_to = c(".value", "name"),
names_sep = "\\.") %>%
group_by(id) %>%
summarise(var1 = sum(var1), var2=sum(var2))
id var1 var2
<int> <dbl> <dbl>
1 1 -2.32 -5.55
2 2 1.11 -9.54

Adding differences to a summary-table created without iteration

Based on my first question found here about creating a summary table without iteration, ie. without using map, I made the following algorithm based on the formidable answers,
library(tidyverse)
sum_variables <- c("mpg", "hp", "disp")
# Create grouping var; ####
mtcars <- mtcars %>% mutate(
am_factor = case_when(
am == 0 ~ "Automatic",
TRUE ~ "Manual"
)
)
# Create summary table; ####
mtcars %>%
group_by(am_factor) %>%
summarise(
across(
all_of(sum_variables),
~ paste0(mean(.) %>% round(2), "(±", sd(.) %>% round(2), ")")
)
) %>% pivot_longer(
cols = -"am_factor"
) %>% pivot_wider(
names_from = "am_factor"
)
Which gives me the following output,
# A tibble: 3 x 3
name Automatic Manual
<chr> <chr> <chr>
1 mpg 17.15(±3.83) 24.39(±6.17)
2 hp 160.26(±53.91) 126.85(±84.06)
3 disp 290.38(±110.17) 143.53(±87.2)
Using paste0 here has the benefit of reducing the amount of codes needed in the algorithm, but complicates further additions to table. If I, for example, want to add differences to this table, my current solution is the following,
mtcars %>%
group_by(am_factor) %>%
summarise(
across(
all_of(sum_variables),
~ paste0(mean(.) %>% round(2), "(±", sd(.) %>% round(2), ")")
)
) %>% pivot_longer(
cols = -"am_factor"
) %>% pivot_wider(
names_from = "am_factor"
) %>% mutate(
difference = str_extract(Automatic, "[:digit:].?[:digit:]") %>% as.numeric() -
str_extract(Manual, "[:digit:].?[:digit:]") %>% as.numeric()
)
Which gives the desired output,
# A tibble: 3 × 4
name Automatic Manual difference
<chr> <chr> <chr> <dbl>
1 mpg 17.15(±3.83) 24.39(±6.17) -7
2 hp 160.26(±53.91) 126.85(±84.06) 34
3 disp 290.38(±110.17) 143.53(±87.2) 147
Although it works, it defeats the purpose of making a simple algorithm for the purpose.
How do I create a summary of my data in a simple manner? It has to be a tidyverse-solution, preferably without iteration.
This is not necessarily simpler or shorter but I would prefer to do the mathematical calculations on numbers directly rather than extracting them from strings.
library(dplyr)
library(tidyr)
mtcars %>%
group_by(am_factor) %>%
summarise(across(all_of(sum_variables), list(mean = mean,
sd = ~sprintf('%.2f (± %.2f)', mean(.), sd(.))))) %>%
pivot_longer(cols = -am_factor,
names_to = c('measure', '.value'),
names_sep = '_') %>%
group_by(measure) %>%
mutate(difference = -diff(mean)) %>%
ungroup %>%
select(-mean) %>%
pivot_wider(names_from = am_factor, values_from = sd)
# measure difference Automatic Manual
# <chr> <dbl> <chr> <chr>
#1 mpg -7.24 17.15 (± 3.83) 24.39 (± 6.17)
#2 hp 33.4 160.26 (± 53.91) 126.85 (± 84.06)
#3 disp 147. 290.38 (± 110.17) 143.53 (± 87.20)

Summarising multiple variables without iteration

Consider this data that needs the summary measures mean and sd on multiple variables,
# Create grouping var; ####
mtcars <- mtcars %>% mutate(
am = case_when(
am == 0 ~ "Automatic",
TRUE ~ "Manual"
)
)
With the following custom function and purrr, I can create a baseline table,
# Summarising function; ####
sum_foo <- function(data, var) {
data %>%
group_by(am) %>%
summarise(
mean = mean( !!sym(var) , na.rm = TRUE),
sd = sd( !!sym(var) , na.rm = TRUE)
) %>%
mutate(across(where(is.double), round, 2)) %>%
group_by(am) %>%
transmute(
value = paste(mean, "(±", sd, ")", sep = ""),
variable = var
) %>%
pivot_wider(
names_from = "am"
)
}
# Execute Function; ####
sum_variables <- c("mpg", "hp", "disp")
sum_variables %>% map(
sum_foo,
data = mtcars
) %>% reduce(
bind_rows
)
Which gives the following output,
# A tibble: 3 x 3
variable Automatic Manual
<chr> <chr> <chr>
1 mpg 17.15(±3.83) 24.39(±6.17)
2 hp 160.26(±53.91) 126.85(±84.06)
3 disp 290.38(±110.17) 143.53(±87.2)
I want to get the output without using map and reduce, ie. without iterating through the variables with rowwise or map.
I'm looking for an alternative tidyverse-solution!
Maybe you could use this solution:
library(dplyr)
library(tidyr)
library(tibble)
sum_variables %>%
enframe() %>%
rowwise() %>%
mutate(output = list(sum_foo(mtcars, value))) %>%
select(output) %>%
unnest(cols = output)
# A tibble: 3 x 3
variable Automatic Manual
<chr> <chr> <chr>
1 mpg 17.15(±3.83) 24.39(±6.17)
2 hp 160.26(±53.91) 126.85(±84.06)
3 disp 290.38(±110.17) 143.53(±87.2)
Updated
Or you could even modify your function in the following way:
sum_foo2 <- function(data, var) {
data %>%
group_by(am) %>%
summarise(across(all_of(var), list(Mean = mean, sd = sd))) %>%
mutate(across(where(is.double), round, 2)) %>%
group_by(am) %>%
summarise(across(ends_with("Mean"), ~ paste(.x, "(±", get(gsub("_Mean", "_sd", cur_column())), ")", sep = ""))) %>%
pivot_longer(!am, names_to = "Mean", values_to = "Val") %>%
pivot_wider(names_from = "am", values_from = "Val")
}
sum_foo2(mtcars, sum_variables)
# A tibble: 3 x 3
Mean Automatic Manual
<chr> <chr> <chr>
1 mpg_Mean 17.15(±3.83) 24.39(±6.17)
2 hp_Mean 160.26(±53.91) 126.85(±84.06)
3 disp_Mean 290.38(±110.17) 143.53(±87.2)
If I am to trim the function above into a more concise version:
sum_foo2 <- function(data, var) {
data %>%
group_by(am) %>%
summarise(across(all_of(var), ~ paste0(round(mean(.x), 2), "(±", round(sd(.x), 2), ")"))) %>%
pivot_longer(!am, names_to = "Mean", values_to = "Val") %>%
pivot_wider(names_from = "am", values_from = "Val")
}
sum_foo2(mtcars, sum_variables)
Without using the function that you wrote, which require an iteration, ie rowwise/map, You could simply do:
sum_variables <- c("mpg", "hp", "disp")
mtcars %>%
group_by(am) %>%
summarise(across(all_of(sum_variables),
~sprintf('%.2f(\u00B1%.2f)', mean(.x), sd(.x))), .groups = 'drop') %>%
data.table::transpose(keep.names = 'variable', make.names = TRUE)
variable Automatic Manual
1 mpg 17.15(±3.83) 24.39(±6.17)
2 hp 160.26(±53.91) 126.85(±84.06)
3 disp 290.38(±110.17) 143.53(±87.20)
So: Please be gentile. My solution without using map and reduce:
library(dplyr)
library(tidyr)
library(stringr)
data %>%
group_by(am) %>%
summarise(across(c(mpg, hp, disp), list(mean = mean, sd = sd), .names = "{.col}_{.fn}")) %>%
pivot_longer (
cols = 2:7,
names_to = "variable",
values_to = "values"
) %>%
pivot_wider(
names_from = am,
values_from = values
) %>%
mutate(variable = str_extract(variable, "[^_]*")) %>%
mutate(across(c(Automatic, Manual), lead, .names = "{.col}_{.fn}")) %>%
filter(row_number() %% 2 == 1) %>%
mutate(across(where(is.numeric), round, 2)) %>%
mutate(Automatic = paste0(Automatic,"(±",Automatic_1,")"), .keep="unused") %>%
mutate(Manual = paste0(Manual,"(±",Manual_1,")"), .keep="unused")
Output:
variable Automatic Manual
<chr> <chr> <chr>
1 mpg 17.15(±3.83) 24.39(±6.17)
2 hp 160.26(±53.91) 126.85(±84.06)
3 disp 290.38(±110.17) 143.53(±87.2)
With the formidable answers that I got, this is the final tidyverse-solution without iteration or map that were born,
mtcars %>%
group_by(am) %>%
summarise(
across(
all_of(sum_variables),
~ paste0(mean(.) %>% round(2), "(±", sd(.) %>% round(2), ")")
)
) %>% pivot_longer(
cols = -"am"
) %>% pivot_wider(
names_from = "am"
)
Which gives the following output,
# A tibble: 3 x 3
name Automatic Manual
<chr> <chr> <chr>
1 mpg 17.15(±3.83) 24.39(±6.17)
2 hp 160.26(±53.91) 126.85(±84.06)
3 disp 290.38(±110.17) 143.53(±87.2)

R need a help to correct a function

library(tidyverse)
#make a sample data frame
a <- c(2000,2000,2000,2000,2001,2001,2001,2001)
b <- c("M","M","M","F","F","M","F","F")
d<- c("Yes","No","Yes","No","No","Unknown","Unknown","Yes")
e <- c("Unknown","No","No","Yes","Unknown","Yes","No","Unknown")
df <- data.frame(a,b,d,e)
colnames(df) <- c("Year","Gender","q1","q2")
# make a table for q1
myvar <- c("Gender","q1")
mydf <- df[,myvar]
table1 <- mydf %>%
pivot_longer(-q1) %>%
group_by(name,q1,value) %>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = "q1", values_from = "summary_str")
#make the function creating a table
maketable <- function(df,x){
myvar <- c("gender",paste0(x))
mydf <- df[,myvar]
table1 <- mydf %>%
pivot_longer(-get(x)) %>%
group_by(name,get(x),value) %>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = paste0(x), values_from = "summary_str")
colnames(table1)
}
maketable(df,q1)
maketable(df,q2)
Error in paste0(x): object 'q1' not found.
I want to make a function, so that I can use it for q2.
Could anyone help to correct the code? or suggest a better way?
Output per variable is as below
If you want to pass in unquoted column names to your function, you can use the {{}} (embrace) operator to inject them into your commands. For example
maketable <- function(df,x){
df %>%
select(Gender, {{x}}) %>%
pivot_longer(-{{x}}) %>%
group_by(name,{{x}},value)%>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = {{x}}, values_from = "summary_str")
}
table1 <-maketable(df, q1)
See the programming with dplyr guide for more information.
Also note that the function just returns the new value. If you want to assign that to a new variable, make sure you do that outside the function. Values created inside of functions will not appear outside.
I have tried this one here
my_func = function(x)
{
new_df = df %>% group_by(Gender) %>% count({{x}}) %>% pivot_wider(names_from = {{x}}, values_from = n)
return(new_df)
}
I'm not sure that this is what you asked
colns <- colnames(df)
lapply(colns[c(3:4)], function(x) {
myvar <- c("Gender", x)
mydf <- df[,myvar]
table1 <- mydf%>%
pivot_longer(-x) %>%
group_by_all %>%
summarise(n=n()) %>%
mutate(prop = round(n/sum(n),3)*100,
summary_str = glue::glue("{n}({prop}%)")) %>%
pivot_wider(id_cols = c(name,value), names_from = x, values_from = "summary_str")
})
result is like
[[1]]
# A tibble: 2 x 5
# Groups: name [1]
name value No Unknown Yes
<chr> <chr> <glue> <glue> <glue>
1 Gender F 2(25%) 1(12.5%) 1(12.5%)
2 Gender M 1(12.5%) 1(12.5%) 2(25%)
[[2]]
# A tibble: 2 x 5
# Groups: name [1]
name value No Unknown Yes
<chr> <chr> <glue> <glue> <glue>
1 Gender F 1(12.5%) 2(25%) 1(12.5%)
2 Gender M 2(25%) 1(12.5%) 1(12.5%)
You may need to change
lapply(colns[c(3:4)],...
3:4 to 3:102 for q1~q100

Applying function and spreading data in dplyr mutate/summarise

I am trying to compute some function and then spread my data based on the results.
Code & Data:
-- Preprocessing:
library(tidyquant)
library(tsfeatures)
data(FANG)
FANG_returns <- FANG %>%
group_by(symbol) %>%
tq_transmute(select = adjusted,
mutate_fun = periodReturn,
period = "daily",
type = "arithmetic")
FANG_returns
Code which works:
FANG_returns %>%
filter(symbol == "FB") %>%
mutate(ent = entropy(daily.returns))
Code which doesn't work:
FANG_returns %>%
filter(symbol == "FB") %>%
mutate(max = max_level_shift(daily.returns))
Error:
Error: Column max must be length 1008 (the group size) or one, not 2
The function max_level_shift returns two columns:
max_level_shift(AirPassengers)
#> max_level_shift time_level_shift
#> 54.5 117.0
How can I spread my data such that I have my data in the following way:
> FANG_returns %>%
+ filter(symbol == "FB") %>%
+ summarise(ent = entropy(daily.returns))
# A tibble: 1 x 2
symbol ent max_level_shift time_level_shift
<chr> <dbl> <dbl> <dbl>
1 FB 0.991 0.0573yyy 0.92764zzzz
Any pointers would be great.
Instead of using spread, we can just subset max_level_shift(daily.returns) by positions, as we know max will be in the first position and time in the 2nd position.
library(tidyquant)
library(tsfeatures)
FANG_returns %>%
filter(symbol == "FB") %>%
summarise(ent = entropy(daily.returns) ,
max_level_shift = max_level_shift(daily.returns)[1],
time_level_shift = max_level_shift(daily.returns)[2])
# A tibble: 1 x 4
symbol ent max_level_shift time_level_shift
<chr> <dbl> <dbl> <dbl>
1 FB 0.991 0.0484 141
#2nd option
FANG_returns %>%
filter(symbol == "FB") %>%
summarise(ent = entropy(daily.returns) ,
max = paste(max_level_shift(daily.returns), collapse = '-')) %>%
separate(max, into=c('max_level_shift','time_level_shift'), sep = '-', convert = TRUE)

Resources