Extracting data from R summary object to a table - r

With some df tab1 that gives summary output like below
summary(tab1)
I need to extract a table out of this like below
What is the right way to go about extracting data for the above table? I am looking for a solution that can use R's summary object or an equivalent. Eventually I will use kable to pretty-print it.

We could combine apply with kable_styling from kableExtra package:
library(dplyr)
library(kableExtra)
#example data
df <- mtcars %>%
select(1:3)
as.data.frame(apply(df,2,summary)) %>%
kbl() %>%
kable_styling()

Here is another tidyverse approach. Just select the columns you want and iterate over them one at a time. This gives you a list of them that you can then join together.
library(dplyr)
library(purrr)
library(tibble)
library(tidyr) # provides replace_na(), used in the pipeline below
# add in a NA value
mtcars2 <- mtcars
mtcars2[5, "wt"] <- NA
summary_vars <- mtcars2 %>%
  # select variables of interest
  select(wt, hp) %>%
  # iterate over each column to create a data frame of the summary statistics;
  # .x is the column, .y its name (used as the value column's name)
  imap(~ enframe(c(summary(.x)), name = "Metric", value = .y)) %>%
  # move through the list joining the results together; a full join keeps
  # "NA's" even though only columns with missing values report it
  reduce(full_join, by = "Metric") %>%
  # iterate through each joined column, replacing NA with 0
  modify_if(is.numeric, replace_na, 0) %>%
  # keep only the statistics of interest
  filter(Metric %in% c("Mean", "Min.", "Max.", "Median", "NA's"))
summary_vars
# # A tibble: 5 x 3
# Metric wt hp
# <chr> <dbl> <dbl>
# 1 Min. 1.51 52
# 2 Median 3.22 123
# 3 Mean 3.21 147.
# 4 Max. 5.42 335
# 5 NA's 1 0
You can now use this summary_vars with kable().

I think that this will give you what you want:
df <- data.frame(
a = c(rnorm(100), NA),
b = rnorm(101),
c = rnorm(101)
)
sumtab <- summary(df)
rownames(sumtab) <- names(summary(df$a))
sumtab <- sumtab[c("Mean", "Min.", "Max.", "Median", "NA's"),]
sumtab[5,][is.na(sumtab[5,])] <- 0
knitr::kable(gsub("^.+\\:", "", sumtab))
| | a | b | c |
|:------|:-------|:--------|:-------|
|Mean |0.1033 |-0.03059 |-0.1172 |
|Min. |-2.0331 |-2.63760 |-1.9736 |
|Max. |1.9081 |1.93582 |2.1688 |
|Median |0.2622 |0.05986 |-0.1869 |
|NA's |1 |0 |0 |
Alternatively, you could just select the rows that you want from the summary table when you initially calculate the summary and use your preferred names:
sumtab <- summary(df)[c(4, 1, 6, 3, 7),]
rownames(sumtab) <- c("Mean", "Min", "Max", "Median", "NA")
sumtab["NA", ][is.na(sumtab["NA", ])] <- 0
knitr::kable(gsub("^.+\\:", "", sumtab))
| | a | b | c |
|:------|:--------|:-------|:--------|
|Mean |-0.06337 |0.1074 |-0.00909 |
|Min |-2.27831 |-1.9913 |-2.01234 |
|Max |2.09241 |2.2401 |2.58487 |
|Median |-0.08660 |0.2212 |0.01049 |
|NA |1 |0 |0 |

# Summary of the non-character columns (numeric, factor, ...) of a data frame,
# reshaped so that each variable is a row and each statistic reported by
# summary() is a column.
#
# Args:
#   df: a data frame.
#
# Returns:
#   A data frame of character columns: `var_name` followed by one column per
#   statistic label (e.g. "Min.", "1st Qu.", "Median", ...). Trailing white
#   space is stripped from every cell; leading padding produced by summary()
#   is kept. If summary() reports "NA's", that column is named "missing".
my_summ <- function(df) {
  # drop = FALSE keeps a data frame even when a single column survives, so
  # summary() always dispatches to the data-frame method.
  is_char <- vapply(df, typeof, character(1)) == "character"
  df_nonchar <- df[, !is_char, drop = FALSE]
  summ <- data.frame(summary(df_nonchar), row.names = NULL)
  # test for empty columns; usually the 1st column is empty as a result of the
  # "table" (summary object) to data.frame coercion.
  empty <- vapply(summ, function(x) all(x == ""), logical(1))
  summ <- summ[, !empty, drop = FALSE]
  summ <- setNames(summ, c("var_name", "stats"))
  # variables that lack a statistic (e.g. no "NA's" entry) are padded with NA
  summ <- summ[!is.na(summ$stats), ]
  # split "label:value" at the FIRST colon only, so values that themselves
  # contain a colon stay intact
  cell <- summ$stats
  summ <- data.frame(
    summ[1],
    stats = sub(":.*$", "", cell),
    value = sub("^[^:]*:", "", cell),
    stringsAsFactors = FALSE
  )
  # pivot into wide form, using 'stats' column as a key.
  summ <- reshape(summ,
    direction = "wide",
    idvar = "var_name",
    timevar = "stats",
    v.names = "value"
  )
  # reshape() names the new columns "value.<label>"; recover the bare label
  stat_nms <- sub("^value\\.", "", colnames(summ)[-1])
  names(summ)[-1] <- sub("\\s+$", "", stat_nms) # remove trailing white space
  rownames(summ) <- NULL
  # remove trailing white space column by column; lapply() keeps the
  # one-row-per-variable shape even when there is only a single variable
  summ[] <- lapply(summ, function(x) sub("\\s+$", "", x))
  attr(summ, "reshapeWide") <- NULL # bookkeeping left behind by reshape()
  # when vars in the dataset contain NAs, summary() adds an "NA's" entry
  colnames(summ)[colnames(summ) == "NA's"] <- "missing"
  summ
}
my_summ(mtcars[, 1:5])
var_name Min. 1st Qu. Median Mean 3rd Qu. Max.
1 mpg 10.40 15.43 19.20 20.09 22.80 33.90
2 cyl 4.000 4.000 6.000 6.188 8.000 8.000
3 disp 71.1 120.8 196.3 230.7 326.0 472.0
4 hp 52.0 96.5 123.0 146.7 180.0 335.0
5 drat 2.760 3.080 3.695 3.597 3.920 4.930
# transpose of that would put stats(summaries) on rows and vars on columns
t(my_summ(mtcars[, 1:5]))
[,1] [,2] [,3] [,4] [,5]
var_name " mpg" " cyl" " disp" " hp" " drat"
Min. "10.40" "4.000" " 71.1" " 52.0" "2.760"
1st Qu. "15.43" "4.000" "120.8" " 96.5" "3.080"
Median "19.20" "6.000" "196.3" "123.0" "3.695"
Mean "20.09" "6.188" "230.7" "146.7" "3.597"
3rd Qu. "22.80" "8.000" "326.0" "180.0" "3.920"
Max. "33.90" "8.000" "472.0" "335.0" "4.930"

There is a nice package for it called vtable
df <- data.frame(a = c(rnorm(100), NA), b = rnorm(101), c = rnorm(101))
library(vtable)
# return will return the data as data.frame but look at other options as well
sumtable(df, out = "return")
# Variable N Mean Std. Dev. Min Pctl. 25 Pctl. 75 Max
# 1 a 100 0.235 0.955 -1.593 -0.433 0.792 3.078
# 2 b 101 0.052 1.017 -2.542 -0.675 0.654 2.753
# 3 c 101 -0.163 1.01 -2.894 -0.832 0.497 2.326
# https://cran.r-project.org/web/packages/vtable/vignettes/sumtable.html
# Options for out:
# browser: Loads output in web browser.
# viewer: Loads output in Viewer pane (RStudio only).
# htmlreturn: Returns HTML code for output file.
# return: Returns summary table in data frame format. Depending on options, the data frame may be entirely character variables.
# csv: Returns summary table in data.frame format and, with a file option, saves that to CSV.
# kable: Returns a knitr::kable()
# latex: Returns a LaTeX table.
# latexpage: Returns an independently-buildable LaTeX document.

A solution based on purrr::map and data.table:
library(tidyverse)
library(data.table)
mtcars2 <- mtcars; mtcars2[4,7] <- NA
mtcars2 %>%
map(summary) %>% bind_rows %>% transpose(keep.names = "rn") %>%
setnames(-1, names(mtcars2)) %>% as_tibble
#> # A tibble: 7 × 12
#> rn mpg cyl disp hp drat wt qsec vs am gear carb
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Min. 10.4 4 71.1 52 2.76 1.51 14.5 0 0 3 1
#> 2 1st Qu. 15.4 4 121. 96.5 3.08 2.58 16.9 0 0 3 2
#> 3 Median 19.2 6 196. 123 3.70 3.32 17.6 0 0 4 2
#> 4 Mean 20.1 6.19 231. 147. 3.60 3.22 17.8 0.438 0.406 3.69 2.81
#> 5 3rd Qu. 22.8 8 326 180 3.92 3.61 18.8 1 1 4 4
#> 6 Max. 33.9 8 472 335 4.93 5.42 22.9 1 1 5 8
#> 7 NA's NA NA NA NA NA NA 1 NA NA NA NA
A tidyverse approach:
library(tidyverse)
mtcars[4,7] <- NA
mtcars %>%
summary %>%
as.data.frame %>%
separate(Freq, into = c("name", "value"),sep = ":") %>%
mutate(across(everything(), ~ str_trim(as.character(.))),
value = parse_number(value)) %>%
pivot_wider(id_cols = "name", names_from = "Var2", values_from = "value") %>%
filter(!str_detect(name,"Qu."))
#> # A tibble: 5 × 12
#> name mpg cyl disp hp drat wt qsec vs am gear carb
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Min. 10.4 4 71.1 52 2.76 1.51 14.5 0 0 3 1
#> 2 Median 19.2 6 196. 123 3.70 3.32 17.6 0 0 4 2
#> 3 Mean 20.1 6.19 231. 147. 3.60 3.22 17.8 0.438 0.406 3.69 2.81
#> 4 Max. 33.9 8 472 335 4.93 5.42 22.9 1 1 5 8
#> 5 NA's NA NA NA NA NA NA 1 NA NA NA NA

Related

How to loop through my dataframe and extract certain values?

the code below shows me extracting certain values for 1 parameter in my data frame (Calcium), but I want to be able to do this for all of the parameters/rows in my data frame. There are multiple rows for Calcium, which is why I took the median value.
How can I create a loop that does this for the other drug substance parameters?
Cal_limits=ag_limits_5 %>% filter(PARAMETER=="Drug Substance.Calcium")
lcl <- median(Cal_limits$LCL, na.rm = TRUE)
ucl <- median(Cal_limits$UCL, na.rm = TRUE)
lsl <- median(Cal_limits$LSL_1, na.rm = TRUE)
usl <- median(Cal_limits$USL_1, na.rm = TRUE)
cl <- median(Cal_limits$TARGET_MEAN, na.rm = TRUE)
stdev <- median(Cal_limits$TARGET_STDEV, na.rm = TRUE)
sigabove <- ucl + stdev #3.219 #(UCL + sd (3.11+0.107))
sigbelow <- lcl - stdev#2.363 #(LCL - sd (2.47-0.107))
Snapshot showing that there are multiple rows dedicated to one parameter, the columns not pictured have confidential information but include the values I am looking to extract
Edit: I am creating an RShiny app, so I am not sure if I will need to incorporate a reactive function
Using mtcars, you can do
aggregate(. ~ cyl, data = mtcars, FUN = median)
# cyl mpg disp hp drat wt qsec vs am gear carb
# 1 4 26.0 108.0 91.0 4.080 2.200 18.900 1 1 4 2.0
# 2 6 19.7 167.6 110.0 3.900 3.215 18.300 1 0 4 4.0
# 3 8 15.2 350.5 192.5 3.115 3.755 17.175 0 0 3 3.5
which provides the median for each of the variables (. means "all others") for each of the levels of cyl. I'm going to guess that this would apply to your data as
aggregate(. ~ PARAMETER, data = ag_limits_5, FUN = median)
If you have more columns than you want to reduce, then you can specify them manually with
# cbind() on the left-hand side summarises each listed column separately;
# joining the names with `+` would take the median of their row-wise sum.
# NOTE: the formula method drops every row that has an NA in any listed
# column (na.action = na.omit), which can differ from a per-column
# median(..., na.rm = TRUE).
aggregate(cbind(LCL, UCL, LSL_1, USL_1, TARGET_MEAN, TARGET_STDEV) ~ PARAMETER,
  data = ag_limits_5, FUN = median)
and I think you'll get output something like
# PARAMETER LCL UCL LSL_1 USL_1 TARGET_MEAN TARGET_STDEV
# 1 Drug Substance.Calcium 1.1 1.2 1.3 1.4 ...
# 2 Drug Substance.Copper ...
(with real numbers, I'm just showing structure there).
Since it appears that you're using dplyr, you can do it this way, too:
mtcars %>%
group_by(cyl) %>%
summarize(across(everything(), ~ median(., na.rm = TRUE)))
# # A tibble: 3 x 11
# cyl mpg disp hp drat wt qsec vs am gear carb
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 4 26 108 91 4.08 2.2 18.9 1 1 4 2
# 2 6 19.7 168. 110 3.9 3.22 18.3 1 0 4 4
# 3 8 15.2 350. 192. 3.12 3.76 17.2 0 0 3 3.5
which for you might be
ag_limits_5 %>%
group_by(PARAMETER) %>%
summarize(across(everything(), ~ median(., na.rm = TRUE)))

How to create a one-row data frame from a vector in R?

What is the most concise and/or elegant way to turn a vector into a single-row data frame in R? I'm trying to use a function that returns a 5-element vector and apply it across groups in my data frame, like so:
library(tidyverse)
mtcars %>%
group_by(cyl) %>%
group_map(~boxplot.stats(.$wt)$stats)
I'd like to have these results as rows in a new data frame, rather than a list of vectors, so I can plot with them in ggplot. But the best I've come up with is to add do.call(rbind, .) %>% as.data.frame to the end of my pipeline, which seems ungainly and inelegant. Replacing the group_map with a group_modify seems like the answer, but the function complains that my results aren't a data frame. And wrapping the anonymous function in a call to tibble_row doesn't work because tibble_row won't take vectors, just individual elements or explicit list columns.
Ideally, output would look like:
cyl V1 V2 V3 V4 V5
1 4 1.513 1.8850 2.200 2.6225 3.19
2 6 2.620 2.8225 3.215 3.4400 3.46
3 8 3.170 3.5200 3.755 4.0700 4.07
Surely there's a more elegant way to do what I'm trying to do, ideally within the tidyverse framework?
We could use unnest_wider after returning the output in a list in summarise
library(dplyr)
library(tidyr)
mtcars %>%
  group_by(cyl) %>%
  # store the five boxplot statistics of each group in a list-column `x`
  summarise(x = list(boxplot.stats(wt)$stats)) %>%
  # the stored vectors are unnamed, so names_sep is required; the new columns
  # are named <column><position>, i.e. x1 ... x5
  unnest_wider(x, names_sep = "")
# A tibble: 3 x 6
# cyl x1 x2 x3 x4 x5
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 4 1.51 1.88 2.2 2.62 3.19
#2 6 2.62 2.82 3.22 3.44 3.46
#3 8 3.17 3.52 3.76 4.07 4.07
Or if we want to use the OP's method, then set the names for that vector and use as_tibble_row
library(purrr)
library(stringr)
mtcars %>%
group_by(cyl) %>%
group_map(~ tibble(cyl = first(.x$cyl),
setNames(boxplot.stats(.$wt)$stats, str_c('x', 1:5)) %>%
as_tibble_row) , .keep = TRUE) %>%
bind_rows
# A tibble: 3 x 6
# cyl x1 x2 x3 x4 x5
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 4 1.51 1.88 2.2 2.62 3.19
#2 6 2.62 2.82 3.22 3.44 3.46
#3 8 3.17 3.52 3.76 4.07 4.07
As the output of group_map is always a list, it may be better to use group_modify to return a tbl thus avoiding the last map_dfr/bind_rows
mtcars %>%
group_by(cyl) %>%
group_modify(~ setNames(boxplot.stats(.$wt)$stats, str_c('x', 1:5)) %>%
as_tibble_row , .keep = TRUE) %>%
ungroup
# A tibble: 3 x 6
# cyl x1 x2 x3 x4 x5
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 4 1.51 1.88 2.2 2.62 3.19
#2 6 2.62 2.82 3.22 3.44 3.46
#3 8 3.17 3.52 3.76 4.07 4.07
Base R method with aggregate :
tmp <- aggregate(wt~cyl, mtcars, function(x) boxplot.stats(x)$stats)
tmp
# cyl wt.1 wt.2 wt.3 wt.4 wt.5
#1 4 1.5130 1.8850 2.2000 2.6225 3.1900
#2 6 2.6200 2.8225 3.2150 3.4400 3.4600
#3 8 3.1700 3.5200 3.7550 4.0700 4.0700
The wt.1, wt.2 etc columns are part of matrix, to get the output as individual columns you can do
data.frame(cyl = tmp$cyl, tmp$wt)

Rowwise subtract vectors using purrr

I have a numeric dataframe (m rows * n columns)
For each row of this dataframe, I want to treat it as
a numeric vector (1 * n) and subtract from it another
fixed (1 * n) vector. So for each row we return a
(1 * n) vector.
I would like to return a list with this vector subtraction
done for each row of the dataframe. So in this case
a list with m number of 1 * n vectors.
I have manually done this for 2 rows in a simple reprex
below:
library(tidyverse)
#> Registered S3 methods overwritten by 'ggplot2':
# Subtract a reference vector from a row that is supplied as a vector.
#   inp_vec:  the row values
#   diff_val: the values to subtract, element by element
# Returns the element-wise difference inp_vec - diff_val.
diff_vec <- function(inp_vec, diff_val) {
  shifted <- inp_vec - diff_val
  shifted
}
# Create a test (dummy) dataset with 3 rows and 4 columns
test_dat <- mtcars %>% dplyr::slice(c(1, 3, 6)) %>% dplyr::select(1:4)
test_dat
#> mpg cyl disp hp
#> 1 21.0 6 160 110
#> 2 22.8 4 108 93
#> 3 18.1 6 225 105
# This is the vector we want to subtract from each row
diff_v <- c(3.2, 5.4, 7.5, 8.2)
first_row <- test_dat %>% dplyr::slice(1) %>% as.vector()
test_out1 <- diff_vec(inp_vec = first_row, diff_val = diff_v)
first_row
#> mpg cyl disp hp
#> 1 21 6 160 110
test_out1
#> mpg cyl disp hp
#> 1 17.8 0.6 152.5 101.8
second_row <- test_dat %>% dplyr::slice(2) %>% as.vector()
test_out2 = diff_vec(inp_vec = second_row, diff_val = diff_v)
second_row
#> mpg cyl disp hp
#> 1 22.8 4 108 93
test_out2
#> mpg cyl disp hp
#> 1 19.6 -1.4 100.5 84.8
Created on 2019-06-07 by the reprex package (v0.2.1)
Could anyone please show how to do this using
purrr based approach?
Thanks
A simple solution exists:
test_dat %>% map2_dfc(diff_v, ~ .x - .y)
Resulting tibble:
mpg cyl disp hp
<dbl> <dbl> <dbl> <dbl>
1 17.8 0.600 152. 102.
2 19.6 -1.4 100. 84.8
3 14.9 0.600 218. 96.8

Summary statistics grouped by multiple columns on specific columns corresponding to them

I have a data.frame of size 75 million x 36 (75 million rows), where the columns are
col1, col1_decile, col2, col2_decile ........... col18 , col18_decile
Now I want to get summary statistics ( min, max, mean and standard deviation) corresponding for each of the columns col1,col2 ....... col18 grouped by their deciles.
i.e. summary statistics of
col1 by col1_decile, of col2 by col2_decile , of col3 by col3_decile ......, of col18 by col18_decile
For a reproducible example , I will do with the mtcars dataset:
library(dplyr)
data("mtcars")
mtcars %>% mutate_all(funs(decile = ntile(., 10))) -> mtcars_deciled
head(mtcars_deciled)
Here the columns are
mpg,cyl, disp, hp, drat,wt,qsec, vs, am, gear, carb,mpg_decile, cyl_decile, disp_decile, hp_decile, drat_decile,wt_decile qsec_decile, vs_decile, am_decile, gear_decile,carb_decile
I want the final data.frame to look like
decile mpg_decile_min mpg_decile_max mpg_decile_mean mpg_decile_sd ...
and so on for all the columns.
Each min, max, mean , std. deviation will be calculated based on the corresponding decile column
since it is a huge dataset of 75 million rows, I'm looking for fast solutions. I have tinkered with seplyr in R! , but didn't get far.
Fast solutions with data.table or dplyr or seplyr would be appreciated. The final data.frame should have 10 rows and 73 columns ( 4 summary statistic columns for min,max , mean and sd for each deciled columns ( 18 decile columns) and the common decile group column
decile mpg_decile_min mpg_decile_max mpg_decile_mean mpg_decile_sd .... carb_decile_min carb_decile_max carb_decile_mean carb_decile_sd
Here is one possibility only with data.table.
The problem is the structure of the dataset with a mix of variables types on the same line (decile and measures). You have to reorganise it to make the aggregation easier.
The following example might be slow on a big dataset (grepl, gsub, ifelse, ...?) and can probably be optimized. There are also multiple copies of the whole dataset. Maybe piping each command into the next one would be better? Advice welcome...
library(data.table)
library(dplyr)
data("mtcars")
# Your example in data.table format: every column gets a companion
# "<column>_decile" column holding its decile (1-10)
DT <- as.data.table(
  mtcars %>%
    mutate(across(everything(), ~ ntile(.x, 10), .names = "{.col}_decile"))
)
# Add an ID for each row (.I is the row number, and is safe for empty tables)
DT[, ID := .I]
# Transform the dataset in "long" format
tmp <- melt(DT, id.vars = "ID")
# Create a variable to make the distinction between the decile values and the
# measurements. Maybe not optimal for speed ?
tmp[, decile := ifelse(grepl("_decile$", variable), "decile", "value")]
# Remove the "_decile" suffix
tmp[, variable := gsub("_decile$", "", variable)]
# Cross table to have for each observation, the type of variable, the decile and the value
tmp <- dcast(tmp, ID + variable ~ decile, value.var = "value")
# Now it is quite straightforward to compute your summary statistics with data.table syntax
result <- tmp[, .(min = min(value), max = max(value), mean = mean(value), sd = sd(value)),
  keyby = .(variable, decile)]
# second argument is topn: show the first and last 10 rows
print(result, 10)
## variable decile min max mean sd
## 1: am 1 0.000 0.000 0.000000 0.00000000
## 2: am 2 0.000 0.000 0.000000 0.00000000
## 3: am 3 0.000 0.000 0.000000 0.00000000
## 4: am 4 0.000 0.000 0.000000 0.00000000
## 5: am 5 0.000 0.000 0.000000 0.00000000
## 6: am 6 0.000 1.000 0.250000 0.50000000
## 7: am 7 1.000 1.000 1.000000 0.00000000
## 8: am 8 1.000 1.000 1.000000 0.00000000
## 9: am 9 1.000 1.000 1.000000 0.00000000
## 10: am 10 1.000 1.000 1.000000 0.00000000
## ---
## 101: wt 1 1.513 1.935 1.724500 0.19428759
## 102: wt 2 2.140 2.320 2.220000 0.09165151
## 103: wt 3 2.465 2.770 2.618333 0.15250683
## 104: wt 4 2.780 3.150 2.935000 0.19215879
## 105: wt 5 3.170 3.215 3.191667 0.02254625
## 106: wt 6 3.435 3.440 3.438750 0.00250000
## 107: wt 7 3.460 3.570 3.516667 0.05507571
## 108: wt 8 3.570 3.780 3.693333 0.10969655
## 109: wt 9 3.840 4.070 3.918333 0.13137098
## 110: wt 10 5.250 5.424 5.339667 0.08712252
Here is a piped version of the same code :
result <-
  # add a row ID by reference (.I is the row number)
  DT[, ID := .I] %>%
  # long format: one row per (ID, column)
  melt(id.vars = "ID") %>%
  # flag whether a row carries a decile or a measurement
  .[, decile := ifelse(grepl("_decile$", variable), "decile", "value")] %>%
  # strip the suffix so a decile and its measurement share a variable name
  .[, variable := gsub("_decile$", "", variable)] %>%
  # put the decile and the measurement side by side for each observation
  dcast(ID + variable ~ decile, value.var = "value") %>%
  .[, .(min = min(value), max = max(value), mean = mean(value), sd = sd(value)),
    keyby = .(variable, decile)]
tidyverse solution
Using mtcars_deciled as the data. Replace mtcars with yourdata in the following solution to apply to your situation. This assumes _decile columns are a fixed width apart from the parent column.
library(tidyverse)
numcol <- ncol(mtcars)
ans <- map2(seq_len(numcol), names(mtcars), ~mtcars_deciled[,c(.x, .x+numcol)] %>%
group_by_at(vars(dplyr::contains("decile"))) %>%
summarise_at(vars(.y), funs(mean, sd, min, max)))
Note dplyr::contains is necessary to disambiguate it from purrr::contains
output
This will result in a list of data frames
[[1]]
# A tibble: 10 x 5
mpg_decile mean_mpg min_mpg max_mpg sd_mpg
<int> <dbl> <dbl> <dbl> <dbl>
1 1 12.10000 10.4 14.3 2.00499377
2 2 14.96667 14.7 15.2 0.25166115
3 3 15.50000 15.2 15.8 0.30000000
4 4 17.16667 16.4 17.8 0.70945989
5 5 18.66667 18.1 19.2 0.55075705
6 6 20.22500 19.2 21.0 0.91787799
7 7 21.43333 21.4 21.5 0.05773503
8 8 23.33333 22.8 24.4 0.92376043
9 9 27.90000 26.0 30.4 2.26053091
10 10 32.23333 30.4 33.9 1.75594229
[[2]]
# A tibble: 10 x 5
cyl_decile mean_cyl min_cyl max_cyl sd_cyl
<int> <dbl> <dbl> <dbl> <dbl>
1 1 4.000000 4 4 0.000000
2 2 4.000000 4 4 0.000000
3 3 4.000000 4 4 0.000000
4 4 5.333333 4 6 1.154701
5 5 6.000000 6 6 0.000000
6 6 7.000000 6 8 1.154701
7 7 8.000000 8 8 0.000000
8 8 8.000000 8 8 0.000000
9 9 8.000000 8 8 0.000000
10 10 8.000000 8 8 0.000000
# etc

dplyr - summary table for multiple variables

How to create simple summary statistics using dplyr from multiple variables? Using the summarise_each function seems to be the way to go, however, when applying multiple functions to multiple columns, the result is a wide, hard-to-read data frame.
Use dplyr in combination with tidyr to reshape the end result.
library(dplyr)
library(tidyr)
df <- as_tibble(mtcars)
df.sum <- df %>%
  select(mpg, cyl, vs, am, gear, carb) %>% # select variables to summarise
  # apply every statistic to every selected column; the result columns are
  # named "<variable>_<statistic>", e.g. mpg_min
  summarise(across(everything(), list(
    min = min,
    q25 = ~ quantile(.x, 0.25),
    median = median,
    q75 = ~ quantile(.x, 0.75),
    max = max,
    mean = mean,
    sd = sd
  )))
# the result is a wide data frame
> dim(df.sum)
[1] 1 42
# reshape it using tidyr functions
df.stats.tidy <- df.sum %>%
  # split each "<variable>_<statistic>" name at its LAST underscore, so
  # variable names that themselves contain underscores are handled; the
  # statistic part becomes the output column (.value)
  pivot_longer(
    everything(),
    names_to = c("var", ".value"),
    names_pattern = "^(.*)_([^_]+)$"
  ) %>%
  arrange(var) %>% # one row per variable, in alphabetical order
  select(var, min, q25, median, q75, max, mean, sd) # reorder columns
> print(df.stats.tidy)
var min q25 median q75 max mean sd
1 am 0.0 0.000 0.0 1.0 1.0 0.40625 0.4989909
2 carb 1.0 2.000 2.0 4.0 8.0 2.81250 1.6152000
3 cyl 4.0 4.000 6.0 8.0 8.0 6.18750 1.7859216
4 gear 3.0 3.000 4.0 4.0 5.0 3.68750 0.7378041
5 mpg 10.4 15.425 19.2 22.8 33.9 20.09062 6.0269481
6 vs 0.0 0.000 0.0 1.0 1.0 0.43750 0.5040161
A potentially easy solution can be created with broom::tidy and purrr::map_df. broom::tidy summarises key objects from statistical output into a tibble. purrr::map_df applies a function to each element, in this case a column, and returns a tibble.
Example
library(tidyverse)
mtcars %>%
select(mpg, cyl, vs, am, gear, carb) %>%
map_df(.f = ~ broom::tidy(summary(.x)), .id = "variable")
Results
# A tibble: 6 x 7
# variable minimum q1 median mean q3 maximum
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 mpg 10.4 15.4 19.2 20.1 22.8 33.9
# 2 cyl 4 4 6 6.19 8 8
# 3 vs 0 0 0 0.438 1 1
# 4 am 0 0 0 0.406 1 1
# 5 gear 3 3 4 3.69 4 5
# 6 carb 1 2 2 2.81 4 8
I liked paljenczy's idea of just using dplyr/tidy and getting the table in a data.frame/tibble before formatting it. But I ran into robustness issues: Because it relies on parsing variable names it choked on columns with underscores in the names. After trying to fix this within the dplyr framework it seemed like it would always be somewhat fragile because it relied on string parsing.
So in the end I decided on using psych::describe() which is a function designed for exactly this thing. It doesn't do completely arbitrary functions, but pretty much anything one would realistically want to do. A full example duplicating the previous solutions is included below, combining psych::describe() with some tidyverse stuff to get the exact tibble we are looking for.
It is worth noting that this answer has been updated to reflect the changed behavior of as_tibble() with regards to how it handles rownames in data.frames:
library(psych)
library(tidyverse)
# Create an extended version with a bunch of stats
d.summary.extended <- mtcars %>%
select(mpg, cyl, vs, am, gear, carb) %>%
psych::describe(quant=c(.25,.75)) %>%
as_tibble(rownames="rowname") %>%
print()
<OUTPUT>
# A tibble: 6 x 16
rowname vars n mean sd median trimmed mad min max range skew kurtosis se Q0.25 Q0.75
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 mpg 1 32 20.09062 6.0269481 19.2 19.6961538 5.41149 10.4 33.9 23.5 0.6106550 -0.372766 1.06542396 15.425 22.8
2 cyl 2 32 6.18750 1.7859216 6.0 6.2307692 2.96520 4.0 8.0 4.0 -0.1746119 -1.762120 0.31570933 4.000 8.0
3 vs 3 32 0.43750 0.5040161 0.0 0.4230769 0.00000 0.0 1.0 1.0 0.2402577 -2.001938 0.08909831 0.000 1.0
4 am 4 32 0.40625 0.4989909 0.0 0.3846154 0.00000 0.0 1.0 1.0 0.3640159 -1.924741 0.08820997 0.000 1.0
5 gear 5 32 3.68750 0.7378041 4.0 3.6153846 1.48260 3.0 5.0 2.0 0.5288545 -1.069751 0.13042656 3.000 4.0
6 carb 6 32 2.81250 1.6152000 2.0 2.6538462 1.48260 1.0 8.0 7.0 1.0508738 1.257043 0.28552971 2.000 4.0
</OUTPUT>
# Select stats for comparison with other solutions
d.summary <- d.summary.extended %>%
select(var=rowname, min, q25=Q0.25, median, q75=Q0.75, max, mean, sd) %>%
print()
<OUTPUT>
# A tibble: 6 x 8
var min q25 median q75 max mean sd
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 mpg 10.4 15.425 19.2 22.8 33.9 20.09062 6.0269481
2 cyl 4.0 4.000 6.0 8.0 8.0 6.18750 1.7859216
3 vs 0.0 0.000 0.0 1.0 1.0 0.43750 0.5040161
4 am 0.0 0.000 0.0 1.0 1.0 0.40625 0.4989909
5 gear 3.0 3.000 4.0 4.0 5.0 3.68750 0.7378041
6 carb 1.0 2.000 2.0 4.0 8.0 2.81250 1.6152000
</OUTPUT>
If you want to create a summary table for publication (not for further calculations) you may want to look at the excellent stargazer package.
library(stargazer)
df <- data.frame(mtcars)
cols <- c("mpg", "cyl", "vs", "am", "gear", "carb")
stargazer(
  df[, cols],
  type = "text",
  # min, quartiles, median, max, then mean and standard deviation
  summary.stat = c("min", "p25", "median", "p75", "max", "mean", "sd")
)
================================================================
Statistic Min Pctl(25) Median Pctl(75) Max Mean St. Dev.
----------------------------------------------------------------
mpg 10.400 15.430 19.200 22.800 33.900 20.091 6.027
cyl 4 4 6 8 8 6.188 1.786
vs 0 0 0 1 1 0.438 0.504
am 0 0 0 1 1 0.406 0.499
gear 3 3 4 4 5 3.688 0.738
carb 1 2 2 4 8 2.812 1.615
----------------------------------------------------------------
You can change type to 'latex' or 'html' as well, and save the result to a file by passing the file name in the 'out' argument.
There's a "new" package called skimr that has a function called skim() that gives wonderful output describing individual variables in a data.frame/tibble.
Try:
skimr::skim(mtcars)
and you'll get:
it is customizable and works well with pipes etc.
see ?skimr::skim()
and vignette("Using_skimr", package = "skimr")
You can achieve the same result using data.table as well. You might consider using it if your table is big.
dt <- data.table(mtcars)
cols <- c('mpg', 'cyl', 'vs', 'am', 'gear', 'carb')
functions <- c('min', 'q25', 'median', 'q75', 'max', 'mean', 'sd')
dt.sum <- dt[
,
lapply(
.SD,
function(x) list(
min(x), quantile(x, 0.25), median(x),
quantile(x, 0.75), max(x), mean(x), sd(x)
)
),
.SDcols = cols
]
dt.sum
mpg cyl vs am gear carb
1: 10.4 4 0 0 3 1
2: 15.43 4 0 0 3 2
3: 19.2 6 0 0 4 2
4: 22.8 8 1 1 4 4
5: 33.9 8 1 1 5 8
6: 20.09 6.188 0.4375 0.4062 3.688 2.812
7: 6.027 1.786 0.504 0.499 0.7378 1.615
# transpose and provide meaningful names
# dt.sum has one row per statistic and one column per variable, so its
# transpose has one row per variable and one column per statistic
dt.sum.t <- as.data.table(t(dt.sum))
# the transposed columns follow the order of the statistics listed in `functions`
setnames(dt.sum.t, names(dt.sum.t), functions)
# label each row with the variable it describes and move that label first
dt.sum.t[, var := cols]
setcolorder(dt.sum.t, c("var", functions))
dt.sum.t
var min q25 median q75 max mean sd
1: mpg 10.4 15.43 19.2 22.8 33.9 20.09 6.027
2: cyl 4 4 6 8 8 6.188 1.786
3: vs 0 0 0 1 1 0.4375 0.504
4: am 0 0 0 1 1 0.4062 0.499
5: gear 3 3 4 4 5 3.688 0.7378
6: carb 1 2 2 4 8 2.812 1.615
Similar to the accepted answer, but tidied up a bit into a function:
# Summary statistics for a set of continuous variables, one row per variable.
#
# Args:
#   d:     a data frame.
#   cvars: character vector naming the columns of `d` to summarise.
#
# Returns:
#   A tibble with a `variable` column followed by N (count of non-missing
#   values), mean, sd, median, min and max. Missing values are ignored by
#   every statistic.
summarise_continuous <- function(d, cvars) {
  d %>%
    select(all_of(cvars)) %>%
    # coerce every selected column to numeric before summarising
    mutate(across(everything(), as.numeric)) %>%
    # result columns are named "<variable>_<statistic>"
    summarise(across(all_of(cvars), list(
      N = ~ sum(!is.na(.x)),
      mean = ~ mean(.x, na.rm = TRUE),
      sd = ~ sd(.x, na.rm = TRUE),
      median = ~ median(.x, na.rm = TRUE),
      min = ~ min(.x, na.rm = TRUE),
      max = ~ max(.x, na.rm = TRUE)
    ))) %>%
    # the first (.+) is greedy, so names are split at their last underscore
    # and variable names containing underscores survive intact
    pivot_longer(everything(),
      names_to = c("variable", ".value"),
      names_pattern = "(.+)_(.+)"
    ) # %>%
  # knitr::kable()
  # uncomment these bits if you want a nicely formatted table in a .Rmd document
}
summarise_continuous(mtcars, c("mpg", "cyl", "vs", "am", "gear", "carb"))

Resources