creating summary tables with tidyverse - r

Q1. Is there a more direct (but still tidyverse) way to create a summary table like this?
library(tidyverse)
library(knitr)
library(kableExtra)
df <- data.frame(group=c(1, 1, 1, 1, 0, 0, 0, 0),
v1=c(1, 2, 3, 4, 5, 6, 1, 2),
v2=c(4, 3, 2, 5, 3, 5, 3, 8),
v3=c(0, 1, 0, 1, 1, 0, 1, 1))
df %>%
group_by(group) %>%
summarise(v1=paste0(round(mean(v1), 2),
" (",
round(sd(v1), 2),
")"),
v2=paste0(round(mean(v2), 2),
" (",
round(sd(v2), 2),
")"),
v3=round(mean(v3)*100, 1)
) %>%
dplyr::select(-group) %>%
t() %>%
`rownames<-` (c("v1 mean (SD)",
"v2 mean (SD)",
"Percent v3")) %>%
kable("html",
col.names=c("Group 0", "Group 1")) %>%
kable_styling()
Q2. Related to this, is there a way to combine two levels of summarise (e.g., no grouping + grouping) without repeating the summarise code?
all <-
df %>%
summarise(v1=paste0(round(mean(v1), 2),
" (",
round(sd(v1), 2),
")"),
v2=paste0(round(mean(v2), 2),
" (",
round(sd(v2), 2),
")"),
v3=round(mean(v3)*100, 1)
) %>%
t() %>%
`rownames<-` (c("v1 mean (SD)",
"v2 mean (SD)",
"Percent v3"))
groups <-
df %>%
group_by(group) %>%
summarise(v1=paste0(round(mean(v1), 2),
" (",
round(sd(v1), 2),
")"),
v2=paste0(round(mean(v2), 2),
" (",
round(sd(v2), 2),
")"),
v3=round(mean(v3)*100, 1)
) %>%
dplyr::select(-group) %>%
t() %>%
`rownames<-` (c("v1 mean (SD)",
"v2 mean (SD)",
"Percent v3"))
all %>%
cbind(groups) %>%
kable("html",
col.names=c("All", "Group 0", "Group 1")) %>%
kable_styling()

One solution (especially if you want to expand the number of columns v1, v2, ... in the future) to make your code a bit more concise might be, to put paste0(round(mean(v1), 2)," (", round(sd(v1), 2), ")") into a function: paste_mean_and_sd = function(df_col){paste0(round(mean(df_col), 2)," (", round(sd(df_col), 2), ")")}.
That would shorten your "pipeline" and make it more easily readable:
... %>% summarise(v1 = paste_mean_and_sd(v1), v2 = paste_mean_and_sd(v2), v3=round(mean(v3)*100, 1)) %>% ...

This is the minimum I can think of.
cat_var <- "v3"
df_cal <- function(x, var) {
if (var[1] %in% cat_var) return(as.character(round(mean(x), 1)))
paste0(mean(x), " (", round(sd(x), 2), ")")
}
df_tall <- df %>% gather(var, x, v1:v3) %>% group_by(var)
all <- df_tall %>% summarise(stat = df_cal(x, var)) %>% mutate(group = -1)
groups <- df_tall %>% group_by(group, var) %>% summarise(stat = df_cal(x, var))
bind_rows(all, groups) %>%
ungroup() %>%
mutate(var = factor(var, labels = c(
"v1 mean (SD)", "v2 mean (SD)", "Precent v3"
))) %>%
spread(group, stat) %>%
kable("html", col.names = c(" ", "All", "Group 0", "Group 1")) %>%
kable_styling()

Related

R: Creating a Function with a "Dynamic" Structure

I am working with the R programming language.
Suppose there is a classroom of students - each student flips the same coin many times (the students don't flip the coin the same number of times). Here is a simulate dataset to represent this example:
library(tidyverse)
library(dplyr)
set.seed(123)
ids = 1:100
student_id = sample(ids, 1000, replace = TRUE)
coin_result = sample(c("H", "T"), 1000, replace = TRUE)
my_data = data.frame(student_id, coin_result)
my_data = my_data[order(my_data$student_id),]
I want to count the number of "3 Flip Sequences" recorded by each student (e.g. Student 1 got HHHTH : HHH 1 time, HHT 1 time, HTH 1 time)
And the probability of the 3rd Flip based on the previous 2 flips (e.g. in general, over all students, the probability of a H following HH was 0.54)
Here is some R code that performs these tasks:
results = my_data %>%
group_by(student_id) %>%
summarize(Sequence = str_c(coin_result, lead(coin_result), lead(coin_result, 2)), .groups = 'drop') %>%
filter(!is.na(Sequence)) %>%
count(Sequence)
final = results %>%
mutate(two_seq = substr(Sequence, 1, 2)) %>%
group_by(two_seq) %>%
mutate(third = substr(Sequence, 3, 3)) %>%
group_by(two_seq, third) %>%
summarize(sums = sum(n)) %>%
mutate(prob = sums / sum(sums))
My Question: Suppose I want to now extend this problem to "4 Flip Sequences" (e.g. probability of H given HHH) - I can manually extend this code:
results = my_data %>%
group_by(student_id) %>%
summarize(Sequence = str_c(coin_result, lead(coin_result), lead(coin_result, 2), lead(coin_result, 3)), .groups = 'drop') %>%
filter(!is.na(Sequence)) %>%
count(Sequence)
final = results %>%
mutate(three_seq = substr(Sequence, 1, 3)) %>%
group_by(three_seq) %>%
mutate(fourth = substr(Sequence, 4, 4)) %>%
group_by(three_seq, fourth) %>%
summarize(sums = sum(n)) %>%
mutate(prob = sums / sum(sums))
Is it possible to convert the above code into a function such that I can repeat this for arbitrary combinations? For example:
results <- function(i) {return(my_data %>%
group_by(student_id) %>%
summarize(Sequence = str_c(coin_result, lead(coin_result), lead(coin_result, i+1), lead(coin_result, i+2) .....### insert code here ####), .groups = 'drop') %>%
filter(!is.na(Sequence)) %>%
count(Sequence))}
final <- function(i)
return(results %>%
mutate(three_seq = substr(Sequence, 1, i)) %>%
group_by(three_seq) %>%
mutate(fourth = substr(Sequence, i+1, i+1)) %>%
group_by(three_seq, fourth) %>%
summarize(sums = sum(n)) %>%
mutate(prob = sums / sum(sums)))
}
I am not sure how exactly I would do this, seeing as the first function would require to be "dynamically changed" depending on the value of "i".
Can someone please show me how to do this?
Thanks!
Here's a way you can do it in base R:
# Returns a vector of 0's and 1's, bit more efficient than sample
tosses <- floor(runif(1e3, 0, 2))
count_seqs <- function(x, seq_length) {
vec_length <- length(x)
rolling_window_indices <- rep(1:seq_length, vec_length - seq_length + 1) +
rep(0:(vec_length - seq_length), each = seq_length)
mat <- matrix(x[rolling_window_indices], nrow = seq_length)
sequences <- apply(mat, 2, paste0, collapse = "")
table(sequences)
}
count_seqs(tosses, 3)
Notice I didn't include any ids in the code above. The reason is that, if all students have the same probability of tossing heads or tails, we can treat them as independent (or, more precisely, treat the design as ignorable). However, it's easy to expand the code for situations where the tosses are not independent, e.g. where each participant has a different probability of tossing heads:
ids <- floor(runif(1e3, 1, 101))
probs <- runif(1e2, 0, 1)
tosses_by_id <- lapply(ids, function(i) rbinom(10, 1, probs[i]))
lapply(tosses_by_id, function(x) count_seqs(x, 3))

Change row group labels in gt table (with superscript/subscript and line breaks). Customising row group labels in R

I have the following data and table:
library(gt)
library(dplyr)
a <- rnorm(21, mean = 112, sd =12)
colour <- rep(c("Blue", "Red", "Green"), 7)
data <- data.frame(colour, a) %>%
group_by(colour) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = html("[H<sub>2</sub>O]")) %>%
rename(cat = colour)
b <- rnorm(21, mean = 60, sd =12)
day <- rep(c("2", "4", "6"), 7)
data2 <- data.frame(day, b) %>%
group_by(day) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = html("[H<sub>2</sub>O] Additition <br> (Days)")) %>%
rename(cat = day)
bind_rows(data, data2) %>%
group_by(grp) %>%
gt(rowname_col = "cat")
bind_rows(data, data2) %>%
group_by(grp) %>%
gt() %>%
tab_options(row_group.as_column = TRUE)
The row group labels appear literally as '[H<sub>2<\sub>O]', rather than [H2O] etc. It is likely that I am using HTML wrong and it needs to be used with another package/function. I have also tried using cols_label but doesn't recognise these as columns in the dataframe.
Is there also a way to have the row groups column vertically centered, rather than at the top where is currently is? How do you bold these row groups?
The html function won't work outside of a gt table, so you'll have to create the row groups using tab_row_group and add the html labels there.
data <- data.frame(colour, a) %>%
group_by(colour) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = "color") %>%
rename(cat = colour)
data2 <- data.frame(day, b) %>%
group_by(day) %>%
summarise(mean = mean(a), sd = sd(a), n = n()) %>%
mutate(grp = "day") %>%
rename(cat = day)
bind_rows(data, data2) %>%
gt() %>%
tab_row_group(
label = html("[H<sub>2</sub>O]"),
rows = grp == "color"
) %>%
tab_row_group(
label = html("[H<sub>2</sub>O] Additition <br> (Days)"),
rows = grp == "day"
) %>%
cols_hide(grp)

pivot wider - create two levels of headers for better readability

There are six regression models. I used pivot wider but it is difficult to read.
Can I use two level of headers -
first level - regression model
second level - estimate, tstat
library(dplyr)
regression <- c(rep("A", 3), rep("B", 3), rep("C", 3), rep("D", 3), rep("E", 3), rep("F", 3))
attribute <- rep(c("b0", "b1", "b2"), 6)
estimate <- round(runif(n = 18, min = 0, max = 10), 2)
tstat <- round(runif(n = 18, min = 0, max = 10), 2)
# tibble
tbl <- tibble(regression, attribute, estimate, tstat)
# pivot wider
tbl <- tbl %>%
pivot_wider(names_from = regression,
values_from = c("estimate", "tstat"))
One option is separate_header from ftExtra
library(ftExtra)
library(dplyr)
library(tidyr)
library(stringr)
tbl %>%
pivot_wider(names_from = regression,
values_from = c("estimate", "tstat")) %>%
rename_with(~ str_replace(., "(.*)_(.*)", "\\2_\\1"), -1) %>%
as_flextable() %>%
separate_header()
-output
Or may use span_header
library(flextable)
tbl %>%
pivot_wider(names_from = regression,
values_from = c("estimate", "tstat")) %>%
rename_with(~ str_replace(., "(.*)_(.*)", "\\2_\\1"), -1) %>%
select(attribute, order(str_remove(names(.)[-1], "_.*")) + 1) %>%
as_flextable() %>%
span_header() %>%
align(align = "center", part = "all")
-output
If we need to make some column bold,
tbl %>%
pivot_wider(names_from = regression,
values_from = c("estimate", "tstat")) %>%
rename_with(~ str_replace(., "(.*)_(.*)", "\\2_\\1"), -1) %>%
mutate(across(ends_with('tstat'), ~sprintf('**%.2f**', .))) %>%
select(attribute, order(str_remove(names(.)[-1], "_.*")) + 1) %>%
as_flextable() %>%
span_header() %>%
align(align = "center", part = "all") %>%
colformat_md()
-output

Vectorised wilcox.test function dplyr

I have tried to create a vectorised version of the wilcox.test function in R. It doesn't return the same p values as the original wilcox.test function. Does anyone understand why ?
library(tidyverse)
vect_wilcox <- function(df, grouping_variable, x, y){
df %>%
group_by({{grouping_variable}}) %>%
group_map(~ broom::tidy(wilcox.test(pull(.x[x]), pull(.x[y])), paired = TRUE)) %>%
enframe %>%
unnest(value) %>%
mutate(df %>% distinct({{grouping_variable}}))
}
df <- tribble(
~session, ~name_var, ~time_pt1, ~time_pt2,
1, "fio2", 90, NA,
2, "fio2", 100, 80,
3, "fio2", 100, 70,
4, "fio2", 90, 70,
1, "ph", 7.24, NA,
2, "ph", 7.19, 7.38,
3, "ph", 7.2, 7.2,
4, "ph", 7.37, 7.33
)
new_wilcox <- vect_wilcox(df, grouping_variable = name_var, x= "time_pt1", y="time_pt2")
d3 <- df %>%
pivot_longer(col = 3:4, names_to = "time_point", values_to = "value") %>%
pivot_wider(
names_from = c(name_var, time_point),
values_from = value,
names_sep = "_")
pval = format(wilcox.test(d3$fio2_time_pt1,d3$fio2_time_pt2,paired=T)$p.value,digits=3)
pval = c(pval,format(wilcox.test(d3$ph_time_pt1,d3$ph_time_pt2,paired=T)$p.value,digits=3))
(comp<- new_wilcox %>%
select(name_var, p.value) %>%
mutate(old_p.value = pval) %>%
rename(new_p.value = p.value))
dput(comp)
Thanks a lot !
There were two errors:
a parenthesis error in group_map
the variable reassignement with mutate(df %>% distinct({{grouping_variable}}))obviously didn't work so I changed group_map all together for group_modify.
This function works:
vect_wilcox <- function(df, grouping_variable, x, y){
df %>%
group_by({{grouping_variable}}) %>%
group_modify(~ broom::tidy(
wilcox.test(
x = pull(.x[x]),
y = pull(.x[y]),
paired = TRUE))) %>%
ungroup()
}

How to count instances of duplicate characters within a string?

I have a dataframe:
levels counts
1, 2, 2 24
1, 2 20
1, 3, 3, 3 15
1, 3 10
1, 2, 3 25
I want to treat, for example, "1, 2, 2" and "1, 2" as the same thing. So, as long as there is a "1" and "2" without any other character, it will count as the level "1, 2". Here is the desired data frame:
levels counts
1, 2 44
1, 3 25
1, 2, 3 25
Here is code to reproduce the original data frame:
df <- data.frame(levels = c("1, 2, 2", "1, 2", "1, 3, 3, 3", "1, 3", "1, 2, 3"),
counts = c(24, 20, 15, 10, 25))
df$levels <- as.character(df$levels)
Split df$levels, get the unique elements, and then sort it. Then use that to obtain aggregate of counts.
df$levels2 = sapply(strsplit(df$levels, ", "), function(x)
paste(sort(unique(x)), collapse = ", ")) #Or toString(sort(unique(x))))
aggregate(counts~levels2, df, sum)
# levels2 counts
#1 1, 2 44
#2 1, 2, 3 25
#3 1, 3 25
A solution uses tidyverse. df2 is the final output.
library(tidyverse)
df2 <- df %>%
mutate(ID = 1:n()) %>%
mutate(levels = strsplit(levels, split = ", ")) %>%
unnest() %>%
distinct() %>%
arrange(ID, levels) %>%
group_by(ID, counts) %>%
summarise(levels = paste(levels, collapse = ", ")) %>%
ungroup() %>%
group_by(levels) %>%
summarise(counts = sum(counts))
Update
Based on comments below, a solution using ideas similar to d.b
df2 <- df %>%
mutate(l2 = map_chr(strsplit(levels, ", "),
.f = ~ .x %>% unique %>% sort %>% toString)) %>%
group_by(l2) %>%
summarise(counts = sum(counts))

Resources