Print summary statistics in sub-groups of a flextable object

Context: I am trying to create a docx table with summary statistics by group.
Question: how can I add a summary statistic (e.g. the sum) at the top or bottom of each group, and get a "total" line as the last row?
So far I got good results using flextable::as_grouped_data() as shown here: https://davidgohel.github.io/flextable/reference/as_grouped_data.html#see-also
Example:
library(dplyr) # feel free to use data.table if you prefer, I am just more used to dplyr
library(tidyr) # needed for pivot_wider()
data_co2_2 <- CO2 %>%
  group_by(Type, Treatment, conc) %>%
  summarise(uptake = mean(uptake)) %>%
  pivot_wider(names_from = Type, values_from = uptake)
data_co2_2 <- as_grouped_data(x = data_co2_2, groups = c("Treatment"))
Output:
data_co2_2
#> Treatment conc Quebec Mississippi
#> 1 nonchilled NA NA NA
#> 3 <NA> 95 15.26667 11.30000
#> 4 <NA> 175 30.03333 20.20000
#> 5 <NA> 250 37.40000 27.53333
#> 6 <NA> 350 40.36667 29.90000
#> 7 <NA> 500 39.60000 30.60000
#> 8 <NA> 675 41.50000 30.53333
#> 9 <NA> 1000 43.16667 31.60000
#> 2 chilled NA NA NA
#> 10 <NA> 95 12.86667 9.60000
#> 11 <NA> 175 24.13333 14.76667
#> 12 <NA> 250 34.46667 16.10000
#> 13 <NA> 350 35.80000 16.60000
#> 14 <NA> 500 36.66667 16.63333
#> 15 <NA> 675 37.50000 18.26667
#> 16 <NA> 1000 40.83333 18.73333
Expected output: instead of NA in the "group" line I would like to display a summary statistic (like the sum of the sub-group). Icing on the cake: display an "Overall total" row at the bottom of the table.

instead of NA in the "group" line I would like to display a summary statistic (like the sum of the sub-group).
That is not possible when using as_grouped_data() %>% as_flextable(): the value displayed in the group row is the name of the group. Here is a proposed workaround:
library(flextable)
library(dplyr)
library(tidyr)
CO2 <- CO2 %>%
  mutate(conc = as.character(conc))

# Mean uptake per concentration within each Type/Treatment sub-group
agg1 <- CO2 %>%
  group_by(Type, Treatment, conc) %>%
  summarise(uptake = mean(uptake), .groups = "drop")

# Overall mean per Type/Treatment (the summary line for each sub-group)
agg2 <- CO2 %>%
  group_by(Type, Treatment) %>%
  summarise(uptake = mean(uptake), .groups = "drop") %>%
  mutate(conc = "Overall")

# Overall mean per Type (the "Overall total" lines at the bottom)
agg3 <- CO2 %>%
  group_by(Type) %>%
  summarise(uptake = mean(uptake), .groups = "drop") %>%
  mutate(conc = "Overall", Treatment = "Overall")

all_data <- bind_rows(agg1, agg2, agg3) %>%
  arrange(Type, Treatment, conc) %>%
  pivot_wider(names_from = Type, values_from = uptake)

as_grouped_data(x = all_data, groups = c("Treatment")) %>%
  as_flextable() %>%
  compose(i = ~ is.na(conc) & is.na(Treatment),
          j = "conc", value = as_paragraph("avg for all conc")) %>%
  bold(bold = TRUE, i = ~ !is.na(Treatment)) %>%
  color(i = ~ conc %in% "Overall", color = "red") %>%
  colformat_double(j = c("Quebec", "Mississippi"), digits = 1)
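Since the context is a Word document, the finished flextable can be exported with flextable::save_as_docx(). A minimal sketch, assuming the pipeline above has been stored in an object (the object name ft and the file name are only examples):
# store the flextable built above in an object, e.g.
# ft <- as_grouped_data(x = all_data, groups = c("Treatment")) %>% as_flextable() %>% ...
# then write it to a .docx file (the path is only an example)
save_as_docx(ft, path = "co2_summary.docx")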

Related

How to exclude percentages from Total column and row when using janitor::adorn_percentages()

Is there any way I can get the output below directly from adorn functions?
library(janitor)
library(stringr)
df <- mtcars %>%
  tabyl(am, cyl) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages("row") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns(position = "front")
df
# am 4 6 8 Total
# 0 3 (15.79%) 4 (21.05%) 12 (63.16%) 19 (100.00%)
# 1 8 (61.54%) 3 (23.08%) 2 (15.38%) 13 (100.00%)
# Total 11 (34.38%) 7 (21.88%) 14 (43.75%) 32 (100.00%)
df$Total <- str_replace(df$Total, " \\s*\\([^\\)]+\\)", "")
df[df$am == "Total",] <- str_replace(df[df$am == "Total",], " \\s*\\([^\\)]+\\)", "")
df
# am 4 6 8 Total
# 0 3 (15.79%) 4 (21.05%) 12 (63.16%) 19
# 1 8 (61.54%) 3 (23.08%) 2 (15.38%) 13
# Total 11 7 14 32
This solution still uses janitor but finishes in one run with the help of dplyr and readr:
We add one mutate(across(...)) line to your code, with a case_when() condition that only touches the "Total" row and (the trick) readr::parse_number(), which automatically extracts the leading number.
The second step is to apply parse_number() to the Total column:
library(janitor)
library(readr)
library(dplyr)
mtcars %>%
  tabyl(am, cyl) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages("row") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns(position = "front") %>%
  mutate(across(-c(am, Total),
                ~ case_when(am == "Total" ~ as.character(parse_number(.)),
                            TRUE ~ .))) %>%
  mutate(Total = parse_number(Total))
am 4 6 8 Total
0 3 (15.79%) 4 (21.05%) 12 (63.16%) 19
1 8 (61.54%) 3 (23.08%) 2 (15.38%) 13
Total 11 7 14 32
We could use the tidyselect `...` argument in some of the adorn functions to control which columns get adorned:
library(dplyr)
library(janitor)
mtcars %>%
  tabyl(am, cyl) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages("row", `...` = -c(am, Total)) %>%
  adorn_pct_formatting(digits = 2, `...` = -c(am, Total)) %>%
  adorn_ns(position = "front", `...` = -c(am, Total)) %>%
  mutate(across(-c(am, Total),
                ~ replace(.x, n(), readr::parse_number(.x[n()]))))
-output
am 4 6 8 Total
0 3 (15.79%) 4 (21.05%) 12 (63.16%) 19
1 8 (61.54%) 3 (23.08%) 2 (15.38%) 13
Total 11 7 14 32
Or use group_modify
mtcars %>%
  tabyl(am, cyl) %>%
  adorn_totals(c("row", "col")) %>%
  group_by(grp = replace(am, am != 'Total', 'Cell')) %>%
  group_modify(~ if (.y$grp != "Total") {
    .x %>%
      adorn_percentages("row", `...` = -c(am, Total)) %>%
      adorn_pct_formatting(digits = 2, `...` = -c(am, Total)) %>%
      adorn_ns(position = "front", `...` = -c(am, Total))
  } else {
    .x %>%
      mutate(across(-Total, as.character))
  }) %>%
  ungroup %>%
  select(-grp)
-output
# A tibble: 3 × 5
am `4` `6` `8` Total
<chr> <chr> <chr> <chr> <dbl>
1 0 3 (15.79%) 4 (21.05%) 12 (63.16%) 19
2 1 8 (61.54%) 3 (23.08%) 2 (15.38%) 13
3 Total 11 7 14 32
Essentially your problem is that you want to call adorn_totals() after you create the percentages. But you can't do that because then you're working with character columns with values like "3 (15.79%)", and you can't sum them.
I would just create a function to calculate the totals in one data frame and the percentages in the other and join them together:
library(dplyr)
library(janitor)
create_formatted_totals <- function(rows, cols, dat) {
  dat_pct <- dat |>
    tabyl({{ rows }}, {{ cols }}) |>
    adorn_percentages() |>
    adorn_pct_formatting() |>
    adorn_ns(position = "front")
  totals <- dat |>
    tabyl({{ rows }}, {{ cols }}) |>
    adorn_totals(c("row", "col")) |>
    mutate(across(everything(), as.character))
  # Add row totals
  dat_pct$Total <- head(totals$Total, -1)
  # Add col totals
  dat_pct <- rbind(dat_pct, tail(totals, 1))
  return(dat_pct)
}
You can then just do:
create_formatted_totals(am, cyl, mtcars)
# am 4 6 8 Total
# 0 3 (15.8%) 4 (21.1%) 12 (63.2%) 19
# 1 8 (61.5%) 3 (23.1%) 2 (15.4%) 13
# Total 11 7 14 32
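Because create_formatted_totals() takes the row and column variables as bare names, it can be reused for other cross-tabulations without modification; for example (vs and gear are existing mtcars columns, output not shown):
# same function, different grouping variables
create_formatted_totals(vs, gear, mtcars)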

How can I add individual summary values per participant or group to a long dataframe in R, when the replacement is shorter than the original variable?

I have a long dataset with about 6000 observations per participant. I would like to compute a count for one of my variables (the maximum count is 12) and add this count as a new variable in the data frame. However, there should be only one value per participant; the remaining cells can be filled with NA.
I first attempted to create an empty variable and then tried the following mutation:
dfl$Hits <- NA
dfl$Hits <- dfl %>%
  group_by(participant) %>%
  filter(SpaceREsponseType == "Hit") %>%
  count() %>%
  mutate(id = cur_group_id()) %>%
  mutate(id, na.rm = F)
I have also tried
dfl$Hits <- dfl %>%
  group_by(participant) %>%
  mutate(n = replace(rep(NA, n()), 1, sum(!is.na(SpaceREsponseType == "Hit")))) %>%
  ungroup
However, this results in the following error message:
Error:
! Assigned data ... %>% count() must be compatible with existing data.
✖ Existing data has 66619 rows.
✖ Assigned data has 142 rows.
ℹ Only vectors of size 1 are recycled.
What do I need to add to make this work?
Thanks in advance and best wishes,
Jasmine
I have created a sample data frame below.
The data are grouped by participant and Hit, and a row number is added.
With mutate(n = n()) the Hits and NoHits are counted per participant.
After making the data wider, the condition is applied with case_when().
Then the result is brought back into the original long format.
library(tidyverse)
df <- data.frame(
  participant = sample(c("A", "B", "C"), replace = T, 100),
  Hit = sample(c("Hit", "NoHit"), replace = T, 100)
)
df |>
  group_by(participant, Hit) |>
  mutate(rn = row_number()) |>
  mutate(n = n()) |>
  pivot_wider(names_from = Hit, values_from = n) |>
  ungroup() |>
  mutate(across(
    ends_with("it"),
    ~ case_when(
      rn == 1 ~ .x,
      rn > 1 ~ NA_integer_
    )
  )) |>
  pivot_longer(NoHit:Hit) |>
  select(-rn)
#> # A tibble: 114 × 3
#> participant name value
#> <chr> <chr> <int>
#> 1 A NoHit 21
#> 2 A Hit 12
#> 3 B NoHit 17
#> 4 B Hit 17
#> 5 A NoHit NA
#> 6 A Hit NA
#> 7 C NoHit 19
#> 8 C Hit 14
#> 9 B NoHit NA
#> 10 B Hit NA
#> # … with 104 more rows
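For comparison, the same one-value-per-participant result can be computed directly inside mutate(), which keeps the original number of rows and so avoids the length mismatch reported in the error. A sketch using the sample df and tidyverse loaded above (with the question's data the count would be sum(SpaceREsponseType == "Hit") grouped by participant):
df |>
  group_by(participant) |>
  # write the per-participant count into the first row only, NA elsewhere
  mutate(Hits = if_else(row_number() == 1, sum(Hit == "Hit"), NA_integer_)) |>
  ungroup()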

Programmatically count grouped data using logic rules and string

I have a grouped data frame which I want to summarise into "count of values less than x, y, z by group". I can manually generate the wide data frame I want using code like the example below:
library(tidyverse)
set.seed(1337)
df <- data.frame(cbind(group = seq(1:5), num = sample(x = 1:400, size = 100, replace = T)))
manual <- df %>%
  group_by(group) %>%
  summarise(less_than_50 = sum(num < 50),
            less_than_100 = sum(num < 100),
            less_than_150 = sum(num < 150))
However, I'd like to be able to define a list of "less thans" and generate these columns by referring to a list. I've done something similar in the past, though using enframe(quantile()) to generate a long list of quantiles before pivoting
pc <- c(0.1, 0.5, 0.9)
quantiles <- df %>%
  group_by(group) %>%
  summarise(enframe(quantile(num, pc))) %>%
  pivot_wider(
    id_cols = group,
    names_from = name,
    values_from = value
  )
But I don't know / understand the way to define a custom function within the enframe(). Ideally I'd like to apply this in something like the code below (though this obviously doesn't work), with or without the pivot step, in order to get back to the same output as "manual"
levels <- c(50, 100, 150)
programmatic <- df %>%
group_by(group) %>%
summarise(cols = ("less_than", x), num < levels) %>%
pivot...
Any help greatly appreciated
One way you could do it:
library(tidyverse)
set.seed(1337)
df <- data.frame(cbind(group = seq(1:5), num = sample(x = 1:400, size = 100, replace = T)))
less_than <- function(x) {
  df %>%
    group_by(group) %>%
    summarise(less_than_ = sum(num < x)) %>%
    rename_with(~ str_c(., x), .cols = -group)
}
levels <- c(50, 100, 150)
map_dfr(levels, less_than) |>
  group_by(group) |>
  summarise(across(everything(), mean, na.rm = TRUE))
#> # A tibble: 5 × 4
#> group less_than_50 less_than_100 less_than_150
#> <int> <dbl> <dbl> <dbl>
#> 1 1 4 5 10
#> 2 2 2 2 5
#> 3 3 2 6 11
#> 4 4 4 5 5
#> 5 5 1 7 9
# Manual result for comparison
df %>%
  group_by(group) %>%
  summarise(less_than_50 = sum(num < 50),
            less_than_100 = sum(num < 100),
            less_than_150 = sum(num < 150))
#> # A tibble: 5 × 4
#> group less_than_50 less_than_100 less_than_150
#> <int> <int> <int> <int>
#> 1 1 4 5 10
#> 2 2 2 2 5
#> 3 3 2 6 11
#> 4 4 4 5 5
#> 5 5 1 7 9
Created on 2022-06-06 by the reprex package (v2.0.1)
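Another sketch that stays closer to the enframe()/pivot_wider() pattern from the question: name the cutoffs, count below each one inside summarise(), then pivot wide. It relies on summarise() being allowed to return several rows per group, exactly as in the quantile example above (newer dplyr versions would prefer reframe() here):
# reuses df and the tidyverse loaded above; "less_than_" is the question's naming
levels <- c(50, 100, 150)
df %>%
  group_by(group) %>%
  summarise(enframe(map_int(set_names(levels, paste0("less_than_", levels)),
                            ~ sum(num < .x))),
            .groups = "drop") %>%
  pivot_wider(names_from = name, values_from = value)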

applying weighted.mean for specific values in a column

I have a data frame named df with five columns:
age <- c(10,11,12,12,10,11,11,12,10,11,12)
time <- c(20,26,41,60,29,28,54,24,59,70,25)
weight <- c(123,330,445,145,67,167,190,104,209,146,201)
gender <- c(1,1,2,2,2,2,1,2,2,2,1)
Q2 <- c(112,119,114,120,121,117,116,114,121,122,124)
df <- data.frame(age, weight, time, gender, Q2)
What I want is to apply weighted.mean() for each age, using two conditions: 1) gender == 2 and 2) Q2 >= 114 & Q2 <= 121.
With the code below I can apply weighted.mean(), but I do not know how to incorporate my two conditions.
df1 <- df %>%
  group_by(age) %>%
  summarise(weighted_time = weighted.mean(time, weight))
Is the following what you are looking for?
library(tidyverse)
age <- c(10,11,12,12,10,11,11,12,10,11,12)
time <- c(20,26,41,60,29,28,54,24,59,70,25)
weight <- c(123,330,445,145,67,167,190,104,209,146,201)
gender <- c(1,1,2,2,2,2,1,2,2,2,1)
Q2 <- c(112,119,114,120,121,117,116,114,121,122,124)
df <- data.frame(age, weight, time, gender, Q2)
df %>%
  group_by(age) %>%
  filter(gender == 2 & Q2 >= 114 & Q2 <= 121) %>%
  summarise(weighted_time = weighted.mean(time, weight), .groups = "drop")
#> # A tibble: 3 × 2
#> age weighted_time
#> <dbl> <dbl>
#> 1 10 51.7
#> 2 11 28
#> 3 12 42.4
You can add a filter() for those two (really three) conditions:
df %>% filter(gender == 2 & Q2 >= 114 & Q2 <= 121) %>% group_by(age) %>% summarise(weighted_time = weighted.mean(time, weight))
This gives
# A tibble: 3 x 2
age weighted_time
<dbl> <dbl>
1 10 51.7
2 11 28
3 12 42.4
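If you would rather keep every age in the output even when no rows meet the conditions (filter() drops such an age entirely), the conditions can instead be applied by subsetting inside summarise(); a sketch reusing the df defined above:
df %>%
  group_by(age) %>%
  # ages with no qualifying rows come back as NaN instead of being dropped
  summarise(weighted_time = weighted.mean(time[gender == 2 & Q2 >= 114 & Q2 <= 121],
                                          weight[gender == 2 & Q2 >= 114 & Q2 <= 121]),
            .groups = "drop")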
A data.table approach:
age <- c(10,11,12,12,10,11,11,12,10,11,12)
time <- c(20,26,41,60,29,28,54,24,59,70,25)
weight <- c(123,330,445,145,67,167,190,104,209,146,201)
gender <- c(1,1,2,2,2,2,1,2,2,2,1)
Q2 <- c(112,119,114,120,121,117,116,114,121,122,124)
df <- data.frame(age, weight, time, gender, Q2)
library(data.table)
setDT(df)[gender == 2 & (Q2 >=114 & Q2 <= 121), list(res = weighted.mean(time, weight)), by = age
][order(age)]
#> age res
#> 1: 10 51.71739
#> 2: 11 28.00000
#> 3: 12 42.42219
Created on 2021-12-10 by the reprex package (v2.0.1)

Perform multiple two-sample t-test using dplyr in R

I would like to perform multiple pairwise t-tests on a dataset containing about 400 different column variables and 3 subject groups, and extract p-values for every comparison. A shorter representative example of the data, using only 2 variables, could be the following:
df <- tibble(var1 = rnorm(90, 1, 1), var2 = rnorm(90, 1.5, 1), group = rep(1:3, each = 30))
Ideally the end result will be a summarised data frame containing four columns; one for the variable being tested (var1, var2 etc.), two for the groups being tested every time and a final one for the p-value.
I've tried duplicating the group column in the long form, and doing a double group_by in order to do the comparisons but with no result
result <- df %>%
  pivot_longer(var1:var2, "var", "value") %>%
  rename(group_a = group) %>%
  mutate(group_b = group_a) %>%
  group_by(group_a, group_b) %>%
  summarise(n = n())
We can reshape the data into 'long' format with pivot_longer, then, grouped by 'group', apply pairwise.t.test, transform the result into a tibble with tidy (from broom), and unnest the list column:
library(dplyr)
library(tidyr)
library(broom)
df %>%
  pivot_longer(cols = -group, names_to = 'grp') %>%
  group_by(group) %>%
  summarise(out = list(pairwise.t.test(value, grp) %>% tidy)) %>%
  unnest(c(out))
-output
# A tibble: 3 x 4
group group1 group2 p.value
<int> <chr> <chr> <dbl>
1 1 var2 var1 0.0760
2 2 var2 var1 0.0233
3 3 var2 var1 0.000244
In case you end up wanting more information about the t-tests, here is an approach that also extracts the degrees of freedom and the value of the test statistic:
library(dplyr)
library(tidyr)
library(purrr)
library(broom)
df <- tibble(
  var1 = rnorm(90, 1, 1),
  var2 = rnorm(90, 1.5, 1),
  group = rep(1:3, each = 30)
)
df %>%
  select(-group) %>%
  names() %>%
  map_dfr(~ {
    y <- .
    combn(3, 2) %>%
      t() %>%
      as.data.frame() %>%
      pmap_dfr(function(V1, V2) {
        df %>%
          select(group, all_of(y)) %>%
          filter(group %in% c(V1, V2)) %>%
          t.test(as.formula(sprintf("%s ~ group", y)), ., var.equal = TRUE) %>%
          tidy() %>%
          transmute(y = y,
                    group_1 = V1,
                    group_2 = V2,
                    df = parameter,
                    t_value = statistic,
                    p_value = p.value)
      })
  })
#> # A tibble: 6 x 6
#> y group_1 group_2 df t_value p_value
#> <chr> <int> <int> <dbl> <dbl> <dbl>
#> 1 var1 1 2 58 -0.337 0.737
#> 2 var1 1 3 58 -1.35 0.183
#> 3 var1 2 3 58 -1.06 0.295
#> 4 var2 1 2 58 -0.152 0.879
#> 5 var2 1 3 58 1.72 0.0908
#> 6 var2 2 3 58 1.67 0.100
And here is @akrun's answer tweaked to give the same p-values as the approach above. Note the p.adjust.method = "none", which gives unadjusted independent t-tests and will inflate your Type I error rate.
df %>%
  pivot_longer(cols = -group, names_to = "y") %>%
  group_by(y) %>%
  summarise(out = list(tidy(pairwise.t.test(value, group,
                                            p.adjust.method = "none",
                                            pool.sd = FALSE)))) %>%
  unnest(c(out))
#> # A tibble: 6 x 4
#> y group1 group2 p.value
#> <chr> <chr> <chr> <dbl>
#> 1 var1 2 1 0.737
#> 2 var1 3 1 0.183
#> 3 var1 3 2 0.295
#> 4 var2 2 1 0.879
#> 5 var2 3 1 0.0909
#> 6 var2 3 2 0.100
Created on 2021-07-30 by the reprex package (v1.0.0)
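If you do want to correct for multiple comparisons rather than use unadjusted p-values, only the p.adjust.method argument needs to change; pairwise.t.test()'s default is the Holm adjustment, and any method accepted by p.adjust() works. A sketch reusing the long pipeline above:
df %>%
  pivot_longer(cols = -group, names_to = "y") %>%
  group_by(y) %>%
  # Holm-adjusted p-values for the three group comparisons per variable
  summarise(out = list(tidy(pairwise.t.test(value, group,
                                            p.adjust.method = "holm",
                                            pool.sd = FALSE)))) %>%
  unnest(c(out))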
