I have some struggles converting the following data (from an Excel-sheet) into a tidy format:
input <- structure(list(...11 = c(
NA, NA, "<1000", ">=1000 and <2000",
"2000", ">2000 and < 3000", ">=3000"
), ...13 = c(
"male", "female",
NA, NA, NA, NA, NA
), ...14 = c(
"<777", "<555", "0.3", "0.1",
"0.15", "0.13", "0.15"
), ...15 = c(
"888-999", "555-999", "0.23",
"0.21", "0", "0.21", "0.36"
), ...16 = c(
"556-899", "1020-1170",
"0.13", "0.29", "0.7", "0.8", "0.2"
), ...17 = c(
">960", ">11000",
"0.58", "0.31", "0.22", "0.65", "0.7"
)), row.names = c(NA, -7L), class = c("tbl_df", "tbl", "data.frame"))
# A tibble: 7 × 6
...11 ...13 ...14 ...15 ...16 ...17
<chr> <chr> <chr> <chr> <chr> <chr>
1 NA male <777 888-999 556-899 >960
2 NA female <555 555-999 1020-1170 >11000
3 <1000 NA 0.3 0.23 0.13 0.58
4 >=1000 and <2000 NA 0.1 0.21 0.29 0.31
5 2000 NA 0.15 0 0.7 0.22
6 >2000 and < 3000 NA 0.13 0.21 0.8 0.65
7 >=3000 NA 0.15 0.36 0.2 0.7
I would like to bring it into the following structure:
output <- tibble::tribble(
~gender, ~x, ~y, ~share,
"male", "<777", "<1000", 0.3,
"female", "<555", "<1000", 0.3,
"male", "<777", ">=1000 and <2000", 0.1,
"female", "<555", ">=1000 and <2000", 0.1,
)
# A tibble: 4 × 4
gender x y share
<chr> <chr> <chr> <dbl>
1 male <777 <1000 0.3
2 female <555 <1000 0.3
3 male <777 >=1000 and <2000 0.1
4 female <555 >=1000 and <2000 0.1
Any hints are much appreciated!
As outlined in the comments, here's a suggested approach:
Import the excel sheet twice using readxl's read_excel using the skip argument:
library(readxl)
df1 <- read_excel(file = "yourfile.xlsx", skip = 2)
df2 <- read_excel(file = "yourfile.xlsx", skip = 1)
That should give you (note X1 might be called ...1):
df1 <- read_table("NA male <777 888-999 556-899 >960
<1000 NA 0.3 0.23 0.13 0.58
>=1000and<2000 NA 0.1 0.21 0.29 0.31
2000 NA 0.15 0 0.7 0.22
>2000and<3000 NA 0.13 0.21 0.8 0.65
>=3000 NA 0.15 0.36 0.2 0.7")
df2 <- read_table("NA female <555 555-999 1020-1170 >11000
<1000 NA 0.3 0.23 0.13 0.58
>=1000and<2000 NA 0.1 0.21 0.29 0.31
2000 NA 0.15 0 0.7 0.22
>2000and<3000 NA 0.13 0.21 0.8 0.65
>=3000 NA 0.15 0.36 0.2 0.7")
Then do a little wrangling; most importantly turn into a long format:
library(dplyr)
library(tidyr)
df1 <- df1 |>
select(-male) |>
rename(y = X1) |>
mutate(gender = "male") |>
pivot_longer(-c("gender", "y"), names_to = "x", values_to = "share")
df2 <- df2 |>
select(-female) |>
rename(y = X1) |>
mutate(gender = "female") |>
pivot_longer(-c("gender", "y"), names_to = "x", values_to = "share")
And voila, a tidy frame:
bind_rows(df1, df2) |> arrange(y)
Output:
# A tibble: 40 × 4
y gender x share
<chr> <chr> <chr> <dbl>
1 <1000 male <777 0.3
2 <1000 male 888-999 0.23
3 <1000 male 556-899 0.13
4 <1000 male >960 0.58
5 <1000 female <555 0.3
6 <1000 female 555-999 0.23
7 <1000 female 1020-1170 0.13
8 <1000 female >11000 0.58
9 >=1000and<2000 male <777 0.1
10 >=1000and<2000 male 888-999 0.21
# … with 30 more rows
It's a bit unclear, but I think you'd need to do something like this
df <- input[3:nrow(input),]
input <- input[1:2, 2:3]
t <- input[rep(1:nrow(input), nrow(df)),]
s <- df[rep(1:nrow(df), 2), ]
t <- cbind(t,s)
, and repeat as needed if you need this for multiple columns.
To display the results of a regression I ran, I've got a tibble with estimates and corresponding confidence intervals:
library(tidyverse)
library(magrittr
mydata <- structure(list(term = structure(c(1L, 3L, 4L), .Label = c("Intercept",
"Follow-up time (years)", "Age (years)", "Sex (male)", "Never smoker (reference)",
"Current smoker", "Former smoker", "Obesity (=30 kg/m²)", "BMI (kg/m²)",
"Diabetes", "Glucose (mmol/L)", "Glucose lowering medication use",
"Hypertension", "Systolic blood pressure (mmHg)", "Diastolic blood pressure (mmHg)",
"Antihypertensive medication use", "Hypercholesterolemia", "LDL cholesterol (mmol/L)",
"Lipid lowering medication use", "Chronic kidney disease (mL/min/1.73m²)",
"=90 (reference)", "60-89", "=60"), class = c("ordered", "factor"
)), estimate = c(518.38, 0.98, 1.07), conf_low = c(178.74, 0.93,
0.96), conf_high = c(1503.36, 1.03, 1.19), label = c("518.38 (178.74-1503.36)",
" 0.98 ( 0.93- 1.03)", " 1.07 ( 0.96- 1.19)")), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame"))
mydata
# A tibble: 3 x 4
term estimate conf_low conf_high
<ord> <dbl> <dbl> <dbl>
1 Intercept 518. 179. 1503.
2 Age (years) 0.98 0.93 1.03
3 Sex (male) 1.07 0.96 1.19
To make a label that includes the estimate and 95%CI, I've used paste0, and to make sure that every number has two decimals I've used format. However, when combining these, extra whitespaces appear:
mydata <-
mydata %>%
mutate(
label=
paste0(format(round(estimate, digits=2), nsmall=2),
" (",
format(round(conf_low, digits=2), nsmall=2),
"-",
format(round(conf_high, digits=2), nsmall=2),
")",
sep="", collaps=""))
mydata
# A tibble: 3 x 5
term estimate conf_low conf_high label
<ord> <dbl> <dbl> <dbl> <chr>
1 Intercept 518. 179. 1503. "518.38 (178.74-1503.36)"
2 Age (years) 0.98 0.93 1.03 " 0.98 ( 0.93- 1.03)"
3 Sex (male) 1.07 0.96 1.19 " 1.07 ( 0.96- 1.19)"
Why does this happen? Can I prevent this or otherwise remove the whitespaces so that the format becomes "estimate (conf_low-conf_high)"?
Add trim=TRUE in the format() call:
mydata %>%
mutate(
label=
paste0(format(round(estimate, digits=2), nsmall=2, trim=TRUE),
" (",
format(round(conf_low, digits=2), nsmall=2, trim=TRUE),
"-",
format(round(conf_high, digits=2), nsmall=2, trim=TRUE),
")",
sep="", collaps=""))
# A tibble: 3 × 5
term estimate conf_low conf_high label
<ord> <dbl> <dbl> <dbl> <chr>
1 Intercept 518. 179. 1503. "518.38 (178.74-1503.36)"
2 Age (years) 0.98 0.93 1.03 "0.98 (0.93-1.03)"
3 Sex (male) 1.07 0.96 1.19 "1.07 (0.96-1.19)"
1) Use sprintf
mydata %>%
mutate(label = sprintf("%.2f (%.2f-%.2f)", estimate, conf_low, conf_high))
giving:
# A tibble: 3 x 5
term estimate conf_low conf_high label
<ord> <dbl> <dbl> <dbl> <chr>
1 Intercept 518. 179. 1503. 518.38 (178.74-1503.36)
2 Age (years) 0.98 0.93 1.03 0.98 (0.93-1.03)
3 Sex (male) 1.07 0.96 1.19 1.07 (0.96-1.19)
2) or this variation producing slightly different output
mydata %>%
mutate(label = sprintf("%6.2f (%6.2f-%7.2f)", estimate, conf_low, conf_high))
giving;
# A tibble: 3 x 5
term estimate conf_low conf_high label
<ord> <dbl> <dbl> <dbl> <chr>
1 Intercept 518. 179. 1503. "518.38 (178.74-1503.36)"
2 Age (years) 0.98 0.93 1.03 " 0.98 ( 0.93- 1.03)"
3 Sex (male) 1.07 0.96 1.19 " 1.07 ( 0.96- 1.19)"
I have the following table:
# A tibble: 40 x 5
# Groups: stock [1]
stock date mkt_cap week returns
<chr> <date> <dbl> <int> <dbl>
1 A 2019-03-04 10522834. NA NA
2 A 2019-03-05 11659707. NA 9.70
3 A 2019-03-06 11464531. NA -2.25
4 A 2019-03-07 12217241. NA 5.80
5 A 2019-03-08 11619351. 1 -5.57
6 A 2019-03-09 11578687. NA -0.899
7 A 2019-03-10 11658368. NA 0.141
8 A 2019-03-11 12722921. NA 8.20
9 A 2019-03-12 15429934. NA 18.8
10 A 2019-03-13 16801600. NA 7.98
11 A 2019-03-14 17898334. NA 5.79
12 A 2019-03-15 18492686. 2 2.74
13 A 2019-03-16 20686683. NA 10.7
14 A 2019-03-17 22299970. NA 6.98
15 A 2019-03-18 22924182. NA 2.24
16 A 2019-03-19 24174351. NA 4.79
17 A 2019-03-20 24661467. NA 1.48
18 A 2019-03-21 23351810. NA -5.97
19 A 2019-03-22 27826601. 3 17.0
20 A 2019-03-23 30670482. NA 9.22
21 A 2019-03-24 32802772. NA 6.21
22 A 2019-03-25 31778387. NA -3.68
23 A 2019-03-26 33237006. NA 3.99
24 A 2019-03-27 34971479. NA 4.59
25 A 2019-03-28 36774005. NA 4.53
26 A 2019-03-29 37594815. 4 1.71
27 A 2019-03-30 38321816. NA 1.42
28 A 2019-03-31 35167070. NA -9.08
29 A 2019-04-01 35625396. NA 0.808
30 A 2019-04-02 35764747. NA -0.0940
31 A 2019-04-03 28316242. NA -23.8
32 A 2019-04-04 26124803. NA -8.53
33 A 2019-04-05 30390295. 5 14.6
34 A 2019-04-06 28256485. NA -7.76
35 A 2019-04-07 29807837. NA 4.87
36 A 2019-04-08 30970364. NA 3.36
37 A 2019-04-09 30470093. NA -2.10
38 A 2019-04-10 30860276. NA 0.806
39 A 2019-04-11 27946472. NA -10.4
40 A 2019-04-12 27662766. 6 -1.48
Over this table, I want to run a rolling regression where the rolling regression contains the past month of data. I want to run these rolling regressions over the weeks. That is, over week==1, week ==2 etc., where we use the past month of data. The regression should be lm(return~mkt_cap). I have tried a number of things using the slide_period() function, however, this did not work out for me. For example, I have tried to run
tbl.data %>% group_by(stock, week) %>% slide_period(date, date, "month", ~.x, .before = 1). There are some gaps in my data, therefore I prefer a solution that considers the date.
Could someone help me out? Kind regards.
I would use a tidyverse rowwise approach.
Not clear to me is how models should be created by week and go back to the last month. In the approach below I calculate max_date per week and from this I go back 30 days.
# setup
library(tidyverse)
library(lubridate)
dat <- tribble(~stock, ~date, ~mkt_cap, ~week, ~returns,
"A", "2019-03-04", 10522834., NA, NA,
"A", "2019-03-05", 11659707., NA, 9.70,
"A", "2019-03-06", 11464531., NA, -2.25,
"A", "2019-03-07", 12217241., NA, 5.80,
"A", "2019-03-08", 11619351., 1, -5.57,
"A", "2019-03-09", 11578687., NA, -0.899,
"A", "2019-03-10", 11658368., NA, 0.141,
"A", "2019-03-11", 12722921., NA, 8.20,
"A", "2019-03-12", 15429934., NA, 18.8,
"A", "2019-03-13", 16801600., NA, 7.98,
"A", "2019-03-14", 17898334., NA, 5.79,
"A", "2019-03-15", 18492686., 2, 2.74,
"A", "2019-03-16", 20686683., NA, 10.7,
"A", "2019-03-17", 22299970., NA, 6.98,
"A", "2019-03-18", 22924182., NA, 2.24,
"A", "2019-03-19", 24174351., NA, 4.79,
"A", "2019-03-20", 24661467., NA, 1.48,
"A", "2019-03-21", 23351810., NA, -5.97,
"A", "2019-03-22", 27826601., 3, 17.0,
"A", "2019-03-23", 30670482., NA, 9.22,
"A", "2019-03-24", 32802772., NA, 6.21,
"A", "2019-03-25", 31778387., NA, -3.68,
"A", "2019-03-26", 33237006., NA, 3.99,
"A", "2019-03-27", 34971479., NA, 4.59,
"A", "2019-03-28", 36774005., NA, 4.53,
"A", "2019-03-29", 37594815., 4, 1.71,
"A", "2019-03-30", 38321816., NA, 1.42,
"A", "2019-03-31", 35167070., NA, -9.08,
"A", "2019-04-01", 35625396., NA, 0.808,
"A", "2019-04-02", 35764747., NA, -0.0940,
"A", "2019-04-03", 28316242., NA, -23.8,
"A", "2019-04-04", 26124803., NA, -8.53,
"A", "2019-04-05", 30390295., 5, 14.6,
"A", "2019-04-06", 28256485., NA, -7.76,
"A", "2019-04-07", 29807837., NA, 4.87,
"A", "2019-04-08", 30970364., NA, 3.36,
"A", "2019-04-09", 30470093., NA, -2.10,
"A", "2019-04-10", 30860276., NA, 0.806,
"A", "2019-04-11", 27946472., NA, -10.4,
"A", "2019-04-12", 27662766., 6, -1.48) %>%
mutate(date = as.Date(date)) %>%
fill(week, .direction = "up")
# summarised data.frame by week with min and max date
dat2 <- dat %>%
group_by(week) %>%
summarise(max_date = max(date),
min_date = max_date %m-% months(1))
#> `summarise()` ungrouping output (override with `.groups` argument)
# create the models
dat3 <- dat2 %>%
rowwise() %>%
mutate(mod = list(lm(returns ~ mkt_cap,
data = filter(dat,
date <= .env$max_date,
date >= .env$min_date))))
# get the relevant informationen per week
dat3 %>%
mutate(res = list(broom::tidy(mod)),
broom::glance(mod)) %>%
select(week,
res,
adj.r.squared,
mod_p.value = p.value,
nobs) %>%
unnest(res) %>%
filter(term != "(Intercept)")
#> # A tibble: 6 x 9
#> week term estimate std.error statistic p.value adj.r.squared mod_p.value
#> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 mkt_~ 1.01e-5 1.34e-5 0.756 0.529 -0.167 0.529
#> 2 2 mkt_~ 9.26e-7 7.45e-7 1.24 0.245 0.0520 0.245
#> 3 3 mkt_~ 2.56e-7 2.97e-7 0.864 0.400 -0.0152 0.400
#> 4 4 mkt_~ 2.00e-8 1.42e-7 0.141 0.889 -0.0426 0.889
#> 5 5 mkt_~ -1.18e-7 1.61e-7 -0.736 0.467 -0.0150 0.467
#> 6 6 mkt_~ -3.23e-7 2.37e-7 -1.37 0.182 0.0271 0.182
#> # ... with 1 more variable: nobs <int>
Created on 2021-04-27 by the reprex package (v0.3.0)
Update
This approach can be easily expanded when working with more than one stock:
# lets append the same data and change stock to "B":
dat <- dat %>%
bind_rows({mutate(., stock = "B")})
# summarised data.frame by week and group with min and max date
dat2 <- dat %>%
group_by(stock, week) %>%
summarise(max_date = max(date),
min_date = max_date %m-% months(1))
#> `summarise()` has grouped output by 'stock'. You can override using the `.groups` argument.
# create the models, and this time also filer for .env$stock
dat3 <- dat2 %>%
rowwise() %>%
mutate(mod = list(lm(returns ~ mkt_cap,
data = filter(dat,
stock == .env$stock,
date <= .env$max_date,
date >= .env$min_date))))
# get the relevant informationen per week (this stays the same!)
dat3 %>%
mutate(res = list(broom::tidy(mod)),
broom::glance(mod)) %>%
select(week,
res,
adj.r.squared,
mod_p.value = p.value,
nobs) %>%
unnest(res) %>%
filter(term != "(Intercept)")
#> Adding missing grouping variables: `stock`
#> # A tibble: 12 x 10
#> # Groups: stock [2]
#> stock week term estimate std.error statistic p.value adj.r.squared
#> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 1 mkt_cap 0.0000101 0.0000134 0.756 0.529 -0.167
#> 2 A 2 mkt_cap 0.000000926 0.000000745 1.24 0.245 0.0520
#> 3 A 3 mkt_cap 0.000000256 0.000000297 0.864 0.400 -0.0152
#> 4 A 4 mkt_cap 0.0000000200 0.000000142 0.141 0.889 -0.0426
#> 5 A 5 mkt_cap -0.000000118 0.000000161 -0.736 0.467 -0.0150
#> 6 A 6 mkt_cap -0.000000323 0.000000237 -1.37 0.182 0.0271
#> 7 B 1 mkt_cap 0.0000101 0.0000134 0.756 0.529 -0.167
#> 8 B 2 mkt_cap 0.000000926 0.000000745 1.24 0.245 0.0520
#> 9 B 3 mkt_cap 0.000000256 0.000000297 0.864 0.400 -0.0152
#> 10 B 4 mkt_cap 0.0000000200 0.000000142 0.141 0.889 -0.0426
#> 11 B 5 mkt_cap -0.000000118 0.000000161 -0.736 0.467 -0.0150
#> 12 B 6 mkt_cap -0.000000323 0.000000237 -1.37 0.182 0.0271
#> # … with 2 more variables: mod_p.value <dbl>, nobs <int>
Created on 2021-04-27 by the reprex package (v0.3.0)
An ugly Base R solution (assuming you just want the predicted values returned):
# Allocate some memory such that each stock in data.frame
# can become an element in a list: df_list => empty list:
df_list <- vector("list", length(unique(df$stock)))
# Split the data.frame into the list: df_list => list of data.frames:
df_list <- with(df, split(df, stock))
# Number of weeks to consider in rolling regression in this case 4,
# approximating a month: n_weeks => integer scalar:
n_weeks <- 4
# For each stock in the list: nested lists => stdout(console)
lapply(df_list, function(x){
# Clean the week vector, filling NAs with values:
# week => integer vector
x$week <- with(x, rev(na.omit(rev(week))[cumsum(!is.na(rev(week)))]))
# Impute the first return value if it is missing:
x$returns[1] <- with(x,
ifelse(is.na(returns[1]), returns[which.min(!(is.na(returns)))],
returns[1]
)
)
# Interpolate the return using the previous value:
# returns => numeric vector
x$returns <- with(x, na.omit(returns)[cumsum(!is.na(returns))])
# For each week:
y <- lapply(unique(x$week), function(z){
# Calculate the range for the regression:
rng <- if(z - n_weeks <= 0){
seq_len(z)
}else{
seq(from = (z - n_weeks), to = z, by = 1)
}
# Subset the data: sbst => data.frame
sbst <- x[x$week %in% rng,]
# Calculate the regression:
predict(lm(returns ~ mkt_cap, data = sbst))
}
)
# Return the list of regressions:
y
}
)
Data:
df <- structure(list(stock = c("A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A"), date = structure(17959:17998, class = c("IDate",
"Date")), mkt_cap = c(10522834, 11659707, 11464531, 12217241,
11619351, 11578687, 11658368, 12722921, 15429934, 16801600, 17898334,
18492686, 20686683, 22299970, 22924182, 24174351, 24661467, 23351810,
27826601, 30670482, 32802772, 31778387, 33237006, 34971479, 36774005,
37594815, 38321816, 35167070, 35625396, 35764747, 28316242, 26124803,
30390295, 28256485, 29807837, 30970364, 30470093, 30860276, 27946472,
27662766), week = c(NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA,
2L, NA, NA, NA, NA, NA, NA, 3L, NA, NA, NA, NA, NA, NA, 4L, NA,
NA, NA, NA, NA, NA, 5L, NA, NA, NA, NA, NA, NA, 6L), returns = c(NA,
9.7, -2.25, 5.8, -5.57, -0.899, 0.141, 8.2, 18.8, 7.98, 5.79,
2.74, 10.7, 6.98, 2.24, 4.79, 1.48, -5.97, 17, 9.22, 6.21, -3.68,
3.99, 4.59, 4.53, 1.71, 1.42, -9.08, 0.808, -0.094, -23.8, -8.53,
14.6, -7.76, 4.87, 3.36, -2.1, 0.806, -10.4, -1.48)), class = "data.frame", row.names = c(NA,
-40L))
Does slide_index() from the slider package do what you want?
library(tidyverse)
library(slider)
library(broom)
set.seed(1001)
## more or less the slider help page for slide_index()
df <- data.frame(
y = rnorm(100),
x = rnorm(100),
i = as.Date("2019-08-15") + c(0, 2, 4, 6:102) # <- irregular
)
head(df)
#> y x i
#> 1 2.1886481 0.07862339 2019-08-15
#> 2 -0.1775473 -0.98708727 2019-08-17
#> 3 -0.1852753 -1.17523226 2019-08-19
#> 4 -2.5065362 1.68140888 2019-08-21
#> 5 -0.5573113 0.75623228 2019-08-22
#> 6 -0.1435595 0.30309733 2019-08-23
# 20 day rolling regression. Current day + 10 days back.
out <- df %>%
mutate(model = slide_index(df, i, ~ lm(y ~ x, df),
.before = 10, .complete = TRUE)) %>%
as_tibble()
out %>%
filter(!(map_lgl(model, ~ is_empty(.x)))) %>%
mutate(results = map(model, tidy)) %>%
unnest(cols = c(results))
#> # A tibble: 186 x 9
#> y x i model term estimate std.error statistic p.value
#> <dbl> <dbl> <date> <list> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.623 0.741 2019-08-25 <lm> (Intercept) -0.000347 0.115 -0.00302 0.998
#> 2 -0.623 0.741 2019-08-25 <lm> x -0.0825 0.144 -0.575 0.567
#> 3 -0.907 0.495 2019-08-26 <lm> (Intercept) -0.000347 0.115 -0.00302 0.998
#> 4 -0.907 0.495 2019-08-26 <lm> x -0.0825 0.144 -0.575 0.567
#> 5 -1.59 -1.13 2019-08-27 <lm> (Intercept) -0.000347 0.115 -0.00302 0.998
#> 6 -1.59 -1.13 2019-08-27 <lm> x -0.0825 0.144 -0.575 0.567
#> 7 0.303 -1.16 2019-08-28 <lm> (Intercept) -0.000347 0.115 -0.00302 0.998
#> 8 0.303 -1.16 2019-08-28 <lm> x -0.0825 0.144 -0.575 0.567
#> 9 1.63 -0.713 2019-08-29 <lm> (Intercept) -0.000347 0.115 -0.00302 0.998
#> 10 1.63 -0.713 2019-08-29 <lm> x -0.0825 0.144 -0.575 0.567
#> # … with 176 more rows
I've used group_by with the summarise command in dplyr to generate some summaries for my data. I would like to get the same summaries for the overall data set and combine it as one tibble.
Is there a straighforward way of doing this? My solution below feels like it has 4X the amount of code required to do this efficently!
Thanks in advance.
# reprex
library(tidyverse)
tidy_data <- tibble::tribble(
~drug, ~gender, ~condition, ~value,
"control", "f", "work", 0.06,
"treatment", "m", "work", 0.42,
"treatment", "f", "work", 0.22,
"control", "m", "work", 0.38,
"treatment", "m", "work", 0.57,
"treatment", "f", "work", 0.24,
"control", "f", "work", 0.61,
"control", "f", "play", 0.27,
"treatment", "m", "play", 0.3,
"treatment", "f", "play", 0.09,
"control", "m", "play", 0.84,
"control", "m", "play", 0.65,
"treatment", "m", "play", 0.98,
"treatment", "f", "play", 0.38
)
tidy_summaries <- tidy_data %>%
# Group by the required variables
group_by(drug, gender, condition) %>%
summarise(mean = mean(value),
median = median(value),
min = min(value),
max = max(value)) %>%
# Bind rows will bind this output to the following one
bind_rows(
# Now for the overall version
tidy_data %>%
# Generate the overall summary values
mutate(mean = mean(value),
median = median(value),
min = min(value),
max = max(value)) %>%
# We need to know what the structure of the 'grouped_by' tibble first
# as the overall output format needs to match that
select(drug, gender, condition, mean:max) %>% # Keep columns of interest
# The same information will be appended to all rows, so we just need to retain one
filter(row_number() == 1) %>%
# Change the values in drug, gender, condition to "overall"
mutate_at(vars(drug:condition),
list(~ifelse(is.character(.), "overall", .)))
)
This the output I want, but it wasn't as simple as I might have hoped.
tidy_summaries
#> # A tibble: 9 x 7
#> # Groups: drug, gender [5]
#> drug gender condition mean median min max
#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 control f play 0.27 0.27 0.27 0.27
#> 2 control f work 0.335 0.335 0.06 0.61
#> 3 control m play 0.745 0.745 0.65 0.84
#> 4 control m work 0.38 0.38 0.38 0.38
#> 5 treatment f play 0.235 0.235 0.09 0.38
#> 6 treatment f work 0.23 0.23 0.22 0.24
#> 7 treatment m play 0.64 0.64 0.3 0.98
#> 8 treatment m work 0.495 0.495 0.42 0.570
#> 9 overall overall overall 0.429 0.38 0.06 0.98
Try
tidy_data %>%
group_by(drug, gender, condition) %>%
summarise(mean = mean(value), median = median(value), min = min(value), max = max(value)) %>%
bind_rows(.,
tidy_data %>%
summarise(drug = "Overall", gender = "Overall", condition = "Overall", mean = mean(value), median = median(value), min = min(value), max = max(value))
)
This gives:
# A tibble: 9 x 7
# Groups: drug, gender [5]
drug gender condition mean median min max
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 control f play 0.27 0.27 0.27 0.27
2 control f work 0.335 0.335 0.06 0.61
3 control m play 0.745 0.745 0.65 0.84
4 control m work 0.38 0.38 0.38 0.38
5 treatment f play 0.235 0.235 0.09 0.38
6 treatment f work 0.23 0.23 0.22 0.24
7 treatment m play 0.64 0.64 0.3 0.98
8 treatment m work 0.495 0.495 0.42 0.570
9 Overall Overall Overall 0.429 0.38 0.06 0.98
The code summarizes it via groupings first, and then creates the final summary row from the original data and binds it at the very bottom.
Interesting question. My take is basically the same answer as #sumshyftw but uses mutate_if and summarise_at.
Code
library(hablar)
funs <- list(mean = ~mean(.),
median = ~median(.),
min = ~min(.),
max = ~max(.))
tidy_data %>%
group_by(drug, gender, condition) %>%
summarise_at(vars(value), funs) %>%
ungroup() %>%
bind_rows(., tidy_data %>% summarise_at(vars(value), funs)) %>%
mutate_if(is.character, ~if_na(., "Overall"))
Result
drug gender condition mean median min max
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 control f play 0.27 0.27 0.27 0.27
2 control f work 0.335 0.335 0.06 0.61
3 control m play 0.745 0.745 0.65 0.84
4 control m work 0.38 0.38 0.38 0.38
5 treatment f play 0.235 0.235 0.09 0.38
6 treatment f work 0.23 0.23 0.22 0.24
7 treatment m play 0.64 0.64 0.3 0.98
8 treatment m work 0.495 0.495 0.42 0.570
9 Overall Overall Overall 0.429 0.38 0.06 0.98