I am running an experiment where participants are randomly assigned to one of two conditions, and then I collect data on several variables. Here is an example of my code:
df <- data.frame(condition =c(1,1,1,1,1,-1,-1,-1,-1,-1),
var1 = c(6,6,4,7,5,6,6,6,4,7),
var2 = c(3,4,3,6,7,1,2,1,2,5),
var3 = c(2,2,6,6,7,1,7,7,3,1),
var4 = c(6,4,3,6,4,1,3,3,4,4))
df$condition = factor(df$condition, levels = c(-1,1),labels = c("Digital","Physical"))
For each variable (var1, var2, etc.) I would like a little table with the count, mean, and standard deviation. This code creates the kind of table that I want:
group_by(df, df$condition) %>%
summarise(
count = n(),
mean = mean(var1),
sd = sd(var1))
But because I have many variables, I would like to use some kind of loop (or "lapply"?) to create all these tables at once. It would also be great if each table could show the name of the variable. Thanks!
You can just use summarise on all the variables, i.e.
library(dplyr)
group_by(df, condition) %>%
summarise(across(everything(), ~ c(count = n(), mean = mean(.), sd = sd(.))))
`summarise()` has grouped output by 'condition'. You can override using the `.groups` argument.
# A tibble: 6 x 5
# Groups: condition [2]
condition var1 var2 var3 var4
<fct> <dbl> <dbl> <dbl> <dbl>
1 Digital 5 5 5 5
2 Digital 5.8 2.2 3.8 3
3 Digital 1.10 1.64 3.03 1.22
4 Physical 5 5 5 5
5 Physical 5.6 4.6 4.6 4.6
6 Physical 1.14 1.82 2.41 1.34
You can control the output structure by changing object in the formula, i.e.
group_by(df, condition) %>%
summarise(across(everything(), ~ data.frame(count = n(), mean = mean(.), sd = sd(.))))
# A tibble: 2 x 5
condition var1$count $mean $sd var2$count $mean $sd var3$count $mean $sd var4$count $mean $sd
<fct> <int> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl>
1 Digital 5 5.8 1.10 5 2.2 1.64 5 3.8 3.03 5 3 1.22
2 Physical 5 5.6 1.14 5 4.6 1.82 5 4.6 2.41 5 4.6 1.34
We could still do it my summarise using a list:
library(dplyr)
df %>%
group_by(condition) %>%
summarise(across(starts_with("var"), .f = list(n = ~n(),
mean = mean,
sd = sd), na.rm = TRUE))
condition var1_n var1_mean var1_sd var2_n var2_mean var2_sd var3_n var3_mean var3_sd var4_n var4_mean var4_sd
<dbl> <int> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl>
1 -1 5 5.8 1.10 5 2.2 1.64 5 3.8 3.03 5 3 1.22
2 1 5 5.6 1.14 5 4.6 1.82 5 4.6 2.41 5 4.6 1.34
df <- data.frame(condition =c(1,1,1,1,1,-1,-1,-1,-1,-1),
var1 = c(6,6,4,7,5,6,6,6,4,7),
var2 = c(3,4,3,6,7,1,2,1,2,5),
var3 = c(2,2,6,6,7,1,7,7,3,1),
var4 = c(6,4,3,6,4,1,3,3,4,4))
df$condition = factor(df$condition, levels = c(-1,1),labels = c("Digital","Physical"))
for (var in names(df)[2:length(names(df))]){
tab <- group_by(df, condition) %>%
select(c("condition", var)) %>%
dplyr::rename(v = var) %>%
summarise(
count = n(),
mean = mean(v),
sd = sd(v)
)
print(var)
print(tab)
}
gives
[1] "var1"
# A tibble: 2 × 4
condition count mean sd
<fct> <int> <dbl> <dbl>
1 Digital 5 5.8 1.10
2 Physical 5 5.6 1.14
[1] "var2"
# A tibble: 2 × 4
condition count mean sd
<fct> <int> <dbl> <dbl>
1 Digital 5 2.2 1.64
2 Physical 5 4.6 1.82
[1] "var3"
# A tibble: 2 × 4
condition count mean sd
<fct> <int> <dbl> <dbl>
1 Digital 5 3.8 3.03
2 Physical 5 4.6 2.41
[1] "var4"
# A tibble: 2 × 4
condition count mean sd
<fct> <int> <dbl> <dbl>
1 Digital 5 3 1.22
2 Physical 5 4.6 1.34
>
Rather than lapply, the function of choice is aggregate, a close relative to the *apply family at least. Put in a custom function f.
f <- \(x) c(n=length(x), mu=mean(x), sd=sd(x))
aggregate(. ~ condition, df, f)
# condition var1.n var1.mu var1.sd var2.n var2.mu var2.sd var3.n var3.mu var3.sd var4.n var4.mu var4.sd
# 1 Digital 5.000000 5.800000 1.095445 5.000000 2.200000 1.643168 5.000000 3.800000 3.033150 5.000000 3.000000 1.224745
# 2 Physical 5.000000 5.600000 1.140175 5.000000 4.600000 1.816590 5.000000 4.600000 2.408319 5.000000 4.600000 1.341641
If you want to aggregate on a specific set of variables (e.g. assembled with grep), use list notation instead.
aggregate(df[grep('^var', names(df))], df['condition'], f)
You can use gtsummary here if you need to present the results.
Example one below will make one table with all of your variables. Example two will split each variable into its own table (if you need them to be seperate)
library(gtsummary)
#example one:
tbl_summary(df, by = condition,
type = list(everything()~"continuous"),
statistic = list(all_continuous()~"{mean} ({sd}) "))
#example two:
tbl_summary(df, by = condition,
type = list(everything()~"continuous"),
statistic = list(all_continuous()~"{mean} ({sd}) ")) %>%
tbl_split(variables = c(var1, var2,var3,var4))
Related
I recently understood how to access a column names inside a user defined function: How to access a column name in a user defined function with dplyr?
However, now I also need to access the column names within the operations that are being carried out. For example I would like to do this:
samp_df <- tibble(var1 = c('a', 'b', 'c'),
var_in_df = c(3,7,9))
calculateSummaries <- function(df, variable){
df <- df %>%
mutate("mean_of_{{variable}}" := mean({{variable}}),
"sd_of_{{variable}}" := sd({{variable}}),
"sd_plus_mean_of_{{variable}}" := ("mean_of_{{variable}}" + "sd_of_{{variable}}")
)
}
df_result <- calculateSummaries(samp_df, var_in_df)
Of course I could do:
"sd_plus_mean_of_{{variable}}" := mean({{variable}}) + sd({{variable}})
But in practice, with the real data this won't be practical.
Does anyone know how to so this?
This case ineed a little bit tricky, I think we have to constuct the names first and then use !! sym() to evaluate the strings as objects.
library(dplyr)
samp_df <- tibble(var1 = c('a', 'b', 'c'),
var_in_df = c(3,7,9))
calculateSummaries <- function(df, variable){
var_nm <- deparse(substitute(variable))
mean_var_nm <- paste0("mean_of_", var_nm)
sd_var_nm <- paste0("sd_of_", var_nm)
df %>%
mutate("mean_of_{{variable}}" := mean({{variable}}),
"sd_of_{{variable}}" := sd({{variable}}),
"sd_plus_mean_of_{{variable}}" := !! sym(mean_var_nm) + !! sym(sd_var_nm)
)
}
calculateSummaries(samp_df, var_in_df)
#> # A tibble: 3 x 5
#> var1 var_in_df mean_of_var_in_df sd_of_var_in_df sd_plus_mean_of_var_in_df
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 a 3 6.33 3.06 9.39
#> 2 b 7 6.33 3.06 9.39
#> 3 c 9 6.33 3.06 9.39
An alternative way is using across(), but we still have to construct the variable names.
calculateSummaries <- function(df, variable){
df %>%
mutate("mean_of_{{variable}}" := mean({{variable}}),
"sd_of_{{variable}}" := sd({{variable}}),
across(c({{ variable }}),
list(sd_plus_mean_of = ~ get(paste0("mean_of_", cur_column())) + get(paste0("sd_of_", cur_column())))
)
)
}
calculateSummaries(samp_df, var_in_df)
#> # A tibble: 3 x 5
#> var1 var_in_df mean_of_var_in_df sd_of_var_in_df var_in_df_sd_plus_mean_of
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 a 3 6.33 3.06 9.39
#> 2 b 7 6.33 3.06 9.39
#> 3 c 9 6.33 3.06 9.39
Here is a final way inspired by Lionel Henry's answer to this question. We can use rlang::englue() to construct names and use those names with the .data[[...]] pronoun.
calculateSummaries <- function(df, variable){
mean_var_nm <- rlang::englue("mean_of_{{ variable }}")
sd_var_nm <- rlang::englue("sd_of_{{ variable }}")
df %>%
mutate("mean_of_{{ variable }}" := mean({{ variable }}),
"sd_of_{{ variable }}" := sd({{ variable }}),
"sd_plus_mean_of_{{ variable }}" := .data[[mean_var_nm]] + .data[[sd_var_nm]]
)
}
calculateSummaries(samp_df, var_in_df)
#> # A tibble: 3 x 5
#> var1 var_in_df mean_of_var_in_df sd_of_var_in_df sd_plus_mean_of_var_in_df
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 a 3 6.33 3.06 9.39
#> 2 b 7 6.33 3.06 9.39
#> 3 c 9 6.33 3.06 9.39
Created on 2022-10-13 by the reprex package (v2.0.1)
According to this tidyverse blog post glue strings are only supported as result names, which IMHO means only on the LHS.
Besides the options offered by #TimTeaFan another option would be to use across to compute all desired values and name the columns using the .names argument:
library(dplyr)
calculateSummaries1 <- function(df, variable) {
df <- df %>%
mutate(across({{ variable }},
.fns = list(
mean = mean,
sd = sd,
sd_plus_mean = ~ mean(.x) + sd(.x)
),
.names = "{.fn}_of_{.col}"
))
df
}
calculateSummaries1(samp_df, var_in_df)
#> # A tibble: 3 × 5
#> var1 var_in_df mean_of_var_in_df sd_of_var_in_df sd_plus_mean_of_var_in_df
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 a 3 6.33 3.06 9.39
#> 2 b 7 6.33 3.06 9.39
#> 3 c 9 6.33 3.06 9.39
And a second option would be to use some helper variable names for the mean and the sd which avoids to use glue syntax one the RHS but requires an additional rename step:
calculateSummaries2 <- function(df, variable) {
df <- df %>%
mutate(
mean = mean({{ variable }}),
sd = sd({{ variable }}),
"sd_plus_mean_of_{{variable}}" := mean + sd
) |>
rename("mean_of_{{variable}}" := mean, "sd_of_{{variable}}" := sd)
df
}
calculateSummaries2(samp_df, var_in_df)
#> # A tibble: 3 × 5
#> var1 var_in_df mean_of_var_in_df sd_of_var_in_df sd_plus_mean_of_var_in_df
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 a 3 6.33 3.06 9.39
#> 2 b 7 6.33 3.06 9.39
#> 3 c 9 6.33 3.06 9.39
I am using WRS2 to carry out robust pairwise comparisons. But one problem is that it removes the group level names from the output dataframes and saves it in a different object.
# setup
set.seed(123)
library(WRS2)
library(tidyverse)
# robust pairwise comparisons
x <- lincon(libido ~ dose, data = viagra, tr = 0.1)
# comparisons
x$comp
#> Group Group psihat ci.lower ci.upper p.value
#> [1,] 1 2 -1.0 -3.440879 1.44087853 0.25984505
#> [2,] 1 3 -2.8 -5.536161 -0.06383861 0.04914871
#> [3,] 2 3 -1.8 -4.536161 0.93616139 0.17288911
# vector with group level names
x$fnames
#> [1] "placebo" "low" "high"
I can convert it to a tibble:
# converting to tibble
suppressMessages(as_tibble(x$comp, .name_repair = "unique")) %>%
dplyr::rename(group1 = Group...1, group2 = Group...2)
#> # A tibble: 3 x 6
#> group1 group2 psihat ci.lower ci.upper p.value
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 -1 -3.44 1.44 0.260
#> 2 1 3 -2.8 -5.54 -0.0638 0.0491
#> 3 2 3 -1.8 -4.54 0.936 0.173
I would then like to replace the group column numeric values with actual names included in fnames (so map fnames[1] -> 1, fnames[2] -> 2, and so on).
So the final dataframe should look something like the following-
#> # A tibble: 3 x 6
#> group1 group2 psihat ci.lower ci.upper p.value
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 placebo low -1 -3.44 1.44 0.260
#> 2 placebo high -2.8 -5.54 -0.0638 0.0491
#> 3 low high -1.8 -4.54 0.936 0.173
In this case, it was easy to just copy-paste the three values, but I want to have a generalizable approach where no matter the number of levels, it works. How can I do this using dplyr?
Using a named vector to match with tidyverse. This matches by value and not by the sequence of index i.e. if the value in 'Group' columns are not in a sequence or character, this would still work
library(dplyr)
as_tibble(x$comp, .name_repair = 'unique') %>%
mutate(across(starts_with("Group"),
~ setNames(x$fnames, seq_along(x$fnames))[as.character(.)]))
Does this fullfil your needs :
names <- c("A","B","C")
df = data.frame(group=c(1,2,3))
library(dplyr)
df %>% mutate(group = names[group])
group
1 A
2 B
3 C
Here's an approach using the recode function, with the recoding vector built programmatically from the data:
# Setup
set.seed(123)
library(WRS2)
library(tidyverse)
x <- lincon(libido ~ dose, data = viagra, tr = 0.1)
# Create recoding vector
recode.vec = x$fnames %>% set_names(1:length(x$fnames))
# Recode columns
x.comp = x$comp %>%
as_tibble(.name_repair=make.unique) %>%
mutate(across(starts_with("Group"), ~recode(., !!!recode.vec)))
Output:
x.comp
#> # A tibble: 3 x 6
#> Group Group.1 psihat ci.lower ci.upper p.value
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 placebo low -1 -3.44 1.44 0.260
#> 2 placebo high -2.8 -5.54 -0.0638 0.0491
#> 3 low high -1.8 -4.54 0.936 0.173
Try this tidyverse approach formating data to long after extracting the objects as tibbles. You can use left_join() to get your groups as you want. Here the code to get something close to what you want:
# setup
set.seed(123)
library(WRS2)
library(tidyverse)
# robust pairwise comparisons
x <- lincon(libido ~ dose, data = viagra, tr = 0.1)
#Transform to tibble
df1 <- suppressMessages(as_tibble(x$comp, .name_repair = "unique")) %>%
dplyr::rename(group1 = Group...1, group2 = Group...2)
#Extract labels
df2 <- tibble(treat=x$fnames) %>% mutate(value=1:n())
#Format to long df1
df1 <- df1 %>%
mutate(id=1:n()) %>%
pivot_longer(cols = c(group1,group2)) %>%
rename(group=name) %>% left_join(df2) %>% select(-value) %>%
pivot_wider(names_from = group,values_from=treat) %>% select(-id)
Output:
# A tibble: 3 x 6
psihat ci.lower ci.upper p.value group1 group2
<dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 -1 -3.44 1.44 0.260 placebo low
2 -2.8 -5.54 -0.0638 0.0491 placebo high
3 -1.8 -4.54 0.936 0.173 low high
Let's say I have a df with groups and a group-level variable like a mean. How do I produce a variable which is the group-level mean of the lagged group, where the only rows with NA for this variable are those in the first group?
e.g:
df <- data_frame(group = c(1,1,2,2),
grouped.mean = c(2.5,2.5,3.5,3.5))
# my attempt
df %<>%
group_by(group) %>%
mutate(lag.group.mean = lag(grouped.mean))
# A tibble: 4 x 3
# Groups: group [2]
group grouped.mean lag.group.mean
<dbl> <dbl> <dbl>
1 1. 2.50 NA
2 1. 2.50 2.50
3 2. 3.50 NA
4 2. 3.50 3.50
Desired output:
group grouped.mean lag.group.mean
<dbl> <dbl> <dbl>
1 1. 2.50 NA
2 1. 2.50 NA
3 2. 3.50 2.50
4 2. 3.50 2.50
Thanks!
EDIT: more challenging example:
df <- data_frame(group = c(1,1,2,3,3,3),
grouped.mean = c(2.5,2.5,3.5,4.5,4.5,4.5))
expected output:
group grouped.mean lag.grouped.mean
<dbl> <dbl> <dbl>
1 1. 2.50 NA
2 1. 2.50 NA
3 2. 3.50 2.50
4 3. 4.50 3.50
5 3. 4.50 3.50
6 3. 4.50 3.50
Here is an option. The key is to use distinct to remove duplicated rows, create the lag.group.mean column, and then left_join to the original data frame.
library(dplyr)
df <- data_frame(group = c(1,1,2,2),
grouped.mean = c(2.5,2.5,3.5,3.5))
df2 <- df %>%
distinct() %>%
mutate(lag.group.mean = lag(grouped.mean)) %>%
left_join(df, ., by = c("group", "grouped.mean"))
df2
# # A tibble: 4 x 3
# group grouped.mean lag.group.mean
# <dbl> <dbl> <dbl>
# 1 1 2.5 NA
# 2 1 2.5 NA
# 3 2 3.5 2.5
# 4 2 3.5 2.5
The lagged group value is the first globally lagged value within each group:
library(tidyverse)
df <- data_frame(group = c(1, 1, 2, 3, 3, 3),
grouped.mean = c(2.5, 2.5, 3.5, 4.5, 4.5, 4.5))
df %>%
mutate(lag.grouped.mean = lag(grouped.mean)) %>%
group_by(group) %>%
mutate(lag.grouped.mean = first(lag.grouped.mean))
#> # A tibble: 6 x 3
#> # Groups: group [3]
#> group grouped.mean lag.grouped.mean
#> <dbl> <dbl> <dbl>
#> 1 1 2.5 NA
#> 2 1 2.5 NA
#> 3 2 3.5 2.5
#> 4 3 4.5 3.5
#> 5 3 4.5 3.5
#> 6 3 4.5 3.5
But it's probably easier to see what's happening if you use a join like in
#www's answer.
Created on 2018-08-06 by the reprex package (v0.2.0.9000).
I am interested in the total mean, and the mean within different conditions, of some measurements preferably using dplyr's summarise function.
I'll illustrate my question in the following. Say I have some data, borrowed form this this,
dta <- read.table(header=TRUE, text='
subject sex condition measurement
1 M control 7.9
1 M cond1 12.3
1 M cond2 10.7
2 F control 6.3
2 F cond1 10.6
2 F cond2 11.1
3 F control 9.5
3 F cond1 13.1
3 F cond2 13.8
4 M control 11.5
4 M cond1 13.4
4 M cond2 12.9
') # ; dta
I now want the mean for each sex and the mean by sex for each condition. I know how to get it for each condition, like this.
# install.packages(c("dplyr"), dependencies = TRUE)
library(dplyr)
dta %>%
group_by(sex, condition) %>%
summarise(
mean = mean(measurement)
)
#> # A tibble: 6 x 3
#> # Groups: sex [?]
#> sex condition mean
#> <fctr> <fctr> <dbl>
#> 1 F cond1 11.85
#> 2 F cond2 12.45
#> 3 F control 7.90
#> 4 M cond1 12.85
#> 5 M cond2 11.80
#> 6 M control 9.70
But, this does not give me the aggregate mean for both sexes. To get this I either have to run a separate call, i.e.
dta %>%
group_by(sex) %>%
summarise(
mean = mean(measurement)
)
#> # A tibble: 2 x 2
#> sex mean
#> <fctr> <dbl>
#> 1 F 10.73333
#> 2 M 11.45000
or deconstruct data structure. Like this,
# install.packages(c("tidyr"), dependencies = TRUE)
library(tidyr)
dta_wide <- spread(dta, condition, measurement)
dta_wide %>%
group_by(sex) %>%
summarise(
mean_tot = mean(cond1 + cond2 + control)/3,
mean_cond1 = mean(cond1),
mean_cond2 = mean(cond2),
mean_control = mean(control)
)
#> # A tibble: 2 x 5
#> sex mean_tot mean_cond1 mean_cond2 mean_control
#> <fctr> <dbl> <dbl> <dbl> <dbl>
#> 1 F 10.73333 11.85 12.45 7.9
#> 2 M 11.45000 12.85 11.80 9.7
This gives me an output with both the over all mean by sex and the individual mean by condition.
However, both running two separate calls and deconstructing data seems unnecessarily cumbersome. Isn't there a simply way to add a categorical variable, here condition, as the by variable and at the same time keep the aggregate information, here mean by sex? Maybe I am overlooking something logical and shouldn't be messing with data like this?
One option is to calculate the two summaries separately, then join back:
dta %>%
group_by(sex, condition) %>%
summarise(mean = mean(measurement)) %>%
inner_join(
group_by(dta, sex) %>%
summarise(mean_tot = mean(measurement))
)
# Joining, by = "sex"
# A tibble: 6 x 4
# Groups: sex [?]
# sex condition mean mean_tot
# <fctr> <fctr> <dbl> <dbl>
#1 F cond1 11.85 10.73333
#2 F cond2 12.45 10.73333
#3 F control 7.90 10.73333
#4 M cond1 12.85 11.45000
#5 M cond2 11.80 11.45000
#6 M control 9.70 11.45000
Or use group_by twice:
dta %>%
group_by(sex, condition) %>%
summarise(s = sum(measurement), n = n()) %>%
group_by(sex) %>%
transmute(condition, mean_tot = sum(s) / sum(n), mean = s / n)
# Adding missing grouping variables: `sex`
# A tibble: 6 x 4
# Groups: sex [2]
# sex condition mean_tot mean
# <fctr> <fctr> <dbl> <dbl>
#1 F cond1 10.73333 11.85
#2 F cond2 10.73333 12.45
#3 F control 10.73333 7.90
#4 M cond1 11.45000 12.85
#5 M cond2 11.45000 11.80
#6 M control 11.45000 9.70
I have data frame with multiple variable , some variables those contains only 0's and 1's and other columns contains all the possible values.
How to summarize df columns contains only 0's & 1's with "sts_1=sum(sts_1*0.25,na.rm=T)" and other columns with "non_sts_3=mean(non_sts_3,na.rm = T)," with out specifying column name.
df <- data.frame(year=c("2014","2014","2015","2015","2015"),
month_=c("Jan","Jan","Jan","Jan","Feb"),
sts_1=c(0,1,1,1,0),
sts_2=c(1,0,0,1,NA),
non_sts_1=c(0,3,7,31,10),
non_sts_2=c(1,4,NA,12,6),
non_sts_3 = c(12,14,18,1,9))
We can do by dplyr by entering column names manually with below code
df<-group_by(df,year, month_)
df_aggregation<-summarise(df,
non_sts_1=mean(non_sts_1,na.rm = T),
non_sts_2=mean(non_sts_2,na.rm = T),
non_sts_3=mean(non_sts_3,na.rm = T),
sts_1=sum(sts_1*0.25,na.rm=T),
sts_2=sum(sts_2*0.25,na.rm=T))
Thanks in advance...
#akrun's answer is straight-forward. If you prefer to not calculate unnecessarily, however, you can define a function that discriminates directly:
library(dplyr)
mysumm <- function(x, na.rm = FALSE) {
if (all(x %in% 0:1)) {
sum(x * 0.25, na.rm = na.rm)
} else {
mean(x, na.rm = na.rm)
}
}
df %>%
group_by(year, month_) %>%
summarise_if(is.numeric, mysumm, na.rm = TRUE)
# # A tibble: 3 x 7
# # Groups: year [?]
# year month_ sts_1 sts_2 non_sts_1 non_sts_2 non_sts_3
# <fctr> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2014 Jan 0.25 0.25 1.5 2.5 13.0
# 2 2015 Feb 0.00 NaN 10.0 6.0 9.0
# 3 2015 Jan 0.50 0.25 19.0 12.0 9.5
We can use summarise_all and then remove the extra columns
df %>%
group_by(year, month_) %>%
summarise_all(funs(mean(., na.rm = TRUE), sum(.*0.25, na.rm = TRUE))) %>%
select(matches("month_|non_sts.*mean|\\bsts.*sum"))
# A tibble: 3 x 7
# Groups: year [2]
# year month_ non_sts_1_mean non_sts_2_mean non_sts_3_mean sts_1_sum sts_2_sum
# <fctr> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 2014 Jan 1.5 2.5 13.0 0.25 0.25
#2 2015 Feb 10.0 6.0 9.0 0.00 0.00
#3 2015 Jan 19.0 12.0 9.5 0.50 0.25
Another approach if we have multiple sets of functions to be applied on different set of columns, will be to approach by applying the functions on different blocks of columns separately and then join
library(tidyverse)
flist <- list(function(x) mean(x, na.rm = TRUE), function(x) sum(x*0.25, na.rm = TRUE))
nm1 <- c("^non_sts", "^sts")
map2(nm1, flist, ~df %>%
group_by(year, month_) %>%
summarise_at(vars(matches(.x)), funs(.y))) %>%
reduce(inner_join, by = c('year', 'month_'))
# A tibble: 3 x 7
# Groups: year [?]
# year month_ non_sts_1 non_sts_2 non_sts_3 sts_1 sts_2
# <fctr> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 2014 Jan 1.5 2.5 13.0 0.25 0.25
#2 2015 Feb 10.0 6.0 9.0 0.00 0.00
#3 2015 Jan 19.0 12.0 9.5 0.50 0.25
NOTE: This approach is flexible to use for any set of columns
If we were to modify the approach for 0:1 case
l1 <- df %>%
summarise_at(3:7, funs(all(. %in% c(0, 1, NA)))) %>%
unlist
nm1 <- split(names(df)[-(1:2)], l1)
and then apply as above by removing the matches