I am trying to find a way to get summary stats such as means by group and overall in one step using dplyr
#Data set-up
# Simulate 100 people: sex is drawn at random, and age is centred on 24 for
# "F" and on 20 for "M" with a small spread.
sex <- sample(c("M", "F"), size = 100, replace = TRUE)
age <- rnorm(n = 100, mean = ifelse(sex == "F", 24, 20), sd = 0.1)
dsn <- data.frame(sex = sex, age = age)
library("tidyverse")
#Using dplyr to get means by group and overall
# Mean age within each sex.
mean_by_sex <- summarise(group_by(dsn, sex), mean_age = mean(age))
# Mean age over the whole data set, labelled "All" so it can be stacked under
# the per-sex rows.
mean_all <- add_column(summarise(dsn, mean_age = mean(age)), sex = "All")
# Stack the per-group rows and the overall row into a single table.
final_result <- rbind(mean_by_sex, mean_all)
final_result
#> # A tibble: 3 x 2
#> sex mean_age
#> <fct> <dbl>
#> 1 F 24.0
#> 2 M 20.0
#> 3 All 21.9
#This is the table I want, but I wonder if this is the only way to do it
Is there a way to do this in fewer steps using group_by_at, group_by_all, or a similar function from the tidyverse/dplyr?
Any help would be greatly appreciated
One option could perhaps be:
# Per-sex means, with one extra row appended that holds the overall mean.
add_row(
  summarise(group_by(dsn, sex), mean_age = mean(age)),
  sex = "ALL",
  mean_age = mean(dsn$age)
)
sex mean_age
<fct> <dbl>
1 F 24.0
2 M 20.0
3 ALL 21.9
A little switching around can do it, too.
# Append a pseudo-observation labelled "All" whose age is the overall mean,
# then summarise by sex: the "All" group has a single row, so its mean is the
# overall mean.
# add_row() does not evaluate its arguments inside the data frame, so the
# overall mean is taken explicitly from dsn$age instead of relying on a
# free-standing `age` vector that happens to exist in the workspace.
final_result <- dsn %>%
  add_row(sex = "All", age = mean(dsn$age)) %>%
  group_by(sex) %>%
  summarise(mean_age = mean(age))
These answers are great if you have one variable to summarize by. What about two? I want to summarize across one but leave the other as is. The above solutions do not work in this case because the data frame still needs to be grouped.
#Data set up
# Attach the tidyverse first: tibble() below comes from it, so loading it
# after the call fails in a fresh session.
library("tidyverse")
set.seed(3243242)
# 100 simulated people with two grouping variables (obese, sex); age is
# centred on 24 for "F" and 20 for "M".
dsn <- tibble(
  obese = sample(c(TRUE, FALSE), size = 100, replace = TRUE),
  sex = sample(c("M", "F"), size = 100, replace = TRUE),
  age = rnorm(n = 100, mean = 20 + 4 * (sex == "F"), sd = 0.1)
)
I restated the original problem using 2 group_by variables.
#Extend to 2 group_by variables?
# Mean age for every sex x obese combination.
df1 <- ungroup(summarise(group_by(dsn, sex, obese), mean_age = mean(age)))
#Also across sex
# Mean age per obese level, pooling both sexes.
df2 <- ungroup(summarise(group_by(dsn, obese), mean_age = mean(age)))
#Final_result:
# Rows from df2 have no sex column, so bind_rows() fills it with NA.
bind_rows(df1, df2)
Way to do this in one step? You can add mean with add_row() but not with a grouped df. Another option is to create a function that does all the things on the group dataset. If there are other things you want to do, say sort or create new variables, you can do it in the function. Then, you can apply the function to each grouped dataset. After combining via dplyr::bind_rows(), you can change the missing group variable to all via tidyr::replace_na().
#' Mean age per group
#'
#' @param df_group A tibble (optionally grouped) that has an `age` column.
#' @return The output of `summarize()`: one `mean_age` value per group of
#'   `df_group`.
find_summary <- function(df_group) {
  # add other dplyr verbs here as needed like arrange or mutate
  summarize(df_group, mean_age = mean(age))
}
# Stack the sex x obese summary on top of the obese-only summary; the
# obese-only rows have no sex value, so label them "all".
replace_na(
  bind_rows(
    find_summary(group_by(dsn, sex, obese)),
    find_summary(group_by(dsn, obese))
  ),
  list(sex = "all")
)
sex obese mean_age
<chr> <lgl> <dbl>
1 F FALSE 24.0
2 F TRUE 24.0
3 M FALSE 20.0
4 M TRUE 20.0
5 all FALSE 21.7
6 all TRUE 22.3
You can extend the idea if you want a summary of all variables, by one variable, and by two variables.
# Stack three summaries: by sex x obese, by obese only, and overall.
# Grouping columns that a summary did not use come back as NA after
# bind_rows() and are relabelled "all".
bind_rows(
  find_summary(group_by(dsn, sex, obese)),
  find_summary(group_by(dsn, obese)),
  find_summary(dsn)
) %>%
  # obese is logical, and replace_na() requires the replacement to be
  # compatible with the column type, so convert it to character before
  # filling in the "all" label.
  mutate(obese = as.character(obese)) %>%
  replace_na(list(sex = "all", obese = "all"))
sex obese mean_age
<chr> <chr> <dbl>
1 F FALSE 24.0
2 F TRUE 24.0
3 M FALSE 20.0
4 M TRUE 20.0
5 all FALSE 21.7
6 all TRUE 22.3
7 all all 22.0
Related
I would like to use the group_by() function inside my customised function, but the column names that go within group_by would be defined by my function's arguments.
See a hypothetical example of what my data would look like:
# Hypothetical data: three individuals (A, B, C) repeated four times, the
# first six rows "F" and the last six "M", with 12 distinct random values
# drawn from 1..100.
data <- data.frame(
  ind = rep(c("A", "B", "C"), times = 4),
  gender = rep(c("F", "M"), each = 6),
  value = sample.int(100, size = 12)
)
And this is the result I would like to have:
# Replace value by its mean within each ind x gender group, then keep one row
# per distinct combination.
result <- distinct(
  mutate(group_by(data, ind, gender), value = mean(value))
)
This is how I was trying to make my function to work:
# Asker's attempt (reported as not working): group `data` by the columns named
# in `set_group`, replace each column named in `variable` by its group mean,
# and keep distinct rows.
#   data      - data frame to summarise
#   set_group - character vector of grouping column names
#   variable  - character vector of columns to average
# The last expression is an assignment, so the value is returned invisibly.
myFunction <- function(data, set_group, variable){
result <- data %>%
# NOTE(review): get() receives the whole character vector here, not one
# column per name; presumably this is why the grouping fails - confirm.
group_by(get(set_group)) %>%
mutate(across(all_of(variable), ~ mean(.x, na.rm = TRUE))) %>%
distinct()
}
result3 <- myFunction(data, set_group = c("ind", "gender"), variable = c("value"))
result3
I want to allow the user to define as many set_group columns and as many variable columns as needed. I tried using the get(), all_of(), and mget() functions within group_by, but none of them worked.
Does anyone know how can I code it?
Thank you!
We could use across within group_by
# Group `data` by the columns named in `set_group`, overwrite every column
# named in `variable` with its within-group mean (NAs ignored), then return
# the distinct rows of the ungrouped result.
#   data      - data frame to summarise
#   set_group - character vector of grouping column names
#   variable  - character vector of column names to average
myFunction <- function(data, set_group, variable) {
  grouped <- group_by(data, across(all_of(set_group)))
  averaged <- mutate(
    grouped,
    across(all_of(variable), ~ mean(.x, na.rm = TRUE))
  )
  distinct(ungroup(averaged))
}
-testing
> myFunction(data, set_group = c("ind", "gender"), variable = c("value"))
# A tibble: 6 × 3
ind gender value
<chr> <chr> <dbl>
1 A F 43.5
2 B F 87.5
3 C F 67.5
4 A M 13
5 B M 43.5
6 C M 37.5
Another option is to convert to symbols and evaluate (!!!)
# Same contract as the across() version, but the grouping column names are
# turned into symbols and spliced into group_by() with !!!.
#   data      - data frame to summarise
#   set_group - character vector of grouping column names
#   variable  - character vector of column names to average
myFunction <- function(data, set_group, variable) {
  group_syms <- rlang::syms(set_group)
  data %>%
    group_by(!!!group_syms) %>%
    mutate(across(all_of(variable), ~ mean(.x, na.rm = TRUE))) %>%
    ungroup() %>%
    distinct()
}
-testing
> myFunction(data, set_group = c("ind", "gender"), variable = c("value"))
# A tibble: 6 × 3
ind gender value
<chr> <chr> <dbl>
1 A F 43.5
2 B F 87.5
3 C F 67.5
4 A M 13
5 B M 43.5
6 C M 37.5
NOTE: get is used when there is a single object, for multiple objects mget can be used. But, it is better to use tidyverse functions
I want to get sums of a variable depending on the other variable's na or non-na values in R. A working example code is below:
library(dplyr)
# Example data: A, B and C contain missing values; REF holds the values that
# are summed.
df <- data.frame(A = c(1, 2, 3, NA, 4),
                 B = c(NA, 2, 3, NA, 5),
                 C = c(3, 4, NA, NA, NA),
                 REF = c(10, 20, 30, 40, 50))
# Every column except REF is summarised.
value_cols <- setdiff(names(df), "REF")
# Copy of df in which each value column is replaced by its is.na() mask;
# REF is carried over unchanged.
df.na <- df
df.na[value_cols] <- lapply(df[value_cols], is.na)
# One column per value column; row "NON-NA" is the sum of REF where the value
# is present, row "NA" the sum of REF where it is missing.
sums <- matrix(0, nrow = 2, ncol = length(value_cols),
               dimnames = list(c("NON-NA", "NA"), value_cols))
for (col in value_cols) {
  is_missing <- df.na[[col]]
  # Fill each cell explicitly: a column with no NA (or only NA) would
  # otherwise produce a single group whose sum is recycled into both rows.
  sums["NON-NA", col] <- sum(df$REF[!is_missing])
  sums["NA", col] <- sum(df$REF[is_missing])
}
> sums
A B C
NON-NA 110 100 30
NA 40 50 120
For example, since the 4th term in the A column is NA, the corresponding values in the sums object are 40 (the NA row) and 10+20+30+50 = 150-40 = 110 (the NON-NA row).
My question is how do I get this output without a for loop?
Here is a solution using the pivot_ functions from tidyr. The approach pivots to a longer form so that you can group by column name and whether the column value is NA.
library(dplyr)
library(tidyr)
# Go long (one row per original cell), flag missing cells, sum REF for each
# column-name x missingness pair, then spread the column names back out.
df %>%
  pivot_longer(
    cols = c("A", "B", "C"),
    names_to = "name",
    values_to = "value"
  ) %>%
  mutate(isna = is.na(value)) %>%
  group_by(name, isna) %>%
  summarize(value = sum(REF)) %>%
  pivot_wider(names_from = name, values_from = value)
isna A B C
<lgl> <dbl> <dbl> <dbl>
1 FALSE 110 100 30
2 TRUE 40 50 120
df <- data.frame(A = c(1, 2, 3, NA, 4),
                 B = c(NA, 2, 3, NA, 5),
                 C = c(3, 4, NA, NA, NA),
                 REF = c(10, 20, 30, 40, 50))
library(tidyverse)
# For each of the first three columns (.x = column values, .y = column name):
# group df by whether that column is NA and sum REF into a column named after
# it. The per-column tables are then joined on the shared grp column.
imap(.x = df[1:3],
     .f = ~ df %>%
       group_by(grp = is.na(.x)) %>%
       # TRUE rather than T: T is an ordinary variable that can be reassigned.
       summarise(!!.y := sum(REF, na.rm = TRUE))) %>%
  reduce(left_join)
#> Joining, by = "grp"
#> Joining, by = "grp"
#> # A tibble: 2 x 4
#> grp A B C
#> <lgl> <dbl> <dbl> <dbl>
#> 1 FALSE 110 100 30
#> 2 TRUE 40 50 120
Created on 2022-01-26 by the reprex package (v2.0.1)
I used group_map for the first time and think I do it correctly. This is my code:
library(REAT)
# Ten values split into three groups (sizes 3, 3 and 4).
df <- data.frame(
  value = c(1, 1, 1, 1, 0.5, 0.1, 0, 0, 0, 1),
  group = rep(c(1, 2, 3), times = c(3, 3, 4))
)
# Apply gini() to the values of each group; group_map() returns a list with
# one element per group.
haves <- group_map(group_by(df, group), ~gini(.x$value, coefnorm = TRUE))
The thing is that haves is a list rather than a data frame. What would I have to do to obtain this df
# Desired shape: one row per group with its Gini coefficient.
wants <- data.frame(
  group = c(1, 2, 3),
  gini = c(0, 0.5625, 1)
)
group gini
1 0.0000
2 0.5625
3 1.0000
Thanks!
You can use dplyr::summarize:
# One Gini coefficient per group, returned as a tibble.
summarize(
  group_by(df, group),
  gini = gini(value, coefnorm = TRUE)
)
#> # A tibble: 3 x 2
#> group gini
#> <dbl> <dbl>
#> 1 1 0
#> 2 2 0.562
#> 3 3 1
According to the documentation, group_map always produces a list. group_modify is an alternative that produces a tibble if the function does, but gini just outputs a vector. So, you could do something like this...
# group_modify() needs a data frame back from the function, so wrap the
# per-group Gini value in a one-column tibble.
group_modify(
  group_by(df, group),
  ~tibble(gini = gini(.x$value, coefnorm = TRUE))
)
# A tibble: 3 x 2
# Groups: group [3]
group gini
<dbl> <dbl>
1 1 0
2 2 0.562
3 3 1
Using data.table
library(data.table)
# Convert df to a data.table and compute one Gini coefficient per group.
setDT(df)[, .(gini = gini(value, coefnorm = TRUE)), by = group]
For grouped datasets, we can use the .data pronoun in case we don't want to refer to columns by bare, unquoted names
library(dplyr)
# Same per-group summary, referring to the column through the .data pronoun.
summarize(
  group_by(df, group),
  gini = gini(.data$value, coefnorm = TRUE)
)
I often have to transform long tidy data frames into wide format. To do so I use the following standard procedure:
# Example data frame
df <- data.frame(
  "ID" = rep(1:5, each = 4),
  "score" = runif(20, 0, 100),
  "location" = rep(c("a", "b", "c", "d"), 5)
)
# Transform into wide format
df_wide <- df %>%
  # group by everything other than the value column.
  group_by_at(vars(-score)) %>%
  # build group index
  mutate(row_id = 1:n()) %>%
  ungroup() %>%
  # spread
  spread(key = location, value = score) %>%
  dplyr::select(-row_id)
Instead of typing this little script over and over again, I wanted to define a function to do it automatically. I found many useful posts on how to include column names as function inputs, but somehow it doesn't work or I get error messages. What am I doing wrong?
Below are a few of my attempts (none of them work), following the suggestions from these posts:
# Asker's first attempt (reported as not working): spread `dat` to wide format
# with column names given as strings and converted with sym().
#   dat        - long data frame
#   key_name   - name of the column whose values become new column names
#   value_name - name of the column holding the values
# Note that `dat` is never referenced in the body: the pipeline starts
# directly at group_by_at(), so no data frame is passed in.
wide_fun <- function(dat, key_name, value_name) {
group_by_at(vars(- !! sym(value_name))) %>% # group by everything other than the value column.
mutate(row_id=1:n()) %>% ungroup() %>% # build group index
spread(key=!! sym(key_name), value=!! sym(value_name)) %>% # spread
dplyr::select(-row_id)
}
# Asker's second attempt (reported as not working): same idea, but the column
# arguments are captured with enquo() and unquoted with !!.
#   dat        - long data frame
#   key_name   - column whose values become new column names
#   value_name - column holding the values
# As in the first attempt, `dat` is never referenced: the pipeline starts at
# group_by_at() with no data frame.
wide_fun2 <- function(dat, key_name, value_name) {
key_col <- enquo(key_name)
value_col <- enquo(value_name)
group_by_at(vars(- !!value_col)) %>% # group by everything other than the value column.
mutate(row_id=1:n()) %>% ungroup() %>% # build group index
spread(key= !!key_col, value= !!value_col) %>% # spread
dplyr::select(-row_id)
}
# Asker's third attempt (reported as not working): the arguments are passed
# straight through to vars() and spread() without any unquoting.
#   dat        - long data frame
#   key_name   - column whose values become new column names
#   value_name - column holding the values
# As in the other attempts, `dat` is never referenced: the pipeline starts at
# group_by_at() with no data frame.
wide_fun3 <- function(dat, key_name, value_name) {
group_by_at(vars(- value_name)) %>% # group by everything other than the value column.
mutate(row_id=1:n()) %>% ungroup() %>% # build group index
spread(key=key_name, value=value_name) %>% # spread
dplyr::select(-row_id)
}
# Called with the column names wrapped in quote().
wide_fun3(df, quote(location), quote(score))
Thanks for your help!
I've slightly updated your code to dplyr 1.0.0 and tidyr. Then you can make use of the new dplyr programming feature {{}} to specify variables that are arguments of a function.
# Example data frame
df <- data.frame("ID" = rep(1:5, each = 4), "score" = runif(20, 0, 100), "location" = rep(c("a", "b", "c", "d"), 5))
library(dplyr)
#' Spread a long data frame to wide format
#'
#' Rows that share the same values in all non-value columns are numbered
#' within their group so that duplicates end up on separate rows instead of
#' colliding in `pivot_wider()`.
#'
#' @param .data A long data frame.
#' @param key_name Unquoted column whose values become the new column names.
#' @param value_name Unquoted column that holds the values to spread.
#' @return The widened data, with the helper index column removed.
wide_fun <- function(.data, key_name, value_name) {
  .data %>%
    # group by everything other than the value column.
    group_by(across(-{{ value_name }})) %>%
    # build group index; row_number() instead of 1:n(), which yields c(1, 0)
    # when n() is 0.
    mutate(row_id = row_number()) %>%
    ungroup() %>%
    # spread
    tidyr::pivot_wider(
      names_from = {{ key_name }},
      values_from = {{ value_name }}
    ) %>%
    select(-row_id)
}
wide_fun(df, location, score)
#> # A tibble: 5 x 5
#> ID a b c d
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 90.8 38.9 28.7 39.0
#> 2 2 94.5 24.9 84.6 54.6
#> 3 3 61.1 97.2 12.2 57.7
#> 4 4 52.7 85.6 41.4 100.
#> 5 5 17.8 86.1 92.3 33.7
Created on 2020-09-11 by the reprex package (v0.3.0)
Edit
This function should also work with older versions of dplyr:
library(dplyr)
#' Spread a long data frame to wide format (ensym() variant)
#'
#' Same contract as the `{{ }}` version, but the column arguments are captured
#' as symbols with `ensym()` and unquoted with `!!`.
#'
#' @param .data A long data frame.
#' @param key_name Column whose values become the new column names.
#' @param value_name Column that holds the values to spread.
#' @return The widened data, with the helper index column removed.
wide_fun_2 <- function(.data, key_name, value_name) {
  .data %>%
    # group by everything other than the value column.
    group_by_at(vars(-!!ensym(value_name))) %>%
    # build group index; row_number() instead of 1:n(), which yields c(1, 0)
    # when n() is 0.
    mutate(row_id = row_number()) %>%
    ungroup() %>%
    # spread
    tidyr::pivot_wider(
      names_from = !!ensym(key_name),
      values_from = !!ensym(value_name)
    ) %>%
    select(-row_id)
}
df %>%
  wide_fun_2(location, score)
A tibble: 5 x 5
ID a b c d
<int> <dbl> <dbl> <dbl> <dbl>
1 1 72.2 81.4 52.5 48.8
2 2 36.1 27.5 82.2 73.0
3 3 83.9 68.2 80.9 15.7
4 4 0.451 70.0 18.5 43.2
5 5 82.6 68.2 22.8 63.0
If you just provide the argument that specifies the column, you only need to deal with symbols and not quosures, therefore you need to use ensym.
Using tidyr/dplyr, I have some factor columns which I'd like to Z-score, and then mutate an average Z-score, whilst retaining the original data for reference.
I'd like to avoid using a for loop in tidyr/dplyr, thus I'm gathering my data and performing my calculation (Z-score) on a single column. However, I'm struggling with restoring the wide format.
Here is a MWE:
library(dplyr)
library(tidyr)
# Original Data
# Three people with two numeric factors, A and B. as_tibble() replaces the
# deprecated tbl_df() wrapper.
dfData <- data.frame(
  Name = c("Steve", "Jwan", "Ashley"),
  A = c(10, 20, 12),
  B = c(0.2, 0.3, 0.5)
) %>% as_tibble()
# Gather to Z-score
# Stack A and B into Factor/Value pairs, build the name of the matching
# Z-score column, and standardise Value within each Factor.
dfLong <- dfData %>%
  gather("Factor", "Value", A:B) %>%
  mutate(FactorZ = paste0("Z_", Factor)) %>%
  group_by(Factor) %>%
  mutate(
    ValueZ = (Value - mean(Value, na.rm = TRUE)) / sd(Value, na.rm = TRUE)
  )
# Now go wide to do some mutations (e.g. Z_Avg = (Z_A + Z_B)/2)
# This does not work
# NOTE(review): each Name still has one row per FactorZ/ValueZ pair when the
# first spread() runs, so the two spreads presumably cannot collapse to a
# single row per Name - confirm.
dfWide <- dfLong %>%
spread(Factor,Value) %>%
spread(FactorZ,ValueZ)%>%
mutate(Z_Avg = (Z_A+Z_B)/2)
# This is the desired result
# Z-score A and B column by column, then average the two Z-scores. Later
# expressions in the same mutate() can use columns created earlier in it.
dfDesired <- mutate(
  dfData,
  Z_A = (A - mean(A, na.rm = TRUE)) / sd(A, na.rm = TRUE),
  Z_B = (B - mean(B, na.rm = TRUE)) / sd(B, na.rm = TRUE),
  Z_Avg = (Z_A + Z_B) / 2
)
Thanks for any help/input!
Another approach using dplyr (version 0.5.0)
library(dplyr)
# Scale every column except Name, producing A_Z and B_Z, then average them.
# mutate_each() and funs() belong to the older dplyr API this answer targets.
mutate(
  mutate_each(dfData, funs(Z = scale(.)), -Name),
  Z_Avg = (A_Z + B_Z) / 2
)
#' Mean of a vector, ignoring missing values
#'
#' @param x A vector accepted by `mean()`.
#' @return The mean of the non-missing elements of `x`.
means <- function(x) {
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  mean(x, na.rm = TRUE)
}
# Collapse dfWide to one row per Name by taking the NA-ignoring mean of every
# other column, then average the two Z-scores.
dfWide %>%
  group_by(Name) %>%
  summarise_each(funs(means)) %>%
  mutate(Z_Avg = (Z_A + Z_B) / 2)
# A tibble: 3 x 6
Name A B Z_A Z_B Z_Avg
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Ashley 12 0.5 -0.3779645 1.0910895 0.3565625
2 Jwan 20 0.3 1.1338934 -0.2182179 0.4578378
3 Steve 10 0.2 -0.7559289 -0.8728716 -0.8144003
Here is one approach with long and wide format. For z-transformation, you can use the base function scale. Furthermore, this approach includes a join to combine the original data frame and the one including the new values.
# Long format: one row per Name x Factor, with Value standardised within each
# Factor by scale().
dfLong <- mutate(
  group_by(gather(dfData, Factor, Value, A:B), Factor),
  ValueZ = scale(Value)
)
# Name Factor Value ValueZ
# <fctr> <chr> <dbl> <dbl>
# 1 Steve A 10.0 -0.7559289
# 2 Jwan A 20.0 1.1338934
# 3 Ashley A 12.0 -0.3779645
# 4 Steve B 0.2 -0.8728716
# 5 Jwan B 0.3 -0.2182179
# 6 Ashley B 0.5 1.0910895
# Spread the Z-scores back to wide format (columns Z_A and Z_B), average
# them, and join the result onto the original data.
dfWide <- inner_join(
  dfData,
  dfLong %>%
    ungroup() %>%
    select(-Value) %>%
    mutate(Factor = paste0("Z_", Factor)) %>%
    spread(Factor, ValueZ) %>%
    mutate(Z_Avg = (Z_A + Z_B) / 2)
)
# Name A B Z_A Z_B Z_Avg
# <fctr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Steve 10 0.2 -0.7559289 -0.8728716 -0.8144003
# 2 Jwan 20 0.3 1.1338934 -0.2182179 0.4578378
# 3 Ashley 12 0.5 -0.3779645 1.0910895 0.3565625
I would just do it all in wide format. No need to keep switching between the long and wide formats.
# Z-score A and B directly in wide format, then average the two scores.
# The columns are referenced through the piped data rather than through
# dfData$..., so the pipeline still gives the right answer when the input is
# a different or filtered data frame; the unlist() calls were redundant on
# plain numeric columns.
dfData %>%
  mutate(Z_A = (A - mean(A)) / sd(A),
         Z_B = (B - mean(B)) / sd(B)) %>%
  mutate(Z_AVG = (Z_A + Z_B) / 2)