dplyr::mutate_at() relying on multiple columns with a given prefix/suffix - r

dplyr::mutate_at() can be used to apply the same function to multiple columns. It also allows you to set the results in new columns using a named list.
However, what if I have many columns in pairs (say, data1_a, data1_b, data2_a, data2_b, ...) and I want to multiply those pairs together? Is that possible?
By hand, that would look like
suppressPackageStartupMessages({
library(dplyr)
})
data.frame(data1_a = 1:3, data1_b = 2:4,
data2_a = 3:5, data2_b = 4:6) %>%
mutate(
data1 = data1_a * data1_b,
data2 = data2_a * data2_b
)
#> data1_a data1_b data2_a data2_b data1 data2
#> 1 1 2 3 4 2 12
#> 2 2 3 4 5 6 20
#> 3 3 4 5 6 12 30
My current solution is to write a function which takes the unsuffixed variable name (i.e. "data1"), creates the suffixed names and then performs a simple mutate() on that variable using get(). I then call that function for each output:
foo <- function(df, name) {
a <- paste0(name, "_a")
b <- paste0(name, "_b")
return(
mutate(
df,
!!name := get(a) * get(b)
)
)
}
data.frame(data1_a = 1:3, data1_b = 2:4,
data2_a = 3:5, data2_b = 4:6) %>%
foo("data1") %>%
foo("data2")
#> data1_a data1_b data2_a data2_b data1 data2
#> 1 1 2 3 4 2 12
#> 2 2 3 4 5 6 20
#> 3 3 4 5 6 12 30
(or write a loop over all the variable names if there were more of them)
But if it's possible to use mutate_at or something of the sort, that'd be much cleaner.

We can use pivot_longer/pivot_wider
library(dplyr)
library(tidyr)
df1 %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, names_to = c('grp', '.value'),
names_sep = "_") %>%
group_by(grp) %>%
transmute(rn, new = a * b) %>%
pivot_wider(names_from = grp, values_from = new) %>%
select(-rn) %>%
bind_cols(df1, .)
# A tibble: 3 x 6
# data1_a data1_b data2_a data2_b data1 data2
# <int> <int> <int> <int> <int> <int>
#1 1 2 3 4 2 12
#2 2 3 4 5 6 20
#3 3 4 5 6 12 30
Or another option is to split into a list based on the column names and then do the *
library(purrr)
library(stringr)
df1 %>%
split.default(str_remove(names(.), "_.*")) %>%
map_dfr(reduce, `*`) %>%
bind_cols(df1, .)
# A tibble: 3 x 6
# data1_a data1_b data2_a data2_b data1 data2
# <int> <int> <int> <int> <int> <int>
#1 1 2 3 4 2 12
#2 2 3 4 5 6 20
#3 3 4 5 6 12 30
With mutate, it is possible, but it would be more manual
df1 %>%
mutate(data1 = select(., starts_with('data1')) %>%
reduce(`*`),
data2 = select(., starts_with('data2')) %>%
reduce(`*`))
data
df1 <- data.frame(data1_a = 1:3, data1_b = 2:4,
data2_a = 3:5, data2_b = 4:6)

After adopting #akrun's elegant solution, I noticed it was unfortunately very inefficient (since it has to recreate two dataframes), taking almost a second on a dataset with 20,000 rows and 11 "groups".
So a while ago I developed the following function (with a bit of help from #user12728748... sorry for not posting here sooner), which takes the names of the groups ("data1", "data2", etc) and a formula using the prefixes, allowing for bquote-style quoting for constant names:
suppressPackageStartupMessages(library(dplyr))
mutateSet <- function(df, colNames, formula,
isPrefix = TRUE,
separator = "_") {
vars <- all.vars(formula)
# extracts names wrapped in `.()`
escapedNames <- function (expr)
{
unquote <- function(e) {
if (is.pairlist(e) || length(e) <= 1L) NULL
else if (e[[1L]] == as.name(".")) deparse(e[[2L]])
else unlist(sapply(e, unquote))
}
unquote(substitute(expr))
}
escapedVars <- eval(rlang::expr(escapedNames(!!formula)))
# remove escaped names from mapping variables
vars <- setdiff(vars, escapedVars)
# get output prefix/suffix as string
lhs <- rlang::f_lhs(formula) %>%
all.vars()
# get operation as string
# deparse() can have line breaks; paste0() brings it back to one line
rhs <- rlang::f_rhs(formula) %>%
deparse() %>%
paste0(collapse = "")
# dummy function to cover for bquote escaping
. <- function(x) x
for (i in colNames) {
if (isPrefix) {
aliases <- paste0(vars, separator, i)
newCol <- paste0(lhs, separator, i)
} else {
aliases <- paste0(i, separator, vars)
newCol <- paste0(i, separator, lhs)
}
if (length(lhs) == 0) newCol <- i
mapping <- rlang::list2(!!!aliases)
names(mapping) <- vars
mapping <- do.call(wrapr::qc, mapping)
df <- rlang::expr(wrapr::let(
mapping,
df %>% dplyr::mutate(!!newCol := ...RHS...)
)) %>%
deparse() %>%
gsub(
pattern = "...RHS...",
replacement = rhs
) %>%
{eval(parse(text = .))}
}
return(df)
}
df <- data.frame(a_data1 = 1:3, b_data1 = 2:4,
a_data2 = 3:5, b_data2 = 4:6,
static = 5:7)
mutateSet(df, "data1", ~ a + b)
#> a_data1 b_data1 a_data2 b_data2 static data1
#> 1 1 2 3 4 5 3
#> 2 2 3 4 5 6 5
#> 3 3 4 5 6 7 7
mutateSet(df, c("data1", "data2"), x ~ sqrt(a) + b)
#> a_data1 b_data1 a_data2 b_data2 static x_data1 x_data2
#> 1 1 2 3 4 5 3.000000 5.732051
#> 2 2 3 4 5 6 4.414214 7.000000
#> 3 3 4 5 6 7 5.732051 8.236068
mutateSet(df, c("data1", "data2"), ~ a + b + .(static))
#> a_data1 b_data1 a_data2 b_data2 static data1 data2
#> 1 1 2 3 4 5 8 12
#> 2 2 3 4 5 6 11 15
#> 3 3 4 5 6 7 14 18
Created on 2020-04-28 by the reprex package (v0.3.0)
This can probably be cleaned up (especially that heinous for-loop), but it works for now.
Repeating #user12728748's performance test, we see this is ~100x faster:
suppressPackageStartupMessages({
invisible(lapply(c("dplyr", "tidyr", "rlang", "wrapr", "microbenchmark"),
require, character.only = TRUE))
})
polymutate <- function(df, formula) {
form <- rlang::f_rhs(formula)
df %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, names_to = c('.value', 'grp'),
names_sep = "_") %>%
group_by(grp) %>%
transmute(rn, new = eval(form)) %>%
pivot_wider(names_from = grp, values_from = new) %>%
select(-rn) %>%
bind_cols(df, .)
}
set.seed(1)
df <- setNames(data.frame(matrix(sample(1:12, 6E6, replace=TRUE), ncol=6)),
c("a_data1", "b_data1", "a_data2", "b_data2", "a_data3", "b_data3"))
pd <- polymutate(df, ~ a + b)
pd2 <- mutateSet(df, c("data1", "data2", "data3"), ~ a + b)
all.equal(pd, pd2)
#> [1] TRUE
microbenchmark(polymutate(df, ~ a + b),
mutateSet(df, c("data1", "data2", "data3"), ~ a + b),
times=10L)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> polymutate 1612.306 1628.9776 1690.78586 1670.15600 1741.3490 1806.1412 10
#> mutateSet 8.757 9.6302 13.27135 10.45965 19.2976 20.4657 10

This is now possible using the cur_column() function within across().
library(tidyverse)
dat <- data.frame(
data1_a = 1:3,
data1_b = 2:4,
data2_a = 3:5,
data2_b = 4:6
)
mutate(
dat,
across(ends_with("a"), ~ . * dat[[str_replace(cur_column(), "a$", "b")]],
.names = "updated_{col}")
)
Returns:
data1_a data1_b data2_a data2_b updated_data1_a updated_data2_a
1 1 2 3 4 2 12
2 2 3 4 5 6 20
3 3 4 5 6 12 30
Where updated_data1_a and updated_data2_a contain the desired output variables.

Related

How to Sum Specific Columns of Dataframe in r? [duplicate]

I have the following condensed data set:
a<-as.data.frame(c(2000:2005))
a$Col1<-c(1:6)
a$Col2<-seq(2,12,2)
colnames(a)<-c("year","Col1","Col2")
for (i in 1:2){
a[[paste("Var_", i, sep="")]]<-i*a[[paste("Col", i, sep="")]]
}
I would like to sum the columns Var1 and Var2, which I use:
a$sum<-a$Var_1 + a$Var_2
In reality my data set is much larger - I would like to sum from Var_1 to Var_n (n can be upto 20). There must be a more efficient way to do this than:
a$sum<-a$Var_1 + ... + a$Var_n
Here's a solution using the tidyverse. You can extend it to as many columns as you like using the select() function to select the appropriate columns within a mutate().
library(tidyverse)
a<-as.data.frame(c(2000:2005))
a$Col1<-c(1:6)
a$Col2<-seq(2,12,2)
colnames(a)<-c("year","Col1","Col2")
for (i in 1:2){
a[[paste("Var_", i, sep="")]]<-i*a[[paste("Col", i, sep="")]]
}
a
#> year Col1 Col2 Var_1 Var_2
#> 1 2000 1 2 1 4
#> 2 2001 2 4 2 8
#> 3 2002 3 6 3 12
#> 4 2003 4 8 4 16
#> 5 2004 5 10 5 20
#> 6 2005 6 12 6 24
# Tidyverse solution
a %>%
mutate(Total = select(., Var_1:Var_2) %>% rowSums(na.rm = TRUE))
#> year Col1 Col2 Var_1 Var_2 Total
#> 1 2000 1 2 1 4 5
#> 2 2001 2 4 2 8 10
#> 3 2002 3 6 3 12 15
#> 4 2003 4 8 4 16 20
#> 5 2004 5 10 5 20 25
#> 6 2005 6 12 6 24 30
Created on 2019-01-01 by the reprex package (v0.2.1)
You can use colSums(a[,c("Var1", "Var2")]) or rowSums(a[,c("Var_1", "Var_2")]). In your case you want the latter.
with dplyr you can use
a %>%
rowwise() %>%
mutate(sum = sum(Col1,Col1, na.rm = T))
or more efficiently
a %>%
rowwise() %>%
mutate(sum = sum(across(starts_with("Col")), na.rm = T))
If you're working with a very large dataset, rowSums can be slow.
An alternative is the rowsums function from the Rfast package. This requires you to convert your data to a matrix in the process and use column indices rather than names. Here's an example based on your code:
## load Rfast
library(Rfast)
## create dataset
a <- as.data.frame(c(2000:2005))
a$Col1 <- c(1:6)
a$Col2 <- seq(2,12,2)
colnames(a) <- c("year","Col1","Col2")
for (i in 1:2){
a[[paste("Var_", i, sep="")]] <- i*a[[paste("Col", i, sep="")]]
}
## get column indices based on names
col_st <- grep("Var_1", colnames(a)) # index of "Var_1" col
col_en <- grep("Var_2", colnames(a)) # index of "Var_2" col
cols <- c(col_st:col_en) # indices of all cols from "Var_1" to "Var_2"
## sum rows 4 to 5
a$Total <- rowsums(as.matrix(a[,cols]))
You can use this:
library(dplyr)
a$Sum <- apply(a[,select(a, starts_with("Var_"))], 1, sum)
In Base R:
You could simply just use sapply:
sapply(unique(sub(".$", "", colnames(a))), function(x) rowSums(a[startsWith(colnames(a), x)]))
This is very reliable, it works for anything.
Benchmarking seems to show that plain Reduce('+', ...) is the fastest. Libraries just make it (at least slightly) slower, at least for mtcars, even if I expand it to be huge.
Unit: milliseconds
expr min lq mean median uq max
rowSums 8.672061 9.014344 13.708022 9.602312 10.672726 148.47183
Reduce 2.994240 3.157500 6.331503 3.223612 3.616555 99.49181
apply 524.488376 651.549401 771.095002 743.286441 857.993418 1235.53153
Rfast 5.649006 5.901787 11.110896 6.387990 9.727408 66.03151
DT_rowSums 9.209539 9.566574 20.955033 10.131163 12.967030 294.32911
DT_Reduce 3.590719 3.774761 10.595256 3.924592 4.259343 340.52855
tidy_rowSums 15.532917 15.997649 33.736883 17.316108 27.072343 343.21254
tidy_Reduce 8.627810 8.960008 12.271105 9.603124 11.089334 79.98853
Code:
library('data.table')
library('tidyverse')
library('Rfast')
DFcars = data.table::copy(mtcars)
DFcars = do.call("rbind", replicate(10000, DFcars, simplify = FALSE))
DT_cars = data.table::copy(DFcars)
DFcars2 = data.table::copy(DFcars)
setDT(DT_cars)
colnms = c("mpg", "cyl", "disp", "hp", "drat")
microbenchmark::microbenchmark(
rowSums =
{
DFcars$new_col = rowSums(DFcars[, colnms])
(as.numeric(DFcars$new_col))
},
Reduce =
{
DFcars$new_col = Reduce('+', DFcars[, colnms])
(as.numeric(DFcars$new_col))
},
apply =
{
DFcars$new_col = apply(DFcars[, 1:5], 1, sum)
(as.numeric(DFcars$new_col))
},
Rfast =
{
DFcars$new_col = rowsums(as.matrix(DFcars[, colnms]))
(as.numeric(DFcars$new_col))
},
DT_rowSums =
{
DT_cars[, new_col := rowSums(.SD), .SDcols = colnms]
(as.numeric(DT_cars$new_col))
},
DT_Reduce =
{
DT_cars[, new_col := Reduce('+', .SD), .SDcols = colnms]
(as.numeric(DT_cars$new_col))
},
tidy_rowSums =
{
DFcars2 = DFcars2 %>% mutate(new_col = select(., colnms) %>% rowSums())
(as.numeric(DFcars2$new_col))
},
tidy_Reduce =
{
DFcars2 = DFcars2 %>% mutate(new_col = select(., colnms) %>% Reduce('+', .))
(as.numeric(DFcars2$new_col))
},
check = 'equivalent'
)

Create NA's based on another dataframe without long data

I have a tibble with the explicit "id" and colnames I need to convert to NA's. Is there anyway I can create the NA's without making my df a long dataset? I considered using the new rows_update function, but I'm not sure if this is correct because I only want certain columns to be NA.
library(dplyr)
to_na <- tribble(~x, ~col,
1, "z",
3, "y"
)
df <- tibble(x = c(1,2,3),
y = c(1,1,1),
z = c(2,2,2))
# desired output:
#> # A tibble: 3 x 3
#> x y z
#> <dbl> <dbl> <dbl>
#> 1 1 1 NA
#> 2 2 1 2
#> 3 3 NA 2
Created on 2020-07-03 by the reprex package (v0.3.0)
This definitely isn't the most elegant solution, but it gets the output you want.
library(dplyr)
library(purrr)
to_na <- tribble(~x, ~col,
1, "z",
3, "y"
)
df <- tibble(x = c(1,2,3),
y = c(1,1,1),
z = c(2,2,2))
map2(to_na$x, to_na$col, #Pass through these two objects in parallel
function(xval_to_missing, col) df %>% #Two objects above matched by position here.
mutate_at(col, #mutate_at the specified cols
~if_else(x == xval_to_missing, NA_real_, .) #if x == xval_to_missing, make NA, else keep as is.
) %>%
select(x, col) #keep x and the modified column.
) %>% #end of map2
reduce(left_join, by = "x") %>% #merge within the above list, by x.
relocate(x, y, z) #Keep your ordering
Output:
# A tibble: 3 x 3
x y z
<dbl> <dbl> <dbl>
1 1 1 NA
2 2 1 2
3 3 NA 2
We can use row/column indexing to assign the values to NA in base R
df <- as.data.frame(df)
df[cbind(to_na$x, match(to_na$col, names(df)))] <- NA
df
# x y z
#1 1 1 NA
#2 2 1 2
#3 3 NA 2
If we want to use rows_update
library(dplyr)
library(tidyr)
library(purrr)
lst1 <- to_na %>%
mutate(new = NA_real_) %>%
split(seq_len(nrow(.))) %>%
map(~ .x %>%
pivot_wider(names_from = col, values_from = new))
for(i in seq_along(lst1)) df <- rows_update(df, lst1[[i]])
df
# A tibble: 3 x 3
# x y z
# <dbl> <dbl> <dbl>
#1 1 1 NA
#2 2 1 2
#3 3 NA 2

Group data by factor level, then transform to data frame with colname being levels?

There is my problem that I can't solve it:
Data:
df <- data.frame(f1=c("a", "a", "b", "b", "c", "c", "c"),
v1=c(10, 11, 4, 5, 0, 1, 2))
data.frame:f1 is factor
f1 v1
a 10
a 11
b 4
b 5
c 0
c 1
c 2
# What I want is:(for example, fetch data with the number of element of some level == 2, then to data.frame)
a b
10 4
11 5
Thanks in advance!
I might be missing something simple here , but the below approach using dplyr works.
library(dplyr)
nlevels = 2
df1 <- df %>%
add_count(f1) %>%
filter(n == nlevels) %>%
select(-n) %>%
mutate(rn = row_number()) %>%
spread(f1, v1) %>%
select(-rn)
This gives
# a b
# <int> <int>
#1 10 NA
#2 11 NA
#3 NA 4
#4 NA 5
Now, if you want to remove NA's we can do
do.call("cbind.data.frame", lapply(df1, function(x) x[!is.na(x)]))
# a b
#1 10 4
#2 11 5
As we have filtered the dataframe which has only nlevels observations, we would have same number of rows for each column in the final dataframe.
split might be useful here to split df$v1 into parts corresponding to df$f1. Since you are always extracting equal length chunks, it can then simply be combined back to a data.frame:
spl <- split(df$v1, df$f1)
data.frame(spl[lengths(spl)==2])
# a b
#1 10 4
#2 11 5
Or do it all in one call by combining this with Filter:
data.frame(Filter(function(x) length(x)==2, split(df$v1, df$f1)))
# a b
#1 10 4
#2 11 5
Here is a solution using unstack :
unstack(
droplevels(df[ave(df$v1, df$f1, FUN = function(x) length(x) == 2)==1,]),
v1 ~ f1)
# a b
# 1 10 4
# 2 11 5
A variant, similar to #thelatemail's solution :
data.frame(Filter(function(x) length(x) == 2, unstack(df,v1 ~ f1)))
My tidyverse solution would be:
library(tidyverse)
df %>%
group_by(f1) %>%
filter(n() == 2) %>%
mutate(i = row_number()) %>%
spread(f1, v1) %>%
select(-i)
# # A tibble: 2 x 2
# a b
# * <dbl> <dbl>
# 1 10 4
# 2 11 5
or mixing approaches :
as_tibble(keep(unstack(df,v1 ~ f1), ~length(.x) == 2))
Using all base functions (but you should use tidyverse)
# Add count of instances
x$len <- ave(x$v1, x$f1, FUN = length)
# Filter, drop the count
x <- x[x$len==2, c('f1','v1')]
# Hacky pivot
result <- data.frame(
lapply(unique(x$f1), FUN = function(y) x$v1[x$f1==y])
)
colnames(result) <- unique(x$f1)
> result
a b
1 10 4
2 11 5
I'd like code this, may it helps for you
library(reshape2)
library(dplyr)
aa = data.frame(v1=c('a','a','b','b','c','c','c'),f1=c(10,11,4,5,0,1,2))
cc = aa %>% group_by(v1) %>% summarise(id = length((v1)))
dd= merge(aa,cc) #get the level
ee = dd[dd$aa==2,] #select number of level equal to 2
ee$id = rep(c(1,2),nrow(ee)/2) # reset index like (1,2,1,2)
dcast(ee, id~v1,value.var = 'f1')
all done!

Efficiently sum across multiple columns in R

I have the following condensed data set:
a<-as.data.frame(c(2000:2005))
a$Col1<-c(1:6)
a$Col2<-seq(2,12,2)
colnames(a)<-c("year","Col1","Col2")
for (i in 1:2){
a[[paste("Var_", i, sep="")]]<-i*a[[paste("Col", i, sep="")]]
}
I would like to sum the columns Var1 and Var2, which I use:
a$sum<-a$Var_1 + a$Var_2
In reality my data set is much larger - I would like to sum from Var_1 to Var_n (n can be upto 20). There must be a more efficient way to do this than:
a$sum<-a$Var_1 + ... + a$Var_n
Here's a solution using the tidyverse. You can extend it to as many columns as you like using the select() function to select the appropriate columns within a mutate().
library(tidyverse)
a<-as.data.frame(c(2000:2005))
a$Col1<-c(1:6)
a$Col2<-seq(2,12,2)
colnames(a)<-c("year","Col1","Col2")
for (i in 1:2){
a[[paste("Var_", i, sep="")]]<-i*a[[paste("Col", i, sep="")]]
}
a
#> year Col1 Col2 Var_1 Var_2
#> 1 2000 1 2 1 4
#> 2 2001 2 4 2 8
#> 3 2002 3 6 3 12
#> 4 2003 4 8 4 16
#> 5 2004 5 10 5 20
#> 6 2005 6 12 6 24
# Tidyverse solution
a %>%
mutate(Total = select(., Var_1:Var_2) %>% rowSums(na.rm = TRUE))
#> year Col1 Col2 Var_1 Var_2 Total
#> 1 2000 1 2 1 4 5
#> 2 2001 2 4 2 8 10
#> 3 2002 3 6 3 12 15
#> 4 2003 4 8 4 16 20
#> 5 2004 5 10 5 20 25
#> 6 2005 6 12 6 24 30
Created on 2019-01-01 by the reprex package (v0.2.1)
You can use colSums(a[,c("Var1", "Var2")]) or rowSums(a[,c("Var_1", "Var_2")]). In your case you want the latter.
with dplyr you can use
a %>%
rowwise() %>%
mutate(sum = sum(Col1,Col1, na.rm = T))
or more efficiently
a %>%
rowwise() %>%
mutate(sum = sum(across(starts_with("Col")), na.rm = T))
If you're working with a very large dataset, rowSums can be slow.
An alternative is the rowsums function from the Rfast package. This requires you to convert your data to a matrix in the process and use column indices rather than names. Here's an example based on your code:
## load Rfast
library(Rfast)
## create dataset
a <- as.data.frame(c(2000:2005))
a$Col1 <- c(1:6)
a$Col2 <- seq(2,12,2)
colnames(a) <- c("year","Col1","Col2")
for (i in 1:2){
a[[paste("Var_", i, sep="")]] <- i*a[[paste("Col", i, sep="")]]
}
## get column indices based on names
col_st <- grep("Var_1", colnames(a)) # index of "Var_1" col
col_en <- grep("Var_2", colnames(a)) # index of "Var_2" col
cols <- c(col_st:col_en) # indices of all cols from "Var_1" to "Var_2"
## sum rows 4 to 5
a$Total <- rowsums(as.matrix(a[,cols]))
You can use this:
library(dplyr)
a$Sum <- apply(a[,select(a, starts_with("Var_"))], 1, sum)
In Base R:
You could simply just use sapply:
sapply(unique(sub(".$", "", colnames(a))), function(x) rowSums(a[startsWith(colnames(a), x)]))
This is very reliable, it works for anything.
Benchmarking seems to show that plain Reduce('+', ...) is the fastest. Libraries just make it (at least slightly) slower, at least for mtcars, even if I expand it to be huge.
Unit: milliseconds
expr min lq mean median uq max
rowSums 8.672061 9.014344 13.708022 9.602312 10.672726 148.47183
Reduce 2.994240 3.157500 6.331503 3.223612 3.616555 99.49181
apply 524.488376 651.549401 771.095002 743.286441 857.993418 1235.53153
Rfast 5.649006 5.901787 11.110896 6.387990 9.727408 66.03151
DT_rowSums 9.209539 9.566574 20.955033 10.131163 12.967030 294.32911
DT_Reduce 3.590719 3.774761 10.595256 3.924592 4.259343 340.52855
tidy_rowSums 15.532917 15.997649 33.736883 17.316108 27.072343 343.21254
tidy_Reduce 8.627810 8.960008 12.271105 9.603124 11.089334 79.98853
Code:
library('data.table')
library('tidyverse')
library('Rfast')
DFcars = data.table::copy(mtcars)
DFcars = do.call("rbind", replicate(10000, DFcars, simplify = FALSE))
DT_cars = data.table::copy(DFcars)
DFcars2 = data.table::copy(DFcars)
setDT(DT_cars)
colnms = c("mpg", "cyl", "disp", "hp", "drat")
microbenchmark::microbenchmark(
rowSums =
{
DFcars$new_col = rowSums(DFcars[, colnms])
(as.numeric(DFcars$new_col))
},
Reduce =
{
DFcars$new_col = Reduce('+', DFcars[, colnms])
(as.numeric(DFcars$new_col))
},
apply =
{
DFcars$new_col = apply(DFcars[, 1:5], 1, sum)
(as.numeric(DFcars$new_col))
},
Rfast =
{
DFcars$new_col = rowsums(as.matrix(DFcars[, colnms]))
(as.numeric(DFcars$new_col))
},
DT_rowSums =
{
DT_cars[, new_col := rowSums(.SD), .SDcols = colnms]
(as.numeric(DT_cars$new_col))
},
DT_Reduce =
{
DT_cars[, new_col := Reduce('+', .SD), .SDcols = colnms]
(as.numeric(DT_cars$new_col))
},
tidy_rowSums =
{
DFcars2 = DFcars2 %>% mutate(new_col = select(., colnms) %>% rowSums())
(as.numeric(DFcars2$new_col))
},
tidy_Reduce =
{
DFcars2 = DFcars2 %>% mutate(new_col = select(., colnms) %>% Reduce('+', .))
(as.numeric(DFcars2$new_col))
},
check = 'equivalent'
)

Grouping Over All Possible Combinations of Several Variables With dplyr

Given a situation such as the following
library(dplyr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
I would like to group `myData' to eventually find summary data grouping by all possible combinations of var2, var3, and var4.
I can create a list with all possible combinations of variables as character values with
groupNames <- names(myData)[2:4]
myGroups <- Map(combn,
list(groupNames),
seq_along(groupNames),
simplify = FALSE) %>%
unlist(recursive = FALSE)
My plan was to make separate data sets for each variable combination with a for() loop, something like
### This Does Not Work
for (i in 1:length(myGroups)){
assign( myGroups[i]%>%
unlist() %>%
paste0(collapse = "")%>%
paste0("Data"),
myData %>%
group_by_(lapply(myGroups[[i]], as.symbol)) %>%
summarise( n = length(var1),
avgVar2 = var2 %>%
mean()))
}
Admittedly I am not very good with lists, and looking up this issue was a bit challenging since dpyr updates have altered how grouping works a bit.
If there is a better way to do this than separate data sets I would love to know.
I've gotten a loop similar to above working when I am only grouping by a single variable.
Any and all help is greatly appreciated! Thank you!
This seems convulated, and there's probably a way to simplify or fancy it up with a do, but it works. Using your myData and myGroups,
results = lapply(myGroups, FUN = function(x) {
do.call(what = group_by_, args = c(list(myData), x)) %>%
summarise( n = length(var1),
avgVar1 = mean(var1))
}
)
> results[[1]]
Source: local data frame [3 x 3]
var2 n avgVar1
1 a 31 0.38929738
2 b 31 -0.07451717
3 c 38 -0.22522129
> results[[4]]
Source: local data frame [9 x 4]
Groups: var2
var2 var3 n avgVar1
1 a A 11 -0.1159160
2 a B 11 0.5663312
3 a C 9 0.7904056
4 b A 7 0.0856384
5 b B 13 0.1309756
6 b C 11 -0.4192895
7 c A 15 -0.2783099
8 c B 10 -0.1110877
9 c C 13 -0.2517602
> results[[7]]
# I won't paste them here, but it has all 27 rows, grouped by var2, var3 and var4.
I changed your summarise call to average var1 since var2 isn't numeric.
I have created a function based on the answer of #Gregor and the comments that followed:
library(magrittr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
Function combSummarise
combSummarise <- function(data, variables=..., summarise=...){
# Get all different combinations of selected variables (credit to #Michael)
myGroups <- lapply(seq_along(variables), function(x) {
combn(c(variables), x, simplify = FALSE)}) %>%
unlist(recursive = FALSE)
# Group by selected variables (credit to #konvas)
df <- eval(parse(text=paste("lapply(myGroups, function(x){
dplyr::group_by_(data, .dots=x) %>%
dplyr::summarize_( \"", paste(summarise, collapse="\",\""),"\")})"))) %>%
do.call(plyr::rbind.fill,.)
groupNames <- c(myGroups[[length(myGroups)]])
newNames <- names(df)[!(names(df) %in% groupNames)]
df <- cbind(df[, groupNames], df[, newNames])
names(df) <- c(groupNames, newNames)
df
}
Call of combSummarise
combSummarise (myData, var=c("var2", "var3", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)"))
etc
Inspired by the answers by Gregor and dimitris_ps, I wrote a dplyr style function that runs summarise for all combinations of group variables.
summarise_combo <- function(data, ...) {
groupVars <- group_vars(data) %>% map(as.name)
groupCombos <- map( 0:length(groupVars), ~combn(groupVars, ., simplify=FALSE) ) %>%
unlist(recursive = FALSE)
results <- groupCombos %>%
map(function(x) {data %>% group_by(!!! x) %>% summarise(...)} ) %>%
bind_rows()
results %>% select(!!! groupVars, everything())
}
Example
library(tidyverse)
mtcars %>% group_by(cyl, vs) %>% summarise_combo(cyl_n = n(), mean(mpg))
Using unite to create a new column is the simplest way
library(tidyverse)
df = tibble(
a = c(1,1,2,2,1,1,2,2),
b = c(3,4,3,4,3,4,3,4),
val = c(1,2,3,4,5,6,7,8)
)
print(df)#output1
df_2 = unite(df, 'combined_header', a, b, sep='_', remove=FALSE) #remove=F doesn't remove existing columns
print(df_2)#output2
df_2 %>% group_by(combined_header) %>%
summarize(avg_val=mean(val)) %>% print()#output3
#avg 1_3 = mean(1,5)=3 avg 1_4 = mean(2, 6) = 4
RESULTS
Output:
output1
a b val
<dbl> <dbl> <dbl>
1 1 3 1
2 1 4 2
3 2 3 3
4 2 4 4
5 1 3 5
6 1 4 6
7 2 3 7
8 2 4 8
output2
combined_header a b val
<chr> <dbl> <dbl> <dbl>
1 1_3 1 3 1
2 1_4 1 4 2
3 2_3 2 3 3
4 2_4 2 4 4
5 1_3 1 3 5
6 1_4 1 4 6
7 2_3 2 3 7
8 2_4 2 4 8
output3
combined_header avg_val
<chr> <dbl>
1 1_3 3
2 1_4 4
3 2_3 5
4 2_4 6

Resources