Efficiently sum across multiple columns in R

I have the following condensed data set:
a<-as.data.frame(c(2000:2005))
a$Col1<-c(1:6)
a$Col2<-seq(2,12,2)
colnames(a)<-c("year","Col1","Col2")
for (i in 1:2){
a[[paste("Var_", i, sep="")]]<-i*a[[paste("Col", i, sep="")]]
}
I would like to sum the columns Var_1 and Var_2, which I currently do with:
a$sum<-a$Var_1 + a$Var_2
In reality my data set is much larger - I would like to sum from Var_1 to Var_n (n can be up to 20). There must be a more efficient way to do this than:
a$sum<-a$Var_1 + ... + a$Var_n

Here's a solution using the tidyverse. You can extend it to as many columns as you like using the select() function to select the appropriate columns within a mutate().
library(tidyverse)
a<-as.data.frame(c(2000:2005))
a$Col1<-c(1:6)
a$Col2<-seq(2,12,2)
colnames(a)<-c("year","Col1","Col2")
for (i in 1:2){
a[[paste("Var_", i, sep="")]]<-i*a[[paste("Col", i, sep="")]]
}
a
#> year Col1 Col2 Var_1 Var_2
#> 1 2000 1 2 1 4
#> 2 2001 2 4 2 8
#> 3 2002 3 6 3 12
#> 4 2003 4 8 4 16
#> 5 2004 5 10 5 20
#> 6 2005 6 12 6 24
# Tidyverse solution
a %>%
mutate(Total = select(., Var_1:Var_2) %>% rowSums(na.rm = TRUE))
#> year Col1 Col2 Var_1 Var_2 Total
#> 1 2000 1 2 1 4 5
#> 2 2001 2 4 2 8 10
#> 3 2002 3 6 3 12 15
#> 4 2003 4 8 4 16 20
#> 5 2004 5 10 5 20 25
#> 6 2005 6 12 6 24 30
Created on 2019-01-01 by the reprex package (v0.2.1)

You can use colSums(a[,c("Var_1", "Var_2")]) or rowSums(a[,c("Var_1", "Var_2")]). In your case you want the latter, since you need one total per row.
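A quick sketch of the difference, using the example data from the question:
rowSums(a[, c("Var_1", "Var_2")])  # per-row totals: 5 10 15 20 25 30
colSums(a[, c("Var_1", "Var_2")])  # per-column totals: Var_1 = 21, Var_2 = 84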

With dplyr you can use
a %>%
rowwise() %>%
mutate(sum = sum(Col1, Col2, na.rm = TRUE))
or, more generally, without naming each column:
a %>%
rowwise() %>%
mutate(sum = sum(across(starts_with("Col")), na.rm = TRUE))
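Note that rowwise() can be slow on large data. A sketch of an alternative that stays vectorised (my addition, assuming dplyr >= 1.0, and targeting the Var_ columns the question wants summed):
a %>%
mutate(sum = rowSums(across(starts_with("Var_")), na.rm = TRUE))
Here across() builds a tibble of the selected columns, which rowSums() then reduces in one vectorised pass instead of once per row.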

If you're working with a very large dataset, rowSums can be slow.
An alternative is the rowsums function from the Rfast package. This requires you to convert your data to a matrix in the process and use column indices rather than names. Here's an example based on your code:
## load Rfast
library(Rfast)
## create dataset
a <- as.data.frame(c(2000:2005))
a$Col1 <- c(1:6)
a$Col2 <- seq(2,12,2)
colnames(a) <- c("year","Col1","Col2")
for (i in 1:2){
a[[paste("Var_", i, sep="")]] <- i*a[[paste("Col", i, sep="")]]
}
## get column indices based on names
col_st <- grep("Var_1", colnames(a)) # index of "Var_1" col
col_en <- grep("Var_2", colnames(a)) # index of "Var_2" col
cols <- c(col_st:col_en) # indices of all cols from "Var_1" to "Var_2"
## row-wise sums across columns Var_1 to Var_2
a$Total <- rowsums(as.matrix(a[,cols]))
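For the example data this produces the same totals as base rowSums would:
a$Total
# [1]  5 10 15 20 25 30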

You can use this:
library(dplyr)
a$Sum <- apply(select(a, starts_with("Var_")), 1, sum)

In Base R:
You could use sapply:
sapply(unique(sub(".$", "", colnames(a))), function(x) rowSums(a[startsWith(colnames(a), x)]))
This generalizes nicely: it computes row sums for every group of columns that share a common prefix (the prefix being the column name minus its final character).
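For the example data it returns one column of row sums per prefix ("year" trims to the prefix "yea"):
     yea Col Var_
[1,] 2000   3    5
[2,] 2001   6   10
[3,] 2002   9   15
[4,] 2003  12   20
[5,] 2004  15   25
[6,] 2005  18   30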

Benchmarking suggests that plain Reduce('+', ...) is the fastest; the library-based approaches are all at least slightly slower, at least for mtcars, even when I expand it to a huge size.
Unit: milliseconds
expr min lq mean median uq max
rowSums 8.672061 9.014344 13.708022 9.602312 10.672726 148.47183
Reduce 2.994240 3.157500 6.331503 3.223612 3.616555 99.49181
apply 524.488376 651.549401 771.095002 743.286441 857.993418 1235.53153
Rfast 5.649006 5.901787 11.110896 6.387990 9.727408 66.03151
DT_rowSums 9.209539 9.566574 20.955033 10.131163 12.967030 294.32911
DT_Reduce 3.590719 3.774761 10.595256 3.924592 4.259343 340.52855
tidy_rowSums 15.532917 15.997649 33.736883 17.316108 27.072343 343.21254
tidy_Reduce 8.627810 8.960008 12.271105 9.603124 11.089334 79.98853
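Applied to the question's data, the winning approach is just (a sketch of mine, not part of the benchmark code below; note that unlike rowSums(na.rm = TRUE) it propagates NA):
a$sum <- Reduce('+', a[, c("Var_1", "Var_2")])
Reduce('+', ...) adds the columns pairwise as plain vectors, avoiding the matrix conversion that rowSums performs.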
Code:
library('data.table')
library('tidyverse')
library('Rfast')
DFcars = data.table::copy(mtcars)
DFcars = do.call("rbind", replicate(10000, DFcars, simplify = FALSE))
DT_cars = data.table::copy(DFcars)
DFcars2 = data.table::copy(DFcars)
setDT(DT_cars)
colnms = c("mpg", "cyl", "disp", "hp", "drat")
microbenchmark::microbenchmark(
rowSums =
{
DFcars$new_col = rowSums(DFcars[, colnms])
(as.numeric(DFcars$new_col))
},
Reduce =
{
DFcars$new_col = Reduce('+', DFcars[, colnms])
(as.numeric(DFcars$new_col))
},
apply =
{
DFcars$new_col = apply(DFcars[, 1:5], 1, sum)
(as.numeric(DFcars$new_col))
},
Rfast =
{
DFcars$new_col = rowsums(as.matrix(DFcars[, colnms]))
(as.numeric(DFcars$new_col))
},
DT_rowSums =
{
DT_cars[, new_col := rowSums(.SD), .SDcols = colnms]
(as.numeric(DT_cars$new_col))
},
DT_Reduce =
{
DT_cars[, new_col := Reduce('+', .SD), .SDcols = colnms]
(as.numeric(DT_cars$new_col))
},
tidy_rowSums =
{
DFcars2 = DFcars2 %>% mutate(new_col = select(., colnms) %>% rowSums())
(as.numeric(DFcars2$new_col))
},
tidy_Reduce =
{
DFcars2 = DFcars2 %>% mutate(new_col = select(., colnms) %>% Reduce('+', .))
(as.numeric(DFcars2$new_col))
},
check = 'equivalent'
)

Related

How to Sum Specific Columns of Dataframe in r? [duplicate]

dplyr::mutate_at() relying on multiple columns with a given prefix/suffix

dplyr::mutate_at() can be used to apply the same function to multiple columns. It also allows you to set the results in new columns using a named list.
However, what if I have many columns in pairs (say, data1_a, data1_b, data2_a, data2_b, ...) and I want to multiply those pairs together? Is that possible?
By hand, that would look like
suppressPackageStartupMessages({
library(dplyr)
})
data.frame(data1_a = 1:3, data1_b = 2:4,
data2_a = 3:5, data2_b = 4:6) %>%
mutate(
data1 = data1_a * data1_b,
data2 = data2_a * data2_b
)
#> data1_a data1_b data2_a data2_b data1 data2
#> 1 1 2 3 4 2 12
#> 2 2 3 4 5 6 20
#> 3 3 4 5 6 12 30
My current solution is to write a function which takes the unsuffixed variable name (i.e. "data1"), creates the suffixed names and then performs a simple mutate() on that variable using get(). I then call that function for each output:
foo <- function(df, name) {
a <- paste0(name, "_a")
b <- paste0(name, "_b")
return(
mutate(
df,
!!name := get(a) * get(b)
)
)
}
data.frame(data1_a = 1:3, data1_b = 2:4,
data2_a = 3:5, data2_b = 4:6) %>%
foo("data1") %>%
foo("data2")
#> data1_a data1_b data2_a data2_b data1 data2
#> 1 1 2 3 4 2 12
#> 2 2 3 4 5 6 20
#> 3 3 4 5 6 12 30
(or write a loop over all the variable names if there were more of them)
But if it's possible to use mutate_at or something of the sort, that'd be much cleaner.
We can use pivot_longer/pivot_wider
library(dplyr)
library(tidyr)
df1 %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, names_to = c('grp', '.value'),
names_sep = "_") %>%
group_by(grp) %>%
transmute(rn, new = a * b) %>%
pivot_wider(names_from = grp, values_from = new) %>%
select(-rn) %>%
bind_cols(df1, .)
# A tibble: 3 x 6
# data1_a data1_b data2_a data2_b data1 data2
# <int> <int> <int> <int> <int> <int>
#1 1 2 3 4 2 12
#2 2 3 4 5 6 20
#3 3 4 5 6 12 30
Or another option is to split into a list based on the column names and then do the *
library(purrr)
library(stringr)
df1 %>%
split.default(str_remove(names(.), "_.*")) %>%
map_dfr(reduce, `*`) %>%
bind_cols(df1, .)
# A tibble: 3 x 6
# data1_a data1_b data2_a data2_b data1 data2
# <int> <int> <int> <int> <int> <int>
#1 1 2 3 4 2 12
#2 2 3 4 5 6 20
#3 3 4 5 6 12 30
With mutate, it is possible, but it would be more manual
df1 %>%
mutate(data1 = select(., starts_with('data1')) %>%
reduce(`*`),
data2 = select(., starts_with('data2')) %>%
reduce(`*`))
data
df1 <- data.frame(data1_a = 1:3, data1_b = 2:4,
data2_a = 3:5, data2_b = 4:6)
After adopting @akrun's elegant solution, I noticed it was unfortunately very inefficient (since it has to recreate two data frames), taking almost a second on a dataset with 20,000 rows and 11 "groups".
So a while ago I developed the following function (with a bit of help from @user12728748... sorry for not posting here sooner), which takes the names of the groups ("data1", "data2", etc.) and a formula using the prefixes, allowing for bquote-style quoting for constant names:
suppressPackageStartupMessages(library(dplyr))
mutateSet <- function(df, colNames, formula,
isPrefix = TRUE,
separator = "_") {
vars <- all.vars(formula)
# extracts names wrapped in `.()`
escapedNames <- function (expr)
{
unquote <- function(e) {
if (is.pairlist(e) || length(e) <= 1L) NULL
else if (e[[1L]] == as.name(".")) deparse(e[[2L]])
else unlist(sapply(e, unquote))
}
unquote(substitute(expr))
}
escapedVars <- eval(rlang::expr(escapedNames(!!formula)))
# remove escaped names from mapping variables
vars <- setdiff(vars, escapedVars)
# get output prefix/suffix as string
lhs <- rlang::f_lhs(formula) %>%
all.vars()
# get operation as string
# deparse() can have line breaks; paste0() brings it back to one line
rhs <- rlang::f_rhs(formula) %>%
deparse() %>%
paste0(collapse = "")
# dummy function to cover for bquote escaping
. <- function(x) x
for (i in colNames) {
if (isPrefix) {
aliases <- paste0(vars, separator, i)
newCol <- paste0(lhs, separator, i)
} else {
aliases <- paste0(i, separator, vars)
newCol <- paste0(i, separator, lhs)
}
if (length(lhs) == 0) newCol <- i
mapping <- rlang::list2(!!!aliases)
names(mapping) <- vars
mapping <- do.call(wrapr::qc, mapping)
df <- rlang::expr(wrapr::let(
mapping,
df %>% dplyr::mutate(!!newCol := ...RHS...)
)) %>%
deparse() %>%
gsub(
pattern = "...RHS...",
replacement = rhs
) %>%
{eval(parse(text = .))}
}
return(df)
}
df <- data.frame(a_data1 = 1:3, b_data1 = 2:4,
a_data2 = 3:5, b_data2 = 4:6,
static = 5:7)
mutateSet(df, "data1", ~ a + b)
#> a_data1 b_data1 a_data2 b_data2 static data1
#> 1 1 2 3 4 5 3
#> 2 2 3 4 5 6 5
#> 3 3 4 5 6 7 7
mutateSet(df, c("data1", "data2"), x ~ sqrt(a) + b)
#> a_data1 b_data1 a_data2 b_data2 static x_data1 x_data2
#> 1 1 2 3 4 5 3.000000 5.732051
#> 2 2 3 4 5 6 4.414214 7.000000
#> 3 3 4 5 6 7 5.732051 8.236068
mutateSet(df, c("data1", "data2"), ~ a + b + .(static))
#> a_data1 b_data1 a_data2 b_data2 static data1 data2
#> 1 1 2 3 4 5 8 12
#> 2 2 3 4 5 6 11 15
#> 3 3 4 5 6 7 14 18
Created on 2020-04-28 by the reprex package (v0.3.0)
This can probably be cleaned up (especially that heinous for-loop), but it works for now.
Repeating @user12728748's performance test, we see this is ~100x faster:
suppressPackageStartupMessages({
invisible(lapply(c("dplyr", "tidyr", "rlang", "wrapr", "microbenchmark"),
require, character.only = TRUE))
})
polymutate <- function(df, formula) {
form <- rlang::f_rhs(formula)
df %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, names_to = c('.value', 'grp'),
names_sep = "_") %>%
group_by(grp) %>%
transmute(rn, new = eval(form)) %>%
pivot_wider(names_from = grp, values_from = new) %>%
select(-rn) %>%
bind_cols(df, .)
}
set.seed(1)
df <- setNames(data.frame(matrix(sample(1:12, 6E6, replace=TRUE), ncol=6)),
c("a_data1", "b_data1", "a_data2", "b_data2", "a_data3", "b_data3"))
pd <- polymutate(df, ~ a + b)
pd2 <- mutateSet(df, c("data1", "data2", "data3"), ~ a + b)
all.equal(pd, pd2)
#> [1] TRUE
microbenchmark(polymutate(df, ~ a + b),
mutateSet(df, c("data1", "data2", "data3"), ~ a + b),
times=10L)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> polymutate 1612.306 1628.9776 1690.78586 1670.15600 1741.3490 1806.1412 10
#> mutateSet 8.757 9.6302 13.27135 10.45965 19.2976 20.4657 10
This is now possible using the cur_column() function within across().
library(tidyverse)
dat <- data.frame(
data1_a = 1:3,
data1_b = 2:4,
data2_a = 3:5,
data2_b = 4:6
)
mutate(
dat,
across(ends_with("a"), ~ . * dat[[str_replace(cur_column(), "a$", "b")]],
.names = "updated_{col}")
)
Returns:
data1_a data1_b data2_a data2_b updated_data1_a updated_data2_a
1 1 2 3 4 2 12
2 2 3 4 5 6 20
3 3 4 5 6 12 30
Where updated_data1_a and updated_data2_a contain the desired output variables.
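If you would rather have the products named data1 and data2 directly, one option (my addition, assuming stringr is attached via library(tidyverse)) is to rename the new columns afterwards:
mutate(
dat,
across(ends_with("a"), ~ . * dat[[str_replace(cur_column(), "a$", "b")]],
.names = "updated_{col}")
) %>%
rename_with(~ str_remove(str_remove(., "^updated_"), "_a$"), starts_with("updated_"))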

Check string pattern for non-unique characters

I've a data frame with two columns: id and gradelist.
The value in the gradelist column holds a list of grades (separated by ;) of varying length.
Here's the data:
id <- seq(1,7)
gradelist <- c("a;b;b",
"c;c",
"d;d;d;f",
"f;f;f;f;f;f",
"a;a;a;a",
"f;b;b;b;b;b;b;b",
"c;c;d;d;a;a")
df <- data.frame(id, gradelist)
df$gradelist <- as.character(df$gradelist)
I need to add another column to check whether all grades are the same for each id.
The output would look like:
  id       gradelist same
1  1           a;b;b   no
2  2             c;c  yes
3  3         d;d;d;f   no
4  4     f;f;f;f;f;f  yes
5  5         a;a;a;a  yes
6  6 f;b;b;b;b;b;b;b   no
7  7     c;c;d;d;a;a   no
We can extract the characters and use n_distinct to check whether the number of distinct elements is 1
library(dplyr)
library(purrr)
df %>%
mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"),
~ c("no", "yes")[1+(n_distinct(.x)==1)]))
# id gradelist same
#1 1 a;b;b no
#2 2 c;c yes
#3 3 d;d;d;f no
#4 4 f;f;f;f;f;f yes
#5 5 a;a;a;a yes
#6 6 f;b;b;b;b;b;b;b no
#7 7 c;c;d;d;a;a no
Or make use of case_when
df %>%
mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), ~
case_when(n_distinct(.x) == 1 ~ "yes", TRUE ~ "no")))
Or another option is separate_rows on the 'gradelist' to expand the data, find the n_distinct
library(tidyr)
df %>%
separate_rows(gradelist) %>%
distinct %>%
group_by(id) %>%
summarise(same = c("no", "yes")[1 + (n_distinct(gradelist) == 1)]) %>%
left_join(df)
Check which character comes first and remove every occurrence of that character (along with the semicolons); if nothing is left, all grades are the same.
sapply(df$gradelist, function(x) {
nchar(gsub(paste0(substring(x, 1, 1), "|;"), "", x)) == 0
}, USE.NAMES = FALSE)
#[1] FALSE TRUE FALSE TRUE TRUE FALSE FALSE
df$same <- factor(unlist(lapply(strsplit(df$gradelist, ";"), function(x)
length(unique(x))))==1, labels=c("No", "Yes"))
df
#> id gradelist same
#> 1 1 a;b;b No
#> 2 2 c;c Yes
#> 3 3 d;d;d;f No
#> 4 4 f;f;f;f;f;f Yes
#> 5 5 a;a;a;a Yes
#> 6 6 f;b;b;b;b;b;b;b No
#> 7 7 c;c;d;d;a;a No
Here are some base R solutions.
define your custom function f, i.e.,
f <- Vectorize(function(x) ifelse(length(unique(unlist(strsplit(x,";"))))==1,"yes","no"))
and then you can add column same by
df$same <- f(df$gradelist)
use regmatches + sapply
df <- within(df,same <- sapply(regmatches(gradelist,gregexpr("\\w",gradelist)),function(x) ifelse(length(unique(x))==1,"yes","no")))
such that
> df
id gradelist same
1 1 a;b;b no
2 2 c;c yes
3 3 d;d;d;f no
4 4 f;f;f;f;f;f yes
5 5 a;a;a;a yes
6 6 f;b;b;b;b;b;b;b no
7 7 c;c;d;d;a;a no
Try this (the regex ^(.)\1*$ matches strings whose first character repeats to the end, once the semicolons are stripped):
transform(df, same = c('No', 'Yes')[grepl("^(.)\\1*$", gsub(';', '', gradelist)) + 1])
Output:
id gradelist same
1 1 a;b;b No
2 2 c;c Yes
3 3 d;d;d;f No
4 4 f;f;f;f;f;f Yes
5 5 a;a;a;a Yes
6 6 f;b;b;b;b;b;b;b No
7 7 c;c;d;d;a;a No
You can also go the strsplit way as follows:
transform(df, same = c('No', 'Yes')[sapply(strsplit(gradelist, split = ';'), function(x) length(unique(unlist(x))) == 1) + 1])
Benchmark
We repeat the string a few times. We also repeat the rows of df so that we end up with slightly more than 100k rows, and assign the function used by @ThomasIsCoding.
df$gradelist <- sapply(df$gradelist, function(x) paste(replicate(20, x), collapse = ";"))
df <- df[rep(seq_len(nrow(df)), each = 15000), ]
f <- Vectorize(function(x) ifelse(length(unique(unlist(strsplit(x,";"))))==1,"yes","no"))
We use transform for all the base R functions to emulate the behaviour of mutate in the tidy solutions, and run microbenchmark 10 times:
mBench <- microbenchmark::microbenchmark(
akrun1 = { df %>% mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"),
~ c("no", "yes")[1+(n_distinct(.x)==1)])) },
akrun2 = { df %>% mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), ~
case_when(n_distinct(.x) == 1 ~ "yes", TRUE ~ "no"))) },
akrun3 = { df %>%
separate_rows(gradelist) %>%
distinct %>%
group_by(id) %>%
summarise(same = c("no", "yes")[1 + (n_distinct(gradelist) == 1)]) %>%
left_join(df) },
db = { transform(df, same = sapply(gradelist, function(x) {
nchar(gsub(paste0(substring(x, 1, 1), "|;"), "", x)) == 0}, USE.NAMES = FALSE)) },
`M--` = { transform(df, same = factor(unlist(lapply(strsplit(gradelist, ";"), function(x) length(unique(x))))==1, labels=c("No", "Yes"))) },
ThomasIsCoding1 = { transform(df, same = f(gradelist)) },
ThomasIsCoding2 = { transform(df, same = sapply(regmatches(df$gradelist,gregexpr("\\w",df$gradelist)),function(x) ifelse(length(unique(x))==1,"yes","no"))) },
arg0naut91_1 = { transform(df, same = c('No', 'Yes')[grepl("^(.)\\1*$", gsub(';', '', df$gradelist)) + 1]) },
arg0naut91_2 = { transform(df, same = c('No', 'Yes')[sapply(strsplit(df$gradelist, split = ';'), function(x) length(unique(unlist(x))) == 1) + 1]) },
times = 10
)
Results:
Unit: seconds
expr min lq mean median uq max neval
akrun1 19.684781 19.912789 21.084244 20.646490 21.606763 24.008420 10
akrun2 30.393006 31.066965 32.590679 31.824528 33.567449 37.780535 10
akrun3 6.378463 7.190472 7.379439 7.373730 7.704365 8.321929 10
db 3.738271 3.785858 3.935769 3.911479 3.926385 4.523876 10
M-- 3.551592 3.648720 3.723315 3.741075 3.798664 3.915588 10
ThomasIsCoding1 4.453528 4.498858 4.702160 4.613088 4.823517 5.379984 10
ThomasIsCoding2 3.368358 3.532593 3.752111 3.610664 3.773345 4.969414 10
arg0naut91_1 1.638212 1.683986 1.699327 1.704614 1.716077 1.759059 10
arg0naut91_2 3.665604 3.739662 3.774542 3.750144 3.774753 4.071887 10

simplifying R code using dplyr (or other) to rowSums while ignoring NA, unless all is NA

I initially solved my NA issue with the help of this question. However, I would like to simplify my code. In the past, I've enjoyed the way dplyr has helped me simplify R code.
Below is a minimal working example illustrating my current solution and where I am at with dplyr.
I have data like this,
dta <- data.frame(foo=c(1,NA,3,4), fooZ=c(4,NA,5,NA), fooQ2=c(7,0,9,NA))
I need to sum across quite a few columns, treating 0 as a regular value to be summed and keeping NA for rows that are all NA. Like this,
dta$sum1 <- rowSums(dta[, c('fooZ', 'fooQ2') ], na.rm=TRUE) * ifelse(
rowSums(is.na(dta[, c('fooZ', 'fooQ2') ])) ==
ncol(dta[, c('fooZ', 'fooQ2') ]), NA, 1)
dta
# > foo fooZ fooQ2 sum1
# > 1 1 4 7 11
# > 2 NA NA 0 0
# > 3 3 5 9 14
# > 4 4 NA NA NA
This does the trick and creates sum1, but I have to repeat the reference to the data three times. Can I simplify this in some handy way? I've written the code below using dplyr, but maybe there's a better way of summarizing rows that keeps NA for rows that are all NA, ignores NAs in rows with one or more values, and treats 0 as a value to be summed.
# install.packages(c("dplyr", "ggplot2"), dependencies = TRUE)
require(dplyr)
dta$sum2 = dta %>% select(fooZ, fooQ2) %>% rowSums(., na.rm = TRUE)
dta
# > foo fooZ fooQ2 sum1 sum2
# > 1 1 4 7 11 11
# > 2 NA NA 0 0 0
# > 3 3 5 9 14 14
# > 4 4 NA NA NA 0
This creates sum2, but it generates a 0 if na.rm = TRUE and too many NAs if na.rm = FALSE.
Update
I made this somewhat elaborate micro-benchmark comparison of the different answers. Please do not hesitate to optimize any of the functions; writing R functions is not my forte. Regardless,
set.seed(667)
n <- 1e5+22
dta <- data.frame(
foo = sample(c(1:10, NA), n, replace = TRUE),
fooZ = sample(c(1:10, NA), n, replace = TRUE),
fooQ2 = sample(c(1:10, NA), n, replace = TRUE))
slice <- c(902:907,979:984)
dta[slice,]
#> foo fooZ fooQ2
#> 902 10 7 2
#> 903 10 10 9
#> 904 NA NA 8
#> 905 6 4 3
#> 906 8 9 10
#> 907 1 5 NA
#> 979 NA 1 1
#> 980 10 2 NA
#> 981 7 NA NA
#> 982 3 7 7
#> 983 NA 9 6
#> 984 7 10 7
# `baseline' solution
baseline <- function(z, ...) {W <- z[, c(...)]; W <- rowSums(W, na.rm=TRUE) * ifelse(rowSums(is.na(W)) == ncol(W), NA, 1); W}
# install.packages(c("dplyr", "ggplot2"), dependencies = TRUE)
require(dplyr)
# G. G.Gro's dplyr solution
G.Gro_dplyr1 <- function(z, ...) z %>% mutate(sum2 = select(., ...) %>% { rowSums(., na.rm = TRUE) + ifelse(apply(is.na(.), 1, all), NA, 0) })
# G. G.Gro's Variation 1a solution
G.Gro_dplyr1a <- function(z, ...) z %>% mutate(sum2 = select(., fooZ, fooQ2) %>% apply(1, . %>% { sum(., na.rm = TRUE) + if (all(is.na(.))) NA else 0}))
# G. G.Gro's base solution
G.Gro_base <- function(z, ...) {X <- z[, c(...)]; rowSums(X, na.rm = TRUE) + ifelse(apply(is.na(X), 1, all), NA, 0)}
# Thierry's solution
Thierry_my_sum <- function(z, ...){z <- select(z, ...); sums <- rowSums(z, na.rm = TRUE); sums[apply(is.na(z), 1, all)] <- NA; sums}
# lmo's solution
lmo <- function(z, ...) {W <- z[, c(...)]; rowSums(W, na.rm=TRUE) * (NA^(rowSums(is.na(W)) == ncol(W)))}
# Benjamin's solution
Benjamin <- function(..., na.rm = FALSE, all.na = NA){v <- list(...); all_na <- lapply(v, is.na); all_na <- Reduce(`&`, all_na); all_na; if (na.rm){v <- lapply(v, function(x) {x[is.na(x)] <- 0; x}); }; v <- Reduce(`+`, v); v[all_na] <- all.na; v;}
# Aramis7d's solution
Aramis7d <- function(z, ...) {z %>% select(...) %>% mutate(sum = rowSums(., na.rm=TRUE)) %>% mutate(s2 = rowSums(is.na(.))) %>% mutate(sum = if_else(s2 < 2, sum, as.double(NA))) %>% select(sum) }
# Fail's solution combining from all
Fail <- function(z, ...){z <- select(z, ...); zTF <- rowMeans(is.na(z)) == 1; replace(rowSums(z, na.rm = TRUE), zTF, NA)}
# install.packages("microbenchmark", dependencies = TRUE)
require(microbenchmark)
# run test
res <- microbenchmark(
baseline(dta, c("fooZ", "fooQ2")),
Thierry_my_sum(dta, fooZ, fooQ2),
G.Gro_dplyr1(dta, fooZ, fooQ2)[,ncol(dta)+1],
G.Gro_dplyr1a(dta, fooZ, fooQ2)[, ncol(dta) + 1],
G.Gro_base(dta, c("fooZ", "fooQ2")),
(dta %>% mutate(sum99 = Benjamin(fooZ, fooQ2, na.rm = TRUE)))[,ncol(dta)+1],
lmo(dta, c("fooZ", "fooQ2")),
Aramis7d(dta, fooZ, fooQ2)[,1],
Fail(dta, fooZ, fooQ2),
times = 25)
# clean up
levels(res[[1]]) <- c('baseline', 'Thierry', 'G.Gro1', 'G.Gro1a', 'G.Gro2', 'Benjamin', 'lmo', 'Aramis7d', 'Fail')
## Print results:
print(res)
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> baseline 12.729803 15.691060 31.141114 23.299101 48.694436 72.83702 25 a
#> Thierry 215.541035 241.795764 298.319826 263.822553 363.066476 494.90875 25 b
#> G.Gro1 226.761181 242.617099 295.413437 264.911513 307.339115 591.28424 25 b
#> G.Gro1a 935.176542 985.329298 1088.300741 997.788858 1030.085839 1736.51506 25 c
#> G.Gro2 219.650080 227.464694 292.898566 246.188189 320.789036 505.08154 25 b
#> Benjamin 6.227054 9.327364 15.583907 11.230079 14.345366 55.44653 25 a
#> lmo 4.138434 5.970850 9.329506 6.851132 8.406799 39.40295 25 a
#> Aramis7d 33.966101 38.737671 60.777304 66.663967 72.686939 100.72799 25 a
#> Fail 11.464254 13.932386 20.476011 14.865245 25.156740 58.37730 25 a
### Plot results:
boxplot(res)
Here's a base R trick using exponentiation of NA:
rowSums(dta[-1], na.rm=TRUE) * (NA^(rowSums(is.na(dta[-1])) == ncol(dta[-1])))
[1] 11  0 14 NA
Any number to the 0th power is 1, so any rows that contain a non-NA value return a 1 in the second term. Otherwise, NA is returned.
This assumes that you only want to take into account variables other than your first variable.
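A quick illustration of the exponentiation trick (my addition, not from the original answer):
NA^0 # [1] 1   (at least one non-NA in the row, so the sum is kept)
NA^1 # [1] NA  (row is all NA, so the result becomes NA)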
Combining the improvements the OP made to the code above with an additional step, we could improve the efficiency with
rowSumsNA <- function(dat, ...) {
W <- data.matrix(dat[...])
rowSums(W, na.rm=TRUE) * (NA^(rowSums(is.na(W)) == ncol(W)))
}
Most of the improvements are in the OP's method of storing the subset data.frame prior to the calculation (127ms vs 84ms on my machine), but a slight additional improvement can be had by converting that data.frame to a matrix prior to calling rowSums (84ms vs 77ms on my machine).
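For example, a hypothetical call matching the question's columns:
dta$sum1 <- rowSumsNA(dta, c("fooZ", "fooQ2"))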
Here is a simple dplyr solution
library(dplyr)
dta <- data.frame(foo=c(1,NA,3,4), fooZ=c(4,NA,5,NA), fooQ2=c(7,0,9,NA))
my_sum <- function(z, ...){
z <- select(z, ...)
sums <- rowSums(z, na.rm = TRUE)
sums[apply(is.na(z), 1, all)] <- NA
sums
}
dta %>%
mutate(
sum1 = my_sum(., fooZ, fooQ2),
sum2 = my_sum(., foo, fooQ2),
sum3 = my_sum(., foo, fooZ)
)
1) dplyr This computes the row sums and then adds on NA or 0 depending on whether the entire row is NA or not.
dta %>%
mutate(sum2 = select(., fooZ, fooQ2) %>%
{ rowSums(., na.rm = TRUE) + ifelse(apply(is.na(.), 1, all), NA, 0) })
giving:
foo fooZ fooQ2 sum2
1 1 4 7 11
2 NA NA 0 0
3 3 5 9 14
4 4 NA NA NA
1a) Variation A variation of (1) is:
dta %>%
mutate(sum2 = select(., fooZ, fooQ2) %>%
apply(1, . %>% { sum(., na.rm = TRUE) + if (all(is.na(.))) NA else 0}))
2) base Using no packages we can do this:
transform(dta, sum2 = {
X <- data.frame(fooZ, fooQ2)
rowSums(X, na.rm = TRUE) + ifelse(apply(is.na(X), 1, all), NA, 0)
})
3) data.table
library(data.table)
DT <- as.data.table(dta)
DT[, sum2 := rowSums(.SD, na.rm = TRUE) + ifelse(apply(is.na(.SD), 1, all), NA, 0) , .SDcols = c("fooZ", "fooQ2")]
Update: Moved select inside mutate to preserve foo column. Added additional solutions.
Not as elegant as the other solutions, but it avoids having to drop variables from the data frame and then rejoin. So this is good if you're interested in keeping your data frame intact. It will lose its advantage if you have a lot of variables to include.
dta %>%
mutate(all_na = Reduce(`&`, lapply(list(fooZ, fooQ2), is.na)),
sum1 = Reduce(`+`, lapply(list(fooZ, fooQ2), function(x) {x[is.na(x)] <- 0; x})),
sum1 = ifelse(all_na, NA, sum1)) %>%
select(-all_na)
Alternatively, you can bundle it into a function:
rsum <- function(..., na.rm = FALSE, all.na = NA){
v <- list(...)
all_na <- lapply(v, is.na)
all_na <- Reduce(`&`, all_na)
all_na
if (na.rm){
v <- lapply(v, function(x) {x[is.na(x)] <- 0; x})
}
v <- Reduce(`+`, v)
v[all_na] <- all.na
v
}
dta %>%
mutate(sum1 = rsum(fooZ, fooQ2, na.rm = TRUE))
alternately, using dplyr, you can try something like:
dta %>%
select(-foo) %>%
mutate(sum1 = rowSums(., na.rm=TRUE)) %>%
mutate(s2 = rowSums(is.na(.))) %>%
mutate(sum1 = if_else(s2 < 2, sum1, as.double(NA))) %>%
bind_cols(dta) %>%
select(foo, fooZ, fooQ2, sum1)
which gives:
foo fooZ fooQ2 sum1
1 1 4 7 11
2 NA NA 0 0
3 3 5 9 14
4 4 NA NA NA
In case you don't really care about retaining the column foo, you can get rid of the bind_cols() function call.

r: group, remove columns, and sum

I've had some trouble with a large data.frame. I need to sum each column within groups, but only when that group's column contains no 0s (is "complete"); a group's column that contains a 0 should come out as NA.
Here is an example where I need to group and sum each column; however, I cannot figure out how to work complete.cases into a dplyr pipeline.
df <- data.frame(ca = c("a","b","a","c","b"),
f = c(3,4,0,2,3),
f2 = c(2,5,6,1,9),
f3 = c(3,0,6,3,0))
What the outcome should look like
ca f f2 f3
1 a NA 8 9
2 b 7 14 NA
3 c 2 1 3
This works to sum each group
df2 <- df %>%
arrange(ca) %>%
group_by(ca) %>%
summarize_at(.cols=vars(starts_with("f")),
.funs=funs("sum"))
Here is what I cannot get to work, but it seems like what I should be working towards
df2 <- df %>%
arrange(ca) %>%
group_by(ca) %>%
summarize_(funs_(sum(complete.cases(.),na.rm=T)))
Maybe I need a summarize_if; any help would be greatly appreciated.
If one column is grouped, the *_all functions will operate on all the non-grouping columns. You can use na_if to insert NAs for a particular value, which makes the whole process fairly simple:
df %>% mutate_all(funs(na_if(., 0L))) %>%
group_by(ca) %>%
summarise_all(sum)
## # A tibble: 3 × 4
## ca f f2 f3
## <fctr> <dbl> <dbl> <dbl>
## 1 a NA 8 9
## 2 b 7 14 NA
## 3 c 2 1 3
or combine the two calls, if you like:
df %>% group_by(ca) %>% summarise_all(funs(sum(na_if(., 0L))))
which returns the same thing.
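In current dplyr, where funs() and the _all verbs are superseded, the same one-liner can be written with across(); a sketch of mine, not part of the original answer:
df %>% group_by(ca) %>% summarise(across(everything(), ~ sum(na_if(.x, 0))))
Because na.rm defaults to FALSE, any group whose column contained a 0 sums to NA, as desired.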
Benchmarks
Per the comments, benchmarks on 10000 rows and 100 non-grouping columns. Very wide data (more than 1000 columns) does not fare well with either method, but if you gather to long and group by the former variable names, it's tolerable.
library(tidyr)
set.seed(47)
df <- data.frame(ca = sample(letters[1:3], 10000, replace = TRUE),
replicate(100, rpois(100, 10)))
microbenchmark::microbenchmark(
'two stp' = {
df %>% mutate_all(funs(na_if(., 0L))) %>%
group_by(ca) %>% summarise_all(sum)
}, 'one stp' = {
df %>% group_by(ca) %>% summarise_all(funs(sum(na_if(., 0L))))
}, 'two stp, reshape' = {
df %>% gather(var, val, -ca) %>%
mutate(val = na_if(val, 0L)) %>%
group_by(ca, var) %>% summarise(val = sum(val)) %>%
spread(var, val)
}, 'one stp, reshape' = {
df %>% gather(var, val, -ca) %>%
group_by(ca, var) %>% summarise(val = sum(na_if(val, 0L))) %>%
spread(var, val)
})
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## two stp 311.36733 330.23884 347.77353 340.98458 354.21105 548.4810 100 c
## one stp 299.90327 317.38300 329.78662 326.66370 341.09945 385.1589 100 b
## two stp, reshape 61.72992 67.78778 85.94939 73.37648 81.04525 300.5608 100 a
## one stp, reshape 70.95492 77.76685 90.53199 83.33557 90.14023 297.8924 100 a
Using data.tables via dtplyr is much faster. If you don't mind learning another grammar, writing in data.table is faster yet (h/t @docendodiscimus for replace). Reshaping results in worse times here, at least with tidyr functions, though with data.table::melt and dcast it still may be a good option for extremely wide data.
library(data.table)
library(dtplyr)
set.seed(47)
df <- data.frame(ca = sample(letters[1:3], 10000, replace = TRUE),
replicate(100, rpois(10000, 10)))
setDT(df)
microbenchmark::microbenchmark(
'dtplyr 2 stp' = {
df %>% mutate_all(funs(na_if(., 0L))) %>%
group_by(ca) %>%
summarise_all(sum)
}, 'dtplyr 1 stp' = {
df %>% group_by(ca) %>%
summarise_all(funs(sum(na_if(., 0L))))
}, 'dt + na_if 2 stp' = {
df[, lapply(.SD, function(x){na_if(x, 0L)})][, lapply(.SD, sum), by = ca]
}, 'dt + na_if 1 stp' = {
df[, lapply(.SD, function(x){sum(na_if(x, 0L))}), by = ca]
}, 'pure dt 2 stp' = {
df[, lapply(.SD, function(x){replace(x, x == 0L, NA)})][, lapply(.SD, sum), by = ca]
}, 'pure dt 1 stp' = {
df[, lapply(.SD, function(x){sum(replace(x, x == 0L, NA))}), by = ca]
})
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## dtplyr 2 stp 121.31556 130.88189 143.39661 138.32966 146.39086 355.24750 100 c
## dtplyr 1 stp 28.30813 31.03421 36.94506 33.28435 43.46300 55.36789 100 b
## dt + na_if 2 stp 27.03971 29.04306 34.06559 31.20259 36.95895 53.66865 100 b
## dt + na_if 1 stp 10.50404 12.64638 16.10507 13.43007 15.18257 34.37919 100 a
## pure dt 2 stp 27.15501 28.91975 35.07725 30.28981 33.03950 238.66445 100 b
## pure dt 1 stp 10.49617 12.09324 16.31069 12.84595 20.03662 34.44306 100 a
One way to go in base R is to fill the 0s in as NA and then use aggregate.
# fill 0s as NAs
is.na(df) <- df == 0
aggregate(cbind(f=df$f,f2=df$f2,f3=df$f3), df["ca"], sum)
ca f f2 f3
1 a NA 8 9
2 b 7 14 NA
3 c 2 1 3
Note: Using the formula interface to aggregate may produce an unexpected result.
aggregate(.~ca, data=df, sum)
ca f f2 f3
1 a 3 2 3
2 c 2 1 3
The "b" category drops out and the value for a in variable f is 3, not NA. The specification in the help file indicates that na.action is set to na.omit, which drops NA values from computation. To get the formula interface to work as desired, change this value to na.pass.
aggregate(.~ca, data=df, sum, na.action=na.pass)
ca f f2 f3
1 a NA 8 9
2 b 7 14 NA
3 c 2 1 3
