dplyr conditional summation - r

I have the following data frame:
set.seed(42)
df <- data_frame(x = sample(0:100, 50, replace = T),
y = sample(c(T, F), 50, replace = T))
I would like to create a third column z that will be the sum of column x, but only if there are more than 3 trues in a row in column y.
Is there a vectorized way to do it with dplyr? I don't even know how to approach this.

We create a grouping variable with rleid from data.table and get the sum of 'x' if there are more than 3 elements (n() >3) and if all the elements in 'y' are TRUE or else return NA
library(dplyr)
library(data.table)
df %>%
group_by(grp = rleid(y)) %>%
mutate(Sum = if(n() > 3 & all(y)) sum(x) else NA_integer_) %>%
ungroup %>%
select(-grp)
It can be also done with data.table
library(data.table)
setDT(df)[, Sum := sum(x) * NA^(!((.N > 3) & all(y))), .(grp = rleid(y))]

The question did not specify what values to use if there are not 3 TRUE values so we will use 0.
library(dplyr)
library(zoo)
sum3 <- function(z) all(z[, "y"]) * sum(z[, "x"])
df %>% mutate(sum = rollapplyr(df, 3, sum3, by.column = FALSE, fill = 0))
giving:
# A tibble: 50 x 3
x y sum
<int> <lgl> <int>
1 92 TRUE 0
2 94 TRUE 0
3 28 TRUE 214
4 83 FALSE 0
5 64 TRUE 0
6 52 FALSE 0
7 74 FALSE 0
8 13 TRUE 0
9 66 TRUE 0
10 71 FALSE 0
# ... with 40 more rows

Related

How to Sum Specific Columns of Dataframe in r? [duplicate]

I have the following condensed data set:
a<-as.data.frame(c(2000:2005))
a$Col1<-c(1:6)
a$Col2<-seq(2,12,2)
colnames(a)<-c("year","Col1","Col2")
for (i in 1:2){
a[[paste("Var_", i, sep="")]]<-i*a[[paste("Col", i, sep="")]]
}
I would like to sum the columns Var1 and Var2, which I use:
a$sum<-a$Var_1 + a$Var_2
In reality my data set is much larger - I would like to sum from Var_1 to Var_n (n can be upto 20). There must be a more efficient way to do this than:
a$sum<-a$Var_1 + ... + a$Var_n
Here's a solution using the tidyverse. You can extend it to as many columns as you like using the select() function to select the appropriate columns within a mutate().
library(tidyverse)
a<-as.data.frame(c(2000:2005))
a$Col1<-c(1:6)
a$Col2<-seq(2,12,2)
colnames(a)<-c("year","Col1","Col2")
for (i in 1:2){
a[[paste("Var_", i, sep="")]]<-i*a[[paste("Col", i, sep="")]]
}
a
#> year Col1 Col2 Var_1 Var_2
#> 1 2000 1 2 1 4
#> 2 2001 2 4 2 8
#> 3 2002 3 6 3 12
#> 4 2003 4 8 4 16
#> 5 2004 5 10 5 20
#> 6 2005 6 12 6 24
# Tidyverse solution
a %>%
mutate(Total = select(., Var_1:Var_2) %>% rowSums(na.rm = TRUE))
#> year Col1 Col2 Var_1 Var_2 Total
#> 1 2000 1 2 1 4 5
#> 2 2001 2 4 2 8 10
#> 3 2002 3 6 3 12 15
#> 4 2003 4 8 4 16 20
#> 5 2004 5 10 5 20 25
#> 6 2005 6 12 6 24 30
Created on 2019-01-01 by the reprex package (v0.2.1)
You can use colSums(a[,c("Var1", "Var2")]) or rowSums(a[,c("Var_1", "Var_2")]). In your case you want the latter.
with dplyr you can use
a %>%
rowwise() %>%
mutate(sum = sum(Col1,Col1, na.rm = T))
or more efficiently
a %>%
rowwise() %>%
mutate(sum = sum(across(starts_with("Col")), na.rm = T))
If you're working with a very large dataset, rowSums can be slow.
An alternative is the rowsums function from the Rfast package. This requires you to convert your data to a matrix in the process and use column indices rather than names. Here's an example based on your code:
## load Rfast
library(Rfast)
## create dataset
a <- as.data.frame(c(2000:2005))
a$Col1 <- c(1:6)
a$Col2 <- seq(2,12,2)
colnames(a) <- c("year","Col1","Col2")
for (i in 1:2){
a[[paste("Var_", i, sep="")]] <- i*a[[paste("Col", i, sep="")]]
}
## get column indices based on names
col_st <- grep("Var_1", colnames(a)) # index of "Var_1" col
col_en <- grep("Var_2", colnames(a)) # index of "Var_2" col
cols <- c(col_st:col_en) # indices of all cols from "Var_1" to "Var_2"
## sum rows 4 to 5
a$Total <- rowsums(as.matrix(a[,cols]))
You can use this:
library(dplyr)
a$Sum <- apply(a[,select(a, starts_with("Var_"))], 1, sum)
In Base R:
You could simply just use sapply:
sapply(unique(sub(".$", "", colnames(a))), function(x) rowSums(a[startsWith(colnames(a), x)]))
This is very reliable, it works for anything.
Benchmarking seems to show that plain Reduce('+', ...) is the fastest. Libraries just make it (at least slightly) slower, at least for mtcars, even if I expand it to be huge.
Unit: milliseconds
expr min lq mean median uq max
rowSums 8.672061 9.014344 13.708022 9.602312 10.672726 148.47183
Reduce 2.994240 3.157500 6.331503 3.223612 3.616555 99.49181
apply 524.488376 651.549401 771.095002 743.286441 857.993418 1235.53153
Rfast 5.649006 5.901787 11.110896 6.387990 9.727408 66.03151
DT_rowSums 9.209539 9.566574 20.955033 10.131163 12.967030 294.32911
DT_Reduce 3.590719 3.774761 10.595256 3.924592 4.259343 340.52855
tidy_rowSums 15.532917 15.997649 33.736883 17.316108 27.072343 343.21254
tidy_Reduce 8.627810 8.960008 12.271105 9.603124 11.089334 79.98853
Code:
library('data.table')
library('tidyverse')
library('Rfast')
DFcars = data.table::copy(mtcars)
DFcars = do.call("rbind", replicate(10000, DFcars, simplify = FALSE))
DT_cars = data.table::copy(DFcars)
DFcars2 = data.table::copy(DFcars)
setDT(DT_cars)
colnms = c("mpg", "cyl", "disp", "hp", "drat")
microbenchmark::microbenchmark(
rowSums =
{
DFcars$new_col = rowSums(DFcars[, colnms])
(as.numeric(DFcars$new_col))
},
Reduce =
{
DFcars$new_col = Reduce('+', DFcars[, colnms])
(as.numeric(DFcars$new_col))
},
apply =
{
DFcars$new_col = apply(DFcars[, 1:5], 1, sum)
(as.numeric(DFcars$new_col))
},
Rfast =
{
DFcars$new_col = rowsums(as.matrix(DFcars[, colnms]))
(as.numeric(DFcars$new_col))
},
DT_rowSums =
{
DT_cars[, new_col := rowSums(.SD), .SDcols = colnms]
(as.numeric(DT_cars$new_col))
},
DT_Reduce =
{
DT_cars[, new_col := Reduce('+', .SD), .SDcols = colnms]
(as.numeric(DT_cars$new_col))
},
tidy_rowSums =
{
DFcars2 = DFcars2 %>% mutate(new_col = select(., colnms) %>% rowSums())
(as.numeric(DFcars2$new_col))
},
tidy_Reduce =
{
DFcars2 = DFcars2 %>% mutate(new_col = select(., colnms) %>% Reduce('+', .))
(as.numeric(DFcars2$new_col))
},
check = 'equivalent'
)

calculating n quantiles by group in tidyverse

I have a unique problem where I would like to add a column of percentiles for each group in a data frame. Here is how my data look like:
library(tidyverse)
set.seed(123)
df <- tibble(id = 1:100,
group = rep(letters[1:4], 25),
x = c(sample(1:100, 25, replace = T),
sample(101:200, 25, replace = T),
sample(201:300, 25, replace = T),
sample(301:400, 25, replace = T)))
> df
# A tibble: 100 x 3
id group x
<int> <chr> <int>
1 1 a 78
2 2 b 80
3 3 c 7
4 4 d 100
5 5 a 45
6 6 b 76
7 7 c 25
8 8 d 91
9 9 a 13
10 10 b 84
# ... with 90 more rows
# Function to create a table ten percentiles for a numeric vector
percentiles_table <- function(x) {
res <- round(quantile(x, probs = seq(from=.1, to=1, by=0.1)), 0)
res <- data.frame(percentile = names(res), to = res )
res <- res %>%
mutate(from = lag(to, default = 0)) %>%
select(from,to,percentile)
}
# Table of percentiles
percentiles <- df %>%
group_by(group) %>%
summarise(percentiles_table(x)) %>%
ungroup()
> percentiles
# A tibble: 40 x 4
group from to percentile
<chr> <dbl> <dbl> <chr>
1 a 0 25 10%
2 a 25 71 20%
3 a 71 106 30%
4 a 106 125 40%
5 a 125 198 50%
6 a 198 236 60%
7 a 236 278 70%
8 a 278 325 80%
9 a 325 379 90%
10 a 379 389 100%
I would like to add the percentile column to df for each group where the value of x falls between from and to.
There might be some way to calculate the percentile column directly without having it calculated in a separated data.frame and then appending it back to df.
A one-liner with my santoku package:
library(santoku)
df |>
group_by(group) |>
mutate(
percentile = chop_quantiles(x, 0:100/100,
labels = lbl_endpoint())
)
# A tibble: 100 × 4
# Groups: group [4]
id group x percentile
<int> <chr> <int> <fct>
1 1 a 35 8%
2 2 b 97 20%
3 3 c 39 4%
4 4 d 20 8%
5 5 a 89 16%
...
Using data.table:
setDT(df)[
,
percentile := cut(
x,
quantile(x, seq(0, 1, 0.1)),
include.lowest = TRUE,
labels = paste0(seq(10, 100, 10), "%")
),
by = group
]
install.packages("zoo")
library(zoo)
y=as.data.frame(c(0:max(percentiles$to)))
y=merge(y,unique(percentiles[,c(1)]))
y=merge(y,percentiles[,c(1,2,4)], by.x = c("group","c(0:max(percentiles$to))"), by.y = c("group","from"), all.x = TRUE)
y=na.locf(y)
df=merge(df,y, all.x = TRUE, by.x = c("group","x"), by.y = c("group","c(0:max(percentiles$to))"))
I got this working solution.
percentile_ranks <- function(x) {
res <- trunc(rank(x))/length(x) * 100
res <- floor(res/10) }
df <- df %>%
group_by(group) %>%
arrange(x) %>%
mutate(percentile = percentile_ranks(x)) %>%
mutate(percentile_pct = paste0(percentile*10,"%")) %>%
ungroup() %>%
arrange(id) # original data.frame order

How to get sum by each factor level?

I have filtered data and one of the columns has 5 factor levels and I want to get sum for each of the factor level.
I am using the below code
levels(df_Temp$ATYPE)
[1] "a" "b" "c" "d" "Unknown"
I am using the below code
cast(df_Temp,ATYPE~AFTER_ADM, sum, value = "CHRGES")
but the output I am getting is as below
ATYPE 0 1
1 a 0 2368968.39
2 b 0 3206567.47
3 c 0 19551.19
4 e 0 2528688.12
I want to all the factor levels and sum as "0" for those missing data of factors level.
So the desired output is
ATYPE 0 1
1 a 0 2368968.39
2 b 0 3206567.47
3 c 0 19551.19
4 d 0 0
5 e 0 2528688.12
Using xtabs from base R
xtabs(CHRGES ~ ATYPE + AFTER_ADM, subset(df_Temp, ATYPE != "e"))
# AFTER_ADM
#ATYPE 0 1
# a 0.00000000 -5.92270971
# b -1.68910431 0.05222349
# c -0.26869311 0.16922669
# d 1.44764443 -1.59011411
# e 0.00000000 0.00000000
data
set.seed(24)
df_Temp <- data.frame(ATYPE = sample(letters[1:5], 20, replace = TRUE),
AFTER_ADM = sample(0:1, 20, replace = TRUE), CHRGES = rnorm(20))
If I understand your question correctly, you can use dplyr. First I created an example dataset:
set.seed(123)
x <- sample(letters[1:5], 1e3, replace = T)
x[x == "e"] <- "Unknown"
y <- sample(1:100, 1e3, replace = T)
df1 <- data.frame(ATYPE = factor(x), AFTER_ADM = y)
df1$AFTER_ADM[df1$ATYPE == "Unknown"] <- NA
head(df1, 10)
ATYPE AFTER_ADM
1 b 28
2 d 60
3 c 17
4 Unknown NA
5 Unknown NA
6 a 48
7 c 78
8 Unknown NA
9 c 7
10 c 45
And then use group_by and summarise to get the sum and the counts. I was not sure if you would want the counts for the factor levels but it is easy to take out if you are not interested:
library(dplyr)
df1 %>%
group_by(ATYPE) %>%
summarise(sum_AFTER_ADM = sum(AFTER_ADM, na.rm = T),
n_ATYPE = n())
# A tibble: 5 x 3
ATYPE sum_AFTER_ADM n_ATYPE
<fct> <int> <int>
1 a 10363 198
2 b 11226 206
3 c 9611 203
4 d 9483 195
5 Unknown 0 198
Another possible solution using dplyr and tidyr. Using count and complete from the two packages will help solve your problem.
library(dplyr)
library(tidyr)
#using iris as toy data
iris2 <- iris %>%
filter(Species != "setosa")
#count data and then fill n with 0
ir3 <- count(iris2, Species) %>%
complete(Species, fill = list(n =0))

Group data by factor level, then transform to data frame with colname being levels?

There is my problem that I can't solve it:
Data:
df <- data.frame(f1=c("a", "a", "b", "b", "c", "c", "c"),
v1=c(10, 11, 4, 5, 0, 1, 2))
data.frame:f1 is factor
f1 v1
a 10
a 11
b 4
b 5
c 0
c 1
c 2
# What I want is:(for example, fetch data with the number of element of some level == 2, then to data.frame)
a b
10 4
11 5
Thanks in advance!
I might be missing something simple here , but the below approach using dplyr works.
library(dplyr)
nlevels = 2
df1 <- df %>%
add_count(f1) %>%
filter(n == nlevels) %>%
select(-n) %>%
mutate(rn = row_number()) %>%
spread(f1, v1) %>%
select(-rn)
This gives
# a b
# <int> <int>
#1 10 NA
#2 11 NA
#3 NA 4
#4 NA 5
Now, if you want to remove NA's we can do
do.call("cbind.data.frame", lapply(df1, function(x) x[!is.na(x)]))
# a b
#1 10 4
#2 11 5
As we have filtered the dataframe which has only nlevels observations, we would have same number of rows for each column in the final dataframe.
split might be useful here to split df$v1 into parts corresponding to df$f1. Since you are always extracting equal length chunks, it can then simply be combined back to a data.frame:
spl <- split(df$v1, df$f1)
data.frame(spl[lengths(spl)==2])
# a b
#1 10 4
#2 11 5
Or do it all in one call by combining this with Filter:
data.frame(Filter(function(x) length(x)==2, split(df$v1, df$f1)))
# a b
#1 10 4
#2 11 5
Here is a solution using unstack :
unstack(
droplevels(df[ave(df$v1, df$f1, FUN = function(x) length(x) == 2)==1,]),
v1 ~ f1)
# a b
# 1 10 4
# 2 11 5
A variant, similar to #thelatemail's solution :
data.frame(Filter(function(x) length(x) == 2, unstack(df,v1 ~ f1)))
My tidyverse solution would be:
library(tidyverse)
df %>%
group_by(f1) %>%
filter(n() == 2) %>%
mutate(i = row_number()) %>%
spread(f1, v1) %>%
select(-i)
# # A tibble: 2 x 2
# a b
# * <dbl> <dbl>
# 1 10 4
# 2 11 5
or mixing approaches :
as_tibble(keep(unstack(df,v1 ~ f1), ~length(.x) == 2))
Using all base functions (but you should use tidyverse)
# Add count of instances
x$len <- ave(x$v1, x$f1, FUN = length)
# Filter, drop the count
x <- x[x$len==2, c('f1','v1')]
# Hacky pivot
result <- data.frame(
lapply(unique(x$f1), FUN = function(y) x$v1[x$f1==y])
)
colnames(result) <- unique(x$f1)
> result
a b
1 10 4
2 11 5
I'd like code this, may it helps for you
library(reshape2)
library(dplyr)
aa = data.frame(v1=c('a','a','b','b','c','c','c'),f1=c(10,11,4,5,0,1,2))
cc = aa %>% group_by(v1) %>% summarise(id = length((v1)))
dd= merge(aa,cc) #get the level
ee = dd[dd$aa==2,] #select number of level equal to 2
ee$id = rep(c(1,2),nrow(ee)/2) # reset index like (1,2,1,2)
dcast(ee, id~v1,value.var = 'f1')
all done!

Summarize all group values and a conditional subset in the same call

I'll illustrate my question with an example.
Sample data:
df <- data.frame(ID = c(1, 1, 2, 2, 3, 5), A = c("foo", "bar", "foo", "foo", "bar", "bar"), B = c(1, 5, 7, 23, 54, 202))
df
ID A B
1 1 foo 1
2 1 bar 5
3 2 foo 7
4 2 foo 23
5 3 bar 54
6 5 bar 202
What I want to do is to summarize, by ID, the sum of B and the sum of B when A is "foo". I can do this in a couple steps like:
require(magrittr)
require(dplyr)
df1 <- df %>%
group_by(ID) %>%
summarize(sumB = sum(B))
df2 <- df %>%
filter(A == "foo") %>%
group_by(ID) %>%
summarize(sumBfoo = sum(B))
left_join(df1, df2)
ID sumB sumBfoo
1 1 6 1
2 2 30 30
3 3 54 NA
4 5 202 NA
However, I'm looking for a more elegant/faster way, as I'm dealing with 10gb+ of out-of-memory data in sqlite.
require(sqldf)
my_db <- src_sqlite("my_db.sqlite3", create = T)
df_sqlite <- copy_to(my_db, df)
I thought of using mutate to define a new Bfoo column:
df_sqlite %>%
mutate(Bfoo = ifelse(A=="foo", B, 0))
Unfortunately, this doesn't work on the database end of things.
Error in sqliteExecStatement(conn, statement, ...) :
RS-DBI driver: (error in statement: no such function: IFELSE)
You can do both sums in a single dplyr statement:
df1 <- df %>%
group_by(ID) %>%
summarize(sumB = sum(B),
sumBfoo = sum(B[A=="foo"]))
And here is a data.table version:
library(data.table)
dt = setDT(df)
dt1 = dt[ , .(sumB = sum(B),
sumBfoo = sum(B[A=="foo"])),
by = ID]
dt1
ID sumB sumBfoo
1: 1 6 1
2: 2 30 30
3: 3 54 0
4: 5 202 0
Writing up #hadley's comment as an answer
df_sqlite %>%
group_by(ID) %>%
mutate(Bfoo = if(A=="foo") B else 0) %>%
summarize(sumB = sum(B),
sumBfoo = sum(Bfoo)) %>%
collect
If you want to do counting instead of summarizing, then the answer is somewhat different. The change in code is small, especially in the conditional counting part.
df1 <- df %>%
group_by(ID) %>%
summarize(countB = n(),
countBfoo = sum(A=="foo"))
df1
Source: local data frame [4 x 3]
ID countB countBfoo
1 1 2 1
2 2 2 2
3 3 1 0
4 5 1 0
If you wanted to count the rows, instead of summing them, can you pass a variable to the function:
df1 <- df %>%
group_by(ID) %>%
summarize(RowCountB = n(),
RowCountBfoo = n(A=="foo"))
I get an error both with n() and nrow().

Resources