Related
Suppose we have a data frame (df) like this:
a b
1 2
2 4
3 6
If I want to compute the ratio of each element in vectors a and b and assign to variable c, we'd do this:
c <- df$a / df$b
However, I was wondering how the same thing could be done using the dplyr package? I.e. are there any ways that this can be achieved using functions from dplyr?
Maybe you can try the code below
df %>%
mutate(c = do.call("/", .))
or
df %>%
mutate(c = Reduce("/", .))
or
df %>%
mutate(c = a/b)
An option with invoke
library(dplyr)
library(purrr)
df %>%
mutate(c = invoke('/', .))
-output
# a b c
#1 1 2 0.5
#2 2 4 0.5
#3 3 6 0.5
data
df <- data.frame(a = c(1,2,3), b= c(2,4,6))
You can use mutate function from dplyr library:
df <- data.frame(a = c(1,2,3), b= c(2,4,6))
library(dplyr)
df <- df %>%
dplyr::mutate(c = a/b)
Console output:
a b c
1 1 2 0.5
2 2 4 0.5
3 3 6 0.5
I am trying to divide each cell in a data frame by the sum of the column. For example, I have a data frame df:
sample a b c
a2 1 4 6
a3 5 5 4
I would like to create a new data frame that takes each cell in and divides by the sum of the column, like so:
sample a b c
a2 .167 .444 .6
a3 .833 .556 .4
I have seen answers using sweep(), but that looks like its for matrices, and I have data frames. I understand how to use colSums(), but I'm not sure how to write a function that loops through every cell in the column, and then divides by the column sum. Thanks for the help!
Solution 1
Here are two dplyr solutions. We can use mutate_at or mutate_if to efficiently specify which column we want to apply an operation, or under what condition we want to apply an operation.
library(dplyr)
# Apply the operation to all column except sample
dat2 <- dat %>%
mutate_at(vars(-sample), funs(./sum(.)))
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
# Apply the operation if the column is numeric
dat2 <- dat %>%
mutate_if(is.numeric, funs(./sum(.)))
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
Solution 2
We can also use the map_at and map_if function from the purrr package. However, since the output is a list, we will need as.data.frame from base R or as_data_frame from dplyr to convert the list to a data frame.
library(dplyr)
library(purrr)
# Apply the operation to column a, b, and c
dat2 <- dat %>%
map_at(c("a", "b", "c"), ~./sum(.)) %>%
as_data_frame()
dat2
# # A tibble: 2 x 4
# sample a b c
# <chr> <dbl> <dbl> <dbl>
# 1 a2 0.167 0.444 0.600
# 2 a3 0.833 0.556 0.400
# Apply the operation if the column is numeric
dat2 <- dat %>%
map_if(is.numeric, ~./sum(.)) %>%
as_data_frame()
dat2
# # A tibble: 2 x 4
# sample a b c
# <chr> <dbl> <dbl> <dbl>
# 1 a2 0.167 0.444 0.600
# 2 a3 0.833 0.556 0.400
Solution 3
We can also use the .SD and .SDcols from the data.table package.
library(data.table)
# Convert to data.table
setDT(dat)
dat2 <- copy(dat)
dat2[, (c("a", "b", "c")) := lapply(.SD, function(x) x/sum(x)), .SDcols = c("a", "b", "c")]
dat2[]
# sample a b c
# 1: a2 0.1666667 0.4444444 0.6
# 2: a3 0.8333333 0.5555556 0.4
Solution 4
We can also use the lapply function to loop through all column except the first column to perform the operation.
dat2 <- dat
dat2[, -1] <- lapply(dat2[, -1], function(x) x/sum(x))
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
We can also use apply to loop through all columns but add an if-else statement in the function to make sure only perform the operation on the numeric columns.
dat2 <- dat
dat2[] <- lapply(dat2[], function(x){
# Check if the column is numeric
if (is.numeric(x)){
return(x/sum(x))
} else{
return(x)
}
})
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
Solution 5
A dplyr and tidyr solution based on gather and spread.
library(dplyr)
library(tidyr)
dat2 <- dat %>%
gather(Column, Value, -sample) %>%
group_by(Column) %>%
mutate(Value = Value/sum(Value)) %>%
spread(Column, Value)
dat2
# # A tibble: 2 x 4
# sample a b c
# * <chr> <dbl> <dbl> <dbl>
# 1 a2 0.167 0.444 0.600
# 2 a3 0.833 0.556 0.400
Performance Evaluation
I am curious about which method has the best performance. So I conduct the following performance evaluation using the microbenchmark package with a data frame having the same column names as OP's example but with 1000000 rows.
library(dplyr)
library(tidyr)
library(purrr)
library(data.table)
library(microbenchmark)
set.seed(100)
dat <- data_frame(sample = paste0("a", 1:1000000),
a = rpois(1000000, lambda = 3),
b = rpois(1000000, lambda = 3),
c = rpois(1000000, lambda = 3))
# Convert the data frame to a data.table for later perofrmance evaluation
dat_dt <- as.data.table(dat)
head(dat)
# # A tibble: 6 x 4
# sample a b c
# <chr> <int> <int> <int>
# 1 a1 2 5 2
# 2 a2 2 5 5
# 3 a3 3 2 4
# 4 a4 1 2 2
# 5 a5 3 3 1
# 6 a6 3 6 1
In addition to all the methods I proposed, I also interested two other methods proposed by others: the prop.table method proposed by Henrik in the comments, and the apply method by Spacedman. I called all my solutions with m1_1, m1_2, m2_1, ... to m5. If there are two methods in one solution, I used _ to separate them. I also called the prop.table method as m6 and the apply method as m7. Notice that I modified m6 to have an output as a data frame so that all the methods can have data frame, tibble, or data.table output.
Here is the code I used to assess the performance.
per <- microbenchmark(m1_1 = {dat2 <- dat %>% mutate_at(vars(-sample), funs(./sum(.)))},
m1_2 = {dat2 <- dat %>% mutate_if(is.numeric, funs(./sum(.)))},
m2_1 = {dat2 <- dat %>%
map_at(c("a", "b", "c"), ~./sum(.)) %>%
as_data_frame()
},
m2_2 = {dat2 <- dat %>%
map_if(is.numeric, ~./sum(.)) %>%
as_data_frame()},
m3 = {dat_dt2 <- copy(dat_dt)
dat_dt2[, c("a", "b", "c") := lapply(.SD, function(x) x/sum(x)),
.SDcols = c("a", "b", "c")]},
m4_1 = {dat2 <- dat
dat2[, -1] <- lapply(dat2[, -1], function(x) x/sum(x))},
m4_2 = {dat2 <- dat
dat2[] <- lapply(dat2[], function(x){
if (is.numeric(x)){
return(x/sum(x))
} else{
return(x)
}
})},
m5 = {dat2 <- dat %>%
gather(Column, Value, -sample) %>%
group_by(Column) %>%
mutate(Value = Value/sum(Value)) %>%
spread(Column, Value)},
m6 = {dat2 <- dat
dat2[-1] <- prop.table(as.matrix(dat2[-1]), margin = 2)},
m7 = {dat2 <- dat
dat2[, -1] = apply(dat2[, -1], 2, function(x) {x/sum(x)})}
)
print(per)
# Unit: milliseconds
# expr min lq mean median uq max neval
# m1_1 23.335600 24.326445 28.71934 25.134798 27.465017 75.06974 100
# m1_2 20.373093 21.202780 29.73477 21.967439 24.897305 216.27853 100
# m2_1 9.452987 9.817967 17.83030 10.052634 11.056073 175.00184 100
# m2_2 10.009197 10.342819 16.43832 10.679270 11.846692 163.62731 100
# m3 16.195868 17.154327 34.40433 18.975886 46.521868 190.50681 100
# m4_1 8.100504 8.342882 12.66035 8.778545 9.348634 181.45273 100
# m4_2 8.130833 8.499926 15.84080 8.766979 9.732891 172.79242 100
# m5 5373.395308 5652.938528 5791.73180 5737.383894 5825.141584 6660.35354 100
# m6 117.038355 150.688502 191.43501 166.665125 218.837502 325.58701 100
# m7 119.680606 155.743991 199.59313 174.007653 215.295395 357.02775 100
library(ggplot2)
autoplot(per)
The result shows that methods based on lapply (m4_1 and m4_2) are the fastest, while the tidyr approach (m5) is the slowest, indicating that when row numbers are large it is not a good idea to use the gather and spread method.
DATA
dat <- read.table(text = "sample a b c
a2 1 4 6
a3 5 5 4",
header = TRUE, stringsAsFactors = FALSE)
Given this:
> d = data.frame(sample=c("a2","a3"),a=c(1,5),b=c(4,5),c=c(6,4))
> d
sample a b c
1 a2 1 4 6
2 a3 5 5 4
You can replace every column other than the first by applying over the rest:
> d[,-1] = apply(d[,-1],2,function(x){x/sum(x)})
> d
sample a b c
1 a2 0.1666667 0.4444444 0.6
2 a3 0.8333333 0.5555556 0.4
If you don't want d being stomped on make a copy beforehand.
You could do this in dplyr as well.
sample <- c("a2", "a3")
a <- c(1, 5)
b <- c(4, 5)
c <- c(6, 4)
dat <- data.frame(sample, a, b, c)
dat
library(dplyr)
dat %>%
mutate(
a.PCT = round(a/sum(a), 3),
b.PCT = round(b/sum(b), 3),
c.PCT = round(c/sum(c), 3))
sample a b c a.PCT b.PCT c.PCT
1 a2 1 4 6 0.167 0.444 0.6
2 a3 5 5 4 0.833 0.556 0.4
You can use the transpose of the matrix and then transpose again:
t(t(as.matrix(df))/colSums(df))
try apply:
mat <- matrix(1:6, ncol=3)
apply(mat,2, function(x) x / sum(x))
okay, if you have not numeric values in you columns you can force them to be numeric:
df <- data.frame( a=c('a', 'b'), b=c(3,4), d=c(1,6))
apply(df,2, function(x) {
x <- as.numeric(x)
x / sum(x)
})
I would like to subset a data.frame based on the dates in the rownames. My dates are of this format:
192707
192708
192709
df$Date <- as.yearmon(as.character(df$Date), "%Y%m")
edit: I set the rownames equal to the Date variabel like this (and would like to delete Date afterwards):
rownames(df)<-df$Date
I thought of subsetting like this:
train_dates <- seq(as.yearmon(as.character("1959-12-31"), "%Y%m"), as.yearmon(as.character("1984-12-31"), "%Y%m", "months"))
df <- subset(df, rownames(df) %in% train_dates)
or
df[train_dates,]
But I am having difficulties creating the correct sequence.
Try using format
train_dates <- format(seq(as.Date.character('1959-01-31'),
as.Date.character('1959-12-31'), by = 'month'), '%Y%m')
and then, using library(data.table)
df <- as.data.table(df)
train_df <- df[Date %in% train_dates]
One solution could be using rownames_to_column from tibble package.
#data
df <- data.frame(A = 1:5, B = letters[1:5])
rownames(df) <- c("195901", "196008", "196109", "201812", "196112")
# A B
# 195901 1 a
# 196008 2 b
# 196109 3 c
# 201812 4 d # not in train_dates
# 196112 5 e
library(zoo)
#create sequence from 1959 to 1968. Lookup table
train_dates <- format(as.yearmon(1959 + seq(0, 119)/12), format="%Y%m")
Option #1:
library(tidyverse)
df %>%
rownames_to_column("datemon") %>%
filter(datemon %in% train_dates) %>%
column_to_rownames("datemon")
# A B
# 195901 1 a
# 196008 2 b
# 196109 3 c
# 196112 5 e
Option #2
df[rownames(df) %in% train_dates, ]
# A B
# 195901 1 a
# 196008 2 b
# 196109 3 c
# 196112 5 e
I am trying to divide each cell in a data frame by the sum of the column. For example, I have a data frame df:
sample a b c
a2 1 4 6
a3 5 5 4
I would like to create a new data frame that takes each cell in and divides by the sum of the column, like so:
sample a b c
a2 .167 .444 .6
a3 .833 .556 .4
I have seen answers using sweep(), but that looks like its for matrices, and I have data frames. I understand how to use colSums(), but I'm not sure how to write a function that loops through every cell in the column, and then divides by the column sum. Thanks for the help!
Solution 1
Here are two dplyr solutions. We can use mutate_at or mutate_if to efficiently specify which column we want to apply an operation, or under what condition we want to apply an operation.
library(dplyr)
# Apply the operation to all column except sample
dat2 <- dat %>%
mutate_at(vars(-sample), funs(./sum(.)))
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
# Apply the operation if the column is numeric
dat2 <- dat %>%
mutate_if(is.numeric, funs(./sum(.)))
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
Solution 2
We can also use the map_at and map_if function from the purrr package. However, since the output is a list, we will need as.data.frame from base R or as_data_frame from dplyr to convert the list to a data frame.
library(dplyr)
library(purrr)
# Apply the operation to column a, b, and c
dat2 <- dat %>%
map_at(c("a", "b", "c"), ~./sum(.)) %>%
as_data_frame()
dat2
# # A tibble: 2 x 4
# sample a b c
# <chr> <dbl> <dbl> <dbl>
# 1 a2 0.167 0.444 0.600
# 2 a3 0.833 0.556 0.400
# Apply the operation if the column is numeric
dat2 <- dat %>%
map_if(is.numeric, ~./sum(.)) %>%
as_data_frame()
dat2
# # A tibble: 2 x 4
# sample a b c
# <chr> <dbl> <dbl> <dbl>
# 1 a2 0.167 0.444 0.600
# 2 a3 0.833 0.556 0.400
Solution 3
We can also use the .SD and .SDcols from the data.table package.
library(data.table)
# Convert to data.table
setDT(dat)
dat2 <- copy(dat)
dat2[, (c("a", "b", "c")) := lapply(.SD, function(x) x/sum(x)), .SDcols = c("a", "b", "c")]
dat2[]
# sample a b c
# 1: a2 0.1666667 0.4444444 0.6
# 2: a3 0.8333333 0.5555556 0.4
Solution 4
We can also use the lapply function to loop through all column except the first column to perform the operation.
dat2 <- dat
dat2[, -1] <- lapply(dat2[, -1], function(x) x/sum(x))
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
We can also use apply to loop through all columns but add an if-else statement in the function to make sure only perform the operation on the numeric columns.
dat2 <- dat
dat2[] <- lapply(dat2[], function(x){
# Check if the column is numeric
if (is.numeric(x)){
return(x/sum(x))
} else{
return(x)
}
})
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
Solution 5
A dplyr and tidyr solution based on gather and spread.
library(dplyr)
library(tidyr)
dat2 <- dat %>%
gather(Column, Value, -sample) %>%
group_by(Column) %>%
mutate(Value = Value/sum(Value)) %>%
spread(Column, Value)
dat2
# # A tibble: 2 x 4
# sample a b c
# * <chr> <dbl> <dbl> <dbl>
# 1 a2 0.167 0.444 0.600
# 2 a3 0.833 0.556 0.400
Performance Evaluation
I am curious about which method has the best performance. So I conduct the following performance evaluation using the microbenchmark package with a data frame having the same column names as OP's example but with 1000000 rows.
library(dplyr)
library(tidyr)
library(purrr)
library(data.table)
library(microbenchmark)
set.seed(100)
dat <- data_frame(sample = paste0("a", 1:1000000),
a = rpois(1000000, lambda = 3),
b = rpois(1000000, lambda = 3),
c = rpois(1000000, lambda = 3))
# Convert the data frame to a data.table for later perofrmance evaluation
dat_dt <- as.data.table(dat)
head(dat)
# # A tibble: 6 x 4
# sample a b c
# <chr> <int> <int> <int>
# 1 a1 2 5 2
# 2 a2 2 5 5
# 3 a3 3 2 4
# 4 a4 1 2 2
# 5 a5 3 3 1
# 6 a6 3 6 1
In addition to all the methods I proposed, I also interested two other methods proposed by others: the prop.table method proposed by Henrik in the comments, and the apply method by Spacedman. I called all my solutions with m1_1, m1_2, m2_1, ... to m5. If there are two methods in one solution, I used _ to separate them. I also called the prop.table method as m6 and the apply method as m7. Notice that I modified m6 to have an output as a data frame so that all the methods can have data frame, tibble, or data.table output.
Here is the code I used to assess the performance.
per <- microbenchmark(m1_1 = {dat2 <- dat %>% mutate_at(vars(-sample), funs(./sum(.)))},
m1_2 = {dat2 <- dat %>% mutate_if(is.numeric, funs(./sum(.)))},
m2_1 = {dat2 <- dat %>%
map_at(c("a", "b", "c"), ~./sum(.)) %>%
as_data_frame()
},
m2_2 = {dat2 <- dat %>%
map_if(is.numeric, ~./sum(.)) %>%
as_data_frame()},
m3 = {dat_dt2 <- copy(dat_dt)
dat_dt2[, c("a", "b", "c") := lapply(.SD, function(x) x/sum(x)),
.SDcols = c("a", "b", "c")]},
m4_1 = {dat2 <- dat
dat2[, -1] <- lapply(dat2[, -1], function(x) x/sum(x))},
m4_2 = {dat2 <- dat
dat2[] <- lapply(dat2[], function(x){
if (is.numeric(x)){
return(x/sum(x))
} else{
return(x)
}
})},
m5 = {dat2 <- dat %>%
gather(Column, Value, -sample) %>%
group_by(Column) %>%
mutate(Value = Value/sum(Value)) %>%
spread(Column, Value)},
m6 = {dat2 <- dat
dat2[-1] <- prop.table(as.matrix(dat2[-1]), margin = 2)},
m7 = {dat2 <- dat
dat2[, -1] = apply(dat2[, -1], 2, function(x) {x/sum(x)})}
)
print(per)
# Unit: milliseconds
# expr min lq mean median uq max neval
# m1_1 23.335600 24.326445 28.71934 25.134798 27.465017 75.06974 100
# m1_2 20.373093 21.202780 29.73477 21.967439 24.897305 216.27853 100
# m2_1 9.452987 9.817967 17.83030 10.052634 11.056073 175.00184 100
# m2_2 10.009197 10.342819 16.43832 10.679270 11.846692 163.62731 100
# m3 16.195868 17.154327 34.40433 18.975886 46.521868 190.50681 100
# m4_1 8.100504 8.342882 12.66035 8.778545 9.348634 181.45273 100
# m4_2 8.130833 8.499926 15.84080 8.766979 9.732891 172.79242 100
# m5 5373.395308 5652.938528 5791.73180 5737.383894 5825.141584 6660.35354 100
# m6 117.038355 150.688502 191.43501 166.665125 218.837502 325.58701 100
# m7 119.680606 155.743991 199.59313 174.007653 215.295395 357.02775 100
library(ggplot2)
autoplot(per)
The result shows that methods based on lapply (m4_1 and m4_2) are the fastest, while the tidyr approach (m5) is the slowest, indicating that when row numbers are large it is not a good idea to use the gather and spread method.
DATA
dat <- read.table(text = "sample a b c
a2 1 4 6
a3 5 5 4",
header = TRUE, stringsAsFactors = FALSE)
Given this:
> d = data.frame(sample=c("a2","a3"),a=c(1,5),b=c(4,5),c=c(6,4))
> d
sample a b c
1 a2 1 4 6
2 a3 5 5 4
You can replace every column other than the first by applying over the rest:
> d[,-1] = apply(d[,-1],2,function(x){x/sum(x)})
> d
sample a b c
1 a2 0.1666667 0.4444444 0.6
2 a3 0.8333333 0.5555556 0.4
If you don't want d being stomped on make a copy beforehand.
You could do this in dplyr as well.
sample <- c("a2", "a3")
a <- c(1, 5)
b <- c(4, 5)
c <- c(6, 4)
dat <- data.frame(sample, a, b, c)
dat
library(dplyr)
dat %>%
mutate(
a.PCT = round(a/sum(a), 3),
b.PCT = round(b/sum(b), 3),
c.PCT = round(c/sum(c), 3))
sample a b c a.PCT b.PCT c.PCT
1 a2 1 4 6 0.167 0.444 0.6
2 a3 5 5 4 0.833 0.556 0.4
You can use the transpose of the matrix and then transpose again:
t(t(as.matrix(df))/colSums(df))
try apply:
mat <- matrix(1:6, ncol=3)
apply(mat,2, function(x) x / sum(x))
okay, if you have not numeric values in you columns you can force them to be numeric:
df <- data.frame( a=c('a', 'b'), b=c(3,4), d=c(1,6))
apply(df,2, function(x) {
x <- as.numeric(x)
x / sum(x)
})
I have a toy example of a tibble.
What is the most efficient way to sum two consecutive rows of y grouped by x
library(tibble)
l = list(x = c("a", "b", "a", "b", "a", "b"), y = c(1, 4, 3, 3, 7, 0))
df <- as_tibble(l)
df
#> # A tibble: 6 x 2
#> x y
#> <chr> <dbl>
#> 1 a 1
#> 2 b 4
#> 3 a 3
#> 4 b 3
#> 5 a 7
#> 6 b 0
So the output would be something like this
group sum seq
a 4 1
a 10 2
b 7 1
b 3 2
I'd like to use the tidyverse and possibly roll_sum() from the RcppRoll package
and have the code so that a variable length of consecutive rows could be used for real world data in which there would be many groups
TIA
One way to do this is use group_by %>% do where you can customize the returned data frame in do:
library(RcppRoll); library(tidyverse)
n = 2
df %>%
group_by(x) %>%
do(
data.frame(
sum = roll_sum(.$y, n),
seq = seq_len(length(.$y) - n + 1)
)
)
# A tibble: 4 x 3
# Groups: x [2]
# x sum seq
# <chr> <dbl> <int>
#1 a 4 1
#2 a 10 2
#3 b 7 1
#4 b 3 2
Edit: Since this is not as efficient, probably due to the data frame construction header and binding data frames on the go, here is an improved version (still somewhat slower than data.table but not as much now):
df %>%
group_by(x) %>%
summarise(sum = list(roll_sum(y, n)), seq = list(seq_len(n() -n + 1))) %>%
unnest()
Timing, use #Matt's data and setup:
library(tibble)
library(dplyr)
library(RcppRoll)
library(stringi) ## Only included for ability to generate random strings
## Generate data with arbitrary number of groups and rows --------------
rowCount <- 100000
groupCount <- 10000
sumRows <- 2L
set.seed(1)
l <- tibble(x = sample(stri_rand_strings(groupCount,3),rowCount,rep=TRUE),
y = sample(0:10,rowCount,rep=TRUE))
## Using dplyr and tibble -----------------------------------------------
ptm <- proc.time() ## Start the clock
dplyr_result <- l %>%
group_by(x) %>%
summarise(sum = list(roll_sum(y, n)), seq = list(seq_len(n() -n + 1))) %>%
unnest()
dplyr_time <- proc.time() - ptm ## Stop the clock
## Using data.table instead ----------------------------------------------
library(data.table)
ptm <- proc.time() ## Start the clock
setDT(l) ## Convert l to a data.table
dt_result <- l[,.(sum = RcppRoll::roll_sum(y, n = sumRows, fill = NA, align = "left"),
seq = seq_len(.N)),
keyby = .(x)][!is.na(sum)]
data.table_time <- proc.time() - ptm
Result is:
dplyr_time
# user system elapsed
# 0.688 0.003 0.689
data.table_time
# user system elapsed
# 0.422 0.009 0.430
Here is one approach for you. Since you want to sum up two consecutive rows, you could use lead() and do the calculation for sum. For seq, I think you can simply take row numbers, seeing your expected outcome. Once you are done with these operations, you arrange your data by x (if necessary, x and seq). Finally, you drop rows with NAs. If necessary, you may want to drop y by writing select(-y) at the end of the code.
group_by(df, x) %>%
mutate(sum = y + lead(y),
seq = row_number()) %>%
arrange(x) %>%
ungroup %>%
filter(complete.cases(.))
# x y sum seq
# <chr> <dbl> <dbl> <int>
#1 a 1 4 1
#2 a 3 10 2
#3 b 4 7 1
#4 b 3 3 2
I notice you asked for the most efficient way-- if you are looking at scaling this up to a much larger set, I would strongly recommend data.table.
library(data.table)
library(RcppRoll)
l[, .(sum = RcppRoll::roll_sum(y, n = 2L, fill = NA, align = "left"),
seq = seq_len(.N)),
keyby = .(x)][!is.na(sum)]
A rough benchmark comparison of this vs an answer using the tidyverse packages with 100,000 rows and 10,000 groups illustrates the significant difference.
(I used Psidom's answer instead of jazzurro's since jazzuro's did not allow for an arbritary number of rows to be summed.)
library(tibble)
library(dplyr)
library(RcppRoll)
library(stringi) ## Only included for ability to generate random strings
## Generate data with arbitrary number of groups and rows --------------
rowCount <- 100000
groupCount <- 10000
sumRows <- 2L
set.seed(1)
l <- tibble(x = sample(stri_rand_strings(groupCount,3),rowCount,rep=TRUE),
y = sample(0:10,rowCount,rep=TRUE))
## Using dplyr and tibble -----------------------------------------------
ptm <- proc.time() ## Start the clock
dplyr_result <- l %>%
group_by(x) %>%
do(
data.frame(
sum = roll_sum(.$y, sumRows),
seq = seq_len(length(.$y) - sumRows + 1)
)
)
|========================================================0% ~0 s remaining
dplyr_time <- proc.time() - ptm ## Stop the clock
## Using data.table instead ----------------------------------------------
library(data.table)
ptm <- proc.time() ## Start the clock
setDT(l) ## Convert l to a data.table
dt_result <- l[,.(sum = RcppRoll::roll_sum(y, n = sumRows, fill = NA, align = "left"),
seq = seq_len(.N)),
keyby = .(x)][!is.na(sum)]
data.table_time <- proc.time() - ptm ## Stop the clock
Results:
> dplyr_time
user system elapsed
10.28 0.04 10.36
> data.table_time
user system elapsed
0.35 0.02 0.36
> all.equal(dplyr_result,as.tibble(dt_result))
[1] TRUE
A solution using tidyverse and zoo. This is similar to Psidom's approach.
library(tidyverse)
library(zoo)
df2 <- df %>%
group_by(x) %>%
do(data_frame(x = unique(.$x),
sum = rollapplyr(.$y, width = 2, FUN = sum))) %>%
mutate(seq = 1:n()) %>%
ungroup()
df2
# A tibble: 4 x 3
x sum seq
<chr> <dbl> <int>
1 a 4 1
2 a 10 2
3 b 7 1
4 b 3 2
zoo + dplyr
library(zoo)
library(dplyr)
df %>%
group_by(x) %>%
mutate(sum = c(NA, rollapply(y, width = 2, sum)),
seq = row_number() - 1) %>%
drop_na()
# A tibble: 4 x 4
# Groups: x [2]
x y sum seq
<chr> <dbl> <dbl> <dbl>
1 a 3 4 1
2 b 3 7 1
3 a 7 10 2
4 b 0 3 2
If the moving window only equal to 2 using lag
df %>%
group_by(x) %>%
mutate(sum = y + lag(y),
seq = row_number() - 1) %>%
drop_na()
# A tibble: 4 x 4
# Groups: x [2]
x y sum seq
<chr> <dbl> <dbl> <dbl>
1 a 3 4 1
2 b 3 7 1
3 a 7 10 2
4 b 0 3 2
EDIT :
n = 3 # your moving window
df %>%
group_by(x) %>%
mutate(sum = c(rep(NA, n - 1), rollapply(y, width = n, sum)),
seq = row_number() - n + 1) %>%
drop_na()
A small variant on existing answers: first convert the data to list-column format, then use purrr to map() roll_sum() onto the data.
l = list(x = c("a", "b", "a", "b", "a", "b"), y = c(1, 4, 3, 3, 7, 0))
as.tibble(l) %>%
group_by(x) %>%
summarize(list_y = list(y)) %>%
mutate(rollsum = map(list_y, ~roll_sum(.x, 2))) %>%
select(x, rollsum) %>%
unnest %>%
group_by(x) %>%
mutate(seq = row_number())
I think if you have the latest version of purrr you could get rid of the last two lines (the final group_by() and mutate()) by using imap() instead of map.