Calculate rolling sum by group [duplicate] - r

This question already has answers here:
Why are my dplyr group_by & summarize not working properly? (name-collision with plyr)
(5 answers)
Closed 2 years ago.
I would like to calculate a rolling sum (or a custom function) of 3 previous values, treating each group separately. I have tried this:
require(dplyr)
# Build dataframe
df <- data.frame(person = c(rep("Peter", 5), rep("James", 5)),
score1 = c(1,3,2,5,4,6,8,4,5,3),
score2 = c(1,1,1,5,1,3,4,8,9,0))
# Attempt rolling sum by group
df %>%
group_by(person) %>%
mutate(s1_rolling = rollsumr(score1, k = 3, fill = NA),
s2_rolling = rollsumr(score2, k = 3, fill = NA))
But the new columns do not treat each group separately, instead continuing down the whole dataset:
person score1 score2 s1_rolling s2_rolling
<chr> <dbl> <dbl> <dbl> <dbl>
1 Peter 1 1 NA NA
2 Peter 3 1 NA NA
3 Peter 2 1 6 3
4 Peter 5 5 10 7
5 Peter 4 1 11 7
6 James 6 3 15 9
7 James 8 4 18 8
8 James 4 8 18 15
9 James 5 9 17 21
10 James 3 0 12 17
I would like row 6 and 7 to show NA in the two new columns, because until row 8 there is insufficient James data to sum 3 rows.
How can I do this?

It could be that plyr was also loaded and the mutate from plyr masked the mutate from dplyr. We could use dplyr::mutate
library(dplyr)
library(zoo)
df %>%
group_by(person) %>%
dplyr::mutate(s1_rolling = rollsumr(score1, k = 3, fill = NA),
s2_rolling = rollsumr(score2, k = 3, fill = NA))
# A tibble: 10 x 5
# Groups: person [2]
# person score1 score2 s1_rolling s2_rolling
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 Peter 1 1 NA NA
# 2 Peter 3 1 NA NA
# 3 Peter 2 1 6 3
# 4 Peter 5 5 10 7
# 5 Peter 4 1 11 7
# 6 James 6 3 NA NA
# 7 James 8 4 NA NA
# 8 James 4 8 18 15
# 9 James 5 9 17 21
#10 James 3 0 12 17
If there are more than one column, we can also use across
df %>%
group_by(person) %>%
dplyr::mutate(across(starts_with('score'),
~ rollsumr(., k = 3, fill = NA), .names = '{col}_rolling'))
For a faster version, use RcppRoll::roll_sumr
df %>%
group_by(person) %>%
dplyr::mutate(across(starts_with('score'),
~ RcppRoll::roll_sumr(., 3, fill = NA), .names = '{col}_rolling'))
The behavior can be reproduced with plyr::mutate
df %>%
group_by(person) %>%
plyr::mutate(s1_rolling = rollsumr(score1, k = 3, fill = NA),
s2_rolling = rollsumr(score2, k = 3, fill = NA))
# A tibble: 10 x 5
# Groups: person [2]
# person score1 score2 s1_rolling s2_rolling
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 Peter 1 1 NA NA
# 2 Peter 3 1 NA NA
# 3 Peter 2 1 6 3
# 4 Peter 5 5 10 7
# 5 Peter 4 1 11 7
# 6 James 6 3 15 9
# 7 James 8 4 18 8
# 8 James 4 8 18 15
# 9 James 5 9 17 21
#10 James 3 0 12 17

I would suggest a slider approach with slide_dbl() function with works similar to zoo and it is compatible with dplyr:
library(slider)
library(dplyr)
#Code
# Build dataframe
df <- data.frame(person = c(rep("Peter", 5), rep("James", 5)),
score1 = c(1,3,2,5,4,6,8,4,5,3),
score2 = c(1,1,1,5,1,3,4,8,9,0))
# Attempt rolling sum by group
df %>%
group_by(person) %>%
mutate(s1_rolling = slide_dbl(score1, sum, .before = 2, .complete = TRUE),
s2_rolling = slide_dbl(score2, sum, .before = 2, .complete = TRUE))
Output:
# A tibble: 10 x 5
# Groups: person [2]
person score1 score2 s1_rolling s2_rolling
<fct> <dbl> <dbl> <dbl> <dbl>
1 Peter 1 1 NA NA
2 Peter 3 1 NA NA
3 Peter 2 1 6 3
4 Peter 5 5 10 7
5 Peter 4 1 11 7
6 James 6 3 NA NA
7 James 8 4 NA NA
8 James 4 8 18 15
9 James 5 9 17 21
10 James 3 0 12 17

Related

Count variable until observations changes [duplicate]

This question already has answers here:
Create counter within consecutive runs of values
(3 answers)
Closed 1 year ago.
Unfortunately, I can't wrap my head around this but I'm sure there is a straightforward solution. I've a data.frame that looks like this:
set.seed(1)
mydf <- data.frame(group=sample(c("a", "b"), 20, replace=T))
I'd like to create a new variable that counts from top to bottom, how many times the group occured in a row. Hence, within the example from above it should look like this:
mydf$question <- c(1, 2, 1, 2, 1, 1, 2, 3, 4, 1, 2, 3, 1, 1, 1, 1, 1, 2, 1, 1)
> mydf[1:10,]
group question
1 a 1
2 a 2
3 b 1
4 b 2
5 a 1
6 b 1
7 b 2
8 b 3
9 b 4
10 a 1
Thanks for help.
Using data.table::rleid and dplyr you could do:
set.seed(1)
mydf <- data.frame(group=sample(c("a", "b"), 20, replace=T))
library(dplyr)
library(data.table)
mydf %>%
mutate(id = data.table::rleid(group)) %>%
group_by(id) %>%
mutate(question = row_number()) %>%
ungroup()
#> # A tibble: 20 × 3
#> group id question
#> <chr> <int> <int>
#> 1 a 1 1
#> 2 b 2 1
#> 3 a 3 1
#> 4 a 3 2
#> 5 b 4 1
#> 6 a 5 1
#> 7 a 5 2
#> 8 a 5 3
#> 9 b 6 1
#> 10 b 6 2
#> 11 a 7 1
#> 12 a 7 2
#> 13 a 7 3
#> 14 a 7 4
#> 15 a 7 5
#> 16 b 8 1
#> 17 b 8 2
#> 18 b 8 3
#> 19 b 8 4
#> 20 a 9 1
Update: Most is the same as stefan but without data.table package:
library(dplyr)
mydf %>%
mutate(myrleid = with(rle(group), rep(seq_along(lengths), lengths))) %>%
group_by(myrleid) %>%
mutate(question = row_number()) %>%
ungroup()
group myrleid question
<chr> <int> <int>
1 a 1 1
2 b 2 1
3 a 3 1
4 a 3 2
5 b 4 1
6 a 5 1
7 a 5 2
8 a 5 3
9 b 6 1
10 b 6 2
11 a 7 1
12 a 7 2
13 a 7 3
14 a 7 4
15 a 7 5
16 b 8 1
17 b 8 2
18 b 8 3
19 b 8 4
20 a 9 1

Unnest or unchop dataframe containing lists of different lengths

I have a dataframe with several columns containing list columns that I want to unnest (or unchop). BUT, they are different lengths, so the resulting error is Error: No common size for...
Here is a reprex to show what works and doesn't work.
library(tidyr)
library(vctrs)
# This works as expected
df_A <- tibble(
ID = 1:3,
A = as_list_of(list(c(9, 8, 5), c(7,6), c(6, 9)))
)
unchop(df_A, cols = c(A))
# A tibble: 7 x 2
ID A
<int> <dbl>
1 1 9
2 1 8
3 1 5
4 2 7
5 2 6
6 3 6
7 3 9
# This works as expected as the lists are the same lengths
df_AB_1 <- tibble(
ID = 1:3,
A = as_list_of(list(c(9, 8, 5), c(7,6), c(6, 9))),
B = as_list_of(list(c(1, 2, 3), c(4, 5), c(7, 8)))
)
unchop(df_AB_1, cols = c(A, B))
# A tibble: 7 x 3
ID A B
<int> <dbl> <dbl>
1 1 9 1
2 1 8 2
3 1 5 3
4 2 7 4
5 2 6 5
6 3 6 7
7 3 9 8
# This does NOT work as the lists are different lengths
df_AB_2 <- tibble(
ID = 1:3,
A = as_list_of(list(c(9, 8, 5), c(7,6), c(6, 9))),
B = as_list_of(list(c(1, 2), c(4, 5, 6), c(7, 8, 9, 0)))
)
unchop(df_AB_2, cols = c(A, B))
# Error: No common size for `A`, size 3, and `B`, size 2.
The output that I would like to achieve for df_AB_2 above is as follows where each list is unchopped and missing values are filled with NA:
# A tibble: 10 x 3
ID A B
<dbl> <dbl> <dbl>
1 1 9 1
2 1 8 2
3 1 5 NA
4 2 7 4
5 2 6 5
6 2 NA 6
7 3 6 7
8 3 9 8
9 3 NA 9
10 3 NA 0
I have referenced this issue on Github and StackOverflow here.
Any ideas how to achieve the result above?
Versions
> packageVersion("tidyr")
[1] ‘1.0.0’
> packageVersion("vctrs")
[1] ‘0.2.0.9001’
Here is an idea via dplyr that you can generalise to as many columns as you want,
library(tidyverse)
df_AB_2 %>%
pivot_longer(c(A, B)) %>%
mutate(value = lapply(value, `length<-`, max(lengths(value)))) %>%
pivot_wider(names_from = name, values_from = value) %>%
unnest() %>%
filter(rowSums(is.na(.[-1])) != 2)
which gives,
# A tibble: 10 x 3
ID A B
<int> <dbl> <dbl>
1 1 9 1
2 1 8 2
3 1 5 NA
4 2 7 4
5 2 6 5
6 2 NA 6
7 3 6 7
8 3 9 8
9 3 NA 9
10 3 NA 0
Defining a helper function to update the lengths of the element and proceeding with dplyr:
foo <- function(x, len_vec) {
lapply(
seq_len(length(x)),
function(i) {
length(x[[i]]) <- len_vec[i]
x[[i]]
}
)
}
df_AB_2 %>%
mutate(maxl = pmax(lengths(A), lengths(B))) %>%
mutate(A = foo(A, maxl), B = foo(B, maxl)) %>%
unchop(cols = c(A, B)) %>%
select(-maxl)
# A tibble: 10 x 3
ID A B
<int> <dbl> <dbl>
1 1 9 1
2 1 8 2
3 1 5 NA
4 2 7 4
5 2 6 5
6 2 NA 6
7 3 6 7
8 3 9 8
9 3 NA 9
10 3 NA 0
Using data.table:
library(data.table)
setDT(df_AB_2)
df_AB_2[, maxl := pmax(lengths(A), lengths(B))]
df_AB_2[, .(unlist(A)[seq_len(maxl)], unlist(B)[seq_len(maxl)]), by = ID]

R how to fill in NA with rules

data=data.frame(person=c(1,1,1,2,2,2,2,3,3,3,3),
t=c(3,NA,9,4,7,NA,13,3,NA,NA,12),
WANT=c(3,6,9,4,7,10,13,3,6,9,12))
So basically I am wanting to create a new variable 'WANT' which takes the PREVIOUS value in t and ADDS 3 to it, and if there are many NA in a row then it keeps doing this. My attempt is:
library(dplyr)
data %>%
group_by(person) %>%
mutate(WANT_TRY = fill(t) + 3)
Here's one way -
data %>%
group_by(person) %>%
mutate(
# cs = cumsum(!is.na(t)), # creates index for reference value; uncomment if interested
w = case_when(
# rle() gives the running length of NA
is.na(t) ~ t[cumsum(!is.na(t))] + 3*sequence(rle(is.na(t))$lengths),
TRUE ~ t
)
) %>%
ungroup()
# A tibble: 11 x 4
person t WANT w
<dbl> <dbl> <dbl> <dbl>
1 1 3 3 3
2 1 NA 6 6
3 1 9 9 9
4 2 4 4 4
5 2 7 7 7
6 2 NA 10 10
7 2 13 13 13
8 3 3 3 3
9 3 NA 6 6
10 3 NA 9 9
11 3 12 12 12
Here is another way. We can do linear interpolation with the imputeTS package.
library(dplyr)
library(imputeTS)
data2 <- data %>%
group_by(person) %>%
mutate(WANT2 = na.interpolation(WANT)) %>%
ungroup()
data2
# # A tibble: 11 x 4
# person t WANT WANT2
# <dbl> <dbl> <dbl> <dbl>
# 1 1 3 3 3
# 2 1 NA 6 6
# 3 1 9 9 9
# 4 2 4 4 4
# 5 2 7 7 7
# 6 2 NA 10 10
# 7 2 13 13 13
# 8 3 3 3 3
# 9 3 NA 6 6
# 10 3 NA 9 9
# 11 3 12 12 12
This is harder than it seems because of the double NA at the end. If it weren't for that, then the following:
ifelse(is.na(data$t), c(0, data$t[-nrow(data)])+3, data$t)
...would give you want you want. The simplest way, that uses the same logic but doesn't look very clever (sorry!) would be:
.impute <- function(x) ifelse(is.na(x), c(0, x[-length(x)])+3, x)
.impute(.impute(data$t))
...which just cheats by doing it twice. Does that help?
You can use functional programming from purrr and "NA-safe" addition from hablar:
library(hablar)
library(dplyr)
library(purrr)
data %>%
group_by(person) %>%
mutate(WANT2 = accumulate(t, ~.x %plus_% 3))
Result
# A tibble: 11 x 4
# Groups: person [3]
person t WANT WANT2
<dbl> <dbl> <dbl> <dbl>
1 1 3 3 3
2 1 NA 6 6
3 1 9 9 9
4 2 4 4 4
5 2 7 7 7
6 2 NA 10 10
7 2 13 13 13
8 3 3 3 3
9 3 NA 6 6
10 3 NA 9 9
11 3 12 12 12

dplyr::first() to choose first non NA value

I am looking for a way to extract the first and last non-NA value from each group. I am using dplyr::first() and dplyr::last(), but I can´t work out how to choose the first or last non-NA value.
library(dplyr)
set.seed(123)
d <- data.frame(
group = rep(1:3, each = 3),
year = rep(seq(2000,2002,1),3),
value = sample(1:9, r = T))
#Introduce NA values in first row of group 2 and last row of group 3
d %>%
mutate(
value = case_when(
group == 2 & year ==2000 ~ NA_integer_,
group == 3 & year ==2002 ~ NA_integer_,
TRUE ~ value))%>%
group_by(group) %>%
mutate(
first = dplyr::first(value),
last = dplyr::last(value))
RESULT (with issue)
# A tibble: 9 x 5
# Groups: group [3]
group year value first last
<int> <dbl> <int> <int> <int>
1 1 2000 3 3 4
2 1 2001 8 3 4
3 1 2002 4 3 4
4 2 2000 NA NA 1
5 2 2001 9 NA 1
6 2 2002 1 NA 1
7 3 2000 5 5 NA
8 3 2001 9 5 NA
9 3 2002 NA 5 NA
Can you help me make the values in the "first" column for group 2 = 9 and the values in the "last" column from group 3 = 9?
I very much prefer a tidyverse solution if one such exists?
Use na.omit, compare:
first(c(NA, 11, 22))
# [1] NA
first(na.omit(c(NA, 11, 22)))
# [1] 11
Using example data:
d %>%
mutate(
value = case_when(
group == 2 & year ==2000 ~ NA_integer_,
group == 3 & year ==2002 ~ NA_integer_,
TRUE ~ value))%>%
group_by(group) %>%
mutate(
first = dplyr::first(na.omit(value)),
last = dplyr::last(na.omit(value)))
# # A tibble: 9 x 5
# # Groups: group [3]
# group year value first last
# <int> <dbl> <int> <int> <int>
# 1 1 2000 3 3 4
# 2 1 2001 8 3 4
# 3 1 2002 4 3 4
# 4 2 2000 NA 9 1
# 5 2 2001 9 9 1
# 6 2 2002 1 9 1
# 7 3 2000 5 5 9
# 8 3 2001 9 5 9
# 9 3 2002 NA 5 9

debugging: function to create multiple lags for multiple columns (dplyr)

I want to create multiple lags of multiple variables, so I thought writing a function would be helpful. My code throws a warning ("Truncating vector to length 1 ") and false results:
library(dplyr)
time <- c(2000:2009, 2000:2009)
x <- c(1:10, 10:19)
id <- c(1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2)
df <- data.frame(id, time, x)
three_lags <- function (data, column, group, ordervar) {
data <- data %>%
group_by_(group) %>%
mutate(a = lag(column, 1L, NA, order_by = ordervar),
b = lag(column, 2L, NA, order_by = ordervar),
c = lag(column, 3L, NA, order_by = ordervar))
}
df_lags <- three_lags(data=df, column=x, group=id, ordervar=time) %>%
arrange(id, time)
Also I wondered if there might be a more elegant solution using mutate_each, but I didn't get that to work either. I can of course just write a long code with a line for each new lagged variable, but Id like to avoid that.
EDIT:
akrun's dplyr answer works, but takes a long time to compute for large data frames. The solution using data.table seems to be more efficient. So a dplyr or other solution that also allows the be implemented for several columns & several lags is still to be found.
EDIT 2:
For multiple columns and no groups (e.g. "ID") the following solution seems very well suited to me, due to its simplicity. The code may of course be shortened, but step by step:
df <- arrange(df, time)
df.lag <- shift(df[,1:24], n=1:3, give.names = T) ##column indexes of columns to be lagged as "[,startcol:endcol]", "n=1:3" sepcifies the number of lags (lag1, lag2 and lag3 in this case)
df.result <- bind_cols(df, df.lag)
We can use shift from data.table which can take multiple values for 'n'
library(data.table)
setDT(df)[order(time), c("a", "b", "c") := shift(x, 1:3) , id][order(id, time)]
Suppose, we need to do this on multiple columns
df$y <- df$x
setDT(df)[order(time), paste0(rep(c("x", "y"), each =3),
c("a", "b", "c")) :=shift(.SD, 1:3), id, .SDcols = x:y]
The shift can also be used in the dplyr
library(dplyr)
df %>%
group_by(id) %>%
arrange(id, time) %>%
do(data.frame(., setNames(shift(.$x, 1:3), c("a", "b", "c"))))
# id time x a b c
# <dbl> <int> <int> <int> <int> <int>
#1 1 2000 1 NA NA NA
#2 1 2001 2 1 NA NA
#3 1 2002 3 2 1 NA
#4 1 2003 4 3 2 1
#5 1 2004 5 4 3 2
#6 1 2005 6 5 4 3
#7 1 2006 7 6 5 4
#8 1 2007 8 7 6 5
#9 1 2008 9 8 7 6
#10 1 2009 10 9 8 7
#11 2 2000 10 NA NA NA
#12 2 2001 11 10 NA NA
#13 2 2002 12 11 10 NA
#14 2 2003 13 12 11 10
#15 2 2004 14 13 12 11
#16 2 2005 15 14 13 12
#17 2 2006 16 15 14 13
#18 2 2007 17 16 15 14
#19 2 2008 18 17 16 15
#20 2 2009 19 18 17 16
Could also create a function that will output a tibble:
library(tidyverse)
lag_multiple <- function(x, n_vec){
map(n_vec, lag, x = x) %>%
set_names(paste0("lag", n_vec)) %>%
as_tibble()
}
tibble(x = 1:30) %>%
mutate(lag_multiple(x, 1:5))
#> # A tibble: 30 x 6
#> x lag1 lag2 lag3 lag4 lag5
#> <int> <int> <int> <int> <int> <int>
#> 1 1 NA NA NA NA NA
#> 2 2 1 NA NA NA NA
#> 3 3 2 1 NA NA NA
#> 4 4 3 2 1 NA NA
#> 5 5 4 3 2 1 NA
#> 6 6 5 4 3 2 1
#> 7 7 6 5 4 3 2
#> 8 8 7 6 5 4 3
#> 9 9 8 7 6 5 4
#> 10 10 9 8 7 6 5
#> # ... with 20 more rows

Resources