r group lag sum - r

I have some data with groups for which I want to compute a summary (sum or mean) over a fixed number of periods. I'm trying to do this with a group_by followed by mutate and then operating with the variable and its dplyr::lag. Here is an example:
library(tidyverse)
df <- data.frame(group = rep(c("A", "B"), 5),
x = c(1, 3, 4, 7, 9, 10, 17, 29, 30, 55))
df %>%
group_by(group) %>%
mutate(cs = x + lag(x, 1, 0) + lag(x, 2, 0) + lag(x, 3, 0)) %>%
ungroup()
Which yields the desired result:
# A tibble: 10 x 3
group x cs
<fctr> <dbl> <dbl>
1 A 1 1
2 B 3 3
3 A 4 5
4 B 7 10
5 A 9 14
6 B 10 20
7 A 17 31
8 B 29 49
9 A 30 60
10 B 55 101
Is there a shorter way to accomplish this? (Here I calculated four values but I actually need twelve or more).

Perhaps you could use the purrr functions reduce and map included with the tidyverse:
library(tidyverse)
df <- data.frame(group = rep(c("A", "B"), 5),
x = c(1, 3, 4, 7, 9, 10, 17, 29, 30, 55))
df %>%
group_by(group) %>%
mutate(cs = reduce(map(0:3, ~ lag(x, ., 0)), `+`)) %>%
ungroup()
#> # A tibble: 10 x 3
#> group x cs
#> <fctr> <dbl> <dbl>
#> 1 A 1 1
#> 2 B 3 3
#> 3 A 4 5
#> 4 B 7 10
#> 5 A 9 14
#> 6 B 10 20
#> 7 A 17 31
#> 8 B 29 49
#> 9 A 30 60
#> 10 B 55 101
To see what's happening here, it's probably easier to look at a simpler example that doesn't require a group.
v <- 1:5
lagged_v <- map(0:3, ~ lag(v, ., 0))
lagged_v
#> [[1]]
#> [1] 1 2 3 4 5
#>
#> [[2]]
#> [1] 0 1 2 3 4
#>
#> [[3]]
#> [1] 0 0 1 2 3
#>
#> [[4]]
#> [1] 0 0 0 1 2
reduce(lagged_v, `+`)
#> [1] 1 3 6 10 14

Related

How to add rows so that each group has equal number of rows?

I have a data frame with unequal numbers of rows per group, see df in the example below. I would like to add rows containing the group name and NAs in all other columns so that there is an equal number of rows per group like in df.desired. The rows should be added after the last row from the respective group.
Example:
df = data.frame(group = c("A","A","A","A","B","B","B","C","C"),
col1 = c(1, 1, 1, 1, 2, 2, 2, 3, 3),
col2 = c(12, 13, 14, 15, 21, 22, 23, 31, 32))
> df
group col1 col2
1 A 1 12
2 A 1 13
3 A 1 14
4 A 1 15
5 B 2 21
6 B 2 22
7 B 2 23
8 C 3 31
9 C 3 32
df.desired = data.frame(group = c("A","A","A","A","B","B","B","B","C","C","C","C"),
col1 = c(1, 1, 1, 1, 2, 2, 2, NA, 3, 3, NA, NA),
col2 = c(12, 13, 14, 15, 21, 22, 23, NA, 31, 32, NA, NA))
> df.desired
group col1 col2
1 A 1 12
2 A 1 13
3 A 1 14
4 A 1 15
5 B 2 21
6 B 2 22
7 B 2 23
8 B NA NA
9 C 3 31
10 C 3 32
11 C NA NA
12 C NA NA
I know how to do this with a loop but that would be super slow and I would prefer to use dplyr if possible. Does anyone have any ideas?
How about this:
library(dplyr)
df = data.frame(group = c("A","A","A","A","B","B","B","C","C"),
col1 = c(1, 1, 1, 1, 2, 2, 2, 3, 3),
col2 = c(12, 13, 14, 15, 21, 22, 23, 31, 32))
# Size of the largest group; every group is padded up to this many rows.
maxgp <- max(table(df$group))
df %>%
group_by(group) %>%
# For each non-grouping column, append NA until the group has maxgp rows.
# NOTE(review): this relies on summarise() returning several rows per group,
# which newer dplyr releases deprecate in favour of reframe() -- confirm
# against the installed dplyr version.
summarise(across(everything(), ~c(.x, rep(NA, maxgp-n()))))
#> `summarise()` has grouped output by 'group'. You can override using the
#> `.groups` argument.
#> # A tibble: 12 × 3
#> # Groups: group [3]
#> group col1 col2
#> <chr> <dbl> <dbl>
#> 1 A 1 12
#> 2 A 1 13
#> 3 A 1 14
#> 4 A 1 15
#> 5 B 2 21
#> 6 B 2 22
#> 7 B 2 23
#> 8 B NA NA
#> 9 C 3 31
#> 10 C 3 32
#> 11 C NA NA
#> 12 C NA NA
Created on 2023-02-01 by the reprex package (v2.0.1)
You can create row numbers for each group and then tidyr::complete:
library(dplyr)
df %>%
group_by(group) %>%
mutate(id = row_number()) %>%
ungroup() %>%
tidyr::complete(group, id) %>%
select(-id)
# # A tibble: 12 × 3
# group col1 col2
# <chr> <dbl> <dbl>
# 1 A 1 12
# 2 A 1 13
# 3 A 1 14
# 4 A 1 15
# 5 B 2 21
# 6 B 2 22
# 7 B 2 23
# 8 B NA NA
# 9 C 3 31
# 10 C 3 32
# 11 C NA NA
# 12 C NA NA
Update (from @Maël's answer)
As of dplyr 1.1.0, per-operation grouping with .by/by is supported for mutate(), summarise(), filter(), and the slice() family, so the code can be simplified to
df %>%
mutate(id = row_number(), .by = group) %>%
tidyr::complete(group, id) %>%
select(-id)

how to replace age with previous value + 1

I'm trying to replace missing age values in one wave by adding 1 to the value from the previous wave. So, for instance:
ID  Age  Wave
1   20   1
1   NA   2
2   61   1
2   NA   2
would become
ID  Age  Wave
1   20   1
1   21   2
2   61   1
2   62   2
library(tidyverse)
df %>%
mutate(Age = case_when(is.na(Age) ~ lag(Age) + 1,
TRUE ~ Age))
# A tibble: 4 x 3
ID Age Wave
<dbl> <dbl> <dbl>
1 1 20 1
2 1 21 2
3 2 61 1
4 2 62 2
Base R
> ave(df$Age,df$ID,FUN=function(x){x[1]+seq_along(x)-1})
[1] 20 21 61 62
With tidyverse, assuming your data is in df dataframe:
library(tidyverse)
df %>%
group_by(ID) %>% arrange(ID, Wave) %>%
mutate(missing_grp = cumsum( (is.na(Age)!=is.na(lag(Age))) | !is.na(Age) )) %>%
group_by(ID, missing_grp) %>%
mutate(age_offset=cumsum(is.na(Age))) %>%
group_by(ID) %>%
fill(Age, .direction='down') %>%
mutate(Age = Age + age_offset) %>%
ungroup() %>% select(-missing_grp, -age_offset)
It also works with multiple successive missing ages.
For the following input:
df <- tribble(
~ID, ~Age, ~Wave,
1, 21, 1,
1, NA, 2,
2, 61, 1,
2, NA, 2,
2, NA, 3,
2, 70, 4,
2, NA, 5,
)
it returns:
# A tibble: 7 × 3
ID Age Wave
<dbl> <dbl> <dbl>
1 1 21 1
2 1 22 2
3 2 61 1
4 2 62 2
5 2 63 3
6 2 70 4
7 2 71 5
In base R
within(df, Age[is.na(Age)] <- Age[which(is.na(Age)) - 1] + 1)
#> ID Age Wave
#> 1 1 20 1
#> 2 1 21 2
#> 3 2 61 1
#> 4 2 62 2
If you have more than two waves, we could use the row number:
library(dplyr)
library(tidyverse)
# Fill each missing Age from the last observed Age within the same ID and add
# one year per row elapsed since that observation. Counting from the last
# observed row (rather than from the first row of the group) leaves later
# observed ages untouched and restarts the count after them.
# Assumes rows are already in wave order with one year between waves.
df |>
  group_by(ID) |>
  mutate(
    # Rows since the most recent non-missing Age (0 on observed rows)
    gap = row_number() - cummax(row_number() * !is.na(Age))
  ) |>
  fill(Age) |>
  mutate(Age = Age + gap) |>
  ungroup() |>
  select(-gap)
Output:
# A tibble: 5 × 3
ID Age Wave
<dbl> <dbl> <dbl>
1 1 21 1
2 1 22 2
3 2 61 1
4 2 62 2
5 2 63 3

How to create combinations of values of one variable by group using tidyverse in R

I am using the combn function in R to get all the combinations of the values of variable y taking each time 2 values, grouping by the values of x. My expected final result is the tibble c.
But when I try to do it in tidyverse something is (very) wrong.
library(tidyverse)
df <- tibble(x = c(1, 1, 1, 2, 2, 2, 2),
y = c(8, 9, 7, 3, 5, 2, 1))
# This is what I want
a <- combn(df$y[df$x == 1], 2)
a <- rbind(a, rep(1, ncol(a)))
b <- combn(df$y[df$x == 2], 2)
b <- rbind(b, rep(2, ncol(b)))
c <- cbind(a, b)
c <- tibble(c)
c <- t(c)
# but using tidyverse it does not work
df %>% group_by(x) %>% mutate(z = combn(y, 2))
#> Error: Problem with `mutate()` input `z`.
#> x Input `z` can't be recycled to size 3.
#> i Input `z` is `combn(y, 2)`.
#> i Input `z` must be size 3 or 1, not 2.
#> i The error occurred in group 1: x = 1.
Created on 2020-11-18 by the reprex package (v0.3.0)
Try with combn
out = df %>% group_by(x) %>% do(data.frame(t(combn(.$y, 2))))
# A tibble: 9 x 3
# Groups: x [2]
x X1 X2
<dbl> <dbl> <dbl>
1 1 8 9
2 1 8 7
3 1 9 7
4 2 3 5
5 2 3 2
6 2 3 1
7 2 5 2
8 2 5 1
9 2 2 1
If you have dplyr v1.0.2, you can do this
df %>% group_by(x) %>% group_modify(~as_tibble(t(combn(.$y, 2L))))
Output
# A tibble: 9 x 3
# Groups: x [2]
x V1 V2
<dbl> <dbl> <dbl>
1 1 8 9
2 1 8 7
3 1 9 7
4 2 3 5
5 2 3 2
6 2 3 1
7 2 5 2
8 2 5 1
9 2 2 1
An option with summarise and unnest
library(dplyr)
library(tidyr)
df %>%
group_by(x) %>%
summarise(y = list(as.data.frame(t(combn(y, 2)))), .groups = 'drop') %>%
unnest(c(y))
# A tibble: 9 x 3
# x V1 V2
# <dbl> <dbl> <dbl>
#1 1 8 9
#2 1 8 7
#3 1 9 7
#4 2 3 5
#5 2 3 2
#6 2 3 1
#7 2 5 2
#8 2 5 1
#9 2 2 1

How to use group_by with summarise and summarise_all?

x y
1 1 1
2 3 2
3 2 3
4 3 4
5 2 5
6 4 6
7 5 7
8 2 8
9 1 9
10 1 10
11 3 11
12 4 12
The above is part of the input.
Let's suppose that it also has a bunch of other columns
I want to:
group_by x
summarise y by sum
And for all other columns, I want to summarise_all by just taking the first value
Here's an approach that breaks it into two problems and combines them:
library(dplyr)
# Summarise y on its own, summarise every other column separately, then join
# the two per-group results back together on the grouping key x.
left_join(
  # Here we want to treat column y specially: total of y per value of x
  df %>%
    group_by(x) %>%
    summarize(sum_y = sum(y)),
  # Here we exclude y and use a different summary (the first value per group)
  # for all the remaining columns
  df %>%
    group_by(x) %>%
    select(-y) %>%
    summarise_all(first),
  # Name the key explicitly instead of letting the join guess it from the
  # shared column names
  by = "x"
)
# A tibble: 5 x 3
x sum_y z
<int> <int> <int>
1 1 20 1
2 2 16 3
3 3 17 2
4 4 18 2
5 5 7 3
Sample data:
# Sample data: x is the grouping column, y is the column to be summed per
# group, and z stands in for "all the other columns".
df <- read.table(
  header = TRUE,            # the first line of `text` holds the column names
  stringsAsFactors = FALSE, # TRUE/FALSE spelled out: T and F can be reassigned
  text = "x y z
1 1 1
3 2 2
2 3 3
3 4 4
2 5 1
4 6 2
5 7 3
2 8 4
1 9 1
1 10 2
3 11 3
4 12 4"
)
library(dplyr)
# Per value of x: average every other column except y (naming each result
# "<column>_avg") and total y, all in a single summarise(). This replaces the
# deprecated summarise_each() and funs() helpers and yields the same columns
# in the same order (x, the averages, then y).
df1 %>%
  group_by(x) %>%
  summarise(
    across(-y, mean, .names = "{.col}_avg"),
    y = sum(y)
  )
#> # A tibble: 5 x 4
#> x r_avg r.1_avg y
#> <int> <dbl> <dbl> <int>
#> 1 1 6.67 6.67 20
#> 2 2 5.33 5.33 16
#> 3 3 5.67 5.67 17
#> 4 4 9 9 18
#> 5 5 7 7 7
Created on 2019-06-20 by the reprex package (v0.3.0)
Data:
# Sample data: r is a row id, x the grouping column, y the column to be summed.
df1 <- read.table(text="
r x y
1 1 1
2 3 2
3 2 3
4 3 4
5 2 5
6 4 6
7 5 7
8 2 8
9 1 9
10 1 10
11 3 11
12 4 12", header = TRUE)
# Reorder to x, y, r and select r twice so there are two "other" columns to
# summarise; the duplicate is renamed r.1 by data-frame subsetting.
df1 <- df1[, c(2, 3, 1, 1)]
library(tidyverse)
df <- tribble(~x, ~y, # making a sample data frame
1, 1,
3, 2,
2, 3,
3, 4,
2, 5,
4, 6,
5, 7,
2, 8,
1, 9,
1, 10,
3, 11,
4, 12)
# Add another column for the example: z is a random permutation of the row
# indices. seq_len() stays correct for a zero-row data frame, where
# 1:nrow(df) would produce c(1, 0).
df <- df %>%
  add_column(z = sample(seq_len(nrow(df))))
df
# If there is only one additional column and you need the first value
df %>%
group_by(x) %>%
summarise(sum_y = sum(y), z_1st = z[1])
# otherwise use summarise_at to address all the other columns
# Helper used as the summary function: keep only the first element of x
# (single-bracket indexing, so names are preserved).
f <- function(x) {
  x[1L]
}
df %>%
group_by(x) %>%
summarise_at(.vars = vars(-c('y')), .funs = f) # exclude column y from the calculations

Exclude column in `dplyr` `mutate_at` while using data in this column

I want to rescale all variables (but year and gender) in a df by one specific year, grouped by gender:
set.seed(1)
df <- data.frame(gender = c(rep("m", 5), rep("f", 5)), year = rep(1:5, 2), var_a = 1:10, var_b = 0:9)
df
gender year var_a var_b
1 m 1 1 0
2 m 2 2 1
3 m 3 3 2
4 m 4 4 3
5 m 5 5 4
6 f 1 6 5
7 f 2 7 6
8 f 3 8 7
9 f 4 9 8
10 f 5 10 9
I can generate what I expect using:
df %>% group_by(gender) %>% mutate(var_a = ifelse(year == 3, 0, var_a - var_a[year == 3])) %>%
mutate(var_b = ifelse(year == 3, 0, var_b - var_b[year == 3]))
gender year var_a var_b
<fct> <int> <dbl> <dbl>
1 m 1 -2 -2
2 m 2 -1 -1
3 m 3 0 0
4 m 4 1 1
5 m 5 2 2
6 f 1 -2 -2
7 f 2 -1 -1
8 f 3 0 0
9 f 4 1 1
10 f 5 2 2
However, this is not an option since I have too many columns.
So I tried (with no success):
df %>% group_by(gender) %>% mutate_at(vars(-gender, -year), ifelse(year == 3, 0, var_a - var_a[year == 3]))
Error in ifelse(year == 3, 0, var_a - var_a[year == 3]) : object
'year' not found
How can I exclude column names in mutate_at (or an alternative) using vars(-col_name) while still reading the data in those columns?
This is related to this one
Use position in mutate_at
library(dplyr)
df %>%
group_by(gender) %>%
mutate_at(-c(1, 2), ~ifelse(year == 3, 0, . - .[year == 3]))
# gender year var_a var_b
# <fct> <int> <dbl> <dbl>
# 1 m 1 -2 -2
# 2 m 2 -1 -1
# 3 m 3 0 0
# 4 m 4 1 1
# 5 m 5 2 2
# 6 f 1 -2 -2
# 7 f 2 -1 -1
# 8 f 3 0 0
# 9 f 4 1 1
#10 f 5 2 2
If you do not know the positions of the columns beforehand, you can find them first:
cols <- which(names(df) %in% c("gender", "year"))
df %>%
group_by(gender) %>%
mutate_at(-cols, ~ifelse(year == 3, 0, . - .[year == 3]))
Or select columns which starts_with
df %>%
group_by(gender) %>%
mutate_at(vars(starts_with("var")), ~ifelse(year == 3, 0, . - .[year == 3]))
If you add a ~ before the function you should get the wanted output.
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
set.seed(1)
df <- data.frame(gender = c(rep("m", 5),
rep("f", 5)),
year = rep(1:5, 2), var_a = 1:10, var_b = 0:9)
df
#> gender year var_a var_b
#> 1 m 1 1 0
#> 2 m 2 2 1
#> 3 m 3 3 2
#> 4 m 4 4 3
#> 5 m 5 5 4
#> 6 f 1 6 5
#> 7 f 2 7 6
#> 8 f 3 8 7
#> 9 f 4 9 8
#> 10 f 5 10 9
df %>%
group_by(gender) %>%
mutate_at(vars(-gender, -year),
~ifelse(year == 3, 0, . - .[year == 3]))
#> # A tibble: 10 x 4
#> # Groups: gender [2]
#> gender year var_a var_b
#> <fct> <int> <dbl> <dbl>
#> 1 m 1 -2 -2
#> 2 m 2 -1 -1
#> 3 m 3 0 0
#> 4 m 4 1 1
#> 5 m 5 2 2
#> 6 f 1 -2 -2
#> 7 f 2 -1 -1
#> 8 f 3 0 0
#> 9 f 4 1 1
#> 10 f 5 2 2
Created on 2019-04-29 by the reprex package (v0.2.1)
EDIT:
In older versions of dplyr you would use funs(), but it is soft deprecated as of dplyr 0.8.0
df %>%
group_by(gender) %>%
mutate_at(vars(-gender, -year),
funs(ifelse(year == 3, 0, . - .[year == 3])))

Resources