adding several lags/shifts to list of columns - r

I would like to lag several columns (e.g. value_1 + value_2 + x - see below), define their number of lags (e.g. 3) and their naming. This is some working tedious/manual code:
library(dplyr)
library(lubridate)
library(data.table)
haves <- data.frame(
id = c("a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b")
, date = as.Date(c("2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01", "2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01"))
, value_1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
, value_2 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
, x = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
)
haves$value_2 <- haves$value_2 + 1
haves$x <- haves$x + 2
haves
wants <- haves %>%
group_by(id) %>%
mutate(
value_1_lag_1 = lag(value_1, n = 1, order_by = date)
, value_1_lag_2 = lag(value_1, n = 2, order_by = date)
, value_1_lag_3 = lag(value_1, n = 3, order_by = date)
, value_2_lag_1 = lag(value_2, n = 1, order_by = date)
, value_2_lag_2 = lag(value_2, n = 2, order_by = date)
, value_2_lag_3 = lag(value_2, n = 3, order_by = date)
, x_lag_1 = lag(x, n = 1, order_by = date)
, x_lag_2 = lag(x, n = 2, order_by = date)
, x_lag_3 = lag(x, n = 3, order_by = date)
)
wants
Someone suggested that this offers a solution, so I tried the data table approach without success:
setDT(haves)
haves[, sapply(1:3, function(x){paste0('', x, '_lag_', 1:3)}) := shift(.SD, 1:3),
by = id, .SDcols = value_1:x][]
It does not produce my wants. This comes closer:
colnames <- colnames(haves)
setDT(haves)
haves[, sapply(1:3, function(x){paste0(colnames[x + 2], x, '_lag_', 1:3)}) := shift(.SD, 1:3), by = id, .SDcols = value_1:x][]
Alternatively, I may just use a loop and a function like this:
appender <- function(df, column, lag){
df %>%
group_by(
id
) %>%
mutate(
!!paste0(column, "_lag_", lag) := lag(!!rlang::sym(column), n = lag, order_by = date)
)
}
temp <- appender(haves, "value_2", 3)
Any help would be very much appreciated. Thanks!

Here's the adapted dplyr solution from the linked answer.
haves %>%
group_by(id) %>%
nest %>%
mutate(data = map(data, ~arrange(., date))) %>%
mutate(lags = map(data, function(dat) {
imap_dfc(dat[-1], ~set_names(map(1:3, lag, x = .x),
paste0(.y, "_lag_", 1:3)))
})) %>%
unnest(c(data, lags))
Is this what you're looking for?

Here is an alternative way through data.table.
library(data.table)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:data.table':
#>
#> hour, isoweek, mday, minute, month, quarter, second, wday, week,
#> yday, year
#> The following object is masked from 'package:base':
#>
#> date
library(stringr)
haves <- data.frame(
id = c("a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b")
, date = as.Date(c("2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01", "2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01"))
, value_1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
, value_2 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
, value_3 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
)
setDT(haves)
setorder(haves, date)
N_vars = 3
N_lags = 3
current_vars = paste0( "value_", rep(1:N_vars, each = 1) )
new_vars = paste0( "value_", rep(1:N_vars, each = N_lags), "_lag_", 1:N_lags )
to_define = new_vars[ str_detect(new_vars, "lag_1") ]
haves[ , (new_vars) := shift( .SD, 1:N_lags ), .SDcols = current_vars]
haves
#> id date value_1 value_2 value_3 value_1_lag_1 value_1_lag_2
#> 1: a 2015-01-01 1 1 1 NA NA
#> 2: b 2015-01-01 7 7 7 1 NA
#> 3: a 2015-02-01 2 2 2 7 1
#> 4: b 2015-02-01 8 8 8 2 7
#> 5: a 2015-03-01 3 3 3 8 2
#> 6: b 2015-03-01 9 9 9 3 8
#> 7: a 2015-04-01 4 4 4 9 3
#> 8: b 2015-04-01 10 10 10 4 9
#> 9: a 2015-05-01 5 5 5 10 4
#> 10: b 2015-05-01 11 11 11 5 10
#> 11: a 2015-06-01 6 6 6 11 5
#> 12: b 2015-06-01 12 12 12 6 11
#> value_1_lag_3 value_2_lag_1 value_2_lag_2 value_2_lag_3 value_3_lag_1
#> 1: NA NA NA NA NA
#> 2: NA 1 NA NA 1
#> 3: NA 7 1 NA 7
#> 4: 1 2 7 1 2
#> 5: 7 8 2 7 8
#> 6: 2 3 8 2 3
#> 7: 8 9 3 8 9
#> 8: 3 4 9 3 4
#> 9: 9 10 4 9 10
#> 10: 4 5 10 4 5
#> 11: 10 11 5 10 11
#> 12: 5 6 11 5 6
#> value_3_lag_2 value_3_lag_3
#> 1: NA NA
#> 2: NA NA
#> 3: 1 NA
#> 4: 7 1
#> 5: 2 7
#> 6: 8 2
#> 7: 3 8
#> 8: 9 3
#> 9: 4 9
#> 10: 10 4
#> 11: 5 10
#> 12: 11 5
Created on 2020-05-04 by the reprex package (v0.3.0)

Related

R incrementing a variable in dplyr

I have the following grouped data frame:
library(dplyr)
# Create a sample dataframe
df <- data.frame(
student = c("A", "A", "A","B","B", "B", "C", "C","C"),
grade = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
age= c(NA, 6, 6, 7, 7, 7, NA, NA, 9)
)
I want to update the age of each student so that it is one plus the age in the previous year, with their age in the first year they appear in the dataset remaining unchanged. For example, student A's age should be NA, 6, 7, student B's age should be 7,8,9, and student C's age should be NA, NA, 9.
How about this:
library(dplyr)
df <- data.frame(
student = c("A", "A", "A","B","B", "B", "C", "C","C"),
grade = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
age= c(NA, 6, 6, 7, 7, 7, NA, NA, 9)
)
df %>%
group_by(student) %>%
mutate(age = age + cumsum(!is.na(age))-1)
#> # A tibble: 9 × 3
#> # Groups: student [3]
#> student grade age
#> <chr> <dbl> <dbl>
#> 1 A 1 NA
#> 2 A 2 6
#> 3 A 3 7
#> 4 B 1 7
#> 5 B 2 8
#> 6 B 3 9
#> 7 C 1 NA
#> 8 C 2 NA
#> 9 C 3 9
Created on 2022-12-30 by the reprex package (v2.0.1)
in data.table, assuming the order of the rows is the 'correct' order:
library(data.table)
setDT(df)[, new_age := age + rowid(age) - 1, by = .(student)]
# student grade age new_age
# 1: A 1 NA NA
# 2: A 2 6 6
# 3: A 3 6 7
# 4: B 1 7 7
# 5: B 2 7 8
# 6: B 3 7 9
# 7: C 1 NA NA
# 8: C 2 NA NA
# 9: C 3 9 9

Conditional replace in vector

I am trying to replace all values with NAs in a vector (by group) if there
are at least two values that are greater than 4 where x is between 2 and
3.
In this example, in group a, there are 2 values greater than 4 for 2 <= x <= 3.
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
tibble(
grp = c("a", "a", "a", "b", "b", "b"),
x = c(1, 2, 3, 1, 2, 3),
val = c(4, 5, 6, 1, 2, 1)
) %>%
group_by(grp) %>%
mutate(val2 = ifelse(sum(val[between(x, 2, 3)] > 4) >= 2, NA, val))
#> # A tibble: 6 × 4
#> # Groups: grp [2]
#> grp x val val2
#> <chr> <dbl> <dbl> <dbl>
#> 1 a 1 4 NA
#> 2 a 2 5 NA
#> 3 a 3 6 NA
#> 4 b 1 1 1
#> 5 b 2 2 1
#> 6 b 3 1 1
Expected output
tibble(
grp = c("a", "a", "a", "b", "b", "b"),
x = c(1, 2, 3, 1, 2, 3),
val = c(4, 5, 6, 1, 2, 1),
val2 = c(NA, NA, NA, 1, 2, 1)
)
#> # A tibble: 6 × 4
#> grp x val val2
#> <chr> <dbl> <dbl> <dbl>
#> 1 a 1 4 NA
#> 2 a 2 5 NA
#> 3 a 3 6 NA
#> 4 b 1 1 1
#> 5 b 2 2 2
#> 6 b 3 1 1
Created on 2021-10-25 by the reprex package (v2.0.1)
The problem is that ifelse return a vector with length equal to the first parameter. Since sum(val[between(x, 2, 3)] > 4) >= 2 returns a logical vector of length 1, only the first val is returned and then it's recycled to the full length. For example ifelse(TRUE, 1:3, 11:13) will only return 1. You could either use rep to repeat that value for the full length
mutate(val2 = ifelse(rep(sum(val[between(x, 2, 3)] > 4) >= 2, n()), NA, val))
or use a standard if/else statement
mutate(val2 = if(sum(val[between(x, 2, 3)] > 4) >= 2) NA else val)

Filling the missing values by selecting the values in the previous row using R [duplicate]

This question already has answers here:
Filling missing value in group
(3 answers)
Closed 1 year ago.
In my table below, the thrid column is ternary character (A, B & C): Table
I like to fill the empty cells with the value from the prevoius cell except for C in this way:Table
The data frame is:
df <- data.frame(
'ID' = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23)
,'V1' = c("desc1", "desc2", "desc3", "desc4", "desc5", "desc6", "desc7", "desc8", "desc9", "desc10", "desc11", "desc12", "desc13", "desc14", "desc15", "desc16", "desc17", "desc18", "desc19", "desc20", "desc21", "desc22", "desc23")
,'V2' =c("A", "", "", "B", "", "", "", "C", "", "A", "", "", "", "B", "", "", "C", "", "A", "B", "", "C", ""))
library(data.table)
library(magrittr)
df <- data.frame(
'ID' = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23)
,'V1' = c("desc1", "desc2", "desc3", "desc4", "desc5", "desc6", "desc7", "desc8", "desc9", "desc10", "desc11", "desc12", "desc13", "desc14", "desc15", "desc16", "desc17", "desc18", "desc19", "desc20", "desc21", "desc22", "desc23")
,'V2' =c("A", "", "", "B", "", "", "", "C", "", "A", "", "", "", "B", "", "", "C", "", "A", "B", "", "C", ""))
df[df == ""] <- NA
setDT(df)[, V2 := zoo::na.locf(V2)] %>%
.[, id := rowid(V2), by = rleid(V2)] %>%
.[, V2 := ifelse(V2 == "C" & id > 1, NA, V2)] %>%
.[, id := NULL] %>%
.[]
#> ID V1 V2
#> 1: 1 desc1 A
#> 2: 2 desc2 A
#> 3: 3 desc3 A
#> 4: 4 desc4 B
#> 5: 5 desc5 B
#> 6: 6 desc6 B
#> 7: 7 desc7 B
#> 8: 8 desc8 C
#> 9: 9 desc9 <NA>
#> 10: 10 desc10 A
#> 11: 11 desc11 A
#> 12: 12 desc12 A
#> 13: 13 desc13 A
#> 14: 14 desc14 B
#> 15: 15 desc15 B
#> 16: 16 desc16 B
#> 17: 17 desc17 C
#> 18: 18 desc18 <NA>
#> 19: 19 desc19 A
#> 20: 20 desc20 B
#> 21: 21 desc21 B
#> 22: 22 desc22 C
#> 23: 23 desc23 <NA>
#> ID V1 V2
Created on 2021-08-18 by the reprex package (v2.0.1)
We could use fill function from tidyr package combining with an ifelse condition:
library(dplyr)
library(tidyr)
df %>%
mutate(V3 = V2) %>%
fill(V3) %>%
mutate(V3 = ifelse(V3 == "C", V2, V3))
Output:
ID V1 V2 V3
<dbl> <chr> <chr> <chr>
1 1 desc1 A A
2 2 desc2 NA A
3 3 desc3 NA A
4 4 desc4 B B
5 5 desc5 NA B
6 6 desc6 NA B
7 7 desc7 NA B
8 8 desc8 C C
9 9 desc9 NA NA
10 10 desc10 A A
# … with 13 more rows

Reshaping data by appending rows from different groups to the same row

I have data as follows:
DT <- structure(list(Area = c("A", "A", "A", "A", "B", "B", "B", "B"
), Year = c(1, 1, 2, 2, 1, 1, 2, 2), Group = c(1, 2, 1, 2, 1,
2, 1, 2), Population_Count = c(10, 12, 10, 12, 10, 13, 10, 11
), Male_Count = c(5, 7, 5, 4, 5, 8, 5, 6), Female_Count = c(5,
5, 5, 8, 5, 5, 5, 5)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
# A tibble: 8 x 6
Area Year Group Population_Count Male_Count Female_Count
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 1 10 5 5
2 A 1 2 12 7 5
3 A 2 1 10 5 5
4 A 2 2 12 4 8
5 B 1 1 10 5 5
6 B 1 2 13 8 5
7 B 2 1 10 5 5
8 B 2 2 11 6 5
I would like to keep one observations per Area-Year, without losing any information. I tried to do
DTcast <- dcast(DT, Area + Year ~ Group + Population_Count + Male_Count + Female_Count)
But that results in a lot of rubbish:
Area Year 1_10_5_5 2_11_6_5 2_12_4_8 2_12_7_5 2_13_8_5
1 A 1 5 NA NA 5 NA
2 A 2 5 NA 8 NA NA
3 B 1 5 NA NA NA 5
4 B 2 5 5 NA NA NA
In addition, when I apply it to the actual data, I get:
Using 'H_FEMALE' as value column. Use 'value.var' to override
Error in CJ(1:72284, 1:1333365) :
Cross product of elements provided to CJ() would result in 96380955660 rows which exceeds .Machine$integer.max == 2147483647
So I think I am doing something wrong. I think it maybe has to do with the value.var which I do not know how to select.
Desired result:
# A tibble: 4 x 9
Area Year Group `Population_Count_ Group_1` `Male_Count_ Group_1` `Female_Count_ Group_1` `Population_Count_ Group_2` `Male_Count_ Group_2` `Female_Count_ Group_2`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 1 10 5 5 12 7 5
2 A 2 1 10 5 5 12 4 8
3 B 1 1 10 5 5 13 8 5
4 B 2 1 10 5 5 11 6 5
library(tidyverse)
DT %>% pivot_wider(id_cols = c("Area", "Year"), names_from = "Group", values_from = 4:6)
> DT %>% pivot_wider(id_cols = c("Area", "Year"), names_from = "Group", values_from = 4:6)
# A tibble: 4 x 8
Area Year Population_Count_1 Population_Count_2 Male_Count_1 Male_Count_2 Female_Count_1 Female_Count_2
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 10 12 5 7 5 5
2 A 2 10 12 5 4 5 8
3 B 1 10 13 5 8 5 5
4 B 2 10 11 5 6 5 5
This will name your columns as desired
DT %>% pivot_wider(id_cols = c("Area", "Year"),
names_from = "Group",
values_from = 4:6,
names_sep = "_Group_")
use data.table
library(data.table)
dt <- structure(list(Area = c("A", "A", "A", "A", "B", "B", "B", "B"
), Year = c(1, 1, 2, 2, 1, 1, 2, 2), Group = c(1, 2, 1, 2, 1,
2, 1, 2), Population_Count = c(10, 12, 10, 12, 10, 13, 10, 11
), Male_Count = c(5, 7, 5, 4, 5, 8, 5, 6), Female_Count = c(5,
5, 5, 8, 5, 5, 5, 5)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
setDT(dt)
dcast(
dt,
formula = Area + Year ~ Group,
value.var = grep("_Count", names(dt), value = T)
)
#> Area Year Population_Count_1 Population_Count_2 Male_Count_1 Male_Count_2
#> 1: A 1 10 12 5 7
#> 2: A 2 10 12 5 4
#> 3: B 1 10 13 5 8
#> 4: B 2 10 11 5 6
#> Female_Count_1 Female_Count_2
#> 1: 5 5
#> 2: 5 8
#> 3: 5 5
#> 4: 5 5
Created on 2020-12-18 by the reprex package (v0.3.0)

dplyr row sum on selected rows [duplicate]

This question already has an answer here:
Row sum using mutate and select [duplicate]
(1 answer)
Closed 4 years ago.
I have the following data:
library(dplyr)
library(purrr)
d <- data.frame(
Type= c("d", "e", "d", "e"),
"2000"= c(1, 5, 1, 5),
"2001" = c(2, 5 , 6, 4),
"2002" = c(8, 9, 6, 3))
I would like to use rowsum and mutate to generate a new row which is the sum of 'd' and another row which is the sum of 'e' so that the data looks like this:
d2 <- data.frame(
Type= c("d", "e", "d", "e", "sum_of_d", "Sum_of_e"),
"2000"= c(1, 5, 1, 5, 2, 10),
"2001" = c(2, 5 , 6, 4, 8, 9),
"2002" = c(8, 9, 6, 3, 14, 12))
I think the code should look something like this:
d %>%
dplyr::mutate(sum_of_d = rowSums(d[1,3], na.rm = TRUE)) %>%
dplyr::mutate(sum_of_e = rowSums(d[2,4], na.rm = TRUE)) -> d2
however this does not quite work. Any ideas?
Thanks
You're looking for the sum by Type across all other columns, so..
library(dplyr)
d %>%
group_by(Type) %>%
summarise_all(sum) %>%
mutate(Type = paste0("sum_of_", Type)) %>%
rbind(d, .)
Type X2000 X2001 X2002
1 d 1 2 8
2 e 5 5 9
3 d 1 6 6
4 e 5 4 3
5 sum_of_d 2 8 14
6 sum_of_e 10 9 12
d %>%
group_by(Type) %>%
summarize_all(sum) %>%
mutate(Type=paste0("sum_of_",Type)) %>%
bind_rows(d,.)
Type X2000 X2001 X2002
1 d 1 2 8
2 e 5 5 9
3 d 1 6 6
4 e 5 4 3
5 sum_of_d 2 8 14
6 sum_of_e 10 9 12

Resources