Add column to table with values depending on dates - r

I have a table with dates stored as numbers and a value for each date. Now I'd like to add another column, weekSum, which contains the sum of value over the last week. However, some dates are missing (so I can't always use the current and previous 6 rows). My table looks like this:
df <- data.frame('date' = c(20160309, 20160310, 20160311, 20160312, 20160313, 20160314, 20160315, 20160316, 20160318, 20160319, 20160321), 'value' = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11))
date value
20160309 1
20160310 2
20160311 3
20160312 4
20160313 5
20160314 6
20160315 7
20160316 8
20160318 9 #17th skipped
20160319 10
20160321 11 #20th skipped
I'd like to get the following as output:
date value weekSum
20160309 1 NA
20160310 2 NA
20160311 3 NA
20160312 4 NA
20160313 5 NA
20160314 6 NA
20160315 7 28 # 1+2+3+4+5+6+7
20160316 8 35 # 2+3+4+5+6+7+8
20160318 9 39 # 4+5+6+7+8+9
20160319 10 45 # 5+6+7+8+9+10
20160321 11 45 # 7+8+9+10+11
How can this be done?

1) Convert the data frame to zoo and define a weekSum function which subsets its input to the last week and sums that. Then use rollapplyr with coredata = FALSE so that it passes a zoo object with times, not just the core data, to the weekSum function.
library(zoo)
z <- read.zoo(df, format = "%Y%m%d")
weekSum <- function(z) sum(z[time(z) > tail(time(z), 1) - 7])
transform(df, weekSum = rollapplyr(z, 7, weekSum, fill = NA, coredata = FALSE))
giving:
date value weekSum
2016-03-09 20160309 1 NA
2016-03-10 20160310 2 NA
2016-03-11 20160311 3 NA
2016-03-12 20160312 4 NA
2016-03-13 20160313 5 NA
2016-03-14 20160314 6 NA
2016-03-15 20160315 7 28
2016-03-16 20160316 8 35
2016-03-18 20160318 9 39
2016-03-19 20160319 10 45
2016-03-21 20160321 11 45
2) An alternative is to fill in the value at the missing dates with zero and then just use rollsumr with a width of 7. z is from (1). (Note this relies on none of the original values being zero, since z0 != 0 is used to pick out the original dates.)
z0 <- merge(z, zoo(, seq(start(z), end(z), "day")), fill = 0)
transform(df, weekSum = rollsumr(z0, 7, fill = NA)[z0 != 0])

With base R it can be done like this. First convert the numeric dates to Date class so that seq() can fill in the missing days:
df2 <- transform(df, date = as.Date(as.character(date), "%Y%m%d"))
res <- merge(df2, data.frame(date = seq(df2$date[1], to = df2$date[nrow(df2)], by = "days")), all.y = TRUE)
res$weekSum <- NA
for(i in seq_along(res$weekSum)[-seq_len(6)]){
res$weekSum[i] <- sum(res$value[(i - 6):i], na.rm = TRUE)
}
res <- res[!is.na(res$value), ]
res
# date value weekSum
#1 2016-03-09 1 NA
#2 2016-03-10 2 NA
#3 2016-03-11 3 NA
#4 2016-03-12 4 NA
#5 2016-03-13 5 NA
#6 2016-03-14 6 NA
#7 2016-03-15 7 28
#8 2016-03-16 8 35
#10 2016-03-18 9 39
#11 2016-03-19 10 45
#13 2016-03-21 11 45
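Not from the original answers: a data.table sketch of the same computation using a non-equi self-join over the window [date - 6, date]; the names dt and win are illustrative.
library(data.table)
dt <- data.table(date = as.Date(as.character(df$date), "%Y%m%d"), value = df$value)
win <- dt[, .(lo = date - 6, hi = date)]
dt[, weekSum := dt[win, on = .(date >= lo, date <= hi), sum(value), by = .EACHI]$V1]
dt[date - min(date) < 6, weekSum := NA] # first 6 days lack a full week of history
dt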

Here is an approach using tidyverse tools. This method uses tidyr::complete to construct the full date sequence, making it easy to take the current row and the previous 6 as suggested. Be careful here if there are NA values in value to begin with: those rows will currently be filtered out at the end. Tweaks are possible to avoid this if necessary.
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#>
#> date
df <- data.frame('date' = c(20160309, 20160310, 20160311, 20160312, 20160313, 20160314, 20160315, 20160316, 20160318, 20160319, 20160321), 'value' = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11))
df %>%
mutate(date = ymd(date)) %>%
complete(date = seq.Date(min(date), max(date), by = 1)) %>%
arrange(date) %>%
mutate(
newval = replace_na(value, 0),
weekSum = newval + lag(newval) + lag(newval, 2) + lag(newval, 3) +
lag(newval, 4) + lag(newval, 5) + lag(newval, 6)
) %>%
select(-newval) %>%
filter(!is.na(value))
#> # A tibble: 11 x 3
#> date value weekSum
#> <date> <dbl> <dbl>
#> 1 2016-03-09 1. NA
#> 2 2016-03-10 2. NA
#> 3 2016-03-11 3. NA
#> 4 2016-03-12 4. NA
#> 5 2016-03-13 5. NA
#> 6 2016-03-14 6. NA
#> 7 2016-03-15 7. 28.
#> 8 2016-03-16 8. 35.
#> 9 2016-03-18 9. 39.
#> 10 2016-03-19 10. 45.
#> 11 2016-03-21 11. 45.
Created on 2018-05-07 by the reprex package (v0.2.0).
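For completeness (not part of the original answers): the slider package handles irregular date indexes directly via slide_index_dbl(), which here sums over a true trailing 7-day window. A minimal sketch, assuming slider and lubridate are installed:
library(slider)
library(lubridate)
d <- ymd(df$date)
df$weekSum <- slide_index_dbl(df$value, d, sum, .before = days(6), .complete = TRUE)
df
With .complete = TRUE, the first rows, whose 7-day window extends before the first observed date, come back as NA, matching the expected output.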


How to filter dates with double curly brackets?

Similar to my other question about double curly brackets within ggplot, I'm also struggling with double curly brackets when filtering dates in general. In this function, users can specify a time period (week, month) and the function then filters on the respective time period. For instance, selecting 'month' should make the function focus just on the Month and Month_score columns. However, it keeps failing or producing incorrect output:
Here's my sample data along with 2 attempts that I made:
#Sample data
library(dplyr)
library(lubridate)
test <- tibble(Week = seq(as.Date("2014-09-04"), by = "week", length.out = 8),
Month = ymd(rep('2014-09-01', 4), rep('2014-10-01', 4)),
Week_score = c(2, 3, 4, 6, 5, 7, 8, 9),
Month_score = c(15, NA, NA, NA, 29, NA, NA, NA))
Attempt 1: no rows are removed (which is incorrect):
date_filter <- function(data, time_period = c("Week", "Month"), start_date = NA_Date_) {
data %>%
filter({{time_period}} > start_date)
}
date_filter(data = test, time_period = "Week", start_date = '2014-09-06')
Attempt 2: I explicitly call out that start_date is a date, but I get an error:
date_filter <- function(data, time_period = c("Week", "Month"), start_date = NA_Date_) {
data %>%
filter({{time_period}} > as.Date(start_date))
}
date_filter(data = test, time_period = "Week", start_date = '2014-09-06')
{{}} is meant to be used on unquoted variable names, not on strings.
Remove the "" from time_period when calling your function, and it works:
date_filter <- function(data, time_period, start_date = NA_Date_) {
data %>%
filter({{time_period}} > as.Date(start_date))
}
date_filter(data = test, time_period = Week, start_date = '2014-09-06')
#> # A tibble: 7 × 4
#> Week Month Week_score Month_score
#> <date> <date> <dbl> <dbl>
#> 1 2014-09-11 2014-09-01 3 NA
#> 2 2014-09-18 2014-09-01 4 NA
#> 3 2014-09-25 2014-09-01 6 NA
#> 4 2014-10-02 2014-10-01 5 29
#> 5 2014-10-09 2014-10-01 7 NA
#> 6 2014-10-16 2014-10-01 8 NA
#> 7 2014-10-23 2014-10-01 9 NA
Created on 2022-04-02 by the reprex package (v2.0.1)
rlang has a nice article which I recommend everybody to read before working with {{}}.
Why does time_period = "Week" work in the first function but not in the other?
Because {{time_period}} gets interpreted as a string, and not the variable Week. You quite literally are trying to compare "Week" > "2014-09-06" in the first function (which is always TRUE), and "Week" > as.Date("2014-09-06") in the second function (which makes R want to convert "Week" into a date, and thus you get an error).
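A quick console check (not from the original answer) makes the difference concrete:
"Week" > "2014-09-06" # TRUE: lexicographic string comparison; "W" sorts after "2"
# "Week" > as.Date("2014-09-06") # error: R tries to coerce "Week" to a Date and fails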
If we want to pass either quoted or unquoted, then convert to sym with ensym
date_filter <- function(data, time_period = c("Week", "Month"),
start_date = NA_Date_) {
time_period <- rlang::ensym(time_period)
data %>%
filter(!!time_period > start_date)
}
Testing:
> date_filter(data = test, time_period = "Week", start_date = '2014-09-06')
# A tibble: 7 × 4
Week Month Week_score Month_score
<date> <date> <dbl> <dbl>
1 2014-09-11 2014-09-01 3 NA
2 2014-09-18 2014-09-01 4 NA
3 2014-09-25 2014-09-01 6 NA
4 2014-10-02 2014-10-01 5 29
5 2014-10-09 2014-10-01 7 NA
6 2014-10-16 2014-10-01 8 NA
7 2014-10-23 2014-10-01 9 NA
>
> date_filter(data = test, time_period = Week, start_date = '2014-09-06')
# A tibble: 7 × 4
Week Month Week_score Month_score
<date> <date> <dbl> <dbl>
1 2014-09-11 2014-09-01 3 NA
2 2014-09-18 2014-09-01 4 NA
3 2014-09-25 2014-09-01 6 NA
4 2014-10-02 2014-10-01 5 29
5 2014-10-09 2014-10-01 7 NA
6 2014-10-16 2014-10-01 8 NA
7 2014-10-23 2014-10-01 9 NA
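If you would rather keep passing the column name as a string (as in the original calls), the documented tidyverse alternative is the .data pronoun; a sketch, where date_filter_chr is an illustrative name:
library(dplyr)
date_filter_chr <- function(data, time_period, start_date = NA_Date_) {
data %>%
filter(.data[[time_period]] > as.Date(start_date))
}
date_filter_chr(data = test, time_period = "Week", start_date = "2014-09-06")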

Making rollApply() skip n steps - R

Below is my attempt at a minimal reproducible example. Briefly explained, I am using rollApply from the rowr package to calculate a function over a rolling window, and using data from two columns simultaneously. If possible, I would like to skip n steps between each time the function is calculated on a new window. I will try to make it clear what I mean in the example below.
Here is the example data:
library(dplyr)
df1 <- tibble(
x = c(1:9),
y = c(1:9),
Date = as.Date(c("2015-08-08", "2015-08-15", "2015-08-22",
"2015-08-29","2015-09-05", "2015-09-12", "2015-09-19",
"2015-09-26", "2015-10-03"))
)
Here are the example functions:
calc_ex <- function(y){
sum(y[,1] + y[,2])
}
roll_calc_ex <- function(y){
vec <- c(rep(NA, 2), rowr::rollApply(y, calc_ex, window = 3, minimum = 3))
y <- y %>%
mutate(estimate = vec)
return(y)
}
Applying the function roll_calc_ex() to df1, I get the following output:
> roll_calc_ex(df1)
# A tibble: 9 x 4
x y Date estimate
<int> <int> <date> <int>
1 1 1 2015-08-08 NA
2 2 2 2015-08-15 NA
3 3 3 2015-08-22 12
4 4 4 2015-08-29 18
5 5 5 2015-09-05 24
6 6 6 2015-09-12 30
7 7 7 2015-09-19 36
8 8 8 2015-09-26 42
9 9 9 2015-10-03 48
Ideally, I would like to have a rolling window that skips n steps, say n=2, to produce the following output:
# A tibble: 9 x 4
x y Date estimate
<int> <int> <date> <int>
1 1 1 2015-08-08 NA
2 2 2 2015-08-15 NA
3 3 3 2015-08-22 12
4 4 4 2015-08-29 NA
5 5 5 2015-09-05 NA
6 6 6 2015-09-12 30
7 7 7 2015-09-19 NA
8 8 8 2015-09-26 NA
9 9 9 2015-10-03 48
Alternatively, instead of returning NA for every skipped row, the number from the previous calculation could be filled in (something I am planning to do later anyway using fill() from tidyverse).
If this is possible to solve using, for example, rollapply() from the zoo package, that would also be interesting to hear. I am only using rowr::rollApply() because I need to apply the function to two columns simultaneously. I know it is possible to use runner() from the runner package, but in my more complicated problem I need to run parallel computations. I am using the furrr package for parallelization, and my code works well with rollApply(), but not with runner(). The problem I have with runner is explained here: Problem with parallelization using furrr [and runner::runner()] in R.
Thanks to anyone that took the time to read this post. Any help will be much appreciated.
1) The rowr package was removed from CRAN, but we can use rollapplyr from zoo (like rollapply, except the r on the end means it defaults to right alignment). It has a by.column= argument to specify whether processing is performed column by column (TRUE) or all columns are passed at once (FALSE), and a by= argument which causes skipping. Here by = 2 computes on every 2nd window; use by = 3 to skip two rows between computations, matching the desired output in the question.
library(dplyr)
library(zoo)
mutate(df1, roll =
rollapplyr(cbind(x, y), 3, calc_ex, fill = NA, by.column = FALSE, by = 2)
)
giving:
x y Date roll
1 1 1 2015-08-08 NA
2 2 2 2015-08-15 NA
3 3 3 2015-08-22 12
4 4 4 2015-08-29 NA
5 5 5 2015-09-05 24
6 6 6 2015-09-12 NA
7 7 7 2015-09-19 36
8 8 8 2015-09-26 NA
9 9 9 2015-10-03 48
2) Using complex arithmetic would also work:
f <- function(v) calc_ex(cbind(Re(v), Im(v)))
mutate(df1, roll = rollapplyr(x + y * 1i, 3, f, fill = NA, by = 2))
3) And if we look inside calc_ex, it could also be written like this (although this does not generalize):
mutate(df1, roll = rollapplyr(x + y, 3, sum, fill = NA, by = 2))
4) We could also consider using zoo objects rather than data frames:
z <- read.zoo(df1, index = "Date")
merge(z, roll = rollapplyr(z, 3, calc_ex, by.column = FALSE, by = 2))
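Since the question mentions filling skipped rows with the previous computed value later anyway, a minimal sketch applying tidyr::fill() to the result of (3):
library(dplyr)
library(tidyr)
library(zoo)
df1 %>%
mutate(roll = rollapplyr(x + y, 3, sum, fill = NA, by = 2)) %>%
fill(roll) # carries each computed value forward; leading NAs stay NA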
If we were to use the slider package
library(tidyverse)
library(slider)
df1 <- tibble(
x = c(1:9),
y = c(1:9),
Date = as.Date(c("2015-08-08", "2015-08-15", "2015-08-22",
"2015-08-29","2015-09-05", "2015-09-12", "2015-09-19",
"2015-09-26", "2015-10-03")))
df1 |>
mutate(rolling_sum = slide2_dbl(.x = x, .y = y, .f = sum,
.step = 3, .before = 2, .complete = TRUE))
#> # A tibble: 9 x 4
#> x y Date rolling_sum
#> <int> <int> <date> <dbl>
#> 1 1 1 2015-08-08 NA
#> 2 2 2 2015-08-15 NA
#> 3 3 3 2015-08-22 12
#> 4 4 4 2015-08-29 NA
#> 5 5 5 2015-09-05 NA
#> 6 6 6 2015-09-12 30
#> 7 7 7 2015-09-19 NA
#> 8 8 8 2015-09-26 NA
#> 9 9 9 2015-10-03 48
Created on 2021-10-21 by the reprex package (v2.0.1)

How to extract values of one dataframe from another dataframe having same columns in r?

How to get values of columns from df2 into df1 for matching df1 dates?
df1 <- data.frame(a= seq.Date(from = as.Date("2021-08-14"), to = as.Date("2021-08- 20")
, by = 1),
vv= NA,
tv= NA)
df2 <- data.frame(a=as.Date(c("2021-08-14","2021-08-15","2021-08-22",
"2021-08-16","2021-08-17","2021-08-25","2021-08-26")),
vv=c(12,11,13,14,15,16,10),
tv= c(1,2,3,4,5,6,7))
Required Output
df3 <- data.frame(
a = seq.Date(from = as.Date("2021-08-14"), to = as.Date("2021-08-19"), by = 1),
vv = c(12, 11, 14,15, NA,NA),
tv = c(1,2,4,5,NA, NA)
)
You could use rows_patch(), which fills the NA values in df1 with matching values from df2:
library(dplyr)
df1 %>%
rows_patch(semi_join(df2, df1, by = "a"))
returns
a vv tv
1 2021-08-14 12 1
2 2021-08-15 11 2
3 2021-08-16 14 4
4 2021-08-17 15 5
5 2021-08-18 NA NA
6 2021-08-19 NA NA
7 2021-08-20 NA NA
I couldn't reproduce your data samples at first, so I changed them a little (the date "2021-08- 20" contains a stray space; fixed below):
df1= data.frame(a= seq.Date(from = as.Date("2021-08-14"), to = as.Date("2021-08-20"), by = 1),
vv= NA,
tv= NA)
df2= data.frame(a=as.Date(c("2021-08-14","2021-08-15","2021-08-22",
"2021-08-16","2021-08-17","2021-08-25","2021-08-26")),
vv=c(12,11,13,14,15,16,10),
tv= c(1,2,3,4,5,6,7))
df1$vv <- NULL
df1$tv <- NULL
df4 <- merge(df1,df2,by = 'a',all.x=T)
Output:
a vv tv
<date> <dbl> <dbl>
1 2021-08-14 12 1
2 2021-08-15 11 2
3 2021-08-16 14 4
4 2021-08-17 15 5
5 2021-08-18 NA NA
6 2021-08-19 NA NA
7 2021-08-20 NA NA
We could do a left_join and remove the columns not wanted with select(-c(vv.x, tv.x))
library(dplyr)
left_join(df1, df2, by="a") %>%
select(-c(vv.x, tv.x), vv=vv.y, tv=tv.y)
a vv tv
1 2021-08-14 12 1
2 2021-08-15 11 2
3 2021-08-16 14 4
4 2021-08-17 15 5
5 2021-08-18 NA NA
6 2021-08-19 NA NA
7 2021-08-20 NA NA
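For completeness, a data.table sketch of the same join (not from the original answers):
library(data.table)
setDT(df1)[, c("vv", "tv") := NULL] # drop the placeholder NA columns
setDT(df2)
df3 <- df2[df1, on = "a"] # right join: keeps every date in df1, NA where df2 has no match
df3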

Find n overlapping dates within n number of days

I'm looking to find >=4 unique events that all occur within a group within a 90 day period and then flag the ID.
Just a test example:
library(dplyr)
set.seed(1)
test <- data.frame(
PATID = sample(1:1e4, 1e5, replace = TRUE),
PROV = sample(1:50, 1e5, replace = TRUE),
GROUP = sample(0:1, 1e5, replace = TRUE),
DATE = as.Date(sample(
as.Date("2020-01-01"):as.Date("2020-12-31"),
1e5,
replace = TRUE
), origin = "1970-01-01")
)
If we look at PATID==5 we can see there are 4 unique PROVs with dates within 90 days of each other in our group of interest, so this ID should be flagged.
> test %>% filter(PATID==5) %>% arrange(GROUP,DATE)
PATID PROV GROUP DATE
1 5 2 0 2020-05-07
2 5 3 0 2020-05-20
3 5 3 0 2020-11-15
4 5 49 0 2020-12-14
5 5 45 1 2020-02-16
6 5 50 1 2020-03-19
7 5 38 1 2020-03-25
8 5 27 1 2020-03-29
9 5 42 1 2020-08-30
10 5 46 1 2020-11-03
11 5 25 1 2020-11-13
12 5 29 1 2020-12-26
> as.Date("2020-03-29")-as.Date("2020-02-16")<=90
[1] TRUE
Ultimately, I'm looking for the proportion of GROUP==1 vs GROUP==0 with >=4 unique PROVs within 90 days. Ideally I'd prefer using data.table simply due to the scale of data.
Trying out some code:
test %>%
filter(PATID %in% 1:5) %>%
group_by(PATID,GROUP) %>%
arrange(GROUP, DATE) %>%
mutate(lag = DATE - lag(DATE),
day_count = case_when(lag <= 90 ~ TRUE,
is.na(lag) ~ TRUE,
TRUE ~ FALSE)) %>%
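# cumsum_reset() is assumed to be a user-defined helper (not shown) that restarts the running count whenever day_count is FALSE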
mutate(crit = cumsum_reset(day_count)) %>%
ungroup() %>%
group_by(PATID) %>%
mutate(flag = case_when(max(crit) >= 4 ~ 1,
TRUE ~ 0)) %>%
arrange(PATID)
Getting closer; I just need to sort out a proper 90-day window rather than crudely testing whether each date is within 90 days of the previous one.
Maybe the following is what you are after. Please check whether the logic is what you meant; I left it more explicit than necessary so that the idea is easier to follow. The main idea: after sorting, if an observation from the same PATID & GROUP is within 90 days of the 3rd lag (diff_3 := DATE - shift(DATE, 3)), it should be flagged. This is done by checking diff_check = diff_3 <= 90, together with a check that the last 4 observations involve 4 unique PROVs. If any observation for a PATID/GROUP is flagged, the corresponding ID is flagged by to_keep = max(diff_check, na.rm = TRUE, pmin = 0) after grouping by PATID only.
Using the third lag to account for 4 or more and not strictly more than 4 observations.
Does it, all in all, make any sense?
library(data.table)
set.seed(1)
test <- data.frame(
PATID = sample(1:1e4, 1e5, replace = TRUE),
PROV = sample(1:50, 1e5, replace = TRUE),
GROUP = sample(0:1, 1e5, replace = TRUE),
DATE = as.Date(sample(
as.Date("2020-01-01"):as.Date("2020-12-31"),
1e5,
replace = TRUE
), origin = "1970-01-01")
)
dt <- as.data.table(test)
dt <- dt[order(PATID, GROUP, DATE)]
dt[, diff_3 := DATE - shift(DATE, 3), by = c("PATID", "GROUP")]
# check amount of unique values of PROV in previous 4 observations
dt[, unique_last_4 := frollapply(x = PROV, n = 4, FUN = uniqueN), by = c("PATID", "GROUP")]
# check if within 90 days and unique PROVs
dt[, diff_check := diff_3<=90 & unique_last_4==4, by = c("PATID", "GROUP")]
# final check to flag all observations of ID that satisfied at least once the above checks
dt[, to_keep := max(diff_check, na.rm = TRUE, pmin = 0), by = "PATID"]
# NOTE: unsure if you mean to group only by PATID here or by PATID & GROUP.
head(dt[to_keep==1], 20)
#> PATID PROV GROUP DATE diff_3 unique_last_4 diff_check to_keep
#> 1: 5 2 0 2020-05-07 NA days NA NA 1
#> 2: 5 3 0 2020-05-20 NA days NA NA 1
#> 3: 5 3 0 2020-11-15 NA days NA NA 1
#> 4: 5 49 0 2020-12-14 221 days 3 FALSE 1
#> 5: 5 45 1 2020-02-16 NA days NA NA 1
#> 6: 5 50 1 2020-03-19 NA days NA NA 1
#> 7: 5 38 1 2020-03-25 NA days NA NA 1
#> 8: 5 27 1 2020-03-29 42 days 4 TRUE 1
#> 9: 5 42 1 2020-08-30 164 days 4 FALSE 1
#> 10: 5 46 1 2020-11-03 223 days 4 FALSE 1
#> 11: 5 25 1 2020-11-13 229 days 4 FALSE 1
#> 12: 5 29 1 2020-12-26 118 days 4 FALSE 1
#> 13: 7 1 0 2020-04-10 NA days NA NA 1
#> 14: 7 44 0 2020-04-29 NA days NA NA 1
#> 15: 7 27 0 2020-05-05 NA days NA NA 1
#> 16: 7 41 0 2020-06-11 62 days 4 TRUE 1
#> 17: 7 35 0 2020-06-30 62 days 4 TRUE 1
#> 18: 7 11 0 2020-12-18 227 days 4 FALSE 1
#> 19: 7 24 1 2020-12-24 NA days NA NA 1
#> 20: 7 13 1 2020-12-29 NA days NA NA 1
Created on 2021-06-22 by the reprex package (v2.0.0)
dplyr version
test_keep <- test %>% arrange(PATID, GROUP, DATE) %>%
head(1000) %>% # otherwise it takes too long on my PC, which shows data.table's efficiency!
group_by(PATID, GROUP) %>%
mutate(diff_3 = DATE - lag(DATE, 3),
diff_check = diff_3<=90,
unique_last_4 = frollapply(x = PROV, n = 4, FUN = uniqueN)
) %>% group_by(PATID) %>%
mutate(keep = max(diff_check, na.rm = TRUE, pmin = 0)) %>%
arrange(PATID, GROUP)
test_keep %>% filter(keep==1) %>% head(20)
Based on the clarification "I'm looking for the annual group proportion of patients that visit >=4 providers within 90 days", you can try this:
library(data.table) #data.table 1.13.2
setDT(test)[, c("d90ago", "d90aft") := .(DATE - 90L, DATE + 90L)]
setkey(test, PATID, DATE)
test[, grp :=
.SD[.SD, on=.(PATID, DATE>=d90ago, DATE<=d90aft), by=.EACHI, +(length(unique(x.PROV))>=4L)]$V1
]
The above allows PROV within overlapping windows of 90 days to be re-used.
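To turn the row-level grp flag into the proportion the question ultimately asks for, a hypothetical follow-up collapses to one row per PATID/GROUP and then averages:
test[, .(flagged = any(grp == 1L)), by = .(PATID, GROUP)][, .(prop_flagged = mean(flagged)), by = GROUP]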
There are some ambiguities in the question, so this may not be quite right. I tried doing this using dplyr and local data frames, but the self-join causes an overflow (100,000 × 100,000).
It seems to work using data.table and using PostgreSQL, which has an OVERLAPS function.
(Note that I used lower-case variable names to make working with SQL easier.)
In the answer below, I start with a patient visit ((patid, prov, group, date) combination) and look forward 90 days to capture all visits by that patient (patid) to other providers (prov != prov_other). I then count the number of distinct providers in that lookahead period (this will be NA when there are no visits, as when looking at a patient's last visit in the sample). I then count the number of visits where the number of additional distinct providers in the subsequent 90 days is 3 or more.
Finally, I group by (group, year) and count the proportion of visits that are followed by visits to at least three other providers during the subsequent 90 days. Given the way the data are generated, it is no surprise that the two groups look similar on this metric.
Note that each patient visit forms a unit of observation here. In practice, it may make sense to aggregate by (say) (patid, year) before calculating statistics or do some other kind of aggregation.
library(data.table)
library(dplyr, warn.conflicts = FALSE)
set.seed(1)
test <- tibble(
patid = sample(1:1e4, 1e5, replace = TRUE),
prov = sample(1:50, 1e5, replace = TRUE),
group = sample(0:1, 1e5, replace = TRUE),
date = as.Date(sample(
as.Date("2020-01-01"):as.Date("2020-12-31"),
1e5,
replace = TRUE
), origin = "1970-01-01")) %>%
as.data.table()
test
#> patid prov group date
#> 1: 1017 6 1 2020-08-03
#> 2: 8004 34 0 2020-12-15
#> 3: 4775 32 0 2020-06-21
#> 4: 9725 47 1 2020-09-25
#> 5: 8462 15 0 2020-03-05
#> ---
#> 99996: 949 47 0 2020-07-05
#> 99997: 2723 37 0 2020-08-18
#> 99998: 201 27 1 2020-01-06
#> 99999: 163 9 0 2020-03-06
#> 100000: 3204 48 1 2020-11-17
df_overlap <-
test %>%
inner_join(test, by = "patid", suffix = c("", "_other")) %>%
filter(prov != prov_other) %>%
filter(date_other >= date & date_other <= date + 90L)
mt_4_provs_df <-
df_overlap %>%
group_by(patid, prov, group, date) %>%
summarize(n_providers = n_distinct(prov_other), .groups = "drop")
results <-
test %>%
left_join(mt_4_provs_df, by = c("patid", "prov", "group", "date")) %>%
mutate(mt_4_provs = n_providers >= 3,
year = year(date)) %>%
group_by(group, year) %>%
summarize(prop_mt_4_provs = mean(mt_4_provs, na.rm = TRUE),
.groups = "drop")
results
#> # A tibble: 2 x 3
#> group year prop_mt_4_provs
#> <int> <int> <dbl>
#> 1 0 2020 0.426
#> 2 1 2020 0.423
Created on 2021-06-22 by the reprex package (v2.0.0)

Adding multiple lag variables using dplyr and for loops

I have time series data that I'm predicting on, so I am creating lag variables to use in my statistical analysis. I'd like a quick way to create multiple variables given specific inputs so that I can easily cross-validate and compare models.
The following is example code that adds 2 lags for 2 different variables (4 total) given a certain category (A, B, C):
# Load dplyr
library(dplyr)
# create day, category, and 2 value vectors
days = 1:9
cats = rep(c('A','B','C'),3)
set.seed(19)
values1 = round(rnorm(9, 16, 4))
values2 = round(rnorm(9, 16, 16))
# create data frame
data = data.frame(days, cats, values1, values2)
# mutate new lag variables
LagVal = data %>% arrange(days) %>% group_by(cats) %>%
mutate(LagVal1.1 = lag(values1, 1)) %>%
mutate(LagVal1.2 = lag(values1, 2)) %>%
mutate(LagVal2.1 = lag(values2, 1)) %>%
mutate(LagVal2.2 = lag(values2, 2))
LagVal
days cats values1 values2 LagVal1.1 LagVal1.2 LagVal2.1 LagVal2.2
<int> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 A 16 -10 NA NA NA NA
2 2 B 14 24 NA NA NA NA
3 3 C 16 -6 NA NA NA NA
4 4 A 12 25 16 NA -10 NA
5 5 B 20 14 14 NA 24 NA
6 6 C 18 -5 16 NA -6 NA
7 7 A 21 2 12 16 25 -10
8 8 B 19 5 20 14 14 24
9 9 C 18 -3 18 16 -5 -6
My problem comes in at the # mutate new lag variables step, since I have about a dozen predictor variables that I would potentially want to lag up to 10 times (~13k row dataset), and I don't have the heart to create 120 new variables.
Here is my attempt at writing a function which mutates new variables given the inputs for data (dataset to mutate), variables (the variables you wish to lag), and lags (the number of lags per variable):
MultiMutate = function(data, variables, lags){
# select the data to be working with
FuncData = data
# Loop through desired variables to mutate
for (i in variables){
# Loop through number of desired lags
for (u in 1:lags){
FuncData = FuncData %>% arrange(days) %>% group_by(cats) %>%
# Mutate new variable for desired number of lags. Give new variable a name with the lag number appended
mutate(paste(i, u) = lag(i, u))
}
}
FuncData
}
To be honest I'm just sort of lost on how to get this to work. The ordering of my for-loops and overall logic makes sense, but the way the function takes characters into variables and the overall syntax seems way off. Is there a simple way to fix up this function to get my desired result?
In particular, I'm looking for:
A function like MultiMutate(data = data, variables = c(values1, values2), lags = 2) that would create the exact result of LagVal from above.
Dynamically naming the variables based on the variable and their lag. I.e. value1.1, value1.2, value2.1, value2.2, etc.
Thank you in advance and let me know if you need additional information. If there's a simpler way to get what I'm looking for, then I am all ears.
You'll have to reach deeper into the tidyverse toolbox to add them all at once. If you nest data for each value of cats, you can iterate over the nested data frames, iterating the lags over the values* columns in each.
library(tidyverse)
set.seed(47)
df <- data_frame(days = 1:9,
cats = rep(c('A','B','C'),3),
values1 = round(rnorm(9, 16, 4)),
values2 = round(rnorm(9, 16, 16)))
df %>% nest(-cats) %>%
mutate(lags = map(data, function(dat) {
imap_dfc(dat[-1], ~set_names(map(1:2, lag, x = .x),
paste0(.y, '_lag', 1:2)))
})) %>%
unnest() %>%
arrange(days)
#> # A tibble: 9 x 8
#> cats days values1 values2 values1_lag1 values1_lag2 values2_lag1
#> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 1 24. -7. NA NA NA
#> 2 B 2 19. 1. NA NA NA
#> 3 C 3 17. 17. NA NA NA
#> 4 A 4 15. 24. 24. NA -7.
#> 5 B 5 16. -13. 19. NA 1.
#> 6 C 6 12. 17. 17. NA 17.
#> 7 A 7 12. 27. 15. 24. 24.
#> 8 B 8 16. 15. 16. 19. -13.
#> 9 C 9 15. 36. 12. 17. 17.
#> # ... with 1 more variable: values2_lag2 <dbl>
data.table::shift makes this simpler, as it's vectorized. Naming takes more work than the actual lagging:
library(data.table)
setDT(df)
df[, sapply(1:2, function(x){paste0('values', x, '_lag', 1:2)}) := shift(.SD, 1:2),
by = cats, .SDcols = values1:values2][]
#> days cats values1 values2 values1_lag1 values1_lag2 values2_lag1
#> 1: 1 A 24 -7 NA NA NA
#> 2: 2 B 19 1 NA NA NA
#> 3: 3 C 17 17 NA NA NA
#> 4: 4 A 15 24 24 NA -7
#> 5: 5 B 16 -13 19 NA 1
#> 6: 6 C 12 17 17 NA 17
#> 7: 7 A 12 27 15 24 24
#> 8: 8 B 16 15 16 19 -13
#> 9: 9 C 15 36 12 17 17
#> values2_lag2
#> 1: NA
#> 2: NA
#> 3: NA
#> 4: NA
#> 5: NA
#> 6: NA
#> 7: -7
#> 8: 1
#> 9: 17
In these cases, I rely on the magic of dplyr and tidyr:
library(dplyr)
library(tidyr)
set.seed(47)
# create data
s_data = data_frame(
days = 1:9,
cats = rep(c('A', 'B', 'C'), 3),
values1 = round(rnorm(9, 16, 4)),
values2 = round(rnorm(9, 16, 16))
)
max_lag = 2 # define max number of lags
# create lags
s_data %>%
gather(select = -c("days", "cats")) %>% # gather all variables that will be lagged
mutate(n_lag = list(0:max_lag)) %>% # add list-column with lag numbers
unnest() %>% # unnest the list column
arrange(cats, key, n_lag, days) %>% # order the data.frame
group_by(cats, key, n_lag) %>% # group by relevant variables
# create lag. when grouped by vars above, n_lag is a constant vector, take 1st value
mutate(lag_val = lag(value, n_lag[1])) %>%
ungroup() %>%
# create some fancy labels
mutate(var_name = ifelse(n_lag == 0, key, paste0("Lag", key, ".", n_lag))) %>%
select(-c(key, value, n_lag)) %>% # drop unnecesary data
spread(var_name, lag_val) %>% # spread your newly created variables
select(days, cats, starts_with("val"), starts_with("Lag")) # reorder
## # A tibble: 9 x 8
## days cats values1 values2 Lagvalues1.1 Lagvalues1.2 Lagvalues2.1 Lagvalues2.2
## <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 A 24. -7. NA NA NA NA
## 2 2 B 19. 1. NA NA NA NA
## 3 3 C 17. 17. NA NA NA NA
## 4 4 A 15. 24. 24. NA -7. NA
## 5 5 B 16. -13. 19. NA 1. NA
## 6 6 C 12. 17. 17. NA 17. NA
## 7 7 A 12. 27. 15. 24. 24. -7.
## 8 8 B 16. 15. 16. 19. -13. 1.
## 9 9 C 15. 36. 12. 17. 17. 17.
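With dplyr >= 1.0.0 (newer than these answers), across() can generate all the lags in one mutate() call. A sketch of MultiMutate() in that style; the helper body and the generated column names (Lagvalues1.1 etc.) are illustrative, and the hard-coded days/cats columns are as in the question:
library(dplyr)
MultiMutate <- function(data, variables, lags) {
# one lag function per lag order, named so across() can label the new columns
fns <- setNames(lapply(seq_len(lags), function(k) function(v) dplyr::lag(v, k)), seq_len(lags))
data %>%
arrange(days) %>%
group_by(cats) %>%
mutate(across(all_of(variables), fns, .names = "Lag{.col}.{.fn}")) %>%
ungroup()
}
MultiMutate(data, c("values1", "values2"), lags = 2) # adds Lagvalues1.1, Lagvalues1.2, Lagvalues2.1, Lagvalues2.2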
