Rolling mutate function applied to a list of data frames - R

I have some time series data where I use the rolling_origin function to apply different time series splits to the data, which generates a number of lists. The time series runs from 2020-03-01 to 2020-10-30.
I want a cutoff at 2020-04-15 so that I have one month before (2020-03-15) and one month after (2020-05-15). I can use an ifelse statement to add a 1 for observations on or after the cutoff and a 0 for observations before it.
rolledData %>%
  map(., ~ mutate(.x,
                  treatment_control = ifelse(date >= as.Date("2020-04-15"), 1, 0)
  ))
But what I want to do is increment the ifelse date as I map over the list: the first element might use 2020-04-15, the next element in the sequence 2020-04-16, the one after that 2020-04-17, and so on until the end.
I could manually write out the results:
lst1 <- rolledData[[12]] %>%
  mutate(
    treatment_control = ifelse(date >= as.Date("2020-04-15"), 1, 0)
  )
lst2 <- rolledData[[13]] %>%
  mutate(
    treatment_control = ifelse(date >= as.Date("2020-04-16"), 1, 0)
  )
lst3 <- rolledData[[14]] %>%
  mutate(
    treatment_control = ifelse(date >= as.Date("2020-04-17"), 1, 0)
  )
How can I map over the list and increment the treatment_control mutate?
Note: because I am using financial data (which was just the easiest to obtain for a reproducible example), the weekends are removed; in my actual data I have a full week.
Data:
library(tidyquant)
library(rsample)
library(dplyr)
library(purrr)

data <- tq_get(c("AAPL"),
               get = "stock.prices",
               from = "2020-03-01",
               to = "2020-10-30")

rolledData <- data %>%
  rolling_origin(
    data = .,
    initial = 60,       # 2 months of data
    assess = 0,
    cumulative = FALSE,
    skip = 0
  )

rolledData <- rolledData$splits %>%
  map(., ~ analysis(.x))

If the dates differ across list elements, we can pass a vector of custom dates with the same length as rolledData to map2:
library(dplyr)
library(purrr)
rolledData2 <- rolledData %>%
  map2(., newdates,
       ~ .x %>%
         mutate(treatment_control = +(date >= .y)))
where
newdates <- seq(as.Date("2020-03-15"), length.out = length(rolledData), by = "1 day")
If it is based on the next month from the first 'date' value
library(lubridate)
rolledData2 <- rolledData %>%
  map(~ .x %>%
        mutate(treatment_control =
                 +(date >= (first(date) %m+% months(1)))))
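An alternative sketch (not from the answer above): if the cutoff should simply advance by one day per list element, purrr::imap() can derive it from the element's position. This assumes the list is unnamed (so .y is the integer position) and that rolledData is first subset so its first element corresponds to the first cutoff; start_date is an example value matching the manual lst1 above.

library(dplyr)
library(purrr)

start_date <- as.Date("2020-04-15")                    # cutoff for the first element (example)

rolledData3 <- rolledData[12:length(rolledData)] %>%   # subset so element 1 uses start_date
  imap(~ mutate(.x, treatment_control = +(date >= (start_date + .y - 1))))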

Related

Set up output-list names in slider::slide_index()

slider::slide_index() uses a purrr::map()-like syntax to apply a function over a rolling (here, time) window. The output of this function is a list. My question is: how do I set the names of this list?
The slide_index() function does not have a .names_to argument like slide_index_dfr().
See reproducible example below:
library(slider)
library(lubridate)
library(dplyr)

storms_summary <- storms %>%
  # filter dataset so it does not eat all memory and save computing time ;)
  filter(year <= 2000 & year >= 1999) %>%
  # make a date column to mimic my real data
  mutate(storm_date = as.Date(paste(year, month, day, sep = "-"))) %>%
  arrange(storm_date) %>%
  slider::slide_index(
    .x = .,
    .i = .$storm_date,
    .f = ~ summary(.x),
    .after = lubridate::weeks(2),
    .complete = TRUE
  )
Expected output is the list storms_summary with names that are character strings indicating the first and the last date of the period summarized. With the code above we get:
> names(storms_summary)
NULL
If there is no way to assign the names within the function, we can create the names ourselves and use setNames to assign them.
library(slider)
library(lubridate)
library(dplyr)

storms_summary <- storms %>%
  # filter dataset so it does not eat all memory and save computing time ;)
  filter(year <= 2000 & year >= 1999) %>%
  # make a date column to mimic my real data
  mutate(storm_date = as.Date(paste(year, month, day, sep = "-"))) %>%
  arrange(storm_date) %>%
  mutate(sliding_output = slider::slide_index(.x = cur_data(),
                                              .i = storm_date,
                                              .f = ~ summary(.x),
                                              .after = lubridate::weeks(2),
                                              .complete = TRUE),
         names = slider::slide_index_chr(.x = storm_date,
                                         .i = storm_date,
                                         .f = ~ paste0(range(.x), collapse = '-'),
                                         .after = lubridate::weeks(2)),
         sliding_output = setNames(sliding_output, names)) %>%
  select(-names)
names(storms_summary$sliding_output)
# [1] "1999-07-02-1999-07-03" "1999-07-03-1999-07-03" "1999-07-03-1999-07-03"
# [4] "1999-07-03-1999-07-03" "1999-08-24-1999-09-07" "1999-08-24-1999-09-07"
# [7] "1999-08-24-1999-09-07" "1999-08-25-1999-09-08" "1999-08-25-1999-09-08"
#[10] "1999-08-25-1999-09-08" "1999-08-25-1999-09-08" "1999-08-26-1999-09-09"
#...
#...
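As a side note, cur_data() has been superseded in more recent dplyr releases (pick() is the suggested replacement), so a variant of the same idea is to call slide_index() on the arranged data frame directly and attach the labels afterwards with setNames(). This is only a sketch; storms_arranged, storms_summary2, and window_labels are names introduced here for illustration, and " to " is used as a slightly more readable label separator.

library(slider)
library(lubridate)
library(dplyr)

storms_arranged <- storms %>%
  filter(year <= 2000 & year >= 1999) %>%
  mutate(storm_date = as.Date(paste(year, month, day, sep = "-"))) %>%
  arrange(storm_date)

# one list element per window: the summary of the rows in that window
storms_summary2 <- slide_index(storms_arranged,
                               storms_arranged$storm_date,
                               ~ summary(.x),
                               .after = weeks(2),
                               .complete = TRUE)

# character labels built from the date range of each window
window_labels <- slide_index_chr(storms_arranged$storm_date,
                                 storms_arranged$storm_date,
                                 ~ paste(range(.x), collapse = " to "),
                                 .after = weeks(2))

storms_summary2 <- setNames(storms_summary2, window_labels)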

Manipulating a data.frame in a loop and storing the results in a list

I have two pieces of code that manipulate and filter (by date) my data.frame and that work perfectly. Now I want to run the code not only for one day, but for every day in the vector:
seq(from = as.Date('2020-03-02'), to = Sys.Date(), by = 'days') # .... 538 days
The code I want to run for all the days between 2020-03-02 and today is:
KOKOKO <- data.frame %>%
  filter(DATE < '2020-03-02') %>%
  summarize(DATE = '2020-03-02', CZK = sum(Objem.v.CZK, na.rm = TRUE))

STAVPTF <- data.frame %>%
  filter(DATE < '2020-03-02') %>%
  group_by(CP) %>%
  summarize(mnozstvi = last(AKTUALNI_MNOZSTVI_AKCIE), DATE = '2020-03-02') %>%
  select(DATE, CP, mnozstvi) %>%
  rbind(KOKOKO) %>%
  drop_na()
So instead of '2020-03-02' I want to fill in every day since '2020-03-02', one after another. And I want to save each KOKOKO and STAVPTF created for a given day as a separate data.frame, and store all of them in a list.
We could use map to loop over the sequence and apply the code
library(dplyr)
library(purrr)
library(tidyr)   # for drop_na() used below

out <- map(s1, ~ data.frame %>%
             filter(DATE < .x) %>%
             summarize(DATE = .x, CZK = sum(Objem.v.CZK, na.rm = TRUE)))
As this is a repeated cycle, a function makes it cleaner:
f1 <- function(dat, date_col, group_col, Objem_col, aktualni_col, date_val) {
  filtered <- dat %>%
    filter({{date_col}} < date_val)

  KOKOKO <- filtered %>%
    summarize({{date_col}} := date_val,
              CZK = sum({{Objem_col}}, na.rm = TRUE))

  STAVPTF <- filtered %>%
    group_by({{group_col}}) %>%
    summarize(mnozstvi = last({{aktualni_col}}),
              {{date_col}} := date_val) %>%
    select({{date_col}}, {{group_col}}, mnozstvi) %>%
    bind_rows(KOKOKO) %>%
    drop_na()

  return(STAVPTF)
}
and call as
map(s1, ~ f1(data.frame, DATE, CP, Objem.v.CZK, AKTUALNI_MNOZSTVI_AKCIE, .x))
where
s1 <- seq(from=as.Date('2020-03-02'), to=Sys.Date(), by='days')
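As a small optional addition (not part of the answer above), the resulting list can be named by the corresponding date, which makes individual days easier to look up later; this assumes the input really is called data.frame, as in the question.

library(purrr)

out <- map(s1, ~ f1(data.frame, DATE, CP, Objem.v.CZK, AKTUALNI_MNOZSTVI_AKCIE, .x)) %>%
  set_names(as.character(s1))

# e.g. out[["2020-03-02"]] is the STAVPTF result for that day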
It would be easier to answer your question if you provided a minimal reproducible example; that is easily done with the tidyverse's reprex package.
However, your KOKOKO code can be rewritten as simple cumulative sum:
KOKOKO =
  data.frame %>%
  arrange(DATE) %>%   # if necessary
  group_by(DATE) %>%
  summarise(CZK = sum(Objem.v.CZK), .groups = 'drop') %>%   # summarise per DATE (if necessary)
  mutate(CZK = cumsum(CZK) - CZK)   # cumulative sum excluding current row (current DATE)
Even the STAVPTF code can probably be rewritten without iterations. First find the last value of AKTUALNI_MNOZSTVI_AKCIE per CP and DATE. Then this value is assigned to the next DATE:
STAVPTF <-
  data.frame %>%
  group_by(CP, DATE) %>%
  summarise(mnozstvi = last(AKTUALNI_MNOZSTVI_AKCIE), .groups = 'drop_last') %>%
  arrange(DATE) %>%   # if necessary
  mutate(DATE = lead(DATE))

Is there a faster way than applying 'ddply' to aggregate columns by groups with a large dataset?

Purpose
I am trying to check whether a pair of values in two columns appears in the previous event, and to aggregate the resulting dummy variable by groups.
Specifically, I have an event id (i.e., oid) and dyad-level observations associated with each event: agent (i.e., aid) and partner (i.e., pid). The events are sorted by the time at which they occur (i.e., o4.in).
(1) I made a dummy variable indicating whether a pair of agent and partner appears together in the previous event.
(2) I then used ddply to aggregate the dummy variable by groups, as specified in the example below.
I find that ddply and the lag function take a lot of time with a large dataset, and I am wondering if there is a faster way to achieve these tasks.
Dataset
library(tidyverse)
library(tibble)
rename <- dplyr::rename
select <- dplyr::select
set.seed(10001)
cases <- sample(1:5, 1000, replace=T)
set.seed(10002)
agent <- sample(1:20, 1000, replace=T)
set.seed(10003)
partner <- sample(1:20, 1000, replace=T)
set.seed(123)
n <- 1000 # no of random datetimes needed
minDate <- as.POSIXct("1999/01/01")
maxDate <- as.POSIXct("2000-01-01")
epoch <- "1970-01-01"
timestamps <-
  as.POSIXct(pmax(runif(n, minDate, maxDate), runif(n, minDate, maxDate)), origin = epoch)

df <-
  data.frame(cases, agent, partner, timestamps) %>%
  rename(
    aid = agent,
    pid = partner,
    oid = cases,
    o4.in = timestamps
  ) %>%
  filter(aid != pid)
Current Methods
# creating dummy variable
d <-
  df %>%
  arrange(o4.in) %>%
  group_by(aid) %>%
  mutate(
    oid.lag.a = lag(oid)
  ) %>%
  ungroup %>%
  group_by(pid) %>%
  mutate(
    oid.lag.p = lag(oid)
  ) %>%
  ungroup %>%
  mutate(
    j2.consecutive = ifelse(oid.lag.a == oid.lag.p, 1, 0),
    j2.consecutive = ifelse(is.na(j2.consecutive), 0, j2.consecutive)
  ) %>%
  select(-oid.lag.a, -oid.lag.p)
# aggregating the dummy variable by groups
t <-
  d %>%
  ungroup %>%
  plyr::ddply(c('oid', 'aid'), function(i){
    i %>%
      mutate(aj1.consecutive = (sum(j2.consecutive) - j2.consecutive) / (n() - 1))
  }, .progress = 'text') %>%
  arrange(oid, pid) %>%
  plyr::ddply(c('oid', 'pid'), function(i){
    i %>%
      mutate(apj1.consecutive = (sum(j2.consecutive) - j2.consecutive) / (n() - 1))
  }, .progress = 'text')
Update for Future Readers
Task (1) is achieved by the answer by @akrun below.
Task (2) is answered by @akrun in a separate post: A faster way than applying 'ddply' to aggregate a variable by a function by groups
Special thanks to @akrun!!
We can use data.table methods to make it faster
library(data.table)
df2 <- copy(df)
df3 <- setDT(df2)[order(o4.in)]
df3[, oid.lag.a := shift(oid), by = aid
][, oid.lag.p := shift(oid), by = pid]
df3[, j2.consecutive := fcoalesce(+(oid.lag.a == oid.lag.p), 0L)]
Also, note that some things in the OP's code are unnecessary, e.g. using ifelse to convert a logical to a binary value. That can be done with as.integer or by coercion with +, and the second ifelse (which replaces the NA values) can be dropped by using coalesce.
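A tiny standalone illustration of the coercion/coalesce idea (the vectors here are made up for demonstration and are not taken from the OP's data):

library(dplyr)

x <- c(2L, 5L, NA)
y <- c(2L, 4L, 1L)

# OP-style: one ifelse to build the flag, a second ifelse to replace the NA
ifelse(is.na(ifelse(x == y, 1, 0)), 0, ifelse(x == y, 1, 0))
# [1] 1 0 0

# equivalent: coerce the logical with + and replace the NA with coalesce
coalesce(+(x == y), 0)
# [1] 1 0 0

With that, the OP's pipeline simplifies to: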
library(dplyr)
out <- df %>%
  arrange(o4.in) %>%
  group_by(aid) %>%
  mutate(oid.lag.a = lag(oid)) %>%
  group_by(pid) %>%
  mutate(oid.lag.p = lag(oid)) %>%
  ungroup %>%
  mutate(j2.consecutive = coalesce(+(oid.lag.a == oid.lag.p), 0))
Checking that the dplyr and data.table outputs match:
all(out$j2.consecutive == df3$j2.consecutive )
[1] TRUE
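Since the question is explicitly about speed, here is a rough timing sketch using base R's system.time(); it just wraps the two versions above in functions (the helper names are introduced here) and runs them on the simulated df, so the numbers are only indicative and will vary by machine and data size.

library(data.table)
library(dplyr)

dt_version <- function(dat) {
  d2 <- as.data.table(dat)[order(o4.in)]
  d2[, oid.lag.a := shift(oid), by = aid
     ][, oid.lag.p := shift(oid), by = pid]
  d2[, j2.consecutive := fcoalesce(+(oid.lag.a == oid.lag.p), 0L)][]
}

dplyr_version <- function(dat) {
  dat %>%
    arrange(o4.in) %>%
    group_by(aid) %>%
    mutate(oid.lag.a = lag(oid)) %>%
    group_by(pid) %>%
    mutate(oid.lag.p = lag(oid)) %>%
    ungroup() %>%
    mutate(j2.consecutive = coalesce(+(oid.lag.a == oid.lag.p), 0))
}

system.time(dt_version(df))
system.time(dplyr_version(df))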

Dplyr grouped percentages in different timeframes

I have data in the following format:
DATE        GROUP   EVENT   ELIGIBLE
2021-3-9    A       1       1
2021-3-1    A       0       0
2021-3-1    B       0       1
2021-2-20   B       1       1
I would like to group the data by the GROUP column and then add three new columns that calculate, by group, the percentage of events among eligible observations (EVENT / ELIGIBLE) for the following time frames: the last 3 months, from 3 months back to 6 months back, and the last year.
I have calculated the overall percentage without separate timeframes by doing the following:
grouped <- data %>%
  filter(ELIGIBLE == 1) %>%
  group_by(GROUP) %>%
  mutate(count_Eligible = sum(ELIGIBLE == 1)) %>%
  mutate(count_events = sum(EVENT == 1)) %>%
  mutate(Percentage = round(100 * count_events / count_Eligible, 2))
I am wondering what the cleanest way would be to add the three different percentages within the timeframes. So far I have pulled the dates to do the filtering with the following code:
today <- Sys.Date()
three_month_lookback <- as.Date(today) - months(3)
six_month_lookback <- as.Date(today) - months(6)
one_year_lookback <- as.Date(today) - months(12)
We can create a function to do the calculation
library(dplyr)
library(purrr)
library(lubridate)   # for today() used below

f1 <- function(data) {
  data %>%
    filter(ELIGIBLE == 1) %>%
    group_by(GROUP) %>%
    transmute(count_Eligible = sum(ELIGIBLE == 1),
              count_events = sum(EVENT == 1),
              Percentage = round(100 * count_events / count_Eligible, 2))
}
Then, loop over the 'lookback' periods, subset the data based on the 'DATE' column and apply the function
map2_dfr(list(three_month_lookback, six_month_lookback, one_year_lookback),
         list(today(), three_month_lookback, today()),
         ~ data %>%
             mutate(DATE = as.Date(DATE)) %>%
             filter(DATE >= .x, DATE <= .y) %>%
             f1(.),
         .id = 'grp')
If we need to combine by columns
map2(list(three_month_lookback, six_month_lookback, one_year_lookback),
     list(today(), three_month_lookback, today()),
     ~ data %>%
         mutate(DATE = as.Date(DATE)) %>%
         filter(DATE >= .x, DATE <= .y) %>%
         f1(.)) %>%
  reduce(full_join, by = "GROUP")

Creating data partitions over a selected range of data to be fed into caret::train function for cross-validation

I want to create jack-knife data partitions for the data frame below, with the partitions to be used in caret::train (like what caret::groupKFold() produces). However, the catch is that I want to restrict the test points to, say, a Time of 16 days or more, whilst using the remainder of the data as the training set.
df <- data.frame(Effect = seq(from = 0.05, to = 1, by = 0.05),
                 Time = 1:20)
The reason I want to do this is that I am only really interested in how well the model is predicting the upper bound, as this is the region of interest. I feel like there is a way to do this with the caret::groupKFold() function but I am not sure how. Any help would be greatly appreciated.
An example of what each CV fold would comprise:
TrainSet1 <- subset(df, Time != 16)
TestSet1 <- subset(df, Time == 16)
TrainSet2 <- subset(df, Time != 17)
TestSet2 <- subset(df, Time == 17)
TrainSet3 <- subset(df, Time != 18)
TestSet3 <- subset(df, Time == 18)
TrainSet4 <- subset(df, Time != 19)
TestSet4 <- subset(df, Time == 19)
TrainSet5 <- subset(df, Time != 20)
TestSet5 <- subset(df, Time == 20)
Albeit in the format that the caret::groupKFold function outputs, so that the folds could be fed into the caret::train function:
CVFolds <- caret::groupKFold(df$Time)
CVFolds
Thanks in advance!
For customized folds I find that built-in functions are usually not flexible enough, so I usually produce them using the tidyverse. One approach to your problem would be:
library(tidyverse)
df %>%
  mutate(id = row_number()) %>%   # use the row number as a column called id
  filter(Time > 15) %>%           # filter Time as per your need
  split(.$Time) %>%               # split df into a list by Time
  map(~ .x %>% select(id))        # select row numbers for each list element
An example with two rows per Time value:
df <- data.frame(Effect = seq(from = 0.025, to = 1, by = 0.025),
                 Time = rep(1:20, each = 2))

df %>%
  mutate(id = row_number()) %>%
  filter(Time > 15) %>%
  split(.$Time) %>%
  map(~ .x %>% select(id)) -> test_folds
test_folds
#output
$`16`
id
1 31
2 32
$`17`
id
3 33
4 34
$`18`
id
5 35
6 36
$`19`
id
7 37
8 38
$`20`
id
9 39
10 40
With an unequal number of rows per Time:
df <- data.frame(Effect = seq(from = 0.55, to = 1, by = 0.05),
                 Time = c(rep(1, 5), rep(2, 3), rep(3, 2)))

df %>%
  mutate(id = row_number()) %>%
  filter(Time > 1) %>%
  split(.$Time) %>%
  map(~ .x %>% select(id))
$`2`
id
1 6
2 7
3 8
$`3`
id
4 9
5 10
Now you can supply these hold-out folds to trainControl via the indexOut argument.
EDIT: to get output similar to caret::groupKFold, one can:
df %>%
  mutate(id = row_number()) %>%
  filter(Time > 1) %>%
  split(.$Time) %>%
  map(~ .x %>%
        select(id) %>%
        unlist %>%
        unname) %>%
  unname
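A minimal sketch (not from the original answer) of plugging such indices into caret, using the question's original df (Time = 1:20) and a plain linear model purely for illustration; holdout_idx, train_idx, ctrl, and fit are names introduced here. trainControl() takes the training rows through index and the held-out rows through indexOut:

library(caret)
library(dplyr)
library(purrr)

df <- data.frame(Effect = seq(from = 0.05, to = 1, by = 0.05),
                 Time = 1:20)

# hold-out row numbers per fold, one fold per Time value above 15
holdout_idx <- df %>%
  mutate(id = row_number()) %>%
  filter(Time > 15) %>%
  split(.$Time) %>%
  map(~ pull(.x, id))

# training rows for each fold are simply everything not held out in that fold
train_idx <- map(holdout_idx, ~ setdiff(seq_len(nrow(df)), .x))

ctrl <- trainControl(method = "cv",
                     index = train_idx,       # rows used for fitting in each fold
                     indexOut = holdout_idx)  # rows used for held-out predictions

# note: with a single held-out point per fold, the resampled Rsquared will be NA
fit <- train(Effect ~ Time, data = df, method = "lm", trControl = ctrl)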
