Efficient vectorization on dates - r

Basicly I've got a function that I'll need to run close to 1M times and is taking a lot of time because it is not vectorized (my guess)
The idea is that there's a parameter pf.d.day that contains a date, and the output will be a transformation of that date (adding/removing days)
pf.s.Freq will offset the date to the next period.
08 Apr 2020 with Freq = "month" will become 01 May 2020
08 Apr 2020 with Freq = "week" will become 13 Apr 2020 #week starts on monday
08 Apr 2020 with Freq = "year" will become 01 Jan 2021
library(dplyr)
library(lubridate)
fn.Delay <- function(pf.d.day, pf.s.Freq){
d.DateWithouthDelay <- as.Date(
#note: using chained ifs instead of parsing pf.s.Freq into unit to avoid errors from misspells on excel file
ifelse(pf.s.Freq == "day", as.character(ceiling_date(pf.d.day + days(1), unit = "day" )),
ifelse(pf.s.Freq == "week", as.character(ceiling_date(pf.d.day + days(1), unit = "week", week_start = 1)),
ifelse(pf.s.Freq == "month", as.character(ceiling_date(pf.d.day + days(1), unit = "month" )),
ifelse(pf.s.Freq == "quarter", as.character(ceiling_date(pf.d.day + days(1), unit = "quarter")),
ifelse(pf.s.Freq == "year", as.character(ceiling_date(pf.d.day + days(1), unit = "year" )),
ifelse(pf.s.Freq != "BiWeek", "1900-1-2", #default date if pf.s.Freq is wrong
ifelse( day(pf.d.day) < 15,
as.character(pf.d.day - day(pf.d.day) +15),
as.character(ceiling_date(pf.d.day, unit = "month")))
)))))))
return(d.DateWithouthDelay)
}
for a small example:
data.frame(
Di = as.Date(c("2020-4-8", "2020-4-8", "2020-4-8", "2020-4-8", "2020-4-8", "2020-4-8", "2020-4-8")),
Fr = c("day", "week", "month", "quarter", "year", "BiWeek", "ups")) %>%
rowwise() %>%
mutate(Df = fn.Delay(Di, Fr)) %>%
data.frame()
The main problem with this code is it's speed. Mainly because it's not vectorized but probably also because I'm having to constantly change between dates and characters simply because ifelse likes to mess the dates

Your function is vectorized. Remove the rowwise for a speed increase and the same result:
identical(
dd %>% mutate(Df = fn.Delay(Di, Fr)) %>% pull(Df),
dd %>%rowwise() %>% mutate(Df = fn.Delay(Di, Fr)) %>% pull(Df)
)
# TRUE
ifelse isn't actually that bad. Here's a simplified version using case_when, but the performance difference vs ifelse is negligible - a tiny bit slower actually. But the code is cleaner.
fn.Delay2 <- function(pf.d.day, pf.s.Freq){
case_when(
pf.s.Freq == "day" ~ ceiling_date(pf.d.day + days(1), unit = "day"),
pf.s.Freq == "week" ~ ceiling_date(pf.d.day + days(1), unit = "week", week_start = 1),
pf.s.Freq == "month" ~ ceiling_date(pf.d.day + days(1), unit = "month" ),
pf.s.Freq == "quarter" ~ ceiling_date(pf.d.day + days(1), unit = "quarter"),
pf.s.Freq == "year" ~ ceiling_date(pf.d.day + days(1), unit = "year" ),
pf.s.Freq != "BiWeek" ~ as.Date("1900-1-2"), #default date if pf.s.Freq is wrong
day(pf.d.day) < 15 ~ pf.d.day - day(pf.d.day) + 15,
TRUE ~ ceiling_date(pf.d.day, unit = "month")
)
}
microbenchmark::microbenchmark(
rowwise = dd %>%rowwise() %>% mutate(Df = fn.Delay(Di, Fr)),
vectorized = dd %>% mutate(Df = fn.Delay(Di, Fr)),
case_when = dd %>% mutate(Df = fn.Delay2(Di, Fr))
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# rowwise 10.0593 12.47230 13.59725 13.00590 14.1138 30.3810 100
# vectorized 7.5237 7.97235 10.21504 10.26205 10.7905 25.7858 100
# case_when 7.7331 8.43595 10.42024 10.54705 11.1035 21.4732 100

Related

For Loop Efficiency

The following loop is effective in that it gets me to the finish line but i'm looking for a way to make it more efficient as I'm looping through a large dataset. Possibly using a Purrr function?
library(tidyverse)
library(timetk)
#### CREATE DATA
df_1 <- data.frame(Date = seq.Date(as.Date("2016-01-01"), length.out = 36, by = "month"),
Inventory = round(runif(36,5,100),0),
Purchases = round(runif(36,5,100),0),
Sales = round(runif(36,5,100),0),
Ending_Inventory = round(runif(36,5,100),0)) %>%
mutate(Starting_Inventory = lag(Ending_Inventory,1)) %>%
mutate(product = "Product_1")
df_2 <- data.frame(Date = seq.Date(as.Date("2016-01-01"), length.out = 36, by = "month"),
Inventory = round(runif(36,5,100),0),
Purchases = round(runif(36,5,100),0),
Sales = round(runif(36,5,100),0),
Ending_Inventory = round(runif(36,5,100),0)) %>%
mutate(Starting_Inventory = lag(Ending_Inventory,1)) %>%
mutate(product = "Product_2")
df <- rbind(df_1, df_2) %>%
group_by(product) %>%
timetk::future_frame(
.date_var = Date,
.length_out = "12 months",
.bind_data = TRUE
)
Here I'm creating a date sequence to iterate through the for loop
#### CREATE DATE SEQUENCE
Dates <- seq(min(df$Date) %m+% months(36), min(df$Date) %m+% months(48), by = "month")
The dates from the sequence above will iterate through the loop to fill in the future data and then I join, rename some columns, and drop all that contain ("y")... Seems like I'm performing some steps that aren't necessary.
for (i in 1:length(Dates)){
df <- df %>%
mutate(Purchases = case_when(Date < Dates[i] ~ Purchases,
Date == Dates[i] ~ lag(Purchases, 12)*1.05,
TRUE ~ 0
)) %>%
mutate(Starting_Inventory = case_when(Date < Dates[i] ~ Starting_Inventory,
Date == Dates[i] ~ lag(Ending_Inventory,1),
TRUE ~ 0
)) %>%
mutate(Sales = case_when(Date < Dates[i] ~ Sales,
Date == Dates[i] ~ lag(Sales,12) * 1.15,
TRUE ~ 0
)) %>%
mutate(Ending_Inventory = case_when(Date < Dates[i] ~ Ending_Inventory,
Date == Dates[i] ~ Starting_Inventory + Sales + Purchases,
TRUE ~ 0
)) %>%
mutate(Inventory = case_when(Date < Dates[i] ~ Inventory,
Date == Dates[i] ~ Ending_Inventory,
TRUE ~ 0
))
new_data <- df[df$Date == (Dates[i]),]
df <- df %>%
left_join(., new_data, by = c("product", "Date")) %>%
mutate(Inventory.x = ifelse(Date == Dates[i],Inventory.y,Inventory.x),
Purchases.x = ifelse(Date == Dates[i],Purchases.y,Purchases.x),
Sales.x = ifelse(Date == Dates[i],Sales.y,Sales.x),
Starting_Inventory.x = ifelse(Date == Dates[i],Starting_Inventory.y,Starting_Inventory.x),
Ending_Inventory.x = ifelse(Date == Dates[i],Ending_Inventory.y,Ending_Inventory.x),
) %>%
rename(Inventory = Inventory.x,
Purchases = Purchases.x,
Starting_Inventory = Starting_Inventory.x,
Sales = Sales.x,
Ending_Inventory = Ending_Inventory.x) %>%
dplyr::select(-contains(".y"))
return
print(i)
gc()
}
There are a lot of unnecessary steps in there.
Mutate can take more than one expression at once.
The case_when is unnecessary since in the next step you only keep the rows that got modified.
Then, for the same reason, the join and renaming is more steps than needed, you can just replace the old rows with the new row by selecting a subset.
for (i in seq_along(Dates)){
new_data <- df2 %>%
mutate(Purchases = lag(Purchases, 12)*1.05,
Starting_Inventory = lag(Ending_Inventory,1),
Sales = lag(Sales,12) * 1.15,
Ending_Inventory = Starting_Inventory + Sales + Purchases,
Inventory = Ending_Inventory)
df2[df2$Date == Dates[i],] <- new_data[new_data$Date == Dates[i],]
}
But then you're stil recalculating your whole data.frame for each loop. No need for that either since mutate() is iterative. You can do it all with just that function.
Also, since there are only 2 conditions really needed, you can replace the case_when with ifelse and it's faster.
df <- df %>%
mutate(
Purchases = ifelse(
Date %in% Dates, lag(Purchases, 12)*1.05, Purchases
),
Starting_Inventory = ifelse(
Date %in% Dates, lag(Ending_Inventory,1), Starting_Inventory
),
Sales = ifelse(
Date %in% Dates, lag(Sales,12) * 1.15, Sales
),
Ending_Inventory = ifelse(
Date %in% Dates, Starting_Inventory + Sales + Purchases,
Ending_Inventory
),
Inventory = ifelse(
Date %in% Dates, Ending_Inventory, Inventory
)
)
Edit:
I think it's important to break down what you're trying to do when you end up with long for loop like this. Since you're trying to do in place modifications, even in base R, you could do this with this short a for loop :
df3 <- df.o
df3 <- df3 |> within({
for (i in which(Date %in% Dates)){
Purchases[i] = Purchases[i-12]*1.05
Sales[i] = Sales[i-12] * 1.15
Ending_Inventory[i] = Starting_Inventory[i] + Sales[i] + Purchases[i]
Inventory[i] = Ending_Inventory[i]
Starting_Inventory[i] = Ending_Inventory[i-1]
}
i = NULL
})
A bit slower than mutate, but it's the same logic.

Calculate Mean for Each Unique Value up to a certain date

Data for my example.
date1 = seq(as.Date("2019/01/01"), by = "month", length.out = 48)
date2 = seq(as.Date("2019/02/01"), by = "month", length.out = 48)
date3 = seq(as.Date("2019/02/01"), by = "month", length.out = 48)
date4 = seq(as.Date("2019/02/01"), by = "month", length.out = 48)
date = c(date1,date2,date3,date4)
subproducts1=rep("1",48)
subproducts2=rep("2",48)
subproductsx=rep("x",48)
subproductsy=rep("y",48)
b1 <- c(rnorm(48,5))
b2 <- c(rnorm(48,5))
b3 <-c(rnorm(48,5) )
b4 <- c(rnorm(48,5))
dfone <- data.frame(
"date"= date,
"subproduct"=
c(subproducts1,subproducts2,subproductsx,subproductsy),
"actuals"= c(b1,b2,b3,b4))
This creates Jan 2019 for date2,3,4 with value 0.
dfone <-dfone %>%
complete(date = seq.Date(from = min(date), to = as.Date('2021-06-01'), by = 'month'),
nesting(subproduct), fill = list(actuals = 0))
QUESTION: This calculates the mean for each unique sub product and replaces 0's with the mean of each, but how do I have a hard cutoff so the mean is only based off Jan-2019 to Dec-2020 and not Jan 2019 to Dec 2022?
library(dplyr)
dfone_new <- dfone %>%
group_by(subproduct) %>%
mutate(actuals = replace(actuals, actuals == 0,
mean(actuals[actuals != 0], na.rm = TRUE))) %>%
ungroup
We may need one more logical expression while subsetting the 'actuals' i.e. the 'date' should be between the 2019 Jan and 2020 Dec while calculating the mean
library(dplyr)
library(tidyr)
dfone %>%
group_by(subproduct) %>%
mutate(actuals = replace(actuals, actuals == 0,
mean(actuals[actuals != 0 &
between(date, as.Date("2019-01-01"), as.Date("2020-12-31"))],
na.rm = TRUE)))

Creating list with the same number of values

I have a data set with a date, ID, and coordinates that I would like to split into seasonal months. For example for winter I have January to winter1, February to winter2, and March to winter3. I have done the same for the summer months.
I would like to filter out the IDs that have all of these months, so that when I split the data by ID and year, I would have identical list lengths.
I wasn't sure how to simulate uneven values for each ID in the sample code below, but in my actual data some IDs only have summer1 and not winter1, while it could be flipped around for summer2 and winter2`.
library(lubridate)
library(tidyverse)
date <- rep_len(seq(dmy("01-01-2010"), dmy("31-12-2013"), by = "days"),1000)
ID <- rep(seq(1, 5), 100)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
df$month <- month(df$date)
df$year <- year(df$date)
df1 <- df %>%
mutate(season_categ = case_when(month %in% 6 ~ 'summer1',
month %in% 7 ~ 'summer2',
month %in% 8 ~ 'summer3',
month %in% 1 ~ 'winter1',
month %in% 2 ~ 'winter2',
month %in% 3 ~ 'winter3')) %>%
group_by(year, ID )%>%
filter(any(month %in% 6:8) &
any(month %in% 1:3))
summer_list <- df1 %>%
filter(season_categ == "summer1") %>%
group_split(year, ID)
# Renames the names in the list to AnimalID and year
names(summer_list) <- sapply(summer_list,
function(x) paste(x$ID[1],
x$year[1], sep = '_'))
# Creates a list for each year and by ID
winter_list <- df1 %>%
filter(season_categ == "winter1") %>%
group_split(year, ID)
names(winter_list) <- sapply(winter_list,
function(x) paste(x$ID[1],
x$year[1], sep = '_'))
Not sure if that is what you want, but I understood that you would want to get rid of IDs that have less than the 6 months of Q1 and Q3 in any of the years, but you could modify the filter or grouping if that assumption was wrong.
Here is one approach:
library(lubridate)
library(dplyr)
set.seed(12345)
# random sampling of dates with this seed gives no July date for ID 2 in 2010
df <- tibble(
date = sample(seq(dmy("01-01-2010"), dmy("31-12-2013"), by = "days"),
1000, replace = TRUE),
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID = rep(1:5, 200),
month = month(date),
year =year(date)) %>%
arrange(ID, date)
df %>%
filter(month %in% c(1:3, 6:8)) %>%
group_by(ID, year) %>%
mutate(complete = length(unique(month)) == 6) %>%
group_by(ID) %>%
filter(all(complete)) %>%
group_by(ID, year) %>%
group_split()
To me it is not really clear as to what your are looking for. Before you split the data into a list sort the rows by columns
df1<-df1[order(ID,season_categ),]
### Determine which ID's have uneven numbers ###
df1 %>%
group_by(ID) %>%
summarize(month_seq = paste(season_categ , collapse = "_"),
number_of_months = n(season_categ))
#### Remove odd numbers###

Select specific date / hour range from list elements and create dataframe

I have a list with approximately 150 elements (data frames) of weather data (ID,date,time,temperature). I want to select specific date range and time from each list element (df) and create a data frame (or multiple) with these selected rows. Given the fact I can't provide real data I've created a reproducible example:
library(lubridate)
library(dplyr)
library(tidyr)
library(purrr)
z1 <- seq.POSIXt(as.POSIXct(Sys.Date()), as.POSIXct(Sys.Date()+780), by = "10 min")
z2 <- seq.POSIXt(as.POSIXct(Sys.Date()), as.POSIXct(Sys.Date()+780), by = "10 min")
z3 <- seq.POSIXt(as.POSIXct(Sys.Date()), as.POSIXct(Sys.Date()+780), by = "10 min")
z4 <- seq.POSIXt(as.POSIXct(Sys.Date()), as.POSIXct(Sys.Date()+780), by = "10 min")
temperature1 <- runif(112321, min = -5, max = 45)
temperature2 <- runif(112321, min = -5, max = 45)
temperature3 <- runif(112321, min = -5, max = 45)
temperature4 <- runif(112321, min = -5, max = 45)
station1 <- data.frame(date = z1, temp = temperature1)
station2 <- data.frame(date = z2, temp = temperature2)
station3 <- data.frame(date = z3, temp = temperature3)
station4 <- data.frame(date = z4, temp = temperature4)
##isolate date from time
station1 <- separate(station1, date, c("date", "time"), sep = " ")
station2 <- separate(station2, date, c("date", "time"), sep = " ")
station3 <- separate(station3, date, c("date", "time"), sep = " ")
station4 <- separate(station4, date, c("date", "time"), sep = " ")
## list of all stations
stations_list <- list(station1,station2,station3,station4)
#create a column with station ID (name) ##
ID_names <- c("station1","station2","station3","station4")
stations_list <- mapply(cbind,stations_list, "ID" = ID_names, SIMPLIFY = F)
Now in this list I want to select specific date and time range so I used the following script:
selected_date_time <- map_dfr(stations_list,
~ filter(.x, date >= "2021-06-01" &
date <= "2021-10-15" & time >= "18:00" & time <= "10:00" |
date > "2022-08-18" & date <= "2022-10-05" & time >= "09:00"
& time <= "17:00"))
In this case, I got a data frame with only 2022 year and no selection fro 2021. I changed slightly the code and I selected different hour range :
selected_date_time <- map_dfr(stations_list,
~ filter(.x, date >= "2021-06-01" &
date <= "2021-10-15" & time >= "18:00" & time <= "10:00" |
date > "2022-08-18" & date <= "2022-10-05" & time <= "09:00"
& time >= "17:00"))
In the last case I got a data frame with zero observations. What am I doing wrong ?!
As mentioned by #AntoniosK, your filter logic was off so I made a few amendments but most importantly, for this filter to work, we need to make sure the date and time are "date" and "time" class respectively.
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
library(tidyverse)
library(hms)
#>
#> Attaching package: 'hms'
#> The following object is masked from 'package:lubridate':
#>
#> hms
z1 <-seq.POSIXt(as.POSIXct(Sys.Date()), as.POSIXct(Sys.Date() + 780), by = "10 min")
z2 <-seq.POSIXt(as.POSIXct(Sys.Date()), as.POSIXct(Sys.Date() + 780), by = "10 min")
z3 <-seq.POSIXt(as.POSIXct(Sys.Date()), as.POSIXct(Sys.Date() + 780), by = "10 min")
z4 <-seq.POSIXt(as.POSIXct(Sys.Date()), as.POSIXct(Sys.Date() + 780), by = "10 min")
temperature1 <- runif(112321, min = -5, max = 45)
temperature2 <- runif(112321, min = -5, max = 45)
temperature3 <- runif(112321, min = -5, max = 45)
temperature4 <- runif(112321, min = -5, max = 45)
station1 <- tibble(date = z1, temp = temperature1)
station2 <- tibble(date = z2, temp = temperature2)
station3 <- tibble(date = z3, temp = temperature3)
station4 <- tibble(date = z4, temp = temperature4)
station1 <- station1 %>%
mutate(time = hms::as_hms(date),
date = as_date(date)) %>%
relocate(date, time)
station2 <- station2 %>%
mutate(time = hms::as_hms(date),
date = as_date(date)) %>%
relocate(date, time)
station3 <- station3 %>%
mutate(time = hms::as_hms(date),
date = as_date(date)) %>%
relocate(date, time)
station4 <- station4 %>%
mutate(time = hms::as_hms(date),
date = as_date(date)) %>%
relocate(date, time)
## list of all stations
stations_list <- list(station1, station2, station3, station4)
#create a column with station ID (name) ##
ID_names <- c("station1", "station2", "station3", "station4")
stations_list <-
mapply(cbind, stations_list, "ID" = ID_names, SIMPLIFY = F)
stations_list %>%
map_dfr(~ filter(
.x,
(
between(date, as.Date("2021-06-01"), as.Date("2021-10-15")) &
(time >= as_hms("18:00:00") | time <= as_hms("10:00:00"))
) |
(date > as.Date("2022-08-18") &
date <= as.Date("2022-10-05")) &
(time <= as_hms("09:00:00") | time >= as_hms("17:00:00"))
)) %>%
arrange(date) %>%
head()
#> date time temp ID
#> 1 2021-06-01 00:00:00 20.259581 station1
#> 2 2021-06-01 00:10:00 37.558833 station1
#> 3 2021-06-01 00:20:00 18.729679 station1
#> 4 2021-06-01 00:30:00 5.880394 station1
#> 5 2021-06-01 00:40:00 2.393515 station1
#> 6 2021-06-01 00:50:00 36.030296 station1
Created on 2021-05-26 by the reprex package (v2.0.0)

replace historical data of a data.frame with the most recent year data in R?

I want to replace Jan 01 to Jun 25 of all the years in FakeData with data from Ob2020 for the two variables (Level & Flow) of my data.frame. Here is what i have started and am looking for suggestions to achieving my goal.
library(tidyverse)
library(lubridate)
set.seed(1500)
FakeData <- data.frame(Date = seq(as.Date("2010-01-01"), to = as.Date("2018-12-31"), by = "days"),
Level = runif(3287, 0, 30), Flow = runif(3287, 1,10))
Ob2020 <- data.frame(Date = seq(as.Date("2020-01-01"), to = as.Date("2020-06-25"), by = "days"),
Level = runif(177, 0, 30), Flow = runif(177, 1,10))
Here's a way using dplyr and lubridate :
library(dplyr)
library(lubridate)
FakeData %>%
mutate(day = day(Date), month = month(Date)) %>%
left_join(Ob2020 %>%
mutate(day = day(Date), month = month(Date)),
by = c('day', 'month')) %>%
mutate(Level = coalesce(Level.y, Level.x),
Flow = coalesce(Flow.y, Flow.x)) %>%
select(Date = Date.x, Level, Flow)
If you dont mind a data.table solution, here is an update join:
library(data.table)
#extract year and month of the date
setDT(FakeData)[, c("day", "mth") := .(mday(Date), month(Date))]
setDT(Ob2020)[, c("day", "mth") := .(mday(Date), month(Date))]
#print to console to show old values
head(FakeData)
head(Ob2020)
cols <- c("Level", "Flow")
FakeData[Ob2020[mth<=6L & day<=25], on=.(day, mth),
(cols) := mget(paste0("i.", cols))]
#print to console to show new values
head(FakeData)

Resources