For Loop Efficiency in R

The following loop is effective in that it gets me to the finish line, but I'm looking for a way to make it more efficient since I'm looping through a large dataset. Possibly using a purrr function?
library(tidyverse)
library(timetk)
library(lubridate)   # for %m+% used below

#### CREATE DATA
df_1 <- data.frame(Date = seq.Date(as.Date("2016-01-01"), length.out = 36, by = "month"),
                   Inventory = round(runif(36, 5, 100), 0),
                   Purchases = round(runif(36, 5, 100), 0),
                   Sales = round(runif(36, 5, 100), 0),
                   Ending_Inventory = round(runif(36, 5, 100), 0)) %>%
  mutate(Starting_Inventory = lag(Ending_Inventory, 1)) %>%
  mutate(product = "Product_1")

df_2 <- data.frame(Date = seq.Date(as.Date("2016-01-01"), length.out = 36, by = "month"),
                   Inventory = round(runif(36, 5, 100), 0),
                   Purchases = round(runif(36, 5, 100), 0),
                   Sales = round(runif(36, 5, 100), 0),
                   Ending_Inventory = round(runif(36, 5, 100), 0)) %>%
  mutate(Starting_Inventory = lag(Ending_Inventory, 1)) %>%
  mutate(product = "Product_2")

df <- rbind(df_1, df_2) %>%
  group_by(product) %>%
  timetk::future_frame(
    .date_var = Date,
    .length_out = "12 months",
    .bind_data = TRUE
  )
Here I'm creating a date sequence to iterate over in the for loop:
#### CREATE DATE SEQUENCE
Dates <- seq(min(df$Date) %m+% months(36), min(df$Date) %m+% months(48), by = "month")
The dates from this sequence are iterated over in the loop to fill in the future rows; after each pass I join, rename some columns, and drop everything that contains ".y"... It seems like I'm performing some steps that aren't necessary.
for (i in 1:length(Dates)) {
  df <- df %>%
    mutate(Purchases = case_when(Date < Dates[i] ~ Purchases,
                                 Date == Dates[i] ~ lag(Purchases, 12) * 1.05,
                                 TRUE ~ 0)) %>%
    mutate(Starting_Inventory = case_when(Date < Dates[i] ~ Starting_Inventory,
                                          Date == Dates[i] ~ lag(Ending_Inventory, 1),
                                          TRUE ~ 0)) %>%
    mutate(Sales = case_when(Date < Dates[i] ~ Sales,
                             Date == Dates[i] ~ lag(Sales, 12) * 1.15,
                             TRUE ~ 0)) %>%
    mutate(Ending_Inventory = case_when(Date < Dates[i] ~ Ending_Inventory,
                                        Date == Dates[i] ~ Starting_Inventory + Sales + Purchases,
                                        TRUE ~ 0)) %>%
    mutate(Inventory = case_when(Date < Dates[i] ~ Inventory,
                                 Date == Dates[i] ~ Ending_Inventory,
                                 TRUE ~ 0))

  new_data <- df[df$Date == Dates[i], ]

  df <- df %>%
    left_join(., new_data, by = c("product", "Date")) %>%
    mutate(Inventory.x = ifelse(Date == Dates[i], Inventory.y, Inventory.x),
           Purchases.x = ifelse(Date == Dates[i], Purchases.y, Purchases.x),
           Sales.x = ifelse(Date == Dates[i], Sales.y, Sales.x),
           Starting_Inventory.x = ifelse(Date == Dates[i], Starting_Inventory.y, Starting_Inventory.x),
           Ending_Inventory.x = ifelse(Date == Dates[i], Ending_Inventory.y, Ending_Inventory.x)) %>%
    rename(Inventory = Inventory.x,
           Purchases = Purchases.x,
           Starting_Inventory = Starting_Inventory.x,
           Sales = Sales.x,
           Ending_Inventory = Ending_Inventory.x) %>%
    dplyr::select(-contains(".y"))

  print(i)
  gc()
}

There are a lot of unnecessary steps in there.
mutate() can take more than one expression at once.
The case_when() is unnecessary, since in the next step you only keep the rows that were modified.
Then, for the same reason, the join and renaming are more steps than needed; you can just replace the old rows with the new rows by selecting a subset.
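Below, df2 is assumed to start as a copy of the data frame built above:
df2 <- df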
for (i in seq_along(Dates)) {
  new_data <- df2 %>%
    mutate(Purchases = lag(Purchases, 12) * 1.05,
           Starting_Inventory = lag(Ending_Inventory, 1),
           Sales = lag(Sales, 12) * 1.15,
           Ending_Inventory = Starting_Inventory + Sales + Purchases,
           Inventory = Ending_Inventory)
  df2[df2$Date == Dates[i], ] <- new_data[new_data$Date == Dates[i], ]
}
But then you're still recalculating your whole data frame on each iteration. No need for that either, since mutate() is iterative: you can do it all with just that one call.
Also, since only two conditions are really needed, you can replace the case_when() with ifelse(), which is faster.
df <- df %>%
  mutate(
    Purchases = ifelse(
      Date %in% Dates, lag(Purchases, 12) * 1.05, Purchases
    ),
    Starting_Inventory = ifelse(
      Date %in% Dates, lag(Ending_Inventory, 1), Starting_Inventory
    ),
    Sales = ifelse(
      Date %in% Dates, lag(Sales, 12) * 1.15, Sales
    ),
    Ending_Inventory = ifelse(
      Date %in% Dates, Starting_Inventory + Sales + Purchases,
      Ending_Inventory
    ),
    Inventory = ifelse(
      Date %in% Dates, Ending_Inventory, Inventory
    )
  )
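One caveat on ifelse() worth keeping in mind (my note, not part of the original answer): it drops attributes such as the Date class, so for typed columns dplyr::if_else() is the type-stable alternative. The columns modified above are plain numerics, so it doesn't matter here.
# Small illustration of the difference (hypothetical vector, not from the post):
x <- as.Date(c("2020-01-01", "2020-02-01"))
ifelse(x > as.Date("2020-01-15"), x, x)          # returns bare numbers, Date class lost
dplyr::if_else(x > as.Date("2020-01-15"), x, x)  # keeps the Date class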
Edit:
I think it's important to break down what you're trying to do when you end up with a long for loop like this. Since you're doing in-place modifications, even in base R you could do this with a for loop this short (df.o below is the original data frame):
df3 <- df.o
df3 <- df3 |> within({
  for (i in which(Date %in% Dates)) {
    Purchases[i] <- Purchases[i - 12] * 1.05
    Sales[i] <- Sales[i - 12] * 1.15
    Ending_Inventory[i] <- Starting_Inventory[i] + Sales[i] + Purchases[i]
    Inventory[i] <- Ending_Inventory[i]
    Starting_Inventory[i] <- Ending_Inventory[i - 1]
  }
  i <- NULL
})
A bit slower than mutate, but it's the same logic.
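If you want to check that kind of claim on your own data, a quick benchmark sketch (microbenchmark is my addition and is not used elsewhere in this post; the two expressions are just the approaches shown above):
library(microbenchmark)

microbenchmark(
  mutate = df.o %>%
    mutate(Purchases = ifelse(Date %in% Dates, lag(Purchases, 12) * 1.05, Purchases),
           Starting_Inventory = ifelse(Date %in% Dates, lag(Ending_Inventory, 1), Starting_Inventory),
           Sales = ifelse(Date %in% Dates, lag(Sales, 12) * 1.15, Sales),
           Ending_Inventory = ifelse(Date %in% Dates, Starting_Inventory + Sales + Purchases, Ending_Inventory),
           Inventory = ifelse(Date %in% Dates, Ending_Inventory, Inventory)),
  within = df.o |> within({
    for (i in which(Date %in% Dates)) {
      Purchases[i] <- Purchases[i - 12] * 1.05
      Sales[i] <- Sales[i - 12] * 1.15
      Ending_Inventory[i] <- Starting_Inventory[i] + Sales[i] + Purchases[i]
      Inventory[i] <- Ending_Inventory[i]
      Starting_Inventory[i] <- Ending_Inventory[i - 1]
    }
    i <- NULL
  }),
  times = 50
)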

Related

Extremely slow calculations in SPARK (using sparklyr)

I'm using the sparklyr library in order to calculate some metrics on big data. I decided to test my code on a small portion of the data (parquet table format, ~1.3M rows, 40 MB), but after some basic filtering Spark needs approximately 2-3 minutes to calculate a sum.
Here is an example of my code:
# Connecting
df <- spark_read_parquet(spark_conn, name = "df", path = "hdfs://****.db/table",
                         header = TRUE, memory = FALSE, repartition = 1, overwrite = FALSE)

# Basic filtering
df <- df %>%
  mutate(ZD = datediff(Maturity_Date - Treaty_Date))
df <- df %>%
  filter(Treaty_Amount <= 30000, ZD <= 30)
df <- df %>%
  dplyr::mutate(MD_Received_From_Borrower_2018_1 = rowSums(.[139:150], na.rm = TRUE))
df <- df %>%
  dplyr::mutate(PD_Received_From_Borrower_2018_1 = rowSums(.[175:186], na.rm = TRUE))
df <- df %>%
  mutate(Total_Amount_Received_From_Borrower_2018_1 = MD_Received_From_Borrower_2018_1 + PD_Received_From_Borrower_2018_1)

# More filtering for project purposes
df <- df %>%
  mutate(Preditog1_2018_01 = ifelse((Debt_Duration_As_Of_31.01.2018 >= 361 & Debt_Duration_As_Of_31.01.2019 == 0 & Total_Amount_Received_From_Borrower_2018_1 == 0), 361, 0))
df <- df %>%
  mutate(Preditog2_2018_01 = ifelse((Date_Of_Debt_Sale <= "2019-01-31" & !is.na(Date_Of_Debt_Sale)), 361, 0))
df <- df %>%
  mutate(Preditog3_2018_01 = ifelse((Date_Of_Debt_Cancellation <= "2019-01-31" & !is.na(Date_Of_Debt_Cancellation)), 361, 0))
df <- df %>%
  mutate(Preditog_2018_01 = ifelse((Preditog1_2018_01 == 361 | Preditog2_2018_01 == 361 | Preditog3_2018_01 == 361), 361, 0))
df <- df %>%
  mutate(Itog_2018_01 = ifelse(Preditog_2018_01 == 361, 361, Debt_Duration_As_Of_31.01.2019))

# Calculations (where the process is extremely slow, especially when calculating the sum)
## Extra filtering
df <- df %>%
  mutate(f_0_v_360_2018_01_d = ifelse((Debt_Duration_As_Of_31.01.2018 == 0 & Itog_2018_01 > 360), Main_Debt_As_Of_31.01.2018, 0))
## Calculating the sum
df %>%
  summarise(res1 = sum(f_0_v_360_2018_01_d))
Maybe I'm doing something wrong?
How is it possible to optimize calculation speed?
Thank you in advance for any help!
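One thing worth checking (my note, not from the original thread): with sparklyr, each mutate()/filter() only builds up a lazy Spark SQL query, and the final summarise() triggers the whole accumulated plan. Materialising the prepared table once, for example with compute(), keeps repeated aggregations from re-running everything; a sketch under that assumption:
# Sketch only: assumes df is the tbl_spark built by the pipeline above.
df_prepared <- df %>%
  compute("df_prepared")   # executes the accumulated query and caches the result in Spark

df_prepared %>%
  summarise(res1 = sum(f_0_v_360_2018_01_d))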

How can I filter data just for one group in R?

I want to filter a dataframe for dates greater than or equal to something, but only for certain rows according to a condition, without affecting the rest of the observations.
I am not sure how to do this.
This is the code I have so far:
library(dplyr)
dat <- tibble(id = rep(LETTERS[1:4], 10),
              date_a = c(
                seq(as.Date("2020-01-01"), as.Date("2021-08-01"), "months"),
                seq(as.Date("2020-01-01"), as.Date("2021-08-01"), "months")
              ))

dat |>
  filter(date_a >= as.Date("2021-01-01"))
We may use
tibble(
  id = rep(LETTERS[1:4], 10),
  date_a = c(seq(as.Date("2020-01-01"), as.Date("2021-08-01"), "months"),
             seq(as.Date("2020-01-01"), as.Date("2021-08-01"), "months"))
) |>
  filter((date_a >= as.Date("2021-01-01") & id == 'A') | (id != 'A'))
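Since only the 'A' rows need the date check, the same condition can be written a bit more compactly (equivalent logic, just simplified):
dat |>
  filter(id != 'A' | date_a >= as.Date("2021-01-01"))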

mutate() function in R produces values different from a direct calculation of the same input

I want to modify the "lakers" dataset from the lubridate library, so I have a first block of code which produces chartData.
After that I need to add 2 calculated columns, for which I use the mutate() function.
library(tidyverse)
library(ggrepel)
library(ggthemes)
library(scales)
library(lubridate)
chartData <-
  lakers %>%
  filter(
    (player != "" &
       team == "LAL" &
       result == "made" &
       etype %in% c("shot", "free throw")
    ) |
      (player != "" &
         team == "LAL" &
         etype == "rebound")
  ) %>%
  mutate(
    rebound = ifelse(etype == "rebound", 1, 0)
  ) %>%
  group_by(game_type, period, player) %>%
  summarize(
    `Total Points` = sum(points),
    Rebounds = sum(rebound)
  ) %>%
  rename(
    `Game Type` = game_type,
    Period = period,
    Player = player
  )

graph_data <-
  chartData %>%
  mutate(
    Scaled.Rebounds = round(rescale(Rebounds, to = c(-10, 10)), 1),
    Scaled.TotalPoints = round(rescale(`Total Points`, to = c(-10, 10)), 1)
  )
Why, if I calculate separately
round(rescale(chartData$Rebounds, to = c(-10, 10)), 1)
do I obtain a different output from the one produced by the mutate() call?
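The most likely explanation (my assumption, since no answer is reproduced here) is grouping: chartData is still a grouped tibble after summarize(), because group_by(game_type, period, player) %>% summarize() drops only the last grouping level, leaving it grouped by Game Type and Period. Inside mutate(), rescale() is then applied within each group, while the standalone call rescales the whole column at once. A quick check:
group_vars(chartData)   # shows the grouping still attached after summarize()

chartData %>%
  ungroup() %>%
  mutate(Scaled.Rebounds = round(rescale(Rebounds, to = c(-10, 10)), 1))
# with ungroup(), this matches round(rescale(chartData$Rebounds, to = c(-10, 10)), 1)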

R - Create data set showing the delta/progress between two similar data sets

I have two data sets, snapshots of records taken at two different time periods. Each record has a unique key (Player.ID). There is identifying information that needs to be carried forward (Alliance.ID, Alliance.Tag, Player.ID, Player), with all of the remaining attributes needing to be evaluated for change over time (all new data points minus all old data points), producing a new, third data set. Lastly, we need to account for new players (all stats set to NEW) and deleted players (all stats set to MISSING).
Each data set has about 60,000 records.
Three data sets: old, new, and calculated.
I have no code to share. Any advice on approach, packages or code is appreciated.
Something like this?
library(tidyr)
library(dplyr)

df1$Source <- "df1"
df2$Source <- "df2"

output <- bind_rows(df1, df2) %>%
  group_by(Player.ID) %>%
  mutate(
    DATE = as.Date(DATE, tryFormats = c("%m/%d/%Y")),
    ROW.COUNT = n(),
    POINTS = case_when(
      ROW.COUNT == 1 & Source %in% "df1" ~ "Missing",
      ROW.COUNT == 1 & Source %in% "df2" ~ "NEW",
      ROW.COUNT > 1 ~ as.character(as.numeric(POINTS) - lag(as.numeric(POINTS), order_by = DATE)),
      TRUE ~ "ERROR"
    ),
    POWER.POINTS = case_when(
      ROW.COUNT == 1 & Source %in% "df1" ~ "Missing",
      ROW.COUNT == 1 & Source %in% "df2" ~ "NEW",
      ROW.COUNT > 1 ~ as.character(as.numeric(POWER.POINTS) - lag(as.numeric(POWER.POINTS), order_by = DATE)),
      TRUE ~ "ERROR"
    ),
    PLAYERS.KILLED = case_when(
      ROW.COUNT == 1 & Source %in% "df1" ~ "Missing",
      ROW.COUNT == 1 & Source %in% "df2" ~ "NEW",
      ROW.COUNT > 1 ~ as.character(as.numeric(PLAYERS.KILLED) - lag(as.numeric(PLAYERS.KILLED), order_by = DATE)),
      TRUE ~ "ERROR"
    )
  ) %>%
  filter(DATE == max(DATE)) %>%
  select(-DATE, -Source, -ROW.COUNT)
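If more stat columns need the same treatment, the repeated case_when() blocks could be collapsed with across(); this is my sketch, not part of the answer above, under the same assumptions about the data:
stat_cols <- c("POINTS", "POWER.POINTS", "PLAYERS.KILLED")

output <- bind_rows(df1, df2) %>%
  group_by(Player.ID) %>%
  mutate(
    DATE = as.Date(DATE, tryFormats = c("%m/%d/%Y")),
    ROW.COUNT = n(),
    across(
      all_of(stat_cols),
      ~ case_when(
        ROW.COUNT == 1 & Source %in% "df1" ~ "Missing",
        ROW.COUNT == 1 & Source %in% "df2" ~ "NEW",
        ROW.COUNT > 1 ~ as.character(as.numeric(.x) - lag(as.numeric(.x), order_by = DATE)),
        TRUE ~ "ERROR"
      )
    )
  ) %>%
  filter(DATE == max(DATE)) %>%
  select(-DATE, -Source, -ROW.COUNT)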
Data
df1 <- structure(
  list(
    DATE = c("12/18/2020", "12/28/2020"),
    Alliance.ID = c("9745908", "8798794"),
    Alliance.Tag = c("StkOvflw", "ILoveR"),
    Player.ID = c("z90c0b60dd58", "grfk349i342k3"),
    PLAYER = c("Deanna", "Gregor"),
    LVL = c(89, 22),
    RANK = c("Admiral", "Newb"),
    POINTS = c("16746162", "19269094"),
    POWER.POINTS = c("77083200", "87691376"),
    PLAYERS.KILLED = c("7337", "4698")
  ),
  .Names = c("DATE", "Alliance.ID", "Alliance.Tag", "Player.ID", "PLAYER",
             "LVL", "RANK", "POINTS", "POWER.POINTS", "PLAYERS.KILLED"),
  row.names = c(NA, -2L),
  class = "data.frame"
)

df2 <- structure(
  list(
    DATE = c("1/4/2021", "1/4/2021"),
    Alliance.ID = c("9745908", "5874162"),
    Alliance.Tag = c("StkOvflw", "NewGuy"),
    Player.ID = c("z90c0b60dd58", "2387hyf23u8i4"),
    PLAYER = c("Deanna", "NewGuy"),
    LVL = c(90, 1),
    RANK = c("Admiral", "Newb"),
    POINTS = c("16786133", "1254"),
    POWER.POINTS = c("77089878", "0"),
    PLAYERS.KILLED = c("7368", "3")
  ),
  .Names = c("DATE", "Alliance.ID", "Alliance.Tag", "Player.ID", "PLAYER",
             "LVL", "RANK", "POINTS", "POWER.POINTS", "PLAYERS.KILLED"),
  row.names = c(NA, -2L),
  class = "data.frame"
)

Obtaining the start and end dates from a single column of data

I'm trying to find the Start and End date of a product's sales period from a column of data that is a dummy variable for sale. Here is a proxy of the type of data that I am working with:
The result I am looking for is:
The actual data set I am working on is much larger than this and does not necessarily look at just 2010-01 to 2011-12.
Thank you!
This assumes only one sale per product
require(tidyverse)

df <- data.frame(product = 'Product A',
                 month = seq(as.Date('2010-01-01'),
                             as.Date('2010-10-01'),
                             by = 'month'),
                 onSale = c(rep(0, 3), rep(1, 4), rep(0, 3)))

df %>%
  group_by(product) %>%
  summarise(saleStart = min(month[onSale == 1]),
            saleEnd = max(month[onSale == 1]))
Edit:
df <- data.frame(product = 'Product A',
                 month = seq(as.Date('2010-01-01'),
                             as.Date('2011-09-01'),
                             by = 'month'),
                 onSale = c(rep(0, 3), rep(1, 4), rep(0, 3), rep(1, 4), rep(0, 3), rep(1, 4)))

df %>%
  group_by(product) %>%
  mutate(diff = c(0, diff(onSale))) %>%
  group_by(product, diff) %>%
  filter(diff == 1) %>%
  mutate(monthStart = month, monthEnd = month %m+% months(1)) %>%
  select(-month, -diff)
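An alternative sketch (my addition, not part of the answer above) that labels each run of consecutive onSale values and then takes the first and last month of every onSale == 1 run, so it handles any number of sale periods per product:
df %>%
  group_by(product) %>%
  mutate(run = cumsum(onSale != lag(onSale, default = first(onSale)))) %>%
  filter(onSale == 1) %>%
  group_by(product, run) %>%
  summarise(saleStart = min(month), saleEnd = max(month), .groups = "drop")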
