Using purrr to help transform a large data file - r

I have a bit of code that goes through a number of columns containing dates and selects the earliest date among them to populate a new column. To do this I was using the dplyr::rowwise() function.
Unfortunately, the data set is quite big and this approach comes at a noticeable time cost. Here is an example of my initial approach.
library(tidyverse)
library(lubridate)
set.seed(101)
data <- tibble(
  date1 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'), 100, replace = TRUE),
  date2 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'), 100, replace = TRUE),
  date3 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'), 100, replace = TRUE),
  date4 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'), 100, replace = TRUE),
  date5 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'), 100, replace = TRUE)
)
So for the first attempt I opted for rowwise. I hadn't used this before, but the output is identified as 'rowwise_df', which I take to be similar to what I would have got if I had used group_by.
data <- data %>%
  rowwise() %>%
  mutate(earliest_date = min(c(date1, date2, date3, date4, date5),
                             na.rm = TRUE))
Having looked around, it would appear that rowwise is not considered the best approach (see excellent back and forth here). Reading through, I attempted the following...
data <- data %>%
mutate(try_again = pmap(list(date1, date2, date3, date4, date5),
min, na.rm = TRUE)) %>%
mutate(try_again = as_date(try_again))
table(data$earliest_date == data$try_again)
#>
#> TRUE
#> 100
According to my reprex run, the second option is about twice as fast.
start.time <- Sys.time()
data <- data %>%
rowwise() %>%
mutate(earliest_date = min(c(date1, date2, date3, date4, date5),
na.rm = TRUE))
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
#> Time difference of 0.07597804 secs
start.time <- Sys.time()
data <- data %>%
mutate(try_again = pmap(list(date1, date2, date3, date4, date5),
min, na.rm = TRUE)) %>%
mutate(try_again = as_date(try_again))
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
#> Time difference of 0.03266287 secs
My questions:
1. Is the second strategy using pmap fit for purpose, or is there some inherent error present that I can't see? For example, in earlier attempts, the output column contained list values rather than vectors, which threw me.
I get dizzy anytime I have to work with dates, especially when I read comments such as "A date is a day stored as the number of days since 1970-01-01"...
2. Do the code run times make sense?
Any improvements/direction gratefully received.

I agree with @det that rowwise() isn't the way to go. I think the pmin() function might be best suited to the task, e.g.
data <- transform(data, earliest_date = pmin(date1, date2, date3, date4, date5, na.rm = TRUE))
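The same idea in dplyr syntax, if you prefer to stay in a pipe (a minimal sketch, using the column names from your example; pmin() is vectorised, so no rowwise() or pmap() is needed):
data <- data %>%
  mutate(earliest_date = pmin(date1, date2, date3, date4, date5, na.rm = TRUE))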
Benchmarking (updated to include a data.table solution):
library(tidyverse)
library(lubridate)
library(data.table)
set.seed(101)
data <- tibble(
  date1 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'), 100, replace = TRUE),
  date2 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'), 100, replace = TRUE),
  date3 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'), 100, replace = TRUE),
  date4 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'), 100, replace = TRUE),
  date5 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'), 100, replace = TRUE)
)
rowwise_func <- function(data){
data %>%
rowwise() %>%
mutate(earliest_date = min(c(date1, date2, date3, date4, date5),
na.rm = TRUE)) %>%
ungroup()
}
pmap_func <- function(data){
data %>%
mutate(try_again = pmap(list(date1, date2, date3, date4, date5),
min, na.rm = TRUE))
}
det_func1 <- function(data){
data %>%
mutate(min_date = pmap_dbl(select(., matches("^date")), min) %>% as.Date(origin = "1970-01-01"))
}
det_faster <- function(data){
data[["min_date"]] <- data %>%
mutate(across(where(is.Date), as.integer)) %>%
as.matrix() %>%
apply(1, function(x) x[which.min(x)]) %>%
as.Date(origin = "1970-01-01")
}
transform_func <- function(data){
as_tibble(transform(data, earliest_date = pmin(date1, date2, date3, date4, date5, na.rm = TRUE)))
}
dt_func <- function(data){
setDT(data)
data[, earliest_date := pmin(date1, date2, date3, date4, date5, na.rm = TRUE)]
}
times <- microbenchmark::microbenchmark(rowwise_func(data), pmap_func(data), det_func1(data), det_faster(data), transform_func(data), dt_func(data))
autoplot(times)
data2 <- transform_func(data)
data3 <- rowwise_func(data)
identical(data2, data3)
#> TRUE
Unit: microseconds
                 expr      min        lq      mean    median        uq        max neval cld
   rowwise_func(data) 6764.693 6919.6720 7375.0418 7066.6220 7271.5850  16290.696   100  ab
      pmap_func(data) 3994.973 4150.1360 9425.3880 4252.9850 4437.2950 491030.248   100   b
      det_func1(data) 5576.240 5724.6820 6249.7573 5845.3305 5985.5940  15106.741   100  ab
     det_faster(data) 3182.016 3305.3525 3556.8628 3362.8720 3444.0505  12771.952   100  ab
 transform_func(data)  564.194  624.1055  697.5630  680.1130  718.7975   1513.184   100   a
        dt_func(data)  650.611  723.7235  956.7916  759.3355  782.0565  10806.902   100   a
So, based on the functions I used above, the transform + pmin method was ~ 10X faster than the rowwise method.

From my experience rowwise is extremely slow, so I prefer using any other option (at the cost of having less tidy code), especially if I have numeric columns (then I convert to a matrix). pmap is definitely an option, but sometimes I have trouble listing all the needed columns (it doesn't have a tidyselect option). This can be somewhat avoided by using select within pmap:
data <- data %>%
mutate(min_date = pmap_dbl(select(., matches("^date")), min) %>% as.Date(origin = "1970-01-01"))
Converting to a matrix has usually been the fastest way (much faster) for my problems, in combination with a function like apply or sweep:
data[["min_date"]] <- data %>%
mutate(across(where(is.Date), as.integer)) %>%
as.matrix() %>%
apply(1, function(x) x[which.min(x)]) %>%
as.Date(origin = "1970-01-01")
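If the annoyance is just having to list every column by hand, pmin() can also be combined with do.call() so the columns are picked up by a name pattern instead (a base R sketch, assuming all relevant columns match "^date"):
date_cols <- grep("^date", names(data), value = TRUE)
data[["min_date"]] <- do.call(pmin, c(data[date_cols], na.rm = TRUE))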

Related

Using summarize across with multiple functions when there are missing values

If I want to get the mean and sum of all the numeric columns using the mtcars data set, I would use the following code:
mtcars %>%
  group_by(gear) %>%
  summarise(across(where(is.numeric), list(mean = mean, sum = sum)))
But if I have missing values in some of the columns, how do I take that into account? Here is a reproducible example:
test.df1 <- data.frame("Year" = sample(2018:2020, 20, replace = TRUE),
"Firm" = head(LETTERS, 5),
"Exporter"= sample(c("Yes", "No"), 20, replace = TRUE),
"Revenue" = sample(100:200, 20, replace = TRUE),
stringsAsFactors = FALSE)
test.df1 <- rbind(test.df1,
data.frame("Year" = c(2018, 2018),
"Firm" = c("Y", "Z"),
"Exporter" = c("Yes", "No"),
"Revenue" = c(NA, NA)))
test.df1 <- test.df1 %>% mutate(Profit = Revenue - sample(20:30, 22, replace = TRUE ))
test.df_summarized <- test.df1 %>% group_by(Firm) %>% summarize(across(where(is.numeric)), list(mean = mean, sum = sum)))
If I just wanted to summarize each variable separately, I could use the following:
test.df1 %>% group_by(Firm) %>% summarize(Revenue_mean = mean(Revenue, na.rm = TRUE),
                                          Profit_mean = mean(Profit, na.rm = TRUE))
But I am trying to figure out how I can adapt the mtcars code I wrote above to the example data set I have provided here.
Because your functions all have an na.rm argument, you can pass it along through the ...:
test.df1 %>% summarize(across(where(is.numeric), list(mean = mean, sum = sum), na.rm = TRUE))
# Year_mean Year_sum Revenue_mean Revenue_sum Profit_mean Profit_sum
# 1 2019.045 44419 162.35 3247 138.25 2765
(I left out the group_by because it's not specified properly in your code and the example is still well-illustrated without it. Also make sure that your functions are inside across().)
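If you do want the per-firm summary from your original question, the same call works with the grouping added back in (a sketch, assuming Firm is the intended grouping variable):
test.df1 %>%
  group_by(Firm) %>%
  summarize(across(where(is.numeric), list(mean = mean, sum = sum), na.rm = TRUE))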
Just for the record, you could also do it like this (and this works when the different functions have different arguments):
test.df1 %>%
summarise(across(where(is.numeric),
list(
mean = ~ mean(.x, na.rm = T),
sum = ~ sum(.x, na.rm = T))
)
)
# Year_mean Year_sum Revenue_mean Revenue_sum Profit_mean Profit_sum
# 1 2019.045 44419 144.05 2881 119.3 2386
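As a side note, newer dplyr versions (1.1.0+) deprecate passing extra arguments such as na.rm through across()'s ..., so the lambda style above, or the \(x) shorthand available from R 4.1, is the safer long-term form (a sketch):
test.df1 %>%
  summarise(across(where(is.numeric),
                   list(mean = \(x) mean(x, na.rm = TRUE),
                        sum  = \(x) sum(x, na.rm = TRUE))))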

summarise data for multiple variables of a data.frame in r?

I am trying to compute the upper and lower quartiles of the two variables in my data.frame across the time period of interest. The code below only gave me a single upper and lower value.
library(tidyverse)
library(lubridate)

set.seed(50)
FakeData <- data.frame(seq(as.Date("2001-01-01"), to= as.Date("2003-12-31"), by="day"),
A = runif(1095, 0,10),
D = runif(1095,5,15))
colnames(FakeData) <- c("Date", "A","D")
statistics <- FakeData %>%
gather(-Date, key = "Variable", value = "Value") %>%
mutate(Year = year(Date), Month = month(Date)) %>%
filter(between(Month,3,5)) %>%
mutate(NewDate = ymd(paste("2020", Month,day(Date), sep = "-"))) %>%
group_by(Variable, NewDate) %>%
summarise(Upper = quantile(Value,0.75, na.rm = T),
Lower = quantile(Value, 0.25, na.rm = T))
I would want an output like the one below (Final_Output is what I am interested in):
Output1 <- data.frame(seq(as.Date("2000-03-01"), to= as.Date("2000-05-31"), by="day"),
Upper = runif(92, 0,10), lower = runif(92,5,15), Variable = rep("A",92))
colnames(Output1)[1] <- "Date"
Output2 <- data.frame(seq(as.Date("2000-03-01"), to= as.Date("2000-05-31"), by="day"),
Upper = runif(92, 2,10), lower = runif(92,5,15), Variable = rep("D",92))
colnames(Output2)[1] <- "Date"
Final_Output<- bind_rows(Output1,Output2)
I can propose a data.table solution; in fact there are several ways to do this.
library(data.table)
setDT(statistics)
The final step (applying the quartiles by group on the Value variable) could then be translated as follows (if, as in your example, you want two columns):
statistics[, .('p25' = quantile(get('Value'), probs = 0.25), 'p75' = quantile(get('Value'), probs = 0.75)),
           by = c("Variable", "NewDate")]
If you prefer long-formatted output:
statistics[, .(quantile = quantile(get('Value'), probs = c(0.25, 0.75))),
           by = c("Variable", "NewDate")]
All steps together
It's probably better, if you choose to use data.table, to do all the steps using data.table verbs. I will assume your data have a structure similar to the data frame you generated and arranged, i.e.
statistics <- FakeData %>%
gather(-Date, key = "Variable", value = "Value")
In that case (after converting with setDT(statistics)), the mutate and filter steps would become:
statistics[,`:=`(Year = year(Date), Month = month(Date))]
statistics <- statistics[Month %between% c(3,5)]
statistics[, NewDate := ymd(paste("2020", Month, day(Date), sep = "-"))]
And choose the final step you prefer, e.g.
statistics[,.('p25' = quantile(get('Value'), probs = 0.25), 'p75' = quantile(get('Value'), probs = 0.75)),
by = c("Variable", "NewDate")]

Difference between indexing with $ and [[]]?

I have a question regarding indexing a data frame in R. This is the code:
Gewicht <- data %>%
group_by(data[[376]]) %>%
summarise(weights = mean(data[[10190]], na.rm = TRUE))
Gewicht2 <- data %>%
group_by(data[[376]]) %>%
summarise(weights = mean(Weights, na.rm = TRUE))
a <- seq(1:10)
b <- rep(c("male", "female"),5)
c <- seq(1:10)
data <- as.data.frame(cbind(a,b,c))
data$c <- as.numeric(data$c)
newdata <- data %>%
group_by(data[[2]]) %>%
summarise(Mean = mean(c, na.rm = TRUE))
newdata2 <- data %>%
group_by(data[[2]]) %>%
summarise(Mean = mean(data[[3]], na.rm = TRUE))
print(newdata)
print(newdata2)
I get different results for the two data frames. The desired result is in "newdata". Can you tell me why I get different values for these two calculations?
I need the bracket notation for a more complex custom function, but it seems to compute the mean over the whole data frame, where I would hope to get the mean for each group.
How to use [] or [[]] correctly here?
a <- c(1,2,3,4,5,6,7,8,9,10)
b <- rep(c("male", "female"),5)
c <- c(1,2,3,4,5,6,7,8,9,10)
data <- data.frame(cbind(a,b,c))
data$c <- as.numeric(as.character(data$c))
c
data$c
print(newdata)
print(newdata2)
newdata <- data %>%
group_by(data[[2]]) %>%
summarise(Mean = mean(c, na.rm = TRUE))
newdata2 <- data %>%
group_by(data[[2]]) %>%
summarise(Mean = mean(data[[3]], na.rm = TRUE))
newdata
newdata2
Updated code, still different results :(
Gewicht <- aggregate(data[[varGewicht]], by=list(data[[varx]]), FUN=mean, na.rm = TRUE)
Aggregate function works :-)
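For what it's worth, the differing results most likely come from data[[3]] inside summarise() referring back to the whole original data frame rather than to the rows of the current group, so each group gets the overall mean. A sketch of a position-based version that respects the grouping, using dplyr's .data pronoun with column names looked up by position (grp_col and val_col are just helper names for the sketch):
grp_col <- names(data)[2]  # grouping column, selected by position
val_col <- names(data)[3]  # value column, selected by position
newdata3 <- data %>%
  group_by(.data[[grp_col]]) %>%
  summarise(Mean = mean(.data[[val_col]], na.rm = TRUE))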

Normalising data with dplyr mutate() brings inconsistencies

I'm trying to reproduce the framework from this blog post http://www.luishusier.com/2017/09/28/balance/ with the following code, but it looks like I get inconsistent results:
library(tidyverse)
library(magrittr)
ids <- c("1617", "1516", "1415", "1314", "1213", "1112", "1011", "0910", "0809", "0708", "0607", "0506")
data <- ids %>%
map(function(i) {read_csv(paste0("http://www.football-data.co.uk/mmz4281/", i ,"/F1.csv")) %>%
select(Date:AST) %>%
mutate(season = i)})
data <- bind_rows(data)
data <- data[complete.cases(data[ , 1:3]), ]
tmp1 <- data %>%
select(season, HomeTeam, FTHG:FTR,HS:AST) %>%
rename(BP = FTHG,
BC = FTAG,
TP = HS,
TC = AS,
TCP = HST,
TCC = AST,
team = HomeTeam)%>%
mutate(Pts = ifelse(FTR == "H", 3, ifelse(FTR == "A", 0, 1)),
Terrain = "Domicile")
tmp2 <- data %>%
select(season, AwayTeam, FTHG:FTR, HS:AST) %>%
rename(BP = FTAG,
BC = FTHG,
TP = AS,
TC = HS,
TCP = AST,
TCC = HST,
team = AwayTeam)%>%
mutate(Pts = ifelse(FTR == "A", 3 ,ifelse(FTR == "H", 0 , 1)),
Terrain = "Extérieur")
tmp3 <- bind_rows(tmp1, tmp2)
l1_0517 <- tmp3 %>%
group_by(season, team)%>%
summarise(j = n(),
pts = sum(Pts),
diff_but = (sum(BP) - sum(BC)),
diff_t_ca = (sum(TCP, na.rm = T) - sum(TCC, na.rm = T)),
diff_t = (sum(TP, na.rm = T) - sum(TC, na.rm = T)),
but_p = sum(BP),
but_c = sum(BC),
tir_ca_p = sum(TCP, na.rm = T),
tir_ca_c = sum(TCC, na.rm = T),
tir_p = sum(TP, na.rm = T),
tir_c = sum(TC, na.rm = T)) %>%
arrange((season), desc(pts), desc(diff_but))
Then I apply the framework mentioned above:
l1_0517 <- l1_0517 %>%
mutate(
# First, see how many goals the team scores relative to the average
norm_attack = but_p %>% divide_by(mean(but_p)) %>%
# Then, transform it into an unconstrained scale
log(),
# First, see how many goals the team concedes relative to the average
norm_defense = but_c %>% divide_by(mean(but_c)) %>%
# Invert it, so a higher defense is better
raise_to_power(-1) %>%
# Then, transform it into an unconstrained scale
log(),
# Now that we have normalized attack and defense ratings, we can compute
# measures of quality and attacking balance
quality = norm_attack + norm_defense,
balance = norm_attack - norm_defense
) %>%
arrange(desc(norm_attack))
When I look at the column norm_attack, I expect to find the same value for equivalent but_p values, which is not the case here:
head(l1_0517, 10)
For instance, when but_p has the value 83 (rows 5 and 7), I get norm_attack values of 0.5612738 and 0.5128357 respectively.
Is this normal? I would expect mean(l1_0517$but_p) to be fixed, and therefore to obtain the same result whenever a value of l1_0517$but_p is log-normalised.
UPDATE
I have tried to work on a simpler example but I can't reproduce this issue:
df <- tibble(a = as.integer(runif(200, 15, 100)))
df <- df %>%
mutate(norm_a = a %>% divide_by(mean(a)) %>%
log())
I found the solution after looking at the type of l1_0517: it is a grouped_df, hence the different results.
The correct code is:
l1_0517 <- tmp3 %>%
group_by(season, team)%>%
summarise(j = n(),
pts = sum(Pts),
diff_but = (sum(BP) - sum(BC)),
diff_t_ca = (sum(TCP, na.rm = T) - sum(TCC, na.rm = T)),
diff_t = (sum(TP, na.rm = T) - sum(TC, na.rm = T)),
but_p = sum(BP),
but_c = sum(BC),
tir_ca_p = sum(TCP, na.rm = T),
tir_ca_c = sum(TCC, na.rm = T),
tir_p = sum(TP, na.rm = T),
tir_c = sum(TC, na.rm = T)) %>%
ungroup() %>%
arrange((season), desc(pts), desc(diff_but))
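A tiny illustration of the effect on made-up data; the only difference between the two pipelines is the ungroup():
df <- tibble(g = c("a", "a", "b"), x = c(2, 4, 9))
df %>% group_by(g) %>% mutate(norm = x / mean(x))               # mean(x) is computed per group
df %>% group_by(g) %>% ungroup() %>% mutate(norm = x / mean(x)) # mean(x) is computed over all rows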

Multi windows range calculations data.table vs dplyr

I'm doing range calculations (i.e. max and min) over multiple windows on stock returns.
I have my version in dplyr, but many people publish benchmarks where calculations with data.table are much faster. I've created a version with data.table syntax; however, it's slower than the dplyr one.
Could anyone help me find a better way to use data.table to make it faster?
Many thanks.
library(Quandl)
library(tidyr)
library(dplyr)
library(data.table)
library(microbenchmark)
tickers <- c("GOOG/NASDAQ_AAPL", "GOOG/NASDAQ_MSFT",
"GOOG/NYSE_IBM", "GOOG/NASDAQ_GOOG")
data <- Quandl(tickers,transformation = "rdiff")
returns <- gather(data, stock, value, -Date) %>%
separate(stock, c("name", "field"), " - ") %>%
filter(
field == "Close"
) %>%
select(
- field
)
returns_dt <- data.table(returns)
multi_window_range <- function(data) {
result_1y <- data %>%
filter(
Date >= Sys.Date() - 365
) %>%
group_by(name) %>%
summarise(
max_1y = max(value, na.rm = TRUE),
min_1y = min(value, na.rm = TRUE)
)
result_2y <- data %>%
filter(
Date >= Sys.Date() - 365 * 2
) %>%
group_by(name) %>%
summarise(
max_2y = max(value, na.rm = TRUE),
min_2y = min(value, na.rm = TRUE)
)
result_5y <- data %>%
filter(
Date >= Sys.Date() - 365 * 5
) %>%
group_by(name) %>%
summarise(
max_5y = max(value, na.rm = TRUE),
min_5y = min(value, na.rm = TRUE)
)
return(inner_join(inner_join(result_1y, result_2y, by = "name"), result_5y, by = "name"))
}
multi_window_range_dt <- function(data) {
setkey(data, name)
result_1y <- data[Date >= Sys.Date() - 365,
list(
max_1y = max(value, na.rm = TRUE),
min_1y = min(value, na.rm = TRUE)
), by = "name"]
result_2y <- data[Date >= Sys.Date() - 365 * 2,
list(
max_2y = max(value, na.rm = TRUE),
min_2y = min(value, na.rm = TRUE)
), by = "name"]
result_5y <- data[Date >= Sys.Date() - 365 * 5,
list(
max_5y = max(value, na.rm = TRUE),
min_5y = min(value, na.rm = TRUE)
), by = "name"]
return(result_1y[result_2y][result_5y])
}
microbenchmark(
multi_window_range(returns),
multi_window_range_dt(returns_dt)
)
Unit: milliseconds
                              expr      min       lq     mean   median       uq      max neval
       multi_window_range(returns) 6.341532 6.522303 6.915266 6.692666 6.922623 10.16709   100
 multi_window_range_dt(returns_dt) 7.537073 7.738516 8.066579 7.865968 8.073114 12.68021   100
Try this:
multi_window_range_dt2 <- function(data) {
data[, {
rng1 <- range(value[Date > Sys.Date() - 365], na.rm = TRUE)
rng2 <- range(value[Date > Sys.Date() - 2*365], na.rm = TRUE)
rng5 <- range(value[Date > Sys.Date() - 5*365], na.rm = TRUE)
list(max_1y = rng1[2], min_1y = rng1[1],
max_2y = rng2[2], min_2y = rng2[1],
max_5y = rng5[2], min_5y = rng5[1])
}, by = "name"]
}
library(rbenchmark)
benchmark(multi_window_range(returns), multi_window_range_dt2(returns_dt))[1:4]
which gives this on my laptop:
                                 test replications elapsed relative
1        multi_window_range(returns)          100    2.39    1.189
2 multi_window_range_dt2(returns_dt)          100    2.01    1.000
This indicates that multi_window_range takes 18.9% more time than multi_window_range_dt2.
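For comparison, a single-pass dplyr version in the same spirit can be written too (a sketch; like multi_window_range_dt2 it avoids the three separate filters and joins, and multi_window_range2 is just a made-up name):
multi_window_range2 <- function(data) {
  data %>%
    group_by(name) %>%
    summarise(
      max_1y = max(value[Date >= Sys.Date() - 365], na.rm = TRUE),
      min_1y = min(value[Date >= Sys.Date() - 365], na.rm = TRUE),
      max_2y = max(value[Date >= Sys.Date() - 365 * 2], na.rm = TRUE),
      min_2y = min(value[Date >= Sys.Date() - 365 * 2], na.rm = TRUE),
      max_5y = max(value[Date >= Sys.Date() - 365 * 5], na.rm = TRUE),
      min_5y = min(value[Date >= Sys.Date() - 365 * 5], na.rm = TRUE)
    )
}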
