Multi-window range calculations: data.table vs dplyr in R

I'm doing range calculations (i.e. max and min) over multiple windows on stock returns.
I have a dplyr version, and many published benchmarks show data.table being much faster for this kind of calculation. I wrote a version with data.table syntax, but it's slower than the dplyr one.
Could anyone help me find a better way to use data.table to make it faster?
Many thanks.
library(Quandl)
library(tidyr)
library(dplyr)
library(data.table)
library(microbenchmark)
tickers <- c("GOOG/NASDAQ_AAPL", "GOOG/NASDAQ_MSFT",
             "GOOG/NYSE_IBM", "GOOG/NASDAQ_GOOG")
data <- Quandl(tickers, transformation = "rdiff")
returns <- gather(data, stock, value, -Date) %>%
  separate(stock, c("name", "field"), " - ") %>%
  filter(field == "Close") %>%
  select(-field)
returns_dt <- data.table(returns)
multi_window_range <- function(data) {
  result_1y <- data %>%
    filter(Date >= Sys.Date() - 365) %>%
    group_by(name) %>%
    summarise(max_1y = max(value, na.rm = TRUE),
              min_1y = min(value, na.rm = TRUE))
  result_2y <- data %>%
    filter(Date >= Sys.Date() - 365 * 2) %>%
    group_by(name) %>%
    summarise(max_2y = max(value, na.rm = TRUE),
              min_2y = min(value, na.rm = TRUE))
  result_5y <- data %>%
    filter(Date >= Sys.Date() - 365 * 5) %>%
    group_by(name) %>%
    summarise(max_5y = max(value, na.rm = TRUE),
              min_5y = min(value, na.rm = TRUE))
  return(inner_join(inner_join(result_1y, result_2y, by = "name"), result_5y, by = "name"))
}
multi_window_range_dt <- function(data) {
  setkey(data, name)
  result_1y <- data[Date >= Sys.Date() - 365,
                    list(max_1y = max(value, na.rm = TRUE),
                         min_1y = min(value, na.rm = TRUE)), by = "name"]
  result_2y <- data[Date >= Sys.Date() - 365 * 2,
                    list(max_2y = max(value, na.rm = TRUE),
                         min_2y = min(value, na.rm = TRUE)), by = "name"]
  result_5y <- data[Date >= Sys.Date() - 365 * 5,
                    list(max_5y = max(value, na.rm = TRUE),
                         min_5y = min(value, na.rm = TRUE)), by = "name"]
  return(result_1y[result_2y][result_5y])
}
microbenchmark(
  multi_window_range(returns),
  multi_window_range_dt(returns_dt)
)
Unit: milliseconds
expr min lq mean median uq max neval
multi_window_range(returns) 6.341532 6.522303 6.915266 6.692666 6.922623 10.16709 100
multi_window_range_dt(returns_dt) 7.537073 7.738516 8.066579 7.865968 8.073114 12.68021 100

Try this:
multi_window_range_dt2 <- function(data) {
  data[, {
    rng1 <- range(value[Date > Sys.Date() - 365], na.rm = TRUE)
    rng2 <- range(value[Date > Sys.Date() - 2*365], na.rm = TRUE)
    rng5 <- range(value[Date > Sys.Date() - 5*365], na.rm = TRUE)
    list(max_1y = rng1[2], min_1y = rng1[1],
         max_2y = rng2[2], min_2y = rng2[1],
         max_5y = rng5[2], min_5y = rng5[1])
  }, by = "name"]
}
library(rbenchmark)
benchmark(multi_window_range(returns), multi_window_range_dt2(returns_dt))[1:4]
which gives this on my laptop:
test replications elapsed relative
1 multi_window_range(returns) 100 2.39 1.189
2 multi_window_range_dt2(returns_dt) 100 2.01 1.000
This indicates that multi_window_range takes 18.9% more time than multi_window_range_dt2.
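A further tweak along the same lines (an untested sketch; multi_window_range_dt3 is just an illustrative name): the grouped j expression is evaluated once per group, so hoisting the Sys.Date() cutoffs out of it avoids recomputing them for every stock.
multi_window_range_dt3 <- function(data) {
  cut1 <- Sys.Date() - 365
  cut2 <- Sys.Date() - 2 * 365
  cut5 <- Sys.Date() - 5 * 365
  data[, {
    rng1 <- range(value[Date > cut1], na.rm = TRUE)
    rng2 <- range(value[Date > cut2], na.rm = TRUE)
    rng5 <- range(value[Date > cut5], na.rm = TRUE)
    list(max_1y = rng1[2], min_1y = rng1[1],
         max_2y = rng2[2], min_2y = rng2[1],
         max_5y = rng5[2], min_5y = rng5[1])
  }, by = "name"]
}
On a data set this small the difference is unlikely to matter, but it keeps the per-group work minimal.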

Related

How to write a bootstrapped mean difference using tidyverse?

I am interested in the difference in the mean of some variable according to a binary covariate.
I am computing the confidence interval of this difference by bootstrapping:
library(tidyverse)
df = mtcars %>%
  select(disp, vs) %>%
  mutate(vs = factor(vs, labels = c("vshaped", "straight")))
by1 = "straight"
by2 = "vshaped"
R = 1000
set.seed(1)
beffect = numeric(length = R)
for (i in 1:R) {
  ib = sample(1:nrow(df), replace = TRUE)
  xi = df$disp[ib]
  byi = df$vs[ib]
  beffect[i] = mean(xi[byi == by2], na.rm = TRUE) - mean(xi[byi == by1], na.rm = TRUE)
}
mean(beffect)
#> [1] 175.9203
sd(beffect)
#> [1] 29.3409
Created on 2021-06-13 by the reprex package (v2.0.0)
This works, but I find it quite unreadable and I wonder about its efficiency, as for loops are often considered a bad design in R.
Being a heavy user of the tidyverse, I would like to rewrite this using this framework.
Is there a fast and readable way to do so?
PS: Here is the closest I could get, but it is far from being more readable and it is 250 times slower:
beffect2 = replicate(R, {
  df %>%
    slice_sample(prop = 1, replace = TRUE) %>%
    group_by(vs) %>%
    summarise(m = mean(disp)) %>%
    pivot_wider(names_from = "vs", values_from = "m") %>%
    transmute(x = !!ensym(by2) - !!ensym(by1))
}, simplify = FALSE) %>%
  map_dbl(identity)
EDIT: here are the benchmarks of all methods so far:
# with R=50 ***********
# microbenchmark::microbenchmark(f_dc(50), f_akrun(50), f_akrun_diff(50), f_akrun_bindout(50), f_cole(50), f_forloop(50), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval
# f_dc() 497.4559 524.9582 560.94690 553.6271 572.2261 656.4672 5
# f_akrun() 101.6295 108.5232 111.22400 110.7238 111.4105 123.8330 5
# f_akrun_diff() 270.0261 283.3257 308.92806 283.6411 314.7233 392.9241 5
# f_akrun_bindout() 21.8185 21.9725 76.68770 22.9811 30.2129 286.4535 5
# f_cole() 2.7685 3.1343 3.63484 3.2679 4.4346 4.5689 5
# f_forloop() 2.1136 2.1277 3.14156 3.4968 3.6740 4.2957 5
# with R=500 **********
# microbenchmark::microbenchmark(f_dc(500), f_akrun(500), f_akrun_diff(500), f_akrun_bindout(500), f_cole(500), f_forloop(500), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval
# f_dc() 4270.2451 4535.4618 4543.85930 4539.3032 4613.5823 4760.7041 5
# f_akrun() 936.3249 951.3230 970.27424 956.3674 992.3162 1015.0397 5
# f_akrun_diff() 2501.3871 2509.5429 2589.47288 2608.5254 2649.3819 2678.5271 5
# f_akrun_bindout() 108.3761 108.7238 113.26746 112.2521 118.4673 118.5180 5
# f_cole() 23.1283 23.4074 24.75386 23.9244 26.4594 26.8498 5
# f_forloop() 20.4243 21.1367 23.26222 21.2130 22.5616 30.9755 5
This may be overlooking the obvious, but the tidyverse equivalent of a for loop would involve something like purrr::map(). The simplest conversion would be to use purrr::map_dbl(1:R, ...) such as:
library(purrr)
## better for memory and performance to extract the vectors ahead of the loop
disp = df$disp
vs = df$vs
map_dbl(1:R,
        ~ {
          ib = sample(nrow(df), replace = TRUE)
          xi = disp[ib]
          byi = vs[ib]
          mean(xi[byi == by2], na.rm = TRUE) - mean(xi[byi == by1], na.rm = TRUE)
        })
Also, since by is binary, you may be able to improve performance by translating this into Rcpp.
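For illustration only, here is a rough sketch of that idea (untested; it assumes the Rcpp package is installed, and boot_mean_diff is a made-up name). The whole bootstrap runs in compiled code, splitting on the binary grouping vector:
library(Rcpp)
cppFunction('
NumericVector boot_mean_diff(NumericVector x, LogicalVector in_grp2, int nrep) {
  int n = x.size();
  NumericVector out(nrep);
  for (int r = 0; r < nrep; ++r) {
    double s1 = 0.0, s2 = 0.0;
    int n1 = 0, n2 = 0;
    for (int i = 0; i < n; ++i) {
      int j = (int) (R::runif(0.0, 1.0) * n);   // bootstrap index in 0..n-1
      if (in_grp2[j]) { s2 += x[j]; ++n2; } else { s1 += x[j]; ++n1; }
    }
    out[r] = s2 / n2 - s1 / n1;
  }
  return out;
}')
set.seed(1)
beffect_cpp <- boot_mean_diff(df$disp, df$vs == by2, R)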
We could use map and avoid the multiple pivot_wider steps
library(purrr)
library(dplyr)
set.seed(1)
out <- map_dfr(seq_len(R), ~ {
  ib <- sample(1:nrow(df), replace = TRUE)
  df %>%
    slice(ib) %>%
    summarise(beffect = mean(disp[vs == by2], na.rm = TRUE) -
                        mean(disp[vs == by1], na.rm = TRUE))
})
Checking:
mean(out$beffect)
#[1] 175.9203
sd(out$beffect)
#[1] 29.3409
Or we may use diff instead of pivot_wider:
set.seed(1)
out2 <- replicate(R, df %>%
    slice_sample(prop = 1, replace = TRUE) %>%
    group_by(vs) %>%
    summarise(m = mean(disp), .groups = 'drop') %>%
    summarise(beffect = diff(m[2:1])), simplify = FALSE) %>%
  bind_rows
Checking:
mean(out2$beffect)
#[1] 175.9203
Or another option: draw the bootstrap indices, bind them together with a group identifier, use the indices to extract the column values, then group by the identifier and 'vs' and take the mean.
set.seed(1)
out3 <- replicate(R, sample(seq_len(nrow(df)), replace = TRUE) %>%
    as_tibble, simplify = FALSE) %>%
  bind_rows(.id = 'grp') %>%
  mutate(vs = df$vs[value], disp = df$disp[value]) %>%
  group_by(grp, vs) %>%
  summarise(beffect = mean(disp), .groups = 'drop_last') %>%
  group_by(grp) %>%
  summarise(beffect = diff(beffect[2:1]), .groups = 'drop')
Checking:
mean(out3$beffect)
#[1] 175.9203
Benchmarks
system.time({
  set.seed(1)
  out3 <- replicate(R, sample(seq_len(nrow(df)), replace = TRUE) %>%
      as_tibble, simplify = FALSE) %>%
    bind_rows(.id = 'grp') %>%
    mutate(vs = df$vs[value], disp = df$disp[value]) %>%
    group_by(grp, vs) %>%
    summarise(beffect = mean(disp), .groups = 'drop_last') %>%
    group_by(grp) %>%
    summarise(beffect = diff(beffect[2:1]), .groups = 'drop')
})
#   user  system elapsed
#  0.202   0.007   0.208
Or with map
system.time({
  set.seed(1)
  out <- map_dfr(seq_len(R), ~ {
    ib <- sample(1:nrow(df), replace = TRUE)
    df %>%
      slice(ib) %>%
      summarise(beffect = mean(disp[vs == by2], na.rm = TRUE) -
                          mean(disp[vs == by1], na.rm = TRUE))
  })
})
#   user  system elapsed
#  1.329   0.013   1.338
Or instead of pivot_wider, take the diff
system.time({
  set.seed(1)
  out2 <- replicate(R, df %>%
      slice_sample(prop = 1, replace = TRUE) %>%
      group_by(vs) %>%
      summarise(m = mean(disp), .groups = 'drop') %>%
      summarise(beffect = diff(m[2:1])), simplify = FALSE) %>%
    bind_rows
})
#   user  system elapsed
#  3.753   0.027   3.758
Or a similar approach in data.table
library(data.table)
system.time({
  setDT(df)
  set.seed(1)
  out3 <- rbindlist(
    replicate(R,
              df[df[, .I[sample(seq_len(.N), replace = TRUE)]]
                 ][, .(m = mean(disp)), vs][, .(beffect = m[2] - m[1])],
              simplify = FALSE)
  )
})
#   user  system elapsed
#  1.181   0.055   1.230
OP's method:
system.time({replicate(R, {
  df %>%
    slice_sample(prop = 1, replace = TRUE) %>%
    group_by(vs) %>%
    summarise(m = mean(disp)) %>%
    pivot_wider(names_from = "vs", values_from = "m") %>%
    transmute(x = !!ensym(by2) - !!ensym(by1))
}, simplify = FALSE)})
#   user  system elapsed
#  6.991   0.063   7.009
microbenchmark::microbenchmark(f_dc(), f_akrun1(), f_akrun2(), f_akrun3(), f_forloop(), times = 5)
Unit: milliseconds
expr min lq mean median uq max neval cld
f_dc() 6453.14052 6512.34196 6772.0079 6534.08171 6939.61358 7420.86152 5 d
f_akrun1() 1288.96812 1328.96075 1377.0833 1353.79346 1372.30852 1541.38573 5 b
f_akrun2() 3685.33619 3703.33018 3814.8367 3801.52657 3915.75432 3968.23609 5 c
f_akrun3() 178.30997 179.77604 194.0712 189.18425 205.37485 217.71095 5 a
f_forloop() 30.11329 33.37171 35.0534 36.80903 36.95909 38.01389 5 a

Using purrr to help transform a large data file

I have a bit of code that goes through a number of columns containing dates and selects the earliest date from the options to populate a new column. To do this I was using the dplyr::rowwise function.
Unfortunately, the data set is quite big, so this comes at a real time cost. Here is an example of my initial approach.
library(tidyverse)
library(lubridate)
set.seed(101)
data <- tibble(date1 = sample(
seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
100, replace = TRUE),
date2 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
100, replace = TRUE),
date3 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
100, replace = TRUE),
date4 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
100, replace = TRUE),
date5 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
100, replace = TRUE))
So for the first attempt I opted for rowwise. I hadn't used this before, but the output is identified as a 'rowwise_df', which I take to be similar to what I would get with group_by.
data <- data %>%
  rowwise() %>%
  mutate(earliest_date = min(c(date1, date2, date3, date4, date5),
                             na.rm = TRUE))
Having looked around, it would appear that rowwise is not considered the best approach (see excellent back and forth here). Reading through, I attempted the following...
data <- data %>%
  mutate(try_again = pmap(list(date1, date2, date3, date4, date5),
                          min, na.rm = TRUE)) %>%
  mutate(try_again = as_date(try_again))
table(data$earliest_date == data$try_again)
#>
#> TRUE
#> 100
According to my reprex run the second option is twice as fast.
start.time <- Sys.time()
data <- data %>%
  rowwise() %>%
  mutate(earliest_date = min(c(date1, date2, date3, date4, date5),
                             na.rm = TRUE))
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
#> Time difference of 0.07597804 secs
start.time <- Sys.time()
data <- data %>%
  mutate(try_again = pmap(list(date1, date2, date3, date4, date5),
                          min, na.rm = TRUE)) %>%
  mutate(try_again = as_date(try_again))
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
#> Time difference of 0.03266287 secs
My questions:
1. Is the second strategy using pmap fit for purpose or is there some inherent error present that I can't see? For example, in earlier attempts, the output column contained list values rather than vectors which threw me.
I get dizzy anytime I have to work with dates, especially when I read comments such as "A date is a day stored as the number of days since 1970-01-01"...
2. Do the code run times make sense?
Any improvements/direction gratefully received.
I agree with @det that rowwise isn't the way to go. I think the pmin function might be the best suited to the task, e.g.:
data <- transform(data, earliest_date = pmin(date1, date2, date3, date4, date5, na.rm = TRUE))
Benchmarking (updated to include a data.table solution):
library(tidyverse)
library(lubridate)
set.seed(101)
data <- tibble(date1 = sample(
seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
100, replace = TRUE),
date2 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
100, replace = TRUE),
date3 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
100, replace = TRUE),
date4 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
100, replace = TRUE),
date5 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
100, replace = TRUE))
rowwise_func <- function(data){
  data %>%
    rowwise() %>%
    mutate(earliest_date = min(c(date1, date2, date3, date4, date5),
                               na.rm = TRUE)) %>%
    ungroup()
}

pmap_func <- function(data){
  data %>%
    mutate(try_again = pmap(list(date1, date2, date3, date4, date5),
                            min, na.rm = TRUE))
}

det_func1 <- function(data){
  data %>%
    mutate(min_date = pmap_dbl(select(., matches("^date")), min) %>%
             as.Date(origin = "1970-01-01"))
}

det_faster <- function(data){
  data[["min_date"]] <- data %>%
    mutate(across(where(is.Date), as.integer)) %>%
    as.matrix() %>%
    apply(1, function(x) x[which.min(x)]) %>%
    as.Date(origin = "1970-01-01")
}

transform_func <- function(data){
  as_tibble(transform(data, earliest_date = pmin(date1, date2, date3, date4, date5, na.rm = TRUE)))
}

dt_func <- function(data){
  setDT(data)
  data[, earliest_date := pmin(date1, date2, date3, date4, date5, na.rm = TRUE)]
}
times <- microbenchmark::microbenchmark(rowwise_func(data), pmap_func(data), det_func1(data), det_faster(data), transform_func(data), dt_func(data))
autoplot(times)
data2 <- transform_func(data)
data3 <- rowwise_func(data)
identical(data2, data3)
#> TRUE
Unit: microseconds
expr min lq mean median uq max neval cld
rowwise_func(data) 6764.693 6919.6720 7375.0418 7066.6220 7271.5850 16290.696 100 ab
pmap_func(data) 3994.973 4150.1360 9425.3880 4252.9850 4437.2950 491030.248 100 b
det_func1(data) 5576.240 5724.6820 6249.7573 5845.3305 5985.5940 15106.741 100 ab
det_faster(data) 3182.016 3305.3525 3556.8628 3362.8720 3444.0505 12771.952 100 ab
transform_func(data) 564.194 624.1055 697.5630 680.1130 718.7975 1513.184 100 a
dt_func(data) 650.611 723.7235 956.7916 759.3355 782.0565 10806.902 100 a
So, based on the functions I used above, the transform + pmin method was ~ 10X faster than the rowwise method.
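As a small aside (a sketch, not part of the original answer): the same pmin call also drops straight into mutate() if you prefer to stay inside a dplyr pipe.
data %>%
  mutate(earliest_date = pmin(date1, date2, date3, date4, date5, na.rm = TRUE))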
From my experience rowwise is extremely slow, so I prefer using any other option (at the cost of having less tidy code), especially if I have numeric columns (then I convert to a matrix). pmap is definitely an option, but sometimes I have trouble listing all the needed columns (it doesn't have a tidyselect option). This can be somewhat avoided by using select within pmap:
data <- data %>%
  mutate(min_date = pmap_dbl(select(., matches("^date")), min) %>%
           as.Date(origin = "1970-01-01"))
Converting to a matrix was usually the fastest way (much faster) for my problems, in combination with a function like apply or sweep:
data[["min_date"]] <- data %>%
  mutate(across(where(is.Date), as.integer)) %>%
  as.matrix() %>%
  apply(1, function(x) x[which.min(x)]) %>%
  as.Date(origin = "1970-01-01")
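One more sketch, not from either answer, that combines the two ideas: pmin can be fed a tidy selection of columns through across(), so the columns don't have to be listed by name (this assumes all the date columns match "^date"):
data %>%
  mutate(earliest_date = do.call(pmin, c(across(matches("^date")), na.rm = TRUE)))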

summarise() doesn't recognize a variable

I'm wondering why I get Error: Problem with summarise() input wt_avg below?
library(tidyverse)
CA_vacc <- read_csv('https://raw.githubusercontent.com/rnorouzian/e/master/2017-2018%20CA%20Vaccination%20Data.csv',
na = c(".","--*"))
CA_vacc %>% summarise(
wt_avg = sum(HEPB_percent * ENROLLMENT, na.rm = TRUE) / sum(ENROLLMENT, na.rm = TRUE)
)
# Error: Problem with `summarise()` input `wt_avg`.
Does this work? HEPB_percent appears to be read in as a character column (the raw values carry % and ? markers), so it has to be converted to a number before the weighted average can be computed:
library(dplyr)
library(readr)
CA_vacc %>% summarise(
wt_avg = sum(parse_number(HEPB_percent) * ENROLLMENT, na.rm = TRUE) / sum(ENROLLMENT, na.rm = TRUE)
)
# A tibble: 1 x 1
wt_avg
<dbl>
1 96.8
library(tidyverse)
CA_vacc %>%
  mutate(HEPB_percent = as.numeric(str_remove_all(HEPB_percent, "\\?|%"))) %>%
  summarise(
    wt_avg = sum(HEPB_percent * ENROLLMENT, na.rm = TRUE) / sum(ENROLLMENT, na.rm = TRUE)
  )
Using base R
with(CA_vacc, sum(as.numeric(gsub("[?%]", "", HEPB_percent)) *
ENROLLMENT, na.rm = TRUE)/sum(ENROLLMENT, na.rm = TRUE))
#[1] 96.76707
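As a general diagnostic (a sketch, not specific to this answer): when summarise() complains about one of its inputs, checking the column types is a quick first step. The column names below are taken from the question.
sapply(CA_vacc[c("HEPB_percent", "ENROLLMENT")], class)
# HEPB_percent comes back as character, which is why the arithmetic inside summarise() fails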

How to Add Column Totals to Grouped Summaries in R

I'm in the process of creating summary tables based on subgroups and would love to add an overall summary in a tidier/more efficient manner.
What I have so far is this. I've created summaries via levels within my factor variables.
library(tidyverse)
df <- data.frame(var1 = 10:18,
                 var2 = c("A","B","A","B","A","B","A","B","A"))
group_summary <- df %>% group_by(var2) %>%
  filter(var2 != "NA") %>%
  summarise("Max" = max(var1, na.rm = TRUE),
            "Median" = median(var1, na.rm = TRUE),
            "Min" = min(var1, na.rm = TRUE),
            "IQR" = IQR(var1, na.rm = TRUE),
            "Count" = n())
Next I created an overall summary.
Summary <- df %>%
  filter(var2 != "NA") %>%
  summarise("Max" = max(var1, na.rm = TRUE),
            "Median" = median(var1, na.rm = TRUE),
            "Min" = min(var1, na.rm = TRUE),
            "IQR" = IQR(var1, na.rm = TRUE),
            "Count" = n())
Finally, I bound the two objects with dplyr::bind_rows
complete_summary <- bind_rows(Summary, group_summary)
What I've done works, but it is very verbose and can't be the most efficient way. I tried to use ungroup:
group_summary <- df %>% group_by(var2) %>%
  filter(var2 != "NA") %>%
  summarise("Max" = max(var1, na.rm = TRUE),
            "Median" = median(var1, na.rm = TRUE),
            "Min" = min(var1, na.rm = TRUE),
            "IQR" = IQR(var1, na.rm = TRUE),
            "Count" = n()) %>%
  ungroup %>%
  summarise("Max" = max(var1, na.rm = TRUE),
            "Median" = median(var1, na.rm = TRUE),
            "Min" = min(var1, na.rm = TRUE),
            "IQR" = IQR(var1, na.rm = TRUE),
            "Count" = n())
but it threw an error:
Evaluation error: object var1 not found.
Thanks in advance for your assistance.
Ideally, if you want to do it in one chain, this is how you can do it: use bind_rows to combine both results, just as you've done, but without the temporary objects.
library(tidyverse)
df <- data.frame(var1 = 10:18,
                 var2 = c("A","B","A","B","A","B","A","B","A"))
df %>% group_by(var2) %>%
  filter(var2 != "NA") %>%
  summarise("Max" = max(var1, na.rm = TRUE),
            "Median" = median(var1, na.rm = TRUE),
            "Min" = min(var1, na.rm = TRUE),
            "IQR" = IQR(var1, na.rm = TRUE),
            "Count" = n()) %>% #ungroup() %>%
  bind_rows(df %>% summarise("Max" = max(var1, na.rm = TRUE),
                             "Median" = median(var1, na.rm = TRUE),
                             "Min" = min(var1, na.rm = TRUE),
                             "IQR" = IQR(var1, na.rm = TRUE),
                             "Count" = n()))
#> # A tibble: 3 x 6
#> var2 Max Median Min IQR Count
#> <fct> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 A 18 14 10 4 5
#> 2 B 17 14 11 3 4
#> 3 <NA> 18 14 10 4 9
Created on 2019-01-29 by the reprex package (v0.2.1)
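A further refactor, sketched here rather than taken from the answer (summary_stats is a made-up helper name): writing the summary block once as a small function removes the repetition the question complains about, and summarise() picks up whatever grouping the input carries.
summary_stats <- function(d) {
  d %>%
    summarise(Max = max(var1, na.rm = TRUE),
              Median = median(var1, na.rm = TRUE),
              Min = min(var1, na.rm = TRUE),
              IQR = IQR(var1, na.rm = TRUE),
              Count = n())
}
df %>%
  filter(var2 != "NA") %>%
  group_by(var2) %>%
  summary_stats() %>%
  bind_rows(df %>% filter(var2 != "NA") %>% summary_stats())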
Not the most elegant solution either, but simple:
c <- mtcars %>%
  mutate(total_mean = mean(wt),
         total_median = median(wt)) %>%
  group_by(cyl) %>%
  summarise(meanweight = mean(wt),
            medianweight = median(wt),
            total_mean = first(total_mean),
            total_median = first(total_median))

Normalising data with dplyr mutate() brings inconsistencies

I'm trying to reproduce the framework from this blog post http://www.luishusier.com/2017/09/28/balance/ with the following code, but it looks like I get inconsistent results.
library(tidyverse)
library(magrittr)
ids <- c("1617", "1516", "1415", "1314", "1213", "1112",
         "1011", "0910", "0809", "0708", "0607", "0506")
data <- ids %>%
  map(function(i) {
    read_csv(paste0("http://www.football-data.co.uk/mmz4281/", i, "/F1.csv")) %>%
      select(Date:AST) %>%
      mutate(season = i)
  })
data <- bind_rows(data)
data <- data[complete.cases(data[, 1:3]), ]
tmp1 <- data %>%
  select(season, HomeTeam, FTHG:FTR, HS:AST) %>%
  rename(BP = FTHG,
         BC = FTAG,
         TP = HS,
         TC = AS,
         TCP = HST,
         TCC = AST,
         team = HomeTeam) %>%
  mutate(Pts = ifelse(FTR == "H", 3, ifelse(FTR == "A", 0, 1)),
         Terrain = "Domicile")
tmp2 <- data %>%
  select(season, AwayTeam, FTHG:FTR, HS:AST) %>%
  rename(BP = FTAG,
         BC = FTHG,
         TP = AS,
         TC = HS,
         TCP = AST,
         TCC = HST,
         team = AwayTeam) %>%
  mutate(Pts = ifelse(FTR == "A", 3, ifelse(FTR == "H", 0, 1)),
         Terrain = "Extérieur")
tmp3 <- bind_rows(tmp1, tmp2)
l1_0517 <- tmp3 %>%
  group_by(season, team) %>%
  summarise(j = n(),
            pts = sum(Pts),
            diff_but = sum(BP) - sum(BC),
            diff_t_ca = sum(TCP, na.rm = T) - sum(TCC, na.rm = T),
            diff_t = sum(TP, na.rm = T) - sum(TC, na.rm = T),
            but_p = sum(BP),
            but_c = sum(BC),
            tir_ca_p = sum(TCP, na.rm = T),
            tir_ca_c = sum(TCC, na.rm = T),
            tir_p = sum(TP, na.rm = T),
            tir_c = sum(TC, na.rm = T)) %>%
  arrange(season, desc(pts), desc(diff_but))
Then I apply the framework mentioned above:
l1_0517 <- l1_0517 %>%
  mutate(
    # First, see how many goals the team scores relative to the average
    norm_attack = but_p %>% divide_by(mean(but_p)) %>%
      # Then, transform it into an unconstrained scale
      log(),
    # First, see how many goals the team concedes relative to the average
    norm_defense = but_c %>% divide_by(mean(but_c)) %>%
      # Invert it, so a higher defense is better
      raise_to_power(-1) %>%
      # Then, transform it into an unconstrained scale
      log(),
    # Now that we have normalized attack and defense ratings, we can compute
    # measures of quality and attacking balance
    quality = norm_attack + norm_defense,
    balance = norm_attack - norm_defense
  ) %>%
  arrange(desc(norm_attack))
When I look at the column norm_attack, I expect to find the same value for equal but_p values, which is not the case here:
head(l1_0517, 10)
For instance, where but_p is 83 (rows 5 and 7), I get norm_attack values of 0.5612738 and 0.5128357 respectively.
Is this normal? I would expect mean(l1_0517$but_p) to be fixed, and therefore to obtain the same result whenever the same value of l1_0517$but_p is log-normalised.
UPDATE
I have tried to work on a simpler example but I can't reproduce this issue:
df <- tibble(a = as.integer(runif(200, 15, 100)))
df <- df %>%
  mutate(norm_a = a %>% divide_by(mean(a)) %>% log())
I found the solution after looking at the type of l1_0517: it is still a grouped_df, hence the different results. summarise() only drops the last level of grouping, so l1_0517 remains grouped by season, and mean(but_p) in the later mutate() call is computed within each season rather than over the whole table.
The correct code is:
l1_0517 <- tmp3 %>%
  group_by(season, team) %>%
  summarise(j = n(),
            pts = sum(Pts),
            diff_but = sum(BP) - sum(BC),
            diff_t_ca = sum(TCP, na.rm = T) - sum(TCC, na.rm = T),
            diff_t = sum(TP, na.rm = T) - sum(TC, na.rm = T),
            but_p = sum(BP),
            but_c = sum(BC),
            tir_ca_p = sum(TCP, na.rm = T),
            tir_ca_c = sum(TCC, na.rm = T),
            tir_p = sum(TP, na.rm = T),
            tir_c = sum(TC, na.rm = T)) %>%
  ungroup() %>%
  arrange(season, desc(pts), desc(diff_but))
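A minimal illustration of the effect with made-up data (not the football tables): on a grouped data frame, mean() inside mutate() is evaluated per group, so the normalisation changes once the grouping is dropped.
library(dplyr)
tb <- tibble(g = c("a", "a", "b", "b"), x = c(1, 3, 10, 30))
tb %>% group_by(g) %>% mutate(norm = x / mean(x))  # divides by the per-group means (2 and 20)
tb %>% ungroup() %>% mutate(norm = x / mean(x))    # divides by the single overall mean (11)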
