Optimizing a custom rollapplyr function in R

I have the following data:
library(data.table)
library(zoo)   # provides rollapplyr()

y <- data.table(cbind(week = rep(1:61, 5352),
                      ID = rep(1:5352, each = 61),
                      w = runif(326472),
                      v = runif(326472)))
y$v[sample(1:326472, 10000, replace = FALSE)] <- NA
for which I'm running the code below, which creates a rolling mean of variable v, ignoring outliers and NAs.
The code works, but performance is poor.
I'm sure there is a more efficient way to run it using apply or something similar, but I've been unsuccessful in creating a faster version. Can anyone shed some light on how to make it more efficient?
IDs <- unique(y$ID)
y$vol_m12 <- 0
for (i in seq_along(IDs)) {
  x <- y[ID == IDs[i]]
  outlier <- 0.2
  w_outlier <- quantile(x$w, outlier, na.rm = TRUE)
  v_outlier <- quantile(x$v, 1 - outlier, na.rm = TRUE)
  # Ignore outliers
  x$v_temp <- x$v
  x$v_temp[(x$v_temp >= v_outlier) & (x$w <= w_outlier)] <- NA
  # Create the rolling mean
  y$vol_m12[y$ID == IDs[i]] <- x[, rollapplyr(v_temp, 12, mean, fill = NA, na.rm = TRUE)]
}

Thanks for the replies.
Following 42's advice, I've produced the following code:
library(RcppRoll)

outlier <- 0.2
# Ignore outliers
y[, w_out := quantile(w, outlier, na.rm = TRUE), by = ID]
y[, v_out := quantile(v, 1 - outlier, na.rm = TRUE), by = ID]
y[(v <= v_out) & (w >= w_out), v_temp := v]
y[, w_out := NULL]
y[, v_out := NULL]
y[, v_m12 := roll_mean(as.matrix(v_temp), n = 12L, fill = NA,
                       align = "right", normalize = TRUE, na.rm = TRUE), by = ID]
System time is about 0.59 seconds, against 10.36 seconds for the rollapplyr-based solution below (though it is probably possible to make the outlier removal more efficient).
y[, v_m12 := rollapplyr(v_temp, 12, mean, fill = NA, na.rm = TRUE), by = ID]
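As a further idea for the outlier-removal step, here is a minimal, unbenchmarked sketch that assumes a recent data.table (one that already ships frollmean() and fifelse()); it computes both per-ID thresholds in a single grouped call, and the column names v_temp2 and v_m12b are my own, used purely for illustration:
outlier <- 0.2
# both per-ID thresholds in one grouped pass
y[, c("w_out", "v_out") := .(quantile(w, outlier, na.rm = TRUE),
                             quantile(v, 1 - outlier, na.rm = TRUE)), by = ID]
# keep only non-outlier values, then take the right-aligned rolling mean per ID
y[, v_temp2 := fifelse(v <= v_out & w >= w_out, v, NA_real_)]
y[, v_m12b := frollmean(v_temp2, 12, align = "right", na.rm = TRUE), by = ID]
y[, c("w_out", "v_out") := NULL]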


Simulating data with the purrr::map-family: truncated normal distribution for each row consumes the RAM

I am trying to assess the combined uncertainty related to different input parameters using a Markov Chain Monte Carlo method in R. In other words, using the uncertainty parameters reported in the input data documentation, I try to generate a distribution for each dataset by creating 1000 random values from the relevant distribution (normal or truncated normal).
However, I don't know how to do this faster with the purrr::map() functions and without exhausting the RAM. The real dataset has 2 million rows and 80 columns.
Here is a simplified example:
library(tidyverse); library(truncnorm); library(data.table); library(dtplyr)

n <- 1000      # number of simulations
n_obs <- 10000 # number of observations. Does not work if e.g. 50000

# Create a data.frame
dt <- data.frame(
  var1 = runif(n_obs, 0, 100),
  var2_low = runif(n_obs, 0, 1),
  var2_mean = runif(n_obs, 0, 5),
  var2_up = runif(n_obs, 0, 10)
)

# Convert to a lazy data table to speed things up
dt1 <- dt %>% as.data.table() %>%
  lazy_dt(., immutable = FALSE)

# Simulate
dt_sim <- dt1 %>%
  mutate(mean_val = rep(1, nrow(.)),  # just a column of 1s
         var1_rnorm = map(.x = mean_val, ~ rnorm(n, mean = .x, sd = 0.10)),  # normal distribution with the given sd
         sim_var1 = map2(.x = var1, .y = var1_rnorm, ~ (.x * .y))) %>%  # multiply the data with the simulated distribution
  # add a truncated normal distribution for each row (var2)
  mutate(sim_var2 = pmap(., ~ rtruncnorm(n,
                                         a = dt$var2_low,
                                         b = dt$var2_up,
                                         mean = dt$var2_mean))) %>%
  # multiply the simulated variables sim_var1 and sim_var2
  mutate(sim_vars_multiplied =
           pmap(list(x = .$sim_var1,
                     y = .$sim_var2),
                function(x, y) (x * y))) %>%
  # derive the coefficient of variation
  mutate(var_mean = map(.x = sim_vars_multiplied, ~ mean(.x, na.rm = TRUE)),
         var_sd = map(.x = sim_vars_multiplied, ~ sd(.x, na.rm = TRUE)),
         var_cv = unlist(var_sd) / unlist(var_mean)) %>%
  # select only the variables needed
  dplyr::select(var_cv)

# collect the results
sim_results <- dt_sim %>% as.data.table()
This may help:
library(data.table)
# n <- 1000 # number of simulations
n_obs <- 10000
dt <- data.table(
  var1 = runif(n_obs, 0, 100),
  var2_low = runif(n_obs, 0, 1),
  var2_mean = runif(n_obs, 0, 5),
  var2_up = runif(n_obs, 0, 10)
)
dt[, mean_val := rep(1, .N)]
dt[, var1_rnorm := rnorm(.N, mean = mean_val, sd = 0.10)]
dt[, sim_var1 := var1 * var1_rnorm]
dt[, sim_var2 := truncnorm::rtruncnorm(.N, a = var2_low, b = var2_up, mean = var2_mean)]
dt[, sim_vars_multiplied := sim_var1 * sim_var2]
dt[, var_mean := mean(sim_vars_multiplied, na.rm = TRUE)]
dt[, var_sd := sd(sim_vars_multiplied, na.rm = TRUE)]
dt[, var_cv := var_sd / var_mean]
sim_results <- dt[, var_cv]
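If the 1000-simulations-per-row dimension is actually needed (the data.table code above draws a single value per row), one hedged option is to hold the simulations in a plain n_obs x n matrix rather than in list columns; the object names below (sims, var_cv_sim) are my own, and for the real 2-million-row table this would still need to be run in chunks of rows:
n <- 1000
# n simulations per row, kept as matrices instead of nested list columns
sim_var1 <- dt$var1 * matrix(rnorm(n_obs * n, mean = 1, sd = 0.10), nrow = n_obs)
sim_var2 <- matrix(truncnorm::rtruncnorm(n_obs * n,
                                         a = dt$var2_low,
                                         b = dt$var2_up,
                                         mean = dt$var2_mean),
                   nrow = n_obs)
sims <- sim_var1 * sim_var2
# per-row coefficient of variation
dt[, var_cv_sim := apply(sims, 1, sd, na.rm = TRUE) / rowMeans(sims, na.rm = TRUE)]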

What would be the best way to improve calculation performance in a big data.table?

In a single data.table, I have many calculations to perform. They are simple, but they combine many configurations: creating X variables from Y others, making groups based on X different variables, and so on.
Step by step, I manage to perform all the calculations I need (with my knowledge of data.table), but my real challenge is PERFORMANCE. My data.table contains millions of rows, and the calculations are made on dozens and dozens of columns.
What I would like to know:
Is there a better way to write this code to improve performance?
Some of my options do not work (1.3 and 2.2, tagged KO): is the approach sound, and how should they be written?
My microbenchmark seems to show that the best option depends on the number of rows. Is that right?
Here is my code, with a reprex:
library(data.table)
library(stringr)
library(microbenchmark)

n.row <- 1e5

foo <- data.table(id = 101:(101 + n.row - 1),
                  crit = rep(c('fr', 'ca', 'al', 'se', 'is'), 5),
                  val_1 = round(runif(n.row, 0.5, 50), digits = 2),
                  val_2 = round(runif(n.row, 1, 20), digits = 0),
                  val_3 = round(runif(n.row, 1, 5), digits = 0),
                  eff = 28500,
                  num = sample(0:1, n.row, replace = TRUE),
                  num_2 = round(runif(n.row, 1, 10), digits = 1),
                  num_17 = round(runif(n.row, 1, 10), digits = 1),
                  num_69 = round(runif(n.row, 0, 1), digits = 2),
                  num_5 = round(runif(n.row, 10, 20), digits = 0),
                  cof = round(runif(n.row, 0.1, 2), digits = 5),
                  ToDo = rep(1, n.row),
                  grp_1 = sample(LETTERS[c(1, 3)], n.row, replace = TRUE))

foo[, c("grp_2", "grp_3") := {
  grp_2 = fcase(grp_1 %in% LETTERS[c(1)], sample(LETTERS[c(5, 8, 9)], n.row, replace = TRUE),
                grp_1 %in% LETTERS[c(3)], sample(LETTERS[c(14, 16)], n.row, replace = TRUE))
  grp_3 = fcase(grp_1 %in% LETTERS[c(1)], sample(LETTERS[c(20:23)], n.row, replace = TRUE),
                grp_1 %in% LETTERS[c(3)], sample(LETTERS[c(24:26)], n.row, replace = TRUE))
  list(grp_2, grp_3)
}]
# Calculate sd and qa
foo[, sd := (val_1 * cof)]
foo[num == 1, qa := (val_2 * cof)]
foo[num != 1, qa := (val_3 * cof)]

foo1 <- copy(foo)
foo2 <- copy(foo)
foo3 <- copy(foo)
# Calculate the qa_X columns
var.calc <- names(foo)[str_which(names(foo), "^num.\\d+$")]

# 1.1
for (j in var.calc) {
  foo1[, paste0("qa_", str_extract(j, "\\d+$")) := qa * get(j)]
}

# 1.2
setDT(foo2)[, paste0("qa_", str_extract(var.calc, "\\d+$")) := lapply(.SD, function(x) x * qa), .SDcols = var.calc]

# 1.3 KO
for (j in var.calc) { set(foo3, paste0("qa_", str_extract(j, "\\d+$")) := qa * get(j)) }

# comparison
mbm <- microbenchmark(
  Test.for = for (j in var.calc) { foo1[, paste0("qa_", str_extract(j, "\\d+$")) := qa * get(j)] },
  Test.set = setDT(foo2)[, paste0("qa_", str_extract(var.calc, "\\d+$")) := lapply(.SD, function(x) x * qa), .SDcols = var.calc],
  times = 10
)
mbm
# Calculations by groups
var.grp <- names(foo)[grepl("^grp.\\d+$", names(foo))]

# 2.1
for (j in var.grp) {
  foo1[, paste0("s.sd.", j) := sum(sd, na.rm = TRUE), by = get(j)]
  foo1[, paste0("s.qa.", j) := sum(qa, na.rm = TRUE), by = get(j)]
}

# 2.2 KO
setDT(foo2)[, paste0("s.sd.", var.grp) := lapply(.SD, function(x) sum(x)), .SDcols = var.calc, by = .SD]
Many thanks for any help or suggestions.
(If I need to split my request, I will.)
For question 1.3, I would use:
for (j in var.calc) set(foo3, j = paste0("qa_", str_extract(j, "\\d+$")), value = foo3$qa * foo3[[j]])
(fixed 1.3 example)
For question 2.1: it seems fine as written (for 2.2, see the sketch after the notes below).
Notes:
You don't need to keep calling setDT(foo2); foo2 is already a data.table.
Read the data.table documentation; there are lots of useful examples: https://rdatatable.gitlab.io/data.table/
Don't rely too much on microbenchmarks; try the code on your real data and time that, because the results will be different there and the overhead that some data.table functions have will become insignificant.
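For 2.2, note that a single := call can only use one by grouping, so a separate grouped assignment per grouping column is still needed. A hedged sketch of what it could look like (column names follow the 2.1 convention; applied to foo2 here purely for illustration), computing both sums in one grouped pass per grouping variable:
for (j in var.grp) {
  foo2[, paste0(c("s.sd.", "s.qa."), j) := .(sum(sd, na.rm = TRUE),
                                             sum(qa, na.rm = TRUE)),
       by = c(j)]
}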

rowmean and standard deviation using data.table

x <- matrix(rnorm(500 * 10), nrow = 500, ncol = 10)
x[, 1] <- 1:500
x <- data.frame(x)
names(x) <- c('ID', 2000:2008)
library(data.table)
setDT(x)
I want to calculate the mean, sd, and number of data points per row, but I am getting an error:
x[, c("meanY",'sdY',"nY") := .(rowMeans(.SD, na.rm = TRUE), sd(.SD, na.rm = TRUE), rowSums(!is.na(.SD))), .SDcols=c(2:10)]
The issue lies in sd(), which doesn't work row-wise. Use apply() over rows instead:
x[,
c("meanY",'sdY',"nY") :=
.(rowMeans(.SD, na.rm = TRUE),
apply(.SD, 1, sd, na.rm = TRUE),
rowSums(!is.na(.SD))),
.SDcols = 2:10]
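If apply() per row turns out to be slow on a large table, a hedged alternative, assuming you are willing to add the matrixStats package, computes the row standard deviation in a vectorized way:
library(matrixStats)
# rowSds() works on a matrix, so .SD is converted first
x[, c("meanY", "sdY", "nY") := .(rowMeans(.SD, na.rm = TRUE),
                                 rowSds(as.matrix(.SD), na.rm = TRUE),
                                 rowSums(!is.na(.SD))),
  .SDcols = 2:10]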
If you want the output as a list instead, you can try the following code:
op <- c("mean","sd","length")
r <- lapply(op, function(v) apply(x, 1, eval(parse(text = v))))
names(r) <- op
It should work with your data.frame example:
x <- matrix(rnorm(500 * 10), nrow = 500, ncol = 10)
x[, 1] <- 1:500
x <- data.frame(x)
names(x) <- c('ID', 2000:2008)
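A small hedged variation on the same idea: match.fun() looks the functions up by name and avoids eval(parse(text = ...)):
op <- c("mean", "sd", "length")
# apply each named function over the rows of the data.frame
r <- lapply(op, function(v) apply(x, 1, match.fun(v)))
names(r) <- op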

R data.table Performing a by aggregation on a joined column

I want to do an aggregation on the result of a join (x and i are keyed data.tables) without materializing the whole result. It is similar to what .EACHI tries to resolve, but for columns that are added by the join.
D1 <- data.table(x = rnorm(1:100), i = sample(1:100, 100, replace = TRUE), k = sample(1:100, 100, replace = TRUE))
D2 <- data.table(x = rnorm(1:100), j = sample(1:100, 100, replace = TRUE), k = sample(1:100, 100, replace = TRUE))
setkey(D1, k)
setkey(D2, k)
I would like to sum x over the result of the join of D1 and D2, for the pairs of i and j, without materializing the result of D1[D2].
D1[D2, list(x = sum(x * i.x)), by = list(i, j), allow.cartesian = TRUE] ## Fails
D1[D2, allow.cartesian = TRUE][, list(x = sum(x * i.x)), by = list(i, j)] ## Is the result I want but uses up more memory
Is there a way to do this? If not, will it be implemented in a future version?
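As a partial workaround (my own sketch, not a built-in data.table feature): the join's j can materialize only the three columns the aggregation needs, which is still Cartesian but keeps the intermediate result much smaller than the full D1[D2]:
# keep only i, j and the product in the joined result, then aggregate
res <- D1[D2, .(i = x.i, j = i.j, xx = x.x * i.x), allow.cartesian = TRUE][
  , .(x = sum(xx)), by = .(i, j)]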

R: data.table cross-join not working

I have two data.tables that I want to join (i.e., form the Cartesian product of them). One of the data.tables is keyed on a Date vector, and the other on a numeric vector:
# data.table with dates (as numeric)
dtDates2 = data.table(date = as.numeric(seq(from = as.Date('2014/01/01'),
                                            to = as.Date('2014/07/01'), by = 'weeks')),
                      data1 = rnorm(26))
# data.table with dates
dtDates1 = data.table(date = seq(from = as.Date('2014/01/01'),
                                 to = as.Date('2014/07/01'), by = 'weeks'),
                      data1 = rnorm(26))
# data.table with customer IDs
dtCustomers = data.table(customerID = seq(1, 100),
                         data2 = rnorm(100))
I setkey and try to cross-join them using CJ:
# cross join the two datatables
setkey(dtCustomers, customerID)
setkey(dtDates1, date)
setkey(dtDates2, date)
CJ(dtCustomers, dtDates1)
CJ(dtCustomers, dtDates2)
but get the following error:
Error in FUN(X[[1L]], ...) :
Invalid column: it has dimensions. Can't format it. If it's the result of data.table(table()), use as.data.table(table()) instead.
Not sure what I am doing wrong.
There is no cross-join functionality for whole data.tables available in data.table out of the box; CJ() builds its cross product from vectors, not from data.tables, which is why the call above errors.
However, there is a CJ.dt function (a CJ-like helper designed for data.tables) that performs the Cartesian product (cross join); it is available in the optiRum package on CRAN.
You can create the function:
CJ.dt = function(X, Y) {
  stopifnot(is.data.table(X), is.data.table(Y))
  # add a constant key column to both tables, join on it, then drop it
  k = NULL
  X = X[, c(k = 1, .SD)]
  setkey(X, k)
  Y = Y[, c(k = 1, .SD)]
  setkey(Y, NULL)
  X[Y, allow.cartesian = TRUE][, k := NULL][]
}
CJ.dt(dtCustomers, dtDates1)
CJ.dt(dtCustomers, dtDates2)
There is also a feature request for a convenient way to perform a cross join, filed as data.table#1717, so you could check there whether a nicer API for cross joins has become available.
Thank you jangorecki for the very useful function. I had to add support for empty X and/or Y:
CJ.dt = function(X, Y) {
  stopifnot(is.data.table(X), is.data.table(Y))
  if (nrow(X) > 0 & nrow(Y) > 0) {
    # usual case: constant-key join, as above
    k = NULL
    X = X[, c(k = 1, .SD)]
    setkey(X, k)
    Y = Y[, c(k = 1, .SD)]
    setkey(Y, NULL)
    return(X[Y, allow.cartesian = TRUE][, k := NULL][])
  } else {
    # at least one table is empty: return a zero-row table with the combined columns
    duplicatedNames <- names(Y)[names(Y) %in% names(X)]
    if (length(duplicatedNames) > 0) {
      setnames(Y, duplicatedNames, paste0("i.", duplicatedNames))
    }
    setkey(Y)
    setkey(X)
    return(cbind(X[!X], Y[!Y]))
  }
}
# X <- data.table(a = c(1, 2))
# Y <- data.table(a = c(2, 3), b = c(4, 5))
#
# CJ.dt(X, Y)
# CJ.dt(X[a > 2], Y)
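If you would rather avoid a helper function altogether, a hedged alternative for the non-empty case is to replicate row indices and cbind the two tables; this gives the same Cartesian product, assuming the tables have no overlapping column names (as with dtCustomers and dtDates1 here):
# every customer repeated for every date, every date repeated for every customer
cross <- cbind(dtCustomers[rep(seq_len(nrow(dtCustomers)), times = nrow(dtDates1))],
               dtDates1[rep(seq_len(nrow(dtDates1)), each = nrow(dtCustomers))])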
