Modify moving average calc from looping to a vector - R

I'm working on a moving average calculator that works as intended, but it takes a long time to run because it currently loops through the formula instead of being vectorized. The data set is about 16,000 rows. The MA formula filters out the top quartile of values appearing in the previous 45 days of usd_price. Any tips/changes to get this running more efficiently as a vectorized calculation?
The dput output is:
> dput(data)
structure(list(loan_price = c(50000, 60000, 40000, 35000, 1e+05,
95000), cad_price = c(62500, 75000, 50000, 43750, 125000, 118750
), day = structure(c(1642118400, 1641772800, 1639958400, 1639785600,
1638316800, 1640995200), tzone = "UTC", class = c("POSIXct", "POSIXt")), fourtyfive_avg = c(251435.529507523, 251435.529507523,
251435.529507523, 251435.529507523, 251435.529507523, 251435.529507523
), Loan = c(TRUE, TRUE, TRUE, TRUE, TRUE, FALSE)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
library(readxl)
library(tidyverse)
library(zoo)
library(dplyr)
data <- data %>% mutate(day = lubridate::parse_date_time(day, "ymd"))

myfunc <- function(x){
  fourtyfive_days <- as.Date(x - ddays(45))
  data <-
    data %>%
    filter(day <= x) %>%
    filter(day >= fourtyfive_days) %>%
    filter(loan_price <= quantile(loan_price, probs = 0.75)) %>%
    summarize(fourtyfive_avg = mean(loan_price))
  return(data$fourtyfive_avg)
}

data$fourtyfive_avg <- sapply(data$day, simplify = TRUE, FUN = myfunc)

Functions in the tidyverse are notoriously slow, so moving away from tidy functions will improve the speed significantly. You could also try the data.table package, but 16,000 rows isn't a lot of data, so I'm not sure how necessary that is. I typically only reach for data.table when my data has millions of rows. You also have to account for the time it takes to build the index, so I've included that in the benchmark below.
The functions we'll be comparing:
library(data.table)
library(tidyverse)
library(lubridate)
library(microbenchmark)
## dplyr
myfunc <- function(x){
  fourtyfive_days <- as.Date(x - ddays(45))
  data <-
    data %>%
    filter(day <= x) %>%
    filter(day >= fourtyfive_days) %>%
    filter(loan_price <= quantile(loan_price, probs = 0.75)) %>%
    summarize(fourtyfive_avg = mean(loan_price))
  return(data$fourtyfive_avg)
}
# data$fourtyfive_avg <- sapply(data$day, simplify = TRUE, FUN = myfunc)
## base
mybasefunc <- function(x){
  fortyfive_days <- as.Date(x - ddays(45))
  data <- data[data$day >= fortyfive_days & data$day <= x, ]
  q75 <- quantile(data$loan_price, probs = 0.75)
  data <- data[data$loan_price <= q75, ]
  fortyfive_avg <- mean(data$loan_price)
  return(fortyfive_avg)
}
# data$fortyfive_avg <- sapply(data$day, simplify = TRUE, FUN = mybasefunc)
## data.table
# dat <- data.table(data, key = c("day", "loan_price"))
mydtfunc <- function(x){
  fortyfive_days <- as.Date(x - ddays(45))
  dat <- dat[day >= fortyfive_days & day <= x]
  q75 <- quantile(dat$loan_price, probs = 0.75)
  dat <- dat[dat$loan_price <= q75]
  fortyfive_avg <- mean(dat$loan_price)
  return(fortyfive_avg)
}
# dat[ , fortyfive_avg := sapply(day, mydtfunc), ]
And benchmarking:
set.seed(1)
m <-
  microbenchmark(
    "dplyr" = {data$fourtyfive_avg <- sapply(data$day, simplify = TRUE, FUN = myfunc)},
    "base"  = {data$fortyfive_avg <- sapply(data$day, simplify = TRUE, FUN = mybasefunc)},
    "dt"    = {dat <- data.table(data, key = c("day", "loan_price")); dat[ , fortyfive_avg := sapply(day, mydtfunc), ]}
  )
m
#> Unit: milliseconds
#>   expr     min       lq      mean   median       uq     max neval cld
#>  dplyr 29.6519 30.71400 32.594319 32.42595 33.68590 44.0838   100   c
#>   base  4.2316  4.37210  4.631541  4.42665  4.58615 12.3656   100   a
#>     dt  5.6883  5.83755  6.254143  5.97100  6.11905 15.6615   100   b
autoplot(m)
The benchmark seems pretty conclusive: you can see a significant improvement by moving away from dplyr to either base or data.table. It's worth noting I don't use data.table often, so there may be a more efficient way to accomplish what I've done, but it's still much faster than dplyr.
Created on 2022-01-31 by the reprex package (v2.0.1)
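If you want something closer to a true window calculation rather than an sapply() loop, a rolling-window helper such as slider::slide_index_dbl() can express the 45-day trailing, quartile-filtered mean directly. This is a minimal sketch, not part of the answer above, and it assumes the data are sorted by day first:
library(slider)
library(lubridate)
library(dplyr)

data <- arrange(data, day)  # slide_index_*() requires an ascending index

data$fourtyfive_avg <- slide_index_dbl(
  .x = data$loan_price,
  .i = data$day,
  .f = function(p) mean(p[p <= quantile(p, probs = 0.75)]),  # drop top quartile, then average
  .before = days(45)   # trailing 45-day window, including the current day
)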

Related

R reshape() extremely slow

I need to perform a simple reshape of data from long to wide, and it needs to work in base R. For this use case, reshape() seems to be extraordinarily slow (despite assertions that it is very fast: https://stackoverflow.com/a/12073077/3017280). This example is a reasonable approximation of my data. I know that in this example I do not need both Index columns, but I do in the real data. On my laptop, 10,000 rows take 3 seconds and 40,000 rows take over 200 seconds. The real data has over one million rows, so reshape() is obviously a non-starter. Can anyone shed any light on why it takes so long in this case? I worked around the problem using split / lapply / Reduce + merge, which is clumsy but very much quicker.
n <- 5000
dfLong <- data.frame(Index1 = rep(sample(1E6:2E6, n), 4),
                     Index2 = rep(sample(3E6:4E6, n), 4),
                     Key    = rep(1:4, each = n),
                     Date   = sample(seq.Date(as.Date("2020-01-01"),
                                              as.Date("2021-12-31"),
                                              by = "1 day"),
                                     size = n * 4, replace = TRUE),
                     Score  = sample(0:48, n * 4, replace = TRUE))

system.time(dfWide <- reshape(data = dfLong,
                              v.names = c("Date", "Score"),
                              timevar = "Key",
                              idvar = c("Index1", "Index2"),
                              sep = "_Q",
                              direction = "wide"))
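For reference, a minimal sketch of the split / lapply / Reduce + merge workaround mentioned above might look like the following (the helper name and argument defaults are illustrative, not from the original post):
# Hypothetical helper illustrating the split / lapply / Reduce + merge workaround
wide_by_merge <- function(df, idvar = c("Index1", "Index2"), timevar = "Key",
                          v.names = c("Date", "Score"), sep = "_Q") {
  pieces <- lapply(split(df, df[[timevar]]), function(d) {
    key <- d[[timevar]][1]
    # suffix the value columns with the key, e.g. Date_Q1, Score_Q1
    names(d)[match(v.names, names(d))] <- paste0(v.names, sep, key)
    d[, setdiff(names(d), timevar)]
  })
  Reduce(function(x, y) merge(x, y, by = idvar), pieces)
}
dfWideMerge <- wide_by_merge(dfLong)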
If you look at what functions reshape calls with the profvis package, you can see that almost all of the total time is spent on this one line inside the function. The interaction() function is used only to combine your two id columns into a single column.
data[, tempidname] <- interaction(data[, idvar],
                                  drop = TRUE)
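As a side note, this is roughly how that profiling can be run (a sketch; profvis is not loaded anywhere else in the post):
library(profvis)
profvis(
  reshape(data = dfLong, v.names = c("Date", "Score"), timevar = "Key",
          idvar = c("Index1", "Index2"), sep = "_Q", direction = "wide")
)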
Rather than interaction, you could use do.call(paste0, data[, idvar]). You can wrap reshape() in a function that creates an environment in which interaction is bound to this faster replacement.
new_reshape <- function(...){
  interaction <- function(x, drop) do.call(paste0, x)
  environment(reshape) <- environment()
  reshape(...)
}
Now it's much faster:
system.time(dfWide <- reshape(data = dfLong,
                              v.names = c("Date", "Score"),
                              timevar = "Key",
                              idvar = c("Index1", "Index2"),
                              sep = "_Q",
                              direction = "wide"))
#    user  system elapsed
#  35.292   0.538  36.236
system.time(new_dfWide <- new_reshape(data = dfLong,
                                      v.names = c("Date", "Score"),
                                      timevar = "Key",
                                      idvar = c("Index1", "Index2"),
                                      sep = "_Q",
                                      direction = "wide"))
#    user  system elapsed
#   0.071   0.009   0.081
all.equal(new_dfWide, dfWide)
# [1] TRUE
You can go even faster than that by using plyr:::ninteraction. Its only non-base dependency is plyr:::id_var, which itself has no dependencies, so if you can't install packages you can just copy-paste these function definitions fairly easily (adding a comment giving credit).
new_reshape <- function(...){
  # interaction = plyr:::ninteraction
  # id_var = plyr:::id_var
  interaction <- function(.variables, drop = FALSE) {
    lengths <- vapply(.variables, length, integer(1))
    .variables <- .variables[lengths != 0]
    if (length(.variables) == 0) {
      n <- nrow(.variables) %||% 0L
      return(structure(seq_len(n), n = n))
    }
    if (length(.variables) == 1) {
      return(id_var(.variables[[1]], drop = drop))
    }
    ids <- rev(lapply(.variables, id_var, drop = drop))
    p <- length(ids)
    ndistinct <- vapply(ids, attr, "n", FUN.VALUE = numeric(1),
                        USE.NAMES = FALSE)
    n <- prod(ndistinct)
    if (n > 2^31) {
      char_id <- do.call("paste", c(ids, sep = "\r"))
      res <- match(char_id, unique(char_id))
    }
    else {
      combs <- c(1, cumprod(ndistinct[-p]))
      mat <- do.call("cbind", ids)
      res <- c((mat - 1L) %*% combs + 1L)
    }
    attr(res, "n") <- n
    if (drop) {
      id_var(res, drop = TRUE)
    }
    else {
      structure(as.integer(res), n = attr(res, "n"))
    }
  }
  id_var <- function(x, drop = FALSE) {
    if (length(x) == 0)
      return(structure(integer(), n = 0L))
    if (!is.null(attr(x, "n")) && !drop)
      return(x)
    if (is.factor(x) && !drop) {
      x <- addNA(x, ifany = TRUE)
      id <- as.integer(x)
      n <- length(levels(x))
    }
    else {
      levels <- sort(unique(x), na.last = TRUE)
      id <- match(x, levels)
      n <- max(id)
    }
    structure(id, n = n)
  }
  environment(reshape) <- environment()
  reshape(...)
}
system.time(new_dfWide <- new_reshape(data = dfLong,
                                      v.names = c("Date", "Score"),
                                      timevar = "Key",
                                      idvar = c("Index1", "Index2"),
                                      sep = "_Q",
                                      direction = "wide"))
#    user  system elapsed
#   0.015   0.000   0.015
I don't know that I've ever made the claim that stats::reshape is the fastest.
For comparison, stats::reshape is not as fast on my i9/64GB-RAM system:
system.time(
  dfWide <- reshape(data = dfLong,
                    v.names = c("Date", "Score"),
                    timevar = "Key",
                    idvar = c("Index1", "Index2"),
                    sep = "_Q",
                    direction = "wide")
)
#   user  system elapsed
#  19.63    0.03   19.73
But other reshaping functions do much better:
system.time(
  tidyrWide <- pivot_wider(
    dfLong, c("Index1", "Index2"),
    names_prefix = "Q", names_from = "Key",
    values_from = c("Date", "Score"))
)
#   user  system elapsed
#   0.01    0.00    0.02
nms <- names(dfWide)
tidyrWide <- subset(tidyrWide, select = nms)   # column order
dfOrder <- do.call(order, dfWide)
tidyrOrder <- do.call(order, tidyrWide)
all.equal(dfWide[dfOrder,], as.data.frame(tidyrWide)[tidyrOrder,], check.attributes = FALSE)
# [1] TRUE
Similarly, data.table::dcast is equally fast:
dtLong <- as.data.table(dfLong)
system.time(
  dtWide <- data.table::dcast(
    Index1 + Index2 ~ paste0("Q", Key),
    data = dtLong, value.var = c("Date", "Score"))
)
#   user  system elapsed
#   0.00    0.01    0.02
dtWide <- subset(dtWide, select = nms)   # column order
dtOrder <- do.call(order, dtWide)
all.equal(dfWide[dfOrder, nms], as.data.frame(dtWide)[dtOrder, nms], check.attributes = FALSE)
# [1] TRUE
Consider an advanced, modified version of @Moody_Mudskipper's matrix_spread, using base R. Since matrix() will simplify complex types like Date, some ad hoc changes are required:
Function
matrix_spread <- function(df1, id, key, value, sep){
  unique_ids <- unique(df1[[key]])
  mats <- lapply(df1[value], function(x)
    matrix(x, ncol = length(unique_ids), byrow = FALSE)
  )
  df2 <- do.call(
    data.frame, list(unique(df1[id]), mats)
  )
  # RENAME COLS
  names(df2)[(length(id)+1):ncol(df2)] <- as.vector(
    sapply(value, function(x, y) paste0(x, sep, y), unique_ids)
  )
  # REORDER COLS
  df2 <- df2[c(id, as.vector(
    outer(c(value), unique_ids, function(x, y) paste0(x, sep, y))
  ))]
  return(df2)
}
Application
system.time(
  dfWide2 <- matrix_spread(
    df1 = dfLong,
    id = c("Index1", "Index2"),
    key = "Key",
    value = c("Date", "Score"),
    sep = "_Q"
  )
)
#   user  system elapsed
#  0.022   0.000   0.023

# CONVERT INTEGERS TO DATES
dfWide2[grep("Date", names(dfWide2))] <- lapply(
  dfWide2[grep("Date", names(dfWide2))],
  as.Date,
  origin = "1970-01-01"
)

# REPLICATES OP'S reshape
identical(data.frame(dfWide), dfWide2)
# [1] TRUE

Apply statistical test to many variables: improve speed

I have a dataframe with 40 rows and ~40000 columns. The 40 rows are split into group "A" and group "B" (20 each). For each column, I would like to apply a statistical test (wilcox.test()) comparing the two groups. I started using a for loop to run through the 40000 columns but it was very slow.
Minimal Reproducible Example (MRE):
library(tidyverse)
set.seed(123)
metrics <- paste("metric_", 1:40000, sep = "")
patient_IDs <- paste("patientID_", 1:40, sep = "")
m <- matrix(sample(1:20, 1600000, replace = TRUE), ncol = 40000, nrow = 40,
            dimnames = list(patient_IDs, metrics))
test_data <- as.data.frame(m)
test_data$group <- c(rep("A", 20), rep("B", 20))
# Collate list of metrics to analyse ("check") for significance
list_to_check <- colnames(test_data)[1:40000]
Original 'loop' method (this is what I want to vectorise):
# Create a variable to store the results
results_A_vs_B <- c()
# Loop through the "list_to_check" and,
# for each 'value', compare group A with group B
# and load the results into the "results_A_vs_B" variable
for (i in list_to_check) {
  outcome <- wilcox.test(test_data[test_data$group == "A", ][[i]],
                         test_data[test_data$group == "B", ][[i]],
                         exact = FALSE)
  if (!is.nan(outcome$p.value) && outcome$p.value <= 0.05) {
    results_A_vs_B[i] <- paste(outcome$p.value, sep = "\t")
  }
}
# Format the results into a dataframe
summarised_results_A_vs_B <- as.data.frame(results_A_vs_B) %>%
  rownames_to_column(var = "A vs B") %>%
  rename("Wilcox Test P-value" = "results_A_vs_B")
Benchmarking the answers so far:
# Ronak Shah's "Map" approach
Map_func <- function(dataset, list_to_check) {
  tmp <- split(dataset[list_to_check], dataset$group)
  stack(Map(function(x, y) wilcox.test(x, y, exact = FALSE)$p.value, tmp[[1]], tmp[[2]]))
}
# @Onyambu's data.table method
dt_func <- function(dataset, list_to_check) {
  melt(setDT(dataset), measure.vars = list_to_check)[, dcast(.SD, rowid(group) + variable ~ group)][, wilcox.test(A, B, exact = FALSE)$p.value, variable]
}
# @Park's dplyr method (with some minor tweaks)
dplyr_func <- function(dataset, list_to_check){
  dataset %>%
    summarise(across(all_of(list_to_check),
                     ~ wilcox.test(.x ~ group, exact = FALSE)$p.value)) %>%
    pivot_longer(cols = everything(),
                 names_to = "Metrics",
                 values_to = "Wilcox Test P-value")
}
library(microbenchmark)
res_map <- microbenchmark(Map_func(test_data, list_to_check), times = 10)
res_dplyr <- microbenchmark(dplyr_func(test_data, list_to_check), times = 2)
library(data.table)
res_dt <- microbenchmark(dt_func(test_data, list_to_check), times = 10)
autoplot(rbind(res_map, res_dt, res_dplyr))
# Excluding dplyr
autoplot(rbind(res_map, res_dt))
Running the code on a server took a couple of seconds longer but the difference between Map and data.table was more pronounced (laptop = 4 cores, server = 8 cores):
autoplot(rbind(res_map, res_dt))
Here is another option -
Map_approach <- function(dataset, list_to_check) {
  tmp <- split(dataset[list_to_check], dataset$group)
  stack(Map(function(x, y) wilcox.test(x, y)$p.value, tmp[[1]], tmp[[2]]))
}
Map_approach(data_subset, list_to_check)
#         values     ind
# 1 5.359791e-05 value_1
# 2 5.499685e-08 value_2
# 3 1.503951e-06 value_3
# 4 6.179352e-08 value_4
# 5 5.885650e-08 value_5
Testing it on a larger sample, Map is slightly faster than the for loop.
n <- 1e6
data_subset <- data.frame(patient_ID = 1:n,
                          group = c(rep("A", n/2), rep("B", n/2)),
                          value_1 = c(sample(1:10, n/2, replace = TRUE),
                                      sample(5:15, n/2, replace = TRUE)),
                          value_2 = c(sample(1:5, n/2, replace = TRUE),
                                      sample(15:n/2, n/2, replace = TRUE)),
                          value_3 = c(sample(1:12, n/2, replace = TRUE),
                                      sample(8:17, n/2, replace = TRUE)),
                          value_4 = c(sample(5:10, n/2, replace = TRUE),
                                      sample(15:25, n/2, replace = TRUE)),
                          value_5 = c(sample(20:40, n/2, replace = TRUE),
                                      sample(10:15, n/2, replace = TRUE)))
microbenchmark::microbenchmark(loop = wilcox_loop(data_subset, list_to_check),
                               Map  = Map_approach(data_subset, list_to_check))
# Unit: seconds
#  expr      min       lq     mean   median       uq      max neval cld
#  loop 5.254573 5.631162 5.788624 5.734480 5.920424 6.756319   100   b
#   Map 4.710790 5.084783 5.201711 5.160722 5.309048 5.721540   100   a
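Note that wilcox_loop() above is not defined anywhere in the post; presumably it is the question's for loop wrapped in a function, along the lines of this sketch:
# Hypothetical reconstruction of wilcox_loop(): the question's loop as a function
wilcox_loop <- function(dataset, list_to_check) {
  results <- c()
  for (i in list_to_check) {
    outcome <- wilcox.test(dataset[dataset$group == "A", ][[i]],
                           dataset[dataset$group == "B", ][[i]],
                           exact = FALSE)
    results[i] <- outcome$p.value
  }
  results
}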
Could you try this code? It's slightly faster on my computer.
wilcox_loop2 <- function(data_subset, list_to_check){
  A <- data_subset[data_subset$group == "A", ]
  B <- data_subset[data_subset$group == "B", ]
  outcome <- sapply(list_to_check, function(x) wilcox.test(A[[x]],
                                                           B[[x]],
                                                           exact = FALSE)$p.value)
  as.data.frame(outcome) %>%
    rownames_to_column(var = "A vs B") %>%
    rename("Wilcox Test P-value" = "outcome")
}
I'm not sure it's OK to split the data into A and B...
My timings look like this:
microbenchmark::microbenchmark(origin = wilcox_loop(data_subset, list_to_check),
                               test   = wilcox_loop2(data_subset, list_to_check))
Unit: milliseconds
   expr      min       lq     mean   median       uq     max neval cld
 origin 4.815601 5.006951 6.490757 5.385502 6.790752 21.5876   100   b
   test 3.817801 4.116151 5.146963 4.330500 4.870651 15.8271   100   a

Using purrr to help transform a large data file

I have a bit of code that goes through a number of columns containing dates and selects the earliest date from the options to populate a new column. To do this I was using the dplyr::rowwise function.
Unfortunately, the data set is quite big, so obtaining an output comes at a considerable time cost. Here is an example of my initial approach.
library(tidyverse)
library(lubridate)
set.seed(101)
data <- tibble(date1 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
                              100, replace = TRUE),
               date2 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
                              100, replace = TRUE),
               date3 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
                              100, replace = TRUE),
               date4 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
                              100, replace = TRUE),
               date5 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
                              100, replace = TRUE))
So for the first attempt I opted for rowwise. I hadn't used this before, but the output is identified as 'rowwise_df', which I take to be similar to what I would get if I had used group_by.
data <- data %>%
  rowwise() %>%
  mutate(earlierst_date = min(c(date1, date2, date3, date4, date5),
                              na.rm = TRUE))
Having looked around, it would appear that rowwise is not considered the best approach (see excellent back and forth here). Reading through, I attempted the following...
data <- data %>%
  mutate(try_again = pmap(list(date1, date2, date3, date4, date5),
                          min, na.rm = TRUE)) %>%
  mutate(try_again = as_date(try_again))

table(data$earlierst_date == data$try_again)
#>
#> TRUE
#>  100
According to my reprex run, the second option is twice as fast.
start.time <- Sys.time()
data <- data %>%
  rowwise() %>%
  mutate(earlierst_date = min(c(date1, date2, date3, date4, date5),
                              na.rm = TRUE))
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
#> Time difference of 0.07597804 secs

start.time <- Sys.time()
data <- data %>%
  mutate(try_again = pmap(list(date1, date2, date3, date4, date5),
                          min, na.rm = TRUE)) %>%
  mutate(try_again = as_date(try_again))
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
#> Time difference of 0.03266287 secs
My questions:
1. Is the second strategy using pmap fit for purpose, or is there some inherent error present that I can't see? For example, in earlier attempts the output column contained list values rather than vectors, which threw me.
I get dizzy anytime I have to work with dates, especially when I read comments such as "A date is a day stored as the number of days since 1970-01-01"...
2. Do the code run times make sense?
Any improvements/direction gratefully received.
I agree with @det that rowwise isn't the way to go. I think the pmin function might be best suited to the task, e.g.
data <- transform(data, earliest_date = pmin(date1, date2, date3, date4, date5, na.rm = TRUE))
Benchmarking (updated to include a data.table solution):
library(tidyverse)
library(lubridate)
set.seed(101)
data <- tibble(date1 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
                              100, replace = TRUE),
               date2 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
                              100, replace = TRUE),
               date3 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
                              100, replace = TRUE),
               date4 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
                              100, replace = TRUE),
               date5 = sample(seq(ymd('2021-03-20'), ymd('2021-05-20'), by = 'day'),
                              100, replace = TRUE))
rowwise_func <- function(data){
  data %>%
    rowwise() %>%
    mutate(earliest_date = min(c(date1, date2, date3, date4, date5),
                               na.rm = TRUE)) %>%
    ungroup()
}
pmap_func <- function(data){
  data %>%
    mutate(try_again = pmap(list(date1, date2, date3, date4, date5),
                            min, na.rm = TRUE))
}
det_func1 <- function(data){
  data %>%
    mutate(min_date = pmap_dbl(select(., matches("^date")), min) %>% as.Date(origin = "1970-01-01"))
}
det_faster <- function(data){
  data[["min_date"]] <- data %>%
    mutate(across(where(is.Date), as.integer)) %>%
    as.matrix() %>%
    apply(1, function(x) x[which.min(x)]) %>%
    as.Date(origin = "1970-01-01")
}
transform_func <- function(data){
  as_tibble(transform(data, earliest_date = pmin(date1, date2, date3, date4, date5, na.rm = TRUE)))
}
dt_func <- function(data){
  setDT(data)
  data[, earliest_date := pmin(date1, date2, date3, date4, date5, na.rm = TRUE)]
}
times <- microbenchmark::microbenchmark(rowwise_func(data), pmap_func(data),
                                        det_func1(data), det_faster(data),
                                        transform_func(data), dt_func(data))
autoplot(times)
data2 <- transform_func(data)
data3 <- rowwise_func(data)
identical(data2, data3)
#> TRUE
Unit: microseconds
                 expr      min        lq      mean    median        uq        max neval cld
   rowwise_func(data) 6764.693 6919.6720 7375.0418 7066.6220 7271.5850  16290.696   100  ab
      pmap_func(data) 3994.973 4150.1360 9425.3880 4252.9850 4437.2950 491030.248   100   b
      det_func1(data) 5576.240 5724.6820 6249.7573 5845.3305 5985.5940  15106.741   100  ab
     det_faster(data) 3182.016 3305.3525 3556.8628 3362.8720 3444.0505  12771.952   100  ab
 transform_func(data)  564.194  624.1055  697.5630  680.1130  718.7975   1513.184   100   a
        dt_func(data)  650.611  723.7235  956.7916  759.3355  782.0565  10806.902   100   a
So, based on the functions I used above, the transform + pmin method was ~ 10X faster than the rowwise method.
From my experience, rowwise is extremely slow, so I prefer any other option (at the cost of less tidy code), especially if I have numeric columns (then I convert to a matrix). pmap is definitely an option, but sometimes I have trouble listing all the needed columns (it doesn't have a tidyselect option). This can be somewhat avoided by using select within pmap:
data <- data %>%
  mutate(min_date = pmap_dbl(select(., matches("^date")), min) %>% as.Date(origin = "1970-01-01"))
Converting to a matrix was usually the fastest way (much faster) for my problems, in combination with a function like apply or sweep:
data[["min_date"]] <- data %>%
mutate(across(where(is.Date), as.integer)) %>%
as.matrix() %>%
apply(1, function(x) x[which.min(x)]) %>%
as.Date(origin = "1970-01-01")
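One way to get the speed of pmin() without typing every column name (not shown in the answers above, so treat it as a sketch) is to collect the date columns by pattern and splice them into pmin() with do.call:
# Sketch: pmin() across all columns matching "^date", without listing them by hand
date_cols <- grep("^date", names(data), value = TRUE)
data$earliest_date <- do.call(pmin, c(data[date_cols], na.rm = TRUE))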

data.table is slow compared to data.frame for group filtering

data.table is slow if I filter based on a numeric value (greater than or less than):
library(data.table)
df <- data.frame(group = sample(paste("group", 0:2000, sep = ""), 7000, replace = TRUE),
                 pvalue = 10^(sample(seq(from = -5, to = -1, by = 0.1), 7000, replace = TRUE)))
groups <- setdiff(unique(df$group), "group0")

# data.frame takes 0.16 sec
system.time(lapply(groups, function(r) {
  df.temp <- df[df$group == r, ]
  any(df.temp[["pvalue"]] < 0.01, na.rm = TRUE)
}))

DT <- as.data.table(df)
setkeyv(DT, c("group"))
# data.table takes 0.9 sec
system.time(lapply(groups, function(r) any(DT[.(r), pvalue <= 0.01], na.rm = TRUE)))
Does anyone know what I could be doing wrong?
You are not using data.table correctly: it is expensive to call DT[...] many times. Instead, you could do something like the following:
setkeyv(DT, c("group"))
DT[!("group0"), any(pvalue <= 0.01), by = group]
        group   V1
1:     group1 TRUE
2:    group10 TRUE
3:   group100 TRUE
4:  group1000 TRUE
5:  group1001 TRUE
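For a direct comparison with the timings above, the grouped call can be timed in one shot and the result column given a descriptive name (a usage sketch; the column name any_sig is made up here):
system.time(
  res <- DT[!("group0"), .(any_sig = any(pvalue <= 0.01)), by = group]
)
head(res)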

Efficiently fill 2D matrices by rows in a list in R

I have a list of 2D matrices. Each matrix is filled using the function fillMatrices. This function adds a number of individuals to each day 0 in a matrix and updates the columns a_M, b_M and c_M. The numbers of individuals come from an initial matrix ind. The code works but it is slow when the number of matrices within the list is large. For example with n = 10000:
   user  system elapsed
   3.73    0.83    4.55
If possible, I would like to reduce the elapsed time to <= 1 sec and increase n to 720,000 matrices. So, I am looking for a way to optimize only section 3. Here is the code:
###############################################
###############################################
## Section 3
## Run the function "fillMatrices"
indexTime <- 1
dt_t_1 <- do.call(rbind, lapply(list_matrices, function(x) x[1, ]))
dt_t <- fillMatrices(dt_t_1 = dt_t_1, species = c("a_M", "b_M", "c_M"),
                     maxDuration = 5, matrixColumns = col_mat)
## Fill the matrices within the list
system.time(for(i in 1:n){
  list_matrices[[i]][indexTime + 1, ] <- dt_t[, i]
})
## test <- list_matrices[[1]]
The code in section 1 is used to initialize the matrices, and the function fillMatrices can be found in section 2. In my example, the function is used to fill matrices for one species. In reality, the function is used for 3 species (i.e., it is applied three times) by changing the argument species = c("a_M", "b_M", "c_M"). How can I speed up my code? Any advice would be much appreciated.
Here are the codes of sections 1 and 2:
rm(list=ls(all=TRUE))
library(ff)
library(dplyr)
set.seed(12345)
## Define the number of individuals
n <- 10000
###############################################
###############################################
## Section 1
## Build the list of 2D matrices
v_date <- as.vector(outer(c(paste(seq(0, 1, by = 1), "day", sep = "_"),
                            paste(seq(2, 5, by = 1), "days", sep = "_")),
                          c("a_M", "b_M", "c_M"), paste, sep = "|"))
col_mat <- c("year", "day", "time", "ID", "died", v_date)
list_matrices <- list()
for(i in 1:n){
  print(i)
  list_matrices[[i]] <- ff(-999, dim = c(3650, length(col_mat)),
                           dimnames = list(NULL, col_mat),
                           vmode = "double", overwrite = TRUE)
}
## test <- list_matrices[[1]]
## dim(list_matrices[[1]])
## Fill the first row of each matrix
for(i in 1:n){
  print(i)
  list_matrices[[i]][1, ] <- c(1, 1, 1, i-1, 0, rep(0, length(v_date)))
}
## test <- list_matrices[[2]]
## Build the matrix "individual"
ind <- as.matrix(data.frame(year = rep(1, n), day = rep(1, n), time = rep(1, n),
                            died = rep(0, n), ID = (seq(1, n, 1)) - 1,
                            a_M = sample(1:10, n, replace = T),
                            b_M = sample(1:10, n, replace = T),
                            c_M = sample(1:10, n, replace = T)))
## print(ind)
###############################################
###############################################
## Section 2
## Function to convert a data frame into a matrix
convertDFToMat <- function(x){
  mat <- as.matrix(x[, -1])
  ifelse(is(x[, 1], "data.frame"), rownames(mat) <- pull(x[, 1]), rownames(mat) <- x[, 1])
  ## Convert character matrix into numeric matrix
  mat <- apply(mat, 2, as.numeric)
  return(mat)
}
## Define the function that is used to fill the matrices within the list
fillMatrices <- function(dt_t_1, species, maxDuration, matrixColumns){
  ## Format data
  dt <- as.data.frame(dt_t_1) %>%
    reshape::melt(id = c("ID")) %>%
    arrange(ID) %>%
    dplyr::mutate_all(as.character)
  ## summary(dt)
  ## Break out the variable "variable" into different columns, with one row for each individual-day
  dt_reshape_filter_1 <- dt %>%
    dplyr::filter(!variable %in% c("year", "day", "time", "ID", "died")) %>%
    dplyr::mutate(day = variable %>% gsub(pattern = "\\_.*", replacement = "", x = .),
                  col = variable %>% gsub(pattern = ".*\\|", replacement = "", x = .)) %>%
    dplyr::select(-variable) %>%
    tidyr::spread(col, value) %>%
    dplyr::mutate_all(as.numeric) %>%
    dplyr::arrange(ID, day)
  ## summary(dt_reshape_filter_1)
  ## Apply requested transformations and build the data frame
  dt_transform <- dt_reshape_filter_1 %>%
    dplyr::rename_at(vars(species), ~ c("a", "b", "c")) %>%
    dplyr::mutate(day = day + 1) %>%
    dplyr::filter(day < maxDuration + 1) %>%
    dplyr::bind_rows(tibble(ID = ind[, c("ID")], day = 0,
                            a = ind[, c("a_M")], b = ind[, c("b_M")])) %>%
    dplyr::mutate(c = a + b) %>%
    dplyr::rename_at(vars("a", "b", "c"), ~ species) %>%
    dplyr::arrange(ID, day)
  ## summary(dt_transform)
  ## Take different columns of the data frame and gather them into a single column
  dt_gather <- dt_transform %>%
    tidyr::gather(variable, value, species) %>%
    dplyr::mutate(day = if_else(day > 1, paste0(day, "_days"), paste0(day, "_day"))) %>%
    tidyr::unite(variable, c("day", "variable"), sep = "|") %>%
    dplyr::rename(var2 = ID) %>%
    dplyr::mutate_all(as.character)
  ## summary(dt_gather)
  ## Add the other columns in the data frame and convert the resulting data frame into a matrix
  dt_reshape_filter_2 <- dt %>%
    dplyr::rename(var2 = ID) %>%
    dplyr::filter(variable %in% c("year", "day", "time", "ID", "died")) %>%
    tidyr::spread(variable, value) %>%
    dplyr::arrange(as.numeric(var2)) %>%
    dplyr::mutate(year = ind[, c("year")],
                  day = ind[, c("day")],
                  time = ind[, c("time")],
                  ID = ind[, c("ID")],
                  died = ind[, c("died")]) %>%
    tidyr::gather(variable, value, c(year, day, time, ID, died)) %>%
    dplyr::arrange(as.numeric(var2)) %>%
    dplyr::mutate_all(as.character)
  ## summary(dt_reshape_filter_2)
  ## Build the output matrix
  dt_bind <- bind_rows(dt_reshape_filter_2, dt_gather) %>%
    tidyr::spread(var2, value) %>%
    dplyr::arrange(match(variable, matrixColumns)) %>%
    dplyr::select("variable", as.character(ind[, c("ID")]))
  ## summary(dt_bind)
  dt_mat <- convertDFToMat(dt_bind)
  ## summary(dt_mat)
  return(dt_mat)
}
Making a 3D array instead of a list of 2D matrices gives you more options:
library(ff)
library(dplyr)
set.seed(12345)
## Define the number of individuals
n <- 10000L
n_row <- 3650L
#array way:
v_date <- as.vector(outer(c(paste(seq(0, 1, by = 1), "day", sep = "_"),
                            paste(seq(2, 5, by = 1), "days", sep = "_")),
                          c("a_M", "b_M", "c_M"), paste, sep = "|"))
col_mat <- c("year", "day", "time", "ID", "died", v_date)
arr1 <- ff(-999L, dim = c(n_row, length(col_mat), n),
           dimnames = list(NULL, col_mat, NULL))

## Fill the first row of each matrix slice
arr1[1, , ] <- c(1L, 1L, 1L, NA, 0L, rep(0L, length(v_date)))
arr1[1, 4, ] <- seq_len(n) - 1L

## Build the matrix "individual"
ind <- as.matrix(data.frame(year = rep(1L, n), day = rep(1L, n), time = rep(1L, n),
                            died = rep(0L, n), ID = (seq(1L, n, 1L)) - 1L,
                            a_M = sample(1L:10L, n, replace = T),
                            b_M = sample(1L:10L, n, replace = T),
                            c_M = sample(1L:10L, n, replace = T)))

## Fill the matrix
indexTime <- 1L
dt_t <- fillMatrices(dt_t_1 = t(arr1[1, , ]), species = c("a_M", "b_M", "c_M"),
                     maxDuration = 5, matrixColumns = col_mat)

## Reassign
system.time(
  arr1[indexTime + 1, , ] <- dt_t
)
#   user  system elapsed
#   0.05    0.70     0.7

# For comparison:
#> system.time(for(i in 1:n){
#+   list_matrices[[i]][indexTime + 1, ] <- dt_t[, i]
#+ })
#   user  system elapsed
#   4.75    1.08    5.90
As far as I can tell, it's giving me the same results as your original approach but does so a lot faster.
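As a usage note (not part of the original answer), each "matrix" from the list version corresponds to one slice of the array, so existing code that worked on list_matrices[[i]] can index arr1 instead:
# One individual's matrix is a single slice of the 3D array
m1 <- arr1[, , 1]      # roughly equivalent to list_matrices[[1]]
dim(m1)                # 3650 rows x length(col_mat) columns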
