Efficient way to find species specific coefficients for a function call - r

Andrew Robinson shows in icebreakeR how to compute tree volume using diameter and height. He creates a function which uses coefficients that depend on species and diameter. A simplified version looks like:
funRobinson <- function(species, diameter, height) {
  bf_params <- data.frame(species = c("Spruce", "Oak"),
                          b0_small = c(26.729, 29.790),
                          b1_small = c( 0.01189, 0.00997),
                          b0_large = c(32.516, 85.150),
                          b1_large = c( 0.01181, 0.00841))
  dimensions <- data.frame(diameter = diameter,
                           height = height,
                           species = as.character(species),
                           this_order = 1:length(species))
  dimensions <- merge(y=dimensions, x=bf_params, all.y=TRUE, all.x=FALSE)
  dimensions <- dimensions[order(dimensions$this_order, decreasing=FALSE),]
  b0 <- with(dimensions, ifelse(diameter <= 20.5, b0_small, b0_large))
  b1 <- with(dimensions, ifelse(diameter <= 20.5, b1_small, b1_large))
  b0 + b1 * dimensions$diameter^2 * dimensions$height
}
To me this method looks straightforward, but it creates an additional data.frame which has to be sorted, and it calls ifelse twice to distinguish between small (diameter <= 20.5) and large trees. I'm looking for a more efficient way (lower memory consumption and execution time) to find species-specific coefficients. I would also appreciate being able to add coefficients for other species without editing the function.
Example data-set and Performance:
dat <- data.frame(species = c("Spruce", "Spruce", "Oak", "Oak", "Fir"),
                  diameter = c(4, 30, 4, 30, 30),
                  height = c(30, 100, 30, 100, 100))
with(dat, funRobinson(species, diameter, height))
#[1] 32.4362 1095.4160 34.5756 842.0500 NA
library(microbenchmark)
microbenchmark(
Robinson = with(dat, funRobinson(species, diameter, height))
)
#Unit: milliseconds
# expr min lq mean median uq max neval
# Robinson 1.832604 1.860334 1.948054 1.876155 1.905009 3.054021 100
set.seed(0)
size <- 1e5
dat2 <- data.frame(species = sample(c("Spruce", "Oak", "Fir"), size=size, replace = TRUE)
                   , diameter = runif(size, 1, 50)
                   , height = runif(size, 1, 100))
microbenchmark(
Robinson = with(dat2, funRobinson(species, diameter, height))
)
#Unit: milliseconds
# expr min lq mean median uq max neval
# Robinson 203.8171 219.9265 234.0798 227.5911 250.6204 278.9918 100

I guess the key is to avoid the data.frame and instead look the values up directly in a vector (or matrix). The positions looked up are the same for b0 and b1, so we only need to calculate the index once.
Below is a quick attempt; most likely it can be made faster. I basically build one matrix per parameter and extract the corresponding rows and columns, according to the species and the diameter class:
f2 <- function(species, diameter, height) {
  species_avail = c("Spruce", "Oak")
  params_b0 = cbind(b0_small = c(26.729, 29.790),
                    b0_large = c(32.516, 85.150))
  rownames(params_b0) = species_avail
  params_b1 = cbind(b1_small = c( 0.01189, 0.00997),
                    b1_large = c( 0.01181, 0.00841))
  rownames(params_b1) = species_avail
  ROWS = match(species, species_avail)
  COLS = +(diameter > 20.5) + 1
  idx = cbind(ROWS, COLS)
  b0 <- params_b0[idx]
  b1 <- params_b1[idx]
  b0 + b1 * diameter^2 * height
}
Create data:
set.seed(0)
size <- 1e5
dat2 <- data.frame(species = sample(c("Spruce", "Oak", "Fir"), size=size, replace = TRUE)
                   , diameter = runif(size, 1, 50)
                   , height = runif(size, 1, 100))
Check that the functions return the same thing:
identical(
with(dat2,funRobinson(species, diameter, height)),
with(dat2,f2(species,diameter,height))
)
[1] TRUE
Test:
microbenchmark(
Robinson = with(dat2, funRobinson(species, diameter, height)),
f2 = with(dat2, f2(species, diameter, height))
)
Unit: milliseconds
     expr        min        lq      mean    median        uq       max neval cld
 Robinson 249.677157 275.23375 303.97532 298.72475 329.04318 391.53807   100   b
       f2   9.423471  10.16365  13.86918  10.48073  16.06827  65.19541   100  a

This uses the same approach as @StupidWolf, but removes the match() call by using the integer codes of the species factor directly and by storing the coefficients ordered by those codes. Keeping the coefficients in the closure's environment avoids setting them up every time the function is called.
funGKiCl <- function(params, speciesLevels) {
  force(params)
  force(speciesLevels)
  nSpecies <- length(speciesLevels)
  i <- match(speciesLevels, params$species)
  params_b0 <- c(params$b0_small[i], params$b0_large[i])
  params_b1 <- c(params$b1_small[i], params$b1_large[i])
  rm(i, params, speciesLevels)
  function(species, diameter, height) {
    i <- unclass(species) + (diameter > 20.5) * nSpecies
    params_b0[i] + params_b1[i] * diameter * diameter * height
  }
}
dat <- data.frame(species = c("Spruce", "Spruce", "Oak", "Oak", "Fir")
                  , diameter = c(4, 30, 4, 30, 30)
                  , height = c(30, 100, 30, 100, 100))
params <- read.table(header = TRUE, text = "
species b0_small b1_small b0_large b1_large
Spruce 26.729 0.01189 32.516 0.01181
Oak 29.790 0.00997 85.150 0.00841")
funGKi <- compiler::cmpfun(funGKiCl(params, levels(dat$species)))
with(dat, funGKi(species, diameter, height))
#[1] 32.4362 1095.4160 34.5756 842.0500 NA
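Coefficients for additional species can then be supplied through params without touching the function itself. A minimal sketch, assuming species is stored as a factor (the closure relies on its integer codes) and using made-up coefficients for Fir purely for illustration:
# The Fir coefficients below are invented for illustration only
paramsFir <- rbind(params,
                   data.frame(species = "Fir",
                              b0_small = 25.0, b1_small = 0.012,
                              b0_large = 30.0, b1_large = 0.011))
funGKiFir <- compiler::cmpfun(funGKiCl(paramsFir, levels(dat$species)))
with(dat, funGKiFir(species, diameter, height))  # Fir rows now get values instead of NA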

Currently the method from @GKi is the fastest and uses the least memory.
Data:
dat <- data.frame(species = c("Spruce", "Spruce", "Oak", "Oak", "Fir")
                  , diameter = c(4, 30, 4, 30, 30)
                  , height = c(30, 100, 30, 100, 100))
set.seed(0)
size <- 1e5
dat2 <- data.frame(
  species = sample(c("Spruce", "Oak", "Fir"), size=size, replace = TRUE)
  , diameter = runif(size, 1, 50)
  , height = runif(size, 1, 100))
Methods:
funRobinson <- function(species, diameter, height) {
  bf_params <- data.frame(species = c("Spruce", "Oak"),
                          b0_small = c(26.729, 29.790),
                          b1_small = c( 0.01189, 0.00997),
                          b0_large = c(32.516, 85.150),
                          b1_large = c( 0.01181, 0.00841))
  dimensions <- data.frame(diameter = diameter,
                           height = height,
                           species = as.character(species),
                           this_order = 1:length(species))
  dimensions <- merge(y=dimensions, x=bf_params, all.y=TRUE, all.x=FALSE)
  dimensions <- dimensions[order(dimensions$this_order, decreasing=FALSE),]
  b0 <- with(dimensions, ifelse(diameter <= 20.5, b0_small, b0_large))
  b1 <- with(dimensions, ifelse(diameter <= 20.5, b1_small, b1_large))
  b0 + b1 * dimensions$diameter^2 * dimensions$height
}
with(dat, funRobinson(species, diameter, height))
funStupidWolf <- function(species, diameter, height) {
  species_avail = c("Spruce", "Oak")
  params_b0 = cbind(b0_small = c(26.729, 29.790),
                    b0_large = c(32.516, 85.150))
  rownames(params_b0) = species_avail
  params_b1 = cbind(b1_small = c( 0.01189, 0.00997),
                    b1_large = c( 0.01181, 0.00841))
  rownames(params_b1) = species_avail
  ROWS = match(species, species_avail)
  COLS = +(diameter > 20.5) + 1
  idx = cbind(ROWS, COLS)
  b0 <- params_b0[idx]
  b1 <- params_b1[idx]
  b0 + b1 * diameter^2 * height
}
with(dat, funStupidWolf(species, diameter, height))
funGKiCl <- function(params, speciesLevels) {
  force(params)
  force(speciesLevels)
  nSpecies <- length(speciesLevels)
  i <- match(speciesLevels, params$species)
  params_b0 <- c(params$b0_small[i], params$b0_large[i])
  params_b1 <- c(params$b1_small[i], params$b1_large[i])
  rm(i, params, speciesLevels)
  function(species, diameter, height) {
    i <- unclass(species) + (diameter > 20.5) * nSpecies
    params_b0[i] + params_b1[i] * diameter * diameter * height
  }
}
params <- read.table(header = TRUE, text = "
species b0_small b1_small b0_large b1_large
Spruce 26.729 0.01189 32.516 0.01181
Oak 29.790 0.00997 85.150 0.00841")
funGKi <- compiler::cmpfun(funGKiCl(params, levels(dat$species)))
with(dat, funGKi(species, diameter, height))
rm(funGKiCl, params)
fun <- alist(Robinson = funRobinson(species, diameter, height)
             , StupidWolf = funStupidWolf(species, diameter, height)
             , GKi = funGKi(species, diameter, height))
Time:
library(microbenchmark)
attach(dat)
microbenchmark(list = fun, check = "equal")
#Unit: microseconds
# expr min lq mean median uq max neval
# Robinson 1876.491 1911.583 1997.00924 1934.8835 1962.3145 3131.453 100
# StupidWolf 15.618 17.371 22.30764 18.9995 26.5125 33.239 100
# GKi 2.270 2.965 4.04041 3.6825 5.0415 7.434 100
microbenchmark(list = fun, check = "equal", control=list(order="block"))
#Unit: microseconds
# expr min lq mean median uq max neval
# Robinson 1887.906 1918.0475 2000.55586 1938.847 1962.9540 3131.112 100
# StupidWolf 15.184 16.2775 16.97111 16.668 17.2230 34.646 100
# GKi 2.063 2.1560 2.37552 2.255 2.4015 5.616 100
attach(dat2)
microbenchmark(list = fun, setup = gc(), check = "equal")
#Unit: milliseconds
# expr min lq mean median uq max neval
# Robinson 189.342408 193.222831 193.682868 193.573419 194.181910 198.231698 100
# StupidWolf 6.755601 6.786439 6.836253 6.804451 6.832409 7.370937 100
# GKi 1.756241 1.767335 1.794328 1.782949 1.806370 1.964409 100
library(bench)
attach(dat)
mark(exprs = fun, iterations = 100)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
# <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#1 Robinson 1.87ms 1.9ms 521. 0B 10.6 98 2 188.1ms
#2 StupidWolf 16.48µs 17.46µs 55666. 0B 0 100 0 1.8ms
#3 GKi 2.67µs 2.86µs 337265. 0B 0 100 0 296.5µs
attach(dat2)
mark(exprs = fun, iterations = 100)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
#1 Robinson 188.96ms 216.15ms 4.44 67.71MB 11.4 100 257
#2 StupidWolf 6.79ms 6.85ms 131. 8.77MB 30.0 100 23
#3 GKi 1.7ms 1.72ms 552. 2.29MB 22.1 100 4
#Some expressions had a GC in every iteration; so filtering is disabled.
Memory:
memUse <- function(list, setup = "", gctort = FALSE) {
  as.data.frame(lapply(list, function(z) {
    eval(setup)
    tt <- sum(.Internal(gc(FALSE, TRUE, TRUE))[13:14])
    gctorture(on = gctort)
    eval(z)
    gctorture(on = FALSE)
    sum(.Internal(gc(FALSE, FALSE, TRUE))[13:14]) - tt
  }))
}
attach(dat)
memUse(list=fun, gctort = FALSE)
# Robinson StupidWolf GKi
#1 0.9 0 0
memUse(list=fun, gctort = TRUE)
# Robinson StupidWolf GKi
#1 0 0 0
attach(dat2)
memUse(list=fun, gctort = FALSE)
# Robinson StupidWolf GKi
#1 71.9 8.8 2.3
memUse(list=fun, gctort = TRUE)
# Robinson StupidWolf GKi
#1 29.7 6.5 2.3
object.size(funRobinson)
#109784 bytes
object.size(funStupidWolf)
#68240 bytes
object.size(funGKi)
#21976 bytes

Related

Sample from a 0:Vector[i]

In R:
I have a vector:
y = sample(0:200, 1e4, replace = TRUE)
I want to create a variable ‘x’ such that:
x = sample(0:y[i], 1e4, replace = TRUE)
where y[i] are the values y[1], y[2], …, y[10000] created by the sample() call above. For example, if y[1] = 50 then I would like the first entry of x to be drawn from sample(0:50), etc. However, I am not sure how to do this. I have tried for loops but have gotten nowhere.
Any help is much appreciated!
How about
x <- as.integer(runif(1e4)*sample(201, 1e4, TRUE))
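A quick sanity check of the idea (my sketch, not part of the original answer): for a given y, floor(runif(1) * (y + 1)) is uniform on 0:y, so the vectorised draw reproduces the two-step sampling scheme:
set.seed(42)
y <- sample(0:200, 1e4, replace = TRUE)
x_slow <- sapply(y, function(i) sample(0:i, size = 1))  # two-step, slow
x_fast <- as.integer(runif(1e4) * (y + 1))              # vectorised, fast
summary(x_slow)
summary(x_fast)  # the two summaries should look alike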
Benchmarking:
f1 <- function() sapply(sample(0:200, 1e4, replace = TRUE), function(i) sample(0:i, size = 1))
f2 <- function() as.integer(runif(1e4)*sample(201, 1e4, TRUE))
microbenchmark::microbenchmark(f1 = f1(),
f2 = f2())
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> f1 38877.3 47070.50 49294.770 48625.00 50175.35 97045.0 100
#> f2 508.2 522.05 555.602 531.45 549.45 2080.8 100
This should work:
y = sample(0:200, 1e4, replace = TRUE)
x = sapply(y, \(i) sample(0:i, size = 1))
Or the equivalent using a for loop:
x = numeric(length(y))
for(i in seq_along(y)) {
x[i] = sample(0:y[i], size = 1)
}
If efficiency matters, this might be a bit faster on very long input:
x = floor(runif(length(y), min = 0, max = y + 1))

Fastest way to map multiple character columns to numerical values

I have an algorithm that at each iteration calculates means for certain groups (the groups do not change, only their values). The table of values:
N <- 10000
d1 <- data.frame(x = sample(LETTERS, N, replace = TRUE),
                 y1 = rnorm(N))
head(d1)
# x y1
# 1 H -0.7852538
# 2 G -0.6739159
# 3 V -1.7783771
# 4 L -0.2849846
# 5 I -0.1760284
# 6 V -0.2785826
I can calculate the means (in several ways: dplyr, data.table and tapply). I have another data.frame consisting of two columns with the group names.
d2 <- data.frame('group.high' = sample(LETTERS, N * 2, replace = TRUE),
                 'group.low' = sample(LETTERS, N * 2, replace = TRUE))
head(d2)
# group.high group.low
# 1 U L
# 2 K J
# 3 C Q
# 4 Q A
# 5 Q U
# 6 K W
I want to add two columns, mean.high and mean.low, containing the mean value of each group based on d1.
So far I have tried two options, one with dplyr and one with data.table; I had to use left_join twice in either of them. They are similar in speed.
microbenchmark(
dplyr = {
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
### Solution 1
dplyr.d2 <- left_join(d2,data.frame('group.high' = names(means),
'mean.high' = means, stringsAsFactors = FALSE) ) %>%
left_join(., data.frame('group.low' = names(means),
'mean.low' = means, stringsAsFactors = FALSE))},
data.table = {
### Solution 2
d1 <- as.data.table(d1)
d2 <- as.data.table(d2)
means <- d1[ ,.(means = mean(y1)), by = x]
new.d2 <- data.table::merge.data.table(x = d2, y = means, by.x = 'group.high', by.y = 'x')
data.table.d2 <- data.table::merge.data.table(x = new.d2, y = means, by.x = 'group.low', by.y = 'x')
}
)
Unit: milliseconds
expr min lq mean median uq max neval cld
dplyr 34.0837 36.88650 53.22239 42.9227 47.50660 231.5066 100 a
data.table 40.2071 47.70735 87.46804 51.2517 59.05385 258.4999 100 b
Is there a better way? How can I speed the calculation?
As mentioned in the comments, there is an iterative process of updating the values. Here is an example.
N <- 10000
iterFuncDplyr <- function(d1, d2) {
  means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)  # group means used in the joins below
  dplyr.d2 <- left_join(d2, data.frame('group.high' = names(means),
                                       'mean.high' = means, stringsAsFactors = FALSE)) %>%
    left_join(., data.frame('group.low' = names(means),
                            'mean.low' = means, stringsAsFactors = FALSE))
  return(var(d1$y1))
}
iterFuncData <- function(d1, d2) {
  means <- d1[, .(means = mean(y1)), by = x]
  new.d2 <- data.table:::merge.data.table(x = d2, y = means, by.x = 'group.high', by.y = 'x')
  data.table.d2 <- data.table:::merge.data.table(x = new.d2, y = means, by.x = 'group.low', by.y = 'x')
  return(var(d1$y1))
}
d1 <- data.frame(x = sample(LETTERS, N, replace = TRUE),
                 y1 = rnorm(N))
d2 <- data.frame('group.high' = sample(LETTERS, N * 2, replace = TRUE),
                 'group.low' = sample(LETTERS, N * 2, replace = TRUE))
library(data.table)
library(dplyr)
microbenchmark::microbenchmark(dplyr = {
temp.val <- 0
for (i in 1:10) {
d1$y1 <- temp.val + rnorm(N)
temp.val <- iterFuncDplyr(d1, d2)
}},
data.table = {
d1 <- as.data.table(d1)
d2 <- as.data.table(d2)
temp.val <- 0
for (i in 1:10) {
d1$y1 <- temp.val + rnorm(N)
temp.val <- iterFuncData(d1, d2)
}
}
)
Unit: milliseconds
expr min lq mean median uq max neval
dplyr 46.22904 50.67959 52.78275 51.96358 53.34825 108.2874 100
data.table 63.81111 67.13257 70.85537 69.85712 72.72446 127.4228 100
You could subset the named vector means to create new columns and match your output:
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
d2$mean.high <- means[d2$group.high]
d2$mean.low <- means[d2$group.low]
identical(as.matrix(d2), as.matrix(d3)) #factor vs character, used d3 w/ benchmark
[1] TRUE
Unit: microseconds
expr min lq mean median uq max neval
dplyr 4868.2 5316.25 5787.123 5524.15 5892.70 12187.3 100
data.table 8254.4 9606.60 10438.424 10118.35 10771.75 20966.5 100
subset 481.2 529.40 651.194 550.35 582.55 7849.9 100
Benchmark code:
d3 <- d2
microbenchmark::microbenchmark( # N = 10000
dplyr = {
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
### Solution 1
dplyr.d2 <- left_join(d2,data.frame('group.high' = names(means),
'mean.high' = means, stringsAsFactors = FALSE) ) %>%
left_join(., data.frame('group.low' = names(means),
'mean.low' = means, stringsAsFactors = FALSE))},
data.table = {
### Solution 2
d1 <- as.data.table(d1)
d2 <- as.data.table(d2)
means <- d1[ ,.(means = mean(y1)), by = x]
new.d2 <- data.table::merge.data.table(x = d2, y = means, by.x = 'group.high', by.y = 'x')
data.table.d2 <- data.table::merge.data.table(x = new.d2, y = means, by.x = 'group.low', by.y = 'x')
},
subset = {
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
d3$mean.high <- means[d2$group.high]
d3$mean.low <- means[d2$group.low]
}
)
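One caveat worth noting (my addition, not part of the original answer): if group.high and group.low are factors rather than character vectors, indexing the named vector means with them uses the factors' integer codes instead of the names, so convert to character first:
d2$mean.high <- means[as.character(d2$group.high)]
d2$mean.low  <- means[as.character(d2$group.low)]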
Here is an answer very similar to Andrew's but relying on data.table instead of tapply() (which seems faster for very big N).
library(data.table)
# Create a named vector "means"
means <- setDT(d1)[, mean(y1), by = x][, setNames(V1, x)]
setDT(d2)[, c("mean.high.means", "mean.low.means") :=
.(means[as.character(group.high)], means[as.character(group.low)])]
Output:
group.high group.low mean.high.means mean.low.means
1: Z W 0.017032792 0.0091625547
2: A A 0.013796137 0.0137961371
3: V S -0.011570159 0.0004560325
4: D X 0.005475629 0.0200984250
5: U H -0.008249901 0.0054537833
---
199996: H K 0.005453783 0.0079905631
199997: A T 0.013796137 -0.0068537963
199998: W U 0.009162555 -0.0082499015
199999: T V -0.006853796 -0.0115701585
200000: G J 0.014829259 0.0206598470
Reproducible data:
N = 1e5
set.seed(1)
d1 <- data.frame(
x = sample(LETTERS, N, replace = TRUE),
y1 = rnorm(N)
)
d2 <- data.frame(
group.high = sample(LETTERS, N * 2, replace = TRUE),
group.low = sample(LETTERS, N * 2, replace = TRUE)
)

create data frame containing rows that add up to 100

This is my first stab at this:
library(dplyr)
step_size <- 5
grid <- expand.grid(
x1 = seq(0, 100, step_size)
, x2 = seq(0, 100, step_size)
, x3 = seq(0, 100, step_size)
)
grid$sum = grid$x1 + grid$x2 + grid$x3
grid$x1 <- (grid$x1 / grid$sum) * 100
grid$x2 <- (grid$x2 / grid$sum) * 100
grid$x3 <- (grid$x3 / grid$sum) * 100
grid$sum <- grid$x1 + grid$x2 + grid$x3
nrow(grid)
result <- distinct(grid) %>% filter(!is.na(sum))
head(result, 20)
nrow(result)
Basically, I want to create a data frame that contains as many rows as possible that add up to 100 and are uniformly distributed.
Is there an easier/better approach in R? Thanks!
Using data.table...
library(data.table)
grid <- expand.grid(
x1 = seq(0, 100)
, x2 = seq(0, 100)
, x3 = seq(0, 100)
)
setDT(grid)
res <- grid[grid[, rowSums(.SD) == 100], ]
res[, summation := rowSums(.SD)]
Result:
> res[, unique(summation)]
[1] 100
This can also be done in base R, but data.table is faster:
library(data.table)
grid <- expand.grid(
x1 = seq(0, 100)
, x2 = seq(0, 100)
, x3 = seq(0, 100)
)
grid2 <- expand.grid(
x1 = seq(0, 100)
, x2 = seq(0, 100)
, x3 = seq(0, 100)
)
setDT(grid)
microbenchmark::microbenchmark(
data.table = {
res <- grid[grid[, rowSums(.SD) == 100], ]
},
base = {
res2 <- grid2[rowSums(grid2) == 100, ]
}
)
Unit: milliseconds
expr min lq mean median uq max neval cld
data.table 59.41157 89.6700 109.0462 107.7415 124.2675 183.9730 100 a
base 65.70521 109.6471 154.1312 125.4238 156.9168 611.0169 100 b
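A further sketch (my addition, not from the original answers): the qualifying rows can also be enumerated directly, which avoids creating and filtering the full 101^3 grid:
# enumerate x1 and x2 with x1 + x2 <= 100 and set x3 to the remainder
x1 <- rep(0:100, times = 101:1)
x2 <- sequence(101:1) - 1L
res3 <- data.frame(x1 = x1, x2 = x2, x3 = 100L - x1 - x2)
nrow(res3)                 # 5151 rows
all(rowSums(res3) == 100)  # TRUE by construction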
Here's a simple function. You can specify how many rows/columns you want, and what each row sums to.
func <- function(cols = 3, rows = 10, rowTotal = 100) {
  dt1 <- replicate(n = cols, runif(n = rows))
  dt1 <- data.frame(apply(X = dt1, MARGIN = 2, FUN = function(x) x / rowSums(dt1) * rowTotal))
  return(dt1)
}
rowSums(func()) # default values (3 cols, 10 rows, each row sums to 100)
rowSums(func(cols = 5, rows = 10, rowTotal = 50)) # 5 cols, 10 rows, each row sums to 50

method for calculating distance between all points quickly

I have some code that generates a matrix of distances between all points in a data frame, using functions from the tidyverse. However, it is prohibitively slow. Does anyone know of a faster way to do the same thing?
Example data and working code:
library(tidyverse)
locs <- data.frame(ID = 1:4000, x = runif (4000, 0, 1), y = runif (4000, 0, 1))
df1 <- locs %>%
  mutate(k = 1)
df2 <- df1 %>%
  full_join(df1, by = "k") %>%
  mutate(length = sqrt((x.x - x.y)^2 + (y.x - y.y)^2)) %>%
  select(ID.x, ID.y, length)
dists <- matrix(data = df2$length, nrow = nrow(df1), ncol = nrow(df1))
You could use the base R function dist:
locs <- data.frame(ID = 1:10, x = runif (10, 0, 1), y = runif (10, 0, 1))
dist(locs[,2:3], upper = T, diag = T)
Output:
1 2 3 4 5 6 7 8 9 10
1 0.00000000 1.10309601 0.98790825 0.54490600 0.42478532 1.06323764 0.31094245 0.52593635 0.44695830 0.85010761
2 1.10309601 0.00000000 0.29292865 0.93412638 0.74551902 0.17160290 0.83557056 0.62393711 0.74218236 0.57669081
3 0.98790825 0.29292865 0.00000000 0.69626767 0.72278486 0.13085561 0.78064096 0.46359296 0.73098652 0.72732431
4 0.54490600 0.93412638 0.69626767 0.00000000 0.65426980 0.81617143 0.59851262 0.36551106 0.68253093 1.00018238
5 0.42478532 0.74551902 0.72278486 0.65426980 0.00000000 0.75537605 0.11384534 0.36844164 0.02911855 0.42844270
6 1.06323764 0.17160290 0.13085561 0.81617143 0.75537605 0.00000000 0.82826619 0.55014297 0.75867851 0.68258388
7 0.31094245 0.83557056 0.78064096 0.59851262 0.11384534 0.82826619 0.00000000 0.37224997 0.13688270 0.54088523
8 0.52593635 0.62393711 0.46359296 0.36551106 0.36844164 0.55014297 0.37224997 0.00000000 0.39086196 0.64185453
9 0.44695830 0.74218236 0.73098652 0.68253093 0.02911855 0.75867851 0.13688270 0.39086196 0.00000000 0.40400339
10 0.85010761 0.57669081 0.72732431 1.00018238 0.42844270 0.68258388 0.54088523 0.64185453 0.40400339 0.00000000
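If the full square matrix from the question is needed rather than a dist object, it can be converted with as.matrix() (a small addition to the answer):
dists <- as.matrix(dist(locs[, c("x", "y")], upper = TRUE, diag = TRUE))
dim(dists)  # nrow(locs) x nrow(locs)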
Benchmark with 1000 records:
library(dplyr)
library(microbenchmark)
locs <- data.frame(ID = 1:1000, x = runif (1000, 0, 1), y = runif (1000, 0, 1))
f1 <- function() {
  df1 <- locs %>%
    mutate(k = 1)
  df2 <- df1 %>%
    full_join(df1, by = "k") %>%
    mutate(length = sqrt((x.x - x.y)^2 + (y.x - y.y)^2)) %>%
    select(ID.x, ID.y, length)
  dists <- matrix(data = df2$length, nrow = nrow(df1), ncol = nrow(df1))
}
f2 <- function(){dist(locs[,2:3],upper = T,diag=T)}
microbenchmark(f1())
microbenchmark(f2())
Results:
Unit: milliseconds
expr min lq mean median uq max neval
f1() 81.74188 245.8014 276.4318 259.7682 294.01 567.9409 100
and
Unit: milliseconds
expr min lq mean median uq max neval
f2() 6.956302 7.330661 8.675304 8.11507 8.981121 18.77783 100
Hope this helps!

Adaptive moving average - top performance in R

I am looking for performance gains for rolling/sliding window functions in R. It is a quite common task that can be applied to any data set of ordered observations. I would like to share some of my findings; maybe somebody will be able to provide feedback to make it even faster.
An important note is that I focus on the case align="right" with an adaptive rolling window, so width is a vector (the same length as our observation vector). If width is a scalar there are already very well developed functions in the zoo and TTR packages which would be very hard to beat (4 years later: it was easier than I expected), as some of them even use Fortran (but user-defined FUNs can still be faster using the wapply mentioned below).
The RcppRoll package is worth mentioning due to its great performance, but so far it has no function which answers this question. It would be great if someone could extend it to do so.
Consider that we have the following data:
x = c(120,105,118,140,142,141,135,152,154,138,125,132,131,120)
plot(x, type="l")
And we want to apply a rolling function over the x vector with a variable rolling window width.
set.seed(1)
width = sample(2:4,length(x),TRUE)
In this particular case we would have a rolling function adaptive to window widths sampled from c(2,3,4).
We will apply the mean function; expected results:
r = f(x, width, FUN = mean)
print(r)
## [1] NA NA 114.3333 120.7500 141.0000 135.2500 139.5000
## [8] 142.6667 147.0000 146.0000 131.5000 128.5000 131.5000 127.6667
plot(x, type="l")
lines(r, col="red")
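For clarity, a minimal reference implementation of the f() used above could look like this (my sketch: align = "right", returning NA where the window does not fit):
f <- function(x, width, FUN = mean, ...) {
  FUN <- match.fun(FUN)
  sapply(seq_along(x), function(i) {
    if (i < width[i]) return(NA_real_)
    FUN(x[(i - width[i] + 1):i], ...)
  })
}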
Any indicator can be employed to produce the width argument, yielding different variants of adaptive moving averages, and any function can be applied over the window; see the sketch below for one way a width could be derived. I am looking for top performance.
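For instance (illustrative only, not part of the original post), a width could be driven by the series' recent absolute changes, using wider windows when the series is quiet:
chg   <- c(0, abs(diff(x)))
width <- pmin(pmax(round(10 - chg / 2), 2), 10)  # clamp the widths to 2..10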
December 2018 update
An efficient implementation of adaptive rolling functions was recently added to
data.table; see the ?froll manual for more info. Additionally, an efficient alternative solution using base R has been identified (fastama below). Unfortunately Kevin Ushey's answer does not address the question, so it is not included in the benchmark.
The scale of the benchmark has been increased, as it is pointless to compare microseconds.
set.seed(108)
x = rnorm(1e6)
width = rep(seq(from = 100, to = 500, by = 5), length.out=length(x))
microbenchmark(
zoo=rollapplyr(x, width = width, FUN=mean, fill=NA),
mapply=base_mapply(x, width=width, FUN=mean, na.rm=T),
wmapply=wmapply(x, width=width, FUN=mean, na.rm=T),
ama=ama(x, width, na.rm=T),
fastama=fastama(x, width),
frollmean=frollmean(x, width, na.rm=T, adaptive=TRUE),
frollmean_exact=frollmean(x, width, na.rm=T, adaptive=TRUE, algo="exact"),
times=1L
)
#Unit: milliseconds
# expr min lq mean median uq max neval
# zoo 32371.938248 32371.938248 32371.938248 32371.938248 32371.938248 32371.938248 1
# mapply 13351.726032 13351.726032 13351.726032 13351.726032 13351.726032 13351.726032 1
# wmapply 15114.774972 15114.774972 15114.774972 15114.774972 15114.774972 15114.774972 1
# ama 9780.239091 9780.239091 9780.239091 9780.239091 9780.239091 9780.239091 1
# fastama 351.618042 351.618042 351.618042 351.618042 351.618042 351.618042 1
# frollmean 7.708054 7.708054 7.708054 7.708054 7.708054 7.708054 1
# frollmean_exact 194.115012 194.115012 194.115012 194.115012 194.115012 194.115012 1
ama = function(x, n, na.rm=FALSE, fill=NA, nf.rm=FALSE) {
  # more or less the same as the forloopply below
  stopifnot((nx <- length(x)) == length(n))
  if (nf.rm) x[!is.finite(x)] = NA_real_
  ans = rep(NA_real_, nx)
  for (i in seq_along(x)) {
    ans[i] = if (i >= n[i])
      mean(x[(i-n[i]+1):i], na.rm=na.rm)
    else as.double(fill)
  }
  ans
}
fastama = function(x, n, na.rm, fill=NA) {
  if (!missing(na.rm)) stop("fast adaptive moving average implemented in R does not handle NAs, input having NAs will result in incorrect answer so not even try to compare to it")
  # fast implementation of adaptive moving average in R, incorrect answer in case of NAs
  stopifnot((nx <- length(x)) == length(n))
  cs = cumsum(x)
  ans = rep(NA_real_, nx)
  for (i in seq_along(cs)) {
    ans[i] = if (i == n[i])
      cs[i]/n[i]
    else if (i > n[i])
      (cs[i]-cs[i-n[i]])/n[i]
    else as.double(fill)
  }
  ans
}
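A quick consistency check between the two helpers (my sketch; remember that fastama assumes NA-free input):
set.seed(1)
x_chk <- rnorm(1e3)
w_chk <- sample(2:10, length(x_chk), replace = TRUE)
all.equal(ama(x_chk, w_chk), fastama(x_chk, w_chk))  # expected TRUE up to floating-point tolerance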
Old answer:
I chose 4 available solutions which don't require dropping down to C++ and are quite easy to find or google.
# 1. rollapply
library(zoo)
?rollapplyr
# 2. mapply
base_mapply <- function(x, width, FUN, ...){
  FUN <- match.fun(FUN)
  f <- function(i, width, data){
    if(i < width) return(NA_real_)
    return(FUN(data[(i-(width-1)):i], ...))
  }
  mapply(FUN = f,
         seq_along(x), width,
         MoreArgs = list(data = x))
}
# 3. wmapply - modified version of wapply found: https://rmazing.wordpress.com/2013/04/23/wapply-a-faster-but-less-functional-rollapply-for-vector-setups/
wmapply <- function(x, width, FUN = NULL, ...){
  FUN <- match.fun(FUN)
  SEQ1 <- 1:length(x)
  SEQ1[SEQ1 < width] <- NA_integer_
  SEQ2 <- lapply(SEQ1, function(i) if(!is.na(i)) (i - (width[i]-1)):i)
  OUT <- lapply(SEQ2, function(i) if(!is.null(i)) FUN(x[i], ...) else NA_real_)
  return(base:::simplify2array(OUT, higher = TRUE))
}
# 4. forloopply - simple loop solution
forloopply <- function(x, width, FUN = NULL, ...){
  FUN <- match.fun(FUN)
  OUT <- numeric()
  for(i in 1:length(x)) {
    if(i < width[i]) next
    OUT[i] <- FUN(x[(i-(width[i]-1)):i], ...)
  }
  return(OUT)
}
Below are the timings for the prod function (the mean function might already be optimized inside rollapplyr). All results are equal.
library(microbenchmark)
# 1a. length(x) = 1000, window = 5-20
x <- runif(1000,0.5,1.5)
width <- rep(seq(from = 5, to = 20, by = 5), length(x)/4)
microbenchmark(
rollapplyr(data = x, width = width, FUN = prod, fill = NA),
base_mapply(x = x, width = width, FUN = prod, na.rm=T),
wmapply(x = x, width = width, FUN = prod, na.rm=T),
forloopply(x = x, width = width, FUN = prod, na.rm=T),
times=100L
)
Unit: milliseconds
expr min lq median uq max neval
rollapplyr(data = x, width = width, FUN = prod, fill = NA) 59.690217 60.694364 61.979876 68.55698 153.60445 100
base_mapply(x = x, width = width, FUN = prod, na.rm = T) 14.372537 14.694266 14.953234 16.00777 99.82199 100
wmapply(x = x, width = width, FUN = prod, na.rm = T) 9.384938 9.755893 9.872079 10.09932 84.82886 100
forloopply(x = x, width = width, FUN = prod, na.rm = T) 14.730428 15.062188 15.305059 15.76560 342.44173 100
# 1b. length(x) = 1000, window = 50-200
x <- runif(1000,0.5,1.5)
width <- rep(seq(from = 50, to = 200, by = 50), length(x)/4)
microbenchmark(
rollapplyr(data = x, width = width, FUN = prod, fill = NA),
base_mapply(x = x, width = width, FUN = prod, na.rm=T),
wmapply(x = x, width = width, FUN = prod, na.rm=T),
forloopply(x = x, width = width, FUN = prod, na.rm=T),
times=100L
)
Unit: milliseconds
expr min lq median uq max neval
rollapplyr(data = x, width = width, FUN = prod, fill = NA) 71.99894 74.19434 75.44112 86.44893 281.6237 100
base_mapply(x = x, width = width, FUN = prod, na.rm = T) 15.67158 16.10320 16.39249 17.20346 103.6211 100
wmapply(x = x, width = width, FUN = prod, na.rm = T) 10.88882 11.54721 11.75229 12.19790 106.1170 100
forloopply(x = x, width = width, FUN = prod, na.rm = T) 15.70704 16.06983 16.40393 17.14210 108.5005 100
# 2a. length(x) = 10000, window = 5-20
x <- runif(10000,0.5,1.5)
width <- rep(seq(from = 5, to = 20, by = 5), length(x)/4)
microbenchmark(
rollapplyr(data = x, width = width, FUN = prod, fill = NA),
base_mapply(x = x, width = width, FUN = prod, na.rm=T),
wmapply(x = x, width = width, FUN = prod, na.rm=T),
forloopply(x = x, width = width, FUN = prod, na.rm=T),
times=100L
)
Unit: milliseconds
expr min lq median uq max neval
rollapplyr(data = x, width = width, FUN = prod, fill = NA) 753.87882 781.8789 809.7680 872.8405 1116.7021 100
base_mapply(x = x, width = width, FUN = prod, na.rm = T) 148.54919 159.9986 231.5387 239.9183 339.7270 100
wmapply(x = x, width = width, FUN = prod, na.rm = T) 98.42682 105.2641 117.4923 183.4472 245.4577 100
forloopply(x = x, width = width, FUN = prod, na.rm = T) 533.95641 602.0652 646.7420 672.7483 922.3317 100
# 2b. length(x) = 10000, window = 50-200
x <- runif(10000,0.5,1.5)
width <- rep(seq(from = 50, to = 200, by = 50), length(x)/4)
microbenchmark(
rollapplyr(data = x, width = width, FUN = prod, fill = NA),
base_mapply(x = x, width = width, FUN = prod, na.rm=T),
wmapply(x = x, width = width, FUN = prod, na.rm=T),
forloopply(x = x, width = width, FUN = prod, na.rm=T),
times=100L
)
Unit: milliseconds
expr min lq median uq max neval
rollapplyr(data = x, width = width, FUN = prod, fill = NA) 912.5829 946.2971 1024.7245 1071.5599 1431.5289 100
base_mapply(x = x, width = width, FUN = prod, na.rm = T) 171.3189 180.6014 260.8817 269.5672 344.4500 100
wmapply(x = x, width = width, FUN = prod, na.rm = T) 123.1964 131.1663 204.6064 221.1004 484.3636 100
forloopply(x = x, width = width, FUN = prod, na.rm = T) 561.2993 696.5583 800.9197 959.6298 1273.5350 100
For reference, you should definitely check out RcppRoll if you have only a single window length to 'roll' over:
library(RcppRoll) ## install.packages("RcppRoll")
library(microbenchmark)
x <- runif(1E5)
all.equal( rollapplyr(x, 10, FUN=prod), roll_prod(x, 10) )
microbenchmark( times=5,
rollapplyr(x, 10, FUN=prod),
roll_prod(x, 10)
)
gives me
> library(RcppRoll)
> library(microbenchmark)
> x <- runif(1E5)
> all.equal( rollapplyr(x, 10, FUN=prod), roll_prod(x, 10) )
[1] TRUE
> microbenchmark( times=5,
+ zoo=rollapplyr(x, 10, FUN=prod),
+ RcppRoll=roll_prod(x, 10)
+ )
Unit: milliseconds
expr min lq median uq max neval
zoo 924.894069 968.467299 997.134932 1029.10883 1079.613569 5
RcppRoll 1.509155 1.553062 1.760739 1.90061 1.944999 5
It's a bit faster ;) and the package is flexible enough for users to define and use their own rolling functions (with C++). I may extend the package in the future to allow multiple window widths, but I am sure it will be tricky to get right.
If you want to define the prod yourself, you can do so -- RcppRoll allows you to define your own C++ functions to pass through and generate a 'rolling' function if you'd like. rollit gives a somewhat nicer interface, while rollit_raw just lets you write a C++ function yourself, somewhat like you might do with Rcpp::cppFunction. The philosophy being, you should only have to express the computation you wish to perform on a particular window, and RcppRoll can take care of iterating over windows of some size.
library(RcppRoll)
library(microbenchmark)
x <- runif(1E5)
my_rolling_prod <- rollit(combine="*")
my_rolling_prod2 <- rollit_raw("
double output = 1;
for (int i=0; i < n; ++i) {
output *= X(i);
}
return output;
")
all.equal( roll_prod(x, 10), my_rolling_prod(x, 10) )
all.equal( roll_prod(x, 10), my_rolling_prod2(x, 10) )
microbenchmark( times=5,
rollapplyr(x, 10, FUN=prod),
roll_prod(x, 10),
my_rolling_prod(x, 10),
my_rolling_prod2(x, 10)
)
gives me
> library(RcppRoll)
> library(microbenchmark)
> # 1a. length(x) = 1000, window = 5-20
> x <- runif(1E5)
> my_rolling_prod <- rollit(combine="*")
C++ source file written to /var/folders/m7/_xnnz_b53kjgggkb1drc1f8c0000gn/T//RtmpcFMJEV/file80263aa7cca2.cpp .
Compiling...
Done!
> my_rolling_prod2 <- rollit_raw("
+ double output = 1;
+ for (int i=0; i < n; ++i) {
+ output *= X(i);
+ }
+ return output;
+ ")
C++ source file written to /var/folders/m7/_xnnz_b53kjgggkb1drc1f8c0000gn/T//RtmpcFMJEV/file802673777da2.cpp .
Compiling...
Done!
> all.equal( roll_prod(x, 10), my_rolling_prod(x, 10) )
[1] TRUE
> all.equal( roll_prod(x, 10), my_rolling_prod2(x, 10) )
[1] TRUE
> microbenchmark(
+ rollapplyr(x, 10, FUN=prod),
+ roll_prod(x, 10),
+ my_rolling_prod(x, 10),
+ my_rolling_prod2(x, 10)
+ )
> microbenchmark( times=5,
+ rollapplyr(x, 10, FUN=prod),
+ roll_prod(x, 10),
+ my_rolling_prod(x, 10),
+ my_rolling_prod2(x, 10)
+ )
Unit: microseconds
expr min lq median uq max neval
rollapplyr(x, 10, FUN = prod) 979710.368 1115931.323 1117375.922 1120085.250 1149117.854 5
roll_prod(x, 10) 1504.377 1635.749 1638.943 1815.344 2053.997 5
my_rolling_prod(x, 10) 1507.687 1572.046 1648.031 2103.355 7192.493 5
my_rolling_prod2(x, 10) 774.381 786.750 884.951 1052.508 1434.660 5
So really, as long as you are capable of expressing the computation you wish to perform in a particular window through either the rollit interface or with a C++ function passed through rollit_raw (whose interface is a bit rigid, but still functional), you are in good shape.
Somehow people have missed the ultra fast runmed() in base R (stats package). It's not adaptive, as far as I understand the original question, but for a rolling median, it's FAST! Comparing here to roll_median() from RcppRoll.
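The vector used for the timing below is not restated in this answer; as a minimal usage sketch (my addition), runmed() can be applied to the question's series, noting that k must be odd:
x_q <- c(120, 105, 118, 140, 142, 141, 135, 152, 154, 138, 125, 132, 131, 120)
runmed(x_q, k = 3)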
> microbenchmark(
+ runmed(x = x, k = 3),
+ roll_median(x, 3),
+ times=1000L
+ )
Unit: microseconds
expr min lq mean median uq max neval
runmed(x = x, k = 3) 41.053 44.854 47.60973 46.755 49.795 117.838 1000
roll_median(x, 3) 101.872 105.293 108.72840 107.574 111.375 178.657 1000
