Method for calculating distance between all points quickly in R

I have some code, using tidyverse functions, that generates a matrix of distances between all points in a data frame. However, it is prohibitively slow. Does anyone know of a faster method to do the same thing?
Example data and working code:
library(tidyverse)
locs <- data.frame(ID = 1:4000, x = runif(4000, 0, 1), y = runif(4000, 0, 1))

df1 <- locs %>%
  mutate(k = 1)

df2 <- df1 %>%
  full_join(df1, by = "k") %>%
  mutate(length = sqrt((x.x - x.y)^2 + (y.x - y.y)^2)) %>%
  select(ID.x, ID.y, length)

dists <- matrix(data = df2$length, nrow = nrow(df1), ncol = nrow(df1))

You could use the base R function dist:
locs <- data.frame(ID = 1:10, x = runif(10, 0, 1), y = runif(10, 0, 1))
dist(locs[, 2:3], upper = TRUE, diag = TRUE)
Output:
1 2 3 4 5 6 7 8 9 10
1 0.00000000 1.10309601 0.98790825 0.54490600 0.42478532 1.06323764 0.31094245 0.52593635 0.44695830 0.85010761
2 1.10309601 0.00000000 0.29292865 0.93412638 0.74551902 0.17160290 0.83557056 0.62393711 0.74218236 0.57669081
3 0.98790825 0.29292865 0.00000000 0.69626767 0.72278486 0.13085561 0.78064096 0.46359296 0.73098652 0.72732431
4 0.54490600 0.93412638 0.69626767 0.00000000 0.65426980 0.81617143 0.59851262 0.36551106 0.68253093 1.00018238
5 0.42478532 0.74551902 0.72278486 0.65426980 0.00000000 0.75537605 0.11384534 0.36844164 0.02911855 0.42844270
6 1.06323764 0.17160290 0.13085561 0.81617143 0.75537605 0.00000000 0.82826619 0.55014297 0.75867851 0.68258388
7 0.31094245 0.83557056 0.78064096 0.59851262 0.11384534 0.82826619 0.00000000 0.37224997 0.13688270 0.54088523
8 0.52593635 0.62393711 0.46359296 0.36551106 0.36844164 0.55014297 0.37224997 0.00000000 0.39086196 0.64185453
9 0.44695830 0.74218236 0.73098652 0.68253093 0.02911855 0.75867851 0.13688270 0.39086196 0.00000000 0.40400339
10 0.85010761 0.57669081 0.72732431 1.00018238 0.42844270 0.68258388 0.54088523 0.64185453 0.40400339 0.00000000
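If you need an ordinary square matrix, as the original code produces, the "dist" object can be converted with as.matrix() (a small sketch of mine, not part of the original answer):
# convert the "dist" object to a plain n x n matrix; dimnames come from the row names of the data
dists <- as.matrix(dist(locs[, 2:3], upper = TRUE, diag = TRUE))
dim(dists)  # 10 x 10 here, 4000 x 4000 for the full data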
Benchmark with 1000 records:
library(dplyr)
library(microbenchmark)

locs <- data.frame(ID = 1:1000, x = runif(1000, 0, 1), y = runif(1000, 0, 1))

f1 <- function() {
  df1 <- locs %>%
    mutate(k = 1)
  df2 <- df1 %>%
    full_join(df1, by = "k") %>%
    mutate(length = sqrt((x.x - x.y)^2 + (y.x - y.y)^2)) %>%
    select(ID.x, ID.y, length)
  dists <- matrix(data = df2$length, nrow = nrow(df1), ncol = nrow(df1))
}

f2 <- function() { dist(locs[, 2:3], upper = TRUE, diag = TRUE) }

microbenchmark(f1())
microbenchmark(f2())
Results:
Unit: milliseconds
expr min lq mean median uq max neval
f1() 81.74188 245.8014 276.4318 259.7682 294.01 567.9409 100
and
Unit: milliseconds
expr min lq mean median uq max neval
f2() 6.956302 7.330661 8.675304 8.11507 8.981121 18.77783 100
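Part of the speedup (a note of mine, not from the original answer) is that dist() runs compiled code and stores only the lower triangle of the distance matrix, i.e. n*(n-1)/2 values instead of n^2:
length(dist(locs[, 2:3]))  # 499500 values for n = 1000, versus 1e6 cells in the full square matrix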
Hope this helps!

Related

Efficient way to find species specific coefficients for a function call

Andrew Robinson shows in icebreakeR how to compute tree volume from diameter and height. He creates a function that uses coefficients depending on species and diameter. A simplified version looks like:
funRobinson <- function(species, diameter, height) {
  bf_params <- data.frame(species  = c("Spruce", "Oak"),
                          b0_small = c(26.729, 29.790),
                          b1_small = c(0.01189, 0.00997),
                          b0_large = c(32.516, 85.150),
                          b1_large = c(0.01181, 0.00841))
  dimensions <- data.frame(diameter   = diameter,
                           height     = height,
                           species    = as.character(species),
                           this_order = 1:length(species))
  dimensions <- merge(y = dimensions, x = bf_params, all.y = TRUE, all.x = FALSE)
  dimensions <- dimensions[order(dimensions$this_order, decreasing = FALSE), ]
  b0 <- with(dimensions, ifelse(diameter <= 20.5, b0_small, b0_large))
  b1 <- with(dimensions, ifelse(diameter <= 20.5, b1_small, b1_large))
  b0 + b1 * dimensions$diameter^2 * dimensions$height
}
To me this method looks straightforward, but it creates an additional data.frame that needs to be sorted and calls ifelse twice to distinguish between small (diameter <= 20.5) and large trees. I'm looking for a more efficient way (lower memory consumption, shorter execution time) to find species-specific coefficients. I would also appreciate being able to add coefficients for other species without editing the function.
Example data-set and Performance:
dat <- data.frame(species  = c("Spruce", "Spruce", "Oak", "Oak", "Fir"),
                  diameter = c(4, 30, 4, 30, 30),
                  height   = c(30, 100, 30, 100, 100))
with(dat, funRobinson(species, diameter, height))
#[1] 32.4362 1095.4160 34.5756 842.0500 NA
library(microbenchmark)
microbenchmark(
  Robinson = with(dat, funRobinson(species, diameter, height))
)
#Unit: milliseconds
# expr min lq mean median uq max neval
# Robinson 1.832604 1.860334 1.948054 1.876155 1.905009 3.054021 100
set.seed(0)
size <- 1e5
dat2 <- data.frame(species  = sample(c("Spruce", "Oak", "Fir"), size = size, replace = TRUE),
                   diameter = runif(size, 1, 50),
                   height   = runif(size, 1, 100))
microbenchmark(
  Robinson = with(dat2, funRobinson(species, diameter, height))
)
#Unit: milliseconds
# expr min lq mean median uq max neval
# Robinson 203.8171 219.9265 234.0798 227.5911 250.6204 278.9918 100
I guess the trick is to avoid the data frame and instead look the values up directly from a vector (or matrix). And since the positions looked up are the same for b0 and b1, the index only needs to be computed once.
Below is a quick attempt; most likely it can be made faster. I basically build two matrices, one per parameter, and pick out the corresponding rows and columns according to each tree's species and diameter class:
f2 <- function(species, diameter, height) {
  species_avail <- c("Spruce", "Oak")
  params_b0 <- cbind(b0_small = c(26.729, 29.790),
                     b0_large = c(32.516, 85.150))
  rownames(params_b0) <- species_avail
  params_b1 <- cbind(b1_small = c(0.01189, 0.00997),
                     b1_large = c(0.01181, 0.00841))
  rownames(params_b1) <- species_avail
  ROWS <- match(species, species_avail)
  COLS <- +(diameter > 20.5) + 1
  idx  <- cbind(ROWS, COLS)
  b0 <- params_b0[idx]
  b1 <- params_b1[idx]
  b0 + b1 * diameter^2 * height
}
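A side note of mine (not from the original answer): subsetting a matrix with a two-column integer matrix returns one element per row of the index, with the first column taken as the row index and the second as the column index; this is what makes params_b0[idx] a single vectorized lookup. A tiny illustration:
m <- matrix(1:6, nrow = 2, dimnames = list(c("Spruce", "Oak"), NULL))
idx <- cbind(c(1, 2, 1), c(3, 1, 2))  # (row, column) pairs
m[idx]                                # picks m[1, 3], m[2, 1], m[1, 2] -> 5 2 3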
Create data:
set.seed(0)
size <- 1e5
dat2 <- data.frame(species  = sample(c("Spruce", "Oak", "Fir"), size = size, replace = TRUE),
                   diameter = runif(size, 1, 50),
                   height   = runif(size, 1, 100))
Check that the function returns the same thing:
identical(
  with(dat2, funRobinson(species, diameter, height)),
  with(dat2, f2(species, diameter, height))
)
[1] TRUE
Test:
microbenchmark(
  Robinson = with(dat2, funRobinson(species, diameter, height)),
  f2       = with(dat2, f2(species, diameter, height))
)
Unit: milliseconds
     expr        min        lq      mean    median        uq       max neval cld
 Robinson 249.677157 275.23375 303.97532 298.72475 329.04318 391.53807   100   b
       f2   9.423471  10.16365  13.86918  10.48073  16.06827  65.19541   100   a
Using the same approach as #StupidWolf, but removing the match() call by using the integer codes of the species factor directly and storing the coefficients sorted by those codes. Keeping the coefficients in the closure's environment avoids setting them up each time the function is called.
funGKiCl <- function(params, speciesLevels) {
  force(params)
  force(speciesLevels)
  nSpecies  <- length(speciesLevels)
  i         <- match(speciesLevels, params$species)
  params_b0 <- c(params$b0_small[i], params$b0_large[i])
  params_b1 <- c(params$b1_small[i], params$b1_large[i])
  rm(i, params, speciesLevels)
  function(species, diameter, height) {
    i <- unclass(species) + (diameter > 20.5) * nSpecies
    params_b0[i] + params_b1[i] * diameter * diameter * height
  }
}
dat <- data.frame(species  = c("Spruce", "Spruce", "Oak", "Oak", "Fir"),
                  diameter = c(4, 30, 4, 30, 30),
                  height   = c(30, 100, 30, 100, 100))
params <- read.table(header = TRUE, text = "
species b0_small b1_small b0_large b1_large
Spruce    26.729  0.01189   32.516  0.01181
Oak       29.790  0.00997   85.150  0.00841")
funGKi <- compiler::cmpfun(funGKiCl(params, levels(dat$species)))
with(dat, funGKi(species, diameter, height))
#[1] 32.4362 1095.4160 34.5756 842.0500 NA
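For context (my addition, not part of the original answer): unclass() on a factor exposes its underlying integer codes, ordered by the factor's levels, which is why the closure can index the pre-sorted coefficient vectors without a match() per call:
sp <- factor(c("Oak", "Spruce", "Oak", "Fir"), levels = c("Fir", "Oak", "Spruce"))
unclass(sp)   # 2 3 2 1 (plus a levels attribute) -- codes follow the order of the levels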
Currently the method from #GKi is the fastest and uses the least memory.
Data:
dat <- data.frame(species  = c("Spruce", "Spruce", "Oak", "Oak", "Fir"),
                  diameter = c(4, 30, 4, 30, 30),
                  height   = c(30, 100, 30, 100, 100))
set.seed(0)
size <- 1e5
dat2 <- data.frame(species  = sample(c("Spruce", "Oak", "Fir"), size = size, replace = TRUE),
                   diameter = runif(size, 1, 50),
                   height   = runif(size, 1, 100))
Methods:
funRobinson <- function(species, diameter, height) {
  bf_params <- data.frame(species  = c("Spruce", "Oak"),
                          b0_small = c(26.729, 29.790),
                          b1_small = c(0.01189, 0.00997),
                          b0_large = c(32.516, 85.150),
                          b1_large = c(0.01181, 0.00841))
  dimensions <- data.frame(diameter   = diameter,
                           height     = height,
                           species    = as.character(species),
                           this_order = 1:length(species))
  dimensions <- merge(y = dimensions, x = bf_params, all.y = TRUE, all.x = FALSE)
  dimensions <- dimensions[order(dimensions$this_order, decreasing = FALSE), ]
  b0 <- with(dimensions, ifelse(diameter <= 20.5, b0_small, b0_large))
  b1 <- with(dimensions, ifelse(diameter <= 20.5, b1_small, b1_large))
  b0 + b1 * dimensions$diameter^2 * dimensions$height
}
with(dat, funRobinson(species, diameter, height))
funStupidWolf <- function(species, diameter, height) {
  species_avail <- c("Spruce", "Oak")
  params_b0 <- cbind(b0_small = c(26.729, 29.790),
                     b0_large = c(32.516, 85.150))
  rownames(params_b0) <- species_avail
  params_b1 <- cbind(b1_small = c(0.01189, 0.00997),
                     b1_large = c(0.01181, 0.00841))
  rownames(params_b1) <- species_avail
  ROWS <- match(species, species_avail)
  COLS <- +(diameter > 20.5) + 1
  idx  <- cbind(ROWS, COLS)
  b0 <- params_b0[idx]
  b1 <- params_b1[idx]
  b0 + b1 * diameter^2 * height
}
with(dat, funStupidWolf(species, diameter, height))
funGKiCl <- function(params, speciesLevels) {
  force(params)
  force(speciesLevels)
  nSpecies  <- length(speciesLevels)
  i         <- match(speciesLevels, params$species)
  params_b0 <- c(params$b0_small[i], params$b0_large[i])
  params_b1 <- c(params$b1_small[i], params$b1_large[i])
  rm(i, params, speciesLevels)
  function(species, diameter, height) {
    i <- unclass(species) + (diameter > 20.5) * nSpecies
    params_b0[i] + params_b1[i] * diameter * diameter * height
  }
}
params <- read.table(header = TRUE, text = "
species b0_small b1_small b0_large b1_large
Spruce    26.729  0.01189   32.516  0.01181
Oak       29.790  0.00997   85.150  0.00841")
funGKi <- compiler::cmpfun(funGKiCl(params, levels(dat$species)))
with(dat, funGKi(species, diameter, height))
rm(funGKiCl, params)
fun <- alist(Robinson   = funRobinson(species, diameter, height),
             StupidWolf = funStupidWolf(species, diameter, height),
             GKi        = funGKi(species, diameter, height))
Time:
library(microbenchmark)
attach(dat)
microbenchmark(list = fun, check = "equal")
#Unit: microseconds
# expr min lq mean median uq max neval
# Robinson 1876.491 1911.583 1997.00924 1934.8835 1962.3145 3131.453 100
# StupidWolf 15.618 17.371 22.30764 18.9995 26.5125 33.239 100
# GKi 2.270 2.965 4.04041 3.6825 5.0415 7.434 100
microbenchmark(list = fun, check = "equal", control=list(order="block"))
#Unit: microseconds
# expr min lq mean median uq max neval
# Robinson 1887.906 1918.0475 2000.55586 1938.847 1962.9540 3131.112 100
# StupidWolf 15.184 16.2775 16.97111 16.668 17.2230 34.646 100
# GKi 2.063 2.1560 2.37552 2.255 2.4015 5.616 100
attach(dat2)
microbenchmark(list = fun, setup = gc(), check = "equal")
#Unit: milliseconds
# expr min lq mean median uq max neval
# Robinson 189.342408 193.222831 193.682868 193.573419 194.181910 198.231698 100
# StupidWolf 6.755601 6.786439 6.836253 6.804451 6.832409 7.370937 100
# GKi 1.756241 1.767335 1.794328 1.782949 1.806370 1.964409 100
library(bench)
attach(dat)
mark(exprs = fun, iterations = 100)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
# <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#1 Robinson 1.87ms 1.9ms 521. 0B 10.6 98 2 188.1ms
#2 StupidWolf 16.48µs 17.46µs 55666. 0B 0 100 0 1.8ms
#3 GKi 2.67µs 2.86µs 337265. 0B 0 100 0 296.5µs
attach(dat2)
mark(exprs = fun, iterations = 100)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
#1 Robinson 188.96ms 216.15ms 4.44 67.71MB 11.4 100 257
#2 StupidWolf 6.79ms 6.85ms 131. 8.77MB 30.0 100 23
#3 GKi 1.7ms 1.72ms 552. 2.29MB 22.1 100 4
#Some expressions had a GC in every iteration; so filtering is disabled.
Memory:
memUse <- function(list, setup = "", gctort = FALSE) {
  as.data.frame(lapply(list, function(z) {
    eval(setup)
    # record the gc memory statistics (in Mb) before evaluating the expression
    tt <- sum(.Internal(gc(FALSE, TRUE, TRUE))[13:14])
    gctorture(on = gctort)
    eval(z)
    gctorture(on = FALSE)
    # memory used by the evaluation, relative to the starting point
    sum(.Internal(gc(FALSE, FALSE, TRUE))[13:14]) - tt
  }))
}
attach(dat)
memUse(list=fun, gctort = FALSE)
# Robinson StupidWolf GKi
#1 0.9 0 0
memUse(list=fun, gctort = TRUE)
# Robinson StupidWolf GKi
#1 0 0 0
attach(dat2)
memUse(list=fun, gctort = FALSE)
# Robinson StupidWolf GKi
#1 71.9 8.8 2.3
memUse(list=fun, gctort = TRUE)
# Robinson StupidWolf GKi
#1 29.7 6.5 2.3
object.size(funRobinson)
#109784 bytes
object.size(funStupidWolf)
#68240 bytes
object.size(funGKi)
#21976 bytes

Fastest way to map multiple character columns to numerical values

I have an algorithm that at each iteration calculates means for certain groups (the groups do not change, only their values do).
The table of values:
d1 <- data.frame(x  = sample(LETTERS, N, replace = TRUE),
                 y1 = rnorm(N))
head(d1)
# x y1
# 1 H -0.7852538
# 2 G -0.6739159
# 3 V -1.7783771
# 4 L -0.2849846
# 5 I -0.1760284
# 6 V -0.2785826
I can calculate the means (in several ways: dplyr, data.table and tapply). I have another data.frame consisting of two columns with the group names.
d2 <- data.frame('group.high' = sample(LETTERS, N * 2, replace = TRUE),
                 'group.low'  = sample(LETTERS, N * 2, replace = TRUE))
head(d2)
# group.high group.low
# 1 U L
# 2 K J
# 3 C Q
# 4 Q A
# 5 Q U
# 6 K W
I want to add two columns, mean.high and mean.low, containing the mean value of each group based on d1.
So far I have tried two options, one with dplyr and one with data.table. I had to join twice in both of them, and they are similar in speed.
microbenchmark(
  dplyr = {
    means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
    ### Solution 1
    dplyr.d2 <- left_join(d2, data.frame('group.high' = names(means),
                                         'mean.high'  = means, stringsAsFactors = FALSE)) %>%
      left_join(., data.frame('group.low' = names(means),
                              'mean.low'  = means, stringsAsFactors = FALSE))
  },
  data.table = {
    ### Solution 2
    d1 <- as.data.table(d1)
    d2 <- as.data.table(d2)
    means <- d1[, .(means = mean(y1)), by = x]
    new.d2 <- data.table::merge.data.table(x = d2, y = means, by.x = 'group.high', by.y = 'x')
    data.table.d2 <- data.table::merge.data.table(x = new.d2, y = means, by.x = 'group.low', by.y = 'x')
  }
)
Unit: milliseconds
expr min lq mean median uq max neval cld
dplyr 34.0837 36.88650 53.22239 42.9227 47.50660 231.5066 100 a
data.table 40.2071 47.70735 87.46804 51.2517 59.05385 258.4999 100 b
Is there a better way? How can I speed up the calculation?
As mentioned in the comments, there is an iterative process of updating the values. Here is an example.
N <- 10000

iterFuncDplyr <- function(d1, d2) {
  dplyr.d2 <- left_join(d2, data.frame('group.high' = names(means),
                                       'mean.high'  = means, stringsAsFactors = FALSE)) %>%
    left_join(., data.frame('group.low' = names(means),
                            'mean.low'  = means, stringsAsFactors = FALSE))
  return(var(d1$y1))
}

iterFuncData <- function(d1, d2) {
  means <- d1[, .(means = mean(y1)), by = x]
  new.d2 <- data.table:::merge.data.table(x = d2, y = means, by.x = 'group.high', by.y = 'x')
  data.table.d2 <- data.table:::merge.data.table(x = new.d2, y = means, by.x = 'group.low', by.y = 'x')
  return(var(d1$y1))
}

d1 <- data.frame(x  = sample(LETTERS, N, replace = TRUE),
                 y1 = rnorm(N))
d2 <- data.frame('group.high' = sample(LETTERS, N * 2, replace = TRUE),
                 'group.low'  = sample(LETTERS, N * 2, replace = TRUE))
library(data.table)
library(dplyr)
microbenchmark::microbenchmark(
  dplyr = {
    temp.val <- 0
    for (i in 1:10) {
      d1$y1 <- temp.val + rnorm(N)
      temp.val <- iterFuncDplyr(d1, d2)
    }
  },
  data.table = {
    d1 <- as.data.table(d1)
    d2 <- as.data.table(d2)
    temp.val <- 0
    for (i in 1:10) {
      d1$y1 <- temp.val + rnorm(N)
      temp.val <- iterFuncData(d1, d2)
    }
  }
)
Unit: milliseconds
expr min lq mean median uq max neval
dplyr 46.22904 50.67959 52.78275 51.96358 53.34825 108.2874 100
data.table 63.81111 67.13257 70.85537 69.85712 72.72446 127.4228 100
You could subset the named vector means to create new columns and match your output:
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
d2$mean.high <- means[d2$group.high]
d2$mean.low <- means[d2$group.low]
identical(as.matrix(d2), as.matrix(d3)) #factor vs character, used d3 w/ benchmark
[1] TRUE
Unit: microseconds
expr min lq mean median uq max neval
dplyr 4868.2 5316.25 5787.123 5524.15 5892.70 12187.3 100
data.table 8254.4 9606.60 10438.424 10118.35 10771.75 20966.5 100
subset 481.2 529.40 651.194 550.35 582.55 7849.9 100
Benchmark code:
d3 <- d2
microbenchmark::microbenchmark( # N = 10000
  dplyr = {
    means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
    ### Solution 1
    dplyr.d2 <- left_join(d2, data.frame('group.high' = names(means),
                                         'mean.high'  = means, stringsAsFactors = FALSE)) %>%
      left_join(., data.frame('group.low' = names(means),
                              'mean.low'  = means, stringsAsFactors = FALSE))
  },
  data.table = {
    ### Solution 2
    d1 <- as.data.table(d1)
    d2 <- as.data.table(d2)
    means <- d1[, .(means = mean(y1)), by = x]
    new.d2 <- data.table::merge.data.table(x = d2, y = means, by.x = 'group.high', by.y = 'x')
    data.table.d2 <- data.table::merge.data.table(x = new.d2, y = means, by.x = 'group.low', by.y = 'x')
  },
  subset = {
    means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
    d3$mean.high <- means[d2$group.high]
    d3$mean.low  <- means[d2$group.low]
  }
)
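A side note of mine (not part of the original answer): the speed of the subset approach comes from the fact that subsetting a named vector by a character vector does one name lookup per element, entirely in compiled code, with no join or intermediate data frame. For example:
means <- c(A = 0.1, B = -0.2, C = 0.05)
keys  <- c("C", "A", "A", "B")
means[keys]   # returns 0.05 0.10 0.10 -0.20, named and in the order of keys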
Here is an answer very similar to Andrew's, but relying on data.table instead of tapply() (the data.table aggregation seems faster for very large N).
library(data.table)
# Create a named vector "means"
means <- setDT(d1)[, mean(y1), by = x][, setNames(V1, x)]
setDT(d2)[, c("mean.high.means", "mean.low.means") :=
            .(means[as.character(group.high)], means[as.character(group.low)])]
Output:
group.high group.low mean.high.means mean.low.means
1: Z W 0.017032792 0.0091625547
2: A A 0.013796137 0.0137961371
3: V S -0.011570159 0.0004560325
4: D X 0.005475629 0.0200984250
5: U H -0.008249901 0.0054537833
---
199996: H K 0.005453783 0.0079905631
199997: A T 0.013796137 -0.0068537963
199998: W U 0.009162555 -0.0082499015
199999: T V -0.006853796 -0.0115701585
200000: G J 0.014829259 0.0206598470
Reproducible data:
N = 1e5
set.seed(1)
d1 <- data.frame(
x = sample(LETTERS, N, replace = TRUE),
y1 = rnorm(N)
)
d2 <- data.frame(
group.high = sample(LETTERS, N * 2, replace = TRUE),
group.low = sample(LETTERS, N * 2, replace = TRUE)
)

create data frame containing rows that add up to 100

This is my first stab at this:
library(dplyr)
step_size <- 5
grid <- expand.grid(
  x1 = seq(0, 100, step_size)
  , x2 = seq(0, 100, step_size)
  , x3 = seq(0, 100, step_size)
)
grid$sum <- grid$x1 + grid$x2 + grid$x3
grid$x1  <- (grid$x1 / grid$sum) * 100
grid$x2  <- (grid$x2 / grid$sum) * 100
grid$x3  <- (grid$x3 / grid$sum) * 100
grid$sum <- grid$x1 + grid$x2 + grid$x3
nrow(grid)
result <- distinct(grid) %>% filter(!is.na(sum))
head(result, 20)
nrow(result)
Basically, I want to create a data frame that contains as many rows as possible that add up to 100 and are uniformly distributed.
Is there an easier or better approach in R? Thanks!
Using data.table...
library(data.table)
grid <- expand.grid(
  x1 = seq(0, 100)
  , x2 = seq(0, 100)
  , x3 = seq(0, 100)
)
setDT(grid)
res <- grid[grid[, rowSums(.SD) == 100], ]
res[, summation := rowSums(.SD)]
Result:
> res[, unique(summation)]
[1] 100
This can also be done in base R, but data.table is faster:
library(data.table)
grid <- expand.grid(
  x1 = seq(0, 100)
  , x2 = seq(0, 100)
  , x3 = seq(0, 100)
)
grid2 <- expand.grid(
  x1 = seq(0, 100)
  , x2 = seq(0, 100)
  , x3 = seq(0, 100)
)
setDT(grid)
microbenchmark::microbenchmark(
  data.table = {
    res <- grid[grid[, rowSums(.SD) == 100], ]
  },
  base = {
    res2 <- grid2[rowSums(grid2) == 100, ]
  }
)
Unit: milliseconds
expr min lq mean median uq max neval cld
data.table 59.41157 89.6700 109.0462 107.7415 124.2675 183.9730 100 a
base 65.70521 109.6471 154.1312 125.4238 156.9168 611.0169 100 b
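A note of mine, not from the original answers: since x3 is fully determined once x1 and x2 are fixed, the same set of rows can be built without enumerating all 101^3 combinations, for example:
# expand only x1 and x2, then derive x3 so that every row sums to 100
grid3 <- expand.grid(x1 = 0:100, x2 = 0:100)
grid3 <- grid3[grid3$x1 + grid3$x2 <= 100, ]
grid3$x3 <- 100 - grid3$x1 - grid3$x2
nrow(grid3)                 # 5151 rows
all(rowSums(grid3) == 100)  # TRUE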
Here's a simple function. You can specify how many rows/columns you want, and what each row sums to.
func <- function(cols = 3, rows = 10, rowTotal = 100) {
  dt1 <- replicate(n = cols, runif(n = rows))
  dt1 <- data.frame(apply(X = dt1, MARGIN = 2, FUN = function(x) x / rowSums(dt1) * rowTotal))
  return(dt1)
}
rowSums(func())                                    # default values (3 cols, 10 rows, each row sums to 100)
rowSums(func(cols = 5, rows = 10, rowTotal = 50))  # 5 cols, 10 rows, each row sums to 50

Discrepancies in Benchmark and Processing Time Results

I've been trying to do a little testing of the most efficient ways to replace NAs in data frames.
I started with a comparison of NA-to-0 replacement solutions on a 1-million-row, 12-column dataset.
Throwing all the pipe-capable ones into microbenchmark, I got the following results.
Question 1: Is there a way to test the subset left-assignment statement (e.g. df1[is.na(df1)] <- 0) inside the benchmark function?
library(dplyr)
library(tidyr)
library(microbenchmark)

set.seed(24)
df1 <- as.data.frame(matrix(sample(c(NA, 1:5), 1e6 * 12, replace = TRUE),
                            dimnames = list(NULL, paste0("var", 1:12)), ncol = 12))

op <- microbenchmark(
  mut_all_ifelse  = df1 %>% mutate_all(funs(ifelse(is.na(.), 0, .))),
  mut_at_ifelse   = df1 %>% mutate_at(funs(ifelse(is.na(.), 0, .)), .cols = c(1:12)),
  # df1[is.na(df1)] <- 0 would sit here, but I can't make it work inside this function
  replace         = df1 %>% replace(., is.na(.), 0),
  mut_all_replace = df1 %>% mutate_all(funs(replace(., is.na(.), 0))),
  mut_at_replace  = df1 %>% mutate_at(funs(replace(., is.na(.), 0)), .cols = c(1:12)),
  replace_na      = df1 %>% replace_na(list(var1 = 0, var2 = 0, var3 = 0, var4 = 0,
                                            var5 = 0, var6 = 0, var7 = 0, var8 = 0,
                                            var9 = 0, var10 = 0, var11 = 0, var12 = 0)),
  times = 1000L
)
print(op) # standard data frame of the output
Unit: milliseconds
expr min lq mean median uq max neval
mut_all_ifelse 769.87848 844.5565 871.2476 856.0941 895.4545 1274.5610 1000
mut_at_ifelse 713.48399 847.0322 875.9433 861.3224 899.7102 1006.6767 1000
replace 258.85697 311.9708 334.2291 317.3889 360.6112 455.7596 1000
mut_all_replace 96.81479 164.1745 160.6151 167.5426 170.5497 219.5013 1000
mut_at_replace 96.23975 166.0804 161.9302 169.3984 172.7442 219.0359 1000
replace_na 103.04600 161.2746 156.7804 165.1649 168.3683 210.9531 1000
boxplot(op) #boxplot of output
library(ggplot2) #nice log plot of the output
qplot(y=time, data=op, colour=expr) + scale_y_log10()
To test the subset-assignment operator, I had originally run these tests.
set.seed(24)
> Book1 <- as.data.frame(matrix(sample(c(NA, 1:5), 1e8 *12, replace=TRUE),
+ dimnames = list(NULL, paste0("var", 1:12)), ncol=12))
> system.time({
+ Book1 %>% mutate_all(funs(ifelse(is.na(.), 0, .))) })
user system elapsed
52.79 24.66 77.45
>
> system.time({
+ Book1 %>% mutate_at(funs(ifelse(is.na(.), 0, .)), .cols = c(1:12)) })
user system elapsed
52.74 25.16 77.91
>
> system.time({
+ Book1[is.na(Book1)] <- 0 })
user system elapsed
16.65 7.86 24.51
>
> system.time({
+ Book1 %>% replace_na(list(var1 = 0, var2 = 0, var3 = 0, var4 = 0, var5 = 0, var6 = 0, var7 = 0, var8 = 0, var9 = 0,var10 = 0, var11 = 0, var12 = 0)) })
user system elapsed
3.54 2.13 5.68
>
> system.time({
+ Book1 %>% mutate_at(funs(replace(., is.na(.), 0)), .cols = c(1:12)) })
user system elapsed
3.37 2.26 5.63
>
> system.time({
+ Book1 %>% mutate_all(funs(replace(., is.na(.), 0))) })
user system elapsed
3.33 2.26 5.58
>
> system.time({
+ Book1 %>% replace(., is.na(.), 0) })
user system elapsed
3.42 1.09 4.51
In these tests, the base replace() comes in first.
In the benchmarking trials, replace() falls farther back in the ranks, while the tidyr replace_na() wins (by a nose).
Running the singular tests repeatedly, and on data frames of different shapes and sizes, always finds the base replace() in the lead.
Question 2: How could its benchmark performance be the only result that falls so far out of line with the simple test results?
More perplexingly -
Question 3: How can all the mutate_all/_at(replace()) variants work faster than the simple replace()?
Many folks report this: http://datascience.la/dplyr-and-a-very-basic-benchmark/ (and all the links in that article), but I still haven't found an explanation for why, beyond the fact that hashing and C++ are used.
with special thanks already to Tyler Rinker: https://www.r-bloggers.com/microbenchmarking-with-r/
and akrun: https://stackoverflow.com/a/41530071/5088194
You can include a complex/multi-line statement in microbenchmark by wrapping it in {}, which, basically, converts it to a single expression:
microbenchmark(expr1 = { df1[is.na(df1)] = 0 },
               exp2  = { tmp = 1:10; tmp[3] = 0L; tmp2 = tmp + 12L; tmp2 ^ 2 },
               times = 10)
#Unit: microseconds
# expr min lq mean median uq max neval cld
# expr1 124953.716 137244.114 158576.030 142405.685 156744.076 284779.353 10 b
# exp2 2.784 3.132 17.748 23.142 24.012 38.976 10 a
Worth noting the side-effects of this:
tmp
#[1] 1 2 0 4 5 6 7 8 9 10
in contrast to, say, something like:
rm(tmp)
microbenchmark(expr1 = { df1[is.na(df1)] = 0 },
               exp2  = local({ tmp = 1:10; tmp[3] = 0L; tmp2 = tmp + 12L; tmp2 ^ 2 }),
               times = 10)
#Unit: microseconds
# expr min lq mean median uq max neval cld
# expr1 127250.18 132935.149 165296.3030 154509.553 169917.705 314820.306 10 b
# exp2 10.44 12.181 42.5956 54.636 57.072 97.789 10 a
tmp
#Error: object 'tmp' not found
Noticing the side effects a benchmark has, we see that the first operation to remove the NA values leaves a fairly light job for the alternatives that follow it:
# re-assign because we changed it before
set.seed(24)
df1 <- as.data.frame(matrix(sample(c(NA, 1:5), 1e6 * 12, TRUE),
                            dimnames = list(NULL, paste0("var", 1:12)), ncol = 12))
unique(sapply(df1, typeof))
#[1] "integer"
any(sapply(df1, anyNA))
#[1] TRUE
system.time({ df1[is.na(df1)] <- 0 })
# user system elapsed
# 0.39 0.14 0.53
The previous benchmark leaves us with:
unique(sapply(df1, typeof))
#[1] "double"
any(sapply(df1, anyNA))
#[1] FALSE
And replacing NAs when there are none is (or, at least, should be) recognized as doing nothing to the input.
Aside from that, note that in all your alternatives you sub-assign a "double" (typeof(0)) into "integer" column vectors (sapply(df1, typeof)). While I don't think there is any case (in the above alternatives) where df1 gets modified in place (since, after the creation of a "data.frame", there is stored info that forces its column vectors to be copied on modification), there is still a minor but avoidable overhead in coercing to "double" and storing as "double". Before replacing elements in an "integer" vector, R will allocate and copy (in case of an "integer" replacement) or allocate and coerce (in case of a "double" replacement). Also, after the first coercion (from a benchmark side effect, as noted above), R operates on "double"s, and manipulating those is slower than manipulating "integer"s. I can't find a straightforward R way to investigate that difference, but in a nutshell (at the risk of not being totally accurate) we can simulate these operations by:
# simulate R's copying of int to int:
# allocate a new integer vector and copy
int2int <- inline::cfunction(sig = c(x = "integer"), body = '
  SEXP ans = PROTECT(allocVector(INTSXP, LENGTH(x)));
  memcpy(INTEGER(ans), INTEGER(x), LENGTH(x) * sizeof(int));
  UNPROTECT(1);
  return(ans);
')
# simulate R's coercing of int to double:
# 'coerceVector', internally, allocates a double vector and coerces to populate it
int2dbl <- inline::cfunction(sig = c(x = "integer"), body = '
  SEXP ans = PROTECT(coerceVector(x, REALSXP));
  UNPROTECT(1);
  return(ans);
')
# simulate R's copying from double to double:
# allocate a new double vector and copy
dbl2dbl <- inline::cfunction(sig = c(x = "double"), body = '
  SEXP ans = PROTECT(allocVector(REALSXP, LENGTH(x)));
  memcpy(REAL(ans), REAL(x), LENGTH(x) * sizeof(double));
  UNPROTECT(1);
  return(ans);
')
And on a benchmark:
x.int = 1:1e7; x.dbl = as.numeric(x.int)
microbenchmark(int2int(x.int), int2dbl(x.int), dbl2dbl(x.dbl), times = 50)
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# int2int(x.int) 16.42710 16.91048 21.93023 17.42709 19.38547 54.36562 50 a
# int2dbl(x.int) 35.94064 36.61367 47.15685 37.40329 63.61169 78.70038 50 b
# dbl2dbl(x.dbl) 33.51193 34.18427 45.30098 35.33685 63.45788 75.46987 50 b
To conclude the whole previous note: replacing 0 with 0L should save some time.
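To make that concrete (my addition): the literal 0 is a "double" while 0L is an "integer", so replacing with 0L avoids coercing the integer columns:
typeof(0)                     # "double"
typeof(0L)                    # "integer"
typeof(replace(1:5, 2, 0))    # "double"  -- the whole vector gets coerced
typeof(replace(1:5, 2, 0L))   # "integer" -- stays integer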
Finally, to replicate the benchmark in a fairer way, we could use:
library(dplyr)
library(tidyr)
library(microbenchmark)

set.seed(24)
df1 <- as.data.frame(matrix(sample(c(NA, 1:5), 1e6 * 12, TRUE),
                            dimnames = list(NULL, paste0("var", 1:12)), ncol = 12))
Wrap in functions:
stopifnot(ncol(df1) == 12) # some of the alternatives are hardcoded to 12 columns

mut_all_ifelse  <- function(x, val) x %>% mutate_all(funs(ifelse(is.na(.), val, .)))
mut_at_ifelse   <- function(x, val) x %>% mutate_at(funs(ifelse(is.na(.), val, .)), .cols = c(1:12))
baseAssign      <- function(x, val) { x[is.na(x)] <- val; x }
baseFor         <- function(x, val) { for (j in 1:ncol(x)) x[[j]][is.na(x[[j]])] <- val; x }
base_replace    <- function(x, val) x %>% replace(., is.na(.), val)
mut_all_replace <- function(x, val) x %>% mutate_all(funs(replace(., is.na(.), val)))
mut_at_replace  <- function(x, val) x %>% mutate_at(funs(replace(., is.na(.), val)), .cols = c(1:12))
myreplace_na    <- function(x, val) x %>% replace_na(list(var1 = val, var2 = val, var3 = val, var4 = val,
                                                          var5 = val, var6 = val, var7 = val, var8 = val,
                                                          var9 = val, var10 = val, var11 = val, var12 = val))
Test for equality of results before the benchmarks:
identical(mut_all_ifelse(df1, 0), mut_at_ifelse(df1, 0))
#[1] TRUE
identical(mut_at_ifelse(df1, 0), baseAssign(df1, 0))
#[1] TRUE
identical(baseAssign(df1, 0), baseFor(df1, 0))
#[1] TRUE
identical(baseFor(df1, 0), base_replace(df1, 0))
#[1] TRUE
identical(base_replace(df1, 0), mut_all_replace(df1, 0))
#[1] TRUE
identical(mut_all_replace(df1, 0), mut_at_replace(df1, 0))
#[1] TRUE
identical(mut_at_replace(df1, 0), myreplace_na(df1, 0))
#[1] TRUE
Test with coercing to "double":
benchnum <- microbenchmark(mut_all_ifelse(df1, 0),
                           mut_at_ifelse(df1, 0),
                           baseAssign(df1, 0),
                           baseFor(df1, 0),
                           base_replace(df1, 0),
                           mut_all_replace(df1, 0),
                           mut_at_replace(df1, 0),
                           myreplace_na(df1, 0),
                           times = 10)
benchnum
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# mut_all_ifelse(df1, 0) 1368.5091 1441.9939 1497.5236 1509.2233 1550.1416 1629.6959 10 c
# mut_at_ifelse(df1, 0) 1366.1674 1389.2256 1458.1723 1464.5962 1503.4337 1553.7110 10 c
# baseAssign(df1, 0) 532.4975 548.9444 586.8198 564.3940 655.8083 667.8634 10 b
# baseFor(df1, 0) 169.6048 175.9395 206.7038 189.5428 197.6472 308.6965 10 a
# base_replace(df1, 0) 518.7733 547.8381 597.8842 601.1544 643.4970 666.6872 10 b
# mut_all_replace(df1, 0) 169.1970 183.5514 227.1978 194.0903 291.6625 346.4649 10 a
# mut_at_replace(df1, 0) 176.7904 186.4471 227.3599 202.9000 303.4643 309.2279 10 a
# myreplace_na(df1, 0) 172.4926 177.8518 199.1469 186.3645 192.1728 297.0419 10 a
Test without coercing to "double":
benchint <- microbenchmark(mut_all_ifelse(df1, 0L),
                           mut_at_ifelse(df1, 0L),
                           baseAssign(df1, 0L),
                           baseFor(df1, 0L),
                           base_replace(df1, 0L),
                           mut_all_replace(df1, 0L),
                           mut_at_replace(df1, 0L),
                           myreplace_na(df1, 0L),
                           times = 10)
benchint
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# mut_all_ifelse(df1, 0L) 1291.17494 1313.1910 1377.9265 1353.2812 1417.4389 1554.6110 10 c
# mut_at_ifelse(df1, 0L) 1295.34053 1315.0308 1372.0728 1353.0445 1431.3687 1478.8613 10 c
# baseAssign(df1, 0L) 451.13038 461.9731 477.3161 471.0833 484.9318 528.4976 10 b
# baseFor(df1, 0L) 98.15092 102.4996 115.7392 107.9778 136.2227 139.7473 10 a
# base_replace(df1, 0L) 428.54747 451.3924 471.5011 470.0568 497.7088 516.1852 10 b
# mut_all_replace(df1, 0L) 101.66505 102.2316 137.8128 130.5731 161.2096 243.7495 10 a
# mut_at_replace(df1, 0L) 103.79796 107.2533 119.1180 112.1164 127.7959 166.9113 10 a
# myreplace_na(df1, 0L) 100.03431 101.6999 120.4402 121.5248 137.1710 141.3913 10 a
And a simple way to visualize:
boxplot(benchnum, ylim = range(min(summary(benchint)$min, summary(benchnum)$min),
                               max(summary(benchint)$max, summary(benchnum)$max)))
boxplot(benchint, add = TRUE, border = "red", axes = FALSE)
legend("topright", c("coerce", "not coerce"), fill = c("black", "red"))
Note that df1 is unchanged after all this (str(df1)).

Filling a dataframe with a 0 and 1 vector [duplicate]

This question already has answers here: Create binary column (0/1) based on condition in another column (2 answers). Closed 8 years ago.
I have a data frame with two columns (A and B). Column A is categorical and column B is numeric (ranging from 0.0 to 1.0). I want to create a column C whose values are 1 when the value in column B is greater than or equal to 0.5, and 0 when the value in column B is less than 0.5. Any suggestions on how to do this? The final df should look like this:
A = c('spA', 'spB', 'spC', 'spD')
B = c(0.25, 0.15, 0.50, 0.75)
C = c(0,0,1,1)
df = data.frame(A, B, C)
Just use
A = c('spA', 'spB', 'spC', 'spD')
B = c(0.25, 0.15, 0.50, 0.75)
df = data.frame(A, B)
df$C <- as.numeric(df$B >= 0.5)
@David Arenburg: speed comparison of all 3 solutions pointed out above.
To be honest, I don't know why it is that much faster.
require(microbenchmark)
microbenchmark(
  df$C <- ifelse(df$B >= 0.5, 1, 0),
  transform(df, C = as.numeric(B >= 0.5)),
  df$C <- as.numeric(df$B >= 0.5)
)
Result:
Unit: microseconds
expr min lq median uq max neval
df$C <- ifelse(df$B >= 0.5, 1, 0) 33.585 35.7580 38.1285 41.6845 140.66 100
transform(df, C = as.numeric(B >= 0.5)) 143.821 149.7470 155.0815 164.5640 284.48 100
df$C <- as.numeric(df$B >= 0.5) 20.546 22.9165 24.2995 27.2630 53.34 100
EDIT: Larger dataset
df <- data.frame(B = runif(100000))
require(microbenchmark)
microbenchmark(
  df$C <- ifelse(df$B >= 0.5, 1, 0),
  transform(df, C = as.numeric(B >= 0.5)),
  df$C <- as.numeric(df$B >= 0.5)
)
Unit: microseconds
expr min lq median uq max neval
df$C <- ifelse(df$B >= 0.5, 1, 0) 31620.826 33623.452 34529.8380 55652.9290 62707.064 100
transform(df, C = as.numeric(B >= 0.5)) 811.561 979.286 1032.6255 1248.5550 2333.137 100
df$C <- as.numeric(df$B >= 0.5) 606.498 764.542 808.0045 979.0875 23805.112 100
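A small addition of mine (not part of the original answers): if an integer 0/1 column is preferred over a numeric one, as.integer() or the unary + give the same comparison-based conversion:
df$C <- as.integer(df$B >= 0.5)   # integer 0 / 1
df$C <- +(df$B >= 0.5)            # same idea, unary plus coerces the logical to integer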
