Filling a dataframe with a 0 and 1 vector [duplicate] - r

This question already has answers here:
Create binary column (0/1) based on condition in another column
(2 answers)
Closed 8 years ago.
I have a datarame with two columns (A and B). Column A is categorical B is numeric (ranging from 0.0 to 1.0). I want to create a column C for which the values are 1 when the value in Column B is greater than or equal to 0.5 and 0 when the value in column B is less than 0.5. Any suggestions on how to do this? The final df should look like this:
A = c('spA', 'spB', 'spC', 'spD')
B = c(0.25, 0.15, 0.50, 0.75)
C = c(0,0,1,1)
df = data.frame(A, B, C)

Just use
A = c('spA', 'spB', 'spC', 'spD')
B = c(0.25, 0.15, 0.50, 0.75)
df = data.frame(A, B)
df$C <- as.numeric(df$B >= 0.5)
#David Arenburg: Speed comparison of all 3 solutions pointed our above
To be honest i dont know why it is that much faster.
require(microbenchmark)
microbenchmark(
df$C <- ifelse(df$B>=0.5, 1, 0),
transform(df, C = as.numeric(B >= 0.5)),
df$C <- as.numeric(df$B>=0.5)
)
Result:
Unit: microseconds
expr min lq median uq max neval
df$C <- ifelse(df$B >= 0.5, 1, 0) 33.585 35.7580 38.1285 41.6845 140.66 100
transform(df, C = as.numeric(B >= 0.5)) 143.821 149.7470 155.0815 164.5640 284.48 100
df$C <- as.numeric(df$B >= 0.5) 20.546 22.9165 24.2995 27.2630 53.34 100
EDIT: Lager Dataset
df <- data.frame(B=runif(100000))
require(microbenchmark)
microbenchmark(
df$C <- ifelse(df$B>=0.5, 1, 0),
transform(df, C = as.numeric(B >= 0.5)),
df$C <- as.numeric(df$B>=0.5)
)
Unit: microseconds
expr min lq median uq max neval
df$C <- ifelse(df$B >= 0.5, 1, 0) 31620.826 33623.452 34529.8380 55652.9290 62707.064 100
transform(df, C = as.numeric(B >= 0.5)) 811.561 979.286 1032.6255 1248.5550 2333.137 100
df$C <- as.numeric(df$B >= 0.5) 606.498 764.542 808.0045 979.0875 23805.112 100

Related

Plotting CDF and PDF in R with custom function

I was wondering if there was any way to plot this PDF and CDF in R. I found these on a different question a user asked, and was curious.
I know that I have to create a function and then plot this, but I'm struggling with the different parameters and am unsure how to translate this to R. I have only ever plotted PDF/CDF using a normal distribution, or from datasets.
You can write the pdf and cdf as functions, using ifelse to specify the values within the ranges, then simply plot the functions over the desired range:
pdf <- function(x) {
ifelse(x >= 0 & x < 1, 0.5, ifelse(x < 1.5 & x >= 1, 1, 0))
}
cdf <- function(x) {
ifelse(x < 0, 0,
ifelse(x >= 0 & x < 1, 0.5 * x,
ifelse(x < 1.5 & x >= 1, x - 0.5, 1)))
}
plot(pdf, xlim = c(-1, 2), type = "s")
plot(cdf, xlim = c(-1, 2))
Created on 2022-10-27 with reprex v2.0.2
ifelse can be very slow, we can fill an output vector instead. numeric() creates a vector of zeroes of a specified length, we then simply change everything that should not yield zero.
pdf_vec <- function(x) {
out <- numeric(length(x))
out[x >= 0 & x < 1] <- .5
out[x >= 1 & x < 1.5] <- 1
out
}
cdf_vec <- function(x) {
out <- numeric(length(x))
tmp <- x >= 0 & x < 1
out[tmp] <- .5*x[tmp]
tmp <- x >= 1 & x < 1.5
out[tmp] <- x[tmp] - .5
tmp <- x >= 1.5
out[tmp] <- 1
out
}
set.seed(42)
x <- rnorm(1e6)
stopifnot(all.equal(cdf(x), cdf1(x)))
stopifnot(all.equal(pdf(x), pdf1(x)))
#Allan Camero already showed nicely how to plot it.
Microbenchmark
It's about three times faster than the ifelse solution.
microbenchmark::microbenchmark(
cdf_ifelse=cdf(x), cdf_vec=cdf1(x), check='equal'
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# cdf_ifelse 110.66285 121.25428 133.0789 133.86041 142.53296 167.64401 100 b
# cdf_vec 43.56277 45.08759 48.8924 46.83869 49.46047 74.95487 100 a
microbenchmark::microbenchmark(
pdf_ifelse=pdf(x), pdf_vec=pdf1(x), check='equal'
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# pdf_ifelse 71.39501 76.71747 101.17738 81.43037 87.82162 192.3919 100 b
# pdf_vec 27.82642 30.02056 34.55301 30.38457 31.29751 133.2798 100 a
We can try the code below
f <- function(x) 0.5 * (x >= 0) + 0.5 * (x >= 1) - (x >= 1.5)
F <- Vectorize(function(x) integrate(f, -Inf, x)$value)
plot(f, -2, 2, col = "red")
curve(F, -2, 2, add = TRUE, col = "blue")
legend(-1, 0.8,
legend = c("pdf", "cdf"),
col = c("red", "blue"),
lty = 1:2, cex = 2,
box.col = "white"
)

Fastest way to map multiple character columns to numerical values

I have an algorithm that at each iteration calculates means for certain groups (the groups do not change only their values).
The table of the values -
d1 <- data.frame(x = sample(LETTERS, N, replace = TRUE),
y1=rnorm(N))
head(d1)
# x y1
# 1 H -0.7852538
# 2 G -0.6739159
# 3 V -1.7783771
# 4 L -0.2849846
# 5 I -0.1760284
# 6 V -0.2785826
I can calculate the means (in several ways: dplyr, data.table and tapply). I have another data.frame consisting of two columns with the group names.
d2 <- data.frame('group.high' = sample(LETTERS, N * 2, replace = TRUE),
'group.low' = sample(LETTERS, N * 2, replace = TRUE))
head(d2)
# group.high group.low
# 1 U L
# 2 K J
# 3 C Q
# 4 Q A
# 5 Q U
# 6 K W
I want to add to columns, mean.high and mean.better, of the mean values of each group based on d1.
So far I have tried two options from dplyr and data.table. I had to use left_join twice in either of them. They are both similar in speed.
microbenchmark(
dplyr = {
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
### Solution 1
dplyr.d2 <- left_join(d2,data.frame('group.high' = names(means),
'mean.high' = means, stringsAsFactors = FALSE) ) %>%
left_join(., data.frame('group.low' = names(means),
'mean.low' = means, stringsAsFactors = FALSE))},
data.table = {
### Solution 2
d1 <- as.data.table(d1)
d2 <- as.data.table(d2)
means <- d1[ ,.(means = mean(y1)), by = x]
new.d2 <- data.table::merge.data.table(x = d2, y = means, by.x = 'group.high', by.y = 'x')
data.table.d2 <- data.table::merge.data.table(x = new.d2, y = means, by.x = 'group.low', by.y = 'x')
}
)
Unit: milliseconds
expr min lq mean median uq max neval cld
dplyr 34.0837 36.88650 53.22239 42.9227 47.50660 231.5066 100 a
data.table 40.2071 47.70735 87.46804 51.2517 59.05385 258.4999 100 b
Is there a better way? How can I speed the calculation?
As mentioned in the comments, there is an iterative process of updating the values. Here is an example.
N <- 10000
iterFuncDplyr <- function(d1, d2) {
dplyr.d2 <- left_join(d2,data.frame('group.high' = names(means),
'mean.high' = means, stringsAsFactors = FALSE) ) %>%
left_join(., data.frame('group.low' = names(means),
'mean.low' = means, stringsAsFactors = FALSE))
return(var(d1$y1))
}
iterFuncData <- function(d1, d2) {
means <- d1[ ,.(means = mean(y1)), by = x]
new.d2 <- data.table:::merge.data.table(x = d2, y = means, by.x = 'group.high', by.y = 'x')
data.table.d2 <- data.table:::merge.data.table(x = new.d2, y = means, by.x = 'group.low', by.y = 'x')
return(var(d1$y1))
}
d1 <- data.frame(x = sample(LETTERS, N, replace = TRUE),
y1=rnorm(N))
d2 <- data.frame('group.high' = sample(LETTERS, N * 2, replace = TRUE),
'group.low' = sample(LETTERS, N * 2, replace = TRUE))
library(data.table)
library(dplyr)
microbenchmark::microbenchmark(dplyr = {
temp.val <- 0
for (i in 1:10) {
d1$y1 <- temp.val + rnorm(N)
temp.val <- iterFuncDplyr(d1, d2)
}},
data.table = {
d1 <- as.data.table(d1)
d2 <- as.data.table(d2)
temp.val <- 0
for (i in 1:10) {
d1$y1 <- temp.val + rnorm(N)
temp.val <- iterFuncData(d1, d2)
}
}
)
Unit: milliseconds
expr min lq mean median uq max neval
dplyr 46.22904 50.67959 52.78275 51.96358 53.34825 108.2874 100
data.table 63.81111 67.13257 70.85537 69.85712 72.72446 127.4228 100
You could subset the named vector means to create new columns and match your output:
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
d2$mean.high <- means[d2$group.high]
d2$mean.low <- means[d2$group.low]
identical(as.matrix(d2), as.matrix(d3)) #factor vs character, used d3 w/ benchmark
[1] TRUE
Unit: microseconds
expr min lq mean median uq max neval
dplyr 4868.2 5316.25 5787.123 5524.15 5892.70 12187.3 100
data.table 8254.4 9606.60 10438.424 10118.35 10771.75 20966.5 100
subset 481.2 529.40 651.194 550.35 582.55 7849.9 100
Benchmark code:
d3 <- d2
microbenchmark::microbenchmark( # N = 10000
dplyr = {
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
### Solution 1
dplyr.d2 <- left_join(d2,data.frame('group.high' = names(means),
'mean.high' = means, stringsAsFactors = FALSE) ) %>%
left_join(., data.frame('group.low' = names(means),
'mean.low' = means, stringsAsFactors = FALSE))},
data.table = {
### Solution 2
d1 <- as.data.table(d1)
d2 <- as.data.table(d2)
means <- d1[ ,.(means = mean(y1)), by = x]
new.d2 <- data.table::merge.data.table(x = d2, y = means, by.x = 'group.high', by.y = 'x')
data.table.d2 <- data.table::merge.data.table(x = new.d2, y = means, by.x = 'group.low', by.y = 'x')
},
subset = {
means <- tapply(d1$y1, INDEX = d1$x, FUN = mean)
d3$mean.high <- means[d2$group.high]
d3$mean.low <- means[d2$group.low]
}
)
Here is an answer very similar to Andrew's but relying on data.table instead of tapply() (which seems faster for very big N).
library(data.table)
# Create a named vector "means"
means <- setDT(d1)[, mean(y1), by = x][, setNames(V1, x)]
setDT(d2)[, c("mean.high.means", "mean.low.means") :=
.(means[as.character(group.high)], means[as.character(group.low)])]
Output:
group.high group.low mean.high.means mean.low.means
1: Z W 0.017032792 0.0091625547
2: A A 0.013796137 0.0137961371
3: V S -0.011570159 0.0004560325
4: D X 0.005475629 0.0200984250
5: U H -0.008249901 0.0054537833
---
199996: H K 0.005453783 0.0079905631
199997: A T 0.013796137 -0.0068537963
199998: W U 0.009162555 -0.0082499015
199999: T V -0.006853796 -0.0115701585
200000: G J 0.014829259 0.0206598470
Reproducible data:
N = 1e5
set.seed(1)
d1 <- data.frame(
x = sample(LETTERS, N, replace = TRUE),
y1 = rnorm(N)
)
d2 <- data.frame(
group.high = sample(LETTERS, N * 2, replace = TRUE),
group.low = sample(LETTERS, N * 2, replace = TRUE)
)

create data frame containing rows that add up to 100

This is my first stab at this:
library(dplyr)
step_size <- 5
grid <- expand.grid(
x1 = seq(0, 100, step_size)
, x2 = seq(0, 100, step_size)
, x3 = seq(0, 100, step_size)
)
grid$sum = grid$x1 + grid$x2 + grid$x3
grid$x1 <- (grid$x1 / grid$sum) * 100
grid$x2 <- (grid$x2 / grid$sum) * 100
grid$x3 <- (grid$x3 / grid$sum) * 100
grid$sum <- grid$x1 + grid$x2 + grid$x3
nrow(grid)
result <- distinct(grid) %>% filter(!is.na(sum))
head(result, 20)
nrow(result)
Basically, I want to create a data frame that contains as many rows as possible that add up to 100 and are uniformly distributed.
is there an easier better approach in R? thanks!
Using data.table...
library(data.table)
grid <- expand.grid(
x1 = seq(0, 100)
, x2 = seq(0, 100)
, x3 = seq(0, 100)
)
setDT(grid)
res <- grid[grid[, rowSums(.SD) == 100], ]
res[, summation := rowSums(.SD)]
Result:
> res[, unique(summation)]
[1] 100
This can also be done in base but data.table is faster:
library(data.table)
grid <- expand.grid(
x1 = seq(0, 100)
, x2 = seq(0, 100)
, x3 = seq(0, 100)
)
grid2 <- expand.grid(
x1 = seq(0, 100)
, x2 = seq(0, 100)
, x3 = seq(0, 100)
)
setDT(grid)
microbenchmark::microbenchmark(
data.table = {
res <- grid[grid[, rowSums(.SD) == 100], ]
},
base = {
res2 <- grid2[rowSums(grid2) == 100, ]
}
)
Unit: milliseconds
expr min lq mean median uq max neval cld
data.table 59.41157 89.6700 109.0462 107.7415 124.2675 183.9730 100 a
base 65.70521 109.6471 154.1312 125.4238 156.9168 611.0169 100 b
Here's a simple function. You can specify how many rows/columns you want, and what each row sums to.
func <- function(cols = 3, rows = 10, rowTotal = 100) {
dt1 <- replicate(n = cols, runif(n = rows))
dt1 <- data.frame(apply(X = dt1, MARGIN = 2, FUN = function(x) x / rowSums(dt1) * rowTotal))
return(dt1)
}
rowSums(func()) # default values (3 cols, 10 rows, each row sums to 100)
rowSums(func(cols = 5, rows = 10, rowTotal = 50)) # 5 cols, 10 rows, row sums to 50)

method for calculating distance between all points quickly

I have some code that generates a matrix of distances between all points in a dataframe that uses functions from tidyverse. However, it works prohibitively slow. Does anyone know of a method to do the same thing that is faster?
Example data and working code:
library(tidyverse)
locs <- data.frame(ID = 1:4000, x = runif (4000, 0, 1), y = runif (4000, 0, 1))
df1 <- locs %>%
mutate(k = 1)
df2 <- df1 %>%
full_join(df1, by = "k") %>%
mutate(length = sqrt((x.x - x.y)^2 + (y.x - y.y)^2)) %>%
select(ID.x, ID.y, length)
dists <- matrix(data = df2$length, nrow = nrow(df1), ncol = nrow(df1))
You could use the base R function dist:
locs <- data.frame(ID = 1:10, x = runif (10, 0, 1), y = runif (10, 0, 1))
dist(locs[,2:3], upper = T, diag = T)
Output:
1 2 3 4 5 6 7 8 9 10
1 0.00000000 1.10309601 0.98790825 0.54490600 0.42478532 1.06323764 0.31094245 0.52593635 0.44695830 0.85010761
2 1.10309601 0.00000000 0.29292865 0.93412638 0.74551902 0.17160290 0.83557056 0.62393711 0.74218236 0.57669081
3 0.98790825 0.29292865 0.00000000 0.69626767 0.72278486 0.13085561 0.78064096 0.46359296 0.73098652 0.72732431
4 0.54490600 0.93412638 0.69626767 0.00000000 0.65426980 0.81617143 0.59851262 0.36551106 0.68253093 1.00018238
5 0.42478532 0.74551902 0.72278486 0.65426980 0.00000000 0.75537605 0.11384534 0.36844164 0.02911855 0.42844270
6 1.06323764 0.17160290 0.13085561 0.81617143 0.75537605 0.00000000 0.82826619 0.55014297 0.75867851 0.68258388
7 0.31094245 0.83557056 0.78064096 0.59851262 0.11384534 0.82826619 0.00000000 0.37224997 0.13688270 0.54088523
8 0.52593635 0.62393711 0.46359296 0.36551106 0.36844164 0.55014297 0.37224997 0.00000000 0.39086196 0.64185453
9 0.44695830 0.74218236 0.73098652 0.68253093 0.02911855 0.75867851 0.13688270 0.39086196 0.00000000 0.40400339
10 0.85010761 0.57669081 0.72732431 1.00018238 0.42844270 0.68258388 0.54088523 0.64185453 0.40400339 0.00000000
Benchmark with 1000 records:
library(dplyr)
library(microbenchmark)
locs <- data.frame(ID = 1:1000, x = runif (1000, 0, 1), y = runif (1000, 0, 1))
f1 <- function()
{
df1 <- locs %>%
mutate(k = 1)
df2 <- df1 %>%
full_join(df1, by = "k") %>%
mutate(length = sqrt((x.x - x.y)^2 + (y.x - y.y)^2)) %>%
select(ID.x, ID.y, length)
dists <- matrix(data = df2$length, nrow = nrow(df1), ncol = nrow(df1))
}
f2 <- function(){dist(locs[,2:3],upper = T,diag=T)}
microbenchmark(f1())
microbenchmark(f2())
Results:
Unit: milliseconds
expr min lq mean median uq max neval
f1() 81.74188 245.8014 276.4318 259.7682 294.01 567.9409 100
and
Unit: milliseconds
expr min lq mean median uq max neval
f2() 6.956302 7.330661 8.675304 8.11507 8.981121 18.77783 100
Hope this helps!

Discrepancies in Benchmark and Processing Time Results

I've been trying to do a little testing of the most efficient ways to replace NA's in dataframes.
I started with a comparison of of NA's to 0's replacement solutions on a 1 million row, 12 column dataset.
Throwing all the pipe capable ones into microbenchmark I got the following results.
Question 1: Is there a way to test the subset left assignment statements (e.g.:df1[is.na(df1)] <- 0) inside the benchmark function?
library(dplyr)
library(tidyr)
library(microbenchmark)
set.seed(24)
df1 <- as.data.frame(matrix(sample(c(NA, 1:5), 1e6 *12, replace=TRUE),
dimnames = list(NULL, paste0("var", 1:12)), ncol=12))
op <- microbenchmark(
mut_all_ifelse = df1 %>% mutate_all(funs(ifelse(is.na(.), 0, .))),
mut_at_ifelse = df1 %>% mutate_at(funs(ifelse(is.na(.), 0, .)), .cols = c(1:12)),
# df1[is.na(df1)] <- 0 would sit here, but I can't make it work inside this function
replace = df1 %>% replace(., is.na(.), 0),
mut_all_replace = df1 %>% mutate_all(funs(replace(., is.na(.), 0))),
mut_at_replace = df1 %>% mutate_at(funs(replace(., is.na(.), 0)), .cols = c(1:12)),
replace_na = df1 %>% replace_na(list(var1 = 0, var2 = 0, var3 = 0, var4 = 0, var5 = 0, var6 = 0, var7 = 0, var8 = 0, var9 = 0, var10 = 0, var11 = 0, var12 = 0)),
times = 1000L
)
print(op) #standard data frame of the output
Unit: milliseconds
expr min lq mean median uq max neval
mut_all_ifelse 769.87848 844.5565 871.2476 856.0941 895.4545 1274.5610 1000
mut_at_ifelse 713.48399 847.0322 875.9433 861.3224 899.7102 1006.6767 1000
replace 258.85697 311.9708 334.2291 317.3889 360.6112 455.7596 1000
mut_all_replace 96.81479 164.1745 160.6151 167.5426 170.5497 219.5013 1000
mut_at_replace 96.23975 166.0804 161.9302 169.3984 172.7442 219.0359 1000
replace_na 103.04600 161.2746 156.7804 165.1649 168.3683 210.9531 1000
boxplot(op) #boxplot of output
library(ggplot2) #nice log plot of the output
qplot(y=time, data=op, colour=expr) + scale_y_log10()
To test the subset assignment operator I had run these tests originally.
set.seed(24)
> Book1 <- as.data.frame(matrix(sample(c(NA, 1:5), 1e8 *12, replace=TRUE),
+ dimnames = list(NULL, paste0("var", 1:12)), ncol=12))
> system.time({
+ Book1 %>% mutate_all(funs(ifelse(is.na(.), 0, .))) })
user system elapsed
52.79 24.66 77.45
>
> system.time({
+ Book1 %>% mutate_at(funs(ifelse(is.na(.), 0, .)), .cols = c(1:12)) })
user system elapsed
52.74 25.16 77.91
>
> system.time({
+ Book1[is.na(Book1)] <- 0 })
user system elapsed
16.65 7.86 24.51
>
> system.time({
+ Book1 %>% replace_na(list(var1 = 0, var2 = 0, var3 = 0, var4 = 0, var5 = 0, var6 = 0, var7 = 0, var8 = 0, var9 = 0,var10 = 0, var11 = 0, var12 = 0)) })
user system elapsed
3.54 2.13 5.68
>
> system.time({
+ Book1 %>% mutate_at(funs(replace(., is.na(.), 0)), .cols = c(1:12)) })
user system elapsed
3.37 2.26 5.63
>
> system.time({
+ Book1 %>% mutate_all(funs(replace(., is.na(.), 0))) })
user system elapsed
3.33 2.26 5.58
>
> system.time({
+ Book1 %>% replace(., is.na(.), 0) })
user system elapsed
3.42 1.09 4.51
In these tests the base replace() comes in first.
In the benchmarking trials, the replace falls farther back in the ranks while the tidyr replace_na() wins (by a nose)
Running the singular tests repeatedly and on different shapes and sizes of data frames always finds the base replace() in the lead.
Question 2: How could it's benchmark performance be the only result that falls so far out of line with the simple test results?
More perplexingly -
Question 3: How can all the mutate_all/_at(replace()) work faster than the simple replace()?
Many folks report this: http://datascience.la/dplyr-and-a-very-basic-benchmark/ (and all the links in that article) but I still haven't found an explanation for why beyond that hashing and C++ are used.)
with special thanks already to Tyler Rinker: https://www.r-bloggers.com/microbenchmarking-with-r/
and akrun: https://stackoverflow.com/a/41530071/5088194
You can include a complex/multi statement in microbenchmark by wrapping it with {} which, basically, converts to a single expression:
microbenchmark(expr1 = { df1[is.na(df1)] = 0 },
exp2 = { tmp = 1:10; tmp[3] = 0L; tmp2 = tmp + 12L; tmp2 ^ 2 },
times = 10)
#Unit: microseconds
# expr min lq mean median uq max neval cld
# expr1 124953.716 137244.114 158576.030 142405.685 156744.076 284779.353 10 b
# exp2 2.784 3.132 17.748 23.142 24.012 38.976 10 a
Worth noting the side-effects of this:
tmp
#[1] 1 2 0 4 5 6 7 8 9 10
in contrast to, say, something like:
rm(tmp)
microbenchmark(expr1 = { df1[is.na(df1)] = 0 },
exp2 = local({ tmp = 1:10; tmp[3] = 0L; tmp2 = tmp + 12L; tmp2 ^ 2 }),
times = 10)
#Unit: microseconds
# expr min lq mean median uq max neval cld
# expr1 127250.18 132935.149 165296.3030 154509.553 169917.705 314820.306 10 b
# exp2 10.44 12.181 42.5956 54.636 57.072 97.789 10 a
tmp
#Error: object 'tmp' not found
Noticing the side effect a benchmark has, we see that the first operation that removes NA values leaves a fairly light job for the following alternatives:
# re-assign because we changed it before
set.seed(24)
df1 = as.data.frame(matrix(sample(c(NA, 1:5), 1e6 * 12, TRUE),
dimnames = list(NULL, paste0("var", 1:12)), ncol = 12))
unique(sapply(df1, typeof))
#[1] "integer"
any(sapply(df1, anyNA))
#[1] TRUE
system.time({ df1[is.na(df1)] <- 0 })
# user system elapsed
# 0.39 0.14 0.53
The previous benchmark leaves us with:
unique(sapply(df1, typeof))
#[1] "double"
any(sapply(df1, anyNA))
#[1] FALSE
And replacing NA, when there are none, is/should be taken account to do nothing on the input.
Aside of that, note that in all your alternatives you sub-assign a "double" (typeof(0)) to "integer" columns-vectors (sapply(df1, typeof)). While, I don't think there is any case (in the above alternatives) where df1 gets modified in place (since after the creation of a "data.frame", there is stored info to copy its vector-columns in case of modification), there -still- is a minor but avoidable overhead in coercing to "double" and storing as a "double". R before replacing elements in an "integer" vector will allocate and copy (in case of "integer" replacement) or allocate and coerce (in case of "double" replacement). Also, after the first coercion (from a side effect of a benchmark, as noted above), R will operate on "double"s and that contains slower manipulations than on "integer"s. I can't find a straightforward R way to investigate that difference, but in a nutshell (in danger of not being totally accurate) we can simulate these operations by:
# simulate R's copying of int to int
# allocate a new int and copy
int2int = inline::cfunction(sig = c(x = "integer"), body = '
SEXP ans = PROTECT(allocVector(INTSXP, LENGTH(x)));
memcpy(INTEGER(ans), INTEGER(x), LENGTH(x) * sizeof(int));
UNPROTECT(1);
return(ans);
')
# R's coercing of int to double
# 'coerceVector', internally, allocates a double and coerces to populate it
int2dbl = inline::cfunction(sig = c(x = "integer"), body = '
SEXP ans = PROTECT(coerceVector(x, REALSXP));
UNPROTECT(1);
return(ans);
')
# simulate R's copying form double to double
dbl2dbl = inline::cfunction(sig = c(x = "double"), body = '
SEXP ans = PROTECT(allocVector(REALSXP, LENGTH(x)));
memcpy(REAL(ans), REAL(x), LENGTH(x) * sizeof(double));
UNPROTECT(1);
return(ans);
')
And on a benchmark:
x.int = 1:1e7; x.dbl = as.numeric(x.int)
microbenchmark(int2int(x.int), int2dbl(x.int), dbl2dbl(x.dbl), times = 50)
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# int2int(x.int) 16.42710 16.91048 21.93023 17.42709 19.38547 54.36562 50 a
# int2dbl(x.int) 35.94064 36.61367 47.15685 37.40329 63.61169 78.70038 50 b
# dbl2dbl(x.dbl) 33.51193 34.18427 45.30098 35.33685 63.45788 75.46987 50 b
To conclude(!) the whole previous note, replacing 0 with 0L shall save some time...
Finally to replicate the benchmark in a more fair way, we could use:
library(dplyr)
library(tidyr)
library(microbenchmark)
set.seed(24)
df1 = as.data.frame(matrix(sample(c(NA, 1:5), 1e6 * 12, TRUE),
dimnames = list(NULL, paste0("var", 1:12)), ncol = 12))
Wrap in functions:
stopifnot(ncol(df1) == 12) #some of the alternatives are hardcoded to 12 columns
mut_all_ifelse = function(x, val) x %>% mutate_all(funs(ifelse(is.na(.), val, .)))
mut_at_ifelse = function(x, val) x %>% mutate_at(funs(ifelse(is.na(.), val, .)), .cols = c(1:12))
baseAssign = function(x, val) { x[is.na(x)] <- val; x }
baseFor = function(x, val) { for(j in 1:ncol(x)) x[[j]][is.na(x[[j]])] = val; x }
base_replace = function(x, val) x %>% replace(., is.na(.), val)
mut_all_replace = function(x, val) x %>% mutate_all(funs(replace(., is.na(.), val)))
mut_at_replace = function(x, val) x %>% mutate_at(funs(replace(., is.na(.), val)), .cols = c(1:12))
myreplace_na = function(x, val) x %>% replace_na(list(var1 = val, var2 = val, var3 = val, var4 = val, var5 = val, var6 = val, var7 = val, var8 = val, var9 = val, var10 = val, var11 = val, var12 = val))
Test for equality of results before the benchmarks:
identical(mut_all_ifelse(df1, 0), mut_at_ifelse(df1, 0))
#[1] TRUE
identical(mut_at_ifelse(df1, 0), baseAssign(df1, 0))
#[1] TRUE
identical(baseAssign(df1, 0), baseFor(df1, 0))
#[1] TRUE
identical(baseFor(df1, 0), base_replace(df1, 0))
#[1] TRUE
identical(base_replace(df1, 0), mut_all_replace(df1, 0))
#[1] TRUE
identical(mut_all_replace(df1, 0), mut_at_replace(df1, 0))
#[1] TRUE
identical(mut_at_replace(df1, 0), myreplace_na(df1, 0))
#[1] TRUE
Test with coercing to "double":
benchnum = microbenchmark(mut_all_ifelse(df1, 0),
mut_at_ifelse(df1, 0),
baseAssign(df1, 0),
baseFor(df1, 0),
base_replace(df1, 0),
mut_all_replace(df1, 0),
mut_at_replace(df1, 0),
myreplace_na(df1, 0),
times = 10)
benchnum
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# mut_all_ifelse(df1, 0) 1368.5091 1441.9939 1497.5236 1509.2233 1550.1416 1629.6959 10 c
# mut_at_ifelse(df1, 0) 1366.1674 1389.2256 1458.1723 1464.5962 1503.4337 1553.7110 10 c
# baseAssign(df1, 0) 532.4975 548.9444 586.8198 564.3940 655.8083 667.8634 10 b
# baseFor(df1, 0) 169.6048 175.9395 206.7038 189.5428 197.6472 308.6965 10 a
# base_replace(df1, 0) 518.7733 547.8381 597.8842 601.1544 643.4970 666.6872 10 b
# mut_all_replace(df1, 0) 169.1970 183.5514 227.1978 194.0903 291.6625 346.4649 10 a
# mut_at_replace(df1, 0) 176.7904 186.4471 227.3599 202.9000 303.4643 309.2279 10 a
# myreplace_na(df1, 0) 172.4926 177.8518 199.1469 186.3645 192.1728 297.0419 10 a
Test without coercing to "double":
benchint = microbenchmark(mut_all_ifelse(df1, 0L),
mut_at_ifelse(df1, 0L),
baseAssign(df1, 0L),
baseFor(df1, 0L),
base_replace(df1, 0L),
mut_all_replace(df1, 0L),
mut_at_replace(df1, 0L),
myreplace_na(df1, 0L),
times = 10)
benchint
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# mut_all_ifelse(df1, 0L) 1291.17494 1313.1910 1377.9265 1353.2812 1417.4389 1554.6110 10 c
# mut_at_ifelse(df1, 0L) 1295.34053 1315.0308 1372.0728 1353.0445 1431.3687 1478.8613 10 c
# baseAssign(df1, 0L) 451.13038 461.9731 477.3161 471.0833 484.9318 528.4976 10 b
# baseFor(df1, 0L) 98.15092 102.4996 115.7392 107.9778 136.2227 139.7473 10 a
# base_replace(df1, 0L) 428.54747 451.3924 471.5011 470.0568 497.7088 516.1852 10 b
# mut_all_replace(df1, 0L) 101.66505 102.2316 137.8128 130.5731 161.2096 243.7495 10 a
# mut_at_replace(df1, 0L) 103.79796 107.2533 119.1180 112.1164 127.7959 166.9113 10 a
# myreplace_na(df1, 0L) 100.03431 101.6999 120.4402 121.5248 137.1710 141.3913 10 a
And a simple way to visualize:
boxplot(benchnum, ylim = range(min(summary(benchint)$min, summary(benchnum)$min),
max(summary(benchint)$max, summary(benchnum)$max)))
boxplot(benchint, add = TRUE, border = "red", axes = FALSE)
legend("topright", c("coerce", "not coerce"), fill = c("black", "red"))
Note that df1 is unchanged after all this (str(df1)).

Resources