dictionary and list comprehension in R - r

Any generic way of doing the following R code faster? For example in python dict comprehension (see equivalent below) would be a nice faster alternative.
R:
l1 <- 1:3
l2 <- c("a", "b", "c")
foo <- function(x) {return(5*x)}
bar <- list()
for (i in 1:length(l1)) bar[l2[i]] <- foo(l1[i])
Python
l1 = range(1, 4)
l2 = ["a", "b", "c"]
def foo(x):
return 5*x
{b: foo(a) for a, b in zip(l1, l2)}

We're talking about speed, so let's do some benchmarking:
library(microbenchmark)
microbenchmark(op = {for (i in 1:length(l1)) bar[l2[i]] <- foo(l1[i])},
lapply = setNames(lapply(l1,foo),l2),
vectorised = setNames(as.list(foo(l1)), l2))
Unit: microseconds
expr min lq mean median uq max neval
op 7.982 9.122 10.81052 9.693 10.548 36.206 100
lapply 5.987 6.557 7.73159 6.842 7.270 55.877 100
vectorised 4.561 5.132 6.72526 5.417 5.987 80.964 100
But these small values don't mean much, so I pumped up the vector length to 10,000 where you'll really see a difference:
l <- 10000
l1 <- seq_len(l)
l2 <- sample(letters, l, replace = TRUE)
microbenchmark(op = {bar <- list(); for (i in 1:length(l1)) bar[l2[i]] <- foo(l1[i])},
lapply = setNames(lapply(l1,foo),l2),
vectorised = setNames(as.list(foo(l1)), l2),
times = 100)
Unit: microseconds
expr min lq mean median uq max neval
op 30122.865 33325.788 34914.8339 34769.8825 36721.428 41515.405 100
lapply 13526.397 14446.078 15217.5309 14829.2320 15351.933 19241.767 100
vectorised 199.559 259.997 349.0544 296.9155 368.614 3189.523 100
But tacking onto what everyone else said, it doesn't have to be a list. If you remove the list requirement:
microbenchmark(setNames(foo(l1), l2))
Unit: microseconds
expr min lq mean median uq max neval
setNames(foo(l1), l2) 22.522 23.8045 58.06888 25.0875 48.322 1427.417 100

Related

How to substitute multiple words with spaces in R?

Here is an example:
drugs<-c("Lapatinib-Ditosylate", "Caffeic-Acid-Phenethyl-Ester", "Pazopanib-HCl", "D-Pantethine")
ads<-"These are recently new released drugs Lapatinib Ditosylate, Pazopanib HCl, and Caffeic Acid Phenethyl Ester"
What I wanted is to correct the drug names in ads with the names in drugs such that a desired output would be:
"These are recently new released drugs Lapatinib-Ditosylate, Pazopanib-HCl, and Caffeic-Acid-Phenethyl-Ester"
If you create a vector of words to be replaced, then you can loop over that vector and the vector of words to replace them (drugs), replacing all instances of one element in each interation of the loop.
to_repl <- gsub('-', ' ', drugs)
for(i in seq_along(drugs))
ads <- gsub(to_repl[i], drugs[i], ads)
ads
# "These are recently new released drugs Lapatinib-Ditosylate, Pazopanib-HCl, and Caffeic-Acid-Phenethyl-Ester"
Contrary to popular belief, for-loops in R are no slower than lapply
f_lapply <- function(ads){
to_repl <- gsub('-', ' ', drugs)
invisible(lapply(seq_along(to_repl), function(i) {
ads <<- gsub(to_repl[i], drugs[i], ads)
}))
ads
}
f_loop <- function(ads){
to_repl <- gsub('-', ' ', drugs)
for(i in seq_along(to_repl))
ads <- gsub(to_repl[i], drugs[i], ads)
ads
}
f_loop(ads) == f_lapply(ads)
# [1] TRUE
microbenchmark::microbenchmark(f_loop(ads), f_lapply(ads), times = 1e4)
# Unit: microseconds
# expr min lq mean median uq max neval
# f_loop(ads) 59.488 95.180 118.0793 107.487 120.205 7426.866 10000
# f_lapply(ads) 69.333 114.462 147.9732 130.872 152.205 27283.670 10000
Or, using more general examples:
loop_over <- 1:1e5
microbenchmark::microbenchmark(
for_loop = {for(i in loop_over) 1},
lapply = {lapply(loop_over, function(x) 1)}
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# for_loop 4.66174 5.865842 7.725975 6.354867 7.449429 35.26807 100
# lapply 94.09223 114.378778 125.149863 124.665128 134.217326 170.16889 100
loop_over <- 1:1e5
microbenchmark::microbenchmark(
for_loop = {y <- numeric(1e5); for(i in seq_along(loop_over)) y[i] <- loop_over[i]},
lapply = {lapply(loop_over, function(x) x)}
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# for_loop 11.00184 11.49455 15.24015 12.10461 15.26050 134.139 100
# lapply 71.41820 81.14660 93.64569 87.05162 98.59295 357.219 100
This can also be done using lapply() which will be faster than for loop. Modifying #IceCreamToucan's answer, this can be done in lapply as follows
to_repl <- gsub('-', ' ', drugs)
invisible(lapply(seq_along(to_repl), function(i) {
ads <<- gsub(to_repl[i], drugs[i], ads)
}))
# [1] "These are recently new released drugs Lapatinib-Ditosylate, Pazopanib-HCl, and Caffeic-Acid-Phenethyl-Ester"
Microbenchmark
Unit: microseconds
expr min lq mean median uq max neval
lapply 80.514 87.4935 110.1103 93.304 96.1995 1902.861 100
for.loop 2285.164 2318.5665 2463.1554 2338.216 2377.4120 7510.763 100

R *apply vector as input; matrix as output

I'd like to apply over each element of a vector, a function that outputs a vector.
After applying the function to each element of that vector, I should have many vectors, which I'd like to rbind in order to have a matrix.
The code should be equivalent to the following:
my_function <- function(x) x:(x+10)
my_vec <- 1:10
x <- vector()
for(i in seq_along(vec)){
x <- rbind(x,my_function(my_vec[i]))
}
Of course, my_function and my_vec are just examples.
try:
tmp <- lapply(my_vec, my_function)
do.call(rbind, tmp)
or, like Heroka suggested, use sapply. i prefer lapply, then bind my output the way i like (rbind/cbind) instead of potentially transposing.
Here is an alternative:
matrix( unlist(lapply(my_vec,my_function)), length(my_vec), byrow=TRUE )
Speed is almost the same:
library(microbenchmark)
my_function <- function(x) sin(x:(x+10))
for ( n in 1:4 )
{
my_vec <- 1:10^n
print(
microbenchmark( mra68 = matrix( unlist(lapply(my_vec,my_function)), length(my_vec), byrow=TRUE ),
stas.g = do.call(rbind, lapply(my_vec, my_function)),
times = 1000 )
)
print("identical?")
print( identical( matrix( unlist(lapply(my_vec,my_function)), length(my_vec), byrow=TRUE ),
do.call(rbind, lapply(my_vec, my_function)) ) )
}
.
Unit: microseconds
expr min lq mean median uq max neval
mra68 38.496 40.307 68.00539 41.213 110.052 282.148 1000
stas.g 41.213 42.572 72.86443 43.930 115.939 445.186 1000
[1] "identical?"
[1] TRUE
Unit: microseconds
expr min lq mean median uq max neval
mra68 793.002 810.212 850.4857 818.3640 865.2375 7231.669 1000
stas.g 876.786 894.901 946.8165 906.2235 966.9100 7051.873 1000
[1] "identical?"
[1] TRUE
Unit: milliseconds
expr min lq mean median uq max neval
mra68 2.605448 3.028442 5.269003 4.020940 7.807512 14.51225 1000
stas.g 2.959604 3.390071 5.823661 4.500546 8.800462 92.54977 1000
[1] "identical?"
[1] TRUE
Unit: milliseconds
expr min lq mean median uq max neval
mra68 27.29810 30.99387 51.44223 41.20167 79.46185 559.0059 1000
stas.g 33.63622 37.22420 60.10224 49.07643 92.94333 395.3315 1000
[1] "identical?"
[1] TRUE
>

Efficient use of vectors

I am attempting to copy one vector to another using the following syntax:
data<-NULL
for( i in 1:nrow(line)){
data=append(data,line[i*4])
}
From what I have seen, the use of append in this way results in a lot of copying of data, which makes R very slow. What is the syntax for copying the 4th element of one array to another, given that the list you are copying from is of a given size?
Here are three methods with their benchmarks. You can see that preallocating the vector as in the method2 function is quite a bit faster, while the lapply method is middle, and your function is the slowest.
Of course, these are 1D vectors as opposed to arrays of n-D, but I would expected the benchmarks would be similar or even more pronounced.
method1 <- function(line) {
data<-NULL
for( i in 1:length(line)){
data=append(data,line[i])
}
}
method2 <- function(line) {
data <- vector(mode="numeric", length = length(line))
for (i in 1:length(line)) {
data[i] <- line[i]
}
}
library(microbenchmark)
r <- rnorm(1000)
microbenchmark(method2(r), unit="ms")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> method2(r) 2.18085 2.279676 2.428731 2.371593 2.500495 5.24888 100
microbenchmark(lapply(r, function(x) { data<-append(data, x) }), unit="ms")
#> Unit: milliseconds
#> expr min lq
#> lapply(r, function(x) { data <- append(data, x) }) 3.014673 3.091299
#> mean median uq max neval
#> 3.287216 3.150052 3.260199 6.036501 100
microbenchmark(method1(r), unit="ms")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> method1(r) 3.938684 3.978002 5.71831 4.020001 4.280521 98.58584 100
Didn't realize OP wanted only every fourth. Why not just use a data frame or data.table?
d <- data.frame(matrix(rnorm(1000), ncol=1))
microbenchmark(d2 <- d[seq(1,nrow(d), 4),])
#> Unit: microseconds
#> expr min lq mean median uq
#> d2 <- d[seq(1, nrow(d), 4), ] 64.846 65.9915 73.08007 67.225 73.8225
#> max neval
#> 220.438 100
library(data.table)
dt <- data.table(d)
microbenchmark(d2 <- dt[seq(1,nrow(d), 4),])
#> Unit: microseconds
#> expr min lq mean median uq
#> d2 <- dt[seq(1, nrow(d), 4), ] 298.163 315.2025 324.8793 320.554 330.416
#> max neval
#> 655.124 100
If you're trying to extract every fourth element from a vector, you could index using seq to grab the correct elements:
data <- letters[seq(4, length(letters), by=4)]
data
# [1] "d" "h" "l" "p" "t" "x"
Growing the vector one at a time as you show in your question will be slow because you will need to keep re-allocating your vector (see the second circle of The R Inferno for details). However, even pre-allocating your vector and constructing it with a for loop will be slow compared to constructing it in a single vectorized indexing operation.
To get a sense of the speed improvements, consider a comparison to the sort of method you've described, except using pre-allocation:
for.prealloc <- function(x) {
data <- vector(mode="numeric", length = floor(length(x)/4))
for (i in 1:floor(length(x)/4)) {
data[i] <- x[i*4]
}
data
}
josilber <- function(x) x[seq(4, length(x), by=4)]
r <- rnorm(10000)
all.equal(for.prealloc(r), josilber(r))
# [1] TRUE
library(microbenchmark)
microbenchmark(for.prealloc(r), josilber(r))
# Unit: microseconds
# expr min lq mean median uq max neval
# for.prealloc(r) 1846.014 2035.7890 2351.9681 2094.804 2244.56 5283.285 100
# josilber(r) 95.757 97.4125 125.9877 113.179 138.96 259.606 100
The approach I propose is 20x faster than using for and a pre-allocated vector (and it will be even faster than using append and a non-pre-allocated vector).

Fastest way of norming vectors in a matrix

I'd like to sort out what it the fastest way to get the norm of a set of vectors contained in a matrix. I was using apply (this is an example, my matrices are much bigger):
a = matrix(1:9, 3,3)
norm_a = apply(a, 1, function(x) sqrt(sum(x^2)))
but then I wanted to speed up my code and moved to:
norm_a = sqrt(a^2%*%rep(1,dim(a)[2]))
which is actually much faster (seen with system.time, I'm not an expert in benchmarking). But I haven't found any final answer to this question so far. Does anyone have an insight about this ?
thanks
This depends on the size of your matrix:
library(microbenchmark)
microbenchmark(f1 = apply(a, 1, function(x) sqrt(sum(x^2))),
f2 = sqrt(a^2%*%rep(1,dim(a)[2])),
f3 = sqrt(rowSums(a^2)))
#Unit: microseconds
# expr min lq mean median uq max neval cld
# f1 44.656 46.812 52.21050 47.5815 49.4295 191.248 100 c
# f2 1.849 2.772 4.07532 4.3120 4.6210 16.323 100 a
# f3 6.160 7.392 9.25537 9.5480 10.1630 20.943 100 b
set.seed(42)
b <- matrix(rnorm(1e6), 1000)
microbenchmark(f1 = apply(b, 1, function(x) sqrt(sum(x^2))),
f2 = sqrt(b^2%*%rep(1,dim(b)[2])),
f3 = sqrt(rowSums(b^2)))
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# f1 30.851752 55.513228 86.84168 109.439043 112.54796 152.27730 100 b
# f2 5.503050 7.434152 14.36080 8.861268 10.42327 66.41539 100 a
# f3 4.430403 5.895553 12.92235 7.359163 8.62321 74.65256 100 a

Why is `unlist(lapply)` faster than `sapply`?

If so, why do we need sapply?
x <- list(a=1, b=1)
y <- list(a=1)
JSON <- rep(list(x,y),10000)
microbenchmark(sapply(JSON, function(x) x$a),
unlist(lapply(JSON, function(x) x$a)),
sapply(JSON, "[[", "a"),
unlist(lapply(JSON, "[[", "a"))
)
Unit: milliseconds
expr min lq median uq max neval
sapply(JSON, function(x) x$a) 25.22623 28.55634 29.71373 31.76492 88.26514 100
unlist(lapply(JSON, function(x) x$a)) 17.85278 20.25889 21.61575 22.67390 78.54801 100
sapply(JSON, "[[", "a") 18.85529 20.06115 21.53790 23.42480 38.56610 100
unlist(lapply(JSON, "[[", "a")) 11.33859 11.69198 12.25329 13.37008 27.81361 100
In addition to running lapply, sapply runs simplify2array to try and fit the output into an array. To figure out if that is possible, the function needs to check if all the individual outputs have the same length: this is done via a costly unique(lapply(..., length)) which accounts for most of the time difference you were seeing:
b <- lapply(JSON, "[[", "a")
microbenchmark(lapply(JSON, "[[", "a"),
unlist(b),
unique(lapply(b, length)),
sapply(JSON, "[[", "a"),
sapply(JSON, "[[", "a", simplify = FALSE),
unlist(lapply(JSON, "[[", "a"))
)
# Unit: microseconds
# expr min lq median uq max neval
# lapply(JSON, "[[", "a") 14809.151 15384.358 15774.26 16905.226 24944.863 100
# unlist(b) 920.047 1043.719 1158.62 1223.091 8056.231 100
# unique(lapply(b, length)) 10778.065 11060.452 11456.11 12581.414 19717.740 100
# sapply(JSON, "[[", "a") 24827.206 25685.535 26656.88 30519.556 93195.751 100
# sapply(JSON, "[[", "a", simplify = FALSE) 14283.541 14922.780 15526.42 16654.058 26865.022 100
# unlist(lapply(JSON, "[[", "a")) 15334.026 16133.146 16607.12 18476.182 30080.544 100
As droopy and Roland pointed out, sapply is a wrapper function for lapply designed for convenient use. sapply uses simplify2array which is slower than unlist:
> microbenchmark(unlist(as.list(1:1000)), simplify2array(as.list(1:1000)), times=1000)
Unit: microseconds
expr min lq median uq max neval
unlist(as.list(1:1000)) 99.734 109.0230 113.912 118.3120 21343.92 1000
simplify2array(as.list(1:1000)) 892.712 931.0895 947.957 976.3125 22241.52 1000
Also, when returning a matrix, sapply is slower than with other base functions, for example:
a <- list(c(1,2,3,4), c(1,2,3,4), c(1,2,3,4))
microbenchmark(t(do.call(rbind, lapply(a, function(x)x))), sapply(a, function(x)x))
Unit: microseconds
expr min lq median uq max neval
t(do.call(rbind, lapply(a, function(x) x))) 29.823 30.801 32.512 33.734 94.845 100
sapply(a, function(x) x) 57.201 58.179 59.156 60.134 111.956 100
But especially in the second case, sapply is much easier to use.

Resources