replace loop with an *pply alternative - r

I am trying to speedup my code by replacing some lookup loops with tapply (How to do vlookup and fill down (like in Excel) in R?) and I stumbled upon this code piece:
DF<-data.frame(id=c(rep("A", 5),rep("B", 7),rep("C", 9)), series=NA, chi=c(letters[1:5], LETTERS[6:12], letters[13:21]))
for (i in unique(DF$id)){
DF$series[ DF$id==i ]<-1:length(DF$id[ DF$id==i ])
}
DF
Is it possible to replace this with an *apply family function? Or any other way to speed it up?

You may try ave:
DF$series <- ave(DF$id, DF$id, FUN = seq_along)
For larger data sets, dplyr is faster though.
library(dplyr)
fun_ave <- function(df) transform(df, series = ave(id, id, FUN = seq_along))
fun_dp <- function(df) df %.%
group_by(id) %.%
mutate(
series = seq_along(id))
df <- data.frame(id= sample(letters[1:3], 100000, replace = TRUE))
microbenchmark(fun_ave(df))
# Unit: milliseconds
# expr min lq median uq max neval
# fun_ave(df) 38.59112 39.40802 50.77921 51.2844 128.6791 100
microbenchmark(fun_dp(df))
# Unit: milliseconds
# expr min lq median uq max neval
# fun_dp(df) 4.977035 5.034244 5.060663 5.265173 17.16018 100

Could also use data.table
library(data.table)
DT <- data.table(DF)
DT[, series_new := 1:.N, by = id]
and using tapply
DF$series_new <- unlist(tapply(DF$id, DF$id, function(x) 1:length(x)))
Extending #Henrik's comparison above both data.table and dplyr are quite a bit faster for large data sets.
library(data.table)
library(dplyr)
df <- data.frame(id= sample(letters[1:3], 100000, replace = TRUE), stringsAsFactors = F)
dt <- data.table(df)
fun_orig <- function(df){
for (i in unique(df$id)){
df$series[df$id==i]<-1:length(df$id[df$id==i])
}}
fun_tapply <- function(df){
df$series <- unlist(tapply(df$id, df$id, function(x) 1:length(x)))
}
fun_ave <- function(df){
transform(df, series = ave(df$id, df$id, FUN = seq_along))
}
fun_dp <- function(df){
df %.%
group_by(id) %.%
mutate(
series = seq_along(id))
}
fun_dt <- function(dt) dt[, 1:.N, by = id]
microbenchmark(fun_dt(dt), times = 1000)
#Unit: milliseconds
# expr min lq median uq max neval
# fun_dt(dt) 2.473253 2.597031 2.771771 3.76307 40.59909 1000
microbenchmark(fun_dp(df), times = 1000)
#Unit: milliseconds
# expr min lq median uq max neval
# fun_dp(df) 2.71375 2.786829 2.914569 3.081609 40.48445 1000
microbenchmark(fun_orig(df), times = 1000)
#Unit: milliseconds
# expr min lq median uq max neval
# fun_orig(df) 30.65534 31.93449 32.72991 33.88885 75.13967 1000
microbenchmark(fun_tapply(df), times = 1000)
#Unit: milliseconds
# expr min lq median uq max neval
# fun_tapply(df) 56.67636 61.72207 66.37193 102.4189 124.6661 1000
microbenchmark(fun_ave(df), times = 1000)
#Unit: milliseconds
# expr min lq median uq max neval
# fun_ave(df) 97.36992 103.161 107.5007 139.1362 157.9464 1000

Related

Reason to use dplyr::union_all over other row binding methods on a dataframe

Is there a reason to use dplyr::union_all on a dataframe to bind two dataframes together vs other ways to bind rows? When I test it there don't seem to be differences but don't know if there might be edge cases where this wouldn't be a good idea or if there are reasons to choose it over another method. The documentation shows an example on a dataframe.
library(dplyr)
library(data.table)
library(microbenchmark)
data <- data.frame(a=rep('a', 10^6), b= rep(1, 10^6))
dt <- as.data.table(data)
microbenchmark(df1 <- dplyr::union_all(data, data))
microbenchmark(df2 <- rbind(data, data))
microbenchmark(df3 <- dplyr::bind_rows(data, data))
microbenchmark(df4 <- data.table::rbindlist(list(data, data)))
microbenchmark(df5 <- rbind(dt, dt))
all((df1 == df2) && (df2 == df3) && (df3 == as.data.frame(df4)) && (df4 == df5))
# [1] TRUE
They all seem roughly the same speed except for rbind on a data.frame which seems particularly slow. There doesn't seem to be a speed reason to choose union_all over other methods.
microbenchmark(df1 <- dplyr::union_all(data, data))
Unit: milliseconds
expr min lq mean median uq max neval
df1 <- dplyr::union_all(data, data) 8.501586 10.19703 13.77899 11.62611 18.16747 25.73479 100
microbenchmark(df2 <- rbind(data, data))
Unit: milliseconds
expr min lq mean median uq max neval
df2 <- rbind(data, data) 48.4319 50.98856 63.70163 52.65343 61.17889 180.8519 100
microbenchmark(df3 <- dplyr::bind_rows(data, data))
Unit: milliseconds
expr min lq mean median uq max neval
df3 <- dplyr::bind_rows(data, data) 9.121883 10.36146 13.38456 11.13614 12.04666 127.5304 100
microbenchmark(df4 <- data.table::rbindlist(list(data, data)))
Unit: milliseconds
expr min lq mean median uq max neval
df4 <- data.table::rbindlist(list(data, data)) 11.2442 11.84408 13.50861 12.37741 13.17539 22.89314 100
microbenchmark(df5 <- rbind(dt, dt))
Unit: milliseconds
expr min lq mean median uq max neval
df5 <- rbind(dt, dt) 11.02781 12.04254 15.0049 12.69404 13.36917 135.747 100

How to substitute multiple words with spaces in R?

Here is an example:
drugs<-c("Lapatinib-Ditosylate", "Caffeic-Acid-Phenethyl-Ester", "Pazopanib-HCl", "D-Pantethine")
ads<-"These are recently new released drugs Lapatinib Ditosylate, Pazopanib HCl, and Caffeic Acid Phenethyl Ester"
What I wanted is to correct the drug names in ads with the names in drugs such that a desired output would be:
"These are recently new released drugs Lapatinib-Ditosylate, Pazopanib-HCl, and Caffeic-Acid-Phenethyl-Ester"
If you create a vector of words to be replaced, then you can loop over that vector and the vector of words to replace them (drugs), replacing all instances of one element in each interation of the loop.
to_repl <- gsub('-', ' ', drugs)
for(i in seq_along(drugs))
ads <- gsub(to_repl[i], drugs[i], ads)
ads
# "These are recently new released drugs Lapatinib-Ditosylate, Pazopanib-HCl, and Caffeic-Acid-Phenethyl-Ester"
Contrary to popular belief, for-loops in R are no slower than lapply
f_lapply <- function(ads){
to_repl <- gsub('-', ' ', drugs)
invisible(lapply(seq_along(to_repl), function(i) {
ads <<- gsub(to_repl[i], drugs[i], ads)
}))
ads
}
f_loop <- function(ads){
to_repl <- gsub('-', ' ', drugs)
for(i in seq_along(to_repl))
ads <- gsub(to_repl[i], drugs[i], ads)
ads
}
f_loop(ads) == f_lapply(ads)
# [1] TRUE
microbenchmark::microbenchmark(f_loop(ads), f_lapply(ads), times = 1e4)
# Unit: microseconds
# expr min lq mean median uq max neval
# f_loop(ads) 59.488 95.180 118.0793 107.487 120.205 7426.866 10000
# f_lapply(ads) 69.333 114.462 147.9732 130.872 152.205 27283.670 10000
Or, using more general examples:
loop_over <- 1:1e5
microbenchmark::microbenchmark(
for_loop = {for(i in loop_over) 1},
lapply = {lapply(loop_over, function(x) 1)}
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# for_loop 4.66174 5.865842 7.725975 6.354867 7.449429 35.26807 100
# lapply 94.09223 114.378778 125.149863 124.665128 134.217326 170.16889 100
loop_over <- 1:1e5
microbenchmark::microbenchmark(
for_loop = {y <- numeric(1e5); for(i in seq_along(loop_over)) y[i] <- loop_over[i]},
lapply = {lapply(loop_over, function(x) x)}
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# for_loop 11.00184 11.49455 15.24015 12.10461 15.26050 134.139 100
# lapply 71.41820 81.14660 93.64569 87.05162 98.59295 357.219 100
This can also be done using lapply() which will be faster than for loop. Modifying #IceCreamToucan's answer, this can be done in lapply as follows
to_repl <- gsub('-', ' ', drugs)
invisible(lapply(seq_along(to_repl), function(i) {
ads <<- gsub(to_repl[i], drugs[i], ads)
}))
# [1] "These are recently new released drugs Lapatinib-Ditosylate, Pazopanib-HCl, and Caffeic-Acid-Phenethyl-Ester"
Microbenchmark
Unit: microseconds
expr min lq mean median uq max neval
lapply 80.514 87.4935 110.1103 93.304 96.1995 1902.861 100
for.loop 2285.164 2318.5665 2463.1554 2338.216 2377.4120 7510.763 100

shift() in data.table v1.9.6 is slow for many groups

Thanks for implementing shift in dt1.9.6 first.
When I have many different groups, shift() is against expectations slower than my old code:
library(data.table)
library(microbenchmark)
set.seed(1)
mg <- data.table(expand.grid(year = 2012:2016, id = 1:1000),
value = rnorm(5000))
microbenchmark(dt194 = mg[, l1 := c(value[-1], NA), by = .(id)],
dt196 = mg[, l2 := shift(value, n = 1,
type = "lead"), by = .(id)])
## Unit: milliseconds
## expr min lq mean median uq max eval
## dt194 4.93735 5.236034 5.718654 5.623736 5.74395 9.555922 100
## dt196 83.92612 87.530404 91.700317 90.953947 91.43783 257.473242 100
A detailed script is here: https://github.com/nachti/datatable_test/blob/master/leadtest.R
Did I misapply shift()?
Edit: Avoiding := doesn't help (#MichaelChirico):
microbenchmark(dt194 = mg[, c(value[-1], NA), by = id],
dt196 = mg[, shift(value, n = 1,
type = "lead"), by = id])
## Unit: milliseconds
## expr min lq mean median uq max neval
## dt194 5.161973 5.429927 5.78047 5.698263 5.798132 10.42217 100
## dt196 79.526981 87.914502 92.18144 91.240949 91.896799 266.04031 100
Apart from this using := is part of the task ...
In data.table version 1.14.3 this has been resolved and shift becomes faster than ever.
library(data.table)
library(microbenchmark)
set.seed(1)
mg = data.table(expand.grid(year=2012:2016, id=1:1000),
value=rnorm(5000))
microbenchmark(v1.9.4 = mg[, c(value[-1], NA), by=id],
v1.9.6 = mg[, shift_no_opt(value, n=1, type="lead"), by=id],
v1.14.3 = mg[, shift(value, n=1, type="lead"), by=id],
unit="ms")
# Unit: milliseconds
# expr min lq mean median uq max neval
# v1.9.4 3.6600 3.8250 4.4930 4.1720 4.9490 11.700 100
# v1.9.6 18.5400 19.1800 21.5100 20.6900 23.4200 29.040 100
# v1.14.3 0.4826 0.5586 0.6586 0.6329 0.7348 1.318 100

R *apply vector as input; matrix as output

I'd like to apply over each element of a vector, a function that outputs a vector.
After applying the function to each element of that vector, I should have many vectors, which I'd like to rbind in order to have a matrix.
The code should be equivalent to the following:
my_function <- function(x) x:(x+10)
my_vec <- 1:10
x <- vector()
for(i in seq_along(vec)){
x <- rbind(x,my_function(my_vec[i]))
}
Of course, my_function and my_vec are just examples.
try:
tmp <- lapply(my_vec, my_function)
do.call(rbind, tmp)
or, like Heroka suggested, use sapply. i prefer lapply, then bind my output the way i like (rbind/cbind) instead of potentially transposing.
Here is an alternative:
matrix( unlist(lapply(my_vec,my_function)), length(my_vec), byrow=TRUE )
Speed is almost the same:
library(microbenchmark)
my_function <- function(x) sin(x:(x+10))
for ( n in 1:4 )
{
my_vec <- 1:10^n
print(
microbenchmark( mra68 = matrix( unlist(lapply(my_vec,my_function)), length(my_vec), byrow=TRUE ),
stas.g = do.call(rbind, lapply(my_vec, my_function)),
times = 1000 )
)
print("identical?")
print( identical( matrix( unlist(lapply(my_vec,my_function)), length(my_vec), byrow=TRUE ),
do.call(rbind, lapply(my_vec, my_function)) ) )
}
.
Unit: microseconds
expr min lq mean median uq max neval
mra68 38.496 40.307 68.00539 41.213 110.052 282.148 1000
stas.g 41.213 42.572 72.86443 43.930 115.939 445.186 1000
[1] "identical?"
[1] TRUE
Unit: microseconds
expr min lq mean median uq max neval
mra68 793.002 810.212 850.4857 818.3640 865.2375 7231.669 1000
stas.g 876.786 894.901 946.8165 906.2235 966.9100 7051.873 1000
[1] "identical?"
[1] TRUE
Unit: milliseconds
expr min lq mean median uq max neval
mra68 2.605448 3.028442 5.269003 4.020940 7.807512 14.51225 1000
stas.g 2.959604 3.390071 5.823661 4.500546 8.800462 92.54977 1000
[1] "identical?"
[1] TRUE
Unit: milliseconds
expr min lq mean median uq max neval
mra68 27.29810 30.99387 51.44223 41.20167 79.46185 559.0059 1000
stas.g 33.63622 37.22420 60.10224 49.07643 92.94333 395.3315 1000
[1] "identical?"
[1] TRUE
>

Convert a large scale characters to date-format-like characters in r

I have a data frame df with 10 million rows. I want to convert the character format of "birthday" column from "xxxxxxxx" to "xxxx-xx-xx". eg. from "20051023" to "2005-10-23". I can use df$birthday <- lapply(df$birthday, as.Date, "%Y%m%d") to do that, but it wastes a lot of memory and computing time for data transforming. However, I just want to convert it to date-like character, but not date type. Therefore I use stringi package because it is written by C language. Unfortunately, df$birthday <- stri_join(stri_sub(df$birthday, from=c(1,5,7), to=c(4,6,8)), collapse = "-") doesn't work, cuz the function doesn't support vector input. Is there any way to solve this problem? Thanks a lot.
Go with sub.
date <- c("20051023", "20151023")
sub("^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3", date)
# [1] "2005-10-23" "2015-10-23"
as.Date works on vectors
df$birthday <- format(as.Date(df$birthday, "%Y%m%d"), "%Y-%m-%d)
A vectorised function is much faster than apply
library(microbenchmark)
n <- 1e3
df <- data.frame(birthday = rep("20051023", n))
microbenchmark(
lapply(df$birthday, as.Date, "%Y%m%d"),
sapply(df$birthday, as.Date, "%Y%m%d"),
as.Date(df$birthday, "%Y%m%d")
)
Unit: microseconds
expr min lq mean median uq max neval cld
lapply(df$birthday, as.Date, "%Y%m%d") 22833.624 25340.118 29064.7188 28406.154 32346.245 58522.360 100 b
sapply(df$birthday, as.Date, "%Y%m%d") 24048.493 26252.660 29797.9074 28437.156 33119.381 47966.133 100 b
as.Date(df$birthday, "%Y%m%d") 431.469 447.719 481.5221 461.189 475.086 1984.158 100 a
A regular expression is off-course even faster.
microbenchmark(
as.character(as.Date(df$birthday, "%Y%m%d")),
format(as.Date(df$birthday, "%Y%m%d"), "%Y-%m%-d"),
sub("^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3", df$birthday)
)
Unit: microseconds
expr min lq mean
as.character(as.Date(df$birthday, "%Y%m%d")) 4923.189 5057.462 5390.313
format(as.Date(df$birthday, "%Y%m%d"), "%Y-%m%-d") 3428.657 3553.736 3697.660
sub("^(\\\\d{4})(\\\\d{2})(\\\\d{2})$", "\\\\1-\\\\2-\\\\3", df$birthday) 713.699 739.997 815.737
median uq max neval cld
5150.0420 5394.4265 8225.270 100 c
3594.7875 3665.9865 5753.200 100 b
763.0885 783.1865 2433.585 100 a
sub() works on matrices, but not on data.frames. Hence the as.matrix
df <- as.data.frame(matrix("20051023", ncol = 3, nrow = 3))
df$ID <- seq_len(nrow(df))
df[, 1:3] <- sub("^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3", as.matrix(df[, 1:3]))
The matrix solution is faster than the for loop. The difference increases with the number of columns you need to loop over.
df <- as.data.frame(matrix("20051023", ncol = 20, nrow = 3))
df$ID <- seq_len(nrow(df))
library(microbenchmark)
microbenchmark(
matrix = df[, seq_len(ncol(df) - 1)] <- sub("^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3", as.matrix(df[, seq_len(ncol(df) - 1)])),
forloop = {
for(i in seq_len(ncol(df) - 1)){
df[, i] <- sub("^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3", df[, i])
}
}
)
Unit: microseconds
expr min lq mean median uq max neval cld
matrix 460.555 476.805 504.3012 494.1235 507.594 1122.522 100 a
forloop 1554.425 1590.774 1677.3038 1625.8390 1670.312 3563.845 100 b

Resources