Faster alternative to apply for using the function utf8ToInt on a matrix in R

I have a string matrix (my_data) of dimensions 9000000x10 with each value being a single character string. I want to transform it to a numeric matrix using the function utf8ToInt, but it takes a long time and crashes my session.
new_matrix <- apply(my_data, 1:2, "utf8ToInt")
The result is what I expect, but I need a more efficient way of doing that.
Any help is deeply appreciated.
Imagine my data is:
my_data <- matrix(c("a","b","c","d"), ncol = 2)
but it is actually 9000000x10 instead of 2x2.

stringi::stri_enc_toutf32 may be an alternative.
From ?stri_enc_toutf32:
This function is roughly equivalent to a vectorized call to utf8ToInt(enc2utf8(str))
On a 1e3 x 2 matrix, stri_enc_toutf32 is about 10 and 20 times faster than the vapply- and apply-based utf8ToInt approaches, respectively:
library(stringi)
library(microbenchmark)
nr = 1e3
nc = 2
m = matrix(sample(letters, nr*nc, replace = TRUE), nrow = nr, ncol = nc)
microbenchmark(
f_apply = apply(m, 1:2, utf8ToInt),
f_vapply = structure(vapply(m, utf8ToInt, numeric(1)), dim=dim(m)),
f = matrix(unlist(stri_enc_toutf32(m), use.names = FALSE), nrow = nrow(m)),
times = 10L, check = "equal")
# Unit: microseconds
# expr min lq mean median uq max neval
# f_apply 2283.4 2297.2 2351.17 2325.40 2354.5 2583.6 10
# f_vapply 1276.1 1298.0 1348.88 1322.00 1353.4 1611.3 10
# f 87.6 92.3 108.53 105.15 111.0 163.8 10
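Transplanted to the question's data, the same idea is a short sketch like the following (assuming my_data is the 9000000 x 10 matrix of single-character strings described in the question):
library(stringi)
# Each cell holds a single character, so every list element returned by
# stri_enc_toutf32 has length 1 and the unlisted vector keeps my_data's
# column-major order.
new_matrix <- matrix(unlist(stri_enc_toutf32(my_data), use.names = FALSE),
                     nrow = nrow(my_data))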

Using vapply would be almost twice as fast. Since vapply returns a vector, it is necessary to re-establish the matrix format (here with structure).
library(microbenchmark)
my_data <- matrix(sample(letters, 2*100, replace = TRUE), ncol = 2)
microbenchmark(
apply = apply(my_data, 1:2, utf8ToInt),
vapply = structure(vapply(my_data, utf8ToInt, numeric(1)), dim=dim(my_data)),
times = 500L, check = 'equal'
)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> apply 199.201 208.001 224.811 213.801 220.1515 1560.400 500
#> vapply 111.000 115.501 136.343 120.401 124.9505 1525.901 500
Created on 2021-03-06 by the reprex package (v1.0.0)

Related

Efficient and fast application of a function to 3D arrays in R

I have a very large 3D array (say 100 x 100 x 10) that I would like to apply a function over for pairwise comparisons. I've tried a number of solutions, using data.table, mapply, etc. I'm maybe naively hoping for faster speedups, and am considering just doing this with C++/Rcpp. But before doing that, I thought I'd see if anyone is aware of a more elegant / faster solution to this problem? Many thanks!
Example code in R. For this smaller-dimension version of what I want to apply this to, mapply() is a little faster than data.table:
m <- 20
n <- 10 # number of data points per row/col combination
R <- array(runif(n*m*m), dim=c(m,m,n)) # 3D array to apply function over
grid <- expand.grid(A = 1:m, B = 1:m, C = 1:m, D = 1:m) # array indices (used as args below)
#function to do basic correlations between R[1,2,] and R[1,10,]
ss2 <- function(a,b,c,d) {
rho = cor(R[a, b, ], R[c, d, ])
}
#solution with data.table
dt <- setDT(grid) # convert from df -> dt
sol_1 <- dt[, ss2(A, B, C, D), by = seq_len(nrow(dt))]
#solution with mapply
sol_2 <- mapply(ss2, grid$A, grid$B, grid$C, grid$D)
I tried this with mapply() and data.table(). I've also tried using a parallelized version of apply() (parApply, https://dept.stat.lsa.umich.edu/~jerrick/courses/stat701/notes/parallel.html).
UPDATE: cora from the Rfast package gives further performance improvements.
By reshaping the array, we can use cor directly for a ~2K times speedup:
library(data.table)
library(Rfast)
m <- 20
n <- 10 # number of data points per row/col combination
R <- array(runif(n*m*m), dim=c(m,m,n)) # 3D array to apply function over
grid <- expand.grid(A = 1:m, B = 1:m, C = 1:m, D = 1:m)
ss2 <- function(a, b, c, d) cor(R[a, b, ], R[c, d, ])
dt <- setDT(grid)
microbenchmark::microbenchmark(
sol_1 = dt[, ss2(A, B, C, D), by = seq_len(nrow(dt))][[2]],
sol_2 = mapply(ss2, grid$A, grid$B, grid$C, grid$D),
sol_3 = c(cor(t(matrix(R, m*m, n)))),
sol_4 = c(cora(t(matrix(R, m*m, n)))),
check = "equal",
times = 10
)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> sol_1 2101327.2 2135311.0 2186922.33 2178526.6 2247049.6 2301429.5 10
#> sol_2 2255828.9 2266427.5 2306180.23 2287911.0 2321609.6 2471711.7 10
#> sol_3 1203.8 1222.2 1244.75 1236.1 1243.9 1343.5 10
#> sol_4 922.6 945.8 952.68 951.9 955.8 988.8 10
Timing the full 100 x 100 x 10 array:
m <- 100L
n <- 10L
R <- array(runif(n*m*m), dim=c(m,m,n))
microbenchmark::microbenchmark(
sol_3 = c(cor(t(matrix(R, m*m, n)))),
sol_4 = c(cora(t(matrix(R, m*m, n)))),
check = "equal",
times = 10
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> sol_3 1293.0739 1298.4997 1466.546 1503.453 1513.746 1902.802 10
#> sol_4 879.8659 892.2699 1058.064 1055.668 1143.767 1300.282 10
Note that filling by column then transposing tends to be slightly faster than filling by row in this case. Also note that ss2 and grid are no longer needed.
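As a quick sanity check of the reshape (an addition of mine, not part of the original answer): because R stores arrays in column-major order, R[i, j, ] becomes row i + (j - 1) * m of matrix(R, m*m, n), so the columns of t(matrix(R, m*m, n)) are exactly the vectors that ss2 correlated:
i <- 3; j <- 5
all.equal(matrix(R, m * m, n)[i + (j - 1) * m, ], R[i, j, ])
#> [1] TRUE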

Faster "Resample with replacement by cluster"

I have the same question as Resample with replacement by cluster, i.e. I want to do cluster bootstrapping. The best answer's approach to that question using rbindlist(lapply(resampled_ids, function(resampled_id) df[df$id == resampled_id,])) works, but because I have a big dataset, this resampling step is rather slow. My question is, is it possible to speed this up?
Use sequence to index. Demonstrated with a larger data.frame:
df <- data.frame(id = rep.int(1:1e2, sample(100:200, 1e2, replace = TRUE))[1:1e4], X = rnorm(1e4))
resampled_ids <- sample(unique(df$id), replace = TRUE)
# tabulate() gives each cluster's size and match() its first row in df.
idx <- sequence(tabulate(df$id)[resampled_ids], match(unique(df$id), df$id)[resampled_ids])
s <- data.frame(id = df$id[idx], X = df$X[idx])
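For reference (a small illustration of my own), sequence(nvec, from) — available since R 4.0.0 — concatenates integer runs of the given lengths starting at the given positions, which is what builds the row index above:
sequence(c(3, 2), from = c(2, 7))
#> [1] 2 3 4 7 8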
Benchmarking against the rbindlist solution:
library(data.table)
library(microbenchmark)
microbenchmark(rbindlist = rbindlist(lapply(resampled_ids, function(x) df[df$id %in% x,])),
sequence = {idx <- sequence(tabulate(df$id)[resampled_ids], match(unique(df$id), df$id)[resampled_ids])
data.frame(id = df$id[idx], X = df$X[idx])})
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> rbindlist 9480.4 9921.95 11470.567 10431.05 12555.35 31178.2 100
#> sequence 406.7 444.55 564.873 498.10 545.70 2818.4 100
Note that creating a new data.frame from indexed vectors is much faster than row-indexing the original data.frame. The difference is much less pronounced if a data.table is used, but, surprisingly, the rbindlist solution becomes even slower:
microbenchmark(rbindlist = rbindlist(lapply(resampled_ids, function(x) df[df$id %in% x,])),
sequence1 = df[sequence(tabulate(df$id)[resampled_ids], match(unique(df$id), df$id)[resampled_ids]),],
sequence2 = {idx <- sequence(tabulate(df$id)[resampled_ids], match(unique(df$id), df$id)[resampled_ids])
data.frame(id = df$id[idx], X = df$X[idx])})
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> rbindlist 9431.9 9957.7 11101.545 10508.15 12395.25 15363.3 100
#> sequence1 4284.5 4550.3 4866.891 4674.80 5009.90 8350.1 100
#> sequence2 414.1 455.6 541.590 508.40 551.40 2881.1 100
setDT(df)
microbenchmark(rbindlist = rbindlist(lapply(resampled_ids, function(x) df[df$id %in% x,])),
sequence1 = df[sequence(tabulate(df$id)[resampled_ids], match(unique(df$id), df$id)[resampled_ids]),],
sequence2 = {idx <- sequence(tabulate(df$id)[resampled_ids], match(unique(df$id), df$id)[resampled_ids])
data.table(id = df$id[idx], X = df$X[idx])})
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> rbindlist 14877.4 15878.30 17181.572 16348.50 18527.6 22520.9 100
#> sequence1 795.0 1016.80 1187.266 1101.95 1326.7 2566.5 100
#> sequence2 386.4 441.75 556.226 473.70 500.9 3373.6 100
Update
To address the comment from jay.sf:
lens <- tabulate(df$id)[resampled_ids]  # size of each resampled cluster
idx <- sequence(lens, match(unique(df$id), df$id)[resampled_ids])
s <- data.frame(cluster = rep.int(seq_along(resampled_ids), lens), id = df$id[idx], X = df$X[idx])
cluster corresponds to the index of resampled_ids.
Given example data:
f <- data.frame(id = c(1, 1, 2, 2, 2, 3, 3), X = rnorm(7))
Try this:
ind_id <- split(seq_along(f$id), f$id)            # row indices grouped by cluster id
samp_id <- sample(names(ind_id), replace = TRUE)  # resample cluster ids with replacement
f[unlist(ind_id[samp_id]), ]                      # stack the rows of the sampled clusters

R: Average by every N value in 3-dim matrix

I have a matrix A with hourly data (on a monthly period) and dim [116 152 744]
I am trying to create matrix B with daily data and dim [116 152 31]
where every TSTEP in B is the average of the corresponding 24 TSTEPs in matrix A.
I was successful in creating a matrix C with monthly data with a simple apply
C <- apply(A, c(1,2), function (x) mean(x))
But I can't quite figure out how to average over every N values. Thanks.
Taking one vector only, to get the mean of every 24 values you can do:
mean24 <- function(x) {
dim(x) <- c(24, length(x) / 24)
colMeans(x)
}
x <- 1:48
mean24(x)
[1] 12.5 36.5
So, in your case, you just have to do:
apply(A, c(1, 2), mean24)
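One caveat (my note, not from the original answer): apply() places the function's output in the first dimension of the result, so the call above has dim c(31, 116, 152); to get the requested [116 152 31] layout, permute it back with aperm:
B <- aperm(apply(A, c(1, 2), mean24), c(2, 3, 1))
dim(B)
#> [1] 116 152 31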
You could also do it with sapply and some indexing:
# create data
arr <- array(dim=c(116,152,744))
arr[] <- runif(length(arr[]))
daily <- sapply(seq(1, 744, 24), function(ix) {
  rowMeans(arr[, , ix:(ix + 23)], dims = 2)
}, simplify = "array")
> str(daily)
num [1:116, 1:152, 1:31] 0.451 0.522 0.407 0.536 0.432 ...
Edit:
It's also fairly quick compared with the other solution (microbenchmark):
Unit: milliseconds
                         expr      min       lq     mean   median       uq      max neval
  apply(arr, c(1, 2), mean24) 464.4121 509.9772 653.9486 667.2114  699.498 1221.733   100
 sapply(seq(1, 744, 24), ...) 164.8211 168.3295 189.8147 171.4008 196.2403 409.9638   100

What is the right way to multiply data frame by vector?

I'm trying to multiply a data frame df by a vector v, so that the product is a data frame, where the i-th row is given by df[i,]*v. I can do this, for example, by
df <- data.frame(A=1:5, B=2:6); v <- c(0,2)
as.data.frame(t(t(df) * v))
A B
1 0 4
2 0 6
3 0 8
4 0 10
5 0 12
I am sure there has to be a more R-style approach (and a very simple one!), but nothing comes to mind. I even tried something like
apply(df, MARGIN=1, function(x) x*v)
but still, non-readable constructions like as.data.frame(t(.)) are required.
How can I find an efficient and elegant workaround here?
This works too:
data.frame(mapply(`*`,df,v))
In that solution, you are taking advantage of the fact that data.frame is a type of list, so you can iterate over both the elements of df and v at the same time with mapply.
Unfortunately, you are limited in what you can output from mapply: a simple vector, or a matrix. If your data are huge, this would likely be more efficient:
data.frame(mapply(`*`,df,v,SIMPLIFY=FALSE))
With SIMPLIFY = FALSE, mapply returns a list, which is more efficient to convert to a data.frame.
If you're looking for speed and memory efficiency - data.table to the rescue:
library(data.table)
dt = data.table(df)
for (i in seq_along(dt))
dt[, (i) := dt[[i]] * v[i]]
eddi = function(dt) { for (i in seq_along(dt)) dt[, (i) := dt[[i]] * v[i]] }
arun = function(df) { df * matrix(v, ncol=ncol(df), nrow=nrow(df), byrow=TRUE) }
nograpes = function(df) { data.frame(mapply(`*`,df,v,SIMPLIFY=FALSE)) }
N = 1e6
dt = data.table(A = rnorm(N), B = rnorm(N))
v = c(0,2)
library(microbenchmark)
microbenchmark(eddi(copy(dt)), arun(copy(dt)), nograpes(copy(dt)), times = 10)
#Unit: milliseconds
# expr min lq mean median uq max neval
# eddi(copy(dt)) 23.01106 24.31192 26.47132 24.50675 28.87794 34.28403 10
# arun(copy(dt)) 337.79885 363.72081 450.93933 433.21176 516.56839 644.70103 10
# nograpes(copy(dt)) 19.44873 24.30791 36.53445 26.00760 38.09078 95.41124 10
As Arun points out in the comments, one can also use the set function from the data.table package to do this in-place modification on data.frames:
for (i in seq_along(df))
set(df, j = i, value = df[[i]] * v[i])
This of course also works for data.tables and could be significantly faster if the number of columns is large.
A language that lets you combine vectors with matrices has to make a decision at some point whether the matrices are row-major or column-major ordered. The reason:
> df * v
A B
1 0 4
2 4 0
3 0 8
4 8 0
5 0 12
is because R operates down the columns first. Doing the double-transpose trick subverts this. Sorry if this is just explaining what you know, but I don't know another way of doing it, except explicitly expanding v into a matrix of the same size.
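That explicit expansion (a sketch of the same idea as the matrix()-based variant benchmarked further down) would be:
# Build a matrix the same shape as df so each row of df is multiplied by v.
df * matrix(v, nrow = nrow(df), ncol = ncol(df), byrow = TRUE)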
Or write a nice function that wraps the not very R-style code into something that is R-stylish.
What's wrong with
t(apply(df, 1, function(x) x * v))
?
library(purrr)
map2_dfc(df, v, `*`)
Benchmark
N = 1e6
dt = data.table(A = rnorm(N), B = rnorm(N))
v = c(0,2)
eddi = function(dt) { for (i in seq_along(dt)) dt[, (i) := dt[[i]] * v[i]]; dt }
arun = function(df) { df * matrix(v, ncol=ncol(df), nrow=nrow(df), byrow=TRUE) }
nograpes = function(df) { data.frame(mapply(`*`,df,v,SIMPLIFY=FALSE)) }
ryan = function(df) {map2_dfc(df, v, `*`) }
library(microbenchmark)
microbenchmark(
eddi(copy(dt))
, arun(copy(dt))
, nograpes(copy(dt))
, ryan(copy(dt))
, times = 100)
# Unit: milliseconds
# expr min lq mean median uq max neval
# eddi(copy(dt)) 8.367513 11.06719 24.26205 12.29132 19.35958 171.6212 100
# arun(copy(dt)) 94.031272 123.79999 186.42155 148.87042 251.56241 364.2193 100
# nograpes(copy(dt)) 7.910739 10.92815 27.68485 13.06058 21.39931 172.0798 100
# ryan(copy(dt)) 8.154395 11.02683 29.40024 13.73845 21.77236 181.0375 100
I think the fastest way (without testing data.table) is data.frame(t(t(df)*v)).
My tests:
testit <- function(nrow, ncol)
{
df <- as.data.frame(matrix(rnorm(nrow*ncol),nrow=nrow,ncol=ncol))
v <- runif(ncol)
r1 <- data.frame(t(t(df)*v))
r2 <- data.frame(mapply(`*`,df,v,SIMPLIFY=FALSE))
r3 <- df * rep(v, each=nrow(df))
stopifnot(identical(r1, r2) && identical(r1, r3))
microbenchmark(data.frame(t(t(df)*v)), data.frame(mapply(`*`,df,v,SIMPLIFY=FALSE)), df * rep(v, each=nrow(df)))
}
Result
> set.seed(1)
>
> testit(100,100)
Unit: milliseconds
expr min lq median uq max neval
data.frame(t(t(df) * v)) 2.297075 2.359541 2.455778 3.804836 33.05806 100
data.frame(mapply(`*`, df, v, SIMPLIFY = FALSE)) 9.977436 10.401576 10.658964 11.762009 15.09721 100
df * rep(v, each = nrow(df)) 14.309822 14.956705 16.092469 16.516609 45.13450 100
> testit(1000,10)
Unit: microseconds
expr min lq median uq max neval
data.frame(t(t(df) * v)) 754.844 805.062 844.431 1850.363 27955.79 100
data.frame(mapply(`*`, df, v, SIMPLIFY = FALSE)) 1457.895 1497.088 1567.604 2550.090 4732.03 100
df * rep(v, each = nrow(df)) 5383.288 5527.817 5875.143 6628.586 32392.81 100
> testit(10,1000)
Unit: milliseconds
expr min lq median uq max neval
data.frame(t(t(df) * v)) 17.07548 18.29418 19.91498 20.67944 57.62913 100
data.frame(mapply(`*`, df, v, SIMPLIFY = FALSE)) 99.90103 104.36028 108.28147 114.82012 150.05907 100
df * rep(v, each = nrow(df)) 112.21719 118.74359 122.51308 128.82863 164.57431 100

Find most recent non-missing value in a vector

I'm trying to return the most recent non-missing value at each position of a vector. For instance, given
x <- c(1,2,NA,NA,3,NA,4)
then the function applied to x would output a vector like:
c(1,2,2,2,3,3,4)
Very simple question, but running it with loops or brute force on multiple columns takes forever.
You can use zoo::na.locf for that
require(zoo)
x <- c(1, 2, NA, NA, 3, NA, 4)
na.locf(x)
## [1] 1 2 2 2 3 3 4
You can do this using the Reduce function:
> x <- c(1,2,NA,NA,3,NA,4)
> locf <- function(x,y) if(is.na(y)) x else y
> Reduce( locf, x, accumulate=TRUE )
[1] 1 2 2 2 3 3 4
This way you do not need to load an extra package (and it could be customized to different types of objects if needed).
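As an illustration of that customization (my own sketch; locf_chr is a hypothetical name), the same pattern can treat empty strings in a character vector as missing too:
locf_chr <- function(x, y) if (is.na(y) || !nzchar(y)) x else y
Reduce(locf_chr, c("a", "", NA, "b"), accumulate = TRUE)
#> [1] "a" "a" "a" "b"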
The Reduce option is quicker than zoo::na.locf for the sample vector on my computer:
> library(zoo)
> library(microbenchmark)
>
> microbenchmark(
+ Reduce( locf, x, accumulate=TRUE ),
+ na.locf(x)
+ )
Unit: microseconds
                               expr     min       lq  median       uq     max neval
 Reduce(locf, x, accumulate = TRUE)  22.169  24.0160  27.506  29.3530 112.073   100
                         na.locf(x) 149.841 151.8945 154.357 169.5465 377.271   100
Though there may be other situations where na.locf will be faster. I was actually surprised at the amount of difference.
Benchmarking on bigger data clearly shows the difference between na.locf from the zoo package and Reduce:
x <- sample(c(1:5, NA), 1e6, TRUE)
require(zoo)
require(microbenchmark)
locf <- function(x,y) if(is.na(y)) x else y
microbenchmark(Reduce( locf, x, accumulate=TRUE ), na.locf(x), times=10)
Unit: milliseconds
expr min lq median uq max neval
Reduce(locf, x, accumulate = TRUE) 5480.4796 5958.0905 6605.3547 7458.404 7915.046 10
na.locf(x) 661.2886 911.1734 950.2542 1026.348 1095.642 10
