Suppose that the named elements of a vector, stored in a list, should be assigned to the matching columns of a matrix (see the example below).
library(microbenchmark)
set.seed(123)
myList <- list()
for(i in 1:10000) {
myList[[i]] <- list(sample(setNames(rnorm(5), sample(LETTERS[1:5])), ceiling(runif(1,1,4))))
}
myMatrix <- matrix(NA, ncol = 5, nrow = 10000)
colnames(myMatrix) <- LETTERS[1:5]
for(i in 1:10000) {
myMatrix[i, match(names(myList[[i]][[1]]), colnames(myMatrix))] <- myList[[i]][[1]]
}
myList[[6]][[1]]
myMatrix[6,]
microbenchmark(for(i in 1:10000) {myMatrix[i, match(names(myList[[i]][[1]]), colnames(myMatrix))] <- myList[[i]][[1]]}, times = 10)
In this example, elements of 10,000 vectors are assigned to the matching columns of a matrix.
Problem
The assignment is slow (approximately 3.5 seconds)!
Question
How can I speed up this process in R or with Rcpp?
Use rbindlist from package data.table. It can bind by matching column names.
library(microbenchmark)
n <- 10000
set.seed(123)
myList <- list()
for(i in 1:n) {
myList[[i]] <- list(sample(setNames(rnorm(5), sample(LETTERS[1:5])), ceiling(runif(1,1,4))))
}
myMatrix <- matrix(NA, ncol = 5, nrow = n)
colnames(myMatrix) <- LETTERS[1:5]
library(data.table)
microbenchmark(match = for(i in 1:n) {myMatrix[i, match(names(myList[[i]][[1]]), colnames(myMatrix))] <- myList[[i]][[1]]},
rbindlist = {
myMatrix1 <- as.matrix(rbindlist(lapply(myList,
function(x) as.list(unlist(x))),
fill = TRUE))
myMatrix1 <- myMatrix1[, order(colnames(myMatrix1))]
},
times = 10)
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# match 1392.52949 1496.40382 1599.63584 1605.39080 1690.98410 1761.67322 10 b
#rbindlist 48.76146 50.29176 51.66355 51.10672 53.75465 54.93798 10 a
all.equal(myMatrix, myMatrix1)
#TRUE
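For intuition, here is a tiny, self-contained illustration (toy values of my own, not taken from the benchmark above) of the fill-by-name behaviour that rbindlist provides:
library(data.table)
# Rows with different, partially overlapping names are aligned by column name;
# cells that have no value are filled with NA because fill = TRUE.
rbindlist(list(list(A = 1, C = 3), list(B = 2)), fill = TRUE)
#     A  C  B
#1:   1  3 NA
#2:  NA NA  2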
I realized that computing mutual information on a data frame with NAs using R's infotheo package does not throw errors but yields incorrect results. The problem is described in more detail here. I now have a mathematically correct solution that removes only pairwise incomplete cases instead of dropping rows across all columns, but its performance on large data sets is catastrophic. I suspect the nested for loop causes the long compute times; does anyone have an idea how to improve the performance of the code below?
library(infotheo)
v1 <- c(1,2,3,4,5,NA,NA,NA,NA,NA)
v2 <- c(1,NA,3,NA,5,NA,7,NA,9,NA)
v3 <- c(NA,2,3,NA,NA,6,7,NA,7,NA)
v4 <- c(NA,NA,NA,NA,NA,6,7,8,9,10)
df <- cbind.data.frame(v1,v2,v3,v4)
ColPairMap<-function(df){
t <- data.frame(matrix(ncol = ncol(df), nrow = ncol(df)))
colnames(t) <- colnames(df)
rownames(t) <- colnames(df)
for (j in 1:ncol(df)) {
for (i in 1:ncol(df)) {
c(1:ncol(df))
if (nrow(df[complete.cases(df[,c(i,j)]),])>0) {
t[j,i] <- natstobits(mutinformation(df[complete.cases(df[,c(i,j)]),j], df[complete.cases(df[,c(i,j)]),i]))
} else {
t[j,i] <- 0
}
}
}
return(t)
}
ColPairMap(df)
Thanks in advance!
Twice the speed.
ColPairMap2 <- function(df){
t <- matrix(0, ncol = ncol(df), nrow = ncol(df),
dimnames = list(colnames(df), colnames(df)))
df <- as.matrix(df)
for (j in 1:ncol(df)) {
for (i in j:ncol(df)) {
compl_cases <- complete.cases(df[, c(i, j)])
if (sum(compl_cases) > 0) {
t[j,i] <- natstobits(mutinformation(df[compl_cases, j],
df[compl_cases, i]))
}
}
}
lt <- lower.tri(t)
t[lt] <- t[lt] + t(t)[lt]
t
}
all(ColPairMap(df) == ColPairMap2(df))
#[1] TRUE
Test the speed.
library(microbenchmark)
mb <- microbenchmark(
f1 = ColPairMap(df),
f2 = ColPairMap2(df)
)
print(mb, order = "median", unit = "relative")
#Unit: relative
# expr min lq mean median uq max neval cld
# f2 1.000000 1.00000 1.000000 1.000000 1.000000 1.000000 100 a
# f1 2.035973 2.01852 1.907398 2.008894 2.108486 0.569771 100 b
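Most of the gain comes from working on a matrix instead of a data.frame, computing complete.cases once per pair, and filling only one triangle. The latter is valid because mutual information is symmetric in its two arguments; a quick sanity check with toy vectors of my own:
library(infotheo)
x <- c(1, 2, 3, 1, 2)
y <- c(2, 2, 1, 1, 2)
# Symmetry of mutual information justifies mirroring the triangle.
all.equal(natstobits(mutinformation(x, y)), natstobits(mutinformation(y, x)))
#[1] TRUE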
I found a tweak which does not help for toy data sets like df above, but for real-world data sets, especially when executed on proper hardware, I have seen it reduce a 2.5 hr compute time to 14 min!
The code below is a complete, copy-and-pasteable example which incorporates Rui's nested for loop solution and, building on this idea, another solution using a nested foreach loop that parallelizes the task on 75% of the available cores.
You can control the size of the data set and consequently the compute time by adjusting n.
library(foreach)
library(parallel)
library(doParallel)
library(infotheo)
n <- 500 #number of columns (the data set has 4*n rows); the larger n, the more compute time is required
df <- (discretize(matrix(rnorm(4*n*n,n,n/10),ncol=n)))
## pairwise complete mutual information via nested for loop ##
start_for <- Sys.time()
ColPairMap<-function(df){
t <- data.frame(matrix(ncol = ncol(df), nrow = ncol(df)))
colnames(t) <- colnames(df)
rownames(t) <- colnames(df)
for (j in 1:ncol(df)) {
for (i in 1:ncol(df)) {
c(1:ncol(df))
if (nrow(df[complete.cases(df[,c(i,j)]),])>0) {
t[j,i] <- natstobits(mutinformation(df[complete.cases(df[,c(i,j)]),j], df[complete.cases(df[,c(i,j)]),i]))
} else {
t[j,i] <- 0
}
}
}
return(t)
}
ColPairMap(df)
end_for <- Sys.time()
end_for-start_for
## pairwise complete mutual information via nested foreach loop ##
start_foreach <- Sys.time()
ncl <- max(2,floor(detectCores()*0.75)) #number of cores
clst <- makeCluster(ncl, type = "PSOCK") #create a PSOCK cluster (portable across platforms)
#e <- new.env() #new environment to export libraries to cores
#e$libs <- .libPaths()
#clusterExport(clst, "libs", envir=e) #export required packages to all cores
#clusterEvalQ(clst, .libPaths(libs)) #export required packages to all cores
clusterEvalQ(clst, { #export required packages to all cores
library(infotheo)
})
registerDoParallel(cl = clst) #register cluster
t <- foreach (j=1:ncol(df), .combine="c") %:% #parallellized nested loop for computing normalized pairwise complete MI between all columns
foreach (i=j:ncol(df), .combine="c", .packages="infotheo") %dopar% {
compl_cases <- complete.cases(df[,c(i,j)])
if (sum(compl_cases) > 0) {
natstobits(mutinformation(df[compl_cases, j], df[compl_cases, i]))
} else {
0
}
}
RCA_MI_Matrix <- matrix(0, ncol = ncol(df), nrow = ncol(df), dimnames = list(colnames(df), colnames(df))) #set-up empty matrix for MI values
RCA_MI_Matrix[lower.tri(RCA_MI_Matrix, diag=TRUE)] <- t #fill lower triangle with MI values from nested loop
RCA_MI_Matrix[upper.tri(RCA_MI_Matrix)] <- t(RCA_MI_Matrix)[upper.tri(RCA_MI_Matrix)] #mirror lower triangle of matrix into upper one
end_foreach <- Sys.time()
end_foreach-start_foreach
stopCluster(cl=clst) #stop cluster
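If you want to confirm that the parallel fill reproduces the serial result before trusting the timings, a quick check along these lines should work (an untested sketch of my own; ColPairMap is the serial function defined above, so run this with a small n):
# Coerce the serial data.frame result to a matrix and drop dimnames so that
# only the values are compared; this recomputes the serial version, so keep n small.
all.equal(unname(as.matrix(ColPairMap(df))), unname(RCA_MI_Matrix))
# should print TRUE if both approaches agree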
In this post on CodeReview, I compared several ways to generate a large sparse matrix. Specifically, I compared dense and sparse constructions using the Matrix package in R. My question is about post-processing with the sparse constructions. I'm finding that when I try to find the row sums of every k columns, the dense construction outperforms the sparse constructions.
Microbenchmarking
ncols <- 100000
nrows <- 1000
col_probs <- runif(ncols, 0.001, 0.002)
mat1 <- spMat_dense(ncols=ncols,nrows=nrows,col_probs=col_probs)
mat2 <- spMat_dgC(ncols=ncols,nrows=nrows,col_probs = col_probs)
mat3 <- spMat_dgT(ncols=ncols,nrows=nrows,col_probs=col_probs)
k <- 50
starts <- seq(1, ncols, by=k)
microbenchmark::microbenchmark(sapply(starts, function(x) rowSums(mat1[, x:(x+k-1)])),
sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x+k-1)])),
sapply(starts, function(x) Matrix::rowSums(mat3[, x:(x+k-1)])),
times=5L)
Unit: milliseconds
                                                                expr        min         lq      mean     median        uq       max neval cld
         sapply(starts, function(x) rowSums(mat1[, x:(x + k - 1)]))   912.0453   947.0454  1041.365   965.4375  1007.311  1374.988     5  a
 sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x + k - 1)]))  2097.4125  2208.0056  2566.575  2406.8450  2851.640  3268.970     5   b
 sapply(starts, function(x) Matrix::rowSums(mat3[, x:(x + k - 1)])) 13231.4790 13619.3818 13819.745 13675.6282 13923.803 14648.434     5    c
My guess is that the sapply function works better with dense matrices because it doesn't need to do the sparse to dense conversion under the hood. The functions are posted below.
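One quick way to probe that guess is to look at the class of a single slice; if the slice is still sparse, the overhead is more likely the 2,000 separate subset-plus-rowSums calls than a sparse-to-dense conversion. A small check of my own (not from the original post):
library(Matrix)
# A column subset of a dgCMatrix stays a dgCMatrix, so no densification
# happens during the subsetting itself.
class(mat2[, 1:k])
#[1] "dgCMatrix"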
Question
Is there a way to improve the speed of the above post-processing for sparse constructions?
Functions
spMat_dense <- function(ncols,nrows,col_probs){
matrix(rbinom(nrows*ncols,1,col_probs),
ncol=ncols,byrow=T)
}
library(Matrix)
spMat_dgC <- function(ncols,nrows,col_probs){
#Credit to Andrew Guster (https://stackoverflow.com/a/56348978/4321711)
mat <- Matrix(0, nrows, ncols, sparse = TRUE) #blank matrix for template
i <- vector(mode = "list", length = ncols) #each element of i contains the '1' rows
p <- rep(0, ncols) #p will be cumsum no of 1s by column
for(r in 1:nrows){
row <- rbinom(ncols, 1, col_probs) #random row
p <- p + row #add to column identifier
if(any(row == 1)){
for (j in which(row == 1)){
i[[j]] <- c(i[[j]], r-1) #append row identifier
}
}
}
p <- c(0, cumsum(p)) #this is the format required
i <- unlist(i)
x <- rep(1, length(i))
mat@i <- as.integer(i)
mat@p <- as.integer(p)
mat@x <- x
return(mat)
}
spMat_dgT <- function(ncols, nrows, col_probs){
#Credit to minem - https://codereview.stackexchange.com/a/222190/121860
r <- lapply(1:ncols, function(x) {
p <- col_probs[x]
i <- sample.int(2L, size = nrows, replace = T, prob = c(1 - p, p))
which(i == 2L)
})
rl <- lengths(r)
nc <- rep(1:ncols, times = rl) # col indexes
nr <- unlist(r) # row index
ddims <- c(nrows, ncols)
sparseMatrix(i = nr, j = nc, dims = ddims, giveCsparse = FALSE)
}
Using a dgCMatrix as input, this is one possible solution that is very fast:
new_combine <- function(mat,k){
#Convert dgCMatrix to dgTMatrix
x.T <- as(mat, "dgTMatrix")
#Map column indices to new set of indices
#based on partitioning every k columns
x.T@j <- as.integer(x.T@j %/% k)
#Correct dimensions of new matrix
x.T@Dim <- as.integer(c(nrow(x.T), floor(ncol(mat)/k)))
#Convert back to dgCMatrix
y <- as(x.T,"dgCMatrix")
y
}
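The trick works because mapping every block of k column indices onto a single new index creates duplicated (row, column) entries in the triplet representation, and those duplicates are summed when the matrix is compressed back to a dgCMatrix, which yields exactly the per-block row sums. A minimal check of that behaviour with a toy triplet matrix of my own (assuming the usual Matrix coercion semantics):
library(Matrix)
# Two entries land on the same cell (row 1, column 1); after coercion to the
# compressed column format they are collapsed into their sum, 2 + 3 = 5.
tri <- sparseMatrix(i = c(1, 1, 2), j = c(1, 1, 1), x = c(2, 3, 1),
                    dims = c(2, 1), giveCsparse = FALSE)
as(tri, "dgCMatrix")
#2 x 1 sparse Matrix of class "dgCMatrix"
#[1,] 5
#[2,] 1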
microbenchmark::microbenchmark(sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x+k-1)])),
new_combine(mat2,k),
times=5L)
Unit: milliseconds
                                                                expr         min          lq       mean     median         uq        max neval cld
 sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x + k - 1)])) 1808.872676 1864.783181 1925.17118 1935.98946 1990.28866 2025.92192     5   b
                                                new_combine(mat2, k)    8.471521    9.396441   10.99871   10.04459   10.96175   16.11923     5  a
comp <- sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x+k-1)]))
comp2 <- new_combine(mat2,k)
> all.equal(comp2,as(comp,"dgCMatrix"))
[1] TRUE
I'm trying to complete a homework problem. I was successful in creating a randomly generated matrix, but I do not know how to create more than one in one go.
runif(4,min=-10,max=10)
m=matrix(runif(4*n), ncol = 2, nrow = 2)
Not gonna lie, I don't really know what I'm doing.
You could try:
a<- replicate(100, matrix(runif(4), ncol = 2))
The result is a three-dimensional array, so you access each matrix by subsetting on the third dimension, like:
> a[,,2]
[,1] [,2]
[1,] 0.8476489 0.6139453
[2,] 0.1315417 0.8195134
The other way would be a loop that fills a list, which is IMHO a little easier to subset.
my_list <- list()
for(i in 1:100){
my_list[[i]]<-matrix(runif(4), ncol = 2)
}
Then you access each element with my_list[[100]]
We can do this using lapply in two main ways. The second is easier to understand, but the first is much more efficient, especially when n gets large.
vals <- runif(1000*4, min=-10, max=10)
rst2 <- lapply(1:1000, function(i) matrix(vals[(4*i-3):(4*i)], ncol = 2))
Or,
the.list <- vector(1000, mode = "list") # Create an empty list
matrix.list <- lapply(the.list, function(x) matrix(runif(4,min=-10,max=10), nrow = 2))
This returns:
.....
[[999]]
[,1] [,2]
[1,] -6.520801 -5.944080
[2,] -4.183131 1.190629
[[1000]]
[,1] [,2]
[1,] 1.208202 7.86769
[2,] -2.672111 -8.33435
Faster version: first generate all the data needed, then split it, then convert each split vector into a matrix.
vals <- runif(1000*4, min=-10, max=10)
rst <- split(vals, rep(1:1000, each = 4))
rst2 <- lapply(rst, function(x) matrix(x, ncol = 2))
Fast Loop: using @MDEWITT's answer as a base, we can do some simple preprocessing to increase the efficiency.
my_list <- vector(n, mode = "list")
vals <- runif(n*4, min=-10, max=10)
for(i in 1:n){
my_list[[i]] <- matrix(vals[(4*i-3):(4*i)], ncol = 2, nrow = 2)
}
Quick Benchmarking
We see that the FLoop and FLapply functions are faster; this becomes more apparent when n = 10000.
n <- 1000
microbenchmark(Lapply(n), Predefine(n), MDEWITT(n), FLoop(n), FLapply(n), times = 1000)
Unit: milliseconds
expr min lq mean median uq max neval
Lapply(n) 9.709308 9.993209 19.110734 10.708773 12.259961 2515.7737 1000
Predefine(n) 5.159808 5.389392 11.058017 5.615735 6.673107 1004.6575 1000
MDEWITT(n) 10.838335 11.391154 22.738093 12.243137 13.898335 646.6250 1000
FLoop(n) 5.104331 5.384800 9.139668 5.612018 6.502171 369.3693 1000
FLapply(n) 6.191827 6.687455 10.947632 7.613829 8.667169 274.9425 1000
Code used:
library(microbenchmark)
Lapply <- function(n=1000){
the.list <- vector(n, mode = "list")
matrix.list <- lapply(the.list, function(x) matrix(runif(4,min=-10,max=10), nrow = 2))
}
Predefine <- function(n=1000){
vals <- runif(n*4, min=-10, max=10)
rst <- split(vals, rep(1:n, each = 4))
rst2 <- lapply(rst, function(x) matrix(x, ncol = 2))
}
MDEWITT <- function(n=1000){
a <- replicate(n, matrix(runif(4*2), ncol = 2, nrow = 2))
}
FLoop <- function(n=1000){
my_list <- vector(n, mode = "list")
vals <- runif(n*4, min=-10, max=10)
for(i in 1:n){
my_list[[i]] <- matrix(vals[(4*i-3):(4*i)], ncol = 2, nrow = 2)
}
}
FLapply <- function(n=1000){
vals <- runif(n*4, min=-10, max=10)
rst2 <- lapply(1:n, function(i) matrix(vals[(4*i-3):(4*i)], ncol = 2))
}
This R code works, but the for loop looks long and ugly, and I have read that using for loops is not advised in R.
What I want to do is copy vectors of varying length from the list of vectors HaarData@W into the rows of the matrix MyMatrix.
Since each vector is shorter than the number of columns in the matrix, I want to duplicate its values to fill the row.
The vectors have length 2^z (z ∈ ℤ), and the matrix row length needs to be n with 2^z ≤ n.
library(wavelets)
Data <- seq(1, 16)
n <- as.integer(log2(length(Data)))
#Data <- seq(1, 2 ^ n, 1)
HaarData <- dwt(Data, filter = "haar")
#Square matrix to write data
MyMatrix <- matrix(, nrow = n, ncol = 2 ^ n)
row <- 0 #row counter
for (vector in HaarData@W) {
row <- row + 1
duplication <- (2 ^ n) / length(vector)
newRow <- c(rep(vector, each = duplication))
MyMatrix[row,] <- newRow
}
I am not sure why you want to do this operation in the first place; nevertheless, the following would be my approach:
library(wavelets)
library(microbenchmark)
Data <- seq(1, 32)
n <- as.integer(log2(length(Data)))
HaarData <- dwt(as.numeric(Data), filter = "haar")
# Abstract operation in the loop in a function, no side effects
duplicate_coefs <- function(filter_coefs, n){
rep(filter_coefs, each = 2^(n - as.integer(log2(length(filter_coefs)))))
}
microbenchmark(
old = {
#Square matrix to write data
MyMatrix <- matrix(, nrow = n, ncol = 2 ^ n)
row <- 0 #row counter
for (vector in HaarData@W) {
row <- row + 1
duplication <- (2 ^ n) / length(vector)
newRow <- c(rep(vector, each = duplication))
MyMatrix[row,] <- newRow
}
}
,
new = {
n_len <- length(HaarData@W)
new_result <- matrix(unlist(lapply(HaarData@W, duplicate_coefs, n_len)),
nrow = n_len,
byrow = TRUE)
}
)
identical(MyMatrix, new_result)
On my machine you get about a 50x speed-up:
Unit: microseconds
expr min lq mean median uq max neval
old 2891.967 2940.0550 3203.14740 2982.5360 3110.3985 6472.223 100
new 48.519 50.8065 59.04673 56.4805 60.8905 302.845 100
Hope this helps
I'm trying to speed up an R function from a package I regularly use, so any help vectorizing the for-loop below would be much appreciated!
y <- array(0, dim=c(75, 12))
samp <- function(x) x<-sample(c(0,1), 1)
y <- apply(y, c(1,2), samp)
nr <- nrow(y)
nc <- ncol(y)
rs <- rowSums(y)
p <- colSums(y)
out <- matrix(0, nrow = nr, ncol = nc)
for (i in 1:nr) {
out[i, sample.int(nc, rs[i], prob = p)] <- 1
}
The issue I'm having a hard time getting around is the reference to object 'rs' within the loop.
Any suggestions?
Here are two options:
This one uses the somewhat discouraged <<- operator:
lapply(1:nr, function(i) out[i, sample.int(nc, rs[i], prob = p)] <<- 1)
This one uses more traditional indexing:
out[do.call('rbind',sapply(1:nr, function(i) cbind(i,sample.int(nc, rs[i], prob = p))))] <- 1
I suppose you could also use Vectorize to do an implicit mapply on your function:
z <- Vectorize(sample.int, vectorize.args='size')(nc, rs, prob=p)
out[cbind(rep(1:length(z), sapply(z, length)), unlist(z))] <- 1
But I don't think that's necessarily any cleaner.
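For reference, the wrapper functions timed below are not defined anywhere in the post; a plausible reconstruction (my own sketch, assuming op() wraps the original for loop and t1() to t3() wrap the three alternatives above, in order, reusing nr, nc, rs and p from the question's setup) would be:
op <- function() {
  # original for loop from the question
  out <- matrix(0, nrow = nr, ncol = nc)
  for (i in 1:nr) out[i, sample.int(nc, rs[i], prob = p)] <- 1
  out
}
t1 <- function() {
  # lapply with <<- assigning into the local 'out'
  out <- matrix(0, nrow = nr, ncol = nc)
  lapply(1:nr, function(i) out[i, sample.int(nc, rs[i], prob = p)] <<- 1)
  out
}
t2 <- function() {
  # two-column (row, col) index matrix built via do.call/rbind
  out <- matrix(0, nrow = nr, ncol = nc)
  out[do.call('rbind', sapply(1:nr, function(i) cbind(i, sample.int(nc, rs[i], prob = p))))] <- 1
  out
}
t3 <- function() {
  # Vectorize over 'size', then index with the unlisted samples
  out <- matrix(0, nrow = nr, ncol = nc)
  z <- Vectorize(sample.int, vectorize.args = 'size')(nc, rs, prob = p)
  out[cbind(rep(1:length(z), sapply(z, length)), unlist(z))] <- 1
  out
}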
And, indeed, @Roland is correct that all of these are slower than just doing the for loop:
> microbenchmark(op(), t1(), t2(), t3())
Unit: microseconds
expr min lq median uq max neval
op() 494.970 513.8290 521.7195 532.3040 1902.898 100
t1() 591.962 602.1615 609.4745 617.5570 2369.385 100
t2() 734.756 754.7700 764.3925 782.4825 2205.421 100
t3() 642.383 672.9815 711.4700 763.8150 2283.169 100
Yay for benefit-free obfuscation!