Calculate final value without using a for loop in R

upper.limit <- 15
starting.limit <- 5
lower.limit <- 0
set.seed(123)
x <- sample(-20:20)
for(i in 1:length(x)){
  k <- starting.limit + x[i]
  k <- ifelse(k > upper.limit, upper.limit, ifelse(k < lower.limit, lower.limit, k))
  starting.limit <- k
}
My objective is to calculate the final value of starting.limit at the end of the loop. The conditions are that, for a given iteration, k cannot exceed upper.limit or fall below lower.limit.
I have written the above loop to achieve this. However, I have to do this for almost 10,000 datasets, so I wondered if there is a quicker way that avoids the for loop.
Thanks

We can design a function.
# s: starting.limit, x: the x vector, u: upper.limit, l: lower.limit
k_fun <- function(s, x, u = 15, l = 0){
  k <- s + x
  if (k > u){
    k <- u
  } else if (k < l){
    k <- l
  }
  s <- k
  return(s)
}
Then use accumulate from the purrr package to apply the function cumulatively, starting from the starting limit and stepping through the x vector. You can see how the number changes at each iteration; the last number is the final output.
library(purrr)
accumulate(c(5, x), k_fun)
# [1] 5 0 11 6 15 15 0 0 10 15 9 15 8 7 3 0 3 0 15 2 2 14 15 7 4 15 15 3 15 0
# [31] 5 0 0 4 12 0 6 7 9 0 0 15
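If you prefer to stay in base R, Reduce() with accumulate = TRUE does the same folding; a minimal sketch, reusing k_fun from above:
# base R equivalent of purrr::accumulate()
vec <- Reduce(k_fun, x, init = 5, accumulate = TRUE)
tail(vec, 1)  # final value of starting.limit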
Benchmark
I used the following code to assess the performance. The accumulate approach is a little faster than the for loop on a vector of 400,001 elements.
library(microbenchmark)
perf <- microbenchmark(
  m1 = {
    upper.limit <- 15
    starting.limit <- 5
    lower.limit <- 0
    set.seed(123)
    x <- sample(-200000:200000)
    for(i in 1:length(x)){
      k <- starting.limit + x[i]
      k <- ifelse(k > upper.limit, upper.limit, ifelse(k < lower.limit, lower.limit, k))
      starting.limit <- k
    }
  },
  m2 = {
    set.seed(123)
    x <- sample(-200000:200000)
    vec <- purrr::accumulate(c(5, x), k_fun)
    k <- tail(vec, 1)
  })
# Unit: milliseconds
# expr min lq mean median uq max neval
# m1 821.1735 879.3551 956.7404 941.1145 1019.8603 1290.800 100
# m2 649.3444 717.5986 773.3652 768.0313 823.5749 1006.148 100

You can try something like the below with the tidyverse.
First, make x into a data frame:
x <- as.data.frame(sample(-20:20))
colnames(x) <- c("dat")
and then pipe like:
x %>%
  mutate(sm = starting.limit) %>%
  mutate(sm = if_else(sm + lead(dat, 1) > upper.limit, upper.limit,
                      if_else(sm + lead(dat, 1) < lower.limit, lower.limit, sm))) %>%
  select(sm) %>%
  filter(!is.na(sm)) %>%
  tail(n = 1)
Effectively, adjust the final select, filter and tail calls as per your needs.
Benchmark
I was curious how this performs against the other solution, and tried to add my code to the microbenchmark already provided. Here goes
perf <- microbenchmark(
  m1 = {
    upper.limit <- 15
    starting.limit <- 5
    lower.limit <- 0
    set.seed(123)
    x <- sample(-200000:200000)
    for(i in 1:length(x)){
      k <- starting.limit + x[i]
      k <- ifelse(k > upper.limit, upper.limit, ifelse(k < lower.limit, lower.limit, k))
      starting.limit <- k
    }
  },
  m2 = {
    set.seed(123)
    x <- sample(-200000:200000)
    vec <- purrr::accumulate(c(5, x), k_fun)
    k <- tail(vec, 1)
  },
  m3 = {
    x <- sample(-200000:200000)
    xd <- as.data.frame(x)
    colnames(xd) <- c("dat")
    xd %>%
      mutate(sm = starting.limit) %>%
      mutate(sm = if_else(sm + lead(dat, 1) > upper.limit, upper.limit,
                          if_else(sm + lead(dat, 1) < lower.limit, lower.limit, sm))) %>%
      select(sm) %>%
      filter(!is.na(sm)) %>%
      tail(n = 1)
  }
)
output:
Unit: milliseconds
expr min lq mean median uq max neval
m1 1223.49718 1255.69514 1272.2679 1260.9643 1272.3401 1392.0402 100
m2 964.76948 982.96555 1007.5521 989.5366 1007.9106 1173.2754 100
m3 68.80358 76.77386 133.0509 170.5572 177.0051 274.9299 100

Related

For a dataset of 0's and 1's, set all but the first 1 in each row to 0's

I have a data.frame of 1,480 rows and 1,400 columns like:
   1 2 3 4 5 6 ..... 1399 1400
1  0 0 0 1 0 0 .....    1    0   # first occurrence would be at 4
2  0 0 0 0 0 1 .....    0    1
3  1 0 0 1 0 0 .....    0    0
## etc.
Each row contains a series of 0's and 1's - predominantly 0's. For each row, I want to find at which column the first 1 shows up and set the remaining values to 0's.
My current implementation can efficiently find the occurrence of the first 1, but I've only figured out how to zero out the remaining values iteratively by row. In repeated simulations, this iterative process is taking too long.
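To make the target transformation concrete, here is the rule applied to a single made-up row (a sketch, not the OP's data):
row <- c(0, 0, 1, 0, 1, 1)          # example row
first1 <- which(row == 1)[1]        # column of the first 1
if (!is.na(first1)) row[seq_along(row) > first1] <- 0
row
#> [1] 0 0 1 0 0 0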
Here is the current implementation:
N <- length(df[which(df$arm == 0), "pt_id"]) # of patients
M <- max_days
#
# df is like the data frame shown above
#
df[which(df$arm == 0), 5:length(colnames(df))] <- unlist(lapply(matrix(data = rep(pbo_hr, M*N), nrow=N, ncol = M), rbinom, n=1, size = 1))
event_day_post_rand <- apply(df[,5:length(colnames(df))], MARGIN = 1, FUN = function(x) which (x>0)[1])
df <- add_column(df, "event_day_post_rand" = event_day_post_rand, .after = "arm_id")
##
## From here trial days start on column 6 for df
##
#zero out events that occurred after the first event, since each patient can only have 1 max event which will be taken as the earliest event
for (pt_id in df[which(!is.na(df$event_day_post_rand)), "pt_id"]){
  event_idx = df[which(df$pt_id == pt_id), "event_day_post_rand"]
  df[which(df$pt_id == pt_id), as.character(5+event_idx+1):"1400"] <- 0
}
We can do
mat <- as.matrix(df)                      ## data frame to matrix
j <- max.col(mat, ties.method = "first")  ## column of the first maximum, i.e. the first 1, in each row
mat[] <- 0                                ## zero out everything
mat[cbind(1:nrow(mat), j)] <- 1           ## put a single 1 back at (row, first-1 column)
df <- data.frame(mat)                     ## matrix to data frame
I also suggest just using a matrix to store these values. In addition, the result will be a sparse matrix, so I recommend:
library(Matrix)
sparseMatrix(i = 1:nrow(mat), j = j, x = rep(1, length(j)))
We can get a little more performance by setting to 0 those 1s that are not the first 1 in their row, i.e. whose row index is duplicated among the positions of the 1s.
Since the OP is open to starting with a matrix rather than a data.frame, I'll do the same.
# dummy data
m <- matrix(sample(0:1, 1480L*1400L, TRUE, c(0.9, 0.1)), 1480L, 1400L)
# proposed solution
f1 <- function(m) {
  ones <- which(m == 1L)  # column-major linear indices of all the 1s
  # (ones - 1L) %% nrow(m) is the (0-based) row of each 1; duplicated() flags
  # every 1 after the left-most one in its row, and those are set back to 0
  m[ones[duplicated((ones - 1L) %% nrow(m), nmax = nrow(m))]] <- 0L
  m
}
# Zheyuan Li's solution
f2 <- function(m) {
  j <- max.col(m, ties.method = "first")
  m[] <- 0L
  m[cbind(1:nrow(m), j)] <- 1L
  m
}
microbenchmark::microbenchmark(f1 = f1(m),
f2 = f2(m),
check = "identical")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f1 9.1457 11.45020 12.04258 11.9011 12.3529 37.6716 100
#> f2 12.8424 14.92955 17.31811 15.3251 16.0550 43.6314 100
Zheyuan Li's suggestion to go with a sparse matrix is a good idea.
# convert to a memory-efficient nsparseMatrix
library(Matrix)
m1 <- as(Matrix(f1(m), dimnames = list(NULL, NULL), sparse = TRUE), "nsparseMatrix")
object.size(m)
#> 8288216 bytes
object.size(m1)
#> 12864 bytes
# proposed function to go directly to a sparse matrix
f3 <- function(m) {
  n <- nrow(m)
  ones <- which(m == 1L) - 1L
  i <- ones %% n
  idx <- which(!duplicated(i, nmax = n))
  sparseMatrix(i[idx], ones[idx] %/% n, dims = dim(m), index1 = FALSE, repr = "C")
}
# going directly to a sparse matrix using Zheyuan Li's solution
f4 <- function(m) {
  sparseMatrix(1:nrow(m), max.col(m, ties.method = "first"), dims = dim(m), repr = "C")
}
identical(m1, f3(m))
#> [1] TRUE
identical(m1, f4(m))
#> [1] TRUE
microbenchmark::microbenchmark(f1 = f1(m),
f3 = f3(m),
f4 = f4(m))
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f1 9.1719 9.30715 11.12569 9.52300 11.92740 83.8518 100
#> f3 7.4330 7.59875 12.62412 7.69610 11.08815 84.8291 100
#> f4 8.9607 9.31115 14.01477 9.49415 11.44825 87.1577 100

Using a for loop instead of matrix indexing in R

I have been meaning to rewrite the following code with a for loop in R:
x <- sample(1:100, 10)
x[x %% 2 == 0]
#[1] 6 26 72 62 32 86 100
which extracts only those elements of vector x that are even numbers.
I have come up with the following in the end, but I believe there are simpler ways of coding this.
x <- sample(1:100, 10)
output <- integer(0)
for(i in seq_along(x)) {
  if(x[i] %% 2 == 0) {
    output[i] <- x[i]
    output <- output[!is.na(output)]
  }
}
output
#[1] 6 26 72 62 32 86 100
I would be grateful if you could help me with this.
You can skip the NA removal by adding each new hit to the end of output using length.
output <- integer(0)
for(i in seq_along(x)) {
  if(x[i] %% 2 == 0) {
    output[length(output) + 1] <- x[i]
  }
}
output
Here length(output) gives you the current length of output; adding 1 places the new element at the end of output.
Or, as @Roland (thanks!) commented, using c:
output <- integer(0)
for(i in seq_along(x)) {
  if(x[i] %% 2 == 0) {
    output <- c(output, x[i])
  }
}
output
c combines output and x[i] into a new vector.
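For context on why this variant falls behind in the larger benchmark below: every c() call allocates a new vector and copies the old contents, so the loop does quadratic work overall. A rough, machine-dependent illustration (a sketch):
# growing with c() copies the vector on every append ...
system.time({out <- integer(0); for (v in 1:1e5) out <- c(out, v)})
# ... while writing into a preallocated vector does not
system.time({out <- integer(1e5); for (v in 1:1e5) out[v] <- v})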
Or preallocate output with x and mark the non-hits with NA:
output <- x
for(i in seq_along(x)) {
  if(x[i] %% 2 != 0) {
    output[i] <- NA
  }
}
output <- output[!is.na(output)]
output
Benchmarks:
fun <- alist(Vectorized = x[x %% 2 == 0]
  , Question = {output <- integer(0)
      for(i in seq_along(x)) {
        if(x[i] %% 2 == 0) {output[i] <- x[i]; output <- output[!is.na(output)]}
      }
      output}
  , NaOutside = {output <- integer(0)
      for(i in seq_along(x)) {
        if(x[i] %% 2 == 0) {output[i] <- x[i]}
      }
      output <- output[!is.na(output)]
      output}
  , Append = {output <- integer(0)
      for(i in seq_along(x)) {
        if(x[i] %% 2 == 0) {output[length(output) + 1] <- x[i]}
      }
      output}
  , Append2 = {output <- integer(0); j <- 1
      for(i in seq_along(x)) {
        if(x[i] %% 2 == 0) {output[j] <- x[i]; j <- j + 1}
      }
      output}
  , C = {output <- integer(0)
      for(i in seq_along(x)) {
        if(x[i] %% 2 == 0) {
          output <- c(output, x[i])
        }
      }
      output}
  , Preallocate = {output <- x
      for(i in seq_along(x)) {
        if(x[i] %% 2 != 0) {
          output[i] <- NA
        }
      }
      output <- output[!is.na(output)]
      output}
)
library(microbenchmark)
set.seed(42)
x <- sample(1:100, 10)
microbenchmark(list = fun, control=list(order="block"))
#Unit: nanoseconds
# expr min lq mean median uq max neval
# Vectorized 644 655 781.54 666.5 721 8559 100
# Question 4377648 4419010 4682029.95 4492628.5 4579952 7512162 100
# NaOutside 3443488 3558401 3751542.62 3597273.0 3723662 5146015 100
# Append 3932586 4070234 4287628.11 4129849.0 4209361 6036142 100
# Append2 3966245 4094766 4360989.39 4147847.5 4312868 5899000 100
# C 3464081 3566902 3806531.77 3618758.5 3743058 6528224 100
# Preallocate 3162424 3263220 3435591.92 3290938.0 3374547 4823017 100
set.seed(42)
x <- sample(1:1e5, 1e4)
microbenchmark(list = fun, control=list(order="block"))
#Unit: microseconds
# expr min lq mean median uq max neval
# Vectorized 226.224 276.271 277.0027 278.993 284.4515 345.527 100
# Question 125550.120 126392.848 129287.7202 126812.309 128426.3655 157958.571 100
# NaOutside 6911.053 7020.831 7497.4403 7109.891 8158.7580 8779.448 100
# Append 7843.988 7982.987 8582.6769 8129.988 9287.5760 10775.894 100
# Append2 7647.340 7783.334 8347.7824 7954.683 9007.4500 10325.973 100
# C 27976.747 29776.632 29997.5407 30024.121 30250.9590 51630.868 100
# Preallocate 6119.198 6232.228 6679.9407 6367.618 7290.1015 8331.277 100

Comparison of rows and columns of a matrix

Let's assume we have a p by n matrix. I want to generate an output matrix w (p x p) such that w_ij represents how many times the i-th row's value is bigger than the j-th row's (at most n, obviously).
My code is below; I'm looking for a faster way.
p <- dim(dat)[1]
n <- dim(dat)[2]
w <- matrix(0,p,p)
for(i in 1:n){
  for(j in 1:(p-1)){
    for(k in (j+1):p){
      if(dat[j,i] > dat[k,i]){
        w[j,k] <- w[j,k] + 1
      } else {
        w[k,j] <- w[k,j] + 1
      }
    }
  }
}
A small example
If the input data is
dat <- matrix(1:9, 3)
dat
# [,1] [,2] [,3]
#[1,] 1 4 7
#[2,] 2 5 8
#[3,] 3 6 9
the expected outcome is
W <- matrix(c(0,3,3,0,0,3,0,0,0),3)
W
# [,1] [,2] [,3]
#[1,] 0 0 0
#[2,] 3 0 0
#[3,] 3 3 0
This seems to give a quick speed gain, without much extra work
newd <- t(dat)
for(i in 1:p) {
  w[,i] <- colSums((newd - dat[i,]) > 0)
}
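The reason this works is column-wise recycling: newd is the n x p transpose, so newd - dat[i, ] subtracts row i of dat from every row at once. A quick check on the small example above (a sketch):
newd <- t(dat)                   # each column of newd is a row of dat
colSums((newd - dat[1, ]) > 0)   # how often each row beats row 1, i.e. w[, 1]
#> [1] 0 3 3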
Quick comparison: wrap code in functions
f1 <- function(dat){
  p <- dim(dat)[1]
  n <- dim(dat)[2]
  w <- matrix(0, p, p)
  for(i in 1:n){
    for(j in 1:(p-1)){
      for(k in (j+1):p){
        if(dat[j,i] > dat[k,i]){
          w[j,k] <- w[j,k] + 1
        } else {
          w[k,j] <- w[k,j] + 1
        }
      }
    }
  }
  w
}
f2 <- function(dat){
  p <- dim(dat)[1]
  w <- matrix(0, p, p)
  newd <- t(dat)
  for(i in 1:p) {
    w[,i] <- colSums((newd - dat[i,]) > 0)
  }
  w
}
Generate slightly larger data
set.seed(1)
dat <- matrix(rnorm(1e4), 100)
Compare
all.equal(f1(dat), f2(dat))
Benchmark
library(microbenchmark)
microbenchmark(f1(dat), f2(dat), times=10)
# expr min lq mean median uq max neval cld
# f1(dat) 1586.10589 1594.40701 1619.03102 1616.14899 1635.05695 1688.08589 10 b
# f2(dat) 22.56083 23.13493 23.98392 23.34228 24.39766 28.29201 10 a
Of course, depending on the size of your matrix it may be worth writing your loops in C++/Rcpp for larger speed gains.
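Before reaching for Rcpp, a fully vectorized base R variant is also possible. A sketch (call it f3; like f2 it uses a strict > comparison, so ties are resolved differently from the original double loop, and it builds n matrices of size p x p, so memory can become a concern for large p):
f3 <- function(dat) {
  # for each column, outer() gives the p x p indicator "row j beats row k";
  # summing these indicator matrices over the n columns gives w
  Reduce(`+`, lapply(seq_len(ncol(dat)), function(i)
    outer(dat[, i], dat[, i], `>`) * 1))
}
all.equal(f2(dat), f3(dat))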

How to use sapply function (or similar) for a matrix, not a vector?

To speed up the following:
x <- c(0, 1, 1.1, 1.5, 1.9, 2.8, 2.9, 3.5)
n <- length(x)
temp <- 0
for(i in 1:n) {
  for(j in 1:n) {
    temp <- temp + dnorm(x[i] - x[j])
  }
}
> temp
[1] 13.40157
I can simply use the sapply function, as below:
out <- sapply(x, function(a) dnorm(x - a))
sum(out)
> sum(out)
[1] 13.40157
But how can I use the same trick for a matrix rather than a vector? That is, I need to speed up the following:
x <- matrix(c(3, 3.3, 5, 6, 7, 4), nrow=3, ncol=2, byrow=FALSE)
n <- length(x[,1])
library(mvtnorm) # for dmvnorm
temp <- 0
for(i in 1:n) {
  for(j in 1:n) {
    temp <- temp + dmvnorm(x[i,] - x[j,])
  }
}
> temp
[1] 0.6686979
In your 3-row case, you need to compute the density for the following pairs of rows:
1,1
1,2
1,3
2,1
2,2
2,3
3,1
3,2
3,3
I would approach this by generating a matrix where each row corresponds to the first element in each pair, a matrix where each row corresponds to the second element in each pair, subtracting the two, and then passing the result to dmvnorm:
mat1 <- x[rep(1:n, each=n),]
mat2 <- x[rep(1:n, n),]
sum(dmvnorm(mat1-mat2))
# [1] 0.6686979
This appears to be a good deal quicker than the approach using looping:
library(mvtnorm)
OP <- function(x) {
  n <- nrow(x)
  temp <- 0
  for(i in 1:n) {
    for(j in 1:n) {
      temp <- temp + dmvnorm(x[i,] - x[j,])
    }
  }
  return(temp)
}
josilber <- function(x) {
  n <- nrow(x)
  mat1 <- x[rep(1:n, each=n),]
  mat2 <- x[rep(1:n, n),]
  sum(dmvnorm(mat1-mat2))
}
# 100 x 10 matrix
set.seed(144)
x <- matrix(rnorm(1000), nrow=100)
all.equal(OP(x), josilber(x))
# [1] TRUE
library(microbenchmark)
microbenchmark(OP(x), josilber(x))
# Unit: milliseconds
# expr min lq mean median uq max neval
# OP(x) 654.553137 696.28275 738.655380 719.058485 760.699813 1194.5594 100
# josilber(x) 2.775881 2.95865 6.789969 4.346013 5.948481 66.0617 100
For this 100 x 10 example, there was more than a 100x speedup from using the vectorized approach.
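A further shortcut is possible because dmvnorm with its defaults (zero mean, identity covariance) depends only on the pairwise distance, so dist() can replace the two n^2 x d matrices entirely. A sketch under that assumption (pair_density_sum is just a name for this sketch):
pair_density_sum <- function(x) {
  n <- nrow(x)
  d <- ncol(x)
  d2 <- as.vector(dist(x))^2     # squared distances for all pairs i < j
  # each unordered pair contributes twice; the n "diagonal" pairs have
  # distance 0, i.e. density (2*pi)^(-d/2) each
  (2 * sum(exp(-d2 / 2)) + n) * (2 * pi)^(-d / 2)
}
all.equal(josilber(x), pair_density_sum(x))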

Compare each row with other rows of matrix

I am looking for an efficient solution for the following problem:
b <- matrix(c(0,0,0,1,1,0), nrow = 2, byrow = T)
weight <- c(1,1)
times <- 5
abc <- do.call(rbind, replicate(times, b, simplify=FALSE))
weight <- rep.int(weight,times)
sum1 <- as.numeric(rep.int(NA,nrow(abc)))
##Rprof()
for(j in 1:nrow(abc)){
  a <- abc[j,]
  sum1[j] <- sum(weight[rowSums(t(a == t(abc)) + 0) == ncol(abc)])
}
##Rprof(NULL)
##summaryRprof()
Is there a faster way to do this? Rprof shows that rowSums(), t(), == and + are quite slow. If nrow is 20,000, it takes about 21 seconds.
Thanks for helping!
Edit: I have a matrix abc and a vector weight whose length equals nrow(abc). The first value of weight corresponds to the first row of abc, and so on. Now, I would like to determine which rows of abc are equal, remember the positions of those rows, and sum up the weights at those positions. The resulting sum should be stored for each row.
Here is a way that looks valid and fast:
ff <- function(mat, weights)
{
  rs <- apply(mat, 1, paste, collapse = ";")
  unlist(lapply(unique(rs),
                function(x)
                  sum(weights[match(rs, x, 0) > 0])))[match(rs, unique(rs))]
}
ff(abc, weight)
# [1] 5 5 5 5 5 5 5 5 5 5
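For reference, base R's ave() expresses the same idea, grouping the weights by each row's string signature and summing within groups (a sketch; ff_ave is just a name for it and it is not part of the benchmark below):
ff_ave <- function(mat, weights) {
  rs <- apply(mat, 1, paste, collapse = ";")
  ave(weights, rs, FUN = sum)   # per-row sum of weights over identical rows
}
ff_ave(abc, weight)
# [1] 5 5 5 5 5 5 5 5 5 5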
And comparing with your function:
ffOP <- function(mat, weights)
{
  sum1 <- as.numeric(rep.int(NA, nrow(mat)))
  for(j in 1:nrow(mat)) {
    a <- mat[j,]
    sum1[j] <- sum(weights[rowSums(t(a == t(mat)) + 0) == ncol(mat)])
  }
  sum1
}
ffOP(abc, weight)
# [1] 5 5 5 5 5 5 5 5 5 5
library(microbenchmark)
m = do.call(rbind, replicate(1e3, matrix(0:11, 3, 4), simplify = F))
set.seed(101); w = runif(1e3*3)
all.equal(ffOP(m, w), ff(m, w))
#[1] TRUE
microbenchmark(ffOP(m, w), ff(m, w), times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# ffOP(m, w) 969.83968 986.47941 996.68563 1015.53552 1051.23847 10
# ff(m, w) 20.42426 20.64002 21.36508 21.97182 22.59127 10
For the record, I also implemented your approach in C; here are the benchmarks:
#> microbenchmark(ffOP(m, w), ff(m, w), ffC(m, w), times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# ffOP(m, w) 957.66691 967.09429 991.35232 1000.53070 1016.74100 10
# ff(m, w) 20.60243 20.85578 21.70578 22.13434 23.04924 10
# ffC(m, w) 36.24618 36.40940 37.18927 37.39877 38.83358 10
