The following code works, but as expected, it takes ages to execute for large vectors.
What would be the vectorised way to accomplish the same task:
x <- seq(0, 10, 0.01)
y <- seq(0, 10, 0.01)
df <- data.frame(vector1 = rnorm(10000), vector2 = rnorm(10000), vector3 = rnorm(10000))
m.out <- matrix(nrow = length(x), ncol = length(y))
a <- df$vector1
b <- df$vector2
c <- df$vector3
for (i in 1:length(x)) {
  for (j in 1:length(y)) {
    m.out[i, j] <- cor(x[i] * a + y[j] * b, c, use = "complete.obs", method = "pearson")
  }
}
Thanks,
Please see the vectorized version below: you can use mapply() with expand.grid(). To get back to the wide format you can use dcast() from the reshape2 package (it still takes some time, though):
set.seed(123)
x <- seq(0, 10, 0.01)
y <- seq(0, 10, 0.01)
# simulation
df <- data.frame(vector1 = rnorm(length(x)), vector2 = rnorm(length(x)), vector3 = rnorm(length(x)))
a <- df$vector1
b <- df$vector2
c <- df$vector3
v <- expand.grid(x, y)
v$out <- mapply(function(n, m) cor(n * a + m * b, c, use = "complete.obs", method = "pearson"), v[, 1], v[, 2])
library(reshape2)
z <- dcast(v, Var1 ~ Var2)
rownames(z) <- z$Var1
z <- z[, -1]
head(z[, 1:5])
Output:
0 0.01 0.02 0.03 0.04
0 NA 0.0140699293 0.0140699293 0.0140699293 0.0140699293
0.01 -0.01383734 0.0003350528 0.0065542508 0.0090938390 0.0103897953
0.02 -0.01383734 -0.0059841841 0.0003350528 0.0042062076 0.0065542508
0.03 -0.01383734 -0.0086178379 -0.0035752709 0.0003350528 0.0031310581
0.04 -0.01383734 -0.0099713568 -0.0059841841 -0.0024814273 0.0003350528
0.05 -0.01383734 -0.0107798236 -0.0075458061 -0.0045052606 -0.0018627055
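If a, b and c contain no missing values, you can avoid the per-cell cor() calls altogether by exploiting the bilinearity of covariance: cor(x[i]*a + y[j]*b, c) depends only on a handful of scalar variances and covariances, so the whole matrix can be built with outer(). A minimal sketch under that no-NA assumption (m.alt is just an illustrative name, not from the original post):
# Scalar building blocks; plain var/cov/sd are fine because we assume no NAs
cac <- cov(a, c); cbc <- cov(b, c); cab <- cov(a, b)
va  <- var(a);    vb  <- var(b);    sc  <- sd(c)
# cor(x*a + y*b, c) = (x*cov(a,c) + y*cov(b,c)) /
#                     (sqrt(x^2*var(a) + y^2*var(b) + 2*x*y*cov(a,b)) * sd(c))
num <- outer(x, y, function(xi, yj) xi * cac + yj * cbc)
den <- sqrt(outer(x^2 * va, y^2 * vb, "+") + 2 * outer(x, y) * cab) * sc
m.alt <- num / den  # same shape as m.out; the (0, 0) cell is NaN (zero variance)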
I would like to conditionally select values from dt2 based on values in dt1 in a row-wise manner, then pairwise correlate rows of dt2 and save the correlation values in a new matrix, dt3. Rather than explain at length in words, I think the R code below is more descriptive. I currently do this by looping over the data frame, which is quite slow. I am sure it is possible to do this in a vectorized manner to improve performance. Does anyone have a solution or suggestion? Thank you a lot!
library(data.table)
library(glue)  # for the progress messages below

dt1 <- data.table(a = round(runif(100)), b = round(runif(100)), c = round(runif(100)), d = round(runif(100)), e = round(runif(100)), f = round(runif(100)))
dt2 <- data.table(a = runif(100), b = runif(100), c = runif(100), d = runif(100), e = runif(100), f = runif(100))
m <- nrow(dt2)
n <- m
dt3 <- matrix(nrow = m, ncol = n)
col_vec <- 1:n
for (r in 1:m) {
  for (p in col_vec) {
    selection <- dt1[r, ] > 0 & dt1[p, ] > 0
    selection <- as.vector(selection)
    r_values <- as.numeric(dt2[p, ..selection])
    p_values <- as.numeric(dt2[r, ..selection])
    correlation_value <- cor(r_values, p_values, method = 'spearman', use = 'na.or.complete')
    dt3[r, p] <- correlation_value
    dt3[p, r] <- correlation_value
    print(glue('row {r} vs row {p}'))
  }
  col_vec <- col_vec[-1]
}
You could use cor()'s built-in NA-exclusion mechanism with use = "pairwise.complete.obs": set values in dt2 to missing where the corresponding dt1 value is 0, and then make a single cor() call.
library(data.table)
n <- 4
set.seed(42)
dt1 <- data.table(a = round(runif(n)), b = round(runif(n)), c = round(runif(n)), d = round(runif(n)), e = round(runif(n)), f = round(runif(n)))
dt2 <- data.table(a = runif(n), b = runif(n), c = runif(n), d = runif(n), e = runif(n), f = runif(n))
replace(t(dt2), t(dt1) == 0, NA) |>
  cor(method = "spearman", use = "pairwise.complete.obs")
#> [,1] [,2] [,3] [,4]
#> [1,] 1.0 1 -1 0.1
#> [2,] 1.0 1 NA -1.0
#> [3,] -1.0 NA 1 NA
#> [4,] 0.1 -1 NA 1.0
Here are both approaches wrapped in functions for benchmarking:
f_loop <- function(dt1, dt2) {
  m <- nrow(dt2)
  n <- m
  dt3 <- matrix(nrow = m, ncol = n)
  col_vec <- 1:n
  for (r in 1:m) {
    for (p in col_vec) {
      selection <- dt1[r, ] > 0 & dt1[p, ] > 0
      selection <- as.vector(selection)
      r_values <- as.numeric(dt2[p, ..selection])
      p_values <- as.numeric(dt2[r, ..selection])
      correlation_value <- cor(r_values, p_values, method = "spearman", use = "na.or.complete")
      dt3[r, p] <- correlation_value
      dt3[p, r] <- correlation_value
      # print(glue::glue("row {r} vs row {p}"))
    }
    col_vec <- col_vec[-1]
  }
  dt3
}

f_repl <- function(dt1, dt2) {
  replace(t(dt2), t(dt1) == 0, NA) |>
    cor(method = "spearman", use = "pairwise.complete.obs")
}
And test with bigger data:
n <- 100
set.seed(42)
dt1 <- data.table(a = round(runif(n)), b = round(runif(n)), c = round(runif(n)), d = round(runif(n)), e = round(runif(n)), f = round(runif(n)))
dt2 <- data.table(a = runif(n), b = runif(n), c = runif(n), d = runif(n), e = runif(n), f = runif(n))
# Check that we get the same result
all.equal(f_loop(dt1, dt2), f_repl(dt1, dt2))
#> [1] TRUE
bench::system_time(f_loop(dt1, dt2))
#> process real
#> 5.86s 5.92s
bench::system_time(f_repl(dt1, dt2))
#> process real
#> 188ms 198ms
I want to make the code below more efficient using the foreach package. I have tried for a very long time, but I can't get the same result as with the for-loops. I would like to use a nested foreach loop with parallelization, and as output I would like two matrices of dimension [R, length(b1)]. I would be very grateful for some suggestions!
library(doParallel)  # provides makeCluster() and registerDoParallel() (loads foreach)
library(doRNG)       # provides %dorng%

n <- c(100, 300, 500)
R <- 100
b0 <- 110
b1 <- seq(0.01, 0.1, length.out = 100)

## all combinations of n and b1
grid <- expand.grid(n, b1)
names(grid) <- c("n", "b1")

calcPower <- function(R, b0, grid) {
  cl <- makeCluster(3)
  registerDoParallel(cl)
  ## n and b1 coefficients
  n <- grid$n
  b1 <- grid$b1
  ## ensures reproducibility
  set.seed(2020)
  x <- runif(n, 18, 80)
  x.dich <- factor(ifelse(x < median(x), 0, 1))
  ## enables storing two outputs
  solution <- list()
  ## .options.RNG ensures reproducibility
  res <- foreach(i = 1:R, .combine = rbind, .inorder = TRUE, .options.RNG = 666) %dorng% {
    p.val <- list()
    p.val.d <- list()
    for (j in seq_along(b1)) {
      y <- b0 + b1[j] * x + rnorm(n, 0, sd = 10)
      mod.lm <- lm(y ~ x)
      mod.lm.d <- lm(y ~ x.dich)
      p.val <- c(p.val, ifelse(summary(mod.lm)$coef[2, 4] <= 0.05, 1, 0))
      p.val.d <- c(p.val.d, ifelse(summary(mod.lm.d)$coef[2, 4] <= 0.05, 1, 0))
    }
    solution[[1]] <- p.val
    solution[[2]] <- p.val.d
    return(solution)
  }
  dp.val <- matrix(unlist(res[, 1], use.names = FALSE), R, length(b1), byrow = TRUE)
  dp.val.d <- matrix(unlist(res[, 2], use.names = FALSE), R, length(b1), byrow = TRUE)
  stopCluster(cl)
  df <- data.frame(
    effectS = b1,
    power = apply(dp.val, 2, function(x) mean(x) * 100),
    power.d = apply(dp.val.d, 2, function(x) mean(x) * 100),
    n = factor(n))
  return(df)
}

## simulation for different n
tmp <- with(grid,
            by(grid, n,
               calcPower, R = R, b0 = b0))
## combine the 3 results
df.power <- rbind(tmp[[1]], tmp[[2]], tmp[[3]])
I created a foreach loop in the code below. A few changes had to be made: it is a lot easier to return a list than a matrix from foreach, since the results are combined with rbind, especially when you want to return multiple objects. My solution here is to save everything in a list and transform it into the matrices afterwards.
Note: there is one mistake in your code: summary(mod.lm.d)$coef[2,4] does not exist. I changed it to [2]; adjust it to your needs.
library(foreach)
library(doParallel)
cl <- makeCluster(3)   # cluster setup assumed here; it was not shown in the original answer
registerDoParallel(cl)

solution <- list()
df2 <- foreach(i = 1:R, .combine = rbind, .inorder = TRUE) %dopar% {
  set.seed(i)
  p.val <- list()
  p.val.d <- list()
  counter <- list()
  for (j in seq_along(b1)) {
    x <- sort(runif(n, 18, 80))
    x.dich <- factor(ifelse(x < median(x), 0, 1))
    y <- b0 + b1[j] * x + rnorm(n, 0, sd = 10)
    mod.lm <- lm(y ~ x)
    mod.lm.d <- lm(y ~ x.dich)
    p.val <- c(p.val, ifelse(summary(mod.lm)$coef[2] <= 0.05, 1, 0))
    p.val.d <- c(p.val.d, ifelse(summary(mod.lm.d)$coef[2] <= 0.05, 1, 0))
    counter <- c(counter, j)
  }
  solution[[1]] <- p.val
  solution[[2]] <- p.val.d
  solution[[3]] <- counter
  return(solution)
}
dp.val <- unlist(df2[, 1], use.names = FALSE)
dp.val.d <- unlist(df2[, 2], use.names = FALSE)
dp.val.matr <- matrix(dp.val, R, length(b1))
dp.val.d.matr <- matrix(dp.val.d, R, length(b1))
stopCluster(cl)
In reply to your comment: foreach does work with a normal for loop inside it (this again assumes a parallel backend is registered, as above). Minimal reproducible example:
df <- foreach(i = 1:R, .combine = cbind, .inorder = TRUE) %dopar% {
  x <- list()
  for (j in 1:3) {
    x <- c(x, j)
  }
  return(x)
}
I want to generate data from a function iterating over a range of values. The setting is best explained in a small example:
myfun <- function(a, b, sims) {
  x <- 3 / a * b
  y <- mean(a * rnorm(sims))
  return(data.frame(x = x, y = y))
}

# Output I want:
d <- data.frame(x = 0, y = 0)
d[1, ] <- myfun(a = 4, b = 2, sims = 100)
d[2, ] <- myfun(a = 4, b = 3, sims = 100)
d[3, ] <- myfun(a = 4, b = 4, sims = 100)
# --> With a for loop this is easy

# Using mdply, however, does not work
a <- expand.grid(a = 1:3)
d <- plyr::mdply(a, myfun, b = seq(1, 100, length = 100), sims = 100)
You can use Map:
data <- expand.grid(a = 1:3, b = 1:100)
result <- do.call(rbind, Map(myfun, data$a, data$b, MoreArgs = list(sims = 100)))
head(result)
# x y
#1 3.0 -0.17846248
#2 1.5 0.06837716
#3 1.0 0.01034184
#4 6.0 -0.02898619
#5 3.0 0.10077290
#6 2.0 0.22321839
A similar approach is to Vectorize myfun; Vectorize is a wrapper around mapply.
myfun_vec <- Vectorize(myfun)
t(myfun_vec(data$a, data$b, 100))
A purrr option:
result <- purrr::map2_df(data$a, data$b, myfun, sims = 100)
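For completeness, the plyr::mdply call from the question should also work once b is put into the grid itself instead of being passed as a fixed vector argument (a sketch, not from the original answer; d2 is just an illustrative name):
# mdply applies myfun row-wise, matching the grid columns a and b to the argument names
d2 <- plyr::mdply(data, myfun, sims = 100)  # data <- expand.grid(a = 1:3, b = 1:100) from above
head(d2)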
x1 <- as.matrix(seq(-32.768, 32.768, length = 100))
x2 <- as.matrix(seq(-32.768, 32.768, length = 100))
X <- cbind(x1, x2)
y <- outer(X, X, Ackley)
Function implementation:
Ackley <- function(x1, x2) {
  a <- 20
  b <- 0.2
  c <- 2 * pi
  fofx1 <- -a * exp(-b * sqrt(rowSums(X^2) / 100)) -
    exp(rowSums(cos((c * X) / 100))) + a + exp(1)
  fofx2 <- -a * exp(-b * sqrt(rowSums(X^2) / 100)) -
    exp(rowSums(cos((c * X) / 100))) + a + exp(1)
  return(fofx1 + fofx2)
}
I'm getting an error like this:
Error in dim(robj) <- c(dX, dY) :
  dims [product 40000] do not match the length of object [100]
I want to produce a surface plot like this:
ackley <- function(x1, x2) {
  a <- 20
  b <- 0.2
  c <- 2 * pi
  d <- 2
  fofx1 <- -a * exp(-b * sqrt(1 / d * sum(c(x1, x2)^2))) -
    exp(sum(cos(c * c(x1, x2)) / d)) + a + exp(1)
  # fofx2 <- -a*exp(-b*sqrt(sum(c(x1,x2)^2)/100)) -
  #   exp(sum(cos(c*c(x1,x2))/100)) + a + exp(1)
  return(fofx1)
}

Ackley <- Vectorize(ackley)

x1 <- seq(-32.768, 32.768, length = 500)
x2 <- seq(-32.768, 32.768, length = 500)
z <- outer(x1, x2, FUN = "Ackley")

library(plotly)
plot_ly(x = ~x1, y = ~x2, z = ~z, type = "surface") %>%
  layout(scene = list(aspectratio = list(x = 1, y = 1, z = 1)))
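If an interactive plot is not required, the same z matrix can also be drawn with base R's persp() (a sketch, not part of the original answer):
# Static base-graphics surface as an alternative to the plotly version
persp(x1, x2, z, theta = 40, phi = 25, col = "lightblue",
      xlab = "x1", ylab = "x2", zlab = "Ackley(x1, x2)")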
I have a large table with several thousand values for which I would like to compute the p-values using binom.test. As an example:
test <- data.frame("a" = c(4,8,8,4), "b" = c(2,3,8,0))
To add a third column called "pval" I use:
test$pval <- apply(test, 1, function(x) binom.test(x[2],x[1],p=0.05)$p.value)
This works fine for a small test sample such as the one above; however, when I try to use it on my actual dataset, it is far too slow. Any suggestions?
If you are just using the p-value, and always using two-sided tests, then simply extract that part of the code from the existing binom.test function.
simple.binom.test <- function(x, n) {
  p <- 0.5
  relErr <- 1 + 1e-07
  d <- dbinom(x, n, p)
  m <- n * p
  if (x == m) 1 else if (x < m) {
    i <- seq.int(from = ceiling(m), to = n)
    y <- sum(dbinom(i, n, p) <= d * relErr)
    pbinom(x, n, p) + pbinom(n - y, n, p, lower.tail = FALSE)
  } else {
    i <- seq.int(from = 0, to = floor(m))
    y <- sum(dbinom(i, n, p) <= d * relErr)
    pbinom(y - 1, n, p) + pbinom(x - 1, n, p, lower.tail = FALSE)
  }
}
Now test that it gives the same values as before:
library(testthat)
test_that(
  "simple.binom.test works",
  {
    # some test data
    xn_pairs <- subset(
      expand.grid(x = 1:50, n = 1:50),
      n >= x
    )
    # test that simple.binom.test and binom.test give the same answer for each row
    with(
      xn_pairs,
      invisible(
        mapply(
          function(x, n) {
            expect_equal(
              simple.binom.test(x, n),
              binom.test(x, n)$p.value
            )
          },
          x,
          n
        )
      )
    )
  }
)
Now see how fast it is:
xn_pairs <- subset(
  expand.grid(x = 1:50, n = 1:50),
  n >= x
)
system.time(
  with(
    xn_pairs,
    mapply(
      function(x, n) {
        binom.test(x, n)$p.value
      },
      x,
      n
    )
  )
)
##    user  system elapsed
##    0.52    0.00    0.52
system.time(
  with(
    xn_pairs,
    mapply(
      function(x, n) {
        simple.binom.test(x, n)
      },
      x,
      n
    )
  )
)
##    user  system elapsed
##    0.09    0.00    0.09
A five-fold speed up.
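One usage note (not part of the original answer): simple.binom.test hardcodes p = 0.5, while the question's apply() call used p = 0.05, so the extracted code would need that part adapted before being a drop-in replacement. Under the p = 0.5 assumption it can be applied row-wise like this:
# Hypothetical row-wise usage on the question's data frame, assuming p = 0.5 is wanted
test <- data.frame(a = c(4, 8, 8, 4), b = c(2, 3, 8, 0))
test$pval <- mapply(simple.binom.test, test$b, test$a)  # x = b successes, n = a trials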