Consider the first example: it calculates the running means inside the loop.
st <- Sys.time() # starting time
set.seed(123456789)
vara <- c()
sda <- c()
mvara <- c() # store the running mean of the variances
msda <- c()  # store the running mean of the standard deviations
K <- 100000
for (i in 1:K) {
  a <- rnorm(30)
  vara[i] <- var(a)
  sda[i] <- sd(a)
  mvara[i] <- mean(vara[1:i])
  msda[i] <- mean(sda[1:i])
}
et <- Sys.time()
et - st # time taken by the code (roughly more than a minute)
Now consider the same code, except that the running means are calculated outside the loop.
st <- Sys.time() # starting time
set.seed(123456789)
vara <- c()
sda <- c()
K <- 100000
for (i in 1:K) {
  a <- rnorm(30)
  vara[i] <- var(a)
  sda[i] <- sd(a)
}
mvara <- cumsum(vara) / (1:K)
msd <- cumsum(sda) / (1:K)
et <- Sys.time() # takes less than 5 seconds
I just wanted to know: why is there such a big difference in performance between the two versions, and what should one watch out for when using loops?
R is fastest when you let its internal, optimized code do the looping for you. My understanding of the reasons behind that is limited (the thread in the comment above has explanations from more knowledgeable people), but I believe some of it has to do with memory pre-allocation, and some with the way the problem is broken into more efficient pieces.
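To illustrate the pre-allocation point with your own example, here is a minimal sketch of the first loop with the result vectors reserved up front and the running means computed once after the loop:
set.seed(123456789)
K <- 100000
vara <- numeric(K) # pre-allocated instead of grown with c()
sda <- numeric(K)
for (i in 1:K) {
  a <- rnorm(30)
  vara[i] <- var(a)
  sda[i] <- sd(a)
}
mvara <- cumsum(vara) / (1:K) # running means computed once, outside the loop
msda <- cumsum(sda) / (1:K)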
Your "outside the loop" code could be made about another 20x faster (on my system it went from 7.17 sec to 0.43 sec) by creating all of the random numbers first and then summarizing the whole table at once, instead of switching between those two tasks inside the loop. And that's using dplyr; I presume a data.table solution could be another 5-10x faster still, especially given the large number of groups.
library(dplyr)
set.seed(123456789)
K <- 100000
n <- 30
a_df <- data.frame(trial = rep(1:K, each = n),
                   val = rnorm(K * n))
results <- a_df %>%
  group_by(trial) %>%
  summarize(vara = var(val),
            sda = sd(val)) %>%
  mutate(mvara = cumsum(vara) / trial,
         msd = cumsum(sda) / trial)
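For completeness, here is an untimed data.table sketch of the same grouped calculation (so treat the presumed 5-10x as a guess rather than a measurement):
library(data.table)
set.seed(123456789)
K <- 100000
n <- 30
a_dt <- data.table(trial = rep(1:K, each = n), val = rnorm(K * n))
results <- a_dt[, .(vara = var(val), sda = sd(val)), by = trial]
results[, `:=`(mvara = cumsum(vara) / trial, msd = cumsum(sda) / trial)]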
I'm new to R. For every pair of columns in a data frame or matrix, I am trying to compute the product of a fixed matrix with the 2-way frequency table of that pair, divided by the sequence length (i.e. the number of rows, which is 15). The problem is that the running time increases dramatically when I run it on 1K sequences (1K columns): it already takes more than 10 minutes, and for 10K sequences it could be more than an hour. The goal is to use it with as many sequences as possible.
mat1 <- matrix(sample(LETTERS, 100 * 15, replace = TRUE), ncol = 100, nrow = 15) # sampled with replacement so the matrix fills without recycling
mat2 <- matrix(sample(abs(rnorm(26, 0, 3))), ncol = 26, nrow = 26)
rownames(mat2) <- LETTERS
colnames(mat2) <- LETTERS
diag(mat2) <- 0
test_vec <- c()
for (i in seq(ncol(mat1) - 1)) {
  for (j in seq(i + 1, ncol(mat1))) {
    s2 <- table(mat1[, i], mat1[, j]) # create 2-way frequency table
    mat2_1 <- mat2[rownames(mat2) %in% rownames(s2),
                   colnames(mat2) %in% colnames(s2)]
    calc <- (1 / nrow(mat1)) * sum(mat2_1 * s2)
    test_vec <- append(test_vec, calc)
  }
}
Thanks for the help.
Here is an approach that converts mat1 to a data.table, turns every column into a factor with levels = LETTERS (so each 2-way table is always a full 26 x 26 matrix and mat2 never needs to be subset), and uses table(..., exclude = NULL).
library(data.table)
m <- as.data.table(mat1)[, lapply(.SD, factor, levels = LETTERS)]
g <- combn(colnames(m), 2, simplify = FALSE)
result <- sapply(g, function(x) sum(table(m[[x[1]]], m[[x[2]]], exclude = NULL) * mat2) / nrow(m))
Check equality:
sum(abs(result - test_vec) > 1e-10)
[1] 0
Here there are 4950 combinations (100*99/2), but the number of combinations will increase quickly as ncol(mat1) increases (as you point out). In that case you might find that a parallelized version works well.
library(doParallel)
library(data.table)
registerDoParallel()
m <- as.data.table(mat1)[, lapply(.SD, factor, levels = LETTERS)]
g <- combn(colnames(m), 2, simplify = FALSE)
result <- foreach(i = 1:length(g), .combine = c) %dopar%
  sum(table(m[[g[[i]][1]]], m[[g[[i]][2]]], exclude = NULL) * mat2)
result <- result / nrow(m)
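One optional piece of housekeeping, not part of the answer itself: you can register an explicit number of workers and release them once the foreach() call has finished.
library(doParallel)
registerDoParallel(cores = max(1, parallel::detectCores() - 1)) # explicit worker count
# ... run the foreach() call above ...
stopImplicitCluster() # release the implicitly created workers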
I have sampled a set of lakes at x time points throughout the year. I have also deployed loggers etc. in the water, and I want to use daily averages from these loggers over the window from x days/hours before the visit up to the time point of the visit. Sometimes I also just grab a point sample at the time point of the visit.
Below is my solution. It works just fine, but since I experiment a lot with model assumptions and perform sensitivity analyses, it runs unsatisfyingly slowly.
I seem to have solved most of my R problems with loops, and I often come across more efficient scripts, so it would be very interesting to see some more effective alternatives to my code.
The code below just generates some dummy data.
library(dplyr)
library(lubridate)
# function for calculating the % oxygen saturation
do.pct.sat <- function(x, y, z) {
  t <- x
  do <- y
  p <- z
  atm <- (p * 100) / 101325
  do.sat <- atm * exp(-139.34411 + 157570.1 / (t + 273.15) - 66423080 / (t + 273.15)^2 +
                        12438000000 / (t + 273.15)^3 - 862194900000 / (t + 273.15)^4)
  do.pct.sat <- (do / do.sat) * 100
  return(do.pct.sat)
}
#here's some dummy data resembling the real data
date.initial <- as.POSIXct("2022-06-01")#deployment date
date.end <- as.POSIXct("2022-10-01")#date of retrieval
id <- c("a","b","c")#lake id
lake <- list()#make dataset list for each lake
s <- list()#list of dataframes for the samples from the lake logger timelines
# the loop below generates dummy data; this is not part of the real script that I want to improve
for (i in 1:3) {
  datetime <- seq(from = date.initial, to = date.end, by = 10 * 60) # 10-minute intervals from deployment to retrieval
  l <- length(datetime) # vector length of datetime
  # set dummy data
  do <- rnorm(l, mean = 10, sd = 3)          # O2 concentration
  pressure <- rnorm(l, mean = 980, sd = 50)  # barometric pressure
  temp <- rnorm(l, mean = 15, sd = 5)        # water temperature
  k.z <- rnorm(l, mean = 0.35, sd = 0.1)     # gas exchange coefficient / mixed-layer depth
  dosat.pct <- do.pct.sat(temp, do, pressure) # oxygen saturation in %
  iso <- as.data.frame(cbind(datetime, do, dosat.pct, temp, pressure, k.z)) # bind dummy data frame to resemble real data
  iso$datetime <- as.POSIXct(iso$datetime, origin = "1970-01-01")
  lake[[i]] <- iso # save the data frame to the lake logger list
  samples <- as.POSIXct(sample((date.initial + 5 * 24 * 60 * 60):date.end, 7, replace = FALSE),
                        origin = "1970-01-01") # randomize 7 time points
  s[[i]] <- as.data.frame(samples) # save it in an empty data frame
  s[[i]]$lake <- id[i]
}
names(lake) <- id
samples <- bind_rows(s)
samples$samples <- round_date(samples$samples, unit = "10 minutes") # round the random samples to the nearest 10 minutes
Below is the function that I want to make more efficient (same libraries). I think it is slow because it handles one date at a time before moving on to the next:
sample.lakes <- function(average = 3) {
  dts <- list() # empty list
  for (i in 1:length(lake)) {
    print(id[i])
    data <- lake[[i]]
    y <- samples[grepl(id[i], samples$lake), ]
    dates <- y$samples
    # empty vectors to fill with values sampled in the loop
    avg.kz <- vector()
    sd.kz <- vector()
    do.mgl <- vector()
    dosat.pct <- vector()
    temp.c <- vector()
    for (k in 1:length(dates)) {
      print(k)
      # filter the logger data to the window from 'average' days before the sampling time point up to the time point itself
      prior.days <- filter(data, datetime > as.POSIXct(dates[k]) - (24 * 60 * 60) * average &
                             datetime < as.POSIXct(dates[k]))
      # fill the empty vectors: mean and sd of k.z, and point samples of the other variables
      avg.kz[k] <- mean(prior.days$k.z)
      sd.kz[k] <- sd(prior.days$k.z)
      temp.c[k] <- data[grepl(dates[k], data$datetime), ]$temp
      do.mgl[k] <- data[grepl(dates[k], data$datetime), ]$do
      dosat.pct[k] <- data[grepl(dates[k], data$datetime), ]$dosat.pct
    }
    sd.kz[is.na(sd.kz)] <- 0
    # add them to data frame y
    y$dosat.pct <- dosat.pct
    y$do.mgl <- do.mgl
    y$temp.c <- temp.c
    y$avg.kz <- avg.kz
    y$sd.kz <- sd.kz
    dts[[i]] <- y # add the per-lake data frame to the list
  }
  iso <- bind_rows(dts) # combine into a single data frame with all samples
  return(iso)
}
iso <- sample.lakes(average=4)#do not set average to > 5 in this example script
I would appreciate any suggestions a lot!
My guess is that this part using grepl:
data[grepl(dates[k],data$datetime),]
inside your inner for loop is slow.
Couldn't you instead try just seeing if the datetimes are the same with ==?
In addition, you only need to subset data once.
Try this as an alternative:
for (k in 1:length(dates)) {
  print(k)
  prior.days <- filter(data, datetime > as.POSIXct(dates[k]) - (24 * 60 * 60) * average &
                         datetime < as.POSIXct(dates[k]))
  avg.kz[k] <- mean(prior.days$k.z)
  sd.kz[k] <- sd(prior.days$k.z)
  sub_data <- data[data$datetime == dates[k], ]
  temp.c[k] <- sub_data$temp
  do.mgl[k] <- sub_data$do
  dosat.pct[k] <- sub_data$dosat.pct
}
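Going one step further (this is a separate idea, not part of the fix above, and it assumes every rounded sample timestamp actually occurs in data$datetime, which round_date() should guarantee here), the point lookups could all be done at once with match(), leaving only the window averages inside the loop:
idx <- match(dates, data$datetime) # row of the logger data for each sample time
temp.c <- data$temp[idx]
do.mgl <- data$do[idx]
dosat.pct <- data$dosat.pct[idx]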
I have the following code:
n <- 1e6
no_clm <- rpois(n, 30)
hold <- data.frame("x" = double(n))
c <- 1
for (i in no_clm) {
  ctl <- sum(rgamma(i, 30000) - 2000)
  hold[c, 1] <- ctl
  # hold <- rbind(hold, df)
  c <- c + 1
}
Unfortunately this code is quite slow. I've narrowed the slowness down to hold[c,1] <- ctl; if I remove that line, the code runs almost instantly.
How can I make this efficient? I need to store the results in some sort of data frame or list quickly. In reality the actual code is more complex than this, but the bottleneck is the assignment.
Note that the above is just an example; in reality I run several calculations on the rgamma samples, and each of those results is then stored in a large data frame.
Try this
hold <- data.frame(x = sapply(no_clm, function(x) {
  sum(rgamma(x, 30000) - 2000)
}))
It looks like you can just use one call to rgamma, since you are iterating over its number-of-observations parameter.
So if you make one call and then split the resulting vector into the lengths required (no_clm), you can iterate over that list and sum each piece.
n <- 1e6
no_clm <- rpois(n, 30)
hold <- data.frame("x" = double(n))
# total observations to use for rgamma
total_clm <- sum(no_clm)
# get values
gammas <- rgamma(total_clm, 30000) - 2000
# split into list of lengths dictated by no_clm
hold$x <- sapply(split(gammas, cumsum(sequence(no_clm) == 1)), sum)
This took about 5.9 seconds on my machine.
Move the calculation into an sapply() loop instead of a for loop, and then notice that 2000 * no_clm can be moved outside the loop (to minimise the number of function calls).
n <- 1e6
no_clm <- rpois(n, 30)
hold <- data.frame(x = sapply(no_clm, function(i) sum(rgamma(i, 30000))) - 2000 * no_clm)
You may observe a speed pickup using data.table:
library(data.table)
dt <- data.table(no_clm)
dt[, hold := sapply(no_clm, function(x) sum(rgamma(x, 30000) - 2000))]
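If you want to compare the suggestions yourself, here is a small benchmark sketch using the microbenchmark package (with a smaller n so it finishes quickly; the expression names are just labels):
library(microbenchmark)
set.seed(1)
n <- 1e4 # smaller n so the benchmark runs quickly
no_clm <- rpois(n, 30)
microbenchmark(
  sapply_version = sapply(no_clm, function(i) sum(rgamma(i, 30000))) - 2000 * no_clm,
  single_rgamma = {
    gammas <- rgamma(sum(no_clm), 30000) - 2000
    vapply(split(gammas, rep(seq_along(no_clm), no_clm)), sum, numeric(1))
  },
  times = 5
)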
I am working on optimizing some R code. As we all know, the for loop is often the most time-consuming part, so I am trying to replace it with lapply and experimenting to reduce the execution time.
As the profvis output shows, the for loop takes about 40 ms. The task is to minimize the execution time of this for loop by replacing it with lapply, so that the processing speed of the code is improved efficiently. The profvis library is used to identify the time required for every line of code.
I have tried using lapply, but I am running into issues with the implementation.
library(profvis)
library(dplyr) # filter() below comes from dplyr

profvis({
  rm(list = ls())
  # create dummy data
  row_id <- 100
  No_of_level <- 4
  spliz <- paste("c(", "TRUE,", paste(rep("FALSE", (row_id - 1)), collapse = ","), ")")
  d <- as.data.frame(matrix(, nrow = row_id * No_of_level, ncol = 2))
  names(d) <- c("Tag", "y_pred")
  d$Tag <- cumsum(rep(eval(parse(text = spliz)), 4))
  d$y_pred <- sample(3:4, row_id * No_of_level, rep = TRUE)
  d$y_pred <- paste("L", d$y_pred, sep = "")
  #### ------------------------------------
  # How to replace the for loop below with lapply and get the result in the variable?
  v <- data.frame()
  for (i in 1:max(d$Tag)) {
    s <- filter(d, Tag == i)
    s$y_pred <- as.character(s$y_pred)
    temp <- 0
    for (j in 1:nrow(s)) { # use a separate index for the inner loop
      s$R2[j] <- ifelse(s$y_pred[j] == "L3", temp <- temp + 1, 0)
    }
    s$seq <- seq_along(1:nrow(s))
    s$Aoc <- (1 - (s$R2 / s$seq)) * 100
    s$Aoc1 <- (s$R2 / s$seq)
    v <- rbind(v, s)
  }
})
Expected: improve the execution time of the for loop above. As of now the execution time is about 40 ms; with lapply, maybe we can bring the processing time down from 40 ms to 10 ms or less.
Not sure what your expected output is, but something like this should work:
v <- do.call(rbind,
             lapply(split(d, d$Tag), function(s) {
               res <- s
               is_L3 <- as.character(res$y_pred) == "L3"
               res$R2 <- ifelse(is_L3, cumsum(is_L3), 0)
               res$seq <- seq_len(nrow(res))
               res$Aoc <- (1 - (res$R2 / res$seq)) * 100
               res$Aoc1 <- (res$R2 / res$seq)
               # return
               res
             }))
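As a quick sanity check, you could keep the result of the original for loop under a different name (v_loop below is just a hypothetical name for it) and compare the computed columns:
# v_loop: the result of the original for loop, saved before running the lapply version
all.equal(v_loop$R2, v$R2)
all.equal(v_loop$Aoc, v$Aoc)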
I am a relatively new R programmer and have written a script that takes some statistical results and ultimately compares them to a vector of results in which the target variable has been randomized. The result vector contains the statistical results of n simulations. As the number of simulations increases (I would like to run at least 10,000), the run time becomes longer than I would like. I have improved the performance in the ways I know how, but would love help from others in optimizing it. The relevant part of the code is below.
#CREATE DATA
require(plyr)
Simulations <- 10001
Variation <- c("Control", "A", "B","C")
Trials <- c(727,724,723,720)
NonResponse <- c(692,669,679,682)
Response <- c(35,55,44,38)
ConfLevel <- .95
#PERFORM INITIAL CALCS
NonResponse <- Trials-Response
Data <-data.frame(Variation, NonResponse, Response, Trials)
total <- ddply(Data,.(Variation),function(x){data.frame(value = rep(c(0,1),times = c(x$NonResponse,x$Response)))})
total <- total[sample(1:nrow(total)), ]
colnames(total) <- c("Variation","Response")
#CREATE FUNCTION TO PERFORM SIMULATIONS
targetshuffle <- function(x) {
  shuffle_target <- x[, "Response"]
  shuffle_target <- data.frame(sample(shuffle_target))
  revised <- cbind(x[, "Variation"], shuffle_target)
  colnames(revised) <- c("Variation", "Yes")
  yes_variation <- data.frame(table(revised$Yes, revised$Variation))
  colnames(yes_variation) <- c("Yes", "Variation", "Shuffled_Response")
  Shuffled_Data <- subset(yes_variation, yes_variation$Yes == 1)
  Shuffled_Data <- Shuffled_Data[match(Variation, Shuffled_Data$Variation), ]
  yes_variation <- cbind(Data, Shuffled_Data)
  VectorPTest_All <- yes_variation[, c("Variation", "NonResponse", "Response", "Trials", "Shuffled_Response")]
  Control_Only <- yes_variation[yes_variation$Variation == "Control", ]
  VectorPTest_Chall <- subset(yes_variation, !(Variation == "Control"))
  VectorPTest_Chall <- VectorPTest_Chall[, c("Variation", "NonResponse", "Response", "Trials", "Shuffled_Response")]
  ControlResponse <- Control_Only$Response
  ControlResponseRevised <- Control_Only$Shuffled_Response
  ControlTotal <- Control_Only$Trials
  VariationCount <- length(VectorPTest_Chall$Variation)
  VP <- data.frame(c(VectorPTest_Chall, rep(ControlResponse), rep(ControlResponseRevised), rep(ControlTotal)))
  names(VP) <- c("Variation", "NonResponse", "Response", "Trials", "ResponseShuffled", "ControlReponse",
                 "ControlResponseShuffled", "ControlTotal")
  VP1 <<- data.frame(VP[, c(5, 7, 4, 8)])
  VP2 <<- data.frame(VP[, c(3, 6, 4, 8)])
  ptest <- apply(VP1, 1, function(column) prop.test(x = c(column[1], column[2]),
                                                    n = c(column[3], column[4]), alternative = "two.sided",
                                                    conf.level = ConfLevel, correct = FALSE)$p.value)
  min_p_value <- min(ptest)
  return(min_p_value)
}
#CALL FUNCTION
sim_result <- do.call(rbind, rlply(Simulations, targetshuffle(total)))
Offhand, one thing to look at is all the data frames you create. Each time you create one, you copy all of the data in the constituent objects. If the dimensions are predictable, you might consider creating empty matrices at the beginning of the function and populating them as you go.
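For example, a generic sketch of that pattern (the dimensions and column names here are illustrative, not taken from your script):
n_sims <- 10000
results <- matrix(NA_real_, nrow = n_sims, ncol = 2,
                  dimnames = list(NULL, c("min_p", "other_stat")))
for (i in seq_len(n_sims)) {
  # ... compute the statistics for simulation i ...
  results[i, ] <- c(runif(1), runif(1)) # placeholder values for the sketch
}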