improve execution time in a for loop (R) - r

Here is a reproducible example of a for loop. Since I want to do 3000 iterations and my matrix is way bigger than this reproducible example, the computer crushes. Any ideas on how can I do it?
I have read that loops are not reccomended in R and instead the web proposes to use vectors and apply functions but I wasn't able to build my formula with that functions...
The matrix:
row.names <- c('A2003','B2010','C2011','D2010','E2001','F2005','F2009','G2003','G2007','H2004','H2010')
sp1 <- c(4,83,1,2,4,3,1,5,7,2,4)
sp2 <- c(5,0,2,3,10,5,0,2,4,3,1)
sp3 <- c(7,2,4,8,7,2,4,83,1,5,7)
sp4 <- c(0,2,4,2,4,12,1,5,7,2,4)
Site <- c('A','B','C','D','E','F','F','G','G','H','H')
Year <- c('2003','2010','2011','2010','2001','2005','2009','2003','2007','2004','2010')
Obs <- c(1,1,1,4,9,6,8,2,5,2,3)
ID <- c('A2003','B2010','C2011','D2010','E2001','F2005','F2009','G2003','G2007','H2004','H2010')
df<- data.frame(row.names, sp1, sp2, sp3, sp4, Site, Year, Obs, ID)
rownames(df) <- df[,1]
df[,1] <- NULL
df
df.1 <- subset(df, Obs == 1)
df.more <- subset(df, Obs >= 2)
df.1
df.more
The loop function:
require (vegan)
iterations <- 3000
out <- vector("list", iterations)
for(i in 1:iterations){
rnd.more <- do.call(rbind, lapply(split(df.more, df.more$Site),
function(df.more) df.more[sample(nrow(df.more), 1,replace=FALSE) , ])
)
rnd.df <- rbind(df.1,rnd.more)
rnd.df.bc <- as.matrix(vegdist(rnd.df[1:4], method="bray"))
rnd.df.bc[lower.tri(rnd.df.bc,diag=TRUE)] <- NA
triang <- rnd.df.bc[!is.na(rnd.df.bc)]
mean.bc <- mean(triang)
out[[i]] <- list(rnd = rnd.df, bc = rnd.df.bc, ave = mean.bc)
}
Extraction of the results:
all.rnd.df <- lapply(out, "[[", "rnd")
capture.output(all.rnd.df,file="all.rnd.df.txt")
all.rnd.df.bc <- lapply(out, "[[", "bc")
capture.output(all.rnd.df.bc,file="all.rnd.df.bc.txt")
all.triang <- lapply(out, "[[", "ave")
capture.output(all.triang,file="all.triang.txt")

precompute the sample indices:
idx <- lapply(1:iterations, function(x) {
tapply(1:nrow(df.more), as.character(df.more$Site), function(y) {
if(length(y) == 1) y else sample(y, 1)
})
})
idx <- lapply(idx, function(ids) c(1:nrow(df.1), ids + nrow(df.1)))
precompute a placeholder data.frame to index
rnd.df <- rbind(df.1, df.more)
now you just index the precomputed objects without having to calculate them every loop:
iterations <- 3000
out <- vector("list", iterations)
for(i in 1:iterations){
rnd.df.bc <- as.matrix(vegdist(rnd.df[idx[[i]] ,1:4], method="bray"))
rnd.df.bc[lower.tri(rnd.df.bc,diag=TRUE)] <- NA
triang <- rnd.df.bc[!is.na(rnd.df.bc)]
mean.bc <- mean(triang)
out[[i]] <- list(rnd = rnd.df, bc = rnd.df.bc, ave = mean.bc)
}
Benchmarks:
f1 = my method
f2 = OPs code
> microbenchmark(f1(), f2(), times=5L)
Unit: seconds
expr min lq mean median uq max neval
f1() 2.21069 4.877017 4.666875 5.27416 5.444411 5.528096 5
f2() 13.54813 13.554965 19.500247 14.51089 27.074520 28.812732 5
Make it parallel:
cl <- makeCluster(3)
registerDoSNOW(cl)
out <- foreach(i = 1:iterations, .packages=c('vegan')) %do%
{
rnd.df.bc <- as.matrix(vegdist(rnd.df[idx[[i]] ,1:4], method="bray"))
rnd.df.bc[lower.tri(rnd.df.bc,diag=TRUE)] <- NA
triang <- rnd.df.bc[!is.na(rnd.df.bc)]
mean.bc <- mean(triang)
list(rnd = rnd.df, bc = rnd.df.bc, ave = mean.bc)
}
stopCluster(cl)

Not being familiar with the vegan package, I'm a little limited in what advice I can give you. For the most part, you've constructed your for loop well and as you can see below, you don't gain much by converting it to a function and running it through lapply.
I think your best bet is to parallelize your code. In the example below, if you convert your for loop to a function and use parLapply, you can shave a couple seconds off if you include the cluster build time. If you exclude the cluster build time, it's about 5 times faster on my 7 cores. Changes in computational time will vary by the number of cores you can run on. But I think that may be your best option for now.
library(parallel)
library(vegan)
row.names <- c('A2003','B2010','C2011','D2010','E2001','F2005','F2009','G2003','G2007','H2004','H2010')
sp1 <- c(4,83,1,2,4,3,1,5,7,2,4)
sp2 <- c(5,0,2,3,10,5,0,2,4,3,1)
sp3 <- c(7,2,4,8,7,2,4,83,1,5,7)
sp4 <- c(0,2,4,2,4,12,1,5,7,2,4)
Site <- c('A','B','C','D','E','F','F','G','G','H','H')
Year <- c('2003','2010','2011','2010','2001','2005','2009','2003','2007','2004','2010')
Obs <- c(1,1,1,4,9,6,8,2,5,2,3)
ID <- c('A2003','B2010','C2011','D2010','E2001','F2005','F2009','G2003','G2007','H2004','H2010')
df<- data.frame(row.names, sp1, sp2, sp3, sp4, Site, Year, Obs, ID)
rownames(df) <- df[,1]
df[,1] <- NULL
df
df.1 <- subset(df, Obs == 1)
df.more <- subset(df, Obs >= 2)
df.1
df.more
more.fun <- function(df.more, df.1)
{
rnd.more <- do.call(rbind, lapply(split(df.more, df.more$Site),
function(df.more) df.more[sample(nrow(df.more), 1,replace=FALSE) , ])
)
rnd.df <- rbind(df.1,rnd.more)
rnd.df.bc <- as.matrix(vegdist(rnd.df[1:4], method="bray"))
rnd.df.bc[lower.tri(rnd.df.bc,diag=TRUE)] <- NA
triang <- rnd.df.bc[!is.na(rnd.df.bc)]
mean.bc <- mean(triang)
list(rnd = rnd.df, bc = rnd.df.bc, ave = mean.bc)
}
start.orig <- Sys.time()
set.seed(pi)
iterations <- 3000
out <- vector("list", iterations)
for(i in 1:iterations){
rnd.more <- do.call(rbind, lapply(split(df.more, df.more$Site),
function(df.more) df.more[sample(nrow(df.more), 1,replace=FALSE) , ])
)
rnd.df <- rbind(df.1,rnd.more)
rnd.df.bc <- as.matrix(vegdist(rnd.df[1:4], method="bray"))
rnd.df.bc[lower.tri(rnd.df.bc,diag=TRUE)] <- NA
triang <- rnd.df.bc[!is.na(rnd.df.bc)]
mean.bc <- mean(triang)
out[[i]] <- list(rnd = rnd.df, bc = rnd.df.bc, ave = mean.bc)
}
end.orig <- Sys.time()
start.apply <- Sys.time()
fn = out <- lapply(1:3000, function(i) more.fun(df.more, df.1))
end.apply <- Sys.time()
start.parallel <- Sys.time()
cl <- makeCluster(7)
clusterEvalQ(cl, library(vegan))
clusterExport(cl, c("df.more", "df.1", "more.fun"))
start.parallel.apply <- Sys.time()
out <- parLapply(cl, 1:3000, function(i) more.fun(df.more, df.1))
end.parallel <- Sys.time()
#* Compare times
end.orig - start.orig
end.apply - start.apply
end.parallel - start.parallel
end.parallel - start.parallel.apply
(the time comparisons here are pretty crude)

Related

R vector memory exhausted for calculating tensor products

I am calculating the tensor products of 60,000 pairs of 28*28 matrices in RStudio (version 3.5.2), and the console shows me "Error: vector memory exhausted (limit reached?)". I don't think my MacBook Pro would have such low capacity (16GB RAM). I tried the mclapply method for parallel computing but still don't work. Can anyone provide me some insights? Thanks a lot!
If necessary, the following are my codes. I cannot run the last line.
install.packages("keras")
library(keras)
install_keras()
install_keras(method = "conda")
library(keras)
mnist <- dataset_mnist()
str(mnist)
trainx <- mnist$train$x
trainy <- mnist$train$y
testx <- mnist$test$x
testy <- mnist$test$y
trainxr <- trainx
trainxg <- trainx
trainxb <- trainx
testxr <- testx
testxg <- testx
testxb <- testx
#training data
i <- 1
for(i in i:60000){
randomr <- sample (0:255, 1)
randomg <- sample (0:255, 1)
randomb <- sample (0:255, 1)
trainxr[i,,] <- (randomr/255)*(trainx[i,,]/255)
trainxg[i,,] <- (randomg/255)*(trainx[i,,]/255)
trainxb[i,,] <- (randomb/255)*(trainx[i,,]/255)
i <- i+1
}
#testing data
j <- 1
for(j in j:10000){
randomr <- sample (0:255, 1)
randomg <- sample (0:255, 1)
randomb <- sample (0:255, 1)
testxr[j,,] <- (randomr/255)*(testx[j,,]/255)
testxg[j,,] <- (randomg/255)*(testx[j,,]/255)
testxb[j,,] <- (randomb/255)*(testx[j,,]/255)
j <- j+1
}
#for training
k <- 1
for(k in k:60000){
randomminus <- sample (0:255, 1)
matrixminus <- matrix((randomminus/255):(randomminus/255), nrow = 28, ncol = 28)
trainxr[k,,] <- trainxr[k,,] - matrixminus
trainxr[k,,] <- abs(trainxr[k,,])
trainxg[k,,] <- trainxg[k,,] - matrixminus
trainxg[k,,] <- abs(trainxg[k,,])
trainxb[k,,] <- trainxb[k,,] - matrixminus
trainxb[k,,] <- abs(trainxb[k,,])
k <- k+1
}
#for testing
l <- 1
for(l in l:10000){
randomminus <- sample (0:255, 1)
matrixminus <- matrix((randomminus/255):(randomminus/255), nrow = 28, ncol = 28)
trainxr[l,,] <- trainxr[l,,] - matrixminus
trainxr[l,,] <- abs(trainxr[l,,])
trainxg[l,,] <- trainxg[l,,] - matrixminus
trainxg[l,,] <- abs(trainxg[l,,])
trainxb[l,,] <- trainxb[l,,] - matrixminus
trainxb[l,,] <- abs(trainxb[l,,])
l <- l+1
}
#tensor product
stepone <- matrix(1:1, nrow=21952, ncol=28)
steptwo <- matrix(1:1, nrow=28, ncol=28)
trainxtensor_a <- trainxr %x% trainxg
I'm guessing you intending to collapse the 2nd and 3rd dimensions in that tensor product. Perhaps you want something like this:
library(tensor)
trainxtensor_a <- tensor(trainxr, trainxg, c(2,3), c(2,3))
Although you should try a smaller dataset to check if it is doing what you expect first:
trainxtensor_a <- tensor(trainxr[1:5,,], trainxg[1:5,,], c(2,3), c(2,3))

I want to use a list in a for loop code. list[[i]] works fine, but why doesn't res <- vector("list",n) work?

I am using a for loop to loop through two different matrices. The code looks like this:
x <- matrix(rnorm(1806),7,258)
x2 <- matrix(rnorm(1032),4,258)
samp_size <- 3
iter <- 1000
subs <- matrix(sample(1:nrow(x), samp_size*iter, replace=T),
ncol=samp_size, byrow=T)
subs2 <- matrix(sample(1:nrow(x2), samp_size*iter, replace=T),
ncol=samp_size, byrow=T)
for(j in 1:nrow(subs)){
ad <- x[subs[j,],]
ad1 <- x2[subs2[j,],]
rd <- rbind(ad,ad1)
dis <- dist(rd, method="euclidian")
#CV <- sd(unlist(rd), na.rm=TRUE)/mean(unlist(rd), na.rm=TRUE)*100
dis2 <- dis[dis!=0]
list[[j]] <- mean(dis2)
l <- unlist(list)
l <- na.omit(l)
sdis <- dist(ad, method="euclidian")
sdis2 <- sdis[sdis!=0]
res <- vector("list", 1000)
res[[j]] <- mean(sdis2)
l2 <- unlist(res)
l2 <- na.omit(l2)
}
list[[j]] works just fine.
The result of each iteration is added to it as a list element based on the value of j.
However, res[[j]] ends up being full of NULL with only the last element containing a value.
I am not sure what is happening here. I would like res[[j]] to behave the same way as list[[j]] does.
Help much appreciated.
Maybe initialize res:
# before the loop
res <- rep(list(NA),nrow(subs))
# or in the loop
res <- rep(list(NA),1000)
# or
res <- list(numeric(1000))

No appear Hist in R

library(downloader)
url <- "https://raw.githubusercontent.com/genomicsclass/dagdata/master/inst/extdata/femaleControlsPopulation.csv"
filename <- "femaleControlsPopulation"
download(url, destfile=filename)
dat <- read.csv(filename)
x <- unlist( read.csv(filename))
set.seed(1)
n <- 10000
averages5 <- vector("numeric", n)
for (i in 1:n){
x <- sample(x,5)
averages5[i] <- mean(x)
}
hist(averages5)
mean(abs(averages5-mean(x))>1)
I have the problem that appear empty hist and the mean is 0.....pls i need help
This will do what you want:
url <- "https://raw.githubusercontent.com/genomicsclass/dagdata/master/inst/extdata/femaleControlsPopulation.csv"
dat <- read.table(file=url, header=TRUE)
x <- dat$Bodyweight
n <- 10000
averages5 <- replicate(n, mean(sample(x,5)))
hist(averages5)

R apply function to a list of price data

in the code below I am grabbing data for three symbols, then I want to apply a simple function(which is a trading strategy) to this data. Ideally, I would then run statistics on those returns, such as those native to PerformanceAnalytics.
library("quantmod")
library("PerformanceAnalytics")
options(scipen=999)
PriceData <- new.env()
Symbols <- c("SPY", "QQQ", "IWM")
StartDt <- as.Date("2015-01-01")
suppressWarnings(getSymbols(Symbols, src="yahoo", env=PriceData, from=StartDt))
x <- list()
for (i in 1:length(Symbols)) {
x[[i]] <- get(Symbols[i], pos=PriceData)
}
SYSTEM <- function(data){
ret<- Delt(Cl(x[[i]]),Op(x[[i]]),type = 'arithmetic')
mavga <- SMA(Cl(x[[i]]), n=10)
mavgb <- SMA(Cl(x[[i]]), n=20)
sig <- ifelse(mavga>mavgb,1,0)+ifelse(mavgb>mavga,-1,0)
sig <- lag(sig,1)
sig[is.na(sig)]=0
strategyreturn <- sig * ret
return(strategyreturn)
}
######I'm doing something wrong here######
y <- lappy(x,SYSTEM)
z <- do.call("cbind", y)
PerformanceTable <- function(returns){
scalar <- 252
CS <- t(Return.cumulative(returns, geometric = FALSE))
SR <- t(SharpeRatio.annualized(returns, scale=scalar, geometric = FALSE))
SOR <- t(SortinoRatio(returns))
MDD <- t(maxDrawdown(returns))
WP <- colSums(returns > 0)/colSums(returns != 0)
WP <- as.data.frame(WP)
ASD = t(sd.annualized(returns, scale=scalar))
Stat <- cbind(CS,SR,SOR,MDD,ASD,WP)
colnames(Stat) <- c("Profit","SharpeRatio", "Sortino","MaxDrawdown", "AnnStdDev", "WinPct")
print("Performance Table")
print(Stat)
return (Stat)
}
Perf <- PerformanceTable(y)
You are almost there, just replace x[[i]] by data in your function:
SYSTEM <- function(data){
ret<- Delt(Cl(data),Op(data),type = 'arithmetic')
mavga <- SMA(Cl(data), n=10)
mavgb <- SMA(Cl(data), n=20)
sig <- ifelse(mavga>mavgb,1,0)+ifelse(mavgb>mavga,-1,0)
sig <- lag(sig,1)
sig[is.na(sig)]=0
strategyreturn <- sig * ret
return(strategyreturn)
}
and add a l to lappy()
y <- lapply(x,SYSTEM)
z <- do.call("cbind", y)

Using RStudio when I click on Run it executes properly but use source and it gives me a error message

I have the following code which optimizes and is suppose to export the results to a csv. All works well but the last line of code
write.csv(solutionsMatrix, file="SLATE/DK.A1.CSV")
When i click on Run it executes correctly but when i click on Source it gives me a error message
Error in source("~/.active-rstudio-document", echo = TRUE) :
~/.active-rstudio-document:76:34: unexpected symbol
75:
76: write.csv(solutionsMatrix, file="SLATE
With a up arrow point at the S in SLATE
Why is it doing this?? Thanks for looking
setwd("C:/DFS/NFL/R DATA")
library('lpSolve')
data <-read.csv ("DRAFTKINGS.CSV")
#Convert salary to numeric
data$Salary <- as.numeric(gsub(",","",data$Salary), data$Salary)
data$MAX.TEAM <- as.numeric(gsub(",","",data$MaxTeam), data$MaxTeam)
#Add binary valeus for positions 'Constraint Vectors'
data <- cbind(data, X=1)
data <- cbind(data, QB=ifelse(data$Pos=="QB",1,0))
data <- cbind(data, RB=ifelse(data$Pos=="RB", 1,0))
data <- cbind(data, WR=ifelse(data$Pos=="WR", 1,0))
data <- cbind(data, TE=ifelse(data$Pos=="TE", 1,0))
data <- cbind(data, DEF=ifelse(data$Pos=="DEF", 1,0))
#Objective Function. sum of proj pts
f.obj <- data$DKA1
#Constraints
num_X <- 9
num_QB <- 1
min_RB <- 2
min_WR <- 3
min_TE <-1
num_DEF <-1
max_RB <- 3
max_WR <-4
max_TE <-2
max_team_cost <- 50000
max_player_from_a_team <-data [2, 5]
#Constraints for max players from team
clubs <- sort(unique(data$Team))
team_constraint_vector <- c()
team_constraint_dir <- c()
team_constraint_rhs <- c()
for(i in 1:length(clubs)){
temp <- data$Team==as.character(clubs[i])
temp[temp==T] <- 1
temp[temp==F] <- 0
team_constraint_vector <- c(team_constraint_vector, temp)
team_constraint_dir <- c(team_constraint_dir, "<=")
team_constraint_rhs <- c(team_constraint_rhs, max_player_from_a_team)
}
solutions <- list()
solutionsMatrix <- matrix(, nrow=0, ncol=21)
for(i in 1:1){
f.con <- matrix (c(data$X,data$QB , data$RB ,data$RB ,data$WR, data$WR,data$TE, data$TE,data$DEF , data$Salary, team_constraint_vector), nrow=(10+length(clubs)), byrow=TRUE)
f.dir <- c("=","=", ">=","<=",">=","<=",">=","<=", "=", "<=", team_constraint_dir)
f.rhs <- c(num_X,num_QB,min_RB,max_RB,min_WR, max_WR,min_TE, max_TE, num_DEF, max_team_cost, team_constraint_rhs)
x <- lp ("max", f.obj, f.con, f.dir, f.rhs, all.bin=TRUE)
x
solutions[[i]] <- data[which(x$solution==1),]
solutionsMatrix <- rbind(solutionsMatrix, c("DKA1, sum(solutions[[i]]$DKA1), sum(solutions[[i]]$Salary), toString(solutions[[i]]$Name[4]),toString(solutions[[i]]$Pos[4]), toString(solutions[[i]]$Name[5]),toString(solutions[[i]]$Pos[5]), toString(solutions[[i]]$Name[8]),toString(solutions[[i]]$Pos[8]), toString(solutions[[i]]$Name[9]),toString(solutions[[i]]$Pos[9]), toString(solutions[[i]]$Name[6]),toString(solutions[[i]]$Pos[6]), toString(solutions[[i]]$Name[7]), toString(solutions[[i]]$Pos[7]),toString(solutions[[i]]$Name[2]),toString(solutions[[i]]$Pos[2]), toString(solutions[[i]]$Name[3]),toString(solutions[[i]]$Pos[3]), toString(solutions[[i]]$Name[1]),toString(solutions[[i]]$Pos[1])))
}
solutions[[1]]
#Solutions Matrix Optimization
solutionsMatrix
write.csv(solutionsMatrix, file="SLATE/DK.A1.CSV")
###################################################################

Resources