I'm trying to speed up a script that otherwise takes days to handle larger data sets. So, is there a way to completely vectorize the following script:
# k-fold cross validation
df <- trees # a data frame 'trees' from R.
df <- df[sample(nrow(df)), ] # randomly shuffles the data.
k <- 10 # Number of folds. Note k=nrow(df) in the leave-one-out cross validation.
folds <- cut(seq(from=1, to=nrow(df)), breaks=k, labels=FALSE) # creates unique numbers for k equally size folds.
df$ID <- folds # adds fold IDs.
df[paste("pred", 1:10, sep="")] <- NA # adds multiple columns "pred1"..."pred10" to speed up the following loop.
library(mgcv)
for(i in 1:k) {
# looping for different models:
m1 <- gam(Volume ~ s(Height), data=df, subset=(ID != i))
m2 <- gam(Volume ~ s(Girth), data=df, subset=(ID != i))
m3 <- gam(Volume ~ s(Girth) + s(Height), data=df, subset=(ID != i))
# looping for predictions:
df[df$ID==i, "pred1"] <- predict(m1, df[df$ID==i, ], type="response")
df[df$ID==i, "pred2"] <- predict(m2, df[df$ID==i, ], type="response")
df[df$ID==i, "pred3"] <- predict(m3, df[df$ID==i, ], type="response")
}
# calculating residuals:
df$res1 <- with(df, Volume - pred1)
df$res2 <- with(df, Volume - pred2)
df$res3 <- with(df, Volume - pred3)
Model <- paste("m", 1:10, sep="") # creates a vector of model names.
# creating a vector of mean-square errors (MSE):
MSE <- with(df, c(
sum(res1^2) / nrow(df),
sum(res2^2) / nrow(df),
sum(res3^2) / nrow(df)
))
model.mse <- data.frame(Model, MSE, R2) # creates a data frame of model names, mean-square errors and coefficients of determination.
model.mse <- model.mse[order(model.mse$MSE), ] # rearranges the previous data frame in order of increasing mean-square errors.
I'd appreciate any help. This code takes several days if run on 30,000 different GAM models and 3 predictors. Thanks
Related
I have an R code that contains some nested bracket for loop within which I used rmse() function from Metrics package. I tried it without the function and it worked, but inside my nested R code it does not.
Here is what I desire to do with R
I have generated a 50-time series dataset.
I lice the same time series dataset into chunks of the following sizes: 2,3,...,48,49 making me have 48 different time series formed from step 1 above.
I divided each 48-time series dataset into train and test sets so I can use rmse function in Metrics package to get the Root Mean Squared Error (RMSE) for the 48 subseries formed in step 2.
The RMSE for each series is then tabulated according to their chunk sizes
I obtained the best ARIMA model for each 48 different time series data set.
My R code
# simulate arima(1,0,0)
library(forecast)
library(Metrics)
n <- 50
phi <- 0.5
set.seed(1)
wn <- rnorm(n, mean=0, sd=1)
ar1 <- sqrt((wn[1])^2/(1-phi^2))
for(i in 2:n){
ar1[i] <- ar1[i - 1] * phi + wn[i]
}
ts <- ar1
t<-length(ts)# the length of the time series
li <- seq(n-2)+1 # vector of block sizes(i.e to be between 1 and n exclusively)
RMSEblk<-matrix(nrow = 1, ncol = length(li))#vector to store block means
colnames(RMSEblk)<-li
for (b in 1:length(li)){
l<- li[b]# block size
m <- ceiling(t / l) # number of blocks
blk<-split(ts, rep(1:m, each=l, length.out = t)) # divides the series into blocks
singleblock <- vector() #initialize vector to receive result from for loop
for(i in 1:10){
res<-sample(blk, replace=T, 100) # resamples the blocks
res.unlist<-unlist(res, use.names = F) # unlist the bootstrap series
# Split the series into train and test set
train <- head(res.unlist, round(length(res.unlist) * 0.6))
h <- length(res.unlist) - length(train)
test <- tail(res.unlist, h)
# Forecast for train set
model <- auto.arima(train)
future <- forecast(test, model=model,h=h)
nfuture <- as.numeric(out$mean) # makes the `future` object a vector
# use the `rmse` function from `Metrics` package
RMSE <- rmse(test, nn)
singleblock[i] <- RMSE # Assign RMSE value to final result vector element i
}
#singleblock
RMSEblk[b]<-mean(singleblock) #store into matrix
}
RMSEblk
The error I got
#Error in rmse(test, nn): unused argument (nn)
#Traceback:
But when I wrote
library(forecast)
train <- head(ar1, round(length(ar1) * 0.6))
h <- length(ar1) - length(train)
test <- tail(ar1, h)
model <- auto.arima(train)
#forecast <- predict(model, h)
out <- forecast(test, model=model,h=h)
nn <- as.numeric(out$mean)
rmse(test, nn)
It did work
Please point out what I am missing?
I am able to run your code after making two very small corrections in your for loop. See the two commented lines:
for (b in 1:length(li)){
l<- li[b]
m <- ceiling(t / l)
blk<-split(ts, rep(1:m, each=l, length.out = t))
singleblock <- vector()
for(i in 1:10){
res<-sample(blk, replace=T, 100)
res.unlist<-unlist(res, use.names = F)
train <- head(res.unlist, round(length(res.unlist) * 0.6))
h <- length(res.unlist) - length(train)
test <- tail(res.unlist, h)
model <- auto.arima(train)
future <- forecast(test, model=model,h=h)
nfuture <- as.numeric(future$mean) # EDITED: `future` instead of `out`
RMSE <- rmse(test, nfuture) # EDITED: `nfuture` instead of `nn`
singleblock[i] <- RMSEi
}
RMSEblk[b]<-mean(singleblock)
}
It is possible that these typos did not result in errors because nn and out were defined in the global environment while you ran the for loop. A good debugging tip is to restart R and try to reproduce the problem.
Your code does not define nn. Other code that works has nn. To start code with clean slate use this line as first executable line:
rm(list=ls())
I am trying to run a simulation code,
And in the matrix named par.est1 I am saving in the 5th and 6th columns the standard errors of the coefficients b1 and b2, but those happen to be the exact same on the 1000 repetitions. Could anyone know why is this happening? Maybe it has something to do with the way that I created the correlated variables?
This is the code:
set.seed(185736)
reps <- 1000 #repetitions
par.est1 <- matrix(NA, nrow=reps, ncol=6)
b1 <- 4
b2 <- 5.8
n <- 26
r <- 0.1
#Create correlated variables
library(MASS)
data <- mvrnorm(n=n, mu=c(0, 0), Sigma=matrix(c(1, r, r, 1), nrow=2), empirical=TRUE)
V1 = data[, 1] # standard normal (mu=0, sd=1)
V2 = data[, 2]
cor(V1, V2)
for(i in 1:reps){
Y <- V1*b1+V2*b2+rnorm(n,0,1) #The true DGP, with N(0,1) error
model1 <- lm(Y~V1+V2)
vcv1 <- vcov(model1)
par.est1[i,1] <- model1$coef[1]
par.est1[i,2] <- model1$coef[2]
par.est1[i,3] <- model1$coef[3]
par.est1[i,4] <- sqrt(diag(vcv1)[1]) #SE
par.est1[i,5] <- sqrt(diag(vcv1)[2])
par.est1[i,6] <- sqrt(diag(vcv1)[3])
}
Thank you.
Thanks user2554330.
Any way I can make correlated variables with different means and variances??
I did a bootstrap(a linear regression replicated 1000 times with replacement) and got 1000 intercepts(alpha) and 1000 slopes(beta),
msft.boot.sample <- list()
for (i in 1:1000) {
msft.boot.sample[[i]] <- sample(y_msft,size = 132, replace = TRUE)
}
x.boot.sample <- list()
for (i in 1:1000) {
x.boot.sample[[i]] <- sample(x, size = 132, replace = TRUE)
}
n <- 1000
my_lms <- lapply(1:n, function(i) lm(msft.boot.sample[[i]] ~ x.boot.sample[[i]]))
sapply(my_lms, coef)
summaries <- lapply(my_lms, summary)
Then,
coef(my_lms[[1]])["(Intercept)"]
# (Intercept)
# -0.0366332
coef(my_lms[[2]])["(Intercept)"]
# (Intercept)
# -0.01598145
coef(my_lms[[3]])["(Intercept)"]
# (Intercept)
# -0.02526318
I can get the one intercept and one slope at a time, but how to get all the alphas(or betas) in a time and put them in a data set?
I used a code like that but still got only one value of alpha.
for (i in 1:1000) {
alpha.1000 <- as.numeric(coef(my_lms[[i]])["(Intercept)"])
}
alpha.1000
# [1] -0.03495652
you are always overwriting the alpha.1000 value. Use this:
alpha.1000=numeric(1000)
for(i in 1:1000){
alpha.1000[i]<-as.numeric(coef(my_lms[[i]])["(Intercept)"])
}
If your regressions all have the same parameters, then
coef_list = sapply(my_lms, coef)
will give you a matrix with a column for each model and a row for each coefficient. A vector of intercepts is just the top row, coef_list[1, ].
I'm relatively new to JAGS and am running it through the R package jagsUI. I am building occupancy models, but want to summarize results as I go. So I have a matrix of 0s and 1s:
mat1 <- matrix(rbinom(10*10,1,.5),10,10)
y=mat1
That I want to run through the following model:
# Bundle data and summarize data bundle
str( win.data <- list(y = mat1, M = nrow(mat1), T = ncol(mat1)) )
# Specify model in BUGS language
sink("model.txt")
cat("
model {
# Priors
psi0 ~ dunif(0, 1)
p ~ dunif(0, 1)
for(t in 1:(T-1)){
rho[t] ~ dunif(-1,1)
}
beta0 ~ dnorm(0, 0.1)
# Likelihood
for (i in 1:M) { # Loop over sites
z[i,1] ~ dbern(psi0) # State model
y[i,1] ~ dbern(z[i,1]*p)
for (j in 2:T) { # Loop over replicate surveys
logit(psi[i,j])<- beta0 + rho[j-1]*z[i,j-1]
z[i,j] ~ dbern(psi[i,j])
y[i,j] ~ dbern(z[i,j]*p) # Observation model
}
}
# Derived quantities
coln[i,j] <- ifelse(z[i,j]-z[i,j-1]==1,1,0) # colonized
ext[i,j] <- ifelse(z[i,j-1]-z[i,j]==1,1,0) # went extinct
tot.coln[,j] <- sum(coln[,j]) # sum of colonized each survey
tot.ext[,j] <- sum(ext[,j]) # sum of extinctions each survey
Nocc[,j] <- sum(z[,j]) # total sites occupied each survey
coln.rate[,j] <- tot.coln[,j]/Nocc[,j]
ext.rate[,j] <- tot.ext[,j]/Nocc[,j]
}
",fill = TRUE)
sink()
# Initial values
zst <- apply(y, 1, max, na.rm=TRUE) # Avoid data/model/inits conflict
y<- as.matrix(y)
zst<- y
inits <- function(){list(z = zst)}
# Parameters monitored
params <- c("psi0", "p", "beta0", "coln.rate", "ext.rate")
# MCMC settings
ni <- 2000 ; nt <- 1 ; nb <- 1000 ; nc <- 3
# Call JAGS and summarize posteriors
library(jagsUI)
fm <- jags(win.data, inits, params, "model.txt", n.chains = nc,
n.thin = nt, n.iter = ni, n.burnin = nb)
print(fm, dig = 3)
The model runs except for the piece after "# Derived quantities". Basically I want to calculate the rate of change from 0 to 1 and from 1 to 0 in each survey. A couple of my thoughts on why it doesn't work. 1) z[i,j] isn't really 0s and 1s. 2) the calculations shouldn't go under Derived quantities. 3) ifelse from the JAGS manual isn't doing what I think.
I also tried using the "step" function replacing the first two lines after Derived quantities with:
coln[i,j] <- step(z[i,j]-z[i,j-1]-0.5) # colonized
ext[i,j] <- step(z[i,j-1]-z[i,j]-0.5) # went extinct
But no luck there. Any ideas?
You are indexing i and j here without looping through them. To make this work you would need to set it up within another nested for loop. Also, your extinction calculation was incorrect.
for(j in 2:T){
for(i in 1:M){
coln[i,j-1] <- ifelse(z[i,j]-z[i,j-1]==1,1,0) # colonized
ext[i,j-1] <- ifelse(z[i,j]-z[i,j-1]==-1,1,0) # went extinct
}
tot.coln[j-1] <- sum(coln[,j-1]) # sum of colonized each survey
tot.ext[j-1] <- sum(ext[,j-1]) # sum of extinctions each survey
Nocc[j-1] <- sum(z[,j-1]) # total sites occupied each survey
coln.rate[j-1] <- tot.coln[j-1]/Nocc[j-1]
ext.rate[j-1] <- tot.ext[j-1]/Nocc[j-1]
}
I have an issue finding the most efficient way to calculate a rolling linear regression over a xts object with multiple columns. I have searched and read several previously questions here on stackoverflow.
This question and answer comes close but not enough in my opinion as I want to calculate multiple regressions with the dependent variable unchanged in all the regressions. I have tried to reproduce an example with random data:
require(xts)
require(RcppArmadillo) # Load libraries
data <- matrix(sample(1:10000, 1500), 1500, 5, byrow = TRUE) # Random data
data[1000:1500, 2] <- NA # insert NAs to make it more similar to true data
data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))
NR <- nrow(data) # number of observations
NC <- ncol(data) # number of factors
obs <- 30 # required number of observations for rolling regression analysis
info.names <- c("res", "coef")
info <- array(NA, dim = c(NR, length(info.names), NC))
colnames(info) <- info.names
The array is created in order to store multiple variables (residuals, coefficients etc.) over time and per factor.
loop.begin.time <- Sys.time()
for (j in 2:NC) {
cat(paste("Processing residuals for factor:", j), "\n")
for (i in obs:NR) {
regression.temp <- fastLm(data[i:(i-(obs-1)), j] ~ data[i:(i-(obs-1)), 1])
residuals.temp <- regression.temp$residuals
info[i, "res", j] <- round(residuals.temp[1] / sd(residuals.temp), 4)
info[i, "coef", j] <- regression.temp$coefficients[2]
}
}
loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time) # prints the loop runtime
As the loop shows the idea is to run a 30 observations rolling regression with data[, 1] as the dependent variable (factor) every time against one of the other factors. I have to store the 30 residuals in a temporary object in order to standardize them as fastLm does not calculate standardized residuals.
The loop is extremely slow and becomes a cumbersome if the numbers of columns (factors) in the xts object increases to around 100 - 1,000 columns would take an eternity. I hope one has a more efficient code to create rolling regressions over a large data set.
It should be pretty quick if you go down to level of the math of the linear regression. If X is the independent variable and Y is the dependent variable. The coefficients are given by
Beta = inv(t(X) %*% X) %*% (t(X) %*% Y)
I'm a little confused about which variable you want to be the dependent and which one is the independent but hopefully solving a similar problem below will help you as well.
In the example below I take 1000 variables instead of the original 5 and do not introduce any NA's.
require(xts)
data <- matrix(sample(1:10000, 1500000, replace=T), 1500, 1000, byrow = TRUE) # Random data
data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))
NR <- nrow(data) # number of observations
NC <- ncol(data) # number of factors
obs <- 30 # required number of observations for rolling regression analysis
Now we can calculate the coefficients using Joshua's TTR package.
library(TTR)
loop.begin.time <- Sys.time()
in.dep.var <- data[,1]
xx <- TTR::runSum(in.dep.var*in.dep.var, obs)
coeffs <- do.call(cbind, lapply(data, function(z) {
xy <- TTR::runSum(z * in.dep.var, obs)
xy/xx
}))
loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time) # prints the loop runtime
Time difference of 3.934461 secs
res.array = array(NA, dim=c(NC, NR, obs))
for(z in seq(obs)) {
res.array[,,z] = coredata(data - lag.xts(coeffs, z-1) * as.numeric(in.dep.var))
}
res.sd <- apply(res.array, c(1,2), function(z) z / sd(z))
If I haven't made any errors in the indexing res.sd should give you the standardized residuals. Please feel free to fix this solution to correct any bugs.
Here is a much faster way to do it with the rollRegres package
library(xts)
library(RcppArmadillo)
#####
# simulate data
set.seed(50554709)
data <- matrix(sample(1:10000, 1500), 1500, 5, byrow = TRUE) # Random data
# data[1000:1500, 2] <- NA # only focus on the parts that are computed
data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))
#####
# setup for solution in OP
NR <- nrow(data)
NC <- ncol(data)
obs <- 30L
info.names <- c("res", "coef")
info <- array(NA, dim = c(NR, length(info.names), NC))
colnames(info) <- info.names
#####
# solve with rollRegres
library(rollRegres)
loop.begin.time <- Sys.time()
X <- cbind(1, drop(data[, 1]))
out <- lapply(2:NC, function(j){
fit <- roll_regres.fit(
y = data[, j], x = X, width = obs, do_compute = c("sigmas"))
# are you sure you want the residual of the first and not the last
# observation in each window?
idx <- 1:(nrow(data) - obs + 1L)
idx_tail <- idx + obs - 1L
resids <- c(rep(NA_real_, obs - 1L),
data[idx, j] - rowSums(fit$coefs[idx_tail, ] * X[idx, ]))
# the package uses the unbaised estimator so we have to time by this factor
# to get the same
sds <- fit$sigmas * sqrt((obs - 2L) / (obs - 1L))
unclass(cbind(coef = fit$coefs[, 2L], res = drop(round(resids / sds, 4))))
})
loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time)
#R Time difference of 0.03123808 secs
#####
# solve with original method
loop.begin.time <- Sys.time()
for (j in 2:NC) {
cat(paste("Processing residuals for factor:", j), "\n")
for (i in obs:NR) {
regression.temp <- fastLm(data[i:(i-(obs-1)), j] ~ data[i:(i-(obs-1)), 1])
residuals.temp <- regression.temp$residuals
info[i, "res", j] <- round(residuals.temp[1] / sd(residuals.temp), 4)
info[i, "coef", j] <- regression.temp$coefficients[2]
}
}
#R Processing residuals for factor: 2
#R Processing residuals for factor: 3
#R Processing residuals for factor: 4
#R Processing residuals for factor: 5
loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time) # prints the loop runtime
#R Time difference of 7.554767 secs
#####
# check that results are the same
all.equal(info[, "coef", 2L], out[[1]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 2L], out[[1]][, "res"])
#R [1] TRUE
all.equal(info[, "coef", 3L], out[[2]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 3L], out[[2]][, "res"])
#R [1] TRUE
all.equal(info[, "coef", 4L], out[[3]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 4L], out[[3]][, "res"])
#R [1] TRUE
all.equal(info[, "coef", 5L], out[[4]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 5L], out[[4]][, "res"])
#R [1] TRUE
Do notice this comment inside the above solution
# are you sure you want the residual of the first and not the last
# observation in each window?
Here is a comparison to Sameer's answer
library(rollRegres)
require(xts)
data <- matrix(sample(1:10000, 1500000, replace=T), 1500, 1000, byrow = TRUE) # Random data
data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))
NR <- nrow(data) # number of observations
NC <- ncol(data) # number of factors
obs <- 30 # required number of observations for rolling regression analysis
loop.begin.time <- Sys.time()
X <- cbind(1, drop(data[, 1]))
out <- lapply(2:NC, function(j){
fit <- roll_regres.fit(
y = data[, j], x = X, width = obs, do_compute = c("sigmas"))
# are you sure you want the residual of the first and not the last
# observation in each window?
idx <- 1:(nrow(data) - obs + 1L)
idx_tail <- idx + obs - 1L
resids <- c(rep(NA_real_, obs - 1L),
data[idx, j] - rowSums(fit$coefs[idx_tail, ] * X[idx, ]))
# the package uses the unbaised estimator so we have to time by this factor
# to get the same
sds <- fit$sigmas * sqrt((obs - 2L) / (obs - 1L))
unclass(cbind(coef = fit$coefs[, 2L], res = drop(round(resids / sds, 4))))
})
loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time)
#R Time difference of 0.9019711 secs
The time includes the time used to compute the standardized residuals.