Parallel computation for create_matrix() from the 'RTextTools' package in R

I am creating a DocumentTermMatrix using create_matrix() from RTextTools and building a container and model based on it. It is for extremely large datasets.
I do this for each category (factor level), so for each category it has to build the matrix, the container and the model. When I run the code below on a machine with, say, 16 cores and 64 GB of RAM, it uses only one core and less than 10% of the memory.
Is there a way I can speed up this process, perhaps using doParallel and foreach? Any information would certainly help.
#import the required libraries
library("RTextTools")
library("hash")
library(tm)
for (n in 1:length(folderaddress)) {
  # Initialize the variables
  traindata = list()
  matrix = list()
  container = list()
  models = list()
  trainingdata = list()
  results = list()
  classifiermodeldiv = 0.80
  # Create the directory to place the models and the output files
  pradd = paste(combinedmodelsaveaddress[n], "SelftestClassifierModels", sep = "")
  if (!file.exists(pradd)) {
    dir.create(file.path(pradd))
  }
  Data$CATEGORY <- as.factor(Data$CATEGORY)
  # Read the training files
  X <- split(Data, Data$CATEGORY)
  data <- lapply(seq_along(X), function(x) as.data.frame(X[[x]])[, 5])
  names(data) <- levels(Data$CATEGORY)
  list2env(data, envir = .GlobalEnv)
  files = as.matrix(names(data))
  fileno = length(files)
  fileno = as.integer(fileno)
  print(fileno)
  # For all the files in the training folder (the number of files in the training folder = number of categories in the taxonomy)
  for (i in 1:fileno) {
    filename = as.character(files[i, 1])
    data1 = as.data.frame(data[i])
    data1 = as.matrix(data1)
    filenamechanged = gsub("\\.[[:alnum:]]+", "", filename)
    type = matrix(data = as.character(filenamechanged), nrow = length(data1[, 1]), ncol = 1)
    data1 = cbind(data1, type)
    traindata[[i]] = data1
    print(i)
  }
  for (i in 1:fileno) {
    # Obtain the unique classified data from the train files for one category
    trainingdata1 = as.data.frame(traindata[[i]][, 1])
    uniquetraintweet = hash()
    typetrain1 = matrix(data = as.character(traindata[[i]][1, 2]), ncol = 1, nrow = length(trainingdata1[, 1]))
    # If the training data has fewer than 200 records for a category, do not create a model
    # A model created from a smaller set of data will not be accurate
    if (length(trainingdata1[, 1]) < 200) {
      matrix[[i]] = NULL
      next
    }
    # Obtain the unique classified data from the train files of all the other categories except the one considered as the training category
    trainingdata2 = matrix(data = "", nrow = 0, ncol = 1)
    for (j in 1:fileno) {
      if (j == i) next
      trainingdata2dummy = as.data.frame(traindata[[j]][, 1])
      length(trainingdata1[, 1])
      colnames(trainingdata2) = "feedbacks"
      colnames(trainingdata2dummy) = "feedbacks"
      trainingdata2 = rbind(trainingdata2, trainingdata2dummy)
    }
    # Consider one category as the training set and make the remaining categories "Others"
    typetrain2 = matrix(data = "ZZOther", nrow = length(trainingdata2[, 1]), ncol = 1)
    colnames(trainingdata1) = "feedbacks"
    trainingdata[[i]] = rbind(trainingdata1, trainingdata2)
    colnames(typetrain1) = "type"
    colnames(typetrain2) = "type"
    type = rbind(typetrain1, typetrain2)
    trainingdata[[i]] = cbind(trainingdata[[i]], type)
    trainingdata[[i]] = trainingdata[[i]][sample(nrow(trainingdata[[i]])), ]
    # Input the training set and the other set to the classifier
    mindoc = max(1, floor(min(0.001 * length(trainingdata[[i]][, 1]), 3)))
    # Create matrix
    matrix[[i]] <- create_matrix(trainingdata[[i]][, 1], language = "english",
                                 removeNumbers = FALSE, stemWords = FALSE, weighting = weightTf, minWordLength = 3,
                                 minDocFreq = mindoc, maxDocFreq = floor(0.5 * (length(trainingdata[[i]][, 1]))))
    # rowTotals <- apply(matrix[[i]], 1, sum)  # Find the sum of words in each document
    # matrix[[i]] <- matrix[[i]][rowTotals > 0, ]
    print(i)
    # Create container
    container[[i]] <- create_container(matrix[[i]], trainingdata[[i]][, 2], trainSize = 1:length(trainingdata[[i]][, 1]), virgin = FALSE)
    print(i)
    # Create models
    models[[i]] <- train_models(container[[i]], algorithms = c("SVM"))
    print(i)
  }
  save(matrix, file = paste(pradd, "/Matrix", sep = ""))
  save(models, file = paste(pradd, "/Models", sep = ""))
}

Here is an example of working with RTextTools in parallel. I created the dummy function using information to be found here.
The function myFun follows the introduction in the above link; at the end it writes a csv file (no directory is specified) containing the analytics/summary. Afterwards it is a straightforward application of the base R package parallel to run myFun in parallel.
library(parallel)
library(RTextTools)
# I. A dummy function
# Uses RTextTools
myFun <- function (trainMethod) {
  library(RTextTools)
  data(USCongress)
  # Create the document-term matrix
  doc_matrix <- create_matrix(USCongress$text, language = "english", removeNumbers = TRUE,
                              stemWords = TRUE, removeSparseTerms = .998)
  container <- create_container(doc_matrix, USCongress$major, trainSize = 1:4000,
                                testSize = 4001:4449, virgin = FALSE)
  # Train
  model <- train_model(container, trainMethod)
  classify <- classify_model(container, model)
  # Analytics
  analytics <- create_analytics(container, cbind(classify))
  summary(analytics)
  # Saving
  nameToSave <- paste(trainMethod, 'DocumentSummary.csv', sep = '_')
  write.csv(analytics@document_summary, nameToSave)
}
# II. Parallel Processing
#
# 1. Vector for parallelization & number of cores available
trainMethods <- c('SVM','GLMNET','MAXENT','SLDA','BOOSTING')
num_cores <- detectCores() - 1L
# 2. Start a cluster
cl <- makeCluster(num_cores)
# 3. Export Variables needed to the cluster
# specifying exactly which variables should be exported
clusterExport(cl, varlist = c('myFun', 'trainMethods'))
# 4. do in parallel
parLapply(cl, seq_along(trainMethods), function (n) myFun(trainMethod = trainMethods[n]))
# stop the cluster
stopCluster(cl)
In your case, you'd have to turn your code into a function myFun(n, ...) with n being an element of seq_along(folderaddress), and of course replace seq_along(trainMethods) with seq_along(folderaddress) in parLapply.
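For illustration, here is a rough, untested sketch of that wrapper; it assumes folderaddress, combinedmodelsaveaddress and Data exist exactly as in your script, and the body of your outer loop is dropped in unchanged:
library(parallel)
myFun <- function(n) {
  library(RTextTools)
  library(hash)
  library(tm)
  # ... paste the body of your `for (n in 1:length(folderaddress))` loop here,
  #     unchanged, ending with the two save() calls ...
}
cl <- makeCluster(detectCores() - 1L)
clusterExport(cl, varlist = c("myFun", "folderaddress",
                              "combinedmodelsaveaddress", "Data"))
parLapply(cl, seq_along(folderaddress), function(n) myFun(n))
stopCluster(cl)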
Of course, chances are there are ways besides parallelization to enhance your code. The problem is that without sample data, any suggested improvement is only conjecture.

Related

Ways to increase foreach %dopar% loop speed with large data frame

I have a foreach %dopar% loop set up to process my data in parallel, but I am looking at ways to increase the speed. Basically, I have a large data frame that is loaded using fread.
The size of the data frame is 225 obs x 655,369 variables. The foreach command selects two variables at a time and runs the process function (code that calculates various mediation, moderation, and conditional process models), for a total of 327,684 iterations. For this function the data must all be within the same data frame. I noticed that the size of the data frame seems to greatly slow down the foreach loop.
From what I can tell, the major cause of the slowdown due to data frame size is how the process function accesses the data. My guess is that each time foreach runs, the process function parses the entire data frame until it finds the correct variable for each of the inputs.
One of my thoughts is to just chunk the data into smaller data frames to speed up processing time, and then merge the outputs together at the end (see the rough sketch after my code below). But I was wondering if anyone else has any suggestions for speeding this up, as I am obviously not overly familiar with R.
The variable names for area_list and thickness_list, which are the mediators and the only values that change between loops, are labelled like this: the last number is either 0 or 1 for the pair, with all other numbers matching:
value_0_0_0 with value_0_0_1 for the first loop
value_1_0_0 with value_1_0_1 for the second loop
value_2_0_0 with value_2_0_1 for the third loop
value_3_0_0 with value_3_0_1 for the fourth loop
...
value_327684_1_0 with value_327684_1_1 etc.
options(scipen=999)
library(tidyverse)
library(foreach)
library(iterators)
library(parallel)
library(doParallel)
library("data.table")
library('janitor')
source("/scratch/R/process.r")
nCores <- detectCores() - 1
cl <- makeCluster(nCores)
registerDoParallel(cl)
my_data <- fread(
file = "/scratch/R/data.csv", header = TRUE, fill=TRUE, data.table = FALSE)
#Change values from -999 to NA in specific columns to avoid data issues (McAuley data)
my_data[, 88:133][my_data[, 88:133] == -999] <- NA
#Create dataframe for prepost useable data only
prepost_df <- subset(my_data, Select_PrePost==1)
pre_df <- subset(my_data, Select==1)
Large_MyData <- fread(
file = "/scratch/R/large.csv", header = TRUE, sep = ",", data.table = FALSE)
area_list <- names(Large_MyData)[grep("_1$",names(Large_MyData))]
thickness_list <- names(Large_MyData)[grep("_0$",names(Large_MyData))]
merged_data <- merge(pre_df, Large_MyData, by = "subs")
yvalue = "y"
xvalue = "x"
covariates = c("a","g","e")
ptm <- proc.time()
loopResults <-
  foreach(area = area_list, thickness = thickness_list, .combine = rbind) %dopar% {
    if (merged_data[area][1, 1] == 0) {
      merge_df3 <- rbind(area, thickness)
      merge_df_out <- cbind(merge_df3, yvalue, xvalue, '', '', '', '', '', '')
    } else {
      result <- process(data = merged_data, y = yvalue, x = xvalue, m = c(area, thickness), cov = covariates,
                        model = 4, contrast = 1, boot = 5000, save = 2, modelbt = 1, outscreen = 0)
      indirectEffects <- result[23:24, 1:4]
      indirectEffects_bootmean_area <- result[27, 2] * result[38, 2]
      indirectEffects_bootmean_thickness <- result[32, 2] * result[39, 2]
      indirectEffects_bootscore_area <- (indirectEffects_bootmean_area / result[23, 2])
      indirectEffects_bootscore_thickness <- (indirectEffects_bootmean_thickness / result[24, 2])
      merge_df1 <- rbind(indirectEffects_bootmean_area, indirectEffects_bootmean_thickness)
      merge_df2 <- rbind(indirectEffects_bootscore_area, indirectEffects_bootscore_thickness)
      merge_df3 <- rbind(area, thickness)
      merge_df_out <- cbind(merge_df3, yvalue, xvalue, indirectEffects, merge_df1, merge_df2)
    }
  }
proc.time() - ptm
stopCluster(cl)
colnames(loopResults) <- c("Vector","yvalue","xvalue","Effect","BootSE","BootLLCI","BootULCI","BootMean","boot_score")
loopResults
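To illustrate the chunking idea mentioned above, here is a rough, untested sketch of what I have in mind: build a narrow data frame for each iteration that holds only the two mediators plus the fixed columns, so process() never has to search the full 655,369-column merged_data. I am not sure whether this also avoids shipping the full merged_data to each worker, though.
# untested sketch; reuses merged_data, area_list, thickness_list,
# yvalue, xvalue and covariates from the script above
fixed_cols <- c(yvalue, xvalue, covariates)
loopResults <-
  foreach(area = area_list, thickness = thickness_list, .combine = rbind) %dopar% {
    narrow_data <- merged_data[, c(fixed_cols, area, thickness)]
    if (narrow_data[1, area] == 0) {
      cbind(rbind(area, thickness), yvalue, xvalue, '', '', '', '', '', '')
    } else {
      result <- process(data = narrow_data, y = yvalue, x = xvalue,
                        m = c(area, thickness), cov = covariates,
                        model = 4, contrast = 1, boot = 5000, save = 2,
                        modelbt = 1, outscreen = 0)
      # ... extract the indirect effects exactly as in the loop above ...
    }
  }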

RAM demands increase exponentially when running a for-loop executing the glm function multiple times

I want to detect interaction effects in a dataset, so I have written code that creates all the possible combinations of predictors, fits glm models with each pair separately, and stores the vital statistics of the model and the model itself.
My dataset comprises 100,000+ observations, and I want to examine 200,000+ possible pair combinations.
The code runs without errors, but the problem is that after the 2,000th iteration the 60 GB of RAM on my PC have been filled (when I begin running the code there are 58 GB of free RAM).
For a reproducible example I will use the mtcars dataset:
library(data.table)
data(mtcars)
setDT(mtcars)
predictor_names <- setdiff(names(mtcars) , "am")
combinations <- combn(length(predictor_names) , 2)
combinations <- t(combinations)
combinations <- as.data.frame(combinations)
models_glm <- list()
Coefficients_dt <- data.table(Predictor_1 = character() , Predictor_2 = character(), dev_ratio = numeric() ,
Estimate = numeric(), p.value = numeric())
system.time(
for (i in (1 : (nrow(combinations) - 1 ))) {
# Extracts the index positions of the names of the pairs
#----------------------------------------------------------
m <- combinations[i, 1]
k <- combinations[i, 2]
# Extracts the names of the predictors from a vector that holds them
#------------------------------------------------------------------------
m_name <- predictor_names[m]
k_name <- predictor_names[k]
# Uses the names of the predictors to construct a formula
#------------------------------------------------------------------
formula_glm <- paste0( "am ~ " , m_name, " * " , k_name)
formula_glm <- as.formula(formula_glm )
# Passes the formula to a glm model
#-------------------------------------------------------------------
model <- glm(formula_glm , mtcars, family = "binomial")
# Stores the model to a list
#-------------------------------------------------------------------
models_glm [[ paste0(m_name , "_*_" , k_name)]] <- model
# Calculates the dev.ratio
#---------------------------------------------------------------
residual.deviance <- model$deviance
null.deviance <- model$null.deviance
dev.ratio <- (null.deviance - residual.deviance) / null.deviance
# Extracts the Coefficient estimate and p-value from the model
#-------------------------------------------------------------------
Coefficients_df <- as.data.frame(summary(model)$coefficients)
names(Coefficients_df) <- c("Estimate" , "SE" , "Z", "p.value")
if(dim(Coefficients_df)[1] == 4){
Coefficients_dt <- rbind(Coefficients_dt , data.table(
Predictor_1 = m_name ,
Predictor_2 = k_name,
dev_ratio = dev.ratio,
Estimate = Coefficients_df$Estimate[4] ,
p.value = Coefficients_df$p.value[4]
))
}
}
)
What can I do to overcome this problem?
That is, I would like to understand the root cause of the problem: what is taking up space in RAM? The objects involved are not very large compared to the available RAM; specifically, the Coefficients_dt data.table will at most grow to 200,000 rows x 5 columns.
So something else is going on and consumes more and more RAM as the iterations build up in the for-loop.
Next, I would like to understand whether there is some action I can take in the middle of the for-loop -- e.g. a command nested in an if statement within the loop -- that would free RAM while saving any objects that are about to be removed from RAM and should be safeguarded.
Your advice will be appreciated.
Consider the following options:
Pre-allocate any needed objects: instead of expanding them with values iteratively, which requires the machine to reallocate memory each time, you assign values to existing elements:
models_glm <- vector(mode = "list", length = 45)
In fact, even consider naming the elements beforehand:
pnames <- sapply(seq_len(nrow(combinations) - 1), function(i){
  paste0(predictor_names[combinations[i,1]], "_*_",
         predictor_names[combinations[i,2]])
})
models_glm <- setNames(vector(mode="list", length=45), pnames)
Use data.table::rbindlist() to row-bind a list of data.tables into one large data.table in a single call, instead of growing a data frame row by row inside a loop. The version below uses lapply, which returns an object the same length as its input. Also, notice the empty data.table returned instead of NULL; its zero rows simply disappear when rbindlist combines the list:
dTList <- lapply(seq(nrow(combinations) - 1), function(i) {
  # ... same as above ...
  # use the <<- operator to update the environment object outside the function
  models_glm[[paste0(m_name, "_*_", k_name)]] <<- model
  # ...
  Coefficients_df <- setNames(as.data.frame(summary(model)$coefficients),
                              c("Estimate", "SE", "Z", "p.value"))
  if (dim(Coefficients_df)[1] == 4) {
    data.table(
      Predictor_1 = m_name,
      Predictor_2 = k_name,
      dev_ratio = dev.ratio,
      Estimate = Coefficients_df$Estimate[4],
      p.value = Coefficients_df$p.value[4]
    )
  } else {
    # RETURN EMPTY DT
    data.table(
      Predictor_1 = character(),
      Predictor_2 = character(),
      dev_ratio = numeric(),
      Estimate = numeric(),
      p.value = numeric()
    )
  }
})
coefficients <- data.table::rbindlist(dTList)
rm(dTList)
gc()
Finally, for large runs that no longer need design/programming work, consider using the non-interactive Rscript.exe rather than RStudio or Rgui, as the latter programs require additional resources. Below is a command line that can be run from PowerShell, the CMD prompt, or a batch (.bat) file, assuming Rscript is on the PATH environment variable:
Rscript "C:\Path\To\ModelCoefficientDataBuild.R"
Specifically, RStudio's rsession.exe on Windows tends not to release memory back to the OS once it has obtained it, until the session is over. See the RStudio forum posts on the subject. Of course, be sure to save the objects you need to disk for later use:
saveRDS(coefficients, "coefficients_datatable.rds")
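Reading it back into a later session is then just the matching base function:
coefficients <- readRDS("coefficients_datatable.rds")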

Parallel computing with foreach - Saving loop outputs to global list

I would like to run a large loop with the foreach function. This means using the %dopar% operator.
I can't find any questions already answered to this problem exactly. If this is a duplicate though, please point me in the right direction and I'll close this question.
I have been having mixed success. It works for simple examples on my machine, as per the help documentation, however I cannot seem to get good results for my own work.
My example is slightly more complicated, so the devil seems to be in the detail, as always! I have also read the 'white paper' provided by the package creators Revolution Analytics (you can get it here).
I don't see how best to use the .combine argument to get the results into my global output list.
I would like to assign the calculated values to one big list, as opposed to using cbind or c.
My example is pretty convoluted, but if I simplify it any further then any answers might not address my issue.
I will perform a kind of moving-window linear model: fit a model using lm() over 50 observations [1:50], predict the 51st observation [51], and save the results to a list.
Then I shift it all one observation further, so an lm over [2:51] to predict the 52nd observation [52].
I will use a total of 100 observations, so I can make a maximum of 50 predictions.
## ============================================ ##
## Setup the backend for the foreach function ##
## ============================================ ##
## doMC calls upon cores on demand, uses them and closes them
library(doMC)
registerDoMC(detectCores()) #detectCores() uses all cores
## for Windows users
#library(doParallel) --> for Windows users
#registerDoParallel(detectCores())
## ======================== ##
## Create some dummy data ##
## ======================== ##
## three columns, one hundred observations
library(data.table)
my_data <- data.table(outcome = runif(100), V1 = 3*runif(100), V2 = sqrt(runif(100)))
## Have a look at the data if you like - using the DT package
library(DT)
datatable(my_data, options = list(pageLength = nrow(my_data)))
## ================================= ##
## Perform the loop the normal way ##
## ================================= ##
## Create container (a list of lists) for results
my_results <- sapply(c(paste0("step_", seq(1:50))), function(x) NULL)
step_results <- sapply(c("coefs", "rank", "error"), function(x) NULL)
for(i in 1:length(my_results)){my_results[[i]] <- step_results}
## Use a for loop to step through all the 50 'slices'
for(i in 1:50) { #max. 50 predictions possible
## Fit a linear model
my_fit <- lm("outcome ~ V1 + V2", data = my_data[i:(i+49)])
## Predict the next step
my_pred <- predict(my_fit, newdata = my_data[i+50, .(V1, V2)])
error <- my_data$outcome[i+50] - my_pred #simply measure the delta to the actual value
## Assign some results to the container created earlier
my_results[[i]][[1]] <- my_fit$coefficients
my_results[[i]][[2]] <- my_fit$rank
my_results[[i]][[3]] <- error
}
str(my_results) ## Keep this container to compare to our next one
## ============================================ ##
## Perform the loop using foreach and %dopar% ##
## ============================================ ##
## Create same results object for results as previously for parallel results
par_results <- sapply(c(paste0("step_", seq(1:50))), function(x) NULL)
step_results <- sapply(c("coefs", "rank", "error"), function(x) NULL)
for(i in 1:length(par_results)){par_results[[i]] <- step_results}
my_results_par <- foreach(i = 1:50) %dopar%
{ #max. 50 predictions possible
my_fit <- lm("outcome ~ V1 + V2", data = my_data[i:(i+49)])
my_pred <- predict(my_fit, newdata = my_data[i+50, .(V1, V2)])
error <- my_data$outcome[i+50] - my_pred
## Assign some results to the container created earlier
par_results[[i]][[1]] <- my_fit$coefficients
par_results[[i]][[2]] <- my_fit$rank
par_results[[i]][[3]] <- error
Sys.sleep(i/20) #Allows time to see R processes spawn on your system
return(par_results)
}
## We can see straight away that this didn't work as I would like it to
identical(my_results, my_results_par) #FALSE
## This shows that the output seems good on the surface
class(my_results_par)
length(my_results_par)
## This shows that it doesn't (WARNING: very long)
str(my_results_par)
You can try out the various .combine arguments in the foreach function, for example:
foreach(i = 1:50, .combine = "c") %dopar% {computation}
or
foreach(i = 1:50, .combine = "cbind") %dopar% {computation}
These produce a vector and a matrix respectively, but they do not contain all the results that I was trying to save in each loop.
Questions
Does that structure give you a clue as to what is going on?
How might I use .combine argument to create my desired output?
Is what I am trying to do even possible??
Do I need to put the loop with foreach at a different point in the algorithm?
I have read that you can supply a custom function to foreach... might this be the way to do it? I still don't see how I would combine the results.
Yes, this can easily be done. We can modify your code for the foreach step to the following, where the .packages argument loads data.table on each worker.
my_results_par <- foreach(i = 1:50, .combine = append, .packages = c("data.table")) %dopar%
{
my_fit <- lm("outcome ~ V1 + V2", data = my_data[i:(i+49)])
my_pred <- predict(my_fit, newdata = my_data[i+50, .(V1, V2)])
error <- my_data$outcome[i+50] - my_pred
par_results <- list(
coefs = my_fit$coefficients,
rank = my_fit$rank,
error = error
)
par_results <- list(par_results)
names(par_results) <- paste0("step_", i)
return(par_results)
}
identical(my_results, my_results_par)
[1] TRUE
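If you prefer, you can also skip .combine entirely, since foreach already returns a list by default; a sketch (untested) is to return the per-step list and attach the step names afterwards:
my_results_par <- foreach(i = 1:50, .packages = "data.table") %dopar% {
  my_fit <- lm("outcome ~ V1 + V2", data = my_data[i:(i+49)])
  my_pred <- predict(my_fit, newdata = my_data[i+50, .(V1, V2)])
  list(coefs = my_fit$coefficients,
       rank = my_fit$rank,
       error = my_data$outcome[i+50] - my_pred)
}
names(my_results_par) <- paste0("step_", 1:50)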

How to run a script for multiple inputs and save result objects based on the name of input?

To run the same set of commands and save the result objects for each time series, I wrote the script in the following manner :
# Specify time series to be used
dat <- tsname
# Run a set of commands and fit models with different parameters
dat.1 <- model1(dat)
dat.2 <- model2(dat)
dat.3 <- model3(dat)
# Save objects for further analysis
tsname.1 <- dat.1
tsname.2 <- dat.2
tsname.3 <- dat.3
save(tsname.1, tsname.2, tsname.3, file = paste0("tsname", ".rda"))
In this way, we just need to change the script at the beginning and the end, save the script for each time series, and run each of them individually or from a main script.
The main reason for this method was that I could not find a way to rename the objects created, and some searching suggested that the above is the only way to do it.
Now that the number of series has increased, it is preferable to use a for loop, foreach, a batch script or commandArgs() to run one script and specify all time series as arguments.
To make that work, though, the script must find a way to assign these objects using the name of the series itself, so that they can be loaded later for further analysis.
How can we make such a script work, or is there a better approach? Which method of looping will work in that case?
A MWE
set.seed(1)
tsdata <- ts(rnorm(250), start = c(1980,1), frequency = 12)
dat <- tsdata
dat.11 <- arima(dat, order = c(1, 1, 1))
dat.21 <- arima(dat, order = c(2, 1, 0))
tsname.11 <- dat.11 # problem is to specify this step in each script
tsname.21 <- dat.21
save(tsname.11, tsname.21, file = "tsname.rda")
REVISED the code
How can we execute this script for multiple time series and store the results and result objects for further analysis? If a batch command can be used, what is the best way to supply the set of multiple time series?
How can we run the script for one series, and then over a set of time series of the same or mixed lengths?
I show a couple of ways to create and retrieve individual objects using assign and get, but also provide an alternative where all model runs are stored as different elements of a list. Similarly, I show how you can save each model run in a separate file (soi.1.rda, etc.), but also that you can save everything together in one step :)
# ===========================================
# = Set up model params, generate test data =
# ===========================================
mod.param <- 1:5 # orders of AR to try ...
test.soi <- arima.sim(model=list(ar=c(0.5, -0.2)), n=20)
# ===========================================================
# = Create empty vectors/ list to store data and data names =
# ===========================================================
dat.names <- c() # a place to store the names of the individual objects that we'll create
save.names <- c() # the names of the files to save, e.g., "soi.1"
dat.all <- list() # as an alternative, you can save each analysis in different elements of a list
# ===================================================
# = Loop through each type of model, saving results =
# ===================================================
for(i in 1:length(mod.param)){ # loop through each model you want to run
temp.dat <- arima(test.soi, order=c(mod.param[i], 0, 0)) # temp.dat is the current model result
dat.names[i] <- paste("dat", i, sep=".") # dat.names stores the names of all the dat.x objects
assign(dat.names[i], temp.dat) # use assign() to create an object with name of temp.dat.name
# dat.all[[dat.names[i]]] <- temp.dat # store the object in a list
dat.all[[dat.names[i]]] <- get(dat.names[i]) # same as above, but using get(), which complements assign() nicely
save.name <- paste("soi", i, "rda", sep=".") # I'm assuming the file should be named soi.1.rda, not soi.rda
save(list=dat.names[i], file=save.name) # save soi.1.rda, soi.2.rda ... etc.
}
# But we don't have to save each file individually!
# We can save a file that contains our list of models (dat.all), as well as each model object (dat.1, dat.2 ... etc.)
all.objs <- ls() # what are all of the object names in our working memory?
dat.objs <- all.objs[all.objs%in%c(dat.names, "dat.all")] # subset to the names of objects we want to save
save(list=dat.objs, file="everything.rda") # save all relevant objects in 1 .rda file
print(dat.1)
print(dat.all$dat.1)
Edit: A different approach that applies each of several models to several time series
Note that the approach might change slightly depending on which models you want to apply to which time series. I've assumed that several models should be applied to each time series, and that the models differ only in the ARIMA order.
The results can be saved as 1 nested list (different model results grouped under different time series), or with model results for each time series being saved as a separate file.
# ============================================================
# = Generate many time series, many sets of model parameters =
# ============================================================
# Model parameters
n.Params <- 5
ar.orders <- 1:n.Params # orders of AR to try ...
i.orders <- rep(0, n.Params)
ma.orders <- rep(0,n.Params)
arima.params <- as.list(as.data.frame(rbind(ar.orders, i.orders, ma.orders)))
# Time Series Data
n.ts <- 10 # number of time series
test.soi <- quote(as.numeric(arima.sim(model=list(ar=c(0.2, 0.4)), n=sample(20:30, 1))))
all.soi.ts <- replicate(n.ts, eval(test.soi))
names(all.soi.ts) <- paste("soi", 1:n.ts, sep=".")
# ==============================================
# = Function to be applied to each time series =
# ==============================================
# Analyze time series
ats <- function(TS, arimaParams){
dat.all <- list() # as an alternative, you can save each analysis in different elements of a list
for(i in 1:length(arimaParams)){ # loop through each model you want to run
temp.dat <- arima(TS, order=arimaParams[[i]]) # temp.dat is the current model result
dat.all[[i]] <- temp.dat # store the object in a list
}
dat.all
}
# =========================
# = All Results in 1 List =
# =========================
AllResults <- lapply(all.soi.ts, ats, arima.params) # multilevel list – top level is each TS, within each TS group are the results of all models applied to that time series
save(AllResults, file="everything.rda") # save this big list as 1 file
# ========================================================================
# = Each time series gets its own file and its own list of model results =
# ========================================================================
for(i in 1:length(all.soi.ts)){ # if you want many files, 1 file per time series, use this for loop
temp.ts <- all.soi.ts[[i]]
soi.name <- paste("soi", i, sep=".")
assign(soi.name, ats(temp.ts, arima.params))
save(list=soi.name, file=paste(soi.name, "rda", sep=".")) # each file will have a name like "soi.1.rda", containing the results of all models applied to the first time series
}
The function sets datname to the name of the input variable. Then it defines a list L of model outputs and adds names. Finally it uses with(L, ...) to regard the list component names as variable names inside ..., and save(list = ..., ...), which allows the variables to be specified as a character vector of names. Now we only have to set up the data and call the function to run it. If you have several data sets, call the function for each one.
run <- function(dat, datname = deparse(substitute(dat))) {
L <- list(
arima(dat, order = c(1, 1, 1)),
arima(dat, order = c(2, 1, 0))
)
names(L) <- paste(datname, seq_along(L), sep = ".")
with(L, save(list = names(L), file = paste0(datname, ".rda")))
}
set.seed(1)
soi <- ts(rnorm(250), start = c(1980,1), frequency = 12)
run(soi)
Another possibility might be to save the entire list rather than its components separately. That is, replace the with statement with
listname <- paste0(datname, ".models")
assign(listname, L)
save(list = listname, file = paste0(datname, ".rda"))
REVISED Some corrections and added alternative at end.
When you want to manipulate objects whose names are themselves stored inside a variable, just use assign() and its reverse get(). And use ls() to see which objects exist in a particular scope.
Do the objects really need to be stored separately as tsname.1/2/3 and model1/2/3?
You can make it really simple if you just store dat[1:3].
Indeed you can have a vector of models model[1:3] too. Use vectorization; it's your friend.
Use the assign("tsname.21", object,...) command and its reverse get("tsname.21") to manipulate objects by string name. Just be consistent about whether you prefer to refer to objnames or objects.
set.seed(1)
tsdata <- ts(rnorm(250), start = c(1980,1), frequency = 12)
dat <- tsdata
create_model <- function(data, params, objname.prefix='tsname.', envir=.GlobalEnv) {
  objname = paste(objname.prefix, params[1], params[2], sep='') # build the object name
  the.model <- arima(data, order = params)
  assign(objname, the.model, envir) # create the var in the global env
  # If you want, you can return the varname
  return(objname)
}
# dat.11 <- arima(dat, order = c(1, 1, 1))
create_model(dat, c(1, 1, 1))
# dat.21 <- arima(dat, order = c(2, 1, 0))
create_model(dat, c(2, 1, 0))
#tsname.11 <- dat.11 # problem is to specify this step in each script
#tsname.21 <- dat.21
save(tsname.11, tsname.21, file = "tsname.rda")
# Use `ls(pattern=...)` to find object-names, with wildcard matching.
all.models <- ls(pattern='tsname.*')
#[1] "tsname.11" "tsname.21"
#############
# Refactor your original code similarly.
dat <- tsname
# Run a set of commands and fit models with different parameters,
# storing the fits in a list instead of separate numbered objects
fit <- list()
fit[[1]] <- model1(dat)
fit[[2]] <- model2(dat)
fit[[3]] <- model3(dat)
# or maybe figure out how to use sapply here
# Save objects for further analysis
tsname <- fit[1:3] # instead of tsname.1 <- dat.1, tsname.2 <- dat.2, ...
#
save(tsname, file = paste0("tsname", ".rda"))

R: make pls calibration models from n subsets and use them to predict different test sets

I am trying to apply a function I wrote that uses the 'pls' package to make a model and then use it
to predict several test sets (in this case 9), returning the R2, RMSEP and prediction bias of each test set,
for n subsets selected from the data frame.
The function is:
cpo<-function(data,newdata1,newdata2,newdata3,newdata4,newdata5,newdata6,newdata7,newdata8,newdata9){
data.pls<-plsr(protein~.,8,data=data,validation="LOO")#making a pls model
newdata1.pred<-predict(data.pls,8,newdata=newdata1) #using the model to predict test sets
newdata2.pred<-predict(data.pls,8,newdata=newdata2)
newdata3.pred<-predict(data.pls,8,newdata=newdata3)
newdata4.pred<-predict(data.pls,8,newdata=newdata4)
newdata5.pred<-predict(data.pls,8,newdata=newdata5)
newdata6.pred<-predict(data.pls,8,newdata=newdata6)
newdata7.pred<-predict(data.pls,8,newdata=newdata7)
newdata8.pred<-predict(data.pls,8,newdata=newdata8)
newdata9.pred<-predict(data.pls,8,newdata=newdata9)
pred.bias1<-mean(newdata1.pred-newdata1[742]) #calculating the prediction bias
pred.bias2<-mean(newdata2.pred-newdata2[742])
pred.bias3<-mean(newdata3.pred-newdata3[742]) #[742] reference values in column742
pred.bias4<-mean(newdata4.pred-newdata4[742])
pred.bias5<-mean(newdata5.pred-newdata5[742])
pred.bias6<-mean(newdata6.pred-newdata6[742])
pred.bias7<-mean(newdata7.pred-newdata7[742])
pred.bias8<-mean(newdata8.pred-newdata8[742])
pred.bias9<-mean(newdata9.pred-newdata9[742])
r<-c(R2(data.pls,"train"),RMSEP(data.pls,"train"),pred.bias1,
pred.bias2,pred.bias3,pred.bias4,pred.bias5,pred.bias6,
pred.bias7,pred.bias8,pred.bias9)
return(r)
}
Selecting n subsets (based on an answer to my question: Select several subsets by taking different row interval and appy function to all subsets)
and applying the cpo function to each subset, I tried the following.
Edited based on @Gavin's advice
FOO3 <- function(data, nSubsets, nSkip){
outList <- vector("list", 11)
names(outList) <- c("R2train","RMSEPtrain", paste("bias", 1:9, sep = ""))
sub <- vector("list", length = nSubsets) # sub is the n number subsets created by selecting rows
names(sub) <- c( paste("sub", 1:nSubsets, sep = ""))
totRow <- nrow(data)
for (i in seq_len(nSubsets)) {
rowsToGrab <- seq(i, totRow, nSkip)
sub[[i]] <- data[rowsToGrab ,]
}
for(i in sub) { #for every subset in sub i want to apply cpo
outList[[i]] <- cpo(data=sub,newdata1=gag11p,newdata2=gag12p,newdata3=gag13p,
newdata4=gag21p,newdata5=gag22p,newdata6=gag23p,
newdata7=gag31p,newdata8=gag32p,newdata9=gag33p) #new data are test sets loaded in the workspace
}
return(outlist)
}
FOO3(GAGp,10,10)
When I try this I keep getting 'Error in eval(expr, envir, enclos) : object 'protein' not found'.
Protein is used in the plsr formula of cpo, and it is in the data set.
I then tried to use the plsr function directly, as seen below:
FOO4 <- function(data, nSubsets, nSkip){
outList <- vector("list", 11)
names(outList) <- c("R2train","RMSEPtrain", paste("bias", 1:9, sep = ""))
sub <- vector("list", length = nSubsets)
names(sub) <- c( paste("sub", 1:nSubsets, sep = ""))
totRow <- nrow(data)
for (i in seq_len(nSubsets)) {
rowsToGrab <- seq(i, totRow, nSkip)
sub[[i]] <- data[rowsToGrab ,]
}
cal<-vector("list", length=nSubsets) #for each subset in sub make a pls model for protein
names(cal)<-c(paste("cal",1:nSubsets, sep=""))
for(i in sub) {
cal[[i]] <- plsr(protein~.,8,data=sub,validation="LOO")
}
return(outlist) # return is just used to end script and check if error still occurs
}
FOO4(gagpm,10,10)
When I tried this I got the same error: 'Error in eval(expr, envir, enclos) : object 'protein' not found'.
Any advice on how to deal with this and make the function work will be much appreciated.
I suspect the problem is immediately at the start of FOO3():
FOO3 <- function(data, nSubsets, nSkip) {
outList <- vector("list", r <- c(R2(data.pls,"train"), RMSEP(data.pls,"train"),
pred.bias1, pred.bias2, pred.bias3, pred.bias4, pred.bias5,
pred.bias6, pred.bias7, pred.bias8, pred.bias9))
Not sure what you are trying to do when creating outList, but vector() has two arguments and you seem to be assigning to r a vector of numerics that you want R to use as the length argument to vector().
Here you are using the object data.pls and this doesn't exist yet - and never will in the frame of FOO3() - it is only ever created in cpo().
Your second loop looks totally wrong - you are not assigning the output from cpo() to anything. I suspect you wanted:
outList <- vector("list", 11)
names(outList) <- c("R2train","RMSEPtrain", paste("bias", 1:9, sep = ""))
....
for(i in subset) {
outList[[i]] <- cpo(....)
}
return(outList)
But that depends on what subset is etc. You also haven't got the syntax for this loop right. You have
for(i in(subset)) {
when it should be
for(i in subset) {
And subset and data aren't great names as these are common R functions and modelling arguments.
There are lots of problems with your code. Try to start simple and build up from there.
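For what it's worth, here is a corrected skeleton for that loop as an untested sketch, assuming sub is the list of subsets built at the top of FOO3(): index the subsets rather than looping over their contents, and pass a single data frame to cpo():
for (i in seq_along(sub)) {
  outList[[i]] <- cpo(data = sub[[i]],
                      newdata1 = gag11p, newdata2 = gag12p, newdata3 = gag13p,
                      newdata4 = gag21p, newdata5 = gag22p, newdata6 = gag23p,
                      newdata7 = gag31p, newdata8 = gag32p, newdata9 = gag33p)
}
return(outList)
Passing the whole list sub as data, as in the posted FOO3(), is most likely why plsr() cannot find protein.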
I have managed to achieve what I wanted using the code below; if there is a better way of doing it (I'm sure there must be), I'm eager to learn. This function performs the following tasks:
1. select n subsets from a data frame
2. for each subset created, make a plsr model
3. use each plsr model to predict 9 test sets
4. for each prediction, calculate the prediction bias
far5<- function(data, nSubsets, nSkip){
sub <- vector("list", length = nSubsets)
names(sub) <- c( paste("sub", 1:nSubsets, sep = ""))
totRow <- nrow(data)
for (i in seq_len(nSubsets)) {
rowsToGrab <- seq(i, totRow, nSkip)
sub[[i]] <- data[rowsToGrab ,]} #sub is the subsets created
mop <- lapply(sub, cpr2) # assigning output from cpr2 to mop
names(mop) <- paste("mop", seq_along(mop), sep = "")
return(mop)
}
call: far5(data, nSubsets, nSkip)
The first part, selecting the subsets, is based on the answer to my question Select several subsets by taking different row interval and appy function to all subsets.
I was then able to apply the function cpr2 to the subsets created using lapply instead of the for loop used previously.
cpr2 is a modification of cpo for which only data is supplied; the new data to be predicted is used directly inside the function, as shown below.
cpr2<-function(data){
data.pls<-plsr(protein~.,8,data=data,validation="LOO") #make plsr model
gag11p.pred<-predict(data.pls,8,newdata=gag11p) #predict each test set
gag12p.pred<-predict(data.pls,8,newdata=gag12p)
gag13p.pred<-predict(data.pls,8,newdata=gag13p)
gag21p.pred<-predict(data.pls,8,newdata=gag21p)
gag22p.pred<-predict(data.pls,8,newdata=gag22p)
gag23p.pred<-predict(data.pls,8,newdata=gag23p)
gag31p.pred<-predict(data.pls,8,newdata=gag31p)
gag32p.pred<-predict(data.pls,8,newdata=gag32p)
gag33p.pred<-predict(data.pls,8,newdata=gag33p)
pred.bias1<-mean(gag11p.pred-gag11p[742]) #calculate prediction bias
pred.bias2<-mean(gag12p.pred-gag12p[742])
pred.bias3<-mean(gag13p.pred-gag13p[742])
pred.bias4<-mean(gag21p.pred-gag21p[742])
pred.bias5<-mean(gag22p.pred-gag22p[742])
pred.bias6<-mean(gag23p.pred-gag23p[742])
pred.bias7<-mean(gag31p.pred-gag31p[742])
pred.bias8<-mean(gag32p.pred-gag32p[742])
pred.bias9<-mean(gag33p.pred-gag33p[742])
r<-signif(c(pred.bias1,pred.bias2,pred.bias3,pred.bias4,pred.bias5,
pred.bias6,pred.bias7,pred.bias8,pred.bias9),2)
out<-c(R2(data.pls,"train",ncomp=8),RMSEP(data.pls,"train",ncomp=8),r)
return(out)
} # signif used to round the prediction biases to 2 significant figures
call: cpr2(data)
I was able to use this to solve my problem; however, since the amount of new data to be predicted was only nine sets, it was possible to list them out as I did. If there is a more generalized way to do this, I'm interested in learning it.
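For the record, here is the sort of generalization I had in mind but have not tested: a hypothetical cpr3() that takes the test sets as a named list, so the number of test sets no longer matters.
# untested sketch; assumes library(pls) is loaded, 'protein' is the response,
# and column 742 holds the reference values, as above
cpr3 <- function(data, testsets, refcol = 742, ncomp = 8) {
  data.pls <- plsr(protein ~ ., ncomp, data = data, validation = "LOO")
  pred.bias <- sapply(testsets, function(nd)
    mean(predict(data.pls, ncomp = ncomp, newdata = nd) - nd[[refcol]]))
  c(R2(data.pls, "train", ncomp = ncomp),
    RMSEP(data.pls, "train", ncomp = ncomp),
    signif(pred.bias, 2))
}
# e.g. lapply(sub, cpr3, testsets = list(gag11p = gag11p, gag12p = gag12p, gag13p = gag13p))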
