I am trying to use foreach to make use of parallel processing for a complete subsets regression problem. I am trying to fit a complete list of models using the gamm4 package, using the binomial function where the response is provided as a proportion, and the weights argument supplies the number of trials. The code works fine when run using %do% but fails under %dopar% (retutns only NA's for AIC and BIC). Strangely, the code does work using %dopar% fine if the weights argument to the gamm4 call is left out, but obviously this is not a viable solution. I have been using similar code with no issues based on a gaussian distribution and a binomial distribution where the response is entered as 1,0s (thus no need for a call to weights) with no problems at all. I am using windows 7 64bit, with R version 3.1.2. I have updated all the relevant packages. A reproducible (but toy) example:
set.seed(666)
# generate a random factor with a random offset effect
random.factor=factor(sort(rep(1:10,10)))
random.effect=sort(rep(rnorm(10),10))
# generate some random predictor variables
X1 = rnorm(100)
X2 = rnorm(100)
X3 = rnorm(100)
X4 = rep(0,100) # make it so one variable fails (just to check the "try" if statement)
#X4 = rnorm(100)
X5 = rnorm(100)
# calculate a response variable based on some of the predictors
z = 1 + 2*X1 + 3*X2 + 2*X3^2 # linear combination with a bias
pr = 1/(1+exp(-(z+random.effect))) # pass through an inv-logit function
y = rbinom(n=100,size=100,pr)/100 # bernoulli response variable.
# Note that the response variable is a proprotion of successes of 100 trials
# We want to feed the number of trials as a "weights" argument to gamm
# now make a data frame of predictors
pred.dat=data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5)
pred.vars=colnames(pred.dat)
# make a dataframe for passing to gamm
use.dat = data.frame(random.factor=random.factor,y=y,pred.dat)
# now set up the models to run
# this includes all combinations of variables, but only up to a total of two in
# any one model
model.fits.test=c(combn(1:ncol(pred.dat), 1,simplify = F),
combn(1:ncol(pred.dat), 2,simplify = F))
models.use=list(1,2,3,4,5)
n.models=length(model.fits.test)
require(lme4)
require(doParallel)
registerDoParallel(cores=4)
# if I run this using do, it works fine (with error values from the try argument
# returned for models that fail)
out.dat<-foreach(l = 1:n.models,.combine=rbind,
.packages=c("lme4","gamm4"))%do%{
vars.vec=model.fits.test[[l]]
formula.l<-as.formula(paste("y~",
paste(colnames(pred.dat)[vars.vec],collapse="+"),"+(1|random.factor)",sep=""))
model.fit=try(glmer(formula.l,
data=use.dat,
family="binomial",
weights=rep(100,nrow(use.dat))))
success<-class(model.fit)[[1]]!="try-error"
out.vec<-c(rep(NA,2),rep(NA,ncol(pred.dat)))
names(out.vec)<- c("AIC","BIC",colnames(pred.dat))
out.vec[
which(match(names(out.vec),pred.vars[vars.vec])>0)]<-1
if(success){
out.vec["AIC"]<-AIC(model.fit)
out.vec["BIC"]<-BIC(model.fit)
}
return(out.vec)
}
out.dat
# if I run using dopar, nothing is returned.
out.dat<-foreach(l = 1:n.models,.combine=rbind,
.packages=c("lme4","gamm4"))%dopar%{
vars.vec=model.fits.test[[l]]
formula.l<-as.formula(paste("y~",
paste(colnames(pred.dat)[vars.vec],collapse="+"),"+(1|random.factor)",sep=""))
model.fit=try(glmer(formula.l,
data=use.dat,
family="binomial",
weights=rep(100,nrow(use.dat))))
success<-class(model.fit)[[1]]!="try-error"
out.vec<-c(rep(NA,2),rep(NA,ncol(pred.dat)))
names(out.vec)<- c("AIC","BIC",colnames(pred.dat))
out.vec[
which(match(names(out.vec),pred.vars[vars.vec])>0)]<-1
if(success){
out.vec["AIC"]<-AIC(model.fit)
out.vec["BIC"]<-BIC(model.fit)
}
return(out.vec)
}
out.dat
# Now run dopar without the weights argument (not really appropriate,
# but for the sake of demonstration). I get results again, but it doesn't
# really make sense to do this. Also, my real example fails unless I can supply
# weights.
out.dat<-foreach(l = 1:n.models,.combine=rbind,
.packages=c("lme4","gamm4"))%dopar%{
vars.vec=model.fits.test[[l]]
formula.l<-as.formula(paste("y~1+",
paste("s(",colnames(pred.dat)[vars.vec],")",collapse="+"),sep=""))
model.fit=try(gamm4(formula.l, random=~(1|random.factor),
data=use.dat,family="binomial"))
success<-class(model.fit)[[1]]!="try-error"
out.vec<-c(rep(NA,2),rep(NA,ncol(pred.dat)))
names(out.vec)<- c("AIC","BIC",colnames(pred.dat))
out.vec[
which(match(names(out.vec),pred.vars[vars.vec])>0)]<-1
if(success){
out.vec["AIC"]<-AIC(model.fit$mer)
out.vec["BIC"]<-BIC(model.fit$mer)
}
return(out.vec)
}
out.dat
Related
I am new to R and Elastic-Net Regression Model. I am running Elastic-Net Regression Model on the default dataset, titanic. I am trying to obtain the Alpha and Lambda values after running the train function. However when I run the train function, the output keeps on lagging and I had to wait for the output but there is no output at all. it is empty.... I am trying Tuning Parameters.
data(Titanic)
example<- as.data.frame(Titanic)
example['Country'] <- NA
countryunique <- array(c("Africa","USA","Japan","Australia","Sweden","UK","France"))
new_country <- c()
#Perform looping through the column, TLD
for(loopitem in example$Country)
{
#Perform random selection of an array, countryunique
loopitem <- sample(countryunique, 1)
#Load the new value to the vector
new_country<- c(new_country,loopitem)
}
#Override the Country column with new data
example$Country<- new_country
example$Class<- as.factor(example$Class)
example$Sex<- as.factor(example$Sex)
example$Age<- as.factor(example$Age)
example$Survived<- as.factor(example$Survived)
example$Country<- as.factor(example$Country)
example$Freq<- as.numeric(example$Freq)
set.seed(12345678)
trainRowNum <- createDataPartition(example$Survived, #The outcome variable
#proportion of example to form the training set
p=0.3,
#Don't store the result in a list
list=FALSE);
# Step 2: Create the training mydataset
trainData <- example[trainRowNum,]
# Step 3: Create the test mydataset
testData <- example[-trainRowNum,]
alphas <- seq(0.1,0.9,by=0.1);
lambdas <- 10^seq(-3,3,length=100)
#Logistic Elastic-Net Regression
en <- train(Survived~. ,
data = trainData,
method = "glmnet",
preProcess = NULL,
trControl = trainControl("repeatedcv",
number = 10,
repeats = 5),
tuneGrid = expand.grid(alpha = alphas,
lambda = lambdas)
)
Could you please kindly advise on what values are recommended to assign to Alpha and lambda?
Thank you
I'm not quite sure what the problem is. Your code runs fine for me. If I look at the en object it says:
Accuracy was used to select the optimal model using the
largest value.
The final values used for the model were alpha = 0.1 and lambda
= 0.1.
It didn't take long to run for me. Do you have a lot stored in your R session memory that could be slowing down your system and causing it to lag? Maybe try re-starting RStudio and running the above code from scratch.
To see the full results table with Accuracy for all combinations of Alpha and Lambda, look at en$results
As a side-note, you can easily carry out cross-validation directly in the glmnet package, using the cv.glmnet function. A helper package called glmnetUtils is also available, that lets you select the optimal Alpha and Lambda values simultaneously using the cva.glmnet function. This allows for parallelisation, so may be quicker than doing the cross-validation via caret.
Bert-toolkit is a very nice package to call R functions from Excel. See: https://bert-toolkit.com/
I have used bert-toolkit to call a fitted neuralnet (avNNnet fitted with Caret) within a wrapper function in R from Excel VBA. This runs perfect. This is the code to load the model within the wrapper function in bert-toolkit:
load("D:/my_model_avNNet.rda")
neuraln <- function(x1,x2,x3){
xx <- data.frame(x1,x2,x3)
z <- predict(my_model_avNNET, xx)
z
}
Currently I tried to do this with a fitted GAM (fitted with package mgcv). Although I do not succeed. If I call the fitted GAM from Excel VBA it gives error 2015. If I call the fitted GAM from a cell it gives #VALUE! At the same time the correct outcome of the calculation is shown in the bert-console!
This is the code to load the model in the wrapperfunction in bert-toolkit:
library(mgcv)
load("D:/gam_y_model.rda")
testfunction <- function(k1,k2){
z <- predict(gam_y, data.frame(x = k1, x2 = k2))
print (z)
}
The difference between the avNNnet-model (Caret) and the GAM-model (mgcv) is that the avNNnet-model does NOT need the Caret library to be loaded to generate a prediction, while the GAM-model DOES need the mgcv library to be loaded.
It seems to be not sufficient to load the mgvc-library in the script with the GAM-model which loads the GAM-model in a wrapper function in bert-toolkit, as I did in the code above. Although the correct outcome of the model is shown in the bert-console. It does not generate the correct outcome in Excel.
I wonder how this is possible and can be solved. It seems to me that maybe there are two instances of R running in bert-toolkit.
How can I load the the mgcv-library in such a way that it can be used by the GAM-model within the function called from Excel?
This is some example code to fit the GAM with mgcv and save to model (after running this code the model can uploaded in bert-toolkit with the code above) :
library(mgcv)
# construct some sample data:
x <- seq(0, pi * 2, 0.1)
x2 <- seq(0, pi * 20, 1)
sin_x <- sin(x)
tan_x2 <- tan(x2)
y <- sin_x + rnorm(n = length(x), mean = 0, sd = sd(sin_x / 2))
Sample_data <- data.frame(y,x,x2)
# fit gam:
gam_y <- gam(y ~ s(x) + s(x2), method = "REML")
# Make predictions with the fitted model:
x_new <- seq(0, max(x), length.out = 100)
x2_new <- seq(0, max(x2), length.out = 100)
y_pred <- predict(gam_y, data.frame(x = x_new, x2 = x2_new))
# save model, to load it later in bert-toolkit:
setwd("D:/")
save(gam_y, file = "gam_y_model.rda")
One of R's signatures is method dispatching where users call the same named method such as predict but internally a different variant is run such as predict.lm, predict.glm, or predict.gam depending on the model object passed into it. Therefore, calling predict on an avNNet model is not the same predict on a gam model. Similarly, just as the function changes due to the input, so does the output change.
According to MSDN documents regarding the Excel #Value! error exposed as Error 2015:
#VALUE is Excel's way of saying, "There's something wrong with the way your formula is typed. Or, there's something wrong with the cells you are referencing."
Fundamentally, without seeing actual results, Excel may not be able to interpret or translate into Excel range or VBA type the result R returns from gam model especially as you describe R raises no error.
For example, per docs, the return value of the standard predict.lm is:
predict.lm produces a vector of predictions or a matrix of predictions...
However, per docs, the return value of predict.gam is a bit more nuanced:
If type=="lpmatrix" then a matrix is returned which will give a vector of linear predictor values (minus any offest) at the supplied covariate values, when applied to the model coefficient vector. Otherwise, if se.fit is TRUE then a 2 item list is returned with items (both arrays) fit and se.fit containing predictions and associated standard error estimates, otherwise an array of predictions is returned. The dimensions of the returned arrays depends on whether type is "terms" or not: if it is then the array is 2 dimensional with each term in the linear predictor separate, otherwise the array is 1 dimensional and contains the linear predictor/predicted values (or corresponding s.e.s). The linear predictor returned termwise will not include the offset or the intercept.
Altogether, consider adjusting parameters of your predict call to render a numeric vector for easy Excel interpretation and not a matrix/array or some other higher dimension R type that Excel cannot render:
testfunction <- function(k1,k2){
z <- mgcv::predict.gam(gam_y, data.frame(x = k1, x2 = k2), type=="response")
return(z)
}
testfunction <- function(k1,k2){
z <- mgcv::predict.gam(gam_y, data.frame(x = k1, x2 = k2), type=="lpmatrix")
return(z)
}
testfunction <- function(k1,k2){
z <- mgcv::predict.gam(gam_y, data.frame(x = k1, x2 = k2), type=="linked")
return(z$fit) # NOTICE fit ELEMENT USED
}
...
Further diagnostics:
Check returned object of predict.glm with str(obj) and class(obj)/ typeof(obj) to see dimensions and underlying elements and compare with predict in caret;
Check if high precision of decimal numbers is the case such as Excel's limits of 15 decimal points;
Check amount of data returned (exceeds Excel's sheet row limit of 220 or cell limit of 32,767 characters?).
I am using an accelerated failure time / AFT model with a weibull distribution to predict data. I am doing this using the survival package in R. I am splitting my data in training and test, do training on the training set and afterwards try to predict the values for the test set. To do that I am passing the the test set as the newdata parameter, as stated in the references. I get an error, saying that newdata does not have the same size as the training data (obviously!). Then the function seems to evaluate predict the values for the training set.
How can I predict the values for the new data?
# get data
library(KMsurv)
library(survival)
data("kidtran")
n = nrow(kidtran)
kidtran <- kidtran[sample(n),] # shuffle row-wise
kidtran.train = kidtran[1:(n * 0.8),]
kidtran.test = kidtran[(n * 0.8):n,]
# create model
aftmodel <- survreg(kidtransurv~kidtran.train$gender+kidtran.train$race+kidtran.train$age, dist = "weibull")
predicted <- predict(aftmodel, newdata = kidtran.test)
Edit: As mentioned by Hack-R, there was this line of code missing
kidtransurv <- Surv(kidtran.train$time, kidtran.train$delta)
The problem seems to be in your specification of the dependent variable.
The data and code definition of the dependent was missing from your question, so I can't see what the specific mistake was, but it did not appear to be a proper Surv() survival object (see ?survreg).
This variation on your code fixes that, makes some minor formatting improvements, and runs fine:
require(survival)
pacman::p_load(KMsurv)
library(KMsurv)
library(survival)
data("kidtran")
n = nrow(kidtran)
kidtran <- kidtran[sample(n),]
kidtran.train <- kidtran[1:(n * 0.8),]
kidtran.test <- kidtran[(n * 0.8):n,]
# Whatever kidtransurv was supposed to be is missing from your question,
# so I will replace it with something not-missing
# and I will make it into a proper survival object with Surv()
aftmodel <- survreg(Surv(time, delta) ~ gender + race + age, dist = "weibull", data = kidtran.train)
predicted <- predict(aftmodel, newdata = kidtran.test)
head(predicted)
302 636 727 121 85 612
33190.413 79238.898 111401.546 16792.180 4601.363 17698.895
The target is to predict the values to the test set with the package monreg model, but this si not working with the predict function, because there isn't a model object to use the prediction function.
Giving an example:
require(monreg) # Package ‘monreg’
x <- rnorm(100)
y <- x + rnorm(100)
x_train=x[0:80]
y_train=y[0:80]
x_test=x[81:100]
y_test=y[81:100]
mon1 <- monreg(x, y, hd = .5, hr = .5)
# I was expecting to get the prediction over the test partion as R usualy works
predict(mon1,h=length(y_test))
But this is not working. In the case this package doesnt have any predict function, I would accept any advice to implement the Narayada Watson regression in R in order to predict values like this example I gave.
I am running some model averaging procedures using the output from the regsubsets command from the leaps package. Once I exclude an intercept, I get an error message that I cannot make sense of:
Reordering variables and trying again: Error in if
(any(index[force.out] == -1)) stop("Can't force the same variable in
and out") : missing value where TRUE/FALSE needed
This problem seems to occur only once my predictor matrix has more columns than the dependent variable has observations (which is one of the reasons for using leaps in the first place). See the example code below:
# Load the package --------------------------------------------------------
require(stats)
require(leaps)
# Some artificial data ----------------------------------------------------
y <- rnorm(20)
x1 <- rnorm(20*20)
dim(x1) <- c(20,20)
x2 <- rnorm(20*21)
dim(x2) <- c(20,21)
# Allow intercept ---------------------------------------------------------
summary(regsubsets(x1,y))$which
summary(regsubsets(x2,y))$which
# Without intercept -------------------------------------------------------
summary(regsubsets(x1,y,intercept=FALSE))$which
summary(regsubsets(x2,y,intercept=FALSE))$which
This usually happens when you have a Linear Dependency in the input variables - you should see a warning , when you run it with Intercept = T.
When you remove the linear dependent column from input predictor , you will be able to run regsubsetsObj with intercept = F . You will have manually remove the linearly dependent column . Its usually a derived column, calculated from existing metrics.