Models in a list can't be predicted upon - r

Consider a list of models that can be created by:
fits = vector(mode="list",length=10)
for(i in 1:10)
{
fits[[i]] = lm(nox~poly(dis,i),data=Boston)
}
Where, Boston dataset is used, that can be found in the MASS library.
Now, in order to make predictions:
dislim = range(Boston$dis)
dis.grid = seq(from = dislim[1],to = dislim[2],by = 0.1)
This is done to give values of dis upon which nox's values are predicted.
Now, in order to make predictions, we can do the following:
predict(fits[[1]],list(dis = dis.grid))
But this results in an error:
Error: variable 'poly(dis, i)' was fitted with type "nmatrix.1" but type "nmatrix.10" was supplied
In addition: Warning message:
In Z/rep(sqrt(norm2[-1L]), each = length(x)) :
longer object length is not a multiple of shorter object length
But, when I do the following:
lm.Boston = lm(nox~poly(dis,3),data=Boston)
lm.Boston.pred = predict(lm.Boston,list(dis = dis.grid))
It works fine. So, why can't I do that in the case of a list?

The correct way to specify a dynamic formula is to use paste and as.formula
library(MASS)
data(Boston)
dislim <- range(Boston$dis)
dis.grid <- seq(from = dislim[1],to = dislim[2],by = 0.1)
models <- lapply(1:10, function(i){
form = as.formula(paste0("nox~", "poly(dis," , i, ")"))
lm(form, data=Boston)
})
to predict
lapply(models, function(x){
predict(x, list(dis = dis.grid))
})
EDIT: Another way to build the formula (as per MrFlick comment) is:
`lm(bquote(nox~poly(dis,.(i))), data=Boston)`
models1 <- lapply(1:10, function(i){
lm(bquote(nox~poly(dis,.(i))), data=Boston)
})
Additionally (as per Nathan Werth comment) if the formulation:
models2 <- lapply(1:10, function(i){
lm(nox~poly(dis,i),data=Boston)
})
is used, the i is being treated as a variable in the model and it is possible to exploit such behavior in the following way:
predict(models2[[1]], list(dis = dis.grid, i = 1)
library(purrr)
models <- lapply(1:10, function(i){
form = as.formula(paste0("nox~", "poly(dis," , i, ")"))
lm(form, data=Boston)
})
models1 <- lapply(1:10, function(i){
lm(bquote(nox~poly(dis,.(i))), data=Boston)
})
models2 <- lapply(1:10, function(i){
lm(nox~poly(dis,i),data=Boston)
})
missuse <- lapply(models, function(x){
predict(x,list(dis = dis.grid))
})
MrFlick <- lapply(models1, function(x){
predict(x,list(dis = dis.grid))
})
NathanWerth <- purrr::map2(models2, 1:10, function(x, y){
predict(x,list(dis = dis.grid, i = y ))
})
purrr::pmap(list(missuse, MrFlick, NathanWerth), function(x, y, z) c(identical(x, y), identical(x, z)))

Related

Repeat analysis for several datasets in R

How can I repeat this code for each subject (xxx), so that the results are added to the data.frame (centralities)?
fullDataDetrend_xxx <- subset(fullDataDetrend, subjno == xxx, select=c(subjno,depressed,sad,tired,interest,happy,neg_thoughts,concentration_probl,ruminating,activity,datevar,timestamp,dayno,beepno))
model_xxx <- var1(
fullDataDetrend_xxx)
model_xxx_omega <- getmatrix(model_xxx, "omega_zeta")
centrality_model_xxx_omega <- centrality(model_xxx_omega )
centralities[nrow(centralities) + 1,] <- c("xxx",centrality_model_xxx_omega$InExpectedInfluence)
Did as suggested:
fullDataDetrend_split <- split(fulldataDetrend, fulldataDetrend$subjno)
then, to estimate network, pull centrality estimates, and write to centralities in global environment:
analyze_one <- function(dataframe){
network_model <- var1(
dataframe,
vars = useVars,
contemporaneous = "ggm",
dayvar = "dayno",
beepvar = "beepno",
estimator = "FIML",
verbose = TRUE,
omega_zeta = "full")
model_omega <- getmatrix(network_model, "omega_zeta")
centrality_omega<- centrality(model_omega)
model_beta <- getmatrix(network_model, "beta")
centrality_beta<- centrality(model_beta)
subjno <- as.list(dataframe[1,2])
centralities[nrow(centralities) + 1,] <- c(subjno,centrality_omega$InExpectedInfluence,centrality_beta$InExpectedInfluence,centrality_beta$OutExpectedInfluence)
assign('centralities',centralities, envir=.GlobalEnv)
}
then rerun the code with lapply for all dataframes (with ignoring errors):
lapply_with_error <- function(X,FUN,...){
lapply(X, function(x, ...) tryCatch(FUN(x, ...),
error=function(e) NULL))
}
lapply_with_error(fullDataDetrend_split, FUN = analyze_one)

I am trying to use various transformations of the response variable and fit corresponding these model, and obtain residual plots for each model

library(GLMsData)
data(fluoro)
lambda <- seq(-2,2,0.5)
lm.out <- list()
for(i in length(lambda)){
if(i != 0){
y <- (fluoro$Dose^lambda-1)/lambda
} else {
y <- log(fluoro$Dose)
}
lm.out[[i]] <- lm(y[i]~Time, data = fluoro, na.exclude = T)
}
print(lm.out)
Error in model.frame.default(formula = y[i] ~ Time, data = fluoro, drop.unused.levels = TRUE) : variable lengths differ (found for 'Time')
I am trying to use various transformations of the response variable and fit these corresponding models, and obtain residual plots for each model.
I need a help. Thanks
Here is a corrected version of the for loop in the question.
data(fluoro, package = "GLMsData")
lambda <- seq(-2, 2, 0.5)
lm.out <- list()
for(i in 1:length(lambda)){
if(lambda[i] != 0){
y <- (fluoro$Dose^lambda[i]-1)/lambda[i]
} else {
y <- log(fluoro$Dose)
}
lm.out[[i]] <- lm(y ~ Time, data = fluoro, na.action = na.exclude)
}
print(lm.out)
And a version with a boxcox function defined and used in a lapply loop.
boxcox <- function(x, lambda, na.rm = FALSE){
if(na.rm) x <- x[!is.na(x)]
if(lambda == 0){
log(x)
} else {
(x^lambda - 1)/lambda
}
}
lm_out2 <- lapply(lambda, \(l){
lm(boxcox(Dose, lambda = l) ~ Time, data = fluoro, na.action = na.exclude)
})
Check that both ways above produce the same results.
coef_list <- sapply(lm.out, coef)
coef_list2 <- sapply(lm_out2, coef)
identical(coef_list, coef_list2)
#[1] TRUE
smry_list <- lapply(lm.out, summary)
smry_list2 <- lapply(lm_out2, summary)
pval_list <- sapply(smry_list, \(fit) fit$coefficients[, "Pr(>|t|)"])
pval_list2 <- sapply(smry_list2, \(fit) fit$coefficients[, "Pr(>|t|)"])
identical(pval_list, pval_list2)
#[1] TRUE
R2_list <- sapply(smry_list, "[[", "r.squared")
R2_list2 <- sapply(smry_list2, "[[", "r.squared")
identical(R2_list, R2_list2)
#[1] TRUE

Recursive feature elimination (caret) with linear regression, how to disable intercept?

I am using recursive feature elimination from the R package 'caret'
Linear regression works fine for my problem, therfore I am using functions = lmFuncs insinde my control function.
But I would like to test this setup again without an intercept, is this possible?
My current code:
control <- rfeControl(functions = lmFuncs
, verbose = FALSE
)
results <- rfe(df_train
, df_train
, rfeControl=control
)
I would also go for a custom function, but I do not know how.
Many thanks in advance.
Edit:
I found the answer after having a deeper look into the caret package.
lmFuncs without Intercept:
lmFuncs_wo_intercept <- list(
summary = defaultSummary,
fit = function(x, y, first, last, ...) {
tmp <- if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE)
tmp$y <- y
#lm(y~., data = tmp) #old
lm(y~0+., data = tmp) #new
},
pred = function(object, x) {
if(!is.data.frame(x)) x <- as.data.frame(x, stringsAsFactors = TRUE)
predict(object, x)
},
rank = function(object, x, y) {
coefs <- abs(coef(object))
#coefs <- coefs[names(coefs) != "(Intercept)"] # old
coefs[is.na(coefs)] <- 0
vimp <- data.frame(Overall = unname(coefs),
var = names(coefs))
rownames(vimp) <- names(coefs)
vimp <- vimp[order(vimp$Overall, decreasing = TRUE),, drop = FALSE]
vimp
},
selectSize = pickSizeBest,
selectVar = pickVars
)
lmFuncs is your linear regression? I this case you could try to fit a second linear regression without the intercept and then apply the feature elimination function

tryCatch r in raster::calc

I'm wanting to write a function that will (hopefully) work in the raster calculator in the raster package. What I'm trying to do is regress each cell value against a vector of Time. There are multiple examples of this, but what I would like to do is for the method to try 1 type of regression (gls, controlling for AR1 residual errors), but if for some reason that regression throws an error (perhaps there is no AR1 structure in the residuals) then to revert back to simple OLS regression.
I've written two functions for the regression. One for gls:
# function for calculating the trend, variability, SNR, and residuals for each pixel
## this function will control for AR1 structure in the residuals
funTrAR1 <- function(x, ...) {if (sum(is.na(x)) >= 1) { NA } else {
mod <- nlme::gls(x ~ Year, na = na.omit, method = "REML", verbose = TRUE,
correlation = corAR1(form = ~ Year, fixed = FALSE),
control = glsControl(tolerance = 1e-3, msTol = 1e-3, opt = c("nlminb", "optim"),
singular.ok = TRUE, maxIter = 1000, msMaxIter = 1000))
slope <- mod$coefficients[2]
names(slope) <- "Trend"
var <- sd(mod$residuals)
names(var) <- "Variability"
snr <- slope/var
names(snr) <- "SNR"
residuals <- c(stats::quantile(
mod$residuals, probs = seq(0,1,0.25),
na.rm = TRUE, names = TRUE, type = 8),
base::mean(mod$residuals, na.rm = TRUE))
names(residuals) <- c("P0", "P25", "P50", "P75", "P100", "AvgResid")
return(c(slope, var, snr, residuals))}
}
and for OLS:
# function for calculating the trend, variability, SNR, and residuals for each pixel
## this function performs simple OLS
funTrOLS <- function(x, ...) {if (sum(is.na(x)) >= 1) { NA } else {
mod <- lm(x ~ Year, na.action = na.omit)
slope <- mod$coefficients[2]
names(slope) <- "TrendOLS"
var <- sd(mod$residuals)
names(var) <- "VariabilityOLS"
snr <- slope/var
names(snr) <- "SNROLS"
residuals <- c(stats::quantile(
mod$residuals, probs = seq(0,1,0.25),
na.rm = TRUE, names = TRUE, type = 8),
base::mean(mod$residuals, na.rm = TRUE))
names(residuals) <- c("P0", "P25", "P50", "P75", "P100", "AvgResid")
return(c(slope, var, snr, residuals))}
}
I'm trying to wrap these in a tryCatch expression which can be passed to raster::calc
xReg <- tryCatch(
{
funTrAR1
},
error = function(e) {
## this should create a text file if a model throws an error
sink(paste0(inDir, "/Outputs/localOLSErrors.txt"), append = TRUE)
cat(paste0("Used OLS regression (grid-cell) for model: ", m, ". Scenario: ", t, ". Variable: ", v, ". Realisation/Ensemble: ", r, ". \n"))
sink()
## run the second regression function
funTrOLS
}
)
This function is then passed to raster::calc like so
cellResults <- calc(rasterStack, fun = xReg)
Which if everything works will produce a raster stack of the output variables that looks similar to this
However, for some of my datasets the loop that I'm running all of this in stops and I receive the following error:
Error in nlme::gls(x ~ Year, na = na.omit, method = "REML", verbose = TRUE, :
false convergence (8)
Which is directly from nlme::gls and what I was hoping to avoid. I've never used tryCatch before (this might be very obvious), but does anyone know how to get the tryCatch() to move to the second regression function if the first (AR1) regression fails?
Here is another way to code this, perhaps that helps:
xReg <- function(x, ...) {
r <- try(funTrAR1(x, ...), silent=TRUE)
# if (class(r) == 'try-error') {
if (!is.numeric(r)) { # perhaps a faster test than the one above
r <- c(funTrOLS(x, ...), 2)
} else {
r <- c(r, 1)
}
r
}
I add a layer that shows which model was used for each cell.
You can also do
xReg <- function(x, ...) {
r <- funTrOLS(x, ...)
try( r <- funTrAR1(x, ...), silent=TRUE)
r
}
Or use calc twice and use cover after that
xReg1 <- function(x, ...) {
r <- c(NA, NA, NA, NA)
try( r <- funTrAR1(x, ...), silent=TRUE)
r
}
xReg2 <- function(x, ...) {
funTrOLS(x, ...)
}
a <- calc(rasterStack, xReg1)
b <- calc(rasterStack, xReg2)
d <- cover(a, b)
And a will show you where xReg1 failed.
After doing a bit more reading, and also looking at #RobertH answer, I wrote a bit of (very) ugly code that checks if GLS will fail and if it does, performs OLS instead. I'm positive that there is a nicer way to do this, but it works and maintains raster layer names as they were defined in my functions, it also exports any errors to a txt file.
for (i in 1) {
j <- tempCentredRas
cat(paste("Checking to see if gls(AR1) will work for model", m, r,"cell based calculations\n", sep = " "))
### This check is particularly annoying as it has to do this for every grid-cell
### it therefore has to perform GLS/OLS on every grid cell twice
### First to check if it (GLS) will fail, and then again if it does fail (use OLS) or doesn't (use GLS)
possibleLocalError <- tryCatch(
raster::calc(j, fun = funTrAR1),
error = function(err)
err
)
if (inherits(possibleLocalError, "error")) {
cat(paste("GLS regression failed for model", m, r, "using OLS instead for cell based results.","\n", sep = " "))
cellResults <- raster::calc(j, fun = funTrOLS)
} else {
cellResults <- raster::calc(j, fun = funTrAR1)
}
}

Write results from stepAIC to a table

I am trying to write r2, rmse, coefficients, and standardized coefficients from stepAIC to a .CSV file:
NO3_lmres_ClimateOnly <- data.frame()
for (i in unique(Data$SeasAlltxt)){
print (i)
subdata1 <- subset(Data, SeasAlltxt == i)
for (j in unique(Data$ALSCIDtxtall)){
subdata2 <- subset(subdata1, ALSCIDtxtall == j)
fit <- lm(NO3resid~Avg94NO3+MaxDepth_m+MaxDepthDOY+FirstZeroDOY+PeakToGone+PRISMppt+PRISMtmax, data = subdata2, na.action = na.omit)
step <- stepAIC(fit, direction="both")
rmse <- round(sqrt(mean(resid(step)^2)), 3)
r2 <- round(summary(step)$r.squared, 3)
coefs <- summary(step)$coefficients
stdcoefs <- lm.beta(step)
stdcoefs <- unname(stdcoefs)
params <- names(stdcoefs)
tempvalues <- data.frame(i,j,rmse,r2,coefs,stdcoefs,params)
colnames(tempvalues) <- c('SeasAlltxt', 'ALSCIDtxtall', 'rmse', 'r2', 'coef', 'stdcoef','param')
NO3_lmres_ClimateOnly <- rbind(NO3_lmres_ClimateOnly,tempvalues)
}
}
write.csv(NO3_lmres_ClimateOnly, file = "NO3_ClimateOnly_stats.csv")
However, the above code produces this error:
Error in data.frame(i, j, rmse, r2, coefs, stdcoefs, params) :
arguments imply differing number of rows: 1, 3, 2, 0
I would also like to write the p-value associated with each parameter to the output table.
Any suggestions for how to accomplish this?
Maybe you want to change your code:
for (j in unique(subdata1$ALSCIDtxtall))
...
coefs <- summary(step)$coefficients[,1]
...
tempvalues <- data.frame(t(c(i,j,rmse,r2,coefs,stdcoefs,params)),stringsAsFactors=F)
colnames(tempvalues ) <- c('SeasAlltxt', 'ALSCIDtxtall', 'rmse', 'r2', names(coefs), paste('stdcoef:',params),params)
Bud the final rbind will give you an error when stepAIC select different number of coefficients.
Think of using a list instead:
Define cont=1 outside the for
then, change the following lines:
tempvalues <- data.frame(t(c(i,j,rmse,r2,coefs,stdcoefs,params)),stringsAsFactors=F)
colnames(tempvalues ) <- c('SeasAlltxt', 'ALSCIDtxtall', 'rmse', 'r2', names(coefs), paste('stdcoef:',params),params)
NO3_lmres_ClimateOnly[[cont]] <- tempvalues
cont=cont+1
Good luck!!

Resources