R multivariate one-step-ahead forecasts and accuracy

Using R I would like to compare the RMSE (root mean square error) from two prediction models. The first model uses estimates from 1966 to 2000 to predict 2001 and then uses estimates from 1966 to 2001 to predict 2002 and so on up to 2015. The second model uses estimates from 1991 to 2000 to predict 2001 and then uses estimates from 1992 to 2001 to predict 2002 and so on up to 2015. This problem has me really stumped and I truly appreciate any help.
DF <- data.frame(YEAR=1966:2015, TEMP=rnorm(50), PRESSURE=rnorm(50), RAINFALL=rnorm(50))
lmod <- lm(TEMP ~ PRESSURE + RAINFALL, data = DF)
rmse <- function(error) sqrt(mean(error^2))
rmse(lmod$residuals)

You can loop it:
Method 1:
pred1 <- numeric(0)
rmse1 <- numeric(0)
for (i in 1:15) {
  DF.train1 <- DF[DF$YEAR < 2000 + i, ]
  DF.test1  <- DF[DF$YEAR == 2000 + i, ]
  lmod1 <- lm(TEMP ~ PRESSURE + RAINFALL, data = DF.train1)
  pred1[i] <- predict(lmod1, newdata = DF.test1)
  rmse1[i] <- sqrt(mean((DF.test1$TEMP - pred1[i])^2))
}
pred1
rmse1
mean(rmse1)
Method 2:
pred2 <- numeric(0)
rmse2 <- numeric(0)
for (i in 1:15) {
  DF.train2 <- DF[DF$YEAR < 2000 + i & DF$YEAR > 1989 + i, ]
  DF.test2  <- DF[DF$YEAR == 2000 + i, ]
  lmod2 <- lm(TEMP ~ PRESSURE + RAINFALL, data = DF.train2)
  pred2[i] <- predict(lmod2, newdata = DF.test2)
  rmse2[i] <- sqrt(mean((DF.test2$TEMP - pred2[i])^2))
}
pred2
rmse2
mean(rmse2)
Comparing the individual components of rmse1 and rmse2, as well as their respective means, should be useful. The vectors pred1 and pred2 contain the individual TEMP predictions for each year (2001-2015) for their respective methods.
Edit: this should be working now; Method 2 trains on a rolling 10-year window. Also, I take RMSE to be the square root of the MSE as defined for predictors in this article.
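One caveat: because each test set here contains a single year, rmse1[i] reduces to the absolute error for that year, so mean(rmse1) is really a mean absolute error. If you want a single RMSE over all 15 one-step-ahead errors, pool the errors first (a small sketch using the objects created above):
actual <- DF$TEMP[DF$YEAR %in% 2001:2015]
sqrt(mean((actual - pred1)^2))  # overall RMSE, expanding-window model
sqrt(mean((actual - pred2)^2))  # overall RMSE, rolling 10-year model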

Here is another solution, where the simulations are wrapped in a function.
The advantage of this approach is that it is easy to modify the model specification.
For example, if you want to try model2 with a 15-year window instead of 10, just change the function input (range = 15). This also lets you do a light sensitivity analysis (see the sketch after the RMSE/NSE comparison below).
compare_models <- function(DF, start = 1966, end = 2000, range = 10)
{
  # initialise the prediction columns so the year-by-year assignment below works
  DF$model1_sim <- NA
  DF$model2_sim <- NA
  for (i in (end + 1):max(DF$YEAR))
  {
    # model1: expanding window from `start` up to year i - 1
    lmod_1 <- lm(TEMP ~ PRESSURE + RAINFALL, data = DF[DF$YEAR >= start & DF$YEAR < i, ])
    DF$model1_sim[DF$YEAR == i] <- predict(lmod_1, newdata = DF[DF$YEAR == i, ])
    # model2: rolling window of `range` years ending at year i - 1
    lmod_2 <- lm(TEMP ~ PRESSURE + RAINFALL, data = DF[DF$YEAR >= i - range & DF$YEAR < i, ])
    DF$model2_sim[DF$YEAR == i] <- predict(lmod_2, newdata = DF[DF$YEAR == i, ])
  }
  return(DF)
}
I used the hydroGOF package to compute RMSE and NSE, which is a common indicator of model efficiency (see Nash and Sutcliffe, 1970; 11528 citations at the moment).
output = compare_models(DF)
require(hydroGOF) # compute RMSE and NSE
# RMSE
rmse(output$model1_sim,output$TEMP)
rmse(output$model2_sim,output$TEMP)
# Nash-Sutcliffe efficiency
NSE(output$model1_sim,output$TEMP, na.rm = T)
NSE(output$model2_sim,output$TEMP, na.rm = T)
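For example, a quick sensitivity check on the rolling-window length could look like this (a sketch, not part of the original answer; it reuses compare_models() and hydroGOF's rmse() from above):
for (r in c(5, 10, 15, 20)) {
  out_r <- compare_models(DF, range = r)
  cat("range =", r, "-> model2 RMSE =", rmse(out_r$model2_sim, out_r$TEMP), "\n")
}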
And a simple simulated/observed plot to inspect the model predictions:
# melting data for plot (needs reshape2 and ggplot2)
library(reshape2)
library(ggplot2)
output_melt = melt(output[,c("TEMP", "model1_sim", "model2_sim")], id = "TEMP")
# Plot
ggplot(output_melt, aes(x = TEMP, y = value, color = variable)) +
  theme_bw() + geom_point() + geom_abline(slope = 1, intercept = 0) +
  xlim(-2,2) + ylim(-2,2) + xlab("Measured") + ylab("Simulated")

Here's yet another solution:
year <- 2000
time.frame <- 35
train.models <- function(year, time.frame) {
  predictions <- sapply(year:(max(DF$YEAR) - 1),
                        function(year) {
                          lmod <- lm(TEMP ~ PRESSURE + RAINFALL, DF,
                                     subset = with(DF, YEAR %in% (year - time.frame + 1):year))
                          pred <- predict(lmod, newdata = DF[DF$YEAR == (year + 1), ])
                          names(pred) <- year + 1
                          return(pred)
                        })
  return(predictions)
}
models1 <- train.models(2000, 35)
models2 <- train.models(2000, 10)
rmse(models1 - DF$TEMP[DF$YEAR %in% names(models1)])
rmse(models2 - DF$TEMP[DF$YEAR %in% names(models2)])

Related

Dimension mismatch in subset expression in JAGS

I am very new to Bayesian analysis and I was trying to practice with an example from tidytuesday (https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-03-29/sports.csv)
I have set up my model, but when trying to run it the following error message appears:
Error in jags.model(textConnection(jags.script_with), data = dataset, :
RUNTIME ERROR:
Compilation error on line 5.
Dimension mismatch in subset expression of y
Below is my approach:
Sports2 =
bind_rows(
sports_clean %>%
select(year, institution_name, sports,
participants = partic_women,
revenue = rev_women,
expenditure = exp_women) %>%
mutate(gender=1), #women
sports_clean %>%
select(year, institution_name, sports,
participants = partic_men,
revenue = rev_men,
expenditure = exp_men) %>%
mutate(gender=0) #men
) %>% na.omit
An example row of the dataset:
year  institution_name  sports  participants  revenue  expenditure  gender
2015  Alabama A&M Uni   Soccer  21            410717   432648       1
#modeling with regression
set.seed(123)
model_with =
lm(expenditure ~ gender + participants, data=Sports2)
model_with
#dataset for jags model
dataset = list(x=Sports2[,c(4,7)], y=Sports2[,6], n=nrow(Sports2))
#estimation coefficients
dataset$b_guess = model_with$coefficients
#Model
jags.script_with =
"
model{
#likelihood
for (i in 1:n){
y[i] ~ dnorm(mu[i], tau)
mu[i] = intercept + participants*x[i,1]
}
#priors
intercept ~ dnorm(bgues[1], 0.1)
participants ~ dnorm(b_guess[2], 0.1)
tau ~ dgamma(0.01,0.01)
#transform
sigma = 1/sqrt(tau)
}
"
#compiling
mod_with = jags.model(textConnection(jags.script_with),
data = dataset,
n.chains = 4, n.adapt = 2000)
I can't figure out how to resolve the issue.
Looking for advice, please.
Thank you in advance!
Edit:
I have removed all parts that are "unnecessary" for the model. Below is the corrected code - unfortunately I still cannot figure out why the error persists.
Compiling model graph
Resolving undeclared variables
Deleting model
Error in jags.model(textConnection(jags.script_with), data = dataset, :
RUNTIME ERROR:
Compilation error on line 5.
Dimension mismatch in subset expression of y
library(rjags)
library(tidyverse)
library(ggplot2)
sports_raw = read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-03-29/sports.csv')
Sports2 =
bind_rows(
sports_raw %>%
select(year, institution_name, sports,
participants = partic_women,
revenue = rev_women,
expenditure = exp_women) %>%
mutate(gender=1), #women
sports_raw %>%
select(year, institution_name, sports,
participants = partic_men,
revenue = rev_men,
expenditure = exp_men) %>%
mutate(gender=0) #men
) %>% na.omit
#modeling with regression
set.seed(123)
model_with =
lm(expenditure ~ gender + participants, data=Sports2)
model_with
#dataset for jags model
dataset = list(x=Sports2[,c(4,7)], y=Sports2[,6], n=nrow(Sports2))
#estimation coefficients
dataset$b_guess = model_with$coefficients
#Model
jags.script_with =
"
model{
#likelihood
for (i in 1:n){
y[i] ~ dnorm(mu[i], tau)
mu[i] = intercept + participants*x[i,1]
}
#priors
intercept ~ dnorm(b_guess[1], 0.1)
participants ~ dnorm(b_guess[2], 0.1)
tau ~ dgamma(0.01,0.01)
#transform
sigma = 1/sqrt(tau)
}
"
#compiling
mod_with = jags.model(textConnection(jags.script_with),
data = dataset,
n.chains = 4, n.adapt = 2000)
The problem is that in your original code you subset a tibble with [. Unlike a regular data frame, where selecting a single column this way drops it to a vector, a tibble stays a tibble with one variable. So the error is really saying that instead of being a vector, as your model code expects, y is a one-column data frame, which JAGS treats differently from a vector.
library(rjags)
library(tidyverse)
library(ggplot2)
sports_raw = read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-03-29/sports.csv')
Sports2 =
bind_rows(
sports_raw %>%
select(year, institution_name, sports,
participants = partic_women,
revenue = rev_women,
expenditure = exp_women) %>%
mutate(gender=1), #women
sports_raw %>%
select(year, institution_name, sports,
participants = partic_men,
revenue = rev_men,
expenditure = exp_men) %>%
mutate(gender=0) #men
) %>% na.omit
#modeling with regression
set.seed(123)
model_with =
lm(expenditure ~ gender + participants, data=Sports2)
model_with
#dataset for jags model
dataset = list(x=Sports2[,c(4,7)], y=Sports2[,6], n=nrow(Sports2))
dim(dataset$y)
# [1] 130748      1
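To see that behaviour in isolation (a minimal sketch, separate from the model code):
library(tibble)
df1 <- data.frame(a = 1:3)
tb1 <- tibble(a = 1:3)
df1[, 1]   # a plain numeric vector: 1 2 3
tb1[, 1]   # still a 1-column tibble (dim 3 x 1)
tb1[[1]]   # or tb1$a / dplyr::pull(tb1, a) to get a vector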
There are two ways to fix this, you can make y a vector in the data:
dataset = list(x=Sports2[,c(4,7)],
y=Sports2$expenditure, n=nrow(Sports2))
dim(dataset$y)
# NULL
length(dataset$y)
# [1] 130748
Or, you could change the likelihood part of your model to acknowledge that y is a one-column matrix:
y[i,1] ~ dnorm(mu[i], tau)
The rest of the model could stay as it is. Do one or the other of these (though not both at the same time) and your model will run.
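For completeness, a sketch of drawing samples once the model compiles (the monitored names come from the model string above; the iteration count is arbitrary):
mod_with <- jags.model(textConnection(jags.script_with), data = dataset,
                       n.chains = 4, n.adapt = 2000)
samples <- coda.samples(mod_with,
                        variable.names = c("intercept", "participants", "sigma"),
                        n.iter = 5000)
summary(samples)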

R: How to loop through models dropping one observation at a time?

I am having trouble looping through a regression model, dropping one observation each time, to estimate the effect of influential observations.
I would like to run the model several times, each time dropping the ith observation, extracting the relevant coefficient estimate and storing it in a vector. I think this could quite easily be done with a fairly straightforward loop; however, I'm stuck on the specifics.
I want to be left with a vector containing n coefficient estimates from n iterations of the same model. Any help would be beneficial!
Below I provide some dummy data and example code.
#Dummy data:
set.seed(489)
patientn <- rep(1:400)
gender <- rbinom(400, 1, 0.5)
productid <- rep(c("Product A","Product B"), times=200)
country <- rep(c("USA","UK","Canada","Mexico"), each=50)
baselarea <- rnorm(400,400,60) #baseline area
baselarea2 <- rnorm(400,400,65) #baseline area2
sfactor <- c(
rep(c(0.3,0.9), times = 25),
rep(c(0.4,0.5), times = 25),
rep(c(0.2,0.4), times = 25),
rep(c(0.3,0.7), times = 25)
)
rashdummy2a <- data.frame(patientn,gender,productid,country,baselarea,baselarea2,sfactor)
library(dplyr)  # needed for %>% and mutate()
Data <- rashdummy2a %>% mutate(rashleft = baselarea2*sfactor/baselarea*100)
## Example of how this can be done manually:
# model
m1 <- lm(rashleft ~ gender + baselarea + sfactor, data = Data)
# extracting relevant coefficient estimates, each time dropping a different "patient" ("patientn")
betas <- c(lm(rashleft ~ gender + baselarea + sfactor, data = rashdummy2b, patientn !=1)$coefficients[2],
lm(rashleft ~ gender + baselarea + sfactor, data = rashdummy2b, patientn !=2)$coefficients[2],
lm(rashleft ~ gender + baselarea + sfactor, data = rashdummy2b, patientn !=3)$coefficients[2])
# the betas vector now stores the relevant coefficient estimates (coefficient nr 2, for gender) for three different variations of the model.
We can use a for loop. In your question you use an object rashdummy2b, which is not defined; below I use the Data object created above, but you can substitute any data frame of your choice.
#create list to bind results to
result <- list()
#loop through patients and extract betas
for (i in unique(Data$patientn)) {
  #construct linear model without patient i
  lm.model <- lm(rashleft ~ gender + baselarea + sfactor, data = subset(Data, patientn != i))
  #create data.frame containing patient left out and coefficient
  result.dt <- data.frame(beta = lm.model$coefficients[[2]],
                          patient_left_out = i)
  #bind to list
  result[[i]] <- result.dt
}
#bind to data.frame
result <- do.call(rbind, result)
Result
head(result)
beta patient_left_out
1 1.381248 1
2 1.345188 2
3 1.427784 3
4 1.361674 4
5 1.420417 5
6 1.454196 6
You can drop a particular row (or column) by using a negative index. In your case, you proceed as follows:
betas <- numeric(nrow(rashdummy2b)) # memory preallocation
for (i in 1:nrow(rashdummy2b)) {
  betas[i] <- lm(rashleft ~ gender + baselarea + sfactor, data = rashdummy2b[-i, ])$coefficients[2]
}
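As a side note (not from either answer above): base R's dfbeta() reports this leave-one-out influence directly, without refitting in a loop. A minimal sketch, assuming the Data object and the full model from the question:
m_full <- lm(rashleft ~ gender + baselarea + sfactor, data = Data)
# dfbeta() gives, per observation, the change in each coefficient when that
# observation is dropped; compare its "gender" column with the looped betas
head(dfbeta(m_full)[, "gender"])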

R: Predicting with lmer, y ~ . formula error

Predicting values in new data from an lmer model throws an error when a period is used to represent predictors. Is there any way around this?
The answer to this similar question offers a way to automatically write out the full formula instead of using the period, but I'm curious if there's a way to get predictions from new data just using the period.
Here's a reproducible example:
library(lme4)
mydata <- data.frame(
groups = rep(1:3, each = 100),
x = rnorm(300),
dv = rnorm(300)
)
train_subset <- sample(1:300, 300 * .8)
train <- mydata[train_subset,]
test <- mydata[-train_subset,]
# Returns an error
mod <- lmer(dv ~ . - groups + (1 | groups), data = train)
predict(mod, newdata = test)
predict(mod) # getting predictions for the original data works
# Writing the full formula without the period does not return an error, even though it's the exact same model
mod <- lmer(dv ~ x + (1 | groups), data = train)
predict(mod, newdata = test)
This should be fixed in the development branch of lme4 now. You can install from GitHub (see first line below) or wait a few weeks (early April-ish) for a new version to hit CRAN.
remotes::install_github("lme4/lme4") ## you will need compilers etc.
mydata <- data.frame(
groups = rep(1:3, each = 100),
x = rnorm(300),
dv = rnorm(300)
)
train_subset <- sample(1:300, 300 * .8)
train <- mydata[train_subset,]
test <- mydata[-train_subset,]
# Previously returned an error; works with the development version
mod <- lmer(dv ~ . - groups + (1 | groups), data = train)
p1 <- predict(mod, newdata = test)
mod2 <- lmer(dv ~ x + (1 | groups), data = train)
p2 <- predict(mod2, newdata = test)
identical(p1, p2) ## TRUE
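If you would rather not install the development version, a workaround in the spirit of the linked question (a sketch, not part of the original answer) is to expand the dot into an explicit formula before fitting:
# build "dv ~ x + (1 | groups)" programmatically from the column names
fixed <- setdiff(names(train), c("dv", "groups"))
form <- reformulate(c(fixed, "(1 | groups)"), response = "dv")
mod3 <- lmer(form, data = train)
predict(mod3, newdata = test)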

Output of cv.glm vs. cv.glmnet

I'm trying to do cross validation for a dataset that I want to fit a Poisson model to. I notice, however, that I get very different outputs from the functions cv.glm and cv.glmnet when I use lambda = 0. Below is my code for the basic Poisson model (the first part is setup of the data set):
game_soon <- function(game_type, hour){
  ret_vec <- c()
  len_game_type <- length(game_type)
  for(i in 1:len_game_type){
    if(game_type[i] == 'N' && hour[i] >= 16){
      ret_vec <- c(ret_vec, 1)
    }
    else if(game_type[i] == 'D' && hour[i] >= 10 && hour[i] <= 17){
      ret_vec <- c(ret_vec, 1)
    }
    else{
      ret_vec <- c(ret_vec, 0)
    }
  }
  return(ret_vec)
}
wrigley_agg <- read.csv("/Users/eweine/Desktop/myDivvy/export/EWEINE/WR/WRIGLEY_DIVVY/data", header=FALSE)
colnames(wrigley_agg) <- c("Checkouts", "Temp", "Humidity", "Rain_Intensity",
"Rain_Total", "Hour", "DOY", "Weekday", "Cubs_Game")
game_vec <- wrigley_agg$Cubs_Game
hour_vec <- wrigley_agg$Hour
new_column <- game_soon(game_vec, hour_vec)
wrigley_agg$Game_Soon <- new_column
require(boot)  # cv.glm lives in boot; glm() itself is in base R's stats package
basic_poisson <- glm(Checkouts ~ Weekday + Game_Soon + poly(Hour, 6) +
poly(Temp, 4) + poly(Rain_Intensity, 4), data=wrigley_agg, family=poisson)
cv_poisson <- cv.glm(wrigley_agg, basic_poisson, K=10)
print(cv_poisson$delta)
My output is:
[1] 958.9232 958.5509
Below is my code for the cv.glmnet model:
x_pois <- model.matrix(Checkouts ~ Weekday + Game_Soon + poly(Hour, 6) + poly(Temp, 4) +
poly(Rain_Intensity, 4), data=wrigley_agg)
y_pois <- wrigley_agg$Checkouts
cv_lasso_pois <- cv.glmnet(x_pois, y_pois, family="poisson", alpha=1, lambda=seq(1, 0, -1))
no_penalty_cv <- cv_lasso_pois$cvm[cv_lasso_pois$lambda == 0]
print(no_penalty_cv)
And my output is:
[1] 13.41691
The data can be found here.
Why are these values so different?
This is an interesting question.
The difference between the mean cross-validated errors given by cv.glm and cv.glmnet (with lambda=0) is due to the different cost functions used by the two commands.
For Poisson models, cv.glm uses the average squared error by default, while cv.glmnet uses the deviance.
Below I define a function devi that calculates deviance as required by cv.glm:
library(glmnet)
library(boot)
basic_poisson <- glm(Checkouts ~ Weekday + Game_Soon + poly(Hour, 6) +
                       poly(Temp, 4) + poly(Rain_Intensity, 4),
                     data = wrigley_agg, family = poisson)
devi <- function(y, eta) {
  deveta = y * log(eta) - eta
  devy = y * log(y) - y
  devy[y == 0] = 0
  mean(2 * (devy - deveta))
}
set.seed(1)
cv_poisson <- cv.glm(data=wrigley_agg, glmfit=basic_poisson, cost=devi, K = 10)
print(cv_poisson$delta)
The estimated (raw and adjusted) mean cross-validated errors now are:
[1] 13.42184 13.41605
They are very close to the error given by cv.glmnet.
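The comparison can also be run the other way round: ask cv.glmnet for squared error rather than deviance via type.measure = "mse" (a sketch, not part of the original answer; the exact value will vary a little with the fold randomness):
set.seed(1)
cv_lasso_mse <- cv.glmnet(x_pois, y_pois, family = "poisson", alpha = 1,
                          lambda = seq(1, 0, -1), type.measure = "mse")
cv_lasso_mse$cvm[cv_lasso_mse$lambda == 0]  # should be on the same scale as cv.glm's default cost (~959 above)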

Automatically compare nested models from mice's glm.mids

I have a multiply-imputed model from R's mice package in which there are lots of factor variables. For example:
library(mice)
library(Hmisc)
# turn all the variables into factors
fake = nhanes
fake$age = as.factor(nhanes$age)
fake$bmi = cut2(nhanes$bmi, g=3)
fake$chl = cut2(nhanes$chl, g=3)
head(fake)
age bmi hyp chl
1 1 <NA> NA <NA>
2 2 [20.4,25.5) 1 [187,206)
3 1 <NA> 1 [187,206)
4 3 <NA> NA <NA>
5 1 [20.4,25.5) 1 [113,187)
6 3 <NA> NA [113,187)
imput = mice(fake)
# big model
fit1 = glm.mids((hyp==2) ~ age + bmi + chl, data=imput, family = binomial)
I want to test the significance of each entire factor variable in the model (not the indicator variables for each level) by testing the full model against each possible nested model that drops one variable at a time. Manually, I can do:
# small model (no chl)
fit2 = glm.mids((hyp==2) ~ age + bmi, data=imput, family = binomial)
# extract p-value from pool.compare
pool.compare(fit1, fit2)$pvalue
How can I do this automatically for all the factor variables in my model? The very helpful function drop1 was suggested to me for a previous question -- now I want to do something exactly like that except for the mice case.
Possibly helpful note: An annoying feature of pool.compare is that it appears to want the "extra" variables in the larger model to be placed after the ones that are shared with the smaller model.
You can use a loop to iterate through the different combinations of predictors, after arranging them in the order required for pool.compare.
So, using your fake data from above (with the number of categories tweaked):
library(mice)
library(Hmisc)
# turn all the variables into factors
fake <- nhanes
fake$age <- as.factor(nhanes$age)
fake$bmi <- cut2(nhanes$bmi, g=2)
fake$chl <- cut2(nhanes$chl, g=2)
# Impute
imput <- mice(fake, seed=1)
# Create models
# - reduced models with one variable removed
# - full models with extra variables at end of expression
vars <- c("age", "bmi", "chl")
red <- combn(vars, length(vars)-1 , simplify=FALSE)
diffs <- lapply(red, function(i) setdiff(vars, i) )
(full <- lapply(1:length(red), function(i)
paste(c(red[[i]], diffs[[i]]), collapse=" + ")))
#[[1]]
#[1] "age + bmi + chl"
#[[2]]
#[1] "age + chl + bmi"
#[[3]]
#[1] "bmi + chl + age"
(red <- combn(vars, length(vars)-1 , FUN=paste, collapse=" + "))
#[1] "age + bmi" "age + chl" "bmi + chl"
The models are now in the correct order to pass to the glm call. I've also replaced the glm.mids calls with with.mids, since glm.mids has been superseded by with.mids - see ?glm.mids.
out <- vector("list", length(red))
for (i in 1:length(red)) {
  redMod <- with(imput,
                 glm(formula(paste("(hyp==2) ~ ", red[[i]])), family = binomial))
  fullMod <- with(imput,
                  glm(formula(paste("(hyp==2) ~ ", full[[i]])), family = binomial))
  out[[i]] <- list(predictors = diffs[[i]],
                   pval = c(pool.compare(fullMod, redMod)$pvalue))
}
do.call(rbind.data.frame, out)
# predictors pval
#2 chl 0.9976629
#21 bmi 0.9985028
#3 age 0.9815831
# Check manually by leaving out chl
mod1 <- with(imput, glm((hyp==2) ~ age + bmi + chl , family = binomial))
mod2 <- with(imput, glm((hyp==2) ~ age + bmi , family = binomial))
pool.compare(mod1, mod2)$pvalue
# [,1]
#[1,] 0.9976629
You will get a lot of warnings using this dataset.
EDIT
You could wrap this in a function
impGlmDrop1 <- function(vars, outcome, Data = imput, Family = "binomial")
{
  red <- combn(vars, length(vars)-1, simplify=FALSE)
  diffs <- lapply(red, function(i) setdiff(vars, i))
  full <- lapply(1:length(red), function(i)
    paste(c(red[[i]], diffs[[i]]), collapse=" + "))
  red <- combn(vars, length(vars)-1, FUN=paste, collapse=" + ")
  out <- vector("list", length(red))
  for (i in 1:length(red)) {
    redMod <- with(Data,
                   glm(formula(paste(outcome, red[[i]], sep="~")), family = Family))
    fullMod <- with(Data,
                    glm(formula(paste(outcome, full[[i]], sep="~")), family = Family))
    out[[i]] <- list(predictors = diffs[[i]],
                     pval = c(pool.compare(fullMod, redMod)$pvalue))
  }
  do.call(rbind.data.frame, out)
}
# Run
impGlmDrop1(c("age", "bmi", "chl"), "(hyp==2)")
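A side note, not part of the original answer: newer versions of mice deprecate pool.compare() in favour of D1()/D2()/D3() and anova() on the fitted mira objects, so on a current installation the manual check above would look roughly like this:
mod1 <- with(imput, glm((hyp==2) ~ age + bmi + chl, family = binomial))
mod2 <- with(imput, glm((hyp==2) ~ age + bmi, family = binomial))
D1(mod1, mod2)          # multivariate Wald test, the pool.compare() replacement
# or: anova(mod1, mod2)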
