Fitting a multinomial glm for a very large dataset - r

I have multinomial compositional data for 100 categories from two groups, where each is represented by two ages:
set.seed(1)
df <- data.frame(group = c(rep("g1",200),rep("g2",200)),
age = c(rep("a1",100),rep("a2",100),rep("a1",100),rep("a2",100)),
category = rep(paste0("c",1:100),4),
n = c(rmultinom(1,7000,pgamma(shape=0.8,rate=0.1,q=seq(0.01,1,0.01))),
rmultinom(1,5000,pgamma(shape=0.8,rate=0.3,q=seq(0.01,1,0.01))),
rmultinom(1,1800,pgamma(shape=0.5,rate=0.1,q=seq(0.01,1,0.01))),
rmultinom(1,1200,pgamma(shape=0.9,rate=0.1,q=seq(0.01,1,0.01)))),
stringsAsFactors = F)
I want to fit a regression model to estimate the interaction effect of the category * group, while controlling for age.
So far, I'm trying to use a multicategorical glm (with a binomial(link = 'logit')), to a data.frame where I transform the df$n (total counts) to a binomial (binary) form:
binomial.df <- do.call(rbind,lapply(unique(df$group),function(g){
do.call(rbind,lapply(unique(dplyr::filter(df,group == g)$age),function(a){
do.call(rbind,lapply(unique(dplyr::filter(df,group == g)$category),function(t){
sum.non.category <- sum(dplyr::filter(df,group == g & age == a & category != t)$n)
sum.category <- sum(dplyr::filter(df,group == g & age == a & category == t)$n)
data.frame(group = g,age = a,category = t,assigned.category = c(rep(0,sum.non.category),rep(1,sum.category)))
}))
}))
}))
binomial.df$group <- factor(binomial.df$group, levels = c("g1","g2"))
binomial.df$age <- factor(binomial.df$age, levels = c("a1","a2"))
binomial.df$category <- factor(binomial.df$category, levels = paste0("c",1:100))
mm.fit <- glm(assigned.category ~ category * group + age,data = binomial.df, family = binomial(link = 'logit'))
Clearly for this size of data the glm call will run for days or even longer, so I'm looking for a more tractable way.
Any idea?
BTW, I tried using nnet's multinom first:
df$group <- factor(df$group, levels = c("g1","g2"))
df$age <- factor(df$age, levels = c("a1","a2"))
df$category <- factor(df$category, levels = paste0("c",1:100))
mm.fit <- nnet::multinom(n ~ category * group + age, data=df)
But I get:
Error in nnet.default(X, Y, w, mask = mask, size = 0, skip = TRUE, softmax = TRUE, :
too many (22220) weights

The nnet::multinom issue you can resolve by modifying your multinom call to include the argument MaxNWts=100000 :
mm.fit <- nnet::multinom(n ~ category * group + age, data=df, MaxNWts=100000)
To fit large multinomial models in R you could also look into the h20 package :
https://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/glm.html

Related

Getting error "invalid type (list) for variable" when running multiple models in a for loop: how to specify outcome/predictors?

For a study I am working on I need to create bootstrapped datasets and inverse probability weights for each dataset and then run a series of models for each of these datasets/weights. I am attempting to do this with a nested for-loop where the first part of the loop creates the weights and the nested loop runs a series of models, each with different outcome variables and/or predictors. I am running about 80 models for each bootstrapped dataset, hence the reason for a more automated way to do this. Below is a example of what I am doing with some mock data:
# Creation of mock data
data <- data.frame("Severity" = as.factor(c(rep("None", 25), rep("Mild", 25), rep("Moderate", 25), rep("Severe", 25))), "Severity2" = as.factor(c(rep("None", 40), rep("Mild", 20), rep("Moderate", 20), rep("Severe", 20))), "Weight" = rnorm(100, mean = 160, sd = 30), "Age" = rnorm(100, mean = 40, sd = 7), "Gender" = as.factor(rbinom(100, size = 1, prob = 0.5)), "Tested" = as.factor(rbinom(100, size = 1, prob = 0.4)))
data$Severity <- ifelse(data$Tested == 0, NA, data$Severity)
data$Severity2 <- ifelse(data$Tested == 0, NA, data$Severity2)
data$Severity <- ordered(data$Severity, levels = c("None", "Mild", "Moderate", "Severe"))
data$Severity2 <- ordered(data$Severity2, levels = c("None", "Mild", "Moderate", "Severe"))
# Creating boostrapped datasets
nboot <- 2
set.seed(10)
boot.samples <- lapply(1:nboot, function(i) {
data[base::sample(1:nrow(data), replace = TRUE),]
})
# Create empty list to store results later
coefs <- list()
# Setting up the outcomes/predictors of each of the models I will run
mod1 <- list("outcome" <- "Severity", "preds" <- c("Weight","Age"))
mod2 <- list("outcome" <- "Severity2", "preds" <- c("Weight", "Age", "Gender"))
models <- list(mod1, mod2)
# Running the for-loop
for(i in 1:length(boot.samples)) {
#Setting up weight creation
null <- glm(formula = Tested ~ 1, family = "binomial", data = boot.samples[[i]])
full <- glm(formula = Tested ~ Age, family = "binomial", data = boot.samples[[i]])
step <- step(null, k = 2, direction = "forward", scope=list(lower = null, upper = full), trace = 0)
pd.combined <- stats::predict(step, type = "response")
numer.combined <- glm(Tested ~ 1, family = "binomial",
data = boot.samples[[i]])
pn.combined <- stats::predict(numer.combined, type = "response")
# Creating stabilized weights
boot.samples[[i]]$ipw <- ifelse(boot.samples[[i]]$Tested==0, ((1-pn.combined)/(1-pd.combined)), (pn.combined)/(pd.combined))
# Now running each model and storing the coefficients
for(j in 1:length(models)) {
outcome <- models[[j]][[1]] # Set the outcome name
predictors <- models[[j]][[2]] # Set the predictor names
model_results <- polr(boot.samples[[i]][,outcome] ~ boot.samples[[i]][, predictors], weights = boot.samples[[i]]$ipw, method = c("logistic"), Hess = TRUE) #Run the model
coefs[[j]] <- model_results$coefficients # Store regression model coefficients in list
}
}
The portion for creating the IPW weights works just fine, but I keep getting an error for the modeling portion that reads:
"Error in model.frame.default(formula = boot.samples[[i]][, outcome] ~ :
invalid type (list) for variable 'boot.samples[[i]][, predictors]'"
Based on the question asked and answered here: Error in model.frame.default ..... : invalid type (list) for variable I know that the issue is with how I'm calling the outcomes and predictors in the model. I've messed around lots of different ways to handle this to no avail, I need to specify the outcome and predictors as I do because in my actual models the outcomes and predictors changes with each model! Any ideas on how to deal with this would be greatly appreciated!
I've tried something like setting outcome <- boot.samples[[i]][,outcome] outside of the model and then just calling outcome in the model, but that gives me the same error.

How can I find regression model analyses from 2 dataset?

setwd("C:/Users/sevvalayse.yurtekin/Desktop/hw3")
data = read.table('DSE501_fall2020_HW3.csv', header= T, sep=',')
attach
data
getOption("max.print")
rs<-rowSums(data[,2:76], na.rm = TRUE)
data<-cbind(data,rs)
data
p1<-ggplot()+
geom_line(aes(y = rs, x=year), data=data)+
scale_x_continuous(breaks = seq(2004,2019,2))
p1
model = lm(rs ~ year )
model
summary(model)
residuals(model)
predict(model)
#model.fit = lm(year~rs)
#summary(model.fit)
new.year<-data.frame(
year = c(2021,2022,2023)
)
predict(model, newdata = new.year, interval = 'confidence')
data2 = read.table('TUIK_nufus_2019.csv', header = T, sep=",")
data2
total = data2$Total
mydata<-data[-c(1,2,3),]
model2 = lm(mydata~total)
model2
Hello, I have an error about the Error in model.frame.default(formula = mydata ~ total, drop.unused.levels = TRUE) : invalid type (list) for variable 'mydata'.
How can I fixed? I want to regression analyses from 2 data.
The line that's causing the issue is model2 = lm(mydata~total). mydata is not a vector, which is what your dependent variable should be in the lm function. When you set mydata you do not provide a column name: mydata<-data[-c(1,2,3), <enter column name of dependent variable>]
Otherwise you can fit your model with the following syntax (provided your dependent and independent variables are in the same dataframe). Here I just used y as a fake variable name: lm(y ~ total, data = mydata)

How can I train a glmnet model (Poisson family) with an offset term using the caret package in R?

I want to model insurance claim count using a Poisson glmnet. The data I have at hand contains the number of claims for each policy (which is the response variable), some features about the policy (gender, region, etc.) as well as the duration of the policy (in years). I want to include the log-duration as an offset term, as we usually do in actuarial science. With the cv.glmnet function of the glmnet package, it is straightforward:
library(tidyverse)
library(glmnet)
n <- 100
dat <- tibble(
nb_claims = rpois(n, lambda = 0.5),
duration = runif(n),
x1 = runif(n),
x2 = runif(n),
x3 = runif(n)
)
fit <- cv.glmnet(
x = dat %>% dplyr::select(x1, x2, x3) %>% as.matrix(),
y = dat %>% pull(nb_claims),
family = "poisson",
offset = dat %>% pull(duration) %>% log()
)
fit
However, my goal is to train this model using the train function of the caret package, because of the many advantages it gives. Indeed, validation, preprocessing as well as feature selection is much better with this package. It is straightforward to train a basic glmnet (without an offset term) with caret:
library(caret)
fit <- caret::train(
x = dat %>% dplyr::select(x1, x2, x3) %>% as.matrix(),
y = dat %>% pull(nb_claims),
method = "glmnet",
family = "poisson"
)
fit
Naively, we could try to add the offset argument in the train function:
fit <- caret::train(
x = dat %>% dplyr::select(x1, x2, x3) %>% as.matrix(),
y = dat %>% pull(nb_claims),
method = "glmnet",
family = "poisson",
offset = dat %>% pull(duration) %>% log()
)
fit
Unfortunately, this code throws the error Error : No newoffset provided for prediction, yet offset used in fit of glmnet. This error occurs because the caret::train function doesn't take care to give a value for the newoffset argument in predict.glmnet function.
In this book, they show how to add an offset term to a GLM model by modifying the source code of the caret::train function. It works perfectly. However, the predict.glm function is quite different from the predict.glmnet function, because it does not have the newoffset argument. I tried to modify the source code of the caret::train function, but I am having some trouble because I do not know well enough how this function works.
A simple way to perform this is pass the offset column as part of x and in each fit and predict call pass as x columns of x which are not the offset. While as offset/newoffset pass the x column corresponding to the offset.
In the following example the offest column of x needs to be named "offset" too. This can be changed relatively easy
To create the function we will just use lots of parts from: https://github.com/topepo/caret/blob/master/models/files/glmnet.R
glmnet is peculiar since it needs a loop, the rest is just rinse and reapeat from https://topepo.github.io/caret/using-your-own-model-in-train.html#illustrative-example-1-svms-with-laplacian-kernels
family = "poisson" will be specified throughout, to change this adopt code from https://github.com/topepo/caret/blob/master/models/files/glmnet.R
glmnet_offset <- list(type = "Regression",
library = c("glmnet", "Matrix"),
loop = function(grid) {
alph <- unique(grid$alpha)
loop <- data.frame(alpha = alph)
loop$lambda <- NA
submodels <- vector(mode = "list", length = length(alph))
for(i in seq(along = alph)) {
np <- grid[grid$alpha == alph[i],"lambda"]
loop$lambda[loop$alpha == alph[i]] <- np[which.max(np)]
submodels[[i]] <- data.frame(lambda = np[-which.max(np)])
}
list(loop = loop, submodels = submodels)
})
glmnet_offset$parameters <- data.frame(parameter = c('alpha', 'lambda'),
class = c("numeric", "numeric"),
label = c('Mixing Percentage', 'Regularization Parameter'))
glmnet_offset$grid <- function(x, y, len = NULL, search = "grid") {
if(search == "grid") {
init <- glmnet::glmnet(Matrix::as.matrix(x[,colnames(x) != "offset"]), y,
family = "poisson",
nlambda = len+2,
alpha = .5,
offset = x[,colnames(x) == "offset"])
lambda <- unique(init$lambda)
lambda <- lambda[-c(1, length(lambda))]
lambda <- lambda[1:min(length(lambda), len)]
out <- expand.grid(alpha = seq(0.1, 1, length = len),
lambda = lambda)
} else {
out <- data.frame(alpha = runif(len, min = 0, 1),
lambda = 2^runif(len, min = -10, 3))
}
out
}
So x[,colnames(x) != "offset"] is x while offset is x[,colnames(x) == "offset"]
glmnet_offset$fit <- function(x, y, wts, param, last, ...) {
theDots <- list(...)
## pass in any model weights
if(!is.null(wts)) theDots$weights <- wts
if(!(class(x)[1] %in% c("matrix", "sparseMatrix")))
x <- Matrix::as.matrix(x)
modelArgs <- c(list(x = x[,colnames(x) != "offset"],
y = y,
alpha = param$alpha,
family = "poisson",
offset = x[,colnames(x) == "offset"]),
theDots)
out <- do.call(glmnet::glmnet, modelArgs)
if(!is.na(param$lambda[1])) out$lambdaOpt <- param$lambda[1]
out
}
glmnet_offset$predict <- function(modelFit, newdata, submodels = NULL) {
if(!is.matrix(newdata)) newdata <- Matrix::as.matrix(newdata)
out <- predict(modelFit,
newdata[,colnames(newdata) != "offset"],
s = modelFit$lambdaOpt,
newoffset = newdata[,colnames(newdata) == "offset"],
type = "response") #important for measures to be appropriate
if(is.matrix(out)) out <- out[,1]
out
if(!is.null(submodels)) {
tmp <- as.list(as.data.frame(predict(modelFit,
newdata[,colnames(newdata) != "offset"],
s = submodels$lambda,
newoffset = newdata[,colnames(newdata) == "offset"],
type = "response"),
stringsAsFactors = TRUE))
out <- c(list(out), tmp)
}
out
}
For some reason which I don't understand yet it does not work without the prob slot
glmnet_offset$prob <- glmnet_offset$predict
glmnet_offset$tags = c("Generalized Linear Model", "Implicit Feature Selection",
"L1 Regularization", "L2 Regularization", "Linear Classifier",
"Linear Regression")
glmnet_offset$sort = function(x) x[order(-x$lambda, x$alpha),]
glmnet_offset$trim = function(x) {
x$call <- NULL
x$df <- NULL
x$dev.ratio <- NULL
x
}
library(tidyverse)
library(caret)
library(glmnet)
n <- 100
set.seed(123)
dat <- tibble(
nb_claims = rpois(n, lambda = 0.5),
duration = runif(n),
x1 = runif(n),
x2 = runif(n),
x3 = runif(n)
)
x = dat %>%
dplyr::select(-nb_claims) %>%
mutate(offset = log(duration)) %>%
dplyr::select(-duration) %>%
as.matrix
fit <- caret::train(
x = x,
y = dat %>% pull(nb_claims),
method = glmnet_offset,
)
fit
100 samples
4 predictor
No pre-processing
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 100, 100, 100, 100, 100, 100, ...
Resampling results across tuning parameters:
alpha lambda RMSE Rsquared MAE
0.10 0.0001640335 0.7152018 0.01805762 0.5814200
0.10 0.0016403346 0.7152013 0.01805684 0.5814193
0.10 0.0164033456 0.7130390 0.01798125 0.5803747
0.55 0.0001640335 0.7151988 0.01804917 0.5814020
0.55 0.0016403346 0.7150312 0.01802689 0.5812936
0.55 0.0164033456 0.7095996 0.01764947 0.5783706
1.00 0.0001640335 0.7152033 0.01804795 0.5813997
1.00 0.0016403346 0.7146528 0.01798979 0.5810811
1.00 0.0164033456 0.7063482 0.01732168 0.5763653
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were alpha = 1 and lambda = 0.01640335.
predict(fit$finalModel, x[,1:3], newoffset = x[,4]) #works
This will not work with preprocessing in caret since we pass offset as one of the features. However it will work with recipes since you can define columns on which preprocessing functions will be performed via selections. Se article for details: https://tidymodels.github.io/recipes/articles/Selecting_Variables.html
I haven't had time to error check my code. If any problems occur or if there is a mistake somewhere please comment. Thanks.
You can also post an issue in caret github asking this feature (offset/newoffset) to be added to the model
I tried to change the model info a lot of ways, but it was failing miserably. Below I can propose one solution, may not be the best, but will get you somewhere if your data is sensible.
In the poisson / negative binom .. regression, the offset in factor gets introduced into the regression, you can read more here and here:
where tx is the offset. In glmnet, there is a penalty factor you can introduce for each term, and if you let that be 0 for a term, basically you are not penalizing it and it's always included. We can use that for the offset, and you can see this effect only if you use a dataset that makes some sense (note that in your example dataset, the offsets are numbers that make no sense).
Below I use the insurance claims dataset from MASS:
library(tidyverse)
library(glmnet)
library(MASS)
dat <- Insurance
X = model.matrix(Claims ~ District + Group + Age,data=dat)
Y = dat$Claims
OFF = log(dat$Holders)
fit_cv <- cv.glmnet(
x = X,
y = Y,
family = "poisson",
offset = OFF
)
Now using caret, I will fit it without any training, and using the same lambda obtained from the fit in cv.glmnet. One thing you should note too is that cv.glmnet often uses lambda.1se instead of lambda.min:
fit_c <- caret::train(
x = cbind(X,OFF),
y = Y,
method = "glmnet",
family = "poisson",
tuneGrid=data.frame(lambda=fit_cv$lambda.1se,alpha=1),
penalty=c(rep(1,ncol(X)),0),
trControl = trainControl(method="none")
)
We can see how different are the predictions:
p1 = predict(fit_cv,newx=X,newoffset=OFF)
p2 = predict(fit_c,newx=cbind(X,OFF))
plot(p1,p2)

How to bootstrap Mixed-Effects Model in R

I have a data set (df) in this format
index <- runif(n = 100,min = 0, max = 1)
type1 <- rep("low", 50)
type2 <- rep("high", 50)
type <- c(type1,type2)
level1 <- rep("single", 25)
level2 <- rep("multiple", 25)
level3 <- rep("single", 25)
level4 <- rep("multiple", 25)
level <- c(level1,level2,level3,level4)
block <- rep(1:5, 10)
set <- rep(1:5, 10)
df <- data.frame("index" = index,"type" = type, "level" = level, "block" = block, "set" = set)
df$block <- as.factor(df$block)
df$set <- as.factor(df$set)
I want to create a model that looks like like this
model <- lmer(index ~ type * level + (1|block) + (1|set), data = df)
However, in my original data the fit is bad because the data is bound between 0 and 1. I want to bootstrap this mixed effects model. Any idea on how to achieve boot-strapping for such a model? I want to compare this this full model with sub-models eg. without interaction, or with level or type alone. I also want with confidence intervals for the final model
The confint() function has a method for merMod objects. The following should work:
confint(model, method = "boot", nsim = 1000)
And with multiple CPUs:
confint(model, method = "boot", nsim = 1000,
parallel = "multicore", ncpus = 8)

Predict(), NewData with two column and differing rows

I am trying to make the prediction of three variables (retweets,media,content) in my dataset (df_22) to choose between Poisson, Negative binomial and Zero-inflated Poisson. One of the three variables is the response variable (retweets) and the other two the predictive variables (media,content).
I realize the generalized linear models and without problem.
Zero-inflated Poisson data
library("pscl")
summary( m0 <- zeroinfl(retweets ~ media + content, data=df_22,dist="poisson") )
Poisson
summary( m1 <- glm(formula=retweets ~ media + content, data=df_22, family="poisson"(link=log)))
Negative binomial
library (MASS)
summary( m2 <- glm.nb(retweets ~ media + content, data=df_22) )
However, when I create the new database to make the prediction. I check it levels.
> levels(df_22$media)
[1] "other" "pic" "pw" "text" "web"
> levels(df_22$content)
[1] "cultura" "employ" "environment" "other" "security" "sport" "transport"
I have a problem. And it is that the rows of both columns is different.
newmedia = c("other","pic","pw","text", "web")
newcontent = c("cultura","employ","environment","other","security","sport","transport")
nd = data.frame(media = newmedia, content = newcontent)
Error in data.frame(media = newmedia, content = newcontent) : arguments imply differing number of rows: 5, 7
What should I do to solve these problems?
I want to solve this problem in order to be able to make these predictions so that I can choose which of the three models is better for my data.
p0 <- cbind(nd, Count = predict(m0, newdata = nd, type = "count"), Zero = predict(m0, newdata = nd, type = "zero"))
p1 <- cbind(nd, Mean = predict(m1, newdata = nd, type="response"), SE = predict(m1, newdata = nd, type="response", se.fit=T)$se.fit)
p2 <- cbind(nd, Mean = predict(m2, newdata = nd, type="response"), SE = predict(m2, newdata = nd, type="response", se.fit=T)$se.fit)
In the code below a sample data set is created and it computes the p0, p1, p2. The nb dataframe was created differently as a test dataframe.
Import libraries
library(pscl)
library (MASS)
Create sample data set
media <- c("other", "pic", "pw", "text", "web")
content <- c("cultura", "employ", "environment", "other", "security", "sport", "transport")
set.seed(1)
retweets <- floor(abs(1e4*rnorm(1000)))
temp_index <- which(retweets %in% sample(retweets, 20)) # sample indexes
retweets[temp_index] <- 0 # set some retweets to zero to run zeroinfl()
df <- data.frame(retweets)
df$media <- sample(media, 1000, replace = TRUE)
df$content <- sample(content, 1000, replace = TRUE)
head(df)
unique(df$media)
unique(df$content)
Create a test data set
Note: Here, test data set is drawn from the training data for illustration purpose only. Ideally, it should be a new set of data.
nd = df[sample(nrow(df), 300), ] # ideally this should not be from the train data, this is just for an example code
nd_X <- test[,c('media', 'content')]
nd_Y <- test[,c('retweets')]
Fit models: zeroinf(dist='poisson'), glm(family='poisson'), glm.nb()
# Poisson
summary( m0 <- zeroinfl(retweets ~ media + content, data=df, dist="poisson") )
# Binomial
summary( m1 <- glm(formula=retweets ~ media + content, data=df, family="poisson"(link=log)))
# glm()
#summary( m2 <- glm.nb(retweets ~ media + content, data=df) ) # gives error in summary due to zeros
summary( m2 <- glm.nb(retweets ~ media + content, data=df[df$retweets!=0,]) ) # no error without zeros
Predict using test data set
p0 <- cbind(nd, Count = predict(m0, newdata = nd_X, type = "count"), Zero = predict(m0, newdata = nd, type = "zero"))
p1 <- cbind(nd, Mean = predict(m1, newdata = nd_X, type="response"), SE = predict(m1, newdata = nd, type="response", se.fit=T)$se.fit)
p2 <- cbind(nd, Mean = predict(m2, newdata = nd_X, type="response"), SE = predict(m2, newdata = nd, type="response", se.fit=T)$se.fit)
Output:

Resources