Drop each Regressor step by step - r

Is there any chance to specify the full model once and then just to drop regressors one after the other and producing a nice stargazer table with it without having to write every regression line again and again?
data <- datasets::airquality
# Treating Month and Day as crosssectional and time fixed effects
re1 <- plm(data = data, Ozone~Solar.R+Wind+Temp,
index=c("Month", "Day"), model="within", effect="twoways") # full model
# this is the only regression line I actually want to write.
# The other regressors should be automatically dropped one by one in subsequent regressions.
re2 <- plm(data = data, Ozone~Wind+Temp,
index=c("Month", "Day"), model="within", effect="twoways") # first regressor dropped
re3 <- plm(data = data, Ozone~Solar.R+Temp,
index=c("Month", "Day"), model="within", effect="twoways") # second regressor dropped
re4 <- plm(data = data, Ozone~Solar.R+Wind,
index=c("Month", "Day"), model="within", effect="twoways") # third regressor dropped
stargazer(re1, re2, re3, re4,
type = "html",
title = "Dropped after one another",
out="HopeThisWorks.html")
I have looked into the step() function but this isn't quite helping as I am not aiming to drop according to significance or anything.

re1 <- plm(data = data, Ozone~Solar.R+Wind+Temp,
index=c("Month", "Day"), model="within", effect="twoways") # full model
A=lapply(paste0(".~.-",c("Solar.R","Wind","Temp")),function(x)update(re1,as.formula(x)))
[[1]]
Model Formula: Ozone ~ Wind + Temp
Coefficients:
Wind Temp
-2.6933 2.3735
[[2]]
Model Formula: Ozone ~ Solar.R + Temp
Coefficients:
Solar.R Temp
0.040986 2.782978
[[3]]
Model Formula: Ozone ~ Solar.R + Wind
Coefficients:
Solar.R Wind
0.096607 -4.841992
Now to be able to access this in the global environment: use
list2env(setNames(A,paste0("re",seq_along(A)+1)),.GlobalEnv)
stargazer(re1, re2, re3, re4,
type = "html",
title = "Dropped after one another",
out="HopeThisWorks.html")

This is a more flexible alternative:
bene <- function(data = datasets::airquality,
depvar = "Ozone",
covariates = c("Wind","Temp","Solar.R")){
funny <- function(id){
covs <- ifelse(is.na(id),paste0(covariates,collapse = " + "),paste0(covariates[-id],collapse = " + "))
model <- eval(parse(text=paste0("plm(data = data,",depvar," ~ ",covs,",index=c('Month', 'Day'), model='within', effect='twoways')")))
return(model)
}
xx <- capture.output(stargazer::stargazer(purrr::map(c(NA,1:length(covariates)),funny),
type = "html",
out = paste0("results/model.html"),
star.cutoffs = c(0.05,0.01,0.001),
title = paste0(depvar)))
models <- purrr::map(c(NA,1:length(covariates)),funny)
map(models,function(x)print(summary(x)))
}
bene(data = datasets::airquality,
depvar = "Ozone",
covariates = c("Wind","Temp","Solar.R"))

You could use (an adaption of) this (so far only stepwise-working) function, which shows you the stargazer output and also saves the regression table as a html file.
stepwise_model <- function(data = "datasets::airquality",
depvar = "Ozone",
covariates = c("Wind","Temp","Solar.R")){
data_df <- eval(parse(text = data))
models <- c()
for(q in 1:length(covariates)){
label <- paste0("mod_",depvar,"_",q)
models[q] <- label
cov <- paste0(covariates[1:q],collapse = " + ")
eval(parse(text = paste0(label," <<- plm::plm(",depvar," ~ ",cov,",data = data_df,index=c('Month', 'Day'), model='within', effect='twoways')")))
eval(parse(text = paste0("print(summary(",label,"))")))
}
modellist <- eval(parse(text = paste0("list(",paste0(models,collapse = ","),")")))
xx <- capture.output(stargazer::stargazer(modellist ,
type = "html",
out = paste0("results/paper/models/mod_",depvar,".html"),
star.cutoffs = c(0.05,0.01,0.001),
title = paste0(depvar)))
}
stepwise_model()

Related

How to have output from lm() include std. error and others without using summary() for stargazer

I'm fitting several linear models in r in the following way:
set.seed(12345)
n = 100
x1 = rnorm(n)
x2 = rnorm(n)+0.1
y = x + rnorm(n)
df <- data.frame(x1, x2, y)
x_str <- c("x1", "x1+x2")
regf_lm <- function(df,y_var, x_str ) {
frmla <- formula(paste0(y_var," ~ ", x_str ))
fit <- lm(frmla, data = df )
summary(fit) #fit
}
gbind_lm <- function(vv) {
n <- vv %>% length()
fits <- list()
coefs <- list()
ses <- list()
for (i in 1:n ) {
coefs[[i]] <- vv[[i]]$coefficients[,1]
ses[[i]] <- vv[[i]]$coefficients[,2]
fits[[i]] <- vv[[i]]
}
list("fits" = fits, "coefs" = coefs, "ses" = ses)
}
stargazer_lm <- function(mylist, fname, title_str,m_type = "html",...) {
stargazer(mylist$fits, coef = mylist$coefs,
se = mylist$ses,
type = m_type, title = title_str,
out = paste0("~/projects/outputs",fname), single.row = T ,...)
}
p_2 <- map(x_str,
~ regf_lm (df = df ,
y_var = "y", x_str = .))
m_all <- do.call(c, list(p_2)) %>% gbind_lm()
stargazer_lm(m_all,"name.html","My model", m_type = "html")
In regf_lm, if I use summary(fit) on the last line, I'm able to generate reg output with columns for estimated coefficients, std. error, etc. But Stargazer() does not work with summary(lm()) (returns error $ operator is invalid for atomic vectors). However, if I just use "fit" on the last line in regf_lm, the output shows only the estimated coefficients and not std error, R sq...and gbind_lm() won't work because I cannot extract ses or fit.
Any advice is greatly appreciated.
You can directly export model statistics in tidy format with the package broom
library(broom)
set.seed(12345)
n = 100
x1 = rnorm(n)
x2 = rnorm(n)+0.1
y = x1 + rnorm(n)
df <- data.frame(x1, x2, y)
x_str <- c("x1", "x1+x2")
regf_lm <- function(df,y_var, x_str ) {
frmla <- formula(paste0(y_var," ~ ", x_str ))
fit <- lm(frmla, data = df )
return(list(fit,select(broom::tidy(fit),std.error))) #fit
}
exm_model <- regf_lm(iris,'Sepal.Width','Sepal.Length')
stargazer(exm_model[[1]], coef = exm_model[[2]], title = 'x_model',
out ='abc', single.row = T)
This piece of code worked on my local with no problem, I think you can apply this in your workflow.

How can I find regression model analyses from 2 dataset?

setwd("C:/Users/sevvalayse.yurtekin/Desktop/hw3")
data = read.table('DSE501_fall2020_HW3.csv', header= T, sep=',')
attach
data
getOption("max.print")
rs<-rowSums(data[,2:76], na.rm = TRUE)
data<-cbind(data,rs)
data
p1<-ggplot()+
geom_line(aes(y = rs, x=year), data=data)+
scale_x_continuous(breaks = seq(2004,2019,2))
p1
model = lm(rs ~ year )
model
summary(model)
residuals(model)
predict(model)
#model.fit = lm(year~rs)
#summary(model.fit)
new.year<-data.frame(
year = c(2021,2022,2023)
)
predict(model, newdata = new.year, interval = 'confidence')
data2 = read.table('TUIK_nufus_2019.csv', header = T, sep=",")
data2
total = data2$Total
mydata<-data[-c(1,2,3),]
model2 = lm(mydata~total)
model2
Hello, I have an error about the Error in model.frame.default(formula = mydata ~ total, drop.unused.levels = TRUE) : invalid type (list) for variable 'mydata'.
How can I fixed? I want to regression analyses from 2 data.
The line that's causing the issue is model2 = lm(mydata~total). mydata is not a vector, which is what your dependent variable should be in the lm function. When you set mydata you do not provide a column name: mydata<-data[-c(1,2,3), <enter column name of dependent variable>]
Otherwise you can fit your model with the following syntax (provided your dependent and independent variables are in the same dataframe). Here I just used y as a fake variable name: lm(y ~ total, data = mydata)

Predict(), NewData with two column and differing rows

I am trying to make the prediction of three variables (retweets,media,content) in my dataset (df_22) to choose between Poisson, Negative binomial and Zero-inflated Poisson. One of the three variables is the response variable (retweets) and the other two the predictive variables (media,content).
I realize the generalized linear models and without problem.
Zero-inflated Poisson data
library("pscl")
summary( m0 <- zeroinfl(retweets ~ media + content, data=df_22,dist="poisson") )
Poisson
summary( m1 <- glm(formula=retweets ~ media + content, data=df_22, family="poisson"(link=log)))
Negative binomial
library (MASS)
summary( m2 <- glm.nb(retweets ~ media + content, data=df_22) )
However, when I create the new database to make the prediction. I check it levels.
> levels(df_22$media)
[1] "other" "pic" "pw" "text" "web"
> levels(df_22$content)
[1] "cultura" "employ" "environment" "other" "security" "sport" "transport"
I have a problem. And it is that the rows of both columns is different.
newmedia = c("other","pic","pw","text", "web")
newcontent = c("cultura","employ","environment","other","security","sport","transport")
nd = data.frame(media = newmedia, content = newcontent)
Error in data.frame(media = newmedia, content = newcontent) : arguments imply differing number of rows: 5, 7
What should I do to solve these problems?
I want to solve this problem in order to be able to make these predictions so that I can choose which of the three models is better for my data.
p0 <- cbind(nd, Count = predict(m0, newdata = nd, type = "count"), Zero = predict(m0, newdata = nd, type = "zero"))
p1 <- cbind(nd, Mean = predict(m1, newdata = nd, type="response"), SE = predict(m1, newdata = nd, type="response", se.fit=T)$se.fit)
p2 <- cbind(nd, Mean = predict(m2, newdata = nd, type="response"), SE = predict(m2, newdata = nd, type="response", se.fit=T)$se.fit)
In the code below a sample data set is created and it computes the p0, p1, p2. The nb dataframe was created differently as a test dataframe.
Import libraries
library(pscl)
library (MASS)
Create sample data set
media <- c("other", "pic", "pw", "text", "web")
content <- c("cultura", "employ", "environment", "other", "security", "sport", "transport")
set.seed(1)
retweets <- floor(abs(1e4*rnorm(1000)))
temp_index <- which(retweets %in% sample(retweets, 20)) # sample indexes
retweets[temp_index] <- 0 # set some retweets to zero to run zeroinfl()
df <- data.frame(retweets)
df$media <- sample(media, 1000, replace = TRUE)
df$content <- sample(content, 1000, replace = TRUE)
head(df)
unique(df$media)
unique(df$content)
Create a test data set
Note: Here, test data set is drawn from the training data for illustration purpose only. Ideally, it should be a new set of data.
nd = df[sample(nrow(df), 300), ] # ideally this should not be from the train data, this is just for an example code
nd_X <- test[,c('media', 'content')]
nd_Y <- test[,c('retweets')]
Fit models: zeroinf(dist='poisson'), glm(family='poisson'), glm.nb()
# Poisson
summary( m0 <- zeroinfl(retweets ~ media + content, data=df, dist="poisson") )
# Binomial
summary( m1 <- glm(formula=retweets ~ media + content, data=df, family="poisson"(link=log)))
# glm()
#summary( m2 <- glm.nb(retweets ~ media + content, data=df) ) # gives error in summary due to zeros
summary( m2 <- glm.nb(retweets ~ media + content, data=df[df$retweets!=0,]) ) # no error without zeros
Predict using test data set
p0 <- cbind(nd, Count = predict(m0, newdata = nd_X, type = "count"), Zero = predict(m0, newdata = nd, type = "zero"))
p1 <- cbind(nd, Mean = predict(m1, newdata = nd_X, type="response"), SE = predict(m1, newdata = nd, type="response", se.fit=T)$se.fit)
p2 <- cbind(nd, Mean = predict(m2, newdata = nd_X, type="response"), SE = predict(m2, newdata = nd, type="response", se.fit=T)$se.fit)
Output:

Error from JM package in R

JM is a package to fit a model with joint longitudinal and survival data. I can get it to run with their example data, but I get an error with my own data. Any idea what the issue with JMfit1 or JMfit2 is?
My Data:
https://1drv.ms/u/s!AkG9wyz5G1c1gR4Vs_xohO--4Rb5
install.packages('JM')
require(JM)
?jointModel
# Example from vignette
# linear mixed model fit (random intercepts + random slopes)
fitLME <- lme(log(serBilir) ~ drug * year, random = ~ year | id, data = pbc2)
summary(fitLME)
# survival regression fit
fitSURV <- survreg(Surv(years, status2) ~ drug, data = pbc2.id, x = TRUE)
summary(fitSURV)
# joint model fit, under the (default) Weibull model
fitJOINT <- jointModel(fitLME, fitSURV, timeVar = "year")
fitJOINT
summary(fitJOINT)
# we can also include an interaction term of log(serBilir) with drug
fitJOINT <- jointModel(fitLME, fitSURV, timeVar = "year",
# interFact = list(value = ~ drug, data = pbc2.id))
fitJOINT
summary(fitJOINT)
# With my data:
data = readRDS('data.list.d1.dk.RDS')
d1 = data$d1
dk = data$dk
dim(d1); names(d1)
dim(dk); names(dk)
slct.cov = c('ID','Yi','Ai','zi.1','zi.2','zi.3','xi_A','di')
fmla.fix = as.formula('Yi ~ Ai*(zi.1+zi.2+zi.3)')
fmla.rnd = as.formula(' ~ Ai|ID')
fit.Yi = lme(fixed= fmla.fix, random=reStruct(fmla.rnd),
method="ML", data = dk[,slct.cov] )
surv.model = survreg(Surv(xi_A, di) ~ zi.1+zi.2+zi.3, data = d1[,slct.cov], x = TRUE)
JMfit1 = jointModel(lmeObject = fit.Yi, survObject = surv.model, timeVar = 'Ai')
# Error in if (t1 || t2) { : missing value where TRUE/FALSE needed
dForm <- list(fixed = ~ 1 + zi.1 + zi.2 + zi.3, indFixed = c(2,6,7,8), random = ~ 1, indRandom = 2)
JMfit2 = jointModel(lmeObject = fit.Yi, survObject = surv.model, timeVar = 'Ai',
derivForm = dForm, parameterization = c("both")) #"both", "value", "slope"
# method = "weibull-PH-aGH",
# "weibull-PH-aGH", "weibull-PH-GH", "weibull-AFT-aGH","weibull-AFT-GH",
# "piecewise-PH-aGH", "piecewise-PH-GH", "Cox-PH-aGH", "Cox-PH-GH",
# "spline-PH-aGH", "spline-PH-GH", # "ch-Laplace"
# interFact = NULL, lag = 0, scaleWB = NULL,
# CompRisk = FALSE, init = NULL, control = list())
It looks to me that your fmla.fix model needs more investigation in its own right maybe.
Simplifying the interactions to Yi ~ Ai+zi.1+zi.3 + Ai*(zi.2) or even Yi ~ Ai+zi.1+zi.2+zi.3 seem to give a valid JMfit1 output.
I suspect you'll get a different error for JMfit2 (do you??), so that may be a subsequent SO question.

mgcv::gamm() and MuMIn::dredge() errors

I've been trying to fit multiple GAMs using the package mgcv within a function, and crudely select the most appropriate model through model selection procedures. But my function runs the first model then doesn't seem to recognise the input data dat again.
I get the error
Error in is.data.frame(data) : object 'dat' not found.
I think this is a scoping problem and I've looked here, and here for help but cannot figure it out.
Code and data are as follows (hopefully reproducible):
https://github.com/cwaldock1/Help/blob/master/test_gam.csv
library(mgcv)
# Function to fit multiple models
best.mod <- function(dat) {
# Set up control structure
ctrl <- list(niterEM = 0, msVerbose = TRUE, optimMethod="L-BFGS-B")
# AR(1)
m1 <- get.models(dredge(gamm(Temp ~ s(Month, bs = "cc") + s(Date, bs = 'cr') + Year,
data = dat, correlation = corARMA(form = ~ 1|Year, p = 1),
control = ctrl)), subset=1)[[1]]
# AR(2)
m2 <- get.models(dredge(gamm(Temp ~ s(Month, bs = "cc") + s(Date, bs = 'cr') + Year,
data = dat, correlation = corARMA(form = ~ 1|Year, p = 2),
control = ctrl)), subset=1)[[1]]
# AR(3)
m3 <- get.models(dredge(gamm(Temp ~ s(Month, bs = "cc") + s(Date, bs = 'cr') + Year,
data = dat, correlation = corARMA(form = ~ 1|Year, p = 3),
control = ctrl)), subset = 1)[[1]]
### Select best model to work with based on unselective AIC criteria
if(AIC(m2$lme) > AIC(m1$lme)){mod = m1}else{mod = m2}
if(AIC(mod$lme) > AIC(m3$lme)){mod = m3}else{mod = mod}
return(mod$gam)
}
mod2 <- best.mod(dat = test_gam)
Any help would be greatly appreciated.
Thanks,
Conor
get.models evaluates in model's formula environment, which in gamm is
(always?) .GlobalEnv, while it should be function's environment (i.e.
sys.frames(sys.nframe())).
So, instead of
get.models(ms, 1)
use
eval(getCall(ms, 1))

Resources