R: Variable ranking model automation code, rewritten as a function

How can I wrap the list of commands below into just one function?
For example: VariableRanking <- function(formula, variables,.....) {
Insert commands........ }
#Variable Ranking Model automation
#exclusion of the variables that are not model variables
exclude <- c("~,", "+" ) # exclude target which is bound_count for Property
formula <- toString(formula)
formula
#listing the entire model formula out
variables_pre <- unlist(strsplit(formula, split = " "))
variables_pre
#keeping only the model variables
variables <- sort(variables_pre[!variables_pre %in% exclude])
variables
#Exclude "," on the target variable
variables[1] <- substr(variables[1], 1, nchar(variables[1])-1)
variables
#Assigning the variables into a data frame
d <- c(1:length(variables))
d
d <- data.frame(d)
d
d <- t(d)
d
colnames(d) <- variables
d
# exclude target variable on the data frame
allvariables <- colnames(d)[-1]
allvariables
# container for models
listOfModels <- vector("list", length(allvariables))
listOfModels
# loop over variables
for (i in seq_along(allvariables)) {
# exclude variable i
currentvariable <- allvariables[-i]
# programmatically assemble regression formula
regressionFormula <- as.formula(
paste(variables[1],"~", paste(currentvariable, collapse="+")))
# fit model
currentModel <- glm(formula = regressionFormula, family=binomial(link = "logit"), data=dataL_TT)
# store model in container
listOfModels[[i]] <- currentModel
}
listOfModels
#List of AICs for each model
lapply(listOfModels,function(xx) xx$aic)
#Assign X as the AIC of the full model
X <- modelTT$aic
X
# Difference of AICs of each model to the AIC of the full model
AICdifference <- lapply(listOfModels,function(xx) xx$aic - X)
AICdifference
# Naming the AIC Difference
AICdifference2 = data.frame(variables=allvariables, AICdiff=unlist(AICdifference))
AICdifference2
#Graph the Barchart of the AIC decrease of each variables and save it to pdf
pdf("Barchart.pdf",width=12,height=10)
par(mar=c(2,18,2,5))
barplot(sort(AICdifference2$AICdiff, decreasing = F), main="Variable Ranking based on AIC decrease",
horiz=TRUE, xlab="AIC Increase", names.arg= AICdifference2$variables[order(AICdifference2$AICdiff, decreasing = F)],
las=1, col= 'dodgerblue4')
dev.off()
Is this possible, given that it has a lot of parameters?
Basically I just need the output of the AICdifference2 data frame, plus the barplot saved as a PDF and displayed.

Try this:
FOO <- function(myformula, data, fullmodel_AIC, plotname) {
  exclude <- c("~,", "+")  # exclude target which is bound_count for Property
  myformula <- toString(myformula)
  variables_pre <- unlist(strsplit(myformula, split = " "))
  variables <- sort(variables_pre[!variables_pre %in% exclude])
  variables[1] <- substr(variables[1], 1, nchar(variables[1]) - 1)
  d <- t(data.frame(c(1:length(variables))))
  colnames(d) <- variables
  allvariables <- colnames(d)[-1]
  listOfModels <- vector("list", length(allvariables))
  for (i in seq_along(allvariables)) {
    # exclude variable i
    currentvariable <- allvariables[-i]
    # programmatically assemble regression formula
    regressionFormula <- as.formula(
      paste(variables[1], "~", paste(currentvariable, collapse = "+")))
    # fit model
    currentModel <- glm(formula = regressionFormula, family = binomial(link = "logit"), data = data)
    # store model in container
    listOfModels[[i]] <- currentModel
  }
  AICdifference <- lapply(listOfModels, function(xx) xx$aic - fullmodel_AIC)
  AICdifference2 <- data.frame(variables = allvariables, AICdiff = unlist(AICdifference))
  pdf(paste0(plotname, ".pdf"), width = 12, height = 10)
  par(mar = c(2, 18, 2, 5))
  barplot(sort(AICdifference2$AICdiff, decreasing = FALSE),
          main = "Variable Ranking based on AIC decrease",
          horiz = TRUE, xlab = "AIC Increase",
          names.arg = AICdifference2$variables[order(AICdifference2$AICdiff, decreasing = FALSE)],
          las = 1, col = "dodgerblue4")
  dev.off()
  return(AICdifference2)
}
You need four parameters: myformula, the data (dataL_TT in your code), the full model's AIC (modelTT$aic in your code), and a string to name your plot.
Try calling it with FOO(myformula, dataL_TT, modelTT$aic, "test"), inserting your formula object for myformula.
I've changed formula to myformula because formula is a function in the stats package, and it is generally unwise to use object names that clash with existing functions.
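A hedged usage sketch (dataL_TT and modelTT are assumed to exist in your workspace as in the question; the formula shown is illustrative only):
# Illustrative only: bound_count and var1..var3 stand in for your real variables
myformula <- bound_count ~ var1 + var2 + var3
modelTT <- glm(myformula, family = binomial(link = "logit"), data = dataL_TT)
ranking <- FOO(myformula, dataL_TT, modelTT$aic, "test")
ranking  # the AICdifference2 data frame; "test.pdf" is written alongside it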

Related

ANOVA on multiple models stored in a tibble

Is it possible to use anova on multiple models stored in a tibble without listing them manually?
As an example, predicting wage from age in the Wage dataset from the ISLR2 library: I have a tibble with polynomial degrees in one column, GLM models in another, and CV errors in a third.
I can run anova on the models through do.call, but it does not show p-values unless test = 'F' is passed as an argument (one possible workaround is sketched after the code).
library(ISLR2)
library(tidyverse)
library(boot)
GLM <- function(n) {
  result <- glm(wage ~ poly(age, n), data = Wage)
  return(result)
}
CV <- function(n) {
  glm_fit <- glm(wage ~ poly(age, n), data = Wage)
  result <- cv.glm(Wage, glm_fit, K = 10)$delta[1]
  return(result)
}
set.seed(1)
models <- tibble(polynom_degrees = 1:10) %>%
  mutate(linear_model = map(polynom_degrees, GLM)) %>%
  mutate(CV_error = map(polynom_degrees, CV)) %>%
  mutate(CV_error = unlist(CV_error))
do.call(anova, models$linear_model)
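A hedged sketch of one way to forward the extra argument: do.call() matches named list elements to argument names, so appending test = "F" to the list of models should pass it through to anova():
# anova() on several glm fits accepts a test argument; do.call() forwards
# the named element alongside the models
do.call(anova, c(models$linear_model, list(test = "F")))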

Dummies not included in summary

I want to create a function which will perform panel regression with 3-level dummies included.
Let's consider a within model with time effects:
library(plm)
fit_panel_lr <- function(y, x) {
  x[, length(x) + 1] <- y
  # adding dummies
  mtx <- matrix(0, nrow = nrow(x), ncol = 3)
  mtx[cbind(seq_len(nrow(mtx)), 1 + (as.integer(unlist(x[, 2])) - min(as.integer(unlist(x[, 2])))) %% 3)] <- 1
  colnames(mtx) <- paste0("dummy_", 1:3)
  # converting to pdata.frame and adding dummy variables
  x <- pdata.frame(x)
  x <- cbind(x, mtx)
  # performing panel regression
  varnames <- names(x)[3:(length(x))]
  varnames <- varnames[!(varnames == names(y))]
  form <- paste0(varnames, collapse = "+")
  x_copy <- data.frame(x)
  form <- as.formula(paste0(names(y), "~", form, '-1'))
  params <- list(
    formula = form, data = x_copy, model = "within",
    effect = "time"
  )
  pglm_env <- list2env(params, envir = new.env())
  model_plm <- do.call("plm", params, envir = pglm_env)
  model_plm
}
However, if I use this data:
data("EmplUK", package = "plm")
dep_var <- EmplUK['capital']
df1 <- EmplUK[-6]
the output is:
> fit_panel_lr(dep_var, df1)
Model Formula: capital ~ sector + emp + wage + output + dummy_1 + dummy_2 +
dummy_3 - 1
<environment: 0x000001ff7d92a3c8>
Coefficients:
sector emp wage output
-0.055179 0.328922 0.102250 -0.002912
How come the dummies appear in the formula but not among the coefficients? Is there a rational explanation, or did I do something wrong?
The reason you do not see the dummies in the output is that they are linearly dependent on the other data after the fixed-effects time transformation. They are dropped, so only what is estimable is estimated and printed.
Find below some (not readily executable) code picking up your example from above:
dat <- cbind(EmplUK, mtx) # mtx being the dummy matrix constructed in your question's code for this data set
pdat <- pdata.frame(dat)
rhs <- paste(c("emp", "wage", "output", "dummy_1", "dummy_2", "dummy_3"), collapse = "+")
form <- paste("capital ~" , rhs)
form <- formula(form)
mod <- plm(form, data = pdat, model = "within", effect = "time")
detect.lindep(mod$model) # before FE time transformation (original data) -> nothing offending
detect.lindep(model.matrix(mod)) # after FE time transformation -> dummies are offending
The help page ?detect.lindep (included in package plm) has some more nice examples of linear dependence before and after the FE transformation.
A suggestion:
For constructing the dummy variables, I suggest using R's factor with three levels rather than building the dummy matrix yourself. A factor is typically more convenient and less error prone. It is converted to binary dummies (treatment style) by the estimation function via the model.frame/model.matrix framework, as sketched below.
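A hedged sketch of the factor approach (the grouping rule and the name grp are illustrative, not from the question; note that, as explained above, dummies built from the time index will still be dropped after the time transformation):
library(plm)
data("EmplUK", package = "plm")
# grp is a 3-level factor; model.matrix() expands it to treatment-coded dummies
EmplUK$grp <- factor(1 + (as.integer(EmplUK$year) - min(as.integer(EmplUK$year))) %% 3)
pdat <- pdata.frame(EmplUK)
mod <- plm(capital ~ emp + wage + output + grp,
           data = pdat, model = "within", effect = "time")
summary(mod)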

How to use the replicate function in R to repeat a function

I have a problem when using replicate to repeat a function.
I am trying to use the bootstrap to fit a quadratic model, with concentration as the predictor and Total_lignin as the response, and to report an estimate of the maximum with a corresponding standard error.
My idea was to create a function called bootFun that does everything one iteration of a for loop would do. bootFun takes in only the data set, the predictor, and the response to use (both variable names in quotes).
However, the SD comes out as 0, which is not correct. I do not know where the mistake is. Could you please help me with it?
# Load the libraries
library(dplyr)
library(tidyverse)
# Read the .csv and only use M.giganteus and S.ravennae.
dat <- read_csv('concentration.csv') %>%
filter(variety == 'M.giganteus' | variety == 'S.ravennae') %>%
arrange(variety)
# Check the data
head(dat)
# sample size
n <- nrow(dat)
# A function to do one iteration
bootFun <- function(dat, pred, resp){
  # Draw the sample size from the dataset
  sample <- sample_n(dat, n, replace = TRUE)
  # A quadratic model fit
  formula <- paste0('resp', '~', 'pred', '+', 'I(pred^2)')
  fit <- lm(formula, data = sample)
  # Derive the max of the value of concentration
  max <- -fit$coefficients[2]/(2*fit$coefficients[3])
  return(max)
}
max <- bootFun(dat = dat, pred = 'concentration', resp = 'Total_lignin' )
# Iterated times
N <- 5000
# Use 'replicate' function to do a loop
maxs <- replicate(N, max)
# An estimate of the max of predictor and corresponding SE
mean(maxs)
sd(maxs)
The SD is 0 because replicate(N, max) just re-evaluates the stored value max N times; bootFun is only ever called once (and its formula is built from the literal strings 'resp' and 'pred' rather than from the variable names passed in). Package boot, shipped with R, and its function boot can ease the job of calling the bootstrap function repeatedly. The statistic function's first argument must be the data set; the second is an indices argument that the user does not set; other arguments can also be passed to it. In this case those other arguments are the predictor and the response names.
library(boot)
bootFun <- function(dat, indices, pred, resp){
  # Resample rows of the dataset using the indices supplied by boot()
  dat.sample <- dat[indices, ]
  # A quadratic model fit
  formula <- paste0(resp, '~', pred, '+', 'I(', pred, '^2)')
  formula <- as.formula(formula)
  fit <- lm(formula, data = dat.sample)
  # Location of the extremum of the fitted quadratic
  max <- -fit$coefficients[2]/(2*fit$coefficients[3])
  return(max)
}
N <- 5000
set.seed(1234) # Make the bootstrap results reproducible
results <- boot(dat, bootFun, R = N, pred = 'concentration', resp = 'Total_lignin')
results
#
#ORDINARY NONPARAMETRIC BOOTSTRAP
#
#
#Call:
#boot(data = dat, statistic = bootFun, R = N, pred = "concentration",
# resp = "Total_lignin")
#
#
#Bootstrap Statistics :
# original bias std. error
#t1* -0.4629808 -0.0004433889 0.03014259
#
results$t0 # this is the statistic, not bootstrapped
#concentration
# -0.4629808
mean(results$t) # bootstrap value
#[1] -0.4633233
Note that to fit a polynomial, the function poly is much simpler than writing the polynomial terms out one by one:
formula <- paste0(resp, '~ poly(', pred, ',2, raw = TRUE)')
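With raw = TRUE the fitted coefficients are the ordinary polynomial coefficients, so the vertex formula used in bootFun carries over unchanged; a quick hedged check, using the same dat:
fit <- lm(Total_lignin ~ poly(concentration, 2, raw = TRUE), data = dat)
-fit$coefficients[2]/(2*fit$coefficients[3])  # same extremum as with pred + I(pred^2)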
Check the distribution of the bootstrapped statistic.
op <- par(mfrow = c(1, 2))
hist(results$t)
qqnorm(results$t)
qqline(results$t)
par(op)
Test data
set.seed(2020) # Make the results reproducible
x <- cumsum(rnorm(100))
y <- x + x^2 + rnorm(100)
dat <- data.frame(concentration = x, Total_lignin = y)

Predict segmented lm outside of package

I have an array of outputs from hundreds of segmented linear models (made using the segmented package in R). I want to be able to use these outputs on new data, using the predict function. To be clear, I do not have the segmented linear model objects in my workspace; I just saved and reimported the relevant outputs (e.g. the coefficients and breakpoints). For this reason I can't simply use the predict.segmented function from the segmented package.
Below is a toy example based on this link that seems promising, but does not match the output of the predict.segmented function.
library(segmented)
set.seed(12)
xx <- 1:100
zz <- runif(100)
yy <- 2 + 1.5*pmax(xx-35,0) - 1.5*pmax(xx-70,0) +
15*pmax(zz-0.5,0) + rnorm(100,0,2)
dati <- data.frame(x=xx,y=yy,z=zz)
out.lm <- lm(y ~ x, data = dati)
o <- segmented(out.lm, seg.Z = ~x, psi = list(x = c(30, 60)),
               control = seg.control(display = FALSE))
# Note that coefficients with U in the name are differences in slopes, not slopes.
# Compare:
slope(o)
coef(o)[2] + coef(o)[3]
coef(o)[2] + coef(o)[3] + coef(o)[4]
# prediction
pred <- data.frame(x = 1:100)
pred$dummy1 <- pmax(pred$x - o$psi[1,2], 0)
pred$dummy2 <- pmax(pred$x - o$psi[2,2], 0)
pred$dummy3 <- I(pred$x > o$psi[1,2]) * (coef(o)[2] + coef(o)[3])
pred$dummy4 <- I(pred$x > o$psi[2,2]) * (coef(o)[2] + coef(o)[3] + coef(o)[4])
names(pred)[-1]<- names(model.frame(o))[-c(1,2)]
# compute the prediction, using standard predict function
# computing confidence intervals further
# suppose that the breakpoints are fixed
pred <- data.frame(pred, predict(o, newdata= pred,
interval="confidence"))
# Try prediction using the predict.segment version to compare
test <- predict.segmented(o)
plot(pred$fit, test, ylim = c(0, 100))
abline(0,1, col = "red")
# At least one segment not being predicted correctly?
Can I use the base R predict() function (not the predict.segmented() function) with the coefficients and breakpoints saved from segmented linear models?
UPDATE
I figured out that the code above has issues (don't use it). Through some reverse engineering of the predict.segmented() function, I produced the design matrix and used that to predict values instead of calling the predict() function directly. I do not consider this a full answer to the original question yet, because predict() can also produce confidence intervals for the prediction and I have not implemented that; the question remains open for someone to add confidence intervals (a hedged sketch for fixed breakpoints appears after the code below).
library(segmented)
## Define function for making matrix of dummy variables (this is based on code from predict.segmented())
dummy.matrix <- function(x.values, x_names, psi.est = TRUE, nameU, nameV, diffSlope, est.psi) {
  # This function creates a model matrix with dummy variables for a segmented lm with two breakpoints.
  # Inputs (obj below refers to the segmented lm object; it is not itself an argument):
  #   x.values: the x values of the segmented lm
  #   x_names: the name of the column of x values
  #   psi.est: legacy from the predict.segmented function; leave it set to TRUE
  #   nameU: names (class character) of the 3rd and 4th coefs, "U1.x" and "U2.x" for an lm with two breaks. Example: names(c(obj$coef[3], obj$coef[4]))
  #   nameV: names (class character) of the 5th and 6th coefs, "psi1.x" and "psi2.x" for an lm with two breaks. Example: names(c(obj$coef[5], obj$coef[6]))
  #   diffSlope: the coefficients (class numeric) with the slope differences, U1.x and U2.x for an lm with two breaks. Example: c(obj$coef[3], obj$coef[4])
  #   est.psi: the estimated breakpoints (class numeric) from segmented.lm. Example: c(obj$psi[1,2], obj$psi[2,2])
  n <- length(x.values)
  k <- length(est.psi)
  PSI <- matrix(rep(est.psi, rep(n, k)), ncol = k)
  newZ <- matrix(x.values, nrow = n, ncol = k, byrow = FALSE)
  dummy1 <- pmax(newZ - PSI, 0)
  if (psi.est) {
    V <- ifelse(newZ > PSI, -1, 0)
    dummy2 <- if (k == 1) V * diffSlope else V %*% diag(diffSlope)
    newd <- cbind(x.values, dummy1, dummy2)
    colnames(newd) <- c(x_names, nameU, nameV)
  } else {
    newd <- cbind(x.values, dummy1)
    colnames(newd) <- c(x_names, nameU)
  }
  # if (!x_names %in% names(coef(obj.seg)))
  #   newd <- newd[, -1, drop = FALSE]
  return(newd)
}
## Test dummy matrix function----------------------------------------------
set.seed(12)
xx <- 1:100
zz <- runif(100)
yy <- 2 + 1.5*pmax(xx-35,0) - 1.5*pmax(xx-70,0) + 15*pmax(zz-.5,0) + rnorm(100,0,2)
dati <- data.frame(x = xx, y = yy, z = zz)
out.lm <- lm(y ~ x, data = dati)
# 1 segmented variable, 2 breakpoints: you have to specify starting values (vector) for psi:
o <- segmented(out.lm, seg.Z = ~x, psi = c(30, 60),
               control = seg.control(display = FALSE))
slope(o)
plot.segmented(o)
summary(o)
# Test dummy matrix fn with the same dataset
newdata <- dati
nameU1 <- c("U1.x", "U2.x")
nameV1 <- c("psi1.x", "psi2.x")
diffSlope1 <- c(o$coef[3], o$coef[4])
est.psi1 <- c(o$psi[1,2], o$psi[2,2])
test <- dummy.matrix(x.values = newdata$x, x_names = "x", psi.est = TRUE,
nameU = nameU1, nameV = nameV1, diffSlope = diffSlope1, est.psi = est.psi1)
# Predict response variable using matrix multiplication
col1 <- matrix(1, nrow = dim(test)[1])
test <- cbind(col1, test) # Now test is the same as model.matrix(o)
predY <- coef(o) %*% t(test)
plot(predY[1,])
lines(predict.segmented(o), col = "blue") # good, predict.segmented gives same answer
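A hedged sketch for the still-open confidence-interval part, treating the breakpoints as fixed (their estimation uncertainty is ignored) and assuming the coefficient covariance matrix vcov(o) was saved along with the coefficients and breakpoints:
# standard linear-model intervals for the mean response, given fixed breakpoints
V <- vcov(o)
se <- sqrt(diag(test %*% V %*% t(test)))
tcrit <- qt(0.975, df = o$df.residual)
ci <- cbind(fit = predY[1, ],
            lwr = predY[1, ] - tcrit * se,
            upr = predY[1, ] + tcrit * se)
head(ci)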

Multiply coefficients by the standard deviation

In R, the stargazer package offers the possibility of applying functions to the coefficients, standard errors, etc.:
library(foreign)    # read.dta
library(MASS)       # glm.nb
library(stargazer)
dat <- read.dta("http://www.ats.ucla.edu/stat/stata/dae/nb_data.dta")
dat <- within(dat, {
prog <- factor(prog, levels = 1:3, labels = c("General", "Academic", "Vocational"))
id <- factor(id)
})
m1 <- glm.nb(daysabs ~ math + prog, data = dat)
transform_coef <- function(x) (exp(x) - 1)
stargazer(m1, apply.coef=transform_coef)
How can I apply a function where the factor I multiply by depends on the variable, such as the standard deviation of that variable?
This may not be exactly what you hoped for, but you can transform the coefficients yourself and give stargazer a custom list of coefficients. For example, if you would like to report each coefficient times the standard deviation of its variable, the following extension of your example could work:
library(foreign)
library(stargazer)
library(MASS)
dat <- read.dta("http://www.ats.ucla.edu/stat/stata/dae/nb_data.dta")
dat <- within(dat, {
prog <- factor(prog, levels = 1:3, labels = c("General", "Academic", "Vocational"))
id <- factor(id)
})
m1 <- glm.nb(daysabs ~ math + prog, data = dat)
# Store coefficients (and other coefficient stats)
s1 <- summary(m1)$coefficients
# Calculate standard deviations (using zero for the constant)
math.sd <- sd(dat$math)
acad.sd <- sd(as.numeric(dat$prog == "Academic"))
voc.sd <- sd(as.numeric(dat$prog == "Vocational"))
int.sd <- 0
# Append standard deviations to stored coefficients
StdDev <- c(int.sd, math.sd, acad.sd, voc.sd)
s1 <- cbind(s1, StdDev)
# Store custom list
new.coef <- s1[ , "Estimate"] * s1[ , "StdDev"]
# Output
stargazer(m1, coef = list(new.coef))
You may want to consider a couple of issues beyond your original question about outputting coefficients in stargazer. Should you report the intercept when multiplying by the standard deviation? And will your standard errors and inference be the same under this transformation? (See the sketch below for one way to keep them consistent.)
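One hedged way to keep inference consistent (not from the original answer): scale the standard errors by the same per-variable factors and pass them via stargazer's se argument, so each t statistic is unchanged (the intercept, scaled by zero, is degenerate and would need special handling):
# scale the SEs by the same factors as the coefficients
new.se <- s1[ , "Std. Error"] * s1[ , "StdDev"]
stargazer(m1, coef = list(new.coef), se = list(new.se))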
