I use the mtcars data to explain my problem. For example, I am trying to estimate the regression coefficient for the variable cyl with mpg as the dependent variable, and to assess how that coefficient changes as other variables are included.
Step 1: lm(mpg ~ cyl, data = df) to get the crude coefficient for cyl.
Step 2: add each of the other variables one at a time to the model from step 1, choose the one producing the largest change (%) in the coefficient of cyl, and add that variable to the model.
Step 3: repeat step 2, adding each of the remaining variables to the model above, and again pick the one with the largest change in the coefficient of cyl.
Further steps: repeat until all variables are included in the model.
library(dplyr)
df <- mtcars %>% select(mpg, cyl, disp, hp, wt)
my_fun1 <- function(df = data) {
  out_df <- data.frame(matrix(ncol = 0, nrow = (length(df) - 1)))
  md_1 <- lm(mpg ~ cyl, data = df)
  out_df$Models[1] <- "Crude"
  out_df$Estimate[1] <- md_1$coefficients[2]
  pre_change <- 0
  to_rm <- 0
  for (k in 2:(length(df) - 1)) {
    for (i in 3:length(df)) {
      if (!i %in% to_rm) {
        md_tmp <- update(md_1, . ~ . + df[[i]])
        change <- abs(100 * (md_tmp$coefficients[2] - md_1$coefficients[2]) / md_1$coefficients[2])
        dif <- md_tmp$coefficients[2] - md_1$coefficients[2]
        if (change >= pre_change) {
          out_df$Estimate[k] <- md_tmp$coefficients[2]
          out_df$Models[k] <- paste("+", names(df)[[i]])
          out_df$Diff[k] <- md_tmp$coefficients[2] - md_1$coefficients[2]
          picked <- names(df)[[i]]
          picked_i <- i
          pre_change <- out_df$`Change (%)`[k] <- change
        }
      }
    }
    to_rm <- c(to_rm, picked_i)
    md_1 <- update(md_1, . ~ . + eval(as.name(paste(picked))))
    pre_change <- 0
  }
  out_df
}
my_fun1(df = df)
After running the above, I expected to get the regression coefficients of cyl at each step in the following format:
Models Estimate Diff Change (%)
1 Crude -2.875790 NA NA
2 + wt -1.507795 1.367995 47.56937
3 + hp -1.227420 0.280375 18.59504
4 + disp -1.227420 1.037274 45.80194
However, while steps 1 and 2 give correct results, steps 3 and 4 are incorrect. Any suggestions would be appreciated.
You can probably make this a little easier by exploiting R's vectorized nature and avoiding painful for loops.
my_fun2 <- function(dat, i) {
  fit <- lm(mpg ~ cyl, data=dat)
  crude <- fit$coef[2]
  # vectorized evaluation function:
  # fits the model and calculates the coefficient and the change
  evav <- Vectorize(function(i) {
    # create extension string from the "i"s
    cf.ext <- paste(names(dat)[i], collapse="+")
    # update formula with extensions
    beta <- update(fit, as.formula(
      paste0("mpg~cyl",
             # paste already accepted coefficients, in case they exist
             if (length(bests) != 0) {
               paste("", names(dat)[bests], sep="+", collapse="")
             },
             "+", cf.ext)
    ))$coe[2]
    # calculate Diff
    beta.d <- abs(crude - beta)
    # calculate Change %
    beta.d.perc <- 100 / crude * beta.d
    # set an attribute "cf.name" to be able to identify the coefficient later
    return(`attr<-`(c(beta=beta, beta.d=beta.d,
                      beta.d.perc=beta.d.perc),
                    "cf.name", cf.ext))
  }, SIMPLIFY=FALSE)  # simplifying would strip off the attributes
  # create empty vector bests
  bests <- c()
  # lapply evav() over the "i"s
  res <- lapply(i, function(...) {
    # run evav()
    i.res <- evav(i)
    # find the largest change
    largest <- which.max(mapply(`[`, i.res, 2))
    # update the "bests" vector within the function environment with `<<-`
    bests <<- c(bests, i[largest])
    # same with the "i"s
    i <<- i[-largest]
    return(i.res[[largest]])
  })
  # summarize everything into a matrix and give dimnames
  res <- `dimnames<-`(rbind(c(crude, NA, NA),
                            do.call(rbind, res)),
                      list(
                        c("crude",
                          paste0("+ ", mapply(attr, res, "cf.name"))),
                        c("Estimate", "Diff", "Change (%)")))
  return(res)
}
Usage
my_fun2(mtcars[c("mpg", "cyl", "disp", "hp", "wt")], i=3:5)
# Estimate Diff Change (%)
# crude -2.8757901 NA NA
# + wt -1.5077950 1.367995 -47.56937
# + hp -0.9416168 1.934173 -67.25711
# + disp -1.2933197 1.582470 -55.02733
Check
Checking the Diffs:
fit <- lm(mpg ~ cyl, data=mtcars[c("mpg", "cyl", "disp", "hp", "wt")])
sapply(c("disp", "hp", "wt"), function(x)
unname(abs(fit$coe[2] - update(fit, as.formula(paste("mpg~cyl+", x)))$coe[2])))
# disp hp wt
# 1.2885133 0.6110965 1.3679952
sapply(c("disp", "hp"), function(x)
unname(abs(fit$coe[2] - update(fit, as.formula(paste("mpg~cyl+wt+", x)))$coe[2])))
# disp hp
# 1.090847 1.934173
sapply(c("disp"), function(x)
unname(abs(fit$coe[2] - update(fit, as.formula(paste("mpg~cyl+wt+hp+", x)))$coe[2])))
# disp
# 1.58247
Everything checks out.
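If you prefer to stay closer to base R, here is a minimal alternative sketch of the same forward procedure. It builds plain formula strings with reformulate() instead of juggling update() and Vectorize(), and it assumes, as above, that the first column of df is the response (mpg) and the second is the exposure of interest (cyl); track_coef() is a made-up name, not from the question.
track_coef <- function(df) {
  remaining <- names(df)[-(1:2)]
  included <- character(0)
  crude <- coef(lm(mpg ~ cyl, data = df))[2]
  out <- data.frame(Models = "Crude", Estimate = unname(crude),
                    Diff = NA, `Change (%)` = NA, check.names = FALSE)
  while (length(remaining) > 0) {
    # cyl's coefficient after adding each remaining candidate to the current model
    est <- sapply(remaining, function(v) {
      coef(lm(reformulate(c("cyl", included, v), response = "mpg"), data = df))[2]
    })
    prev <- tail(out$Estimate, 1)
    change <- abs(100 * (est - prev) / prev)
    best <- which.max(change)
    out <- rbind(out, data.frame(Models = paste("+", remaining[best]),
                                 Estimate = unname(est[best]),
                                 Diff = unname(est[best] - prev),
                                 `Change (%)` = unname(change[best]),
                                 check.names = FALSE))
    included <- c(included, remaining[best])
    remaining <- remaining[-best]
  }
  rownames(out) <- NULL
  out
}
track_coef(df)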
Related
I want to write a function combinations_features(y, x) which goes through all combinations containing three variables and outputs the R-squared, adjusted R-squared, AIC and BIC for each combination.
My solution
combinations_features <- function(y, x) {
  # Define empty vectors to store statistics
  feature_vec_1 <- feature_vec_2 <-
    feature_vec_3 <- feature_vec_4 <- c()
  # Obtain all combinations containing three variables
  comb_names <- utils::combn(colnames(x), 3)
  # For each combination obtain the wanted statistics
  for (i in 1:ncol(comb_names)) {
    feature_vec_1 <- append(
      feature_vec_1, summary(lm(y ~ ., data = x[, comb_names[, i]]))$adj.r.squared
    )
    feature_vec_2 <- append(
      feature_vec_2, summary(lm(y ~ ., data = x[, comb_names[, i]]))$r.squared
    )
    feature_vec_3 <- append(
      feature_vec_3, AIC(lm(y ~ ., data = x[, comb_names[, i]]))
    )
    feature_vec_4 <- append(
      feature_vec_4, BIC(lm(y ~ ., data = x[, comb_names[, i]]))
    )
  }
  # Assign everything into a data frame
  data.frame(
    "Adj R2" = feature_vec_1, "R2" = feature_vec_2,
    "AIC" = feature_vec_3, "BIC" = feature_vec_4
  )
}
Let's see how it works - define some artificial data and give it to the function.
set.seed(42)
predictors <- data.frame(rnorm(100), runif(100), rexp(100), rpois(100, 1))
dependent <- rnorm(100)
> combinations_features(dependent, predictors)
Adj.R2 R2 AIC BIC
1 -0.0283756015 0.002787295 276.2726 289.2985
2 0.0000677269 0.030368705 273.4678 286.4937
3 -0.0011990695 0.029140296 273.5944 286.6203
4 0.0015404392 0.031796789 273.3204 286.3463
However, I find this code very inefficient for two reasons:
(1) Loop - I loop over the columns of the matrix comb_names; I wonder if the loop can be omitted somehow.
(2) Length of the code - the code is huge, because I define a feature_vec for each statistic and append to each of them separately. I wonder if the assignments can be done in one command.
Could you please give me a hand with improving my code by telling me whether (1) or (2) is possible?
How about this, which relies on bind_rows() from the tidyverse? I don't think there's a way to avoid looping over the combinations, but lapply makes everything a little neater, IMHO.
combinations_features1 <- function(y, x) {
  comb_names <- utils::combn(colnames(x), 3)
  bind_rows(
    lapply(
      1:ncol(comb_names),
      function(z) {
        m <- lm(y ~ ., data = x[, comb_names[, z]])
        s <- summary(m)
        tibble(Adj.R2 = s$adj.r.squared, R2 = s$r.squared, AIC = AIC(m), BIC = BIC(m))
      }
    )
  )
}
combinations_features1(dependent, predictors)
# A tibble: 4 x 4
Adj.R2 R2 AIC BIC
<dbl> <dbl> <dbl> <dbl>
1 -0.0284 0.00279 276. 289.
2 0.0000677 0.0304 273. 286.
3 -0.00120 0.0291 274. 287.
4 0.00154 0.0318 273. 286.
bind_rows(), if given a list, binds the elements of the list into a single data.frame.
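A toy illustration of that behaviour (assuming dplyr is loaded):
bind_rows(list(tibble(a = 1, b = 2), tibble(a = 3, b = 4)))
# a single 2 x 2 tibble with rows (1, 2) and (3, 4)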
Same idea as above; directly applying lapply to the list of combinations also works:
combinations_features <- function(y, x) {
  do.call(rbind, lapply(utils::combn(colnames(x), 3, simplify = FALSE),
                        function(i) {
                          f1 <- lm(y ~ ., data = x[, i])
                          data.frame(Adj.R2 = summary(f1)$adj.r.squared,
                                     R2 = summary(f1)$r.squared,
                                     AIC = AIC(f1), BIC = BIC(f1))
                        }))
}
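Calling it on the same artificial data as above should give the same four rows, just as a plain data.frame rather than a tibble:
combinations_features(dependent, predictors)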
I have a linear model with many explanatory (independent) variables:
model <- lm(y ~ x1 + x2 + x3 + ... + x100)
some of which are linearly dependent on each other (multicollinearity).
I want the machine to search for the name of the explanatory variable with the highest VIF coefficient (x2 for example), delete it from the formula, and then rerun lm with the new formula:
model <- lm(y ~ x1 + x3 + ... + x100)
I have already learned how to retrieve the name of the explanatory variable with the highest VIF coefficient (vif() here is presumably from the car package):
max_vif <- function(x) {
  vifac <- data.frame(vif(x))
  nameofmax <- rownames(which(vifac == max(vifac), arr.ind = TRUE))
  return(nameofmax)
}
But I still don't understand how to take that explanatory variable, delete it from the formula, and run the function again.
We can use the update function and paste in the column that needs to be removed. We first fit a model, and then use update to change that model's formula. The model formula can be expressed as a character string, which lets you concatenate the general formula .~. with whatever variable(s) you'd like removed (using the minus sign -).
Here is an example:
fit1 <- lm(wt ~ mpg + cyl + am, data = mtcars)
coef(fit1)
# (Intercept) mpg cyl am
# 4.83597190 -0.09470611 0.08015745 -0.52182463
rm_var <- "am"
fit2 <- update(fit1, paste0(".~. - ", rm_var))
coef(fit2)
# (Intercept) mpg cyl
# 5.07595833 -0.11908115 0.08625557
Using max_vif we can wrap this into a function:
rm_max_vif <- function(x) {
  # find variable(s) to be removed
  rm_var <- max_vif(x)
  # concatenate with "-" to remove the variable(s) from the formula
  rm_var <- paste(paste0("-", rm_var), collapse = " ")
  # update the model
  update(x, paste0(".~.", rm_var))
}
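A quick usage sketch, assuming max_vif() from the question (with vif() loaded from the car package) and the fit1 from above:
library(car)  # provides vif()
fit_reduced <- rm_max_vif(fit1)
formula(fit_reduced)  # one term fewer than fit1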
Problem solved!
I created a list containing all the variables for the lm model:
Price <- list(y,x1,...,x100)
Then I used a different way of setting up the lm model:
model <- lm(y ~ ., data = Price)
So we can just delete the variable with the highest VIF from the Price list.
With the max_vif function I already came up with, the code will be:
Price <- list(y,x1,x2,...,x100)
model <- lm(y ~ ., data = Price)
max_vif <- function(x) { # function for finding the name of the variable with the highest VIF
  vifac <- data.frame(vif(x))
  nameofmax <- rownames(which(vifac == max(vifac), arr.ind = TRUE))
  return(nameofmax)
}
n <- max(data.frame(vif(model)))
# Loop for deleting the variable with the highest VIF from the `Price` list,
# one after another, until no VIF is equal to or higher than 5
while (n >= 5) {
  m <- max_vif(model)                # name of the worst offender in the current model
  Price[[m]] <- NULL                 # drop it from the list
  model <- lm(y ~ ., data = Price)   # refit without it
  n <- max(data.frame(vif(model)))   # recompute the largest VIF
}
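For a concrete, runnable variant of the same idea, here is a sketch that operates on a data frame with car::vif(); note that prune_by_vif() is a made-up helper name, not from the question.
library(car)  # for vif()
prune_by_vif <- function(df, response, threshold = 5) {
  model <- lm(reformulate(".", response = response), data = df)
  # keep dropping the worst predictor while any VIF is at or above the threshold;
  # vif() needs at least two predictors, hence the length check first
  while (length(attr(terms(model), "term.labels")) > 1 &&
         max(vif(model)) >= threshold) {
    worst <- names(which.max(vif(model)))  # predictor with the highest VIF
    df[[worst]] <- NULL                    # drop it and refit
    model <- lm(reformulate(".", response = response), data = df)
  }
  model
}
prune_by_vif(mtcars, "mpg")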
I was wondering why lm() says 5 coefficients are not defined because of singularities and then gives all NAs in the summary output for those 5 coefficients.
Note that all my predictors are categorical.
Is there anything wrong with my data or code regarding these 5 coefficients? How can I fix this?
d <- read.csv("https://raw.githubusercontent.com/rnorouzian/m/master/v.csv", h = T) # Data
nms <- c("Age","genre","Length","cf.training","error.type","cf.scope","cf.type","cf.revision")
d[nms] <- lapply(d[nms], as.factor) # make factor
vv <- lm(dint~Age+genre+Length+cf.training+error.type+cf.scope+cf.type+cf.revision, data = d)
summary(vv)
First 6 lines of output:
Coefficients: (5 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.17835 0.63573 0.281 0.779330
Age1 -0.04576 0.86803 -0.053 0.958010
Age2 0.46431 0.87686 0.530 0.596990
Age99 -1.64099 1.04830 -1.565 0.118949
genre2 1.57015 0.55699 2.819 0.005263 **
genre4 NA NA NA NA ## For example, this row is all `NA`s; there are 4 more like it!
As others noted, one problem is that you seem to have multicollinearity. Another is that there are missing values in your dataset. The missing values should probably just be removed. As for the correlated variables, you should inspect your data to identify this collinearity and remove it. Deciding which variables to remove and which to retain is a very domain-specific topic. However, you could, if you wish, use regularisation and fit a model while retaining all variables. This also allows you to fit a model when n (the number of samples) is less than p (the number of predictors).
The code below demonstrates how to examine the correlation structure within your data and identify which variables are most correlated (thanks to this answer). It also includes an example of fitting such a model using L2 regularisation (commonly known as ridge regression).
d <- read.csv("https://raw.githubusercontent.com/rnorouzian/m/master/v.csv", h = T) # Data
nms <- c("Age","genre","Length","cf.training","error.type","cf.scope","cf.type","cf.revision")
d[nms] <- lapply(d[nms], as.factor) # make factor
vv <- lm(dint~Age+genre+Length+cf.training+error.type+cf.scope+cf.type+cf.revision, data = d)
df <- d
df[] <- lapply(df, as.numeric)
cor_mat <- cor(as.matrix(df), use = "complete.obs")
library("gplots")
heatmap.2(cor_mat, trace = "none")
## https://stackoverflow.com/questions/22282531/how-to-compute-correlations-between-all-columns-in-r-and-detect-highly-correlate
library("tibble")
library("dplyr")
library("tidyr")
d2 <- df %>%
  as.matrix() %>%
  cor(use = "complete.obs") %>%
  ## Set diag (a vs a) to NA, then remove
  (function(x) {
    diag(x) <- NA
    x
  }) %>%
  as.data.frame %>%
  rownames_to_column(var = 'var1') %>%
  gather(var2, value, -var1) %>%
  filter(!is.na(value)) %>%
  ## Sort by decreasing absolute correlation
  arrange(-abs(value))
## 2 pairs of variables are almost exactly correlated!
head(d2)
#> var1 var2 value
#> 1 id study.name 0.9999430
#> 2 study.name id 0.9999430
#> 3 Location timed 0.9994082
#> 4 timed Location 0.9994082
#> 5 Age ed.level 0.7425026
#> 6 ed.level Age 0.7425026
## Remove some variables here, or maybe try regularized regression (see below)
library("glmnet")
## glmnet requires matrix input
X <- d[, c("Age", "genre", "Length", "cf.training", "error.type", "cf.scope", "cf.type", "cf.revision")]
X[] <- lapply(X, as.numeric)
X <- as.matrix(X)
ind_na <- apply(X, 1, function(row) any(is.na(row)))
X <- X[!ind_na, ]
y <- d[!ind_na, "dint"]
glmnet <- glmnet(
x = X,
y = y,
## alpha = 0 is ridge regression
alpha = 0)
plot(glmnet)
Created on 2019-11-08 by the reprex package (v0.3.0)
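If you go the ridge route, you would typically pick the penalty by cross-validation rather than by eye from the coefficient path; here is a minimal sketch using glmnet's cv.glmnet(), reusing the X and y built above:
cv_fit <- cv.glmnet(x = X, y = y, alpha = 0)  # 10-fold cross-validation by default
plot(cv_fit)                                  # CV error across the lambda path
coef(cv_fit, s = "lambda.min")                # coefficients at the CV-optimal lambda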
In such a situation you can use the "olsrr" package in R for stepwise regression analysis. Here is sample code for a stepwise regression analysis in R:
library("olsrr")
#Load the data
d <- read.csv("https://raw.githubusercontent.com/rnorouzian/m/master/v.csv", h = T)
nms <- c("Age","genre","Length","cf.training","error.type","cf.scope","cf.type","cf.revision")
d[nms] <- lapply(d[nms], as.factor) # make factors, as in the question
# stepwise regression
vv <- lm(dint ~ Age + genre + Length + cf.training + error.type + cf.scope + cf.type + cf.revision, data = d)
summary(vv)
k <- ols_step_both_p(vv, pent = 0.05, prem = 0.1)
# stepwise regression plot
plot(k)
# final model
k$model
It should provide output much like that of SPSS's stepwise procedure.
I ran several time-series regressions (one for every year) and now I'd like to generate a table similar to what coef() returns, but also with the level of significance (the stars), R-squared and F-statistic for each year, which would look somewhat like this:
b0 b1 b2 b3 b4 R-sq. F-stat.
2010 ...*
2011 ...
2012 ...**
So far I have tried mtable() from the memisc package, which gives me years as columns and coefficients as rows, but I'd prefer the result to be "transposed" (like above).
Since we don't have access to your data or the code you used to run your models, I created my own dummy models using the mtcars dataset:
data("mtcars")
model1 <- lm(mpg ~ wt + cyl, data = mtcars)
model2 <- lm(mpg ~ wt + cyl + hp, data = mtcars)
For future reference, you'll always want to supply some of your data using, for example, dput(head(my_dataframe, 20)). You should also put up more of the code you used to get where you're at; in fact, the minimum amount of code needed to reproduce your problem. You may want to read How to Create a Great R Reproducible Example for more information; it just helps others help you.
Then I rigged up the following (clumsy) function that I think does roughly what you're looking for. In any event, it should get you started in the right direction:
get_row <- function(x, coef_names) {
  coef_mat <- coef(summary(x))
  these_coef_names <- rownames(coef_mat)
  rows <- match(coef_names, these_coef_names)
  p <- coef_mat[rows, 4]
  stars <- c("", "*", "**", "***")[(p < 0.05) + (p < 0.01) + (p < 0.001) + 1]
  coefs <- round(coef_mat[rows, 1], 3)
  output <- paste0(coefs, stars)
  output <- ifelse(grepl("NA", output), NA, output)
  return(output)
}
get_table <- function(...) {
  models <- list(...)
  if ( any(lapply(models, class) != "lm") ) {
    stop("This function has only been tested with lm objects.")
  }
  coef_names <- unique(unlist(sapply(models, variable.names)))
  coef_table <- t(sapply(models, get_row, coef_names))
  colnames(coef_table) <- coef_names
  return(coef_table)
}
get_table(model1, model2)
# (Intercept) wt cyl hp
# [1,] "39.686***" "-3.191***" "-1.508**" NA
# [2,] "38.752***" "-3.167***" "-0.942" "-0.018"
Is there, either for the lm() function or for some other function for linear regression, an argument such that the reference group can be set to always be the biggest group rather than the alphabetical/numerical default in lm()?
As this is often done in statistics, I suspect I keep missing it when I search the documentation, or that I'm looking in the wrong places. Any help would be appreciated!
Below is what I'd like NOT to have to keep doing, even when it's wrapped in a UDF.
data(mtcars) # load dataset
mtcars <- mtcars[1:31, ] # remove a row so that there is a single biggest group
lm(mpg ~ gear+carb+disp, data = mtcars ) # carb's group 1 is the reference by default
mtcars$carb <- as.factor(mtcars$carb)
mtcars <- within(mtcars, carb <- relevel(carb, ref = "4")) # set carb's group 4 as the reference
lm(mpg ~ gear+carb+disp, data = mtcars )
I don't believe there is a built-in function for that, but it's not difficult to write one.
largest_ref <- function(DF, col) {
  DF[[col]] <- factor(DF[[col]])
  tbl <- table(DF[[col]])
  largest <- names(tbl)[which.max(tbl)]
  DF[[col]] <- relevel(DF[[col]], ref = largest)
  DF
}
Now I will reload the test dataset and change a copy of it. Then run regressions on both datasets, the one releveled by your code and the one releveled by the function above.
data(mtcars)
mtcars <- mtcars[1:31, ]
mtc <- mtcars
mtcars$carb <- as.factor(mtcars$carb)
mtcars <- within(mtcars, carb <- relevel(carb, ref = "4")) # set carb's group 4 as the reference
fit1 <- lm(mpg ~ gear + carb + disp, data = mtcars)
mtc <- largest_ref(mtc, "carb")
fit2 <- lm(mpg ~ gear + carb + disp, data = mtc)
identical(coef(fit1), coef(fit2))
#[1] TRUE
As you can see, the results are the same. You can verify this further with the following (output omitted):
summary(fit1)
summary(fit2)
It doesn't look like lm has an option for this, but you can create a wrapper function that relevels a factor according to frequency and then use it directly in the formula.
big.ref <- function(x) {
  if (!is.factor(x)) x <- factor(x)
  counts <- sort(table(x), decreasing = TRUE)
  relevel(x, ref = names(counts)[1])
}
lm(mpg ~ gear + big.ref(carb) + disp, data = mtcars )
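As a quick sanity check (assuming fit1 from the previous answer is still in the workspace), the coefficient values should agree up to term naming, since both approaches relevel carb to its largest group:
fit3 <- lm(mpg ~ gear + big.ref(carb) + disp, data = mtcars)
all.equal(unname(coef(fit1)), unname(coef(fit3)))  # expected: TRUE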