Plot the impact for each variable in linear regression? - r

I want to create a plot like below for a lm model calculated using R.
Is there a simple way of doing it?
The plot above was collected here in this page.

Package {caret} offers a convenient method varImp:
Example:
library(caret)
my_model <- lm(mpg ~ disp + cyl, data = mtcars)
## > varImp(my_model)
##
## Overall
## disp 2.006696
## cyl 2.229809
For different measures of variable importance see ?varImp. Feed values into your plotting library of choice.
Extra: {ggstatsplot} calculates and plots a host of model stats for a plethora of model objects. This includes hypotheses about regression coefficients, for which method ggcoefstats() might serve your purpose (remember to scale predictor variables for meaningful comparison of coefficients though).

Following the method in the linked article (relative marginal increase in r squared), you could write your own function that takes a formula, and the data frame, then plots the relative importance:
library(ggplot2)
plot_importance <- function(formula, data) {
lhs <- as.character(as.list(formula)[[2]])
rhs <- as.list(as.list(formula)[[3]])
vars <- grep("[+\\*]", rapply(rhs, as.character), invert = TRUE, value = TRUE)
df <- do.call(rbind, lapply(seq_along(vars), function(i) {
f1 <- as.formula(paste(lhs, paste(vars[-i], collapse = "+"), sep = "~"))
f2 <- as.formula(paste(lhs, paste(c(vars[-i], vars[i]), collapse = "+"),
sep = "~"))
r1 <- summary(lm(f1, data = data))$r.squared
r2 <- summary(lm(f2, data = data))$r.squared
data.frame(variable = vars[i], importance = r2 - r1)
}))
df$importance <- df$importance / sum(df$importance)
df$variable <- reorder(factor(df$variable), -df$importance)
ggplot(df, aes(x = variable, y = importance)) +
geom_col(fill = "deepskyblue4") +
scale_y_continuous(labels = scales::percent) +
coord_flip() +
labs(title = "Relative importance of variables",
subtitle = deparse(formula)) +
theme_classic(base_size = 16)
}
We can test this out with the sample data provided in the linked article:
IV <- read.csv(paste0("https://statisticsbyjim.com/wp-content/uploads/",
"2017/07/ImportantVariables.csv"))
plot_importance(Strength ~ Time + Pressure + Temperature, data = IV)
And we see that the plot is the same.
We can also test it out on some built-in datasets to demonstrate that its use is generalized:
plot_importance(mpg ~ disp + wt + gear, data = mtcars)
plot_importance(Petal.Length ~ Species + Petal.Width, data = iris)
Created on 2022-05-01 by the reprex package (v2.0.1)

Just ended up using relaimpo package and showing with ggplot answered by #Allan Cameron
library(relaimpo)
relative_importance <- calc.relimp(mymodel, type="lmg")$lmg
df = data.frame(
variable=names(relative_importance),
importance=round(c(relative_importance) * 100,2)
)
ggplot(df, aes(x = reorder(variable, -importance), y = importance)) +
geom_col(fill = "deepskyblue4") +
geom_text(aes(label=importance), vjust=.3, hjust=1.2, size=3, color="white")+
coord_flip() +
labs(title = "Relative importance of variables") +
theme_classic(base_size = 16)

Related

Error in FUN(X[[i]], ...) : object 'predicted' not found

i try to search about this problem but i couldn't find,that i try to apply ggplot for the relation between the two variables (predictor and predicted x,y), the method was linear regression model (lm) but i got this error
Error in FUN(X[[i]], ...) : object 'predicted' not found
and this my code:
# Install tidymodels if you haven't done so
install.packages("rlang")
install.packages("tidymodels")
install.packages("dplyr")
# Library for modeling
library(tidymodels)
# Load tidyverse
library(tidyverse)
library(dplyr)
URL <- 'https://dax-cdn.cdn.appdomain.cloud/dax-noaa-weather-data-jfk-airport/1.1.4/noaa-weather-sample-data.tar.gz'
download.file (URL, destfile='noaa-weather-sample-data.tar.gz')
untar('noaa-weather-sample-data.tar.gz',tar = 'internal')
dataset<- read.csv ('noaa-weather-sample-data/jfk_weather_sample.csv')
head(dataset)
glimpse(dataset)
subset_data <- data.frame(dataset$HOURLYRelativeHumidity,dataset$HOURLYDRYBULBTEMPF,dataset$HOURLYStationPressure,dataset$HOURLYWindSpeed,dataset$HOURLYPrecip)
subset_data<-setNames(subset_data,c('HOURLYRelativeHumidity','HOURLYDRYBULBTEMPF','HOURLYStationPressure','HOURLYWindSpeed', 'HOURLYPrecip'))
head(subset_data,10)
unique(subset_data$HOURLYPrecip)
subset_data <- subset_data %>%mutate(HOURLYPrecip = replace(HOURLYPrecip, HOURLYPrecip == "T", "0.0"))
View(subset_data)
install.packages('stringr ')
library(stringr)
subset_data$HOURLYPrecip <- str_remove(subset_data$HOURLYPrecip, pattern = 's')
unique(subset_data$HOURLYPrecip)
subset_data$HOURLYPrecip <- as.numeric(subset_data$HOURLYPrecip)
subset_data$HOURLYRelativeHumidity <- as.numeric(subset_data$HOURLYRelativeHumidity)
subset_data$HOURLYDRYBULBTEMPF <- as.numeric(subset_data$HOURLYDRYBULBTEMPF)
subset_data$HOURLYWindSpeed <- as.numeric(subset_data$HOURLYWindSpeed)
str(subset_data1)
subset_data1 <- setNames(subset_data,c('relative_humidity','dry_bulb_temp_f','station_pressure','wind_speed', 'precip'))
#library(rsample)
set.seed(1234)
#split_data <- initial_split(subset_data1)
dt<- sort(sample(nrow(subset_data1), nrow(subset_data1)*.8))
train_data<- subset_data1 [dt,]
test_data <- subset_data1 [-dt,]
install.packages('ggplot')
library(ggplot2)
hist(train_data$relative_humidity)
hist(train_data$dry_bulb_temp_f)
hist(train_data$station_pressure)
hist(train_data$wind_speed)
hist(train_data$precip)
install.packages('stats')
library(stats)
linear_reg1 <- lm (precip ~ relative_humidity, train_data)
linear_reg2 <- lm (precip ~ dry_bulb_temp_f, train_data)
linear_reg3 <- lm (precip ~ wind_speed, train_data)
linear_reg4 <- lm (precip ~ station_pressure, train_data)
#ggplot(train_data, aes(x = train_data$relative_humidity ,y= train_data$precip)) + geom_point()+ stat_smooth(method = 'lm', col ='red')
ggplot(train_data, aes(x= relative_humidity, y= precip))+
geom_smooth(method='lm',se=FALSE ,color='red')+
geom_segment(aes(xend= relative_humidity ,yend = predicted),alpha=.2)+
geom_point()+
geom_point(aes(y = predicted),shape=1)+
theme_bw()
You simply don't have a column called predicted in your data set. Presumably you want this to be the predicted value of precipitation based on humidity. In that case, you can easily create the column in your data frame using your first linear model:
train_data$predicted <- predict(linear_reg1, train_data)
ggplot(train_data, aes(x = relative_humidity, y = precip)) +
geom_segment(aes(xend = relative_humidity, yend = predicted), alpha = 0.2) +
geom_point(alpha = 0.1) +
geom_smooth(method='lm', se = FALSE , color = 'red') +
geom_point(aes(y = predicted), shape = 1, alpha = 0.1) +
theme_bw()
As you can see, a linear model is no good here. It predicts negative precipitation at low humidity and underestimates the precipitation at high humidity.

Using purrr and functions to perform linear regressions on a number of variables with random errors

This is what my data looks like:
dataSet <- data.frame(study_id=c(1,1,1,1,2,2,2,2,3,3,3,3),
Timepoint=c(1,6,12,18,1,6,12,18,1,6,12,18),
Secretor=c(0,0,0,0,1,1,1,1,0,0,0,0),
Gene1=c(1,2,3,4,1,2,3,4,1,2,3,4),
Gene2=c(3,4,5,6,3,4,5,6,3,4,5,6),
Gene3=c(4,5,6,7,4,5,6,7,4,5,6,7),
Gene4=c(6,7,8,9,6,7,8,9,6,7,8,9))
I have successfully used purrr to generate many exploratory ggplots using the following function:
library(tidyverse)
stat_sum_df_all <- function(fun, geom="pointrange", ...) {
stat_summary(fun.data=fun, geom=geom, ...)
}
plot_fun = function(x, y) {
ggplot(data = dataSet, aes(x = .data[[x]], y = .data[[y]], group = Secretor, colour = Secretor)) +
stat_summary(geom = "line", fun.data = median_hilow) +
stat_sum_df_all("median_hilow", fun.args=(conf.int = 0.5), linetype = "solid") +
theme_bw()
}
genelist = names(dataSet)[4:7]
Timepoint = names(dataSet)[2]
all_plots = map(genelist,
~map(Timepoint, plot_fun, y = .x) )
Now what I would like to do is put the p-values of linear regressions in the title of the plots. The formula of my regression is this:
library(lmerTest)
fit <- lmer(genelist ~ Timepoint*Secretor + (1|study_id), data=dataSet)
However, I can't figure out how to similarly create a function like the one I did for the plots to run this regression for each gene. Thanks in advance for any suggestions.
I got assistance on reddit, here was the solution, thanks everyone for your help.
my_fitting_function <- function(gene) {
f <- paste0(gene, " ~ Timepoint*Secretor + (1|study_id)")
fit <- lmer(f, data = dataSet)
return(fit)
}
models <- purrr::map(genelist, my_fitting_function)

How to create a 2nd order trendline in R [duplicate]

I have a simple polynomial regression which I do as follows
attach(mtcars)
fit <- lm(mpg ~ hp + I(hp^2))
Now, I plot as follows
> plot(mpg~hp)
> points(hp, fitted(fit), col='red', pch=20)
This gives me the following
I want to connect these points into a smooth curve, using lines gives me the following
> lines(hp, fitted(fit), col='red', type='b')
What am I missing here. I want the output to be a smooth curve which connects the points
I like to use ggplot2 for this because it's usually very intuitive to add layers of data.
library(ggplot2)
fit <- lm(mpg ~ hp + I(hp^2), data = mtcars)
prd <- data.frame(hp = seq(from = range(mtcars$hp)[1], to = range(mtcars$hp)[2], length.out = 100))
err <- predict(fit, newdata = prd, se.fit = TRUE)
prd$lci <- err$fit - 1.96 * err$se.fit
prd$fit <- err$fit
prd$uci <- err$fit + 1.96 * err$se.fit
ggplot(prd, aes(x = hp, y = fit)) +
theme_bw() +
geom_line() +
geom_smooth(aes(ymin = lci, ymax = uci), stat = "identity") +
geom_point(data = mtcars, aes(x = hp, y = mpg))
Try:
lines(sort(hp), fitted(fit)[order(hp)], col='red', type='b')
Because your statistical units in the dataset are not ordered, thus, when you use lines it's a mess.
Generally a good way to go is to use the predict() function. Pick some x values, use predict() to generate corresponding y values, and plot them. It can look something like this:
newdat = data.frame(hp = seq(min(mtcars$hp), max(mtcars$hp), length.out = 100))
newdat$pred = predict(fit, newdata = newdat)
plot(mpg ~ hp, data = mtcars)
with(newdat, lines(x = hp, y = pred))
See Roman's answer for a fancier version of this method, where confidence intervals are calculated too. In both cases the actual plotting of the solution is incidental - you can use base graphics or ggplot2 or anything else you'd like - the key is just use the predict function to generate the proper y values. It's a good method because it extends to all sorts of fits, not just polynomial linear models. You can use it with non-linear models, GLMs, smoothing splines, etc. - anything with a predict method.

How do I plot a single numerical covariate using emmeans (or other package) from a model?

After variable selection I usually end up in a model with a numerical covariable (2nd or 3rd degree). What I want to do is to plot using emmeans package preferentially. Is there a way of doing it?
I can do it using predict:
m1 <- lm(mpg ~ poly(disp,2), data = mtcars)
df <- cbind(disp = mtcars$disp, predict.lm(m1, interval = "confidence"))
df <- as.data.frame(df)
ggplot(data = df, aes(x = disp, y = fit)) +
geom_line() +
geom_ribbon(aes(ymin = lwr, ymax = upr, x = disp, y = fit),alpha = 0.2)
I didn't figured out a way of doing it using emmip neither emtrends
For illustration purposes, how could I do it using mixed models via lme?
m1 <- lme(mpg ~ poly(disp,2), random = ~1|factor(am), data = mtcars)
I suspect that your issue is due to the fact that by default, covariates are reduced to their means in emmeans. You can use theat or cov.reduce arguments to specify a larger number of values. See the documentation for ref_grid and vignette(“basics”, “emmeans”), or the index of vignette topics.
Using sjPlot:
plot_model(m1, terms = "disp [all]", type = "pred")
gives the same graphic.
Using emmeans:
em1 <- ref_grid(m1, at = list(disp = seq(min(mtcars$disp), max(mtcars$disp), 1)))
emmip(em1, ~disp, CIs = T)
returns a graphic with a small difference in layout. An alternative is to add the result to an object and plot as the way that I want to:
d1 <- emmip(em1, ~disp, CIs = T, plotit = F)

How to plot 3 models in one Figure in R?

I'm new with R and I have fit 3 models for my data as follows:
Model 1: y = a(x) + b
lm1 = lm(data$CBI ~ data$dNDVI)
Model 2: y = a(x)2 + b(x) + c
lm2 <- lm(CBI ~ dNDVI + I(dNDVI^2), data=data)
Model 3: y = x(a|x| + b)–1
lm3 = nls(CBI ~ dNDVI*(a*abs(dNDVI) + b) - 1, start = c(a = 1.5, b = 2.7), data = data)
Now I would like to plot all these three models in R but I could not find the way to do it, can you please help me? I have tried with the first two models as follow and it work but I don't know how to add the Model 3 on it:
ggplot(data = data, aes(x = dNDVI, y = CBI)) +
geom_point() +
geom_smooth(method = lm, formula = y ~ x, size = 1, se = FALSE) +
geom_smooth(method = lm, formula = y ~ x + I(x^2), size = 1, se = FALSE ) +
theme_bw()
I also would like to add a legend which show 3 different colours or types of lines/curves for the 3 models as well. Can you please guide me how to make it in the figure?
Using iris as a dummy set to represent the three models:
new.dat <- data.frame(Sepal.Length=seq(min(iris$Sepal.Length),
max(iris$Sepal.Length), length.out=50)) #new data.frame to predict the fitted values for each model
m1 <- lm(Petal.Length ~ Sepal.Length, iris)
m2 <- lm(Petal.Length ~ Sepal.Length + I(Sepal.Length^2), data=iris)
m3 <- nls(Petal.Length ~ Sepal.Length*(a*abs(Sepal.Length) + b) - 1,
start = c(a = 1.5, b = 2.7), data = iris)
new.dat$m1.fitted <- predict(m1, new.dat)
new.dat$m2.fitted <- predict(m2, new.dat)
new.dat$m3.fitted <- predict(m3, new.dat)
new.dat <- new.dat %>% gather(var, val, m1.fitted:m3.fitted) #stacked format of fitted data of three models (to automatically generate the legend in ggplot)
ggplot(new.dat, aes(Sepal.Length, val, colour=var)) +
geom_line()

Resources