ggplot for linear-log regression model? - r

How do I plot a log linear model in R?
Currently, I am doing this but am not sure if it's the right/efficient way:
library(ggplot2)
library(ggthemes)   # for theme_economist()
data(food)          # food expenditure data (e.g. from the PoEdata package)
model1 <- lm(food_exp ~ log(income), data = food)
temp_var <- predict(model1, interval="confidence")
new_df <- cbind(food, temp_var)
head(new_df)
ggplot(new_df, aes(x = income, y = food_exp))+
geom_point() +
geom_smooth(aes(y=lwr), color = "red", linetype = "dashed")+
geom_smooth(aes(y=upr), color = "red", linetype = "dashed")+
geom_smooth(aes(y = fit), color = "blue")+
theme_economist()

You can use geom_smooth and put your formula in directly. It should yield the same as your fit (which you can check by also plotting it):
ggplot(new_df, aes(x = income, y = food_exp))+
geom_point() +
geom_point(aes(y=fit), color="red") + #your original fit
geom_smooth(method=lm, formula=y~log(x)) #ggplot fit

If you don't care about extracting the parameters and just want the plot, you can do it directly in ggplot2.
Some fake data for plotting:
library(tidyverse)
set.seed(454)
income <- VGAM::rpareto(n = 100, scale = 20, shape = 2)*1000
food_exp <- rnorm(100, income*.3+.1, 3)
food <- data.frame(income, food_exp)
Now within ggplot2, use the geom_smooth function and specify that you want a linear model. Additionally, you can directly transform the income in the aes argument:
ggplot(food, aes(x = log(income), y = food_exp))+
geom_point()+
geom_smooth(method = "lm")+
theme_bw()+
labs(
title = "Log Linear Model Food Expense as a Function of Log(income)",
x = "Log(Income)",
y = "Food Expenses"
)
This works for confidence intervals; for prediction intervals you'll need to do what you did earlier: fit the model explicitly and generate the intervals with predict().
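For example, a minimal sketch of the prediction-interval version, reusing the fake food data from above (the object names are mine):
# fit the linear-log model, then ask predict() for prediction intervals
model1 <- lm(food_exp ~ log(income), data = food)
pred_int <- as.data.frame(predict(model1, interval = "prediction"))
food_pred <- cbind(food, pred_int)

ggplot(food_pred, aes(x = log(income), y = food_exp)) +
  geom_point() +
  geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.2) +  # prediction band
  geom_line(aes(y = fit), color = "blue") +                # fitted line
  theme_bw()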

Related

Adding a regression trend line and a shaded standard error area to a ggplot for regression models that geom_smooth does not handle

I have a data.frame with observed success/failure outcomes per two groups along with expected probabilities:
library(dplyr)
observed.probability.df <- data.frame(group = c("A","B"), p = c(0.4,0.6))
expected.probability.df <- data.frame(group = c("A","B"), p = qlogis(c(0.45,0.55)))
observed.data.df <- do.call(rbind, lapply(c("A","B"), function(g)
  data.frame(group = g,
             value = c(rep(0, 1000 * dplyr::filter(observed.probability.df, group != g)$p),
                       rep(1, 1000 * dplyr::filter(observed.probability.df, group == g)$p)))
)) %>% dplyr::left_join(expected.probability.df)
observed.probability.df$group <- factor(observed.probability.df$group, levels = c("A","B"))
observed.data.df$group <- factor(observed.data.df$group, levels = c("A","B"))
I'm fitting a logistic regression (binomial glm with a logit link function) to these data with the offset term:
fit <- glm(value ~ group + offset(p), data = observed.data.df, family = binomial(link = 'logit'))
Now, I'd like to plot these data as a bar graph using ggplot2's geom_bar, color-coded by group, and to add to that the trend line and shaded standard error area estimated in fit.
I'd use stat_smooth for that, but I don't think it can handle the offset term in its formula, so it looks like I need to resort to assembling this figure in an alternative way.
To get the bars and the trend line I used:
slope.est <- function(x, ests) plogis(ests[1] + ests[2] * x)
library(ggplot2)
ggplot(observed.probability.df, aes(x = group, y = p, fill = group)) +
geom_bar(stat = 'identity') +
stat_function(fun = slope.est,args=list(ests=coef(fit)),size=2,color="black") +
scale_x_discrete(name = NULL,labels = levels(observed.probability.df$group), breaks = sort(unique(observed.probability.df$group))) +
theme_minimal() + theme(legend.title = element_blank()) + ylab("Fraction of cells")
So the question is how to add to that the shaded standard error around the trend line?
Using stat_function I am able to shade the entire area from the upper bound of the standard error all the way down to the X-axis:
ggplot(observed.probability.df, aes(x = group, y = p, fill = group)) +
geom_bar(stat = 'identity') +
stat_function(fun = slope.est,args=list(ests=coef(fit)),size=2,color="black") +
stat_function(fun = slope.est,args=list(ests=summary(fit)$coefficients[,1]+summary(fit)$coefficients[,2]),geom='area',fill="gray",alpha=0.25) +
scale_x_discrete(name = NULL,labels = levels(observed.probability.df$group), breaks = sort(unique(observed.probability.df$group))) +
theme_minimal() + theme(legend.title = element_blank()) + ylab("Fraction of cells")
Which is close but not quite there.
Any idea how to subtract, from the shaded area above, the area that lies below the lower bound of the standard error? Perhaps geom_ribbon is the way to go here, but I don't know how to combine it with the slope.est function.
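One hedged way to combine them (a sketch, not a tested answer from the thread): build ribbon data on the underlying numeric positions of the discrete axis (group A = 1, group B = 2) and compute the lower/upper curves by shifting both coefficients by one standard error, mirroring the stat_function call above.
# coefficient estimates and standard errors from the fitted glm
ests <- summary(fit)$coefficients[, 1]
ses  <- summary(fit)$coefficients[, 2]

# ribbon bounds evaluated on a fine grid between the two group positions
ribbon.df <- data.frame(x = seq(1, 2, length.out = 101))
ribbon.df$ymin <- slope.est(ribbon.df$x, ests - ses)
ribbon.df$ymax <- slope.est(ribbon.df$x, ests + ses)

ggplot(observed.probability.df, aes(x = group, y = p, fill = group)) +
  geom_bar(stat = 'identity') +
  geom_ribbon(data = ribbon.df, aes(x = x, ymin = ymin, ymax = ymax),
              inherit.aes = FALSE, fill = "gray", alpha = 0.5) +
  stat_function(fun = slope.est, args = list(ests = coef(fit)), size = 2, color = "black") +
  theme_minimal() + theme(legend.title = element_blank()) + ylab("Fraction of cells")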

Method to extract exact 95% confidence values from stat_smooth(method = "glm")

I have produced a glm interaction plot using ggplot2; the code I used is below (the plot itself is not reproduced here).
I know that the grey shaded areas represent the 95% confidence interval, but is there a method to get the exact values of the grey shaded areas, and therefore of the 95% confidence interval?
#bind data togther
Modern_EarlyHolocene<-rbind(FladenF30, FladenB30, Early_Holocene)
#Build modern vs Holocene model
Modern_EarlyHolocene<-glm(Max_Height~Age+Time_period, data=Modern_EarlyHolocene,family = gaussian)
#Produce gg interaction plot
Modern_EarlyHolocene_plot<-ggplot(data=Modern_EarlyHolocene) +
aes(x = Age, y = Max_Height, group = Time_period, color = Time_period) +
geom_point( alpha = .7) +
stat_smooth(method = "glm", level=0.95) +
expand_limits(y=c(0,90), x=c(0,250))
#add axis labels
Modern_EarlyHolocene_plot + labs(x = "Age (years)", y = 'Maximum height (mm)') +
theme(legend.text = element_text(size = 14, colour = "Black"),
legend.title=element_blank()) +
theme(axis.text=element_text(size=14),
axis.title=element_text(size=16,face="bold"))
You can access the plot data with layer_data(Modern_EarlyHolocene_plot, i), where i is the layer to return, in the order the layers were added to the plot.
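For example (a minimal sketch; in the plot above geom_point is layer 1 and stat_smooth is layer 2, so the band's values sit in layer 2's data):
# the confidence band drawn by stat_smooth() is the second layer added,
# so its fitted values and bounds can be pulled out with layer_data()
smooth_data <- layer_data(Modern_EarlyHolocene_plot, 2)
head(smooth_data[, c("x", "y", "ymin", "ymax")])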
You are effectively fitting a different regression line for each Time_period, so your glm has to include an interaction term. It should be:
Modern_EarlyHolocene<-glm(Max_Height~Age*Time_period, data=Modern_EarlyHolocene)
I do not have your data, so see below for an example with iris:
fit = glm(Sepal.Width ~ Sepal.Length * Species,data=iris)
g1 = ggplot(iris,aes(x=Sepal.Length,y=Sepal.Width,color=Species)) +
geom_point( alpha = .7) + stat_smooth(method = "glm", level=0.95)
To get the se of the predictions, you do:
pred = predict(fit,iris,se.fit = TRUE)
df_pred = data.frame(iris,pred=pred$fit,se=pred$se)
We can plot this; the upper and lower bounds of the band are the fitted values ± 1.96 × the standard error:
g2 = ggplot(df_pred,aes(x=Sepal.Length,y=Sepal.Width,color=Species)) +
geom_point( alpha = .7) +
geom_ribbon(aes(ymin=pred-1.96*se,ymax=pred+1.96*se,fill=Species),alpha=0.1)
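If you also want the fitted line drawn on top of the band, as stat_smooth does, a line layer on the predicted values works; a small sketch extending the code above:
g3 = ggplot(df_pred, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_point(alpha = .7) +
  geom_line(aes(y = pred)) +                      # fitted line from the glm
  geom_ribbon(aes(ymin = pred - 1.96 * se, ymax = pred + 1.96 * se, fill = Species),
              alpha = 0.1, color = NA)            # band without an outline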

quadratic fit curve in Spaghetti plot. Lme?

I am trying to fit a quadratic curve over my spaghetti plot. In the beginning I did it only with ggplot like this:
library(ggplot2)
library(reshape2)
GCIP <- data_head$GCIP
Patient.ID <- data_head$Patient.ID
Eye <-data_head$Eye
Visit <-data_head$Visit
Patient<-data_head$Patient
data_head$time_since_on <- as.numeric(as.character(data_head$time_since_on))
ggplot(data = data_head, aes(x= time_since_on, y=GCIP)) +
geom_point(alpha=1, size=2) +
aes(colour=Patient.ID) +
geom_path(aes(group = Patient.ID))
ggplot(data= data_head, aes(x = time_since_on, y = GCIP)) +
geom_point(size = 2, alpha= 1, aes(color = Patient.ID)) + #colour points by group
geom_path(aes(group = Patient.ID)) + #spaghetti plot
stat_smooth(method = "lm", formula = y ~ poly(x,2)) + #line of best fit by group
ylab("GCIP (volume)") + xlab("time_since_on (months)") +
theme_bw()
The problem is that I am not sure this code takes into account that each line contains different time points for one patient, so the fitted curve should take that into account as well.
Could you please tell me if this is correct?
Here you can see the graph I get.
I am not sure, and maybe it is better to fit an lme model (but in that case I don't know how to introduce the quadratic term into the model).
I also did this:
data_head <- read.csv("/Users/adrianaroca-fernandez/Desktop/Analysis/Long_100418_2/N=lit.csv", sep=";", dec=",")
library(ggplot2)
library(reshape2)
library(lme4)
library(lsmeans)
GCIP <- data_head$GCIP
Patient.ID <- data_head$Patient.ID
Eye <-data_head$Eye
Visit <-data_head$Visit
Patient<-data_head$Patient
data_head$time_since_on <- as.numeric(as.character(data_head$time_since_on))
time_since_on <-data_head$time_since_on
time_since_on2 <- time_since_on^2
quadratic.model <-lm(GCIP ~ time_since_on + time_since_on2)
summary(quadratic.model)
time_since_onvalues <- seq(0, 250, 0.1)
predictedGCIP <- predict(quadratic.model,list(time_since_on=time_since_onvalues, time_since_on2=time_since_onvalues^2))
plot(time_since_on, GCIP, pch=16, xlab = "time_since_on (months)", ylab = "GCIP", cex.lab = 1.3, col = "blue")
lines(time_since_onvalues, predictedGCIP, col = "darkgreen", lwd = 3)
The problem is that I am still unable to introduce (1|Patient.ID) as a random effect, and I lose my spaghetti plot in this case, keeping just the dots. Here is the result:
What do you think is better or how should I code this?
Thanks.
lili
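A minimal sketch of the mixed-model route, assuming the data_head columns used above (the poly() parameterisation and the population-level predict(..., re.form = NA) are my choices, not code from the thread):
library(lme4)
library(ggplot2)

# quadratic fixed effect for time plus a random intercept per patient;
# poly(..., raw = TRUE) keeps the b0 + b1*t + b2*t^2 parameterisation
quad.mod <- lmer(GCIP ~ poly(time_since_on, 2, raw = TRUE) + (1 | Patient.ID),
                 data = data_head)

# population-level predictions (re.form = NA ignores the random effects)
data_head$pred <- predict(quad.mod, re.form = NA)

# keep the spaghetti plot and overlay the fitted quadratic curve
ggplot(data_head, aes(x = time_since_on, y = GCIP)) +
  geom_point(aes(color = Patient.ID), size = 2) +
  geom_path(aes(group = Patient.ID, color = Patient.ID)) +
  geom_line(aes(y = pred), size = 1.2) +
  theme_bw()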

how to plot predicted values on lm line for a null model using ggplot in r

Trying to reproduce the base-graphics code below using ggplot, but my ggplot version yields an incorrect result.
base code
# bdims (body dimensions data) is assumed to come from the openintro package
library(broom)   # for augment()
model1 <- lm(wgt ~ 1, data = bdims)
model1_null <- augment(model1)
plot(bdims$hgt, bdims$wgt)
abline(model1, lwd = 2, col = "blue")
pre_null <- predict(model1)
segments(bdims$hgt, bdims$wgt, bdims$hgt, pre_null, col = "red")
ggplot code
bdims %>%
ggplot(aes(hgt, wgt)) +
geom_point() +
geom_smooth(method = "lm", formula = bdims$hgt ~ 1) +
segments(bdims$hgt, bdims$wgt, bdims$hgt, pre_null, col = "red")
Here's an example using the built-in mtcars data:
ggplot(mtcars, aes(wt, mpg)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ 1) +
geom_segment(aes(xend = wt, yend = mean(mpg)), col = "firebrick2")
The formula references the aesthetic dimensions, not the variable names. And you need to use geom_segment, not the base-graphics segments(). In a more complicated case you would pre-compute the model's predicted values for the segments, but for a null model it's easy enough to just use mean() inline.
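For the more complicated case, a minimal sketch with a real predictor, again on mtcars: pre-compute the fitted values and point the segments at them.
# pre-compute predictions from the fitted model and use them as segment ends
fit <- lm(mpg ~ wt, data = mtcars)
mtcars$pred <- predict(fit)

ggplot(mtcars, aes(wt, mpg)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  geom_segment(aes(xend = wt, yend = pred), col = "firebrick2")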

How to plot estimate values for a lmer regression model in R?

I have data that looks like this:
height <- c(1,2,3,4,2,4,6,8)
weight <- c(12,13,14,15,22,23,24,25)
type <- c("Wheat","Wheat","Wheat","Wheat","Rice","Rice","Rice","Rice")
set <- c(1,1,1,1,2,2,2,2)
dat <- data.frame(set,type,height,weight)
I fit an lmer model with set as a random effect in R:
library(lme4)
mod <- lmer(weight ~ height + type + (1|set), data = dat)
Now, I want to plot the model's estimates and a regression line, with weight on the x-axis and height on the y-axis, faceted by type (facet_grid(~ type)).
I use the predict function as follows
dat$pred <- predict(mod, type = "response")
And I want to achieve a ggplot that will look like this:
ggplot(dat,aes(x = weight, y = height)) +
geom_point() + geom_smooth(method="lm", fill=NA) + facet_grid(~ type, scales = "free")
However, I note that the predict function returns only a single vector of predicted values. How do I plot that to achieve the same as above? Or do I have to store two different predict outputs and then plug them into the x and y of ggplot?
I can adapt your plot to show raw vs. predicted values like this:
ggplot(dat,aes(y = height)) +
geom_point(aes(x = weight)) +
geom_line(aes(x = pred)) +
facet_grid(~ type, scales = "free")
In your example plot, though, you have weight, the outcome variable in your model, on the x-axis, which is confusing. Normally you would have the outcome/predicted variable on the y-axis, so I would have plotted your model predictions like this:
ggplot(dat,aes(x = height)) +
geom_point(aes(y = weight)) +
geom_line(aes(y = pred)) +
facet_grid(~ type, scales = "free")
