I need to plot a lot of variables against each other. In each of these plots I would like to automatically place information from a linear regression model in the upper left corner of the graph.
Taking the mtcars dataset as an example, I would like a piece of code that puts the R2 and p-value from a linear regression model in the top left corner of the graph, no matter which variables I plot against each other. I have made a solution where I show R2 and P in the title, but since I need the title for something else it's not optimal.
#' Scatter plot of a fitted linear model with fit statistics in the title
#'
#' Plots the response (first column of the model frame) against the first
#' predictor (second column), overlays a least-squares line, and reports the
#' adjusted R-squared and the p-value of the first predictor in the title.
#'
#' @param fit A fitted model returned by `lm()` with at least one predictor.
#' @return A `ggplot` object; more layers and themes can be added with `+`.
ggplotRegression <- function (fit) {
  # Callers chain `+ geom_*()` onto the result, so ggplot2 has to end up on the
  # search path. Unlike require(), library() stops with an error when the
  # package is missing instead of returning FALSE and failing later.
  library(ggplot2)
  stopifnot(inherits(fit, "lm"), ncol(fit$model) >= 2)

  vars <- names(fit$model)
  fit_summary <- summary(fit)  # computed once, reused for both statistics

  # .data[[name]] replaces the deprecated aes_string() and also works for
  # non-syntactic model-frame columns such as "log(wt)".
  ggplot(fit$model, aes(x = .data[[vars[2]]], y = .data[[vars[1]]])) +
    geom_point() +
    stat_smooth(method = "lm", col = "red") +
    labs(x = vars[2],
         y = vars[1],
         title = paste("Adj R2 = ", signif(fit_summary$adj.r.squared, 1),
                       " P =", signif(fit_summary$coefficients[2, 4], 1)))
}
# Keep only the four-cylinder cars. `cyl` is numeric, so it is compared with
# the number 4; base subsetting avoids depending on dplyr being attached for
# `%>%` and `filter()`.
disp_vs_wt_cyl4 <- mtcars[mtcars$cyl == 4, ]

# Regress displacement on weight, then overlay larger blue points and switch
# to the black-and-white theme.
ggplotRegression(lm(disp ~ wt, data = disp_vs_wt_cyl4)) +
  geom_point(size = 3.74, colour = "#0c4c8a") +
  theme_bw()
You could use annotation_custom in your plot, which would allow you to have a separate title. In this example, we allow a title to be passed to your function:
#' Regression scatter plot with the fit statistics annotated inside the panel
#'
#' Plots the response (first column of the model frame) against the first
#' predictor (second column), overlays a least-squares line, and writes the
#' model formula, adjusted R-squared and first-predictor p-value near the
#' top-left corner of the panel, leaving the title free for the caller.
#'
#' @param fit A fitted model returned by `lm()` with at least one predictor.
#' @param title Plot title, passed to `ggtitle()`.
#' @return A `ggplot` object; more layers and themes can be added with `+`.
ggplotRegression <- function (fit, title) {
  # Callers chain `+ geom_*()` onto the result, so ggplot2 has to end up on the
  # search path. Unlike require(), library() stops with an error when the
  # package is missing instead of returning FALSE and failing later.
  library(ggplot2)
  stopifnot(inherits(fit, "lm"), ncol(fit$model) >= 2)

  vars <- names(fit$model)
  fit_summary <- summary(fit)  # computed once, reused for both statistics

  # Deparse the model's own formula: fit$call$formula would print the name of
  # the variable when the formula was passed to lm() through a variable.
  formula_text <- paste(deparse(stats::formula(fit)), collapse = " ")
  stats_text <- paste0(
    "Adj R\u00b2 = ", signif(fit_summary$adj.r.squared, 1),
    ", p = ", signif(fit_summary$coefficients[2, 4], 1)
  )

  lab <- grid::textGrob(
    label = paste0(formula_text, "\n", stats_text),
    x = grid::unit(0.05, "npc"),
    y = grid::unit(0.9, "npc"),
    just = "left",
    # gpar() has no `size` setting (it was silently ignored); the font size
    # parameter is `fontsize`.
    gp = grid::gpar(fontsize = 14, fontface = "bold")
  )

  # .data[[name]] replaces the deprecated aes_string() and also works for
  # non-syntactic model-frame columns such as "log(wt)".
  ggplot(fit$model, aes(x = .data[[vars[2]]], y = .data[[vars[1]]])) +
    labs(x = vars[2], y = vars[1]) +
    ggtitle(title) +
    geom_point() +
    stat_smooth(method = "lm", col = "red") +
    annotation_custom(lab)
}
So we can do:
# Keep only the four-cylinder cars. `cyl` is numeric, so it is compared with
# the number 4; base subsetting avoids depending on dplyr being attached for
# `%>%` and `filter()`.
disp_vs_wt_cyl4 <- mtcars[mtcars$cyl == 4, ]

# Same regression plot as before, now with its own title.
ggplotRegression(lm(disp ~ wt, data = disp_vs_wt_cyl4), "My Title") +
  geom_point(size = 3.74, colour = "#0c4c8a") +
  theme_bw()
Related
I have 3 columns in a data frame from which I want to create a visualisation with geom_smooth() :
# Points per candidate plus three smoothed lines: one through `erreur_inf`,
# one through `erreur_sup` (presumably the lower and upper error bounds --
# confirm against the data) and a thicker loess trend through `intentions`.
ggplot(my_data_frame) +
  aes(x = fin_enquete,
      y = intentions,
      colour = candidat) +
  geom_point(alpha = 1/6,
             shape = "circle",
             size = 0.5) +
  # Sizes are plain doubles: `.5L` / `1.5L` are not valid integer literals and
  # only parse with a warning.
  geom_smooth(mapping = aes(y = erreur_inf),
              size = 0.5,
              span = .42,
              se = FALSE) +
  geom_smooth(mapping = aes(y = erreur_sup),
              size = 0.5,
              span = .42,
              se = FALSE) +
  geom_smooth(method = "loess",
              size = 1.5,
              span = .42,
              se = FALSE) +
  # "\u00ea" is e-circumflex; written as an escape so the label survives
  # encoding round-trips.
  labs(x = "Date de fin d'enqu\u00eate",
       y = "Pourcentage d'intentions de vote") +
  theme_minimal() +
  theme(text = element_text(family = "DIN Pro")) +
  coord_cartesian(expand = FALSE) +
  easy_remove_legend()
3 lines with geom_smooth
I would like to color the area between the upper and the lower line. I know the geom_ribbon() function but I am not sure I can use it in this situation.
Does anybody have a solution?
Have a nice day!
You could use geom_ribbon and calculate the loess model yourself within the geom_ribbon call?
Toy random data
# Toy data: x runs from 1 to 100; y, y2 and y3 are uniform noise shifted up by
# 0, 1 and 2 respectively.
dat <- data.frame(
  x  = 1:100,
  y  = runif(100),
  y2 = 1 + runif(100),
  y3 = 2 + runif(100)
)
Now suppose we want a smoothed ribbon between y and y3, with y2 drawn as a line between them:
# Shade the band between the loess fit of y (lower edge) and the loess fit of
# y3 (upper edge), then draw the smoothed y2 series as a line.
ggplot(dat, aes(x, y2)) +
  geom_ribbon(aes(ymin = predict(loess(y ~ x)),
                  ymax = predict(loess(y3 ~ x))),
              alpha = 0.3) +
  # Request loess explicitly so the line always uses the same smoother as the
  # ribbon edges instead of depending on geom_smooth()'s automatic choice.
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE)
You could use lapply() to build one geom_smooth() layer for each of a range of df values, such as (5, 11, 13), and then use geom_ribbon() with stat = "smooth" to draw only the two edges of the standard-error band.
Sample code:
library(ggplot2)
# mpg against weight with three B-spline regression lines (5, 11 and 13
# degrees of freedom) and the dotted outline of a loess confidence band.
ggplot(data = mtcars,
       mapping = aes(x = wt,
                     y = mpg)) +
  geom_point(size = 2) +
  # One geom_smooth() layer per spline df; the list returned by lapply() is
  # added to the plot in one go.
  lapply(c(5, 11, 13), function (i) {
    geom_smooth(
      # Tag the layer's data with its df value (used by the optional facet).
      data = ~ cbind(., facet_plots = i),
      method = lm,
      se = FALSE,
      formula = y ~ splines::bs(x, i)
    )
  }) +
  #facet_wrap(vars(facet_plots))
  # Draw only the edges of the band: the fill is fully transparent.
  geom_ribbon(
    stat = "smooth",
    method = "loess",
    se = TRUE,
    alpha = 0, # or, use fill = NA
    colour = "black",
    linetype = "dotted") +
  theme_minimal()
Plot:
I made a visualization of a regression. Currently this is what the graph looks like.
The regression lines are hard to see since they are the same color as the scatter plot dots.
My question is, how do I make the regression lines a different color from the scatter plot dots?
Here is my code:
# plotnine figure: points coloured by threshold, a quadratic least-squares fit
# per colour group, and a dashed red vertical line at score 766.
(
    ggplot(
        data=df,
        mapping=aes(x="score", y="relent", color="factor(threshold)"),
    )
    + geom_point()
    + scale_color_manual(values=["darkorange", "purple"])
    + geom_smooth(method="lm", formula="y ~ x+I(x**2)", se=False)
    + geom_vline(xintercept=766, color="red", size=1, linetype="dashed")
    + labs(y="Yield", x="Score")
    + theme_bw()
)
One option to achieve your desired result would be to "duplicate" your threshold column with different values, e.g. in the code below I map 0 on 2 and 1 on 3. This duplicated column could then be mapped on the color aes inside geom_smooth and allows to set different colors for the regression lines.
My code below uses R and ggplot2, but to the best of my knowledge it can easily be adapted to plotnine:
# Simulated data: n rows below the cut-off (threshold 0) followed by n rows
# above it (threshold 1).
n <- 1000
df <- data.frame(
  relent    = c(runif(n, min = 100, max = 200), runif(n, min = 150, max = 250)),
  score     = c(runif(n, min = 764, max = 766), runif(n, min = 766, max = 768)),
  threshold = rep(c(0, 1), each = n)
)
# Copy of the grouping with different values (0 -> 2, 1 -> 3) so the smoothers
# can be given their own colours.
df$threshold_sm <- rep(c(2, 3), each = n)
library(ggplot2)
# Base plot shared by both options: colour by threshold, a four-colour manual
# palette, and a dashed red line at score 766.
p <- ggplot(df, aes(x = score, y = relent, color = factor(threshold))) +
  scale_color_manual(
    values = c("darkorange", "purple", "blue", "green")
  ) +
  geom_vline(
    xintercept = 766,
    color = "red",
    size = 1,
    linetype = "dashed"
  ) +
  labs(y = "Yield", x = "Score") +
  theme_bw()
# Option 1: colour the quadratic fits by the duplicated grouping column so
# they take the third and fourth palette colours.
p +
  geom_point() +
  geom_smooth(
    mapping = aes(color = factor(threshold_sm)),
    method = "lm",
    formula = y ~ x + I(x^2),
    se = FALSE
  )
A second option would be to add some transparency to the points so that the lines stand out more clearly; as a side benefit, this also reduces the overplotting of the points:
# Option 2: fade the points so the quadratic fits stand out, and keep the
# legend keys fully opaque.
p +
  geom_point(alpha = 0.3) +
  geom_smooth(
    mapping = aes(color = factor(threshold)),
    method = "lm",
    formula = y ~ x + I(x^2),
    se = FALSE
  ) +
  guides(color = guide_legend(override.aes = list(alpha = 1)))
Compare:
# Colour mapped in ggplot(): it applies to the points and to the fitted lines.
ggplot(iris, aes(x = Petal.Length, y = Sepal.Width, color = Species)) +
  geom_point() +
  geom_smooth(aes(group = Species), method = "lm")
With:
# Colour mapped in geom_point() only: the fitted lines are still grouped by
# species but are not coloured by it.
ggplot(iris, aes(x = Petal.Length, y = Sepal.Width)) +
  geom_point(aes(color = Species)) +
  geom_smooth(aes(group = Species), method = "lm")
When aes(color = ...) is specified inside of ggplot(), it is applied to both of the subsequent geoms. Moving it to geom_point() applies it to the points only.
I want to customize the formula used in geom_smooth like this:
library(MASS)
library(ggplot2)
# Load the Cars93 data set and inspect its structure.
data("Cars93", package = "MASS")
str(Cars93)

# Work on a copy that carries the log of the price as an extra column.
Cars93.log <- Cars93
Cars93.log$log.price <- log(Cars93$Price)

# Log price explained by horsepower, origin and their interaction.
log.model <- lm(log.price ~ Horsepower * Origin, data = Cars93.log)
summary(log.model)
plot(log.model)
# Log price against horsepower, one panel per origin, with fixed colours and
# the same point shape for both origins.
p <- ggplot(Cars93.log,
            aes(x = Horsepower, y = log.price, colour = Origin)) +
  geom_point(aes(shape = Origin, color = Origin)) + # points
  facet_grid(. ~ Origin) +
  theme(
    axis.title.x = element_text(margin = margin(t = 15)),
    axis.title.y = element_text(margin = margin(r = 15))
  ) +
  scale_y_continuous(n.breaks = 7) +
  scale_colour_manual(values = c("USA" = "red", "non-USA" = "black")) +
  scale_shape_manual(values = c(16, 16)) +
  labs(y = "Price(log)")
# Formula factory handed to geom_smooth() below. The `df` argument is not used:
# the returned formula always refers to the global Cars93.log$Origin column.
# NOTE(review): that column covers every row of Cars93.log, whereas each
# geom_smooth() layer below is given only the rows of one Origin level, so x/y
# and Origin differ in length -- presumably the cause of the reported error.
lm.mod <- function(df) {
y ~ x*Cars93.log$Origin
}
# Build one geom_smooth() layer per level of Origin; by() passes each level's
# subset of Cars93.log as the layer data.
p_smooth <- by(Cars93.log, Cars93.log$Origin,
function(x) geom_smooth(data=x, method = lm, formula = lm.mod(x)))
# Add the per-Origin smoothers to the base plot.
p + p_smooth
However, I receive the error that the computation failed because of different lengths of my used variables.
length(Cars93.log$log.price)
length(Cars93.log$Origin)
length(Cars93.log$Horsepower)
But when I check the length for each variable they're all the same... Any ideas, what's wrong?
Thanks a lot, Martina
I agree with @Rui Barradas: the issue seems to be the lm.mod and p_smooth lines and the use of the by() function.
Once you are making a distinction by Origin (e.g., by doing either facet_wrap or color = Origin) then geom_smooth will automatically run different models for those facets.
# Same plot with colour mapped globally and one panel per origin;
# geom_smooth() then fits a separate straight line for each origin.
p <- ggplot(Cars93.log,
            aes(x = Horsepower, y = log.price, color = Origin)) +
  geom_point(aes(shape = Origin)) +
  facet_wrap(vars(Origin)) +
  theme(
    axis.title.x = element_text(margin = margin(t = 15)),
    axis.title.y = element_text(margin = margin(r = 15))
  ) +
  scale_y_continuous(n.breaks = 7) +
  scale_colour_manual(values = c("USA" = "red", "non-USA" = "black")) +
  scale_shape_manual(values = c(16, 16)) +
  labs(y = "Price(log)")

p + geom_smooth(method = lm, formula = y ~ x)
you can convince yourself that this is the same as the output of log.model by extending the x-axis limits to see where the geom_smooth line would cross the y axis (e.g., + coord_cartesian(xlim = c(0, 300)))
You can also see the difference in the graph if you don't pass color = Origin to the geom_smooth function (essentially what is happening if you comment this out from the first ggplot() initialization):
# Variant without the colour mapping and without facets, for comparison with
# the per-origin fits.
p <- ggplot(Cars93.log,
            aes(x = Horsepower, y = log.price)) + # color = Origin)) +
  geom_point(aes(shape = Origin)) +
  #facet_wrap(~ Origin) +
  theme(
    axis.title.x = element_text(margin = margin(t = 15)),
    axis.title.y = element_text(margin = margin(r = 15))
  ) +
  scale_y_continuous(n.breaks = 7) +
  scale_colour_manual(values = c("USA" = "red", "non-USA" = "black")) +
  scale_shape_manual(values = c(16, 16)) +
  labs(y = "Price(log)")

p + geom_smooth(method = lm, formula = y ~ x)
I am using ggplot2 to plot points from a .csv file that is just a column used as x values and a column used as y values. I am a little confused as to how ggplot decides what to make a legend for and haven't found any good examples online.
I would like the legend to show that geom_point is stress vs strain, and my geom_smooth is the best fit line.
Here is my code:
library(ggplot2)
# Read the measurements and pull out the two plotted columns.
imported <- read.csv("data.csv")
Strain <- imported$Strain
Stress <- imported$Stress..N.m.2.
err <- 0.0005  # half-width of the horizontal error bars

# Points, an orange least-squares line without a confidence band, and
# horizontal error bars of +/- err around each strain value.
gg <-
  ggplot(imported, aes(x = Strain, y = Stress)) +
  geom_point(aes(group = "Points"), shape = 79, colour = "black",
             size = 2, stroke = 4) +
  geom_smooth(method = "lm", se = FALSE, color = "orange") +
  # Per-row bounds are mapped through aes() rather than passed as
  # vector-valued layer parameters.
  geom_errorbarh(aes(xmin = Strain - err, xmax = Strain + err),
                 show.legend = TRUE) +
  theme_gray() +
  ggtitle("Stress vs Strain") +
  theme(legend.position = "top")
gg
And it is producing the following plot:
my plot
Edit: added approach at top to create legend for each geom, by creating dummy mapping to separate aesthetics.
library(ggplot2)
# One legend per geom: each layer maps a constant label to its own aesthetic,
# and the matching manual scale supplies the legend title and appearance.
ggplot(mtcars, aes(x = mpg, y = wt)) +
  geom_point(mapping = aes(color = "point")) + # dummy mapping to color
  geom_smooth(
    mapping = aes(linetype = "best fit"),      # dummy mapping to linetype
    method = "lm",
    se = FALSE,
    color = "orange"
  ) +
  geom_errorbarh(mapping = aes(xmin = mpg - 2, xmax = mpg + 1)) +
  scale_color_manual(name = "Stress vs. Strain", values = "black") +
  scale_linetype_manual(name = "Best fit line", values = "solid")
original answer:
Note the difference in legend here:
library(ggplot2)
# First plot: the error-bar layer is included in the colour legend.
ggplot(mtcars, aes(x = mpg, y = wt, color = as.character(cyl))) +
  geom_point() +
  geom_errorbarh(
    mapping = aes(xmin = mpg - 2, xmax = mpg + 1),
    show.legend = TRUE   # error bars reflected in legend
  )

# Second plot: identical, but the error-bar layer is left out of the legend.
ggplot(mtcars, aes(x = mpg, y = wt, color = as.character(cyl))) +
  geom_point() +
  geom_errorbarh(
    mapping = aes(xmin = mpg - 2, xmax = mpg + 1),
    show.legend = FALSE  # error bars not shown in legend
  )
I perform a regression with reg <- lm(...) and get some coefficients I can access with reg$coefficients.
It's of type Named num and contains all the coefficients with their values.
Named num [1:11] 505.085 -0.251 -0.286 -0.22 -0.801 ...
- attr(*, "names")= chr [1:11] "(Intercept)" "year" "monthDez" "monthFeb" ...
I want to show these on my graph created with ggplot. My current approach was to use the subtitle for this:
# Two-line subtitle: the coefficient names joined with ", " on the first line
# and the coefficient values joined with single spaces on the second.
labs(subtitle=paste(toString(names(reg$coefficients)), "\n",
paste(reg$coefficients, collapse = " ")))
But it's not aligned correctly (name directly over the value etc.)
Has someone an idea?
My current plot looks like this:
# Sales volume over time; the subtitle lists the regression coefficient names
# on one line and their values on the next.
coef_subtitle <- paste(
  toString(names(reg$coefficients)),
  "\n",
  paste(reg$coefficients, collapse = " ")
)
base <- ggplot(deliveries, aes(Date)) +
  geom_line(aes(y = SalesVolume, colour = "SalesVolume")) +
  labs(
    title = "Sales Volume By Time",
    x = "Time",
    y = "Sales Volume",
    subtitle = coef_subtitle
  )

# Label the date axis as abbreviated month plus two-digit year, with a break
# every two months.
print(
  base +
    scale_x_date(
      labels = date_format("%b %y"),
      breaks = date_breaks("2 months")
    )
)
In this graph a forecast is displayed, so I want to see the regression coefficients there as well.
Would it work to make two separate plots and arrange them onto a grid?
library(ggplot2)
library(broom)
library(dplyr)
library(tidyr)
# Scatter plot of mpg against quarter-mile time, coloured by number of gears.
data_plot <- ggplot(mtcars, aes(x = qsec, y = mpg, colour = factor(gear))) +
  geom_point()

# Linear model whose coefficient table is shown under the plot.
fit <- lm(mpg ~ qsec + wt + factor(gear), data = mtcars)
# Make a data frame with the contents of the model: one row per
# (term, statistic) pair, where `y` is the vertical position of the term
# (first term at the top, last term at 0).
reg_data <-
  tidy(fit) %>%
  mutate(y = rev(seq_len(n())) - 1) %>%
  # pivot_longer() replaces the superseded gather(); the output columns keep
  # the names `estimate` (statistic name) and `value` used by the table plot.
  pivot_longer(cols = estimate:p.value,
               names_to = "estimate",
               values_to = "value") %>%
  # Fix the left-to-right order of the statistic columns.
  mutate(estimate = factor(estimate,
                           c("term", "estimate", "std.error",
                             "statistic", "p.value")))
# Make a plot displaying the table: statistics across the top, one row per
# model term, each cell showing the value rounded to two decimals.
term_positions <- unique(reg_data[["y"]])
term_labels <- unique(reg_data[["term"]])
reg_plot <- ggplot(reg_data, aes(x = estimate, y = y)) +
  geom_text(aes(label = round(value, 2))) +
  scale_y_continuous(breaks = term_positions, labels = term_labels) +
  scale_x_discrete(position = "top") +
  labs(x = "", y = "") +
  # Strip grid, background and axis lines so only the text remains.
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_blank()
  )
# Arrange the two plots in a single column: the data plot on top and the
# coefficient table below it, with relative heights .70 and .5.
top_panel <- data_plot +
  theme(plot.margin = grid::unit(c(1, 1, 0, .5), "lines"))
bottom_panel <- reg_plot +
  theme(plot.margin = grid::unit(c(0, 0, 1, 0), "lines"))
gridExtra::grid.arrange(
  top_panel,
  bottom_panel,
  clip = FALSE,
  nrow = 2,
  ncol = 1,
  heights = grid::unit(c(.70, .5), c("null", "null"))
)
In my limited experience with ggplot2, annotate() could be used to add some annotations to a plot created with ggplot(), but I am not sure if the code below works for what you want
# Fit mpg on weight and attach the fitted values to the data for the line.
reg <- lm(data = mtcars, mpg ~ wt)
pred <- predict(reg)
newdata <- data.frame(mtcars, pred)
par <- summary(reg)$coefficients[, 1] # extract model parameters
par.f <- format(par, digits = 2)      # set the decimal digits of parameters

# One label column per coefficient, 0.5 x-units apart starting at x = 2, so
# the positions follow the number of coefficients (2 and 2.5 for this model).
label_x <- seq(2, by = 0.5, length.out = length(par))

ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  geom_line(data = newdata, aes(x = wt, y = pred)) +
  # Names on the upper row, values directly underneath at the same x.
  annotate("text", x = label_x, y = 18, label = names(reg$coefficients)) +
  annotate("text", x = label_x, y = 16.5, label = par.f)
enter image description here