Change key glyphs in ggplot2 - r

I am having two issues with my ggplot. I am trying to plot two continuous variables in a scatterplot, stratified by a categorical variable (4 levels).
The first one is that the plot produces the letter "a" in the legend instead of a line. I know that there is something off with the glyph but I cannot figure it out.
The second issue is that when I use label.y = 10 in the stat_cor() function from the ggpubr package the correlations of the 4 groups collapse all together. label.x works fine.
My code is the following:
library(ggplot2)
library(ggpubr)
df <- data.frame(categories = as.factor(c(1,2,3,3,4,1,4,2,2,1,2,3,4)),
var1 = c(1,11,13,2,5,5,4,10,7,1,2,4,5),
var2 = c(2,10,12,15,14,1,3,7,11,5,6,7,5))
b <- ggplot(df, aes(x = var1, y = var2, colour = categories)) +
geom_point()+
geom_smooth(method = "lm", se = FALSE, fullrange = TRUE) +
scale_color_manual(values = c("#feedde", "#fdbe85", "#fd8d3c", "#d94701")) +
theme_bw() +
ggpubr::stat_cor(aes(color = categories), label.x = 3, label.y = 10) +
ggtitle("Title") +
theme(plot.title = element_text(hjust = 0.5)) +
xlab("Var1") + ylab("Var2") +
labs(color = "Categories")
b

Related

mean line in every facet_wrap

ggplot(data = results, aes(x = inst, y = value, group = inst)) +
geom_boxplot() +
facet_wrap(~color) +
#geom_line(data = mean,
#mapping = aes(x = inst, y = average, group = 1))
theme_bw()
When I run the code above with the code line commented, it runs and gives the graph below but I want a joining mean lines on the boxplots based on its own color category for each group in facet wraps. Any ideas how can I do that?
Your code is generally correct (though you'll want to add color = color to the aes() specification in geom_line()), so I suspect your mean dataset isn't set up correctly. Do you have means grouped by both your x axis and faceting variable? Using ggplot2::mpg as an example:
library(dplyr) # >= v1.1.0
library(ggplot2)
mean_dat <- summarize(mpg, average = mean(hwy), .by = c(cyl, drv))
ggplot(mpg, aes(factor(cyl), hwy)) +
geom_boxplot() +
geom_line(
data = mean_dat,
aes(y = average, group = 1, color = drv),
linewidth = 1.5,
show.legend = FALSE
) +
facet_wrap(~drv) +
theme_bw()
Alternatively, you could use stat = "summary" and not have to create a means dataframe at all:
ggplot(mpg, aes(factor(cyl), hwy)) +
geom_boxplot() +
geom_line(
aes(group = 1, color = drv),
stat = "summary",
linewidth = 1.5,
show.legend = FALSE
) +
facet_wrap(~drv) +
theme_bw()
# same result as above

How to color the area between two geom_smooth lines?

I have 3 columns in a data frame from which I want to create a visualisation with geom_smooth() :
ggplot(my_data_frame) +
aes(x = fin_enquete,
y = intentions,
colour = candidat) +
geom_point(alpha = 1/6,
shape = "circle",
size = .5L) +
geom_smooth(mapping = aes(y = erreur_inf),
size = .5L,
span = .42,
se = F) +
geom_smooth(mapping = aes(y = erreur_sup),
size = .5L,
span = .42,
se = F) +
geom_smooth(method = "loess",
size = 1.5L,
span = .42,
se = F) +
labs(x = "Date de fin d'enquĂȘte",
y = "Pourcentage d'intentions de vote") +
theme_minimal() +
theme(text = element_text(family = "DIN Pro")) +
coord_cartesian(expand = F) +
easy_remove_legend()
3 lines with geom_smooth
I would like to color the area between the upper and the lower line. I know the geom_ribbon() function but I am not sure I can use it in this situation.
Does anybody have a solution?
Have a nice day!
You could use geom_ribbon and calculate the loess model yourself within the geom_ribbon call?
Toy random data
dat <- data.frame(x=1:100, y=runif(100), y2=runif(100)+1, y3=runif(100)+2)
Now suppose we want a smoothed ribbon between y and y3, with y2 drawn as a line between them:
ggplot( dat , aes(x, y2)) +
geom_ribbon(aes(ymin=predict(loess(y~x)),
ymax=predict(loess(y3~x))), alpha=0.3) +
geom_smooth(se=F)
You could use lapply() smooth to calculate the range of df values such as (5,11,13) to calculate the smooths and plot only the two edges of the se.
Sample code:
library(ggplot2)
ggplot(data = mtcars,
mapping = aes(x = wt,
y = mpg)) +
geom_point(size = 2)+
lapply(c(5,11, 13), function (i) {
geom_smooth(
data = ~ cbind(., facet_plots = i),
method = lm,
se=F,
formula = y ~ splines::bs(x, i)
)
})+
#facet_wrap(vars(facet_plots))
geom_ribbon(
stat = "smooth",
method = "loess",
se = TRUE,
alpha = 0, # or, use fill = NA
colour = "black",
linetype = "dotted")+
theme_minimal()
Plot:

How do I change the color of the regression lines in ggPlot?

I made a visualization of a regression. Currently this is what the graph looks like.
The regression lines are hard to see since they are the same color as the scatter plot dots.
My question is, how do I make the regression lines a different color from the scatter plot dots?
Here is my code:
(ggplot(data=df, mapping=aes(x='score', y='relent',
color='factor(threshold)'))+
geom_point()+
scale_color_manual(values=['darkorange', 'purple'])+
geom_smooth(method='lm',
formula = 'y ~ x+I(x**2)',se=False, )+
geom_vline(xintercept = 766, color = "red", size = 1, linetype = "dashed")+
labs(y = "Yield",
x = "Score")+
theme_bw()
)
One option to achieve your desired result would be to "duplicate" your threshold column with different values, e.g. in the code below I map 0 on 2 and 1 on 3. This duplicated column could then be mapped on the color aes inside geom_smooth and allows to set different colors for the regression lines.
My code below uses R or ggplot2 but TBMK the code could be easily adapted to plotnine:
n <- 1000
df <- data.frame(
relent = c(runif(n, 100, 200), runif(n, 150, 250)),
score = c(runif(n, 764, 766), runif(n, 766, 768)),
threshold = c(rep(0, n), rep(1, n))
)
df$threshold_sm <- c(rep(2, n), rep(3, n))
library(ggplot2)
p <- ggplot(data = df, mapping = aes(x = score, y = relent, color = factor(threshold))) +
scale_color_manual(values = c("darkorange", "purple", "blue", "green")) +
geom_vline(xintercept = 766, color = "red", size = 1, linetype = "dashed") +
labs(
y = "Yield",
x = "Score"
) +
theme_bw()
p +
geom_point() +
geom_smooth(aes(color = factor(threshold_sm)),
method = "lm",
formula = y ~ x + I(x**2), se = FALSE
)
A second option would be to add some transparency to the points so that the lines stand out more clearly and by the way deals with the overplotting of the points:
p +
geom_point(alpha = .3) +
geom_smooth(aes(color = factor(threshold)),
method = "lm",
formula = y ~ x + I(x**2), se = FALSE
) +
guides(color = guide_legend(override.aes = list(alpha = 1)))
Compare:
iris %>%
ggplot(aes(Petal.Length, Sepal.Width, color = Species)) +
geom_point() +
geom_smooth(method = "lm", aes(group = Species))
With:
iris %>%
ggplot(aes(Petal.Length, Sepal.Width)) +
geom_point(aes(color = Species)) +
geom_smooth(method = "lm", aes(group = Species))
When aes(color = ...) is specified inside of ggplot(), it is applied to both of the subsequent geoms. Moving it to geom_point() applies it to the points only.

How to use ggplot2 legend to denote different geoms

I am using ggplot2 to plot points from a .csv file that is just a column used a x values and a column used a y values. I am a little confused as to how ggplot decides what to make a legend for and haven't found any good examples online.
I would like the legend to show that geom_point is stress vs strain, and my geom_smooth is the best fit line.
Here is my code:
library(ggplot2)
imported = read.csv("data.csv")
Strain = imported$Strain
Stress = imported$Stress..N.m.2.
err = .0005
gg <-
ggplot(imported, aes(x=Strain, y=Stress)) +
geom_point(aes(group = "Points"), shape = 79, colour = "black", size = 2, stroke = 4) +
geom_smooth(method = "lm", se = FALSE, color = "orange") +
geom_errorbarh(xmin = Strain - err, xmax = Strain + err, show.legend = TRUE) +
theme_gray() + ggtitle("Stress vs Strain") +
theme(legend.position = "top")
gg
And it is producing the following plot:
my plot
Edit: added approach at top to create legend for each geom, by creating dummy mapping to separate aesthetics.
library(ggplot2)
ggplot(mtcars, aes(mpg, wt)) +
geom_point(aes(color = "point")) + # dummy mapping to color
geom_smooth(method = "lm", se = FALSE, color = "orange",
aes(linetype = "best fit")) + # dummy mapping to linetype
geom_errorbarh(aes(xmin = mpg - 2, xmax = mpg + 1)) +
scale_color_manual(name = "Stress vs. Strain", values = "black") +
scale_linetype_manual(name = "Best fit line", values = "solid")
original answer:
Note the difference in legend here:
library(ggplot2)
ggplot(mtcars, aes(mpg, wt, color = as.character(cyl))) +
geom_point() +
geom_errorbarh(aes(xmin = mpg - 2, xmax = mpg + 1),
show.legend = TRUE) # error bars reflected in legend
ggplot(mtcars, aes(mpg, wt, color = as.character(cyl))) +
geom_point() +
geom_errorbarh(aes(xmin = mpg - 2, xmax = mpg + 1),
show.legend = FALSE) # error bars not shown in legend

Preventing wrong density plots when coloring histograms according to groups

based on some dummy data I created a histogram with desity plot
set.seed(1234)
wdata = data.frame(
sex = factor(rep(c("F", "M"), each=200)),
weight = c(rnorm(200, 55), rnorm(200, 58))
)
a <- ggplot(wdata, aes(x = weight))
a + geom_histogram(aes(y = ..density..,
# color = sex
),
colour="black",
fill="white",
position = "identity") +
geom_density(alpha = 0.2,
# aes(color = sex)
) +
scale_color_manual(values = c("#868686FF", "#EFC000FF"))
The histogram of weight shall be colored corresponding to sex, so I use aes(y = ..density.., color = sex) for geom_histogram():
a + geom_histogram(aes(y = ..density..,
color = sex
),
colour="black",
fill="white",
position = "identity") +
geom_density(alpha = 0.2,
# aes(color = sex)
) +
scale_color_manual(values = c("#868686FF", "#EFC000FF"))
As I want it to, the density plot stays the same (overall for both groups), but the histograms jump scale up (and seem to be treated individually now):
How do I prevent this from happening? I need individually colored histogram bars but a joint density plot for all coloring groups.
P.S.
Using aes(color = sex) for geom_density() gets everything back to original scales - but I don't want individual density plots (like below):
a + geom_histogram(aes(y = ..density..,
color = sex
),
colour="black",
fill="white",
position = "identity") +
geom_density(alpha = 0.2,
aes(color = sex)
) +
scale_color_manual(values = c("#868686FF", "#EFC000FF"))
EDIT:
As it has been suggested, dividing by the number of groups in geom_histogram()'s aesthetics with y = ..density../2 may approximate the solution. Nevertheless, this only works with symmetric distributions like in the first output below:
a + geom_histogram(aes(y = ..density../2,
color = sex
),
colour="black",
fill="white",
position = "identity") +
geom_density(alpha = 0.2,
) +
scale_color_manual(values = c("#868686FF", "#EFC000FF"))
which yields
Less symmetric distributions, however, may cause trouble using this approach. See those below, where for 5 groups, y = ..density../5 was used. First original, then manipulation (with position = "stack"):
Since the distribution is heavy on the left, dividing by 5 underestimates on the left and overestimates on the right.
EDIT 2: SOLUTION
As suggested by Andrew, the below (complete) code solves the problem:
library(ggplot2)
set.seed(1234)
wdata = data.frame(
sex = factor(rep(c("F", "M"), each = 200)),
weight = c(rnorm(200, 55), rnorm(200, 58))
)
binwidth <- 0.25
a <- ggplot(wdata,
aes(x = weight,
# Pass binwidth to aes() so it will be found in
# geom_histogram()'s aes() later
binwidth = binwidth))
# Basic plot w/o colouring according to 'sex'
a + geom_histogram(aes(y = ..density..),
binwidth = binwidth,
colour = "black",
fill = "white",
position = "stack") +
geom_density(alpha = 0.2) +
scale_color_manual(values = c("#868686FF", "#EFC000FF")) +
# Use fixed scale for sake of comparability
scale_x_continuous(limits = c(52, 61)) +
scale_y_continuous(limits = c(0, 0.25))
# Plot w/ colouring according to 'sex'
a + geom_histogram(aes(x = weight,
# binwidth will only be found if passed to
# ggplot()'s aes() (as above)
y = ..count.. / (sum(..count..) * binwidth),
color = sex),
binwidth = binwidth,
fill="white",
position = "stack") +
geom_density(alpha = 0.2) +
scale_color_manual(values = c("#868686FF", "#EFC000FF")) +
# Use fixed scale for sake of comparability
scale_x_continuous(limits = c(52, 61)) +
scale_y_continuous(limits = c(0, 0.25)) +
guides(color = FALSE)
Note:
binwidth = binwidth needed to be passed to ggplot()'s aes(), otherwise the pre-specified binwidth would not be found by geom_histogram()'s aes(). Further, position = "stack" is specified, so that both versions of the histogram are comparable. Plots for dummy data and the more complex distribution below:
Solved - Thanks for your help!
I don't think you can do it using y=..density.., but you can recreate the same thing like this...
binwidth <- 0.25 #easiest to set this manually so that you know what it is
a + geom_histogram(aes(y = ..count.. / (sum(..count..) * binwidth),
color = sex),
binwidth = binwidth,
fill="white",
position = "identity") +
geom_density(alpha = 0.2) +
scale_color_manual(values = c("#868686FF", "#EFC000FF"))

Resources