Related
I have some data for which I run linear mixed-effect models using different polynomial degrees. I want to show in a plot the points representing my data and colouring by ID and also the prediction lines using the different polynomial degrees. However, for the prediction lines, I don't want to consider the ID. Below I show a reproducible example:
library(ggplot2)
library(lme4)
# Simulate 500 observations spread evenly over four individuals (A-D).
set.seed(123)
n_obs <- 500
df <- data.frame(
  Height = rnorm(n_obs, mean = 175, sd = 15),
  Weight = rnorm(n_obs, mean = 70, sd = 20),
  ID = rep(c("A", "B", "C", "D"), times = n_obs / 4)
)

# Mixed models with a random intercept per ID: linear and quadratic in Weight.
mod1 <- lmer(Height ~ Weight + (1 | ID), data = df)
mod2 <- lmer(Height ~ poly(Weight, 2) + (1 | ID), data = df)

# Predict at the observed weights. re.form = NA is passed so the predictions
# ignore the random effects (see lme4's predict.merMod documentation).
weights_only <- data.frame(Weight = df$Weight)
y.mod1 <- predict(mod1, newdata = weights_only, re.form = NA)
y.mod2 <- predict(mod2, newdata = weights_only, re.form = NA)

# Append both prediction vectors as columns y.mod1 and y.mod2.
df <- as.data.frame(cbind(df, y.mod1, y.mod2))
head(df)
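Height Weight ID y.mod1 y.mod2
1 166.5929 57.96214 A 175.9819 175.4918
2 171.5473 50.12603 B 176.2844 176.3003
3 198.3806 90.53570 C 174.7241 174.7082
4 176.0576 85.02123 D 174.9371 174.5487
5 176.9393 39.81667 A 176.6825 177.7303
6 200.7260 68.09705 B 175.5905 174.8027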
What I did was, first, plot my data points:
# Scatter plot of the raw data, one point colour per individual.
Plot_a <- ggplot(
  data = df,
  mapping = aes(x = Weight, y = Height, colour = ID)
) +
  geom_point() +
  theme_bw() +
  guides(color = guide_legend(override.aes = list(fill = NA)))
Plot_a
Then, I plotted the "prediction lines" in my plot:
# Overlay the two prediction curves. Both layers inherit the data and the
# x and colour mappings of Plot_a, so the lines are still coloured by ID.
Plot_b <- Plot_a +
  geom_line(aes(y = y.mod1)) +
  geom_line(aes(y = y.mod2))
Plot_b
However, as you can see, the lines are coloured with different colours. I guess it is due to the ID factor. Does anyone know how to plot the lines without colouring using ID?
You can set independent colors for lines like this:
library(ggplot2)
# Plot 2: set each line's colour outside aes() so that a fixed colour is used
# instead of the colour = ID mapping inherited from Plot_a.
Plot_b <- Plot_a +
  geom_line(aes(y = y.mod1), colour = "black") +
  geom_line(aes(y = y.mod2), colour = "red")
Output:
As an additional element, if you want two legends you can play with fill and color options in aes(). Here the code for that approach:
# Plot 3: map colour to a constant label per model so that each prediction
# line gets its own entry in the colour legend, which is then titled "Model".
Plot_b <- Plot_a +
  geom_line(
    data = df,
    aes(x = Weight, y = y.mod1, color = "mod1"),
    show.legend = TRUE
  ) +
  geom_line(
    data = df,
    aes(x = Weight, y = y.mod2, color = "mod2"),
    show.legend = TRUE
  ) +
  guides(color = guide_legend(title = "Model"))
Output:
If you want to change colors you can check the options in scale_color_manual().
And for further customization:
# Plot 4: same layers as Plot 3, with an additional guide specification.
Plot_b <- Plot_a +
  geom_line(
    data = df,
    aes(x = Weight, y = y.mod1, color = "mod1"),
    show.legend = TRUE
  ) +
  geom_line(
    data = df,
    aes(x = Weight, y = y.mod2, color = "mod2"),
    show.legend = TRUE
  ) +
  guides(
    # NOTE(review): no layer shown here maps `fill`, so this override
    # presumably has no visible effect -- confirm.
    fill = guide_legend(override.aes = list(linetype = 0)),
    color = guide_legend(title = "Model")
  )
Output:
I am plotting different models' prediction lines over some data points. I would like to get a legend indicating to which individual belongs each point colour and another legend indicating to which model belongs each line colour. Below I share a fake example for reproducibility:
# Simulate 500 observations spread evenly over four individuals (A-D).
set.seed(123)
n_obs <- 500
df <- data.frame(
  Height = rnorm(n_obs, mean = 175, sd = 15),
  Weight = rnorm(n_obs, mean = 70, sd = 20),
  ID = rep(c("A", "B", "C", "D"), times = n_obs / 4)
)

# Mixed models with a random intercept per ID: linear and quadratic in Weight.
mod1 <- lmer(Height ~ Weight + (1 | ID), data = df)
mod2 <- lmer(Height ~ poly(Weight, 2) + (1 | ID), data = df)

# Predict at the observed weights. re.form = NA is passed so the predictions
# ignore the random effects (see lme4's predict.merMod documentation).
weights_only <- data.frame(Weight = df$Weight)
y.mod1 <- predict(mod1, newdata = weights_only, re.form = NA)
y.mod2 <- predict(mod2, newdata = weights_only, re.form = NA)

# Append both prediction vectors as columns y.mod1 and y.mod2.
df <- as.data.frame(cbind(df, y.mod1, y.mod2))
head(df)
Height Weight ID y.mod1 y.mod2
1 166.5929 57.96214 A 175.9819 175.4918
2 171.5473 50.12603 B 176.2844 176.3003
3 198.3806 90.53570 C 174.7241 174.7082
4 176.0576 85.02123 D 174.9371 174.5487
5 176.9393 39.81667 A 176.6825 177.7303
6 200.7260 68.09705 B 175.5905 174.8027
First I plot my data points:
# Scatter plot of the raw data, one point colour per individual.
Plot_a <- ggplot(
  data = df,
  mapping = aes(x = Weight, y = Height, colour = ID)
) +
  geom_point() +
  theme_bw() +
  guides(color = guide_legend(override.aes = list(fill = NA)))
Plot_a
Then, I add lines relative to the prediction models:
# Add one prediction line per model. Both line layers map `color`, the same
# aesthetic the points use for ID.
Plot_b <- Plot_a +
  geom_line(
    data = df,
    aes(x = Weight, y = y.mod1, color = "mod1"),
    show.legend = TRUE
  ) +
  geom_line(
    data = df,
    aes(x = Weight, y = y.mod2, color = "mod2"),
    show.legend = TRUE
  ) +
  guides(
    fill = guide_legend(override.aes = list(linetype = 0)),
    color = guide_legend(title = "Model")
  )
Plot_b
Does anyone know why I am not getting two different legends, one titled Model and the other ID?
I would like to get this
This type of problem generally has to do with reshaping the data. The format should be the long format and the data is in wide format. See this post on how to reshape the data from wide to long format.
The plot layers become simpler, one geom_line is enough and there is no need for guide to override the aesthetics.
To customize the models' legend text, create a vector of legends, in this case with plotmath, in order to have math notation. And the colors are set manually too.
library(dplyr)
library(tidyr)
library(ggplot2)
# Legend labels for the two models, written as plotmath expressions.
model_labels <- c(expression(X^1), expression(X^2))

# Stack the two prediction columns into long format: the column name goes to
# `Model` and the predicted value to `Value`.
df_long <- pivot_longer(
  df,
  cols = c(y.mod1, y.mod2),
  names_to = "Model",
  values_to = "Value"
)

# Points map ID to fill (shape 21 is a fillable circle) and lines map the
# model to colour, so the two variables use different aesthetics.
ggplot(df_long, aes(x = Weight, y = Height)) +
  geom_point(aes(fill = ID), shape = 21) +
  geom_line(aes(y = Value, colour = Model)) +
  scale_colour_manual(
    labels = model_labels,
    values = c("coral", "coral4")
  ) +
  theme_bw()
The issue is that in ggplot2 each aesthetic can only have one scale and only one legend. As you are using only the color aes you get one legend. If you want multiple legends for the same aesthetic have a look at the ggnewscale package. Otherwise you have to make use of a second aesthetic.
My preferred approach would be similar to the one proposed by @RuiBarradas. However, to stick close to your approach this could be achieved like so:
Instead of color map on linetype in your calls to geom_line.
Set the colors for the lines as arguments, i.e. not inside aes.
Make use of scale_linetype_manual to get solid lines for both models.
Make use of guide_legend to fix the colors appearing in the legend
library(ggplot2)
library(lme4)
#> Loading required package: Matrix
# Simulate 500 observations spread evenly over four individuals (A-D).
set.seed(123)
n_obs <- 500
df <- data.frame(
  Height = rnorm(n_obs, mean = 175, sd = 15),
  Weight = rnorm(n_obs, mean = 70, sd = 20),
  ID = rep(c("A", "B", "C", "D"), times = n_obs / 4)
)

# Mixed models with a random intercept per ID: linear and quadratic in Weight.
mod1 <- lmer(Height ~ Weight + (1 | ID), data = df)
mod2 <- lmer(Height ~ poly(Weight, 2) + (1 | ID), data = df)

# Predict at the observed weights. re.form = NA is passed so the predictions
# ignore the random effects (see lme4's predict.merMod documentation).
weights_only <- data.frame(Weight = df$Weight)
y.mod1 <- predict(mod1, newdata = weights_only, re.form = NA)
y.mod2 <- predict(mod2, newdata = weights_only, re.form = NA)
df <- as.data.frame(cbind(df, y.mod1, y.mod2))

# Colour is mapped only inside the point layer, so later layers do not
# inherit it.
Plot_a <- ggplot(df) +
  geom_point(aes(x = Weight, y = Height, colour = ID)) +
  theme_bw() +
  guides(color = guide_legend(override.aes = list(fill = NA)))

# Two colours from the hue palette, one per model.
line_colors <- scales::hue_pal()(2)

# Map the model to linetype (both solid) to get a separate legend, set the
# line colours as constants, and repeat those colours in the legend keys.
Plot_b <- Plot_a +
  geom_line(
    aes(x = Weight, y = y.mod1, linetype = "mod1"),
    color = line_colors[1]
  ) +
  geom_line(
    aes(x = Weight, y = y.mod2, linetype = "mod2"),
    color = line_colors[2]
  ) +
  scale_linetype_manual(values = c(mod1 = "solid", mod2 = "solid")) +
  labs(color = "ID", linetype = "Model") +
  guides(linetype = guide_legend(override.aes = list(color = line_colors)))
Plot_b
I have a data frame in this format:
row.names 100 50 25 0
metabolite1 113417.2998 62594.7067 39460.7705 1.223243e+02
metabolite2 3494058.7972 2046871.7446 1261278.2476 6.422864e+03
The columns refer to the concentrations of quality controls (%): 100, 50, 25, 0.
Currently to plot a single graph I am extracting the data into a new data frame and plotting it like this:
# Signal measured at each quality-control concentration (%) for one metabolite.
metabolite1 <- data.frame(
  Numbers = c(100, 50, 25, 0),
  Signal = c(113417.2998, 62594.7067, 39460.7705, 122.3243)
)
# Regression coefficients (intercept and slope) of the line of best fit
fit <- lm(Signal ~ Numbers, data = metabolite1)
Coef <- coef(fit)
# Scatter plot of the signal with the fitted line drawn from the stored
# intercept and slope; the x axis is limited to 0-100.
ggplot(data = metabolite1, mapping = aes(x = Numbers, y = Signal)) +
  geom_point() +
  xlim(0, 100) +
  geom_abline(
    intercept = Coef[1],
    slope = Coef[2]
  )
This is extremely inefficient and I am trying to find a better way to plot separate scatter plots for each row rather than creating separate data frames. What would be a better way to do this? I have 160 metabolites I need to produce graphs for. I have attempted to melt the data frame into the format:
Name variable value
metabolite1 100 113417.2998
metabolite2 100 3494058.7972
metabolite1 50 62594.7067
metabolite2 50 2046871.7446
metabolite1 25 39460.7705
metabolite2 25 1261278.2476
metabolite1 0 1.223243e+02
metabolite2 0 6.422864e+03
and then use ggplot and faceting to plot the data
# One panel per metabolite, laid out in columns.
ggplot(df, aes(x = variable, y = value)) +
  geom_point() +
  facet_grid(cols = vars(Name))
but the plots produced all have the same y axis scale which is not appropriate for the data I am working with. I'm assuming because of this I cannot use faceting to produce the plots.
EDIT: I do not know how to add separate lines of best fit to each plot without using geom_smooth, which I do not wish to do.
You're on the right track with your method of melting and faceting:
# Convert the melted concentration column to numbers so the points lie on a
# continuous x axis and geom_smooth() can fit one regression line per panel;
# with a factor on x, ggplot2 places each level in its own group and no line
# can be fitted. as.character() first so factor levels are read as their
# labels (100, 50, 25, 0) and not as level indices.
ggplot(
  data = df,
  aes(x = as.numeric(as.character(variable)), y = value)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    se = FALSE,
    lwd = .5,
    col = "black"
  ) +
  # scales = "free_y" gives every metabolite its own y axis range.
  facet_wrap(~ Name, scales = "free_y") +
  labs(x = "variable")
This yields similar plots as those you get from running ggplot on subsets:
# Build one scatter plot with its least-squares line for a single metabolite.
# `d` is a data frame with columns Numbers and Signal.
plot_with_fit <- function(d) {
  fit_coef <- coef(lm(Signal ~ Numbers, data = d))
  ggplot(d, aes(x = Numbers, y = Signal)) +
    geom_point() +
    xlim(0, 100) +
    geom_abline(intercept = fit_coef[1], slope = fit_coef[2])
}

# One plot per subset, then both plots side by side in a single row.
out <- lapply(list(metabolite1, metabolite2), plot_with_fit)
gridExtra::grid.arrange(out[[1]], out[[2]], nrow = 1)
Given a dataset with a factor column (X1) and a subtotal column (X2)
X1 X2
1 1 12
2 2 200
3 3 23
4 4 86
5 5 141
I would like to create a graphic like this:
which gives X2 as a percentage of the X2 total, divided by X1.
Edit: clarity and adding dataset for reproducibility
For example
# Example data: six rows with a uniform random value and a random letter each.
# y is drawn before type so the random number stream is used in that order.
set.seed(1234)
n_rows <- 6L
df <- data.frame(x = seq_len(n_rows))
df[["y"]] <- runif(n_rows)
df[["type"]] <- sample(letters, size = n_rows)
# Bars of width 1 centred at x - 0.5, drawn in polar coordinates starting at
# pi/2, each labelled with its y value formatted as a percentage.
ggplot(df, aes(x = x - 0.5, y = y, fill = type)) +
  geom_col(width = 1) +
  coord_polar(start = pi / 2) +
  scale_x_continuous(limits = c(0, 2 * nrow(df))) +
  geom_text(aes(label = scales::percent(y))) +
  ggthemes::theme_map() +
  theme(legend.position = c(0, .15))
gives you
I'm using ggplot with facet_wrap to generate 3 side-by-side plots with linear models. In addition, I have another dimension (let's call it "z") I'd like to visualize by varying the size of the points on the plots.
Currently, the plots I generate keep the size of the points on the same scale across all 3 facets. I would instead like to scale the point sizes by facet - that way, one can quickly tell which point contains the highest "z" value for each facet.
Is there any way to do this without creating 3 separate plots? I've included a sample of my data and the code I used below:
# Sample data: 21 observations, seven in each of the facets A, B and C.
x <- c(
  0.03, 1.32, 2.61, 3.90, 5.20, 6.48, 7.77,
  0.75, 2.04, 3.33, 4.62, 5.91, 7.20, 8.49,
  0.41, 1.70, 3.00, 4.28, 5.57, 6.86, 8.15
)
y <- c(
  650, 526, 382, 110, 72, 209, 60,
  559, 296, 76, 48, 64, 20, 22,
  50, 102, 176, 21, 20, 25, 5
)
z <- c(
  391174, 244856, 836435, 46282, 40351, 27118, 17411,
  26232, 59162, 9737, 1917, 20575, 1484, 450,
  12071, 13689, 133326, 1662, 711, 728, 412
)
facet <- rep(c("A", "B", "C"), each = 7)
df <- data.frame(x = x, y = y, z = z, facet = facet)
# Points sized by z with a linear fit, one panel per facet; the size scale is
# shared by all panels.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point(mapping = aes(size = z)) +
  geom_smooth(method = "lm") +
  facet_wrap(vars(facet))
The method below reassigns z to its z-score within its facet:
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE. The plotting package is ggplot2.
library(dplyr)
library(ggplot2)
library(magrittr)
library(scales)

# Sample data: 21 observations, seven in each of the facets A, B and C.
x <- c(0.03,1.32,2.61,3.90,5.20,6.48,7.77,0.75,2.04,3.33,4.62,5.91,7.20,8.49,0.41,1.70,3.00,4.28,5.57,6.86,8.15)
y <- c(650,526,382,110,72,209,60,559,296,76,48,64,20,22,50,102,176,21,20,25,5)
z <- c(391174,244856,836435,46282,40351,27118,17411,26232,59162,9737,1917,20575,1484,450,12071,13689,133326,1662,711,728,412)
facet <- c("A","A","A","A","A","A","A","B","B","B","B","B","B","B","C","C","C","C","C","C","C")
df <- data.frame(x, y, z, facet)

# Replace z by its z-score within each facet. scale() returns a one-column
# matrix, so as.numeric() turns it back into a plain vector; ungroup() drops
# the grouping once the per-facet calculation is done.
df <- df %>%
  group_by(facet) %>%
  mutate(z = as.numeric(scale(z))) %>%
  ungroup()

ggplot(df, aes(x = x, y = y, group = facet)) +
  geom_point(aes(size = z)) +
  geom_smooth(method = "lm") +
  facet_wrap(~facet)
Try to rescale size for each facet to take values in (0,1]:
# Divide z by the largest z of its facet, then size the points by that ratio.
df %>%
  group_by(facet) %>%
  mutate(newz = z / max(z)) %>%
  ggplot(mapping = aes(x = x, y = y)) +
  geom_point(mapping = aes(size = newz)) +
  geom_smooth(method = "lm") +
  facet_wrap(vars(facet))
I would just take the mean of the df$z by each df$facet
# Mean of z per facet, ignoring missing values.
AverageFacet <- df %>%
  group_by(facet) %>%
  summarize(meanwithinfacet = mean(z, na.rm = TRUE))

# Attach the facet mean to every row (merge() joins on the shared columns),
# then express each z as its difference from that mean.
df <- merge(df, AverageFacet)
df[["pointsize"]] <- df[["z"]] - df[["meanwithinfacet"]]
Now each point size depends on the mean of the facets
> head(df,10)
facet x y z meanwithinfacet pointsize
1 A 0.03 650 391174 229089.57 162084.429
2 A 1.32 526 244856 229089.57 15766.429
3 A 2.61 382 836435 229089.57 607345.429
4 A 3.90 110 46282 229089.57 -182807.571
5 A 5.20 72 40351 229089.57 -188738.571
6 A 6.48 209 27118 229089.57 -201971.571
7 A 7.77 60 17411 229089.57 -211678.571
8 B 0.75 559 26232 17079.57 9152.429
9 B 2.04 296 59162 17079.57 42082.429
and plot
# Points sized by their difference from the facet mean, with a linear fit in
# each panel.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point(mapping = aes(size = pointsize)) +
  geom_smooth(method = "lm") +
  facet_wrap(vars(facet))
Looks like this, not sure about the legend though.
You could also, instead of using the absolute difference from the mean, use how many standard deviations from the mean a given z is
# Mean and standard deviation of z per facet, ignoring missing values.
AverageFacet <- df %>%
  group_by(facet) %>%
  summarize(
    meanwithinfacet = mean(z, na.rm = TRUE),
    sdwithinfacet = sd(z, na.rm = TRUE)
  )

# Attach the facet statistics to every row (merge() joins on the shared
# columns), then express each z in standard deviations from its facet mean.
df <- merge(df, AverageFacet)
df[["absoluteDiff"]] <- df[["z"]] - df[["meanwithinfacet"]]
df[["SDfromMean"]] <- df[["absoluteDiff"]] / df[["sdwithinfacet"]]

ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point(mapping = aes(size = SDfromMean)) +
  geom_smooth(method = "lm") +
  facet_wrap(vars(facet))