ggplot2 Colour & Shape by different Factors - r

I have a data set with 2 factors (MACH & YOU) Id like to produce a BoxPlot using ggplot2 and have the BoxPlot colour split by MACH whilst highlighting certain points (YOU) in a different shape and in Black..?
I can get the plot working but i can't make the (YOU) factor be bigger in terms of shape and make it black...without effecting all other points on the graph.
Ignore the commented lines - I was just playing around with those.
My dataframe x has the form
MEDIAN MACH YOU PROD
34.5 tool1 false ME
33.8 tool1 false ME
32.9 tool2 true ME
30.1 tool2 true ME
33.8 tool2 false.....etc
x<- data.frame(MEDIAN=c(34,32,56,34,45,34,45,33,23), MACH=c("t1","t1","t1","t2","t2","t2","t1","t1","t2"), YOU=c("false","false","false","false","true","true","true","false","false"), PROD="U","U","U","U","U","U","U","U","U")
ggplot(data=x,aes(MACH,MEDIAN ))+
geom_boxplot(fill = "white", colour = "blue")+
theme(panel.grid.minor = element_line(colour = "grey"), plot.title = element_text(size = rel(0.8)),axis.text.x = element_text(angle=90, vjust=1), strip.text.x = element_text(size = 8, colour = "black", face = "bold")) +
#geom_abline(colour = "grey80")+
#geom_point(shape = factor(YOURLOTS)), size = 3) +
#geom_hline(yintercept=x$TARG_AVG,colour = "green")+
#geom_hline(yintercept=x$TARG_MIN,colour = "red")+
#geom_hline(yintercept=x$TARG_MAX,colour = "red")+
geom_point(alpha = 0.6, position = position_jitter(w = 0.05, h = 0.0), aes(colour=factor(MACH),shape = factor(YOU)), size =3)+
facet_wrap(~PROD, scales = "free") +
ggtitle("MyTitle") +
scale_size_area() +
xlab("STAGE HIST EQUIPID")+
ylab("yaxis")

If you want to make the points for YOU of different size, depending on their value, you can add aes(size = factor(YOU)) inside geom_point().
You can choose the range of size of the points adding scale_size_discrete(range = c(3, 6)) to you plot. In this example, the minimum size would be 3 and the maximum value would be 6.
That would be
ggplot(data = x, aes(MACH, MEDIAN)) +
geom_boxplot(fill = "white", aes(color = MACH)) +
geom_point(aes(shape = factor(YOU), size = factor(YOU)), color = "black", alpha = 0.6, position = position_jitter(w = 0.05, h = 0.0)) +
labs(title = "My Title", x = "Stage Hist Equip ID", y = "y-axis") +
scale_size_discrete(range = c(3, 6))

I would solve this by using two subsets and two calls to geom_point():
library(ggplot2)
x <- data.frame(MEDIAN = c(34,32,56,34,45,34,45,33,23),
MACH = c("t1","t1","t1","t2","t2","t2","t1","t1","t2"),
YOU = c("false","false","false","false","true","true","true","false","false"),
PROD = c("U","U","U","U","U","U","U","U","U"))
ggplot(data = x, aes(MACH, MEDIAN)) +
geom_boxplot(fill = "white", colour = "blue") +
geom_point(data = subset(x, YOU != "true"), aes(color = MACH),
size = 8, alpha = 0.6,
position = position_jitter(w = 0.05, h = 0.0)) +
geom_point(data = subset(x, YOU == "true"), aes(shape = YOU),
color = "black", size = 8, alpha = 0.6,
position = position_jitter(w = -0.05, h = 0.0)) +
labs(title = "My Title", x = "Stage Hist Equip ID", y = "y-axis")

Related

How to present the results of a dataframe in a serial scale using ggplot as in the example attached?

I have this data frame :
Raw.Score = c(0,1,2,3,4,5,6,7,8)
Severity = c(-3.56553994,-2.70296933,-1.63969850,-0.81321707,-0.04629182,
0.73721320,1.61278518,2.76647043,3.94804472)
x = data.frame(Raw.Score = Raw.Score, Severity = Severity)
Raw.score are raw numbers from 0 to 8 (let's consider them as the labels of the severity numbers)
Severity are relative numbres that represent the locations of the scores in the diagram
I want to graphically present the results as in the following example using ggplot (the example includes different numbers but I want something similar)
As a fun exercise in ggplot-ing here is one approach to achieve or come close to your desired result.
Raw.Score = c(0,1,2,3,4,5,6,7,8)
Severity = c(-3.56553994,-2.70296933,-1.63969850,-0.81321707,-0.04629182,
0.73721320,1.61278518,2.76647043,3.94804472)
dat <- data.frame(Raw.Score, Severity)
library(ggplot2)
dat_tile <- data.frame(
Severity = seq(-4.1, 4.1, .05)
)
dat_axis <- data.frame(
Severity = seq(-4, 4, 2)
)
tile_height = .15
ymax <- .5
ggplot(dat, aes(y = 0, x = Severity, fill = Severity)) +
# Axis line
geom_hline(yintercept = -tile_height / 2) +
# Colorbar
geom_tile(data = dat_tile, aes(color = Severity), height = tile_height) +
# Sgements connecting top and bottom labels
geom_segment(aes(xend = Severity, yend = -ymax, y = ymax), color = "orange") +
# Axis ticks aka dots
geom_point(data = dat_axis,
y = -tile_height / 2, shape = 21, stroke = 1, fill = "white") +
# ... and labels
geom_text(data = dat_axis, aes(label = Severity),
y = -tile_height / 2 - .1, vjust = 1, fontface = "bold") +
# Bottom labels
geom_label(aes(y = -ymax, label = scales::number(Severity, accuracy = .01))) +
# Top labels
geom_point(aes(y = ymax, color = Severity), size = 8) +
geom_text(aes(y = ymax, label = Raw.Score), fontface = "bold") +
# Colorbar annotations
annotate(geom = "text", fontface = "bold", label = "MILD", color = "black", x = -3.75, y = 0) +
annotate(geom = "text", fontface = "bold", label = "SEVERE", color = "white", x = 3.75, y = 0) +
# Fixing the scales
scale_x_continuous(expand = c(0, 0)) +
scale_y_continuous(limits = c(-ymax, ymax)) +
# Color gradient
scale_fill_gradient(low = "orange", high = "red", guide = "none") +
scale_color_gradient(low = "orange", high = "red", guide = "none") +
# Get rid of all non-data ink
theme_void() +
# Add some plot margin
theme(plot.margin = rep(unit(10, "pt"), 4)) +
coord_cartesian(clip = "off")

Rstudio Bland Altman grouped colours and shapes

I have a bland-altman plot of 16 measurements divided over 3 groups (Slice) which I want to colorcode and possibly have different shapes but somehow I cant get it working:
df <- data.frame("Slice" = c(1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3),
"Segments" = c(1:16),
"mean" = c(6,5,2,4,8,9,6,3,5,6,5,8,5,4,6,4),
"dif" = c(1,3,2,1,2,3,2,1,2,2,2,1,3,2,1,2))
#creat limits of agreement
LL = mean(df$dif)-1.96*(sd(df$dif))
UL = mean(df$dif)+1.96*(sd(df$dif))
#create BA plot
BAplot <- ggplot(df, aes(x=mean,y=dif))+
geom_jitter(alpha=1.0,size=18,shape="*", stroke = 1.5)+
geom_hline(yintercept=mean(df$dif),color= "blue",size=2)+
geom_text(aes(x = 12, y = mean(df$dif)+0.2, label = round(mean(df$dif), 1)), col = "blue", size = 7) +
geom_hline(yintercept=0,linetype=3,size=2) +
geom_hline(yintercept=c(UL,LL),color="black",linetype="dashed",size=2)+theme_bw()+
geom_text(aes(x = 12, y = UL+0.2, label = round(UL,1)), col = "black", size = 7) +
geom_text(aes(x = 12, y = LL+0.2, label = round(LL,1)), col = "black", size = 7) +
scale_x_continuous("mean",limits = c(-2,12))+
scale_y_continuous("diff", limits = c(-1, 5.5))
To code your points by color and to have different shapes you have to map your Slice column on the color and/or shape aesthetic inside geom_jitter. As Slice is a numeric I first converted it to a factor. If you want specific colors or shape you could set your desired values using scale_color_manual and scale_shape_manual:
library(ggplot2)
ggplot(df, aes(x = mean, y = dif)) +
geom_jitter(aes(color = factor(Slice), shape = factor(Slice)), alpha = 1.0, size = 2) +
geom_hline(yintercept = mean(df$dif), color = "blue", size = 2) +
geom_text(aes(x = 12, y = mean(dif) + 0.2, label = round(mean(dif), 1)), col = "blue", size = 7) +
geom_hline(yintercept = 0, linetype = 3, size = 2) +
geom_hline(yintercept = c(UL, LL), color = "black", linetype = "dashed", size = 2) +
theme_bw() +
geom_text(aes(x = 12, y = UL + 0.2, label = round(UL, 1)), col = "black", size = 7) +
geom_text(aes(x = 12, y = LL + 0.2, label = round(LL, 1)), col = "black", size = 7) +
scale_x_continuous("mean", limits = c(-2, 12)) +
scale_y_continuous("diff", limits = c(-1, 5.5))

ggplot2 legend not working/ add manual legend

I'm new to R. Legends and plotting seem to be more difficult than in Python. How can I change the graph to display each node as a different color in the legend? Now I have something like the picture.
Thank you for your help.
library(ggplot2)
library(MASS)
library(car)
library(robustbase)
data("airquality")
# Select only Ozone and Temp variables
air = airquality[c("Ozone" , "Temp")]
# We need to remove NA from data set
air = na.omit(air)
air.center = colMeans(air)
air.cov = cov(air)
rad = sqrt(qchisq(p = 0.95 , df = ncol(air)))
ellipse <- ellipse(center = air.center , shape = air.cov , radius = rad ,
segments = 150 , draw = FALSE)
ellipse <- as.data.frame(ellipse)
colnames(ellipse) <- colnames(air)
# Finding distances
distances <- mahalanobis(x = air , center = air.center , cov = air.cov)
# Cutoff value for ditances from Chi-Sqaure Dist.
# with p = 0.95 df = 2 which in ncol(air)
cutoff <- qchisq(p = 0.95 , df = ncol(air))
### Minimum Covariance Determinant (MCD)
Y_mcd <- covMcd(air)
# Robust estimate of location
Y_mcd$center
# Robust estimate of scatter
Y_mcd$cov
# Make elilipse
ellipse_mcd <- data.frame(ellipse(center = Y_mcd$center,
shape = Y_mcd$cov,
radius= rad,
segments=100,draw=FALSE))
#the same names as in previous plot
colnames(ellipse_mcd) <- colnames(air)
plot_fig <- ggplot(air , aes(x = Ozone , y = Temp)) +
geom_point(size = 2) +
geom_polygon(data = ellipse , fill = "blue" , color = "blue" , alpha = 0.5,show.legend =T)+
geom_point(aes(air.center[1] , air.center[2],fill='Mahalanobis') , size = 5 , color = "blue") +
geom_text(data=subset(air, distances > cutoff),
aes(Ozone,Temp,label=row.names(air[distances > cutoff ,])), hjust = 1 , vjust = -1.5 ,size = 3.5)+
ylab("Temperature Values") + xlab("Ozone Values")+ggtitle("Mahalanobis distance")+ theme(
legend.position = c(0.95, 0.15),
legend.justification = c("right", "top")
) + geom_polygon(data=ellipse_mcd,aes(x = Ozone,y = Temp, colour='LINE2'), color="red", fill="red",
alpha=0.3, inherit.aes = FALSE) +
geom_point(aes(x = Y_mcd$center[1], y = Y_mcd$center[2],fill='MCD'),
color = "red", size = 6)
plot_fig
The issue is that you mapped on the fill aesthetic and set the color of the points as arguments. Instead map the labels on the color aes and set the color values via scale_color_manual:
plot_fig <- ggplot(air , aes(x = Ozone , y = Temp)) +
geom_point(size = 2) +
geom_polygon(data = ellipse , fill = "blue" , color = "blue" , alpha = 0.5, show.legend =T)+
geom_point(aes(air.center[1] , air.center[2], color = 'Mahalanobis'), size = 5) +
geom_text(data=subset(air, distances > cutoff),
aes(Ozone,Temp,label=row.names(air[distances > cutoff ,])), hjust = 1 , vjust = -1.5 ,size = 3.5)+
ylab("Temperature Values") +
xlab("Ozone Values")+
ggtitle("Mahalanobis distance")+
theme(
legend.position = c(0.95, 0.15),
legend.justification = c("right", "top")
) +
geom_polygon(data=ellipse_mcd,aes(x = Ozone,y = Temp, colour='LINE2'), color="red", fill="red",
alpha=0.3, inherit.aes = FALSE) +
geom_point(aes(x = Y_mcd$center[1], y = Y_mcd$center[2], color='MCD'), size = 6) +
scale_color_manual(values = c(MCD = "red", Mahalanobis = "blue"))
plot_fig
If you map the Mahalanobis/MCD categorical variables to colour, then let fill be dependent on the mapped colour, the legend should sort itself out naturally and you can set the colours with scale_colour_manual().
ggplot(air, aes(Ozone, Temp)) +
geom_point(size = 2) +
geom_polygon(
data = ellipse,
aes(fill = after_scale(alpha(colour, 0.5)), colour = "Mahalanobis")
) +
geom_polygon(data = ellipse_mcd,
aes(fill = after_scale(alpha(colour, 0.3)), colour = "MCD")) +
geom_point(aes(air.center[1], air.center[2], colour = "Mahalanobis"),
size = 5) +
geom_point(aes(Y_mcd$center[1], Y_mcd$center[2], colour = "MCD"), size = 5) +
geom_text(data = subset(air, distances > cutoff),
aes(label = row.names(air[distances > cutoff, ])),
hjust = 1, vjust = -1.5, size = 3.5) +
scale_colour_manual(values = c("blue", "red")) +
labs(x = "Ozone Values", y = "Temperature values",
title = "Mahalanobis distance") +
theme(legend.position = c(0.95, 0.15),
legend.justification = c("right", "top"))

annotate ggplot with discrete axis (w/ reproducible example)

I'm struggling to find a straightforward solution to fix my plot. The problem stems down to the discrete nature of the x-axis. I want to annotate the plot with text and segments in order to show statistical results.
1) I want to print the p-value between "Baby" and "Queen" as well as between "Queen" and "Worker", but ggplot only allows to annotate above each label, not between them.
2) Similarly, I want the first two geom_segments to be separated, but ggplot won't let me end the first one at something like "Queen"-0.1 and start the second one at "Queen"+0.1 as it is mixing factors and numbers.
Fully reproducible example below, with issues on line 12, 13 and 18:
data <- data.frame(Group.1 = rep(c("A","B"),3),Group.2 = c("Baby","Baby","Worker","Worker","Queen","Queen"),
value = c(0.18,0.30,0.09,0.25,-0.26,-0.55))
boxplot_candidates <- ggplot(aes(y=value,x=Group.2,fill=Group.2),data= data) + theme_bw() +
scale_fill_manual(values=c("lightgreen","darkgreen","goldenrod1"),name="") +
theme(plot.title = element_text(face="bold", size=18, hjust=0)) +
labs(x="",y="Transcript expression\n(log2-centered TMM-nornalised TPMs)") +
theme(plot.title=element_text(size=18, vjust=2),legend.position="", legend.text=element_text(size=14),
axis.text.x = element_text(size = 14, colour = "black"),
axis.text.y = element_text(size = 14, colour = "black"),
axis.title.y=element_text(size = 14, colour = "black",vjust=1),
axis.title.x=element_text(size = 14, colour = "black")) +
geom_segment(aes(x="Baby",xend="Queen",y=0.7,yend=0.7)) + ##### MAKE XEND SMALLER
geom_segment(aes(x="Queen",xend="Worker",y=0.7,yend=0.7)) + ##### MAKE XEND LARGER
geom_segment(aes(x="Baby",xend="Worker",y=1.2,yend=1.2)) +
ylim(-1.5,1.5) + stat_boxplot(geom ='errorbar') +
geom_boxplot(notch=F,outlier.shape=NA) +
geom_point(size=2,position = position_jitter(width = 0.2)) + stat_summary(fun.y=mean, colour = "white",geom="point", size=4) +
annotate("text", x = as.factor(unique(data$Group.2)),y=c(0.8,0.8,1.3),
label = c("p < 0.001","p < 0.001","p = 0.89"),family="",fontface = 3,size=4) ##### PRINT "p < 0.001" BETWEEN LABELS
print(boxplot_candidates)
Categorical variables are simply placed at locations 1, 2, 3, etc. If you want to reach locations between two categorical variables, you can use coordinates such as 1.2 or 1.5 etc.
Here is a reproducible example with all the irrelevant theme code stripped out:
data <- data.frame(Group.1 = rep(c("A", "B"), 3),
Group.2 = c("Baby", "Baby", "Worker", "Worker", "Queen", "Queen"),
value = c(0.18, 0.30, 0.09, 0.25, -0.26, -0.55))
ggplot(data, aes(y = value, x = Group.2, fill = Group.2)) +
stat_boxplot(geom = 'errorbar') +
geom_boxplot(notch = F, outlier.shape = NA) +
geom_segment(aes(x=1.1, xend=1.9, y=0.7, yend=0.7)) +
geom_segment(aes(x=2.1, xend=2.9, y=0.7, yend=0.7)) +
geom_segment(aes(x=1.1, xend=2.9, y=1.2, yend=1.2)) +
geom_point(size = 2, position = position_jitter(width = 0.2)) +
stat_summary(fun.y = mean, colour = "white", geom = "point", size = 4) +
annotate("text",
x = c(1.5, 2.5, 2),
y = c(0.8, 0.8, 1.3),
label = c("p < 0.001", "p < 0.001", "p = 0.89"),
family = "", fontface = 3, size=4) +
scale_fill_manual(values=c("lightgreen", "darkgreen", "goldenrod1"),
guide = "none") +
ylim(-1.5, 1.5) +
labs(x="", y="Transcript expression\n(log2-centered TMM-nornalised TPMs)") +
theme_bw()

Applying log scale to y-axis for visualizing proportions with ggplot2

I am attempting to recreate some plots from a research article in R and am running into an issue with applying a log scale to y axis. The visualization I'm attempting to recreate is this:
reference plot with y log scale
I currently have a working version without the logarithmic scale applied to the y-axis:
Proportion_Mean_Plot <- ggplot(proportions, aes(days2,
proportion_mean, group = observation)) +
geom_point(aes(shape = observation)) +
geom_line() +
scale_x_continuous(breaks = seq(0,335,20)) +
scale_y_continuous(breaks = seq(0,6,.5)) +
theme_tufte() +
geom_rangeframe() +
theme(legend.position="none") +
theme(axis.line.x = element_line(colour = "black", size = 0.5, linetype = 1),
axis.line.y = element_line(colour = "black", size = 0.5, linetype = 1)) +
labs(title = "Proportion of Baseline Mean",
subtitle = "Daily steps within each intervention phase",
x = "DAYS",
y = "PROPORTION OF BASELINE \n(MEAN)") +
geom_vline(xintercept = 164.5) +
geom_hline(yintercept = 1) +
annotate("text", x = c(82, 246), y = 5,
label = c("Intervention 1", "Intervention 2")) +
geom_segment(aes(x = 0, y = mean, xend = end, yend = mean),
data = proportion_intervention1_data) +
geom_segment(aes(x = start, y = mean, xend = end, yend = mean),
data = proportion_intervention2_data, linetype = 4)
This produces a decent representation of the original:
normally scaled y-axis plot
I would like to try to apply that logarithmic scaling to more closely match it. Any help is appreciated.
As per Richard's suggestion, here is a quick example how you can use scale_y_log10:
suppressPackageStartupMessages(library(tidyverse))
set.seed(123)
# generate some data
proportions <- tibble(interv_1 = pmax(0.4, rnorm(160, mean = 1.3, sd = 0.2)),
interv_2 = pmax(0.01, rnorm(160, mean = 1.6, sd = 0.5)))
proportions <- proportions %>%
gather(key = observation, value = proportion_mean) %>%
mutate(days2 = 1:320)
# create the plot
ggplot(proportions, aes(days2, proportion_mean, group = observation)) +
geom_point(aes(shape = observation)) +
geom_line() +
scale_x_continuous(breaks = seq(0,335,20), expand = c(0, 0)) +
scale_y_log10(breaks = c( 0.1, 0.5, 1, 2, 3, 4, 5), limits = c(0.1, 5)) +
# theme_tufte() +
# geom_rangeframe() +
theme(legend.position="none") +
theme(axis.line.x = element_line(colour = "black", size = 0.5, linetype = 1),
axis.line.y = element_line(colour = "black", size = 0.5, linetype = 1)) +
labs(title = "Proportion of Baseline Mean",
subtitle = "Daily steps within each intervention phase",
x = "DAYS",
y = "PROPORTION OF BASELINE \n(MEAN)") +
geom_vline(xintercept = 164.5) +
geom_hline(yintercept = 1) +
annotate("text", x = c(82, 246), y = 5,
label = c("Intervention 1", "Intervention 2")) +
# plugged the values for the means of the two distributions
geom_segment(aes(x = 0, y = 1.3, xend = 164.5, yend = 1.3)) +
geom_segment(aes(x = 164.5, y = 1.6, xend = 320, yend = 1.6), linetype = 4)

Resources