Change legend and shape in ggbiplot pca - r

can you please help me with my pca? I would like to change the shapes in that way that each species has a different color and all 2-3 organisms for each species have 2-3 different symbols. It should look like this: enter image description here
So far I tried the following code:
setwd("~/Schwarze Johannisbeeren/SJ Wein mit nicht Sc/PCA/stackoverflow frage")
results = read.csv("results.csv", sep = ";", encoding = "UTF-8", header=TRUE, check.names=FALSE)
results.pca <- prcomp(results[,c(3:7)],
center = TRUE,
scale. = TRUE)
#grouping by organism
results.organism <- results[, 1]
#by species
results.species <- results[, 2]
summary(results.pca)
library(ggplot2)
library(ggbiplot)
ggbiplot(results.pca, alpha=0, obs.scale = 1, var.scale = 1 ,ellipse = TRUE,ellipse.prob=0.68, circle = F, varname.size=0,
var.axes = F, groups=results$species) +
theme_bw()+
geom_point(aes( colour=factor(results.species)), size=2)+
scale_shape_manual(values= c("Mt1"= 1, "Mt2" =2, "Al1"= 1, "Al2" =2, "Bg1" =1, "Bg2"=2, "Bg3" =3, "Cs1"= 1, "Cs2" =2, "Cs3" =3, "Df1"= 1, "Df2" =2, "Df3" =3))+
#scale_color_brewer(name= "organism", type = "qual", palette = 2)+
#scale_x_continuous (limits = c (-1, 9))+
theme(axis.text.x = element_text(size = 12, colour = "black", vjust = 0.5, hjust = 1, face= "bold"),
axis.title.y = element_text(size = 12, face = "bold"),
axis.title.x = element_text(size = 12, face = "bold"),
axis.text.y = element_text(colour = "black", size = 12, face = "bold"))
and that´s my data
> results
organism species lactones cyanides alcohols ethers acids
1 Mt1 Mt 23435.167 166.4 137653.9 4040.1 1131.52
2 Mt1 Mt 23303.111 168.9 153511.0 4529.1 1148.52
3 Mt1 Mt 22340.556 176.6 150719.9 3255.8 1200.88
4 Mt2 Mt 51519.222 175.9 173401.1 3890.1 1196.12
5 Mt2 Mt 48824.500 166.5 171614.4 3694.1 1132.20
6 Mt2 Mt 50427.278 165.4 168865.1 3693.2 1124.72
7 Al1 Al 25260.222 162.0 211737.4 9563.9 1101.60
8 Al1 Al 23177.556 161.5 199886.7 10403.3 1098.20
9 Al1 Al 27903.000 156.2 240088.4 11897.1 1062.16
10 Al2 Al 5993.722 180.4 289334.9 6673.3 1226.72
11 Al2 Al 7307.389 169.7 275631.1 8333.4 1153.96
12 Al2 Al 9419.167 147.5 277924.5 9622.2 1003.00
13 Bg1 Bg 58216.944 132.4 92275.3 4099.5 900.32
14 Bg1 Bg 69860.222 147.4 105654.9 4080.6 1002.32
15 Bg1 Bg 72809.333 145.8 111731.3 4014.6 991.44
16 Bg2 Bg 51584.611 142.9 105548.2 6450.1 971.72
17 Bg2 Bg 57738.056 141.2 117728.9 6332.4 960.16
18 Bg2 Bg 53356.056 142.7 110260.2 6506.2 970.36
19 Bg3 Bg 41983.389 130.8 103799.4 4781.8 889.44
20 Bg3 Bg 46930.722 148.3 113944.6 5151.6 1008.44
21 Bg3 Bg 49487.611 139.4 121976.5 5318.3 947.92
22 Cs1 Cs 7155.056 161.6 221538.8 8356.0 1098.88
23 Cs1 Cs 8153.611 151.0 179823.0 7961.2 1026.80
24 Cs1 Cs 7445.722 168.6 176978.0 8196.5 1146.48
25 Cs2 Cs 10771.556 126.4 144314.1 8634.6 859.52
26 Cs2 Cs 12239.556 142.6 142913.7 9471.9 969.68
27 Cs2 Cs 13788.611 136.1 131506.7 9390.4 925.48
28 Cs3 Cs 12082.111 152.0 171730.0 6259.6 1033.60
29 Cs3 Cs 14331.556 143.3 141748.7 7532.8 974.44
30 Cs3 Cs 14123.056 158.2 150303.0 7755.8 1075.76
31 Df1 Df 26906.778 156.2 310203.9 5505.5 1062.16
32 Df1 Df 20689.111 163.5 214322.9 5315.6 1111.80
33 Df1 Df 22872.722 154.1 197572.9 4627.7 1047.88
34 Df2 Df 18838.222 159.2 125167.6 12372.9 1082.56
35 Df2 Df 18218.667 155.8 127077.2 11182.0 1059.44
36 Df2 Df 18545.389 156.2 154400.4 10543.6 1062.16
37 Df3 Df 19924.111 156.4 199472.6 4452.3 1063.52
38 Df3 Df 22504.056 158.0 196343.0 3994.1 1074.40
39 Df3 Df 16907.278 151.5 185052.9 4084.6 1030.20
>
By the way, is it possible to have only PC1( x %) instead of PC1(x % explained var.) for the axis labeling?

One approach to achieve your desired result would be to first create shape and color palettes which map organism names to shapes and colors. Second, inside your geom_point extend the data by adding a column with the organism for which I use dplyr::bind_cols. Doing so allows to map the organism on the shape and the color aes. Finally, get rid of the color legend for the groups using scale_color_discrete(guide = "none") and add a second color scale via ggnewscale::new_scale_color and a scale_color_manual:
Note: Easy fix for the axis titles would be to set them manually using +labs(x = ..., y = ...).
library(ggplot2)
library(ggbiplot)
pal_shape <- gsub("^.*?(.)$", "\\1", results$organism)
pal_shape <- scales::shape_pal()(3)[as.integer(pal_shape)]
names(pal_shape) <- results$organism
pal_color <- gsub("^(.*?).$", "\\1", results$organism)
pal_color <- setNames(scales::hue_pal()(5), sort(unique(results$species)))[pal_color]
names(pal_color) <- results$organism
ggbiplot(results.pca,
alpha = 0, obs.scale = 1, var.scale = 1, ellipse = TRUE, ellipse.prob = 0.68, circle = F, varname.size = 0,
var.axes = F, groups = results$species
) +
scale_color_discrete(guide = "none") +
ggnewscale::new_scale_color() +
geom_point(data = ~ dplyr::bind_cols(.x, organism = results$organism),
aes(shape = organism, colour = organism),
size = 2) +
scale_shape_manual(values = pal_shape) +
scale_color_manual(values = pal_color) +
theme_bw() +
theme(
axis.text.x = element_text(size = 12, colour = "black", vjust = 0.5, hjust = 1, face = "bold"),
axis.title.y = element_text(size = 12, face = "bold"),
axis.title.x = element_text(size = 12, face = "bold"),
axis.text.y = element_text(colour = "black", size = 12, face = "bold")
)
DATA
results <- structure(list(organism = c(
"Mt1", "Mt1", "Mt1", "Mt2", "Mt2",
"Mt2", "Al1", "Al1", "Al1", "Al2", "Al2", "Al2", "Bg1", "Bg1",
"Bg1", "Bg2", "Bg2", "Bg2", "Bg3", "Bg3", "Bg3", "Cs1", "Cs1",
"Cs1", "Cs2", "Cs2", "Cs2", "Cs3", "Cs3", "Cs3", "Df1", "Df1",
"Df1", "Df2", "Df2", "Df2", "Df3", "Df3", "Df3"
), species = c(
"Mt",
"Mt", "Mt", "Mt", "Mt", "Mt", "Al", "Al", "Al", "Al", "Al", "Al",
"Bg", "Bg", "Bg", "Bg", "Bg", "Bg", "Bg", "Bg", "Bg", "Cs", "Cs",
"Cs", "Cs", "Cs", "Cs", "Cs", "Cs", "Cs", "Df", "Df", "Df", "Df",
"Df", "Df", "Df", "Df", "Df"
), lactones = c(
23435.167, 23303.111,
22340.556, 51519.222, 48824.5, 50427.278, 25260.222, 23177.556,
27903, 5993.722, 7307.389, 9419.167, 58216.944, 69860.222, 72809.333,
51584.611, 57738.056, 53356.056, 41983.389, 46930.722, 49487.611,
7155.056, 8153.611, 7445.722, 10771.556, 12239.556, 13788.611,
12082.111, 14331.556, 14123.056, 26906.778, 20689.111, 22872.722,
18838.222, 18218.667, 18545.389, 19924.111, 22504.056, 16907.278
), cyanides = c(
166.4, 168.9, 176.6, 175.9, 166.5, 165.4, 162,
161.5, 156.2, 180.4, 169.7, 147.5, 132.4, 147.4, 145.8, 142.9,
141.2, 142.7, 130.8, 148.3, 139.4, 161.6, 151, 168.6, 126.4,
142.6, 136.1, 152, 143.3, 158.2, 156.2, 163.5, 154.1, 159.2,
155.8, 156.2, 156.4, 158, 151.5
), alcohols = c(
137653.9, 153511,
150719.9, 173401.1, 171614.4, 168865.1, 211737.4, 199886.7, 240088.4,
289334.9, 275631.1, 277924.5, 92275.3, 105654.9, 111731.3, 105548.2,
117728.9, 110260.2, 103799.4, 113944.6, 121976.5, 221538.8, 179823,
176978, 144314.1, 142913.7, 131506.7, 171730, 141748.7, 150303,
310203.9, 214322.9, 197572.9, 125167.6, 127077.2, 154400.4, 199472.6,
196343, 185052.9
), ethers = c(
4040.1, 4529.1, 3255.8, 3890.1,
3694.1, 3693.2, 9563.9, 10403.3, 11897.1, 6673.3, 8333.4, 9622.2,
4099.5, 4080.6, 4014.6, 6450.1, 6332.4, 6506.2, 4781.8, 5151.6,
5318.3, 8356, 7961.2, 8196.5, 8634.6, 9471.9, 9390.4, 6259.6,
7532.8, 7755.8, 5505.5, 5315.6, 4627.7, 12372.9, 11182, 10543.6,
4452.3, 3994.1, 4084.6
), acids = c(
1131.52, 1148.52, 1200.88,
1196.12, 1132.2, 1124.72, 1101.6, 1098.2, 1062.16, 1226.72, 1153.96,
1003, 900.32, 1002.32, 991.44, 971.72, 960.16, 970.36, 889.44,
1008.44, 947.92, 1098.88, 1026.8, 1146.48, 859.52, 969.68, 925.48,
1033.6, 974.44, 1075.76, 1062.16, 1111.8, 1047.88, 1082.56, 1059.44,
1062.16, 1063.52, 1074.4, 1030.2
)), class = "data.frame", row.names = c(
"1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35",
"36", "37", "38", "39"
))

Related

Overlay 2 dataframes on the same graph using R

I would like to overlay the points from two different dataframes on the same graph but I can't seem to achieve it.
I know that I need to combine the 2 ggplot() but I don't know which method to use.
ggplot(grid) +
ggtitle("top+bot") +
aes(x, y) +
geom_point(shape=15, color = "blue", size = 2, alpha = (grid$z/op)) +
scale_x_continuous(name= "length (m)", breaks=seq(0, max(fullgrid[,"x"]), step)) +
scale_y_continuous(name= "width (mm)", breaks=seq(0, max(fullgrid[,"y"]), 100)) +
expand_limits(x = 0, y = 0) +
coord_cartesian(expand = FALSE) +
theme(plot.title = element_text(size=10, hjust = 0.5), axis.title.x = element_text(size=10), axis.text.x= element_text(angle=30, hjust=1), axis.title.y = element_text(size=10), axis.text.y= element_text(angle=30, hjust=1))
ggplot(def) +
ggtitle("top+bot") +
geom_rect(data = def , aes(xmin = xbegc, xmax = xendc, ymin = ybegc, ymax =yendc ),
alpha = 8)
There are many things wrong in your code. I've boiled down your code to something more essential - to crystallise the problem: The use of a global aesthetic for all geom layers, although not all data frames have this aesthetic (in your case: mod).
I've also used a smaller version of your data and slightly changed your regex.
Smaller comments in the code - I recommend to read.
library(ggplot2)
freq <- structure(list(vlookup = c("Entrevista_final|1|Q3_nova|0|C1|1", "Entrevista_final|1|Q3_nova|0|C1|2", "Entrevista_final|1|Q3_nova|0|C3|1", "Entrevista_final|1|Q3_nova|0|C3|4", "Entrevista_final|1|Q3_nova|0|C3|2", "Entrevista_final|1|Q3_nova|0|C3|3", "Entrevista_final|1|Q3_nova|0|C4_1|2018", "Entrevista_final|1|Q3_nova|0|C4_1|2020", "Entrevista_final|1|Q3_nova|0|C4_1|1993", "Entrevista_final|1|Q3_nova|0|C4_1|2015", "Entrevista_final|1|Q3_nova|0|C4_1|2016", "Entrevista_final|1|Q3_nova|0|C4_1|1996", "Entrevista_final|1|Q3_nova|0|C4_1|99", "Entrevista_final|1|Q3_nova|0|C4_1|2017", "Entrevista_final|1|Q3_nova|0|C4_1|2004", "Entrevista_final|1|Q3_nova|0|C4_1|2019", "Entrevista_final|1|Q3_nova|0|C4_1|2002", "Entrevista_final|1|Q3_nova|0|C4_1|2021", "Entrevista_final|1|Q3_nova|0|C4_2|99", "Entrevista_final|1|Q3_nova|0|C4_2|2018"), cruza1 = c("Entrevista_final", "Entrevista_final", "Entrevista_final", "Entrevista_final", "Entrevista_final", "Entrevista_final", "Entrevista_final", "Entrevista_final", "Entrevista_final", "Entrevista_final", "Entrevista_final", "Entrevista_final", "Entrevista_final", "Entrevista_final", "Entrevista_final", "Entrevista_final", "Entrevista_final", "Entrevista_final", "Entrevista_final", "Entrevista_final"), mod1 = c("1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"), cruza2 = c("Q3_nova", "Q3_nova", "Q3_nova", "Q3_nova", "Q3_nova", "Q3_nova", "Q3_nova", "Q3_nova", "Q3_nova", "Q3_nova", "Q3_nova", "Q3_nova", "Q3_nova", "Q3_nova", "Q3_nova", "Q3_nova", "Q3_nova", "Q3_nova", "Q3_nova", "Q3_nova"), mod2 = c("0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0"), var = c("C1", "C1", "C3", "C3", "C3", "C3", "C4_1", "C4_1", "C4_1", "C4_1", "C4_1", "C4_1", "C4_1", "C4_1", "C4_1", "C4_1", "C4_1", "C4_1", "C4_2", "C4_2"), mod = c("1", "2", "1", "4", "2", "3", "2018", "2020", "1993", "2015", "2016", "1996", "99", "2017", "2004", "2019", "2002", "2021", "99", "2018"), pop_extrapolada = c(22, 17, 32, 3, 3, 1, 4, 10, 1, 1, 2, 1, 2, 2, 1, 4, 1, 10, 2, 3), PERCENT = c(56.4102564102564, 43.5897435897436, 82.051282051282, 7.69230769230769, 7.69230769230769, 2.56410256410256, 10.2564102564103, 25.6410256410256, 2.56410256410256, 2.56410256410256, 5.12820512820513, 2.56410256410256, 5.12820512820513, 5.12820512820513, 2.56410256410256, 10.2564102564103, 2.56410256410256, 25.6410256410256, 15.3846153846154, 23.0769230769231), count = c(22, 17, 32, 3, 3, 1, 4, 10, 1, 1, 2, 1, 2, 2, 1, 4, 1, 10, 2, 3), BA = c(39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 13, 13), StdErr = c(0.0795074974876938, 0.0795074974876938, 0.0615313285518115, 0.0427252055542318, 0.0427252055542318, 0.0253433535020364, 0.0486447864199686, 0.0700118062133025, 0.0253433535020364, 0.0253433535020364, 0.0353661791821834, 0.0253433535020364, 0.0353661791821834, 0.0353661791821834, 0.0253433535020364, 0.0486447864199686, 0.0253433535020364, 0.0700118062133025, 0.100484337629983, 0.117340426124568), LowerCLMean = c(0.407774134748444, 0.279569006543315, 0.699529312635895, 0, 0, 0, 0.00691824151210088, 0.118752349327158, 0, 0, 0, 0, 0, 0, 0, 0.00691824151210088, 0, 0.118752349327158, 0, 0), UpperCLMean = c(0.720430993456685, 0.592225865251556, 0.941496328389746, 0.160929799423163, 0.160929799423163, 0.0754713784093646, 0.198209963616104, 0.394068163493355, 0.0754713784093646, 0.0754713784093646, 0.120819383213079, 0.0754713784093646, 0.120819383213079, 0.120819383213079, 0.0754713784093646, 0.198209963616104, 0.0754713784093646, 0.394068163493355, 0.352798149154327, 0.463095108198593), ME = c(15.6328429354121, 15.6328429354121, 12.0983507876925, 8.40067225000863, 8.40067225000863, 4.9830352768339, 9.56458610520017, 13.7657907083098, 4.9830352768339, 4.9830352768339, 6.95373319310279, 4.9830352768339, 6.95373319310279, 6.95373319310279, 4.9830352768339, 9.56458610520017, 4.9830352768339, 13.7657907083098, NA, NA), StdDev = c(4.55932029745697, 4.03560271072923, 5.42183783041485, 1.72749876078217, 1.72749876078217, 1, 1.99211042289525, 3.1247047104581, 1, 1, 1.41235641966466, 1, 1.41235641966466, 1.41235641966466, 1, 1.99211042289525, 1, 3.1247047104581, 1.40830867828517, 1.71755640373177)), row.names = c(NA, -20L), class = c("tbl_df", "tbl", "data.frame"))
## shortened your regex a bit
freq$esc <- gsub("^(.*C[0-9]).*", "\\1", freq$vlookup)
db <- data.frame(x = 1:3, y = c(5.5, 6.5, 9.7))
## change the filter so that the example works
Q6 <- freq[grepl("Entrevista_final|1", freq$esc), c("var", "mod", "PERCENT")]
## remove fill = mod from main ggplot2 call and put it into your bar plot and in geom_text as group
ggplot(Q6, aes(x = var, y = PERCENT)) +
## use geom_col instead of geom_bar(stat = "identity"), position = "stack" is default
geom_col(aes(fill = factor(x = mod)), color = "red") +
geom_text(aes(group = factor(x = mod), label = sprintf(PERCENT, fmt = "%1.0f")),
position = position_stack(vjust = 0.5)
) +
## use your coordinate data frame and use the correct geom layer syntax
geom_point(data = db, aes(x, y))
Created on 2023-02-16 with reprex v2.0.2

How to order x axis based on y axis values (in decreasing order)

I was trying to make bars in clustered column plot appearing with a decreasing ordered based on values of y axis(pososto).I tryed aes(x=reorder(allele,-pososto,sum(and mean)) but those didn't work.If someone can help it would be perfect.
Thank you in advance!
Part of my data:
df <- structure(list(allele = c("Ak24:02", "Ak24:02", "Ak24:02", "Ak24:02",
"Ak02:01", "Ak02:01", "Ak02:01", "Ak02:01", "Ak01:01", "Ak01:01",
"Ak01:01", "Ak01:01", "Ak11:01", "Ak11:01", "Ak11:01", "Ak11:01",
"Ak03:01", "Ak03:01", "Ak03:01", "Ak03:01", "Ak32:01", "Ak32:01",
"Ak32:01", "Ak32:01", "Ak26:01", "Ak26:01"), subject = c("her",
"reth", "las", "xan", "her", "reth", "las", "xan", "her", "reth",
"las", "xan", "her", "reth", "las", "xan", "her", "reth", "las",
"xan", "her", "reth", "las", "xan", "her", "reth"), pososto = c(6.2076749,
0, 14.3529412, 0, -0.7751938, 0, -0.7751938, 0, 5.0666604, 0,
0, 11.944798, 0, 0, 9.7559913, 6.0631187, -12.4022258, -1.3476446,
13.1983584, 17.1626014, 0, -13.1511297, 14.7583726, -5.6592186,
0, 0)), class = "data.frame", row.names = c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26"
))
My basic code:
df = data.frame(allele, subject, pososto)
df$subject <- factor(df$subject, levels =c("her", "xan", "las", "reth"))
df$pososto = as.numeric(df$pososto)
p=ggplot(df, aes(x = allele, y = pososto, fill = subject)) +
geom_bar(position = "dodge",stat = "identity", aes(fill= subject)) +
#gia labels
# geom_text(aes(label = round(count, 2)),position = position_dodge(width = 0.9),
# vjust = ifelse(df[,3]>=0, -0.5, 1) , size= 3)+
scale_x_discrete(limits = (unique(df$allele))) +
scale_fill_manual(values=c("#696969","#778899","#A9A9A9","#CCCCFF"))+
theme_bw(base_size = 9) +
geom_hline(yintercept = 0, colour = "black") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))strong text`
dataframe is looking like this:
The plot I am taking from basic code:
Maybe this is what you are looking for. To order the bars in decreasing order without grouping by allele you could add a helper variable via interaction which could then be mapped on x.
To only show the allele on the x axis I make use of a custom labels function in scale_x_discrete and use an underscore as separator in interaction.
library(ggplot2)
df$subject <- factor(df$subject, levels =c("her", "xan", "las", "reth"))
df$pososto = as.numeric(df$pososto)
df$x <- interaction(df$allele, df$subject, sep = "_")
ggplot(df, aes(x = reorder(x, -pososto), y = pososto, fill = subject)) +
geom_col(position = "dodge") +
scale_x_discrete(labels = ~ gsub("^(.*?)_.*$", "\\1", .x)) +
scale_fill_manual(values=c("#696969","#778899","#A9A9A9","#CCCCFF"))+
theme_bw(base_size = 9) +
geom_hline(yintercept = 0, colour = "black") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
Are you looking for such a solution?
library(tidyverse)
df %>%
arrange(allele, pososto, subject) %>%
group_by(allele) %>%
mutate(id=row_number()) %>%
mutate(subject = fct_reorder(allele, id)) %>%
ggplot(aes(x = allele, y = pososto, fill=factor(id))) +
geom_col(position=position_dodge())+
scale_x_discrete(limits = (unique(df$allele))) +
theme_bw(base_size = 9) +
geom_hline(yintercept = 0, colour = "black") +
scale_fill_manual(values=c("#696969","#778899","#A9A9A9","#CCCCFF"), labels = c("her", "reth", "las", "xan"))+
theme(axis.text.x = element_text(angle = 60, hjust = 1))
data:
df <- structure(list(allele = c("Ak24:02", "Ak24:02", "Ak24:02", "Ak24:02",
"Ak02:01", "Ak02:01", "Ak02:01", "Ak02:01", "Ak01:01", "Ak01:01",
"Ak01:01", "Ak01:01", "Ak11:01", "Ak11:01", "Ak11:01", "Ak11:01",
"Ak03:01", "Ak03:01", "Ak03:01", "Ak03:01", "Ak32:01", "Ak32:01",
"Ak32:01", "Ak32:01"), subject = c("her", "reth", "las", "xan",
"her", "reth", "las", "xan", "her", "reth", "las", "xan", "her",
"reth", "las", "xan", "her", "reth", "las", "xan", "her", "reth",
"las", "xan"), pososto = c(6.2076749, 0, 14.3529412, 0, -0.7751938,
0, -0.7751938, 0, 5.0666604, 0, 0, 11.944798, 0, 0, 9.7559913,
6.0631187, -12.4022258, -1.3476446, 13.1983584, 17.1626014, 0,
-13.1511297, 14.7583726, -5.6592186)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24"
))

Can we color the different rows/covariates/studies in different colors in R forest plots?

Using the forestplot package in the programming language R, I would like to make a forest plot that has each row in a different color. By each row, I mean the boxes and the respective confidence intervals.
Taking an example from the vignette [https://cran.r-project.org/web/packages/forestplot/vignettes/forestplot.html],
library(forestplot)
# Cochrane data from the 'rmeta'-package
cochrane_from_rmeta <-
structure(list(
mean = c(NA, NA, 0.578, 0.165, 0.246, 0.700, 0.348, 0.139, 1.017, NA, 0.531),
lower = c(NA, NA, 0.372, 0.018, 0.072, 0.333, 0.083, 0.016, 0.365, NA, 0.386),
upper = c(NA, NA, 0.898, 1.517, 0.833, 1.474, 1.455, 1.209, 2.831, NA, 0.731)),
.Names = c("mean", "lower", "upper"),
row.names = c(NA, -11L),
class = "data.frame")
tabletext <- cbind(c("", "Study", "Auckland", "Block", "Doran", "Gamsu", "Morrison", "Papageorgiou", "Tauesch", NA, "Summary"),
c("Deaths", "(steroid)", "36", "1", "4", "14", "3", "1", "8", NA, NA),
c("Deaths", "(placebo)", "60", "5", "11", "20", "7", "7", "10", NA, NA),
c("", "OR", "0.58", "0.16", "0.25", "0.70", "0.35", "0.14", "1.02", NA, "0.53"))
forestplot(tabletext, cochrane_from_rmeta, new_page = TRUE,
is.summary = c(TRUE, TRUE, rep(FALSE, 8), TRUE),
clip = c(0.1,2.5), xlog = TRUE,
col = fpColors(box="royalblue",line="darkblue", summary="royalblue"))
I want each study to have its own color (Auckland can be colored blue, Block can be colored red, Doran can be colored green, and so on). I think that this might be accomplished by changing the argument to the fpColors() function.
Is there any way to do this?
Take a look at fpShapesGp. With this, it's possible to color the rows in different colors.
An simple example for your code:
styles <- fpShapesGp(
lines = list(
gpar(col = "black"),
gpar(col = "blue"),
gpar(col = "black"),
gpar(col = "blue"),
gpar(col = "black"),
gpar(col = "blue"),
gpar(col = "black"),
gpar(col = "blue"),
gpar(col = "black"),
gpar(col = "blue"),
gpar(col = "black")
),
box = list(
gpar(fill = "black"),
gpar(fill = "blue"),
gpar(fill = "black"),
gpar(fill = "blue"),
gpar(fill = "black"),
gpar(fill = "blue"),
gpar(fill = "black"),
gpar(fill = "blue"),
gpar(fill = "black"),
gpar(fill = "blue"),
gpar(fill = "black")
)
)
forestplot(tabletext, cochrane_from_rmeta, new_page = TRUE,
is.summary = c(TRUE, TRUE, rep(FALSE, 8), TRUE),
clip = c(0.1,2.5), xlog = TRUE,
shapes_gp = styles)
As a result you get this:

Align Text to geom_vline with varying location

I have a function that creates a histogram with an overlying density plot. The function also displays a red dotted line indicating alpha. Users can indicate the alpha level. Moreover, the count in the histogram will differ as a function of the input data. I want a label indicating alpha = 0.05(for example) next to the red dotted line. The label should always be next to the alpha line and always be near the top of the graph (I did not solve that). I´m aware of Align geom_text to a geom_vline in ggplot2, but they do not provide what I´m looking for (and/or produce error messages, I tried to reduce the size of the label by text=element_text(size=11) as suggested there, but that does not work).
Find below some sample code:
multiverse.p.histogram <- function(dataframe, pvalues, alpha = 0.05){
hist <- ggplot(dataframe, aes(x = p.value)) + geom_histogram(binwidth = 0.01, color = "black",fill = "dodgerblue") + theme_bw() + xlim(0,1) + geom_density(alpha = 0.5, fill = "#FF6666") +xlab("p-value") + ggtitle("Histogram of Multiverse P-Values") + geom_vline(xintercept = alpha, color = "red", linetype = "dashed") +
geom_text(aes(x = alpha, y = 75, label = "Alpha"), color = "red") +
theme(
axis.text = element_text(color = "black"),
axis.line = element_line(colour = "black"),
legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
return(hist)
}#close histogram function
#and some sample data
df_multiverse <- structure(list(transformation = c("normal", "normal", "normal",
"normal", "normal", "normal", "normal", "normal", "normal", "normal",
"normal", "normal", "normal", "normal", "normal", "normal", "normal",
"normal", "normal", "normal", "normal", "normal", "normal", "normal",
"normal", "normal", "normal", "normal", "normal", "normal", "normal",
"normal", "normal", "normal", "normal", "normal", "normal", "normal",
"normal", "normal", "normal", "normal", "normal", "normal", "normal",
"normal", "normal", "normal", "normal", "normal"), datatrimming = c("notrimming",
"notrimming", "notrimming", "notrimming", "notrimming", "notrimming",
"notrimming", "notrimming", "notrimming", "notrimming", "notrimming",
"mad", "mad", "mad", "mad", "mad", "mad", "mad", "mad", "mad",
"mad", "mad", "mad", "mad", "mad", "mad", "mad", "mad", "mad",
"mad", "mad", "mad", "mad", "mad", "mad", "mad", "mad", "mad",
"mad", "mad", "mad", "mad", "mad", "mad", "mad", "mad", "mad",
"mad", "mad", "mad"), fixedtrimming = c("min", "min", "min",
"min", "min", "minmax", "minmax", "minmax", "minmax", "minmax",
"nofixedtrimming", "min", "min", "min", "min", "min", "minmax",
"minmax", "minmax", "minmax", "minmax", "nofixedtrimming", "min",
"min", "min", "min", "min", "minmax", "minmax", "minmax", "minmax",
"minmax", "nofixedtrimming", "min", "min", "min", "min", "min",
"minmax", "minmax", "minmax", "minmax", "minmax", "nofixedtrimming",
"min", "min", "min", "min", "min", "minmax"), min = c("0.1",
"0.2", "0.3", "0.4", "0.5", "0.1", "0.2", "0.3", "0.4", "0.5",
NA, "0.1", "0.2", "0.3", "0.4", "0.5", "0.1", "0.2", "0.3", "0.4",
"0.5", NA, "0.1", "0.2", "0.3", "0.4", "0.5", "0.1", "0.2", "0.3",
"0.4", "0.5", NA, "0.1", "0.2", "0.3", "0.4", "0.5", "0.1", "0.2",
"0.3", "0.4", "0.5", NA, "0.1", "0.2", "0.3", "0.4", "0.5", "0.1"
), max = c("4.78103879314337", "4.78103879314337", "4.78103879314337",
"4.78103879314337", "4.78103879314337", "10", "10", "10", "10",
"10", NA, "1.50348972125673", "1.50348972125673", "1.50348972125673",
"1.50348972125673", "1.50348972125673", "10", "10", "10", "10",
"10", NA, "1.6673730851492", "1.6673730851492", "1.6673730851492",
"1.6673730851492", "1.6673730851492", "10", "10", "10", "10",
"10", NA, "1.82875939263309", "1.82875939263309", "1.82875939263309",
"1.82875939263309", "1.82875939263309", "10", "10", "10", "10",
"10", NA, "1.98682907108801", "1.98682907108801", "1.98682907108801",
"1.98682907108801", "1.98682907108801", "10"), DispersionMeasure = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "2", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2.5", "2.5", "2.5", "2.5", "2.5",
"2.5", "2.5", "2.5", "2.5", "2.5", "2.5", "3", "3", "3", "3",
"3", "3", "3", "3", "3", "3", "3", "3.5", "3.5", "3.5", "3.5",
"3.5", "3.5"), df = c(23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23), t.value = c(-1.96240490816673,
-1.91062435558061, -1.88913858576971, -1.50889838134833, -0.584414818091524,
-1.96240490816673, -1.91062435558061, -1.88913858576971, -1.50889838134833,
-0.584414818091524, -2.01035512741752, -2.32446732021548, -2.32446732021548,
-2.25138730178018, -1.75805360848308, -0.671509667928522, -2.32446732021548,
-2.32446732021548, -2.25138730178018, -1.75805360848308, -0.671509667928522,
-2.32446732021548, -2.07781942947361, -2.04327207374561, -1.96398718960439,
-1.45016152484876, -0.43329653628318, -2.07781942947361, -2.04327207374561,
-1.96398718960439, -1.45016152484876, -0.43329653628318, -2.07781942947361,
-3.1795493150037, -3.14621983607465, -3.03987566457514, -2.35519486220697,
-1.34118074962509, -3.1795493150037, -3.14621983607465, -3.03987566457514,
-2.35519486220697, -1.34118074962509, -3.19618807311348, -3.37575126770368,
-3.33582114002809, -3.25737102188504, -2.65364122964845, -1.74520405186558,
-3.37575126770368), p.value = c(0.0619242560601778, 0.0685974542038329,
0.0715464534237802, 0.14494031195569, 0.564630276572904, 0.0619242560601778,
0.0685974542038329, 0.0715464534237802, 0.14494031195569, 0.564630276572904,
0.056262190757649, 0.0292871811194525, 0.0292871811194525, 0.0342153500184824,
0.0920408256371383, 0.508584931329577, 0.0292871811194525, 0.0292871811194525,
0.0342153500184824, 0.0920408256371383, 0.508584931329577, 0.0292871811194525,
0.049074641173751, 0.0526459198825374, 0.0617296734199745, 0.160514579425126,
0.668835951230964, 0.049074641173751, 0.0526459198825374, 0.0617296734199745,
0.160514579425126, 0.668835951230964, 0.049074641173751, 0.00417775230313281,
0.00452298394363368, 0.00581820793330847, 0.0274164539383892,
0.192956766873482, 0.00417775230313281, 0.00452298394363368,
0.00581820793330847, 0.0274164539383892, 0.192956766873482, 0.00401507276581307,
0.00260719926285416, 0.00287129534969705, 0.00346795018735445,
0.0141919615636613, 0.0942977424474807, 0.00260719926285416),
estimate = c(-0.797956867083461, -0.776801900236937, -0.7455698051489,
-0.444049984838546, -0.10530217843728, -0.797956867083461,
-0.776801900236937, -0.7455698051489, -0.444049984838546,
-0.10530217843728, -0.820469748450972, -0.251308805770323,
-0.251308805770323, -0.251096848307402, -0.226028966303428,
-0.134612249858047, -0.251308805770323, -0.251308805770323,
-0.251096848307402, -0.226028966303428, -0.134612249858047,
-0.251308805770323, -0.265907227757688, -0.261504591915461,
-0.260164781545852, -0.225524157517464, -0.10176195202019,
-0.265907227757688, -0.261504591915461, -0.260164781545852,
-0.225524157517464, -0.10176195202019, -0.265907227757688,
-0.409969137221152, -0.405618224033153, -0.409494543344045,
-0.387356945276789, -0.329354185640372, -0.409969137221152,
-0.405618224033153, -0.409494543344045, -0.387356945276789,
-0.329354185640372, -0.422572659021681, -0.506062313897924,
-0.501186805248218, -0.510763602114717, -0.498830153358464,
-0.447892133899374, -0.506062313897924)), row.names = c("df",
"df1", "df2", "df3", "df4", "df5", "df6", "df7", "df8", "df9",
"df10", "df11", "df12", "df13", "df14", "df15", "df16", "df17",
"df18", "df19", "df20", "df21", "df22", "df23", "df24", "df25",
"df26", "df27", "df28", "df29", "df30", "df31", "df32", "df33",
"df34", "df35", "df36", "df37", "df38", "df39", "df40", "df41",
"df42", "df43", "df44", "df45", "df46", "df47", "df48", "df49"
), class = "data.frame")
#execute function
multiverse.p.histogram(df_multiverse, df_multiverse$p.value)
There are two problems with the code:
The alpha does not display next to the line, but on the line and I had to specify y = 75 manually. Ideally, it should always be shortly underneath the upper border. Finally, I can´t get the text size of the alpha to decrease. I tried nudge_x, but that produces the following error: Warnmeldungen:
1: Removed 2 rows containing missing values (geom_bar).
2: Removed 264 rows containing missing values (geom_text).
Does anyone have suggestions?
Thanks already!
Edit:
Based on the answers, here is my updated code:
multiverse.p.histogram <- function(dataframe, pvalues, alpha = 0.05){
ggplot(dataframe, aes(x = p.value)) +
geom_histogram(binwidth = 0.01, color = "black", fill = "dodgerblue") + #plots the histogram
geom_density(alpha = 0.5, fill = "#FF6666") + #adds densityplot
geom_vline(xintercept = alpha, color = "red", linetype = "dashed") + #adds alpha line
geom_text(x = alpha, hjust = -0.5, #adds alpha symbol next to line
y = Inf,
label = expression(paste(alpha)),
color = "red", check_overlap = TRUE,
vjust = "inward") +
ggtitle("Histogram of Multiverse P-Values") +
xlab("p-value") +
theme_bw() +
theme(axis.text = element_text(color = "black"),
axis.line = element_line(colour = "black"),
legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
}
Here's a few tweaks to your function that may help:
Find out where the approximate upper limit of your plot will be by using the base R hist function. Use this as the position for alpha, then set the upper y limit as a small multiple of that to ensure everything fits nicely.
You only need a single alpha label, so don't map the text to an aesthetic. You can use x and y positions directly.
Use hjust to adjust your text position.
It makes your code easier to read and debug if you arrange the plot code so it all fits neatly across a single screen and is in a predictable order (I like ggplot then geoms then scales then lims, then labels then themes, but whatever order works best for you, stick to a consistent scheme.
multiverse.p.histogram <- function(dataframe, pvalues, alpha = 0.05)
{
upper <- max(hist(dataframe$p.value, breaks = seq(0, 1, 0.01))$counts)
ggplot(dataframe, aes(x = p.value)) +
geom_histogram(binwidth = 0.01, color = "black", fill = "dodgerblue") +
geom_density(alpha = 0.5, fill = "#FF6666") +
geom_vline(xintercept = alpha, color = "red", linetype = "dashed") +
geom_text(x = alpha, hjust = -0.25,
y = upper,
label = "Alpha",
color = "red", check_overlap = TRUE) +
coord_cartesian(xlim = c(0, 1)) +
xlim(-0.01, 1) +
ylim(0, upper * 1.1) +
ggtitle("Histogram of Multiverse P-Values") +
xlab("p-value") +
theme_bw() +
theme(axis.text = element_text(color = "black"),
axis.line = element_line(colour = "black"),
legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
}

how to get combined legend in ggplot2 when I have both geom_line and geom_hline

I have data as follows in a file wiki-hier.dat:
1 2 5 10 20
2 80.77 80.76 80.77 80.77 80.77
3 83.06 83.11 83.11 83.11 83.11
4 84.40 84.46 84.43 84.44 84.43
5 85.24 85.31 85.31 85.31 85.31
6 85.92 86.03 86.02 86.02 86.01
7 86.41 86.56 86.55 86.55 86.55
8 86.62 86.76 86.77 86.77 86.77
10 87.13 87.32 87.32 87.32 87.32
12 87.50 87.71 87.68 87.69 NA
14 87.72 87.93 87.94 87.94 87.94
16 87.82 88.08 88.10 88.10 NA
18 87.87 88.15 88.17 88.17 88.17
20 87.93 88.22 88.23 88.23 88.23
24 88.10 88.39 88.40 88.41 88.40
28 88.10 88.46 88.47 88.47 88.47
32 88.12 88.46 88.49 88.49 NA
36 88.16 88.50 88.52 88.52 88.52
40 88.12 88.49 88.50 88.50 88.50
50 88.11 88.48 88.45 88.44 NA
60 87.90 88.26 88.25 88.25 NA
70 87.73 88.05 88.05 88.04 NA
80 87.60 87.89 87.90 87.89 NA
100 87.38 87.66 87.63 87.62 NA
120 87.01 87.23 87.23 87.23 NA
150 86.73 86.91 86.90 86.89 NA
200 86.04 86.14 86.08 86.06 NA
250 85.62 85.59 85.53 85.51 NA
I'm trying to plot this using ggplot2. My code is as follows:
library(ggplot2)
t = read.table("wiki-hier.dat", header=TRUE)
t$x = as.numeric(rownames(t))
data = data.frame(x = rep(t$x, 2), acc = c(t$X1, t$X2),
beam = factor(rep(c(1,2), each=length(t$x))))
pdf("plotlines-wiki-hier-ggplot.pdf")
p = qplot(x, acc, data = data, geom = c("line", "point"),
group = beam, color = beam, shape = beam, linetype = beam) +
xlab("K-d subdivision factor") +
ylab("Acc#161 (pct)") +
geom_hline(aes(yintercept=84.49, linetype="Naive Bayes"), show_guide=TRUE) +
scale_linetype_manual("beam", values = c(1,3,2))
print(p)
dev.off()
The problem is, I get two legends, and I only want one:
This one legend should have colors, shapes and linetypes in it for the keys "1" and "2".
How do I fix this?
The next-best thing would be to have two legends, one containing only the keys "1" and "2" and the other containing only the key "Naive Bayes".
One way is to create another dataframe which you can map the aesthetics to similar to your main data.
#Your data
dat <- structure(list(X1 = c(80.77, 83.06, 84.4, 85.24, 85.92, 86.41,
86.62, 87.13, 87.5, 87.72, 87.82, 87.87, 87.93, 88.1, 88.1, 88.12,
88.16, 88.12, 88.11, 87.9, 87.73, 87.6, 87.38, 87.01, 86.73,
86.04, 85.62), X2 = c(80.76, 83.11, 84.46, 85.31, 86.03, 86.56,
86.76, 87.32, 87.71, 87.93, 88.08, 88.15, 88.22, 88.39, 88.46,
88.46, 88.5, 88.49, 88.48, 88.26, 88.05, 87.89, 87.66, 87.23,
86.91, 86.14, 85.59), X5 = c(80.77, 83.11, 84.43, 85.31, 86.02,
86.55, 86.77, 87.32, 87.68, 87.94, 88.1, 88.17, 88.23, 88.4,
88.47, 88.49, 88.52, 88.5, 88.45, 88.25, 88.05, 87.9, 87.63,
87.23, 86.9, 86.08, 85.53), X10 = c(80.77, 83.11, 84.44, 85.31,
86.02, 86.55, 86.77, 87.32, 87.69, 87.94, 88.1, 88.17, 88.23,
88.41, 88.47, 88.49, 88.52, 88.5, 88.44, 88.25, 88.04, 87.89,
87.62, 87.23, 86.89, 86.06, 85.51), X20 = c(80.77, 83.11, 84.43,
85.31, 86.01, 86.55, 86.77, 87.32, NA, 87.94, NA, 88.17, 88.23,
88.4, 88.47, NA, 88.52, 88.5, NA, NA, NA, NA, NA, NA, NA, NA,
NA)), .Names = c("X1", "X2", "X5", "X10", "X20"), class = "data.frame",
row.names = c("2", "3", "4", "5", "6", "7", "8", "10", "12", "14", "16",
"18", "20", "24", "28", "32", "36", "40", "50", "60", "70", "80", "100",
"120", "150", "200", "250"))
dat$x = as.numeric(rownames(dat))
dat = data.frame(x = rep(dat$x, 2), acc = c(dat$X1, dat$X2),
beam = factor(rep(c(1,2), each=length(dat$x))))
# Create a new dataframe for your horizontal line
newdf <- data.frame(x=c(0,max(dat$x)), acc=84.49, beam='naive')
# or of you want the full horizontal lines
# newdf <- data.frame(x=c(-Inf, Inf), acc=84.49, beam='naive')
Plot
library(ggplot2)
ggplot(dat, aes(x, acc, colour=beam, shape=beam, linetype=beam)) +
geom_point(size=4) +
geom_line() +
geom_line(data=newdf, aes(x,acc)) +
scale_linetype_manual(values =c(1,3,2)) +
scale_shape_manual(values =c(16,17, NA)) +
scale_colour_manual(values =c("red", "blue", "black"))
I used NA to suppress the shape in the naive legend
EDIT
After re-reading perhaps all you need is this
ggplot(dat, aes(x, acc, colour=beam, shape=beam, linetype=beam)) +
geom_point(size=4) +
geom_line() +
geom_hline(aes(yintercept=84.49), linetype="dashed")

Resources