Related
I really like this confusion plot, but I want the values where Freq is zero to be white.
gg %>% ggplot(aes(Prediction, Reference, fill= Freq)) +
geom_tile() + geom_text(aes(label=Freq)) +
scale_fill_gradient(low="#f8766d", high="#00ba38") +
labs(x = "Prediction",y = "Reference")
I tried filtering out the zero Freq, but it looks ugly:
gg %>% dplyr::filter(Freq != 0) %>%
ggplot(aes(Prediction, Reference, fill= Freq)) +
geom_tile() + geom_text(aes(label=Freq)) +
scale_fill_gradient(low="#f8766d", high="#00ba38") +
labs(x = "Prediction",y = "Reference")
Anyone know how I can keep the red/green on the diagonal/off diagonal, but make zeros white?
Here is the data:
gg <- structure(list(Prediction = structure(c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L), .Label = c("0", "1", "2", "3", "4", "5", "6",
"7", "8", "9"), class = "factor"), Reference = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L), .Label = c("0", "1", "2", "3", "4",
"5", "6", "7", "8", "9"), class = "factor"), Freq = c(99L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 97L, 2L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 0L, 1L, 93L, 1L, 2L, 1L, 0L, 1L, 1L, 0L, 0L, 0L,
1L, 96L, 0L, 0L, 0L, 0L, 3L, 0L, 0L, 0L, 3L, 0L, 94L, 0L, 0L,
2L, 0L, 1L, 2L, 0L, 0L, 2L, 1L, 85L, 1L, 0L, 6L, 3L, 0L, 0L,
1L, 0L, 1L, 1L, 95L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 3L, 0L, 0L,
89L, 0L, 8L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 97L, 0L, 1L, 0L,
0L, 1L, 1L, 1L, 0L, 2L, 2L, 92L)), class = "data.frame", row.names = c(NA,
-100L))
That was nice! I'm facing similar problems these days. If you want the 0 to appear, I think there are at least two ways of doing this:
The first one is by manipulating a bit the data and using the parameter na.value="white":
gg %>% mutate(Freq2 = ifelse(Freq == 0,NA,Freq)) %>%
ggplot(aes(Prediction, Reference, fill = Freq2)) +
geom_tile() +
geom_text(aes(label=Freq)) +
scale_fill_gradientn(colours = c("#f8766d", "#00ba38"),na.value="white") +
labs(x = "Prediction",y = "Reference", fill = "Freq")
The second one is by playing with the colours of the scale:
gg %>%
ggplot(aes(Prediction, Reference, fill = Freq)) +
geom_tile() +
geom_text(aes(label=Freq)) +
scale_fill_gradientn(colours = c("white","#f8766d", "#00ba38"), values = c(0,0.01,1)) +
labs(x = "Prediction",y = "Reference")
This solution has the downside of being quite sensible to the values, in the sense that if your data changes a lot, this won't lead to the same coloring. So the first solution is more robust.
To remove the 0's from the white cells, the best option is to set the label to an empty string in those cases. For exemple, using the first solution:
gg %>% mutate(Freq2 = ifelse(Freq == 0,NA,Freq)) %>%
ggplot(aes(Prediction, Reference, fill = Freq2)) +
geom_tile() +
geom_text(aes(label=ifelse(Freq == 0,"",Freq))) +
scale_fill_gradientn(colours = c("#f8766d", "#00ba38"),na.value="white") +
labs(x = "Prediction",y = "Reference", fill = "Freq")
One option is to use na.value after replacing the 0 to NA
library(ggplot2)
library(dplyr)
gg %>%
mutate(Freq = na_if(Freq, 0)) %>%
ggplot(aes(Prediction, Reference, fill= Freq)) +
geom_tile() +
geom_text(aes(label=Freq)) +
scale_fill_gradient(low="#f8766d", high="#00ba38",
na.value = 'white') +
labs(x = "Prediction",y = "Reference")
-output
If we need the 0 values as well, instead of creating a new column, can change the 0 to NA in fill using .data
ggplot(gg, aes(Prediction, Reference, fill = na_if(.data[["Freq"]], 0))) +
geom_tile() +
geom_text(aes(label=Freq)) +
scale_fill_gradient(low="#f8766d", high="#00ba38",
na.value = 'white') +
labs(x = "Prediction",y = "Reference", fill = "Freq")
I am making two boxplots and want to arrange them beside each other. I have made each of them look like I want when displaying them separately but when I use ggarrange() the colors disappear. This is my code for the plots:
BOX1_data <- read.table(file = "clipboard",
sep = "\t", header=TRUE)
BOX1_data$Diagnosis <- as.factor(BOX1_data$Diagnosis)
BOX1plot <- ggplot(BOX1_data, aes(x=Diagnosis, y=No.Variants, fill= Diagnosis)) + geom_boxplot() +
scale_fill_brewer(palette = "Dark2") +
scale_x_discrete(labels = c("AC\nN=38", "SqCC\nN=15", "SCLC\nN=8", "BL disease\nN=16"))
BOX2_data <- read.table(file = "clipboard",
sep = "\t", header=TRUE)
BOX2_data$Stage <- as.factor(BOX2_data$Stage)
BOX2plot <- ggplot(BOX2_data, aes(x=Stage, y=No.Variants, fill = Stage)) + geom_boxplot(width = 0.4) +
scale_fill_brewer(palette = "Dark2") +
scale_x_discrete(labels = c("Stage I-III\nN=24", "Stage IV\nN=37"))
To arrange the plots I then write:
BOX_list <- list(BOX1plot, BOX2plot)
ggarrange(plotlist = BOX_list, labels = c('A', 'B'), ncol = 2)
The easiest way of getting rid of gridlines etc I thought was by using theme_set() and I think that this might be my problem.
My code is:
theme_set(theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), panel.background = element_blank(),
axis.line = element_line(colour = "grey")))
I realize that theme_bw() overwrites my colors in the boxes. But I have tried removing it, switching it for theme_transparent() (this removes all my labels) and neither works. I have searched for a way of just adding a transparency to my boxes in the theme so that my colors will shine through. I am also suspicious that maybe the palette that I chose might give me the same colors in the two plots which I also do not want. To add, if it matters, I have 4 groups in the first plot and 2 in the second.
dput(BOX1_data)
structure(list(Diagnosis = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("1", "2", "3", "4"), class = "factor"),
No.Variants = c(3L, 4L, 6L, 14L, 3L, 3L, 4L, 3L, 3L, 3L,
8L, 6L, 22L, 10L, 6L, 9L, 1L, 9L, 3L, 4L, 8L, 2L, 13L, 3L,
11L, 19L, 5L, 5L, 3L, 12L, 4L, 2L, 4L, 18L, 8L, 7L, 7L, 12L,
4L, 1L, 6L, 3L, 2L, 8L, 10L, 3L, 15L, 9L, 13L, 13L, 15L,
10L, 10L, 12L, 6L, 3L, 12L, 9L, 15L, 10L, 18L, 3L, 6L, 3L,
6L, 1L, 3L, 3L, 7L, 1L, 2L, 10L, 7L, 7L, 1L, 0L, 2L)), row.names = c(NA,
-77L), class = "data.frame")
dput(BOX2_data)
structure(list(No.Variants = c(3L, 4L, 6L, 14L, 3L, 3L, 4L, 3L,
3L, 3L, 8L, 6L, 22L, 10L, 6L, 9L, 1L, 9L, 3L, 4L, 8L, 2L, 13L,
3L, 11L, 19L, 5L, 5L, 3L, 12L, 4L, 2L, 4L, 18L, 8L, 7L, 7L, 12L,
4L, 1L, 6L, 3L, 2L, 8L, 10L, 3L, 15L, 9L, 13L, 13L, 15L, 10L,
10L, 12L, 6L, 3L, 12L, 9L, 15L, 10L, 18L), Stage = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("1",
"2"), class = "factor")), row.names = c(NA, -61L), class = "data.frame")
Grateful for any tips!
As already pointed out, it seems the OP's issue with theme_set() removing the fill colors set in your two plots was solved by updating to a new version of ggplot2. Herein, I have a solution for the second part of OP's question (that was clarified in the comments). Represented here for convenience:
Now it is just the problem that I want the palette to continue on the second plot's boxes and not restart so that I will get different colors on all boxes.
In order to do this, one has to realize that there are 4 fill colors for the first plot BOX1plot, and 2 fill colors for BOX2plot. For BOX1plot, we want the color palette to begin at the first color, but for BOX2plot, we want the palette to start on the 5th color sequence in the palette. There's no way to do this through the scale_*_brewer() functions, so the approach here will be to access the Brewer palette from RcolorBrewer::brewer.pal(), and then assign where to begin and end in that sequence based on the number of levels of each factor using scale_fill_manual() to just set the color values from the extracted Brewer color palette.
You can just "know" that you need to "use colors 1-4" for BOX1plot and "use color 5 and 6" for BOX2plot; however, it is much more elegant to just calculate this automatically based on the number of levels (in case you want to run this again). The code below does this:
library(ggplot2)
library(ggpubr)
library(RColorBrewer)
# ... read in your data as before
# create factors (as OP did before)
BOX1_data$Diagnosis <- as.factor(BOX1_data$Diagnosis)
BOX2_data$Stage <- as.factor(BOX2_data$Stage)
# make color palette based on Brewer "Dark2" palette
lev_diag <- length(levels(BOX1_data$Diagnosis))
lev_stage <- length(levels(BOX2_data$Stage))
lev_total <- lev_diag + lev_stage
my_colors <- brewer.pal(lev_total, "Dark2")
BOX1plot <- ggplot(BOX1_data, aes(x=Diagnosis, y=No.Variants, fill= Diagnosis)) + geom_boxplot() +
scale_fill_manual(values=my_colors[1:lev_diag]) +
scale_x_discrete(labels = c("AC\nN=38", "SqCC\nN=15", "SCLC\nN=8", "BL disease\nN=16"))
BOX2plot <- ggplot(BOX2_data, aes(x=Stage, y=No.Variants, fill = Stage)) + geom_boxplot(width = 0.4) +
scale_fill_manual(values = my_colors[(lev_diag+1):lev_total]) +
scale_x_discrete(labels = c("Stage I-III\nN=24", "Stage IV\nN=37"))
BOX_list <- list(BOX1plot, BOX2plot)
ggarrange(plotlist = BOX_list, labels = c('A', 'B'), ncol = 2)
If you have issues with ggarrange() I would suggest next approach using patchwork:
library(ggplot2)
library(patchwork)
#Data format
BOX1_data$Diagnosis <- as.factor(BOX1_data$Diagnosis)
#Plot 1
BOX1plot <- ggplot(BOX1_data, aes(x=Diagnosis, y=No.Variants, fill= Diagnosis)) + geom_boxplot() +
scale_fill_brewer(palette = "Dark2") +
scale_x_discrete(labels = c("AC\nN=38", "SqCC\nN=15", "SCLC\nN=8", "BL disease\nN=16"))
#Data format
BOX2_data$Stage <- as.factor(BOX2_data$Stage)
#Plot 2
BOX2plot <- ggplot(BOX2_data, aes(x=Stage, y=No.Variants, fill = Stage)) + geom_boxplot(width = 0.4) +
scale_fill_brewer(palette = "Dark2") +
scale_x_discrete(labels = c("Stage I-III\nN=24", "Stage IV\nN=37"))
#Arrange plots
BOX1plot+BOX2plot+plot_annotation(tag_levels = 'A')
The output:
I would like to change the order of my legend, and not to display them in alphabetical order as you can see below. I would like to have
"NONE","LIGHT","MEDIUM","HEAVY","V_COLD","COLD","MEDIUM","HOT".
Is it possible? I tried with several arguments but without success.
Below, my table :
structure(list(SOUNAME = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "BALLYSHANNON (CATHLEENS FALL)", class = "factor"),
year_month = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L,
6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L), .Label = c("2013-03",
"2013-04", "2013-05", "2013-06", "2013-07", "2013-08", "2013-09",
"2013-10", "2013-12"), class = "factor"), pre_type = structure(c(4L,
1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L,
3L, 4L, 1L, 2L, 3L), .Label = c("HEAVY", "LIGHT", "MEDIUM",
"NONE"), class = "factor"), pre_value = c(13L, 2L, 11L, 5L,
9L, 3L, 10L, 7L, 2L, 6L, 13L, 10L, 10L, 1L, 15L, 4L, 16L,
2L, 7L, 5L, 2L, 2L, 17L, 9L, 7L, 3L, 13L, 6L, 5L, 2L, 10L,
14L, 1L, 5L, 19L, 6L), tem_type = structure(c(4L, 3L, 2L,
1L, 4L, 3L, 2L, 1L, 4L, 3L, 2L, 1L, 4L, 3L, 2L, 1L, 4L, 3L,
2L, 1L, 4L, 3L, 2L, 1L, 4L, 3L, 2L, 1L, 4L, 3L, 2L, 1L, 4L,
3L, 2L, 1L), .Label = c("COLD", "HOT", "MEDIUM", "V_COLD"
), class = "factor"), tem_value = c(0L, 7L, 0L, 23L, 0L,
29L, 0L, 1L, 0L, 29L, 2L, 0L, 0L, 21L, 9L, 0L, 0L, 5L, 25L,
0L, 0L, 18L, 13L, 0L, 0L, 21L, 9L, 0L, 0L, 26L, 5L, 0L, 0L,
24L, 0L, 7L), cnt_vehicle = c(NA, 2754406, NA, NA, NA, 2846039,
NA, NA, NA, 3149377, NA, NA, NA, 3058810, NA, NA, NA, 3362614,
NA, NA, NA, 3415716, NA, NA, NA, 3020812, NA, NA, NA, 3076665,
NA, NA, NA, 2775306, NA, NA), x = c(1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L,
6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L
)), .Names = c("SOUNAME", "year_month", "pre_type", "pre_value",
"tem_type", "tem_value", "cnt_vehicle", "x"), row.names = c(NA,
-36L), class = "data.frame")
Below my graph:
ggplot(data = b_complet_2013, aes(x = x, y = pre_value*100000, fill = pre_type), stat = "identity") +
scale_x_continuous(breaks=(1:9)+0.2, labels=unique(b_complet_2013$year_month)) +
geom_bar(stat = "identity", width=0.3) +
xlab("date") + ylab ("Number of days of précipitations(left) and temperatures (ritght)") +
ggtitle("Precipitation per month") +
geom_bar(data=b_complet_2013,aes(x=x+0.4, y=tem_value*100000, fill=tem_type), width=0.3, stat = "identity") +
xlab("date") + ylab("Number of days of precipitations(left) and temperatures (ritght)") +
ggtitle("Impact of weather on road traffics") + theme( axis.title.y = element_text(color = "blue", face = "bold")) +
theme(axis.text.y = element_text(color = "blue", face = "bold", size=9)) + theme( axis.title.y.right = element_text(color = "black", face = "bold")) +
theme(axis.text.y.right = element_text(color = "black", size = 9, face = "bold")) +
geom_line(mapping = aes(x= x+0.2, y = as.numeric(cnt_vehicle)), colour = I("blue"), size = 0.8) +
geom_point(aes(x= x+0.2, y = as.numeric(cnt_vehicle), colour = I("blue")), show.legend=FALSE, stat = "identity") +
scale_y_continuous(sec.axis = sec_axis(~./100000,name="Number of days of precipitations(left) and temperatures (ritght)")) +
theme( plot.title = element_text(size = 17)) + theme(axis.title.x = element_text(size = 12)) + theme(axis.title.y = element_text(size = 12)) +
labs(y = "Number of vehicles", color ="black") +
theme(panel.background = element_rect(linetype = "dashed", fill="white"), plot.background = element_rect(linetype = "dashed",fill="grey90" ))
consider the following example data:
ex = structure(list(group = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 1L,
2L, 3L, 4L, 6L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 6L, 1L,
2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 6L,
1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L,
5L, 6L, 1L, 2L, 1L, 2L, 3L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 5L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 1L, 1L, 1L,
4L), .Label = c("A", "B", "C", "D", "E", "F"), class = "factor"),
ID = structure(c(35L, 35L, 35L, 35L, 35L, 35L, 1L, 1L, 1L,
1L, 1L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 9L, 9L,
9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L,
11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L,
13L, 13L, 14L, 14L, 14L, 14L, 14L, 14L, 21L, 21L, 22L, 22L,
22L, 22L, 2L, 3L, 4L, 5L, 8L, 15L, 16L, 17L, 18L, 19L, 19L,
20L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 30L, 31L, 32L,
33L, 34L), .Label = c("10", "107", "108", "109", "124", "17",
"18", "187", "19", "21", "24", "26", "27", "28", "335", "336",
"339", "340", "341", "342", "38", "39", "576", "577", "578",
"579", "580", "581", "582", "583", "584", "585", "586", "592",
"6"), class = "factor"), value = c(1L, 7L, 4L, 4L, 3L, 9L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 5L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 33L, 27L, 28L, 21L, 28L, 1L, 3L, 1L, 1L, 1L, 1L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 3L, 3L, 2L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L)), class = "data.frame",
row.names = c(NA, -88L), .Names = c("group", "ID", "value")
)
Note that in group A, value = 1 for every ID. I use ggplot2 to create dot plot based on counts of the value variable using geom_dotplot and faceting by group:
ggplot(ex) + aes(x = value) +
geom_dotplot(binwidth = 1, method = "histodot") +
facet_wrap(~ group)
The dot stack in the first facet is cut off, even when exported using ggsave. Changing the y-axis limits has no effect, but changing the aspect ratio so that H >= W seems to fix the issue (usually by adding way more space to the top than necessary). Is this a bug, or am I specifying my dot plot incorrectly?
EDIT
One workaround is to flip my dotplot and bin by the y variable:
ggplot(ex) + aes(x = group, y = value) +
geom_dotplot(binwidth = 1, method = "histodot",
binaxis = "y", stackdir = "centerwhole") +
facet_wrap(~ group, scales = "free_x")
Two other parameters that can help you are stackratio and dotsize. For example:
ggplot(ex) + aes(x = value) +
geom_dotplot(binwidth = 1, method = "histodot", stackratio = 0.9, dotsize = .75) +
facet_wrap(~ group) +
scale_y_continuous(NULL, breaks = NULL)
You would need to tweak the numbers until you got the layout you wanted.
I found an interesting workaround using geom_bar that achieves the same structure as a dot plot but with rectangles:
ggplot(ex) + aes(x = value, group = ID) +
geom_bar(color = "black", fill = "white", width = 1) +
facet_wrap(~ group)
Although it results in rectangles (rather than dots) and you can't control the stack spacing. The rectangles get resized according to the plot window, which would be equivalent to tweaking the dot size in geom_dotplot. Also, it begs the question "why not just use a regular bar plot?"
I have a line plot of some event at a hospital that I have been struggling with.
The challenges that I haven't solved yet are, 1) sorting the lines on the plot so that the patient-lines are sorted by Assessment-date, 2) coloring the lines by the variable 'openCase' and finally, 3) I would like to remove the Discharge-point (the blue square) for the cases that are in the year 2014 (or at some other random cut of date).
Any help would be appreciated?
Here is my sample data,
library(ggplot2)
library(plyr)
df <- data.frame(
date = seq(Sys.Date(), len= 156, by="5 day")[sample(156, 78)],
openCase = rep(0:1, 39),
patients = factor(rep(1:26, 3), labels = LETTERS)
)
df <- ddply(df, "patients", mutate, visit = order(date))
df$visit <- as.factor(df$visit)
levels(df$visit) <- c("Assessment (1)", "Treatment (2)", "Discharge (3)")
qplot(date, patients, data = df, geom = "line") +
geom_point(aes(colour = visit), size = 2, shape=0)
I'm aware that my example data is not perfect as some of the assessment datas is after the treatments and some of the discharge data is before the assessments data, but that part of the challenge that my base data is messed up.
What it looks like at the moment,
Update 2012-04-30 16:30:13 PDT
My data is delivered from a database and looks something like this,
df <- structure(list(date = structure(c(15965L, 15680L, 16135L, 15730L,
15920L, 15705L, 16110L, 15530L, 15575L, 15905L, 16140L, 15795L,
15955L, 15945L, 16205L, 15675L, 15525L, 15830L, 15625L, 15725L,
15855L, 15840L, 15615L, 15500L, 15780L, 15765L, 15610L, 15690L,
16080L, 15570L, 15685L, 16175L, 15740L, 15600L, 15985L, 15485L,
15605L, 16115L, 15535L, 15755L, 16145L, 16040L, 15970L, 16000L,
16075L, 15995L, 16010L, 15990L, 15665L, 15895L, 15865L, 16120L,
15880L, 15930L, 16055L, 15820L, 15650L, 16155L, 15700L, 15640L,
15505L, 15750L, 15800L, 15775L, 15825L, 15635L, 16150L, 15860L,
16100L, 15475L, 16050L, 15785L, 15495L, 15810L, 15805L, 15490L,
15460L, 16085L), class = "Date"), openCase = c(0L, 0L, 0L, 1L,
1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L,
1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L), patients = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L,
6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L, 10L, 10L, 10L, 11L, 11L,
11L, 12L, 12L, 12L, 13L, 13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L,
16L, 16L, 16L, 17L, 17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L,
20L, 20L, 21L, 21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L,
24L, 25L, 25L, 25L, 26L, 26L, 26L), .Label = c("A", "B", "C",
"D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",
"Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"), class = "factor"),
visit = structure(c(2L, 1L, 3L, 3L, 1L, 2L, 2L, 3L, 1L, 3L,
1L, 2L, 2L, 1L, 3L, 2L, 1L, 3L, 1L, 2L, 3L, 3L, 2L, 1L, 3L,
2L, 1L, 3L, 1L, 2L, 1L, 3L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 1L,
3L, 2L, 1L, 2L, 3L, 3L, 1L, 2L, 1L, 3L, 2L, 2L, 3L, 1L, 3L,
2L, 1L, 3L, 2L, 1L, 1L, 2L, 3L, 3L, 1L, 2L, 2L, 3L, 1L, 1L,
3L, 2L, 1L, 3L, 2L, 2L, 1L, 3L), .Label = c("zym", "xov", "poi"
), class = "factor")), .Names = c("date", "openCase", "patients",
"visit"), row.names = c(NA, -78L), class = "data.frame")
The number of levels in visit, and specific labeling, will most likely change so I would like some kind of code where I rank or sort based on my existing data instead (visit) of generating new variables.
This is part-way:
Starting from after your initial definition of the data.
First, I think you want rank(date) rather than order(date) -- it made more sense to me, anyway.
df <- ddply(df, "patients", mutate, visit = rank(date))
df$visit <- as.factor(df$visit)
levels(df$visit) <- c("Assessment (1)", "Treatment (2)", "Discharge (3)")
Reorder patients by minimum date value (= Assessment date):
df$patients <- reorder(df$patients,df$date,function(x) min(as.numeric(x)))
Create a new data set missing the Discharge point, where they are after Jan 1 2014 (if you wanted to drop the Discharge point for cases that were assessed after a given date, you'd need to use ddply):
df2 <- subset(df,!(visit=="Discharge (3)" & date > as.Date("2014-01-01")))
As #Joran pointed out above it's a bit hard to get two separate colour scales for different variables, but this sort-of works (you have to make openCase into a factor in order to combine it with the colour scale for visit)
ggplot(df, aes(date, patients)) + geom_line(aes(colour=factor(openCase))) +
geom_point(data=df2,aes(colour = visit), size = 2, shape=0)
Alternately (and I think this is prettier anyway), you could code openCase with line type:
ggplot(df, aes(date, patients)) + geom_line(aes(linetype=factor(openCase))) +
geom_point(data=df2,aes(colour = visit), size = 2, shape=0)
I'm still not sure I understand what is wrong with #Ben's answer, but I'll try adding one of my own. Starting with the df given in the edit.
Create a new variable Visit (note the capital V) which is Assessment/Treatment/Discharge based on the ordering of the dates given. This is #Ben's code, just re-written.
df <- ddply(df, "patients", mutate,
Visit = factor(rank(date),
levels = 1:3,
labels=c("Assessment (1)", "Treatment (2)", "Discharge (3)")))
I don't understand how this relates to the visit column in the data originally; in fact, the original visit column is not used hereafter:
> table(df$Visit, df$visit)
zym xov poi
Assessment (1) 16 7 3
Treatment (2) 3 16 7
Discharge (3) 7 3 16
Reorder the patients (again copying Ben):
df$patients <- reorder(df$patients,df$date,function(x) min(as.numeric(x)))
Determine the subset of points that should be shown (same idea as Ben, but different code)
df2 <- df[!((df$Visit == "Discharge (3)") & (df$date > as.Date("2014-01-01"))),]
To add something new, here is a way to make the lines different colors without impacting the legend
ggplot(df, aes(date, patients)) +
geom_blank() +
geom_line(data = df[df$openCase == 0,], colour = "black") +
geom_line(data = df[df$openCase == 1,], colour = "red") +
geom_point(data = df2, aes(colour = Visit), size = 2, shape = 0)