I have produced a stacked percent barplot from the following data, which is in a csv file,
,ONE,TWO,THREE
1,2432,420,18
2,276,405,56
3,119,189,110
4,90,163,140
5,206,280,200
6,1389,1080,1075
7,3983,3258,4878
8,7123,15828,28111
9,8608,48721,52576
10,9639,44725,55951
11,8323,45695,32166
12,2496,18254,26600
13,1524,8591,18583
14,7861,1857,1680
15,10269,5165,4618
16,13560,64636,63262
using the following code
library(ggplot2)
library(reshape2)
library(scales)

# Read the counts table; the first CSV column only holds the row numbers.
data <- read.csv(file = "file.csv", sep = ",", header = TRUE)
data <- data[, 2:ncol(data)]

# Tag every row with its numeric position before reshaping to long format.
# seq_len() is used instead of sort(rownames(data)): row names are character
# strings, so sorting them yields "1", "10", "11", ..., "2" and the rows would
# be labelled with the wrong number once converted by as.numeric().
datam <- melt(cbind(data, ind = seq_len(nrow(data))), id.vars = "ind")
datam$ind <- as.numeric(datam$ind)

# stat = "identity" plots the supplied counts; position = "fill" rescales
# every stack to the full height so the y axis reads as a percentage.
ggplot(datam, aes(x = variable, y = value, fill = factor(as.numeric(ind)))) +
  geom_bar(stat = "identity", position = "fill") +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_discrete("Barcode\nMatch") +
  xlab("Barcode") +
  ylab("Reads")
The result is
The problem is that the items in the legend are not in the same order as the stacks they represent. The colours and the numbers are right but the order is not. In other words, is there a way to invert the order of the items in the legend? Thanks
You can use the newer option reverse = TRUE of guide_legend():
# Same stacked percent plot, with the fill legend drawn in reverse order.
# stat = "identity" is required because y is mapped to precomputed counts.
ggplot(datam, aes(x = variable, y = value, fill = factor(as.numeric(ind)))) +
  geom_bar(stat = "identity", position = "fill") +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_discrete("Barcode\nMatch") +
  xlab("Barcode") +
  ylab("Reads") +
  guides(fill = guide_legend(reverse = TRUE))
Add + scale_fill_hue(breaks=c("new order 1","new order 2","new order...")) as in:
library(ggplot2)

# The breaks vector sets the order in which the fill legend entries appear.
ggplot(data = PlantGrowth, aes(x = group, fill = group)) +
  geom_bar() +
  # Second layer adds black outlines; show.legend = FALSE (the replacement for
  # the removed `legend` argument) keeps this layer out of the legend.
  geom_bar(colour = "black", show.legend = FALSE) +
  scale_fill_hue(breaks = c("trt1", "ctrl", "trt2"))
I'd also check out http://wiki.stdout.org/rcookbook/Graphs/Legends%20(ggplot2)/ for more.
This may have changed and become easier with the new ggplot, but I'm not sure.
Related
I am plotting the number of COVID-19 PCR positives in the towns of my province. The problem is that many towns haven't had any positive PCR. I need a way to plot only the towns with at least one positive PCR.
This is my code:
library(tidyverse)
library("data.table")

# Download the per-municipality table and give its columns short names.
dfcsv1 <- read.csv(
  "https://dadesobertes.gva.es/datastore/dump/ee17a346-a596-4866-a2ac-a530eb811737?bom=True",
  encoding = "UTF-8",
  header = TRUE,
  sep = ","
)
names(dfcsv1) <- c(
  "code", "code2", "Municipio",
  "PCR", "TasaPCR",
  "PCR14", "TasaPCR14",
  "Muertos", "TasaMuertos"
)

# The rate columns use a decimal comma; swap it for a dot before converting.
dfcsv1$TasaPCR <- as.numeric(gsub(",", "\\.", dfcsv1$TasaPCR))
dfcsv1$TasaPCR14 <- as.numeric(gsub(",", "\\.", dfcsv1$TasaPCR14))
dfcsv1$TasaMuertos <- as.numeric(gsub(",", "\\.", dfcsv1$TasaMuertos))

# Horizontal bars ordered by PCR14 and shaded by TasaPCR14.
dfcsv1 %>%
  mutate(Municipio = fct_reorder(Municipio, PCR14)) %>%
  ggplot(aes(x = Municipio, y = PCR14, fill = TasaPCR14)) +
  geom_col(width = 0.6) +
  coord_flip() +
  geom_text(data = dfcsv1, aes(y = PCR14, label = PCR14), vjust = 1) +
  scale_fill_gradient(low = "steelblue", high = "red")
As others have said in the comments, you need to filter out the PCR14 that is greater than 0 before reordering the factor levels. However, you will also need to remove the data parameter from geom_text, otherwise all those factor levels come back and you will have a big mess. It's already a bit crowded with the zero levels removed.
I think you should also change the vjust to an hjust to put the text in a nicer position since you have flipped the coordinates, with a compensating increase in the (flipped) y axis range to accommodate it:
# Drop municipalities without cases first, then order the bars by count.
dfcsv1 %>%
  filter(PCR14 > 0) %>%
  mutate(Municipio = fct_reorder(Municipio, PCR14)) %>%
  ggplot(aes(x = Municipio, y = PCR14, fill = TasaPCR14)) +
  geom_bar(stat = "identity", width = 0.6) +
  coord_flip() +
  # A negative hjust pushes each label just past the end of its bar.
  geom_text(aes(y = PCR14, label = PCR14), hjust = -0.5) +
  scale_fill_gradient(low = "steelblue", high = "red") +
  # Make room for the labels relative to the data range. A fixed
  # ylim(c(0, 45)) would silently drop every bar whose count exceeds 45.
  scale_y_continuous(expand = expansion(mult = c(0.05, 0.15)))
Incidentally, it looks a lot better with the ones removed too:
# Same plot, additionally dropping municipalities with a single case.
dfcsv1 %>%
  filter(PCR14 > 1) %>%
  mutate(Municipio = fct_reorder(Municipio, PCR14)) %>%
  ggplot(aes(x = Municipio, y = PCR14, fill = TasaPCR14)) +
  geom_bar(stat = "identity", width = 0.6) +
  coord_flip() +
  # A negative hjust pushes each label just past the end of its bar.
  geom_text(aes(y = PCR14, label = PCR14), hjust = -0.5) +
  scale_fill_gradient(low = "steelblue", high = "red") +
  # Make room for the labels relative to the data range. A fixed
  # ylim(c(0, 45)) would silently drop every bar whose count exceeds 45.
  scale_y_continuous(expand = expansion(mult = c(0.05, 0.15)))
As a general rule, regardless of the type of plot or whether you are using ggplot , lattice or the base plot function, subsetting should happen first.
# Apply the same condition (y > 0) to both vectors before handing them to plot().
plot(x[y>0] , y[y>0])
The rest is aesthetics.
I have the following graph and I want to highlight the columns (both) for watermelons as it has the highest juice_content and weight. I know how to change the color of the columns but I would like the WHOLE columns to be highlighted. Any idea on how to achieve this? There doesn't seem to be anything similar online.
library(tidyr)    # gather()
library(ggplot2)  # ggplot(), geom_bar(), scale_y_log10()

# Example data: one row per fruit with two measurements.
fruits <- c("apple", "orange", "watermelons")
juice_content <- c(10, 1, 1000)
weight <- c(5, 2, 2000)
df <- data.frame(fruits, juice_content, weight)

# Long format: the measurement name goes to `compare` (kept as a factor via
# factor_key = TRUE) and its value to `measure`.
df <- gather(df, compare, measure, juice_content:weight, factor_key = TRUE)

# Dodged bars per fruit on a log10 y axis.
plot <- ggplot(df, aes(fruits, measure, fill = compare)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  scale_y_log10()
An option is to use gghighlight
library(gghighlight)

# Dodged bars on a log10 y axis; gghighlight() receives the condition that
# picks out the watermelon bars.
ggplot(df, aes(x = fruits, y = measure, fill = compare)) +
  geom_col(position = "dodge") +
  scale_y_log10() +
  gghighlight(fruits == "watermelons")
In response to your comment, how about working with different alpha values
# Two bar layers over the same plot data: the watermelon rows are drawn at
# full opacity, every other fruit at alpha 0.2.
ggplot(df, aes(x = fruits, y = measure)) +
  geom_col(
    aes(fill = compare),
    data = function(d) filter(d, fruits == "watermelons"),
    position = "dodge"
  ) +
  geom_col(
    aes(fill = compare),
    data = function(d) filter(d, fruits != "watermelons"),
    alpha = 0.2,
    position = "dodge"
  ) +
  scale_y_log10()
Or you can achieve the same with one geom_col and a conditional alpha (thanks #Tjebo)
# Single bar layer whose alpha depends on whether the fruit is "watermelons".
ggplot(df, aes(fruits, measure)) +
  geom_col(
    mapping = aes(fill = compare, alpha = fruits == "watermelons"),
    position = position_dodge()
  ) +
  # Name the values explicitly: unnamed values are assigned by position, so
  # the first one (0.2) would go to TRUE if no FALSE rows were present.
  scale_alpha_manual(values = c("FALSE" = 0.2, "TRUE" = 1)) +
  scale_y_log10()
You could use geom_area to highlight behind the bars. You have to force the x scale to discrete first which is why I've used geom_blank (see this answer geom_ribbon overlay when x-axis is discrete) noting that geom_ribbon and geom_area are effectively the same except geom_area always has 0 as ymin
# minor edit so that the level isn't hard coded
# factor() is applied first because data.frame() keeps strings as character
# since R 4.0, in which case levels(df$fruits) would be NULL.
watermelon_level <- which(levels(factor(df$fruits)) == "watermelons")

# Left and right edge of the highlighted band on the discrete x axis, with the
# band height set to the largest plotted value.
AreaDF <- data.frame(
  fruits = c(watermelon_level - 0.5, watermelon_level + 0.5),
  yval = max(df$measure)
)

plot <- ggplot(df, aes(fruits)) +
  # geom_blank comes first so the x scale is trained as discrete.
  geom_blank(aes(y = measure, fill = compare)) +
  geom_area(data = AreaDF, aes(y = yval), fill = "yellow") +
  geom_bar(aes(y = measure, fill = compare), stat = "identity", position = position_dodge()) +
  scale_y_log10()
Edit to address comment
If you want to highlight multiple fruits then you could do something like this. You need a data.frame with the x and y positions where you want the geom_area, including dropping it to the baseline between highlighted fruits. I'm sure there are slightly tidier methods of building the data.frame, but this one works.
# Positions of the fruits to highlight on the discrete x axis. factor() is
# applied first because levels() returns NULL for a character column.
highlight_level <- which(levels(factor(df$fruits)) %in% c("apple", "watermelons"))

# Four points per highlighted fruit: just outside the left edge (y = 1), the
# left and right edges (y = max), and just outside the right edge (y = 1).
AreaDF <- data.frame(
  fruits = unlist(lapply(highlight_level, function(x) c(x - 0.51, x - 0.5, x + 0.5, x + 0.51))),
  yval = rep(c(1, max(df$measure), max(df$measure), 1), length(highlight_level))
)

# Points lying between two highlighted levels (floor and ceiling both
# highlighted) are raised to the maximum so neighbouring bands join up.
AreaDF <- AreaDF %>%
  mutate(
    yval = ifelse(
      floor(fruits) %in% highlight_level & ceiling(fruits) %in% highlight_level,
      max(df$measure),
      yval
    )
  ) %>%
  arrange(fruits) %>%
  distinct()

plot <- ggplot(df, aes(fruits)) +
  # geom_blank comes first so the x scale is trained as discrete.
  geom_blank(aes(y = measure, fill = compare)) +
  geom_area(data = AreaDF, aes(y = yval), fill = "yellow") +
  geom_bar(aes(y = measure, fill = compare), stat = "identity", position = position_dodge()) +
  scale_y_log10()
plot
I have produced a stacked percent barplot from the following data, which is in a csv file,
,ONE,TWO,THREE
1,2432,420,18
2,276,405,56
3,119,189,110
4,90,163,140
5,206,280,200
6,1389,1080,1075
7,3983,3258,4878
8,7123,15828,28111
9,8608,48721,52576
10,9639,44725,55951
11,8323,45695,32166
12,2496,18254,26600
13,1524,8591,18583
14,7861,1857,1680
15,10269,5165,4618
16,13560,64636,63262
using the following code
library(ggplot2)
library(reshape2)
library(scales)

# Read the counts table; the first CSV column only holds the row numbers.
data <- read.csv(file = "file.csv", sep = ",", header = TRUE)
data <- data[, 2:ncol(data)]

# Tag every row with its numeric position before reshaping to long format.
# seq_len() is used instead of sort(rownames(data)): row names are character
# strings, so sorting them yields "1", "10", "11", ..., "2" and the rows would
# be labelled with the wrong number once converted by as.numeric().
datam <- melt(cbind(data, ind = seq_len(nrow(data))), id.vars = "ind")
datam$ind <- as.numeric(datam$ind)

# stat = "identity" plots the supplied counts; position = "fill" rescales
# every stack to the full height so the y axis reads as a percentage.
ggplot(datam, aes(x = variable, y = value, fill = factor(as.numeric(ind)))) +
  geom_bar(stat = "identity", position = "fill") +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_discrete("Barcode\nMatch") +
  xlab("Barcode") +
  ylab("Reads")
The result is
The problem is that the items in the legend are not in the same order as the stacks they represent. The colours and the numbers are right but the order is not. In other words, is there a way to invert the order of the items in the legend? Thanks
You can use the newer option reverse = TRUE of guide_legend():
# Same stacked percent plot, with the fill legend drawn in reverse order.
# stat = "identity" is required because y is mapped to precomputed counts.
ggplot(datam, aes(x = variable, y = value, fill = factor(as.numeric(ind)))) +
  geom_bar(stat = "identity", position = "fill") +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_discrete("Barcode\nMatch") +
  xlab("Barcode") +
  ylab("Reads") +
  guides(fill = guide_legend(reverse = TRUE))
Add + scale_fill_hue(breaks=c("new order 1","new order 2","new order...")) as in:
library(ggplot2)

# The breaks vector sets the order in which the fill legend entries appear.
ggplot(data = PlantGrowth, aes(x = group, fill = group)) +
  geom_bar() +
  # Second layer adds black outlines; show.legend = FALSE (the replacement for
  # the removed `legend` argument) keeps this layer out of the legend.
  geom_bar(colour = "black", show.legend = FALSE) +
  scale_fill_hue(breaks = c("trt1", "ctrl", "trt2"))
I'd also check out http://wiki.stdout.org/rcookbook/Graphs/Legends%20(ggplot2)/ for more.
This may have changed and become easier with the new ggplot, but I'm not sure.
I am working in ggplot and I would like to manually change the fill color of my labels according to a factor. There is a column called 'Gene' and it works when I use fill=factor(Gene), but I would like to choose my colors, since I have two graphs and the "Genes" are currently colored differently in the two graphs.
My code looks like this:
# Read the tab-separated variant table, keeping the column names as written
# in the file (check.names = FALSE).
dat <- read.delim("SCN.study_Pvariants_42.txt", sep = "\t", check.names = FALSE)

# Columns are referenced by bare name inside aes(); using dat$... there
# bypasses ggplot2's data masking and is discouraged.
ggplot(dat, aes(x = Index, y = as.numeric(Parazscore))) +
  geom_bar(stat = "identity", fill = "darkcyan") +
  theme_bw() +
  labs(
    title = "SCN Paralog conservation",
    y = "Parazscore",
    x = "aminoacid sequence"
  ) +
  # Label every bar with its variant; the label box is filled per gene.
  geom_label_repel(aes(label = Variant, fill = factor(Gene)))
I changed it by using the scale_color_manual function and adding a vector cols:
# Second attempt from the question: same plot, plus a named colour vector
# intended to fix the colour used for each gene.
dat<- read.delim("SCN.study_Pvariants_42.txt",sep="\t",check.names=F)
# `etc` stands in for the remaining gene/colour pairs; it is a placeholder,
# not a defined object.
cols <- c("GENE1"="lightblue","GENE2"="green","GENE3"="yellow",etc)
ggplot(dat, aes(x = dat$Index, y = as.numeric(dat$Parazscore))) +
geom_bar(stat = "identity", fill = "darkcyan") +
theme_bw() +
labs(title="SCN Paralog conservation",
y="Parazscore",
x="aminoacid sequence") +
geom_label_repel(aes(label=dat$'Variant', fill=factor(Gene))) +
# NOTE(review): the labels map Gene to the *fill* aesthetic, while this scale
# targets *colour*; scale_fill_manual(values = cols) is presumably what is
# needed here. Confirm.
scale_color_manual(values=cols)
But regrettably, this does not work.
Could someone help me to solve this?
Thank you,
Anne
I'm having issues with the following. I need to barplot 3 factor variables vs 1 numerical variable.
My dataset:
Site,Gall,Status,Count
Site1,absent,unhealthy,35
Site1,absent,healthy,1750
Site1,present,unhealthy,23
Site1,present,healthy,1146
Site2,absent,unhealthy,146
Site2,absent,healthy,1642
Site2,present,unhealthy,30
Site2,present,healthy,333
I have tried using ggplot, but then it only lets me define x, y, and one more option, so I have used fill=Gall.
My code looks as following, I am still missing one factor variable.
# Dodged bars of Count per Status, filled by Gall.
ggplot(dat, aes(x = Status, y = Count, fill = Gall)) +
  geom_col(position = "dodge")
Can anyone help me please ?
Thank you, much appreciated
There are a couple of solutions. If you are intent on filling by two factors, you can use interaction:
# Fill by the combination of Site and Gall so both factors show in one panel.
ggplot(dat, aes(x = Status, y = Count, fill = interaction(Site, Gall))) +
  geom_bar(stat = "identity", position = position_dodge())
In general though, it's better to use faceting for multiple factors. For example:
# One facet row per Site; within each row the bars are dodged by Gall.
ggplot(dat, aes(x = Status, y = Count, fill = Gall)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  facet_grid(Site ~ .)
You might be better off with points instead of bars. For example:
library(dplyr)
library(ggplot2)

# Points instead of bars: colour encodes Status, shape encodes Gall, and each
# Site gets its own facet with the strip placed below the panel.
ggplot(
  # Insert a space before the trailing digit of the site name.
  dat %>% mutate(Site = gsub("([0-9]$)", " \\1", Site)),
  aes(Status, Count, colour = Status, shape = Gall)
) +
  geom_point(size = 3, position = position_dodge(0.5), stroke = 1) +
  facet_grid(~ Site, switch = "x") +
  theme_classic() +
  theme(
    strip.placement = "outside",
    strip.background = element_blank()
  ) +
  scale_colour_manual(values = hcl(c(195, 15), 100, 65)) +
  scale_shape_manual(values = c(1, 16)) +
  labs(x = "") +
  # "none" replaces the deprecated guides(colour = FALSE) and hides the colour
  # legend, which would only repeat the x axis.
  guides(colour = "none")