How to plot columns above and below data points in ggplot2 - r

Using ggplot2, I am plotting percentage values for 15 species across three sites (each species occurs in each site). The data points associated with site 'C' are my reference points.
Now, instead of plotting sites 'A' and 'B' as points, I would like to display them using vertical lines or column-like structures. As such, these data points should be extended as vertical lines to the top or bottom side of the site 'C' points (green colour), i.e. to the top where values are larger than the reference value and bottom for smaller values.
Specifically, I would hope for a red line from a red point to the green point and a blue line from the blue point to the green point. The red line should ideally have the same width as the red point (and same for blue). The line should also be offset as are the red and blue points (relative to the green point), so that lines do not overlap. Finally, the line should not go to the center but the edge of a point.
For this purpose I have offset points for 'A' and 'B' and also reduced their size to half of the reference point size.
library(ggplot2)
MyData$species <- as.character(MyData$species)
MyData$species <- factor(MyData$species, levels=unique(MyData$species))
pos <- position_dodge(width=0.21)
cols <- c("C" = "darkgreen", "B" = "blue", "A" = "red")
tiff(file = "MyData.tiff", height=10, width=10, units="in", res=300, compression="lzw")
ggplot(data = MyData, aes(x=species, y=value, group=site, colour=site)) +
geom_point(data=subset(MyData, site=="C"), size = 4, shape=15, alpha=1, position=pos) +
geom_line(data=subset(MyData, site=="C"), size = 2, lwd=2, alpha=0.4, show_guide=FALSE) +
geom_point(data=subset(MyData, site!="C"), size = 1.8, shape=15, alpha=1, position = pos) +
scale_colour_manual(values = cols) +
xlab("Species") +
ylab("Value (%)") +
scale_y_continuous(expand=c(0.01,0.01),
limits=c(0.0,100),
breaks=c(0,20,40,60,80,100),
labels=c("0","20","40","60","80","100")) +
theme_bw() +
theme(legend.position="none") +
theme(axis.title.x = element_text(vjust=0.1,face="bold", size=16),
axis.text.x = element_text(vjust=0.4, size=14, angle=90, hjust=1.0)) +
theme(axis.title.y = element_text(vjust=0.1,face="bold", size=16),
axis.text.y = element_text(face="bold", size=14, angle=0)) +
theme(panel.grid.minor=element_blank(), panel.grid.major=element_blank()) +
theme(panel.border = element_rect(size=1, color = "black")) +
theme(plot.margin = unit(c(0.3,0.4,0.3,0.3),"lines"))
dev.off()
This is my current plot. So basically, I would like to replace the red and blue points with lines that extend to the green points (without overlapping them).
Many thanks in advance for any advice on an elegant solution for this.
This is a dput() of my dataset.
structure(list(site = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A", "B", "C"), class = "factor"),
species = structure(c(13L, 11L, 2L, 14L, 1L, 9L, 12L, 10L,
6L, 8L, 15L, 7L, 3L, 4L, 5L, 13L, 11L, 2L, 14L, 1L, 9L, 12L,
10L, 6L, 8L, 15L, 7L, 3L, 4L, 5L, 13L, 11L, 2L, 14L, 1L,
9L, 12L, 10L, 6L, 8L, 15L, 7L, 3L, 4L, 5L), .Label = c("Species 1",
"Species 10", "Species 11", "Species 12", "Species 13", "Species 14",
"Species 15", "Species 2", "Species 3", "Species 4", "Species 5",
"Species 6", "Species 7", "Species 8", "Species 9"), class = "factor"),
value = c(2, 3.25, 3.53, 4.31, 4.59, 5.26, 6.02, 6.42, 6.6,
7.26, 8.89, 12.45, 35.62, 72.42, 73.55, 1.36, 2.36, 2.17,
10.34, 6.84, 1.88, 5.09, 7.35, 3.87, 10.55, 6.6, 14.64, 39.57,
88.06, 64.54, 5.03, 12.34, 5.42, 3.63, 5.16, 6.04, 3, 8.94,
3.28, 7.64, 6.25, 21.96, 39.35, 78.55, 47.35)), .Names = c("site",
"species", "value"), class = "data.frame", row.names = c(NA,
-45L))

You can try geom_linerange() for the lines from points A/B to point C.
Define the ymin/ymax values for each site/species, & reorder site such that A / B lines drop down to each side of point C:
library(dplyr)
MyData <- MyData %>%
group_by(species) %>%
mutate(value.C = value[site == "C"]) %>%
rowwise() %>%
mutate(value.min = min(value, value.C),
value.max = max(value, value.C)) %>%
ungroup() %>%
mutate(site = factor(site, levels = c("A", "C", "B")))
Plot:
# vary dodge width such that the lines drop to the edge of point C
# for your chosen dimensions (for mine 0.5 was about right)
pos <- position_dodge(width = 0.5)
ggplot(data = MyData,
aes(x = species, y = value,
ymin = value.min, ymax = value.max,
group = site, colour = site, size = site)) +
geom_linerange(size = 1.8, alpha = 0.4, position = pos) +
geom_line(data = subset(MyData, site == "C"),
size = 2, lwd = 2, alpha = 0.4) +
geom_point(data = subset(MyData, site == "C"),
size = 4, shape = 15, position = pos) +
scale_color_manual(values = cols) +
theme_classic() +
theme(legend.position = "none")
# + other theme-related settings...

You can add geom_line to draw the vertical lines
library(ggplot2)
MyData$species <- as.character(MyData$species)
MyData$species <- factor(MyData$species, levels=unique(MyData$species))
pos <- position_dodge(width=0.21)
cols <- c("C" = "darkgreen", "B" = "blue", "A" = "red")
windows()
ggplot(data = MyData, aes(x=species, y=value, group=site, colour=site)) +
geom_point(data=subset(MyData, site=="C"), size = 4, shape=15, alpha=1, position=pos) +
geom_line(data=subset(MyData, site=="C"), size = 2, lwd=2, alpha=0.4, show_guide=FALSE) +
geom_point(data=subset(MyData, site!="C"), size = 1.8, shape=15, alpha=1, position = pos) +
geom_line(aes(group = species)) + #New code Added
scale_colour_manual(values = cols) +
xlab("Species") +
ylab("Value (%)") +
scale_y_continuous(expand=c(0.01,0.01),
limits=c(0.0,100),
breaks=c(0,20,40,60,80,100),
labels=c("0","20","40","60","80","100")) +
theme_bw() +
theme(legend.position="none") +
theme(axis.title.x = element_text(vjust=0.1,face="bold", size=16),
axis.text.x = element_text(vjust=0.4, size=14, angle=90, hjust=1.0)) +
theme(axis.title.y = element_text(vjust=0.1,face="bold", size=16),
axis.text.y = element_text(face="bold", size=14, angle=0)) +
theme(panel.grid.minor=element_blank(), panel.grid.major=element_blank()) +
theme(panel.border = element_rect(size=1, color = "black")) +
theme(plot.margin = unit(c(0.3,0.4,0.3,0.3),"lines"))

Related

R piechart managing legend in multiple plot

I am trying to write a code for multiple pie charts showing top 10 bacterial abundance during initial treatment days, mid treatment days and end of treatment days antibiotic usage for two drugs say X and Y . But in this plot I need to fix three points
putting Others at the end for all plots.
Add a common legend
add a common order so that all the successive pie charts take the bacteria order from the first. Not reorder them every time. So that it will be easier to compare by eye.
Can anyone please help?
My code is as bellow.
dput(xInitialDays)
structure(list(Dataset = c("Lachnospiraceae", "Bifidobacteriaceae",
"Oscillospiraceae", "Enterobacteriaceae", "Peptostreptococcaceae",
"Sutterellaceae", "Erysipelotrichaceae", "Peptoniphilaceae",
"Clostridiaceae", "Bacteroidaceae", "Others"), Values = c(41670.41,
22926.45, 21939.35, 15083.38, 11544.14, 10824.77, 9349.65, 6716.93,
4856.3, 3807.31, 20828.8), per = c(24.58, 13.52, 12.94, 8.9,
6.81, 6.38, 5.51, 3.96, 2.86, 2.25, 12.27)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 65L), class = "data.frame")
allcols= 20
getPalette = colorRampPalette(brewer.pal(12, "Set3"))
fill=getPalette(allcols)
allcols<-fill
xInitialDays %>%
mutate(taxa = xInitialDays$Dataset) %>%
mutate(cols = fill[c(1:11)]) %>%
select(taxa,cols) -> ColAssgn
len = 11
xinidays <- function(){
xInitialDays%>%
ggplot(aes(x='',y=reorder(Values,Values),fill=Dataset))+
geom_bar(width=1,stat="identity")+
scale_fill_manual(breaks = ColAssgn$taxa,
values = ColAssgn$cols)+
theme_void()+
theme_classic() +
theme(legend.position = "top") +
coord_polar("y",start=0) +
theme(axis.line = element_blank())+
theme(axis.text = element_blank()) +
theme(axis.ticks = element_blank())+
labs(x = NULL, y = NULL, fill = NULL)+
ylab("Baseline")
}
order(xInitialDays$Values)
xinidays()
I have repeated the similar code for middays and final days and for X and Y.
> dput(xMidDays)
structure(list(Dataset = c("Bifidobacteriaceae", "Oscillospiraceae",
"Enterobacteriaceae", "Lachnospiraceae", "Erysipelotrichaceae",
"Sutterellaceae", "Akkermansiaceae", "Acidaminococcaceae", "Bacteroidaceae",
"Lactobacillaceae", "Others"), Values = c(41204.96, 22093.39,
21504.25, 19273.93, 14853.98, 7146.53, 6734.07, 4340.38, 3444.21,
2541.56, 18130.24), per = c(25.55, 13.7, 13.33, 11.95, 9.21,
4.43, 4.18, 2.69, 2.14, 1.58, 11.23)), row.names = c(1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 65L), class = "data.frame")
And I took this strategy bellow to select the same bacteria colours
taxalen <- xMidDays$Dataset[c(which(!(xMidDays$Dataset %in% ColAssgn$taxa)))]
for (count in 1:length(taxalen)){
if(length(taxalen) > 0){
len = len +1
ColAssgn %>%
rbind(c(taxalen[count],allcols[len])) -> ColAssgn
}
}
xmiddays <- function(){
xMidDays %>%
ggplot(aes(x='',y=reorder(Values,Values),fill = Dataset))+
geom_bar(width=1,stat="identity")+
scale_fill_manual(breaks = ColAssgn$taxa,
values = ColAssgn$cols)+
theme_void()+
theme_classic() +
theme(legend.position = "top") +
coord_polar("y",start=0) +
theme(axis.line = element_blank())+
theme(axis.text = element_blank()) +
theme(axis.ticks = element_blank())+
labs(x = NULL, y = NULL, fill = NULL)+
ylab("Mid Treatment")
}
xmiddays()
And for final days
> dput(xFinaldays)
structure(list(Dataset = c("Lachnospiraceae", "Bifidobacteriaceae",
"Oscillospiraceae", "Enterobacteriaceae", "Erysipelotrichaceae",
"Clostridiaceae", "Acidaminococcaceae", "Peptostreptococcaceae",
"Sutterellaceae", "Bacteroidaceae", "Others"), Values = c(32106.33,
21813.1, 12246.03, 8259.62, 7029.33, 6365.3, 4091.83, 2610.92,
1968.89, 1864.72, 6713.37), per = c(30.56, 20.76, 11.66, 7.86,
6.69, 6.06, 3.89, 2.48, 1.87, 1.77, 6.39)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 52L), class = "data.frame")
taxalen <- xFinaldays$Dataset[c(which(!(xFinaldays$Dataset %in% ColAssgn$taxa)))]
for (count in 1:length(taxalen)){
if(length(taxalen) > 0){
len = len +1
ColAssgn %>%
rbind(c(taxalen[count],allcols[len])) -> ColAssgn
}
}
xfindays <- function(){
xFinaldays %>%
ggplot(aes(x='',y=reorder(Values,Values),fill = Dataset))+
geom_bar(width=1,stat="identity")+
scale_fill_manual(breaks = ColAssgn$taxa,
values = ColAssgn$cols)+
theme_void()+
theme_classic() +
theme(legend.position = "top") +
coord_polar("y",start=0) +
theme(axis.line = element_blank())+
theme(axis.text = element_blank()) +
theme(axis.ticks = element_blank())+
labs(x = NULL, y = NULL, fill = NULL)+
ylab("End of Treatment")
}
xfindays()
And now for the combined group X plot :
group_x <- plot_grid(
xinidays() + theme(legend.position="none"),
xmiddays() + theme(legend.position="none"),
xfindays() + theme(legend.position="none"),
nrow = 1
)
group_x_plot <-plot_grid(group_x,ncol=1
,axis = "tblr"
,labels = "Genus level comparison at multiple treatment time points for Drug X",
label_size = 15
)
group_x_plot
Please anyone can help me with the above questions and fix this code?
many thanks,
Mitra

Width of bars in ggplot2 with many groups

I tried to reproduce the answer given by Roman in this post: The same width of the bars in geom_bar(position = "dodge")
but I couldnot fix my problem. When bars have the same width, the distance between the groups are too big. Same problem when I use facet_grid
My df:
df <- structure(list(discipline = structure(c(2L, 3L, 3L, 2L, 2L, 2L, 4L, 6L, 7L, 3L, 4L, 6L, 8L, 8L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("", "Biogeochemistry", "Ecology", "Geochemistry", "Geography", "Management", "Microbiology", "Oceanography"), class = "factor"), focus = structure(c(34L, 55L, 40L, 47L, 54L, 57L, 47L, 19L, 31L, 25L, 23L, 25L, 47L, 52L,13L, 20L, 23L, 16L, 26L, 27L), .Label = c("", "Abiotic measures", "Acidification", "Biogeochemichal budgets", "Biogeochemistry", "Biogeochemistry, discharge", "Blue Carbon", "Chromophoric Dissolved organic matter, river plume", "Coastal anthromes", "Connectivity", "Coral reefs", "Ecology", "Ecosystem Function", "Ecosystem Services", "Embryo plants", "Fisheries", "Food webs", "Global change", "Governance", "Groundwater", "Hidrology", "Integrative Magamenet", "Isotopes", "Land-sea interactions","Land-sea interface", "Land use", "Life history", "Life traits", "Livelihoods", "Management", "Microbial community", "Modelling water quality", "Nitrogen fluxes", "Nutrients", "Parasites", "ph, CO2", "Planning", "Pollutants", "Pollution", "Primary production", "Remote Sensing", "Resilience", "resilience, self-organization", "Restoration",
"Salinization", "Sea level rise", "Sediment flux", "Sediments", "socio land-sea interactions", "Species interaction", "Submarine ground water", "Submarine groundwater", "Subsidies", "Trace metals", "Trophic interactions", "Water quality", "Water resources"), class = "factor"), n = c(39L, 17L, 11L, 9L, 6L, 5L, 5L, 4L, 4L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L)), row.names = c(NA, -20L), class = c("tbl_df","tbl", "data.frame"))
First I tried with position = position_dodge2(preserve = "single")
ggplot(df, aes(x = (discipline), y = n, fill = reorder(focus, n))) +
geom_bar(position = position_dodge2(width = 0.9, preserve = "single"), stat = "identity") + ylab("N") + theme_classic() + geom_text(aes(label=focus), position = position_dodge2(width = 0.9, preserve = "single"), angle = 90, hjust = -0.1) + theme(legend.position = "none")
Then I used facet_grid
ggplot(df, aes(x = (discipline), y = n, fill = reorder(focus, n))) +
geom_bar(position = "dodge", stat = "identity") + ylab("N") + theme_classic() + geom_text(aes(label=focus), position = position_dodge2(width = 0.9, preserve = "single"), angle = 90, hjust = -0.1) + theme(legend.position = "none") + facet_grid(scales = "free_x", space = "free_x", switch = "x")
Even when width of bars are equal, distance between groups are too big.
What can I do to solve this problem?
Maybe try this. It looks like the issue is with position. If you define position_dodge2() for the bars you can avoid the big bars you got. Here the code:
library(ggplot2)
#Code
ggplot(df, aes(x = (discipline), y = n, fill = reorder(focus, n))) +
geom_bar(position = position_dodge2(0.9,preserve = 'single'),
stat = "identity") + ylab("N") +
theme_classic() +
geom_text(aes(label=focus), position = position_dodge2(width = 0.9, preserve = "single"),
angle = 90, hjust = -0.1) + theme(legend.position = "none") +
facet_grid(scales = "free_x", space = "free_x", switch = "x")
Output:
Whereas, the original code produces this (using position = "dodge"):

ggplot2: nudge geom_step() upwards a little bit for every group with discrete y-axis

I have objects moving through different places over time, the plots look like this (but with many more paths):
ggplot(data = df, aes(
y = place,
x = value,
color = order,
group = order
)) +
geom_step(alpha = 0.5) +
theme(legend.position = "bottom") +
guides(color = guide_legend(ncol = 1)) +
geom_point(alpha = 0.5) +
facet_wrap( ~ order)
I'd like to combine the facets into one plot:
ggplot(data = df, aes(
y = place,
x = value,
color = order,
group = order
)) +
geom_step(alpha = 0.5) +
theme(legend.position = "bottom") +
guides(color = guide_legend(ncol = 1)) +
geom_point(alpha = 0.5)
The problem I have with this is the overlapping. I would like to nudge/move every different color of geom_step() up by a few pixels (maybe a linewidth), so that overlapping lines appear thicker. I have tried this R - ggplot dodging geom_lines but changing the x- and y-coordinate messes up the plot.
ggplot(data = df, aes(
y = value,
x = place,
color = order,
group = order
)) +
geom_step(alpha = 0.5, direction = "vh", position = position_dodge(width = 0.5)) +
theme(legend.position = "bottom") +
guides(color = guide_legend(ncol = 1)) +
coord_flip()
I hope I was clear about my desired output. I'm grateful for any hints!
the data:
df <- structure(list(place = structure(c(1L, 7L, 8L, 2L, 8L, 4L, 8L,
11L, 9L, 10L, 9L, 7L, 6L, 7L, 1L, 7L, 8L, 3L, 8L, 5L, 9L, 11L,
9L, 10L, 8L, 7L, 6L, 7L), .Label = c("A", "B", "C", "D", "E",
"F", "G", "H", "I", "J", "K"), class = "factor"), order = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("a",
"b"), class = "factor"), value = c(0, 38.0069999694824, 254.986999988556,
266.786999940872, 358.447000026703, 368.375, 613.148000001907,
626.457999944687, 778.240999937057, 790.655999898911, 844.833999872208,
914.274999856949, 925.282999992371, 952.84299993515, 0, 38.3450000286102,
80.5469999313354, 93.7960000038147, 188.280999898911, 199.918999910355,
380.635999917984, 385.131000041962, 447.441999912262, 455.503999948502,
528.233000040054, 677.162999868393, 690.805000066757, 713.063999891281
)), row.names = c(NA, -28L), class = "data.frame")
Okay, so after some more googling i stumbled upon the ggstance-package, which includes a vertical version of position_dodge which does exactly what i need:
library(ggstance)
ggplot(data = df, aes(
y = place,
x = value,
color = order,
group = order
)) +
geom_step(position = position_dodge2v(height = 0.2, preserve = "single")) +
theme(legend.position = "bottom") +
guides(color = guide_legend(ncol = 1))

Align error bars with bar chart factor levels

I am trying to plot a bar chart with multiple error bars that align with the relevant values for each factor level on the x axis.
I have followed instructions as per - Grouped barplot in R with error bars - but I cannot figure out why my script isn't cooperating.
Here is the result I get, where obviously I want the errorbars to align with the relevant data.
Here is my data:
structure(list(key = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("2030", "2050", "2100"
), class = "factor"), value = c(0.4, 0.4, 0.1, 0.1, 0.1, 3.4,
4.5, 6.8, 3.6, 3, 5.7, 12.4, 14.9, 9.5, 10.8), scenario = structure(c(4L,
2L, 1L, 6L, 5L, 4L, 2L, 1L, 6L, 5L, 4L, 2L, 1L, 6L, 5L), .Label = c("1.5°C-high-OS",
"1.5°C-limited-OS", "1.5°C-low-OS", "Below-1.5°C", "Higher-2°C",
"Lower-2°C"), class = "factor"), Lower.Quartile = c(0, 0, 0,
0, 0, 0, 3.4, 3.7, 1.8, 1.6, 0, 6.4, 12.1, 6.9, 8.2), Upper.Quartile = c(1.1,
1, 0.4, 0.3, 0.2, 8.3, 6.3, 9.5, 4.6, 4.9, 13.4, 15, 16.3, 12.1,
15.3)), row.names = c(1L, 3L, 4L, 5L, 6L, 7L, 9L, 10L, 11L, 12L,
13L, 15L, 16L, 17L, 18L), class = "data.frame")
Here is my code:
library(ggplot2)
library(tidyr)
ggplot(Global.BECCS.mean, aes(x = key)) + geom_bar(aes(fill = scenario, y = value), position = position_dodge(), stat='identity', col = "black") + theme_classic() +
xlab('Year') + ylab(expression(paste(GtC,' sequestered by BECCS per year'))) + geom_hline(yintercept = 5.5, linetype = 'dashed') + geom_hline(yintercept = 16, linetype = 'dotted') +
geom_errorbar(aes(ymin = Global.BECCS.mean$Lower.Quartile, ymax = Global.BECCS.mean$Upper.Quartile), width = 0.1, position = position_dodge())
A couple of things:
define the y and fill aesthetics in ggplot to allow all layers to inherit them as well
specify the width in position_dodge to get the bars in the wanted position
and finally there's no need, I think, to use Global.BECCS.mean$, the name of the variable should be sufficient.
.
ggplot(Global.BECCS.mean, aes(x = key, fill = scenario, y = value)) +
geom_bar(position = position_dodge(), stat='identity', col = "black") +
theme_classic() +
xlab('Year') +
ylab(expression(paste(GtC,' sequestered by BECCS per year'))) +
geom_hline(yintercept = 5.5, linetype = 'dashed') +
geom_hline(yintercept = 16, linetype = 'dotted') +
geom_errorbar(aes(ymin = Lower.Quartile,
ymax = Upper.Quartile),
width = 0.3,
position = position_dodge(0.9))

how can I plot a complex data structure

I have a data way much bigger than this representative one
df<- structure(list(Pama1 = structure(c(2L, 5L, 3L, 5L, 3L, 3L, 3L,
3L, 3L, 4L, 1L), .Label = c("", "DD1", "n/a", "PAMANA", "zf"), class = "factor"),
X = structure(c(11L, 3L, 10L, 2L, 4L, 5L, 6L, 7L, 8L, 9L,
1L), .Label = c("", "116", "12", "138", "197", "219", "224",
"230", "280", "85", "Start1"), class = "factor"), X.1 = structure(c(11L,
10L, 2L, 4L, 3L, 5L, 8L, 6L, 7L, 9L, 1L), .Label = c("",
"101", "145", "199", "222", "227", "233", "238", "331", "89",
"End1"), class = "factor"), Pama2 = structure(c(2L, 4L, 4L,
4L, 3L, 4L, 4L, 6L, 5L, 1L, 1L), .Label = c("", "DD2", "GGTR",
"n/a", "PAMANA", "T_reg"), class = "factor"), X.2 = structure(c(9L,
2L, 2L, 8L, 3L, 4L, 5L, 6L, 7L, 1L, 1L), .Label = c("", "1",
"115", "208", "214", "232", "376", "85", "Start2"), class = "factor"),
X.3 = structure(c(10L, 8L, 2L, 9L, 3L, 4L, 5L, 6L, 7L, 1L,
1L), .Label = c("", "15", "195", "229", "231", "362", "577",
"76", "86", "End2"), class = "factor"), Pama3 = structure(c(1L,
3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("DD3",
"GGTR", "n/a"), class = "factor"), X.4 = structure(c(10L,
1L, 7L, 8L, 9L, 9L, 2L, 3L, 4L, 5L, 6L), .Label = c("1",
"129", "136", "153", "166", "178", "48", "65", "66", "Start1"
), class = "factor"), X.5 = structure(c(10L, 6L, 7L, 8L,
3L, 9L, 1L, 2L, 4L, 4L, 5L), .Label = c("131", "138", "144",
"168", "180", "34", "51", "70", "79", "End2"), class = "factor")), .Names = c("Pama1",
"X", "X.1", "Pama2", "X.2", "X.3", "Pama3", "X.4", "X.5"), class = "data.frame", row.names = c(NA,
-11L))
At first I put all starts and ends together and find the range
In this example it is 1 to 577
Then I want to plot or highlight the regions that there is a string
for example, something like this figure that I made
or even like this
The code below tidies up the data into a form suitable for plotting and then uses geom_segment to lay out the sequences. To tidy the data, we want each column to be a variable and each row to be an observation.
library(tidyverse)
## Clean up data frame and convert to long form
df = map_df(seq(1,ncol(df),3), # Turn each group of three columns into separate data frames that we'll stack into long format
~ setNames(df[-1,.x:(.x+2)], c("DD","Start","End")), # Column names appear to be in the first data row, so we'll remove this row and provide new column names
.id="Pama") %>% # This line and next add a "Pama" column
mutate(Pama = paste0("Pama", Pama)) %>%
filter(!DD %in% c("n/a","")) %>% # Remove empty rows
mutate_at(vars(matches("^[SE]")), funs(as.numeric(as.character(.)))) # All columns are in character format. Convert the numbers to numeric format.
The data frame now looks like this:
Pama DD Start End
<chr> <chr> <dbl> <dbl>
1 Pama1 zf 12 89
2 Pama1 zf 116 199
3 Pama1 PAMANA 280 331
4 Pama2 GGTR 115 195
5 Pama2 T_reg 232 362
6 Pama2 PAMANA 376 577
7 Pama3 GGTR 66 144
ggplot(df, aes(y=Pama, yend=Pama)) +
geom_segment(data=data.frame(Pama=unique(df$Pama), x=min(df$Start), xend=max(df$End)),
aes(x=x, xend=xend), colour="grey80", size=10) +
geom_segment(aes(x=Start, xend=End, colour=DD), size=20) +
geom_text(aes(x=(Start+End)/2, label=DD), colour="white", size=3, fontface="bold") +
geom_text(data=gather(df, key, value, Start:End),
aes(x=value, label=value, colour=DD), size=2.5,
fontface="bold", position=position_nudge(0,-0.3)) +
guides(colour=FALSE) +
scale_x_continuous(breaks=seq(0,1000,100)) +
labs(x="", y="") +
theme_classic(base_size=15) +
theme(axis.line.y=element_blank(),
axis.ticks.y=element_blank())
UPDATE: To address your comment, here's another way of positioning the numbers to avoid overlap.
ggplot(df, aes(y=Pama, yend=Pama)) +
geom_segment(data=data.frame(Pama=unique(df$Pama), x=min(df$Start), xend=max(df$End)),
aes(x=x, xend=xend), colour="grey80", size=10) +
geom_segment(aes(x=Start, xend=End, colour=DD), size=20) +
geom_text(aes(x=(Start+End)/2, label=DD), colour="white", size=3, fontface="bold") +
geom_text(data=gather(df, key, value, Start:End),
aes(x=ifelse(key=="Start", value + 10, value - 10), label=value),
colour="white", size=2.8, fontface="bold", position=position_nudge(0,-0.2)) +
guides(colour=FALSE) +
scale_x_continuous(breaks=seq(0,1000,100)) +
labs(x="", y="") +
theme_classic(base_size=15) +
theme(axis.line.y=element_blank(),
axis.ticks.y=element_blank())
UPDATE 2: To address your second comment, we'll add a grouping column that we'll use to alternate high and low number labels:
# Add grouping variable to alternate high and low labels
df = df %>% group_by(Pama) %>% arrange(Start) %>%
mutate(hilow = rep(c("high","low"),nrow(df))[1:n()])
ggplot(df, aes(y=Pama, yend=Pama)) +
geom_segment(data=data.frame(Pama=unique(df$Pama), x=min(df$Start), xend=max(df$End)),
aes(x=x, xend=xend), colour="grey80", size=10) +
geom_segment(aes(x=Start, xend=End, colour=DD), size=20) +
geom_text(aes(x=(Start+End)/2, label=DD), colour="white", size=3, fontface="bold") +
geom_text(data=gather(df, key, value, Start:End) %>% filter(hilow=="high"),
aes(x=value, label=value, colour=DD), hjust=0.5,
size=3, fontface="bold", position=position_nudge(0,0.3)) +
geom_text(data=gather(df, key, value, Start:End) %>% filter(hilow=="low"),
aes(x=value, label=value, colour=DD), hjust=0.5,
size=3, fontface="bold", position=position_nudge(0,-0.3)) +
guides(colour=FALSE) +
scale_x_continuous(breaks=seq(0,1000,100)) +
labs(x="", y="") +
theme_classic(base_size=15) +
theme(axis.line.y=element_blank(),
axis.ticks.y=element_blank())
That is one weird data structure you have. This will probably work out better down the line if you can modify the source data in some way to obtain a tidy dataframe from the start, where each column is a single variable and each row is an observation.
We can wrangle the data to obtain such dataframe (this is in base R, you can arguably achieve the same in other ways, also using dplyr or data.table):
df2 <- rbind(setNames(cbind(rep('DD1', nrow(df) - 1), df[2:nrow(df), 1:3]), c('DD', 'Pama', 'Start', 'End')),
setNames(cbind(rep('DD2', nrow(df) - 1), df[2:nrow(df), 4:6]), c('DD', 'Pama', 'Start', 'End')),
setNames(cbind(rep('DD3', nrow(df) - 1), df[2:nrow(df), 7:9]), c('DD', 'Pama', 'Start', 'End'))
)
df2$Start <- as.numeric(as.character(df2$Start))
df2$End <- as.numeric(as.character(df2$End))
df2 <- df2[!df2$Pama %in% c('','n/a'), ]
df2
#> DD Pama Start End
#> 2 DD1 zf 12 89
#> 4 DD1 zf 116 199
#> 10 DD1 PAMANA 280 331
#> 51 DD2 GGTR 115 195
#> 81 DD2 T_reg 232 362
#> 91 DD2 PAMANA 376 577
#> 52 DD3 GGTR 66 144
This gives us a nice dataset, where we can map any ggplot2's aestethic to a simple column:
library(ggplot2)
ggplot(df2, aes(y = DD, color = Pama)) +
geom_segment(aes(x = Start, xend = End, yend = DD), size = 10) +
geom_text(aes(label = Start, x = Start), size = 2.5, nudge_y = -.15) +
geom_text(aes(label = End, x = End), size = 2.5, nudge_y = -.15) +
scale_y_discrete(position = 'right') +
theme(panel.background = element_rect(fill = 'white'),
axis.text.x = element_blank(),
axis.text.y.right = element_text(size = 14),
axis.ticks.y = element_blank(),
axis.title = element_blank())
Update:
The text positioning can indeed cause problems, it does so in this example, where we had to reduce the text size to get it somewhat right.
Here is a solution, based on the ggrepel package:
library(ggplot2)
library(ggrepel)
ggplot(df2, aes(y = DD, color = Pama)) +
geom_segment(aes(x = Start, xend = End, yend = DD), size = 10) +
geom_text_repel(data = function(d) tidyr::gather(d, k, p, -DD, -Pama), aes(label = p, x = p), size = 5, nudge_y = -.15, segment.size = 0) +
# geom_label_repel(aes(label = End, x = End), size = 5, nudge_y = -.15) +
geom_text(aes(x = (Start + End) / 2, label = Pama), colour = "white", size = 2.5) +
scale_y_discrete(position = 'right') +
guides(color = FALSE) +
theme(panel.background = element_rect(fill = 'white'),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y.right = element_text(size = 14),
axis.ticks.y = element_blank(),
axis.title = element_blank())
(The text size is forcibly increased to show it does not overlap)
PS: Yea, this update makes it even more similar to #eipi10 answer.. it's a great answer, of course I'd steal from him :P

Resources