ggplot and the geom_text() - r

I am new to R and ggplot2.I have searched a lot regarding this but I could not find the solution.
Sample observation1 observation2 observation3 percentage
sample1_A 163453473 131232689 61984186 30.6236955883
Sample1_B 170151351 137202212 59242536 26.8866816109
sample2_A 194102849 162112484 89158170 40.4183031852
sample2_B 170642240 141888123 79925652 41.7493687378
sample3_A 192858504 161227348 90532447 41.8068248626
sample3_B 177174787 147412720 81523935 40.5463120438
sample4_A 199232380 174656081 118115358 55.6409038531
sample4_B 211128931 186848929 123552556 54.7201927527
sample5_A 186039420 152618196 87012356 40.9656544833
sample5_B 145855252 118225865 66265976 39.5744515254
sample6_A 211165202 186625116 112710053 48.5457722338
sample6_B 220522502 193191927 114882014 47.238670909
I am planning to plot a bar plot with ggplot2. I want to plot the first three columns as a bar plot "dodge" and label the observation3 bar with the percentage. I could plot the bars as below but I could not use geom_text() to add the label.
data1 <- read.table("readStats.txt", header=T)
data1.long <- melt(data1)
ggplot(data1.long[1:36,], aes(data1.long$Sample[1:36],y=data1.long$value[1:36], fill=data1.long$variable[1:36])) + geom_bar(stat="identity", width=0.5, position="dodge")

Transform data1 to long form with the observation columns as the measure variables and the Sample and percentage columns as the id variables. Compute the maximum value, mx, to be used to place the percentages. Then perform the plot. Note that geom_bar uses data1.long but geom_text uses data1. We have colored the text giving the percentages the same color as the observation3 bars. (See this post for how to specify default colors.) Both inherit aes(x = Sample) but use different y and other aesthetics. We clean up the X axis labels by removing all lower case letters and underscores from the data1$Sample (optional).
library(ggplot2)
library(reshape2)
data1.long <- melt(data1, measure = 2:4) # cols 2:4 are observation1, ..., observation3
mx <- max(data1.long$value) # maximum observation value
ggplot(data1.long, aes(x = Sample, y = value)) +
geom_bar(aes(fill = variable), stat = "identity", width = 0.5, position = "dodge") +
geom_text(aes(y = mx, label = paste0(round(percentage), "%")), data = data1,
col = "#619CFF", vjust = -0.5) +
scale_x_discrete(labels = gsub("[a-z_]", "", data1$Sample))
(click on chart to enlarge)
Note: We used this data. Note that one occurrence of Sample was changed to sample with a lower case s:
Lines <- "Sample observation1 observation2 observation3 percentage
sample1_A 163453473 131232689 61984186 30.6236955883
sample1_B 170151351 137202212 59242536 26.8866816109
sample2_A 194102849 162112484 89158170 40.4183031852
sample2_B 170642240 141888123 79925652 41.7493687378
sample3_A 192858504 161227348 90532447 41.8068248626
sample3_B 177174787 147412720 81523935 40.5463120438
sample4_A 199232380 174656081 118115358 55.6409038531
sample4_B 211128931 186848929 123552556 54.7201927527
sample5_A 186039420 152618196 87012356 40.9656544833
sample5_B 145855252 118225865 66265976 39.5744515254
sample6_A 211165202 186625116 112710053 48.5457722338
sample6_B 220522502 193191927 114882014 47.238670909"
data1 <- read.table(text = Lines, header = TRUE)
UPDATE: minor improvements

It might be that G. Grothendieck's answer is a better solution, but here's my suggestion (code below)
# install.packages("ggplot2", dependencies = TRUE)
require(ggplot2)
df <- structure(list(Sample = structure(1:12, .Label = c("sample1_A",
"Sample1_B", "sample2_A", "sample2_B", "sample3_A", "sample3_B",
"sample4_A", "sample4_B", "sample5_A", "sample5_B", "sample6_A",
"sample6_B"), class = "factor"), observation1 = c(163453473L,
170151351L, 194102849L, 170642240L, 192858504L, 177174787L, 199232380L,
211128931L, 186039420L, 145855252L, 211165202L, 220522502L),
observation2 = c(131232689L, 137202212L, 162112484L, 141888123L,
161227348L, 147412720L, 174656081L, 186848929L, 152618196L,
118225865L, 186625116L, 193191927L), observation3 = c(61984186L,
59242536L, 89158170L, 79925652L, 90532447L, 81523935L, 118115358L,
123552556L, 87012356L, 66265976L, 112710053L, 114882014L),
percentage = c(30.6236955883, 26.8866816109, 40.4183031852,
41.7493687378, 41.8068248626, 40.5463120438, 55.6409038531,
54.7201927527, 40.9656544833, 39.5744515254, 48.5457722338,
47.238670909)), .Names = c("Sample", "observation1", "observation2",
"observation3", "percentage"), class = "data.frame", row.names = c(NA,
-12L))
# install.packages("reshape2", dependencies = TRUE)
require(reshape2)
data1.long <- melt(df, id=c("Sample"), measure.var = c("observation1", "observation2", "observation3"))
data1.long$percentage <- paste(round(data1.long$percentage, 2), "%", sep="")
data1.long[data1.long$variable == "observation1" | data1.long$variable == "observation2" ,2] <- ""
ggplot(data1.long, aes(x = Sample, y = value, fill=variable)) +
geom_bar(, stat="identity", width=0.5, position="dodge") +
geom_text(aes(label = percentage), vjust=2.10, size=2, hjust=-.06, angle = 90)

Related

Adding geom_line between data points with different geom_boxplot fill variable

Hi I have a much larger data frame but a sample dummy df is as follows:
set.seed(23)
df = data.frame(name = c(rep("Bob",8),rep("Tom",8)),
topic = c(rep(c("Reading","Writing"),8)),
subject = c(rep(c("English","English","Spanish","Spanish"),4)),
exam = c(rep("First",4),rep("Second",4),rep("First",4),rep("Second",4)),
score = sample(1:100,16))
I have to plot it in the way shown in the picture below (for my original data frame) but with lines connecting the scores corresponding to each name between the first and second class in the exam variable, I tried geom_line(aes(group=name)) but the lines are not connected in the right way. Is there any way to connect the points that also respects the grouping by the fill variable similar to how the position_dodge() helps separate the points by their fill grouping? Thanks a lot!
library(ggplot2)
df %>% ggplot(aes(x=topic,y=score,fill=exam)) +
geom_boxplot(outlier.shape = NA) +
geom_point(size=1.75,position = position_dodge(width = 0.75)) +
facet_grid(~subject,switch = "y")
One option to achieve your desired result would be to group the lines by name and topic and do the dodging of lines manually instead of relying on position_dogde. To this end convert topic to a numeric for the geom_line and shift the position by the necessary amount to align the lines with the dodged points:
set.seed(23)
df <- data.frame(
name = c(rep("Bob", 8), rep("Tom", 8)),
topic = c(rep(c("Reading", "Writing"), 8)),
subject = c(rep(c("English", "English", "Spanish", "Spanish"), 4)),
exam = c(rep("First", 4), rep("Second", 4), rep("First", 4), rep("Second", 4)),
score = sample(1:100, 16)
)
library(ggplot2)
ggplot(df, aes(x = topic, y = score, fill = exam)) +
geom_boxplot(outlier.shape = NA) +
geom_point(size = 1.75, position = position_dodge(width = 0.75)) +
geom_line(aes(
x = as.numeric(factor(topic)) + .75 / 4 * ifelse(exam == "First", -1, 1),
group = interaction(name, topic)
)) +
facet_grid(~subject, switch = "y")

Dodging vertical lines for median_hilow in ggplot

I need to plot lines that show median and IQR for 3 replicates, across multiple samples.
Data:
sampleid <- rep(1:20, each = 3)
replicate <- rep(1:3, 20)
sample1 <- seq(120,197, length.out = 60)
sample2 <- seq(113, 167, length.out = 60)
sample3 <- seq(90,180, length.out = 60)
What I have done so far?
df <- as.data.frame(cbind(sampleid,replicate,sample1, sample2, sample3))
library(reshape2)
long <- melt(df,id.vars = c('sampleid', 'replicate'))
ggplot(data = long, aes(x = variable, y = value, colour = factor(replicate))) + stat_summary(fun.data=median_hilow, conf.int=.5)
However, the plot of the IQR for replicates that I am getting are overlapped with each other for each sample. I would like to find out a way to "dodge" these 3 lines so that they are visible next to each other, without changing other parameters of the plot that I have achieved. Is this achievable?
You have to introduce jitter to the lines:
ggplot(data = long, aes(x = variable, y = value, colour = factor(replicate))) +
stat_summary(fun.data=median_hilow, fun.args = (conf.int=.5), position = "jitter")
Please note you also need to have your conf.int=5 wrapped in the fun.args.
Alternatively, change your x to factor(replicate) and add facet_wrap:
ggplot(data = long, aes(x = factor(replicate), y = value, colour = factor(replicate))) +
stat_summary(fun.data=median_hilow, fun.args = (conf.int=.5)) +
facet_wrap(~variable)

ggplo2 in R: geom_segment displays different line than geom_line

Say I have this data frame:
treatment <- c(rep("A",6),rep("B",6),rep("C",6),rep("D",6),rep("E",6),rep("F",6))
year <- as.numeric(c(1999:2004,1999:2004,2005:2010,2005:2010,2005:2010,2005:2010))
variable <- c(runif(6,4,5),runif(6,5,6),runif(6,3,4),runif(6,4,5),runif(6,5,6),runif(6,6,7))
se <- c(runif(6,0.2,0.5),runif(6,0.2,0.5),runif(6,0.2,0.5),runif(6,0.2,0.5),runif(6,0.2,0.5),runif(6,0.2,0.5))
id <- 1:36
df1 <- as.data.table(cbind(id,treatment,year,variable,se))
df1$year <- as.numeric(df1$year)
df1$variable <- as.numeric(df1$variable)
df1$se <- as.numeric(df1$se)
As I mentioned in a previous question (draw two lines with the same origin using ggplot2 in R), I wanted to use ggplot2 to display my data in a specific way.
I managed to do so using the following script:
y1 <- df1[df1$treatment=='A'&df1$year==2004,]$variable
y2 <- df1[df1$treatment=='B'&df1$year==2004,]$variable
y3 <- df1[df1$treatment=='C'&df1$year==2005,]$variable
y4 <- df1[df1$treatment=='D'&df1$year==2005,]$variable
y5 <- df1[df1$treatment=='E'&df1$year==2005,]$variable
y5 <- df1[df1$treatment=='E'&df1$year==2005,]$variable
y6 <- df1[df1$treatment=='F'&df1$year==2005,]$variable
p <- ggplot(df1,aes(x=year,y=variable,group=treatment,color=treatment))+
geom_line(aes(y = variable, group = treatment, linetype = treatment, color = treatment),size=1.5,lineend = "round") +
scale_linetype_manual(values=c('solid','solid','solid','dashed','solid','dashed')) +
geom_point(aes(colour=factor(treatment)),size=4)+
geom_errorbar(aes(ymin=variable-se,ymax=variable+se),width=0.2,size=1.5)+
guides(colour = guide_legend(override.aes = list(shape=NA,linetype = c("solid", "solid",'solid','dashed','solid','dashed'))))
p+labs(title="Title", x="years", y = "Variable 1")+
theme_classic() +
scale_x_continuous(breaks=c(1998:2010), labels=c(1998:2010),limits=c(1998.5,2010.5))+
geom_segment(aes(x=2004, y=y1, xend=2005, yend=y3),colour='blue1',size=1.5,linetype='solid')+
geom_segment(aes(x=2004, y=y1, xend=2005, yend=y4),colour='blue1',size=1.5,linetype='dashed')+
geom_segment(aes(x=2004, y=y2, xend=2005, yend=y5),colour='red3',size=1.5,linetype='solid')+
geom_segment(aes(x=2004, y=y2, xend=2005, yend=y6),colour='red3',size=1.5,linetype='dashed')+
scale_color_manual(values=c('blue1','red3','blue1','blue1','red3','red3'))+
theme(text = element_text(size=12))
As you can see I used both geom_line and geom_segment to display the lines for my graph.
It's almost perfect but if you look closely, the segments that are drawn (between 2004 and 2005) do not display the same line size, even though I used the same arguments values in the script (i.e. size=1.5 and linetype='solid' or dashed).
Of course I could change manually the size of the segments to get similar lines, but when I do that, segments are not as smooth as the lines using geom_line.
Also, I get the same problem (different line shapes) by including the size or linetype arguments within the aes() argument.
Do you have any idea what causes this difference and how I can get the exact same shapes for both my segments and lines ?
It seems to be an anti-aliasing issue with geom_segment, but that seems like a somewhat cumbersome approach to begin with. I think I have resolved your issue by duplicating the A and B treatments in the original data frame.
# First we are going to duplicate and rename the 'shared' treatments
library(dplyr)
library(ggplot2)
df1 %>%
filter(treatment %in% c("A", "B")) %>%
mutate(treatment = ifelse(treatment == "A",
"AA", "BB")) %>%
bind_rows(df1) %>% # This rejoins with the original data
# Now we create `treatment_group` and `line_type` variables
mutate(treatment_group = ifelse(treatment %in% c("A", "C", "D", "AA"),
"treatment1",
"treatment2"), # This variable will denote color
line_type = ifelse(treatment %in% c("AA", "BB", "D", "F"),
"type1",
"type2")) %>% # And this variable denotes the line type
# Now pipe into ggplot
ggplot(aes(x = year, y = variable,
group = interaction(treatment_group, line_type), # grouping by both linetype and color
color = treatment_group)) +
geom_line(aes(x = year, y = variable, linetype = line_type),
size = 1.5, lineend = "round") +
geom_point(size=4) +
# The rest here is more or less the same as what you had
geom_errorbar(aes(ymin = variable-se, ymax = variable+se),
width = 0.2, size = 1.5) +
scale_color_manual(values=c('blue1','red3')) +
scale_linetype_manual(values = c('dashed', 'solid')) +
labs(title = "Title", x = "Years", y = "Variable 1") +
scale_x_continuous(breaks = c(1998:2010),
limits = c(1998.5, 2010.5))+
theme_classic() +
theme(text = element_text(size=12))
Which will give you the following
My numbers are different since they were randomly generated.
You can then modify the legend to your liking, but my recommendation is using something like geom_label and then be sure to set check_overlap = TRUE.
Hope this helps!

How to plot bar plot and error bar with x,y data in R

I have data in following format.
X ID Mean Mean+Error Mean-Error
61322107 cg09959428 0.39158198 0.39733463 0.38582934
61322255 cg17147820 0.30742542 0.31572314 0.29912770
61322742 cg08922201 0.47443355 0.47973039 0.46913671
61322922 cg08360511 0.06614797 0.06750279 0.06479315
61323029 cg00998427 0.05625839 0.05779519 0.05472160
61323113 cg15492820 0.10606674 0.10830587 0.10382761
61323284 cg02950427 0.36187007 0.36727818 0.35646196
61323413 cg01996653 0.35582920 0.36276991 0.34888849
61323667 cg14161454 0.77930230 0.78821970 0.77038491
61324205 cg25149253 0.93585347 0.93948514 0.93222180
How can i plot error bar plot with column(bars)
enter image description here
where X-Axis is having X value. So each bar will be plotted at X of fixed width.
I'll try answering. I am using a package called plotly. You can look here for more details.
df <- read.csv('test.csv')
colnames(df) <- c("x", "id", "mean", "mean+error", "mean-error")
df$`mean+error` = df$`mean+error` - df$mean
df$`mean-error` = df$mean - df$`mean-error`
library(plotly)
p <- ggplot(df, aes(factor(x), y = mean)) + geom_bar(stat = "identity")
p <- plotly_build(p)
length(p$data)
p$layout$xaxis
plot_ly(df, x = 1:10, y = mean, type = "bar",
error_y = list(symmetric = F,
array = df$`mean+error`,
arrayminus = df$`mean-error`,
type = "data")) %>%
layout(xaxis = list(tickmode = "array",tickvals = 1:10,ticktext = df$x))
I get this:
The most popular approach would probably be using geom_errorbar() in ggplot2.
library("ggplot2")
ggplot(df, aes(x=ID, y = Mean)) +
geom_bar(stat="identity", fill="light blue") +
geom_errorbar(aes(ymin = Mean.Error, ymax = Mean.Error.1))
where Mean.Error and Mean.Error.1 are the header names for mean +/- error you get when you try to read in your example as text.

Bar plot of group means with lines of individual results overlaid

this is my first stack overflow post and I am a relatively new R user, so please go gently!
I have a data frame with three columns, a participant identifier, a condition (factor with 2 levels either Placebo or Experimental), and an outcome score.
set.seed(1)
dat <- data.frame(Condition = c(rep("Placebo",10),rep("Experimental",10)),
Outcome = rnorm(20,15,2),
ID = factor(rep(1:10,2)))
I would like to construct a bar plot with two bars with the mean outcome score for each condition and the standard deviation as an error bar. I would like to then overlay lines connecting points for each participant's score in each condition. So the plot displays the individual response as well as the group mean.If it is also possible I would like to include an axis break.
I don't seem to be able to find any advice in other threads, apologies if I am repeating a question.
Many Thanks.
p.s. I realise that presenting data in this way will not be to everyones tastes. It is for a specific requirement!
This ought to work:
library(ggplot2)
library(dplyr)
dat.summ <- dat %>% group_by(Condition) %>%
summarize(mean.outcome = mean(Outcome),
sd.outcome = sd(Outcome))
ggplot(dat.summ, aes(x = Condition, y = mean.outcome)) +
geom_bar(stat = "identity") +
geom_errorbar(aes(ymin = mean.outcome - sd.outcome,
ymax = mean.outcome + sd.outcome),
color = "dodgerblue", width = 0.3) +
geom_point(data = dat, aes(x = Condition, y = Outcome),
color = "firebrick", size = 1.2) +
geom_line(data = dat, aes(x = Condition, y = Outcome, group = ID),
color = "firebrick", size = 1.2, alpha = 0.5) +
scale_y_continuous(limits = c(0, max(dat$Outcome)))
Some people are better with ggplot's stat functions and arguments than I am and might do it differently. I prefer to just transform my data first.
set.seed(1)
dat <- data.frame(Condition = c(rep("Placebo",10),rep("Experimental",10)),
Outcome = rnorm(20,15,2),
ID = factor(rep(1:10,2)))
dat.w <- reshape(dat, direction = 'wide', idvar = 'ID', timevar = 'Condition')
means <- colMeans(dat.w[, 2:3])
sds <- apply(dat.w[, 2:3], 2, sd)
ci.l <- means - sds
ci.u <- means + sds
ci.width <- .25
bp <- barplot(means, ylim = c(0,20))
segments(bp, ci.l, bp, ci.u)
segments(bp - ci.width, ci.u, bp + ci.width, ci.u)
segments(bp - ci.width, ci.l, bp + ci.width, ci.l)
segments(x0 = bp[1], x1 = bp[2], y0 = dat.w[, 2], y1 = dat.w[, 3], col = 1:10)
points(c(rep(bp[1], 10), rep(bp[2], 10)), dat$Outcome, col = 1:10, pch = 19)
Here is a method using the transfomations inside ggplot2
ggplot(dat) +
stat_summary(aes(x=Condition, y=Outcome, group=Condition), fun.y="mean", geom="bar") +
stat_summary(aes(x=Condition, y=Outcome, group=Condition), fun.data="mean_se", geom="errorbar", col="green", width=.8, size=2) +
geom_line(aes(x=Condition, y=Outcome, group=ID), col="red")

Resources