geom_bar and errorbars for given data - r

I would like to have bars and errorbars for this data
I managed to get the bars with:
ggplot(FCDreach_global_mod, aes(x = as.factor(t3-t2), y = 1-value, fill=as.factor(t2-t1) )) +
geom_bar(stat = "identity" )
However I don't know how to draw the errorbars. I tried geom_errorbar() but couldn't get it work.
When drawing line plots I would use:
stat_summary(fun.data=mean_cl_normal, geom="errorbar")
but this does not seem to work correctly with geom_bar()
I tried this:
ggplot(FCDreach_global_mod, aes(x = as.factor(t3-t2), y = 1-value, fill=as.factor(t2-t1) ) ) +
stat_summary(fun.y=mean,geom="bar")+
stat_summary(fun.data=mean_cl_normal,geom="errorbar", width=0.5)
and the breaks on the y looked quite different compared to the ones I got with geom_bar(stat = "identity" ). The size of the bars is the same, but something weird happens with the y scale.
geom_bar:
stat_summary:
EDIT: the desired output is to show the equivalent of this plot in a barplot, of course excluding the x axis and placing t3-t2 on x
which I obtain by:
ggplot(FCDreach_global_mod, aes(x=roundedRealNumVehicles/2, y=1-value, colour=as.factor(t3-t2), lty=as.factor(t2-t1)) ) +
stat_summary( fun.y=mean, geom="line" ) +
stat_summary(fun.data=mean_cl_normal,geom="errorbar", width=0.5)

In your first graph, the y-axis represents the (1-value) summed for each level of (t3-t2). In the second, the y-axis is the mean. So, manually you can see this by using aggregate to recreate these values,
## Question 1: what is the y-axis of the first plot?
## Aggregate by summing (1-value)
(p1 <- aggregate((1-value) ~ I(t3-t2), data=FCDreach_global_mod, sum))
# I(t3 - t2) (1 - value)
# 1 0.4 19.51663
# 2 0.5 19.70297
## Question 2: where does the 0.075 come from in the stat_summary?
## Aggregate (1-value) taking the mean
(p2 <- aggregate((1-value) ~ I(t3-t2), data=FCDreach_global_mod, mean))
# I(t3 - t2) (1 - value)
# 1 0.4 0.09119921
# 2 0.5 0.09038062
## Get normal confidence intervals
se <- with(FCDreach_global_mod,
do.call(rbind,
lapply(split(1 - value, factor(t3-t2)), function(x)
mean(x) + c(-1,1)*sd(x)/sqrt(length(x))*qnorm(0.975))
))
## Recreate barplot
dat <- setNames(p2, c("x", "y"))
dat <- cbind(dat, setNames(data.frame(se), c("ymin", "ymax")))
ggplot(dat, aes(x,y)) +
geom_bar(stat="identity", aes(fill=factor(x))) +
geom_errorbar(aes(x=x, ymin=ymin, ymax=ymax), color="black", width=0.05) +
theme_bw()

Related

Multiple trendlines for factors ggplot2 and colors

I'm not too familiar with ggplot but it looks better than what I've been getting with plot_ly
I'm having trouble getting a trendline for each factor of the series. The trendlines just don't show up in the generated graph
Here is the code I've been working with
ggplot(subset(df,FACTOR %in% c("1","2")), aes(x= DUR, y= TEMP, color=FACTOR)) +
geom_point() +
geom_smooth(data=subset(df, FACTOR=="1"), method=lm , se=FALSE) +
geom_smooth(data=subset(df, FACTOR=="2"), method=lm , se=FALSE) +
xlab("Duration (min)") +
ylab('Change in Temperature (C)')
My df looks like this
DUR TEMP FACTOR
# # 1
# # 1
# # 2
# # 3
# # 4
... ... ...
Thanks
try to add the factor as group in the aes statement and for drawing with one geom_smooth call all you need (not sure if method should be "lm" instead of only lm):
library(ggplot2)
ggplot(subset(df,GROUP %in% c("1","2")), aes(x= DUR, y= TEMP, color=FACTOR, group = FACTOR)) +
geom_point() +
geom_smooth(method=lm , se=FALSE) +
xlab("Duration (min)") +
ylab('Change in Temperature (C)')
Not sure why you filter bey "GROUP" in subset - should be FACTOR from what I understand of your code and data snipped

ggplot2: Different vlines for each graph using facet_wrap [duplicate]

I've poked around, but been unable to find an answer. I want to do a weighted geom_bar plot overlaid with a vertical line that shows the overall weighted average per facet. I'm unable to make this happen. The vertical line seems to a single value applied to all facets.
require('ggplot2')
require('plyr')
# data vectors
panel <- c("A","A","A","A","A","A","B","B","B","B","B","B","B","B","B","B")
instrument <-c("V1","V2","V1","V1","V1","V2","V1","V1","V2","V1","V1","V2","V1","V1","V2","V1")
cost <- c(1,4,1.5,1,4,4,1,2,1.5,1,2,1.5,2,1.5,1,2)
sensitivity <- c(3,5,2,5,5,1,1,2,3,4,3,2,1,3,1,2)
# put an initial data frame together
mydata <- data.frame(panel, instrument, cost, sensitivity)
# add a "contribution to" vector to the data frame: contribution of each instrument
# to the panel's weighted average sensitivity.
myfunc <- function(cost, sensitivity) {
return(cost*sensitivity/sum(cost))
}
mydata <- ddply(mydata, .(panel), transform, contrib=myfunc(cost, sensitivity))
# two views of each panels weighted average; should be the same numbers either way
ddply(mydata, c("panel"), summarize, wavg=weighted.mean(sensitivity, cost))
ddply(mydata, c("panel"), summarize, wavg2=sum(contrib))
# plot where each panel is getting its overall cost-weighted sensitivity from. Also
# put each panel's weighted average on the plot as a simple vertical line.
#
# PROBLEM! I don't know how to get geom_vline to honor the facet breakdown. It
# seems to be computing it overall the data and showing the resulting
# value identically in each facet plot.
ggplot(mydata, aes(x=sensitivity, weight=contrib)) +
geom_bar(binwidth=1) +
geom_vline(xintercept=sum(contrib)) +
facet_wrap(~ panel) +
ylab("contrib")
If you pass in the presumarized data, it seems to work:
ggplot(mydata, aes(x=sensitivity, weight=contrib)) +
geom_bar(binwidth=1) +
geom_vline(data = ddply(mydata, "panel", summarize, wavg = sum(contrib)), aes(xintercept=wavg)) +
facet_wrap(~ panel) +
ylab("contrib") +
theme_bw()
Example using dplyr and facet_wrap incase anyone wants it.
library(dplyr)
library(ggplot2)
df1 <- mutate(iris, Big.Petal = Petal.Length > 4)
df2 <- df1 %>%
group_by(Species, Big.Petal) %>%
summarise(Mean.SL = mean(Sepal.Length))
ggplot() +
geom_histogram(data = df1, aes(x = Sepal.Length, y = ..density..)) +
geom_vline(data = df2, mapping = aes(xintercept = Mean.SL)) +
facet_wrap(Species ~ Big.Petal)
vlines <- ddply(mydata, .(panel), summarize, sumc = sum(contrib))
ggplot(merge(mydata, vlines), aes(sensitivity, weight = contrib)) +
geom_bar(binwidth = 1) + geom_vline(aes(xintercept = sumc)) +
facet_wrap(~panel) + ylab("contrib")

ggplot: plot frequency counts of values in a dataframe (with no preprocessing)

I often find myself doing this:
# Original data
df.test <- data.frame(value=floor(rexp(10000, 1/2)))
# Compute the frequency of every value
# or the probability
freqs <- tabulate(df.test$value)
probs <- freqs / sum(freqs)
# Create a new dataframe with the frequencies (or probabilities)
df.freqs <- data.frame(n=1:length(freqs), freq=freqs, probs=probs)
# Plot them, usually in log-log
g <- ggplot(df.freqs, aes(x=n, y = freq)) + geom_point() +
scale_y_log10() + scale_x_log10()
plot(g)
Can it be done just using ggplot without creating an intermediate dataset?
For frequency count, you can specify the stat parameter in geom_point as count:
ggplot(df.test, aes(x = value)) + geom_point(stat = "count") +
scale_x_log10() + scale_y_log10()

Scaling really large bar compared to other bars in ggplot2 in R

I am trying to make a plot in ggplot2 in R with the following code:
feature
[1] abs_deg_sum_1 NumAfterEdits_1 N_1 NumAfterEdits_3
[5] TimeSinceLastEdit_2 wt_product_1 NumAfterEdits_2 dwdt_1
52 Levels: abs_deg_diff_1 abs_deg_diff_2 abs_deg_diff_3 abs_deg_diff_4 ... Z_4
relative_importance
[1] 61.048212 17.235435 1.891542 1.409848 1.356924 1.264824 1.220593 1.184612
library(ggplot2)
df = data.frame(feature, relative_importance)
c <- ggplot(df, aes(x = feature, y = relative_importance, fill = feature)) + geom_bar(stat = "identity")
c + coord_flip()
positions <- c("abs_deg_sum_1", "NumAfterEdits_1", "N_1", "NumAfterEdits_3","TimeSinceLastEdit_2", "wt_product_1", "NumAfterEdits_2",
"dwdt_1")
c <- c + scale_x_discrete(limits = positions)
c + coord_flip()
Since the first value in relative_importance is really large compared to all other values, the plot doesn't show much about the other values. I get the following plot:
How can I change my code to capture more information in my plot? Especially about the smaller values
Here are several options, though I prefer the first or second (or maybe the third if you really want to go with a bar plot):
# Fake data
dat = data.frame(group=LETTERS[1:5], values=c(1.5,0.6,12.6,2.1,85))
# Value labels instead of bars, plus we add a horizontal segment to provide
# better visual guidance as to the relative values. This also requires
# some factor gymnastics to be able to get both the segments and the
# correct x-axis labels. I've left in the legend, but it's not necessary
# and can be removed if you wish.
ggplot(dat, aes(as.numeric(group), values, colour=group)) +
geom_segment(aes(x=as.numeric(group)-0.35, xend=as.numeric(group)+0.35,
yend=values), alpha=0.75) +
geom_text(aes(label=values), fontface="bold", show_guide=FALSE) +
scale_x_continuous(breaks=1:5, labels=levels(dat$group))
#scale_y_log10(limits=c(0.1,100), breaks=c(0.1, 0.3,1,3,10,30,100)) # For a log scale, if desired
#coord_flip() # Flip to horizontal orientation, if desired
# Value labels instead of bars
ggplot(dat, aes(group, values, colour=group)) +
geom_text(aes(label=values), fontface="bold")
# Bar plot with value labels added
ggplot(dat, aes(group, values, fill=group)) +
geom_bar(stat="identity") +
geom_text(aes(label=values, y=0.5*values), size=5, colour="black")
# Value labels instead of bars; log scale
ggplot(dat, aes(group, values, colour=group)) +
geom_text(aes(label=values)) +
scale_y_log10(limits=c(0.1,100), breaks=c(0.1,0.3,1,3,10,30,100)) +
coord_flip()
# Bar plot with log scale. Note that bar baseline is 1 instead of
# zero for a log scale, so this doesn't work so well.
ggplot(dat, aes(group, values, fill=group)) +
geom_bar(stat="identity") +
scale_y_log10(limits=c(0.1,100), breaks=c(0.1,0.3,1,3,10,30,100)) +
coord_flip()
# Points instead of bars; log scale
ggplot(dat, aes(group, values, fill=group)) +
geom_point(pch=21, size=4) +
scale_y_log10(limits=c(0.1,100), breaks=c(0.1,0.3,1,3,10,30,100)) +
coord_flip()
If the logarithmic axis doesn't work for you and if you have some flexibility in the plot format, you could divide the features into two groups based on the value of relative_importance and show each in it's own panel with appropriate y-scales. Code including adjustment of bar widths would look like:
library(ggplot2)
# assign rows to Large or Small group
cut_off_for_small_values <- 3
small_value_title <- "Expanded_Scale_for_Smaller_Values"
df <- data.frame(feature, relative_importance,
importance_grp = ifelse(relative_importance > cut_off_for_small_values,
"All", small_value_title))
# calculate relative bar widths
width_adj <- .8*nrow(df[df$importance_grp==small_value_title,])/nrow(df)
# plot data
c <- ggplot(df, aes(x = feature, y = relative_importance, fill = feature))
c <- c + geom_bar(data=transform(df, importance_grp="All"),
stat = "identity")
c <- c + geom_bar(data=df[df$importance_grp==small_value_title,],
stat = "identity", width=width_adj)
c <- c + geom_text(aes(x = feature, y = relative_importance,
label = format(relative_importance, digits=3), vjust=-.5))
c <- c + theme(axis.text.x = element_text(angle=90))
c <- c + facet_wrap( ~ importance_grp, scales="free" )
which gives plot

Error when adding errorbars to ggplot

Dear Stackoverflow users,
I would like to draw a grouped barplot with three independent variables with error bars. I based my graph on an example on Stacked Overflow (stacked bars within grouped bars), using ggplot with geom_bar. When I add the geom_errorbar according to examples of the help pages, I get the following error:
Error in if (empty(data)) { : missing value where TRUE/FALSE needed
This is the script I use:
treatment<-rep(c(rep(c(1),8),rep(c(2),8)),2)
origin<-rep(c("A","B"),16)
time<-c(rep(c(5),16),rep(c(10),16))
sulfide<-c(0,10,5,8,9,6,16,18,20,25,50,46,17,58,39,43,20,25,50,46,17,58,39,43,100,120,103,104,150,160,200,180)
Reed<-data.frame(treatment,origin,time,sulfide)
# specify factor types
Reed$treatment<-as.factor(Reed$treatment)
Reed$origin<-as.character(Reed$origin)
Reed$time<-as.factor(Reed$time)
library(ggplot2)
library(scales)
#draw plot
ggplot() +geom_bar(data=Reed, aes(y = sulfide, x = treatment, fill=origin), stat="identity",position="dodge") +theme_bw() + facet_grid( ~ time)+xlab("treatment") +ylab("Sulfide")+ggtitle("Time)")
This is how I added error bars:
ErrorBars <- function(x, y, upper, lower=upper, length=0.03,...{if(length(x) != length(y) | length(y) !=length(lower) | length(lower) != length(upper))stop("vectors must be same length")arrows(x,y+upper, x, y-lower, angle=90, code=3, length=length, ...)}#function for errorbars
SE<- function(x) sqrt(var(x,na.rm=TRUE)/length(na.omit(x))) #function for SE
Reed$trt<- paste(Reed$treatment,Reed$origin,sep="")#combine treatment and origin to a column
mean_Reed<-data.frame(tapply(Reed$sulfide,list(Reed$trt,Reed$time),mean,na.rm=TRUE)) #mean
SE_Reed<-data.frame(tapply(Reed$sulfide,list(Reed$trt, Reed$time),SE)) # SE
limits <- aes(ymax = mean_Reed + SE_Reed, ymin=mean_Reed - SE_Reed)# Define the top and bottom of the errorbars
#plot with error bars:
ggplot() +geom_bar(data=Reed, aes(y = sulfide, x = treatment, fill=origin), stat="identity",position="dodge") +theme_bw() + facet_grid( ~ time)+xlab("treatment") +ylab("Sulfide")+ggtitle("Time)"+ geom_errorbar(limits, width=.2,position="dodge")
I really can't find what I'm doing wrong.
I hope you can help me:)
Leaving aside the issue of error bars for the moment, there's a much more serious problem with your plot. You have 2 values each of treatment, time, and origin, for a total of 8 combinations, but 32 values of sulfide - so there are 4 values of sulfide for each combination. When you plot this using, e.g.,
ggplot(data=Reed) +
geom_bar(aes(y = sulfide, x = treatment, fill=origin), stat="identity",position="dodge") +
facet_grid( ~ time)+xlab("treatment") +ylab("Sulfide")
you are plotting bars for all four sulfide values on top of each other all in the same color. This has the effect of displaying only the maximum value. It's a little hard to believe this is what you intended, and even if you did there's a better way to do that. For instance, if you want to plot the mean value of sulfide for each combination of factors, you can do it this way.
ggp <- ggplot(data=Reed, aes(y = sulfide, x = as.factor(treatment), group=origin)) +
geom_bar(aes(fill=origin), stat="summary", fun.y=mean, position="dodge") +
theme_bw() +
facet_grid( ~ time)+xlab("treatment") +ylab("Sulfide")+ggtitle("Time")
ggp
This uses stat="summary" to automatically summarize the result using the aggregating function mean (fun.y=mean).
As similar approach can be used to very simply add the error bars:
se <- function(y) sd(y)/length(y) # to calculate standard error in the mean
ggp+stat_summary(geom="errorbar",position=position_dodge(width=0.85),
fun.data=function(y)c(ymin=mean(y)-se(y),ymax=mean(y)+se(y)), width=0.1)
Notice that there is no need to aggregate the data externally - ggplot does it for you.
Finally, this approach lends itself to the use of many built-in functions for generating confidence limits with more statistical rigor.
ggp+stat_summary(fun.data=mean_cl_normal, conf.int=0.95,
geom="errorbar",position=position_dodge(width=0.85), width=0.1)
So here we use the ggplot built-in function mean_cl_normal to calculate 95% confidence limits on the mean assuming the data follows a normal distribution (and that, hence, the means will follow a t-distribution). We use the argument conf.int=... to specify the desired confidence interval, but the default is 0.95 so it really wasn't necessary in this example.
There are several other functions of this type: see the documentation and links therein for an explanation.
If you want to build your error bars by making a summary dataset, you just need to get that dataset in the correct format. There are lots of options for this; I will use dplyr. Notice I keep all the grouping variables from the plot in this dataset in a "tidy" format, with each variable in a separate column.
library(dplyr)
meandat = Reed %>%
group_by(treatment, time, origin) %>%
summarise(mean = mean(sulfide, na.rm = TRUE), se = SE(sulfide))
Source: local data frame [8 x 5]
Groups: treatment, time [?]
treatment time origin mean se
(fctr) (fctr) (chr) (dbl) (dbl)
1 1 5 A 7.50 3.378856
2 1 5 B 10.50 2.629956
3 1 10 A 31.50 7.858117
4 1 10 B 43.00 6.819091
5 2 5 A 31.50 7.858117
6 2 5 B 43.00 6.819091
7 2 10 A 138.25 23.552689
8 2 10 B 141.00 17.540429
Now error bars can be added via geom_errorbar. You'll see I set the aesthetics globally within ggplot to save myself having to re-type some of these, but you can change this as you want. I use position_dodge to get the error bars placed correctly over each bar.
ggplot(data = Reed, aes(y = sulfide, x = treatment, fill=origin)) +
geom_bar(stat="identity", position="dodge") +
theme_bw() +
facet_grid( ~ time)+
xlab("treatment") +
ylab("Sulfide")+
ggtitle("Time")+
geom_errorbar(data = meandat, aes(ymin = mean - se, ymax = mean + se, y = mean),
position = position_dodge(width = .9))
You can actually do all of this via stat_summary, rather than calculating the summary statistics "by hand". An example is here. The code would look like so, and gives the same plot as above.
ggplot(data = Reed, aes(y = sulfide, x = treatment, fill=origin)) +
geom_bar(stat="identity",position="dodge") +
theme_bw() +
facet_grid( ~ time) +
xlab("treatment") +
ylab("Sulfide") +
ggtitle("Time") +
stat_summary(geom = "errorbar", fun.data = mean_cl_normal, mult = 1,
position = position_dodge(width = .9))
I've been using the development version of ggplot2, ggplot2_1.0.1.9003, and found that I needed to add stat_summary function arguments via fun.args. This would look like fun.args = list(mult = 1) to get error bars of 1 standard error.

Resources