I am trying to make a plot in ggplot2 in R with the following code:
feature
[1] abs_deg_sum_1 NumAfterEdits_1 N_1 NumAfterEdits_3
[5] TimeSinceLastEdit_2 wt_product_1 NumAfterEdits_2 dwdt_1
52 Levels: abs_deg_diff_1 abs_deg_diff_2 abs_deg_diff_3 abs_deg_diff_4 ... Z_4
relative_importance
[1] 61.048212 17.235435 1.891542 1.409848 1.356924 1.264824 1.220593 1.184612
library(ggplot2)
# Combine the feature names and their importance scores into one data frame
df <- data.frame(feature, relative_importance)
# One bar per feature; stat = "identity" uses the value itself as bar height
c <- ggplot(df, aes(x = feature, y = relative_importance, fill = feature)) +
  geom_bar(stat = "identity")
c + coord_flip()
# Order in which the features should appear along the (flipped) axis
positions <- c(
  "abs_deg_sum_1", "NumAfterEdits_1", "N_1", "NumAfterEdits_3",
  "TimeSinceLastEdit_2", "wt_product_1", "NumAfterEdits_2", "dwdt_1"
)
c <- c + scale_x_discrete(limits = positions)
c + coord_flip()
Since the first value in relative_importance is really large compared to all other values, the plot doesn't show much about the other values. I get the following plot:
How can I change my code to capture more information in my plot? Especially about the smaller values
Here are several options, though I prefer the first or second (or maybe the third if you really want to go with a bar plot):
# Fake data. `group` is created as a factor explicitly: since R 4.0
# data.frame() no longer converts strings to factors, and the plot below
# relies on as.numeric(group) and levels(dat$group).
dat <- data.frame(group = factor(LETTERS[1:5]),
                  values = c(1.5, 0.6, 12.6, 2.1, 85))
# Value labels instead of bars, plus we add a horizontal segment to provide
# better visual guidance as to the relative values. This also requires
# some factor gymnastics to be able to get both the segments and the
# correct x-axis labels. I've left in the legend, but it's not necessary
# and can be removed if you wish.
ggplot(dat, aes(as.numeric(group), values, colour = group)) +
  geom_segment(aes(x = as.numeric(group) - 0.35,
                   xend = as.numeric(group) + 0.35,
                   yend = values), alpha = 0.75) +
  # show.legend replaces show_guide, which was deprecated in ggplot2 2.0.0
  geom_text(aes(label = values), fontface = "bold", show.legend = FALSE) +
  # One break per factor level, labelled with the level name
  scale_x_continuous(breaks = seq_len(nlevels(dat$group)),
                     labels = levels(dat$group))
#scale_y_log10(limits=c(0.1,100), breaks=c(0.1, 0.3,1,3,10,30,100)) # For a log scale, if desired
#coord_flip() # Flip to horizontal orientation, if desired
# Text labels only: each value is printed at its own height
ggplot(dat, aes(group, values, colour = group)) +
  geom_text(aes(label = values), fontface = "bold")

# Bars, with each value printed at the bar's mid-height
ggplot(dat, aes(group, values, fill = group)) +
  geom_col() +
  geom_text(aes(label = values, y = 0.5 * values), size = 5, colour = "black")

# Text labels only, on a log10 value axis, drawn horizontally
ggplot(dat, aes(group, values, colour = group)) +
  geom_text(aes(label = values)) +
  scale_y_log10(limits = c(0.1, 100),
                breaks = c(0.1, 0.3, 1, 3, 10, 30, 100)) +
  coord_flip()

# Bars on a log10 value axis. On a log scale the bar baseline is 1 rather
# than zero, so this variant doesn't work so well.
ggplot(dat, aes(group, values, fill = group)) +
  geom_col() +
  scale_y_log10(limits = c(0.1, 100),
                breaks = c(0.1, 0.3, 1, 3, 10, 30, 100)) +
  coord_flip()

# Filled points instead of bars, on a log10 value axis
ggplot(dat, aes(group, values, fill = group)) +
  geom_point(shape = 21, size = 4) +
  scale_y_log10(limits = c(0.1, 100),
                breaks = c(0.1, 0.3, 1, 3, 10, 30, 100)) +
  coord_flip()
If the logarithmic axis doesn't work for you and if you have some flexibility in the plot format, you could divide the features into two groups based on the value of relative_importance and show each in its own panel with appropriate y-scales. Code including adjustment of bar widths would look like:
library(ggplot2)
# Assign each row to the large-value group ("All") or the small-value group,
# depending on whether its importance exceeds the cut-off
cut_off_for_small_values <- 3
small_value_title <- "Expanded_Scale_for_Smaller_Values"
df <- data.frame(feature, relative_importance,
                 importance_grp = ifelse(relative_importance > cut_off_for_small_values,
                                         "All", small_value_title))
# Scale the bar width in the small-value panel by the share of rows it holds,
# so bars look about equally wide in both panels
is_small <- df$importance_grp == small_value_title
width_adj <- 0.8 * sum(is_small) / nrow(df)
# Plot data
c <- ggplot(df, aes(x = feature, y = relative_importance, fill = feature))
# "All" panel: every feature, relabelled so all rows land in the same facet
c <- c + geom_col(data = transform(df, importance_grp = "All"))
# Small-value panel: only the features at or below the cut-off, narrower bars
c <- c + geom_col(data = df[is_small, ], width = width_adj)
# Value labels; x and y are inherited from ggplot(), and vjust is a constant,
# so it is set outside aes() instead of being mapped as an aesthetic
c <- c + geom_text(aes(label = format(relative_importance, digits = 3)),
                   vjust = -0.5)
c <- c + theme(axis.text.x = element_text(angle = 90))
# Free scales give each panel its own y-range
c <- c + facet_wrap(~ importance_grp, scales = "free")
which gives plot
Related
I am trying to create a barplot with the ggplot2 library. My data is stored in read.csv2 format.
# Library
library(ggplot2)
library(tidyverse) # provides the pipe "%>%" (not used in this snippet)
# 1. Read data (read.csv2 expects semicolon-separated fields)
data = read.csv2(text = "Age;Frequency
0 - 10;1
11 - 20;5
21 - 30;20
31 - 40;13
41 - 49;1")
# 2. Print table
df <- as.data.frame(data)
df
# 3. Plot bar chart (only x is mapped here; Frequency is not used)
ggplot(df, aes(x = Age)) +
geom_bar() +
theme_classic()
The code runs fine, but it produces a graph in which every bar has the same height, as if all the values were at the maximum.
You need to specify your y axis as well:
# Map Frequency to y and use stat = "identity" so bar heights are the values themselves
ggplot(df, aes(x = Age, y = Frequency)) +
geom_bar(stat = "identity") +
theme_classic()
By default, geom_bar uses stat = "count", which plots the number of rows for each x value; that is 1 for every Age value here (check table(df$Age)). To plot the Frequency column itself, use geom_bar with stat = 'identity':
library(ggplot2)
# Age is mapped to x and Frequency to y (positional aes arguments)
ggplot(df, aes(Age, Frequency)) +
geom_bar(stat = 'identity') +
theme_classic()
OR geom_col :
# geom_col() draws bars whose heights are the y values, with no stat argument needed
ggplot(df, aes(Age, Frequency)) +
geom_col() +
theme_classic()
I would like to have bars and errorbars for this data
I managed to get the bars with:
ggplot(FCDreach_global_mod, aes(x = as.factor(t3-t2), y = 1-value, fill=as.factor(t2-t1) )) +
geom_bar(stat = "identity" )
However, I don't know how to draw the error bars. I tried geom_errorbar() but couldn't get it to work.
When drawing line plots I would use:
stat_summary(fun.data=mean_cl_normal, geom="errorbar")
but this does not seem to work correctly with geom_bar()
I tried this:
# Bars show the mean of 1 - value per x level; error bars come from mean_cl_normal
ggplot(FCDreach_global_mod,
       aes(x = as.factor(t3 - t2), y = 1 - value, fill = as.factor(t2 - t1))) +
  # `fun` replaces the `fun.y` argument, deprecated in ggplot2 3.3.0
  stat_summary(fun = mean, geom = "bar") +
  stat_summary(fun.data = mean_cl_normal, geom = "errorbar", width = 0.5)
and the breaks on the y looked quite different compared to the ones I got with geom_bar(stat = "identity" ). The size of the bars is the same, but something weird happens with the y scale.
geom_bar:
stat_summary:
EDIT: the desired output is to show the equivalent of this plot in a barplot, of course excluding the x axis and placing t3-t2 on x
which I obtain by:
# Line plot of the mean of 1 - value, with error bars from mean_cl_normal
ggplot(FCDreach_global_mod,
       aes(x = roundedRealNumVehicles / 2, y = 1 - value,
           colour = as.factor(t3 - t2), linetype = as.factor(t2 - t1))) +
  # `fun` replaces the `fun.y` argument, deprecated in ggplot2 3.3.0
  stat_summary(fun = mean, geom = "line") +
  stat_summary(fun.data = mean_cl_normal, geom = "errorbar", width = 0.5)
In your first graph, the y-axis represents (1-value) summed for each level of (t3-t2). In the second, the y-axis is the mean. You can verify this manually by using aggregate to recreate these values:
## Question 1: what is the y-axis of the first plot?
## It is (1-value) summed within each level of (t3-t2)
(p1 <- aggregate((1 - value) ~ I(t3 - t2), data = FCDreach_global_mod, FUN = sum))
# I(t3 - t2) (1 - value)
# 1 0.4 19.51663
# 2 0.5 19.70297
## Question 2: where does the 0.075 come from in the stat_summary?
## It is (1-value) averaged within each level of (t3-t2)
(p2 <- aggregate((1 - value) ~ I(t3 - t2), data = FCDreach_global_mod, FUN = mean))
# I(t3 - t2) (1 - value)
# 1 0.4 0.09119921
# 2 0.5 0.09038062
## Normal-approximation 95% confidence limits, one row per level of (t3-t2)
se <- with(FCDreach_global_mod, {
  by_level <- split(1 - value, factor(t3 - t2))
  limits <- lapply(by_level, function(v) {
    half_width <- sd(v) / sqrt(length(v)) * qnorm(0.975)
    c(mean(v) - half_width, mean(v) + half_width)
  })
  do.call(rbind, limits)
})
## Rebuild the bar plot from the pre-computed means and limits
dat <- setNames(p2, c("x", "y"))
dat <- cbind(
  dat,
  setNames(data.frame(se), c("ymin", "ymax"))
)
ggplot(dat, aes(x, y)) +
  geom_col(aes(fill = factor(x))) +
  geom_errorbar(aes(ymin = ymin, ymax = ymax), colour = "black", width = 0.05) +
  theme_bw()
I'm using ggplot2 to create a simple dot plot of -1 to +1 correlation values using the following R code:
ggplot(dataframe, aes(x = exit)) +
geom_point(aes(y= row.names(dataframe))) +
geom_text(aes(y=exit, label=samplesize))
The y-axis has text labels, and I believe those text labels may be the reason that my geom_text() data point labels are squished down into the bottom of the plot as pictured here:
How can I change my plotting so that the data point labels appear on the dots themselves?
I understand that you would like to have the samplesize appear above each data point in the plot. Here is a sample plot with a sample data frame that does this:
EDIT: Per note by Gregor, changed the geom_text() call to utilize aes() when referencing the data. Thanks for the heads up!
# Sample data, written so that it can be run as-is. The header line has one
# field fewer than the data lines, so read.table() uses the first column as
# row names.
top10_rank <- read.table(header = TRUE, text = "
   String Number
4       h      0
1       a      1
11      w      1
3       z      3
7       z      3
2       b      4
8       q      5
6       k      6
9       r      9
5       x     10
10      l     11
")
# Points at (Number, String). The y-axis follows the order of the data;
# unique() is needed because "z" occurs twice and discrete limits must not
# contain duplicates.
x <- ggplot(data = top10_rank, aes(x = Number, y = String)) +
  geom_point(size = 3) +
  scale_y_discrete(limits = unique(top10_rank$String))
# Label each point with its Number; data, x and y are inherited from ggplot()
x + geom_text(aes(label = Number), size = 5, color = "blue",
              hjust = 0, vjust = 0)
Not sure if this is what you wanted though.
Your problem is simply that you switched the y variables:
# your code
ggplot(dataframe, aes(x = exit)) +
geom_point(aes(y = row.names(dataframe))) + # here y is the row names
geom_text(aes(y =exit, label = samplesize)) # here y is the exit column
Since you want the same y-values for both you can define this in the initial ggplot() call and not worry about repeating it later
# working version
ggplot(dataframe, aes(x = exit, y = row.names(dataframe))) +
geom_point() +
geom_text(aes(label = samplesize))
Using row names is a little fragile; it's safer and more robust to actually create a data column with what you want for y values:
# nicer code: store the labels in a real column instead of relying on row names
dataframe[["y"]] <- row.names(dataframe)
ggplot(dataframe, aes(x = exit, y = y)) +
  geom_point() +
  geom_text(aes(label = samplesize))
Having done this, you probably don't want the labels right on top of the points, maybe a little offset would be better:
# best of all?
ggplot(dataframe, aes(x = exit, y = y)) +
geom_point() +
geom_text(aes(x = exit + .05, label = samplesize), vjust = 0)
In the last case, you'll have to play with the adjustment to the x aesthetic; what looks right will depend on the dimensions of your final plot.
I'm trying to annotate the highest value in each facet of a graph.
I can't figure out how to remove extra space at the bottom of the y axis without clipping the text above the highest value.
A) Is there a non-symmetrical version of scale_y_continuous(expand=c(0,0))?
B) Or, is there a way to make ggplot include text as part of the graph range?
# a simple dataset
count <- 40
data <- data.frame(
  category = sample(LETTERS[1:3], count, replace = TRUE),
  x = rnorm(count),
  y = abs(rnorm(count))
)
# find the highest value in each category:
# sort by descending y, then keep the first row of every category
library(plyr) # library() fails loudly if plyr is missing; require() only returns FALSE
data <- data[order(-data$y), ]
topValues <- ddply(data, .(category), head, 1)
library(ggplot2)
ggplot(data) +
  geom_line(aes(x = x, y = y)) +
  geom_text(data = topValues, aes(x = x, y = y, label = y)) + # label the highest y value
  # add vjust=-1 to put text above point if possible
  # `scales` is spelled out in full; `scale=` relied on partial argument matching
  facet_grid(category ~ ., scales = "free") +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0))
The answer comes thanks to baptiste.
Just add this call to the plot to make a blank point at the top of the text:
# Invisible layer placed 10% above each top value, so the y-range grows to fit
# the label. geom_blank() draws nothing and has no use for a `label` mapping.
geom_blank(data = topValues, aes(x = x, y = y * 1.1))
You can use the vjust argument of geom_text to tweak the vertical position of the label relative to the x and y coordinate:
ggplot(data) +
  geom_line(aes(x = x, y = y)) +
  # vjust = 1.5 shifts the label below its (x, y) anchor so it stays inside the panel
  geom_text(data = topValues, aes(x = x, y = y, label = y), vjust = 1.5) + # label the highest y value
  # `scales` is spelled out in full; `scale=` relied on partial argument matching
  facet_grid(category ~ ., scales = "free") +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0))
I have a basic problem with the geom_histogram function
With the dataset:
# 100 rows of "A" followed by 200 rows of "B", stored as a factor column
df <- data.frame(value = rep(c("A", "B"), times = c(100, 200)),
                 stringsAsFactors = TRUE)
I create a histogram with:
ggplot(df, aes(x=value, fill = factor(value))) + geom_histogram()
and the output is a histogram with count 100 for A and 200 for B
If I instead plot the density with:
ggplot(df, aes(x=value, fill = factor(value), ..density..)) + geom_histogram()
the output is a histogram with density 1 for A and 1 for B. I assume the reason is that the density is calculated on A and B separately.
The histogram created with:
ggplot(df, aes(x=value, group = 1, fill = factor(value),..density..)) + geom_histogram()
Is a histogram where A is 0.33 and B is 0.66, but the fill color is black, and I cannot find a way to get the fill colors used in the previous histograms in this version of the plot.
How do I generate the last version of the histogram with fill colors based on factor(value)?
I solved the problem with:
# Proportion of rows per level, filled by level.
# geom_bar() is used because `value` is discrete (geom_histogram() requires a
# continuous x in current ggplot2), and after_stat() replaces the deprecated
# ..count.. notation. sum(count) runs over all bars, so the heights add up to 1.
ggplot(df, aes(x = value, fill = factor(value))) +
  geom_bar(aes(y = after_stat(count / sum(count))))