Using ggplot2, I want to create a histogram where anything above X is grouped into the final bin. For example, if most of my distribution was between 100 and 200, and I wanted to bin by 10, I would want anything above 200 to be binned in "200+".
# Create some fake data: 10,000 random ids and visit counts between 1 and 1200.
library(ggplot2)
id <- sample(1:100000, 10000, replace = TRUE)
visits <- sample(1:1200, 10000, replace = TRUE)
# Combine the two vectors into a data frame.
df <- data.frame(id, visits)
# Build a histogram with 50-wide bins. The plot object is only stored here;
# print(hist) draws it. The variable shares its name with graphics::hist().
hist <- ggplot(df, aes(x = visits)) +
  geom_histogram(binwidth = 50)
How can I limit the x-axis while still representing the data that lies beyond that limit?
Perhaps you're looking for the breaks argument for geom_histogram:
library(ggplot2)

# Create some fake data: 10,000 random ids and visit counts between 1 and 1200.
id <- sample(1:100000, 10000, replace = TRUE)
visits <- sample(1:1200, 10000, replace = TRUE)
# Combine the two vectors into a data frame.
df <- data.frame(id, visits)

# Plot the data. The breaks are 10-wide bins up to 200 followed by a single
# catch-all bin that runs from 200 to the largest observed value.
# coord_cartesian() then zooms the view to 0-210 without discarding any rows,
# so the catch-all bar is still computed from every value above 200.
ggplot(df, aes(x = visits)) +
  geom_histogram(
    breaks = c(seq(0, 200, by = 10), max(df$visits)),
    position = "identity"
  ) +
  coord_cartesian(xlim = c(0, 210))
This would look like this (with the caveats that the fake data looks pretty bad here and that the axis needs to be adjusted as well to match the breaks):
Edit:
Maybe someone else can weigh in here:
# Axis labels: one per bin edge, with the catch-all bin tagged "200+" and the
# final edge left blank.
lbls <- c(as.character(seq(0, 190, by = 10)), "200+", "")
# Bin edges: every 10 units up to 200, then the largest observed value.
brks <- c(seq(0, 200, by = 10), max(visits))
# Evaluates to TRUE: both vectors have the same number of elements.
length(brks) == length(lbls)
# Attempt to relabel the axis. The labels are handed to the x scale without a
# matching `breaks` argument.
ggplot(df, aes(x = visits)) +
  geom_histogram(breaks = brks, position = "identity") +
  coord_cartesian(xlim = c(0, 220)) +
  scale_x_continuous(labels = lbls)
The plot errors with:
Error in scale_labels.continuous(scale) :
Breaks and labels are different lengths
This looks like a previously reported issue, but that one was fixed 8 months ago.
If you want to fudge it a little to get around the issues of bin labelling then just subset your data and create the binned values in a new sacrificial data-frame:
# Simulate 10,000 random ids and visit counts between 1 and 1200.
id <- sample(1:100000, 10000, replace = TRUE)
visits <- sample(1:1200, 10000, replace = TRUE)
# Combine the two vectors into a data frame.
df <- data.frame(id, visits)
# Create a sacrificial copy in which every visit count above 200 is capped at
# 200, so that all of those rows land in the final histogram bin.
dfsac <- df
dfsac$visits <- pmin(dfsac$visits, 200)
Then use the breaks and labels arguments of scale_x_continuous to define your bin labels easily:
# Histogram of the capped visit counts in 10-wide bins from 0 to 200.
# `visits` is mapped by column name so the aesthetic is looked up in `dfsac`
# rather than captured as a free-standing vector via `$`.
ggplot(data = dfsac, aes(x = visits)) +
  geom_histogram(
    breaks = seq(0, 200, by = 10),
    colour = "black",
    fill = "red"
  ) +
  labs(x = "Visits", y = "Count") +
  # One tick per bin edge; the last tick is relabelled "200+" because the
  # final bin also holds every value that was capped at 200.
  scale_x_continuous(
    limits = c(0, 200),
    breaks = seq(0, 200, by = 10),
    labels = c(seq(0, 190, by = 10), "200+")
  )
Related
I'm not able to plot 7 time series one on top of the other using ggplot. Why does this reproducible code not work? signal is a factor variable with 7 levels spanning 700 values (100 values each), yet somehow the values will only plot if I change the x in aes() to be 1:700. I'd like each signal to plot from 1 to 100. Why isn't that happening?
# Build a clean sine wave, five noisy copies of it, and the row-wise sum of
# the noisy copies ("stacked").
signal_to_noise_ratio <- 10
t <- seq(0.1, 10, 0.1)
df <- data.frame(truesignal = sin(t))

# df2 collects the five noisy realisations, one per column. It starts as a
# copy of df; its first column is overwritten on the first iteration.
df2 <- df
for (i in seq_len(5)) {
  noise <- rnorm(length(t))
  # Scale the noise so that var(t) / var(k * noise) equals the requested ratio.
  k <- sqrt(var(t) / (signal_to_noise_ratio * var(noise)))
  data_wNoise <- t + k * noise
  df2[, i] <- sin(data_wNoise)
}

df[, 2:6] <- df2
# Only column 7 receives the sum. Assigning it to columns 2:7 would recycle
# the sum across the noisy columns and overwrite all five of them.
df[, 7] <- rowSums(df2)
colnames(df) <- c(
  "truesignal", "noisy1", "noisy2", "noisy3", "noisy4", "noisy5",
  "stacked"
)
# Reshape to long format: the seven columns are stacked into `value`, and the
# originating column name is recorded in `signal`.
melt_df <- melt(df, measure.vars = 1:7, variable.name = "signal")
# One facet row per signal. Note that x is mapped to the free-standing vector
# `t`, not to a column of melt_df.
ggplot(
  data = melt_df,
  aes(x = t, y = value, colour = factor(signal))
) +
  geom_path() +
  facet_grid(signal ~ .)
You probably want something like an id variable.
library(ggplot2)
# Give every row a within-signal position: 1 to 100, repeated once for each
# of the seven signals, so the column is as long as the melted data.
melt_df$t.2 <- rep(seq_len(100), times = 7)
# Same faceted plot, with x now taken from a column of melt_df.
ggplot(
  data = melt_df,
  aes(x = t.2, y = value, colour = factor(signal))
) +
  geom_path() +
  facet_grid(signal ~ .)
Yields:
Following the very good example provided here, I tried to make the following filled contour plot.
# Fine evaluation grid on [1, 11] in both directions.
x <- seq(1, 11, 0.03)
y <- seq(1, 11, 0.03)
# Surface to be contoured.
xyz.func <- function(x, y) {
  x^2 + y^2
}
# Long format (one row per grid point), as ggplot needs.
gg <- expand.grid(x = x, y = y)
gg$z <- xyz.func(gg$x, gg$y)
# Bin z, then turn the interval labels into plain text such as "1 - 2".
# gsub() returns character vectors, so the result is no longer a factor.
brks <- cut(gg$z, breaks = c(1, 2, 5, 10, 30, 50, 100, 200))
brks <- gsub(",", " - ", brks, fixed = TRUE)
gg$brks <- gsub("\\(|\\]", "", brks)
# Filled tiles coloured by the z bin, using a 7-colour yellow-to-red palette.
# Zero expansion on both axes removes the padding around the tiles.
ggplot(gg, aes(x, y)) +
  geom_tile(aes(fill = brks)) +
  scale_fill_manual("Z", values = brewer.pal(7, "YlOrRd")) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  coord_fixed()
The result looks like this:
The thing is, the contours are sorted by alphabetical order, not by ascending values.
How would you change the order of the colors to be by ascending z values?
At first, I thought about adding "0"s in front of the values. I tried something like:
# Attempt to zero-pad single digits. Inside an R string literal "\b" is the
# backspace character, not the regex word boundary (that needs "\\b"), and
# gsub() writes backreferences as "\\1", so "0$1" would be inserted literally.
brks <- gsub(pattern = "(\b[0-9]\b)", replacement = "0$1", x = brks)
But it does not work.
Moreover, it would only add one zero in front of single digits, and 100 would still be before 02.
Actually, I'm not completely satisfied with this workaround, as 001 - 002 does not look beautiful.
Make your breaks an ordered factor:
# Fine evaluation grid on [1, 11] in both directions.
x <- seq(1, 11, 0.03)
y <- seq(1, 11, 0.03)
# Surface to be contoured.
xyz.func <- function(x, y) {
  x^2 + y^2
}
# Long format (one row per grid point), as ggplot needs.
gg <- expand.grid(x = x, y = y)
gg$z <- with(gg, xyz.func(x, y))
# Bin z into an ordered factor so the levels keep their numeric order.
brks <- cut(
  gg$z,
  breaks = c(1, 2, 5, 10, 30, 50, 100, 200),
  ordered_result = TRUE
)
# Reformat the guide labels by editing the levels only; this keeps the object
# a factor, whereas running gsub() on the factor itself returns characters.
levels(brks) <- gsub(",", " - ", levels(brks), fixed = TRUE)
levels(brks) <- gsub("\\(|\\]", "", levels(brks))
gg$brks <- brks
# Filled tiles coloured by the z bin; the legend follows the factor's level
# order. Zero expansion on both axes removes the padding around the tiles.
ggplot(gg, aes(x, y)) +
  geom_tile(aes(fill = brks)) +
  scale_fill_manual("Z", values = brewer.pal(7, "YlOrRd")) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  coord_fixed()
I'm trying to plot distribution of species between 2 different habitat types (hab 1 and hab 2). Some of my species secondarily use some habitats, so I have a separate column for secondary hab1 (hab1.sec). To visualise their distribution across the two habitats and different depths, I am using a facet_grid between hab1 and hab2. Example code as below:
# Example data: 20 species with a primary habitat, an optional secondary
# habitat, a second habitat classification and a depth range.
set.seed(101)
ID <- seq(1, 20, by = 1) ## ID for plotting
species <- sample(letters, size = 20) ## arbitrary species
## different habitat types in hab.1
hab1 <- c("coastal", "shelf", "slope", "open.ocean", "seamount")
hab1.pri <- sample(hab1, size = 20, replace = TRUE)
## secondarily used habitats; NA marks species that have none
hab.sec <- c("coastal", "shelf", "slope", "open.ocean", "seamount", NA)
hab1.sec <- sample(hab.sec, size = 20, replace = TRUE)
## habitat types for hab.2
hab2 <- c("epipelagic", "benthopelagic", "epibenthic", "benthic")
hab.2 <- sample(hab2, size = 20, replace = TRUE)
## arbitrary depth values, drawn independently of each other
dep.min <- sample(seq(0, 1000), size = 20, replace = TRUE)
dep.max <- sample(seq(40, 1500), size = 20, replace = TRUE)
# make data frame
dat <- data.frame(ID, species, hab1.pri, hab1.sec, hab.2, dep.min, dep.max)
# Facet grid of depth ranges: one vertical segment per species ID, rows split
# by hab.2 and columns by primary habitat, with a reversed depth axis.
p <- ggplot(data = dat) +
  geom_segment(
    aes(
      x = as.factor(ID), xend = as.factor(ID),
      y = dep.min, yend = dep.max
    ),
    size = 2,
    data = dat
  ) +
  scale_y_reverse(breaks = c(0, 200, 1000, 1500)) +
  facet_grid(hab.2 ~ hab1.pri, scales = "free", space = "free") +
  theme_bw()
I would like to add segments for hab1.sec within the existing facet grid. I have tried this code:
# Attempt: add dashed segments on top of `p` and facet by secondary habitat.
p +
  geom_segment(
    aes(
      x = as.factor(ID), xend = as.factor(ID),
      y = dep.min, yend = dep.max
    ),
    linetype = 2,
    data = dat
  ) +
  facet_wrap(~hab1.sec)
But doing this produces a new graph.
Is there a better way to add those extra lines to the existing grid (preferably as dashed lines)?
I'd be really grateful for any help with this!
Thanks a lot, in advance!
What about combining the primary and secondary habitats into one variable and mapping that variable to an aesthetic?
Note I'm using tidyr and dplyr tools here because they help a lot in cases like this.
library(dplyr)
library(tidyr)
# Stack the primary and secondary habitat columns into one long column:
# `hab1` records which column a row came from ("hab1.pri" or "hab1.sec") and
# `value` holds the habitat itself. na.rm = TRUE drops species that have no
# secondary habitat, so they do not end up in a facet column labelled "NA".
dat %>%
  gather(hab1, value, -ID, -species, -(hab.2:dep.max), na.rm = TRUE) %>%
  ggplot() +
  # Line type distinguishes primary from secondary habitat use.
  geom_segment(
    aes(
      x = as.factor(ID), xend = as.factor(ID),
      y = dep.min, yend = dep.max,
      linetype = hab1
    ),
    size = 2
  ) +
  scale_y_reverse(breaks = c(0, 200, 1000, 1500)) +
  facet_grid(hab.2 ~ value, scales = "free", space = "free") +
  theme_bw()
I successfully used the code provided in "R: How do I display clustered matrix heatmap (similar color patterns are grouped)"; however, I'm not able to replace the y-axis values with text labels. Is this possible?
library(reshape2)
library(ggplot2)
# Dummy data: three columns of 25 random integers between 1 and 5.
set.seed(123)
df <- data.frame(
  a = sample(1:5, 25, replace = TRUE),
  b = sample(1:5, 25, replace = TRUE),
  c = sample(1:5, 25, replace = TRUE)
)
# k-means clustering into three groups.
k <- kmeans(df, 3)
# Attach a row id and the cluster each row was assigned to.
dfc <- cbind(df, id = seq_len(nrow(df)), cluster = k$cluster)
# idsort is each row's position once rows are ordered by cluster. Because id
# is simply 1..n, indexing id by order(cluster) gives order(cluster) itself,
# and ordering that permutation yields the positions.
dfc$idsort <- order(order(dfc$cluster))
# Long format via reshape2::melt: `id` and `idsort` are kept as identifiers
# and every other column is stacked into variable/value pairs.
dfm <- melt(dfc, id.vars = c("id", "idsort"))
# Heatmap with one tile per (variable, row), rows placed by idsort.
ggplot(dfm, aes(x = variable, y = idsort)) +
  geom_tile(aes(fill = value))
You can use scale_y_continuous() to set breaks= and then provide labels= (in this example, just letters are used). With the argument expand=c(0,0) inside scale_...() you can remove the grey area in the plot.
# Same heatmap, with one y-axis tick per row, labelled with the letters a to y,
# and no expansion padding on either axis.
ggplot(dfm, aes(x = variable, y = idsort)) +
  geom_tile(aes(fill = value)) +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_continuous(
    expand = c(0, 0),
    breaks = 1:25,
    labels = letters[1:25]
  )
I have this code:
# 10,000 evenly spaced x values on [-600, 600].
x <- seq(-600, 600, length.out = 10000)
# Curve 1 / (1 + 10^(-x / sd)) evaluated on `x`, tagged with its SD value.
make_curve <- function(sd) {
  data.frame(x = x, SD = sd, val = 1 / (1 + 10^(-x / sd)))
}
dat1 <- make_curve(400)
dat2 <- make_curve(200)
dat3 <- make_curve(600)
# Stack the three curves into one long data frame.
dat <- rbind(dat1, dat2, dat3)
# One line per SD value; colour is mapped to the numeric SD column.
ggplot(data = dat, aes(x = x, y = val, colour = SD)) +
  geom_line(aes(group = SD))
What I expected is to have 3 curves and I do. However the legend shows that there are 6 curves - for SD 100, 200, 300, 400, 500, 600 instead of only 200, 400, 600. Why is that and how do I fix this?
The legend is not indicating the presence of 6 curves. You've mapped the continuous variable SD to the aesthetic colour, which results in a continuous colour scale, i.e. a gradient. If you want only the three values in the legend, try wrapping SD in factor:
# Wrapping SD in factor() makes the colour scale discrete, so the legend shows
# one entry per SD value.
ggplot(data = dat, aes(x = x, y = val, colour = factor(SD))) +
  geom_line(aes(group = SD))