I used the code provided in R: How do I display clustered matrix heatmap (similar color patterns are grouped) succesfully, however im not able to replace the Y-axis with text-labels, is this possible?
library(reshape2)
library(ggplot2)
# Create dummy data
set.seed(123)
df <- data.frame(
a = sample(1:5, 25, replace=TRUE),
b = sample(1:5, 25, replace=TRUE),
c = sample(1:5, 25, replace=TRUE)
)
# Perform clustering
k <- kmeans(df, 3)
# Append id and cluster
dfc <- cbind(df, id=seq(nrow(df)), cluster=k$cluster)
# Add idsort, the id number ordered by cluster
dfc$idsort <- dfc$id[order(dfc$cluster)]
dfc$idsort <- order(dfc$idsort)
# use reshape2::melt to create data.frame in long format
dfm <- melt(dfc, id.vars=c("id", "idsort"))
ggplot(dfm, aes(x=variable, y=idsort)) + geom_tile(aes(fill=value))
You can use scale_y_continuous() to set breaks= and then provide labels= (for example used just letters). With argument expand=c(0,0) inside scale_... you can remove grey area in plot.
ggplot(dfm, aes(x=variable, y=idsort)) + geom_tile(aes(fill=value))+
scale_x_discrete(expand=c(0,0))+
scale_y_continuous(expand=c(0,0),breaks=1:25,labels=letters[1:25])
Related
I'm trying to plot two graphs side-by-side with one common legend that incorporates all the variables between both graphs (some vars are different between the graphs).
Here's a mock example of what I've been attempting:
#make relative abundance values for n rows
makeData <- function(n){
n <- n
x <- runif(n, 0, 1)
y <- x / sum(x)
}
#make random matrices filled with relative abundance values
makeDF <- function(col, rw){
df <- matrix(ncol=col, nrow=rw)
for(i in 1:ncol(df)){
df[,i] <- makeData(nrow(df))
}
return(df)
}
#create df1 and assign col names
df1 <- makeDF(4, 5)
colSums(df1) #verify relative abundance values = 1
df1 <- as.data.frame(df1)
colnames(df1) <- c("taxa","s1", "s2", "s3")
df1$taxa <- c("ASV1", "ASV2", "ASV3", "ASV4", "ASV5")
#repeat for df2
df2 <- makeDF(4,5)
df2 <- as.data.frame(df2)
colnames(df2) <- c("taxa","s1", "s2", "s3")
df2$taxa <- c("ASV1", "ASV5", "ASV6", "ASV7", "ASV8")
# convert wide data format to long format -- for plotting
library(reshape2)
makeLong <- function(df){
df.long <- melt(df, id.vars="taxa",
measure.vars=grep("s\\d+", names(df), val=T),
variable.name="sample",
value.name="value")
return(df.long)
}
df1 <- makeLong(df1)
df2 <- makeLong(df2)
#generate distinct colours for each asv
taxas <- union(df1$taxa, df2$taxa)
library("RColorBrewer")
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
colpals <- qual_col_pals[c("Set1", "Dark2", "Set3"),] #select colour palettes
col_vector = unlist(mapply(brewer.pal, colpals$maxcolors, rownames(colpals)))
taxa.col=sample(col_vector, length(taxas))
names(taxa.col) <- taxas
# plot using ggplot
library(ggplot2)
plotdf2 <- ggplot(df2, aes(x=sample, y=value, fill=taxa)) +
geom_bar(stat="identity")+
scale_fill_manual("ASV", values = taxa.col)
plotdf1 <- ggplot(df1, aes(x=sample, y=value, fill=taxa)) +
geom_bar(stat="identity")+
scale_fill_manual("ASV", values = taxa.col)
#combine plots to one figure and merge legend
library(ggpubr)
ggpubr::ggarrange(plotdf1, plotdf2, ncol=2, nrow=1, common.legend = T, legend="bottom")
(if you have suggestions on how to generate better mock data, by all means!)
When I run my code, I am able to get the two graphs in one figure, but the legend does not incorporate all variables from both plots:
I ideally would like to avoid having repeat variables in the legend, such as:
From what I've searched online, the legend only works when the variables are the same between graphs, but in my case I have similar and different variables.
Thanks for any help!
Maybe this is what you are looking for:
Convert your taxa variables to factor with the levels equal to your taxas variable, i.e. to include all levels from both datasets.
Add argument drop=FALSE to both scale_fill_manual to prevent dropping of unused factor levels.
Note: I only added the relevant parts of the code and set the seed to 42 at the beginning of the script.
set.seed(42)
df1$taxa <- factor(df1$taxa, taxas)
df2$taxa <- factor(df2$taxa, taxas)
# plot using ggplot
library(ggplot2)
plotdf2 <- ggplot(df2, aes(x=sample, y=value, fill=taxa)) +
geom_bar(stat="identity") +
scale_fill_manual("ASV", values = taxa.col, drop = FALSE)
plotdf1 <- ggplot(df1, aes(x=sample, y=value, fill=taxa)) +
geom_bar(stat="identity")+
scale_fill_manual("ASV", values = taxa.col, drop = FALSE)
#combine plots to one figure and merge legend
library(ggpubr)
ggpubr::ggarrange(plotdf1, plotdf2, ncol=2, nrow=1, common.legend = T, legend="bottom")
Following the very good example provided here, I tried to make the following filled contour plot.
x<-seq(1,11,.03) # note finer grid
y<-seq(1,11,.03)
xyz.func<-function(x,y) {(x^2+y^2)}
gg <- expand.grid(x=x,y=y)
gg$z <- with(gg,xyz.func(x,y)) # need long format for ggplot
brks <- cut(gg$z,breaks=c(1, 2, 5, 10, 30, 50, 100, 200))
brks <- gsub(","," - ",brks,fixed=TRUE)
gg$brks <- gsub("\\(|\\]","",brks) # reformat guide labels
ggplot(gg,aes(x,y)) +
geom_tile(aes(fill=brks))+
scale_fill_manual("Z",values=brewer.pal(7,"YlOrRd"))+
scale_x_continuous(expand=c(0,0))+
scale_y_continuous(expand=c(0,0))+
coord_fixed()
The result looks like this:
The thing is, the contours are sorted by alphabetical order, not by ascending values.
How would you change the order of the colors to be by ascending z values?
At first, I thought about adding "0"s in front of the values. I tried something like:
brks <- gsub(pattern = "(\b[0-9]\b)", replacement = "0$1", x = brks)
But it does not work.
Moreover, it would only add one zero in front of single digits, and 100 would still be before 02.
Actually, I'm not completely satisfied with this workaround, as 001 - 002 does not look beautiful.
Make your breaks an ordered factor:
x<-seq(1,11,.03) # note finer grid
y<-seq(1,11,.03)
xyz.func<-function(x,y) {(x^2+y^2)}
gg <- expand.grid(x=x,y=y)
gg$z <- with(gg,xyz.func(x,y)) # need long format for ggplot
brks <- cut(gg$z,breaks=c(1, 2, 5, 10, 30, 50, 100, 200), ordered_result = T)
levels(brks) <- gsub(","," - ", levels(brks), fixed=TRUE)
levels(brks) <- gsub("\\(|\\]","", levels(brks))
gg$brks <- brks # reformat guide labels
ggplot(gg,aes(x,y)) +
geom_tile(aes(fill=brks))+
scale_fill_manual("Z",values=brewer.pal(7,"YlOrRd"))+
scale_x_continuous(expand=c(0,0))+
scale_y_continuous(expand=c(0,0))+
coord_fixed()
Hi there: I need to plot a factor with 81 different categories with different frequency counts each. Each factor name is a 4-letter category. It looks like this. As you can see, it is pretty tough to read the factor labels. I'd like to stagger the y-axis according to this suggestion. However, this issue on github suggests that something has changed in ggplot2 and that the hjust and vjust options no longer work. Does anyone have any suggestions to make this plot look better, in particular to make the factor levels readable.
#libraries
# install.packages('stringi')
library(ggplot2)
library(stringi)
#fake data
var<-stri_rand_strings(81, 4, pattern='[HrhEgeIdiFtf]')
var1<-rnorm(81, mean=175, sd=75)
#data frame
out<-data.frame(var, var1)
#set levels for plotting
out$var<-factor(out$var, levels=out$var[order(out$var1, decreasing=FALSE)])
#PLot
out.plot<-out %>%
ggplot(., aes(x=var, y=var1))+geom_point()+coord_flip()
#Add staggered axis option
out.plot+theme(axis.text.y = element_text(hjust = grid::unit(c(-2, 0, 2), "points")))
To stagger the labels, you could add spaces to the labels in the dataframe.
# Libraries
library(ggplot2)
library(stringi)
# fake data
set.seed(12345)
var <- stri_rand_strings(81, 4, pattern = '[HrhEgeIdiFtf]')
var1 <- rnorm(81, mean = 175, sd = 75)
out <- data.frame(var, var1)
# Add spacing, and set levels for plotting
out = out[order(out$var1), ]
out$var = paste0(out$var, c("", " ", " "))
out$var <- factor(out$var, levels = out$var[order(out$var1, decreasing = FALSE)])
# Plot
out.plot <- ggplot(out, aes(x = var, y = var1)) +
geom_point() + coord_flip()
out.plot
Alternatively, draw the original plot, then edit. Here, I use the grid function, editGrob() to do the editing.
# Libraries
library(ggplot2)
library(gtable)
library(grid)
library(stringi)
# fake data
set.seed(12345)
var <- stri_rand_strings(81, 4, pattern = '[HrhEgeIdiFtf]')
var1 <- rnorm(81, mean = 175, sd = 75)
out <- data.frame(var, var1)
# Set levels for plotting
out$var <- factor(out$var, levels = out$var[order(out$var1, decreasing = FALSE)])
# Plot
out.plot <- ggplot(out, aes(x = var, y = var1)) +
geom_point() + coord_flip()
# Get the ggplot grob
g = ggplotGrob(out.plot)
# Get a hierarchical list of component grobs
grid.ls(grid.force(g))
Look through the list to find the section referring to the left axis. The relevant bit is:
axis-l.6-3-6-3
axis.line.y..zeroGrob.232
axis
axis.1-1-1-1
GRID.text.229
axis.1-2-1-2
You will need to set up path from 'axis-l', through 'axis', through 'axis', though to 'GRID.text'.
# make the relevant column a little wider
g$widths[3] = unit(2.5, "cm")
# The edit
g = editGrob(grid.force(g),
gPath("axis-l", "axis", "axis", "GRID.text"),
x = unit(c(-1, 0, 1), "npc"),
grep = TRUE)
# Draw the plot
grid.newpage()
grid.draw(g)
Another option is to find your way through the structure to the relevant grob to make the edit.
# Get the grob
g <- ggplotGrob(out.plot)
# Get the y axis
index <- which(g$layout$name == "axis-l") # Which grob
yaxis <- g$grobs[[index]]
# Get the ticks (labels and marks)
ticks <- yaxis$children[[2]]
# Get the labels
ticksL <- ticks$grobs[[1]]
# Make the edit
ticksL$children[[1]]$x <- rep(unit.c(unit(c(1,0,-1),"npc")), 27)
# Put the edited labels back into the plot
ticks$grobs[[1]] <- ticksL
yaxis$children[[2]] <- ticks
g$grobs[[index]] <- yaxis
# Make the relevant column a little wider
g$widths[3] <- unit(2.5, "cm")
# Draw the plot
grid.newpage()
grid.draw(g)
Sandy mentions adding spaces to the labels.
With a discrete axis, you can also simply add line breaks to alternate cases. In my case I wanted to stagger alternate ones:
scale_x_discrete(labels=paste0(c("","\n"),net_change$TZ_t)
Where net_change$TZ_t is my ordered factor. It extends to 'triple' levels easily with c("","\n","\n\n").
I'm trying to plot distribution of species between 2 different habitat types (hab 1 and hab 2). Some of my species secondarily use some habitats, so I have a separate column for secondary hab1 (hab1.sec). To visualise their distribution across the two habitats and different depths, I am using a facet_grid between hab1 and hab2. Example code as below:
# example code
set.seed(101)
ID <- seq(1,20, by=1) ## ID for plotting
species <- sample(letters, size=20) ## arbitrary species
## different habitat types in hab.1
hab1 <- c("coastal","shelf","slope","open.ocean","seamount")
hab1.pri <- sample(hab1, size = 20, replace = T)
## secondarily used habitats, may not be present for some species
hab.sec <- c("coastal","shelf","slope","open.ocean","seamount", NA)
hab1.sec <- sample(hab.sec, size = 20, replace = T)
## habitat types for hab.2
hab2 <- c("epipelagic","benthopelagic","epibenthic","benthic")
hab.2 <- sample(hab2, size = 20, replace = T)
## arbitrary depth values
dep.min <- sample(seq(0,1000), size = 20, replace = T)
dep.max <- sample(seq(40, 1500), size = 20, replace = T)
# make data frame
dat <- data.frame(ID, species, hab1.pri, hab1.sec, hab.2,dep.min, dep.max)
# ggplot with facet grid
p <- ggplot(data=dat)+ geom_segment(aes(x=as.factor(ID),xend=as.factor(ID),y=dep.min, yend=dep.max),size=2,data = dat)+ scale_y_reverse(breaks = c(0, 200, 1000,1500))+facet_grid(hab.2~hab1.pri, scales = "free" ,space = "free")+theme_bw()
I would like to add segments for hab1.sec within the existing facet grid. I have tried this code:
p+ geom_segment(aes(x=as.factor(ID),xend=as.factor(ID),y=dep.min, yend=dep.max),linetype=2,data = dat)+facet_wrap(~hab1.sec)
But doing this produces a new graph.
Is there a better way to add those extra lines to the existing grid (preferably as dashed lines)?
I'd be really grateful for any help with this!
Thanks a lot, in advance!
What about combining the primary and secondary habitats into one variable and mapping that variable to an aesthetic?
Note I'm using tidyr and dplyr tools here because they help a lot in cases like this.
library(dplyr)
library(tidyr)
dat %>%
gather(hab1, value, -ID, -species, -(hab.2:dep.max)) %>%
ggplot()+
geom_segment(aes(x=as.factor(ID),xend=as.factor(ID),y=dep.min, yend=dep.max, linetype=hab1),size=2) +
scale_y_reverse(breaks = c(0, 200, 1000,1500))+
facet_grid(hab.2~value, scales = "free" ,space = "free")+
theme_bw()
Using ggplot2, I want to create a histogram where anything above X is grouped into the final bin. For example, if most of my distribution was between 100 and 200, and I wanted to bin by 10, I would want anything above 200 to be binned in "200+".
# create some fake data
id <- sample(1:100000, 10000, rep=T)
visits <- sample(1:1200,10000, rep=T)
#merge to create a dataframe
df <- data.frame(cbind(id,visits))
#plot the data
hist <- ggplot(df, aes(x=visits)) + geom_histogram(binwidth=50)
How can I limit the X axis, while still representing the data I want limit?
Perhaps you're looking for the breaks argument for geom_histogram:
# create some fake data
id <- sample(1:100000, 10000, rep=T)
visits <- sample(1:1200,10000, rep=T)
#merge to create a dataframe
df <- data.frame(cbind(id,visits))
#plot the data
require(ggplot2)
ggplot(df, aes(x=visits)) +
geom_histogram(breaks=c(seq(0, 200, by=10), max(visits)), position = "identity") +
coord_cartesian(xlim=c(0,210))
This would look like this (with the caveats that the fake data looks pretty bad here and the axis need to be adjusted as well to match the breaks):
Edit:
Maybe someone else can weigh in here:
# create breaks and labels
brks <- c(seq(0, 200, by=10), max(visits))
lbls <- c(as.character(seq(0, 190, by=10)), "200+", "")
# true
length(brks)==length(lbls)
# hmmm
ggplot(df, aes(x=visits)) +
geom_histogram(breaks=brks, position = "identity") +
coord_cartesian(xlim=c(0,220)) +
scale_x_continuous(labels=lbls)
The plot errors with:
Error in scale_labels.continuous(scale) :
Breaks and labels are different lengths
Which looks like this but that was fixed 8 months ago.
If you want to fudge it a little to get around the issues of bin labelling then just subset your data and create the binned values in a new sacrificial data-frame:
id <- sample(1:100000, 10000, rep=T)
visits <- sample(1:1200,10000, rep=T)
#merge to create a dataframe
df <- data.frame(cbind(id,visits))
#create sacrificical data frame
dfsac <- df
dfsac$visits[dfsac$visits > 200 ] <- 200
Then use the breaks command in scale_x_continuous to define your bin labels easily:
ggplot(data=dfsac, aes(dfsac$visits)) +
geom_histogram(breaks=c(seq(0, 200, by=10)),
col="black",
fill="red") +
labs(x="Visits", y="Count")+
scale_x_continuous(limits=c(0, 200), breaks=c(seq(0, 200, by=10)), labels=c(seq(0,190, by=10), "200+"))