Create heatmap with values from matrix in ggplot2 - r

I've seen heatmaps with values made in various R graphics systems including lattice and base like this:
I tend to use ggplot2 a bit and would like to be able to make a heatmap with the corresponding cell values plotted. Here's the heat map and an attempt using geom_text:
# Attach both packages explicitly. The second positional argument of
# library() is `help`, not another package, so the original
# `library(reshape2, ggplot2)` attached reshape2 only.
library(reshape2)
library(ggplot2)

# 10 x 10 matrix of random values to draw as a heatmap.
dat <- matrix(rnorm(100, 3, 1), ncol = 10)
# NOTE: `names<-` on a matrix sets element names, not column names, so
# melt() below still produces integer Var1/Var2 indices.
names(dat) <- paste("X", 1:10)
# Long format: one row per cell with columns Var1 (row), Var2 (col), value.
dat2 <- melt(dat, id.var = "X1")
p1 <- ggplot(dat2, aes(as.factor(Var1), Var2, group = Var2)) +
  geom_tile(aes(fill = value)) +
  scale_fill_gradient(low = "white", high = "red")
p1
#attempt
# Kept as asked in the question: `dat[, -2]` drops a column, so `labs` has
# 90 entries while dat2 has 100 rows.
labs <- c(apply(round(dat[, -2], 1), 2, as.character))
p1 + geom_text(aes(label=labs), size=1)
Normally I can figure out the x and y values to pass but I don't know in this case since this info isn't stored in the data set. How can I place the text on the heatmap?

Key is to add a row identifier to the data and shape it "longer".
edit Dec 2022 to make code reproducible with R 4.2.2 / ggplot2 3.4.0 and reflect changes in tidyverse semantics
library(ggplot2)
library(tidyverse)

# 10 x 10 matrix of random values to draw as a heatmap.
dat <- matrix(rnorm(100, 3, 1), ncol = 10)
## the matrix needs column names: `names<-` on a matrix does not set them,
## which made as_tibble() fall back to compatibility name repair (V1..V10)
## with a warning. Setting colnames explicitly gives the same names silently.
colnames(dat) <- paste0("V", seq_len(ncol(dat)))
## convert to tibble, add row identifier, and shape "long"
dat2 <-
  dat %>%
  as_tibble() %>%
  rownames_to_column("Var1") %>%
  pivot_longer(-Var1, names_to = "Var2", values_to = "value") %>%
  mutate(
    # Fix the level order numerically so "10" does not sort before "2".
    Var1 = factor(Var1, levels = seq_len(nrow(dat))),
    Var2 = factor(gsub("V", "", Var2), levels = seq_len(ncol(dat)))
  )
# One tile per cell, coloured by value and labelled with the rounded value.
ggplot(dat2, aes(Var1, Var2)) +
  geom_tile(aes(fill = value)) +
  geom_text(aes(label = round(value, 1))) +
  scale_fill_gradient(low = "white", high = "red")
Created on 2022-12-31 with reprex v2.0.2

There is another simpler way to make heatmaps with values. You can use pheatmap to do this.
# 10 x 10 matrix of random values to draw as a heatmap.
dat <- matrix(rnorm(100, 3, 1), ncol = 10)
# Column labels must be set with colnames(); `names<-` on a matrix only
# names individual elements and leaves the columns unlabelled.
colnames(dat) <- paste0("X", seq_len(ncol(dat)))
# Install pheatmap only when it is missing instead of on every run.
if (!requireNamespace("pheatmap", quietly = TRUE)) {
  install.packages("pheatmap")
}
library(pheatmap)
# display_numbers = TRUE prints each cell's value inside the tile.
pheatmap(dat, display_numbers = TRUE)
This will give you a plot like this
If you want to remove clustering and use your color scheme you can do
# Same heatmap without row/column clustering and with a white-to-red palette.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
pheatmap(
  dat,
  display_numbers = TRUE,
  color = colorRampPalette(c("white", "red"))(100),
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  fontsize_number = 15
)
You can also change the fontsize, format, and color of the displayed numbers.

Related

How group dataset in a boxplot?

I have been trying to figure out how to group 9 datasets into 3 different groups (1, 2, and 3).
I have 3 different data frames that look like this:
ID1 ID2 dN dS Omega Label_ID1 Label_ID2 Group
QJY77946 NP_073551 0.0293 0.0757 0.3872 229E-CoV 229E-CoV Intra
QJY77954 NP_073551 0.0273 0.0745 0.3668 229E-CoV 229E-CoV Intra
...
So, the only columns that I´m interested in are three: dN, dS, and Omega.
My main goal is to take these three columns from my data frames and plot them in a boxplot using RStudio.
To do that, first I take the 3 columns of each data frame with these lines:
# Pull the three columns of interest out of df_1 as plain vectors.
# NOTE(review): the boxplot call later in the question uses dN_S, dS_S, ...,
# which are presumably created the same way from the other data frames.
dN_1 <- df_1$dN
dS_1 <- df_1$dS
Omega_1 <- df_1$Omega
Then, to generate the plot I use this line (option 1):
# Draw the nine series side by side in one base-graphics boxplot.
# The vectors are collected in an unnamed list, so boxes are labelled 1..9.
series <- list(
  dN_S, dS_S, Omega_S,
  dN_M, dS_M, Omega_M,
  dN_E, dS_E, Omega_E
)
boxplot(
  series,
  col = "red",
  main = "Test",
  xlab = "Frames",
  ylab = "Distribution"
)
My goal is to group these 9 boxes into 3 separate groups:
I know that using ggplot2 could be easier, so my option 2 is to use these lines (option 2):
# Single notched boxplot of one series.
# NOTE(review): `dN_S` must be resolvable either as a column of df_1 or as a
# vector in the calling environment — confirm against the real data.
df_1 %>%
  ggplot(aes(y = dN_S)) +
  geom_boxplot(
    color = "blue",
    fill = "blue",
    alpha = 0.2,
    notch = TRUE, # spelled out: `T` is a reassignable variable
    notchwidth = 0.8
  )
However, you can see that I couldn´t find a way to plot all groups in the same plot.
So how can I group my data in the boxplot using option 1 or option 2? The second option is less developed, but perhaps someone could help with that too.
library(dplyr)
library(purrr)
library(tidyr)
library(ggplot2)

# Three simulated data frames standing in for the S, M and E data sets.
set.seed(123)
df_s <- data.frame(
  dN = runif(20),
  dS = runif(20),
  Omega = runif(20)
)
df_m <- data.frame(
  dN = runif(20),
  dS = runif(20),
  Omega = runif(20)
)
df_e <- data.frame(
  dN = runif(20),
  dS = runif(20),
  Omega = runif(20)
)

# Stack the data frames, recording the source in column `df`, then reshape
# to long format (columns `name` and `value`). bind_rows() accepts a named
# list directly, so the superseded map_dfr() step is not needed.
df <-
  list(S = df_s, M = df_m, E = df_e) %>%
  bind_rows(.id = "df") %>%
  pivot_longer(-df)

# One panel per source data frame, one box per measure.
ggplot(df) +
  geom_boxplot(aes(x = name, y = value)) +
  facet_wrap(~df, nrow = 1)
Created on 2021-09-24 by the reprex package (v2.0.0)
One way to accomplish this is by providing ggplot() another aesthetic, like fill. Here's a small reproducible example:
library(tidyverse)

# Toy data: four categories cycling over 20 rows, the first ten rows measured
# "before" and the last ten "after".
categories <- rep(letters[1:4], times = 5)
periods <- rep(c("before", "after"), each = 10)
df <- tibble(category = categories, time = periods, num = rnorm(20))

# Mapping `time` to fill draws the before/after boxes side by side
# within each category.
ggplot(df) +
  geom_boxplot(aes(x = category, y = num, fill = time))
Let me know if you're looking for something else.

How do you compare similarities between variables in an R data frame, based on two categorical variables and one numeric variables

I have a dataframe with three variables of interest: LGA(Location), Offence Category and Total (numeric)
What I am hoping to do, is compare the distance/similarity between each LGA, based on the Total value, in order to create a heat map or similar structure. Is this possible? And if so, what would the process be?
Here is a snippet of the data frame:
I don't really understand your question, but here is an example of a heatmap and a clustered heatmap for 'similar' data:
# Heatmap of counts by premises type (rows) and the remaining columns of the
# sheet, with the cell values printed on the tiles.
# Load libraries
library(tidyverse)
library(readxl)
library(httr)
# Find some data
url1 <- "https://www.bocsar.nsw.gov.au/Documents/lga/NewSouthWales.xlsx"
# Get the data and remove missing data points (NA's)
# (the workbook is downloaded to a temporary .xlsx file; sheet 2 is read,
# skipping the first 5 rows)
GET(url1, write_disk(tf <- tempfile(fileext = ".xlsx")))
df <- read_excel(path = tf, 2L, skip = 5) %>%
na.omit()
df2 <- df %>%
# format the data to "long format" for plotting
pivot_longer(cols = -c(`Premises type`)) %>%
# Change "Premises type" and "name" to factors
# (levels follow order of first appearance rather than alphabetical order)
mutate(`Premises type` = factor(
`Premises type`, levels = unique(`Premises type`))
) %>%
mutate(name = factor(
name, levels = unique(name))
) %>%
# Remove the "Total" counts
filter(`Premises type` != "Total")
# Define colours for text (white for dark fill, black for light fill)
# NOTE(review): label_col is computed from the inferno palette sampled at
# length(df2$value) points, in palette order, and is later matched to the
# levels of factor(value) by scale_color_manual(); this presumably relies on
# one palette entry per sorted value — confirm behaviour when values repeat.
hcl <- farver::decode_colour(viridisLite::inferno(length(df2$value)), "rgb", "hcl")
label_col <- ifelse(hcl[, "l"] > 50, "black", "white")
# Plot the data (log scale for fill)
# (fct_rev() reverses the factor levels of the y variable)
ggplot(df2, aes(y = fct_rev(`Premises type`),
x = name, fill = log(value))) +
geom_tile() +
geom_text(aes(label = value, color = factor(value)),
show.legend = FALSE, size = 2.5) +
theme(axis.text.x = element_text(angle = 45, hjust = 1.05),
axis.title = element_blank()) +
scale_color_manual(values = label_col) +
scale_fill_viridis_c(option = "inferno", na.value = "black")
And a clustered heatmap (similar Premises Type / Crime types cluster together):
# Re-read the downloaded sheet, drop incomplete rows, and move the
# "Premises type" column into the row names (pheatmap expects a matrix).
complete_rows <- na.omit(read_excel(path = tf, 2L, skip = 5))
dm <- column_to_rownames(complete_rows, var = "Premises type")
# Clustered heatmap, scaled within each row.
pheatmap::pheatmap(as.matrix(dm), scale = "row")
Edit
I haven't used it before, so I don't know if the output is correct, but based on this SO post you can use cluster::daisy() to get the gower dissimilarity for "Premises Type" then plot using pheatmap, e.g.
library(cluster)
# Pairwise dissimilarities between the rows of dm, as a square matrix.
dissimilarity <- as.matrix(daisy(dm))
pheatmap::pheatmap(dissimilarity)
Edit 2
You only need two variables for this heatmap (i.e. "Local government Area" (Character) and "Total" (Numeric) should be fine):
# Load libraries
library(tidyverse)
library(readxl)
library(httr)
library(cluster)
library(pheatmap)

# Find some data
url1 <- "https://www.bocsar.nsw.gov.au/Documents/lga/NewSouthWales.xlsx"

# Download the workbook to a temporary file, read sheet 2 (skipping the
# first 5 rows) and remove rows with missing data points (NA's)
tf <- tempfile(fileext = ".xlsx")
GET(url1, write_disk(tf))
df <- na.omit(read_excel(path = tf, 2L, skip = 5))

# Keep two variables, then use the Premises type as the row names
# (in your case, use column_to_rownames(var = "Local government Area"))
two_vars <- select(df, `Premises type`, Robbery)
df3 <- column_to_rownames(two_vars, var = "Premises type")

# Then plot the heatmap of pairwise dissimilarities, labelling both axes
# with the row names
axis_labels <- rownames(df3)
pheatmap(
  daisy(as.matrix(df3)),
  labels_row = axis_labels,
  labels_col = axis_labels
)

ggplot showing names of selected ids

I want to plot a scatterplot using ggplot instead of plot and
produce this scatterplot where some IDs have different colors
and labeled:
Some asked for the dput of the data, so I added the dput in the end.
The problem with adding the dput is that the question won't go through because
it shows too much coding compared to the question content, so that's
why I avoid it, and the head of dput is not helpful in reproducing the
real data.
I want to be able to plot all the values then show the name
for a selected IDs, not all of them.
This is what I tried:
# Question's attempt: a base-graphics scatterplot followed by a ggplot2
# version that should label only the genes listed in a_select.
library(ggplot2)
library(ggrepel)
# Tab-separated table read straight from GitHub.
fig5ctrial<-read.csv(url("https://raw.githubusercontent.com/learnseq/learning/main/fig5mintab.txt"),sep = '\t',header = TRUE)
# Gene IDs that should be highlighted and labelled.
a_select<- c("RPL31", "HSPB1", "MAFB", "ALPL1", "VGF","PCSK1N", "BSG", "CALY", "B2M", "SCG5", "TM4SF4")
# Rows of the table matching the selected IDs (in reverse order of a_select);
# IDs with no match yield NA rows.
selalpha <- fig5ctrial[match(rev(a_select), fig5ctrial$geneIDs), ]
alphanames<-fig5ctrial$geneIDs
# NOTE(review): attach() puts the columns on the search path so the bare
# names below resolve; passing `data =` explicitly is usually safer.
attach(fig5ctrial)
# Base-graphics version: labels every point, not just the selected IDs.
z1 <-plot(a.donor, a_cells, main="Scatterplot Example", xlab="Spearsman p all cells ", ylab="Spearsman p alpha cells ", pch=19)
text(a_cells~a.donor, labels=alphanames,data=fig5ctrial, font=2)
# ggplot2 version.
# NOTE(review): the data frame is passed as the first positional argument of
# each geom_point(), which is presumably `mapping`, not `data` — confirm;
# `factor(selalpha)` is applied to a whole data frame rather than a column.
z01 <-
#Plot
ggplot()+
#assign alpha cell
geom_point(fig5ctrial,aes(a.donor, a_cells))+
#assign all cells
geom_point(fig5ctrial,aes(all_donors, all.cells))+
#assign IDs of interest
geom_point(fig5ctrial,aes(all_donors, all.cells, color = factor(selalpha)))+
#Add labels
# (only rows whose geneIDs are in a_select are passed to the label layer)
geom_text_repel(data=subset(fig5ctrial,
geneIDs %in% a_select),
aes(label=geneIDs),show.legend = F)
The solution from @r2evans is tremendously complete (upvoted for the very well explained details) and better than this. I used the same ggrepel strategy, but with a simulated variable over your data:
library(ggplot2)
library(ggrepel)
#Data
fig5ctrial <- read.csv(
  "https://raw.githubusercontent.com/learnseq/learning/main/alphacell.csv",
  stringsAsFactors = FALSE
)
# Gene IDs to label; defined here so the snippet runs on its own.
a_select <- c(
  "RPL31", "HSPB1", "MAFB", "ALPL1", "VGF", "PCSK1N",
  "BSG", "CALY", "B2M", "SCG5", "TM4SF4"
)
#Group
# Simulated two-level grouping variable (random, so it differs between runs).
fig5ctrial$allcellstypes <- sample(1:2, nrow(fig5ctrial), replace = TRUE)
# Strip the single quotes that surround the gene IDs in the file.
fig5ctrial$geneIDs <- trimws(fig5ctrial$geneIDs, whitespace = "'")
#Plot
ggplot(fig5ctrial, aes(X...donor., a.cells, color = factor(allcellstypes))) +
  geom_point() +
  #Add labels
  # Only the selected genes are passed to the label layer.
  geom_text_repel(
    data = subset(fig5ctrial, geneIDs %in% a_select),
    aes(label = geneIDs),
    show.legend = FALSE
  ) +
  labs(color = "allcellstypes")
Output:
Update: After playing with data from OP, here a possible sketch to solve the issue (Remember in the last attempt we merged all data):
library(xlsx)
library(ggplot2)
library(ggrepel)

# Genes to highlight and label.
ids <- c(
  "RPL31", "HSPB1", "MAFB", "ALPL1", "VGF", "PCSK1N",
  "BSG", "CALY", "B2M", "SCG5", "TM4SF4"
)

# First sheet of the workbook, columns 1 to 4 only.
fig5cwithoutdesc <- read.xlsx("fig5cwithoutdesc.xlsx", 1, colIndex = 1:4)

# Clean gene IDs: collapse runs of punctuation/spaces into one space,
# then trim the ends.
collapsed_ids <- gsub("[[:punct:] ]+", " ", fig5cwithoutdesc$geneIDs)
fig5cwithoutdesc$geneIDs <- trimws(collapsed_ids)

# Rows that receive a text label.
labelled_rows <- subset(fig5cwithoutdesc, geneIDs %in% ids)

# Grey points for ordinary genes, blue for the selected ones; legend hidden.
ggplot(fig5cwithoutdesc, aes(a_donor, a_cell, color = (geneIDs %in% ids))) +
  geom_point() +
  scale_color_manual(values = c("gray", "blue")) +
  geom_text_repel(data = labelled_rows, aes(label = geneIDs), force = 19) +
  theme_bw() +
  theme(legend.position = "none")
Output:
Here's a sample that uses a little dplyr (not essential) and ggrepel (essential).
Sample data:
# Reproducible toy data: 676 two-letter ids ("aa", "ba", ...), a uniform x,
# and y = x plus normal noise (mean 0.2, sd 0.2).
set.seed(42)
n_points <- 26 * 26
two_letter_ids <- c(outer(letters, letters, paste0))
dat <- data.frame(id = two_letter_ids, x = runif(n_points))
dat$y <- dat$x + rnorm(n_points, 0.2, 0.2)
head(dat, 3)
#   id         x         y
# 1 aa 0.9148060 0.9611270
# 2 ba 0.9370754 1.0316538
# 3 ca 0.2861395 0.4818541
Code for the plot:
library(ggplot2)
library(ggrepel)
library(dplyr)
# dots we want to highlight
interesting <- c("mg", "qx", "zz")
dat %>%
  # Blank out the label of every uninteresting point so that
  # geom_text_repel() only draws text for the highlighted ones.
  mutate(id = replace(id, !id %in% interesting, "")) %>%
  ggplot(., aes(x, y)) +
  geom_point(aes(color = (id %in% interesting))) +
  # guide = "none" hides the legend; `guide = FALSE` is deprecated in
  # current ggplot2 releases.
  scale_color_manual(
    guide = "none",
    values = c("FALSE" = "black", "TRUE" = "red")
  ) +
  ggrepel::geom_text_repel(
    aes(label = id),
    color = "red",
    nudge_x = 0.5, direction = "x", hjust = 0
  )
Notes:
dplyr can easily be removed here, replaced with transform and perhaps a temp-variable;
the id=replace(...) portion is to remove the label (id) for any uninteresting variables, so that geom_text_repel will only label the interesting ones;
there are other techniques for highlighting specific dots, including adding another geom_point(..., data=~subset(., id %in% interesting)), but that adds more points ... and in some graphic formats (pdf, svg) this produces extra objects and therefore might have unintended consequences. Coloring the points in this way will be more difficult if you are already using aes(color=.) elsewhere.

How to plot a(n unknown) number of data series as geom_line in same chart

My first Q here, so please go lightly if I'm out of step anywhere.
I'm trying to code R to produce a single chart to contain a number of data series lines. The number of data series may vary but will be provided in the data frame. I have tried to rearrange another thread's content to print the geom_line , but not successfully.
The logic is:
# NOTE: the first fragment is the question's pseudo-code, not runnable R:
# the parentheses are unbalanced and `+ geom_line(...)` is used on its own
# instead of being added to a stored plot object.
#desire to replace loop of 1:5 with ncol(df)
print(ggplot(df,aes(x=time))
for (i in 1:5) {
print (+ geom_line(aes(y=df[,i]))
}
# Builds and prints a separate point chart for each of columns 1 to 5.
#functioning geom point loops ggplot production:
for (i in 1:5) {
print(ggplot(df,aes(x=time,y=df[,i]))+geom_point())
}
# One chart with two line layers, each column named explicitly.
#functioning multi-line ggplot where n is explicit:
ggplot(data=df, aes(x=time), group=1) +
geom_line(aes(y=df$`3`))+
geom_line(aes(y=df$`4`))
The functioning example code produces n number of point charts, 5 in this case. I would like just one chart to contain n line series.
This may be similar to How to plot n dimensional matrix? for which there are currently no relevant answers
Any contributions much appreciated, thanks
You can use gather from tidyverse "world" to do that.
As you didn't supply sample data, I used mtcars.
I created two data.frames one with 3 columns one with 9. In each one of them I plotted all of the variables against the variable mpg.
library(tidyverse)

# Two example data frames: mpg plus 3 other columns, and mpg plus 9.
df3Columns <- mtcars[, 1:4]
df9Columns <- mtcars[, 1:10]

# Reshape every column except mpg into long format (`var`, `value`) and draw
# one line per original column. pivot_longer() replaces the superseded
# gather().
df3Columns %>%
  pivot_longer(-mpg, names_to = "var", values_to = "value") %>%
  ggplot(aes(mpg, value, group = var, color = var)) +
  geom_line()

# The same code works unchanged for any number of columns.
df9Columns %>%
  pivot_longer(-mpg, names_to = "var", values_to = "value") %>%
  ggplot(aes(mpg, value, group = var, color = var)) +
  geom_line()
Edit - using the sample data in comments.
library(tidyverse)
# Move the row names into a `time` column, reshape all other columns to long
# format (`var`, `value`), and draw one line per original column.
# pivot_longer() replaces the superseded gather().
df %>%
  rownames_to_column("time") %>%
  pivot_longer(-time, names_to = "var", values_to = "value") %>%
  ggplot(aes(time, value, group = var, color = var)) +
  geom_line()
Sample data:
# Three numeric series (columns named by number) observed at rows 3 to 5.
df <- structure(
  list(
    "39083" = c(96, 100, 100),
    "39090" = c(99, 100, 100),
    "39097" = c(99, 100, 100)
  ),
  row.names = 3:5,
  class = "data.frame"
)
To strictly answer your question, you can simply store your ggplot in a variable and add the geom_line one by one:
# Three numeric series (columns named by number) observed at rows 3 to 5.
df <- structure(
  list(
    "39083" = c(96, 100, 100),
    "39090" = c(99, 100, 100),
    "39097" = c(99, 100, 100)
  ),
  row.names = 3:5,
  class = "data.frame"
)
# seq_len() stays correct for a zero-row data frame, unlike 1:nrow(df).
g <- ggplot(df, aes(x = seq_len(nrow(df))))
# Add one line layer per column; y is supplied as a fixed vector per layer.
for (i in colnames(df)) {
  g <- g + geom_line(y = df[, i])
}
# y is not a mapped aesthetic here, so the axis limits are set by hand.
g <- g + scale_y_continuous(limits = c(min(df), max(df)))
print(g)
However, this is not a very convenient solution. I would highly recommend to refactor your data frame to be more ggplot style.
# Build the long-format data frame (time, value, group) with one piece per
# column, then bind all pieces once instead of growing the result with
# rbind() inside a loop.
series_pieces <- lapply(colnames(df), function(col) {
  data.frame(time = seq_len(nrow(df)), value = df[, col], group = col)
})
df.ultimate <- do.call(rbind, series_pieces)
# One coloured line per original column.
g <- ggplot(df.ultimate, aes(x = time, y = value, color = group))
g <- g + geom_line()
print(g)
A one-line solution:
# Long format built in place: as.matrix() flattens column by column, so
# `time` cycles over the rows while `group` repeats each column name.
# seq_len() stays correct for a zero-row data frame, unlike 1:nrow(df).
ggplot(
  data.frame(
    time = rep(seq_len(nrow(df)), ncol(df)),
    value = as.vector(as.matrix(df)),
    group = rep(colnames(df), each = nrow(df))
  ),
  aes(x = time, y = value, color = group)
) +
  geom_line()

Plot NA counts in a histogram

I have a question related to the histograms in R using ggplot2. I have been working trying to represent some values in a histogram from two different variables. After trying and looking for some solutions in Stackoverflow I got it but...does somebody know how to print NAs count as a new column just to compare the missings in the two variables?
Here is the R code:
# Names of the two columns of datosMedicos to compare.
i <- "ADL_1_bathing"
j <- "ADL_1_T2_bathing"
# Copy each column into its own data frame under the common name "datos"
# and tag it with the time point it belongs to.
t1 <- data.frame(datosMedicos[, i])
colnames(t1) <- "datos"
t2 <- data.frame(datosMedicos[, j])
colnames(t2) <- "datos"
t1$time <- "t1"
t2$time <- "t2"
# Stack both time points for plotting.
juntarParaGrafico <- rbind(t1, t2)
# Dodged histogram of both time points.
# after_stat(count) replaces the deprecated `..count..` notation and
# FALSE replaces the reassignable `F`.
ggplot(juntarParaGrafico, aes(datos, fill = time)) +
  geom_histogram(
    aes(y = after_stat(count)),
    col = "darkblue",
    alpha = 0.5,
    binwidth = 0.2,
    position = "dodge",
    na.rm = FALSE
  ) +
  theme(legend.justification = c(1, 1), legend.position = c(1, 1)) +
  labs(title = paste0("Distribution of ", i), x = i, y = "Count")
And this is the output:
Image about the two variables values but without the missing bars:
You could try to summarise the number of NAs before plotting. How about this?
library(ggplot2)
library(dplyr)

# Two toy variables of 20 values each, with 5 and 3 values set to NA.
df1 <- data.frame(a = rnorm(20))
df1[sample(nrow(df1), 5), ] <- NA
df2 <- data.frame(a = rnorm(20))
df2[sample(nrow(df2), 3), ] <- NA
df2$time <- "t2"
df1$time <- "t1"
df <- rbind(df1, df2)

# Count the missing values per time point once and reuse the result.
histogramDF <- df %>%
  group_by(time) %>%
  summarise(numNAs = sum(is.na(a)))
histogramDF

# The counts are already computed, so draw them as bars with geom_col();
# qplot() with `stat = "identity"` is deprecated in current ggplot2.
ggplot(histogramDF, aes(x = time, y = numNAs, fill = time)) +
  geom_col()

Resources