heatmap-like plot, but for categorical variables - r

I have three factors (set1, set2, and set3) for each of about 50 individuals. The values for set1, set2, and set3 are "A","B","C". I'd like to make a heatmap-like plot of these data but have the legend show the color associated with the values (eg., A='red', B='blue', C='black'). Any suggestions?
Thanks.

I decided it would be easist to approach this with ggplot2 (for me anyway):
#recreate a data set
dat <- data.frame(person=factor(paste0("id#", 1:50),
levels =rev(paste0("id#", 1:50))), matrix(sample(LETTERS[1:3], 150, T), ncol = 3))
library(ggplot2); library(reshape2)
dat3 <- melt(dat, id.var = 'person')
ggplot(dat3, aes(variable, person)) + geom_tile(aes(fill = value),
colour = "white") + scale_fill_manual(values=c("red", "blue", "black"))

A similar plot can also be made with base graphics. Here is one method using the base image function. This sample has a categorical response rather than a numeric one.
dx <- data.frame( Tasks = c('1','2','3','4'),
Phase1 = c('Done','Done','Done','WIP'),
Phase2 = c('WIP','Done','Done',''),
Phase3 = c('','WIP','Done',''))
ff<-factor(as.matrix(dx[,2:4]),
levels=c("Done","WIP",""),
labels=c("done","wip","-empty-")
)
fx<-matrix(as.numeric(ff), ncol=ncol(dx)-1)
#use labels to assign colors
col<-c(done="darkgreen",wip="orange","-empty-"="black")
imgflip<-function(x) {t(x[nrow(x):1,])}
image(imgflip(fx),
breaks=(1:(nlevels(ff)+1))-.5,
col=col[levels(ff)],
xaxt="n", yaxt="n"
)
axis(2, at=seq(0,1,length.out=nrow(dx)), labels=rev(paste("Task",dx$Tasks)), las=2)
axis(3, at=seq(0,1,length.out=length(names(dx))-1), labels=names(dx)[-1])
which will produce this picture.

Related

R colour code plot by rownames for principal component analysis

I am attempting to complete a principal component analysis on a set of data containing columns of numeric data.
Assuming a dataset like this (in reality I have a pre configured data frame, this one if for reproducibility):
v1 <- c(1,2,3,4,5,6,7)
v2 <- c(3,6,2,5,2,4,9)
v3 <- c(6,1,4,2,3,7,5)
dataset <-data.frame(v1,v2,v3)
row.names(dataset) <-c('New York', 'Seattle', 'Washington DC', 'Dallas', 'Chicago','Los Angeles','Minneapolis')
I have ran my principal component analysis, and successfully plotted it:
pca=prcomp(dataset,scale=TRUE)
plot(pca$x[,1], pca$x[,2],
xlab="First PC",ylab="Second PC")
text(pca$x[,1], pca$x[,2],cex=0.7,pos=3,col="darkgrey")
What I want to do however is colour code my data points based on the city, which is the row names of my dataset. I also want to use these cities (i.e. rownames) as labels.
I've tried the following, but neither have worked:
## attempt 1 - I get row labels, but no chart
plot(pca$x[,1], pca$x[,2],col=rownames(dataset),pch=rownames(dataset),
xlab="First PC",ylab="Second PC")
text(pca$x[,1], pca$x[,2],labels=rownames(dataset),cex=0.7,pos=3,col="darkgrey")
## attempt 2
datasetwithcity = rownames_to_column(dataset, var = "city")
head(datasetwithcity)
OnlyCities=datasetwithcity[,1]
OnlyCities
# this didn't work:
City_Labels=as.numeric(OnlyCities)
head(City_Labels)
# gets city labels, but loses points and no colour
plot(pca$x[,1], pca$x[,2],col=City_Labels,pch=City_Labels,
xlab="First PC",ylab="Second PC")
text(pca$x[,1], pca$x[,2],labels=rownames(dataset),
cex=0.7,pos=3,col="darkgrey")
There are many different ways to do this.
In base R, you could do:
plot(pca$x[,1], pca$x[,2],
xlab="First PC",ylab="Second PC", col = seq(nrow(pca$x)),
xlim = c(-2.5, 2.5), ylim = c(-2, 2))
text(pca$x[,1], pca$x[,2],cex=0.7,pos=3,col="darkgrey")
text(x = pca$x[,1], y = pca$x[,2], labels = rownames(pca$x), pos = 1)
Personally, I think the resulting aesthetics are nicer (and more easy to change to suit your needs) with ggplot. The code is also a bit easier to read once you get used to the syntax.
library(ggplot2)
df <- as.data.frame(pca$x)
df$city <- rownames(df)
ggplot(df, aes(PC1, PC2, color = city)) +
geom_point(size = 3) +
geom_text(aes(label = city) , vjust = 2) +
lims(x = c(-2.5, 2.5), y = c(-2, 2)) +
theme_bw() +
theme(legend.position = "none")
Created on 2021-10-28 by the reprex package (v2.0.0)

Boxplots aren't colouring or plotting labels properly in R, why?

My Tukey test significant results LABELS and the colours plotted as box plots do not plot over each sample box plot. Why?
Seems like the labels are plotted at different y-axis along the same s1 (x-axis)?
Reproducible dataset here:
library(multcompView)
df <- data.frame('Sample'=c("s1","s1","s1","s1","s1","s2","s2","s2","s2","s2","s3","s3","s3","s3","s4","s4","s5","s5"), 'value'=c(-0.1098,-0.1435,-0.1046,-0.1308,-0.1523,-0.1219,-0.1114,-0.1328,-0.1589,-0.1567,-0.1395,-0.1181,-0.1448,-0.124,-0.1929,-0.1996,-0.1981,-0.1917))
anova_df <- aov(df$value ~ df$Sample )
tukey_df <- TukeyHSD(anova_df, 'df$Sample', conf.level=0.95)
# I need to group the treatments that are not different each other together.
TUKEY <- tukey_df
generate_label_df <- function(TUKEY, variable){
# Extract labels and factor levels from Tukey post-hoc
Tukey.levels <- TUKEY[[variable]][,4]
Tukey.labels <- data.frame(multcompLetters(Tukey.levels)['Letters'])
#I need to put the labels in the same order as in the boxplot :
Tukey.labels$Sample=rownames(Tukey.labels)
Tukey.labels=Tukey.labels[order(Tukey.labels$Sample) , ]
return(Tukey.labels)
}
# Apply the function on my dataset
LABELS <- generate_label_df(TUKEY , "df$Sample")
# A panel of colors to draw each group with the same color :
my_colors <- c(
rgb(143,199,74,maxColorValue = 255),
rgb(242,104,34,maxColorValue = 255),
rgb(111,145,202,maxColorValue = 255))
# Draw the basic boxplot
a <- boxplot(df$value ~ df$Sample , ylim=c(min(df$value) , 1.1*max(df$value)) , col=my_colors[as.numeric(LABELS[,1])] , ylab="Value" , main="")
# I want to write the letter over each box. Over is how high I want to write it.
over <- 0.1*max(a$stats[nrow(a$stats),] )
#Add the labels
text(c(1:nlevels(df$Sample)), a$stats[nrow(a$stats),]+over, LABELS[,1] , col=my_colors[as.numeric(LABELS[,1])] )
Current output:
Desired plot-like (colours and LABELS):
First, LABELS$Letters is a character vector. You can get as.numeric(LABELS[,1]) to work if you make it a factor first.
Second, your y-limit needs some work for negative values. There is a function you might find useful called extendrange which is used in many a plotting function.
This line c(1:nlevels(df$Sample)) also would work if df$Sample was a factor which is was not.
Also, if you are plotting text at a specific location, you can adjust the text using either text(..., pos = ) or text(..., adj = ) to shift the position.
LABELS$Letters <- factor(LABELS$Letters)
a <- boxplot(df$value ~ df$Sample , ylim = extendrange(df$value), col=my_colors[as.numeric(LABELS[,1])] , ylab="Value" , main="")
text(seq_along(a$names), apply(a$stats, 2, max), LABELS[,1], col=my_colors[as.numeric(LABELS[,1])], pos = 3)
If you don't mind changing your workflow and use tidyverse library this is how you could achieve your goal:
# join df and LABELS into one data table
inner_join(df, LABELS, by = "Sample") %>%
# calculate max value for each Sample group (it will be used to place the labels)
group_by(Sample) %>%
mutate(placement = max(value)) %>%
ungroup() %>%
# make a plot
ggplot(aes(Sample, value, fill = Letters))+
geom_boxplot()+
geom_text(aes(y = placement, label = Letters, col = Letters), nudge_y = 0.01, size = 6)+
theme_minimal()+
theme(legend.position = "none")

Automatic Highlighting Outliers in ggplots

I have a dataframe df. While plotting this in ggplot. Can we also highlight outliers. Below is the sample code
df <- data.frame(col=runif(100, min=0, max=100000))
df$D <- c(1:100)
ggplot(df,aes(x=D,y=col))+geom_line()
Is there the way to highlight outliers here
We can define a function for this. The line_outlier_plot has four arguments. df has the same format as your example data frame. outlier_color and normal_color are to specify the color for the points.drop indicates if we want to drop the category in the legend.
We have to define how to determine an outlier. Here, I decided that an outlier is a value larger or smaller than the mean plus or minus 3 times of the standard deviation. You can define your own approach to determine the outlier by modifying the code in the ifelse statement.
library(ggplot2)
line_outlier_plot <- function(df, outlier_color = "red", normal_color = "black", drop = FALSE){
# Assign a label to show if it is an outlier or not
df$label <- ifelse(df$col > mean(df$col) + 3 * sd(df$col) |
df$col < mean(df$col) - 3 * sd(df$col), "Outlier", "Normal")
df$label <- factor(df$label, levels = c("Normal", "Outlier"))
# Set the color palette
pal <- c("Outlier" = outlier_color, "Normal" = normal_color)
p <- ggplot(df, aes(x = D, y = col)) +
geom_line() +
geom_point(aes(color = label)) +
scale_color_manual(values = pal, drop = drop)
return(p)
}
Below is an example of the plot using this function.
set.seed(155)
df <- data.frame(col=rnorm(1000))
df$D <- c(1:1000)
line_outlier_plot(df)

Adding text outside plot doesn't work in r

I have a simple dataset:
11 observations, 1 variable.
I want to plot them adding my own axis names, but when I want to change the position of them, R keeps plotting them in the exact same spot.
Here is my script:
plot(data[,5], xlab = "", xaxt='n')
axis(1, at = 1:11, labels = F)
text(1:11, par("usr")[3] - 0.1, srt = 90, adj = 1, labels = names, xpd = TRUE)
I am changing the -0.1, to any number but R keeps placing the labels in the exact same spot. I tried with short names like "a" but the result is the same.
Thanks in advance
My data:
10308.9
10201.6
12685.3
3957.93
7677.1
9671.7
11849.4
10755.7
11283.4
11583.8
12066.9
names <- rep("name",11)
My ggplot solution:
# creating the sample dataframe
data <- read.table(text="10308.9
10201.6
12685.3
3957.93
7677.1
9671.7
11849.4
10755.7
11283.4
11583.8
12066.9", header=FALSE)
# adding a names column
data$names <- as.factor(paste0("name",sprintf("%02.0f", seq(1,11,1))))
#creating the plot
require(ggplot2)
ggplot(data, aes(x=names, y=V1)) +
geom_bar(fill = "white", color = "black")
which gives:
When you want to change the order of the bars, you can do that with transform:
# transforming the data (I placed "name04" as the first one)
data2 <- transform(data,
newnames=factor(names,
levels=c("name04","name01","name02","name03","name04","name05","name06","name07","name08","name09","name10","name11"),
ordered =TRUE))
#creating the plot
ggplot(data2, aes(x=newnames, y=V1)) +
geom_bar(stat="identity", fill="white", color="black")
which gives:

Adding line type to legend in ggplot2

How do I make the line types used by geom_hline or geom_abline show up in the legend of a ggplot plot?
For example:
require (ggplot2)
# some data
dummy <- data.frame (category1 = rep (1:5, 8), category2 = rep (1:4, each = 10),
category3 = rep (factor (1:2), 2), expected = 10 ^ rep (4:7, each = 10),
value = 10 ^rnorm(40, 5))
# faceted plot
baseplot <-ggplot (dummy ) +
geom_point (aes (category1, value, color = category3))+
scale_y_log10 () +
facet_wrap (~category2)
# add a dotted line for expected value
p1 <-baseplot + geom_hline ( aes ( yintercept = expected), linetype = 2)
I tried a couple approaches to making the dotted line show up in the legend, but they give me the same thing as p1
p1a < -p1+scale_linetype_discrete (labels = "expected")+
guides ( linetype= guide_legend ("", labels ="expected"))
p1b <- baseplot + geom_hline (aes (yintercept = expected, linetype = "expected")) +
scale_linetype_manual (labels= "expected", values = 2)
p1a
p1b
How about multiple lines/line types?
Let's say I also wanted to plot groupwise and overall geometric means
require (reshape)
require (plyr)
# calculate geometric means, keep them in their own data frame
geometric_mean <- function (x) exp ( mean (log (x)))
dummy $GM_overall <- geometric_mean (dummy $value)
extra <- ddply(dummy, c( "GM_overall", "expected","category2"), summarize,
GM_group = geometric_mean (value))
extra_long <- melt (GM_group_long, id.vars = "category2")
I expected this approach to show linetype in the legend based on this post, but no such luck
p2=baseplot + geom_hline ( aes ( yintercept = value , linetype = variable), extra)
p2
Here's another case where I would want to do something similar with abline
It would be nice to be able to label the line as 1:1
dummy$value2 <- dummy $value * runif(40, 0.5, 2)
ggplot (dummy)+coord_fixed() +
geom_point (aes (value, value2, color = category3))+
geom_abline (yintercept =0, slope =1)
I'm using R 3.0.0, ggplot 0.9.3.1
You run through several examples, but this simple case should get you most of the way there:
dummy <- data.frame (category1 = rep (1:5, 8), category2 = rep (1:4, each = 10),
category3 = rep (factor (1:2), 2), expected = 10 ^ rep (4:7, each = 10),
value = 10 ^rnorm(40, 5))
# faceted plot
baseplot <- ggplot(dummy) +
geom_point(aes(category1, value, color = category3))+
scale_y_log10() +
facet_wrap(~category2)
# add a dotted line for expected value
baseplot + geom_hline(aes(yintercept = expected,linetype = "expected"),show_guide = TRUE)
The key in most cases, I think, is adding show_guide = TRUE. It is FALSE by default for this geom, which may or may not be intuitive. (I can see the rationale.)
Note how, in this "one line type" case, I "tricked" ggplot into creating a legend by mapping linetype to the character "expected", which causes a new column to be created behind the scenes. Multiple line types should work as expected with the usual methods of creating columns and mapping them to linetype.

Resources