Related
I want to get a Mahalanobis difference for each set of two scores, after being grouped by another variable. In this case, it would be a Mahalanobis difference for each Attribute (across each set of 2 scores). The output should be 3 Mahalanobis distances (one for A, B and C).
Currently I am working with (in my original dataframe, there are some NAs, hence I include one in the reprex):
library(tidyverse)
library(purrr)
df <- tibble(Attribute = unlist(map(LETTERS[1:3], rep, 5)),
Score1 = c(runif(7), NA, runif(7)),
Score2 = runif(15))
mah_db <- df %>%
dplyr::group_by(Attribute) %>%
dplyr::summarise(MAH = mahalanobis(Score1:Score2,
center = base::colMeans(Score1:Score2),
cov(Score1:Score2, use = "pairwise.complete.obs")))
This raises the error:
Caused by error in base::colMeans(): ! 'x' must be an array of at
least two dimensions
But as far as I can tell, I am giving colMeans two columns.
So what's going wrong here? And I wonder if even fixing this gives a complete solution?
It seems your question is more about the statistics than dplyr. So I just give a small example based on your data and an adapted example from ?mahalanobis. Perhaps also have a look here or here.
df <- subset(x = df0, Attribute == "A", select = c("Score1", "Score2"))
df$mahalanobis <- mahalanobis(x = df, center = colMeans(df), cov = cov(df))
df$p <- pchisq(q = df$mahalanobis, df = 2, lower.tail = FALSE)
plot(density(df$mahalanobis, bw = 0.3), ylim = c(0, 0.8),
main="Squared Mahalanobis distances");
grid()
rug(df$mahalanobis)
df <- subset(x = df0, Attribute == "B", select = c("Score1", "Score2"))
df <- df[complete.cases(df), ]
df$mahalanobis <- mahalanobis(x = df, center = colMeans(df), cov = cov(df))
df$p <- pchisq(q = df$mahalanobis, df = 2, lower.tail = FALSE)
lines(density(df$mahalanobis, bw = 0.3), col = "red",
main="Squared Mahalanobis distances");
rug(df$mahalanobis, col = "red")
df <- subset(x = df0, Attribute == "C", select = c("Score1", "Score2"))
df$mahalanobis <- mahalanobis(x = df, center = colMeans(df), cov = cov(df))
df$p <- pchisq(q = df$mahalanobis, df = 2, lower.tail = FALSE)
lines(density(df$mahalanobis, bw = 0.3), col = "green",
main="Squared Mahalanobis distances");
rug(df$mahalanobis, col = "green")
Hope, that helps (and too long for a comment).
(Of course you can make to code much shorter, but it shows in each step what happens.)
I have a dataframe made by the school grades of some students in different subjects. The students are also characterized by their gender (F or M), that is included as a suffix in their names (e.g. Anne_F, Albert_M, etc...)
With these data I have created an heatmap with the package pheatmap(), in this way:
library(pheatmap)
Anne_F <- c(9,7,6,10,6)
Carl_M <- c(6,7,9,5,7)
Albert_M <- c(8,8,8,7,9)
Kate_F <- c(10,5,10,9,5)
Emma_F <- c(6,8,10,8,7)
matrix <- cbind(Anne_F, Carl_M, Albert_M, Kate_F, Emma_F)
rownames(matrix) <- c("Math", "Literature", "Arts", "Science", "Music")
print(matrix)
heatmap <- pheatmap(
mat = matrix,
cluster_rows = F,
cluster_cols = F,
cellwidth = 30,
cellheight = 30,
)
heatmap
Which gives this matrix
and the relative plot:
Now I would like to automatically recognize if a student is Male or Female and add this as a column annotation in the heatmap, in order to have a graph like this:
I have thought to create two vectors, one with the name of the students:
name <- c("Anne", "Carl", "Albert", "Kate", "Emma") and one with the respective genders: gender <- c("F", "M", "M", "F", "F") , but I can't figure out how to associate names with genders, and to show them on the heatmap.
I don't mean to manually associate one-name to one-gender (as Anne to F, Albert to M, etc,). I need to take the entire vector of names and associate it with the corresponding vector of genders (and then annotate them on the heatmap), because their number will increase in the future.
Many thanks in advance for your help.
You need to use annotation_col option in pheatmap.
library(pheatmap)
# split matrix into "Name" and "Gender"
name_gender_matrix <- str_split_fixed(colnames(matrix), "_", 2)
# Data that maps to the heatmap should be set at the rownames
annot_col <- data.frame(row.names = name_gender_matrix[, 1], Gender = name_gender_matrix[, 2])
# Align the column name of your matrix and with the annotation
colnames(matrix) <- rownames(annot_col)
heatmap <- pheatmap(
mat = matrix,
cluster_rows = F,
cluster_cols = F,
cellwidth = 30,
cellheight = 30,
annotation_col = annot_col
)
With the given data, you could achieve your desired output like this:
Gender <- sapply(colnames(matrix), function(x) strsplit(x, "_")[[1]][2])
df <- as.data.frame(Gender)
pheatmap(
mat = matrix,
cluster_rows = F,
cluster_cols = F,
cellwidth = 30,
cellheight = 30,
annotation_col = df,
annotation_colors = list(Gender = c(M = "#6ef88a", F = "#d357fe"))
)
df <- data.frame(replicate(10,sample(0:100,1000,rep=TRUE)))
eee <- as.data.frame(lapply(df, function(cc) cc[ sample(c(TRUE, NA), prob = c(0.85, 0.15), size = length(cc), replace = TRUE) ]))
View(eee)
This gives me a data frame with missing data.
If a variable in my current data frame has missing values, then I want to create two new variables. The first being a binary "yes" this was missing or "no" it wasn't missing. I want the second variable to be the same as the original, if the variable is not missing. If it is missing, I want to impute the mean of the original variable for my new column.
I'm not sure how to write the code to do this checking my whole data set instead of doing each variable individually.
Thank you for the help!
I worked something out that is crude but effective.
df <- data.frame(replicate(10,sample(0:100,1000,rep=TRUE)))
eee <- as.data.frame(lapply(df,
function(cc) cc[ sample(c(TRUE, NA), prob = c(0.85, 0.15), size = length(cc), replace = TRUE) ]))
replace_fn1 <- function(x) ifelse(is.na(x), "yes", "no")
pt1 <- apply(eee, c(1, 2), replace_fn1)
col_means <- as.data.frame(t(apply(eee, 2, mean, na.rm = TRUE)))
#set up df with same size of all column means
col_means <- as.data.frame(matrix(col_means,
nrow = 1000, ncol = 10, byrow = TRUE))
pt2 <- pt1
na_ind <- which(is.na(eee), arr.ind = TRUE)
pt2[na_ind] <- col_means[na_ind]
I am trying to create a heatmap of species abundances across six sites.
I have a matrix of sites vs species, of numeric abundance data.
However when I run my code, R returns an error that my matrix is non-numeric.
Can anyone figure this one out? I am stumped.
Exported dataframe link: log_mean_wide
Working:
lrc <- rainbow(nrow(log_mean_wide), start = 0, end = .3)
lcc <- rainbow(ncol(log_mean_wide), start = 0, end = .3)
logmap <- heatmap(log_mean_wide, col = cm.colors(256), scale = "column",
RowSideColors = lrc, ColSideColors = lcc, margins = c(5, 10),
xlab = "species", ylab = "Site",
main = "heatmap(<Auckland Council MCI data 1999, habitat:bank>, ..., scale = \"column\")")
error message: Error in heatmap(log_mean_wide, Rowv = NA, Colv = NA, col = cm.colors(256), : 'x' must be a numeric matrix
log_heatmap <- heatmap(log_mean_wide, Rowv=NA, Colv=NA, col = cm.colors(256), scale="column", margins=c(5,10)) #same error
is.numeric(log_mean_wide) #[1] FALSE
is.character(log_mean_wide) #[1] FALSE
is.factor(log_mean_wide) #[1] FALSE
is.logical(log_mean_wide) #[1] FALSE
is.integer(log_mean_wide) #[1] FALSE
?!?!
dims <- dim(log_mean_wide)
log_mean_matrix <- as.numeric(log_mean_wide)
dim(log_mean_matrix) <- dims
Error: (list) object cannot be coerced to type 'double'
str(log_mean_wide) shows species as numeric, site as character- why does this not work then?
storage.mode(log_mean_wide) <- "numeric"
Error in storage.mode(log_mean_wide) <- "numeric" : (list) object cannot be coerced to type 'double'
There are two issues:
The first column log_mean_wide$Site is non-numeric.
heatmap only accepts a matrix as input data (not a data.frame).
To address these issues, you can do the following (mind you, there is a lot of clutter in the heatmap):
# Store Site information as rownames
df <- log_mean_wide;
rownames(df) <- log_mean_wide[, 1];
# Remove non-numeric column
df <- df[, -1];
# Use as.matrix to convert data.frame to matrix
logmap <- heatmap(
as.matrix(df),
col = cm.colors(256),
scale = "column",
margins = c(5, 10),
xlab = "species", ylab = "Site",
main = "heatmap(<Auckland Council MCI data 1999, habitat:bank>, ..., scale = \"column\")")
This is an old question but since I spent some time figuring out what the issue was, I will add an answer here. Drawing a heatmap, specifically adding an annotation column may fail if the annotation data is a tibble.
Reproducible example:
test = matrix(rnorm(200), 20, 10)
test[1:10, seq(1, 10, 2)] = test[1:10, seq(1, 10, 2)] + 3
test[11:20, seq(2, 10, 2)] = test[11:20, seq(2, 10, 2)] + 2
test[15:20, seq(2, 10, 2)] = test[15:20, seq(2, 10, 2)] + 4
colnames(test) = paste("Test", 1:10, sep = "")
rownames(test) = paste("Gene", 1:20, sep = "")
annotation_col = data.frame(
CellType = factor(rep(c("CT1", "CT2"), 5)),
Time = 1:5
)
rownames(annotation_col) = paste("Test", 1:10, sep = "")
pheatmap::pheatmap(test, annotation_col =
annotation_col)
The above works. However, if you instead were using a tibble, you would get an error
pheatmap::pheatmap(test, annotation_col =
dplyr::as_tibble(annotation_col))
Error in cut.default(a, breaks = 100) : 'x' must be numeric
NOTE
I think it would have been better for this error to specify that we needed a data.frame instead of something else. That might have been more specific.
See this issue.
This question already has answers here:
How to create pre-annotated rowside column in heatmap.2
(2 answers)
Closed 6 years ago.
I have the following MWE in which I make a heatmap without performing any clustering and showing any dendrogram. I want to group my rows (genes) together in categories, in a better looking way than how it is now.
This is the MWE:
#MWE
library(gplots)
mymat <- matrix(rexp(600, rate=.1), ncol=12)
colnames(mymat) <- c(rep("treatment_1", 3), rep("treatment_2", 3), rep("treatment_3", 3), rep("treatment_4", 3))
rownames(mymat) <- paste("gene", 1:dim(mymat)[1], sep="_")
rownames(mymat) <- paste(rownames(mymat), c(rep("CATEGORY_1", 10), rep("CATEGORY_2", 10), rep("CATEGORY_3", 10), rep("CATEGORY_4", 10), rep("CATEGORY_5", 10)), sep=" --- ")
mymat #50x12 MATRIX. 50 GENES IN 5 CATEGORIES, ACROSS 4 TREATMENTS WITH 3 REPLICATES EACH
png(filename="TEST.png", height=800, width=600)
print(
heatmap.2(mymat, col=greenred(75),
trace="none",
keysize=1,
margins=c(8,14),
scale="row",
dendrogram="none",
Colv = FALSE,
Rowv = FALSE,
cexRow=0.5 + 1/log10(dim(mymat)[1]),
cexCol=1.25,
main="Genes grouped by categories")
)
dev.off()
Which produces this:
I would like to group the CATEGORIES in the rows together (and, if possible, the treatments in the columns as well), so it looks something like the following:
Or, maybe even better, with the CATEGORIES on the left, the same way as when clustering is performed and dendrograms are shown; however is easier and clearer...
Is there any way? Thanks!!
EDIT!!
I was made aware of the RowSideColors in the comments and I made the MWE below. However, I don't seem to be able to print the legend in the output png, plus the colors in the legend are not correct, and I cannot get the position right either. So please help me with the legend in the MWE below.
On another hand, I use the palette "Set3", consisting of 12 colors, but what if I need more than 12 colors (if I have more than 12 categories)??
NEW MWE
library(gplots)
library(RColorBrewer)
col1 <- brewer.pal(12, "Set3")
mymat <- matrix(rexp(600, rate=.1), ncol=12)
colnames(mymat) <- c(rep("treatment_1", 3), rep("treatment_2", 3), rep("treatment_3", 3), rep("treatment_4", 3))
rownames(mymat) <- paste("gene", 1:dim(mymat)[1], sep="_")
mymat
mydf <- data.frame(gene=paste("gene", 1:dim(mymat)[1], sep="_"), category=c(rep("CATEGORY_1", 10), rep("CATEGORY_2", 10), rep("CATEGORY_3", 10), rep("CATEGORY_4", 10), rep("CATEGORY_5", 10)))
mydf
png(filename="TEST.png", height=800, width=600)
print(
heatmap.2(mymat, col=greenred(75),
trace="none",
keysize=1,
margins=c(8,6),
scale="row",
dendrogram="none",
Colv = FALSE,
Rowv = FALSE,
cexRow=0.5 + 1/log10(dim(mymat)[1]),
cexCol=1.25,
main="Genes grouped by categories",
RowSideColors=col1[as.numeric(mydf$category)]
)
#THE LEGEND DOESN'T WORK INSIDE print(), AND THE POSITION AND COLORS ARE WRONG
#legend("topright",
# legend = unique(mydf$category),
# col = col1[as.numeric(mydf$category)],
# lty= 1,
# lwd = 5,
# cex=.7
# )
)
dev.off()
Which produces:
Please help me with the legend, and with the hypothetical case I need more than 12 colors. Thanks!
I would use pheatmap package. Your example would look something like that:
library(pheatmap)
library(RColorBrewer)
# Generte data (modified the mydf slightly)
col1 <- brewer.pal(12, "Set3")
mymat <- matrix(rexp(600, rate=.1), ncol=12)
colnames(mymat) <- c(rep("treatment_1", 3), rep("treatment_2", 3), rep("treatment_3", 3), rep("treatment_4", 3))
rownames(mymat) <- paste("gene", 1:dim(mymat)[1], sep="_")
mydf <- data.frame(row.names = paste("gene", 1:dim(mymat)[1], sep="_"), category = c(rep("CATEGORY_1", 10), rep("CATEGORY_2", 10), rep("CATEGORY_3", 10), rep("CATEGORY_4", 10), rep("CATEGORY_5", 10)))
# add row annotations
pheatmap(mymat, cluster_cols = F, cluster_rows = F, annotation_row = mydf)
# Add gaps
pheatmap(mymat, cluster_cols = F, cluster_rows = F, annotation_row = mydf, gaps_row = c(10, 20, 30, 40))
# Save to file with dimensions that keep both row and column names readable
pheatmap(mymat, cluster_cols = F, cluster_rows = F, annotation_row = mydf, gaps_row = c(10, 20, 30, 40), cellheight = 10, cellwidth = 20, file = "TEST.png")