How to make hierarchical cluster pheatmap in r? - r

I have used this code to make a hierarchical cluster heatmap, but no colors appear:
library(tidyverse)
Mydata <- structure(list(Location = c("Karnaphuli River", "Sangu River", "Kutubdia Channel", "Moheshkhali Channel", "Bakkhali River", "Naf River", "St. Martin's Island", "Mean "), Cr = c(114.92, 2.75, 18.88, 27.6, 39.5, 12.8, 17.45, 33.41), Pb = c(31.29, 26.42, 52.3, 59.45, 34.65, 12.8, 9.5, 32.34), Cu = c(9.48, 54.39, 52.4, 73.28, 76.26, 19.48, 8.94, 42.03), Zn = c(66.2, 71.17, 98.7, 95.3, 127.84, 27.76, 21.78, 72.67), As = c(89.67, 9.85, 8.82, 18.54, 15.38, 7.55, 16.45, 23.75), Cd = c(1.06, 0, 0.96, 2.78, 3.12, 0.79, 0.45, 1.53)), class = "data.frame", row.names = c(NA, -8L))
library(pheatmap)
# NOTE: `%>%` already supplies the matrix as pheatmap()'s first argument, so the
# explicit `Mydata` below arrives as a second positional argument (pheatmap's
# second parameter is `color`) instead of being used as the data.
Mydata %>% column_to_rownames(var = "Location") %>%
as.matrix() %>% pheatmap(Mydata, cutree_cols = 6)

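You don't need to pass the data again when using pipes: `%>%` already supplies the matrix as the first argument, so the extra `Mydata` is matched to pheatmap's second argument, `color`, which is why no colors appear. Try: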
library(pheatmap)
# Location names become row names so the matrix is purely numeric; that matrix
# is the only positional argument handed to pheatmap().
pheatmap(
  as.matrix(column_to_rownames(Mydata, var = "Location")),
  cutree_cols = 6
)

Related

How to rearrange dataset for ScatterPlot using ggplot in R

I'm trying to plot each country's CO2 emissions over the 6 years, but I'm having trouble with the way the data is laid out in the Excel file. I tried unlisting the columns and combining them into new variables, but with no luck. Any help on determining aes(x =, y =) for the data set below?
structure(list(`2010` = c(5.78, 7.34, 8.74, 1.45, 17.9), `2011` = c(5.76,
7.56, 8.49, 1.56, 17.1), `2012` = c(5.75, 7.36, 7.62, 1.56, 17.5
), `2013` = c(5.23, 6.71, 7.36, 1.7, 17.5), `2014` = c(5.3, 6.42,
7.04, 1.76, 16.9), `2015` = c(5.31, 6.04, 6.73, 1.79, 16.4)), row.names = c(59L,
62L, 69L, 79L, 184L), class = "data.frame")
Assuming each row is a different country:
library(tidyverse)
j_df <- structure(list(
`2010` = c(5.78, 7.34, 8.74, 1.45, 17.9),
`2011` = c(5.76, 7.56, 8.49, 1.56, 17.1),
`2012` = c(5.75, 7.36, 7.62, 1.56, 17.5 ),
`2013` = c(5.23, 6.71, 7.36, 1.7, 17.5),
`2014` = c(5.3, 6.42, 7.04, 1.76, 16.9),
`2015` = c(5.31, 6.04, 6.73, 1.79, 16.4)
), row.names = c(59L, 62L, 69L, 79L, 184L), class = "data.frame")
# Expose the row names as a `Country` column, stack the year columns into long
# format, then draw points plus one connecting line per country.
j_df %>%
  rownames_to_column(var = "Country") %>%
  pivot_longer(
    cols = -Country,
    names_to = "year",
    values_to = "C_Emissions"
  ) %>%
  ggplot(aes(x = year, y = C_Emissions, colour = Country)) +
  geom_point() +
  # `year` is a character column, so lines need an explicit group to connect.
  geom_line(aes(group = Country))

How to customize colors for lines and points in feasts::gg_season()

I'm able to convert the following df to tsibble object and plot using gg_season():
library(tsibble)
library(feasts)
library(tidyr)
library(dplyr)
df <- structure(list(date = structure(c(18292, 18321, 18352, 18382,
18413, 18443, 18474, 18505, 18535, 18566, 18596, 18627, 18658,
18686, 18717, 18747, 18778, 18808, 18839, 18870, 18900, 18931,
18961, 18992), class = "Date"), value1 = c(-2.94, -40.61, -6.89,
3.04, -3.5, 0.18, 6.79, 9.08, 9.35, 10.92, 20.53, 18.04, 24.6,
154.6, 30.4, 32.1, 27.7, 32.1, 19.2, 25.4, 28, 26.9, 21.7, 20.9
), value2 = c(-12.66, 7.56, -1.36, -14.39, -16.18, 3.29, -0.69,
-1.6, 13.47, 4.83, 4.56, 7.58, 28.7, 18.9, 39.1, 44, 52, 37.1,
28.2, 32.7, 17.2, 20.4, 31.4, 19.5)), class = "data.frame", row.names = c(NA,
-24L))
# Reshape to long format, index the series by year-month with one key per
# value column, and draw the seasonal plot with points overlaid.
df %>%
  pivot_longer(cols = value1:value2) %>%
  mutate(
    date = yearmonth(date),
    year = year(date)
  ) %>%
  as_tsibble(key = name, index = date) %>%
  gg_season(value) +
  geom_point() # +
# scale_color_manual(values = c('2020' = 'blue', '2021' = 'red'))
Now I am trying to customize the colors by year, i.e., blue for 2020 and red for 2021. I've added scale_color_manual(values = c('2020' = 'blue', '2021' = 'red')), but it doesn't work yet. How can I do this correctly? Thanks.
Reference:
how to change the color in geom_point or lines in ggplot
...
gg_season(value, pal = c("#3333FF", "#FF3333")) +
geom_point()
The year scale here is a continuous one (explaining why the scale_color_manual line produces "Error: Continuous value supplied to discrete scale"). But we can give gg_season a vector of color codes to use in its pal parameter.

Getting the distance matrix back from already clustered data

I have used hclust together with the TSclust package to do agglomerative hierarchical clustering. My question is: can I get the dissimilarity (distance) matrix back from hclust? I want the distance values so that I can rank which variables are closest to a given variable in the group.
example: If (x1,x2, x3,x4,x5,x6,x7,x8,x9,x10) are the variables used to form the distance matrix, then what I wanted is the distance between x3 and the rest of variables (x3x1,x3x2,x3x4,x3x5, and so on). Can we do that? Here is the code and reproducible data.
Data:
structure(list(x1 = c(186.41, 100.18, 12.3, 14.38, 25.97, 0.06,
0, 6.17, 244.06, 19.26, 256.18, 255.69, 121.88, 75, 121.45, 11.34,
34.68, 3.09, 34.3, 26.13, 111.31), x2 = c(327.2, 8.05, 4.23,
6.7, 3.12, 1.91, 37.03, 39.17, 140.06, 83.72, 263.29, 261.22,
202.48, 23.27, 2.87, 7.17, 14.48, 3.41, 5.95, 70.56, 91.58),
x3 = c(220.18, 126.14, 98.59, 8.56, 0.5, 0.9, 17.45, 191.1,
164.64, 224.36, 262.86, 237.75, 254.88, 42.05, 9.12, 0.04,
12.22, 0.61, 61.86, 114.08, 78.94), x4 = c(90.74, 26.11,
47.86, 10.86, 3.74, 23.69, 61.79, 68.12, 87.92, 171.76, 260.98,
266.62, 96.27, 57.15, 78.89, 16.73, 6.59, 49.44, 57.21, 202.2,
67.17), x5 = c(134.09, 27.06, 7.44, 4.53, 17, 47.66, 95.96,
129.53, 40.23, 157.37, 172.61, 248.56, 160.84, 421.94, 109.93,
22.77, 2.11, 49.18, 64.13, 52.61, 180.87), x6 = c(173.17,
46.68, 6.54, 3.05, 0.35, 0.12, 5.09, 72.46, 58.19, 112.31,
233.77, 215.82, 100.63, 65.84, 2.69, 0.01, 3.63, 12.93, 66.55,
28, 61.74), x7 = c(157.22, 141.81, 19.98, 116.18, 16.55,
122.3, 62.67, 141.84, 78.3, 227.27, 340.22, 351.38, 147.73,
0.3, 56.12, 33.2, 5.51, 54.4, 82.98, 152.66, 218.26), x8 = c(274.08,
51.92, 54.86, 15.37, 0.31, 0.05, 36.3, 162.04, 171.78, 181.39,
310.73, 261.55, 237.99, 123.99, 1.92, 0.74, 0.23, 18.51,
7.68, 65.55, 171.33), x9 = c(262.71, 192.34, 2.75, 21.68,
1.69, 3.92, 0.09, 9.33, 120.36, 282.92, 236.7, 161.59, 255.44,
126.44, 7.63, 2.04, 1.02, 0.12, 5.87, 146.25, 134.11), x10 = c(82.71,
44.09, 1.52, 2.63, 4.38, 28.64, 168.43, 80.62, 20.36, 39.29,
302.31, 247.52, 165.73, 18.27, 2.67, 1.77, 23.13, 53.47,
53.14, 46.61, 86.29)), class = "data.frame", row.names = c(NA,
-21L))
Code:
# NOTE: the result of as.ts() is not assigned, so `cdata` itself is unchanged
# and everything below keeps working on the original data frame.
as.ts(cdata)
library(dplyr) # data wrangling
library(ggplot2) # grammar of graphics
library(ggdendro) # dendrograms
library(TSclust) # cluster time series

# Cluster analysis ----
# The data frame is transposed so that each row of the resulting matrix is one
# series (x1 ... x10) before the pairwise dissimilarities are computed.
dist_ts <- TSclust::diss(SERIES = t(cdata), METHOD = "INT.PER")
# `method` can also be e.g. "average". Divisive clustering (DIANA) is not an
# hclust() method; it is provided separately by cluster::diana().
hc <- stats::hclust(dist_ts, method = "complete")
hcdata <- ggdendro::dendro_data(hc)
names_order <- hcdata$labels$label
# Uncomment the next line to blank the labels stored in hcdata so they are not
# doubled up on the plot (keeping them is useful for checking):
# hcdata$labels$label <- ""
hcdata %>% ggdendro::ggdendrogram(., rotate = FALSE, leaf_labels = FALSE)
I believe the object you are looking for is stored in the variable dist_ts:
# The dissimilarities fed to hclust() are still available in dist_ts.
dist_ts <- TSclust::diss(SERIES = t(cdata), METHOD = "INT.PER")
print(dist_ts)
# Expand the "dist" object into a full square matrix so a single series' row
# can be pulled out; rows and columns follow the column order of cdata.
dist_mat <- as.matrix(dist_ts)
dimnames(dist_mat) <- list(names(cdata), names(cdata))
# Distances from x3 to every other series, closest first.
sort(dist_mat["x3", names(cdata) != "x3"])

How to make Hierarchical Cluster Heatmap in R?

I have this data set, I want to make Hierarchical Cluster Heatmap in R. Please help me
structure(list(Location = c("Karnaphuli River", "Sangu River", "Kutubdia Channel", "Moheshkhali Channel", "Bakkhali River", "Naf River", "St. Martin's Island", "Mean "), Cr = c(114.92, 2.75, 18.88, 27.6, 39.5, 12.8, 17.45, 33.41), Pb = c(31.29, 26.42, 52.3, 59.45, 34.65, 12.8, 9.5, 32.34), Cu = c(9.48, 54.39, 52.4, 73.28, 76.26, 19.48, 8.94, 42.03), Zn = c(66.2, 71.17, 98.7, 95.3, 127.84, 27.76, 21.78, 72.67), As = c(89.67, 9.85, 8.82, 18.54, 15.38, 7.55, 16.45, 23.75), Cd = c(1.06, 0, 0.96, 2.78, 3.12, 0.79, 0.45, 1.53)), class = "data.frame", row.names = c(NA, -8L))
library(tidyverse)
library(gplots)
#>
#> Attaching package: 'gplots'
#> The following object is masked from 'package:stats':
#>
#> lowess
dat <- structure(list(Location = c("Karnaphuli River", "Sangu River", "Kutubdia Channel", "Moheshkhali Channel", "Bakkhali River", "Naf River", "St. Martin's Island", "Mean "), Cr = c(114.92, 2.75, 18.88, 27.6, 39.5, 12.8, 17.45, 33.41), Pb = c(31.29, 26.42, 52.3, 59.45, 34.65, 12.8, 9.5, 32.34), Cu = c(9.48, 54.39, 52.4, 73.28, 76.26, 19.48, 8.94, 42.03), Zn = c(66.2, 71.17, 98.7, 95.3, 127.84, 27.76, 21.78, 72.67), As = c(89.67, 9.85, 8.82, 18.54, 15.38, 7.55, 16.45, 23.75), Cd = c(1.06, 0, 0.96, 2.78, 3.12, 0.79, 0.45, 1.53)), class = "data.frame", row.names = c(NA, -8L))
# Turn the location names into row names so the matrix is purely numeric, then
# draw a clustered heatmap; the pipe supplies the matrix as the first argument.
dat %>%
  column_to_rownames(var = "Location") %>%
  as.matrix() %>%
  heatmap.2(
    scale = "none", # set scaling by column, row or none
    Rowv = TRUE, # toggles clustering of rows
    Colv = TRUE, # toggles clustering of columns
    trace = "none", # turn off trace in each column of heatmap
    margins = c(3, 10), # set margins around plot (column labels, row labels)
    col = colorRampPalette(c("white", "red"))(11), # set color scheme
    symkey = FALSE, # do not force the color key to be symmetric about 0
    symbreaks = FALSE, # do not force the color breaks to be symmetric about 0
    main = "Heatmap Title", # set main plot title
    cexRow = 1, # set font size for rows
    cexCol = 1, # set font size for columns
    tracecol = "black", # set color of histogram on key
    key.xlab = "Value", # set title for legend
    lhei = c(1, 3), # relative heights of the layout rows (key row, heatmap row)
    lwid = c(1, 3), # relative widths of the layout columns (key column, heatmap column)
    keysize = 2 # set overall key size
  )
Created on 2021-04-21 by the reprex package (v1.0.0)
There are many options to customize the details of the heatmap - check the documentation for more. I've shown a few of the ones I commonly use here.

Attached point in ggplot

I want to connect these different points by geom_line. It does not work. Can someone help me identify the problem?
# Build the data frame directly. cbind() on mixed character/numeric vectors
# yields a character matrix, which is why an as.character()/as.numeric()
# round trip was needed afterwards; data.frame() keeps the columns numeric.
df_st <- data.frame(
  ID = c(
    "ID_201", "ID_202", "ID_203", "ID_204", "ID_205", "ID_206",
    "ID_207", "ID_208", "ID_209", "ID_210", "ID_211", "ID_212"
  ),
  PARAM_1 = c(48.4, 17.6, 19.2, 23.6, 23.7, 17.8, 16.5, 18.2, 17.6, 19.7, 14.3, 15.7),
  PARAM_2 = c(14.615, 8.06, 7.83, 10.81, 10.635, 9.44, 7.54, 8.86, 6.855, 8.68, 7.36, 6.695),
  PARAM_3 = c(19.8, 10.3, 10.2, 13.6, 13.8, 11.9, 9.4, 11.2, 8.9, 11.3, 9.0, 9.0)
)
# Long format: one row per (ID, parameter) pair, with the parameter name in
# `variable` and its measurement in `value`.
df_st_g <- df_st %>%
  dplyr::select(ID, PARAM_1, PARAM_2, PARAM_3) %>%
  pivot_longer(
    cols = PARAM_1:PARAM_3,
    names_to = "variable",
    values_to = "value"
  )
ggplot(df_st_g, aes(x = ID, y = value)) +
  geom_point(aes(color = variable)) +
  theme_classic()
Do you mean something like this:
library(tidyverse)
library(tidyr)
#Data
df_st <- structure(list(ID = c("ID_201", "ID_202", "ID_203", "ID_204",
"ID_205", "ID_206", "ID_207", "ID_208", "ID_209", "ID_210", "ID_211",
"ID_212"), PARAM_1 = c(48.4, 17.6, 19.2, 23.6, 23.7, 17.8, 16.5,
18.2, 17.6, 19.7, 14.3, 15.7), PARAM_2 = c(14.615, 8.06, 7.83,
10.81, 10.635, 9.44, 7.54, 8.86, 6.855, 8.68, 7.36, 6.695), PARAM_3 = c(19.8,
10.3, 10.2, 13.6, 13.8, 11.9, 9.4, 11.2, 8.9, 11.3, 9, 9)), row.names = c(NA,
-12L), class = "data.frame")
# Stack every column except ID into `name` / `value` pairs.
df2 <- df_st %>%
  pivot_longer(cols = -ID)
# Plot: ID is discrete, so grouping by parameter name lets the lines connect.
ggplot(df2, aes(x = ID, y = value, colour = name, group = name)) +
  geom_point() +
  geom_line() +
  theme_classic()

Resources