Related
I have performed a Spearman correlation on my data. Then I tried to cluster and plot my data using the "ward.D2" method for corrplot() and pheatmap(). However, the order of the variables is different between the two plots.
Could someone help me clarify this point, thus correcting my code and creating the two plots with the same order of clustered variables?
Thank you so much.
#A sample of my dataset:
dput(Data_corr)
structure(list(S_cHDLP = c(0.299999999999999, -2.78, 0.880000000000001,
2.48, 2.15, 5.31, 3.02, 1.19, 2.1, -1.18, -0.34, 1.25, -3.25,
-3.16, 0.19, -0.100000000000001, -2.16, -0.220000000000001, 0.77,
-2.12), H7P = c(-0.18, -0.48, -0.13, -0.21, 0.07, 0.64, -0.13,
-0.1, 0.12, -0.22, 0.09, -0.0399999999999999, -1.56, 0.39, 0.58,
-0.49, 0.2, 0.13, 0.11, 0.06), H6P = c(0, 0, 0, 0.16, -0.23,
0, 0, 0, -0.26, -0.28, 0.06, -0.17, 1.16, -0.12, -0.32, -0.17,
0.38, 0.05, 0.01, 0), H5P = c(0, 0.84, 0.47, 1.21, 0.01, 0.21,
1.36, 0.2, -0.12, 0.93, -1.01, 0, -0.58, -0.97, -1, 0.97, -0.89,
0.35, -0.59, -0.12), H4P = c(-0.12, -1.27, -0.18, 0.25, 1.02,
1.26, -0.62, -0.16, 0.25, -0.01, 0.44, 0.17, 0.19, 0.97, 2.35,
0.3, -0.18, 0.03, 0.0899999999999999, 0.38), H3P = c(-0.31, 0.39,
0.13, 0.29, 0, 0.02, -0.07, 0, 0, -0.32, 0, -0.79, 0, -0.53,
-0.71, -0.2, 2.08, 0.86, 0, 0), H2P = c(-1.28, -0.619999999999999,
-1.07, 1.96, 0.15, 4.92, 1.55, 3, -0.459999999999999, -0.56,
1.12, 3.44, -1.48, -1.27, 1.45, 0.609999999999999, -1.59, -1.57,
2.04, 2.03), H1P = c(1.58, -2.15, 1.96, 0.51, 2, 0.37, 1.47,
-1.83, 2.56, -0.62, -1.46, -2.19, -1.77, -1.9, -1.25, -0.73,
-0.57, 1.35, -1.28, -4.14), TRLZ_TRL = c(4.61, 1.49, -2.71, 1.54,
-5.46, 2.18, 3.48, 12.83, 7.51, 7.74, -8.38, -0.729999999999997,
6.11, -19.74, -0.869999999999997, -1.82, -1.57000000000001, 0.609999999999999,
-14.79, -18.65), LDLZ = c(-0.0599999999999987, -0.400000000000002,
-0.289999999999999, -1.2, -0.479999999999997, -0.59, -1.29, -0.0599999999999987,
0.210000000000001, -1.58, 1.97, 0.0800000000000018, -1, 1.95,
1.41, 0.00999999999999801, 0.430000000000003, -0.289999999999999,
0.68, 0.52), HDLZ = c(-0.200000000000001, -0.200000000000001,
-0.0700000000000003, 0, -0.0200000000000014, -0.0199999999999996,
-0.0399999999999991, -0.119999999999999, -0.0900000000000016,
-0.0500000000000007, -0.15, -0.16, -0.640000000000001, 0.42,
0.16, -0.130000000000001, 0.15, 0.41, -0.0300000000000011, 0.18
)), class = "data.frame", row.names = c(NA, -20L))
library(pheatmap)
library(corrplot)
# Spearman correlations between every pair of columns of Data_corr,
# computed on pairwise-complete observations.
CorMethod <- "spearman"
CorMatrix <- cor(Data_corr, use = "pairwise.complete.obs", method = CorMethod)

## 1st plot: clustered heat map of the correlation matrix
## (ward.D2 clustering on rows and columns, rows cut into 3 groups).
Plot3 <- pheatmap(
  CorMatrix,
  clustering_method = "ward.D2",
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  cutree_rows = 3,
  treeheight_col = 0,
  angle_col = 45,
  fontsize_row = 7,
  fontsize_col = 5
)

## 2nd plot: lower-triangle corrplot, variables ordered by corrplot's own
## hierarchical clustering (ward.D2).
Plot8 <- corrplot(
  CorMatrix,
  method = "square",
  type = "lower",
  order = "hclust",
  hclust.method = "ward.D2",
  tl.pos = "ld",
  tl.cex = 0.5,
  tl.col = "black",
  tl.srt = 45
)
You can create a corrplot with the same order given by pheatmap as follows:
# 2nd plot: redraw the corrplot using the row order of the pheatmap
# dendrogram, so both figures list the variables in the same sequence.
library(RColorBrewer)
ord <- Plot3$tree_row$order
ReordCorMatrix <- CorMatrix[ord, ord]
# order = "original" keeps the matrix exactly as supplied. corrplot only
# consults hclust.method when order = "hclust", so it is not passed here.
Plot8 <- corrplot(
  ReordCorMatrix,
  method = "square", type = "lower", order = "original",
  tl.pos = "ld", tl.cex = 0.5, tl.col = "black", tl.srt = 45,
  col = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100)
)
I would like to plot a time series of the Standardized Precipitation Index (SPI). Normally this looks somewhat like this:
You can see that the area under/above the curve is colored in blue/red. This is what I would like to plot too.
I know that there were similar questions, like this and that one. They brought me a bit further, but unfortunately not to the final result yet.
To make it easier to understand, here is some code so that everybody can reproduce it:
library(data.table)
library(ggplot2)
vec1 <- 1:310
vec2 <- c(1.78, 1.88, 1.10, 0.42, 0.73, 1.35, 1.34, 0.54, 0.20, 0.72, 1.29, 1.78, 1.30, 1.37, -0.13, 0.64, -0.13, 0.87, 0.47, -0.26, -0.27,
-0.81, -0.54, -0.77, -0.29, -0.22, -0.05, 0.41, 0.45, 0.91, -0.31, 0.67, 0.28, 0.93, 0.43, -0.04, -0.80, -1.20, -0.73, -0.98, 0.47, -0.01,
1.30, 1.45, 0.72, -0.59, -1.14, -0.33, 0.22, 0.49, 0.58, 0.36, 0.66, 0.64, 0.47, -0.60, 1.01, 1.50, 1.18, 0.82, 0.02, 0.57, 0.25,
1.20, 1.19, 0.71, -0.30, -1.37, -1.50, -1.03, -0.77, -1.08, -1.92, -2.32, -2.46, -1.61, -0.39, 0.67, 0.38, 0.62, -0.34, 0.01, -0.55, -0.74,
-1.95, -1.18, -0.96, 0.36, -0.96, -1.28, -2.29, -2.67, -0.65, -0.13, 0.61, 0.21, 0.57, 0.11, 0.37, 0.20, -0.14, -0.87, -0.84, 0.87, 1.33,
0.45, -0.76, -1.27, -0.65, -0.29, 0.54, 0.14, -0.55, -0.94, -0.98, -0.44, -0.37, 0.72, 0.70, 0.95, 0.89, 1.10, 1.51, 1.11, 1.77, 1.20,
1.23, -0.72, -1.43, -2.11, -1.37, -0.80, -0.34, -0.14, 0.22, -0.65, -0.44, -0.86, -0.46, -0.67, -0.91, -0.40, -0.09, 0.22, 0.96, 0.71, 0.51,
-1.61, -1.62, -1.43, -0.27, 1.08, 1.76, 1.30, 0.78, 1.02, 1.01, 0.56, -0.32, 0.37, 0.31, 1.36, 1.49, 1.42, 0.78, -0.19, 0.64, 0.39,
0.47, -1.13, -1.45, -0.52, 0.43, -0.19, -0.97, -0.27, 0.63, 1.01, 1.01, 0.83, -0.56, -1.71, -0.29, 1.06, 1.82, 1.28, 0.88, 1.08, 1.78,
1.47, 0.74, -0.34, 0.14, 1.09, 1.49, 1.30, 0.28, -0.25, 0.24, -0.33, 0.05, -0.86, -0.69, -1.03, -0.59, 0.32, 0.61, 0.84, -0.18, -0.67,
0.46, 0.31, -0.72, -2.26, -2.85, -0.69, -0.77, 0.64, -1.49, -1.69, -1.55, -0.28, -0.80, -1.15, -0.38, 0.31, 0.18, -0.27, -0.84, -0.94, -1.23,
-0.53, -1.52, -0.73, -0.93, 0.25, -0.11, 0.38, 0.48, 0.10, -0.02, 0.26, 1.39, 1.61, 0.83, 0.09, 0.95, 1.07, 0.77, 0.23, 0.26, 0.85,
0.93, 0.91, 1.10, 0.47, 0.74, 1.42, 1.17, 0.32, -0.40, 0.76, 1.44, 1.69, 1.03, 0.01, 0.46, 0.61, 0.60, -0.09, -0.31, -0.96, -0.91,
-0.06, 0.75, 1.32, 1.29, 0.55, 0.43, -1.25, 0.12, -0.05, 0.18, -0.77, -2.19, -1.85, -2.12, -1.51, -1.14, -0.79, -0.82, -1.13, -1.72, -2.14,
-1.95, -0.63, 0.70, 0.64, 0.17, -1.04, -0.58, -0.57, -0.57, -1.05, -1.11, -0.59, -0.07, 1.22, 0.30, -0.15)
# Pair the index (ID) with the SPI values and store them as a data.table.
df <- as.data.table(data.frame(ID = vec1, SPI = vec2))
There you have the entire SPI time series, one value for each month between 1980 and 2005.
So far I came to that result:
# Column chart of SPI against ID: non-positive values are drawn in red,
# non-negative values in blue (each layer gets its own subset of df).
ggplot(df, aes(ID, SPI)) +
  geom_col(fill = "red", data = df[SPI <= 0]) +
  geom_col(fill = "blue", data = df[SPI >= 0]) +
  theme_bw()
When you run the code you will see the following plot:
This is going into the right direction, but the small white gaps between values in the first half are disturbing and not supposed to be there. So there must be something wrong.
It is supposed to look like this, just with the two colors blue and red for the positive and negative values:
# Single-colour area chart of SPI against ID, for comparison with the
# column version.
ggplot(df, aes(ID, SPI)) +
  geom_area() +
  theme_bw()
The code leads to this image:
You can see that there are no white gaps between the single values and I have absolutely no idea what leads to these errors.
Anybody with an idea how to solve that?
OP, what you are observing are the artifacts related to the resolution of your graphics driver and the space between columns. The areas you show are composed of many filled columns next to one another on the x axis. You do not specify the width= argument for geom_col(), so the default value leaves a space between the individual values on the x axis. It's best to illustrate if we take only a section of your data along the x axis:
# Zoom in on the first 100 IDs so the gaps between columns become visible.
# coord_cartesian() only changes the visible window; xlim() would instead
# discard every row outside the limits (with a warning) before drawing.
ggplot(data = df, aes(x = ID, y = SPI)) +
  geom_col(data = df[SPI <= 0], fill = "red") +
  geom_col(data = df[SPI >= 0], fill = "blue") +
  theme_bw() +
  coord_cartesian(xlim = c(0, 100)) # just the first part on the left
There are your white lines - they are the space between the columns. When you have the larger picture, the appearance of the white lines has to do with the resolution of your graphics device. You can test this if you save your graphic with ggsave() using different parameters for dpi=. For example, on my computer saving ggsave('filename.png', dpi=72) gives no lines, but ggsave('filename.png', dpi=600) shows the white lines in places.
There's an easy solution to this though, which is to specify the width= argument of geom_col() to be 1. By default, the width is 0.9 (90% of the resolution of the data), which leaves a gap before the next column. If you set this to 1, each column fills 100% of the space allotted to it, leaving no white space in between:
# Same zoomed view, but with width = 1 so every column fills its whole slot
# and no white gap is left between neighbouring columns.
# coord_cartesian() is used for the zoom instead of xlim(): with width = 1 the
# column at ID 100 extends to 100.5, and xlim(0, 100) would drop it (together
# with every row beyond the limit) rather than just cropping the view.
ggplot(data = df, aes(x = ID, y = SPI)) +
  geom_col(data = df[SPI <= 0], fill = "red", width = 1) +
  geom_col(data = df[SPI >= 0], fill = "blue", width = 1) +
  theme_bw() +
  coord_cartesian(xlim = c(0, 100))
What if you change the distance between columns?
Just added "width=1" inside "aes" of ggplot.
# Full series with the column width supplied through the plot-level aes(),
# so both geom_col() layers inherit width = 1.
ggplot(df, aes(ID, SPI, width = 1)) +
  geom_col(fill = "red", data = df[SPI <= 0]) +
  geom_col(fill = "blue", data = df[SPI >= 0]) +
  theme_bw()
I get this:
image.result
I do apologise in advance but I'm not an R expert (at all). I used R rarely and the last time, I had to convert my data as tibble in order to plot them correctly, and now I did the same but I believe it would be better to convert them into POSIXct (but I don't know how to do it :confused: )
I have this csv:
structure(list(Date = c("01/02/2003 01:01:01", "01/03/2003 01:01:01",
"01/04/2003 01:01:01", "01/05/2003 01:01:01", "01/06/2003 01:01:01",
"01/07/2003 01:01:01", "01/08/2003 01:01:01", "01/09/2003 01:01:01",
"01/10/2003 01:01:01", "01/11/2003 01:01:01", "01/12/2003 01:01:01",
"01/01/2004 01:01:01", "01/02/2004 01:01:01", "01/03/2004 01:01:01",
"01/04/2004 01:01:01", "01/05/2004 01:01:01", "01/06/2004 01:01:01",
"01/07/2004 01:01:01", "01/08/2004 01:01:01", "01/09/2004 01:01:01",
"01/10/2004 01:01:01", "01/11/2004 01:01:01", "01/12/2004 01:01:01",
"01/01/2005 01:01:01", "01/02/2005 01:01:01", "01/03/2005 01:01:01",
"01/04/2005 01:01:01", "01/05/2005 01:01:01", "01/06/2005 01:01:01",
"01/07/2005 01:01:01", "01/08/2005 01:01:01", "01/09/2005 01:01:01",
"01/10/2005 01:01:01", "01/11/2005 01:01:01", "01/12/2005 01:01:01",
"01/01/2006 01:01:01", "01/02/2006 01:01:01", "01/03/2006 01:01:01",
"01/04/2006 01:01:01", "01/05/2006 01:01:01", "01/06/2006 01:01:01",
"01/07/2006 01:01:01", "01/08/2006 01:01:01", "01/09/2006 01:01:01",
"01/10/2006 01:01:01", "01/11/2006 01:01:01"), Pz1 = c(0.53,
0.25, 0.3, 0.51, 0.23, 0.52, 0.48, 0.36, 0.56, 0.27, 0.44, 0.28,
0.79, 0.15, 0.73, 0.44, 0.5, 0.26, 0.1, 0.26, 0.69, 0.38, 0.51,
0.39, 0.42, 0.29, 0.68, 0.62, 0.5, 0.06, 0.29, 0.13, 0.6, 0.21,
0.34, 0.17, 0.39, 0.21, 0.89, 0.19, 0.44, 0.53, 0.55, 0.89, 0.55,
0.65), Pz2 = c(0.62, 0.99, 0.87, 0.77, 0.51, 0.66, 0.4, 0.68,
0.87, 0.13, 0.81, 0.29, 0.11, 0.11, 0.23, 0.71, 0.85, 0.05, 0.78,
0.32, 0.16, 0.54, 0.65, 0.09, 0.97, 0.81, 0.49, 0.36, 0.37, 0.78,
0.04, 0.67, 0.91, 0.12, 0.34, 0.3, 0.71, 0.04, 0.73, 0.33, 0.59,
0.23, 0.82, 0.04, 0.04, 0.82), Pz3 = c(0.2, 0.76, 0.04, 0.39,
0.58, 0.49, 0.44, 0.12, 0.16, 0.12, 0.95, 0.95, 0.08, 0.68, 0.57,
0.49, 0.58, 0.46, 0.39, 0.51, 0.69, 0.09, 0.68, 0.18, 0.3, 0.75,
0.76, 0.85, 0.17, 0.6, 0.45, 0.26, 0.65, 0.07, 0.7, 0.71, 0.47,
0.79, 0.58, 0.08, 0.37, 0.86, 0.23, 0.31, 0.06, 0.1), Pz4 = c(0.21,
0.65, 0.67, 0.45, 0.32, 0.79, 0.94, 0.78, 0.73, 0.83, 0.79, 0.46,
0.07, 0.84, 0.25, 0.27, 0.77, 0.37, 0.16, 0.67, 0.88, 0.67, 0.87,
0.95, 0.63, 0.61, 0.21, 0.21, 0.4, 0.74, 0.62, 0.22, 0.08, 0.67,
0.2, 0.18, 0.83, 0.3, 0.15, 0.7, 0.5, 0.43, 0.81, 0.17, 0.31,
0.66), Pz5 = c(0.57, 0.78, 1, 0.87, 0.88, 0.5, 0.24, 0.71, 0.11,
0.4, 0.08, 0.2, 0.67, 0.41, 0.28, 0.45, 0.6, 0.18, 0.27, 0.02,
0.96, 0.48, 0.95, 0.01, 0.8, 0.07, 0.34, 0.09, 0.19, 0.59, 0.34,
0.66, 0.48, 0.86, 0.97, 0.76, 0.93, 0.21, 0.5, 0.93, 0.41, 0.33,
0.32, 0.12, 0.42, 0.94), Pz6 = c(0.42, 0.34, 0.34, 0.73, 0.7,
0.67, 0.09, 0.45, 0.55, 0.88, 0.05, 0.15, 0.85, 0.02, 0.42, 0.14,
0.68, 0.71, 0.57, 0.14, 0.85, 0.81, 0.2, 0.97, 0.42, 0.59, 0.23,
0.39, 0.5, 0.87, 0.37, 0.63, 0.7, 0.3, 0.33, 0.29, 0.9, 0.75,
0.38, 0.17, 0.87, 0.45, 0.79, 0.74, 0.21, 0.05)), class = "data.frame", row.names = c(NA,
-46L))
Now...
So far I managed to get them into the plot style I need, using the following code:
# Load the CSV; `b` is a working copy of the raw table.
a <- read.csv("C:/Users/simon/Desktop/4.csv")
b <- a

# Parse the Date strings with dmy_hms(), sort by date, and convert to a
# time-aware tibble indexed on Date.
c <- b %>%
  mutate(Date = dmy_hms(Date)) %>%
  arrange(Date) %>%
  as_tbl_time(index = Date)
df <- c

# Long format: one row per (Date, variable) pair.
df_melt <- melt(df, id = "Date")

# One line chart per variable, two columns of panels, independent y scales.
d <- ggplot(df_melt, aes(x = Date, y = value)) +
  geom_line() +
  facet_wrap(~ variable, scales = "free_y", ncol = 2)
d + theme_bw()
Which gives this:
I know, it's absolutely horrible, but it was the only way I remember. What I do really need is to add a shaded rectangle in every plot (at the same date); something like this, for instance:
Could you kindly help me with this matter, please?
I don't mind changing the data format to POSIXct (if required for ggplot), anything really, as long as it looks like this.
Thanks a lot!
P.S. I don't need the data with the time (i.e., 01:01:01 can be removed if needed!)
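Add the line geom_rect(aes(xmin=as.POSIXct("2006/01/01"), xmax=as.POSIXct("2006/06/01"), ymin=0, ymax=1), alpha=.01) to your ggplot code. Because the constants sit inside aes(), the rectangle is drawn once for every row of the data and the copies stack on top of each other, which is why alpha has to be so small. Alternatively, annotate("rect", xmin=as.POSIXct("2006/01/01"), xmax=as.POSIXct("2006/06/01"), ymin=-Inf, ymax=Inf, alpha=.2) draws a single rectangle in each panel that spans the full height of the panel.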
# Observed series (52 values).
x <- c(
  0.80, 1.23, -0.13, -0.65, 0.53, -0.10, -0.96, 1.63, 2.79, 2.02, -1.03, -0.86,
  0.58, 0.59, 0.60, -1.77, -0.77, -0.73, -0.43, 2.60, -0.81, -2.81, -2.13, 1.66,
  1.54, -0.15, -0.31, 0.09, 2.47, 0.24, -0.75, 2.09, 0.46, -0.80, -0.50, 2.58,
  0.80, 0.39, 0.82, -0.58, -1.09, -0.29, -1.26, -1.72, 1.54, 1.06, 1.21, 0.15,
  -0.57, -0.32, -1.44, -1.56
)

# Sample autocovariances for lags 0 to 4; acf() also draws its default plot.
z <- acf(x, type = "covariance", lag.max = 4)
z
I am stuck in Number 3 question of this image:
I have some data of x values called "norm". I want to plot a histogram and then plot the density function of a beta with parameters 3.5 and 3 onto the histogram. The main goal of this is to show that the beta fits the norm data. I also need the scale of the y axis to match for both the density and the histogram. I got a plot at one point but the density function was very low because the histogram counted so the y axis went to 30 in my case and obviously the density line was <1.
Here is my code:
# Grid of 115 points on [0, 1] and the Beta(3.5, 3) density evaluated on it.
x <- seq(0,1, len = 115)
db_trial = dbeta(x, 3.5, 3.0)
# Attempt to overlay the beta curve on a histogram of `norm`.
# NOTE(review): no `x` aesthetic is mapped for geom_histogram() (neither in
# ggplot() nor in the layer), which matches the reported
# "missing aesthetics: x" error.
# NOTE(review): `...density...` has three dots on each side; ggplot2's
# computed-variable notation is `..density..` / after_stat(density) - confirm.
# NOTE(review): geom_line() uses stat = "density", which would estimate a
# density from the mapped values rather than draw db_trial as given - verify.
ggplot(data = norm)+
geom_line(aes(x,db_trial), col = "red", stat = "density")+
geom_histogram(aes(y = ...density...), bins = 10, alpha = .2, fill =
"green", col = "red")
Here is my data set norm which is just the presidents data set in R but divided by 100.
# dput(norm)
structure(list(approval_rate = c(0.87, 0.82, 0.75, 0.63, 0.5,
0.43, 0.32, 0.35, 0.6, 0.54, 0.55, 0.36, 0.39, 0.69, 0.57, 0.57,
0.51, 0.45, 0.37, 0.46, 0.39, 0.36, 0.24, 0.32, 0.23, 0.25, 0.32,
0.32, 0.59, 0.74, 0.75, 0.6, 0.71, 0.61, 0.71, 0.57, 0.71, 0.68,
0.79, 0.73, 0.76, 0.71, 0.67, 0.75, 0.79, 0.62, 0.63, 0.57, 0.6,
0.49, 0.48, 0.52, 0.57, 0.62, 0.61, 0.66, 0.71, 0.62, 0.61, 0.57,
0.72, 0.83, 0.71, 0.78, 0.79, 0.71, 0.62, 0.74, 0.76, 0.64, 0.62,
0.57, 0.8, 0.73, 0.69, 0.69, 0.71, 0.64, 0.69, 0.62, 0.63, 0.46,
0.56, 0.44, 0.44, 0.52, 0.38, 0.46, 0.36, 0.49, 0.35, 0.44, 0.59,
0.65, 0.65, 0.56, 0.66, 0.53, 0.61, 0.52, 0.51, 0.48, 0.54, 0.49,
0.49, 0.61, 0.68, 0.44, 0.4, 0.27, 0.28, 0.25, 0.24, 0.24, 0.01
)), .Names = "approval_rate", row.names = c(NA, -115L), class = "data.frame")
This returns the error "stat_bin requires the following missing aesthetics: x". What am I doing wrong? I am a novice with ggplot2.
It's usually better to use stat_function for this type of thing. Note that I'm technically using an anonymous function that wraps dbeta, so you can adjust the height of the curve via multiplication.
# Histogram of the approval rates (counts) with the Beta(3.5, 3) density
# overlaid on the same y scale.
# The bin width is set explicitly so the curve can be scaled exactly: a
# density is converted to expected counts per bin by multiplying by
# (number of observations) * (bin width). This replaces a hand-picked
# multiplier and the implicit 30-bin default.
bin_width <- 0.05
g <- ggplot(data = norm, aes(x = approval_rate)) +
  geom_histogram(binwidth = bin_width) +
  stat_function(
    fun = function(x) {
      dbeta(x, shape1 = 3.5, shape2 = 3.0) * nrow(norm) * bin_width
    },
    color = "red"
  )
print(g)