r - Add significance level to correlation heatmap [duplicate] - r

This question already has answers here:
Significance level added to matrix correlation heatmap using ggplot2
(3 answers)
Closed 9 years ago.
I have the following data frame df (appended)
I have written a short script to plot a correlation heatmap
# Load plotting / reshaping helpers.
library(ggplot2)
library(plyr)
library(reshape2)
library(gridExtra)
# Load data frame.  read.csv() already returns a data.frame, so the extra
# data.frame() wrapper in the original was redundant.
df <- read.csv("~/Documents/wig_cor.csv", sep = "\t")
# Correlation matrix of the numeric columns only.
# NOTE(review): the name `c` shadows base::c(); it is kept because later
# snippets reference it (e.g. lower.tri(c)), but a name like `cor_mat`
# would be safer.
c <- cor(df[sapply(df, is.numeric)])
#Plot all data
# Plot all data: one full correlation heatmap per analysis Method.
plots <- dlply(df, .(Method), function(sub_df) {
  cor_long <- melt(cor(sub_df[sapply(sub_df, is.numeric)]))
  ggplot(cor_long, aes(x = Var1, y = Var2, fill = value)) +
    geom_tile(aes(fill = value), colour = "white") +
    geom_text(aes(label = sprintf("%1.2f", value)), vjust = 1) +
    theme_bw() +
    theme(legend.position = "none") +
    scale_fill_gradient2(midpoint = 0.8, low = "white", high = "steelblue")
})
#Plot by EF Analysis Method
# Plot by EF Analysis Method: lower triangle only (off-diagonal cells),
# with the variable names printed on the diagonal instead of tiles.
plots <- dlply(df, .(Method), function(sub_df) {
  cor_long <- melt(cor(sub_df[sapply(sub_df, is.numeric)]))
  lower <- subset(cor_long[lower.tri(c), ], Var1 != Var2)
  diag_labels <- subset(cor_long, Var1 == Var2)
  ggplot(lower, aes(x = Var1, y = Var2, fill = value)) +
    geom_tile(aes(fill = value), colour = "white") +
    geom_text(aes(label = sprintf("%1.2f", value)), vjust = 1) +
    theme_bw() +
    scale_fill_gradient2(name = "R^2", midpoint = 0.7,
                         low = "white", high = "red") +
    xlab(NULL) +
    ylab(NULL) +
    theme(axis.text.x = element_blank(),
          axis.text.y = element_blank(),
          axis.ticks = element_blank(),
          panel.border = element_blank()) +
    ggtitle(sub_df$Method) +
    theme(plot.title = element_text(lineheight = 1, face = "bold")) +
    geom_text(data = diag_labels, aes(label = Var1), vjust = 3)
})
#Function to grab legend
# Extract the legend ("guide-box") grob from a ggplot so it can be placed
# independently with grid.arrange().
#
# a.gplot: a ggplot object that has a visible legend.
# Returns: the legend grob.
g_legend <- function(a.gplot) {
  built <- ggplot_gtable(ggplot_build(a.gplot))
  leg <- which(sapply(built$grobs, function(x) x$name) == "guide-box")
  # Fail with a clear message when the plot has no legend (e.g. when
  # legend.position = "none"), instead of an opaque subscript error.
  if (length(leg) == 0) {
    stop("g_legend(): plot has no legend ('guide-box' grob not found)",
         call. = FALSE)
  }
  built$grobs[[leg[1]]]
}
# Grab the shared legend from one of the per-method plots.
legend <- g_legend(plots$WIG_Method)
# NOTE(review): hard-coded absolute output path; this will only work on the
# original author's machine.
png(file = "/misc/croc_common/physics/jamie/Portfolio/WesternEF/EFCorrelations.png", width = 1200, height = 400)
# Legend in the first (narrow) column, followed by the four method heatmaps,
# each with its own legend suppressed.
grid.arrange(legend,plots$Single_ROI+theme(legend.position='none'), plots$Simple_2_ROI+theme(legend.position='none'),plots$WIG_Method+theme(legend.position='none'), plots$WIG_drawn_bg+theme(legend.position='none'), ncol=5, nrow=1, widths=c(1/17,4/17,4/17,4/17,4/17))
dev.off()
However, I would like to use stars to highlight the statistical significance of each correlation, as described here, but I am completely lost on how to do this. Any guidance would be appreciated.
structure(list(Study = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L,
16L, 17L, 18L, 19L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
19L), .Label = c("WCBP12236", "WCBP12241", "WCBP12242", "WCBP12243",
"WCBP12245", "WCBP13001", "WCBP13002", "WCBP13003", "WCBP13004",
"WCBP13005", "WCBP13006", "WCBP13007", "WCBP13008", "WCBP13009",
"WCBP13010", "WCBP13011", "WCBP13012", "WCBP13013", "WCBP13014"
), class = "factor"), G1 = c(68, 68.6, 66.6, 73.1, 51.6, 50.1,
64.1, 73, 63.7, 43.2, 62.3, 59.2, 67.5, 68.2, 54.6, 67.9, 56.5,
54.2, 67.3, 68, 68.4, 67.9, 73.3, 51.7, 50.3, 63.9, 73.9, 64,
42.9, 62.5, 59.3, 66.7, 68.4, 54, 68.2, 56.8, 54.5, 67, 53.2,
41.4, 53, 52.3, 41, 37.4, 56.9, 65.3, 36.2, 35.3, 36.1, 32.5,
56.5, 47.7, 39.4, 59.6, 38.1, 24.2, 30.2, 68.5, 68.9, 70.7, 74.9,
53.4, 51.6, 65.9, 75.7, 64.7, 42.8, 61.4, 60.8, 69.5, 68.7, 55.9,
70.7, 59.5, 51.1, 69.5), G2 = c(79.8, 72.2, 73.5, 74.4, 50.4,
54.8, 63.1, 70.4, 63.6, 45.1, 65.3, 49.4, 65.3, 76.2, 51, 63.9,
58.7, 57.8, 67, 79.6, 72.1, 73.9, 74.7, 50.5, 55.1, 62.8, 70.5,
63.3, 44.6, 65.5, 48.9, 64.9, 76.3, 50.6, 64.8, 58.6, 58.3, 67.4,
51.2, 37.7, 49.1, 53.7, 44.6, 37.3, 54.9, 64.1, 33.8, 31.9, 34.2,
30.3, 56.2, 44.6, 38.2, 63.2, 35.8, 26.5, 27.6, 80.6, 71.6, 75.4,
77.1, 52.4, 56.3, 66, 72.3, 64.5, 38.2, 64.3, 49.2, 66.9, 77.1,
52.4, 67.5, 59.6, 55.6, 69.9), S1 = c(75.1, 65.9, 72.7, 68.8,
49, 57.5, 66.5, 74.1, 60.9, 51.8, 58, 64.3, 71.1, 71.4, 58.9,
62.2, 58, 57.7, 58.6, 75.2, 66, 73.2, 69.7, 48.9, 57.7, 66.5,
74.7, 60.8, 51.4, 58.9, 65.5, 70.5, 71.4, 58.9, 65.1, 60.8, 57.7,
58.4, 54.3, 40.2, 52.6, 60.5, 42.6, 34.1, 55, 64.7, 36.3, 32.5,
39, 38.8, 58.1, 48, 40.5, 61, 40, 26.4, 28.8, 76.4, 66.5, 73.9,
72, 50.7, 59.2, 69.9, 76.3, 62.4, 50, 58.5, 66.6, 73.7, 72.3,
62.6, 69.6, 62.7, 57.9, 61.1), S2 = c(76.6, 71.6, 71.2, 72.7,
51.6, 56.7, 65.9, 73.5, 63.6, 55.2, 62.6, 62.2, 69.1, 71.1, 56.8,
61, 61.7, 60, 55.7, 76.9, 71.6, 72.3, 73.2, 51.7, 56.8, 64.5,
74.9, 63.6, 51.3, 63, 62.8, 68.7, 71.3, 56.8, 64.2, 62.8, 60.4,
55.8, 53.6, 42.5, 50, 54.4, 42.2, 36.4, 57.7, 64.1, 35.1, 30.8,
39.1, 37.4, 58.7, 47.8, 42, 58.8, 39.4, 24.2, 28.2, 78.2, 73.3,
72.3, 75.6, 53.4, 57.8, 68.3, 76.6, 63.7, 51.7, 63.4, 63.3, 71.5,
72.3, 60.2, 67.1, 65.5, 58.2, 59.1), Method = structure(c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Simple_2_ROI",
"Single_ROI", "WIG_drawn_bg", "WIG_Method"), class = "factor")), .Names = c("Study",
"G1", "G2", "S1", "S2", "Method"), row.names = c(NA, -76L), class = "data.frame")

Here's an example of one way to do it with ggplot. You essentially add the significance stars as characters to a dataframe and plot them as text on the heatmap: https://github.com/andrewheiss/Attitudes-in-the-Arab-World/blob/master/figure12.R

A useful function for getting p values out of the correlation matrix is rcorr from Hmisc. Using it, I got this:
In each cell of the correlation matrix, there is a pair of numbers: The upper one represents the coefficient of correlation (as does the color gradient of the cell), while the lower one represents the p value. Is this what you wanted? (See the bottom of the answer for improved response, whereby I convert p values into stars...)
I proceeded as follows:
Since your p values would be VERY small in this data frame, I have used jitter and stripped the amount of observations so as to decrease the statistical significance. The reason for that is that very low p values would be very hard to read in a correlation matrix of this type. Consequently, the result does not make much sense from a statistical point of view but it demonstrates nicely how the significance levels can be added to the matrix.
First, jitter it and limit the number of observations:
# Jitter the numeric columns and keep only a few observations so the
# correlations weaken and their p values become large enough to read.
# set.seed() makes the jittered demo reproducible (the second snippet in
# this answer already seeds; added here for consistency).
set.seed(12345)
mydf <- df
mydf[, 2:5] <- sapply(mydf[, 2:5], jitter, amount = 20)
mydf <- mydf[c(1:5, 20:24, 39:43, 58:62), ]
Then calculate r coefficient and p values:
library(Hmisc)
# rcorr() returns both the correlation coefficients ($r) and their p values
# ($P) in one object; compute it once instead of twice.
rc <- rcorr(as.matrix(mydf[sapply(mydf, is.numeric)]))
c <- rc$r  # correlation coefficients (name kept for later snippets)
p <- rc$P  # p values
Make a plot based on both those values:
# One heatmap per Method: each off-diagonal tile shows the correlation
# coefficient (upper number, also mapped to fill) and its p value (lower
# number).
plots <- dlply(mydf, .(Method), function (x1) {
# Build a data frame of the lower-triangle correlations ($r) plus a parallel
# `pvalue` column taken from the matching p-value matrix ($P).
# NOTE(review): rcorr() is evaluated three times per group here; computing it
# once into a local variable would be cheaper and easier to read.
ggplot(data.frame(subset(melt(rcorr(as.matrix(x1[sapply(x1,is.numeric)]))$r)[lower.tri(c),],Var1 != Var2),
pvalue=subset(melt(rcorr(as.matrix(x1[sapply(x1,is.numeric)]))$P)[lower.tri(p),],Var1 != Var2)$value),
aes(x=Var1,y=Var2,fill=value)) +
geom_tile(aes(fill = value),colour = "white") +
# Coefficient above (vjust = 0), p value below (vjust = 1) inside each tile.
geom_text(aes(label = sprintf("%1.2f",value)), vjust = 0) +
geom_text(aes(label = sprintf("%1.2f",pvalue)), vjust = 1) +
theme_bw() +
scale_fill_gradient2(name="R^2",midpoint=0.25,low = "blue", high = "red") +
xlab(NULL) +
ylab(NULL) +
theme(axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
panel.border=element_blank()) +
ggtitle(x1$Method) + theme(plot.title = element_text(lineheight=1,face="bold")) +
# Variable names printed on the diagonal cells.
geom_text(data = subset(melt(rcorr(as.matrix(x1[sapply(x1,is.numeric)]))$r),Var1==Var2),
aes(label=Var1),vjust=1 )
})
Display plot.
# Arrange the four per-method heatmaps in a 2x2 grid, legends suppressed.
grid.arrange(plots$Single_ROI + theme(legend.position='none'),
plots$Simple_2_ROI + theme(legend.position='none'),
plots$WIG_Method + theme(legend.position='none'),
plots$WIG_drawn_bg + theme(legend.position='none'),
ncol=2,
nrow=2)
Stars instead of p values:
Modify data frame (I leave a few more observations this time):
library(Hmisc)
library(car)   # for Recode()
mydf=df
# Fixed seed so the jittered demo data is reproducible.
set.seed(12345)
mydf[,2:5] = sapply(mydf[,2:5],jitter,amount=15)
# Keep 10 observations from each of the four Method groups.
mydf=mydf[c(1:10,20:29,39:48,58:67),]
Calculate r, p values and recode p values into stars inside the plot function:
# rcorr() returns both matrices in one call; avoid computing it twice.
rc <- rcorr(as.matrix(mydf[sapply(mydf, is.numeric)]))
c <- rc$r  # correlation coefficients
p <- rc$P  # p values
# Same plot as before, but the raw p values are recoded into significance
# stars inside the plotting call.
plots <- dlply(mydf, .(Method), function (x1) {
# Recode maps p <= 0.01 -> '***', 0.01 < p <= 0.05 -> '*', otherwise blank.
# NOTE(review): no '**' tier for 0.001 < p <= 0.01 — confirm this matches
# the convention you want.
# NOTE(review): rcorr() is evaluated three times per group; hoisting it into
# a local variable would simplify this considerably.
ggplot(data.frame(subset(melt(rcorr(as.matrix(x1[sapply(x1,is.numeric)]))$r)[lower.tri(c),],Var1 != Var2),
pvalue=Recode(subset(melt(rcorr(as.matrix(x1[sapply(x1,is.numeric)]))$P)[lower.tri(p),],Var1 != Var2)$value , "lo:0.01 = '***'; 0.01:0.05 = '*'; else = ' ';")),
aes(x=Var1,y=Var2,fill=value)) +
geom_tile(aes(fill = value),colour = "white") +
# Coefficient above the centre of the tile, stars below it.
geom_text(aes(label = sprintf("%1.2f",value)), vjust = 0) +
geom_text(aes(label = pvalue), vjust = 1) +
theme_bw() +
scale_fill_gradient2(name="R^2",midpoint=0.25,low = "blue", high = "red") +
xlab(NULL) +
ylab(NULL) +
theme(axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
panel.border=element_blank()) +
ggtitle(x1$Method) + theme(plot.title = element_text(lineheight=1,face="bold")) +
# Variable names printed on the diagonal cells.
geom_text(data = subset(melt(rcorr(as.matrix(x1[sapply(x1,is.numeric)]))$r),Var1==Var2),
aes(label=Var1),vjust=1 )
})
Display plot.
# Arrange the four per-method heatmaps (legends hidden) in a 2x2 grid.
no_legend <- theme(legend.position = "none")
grid.arrange(plots$Single_ROI + no_legend,
             plots$Simple_2_ROI + no_legend,
             plots$WIG_Method + no_legend,
             plots$WIG_drawn_bg + no_legend,
             ncol = 2,
             nrow = 2)

Related

ggplot coord_polar: Full circle has only 330° instead of 360°. How to fix?

For my Master's thesis I work with trees. I have 36 trees; one tree is my main tree, and for every further tree I recorded the distance and the azimuth (in degrees) relative to the main tree.
When I now plot all the trees, to show where they grow relative to the main tree, the full circle of the plot seems to have only ~330°.
You can see it easily by looking at the labels (which are supposed to be the directions of the sky).
It seems to me that this is because of tree no. 24, since it has the highest azimuth, at 329.4°.
There is no error or any other problem.
How do I fix the plot so that it is a full circle of 360°?
Code
library(ggplot2)
# NOTE(review): without explicit y limits, scale_y_continuous() spans only
# the data range (largest azimut = 329.4), so coord_polar() closes the
# circle at ~330 degrees.  Adding limits = c(0, 360) fixes this — see the
# answer below.
pie <- ggplot(Baumdaten, aes(x = distanz, label = Nr))
pie + coord_polar(theta = "y", start = 0) +
# size uses Baumdaten$... outside aes(); that works but bypasses the
# scale/legend machinery (flagged in the answer's style advice).
geom_point(mapping = aes(distanz,azimut),
size = 15*Baumdaten$baumumfang/max(Baumdaten$baumumfang),
col = "deepskyblue") +
geom_text(y = Baumdaten$azimut, size = 5) +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle=0, vjust = 1, hjust=1, size = 15) )+
# axis.line.y.left = element_line(),
# axis.ticks.y.left = element_line()
# ) +
labs(x = "Distanz zum Zentralbaum [m]", y = NULL) +
# Breaks every 45 degrees, labelled with (German) compass directions.
scale_y_continuous(breaks=c(0,45,90,135,180,225,270,315),
labels=c("N","NO","O","SO","S","SW","W","NW") ) +
ggtitle("") +
theme(plot.title = element_text(hjust = 0.5))
Data
Baumdaten <- structure(list(Nr = 1:36, baumumfang = c(166.42, 124.03, 117.75,
130.31, 125.6, 153.86, 164.85, 147.58, 122.46, 109.9, 141.3,
130.31, 125.6, 122.46, 114.61, 122.46, 139.73, 152.29, 103.62,
119.32, 117.75, 133.45, 111.47, 117.75, 131.88, 166.42, 141.3,
106.76, 103.62, 113.04, 150.72, 166.42, 153.86, 139.73, 125.6,
122.46), baumhoehe = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), distanz = c(0, 3, 5.2,
5, 10, 12.2, 13, 10.8, 13.8, 14.8, 17.7, 27.1, 27, 35, 14.6,
24, 12.1, 18.4, 17.3, 26, 25, 29.5, 31.4, 34.1, 37, 46, 46, 43,
48, 42, 41, 34.4, 25, 24, 20.5, 14.1), azimut = c(0, 113, 261.3,
30.1, 255.3, 222.6, 180.9, 148, 133.2, 112.6, 117.1, 112, 103.3,
101.4, 144.7, 119.4, 288.3, 286.5, 317.2, 303.3, 327.1, 314.7,
325.3, 329.4, 323.5, 325.8, 318.4, 307.6, 301.7, 292, 288.5,
286.5, 284.4, 277.3, 262.9, 242.8), Kronenverlichtung = c(40L,
30L, 40L, 98L, 30L, 50L, 95L, 40L, 30L, 40L, 70L, 40L, 40L, 100L,
40L, 60L, 40L, 95L, 100L, 20L, 20L, 5L, 40L, 100L, 15L, 100L,
90L, 40L, 60L, 100L, 40L, 70L, 50L, 50L, 30L, 80L), Vergilbung = c(0L,
0L, 0L, 0L, 0L, 0L, 3L, 1L, 0L, 0L, 1L, 2L, 0L, 0L, 1L, 0L, 1L,
4L, 0L, 0L, 0L, 0L, 3L, 0L, 0L, 0L, 0L, 0L, 3L, 0L, 0L, 1L, 0L,
0L, 0L, 3L), durchmesser = c(53, 39.5, 37.5, 41.5, 40, 49, 52.5,
47, 39, 35, 45, 41.5, 40, 39, 36.5, 39, 44.5, 48.5, 33, 38, 37.5,
42.5, 35.5, 37.5, 42, 53, 45, 34, 33, 36, 48, 53, 49, 44.5, 40,
39)), row.names = c(NA, -36L), class = "data.frame")
The 330° circle:
Short answer: add limits = c(0, 360) to scale_y_continuous(). Your plot will look like what you appear to have in mind.
Longer answer: I re-formatted & streamlined your code to the following:
ggplot(Baumdaten,
aes(x = distanz, y = azimut, label = Nr)) +
geom_point(aes(size = 15 * baumumfang/max(baumumfang)),
col = "deepskyblue") +
geom_text(size = 5) +
coord_polar(theta = "y", start = 0) +
labs(title = "",
x = "Distanz zum Zentralbaum [m]",
y = NULL) +
scale_y_continuous(breaks = seq(0, 359, by = 45), # less typing for same result
labels = c("N", "NO", "O", "SO", "S", "SW", "W", "NW"),
limits = c(0, 360)) +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(hjust = 1, size = 15),
plot.title = element_text(hjust = 0.5),
legend.position = "none") # hides all legends (necessary since size is
# a mapped aesthetic now)
Some advice on coding practices when it comes to ggplot:
Specify your aesthetic mappings for clarity. i.e. aes(x = distanz, y = azimut) instead of aes(distanz, azimut).
Put common aesthetic mappings in the top level ggplot() so that all geom layers inherit them by default, to minimize repetition. i.e. aes(x = distanz, y = azimut) should be at the top level because both geom_point and geom_text layers use them.
You can also put uncommon aesthetic mappings in the top level ggplot() if there's no confusion. i.e. aes(label = Nr) can be at the top level here, since you only have one text layer, but if you have multiple geom_text / geom_label layers that use different columns from your dataset, it would be simpler to specify these mappings in the respective geom layers instead.
All mappings should go inside aes(), and without using the $ symbol. i.e. aes(size = 15 * baumumfang/max(baumumfang)) instead of size = 15 * Baumdaten$baumumfang / max(Baumdaten$baumumfang).
Follow a consistent order in your ggplot layers, so that they are easier to keep track over time. This becomes particularly useful if you copy off an existing piece of code & modify different details all over the place. E.g. the ggplot2 cheatsheet on RStudio's website uses the following:
Don't specify default details in theme elements. You can check what the defaults are with calc_element. e.g. running calc_element("axis.text.x", theme_bw(base_size = 15)) in console shows its angle / vjust are already 0 & 1 respectively, so you only have to specify theme(axis.text.x = element_text(hjust = 1, size = 15)).

convert dataframe to time series for arima

I am having problems converting the following dataset to a ts object to be used with stats::arima.
I was able to convert it to an xts object, but arima does not seem to like it. Can someone guide me on
how to convert it to ts? I really need to use an arima model here. Thanks.
library(ggfortify)
library(xts)
wt <- structure(list(SampleDate = structure(c(13687, 13694, 13701,
13708, 13715, 13722, 13729, 13736, 13743, 13750, 13757, 13764,
13771, 13778, 13785), class = "Date"), DOC = c(3, 10, 17, 24,
31, 38, 45, 52, 59, 66, 73, 80, 87, 94, 101), AvgWeight = c(1,
1.66666666666667, 2.06666666666667, 2.275, 3.83333333333333,
6.2, 7.4, 8.5, 10.25, 11.1, 13.625, 15.2, 16.375, 17.8, 21.5),
PondName = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "Pond01", class = "factor")), row.names = c(NA,
15L), class = "data.frame")
# stats::arima() expects a plain ts (or numeric vector), not an xts object.
# The weights are sampled weekly starting 2007-06-23, so build a weekly ts
# directly; frequency = 52 weeks/year, starting around week 25 of 2007
# (approximation for 2007-06-23 — adjust if exact calendar alignment
# matters).
pond <- ts(wt$AvgWeight, start = c(2007, 25), frequency = 52)
d.arima <- arima(pond)
autoplot(d.arima, predict = predict(d.arima, n.ahead = 3,prediction.interval = TRUE,level=0.95),
ts.colour='dodgerblue',predict.colour='green',
predict.linetype='dashed',ts.size=1.5,conf.int.fill='azure3') + xlab('DOC') + ylab('AvgWeight-grs') +
theme_bw()
I get this weird plot...

Plotting a multiple linear regression in R using scatter3D() (package plot3D)

I have the following data in a csv file.
y,x1,x2,x3,x4,x5,x6,x7,x8,x9
10,2113,1985,38.9,64.7,4,868,59.7,2205,1917
11,2003,2855,38.8,61.3,3,615,55,2096,1575
11,2957,1737,40.1,60,14,914,65.6,1847,2175
13,2285,2905,41.6,45.3,-4,957,61.4,1903,2476
10,2971,1666,39.2,53.8,15,836,66.1,1457,1866
11,2309,2927,39.7,74.1,8,786,61,1848,2339
10,2528,2341,38.1,65.4,12,754,66.1,1564,2092
11,2147,2737,37,78.3,-1,761,58,1821,1909
4,1689,1414,42.1,47.6,-3,714,57,2577,2001
2,2566,1838,42.3,54.2,-1,797,58.9,2476,2254
7,2363,1480,37.3,48,19,984,67.5,1984,2217
example = data.frame(x1,x2,x3,x4,y)
How can I graph the variables x1, x2, x3 using scatter3D(x,y,z)?
I have tried:
library("plot3D")
with(example,scatter3D(y ~ x1 + x2 + x3))
But I get error:
Error in min(x,na.rm) : invalid 'type' (list) of argument
Looks like you want to plot a regression plane. The scatter3d function in package car will do that. You need to install car and rgl. First let's make your data more accessible:
dput(example)
structure(list(y = c(10L, 11L, 11L, 13L, 10L, 11L, 10L, 11L,
4L, 2L, 7L), x1 = c(2113L, 2003L, 2957L, 2285L, 2971L, 2309L,
2528L, 2147L, 1689L, 2566L, 2363L), x2 = c(1985L, 2855L, 1737L,
2905L, 1666L, 2927L, 2341L, 2737L, 1414L, 1838L, 1480L), x3 = c(38.9,
38.8, 40.1, 41.6, 39.2, 39.7, 38.1, 37, 42.1, 42.3, 37.3), x4 = c(64.7,
61.3, 60, 45.3, 53.8, 74.1, 65.4, 78.3, 47.6, 54.2, 48), x5 = c(4L,
3L, 14L, -4L, 15L, 8L, 12L, -1L, -3L, -1L, 19L), x6 = c(868L,
615L, 914L, 957L, 836L, 786L, 754L, 761L, 714L, 797L, 984L),
x7 = c(59.7, 55, 65.6, 61.4, 66.1, 61, 66.1, 58, 57, 58.9,
67.5), x8 = c(2205L, 2096L, 1847L, 1903L, 1457L, 1848L, 1564L,
1821L, 2577L, 2476L, 1984L), x9 = c(1917L, 1575L, 2175L,
2476L, 1866L, 2339L, 2092L, 1909L, 2001L, 2254L, 2217L)),
class = "data.frame", row.names = c(NA, -11L))
install.packages("car")
install.packages("rgl")
library(car)
library(rgl)
scatter3d(y~x1+x2, example)
The plot window will be small. Use the mouse to drag the lower right corner to make it bigger. You can drag within the plot to rotate it.

ggplot with posterior distribution plotted over geom_smooth

I'd like to recreate this plot from this blog post on Posterior predicted distribution for linear regression in JAGS) using ggplot?
Knowing all the extras available for ggplot, what methods are there to go about this?
Here's a bare-bones example of what I've come up with using density to add a geom_path.
library(ggplot2)
#mydat <- read.csv("HtWt30.csv")
mydat <- structure(list(male = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L,
1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L,
0L, 0L, 1L, 0L),
height = c(64, 62.3, 67.9, 64.2, 64.8, 57.5,
65.6, 70.2, 63.9, 71.1, 66.5, 68.1, 62.9, 75.1, 64.6, 69.2, 68.1,
72.6, 63.2, 64.1, 64.1, 71.5, 76, 69.7, 73.3, 61.7, 66.4, 65.7,
68.3, 66.9),
weight = c(136.4, 215.1, 173.6, 117.3, 123.3, 96.5,
178.3, 191.1, 158, 193.9, 127.1, 147.9, 119, 204.4, 143.4, 124.4,
140.9, 164.7, 139.8, 110.2, 134.1, 193.6, 180, 155, 188.2, 187.4,
139.2, 147.9, 178.6, 111.1)),
.Names = c("male", "height", "weight"), class = "data.frame", row.names = c(NA, -30L))
# Turn a vector of posterior samples into the points of a vertical density
# curve anchored at x = at_x.  The curve is scaled so its widest point
# spans 1/20 of the given height range.
posterior_path <- function(samples, at_x, height_range) {
  dens <- density(samples)
  scaling <- (diff(height_range) / 20) / max(dens$y)
  data.frame(y = dens$x, x = at_x + dens$y * scaling)
}

# Base smooth plot of weight vs height.
g_smooth <- ggplot(mydat, aes(x = height, y = weight)) + geom_smooth()
h_range <- range(mydat$height)

# Fake posterior at height = 60.
p60 <- rnorm(1000, mean = 145, sd = 10)
g_smooth <- g_smooth +
  geom_path(data = posterior_path(p60, 60, h_range), aes(x = x, y = y))

# Fake posterior at height = 70.  (The original had a copy-paste slip —
# `x = 60` in the p70 data.frame — but that column was never used; the
# actual x offset was already 70.)
p70 <- rnorm(1000, mean = 165, sd = 10)
g_smooth <- g_smooth +
  geom_path(data = posterior_path(p70, 70, h_range), aes(x = x, y = y))

g_smooth

Error in cor(data[, -1], use = "complete.obs") : 'x' must be numeric

I'm completely new to R - really have no clue what I'm doing to be honest. But I really need to run bivariate/multivariate regressions with this data following someone's advice and I'm stuck. Any help is greatly appreciated.
rm(list=ls())
setwd("C:/Users/Bogi/Documents/School/Honors Thesis/Voting and Economic Data")
data<-read.csv("BOGDAN_DATA1.csv")
head(data)
round(cor(data[,-1],use="complete.obs"),1)
Error in cor(data[, -1], use = "complete.obs") : 'x' must be numeric
dput
structure(list(REGION = structure(1:6, .Label = c("Altai Republic",
"Altai Territory", "Amur Region", "Arkhangelsk Region", "Astrakhan region",
"Belgorod region"), class = "factor"), PCT_CHANGE_VOTE = structure(c(2L,
3L, 5L, 4L, 6L, 1L), .Label = c("-13%", "-16%", "-17%", "-25%",
"-26%", "2%"), class = "factor"), PCT_CHANGE_GRP = structure(c(2L,
1L, 4L, 3L, 3L, 4L), .Label = c("10%", "17%", "19%", "27%"), class = "factor"),
PCT_CHANGE_INFLATION = structure(c(1L, 2L, 1L, 3L, 3L, 2L
), .Label = c("-2%", "-3%", "-4%"), class = "factor"), PCT_CHANGE_UNEMP = structure(c(5L,
4L, 1L, 2L, 6L, 3L), .Label = c("-13%", "-14%", "-17%", "-3%",
"5%", "7%"), class = "factor"), POVERTY = c(18.6, 22.6, 20.4,
14.4, 14.2, 8.6), POP_AGE1 = c(25.8, 16.9, 18.5, 17.1, 17.8,
15.2), POP_AGE2 = c(58.8, 59.6, 61.3, 60.4, 60.8, 60.3),
POP_AGE3 = c(15.4, 23.5, 20.2, 22.5, 21.4, 24.5), POP_URBAN = c(28.7,
55.2, 67, 76.2, 66.7, 66.4), POP_RURAL = c(71.3, 44.8, 33,
23.8, 33.3, 33.6), COMPUTER = c(46.4, 54.5, 66.1, 74, 65.1,
55.2), INTERNET = c(32.1, 41, 50.7, 66.5, 60, 50.7)), .Names = c("REGION",
"PCT_CHANGE_VOTE", "PCT_CHANGE_GRP", "PCT_CHANGE_INFLATION",
"PCT_CHANGE_UNEMP", "POVERTY", "POP_AGE1", "POP_AGE2", "POP_AGE3",
"POP_URBAN", "POP_RURAL", "COMPUTER", "INTERNET"), row.names = c(NA,
6L), class = "data.frame")
You could loop the columns 2:5 (lapply(data[2:5], ..)), remove the % in columns 2:5 (gsub('[%]',..)) and convert the columns to numeric. The output from gsub will be character class, convert it to numeric by as.numeric
# Strip the trailing "%" from the percentage columns and coerce to numeric.
data[2:5] <- lapply(data[2:5], function(col)
  as.numeric(gsub("%", "", col, fixed = TRUE)))
# Correlation matrix over everything except the REGION column.
Cor1 <- round(cor(data[-1], use = "complete.obs"), 1)
Or you could remove the % in those columns using awk on shell (assuming ,
as delimiter)
awk 'BEGIN {OFS=FS=","} function SUB(F) {sub(/\%/,"", $F)}{SUB(2);SUB(3);SUB(4);SUB(5)}1' Bogdan.csv > Bogdan2.csv
Read the file with read.csv and run the cor
# Read the awk-cleaned file back in and verify both routes give the same
# correlation matrix.
dat1 <- read.csv('Bogdan2.csv')
Cor2 <- round(cor(dat1[-1], use='complete.obs'), 1)
identical(Cor1, Cor2)
#[1] TRUE

Resources