Error msg: LDL' Bunch-Kaufman decomposition of covariance matrix - r

I'm new to the R Matrix package. I am trying to decompose a singular covariance matrix into LDL' form with the R function BunchKaufman(x, ...) http://stat.ethz.ch/R-manual/R-devel/library/Matrix/html/BunchKaufman-methods.html
Please help me get past first base: the call below fails with the trailing "Error in function ..." message.
A <- matrix( c( 0.184, 0.228, 0.252, 0.022, -0.022, 0.228, 1.053, 0.142, 0.106, -0.106,
+ 0.252, 0.142, 0.382, 0.015, -0.015, 0.022, 0.106, 0.015, 0.055, -0.055,
+ -0.022, -0.106, -0.015, -0.055, 0.055), ncol=5, nrow=5)
BunchKaufman(A)
Error in function (classes, fdef, mtable) :
unable to find an inherited method for function ‘BunchKaufman’ for signature ‘"matrix"’

The following works:
A <- forceSymmetric(A)
syA <- new("dsyMatrix", A , Dim = as.integer(c(nrow(A),nrow(A))) , uplo = "L" )
BunchKaufman(syA)

Related

Need Help Making an Ordihull

I have been collaborating on this code that creates an NMDS plot and I want to add shaded polygons of the points. However, the ordihull code keeps returning the following error. Why would the argument be of length zero?
Error in if (n < 4) return(colMeans(x[-n, , drop = FALSE])) : argument is of length zero
> m1 <- metaMDS(d1)
> m2 <- metaMDS(d2)
> m3 <- metaMDS(d3)
> mdat <- data.frame(m3$points)
> mdat$site <- substr(rownames(mdat), 1, 1)
> mdat$col <- ifelse(mdat$site == "D", "red",
ifelse(mdat$site == "H", "blue", "green"))
> plot(mdat[,1], mdat[,2], pch=16, col=mdat$col, display = "sites",
xlab="NMDS1", ylab="NMDS2", xlim=c(-0.2, 0.2),
ylim=c(-0.2, 0.2), main= "Phylum")
> ordihull(mdat[,1], mdat[,2], display="sites", label=T,
lwd=2, draw="polygon",col= c("blue", "red", "green"))
Here is the Dput:
> structure(list(p__Proteobacteria = c(44.807, 40.907, 36.558,36.811,
39.401, 40.114, 45.911, 43.133, 30.137, 27.734, 26.722,
31.261), p__Actinobacteria = c(26.819, 34.651, 40.904, 38.847,
39.446, 37.523, 29.881, 29.251, 31.783, 23.641, 34.918, 31.308
), p__Acidobacteria = c(8.48, 6.6, 5.934, 6.609, 5.89, 7.567,
5.795, 6.666, 10.616, 10.709, 8.988, 11.794), p__Bacteroidetes =
c(7.56, 8.189, 5.363, 6.223, 4.716, 3.613, 4.65, 5.2, 4.281, 2.785,
2.808, 3.271), p__Gemmatimonadetes = c(3.529, 2.108, 1.213, 1.193,
1.541, 1.439, 1.006, 1.171, 5.794, 4.107, 4.001, 2.747),
p__Chloroflexi = c(2.686, 2.987, 2.979, 3.049, 4.128, 4.564, 5.304,
4.624, 3.669, 2.775, 4.534, 4.94), p__Bacteria_unclassified =
c(2.38, 1.869, 1.579, 1.247, 2.3, 2.108, 1.36, 1.193, 3.126, 1.885,
2.987, 2.37), p__Firmicutes = c(0.998, 0.807, 2.76, 2.962, 0.866,
1.32, 1.651, 2.073, 1.099, 1.046, 1.3, 1.302), p__Verrucomicrobia =
c(0.676, 0.404, 0.32, 0.35, 0.293, 0.239, 0.188, 0.261, 0.521,
0.726, 0.52, 0.397), p__Nitrospirae = c(0.464, 0.244, 0.198, 0.208,
0.016, 0.032, 0.024, 0.042, 0.296, 0.103, 0.229, 0.211),
p__Candidatus_Saccharibacteria = c(0.421, 0.511, 0.456, 0.552,
0.523, 0.6, 0.842, 1.016, 0.672, 0.636, 0.465, 0.736),
p__Planctomycetes = c(0.392, 0.267, 0.354, 0.285, 0.275, 0.356,
0.285, 0.276, 0.33, 0.438, 0.552, 0.365), p__Fibrobacteres = c(0.14,
0.074, 0.007, 0.009, 0.072, 0.044, 0.136, 0.079, 0.117, 0.018,
0.167, 0.065), p__Candidatus_Latescibacteria = c(0.113, 0.059,
0.017, 0.005, 0.004, 0.017, 0.015, 0.009, 0, 0.011, 0.007, 0.018
), p__Latescibacteria = c(0.085, 0.04, 0.01, 0.004, 0.012, 0.015,
0.033, 0.015, 0.012, 0.016, 0.011, 0.018), p__Cyanobacteria =
c(0.079, 0.048, 1.071, 1.372, 0.32, 0.19, 2.629, 4.689, 7.133,
22.963, 11.417, 8.767), p__Thermodesulfobacteria = c(0.068, 0.057,
0.115, 0.103, 0.008, 0.01, 0.015, 0.007, 0.01, 0.003, 0.002, 0.013),
p__Elusimicrobia = c(0.059, 0.021, 0.012, 0.001, 0.004, 0.002,
0.015, 0.017, 0, 0.002, 0.005, 0.006), p__Chlorobi = c(0.052,
0.025, 0.002, 0.012, 0.029, 0.046, 0.033, 0.04, 0.05, 0.02,
0.046, 0.025), p__Armatimonadetes = c(0.046, 0.053, 0.051,
0.072, 0.076, 0.095, 0.048, 0.053, 0.197, 0.159, 0.128, 0.125
), p__Spirochaetes = c(0.035, 0.021, 0.002, 0.001, 0, 0.002,
0.024, 0.039, 0, 0, 0, 0), p__Parcubacteria = c(0.03, 0.013,
0, 0, 0.01, 0.015, 0.042, 0.037, 0.032, 0.059, 0.053, 0.011
), p__Chlamydiae = c(0.028, 0.017, 0.046, 0.05, 0.014, 0.007,
0.021, 0.022, 0.07, 0.074, 0.08, 0.152)), class = "data.frame",
row.names = c("D15B", "D610B", "D15F", "D610F", "HR15B", "HR610B",
"HR15F", "HR610F", "C15B", "C610B", "C15F", "C610F"))
Here is the code:
> phylum.dat <- dput
> x <- data.frame(tax=names(phylum.dat), nsites=apply(phylum.dat, 2, function(x){length(which(x>0))}))
> d1 <- vegdist(phylum.dat, method = "jaccard", binary = TRUE)
> d2 <- vegdist(log1p(phylum.dat, method = "jaccard"))
> logit_phylum <- as.matrix(phylum.dat+1)/100
> d3 <- qlogis(logit_phylum)
> d3 <- d3+abs(min(d3))
> d3 <- vegdist(d3, method = "jaccard")
> m1 <- metaMDS(d1)
> m2 <- metaMDS(d2)
> m3 <- metaMDS(d3)
> e1 <- envfit(m3, phylum.dat)
> exy <- data.frame(tax=names(phylum.dat),
> x=e1$vectors$arrows[,1],
> y=e1$vectors$arrows[,2],
> pval=e1$vectors$pvals,
> r=e1$vectors$r)
> rownames(exy) <- NULL
> exy <- exy[order(-exy$r),]
> mdat <- data.frame(m3$points)
> mdat$site <- substr(rownames(mdat), 1, 1)
> mdat$col <- ifelse(mdat$site == "D", "red",
> ifelse(mdat$site == "H", "blue", "green"))
> mdat$rad <- sqrt((mdat$MDS1^2) + (mdat$MDS2^2))
> max(mdat$rad)
> exy$x2 <- 0.17 * exy$r * exy$x
> exy$y2 <- 0.17 * exy$r * exy$y
> exy$adj <- ifelse(exy$x < 0, 1, 0)
> plot(mdat[,1], mdat[,2], pch=16, col=mdat$col,
> xlab="NMDS1", ylab="NMDS2", xlim=c(-0.2, 0.2),
> ylim=c(-0.2, 0.2), main= "Phylum")
> ordihull(mdat[,1], mdat[,2], display="sites", label=T,
> lwd=2, draw="polygon",col= c("blue", "red", "green"))

How to turn the row values in a dataframe into NA when the values of one column are greater than another column in r?

My data looks like this:
> dput(head(CORt, 5))
structure(list(rDate = structure(c(1438019100, 1438019400, 1438019700,
1438020000, 1438020300), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
I630 = c(0.536, 0.506, 0.419, 0.456, 0.427), I800 = c(0.414,
0.388, 0.339, 0.351, 0.331), I532 = c(0.547, 0.534, 0.463,
0.488, 0.464), I570 = c(0.522, 0.508, 0.467, 0.468, 0.445
), WR630 = c(0.0127, 0.0573, 0.0083, 0.0057, 0.0053), WR800 = c(0.0144,
0.0506, 0.0249, 0.0163, 0.0159), WR532 = c(0.0139, 0.0394,
0.006, 0.005, 0.0049), WR570 = c(0.0176, 0.0379, 0.0094,
0.0054, 0.0049), NR630 = c(0.006, 0.034, 0.006, 0.004, 0.004
), NR800 = c(0.007, 0.04, 0.019, 0.02, 0.019), NR532 = c(0.007,
0.072, 0.01, 0.007, 0.007), NR570 = c(0.009, 0.077, 0.008,
0.007, 0.007), ER630 = c(0.0351, 0.0746, 0.0116, 0.0055,
0.0052), ER800 = c(0.0278, 0.0596, 0.03, 0.0324, 0.0303),
ER532 = c(0.04, 0.085, 0.013, 0.008, 0.008), ER570 = c(0.034,
0.083, 0.013, 0.009, 0.008)), row.names = c(NA, 5L), class = "data.frame")
In the CORt dataframe when the values of WR630 > I630 I want to turn all values of that row(s) into NA but I want to preserve the rDate column dates and the ER532 values of that row(s).
I have been using this code (example):
which(CORt$WR630>CORt$I630)
CORt[c(7632, 12530, 13684, 14260, 18295, 19735, 23770, 24634, 27529, 44055), setdiff(names(CORt), c("rDate", "ER532"))] <- NA
but this is not handy when I have 200 lines, for example. I'm looking for a code that will turn the row values when WR630 > I630 into NA directly.
Any help is much appreciated.
You can use the which command instead of typing out all the row numbers manually.
CORt[which(CORt$WR630>CORt$I630),setdiff(names(CORt), c("rDate", "ER532"))] <- NA
If you don't have any missing values in the data you can also skip which.
CORt[CORt$WR630>CORt$I630,setdiff(names(CORt), c("rDate", "ER532"))] <- NA
How does this work for you?
# Rows where WR630 exceeds I630. `which()` drops NA comparisons, so a missing
# value can never end up being used as a row index.
rows <- which(data[["WR630"]] > data[["I630"]])
# Blank every column of those rows except columns 1 and 16 (rDate and ER532
# in the posted data). The previous version mapped over the single number
# `nrow(data)`, so only the last row was ever tested, and it modified a
# function-local copy of `data`, so the change was never kept.
data[rows, -c(1, 16)] <- NA
data
Please note that it uses index numbering instead of names to avoid setting rDate and ER532 to NA. In the data you provided, I didn't find any cases where your condition held true, so I tested it with the condition reversed to be certain it works.

xlim geom_histogram Error: Aesthetics must be either length 1 or the same as the data

I am trying to plot a histogram with a custom colour palette. The problem arises when I set the xlim of the histogram.
Please see below the reproducible example:
# sample dataframe
test_dt <- structure(list(col_1 = c(0.057, -0.063, -0.319, 0.02, 0.079,
0.007, -0.105, -0.084, 0.019, 0.28, -0.064, -0.243, -0.116, 0.079,
0.07, -0.187, -0.725, 0.134, 0.062, -0.056, -0.074, 0.392, -0.014,
-0.062, 0.214, 0.371, 0.069, -0.03, 0.036, -0.175, 0.097, 0.358,
0.153, -0.092, -0.038, -0.051, 0.017, -0.108, 0.133, 0.105, 0.187,
-0.056, -0.316, 0.15, -0.142, 0.076, 0.242, -0.069, 0.155, 0.214,
0.162, -0.037, -0.109, 0.111, -0.077, -0.435, 0.003, 0.187, 0.134,
0.027, 0.107, 0.175, -0.355, -0.572, 0.038, -0.209, -0.263, -0.147,
-0.23, -0.174, 0.203, -0.118, 0.008, -0.268, -0.001, 0.227, -0.019,
0.08, 0.044, -0.065, -0.131, 0.093, 0.127, -0.131, 0.039, 0.045,
0.032, 0.343, 0.053, -0.033, 0.453, 0.07, -0.225, 0.094, 0.002,
-0.119, 0.014, -0.125, 0.003, -0.48)), row.names = c(NA, -100L
), class = "data.frame")
# colour palette
RBW <- colorRampPalette(c("darkred","white","darkblue"))
# plot histogram without xlim
ggplot(test_dt) +
geom_histogram(aes(x=col_1),
position = "identity",
bins = 60,
color = "grey10",
fill = RBW(60))
When I run the following lines, I get the error:
Aesthetics must be either length 1 or the same as the data
# plot histogram with xlim
ggplot(test_dt) +
geom_histogram(aes(x=col_1),
position = "identity",
bins = 60,
color = "grey10",
fill = RBW(60)) +
xlim(-2,2)
Instead of xlim, add + coord_cartesian(xlim = c(-2,2)), which zooms the plot without changing the data used to compute the bins.
library(ggplot2)
``` r
ggplot(test_dt) +
geom_histogram(aes(x=col_1),
position = "identity",
bins = 60,
color = "grey10",
fill = RBW(60)) +
coord_cartesian(xlim = c(-2,2))
```

Created on 2020-02-11 by the reprex package (v0.3.0)

Plot conditional density curve `P(Y|X)` along a linear regression line

This is my data frame, with two columns Y (response) and X (covariate):
## Editor edit: use `dat` not `data`
dat <- structure(list(Y = c(NA, -1.793, -0.642, 1.189, -0.823, -1.715,
1.623, 0.964, 0.395, -3.736, -0.47, 2.366, 0.634, -0.701, -1.692,
0.155, 2.502, -2.292, 1.967, -2.326, -1.476, 1.464, 1.45, -0.797,
1.27, 2.515, -0.765, 0.261, 0.423, 1.698, -2.734, 0.743, -2.39,
0.365, 2.981, -1.185, -0.57, 2.638, -1.046, 1.931, 4.583, -1.276,
1.075, 2.893, -1.602, 1.801, 2.405, -5.236, 2.214, 1.295, 1.438,
-0.638, 0.716, 1.004, -1.328, -1.759, -1.315, 1.053, 1.958, -2.034,
2.936, -0.078, -0.676, -2.312, -0.404, -4.091, -2.456, 0.984,
-1.648, 0.517, 0.545, -3.406, -2.077, 4.263, -0.352, -1.107,
-2.478, -0.718, 2.622, 1.611, -4.913, -2.117, -1.34, -4.006,
-1.668, -1.934, 0.972, 3.572, -3.332, 1.094, -0.273, 1.078, -0.587,
-1.25, -4.231, -0.439, 1.776, -2.077, 1.892, -1.069, 4.682, 1.665,
1.793, -2.133, 1.651, -0.065, 2.277, 0.792, -3.469, 1.48, 0.958,
-4.68, -2.909, 1.169, -0.941, -1.863, 1.814, -2.082, -3.087,
0.505, -0.013, -0.12, -0.082, -1.944, 1.094, -1.418, -1.273,
0.741, -1.001, -1.945, 1.026, 3.24, 0.131, -0.061, 0.086, 0.35,
0.22, -0.704, 0.466, 8.255, 2.302, 9.819, 5.162, 6.51, -0.275,
1.141, -0.56, -3.324, -8.456, -2.105, -0.666, 1.707, 1.886, -3.018,
0.441, 1.612, 0.774, 5.122, 0.362, -0.903, 5.21, -2.927, -4.572,
1.882, -2.5, -1.449, 2.627, -0.532, -2.279, -1.534, 1.459, -3.975,
1.328, 2.491, -2.221, 0.811, 4.423, -3.55, 2.592, 1.196, -1.529,
-1.222, -0.019, -1.62, 5.356, -1.885, 0.105, -1.366, -1.652,
0.233, 0.523, -1.416, 2.495, 4.35, -0.033, -2.468, 2.623, -0.039,
0.043, -2.015, -4.58, 0.793, -1.938, -1.105, 0.776, -1.953, 0.521,
-1.276, 0.666, -1.919, 1.268, 1.646, 2.413, 1.323, 2.135, 0.435,
3.747, -2.855, 4.021, -3.459, 0.705, -3.018, 0.779, 1.452, 1.523,
-1.938, 2.564, 2.108, 3.832, 1.77, -3.087, -1.902, 0.644, 8.507
), X = c(0.056, 0.053, 0.033, 0.053, 0.062, 0.09, 0.11, 0.124,
0.129, 0.129, 0.133, 0.155, 0.143, 0.155, 0.166, 0.151, 0.144,
0.168, 0.171, 0.162, 0.168, 0.169, 0.117, 0.105, 0.075, 0.057,
0.031, 0.038, 0.034, -0.016, -0.001, -0.031, -0.001, -0.004,
-0.056, -0.016, 0.007, 0.015, -0.016, -0.016, -0.053, -0.059,
-0.054, -0.048, -0.051, -0.052, -0.072, -0.063, 0.02, 0.034,
0.043, 0.084, 0.092, 0.111, 0.131, 0.102, 0.167, 0.162, 0.167,
0.187, 0.165, 0.179, 0.177, 0.192, 0.191, 0.183, 0.179, 0.176,
0.19, 0.188, 0.215, 0.221, 0.203, 0.2, 0.191, 0.188, 0.19, 0.228,
0.195, 0.204, 0.221, 0.218, 0.224, 0.233, 0.23, 0.258, 0.268,
0.291, 0.275, 0.27, 0.276, 0.276, 0.248, 0.228, 0.223, 0.218,
0.169, 0.188, 0.159, 0.156, 0.15, 0.117, 0.088, 0.068, 0.057,
0.035, 0.021, 0.014, -0.005, -0.014, -0.029, -0.043, -0.046,
-0.068, -0.073, -0.042, -0.04, -0.027, -0.018, -0.021, 0.002,
0.002, 0.006, 0.015, 0.022, 0.039, 0.044, 0.055, 0.064, 0.096,
0.093, 0.089, 0.173, 0.203, 0.216, 0.208, 0.225, 0.245, 0.23,
0.218, -0.267, 0.193, -0.013, 0.087, 0.04, 0.012, -0.008, 0.004,
0.01, 0.002, 0.008, 0.006, 0.013, 0.018, 0.019, 0.018, 0.021,
0.024, 0.017, 0.015, -0.005, 0.002, 0.014, 0.021, 0.022, 0.022,
0.02, 0.025, 0.021, 0.027, 0.034, 0.041, 0.04, 0.038, 0.033,
0.034, 0.031, 0.029, 0.029, 0.029, 0.022, 0.021, 0.019, 0.021,
0.016, 0.007, 0.002, 0.011, 0.01, 0.01, 0.003, 0.009, 0.015,
0.018, 0.017, 0.021, 0.021, 0.021, 0.022, 0.023, 0.025, 0.022,
0.022, 0.019, 0.02, 0.023, 0.022, 0.024, 0.022, 0.025, 0.025,
0.022, 0.027, 0.024, 0.016, 0.024, 0.018, 0.024, 0.021, 0.021,
0.021, 0.021, 0.022, 0.016, 0.015, 0.017, -0.017, -0.009, -0.003,
-0.012, -0.009, -0.008, -0.024, -0.023)), .Names = c("Y", "X"
), row.names = c(NA, -234L), class = "data.frame")
With this I run an OLS regression: lm(dat[,1] ~ dat[,2]).
At a set of values: X = quantile(dat[,2], c(0.1, 0.5, 0.7)), I would like to plot a graph similar to the following, with conditional density P(Y|X) displaying along the regression line.
How can I do this in R? Is it even possible?
I call your dataset dat. Don't use data as it masks R function data.
dat <- na.omit(dat) ## retain only complete cases
## use proper formula rather than `$` or `[,]`;
## otherwise you get trouble in prediction with `predict.lm`
fit <- lm(Y ~ X, dat)
## prediction point, as given in your question
xp <- quantile(dat$X, probs = c(0.1, 0.5, 0.7), names = FALSE)
## make prediction and only keep `$fit` and `$se.fit`
pred <- predict.lm(fit, newdata = data.frame(X = xp), se.fit = TRUE)[1:2]
#$fit
# 1 2 3
#0.20456154 0.14319857 0.00678734
#
#$se.fit
# 1 2 3
#0.2205000 0.1789353 0.1819308
To understand the theory behind the following, read Plotting conditional density of prediction after linear regression. Now I am going to use the mapply function to apply the same computation to multiple points:
## a function to make 101 sample points from conditional density
f <- function (mu, sig) {
  ## mu:  mean of the normal density
  ## sig: standard deviation of the normal density
  ## Returns a 101 x 2 matrix with columns `x` (evaluation grid) and `dx`
  ## (normal density at each grid point).
  ## The grid spans `mu` +/- 3.2 * `sig`. `length.out` is spelled out in full
  ## instead of relying on partial matching of `length`.
  x <- seq(mu - 3.2 * sig, mu + 3.2 * sig, length.out = 101)
  dx <- dnorm(x, mu, sig)
  cbind(x, dx)
}
## apply `f` to all `xp`
lst <- mapply(f, pred[[1]], pred[[2]], SIMPLIFY = FALSE)
## To plot rotated density curve, we basically want to plot `(dx, x)`
## but scaling `(alpha * dx, x)` is needed for good scaling with regression line
## Also to plot rotated density along the regression line,
## a shift is needed: `(alpha * dx + xp, x)`
## The following function adds rotated, scaled density to a regression line
## a "for-loop" is used for readability, with no loss of efficiency.
## (make sure there is an existing plot; otherwise you get `plot.new` error!!)
addrsd <- function (xp, lst, alpha = 1) {
  ## xp:    x-positions at which the rotated densities are drawn
  ## lst:   list with one two-column matrix per element of `xp`;
  ##        column 1 is the value grid, column 2 the density on that grid
  ## alpha: factor applied to the density height before it is shifted to `xp`
  ## Draws on the current plot; called for that side effect only.
  ## `seq_along` makes an empty `xp` a no-op (`1:length(xp)` would run 1, 0).
  for (i in seq_along(xp)) {
    x0 <- xp[i]; mat <- lst[[i]]
    dx. <- alpha * mat[, 2] + x0  ## rescale and shift
    x. <- mat[, 1]
    lines(dx., x., col = "gray")  ## rotate and plot
    ## a local axis from the first to the last grid point, whatever the
    ## grid length is (a hard-coded 101 gives NA for other grid sizes)
    segments(x0, x.[1], x0, x.[length(x.)], col = "gray")
  }
}
Now let's see the picture:
## This is one simple way to draw the regression line
## A better way is to generate a grid and predict on the grid
## A later example shows this
plot(dat$X, fit$fitted, type = "l", ylim = c(-0.6, 1))
## we try `alpha = 0.01`;
## you can also try `alpha = 1` in raw scale to see what it looks like
addrsd(xp, lst, 0.01)
Note that we have only scaled the height of the density, not its span. The span roughly corresponds to the confidence band and should not be scaled. Consider further overlaying a confidence band on the plot. If the use of matplot is not clear, read How do I change colours of confidence interval lines when using matlines for prediction plot?.
## A grid is necessary for nice regression plot
## (`length.out` is spelled out in full; a bare `length` only works through
## partial argument matching)
X.grid <- seq(min(dat$X), max(dat$X), length.out = 101)
## 95%-CI based on t-statistic
CI <- predict.lm(fit, newdata = data.frame(X = X.grid), interval = "confidence")
## use `matplot`: one line per column of `CI`, styled by `col` / `lty`
matplot(X.grid, CI, type = "l", col = c(1, 2, 2), lty = c(1, 2, 2))
## add rotated, scaled conditional density
addrsd(xp, lst, 0.01)
You see that the span of the density curve agrees with the confidence ribbon.

Area under a density plot not equal to 1

I am trying to chart a probability density plot using ggplot. My problem is that the area under the curve is not equal to one. Advice appreciated.
Sample chart... the code that produced this chart follows... The Y axis looks like it is a count for small sized bins, rather than a probability for falling into that bin. The example code here, is one of the sources I drew on in the preparation of this chart.
Sample code... most of which is data... the key bit of code is at the bottom...
library(ggplot2)
library(reshape)
library(plyr)
library(scales)
Date <- as.Date(
c("1976-01-16", "1976-02-15", "1976-03-16", "1976-04-15", "1976-05-16",
"1976-06-15", "1976-07-16", "1976-08-16", "1976-09-15", "1976-10-16",
"1976-11-15", "1976-12-16", "1977-01-16", "1977-02-14", "1977-03-16",
"1977-04-15", "1977-05-16", "1977-06-15", "1977-07-16", "1977-08-16",
"1977-09-15", "1977-10-16", "1977-11-15", "1977-12-16", "1978-01-16",
"1978-02-14", "1978-03-16", "1978-04-15", "1978-05-16", "1978-06-15",
"1978-07-16", "1978-08-16", "1978-09-15", "1978-10-16", "1978-11-15",
"1978-12-16", "1979-01-16", "1979-02-14", "1979-03-16", "1979-04-15",
"1979-05-16", "1979-06-15", "1979-07-16", "1979-08-16", "1979-09-15",
"1979-10-16", "1979-11-15", "1979-12-16", "1980-01-16", "1980-02-15",
"1980-03-16", "1980-04-15", "1980-05-16", "1980-06-15", "1980-07-16",
"1980-08-16", "1980-09-15", "1980-10-16", "1980-11-15", "1980-12-16",
"1981-01-16", "1981-02-14", "1981-03-16", "1981-04-15", "1981-05-16",
"1981-06-15", "1981-07-16", "1981-08-16", "1981-09-15", "1981-10-16",
"1981-11-15", "1981-12-16", "1982-01-16", "1982-02-14", "1982-03-16",
"1982-04-15", "1982-05-16", "1982-06-15", "1982-07-16", "1982-08-16",
"1982-09-15", "1982-10-16", "1982-11-15", "1982-12-16", "1983-01-16",
"1983-02-14", "1983-03-16", "1983-04-15", "1983-05-16", "1983-06-15",
"1983-07-16", "1983-08-16", "1983-09-15", "1983-10-16", "1983-11-15",
"1983-12-16", "1984-01-16", "1984-02-15", "1984-03-16", "1984-04-15",
"1984-05-16", "1984-06-15", "1984-07-16", "1984-08-16", "1984-09-15",
"1984-10-16", "1984-11-15", "1984-12-16", "1985-01-16", "1985-02-14",
"1985-03-16", "1985-04-15", "1985-05-16", "1985-06-15", "1985-07-16",
"1985-08-16", "1985-09-15", "1985-10-16", "1985-11-15", "1985-12-16"))
GOLD <- c(
-0.104, 0.051, 0.011, -0.035, -0.008, -0.010, -0.065, -0.067, 0.041, 0.017,
0.126, 0.023, -0.011, 0.029, 0.087, 0.007, -0.016, -0.044, 0.048, -0.013,
0.030, 0.062, -0.029, 0.042, 0.078, 0.028, 0.031, -0.045, 0.005, 0.043,
0.028, 0.090, 0.030, 0.072, -0.094, 0.009, 0.093, 0.080, -0.014, -0.013,
0.077, 0.084, 0.058, 0.021, 0.184, 0.097, 0.002, 0.169, 0.474, -0.014,
-0.168, -0.067, -0.007, 0.169, 0.071, -0.025, 0.077, -0.022, -0.059, -0.044,
-0.063, -0.103, -0.003, -0.008, -0.031, -0.040, -0.113, 0.005, 0.081, -0.014,
-0.057, -0.009, -0.062, -0.026, -0.117, 0.061, -0.046, -0.058, 0.080, 0.076,
0.190, -0.031, -0.019, 0.074, 0.079, 0.022, -0.144, 0.030, 0.013, -0.057,
0.026, -0.017, -0.012, -0.042, -0.030, 0.015, -0.043, 0.041, 0.022, -0.032,
-0.011, 0.001, -0.083, 0.004, -0.019, -0.002, 0.003, -0.065, -0.063, 0.017,
-0.044, 0.134, -0.022, -0.014, -0.008, 0.033, -0.014, 0.017, -0.004, -0.023)
# Combine the dates and the GOLD series into one data frame for plotting.
df <- data.frame(Date = Date, GOLD = GOLD)
# Density plot of GOLD. `opts()` has been removed from ggplot2, so the title
# is set with `ggtitle()` instead.
p <- ggplot(data = df, aes(x = GOLD, y = ..density..)) +
  stat_density(fill = 'grey50') +
  xlab('Percent change on previous month') +
  ylab('Density') +
  ggtitle('Change in Gold Price in the US')
# Arguments are named explicitly so the plot object cannot be mistaken for
# the file name.
ggsave(filename = 'plot.png', plot = p, width = 8, height = 4, dpi = 125)
I don't think this is a problem with ggplot, but with your understanding of the y-axis in a density plot. The base plotting functions in R plot the same thing. You can set the call to y=..scaled.. to give you a relative density, but if you use stat_bin() you'll see the actual histogram and notice it's not the counts. If you want you could normalize your data with something like this:
GOLD_N <- (GOLD- mean(GOLD))/sd(GOLD)
df <- data.frame(Date=Date, GOLD=GOLD,GOLD_N=GOLD_N)
Then run your plot it will look something like this:
You should watch this video about how to interpret density functions http://www.youtube.com/watch?v=Fvi9A_tEmXQ But normalizing your data will give you the plot that's a bit more intuitive if you're used to staring at PDF's and will sum to 1. But don't misinterpret the y axis. y IS NOT the probability of a randomly drawn value from the density being equal to x.

Resources