R ggplot2 adding function to a histogram - r

I am trying to add this function curve to a histogram. Individually, they work. But when I try to put them on the same graph, the function messes up... I can't seem to figure out how to get them together.
# make dataframe for ggplot (can use random numbers from 0 to 10 to simulate x)
c= data.frame(x= x, cx= c(seq(from= 0.001, to= 10, by= 0.001)))
x and cx have the same number of data points.
# function for curve (alpha and beta are just constants set at 0.5)
fx= function(x){
(beta^alpha)/((x+beta)^(alpha+1)*gamma(alpha))
}
When either the geom_histogram or the stat_function is commented out, the graph works correctly.
# graph code
h_x= ggplot(data= NULL) +
geom_histogram(data= c, aes(x= x, y= ..density..), binwidth= 0.2, col= "purple", fill= "yellow") +
stat_function(fun= fx, data= c, aes(x= cx)) +
coord_cartesian(xlim= c(0:10)) +
labs(title= "Figure 03", x= "x")
plot(h_x)
Curve by itself
;
Histogram and curve together

Like #Gregor, I made some changes to your code and the graph looks OK.
I hope it can help you.
set.seed(1)
x <- rgamma(10000,1)
df1 <- data.frame(x= x, cx= c(seq(from= 0.001, to= 10, by= 0.001)))
beta <- alpha <- 0.5
fx <- function(x) {
print(str(x))
(beta^alpha)/((x+beta)^(alpha+1)*gamma(alpha))
}
# graph code
h_x <- ggplot(data=df1) +
geom_histogram(aes(x= x, y= ..density..), binwidth= 0.2, col= "purple", fill= "yellow") +
stat_function(fun=fx, aes(x=cx), lwd=1) +
coord_cartesian(xlim= c(0:10)) +
labs(title= "Figure 03", x="x")
plot(h_x)

Thanks for the help! I ended up figuring out the problem... It's because there was some large values (greater than 100) for my x values, when I removed these points, the graph looked much better!
But now my graph looks like this:
n= 10000
i= 1
alpha= 0.5
beta= 0.5
x= matrix(data= 5, nrow= n)
lambda= matrix(data= 1.5, nrow= n)
while (i < n) {
x[i+1]= rexp(1, rate= lambda[i])
lambda[i+1]= (x[i+1]+beta)^(alpha+1)*(lambda[i]^alpha)*exp(-lambda[i]*(x[i+1]+beta))
if ((lambda[i+1] < 0.00001) || (lambda[i+1] > 10)) {
while ((lambda[i+1] < 0.00001) || (lambda[i+1] > 10)) {
x[i+1]= rexp(1, rate= lambda[i])
lambda[i+1]= (x[i+1]+beta)^(alpha+1)*(lambda[i]^alpha)*exp(-lambda[i]*(x[i+1]+beta))
}
}
i= i+1
}
# data frame:
df4= data.frame(x= x[x<100], cx= c(seq(from= 0.011, to= 10, by= 0.001)))
# graph (same function (fx) from first post):
h_x= ggplot(data= df4) +
geom_histogram(aes(x= x, y= ..density..), binwidth= 0.2, col= "purple", fill= "yellow") +
stat_function(fun= fx) +
coord_cartesian(xlim= c(0:10)) +
labs(title= "Figure 03", x= "x")
plot(h_x)
Is there any way to make it a smooth curve? I tried scale_x_continuous but to no avail...

Related

How can I add confidence intervals to a scatterplot for a regression on two variables?

I need to create an insightful graphic with a regression line, data points, and confidence intervals. I am not looking for smoothed lines. I have tried multiple codes, but I just can't get it right.
I am looking for something like this:
Some codes I have tried:
p <- scatterplot(df.regsoft$w ~ df.regsoft$b,
data = df.regsoft,
boxplots = FALSE,
regLine = list(method=lm, col="red"),
pch = 16,
cex = 0.7,
xlab = "Fitted Values",
ylab = "Residuals",
legend = TRUE,
smooth = FALSE)
abline(coef = confint.lm(result.rs))
But this doesn't create what I want to create, however it is closest to what I intended. Notice that I took out "smooth" since this is not really what I am looking for.
How can I make this plot interactive?
If you don't mind switch to ggplot and the tidyverse, then this is simply a geom_smooth(method = "lm"):
library(tidyverse)
d <- tibble( #random stuff
x = rnorm(100, 0, 1),
y = 0.25 * x + rnorm(100, 0, 0.25)
)
m <- lm(y ~ x, data = d) #linear model
d %>%
ggplot() +
aes(x, y) + #what to plot
geom_point() +
geom_smooth(method = "lm") +
theme_bw()
without method = "lm" it draws a smoothed line.
As for the Conf. interval (Obs 95%) lines, it seems to me that's simply a quantile regression. In that case, you can use the quantreg package.
If you want to make it interactive, you can use the plotly package:
library(plotly)
p <- d %>%
ggplot() +
aes(x, y) +
geom_point() +
geom_smooth(method = "lm") +
theme_bw()
ggplotly(p)
================================================
P.S.
I am not completely sure this is what the figure you posted is showing (I guess so), but to add the quantile lines, I would just perform two quantile regressions (upper and lower) and then calculate the values of the quantile lines for your data:
library(tidyverse)
library(quantreg)
d <- tibble( #random stuff
x = rnorm(100, 0, 1),
y = 0.25 * x + rnorm(100, 0, 0.25)
)
m <- lm(y ~ x, data = d) #linear model
# 95% quantile, two tailed
rq_low <- rq(y ~ x, data = d, tau = 0.025) #lower quantile
rq_high <- rq(y ~ x, data = d, tau = 0.975) #upper quantile
d %>%
mutate(low = rq_low$coefficients[1] + x * rq_low$coefficients[2],
high = rq_high$coefficients[1] + x * rq_high$coefficients[2]) %>%
ggplot() +
geom_point(aes(x, y)) +
geom_smooth(aes(x, y), method = "lm") +
geom_line(aes(x, low), linetype = "dashed") +
geom_line(aes(x, high), linetype = "dashed") +
theme_bw()

Nonparametric regression ggplot

I'm trying to plot some nonparametric regression curves with ggplot2. I achieved It with the base plot()function:
library(KernSmooth)
set.seed(1995)
X <- runif(100, -1, 1)
G <- X[which (X > 0)]
L <- X[which (X < 0)]
u <- rnorm(100, 0 , 0.02)
Y <- -exp(-20*L^2)-exp(-20*G^2)/(X+1)+u
m <- lm(Y~X)
plot(Y~X)
abline(m, col="red")
m2 <- locpoly(X, Y, bandwidth = 0.05, degree = 0)
lines(m2$x, m2$y, col = "red")
m3 <- locpoly(X, Y, bandwidth = 0.15, degree = 0)
lines(m3$x, m3$y, col = "black")
m4 <- locpoly(X, Y, bandwidth = 0.3, degree = 0)
lines(m4$x, m4$y, col = "green")
legend("bottomright", legend = c("NW(bw=0.05)", "NW(bw=0.15)", "NW(bw=0.3)"),
lty = 1, col = c("red", "black", "green"), cex = 0.5)
With ggplot2 have achieved plotting the linear regression:
With this code:
ggplot(m, aes(x = X, y = Y)) +
geom_point(shape = 1) +
geom_smooth(method = lm, se = FALSE) +
theme(axis.line = element_line(colour = "black", size = 0.25))
But I dont't know how to add the other lines to this plot, as in the base R plot. Any suggestions? Thanks in advance.
Solution
The shortest solution (though not the most beautiful one) is to add the lines using the data= argument of the geom_line function:
ggplot(m, aes(x = X, y = Y)) +
geom_point(shape = 1) +
geom_smooth(method = lm, se = FALSE) +
theme(axis.line = element_line(colour = "black", size = 0.25)) +
geom_line(data = as.data.frame(m2), mapping = aes(x=x,y=y))
Beautiful solution
To get beautiful colors and legend, use
# Need to convert lists to data.frames, ggplot2 needs data.frames
m2 <- as.data.frame(m2)
m3 <- as.data.frame(m3)
m4 <- as.data.frame(m4)
# Colnames are used as names in ggplot legend. Theres nothing wrong in using
# column names which contain symbols or whitespace, you just have to use
# backticks, e.g. m2$`NW(bw=0.05)` if you want to work with them
colnames(m2) <- c("x","NW(bw=0.05)")
colnames(m3) <- c("x","NW(bw=0.15)")
colnames(m4) <- c("x","NW(bw=0.3)")
# To give the different kernel density estimates different colors, they must all be in one data frame.
# For merging to work, all x columns of m2-m4 must be the same!
# the merge function will automatically detec columns of same name
# (that is, x) in m2-m4 and use it to identify y values which belong
# together (to the same x value)
mm <- Reduce(x=list(m2,m3,m4), f=function(a,b) merge(a,b))
# The above line is the same as:
# mm <- merge(m2,m3)
# mm <- merge(mm,m4)
# ggplot needs data in long (tidy) format
mm <- tidyr::gather(mm, kernel, y, -x)
ggplot(m, aes(x = X, y = Y)) +
geom_point(shape = 1) +
geom_smooth(method = lm, se = FALSE) +
theme(axis.line = element_line(colour = "black", size = 0.25)) +
geom_line(data = mm, mapping = aes(x=x,y=y,color=kernel))
Solution which will settle this for everyone and for eternity
The most beautiful and reproducable way though will be to create a custom stat in ggplot2 (see the included stats in ggplot).
There is this vignette of the ggplot2 team to this topic: Extending ggplot2. I have never undertaken such a heroic endeavour though.

Drawing polygon: limits?

The polygon function in R seems rather simple...however I can't get it to work.
It easily works with this code:
x <- seq(-3,3,0.01)
y1 <- dnorm(x,0,1)
y2 <- 0.5*dnorm(x,0,1)
plot(x,y1,type="l",bty="L",xlab="X",ylab="dnorm(X)")
points(x,y2,type="l",col="red")
polygon(c(x,rev(x)),c(y2,rev(y1)),col="skyblue")
When adopting this to something else, it doesn't work. Here some stuff to reproduce the issue:
lowerbound = c(0.05522914,0.06567045,0.07429926,0.08108482,0.08624472,0.09008050,0.09288837,0.09492226)
upperbound = c(0.1743657,0.1494058,0.1333106,0.1227383,0.1156714,0.1108787,0.1075915,0.1053178)
lim = c(100,200,400,800,1600,3200,6400,12800)
plot(upperbound, ylim=c(0, 0.2), type="b", axes=FALSE)
lines(lowerbound, type="b", col="red")
atvalues <- seq(1:8)
axis(side=1, at=atvalues, labels=lim)
axis(side=2, at=c(0,0.05,0.1,0.15,0.2), labels=c(0,0.05,0.1,0.15,0.2))
polygon(lowerbound,upperbound, col="skyblue")
It also doesn't work when only segmenting a subset when directly calling the coordinates:
xpoly <- c(100,200,200,100)
ypoly <- c(lowerbound[1], lowerbound[2], upperbound[2], upperbound[1])
polygon(xpoly,ypoly, col="skyblue")
What am I missing?
Plotting the whole polygon
You need to supply both x and y to polygon. Normally, you'd also do that for plot, but if you don't it will just use the Index as x, that is integers 1 to n. We can use that to make an x range. seq_along will create a 1:n vector, where n is the length of another object.
x <- c(seq_along(upperbound), rev(seq_along(lowerbound)))
y <- c(lowerbound, rev(upperbound))
plot(upperbound, ylim=c(0, 0.2), type="b", axes=FALSE)
lines(lowerbound, type="b", col="red")
atvalues <- seq(1:8)
axis(side=1, at=atvalues, labels=lim)
axis(side=2, at=c(0,0.05,0.1,0.15,0.2), labels=c(0,0.05,0.1,0.15,0.2))
polygon(x = x, y = y, col="skyblue")
Plotting a subset
For a subset, I would create the y first, and then use the old x to easily get `x values:
y2 <- c(lowerbound[1:2], upperbound[2:1])
x2 <- x[which(y2 == y)]
polygon(x2, y2, col="skyblue")
How I would do it
Creating something like this is much easier in ggplot2, where geom_ribbon does a lot of the heavy lifting. We just have to make an actual data.frame, an stop relying on indices.
Full polygon:
library(ggplot2)
ggplot(d, aes(x = x, ymin = low, ymax = up)) +
geom_ribbon(fill = 'skyblue', alpha = 0.5) +
geom_line(aes(y = low), col = 'red') +
geom_line(aes(y = up), col = 'black') +
scale_x_continuous(trans = 'log2') +
theme_bw()
Subset:
ggplot(d, aes(x = x, ymin = low, ymax = up)) +
geom_ribbon(data = d[1:2, ], fill = 'skyblue', alpha = 0.5) +
geom_line(aes(y = low), col = 'red') +
geom_line(aes(y = up), col = 'black') +
scale_x_continuous(trans = 'log2') +
theme_bw()

filled.contour vs. ggplot2 + stat_contour

I am new to ggplot2, and I am trying to replicate a graph that I created using filled.contour with ggplot2.
below is my code:
require(ggplot2)
require(reshape2)
#data prep
scale <- 10
xs <- scale * c(0, 0.5, 0.8, 0.9, 0.95, 0.99, 1)
ys <- scale * c(0, 0.01, 0.05, 0.1, 0.2, 0.5, 1)
df <- data.frame(expand.grid(xs,ys))
colnames(df) <- c('x','y')
df$z <- ((scale-df$x) * df$y) / ((scale-df$x) * df$y + 1)
#filled contour looks good
filled.contour(xs, ys, acast(df, x~y, value.var='z'))
#ggplot contour looks bad
p <- ggplot(df, aes(x=x, y=y, z=z))
p + stat_contour(geom='polygon', aes(fill=..level..))
I can't figure out how to get ggplot contour to fill the polygons all the way up to the upper left hand side (there's a point at (0,10) with z= 0.99) ...all I get are these weird triangles
To create a ggplot version of the filled.contour plot you'll need to have a larger data.frame than the df object in your example and using geom_tile will produce the plot you are looking for. Consider the following:
# a larger data set
scl <- 10
dat <- expand.grid(x = scl * seq(0, 1, by = 0.01),
y = scl * seq(0, 1, by = 0.01))
dat$z <- ((scl - dat$x) * dat$y) / ((scl - dat$x) * dat$y + 1)
# create the plot, the geom_contour may not be needed, but I find it helpful
ggplot(dat) +
aes(x = x, y = y, z = z, fill = z) +
geom_tile() +
geom_contour(color = "white", alpha = 0.5) +
scale_fill_gradient(low = "lightblue", high = "magenta") +
theme_bw()

3D plot of bivariate distribution using R or Matlab

i would like to know if someone could tell me how you plot something similar to this
with histograms of the sample generates from the code below under the two curves. Using R or Matlab but preferably R.
# bivariate normal with a gibbs sampler...
gibbs<-function (n, rho)
{
mat <- matrix(ncol = 2, nrow = n)
x <- 0
y <- 0
mat[1, ] <- c(x, y)
for (i in 2:n) {
x <- rnorm(1, rho * y, (1 - rho^2))
y <- rnorm(1, rho * x,(1 - rho^2))
mat[i, ] <- c(x, y)
}
mat
}
bvn<-gibbs(10000,0.98)
par(mfrow=c(3,2))
plot(bvn,col=1:10000,main="bivariate normal distribution",xlab="X",ylab="Y")
plot(bvn,type="l",main="bivariate normal distribution",xlab="X",ylab="Y")
hist(bvn[,1],40,main="bivariate normal distribution",xlab="X",ylab="")
hist(bvn[,2],40,main="bivariate normal distribution",xlab="Y",ylab="")
par(mfrow=c(1,1))`
Thanks in advance
Best regards,
JC T.
You could do it in Matlab programmatically.
This is the result:
Code:
% Generate some data.
data = randn(10000, 2);
% Scale and rotate the data (for demonstration purposes).
data(:,1) = data(:,1) * 2;
theta = deg2rad(130);
data = ([cos(theta) -sin(theta); sin(theta) cos(theta)] * data')';
% Get some info.
m = mean(data);
s = std(data);
axisMin = m - 4 * s;
axisMax = m + 4 * s;
% Plot data points on (X=data(x), Y=data(y), Z=0)
plot3(data(:,1), data(:,2), zeros(size(data,1),1), 'k.', 'MarkerSize', 1);
% Turn on hold to allow subsequent plots.
hold on
% Plot the ellipse using Eigenvectors and Eigenvalues.
data_zeroMean = bsxfun(#minus, data, m);
[V,D] = eig(data_zeroMean' * data_zeroMean / (size(data_zeroMean, 1)));
[D, order] = sort(diag(D), 'descend');
D = diag(D);
V = V(:, order);
V = V * sqrt(D);
t = linspace(0, 2 * pi);
e = bsxfun(#plus, 2*V * [cos(t); sin(t)], m');
plot3(...
e(1,:), e(2,:), ...
zeros(1, nPointsEllipse), 'g-', 'LineWidth', 2);
maxP = 0;
for side = 1:2
% Calculate the histogram.
p = [0 hist(data(:,side), 20) 0];
p = p / sum(p);
maxP = max([maxP p]);
dx = (axisMax(side) - axisMin(side)) / numel(p) / 2.3;
p2 = [zeros(1,numel(p)); p; p; zeros(1,numel(p))]; p2 = p2(:);
x = linspace(axisMin(side), axisMax(side), numel(p));
x2 = [x-dx; x-dx; x+dx; x+dx]; x2 = max(min(x2(:), axisMax(side)), axisMin(side));
% Calculate the curve.
nPtsCurve = numel(p) * 10;
xx = linspace(axisMin(side), axisMax(side), nPtsCurve);
% Plot the curve and the histogram.
if side == 1
plot3(xx, ones(1, nPtsCurve) * axisMax(3 - side), spline(x,p,xx), 'r-', 'LineWidth', 2);
plot3(x2, ones(numel(p2), 1) * axisMax(3 - side), p2, 'k-', 'LineWidth', 1);
else
plot3(ones(1, nPtsCurve) * axisMax(3 - side), xx, spline(x,p,xx), 'b-', 'LineWidth', 2);
plot3(ones(numel(p2), 1) * axisMax(3 - side), x2, p2, 'k-', 'LineWidth', 1);
end
end
% Turn off hold.
hold off
% Axis labels.
xlabel('x');
ylabel('y');
zlabel('p(.)');
axis([axisMin(1) axisMax(1) axisMin(2) axisMax(2) 0 maxP * 1.05]);
grid on;
I must admit, I took this on as a challenge because I was looking for different ways to show other datasets. I have normally done something along the lines of the scatterhist 2D graphs shown in other answers, but I've wanted to try my hand at rgl for a while.
I use your function to generate the data
gibbs<-function (n, rho) {
mat <- matrix(ncol = 2, nrow = n)
x <- 0
y <- 0
mat[1, ] <- c(x, y)
for (i in 2:n) {
x <- rnorm(1, rho * y, (1 - rho^2))
y <- rnorm(1, rho * x, (1 - rho^2))
mat[i, ] <- c(x, y)
}
mat
}
bvn <- gibbs(10000, 0.98)
Setup
I use rgl for the hard lifting, but I didn't know how to get the confidence ellipse without going to car. I'm guessing there are other ways to attack this.
library(rgl) # plot3d, quads3d, lines3d, grid3d, par3d, axes3d, box3d, mtext3d
library(car) # dataEllipse
Process the data
Getting the histogram data without plotting it, I then extract the densities and normalize them into probabilities. The *max variables are to simplify future plotting.
hx <- hist(bvn[,2], plot=FALSE)
hxs <- hx$density / sum(hx$density)
hy <- hist(bvn[,1], plot=FALSE)
hys <- hy$density / sum(hy$density)
## [xy]max: so that there's no overlap in the adjoining corner
xmax <- tail(hx$breaks, n=1) + diff(tail(hx$breaks, n=2))
ymax <- tail(hy$breaks, n=1) + diff(tail(hy$breaks, n=2))
zmax <- max(hxs, hys)
Basic scatterplot on the floor
The scale should be set to whatever is appropriate based on the distributions. Admittedly, the X and Y labels aren't placed beautifully, but that shouldn't be too hard to reposition based on the data.
## the base scatterplot
plot3d(bvn[,2], bvn[,1], 0, zlim=c(0, zmax), pch='.',
xlab='X', ylab='Y', zlab='', axes=FALSE)
par3d(scale=c(1,1,3))
Histograms on the back walls
I couldn't figure out how to get them automatically plotted on a plane in the overall 3D render, so I had to make each rect manually.
## manually create each histogram
for (ii in seq_along(hx$counts)) {
quads3d(hx$breaks[ii]*c(.9,.9,.1,.1) + hx$breaks[ii+1]*c(.1,.1,.9,.9),
rep(ymax, 4),
hxs[ii]*c(0,1,1,0), color='gray80')
}
for (ii in seq_along(hy$counts)) {
quads3d(rep(xmax, 4),
hy$breaks[ii]*c(.9,.9,.1,.1) + hy$breaks[ii+1]*c(.1,.1,.9,.9),
hys[ii]*c(0,1,1,0), color='gray80')
}
Summary Lines
## I use these to ensure the lines are plotted "in front of" the
## respective dot/hist
bb <- par3d('bbox')
inset <- 0.02 # percent off of the floor/wall for lines
x1 <- bb[1] + (1-inset)*diff(bb[1:2])
y1 <- bb[3] + (1-inset)*diff(bb[3:4])
z1 <- bb[5] + inset*diff(bb[5:6])
## even with draw=FALSE, dataEllipse still pops up a dev, so I create
## a dummy dev and destroy it ... better way to do this?
dev.new()
de <- dataEllipse(bvn[,1], bvn[,2], draw=FALSE, levels=0.95)
dev.off()
## the ellipse
lines3d(de[,2], de[,1], z1, color='green', lwd=3)
## the two density curves, probability-style
denx <- density(bvn[,2])
lines3d(denx$x, rep(y1, length(denx$x)), denx$y / sum(hx$density), col='red', lwd=3)
deny <- density(bvn[,1])
lines3d(rep(x1, length(deny$x)), deny$x, deny$y / sum(hy$density), col='blue', lwd=3)
Beautifications
grid3d(c('x+', 'y+', 'z-'), n=10)
box3d()
axes3d(edges=c('x-', 'y-', 'z+'))
outset <- 1.2 # place text outside of bbox *this* percentage
mtext3d('P(X)', edge='x+', pos=c(0, ymax, outset * zmax))
mtext3d('P(Y)', edge='y+', pos=c(xmax, 0, outset * zmax))
Final Product
One bonus of using rgl is that you can spin it around with your mouse and find the best perspective. Lacking making an animation for this SO page, doing all of the above should allow you the play-time. (If you spin it, you'll be able to see that the lines are slightly in front of the histograms and slightly above the scatterplot; otherwise I found intersections, so it looked noncontinuous at places.)
In the end, I find this a bit distracting (the 2D variants sufficed): showing the z-axis implies that there is a third dimension to the data; Tufte specifically discourages this behavior (Tufte, "Envisioning Information," 1990). However, with higher dimensionality, this technique of using RGL will allow significant perspective on patterns.
(For the record, Win7 x64, tested with R-3.0.3 in 32-bit and 64-bit, rgl v0.93.996, car v2.0-19.)
Create the dataframe with bvn <- as.data.frame(gibbs(10000,0.98)). Several 2d solutions in R:
1: A quick & dirty solution with the psych package:
library(psych)
scatter.hist(x=bvn$V1, y=bvn$V2, density=TRUE, ellipse=TRUE)
which results in:
2: A nice & pretty solution with ggplot2:
library(ggplot2)
library(gridExtra)
library(devtools)
source_url("https://raw.github.com/low-decarie/FAAV/master/r/stat-ellipse.R") # needed to create the 95% confidence ellipse
htop <- ggplot(data=bvn, aes(x=V1)) +
geom_histogram(aes(y=..density..), fill = "white", color = "black", binwidth = 2) +
stat_density(colour = "blue", geom="line", size = 1.5, position="identity", show_guide=FALSE) +
scale_x_continuous("V1", limits = c(-40,40), breaks = c(-40,-20,0,20,40)) +
scale_y_continuous("Count", breaks=c(0.0,0.01,0.02,0.03,0.04), labels=c(0,100,200,300,400)) +
theme_bw() + theme(axis.title.x = element_blank())
blank <- ggplot() + geom_point(aes(1,1), colour="white") +
theme(axis.ticks=element_blank(), panel.background=element_blank(), panel.grid=element_blank(),
axis.text.x=element_blank(), axis.text.y=element_blank(), axis.title.x=element_blank(), axis.title.y=element_blank())
scatter <- ggplot(data=bvn, aes(x=V1, y=V2)) +
geom_point(size = 0.6) + stat_ellipse(level = 0.95, size = 1, color="green") +
scale_x_continuous("label V1", limits = c(-40,40), breaks = c(-40,-20,0,20,40)) +
scale_y_continuous("label V2", limits = c(-20,20), breaks = c(-20,-10,0,10,20)) +
theme_bw()
hright <- ggplot(data=bvn, aes(x=V2)) +
geom_histogram(aes(y=..density..), fill = "white", color = "black", binwidth = 1) +
stat_density(colour = "red", geom="line", size = 1, position="identity", show_guide=FALSE) +
scale_x_continuous("V2", limits = c(-20,20), breaks = c(-20,-10,0,10,20)) +
scale_y_continuous("Count", breaks=c(0.0,0.02,0.04,0.06,0.08), labels=c(0,200,400,600,800)) +
coord_flip() + theme_bw() + theme(axis.title.y = element_blank())
grid.arrange(htop, blank, scatter, hright, ncol=2, nrow=2, widths=c(4, 1), heights=c(1, 4))
which results in:
3: A compact solution with ggplot2:
library(ggplot2)
library(devtools)
source_url("https://raw.github.com/low-decarie/FAAV/master/r/stat-ellipse.R") # needed to create the 95% confidence ellipse
ggplot(data=bvn, aes(x=V1, y=V2)) +
geom_point(size = 0.6) +
geom_rug(sides="t", size=0.05, col=rgb(.8,0,0,alpha=.3)) +
geom_rug(sides="r", size=0.05, col=rgb(0,0,.8,alpha=.3)) +
stat_ellipse(level = 0.95, size = 1, color="green") +
scale_x_continuous("label V1", limits = c(-40,40), breaks = c(-40,-20,0,20,40)) +
scale_y_continuous("label V2", limits = c(-20,20), breaks = c(-20,-10,0,10,20)) +
theme_bw()
which results in:
Matlab's implementation is called scatterhist and requires the Statistics Toolbox. Unfortunately it is not 3D, it is an extended 2D plot.
% some example data
x = randn(1000,1);
y = randn(1000,1);
h = scatterhist(x,y,'Location','SouthEast',...
'Direction','out',...
'Color','k',...
'Marker','o',...
'MarkerSize',4);
legend('data')
legend boxoff
grid on
It also allows grouping of datasets:
load fisheriris.mat;
x = meas(:,1); %// x-data
y = meas(:,2); %// y-data
gnames = species; %// assigning of names to certain elements of x and y
scatterhist(x,y,'Group',gnames,'Location','SouthEast',...
'Direction','out',...
'Color','kbr',...
'LineStyle',{'-','-.',':'},...
'LineWidth',[2,2,2],...
'Marker','+od',...
'MarkerSize',[4,5,6]);
R Implementation
Load library "car". We use only dataEllipse function to draw ellipse based on the percent of data (0.95 means 95% data falls within the ellipse).
library("car")
gibbs<-function (n, rho)
{
mat <- matrix(ncol = 2, nrow = n)
x <- 0
y <- 0
mat[1, ] <- c(x, y)
for (i in 2:n) {
x <- rnorm(1, rho * y, (1 - rho^2))
y <- rnorm(1, rho * x,(1 - rho^2))
mat[i, ] <- c(x, y)
}
mat
}
bvn<-gibbs(10000,0.98)
Open a PDF Device:
OUTFILE <- "bivar_dist.pdf"
pdf(OUTFILE)
Set up the layout first
layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), widths=c(3,1), heights=c(1,3), TRUE)
Make Scatterplot
par(mar=c(5.1,4.1,0.1,0))
The commented lines can be used to plot a scatter diagram without "car" package from where we use dataEllipse function
# plot(bvn[,2], bvn[,1],
# pch=".",cex = 1, col=1:length(bvn[,2]),
# xlim=c(-0.6, 0.6),
# ylim=c(-0.6,0.6),
# xlab="X",
# ylab="Y")
#
# grid(NULL, NULL, lwd = 2)
dataEllipse(bvn[,2], bvn[,1],
levels = c(0.95),
pch=".",
col=1:length(bvn[,2]),
xlim=c(-0.6, 0.6),
ylim=c(-0.6,0.6),
xlab="X",
ylab="Y",
center.cex = 1
)
Plot histogram of X variable in the top row
par(mar=c(0,4.1,3,0))
hist(bvn[,2],
ann=FALSE,axes=FALSE,
col="light blue",border="black",
)
title(main = "Bivariate Normal Distribution")
Plot histogram of Y variable to the right of the scatterplot
yhist <- hist(bvn[,1],
plot=FALSE
)
par(mar=c(5.1,0,0.1,1))
barplot(yhist$density,
horiz=TRUE,
space=0,
axes=FALSE,
col="light blue",
border="black"
)
dev.off(which = dev.cur())
dataEllipse(bvn[,2], bvn[,1],
levels = c(0.5, 0.95),
pch=".",
col= 1:length(bvn[,2]),
xlim=c(-0.6, 0.6),
ylim=c(-0.6,0.6),
xlab="X",
ylab="Y",
center.cex = 1
)
I took #jaap's code above and turned it into a slightly more generalized function. The code can be sourced here. Note: I am not adding anything new to #jaap's code, just a few minor changes and wrapped it in a function. Hopefully it is helpful.
density.hist <- function(df, x=NULL, y=NULL) {
require(ggplot2)
require(gridExtra)
require(devtools)
htop <- ggplot(data=df, aes_string(x=x)) +
geom_histogram(aes(y=..density..), fill = "white", color = "black", bins=100) +
stat_density(colour = "blue", geom="line", size = 1, position="identity", show.legend=FALSE) +
theme_bw() + theme(axis.title.x = element_blank())
blank <- ggplot() + geom_point(aes(1,1), colour="white") +
theme(axis.ticks=element_blank(), panel.background=element_blank(), panel.grid=element_blank(),
axis.text.x=element_blank(), axis.text.y=element_blank(), axis.title.x=element_blank(),
axis.title.y=element_blank())
scatter <- ggplot(data=df, aes_string(x=x, y=y)) +
geom_point(size = 0.6) + stat_ellipse(type = "norm", linetype = 2, color="green",size=1) +
stat_ellipse(type = "t",color="green",size=1) +
theme_bw() + labs(x=x, y=y)
hright <- ggplot(data=df, aes_string(x=x)) +
geom_histogram(aes(y=..density..), fill = "white", color = "black", bins=100) +
stat_density(colour = "red", geom="line", size = 1, position="identity", show.legend=FALSE) +
coord_flip() + theme_bw() + theme(axis.title.y = element_blank())
grid.arrange(htop, blank, scatter, hright, ncol=2, nrow=2, widths=c(4, 1), heights=c(1, 4))
}

Resources