R: Calculate and plot difference between two density countours - r

I have two datasets with two continuous variables: duration and waiting.
library("MASS")
data(geyser)
geyser1 <- geyser[1:150,]
geyser2 <- geyser[151:299,]
geyser2$duration <- geyser2$duration - 1
geyser2$waiting <- geyser2$waiting - 20
For each dataset I output a 2D density plot
ggplot(geyser1, aes(x = duration, y = waiting)) +
xlim(0.5, 6) + ylim(40, 110) +
stat_density2d(aes(alpha=..level..),
geom="polygon", bins = 10)
ggplot(geyser2, aes(x = duration, y = waiting)) +
xlim(0.5, 6) + ylim(40, 110) +
stat_density2d(aes(alpha=..level..),
geom="polygon", bins = 10)
I now want to produce a plot which indicates the regions where the two plot have the same density (white), negative differences (gradation from white to blue where geyser2 is denser than geyser1) and positive differences (gradation from white to red where geyser1 is denser than geyser2).
How to compute and plot the difference of the densities?

You can do this by first using kde2d to calculate the densities and then subtracting them from each other. Then you do some data reshaping to get it into a form that can be fed to ggplot2.
library(reshape2) # For melt function
# Calculate the common x and y range for geyser1 and geyser2
xrng = range(c(geyser1$duration, geyser2$duration))
yrng = range(c(geyser1$waiting, geyser2$waiting))
# Calculate the 2d density estimate over the common range
d1 = kde2d(geyser1$duration, geyser1$waiting, lims=c(xrng, yrng), n=200)
d2 = kde2d(geyser2$duration, geyser2$waiting, lims=c(xrng, yrng), n=200)
# Confirm that the grid points for each density estimate are identical
identical(d1$x, d2$x) # TRUE
identical(d1$y, d2$y) # TRUE
# Calculate the difference between the 2d density estimates
diff12 = d1
diff12$z = d2$z - d1$z
## Melt data into long format
# First, add row and column names (x and y grid values) to the z-value matrix
rownames(diff12$z) = diff12$x
colnames(diff12$z) = diff12$y
# Now melt it to long format
diff12.m = melt(diff12$z, id.var=rownames(diff12))
names(diff12.m) = c("Duration","Waiting","z")
# Plot difference between geyser2 and geyser1 density
ggplot(diff12.m, aes(Duration, Waiting, z=z, fill=z)) +
geom_tile() +
stat_contour(aes(colour=..level..), binwidth=0.001) +
scale_fill_gradient2(low="red",mid="white", high="blue", midpoint=0) +
scale_colour_gradient2(low=muted("red"), mid="white", high=muted("blue"), midpoint=0) +
coord_cartesian(xlim=xrng, ylim=yrng) +
guides(colour=FALSE)

Related

ggplot2 geom_histogram draw desity bars from sampling a mixture distribution with 2 weighted distribution

First things first, I got the 2 mixed distributions (they have mixed part) and I've known the samples come from which distribution.
Then I want to plot histogram according to the samples' density and the mixture distribution.
Let's head to the code (seg 1):
library(mixtools)
# two components
set.seed(1) # for reproducible example
b1 <- rnorm(900000, mean=8, sd=2) # samples
b2 <- rnorm(100000, mean=17, sd=2)
# densities corresponding to samples
d = dnorm(c(b1, b2), mean = 8, sd = 2)*.9 + dnorm(c(b1, b2), mean = 17, sd = 2)*.1
# ground truth
b <- data.frame(ss=c(b1,b2), dd=d, gg=factor(c(rep(1, length(b1)), rep(2, length(b2)))))
# sample from mixed distribution
c <- b[sample(nrow(b), 500000),]
library(ggplot2)
ggplot(data = c, aes(x = ss)) +
geom_histogram(aes(y = stat(density)), binwidth = .5, alpha = .3, position="identity") +
geom_line(data = c, aes(x = ss, y = dd), color = "red", inherit.aes = FALSE)
this result is fine: like this
But I want to fill the color according to the samples' group. So I change the code (seg 2):
ggplot(data=c, aes(x=ss)) +
geom_histogram(aes(y=stat(density), fill=gg, color=gg),
binwidth=.5, alpha=.3, position="identity") +
geom_line(data=c, aes(x=ss, y=dd), color="red", inherit.aes=FALSE)
the result is wrong. R calculate the density of two parts separately. So the two part looks like the same height.
Then I found some methods like this (seg 3):
breaks = seq(min(c$ss), max(c$ss), .5) # form cut points
bins1 = cut(with(c, ss[gg==1]), breaks) # form intervals by cutting
bins2 = cut(with(c, ss[gg==2]), breaks)
cnt1 = sapply(split(with(c, ss[gg==1]), bins1), length) # assign points to its interval
cnt2 = sapply(split(with(c, ss[gg==2]), bins2), length)
h = data.frame(
x = head(breaks, -1)+.25,
dens1 = cnt1/sum(cnt1,cnt2), # height of density bar
dens2 = cnt2/sum(cnt1,cnt2)
# weight = sapply(split(samples.mixgamma$samples, bins), sum)
)
ggplot(h) +
geom_bar(aes(x, dens1), fill="red", alpha = .3, stat="identity") +
geom_bar(aes(x, dens2), fill="blue", alpha = .3, stat="identity") +
geom_line(data=c, aes(x=ss, y=dd), color="red", inherit.aes=FALSE)
or set y=stat(count)/sum(stat(count)) like this (seg 4):
ggplot(data=c, aes(x=ss)) +
geom_histogram(aes(y=stat(count)/sum(stat(count)), fill=gg, color=gg),
binwidth=.5, alpha=.3, position="identity") +
geom_line(data=c, aes(x=ss, y=dd), color="red", inherit.aes=FALSE)
the results are the same and wrong, all the bars are about half as tall as seg 1.
So if I want to fill the 2 groups with different color with mixture like seg 2 and the right proportion like seg 1 and avoid the mistake like seg 3 and seg 4, what can I do?
Many thanks!
The solution is that: probability density should be calculated as y=stat(count)/.5/sum(stat(count)). I only do the normolization but not divide mass by it's volume. So the answer such as this and seg 3 need to be modified

Evenly-spaced 1/freq for R function spec.pgram()

The goal is to plot of the coherency between two time series (i.e. the correlation coefficient with respect to frequencies). How can I get 1/freq (i.e. the period) in the x-axis to be evenly-spaced?
t <- 0:99
ts1 <- ts(2*cos((2*pi)/24*t))
ts2 <- ts(2*cos((2*pi)/48*t))
ts12 <- ts.intersect(ts1, ts2)
Coh <- spec.pgram(ts12, spans=3)
plot(Coh$freq, Coh$coh, type='l')
plot(1/Coh$freq, Coh$coh, type='l') # how to get 1/freq to be evenly-space?
I have tried to modify the function spec.pgram() but without success. More specifically, I replace the line:
freq <- seq.int(from = xfreq/N, by = xfreq/N, length.out = Nspec)
with:
freq.tmp <- seq.int(from = xfreq/N, by = xfreq/N, length.out = Nspec)
freq <- rev(1/seq(from=1/max(freq.tmp), to=1/min(freq.tmp), length.out=Nspec))
Has anyone else had better luck? Thanks
Do you mean that you just want to relabel the x-axis with periods, rather than frequencies? That would maintain the spacing of the values on the x-axis, at the expense of a non-linear scaling for the x values. For example (using ggplot2):
library(ggplot2)
dat = as.data.frame(Coh[c("freq","spec","coh","phase")])
ggplot(dat, aes(freq, coh)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks=dat$freq[seq(1,nrow(dat),3)],
minor_breaks=dat$freq,
labels=round(1/dat$freq[seq(1,nrow(dat),3)],1)) +
labs(x="Period")
You could also set the x-value labels to fall on integer periods:
breaks = c(1:10,15,25,50,100)
ggplot(dat, aes(freq, coh)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks=1/breaks,
minor_breaks = 1/(breaks[-1] - 0.5 * diff(breaks)),
labels=breaks) +
labs(x="Period")

ggplot loess line from one dataset over scatterplot of another

The function below calculates binned averages, sizes the bin points on the graph relative to the number of observations in each bin, and plots a lowess line through the bin means. Instead of plotting the lowess line through the bin means, however, I would like to plot the line through the original dataset so that the error bands on the lowess line represent the uncertainty in the actual dataset, not the uncertainty in the binned averages. How do I modify geom_smooth() so that it will plot the line using df instead of dfplot?
library(fields)
library(ggplot2)
binplot <- function(df, yvar, xvar, sub = FALSE, N = 50, size = 40, xlabel = "X", ylabel = "Y"){
if(sub != FALSE){
df <- subset(df, eval(parse(text = sub)))
}
out <- stats.bin(df[,xvar], df[,yvar], N= N)
x <- out$centers
y <- out$stats[ c("mean"),]
n <- out$stats[ c("N"),]
dfplot <- as.data.frame(cbind(x,y,n))
if(size != FALSE){
sizes <- n * (size/max(n))
}else{
sizes = 3
}
ggplot(dfplot, aes(x,y)) +
xlab(xlabel) +
ylab(ylabel) +
geom_point(shape=1, size = sizes) +
geom_smooth()
}
Here is a reproducible example that demonstrates how the function currently works:
sampleSize <- 10000
x1 <- rnorm(n=sampleSize, mean = 0, sd = 4)
y1 <- x1 * 2 + x1^2 * .3 + rnorm(n=sampleSize, mean = 5, sd = 10)
binplot(data.frame(x1,y1), "y1", "x1", N = 25)
As you can see, the error band on the lowess line reflects the uncertainty if each bin had an equal number of observations, but they do not. The bins at the extremes have far fewer obseverations (as illustrated by the size of the points) and the lowess line's error band should reflect that.
You can explicitly set the data= parameter for each layer. You will also need to change the aesthetic mapping since the original data.frame had different column names. Just change your geom_smooth call to
geom_smooth(data=df, aes_string(xvar, yvar))
with the sample data, this returned

R - add centroids to scatter plot

I have a dataset two continuous variables and one factor variable (two classes). I want to create a scatterplot with two centroids (one for each class) that includes error bars in R. The centroids should be positioned at the mean values for x and y for each class.
I can easily create the scatter plot using ggplot2, but I can't figure out how to add the centroids. Is it possible to do this using ggplot / qplot?
Here is some example code:
x <- c(1,2,3,4,5,2,3,5)
y <- c(10,11,14,5,7,9,8,5)
class <- c(1,1,1,0,0,1,0,0)
df <- data.frame(class, x, y)
qplot(x,y, data=df, color=as.factor(class))
Is this what you had in mind?
centroids <- aggregate(cbind(x,y)~class,df,mean)
ggplot(df,aes(x,y,color=factor(class))) +
geom_point(size=3)+ geom_point(data=centroids,size=5)
This creates a separate data frame, centroids, with columns x, y, and class where x and y are the mean values by class. Then we add a second point geometry layer using centroid as the dataset.
This is a slightly more interesting version, useful in cluster analysis.
gg <- merge(df,aggregate(cbind(mean.x=x,mean.y=y)~class,df,mean),by="class")
ggplot(gg, aes(x,y,color=factor(class)))+geom_point(size=3)+
geom_point(aes(x=mean.x,y=mean.y),size=5)+
geom_segment(aes(x=mean.x, y=mean.y, xend=x, yend=y))
EDIT Response to OP's comment.
Vertical and horizontal error bars can be added using geom_errorbar(...) and geom_errorbarh(...).
centroids <- aggregate(cbind(x,y)~class,df,mean)
f <- function(z)sd(z)/sqrt(length(z)) # function to calculate std.err
se <- aggregate(cbind(se.x=x,se.y=y)~class,df,f)
centroids <- merge(centroids,se, by="class") # add std.err column to centroids
ggplot(gg, aes(x,y,color=factor(class)))+
geom_point(size=3)+
geom_point(data=centroids, size=5)+
geom_errorbar(data=centroids,aes(ymin=y-se.y,ymax=y+se.y),width=0.1)+
geom_errorbarh(data=centroids,aes(xmin=x-se.x,xmax=x+se.x),height=0.1)
If you want to calculate, say, 95% confidence instead of std. error, replace
f <- function(z)sd(z)/sqrt(length(z)) # function to calculate std.err
with
f <- function(z) qt(0.025,df=length(z)-1, lower.tail=F)* sd(z)/sqrt(length(z))
I could not get the exact code by #jlhoward to work for me (specifically with the error bars), so I made minor changes to remove errors and even remove warnings. So, you should be able to run the code from start to finish, and if #jlhoward wants to incorporate this into the existing answer, that's great.
centroids <- aggregate(cbind(mean.x = x, mean.y = y) ~ class, df, mean)
gg <- merge(df, centroids, by = "class")
f <- function(z) sd(z) / sqrt(length(z)) # function to calculate std.err
se <- aggregate(cbind(se.x = x ,se.y = y) ~ class, df, f)
centroids <- merge(centroids, se, by = "class") # add std.err column to centroids
ggplot(gg, aes(x = x, y = y, color = factor(class))) +
geom_point(size = 3) +
geom_point(data = centroids, aes(x = mean.x, y = mean.y), size = 5) +
geom_errorbar(data = centroids,
aes(x = mean.x, y = mean.y, ymin = mean.y - se.y, ymax = mean.y + se.y),
width = 0.1) +
geom_errorbarh(data = centroids, inherit.aes=FALSE, # keeps ggplot from using first aes
aes(xmin = (mean.x - se.x), xmax = (mean.x + se.x), y = mean.y,
height = 0.1, color = factor(class))) +
labs(x = "Label for x-axis", y = "Label for y-axis") +
theme(legend.title = element_blank()) # remove legend title

Normalising the x scales of overlaying density plots in ggplot

When overlaying ggplot density plots that feature data of same length but different scales is it possible to normalise the x scale for the plots so the densities match up? Alternatively is there a way to normalise the density y scale?
library(ggplot2)
data <- data.frame(x = c('A','B','C','D','E'), y1 = rnorm(100, mean = 0, sd = 1),
y2 = rnorm(100, mean = 0, sd = 50))
p <- ggplot(data)
# Overlaying the density plots is a fail
p + geom_density(aes(x=y1), fill=NA) + geom_density(aes(x=y2), alpha=0.3,col=NA,fill='red')
# You can compress the xscale in the aes() argument:
y1max <- max(data$y1)
y2max <- max(data$y2)
p + geom_density(aes(x=y1), fill=NA) + geom_density(aes(x=y2*y1max/y2max), alpha=0.3,col=NA,fill='red')
# But it doesn't fix the density scale. Any solution?
# And will it work with facet_wrap?
p + geom_density(aes(x=y1), col=NA,fill='grey30') + facet_wrap(~ x, ncol=2)
Thanks!
Does this do what you were hoping for?
p + geom_density(aes(x=scale(y1)), fill=NA) +
geom_density(aes(x=scale(y2)), alpha=0.3,col=NA,fill='red')
The scale function with only a single data argument will center an empiric distribution on 0 and then divide the resulting values by the sample standard deviation so the result has a standard deviation of 1. You can change the defaults for the location and the degree of "compression" or "expansion". You will probably need to investigate putting in appropriate x_scales for y1 and y2. This may take some preprocessing with scale. The scaling factor is recorded in an attribute of the returned object.
attr(scale(data$y2), "scaled:scale")
#[1] 53.21863

Resources