cumulative probability plot from frequency table - r

Is there any way to plot the cumulative probability from a frequency table? I mean a "smooth" version of it, similar to the way geom_density() plots.
So far, I managed to plot the individually calculated probabilities as points joined by lines, but it doesn't look very good.

I generate some test data:
set.seed(1)
x <- sort(sample(1:100, 20))
p <- runif(x); p <- cumsum(p)/sum(p)
table <- data.frame(x=x, prob=p)
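For reference, a minimal sketch of the "points joined by lines" version described above (not smoothed, using the table data frame just created) might be:
library(ggplot2)
# individually calculated probabilities, joined by straight lines -- the unsmoothed baseline
ggplot(table, aes(x=x, y=prob)) +
  geom_point() +
  geom_line() +
  ylab("cumulative probability")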
You can use geom_smooth from the ggplot2 package.
require("ggplot2")
qplot(x=x, y=prob, data=table, ylim=c(0, 1)) + ylab("ecf") +
  geom_smooth(se=FALSE, method="loess", fullrange=TRUE, size=1)
As an alternative that lets you specify the amount of smoothing with a single parameter, try DeconCdf from the decon package:
require("decon")
plot(DeconCdf(x, sig=1))
If you want to use ggplot, you first have to convert the DeconCdf object into a data.frame.
f <- DeconCdf(x, sig=1)
m <- ggplot(data=data.frame(x=f$x, p=f$y), aes(x=x, y=p, ymin=0, ymax=1)) + ylab("ecf")
m + geom_line(size=1)
Use the sig argument to control the amount of smoothing:
f <- DeconCdf(x, sig=0.3)
m <- ggplot(data=data.frame(x=f$x, p=f$y), aes(x=x, y=p, ymin=0, ymax=1)) + ylab("ecf")
m + geom_line(size=1)

This version plots a histogram with a smoothed line from geom_density:
# Generate some data:
set.seed(28986)
x2 <- rweibull(100, 1, 1/2)
# Plot the points:
library(ggplot2)
library(scales)
ggplot(data.frame(x=x2), aes(x=x, y=1-cumsum(..count..)/sum(..count..))) +
  geom_histogram(aes(fill=..count..)) +
  geom_density(fill=NA, color="black", adjust=1/2) +
  scale_y_continuous("Percent of units\n(equal to or larger than x)", labels=percent) +
  theme_grey(base_size=18)
Note that I've used 1 - the cumulative probability purely as a matter of preference (I think it looks better, and I'm used to working with reliability metrics); if you don't want that, just remove the 1 - from the aes().
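If you prefer the plain (non-reversed) cumulative version, a sketch of the same plot with the 1 - removed and the axis label adjusted would be:
ggplot(data.frame(x=x2), aes(x=x, y=cumsum(..count..)/sum(..count..))) +
  geom_histogram(aes(fill=..count..)) +
  geom_density(fill=NA, color="black", adjust=1/2) +
  scale_y_continuous("Percent of units\n(less than or equal to x)", labels=percent) +
  theme_grey(base_size=18)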

Related

How to make density plot correctly show area near the limits?

When I plot densities with ggplot, the result looks very wrong near the limits of the data's support. I see that geom_density and other functions allow specifying various density kernels, but none of them seem to fix the issue.
How do you correctly plot densities around the limits with ggplot?
As an example, let's plot the chi-square distribution with 2 degrees of freedom, using the built-in probability densities:
library(ggplot2)
u = seq(0, 2, by=0.01)
v = dchisq(u, df=2)
df = data.frame(x=u, p=v)
p = ggplot(df) +
geom_line(aes(x=x, y=p), size=1) +
theme_classic() +
coord_cartesian(xlim=c(0, 2), ylim=c(0, 0.5))
show(p)
We get the expected plot:
Now let's try simulating it and plotting the empirical distribution:
library(ggplot2)
u = rchisq(10000, df=2)
df = data.frame(x=u)
p = ggplot(df) +
geom_density(aes(x=x)) +
theme_classic() +
coord_cartesian(xlim=c(0, 2))
show(p)
We get an incorrect plot:
We can try to visualize the actual distribution:
library(ggplot2)
u = rchisq(10000, df=2)
df = data.frame(x=u)
p = ggplot(df) +
geom_point(aes(x=x, y=0.5), position=position_jitter(height=0.2), shape='.', alpha=1) +
theme_classic() +
coord_cartesian(xlim=c(0, 2), ylim=c(0, 1))
show(p)
And it seems to look correct, contrary to the density plot:
It seems like the problem has to do with the kernel, and geom_density does allow specifying different kernels, but they don't really correct the limit problem. For example, the code above with a triangular kernel looks about the same (see the sketch below):
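For reference, the triangular variant referred to above is presumably just the earlier geom_density call with the kernel argument changed; a minimal sketch:
library(ggplot2)
u = rchisq(10000, df=2)
df = data.frame(x=u)
p = ggplot(df) +
  geom_density(aes(x=x), kernel="triangular") +  # kernel is passed through to stats::density
  theme_classic() +
  coord_cartesian(xlim=c(0, 2))
show(p)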
Here's an idea of what I'm expecting to see (of course, I want a density, not a histogram):
library(ggplot2)
u = rchisq(10000, df=2)
df = data.frame(x=u)
p = ggplot(df) +
geom_histogram(aes(x=x), center=0.1, binwidth=0.2, fill='white', color='black') +
theme_classic() +
coord_cartesian(xlim=c(0, 2))
show(p)
The usual kernel density methods have trouble when there is a constraint such as this one, where the density has support only above zero. The usual recommendation for handling it has been the logspline package:
install.packages("logspline")
library(logspline)
fit <- logspline(rchisq(10000, 3))
plot(fit)
If this needs to be done in the ggplot2 environment, there is a dlogspline function:
densdf <- data.frame(y=dlogspline(seq(0, 12, length=1000), fit),
                     x=seq(0, 12, length=1000))
ggplot(densdf, aes(y=y, x=x)) + geom_line()
Perhaps you were insisting on one with 2 degrees of freedom?
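If you do need the 2-degrees-of-freedom case, the same logspline approach applies; here is a sketch (the lbound=0 argument, which tells logspline that the support starts at zero, is my addition to handle the boundary, where the chi-square(2) density is nonzero):
library(logspline)
library(ggplot2)
fit2 <- logspline(rchisq(10000, df=2), lbound=0)  # lbound constrains the support to [0, Inf)
grid <- seq(0, 12, length=1000)
densdf2 <- data.frame(x=grid, y=dlogspline(grid, fit2))
ggplot(densdf2, aes(x=x, y=y)) + geom_line()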

ggplot2: add conditional density curves describing both dimensions of scatterplot

I have scatterplots of 2D data from two categories. I want to add density lines for each dimension -- not outside the plot (cf. Scatterplot with marginal histograms in ggplot2) but right on the plotting surface. I can get this for the x-axis dimension, like this:
set.seed(123)
dim1 <- c(rnorm(100, mean=1), rnorm(100, mean=4))
dim2 <- rnorm(200, mean=1)
cat <- factor(c(rep("a", 100), rep("b", 100)))
mydf <- data.frame(dim2, dim1, cat)  # avoid cbind() here: it would silently convert cat to numeric
ggplot(data=mydf, aes(x=dim1, y=dim2, colour=as.factor(cat))) +
  geom_point() +
  stat_density(aes(x=dim1, y=(-2+(..scaled..))),
               position="identity", geom="line")
It looks like this:
But I want an analogous pair of density curves running vertically, showing the distribution of points in the y-dimension. I tried
stat_density(aes(y=dim2, x=0+(..scaled..)), position="identity", geom="line")
but receive the error "stat_density requires the following missing aesthetics: x".
Any ideas? Thanks!
You can compute the densities of the dim2 variable for each category, flip the axes, and store the result in a new data.frame. After that, it is simply a matter of plotting them on top of the other graph.
p <- ggplot(data=mydf, aes(x=dim1, y=dim2, colour=as.factor(cat))) +
  geom_point() +
  stat_density(aes(x=dim1, y=(-2+(..scaled..))),
               position="identity", geom="line")
stuff <- ggplot_build(p)
xrange <- stuff[[2]]$ranges[[1]]$x.range # extract the x range so the new densities align with the y-axis (note: this indexing relies on ggplot_build internals, which change between ggplot2 versions)
## Get densities of dim2
ds <- do.call(rbind, lapply(unique(mydf$cat), function(lev) {
  dens <- with(mydf, density(dim2[cat==lev]))
  data.frame(x=dens$y + xrange[1], y=dens$x, cat=lev)
}))
p + geom_path(data=ds, aes(x=x, y=y, color=factor(cat)))
So far I can produce:
distrib_horiz <- stat_density(aes(x=dim1, y=(-2+(..scaled..))),
                              position="identity", geom="line")
ggplot(data=mydf, aes(x=dim1, y=dim2, colour=as.factor(cat))) +
geom_point() + distrib_horiz
And:
distrib_vert <- stat_density(data=mydf, aes(x=dim2, y=(-2+(..scaled..))),
                             position="identity", geom="line")
ggplot(data=mydf, aes(x=dim2, y=dim1, colour=as.factor(cat))) +
geom_point() + distrib_vert + coord_flip()
But combining them is proving tricky.
So far I have only a partial solution: I didn't manage to obtain a vertical stat_density line for each individual category, only for the whole set. Maybe it can nevertheless serve as a starting point for a better solution. My suggestion is to try the ggMarginal() function from the ggExtra package.
p <- ggplot(data=mydf, aes(x=dim1, y=dim2, colour=as.factor(cat))) +
  geom_point() +
  stat_density(aes(x=dim1, y=(-2+(..scaled..))),
               position="identity", geom="line")
library(ggExtra)
ggMarginal(p,type = "density", margins = "y", size = 4)
This is what I obtain:
I know it's not perfect, but maybe it's a step in a helpful direction. At least I hope so. Looking forward to seeing other answers.

A better way to build confidence bands around mean/median of an observed sample using ggplot2

So I have a three-column data frame with Trial, the independent Variable, and Observation. Something like:
df1<- data.frame(Trial=rep(1:10,5), Variable=rep(1:5, each=10), Observation=rnorm(1:50))
I am trying to plot a 95% confidence interval around the mean at each value of Variable, using a rather inefficient method, as follows:
b<-NULL
b$mean<- aggregate(Observation~Variable, data=df1,mean)[,2]
b$sd <- aggregate(Observation~Variable, data=df1,sd)[,2]
b$Variable<- df1$Variable
b$Observation <- df1$Observation
b$ucl <- rep(qnorm(.975, mean=b$mean, sd=b$sd), each=10)
b$lcl <- rep(qnorm(.025, mean=b$mean, sd=b$sd), each=10)
b<- as.data.frame(b)
c <- ggplot(b, aes(Variable, Observation))
c + geom_point(color="red") +
geom_smooth(aes(ymin = lcl, ymax = ucl), data=b, stat="summary", fun.y="mean")
This is inefficient since it duplicates the values for ymin and ymax. I've seen the geom_ribbon methods, but I would still need to duplicate. However, if I were using any kind of model-based smoothing, such as glm, the code would be much simpler, with no duplication (see the sketch below). Is there a better way of doing this?
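For comparison, the "much simpler" model-based case mentioned above, where ggplot2 computes the band itself, might look like this sketch (a linear fit is used purely for illustration):
library(ggplot2)
ggplot(df1, aes(Variable, Observation)) +
  geom_point(color="red") +
  geom_smooth(method="lm")  # ggplot2 computes and draws the confidence band itself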
References:
1. R Plotting confidence bands with ggplot
2. Shading confidence intervals manually with ggplot2
3. http://docs.ggplot2.org/current/geom_smooth.html
With this method I get the same output as with your method; it was inspired by the ggplot2 docs. Note that this is only meaningful as long as each x value has multiple points.
set.seed(1)
df1 <- data.frame(Trial=rep(1:10,5), Variable=rep(1:5, each=10), Observation=rnorm(1:50))
my_ci <- function(x) data.frame(
y=mean(x),
ymin=mean(x) - 2 * sd(x),
ymax=mean(x) + 2 * sd(x)
)
ggplot(df1, aes(Variable, Observation)) + geom_point(color="red") +
stat_summary(fun.data="my_ci", geom="smooth")
The ggplot2 package comes with wrappers for a number of summary functions from the Hmisc package, including
mean_cl_normal which calculates the confidence limits based on the t-distribution,
mean_cl_boot which uses a bootstrap method that does not assume a distribution of the mean,
mean_sdl which uses a multiple of the standard deviation (default=2).
This last method is the same as in the answer above, but it does not give the 95% confidence limits. Confidence limits based on the t-distribution are given by:
CL = x̄ ± t × s / √n
where t is the appropriate quantile of the t-distribution and s is the sample standard deviation. Compare the confidence bands:
ggplot(df1, aes(x=Variable, y=Observation)) +
  stat_summary(fun.data="mean_sdl", geom="line", colour="blue") +
  stat_summary(fun.data="mean_sdl", mult=2, geom="errorbar",
               width=0.1, linetype=2, colour="blue") +
  geom_point(color="red") +
  labs(title=expression(paste(bar(x), " \u00B1 ", "2 * sd")))
ggplot(df1, aes(x=Variable, y=Observation)) +
  geom_point(color="red") +
  stat_summary(fun.data="mean_cl_normal", geom="line", colour="blue") +
  stat_summary(fun.data="mean_cl_normal", conf.int=0.95, geom="errorbar",
               width=0.1, linetype=2, colour="blue") +
  stat_summary(fun.data="mean_cl_normal", geom="point", size=3,
               shape=1, colour="blue") +
  labs(title=expression(paste(bar(x), " \u00B1 ", "t * sd / sqrt(n)")))
Finally, rotating this last plot using coord_flip() generates something very close to a Forest Plot, which is a standard method for summarizing data like yours.
ggplot(df1, aes(x=Variable, y=Observation)) +
  geom_point(color="red") +
  stat_summary(fun.data="mean_cl_normal", conf.int=0.95, geom="errorbar",
               width=0.2, colour="blue") +
  stat_summary(fun.data="mean_cl_normal", geom="point", size=3,
               shape=1, colour="blue") +
  geom_hline(aes(yintercept=mean(Observation)), linetype=2) +
  labs(title="Forest Plot") +
  coord_flip()
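A side note: in more recent ggplot2 releases, extra arguments to the summary functions are passed through fun.args rather than directly (as mult and conf.int are above). A sketch of the mean_sdl error bars in that style (assuming a newer ggplot2 and the Hmisc wrappers) would be:
ggplot(df1, aes(x=Variable, y=Observation)) +
  geom_point(color="red") +
  stat_summary(fun.data="mean_sdl", fun.args=list(mult=2),
               geom="errorbar", width=0.1, linetype=2, colour="blue")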

making binned scatter plots for two variables in ggplot2 in R

I have a dataframe with two columns x and y that each contain values between 0 and 100 (the data are paired). I want to correlate them to each other using binned scatter plots. If I were to use a regular scatter plot, it would be easy to do:
geom_point(aes(x=x, y=y))
but I'd like instead to bin the points into N bins from 0 to 100, get the average value of x and the average value of y for the points in each bin, and show those as a scatter plot, so that the binned averages are correlated instead of the raw data points.
Is there a clever/quick way to do this in ggplot2, using some combination of geom_smooth() and geom_point()? Or does it have to be pre-computed manually and then plotted?
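For reference, the "pre-computed manually" route mentioned in the question might look like the following sketch (the example data frame and the choice of N = 10 bins are assumptions, not from the question):
library(ggplot2)
# assumed example data with x and y on [0, 100]; replace with your own data frame
df <- data.frame(x=runif(500, 0, 100))
df$y <- df$x + rnorm(500, sd=10)
# cut x into 10 equal-width bins, then average x and y within each bin
df$bin <- cut(df$x, breaks=seq(0, 100, length.out=11), include.lowest=TRUE)
binned <- aggregate(cbind(x, y) ~ bin, data=df, FUN=mean)
ggplot(binned, aes(x=x, y=y)) + geom_point()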
Yes, you can use stat_summary_bin.
set.seed(42)
x <- runif(1e4)
y <- x^2 + x + 4 * rnorm(1e4)
df <- data.frame(x=x, y=y)
library(ggplot2)
(ggplot(df, aes(x=x, y=y)) +
  geom_point(alpha=0.4) +
  stat_summary_bin(fun.y='mean', bins=20,
                   color='orange', size=2, geom='point'))
I suggest geom_bin2d.
DF <- data.frame(x=1:100,y=1:100+rnorm(100))
library(ggplot2)
p <- ggplot(DF,aes(x=x,y=y)) + geom_bin2d()
print(p)
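The resolution of the 2D bins can be tuned with geom_bin2d's bins (or binwidth) argument; for example (the value 20 here is arbitrary):
p <- ggplot(DF, aes(x=x, y=y)) + geom_bin2d(bins=20)
print(p)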

Add a vertical line with different intercept for each panel in ggplot2

I'm using ggplot2 to create panels of histograms, and I'd like to be able to add a vertical line at the mean of each group. But geom_vline() uses the same intercept for each panel (i.e. the global mean):
require("ggplot2")
# setup some sample data
N <- 1000
cat1 <- sample(c("a","b","c"), N, replace=T)
cat2 <- sample(c("x","y","z"), N, replace=T)
val <- rnorm(N) + as.numeric(factor(cat1)) + as.numeric(factor(cat2))
df <- data.frame(cat1, cat2, val)
# draws a single histogram with vline at mean
qplot(val, data=df, geom="histogram", binwidth=0.2) +
geom_vline(xintercept=mean(val), color="red")
# draws panel of histograms with vlines at global mean
qplot(val, data=df, geom="histogram", binwidth=0.2, facets=cat1~cat2) +
geom_vline(xintercept=mean(val), color="red")
How can I get it to use each panel's group mean as the x-intercept? (Bonus points if you can also add a text label by the line with the value of the mean.)
I guess this is really a reworking of eduardo's answer, but in one line.
ggplot(df) +
  geom_histogram(mapping=aes(x=val)) +
  geom_vline(data=aggregate(df[3], df[c(1,2)], mean),
             mapping=aes(xintercept=val), color="red") +
  facet_grid(cat1~cat2)
Or using plyr (require(plyr)), a package by Hadley Wickham, the author of ggplot2:
ggplot(df) +
  geom_histogram(mapping=aes(x=val)) +
  geom_vline(data=ddply(df, cat1~cat2, numcolwise(mean)),
             mapping=aes(xintercept=val), color="red") +
  facet_grid(cat1~cat2)
It seems unsatisfying that geom_vline isn't split up by the facets automatically; I'm not sure why.
One way is to construct the data.frame with the mean values beforehand.
library(reshape)
dfs <- recast(data.frame(cat1, cat2, val), cat1+cat2~variable, fun.aggregate=mean)
qplot(val, data=df, geom="histogram", binwidth=0.2, facets=cat1~cat2) +
  geom_vline(data=dfs, aes(xintercept=val), colour="red") +
  geom_text(data=dfs, aes(x=val+1, y=1, label=round(val,1)), size=4, colour="red")
