calculate density of one point in groups - r

I am plotting some density curves, and I want to add a point at the mean of each group. However, I want to plot these points along the top of the density curve, not at 0. Is there a way to come up with a value of the density at the mean point within groups? code follows:
# make df
df<- data.frame(group=c("a","b",'c'),
value=rnorm(
3000,
mean=c(1,2,3),
sd=c(1,1.5,1)
))
library(tidyverse)
library(ggridges)
library(ggdist)
Way 1: density ridges from ggridges ppackage
df %>%
# calculate mean density per group to use later
group_by(group)%>%
mutate(mean_value=mean(value)) %>%
ggplot()+
aes(x=value,y=group)+
geom_density_ridges()+
# could do with stat summary - blue points
stat_summary(
orientation = "y",
fun = mean,
geom = "point",
color="blue"
)+
# or could do with geom_point using precalculated value (red points)
# nudged so we can see both.
geom_point(aes(x=mean_value,y=group),
color="red",
position = position_nudge(x=.1)
)
way 2: stat_halfeye from ggdist package
df %>%
group_by(group)%>%
mutate(mean_value=mean(value)) %>%
# mutate(mean_density = density(mean_value,value))
ggplot()+
aes(x=value,y=group)+
stat_halfeye()+
# could do with stat summary
stat_summary(
orientation = "y",
fun = mean,
geom = "point",
color="blue",
alpha=.8
)+
# or could do with geom_point using precalculated value
# nudged so we can see both.
geom_point(aes(x=mean_value,y=group),
color="red",
position = position_nudge(x=.1)
)
desired output: for these blue or red points to be at the top of the density curve. So I will need a y aesthetic that is something like "group + density value."
Would rather use way 2 (ggdist) than geom_density ridges
Thanks

I'm not sure if there's a way to calculate the height of the density curve at the mean value within the ggplot geom/stat functions, so I've created a couple of helper functions to do that.
dens_at_mean calculates the height of the density curve at the mean of the data. get_mean_coords runs dens_at_mean by group and then scales the height values to match the y-values generated by stat_halfeye and returns a data frame that can be passed to geom_point.
# Reproducible data
set.seed(394)
df<- data.frame(group=c("a","b",'c'),
value=rnorm(
3000,
mean=c(1,2,3),
sd=c(1,1.5,1)
))
# Function to get height of density curve at mean value
dens_at_mean = function(x) {
d = density(x)
mean.x = mean(x)
data.frame(mean.x = mean.x,
max.y = max(d$y),
mean.y = approx(d$x, d$y, xout=mean.x)$y)
}
# Function to return data frame with properly scaled heights
# to plot mean points
get_mean_coords = function(data, value.var, group.var) {
data %>%
group_by({{group.var}}) %>%
summarise(vals = list(dens_at_mean({{value.var}}))) %>%
ungroup %>%
unnest_wider(vals) %>%
# Scale y-value to work properly with stat_halfeye
mutate(mean.y = (mean.y/max(max.y) * 0.9 + 1:n())) %>%
select(-max.y)
}
df %>%
ggplot()+
aes(x=value, y=group)+
stat_halfeye() +
geom_point(data=get_mean_coords(df, value, group),
aes(x=mean.x, y=mean.y),
color="red", size=2) +
theme_bw() +
scale_y_discrete(expand=c(0.08,0.05))

Related

How to add label to each geom_density line?

I have a column for Devices, and Values and I'm plotting a density curve for each Device.
library (ggplot2)
library(magritrr) # for the pipe operator
df %>% ggplot(aes(x = Value, group = Device)) + geom_density()
Now how do I add a label to each line? (I want the Device name to appear beside each density line on the graph and not in the legend)
I created a new summary dataset specifically for the labels. I positioned the labels at the peak of each density plot by picking the max Value for the x-axis and the max density for the y-axis. Hopefully this is helpful.
library(ggplot2)
library(dplyr)
Device = c("dev1","dev2","dev3","dev1","dev2","dev3","dev1","dev2","dev3")
Value = c(10,30,77,5,29,70,12,18,76)
df <- data.frame(Device, Value)
labels <- df %>%
group_by(Device) %>%
summarise(xPos = max(Value),
yPos = max((density(Value))$y))
ggplot() +
geom_density(data=df, aes(x = Value, group = Device, colour=Device)) +
geom_label(data=labels, aes(x=xPos, y=yPos, colour=Device, label=Device)) +
theme_light() +
theme(legend.position = "None")

How to include -inf values in geom_boxplot using the log scale?

I think ggplot is inconsistent in how it treats -inf in log scale. Below are two plots that display similar information. In the first case, I use a box plot to show the 25th, 50th, and 75th percentiles of the example dataset on a log scale. As you can see, it removes the 0 (or - inf values on the log scale) and creates the box plot from the remaining data points.
In the second example, I precalculate the 25th, 50th, and 75th percentiles and then use geom_point to make a similar plot with log scales. In this case, the zero value is simply plotted at the bottom of the axis instead of being removed. I think this behaviour is better. Is there a way to plot the box plot without removing the -inf values?
Thanks a lot.
library(tidyverse)
# create dataset
Test_Data <- data.frame("Values" = seq(0,10000,1))
Test_Data$Values[Test_Data$Values%%3==0] <- 0
Test_Data$Groups[Test_Data$Values%%2==0] <- 'A'
Test_Data$Groups[Test_Data$Values%%2!=0] <- 'B'
Test_Data$Values <- Test_Data$Values/10000
# plot boxplot
ggplot(data = Test_Data, aes(x= Groups, y = Values)) +
geom_boxplot()+
scale_y_continuous(trans='log10', limits = c(1e-5, 1e1)) +
annotation_logticks(sides = "l")
# Create dataset with quantile measures
Test_Data_Processed <- Test_Data %>%
select_all() %>%
group_by(Groups) %>%
summarise(Num = n(),
Median = median(Values),
Percnt_25 = quantile(Values,.25),
Percnt_75 = quantile(Values, .75)) %>%
gather(Measure, Value, Median:Percnt_75)
# plot with geom_point
ggplot(data = Test_Data_Processed, aes(x= Groups, y = Value, shape = Measure, color = Groups)) +
geom_point(size = 4) +
scale_y_continuous(trans='log10', limits = c(1e-5, 1e1)) +
annotation_logticks(sides = "l")

Scale density plots in ggplot2 to have same x-axis range

I want to overlay two density plots; one of data prior to transformation and one after. I don't care about the x and y values, only the shape of the curve.
I want to superimpose the 2 charts for a given Predictor on top of each other, even though the x-axis is different. I find it hard to look across the two facets. In reality, as well, there will be a lot more plots, so combining the non-transformed and transformed data into the one would be the best solution.
library(tidyverse)
require(caret)
data(BloodBrain)
bbbTrans <- preProcess(select(bbbDescr, adistd, adistm, dpsa3, inthb), method = "YeoJohnson")
bbbTransData <- predict(bbbTrans, select(bbbDescr, adistd, adistm, dpsa3, inthb))
dat <- bbbTransData %>%
gather(Predictor, Value) %>%
mutate(Transformation = "Yeo-Johnson") %>%
bind_rows(data.frame(gather(select(bbbDescr, adistd, adistm, dpsa3, inthb), Predictor, Value), Transformation = "NA", stringsAsFactors = FALSE))
# For the predictor adistd, I would like the x-axis range to be 0:12.5 for the
# "Yeo-Johnson" transformation and 0:250 for no transformation. In this plot, it
# is hard to see the shape of the transformed variables due to the different x-value range.
dat %>% ggplot(aes(x = Value, color = Transformation)) +
geom_density(aes(y = ..scaled..), position = "dodge") +
facet_wrap(~Predictor, scales = "free")
# i.e., I want to superimpose the 2 charts for a given Predictor on top of each other, even though the x-axis is different
# I find it hard to look across the two facets. In reality, as well, there will be a lot more plots, so combining the non-transformed and transformed data into the one plot using colour would be the best solution.
filter(dat, Transformation != 'NA') %>% ggplot(aes(x = Value, y = ..scaled..)) +
geom_density() +
facet_wrap(~Predictor, scales = "free")
filter(dat, Transformation == 'NA') %>% ggplot(aes(x = Value, y = ..scaled..)) +
geom_density() +
facet_wrap(~Predictor, scales = "free")
Edit: The algorithm I think I need is (and prefer to do using tidyverse):
Group by predictor/transformation
Get density for each
Transform x of density to (x-xmin)/(xmax-xmin) so that between 0 to 1
Plot transformed density$x, density$y
Solution that scales (base::scale) and calculates density (stats::density). density function outputs same number of equally spaced points so we can arrange them from 0 to 1 (as OP wants).
# How many points we want
nPoints <- 1e3
# Final result
res <- list()
# Using simple loop to scale and calculate density
combinations <- expand.grid(unique(dat$Predictor), unique(dat$Transformation))
for(i in 1:nrow(combinations)) {
# Subset data
foo <- subset(dat, Predictor == combinations$Var1[i] & Transformation == combinations$Var2[i])
# Perform density on scaled signal
densRes <- density(x = scale(foo$Value), n = nPoints)
# Position signal from 1 to wanted number of points
res[[i]] <- data.frame(x = 1:nPoints, y = densRes$y,
pred = combinations$Var1[i], trans = combinations$Var2[i])
}
res <- do.call(rbind, res)
ggplot(res, aes(x / nPoints, y, color = trans, linetype = trans)) +
geom_line(alpha = 0.5, size = 1) +
facet_wrap(~ pred, scales = "free")

How can I plot a cumulative distribution function (CDF) for binned data?

I've got discrete data which i presented in ranges
for example
Marks Freq cumFreq
1 (37.9,43.1] 4 4
2 (43.1,48.2] 16 20
3 (48.2,53.3] 76 96
i need to plot the cmf for this data, I know that there is
plot(ecdf(x))
but i don't what to add for it to have what I need.
Here are a few options:
library(ggplot2)
library(scales)
library(dplyr)
## Fake data
set.seed(2)
dat = data.frame(score=c(rnorm(130,40,10), rnorm(130,80,5)))
Here's how to plot the ECDF if you have the raw data:
# Base graphics
plot(ecdf(dat$score))
# ggplot2
ggplot(dat, aes(score)) +
stat_ecdf(aes(group=1), geom="step")
Here's one way to plot the ECDF if you have only summary data:
First, let's group the data into bins, similar to what you have in your question. We use the cut function to create the bins and then create a new pct column to calculate each bins fraction of the total number of scores. We use the dplyr chaining operator (%>%) to do it all in one "chain" of functions.
dat.binned = dat %>% count(Marks=cut(score,seq(0,100,5))) %>%
mutate(pct = n/sum(n))
Now we can plot it. cumsum(pct) calculates the cumulative percentages (like cumFreq in your question). geom_step creates step plot with these cumulative percentages.
ggplot(dat.binned, aes(Marks, cumsum(pct))) +
geom_step(aes(group=1)) +
scale_y_continuous(labels=percent_format())
Here's what the plots look like:
What about this:
library(ggplot2)
library(scales)
library(dplyr)
set.seed(2)
dat = data.frame(score = c(rnorm(130,40,10), rnorm(130,80,5)))
dat.binned = dat %>% count(Marks = cut(score,seq(0,100,5))) %>%
mutate(pct = n/sum(n))
ggplot(data = dat.binned, mapping = aes(Marks, cumsum(pct))) +
geom_line(aes(group = 1)) +
geom_point(data = dat.binned, size = 0.1, color = "blue") +
labs(x = "Frequency(Hz)", y = "Axis") +
scale_y_continuous(labels = percent_format())

ggplot2 shading envelope of time series

I am plotting the results of 50 - 100 experiments.
Each experiment results in a time series.
I can plot a spaghetti plot of all time series, but
what I'd like to have is sort of a density map for the time series plume.
(something similar to the gray shading in the lower panel
in this figure: http://www.ipcc.ch/graphics/ar4-wg1/jpg/fig-6-14.jpg)
I can 'sort of' do this with 2d binning or binhex but the result could be prettier (see example below).
Here is a code that reproduces a plume plot for mock data (uses ggplot2 and reshape2).
# mock data: random walk plus a sinus curve.
# two envelopes for added contrast.
tt=10*sin(c(1:100)/(3*pi))
rr=apply(matrix(rnorm(5000),100,50),2,cumsum) +tt
rr2=apply(matrix(rnorm(5000),100,50),2,cumsum)/1.5 +tt
# stuff data into a dataframe and melt it.
df=data.frame(c(1:100),cbind(rr,rr2) )
names(df)=c("step",paste("ser",c(1:100),sep=""))
dfm=melt(df,id.vars = 1)
# ensemble average
ensemble_av=data.frame(step=df[,1],ensav=apply(df[,-1],1,mean))
ensemble_av$variable=as.factor("Mean")
ggplot(dfm,aes(step,value,group=variable))+
stat_binhex(alpha=0.2) + geom_line(alpha=0.2) +
geom_line(data=ensemble_av,aes(step,ensav,size=2))+
theme(legend.position="none")
Does anyone know of a nice way do get a shaded envelope with gradients. I have also tried geom_ribbon but that did not give any indication of density changes along the plume. binhex does that, but not with aesthetically pleasing results.
Compute quantiles:
qs = data.frame(
do.call(
rbind,
tapply(
dfm$value, dfm$step, function(i){quantile(i)})),
t=1:100)
head(qs)
X0. X25. X50. X75. X100. t
1 -0.8514179 0.4197579 0.7681517 1.396382 2.883903 1
2 -0.6506662 1.2019163 1.6889073 2.480807 5.614209 2
3 -0.3182652 2.0480082 2.6206045 4.205954 6.485394 3
4 -0.1357976 2.8956990 4.2082762 5.138747 8.860838 4
5 0.8988975 3.5289219 5.0621513 6.075937 10.253379 5
6 2.0027973 4.5398120 5.9713921 7.015491 11.494183 6
Plot ribbons:
ggplot() +
geom_ribbon(data=qs, aes(x=t, ymin=X0., ymax=X100.),fill="gray30", alpha=0.2) +
geom_ribbon(data=qs, aes(x=t, ymin=X25., ymax=X75.),fill="gray30", alpha=0.2)
This is for two quantile intervals, (0-100) and (25-75). You'll need more args to quantile and more ribbon layers for more quantiles, and need to adjust the colours too.
Based on the idea of Spacedman, I found a way to add more intervals in an automatic way: I first compute the quantiles for each step, group them by pairs of symmetric values and then use geom_ribbon in the right order...
library(tidyr)
library(dplyr)
condquant <- dfm %>% group_by(step) %>%
do(quant = quantile(.$value, probs = seq(0,1,.05)), probs = seq(0,1,.05)) %>%
unnest() %>%
mutate(delta = 2*round(abs(.5-probs)*100)) %>%
group_by(step, delta) %>%
summarize(quantmin = min(quant), quantmax= max(quant))
ggplot() +
geom_ribbon(data = condquant, aes(x = step, ymin = quantmin, ymax = quantmax,
group = reorder(delta, -delta), fill = as.numeric(delta)),
alpha = .5) +
scale_fill_gradient(low = "grey10", high = "grey95") +
geom_line(data = dfm, aes(x = step, y = value, group=variable), alpha=0.2) +
geom_line(data=ensemble_av,aes(step,ensav),size=2)+
theme(legend.position="none")
Thanks Erwan and Spacedman.
Avoiding 'tidyr' ('dplyr' and 'magrittr') my version of Erwans answer becomes
probs=c(0:10)/10 # use fewer quantiles than Erwan
arr=t(apply(df[,-1],1,quantile,prob=probs))
dfq=data.frame(step=df[,1],arr)
names(dfq)=c("step",colnames(arr))
dfqm=melt(dfq,id.vars=c(1))
# add inter-quantile (per) range as delta
dfqm$delta=dfqm$variable
levels(dfqm$delta)=abs(probs-rev(probs))*100
dfplot=ddply(dfqm,.(step,delta),summarize,
quantmin=min(value),
quantmax=max(value) )
ggplot() +
geom_ribbon(data = dfplot, aes(x = step, ymin = quantmin,
ymax =quantmax,group=rev(delta),
fill = as.numeric(delta)),
alpha = .5) +
scale_fill_gradient(low = "grey25", high = "grey75") +
geom_line(data=ensemble_av,aes(step,ensav),size=2) +
theme(legend.position="none")

Resources