Plot point on ggplot2 smoothing regression on vline intersection - r

I want to create a (time-series) plot out of 40 million data points in order to show two regression lines with two specific events on each of it (first occurrence of an optimum in time-series).
Currently, I draw the regression lines and add a geom_vline to it to indicate the event.
As I want to be independent from colours in the plot, it would be beneficial if I could just plot the marker geom_vline as a point on the regression line.
Do you have any idea how to solve this using ggplot2?
My current approach is this here (replaced data points with test data):
library(ggplot2)
# Generate data
m1 <- "method 1"
m2 <- "method 2"
data1 <- data.frame(Time=seq(100), Value=sample(1000, size=100), Type=rep(as.factor(m1), 100))
data2 <- data.frame(Time=seq(100), Value=sample(1000, size=100), Type=rep(as.factor(m2), 100))
df <- rbind(data1, data2)
rm(data1, data2)
# Calculate first minima for each Type
m1_intercept <- df[which(df$Type == m1), ][which.min(df[which(df$Type == m1), ]$Value),]
m2_intercept <- df[which(df$Type == m2), ][which.min(df[which(df$Type == m2), ]$Value),]
# Plot regression and vertical lines
p1 <- ggplot(df, aes(x=Time, y=Value, group=Type, colour=Type), linetype=Type) +
geom_smooth(se=F) +
geom_vline(aes(xintercept=m1_intercept$Time, linetype=m1_intercept$Type)) +
geom_vline(aes(xintercept=m2_intercept$Time, linetype=m2_intercept$Type)) +
scale_linetype_manual(name="", values=c("dotted", "dashed")) +
guides(colour=guide_legend(title="Regression"), linetype=guide_legend(title="First occurrence of optimum")) +
theme(legend.position="bottom")
ggsave("regression.png", plot=p1, height=5, width=7)
which generates this plot:
My desired plot would be something like this:
So my questions are
Does it make sense to indicate a minimum value on a regression line? The values y-axis position would be in fact wrong but just to indicate the timepoint?
If yes, how can I achieve such a behaviour?
If no, what would you think could be better?
Thank you very much in advance!
Robin

If you first run your ggplot() call with only geom_smooth(), you can access plotted values through ggplot_build(), which we then can use to plot points on the two fitted lines. Example:
# Create the initial plot containing only the smoothed regression lines
p1 <- ggplot(df, aes(x = Time, y = Value, colour = Type)) +
  geom_smooth(se = FALSE)
# ggplot_build() exposes the data computed for every layer; the first (and
# only) layer here is geom_smooth(), so this holds the fitted x/y values
smooths <- ggplot_build(p1)$data[[1]]
smooths_1 <- smooths[smooths$group == 1, ] # First group (method 1)
smooths_2 <- smooths[smooths$group == 2, ] # Second group (method 2)
# Then we find the closest plotted x values to the time points of the minima
smooth_1_x <- smooths_1$x[which.min(abs(smooths_1$x - m1_intercept$Time))]
smooth_2_x <- smooths_2$x[which.min(abs(smooths_2$x - m2_intercept$Time))]
# Subset each group's fitted values at its own closest x value.
# Each mask must be applied to the data frame it was built from: method 2's
# point has to come from smooths_2, otherwise it lands on method 1's curve.
point_data1 <- smooths_1[smooths_1$x == smooth_1_x, ]
point_data2 <- smooths_2[smooths_2$x == smooth_2_x, ]
Now we use point_data1 and point_data2 to place the points on your plot:
ggplot(df, aes(x=Time, y=Value, colour=Type)) +
geom_smooth(se=F) +
geom_point(data=point_data1, aes(x=x, y=y), colour = "red",size = 5) +
geom_point(data=point_data2, aes(x=x, y=y), colour = "red", size = 5)
To reproduce this plot, you can use set.seed(42) for your data generation step.

Related

ggplot: how to add direct labels to each curve

I was trying to add a direct label to each curve in my density plot. The labels should look something like the plots in this tutorial; however, I can't get it to work.
Here is my data frame:
values <- runif(1200, 35, 60)
ind <- as.factor(rep(c(1:6), each=200))
inout <- as.factor(rep(c(1:2), each =600))
df <- data.frame(values,ind,inout)
Here is the density plot:
ggplot(df) +
geom_density(aes(x=values, group=interaction(ind,inout), colour=factor(inout)), alpha=1) +
geom_density(aes(x=values, group=inout, fill=factor(inout)), alpha=.4) +
theme(text = element_text(size=25)) +
theme(legend.justification=c(1,1), legend.position=c(1,1)) +
guides(colour=FALSE) +
scale_fill_discrete(name="Ave.",breaks=c("1", "2"),labels=c("S1", "S2"))
How can I add direct labels (i.e., 1 to 6) to each curve for two groups (i.e., S1 and S2)? Two averaged curves don't need to be labeled.
Thanks a lot.

How to shade part of a density curve in ggplot (with no y axis data)

I'm trying to create a density curve in R using a set of random numbers between 1000, and shade the part that is less than or equal to a certain value. There are a lot of solutions out there involving geom_area or geom_ribbon, but they all require a yval, which I don't have (it's just a vector of 1000 numbers). Any ideas on how I could do this?
Two other related questions:
Is it possible to do the same thing for a cumulative density function (I'm currently using stat_ecdf to generate one), or shade it at all?
Is there any way to edit geom_vline so it will only go up to the height of the density curve, rather than the whole y axis?
Code: (the geom_area is a failed attempt to edit some code I found. If I set ymax manually, I just get a column taking up the whole plot, instead of just the area under the curve)
set.seed(100)
amount_spent <- rnorm(1000,500,150)
amount_spent1<- data.frame(amount_spent)
rand1 <- runif(1,0,1000)
amount_spent1$pdf <- dnorm(amount_spent1$amount_spent)
mean1 <- mean(amount_spent1$amount_spent)
#density/bell curve
ggplot(amount_spent1,aes(amount_spent)) +
geom_density( size=1.05, color="gray64", alpha=.5, fill="gray77") +
geom_vline(xintercept=mean1, alpha=.7, linetype="dashed", size=1.1, color="cadetblue4")+
geom_vline(xintercept=rand1, alpha=.7, linetype="dashed",size=1.1, color="red3")+
geom_area(mapping=aes(ifelse(amount_spent1$amount_spent > rand1,amount_spent1$amount_spent,0)), ymin=0, ymax=.03,fill="red",alpha=.3)+
ylab("")+
xlab("Amount spent on lobbying (in Millions USD)")+
scale_x_continuous(breaks=seq(0,1000,100))
There are a couple of questions that show this ... here and here, but they calculate the density prior to plotting.
This is another way, more complicated than required I'm sure, that allows ggplot to do some of the calculations for you.
# Your data
set.seed(100)
amount_spent1 <- data.frame(amount_spent=rnorm(1000, 500, 150))
mean1 <- mean(amount_spent1$amount_spent)
rand1 <- runif(1,0,1000)
Basic density plot
p <- ggplot(amount_spent1, aes(amount_spent)) +
geom_density(fill="grey") +
geom_vline(xintercept=mean1)
You can extract the x and y positions for the area to shade from the plot object using ggplot_build. Linear interpolation was used to get the y value at x=rand1
# subset region and plot
d <- ggplot_build(p)$data[[1]]
p <- p + geom_area(data = subset(d, x > rand1), aes(x=x, y=y), fill="red") +
geom_segment(x=rand1, xend=rand1,
y=0, yend=approx(x = d$x, y = d$y, xout = rand1)$y,
colour="blue", size=3)

When I use stat_summary with line and point geoms I get a double legend

I have data for 4 sectors (A,B,C,D) and 5 years. I would like to draw 4 lines, 1 for each sector, adding a point for every year and add a fifth line representing the mean line using the stat_summary statement and controlling the line colors by means of scale_color_manual and point shapes in aes() argument. The problem is that if I add the point geom the legend is split in two parts one for point shapes and one for line colors. I didn't understand how to obtain 1 legend combining colors and points.
Here is an example. First of all let's build the data frame dtfr as follows:
# Simulate five yearly values for four sectors. Every series starts at 100 and
# each following year multiplies the previous value by (1 + rnorm(1) / 100).
a <- 100; b <- 100; c <- 100; d <- 100
for (k in 2:5) {
  a[k] <- a[k - 1] * (1 + rnorm(1) / 100)
  b[k] <- b[k - 1] * (1 + rnorm(1) / 100)
  c[k] <- c[k - 1] * (1 + rnorm(1) / 100)
  d[k] <- d[k - 1] * (1 + rnorm(1) / 100)
}
# Interleave the series year by year (a1, b1, c1, d1, a2, ...): rbind() stacks
# them as rows and as.vector() reads the matrix column by column. This avoids
# growing a vector with c() inside a loop.
v <- as.vector(rbind(a, b, c, d))
# Long-format data frame: one row per (Year, Sector) combination.
dtfr <- data.frame(Year = rep(2008:2012, each = 4),
                   Sector = rep(c("A", "B", "C", "D"), 5),
                   Value = v,
                   stringsAsFactors = FALSE)
Now let us start to draw our graph with ggplot2. In the first graph we draw the line and point geoms without the mean line:
library(ggplot2)
ggplot(dtfr, aes(x=Year, y=Value)) +
geom_line(aes(group=Sector, color=Sector)) +
geom_point(aes(color=Sector, shape=Sector)) +
# stat_summary(aes(colour="mean",group=1), fun.y=mean, geom="line", size=1.1) +
scale_color_manual(values=c("#004E00", "#33FF00", "#FF9966", "#3399FF", "#FF004C")) +
ggtitle("Test for ggplot2 graph")
In this graph we have the legend with line colors and point shapes all in one:
But if I use the stat_summary to draw the mean line using the following code:
ggplot(dtfr, aes(x=Year, y=Value)) +
geom_line(aes(group=Sector, color=Sector)) +
geom_point(aes(color=Sector, shape=Sector)) +
stat_summary(aes(colour="mean",group=1), fun.y=mean, geom="line", size=1.1) +
scale_color_manual(values=c("#004E00", "#33FF00", "#FF9966", "#3399FF", "#FF004C")) +
ggtitle("Test for ggplot2 graph")
I get the mean (red) line but the legend is split into two parts one for line colors and one for point shapes. At this point my question is: How can I get the mean line graph with the legend like the one in the first graph? That is, how to get only one legend combining lines and shapes in the second graph where is drawn the mean line?
Try this:
ggplot(dtfr, aes(x=Year, y=Value)) +
geom_line(aes(group=Sector, color=Sector)) +
geom_point(aes(color=Sector, shape=Sector)) +
stat_summary(aes(colour="mean",shape="mean",group=1), fun.y=mean, geom="line", size=1.1) +
scale_color_manual(values=c("#004E00", "#33FF00", "#FF9966", "#3399FF", "#FF004C")) +
scale_shape_manual(values=c(1:4, 32)) +
ggtitle("Test for ggplot2 graph")
Maybe someone more knowledgeable can come in and correct my explanation (or provide a better solution), but here's how I understand it: You have 5 values in the color scale, but you only have 4 in the shape scale; you're missing a value for "mean". So the scales aren't really compatible in a way. You can fix this by assigning a blank shape (32) to your mean line.
Here is a different approach that calculates the summary/mean beforehand and adds it as an additional level to the data frame before building the plot.
The approach can be used to easily add an additional line but with a specific color, which may be desired for a summary/mean for example.
First, I calculate the mean and add it to the dtfr of the OP.
# Compute the mean Value per Year across all sectors, mark those rows with
# Sector = NA, and put them in front of the original rows.
# Plain namespaced calls are used instead of %>%: this snippet only refers to
# dplyr via dplyr:: and never attaches a package that provides the pipe
# operator, so %>% would not be found.
yearly_mean <- dplyr::summarise(dplyr::group_by(dtfr, Year), Value = mean(Value))
yearly_mean <- dplyr::mutate(yearly_mean, Sector = NA)
dtfr2 <- dplyr::bind_rows(yearly_mean, dtfr)
dtfr2 now has additional rows with the mean values stored in Value and NAs in Sector.
Then, building the plot is easy:
p1 <- ggplot(dtfr2, aes(x=Year, y=Value, color = Sector, shape = Sector)) +
geom_line() +
geom_point()
Finally, you may tweak the legend a little:
p1 +
scale_color_discrete(labels = c(letters[1:4], "M"), na.value = "black") +
scale_shape_discrete(labels = c(letters[1:4], "M"))

ggplot2 using geom_errorbar and geom_point to add points to a plot

I have a plot using ggplot, and I would like to add points and error bars to it. I am using geom_errorbar and geom_point, but I am getting an error: "Discrete value supplied to continuous scale" and I am not sure why. The data labels in the plot below should remain the same. I simply want to add new points to the existing graph. The new graph should look like the one below, except with two points/CI bars for each label on the Y axis.
The following example is from the lme4 package, and it produces a plot with confidence intervals using ggplot below (all can be replicated except the last two lines of broken code). My data is only different in that it includes about 15 intercepts instead of 6 below (which is why I am using scale_shape_manual).
The last two lines of code is my attempt at adding points/confidence intervals. I'm going to put a 50 bounty on this. Please let me know if I am being unclear. Thanks!
library("lme4")
data(package = "lme4")
# Dyestuff
# a balanced one-way classification of Yield
# from samples produced from six Batches
summary(Dyestuff)
# Batch is an example of a random effect
# Fit 1-way random effects linear model
fit1 <- lmer(Yield ~ 1 + (1|Batch), Dyestuff)
summary(fit1)
coef(fit1) #intercept for each level in Batch
randoms<-ranef(fit1, postVar = TRUE)
qq <- attr(ranef(fit1, postVar = TRUE)[[1]], "postVar")
rand.interc<-randoms$Batch
#THESE ARE THE ADDITIONAL POINTS TO BE ADDED TO THE PLOT
Inter <- c(-25,-45,20,30,23,67)
SE2 <- c(20,20,20,20,20,20)
df<-data.frame(Intercepts=randoms$Batch[,1],
sd.interc=2*sqrt(qq[,,1:length(qq)]), Intercepts2=Inter, sd.iterc2=SE2,
lev.names=rownames(rand.interc))
df$lev.names<-factor(df$lev.names,levels=df$lev.names[order(df$Intercepts)])
library(ggplot2)
p <- ggplot(df,aes(lev.names,Intercepts,shape=lev.names))
#Added horizontal line at y=0
#Includes first set of points/confidence intervals. This works without error
p <- p + geom_hline(yintercept=0) +geom_errorbar(aes(ymin=Intercepts-sd.interc, ymax=Intercepts+sd.interc), width=0,color="black") + geom_point(aes(size=2))
#Removed legends and with scale_shape_manual point shapes set to 1 and 16
p <- p + guides(size=FALSE,shape=FALSE) + scale_shape_manual(values=c(16,16,16,16,16,16))
#Changed appearance of plot (black and white theme) and x and y axis labels
p <- p + theme_bw() + xlab("Levels") + ylab("")
#Final adjustments of plot
p <- p + theme(axis.text.x=element_text(size=rel(1.2)),
axis.title.x=element_text(size=rel(1.3)),
axis.text.y=element_text(size=rel(1.2)),
panel.grid.minor=element_blank(),
panel.grid.major.x=element_blank())
#To put levels on y axis you just need to use coord_flip()
p <- p+ coord_flip()
print(p)
#####
# code for adding more plots, NOT working yet
p <- p +geom_errorbar(aes(ymin=Intercepts2-sd.interc2, ymax=Intercepts2+sd.interc2),
width=0,color="gray40", lty=1, size=1)
p <- p + geom_point(aes(Intercepts2, lev.names),size=0,pch=7)
First, in your data frame df and geom_errorbar() there are two different variables sd.iterc2 and sd.interc2. Changed also in df to sd.interc2.
For the last line of geom_point() you get the error because your x and y values are in the wrong order. As you are using coord_flip(), the x and y values should be placed in the same order as in the original plot before coord_flip(), that is, lev.names as x and Intercepts2 as y. I also changed size= to 5 for better illustration.
+ geom_point(aes(lev.names,Intercepts2),size=5,pch=7)
Update - adding legend
To add legend for the points of intercept types, one option is to reshape your data to long format and add new column with intercept types. Other option with your existing data is, first, remove shape=lev.names from ggplot() call. Then in both geom_point() calls add shape="somename" inside aes(). Then with scale_shape_manual() set shape values you need.
ggplot(df,aes(lev.names,Intercepts))+
geom_hline(yintercept=0) +
geom_errorbar(aes(ymin=Intercepts-sd.interc, ymax=Intercepts+sd.interc), width=0,color="black")+
geom_point(aes(shape="Intercepts"),size=5)+
theme_bw() + xlab("Levels") + ylab("")+
theme(axis.text.x=element_text(size=rel(1.2)),
axis.title.x=element_text(size=rel(1.3)),
axis.text.y=element_text(size=rel(1.2)),
panel.grid.minor=element_blank(),
panel.grid.major.x=element_blank())+
coord_flip()+
geom_errorbar(aes(ymin=Intercepts2-sd.interc2, ymax=Intercepts2+sd.interc2),
width=0,color="gray40", lty=1, size=1) +
geom_point(aes(lev.names,Intercepts2,shape="Intercepts2"),size=5)+
scale_shape_manual(values=c(16,7))

making binned scatter plots for two variables in ggplot2 in R

I have a dataframe with two columns x and y that each contain values between 0 and 100 (the data are paired). I want to correlate them to each other using binned scatter plots. If I were to use a regular scatter plot, it would be easy to do:
geom_point(aes(x=x, y=y))
but I'd like to instead bin the points into N bins from 0 to 100, get the average value of x in each bin and the average value of y for the points in that bin, and show that as a scatter plot - so correlate the binned averages instead of the raw data points.
is there a clever/quick way to do this in ggplot2, using some combination of geom_smooth() and geom_point? Or does it have to be pre-computed manually and then plotted?
Yes, you can use stat_summary_bin.
set.seed(42)
x <- runif(1e4)
y <- x^2 + x + 4 * rnorm(1e4)
df <- data.frame(x=x, y=y)
library(ggplot2)
(ggplot(df, aes(x=x,y=y)) +
geom_point(alpha = 0.4) +
stat_summary_bin(fun.y='mean', bins=20,
color='orange', size=2, geom='point'))
I suggest geom_bin2d.
DF <- data.frame(x=1:100,y=1:100+rnorm(100))
library(ggplot2)
p <- ggplot(DF,aes(x=x,y=y)) + geom_bin2d()
print(p)

Resources