Custom +/- 10% Band from geom_abline?

I'm trying to create a plot that compares measured to predicted values, where the reference line through the plot is y = 0 + x (intercept 0, slope 1). I would then like to shade the region within +/- 10% of that line. Is there a way to do this without adding a column to the data.frame?
Code
library(tidyverse)
# Fake Data - Not my real data
N <- 1e3
x <- runif(N, 200, 600)
a <- 0
b <- 1
y_true <- a + b * x
sigma <- 50
y <- rnorm(N, y_true, sigma)
d <- tibble(y, x)
ggplot(d, aes(x, y)) +
  geom_point() +
  geom_abline(slope = 1)
  # My attempt (commented out, not working):
  # geom_ribbon(aes(x = ? * .9, y = ? * 1.1))
Created on 2019-07-29 by the reprex package (v0.3.0)
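The excerpt doesn't include an answer, but one option (a sketch, not from the original thread) is to compute the ribbon limits inside aes(), so no extra column is needed: map ymin and ymax to 0.9 * x and 1.1 * x.
library(tidyverse)
# Sketch: the band is computed from the mapped x, so the data.frame stays untouched.
# Fill colour and alpha are arbitrary choices.
ggplot(d, aes(x, y)) +
  geom_ribbon(aes(ymin = 0.9 * x, ymax = 1.1 * x),
              fill = "grey70", alpha = 0.5) +
  geom_point() +
  geom_abline(slope = 1)
Because the limits are computed from x, the ribbon follows the y = x line and widens with x, which is what a relative +/- 10% band should do.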

Related

How to set up y lim and add annotation by group for linear regression in ggplot2

I am drawing a regression plot by group. The demo data and code are below. I have two groups, A and B, and for each group I would like to draw the regression line separately, so the line for group A is on page 1 of the PDF and the line for group B is on page 2. I also want to adjust the y limits by group: for group A, the y axis should run from mean(Y) - 10 to mean(Y) + 10 in steps of 1, with the label Y1; for group B, it should run from -1 to 1 in steps of 0.1, with the label Y2. Finally, I want to add the slope, the intercept, their standard errors, and the median of Y at X = 1 as an annotation at the bottom of each page, looking like:
slope: 2
se for slope: 1
intercept: 20
se for intercept: 1
baseline median: 1
How can I achieve this in ggplot? Right now my PDF file has two pages, but the y limits are not adjusted by group, and I cannot add the annotation by group.
library(ggplot2)
library(ggforce)  # for facet_wrap_paginate()

# generate dummy data
set.seed(2)
df1 <- data.frame(group = rep("A", 100),
                  x = rep(seq(1, 5, 1), 20))
df1$y <- 2 * df1$x + 20 + abs(rnorm(100))
df2 <- data.frame(group = rep("B", 100),
                  x = rep(seq(1, 5, 1), 20))
df2$y <- 0.001 * df2$x + 0.01 + abs(rnorm(100))
df <- as.data.frame(rbind(df1, df2))

# group-dependent breaks: wide steps for group A, fine steps for group B
my_breaks <- function(x) {
  if (mean(x) > 10) seq(mean(x) - 10, mean(x) + 10, 1) else seq(-1, 1, 0.1)
}

pdf("Regression_example.pdf")
for (i in 1:2) {
  print(
    ggplot(df, aes(x, y)) +
      geom_point() +
      facet_wrap_paginate(~ group, ncol = 1, nrow = 1, page = i) +
      geom_smooth(method = "lm") +
      scale_y_continuous(breaks = my_breaks)
  )
}
dev.off()
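The excerpt doesn't carry an answer for this one. A minimal sketch, assuming plain lm() fits per group, is to loop over the groups instead of paginating a single plot, so the breaks, the axis label, and the annotation can all be built from that group's own data. The rounding, caption layout, and per-group labels below are my own choices, not from the thread:
library(ggplot2)

pdf("Regression_example.pdf")
for (g in unique(df$group)) {
  d   <- df[df$group == g, ]
  fit <- lm(y ~ x, data = d)
  se  <- summary(fit)$coefficients[, "Std. Error"]
  lab <- paste0("slope: ",             round(coef(fit)["x"], 2),
                "\nse for slope: ",    round(se["x"], 2),
                "\nintercept: ",       round(coef(fit)["(Intercept)"], 2),
                "\nse for intercept: ", round(se["(Intercept)"], 2),
                "\nbaseline median: ", round(median(d$y[d$x == 1]), 2))
  # group-specific breaks; limits = range(brks) could be added to pin the axis range
  brks <- if (mean(d$y) > 10) seq(mean(d$y) - 10, mean(d$y) + 10, 1) else seq(-1, 1, 0.1)
  print(
    ggplot(d, aes(x, y)) +
      geom_point() +
      geom_smooth(method = "lm") +
      scale_y_continuous(breaks = brks) +
      labs(title = paste("Group", g),
           y = if (g == "A") "Y1" else "Y2",
           caption = lab)
  )
}
dev.off()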

draw vertical lines in ggplot with faceting

I have line plots of y vs x, where y is sigmoidal and varies from 0 to 1. I want to:
1. determine the value of x where y = 0.5 (or very close), by interpolation, and
2. draw a vertical line at that x in each facet.
library(tidyverse)
# continuous variables
x <- seq(-5, 5, 0.1)
# compute y1
error_term <- runif(1, min = -2, max = 2)
y1 <- 1/(1 + exp(-x + error_term))
# compute y2
error_term <- runif(1, min = -2, max = 2)
y2 <- 1/(1 + exp(-x + error_term))
# merge y
y <- c(y1, y2)
x <- c(x, x)
# categorical variable
a <- c(rep(0, 101), rep(1, 101))
tbl <- tibble(x, a, y)
# TASK
# 1. determine values of x at which y = 0.5 for all categories and store them in variable x0
# 2. Use x0 to draw vertical lines in plots at x where y is 0.5
# ggplot
ggplot(data = tbl, aes(x = x, y = y)) +
  geom_line() +
  theme_bw() +
  facet_grid(a ~ .)
This really isn't something built into ggplot, so you'll need to summarize the data yourself prior to plotting. You can write a helper function and then create the data you need for the lines:
# Interpolate y as a function of x and find where it crosses the target value
find_intersect <- function(x, y, target = 0.5) {
  optimize(function(z) (approxfun(x, y)(z) - target)^2, x)$minimum
}

line_data <- tbl %>%
  group_by(a) %>%
  summarize(xint = find_intersect(x, y))
Then plot with
ggplot(data = tbl, aes(x = x, y = y)) +
  geom_line() +
  theme_bw() +
  geom_vline(aes(xintercept = xint), data = line_data) +
  facet_grid(a ~ .)

Make ggplot with regression line and normal distribution overlay

I am trying to make a plot to show the intuition behind logistic (or probit) regression. How would I make a plot that looks something like this in ggplot?
(Wolf & Best, The Sage Handbook of Regression Analysis and Causal Inference, 2015, p. 155)
Actually, what I would rather even do is have one single normal distribution displayed along the y axis with mean = 0, and a specific variance, so that I can draw horizontal lines going from the linear predictor to the y axis and sideways normal distribution. Something like this:
What this is supposed to show (assuming I haven't misunderstood something) is . I haven't had much success so far...
library(ggplot2)

x <- seq(1, 11, 1)
y <- x * 0.5
x <- x - mean(x)
y <- y - mean(y)
df <- data.frame(x, y)

# Probability density function of the standard logistic distribution
pdfDeltaFun <- function(x) {
  prob <- exp(x) / (1 + exp(x))^2
  return(prob)
}

# Tried switching the x and y to be able to turn the
# distribution overlay 90 degrees with coord_flip()
ggplot(df, aes(x = y, y = x)) +
  geom_point() +
  geom_line() +
  stat_function(fun = pdfDeltaFun) +
  coord_flip()
I think this comes pretty close to the first illustration you give. If this is not something you need to repeat many times, it is probably best to compute the density curves prior to plotting and use a separate data frame to plot them.
library(ggplot2)

x <- seq(1, 11, 1)
y <- x * 0.5
x <- x - mean(x)
y <- y - mean(y)
df <- data.frame(x, y)

# For every row in `df`, compute a rotated normal density centered at `y`
# and shifted by `x`
curves <- lapply(seq_len(NROW(df)), function(i) {
  mu <- df$y[i]
  range <- mu + c(-3, 3)
  seq <- seq(range[1], range[2], length.out = 100)
  data.frame(
    x = -1 * dnorm(seq, mean = mu) + df$x[i],
    y = seq,
    grp = i
  )
})

# Combine the densities above into one data.frame
curves <- do.call(rbind, curves)

ggplot(df, aes(x, y)) +
  geom_point() +
  geom_line() +
  # The path draws the curve
  geom_path(data = curves, aes(group = grp)) +
  # The polygon does the shading. We can use `oob_squish()` to set a range.
  geom_polygon(data = curves, aes(y = scales::oob_squish(y, c(0, Inf)), group = grp))
The second illustration is pretty close to your code. I replaced your density function with the standard normal density and added some extra parameters to stat_function():
library(ggplot2)

x <- seq(1, 11, 1)
y <- x * 0.5
x <- x - mean(x)
y <- y - mean(y)
df <- data.frame(x, y)

ggplot(df, aes(x, y)) +
  geom_point() +
  geom_line() +
  stat_function(fun = dnorm,
                aes(x = after_stat(-y * 4 - 5), y = after_stat(x)),
                xlim = range(df$y)) +
  # We fill with a polygon, squishing the y-range
  stat_function(fun = dnorm, geom = "polygon",
                aes(x = after_stat(-y * 4 - 5),
                    y = after_stat(scales::oob_squish(x, c(-Inf, -1)))),
                xlim = range(df$y))

Considering the value of each coordinate in the heatmap

I'm trying to make a heatmap that takes into account the value at each point (variable 'x'). But when I run my code, I only get a heatmap of the point locations, not their values.
Here is my code:
head(dengue)
lat long x
1 7791000 598157.0 156
2 7790677 598520.0 307
3 7790795 598520.0 153
4 7790153 598808.0 135
5 7790935 598813.0 1888
6 7790765 598881.7 1169
library(ggplot2)
library(ggsn)
library(RColorBrewer)  # for brewer.pal()

hmap <- ggplot(dengue, aes(x = long, y = lat)) +
  stat_density2d(aes(fill = ..level..), alpha = 0.8, geom = "polygon") +
  geom_point(colour = "red") +
  geom_path(data = map.df, aes(x = long, y = lat, group = group), colour = "grey50") +
  scale_fill_gradientn(colours = rev(brewer.pal(5, "Spectral"))) +
  coord_fixed() +
  scalebar(location = "bottomright", y.min = 7781600.0, y.max = 7812898.0,
           x.min = 597998.4, x.max = 619721.2,
           dist = 2, transform = FALSE,
           st.dist = .04, dist_unit = "km") +
  blank() +
  guides(fill = guide_legend(title = ""))

north2(hmap, x = .7, y = .9, symbol = 16)
And here is the map that I got:
Any hint on how I can make a heatmap that takes the values of the points (variable 'x') into account, and not just their coordinates?
There was a post here that describes how to adapt the MASS package's kde2d function to take point weights into account.
library(MASS)

kde2d.weighted <- function(x, y, w, h, n = 25, lims = c(range(x), range(y))) {
  nx <- length(x)
  if (length(y) != nx)
    stop("data vectors must be the same length")
  gx <- seq(lims[1], lims[2], length = n)  # gridpoints x
  gy <- seq(lims[3], lims[4], length = n)  # gridpoints y
  if (missing(h))
    h <- c(bandwidth.nrd(x), bandwidth.nrd(y))
  if (missing(w))
    w <- numeric(nx) + 1
  h <- h / 4
  ax <- outer(gx, x, "-") / h[1]  # distance of each point to each grid point in x-direction
  ay <- outer(gy, y, "-") / h[2]  # distance of each point to each grid point in y-direction
  # z is the weighted density on the grid
  z <- (matrix(rep(w, n), nrow = n, ncol = nx, byrow = TRUE) *
          matrix(dnorm(ax), n, nx)) %*%
    t(matrix(dnorm(ay), n, nx)) / (sum(w) * h[1] * h[2])
  return(list(x = gx, y = gy, z = z))
}
This is not natively embedded in ggplot2 as far as I'm aware, but you could preprocess your data outside ggplot to get the data you can put into stat_contour:
# Reading in your example data
zz <- " lat long x
1 7791000 598157.0 156
2 7790677 598520.0 307
3 7790795 598520.0 153
4 7790153 598808.0 135
5 7790935 598813.0 1888
6 7790765 598881.7 1169"
df <- read.table(text = zz)

# Doing the weighted 2d kde
wdf <- kde2d.weighted(df$lat, df$long, df$x)
wdf <- data.frame(lat = wdf$x[row(wdf$z)],
                  long = wdf$y[col(wdf$z)],
                  value = wdf$z[T])

# Plotting the result:
ggplot(df, aes(lat, long)) +
  stat_contour(data = wdf, aes(z = value, fill = stat(level)), geom = "polygon") +
  geom_text(aes(label = x))  # to show the weights
As you can see, the contours are a bit cut off at ugly points, but I suppose this could be amended by playing around with the lims argument of the kde2d.weighted().
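For instance, a sketch with an arbitrary 50% padding of the evaluation window (the padding factor and grid size are my own assumptions, not from the original answer):
# Sketch: widen the kde evaluation window so the outermost contours can close
pad_lat  <- diff(range(df$lat))  * 0.5
pad_long <- diff(range(df$long)) * 0.5
wdf <- kde2d.weighted(df$lat, df$long, df$x, n = 50,
                      lims = c(min(df$lat) - pad_lat,  max(df$lat) + pad_lat,
                               min(df$long) - pad_long, max(df$long) + pad_long))
The long-format data frame and the stat_contour() call then stay the same as above.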

Fill negative value area below geom_line [duplicate]

I frequently use kernel density plots to illustrate distributions. These are easy and fast to create in R like so:
set.seed(1)
draws <- rnorm(100)^2
dens <- density(draws)
plot(dens)
#or in one line like this: plot(density(rnorm(100)^2))
Which gives me this nice little PDF:
I'd like to shade the area under the PDF from the 75th to 95th percentiles. It's easy to calculate the points using the quantile function:
q75 <- quantile(draws, .75)
q95 <- quantile(draws, .95)
But how do I shade the area between q75 and q95?
With the polygon() function, see its help page and I believe we had similar questions here too.
You need to find the index of the quantile values to get the actual (x,y) pairs.
Edit: Here you go:
x1 <- min(which(dens$x >= q75))
x2 <- max(which(dens$x < q95))
with(dens, polygon(x=c(x[c(x1,x1:x2,x2)]), y= c(0, y[x1:x2], 0), col="gray"))
Output (added by JDL)
Another solution:
dd <- with(dens, data.frame(x, y))
library(ggplot2)
qplot(x, y, data = dd, geom = "line") +
  geom_ribbon(data = subset(dd, x > q75 & x < q95),
              aes(ymax = y), ymin = 0,
              fill = "red", colour = NA, alpha = 0.5)
Result:
An expanded solution:
If you wanted to shade both tails (copy & paste of Dirk's code) and use known x values:
set.seed(1)
draws <- rnorm(100)^2
dens <- density(draws)
plot(dens)
q2 <- 2
q65 <- 6.5
qn08 <- -0.8
qn02 <- -0.2
x1 <- min(which(dens$x >= q2))
x2 <- max(which(dens$x < q65))
x3 <- min(which(dens$x >= qn08))
x4 <- max(which(dens$x < qn02))
with(dens, polygon(x=c(x[c(x1,x1:x2,x2)]), y= c(0, y[x1:x2], 0), col="gray"))
with(dens, polygon(x=c(x[c(x3,x3:x4,x4)]), y= c(0, y[x3:x4], 0), col="gray"))
Result:
This question needs a lattice answer. Here's a very basic one, simply adapting the method employed by Dirk and others:
library(lattice)

# Set up the data
set.seed(1)
draws <- rnorm(100)^2
dens <- density(draws)

# Put it in a simple data frame
d <- data.frame(x = dens$x, y = dens$y)

# Define a custom panel function;
# options like colour don't need to be hard coded
shadePanel <- function(x, y, shadeLims) {
  panel.lines(x, y)
  m1 <- min(which(x >= shadeLims[1]))
  m2 <- max(which(x <= shadeLims[2]))
  tmp <- data.frame(x1 = x[c(m1, m1:m2, m2)], y1 = c(0, y[m1:m2], 0))
  panel.polygon(tmp$x1, tmp$y1, col = "blue")
}

# Plot
xyplot(y ~ x, data = d, panel = shadePanel, shadeLims = c(1, 3))
Here's another ggplot2 variant based on a function that approximates the kernel density at the original data values:
approxdens <- function(x) {
  dens <- density(x)
  f <- with(dens, approxfun(x, y))
  f(x)
}
Using the original data (rather than producing a new data frame with the density estimate's x and y values) has the benefit of also working in faceted plots where the quantile values depend on the variable by which the data is being grouped:
Code used
library(tidyverse)
library(RColorBrewer)
# dummy data
set.seed(1)
n <- 1e2
dt <- tibble(value = rnorm(n)^2)
# function that approximates the density at the provided values
approxdens <- function(x) {
dens <- density(x)
f <- with(dens, approxfun(x, y))
f(x)
}
probs <- c(0.75, 0.95)
dt <- dt %>%
  mutate(dy = approxdens(value),                   # calculate density
         p = percent_rank(value),                  # percentile rank
         pcat = as.factor(cut(p, breaks = probs,   # percentile category based on probs
                              include.lowest = TRUE)))

ggplot(dt, aes(value, dy)) +
  geom_ribbon(aes(ymin = 0, ymax = dy, fill = pcat)) +
  geom_line() +
  scale_fill_brewer(guide = "none") +
  theme_bw()

# dummy data with 2 groups
dt2 <- tibble(category = c(rep("A", n), rep("B", n)),
              value = c(rnorm(n)^2, rnorm(n, mean = 2)))

dt2 <- dt2 %>%
  group_by(category) %>%
  mutate(dy = approxdens(value),
         p = percent_rank(value),
         pcat = as.factor(cut(p, breaks = probs,
                              include.lowest = TRUE)))

# faceted plot
ggplot(dt2, aes(value, dy)) +
  geom_ribbon(aes(ymin = 0, ymax = dy, fill = pcat)) +
  geom_line() +
  facet_wrap(~ category, nrow = 2, scales = "fixed") +
  scale_fill_brewer(guide = "none") +
  theme_bw()
Created on 2018-07-13 by the reprex package (v0.2.0).
