Quadratic/parabolic interpolation - r

I have this curve (COVID-19 cases per 100,000 inhabitants in California between 2020-09-01 and 2021-03-01):
It's clear that the dip at the end of December 2020 is an artifact of testing's having gone down during the winter holidays (the nadir occurs exactly on Christmas Day) rather than a true decline in cases.
What I would like to do is impute values via some sort of quadratic or parabolic interpolation to come up with plausible values for the real case rate (per 100k) between 2020-12-12 and 2021-01-13. How can I do this?
Here's the code I used to generate the plot:
# Daily dates from 2020-09-01 through 2021-03-01 (inclusive), one per day.
x <- seq.Date(as.Date("2020-09-01"), as.Date("2021-03-01"), by=1)
# Case rate per 100,000 inhabitants, in date order. These values are paired
# with x by position when the data frame is built, so the two vectors must
# have the same length.
y <- c(9.36,9.16,9.05,8.88,8.76,8.65,7.94,7.81,7.65,7.5,7.47,7.5,7.52,8.19,
8.03,8.1,8.12,8.14,8.19,8.24,8.21,8.19,8.19,8.22,8.24,8.2,8.16,8.14,
8.16,8.14,8.25,8.19,8.3,8.36,8.45,8.43,8.42,8.44,8.51,8.63,8.62,8.63,
8.66,8.69,8.73,8.81,8.79,8.9,9.15,9.46,9.67,9.78,10.07,10.19,10.32,10.48,
10.52,10.69,10.93,11.27,11.68,12.4,13.45,14.66,15.92,17.09,18.15,18.85,
19.04,19.98,20.93,21.69,22.89,24.28,25.52,26.78,29.08,31.29,33.62,35.34,
37.11,37.95,38.35,39.59,40.82,42.06,39.44,39.63,41.69,43.73,47.78,52.64,
57.16,65.24,70.15,72.29,73.01,76.01,78.53,81.46,84.64,87.58,89.86,90.79,
93.81,96.47,98.05,99.48,100.07,99.73,99.65,99.36,99.52,99.32,92.84,82.53,
84.34,86.33,89.6,92.99,96.15,99.42,101.56,102.45,102.72,103.63,102.26,101,
104.48,112.58,109.57,106.79,100.29,94.47,88.73,83.79,79.33,76.19,74.25,
67.69,63.86,60.59,57.27,54.07,51.8,50.69,49.49,46.01,42.54,39.79,37.28,
36.11,35.29,33.53,31.66,30.16,28.58,27.37,26.13,25.13,23.06,21.33,19.92,
18.65,17.51,16.71,16.13,14.63,13.89,13.03,12.27,11.56,11.1,10.79,10.63,
10.07,9.63,9.28,8.98,8.77,8.61,8.25)
# ggplot() comes from ggplot2, which has to be attached before the call.
library(ggplot2)

# Pair each date with its case rate.
df <- data.frame(x, y)

# x is already a Date vector, so it is mapped directly; wrapping it in
# as.Date(x, origin = ...) would return the same dates unchanged.
p <- ggplot(data = df) +
  geom_line(aes(x = x, y = y))
p
I'm not really sure where to begin, so I'd appreciate it if someone tossed me a bone, here. Thanks! :)

A colleague supplied this code:
#Quadratic Interpolation
library(magrittr)
library(dplyr)
library(ggplot2)
library(deSolve)
# Population figure used further down to turn the per-100,000 rate into a
# case count (rate * ca_pop / 1e5).
ca_pop <- 40129160L
# Daily dates from 2020-09-01 through 2021-03-01 (inclusive), one per day.
x <- seq.Date(as.Date("2020-09-01"), as.Date("2021-03-01"), by=1)
# Case rate per 100,000 inhabitants, in date order (paired with x by position).
y <- c(9.36,9.16,9.05,8.88,8.76,8.65,7.94,7.81,7.65,7.5,7.47,7.5,7.52,8.19,
8.03,8.1,8.12,8.14,8.19,8.24,8.21,8.19,8.19,8.22,8.24,8.2,8.16,8.14,
8.16,8.14,8.25,8.19,8.3,8.36,8.45,8.43,8.42,8.44,8.51,8.63,8.62,8.63,
8.66,8.69,8.73,8.81,8.79,8.9,9.15,9.46,9.67,9.78,10.07,10.19,10.32,10.48,
10.52,10.69,10.93,11.27,11.68,12.4,13.45,14.66,15.92,17.09,18.15,18.85,
19.04,19.98,20.93,21.69,22.89,24.28,25.52,26.78,29.08,31.29,33.62,35.34,
37.11,37.95,38.35,39.59,40.82,42.06,39.44,39.63,41.69,43.73,47.78,52.64,
57.16,65.24,70.15,72.29,73.01,76.01,78.53,81.46,84.64,87.58,89.86,90.79,
93.81,96.47,98.05,99.48,100.07,99.73,99.65,99.36,99.52,99.32,92.84,82.53,
84.34,86.33,89.6,92.99,96.15,99.42,101.56,102.45,102.72,103.63,102.26,101,
104.48,112.58,109.57,106.79,100.29,94.47,88.73,83.79,79.33,76.19,74.25,
67.69,63.86,60.59,57.27,54.07,51.8,50.69,49.49,46.01,42.54,39.79,37.28,
36.11,35.29,33.53,31.66,30.16,28.58,27.37,26.13,25.13,23.06,21.33,19.92,
18.65,17.51,16.71,16.13,14.63,13.89,13.03,12.27,11.56,11.1,10.79,10.63,
10.07,9.63,9.28,8.98,8.77,8.61,8.25)
# One row per day; day_num is the 1-based position of the day in the series.
df <- data.frame(x, y, day_num = seq_along(y))

# Days strictly between 2020-12-18 and 2021-01-08 are treated as unreliable.
# The bounds are converted to Date explicitly instead of relying on the
# implicit string-to-Date coercion of the comparison operators.
in_holiday_gap <- df$x > as.Date("2020-12-18") & df$x < as.Date("2021-01-08")
leave_out <- which(in_holiday_gap)

# Plot curve with missing points.
# A logical mask is used for the row selection: df[-leave_out, ] would return
# zero rows if leave_out were ever empty.
df[!in_holiday_gap, ] %>%
  filter(x > as.Date("2020-10-15")) %>%
  ggplot() +
  geom_point(aes(x = x, y = y), shape = 1, alpha = .7, size = .6) +
  theme_bw()
# Data used for fitting: every observation after 2020-10-15 (the holiday
# window is NOT removed here; the df[-leave_out, ] variant is left disabled),
# plus the reciprocal of the day number as an extra column.
df_fit <- mutate(
  filter(df, x > "2020-10-15"), # alternative input: df[-leave_out, ]
  transform_day = 1 / day_num
)

# Disabled exploratory code: plot of transform_day and a degree-2 polynomial
# regression of y on transform_day.
# ggplot(data = df_fit, aes(x = x, y = transform_day)) + geom_line()
# quad_model <- lm(y ~ (poly(transform_day, 2)), data = df_fit)
# y_fit <- predict(quad_model)
# model_fit <- data.frame(x = df_fit$x, y_fit)
# model_fit %>% filter(x > "2020-10-15") %>%
#   ggplot() +
#   geom_line(aes(x = x, y = y_fit)) +
#   geom_point(data = df_fit, aes(x = x, y = y)) + theme_bw()
# Rough location of the peak: take the observations whose rate is closest to
# a given level and average their positions in y.
# NOTE(review): this presumably picks one point on each flank of the curve, so
# that the midpoint approximates the peak day -- confirm against the data.
halfway_ind <- round(mean(order(abs(y - 30))[1:2]))
halfway_ind # 116

# Same idea at level 60, using the closest and the third-closest observation.
# NOTE(review): the second-closest is skipped, presumably because it lies on
# the same flank as the closest one -- confirm.
halfway_ind60 <- round(mean(order(abs(y - 60))[c(1, 3)]))
halfway_ind60 # 118

## Let's say 117 for the peak
# Parameters of the hand-tuned curve  height * width_sq / (d^2 + width_sq),
# where d is the number of days from the peak. The curve equals `peak_height`
# at d = 0 and half of it at d = sqrt(width_sq).
peak_day <- 117
peak_height <- 150
width_sq <- 375

df_fit$day_adj <- df_fit$day_num - peak_day
df_fit$model <- peak_height * width_sq / (df_fit$day_adj^2 + width_sq)

# Absolute case count implied by the per-100,000 rate.
df_fit$cases <- df_fit$y * ca_pop / 1e5

# Model curve (line) against the observed rates (points), centred on the peak.
df_fit %>%
  ggplot() +
  geom_line(aes(x = day_adj, y = model)) +
  geom_point(aes(x = day_adj, y = y)) +
  theme_bw()

Related

ggplot - use data passed to ggplot to calculate the mean of the data in subsequent geom calls [duplicate]

I was wondering why variable mean_y is not recognized by my
geom_hline(yintercept = unique(mean_y)) call?
library(tidyverse)
# Simulate a two-group data set whose response depends on the
# group-by-age interaction.
set.seed(20)
n_groups <- 2
n_in_group <- 20
sd_e <- 2

# Group labels: n_in_group "T" rows followed by n_in_group "C" rows.
groups <- gl(n_groups, n_in_group, labels = c("T", "C"))
age <- rnorm(n_groups * n_in_group, mean = 25, sd = 3)
dat <- data.frame(groups, age)

# Design matrix columns: intercept, group C, age, group C x age.
betas <- c(5, 0, 0, 2)
X <- model.matrix(~ groups * age, data = dat)
lin_pred <- c(X %*% betas)

# Response = linear predictor + Gaussian noise with standard deviation sd_e.
dat$y <- rnorm(nrow(dat), mean = lin_pred, sd = sd_e)
# Add each group's mean of y as a column (mean_y), drop the grouping, and plot
# y against age. This snippet is the failing example the question is about.
dat %>% group_by(groups) %>% mutate(mean_y = mean(y)) %>%
ungroup() %>%
ggplot()+aes(x = age, y = y) +
geom_point(aes(color=groups)) +
# mean_y exists only as a column of the piped data. Here it is referenced
# outside aes(), and the call fails with the error quoted below.
geom_hline(yintercept = unique(mean_y)) # Error in unique(mean_y) :
# object 'mean_y' not found
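Columns of the plot data can only be referenced inside aes(); outside of it, R looks for an ordinary variable named mean_y and finds none. Try: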
# Layer to use in place of the failing geom_hline() call: inside aes(),
# mean_y is looked up as a column of the plot data.
geom_hline(aes(yintercept = mean_y))

data column not recognized in the ggplot geom_hline

I was wondering why variable mean_y is not recognized by my
geom_hline(yintercept = unique(mean_y)) call?
library(tidyverse)
# Simulate a two-group data set whose response depends on the
# group-by-age interaction.
set.seed(20)
n_groups <- 2
n_in_group <- 20
sd_e <- 2

# Group labels: n_in_group "T" rows followed by n_in_group "C" rows.
groups <- gl(n_groups, n_in_group, labels = c("T", "C"))
age <- rnorm(n_groups * n_in_group, mean = 25, sd = 3)
dat <- data.frame(groups, age)

# Design matrix columns: intercept, group C, age, group C x age.
betas <- c(5, 0, 0, 2)
X <- model.matrix(~ groups * age, data = dat)
lin_pred <- c(X %*% betas)

# Response = linear predictor + Gaussian noise with standard deviation sd_e.
dat$y <- rnorm(nrow(dat), mean = lin_pred, sd = sd_e)
# Add each group's mean of y as a column (mean_y), drop the grouping, and plot
# y against age. This snippet is the failing example the question is about.
dat %>% group_by(groups) %>% mutate(mean_y = mean(y)) %>%
ungroup() %>%
ggplot()+aes(x = age, y = y) +
geom_point(aes(color=groups)) +
# mean_y exists only as a column of the piped data. Here it is referenced
# outside aes(), and the call fails with the error quoted below.
geom_hline(yintercept = unique(mean_y)) # Error in unique(mean_y) :
# object 'mean_y' not found
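Columns of the plot data can only be referenced inside aes(); outside of it, R looks for an ordinary variable named mean_y and finds none. Try: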
# Layer to use in place of the failing geom_hline() call: inside aes(),
# mean_y is looked up as a column of the plot data.
geom_hline(aes(yintercept = mean_y))

ggplot2: How to get geom_text() to play nice with facet_grid()?

So I'm trying to plot a couple of curves using ggplot(), and I would like to have each curve sitting in its own plot in a facet_grid. All of this works fine.
The problem is that I'd also like to annotate the curve with the x value corresponding to the peak y value. I tried using geom_text(), and I tried implementing it as shown below, but it doesn't seem to quite work. It's clearly printing something onto the plot, but not the way I hoped it would; i.e., each plot has its corresponding x value printed on it at the location (x, max(y)).
I suspect I've not implemented the ifelse() correctly, but I'm not experienced enough with R to figure out what exactly the problem is.
Any suggestions on where I'm going wrong?
Output:
Data + code:
library('ggplot2')
# Two normal densities evaluated on the same 1000-point grid over [5, 15]:
# curve "z" has sd = 1, curve "z1" has sd = 2.
x1 <- seq(5, 15, length.out = 1000)
y1 <- dnorm(x1, mean = 10, sd = 2)
z1 <- rep("z1", times = 1000)

# Stack both curves in long format; z identifies the curve (facet).
x <- c(x1, x1)
y <- c(dnorm(x1, mean = 10, sd = 1), y1)
z <- c(rep("z", times = 1000), z1)
df <- data.frame(x, y, z)
# Attempt from the question: one facet per z, with a text label at every row.
# The label is x where y == max(y) and an empty string elsewhere.
# NOTE(review): max(y) is presumably taken over the whole of df rather than
# per facet, so only the global maximum gets a non-empty label -- confirm.
ggplot(data = df, aes(x, y)) + geom_line() + facet_grid(.~z) + geom_text(data = df, aes(x, y, label = ifelse(y == max(y), as.numeric(x), '')), inherit.aes = FALSE, hjust = 0, vjust = 0)
Edit: the output I'm expecting is something like this:
You need to fix two things.
(1) calculate max per z
(2) avoid duplicate y_values
The following code should fix both:
library(dplyr)

# Add a label column that is empty everywhere except on ONE row per z: the
# first row where y reaches its maximum within that z.
#  - which.max() returns a single position, so tied maxima cannot produce
#    duplicate labels and no rows have to be dropped from the plotted line.
#  - Both branches of ifelse() are character, so the column type is explicit.
df2 <- df %>%
  group_by(z) %>%
  mutate(y_label = ifelse(row_number() == which.max(y), as.character(x), "")) %>%
  ungroup()

# Print the result for inspection.
as.data.frame(df2)

ggplot(data = df2, aes(x, y)) +
  geom_line() +
  facet_grid(. ~ z) +
  geom_text(aes(label = y_label), hjust = 0, vjust = 0)
You need to provide geom_text a data.frame with data for z and z1.
x y z
z 9.994995 0.3989373 z
z1 9.994995 0.1994705 z1
How to get that? Well, here's one way.
# One data frame per value of z.
df.split <- split(df, f = df$z)

# For every z keep the single row where y is largest, then stack those rows.
# The row names of the result are the names of df.split (the z values).
# lapply() always returns a list, unlike sapply(), whose return type depends
# on its input.
df.max <- do.call(
  rbind,
  lapply(df.split, function(part) part[which.max(part$y), ])
)
which you can then plot
# Curves per facet, with one label per facet placed at the peak. The label
# shows the x value at the peak (what the question asks for), rounded to two
# decimals, rather than the peak height.
ggplot(data = df, aes(x, y)) +
  geom_line() +
  geom_text(data = df.max, aes(x = x, y = y, label = round(x, 2))) +
  facet_grid(. ~ z)
Get the label coordinates for each z (an x position and the maximum y):
# Label coordinates per z: the maximum y, and the x at which that maximum is
# reached. (The mean of x only coincides with the peak location when the
# curve is symmetric around it.)
Ys <- df %>% group_by(z) %>% summarise(maxY = max(y))
Xs <- df %>% group_by(z) %>% summarise(peakX = x[which.max(y)])

# Plot with the geom_text; the join key is stated explicitly.
ggplot(data = df, aes(x, y)) +
  geom_line() +
  geom_text(data = left_join(Xs, Ys, by = "z"),
            aes(peakX, maxY, label = peakX)) +
  facet_grid(. ~ z)
Or more succinctly
# Same plot with the label data computed inline: one row per z holding the
# maximum y and the x at which it is reached.
ggplot(data = df, aes(x, y)) +
  geom_line() +
  geom_text(
    data = df %>%
      group_by(z) %>%
      summarise(maxY = max(y), peakX = x[which.max(y)]),
    aes(peakX, maxY, label = peakX)
  ) +
  facet_grid(. ~ z)

Different colours for values above / below a linear trend line

I'm using ggplot to plot a time series with a linear regression line. I would like to have different colours for my time series depending on whether it is above or below the trend line.
Here is a code example to plot the series and the corresponding trend line with different colours for the series and the line:
# Yearly dates from 2000-01-01 to 2010-01-01 with random noise as the series
# (no seed is set, so the values differ between runs).
x <- seq(as.Date("2000/1/1"), as.Date("2010/1/1"), by = "years")
y <- rnorm(length(x), mean = 0, sd = 10)
df <- data.frame(x, y)

# Linear trend (drawn first) and the observed series, each with its own
# legend entry and manually assigned colour.
ggplot(df, aes(x, y)) +
  stat_smooth(aes(colour = "Trend"), method = "lm", se = FALSE) +
  geom_line(aes(colour = "Observation")) +
  theme_bw() +
  labs(x = "x", y = "y") +
  scale_colour_manual(values = c("blue", "red"))
Have a nice day!
I got rid of the dates, since they were driving me nuts. Perhaps someone can add a solution for that. Otherwise it seems quite doable, with some basic high school maths.
# Simulated yearly series (no seed is set, so every run draws new values).
df <- data.frame(
  x = 2000:2010,
  y = rnorm(11, 0, 10)
)

# Linear trend; co = c(intercept, slope) is reused when drawing the line.
fm <- lm(y ~ x, data = df)
co <- coef(fm)

# +1 where an observation lies above the trend line, -1 where it lies below.
res <- unname(residuals(fm))
df$under_over <- sign(res)

# A segment between two consecutive observations crosses the trend line
# exactly when the residuals at its two ends have opposite signs.
first <- seq_len(nrow(df) - 1L)
crossing <- first[res[first] * res[first + 1L] < 0]

if (length(crossing) > 0) {
  # The residual changes linearly along a segment, so its zero follows from
  # linear interpolation between the two end points. The denominator cannot
  # be zero because the two residuals have opposite signs.
  frac <- res[crossing] / (res[crossing] - res[crossing + 1L])
  cross_x <- df$x[crossing] + frac * (df$x[crossing + 1L] - df$x[crossing])

  # All crossing points are appended in one step instead of growing df
  # inside a loop.
  df <- rbind(
    df,
    data.frame(
      x = cross_x,
      y = co[[1]] + co[[2]] * cross_x,
      # A crossing point starts the piece of path that ends at the next
      # observation, so it takes that observation's side.
      under_over = df$under_over[crossing + 1L]
    )
  )
}

# order by x
df <- df[order(df$x), ]
# Trend line in the default colour; the series is drawn as ONE path
# (group = 1) whose colour switches with the under_over flag.
ggplot(data = df) +
  geom_abline(slope = co[2], intercept = co[1]) +
  geom_path(
    mapping = aes(x = x, y = y, colour = as.factor(under_over), group = 1)
  ) +
  theme_bw()

Adding custom image to geom_polygon fill in ggplot

I was asked by a student if it was possible to recreate a plot similar to the one below using R:
This is from this paper....
This sort of stuff isn't my specialty, but using the following code I was able to create 95% CI ellipses and to plot them with geom_polygon(). I filled the images with images I grabbed from the phylopic library using the rphylopic package.
#example data/ellipses
# Three simulated point clouds (groups A, B, C), n points each. The random
# draws happen in the order x1, y1, x2, y2, x3, y3.
set.seed(101)
n <- 1000

x1 <- rnorm(n, mean = 2)
y1 <- 1.75 + 0.4 * x1 + rnorm(n)

x2 <- rnorm(n, mean = 8)
y2 <- 0.7 * x2 + 2 + rnorm(n)

x3 <- rnorm(n, mean = 6)
y3 <- x3 - 5 - rnorm(n)

# Long format: one row per point, labelled with its group.
df <- rbind(
  data.frame(x = x1, y = y1, group = "A"),
  data.frame(x = x2, y = y2, group = "B"),
  data.frame(x = x3, y = y3, group = "C")
)
#calculating ellipses
library(ellipse)

# Outline of one ellipse per group, stacked into a single data frame with
# columns x, y, group.
# The groups are taken from the values of the column, not from levels():
# data.frame() keeps strings as character since R 4.0, and levels() of a
# character vector is NULL, which would leave df_ell empty.
group_ids <- unique(as.character(df$group))
df_ell <- do.call(
  rbind,
  lapply(group_ids, function(g) {
    pts <- df[df$group == g, ]
    outline <- ellipse(
      cor(pts$x, pts$y),
      scale = c(sd(pts$x), sd(pts$y)),
      centre = c(mean(pts$x), mean(pts$y))
    )
    cbind(as.data.frame(outline), group = g)
  })
)
#drawing
library(ggplot2)

# Base plot: only the ellipse outlines are drawn, as lightly filled polygons
# coloured by group. The point layer is left disabled.
p <- ggplot(data = df, mapping = aes(x = x, y = y, colour = group)) +
  # geom_point(size = 1.5, alpha = .6) +
  geom_polygon(
    data = df_ell,
    mapping = aes(x = x, y = y, colour = group, fill = group),
    alpha = 0.1,
    size = 1,
    linetype = 1
  )
### get center points of ellipses
library(dplyr)
# Centre of each ellipse, taken as the mean of its outline points.
# Result columns, in order: group, x, y (one row per group).
ell_center <- df_ell %>% group_by(group) %>% summarise(x=mean(x), y=mean(y))
### animal images
# NOTE(review): get_image()/add_phylopic() are used as shown in the post; the
# rphylopic interface may differ between package versions -- confirm against
# the installed version.
library(rphylopic)
# One silhouette per group, requested by identifier; [[1]] keeps the first
# element of what get_image() returns.
lion <- get_image("e2015ba3-4f7e-4950-9bde-005e8678d77b", size = "512")[[1]]
mouse <- get_image("6b2b98f6-f879-445f-9ac2-2c2563157025", size="512")[[1]]
bug <- get_image("136edfe2-2731-4acd-9a05-907262dd1311", size="512")[[1]]
### overlay images on center points
# ell_center[[i, 2]] / ell_center[[i, 3]] are the x / y centre of the i-th
# group (column 1 holds the group label).
p + add_phylopic(lion, alpha=0.9, x=ell_center[[1,2]], y=ell_center[[1,3]], ysize=2, color="firebrick1") +
add_phylopic(mouse, alpha=1, x=ell_center[[2,2]], y=ell_center[[2,3]], ysize=2, color="darkgreen") +
add_phylopic(bug, alpha=0.9, x=ell_center[[3,2]], y=ell_center[[3,3]], ysize=2, color="mediumblue") +
theme_bw()
Which gives the following:
This is ok, but what I'd really like to do is to add an image directly to the 'fill' command of geom_polygon. Is this possible ?
We can not set pattern fill for ggplot, but we can make a quite simple workaround with the help of geom_tile. Reproducing your initial data:
#example data/ellipses
# Three simulated point clouds (groups A, B, C), n points each. The random
# draws happen in the order x1, y1, x2, y2, x3, y3.
set.seed(101)
n <- 1000

x1 <- rnorm(n, mean = 2)
y1 <- 1.75 + 0.4 * x1 + rnorm(n)

x2 <- rnorm(n, mean = 8)
y2 <- 0.7 * x2 + 2 + rnorm(n)

x3 <- rnorm(n, mean = 6)
y3 <- x3 - 5 - rnorm(n)

# Long format: one row per point, labelled with its group.
df <- rbind(
  data.frame(x = x1, y = y1, group = "A"),
  data.frame(x = x2, y = y2, group = "B"),
  data.frame(x = x3, y = y3, group = "C")
)
#calculating ellipses
library(ellipse)

# Outline of one ellipse per group, stacked into a single data frame with
# columns x, y, group.
# The groups are taken from the values of the column, not from levels():
# data.frame() keeps strings as character since R 4.0, and levels() of a
# character vector is NULL, which would leave df_ell empty.
group_ids <- unique(as.character(df$group))
df_ell <- do.call(
  rbind,
  lapply(group_ids, function(g) {
    pts <- df[df$group == g, ]
    outline <- ellipse(
      cor(pts$x, pts$y),
      scale = c(sd(pts$x), sd(pts$y)),
      centre = c(mean(pts$x), mean(pts$y))
    )
    cbind(as.data.frame(outline), group = g)
  })
)
The key feature I want to show is converting a raster image into data.frame with columns X, Y, color so we can later plot it with geom_tile
require("dplyr")
require("tidyr")
require("ggplot2")
require("png")
# getting sample pictures
# Fetch three sample PNG files into the working directory. mode = "wb" writes
# them as binary so the image data is not altered on download.
download.file("http://content.mycutegraphics.com/graphics/alligator/alligator-reading-a-book.png", "alligator.png", mode = "wb")
download.file("http://content.mycutegraphics.com/graphics/animal/elephant-and-bird.png", "elephant.png", mode = "wb")
download.file("http://content.mycutegraphics.com/graphics/turtle/girl-turtle.png", "turtle.png", mode = "wb")
# Read each file into memory; the results are passed to ggplot_rasterdf()
# below, which indexes them as [row, column, channel].
pic_allig <- readPNG("alligator.png")
pic_eleph <- readPNG("elephant.png")
pic_turtl <- readPNG("turtle.png")
# converting raster image to plottable data.frame
# Convert a raster image into a long data frame that geom_tile() can draw.
#
# Arguments:
#   color_matrix  numeric array indexed [row, column, channel] with values in
#                 [0, 1]. With 3 or more channels, channels 1-3 are red, green
#                 and blue, and channel 4 (if present) is alpha. A plain
#                 matrix, or an array with 1 or 2 channels, is treated as
#                 grey (plus alpha for 2 channels).
#   bottom, top   y range the image rows are mapped onto.
#   left, right   x range the image columns are mapped onto.
#
# Returns a data.frame with one row per pixel and columns
#   Y      vertical position; the first image row gets the largest value,
#   X      horizontal position,
#   color  "#RRGGBBAA" colour string,
# ordered column by column, and top to bottom within each column.
ggplot_rasterdf <- function(color_matrix, bottom = 0, top = 1, left = 0, right = 1) {
  dims <- dim(color_matrix)
  if (length(dims) == 2L) {
    # Plain matrix (e.g. a greyscale image): view it as a one-channel array.
    color_matrix <- array(color_matrix, dim = c(dims, 1L))
    dims <- dim(color_matrix)
  }
  stopifnot(length(dims) == 3L, dims[3] >= 1L)

  n_row <- dims[1]
  n_col <- dims[2]
  n_chan <- dims[3]

  # Channel k as a plain vector in column-major (column by column) order.
  channel <- function(k) as.vector(color_matrix[, , k])

  if (n_chan >= 3L) {
    red <- channel(1L)
    green <- channel(2L)
    blue <- channel(3L)
  } else {
    red <- green <- blue <- channel(1L)
  }

  if (n_chan == 2L) {
    alpha <- channel(2L)
  } else if (n_chan >= 4L) {
    alpha <- channel(4L)
  } else {
    alpha <- rep(1, n_row * n_col) # fully opaque
  }

  # rgb() is vectorised, so every pixel is converted in a single call.
  colors <- rgb(red, green, blue, alpha)

  # Pixel (i, j) is placed at column j and at height n_row - i + 1, so the
  # image is not drawn upside down; both are rescaled to the requested box.
  row_pos <- rep(rev(seq_len(n_row)), times = n_col)
  col_pos <- rep(seq_len(n_col), each = n_row)

  data.frame(
    Y = bottom + row_pos * (top - bottom) / n_row,
    X = left + col_pos * (right - left) / n_col,
    color = colors,
    stringsAsFactors = FALSE
  )
}
Converting images:
# preparing image data
# For every group, stretch its picture over the bounding box of the group's
# ellipse: x/y below are the outline coordinates of that one ellipse.
pic_allig_dat <- with(
  df_ell[df_ell$group == "A", ],
  ggplot_rasterdf(pic_allig,
                  left = min(x), right = max(x),
                  bottom = min(y), top = max(y))
)
pic_eleph_dat <- with(
  df_ell[df_ell$group == "B", ],
  ggplot_rasterdf(pic_eleph,
                  left = min(x), right = max(x),
                  bottom = min(y), top = max(y))
)
pic_turtl_dat <- with(
  df_ell[df_ell$group == "C", ],
  ggplot_rasterdf(pic_turtl,
                  left = min(x), right = max(x),
                  bottom = min(y), top = max(y))
)
As far as I understand, the author wants to plot the images only inside the ellipses, not in their original rectangular shape. We can achieve this with the point.in.polygon function from the sp package.
# filter image-data.frames keeping only rows inside ellipses
require("sp")

# Keep only the pixels of each picture for which point.in.polygon() returns a
# non-zero code with respect to the outline of the matching ellipse.
gr_A_df <- pic_allig_dat[
  as.logical(with(
    df_ell[df_ell$group == "A", ],
    point.in.polygon(pic_allig_dat$X, pic_allig_dat$Y, x, y)
  )),
]
gr_B_df <- pic_eleph_dat[
  as.logical(with(
    df_ell[df_ell$group == "B", ],
    point.in.polygon(pic_eleph_dat$X, pic_eleph_dat$Y, x, y)
  )),
]
gr_C_df <- pic_turtl_dat[
  as.logical(with(
    df_ell[df_ell$group == "C", ],
    point.in.polygon(pic_turtl_dat$X, pic_turtl_dat$Y, x, y)
  )),
]
And finally...
#drawing
# Ellipse outlines first ...
p <- ggplot(data = df) +
  geom_polygon(
    data = df_ell,
    mapping = aes(x = x, y = y, colour = group, fill = group),
    alpha = 0.1,
    size = 1,
    linetype = 1
  )

# ... then one tile layer per clipped picture. The fill colours are passed as
# fixed values (outside aes), so every tile is drawn in its own pixel colour.
p +
  geom_tile(data = gr_A_df, mapping = aes(x = X, y = Y), fill = gr_A_df$color) +
  geom_tile(data = gr_B_df, mapping = aes(x = X, y = Y), fill = gr_B_df$color) +
  geom_tile(data = gr_C_df, mapping = aes(x = X, y = Y), fill = gr_C_df$color) +
  theme_bw()
We can easily resize the plot without making changes to the code.
And, of course, you should keep in mind performance capabilities of your machine, and, probably, not choose 20MP pictures for plotting inside your ggplot =)
A quick and ugly solution without ggplot is to use rasterImage() together with the jpeg package (or png, depending on the format of your images):
# Three simulated point clouds, labelled "1", "2" and "3"; each label is also
# the base name of the image file used for that group ("1.jpg", ...).
set.seed(101)
n <- 1000
x1 <- rnorm(n, mean = 2)
y1 <- 1.75 + 0.4 * x1 + rnorm(n)
df <- data.frame(x = x1, y = y1, group = "1")
x2 <- rnorm(n, mean = 8)
y2 <- 0.7 * x2 + 2 + rnorm(n)
df <- rbind(df, data.frame(x = x2, y = y2, group = "2"))
x3 <- rnorm(n, mean = 6)
y3 <- x3 - 5 - rnorm(n)
df <- rbind(df, data.frame(x = x3, y = y3, group = "3"))

# Empty plot that only sets up the axes for the full data range.
plot(df$x, df$y, type = "n")

for (g in unique(df$group)) {
  # readJPEG() belongs to the jpeg package; the call is namespace-qualified
  # so it works without attaching the package first.
  img <- jpeg::readJPEG(paste0(g, ".jpg"), native = FALSE)

  in_group <- df$group == g
  gx <- df$x[in_group]
  gy <- df$y[in_group]

  # Draw the image centred on the group mean, extending two standard
  # deviations in every direction.
  rasterImage(
    img,
    mean(gx) - 2 * sd(gx), mean(gy) - 2 * sd(gy),
    mean(gx) + 2 * sd(gx), mean(gy) + 2 * sd(gy)
  )
}
(the images are "random"images found on wikimedia, renamed for the occasion)
Here I simply centered the image on the mean of each group (as in the article) and make their size proportional to the standard deviation. It won't be difficult to make it fit the 95% confidence interval used in the article.
It's not exactly the result you asked for, but it's quite easy to do (although I would rather go for a GIMP solution if you really want to fit your image to the ellipse, as suggested by @Mike).
# example data/ellipses
set.seed(101)
n <- 1000
x1 <- rnorm(n, mean = 2)
y1 <- 1.75 + 0.4 * x1 + rnorm(n)
df <- data.frame(x = x1, y = y1, group = "A")
x2 <- rnorm(n, mean = 8)
y2 <- 0.7 * x2 + 2 + rnorm(n)
df <- rbind(df, data.frame(x = x2, y = y2, group = "B"))
x3 <- rnorm(n, mean = 6)
y3 <- x3 - 5 - rnorm(n)
df <- rbind(df, data.frame(x = x3, y = y3, group = "C"))

# calculating ellipses
library(ellipse)
# The groups are taken from the values of the column, not from levels():
# levels() of a character column is NULL, which would leave df_ell empty.
df_ell <- data.frame()
for (g in unique(as.character(df$group))) {
  pts <- df[df$group == g, ]
  outline <- ellipse(
    cor(pts$x, pts$y),
    scale = c(sd(pts$x), sd(pts$y)),
    centre = c(mean(pts$x), mean(pts$y))
  )
  df_ell <- rbind(df_ell, cbind(as.data.frame(outline), group = g))
}

# drawing
library(ggplot2)
p <- ggplot(data = df, aes(x = x, y = y, colour = group)) +
  # geom_point(size = 1.5, alpha = .6) +
  geom_polygon(
    data = df_ell, aes(x = x, y = y, colour = group, fill = group),
    alpha = 0.1, size = 1, linetype = 1
  )

Resources