I have this curve (COVID-19 cases per 100,000 inhabitants in California between 2020-09-01 and 2021-03-01):
It's clear that the dip at the end of December 2020 is an artifact of testing's having gone down during the winter holidays (the nadir occurs exactly on Christmas Day) rather than a true decline in cases.
What I would like to do is impute values via some sort of quadratic or parabolic interpolation to come up with plausible values for the real case rate (per 100k) between 2020-12-12 and 2021-01-13. How can I do this?
Here's the code I used to generate the plot:
x <- seq.Date(as.Date("2020-09-01"), as.Date("2021-03-01"), by=1)
y <- c(9.36,9.16,9.05,8.88,8.76,8.65,7.94,7.81,7.65,7.5,7.47,7.5,7.52,8.19,
8.03,8.1,8.12,8.14,8.19,8.24,8.21,8.19,8.19,8.22,8.24,8.2,8.16,8.14,
8.16,8.14,8.25,8.19,8.3,8.36,8.45,8.43,8.42,8.44,8.51,8.63,8.62,8.63,
8.66,8.69,8.73,8.81,8.79,8.9,9.15,9.46,9.67,9.78,10.07,10.19,10.32,10.48,
10.52,10.69,10.93,11.27,11.68,12.4,13.45,14.66,15.92,17.09,18.15,18.85,
19.04,19.98,20.93,21.69,22.89,24.28,25.52,26.78,29.08,31.29,33.62,35.34,
37.11,37.95,38.35,39.59,40.82,42.06,39.44,39.63,41.69,43.73,47.78,52.64,
57.16,65.24,70.15,72.29,73.01,76.01,78.53,81.46,84.64,87.58,89.86,90.79,
93.81,96.47,98.05,99.48,100.07,99.73,99.65,99.36,99.52,99.32,92.84,82.53,
84.34,86.33,89.6,92.99,96.15,99.42,101.56,102.45,102.72,103.63,102.26,101,
104.48,112.58,109.57,106.79,100.29,94.47,88.73,83.79,79.33,76.19,74.25,
67.69,63.86,60.59,57.27,54.07,51.8,50.69,49.49,46.01,42.54,39.79,37.28,
36.11,35.29,33.53,31.66,30.16,28.58,27.37,26.13,25.13,23.06,21.33,19.92,
18.65,17.51,16.71,16.13,14.63,13.89,13.03,12.27,11.56,11.1,10.79,10.63,
10.07,9.63,9.28,8.98,8.77,8.61,8.25)
df <- data.frame(x,y)
p <- ggplot(data=df) +
geom_line(aes(x=as.Date(x, origin=as.Date("1970-01-01")),y=y))
p
I'm not really sure where to begin, so I'd appreciate it if someone tossed me a bone, here. Thanks! :)
A colleague supplied this code:
#Quadratic Interpolation
library(magrittr)
library(dplyr)
library(ggplot2)
library(deSolve)
ca_pop <- 40129160L
x <- seq.Date(as.Date("2020-09-01"), as.Date("2021-03-01"), by=1)
y <- c(9.36,9.16,9.05,8.88,8.76,8.65,7.94,7.81,7.65,7.5,7.47,7.5,7.52,8.19,
8.03,8.1,8.12,8.14,8.19,8.24,8.21,8.19,8.19,8.22,8.24,8.2,8.16,8.14,
8.16,8.14,8.25,8.19,8.3,8.36,8.45,8.43,8.42,8.44,8.51,8.63,8.62,8.63,
8.66,8.69,8.73,8.81,8.79,8.9,9.15,9.46,9.67,9.78,10.07,10.19,10.32,10.48,
10.52,10.69,10.93,11.27,11.68,12.4,13.45,14.66,15.92,17.09,18.15,18.85,
19.04,19.98,20.93,21.69,22.89,24.28,25.52,26.78,29.08,31.29,33.62,35.34,
37.11,37.95,38.35,39.59,40.82,42.06,39.44,39.63,41.69,43.73,47.78,52.64,
57.16,65.24,70.15,72.29,73.01,76.01,78.53,81.46,84.64,87.58,89.86,90.79,
93.81,96.47,98.05,99.48,100.07,99.73,99.65,99.36,99.52,99.32,92.84,82.53,
84.34,86.33,89.6,92.99,96.15,99.42,101.56,102.45,102.72,103.63,102.26,101,
104.48,112.58,109.57,106.79,100.29,94.47,88.73,83.79,79.33,76.19,74.25,
67.69,63.86,60.59,57.27,54.07,51.8,50.69,49.49,46.01,42.54,39.79,37.28,
36.11,35.29,33.53,31.66,30.16,28.58,27.37,26.13,25.13,23.06,21.33,19.92,
18.65,17.51,16.71,16.13,14.63,13.89,13.03,12.27,11.56,11.1,10.79,10.63,
10.07,9.63,9.28,8.98,8.77,8.61,8.25)
df <- data.frame(x,y, day_num = 1:length(y))
leave_out <- which(df$x > "2020-12-18" & df$x < "2021-01-08")
#Plot curve with missing points
df[-leave_out,] %>% filter(x > "2020-10-15") %>%
ggplot() +
geom_point(aes(x=x,y=y), shape=1, alpha=.7, size=.6) +
theme_bw()
df_fit <- df %>%#[-leave_out,] %>%
filter(x > "2020-10-15") %>%
mutate(transform_day = 1 / day_num)
#Plot df_fit
#ggplot(data=df_fit, aes(x=x, y=transform_day)) + geom_line()
#quad_model <- lm(y ~ (poly(transform_day, 2)), data = df_fit)
#y_fit <- predict(quad_model)
#model_fit <- data.frame(x = df_fit$x,y_fit)
#model_fit %>% filter(x > "2020-10-15") %>%
# ggplot() +
# geom_line(aes(x = x, y = y_fit)) +
# geom_point(data = df_fit, aes(x = x, y =y)) + theme_bw()
halfway_ind <- round(mean(order(abs(y - 30))[1:2]))
halfway_ind #116
halfway_ind60 <- round(mean(order(abs(y - 60))[c(1,3)]))
halfway_ind60 #118
##Let's say 117 for the peak
df_fit$day_adj <- df_fit$day_num - 117
df_fit$model <- 150*375/ (df_fit$day_adj^2 + 375)
df_fit$cases <- df_fit$y * ca_pop / 1e5
df_fit %>%
ggplot() +
geom_line(aes(x = day_adj, y = model)) +
geom_point(aes(x = day_adj, y =y)) + theme_bw()
So I'm trying to plot a couple of curves using ggplot(), and I would like to have each curve sitting in its own plot in a facet_grid. All of this works fine.
The problem is that I'd also like to annotate the curve with the x value corresponding to the peak y value. I tried using geom_text(), and I tried implementing it as shown below, but it doesn't seem to quite work. It's clearly printing something onto the plot, but not the way I hoped it would; i.e., each plot has its corresponding x value printed on it at the location (x, max(y)).
I suspect I've not implemented the ifelse() correctly, but I'm not experienced enough with R to figure out what exactly the problem is.
Any suggestions on where I'm going wrong?
Output:
Data + code:
library('ggplot2')
x <- seq(5, 15, length=1000)
y <- dnorm(x, mean=10, sd=1)
z <- rep_len("z", length.out = 1000)
x1 <- seq(5, 15, length=1000)
y1 <- dnorm(x1, mean=10, sd=2)
z1 <- rep_len("z1", length.out = 1000)
x <- c(x, x1)
y <- c(y, y1)
z <- c(z, z1)
df <- data.frame(x, y, z)
ggplot(data = df, aes(x, y)) + geom_line() + facet_grid(.~z) + geom_text(data = df, aes(x, y, label = ifelse(y == max(y), as.numeric(x), '')), inherit.aes = FALSE, hjust = 0, vjust = 0)
Edit: the output I'm expecting is something like this:
You need to fix two things.
(1) calculate max per z
(2) avoid duplicate y_values
The following code should fix both:
library(dplyr)
df2 <- df %>%
distinct(y, .keep_all = TRUE) %>%
group_by(z) %>%
mutate(y_label = ifelse(y == max(y), as.numeric(x), ''))
as.data.frame(df2)
ggplot(data = df2, aes(x, y)) + geom_line() + facet_grid(.~z) + geom_text(aes(label = y_label), hjust = 0, vjust = 0)
You need to provide geom_text a data.frame with data for z and z1.
x y z
z 9.994995 0.3989373 z
z1 9.994995 0.1994705 z1
How to get that? Well, here's one way.
df.split <- split(df, f = df$z)
df.max <- sapply(df.split, FUN = function(x) which.max(x$y))
df.max <- mapply(function(x1, x2) x1[x2, ], x1 = df.split, x2 = df.max, SIMPLIFY = FALSE)
df.max <- do.call(rbind, df.max)
which you can then plot
ggplot(data = df, aes(x, y)) +
geom_line() +
geom_text(data = df.max, aes(x = x, y = y, label = round(y, 2))) +
facet_grid(. ~ z)
Get the means and maxes for each z:
Ys <- df %>% group_by(z) %>% summarise(maxY = max(y))
Xs <- df %>% group_by(z) %>% summarise(meanX = mean(x))
Plot with the geom_text
ggplot(data = df, aes(x, y)) +
geom_line() +
geom_text(data = left_join(Xs,Ys), aes(meanX, maxY, label = meanX)) +
facet_grid(.~z)
Or more succinctly
ggplot(data = df, aes(x, y)) +
geom_line() +
geom_text(data =
df %>%
group_by(z) %>%
summarise(maxY = max(y), meanX = mean(x)),
aes(meanX, maxY, label = meanX)) +
facet_grid(.~z)
I feel like this should be really easy to do, but I'm having a really hard time figuring this out.
I have a data frame
type <- c("a","b","c","d","e")
x <- rnorm(5)
y <- rnorm(5)
z <- rnorm(5)
xsd <- sd(x)
ysd <- sd(y)
zsd <- sd(z)
df <- data.frame(type, x,y,z,xsd,ysd,zsd)
df
type x y z xsd ysd zsd
1 a -1.16788106 0.2260430 -1.16788106 0.8182508 0.7321015 0.9016335
2 b -0.09955193 -0.6647980 -0.09955193 0.8182508 0.7321015 0.9016335
3 c -0.87901053 -0.4269936 -0.87901053 0.8182508 0.7321015 0.9016335
4 d -0.87861339 -1.3669793 -0.87861339 0.8182508 0.7321015 0.9016335
5 e 0.84350228 0.4702580 0.84350228 0.8182508 0.7321015 0.9016335
and I need a grouped bar graph of the mean of x, y, and z by type with error bars showing the standard deviation for each variable. The standard deviation is in different columns xsd,ysdand zsd
I need to plot the mean in the y axis, type grouping the x, y, z variables in the x axis.
I tried using gather(), to rearrange the data, but I'm not having any success...
Let ggplot2 do the calculations for you:
install.packages("hmisc") # for mean_sdl
library(tidyverse)
type <- c("a","b","c","d","e")
x <- rnorm(5, 10, 5)
y <- rnorm(5, 8, 3)
z <- rnorm(5, 2, 4)
df <- data.frame(type,x,y,z)
df_long <- df %>%
gather(variable, value, x:z)
ggplot(df_long, aes(x = variable, y = value, fill = variable)) +
stat_summary(fun.y = "mean", geom = "col") +
stat_summary(fun.data = mean_sdl, geom = "errorbar", width = .5, fun.args = list(mult = 1))
This example should help:
type <- c("a","b","c","d","e")
x <- rnorm(50,20, 5)
y <- rnorm(50, 25,1)
z <- rnorm(50, 40, 1)
df <- data.frame(type, x,y,z)
df
library(tidyverse)
df %>%
gather(x,value,-type) %>%
group_by(type, x) %>%
summarise(MEAN = mean(value),
SD = sd(value)) %>%
ggplot(aes(x, MEAN, fill=type))+
geom_bar(stat="identity", position = "dodge")+
geom_errorbar(aes(ymin=MEAN-SD, ymax=MEAN+SD), position = "dodge")
here is a test code and I don't understand why is not working as expected. Is a ggplot2 question, not an R one.
library(ggplot2)
K = 10
x <- 1:100/100
y <- sapply (x, FUN= function(x) 1+x)
xy <- data.frame(x,y)
set.seed(1234)
xy$yrand <- xy$y + runif(100,min = -0.35, max = 0.5)
folds <- cut(seq(1, nrow(xy)), breaks = K, labels = FALSE)
p1 <- ggplot(xy, aes(x = xy$x, y = xy$yrand))+geom_point() +ggtitle ("Simple
x vs y plot with added random noise") + xlab("X") + ylab("Y")
for(i in 1:K){
#Segement your data by fold using the which() function
testIndexes <- which(folds==i,arr.ind=TRUE)
testData <- xy[testIndexes, ]
trainData <- xy[-testIndexes, ]
lmTemp <- lm(yrand ~ x, data = trainData)
p1 <- p1 + geom_line(data = trainData, aes(x = trainData$x, y = predict(lmTemp, newdata = trainData)))
}
p1
Now what I would like to see is a plot with 10 lines (the regression lines). But I only see one. Can someone help me out? Is the ggplot2 syntax that is wrong?
Thanks, Umberto
EDITED:
I marked the answer I got since it is a nice way of doing it. I just wanted to add a simple way of doing it preparing the datasets for the graph I wanted to create. I think this method is slightly easier to understand if you don't have so much R experience.
for(i in 1:K){
#Segement your data by fold using the which() function
testIndexes <- which(folds==i,arr.ind=TRUE)
testData <- xy[testIndexes, ]
trainData <- xy[-testIndexes, ]
lmTemp <- lm(yrand ~ x, data = trainData)
# Let's build a data set for the lines
fitLines <- rbind(fitLines, data.frame(rep(paste("set",i),nrow(trainData)),trainData[,1], predict(lmTemp, newdata = trainData)))
}
names(fitLines) <- c("set", "x","y")
p1 + geom_line(data = fitLines, aes(x = x, y = y, col = set))
And this is what you get
You could use the crossv_kfold()function from the modelr-package, and put your complete code into a "pipe-workflow":
library(modelr)
library(tidyverse)
x <- 1:100/100
y <- sapply (x, FUN= function(x) 1+x)
xy <- data.frame(x,y)
set.seed(1234)
xy$yrand <- xy$y + runif(100,min = -0.35, max = 0.5)
xy %>%
crossv_kfold() %>%
mutate(
models = map(train, ~ lm(yrand ~ x, data = .x)),
predictions = map2(models, test, ~predict(.x, newdata = .y, type = "response"))
) %>%
select(-train, -test, -models) %>%
unnest() %>%
bind_cols(xy) %>%
ggplot(aes(x = x, y = predictions)) +
stat_smooth(aes(colour = .id), method = "lm", se = FALSE) +
geom_point(aes(y = yrand))
Putting the colour-aes inside the ggplot-call would also map the points to the groups:
xy %>%
crossv_kfold() %>%
mutate(
models = map(train, ~ lm(yrand ~ x, data = .x)),
predictions = map2(models, test, ~predict(.x, newdata = .y, type = "response"))
) %>%
select(-train, -test, -models) %>%
unnest() %>%
bind_cols(xy) %>%
ggplot(aes(x = x, y = predictions, colour = .id)) +
stat_smooth(, method = "lm", se = FALSE) +
geom_point(aes(y = yrand))