How can I plot data with confidence intervals? - r

If I have 10 values, each of which has a fitted value F, and an upper and lower confidence interval U and L:
set.seed(0815)
F <- runif(10, 1, 2)
L <- runif(10, 0, 1)
U <- runif(10, 2, 3)
How can I show these 10 fitted values and their confidence intervals in the same plot like the one below in R?

Here is a plotrix solution:
set.seed(0815)
x <- 1:10
F <- runif(10,1,2)
L <- runif(10,0,1)
U <- runif(10,2,3)
require(plotrix)
plotCI(x, F, ui=U, li=L)
And here is a ggplot solution:
set.seed(0815)
df <- data.frame(x =1:10,
F =runif(10,1,2),
L =runif(10,0,1),
U =runif(10,2,3))
require(ggplot2)
ggplot(df, aes(x = x, y = F)) +
geom_point(size = 4) +
geom_errorbar(aes(ymax = U, ymin = L))
UPDATE:
Here is a base solution to your edits:
set.seed(1234)
x <- rnorm(20)
df <- data.frame(x = x,
y = x + rnorm(20))
plot(y ~ x, data = df)
# model
mod <- lm(y ~ x, data = df)
# predicts + interval
newx <- seq(min(df$x), max(df$x), length.out=100)
preds <- predict(mod, newdata = data.frame(x=newx),
interval = 'confidence')
# plot
plot(y ~ x, data = df, type = 'n')
# add fill
polygon(c(rev(newx), newx), c(rev(preds[ ,3]), preds[ ,2]), col = 'grey80', border = NA)
# model
abline(mod)
# intervals
lines(newx, preds[ ,3], lty = 'dashed', col = 'red')
lines(newx, preds[ ,2], lty = 'dashed', col = 'red')

Here is a solution using functions plot(), polygon() and lines().
set.seed(1234)
df <- data.frame(x =1:10,
F =runif(10,1,2),
L =runif(10,0,1),
U =runif(10,2,3))
plot(df$x, df$F, ylim = c(0,4), type = "l")
#make polygon where coordinates start with lower limit and
# then upper limit in reverse order
polygon(c(df$x,rev(df$x)),c(df$L,rev(df$U)),col = "grey75", border = FALSE)
lines(df$x, df$F, lwd = 2)
#add red lines on borders of polygon
lines(df$x, df$U, col="red",lty=2)
lines(df$x, df$L, col="red",lty=2)
Now use example data provided by OP in another question:
Lower <- c(0.418116841, 0.391011834, 0.393297710,
0.366144073,0.569956636,0.224775521,0.599166016,0.512269587,
0.531378573, 0.311448219, 0.392045751,0.153614913, 0.366684097,
0.161100849,0.700274810,0.629714150, 0.661641288, 0.533404093,
0.412427559, 0.432905333, 0.525306427,0.224292061,
0.28893064,0.099543648, 0.342995605,0.086973739,0.289030388,
0.081230826,0.164505624, -0.031290586,0.148383474,0.070517523,0.009686605,
-0.052703529,0.475924192,0.253382210, 0.354011010,0.130295355,0.102253218,
0.446598823,0.548330752,0.393985810,0.481691632,0.111811248,0.339626541,
0.267831909,0.133460254,0.347996621,0.412472322,0.133671128,0.178969601,0.484070587,
0.335833224,0.037258467, 0.141312363,0.361392799,0.129791998,
0.283759439,0.333893418,0.569533076,0.385258093,0.356201955,0.481816148,
0.531282473,0.273126565,0.267815691,0.138127486,0.008865700,0.018118398,0.080143484,
0.117861634,0.073697418,0.230002398,0.105855042,0.262367348,0.217799352,0.289108011,
0.161271889,0.219663224,0.306117717,0.538088622,0.320711912,0.264395149,0.396061543,
0.397350946,0.151726970,0.048650180,0.131914718,0.076629840,0.425849394,
0.068692279,0.155144797,0.137939059,0.301912657,-0.071415593,-0.030141781,0.119450922,
0.312927614,0.231345972)
Upper.limit <- c(0.6446223,0.6177311, 0.6034427, 0.5726503,
0.7644718, 0.4585430, 0.8205418, 0.7154043,0.7370033,
0.5285199, 0.5973728, 0.3764209, 0.5818298,
0.3960867,0.8972357, 0.8370151, 0.8359921, 0.7449118,
0.6152879, 0.6200704, 0.7041068, 0.4541011, 0.5222653,
0.3472364, 0.5956551, 0.3068065, 0.5112895, 0.3081448,
0.3745473, 0.1931089, 0.3890704, 0.3031025, 0.2472591,
0.1976092, 0.6906118, 0.4736644, 0.5770463, 0.3528607,
0.3307651, 0.6681629, 0.7476231, 0.5959025, 0.7128883,
0.3451623, 0.5609742, 0.4739216, 0.3694883, 0.5609220,
0.6343219, 0.3647751, 0.4247147, 0.6996334, 0.5562876,
0.2586490, 0.3750040, 0.5922248, 0.3626322, 0.5243285,
0.5548211, 0.7409648, 0.5820070, 0.5530232, 0.6863703,
0.7206998, 0.4952387, 0.4993264, 0.3527727, 0.2203694,
0.2583149, 0.3035342, 0.3462009, 0.3003602, 0.4506054,
0.3359478, 0.4834151, 0.4391330, 0.5273411, 0.3947622,
0.4133769, 0.5288060, 0.7492071, 0.5381701, 0.4825456,
0.6121942, 0.6192227, 0.3784870, 0.2574025, 0.3704140,
0.2945623, 0.6532694, 0.2697202, 0.3652230, 0.3696383,
0.5268808, 0.1545602, 0.2221450, 0.3553377, 0.5204076,
0.3550094)
Fitted.values<- c(0.53136955, 0.50437146, 0.49837019,
0.46939721, 0.66721423, 0.34165926, 0.70985388, 0.61383696,
0.63419092, 0.41998407, 0.49470927, 0.26501789, 0.47425695,
0.27859380, 0.79875525, 0.73336461, 0.74881668, 0.63915795,
0.51385774, 0.52648789, 0.61470661, 0.33919656, 0.40559797,
0.22339000, 0.46932536, 0.19689011, 0.40015996, 0.19468781,
0.26952645, 0.08090917, 0.26872696, 0.18680999, 0.12847285,
0.07245286, 0.58326799, 0.36352329, 0.46552867, 0.24157804,
0.21650915, 0.55738088, 0.64797691, 0.49494416, 0.59728999,
0.22848680, 0.45030036, 0.37087676, 0.25147426, 0.45445930,
0.52339711, 0.24922310, 0.30184215, 0.59185198, 0.44606040,
0.14795374, 0.25815819, 0.47680880, 0.24621212, 0.40404398,
0.44435727, 0.65524894, 0.48363255, 0.45461258, 0.58409323,
0.62599114, 0.38418264, 0.38357103, 0.24545011, 0.11461756,
0.13821664, 0.19183886, 0.23203127, 0.18702881, 0.34030391,
0.22090140, 0.37289121, 0.32846615, 0.40822456, 0.27801706,
0.31652008, 0.41746184, 0.64364785, 0.42944100, 0.37347037,
0.50412786, 0.50828681, 0.26510696, 0.15302635, 0.25116438,
0.18559609, 0.53955941, 0.16920626, 0.26018389, 0.25378867,
0.41439675, 0.04157232, 0.09600163, 0.23739430, 0.41666762,
0.29317767)
Assemble into a data frame (no x provided, so using indices)
df2 <- data.frame(x=seq(length(Fitted.values)),
fit=Fitted.values,lwr=Lower,upr=Upper.limit)
plot(fit~x,data=df2,ylim=range(c(df2$lwr,df2$upr)))
#make polygon where coordinates start with lower limit and then upper limit in reverse order
with(df2,polygon(c(x,rev(x)),c(lwr,rev(upr)),col = "grey75", border = FALSE))
matlines(df2[,1],df2[,-1],
lwd=c(2,1,1),
lty=1,
col=c("black","red","red"))

Here is part of my program related to plotting confidence interval.
1. Generate the test data
ads = 1
require(stats); require(graphics)
library(splines)
x_raw <- seq(1,10,0.1)
y <- cos(x_raw)+rnorm(len_data,0,0.1)
y[30] <- 1.4 # outlier point
len_data = length(x_raw)
N <- len_data
summary(fm1 <- lm(y~bs(x_raw, df=5), model = TRUE, x =T, y = T))
ht <-seq(1,10,length.out = len_data)
plot(x = x_raw, y = y,type = 'p')
y_e <- predict(fm1, data.frame(height = ht))
lines(x= ht, y = y_e)
Result
2. Fitting the raw data using B-spline smoother method
sigma_e <- sqrt(sum((y-y_e)^2)/N)
print(sigma_e)
H<-fm1$x
A <-solve(t(H) %*% H)
y_e_minus <- rep(0,N)
y_e_plus <- rep(0,N)
y_e_minus[N]
for (i in 1:N)
{
tmp <-t(matrix(H[i,])) %*% A %*% matrix(H[i,])
tmp <- 1.96*sqrt(tmp)
y_e_minus[i] <- y_e[i] - tmp
y_e_plus[i] <- y_e[i] + tmp
}
plot(x = x_raw, y = y,type = 'p')
polygon(c(ht,rev(ht)),c(y_e_minus,rev(y_e_plus)),col = rgb(1, 0, 0,0.5), border = NA)
#plot(x = x_raw, y = y,type = 'p')
lines(x= ht, y = y_e_plus, lty = 'dashed', col = 'red')
lines(x= ht, y = y_e)
lines(x= ht, y = y_e_minus, lty = 'dashed', col = 'red')
Result

Some addition to the previous answers. It is nice to regulate the density of the polygon to avoid obscuring the data points.
library(MASS)
attach(Boston)
lm.fit2 = lm(medv~poly(lstat,2))
plot(lstat,medv)
new.lstat = seq(min(lstat), max(lstat), length.out=100)
preds <- predict(lm.fit2, newdata = data.frame(lstat=new.lstat), interval = 'prediction')
lines(sort(lstat), fitted(lm.fit2)[order(lstat)], col='red', lwd=3)
polygon(c(rev(new.lstat), new.lstat), c(rev(preds[ ,3]), preds[ ,2]), density=10, col = 'blue', border = NA)
lines(new.lstat, preds[ ,3], lty = 'dashed', col = 'red')
lines(new.lstat, preds[ ,2], lty = 'dashed', col = 'red')
Please note that you see the prediction interval on the picture, which is several times wider than the confidence interval. You can read here the detailed explanation of those two types of interval estimates.

Related

Draw line through 2d density plot

I have a large dataset of gene expression from ~10,000 patient samples (TCGA), and I'm plotting a predicted expression value (x) and the actual observed value (y) of a certain gene signature. For my downstream analysis, I need to draw a precise line through the plot and calculate different parameters in samples above/below the line.
No matter how I draw a line through the data (geom_smooth(method = 'lm', 'glm', 'gam', or 'loess')), the line always seems imperfect - it doesn't cut through the data to my liking (red line is lm in figure).
After playing around for a while, I realized that the 2d kernel density lines (geom_density2d) actually do a good job of showing the slope/trends of my data, so I manually drew a line that kind of cuts through the density lines (black line in figure).
My question: how can I automatically draw a line that cuts through the kernel density lines, as for the black line in the figure? (Rather than manually playing with different intercepts and slopes till something looks good).
The best approach I can think of is to somehow calculate intercept and slope of the longest diameter for each of the kernel lines, take an average of all those intercepts and slopes and plot that line, but that's a bit out of my league. Maybe someone here has experience with this and can help?
A more hacky approach may be getting the x,y coords of each kernel density line from ggplot_build, and going from there, but it feels too hacky (and is also out of my league).
Thanks!
EDIT: Changed a few details to make the figure/analysis easier. (Density lines are smoother now).
Reprex:
library(MASS)
set.seed(123)
samples <- 10000
r <- 0.9
data <- mvrnorm(n=samples, mu=c(0, 0), Sigma=matrix(c(2, r, r, 2), nrow=2))
x <- data[, 1] # standard normal (mu=0, sd=1)
y <- data[, 2] # standard normal (mu=0, sd=1)
test.df <- data.frame(x = x, y = y)
lm(y ~ x, test.df)
ggplot(test.df, aes(x, y)) +
geom_point(color = 'grey') +
geom_density2d(color = 'red', lwd = 0.5, contour = T, h = c(2,2)) + ### EDIT: h = c(2,2)
geom_smooth(method = "glm", se = F, lwd = 1, color = 'red') +
geom_abline(intercept = 0, slope = 0.7, lwd = 1, col = 'black') ## EDIT: slope to 0.7
Figure:
I generally agree with #Hack-R.
However, it was kind of a fun problem and looking into ggplot_build is not such a big deal.
require(dplyr)
require(ggplot2)
p <- ggplot(test.df, aes(x, y)) +
geom_density2d(color = 'red', lwd = 0.5, contour = T, h = c(2,2))
#basic version of your plot
p_built <- ggplot_build(p)
p_data <- p_built$data[[1]]
p_maxring <- p_data[p_data[['level']] == min(p_data[['level']]),] %>%
select(x,y) # extracts the x/y coordinates of the points on the largest ellipse from your 2d-density contour
Now this answer helped me to find the points on this ellipse which are furthest apart.
coord_mean <- c(x = mean(p_maxring$x), y = mean(p_maxring$y))
p_maxring <- p_maxring %>%
mutate (mean_dev = sqrt((x - mean(x))^2 + (y - mean(y))^2)) #extra column specifying the distance of each point to the mean of those points
coord_farthest <- c('x' = p_maxring$x[which.max(p_maxring$mean_dev)], 'y' = p_maxring$y[which.max(p_maxring$mean_dev)])
# gives the coordinates of the point farthest away from the mean point
farthest_from_farthest <- sqrt((p_maxring$x - coord_farthest['x'])^2 + (p_maxring$y - coord_farthest['y'])^2)
#now this looks which of the points is the farthest from the point farthest from the mean point :D
coord_fff <- c('x' = p_maxring$x[which.max(farthest_from_farthest)], 'y' = p_maxring$y[which.max(farthest_from_farthest)])
ggplot(test.df, aes(x, y)) +
geom_density2d(color = 'red', lwd = 0.5, contour = T, h = c(2,2)) +
# geom_segment using the coordinates of the points farthest apart
geom_segment((aes(x = coord_farthest['x'], y = coord_farthest['y'],
xend = coord_fff['x'], yend = coord_fff['y']))) +
geom_smooth(method = "glm", se = F, lwd = 1, color = 'red') +
# as per your request with your geom_smooth line
coord_equal()
coord_equal is super important, because otherwise you will get super weird results - it messed up my brain too. Because if the coordinates are not set equal, the line will seemingly not pass through the point furthest apart from the mean...
I leave it to you to build this into a function in order to automate it. Also, I'll leave it to you to calculate the y-intercept and slope from the two points
Tjebo's approach was kind of good initially, but after a close look, I found that it found the longest distance between two points on an ellipse. While this is close to what I wanted, it failed with either an irregular shape of the ellipse, or the sparsity of points in the ellipse. This is because it measured the longest distance between two points; whereas what I really wanted is the longest diameter of an ellipse; i.e.: the semi-major axis. See image below for examples/details.
Briefly:
To find/draw density contours of specific density/percentage:
R - How to find points within specific Contour
To get the longest diameter ("semi-major axis") of an ellipse:
https://stackoverflow.com/a/18278767/3579613
For function that returns intercept and slope (as in OP), see last piece of code.
The two pieces of code and images below compare two Tjebo's approach vs. my new approach based on the above posts.
#### Reprex from OP
require(dplyr)
require(ggplot2)
require(MASS)
set.seed(123)
samples <- 10000
r <- 0.9
data <- mvrnorm(n=samples, mu=c(0, 0), Sigma=matrix(c(2, r, r, 2), nrow=2))
x <- data[, 1] # standard normal (mu=0, sd=1)
y <- data[, 2] # standard normal (mu=0, sd=1)
test.df <- data.frame(x = x, y = y)
#### From Tjebo
p <- ggplot(test.df, aes(x, y)) +
geom_density2d(color = 'red', lwd = 0.5, contour = T, h = 2)
p_built <- ggplot_build(p)
p_data <- p_built$data[[1]]
p_maxring <- p_data[p_data[['level']] == min(p_data[['level']]),][,2:3]
coord_mean <- c(x = mean(p_maxring$x), y = mean(p_maxring$y))
p_maxring <- p_maxring %>%
mutate (mean_dev = sqrt((x - mean(x))^2 + (y - mean(y))^2)) #extra column specifying the distance of each point to the mean of those points
p_maxring = p_maxring[round(seq(1, nrow(p_maxring), nrow(p_maxring)/23)),] #### Make a small ellipse to illustrate flaws of approach
coord_farthest <- c('x' = p_maxring$x[which.max(p_maxring$mean_dev)], 'y' = p_maxring$y[which.max(p_maxring$mean_dev)])
# gives the coordinates of the point farthest away from the mean point
farthest_from_farthest <- sqrt((p_maxring$x - coord_farthest['x'])^2 + (p_maxring$y - coord_farthest['y'])^2)
#now this looks which of the points is the farthest from the point farthest from the mean point :D
coord_fff <- c('x' = p_maxring$x[which.max(farthest_from_farthest)], 'y' = p_maxring$y[which.max(farthest_from_farthest)])
farthest_2_points = data.frame(t(cbind(coord_farthest, coord_fff)))
plot(p_maxring[,1:2], asp=1)
lines(farthest_2_points, col = 'blue', lwd = 2)
#### From answer in another post
d = cbind(p_maxring[,1], p_maxring[,2])
r = ellipsoidhull(d)
exy = predict(r) ## the ellipsoid boundary
lines(exy)
me = colMeans((exy))
dist2center = sqrt(rowSums((t(t(exy)-me))^2))
max(dist2center) ## major axis
lines(exy[dist2center == max(dist2center),], col = 'red', lwd = 2)
#### The plot here is made from the data in the reprex in OP, but with h = 0.5
library(MASS)
set.seed(123)
samples <- 10000
r <- 0.9
data <- mvrnorm(n=samples, mu=c(0, 0), Sigma=matrix(c(2, r, r, 2), nrow=2))
x <- data[, 1] # standard normal (mu=0, sd=1)
y <- data[, 2] # standard normal (mu=0, sd=1)
test.df <- data.frame(x = x, y = y)
## MAKE BLUE LINE
p <- ggplot(test.df, aes(x, y)) +
geom_density2d(color = 'red', lwd = 0.5, contour = T, h = 0.5) ## NOTE h = 0.5
p_built <- ggplot_build(p)
p_data <- p_built$data[[1]]
p_maxring <- p_data[p_data[['level']] == min(p_data[['level']]),][,2:3]
coord_mean <- c(x = mean(p_maxring$x), y = mean(p_maxring$y))
p_maxring <- p_maxring %>%
mutate (mean_dev = sqrt((x - mean(x))^2 + (y - mean(y))^2))
coord_farthest <- c('x' = p_maxring$x[which.max(p_maxring$mean_dev)], 'y' = p_maxring$y[which.max(p_maxring$mean_dev)])
farthest_from_farthest <- sqrt((p_maxring$x - coord_farthest['x'])^2 + (p_maxring$y - coord_farthest['y'])^2)
coord_fff <- c('x' = p_maxring$x[which.max(farthest_from_farthest)], 'y' = p_maxring$y[which.max(farthest_from_farthest)])
## MAKE RED LINE
## h = 0.5
## Given the highly irregular shape of the contours, I will use only the largest contour line (0.95) for draing the line.
## Thus, average = 1. See function below for details.
ln = long.diam("x", "y", test.df, h = 0.5, average = 1) ## NOTE h = 0.5
## PLOT
ggplot(test.df, aes(x, y)) +
geom_density2d(color = 'red', lwd = 0.5, contour = T, h = 0.5) + ## NOTE h = 0.5
geom_segment((aes(x = coord_farthest['x'], y = coord_farthest['y'],
xend = coord_fff['x'], yend = coord_fff['y'])), col = 'blue', lwd = 2) +
geom_abline(intercept = ln[1], slope = ln[2], color = 'red', lwd = 2) +
coord_equal()
Finally, I came up with the following function to deal with all this. Sorry for the lack of comments/clarity
#### This will return the intercept and slope of the longest diameter (semi-major axis).
####If Average = TRUE, it will average the int and slope across different density contours.
long.diam = function(x, y, df, probs = c(0.95, 0.5, 0.1), average = T, h = 2) {
fun.df = data.frame(cbind(df[,x], df[,y]))
colnames(fun.df) = c("x", "y")
dens = kde2d(fun.df$x, fun.df$y, n = 200, h = h)
dx <- diff(dens$x[1:2])
dy <- diff(dens$y[1:2])
sz <- sort(dens$z)
c1 <- cumsum(sz) * dx * dy
levels <- sapply(probs, function(x) {
approx(c1, sz, xout = 1 - x)$y
})
names(levels) = paste0("L", str_sub(formatC(probs, 2, format = 'f'), -2))
#plot(fun.df$x,fun.df$y, asp = 1)
#contour(dens, levels = levels, labels=probs, add=T, col = c('red', 'blue', 'green'), lwd = 2)
#contour(dens, add = T, col = 'red', lwd = 2)
#abline(lm(fun.df$y~fun.df$x))
ls <- contourLines(dens, levels = levels)
names(ls) = names(levels)
lines.info = list()
for (i in 1:length(ls)) {
d = cbind(ls[[i]]$x, ls[[i]]$y)
exy = predict(ellipsoidhull(d))## the ellipsoid boundary
colnames(exy) = c("x", "y")
me = colMeans((exy)) ## center of the ellipse
dist2center = sqrt(rowSums((t(t(exy)-me))^2))
#plot(exy,type='l',asp=1)
#points(d,col='blue')
#lines(exy[order(dist2center)[1:2],])
#lines(exy[rev(order(dist2center))[1:2],])
max.dist = data.frame(exy[rev(order(dist2center))[1:2],])
line.fit = lm(max.dist$y ~ max.dist$x)
lines.info[[i]] = c(as.numeric(line.fit$coefficients[1]), as.numeric(line.fit$coefficients[2]))
}
names(lines.info) = names(ls)
#plot(fun.df$x,fun.df$y, asp = 1)
#contour(dens, levels = levels, labels=probs, add=T, col = c('red', 'blue', 'green'), lwd = 2)
#abline(lines.info[[1]], col = 'red', lwd = 2)
#abline(lines.info[[2]], col = 'blue', lwd = 2)
#abline(lines.info[[3]], col = 'green', lwd = 2)
#abline(apply(simplify2array(lines.info), 1, mean), col = 'black', lwd = 4)
if (isTRUE(average)) {
apply(simplify2array(lines.info), 1, mean)
} else {
lines.info[[average]]
}
}
Finally, here's the final implementation of the different answers:
library(MASS)
set.seed(123)
samples = 10000
r = 0.9
data = mvrnorm(n=samples, mu=c(0, 0), Sigma=matrix(c(2, r, r, 2), nrow=2))
x = data[, 1] # standard normal (mu=0, sd=1)
y = data[, 2] # standard normal (mu=0, sd=1)
#plot(x, y)
test.df = data.frame(x = x, y = y)
#### Find furthest two points of contour
## BLUE
p <- ggplot(test.df, aes(x, y)) +
geom_density2d(color = 'red', lwd = 2, contour = T, h = 2)
p_built <- ggplot_build(p)
p_data <- p_built$data[[1]]
p_maxring <- p_data[p_data[['level']] == min(p_data[['level']]),][,2:3]
coord_mean <- c(x = mean(p_maxring$x), y = mean(p_maxring$y))
p_maxring <- p_maxring %>%
mutate (mean_dev = sqrt((x - mean(x))^2 + (y - mean(y))^2))
coord_farthest <- c('x' = p_maxring$x[which.max(p_maxring$mean_dev)], 'y' = p_maxring$y[which.max(p_maxring$mean_dev)])
farthest_from_farthest <- sqrt((p_maxring$x - coord_farthest['x'])^2 + (p_maxring$y - coord_farthest['y'])^2)
coord_fff <- c('x' = p_maxring$x[which.max(farthest_from_farthest)], 'y' = p_maxring$y[which.max(farthest_from_farthest)])
#### Find the average intercept and slope of 3 contour lines (0.95, 0.5, 0.1), as in my long.diam function above.
## RED
ln = long.diam("x", "y", test.df)
#### Plot everything. Black line is GLM
ggplot(test.df, aes(x, y)) +
geom_point(color = 'grey') +
geom_density2d(color = 'red', lwd = 1, contour = T, h = 2) +
geom_smooth(method = "glm", se = F, lwd = 1, color = 'black') +
geom_abline(intercept = ln[1], slope = ln[2], col = 'red', lwd = 1) +
geom_segment((aes(x = coord_farthest['x'], y = coord_farthest['y'],
xend = coord_fff['x'], yend = coord_fff['y'])), col = 'blue', lwd = 1) +
coord_equal()

Plot multiple curves in the same graph in R

library(ROCR);
lig <- unique(read.table("ligands.txt")[,1]);
dec <- unique(read.table("decoys.txt")[,1]);
uniqRes <- read.table("file1.txt",header=T);
colnames(uniqRes)[1]="LigandName";
uniqRes$IsActive <- as.numeric(uniqRes$LigandName %in% lig)
predTOTALuq <- prediction(uniqRes$TOTAL*-1, uniqRes$IsActive)
perfTOTALuq <- performance(predTOTALuq, 'tpr','fpr')
jpeg("hivpr_Rinter_ROC.jpg")
plot(perfTOTALuq,main="hivpr - ROC Curves",col="blue")
abline(0,1,col="grey")
dev.off()
here is the code for plotting single curve by taking data from single file.
i want to plot 3 curves in same plot by taking data from three different files i.e. file 1, file 2, file 3
please help me to do so
you can add abline or curve directly.
df1 <- data.frame(x = 1:10, y = 1:10)
df2 <- data.frame(x = 1:13, y = 2:14)
df3 <- data.frame(x = 6:10, y = 2:6)
lx <- range(c(df1$x, df2$x, df3$x))
ly <- range(c(df1$y, df2$y, df3$y))
plot(df1, main = "hivpr - ROC Curves", xlim = lx, ylim = ly, col = "blue")
abline(0, 1, col = "blue")
points(df2, col = 'red3')
points(df3, col = 'yellow')

Add the new regression line but keep the regression lines from previous runs in R

Background
I have a function called TPN (R code is below the picture). When you run this function, it produces two plots (see picture below). The bottom-row plot samples from the top-row plot and then adds a red regression line. Each time you run the TPN function, the bottom-row plot produces a new red-colored regression line.
Question
In the bottom-row plot, I was wondering if there is a way I could KEEP the regression lines from previous runs each time I run the TPN function (see picture below)?
That is, each time that I run a new TPN function the regression line from a previous run is kept in its place (probably in a color other than "red" for distinction purposes), and the new regression line is just added to he bottom-row plot?
############## Input Values #################
TPN = function( each.sub.pop.n = 150,
sub.pop.means = 20:10,
predict.range = 10:0,
sub.pop.sd = .75,
n.sample = 2 ) {
#############################################
par( mar = c(2, 4.1, 2.1, 2.1) )
m = matrix( c(1, 2), nrow = 2, ncol = 1 ); layout(m)
set.seed(2460986)
Vec.rnorm <- Vectorize(function(n, mean, sd) rnorm(n, mean, sd), 'mean')
y <- c( Vec.rnorm(each.sub.pop.n, sub.pop.means, sub.pop.sd) )
set.seed(NULL)
x <- rep(predict.range, each = each.sub.pop.n)
plot(x, y, ylim = range(y)) ## Top-Row Plot
sample <- lapply(split(y, x), function(z) sample(z, n.sample, replace = TRUE))
sample <- data.frame(y = unlist(sample),
x = as.numeric(rep(names(sample), each = n.sample)))
x = sample$x ; y = sample$y
plot(x, y, ylim = range(y)) #### BOTTOM-ROW PLOT
abline(lm(y ~ x), col = 'red') # Regression Line
}
## TEST HERE:
TPN()
It ain't that easy. I made another function and edit the first one as well.
To summarize what I have done:
I made the first function to set par(new = TRUE) at the end of it. Also, set the color for points in the bottom row plot to be white only for formatting. You can get rid of col = 'white', bg = 'white' if you wish.
Then, in the second function top row plot does not get plotted and yaxis won't be added to the bottom row plot from each "test".
Look below:
############## Input Values #################
TPN = function( each.sub.pop.n = 150,
sub.pop.means = 20:10,
predict.range = 10:0,
sub.pop.sd = .75,
n.sample = 2 ) {
#############################################
par( mar = c(2, 4.1, 2.1, 2.1) )
m = matrix( c(1, 2), nrow = 2, ncol = 1 ); layout(m)
set.seed(2460986)
Vec.rnorm <- Vectorize(function(n, mean, sd) rnorm(n, mean, sd), 'mean')
y <- c( Vec.rnorm(each.sub.pop.n, sub.pop.means, sub.pop.sd) )
set.seed(NULL)
x <- rep(predict.range, each = each.sub.pop.n)
par(new = FALSE)
plot(x, y, ylim = range(y)) ## Top-Row Plot
sample <- lapply(split(y, x), function(z) sample(z, n.sample, replace = TRUE))
sample <- data.frame(y = unlist(sample),
x = as.numeric(rep(names(sample), each = n.sample)))
x = sample$x ; y = sample$y
plot(x, y, ylim = range(y), col = 'white', bg = 'white') #### BOTTOM-ROW PLOT
abline(lm(y ~ x), col = 'red') # Regression Line
par(new = TRUE)
}
The second one does not plot the top row one:
############## Input Values #################
TPN2 = function( each.sub.pop.n = 150,
sub.pop.means = 20:10,
predict.range = 10:0,
sub.pop.sd = .75,
n.sample = 2 ) {
#############################################
par( mar = c(2, 4.1, 2.1, 2.1) )
m = matrix( c(1, 2), nrow = 2, ncol = 1 ); layout(m)
set.seed(2460986)
Vec.rnorm <- Vectorize(function(n, mean, sd) rnorm(n, mean, sd), 'mean')
y <- c( Vec.rnorm(each.sub.pop.n, sub.pop.means, sub.pop.sd) )
set.seed(NULL)
x <- rep(predict.range, each = each.sub.pop.n)
#par(new = FALSE) #comment-out
#plot(x, y, ylim = range(y)) ##Top-Row Plot #comment-out
sample <- lapply(split(y, x), function(z) sample(z, n.sample, replace = TRUE))
sample <- data.frame(y = unlist(sample),
x = as.numeric(rep(names(sample), each = n.sample)))
x = sample$x ; y = sample$y
plot(x, y, ylim = range(y), axes = FALSE, col = 'white', bg = 'white') ##BOTTOM-ROW PLOT
abline(lm(y ~ x), col = 'blue') # Regression Line
par(new = TRUE)
}
Then your test would be like this:
## TEST HERE:
TPN()
TPN2()
TPN2()
TPN2()
This is the output:
A simple way to do what you want is to change your main effect (currently none) to return an accumulation of previous regressions and your side effect (plotting) to loop through these previous regressions (in blue) in addition to the current one (in red).
Another tip: you can use the abline(reg=lm(y~x)) argument and just accumulate the lm objects in a list. It's not necessary to store coefficients and intercepts separately as suggested in the other answer. Keeping the lm objects is also a good idea in case you want to go back and look at average R-squared, etc. -- you couldn't do that using only the coefficients.
Your new function could look like:
TPN.accum <- function( each.sub.pop.n = 150,
sub.pop.means = 20:10,
predict.range = 10:0,
sub.pop.sd = .75,
n.sample = 2,
lm.history = list() # the accumulator
){
par( mar = c(2, 4.1, 2.1, 2.1) )
m <- matrix( c(1, 2), nrow = 2, ncol = 1 ); layout(m)
set.seed(2460986)
Vec.rnorm <- Vectorize(function(n, mean, sd) rnorm(n, mean, sd), 'mean')
y <- c( Vec.rnorm(each.sub.pop.n, sub.pop.means, sub.pop.sd) )
set.seed(NULL)
x <- rep(predict.range, each = each.sub.pop.n)
plot(x, y, ylim = range(y)) ### Top-Row Plot
sample <- lapply(split(y, x), function(z) sample(z, n.sample, replace = TRUE))
sample <- data.frame(y = unlist(sample),
x = as.numeric(rep(names(sample), each = n.sample)))
x <- sample$x ; y <- sample$y
lm.current <- lm(y~x) # the current regression
plot(x, y, ylim = range(y)) ### Bottom-Row Plot
abline(reg = lm.current, col = 'red') # plot current regression (red)
for( i in seq_along(lm.history) ){
abline(reg=lm.history[[i]], col='blue') # plot any previous regressions (blue)
}
return(c(lm.history, list(lm.current))) # append current regression to accumulator
}
To initialize it and then run it repeatedly, just do something like:
tpn.history <- TPN.accum()
for (i in 1:5) tpn.history <- TPN.accum(lm.history=tpn.history)
And your output will look like:
I propose two possibilities:
Use par(mfg) to define on which panel to draw, so that you can add new points or lines on any of the two. For the color, I propose to add options saying if this is the first plot or the last plot of the series.
Store the coefficients of the abline to be used on other plots.
Use par(mfg)
I used some transparent color so that we do not see all superimposition of each iteration. Depending on what you want to achieve, you can modify this.
############## Input Values #################
TPN <- function(each.sub.pop.n = 150,
sub.pop.means = 20:10,
predict.range = 10:0,
sub.pop.sd = .75,
n.sample = 2,
plot = TRUE,
first = FALSE,
last = FALSE) {
#############################################
if (plot & first) {
plot.new()
m <- matrix( c(1, 2), nrow = 2, ncol = 1 ); layout(m)
par( mar = c(2, 4.1, 2.1, 2.1) )
}
set.seed(2460986)
Vec.rnorm <- Vectorize(function(n, mean, sd) rnorm(n, mean, sd), 'mean')
y <- c( Vec.rnorm(each.sub.pop.n, sub.pop.means, sub.pop.sd) )
set.seed(NULL)
x <- rep(predict.range, each = each.sub.pop.n)
if (plot) {
par(mfg = c(1,1)) ## Top-Row Plot
if (first) {
plot(x, y, ylim = range(y), col = "transparent")
} else if (last) {
plot(x, y, ylim = range(y))
}
}
sample <- lapply(split(y, x), function(z) sample(z, n.sample, replace = TRUE))
sample <- data.frame(y = unlist(sample),
x = as.numeric(rep(names(sample), each = n.sample)))
x = sample$x ; y = sample$y
if (plot) {
par(mfg = c(2,1)) #### BOTTOM-ROW PLOT
if (first) {
plot(x, y, ylim = range(y), col = "transparent")
}
if (last) {
points(x, y)
}
abline(lm(y ~ x), col = c('blue', 'red')[(last) + 1]) # Regression Line
}
}
## TEST HERE:
n <- 10
for (i in 1:n) {
TPN(first = ifelse(i == 1, TRUE, FALSE), last = ifelse(i == n, TRUE, FALSE))
}
Store the abline coefficients
There is no need of transparent color here because, a new plot is created for each iteration.
############## Input Values #################
TPN <- function(each.sub.pop.n = 150,
sub.pop.means = 20:10,
predict.range = 10:0,
sub.pop.sd = .75,
n.sample = 2,
plot = TRUE,
coefs = FALSE,
coefsup = NULL) {
#############################################
if (plot) {
m <- matrix( c(1, 2), nrow = 2, ncol = 1 )
layout(m)
par( mar = c(2, 4.1, 2.1, 2.1) )
}
set.seed(2460986)
Vec.rnorm <- Vectorize(function(n, mean, sd) rnorm(n, mean, sd), 'mean')
y <- c( Vec.rnorm(each.sub.pop.n, sub.pop.means, sub.pop.sd) )
set.seed(NULL)
x <- rep(predict.range, each = each.sub.pop.n)
if (plot) {
plot(x, y, ylim = range(y))
}
sample <- lapply(split(y, x), function(z) sample(z, n.sample, replace = TRUE))
sample <- data.frame(y = unlist(sample),
x = as.numeric(rep(names(sample), each = n.sample)))
x = sample$x ; y = sample$y
if (plot) {
plot(x, y, ylim = range(y))
# Add the previous lines if exists
if (!is.null(coefsup)) {
apply(coefsup, 1, function(x) abline(a = x[1], b = x[2], col = "blue"))
}
abline(lm(y ~ x), col = 'red') # Regression Line
}
if (coefs) {return(coef(lm(y ~ x)))}
}
# TEST with coefs
n <- 10
coefsup <- NULL
for (i in 1:n) {
coefsup <- rbind(coefsup, TPN(coefs = TRUE, coefsup = coefsup))
}
In both cases, the output is what you expect:

how to find 95% confidence bands for predicting mean y per value of x and 95% prediction bands for predicting individual y values [duplicate]

If I have 10 values, each of which has a fitted value F, and an upper and lower confidence interval U and L:
set.seed(0815)
F <- runif(10, 1, 2)
L <- runif(10, 0, 1)
U <- runif(10, 2, 3)
How can I show these 10 fitted values and their confidence intervals in the same plot like the one below in R?
Here is a plotrix solution:
set.seed(0815)
x <- 1:10
F <- runif(10,1,2)
L <- runif(10,0,1)
U <- runif(10,2,3)
require(plotrix)
plotCI(x, F, ui=U, li=L)
And here is a ggplot solution:
set.seed(0815)
df <- data.frame(x =1:10,
F =runif(10,1,2),
L =runif(10,0,1),
U =runif(10,2,3))
require(ggplot2)
ggplot(df, aes(x = x, y = F)) +
geom_point(size = 4) +
geom_errorbar(aes(ymax = U, ymin = L))
UPDATE:
Here is a base solution to your edits:
set.seed(1234)
x <- rnorm(20)
df <- data.frame(x = x,
y = x + rnorm(20))
plot(y ~ x, data = df)
# model
mod <- lm(y ~ x, data = df)
# predicts + interval
newx <- seq(min(df$x), max(df$x), length.out=100)
preds <- predict(mod, newdata = data.frame(x=newx),
interval = 'confidence')
# plot
plot(y ~ x, data = df, type = 'n')
# add fill
polygon(c(rev(newx), newx), c(rev(preds[ ,3]), preds[ ,2]), col = 'grey80', border = NA)
# model
abline(mod)
# intervals
lines(newx, preds[ ,3], lty = 'dashed', col = 'red')
lines(newx, preds[ ,2], lty = 'dashed', col = 'red')
Here is a solution using functions plot(), polygon() and lines().
set.seed(1234)
df <- data.frame(x =1:10,
F =runif(10,1,2),
L =runif(10,0,1),
U =runif(10,2,3))
plot(df$x, df$F, ylim = c(0,4), type = "l")
#make polygon where coordinates start with lower limit and
# then upper limit in reverse order
polygon(c(df$x,rev(df$x)),c(df$L,rev(df$U)),col = "grey75", border = FALSE)
lines(df$x, df$F, lwd = 2)
#add red lines on borders of polygon
lines(df$x, df$U, col="red",lty=2)
lines(df$x, df$L, col="red",lty=2)
Now use example data provided by OP in another question:
Lower <- c(0.418116841, 0.391011834, 0.393297710,
0.366144073,0.569956636,0.224775521,0.599166016,0.512269587,
0.531378573, 0.311448219, 0.392045751,0.153614913, 0.366684097,
0.161100849,0.700274810,0.629714150, 0.661641288, 0.533404093,
0.412427559, 0.432905333, 0.525306427,0.224292061,
0.28893064,0.099543648, 0.342995605,0.086973739,0.289030388,
0.081230826,0.164505624, -0.031290586,0.148383474,0.070517523,0.009686605,
-0.052703529,0.475924192,0.253382210, 0.354011010,0.130295355,0.102253218,
0.446598823,0.548330752,0.393985810,0.481691632,0.111811248,0.339626541,
0.267831909,0.133460254,0.347996621,0.412472322,0.133671128,0.178969601,0.484070587,
0.335833224,0.037258467, 0.141312363,0.361392799,0.129791998,
0.283759439,0.333893418,0.569533076,0.385258093,0.356201955,0.481816148,
0.531282473,0.273126565,0.267815691,0.138127486,0.008865700,0.018118398,0.080143484,
0.117861634,0.073697418,0.230002398,0.105855042,0.262367348,0.217799352,0.289108011,
0.161271889,0.219663224,0.306117717,0.538088622,0.320711912,0.264395149,0.396061543,
0.397350946,0.151726970,0.048650180,0.131914718,0.076629840,0.425849394,
0.068692279,0.155144797,0.137939059,0.301912657,-0.071415593,-0.030141781,0.119450922,
0.312927614,0.231345972)
Upper.limit <- c(0.6446223,0.6177311, 0.6034427, 0.5726503,
0.7644718, 0.4585430, 0.8205418, 0.7154043,0.7370033,
0.5285199, 0.5973728, 0.3764209, 0.5818298,
0.3960867,0.8972357, 0.8370151, 0.8359921, 0.7449118,
0.6152879, 0.6200704, 0.7041068, 0.4541011, 0.5222653,
0.3472364, 0.5956551, 0.3068065, 0.5112895, 0.3081448,
0.3745473, 0.1931089, 0.3890704, 0.3031025, 0.2472591,
0.1976092, 0.6906118, 0.4736644, 0.5770463, 0.3528607,
0.3307651, 0.6681629, 0.7476231, 0.5959025, 0.7128883,
0.3451623, 0.5609742, 0.4739216, 0.3694883, 0.5609220,
0.6343219, 0.3647751, 0.4247147, 0.6996334, 0.5562876,
0.2586490, 0.3750040, 0.5922248, 0.3626322, 0.5243285,
0.5548211, 0.7409648, 0.5820070, 0.5530232, 0.6863703,
0.7206998, 0.4952387, 0.4993264, 0.3527727, 0.2203694,
0.2583149, 0.3035342, 0.3462009, 0.3003602, 0.4506054,
0.3359478, 0.4834151, 0.4391330, 0.5273411, 0.3947622,
0.4133769, 0.5288060, 0.7492071, 0.5381701, 0.4825456,
0.6121942, 0.6192227, 0.3784870, 0.2574025, 0.3704140,
0.2945623, 0.6532694, 0.2697202, 0.3652230, 0.3696383,
0.5268808, 0.1545602, 0.2221450, 0.3553377, 0.5204076,
0.3550094)
Fitted.values<- c(0.53136955, 0.50437146, 0.49837019,
0.46939721, 0.66721423, 0.34165926, 0.70985388, 0.61383696,
0.63419092, 0.41998407, 0.49470927, 0.26501789, 0.47425695,
0.27859380, 0.79875525, 0.73336461, 0.74881668, 0.63915795,
0.51385774, 0.52648789, 0.61470661, 0.33919656, 0.40559797,
0.22339000, 0.46932536, 0.19689011, 0.40015996, 0.19468781,
0.26952645, 0.08090917, 0.26872696, 0.18680999, 0.12847285,
0.07245286, 0.58326799, 0.36352329, 0.46552867, 0.24157804,
0.21650915, 0.55738088, 0.64797691, 0.49494416, 0.59728999,
0.22848680, 0.45030036, 0.37087676, 0.25147426, 0.45445930,
0.52339711, 0.24922310, 0.30184215, 0.59185198, 0.44606040,
0.14795374, 0.25815819, 0.47680880, 0.24621212, 0.40404398,
0.44435727, 0.65524894, 0.48363255, 0.45461258, 0.58409323,
0.62599114, 0.38418264, 0.38357103, 0.24545011, 0.11461756,
0.13821664, 0.19183886, 0.23203127, 0.18702881, 0.34030391,
0.22090140, 0.37289121, 0.32846615, 0.40822456, 0.27801706,
0.31652008, 0.41746184, 0.64364785, 0.42944100, 0.37347037,
0.50412786, 0.50828681, 0.26510696, 0.15302635, 0.25116438,
0.18559609, 0.53955941, 0.16920626, 0.26018389, 0.25378867,
0.41439675, 0.04157232, 0.09600163, 0.23739430, 0.41666762,
0.29317767)
Assemble into a data frame (no x provided, so using indices)
df2 <- data.frame(x=seq(length(Fitted.values)),
fit=Fitted.values,lwr=Lower,upr=Upper.limit)
plot(fit~x,data=df2,ylim=range(c(df2$lwr,df2$upr)))
#make polygon where coordinates start with lower limit and then upper limit in reverse order
with(df2,polygon(c(x,rev(x)),c(lwr,rev(upr)),col = "grey75", border = FALSE))
matlines(df2[,1],df2[,-1],
lwd=c(2,1,1),
lty=1,
col=c("black","red","red"))
Here is part of my program related to plotting confidence interval.
1. Generate the test data
ads = 1
require(stats); require(graphics)
library(splines)
x_raw <- seq(1,10,0.1)
y <- cos(x_raw)+rnorm(len_data,0,0.1)
y[30] <- 1.4 # outlier point
len_data = length(x_raw)
N <- len_data
summary(fm1 <- lm(y~bs(x_raw, df=5), model = TRUE, x =T, y = T))
ht <-seq(1,10,length.out = len_data)
plot(x = x_raw, y = y,type = 'p')
y_e <- predict(fm1, data.frame(height = ht))
lines(x= ht, y = y_e)
Result
2. Fitting the raw data using B-spline smoother method
sigma_e <- sqrt(sum((y-y_e)^2)/N)
print(sigma_e)
H<-fm1$x
A <-solve(t(H) %*% H)
y_e_minus <- rep(0,N)
y_e_plus <- rep(0,N)
y_e_minus[N]
for (i in 1:N)
{
tmp <-t(matrix(H[i,])) %*% A %*% matrix(H[i,])
tmp <- 1.96*sqrt(tmp)
y_e_minus[i] <- y_e[i] - tmp
y_e_plus[i] <- y_e[i] + tmp
}
plot(x = x_raw, y = y,type = 'p')
polygon(c(ht,rev(ht)),c(y_e_minus,rev(y_e_plus)),col = rgb(1, 0, 0,0.5), border = NA)
#plot(x = x_raw, y = y,type = 'p')
lines(x= ht, y = y_e_plus, lty = 'dashed', col = 'red')
lines(x= ht, y = y_e)
lines(x= ht, y = y_e_minus, lty = 'dashed', col = 'red')
Result
Some addition to the previous answers. It is nice to regulate the density of the polygon to avoid obscuring the data points.
library(MASS)
attach(Boston)
lm.fit2 = lm(medv~poly(lstat,2))
plot(lstat,medv)
new.lstat = seq(min(lstat), max(lstat), length.out=100)
preds <- predict(lm.fit2, newdata = data.frame(lstat=new.lstat), interval = 'prediction')
lines(sort(lstat), fitted(lm.fit2)[order(lstat)], col='red', lwd=3)
polygon(c(rev(new.lstat), new.lstat), c(rev(preds[ ,3]), preds[ ,2]), density=10, col = 'blue', border = NA)
lines(new.lstat, preds[ ,3], lty = 'dashed', col = 'red')
lines(new.lstat, preds[ ,2], lty = 'dashed', col = 'red')
Please note that you see the prediction interval on the picture, which is several times wider than the confidence interval. You can read here the detailed explanation of those two types of interval estimates.

Plotting deviations from regression line

I want to plot a regression line with (a = 0 and b = 1) and add the individual point deviations from this along with identifying the data point with name.
set.seed(123)
namelab <- paste ("ET", 1:10, sep = "")
xvar <- 1:10
yvar <- rnorm(10, 5, 5)
myd <- data.frame(namelab, xvar, yvar)
plot(xvar, yvar)
abline (a= 0, b = 1, col = "red", lty = 2)
Just manual sketch of my intention, I just labelled a single point just for example. The line drawn need a slim.
dev.new(width=4, height=4)
plot(xvar, yvar, asp=1)
a = 0
b = 1
abline (a, b, col = "red", lty = 2)
myd$xint = with(myd, (b*yvar + xvar - b*a) / (b^2 + 1))
myd$yint = with(myd, (b*yvar + b*xvar + a) / (b^2 + 1))
with(myd, segments(xvar, yvar, xint, yint))
with(myd, text(xvar, yvar, labels=namelab, pos=3, cex=0.5))
...and if you did want vertical as opposed to perpendicular offsets, here is a pretty straightforward option:
set.seed(123)
namelab <- paste ("ET", 1:10, sep = "")
xvar <- 1:10
yvar <- rnorm(10, 5, 5)
plot(xvar, yvar)
abline (a= 0, b = 1, col = "red", lty = 2)
segments(xvar,yvar,xvar,xvar)
text(xvar,yvar,namelab,pos=3)
For this to work for any value of a and b, you would use:
segments(xvar,yvar,xvar,((xvar*b)+a))

Resources