lines function returning too many random lines - r

I have a weird problem with drawing a graph with confidence intervals for predictions.
Here is my code:
rm(list = ls())
cat("\014")
set.seed(1)
file.name <- "testRegresji.pdf"
count = 20
pdf(file.name)
x <- runif(count, 0, 2)
y <- x + rnorm(count)
model <-lm(y ~ x)
xlab.label <- paste("y = ", format(model$coeff[1], digits = 4),
"+", format(model$coeff[2], digits = 4),
"* x + e")
plot(x, y, xlab = xlab.label, ylab = "", main = paste("n = ", count), col = 8)
matlines(x, predict(model, interval = "confidence"),
type = 'l', lty = c(1, 2, 2), col = "black")
abline(0, 1, col = grey(0.4), lwd = 3)
dev.off()
shell.exec(paste(getwd(), "/", file.name, sep = ""))
The resulting graph looks very weird with too many random lines for confidence intervals, although the result of predict function are correct.
Here's the screenshot of the graph:
What could be the issue? Thanks a lot for any help!

Related

Leave One Out Cross Validation - R - KNN

I have a daraset with three columns. I'm trying to implement LOOCV for KNN regression. I 'm able to do it with the LOOCV library but I'm unable to write a manual code like a function. How can I write this?
Here is how my data looks
A1 A2 A3
a b m
c d n
. . .
. . .
. . .
I've tried the sample below:
loocv_tmp <- matrix(NA, nrow = n_train, ncol = length(df))
for (k in 1:n_train) {
train_xy <- xy[-k, ]
test_xy <- xy[k, ]
x <- train_xy$x
y <- train_xy$y
fitted_models <- apply(t(df), 2, function(degf) lm(y ~ ns(x, df = degf)))
pred <- mapply(function(obj, degf) predict(obj, data.frame(x = test_xy$x)),
fitted_models, df)
loocv_tmp[k, ] <- (test_xy$y - pred)^2
}
loocv <- colMeans(loocv_tmp)
plot(df, mse, type = "l", lwd = 2, col = gray(.4), ylab = "Prediction error",
xlab = "Flexibilty (spline's degrees of freedom [log scaled])",
main = "Leave-One-Out Cross-Validation", ylim = c(.1, .8), log = "x")
lines(df, cv, lwd = 2, col = "steelblue2", lty = 2)
lines(df, loocv, lwd = 2, col = "darkorange")
legend(x = "topright", legend = c("Training error", "10-fold CV error", "LOOCV error"),
lty = c(1, 2, 1), lwd = rep(2, 3), col = c(gray(.4), "steelblue2", "darkorange"),
text.width = .3, cex = .85)
But I want something like
lcv(train.data, train.label, K, numfold)
Any suggestion on how can I write the lcv function to perform LOOCV.
Thanks in advance.

Turn off scatter plot and print only regression line

I'm trying to fit a regression curve to my data. My code generates the plot and curve that I want, however, I don't need the scatter plot--only the line. If I comment out the plot, my code fails. Is there a way to (bypass, turn-off, hide) the scatter plot?
Ultimately, I will need to compare multiple regression curves on my graph and the scatter charts become distracting. Also, my R2 shows NULL. Is there a coefficient for R2?
Code below.
# get underlying plot
y <- dataset$'Fuel Consumed 24h'
x <-dataset$'Performance Speed'
plot(x, y, xlab = "Performance Speed", ylab = "24h Fuel Consumption")
# polynomial
f <- function(x,a,b,d) {(a*x^2) + (b*x) + d}
fit <- nls(y ~ f(x,a,b,d), start = c(a=1, b=1, d=1))
co <- round(coef(fit), 2)
r2 <- format(summary(fit)$r.squared, digits = 3)
curve(f(x, a=co[1], b=co[2], d=co[3]), add = TRUE, col="red", lwd=2)
eq <- paste0("Fuel = ", co[1], "PS^2 ", ifelse(sign(co[2]) == 1, " + ", " - "), abs(co[2]), "PS +", co[3], " R2 = ", r2)
# print equation
mtext(eq, 3, line =-2)
mylabel = bquote(italic(R)^2 == .(format(r2, digits = 3)))
text(x = 1, y = 2.5, r2)
Here is an example with cars data
Fit:
data(cars)
f <- function(x,a,b,d) {(a*x^2) + (b*x) + d}
fit <- nls(dist ~ f(speed,a,b,d), data = cars, start = c(a=1, b=1, d=1))
goodness of fit: (as user20650 pointed out R2 makes little sense for a non linear model, perhaps a better metric is RMSE)
rmse <- function(error)
{
sqrt(mean(error^2))
}
error <- predict(fit, cars) - cars$dist
rms <- rmse(error)
eq <- paste0("Fuel = ", co[1], "PS^2 ", ifelse(sign(co[2]) == 1, " + ", " - "), abs(co[2]), "PS +", co[3], " RMSE = ", round(rms, 2))
plot (no need to call plot at all - add other curves with add = T):
curve(f(x, a=co[1], b=co[2], d=co[3]), col="red", lwd=2, from = min(cars$speed), to = max(cars$speed))
mtext(eq, 3, line =-2)
to add another curve:
f2 = function(x, a, b) {a + b*x}
co2 = coef(lm(dist ~ speed, data = cars))
curve(f2(x, a = co2[1], b = co2[2]), col="blue", lwd=2, add = T)
EDIT: as per user20650 suggestion (the nls is really not needed since the poly and nls curves are the same)
co3 = coef(lm(dist ~ poly(speed, 2, raw=TRUE), data = cars))
curve(f3(x, a = co3[1], b = co3[2], c = co3[3]), col="grey", lty = 2, lwd=2, add = T)
legend("topleft", legend = c("nls", "lm", "poly"), col = c("red", "blue", "grey"), lty =c(1,1,2))

Plotting the 95% confidence interval for means drawn from a normal distribution

I have drawn 100 samples of size 10 from a normal distribution with a mean of 10 and standard deviation of two. Code below:
n <- 10
nreps<-100
sample.mean<-numeric(nreps)
for (i in 1:nreps) {
sample <- rnorm(n=n, mean = 10, sd = 2)
sample.mean[i] <- mean(sample)
a <- qnorm(0.95*2/sqrt(n))
ci <- a
}
plot(sample.mean, 1:100)
I want to create a graph that looks like this
This is what I currently have
I know I need to interpret the left hand and right hand bounds of each mean and then insert a horizontal line between them. Means that fall outside the 95% confidence interval are supposed to be colored differently than the rest. I am just beginning to learn R, so a helpful walk-through would be very appreciated.
try it this way:
library(ggplot2)
set.seed(1321)
n <- 10
sd <- 2
n.reps <- 100
my.mean <- 10
alpha <- 0.05
mydata <- matrix(rnorm(n = n.reps * n, mean = my.mean, sd =sd), ncol = n.reps)
sample.means <- apply(mydata, 2, mean)
error <- apply(mydata, 2, function(x) qt(p=1-alpha/2,df=length(x)-1)*sd (x)/sqrt(length(x)))
dfx <- data.frame(sample.means, error, lcl = sample.means-error, ucl = sample.means+error, trial = 1:n.reps)
dfx$miss <- dfx$ucl < my.mean | dfx$lcl > my.mean
ggplot(dfx, aes(x = sample.means, y = trial, xmin = lcl, xmax = ucl, color = miss)) + geom_errorbarh() + geom_point(pch = 1) +
geom_vline(aes(xintercept=my.mean), lty=2) +
xlab("True Mean in Blue and 95% Confidence Intervals") + ylab ("Trial") + ggtitle(paste("Successful CI's:", 100*mean(!dfx$miss), "%")) + scale_color_manual(values = c("green", "red")) +
theme_bw()
or use base:
oldpar <- par(xpd=FALSE)
par(mar=c(8.1, 3.1, 3.1, 4.1))
with(subset(dfx, !miss), plot(sample.means, trial,
xlab = "Sample Mean",
ylab = "Trial",
col = "forestgreen",
xlim=c(min(dfx$lcl), max(dfx$ucl))))
with(subset(dfx, miss), points(sample.means, trial,
col = "red"))
with(subset(dfx, miss), segments(lcl, trial, ucl, trial, col = "red"))
with(subset(dfx, !miss), segments(lcl, trial, ucl, trial, col = "forestgreen"))
abline(v = my.mean, lty = 2, lwd = 2, col = "blue")
par(xpd=TRUE)
legend("bottomright", c("Successful CI", "Miss"), lty = c(1,1), col = c("forestgreen", "red"),
inset=c(-0.1,-0.45))
title(main = paste("Successful CI's:", 100*mean(!dfx$miss), "%"),
sub = "True mean (in blue) and CI's")
par(oldpar)
HTH
James

Add the new regression line but keep the regression lines from previous runs in R

Background
I have a function called TPN (R code is below the picture). When you run this function, it produces two plots (see picture below). The bottom-row plot samples from the top-row plot and then adds a red regression line. Each time you run the TPN function, the bottom-row plot produces a new red-colored regression line.
Question
In the bottom-row plot, I was wondering if there is a way I could KEEP the regression lines from previous runs each time I run the TPN function (see picture below)?
That is, each time that I run a new TPN function the regression line from a previous run is kept in its place (probably in a color other than "red" for distinction purposes), and the new regression line is just added to he bottom-row plot?
############## Input Values #################
TPN = function( each.sub.pop.n = 150,
sub.pop.means = 20:10,
predict.range = 10:0,
sub.pop.sd = .75,
n.sample = 2 ) {
#############################################
par( mar = c(2, 4.1, 2.1, 2.1) )
m = matrix( c(1, 2), nrow = 2, ncol = 1 ); layout(m)
set.seed(2460986)
Vec.rnorm <- Vectorize(function(n, mean, sd) rnorm(n, mean, sd), 'mean')
y <- c( Vec.rnorm(each.sub.pop.n, sub.pop.means, sub.pop.sd) )
set.seed(NULL)
x <- rep(predict.range, each = each.sub.pop.n)
plot(x, y, ylim = range(y)) ## Top-Row Plot
sample <- lapply(split(y, x), function(z) sample(z, n.sample, replace = TRUE))
sample <- data.frame(y = unlist(sample),
x = as.numeric(rep(names(sample), each = n.sample)))
x = sample$x ; y = sample$y
plot(x, y, ylim = range(y)) #### BOTTOM-ROW PLOT
abline(lm(y ~ x), col = 'red') # Regression Line
}
## TEST HERE:
TPN()
It ain't that easy. I made another function and edit the first one as well.
To summarize what I have done:
I made the first function to set par(new = TRUE) at the end of it. Also, set the color for points in the bottom row plot to be white only for formatting. You can get rid of col = 'white', bg = 'white' if you wish.
Then, in the second function top row plot does not get plotted and yaxis won't be added to the bottom row plot from each "test".
Look below:
############## Input Values #################
TPN = function( each.sub.pop.n = 150,
sub.pop.means = 20:10,
predict.range = 10:0,
sub.pop.sd = .75,
n.sample = 2 ) {
#############################################
par( mar = c(2, 4.1, 2.1, 2.1) )
m = matrix( c(1, 2), nrow = 2, ncol = 1 ); layout(m)
set.seed(2460986)
Vec.rnorm <- Vectorize(function(n, mean, sd) rnorm(n, mean, sd), 'mean')
y <- c( Vec.rnorm(each.sub.pop.n, sub.pop.means, sub.pop.sd) )
set.seed(NULL)
x <- rep(predict.range, each = each.sub.pop.n)
par(new = FALSE)
plot(x, y, ylim = range(y)) ## Top-Row Plot
sample <- lapply(split(y, x), function(z) sample(z, n.sample, replace = TRUE))
sample <- data.frame(y = unlist(sample),
x = as.numeric(rep(names(sample), each = n.sample)))
x = sample$x ; y = sample$y
plot(x, y, ylim = range(y), col = 'white', bg = 'white') #### BOTTOM-ROW PLOT
abline(lm(y ~ x), col = 'red') # Regression Line
par(new = TRUE)
}
The second one does not plot the top row one:
############## Input Values #################
TPN2 = function( each.sub.pop.n = 150,
sub.pop.means = 20:10,
predict.range = 10:0,
sub.pop.sd = .75,
n.sample = 2 ) {
#############################################
par( mar = c(2, 4.1, 2.1, 2.1) )
m = matrix( c(1, 2), nrow = 2, ncol = 1 ); layout(m)
set.seed(2460986)
Vec.rnorm <- Vectorize(function(n, mean, sd) rnorm(n, mean, sd), 'mean')
y <- c( Vec.rnorm(each.sub.pop.n, sub.pop.means, sub.pop.sd) )
set.seed(NULL)
x <- rep(predict.range, each = each.sub.pop.n)
#par(new = FALSE) #comment-out
#plot(x, y, ylim = range(y)) ##Top-Row Plot #comment-out
sample <- lapply(split(y, x), function(z) sample(z, n.sample, replace = TRUE))
sample <- data.frame(y = unlist(sample),
x = as.numeric(rep(names(sample), each = n.sample)))
x = sample$x ; y = sample$y
plot(x, y, ylim = range(y), axes = FALSE, col = 'white', bg = 'white') ##BOTTOM-ROW PLOT
abline(lm(y ~ x), col = 'blue') # Regression Line
par(new = TRUE)
}
Then your test would be like this:
## TEST HERE:
TPN()
TPN2()
TPN2()
TPN2()
This is the output:
A simple way to do what you want is to change your main effect (currently none) to return an accumulation of previous regressions and your side effect (plotting) to loop through these previous regressions (in blue) in addition to the current one (in red).
Another tip: you can use the abline(reg=lm(y~x)) argument and just accumulate the lm objects in a list. It's not necessary to store coefficients and intercepts separately as suggested in the other answer. Keeping the lm objects is also a good idea in case you want to go back and look at average R-squared, etc. -- you couldn't do that using only the coefficients.
Your new function could look like:
TPN.accum <- function( each.sub.pop.n = 150,
sub.pop.means = 20:10,
predict.range = 10:0,
sub.pop.sd = .75,
n.sample = 2,
lm.history = list() # the accumulator
){
par( mar = c(2, 4.1, 2.1, 2.1) )
m <- matrix( c(1, 2), nrow = 2, ncol = 1 ); layout(m)
set.seed(2460986)
Vec.rnorm <- Vectorize(function(n, mean, sd) rnorm(n, mean, sd), 'mean')
y <- c( Vec.rnorm(each.sub.pop.n, sub.pop.means, sub.pop.sd) )
set.seed(NULL)
x <- rep(predict.range, each = each.sub.pop.n)
plot(x, y, ylim = range(y)) ### Top-Row Plot
sample <- lapply(split(y, x), function(z) sample(z, n.sample, replace = TRUE))
sample <- data.frame(y = unlist(sample),
x = as.numeric(rep(names(sample), each = n.sample)))
x <- sample$x ; y <- sample$y
lm.current <- lm(y~x) # the current regression
plot(x, y, ylim = range(y)) ### Bottom-Row Plot
abline(reg = lm.current, col = 'red') # plot current regression (red)
for( i in seq_along(lm.history) ){
abline(reg=lm.history[[i]], col='blue') # plot any previous regressions (blue)
}
return(c(lm.history, list(lm.current))) # append current regression to accumulator
}
To initialize it and then run it repeatedly, just do something like:
tpn.history <- TPN.accum()
for (i in 1:5) tpn.history <- TPN.accum(lm.history=tpn.history)
And your output will look like:
I propose two possibilities:
Use par(mfg) to define on which panel to draw, so that you can add new points or lines on any of the two. For the color, I propose to add options saying if this is the first plot or the last plot of the series.
Store the coefficients of the abline to be used on other plots.
Use par(mfg)
I used some transparent color so that we do not see all superimposition of each iteration. Depending on what you want to achieve, you can modify this.
############## Input Values #################
TPN <- function(each.sub.pop.n = 150,
sub.pop.means = 20:10,
predict.range = 10:0,
sub.pop.sd = .75,
n.sample = 2,
plot = TRUE,
first = FALSE,
last = FALSE) {
#############################################
if (plot & first) {
plot.new()
m <- matrix( c(1, 2), nrow = 2, ncol = 1 ); layout(m)
par( mar = c(2, 4.1, 2.1, 2.1) )
}
set.seed(2460986)
Vec.rnorm <- Vectorize(function(n, mean, sd) rnorm(n, mean, sd), 'mean')
y <- c( Vec.rnorm(each.sub.pop.n, sub.pop.means, sub.pop.sd) )
set.seed(NULL)
x <- rep(predict.range, each = each.sub.pop.n)
if (plot) {
par(mfg = c(1,1)) ## Top-Row Plot
if (first) {
plot(x, y, ylim = range(y), col = "transparent")
} else if (last) {
plot(x, y, ylim = range(y))
}
}
sample <- lapply(split(y, x), function(z) sample(z, n.sample, replace = TRUE))
sample <- data.frame(y = unlist(sample),
x = as.numeric(rep(names(sample), each = n.sample)))
x = sample$x ; y = sample$y
if (plot) {
par(mfg = c(2,1)) #### BOTTOM-ROW PLOT
if (first) {
plot(x, y, ylim = range(y), col = "transparent")
}
if (last) {
points(x, y)
}
abline(lm(y ~ x), col = c('blue', 'red')[(last) + 1]) # Regression Line
}
}
## TEST HERE:
n <- 10
for (i in 1:n) {
TPN(first = ifelse(i == 1, TRUE, FALSE), last = ifelse(i == n, TRUE, FALSE))
}
Store the abline coefficients
There is no need of transparent color here because, a new plot is created for each iteration.
############## Input Values #################
TPN <- function(each.sub.pop.n = 150,
sub.pop.means = 20:10,
predict.range = 10:0,
sub.pop.sd = .75,
n.sample = 2,
plot = TRUE,
coefs = FALSE,
coefsup = NULL) {
#############################################
if (plot) {
m <- matrix( c(1, 2), nrow = 2, ncol = 1 )
layout(m)
par( mar = c(2, 4.1, 2.1, 2.1) )
}
set.seed(2460986)
Vec.rnorm <- Vectorize(function(n, mean, sd) rnorm(n, mean, sd), 'mean')
y <- c( Vec.rnorm(each.sub.pop.n, sub.pop.means, sub.pop.sd) )
set.seed(NULL)
x <- rep(predict.range, each = each.sub.pop.n)
if (plot) {
plot(x, y, ylim = range(y))
}
sample <- lapply(split(y, x), function(z) sample(z, n.sample, replace = TRUE))
sample <- data.frame(y = unlist(sample),
x = as.numeric(rep(names(sample), each = n.sample)))
x = sample$x ; y = sample$y
if (plot) {
plot(x, y, ylim = range(y))
# Add the previous lines if exists
if (!is.null(coefsup)) {
apply(coefsup, 1, function(x) abline(a = x[1], b = x[2], col = "blue"))
}
abline(lm(y ~ x), col = 'red') # Regression Line
}
if (coefs) {return(coef(lm(y ~ x)))}
}
# TEST with coefs
n <- 10
coefsup <- NULL
for (i in 1:n) {
coefsup <- rbind(coefsup, TPN(coefs = TRUE, coefsup = coefsup))
}
In both cases, the output is what you expect:

Density Dependent Growth

I'm trying to create a graph in R to show the carrying capacity of a population using an example given to me which is:
install.packages("deSolve", dependencies = TRUE)
clogistic <- function(times, y, parms){
n <- y[1]
r <- parms[1]
alpha <- parms [2]
dN.dt <- r * n * (1 - alpha * n)
return(list(c(dN.dt)))
}
prms <- c(r = 1, alpha = 0.01)
init.N <- c(1)
t.s <- seq(0.1, 10, by = 0.1)
library(deSolve)
out <- ode(y = init.N, times = t.s, clogistic, parms = prms)
plot(out[,1], out[,2], type="l", xlab = "Time", ylab = "N", col = "blue", lwd = 2)
Now I'm using this to try and show a starting population of 178 with an increase of 21 for 15 time steps. But when I try to change the formula it decreases and bottoms out after one time step and stays bottom for the remainder of the time.
I've tried changing init.N <- c(1) to c(178) which it does but then bottoms out. I've tried changing prms <- c(r = 1, alpha = 0.01) to (r = 21, along with the change in initial population change and without but it just doesn't increase. What is it that I am missing? Knowing R it's going to be something small but I just keep missing it.
Any help will be greatly appreciated.
This is the differential equation that is being integrated:
dN.dt <- r * n * (1 - alpha * n)
If you want an asymptote of n= 200 then set alpha to 1/200 so that the rate of change will go to zero when n gets to 200:
prms <- c(r = 1, alpha = .005)
init.N <- 178
t.s <- seq(0.1, 10, by = 0.1)
library(deSolve)
out <- ode(y = init.N, times = t.s, clogistic, parms = prms)
plot(out[,1], out[,2], type="l", xlab = "Time", ylab = "N", col = "blue", lwd = 2)
With a starting value of 178, the rate of change will be negative when alpha is greater than 1/178, will be flatline with alpha == 1/178, and will be logistic when alpha is less than 1/178.
To go from 300 to 200 you would keep alpha = 1/200 and start at 300:
prms <- c(r = 1, alpha = 1/200)
init.N <- c(300)
t.s <- seq(0.1, 10, by = 0.1)
out <- ode(y = init.N, times = t.s, clogistic, parms = prms)
plot(out[,1], out[,2], type="l", xlab = "Time", ylab = "N", col = "blue", lwd = 2)

Resources