Add datapoints to existing scatterplot - r

I have an existing ggplot2 scatterplot which shows the results of a parameter against age from a normal database. I then want to add two additional points to this graph, which I would pass as command line arguments to my script (script age value1 value2). I would like to show these points in red, with an "r" and an "l" geom_text label above each point. I have the following code so far but do not know how to add the finishing touches:
# Attach a package, installing it first when it is not yet available.
#
# x: name of the package, as a single character string.
# Returns NULL invisibly. Stops with "Package not found" when the package
# still cannot be attached after the installation attempt.
pkgLoad <- function(x)
{
  stopifnot(is.character(x), length(x) == 1L)
  # require() is used on purpose: its FALSE return value (instead of an
  # error) is what triggers the installation attempt below.
  if (!require(x, character.only = TRUE))
  {
    install.packages(x, dependencies = TRUE,
                     repos = 'http://star-www.st-andrews.ac.uk/cran/')
    if (!require(x, character.only = TRUE)) stop("Package not found")
  }
  invisible(NULL)
}
pkgLoad("ggplot2")
# Load the current normals database (tab-separated file with a header row).
df <- read.csv("dat_normals.txt", sep = "\t", header = TRUE)
args <- commandArgs(TRUE)
# Positional arguments: age, rSBR, lSBR. commandArgs() returns character
# strings, so convert them to numbers before they are used as coordinates.
age <- as.numeric(args[1])
rSBR <- as.numeric(args[2])
lSBR <- as.numeric(args[3])
# Run the regression and append the prediction intervals (fit, lwr, upr).
lm_fit <- lm(SBR ~ Age, data = df)
sbr_with_pred <- data.frame(df, predict(lm_fit, interval = 'prediction'))
p <- ggplot(sbr_with_pred, aes(x = Age, y = SBR)) +
  geom_point(shape = 19, alpha = 1/4) +
  geom_smooth(method = 'lm', aes(fill = 'confidence'), alpha = 0.5) +
  geom_ribbon(aes(y = fit, ymin = lwr, ymax = upr,
                  fill = 'prediction'), alpha = 0.2) +
  scale_fill_manual('Interval', values = c('green', 'blue')) +
  theme_bw() +
  theme(legend.position = "none")
# Save the plot explicitly rather than relying on the last plot created.
ggsave(filename = "/home/data/wolf/FV_DAT/dat_results.png", plot = p)
browseURL("/home/data/wolf/FV_DAT/dat_results.png")
Essentially, I want to see if the 2 new points fall within the 95% confidence intervals from the normal database (blue ribbon)

Your example is not reproducible. It is really constructive to create data and a reproducible example; it is not a waste of time. For the solution, I am writing up what was said in the comments: you add a new layer with the new data.
# Values taken from the command line are character strings; convert them to
# numbers so the new points can be drawn on the continuous Age/SBR scales.
newdata <- data.frame(Age = as.numeric(args[1]),
                      SBR = as.numeric(c(args[2], args[3])))
# Add the new points as a separate layer with its own data.
p + geom_point(data = newdata, colour = "red", size = 10)
For example:
# Simulated stand-in for the normals database: 50 ages and 50 SBR values.
sbr_with_pred <- data.frame(Age = sample(15:36, 50, replace = TRUE),
                            SBR = rnorm(50))
p <- ggplot(sbr_with_pred, aes(x = Age, y = SBR)) +
  geom_point(shape = 19, alpha = 1/4) +
  geom_smooth(method = 'lm', aes(fill = 'confidence'), alpha = 0.5)
# Stand-in for the three command-line arguments: one age and two SBR values.
args <- c(20, rnorm(1), rnorm(1))
newdata <- data.frame(Age = args[1],
                      SBR = c(args[2], args[3]))
# The two new points are drawn as an extra layer on top of the existing plot.
p + geom_point(data = newdata, colour = "red", size = 10)

Related

Zig zag lines instead of straight line in linear modeling

Dataset: Here
I am trying to fit a linear model on the above dataset using R.
Here is the code in R:
library(tidyverse)
# Read the salary data and standardise every column with scale().
# NOTE(review): assumes all columns are numeric - confirm against the file.
data <- read.csv("~/Desktop/Salary_Data.csv")
s_data <- data.frame(scale(data))
# Split data into train and test sets: each row is assigned to the training
# set with probability 0.8 via a logical mask.
set.seed(123)
sam <- sample(c(T, F), size = nrow(s_data), replace=T, prob = c(0.8,0.2))
train <- s_data[sam,]
test <- s_data[!sam,]
# The model regresses YearsExperience on Salary (response ~ predictor).
model_train = lm(YearsExperience~Salary, data=train);
# pred and pred_train are computed here but not used by the plot below,
# which calls predict.lm() again.
pred <- predict.lm(object = model_train, newdata = test)
pred_train <- predict.lm(model_train, train)
# Plot the test set. The points use Salary on the y axis, whereas the line
# uses the model's predictions (predicted YearsExperience) on the y axis
# against observed YearsExperience on x, so the two layers do not share the
# same y quantity.
ggplot() +
geom_point(aes(x = test$YearsExperience, y = test$Salary),
colour = 'red') +
geom_line(aes(x = test$YearsExperience, y = predict.lm(model_train, test)),
colour = 'blue') +
ggtitle('Salary vs Experience (Test set)') +
xlab('Years of experience') +
ylab('Salary')
Output
My understanding is that the simple linear regression model predicts values based on a linear equation of the form ax+b. So y values in geom_line() must fit in a straight line, but in my case, they don't. Why is that happening? Thanks for reading!
It looks like you just have a problem flipping your x and y values. If you plot years of experience on the x axis, it looks like you are trying to use that to predict salary. But your model is backwards. So you can flip the model and get a straight line
# Option 1: model Salary as a function of YearsExperience, so that the
# predictions are on the same axis as the plotted Salary values.
model_train <- lm(Salary ~ YearsExperience, data = train)
ggplot(
  data.frame(test, pred = predict(model_train, newdata = test))
) +
  geom_point(
    aes(x = YearsExperience, y = Salary),
    colour = 'red'
  ) +
  geom_line(
    aes(x = YearsExperience, y = pred),
    colour = 'blue'
  ) +
  ggtitle('Salary vs Experience (Test set)') +
  xlab('Years of experience') +
  ylab('Salary')
Or you can flip the plot to get a straight line
# Option 2: keep YearsExperience as the response and put Salary on the
# x axis, so that the predictions are plotted against the model's predictor.
model_train <- lm(YearsExperience ~ Salary, data = train)
ggplot(
  data.frame(test, pred = predict(model_train, newdata = test))
) +
  geom_point(
    aes(x = Salary, y = YearsExperience),
    colour = 'red'
  ) +
  geom_line(
    aes(x = Salary, y = pred),
    colour = 'blue'
  ) +
  ggtitle('Salary vs Experience (Test set)')

Tried p + scale_fill_discrete(name = "New Legend Title") but legend title still not changing

I combined two plots of predicted mixed effect model and trying to change the legend title "sR" so that it is friendly to read but I couldn't get it to work when using p + scale_fill_discrete(name = "New Legend Title"). I believe it should be a simple fix but still scratching my head why I can't get it to work even after reading other posts in stackoverflow. Can someone help me please? Thanks.
Below is my R code for you to reproduce the problem:
library(nlme)
library(ggplot2)
library(ggeffects)
# lowW dataset: two samples measured at the same six surface coverages.
SurfaceCoverage <- rep(c(0.04, 0.08, 0.1, 0.12, 0.15, 0.2), times = 2)
TotalSurfaceEnergy <- c(
  139.31449, 105.17776, 105.38411, 99.27608, 92.29064, 91.55114,
  84.44251, 78.40453, 74.66656, 73.33242, 72.42429, 77.08666
)
sample <- rep(c(1, 2), each = 6)
lowW <- data.frame(sample, SurfaceCoverage, TotalSurfaceEnergy)
# Prefix the sample number to build labels such as "Wettable1".
lowW$sample <- paste0("Wettable", lowW$sample)
lowW$RelativeHumidity <- "Low relative humidity"
lowW$group <- "Wettable"
# sR combines the sample label and the humidity level; it is the legend key.
lowW$sR <- paste(lowW$sample, lowW$RelativeHumidity)
# Short column names used by the model formula and by the plot.
dflowW <- with(
  lowW,
  data.frame(y = TotalSurfaceEnergy, x = SurfaceCoverage, b = sample, sR = sR)
)
# Mixed model: y against log(x) with a random intercept per sample (b).
mixed.lme <- lme(y ~ log(x), random = ~ 1 | b, data = dflowW)
pred.mmlowW <- ggpredict(mixed.lme, terms = "x")
# highW dataset: two samples measured at the same six surface coverages.
SurfaceCoverage <- rep(c(0.02, 0.04, 0.06, 0.08, 0.1, 0.12), times = 2)
TotalSurfaceEnergy <- c(
  66.79554, 61.46907, 57.56855, 54.00953, 54.28361, 55.15855,
  50.72314, 48.55892, 47.41811, 43.70885, 42.13757, 40.55924
)
sample <- rep(c(1, 2), each = 6)
highW <- data.frame(sample, SurfaceCoverage, TotalSurfaceEnergy)
# Prefix the sample number to build labels such as "Wettable1".
highW$sample <- paste0("Wettable", highW$sample)
highW$RelativeHumidity <- "High relative humidity"
highW$group <- "Wettable"
# sR combines the sample label and the humidity level; it is the legend key.
highW$sR <- paste(highW$sample, highW$RelativeHumidity)
# Short column names used by the model formula and by the plot.
dfhighW <- with(
  highW,
  data.frame(y = TotalSurfaceEnergy, x = SurfaceCoverage, b = sample, sR = sR)
)
# Mixed model: y against log(x) with a random intercept per sample (b).
mixed.lme <- lme(y ~ log(x), random = ~ 1 | b, data = dfhighW)
pred.mmhighW <- ggpredict(mixed.lme, terms = "x")
# Combine the two predicted mixed-effect models into a single ggplot.
# Each dataset contributes three layers: the predicted line, a ribbon of
# predicted +/- std.error, and the observed points.
p <- ggplot() +
# lowW layers
geom_line(data=pred.mmlowW, aes(x = x, y = predicted)) + # predicted line
geom_ribbon(data=pred.mmlowW, aes(x = x, ymin = predicted - std.error, ymax = predicted + std.error),
fill = "lightgrey", alpha = 0.5) + # error band; fill is a constant, not mapped
geom_point(data = dflowW, # observed data; sR is mapped to shape
aes(x = x, y = y, shape = sR)) +
# highW layers
geom_line(data=pred.mmhighW, aes(x = x, y = predicted)) + # predicted line
geom_ribbon(data=pred.mmhighW, aes(x = x, ymin = predicted - std.error, ymax = predicted + std.error),
fill = "lightgrey", alpha = 0.5) + # error band; fill is a constant, not mapped
geom_point(data = dfhighW, # observed data; sR is mapped to shape
aes(x = x, y = y, shape = sR)) +
xlim(0.01,0.2) +
ylim(30,150) +
labs(title = "") +
ylab(bquote('Total Surface Energy ' (mJ/m^2))) +
xlab(bquote('Surface Coverage ' (n/n[m]) )) +
theme_minimal()
print(p)
# Attempt to rename the legend through the fill scale.
# NOTE(review): no layer above maps a variable to fill; the only mapped
# legend aesthetic is shape (shape = sR). That is presumably why the title
# stays "sR" - naming the shape scale, e.g. labs(shape = "New Legend Title"),
# looks like the matching fix; confirm against the ggplot2 documentation.
p1 <- p + scale_fill_discrete(name = "New Legend Title")
print(p1)

Plot logistic regression using parameters in ggplot2

I would like to plot a logistic regression directly from the parameter estimates using ggplot2, but not quite sure how to do it.
For example, if I had 1500 draws of alpha and beta parameter estimates, I could plot each of the lines thus:
# Simulated posterior draws of the intercept (alpha) and slope (beta).
alpha_post <- rnorm(n = 1500, mean = 1.1, sd = .15)
beta_post <- rnorm(n = 1500, mean = 1.8, sd = .19)
X_lim <- seq(from = -3, to = 2, by = .01)
# Draw one logistic curve per posterior draw: the first draw opens the plot,
# every later draw is added to it with lines().
for (i in seq_along(alpha_post)) {
  print(i)
  # plogis(eta) is the inverse logit, exp(eta) / (1 + exp(eta)).
  y <- plogis(alpha_post[i] + beta_post[i] * X_lim)
  if (i == 1) {
    plot(X_lim, y, type = "l")
  } else {
    # lines() always adds to the current plot; it takes no `add` argument.
    lines(X_lim, y)
  }
}
How would I do this in ggplot2? I know how to use geom_smooth(), but this is a little different.
As always in ggplot, you want to make a data.frame with all data that needs to be plotted:
# Long-format table with one row per (draw, X value) pair. Rows are ordered
# by X value first, with every draw listed once within each X value.
d <- local({
  x_grid <- seq(from = -3, to = 2, by = .01)
  n_draws <- length(alpha_post)
  n_rows <- n_draws * length(x_grid)
  data.frame(
    alpha_post = rep(alpha_post, length.out = n_rows),
    beta_post = rep(beta_post, length.out = n_rows),
    X_lim = rep(x_grid, each = n_draws)
  )
})
# Logistic response for every row.
d <- transform(
  d,
  y = exp(alpha_post + beta_post * X_lim) /
    (1 + exp(alpha_post + beta_post * X_lim))
)
Then the plotting itself becomes quite easy:
# One line per posterior draw; grouping by alpha_post keeps the draws apart.
ggplot(data = d, mapping = aes(x = X_lim, y = y, group = alpha_post)) +
  geom_line()
If you want to be more fancy, add a summary line with e.g. the mean:
# Faint line per posterior draw, plus a bold line for the mean of y at each
# X value. `fun` replaces the deprecated `fun.y`, and `linewidth` replaces
# the deprecated `size` for line thickness.
ggplot(d, aes(X_lim, y)) +
  geom_line(aes(group = alpha_post), alpha = 0.3) +
  geom_line(linewidth = 1, color = 'firebrick', stat = 'summary', fun = 'mean')

graphing confidence intervals nls r

I'm in the process of putting some incidence data together for a proposal. I know that the data takes on a sigmoid shape overall, so I fit it using NLS in R. I was trying to get some confidence intervals to plot as well, so I used bootstrapping for the parameters and made three lines, and here's where I'm having my problem. The bootstrapped CIs give me three sets of values, but because of the form of the equation the resulting lines cross.
Picture of Current Plot with "Ideal" Lines in Black
NLS is not my strong suit so perhaps I'm not going about this the right way. I've used mainly a self start function to this point just to get something down on the plot. The second NLS equation will give the same output, but I've put it down now so that I can alter later if needed.
Here is my code thus far:
data <- readRDS(file = "Incidence.RDS")

# First pass: self-starting logistic model, used only to obtain start values.
inc <- nls(
  y ~ SSlogis(x, beta1, beta2, beta3),
  data = data,
  control = list(maxiter = 100)
)
b1 <- coef(inc)[[1]]
b2 <- coef(inc)[[2]]
b3 <- coef(inc)[[3]]

# Second pass: the same curve written out explicitly, started from the
# estimates of the first fit.
inc2 <- nls(
  y ~ phi1 / (1 + exp(-(x - phi2) / phi3)),
  data = data,
  start = list(phi1 = b1, phi2 = b2, phi3 = b3),
  control = list(maxiter = 100)
)
inc2.boot <- nlsBoot(inc2, niter = 1000)

# Point estimates of the three parameters.
phi1 <- coef(inc2)[[1]]
phi2 <- coef(inc2)[[2]]
phi3 <- coef(inc2)[[3]]

# Bootstrap bounds per parameter: column 2 of bootCI is used as the lower
# bound and column 3 as the upper bound.
phi1_L <- inc2.boot[["bootCI"]][1, 2]
phi2_L <- inc2.boot[["bootCI"]][2, 2]
phi3_L <- inc2.boot[["bootCI"]][3, 2]
phi1_U <- inc2.boot[["bootCI"]][1, 3]
phi2_U <- inc2.boot[["bootCI"]][2, 3]
phi3_U <- inc2.boot[["bootCI"]][3, 3]
# Evaluate the logistic curve over ages 20 to 95 for the point estimates and
# for the lower and upper bootstrap parameter sets.
age <- 20:95
logistic_curve <- function(asym, xmid, scal) {
  asym / (1 + exp(-(age - xmid) / scal))
}
mean_incidence <- logistic_curve(phi1, phi2, phi3)
lower_incidence <- logistic_curve(phi1_L, phi2_L, phi3_L)
upper_incidence <- logistic_curve(phi1_U, phi2_U, phi3_U)
inc_line <- data.frame(age, mean_incidence, lower_incidence, upper_incidence)
# Observed points, the fitted curve (solid), the two bound curves (dashed)
# and a shaded band between the bounds.
p <- ggplot() +
  geom_point(data = data, aes(x = x, y = y), color = "darkgreen") +
  geom_line(
    data = inc_line,
    aes(x = age, y = mean_incidence),
    color = "blue", linetype = "solid"
  ) +
  geom_line(
    data = inc_line,
    aes(x = age, y = lower_incidence),
    color = "blue", linetype = "dashed"
  ) +
  geom_line(
    data = inc_line,
    aes(x = age, y = upper_incidence),
    color = "blue", linetype = "dashed"
  ) +
  geom_ribbon(
    data = inc_line,
    aes(x = age, ymin = lower_incidence, ymax = upper_incidence),
    fill = "blue", alpha = 0.20
  ) +
  labs(x = "\nAge", y = "Incidence (per 1,000 person years)\n")
print(p)
Here's a link to the data.
Any help on what to do next or if this is even possible given my current set up would be appreciated.
Thanks
Try plot.drc in the drc package.
library(drc)
# Fit y against x with drm() using the LL.3() model function, then plot the
# fit with type = "bars".
fm <- drm(
  y ~ x,
  data = data,
  fct = LL.3()
)
plot(fm, type = "bars")
P.S. Please include the library calls in your questions so that the code is self contained and complete. In the case of the question here: library(ggplot2); library(nlstools) .

Computing weighted average using lowess method in R

I am trying to use the lowess method from R to compute the weighted average of a data set which is not uniformly distributed along x axis. For example, the first 5 data points are like this, where the first column is the x and the second is the y.
375.0 2040.0
472.0 5538.0
510.0 4488.0
573.0 2668.0
586.0 7664.0
I used the following command in R:
# Read the tab-separated input without a header row.
# NOTE(review): `add` and `results` are file paths defined outside this snippet.
x <- read.table(file = add, header = FALSE, sep = "\t")
# LOWESS fit of the second column against the first, with span f = 0.01.
y <- lowess(x = x[[1]], y = x[[2]], f = 0.01)
# Write the fitted x/y pairs without row or column names.
write.table(
  x = y,
  file = results,
  sep = "\t",
  col.names = FALSE,
  row.names = FALSE
)
The output looks like this:
The green line shows the average computed by the smooth function in matlab (tri-cubic kernel), and the red line is the average line computed by lowess method in R. The blue dots are the data points.
I can't find why the method in R does not work. Do you have any idea?
Here is a link to part of the data.
Thanks a lot for your help.
The smooth function in MATLAB acts like a filter (a moving average):
yy = smooth(y)
yy(1) = y(1)
yy(2) = (y(1) + y(2) + y(3))/3
yy(3) = (y(1) + y(2) + y(3) + y(4) + y(5))/5 ## convolution of size 5
yy(4) = (y(2) + y(3) + y(4) + y(5) + y(6))/5
I think it is better to do a simple smooth here.
Here are some attempts using loess, lowess with f = 0.2 (1/5), and smooth.spline.
I am using ggplot2 to plot ( to use geom_jitter with some alpha )
library(ggplot2)
# Keep only rows with V2 below 5000.
# NOTE(review): `data` (columns V1, V2) is not defined in this snippet;
# presumably it is the table read from the question's file.
dat <- subset(data, V2 < 5000)
#dat <- data

# Panel 1: jittered points with ggplot2's default smoother.
p1 <- ggplot(data = dat, aes(x = V1, y = V2)) +
  geom_jitter(position = position_jitter(width = .2), alpha = 0.1) +
  geom_smooth()

# lowess() with span f = 0.2; the returned x/y list becomes a data frame.
xy <- as.data.frame(lowess(dat$V1, dat$V2, f = 0.2))
# Smoothing spline fitted to the same data.
xy.smooth <- smooth.spline(dat$V1, dat$V2)
xy.smooth <- data.frame(x = xy.smooth$x, y = xy.smooth$y)

# Panel 2: jittered points with the lowess fit (red) and the spline (blue).
p2 <- ggplot(data = dat, aes(x = V1, y = V2)) +
  geom_jitter(position = position_jitter(width = .2), alpha = 0.1) +
  geom_line(data = xy, aes(x = x, y = y, group = 1), color = 'red') +
  geom_line(data = xy.smooth, aes(x = x, y = y, group = 1), color = 'blue')

# Stack the two panels.
library(gridExtra)
grid.arrange(p1, p2)

Resources