Visualization of predict glm using multiple variables in R - r

I would like to use the following dataset to fit glm and visualize the predict().
y=c(-18.948,-19.007,-18.899,-19.022,-20.599,-19.778,-17.344,-20.265,-20.258,-19.886,-18.05,-19.824,-20.1,-20.508,-20.455,-16.573,-20.249,-20.205,-20.436,-16.358,-17.717,-19.794,-20.372,-19.944,-20.072,-19.889,-20.139,-19.132,-20.275,-19.953,-19.769,-20.2,-19.638,-17.419,-19.086,-18.347,-18.73,-18.872,-18.956,-19.28,-18.176,-19.036,-18.084,-20.11,-19.641,-19.656,-19.25,-18.68,-19.089,-18.969,-18.161,-17.603,-20.37,-19.233,-18.961,-19.083,-20.118,-19.795,-17.154,-16.75)
x1=c(9.698,9.583,9.356,9.326,9.438,9.733,8.803,8.973,9.141,9.044,8.788,9.377,9.26,10.186,9.035,9.569,9.431,9.09,8.776,9.117,9.393,9.408,9.307,8.868,8.398,8.407,9.364,9.074,8.444,9.122,10.11,7.81,9.777,6.472,9.521,8.92,9.341,9.446,9.08,8.071,8.047,8.019,7.419,9.022,9.981,9.337,9.989,10.013,9.31,10.843,8.337,9.103,6.438,9.372,9.071,8.749,9.016,8.181,9.284,8.44)
x2=c('S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S04','S04','S04','S04','S04','S04','S06','S06','S06','S06','S06','S06','S06','S06','S07','S07','S07','S07','S07','S07','S07','S07','S07','S08','S08','S09','S09','S09','S09','S09','S09','S09','S10','S03','S03','S03','S04','S04','S07','S07')
x3=c('A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','P1','P1','P1','P1','P1','P1','P1')
mydata <- data.frame(y,x1,x2,x3)
Fit glm Model:
myglm <- glm(y ~ x1+x2+x3+x1:x2, family="gaussian", data= mydata)
Prediction:
1). Extract the range of x1
min <- min(mydata$x1)
max <- max(mydata$x1)
2). Create a new data frame.x
Here comes the question:
How should I include x2 and x3 in the new.x?
new.x <- data.frame(
x1=seq(min, max, length=60),
x2= ???
x3= ???)
Then predict new.y with myglm:
new.y = predict(myglm, newdata=new.x, se.fit=TRUE)
Combine new.x and new.y:
addThese <- data.frame(new.x, new.y)
interval
addThese <- mutate(addThese,
d15N=exp(fit),
lwr=exp(fit-1.96*se.fit),
upr=exp(fit+1.96*se.fit))
3). Visualization of the original data points and the glm prediction smooth line added:
ggplot(addThese, aes(x1, fit))+
geom_point(shape=21, size=3)+
geom_smooth(data=addThese,
aes(ymin=lwr, ymax=upr),
stat='identity')

I'm still wondering if this is a right way to create new.data, but I'll give it a try. So with your data, slightly modifying your code:
myglm <- glm(y ~ x1 + x2 + x3 + x1:x2, family = gaussian, data = mydata)
minx <- min(mydata$x1)
maxx <- max(mydata$x1)
# create data with all combinations of x1, x2, x3
new.data <- expand.grid(x1 = seq(minx, maxx, length.out = 60),
x2 = unique(mydata$x2),
x3 = unique(mydata$x3)
)
# visualize data
data.frame(predict(myglm, newdata = new.data, se.fit = T)[1:2]) %>%
bind_cols(new.data) %>%
mutate(d15N = exp(fit), lwr = fit - 1.96 * se.fit, upr = fit + 1.96 * se.fit) %>%
ggplot(aes(x = x1, y = fit, colour = interaction(x2, x3))) +
geom_point(size = 1, alpha = .75, pch = 19, position = "jitter") +
geom_smooth(aes(ymin = lwr, ymax = upr), stat = "identity", alpha = .5) +
facet_wrap(~interaction(x2, x3, sep = " : "), nrow = 5) +
ggthemes::theme_few() +
labs(y = "Predicted value", x = bquote(x[1])) +
theme(legend.position = "none")

Related

Fit and plot a Weibull model to a survival data

I want to achieve the exact same thing asked in this question:
How to plot the survival curve generated by survreg (package survival of R)?
Except for the fact that I don't want the data to be stratified by a variable (in the question above it was stratified by sex).
I just want the progression free survival for the whole group of treated patients.
So when I copy the code from the other question, here is where I get stuck:
library(survminer)
library(tidyr)
s <- with(lung,Surv(time,status))
fKM <- survfit(s ~ sex,data=lung)
sWei <- survreg(s ~ as.factor(sex),dist='weibull',data=lung) # in my case here I would replace as.factor(sex) by 1
pred.sex1 = predict(sWei, newdata=list(sex=1),type="quantile",p=seq(.01,.99,by=.01)) #Since I don't want to stratify, what do I do with these 2 lines of code?
pred.sex2 = predict(sWei, newdata=list(sex=2),type="quantile",p=seq(.01,.99,by=.01))
df = data.frame(y=seq(.99,.01,by=-.01), sex1=pred.sex1, sex2=pred.sex2)
df_long = gather(df, key= "sex", value="time", -y)
p = ggsurvplot(fKM, data = lung, risk.table = T)
p$plot = p$plot + geom_line(data=df_long, aes(x=time, y=y, group=sex))
I tried replacing as.factor(sex) by 1 and then the rest of the code just does not make sense, can someone help me with this?
Many thanks in advance!
If you just want to plot the overall empirical survival curve, you might do something like this:
library(survival)
library(survminer)
library(tidyr)
s <- with(lung, Surv(time, status))
fKM <- survfit(s ~ 1, data = survival::lung)
ggsurvplot(fKM, ggtheme = theme_bw())
However, if you want to fit a Weibull model with no predictors, then your formula is fine.
sWei <- survreg(s ~ 1, dist = 'weibull', data = lung)
probs <- seq(0.01, 1, by = 0.01)
time <- predict(sWei, type = "quantile", se = TRUE, p = probs)
The only problem is that time is now a named list of two matrices: fit and se.fit. Both have the same number of rows as lung, but all rows are identical, so we just take one from each and calculate the confidence interval in a data frame which we can then use to create a ggplot:
ggplot(data = data.frame(p = 1 - probs,
time = time$fit[1,],
upper = time$fit[1,] + 1.96 * time$se.fit[1,],
lower = time$fit[1,] - 1.96 * time$se.fit[1,])) +
geom_step(aes(p, time, colour = "All"), size = 1) +
geom_ribbon(aes(p, ymin = lower, ymax = upper, fill = "All"), alpha = 0.2) +
coord_flip(ylim = c(0, 1000)) +
scale_fill_discrete(name = "Strata") +
scale_color_discrete(name = "Strata") +
theme_bw() +
theme(legend.position = "top")
Which we can see looks like a pretty good fit.
If you want both in the same plot you can do something like:
df <- data.frame(p = 1 - probs,
time = time$fit[1,],
upper = time$fit[1,] + 1.96 * time$se.fit[1,],
lower = time$fit[1,] - 1.96 * time$se.fit[1,])
ggsurvplot(fKM, ggtheme = theme_bw())$plot +
geom_line(data = df, aes(time, p), linetype = 2, size = 1) +
geom_line(data = df, aes(upper, p), linetype = 2, size = 1) +
geom_line(data = df, aes(lower, p), linetype = 2, size = 1)
Created on 2020-08-18 by the reprex package (v0.3.0)

Overlay decision boundary for random forests and boostings

I generate some random data and am trying to overlay a decision boundary based upon fitting using random forests and boosting. I can recreate the problem below. I generate the data, and using regression trees I can easily overlay the decision boundary using the following code:
library(tidyverse)
# set seed and generate some random data
set.seed(123)
Dat <- tibble(
x1 = rnorm(100),
x2 = rnorm(100)
) %>% mutate(y = as_factor(ifelse(x1^2 + x2^2 > 1.39, "A", "B")))
circlepts <- tibble(theta = seq(0, 2*pi, length = 100)) %>%
mutate(x = sqrt(1.39) * sin(theta), y = sqrt(1.39) * cos(theta))
# graph the data and draw the boundary
p <- ggplot(Dat) + geom_point(aes(x1, x2, color = y)) + coord_fixed() +
geom_polygon(data = circlepts, aes(x, y), color = "blue", fill = NA)
# convert character to binary inputs making classification easier
binVec = as.vector(Dat$y)
binVec[which(binVec =="A")] = 1
binVec[which(binVec == "B")] = 0
binVec = as.numeric(binVec)
Dat$y = binVec
# split the data up
datasplit <- initial_split(Dat, prop = 0.7)
training_set <- as_tibble(training(datasplit))
testing_set <- as_tibble(testing(datasplit))
tree_fit <- tree(y~ ., training_set)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50), x2 = modelr::seq_range(testing_set$x1, 50)) %>%
modelr::add_predictions(tree_fit)
# plot the data with the decision overlay of the tree fit
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)
Now if I try doing so with random forests or gradient boosting, add_predictions doesn't cooperate that well...
rf_fit <- randomForest(y ~ ., data=training_set, mtry = 2, ntree=500)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50), x2 = modelr::seq_range(testing_set$x1, 50)) %>%
modelr::add_predictions(rf_fit)
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)
##ERROR: Error in if (is.na(out.type)) stop("type must be one of 'response', 'prob', 'vote'") : argument is of length zero
And for gradient boosting:
fitBoost <- gbm(y ~ ., data= Dat, distribution = "gaussian",
n.trees = 1000)
pred <- predict(fitBoost, newdata=training_set, n.trees=1000)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50), x2 = modelr::seq_range(testing_set$x1, 50)) %>%
modelr::add_predictions(fitBoost)
### ERROR: Error in paste("Using", n.trees, "trees...\n") : argument "n.trees" is missing, with no default
It seems to be a very simple problem. Could someone help me out?
The following code works with your random forest:
training_set$y <- factor(training_set$y)
rf_fit <- randomForest(y ~ ., data=training_set, mtry=2, ntree=500)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
x2 = modelr::seq_range(testing_set$x1, 50)) %>%
modelr::add_predictions(rf_fit)
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)
And here is the code for the gradient boosting machine:
fitBoost <- gbm(y ~ ., data=Dat, distribution="gaussian", n.trees=1000)
pred <- predict(fitBoost, newdata=training_set, n.trees=1000)
add_predictions2 <- function (data, model, var = "pred", type = NULL)
{
data[[var]] <- predict2(model, data, type = type)
data
}
predict2 <- function (model, data, type = NULL)
{
if (is.null(type)) {
stats::predict(model, data, n.trees=1000)
} else {
stats::predict(model, data, type = type, n.trees=1000)
}
}
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
x2 = modelr::seq_range(testing_set$x1, 50)) %>%
add_predictions2(fitBoost)
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)

How to plot 3 models in one Figure in R?

I'm new with R and I have fit 3 models for my data as follows:
Model 1: y = a(x) + b
lm1 = lm(data$CBI ~ data$dNDVI)
Model 2: y = a(x)2 + b(x) + c
lm2 <- lm(CBI ~ dNDVI + I(dNDVI^2), data=data)
Model 3: y = x(a|x| + b)–1
lm3 = nls(CBI ~ dNDVI*(a*abs(dNDVI) + b) - 1, start = c(a = 1.5, b = 2.7), data = data)
Now I would like to plot all these three models in R but I could not find the way to do it, can you please help me? I have tried with the first two models as follow and it work but I don't know how to add the Model 3 on it:
ggplot(data = data, aes(x = dNDVI, y = CBI)) +
geom_point() +
geom_smooth(method = lm, formula = y ~ x, size = 1, se = FALSE) +
geom_smooth(method = lm, formula = y ~ x + I(x^2), size = 1, se = FALSE ) +
theme_bw()
I also would like to add a legend which show 3 different colours or types of lines/curves for the 3 models as well. Can you please guide me how to make it in the figure?
Using iris as a dummy set to represent the three models:
new.dat <- data.frame(Sepal.Length=seq(min(iris$Sepal.Length),
max(iris$Sepal.Length), length.out=50)) #new data.frame to predict the fitted values for each model
m1 <- lm(Petal.Length ~ Sepal.Length, iris)
m2 <- lm(Petal.Length ~ Sepal.Length + I(Sepal.Length^2), data=iris)
m3 <- nls(Petal.Length ~ Sepal.Length*(a*abs(Sepal.Length) + b) - 1,
start = c(a = 1.5, b = 2.7), data = iris)
new.dat$m1.fitted <- predict(m1, new.dat)
new.dat$m2.fitted <- predict(m2, new.dat)
new.dat$m3.fitted <- predict(m3, new.dat)
new.dat <- new.dat %>% gather(var, val, m1.fitted:m3.fitted) #stacked format of fitted data of three models (to automatically generate the legend in ggplot)
ggplot(new.dat, aes(Sepal.Length, val, colour=var)) +
geom_line()

2 polynomial regressions in a ggplot() graph

This is my Dataset:
As you can see, there are two quantitative variables (X, Y) and 1 categorical variable (molar, with two factors: M1, M2).
I would like to represent in one single graph two polynomial regressions and their respective prediction intervals: one for the M1 factor and one for the M2 factor. Each polynomial regression has its own degree (M1 is a 4 degree polynomial regression, and M2 is a 6 degree).
I want to use ggplot() function (which is in package ggplot2 in R). I have actually performed this figure but with all data merged (I mean, with no distinction between factors). This is the code I used:
# Fit a linear model
m <- lm(Y ~ X+I(X^2)+I(X^3)+I(X^4), data = Dataset)
# cbind the predictions to Dataset
mpi <- cbind(Dataset, predict(m, interval = "prediction"))
ggplot(mpi, aes(x = X)) +
geom_ribbon(aes(ymin = lwr, ymax = upr),
fill = "blue", alpha = 0.2) +
geom_point(aes(y = Y)) +
geom_line(aes(y = fit), colour = "blue", size = 1)
With this result:
So, I would like to have two different-grade polynomial regressions (one for the M1 and one for the M2), taking into account their respective predictions intervals. Which would be the exact code?
UPDATE - New code! I run this code with no success:
M1=subset(Dataset,Dataset$molar=="M1",select=X:Y)
M2=subset(Dataset,Dataset$molar=="M2",select=X:Y)
M1.R <- lm(Y ~ X +I(X^2)+I(X^3)+I(X^4),
data=subset(Dataset,Dataset$molar=="M1",select=X:Y))
M2.R <- lm(Y ~ X +I(X^2)+I(X^3)+I(X^4),
data=subset(Dataset,Dataset$molar=="M2",select=X:Y))
newdf <- data.frame(x = seq(0, 1, c(408,663)))
M1.P <- cbind(data=subset(Dataset,Dataset$molar=="M1",select=X:Y), predict(M1.R, interval = "prediction"))
M2.P <- cbind(data=subset(Dataset,Dataset$molar=="M2",select=X:Y), predict(M2.R, interval = "prediction"))
p = cbind(as.data.frame(rbind(M1.P, M2.P)), f = factor(rep(1:2, c(408,663)), x = rep(newdf$x, 2))
mdf = with(Dataset, data.frame(x = rep(x, 2), y = c(subset(Dataset,Dataset$molar=="M1",select=Y), subset(Dataset,Dataset$molar=="M2",select=Y),
f = factor(rep(1:2, c(408,663))))
ggplot(mdf, aes(x = x, y = y, colour = f)) + geom_point() +
geom_ribbon(data = p, aes(x = x, ymin = lwr, ymax = upr,
fill = f, y = NULL, colour = NULL),
alpha = 0.2) +
geom_line(data = p, aes(x = x, y = fit))
These are the messages I get now:
[98] WARNING: Warning in if (n < 0L) stop("wrong sign in 'by' argument") :
the condition has length > 1 and only the first element will be used
Warning in if (n > .Machine$integer.max) stop("'by' argument is much too small") :
the condition has length > 1 and only the first element will be used
Warning in 0L:n :
numerical expression has 2 elements: only the first used
Warning in if (by > 0) pmin(x, to) else pmax(x, to) :
the condition has length > 1 and only the first element will be used
[99] WARNING: Warning in predict.lm(M1.R, interval = "prediction") :
predictions on current data refer to _future_ responses
[100] WARNING: Warning in predict.lm(M2.R, interval = "prediction") :
predictions on current data refer to _future_ responses
[101] ERROR: <text>
I think I am closer but still can't see it. Help!
Here is one way. If you have more than two models/levels in the factor you should look into code that will work over the levels of the factor and fit the models that way.
Anyway, first some dummy data:
set.seed(100)
x <- runif(100)
y1 <- 2 + (0.3 * x) + (2.4 * x^2) + (-2.5 * x^3) + (3.4 * x^4) + rnorm(100)
y2 <- -1 + (0.3 * x) + (2.4 * x^2) + (-2.5 * x^3) + (3.4 * x^4) +
(-0.3 * x^5) + (2.4 * x^6) + rnorm(100)
df <- data.frame(x, y1, y2)
Fit our two models:
m1 <- lm(y1 ~ poly(x, 4), data = df)
m2 <- lm(y2 ~ poly(x, 6), data = df)
Now precict at some new locations x and stick it together with x and f, a factor indexing the model, into a tidy format:
newdf <- data.frame(x = seq(0, 1, length = 100))
p1 <- predict(m1, newdata = newdf, interval = "prediction")
p2 <- predict(m2, newdata = newdf, interval = "prediction")
p <- cbind(as.data.frame(rbind(p1, p2)), f = factor(rep(1:2, each = 100)),
x = rep(newdf$x, 2))
Melt the original data into tidy form
mdf <- with(df, data.frame(x = rep(x, 2), y = c(y1, y2),
f = factor(rep(1:2, each = 100))))
Draw the plot, using colour to distinguish the models/data
ggplot(mdf, aes(x = x, y = y, colour = f)) +
geom_point() +
geom_ribbon(data = p, aes(x = x, ymin = lwr, ymax = upr,
fill = f, y = NULL, colour = NULL),
alpha = 0.2) +
geom_line(data = p, aes(x = x, y = fit))
This gets us

graphing confidence intervals nls r

I'm in the process of putting some incidence data together for a proposal. I know that the data takes on a sigmoid shape overall so I fit it using NLS in R. I was trying to get some confidence intervals to plot as well so I used bootstrapping for the parameters, made three lines and here's where I'm having my problem. The bootstrapped CIs give me three sets of values, but because of equation the lines they are crossing.
Picture of Current Plot with "Ideal" Lines in Black
NLS is not my strong suit so perhaps I'm not going about this the right way. I've used mainly a self start function to this point just to get something down on the plot. The second NLS equation will give the same output, but I've put it down now so that I can alter later if needed.
Here is my code thus far:
data <- readRDS(file = "Incidence.RDS")
inc <- nls(y ~ SSlogis(x, beta1, beta2, beta3),
data = data,
control = list(maxiter = 100))
b1 <- summary(inc)$coefficients[1,1]
b2 <- summary(inc)$coefficients[2,1]
b3 <- summary(inc)$coefficients[3,1]
inc2 <- nls(y ~ phi1 / (1 + exp(-(x - phi2) / phi3)),
data = data,
start = list(phi1 = b1, phi2 = b2, phi3 = b3),
control = list(maxiter = 100))
inc2.boot <- nlsBoot(inc2, niter = 1000)
phi1 <- summary(inc2)$coefficients[1,1]
phi2 <- summary(inc2)$coefficients[2,1]
phi3 <- summary(inc2)$coefficients[3,1]
phi1_L <- inc2.boot$bootCI[1,2]
phi2_L <- inc2.boot$bootCI[2,2]
phi3_L <- inc2.boot$bootCI[3,2]
phi1_U <- inc2.boot$bootCI[1,3]
phi2_U <- inc2.boot$bootCI[2,3]
phi3_U <- inc2.boot$bootCI[3,3]
#plot lines
age <- c(20:95)
mean_incidence <- phi1 / (1 + exp(-(age - phi2) / phi3))
lower_incidence <- phi1_L / (1 + exp(-(age - phi2_L) / phi3_L))
upper_incidence <- phi1_U / (1 + exp(-(age - phi2_U) / phi3_U))
inc_line <- data.frame(age, mean_incidence, lower_incidence, upper_incidence)
p <- ggplot()
p <- (p
+ geom_point(data = data, aes(x = x, y = y), color = "darkgreen")
+ geom_line(data = inc_line,
aes(x = age, y = mean_incidence),
color = "blue",
linetype = "solid")
+ geom_line(data = inc_line,
aes(x = age, y = lower_incidence),
color = "blue",
linetype = "dashed")
+ geom_line(data = inc_line,
aes(x = age, y = upper_incidence),
color = "blue",
linetype = "dashed")
+ geom_ribbon(data = inc_line,
aes(x = age, ymin = lower_incidence, ymax = upper_incidence),
fill = "blue", alpha = 0.20)
+ labs(x = "\nAge", y = "Incidence (per 1,000 person years)\n")
)
print(p)
Here's a link to the data.
Any help on what to do next or if this is even possible given my current set up would be appreciated.
Thanks
Try plot.drc in the drc package.
library(drc)
fm <- drm(y ~ x, data = data, fct = LL.3())
plot(fm, type = "bars")
P.S. Please include the library calls in your questions so that the code is self contained and complete. In the case of the question here: library(ggplot2); library(nlstools) .

Resources