Multiple Regression lines in ggplot2 - r

here is a test code and I don't understand why is not working as expected. Is a ggplot2 question, not an R one.
library(ggplot2)
K = 10
x <- 1:100/100
y <- sapply (x, FUN= function(x) 1+x)
xy <- data.frame(x,y)
set.seed(1234)
xy$yrand <- xy$y + runif(100,min = -0.35, max = 0.5)
folds <- cut(seq(1, nrow(xy)), breaks = K, labels = FALSE)
p1 <- ggplot(xy, aes(x = xy$x, y = xy$yrand))+geom_point() +ggtitle ("Simple
x vs y plot with added random noise") + xlab("X") + ylab("Y")
for(i in 1:K){
#Segement your data by fold using the which() function
testIndexes <- which(folds==i,arr.ind=TRUE)
testData <- xy[testIndexes, ]
trainData <- xy[-testIndexes, ]
lmTemp <- lm(yrand ~ x, data = trainData)
p1 <- p1 + geom_line(data = trainData, aes(x = trainData$x, y = predict(lmTemp, newdata = trainData)))
}
p1
Now what I would like to see is a plot with 10 lines (the regression lines). But I only see one. Can someone help me out? Is the ggplot2 syntax that is wrong?
Thanks, Umberto
EDITED:
I marked the answer I got since it is a nice way of doing it. I just wanted to add a simple way of doing it preparing the datasets for the graph I wanted to create. I think this method is slightly easier to understand if you don't have so much R experience.
for(i in 1:K){
#Segement your data by fold using the which() function
testIndexes <- which(folds==i,arr.ind=TRUE)
testData <- xy[testIndexes, ]
trainData <- xy[-testIndexes, ]
lmTemp <- lm(yrand ~ x, data = trainData)
# Let's build a data set for the lines
fitLines <- rbind(fitLines, data.frame(rep(paste("set",i),nrow(trainData)),trainData[,1], predict(lmTemp, newdata = trainData)))
}
names(fitLines) <- c("set", "x","y")
p1 + geom_line(data = fitLines, aes(x = x, y = y, col = set))
And this is what you get

You could use the crossv_kfold()function from the modelr-package, and put your complete code into a "pipe-workflow":
library(modelr)
library(tidyverse)
x <- 1:100/100
y <- sapply (x, FUN= function(x) 1+x)
xy <- data.frame(x,y)
set.seed(1234)
xy$yrand <- xy$y + runif(100,min = -0.35, max = 0.5)
xy %>%
crossv_kfold() %>%
mutate(
models = map(train, ~ lm(yrand ~ x, data = .x)),
predictions = map2(models, test, ~predict(.x, newdata = .y, type = "response"))
) %>%
select(-train, -test, -models) %>%
unnest() %>%
bind_cols(xy) %>%
ggplot(aes(x = x, y = predictions)) +
stat_smooth(aes(colour = .id), method = "lm", se = FALSE) +
geom_point(aes(y = yrand))
Putting the colour-aes inside the ggplot-call would also map the points to the groups:
xy %>%
crossv_kfold() %>%
mutate(
models = map(train, ~ lm(yrand ~ x, data = .x)),
predictions = map2(models, test, ~predict(.x, newdata = .y, type = "response"))
) %>%
select(-train, -test, -models) %>%
unnest() %>%
bind_cols(xy) %>%
ggplot(aes(x = x, y = predictions, colour = .id)) +
stat_smooth(, method = "lm", se = FALSE) +
geom_point(aes(y = yrand))

Related

ggplot - use data passed to ggplot to calculate the mean of the data in subsequent geom calls [duplicate]

I was wondering why variable mean_y is not recognized by my
geom_hline(yintercept = unique(mean_y)) call?
library(tidyverse)
set.seed(20)
n_groups <- 2
n_in_group <- 20
sd_e = 2
groups <- gl(n_groups, n_in_group, labels = c("T","C"))
age <-rnorm(length(groups), 25, 3)
betas <- c(5,0,0,2)
dat <- data.frame(groups=groups,age=age)
X <- model.matrix(~ groups * age, data = dat)
lin_pred <- as.vector(X %*% betas)
dat$y <- rnorm(nrow(X), lin_pred, sd_e)
dat %>% group_by(groups) %>% mutate(mean_y = mean(y)) %>%
ungroup() %>%
ggplot()+aes(x = age, y = y) +
geom_point(aes(color=groups)) +
geom_hline(yintercept = unique(mean_y)) # Error in unique(mean_y) :
# object 'mean_y' not found
Variables need to be inside aes(), try:
geom_hline(aes(yintercept = mean_y))

How to put plotmath labels in ggplot facets

We often want individual regression equations in ggplot facets. The best way to do this is build the labels in a dataframe and then add them manually. But what if the labels contain plotmath, e.g., superscripts?
Here is a way to do it. The plotmath is converted to a string and then parsed by ggplot. The test_eqn function is taken from another Stackoverflow post, I'll link it when I find it again. Sorry about that.
library(ggplot2)
library(dplyr)
test_eqn <- function(y, x){
m <- lm(log(y) ~ log(x)) # fit y = a * x ^ b in log space
p <- exp(predict(m)) # model prediction of y
eq <- substitute(expression(Y==a~X^~b),
list(
a = format(unname(exp(coef(m)[1])), digits = 3),
b = format(unname(coef(m)[2]), digits = 3)
))
list(eq = as.character(eq)[2], pred = p)
}
set.seed(123)
x <- runif(20)
y <- runif(20)
test_eqn(x,y)$eq
#> [1] "Y == \"0.57\" ~ X^~\"0.413\""
data <- data.frame(x = x,
y = y,
f = sample(c("A","B"), 20, replace = TRUE)) %>%
group_by(f) %>%
mutate(
label = test_eqn(y,x)$eq, # add label
labelx = mean(x),
labely = mean(y),
pred = test_eqn(y,x)$pred # add prediction
)
# plot fits (use slice(1) to avoid multiple copies of labels)
ggplot(data) +
geom_point(aes(x = x, y = y)) +
geom_line(aes(x = x, y = pred), colour = "red") +
geom_text(data = slice(data, 1), aes(x = labelx, y = labely, label = label), parse = TRUE) +
facet_wrap("f")
Created on 2021-10-20 by the reprex package (v2.0.1)

Overlay decision boundary for random forests and boostings

I generate some random data and am trying to overlay a decision boundary based upon fitting using random forests and boosting. I can recreate the problem below. I generate the data, and using regression trees I can easily overlay the decision boundary using the following code:
library(tidyverse)
# set seed and generate some random data
set.seed(123)
Dat <- tibble(
x1 = rnorm(100),
x2 = rnorm(100)
) %>% mutate(y = as_factor(ifelse(x1^2 + x2^2 > 1.39, "A", "B")))
circlepts <- tibble(theta = seq(0, 2*pi, length = 100)) %>%
mutate(x = sqrt(1.39) * sin(theta), y = sqrt(1.39) * cos(theta))
# graph the data and draw the boundary
p <- ggplot(Dat) + geom_point(aes(x1, x2, color = y)) + coord_fixed() +
geom_polygon(data = circlepts, aes(x, y), color = "blue", fill = NA)
# convert character to binary inputs making classification easier
binVec = as.vector(Dat$y)
binVec[which(binVec =="A")] = 1
binVec[which(binVec == "B")] = 0
binVec = as.numeric(binVec)
Dat$y = binVec
# split the data up
datasplit <- initial_split(Dat, prop = 0.7)
training_set <- as_tibble(training(datasplit))
testing_set <- as_tibble(testing(datasplit))
tree_fit <- tree(y~ ., training_set)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50), x2 = modelr::seq_range(testing_set$x1, 50)) %>%
modelr::add_predictions(tree_fit)
# plot the data with the decision overlay of the tree fit
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)
Now if I try doing so with random forests or gradient boosting, add_predictions doesn't cooperate that well...
rf_fit <- randomForest(y ~ ., data=training_set, mtry = 2, ntree=500)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50), x2 = modelr::seq_range(testing_set$x1, 50)) %>%
modelr::add_predictions(rf_fit)
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)
##ERROR: Error in if (is.na(out.type)) stop("type must be one of 'response', 'prob', 'vote'") : argument is of length zero
And for gradient boosting:
fitBoost <- gbm(y ~ ., data= Dat, distribution = "gaussian",
n.trees = 1000)
pred <- predict(fitBoost, newdata=training_set, n.trees=1000)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50), x2 = modelr::seq_range(testing_set$x1, 50)) %>%
modelr::add_predictions(fitBoost)
### ERROR: Error in paste("Using", n.trees, "trees...\n") : argument "n.trees" is missing, with no default
It seems to be a very simple problem. Could someone help me out?
The following code works with your random forest:
training_set$y <- factor(training_set$y)
rf_fit <- randomForest(y ~ ., data=training_set, mtry=2, ntree=500)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
x2 = modelr::seq_range(testing_set$x1, 50)) %>%
modelr::add_predictions(rf_fit)
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)
And here is the code for the gradient boosting machine:
fitBoost <- gbm(y ~ ., data=Dat, distribution="gaussian", n.trees=1000)
pred <- predict(fitBoost, newdata=training_set, n.trees=1000)
add_predictions2 <- function (data, model, var = "pred", type = NULL)
{
data[[var]] <- predict2(model, data, type = type)
data
}
predict2 <- function (model, data, type = NULL)
{
if (is.null(type)) {
stats::predict(model, data, n.trees=1000)
} else {
stats::predict(model, data, type = type, n.trees=1000)
}
}
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
x2 = modelr::seq_range(testing_set$x1, 50)) %>%
add_predictions2(fitBoost)
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)

How to plot the result of a regression prediction in R

I am beginning with ML in R, and I really like the idea of visualize the results of my calculations, I am wondering how to plot a Prediction.
library("faraway")
library(tibble)
library(stats)
data("sat")
df<-sat[complete.cases(sat),]
mod_sat_sal <- lm(total ~ salary, data = df)
new_teacher <- tibble(salary = 40)
predict(mod_sat_sal, new_teacher)
Expected result:
Data and Regression Model
data(sat, package = "faraway")
df <- sat[complete.cases(sat), ]
model <- lm(total ~ salary, data = df)
Method (1) : graphics way
# Compute the confidence band
x <- seq(min(df$salary), max(df$salary), length.out = 300)
x.conf <- predict(model, data.frame(salary = x),
interval = 'confidence')
# Plot
plot(total ~ salary, data = df, pch = 16, xaxs = "i")
polygon(c(x, rev(x)), c(x.conf[, 2], rev(x.conf[, 3])),
col = gray(0.5, 0.5), border = NA)
abline(model, lwd = 3, col = "darkblue")
Method (2) : ggplot2 way
library(ggplot2)
ggplot(df, aes(x = salary, y = total)) +
geom_point() +
geom_smooth(method = "lm")

How to draw some cumulative incidence plots in trellis/faceted style in R?

I'd like to create a trellis/faceted plot to see the cumulative incidence of several groups in my data. So I started with
library(rms)
library(dplyr)
data(colon)
fit1 <- npsurv(Surv(time, status) ~ 1, data = colon %>% filter(rx=="Obs"))
survplot(fit1, fun = function(x) 1-x)
fit2 <- npsurv(Surv(time, status) ~ 1, data = colon %>% filter(rx=="Lev"))
survplot(fit2, fun = function(x) 1-x)
fit3 <- npsurv(Surv(time, status) ~ 1, data = colon %>% filter(rx=="Lev+5FU"))
survplot(fit3, fun = function(x) 1-x)
Then I get stuck to put the three plots together and show the name of group on top each plot. I don't like the mfrow or mfcol solution. Can someone explain how to use lattice or ggplot2 to do this?
Thank you for any help!
A tidyverse approach - using purrr::map and purrr:map_df to fit the model for every level of rx. The 1 - x part is left as an excercise to the reader. The only 'magic' part is using mget to subset the list-like result of npsurv to get only the columns needed for the plot.
library(rms)
library(tidyverse)
data(colon)
colon %>%
split(.$rx) %>%
map(~ npsurv(Surv(time, status) ~ 1, data = .)) %>%
map_df(~ mget(c("surv", "upper", "lower", "time"),
as.environment(.)) %>%
data.frame,
.id = "rx") %>%
ggplot(aes(time)) +
geom_ribbon(aes(ymin = lower, ymax = upper), fill = "gray80") +
geom_line(aes(y = surv)) +
facet_wrap(~ rx)
I figured it out myself. Here is my solution
require(rms)
require(dplyr)
data(colon)
dat <- list()
for (i in 1:length(unique(colon$rx))) {
j = sort(unique(colon$rx))[i]
fit <- npsurv(Surv(time, status) ~ sex, data = colon%>%filter(rx == j))
dat[[i]] <- data.frame(Time = fit$time,
Probability = 1 - fit$surv,
Group = c(rep("Male", fit$strata[1]), rep("Female", fit$strata[2])),
Center = j)
}
data_all <- do.call(rbind, dat)
data_all$Group <- factor(data_all$Group, levels = c("Male", "Female"))
require(lattice)
p <- xyplot(Probability ~ Time| Center, group = Group, data = data_all,
grid = T, type = "l",
auto.key = list("top", lines = T, points = F),
par.settings = list(lwd = 2))
p

Resources