ggplot's stat_function() giving wrong result

I generated some data to perform a regression on it:
library(tidyverse)
library(nnet)
# Generating the data --------------------------
set.seed(100)
helicopter <- rnorm(20, mean = 35, sd = 3)
car <- rnorm(20, mean = 30, sd = 3)
bus <- rnorm(20, mean = 25, sd = 3)
bike <- rnorm(20, mean = 20, sd = 3)
transportation_data <- data.frame(helicopter, car, bus, bike) %>%
  pivot_longer(cols = 1:4, values_to = "income", names_to = "mode")
# Setting up the regression -------------------
transportation_regression <- multinom(mode~income, data = transportation_data)
So far, so good. I now want to plot the regression results (probability of choosing a certain mode of transportation based on income) using stat_function:
ins <- coef(transportation_regression)[1:3]
betas <- coef(transportation_regression)[4:6]
transportation_data %>%
  ggplot(aes(x = income)) +
  stat_function(fun = function(x) { 1 / (1 + sum(exp(ins + betas * x))) }, aes(color = "bike")) +
  stat_function(fun = function(x) { exp(ins[1] + betas[1] * x) / (1 + sum(exp(ins + betas * x))) }, aes(color = "bus")) +
  stat_function(fun = function(x) { exp(ins[2] + betas[2] * x) / (1 + sum(exp(ins + betas * x))) }, aes(color = "car")) +
  stat_function(fun = function(x) { exp(ins[3] + betas[3] * x) / (1 + sum(exp(ins + betas * x))) }, aes(color = "helicopter"))
I get output that is obviously wrong, along with a warning I don't understand: Warning: longer object length is not a multiple of shorter object length.
When I use the same functions but compute the predicted points first, everything works just fine:
income <- seq(0,50,0.1)
result <- matrix(NA, nrow = length(income), ncol = 4)
i <- 1
for(x in income){
  result[i, 1] <- 1 / (1 + sum(exp(ins + betas * x)))                          # bike
  result[i, 2] <- exp(ins[1] + betas[1] * x) / (1 + sum(exp(ins + betas * x))) # bus
  result[i, 3] <- exp(ins[2] + betas[2] * x) / (1 + sum(exp(ins + betas * x))) # car
  result[i, 4] <- exp(ins[3] + betas[3] * x) / (1 + sum(exp(ins + betas * x))) # helicopter
  i <- i + 1
}
cbind(income, as.data.frame(result)) %>%
  pivot_longer(cols = V1:V4) %>%
  ggplot(aes(x = income, y = value, color = name)) +
  geom_line()
Why don't the stat_function() calls in ggplot work?

I think it's just a misunderstanding of how the function works. Here's an example of using stat_function() to generate the right result:
library(tidyverse)
library(nnet)
# Generating the data --------------------------
set.seed(100)
helicopter <- rnorm(20, mean = 35, sd = 3)
car <- rnorm(20, mean = 30, sd = 3)
bus <- rnorm(20, mean = 25, sd = 3)
bike <- rnorm(20, mean = 20, sd = 3)
transportation_data <- data.frame(helicopter, car, bus, bike) %>%
  pivot_longer(cols = 1:4, values_to = "income", names_to = "mode")
# Setting up the regression -------------------
transportation_regression <- multinom(mode~income, data = transportation_data)
#> # weights: 12 (6 variable)
#> initial value 110.903549
#> iter 10 value 48.674542
#> iter 20 value 46.980349
#> iter 30 value 46.766625
#> iter 40 value 46.734782
#> iter 50 value 46.732249
#> final value 46.732163
#> converged
ins <- coef(transportation_regression)[1:3]
betas <- coef(transportation_regression)[4:6]
transportation_data %>%
  ggplot(aes(x = income)) +
  stat_function(fun = function(x) { 1 / (1 + exp(ins[1] + betas[1] * x) + exp(ins[2] + betas[2] * x) + exp(ins[3] + betas[3] * x)) }, aes(color = "bike")) +
  stat_function(fun = function(x) { exp(ins[1] + betas[1] * x) / (1 + exp(ins[1] + betas[1] * x) + exp(ins[2] + betas[2] * x) + exp(ins[3] + betas[3] * x)) }, aes(color = "bus")) +
  stat_function(fun = function(x) { exp(ins[2] + betas[2] * x) / (1 + exp(ins[1] + betas[1] * x) + exp(ins[2] + betas[2] * x) + exp(ins[3] + betas[3] * x)) }, aes(color = "car")) +
  stat_function(fun = function(x) { exp(ins[3] + betas[3] * x) / (1 + exp(ins[1] + betas[1] * x) + exp(ins[2] + betas[2] * x) + exp(ins[3] + betas[3] * x)) }, aes(color = "helicopter"))
There were a couple of problems originally. Take, for example, the first instance of stat_function():
stat_function(fun = function(x) {
  1 / (1 + sum(exp(ins + betas * x))) },
  aes(color = "bike"))
You're expecting ins + betas * x to be equivalent to c(ins[1] + betas[1] * x, ins[2] + betas[2] * x, ins[3] + betas[3] * x), but it isn't: stat_function() passes x in as a whole vector of values, so R recycles ins and betas to the length of x and then multiplies betas by x and adds ins element by element.
The other problem was the sum() around exp(ins ...). Rather than producing one sum per value of x, it collapses everything into a single scalar.
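To see the recycling in isolation, here is a minimal sketch with made-up numbers (ins_demo and betas_demo are stand-ins for the fitted coefficients):
ins_demo <- c(1, 2, 3)           # stand-ins for the three intercepts
betas_demo <- c(0.1, 0.2, 0.3)   # stand-ins for the three slopes
x <- 1:5                         # stat_function() hands the function a whole vector like this

# ins_demo and betas_demo are recycled to length 5, so each x is paired with
# only ONE intercept/slope, and R warns because 5 is not a multiple of 3:
ins_demo + betas_demo * x
#> [1] 1.1 2.4 3.9 1.4 3.0
#> Warning message: longer object length is not a multiple of shorter object length

# sum() then collapses everything into a single scalar instead of one value per x:
sum(exp(ins_demo + betas_demo * x))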
You could also make it a bit more general using matrix calculations:
b <- coef(transportation_regression)
transportation_data %>%
  ggplot(aes(x = income)) +
  stat_function(fun = function(x) { 1 / (1 + rowSums(exp(cbind(1, x) %*% t(b)))) }, aes(color = "bike")) +
  stat_function(fun = function(x) { exp(ins[1] + betas[1] * x) / (1 + rowSums(exp(cbind(1, x) %*% t(b)))) }, aes(color = "bus")) +
  stat_function(fun = function(x) { exp(ins[2] + betas[2] * x) / (1 + rowSums(exp(cbind(1, x) %*% t(b)))) }, aes(color = "car")) +
  stat_function(fun = function(x) { exp(ins[3] + betas[3] * x) / (1 + rowSums(exp(cbind(1, x) %*% t(b)))) }, aes(color = "helicopter"))
Created on 2023-02-04 by the reprex package (v2.0.1)
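As an aside, a sketch of an alternative that avoids hand-coding the probabilities entirely: nnet's predict() method for multinom models can return class probabilities directly (assuming the transportation_regression object from above):
# predict class probabilities on a grid of incomes and plot with geom_line()
grid <- data.frame(income = seq(0, 50, 0.1))
probs <- predict(transportation_regression, newdata = grid, type = "probs")

cbind(grid, as.data.frame(probs)) %>%
  pivot_longer(cols = -income, names_to = "mode", values_to = "prob") %>%
  ggplot(aes(x = income, y = prob, color = mode)) +
  geom_line()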

Related

Plotting fitted values from regression

Hey, I have the following code in R:
S0 = 40
r = log(1 + 0.07)
sigma = 0.3
K = 45
n_steps_per_year = 4
dt = 1 / n_steps_per_year
T = 3
n_steps = n_steps_per_year * T
R = n_paths
Q = 70
P = 72
n_paths = P * Q
d = exp(-r * dt)
N = matrix(rnorm(n_paths * n_steps, mean = 0, sd = 1), n_paths, n_steps)
paths_S = matrix(nrow = n_paths, ncol = n_steps + 1, S0)
for(i in 1:n_paths){
  for(j in 1:n_steps){
    paths_S[i, j + 1] = paths_S[i, j] * exp((r - 0.5 * sigma ^ 2) * dt + sigma * sqrt(dt) * N[i, j])
  }
}
I = apply(K - paths_S, c(1,2), max, 0)
V = matrix(nrow = n_paths, ncol = n_steps + 1)
V[, n_steps + 1] = I[, n_steps + 1]
dV = d * V[, n_steps + 1]
model = lm(dV ~ poly(paths_S[, n_steps], 10))
pred = predict(model, data.frame(x = paths_S[, n_steps]))
plot(paths_S[, n_steps], d * V[, n_steps + 1])
lines(paths_S[, n_steps], pred)
but when I run the last two lines I get a very strange plot (multiple lines instead of one line). What is going on?
You did not provide n_paths (it is used in R = n_paths before it is defined), so let's assume:
n_paths = 7
set.seed(111)
Then, after running your code, you need to order your x values before plotting:
o = order(paths_S[, 12])  # 12 is n_steps here
plot(paths_S[o, n_steps], d * V[o, n_steps + 1], cex = 0.2, pch = 20)
lines(paths_S[o, n_steps], pred[o], col = "blue")
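The underlying issue is generic: lines() connects points in the order they appear in the data, so unsorted x values make the pen jump back and forth. A minimal sketch, independent of the option-pricing code:
set.seed(1)
x <- runif(50)
y <- x^2 + rnorm(50, sd = 0.01)
plot(x, y)
lines(x, y, col = "red")         # zigzag: points are connected in data order
o <- order(x)
lines(x[o], y[o], col = "blue")  # a single clean curve once x is sorted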

Add a Passing-Bablok regression line

I have to perform many comparisons between different measurement methods and I have to use the Passing-Bablok regression approach.
I would like to take advantage of ggplot2 and faceting, but I don't know how to add a geom_smooth layer based on the Passing-Bablok regression.
I was thinking about something like: https://stackoverflow.com/a/59173260/2096356
Furthermore, I would also need to show the regression line equation, with confidence interval for intercept and slope parameters, in each plot.
Edit with partial solution
I've found a partial solution combining the code provided in this post and in this answer.
## Regression algorithm
passing_bablok.fit <- function(x, y) {
  x_name <- deparse(substitute(x))
  lx <- length(x)
  l <- lx * (lx - 1) / 2
  k <- 0
  S <- rep(NA, l)  # one slot per pairwise slope (l of them, not lx)
  for (i in 1:(lx - 1)) {
    for (j in (i + 1):lx) {
      k <- k + 1
      S[k] <- (y[i] - y[j]) / (x[i] - x[j])
    }
  }
  S.sort <- sort(S)
  N <- length(S.sort)
  neg <- length(subset(S.sort, S.sort < 0))
  K <- floor(neg / 2)
  if (N %% 2 == 1) {
    b <- S.sort[(N + 1) / 2 + K]
  } else {
    b <- sqrt(S.sort[N / 2 + K] * S.sort[N / 2 + K + 1])
  }
  a <- median(y - b * x)
  res <- as.vector(c(a, b))
  names(res) <- c("(Intercept)", x_name)
  class(res) <- "Passing_Bablok"
  res
}
## Computing confidence intervals
passing_bablok <- function(formula, data, R = 100, weights = NULL){
  ret <- boot::boot(
    data = model.frame(formula, data),
    statistic = function(data, ind) {
      data <- data[ind, ]
      args <- rlang::parse_exprs(colnames(data))
      names(args) <- c("y", "x")
      rlang::eval_tidy(rlang::expr(passing_bablok.fit(!!!args)), data, env = rlang::current_env())
    },
    R = R
  )
  class(ret) <- c("Passing_Bablok", class(ret))
  ret
}
## Plotting confidence bands
predictdf.Passing_Bablok <- function(model, xseq, se, level) {
  pred <- as.vector(tcrossprod(model$t0, cbind(1, xseq)))
  if(se) {
    preds <- tcrossprod(model$t, cbind(1, xseq))
    data.frame(
      x = xseq,
      y = pred,
      ymin = apply(preds, 2, function(x) quantile(x, probs = (1 - level) / 2)),
      ymax = apply(preds, 2, function(x) quantile(x, probs = 1 - ((1 - level) / 2)))
    )
  } else {
    return(data.frame(x = xseq, y = pred))
  }
}
An example of usage:
z <- data.frame(x = rnorm(100, mean = 100, sd = 5),
                y = rnorm(100, mean = 110, sd = 8))
ggplot(z, aes(x, y)) +
  geom_point() +
  geom_smooth(method = passing_bablok) +
  geom_abline(slope = 1, intercept = 0)
So far, I haven't been able to show the regression line equation with confidence intervals for the intercept and slope parameters (as ± or in parentheses).
You've arguably done the difficult part with the Passing-Bablok regression.
Here's a basic solution using your passing_bablok.fit function:
z <- data.frame(x = 101:200 + rnorm(100, sd = 10),
                y = 101:200 + rnorm(100, sd = 8))
mycoefs <- as.numeric(passing_bablok.fit(x = z$x, y = z$y))
paba_eqn <- function(thecoefs) {
  l <- list(m = format(thecoefs[2], digits = 2),
            b = format(abs(thecoefs[1]), digits = 2))
  if(thecoefs[1] >= 0){
    eq <- substitute(italic(y) == m %.% italic(x) + b, l)
  } else {
    eq <- substitute(italic(y) == m %.% italic(x) - b, l)
  }
  as.character(as.expression(eq))
}
library(ggplot2)
ggplot(z, aes(x, y)) +
  geom_point() +
  geom_smooth(method = passing_bablok) +
  geom_abline(slope = 1, intercept = 0) +
  annotate("text", x = 110, y = 220, label = paba_eqn(mycoefs), parse = TRUE)
Note that the equation will vary because of the rnorm() calls in the data creation.
The solution could definitely be made more slick and robust, but it works for both positive and negative intercepts.
Equation concept sourced from: https://stackoverflow.com/a/13451587/2651663
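If you also want confidence intervals for the intercept and slope in the label, here is a sketch building on the passing_bablok() boot wrapper above; it takes percentile intervals from the bootstrap replicates (R = 500 and the label format are arbitrary choices):
fit <- passing_bablok(y ~ x, data = z, R = 500)
# fit$t0 holds the point estimates; the columns of fit$t are the
# bootstrap replicates of the intercept and slope
cis <- apply(fit$t, 2, quantile, probs = c(0.025, 0.975))
sprintf("intercept = %.2f (%.2f to %.2f), slope = %.2f (%.2f to %.2f)",
        fit$t0[1], cis[1, 1], cis[2, 1],
        fit$t0[2], cis[1, 2], cis[2, 2])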

How to combine linear model with step function

Suppose we have this data:
library(tidyverse)
library(modelr)
set.seed(42)
d1 <- tibble(x = 0:49, y = 5*x + rnorm(n = 50))
d2 <- tibble(x = 50:99, y = 10*x + rnorm(n = 50))
data <- rbind(d1, d2)
ggplot(data, aes(x, y)) +
  geom_point()
How to fit that data?
What I tried:
Linear model
m1 <- lm(y ~ x, data = data)
data %>%
  add_predictions(m1) %>%
  gather(key = cat, value = y, -x) %>%
  ggplot(aes(x, y, color = cat)) +
  geom_point()
Step function
# step model
m2 <- lm(y ~ cut(x, 2), data = data)
data %>%
  add_predictions(m2) %>%
  gather(key = cat, value = y, -x) %>%
  ggplot(aes(x, y, color = cat)) +
  geom_point()
How to combine both?
Mathematically, your model takes the form
y = { a_0 + a_1 * x   when x < 50
    { b_0 + b_1 * x   when x >= 50
You can combine the two cases with indicator functions to arrive at a one-line equation:
y = a_0 + (b_0 - a_0) * 1[x >= 50] + a_1 * x + (b_1 - a_1) * x * 1[x >= 50] + error
Simplifying, we could write this as:
y = c_0 + c_1 * x + c_2 * z + c_3 * x * z + error
where I'm writing z = 1[x >= 50] to emphasize that this indicator function is just another regressor.
In R, we can fit this like
lm(y ~ x * I(x >= 50), data = data)
where * fully interacts x and I(x >= 50), as desired.
with(data, {
  plot(x, y)
  reg = lm(y ~ x * I(x >= 50))
  lines(x, predict(reg, data.frame(x)))
})
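To map the fitted coefficients back to the two segments' own intercepts and slopes, a sketch in the c_0 ... c_3 notation above:
reg <- lm(y ~ x * I(x >= 50), data = data)
cf <- coef(reg)          # c_0, c_1, c_2, c_3 in the notation above
a_0 <- cf[1]             # intercept for x < 50
a_1 <- cf[2]             # slope for x < 50
b_0 <- cf[1] + cf[3]     # intercept for x >= 50
b_1 <- cf[2] + cf[4]     # slope for x >= 50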
If you don't know that the jump happens at 50, the road is wide open, but you could, for example, compare mean squared errors across candidate break points:
x_range = 1:100
errs = sapply(x_range, function(BREAK) {
  mean(lm(y ~ x * I(x >= BREAK), data = data)$residuals^2)
})
plot(x_range, errs)
x_min = x_range[which.min(errs)]
axis(side = 1L, at = x_min)
abline(v = x_min, col = 'red')
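Since the question plotted with ggplot2, the same model can also be drawn directly with geom_smooth(). A sketch, assuming the break at x = 50 (note that geom_smooth() draws a single continuous line, so the two segments will be visually joined across the jump):
ggplot(data, aes(x, y)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x * I(x >= 50), se = FALSE)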

How to deal with vertical asymptotes in ggplot2

Consider three simple mathematical functions:
f1 <- function(x) 1/x
f2 <- function(x) tan(x)
f3 <- function(x) 1 / sin(x)
Each has vertical asymptotes, i.e. f(x) tends to infinity as x approaches certain values. I plot these three functions with ggplot2::stat_function():
# x is between -5 and 5
ggplot(data.frame(x = c(-5, 5)), aes(x)) +
  stat_function(fun = f1, n = 1000) +
  coord_cartesian(ylim = c(-50, 50))
# x is between -2*pi and 2*pi
ggplot(data.frame(x = c(-2*pi, 2*pi)), aes(x)) +
  stat_function(fun = f2, n = 1000) +
  coord_cartesian(ylim = c(-50, 50))
# x is between -2*pi and 2*pi
ggplot(data.frame(x = c(-2*pi, 2*pi)), aes(x)) +
  stat_function(fun = f3, n = 1000) +
  coord_cartesian(ylim = c(-50, 50))
The asymptotes appear, respectively, at:
x1 <- 0
x2 <- c(-3/2*pi, -1/2*pi, 1/2*pi, 3/2*pi)
x3 <- c(-pi, 0, pi)
These vertical lines are not actually part of the functions, but ggplot draws them anyway. I attempted to use geom_vline() to cover them, namely:
+ geom_vline(xintercept = x1, color = "white")
+ geom_vline(xintercept = x2, color = "white")
+ geom_vline(xintercept = x3, color = "white")
The output looks rough, and indistinct black marks can still be seen. Are there any more robust methods?
Here is a solution related to #Mojoesque's comments. It is not perfect, but it is relatively simple, with two minor shortcomings: you need to know the asymptotes (x1, x2, x3), and you may need to reduce the range of y.
eps <- 0.01
f1 <- function(x) if(min(abs(x - x1)) < eps) NA else 1/x
f2 <- function(x) if(min(abs(x - x2)) < eps) NA else tan(x)
f3 <- function(x) if(min(abs(x - x3)) < eps) NA else 1 / sin(x)
ggplot(data.frame(x = c(-5, 5)), aes(x)) +
  stat_function(fun = Vectorize(f1), n = 1000) +
  coord_cartesian(ylim = c(-30, 30))
ggplot(data.frame(x = c(-2*pi, 2*pi)), aes(x)) +
  stat_function(fun = Vectorize(f2), n = 1000) +
  coord_cartesian(ylim = c(-30, 30))
ggplot(data.frame(x = c(-2*pi, 2*pi)), aes(x)) +
  stat_function(fun = Vectorize(f3), n = 1000) +
  coord_cartesian(ylim = c(-30, 30))
This solution is based on #Mojoesque's comment: use a piecewise approach to partition the x-axis into several subintervals around the asymptotes, build one stat_function() layer per subinterval, and combine them with purrr::reduce(). The constraint is that the asymptotes need to be given.
Take tan(x) for example :
f <- function(x) tan(x)
asymp <- c(-3/2*pi, -1/2*pi, 1/2*pi, 3/2*pi)
left <- -2 * pi # left border
right <- 2 * pi # right border
d <- 0.001
interval <- data.frame(x1 = c(left, asymp + d),
                       x2 = c(asymp - d, right))
interval # divide the entire x-axis into 5 sections
# x1 x2
# 1 -6.283185 -4.713389
# 2 -4.711389 -1.571796
# 3 -1.569796 1.569796
# 4 1.571796 4.711389
# 5 4.713389 6.283185
library(tidyverse)
pmap(interval, function(x1, x2) {
  stat_function(fun = f, xlim = c(x1, x2), n = 1000)
}) %>% reduce(.f = `+`,
              .init = ggplot(data.frame(x = c(left, right)), aes(x)) +
                coord_cartesian(ylim = c(-50, 50)))
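Another sketch that avoids knowing the asymptotes in advance: evaluate the (vectorized) function on a grid yourself, blank out a point wherever the value jumps wildly between neighbours, and draw with geom_line(), which breaks the line at NA values. The jump threshold here is a made-up tuning constant:
library(tidyverse)

plot_with_gaps <- function(f, from, to, n = 1000, jump = 50) {
  d <- tibble(x = seq(from, to, length.out = n), y = f(x))
  d$y[c(FALSE, abs(diff(d$y)) > jump)] <- NA  # break the line at big jumps
  ggplot(d, aes(x, y)) +
    geom_line() +
    coord_cartesian(ylim = c(-30, 30))
}

plot_with_gaps(function(x) tan(x), -2*pi, 2*pi)
plot_with_gaps(function(x) 1 / sin(x), -2*pi, 2*pi)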

How to create covariance matrix in R?

I'm trying to build a covariance matrix from scratch (i.e. what the cov() function returns). My task is to do it without using any package. Hence I created these functions:
meanf <- function(x){
  sum(x) / length(x)
}
sampleCov <- function(x, y){
  stopifnot(identical(length(x), length(y)))
  sum((x - meanf(x)) * (y - meanf(y))) / (length(x) - 1)
}
> sampleCov(winequality_red$quality, winequality_red$alcohol)
[1] 0.409789
Unfortunately, I'm stuck here. All the loops I've tried have missed the point. Of course it's possible to just call sampleCov by hand for every possible combination of columns, but that's not what I'm after.
If I understand you correctly, then I believe you want to recreate a covariance matrix like the one returned by the cov() function.
OP's given functions:
meanf <- function(x){
  sum(x) / length(x)
}
sampleCov <- function(x, y){
  stopifnot(identical(length(x), length(y)))
  sum((x - meanf(x)) * (y - meanf(y))) / (length(x) - 1)
}
You can try it this way; I have used the mtcars data here:
Covariance computation:
vars <- names(mtcars)
egrid <- expand.grid(vars, vars)
egrid <- data.frame(sapply(egrid, as.character), stringsAsFactors = F)
egrid <- egrid[order(egrid$Var1, egrid$Var2), ]
mat <- vector("list", nrow(egrid))
for(i in 1:nrow(egrid)){
  mat[[i]] <- sampleCov(mtcars[, egrid[i, "Var1"]], mtcars[, egrid[i, "Var2"]])
}
finaldat <- cbind(egrid, cov = do.call('rbind', mat))
finaldat_list <- split(finaldat, finaldat$Var1)
mat_form <- do.call('cbind', finaldat_list)
cov_values <- mat_form[, grepl("\\.cov", names(mat_form))]
col_values <- mat_form[, paste0(egrid$Var1[1], ".Var2")]
final_matrix_cov <- cbind(col_values, cov_values)
Sample Output:
> final_matrix_cov
col_values am.cov carb.cov cyl.cov disp.cov
9 mpg 1.80393145 -5.36310484 -9.1723790 -633.09721
20 cyl -0.46572581 1.52016129 3.1895161 199.66028
31 disp -36.56401210 79.06875000 199.6602823 15360.79983
42 hp -8.32056452 83.03629032 101.9314516 6721.15867
You need the matrix multiplication %*%.
sampleCov <- function(x, y){
  stopifnot(identical(length(x), length(y)))
  sum((x - mean(x)) %*% (y - mean(y))) / (length(x) - 1)
}
> sampleCov(rnorm(10000),rnorm(10000))
[1] 0.01808466
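For completeness, here is a more direct sketch: build the full matrix with two loops over the columns, reusing the OP's sampleCov(). For a numeric data frame this reproduces cov(df):
sampleCovMatrix <- function(df) {
  p <- ncol(df)
  out <- matrix(NA_real_, p, p, dimnames = list(names(df), names(df)))
  for (i in 1:p) {
    for (j in 1:p) {
      out[i, j] <- sampleCov(df[[i]], df[[j]])
    }
  }
  out
}

all.equal(sampleCovMatrix(mtcars), cov(mtcars))
#> [1] TRUE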
This is probably a little more than you need, but it should answer your question, and I think it is a nice illustration of the practical application of covariances, correlations, etc.
# load the packages and data
library(data.table)
library(ggplot2)
library(scales)  # provides percent() for the axis labels
link <- "https://raw.githubusercontent.com/DavZim/Efficient_Frontier/master/data/mult_assets.csv"
df <- data.table(read.csv(link))
# calculate the necessary values:
# I) expected returns for the two assets
er_x <- mean(df$x)
er_y <- mean(df$y)
# II) risk (standard deviation) as a risk measure
sd_x <- sd(df$x)
sd_y <- sd(df$y)
# III) covariance
cov_xy <- cov(df$x, df$y)
# create 1000 portfolio weights (omegas)
x_weights <- seq(from = 0, to = 1, length.out = 1000)
# create a data.table that contains the weights for the two assets
two_assets <- data.table(wx = x_weights,
                         wy = 1 - x_weights)
# calculate the expected returns and standard deviations for the 1000 possible portfolios
two_assets[, ':=' (er_p = wx * er_x + wy * er_y,
                   sd_p = sqrt(wx^2 * sd_x^2 +
                               wy^2 * sd_y^2 +
                               2 * wx * (1 - wx) * cov_xy))]
two_assets
# lastly plot the values
ggplot() +
  geom_point(data = two_assets, aes(x = sd_p, y = er_p, color = wx)) +
  geom_point(data = data.table(sd = c(sd_x, sd_y), mean = c(er_x, er_y)),
             aes(x = sd, y = mean), color = "red", size = 3, shape = 18) +
  # Miscellaneous Formatting
  theme_bw() + ggtitle("Possible Portfolios with Two Risky Assets") +
  xlab("Volatility") + ylab("Expected Returns") +
  scale_y_continuous(label = percent, limits = c(0, max(two_assets$er_p) * 1.2)) +
  scale_x_continuous(label = percent, limits = c(0, max(two_assets$sd_p) * 1.2)) +
  scale_color_continuous(name = expression(omega[x]), labels = percent)
See the link below for all details.
https://datashenanigan.wordpress.com/2016/05/24/a-gentle-introduction-to-finance-using-r-efficient-frontier-and-capm-part-1/
