I have found a few code examples online, but none of them solved my problem. I know that expr needs x, but I couldn't find a way to plot these two functions using the curve() function. I can plot them when the model has only one independent variable, but not when it has more than one. Here is the code:
n <- 50
x1 <- runif(n = n, min = 0, max = 1)
x2 <- rnorm(n = n, mean = -50, sd = 1)
x3 <- rnorm(n = n, mean = 50, sd = 8)
z <- 3 - 4.2 * x1 - x2 - x3
pr <- 1 / (1 + exp(-z))
y <- rbinom(n = n, size = 1, prob = pr)
y
y
# create dataframe
df <- data.frame(y = y, x1 = x1, x2 = x2, x3 = x3)
m <- glm(y ~ ., data = df, family = binomial(link = "logit"))
summary(m)
beta.hat <- m$coefficients
z.hat <- beta.hat[1] + beta.hat[2]*x1 + beta.hat[3]*x2 + beta.hat[4]*x3
curve(expr = exp(z) / (1 + exp(z)), xlim = c(0,1), ylab = expression(pi), n = 1000, lwd = 3, xlab = expression(z/hat(z)))
curve(expr = exp(z.hat)/(1 + exp(z.hat)), xlim = c(0,1), add = TRUE, col = "red", n = 1000)
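The closest I have come is the sketch below; it relies on the fact that curve() always evaluates expr in a variable literally named x, so the fitted linear predictor itself is used as the x-axis variable, with the fitted probabilities overlaid as points. I'm not sure this is the intended approach:
# Sketch: plot the inverse logit over the observed range of z.hat,
# letting curve()'s x play the role of the linear predictor.
curve(exp(x) / (1 + exp(x)), from = min(z.hat), to = max(z.hat),
      n = 1000, lwd = 3, xlab = expression(hat(z)), ylab = expression(pi))
points(z.hat, fitted(m), col = "red", pch = 16)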
I have been playing with the bayesQR package, and want to apply it to an application that calls for variable selection using LASSO. As far as I understand, this is possible in bayesQR, but I haven't been able to get any variables dropped. My toy example is below, where the extraneous variables (c and d) are dropped by glmnet, but not by bayesQR.
Is there something fundamental that I am missing? How can I get model4 below to eliminate variables?
library(data.table)
library(bayestestR)
library(bayesQR)
library(glmnet)
# Generate data
n = 250
seed = 22
noise_sd = 5
set.seed(seed)
dt = data.table(a = runif(n, min = 0, max = 10),
                b = runif(n, min = 0, max = 10),
                c = runif(n, min = 0, max = 10),
                d = rnorm(n, sd = .01 * noise_sd))
dt[, y := (a + rnorm(n, sd = 1 * noise_sd)) * 2 + (b + rnorm(n, sd = 2 * noise_sd)) + rnorm(n, sd = 2 * noise_sd)]
formula = y ~ a + b + c + d
# Just with GLM
model1 = glm(data = dt,
             formula = formula)
# LASSO with glmnet
x = as.matrix(dt[, .(a,b,c,d)])
y = dt$y
cv_model <- cv.glmnet(x, y, alpha = 1)
best_lambda <- cv_model$lambda.min
model2 <- glmnet(x, y, alpha = 1, lambda = best_lambda)
print(coef(model2))
# Quantile regression with bayesQR
model3 = bayesQR(data = dt,
                 formula = formula,
                 ndraw = 5000,
                 seed = seed)
# Quantile regression with bayesQR using adaptive lasso
model4 = bayesQR(data = dt,
                 formula = formula,
                 ndraw = 5000,
                 alasso = TRUE,
                 seed = seed)
message('GLM')
print(summary(model1))
message('glmnet with LASSO')
print(coef(model2))
message('bayesQR')
print(summary(model3))
message('bayesQR with ALASSO')
print(summary(model4))
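As far as I can tell, the adaptive lasso prior only shrinks the posterior draws toward zero and essentially never produces exact zeros, so perhaps "selection" has to be read off the posterior directly. Here is a sketch of that check; it assumes the betadraw and names components of the fitted bayesQR object, which I am reading from the package documentation:
draws <- model4[[1]]$betadraw                 # posterior draws, one column per coefficient (ignoring burn-in for brevity)
ci <- apply(draws, 2, quantile, probs = c(0.025, 0.975))
colnames(ci) <- model4[[1]]$names
print(ci)
drop <- ci[1, ] <= 0 & ci[2, ] >= 0           # TRUE where the 95% interval covers zero
print(drop)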
Kernel regression is a non-parametric technique for estimating the conditional expectation of a random variable. It uses local averaging of the response values, Y, to uncover a possibly non-linear relationship between X and Y.
I have used the bootstrap for kernel density estimation and now want to use it for kernel regression as well. I have been told to use residual bootstrapping for kernel regression and have read a couple of papers on it, but I am unsure how to perform it. The programming has been done in R using the FKSUM package. Here is my attempt at standard resampling (resampling pairs) for kernel regression:
library(FKSUM)
set.seed(1)
n <- 5000
sample.size <- 500
B.replications <- 200
x <- rbeta(n, 2, 2) * 10
y <- 3 * sin(2 * x) + 10 * (x > 5) * (x - 5)
y <- y + rnorm(n) + (rgamma(n, 2, 2) - 1) * (abs(x - 5) + 3)
#taking x.y to be the population
x.y <- data.frame(x, y)
xs <- seq(min(x), max(x), length = 1000)
ftrue <- 3 * sin(2 * xs) + 10 * (xs > 5) * (xs - 5)
#Sample from the population
seqx <- seq(1, 5000, by = 1)
sample.ind <- sample(seqx, size = sample.size, replace = FALSE)
sample.reg <- x.y[sample.ind, ]
x_s <- sample.reg$x
y_s <- sample.reg$y
fhat_loc_lin.pop <- fk_regression(x, y)
fhat_loc_lin.sample <- fk_regression(x = x_s, y = y_s)
plot(x, y, col = rgb(.7, .7, .7, .3), pch = 16, xlab = 'x',
     ylab = 'y', main = 'Local linear estimator with AMISE bandwidth')
lines(xs, ftrue, col = 2, lwd = 3)
lines(fhat_loc_lin.sample, lty = 2, lwd = 2)
#Bootstrap
n.B.sample <- sample.size  # bootstrap sample size
# One column of fitted values per replication (fk_regression evaluates
# the estimate on a 1000-point grid by default).
Temp.reg.y <- matrix(0, ncol = B.replications, nrow = 1000)
for (i in 1:B.replications) {
  sequence.x.boot <- seq(from = 1, to = n.B.sample, by = 1)
  sample.ind.boot <- sample(sequence.x.boot, size = sample.size, replace = TRUE)
  boot.reg.mat <- sample.reg[sample.ind.boot, ]
  boot.reg.mat.X <- boot.reg.mat$x
  boot.reg.mat.Y <- boot.reg.mat$y
  fhat_loc_lin.boot <- fk_regression(x = boot.reg.mat.X,
                                     y = boot.reg.mat.Y,
                                     h = fhat_loc_lin.sample$h)
  lines(y = fhat_loc_lin.boot$y, x = fhat_loc_lin.sample$x, col = i)
  Temp.reg.y[, i] <- fhat_loc_lin.boot$y
}
quan.reg.l <- vector()
quan.reg.u <- vector()
for (i in 1:length(xs)) {
  quan.reg.l[i] <- quantile(x = Temp.reg.y[i, ], probs = 0.025)
  quan.reg.u[i] <- quantile(x = Temp.reg.y[i, ], probs = 0.975)
}
# Lower bound
lines(y = quan.reg.l, x = fhat_loc_lin.sample$x, col = "red", lwd = 4, lty = 1)
# Upper bound
lines(y = quan.reg.u, x = fhat_loc_lin.sample$x, col = "navy", lwd = 4, lty = 1)
I'm asking the question here now since I haven't received any response on CV. Any help would be greatly appreciated!
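For completeness, here is my current reading of how the residual bootstrap would look in this setting, as a sketch; interpolating the fitted curve back to the sample points with approx() is my own assumption about how to get fitted values out of fk_regression():
# Residual bootstrap sketch: keep the design points fixed, resample
# centred residuals, and refit on y* = fitted value + resampled residual.
fit.at.x <- approx(fhat_loc_lin.sample$x, fhat_loc_lin.sample$y, xout = x_s)$y
res <- y_s - fit.at.x
res <- res - mean(res)                        # centre the residuals
for (i in 1:B.replications) {
  y.star <- fit.at.x + sample(res, size = sample.size, replace = TRUE)
  fhat.boot <- fk_regression(x = x_s, y = y.star, h = fhat_loc_lin.sample$h)
  Temp.reg.y[, i] <- fhat.boot$y
}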
I'm not sure how to tackle this: I have a data distribution where selecting data based on standard deviation does not capture all of the points I want (the data are more variable on one end than on the other). However, when plotting a density plot I can see that all the data outside the 8th blue ring are what I want to select.
Example code:
x <- sort(rnorm(1300, mean = 0, sd = 1))
y <- rnorm(1300, mean = 0, sd = 1)
x <- c(x, rnorm(300, mean = 4, sd = 2), rnorm(600, mean = -2, sd = 2))
y <- c(y, rnorm(300, mean = 3, sd = 4), rnorm(600, mean = -2, sd = 2))
mydata <- data.frame(x,y)
library(ggplot2)
ggplot(data = mydata, aes(x = x, y = y)) +
  geom_point(cex = 0.5) +
  geom_density_2d()
I adapted this from http://slowkow.com/notes/ggplot2-color-by-density/.
Under the hood, geom_density_2d uses the MASS::kde2d function, so we can also apply it to the underlying data to subset by density.
set.seed(42)
x <- sort(rnorm(1300, mean = 0, sd = 1))
y <- rnorm(1300, mean = 0, sd = 1)
x <- c(x, rnorm(300, mean = 4, sd = 2), rnorm(600, mean = -2, sd = 2))
y <- c(y, rnorm(300, mean = 3, sd = 4), rnorm(600, mean = -2, sd = 2))
mydata <- data.frame(x,y)
# Copied from http://slowkow.com/notes/ggplot2-color-by-density/
get_density <- function(x, y, n = 100) {
  dens <- MASS::kde2d(x = x, y = y, n = n)
  ix <- findInterval(x, dens$x)
  iy <- findInterval(y, dens$y)
  ii <- cbind(ix, iy)
  return(dens$z[ii])
}
mydata$density <- get_density(mydata$x, mydata$y)
Select points based on an arbitrary contour
EDIT: Changed to allow selection based on contour levels
# First create the plot with geom_density_2d
library(ggplot2)
gg <- ggplot(data = mydata, aes(x = x, y = y)) +
  geom_point(cex = 0.5) +
  geom_density_2d(size = 1, n = 100)
gg
# Extract levels denoted by contours by going into the
# ggplot build object. I found these coordinates by
# examining the object in RStudio; Note, the coordinates
# would change if the layer order were altered.
gb <- ggplot_build(gg)
contour_levels <- unique(gb[["data"]][[2]][["level"]])
# contour_levels
# [1] 0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08
# Add a layer that keeps only the points below the chosen contour level
library(dplyr)  # for %>% and filter()
gg2 <- gg +
  geom_point(data = mydata %>%
               filter(density <= contour_levels[1]),
             color = "red", size = 0.5)
gg2
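The same filter also gives the subset itself, rather than just a highlighted layer:
selected <- mydata %>% filter(density <= contour_levels[1])
nrow(selected)  # points lying outside the lowest-density contour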
I'm trying to create a graph in R to show the carrying capacity of a population, using an example given to me:
install.packages("deSolve", dependencies = TRUE)
clogistic <- function(times, y, parms){
  n <- y[1]
  r <- parms[1]
  alpha <- parms[2]
  dN.dt <- r * n * (1 - alpha * n)
  return(list(c(dN.dt)))
}
prms <- c(r = 1, alpha = 0.01)
init.N <- c(1)
t.s <- seq(0.1, 10, by = 0.1)
library(deSolve)
out <- ode(y = init.N, times = t.s, clogistic, parms = prms)
plot(out[,1], out[,2], type="l", xlab = "Time", ylab = "N", col = "blue", lwd = 2)
Now I'm using this to try to show a starting population of 178 with an increase of 21 over 15 time steps. But when I change the formula, the population decreases and bottoms out after one time step, then stays there for the remainder of the time.
I've tried changing init.N <- c(1) to c(178), which works, but the population still bottoms out. I've also tried changing prms <- c(r = 1, alpha = 0.01) to r = 21, both with and without the change in initial population, but it just doesn't increase. What am I missing? Knowing R, it's going to be something small, but I keep missing it.
Any help will be greatly appreciated.
This is the differential equation that is being integrated:
dN.dt <- r * n * (1 - alpha * n)
If you want an asymptote of n = 200, set alpha to 1/200 so that the rate of change goes to zero as n approaches 200:
prms <- c(r = 1, alpha = .005)
init.N <- 178
t.s <- seq(0.1, 10, by = 0.1)
library(deSolve)
out <- ode(y = init.N, times = t.s, clogistic, parms = prms)
plot(out[,1], out[,2], type="l", xlab = "Time", ylab = "N", col = "blue", lwd = 2)
With a starting value of 178, the rate of change will be negative when alpha is greater than 1/178, zero (a flat line) when alpha equals 1/178, and the trajectory will follow logistic growth when alpha is less than 1/178.
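A quick way to see all three regimes side by side (a sketch using the objects defined above):
out.dec <- ode(y = 178, times = t.s, clogistic, parms = c(r = 1, alpha = 1/150))
out.flat <- ode(y = 178, times = t.s, clogistic, parms = c(r = 1, alpha = 1/178))
out.grow <- ode(y = 178, times = t.s, clogistic, parms = c(r = 1, alpha = 1/200))
matplot(t.s, cbind(out.dec[, 2], out.flat[, 2], out.grow[, 2]), type = "l",
        lty = 1, lwd = 2, col = c("red", "black", "blue"),
        xlab = "Time", ylab = "N")
legend("bottomright", legend = c("alpha = 1/150", "alpha = 1/178", "alpha = 1/200"),
       col = c("red", "black", "blue"), lty = 1, lwd = 2)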
To go from 300 to 200 you would keep alpha = 1/200 and start at 300:
prms <- c(r = 1, alpha = 1/200)
init.N <- c(300)
t.s <- seq(0.1, 10, by = 0.1)
out <- ode(y = init.N, times = t.s, clogistic, parms = prms)
plot(out[,1], out[,2], type="l", xlab = "Time", ylab = "N", col = "blue", lwd = 2)
Using car::scatter3d(), I am trying to create a 3D figure with a regression surface showing an interaction between a categorical and a continuous variable. Partly following the code here, I obtained the figure below.
The figure is obviously wrong in that the regression surface does not reach one of the values of the categorical variable. The problem probably lies in my use of rgl::persp3d() (the last block of the code below), but I have not been able to identify what exactly I'm doing wrong. Could someone let me know what I'm missing and how to fix the problem?
library(rgl)
library(car)
n <- 100
set.seed(1)
x <- runif(n, 0, 10)
set.seed(1)
z <- sample(c(0, 1), n, replace = TRUE)
set.seed(1)
y <- 0.5 * x + 0.1 * z + 0.3 * x * z + rnorm(n, sd = 1.5)
d <- data.frame(x, z, y)
scatter3d(y ~ x + z, data = d,
          xlab = "continuous", zlab = "categorical", ylab = "outcome",
          residuals = FALSE, surface = FALSE)
d2 <- d
d2$x <- d$x / (max(d$x) - min(d$x))
d2$y <- d$y / (max(d$y) - min(d$y))
mod <- lm(y ~ x * z, data = d2)
grd <- expand.grid(x = unique(d2$x), z = unique(d2$z))
grd$pred <- predict(mod, newdata = grd)
grd <- grd[order(grd$z, grd$x), ]
# The problem is likely to lie somewhere below.
persp3d(x = unique(grd$x), y = unique(grd$z),
        z = matrix(grd$pred, length(unique(grd$z)), length(unique(grd$x))),
        alpha = 0.5,
        col = "blue",
        add = TRUE,
        xlab = "", ylab = "", zlab = "")
I'd prefer to stick with car::scatter3d() for the original graph because I have already made several figures with car::scatter3d() and want this one to be consistent with them.
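One thing I have been meaning to try (a sketch, assuming the matrix orientation is the culprit): persp3d() expects z to be an nx-by-ny matrix whose rows follow the sorted x values and whose columns follow the sorted y values, so building the grid from sorted unique values and filling the matrix to match might fix the surface.
xs <- sort(unique(d2$x))
zs <- sort(unique(d2$z))
grd2 <- expand.grid(x = xs, z = zs)   # x varies fastest, matching column-major fill
grd2$pred <- predict(mod, newdata = grd2)
persp3d(x = xs, y = zs,
        z = matrix(grd2$pred, nrow = length(xs), ncol = length(zs)),
        alpha = 0.5, col = "blue", add = TRUE,
        xlab = "", ylab = "", zlab = "")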