I simulated the following data:
library(lavaan)
library(lavaanPlot)
set.seed(2002)
#simulate predictor variables
pred1<- c(1:60)
pred2<- rnorm(60, mean=100, sd=10)
pred3 <- .05 + .05*pred1 + rnorm(length(pred1),1,.5)
#simulate response variables
resp <- 350 -2*pred1 -50*pred3 + rnorm(length(pred1),50,50)
#create df
df <- cbind(resp, pred1, pred3, pred2)
Developed the following model:
#sem model
model <-
'pred2 ~ pred1
resp ~ pred1
resp~ pred3
pred3 ~ pred1'
Fit the model:
# fit model
fit <- sem(model, data = df)
summary(fit,rsq = T, fit.measures = TRUE, standardized = TRUE)
Using the lavaanPlot function I get a floating bubble in the right corner. I would like to know what it means, why it appears and how to remove it from the output diagram.
lavaanPlot(name = "MODEL1", fit, labels = df, coefs = TRUE)
Related
I want to evaluate in terms of MSE, AIC, and Adjusted R squared, various Principal Component models based on different correlation coefficients (e.g., Pearson, Kendall) in R. I have created the following function however I can't find the way to "force" the function to perform principal component regression based on the given correlation matrix cor1 and cor2. Therefore, I end up with the exact same results. Can someone help me?
library(caret)
set.seed(123)
df <- data.frame(Y = rnorm(100), X1 = rnorm(100), X2 = rnorm(100), X3 = rnorm(100), X4 = rnorm(100), X5 = rnorm(100))
X <- df[,-1]
Y <- df[,1]
# compute Pearson's and Kendall's correlation matrices
cor1 <- cor(X, method = "pearson")
cor2 <- cor(X, method = "kendall")
# define function to compute PCA with cross-validation and return MSE, AIC, and adjusted R-squared
pca_cv_mse_aic_r2 <- function(X, Y, cor_mat, ncomp, nfolds) {
# create empty vectors to store results
mse <- rep(0, ncomp)
aic <- rep(0, ncomp)
adj_r2 <- rep(0, ncomp)
# loop over the number of components
for (i in 1:ncomp) {
# perform PCA with cross-validation
pca <- caret::train(X, Y, method = "pcr", preProc = c("center", "scale"),
tuneLength = nfolds, trControl = trainControl(method = "cv", number = nfolds),
tuneGrid = data.frame(ncomp = i))
# compute MSE, AIC, and adjusted R-squared
pred <- predict(pca, newdata = X)
mse[i] <- mean((pred - Y)^2)
aic[i] <- AIC(lm(Y ~ pred + 1))
adj_r2[i] <- summary(lm(Y ~ pred))$adj.r.squared
}
# return a list of results
return(list(mse = mse, aic = aic, adj_r2 = adj_r2))
}
# compute the MSE, AIC, and adjusted R-squared of PCA models with different correlation matrices and numbers of components
results1 <- pca_cv_mse_aic_r2(X, Y, cor1, 5, 10)
results2 <- pca_cv_mse_aic_r2(X, Y, cor2, 5, 10)
If I have a gbm poisson regression model as follows:
# My data
set.seed(0)
df <- data.frame(count = rpois(100,1),
pred1 = rnorm(100, 10, 1),
pred2 = rnorm(100, 0, 1),
pred3 = rnorm(100, 0, 1))
# My Split
split <- initial_split(df)
# My model
library(gbm)
m <- gbm(
formula = count ~ .,
distribution ="poisson",
data = training(split))
And I make a prediction:
# My prediction
p <- predict(m,
n.trees=m$n.trees,
testing(split),
type="response")
I'd like to generate some confidence intervals around the values of p. I cannot seem to find a way of doing this when I use m to predict on the test data set or a new dataset (where the predictor variables have identical underlying distributions).
I would like to be able to do a forward stepwise linear regression, but constrain the number of predictor variables to a maximum (in my specific case, three). Here is some sample data.
set.seed(123)
myDep <- runif(100)
pred1 <- myDep + runif(100)
pred2 <- myDep + rnorm(100)
pred3 <- myDep + runif(100) + rnorm(100)
pred4 <- myDep + runif(100) + runif(100)
pred5 <- runif(100)
myDF <- data.frame(myDep, pred1, pred2, pred3, pred4, pred5)
If I were to simply run a linear regression using the following code below, I would get all five predictor variables, obviously.
myModel <- lm(myDep ~ ., data = myDF)
What I would like to do it use step() or other R command to run a forward-direction stepwise that picks only three predictor variables and then stops.
For what it is worth, I tried this:
step(lm(myDep ~ ., data = myDF), steps = 3, direction = "forward")
and the results were the following -- but not what I want because it uses all five predictor variables.
Start: AIC=-378.09
myDep ~ pred1 + pred2 + pred3 + pred4 + pred5
Call:
lm(formula = myDep ~ pred1 + pred2 + pred3 + pred4 + pred5, data = myDF)
Coefficients:
(Intercept) pred1 pred2 pred3 pred4 pred5
-0.16617 0.30043 0.07983 0.03670 0.17869 0.01606
I'm sure there's a way to do this, but I cannot seem to figure out the proper formatting. Thanks in advance.
You could use the regsubsets package in R, where you can limit the variables and choose your method ("forward").
https://www.rdocumentation.org/packages/leaps/versions/2.1-1/topics/regsubsets
library(regsubsets)
b <- regsubsets(myDep ~ ., data=myDF, nbest=1, nvmax=[enter your max # of predictors])
summary(b)
I have been trying to fit a lasso model using cv.glmnet. I tried to implement four different models (3 using cv.glmnet and 1 using caret::train) based on standardization. All the four models give very different coefficient estimates which I can't figure out why.
Here is a fully reproducible code:
library("glmnet")
data(iris)
iris <- iris
dat <- iris[iris$Species %in% c("setosa","versicolor"),]
X <- as.matrix(dat[,1:4])
Y <- as.factor(as.character(dat$Species))
set.seed(123)
model1 <- cv.glmnet(x = X,
y = Y,
family = "binomial",
standardize = FALSE,
alpha = 1,
lambda = rev(seq(0,1,length=100)),
nfolds = 3)
set.seed(123)
model2 <- cv.glmnet(x = scale(X, center = T, scale = T),
y = Y,
family = "binomial",
standardize = FALSE,
alpha = 1,
lambda = rev(seq(0,1,length=100)),
nfolds = 3)
set.seed(123)
model3 <- cv.glmnet(x = X,
y = Y,
family = "binomial",
standardize = TRUE,
alpha = 1,
lambda = rev(seq(0,1,length=100)),
nfolds = 3)
##Using caret
library("caret")
lambda.grid <- rev(seq(0,1,length=100)) #set of lambda values for cross-validation
alpha.grid <- 1 #alpha
trainControl <- trainControl(method ="cv",
number=3) #3-fold cross-validation
tuneGrid <- expand.grid(.alpha=alpha.grid, .lambda=lambda.grid) #these are tuning parameters to be passed into the train function below
set.seed(123)
model4 <- train(x = X,
y = Y,
method="glmnet",
family="binomial",
standardize = FALSE,
trControl = trainControl,
tuneGrid = tuneGrid)
c1 <- coef(model1, s=model1$lambda.min)
c2 <- coef(model2, s=model2$lambda.min)
c3 <- coef(model3, s=model3$lambda.min)
c4 <- coef(model4$finalModel, s=model4$finalModel$lambdaOpt)
c1 <- as.matrix(c1)
c2 <- as.matrix(c2)
c3 <- as.matrix(c3)
c4 <- as.matrix(c4)
model2 scales the independent variables (vector X) beforehand and model3 does so by setting standardize = TRUE. So atleast these two models should return identical results - but it is not so.
The lambda.min obtained from the four models are:
model1 = 0
model2 = 0
model3 = 0
model4 = 0.6565657
The coefficient estimates between the models differ drastically too. Why would this be occurring?
Actually there is a little different between scale(x) & standardize = FALSE and x & standardize = TRUE. We need to multiple (N-1)/N.
See here.
If we use gaussian family,
library(glmnet)
X <- matrix(runif(100, 0, 1), ncol=2)
y <- 1 -2*X[,1] + X[,2]
enet <- glmnet(X, y, lambda=0.1,standardize = T,family="gaussian")
coefficients(enet)
coef <- coefficients(enet)
coef[2]*sd(X[,1])/sd(y) #standardized coef
#[1] -0.6895065
enet1 <- glmnet(scale(X)/99*100, y/(99/100*sd(y)),lambda=0.1/(99/100*sd(y)),standardize = F,family="gaussian")
coefficients(enet1)[2]
#[1] -0.6894995
If we use binomial family,
data(iris)
iris <- iris
dat <- iris[iris$Species %in% c("setosa","versicolor"),]
X <- as.matrix(dat[,1:4])
Y <- as.factor(as.character(dat$Species))
set.seed(123)
model1 <- cv.glmnet(x = X,
y = Y,
family = "binomial",
standardize = T,
alpha = 1,
lambda = rev(seq(0,1,length=100)),
nfolds = 3)
coefficients(model1,s=0.03)[3]*sd(X[,2])
#[1] -0.3374946
set.seed(123)
model3 <- cv.glmnet(x = scale(X)/99*100,
y = Y,
family = "binomial",
standardize = F,
alpha = 1,
lambda = rev(seq(0,1,length=100)),
nfolds = 3)
coefficients(model3,s=0.03)[3]
#[1] -0.3355027
These results are nearly the same. Hope it is not too late for this answer.
In R predict.lm computes predictions based on the results from linear regression and also offers to compute confidence intervals for these predictions. According to the manual, these intervals are based on the error variance of fitting, but not on the error intervals of the coefficient.
On the other hand predict.glm which computes predictions based on logistic and Poisson regression (amongst a few others) doesn't have an option for confidence intervals. And I even have a hard time imagining how such confidence intervals could be computed to provide a meaningful insight for Poisson and logistic regression.
Are there cases in which it is meaningful to provide confidence intervals for such predictions? How can they be interpreted? And what are the assumptions in these cases?
The usual way is to compute a confidence interval on the scale of the linear predictor, where things will be more normal (Gaussian) and then apply the inverse of the link function to map the confidence interval from the linear predictor scale to the response scale.
To do this you need two things;
call predict() with type = "link", and
call predict() with se.fit = TRUE.
The first produces predictions on the scale of the linear predictor, the second returns the standard errors of the predictions. In pseudo code
## foo <- mtcars[,c("mpg","vs")]; names(foo) <- c("x","y") ## Working example data
mod <- glm(y ~ x, data = foo, family = binomial)
preddata <- with(foo, data.frame(x = seq(min(x), max(x), length = 100)))
preds <- predict(mod, newdata = preddata, type = "link", se.fit = TRUE)
preds is then a list with components fit and se.fit.
The confidence interval on the linear predictor is then
critval <- 1.96 ## approx 95% CI
upr <- preds$fit + (critval * preds$se.fit)
lwr <- preds$fit - (critval * preds$se.fit)
fit <- preds$fit
critval is chosen from a t or z (normal) distribution as required (I forget exactly now which to use for which type of GLM and what the properties are) with the coverage required. The 1.96 is the value of the Gaussian distribution giving 95% coverage:
> qnorm(0.975) ## 0.975 as this is upper tail, 2.5% also in lower tail
[1] 1.959964
Now for fit, upr and lwr we need to apply the inverse of the link function to them.
fit2 <- mod$family$linkinv(fit)
upr2 <- mod$family$linkinv(upr)
lwr2 <- mod$family$linkinv(lwr)
Now you can plot all three and the data.
preddata$lwr <- lwr2
preddata$upr <- upr2
ggplot(data=foo, mapping=aes(x=x,y=y)) + geom_point() +
stat_smooth(method="glm", method.args=list(family=binomial)) +
geom_line(data=preddata, mapping=aes(x=x, y=upr), col="red") +
geom_line(data=preddata, mapping=aes(x=x, y=lwr), col="red")
I stumbled upon Liu WenSui's method that uses bootstrap or simulation approach to solve that problem for Poisson estimates.
Example from the Author
pkgs <- c('doParallel', 'foreach')
lapply(pkgs, require, character.only = T)
registerDoParallel(cores = 4)
data(AutoCollision, package = "insuranceData")
df <- rbind(AutoCollision, AutoCollision)
mdl <- glm(Claim_Count ~ Age + Vehicle_Use, data = df, family = poisson(link = "log"))
new_fake <- df[1:5, 1:2]
boot_pi <- function(model, pdata, n, p) {
odata <- model$data
lp <- (1 - p) / 2
up <- 1 - lp
set.seed(2016)
seeds <- round(runif(n, 1, 1000), 0)
boot_y <- foreach(i = 1:n, .combine = rbind) %dopar% {
set.seed(seeds[i])
bdata <- odata[sample(seq(nrow(odata)), size = nrow(odata), replace = TRUE), ]
bpred <- predict(update(model, data = bdata), type = "response", newdata = pdata)
rpois(length(bpred), lambda = bpred)
}
boot_ci <- t(apply(boot_y, 2, quantile, c(lp, up)))
return(data.frame(pred = predict(model, newdata = pdata, type = "response"), lower = boot_ci[, 1], upper = boot_ci[, 2]))
}
boot_pi(mdl, new_fake, 1000, 0.95)
sim_pi <- function(model, pdata, n, p) {
odata <- model$data
yhat <- predict(model, type = "response")
lp <- (1 - p) / 2
up <- 1 - lp
set.seed(2016)
seeds <- round(runif(n, 1, 1000), 0)
sim_y <- foreach(i = 1:n, .combine = rbind) %dopar% {
set.seed(seeds[i])
sim_y <- rpois(length(yhat), lambda = yhat)
sdata <- data.frame(y = sim_y, odata[names(model$x)])
refit <- glm(y ~ ., data = sdata, family = poisson)
bpred <- predict(refit, type = "response", newdata = pdata)
rpois(length(bpred),lambda = bpred)
}
sim_ci <- t(apply(sim_y, 2, quantile, c(lp, up)))
return(data.frame(pred = predict(model, newdata = pdata, type = "response"), lower = sim_ci[, 1], upper = sim_ci[, 2]))
}
sim_pi(mdl, new_fake, 1000, 0.95)