I am trying to use to predict function to predict 100 points new points. I have a data.frame with one vector that is 100 doubls long.
I am trying the predict function: predict(model, newdata=mydat)
The function only returns a vector of length four.
This could be due to the fact that the model was made only with four points, but I am unsure.
EDIT:
Creation of mydat
mydat <- data.frame(V1 = seq(0, max(myExperimentSummary$V1), length.out = 100))
The model I am using
model
#Nonlinear regression model
# model: mean ~ (1/(1 + exp(-b * (V1 - c))))
# data: myExperimentSummary
# b c
#-0.6721 3.2120
# residual sum-of-squares: 0.04395
#
#Number of iterations to convergence: 1
#Achieved convergence tolerance: 5.204e-06
EDIT2: Fixing the typos
EDIT3:
fitcoef = nlsLM(mean~(a/(1+exp(-b*(V5-c)))), data = myExperimentSummary,
start=c(a=1,b=.1,c=25))
fitmodel = nls(mean~(1/(1+exp(-b*(V1-c)))), data = myExperimentSummary,
start=coef(fitcoef))
mydat <- data.frame(V1 = seq(0, max(myExperimentSummary$V1), length.out = 100))
predict(fitmodel, mydat)
If your data are still as in your previous question:
dat <- read.table(text = " V1 N mean
0.1 9 0.9
1 9 0.8
10 9 0.1
5 9 0.2",
header = TRUE)
model <- nls(mean ~ -a/(1 + exp(-b * (V1-o))), data = dat,
start=list(a=-1.452, b=-0.451, o=1.292))
Then I can not reproduce your problem:
mydat <- data.frame(V1 = seq(0, max(dat$V1), length.out = 100))
y <- predict(model, mydat)
length(y)
# [1] 100
Related
sample_size <- 200
sample_meanvector <- c(3, 4)
sample_covariance_matrix <- matrix(c(2, 1, 1, 2),
ncol = 2)
# create bivariate normal distribution
sample_distribution <- mvrnorm(n = sample_size,
mu = sample_meanvector,
Sigma = sample_covariance_matrix)
#Convert the datatype
df_sample_distribution <- as.data.frame(sample_distribution)
df_sample_distribution$Y <- (1 + df_sample_distribution$V1*2 + df_sample_distribution$V2 + rnorm(200,0,1))
colnames(df_sample_distribution)[1] <- "X1"
colnames(df_sample_distribution)[2] <- "X2"
Code above is the one I use to generate a bivariate normal distribution vectors and code below is the code to run regression over the generated data.
Test2 <- lm( Y ~ X1, data = df_sample_distribution)
#to extract only specific coefficients
summary(Test)$coefficients[2,1]
My question is whether there is a way such that I can regenerate data and run regression over it for 200 times and save all the outputs in a list. Here is the pseudo code in my head.
for (){
#generate data
for ()
{
#extract coeffiients and insert them in a list
}
}
In simple terms,
step 1: create data
step 2: run regression over it
step 3: get the coefficient (hopefully save them in a list)
I am looking for code that can loop through step 1 to 3 for 200 times and save everything results. Any ideas or inspirations are welcomed. Thank you guys in advance.
Just wrap your code into a for-loop like your pseudo code:
library(MASS)
iterations <- 10 # In your example this should be 200
sample_size <- 200
sample_meanvector <- c(3, 4)
sample_covariance_matrix <- matrix(c(2, 1, 1, 2),
ncol = 2)
# create output data.frame
df_output <- data.frame(iteration = integer(0), coeff = double(0))
# loop over data generation and regression
for (i in seq_len(iterations)) {
sample_distribution <- mvrnorm(n = sample_size,
mu = sample_meanvector,
Sigma = sample_covariance_matrix)
#Convert the datatype
df_sample_distribution <- as.data.frame(sample_distribution)
df_sample_distribution$Y <- (1 + df_sample_distribution$V1*2 + df_sample_distribution$V2 + rnorm(200,0,1))
colnames(df_sample_distribution)[1] <- "X1"
colnames(df_sample_distribution)[2] <- "X2"
df_output[i, 1] <- i
df_output[i, 2] <- summary(lm( Y ~ X1, data = df_sample_distribution))$coefficients[2,1]
}
This returns df_output containing coefficients for each iteration:
iteration coeff
1 1 2.647886
2 2 2.274654
3 3 2.447453
4 4 2.451471
5 5 2.568877
6 6 2.428295
7 7 2.440396
8 8 2.478357
9 9 2.477211
10 10 2.367012
When I run a Tweedie GLM, one can get the prediction from the link by doing exp(link). To get the prediction for a Tweedie GLM, I get the prediction from the link by doing exp(link)/2. I don't understand why I need to divide by 2.
Minimal reproducible example below, inspired from the tweedie regression demo at https://github.com/dmlc/xgboost/blob/master/R-package/demo/tweedie_regression.R
library(xgboost)
library(data.table)
library(cplm) # for insurance data
library(statmod) # for tweedie glm
data(AutoClaim)
# auto insurance dataset analyzed by Yip and Yau (2005)
dt <- data.table(AutoClaim)
# exclude these columns from the model matrix
exclude <- c('POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_AMT5', 'CLM_FLAG', 'IN_YY')
# retains the missing values
# NOTE: this dataset is comes ready out of the box
options(na.action = 'na.pass')
x <- sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = F])
options(na.action = 'na.omit')
# response
y <- dt[, CLM_AMT5]
d_train <- xgb.DMatrix(data = x, label = y, missing = NA)
# the tweedie_variance_power parameter determines the shape of
# distribution
# - closer to 1 is more poisson like and the mass
# is more concentrated near zero
# - closer to 2 is more gamma like and the mass spreads to the
# the right with less concentration near zero
params <- list(
objective = 'reg:tweedie',
eval_metric = 'rmse',
tweedie_variance_power = 1.4,
max_depth = 2,
eta = 1)
set.seed(42)
bst <- xgb.train(
data = d_train,
params = params,
maximize = FALSE,
watchlist = list(train = d_train),
nrounds = 3)
xgb.plot.tree(model = bst)
# Manually extract the values for the first record :
x[1,]
# travtime < 102, bluebook <61645 -->tree #1 value= 2.49922585
# revolkedyes < -9.53674316e-07, npolicy < 5.5 --> tree #2 value= 2.48586464
# REVOLKEDYes < -9.53674316e-07, areaurban > -9.53674316e-07 --> tree #2 vakye = 2.36028123
link_gbm <- 2.49922585 +2.48586464+ 2.36028123
link_gbm # 7.345372
# Take exp(link_gbm), divide by 2
exp(link_gbm ) / 2 # 774.5053
# Compare with getting prediction directly from GBM.
predict(bst, d_train)[1] # 774.5053
# Let's do the same with a GLM:
dt2 <- dt[, -exclude, with = F]
dt2$CLM_AMT5 <- dt$CLM_AMT5
tweedie_fit <-
glm(CLM_AMT5 ~ .,
family=tweedie(var.power=1.4, link.power=0),
data = dt2)
summary(tweedie_fit)
# Manually get the link value for the first record
dt2[1,]
link_glm <- tweedie_fit$coefficients["(Intercept)"] +
14 * tweedie_fit$coefficients["TRAVTIME"] +
14230 * tweedie_fit$coefficients["BLUEBOOK"] +
11 * tweedie_fit$coefficients["RETAINED"] +
1 * tweedie_fit$coefficients["NPOLICY"] +
1 * tweedie_fit$coefficients["CAR_TYPESedan"] +
1 * tweedie_fit$coefficients["RED_CARyes"] +
3 * tweedie_fit$coefficients["MVR_PTS"] +
60 * tweedie_fit$coefficients["AGE"] +
11 * tweedie_fit$coefficients["YOJ"] +
67349 * tweedie_fit$coefficients["INCOME"] +
1 * tweedie_fit$coefficients["GENDERM"] +
1 * tweedie_fit$coefficients["JOBCLASSProfessional"] +
1 * tweedie_fit$coefficients["MAX_EDUCPhD"] +
18 * tweedie_fit$coefficients["SAMEHOME"] +
1 * tweedie_fit$coefficients["AREAUrban"]
link_glm # 8.299899
# prediction is exp(link_glm)
exp(link_glm) # 4023.466
# compare with link and prediction from glm ... yes, it's identical
predict(tweedie_fit, type="link")[1]
predict(tweedie_fit, type="response")[1] # 4023.466
Is there way to get predict behavior with standard errors from lfe::felm if the fixed effects are swept out using the projection method in felm? This question is very similar to the question here, but none of the answers to that question can be used to estimate standard errors or confidence/prediction intervals. I know that there's currently no predict.felm, but I am wondering if there are workarounds similar to those linked above that might also work for estimating the prediction interval
library(DAAG)
library(lfe)
model1 <- lm(data = cps1, re74 ~ age + nodeg + marr)
predict(model1, newdata = data.frame(age=40, nodeg = 0, marr=1), se.fit = T, interval="prediction")$fit
# Result: fit lwr upr
# 1 18436.18 2339.335 34533.03
model2 <- felm(data = cps1, re74 ~ age | nodeg + marr)
predict(model2, newdata = data.frame(age=40, nodeg = 0, marr=1), se.fit = T, interval="prediction")$fit
# Does not work
The goal is to estimate a prediction interval for yhat, for which I think I'd need to compute the full variance-covariance matrix (including the fixed effects). I haven't been able to figure out how to do this, and I'm wondering if it's even computationally feasible.
After conversations with several people, I don't believe it is possible to obtain an estimate the distribution of yhat=Xb (where X includes both the covariates and the fixed effects) directly from felm, which is what this question boils down to. It is possible bootstrap them, however. The following code does so in parallel. There is scope for performance improvements, but this gives the general idea.
Note: here I do not compute full prediction interval, just the SEs on Xb, but obtaining the prediction interval is straightforward - just add the root of sigma^2 to the SE.
library(DAAG)
library(lfe)
library(parallel)
model1 <- lm(data = cps1, re74 ~ age + nodeg + marr)
yhat_lm <- predict(model1, newdata = data.frame(age=40, nodeg = 0, marr=1), se.fit = T)
set.seed(42)
boot_yhat <- function(b) {
print(b)
n <- nrow(cps1)
boot <- cps1[sample(1:n, n, replace=T),]
lm.model <- lm(data=demeanlist(boot[, c("re74", "age")], list(factor(boot$nodeg), factor(boot$marr))),
formula = re74 ~ age)
fe <- getfe(felm(data = boot, re74 ~ age | nodeg + marr))
bootResult <- predict(lm.model, newdata = data.frame(age = 40)) +
fe$effect[fe$fe == "nodeg" & fe$idx==0] +
fe$effect[fe$fe == "marr" & fe$idx==1]
return(bootResult)
}
B = 1000
yhats_boot <- mclapply(1:B, boot_yhat)
plot(density(rnorm(10000, mean=yhat_lm$fit, sd=yhat_lm$se.fit)))
lines(density(yhats), col="red")
From your first model predict(.) yields this:
# fit lwr upr
# 1 18436.18 2339.335 34533.03
Following 李哲源 we can achieve these results manually, too.
beta.hat.1 <- coef(model1) # save coefficients
# model matrix: age=40, nodeg = 0, marr=1:
X.1 <- cbind(1, matrix(c(40, 0, 1), ncol=3))
pred.1 <- as.numeric(X.1 %*% beta.hat.1) # prediction
V.1 <- vcov(model1) # save var-cov matrix
se2.1 <- unname(rowSums((X.1 %*% V.1) * X.1)) # prediction var
alpha.1 <- qt((1-0.95)/2, df = model1$df.residual) # 5 % level
pred.1 + c(alpha.1, -alpha.1) * sqrt(se2.1) # 95%-CI
# [1] 18258.18 18614.18
sigma2.1 <- sum(model1$residuals ^ 2) / model1$df.residual # sigma.sq
PI.1 <- pred.1 + c(alpha.1, -alpha.1) * sqrt(se2.1 + sigma2.1) # prediction interval
matrix(c(pred.1, PI.1), nrow = 1, dimnames = list(1, c("fit", "lwr", "upr")))
# fit lwr upr
# 1 18436.18 2339.335 34533.03
Now, your linked example applied to multiple FE, we get this results:
lm.model <- lm(data=demeanlist(cps1[, c(8, 2)],
list(as.factor(cps1$nodeg),
as.factor(cps1$marr))), re74 ~ age)
fe <- getfe(model2)
predict(lm.model, newdata = data.frame(age = 40)) + fe$effect[fe$idx=="1"]
# [1] 15091.75 10115.21
The first value is with and the second without added FE (try fe$effect[fe$idx=="1"]).
Now we're following the manual approach above.
beta.hat <- coef(model2) # coefficient
x <- 40 # age = 40
pred <- as.numeric(x %*% beta.hat) # prediction
V <- model2$vcv # var/cov
se2 <- unname(rowSums((x %*% V) * x)) # prediction var
alpha <- qt((1-0.95)/2, df = model2$df.residual) # 5% level
pred + c(alpha, -alpha) * sqrt(se2) # CI
# [1] 9599.733 10630.697
sigma2 <- sum(model2$residuals ^ 2) / model2$df.residual # sigma^2
PI <- pred + c(alpha, -alpha) * sqrt(se2 + sigma2) # PI
matrix(c(pred, PI), nrow = 1, dimnames = list(1, c("fit", "lwr", "upr"))) # output
# fit lwr upr
# 1 10115.21 -5988.898 26219.33
As we see, the fit is the same as the linked example approach, but now with prediction interval. (Disclaimer: The logic of the approach should be straightforward, the values of the PI should still be evaluated, e.g. in Stata with reghdfe.)
Edit: In case you want to achieve exactly the same output from felm() which predict.lm() yields with the linear model1, you simply need to "include" again the fixed effects in your model (see model3 below). Just follow the same approach then. For more convenience you easily could wrap it into a function.
library(DAAG)
library(lfe)
model3 <- felm(data = cps1, re74 ~ age + nodeg + marr)
pv <- c(40, 0, 1) # prediction x-values
predict0.felm <- function(mod, pv.=pv) {
beta.hat <- coef(mod) # coefficient
x <- cbind(1, matrix(pv., ncol=3)) # prediction vector
pred <- as.numeric(x %*% beta.hat) # prediction
V <- mod[['vcv'] ] # var/cov
se2 <- unname(rowSums((x %*% V) * x)) # prediction var
alpha <- qt((1-0.95)/2, df = mod[['df.residual']]) # 5% level
CI <- structure(pred + c(alpha, -alpha) * sqrt(se2),
names=c("CI lwr", "CI upr")) # CI
sigma2 <- sum(mod[['residuals']] ^ 2) / mod[['df.residual']] # sigma^2
PI <- pred + c(alpha, -alpha) * sqrt(se2 + sigma2) # PI
mx <- matrix(c(pred, PI), nrow = 1,
dimnames = list(1, c("PI fit", "PI lwr", "PI upr"))) # output
list(CI, mx)
}
predict0.felm(model3)[[2]]
# PI fit PI lwr PI upr
# 1 18436.18 2339.335 34533.03
By this with felm() you can achieve the same prediction interval as with predict.lm().
The nls function works normally like the following:
x <- 1:10
y <- 2*x + 3 # perfect fit
yeps <- y + rnorm(length(y), sd = 0.01) # added noise
nls(yeps ~ a + b*x, start = list(a = 0.12345, b = 0.54321))#
Because the model I use have a lot of parameters or I don't know beforehand what will be included in the parameter list, I want something like following
tmp <- function(x,p) { p["a"]+p["b"]*x }
p0 <- c(a = 0.12345, b = 0.54321)
nls(yeps ~ tmp(x,p), start = list(p=p0))
Does anyone know how to modify the nls function so that it can accept a parameter vector argument in the formula instead of many seperate parameters?
You can give a vector of init coefficients like this :
tmp <- function(x, coef){
a <- coef[1]
b <- coef[2]
a +b*x
}
x <- 1:10
yeps <- y + rnorm(length(y), sd = 0.01) # added noise
nls(yeps ~ a + b*x, start = list(a = 0.12345, b = 0.54321))#
nls(yeps ~ tmp(x,coef), start = list(coef = c(0.12345, 0.54321)))
Nonlinear regression model
model: yeps ~ tmp(x, coef)
data: parent.frame()
coef1 coef2
3 2
residual sum-of-squares: 0.0016
Number of iterations to convergence: 2
Achieved convergence tolerance: 3.47e-08
PS:
example(nls)
Should be a good start to understand how to play with nls.
Is there a single function, similar to "runif", "rnorm" and the like which will produce simulated predictions for a linear model? I can code it on my own, but the code is ugly and I assume that this is something someone has done before.
slope = 1.5
intercept = 0
x = as.numeric(1:10)
e = rnorm(10, mean=0, sd = 1)
y = slope * x + intercept + e
fit = lm(y ~ x, data = df)
newX = data.frame(x = as.numeric(11:15))
What I'm interested in is a function that looks like the line below:
sims = rlm(1000, fit, newX)
That function would return 1000 simulations of y values, based on the new x variables.
Showing that Gavin Simpson's suggestion of modifying stats:::simulate.lm is a viable one.
## Modify stats:::simulate.lm by inserting some tracing code immediately
## following the line that reads "ftd <- fitted(object)"
trace(what = stats:::simulate.lm,
tracer = quote(ftd <- list(...)[["XX"]]),
at = list(6))
## Prepare the data and 'fit' object
df <- data.frame(x =x<-1:10, y = 1.5*x + rnorm(length(x)))
fit <- lm(y ~ x, data = df)
## Define new covariate values and compute their predicted/fitted values
newX <- 8:1
newFitted <- predict(fit, newdata = data.frame(x = newX))
## Pass in fitted via the argument 'XX'
simulate(fit, nsim = 4, XX = newFitted)
# sim_1 sim_2 sim_3 sim_4
# 1 11.0910257 11.018211 10.95988582 13.398902
# 2 12.3802903 10.589807 10.54324607 11.728212
# 3 8.0546746 9.925670 8.14115433 9.039556
# 4 6.4511230 8.136040 7.59675948 7.892622
# 5 6.2333459 3.131931 5.63671024 7.645412
# 6 3.7449859 4.686575 3.45079655 5.324567
# 7 2.9204519 3.417646 2.05988078 4.453807
# 8 -0.5781599 -1.799643 -0.06848592 0.926204
That works, but this is a cleaner (and likely better) approach:
## A function for simulating at new x-values
simulateX <- function(object, nsim = 1, seed = NULL, X, ...) {
object$fitted.values <- predict(object, X)
simulate(object = object, nsim = nsim, seed = seed, ...)
}
## Prepare example data and a fit object
df <- data.frame(x =x<-1:10, y = 1.5*x + rnorm(length(x)))
fit <- lm(y ~ x, data = df)
## Supply new x-values in a data.frame of the form expected by
## the newdata= argument of predict.lm()
newX <- data.frame(x = 8:1)
## Try it out
simulateX(fit, nsim = 4, X = newX)
# sim_1 sim_2 sim_3 sim_4
# 1 11.485024 11.901787 10.483908 10.818793
# 2 10.990132 11.053870 9.181760 10.599413
# 3 7.899568 9.495389 10.097445 8.544523
# 4 8.259909 7.195572 6.882878 7.580064
# 5 5.542428 6.574177 4.986223 6.289376
# 6 5.622131 6.341748 4.929637 4.545572
# 7 3.277023 2.868446 4.119017 2.609147
# 8 1.296182 1.607852 1.999305 2.598428