loop linear regression over samples that contain multiple observations - r

I have a linear regression model y = 50 + 10x + e, where e is normally distributed.
Every time I fit the model, I'm required to use 20 pairs of x and y values, where x is seq(from = 0.5, to = 10, by = 0.5).
My first task is to fit the model 100 times. In other words, generate 100 samples, where each sample consists of 10 pairs of x and y values.
My second task is to save the intercept and slope of each of the 100 instances of model-fitting.
My unsuccessful code is below:
linear_model <- c()
intercept <- c()
slope <- c()
for (i in 1:100) {
e <- rnorm(n = 10, mean = 0, sd = 4)
x <- seq(from = 0.5, to = 10, by = 0.5)
y <- 50 + 10 * x + e
linear_model[i] <- lm(formula = y ~ x)
intercept[i] <- summary(object = linear_model[i])$coefficients[1, 1]
slope[i] <- summary(object = linear_model[i])$coefficients[2, 1]
}

You've generated 10 random error values but 20 x values, so the dimensions don't match. Either 20 random values or 10 x values would work.
Below is my attempt; note that the loop is run only twice (times = 2), whereas it would be 100 in your case.
errs <- lapply(rep(x = 20, times = 2), rnorm, mean = 0, sd = 4)  # one length-20 error vector per replication
x <- seq(0.5, 10, 0.5)
y <- lapply(errs, function(err) 50 * x + err)  # build a y vector for each error vector
myLM <- function(res) {  # fit a model and return its intercept and slope
mod <- lm(formula = res ~ x)
out <- list(intercept = mod$coefficients[1],
slope = mod$coefficients[2])
out
}
fit <- sapply(y, myLM)
fit
[,1] [,2]
intercept 0.005351345 -2.362931
slope 50.13638 50.60856
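If you prefer to keep the original for loop, here is a minimal sketch with the two issues fixed: the error vector must have the same length as x, and fitted model objects have to be stored in a list with [[i]] rather than assigned into an atomic vector with [i].
set.seed(1)                                    # for reproducibility
x <- seq(from = 0.5, to = 10, by = 0.5)        # 20 x values
models    <- vector("list", 100)               # a list can hold lm objects
intercept <- numeric(100)
slope     <- numeric(100)
for (i in 1:100) {
  e <- rnorm(n = length(x), mean = 0, sd = 4)  # error length matches x
  y <- 50 + 10 * x + e
  models[[i]] <- lm(y ~ x)
  intercept[i] <- coef(models[[i]])[1]
  slope[i]     <- coef(models[[i]])[2]
}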

Calculating RSS manually with given pairs of beta0 and beta1

I am trying to manually calculate the RSS for a dataset with given pairs of beta0 and beta1. For each (beta0, beta1) pair of values, I need to calculate the residual sum of squares and store it as a vector called RSS in data. Here's the code provided.
x = pinotnoir$Aroma
y = pinotnoir$Quality
fit = lm(y ~ x)
summary(fit)
b0s <- seq(0, 10, .1)
b1s <- seq(0, 4, .01)
data <- expand.grid(beta0=b0s, beta1=b1s)
Here's what I have so far. I think the residual calculation is wrong but I'm not sure how to fix it.
rows = length(b1s)
rsd <- rep(NA,rows)
for (i in 1:rows){
residual = (y - (b0s[i] + b1s[i] * x))^2
rsd[i] <- residual
}
data <- expand.grid(beta0=b0s, beta1=b1s, RSS=rsd)
Any help would be appreciated. Thanks in advance!
I am not sure this is exactly what you're aiming for, but by adapting your code slightly you can get the sum of squared residuals and find which betas minimize it (using the mtcars data for the example).
mtcars
x = mtcars$drat
y = mtcars$wt
(fit = lm(y ~ x))
summary(fit)
grid_len <- 20
b0s <- seq(5, 10, length.out = grid_len)
b1s <- seq(-3, -1, length.out = grid_len)
(data <- expand.grid(beta0=b0s, beta1=b1s))
rows = nrow(data)
resids <- rep(NA,rows)
for (i in 1:rows) {
fitted <- (data$beta0[i] + (data$beta1[i] * x))
squared_resid <- (y - fitted)^2
SSR <- sum(squared_resid)
resids[i] <- SSR
cat(i, ": ", SSR, "\n")
}
data[which.min(resids), ]
fit
results:
> data[which.min(resids), ]
beta0 beta1
332 7.894737 -1.315789
> fit
Call:
lm(formula = y ~ x)
Coefficients:
(Intercept) x
7.906 -1.304
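For reference, the same grid search can be written more compactly with sapply; it computes the same RSS values as the loop above:
data$RSS <- sapply(seq_len(nrow(data)), function(i) {
  sum((y - (data$beta0[i] + data$beta1[i] * x))^2)
})
data[which.min(data$RSS), ]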

R convert regression model fit to a function

I want a quick way to turn the fit of a regression model into a function.
So I want to get from:
# generate some random data
set.seed(123)
x <- rnorm(n = 100, mean = 10, sd = 4)
z <- rnorm(n = 100, mean = -8, sd = 3)
y <- 9 * x - 10 * x ^ 2 + 5 * z + 10 + rnorm(n = 100, 0, 30)
df <- data.frame(x, y, z)
plot(df$x,df$y)
model1 <- lm(formula = y ~ x + I(x^2) + z, data = df)
summary(model1)
to a model_function(x) that describes the fitted values for me.
Of course I could do this by hand in a way like this:
model_function <- function(x, z, model) {
fit <- coefficients(model)["(Intercept)"] + coefficients(model)["x"]*x + coefficients(model)["I(x^2)"]*x^2 + coefficients(model)["z"]*z
return(fit)
}
fit <- model_function(df$x,df$z, model1)
which I can compare to the actual fitted values and which, up to rounding, matches perfectly.
all(round(as.numeric(model1$fitted.values),5) == round(fit,5))
But of course this is not a universal solution (e.g. more variables etc.).
So to be clear:
Is there an easy way to extract the fitted values relationship as a function with the coefficients that were just estimated?
Note: I know of course about predict and the ability to generate fitted values from new data - but I'm really looking for that underlying function. Maybe that's possible through predict?
Grateful for any help!
If you want an actual function you can do something like this:
get_func <- function(mod) {
vars <- as.list(attr(mod$terms, "variables"))[-(1:2)]
funcs <- lapply(vars, function(x) list(quote(`*`), 1, x))
terms <- mapply(function(x, y) {x[[2]] <- y; as.call(x)}, funcs, mod$coefficients[-1],
SIMPLIFY = FALSE)
terms <- c(as.numeric(mod$coefficients[1]), terms)
body <- Reduce(function(a, b) as.call(list(quote(`+`), a, b)), terms)
vars <- setNames(lapply(seq_along(vars), function(x) NULL), sapply(vars, as.character))
f <- as.function(c(do.call(alist, vars), body))
formals(f) <- formals(f)[!grepl("\\(", names(formals(f)))]
f
}
Which allows:
my_func <- get_func(model1)
my_func
#> function (x = NULL, z = NULL)
#> 48.6991866925322 + 3.31343108778127 * x + -9.77589420188036 * I(x^2) + 5.38229596972984 * z
<environment: 0x00000285a1982b48>
and
my_func(x = 1:10, z = 3)
#> [1] 58.38361 32.36936 -13.19668 -78.31451 -162.98413 -267.20553
#> [7] -390.97872 -534.30371 -697.18048 -879.60903
and
plot(1:10, my_func(x = 1:10, z = 3), type = "b")
At the moment this would not work with interaction terms, etc., but it should work for most simple linear models.
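As a quick sanity check (using model1 and the data from the question), the generated function should reproduce lm's fitted values up to numerical precision:
all.equal(unname(my_func(x = x, z = z)), unname(fitted(model1)))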
Any of these give the fitted values:
fitted(model1)
predict(model1)
model.matrix(model1) %*% coef(model1)
y - resid(model1)
X <- model.matrix(model1); X %*% qr.solve(X, y)
X <- cbind(1, x, x^2, z); X %*% qr.solve(X, y)
Any of these give the predicted values for any particular x and z:
cbind(1, x, x^2, z) %*% coef(model1)
predict(model1, list(x = x, z = z))
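And if all that is needed is a callable wrapper, predict() can simply be wrapped in a small helper (model_fun below is just an illustrative name):
model_fun <- function(x, z, model = model1) {
  predict(model, newdata = data.frame(x = x, z = z))
}
model_fun(x = 1:10, z = 3)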

Performing residual bootstrap using kernel regression in R

Kernel regression is a non-parametric technique that estimates the conditional expectation of a random variable. It uses local averaging of the response value, Y, to find a possibly non-linear relationship between X and Y.
I have used the bootstrap for kernel density estimation and now want to use it for kernel regression as well. I have been told to use residual bootstrapping for kernel regression and have read a couple of papers on this. I am, however, unsure how to perform it. The programming has been done in R using the FKSUM package. I have made an attempt to use standard resampling on kernel regression:
library(FKSUM)
set.seed(1)
n <- 5000
sample.size <- 500
B.replications <- 200
x <- rbeta(n, 2, 2) * 10
y <- 3 * sin(2 * x) + 10 * (x > 5) * (x - 5)
y <- y + rnorm(n) + (rgamma(n, 2, 2) - 1) * (abs(x - 5) + 3)
#taking x.y to be the population
x.y <- data.frame(x, y)
xs <- seq(min(x), max(x), length = 1000)
ftrue <- 3 * sin(2 * xs) + 10 * (xs > 5) * (xs - 5)
#Sample from the population
seqx<-seq(1,5000,by=1)
sample.ind <- sample(seqx, size = sample.size, replace = FALSE)
sample.reg<-x.y[sample.ind,]
x_s <- sample.reg$x
y_s <- sample.reg$y
fhat_loc_lin.pop <- fk_regression(x, y)
fhat_loc_lin.sample <- fk_regression(x = x_s, y = y_s)
plot(x, y, col = rgb(.7, .7, .7, .3), pch = 16, xlab = 'x',
ylab = 'y', main = 'Local linear estimator with amise bandwidth')
lines(xs, ftrue, col = 2, lwd = 3)
lines(fhat_loc_lin.sample, lty = 2, lwd = 2)
#Bootstrap
n.B.sample = sample.size # sample bootstrap size
boot.reg.mat.X <- matrix(0,ncol=B.replications, nrow=n.B.sample)
boot.reg.mat.Y <- matrix(0,ncol=B.replications, nrow=n.B.sample)
fhat_loc_lin.boot <- matrix(0,ncol = B.replications, nrow=100)
Temp.reg.y <- matrix(0,ncol = B.replications,nrow = 1000)
for(i in 1:B.replications){
sequence.x.boot <- seq(from=1,to=n.B.sample,by=1)
sample.ind.boot <- sample(sequence.x.boot, size = sample.size, replace = TRUE)
boot.reg.mat <- sample.reg[sample.ind.boot,]
boot.reg.mat.X <- boot.reg.mat$x
boot.reg.mat.Y <- boot.reg.mat$y
fhat_loc_lin.boot <- fk_regression(x = boot.reg.mat.X ,
y = boot.reg.mat.Y,
h = fhat_loc_lin.sample$h)
lines(y=fhat_loc_lin.boot$y,x= fhat_loc_lin.sample$x, col =c(i) )
Temp.reg.y[,i] <- fhat_loc_lin.boot$y
}
quan.reg.l <- vector()
quan.reg.u <- vector()
for(i in 1:length(xs)){
quan.reg.l[i] <- quantile(x = Temp.reg.y[i,],probs = 0.025)
quan.reg.u[i] <- quantile(x = Temp.reg.y[i,],probs = 0.975)
}
# Lower Bound
Temp.reg.2 <- quan.reg.l
lines(y=Temp.reg.2,x=fhat_loc_lin.boot$x ,col="red",lwd=4,lty=1)
# Upper Bound
Temp.reg.3 <- quan.reg.u
lines(y=Temp.reg.3,x=fhat_loc_lin.boot$x ,col="navy",lwd=4,lty=1)
Asking the question on here now since I haven't received any response on CV. Any help would be greatly appreciated!
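In case it helps, here is an untested sketch of how a residual bootstrap could be grafted onto the code above. It assumes, as the code above does, that fk_regression() returns the fitted curve on a grid ($x, $y) together with the bandwidth ($h); fitted values at the observed x_s are obtained by interpolating that grid.
# Sketch of a residual bootstrap: fit on the sample, interpolate fitted values
# at the observed x_s, resample centred residuals, refit on (x_s, y*), and take
# pointwise quantiles of the bootstrap curves.
fit.sample  <- fk_regression(x = x_s, y = y_s)
fitted.at.x <- approx(x = fit.sample$x, y = fit.sample$y, xout = x_s)$y
resids      <- y_s - fitted.at.x
resids      <- resids - mean(resids)                  # centre the residuals
boot.curves <- matrix(NA, nrow = length(fit.sample$x), ncol = B.replications)
for (b in 1:B.replications) {
  y.star <- fitted.at.x + sample(resids, size = length(resids), replace = TRUE)
  fit.b  <- fk_regression(x = x_s, y = y.star, h = fit.sample$h)
  boot.curves[, b] <- approx(x = fit.b$x, y = fit.b$y, xout = fit.sample$x)$y
}
# pointwise 95% bootstrap band
lower <- apply(boot.curves, 1, quantile, probs = 0.025)
upper <- apply(boot.curves, 1, quantile, probs = 0.975)
lines(fit.sample$x, lower, col = "red",  lwd = 2)
lines(fit.sample$x, upper, col = "navy", lwd = 2)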

fitting function for a given data set

I'm trying to fit the function y(x) = a*( 1 + (x^2)/(b^2) )^t to a particular set of data, where a, b and t are constants that I want to determine by the fit.
I tried the following, for example:
len <- 24
x = runif(len)
y = x^3 + runif(len, min = -0.1, max = 0.1)
plot(x, y)
s <- seq(from = 0, to = 1, length = 50)
lines(s, s^3, lty = 2)
df <- data.frame(x, y)
m <- nls(y~a*( 1 + (x^2)/(b^2) )^t, data = df, start = list(a=1,t=0, b=1), trace = T)
> Error in nlsModel(formula, mf, start, wts) :
singular gradient matrix at initial parameter estimates
Can someone help me fit this function to these points? Even if the fit turns out to be poor, the important thing is to fit this function, i.e. to get it to run on the data.
Thanks, everyone.
Because your data change randomly, in some situations the value of a is close to zero and the function becomes zero, and the curve-fitting procedure fails at that point. Randomizing the start parameters might work for some situations.
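For example, one hedged way to implement that idea is to retry nls() from random start values a limited number of times (the start ranges below are arbitrary choices for this simulated data set):
# Retry nls() from random starting values until one of the fits converges
fit <- NULL
set.seed(42)
for (attempt in 1:50) {
  st <- list(a = runif(1, 0.05, 2), b = runif(1, 0.1, 2), t = runif(1, 0.5, 5))
  fit <- tryCatch(nls(y ~ a * (1 + (x^2)/(b^2))^t, data = df, start = st),
                  error = function(e) NULL)
  if (!is.null(fit)) break
}
fit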
A slightly more stable output can be computed using the Levenberg-Marquardt (LM) algorithm:
require("minpack.lm")
LMCurveFit <- function(df) {
# The function to be fit
FitFunction <- function(params, x) {
with (
as.list(params), {
a*(1 + x^2/b^2)^t
}
)
}
# Residual
Residual <- function(params, x, y) {
FitFunction(params, x) - y
}
# Sum of squares of residuals
ssqfun <- function(params, x, y) {
sum(Residual(params, x, y)^2)
}
# Normalize the data
x_max = max(df$x)
y_max = max(df$y)
df$x = df$x/x_max
df$y = df$y/y_max
# Define start parameters
a_start = 0.1
b_start = 1.0
t_start = 1.0
param_start = c(a = a_start,
b = b_start,
t = t_start)
# Do LM fit
nls.out <- nls.lm(par = param_start,
fn = Residual,
control = nls.lm.control(nprint=0,
ftol=.Machine$double.eps,
ptol=.Machine$double.eps,
maxfev=10000, maxiter=1024),
x = df$x,
y = df$y)
# Revert scaling
nls.out$par[1] = nls.out$par[1]*y_max
nls.out$par[2] = nls.out$par[2]*x_max
# Get the parameters
params_fit = coef(nls.out)
print(params_fit)
# Compute predicted values
predicted = FitFunction(as.list(params_fit), df$x*x_max)
}
# LM fit
pred_y = LMCurveFit(df)
lines(x, pred_y)
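A small usage note: since x comes from runif() and is unsorted, order it before drawing the fitted curve on the scatter plot:
ord <- order(x)
plot(x, y)
lines(x[ord], pred_y[ord], col = "red", lwd = 2)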

R Remove intercepts in logistic regression

I am using the rms library to perform regularized logistic regression, and wish to force the intercept to zero. I'm using the following to simulate and regress:
library(rms)
N = 100
pred <- vapply(1:12, function(i) rnorm(N, mean = 0, sd =1), numeric(N))
resp <- 20*pred[, 1] - 3*pred[, 7] - 2*pred[, 8] + matrix(rnorm(N, sd = 0.1)) + 20
pr <- 1 / (1 + exp(-resp))
y <- rbinom(N, 1, pr)
lrm(y ~ pred, penalty = 1)
The post at How to remove intercept in R suggests including '0 +' or '- 1' in the model formula. However, this does not appear to work for lrm.
You can use glmnet. It also includes a cross-validation function for choosing the tuning parameter.
library(glmnet)
N = 1000
pred <- vapply(1:12, function(i) rnorm(N, mean = 0, sd =1), numeric(N))
resp <- 20*pred[, 1] - 3*pred[, 7] - 2*pred[, 8] + matrix(rnorm(N, sd = 0.1)) + 20
pr <- 1 / (1 + exp(-resp))
y <- rbinom(N, 1, pr)
result <- cv.glmnet(pred, y, family="binomial", intercept=FALSE)
# best lambda based on cv
result$lambda.min
# coefficient
coef(result$glmnet.fit, s=result$lambda.min)
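As a usage note, the coefficients at the selected lambda can also be extracted directly from the cv.glmnet object:
coef(result, s = "lambda.min")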
