help me improve my bootstrap - r

Consider the following code:
require(Hmisc)
num.boots <- 10
data <- rchisq(500, df = 5) #generate fake data
#create bins
binx <- cut(data, breaks = 10)
binx <- levels(binx)
binx <- sub("^.*\\,", "", binx)
binx <- as.numeric(substr(binx, 1, nchar(binx) - 1))
#pre-allocate a matrix to be filled with samples
output <- matrix(NA, nrow = num.boots, ncol = length(binx))
#do random sampling from the vector and calculate percent
# of values equal or smaller to the bin number (i)
for (i in 1:num.boots) {
walk.pair.sample <- sample(data, size = length(data), replace = TRUE)
data.cut <- cut2(x = walk.pair.sample, cuts = binx)
data.cut <- table(data.cut)/sum(table(data.cut))
output[i, ] <- data.cut
}
#do some plotting
plot(1:10, seq(0, max(output), length.out = nrow(output)), type = "n", xlab = "", ylab = "")
for (i in 1:nrow(output)) {
lines(1:10, output[i, 1:nrow(output)])
}
#mean values by columns
output.mean <- apply(output, 2, mean)
lines(output.mean, col="red", lwd = 3)
legend(x = 8, y = 0.25, legend = "mean", col = "red", lty = "solid", lwd = 3)
I was wondering if I can supply the boot:boot() function a function that has as its output a vector of length n > 1? Is it at all possible?
Here are my feeble attempts, but I must be doing something wrong.
require(boot)
bootstrapDistances <- function(data, binx) {
data.cut <- cut2(x = data, cuts = binx)
data.cut <- table(data.cut)/sum(table(data.cut))
return(data.cut)
}
> x <- boot(data = data, statistic = bootstrapDistances, R = 100)
Error in cut.default(x, k2) : 'breaks' are not unique
I don't really understand why Hmisc::cut2() isn't working properly in the boot() call, but works when I call it in a for() loop (see code above). Is the logic of my bootstrapDistances() function feasible with boot()? Any pointers much appreciated.
.:EDIT:.
Aniko suggested I modify my function in such a way, to include an index. While reading the documentation for boot(), this wasn't clear to me how it works, which explains why the function may not be working. Here's the new function Aniko suggested:
bootstrapDistances2 <- function(data, idx, binx) {
data.cut <- cut2(x = data[idx], cuts = binx)
data.cut <- table(data.cut)/sum(table(data.cut))
return(data.cut)
}
However, I managed to produce an error and I'm still working how to remove it.
> x <- boot(data = data, statistic = bootstrapDistances2, R = 100, binx = binx)
Error in t.star[r, ] <- statistic(data, i[r, ], ...) :
number of items to replace is not a multiple of replacement length
After I restarted my R session (also tried another version, 2.10.1), it seems to be working fine.

From the help-file for the boot function:
In all other cases statistic must take at least two arguments. The first argument passed will always be the original data. The second will be a vector of indices, frequencies or weights which define the bootstrap sample.
So you need to add a second parameter to your bootstrapDistances function that will tell it which elements of the data are selected:
bootstrapDistances2 <- function(data, idx, binx) {
data.cut <- cut2(x = data[idx], cuts = binx)
data.cut <- table(data.cut)/sum(table(data.cut))
return(data.cut)
}
And the results:
x <- boot(data = data, statistic = bootstrapDistances2, R = 100, binx=binx)
x
ORDINARY NONPARAMETRIC BOOTSTRAP
Call:
boot(data = data, statistic = bootstrapDistances2, R = 100, binx = binx)
Bootstrap Statistics :
original bias std. error
t1* 0.208 0.00134 0.017342783
t2* 0.322 0.00062 0.021700803
t3* 0.190 -0.00034 0.018873433
t4* 0.136 -0.00116 0.016206197
t5* 0.078 -0.00120 0.011413265
t6* 0.036 0.00070 0.008510837
t7* 0.016 0.00074 0.005816417
t8* 0.006 0.00024 0.003654581
t9* 0.000 0.00000 0.000000000
t10* 0.008 -0.00094 0.003368961

Good answer, Aniko.
Also, the help page for "boot" states that the bootstrap statistic function may return a vector, not merely a scalar.

Related

Error in confidence interval mice R package

everyone I am trying to execute the code in found in the book "Flexible Imputation of Missing Data 2ed" in 2.5.3 section, that calculates a confidence interval for two imputation methods. The problem is that I cannot reproduce the results as the result is always NaN
Here is the code
require(mice)
# function randomly draws artificial data from the specified linear model
create.data <- function(beta = 1, sigma2 = 1, n = 50, run = 1) {
set.seed(seed = run)
x <- rnorm(n)
y <- beta * x + rnorm(n, sd = sqrt(sigma2))
cbind(x = x, y = y)
}
#Remove some data
make.missing <- function(data, p = 0.5){
rx <- rbinom(nrow(data), 1, p)
data[rx == 0, "x"] <- NA
data
}
# Apply Rubin’s rules to the imputed data
test.impute <- function(data, m = 5, method = "norm", ...) {
imp <- mice(data, method = method, m = m, print = FALSE, ...)
fit <- with(imp, lm(y ~ x))
tab <- summary(pool(fit), "all", conf.int = TRUE)
as.numeric(tab["x", c("estimate", "2.5 %", "97.5 %")])
}
#Bind everything together
simulate <- function(runs = 10) {
res <- array(NA, dim = c(2, runs, 3))
dimnames(res) <- list(c("norm.predict", "norm.nob"),
as.character(1:runs),
c("estimate", "2.5 %","97.5 %"))
for(run in 1:runs) {
data <- create.data(run = run)
data <- make.missing(data)
res[1, run, ] <- test.impute(data, method = "norm.predict",
m = 2)
res[2, run, ] <- test.impute(data, method = "norm.nob")
}
res
}
res <- simulate(1000)
#Estimate the lower and upper bounds of the confidence intervals per method
apply(res, c(1, 3), mean, na.rm = TRUE)
Best Regards
Replace "x" by tab$term == "x" in the last line of test.impute():
as.numeric( tab[ tab$term == "x", c("estimate", "2.5 %", "97.5 %")])

Adding descriptive statistics to a function when making use of the ellipsis as input variable

For an assignment I have created a function in R that calculates the regression coefficients, predicted values and residuals of data that is useful for multiple linear regression. It did that as follows:
MLR <- function(y_var, ...){
y <- y_var
X <- as.matrix(cbind(...))
intercept <- rep(1, length(y))
X <- cbind(intercept, X)
regression_coef <- solve(t(X) %*% X) %*% t(X) %*% y
predicted_val <- X %*% regression_coef
residual_val <- y - predicted_val
scatterplot <- plot(predicted_val, residual_val,
ylab = 'Residuals', xlab = 'Predicted values',
main = 'Predicted values against the residuals',
abline(0,0))
list('y' = y,
'X' = X,
'Regression coefficients' = regression_coef,
'Predicted values' = predicted_val,
'Residuals' = residual_val,
'Scatterplot' = scatterplot
)
}
Now, my struggle is to add descriptive statistics of my input variables. Since I want my independent variables to be able to be any number, I used the ellipsis as input variable. Is there a way to calculate useful descriptive statistics (mean, variance, standard deviation) of my independent variables (defined by the ...)?
This
mean(...)
does not work...
Thank you for the replies already!
Try this slight changes on your function. I have applied to some variables of iris dataset. You can compute the desired statistics over X and then output as an additional slot for your output. Here the code:
#Function
MLR <- function(y_var, ...){
y <- y_var
X <- as.matrix(cbind(...))
RX <- X
intercept <- rep(1, length(y))
X <- cbind(intercept, X)
regression_coef <- solve(t(X) %*% X) %*% t(X) %*% y
predicted_val <- X %*% regression_coef
residual_val <- y - predicted_val
scatterplot <- plot(predicted_val, residual_val,
ylab = 'Residuals', xlab = 'Predicted values',
main = 'Predicted values against the residuals',
abline(0,0))
#Summary
#Stats
DMeans <- apply(RX,2,mean,na.rm=T)
DSD <- apply(RX,2,sd,na.rm=T)
DVar <- apply(RX,2,var,na.rm=T)
DSummary <- rbind(DMeans,DSD,DVar)
#Out
list('y' = y,
'X' = X,
'Regression coefficients' = regression_coef,
'Predicted values' = predicted_val,
'Residuals' = residual_val,
'Scatterplot' = scatterplot,
'Summary' = DSummary
)
}
#Apply
MLR(y_var = iris$Sepal.Length,iris$Sepal.Width,iris$Petal.Length)
The final slot of the output will look like this:
$Scatterplot
NULL
$Summary
[,1] [,2]
DMeans 3.0573333 3.758000
DSD 0.4358663 1.765298
DVar 0.1899794 3.116278
I think I've got it. Unfortunately, the ellipsis seems to be quite quirky to work with them. Check if the cbind(...) functions correctly inside your function (when I've checked it at the output, it was only 1 column wide, while I input 2 variables into it, and that don't seem right.
My solution don't read variable names - it uses placeholder names (Var_1, Var_2, ... , Var_n)
MLR <- function(y_var, ...){
# these two packages will come in handy
require(dplyr)
require(tidyr)
y <- y_var
X <- as.matrix(cbind(...))
# firstly, we need to make df/tibble out of ellipsis
X2 <- list(...)
n <- tibble(n = rep(0, times = length(y)))
index <- 0
for(Var in X2){
index <- index + 1
n[, paste0("Var_", index)] <- Var
}
# after the df was created, now it's time for calculating desc
# Using tidyr::gather with dplyr::summarize creates nice summary,
# where each row is another variable
descriptives <- tidyr::gather(n, key = "Variable", value = "Value") %>%
group_by(Variable) %>%
summarize(mean = mean(Value), var = var(Value), sd = sd(Value), .groups = "keep")
# everything except the output list is the same
intercept <- rep(1, length(y))
X <- cbind(intercept, X)
regression_coef <- solve(t(X) %*% X) %*% t(X) %*% y
predicted_val <- X %*% regression_coef
residual_val <- y - predicted_val
scatterplot <- plot(predicted_val, residual_val,
ylab = 'Residuals', xlab = 'Predicted values',
main = 'Predicted values against the residuals',
abline(0,0))
list('y' = y,
'X' = X,
'Regression coefficients' = regression_coef,
'Predicted values' = predicted_val,
'Residuals' = residual_val,
'Scatterplot' = scatterplot,
'descriptives' = descriptives[-1,] # need to remove the first row
# because it is "n" placeholder
)
}

Function that will generate iter samples of size n from a gamma distribution with shape parameter alpha and rate parameter beta

The function needs to return the mean and standard deviation of each sample.
This is what I have:
sample_gamma <- function(alpha, beta, n, iter) {
mean = alpha/beta
var = alpha/(beta)^2
sd = sqrt(var)
gamma = rgamma(n,shape = alpha, scale = 1/beta)
sample_gamma = data.frame(mean = replicate(n = iter, expr = mean))
}
I'm very lost for this. I also need to create a data frame for this function.
Thank you for your time.
Edit:
sample_gamma <- function(alpha, beta, n, iter) {
output <- rgamma(iter, alpha, 1/beta)
output_1 <- matrix(output, ncol = iter)
means <- apply(output_1, 2, mean)
sds <- apply(output_1, 2, sd)
mystats <- data.frame(means, sds)
return(mystats)
}
This works except for the sds. It's returning NAs.
It's not really clear to me what you want. But say you want to create 10 samples of size 1000, alpha = 1, beta = 2. Then you can create a single stream of rgamma realizations, dimension them into a matrix, then get your stats with apply, and finally create a data frame with those vectors:
output <- rgamma(10*1000, 1, 1/2)
output <- matrix(output, ncol = 10)
means <- apply(output, 2, mean)
sds <- apply(output, 2, sd)
mystats <- data.frame(means, sds)
You could wrap your function around that code, replacing the hard values with parameters.

Fit student t to incomplete distribution

Thanks to a closed form formula (I work on risk neutral density, with this king of formula: RND formula, page 8), I have an incomplete distribution of this type:
My idea would be to fit this density with a student-t.
I already tried the MASS and fitdistrplus packages but just can't find how to perform my task. Everything I can do for now is to get the fitted parameters (m=1702.041, s=6.608536, df=15.18036), but from here I don't know how to get my fitted values for my distribution.
A sample of code:
temp = matrix(nrow=1000, ncol=3)
colnames(temp) = c("strikes", "first_density", "mulitply_first_density")
temp = as.data.frame(temp)
# we generate fake data
temp$strikes = seq(1000,2000,length=1000)
temp$first_density = runif(1000,max=0.006, min=1e-10)
# we multiply our first density to generate our sample
temp$mulitply_first_density = temp$first_density*1000000
# we generate our sample
vec = vector()
for (i in 1:nrow(temp))
{
vec = c(vec, rep(temp$strike[i], temp$mulitply_first_density[i]))
}
# we laod our library
library("MASS")
# we fir our parameters
fitted_parameters = fitdistr(vec, "t")
The formula for the t-density function using the location and scale parameters is given in the examples of the documentation as mydt.
#simulated data
set.seed(42)
x <- rt(1e4, 7, 10)
plot(density(x))
library(MASS)
fitted_parameters = fitdistr(x, "t", start = list(df = 10, m = 10, s = 5))
# df m s
# 3.81901649 10.56816146 2.66905346
#( 0.15295551) ( 0.03448627) ( 0.03361758)
mydt <- function(x, m, s, df) dt((x-m)/s, df)/s
curve(do.call(mydt, c(list(x), as.list(fitted_parameters$estimate))), add = TRUE, col = "red")
legend("topright", legend = c("kernel density estimate", "fitted t distribution"),
col = c("black", "red"), lty = 1)

How to change metrics using the library(caret)?

I would like to change the metric from RMSE to RMSLE using the
caret library
Given some sample data:
ivar1<-rnorm(500, mean = 3, sd = 1)
ivar2<-rnorm(500, mean = 4, sd = 1)
ivar3<-rnorm(500, mean = 5, sd = 1)
ivar4<-rnorm(500, mean = 4, sd = 1)
dvar<-rpois(500, exp(3+ 0.1*ivar1 - 0.25*ivar2))
data<-data.frame(dvar,ivar4,ivar3,ivar2,ivar1)
ctrl <- rfeControl(functions=rfFuncs,
method="cv",
repeats = 5,
verbose = FALSE,
number=5)
model <- rfe(data[,2:4], data[,1], sizes=c(1:4), rfeControl=ctrl)
Here I would like to change to RMSLE and keeping the idea of the graph
plot <-ggplot(model,type=c("g", "o"), metric="RMSE")+ scale_x_continuous(breaks = 2:4, labels = names(data)[2:4])
Im not sure how / if you can easily convert RMSE to RMSLE, so you can try changing the control function.
Look at rfFuncs$summary it calls a function postResample. This is where the RMSE is calculated - look at the section
mse <- mean((pred - obs)^2)
n <- length(obs)
out <- c(sqrt(mse), resamplCor^2)
So you can amend this function to calculate the RMSLE instead:
msle <- mean((log(pred) - log(obs))^2)
out <- sqrt(msle)
}
names(out) <- "RMSLE"
Then if this amended function has been saved in a function called mypostResample, you then need to update the rfFuncs$summary.
So altogether:
First update the summary function - this will call the new function with RMSLE
newSumm <- function (data, lev = NULL, model = NULL)
{
if (is.character(data$obs))
data$obs <- factor(data$obs, levels = lev)
mypostResample(data[, "pred"], data[, "obs"])
}
Then define new function to calculate RMSLE
mypostResample <- function (pred, obs)
{
isNA <- is.na(pred)
pred <- pred[!isNA]
obs <- obs[!isNA]
msle <- mean((log(pred) - log(obs))^2)
out <- sqrt(msle)
names(out) <- "RMSLE"
if (any(is.nan(out)))
out[is.nan(out)] <- NA
out
}
Update rfFuncs
# keep old settings for future use
oldSumm <- rfFuncs$summary
# update with new function
rfFuncs$summary <- newSumm
ctrl <- rfeControl(functions=rfFuncs,
method="cv",
repeats = 5,
verbose = FALSE,
number=5)
set.seed(1)
model <- rfe(data[,2:4], data[,1], sizes=c(1:4), rfeControl=ctrl, metric="RMSLE")
# plot
ggplot(model,type=c("g", "o"), metric="RMSLE")+ scale_x_continuous(breaks = 2:4, labels = names(data)[2:4])

Resources