Confidence interval for all rows of a dataframe - t distribution - r

I'm trying to calculate confidence intervals for all rows in a dataframe. I've been using something like this for a normal distribution:
function(x){
  mean(x) + c(-1.96, 1.96) * sd(x) / sqrt(length(x))
}
Any advice on how to modify the above for a t-distribution? Thanks a lot in advance.

Try the below. I broke it down into parts but also provide a function similar to yours.
## In parts
n <- 25
x <- sample(1:100, n, replace = TRUE)
xbar <- mean(x)   # avoid masking base mean() and sd()
s <- sd(x)
error <- qt(0.975, df = n - 1) * s / sqrt(n)
lower <- xbar - error
upper <- xbar + error
# As a function (computing n and sd from x itself, rather than
# relying on global variables)
example_function <- function(x){
  n <- length(x)
  error <- qt(0.975, df = n - 1) * sd(x) / sqrt(n)
  mean(x) + c(-error, error)
}
example_function(x)
# > lower
# [1] 37.23457
# > upper
# [1] 60.76543
# > example_function(x)
# [1] 37.23457 60.76543
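To answer the original row-wise question, the same function can then be applied across rows; a minimal sketch, where df stands in for your numeric data frame (hypothetical name):

# one CI per row of a hypothetical numeric data frame df
t(apply(df, 1, example_function))
# returns a matrix with one row per row of df and columns (lower, upper)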

Related

Constructing confidence intervals for trimmed means in R

I'd like to test the coverage probabilities for trimmed means. I am using the formula from Wilcox's book for confidence intervals:

$\bar{X}_t \pm t \cdot \frac{s_w}{(1 - 2\gamma)\sqrt{n}}$

Here $s_w$ is the Winsorized variance, $t$ is the appropriate Student-t quantile, and $\gamma$ is the trimming proportion; in my code it is denoted alpha. The problem is that the code I have made outputs confidence intervals that always contain 0, so that the coverage probability is 1. So I think there is some error in the construction.
Code:
sample_var <- function(data, alpha){
  n <- length(data)
  data <- sort(data)
  data_t <- data[(floor(n*alpha)+1):(n-floor(alpha*n))]
  m <- length(data_t)
  t_mean <- mean(data_t)
  sigma <- (1/(1-2*alpha)^2) * ((1/n) * sum((data_t-t_mean)^2) +
    alpha*(data_t[1]-t_mean)^2 + alpha*(data_t[m]-t_mean)^2)
  sigma
}
sample_var <- Vectorize(sample_var, vectorize.args = "alpha")
conf_int <- function(data, alpha){
  a <- floor(alpha * n)
  n <- length(data)
  df <- n - 2*a - 1
  data_t <- data[a:(n-a)]
  t_mean <- mean(data_t)
  t_quantile <- qt(p = alpha, df = df)
  sw <- sample_var(data = data, alpha = alpha)
  ul <- t_mean + t_quantile * sw / ((1-2*alpha)*sqrt(n))
  ll <- t_mean - t_quantile * sw / ((1-2*alpha)*sqrt(n))
  c(ll, ul)
}
Maybe someone sees the error?
EDIT:
Here I tried to construct the intervals using the wilcox.test function, but I don't know whether it accurately constructs the interval for the trimmed mean. Furthermore, no matter which alpha I use for the given data set, I get the same interval. So I suppose that the subset argument is wrong.
set_seed(1)
data <- rnorm(100)
wilcox_test <- function(data, alpha){
  n <- length(alpha)
  a <- floor(alpha*n)+1
  b <- n-floor(alpha)
  wilcox.test(data, subset = data[a:b], conf.int = TRUE)
}
OK... with rnorm(100) and set.seed(1), close-ish:
set.seed(1) # note set.seed() is what you want here, I think.
data <- rnorm(100)
# wilcox.test() wants conf.int = TRUE (a logical) and conf.level for the level;
# the default method has no subset argument, so trim the data directly if needed
wilcox_test_out <- wilcox.test(data, conf.int = TRUE, conf.level = 0.95)
wilcox_test_out
# Note the CIs are in wilcox_test_out$conf.int for further use should you need them
wilcox_test_out$conf.int
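For what it's worth, here is a sketch of a corrected conf_int under the formula quoted above. My reading of the bugs: n is used before it is defined, the data are not sorted before trimming, qt() is given the trimming proportion instead of the confidence level, and sample_var returns a variance rather than a standard deviation:

conf_int <- function(data, alpha, conf.level = 0.95){
  n <- length(data)                          # define n before using it
  a <- floor(alpha * n)
  df <- n - 2*a - 1
  data_t <- sort(data)[(a+1):(n-a)]          # sort, then trim both tails
  t_mean <- mean(data_t)
  t_quantile <- qt(1 - (1 - conf.level)/2, df = df)  # CI quantile, not alpha
  sw <- sqrt(sample_var(data = data, alpha = alpha)) # sample_var returns a variance
  half <- t_quantile * sw / ((1 - 2*alpha) * sqrt(n))
  c(t_mean - half, t_mean + half)
}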

Why do I get the error "argument is of length 0" for dffits?

I have a problem when I try to run the dffits() function on an object from my own logistic regression.
When I run dffits(log), I get the error message:
Error in if (model$rank == 0) { : argument is of length 0
However, when I use the built-in glm function (family = binomial), dffits(glm) works just fine.
Here is my function for the logistic regression and a short example of my problem:
mydata <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
mydata$rank <- factor(mydata$rank)
mydata$admit <- factor(mydata$admit)
logRegEst <- function(x, y, threshold = 1e-10, maxIter = 100)
{
  calcPi <- function(x, beta)
  {
    beta <- as.vector(beta)
    return(exp(x %*% beta) / (1 + exp(x %*% beta)))
  }
  beta <- rep(0, ncol(x)) # initial guess for beta
  diff <- 1000
  # initial value bigger than threshold so that we can enter our while loop
  iterCount = 0
  # counter to ensure we're not stuck in an infinite loop
  while(diff > threshold) # tests for convergence
  {
    pi <- as.vector(calcPi(x, beta))
    # calculate pi by using the current estimate of beta
    W <- diag(pi * (1 - pi)) # calculate matrix of weights W
    beta_change <- solve(t(x) %*% W %*% x) %*% t(x) %*% (y - pi)
    # calculate the change in beta
    beta <- beta + beta_change # new beta
    diff <- sum(beta_change^2)
    # calculate how much we changed beta by in this iteration
    # if this is less than threshold, we'll break the while loop
    iterCount <- iterCount + 1
    # see if we've hit the maximum number of iterations
    if(iterCount > maxIter){
      stop("This isn't converging.")
    }
    # stop if we have hit the maximum number of iterations
  }
  df <- length(y) - ncol(x)
  # calculating the degrees of freedom by taking the length of y minus
  # the number of x columns
  vcov <- solve(t(x) %*% W %*% x)
  list(coefficients = beta, vcov = vcov, df = df)
  # returning results
}
logReg <- function(formula, data)
{
  mf <- model.frame(formula = formula, data = data)
  # model.frame() returns a data.frame with the variables needed to use the
  # formula.
  x <- model.matrix(attr(mf, "terms"), data = mf)
  # model.matrix() creates a design matrix. That means that, for example, the
  # "Sex" variable is coded as a dummy variable of ones and zeros.
  y <- as.numeric(model.response(mf)) - 1
  # model.response() gives us the response variable.
  est <- logRegEst(x, y)
  # Now we have the starting position to apply our function from above.
  est$formula <- formula
  est$call <- match.call()
  est$data <- data
  # We add the formula and the call to the list.
  est$x <- x
  est$y <- y
  # We add x and y to the list.
  class(est) <- "logReg"
  # defining the class
  est
}
log <- logReg(admit ~ gre + gpa, data= mydata)
glm <- glm(admit ~ gre + gpa, data= mydata, family = binomial)
dffits(glm)
dffits(log)
log$data
glm$data
I don't understand why mydata$rank == 0, because when I look at log$data I see that rank is defined there just as in glm$data.
I really appreciate your help!
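A likely explanation, for what it's worth: dffits() is written for "lm"/"glm" objects and (via stats::lm.influence()) evaluates if (model$rank == 0). The logReg object has no rank component, so model$rank is NULL (length 0), which is exactly the error; it has nothing to do with the rank column in mydata. A minimal sketch of the missing piece inside logRegEst() (note that dffits() will then still look for further lm components, such as qr, so this alone is not a full fix):

rank <- ncol(x) # number of coefficients, assuming x has full column rank
list(coefficients = beta, vcov = vcov, df = df, rank = rank)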

How to calculate standardized Pearson residuals by hand in R?

I am trying to calculate the standardized Pearson Residuals by hand in R. However, I am struggling when it comes to calculating the hat matrix.
I have built my own logistic regression and I am trying to calculate the standardized Pearson residuals in the logReg function.
logRegEst <- function(x, y, threshold = 1e-10, maxIter = 100)
{
  calcPi <- function(x, beta)
  {
    beta <- as.vector(beta)
    return(exp(x %*% beta) / (1 + exp(x %*% beta)))
  }
  beta <- rep(0, ncol(x)) # initial guess for beta
  diff <- 1000
  # initial value bigger than threshold so that we can enter our while loop
  iterCount = 0
  # counter for the iterations to ensure we're not stuck in an infinite loop
  while(diff > threshold) # tests for convergence
  {
    pi <- as.vector(calcPi(x, beta))
    # calculate pi by using the current estimate of beta
    W <- diag(pi * (1 - pi))
    # calculate the matrix of weights W as defined in the Fisher scoring algorithm
    beta_change <- solve(t(x) %*% W %*% x) %*% t(x) %*% (y - pi)
    # calculate the change in beta
    beta <- beta + beta_change # new beta
    diff <- sum(beta_change^2)
    # calculate how much we changed beta by in this iteration
    # if this is less than threshold, we'll break the while loop
    iterCount <- iterCount + 1
    # see if we've hit the maximum number of iterations
    if(iterCount > maxIter){
      stop("This isn't converging.")
    }
    # stop if we have hit the maximum number of iterations
  }
  n <- length(y)
  df <- length(y) - ncol(x)
  # calculating the degrees of freedom by taking the length of y minus
  # the number of x columns
  vcov <- solve(t(x) %*% W %*% x)
  logLik <- sum(y * log(pi / (1 - pi)) + log(1 - pi))
  deviance <- -2 * logLik
  AIC <- -2 * logLik + 2 * ncol(x)
  rank <- ncol(x)
  list(coefficients = beta, vcov = vcov, df = df, deviance = deviance,
       AIC = AIC, iter = iterCount - 1, x = x, y = y, n = n, rank = rank)
  # returning results
}
logReg <- function(formula, data)
{
  if (sum(is.na(data)) > 0) {
    print("missing values in data")
  } else {
    mf <- model.frame(formula = formula, data = data)
    # model.frame() returns a data.frame with the variables needed to use the
    # formula.
    x <- model.matrix(attr(mf, "terms"), data = mf)
    # model.matrix() creates a design matrix. That means that, for example, the
    # "Sex" variable is coded as a dummy variable of ones and zeros.
    y <- as.numeric(model.response(mf)) - 1
    # model.response() gives us the response variable.
    est <- logRegEst(x, y)
    # Now we have the starting position to apply our function from above.
    est$formula <- formula
    est$call <- match.call()
    # We add the formula and the call to the list.
    nullModel <- logRegEst(x = as.matrix(rep(1, length(y))), y)
    est$nullDeviance <- nullModel$deviance
    est$nullDf <- nullModel$df
    mu <- exp(as.vector(est$x %*% est$coefficients)) /
      (1 + exp(as.vector(est$x %*% est$coefficients)))
    # computing the fitted values
    est$residuals <- (est$y - mu) / sqrt(mu * (1 - mu))
    est$mu <- mu
    est$x <- x
    est$y <- y
    est$data <- data
    hat <- (t(mu))^(1/2) %*% x %*% (t(x) %*% mu %*% x)^(-1) %*% t(x) %*% mu^(1/2)
    est$stdresiduals <- est$residuals / (sqrt(1 - hat))
    class(est) <- "logReg"
    # defining the class
    est
  }
}
I am struggling when it comes to calculating $H = \hat{V}^{1/2} X (X^\top \hat{V} X)^{-1} X^\top \hat{V}^{1/2}$. This is called hat in my code.
When I try to calculate the hat matrix (hat), I get the error that I cannot multiply the vector mu and the matrix x in this case: t(x) %*% mu %*% x.
I can see that the dimensions are not conformable, and therefore I can't multiply them.
Can anyone see where my mistake is? Help is very appreciated. Thanks!
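For what it's worth, a minimal sketch of that computation: $\hat{V}$ must be the diagonal matrix with entries $\mu_i(1-\mu_i)$, not the vector mu itself, and the matrix inverse is solve(); ^(-1) in R is an element-wise reciprocal:

# a sketch, using mu and x as computed inside logReg() above
V <- diag(mu * (1 - mu))             # n x n diagonal matrix of variances
Vhalf <- diag(sqrt(mu * (1 - mu)))   # its matrix square root, V^(1/2)
H <- Vhalf %*% x %*% solve(t(x) %*% V %*% x) %*% t(x) %*% Vhalf
h <- diag(H)                         # leverages: only the diagonal is needed
est$stdresiduals <- est$residuals / sqrt(1 - h)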

replace NA by truncated normal distribution values in r

I am trying to replace NAs with values drawn from a truncated normal distribution.
First I used sample as follows, and the function worked:
v.new <- replace(vector, v, sample(8, length(v), replace = FALSE))
However, when I try to use rtnorm it doesn't seem to work: I don't get any error messages, but it takes ages to replace the NAs with values from the desired interval. Any suggestion to make this work?
library(msm)
# Some data
data("airquality")
airquality$Ozone
# My function
add.trunc.to.NAvector <- function(vector){
  v <- NULL
  for(i in 1:length(vector)){
    if(is.na(vector[i]) == TRUE)
      v <- append(v, i)
  }
  mean.val <- mean(vector)
  sd.val <- sd(vector)
  min.val <- mean.val - 4 * sd.val
  max.val <- mean.val + 4 * sd.val
  v.new <- replace(vector, v, rtnorm(length(v), lower = min.val, upper = max.val))
  return(v.new)
}
Shouldn't this work? Note that mean(vector) and sd(vector) are NA when the vector contains NAs, so the bounds in your version were NA; compute them on the complete cases instead, and pass the mean and sd to rtnorm as well:
v <- airquality$Ozone
v.new <- v
indices <- which(is.na(v))
m <- mean(v[-indices])
s <- sd(v[-indices])
v.new[indices] <- rtnorm(length(indices), mean = m, sd = s, lower = m - 4*s, upper = m + 4*s)
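Folded back into the original function, a sketch (assuming msm::rtnorm and that the NA positions should get draws centered at the observed mean and sd):

library(msm)

add.trunc.to.NAvector <- function(vector){
  idx <- which(is.na(vector))        # positions of the NAs
  m <- mean(vector, na.rm = TRUE)    # na.rm: otherwise mean/sd are NA
  s <- sd(vector, na.rm = TRUE)
  vector[idx] <- rtnorm(length(idx), mean = m, sd = s,
                        lower = m - 4*s, upper = m + 4*s)
  vector
}
summary(add.trunc.to.NAvector(airquality$Ozone))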

setting upper and lower limits in rnorm

I am simulating data using rnorm, but I need to set an upper and a lower limit. Does anyone know how to do this?
code:
rnorm(n = 10, mean = 39.74, sd = 25.09)
The upper limit needs to be 340, and the lower limit 0.
I am asking this question because I am rewriting SAS code in R. I have never used SAS.
I am trying to rewrite the following piece of code:
sim_sample(simtot=100000,seed=10004,lbound=0,ubound=340,round_y=0.01,round_m=0.01,round_sd=0.01,n=15,m=39.74,sd=25.11,mk=4)
The rtruncnorm() function will return the results you need.
library(truncnorm)
rtruncnorm(n = 10, a = 0, b = 340, mean = 39.74, sd = 25.09)
You can make your own truncated normal sampler that doesn't require you to throw out observations quite simply:
rtnorm <- function(n, mean, sd, a = -Inf, b = Inf){
  qnorm(runif(n, pnorm(a, mean, sd), pnorm(b, mean, sd)), mean, sd)
}
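For example, with the question's numbers (an assumed usage; this inverse-CDF approach returns exactly n draws, all within [a, b]):

set.seed(1)
rtnorm(10, mean = 39.74, sd = 25.09, a = 0, b = 340)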
Like this?
mysamp <- function(n, m, s, lwr, upr, nnorm) {
  samp <- rnorm(nnorm, m, s)
  samp <- samp[samp >= lwr & samp <= upr]
  if (length(samp) >= n) {
    return(sample(samp, n))
  }
  stop(simpleError("Not enough values to sample from. Try increasing nnorm."))
}
set.seed(42)
mysamp(n=10, m=39.74, s=25.09, lwr=0, upr=340, nnorm=1000)
#[1] 58.90437 38.72318 19.64453 20.24153 39.41130 12.80199 59.88558 30.88578 19.66092 32.46025
However, the result is not normally distributed and usually won't have the mean and sd you specified (in particular if the limits are not symmetric around the specified mean).
Edit:
According to your comment it seems you want to translate this SAS function. I am not an SAS user, but this should do more or less the same:
mysamp <- function(n, m, s, lwr, upr, rounding) {
  samp <- round(rnorm(n, m, s), rounding)
  samp[samp < lwr] <- lwr
  samp[samp > upr] <- upr
  samp
}
set.seed(8)
mysamp(n=10, m=39.74, s=25.09, lwr=0, upr=340, rounding=3)
#[1] 37.618 60.826 28.111 25.920 58.207 37.033 35.467 12.434 0.000 24.857
You may then want to use replicate to run the simulations; a sketch of 100 runs with the mysamp above:
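sims <- replicate(100, mysamp(n=10, m=39.74, s=25.09, lwr=0, upr=340, rounding=3))
# each column of sims is one simulated sample of 10 values
Or if you want faster code: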
sim <- matrix(mysamp(n=10*10, m=39.74, s=25.09, lwr=0, upr=340, rounding=3), 10)
means <- colMeans(sim)
sds <- apply(sim, 2, sd)
Assuming you want exactly 10 numbers and not just the subset of them that is > 0 and < 340 (which might not be a normal distribution):
aa <- rnorm(n = 10, mean = 39.74, sd = 25.09)
while(any(aa < 0 | aa > 340)) { aa <- rnorm(n = 10, mean = 39.74, sd = 25.09) }
This is the function that I wrote to achieve the same purpose. It normalizes the result from the rnorm function and then rescales it to fit the range.
NOTE: The standard deviation and mean (if specified) are altered during the normalization process.
#' Creates a random normal distribution within the specified bounds.
#'
#' WARNING: This function does not preserve the standard deviation or mean.
#' @param n The number of values to be generated
#' @param mean The mean of the distribution
#' @param sd The standard deviation of the distribution
#' @param lower The lower limit of the distribution
#' @param upper The upper limit of the distribution
rtnorm <- function(n, mean=NA, sd=1, lower=-1, upper=1){
  mean <- ifelse(is.na(mean) || mean < lower || mean > upper,
                 mean(c(lower, upper)), mean)
  data <- rnorm(n, mean=mean, sd=sd) # data
  if (!is.na(lower) && !is.na(upper)){ # adjust data to specified range
    drange <- range(data)                                # data range
    irange <- range(lower, upper)                        # input range
    data <- (data - drange[1])/(drange[2] - drange[1])   # normalize data (make it 0 to 1)
    data <- (data * (irange[2] - irange[1])) + irange[1] # rescale to specified range
  }
  return(data)
}
There are several ways to set upper and lower limits on a normal distribution, all of which cause the result to no longer be normally distributed.
Assume mean = 0 and sd = 1, producing N = 1e5 values with a lower boundary of LO = -1 and an upper boundary of UP = 2.
N <- 1e5L
LO <- -1
UP <- 2
Move outliers to the border (@Roland):
set.seed(42)
x <- pmax(LO, pmin(UP, rnorm(N)))
mean(x)
#[1] 0.07238029
median(x)
#[1] -0.002066374
sd(x)
#[1] 0.8457605
hist(x, 30)
Cut outliers off (@Dason, @Roland, truncnorm::rtruncnorm, MCMCglmm::rtnorm):
set.seed(42)
x <- qnorm(runif(N, pnorm(LO), pnorm(UP)))
mean(x)
#[1] 0.2317875
median(x)
#[1] 0.173679
sd(x)
#[1] 0.7236536
Scale (@Alex Essilfie):
set.seed(42)
x <- rnorm(N)
x <- (x-min(x))/(max(x)-min(x))*(UP-LO)+LO
mean(x)
#[1] 0.4474876
median(x)
#[1] 0.4482257
sd(x)
#[1] 0.3595199
Combination of methods. E.g. Cut and scale:
set.seed(42)
x <- qnorm(runif(N, pnorm(-3), pnorm(3)))
x <- (x-min(x))/(max(x)-min(x))*(UP-LO)+LO
mean(x)
#[1] 0.5010759
median(x)
#[1] 0.5014713
sd(x)
#[1] 0.4957751
Asymmetric combination
set.seed(42)
n <- round(N*abs(LO)/diff(range(c(LO, UP))))
x <- c(qnorm(runif(n, pnorm(-3), 0.5)), qnorm(runif(N-n, 0.5, pnorm(3))))
x <- ifelse(x < 0, x/min(x)*LO, x/max(x)*UP)
mean(x)
#[1] 0.2651627
median(x)
#[1] 0.2127903
sd(x)
#[1] 0.5078264
