How to represent the t-statistic when the outcome is binary? (R)
The question is given like this:
Read the file diabetes.csv. There are two variables called BMI and Outcome. The variable Outcome takes on only two values: 0 and 1. Conduct a non-parametric two-sample test of the hypothesis that the standard deviation of BMI is the same for both Outcome values.
bmi <- diabetes$BMI
bmi
outcome <- diabetes$Outcome
outcome
n <- length(bmi)
# tstat
tstat <- ???
# Describe the population and draw synthetic samples
f1 <- function()
{
  x <- c(bmi, outcome)
  x <- sample(x)
  m1 <- sd(x[1:n])
  m2 <- sd(x[(n+1):length(x)])
  return(m1 - m2)
}
# Create sampling distribution
sdist <- replicate(10000, f1())
plot(density(sdist))
# Gap
gap <- abs(mean(sdist) - tstat)
abline(v = mean(sdist) + c(-1,1) * gap, col = "dark orange")
s1 <- sdist[sdist < (mean(sdist) - gap) | sdist > (mean(sdist) + gap)]
pvalue <- length(s1) / length(sdist)
pvalue
The data are in a dataset called "diabetes". My question is how to represent the "t-statistic", since the outcome is binary?
The test statistic here isn't really a t-statistic: for this permutation test, use the absolute difference between the standard deviations of BMI in the two Outcome groups. Only the BMI values get shuffled; Outcome is used just to fix the two group sizes. Use this code:
# Sort the table diabetes in ascending order of Outcome to separate the BMI
# values with Outcome = 0 from the BMI values with Outcome = 1
diabetes = diabetes[order(diabetes$Outcome),]
View(diabetes)
# Find the number of values with outcome = 0
n = length(which(diabetes$Outcome == 0))
# Find total number of rows
l = length(diabetes$BMI)
# Find BMI values to create the sample later on
g = diabetes$BMI
# Create function to take the values of BMI and shuffle it every time and
# to find the difference between the standard deviations
f1 = function()
{
  x = sample(g)
  z = abs(sd(x[1:n]) - sd(x[(n+1):l]))
  return(z)
}
# Replicate the function several times
dist = replicate(100000,f1())
# Plot density of distribution
plot(density(dist))
polygon(density(dist),col="green")
diabetes0 = diabetes[diabetes$Outcome == 0,]
diabetes1 = diabetes[diabetes$Outcome == 1,]
View(diabetes0)
View(diabetes1)
# Find the difference between standard deviation of BMI when outcome = 0 and
# when outcome = 1
tstat = abs(sd(diabetes0$BMI) - sd(diabetes1$BMI))
tstat
abline(v=tstat)
rside = dist[dist>tstat]
pvalue = length(rside)/length(dist)
pvalue
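As a quick sanity check (not part of the assignment), base R also offers a non-parametric test of homogeneity of variance; a minimal sketch, assuming the diabetes data frame is already loaded:
# Fligner-Killeen test of equal variances (and hence equal SDs) of BMI across Outcome
fligner.test(BMI ~ factor(Outcome), data = diabetes)
The p-value won't match the permutation result exactly, but it should typically point the same way.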
Related
How to fix a vector size discrepancy in lmeInfo::g_mlm, between r_const and utils::combn(1:cor_q, 2): n < m
Using this code block in a fresh install of R:
if (!require('readr')) install.packages('readr')
data <- as.data.frame(readr::read_delim("/Users/{le me}/Desktop/{path}/Histology Data for R.csv", delim = ","))
DataGroupLabel <- "3groupAnova"
subdata <- data[3:80, 1:7] # Select a single trial of data to analyze for 3-way ANOVA
colnames(subdata) <- c("Tx", "XCoordinate", "Channel", "VarianceLabel",
                       "VarianceCode", "BrainRegion", "CellCount") # give the data simple labels
subdata$CellCount <- as.numeric(subdata$CellCount) # convert CellCount to numeric
# Calculate intraclass correlation (ICC)
ANOVAresult <- aov(CellCount ~ XCoordinate, data = subdata)
ANOVAtable <- summary(ANOVAresult)
names(ANOVAtable) <- "Ftable"
Atable <- as.data.frame(ANOVAtable$Ftable)
icc <- Atable[1,3]/(Atable[1,3] + Atable[2,3])
icc
# dummy code the 3 groups (sham = baseline = 0 in $CCI and 0 in $Xpro)
subdata$CCI <- 0
subdata$CCI[subdata$Tx=="CCI"] <- 1
subdata$Xpro <- 0
subdata$Xpro[subdata$Tx=="Xpro"] <- 1
subdata$CCI <- as.factor(subdata$CCI)
subdata$Xpro <- as.factor(subdata$Xpro)
#subdata$XCoordinate <- type_convert(subdata$XCoordinate)
head(subdata, 100) # Preview & inspect
if (!require('nlme')) install.packages('nlme')
# fit multilevel model to run t-test
HCSresult <- nlme::gls(CellCount ~ CCI + Xpro,
                       # heterogeneous variances
                       weights = varIdent(form = ~1|VarianceCode),
                       correlation = corSymm(value = numeric(0), form = ~1|XCoordinate, fixed = T),
                       data = subdata)
residual <- HCSresult$residuals
# setup plots for inspection
par(mfrow = c(1,2))
h <- hist(residual, xlab = "Residual")
xfit <- seq(min(residual), max(residual), length = 40)
yfit <- dnorm(xfit, mean = mean(residual), sd = sd(residual))
yfit <- yfit * diff(h$mids[1:2]) * length(residual)
lines(xfit, yfit, col = "black", lwd = 2)
qqnorm(residual, frame = FALSE)
qqline(residual)
par(mfrow = c(1,1))
summary(HCSresult)
# fit null model for comparison
NULresult <- nlme::gls(CellCount ~ 1,
                       # heterogeneous variances
                       weights = varIdent(form = ~1|VarianceCode),
                       correlation = corSymm(value = numeric(0), form = ~1|XCoordinate, fixed = T),
                       data = subdata)
logLik(NULresult, REML = FALSE)
logLik(HCSresult, REML = FALSE)
chidiff <- -2*as.numeric(logLik(NULresult, REML = FALSE) - logLik(HCSresult, REML = FALSE))
chidiff # print delta chi-square value
qchisq(.95, df = 2) # print critical value
round(pchisq(chidiff, df = 2, lower.tail = FALSE), 4) # p-value
if (!require('lmeInfo')) install.packages('lmeInfo')
# asymmetric conf interval for ES
ctrl <- lmeControl(opt = 'optim')
result <- lme(fixed = CellCount ~ CCI + Xpro, control = ctrl,
              random = ~ 1 | XCoordinate,
              correlation = corSymm(value = numeric(0), form = ~1|XCoordinate, fixed = T),
              # heterogeneous variances
              weights = varIdent(form = ~1|VarianceCode),
              data = subdata)
#control=lmeControl(opt="optim") # Switching optimizers for occasional convergence errors
summary(result)
# extract_varcomp(result, separate_variances = TRUE)
ESresult <- lmeInfo::g_mlm(result, p_const = c(0,1,1),
                           r_const = c(1,1,0,0,1,1,1),
                           infotype = "expected",
                           separate_variances = TRUE)
summary(ESresult)
CI_g(ESresult, symmetric = FALSE)
I fixed a bunch of errors in this to get it to run all the way to the final ESresult <- lmeInfo::g_mlm() call. Unfortunately, the lmeInfo package's GitHub page (https://github.com/jepusto/lmeInfo) doesn't seem to go into detail on how to specify the r_const = c() vector. My advisor said to use these packages to run this analysis, but is functionally unavailable to teach me how to use them. Specifically, how should I understand the string of integers in r_const?
I think I'll need to adjust r_const for other blocks of data in this set, as many of the blocks have unique sample sizes. Got this error:
Error in utils::combn(1:cor_q, 2) : n < m
...against the following example block of data:
CCI,2108,FITC,Ipsi Group 1,A,Cortex,12.2222222
CCI,1218,FITC,Ipsi Group 1,A,Cortex,19.4736842
CCI,2380,FITC,Ipsi Group 1,A,Cortex,15
CCI,4479,FITC,Ipsi Group 1,A,Cortex,9.47368421
CCI,949,FITC,Ipsi Group 1,A,Cortex,25.2631579
CCI,2129,FITC,Ipsi Group 1,A,Cortex,0
CCI,618,FITC,Ipsi Group 1,A,Cortex,15.5555556
CCI,5061,FITC,Ipsi Group 1,A,Cortex,28.3333333
CCI,2768,FITC,Ipsi Group 1,A,Cortex,25
CCI,2733,FITC,Ipsi Group 1,A,Cortex,31
CCI,2063,FITC,Ipsi Group 1,A,Cortex,25.5
CCI,5307,FITC,Ipsi Group 1,A,Cortex,36
CCI,1923,FITC,Ipsi Group 1,A,Cortex,16.3157895
CCI,9414,FITC,Contra Group 1,D,Cortex,1.05263158
CCI,9123,FITC,Contra Group 1,D,Cortex,4
CCI,10723,FITC,Contra Group 1,D,Cortex,5.2631579
CCI,9737,FITC,Contra Group 1,D,Cortex,6.5
CCI,7422,FITC,Contra Group 1,D,Cortex,8.42105263
CCI,9012,FITC,Contra Group 1,D,Cortex,5
CCI,8924,FITC,Contra Group 1,D,Cortex,5
CCI,9550,FITC,Contra Group 1,D,Cortex,9
CCI,8070,FITC,Contra Group 1,D,Cortex,6
CCI,8734,FITC,Contra Group 1,D,Cortex,7.5
CCI,10154,FITC,Contra Group 1,D,Cortex,9.5
CCI,10203,FITC,Contra Group 1,D,Cortex,38.5
CCI,11305,FITC,Contra Group 1,D,Cortex,24.4444444
Sham,803,FITC,Ipsi Group 2,B,Cortex,1.49342891
Sham,1937,FITC,Ipsi Group 2,B,Cortex,1.88101926
Sham,278,FITC,Ipsi Group 2,B,Cortex,1.38966092
Sham,1414,FITC,Ipsi Group 2,B,Cortex,0
Sham,2762,FITC,Ipsi Group 2,B,Cortex,2.86418056
Sham,3218,FITC,Ipsi Group 2,B,Cortex,1.06126694
Sham,157,FITC,Ipsi Group 2,B,Cortex,5.68860572
Sham,1174,FITC,Ipsi Group 2,B,Cortex,3.09463391
Sham,4481,FITC,Ipsi Group 2,B,Cortex,5.33253701
Sham,4011,FITC,Ipsi Group 2,B,Cortex,4.53607312
Sham,2784,FITC,Ipsi Group 2,B,Cortex,1.1154863
Sham,2379,FITC,Ipsi Group 2,B,Cortex,1.05037604
Sham,1417,FITC,Ipsi Group 2,B,Cortex,0
Sham,6739,FITC,Contra Group 2,E,Cortex,9.59452079
Sham,5166,FITC,Contra Group 2,E,Cortex,2.18957325
Sham,7252,FITC,Contra Group 2,E,Cortex,5.16175655
Sham,7115,FITC,Contra Group 2,E,Cortex,9.05843988
Sham,8000,FITC,Contra Group 2,E,Cortex,11.5952485
Sham,5843,FITC,Contra Group 2,E,Cortex,0
Sham,6361,FITC,Contra Group 2,E,Cortex,4.84261501
Sham,8444,FITC,Contra Group 2,E,Cortex,6.10209357
Sham,8247,FITC,Contra Group 2,E,Cortex,5.22677678
Sham,5260,FITC,Contra Group 2,E,Cortex,3.88888889
Sham,7526,FITC,Contra Group 2,E,Cortex,3.97645936
Sham,7495,FITC,Contra Group 2,E,Cortex,5.7390456
Sham,8052,FITC,Contra Group 2,E,Cortex,3.17302506
Xpro,2845,FITC,Ipsi Group 3,C,Cortex,33.6842105
Xpro,1379,FITC,Ipsi Group 3,C,Cortex,9.09090909
Xpro,1592,FITC,Ipsi Group 3,C,Cortex,21.6666667
Xpro,968,FITC,Ipsi Group 3,C,Cortex,6.25
Xpro,2982,FITC,Ipsi Group 3,C,Cortex,17.2222222
Xpro,1212,FITC,Ipsi Group 3,C,Cortex,6.31578947
Xpro,2563,FITC,Ipsi Group 3,C,Cortex,29
Xpro,4615,FITC,Ipsi Group 3,C,Cortex,30
Xpro,630,FITC,Ipsi Group 3,C,Cortex,23
Xpro,5014,FITC,Ipsi Group 3,C,Cortex,27
Xpro,1409,FITC,Ipsi Group 3,C,Cortex,16.5
Xpro,3969,FITC,Ipsi Group 3,C,Cortex,23
Xpro,3527,FITC,Ipsi Group 3,C,Cortex,15
Xpro,7253,FITC,Contra Group 3,F,Cortex,7
Xpro,7891,FITC,Contra Group 3,F,Cortex,9
Xpro,6711,FITC,Contra Group 3,F,Cortex,15.625
Xpro,7830,FITC,Contra Group 3,F,Cortex,11.1111111
Xpro,7218,FITC,Contra Group 3,F,Cortex,10.2857143
Xpro,7849,FITC,Contra Group 3,F,Cortex,4.57085395
Xpro,7303,FITC,Contra Group 3,F,Cortex,12.6019498
Xpro,6928,FITC,Contra Group 3,F,Cortex,33.8199036
Xpro,7991,FITC,Contra Group 3,F,Cortex,5.81483248
Xpro,8452,FITC,Contra Group 3,F,Cortex,1.28539661
Xpro,6717,FITC,Contra Group 3,F,Cortex,36.0048967
Xpro,7328,FITC,Contra Group 3,F,Cortex,23.0207289
Xpro,5772,FITC,Contra Group 3,F,Cortex,7
So, I've troubleshot all the way to the end and I'm stuck here on the last lines of code. Much appreciation to anyone for wisdom.
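A hedged pointer on r_const, assuming (this is an assumption, worth verifying against the lmeInfo documentation) that it supplies one constant per variance-component parameter of the fitted lme model, which would also explain why combn(1:cor_q, 2) fails when a data block changes the size of the correlation structure: the parameter vector it must line up with can be inspected with the extract_varcomp() call that is commented out in the code above.
# Assumption: r_const needs one entry per parameter reported here, in the same order
lmeInfo::extract_varcomp(result, separate_variances = TRUE)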
How can I extract confidence intervals for all non-reference-level continuous variable coefficients from one lmer model?
I'm fitting a y ~ v + m + s + m:s + (1|subunit) model with lmer(). s is a continuous variable interacting with m, a categorical factor with 3 levels: A, B, and C. Fitting the model uses A as the reference level for factor m:
fit_ref_A <- lmer(y ~ 1 + v + m*s + (1|subunit), data=df)
The parameter estimates for B and C can just be calculated using the estimate for A and the differences for B and C. I'm interested in extracting the confidence intervals. Running confint() gives the confidence interval for the slope of variable s at A. I'm also interested in the confidence intervals of the slopes of s at B and C, not just the confidence intervals for the differences from the slope at A. Is there a way to extract this from fit_ref_A? So far the only thing I've been able to figure out is to relevel with B as the reference and fit a second model, then relevel with C as the reference and fit a third model.
Question: Is there a way to extract everything (especially the confidence intervals) from fit_ref_A?
Code:
library(lme4)
# create the dataset, unbalanced at the lowest stratum (2 repeats for m==A instead of 3)
set.seed(2)
s_levels <- 1:5
m_levels <- c("A", "B", "C")
v_levels <- c("L2", "L3", "L4")
reps <- 1:3
df <- expand.grid(rep=reps, s=s_levels, m=m_levels, v=v_levels)
df$subunit <- as.factor(paste(df$v, "-", df$m, "-", df$s, sep=""))
df$y <- rnorm(nrow(df), 0, 1)
df <- subset(df, !(rep==3 & m=="A")) # drop the 3rd repeat for m=="A"
table(df$m) # shows 30 for A, 45 for B, 45 for C as expected
# fit 3 different models, with three different reference levels for 'm'
fit_ref_A <- lmer(y ~ 1 + v + m*s + (1|subunit), data=df)
df$m <- relevel(df$m, ref = "B")
fit_ref_B <- lmer(y ~ 1 + v + m*s + (1|subunit), data=df)
df$m <- relevel(df$m, ref = "C")
fit_ref_C <- lmer(y ~ 1 + v + m*s + (1|subunit), data=df)
# Calculate the confidence intervals for the continuous variable s at the three
# different levels of categorical factor m. Must use 3 separate fits.
cis_at_m_reference_A <- confint(fit_ref_A)
cis_at_m_reference_B <- confint(fit_ref_B)
cis_at_m_reference_C <- confint(fit_ref_C)
cis_at_m_reference_A["s",]
cis_at_m_reference_B["s",]
cis_at_m_reference_C["s",]
# Any way to just extract all three from fit_ref_A?
You can get approximate CIs with Gaussian error propagation:
sum(fixef(fit_ref_A)[c("s", "mB:s")]) +
  c(-1.96, 1.96) * sqrt(sum(vcov(fit_ref_A)[c("s", "mB:s"), c("s", "mB:s")]))
#[1] -0.3346310  0.1863014
Or you could bootstrap:
myboot <- bootMer(fit_ref_A, function(x) {
  cf <- fixef(x)
  c(sA = cf[["s"]],
    sB = cf[["s"]] + cf[["mB:s"]],
    sC = cf[["s"]] + cf[["mC:s"]])
}, nsim = 1e4, seed = 42)
apply(myboot$t, 2, quantile, probs = c(0.025, 0.975))
#              sA         sB         sC
#2.5%  -0.4022927 -0.3415690 -0.3969831
#97.5%  0.2041610  0.1858731  0.1266355
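If an extra dependency is acceptable, the emmeans package can also recover the per-level slopes of s and their confidence intervals from the single fit; a minimal sketch, assuming emmeans is installed:
library(emmeans)
# Slope of s within each level of m, estimated from fit_ref_A alone
slopes <- emtrends(fit_ref_A, ~ m, var = "s")
confint(slopes) # one CI per level of m (A, B, C)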
Simulating a mixed linear model and evaluating it with lmerTest in R
I am trying to understand how to use mixed linear models to analyse my data by simulating a model, but I can't reproduce the input parameters. What am I missing? I want to start by simulating a model with a random intercept for each subject. This is the model I want to simulate and recover:
y_ij = beta0_j + beta1*x_ij + e_ij, where beta0_j = gamma00 + gamma01*w_j + u0_j
If beta1 is small, I find gamma00 as the intercept in the fixed section, but I am completely unable to retrieve the slope (beta1). Also, the linear effect is not significant. Where is my conceptual mistake?
library(lmerTest)
# Generating data set
# General values and variables
numObj <- 20
numSub <- 100
e <- rnorm(numObj * numSub, mean = 0, sd = 0.1)
x <- scale(runif(numObj * numSub, min = -100, max = 100))
y <- c()
index <- 1
# Coefficients
gamma00 <- 18
gamma01 <- 0.5
beta1 <- -100
w <- runif(numSub, min = -3, max = 3)
uo <- rnorm(numSub, mean = 0, sd = 0.1)
meanBeta0 <- mean(gamma00 + gamma01*w + uo) # I should be able to retrieve that parameter.
for(j in 1:numSub){
  for(i in 1:numObj){
    y[index] <- gamma00 + gamma01*w[j] + uo[j] + beta1*x[i] + e[index]
    index <- index + 1
  }
}
dataFrame2 <- data.frame(y = y, x = x,
                         subNo = factor(rep(1:numSub, each = numObj)),
                         objNum = factor(rep(1:numObj, numSub)))
model2 <- lmer(y ~ x + (1 | subNo), data = dataFrame2)
summary(model2)
anova(model2)
No conceptual mistake here, just a mixed-up index: you should be using index rather than i to index x in your data-generation loop. Because of the mix-up, you were using the first subject's x values to generate data for all the subjects, while the model used each observation's own x value.
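Concretely, only the inner loop body changes; a minimal sketch of the corrected data-generation loop:
for(j in 1:numSub){
  for(i in 1:numObj){
    # index x by the running observation counter, not by the within-subject counter i
    y[index] <- gamma00 + gamma01*w[j] + uo[j] + beta1*x[index] + e[index]
    index <- index + 1
  }
}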
Attempting to use MonteCarlo package in R
I am trying to run a Monte Carlo simulation of a difference-in-differences estimator, but I am running into an error. Here is the code I am running:
# Set the random seed
set.seed(1234567)
library(MonteCarlo)
# Set up problem, doing this before calling the function
# set sample size
n <- 400
# set true parameters: betas and sd of u
b0 <- 1 # intercept for control data (b0 in diffndiff)
b1 <- 1 # shift on both control and treated after treatment (b1 in diffndiff)
b2 <- 2 # difference between intercept on control vs. treated (b2 - this is the level
        # difference pre-treatment to compare to coef on treat)
b3 <- 3 # shift after treatment that is only for treated group (b3 - this is the
        # coefficient of interest in diffndiff)
b4 <- 0 # parallel time trend (not measured in diffndiff); biases b0, b1 but not the b3 we care about
b5 <- 0 # allows for treated group trend to shift after treatment (0 if parallel trends holds)
su <- 4 # std. dev for errors
dnd <- function(n, b0, b1, b2, b3, b4, b5, su){
  # initialize a time vector (set observations equal to n)
  timelength <- 10
  t <- c(1:timelength)
  # allows for multiple observations in one time period
  # (can simulate multiple states within one group or something)
  num_obs_per_period <- n/timelength
  t0 <- c(1:timelength)
  for (p in 1:(num_obs_per_period-1)){
    t <- c(t, t0)
  }
  T <- 5 # set treatment period
  g <- t > T
  post <- as.numeric(g)
  # assign equal amounts of observations to each state to start with
  # (would like to allow selection into treatment at some point)
  treat <- vector()
  for (m in 1:(round(n/2))){
    treat <- c(treat, 0)
  }
  for (m in 1:(round(n/2))){
    treat <- c(treat, 1)
  }
  u <- rnorm(n, 0, su) # This assumes the mean error is zero
  # create my y vector now from the data
  y <- b0 + b1*post + b2*treat + b3*treat*post + b4*t + b5*(t-T)*treat*post + u
  interaction <- treat*post
  # run regression
  olsres <- lm(y ~ post + treat + interaction)
  olsres$coefficients
  # assign the coefficients
  bhat0 <- olsres$coefficients[1]
  bhat1 <- olsres$coefficients[2]
  bhat2 <- olsres$coefficients[3]
  bhat3 <- olsres$coefficients[4]
  bhat3_stderr <- coef(summary(olsres))[3, "Std. Error"]
  # Here I will use bhat3 to conduct a t-test and determine if this was a pass or a fail
  tval <- (bhat3 - b3)/bhat3_stderr
  # decision at 5% confidence I believe (FALSE indicates the t-stat was less than 1.96,
  # and we fail to reject the null)
  decision <- abs(tval) > 1.96
  decision <- unname(decision)
  return(list(decision))
}
# Define a parameter grid to simulate over
from <- -5
to <- 5
increment <- .25
gridparts <- c(from, to, increment)
b5_grid <- seq(from = gridparts[1], to = gridparts[2], by = gridparts[3])
parameter <- list("n" = n, "b0" = b0, "b1" = b1, "b2" = b2, "b3" = b3,
                  "b4" = b4, "b5" = b5_grid, "su" = su)
# Now simulate this multiple times in a monte carlo setting
results <- MonteCarlo(func = dnd, nrep = 100, param_list = parameter)
And the error that comes up is:
Error in results[[i]] <- array(NA, dim = c(dim_vec, nrep)) : attempt to select less than one element in integerOneIndex
This leads me to believe that somewhere something is attempting to access the "0th" element of a vector, which doesn't exist in R as far as I understand. I don't think the part that is doing this arises from my code rather than from internals of this package, however, and I can't make sense of the code that runs when I run the package. I am also open to hearing about other methods that will essentially replace simulate() from Stata.
The function passed to MonteCarlo() must return a list with named components. Changing line 76 to return(list("decision" = decision)) should work.
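In context, only the last line of dnd() changes; a minimal sketch:
# ...inside dnd(), after computing the test decision...
decision <- abs(tval) > 1.96
decision <- unname(decision)
return(list("decision" = decision)) # named component, as MonteCarlo requires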
Random draws from an ANOVA-like design with given population effect sizes
Let's say that you have a normally distributed variable y with a 3-group categorical predictor x that has the orthogonal contrasts c1 and c2. I am trying to create a program in R that, given x, c1, and c2, creates y such that c1 and c2 have effect sizes r1 and r2 specified by the user. For example, let's say that x, c1, c2, r1, and r2 were created like the following:
x <- factor(rep(c(1, 2, 3), 100))
contrasts(x) <- matrix(c(0, -.5, .5, -2/3, 1/3, 1/3), nrow = 3, ncol = 2,
                       dimnames = list(c("1", "2", "3"), c("c1", "c2")))
contrasts(x)
#     c1         c2
# 1  0.0 -0.6666667
# 2 -0.5  0.3333333
# 3  0.5  0.3333333
r1 <- .09
r2 <- 0
I would like the program to create y such that the variance in y accounted for by c1 equals r1 (.09) and the variance in y accounted for by c2 equals r2 (0). Does anybody know how I might go about this? I know that I should be using the rnorm function, but I'm stuck on which population means/sds rnorm should use when it does its sampling.
Courtesy of some generous advice from my colleagues, I now have one function that creates simulated data given a specified number of groups, a set of contrasts, a set of regression coefficients, a specified N per cell, and a specified within-group variance:
sim.factor <- function(levels, contr, beta, perCell, errorVar){
  # Build design matrix X
  X <- cbind(rep(1, levels*perCell), kronecker(contr, rep(1, perCell)))
  # Generate y
  y <- X %*% beta + rnorm(levels*perCell, sd = sqrt(errorVar))
  # Build and return data frame
  dat <- cbind.data.frame(y, X[,-1])
  names(dat)[-1] <- colnames(contr)
  return(dat)
}
I also wrote a function that, given a set of regression coefficients, an N per cell, a number of groups, a set of orthogonal contrasts, and a desired delta-R^2 for a contrast of interest, returns the required within-group variance:
ws.var <- function(levels, contr, beta, perCell, dc){
  # Build design matrix X
  X <- cbind(rep(1, levels), contr)
  # Generate the expected means
  means <- X %*% beta
  # Find the sum of squares due to each contrast
  var <- (t(means) %*% contr)^2 / apply(contr^2 / perCell, 2, sum)
  # Calculate the within-conditions sum of squares
  wvar <- var[1] / dc - sum(var)
  # Convert the sum of squares to variance
  errorVar <- wvar / (3 * (perCell - 1))
  return(errorVar)
}
After doing some testing as follows, the functions seem to generate the desired delta-R^2 for contrast c1:
library(lmSupport) # for lm.sumSquares
contr <- contr.helmert(3)
colnames(contr) <- c("c1", "c2")
beta <- c(0, 1, 0)
perCell <- 50
levels <- 3
dc <- .08
N <- 1000
# Calculate the error variance
errorVar <- ws.var(levels, contr, beta, perCell, dc)
# To store delta R^2 values
d1 <- vector("numeric", length = N)
# Use the functions
for(i in 1:N) {
  d <- sim.factor(levels=3, contr=contr, beta=beta, perCell=perCell, errorVar=errorVar)
  d1[i] <- lm.sumSquares(lm(y ~ c1 + c2, data = d))[1, 2] # From the lmSupport package
}
m <- round(mean(d1), digits = 3)
bmp("Testing simulation functions.bmp")
hist(d1, xlab = "Percentage of variance due to c1", main = "")
text(.18, 180, labels = paste("Mean =", m))
dev.off()
Patrick