Report t.test values efficiently in R

I have two groups, am.0 and am.1, and I would like to create a data frame that shows the mean (SD) of each group as well as the difference between means, with the standard error of the difference and t-test significance stars (***). Many thanks in advance.
library(dplyr)  # for %>%

mtcars$am <- factor(mtcars$am)
my.mtcars <- mtcars %>%
  dplyr::group_by(am) %>%
  dplyr::summarise(
    mean = mean(mpg),
    sd = sd(mpg)); my.mtcars
my.mtcars$sd <- paste0("(", round(my.mtcars$sd, 2), ")"); my.mtcars
The expected answer is a data frame:

       am.0          am.1          diff (SD)
mpg    17.1 (3.83)   24.4 (6.17)   17.1-24.4 (SD)**
disp   ...           ...           ...
gear   ...           ...           ...

where SD is the standard error of the difference between two independent samples:

SD = sqrt(s1^2/n1 + s2^2/n2)
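For reference, this standard error can be computed directly from the grouped summaries; a minimal sketch with mtcars (my illustration, not part of the expected output):

library(dplyr)

welch_se <- mtcars %>%
  group_by(am) %>%
  summarise(s2 = var(mpg), n = n()) %>%
  summarise(se = sqrt(sum(s2 / n)))  # sqrt(s1^2/n1 + s2^2/n2)
welch_se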

This is my rather brute attempt:
my_signif <- function(x, digits) floor(x) + signif(x %% 1, digits)

t.test.df <- function(x, y, df) {
  # Welch t-test of the numeric column x by the two-level factor y
  t <- t.test(eval(parse(text = x)) ~ eval(parse(text = y)), df)
  p <- t$p.value
  sig <- ifelse(p < 0.001, "***", ifelse(p < 0.01, "**", ifelse(p < 0.05, "*", "")))
  est1 <- my_signif(t$estimate[1], 2)
  est2 <- my_signif(t$estimate[2], 2)
  sd1 <- my_signif(sd(df[[x]][df[[y]] == levels(df[[y]])[1]]), 2)
  sd2 <- my_signif(sd(df[[x]][df[[y]] == levels(df[[y]])[2]]), 2)
  out <- data.frame(a = paste0(est1, " (", sd1, ")"),
                    b = paste0(est2, " (", sd2, ")"),
                    c = paste0(est1 - est2, " (", sig, ")"),
                    row.names = x)
  colnames(out) <- c(paste0(y, levels(df[[y]])[1]), paste0(y, levels(df[[y]])[2]), "diff")
  out
}
t.test.df("mpg","am",mtcars)
output:
> t.test.df("mpg","am",mtcars)
am0 am1 diff
mpg 17.15 (3.83) 24.39 (6.17) -7.24 (**)
Further:

t.test.df2 <- function(cols, y, df) do.call(rbind, lapply(cols, function(x) t.test.df(x, y, df)))

cols <- c("mpg", "disp", "gear")
t.test.df2(cols, "am", mtcars)

output:

                 am0           am1          diff
mpg     17.15 (3.83)  24.39 (6.17)    -7.24 (**)
disp 290.38 (110.17) 143.53 (87.2)  146.85 (***)
gear     3.21 (0.42)   4.38 (0.51)   -1.17 (***)
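As an aside, the eval(parse()) construction can be avoided with reformulate(), which builds the formula from the two column names; a hedged sketch (t_welch is my name, not from the post):

t_welch <- function(x, y, df) t.test(reformulate(y, response = x), data = df)
t_welch("mpg", "am", mtcars)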

Related

How do I change the confidence interval display format

I'm looking to change the format of the 95% CI from (0.1 -- 0.6) to (0.1 to 0.6) or (0.1, 0.6), using the epikit::unite_ci() function:
https://cran.r-project.org/web/packages/epikit/vignettes/intro.html
In general, you can substitute text strings with gsub:
gsub(" --", ",",x = "(0.1 -- 0.6)")
gsub("to", ",",x = "(0.1 -- 0.6)")
If you want a more appropriate answer, please provide a small reproducible example.
I suggest modifying the fmt_ci function of epikit as follows:
library(epikit)

fmt_ci <- function(e = numeric(), l = numeric(), u = numeric(), digits = 2,
                   percent = TRUE) {
  stopifnot(is.numeric(e), is.numeric(l), is.numeric(u), is.numeric(digits))
  # Below the modified row
  msg <- "%s (CI %.2f to %.2f)"
  msg <- gsub("2", digits, msg)  # swap the 2s in the format string for the requested digits
  fun <- if (percent)
    match.fun(scales::percent)
  else match.fun(scales::number)
  e <- fun(e, scale = 1, accuracy = 1 / (10^digits), big.mark = ",")
  sprintf(msg, e, l, u)
}
# Replace fmt_ci in epikit with the above modified function
assignInNamespace("fmt_ci", fmt_ci, pos="package:epikit")
Running the code:
fit <- lm(100/mpg ~ disp + hp + wt + am, data = mtcars)
df <- data.frame(v = names(coef(fit)), e = coef(fit), confint(fit), row.names = NULL)
names(df) <- c("variable", "estimate", "lower", "upper")
print(df)
out <- unite_ci(df, "slope (CI)", estimate, lower, upper, m100 = FALSE, percent = FALSE)
print(out)
now you get:
variable slope (CI)
1 (Intercept) 0.74 (-0.77 to 2.26)
2 disp 0.00 (-0.00 to 0.01)
3 hp 0.01 (-0.00 to 0.01)
4 wt 1.00 (0.38 to 1.62)
5 am 0.16 (-0.61 to 0.93)

How to add a column to an lm-based flextable

Let's say I make a model with lm such as
library(flextable)
set.seed(123)
mydata <- data.frame(y=runif(100,1,100), x1=runif(100,1,100), x2=runif(100,1,100))
model <- lm(y~x1+x2, data=mydata)
as_flextable(model)
This gives me a flextable with the Estimate, Standard Error, t value, and Pr(>|t|). Now suppose I want to add a column to the flextable; for instance, if my y is logged, a column that shows exp(model$coefficients) - 1.
Is there a straightforward way to do that or do I have to recreate the table from scratch?
Looking at the source code of flextable's as_flextable.lm function, it's clear there is no built-in way to do it. I made a "new" function by copying from the source.
library(dplyr)  # for %>% and mutate()

pvalue_format <- function(x) {
  z <- cut(x, breaks = c(-Inf, 0.001, 0.01, 0.05, 0.1, Inf),
           labels = c("***", "**", "*", ".", ""))
  as.character(z)
}

as_flextable_newcol <- function(x, new_cols = NULL) {
  data_t <- broom::tidy(x)
  data_g <- broom::glance(x)
  ## this is my addition
  if (!is.null(new_cols) && is.list(new_cols)) {
    for (i in names(new_cols)) {
      data_t <- data_t %>% mutate("{i}" := new_cols[[i]](term, estimate, std.error, p.value))
    }
  }
  ## end of my addition
  # names(new_cols) is appended so the new columns are actually displayed
  ft <- flextable(data_t, col_keys = c("term", "estimate", "std.error",
                                       "statistic", "p.value", "signif", names(new_cols)))
  ft <- colformat_double(ft, j = c("estimate", "std.error", "statistic"), digits = 3)
  ft <- colformat_double(ft, j = c("p.value"), digits = 4)
  ft <- compose(ft, j = "signif", value = as_paragraph(pvalue_format(p.value)))
  ft <- set_header_labels(ft, term = "", estimate = "Estimate",
                          std.error = "Standard Error", statistic = "t value",
                          p.value = "Pr(>|t|)", signif = "")
  dimpretty <- dim_pretty(ft, part = "all")
  ft <- add_footer_lines(ft, values = c(
    "Signif. codes: 0 <= '***' < 0.001 < '**' < 0.01 < '*' < 0.05 < '.' < 0.1 < '' < 1",
    "",
    sprintf("Residual standard error: %s on %.0f degrees of freedom",
            formatC(data_g$sigma), data_g$df.residual),
    sprintf("Multiple R-squared: %s, Adjusted R-squared: %s",
            formatC(data_g$r.squared), formatC(data_g$adj.r.squared)),
    # numerator df first, then residual df
    sprintf("F-statistic: %s on %.0f and %.0f DF, p-value: %.4f",
            formatC(data_g$statistic), data_g$df, data_g$df.residual, data_g$p.value)
  ))
  ft <- align(ft, i = 1, align = "right", part = "footer")
  ft <- italic(ft, i = 1, italic = TRUE, part = "footer")
  ft <- hrule(ft, rule = "auto")
  ft <- autofit(ft, part = c("header", "body"))
  ft
}
The new_cols parameter to this function needs to be a named list of functions, where the name of each function in the list becomes the new column name. Each function in the list takes term, estimate, std.error, and p.value as input, as those are the column names of the data_t tibble.
For example:
new_cols <- list(perc_change = function(term, estimate, std.error, p.value) {
  ifelse(term == "(Intercept)", "", paste0(round(100 * (exp(estimate) - 1), 0), "%"))
})
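A quick usage sketch, assuming the model defined in the question:

as_flextable_newcol(model, new_cols = new_cols)

This renders the usual lm flextable with an extra perc_change column showing exp(estimate) - 1 as a percentage.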

Making a function for matching on multiple dependent variables, purrr

I want to estimate the matched treatment effect using the Matching package on multiple dependent variables.
For just a single dependent variable, I can run the below which returns what I want:
library(carData)
library(purrr)
library(tidyverse)
library(Matching)
matching_df <- Mroz %>%
  mutate(wc = case_when(wc == "yes" ~ "TRUE",
                        wc == "no" ~ "FALSE")) %>%
  drop_na(k5, k618, age, wc, hc, lfp)
matching_df$wc <- as.logical(matching_df$wc)

ps1 <- glm(wc ~ k5 + k618 + age + hc,
           family = binomial, data = matching_df)
pscore <- ps1$fitted.values
matching_df <- cbind(matching_df, pscore)

Y <- matching_df$lfp
Tr <- as.logical(matching_df$wc)
psm1 <- Matching::Match(
  Y = Y,
  Tr = Tr,
  X = pscore,
  estimand = "ATT",
  M = 1,
  replace = TRUE,
  caliper = 0.05,
  version = "fast")
summary(psm1)
Estimate... 0.17479
SE......... 0.044963
T-stat..... 3.8873
p.val...... 0.00010135
Original number of observations.............. 753
Original number of treated obs............... 212
Matched number of observations............... 207
Matched number of observations (unweighted). 1074
Caliper (SDs)........................................ 0.05
Number of obs dropped by 'exact' or 'caliper' 5
But when I try to make a function using purrr::map_dfr so I can repeat this operation for multiple dependent variables, it returns an error. This is my attempt at the function:
vars <- c("lfp", "lwg", "inc")
names(vars) <- vars
matching_fcn <- function(.x){
  matching_df <- Mroz %>%
    mutate(wc = case_when(wc == "yes" ~ "TRUE",
                          wc == "no" ~ "FALSE")) %>%
    drop_na(k5, k618, age, wc, hc, .x)
  matching_df$wc <- as.logical(matching_df$wc)
  ps1 <- glm(wc ~ k5 + k618 + age + hc,
             family = binomial, data = matching_df)
  pscore <- ps1$fitted.values
  matching_df <- cbind(matching_df, pscore)
  Y <- matching_df$.x
  Tr <- as.logical(matching_df$wc)
  psm1 <- Matching::Match(
    Y = Y,
    Tr = Tr,
    X = pscore,
    estimand = "ATT",
    M = 1,
    replace = TRUE,
    caliper = 0.05,
    version = "fast")
  summary(psm1)
}

purrr::map_dfr(
  .x = all_of(vars),
  .f = matching_fcn)
Error: All columns in a tibble must be vectors.
x Column `lfp` is a `summary.Match` object.
x Column `lwg` is a `summary.Match` object.
x Column `inc` is a `summary.Match` object.
Run `rlang::last_error()` to see where the error occurred.
Ultimately, I would like a tibble with the name of the dependent variable in one column and the estimate, se, T-stat, and p.val returned by Matching::Match in the other columns.
summary(psm1) can't be put into a tibble, so pick the values you need from psm1 and build the tibble yourself. Also, drop_na is not a good idea here and may bias your results.
library(Matching)

vars <- c("dependent_var_1", "dependent_var_2", "dependent_var_3")
names(vars) <- vars

matching_fcn <- function(.x){
  # matching_df <- matching_df %>%
  #   drop_na(covar_1, covar_2, covar_3, covar_4, covar_5, covar_6, covar_7, treat_1, .x)
  ps1 <- glm(treat_1 ~ covar_1 + covar_2 + covar_3 + covar_4 + covar_5 + covar_6 + covar_7,
             family = binomial, data = matching_df)
  pscore <- ps1$fitted.values
  matching_df <- cbind(matching_df, pscore)
  Y <- matching_df[[.x]]
  Tr <- matching_df$treat_1
  psm1 <- Matching::Match(
    Y = Y,
    Tr = Tr,
    X = pscore,
    estimand = "ATT",
    M = 1,
    replace = TRUE,
    caliper = 0.05,
    version = "fast")
  # one-sided normal-approximation p-value from the unadjusted estimate and SE
  p <- 1 - pnorm(abs(psm1$est.noadj / psm1$se.standard))
  with(psm1, tibble(dv = .x, est = est.noadj, se = se.standard, p = p, ndrops = ndrops))
}
Usage and result
library(dplyr)
library(tidyr)
purrr::map_df(
.x = tidyselect::all_of(vars),
.f = matching_fcn)
# # A tibble: 3 × 5
# dv est se p ndrops
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 dependent_var_1 0.652 0.231 0.00238 8
# 2 dependent_var_2 -0.216 0.188 0.125 8
# 3 dependent_var_3 -0.506 0.249 0.0210 8
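Note (my addition, not part of the original answer): the p above is one-sided; summary(psm1) prints a two-sided p-value, which would be

# two-sided normal-approximation p-value, as printed by summary()
p <- 2 * pnorm(-abs(psm1$est.noadj / psm1$se.standard))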
Data
v <- c('covar_1', 'covar_2', 'covar_3', 'covar_4', 'covar_5', 'covar_6',
       'covar_7', 'treat_1', 'dependent_var_1', 'dependent_var_2', 'dependent_var_3')
set.seed(830595665)
matching_df <- data.frame(matrix(rnorm(100 * length(v)), 100, length(v),
                                 dimnames = list(c(), v)))
matching_df$treat_1 <- +(matching_df$treat_1 > 0)

optimize() function in R

Here is my code:
cee <- abs(qnorm(.5 * 0.1))  # Bonferroni threshold for achieving study-wide significance = 0.1
p.value <- (simAll %>% select("p.value"))
p.value1 <- as.numeric(unlist(p.value))
# we use "cee" so R does not get confused with the function 'c'
betahat <- log(OR)  # Reported OR
z <- sign(betahat) * abs(qnorm(0.5 * p.value1))  # Reported p-value = 5.7e-4, which we convert to a z-value
###################################################
#            THE PROPOSED APPROACH                #
###################################################
se <- betahat / z  # standard error of betahat
mutilde1 <- optimize(f = conditional.like, c(-20, 20), maximum = TRUE, z = z, cee = cee)$maximum  # the conditional mle
p.value holds the p-values from 1000 simulations, and OR likewise has 1000 values, so the se line gives me 1000 different se values. But the mutilde1 line throws an error:

Error in optimize(f = conditional.like, c(-20, 20), maximum = T, z = z, :
  invalid function value in 'optimize'

How can I fix the issue?
The conditional.like() function:
conditional.like <- function(mu, cee, z){
  like <- dnorm(z - mu) / (pnorm(mu - cee) + pnorm(-cee - mu))
  return((abs(z) > cee) * like)
}
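For what it's worth, optimize() expects an objective that returns a single finite value for each candidate mu, but z here has length 1000, so conditional.like() returns a vector and optimize() fails with "invalid function value". A minimal sketch of a fix (my reading of the problem, assuming one conditional MLE per simulated z is wanted):

# run the optimizer once per z value instead of passing the whole vector
mutilde <- sapply(z, function(zi)
  optimize(f = conditional.like, c(-20, 20), maximum = TRUE, z = zi, cee = cee)$maximum)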
simAll is a table that looks like this (1000 rows in total):
# A tibble: 1,000 x 6
id term estimate std.error statistic p.value
<int> <chr> <dbl> <dbl> <dbl> <dbl>
1 1 .x 0.226 0.127 1.78 0.0747
2 2 .x 0.137 0.127 1.08 0.280
3 3 .x 0.304 0.127 2.38 0.0171
4 4 .x 0.497 0.128 3.87 0.000111
OR (1000 values in total):
> OR
[1] 1.5537098 1.0939850 1.4491432 1.6377551 1.1646904 1.3387534 1.6377551 1.5009351 1.7918552
Also, here is my overall code:
library(tidyverse)
library(broom)

# create a tibble with an id column for each simulation and x wrapped in list()
sim <- tibble(id = 1:1000,
              x = list(rbinom(1000, 1, 0.5))) %>%
  # to generate z, pr, y, k use map and map2 from the purrr package to loop over the list column x
  # `~ ...` is similar to `function(.x) {...}`
  # `.x` represents the variable you are using map on
  mutate(z = map(x, ~ log(1.3) * .x),
         pr = map(z, ~ 1 / (1 + exp(-.x))),
         y = map(pr, ~ rbinom(1000, 1, .x)),
         k = map2(x, y, ~ glm(.y ~ .x, family = "binomial")),
         # use broom::tidy to get the model summary in form of a tibble
         sum = map(k, broom::tidy)) %>%
  # select id and sum and unnest the tibbles
  select(id, sum) %>%
  unnest(cols = c(sum))
simOR <- sim %>%
  # drop the intercepts and keep every .x with p < 0.05
  filter(term != "(Intercept)",
         p.value < 0.05)
sim

j1 <- exp(simOR %>% select("estimate"))
OR1 <- as.numeric(unlist(j1))
mean(OR1)

simAll <- sim %>%
  filter(term != "(Intercept)")
j <- exp(simAll %>% select("estimate"))
OR2 <- as.numeric(unlist(j))
mean(OR2)

simOR2 <- sim %>%
  filter(term != "(Intercept)",
         p.value < 0.005)
j2 <- exp(simOR2 %>% select("estimate"))
OR3 <- as.numeric(unlist(j2))
mean(OR3)
#op <- par(mfrow = c(3, 1))
hga <- hist(OR2, main = NULL, freq = TRUE, breaks = 10)  # OR2: overall OR
hgb <- hist(OR1, freq = TRUE, col = 2, breaks = 10, main = "OR:p-value<0.05")   # OR1: p-value < 0.05
hgc <- hist(OR3, freq = TRUE, col = 2, breaks = 10, main = "OR:p-value<0.005")  # OR3: p-value < 0.005

plot(hga, col = rgb(0, 1, 0, 0.5), main = "OR", xlim = c(0.8, 2), ylim = c(0, 250))
plot(hgb, add = TRUE, col = rgb(0, 0, 0.8, 0.5), xlim = c(0.8, 2), ylim = c(0, 250))
plot(hgc, add = TRUE, col = rgb(1, 0, 0, 0.5), xlim = c(0.8, 2))
abline(v = mean(OR2), lwd = 4, col = 3)
abline(v = mean(OR3), lwd = 4, col = 2)
text(1.65, 240, "1.31", col = 1)
arrows(1.5, 240, 1.31, 240, length = 0.1, col = 1, lwd = 2)
abline(v = mean(OR1), lwd = 4, col = 4)
text(2.1, 220, "1.43", col = 4)
arrows(1.98, 220, 1.43, 220, length = 0.1, col = 4, lwd = 2)
text(2.1, 220, "1.55", col = 2)
arrows(1.98, 220, 1.55, 220, length = 0.1, col = 2, lwd = 2)
#########################################
## THE FUNCTIONS BELOW ARE USED TO OBTAIN THE
## BIAS-CORRECTED ESTIMATES
#########################################
conditional.like <- function(mu, cee, z){
  like <- dnorm(z - mu) / (pnorm(mu - cee) + pnorm(-cee - mu))
  return((abs(z) > cee) * like)
}

conditional.like.z <- function(mu, cee, z){
  return(conditional.like(mu, cee, z) * mu)
}

#########################################
## THE FUNCTIONS BELOW ARE USED TO OBTAIN THE
## BIAS-CORRECTED CONFIDENCE INTERVAL
#########################################
ptruncnorm.lower <- function(z, mu, cee, alpha){
  A <- pnorm(-cee + mu) + pnorm(-cee - mu)
  term1 <- pnorm(z - mu)
  term2 <- pnorm(-cee - mu)
  term3 <- pnorm(-cee - mu) + pnorm(z - mu) - pnorm(cee - mu)
  result <- (1 / A) * (term1 * (z <= -cee) + term2 * (abs(z) < cee) + term3 * (z >= cee))
  return(result - (alpha / 2))
}

ptruncnorm.upper <- function(z, mu, cee, alpha){
  A <- pnorm(-cee + mu) + pnorm(-cee - mu)
  term1 <- pnorm(z - mu)
  term2 <- pnorm(-cee - mu)
  term3 <- pnorm(-cee - mu) + pnorm(z - mu) - pnorm(cee - mu)
  result <- (1 / A) * (term1 * (z <= -cee) + term2 * (abs(z) < cee) + term3 * (z >= cee))
  return(result - (1 - alpha / 2))
}

find.lowerz <- function(mu, z, cee, alpha){
  lowerz <- uniroot(ptruncnorm.lower, lower = -20, upper = 20, mu = mu, cee = cee, alpha = alpha)$root
  return(lowerz - z)
}

find.upperz <- function(mu, z, cee, alpha){
  upperz <- uniroot(ptruncnorm.upper, lower = -20, upper = 20, mu = mu, cee = cee, alpha = alpha)$root
  return(upperz - z)
}

getCI <- function(z, cee, alpha){
  uppermu <- uniroot(find.lowerz, interval = c(-15, 15), cee = cee, z = z, alpha = alpha)$root
  lowermu <- uniroot(find.upperz, interval = c(-15, 15), cee = cee, z = z, alpha = alpha)$root
  out <- list(lowermu, uppermu)
  names(out) <- c("lowermu", "uppermu")
  return(out)
}
source("GW-functions.R")# YOU READ IN THE FUNCTIONS FOR OUR METHOD
cee=abs(qnorm(.5*0.1)) # Bonferroni threshold for achieving study-wide significance = 0.1
p.value=(simAll %>% select("p.value"))
p.value1 <- as.numeric(unlist(p.value))
# we use "cee" so R does not get confused with the function 'c'
betahat=log(OR) # Reported OR
z=sign(betahat)*abs(qnorm(0.5*p.value1)) # Reported p-value = 5.7e-4, which we convert to a z-value
###################################################
# THE PROPOSED APPROACH #
###################################################
se=betahat/z # standard error of betahat
mutilde1=optimize(f=conditional.like,c(-20,20),maximum=T,z=z,cee=cee)$maximum

Printing p-values with <0.001

I wonder how to print <0.001 when a p-value is smaller than 0.001, for use in Sweave.
ctl <- c(4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14)
trt <- c(4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69)
group <- gl(2, 10, 20, labels = c("Ctl","Trt"))
weight <- c(ctl, trt)
lm.D9 <- lm(weight ~ group)
summary(lm.D9)$coef
             Estimate Std. Error  t value     Pr(>|t|)
(Intercept)    4.8465  0.1557174 31.12368 4.185248e-17
group1        -0.1855  0.1557174 -1.19126 2.490232e-01
Desired Output
             Estimate Std. Error  t value Pr(>|t|)
(Intercept)    4.8465  0.1557174 31.12368   <0.001
group1        -0.1855  0.1557174 -1.19126    0.249
There are two main functions that I use: format.pval, and the one below that I ripped from gforge and tweaked.
ctl <- c(4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14)
trt <- c(4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69)
group <- gl(2, 10, 20, labels = c("Ctl","Trt"))
weight <- c(ctl, trt)
lm.D9 <- lm(weight ~ group)
tmp <- data.frame(summary(lm.D9)$coef)
tmp <- setNames(tmp, colnames(summary(lm.D9)$coef))
tmp[ , 4] <- format.pval(tmp[ , 4], eps = .001, digits = 2)
tmp
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 5.032 0.2202177 22.85012 <0.001
# groupTrt -0.371 0.3114349 -1.19126 0.25
I like this one because it removes precision from p-values > .1 (that is, regardless of digits, it keeps only two decimal places when the value is > .1, or whatever threshold you prefer), keeps trailing zeros (see the example below), and adds the < for values below the chosen precision (here 0.001).
pvalr <- function(pvals, sig.limit = .001, digits = 3, html = FALSE) {
  roundr <- function(x, digits = 1) {
    res <- sprintf(paste0('%.', digits, 'f'), x)
    zzz <- paste0('0.', paste(rep('0', digits), collapse = ''))
    res[res == paste0('-', zzz)] <- zzz
    res
  }
  sapply(pvals, function(x, sig.limit) {
    if (x < sig.limit)
      if (html)
        return(sprintf('&lt; %s', format(sig.limit))) else
          return(sprintf('< %s', format(sig.limit)))
    if (x > .1)
      return(roundr(x, digits = 2)) else
        return(roundr(x, digits = digits))
  }, sig.limit = sig.limit)
}
And examples:
pvals <- c(.133213, .06023, .004233, .000000134234)
pvalr(pvals, digits = 3)
# [1] "0.13" "0.060" "0.004" "< 0.001"
