I wonder how to output a "<0.001" symbol when the p-value is smaller than 0.001, for use in Sweave.
ctl <- c(4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14)
trt <- c(4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69)
group <- gl(2, 10, 20, labels = c("Ctl","Trt"))
weight <- c(ctl, trt)
lm.D9 <- lm(weight ~ group)
summary(lm.D9)$coef
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.8465 0.1557174 31.12368 4.185248e-17
group1 -0.1855 0.1557174 -1.19126 2.490232e-01
Desired Output
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.8465 0.1557174 31.12368 <0.001
group1 -0.1855 0.1557174 -1.19126 0.249
There are two main functions that I use, format.pval and this one that I ripped from gforge and tweaked.
ctl <- c(4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14)
trt <- c(4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69)
group <- gl(2, 10, 20, labels = c("Ctl","Trt"))
weight <- c(ctl, trt)
lm.D9 <- lm(weight ~ group)
tmp <- data.frame(summary(lm.D9)$coef)
tmp <- setNames(tmp, colnames(summary(lm.D9)$coef))
tmp[ , 4] <- format.pval(tmp[ , 4], eps = .001, digits = 2)
tmp
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 5.032 0.2202177 22.85012 <0.001
# groupTrt -0.371 0.3114349 -1.19126 0.25
I like this one because it removes precision from p-values > .1 (or whatever threshold you like; that is, regardless of digits, it keeps only two decimal places when the value is > .1), keeps trailing zeros (see example below), and adds in the < you want below some level of precision (here 0.001).
pvalr <- function(pvals, sig.limit = .001, digits = 3, html = FALSE) {
  ## round to a fixed number of decimals, keeping trailing zeros and
  ## avoiding results like "-0.000"
  roundr <- function(x, digits = 1) {
    res <- sprintf(paste0('%.', digits, 'f'), x)
    zzz <- paste0('0.', paste(rep('0', digits), collapse = ''))
    res[res == paste0('-', zzz)] <- zzz
    res
  }
  sapply(pvals, function(x, sig.limit) {
    if (x < sig.limit)
      if (html)
        return(sprintf('&lt; %s', format(sig.limit))) else
        return(sprintf('< %s', format(sig.limit)))
    if (x > .1)
      return(roundr(x, digits = 2)) else
      return(roundr(x, digits = digits))
  }, sig.limit = sig.limit)
}
And examples:
pvals <- c(.133213, .06023, .004233, .000000134234)
pvalr(pvals, digits = 3)
# [1] "0.13" "0.060" "0.004" "< 0.001"
I am using the package "table1" to create a fancy Table 1 with an extra column containing the standardized mean difference (SMD) of the continuous variables in my dataset.
The SMD should compare the treatment and control groups for each variable in the table.
I am struggling to figure out a good way of doing this and would love some help creating the function to calculate the SMD.
Here is some sample code:
library(table1)
f <- function(x, n, ...) factor(sample(x, n, replace=TRUE, ...), levels=x)
set.seed(427)
n <- 146
dat <- data.frame(id=1:n)
dat$treat <- f(c("Placebo", "Treated"), n, prob=c(1, 2)) # 2:1 randomization
dat$age <- sample(18:65, n, replace=TRUE)
dat$sex <- f(c("Female", "Male"), n, prob=c(.6, .4)) # 60% female
dat$wt <- round(exp(rnorm(n, log(70), 0.23)), 1)
# Add some missing data
dat$wt[sample.int(n, 5)] <- NA
label(dat$age) <- "Age"
label(dat$sex) <- "Sex"
label(dat$wt) <- "Weight"
label(dat$treat) <- "Treatment Group"
units(dat$age) <- "years"
units(dat$wt) <- "kg"
my.render.cont <- function(x) {
with(stats.apply.rounding(stats.default(x), digits=2), c("",
"Mean (SD)"=sprintf("%s (± %s)", MEAN, SD)))
}
my.render.cat <- function(x) {
c("", sapply(stats.default(x), function(y) with(y,
sprintf("%d (%0.0f %%)", FREQ, PCT))))
}
#My attempt at an SMD function
smd_value <- function(x, ...) {
x <- x[-length(x)] # Remove "overall" group
# Construct vectors of data y, and groups (strata) g
y <- unlist(x)
g <- factor(rep(1:length(x), times=sapply(x, length)))
if (is.numeric(y) & g==1) {
# For numeric variables, calculate SMD
smd_val1 <- (mean(y)/sd(y))
} else if (is.numeric(y) & g==2) {
# For numeric variables, calculate SMD
smd_val2 <- (mean(y)/sd(y))
} else {print("--")
}
smd_val <- smdval2 - smdval1
}
table1(~ age + sex + wt | treat, data=dat, render.continuous=my.render.cont, render.categorical=my.render.cat, extra.col=list(`SMD`=smd_value))
I get the following error:
"Error in if (is.numeric(y) & g == 1) { : the condition has length > 1"
Any insight into a potential solution?
Thanks!
Here you go!
# Load packages-------------------------------------------------------
library(stddiff)
library(cobalt)
library(table1)
library(Hmisc)
# Using 'mtcars' as an example
my_data <- mtcars
# Format variables--------------------------------------------------------------
# am - Transmission (0 = automatic; 1 = manual)
my_data$am <- factor(my_data$am,
                     levels = c(0, 1),
                     labels = c("Automatic", "Manual"))
label(my_data$am) <- "Transmission Type"  # adding a label for the variable
# vs - Engine (0 = V-shaped, 1 = Straight)
my_data$vs <- factor(my_data$vs,
                     levels = c(0, 1),
                     labels = c("V-shaped", "Straight"))
label(my_data$vs) <- "Engine"
# Adding a label to the numeric variables
label(my_data$mpg) <- "Miles per gallon"
label(my_data$hp) <- "Horsepower"
# SMD FUNCTION------------------------------------------------------------------
SMD_value <- function(x, ...) {
  # Construct vectors of data y, and groups (strata) g
  y <- unlist(x)
  g <- factor(rep(1:length(x), times = sapply(x, length)))
  if (is.numeric(y)) {
    # For numeric variables
    try({
      a <- data.frame(y)
      a$g <- g
      smd <- (as.data.frame(stddiff.numeric(data = a, gcol = "g", vcol = "y")))$stddiff
    }, silent = TRUE)
  } else {
    # For categorical variables
    try({
      a <- data.frame(y)
      a$g <- g
      smd <- abs((bal.tab(a, treat = "g", data = a, binary = "std", continuous = "std",
                          s.d.denom = "pooled", stats = c("mean.diffs"))$Balance)$Diff.Un)
    }, silent = TRUE)
  }
  c("", format(smd, digits = 2))  # Formatting number of digits
}
# CONTINUOUS VARIABLES FORMATTING-----------------------------------------------
my.render.cont <- function(x) {
  with(stats.default(x),
       c("",
         "Mean (SD)" = sprintf("%s (%s)",
                               round_pad(MEAN, 1),
                               round_pad(SD, 1)),
         "Median (IQR)" = sprintf("%s (%s, %s)",
                                  round_pad(MEDIAN, 1),
                                  round_pad(Q1, 1),
                                  round_pad(Q3, 1))))
}
# Creating the final table-----------------------------------------------------
Table1<-table1(~ vs + mpg + hp | am,
data=my_data,
overall = FALSE,
render.continuous = my.render.cont,
extra.col=list(`SMD`=SMD_value)) #SMD Column
Table1 #displays final table
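As a quick sanity check (this assumes the usual pooled-standard-deviation definition of the SMD for a continuous variable), you can work out the value for mpg by hand and compare it with the SMD column:
# |mean1 - mean2| / sqrt((var1 + var2) / 2) between transmission groups
grp <- split(my_data$mpg, my_data$am)
abs(mean(grp$Automatic) - mean(grp$Manual)) /
  sqrt((var(grp$Automatic) + var(grp$Manual)) / 2)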
I'm looking to change the format of 95% CI from (0.1 -- 0.6) to (0.1 to 0.6) or (0.1, 0.6).
I am using the epikit::unite_ci() function:
https://cran.r-project.org/web/packages/epikit/vignettes/intro.html
In general, you can substitute text strings with gsub:
gsub(" --", ",",x = "(0.1 -- 0.6)")
gsub("to", ",",x = "(0.1 -- 0.6)")
If you want a more appropriate answer, please provide a small reproducible example.
I suggest modifying the fmt_ci function of epikit as follows:
library(epikit)
fmt_ci <- function (e = numeric(), l = numeric(), u = numeric(), digits = 2,
                    percent = TRUE) {
  stopifnot(is.numeric(e), is.numeric(l), is.numeric(u), is.numeric(digits))
  # Below the modified row: "--" replaced by "to"
  msg <- "%s (CI %.2f to %.2f)"
  # propagate the requested number of digits into the format string
  msg <- gsub("2", digits, msg)
  fun <- if (percent)
    match.fun(scales::percent)
  else match.fun(scales::number)
  e <- fun(e, scale = 1, accuracy = 1/(10^digits), big.mark = ",")
  sprintf(msg, e, l, u)
}
# Replace fmt_ci in epikit with the above modified function
assignInNamespace("fmt_ci", fmt_ci, pos="package:epikit")
Running the code:
fit <- lm(100/mpg ~ disp + hp + wt + am, data = mtcars)
df <- data.frame(v = names(coef(fit)), e = coef(fit), confint(fit), row.names = NULL)
names(df) <- c("variable", "estimate", "lower", "upper")
print(df)
out <- unite_ci(df, "slope (CI)", estimate, lower, upper, m100 = FALSE, percent = FALSE)
print(out)
Now you get:
variable slope (CI)
1 (Intercept) 0.74 (-0.77 to 2.26)
2 disp 0.00 (-0.00 to 0.01)
3 hp 0.01 (-0.00 to 0.01)
4 wt 1.00 (0.38 to 1.62)
5 am 0.16 (-0.61 to 0.93)
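If you prefer the "(0.1, 0.6)" style instead of "to", the same trick applies; just swap the template line inside the modified fmt_ci and leave everything else unchanged:
msg <- "%s (CI %.2f, %.2f)"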
Given a fractional polynomial GLM, I am looking to find the value of a covariate that gives me an output of a given probability.
My data is simulated using:
library(dplyr)
library(ggplot2)
# FUNCTIONS ====================================================================
logit <- function(p){
x = log(p/(1-p))
x
}
sigmoid <- function(x){
p = 1/(1 + exp(-x))
p
}
beta_duration <- function(D, select){
logit(
switch(select,
0.05 + 0.9 / (1 + exp(-2*D + 25)),
0.9 * exp(-exp(-0.5 * (D - 11))),
0.9 * exp(-exp(-(D - 11))),
0.9 * exp(-2 * exp(-(D - 9))),
sigmoid(0.847 + 0.210 * (D - 10)),
0.7 + 0.0015 * (D - 10) ^ 2,
0.7 - 0.0015 * (D - 10) ^ 2 + 0.03 * (D - 10)
)
)
}
beta_sex <- function(sex, OR = 1){
ifelse(sex == "Female", -0.5 * log(OR), 0.5 * log(OR))
}
plot_beta_duration <- function(select){
x <- seq(10, 20, by = 0.01)
y <- beta_duration(x, select)
data.frame(x = x,
y = y) %>%
ggplot(aes(x = x, y = y)) +
geom_line() +
ylim(0, 1)
}
# DATA SIMULATION ==============================================================
duration <- c(10, 12, 14, 18, 20)
sex <- factor(c("Female", "Male"))
eta <- function(duration, sex, duration_select, sex_OR, noise_sd){
beta_sex(sex, sex_OR) + beta_duration(duration, duration_select) + rnorm(length(duration), 0, noise_sd)
}
sim_data <- function(durations_type, sex_OR, noise_sd, p_female, n, seed){
set.seed(seed)
data.frame(
duration = sample(duration, n, TRUE),
sex = sample(sex, n, TRUE, c(p_female, 1 - p_female))
) %>%
rowwise() %>%
mutate(eta = eta(duration, sex, durations_type, sex_OR, noise_sd),
p = sigmoid(eta),
cured = sample(0:1, 1, prob = c(1 - p, p)))
}
# DATA SIM PARAMETERS
durations_type <- 4 # See beta_duration for functions
sex_OR <- 3 # Odds of cure for male vs female (ref)
noise_sd <- 1
p_female <- 0.7 # proportion of females in the sample
n <- 500
data <- sim_data(durations_type = 1, # See beta_duration for functions
sex_OR = 3, # Odds of cure for male vs female (ref)
noise_sd = 1,
p_female = 0.7, # proportion of females in the sample
n = 500,
seed = 21874564)
And my model is fitted by:
library(mfp)
model1 <- mfp(cured ~ fp(duration) + sex,
family = binomial(link = "logit"),
data = data)
summary(model1)
For each level of sex (i.e. "Male" or "Female"), I want to find the value of duration that gives me a probability equal to some value frontier <- 0.8.
So far, I can only think of an approximate approach using a vector of candidate values:
pred_duration <- seq(10, 20, by = 0.1)
pred <- data.frame(expand.grid(duration = pred_duration,
sex = sex),
p = predict(model1,
newdata = expand.grid(duration = pred_duration,
sex = sex),
type = "response"))
pred[which(pred$p > 0.8), ] %>%
group_by(sex) %>%
summarize(min(duration))
But I am really after an exact solution.
The function uniroot finds the point at which the output of a function equals 0. If you create a function that takes duration as input, calculates the predicted probability for that duration, and then subtracts the desired probability, that function will have an output of 0 at the desired value of duration, and uniroot will find that value for you. If you wrap this process in a little function, it becomes very easy to use:
find_prob <- function(p) {
f <- function(v) {
predict(model1, type = 'response',
newdata = data.frame(duration = v, sex = 'Male')) - p
}
uniroot(f, interval = range(data$duration), tol = 1e-9)$root
}
So, for example, to find the duration that gives an 80% probability, we just do:
find_prob(0.8)
#> [1] 12.86089
To confirm that this is the correct value, we can feed it straight back into predict to see what the predicted probability is for sex = 'Male' and duration = 12.86089:
predict(model1, type = 'response',
newdata = data.frame(sex = 'Male', duration = find_prob(0.8)))
#> 1
#> 0.8
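Since the question asks for both levels of sex, a small extension of the same idea is to make the sex level an argument and loop over the levels (a sketch; it assumes the target probability is actually reached within the observed range of duration for each level, otherwise uniroot will complain that the endpoint values are not of opposite sign):
find_prob_by_sex <- function(p, sex_level) {
  f <- function(v) {
    predict(model1, type = 'response',
            newdata = data.frame(duration = v, sex = sex_level)) - p
  }
  uniroot(f, interval = range(data$duration), tol = 1e-9)$root
}
sapply(levels(data$sex), function(s) find_prob_by_sex(0.8, s))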
How can I extract the values ('Estimate', 'Std. Error', 'Pr(>|z|)') from the model output and place them in a table, using the example data below?
require(GJRM)
set.seed(123)
x1 <- sample(1:100, size = 20)
bid1 <- sample(c(5, 10, 20, 30), size = 20, replace = T)
bid2 <- sample(c(5, 10, 20, 30), size = 20, replace = T)
ans1 <- sample(c(1,0), size = 20, replace = T)
ans2 <- sample(c(1,0), size = 20, replace = T)
df <- cbind(x1, bid1, bid2, ans1, ans2)
df <- as.data.frame(df)
treat.eq <- ans1 ~ bid1 + x1
out.eq <- ans2 ~ bid2 + x1
f.list <- list(treat.eq, out.eq)
mr <- c("probit", "probit")
## Model
bvp <- gjrm(f.list, data=df, Model="B", margins= mr)
summary(bvp)
The data are present in model$tableP1 and model$tableP2:
library(GJRM)
bvp <- gjrm(f.list, data=df, Model="B", margins= mr)
model <- summary(bvp)
model$tableP1
# Estimate Std. Error z value Pr(>|z|)
#(Intercept) 0.83797602 0.79822170 1.049804 0.2938084
#bid1 -0.04682910 0.03137645 -1.492492 0.1355702
#x1 -0.01065357 0.00993613 -1.072205 0.2836278
model$tableP2
# Estimate Std. Error z value Pr(>|z|)
#(Intercept) 0.434060433 0.64213719 0.6759621 0.4990647
#bid2 -0.012464492 0.02930301 -0.4253656 0.6705702
#x1 -0.005763137 0.01025844 -0.5617947 0.5742559
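If you want both equations in one table, a possible sketch is to bind the two matrices into a single data frame (the column names simply carry over from the summary object):
coefs <- rbind(
  data.frame(equation = "ans1", term = rownames(model$tableP1),
             model$tableP1, row.names = NULL, check.names = FALSE),
  data.frame(equation = "ans2", term = rownames(model$tableP2),
             model$tableP2, row.names = NULL, check.names = FALSE)
)
coefs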
The default plot function of BoomSpikeSlab models is a bar plot of each predictor's inclusion probability, colored by its probability of being positive:
library(BoomSpikeSlab)
set.seed(0)
simulate.lm.spike <- function(n=100, p=10, ngood=3, niter=1000, sigma=1) {
x <- cbind(matrix(rnorm(n * (p - 1)), nrow=n))
beta <- c(rnorm(ngood), rep(0, p - ngood))
y <- rnorm(n, beta[1] + x %*% beta[-1], sigma)
draws <- lm.spike(y ~ x, niter=niter)
return(invisible(draws))
}
model <- simulate.lm.spike(n=1000, p=50, sigma=.3)
plot(model, inclusion.threshold=.01)
How can I extract the data behind this plot, i.e. a data frame with each predictor's inclusion probability and probability of being positive?
Adapting the PlotMarginalInclusionProbabilities function:
GetMarginalInclusionProbabilities = function(
model,
burn = 0,
inclusion.threshold = 0,
unit.scale = TRUE,
number.of.variables = NULL) {
beta <- model$beta
if (burn > 0) {
beta <- beta[-(1:burn), , drop = FALSE]
}
inclusion.prob <- colMeans(beta != 0)
index <- order(inclusion.prob)
beta <- beta[, index, drop = FALSE]
inclusion.prob <- inclusion.prob[index]
compute.positive.prob <- function(x) {
## Compute the probability that x is positive, given that it is
## nonzero. If all(x == 0) then zero is returned.
x <- x[x != 0]
if (length(x) == 0) {
return(0)
}
return(mean(x > 0))
}
positive.prob <- apply(beta, 2, compute.positive.prob)
res <- data.frame(predictor = names(inclusion.prob),
inclusion.prob = inclusion.prob,
positive.prob = positive.prob)
return(res[order(-res$inclusion.prob), ])
}
Example:
GetMarginalInclusionProbabilities(model)
# predictor inclusion.prob positive.prob
# (Intercept) (Intercept) 1.000 1
# x1 x1 1.000 0
# x2 x2 0.999 1
# x15 x15 0.014 1
# x43 x43 0.002 1
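If you only want the predictors that would show up in the plot call above (plot(model, inclusion.threshold = .01)), you can filter the result the same way:
probs <- GetMarginalInclusionProbabilities(model)
subset(probs, inclusion.prob > 0.01)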