How to add a column to a lm based flextable - r

Let's say I make a model with lm such as
library(flextable)
set.seed(123)
mydata <- data.frame(y=runif(100,1,100), x1=runif(100,1,100), x2=runif(100,1,100))
model <- lm(y~x1+x2, data=mydata)
as_flextable(model)
This gives me a flextable with the Estimate, Standard Error, t value, and Pr(>|t|). Let's say I want to add a column to the flextable, for instance, if my y is logged and I want a column that shows exp(model$coefficients)-1.
Is there a straightforward way to do that or do I have to recreate the table from scratch?

In referencing the source code of flextable's as_flextable.lm function it's clear there's no built in way to do it. I made a "new" function by copying from source.
pvalue_format <- function(x){
z <- cut(x, breaks = c(-Inf, 0.001, 0.01, 0.05, 0.1, Inf), labels = c("***", "**", "*", ".", ""))
as.character(z)
}
as_flextable_newcol<-function(x,new_cols=NULL) {
data_t <- broom::tidy(x)
data_g <- broom::glance(x)
##this is my addition
if(!is.null(new_cols)&is.list(new_cols)) {
for(i in names(new_cols)) {
data_t <- data_t %>% mutate("{i}":=new_cols[[i]](term, estimate, std.error, p.value))
}
}
##end of my addition
ft <- flextable(data_t, col_keys = c("term", "estimate", "std.error", "statistic", "p.value", "signif"))
ft <- colformat_double(ft, j = c("estimate", "std.error", "statistic"), digits = 3)
ft <- colformat_double(ft, j = c("p.value"), digits = 4)
ft <- compose(ft, j = "signif", value = as_paragraph(pvalue_format(p.value)) )
ft <- set_header_labels(ft, term = "", estimate = "Estimate",
std.error = "Standard Error", statistic = "t value",
p.value = "Pr(>|t|)", signif = "" )
dimpretty <- dim_pretty(ft, part = "all")
ft <- add_footer_lines(ft, values = c(
"Signif. codes: 0 <= '***' < 0.001 < '**' < 0.01 < '*' < 0.05 < '.' < 0.1 < '' < 1",
"",
sprintf("Residual standard error: %s on %.0f degrees of freedom", formatC(data_g$sigma), data_g$df.residual),
sprintf("Multiple R-squared: %s, Adjusted R-squared: %s", formatC(data_g$r.squared), formatC(data_g$adj.r.squared)),
sprintf("F-statistic: %s on %.0f and %.0f DF, p-value: %.4f", formatC(data_g$statistic), data_g$df.residual, data_g$df, data_g$p.value)
))
ft <- align(ft, i = 1, align = "right", part = "footer")
ft <- italic(ft, i = 1, italic = TRUE, part = "footer")
ft <- hrule(ft, rule = "auto")
ft <- autofit(ft, part = c("header", "body"))
ft
}
the new_cols parameter to this function needs to be a named list of functions where the name of each function in the list will become the new column name. The functions inside the list will take term, estimate, std.error, p.value as input as those are the names of the data_t tibble.
For example:
new_cols=list(perc_change=function(term, estimate, std.error, p.value) {
ifelse(term=="(Intercept)","", paste0(round(100*(exp(estimate)-1),0),"%"))
})

Related

How to calculate Standardized Mean Difference for Table1 Package in R?

I am using the package "table1" to create a fancy table one with extra column containing the standardized mean difference of continuous variables in my dataset.
The SMD should be a combination between the treatment and control groups stratified for a given variable within the table.
I am struggling to figure out a good way of doing this and would love some help creating the function to calculate SMD.
Here is some sample code:
f <- function(x, n, ...) factor(sample(x, n, replace=T, ...), levels=x)
set.seed(427)
n <- 146
dat <- data.frame(id=1:n)
dat$treat <- f(c("Placebo", "Treated"), n, prob=c(1, 2)) # 2:1 randomization
dat$age <- sample(18:65, n, replace=TRUE)
dat$sex <- f(c("Female", "Male"), n, prob=c(.6, .4)) # 60% female
dat$wt <- round(exp(rnorm(n, log(70), 0.23)), 1)
# Add some missing data
dat$wt[sample.int(n, 5)] <- NA
label(dat$age) <- "Age"
label(dat$sex) <- "Sex"
label(dat$wt) <- "Weight"
label(dat$treat) <- "Treatment Group"
units(dat$age) <- "years"
units(dat$wt) <- "kg"
my.render.cont <- function(x) {
with(stats.apply.rounding(stats.default(x), digits=2), c("",
"Mean (SD)"=sprintf("%s (± %s)", MEAN, SD)))
}
my.render.cat <- function(x) {
c("", sapply(stats.default(x), function(y) with(y,
sprintf("%d (%0.0f %%)", FREQ, PCT))))
}
#My attempt at an SMD function
smd_value <- function(x, ...) {
x <- x[-length(x)] # Remove "overall" group
# Construct vectors of data y, and groups (strata) g
y <- unlist(x)
g <- factor(rep(1:length(x), times=sapply(x, length)))
if (is.numeric(y) & g==1) {
# For numeric variables, calculate SMD
smd_val1 <- (mean(y)/sd(y))
} else if (is.numeric(y) & g==2) {
# For numeric variables, calculate SMD
smd_val2 <- (mean(y)/sd(y))
} else {print("--")
}
smd_val <- smdval2 - smdval1
}
table1(~ age + sex + wt | treat, data=dat, render.continuous=my.render.cont, render.categorical=my.render.cat, extra.col=list(`SMD`=smd_value))
I get the following error:
"Error in if (is.numeric(y) & g == 1) { : the condition has length > 1"
Any insight into a potential solution?
Thanks!
Here you go!
# Install Packages---------------------------------------------------
library(stddiff)
library(cobalt)
library(table1)
library(Hmisc)
#Using 'mtcars' as an example
my_data<-mtcars
# Format variables--------------------------------------------------------------
# amd - Transmission (0 = automatic; 1 = manual)
my_data$am <-factor(my_data$am,
levels = c(0,1),
labels =c("Automatic","Manual"))
label(my_data$am) <-"Transmission Type" #adding a label for the variable
# vs - Engine (0 = V-shaped, 1 = Straight)
my_data$vs <-factor(my_data$vs,
levels = c(0,1),
labels =c("V-shaped","Straight"))
label(my_data$vs) <-"Engine"
# Adding a label to the numeric variables
label(my_data$mpg)<-"Miles per gallon"
label(my_data$hp)<-"Horsepower"
# SMD FUNCTION------------------------------------------------------------------
SMD_value <- function(x, ...) {
# Construct vectors of data y, and groups (strata) g
y <- unlist(x)
g <- factor(rep(1:length(x), times=sapply(x, length)))
if (is.numeric(y)) {
# For numeric variables
try({a<-data.frame(y)
a$g<-g
smd<-(as.data.frame(stddiff.numeric(data=a,gcol = "g", vcol = "y")))$stddiff
},silent=TRUE)
} else {
# For categorical variables
try({
a<-data.frame(y)
a$g<-g
smd<-(abs((bal.tab(a, treat = "g",data=a,binary="std",continuous =
"std",s.d.denom = "pooled",stats=c("mean.diffs"))$Balance)$Diff.Un))
},silent=TRUE)
}
c("",format(smd,digits=2)) #Formatting number of digits
}
# CONTINUOUS VARIABLES FORMATTING-----------------------------------------------
my.render.cont <- function(x) {
with(stats.default(x),
c("",
"Mean (SD)" = sprintf("%s (%s)",
round_pad(MEAN, 1),
round_pad(SD, 1)),
"Median (IQR)" = sprintf("%s (%s, %s)",
round_pad(MEDIAN, 1),
round_pad(Q1, 1),
round_pad(Q3, 1)))
)}
# Creating the final table-----------------------------------------------------
Table1<-table1(~ vs + mpg + hp | am,
data=my_data,
overall = FALSE,
render.continuous = my.render.cont,
extra.col=list(`SMD`=SMD_value)) #SMD Column
Table1 #displays final table

How to I change confidence interval display format

I'm looking to change the format of 95% CI from (0.1 -- 0.6) to (0.1 to 0.6) or (0.1, 0.6).
Using epikit::unite_ci() function
https://cran.r-project.org/web/packages/epikit/vignettes/intro.html
In general, you can substitute text strings with gsub:
gsub(" --", ",",x = "(0.1 -- 0.6)")
gsub("to", ",",x = "(0.1 -- 0.6)")
If you want a more appropriate answer, please provide a small reproducible example.
I suggest to modify the fmt_ci function of epikit as follows:
library(epikit)
fmt_ci <- function (e = numeric(), l = numeric(), u = numeric(), digits = 2,
percent = TRUE) {
stopifnot(is.numeric(e), is.numeric(l), is.numeric(u), is.numeric(digits))
# Below the modified row
msg <- "%s (CI %.2f to %.2f)"
msg <- gsub("2", digits, msg)
fun <- if (percent)
match.fun(scales::percent)
else match.fun(scales::number)
e <- fun(e, scale = 1, accuracy = 1/(10^digits), big.mark = ",")
sprintf(msg, e, l, u)
}
# Replace fmt_ci in epikit with the above modified function
assignInNamespace("fmt_ci", fmt_ci, pos="package:epikit")
Running the code:
fit <- lm(100/mpg ~ disp + hp + wt + am, data = mtcars)
df <- data.frame(v = names(coef(fit)), e = coef(fit), confint(fit), row.names = NULL)
names(df) <- c("variable", "estimate", "lower", "upper")
print(df)
out <- unite_ci(df, "slope (CI)", estimate, lower, upper, m100 = FALSE, percent = FALSE)
print(out)
now you get:
variable slope (CI)
1 (Intercept) 0.74 (-0.77 to 2.26)
2 disp 0.00 (-0.00 to 0.01)
3 hp 0.01 (-0.00 to 0.01)
4 wt 1.00 (0.38 to 1.62)
5 am 0.16 (-0.61 to 0.93)

Override cldList stop if no statistical differences

I'm trying to add cld to my data but the cldList stops working the moment it encounters statistical differences. My data has groups with both statistical differences and without, so I need a way to work around this issue. The cldList is part of the rcompanion package and is written as follow:
cldList = function(formula = NULL,
data = NULL,
comparison = NULL,
p.value = NULL,
threshold = 0.05,
print.comp = FALSE,
remove.space = TRUE,
remove.equal = TRUE,
remove.zero = TRUE,
swap.colon = TRUE,
swap.vs = FALSE,
...)
{
if(!is.null(formula)){
p.value = eval(parse(text=paste0("data","$",all.vars(formula[[2]])[1])))
comparison = eval(parse(text=paste0("data","$",all.vars(formula[[3]])[1])))
}
Comparison = (as.numeric(p.value) <= threshold)
if (sum(Comparison) == 0){stop("No significant differences.", call.=FALSE)} # THIS LINE HERE #
if(remove.space == TRUE) {comparison = gsub(" ", "", comparison)}
if(remove.equal == TRUE) {comparison = gsub("=", "", comparison)}
if(remove.zero == TRUE) {comparison = gsub("0", "", comparison)}
if(swap.colon == TRUE) {comparison = gsub(":", "-", comparison)}
if(swap.vs == TRUE) {comparison = gsub("vs", "-", comparison)}
names(Comparison) = comparison
if(print.comp == TRUE)
{Y = data.frame(Comparisons = names(Comparison),
p.value = p.value, Value=Comparison,
Threshold=threshold)
cat("\n", "\n")
print(Y)
cat("\n", "\n")}
MCL = multcompLetters(Comparison, ...)
Group = names(MCL$Letters)
Letter = as.character(MCL$Letters)
MonoLetter = as.character(MCL$monospacedLetters)
Z = data.frame(Group, Letter, MonoLetter)
return(Z)
}
Here's a portion of my data (output from another program):
dummy_df <-data.frame(target = c("A1","A1","A1","A1","A1","A1","A2","A2","A2","A2","A2","A2","A3","A3","A3","A3","A3","A3","A4","A4","A4","A4","A4","A4"),
comparison = c("a - b","a - c","a - d","b - c","b - d","c - d","a - b","a - c","a - d","b - c","b - d","c - d","e - c","e - d","e - f","c - d","c - f","d - f","e - c","e - d","e - f","c - d","c - f","d - f"),
significant = c("Yes","No","Yes","Yes","Yes","Yes","Yes","No","Yes","Yes","Yes","Yes","Yes","Yes","Yes","No","Yes","Yes","No","No","No","No","No","No"),
p.val = c( 0.04,0.06,0.04,0.04,0.04,0.04,0.04,0.06,0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.06,0.04,0.04,0.06,0.06,0.06,0.06,0.06,0.06))
My code looks like this
i <- 1
targets <- data.frame(Genes = unique(dummy_df$target))
df <- data.frame()
df1 <- data.frame(Group = "No", Letter ="significant", MonoLetter = "differences")
while (i<=nrow(targets)) {
print(i)
df2 <-cldList(p.val~comparison, data = subset.data.frame(dummy_df, dummy_df$target==targets[i,]), threshold = 0.05)
print(df2)
df2$target <- targets[i,]
#print(df2)
df <- rbind(df, df2)
i <- i+1
}
I tried adding if (nrow(df2)==0) {df2 <- df1} after the cldList step to at least have the information about which targets aren't statistically difference, but without success.
Is there a workaround that makes the loop continue through all the targets? Ideally, the cldList output with the same letter for all Groups would be the goal.
I am the author of the cldList() function.
Unfortunately, the looping code you presented isn't working for me.
However, I can address the behavior of cldList() when there are no significant differences. The issue was that multcompLetters, which the function relies on, has different output when there are no significant differences.
EDIT: The function in the rcompanion package has been updated to return a data frame with the same letter for all groups when there are no significant differences. (rcompanion v. 2.4.13, CRAN.R-project.org/package=rcompanion
#### Examples
library(rcompanion)
Comparison = c("A-B", "A-C", "B-C")
Pvalue = c(0.04, 1, 1)
DataFrame = data.frame(Comparison, Pvalue)
cldList(Pvalue ~ Comparison, data=DataFrame)
### Group Letter MonoLetter
### 1 A a a
### 2 B b b
### 3 C ab ab
Comparison2 = c("A-B", "A-C", "B-C")
Pvalue2 = c(1, 1, 1)
DataFrame2 = data.frame(Comparison2, Pvalue2)
cldList(Pvalue2 ~ Comparison2, data=DataFrame2)
### Group Letter MonoLetter
### 1 A a a
### 2 B a a
### 3 C a a

Error in confidence interval mice R package

everyone I am trying to execute the code in found in the book "Flexible Imputation of Missing Data 2ed" in 2.5.3 section, that calculates a confidence interval for two imputation methods. The problem is that I cannot reproduce the results as the result is always NaN
Here is the code
require(mice)
# function randomly draws artificial data from the specified linear model
create.data <- function(beta = 1, sigma2 = 1, n = 50, run = 1) {
set.seed(seed = run)
x <- rnorm(n)
y <- beta * x + rnorm(n, sd = sqrt(sigma2))
cbind(x = x, y = y)
}
#Remove some data
make.missing <- function(data, p = 0.5){
rx <- rbinom(nrow(data), 1, p)
data[rx == 0, "x"] <- NA
data
}
# Apply Rubin’s rules to the imputed data
test.impute <- function(data, m = 5, method = "norm", ...) {
imp <- mice(data, method = method, m = m, print = FALSE, ...)
fit <- with(imp, lm(y ~ x))
tab <- summary(pool(fit), "all", conf.int = TRUE)
as.numeric(tab["x", c("estimate", "2.5 %", "97.5 %")])
}
#Bind everything together
simulate <- function(runs = 10) {
res <- array(NA, dim = c(2, runs, 3))
dimnames(res) <- list(c("norm.predict", "norm.nob"),
as.character(1:runs),
c("estimate", "2.5 %","97.5 %"))
for(run in 1:runs) {
data <- create.data(run = run)
data <- make.missing(data)
res[1, run, ] <- test.impute(data, method = "norm.predict",
m = 2)
res[2, run, ] <- test.impute(data, method = "norm.nob")
}
res
}
res <- simulate(1000)
#Estimate the lower and upper bounds of the confidence intervals per method
apply(res, c(1, 3), mean, na.rm = TRUE)
Best Regards
Replace "x" by tab$term == "x" in the last line of test.impute():
as.numeric( tab[ tab$term == "x", c("estimate", "2.5 %", "97.5 %")])

R - Error using summary() from speedglm package

I'm using speedglm to estimate a logistic regression model on some data. I've created a reproducible example which generates the same error that I get using my original data.
library(speedglm)
n <- 10000
dtf <- data.frame( y = sample(c(0,1), n, 1),
x1 = as.factor(sample(c("a","b"), n, 1)),
x2 = rnorm(n, 30, 10))
m <- speedglm(y ~ x1 + x2, dtf, family=binomial())
summary(m)
The output is the following:
Generalized Linear Model of class 'speedglm':
Call: speedglm(formula = y ~ x1 + x2, data = dtf, family = binomial())
Coefficients:
------------------------------------------------------------------
Error in data.frame(..., check.names = FALSE) :
arguments imply differing number of rows: 3, 0
I've checked the source code of summary.speedglm by executing getS3method("summary", "speedglm") and found the code line which generates the error, but it didn't help to solve the problem.
PS: someone with 1500+ rep should create the speedglm tag.
UPDATE
Marco Enea, the maintainer of speedglm, asked to post the following temporary fix for summary.speedglm and print.summary.speedglm.
summary.speedglm <- function (object, correlation = FALSE, ...)
{
if (!inherits(object, "speedglm"))
stop("object is not of class speedglm")
z <- object
var_res <- as.numeric(z$RSS/z$df)
dispersion <- if (z$family$family %in% c("poisson", "binomial")) 1 else var_res
if (z$method == "qr") {
z$XTX <- z$XTX[z$ok, z$ok]
}
inv <- solve(z$XTX, tol = z$tol.solve)
covmat <- diag(inv)
se_coef <- rep(NA, length(z$coefficients))
se_coef[z$ok] <- sqrt(dispersion * covmat)
if (z$family$family %in% c("binomial", "poisson")) {
z1 <- z$coefficients/se_coef
p <- 2 * pnorm(abs(z1), lower.tail = FALSE)
} else {
t1 <- z$coefficients/se_coef
p <- 2 * pt(abs(t1), df = z$df, lower.tail = FALSE)
}
ip <- !is.na(p)
p[ip] <- as.numeric(format(p[ip], digits = 3))
dn <- c("Estimate", "Std. Error")
if (z$family$family %in% c("binomial", "poisson")) {
format.coef <- if (any(na.omit(abs(z$coef)) < 1e-04))
format(z$coefficients, scientific = TRUE, digits = 4) else
round(z$coefficients, digits = 7)
format.se <- if (any(na.omit(se_coef) < 1e-04))
format(se_coef, scientific = TRUE, digits = 4) else round(se_coef, digits = 7)
format.pv <- if (any(na.omit(p) < 1e-04))
format(p, scientific = TRUE, digits = 4) else round(p, digits = 4)
param <- data.frame(format.coef, format.se, round(z1,
digits = 4), format.pv)
dimnames(param) <- list(names(z$coefficients), c(dn,
"z value", "Pr(>|z|)"))
} else {
format.coef <- if (any(abs(na.omit(z$coefficients)) <
1e-04))
format(z$coefficients, scientific = TRUE, digits = 4) else
round(z$coefficients, digits = 7)
format.se <- if (any(na.omit(se_coef) < 1e-04))
format(se_coef, scientific = TRUE, digits = 4) else
round(se_coef, digits = 7)
format.pv <- if (any(na.omit(p) < 1e-04))
format(p, scientific = TRUE, digits = 4) else round(p, digits = 4)
param <- data.frame(format.coef, format.se, round(t1,
digits = 4), format.pv)
dimnames(param) <- list(names(z$coefficients), c(dn,
"t value", "Pr(>|t|)"))
}
eps <- 10 * .Machine$double.eps
if (z$family$family == "binomial") {
if (any(z$mu > 1 - eps) || any(z$mu < eps))
warning("fitted probabilities numerically 0 or 1 occurred")
}
if (z$family$family == "poisson") {
if (any(z$mu < eps))
warning("fitted rates numerically 0 occurred")
}
keep <- match(c("call", "terms", "family", "deviance", "aic",
"df", "nulldev", "nulldf", "iter", "tol", "n", "convergence",
"ngoodobs", "logLik", "RSS", "rank"), names(object),
0)
ans <- c(object[keep], list(coefficients = param, dispersion = dispersion,
correlation = correlation, cov.unscaled = inv, cov.scaled = inv *
var_res))
if (correlation) {
ans$correl <- (inv * var_res)/outer(na.omit(se_coef),
na.omit(se_coef))
}
class(ans) <- "summary.speedglm"
return(ans)
}
print.summary.speedglm <- function (x, digits = max(3, getOption("digits") - 3), ...)
{
cat("Generalized Linear Model of class 'speedglm':\n")
if (!is.null(x$call))
cat("\nCall: ", deparse(x$call), "\n\n")
if (length(x$coef)) {
cat("Coefficients:\n")
cat(" ------------------------------------------------------------------",
"\n")
sig <- function(z){
if (!is.na(z)){
if (z < 0.001)
"***"
else if (z < 0.01)
"** "
else if (z < 0.05)
"* "
else if (z < 0.1)
". "
else " "
} else " "
}
options(warn=-1)
sig.1 <- sapply(as.numeric(as.character(x$coefficients[,4])),
sig)
options(warn=0)
est.1 <- cbind(format(x$coefficients, digits = digits),
sig.1)
colnames(est.1)[ncol(est.1)] <- ""
print(est.1)
cat("\n")
cat("-------------------------------------------------------------------",
"\n")
cat("Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1",
"\n")
cat("\n")
}
else cat("No coefficients\n")
cat("---\n")
cat("null df: ", x$nulldf, "; null deviance: ", round(x$nulldev,
digits = 2), ";\n", "residuals df: ", x$df, "; residuals deviance: ",
round(x$deviance, digits = 2), ";\n", "# obs.: ", x$n,
"; # non-zero weighted obs.: ", x$ngoodobs, ";\n", "AIC: ",
x$aic, "; log Likelihood: ", x$logLik, ";\n", "RSS: ",
round(x$RSS, digits = 1), "; dispersion: ", x$dispersion,
"; iterations: ", x$iter, ";\n", "rank: ", round(x$rank,
digits = 1), "; max tolerance: ", format(x$tol, scientific = TRUE,
digits = 3), "; convergence: ", x$convergence, ".\n",
sep = "")
invisible(x)
if (x$correlation) {
cat("---\n")
cat("Correlation of Coefficients:\n")
x$correl[upper.tri(x$correl, diag = TRUE)] <- NA
print(x$correl[-1, -nrow(x$correl)], na.print = "", digits = 2)
}
}
Following 42' suggestion, I would also add the following:
environment(summary.speedglm) <- environment(speedglm)
environment(print.summary.speedglm) <- environment(speedglm)
The print.summary.speedglm function has a tiny bug in it. If you change this line:
sig.1 <- cbind(sapply(as.numeric(as.character(x$coefficients$"Pr(>|t|)")), sig))
To this line:
sig.1 <- cbind(sapply(as.numeric(as.character(x$coefficients$"Pr(>|z|)")), sig))
And also run:
environment(print.summary.speedglm) <- environment(speedglm)
You will not see the error message anymore.
The proper way to report bugs is to contact the maintainer (I'll send him an email):
maintainer('speedglm')
[1] "Marco Enea <emarco76#libero.it>"
It appears that this is a bug; in speedglm:::print.summary.speedglm there is the line:
sig.1 <- sapply(as.numeric(as.character(x$coefficients$"Pr(>|t|)")),
sig)
but when you look at the object, you can see:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.0546397 0.0655713 -0.8333 0.405
x1b -0.0618225 0.0400126 -1.5451 0.122
x2 0.0020771 0.0019815 1.0483 0.295
which has a Pr(>|z|) instead of Pr(>|t|), so the sig stars fail.

Resources