paste formula in function (c constant?) - r

I'm doing a function pasting a formula and then returning a feols result. But, I get a c at the beginning. How can I solve this?
library(dplyr)
library(fixest)
data(base_did)
base_did = base_did %>% mutate(D = 5*rnorm(1080),
x2 = 10*rnorm(1080),
rand_wei = abs(rnorm(1080)))
f <- function(data, arg=NULL){
arg = enexpr(arg)
if (length(arg) == 0) {
formula = "D ~ 1"
}
else {
formula = paste(arg, collapse = " + ")
formula = paste("D ~ ", formula, sep = "")
}
formula = paste(formula, " | id + period", sep = "")
denom.lm <- feols(as.formula(formula), data = data,
weights = abs(data$rand_wei))
return(denom.lm)
}
f(base_did, arg = c(x1,x2))
#Error in feols(as.formula(formula), data = data, weights = abs(data$rand_wei)) :
# Evaluation of the right-hand-side of the formula raises an error:
# In NULL: Evaluation of .Primitive("c") returns an object of length 1
#while the data set has 1080 rows.
If I return(formula) at the end. I get [1] "D ~ c + x1 + x2 | id + period".
But I need only D ~ x1 + x2 | id + period.

Perhaps one option to make your function work would be to pass the arguments via ... so that c is not needed and which would prevent the c to be added to your formula. To make this work you also have switch to enexprs inside your function.
Note: I slightly adjusted your function for the reprex to return just the formula.
library(dplyr, warn = FALSE)
library(fixest)
data(base_did)
base_did = base_did %>% mutate(D = 5*rnorm(1080),
x2 = 10*rnorm(1080),
rand_wei = abs(rnorm(1080)))
f <- function(data, ...){
arg = enexprs(...)
if (length(arg) == 0) {
formula = "D ~ 1"
}
else {
formula = paste(arg, collapse = " + ")
formula = paste("D ~ ", formula, sep = "")
}
formula = paste(formula, " | id + period", sep = "")
as.formula(formula)
}
f(base_did, x1, x2)
#> D ~ x1 + x2 | id + period
#> <environment: 0x7fe8f3567618>
f(base_did)
#> D ~ 1 | id + period
#> <environment: 0x7fe8f366f848>
UPDATE There is probably a better approach but after some research a possible option would be:
Note: When passing multiple arguments via c enexpr will return a call object which behaves like a list and where the first element contains the function name, i.e. c. That's why you get the c added to your formula.
f <- function(data, arg = NULL) {
arg <- enexpr(arg)
if (length(arg) == 0) {
formula <- "D ~ 1"
} else {
if (length(arg) > 1) arg <- vapply(as.list(arg[-1]), rlang::as_string, FUN.VALUE = character(1))
formula <- paste(arg, collapse = " + ")
formula <- paste("D ~ ", formula, sep = "")
}
formula <- paste(formula, " | id + period", sep = "")
as.formula(formula)
}
f(base_did, c(x1, x2))
#> D ~ x1 + x2 | id + period
#> <environment: 0x7fa763431388>
f(base_did, x1)
#> D ~ x1 | id + period
#> <environment: 0x7fa763538c40>
f(base_did)
#> D ~ 1 | id + period
#> <environment: 0x7fa765e22028>

You can tremendously simplify your function if you use fixest's built-in formula manipulation tools (see here). In particular the dot-square-bracket operator:
library(fixest)
data(base_did)
n = 1080
base = within(base_did, {
D = 5 * rnorm(n)
x2 = 10 *rnorm(n)
rand_wei = abs(rnorm(n))
})
f = function(data, ctrl = "1"){
feols(D ~ .[ctrl] | id + period,
data = data, weights = ~abs(rand_wei))
}
est1 = f(base)
est2 = f(base, ~x1) # with a formula
est3 = f(base, c("x1", "x2")) # with a character vector
etable(est1, est2, est3)
#> est1 est2 est3
#> Dependent Var.: D D D
#>
#> x1 0.0816 (0.0619) 0.0791 (0.0618)
#> x2 -0.0157 (0.0186)
#> Fixed-Effects: -------- --------------- ----------------
#> id Yes Yes Yes
#> period Yes Yes Yes
#> _______________ ________ _______________ ________________
#> S.E. type -- by: id by: id
#> Observations 1,080 1,080 1,080
#> R2 0.12810 0.13005 0.13094
#> Within R2 -- 0.00224 0.00326
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
EDIT: note that the formula thing (est2) only works with version >= 0.11.0.

Related

How can the effect size of a PERMANOVA be calculated?

I use the "vegan" package to perform a PERMANOVA (adonis2()), and I also want to calculate the effect size (ω²). For this, I tried to use omega_squared() from the "effectsize" package, but I failed. I think it does not understand the output table, specifically the part with the mean squares. Is it possible to fix this or do I have to calculate manually?
library(vegan)
#> Lade nötiges Paket: permute
#> Lade nötiges Paket: lattice
#> This is vegan 2.6-4
library(effectsize)
data(dune)
data(dune.env)
ado <- adonis2(dune ~ Management, data = dune.env, permutations = 100)
ado
#> Permutation test for adonis under reduced model
#> Terms added sequentially (first to last)
#> Permutation: free
#> Number of permutations: 100
#>
#> adonis2(formula = dune ~ Management, data = dune.env, permutations = 100)
#> Df SumOfSqs R2 F Pr(>F)
#> Management 3 1.4686 0.34161 2.7672 0.009901 **
#> Residual 16 2.8304 0.65839
#> Total 19 4.2990 1.00000
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
w2 <- omega_squared(ado)
#> Error in `[[<-.data.frame`(`*tmp*`, "Mean_Square", value = numeric(0)): Ersetzung hat 0 Zeilen, Daten haben 3
interpret_omega_squared(w2)
#> Error in interpret(es, rules): Objekt 'w2' nicht gefunden
Created on 2022-11-15 with reprex v2.0.2
EDIT
I tried to do it manually:
library(vegan, quietly = T, warn.conflicts = F)
#> This is vegan 2.6-4
library(effectsize)
library(dplyr, quietly = T, warn.conflicts = F)
library(tibble)
library(purrr)
data(dune)
data(dune.env)
ado <- adonis2(dune ~ Management, data = dune.env, permutations = 100)
w2 <- omega_squared(ado) # Does not work
#> Error in `[[<-.data.frame`(`*tmp*`, "Mean_Square", value = numeric(0)): Ersetzung hat 0 Zeilen, Daten haben 3
interpret_omega_squared(w2) # Does not work
#> Error in interpret(es, rules): Objekt 'w2' nicht gefunden
ado_tidy <- tibble( # manually create Adonis test result table
parameter = c("Management", "Residual", "Total"),
df = ado %>% pull("Df"), # Degree of freedom
ss = ado %>% pull("SumOfSqs"), # sum of squares
meansqs = ss / df, # mean squares
p_r2 = ado %>% pull("R2"), # partial R²
f = ado %>% pull("F"), # F value
p = ado %>% pull("Pr(>F)") # p value
)
ado_tidy
#> # A tibble: 3 x 7
#> parameter df ss meansqs p_r2 f p
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Management 3 1.47 0.490 0.342 2.77 0.00990
#> 2 Residual 16 2.83 0.177 0.658 NA NA
#> 3 Total 19 4.30 0.226 1 NA NA
# Formula:
# W2 = (DFm * (F - 1)) / ((DFm * (F - 1)) + (DFm + 1))
W2 <- abs(
(ado_tidy %>% pull(df) %>% chuck(3) * (ado_tidy %>% pull(f) %>% chuck(1) - 1)) /
((ado_tidy %>% pull(df) %>% chuck(3) * (ado_tidy %>% pull(f) %>% chuck(1) - 1) +
ado_tidy %>% pull(df) %>% chuck(3) + 1)
)
)
W2
#> [1] 0.6267099
interpret_omega_squared(W2, rules = "field2013")
#> [1] "large"
#> (Rules: field2013)
Created on 2022-11-15 with reprex v2.0.2
Hopefully, the equation is correct...
Here is the MicEco::adonis_OmegaSq function edited so that it works both with the current vegan::adonis2 and deprecated vegan::adonis:
#' Calculate (partial) Omega-squared (effect-size calculation) for PERMANOVA and add it to the input object
#'
#' #param adonisOutput An adonis object
#' #param partial Should partial omega-squared be calculated (sample size adjusted). Default TRUE
#' #return Original adonis object with the (partial) Omega-squared values added
#' #import vegan
#' #export
adonis_OmegaSq <- function(adonisOutput, partial = TRUE){
if(!(is(adonisOutput, "adonis") || is(adonisOutput, "anova.cca")))
stop("Input should be an adonis object")
if (is(adonisOutput, "anova.cca")) {
aov_tab <- adonisOutput
aov_tab$MeanSqs <- aov_tab$SumOfSqs / aov_tab$Df
aov_tab$MeanSqs[length(aov_tab$Df)] <- NA
} else {
aov_tab <- adonisOutput$aov.tab
}
heading <- attr(aov_tab, "heading")
MS_res <- aov_tab[pmatch("Residual", rownames(aov_tab)), "MeanSqs"]
SS_tot <- aov_tab[rownames(aov_tab) == "Total", "SumsOfSqs"]
N <- aov_tab[rownames(aov_tab) == "Total", "Df"] + 1
if(partial){
omega <- apply(aov_tab, 1, function(x) (x["Df"]*(x["MeanSqs"]-MS_res))/(x["Df"]*x["MeanSqs"]+(N-x["Df"])*MS_res))
aov_tab$parOmegaSq <- c(omega[1:(length(omega)-2)], NA, NA)
} else {
omega <- apply(aov_tab, 1, function(x) (x["SumsOfSqs"]-x["Df"]*MS_res)/(SS_tot+MS_res))
aov_tab$OmegaSq <- c(omega[1:(length(omega)-2)], NA, NA)
}
if (is(adonisOutput, "adonis"))
cn_order <- c("Df", "SumsOfSqs", "MeanSqs", "F.Model", "R2",
if (partial) "parOmegaSq" else "OmegaSq", "Pr(>F)")
else
cn_order <- c("Df", "SumOfSqs", "F", if (partial) "parOmegaSq" else "OmegaSq",
"Pr(>F)")
aov_tab <- aov_tab[, cn_order]
attr(aov_tab, "names") <- cn_order
attr(aov_tab, "heading") <- heading
if (is(adonisOutput, "adonis"))
adonisOutput$aov.tab <- aov_tab
else
adonisOutput <- aov_tab
return(adonisOutput)
}
source() this function and it should work. In my test it gave the same results for both adonis2 and adonis.

Error in MEEM(object, conLin, control$niterEM) / fixed-effect model matrix is rank deficient

I'm trying to make a multilevel mediation analysis (as done here and here).
library(data.table)
library(lme4)
library(nlme)
library(magrittr)
library(dplyr)
set.seed(1)
# Simulated data ------------------------------------------------------------------
dt_1 <- data.table(id = rep(1:10, each=4),
time = as.factor(rep(1:4, 10)),
x = rnorm(40),
m = rnorm(40),
y = rnorm(40))
# Melt m and y into z ------------------------------------------------------------------
dt_2 <- dt_1 %>%
mutate(mm = m, .after=x) %>%
melt(id.vars = c("id","time","x","mm"),
na.rm = F,
variable.name = "dv",
value.name = "z") %>%
within({
dy <- as.integer(dv == "y")
dm <- as.integer(dv == "m")
}) %>%
arrange(id,time)
> head(dt_2,4)
id time x mm dv z dm dy
1: 1 1 -0.6264538 -0.1645236 m -0.1645236 1 0
2: 1 1 -0.6264538 -0.1645236 y -0.5686687 0 1
3: 1 2 0.1836433 -0.2533617 m -0.2533617 1 0
4: 1 2 0.1836433 -0.2533617 y -0.1351786 0 1
# lme mediation model ------------------------------------------------------------------
model_lme <- lme(fixed = z ~ 0 +
dm + dm:x + dm:time + #m as outcome
dy + dy:mm + dy:x + dy:time, #y as outcome
random = ~ 0 + dm:x + dy:mm + dy:x | id,
weights = varIdent(form = ~ 1 | dm), #separate sigma^{2}_{e} for each outcome
data = dt_2,
na.action = na.exclude)
Error in MEEM(object, conLin, control$niterEM): Singularity in backsolve at level 0, block 1
# lmer mediation model ------------------------------------------------------------------
model_lmer <- lmer(z ~ 0 + dm + dm:x + dm:time + dy + dy:mm + dy:x + dy:time +
(0 + dm:x + dy:mm + dy:x | id) + (0 + dm | time),
data = dt_2,
na.action = na.exclude)
fixed-effect model matrix is rank deficient so dropping 1 column / coefficient
I've seen some posts about these error (nlme) / warning (lme4) (eg, this and this), but I didn't figure out what is the problem here.
I checked
X <- model.matrix(~0 + dm + dm:x + dm:time + dy + dy:mm + dy:x + dy:time, data=dt_2)
> caret::findLinearCombos(X)
$linearCombos
$linearCombos[[1]]
[1] 7 1 4 5 6
$remove
[1] 7
but I don't quite understand the output.
From the summary of model_lmer I verify that dm:time4 and time1:dy coefficients are missing, but why? There are all the possible combinations (0/0, 0/1, 1/0, 1/1) in the dataset.
Fixed effects:
Estimate Std. Error t value
dm 0.30898 0.92355 0.335
dy 0.03397 0.27480 0.124
dm:x 0.21267 0.19138 1.111
dm:time1 -0.19713 1.30583 -0.151
dm:time2 -0.30206 1.30544 -0.231
dm:time3 -0.20828 1.30620 -0.159
dy:mm 0.22625 0.18728 1.208
x:dy -0.37747 0.17130 -2.204
time2:dy 0.29894 0.39123 0.764
time3:dy 0.22640 0.39609 0.572
time4:dy -0.16758 0.39457 -0.425
On the other hand, using time as numeric gives no error/warning:
# lmer mediation model - time as numeric
model_lmer2 <- lmer(z ~ 0 + dm + dm:x + dm:time + dy + dy:mm + dy:x + dy:time +
(0 + dm:x + dy:mm + dy:x | id) + (0 + dm | time),
data = within(dt_2, time <- as.numeric(time)),
na.action = na.exclude)
It's true that one can know dm from dy (if one is 1 the other is 0), but if that was the problem, this last model (model_lmer2) would still give a warning, I guess.
In my real dataset, I could eventually use time as numeric (although not my first approach), but I would like to understand what the problem is with using it as categorical.
Thank you!
This is really a generic question about linear model construction/formulas in R: it's not mixed-model specific.
Let's look at the names of the columns involved in the multicollinear combination of variables (i.e. columns 7, 1, 4, 5, 6):
cc <- caret::findLinearCombos(X)
colnames(X)[cc$linearCombos[[1]]]
## [1] "dm:time4" "dm" "dm:time1" "dm:time2" "dm:time3"
This is telling us that the main effect of dm is confounded with the dm:time interaction; once we know dm:time[i] for all levels of i, the main effect of dm is redundant.
It's too bad that lme doesn't automatically drop columns to adjust for multicollinearity as lmer does, and that lmer doesn't have a super-convenient way to model heteroscedasticity à la varIdent(); it is possible but it's a nuisance. It would be possible to build the auto-dropping into nlme, or glmmTMB (which can also model heteroscedasticity easily), but no-one's gotten around to it yet).
... if you're OK with specifying dm:time and leaving dm out of your model then that might be easiest!
You can experiment with what happens with different model specifications:
lc <- function(f) {
X <- model.matrix(f, dt_2)
cc <- caret::findLinearCombos(X)
lapply(cc$linearCombos, function(z) colnames(X)[z])
}
lc(~0 + dm + dm:time)
lc(~0 + dy + dy:time)
lc(~0 + dm + dm:time + dy + dy:time)
lc(~0 + dy + dy:time + dm + dm:time)
or similar stuff looking at the (heads of) the model matrices, column names of the model matrices, etc..

Convert part of a statistical function's output into a data.frame

I was wondering if there might be a way to turn the following part of the OUTPUT of the res and res2 objects into a data.frame?
Note: answer below works with res but not res2.
A functional answer is appreciated as the data below is just toy.
library(metafor)
dat <- dat.konstantopoulos2011
res <- rma.mv(yi, vi, random = ~ 1 | district/school, data=dat)
#== OUTPUT (CAN WE TURN ONLY BELOW PART INTO A data.frame?):
#Variance Components:
# estim sqrt nlvls fixed factor
#sigma^2.1 0.0651 0.2551 11 no district
#sigma^2.2 0.0327 0.1809 56 no district/school
#Test for Heterogeneity:
#Q(df = 55) = 578.8640, p-val < .0001
# AND
res2 <- rma.mv(yi, vi, random = ~ factor(school) | district, data=dat)
#== OUTPUT (CAN WE TURN ONLY BELOW PART INTO A data.frame?):
#Variance Components:
#outer factor: district (nlvls = 11)
#inner factor: factor(school) (nlvls = 11)
# estim sqrt fixed
#tau^2 0.0978 0.3127 no
#rho 0.6653 no
#Test for Heterogeneity:
#Q(df = 55) = 578.8640, p-val < .0001
If there is no default/standard way to extract the data then you can manipulate the output using capture.output.
return_data <- function(res) {
tmp <- capture.output(res)
#data start from second line after "Variance Components:"
start <- which(tmp == "Variance Components:") + 2
index <- which(tmp == "")
#Data ends before the empty line after "Variance Components:"
end <- index[which.max(index > start)] - 1
data <- read.table(text = paste0(tmp[start:end], collapse = '\n'), header = T)
heterogeneity_index <- which(tmp == "Test for Heterogeneity:") + 1
list(data = data, heterogeneity = tmp[heterogeneity_index])
}
res <- rma.mv(yi, vi, random = ~ 1 | district/school, data=dat)
return_data(res)
#$data
# estim sqrt nlvls fixed factor
#sigma^2.1 0.0651 0.2551 11 no district
#sigma^2.2 0.0327 0.1809 56 no district/school
#$heterogeneity
#[1] "Q(df = 55) = 578.8640, p-val < .0001"
Would this suit your purposes? The 'Test for Heterogeneity' doesn't really fit in the dataframe, so I added it as a seperate column and it gets duplicated as a result. I'm not sure how else you could do it.
library(tidyverse)
#install.packages("metafor")
library(metafor)
#> Loading required package: Matrix
#>
#> Attaching package: 'Matrix'
#> The following objects are masked from 'package:tidyr':
#>
#> expand, pack, unpack
#>
#> Loading the 'metafor' package (version 3.0-2). For an
#> introduction to the package please type: help(metafor)
dat <- dat.konstantopoulos2011
res <- rma.mv(yi, vi, random = ~ 1 | district/school, data=dat)
res
#>
#> Multivariate Meta-Analysis Model (k = 56; method: REML)
#>
#> Variance Components:
#>
#> estim sqrt nlvls fixed factor
#> sigma^2.1 0.0651 0.2551 11 no district
#> sigma^2.2 0.0327 0.1809 56 no district/school
#>
#> Test for Heterogeneity:
#> Q(df = 55) = 578.8640, p-val < .0001
#>
#> Model Results:
#>
#> estimate se zval pval ci.lb ci.ub
#> 0.1847 0.0846 2.1845 0.0289 0.0190 0.3504 *
#>
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vc <- cbind(estim = res$sigma2,
sqrt = res$sigma,
nlvls = res$s.nlevels,
fixed = ifelse(res$vc.fix$sigma2, "yes", "no"),
factor = res$s.names,
R = ifelse(res$Rfix, "yes", "no"),
Test_for_heterogeneity = paste0("Q(df = ", res$k - res$p, ") = ", metafor:::.fcf(res$QE, res$digits[["test"]]), ", p-val ", metafor:::.pval(res$QEp,
res$digits[["pval"]], showeq = TRUE, sep = " "))
)
rownames(vc) <- c("sigma^2.1", "sigma^2.2")
result <- as.data.frame(vc)
result
#> estim nlvls fixed factor R Test_for_heterogeneity
#> sigma^2.1 "0.0650619442753117" "11" "no" "district" "no" "Q(df = 55) = 578.8640, p-val < .0001"
#> sigma^2.2 "0.0327365170279351" "56" "no" "district/school" "no" "Q(df = 55) = 578.8640, p-val < .0001"
Created on 2021-10-06 by the reprex package (v2.0.1)

Use quasiquotation for formula syntax in a user-created function?

When I run this code:
# Create example data
df <- tibble(age=rnorm(10),
income=rnorm(10))
make_model <- function(response_var, df){
# Create formula
form <- as.formula(response_var ~ .)
# Create model
model <- lm(form , data=df)
# Return coefficients
return(coef(model))
}
make_model(income, df)
I obtain the following error
Error in eval(predvars, data, env) : object 'income' not found
How can I make this function work using quasiquotation? I assume the logic is the same as how we can call library(dplyr) instead of library("dplyr").
Use blast() (to be included in rlang 0.5.0)
blast <- function(expr, env = caller_env()) {
eval_bare(enexpr(expr), env)
}
make_model <- function(data, column) {
f <- blast(!!enexpr(column) ~ .)
model <- lm(f, data = data)
coef(model)
}
df <- data.frame(
age = rnorm(10),
income = rnorm(10)
)
make_model(df, income)
#> (Intercept) age
#> -0.3563103 -0.2200773
Works flexibly:
blast(list(!!!1:3))
#> [[1]]
#> [1] 1
#>
#> [[2]]
#> [1] 2
#>
#> [[3]]
#> [1] 3
The following should work:
library(tidyverse)
# Your original function, modified
make_model <- function(df, column) {
column <- enexpr(column)
form <- as.formula(paste0(quo_text(column), " ~ ."))
model <- lm(form, data = df)
return(coef(model))
}
# Your original data and call
tibble(
age = rnorm(10),
income = rnorm(10)
) %>%
make_model(income)

How to get survdiff returned p value

I am using R survival package, survdiff function. I wonder how to get the p value from the return value.
> diff = survdiff(Surv(Time, Censored) ~ Treatment+Gender, data = dat)
> diff
Call:
survdiff(formula = Surv(Time, Censored) ~ Treatment + Gender,
data = dat)
N Observed Expected (O-E)^2/E (O-E)^2/V
Treatment=Control, Gender=M 2 1 1.65 0.255876 0.360905
Treatment=Control, Gender=F 7 3 2.72 0.027970 0.046119
Treatment=IND, Gender=M 5 2 2.03 0.000365 0.000519
Treatment=IND, Gender=F 6 2 1.60 0.100494 0.139041
Chisq= 0.5 on 3 degrees of freedom, p= 0.924
I want to get the p value 0.924 using some function. Thanks.
The p value is not stored in the survdiff class, so it must be calculated on the fly at the time of output. To reproduce the p value one could use the chisq distribution function: "pchisq"
diff = survdiff(Surv(Time, Censored) ~ Treatment+Gender, data = dat)
pchisq(diff$chisq, length(diff$n)-1, lower.tail = FALSE)
The code in the function print.survdiff that displays the p-value is:
cat("\n Chisq=", format(round(x$chisq, 1)), " on", df,
"degrees of freedom, p=", format(signif(1 - pchisq(x$chisq,
df), digits)), "\n")
The code leading up to it:
if (is.matrix(x$obs)) {
otmp <- apply(x$obs, 1, sum)
etmp <- apply(x$exp, 1, sum)
} else {
otmp <- x$obs
etmp <- x$exp
}
df <- (sum(1 * (etmp > 0))) - 1
And 'digits' is set to 3 in the argument list, so using the example on the surv.diff help page:
x <- survdiff(Surv(time, status) ~ pat.karno + strata(inst), data=lung)
cat( "p=", format(signif(1 - pchisq(x$chisq,
df), digits)) )
#p= 0.00326
Addressing the comment: In the example the second code block reduces to:
df <- with(x, (sum(1 * (apply(x$exp, 1, sum) > 0))) - 1 )
> df
[1] 7
With the glance() function from the broom package, it is very easy to get the p.value.
diff = survdiff(Surv(Time, Censored) ~ Treatment+Gender, data = dat)
broom::glance(diff)$p.value

Resources