formula error inside function - r

I want use survfit() and basehaz() inside a function, but they do not work. Could you take a look at this problem. Thanks for your help. The following code leads to the error:
library(survival)
n <- 50 # total sample size
nclust <- 5 # number of clusters
clusters <- rep(1:nclust,each=n/nclust)
beta0 <- c(1,2)
set.seed(13)
#generate phmm data set
Z <- cbind(Z1=sample(0:1,n,replace=TRUE),
Z2=sample(0:1,n,replace=TRUE),
Z3=sample(0:1,n,replace=TRUE))
b <- cbind(rep(rnorm(nclust),each=n/nclust),rep(rnorm(nclust),each=n/nclust))
Wb <- matrix(0,n,2)
for( j in 1:2) Wb[,j] <- Z[,j]*b[,j]
Wb <- apply(Wb,1,sum)
T <- -log(runif(n,0,1))*exp(-Z[,c('Z1','Z2')]%*%beta0-Wb)
C <- runif(n,0,1)
time <- ifelse(T<C,T,C)
event <- ifelse(T<=C,1,0)
mean(event)
phmmd <- data.frame(Z)
phmmd$cluster <- clusters
phmmd$time <- time
phmmd$event <- event
fmla <- as.formula("Surv(time, event) ~ Z1 + Z2")
BaseFun <- function(x){
start.coxph <- coxph(x, phmmd)
print(start.coxph)
betahat <- start.coxph$coefficient
print(betahat)
print(333)
print(survfit(start.coxph))
m <- basehaz(start.coxph)
print(m)
}
BaseFun(fmla)
Error in formula.default(object, env = baseenv()) : invalid formula
But the following function works:
fit <- coxph(fmla, phmmd)
basehaz(fit)

It is a problem of scoping.
Notice that the environment of basehaz is:
environment(basehaz)
<environment: namespace:survival>
meanwhile:
environment(BaseFun)
<environment: R_GlobalEnv>
Therefore that is why the function basehaz cannot find the local variable inside the function.
A possible solution is to send x to the top using assign:
BaseFun <- function(x){
assign('x',x,pos=.GlobalEnv)
start.coxph <- coxph(x, phmmd)
print(start.coxph)
betahat <- start.coxph$coefficient
print(betahat)
print(333)
print(survfit(start.coxph))
m <- basehaz(start.coxph)
print(m)
rm(x)
}
BaseFun(fmla)
Other solutions may involved dealing with the environments more directly.

I'm following up on #moli's comment to #aatrujillob's answer. They were helpful so I thought I would explain how it solved things for me and a similar problem with the rpart and partykit packages.
Some toy data:
N <- 200
data <- data.frame(X = rnorm(N),W = rbinom(N,1,0.5))
data <- within( data, expr = {
trtprob <- 0.4 + 0.08*X + 0.2*W -0.05*X*W
Trt <- rbinom(N, 1, trtprob)
outprob <- 0.55 + 0.03*X -0.1*W - 0.3*Trt
Outcome <- rbinom(N,1,outprob)
rm(outprob, trtprob)
})
I want to split the data to training (train_data) and testing sets, and train the classification tree on train_data.
Here's the formula I want to use, and the issue with the following example. When I define this formula, the train_data object does not yet exist.
my_formula <- Trt~W+X
exists("train_data")
# [1] FALSE
exists("train_data", envir = environment(my_formula))
# [1] FALSE
Here's my function, which is similar to the original function. Again,
badFunc <- function(data, my_formula){
train_data <- data[1:100,]
ct_train <- rpart::rpart(
data= train_data,
formula = my_formula,
method = "class")
ct_party <- partykit::as.party(ct_train)
}
Trying to run this function throws an error similar to OP's.
library(rpart)
library(partykit)
bad_out <- badFunc(data=data, my_formula = my_formula)
# Error in is.data.frame(data) : object 'train_data' not found
# 10. is.data.frame(data)
# 9. model.frame.default(formula = Trt ~ W + X, data = train_data,
# na.action = function (x) {Terms <- attr(x, "terms") ...
# 8. stats::model.frame(formula = Trt ~ W + X, data = train_data,
# na.action = function (x) {Terms <- attr(x, "terms") ...
# 7. eval(expr, envir, enclos)
# 6. eval(mf, env)
# 5. model.frame.rpart(obj)
# 4. model.frame(obj)
# 3. as.party.rpart(ct_train)
# 2. partykit::as.party(ct_train)
# 1. badFunc(data = data, my_formula = my_formula)
print(bad_out)
# Error in print(bad_out) : object 'bad_out' not found
Luckily, rpart() is like coxph() in that you can specify the argument model=TRUE to solve these issues. Here it is again, with that extra argument.
goodFunc <- function(data, my_formula){
train_data <- data[1:100,]
ct_train <- rpart::rpart(
data= train_data,
## This solved it for me
model=TRUE,
##
formula = my_formula,
method = "class")
ct_party <- partykit::as.party(ct_train)
}
good_out <- goodFunc(data=data, my_formula = my_formula)
print(good_out)
# Model formula:
# Trt ~ W + X
#
# Fitted party:
# [1] root
# | [2] X >= 1.59791: 0.143 (n = 7, err = 0.9)
##### etc
documentation for model argument in rpart():
model:
if logical: keep a copy of the model frame in the result? If
the input value for model is a model frame (likely from an earlier
call to the rpart function), then this frame is used rather than
constructing new data.
Formulas can be tricky as they use lexical scoping and environments in a way that is not always natural (to me). Thank goodness Terry Therneau has made our lives easier with model=TRUE in these two packages!

Related

How can I pass weights in a variable to rpart?

For some reason, the rpart from the rpart package can't see a variable defined in the context from which it is called. You can see in the reprex below that wts is defined just before the call to rpart, but when I call rpart, I get the error "object 'wts' not found".
If I omit the weights argument, there is no problem.
library(rpart)
data(mpg, package = "ggplot2")
scale <- function(x) {
x/sum(x)
}
fit_rpart <- function(formula, data, iters = 10) {
data <- as.data.frame(data)
models <- list()
for (i in 1:iters) {
wts <- scale(runif(nrow(data)))
print(head(wts))
models[[i]] <- rpart(formula = formula,
data = data, weights = wts,
method = "class")
}
return(models)
}
results <- fit_rpart(cyl == 4 ~ drv + cty + fl + class, mpg)
#> [1] 0.0072092177 0.0059019498 0.0007893446 0.0038617957 0.0067420603
#> [6] 0.0076892493
#> Error in eval(extras, data, env): object 'wts' not found
Created on 2022-09-28 by the reprex package (v2.0.1)
It seems the answer is similar to that provided here. (Similar, but not the same, as this issue relates to rpart, not lm. Not sure how to generalize the question to cover both.)
If I put these two lines at the top of fit_rpart, the code runs fine.
formula <- as.formula(formula)
environment(formula) <- environment()
It seems to be a name scoping problem as you said. You can also use <<- to pass an affectation to the parent environment.
(Also, data and formula are already existing objects in R so to be clear one could prefer to rename them).
library(rpart)
data(mpg, package = "ggplot2")
scale <- function(x) {
x/sum(x)
}
fit_rpart <- function(f, d, iters = 10) {
d <- as.data.frame(d)
models <- list()
for (i in 1:10) {
wts <<- scale(runif(nrow(d)))
print(head(wts))
models[[i]] <- rpart(formula = f,
data = d,
weights = wts,
method = "class")
}
return(models)
}
results <- fit_rpart(cyl == 4 ~ drv + cty + fl + class, mpg)
# [1] 0.0007921094 0.0039999229 0.0026862458 0.0018832820 0.0006866826 0.0076998391
# [1] 0.005962060 0.006942240 0.001572535 0.009360314 0.005485438 0.008135806
# [1] 0.007224990 0.004209857 0.007706282 0.007071345 0.003784652 0.006335056
# [1] 0.004098536 0.003289626 0.006710783 0.007364727 0.003099702 0.007693150
# [1] 0.005154814 0.001212012 0.001169259 0.005829825 0.004401704 0.004269959
# [1] 0.001262500 0.003705485 0.007466314 0.005450551 0.001292365 0.007920012
# [1] 0.0007083308 0.0033698483 0.0073883706 0.0013445097 0.0068669108 0.0048488413
# [1] 0.005139870 0.002053938 0.002125759 0.006488419 0.007129400 0.006937384
# [1] 0.006957664 0.006296365 0.000640707 0.008121049 0.008014404 0.007706194

Apply logistic regression in a function in R

I want to run logistic regression for multiple parameters and store the different metrics i.e AUC.
I wrote the function below but I get an error when I call it: Error in eval(predvars, data, env) : object 'X0' not found even if the variable exists in both my training and testing dataset. Any idea?
new.function <- function(a) {
model = glm(extry~a,family=binomial("logit"),data = train_df)
pred.prob <- predict(model,test_df, type='response')
predictFull <- prediction(pred.prob, test_df$extry)
auc_ROCR <- performance(predictFull, measure = "auc")
my_list <- list("AUC" = auc_ROCR)
return(my_list)
}
# Call the function new.function supplying 6 as an argument.
les <- new.function(X0)
The main reason why your function didn't work is that you are trying to call an object into a formula. You can fix it with paste formula function, but that is ultimately quite limiting.
I suggest instead that you consider using update. This allow you more flexibility to change with multiple variable combination, or change a training dataset, without breaking the function.
model = glm(extry~a,family=binomial("logit"),data = train_df)
new.model = update(model, .~X0)
new.function <- function(model){
pred.prob <- predict(model, test_df, type='response')
predictFull <- prediction(pred.prob, test_df$extry)
auc_ROCR <- performance(predictFull, measure = "auc")
my_list <- list("AUC" = auc_ROCR)
return(my_list)
}
les <- new.function(new.model)
The function can be further improved by calling the test_df as a separate argument, so that you can fit it with an alternative testing data.
To run the function in the way you intended, you would need to use non-standard evaluation to capture the symbol and insert it in a formula. This can be done using match.call and as.formula. Here's a fully reproducible example using dummy data:
new.function <- function(a) {
# Convert symbol to character
a <- as.character(match.call()$a)
# Build formula from character strings
form <- as.formula(paste("extry", a, sep = "~"))
model <- glm(form, family = binomial("logit"), data = train_df)
pred.prob <- predict(model, test_df, type = 'response')
predictFull <- ROCR::prediction(pred.prob, test_df$extry)
auc_ROCR <- ROCR::performance(predictFull, "auc")
list("AUC" = auc_ROCR)
}
Now we can call the function in the way you intended:
new.function(X0)
#> $AUC
#> A performance instance
#> 'Area under the ROC curve'
new.function(X1)
#> $AUC
#> A performance instance
#> 'Area under the ROC curve'
If you want to see the actual area under the curve you would need to do:
new.function(X0)$AUC#y.values[[1]]
#> [1] 0.6599759
So you may wish to modify your function so that the list contains auc_ROCR#y.values[[1]] rather than auc_ROCR
Data used
set.seed(1)
train_df <- data.frame(X0 = sample(100), X1 = sample(100))
train_df$extry <- rbinom(100, 1, (train_df$X0 + train_df$X1)/200)
test_df <- data.frame(X0 = sample(100), X1 = sample(100))
test_df$extry <- rbinom(100, 1, (test_df$X0 + test_df$X1)/200)
Created on 2022-06-29 by the reprex package (v2.0.1)

How to put NA in a stored list of a For loop when no result

I've created a For loop to apply different models on a lot of variables :
my_data <- read.xlsx("phyla.xlsx")
start_variables <- 6
variables <- variable.names(my_data)
variables <- variables[-c(1:(start_variables-1))]
#make a list of data frame for each variable
DF.ls <- list()
for (i in start_variables:length(my_data)){
DF.ls[[i]] <- data.frame(Sample_ID=my_data$Sample_ID,Genotype=my_data$Genotype,
Substrate=my_data$Substrate, y=my_data[[i]])
}
DF.ls <- DF.ls[-1]; DF.ls <- DF.ls[-1]; DF.ls <- DF.ls[-1]; DF.ls <- DF.ls[-1]; DF.ls <- DF.ls[-1]
#model1
model1.ls <- list()
for (i in 1:length(DF.ls)){
model1.ls[[i]] <- gls(y~Substrate+Genotype+Substrate:Genotype, data=DF.ls[[i]])
}
AIC_model1_ls <- list()
for (i in 1:length(model1.ls)){
AIC_model1_ls[[i]] <- AIC(summary(model1.ls[[i]]))
}
AIC_model1 = unlist(AIC_model1_ls)
AIC_model1 = rbind(AIC_model1)
#model2
model2.ls <- list()
for (i in 1:length(DF.ls)){
model2.ls[[i]] <- gls(y~Substrate+Genotype+Substrate:Genotype, weights=varIdent(form=~1|Substrate), data=DF.ls[[i]])
}
AIC_model2_ls <- list()
for (i in 1:length(model2.ls)){
AIC_model2_ls[[i]] <- AIC(summary(model2.ls[[i]]))
}
AIC_model2 = unlist(AIC_model2_ls)
AIC_model2 = rbind(AIC_model2)
It may not be the best way to make it but it works just fine, except for when there are some treatments with zeros.
So my problem is that I have this error that occurs when there are zeros :
Error in gls(y ~ Substrate + Genotype + Substrate:Genotype, weights = varIdent(form = ~1 | : false convergence (8)
I'd like my For loop to put a NA in my list when this error occurs instead of nothing. How can I do that?
You can use the try function. This will capture an error without stopping your program. Here is an example since you did not provide a reproducible example:
# This model will fail since we have not assigned values to x and y
model <- try(lm(x ~ y), silent = TRUE)
# However check the output
model
> model
[1] "Error in eval(predvars, data, env) : object 'x' not found\n"
attr(,"class")
[1] "try-error"
attr(,"condition")
<simpleError in eval(predvars, data, env): object 'x' not found>
If you pass something valid to try you see that you will get the proper result
model <- try(lm(wt~ mpg, data = mtcars))
model
> model
Call:
lm(formula = wt ~ mpg, data = mtcars)
Coefficients:
(Intercept) mpg
6.0473 -0.1409
So with this in mind you could augment your code to do something like:
fit <- try(gls(y~Substrate+Genotype+Substrate:Genotype, weights=varIdent(form=~1|Substrate), data=DF.ls[[i]]), silent = TRUE)
if(class(fit)=="try-error"){
model1.ls[[i]] <- NA
} else {
model1.ls[[i]] <- fit
}

How should I call model.frame in R?

I am trying to write my own modeling function in R, one which takes a formula, some data, and maybe some extra context, like weights; after calling model.frame to extract the necessary numeric data, it will perform a fit. My first pass looked like:
my_modfunc <- function(formula,data,weights=NULL) {
mf <- model.frame(formula,data=data,weights=weights)
wt <- model.weights(mf)
# do some fitting here...
}
# make fake data to test it
set.seed(1234)
data <- data.frame(x1=rnorm(50),x2=rnorm(50),y=rnorm(50),w=runif(50))
# call it:
my_modfunc(y ~ x1 + x2,data=data,weights=w)
This fails, I get the error:
Error in model.frame.default(formula, data = data, weights = weights) :
invalid type (closure) for variable '(weights)'
Similarly, if I call
my_modfunc(y ~ x1 + x2,data=data,weights='w')
I get the same error. I suspect there is some problem with environment, quoting, and so on.
Cutting and pasting the source for lm, I could rewrite my function as
# based on lm
weird_modfunc <- function(formula,data,weights=NULL ) {
cl <- match.call() # what?
mf <- match.call(expand.dots = FALSE) # what??
m <- match(c("formula", "data", "weights"), names(mf), 0L)
mf <- mf[c(1L, m)] # ??
mf$drop.unused.levels <- TRUE # ??
mf[[1L]] <- quote(stats::model.frame) ## ???
mf <- eval(mf, parent.frame())
wt <- as.vector(model.weights(mf))
# do some fitting here...
}
# this runs without error:
weird_modfunc(y ~ x1 + x2,data=data,weights=w)
# this fails with the same error as above about variable lengths.
weird_modfunc(y ~ x1 + x2,data=data,weights='w')
The problem is that this contains multiple somewhat mystical incantations that I do not know how to interpret, modify or maintain.
What is the right way to call model.frame? Bonus points for making my function accept both weights=w and weights='w'
Welcome to the joys of non-standard evaluation. I suggest you base your function on the lm approach. It constructs a call to model.frame and evaluates it. That's necessary, because model.frame does non-standard evaluation, i.e., it accepts/expects a symbol for the weights parameter. Furthermore, it also ensures correct scoping regarding the formula's environment.
weird_modfunc <- function(formula,data,weights=NULL ) {
#cl not needed, lm only adds this call to the return object
mf <- match.call(expand.dots = FALSE)
message("Call with ellipses not expanded: ")
#note that there are no ellipses in the function arguments for now,
#but you might want to change that later
print(mf)
#turn weights into symbol if character is passed
if (is.character(mf$weights)) mf$weights <- as.symbol(mf$weights)
m <- match(c("formula", "data", "weights"), names(mf), 0L)
message("Position of formula, data and weights in the call:")
print(m)
mf <- mf[c(1L, m)]
message("New call that only contains what is needed:")
print(mf)
mf$drop.unused.levels <- TRUE
message("Call with argument added:")
print(mf)
mf[[1L]] <- quote(stats::model.frame)
message("Change call to a call to model.frame:")
print(mf)
mf <- eval(mf, parent.frame()) #evaluate call
wt <- as.vector(model.weights(mf))
# do some fitting here...
message("Return value:")
wt
}
# this runs without error:
weird_modfunc(y ~ x1 + x2,data=data,weights=w)
#Call with ellipses not expanded:
#weird_modfunc(formula = y ~ x1 + x2, data = data, weights = w)
#Position of formula, data and weights in the call
#[1] 2 3 4
#New call that only contains what is needed:
#weird_modfunc(formula = y ~ x1 + x2, data = data, weights = w)
#Call with argument added:
#weird_modfunc(formula = y ~ x1 + x2, data = data, weights = w,
# drop.unused.levels = TRUE)
#Change call to a call to model.frame:
#stats::model.frame(formula = y ~ x1 + x2, data = data, weights = w,
# drop.unused.levels = TRUE)
#Return value:
# [1] 0.35299850 0.98095832 0.53888276 0.44403386 0.94936678 0.45248337 0.19062580 0.99160915 0.54845545 0.76881577 0.91342167 0.68211200 0.40725142
#[14] 0.40759230 0.14608279 0.19666771 0.19220934 0.40841440 0.34822131 0.83454285 0.19840001 0.86180531 0.39718531 0.15325377 0.33928338 0.36718044
#[27] 0.42737908 0.18633690 0.65801660 0.92041138 0.73389406 0.88231927 0.95334653 0.19490154 0.47261674 0.38605066 0.37416586 0.02785566 0.92935521
#[40] 0.41052928 0.95584022 0.27215284 0.51724649 0.97830984 0.36969649 0.31043044 0.03420963 0.66756585 0.92091638 0.04498960
#this runs without error too:
weird_modfunc(y ~ x1 + x2,data=data,weights='w')
Here is a simpler version but there might be problems (well, more than usual with non-standard evaluation):
my_modfunc <- function(formula,data,weights=NULL) {
weights <- substitute(weights)
if (!is.symbol(weights)) weights <- as.symbol(weights)
#substitute the symbol into the call:
mf <- eval(substitute(model.frame(formula,data=data,weights=weights)))
wt <- model.weights(mf)
# do some fitting here...
wt
}
my_modfunc(y ~ x1 + x2,data=data,weights=w)
#works
my_modfunc(y ~ x1 + x2,data=data,weights="w")
#works

pgmm from plm package gives error for summary

I am trying to use the pgmm function from the plm package for R. The regression runs and I can call up the results, however, asking for the summary gives the following error:
Error in t(y) %*% x : non-conformable arguments
I've imported the data from the World Bank using the WDI package:
library(plm) # load package
library(WDI) # Load package
COUNTRIES <- c("AGO","BEN","BWA","BFA","BDI") # Specify countries
INDICATORS <- c("NY.GDP.PCAP.KN", "SP.DYN.TFRT.IN", "SP.DYN.CBRT.IN", "SP.POP.TOTL") # Specify indicators
LONG <- WDI(country=COUNTRIES, indicator=INDICATORS, start=2005, end=2009, extra=FALSE) # Load data
PANEL <- pdata.frame(LONG, c("iso2c","year")) # Transform to PANEL dataframe
PANEL$year <- as.numeric(as.character(PANEL$year)) # Encode year
EQ <- pgmm( log(fertility) ~ log(gdp) + lag(log(fertility), 2) | lag(log(fertility), 2), data=PANEL, effect="twoways", model="twosteps", gmm.inst=~log(fertility) ) # Run regression
Calling the results as follows works.
EQ
But the summary (below) gives the error message mentioned above.
summary(EQ)
I think the error occurs because summary.pgmm tries to do a second order Arelland-Bond test of serial correlation on your data, but your data only have two points (2008 and 2009) so it fails.
To fix this problem, you could patch the function so that it checks whether you only have two points in the data set and runs the test only if you have more than two points. I provide a patched function below:
summary.pgmm.patched <- function (object, robust = FALSE, time.dummies = FALSE, ...)
{
model <- plm:::describe(object, "model")
effect <- plm:::describe(object, "effect")
transformation <- plm:::describe(object, "transformation")
if (robust) {
vv <- vcovHC(object)
}
else {
vv <- vcov(object)
}
if (model == "onestep")
K <- length(object$coefficients)
else K <- length(object$coefficients[[2]])
Kt <- length(object$args$namest)
if (!time.dummies && effect == "twoways")
rowsel <- -c((K - Kt + 1):K)
else rowsel <- 1:K
std.err <- sqrt(diag(vv))
b <- coef(object)
z <- b/std.err
p <- 2 * pnorm(abs(z), lower.tail = FALSE)
CoefTable <- cbind(b, std.err, z, p)
colnames(CoefTable) <- c("Estimate", "Std. Error", "z-value",
"Pr(>|z|)")
object$CoefTable <- CoefTable[rowsel, , drop = FALSE]
object$sargan <- sargan(object)
object$m1 <- plm:::mtest(object, 1, vv)
# The problem line:
# object$m2 <- mtest(object, 2, vv)
if (length(object$residuals[[1]] ) > 2) object$m2 <- plm:::mtest(object, 2, vv)
object$wald.coef <- plm:::wald(object, "param", vv)
if (plm:::describe(object, "effect") == "twoways")
object$wald.td <- plm:::wald(object, "time", vv)
class(object) <- "summary.pgmm"
object
}
You might want to write to the author of the plm package and show him this post. The author will be able to write a less 'hacky' patch.
Using your own (slightly modified) example data, here is how you would use the function:
library(WDI) # Load package
library(plm)
COUNTRIES <- c("AGO","BEN","BWA","BFA","BDI") # Specify countries
INDICATORS <- c("NY.GDP.PCAP.KN", "SP.DYN.TFRT.IN", "SP.DYN.CBRT.IN", "SP.POP.TOTL") # Specify indicators
LONG <- WDI(country=COUNTRIES, indicator=INDICATORS, start=2005, end=2009, extra=FALSE) # Load data
PANEL <- pdata.frame(LONG, c("iso2c","year")) # Transform to PANEL dataframe
PANEL$year <- as.numeric(as.character(PANEL$year)) # Encode year
names(PANEL) [c(4,5)] = c('gdp','fertility')
EQ <- pgmm( log(fertility) ~ log(gdp) + lag(log(fertility), 2) | lag(log(fertility), 2), data=PANEL, effect="twoways", model="twosteps", gmm.inst=~log(fertility) ) # Run regression
summary.pgmm.patched(EQ)

Resources