Using $ to refer to multiple variables in user defined function R

Using $ to refer to multiple variables in user defined function R - r

I am trying to run a function which calculates the marginal effects for different mixed effects models, based on two different main predictors (var1 vs. var2). The original code can be found here:
https://stats.idre.ucla.edu/r/dae/mixed-effects-logistic-regression/. Below is a reproducible example:
I create a dataframe (ex):
time <- seq(from = 1, to = 500, by =1)
var1 <- factor(sample(0:1, 500, replace = TRUE))
var2 <- factor(sample(0:1, 500, replace = TRUE))
var3 <- sample(1:500, 500, replace = TRUE)
group <- rep(1001:1005, 500)
out <- sample(0:1, 500, replace = TRUE)
group <- as.factor(group)
ex <- data.frame(time,var1,var2,var3,group,out)
Run the models:
m1a <- glmer(out ~ time + var1 + (1|group), data=ex, family = binomial(link = "logit"), nAGQ = 1,
control = glmerControl(calc.derivs = FALSE))
m1b <- glmer(out ~ time + var2 + (1|group), data=ex, family = binomial(link = "logit"), nAGQ = 1,
control = glmerControl(calc.derivs = FALSE))
Create subsets of the data with only the predictors for complete cases:
sub1a <- na.omit(ex[, c("time", "var1", "group")])
sub1b <- na.omit(ex[, c("time", "var2", "group")])
I cannot attach my data frame, ex, because R says var1 and var2 are masked. Therefore, the only way I know to refer to the variables is using $. However, every function I create produces a wrong or null result. I first tried:
marg <- function(v1, v2, d, m) {
biprobs <- lapply(levels(v1), function(var) {
v2[ ] <- var
lapply(time, function(ti) {
d$time <- ti
predict(m, newdata = d, type = "response")
})
})
plotdat <- lapply(biprobs, function(X) {
temp <- t(sapply(X, function(x) {
c(M=mean(x), quantile(x, c(.25, .75)))
}))
temp <- as.data.frame(cbind(temp,time))
colnames(temp) <- c("PP", "Lower", "Upper", "Dayssince")
return(temp)
})
plotdat <- do.call(rbind, plotdat)
}
result1 <- marg(ex$var1, sub1a$var1, sub1a, m1a)
Although this creates a data frame, it produces the same predicted probabilities for each level of var1 (0 vs. 1) at a given time (1-500), which is not what I want. So then I tried:
marg <- function(v, d, m) {
biprobs <- lapply(levels(ex$v), function(var) {
d$v[ ] <- var
lapply(time, function(ti) {
d$time <- ti
predict(m, newdata = d, type = "response")
})
})
.....
}
result2 <- marg(var1,sub1a, m1a)
This produces a null result. I also tried, which produces a null result:
marg <- function(d1,v,d2,m) {
biprobs <- lapply(levels(d1$v), function(var) {
d2$v[ ] <- var
lapply(time, function(ti) {
d2$time <- ti
predict(m, newdata = d2, type = "response")
})
})
......
}
result3 <- marg(ex,var1,sub1a,m1a)
I also tried creating a new object to input directly into the function:
v1 <- ex$var1
marg <- function(d, m) {
biprobs <- lapply(levels(v1), function(var) {
.....
})
})
That also produces a null result. How do I refer to different variables in an unattached data frame?? The code works with direct inputs, so it's a matter of correctly defining the function arguments. I appreciate any help!

Related

How to have output from lm() include std. error and others without using summary() for stargazer

I'm fitting several linear models in r in the following way:
set.seed(12345)
n = 100
x1 = rnorm(n)
x2 = rnorm(n)+0.1
y = x + rnorm(n)
df <- data.frame(x1, x2, y)
x_str <- c("x1", "x1+x2")
regf_lm <- function(df,y_var, x_str ) {
frmla <- formula(paste0(y_var," ~ ", x_str ))
fit <- lm(frmla, data = df )
summary(fit) #fit
}
gbind_lm <- function(vv) {
n <- vv %>% length()
fits <- list()
coefs <- list()
ses <- list()
for (i in 1:n ) {
coefs[[i]] <- vv[[i]]$coefficients[,1]
ses[[i]] <- vv[[i]]$coefficients[,2]
fits[[i]] <- vv[[i]]
}
list("fits" = fits, "coefs" = coefs, "ses" = ses)
}
stargazer_lm <- function(mylist, fname, title_str,m_type = "html",...) {
stargazer(mylist$fits, coef = mylist$coefs,
se = mylist$ses,
type = m_type, title = title_str,
out = paste0("~/projects/outputs",fname), single.row = T ,...)
}
p_2 <- map(x_str,
~ regf_lm (df = df ,
y_var = "y", x_str = .))
m_all <- do.call(c, list(p_2)) %>% gbind_lm()
stargazer_lm(m_all,"name.html","My model", m_type = "html")
In regf_lm, if I use summary(fit) on the last line, I'm able to generate reg output with columns for estimated coefficients, std. error, etc. But Stargazer() does not work with summary(lm()) (returns error $ operator is invalid for atomic vectors). However, if I just use "fit" on the last line in regf_lm, the output shows only the estimated coefficients and not std error, R sq...and gbind_lm() won't work because I cannot extract ses or fit.
Any advice is greatly appreciated.

You can directly export model statistics in tidy format with the package broom
library(broom)
set.seed(12345)
n = 100
x1 = rnorm(n)
x2 = rnorm(n)+0.1
y = x1 + rnorm(n)
df <- data.frame(x1, x2, y)
x_str <- c("x1", "x1+x2")
regf_lm <- function(df,y_var, x_str ) {
frmla <- formula(paste0(y_var," ~ ", x_str ))
fit <- lm(frmla, data = df )
return(list(fit,select(broom::tidy(fit),std.error))) #fit
}
exm_model <- regf_lm(iris,'Sepal.Width','Sepal.Length')
stargazer(exm_model[[1]], coef = exm_model[[2]], title = 'x_model',
out ='abc', single.row = T)
This piece of code worked on my local with no problem, I think you can apply this in your workflow.

Loop a function in r to create a new table

I have a dataframe in r and want to perform the levene's/ variance test on multiple variables with two groups and save all results in a table. I have tried to do this using a for() loop and sapply() but I get neither working:
df <- data.frame(
x = rnorm(100, 0, 1),
y = rnorm(100, 50, 1),
z = rnorm(100, 70, 2),
group = rep(c(0,1), each = 50)
)
varlist <- c("x","y","z")
res.var <- character(length(varlist))
res.f <- numeric(length(varlist))
res.p <- numeric(length(varlist))
Option 1)
for(i in seq_along(varlist)) {
form <- substitute(i ~ group, list(i = as.name(varlist)))
result <- var.test(
formula = form,
data = df)
res.var[i] <- varlist[i]
res.f[i] <- result$estimate
res.p[i] <- result$p.value
}
Option 2:
sapply(varlist, function(x) {
form <- substitute(i ~ group, list(i = as.name(varlist)))
result <- var.test(
formula = form,
data = df)
res.var[i] <- varlist[i]
res.f[i] <- result$estimate
res.p[i] <- result$p.value
})
Maybe there's an easier way to that this. I'd be glad for any help ;o) Thank you in advance.

nls boot error must have positive length

I am getting the error below with nlsBoot() any idea what is wrong?
Error in apply(tabboot, 1, quantile, c(0.5, 0.025, 0.975)) :
dim(X) must have a positive length
set.seed(1)
x = 1:100
y = x^2+rnorm(100,50,500)
plot(x,y)
d = data.frame(x =x, y=y)
mymodel = nls(y~x^b,start= list(b=1),data = d)
mymodel
library(nlstools)
nlsBoot(mymodel, niter = 999)

Try to define the formula before applying the nls function, like this:
formula <- as.formula(y ~ x^b)
mymodel <- nls(formula,start= list(b=1),data = d)
added
Well, I've modified the code and now it can handle one parameter fit.
# My suggestion is to erase all the environment first:
rm(list = ls())
# Then we start again:
set.seed(1)
x = 1:100
y = x^2+rnorm(100,50,500)
plot(x,y)
d = data.frame(x =x, y=y)
mymodel = nls(y~x^b,start= list(b=1),data = d)
Here is the function that you have to use:
nlsboot_onepar <- function (nls, niter = 999)
{
if (!inherits(nls, "nls"))
stop("Use only with 'nls' objects")
data2 <- eval(nls$data, sys.frame(0))
fitted1 <- fitted(nls)
resid1 <- resid(nls)
var1 <- all.vars(formula(nls)[[2]])
l1 <- lapply(1:niter, function(i) {
data2[, var1] <- fitted1 + sample(scale(resid1, scale = FALSE),
replace = TRUE)
nls2 <- try(update(nls, start = as.list(coef(nls)),
data = data2), silent = TRUE)
if (inherits(nls2, "nls"))
return(list(coef = coef(nls2), rse = summary(nls2)$sigma))
})
if (sum(sapply(l1, is.null)) > niter/2)
stop(paste("Procedure aborted: the fit only converged in",
round(sum(sapply(l1, is.null))/niter), "% during bootstrapping"))
tabboot <- sapply(l1[!sapply(l1, is.null)], function(z) z$coef,simplify =
FALSE)
tabboot <- as.matrix(t(as.numeric(tabboot)))
rownames(tabboot) <- "b"
rseboot <- sapply(l1[!sapply(l1, is.null)], function(z) z$rse)
recapboot <- t(apply(tabboot, 1, quantile, c(0.5, 0.025,
0.975)))
colnames(recapboot) <- c("Median", "2.5%", "97.5%")
estiboot <- t(apply(tabboot, 1, function(z) c(mean(z), sd(z))))
colnames(estiboot) <- c("Estimate", "Std. error")
serr <- sum(sapply(l1, is.null))
if (serr > 0)
warning(paste("The fit did not converge", serr, "times during
bootstrapping"))
listboot <- list(coefboot = t(tabboot), rse = rseboot, bootCI = recapboot,
estiboot = estiboot)
class(listboot) <- "nlsBoot"
return(listboot)
}
And then we use it:
result <- nlsboot_onepar(mymodel, niter = 999)
If you want to plot the parameter distribution, you can do this:
graphics.off()
plot(density(as.vector(result$coefboot)))
# or
hist(as.vector(result$coefboot))
I hope that helps you.

How to remove correlated variables from GLM in R

I am trying to exclude correlated variables from GLModel. Firstly, I calculate correlation matrix. Afterwards, I would like to implement it into combn function in some way to exclude the variables (column headers) that are correlated. At this point I fail - I am not able to incorporate it in combn function so that it worked and correlated variables were excluded.
Here is the link for data I use:
https://drive.google.com/open?id=0B5IgiR_svnKcZkxHeTJXTm9jUjQ
Here is the code I am trying to make it work:
## rm(list = ls()) ## Edited out to prevent accidents
mod_data <- read.csv("mod_data.csv", header = T)
mod_headers <- names(mod_data[3:ncol(mod_data)-1])
CM = which(abs(cor(mod_data[,1:ncol(mod_data)-1])-diag(1,ncol(mod_data)-1)) > 0.5, arr.ind = T)
f <- function(){
null_model <- glm(newcol ~ 1, data=mod_data, family = binomial(link = "logit"), control = list(maxit = 50))
best_model <- null_model
best_aic <- AIC(null_model)
for(i in 1:length(mod_headers)){
tab <- combn(mod_headers,i)
for(j in 1:ncol(tab)){
tab_new <- c(tab[,j])
mod_tab_new <- c(tab_new, "newcol")
model <- glm(newcol ~., data=mod_data[c(mod_tab_new)], family = binomial(link = "logit"), control = list(maxit = 50000))
if(AIC(model) < best_aic){
best_model <- model
best_aic <- AIC(model)
}
}
}
return(best_model)
}
f()
Thanks for your tips!

Reproducing results from previous answer is not working due to using new version of lme4

I have tried to reproduce the results from the answers for this question “Estimating random effects and applying user defined correlation/covariance structure with R lme4 or nlme package “ https://stats.stackexchange.com/questions/18563/estimating-random-effects-and-applying-user-defined-correlation-covariance-struc
Aaron Rendahl's codes
library(pedigreemm)
relmatmm <- function (formula, data, family = NULL, REML = TRUE, relmat = list(),
control = list(), start = NULL, verbose = FALSE, subset,
weights, na.action, offset, contrasts = NULL, model = TRUE,
x = TRUE, ...)
{
mc <- match.call()
lmerc <- mc
lmerc[[1]] <- as.name("lmer")
lmerc$relmat <- NULL
if (!length(relmat))
return(eval.parent(lmerc))
stopifnot(is.list(relmat), length(names(relmat)) == length(relmat))
lmerc$doFit <- FALSE
lmf <- eval(lmerc, parent.frame())
relfac <- relmat
relnms <- names(relmat)
stopifnot(all(relnms %in% names(lmf$FL$fl)))
asgn <- attr(lmf$FL$fl, "assign")
for (i in seq_along(relmat)) {
tn <- which(match(relnms[i], names(lmf$FL$fl)) == asgn)
if (length(tn) > 1)
stop("a relationship matrix must be associated with only one random effects term")
Zt <- lmf$FL$trms[[tn]]$Zt
relmat[[i]] <- Matrix(relmat[[i]][rownames(Zt), rownames(Zt)],
sparse = TRUE)
relfac[[i]] <- chol(relmat[[i]])
lmf$FL$trms[[tn]]$Zt <- lmf$FL$trms[[tn]]$A <- relfac[[i]] %*% Zt
}
ans <- do.call(if (!is.null(lmf$glmFit))
lme4:::glmer_finalize
else lme4:::lmer_finalize, lmf)
ans <- new("pedigreemm", relfac = relfac, ans)
ans#call <- match.call()
ans
}
the original example
set.seed(1234)
mydata <- data.frame (gen = factor(rep(1:10, each = 10)),
repl = factor(rep(1:10, 10)),
yld = rnorm(10, 5, 0.5))
library(lme4)
covmat <- round(nearPD(matrix(runif(100, 0, 0.2), nrow = 10))$mat, 2)
diag(covmat) <- diag(covmat)/10+1
rownames(covmat) <- colnames(covmat) <- levels(mydata$gen)
m <- relmatmm(yld ~ (1|gen) + (1|repl), relmat=list(gen=covmat), data=mydata)
here is the error message
Error in lmf$FL : $ operator not defined for this S4 class
In addition: Warning message:
In checkArgs("lmer", doFit = FALSE) : extra argument(s) ‘doFit’ disregarded
I will appreciate any help ?
Thanks

This is a re-implementation of the previous code -- I have done some slight modifications, and I have not tested it in any way -- test yourself and/or use at your own risk.
First create a slightly more modularized function that constructs the deviance function and fits the model:
doFit <- function(lmod,lmm=TRUE) {
## see ?modular
if (lmm) {
devfun <- do.call(mkLmerDevfun, lmod)
opt <- optimizeLmer(devfun)
mkMerMod(environment(devfun), opt, lmod$reTrms, fr = lmod$fr)
} else {
devfun <- do.call(mkGlmerDevfun, lmod)
opt <- optimizeGlmer(devfun)
devfun <- updateGlmerDevfun(devfun, lmod$reTrms)
opt <- optimizeGlmer(devfun, stage=2)
mkMerMod(environment(devfun), opt, lmod$reTrms, fr = lmod$fr)
}
}
Now create a function to construct the object that doFit needs and modify it:
relmatmm <- function (formula, ..., lmm=TRUE, relmat = list()) {
ff <- if (lmm) lFormula(formula, ...) else glFormula(formula, ...)
stopifnot(is.list(relmat), length(names(relmat)) == length(relmat))
relnms <- names(relmat)
relfac <- relmat
flist <- ff$reTrms[["flist"]] ## list of factors
## random-effects design matrix components
Ztlist <- ff$reTrms[["Ztlist"]]
stopifnot(all(relnms %in% names(flist)))
asgn <- attr(flist, "assign")
for (i in seq_along(relmat)) {
tn <- which(match(relnms[i], names(flist)) == asgn)
if (length(tn) > 1)
stop("a relationship matrix must be",
" associated with only one random effects term")
zn <- rownames(Ztlist[[i]])
relmat[[i]] <- Matrix(relmat[[i]][zn,zn],sparse = TRUE)
relfac[[i]] <- chol(relmat[[i]])
Ztlist[[i]] <- relfac[[i]] %*% Ztlist[[i]]
}
ff$reTrms[["Ztlist"]] <- Ztlist
ff$reTrms[["Zt"]] <- do.call(rBind,Ztlist)
fit <- doFit(ff,lmm)
}
Example
set.seed(1234)
mydata <- data.frame (gen = factor(rep(1:10, each = 10)),
repl = factor(rep(1:10, 10)),
yld = rnorm(10, 5, 0.5))
library(lme4)
covmat <- round(nearPD(matrix(runif(100, 0, 0.2), nrow = 10))$mat, 2)
diag(covmat) <- diag(covmat)/10+1
rownames(covmat) <- colnames(covmat) <- levels(mydata$gen)
m <- relmatmm(yld ~ (1|gen) + (1|repl), relmat=list(gen=covmat),
data=mydata)
This runs -- I don't know if the output is correct. It also doesn't make the resulting object into a pedigreemm object ...