R: can `assign` be used for models? - r

Can the assign function in R be used to assign object names to models (e.g., gls models)? I'm guessing not b/c I keep getting a warning:
> Warning messages:
In assign(paste0(deparse(substitute(mod)), "_", i, j), update(mod, :
only the first element is used as variable name
As a result the objects are not created
Is there a way to do this?
Here is my fucntion code if it helps:
#Choose best corARMA structure for model of choice:
corARMA.chooser <- function(mod,min = 0,max = 3 ) {
#This function creates 1. object for each combo of ARMA {0:3} 2. AIC table comparing all of these models
mod <- get('mod')
aic.arma <- AIC(mod)
ps <- 0
qs <- 0
for(i in min:max) {
js <- if(i == 0) c(1:max) else c(min:max)
for(j in js) {
arma <- corARMA(p = i, q = j)
assign(paste0(deparse(substitute(mod)),'_',i,j), update(mod, .~., correlation = arma), envir = .GlobalEnv)
aic.arma <- c(aic.arma, AIC(get(paste0(deparse(substitute(mod)),'_',i,j))))
ps <- c(ps, i)
qs <- c(qs, i)
}
aic.arma.out <- data.frame(ps, qs, aic.arma)
aic.arma.out
}
}
Update:
I tried using the list approach, but I get the error:
Error in names(mod.list) <- c(names(mod.list), paste0(deparse(substitute(mod)), :
'names' attribute [1275] must be the same length as the vector [1]

EDIT: what actually tears your variable name apart is this line mod <- get('mod') where you overwrite your named instance of mod why do you actually do this? If change your function to this it behaves as I'd expect it to:
corARMA.chooser <- function(modIn,min = 0,max = 3 ) {
#This function creates 1. object for each combo of ARMA {0:3} 2. AIC table comparing all of these models mod <- get('modIn') aic.arma <- AIC(modIn) ps <- 0 qs <- 0 for(i in min:max) {
js <- if(i == 0) c(1:max) else c(min:max)
for(j in js) {
arma <- corARMA(p = i, q = j)
browser()
assign(paste0(deparse(substitute(modIn)),'_',i,j), update(mod, .~., correlation = arma), envir = .GlobalEnv)
aic.arma <- c(aic.arma, AIC(get(paste0(deparse(substitute(mod)),'_',i,j))))
ps <- c(ps, i)
qs <- c(qs, i)
}
aic.arma.out <- data.frame(ps, qs, aic.arma)
aic.arma.out
}
}
hope this is what you were trying to achieve.

Still not sure why the code works alone but not in the function, but it is clear that the deparse(substitute(mod)) is for some reason pulls mod apart to all of its parts first in the function, vs. simply creating a name of the object itself.
Here is my new code that works:
corARMA.chooser <- function(mod,p = 1,q = 0 ) {
#This function creates 1. object for each combo of ARMA {0:3} 2. AIC table comparing all of these models
mod.list <- NULL
nms <- NULL
aic.arma <- AIC(mod)
ps <- 0
qs <- 0
for(i in c(p)) {
js <- if(i == 0) c(q[q>0]) else c(q)
for(j in c(js)) {
arma <- corARMA(p = i, q = j)
mod.list <- c(mod.list, list(update(mod, .~., correlation = arma)))
names(mod.list) <- c(names(mod.list), paste0(deparse(substitute(mod)),'_',i,j))
aic.arma <- c(aic.arma, AIC(eval(parse(text=(paste0('mod.list$',deparse(substitute(mod)),'_',i,j))))))
ps <- c(ps, i)
qs <- c(qs, j)
}
}
assign(paste0(deparse(substitute(mod)),'_','ARMA'),mod.list, envir = .GlobalEnv)
aic.arma.out <- data.frame(p = ps, q = qs, AIC = aic.arma)
aic.arma.out
}

Related

Chi square matrix residuals

below is a function to extract p-values from multiple Chi-Square tests and display them as a matrix. I'm trying to do the same, but to extract residuals instead. Any help is appreciated.
Sample data:
df <- data.frame(first_column = c(rep("E1_C1",5), rep("E1_C2",3), rep("E2_C2",7),rep("E3_C3",5)),
second_column = c(rep("E1_C1",3), rep("E1_C2",10), rep("E2_C2",4),rep("E3_C3",3)),
third_column = c(rep("E1_C1",7), rep("E1_C2",4), rep("E2_C2",3),rep("E3_C3",6)),
fourth_column = c(rep("E1_C1",4), rep("E1_C2",6), rep("E2_C2",6),rep("E3_C3",4))
)
Chi-square matrix function for P-Values:
chisqmatrix <- function(x) {
names = colnames(x); num = length(names)
m = matrix(nrow=num,ncol=num,dimnames=list(names,names))
for (i in 1:(num-1)) {
for (j in (i+1):num) {
#browser()
m[j,i] = chisq.test(x[, i, drop = TRUE],x[, j, drop = TRUE])$p.value
}
}
return (m)
}
Generate Chi-Square p-value matrix
res <- chisqmatrix(df)
res[, -ncol(res)]
In your case, the returned residuals is a 4x4 matrix. Instead of using a matrix to take the results, the following solution uses a list instead. This way you can have matrices of different sizes.
With minimal changes from your original code:
chisqlist <- function(x) {
names = colnames(x); num = length(names)
m = list()
index = 1
for (i in 1:(num-1)) {
for (j in (i+1):num) {
#browser()
m[[index]] = chisq.test(x[, i, drop = TRUE],x[, j, drop = TRUE])$residuals
index=index+1
}
}
return (m)
}
Edit:
I do prefer # Onyambu's answer, which I didn't see. It would be faster than a nested for loop.
Simply change your function from requesting $p.value to requesting $residuals. This will provide (observed - expected) / sqrt(expected). If you desire standardized residuals request $stdres.
chisqmatrix <- function(x) {
names = colnames(x); num = length(names)
m = matrix(nrow=num,ncol=num,dimnames=list(names,names))
for (i in 1:(num-1)) {
for (j in (i+1):num) {
#browser()
m[j,i] = chisq.test(x[, i, drop = TRUE],x[, j, drop = TRUE])$residuals
}
}
return (m)
}

Add a progress bar to boot function in R

I am trying to add a progress bar to a bootstrap function in R.
I tried to make the example function as simple as possible (hence i'm using mean in this example).
library(boot)
v1 <- rnorm(1000)
rep_count = 1
m.boot <- function(data, indices) {
d <- data[indices]
setWinProgressBar(pb, rep_count)
rep_count <- rep_count + 1
Sys.sleep(0.01)
mean(d, na.rm = T)
}
tot_rep <- 200
pb <- winProgressBar(title = "Bootstrap in progress", label = "",
min = 0, max = tot_rep, initial = 0, width = 300)
b <- boot(v1, m.boot, R = tot_rep)
close(pb)
The bootstrap functions properly, but the problem is that the value of rep_count does not increase in the loop and the progress bar stays frozen during the process.
If I check the value of rep_count after the bootstrap is complete, it is still 1.
What am i doing wrong? maybe the boot function does not simply insert the m.boot function in a loop and so the variables in it are not increased?
Thank you.
You could use the package progress as below:
library(boot)
library(progress)
v1 <- rnorm(1000)
#add progress bar as parameter to function
m.boot <- function(data, indices, prog) {
#display progress with each run of the function
prog$tick()
d <- data[indices]
Sys.sleep(0.01)
mean(d, na.rm = T)
}
tot_rep <- 200
#initialize progress bar object
pb <- progress_bar$new(total = tot_rep + 1)
#perform bootstrap
boot(data = v1, statistic = m.boot, R = tot_rep, prog = pb)
I haven't quite figured out yet why it's necessary to set the number of iterations for progress_bar to be +1 the total bootstrap replicates (parameter R), but this is what was necessary in my own code, otherwise it throws an error. It seems like the bootstrap function is run one more time than you specify in parameter R, so if the progress bar is set to only run R times, it thinks the job is finished before it really is.
The pbapply package was designed to work with vectorized functions. There are 2 ways to achieve that in the context of this question: (1) write a wrapper as was suggested, which will not produce the same object of class 'boot'; (2) alternatively, the line lapply(seq_len(RR), fn) can be written as pblapply(seq_len(RR), fn). Option 2 can happen either by locally copying/updating the boot function as shown in the example below, or asking the package maintainer, Brian Ripley, if he would consider adding a progress bar directly or through pbapply as dependency.
My solution (changes indicated by comments):
library(boot)
library(pbapply)
boot2 <- function (data, statistic, R, sim = "ordinary", stype = c("i",
"f", "w"), strata = rep(1, n), L = NULL, m = 0, weights = NULL,
ran.gen = function(d, p) d, mle = NULL, simple = FALSE, ...,
parallel = c("no", "multicore", "snow"), ncpus = getOption("boot.ncpus",
1L), cl = NULL)
{
call <- match.call()
stype <- match.arg(stype)
if (missing(parallel))
parallel <- getOption("boot.parallel", "no")
parallel <- match.arg(parallel)
have_mc <- have_snow <- FALSE
if (parallel != "no" && ncpus > 1L) {
if (parallel == "multicore")
have_mc <- .Platform$OS.type != "windows"
else if (parallel == "snow")
have_snow <- TRUE
if (!have_mc && !have_snow)
ncpus <- 1L
loadNamespace("parallel")
}
if (simple && (sim != "ordinary" || stype != "i" || sum(m))) {
warning("'simple=TRUE' is only valid for 'sim=\"ordinary\", stype=\"i\", n=0', so ignored")
simple <- FALSE
}
if (!exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE))
runif(1)
seed <- get(".Random.seed", envir = .GlobalEnv, inherits = FALSE)
n <- NROW(data)
if ((n == 0) || is.null(n))
stop("no data in call to 'boot'")
temp.str <- strata
strata <- tapply(seq_len(n), as.numeric(strata))
t0 <- if (sim != "parametric") {
if ((sim == "antithetic") && is.null(L))
L <- empinf(data = data, statistic = statistic, stype = stype,
strata = strata, ...)
if (sim != "ordinary")
m <- 0
else if (any(m < 0))
stop("negative value of 'm' supplied")
if ((length(m) != 1L) && (length(m) != length(table(strata))))
stop("length of 'm' incompatible with 'strata'")
if ((sim == "ordinary") || (sim == "balanced")) {
if (isMatrix(weights) && (nrow(weights) != length(R)))
stop("dimensions of 'R' and 'weights' do not match")
}
else weights <- NULL
if (!is.null(weights))
weights <- t(apply(matrix(weights, n, length(R),
byrow = TRUE), 2L, normalize, strata))
if (!simple)
i <- index.array(n, R, sim, strata, m, L, weights)
original <- if (stype == "f")
rep(1, n)
else if (stype == "w") {
ns <- tabulate(strata)[strata]
1/ns
}
else seq_len(n)
t0 <- if (sum(m) > 0L)
statistic(data, original, rep(1, sum(m)), ...)
else statistic(data, original, ...)
rm(original)
t0
}
else statistic(data, ...)
pred.i <- NULL
fn <- if (sim == "parametric") {
ran.gen
data
mle
function(r) {
dd <- ran.gen(data, mle)
statistic(dd, ...)
}
}
else {
if (!simple && ncol(i) > n) {
pred.i <- as.matrix(i[, (n + 1L):ncol(i)])
i <- i[, seq_len(n)]
}
if (stype %in% c("f", "w")) {
f <- freq.array(i)
rm(i)
if (stype == "w")
f <- f/ns
if (sum(m) == 0L)
function(r) statistic(data, f[r, ], ...)
else function(r) statistic(data, f[r, ], pred.i[r,
], ...)
}
else if (sum(m) > 0L)
function(r) statistic(data, i[r, ], pred.i[r, ],
...)
else if (simple)
function(r) statistic(data, index.array(n, 1, sim,
strata, m, L, weights), ...)
else function(r) statistic(data, i[r, ], ...)
}
RR <- sum(R)
res <- if (ncpus > 1L && (have_mc || have_snow)) {
if (have_mc) {
parallel::mclapply(seq_len(RR), fn, mc.cores = ncpus)
}
else if (have_snow) {
list(...)
if (is.null(cl)) {
cl <- parallel::makePSOCKcluster(rep("localhost",
ncpus))
if (RNGkind()[1L] == "L'Ecuyer-CMRG")
parallel::clusterSetRNGStream(cl)
res <- parallel::parLapply(cl, seq_len(RR), fn)
parallel::stopCluster(cl)
res
}
else parallel::parLapply(cl, seq_len(RR), fn)
}
}
else pblapply(seq_len(RR), fn) #### changed !!!
t.star <- matrix(, RR, length(t0))
for (r in seq_len(RR)) t.star[r, ] <- res[[r]]
if (is.null(weights))
weights <- 1/tabulate(strata)[strata]
boot.return(sim, t0, t.star, temp.str, R, data, statistic,
stype, call, seed, L, m, pred.i, weights, ran.gen, mle)
}
## Functions not exported by boot
isMatrix <- boot:::isMatrix
index.array <- boot:::index.array
boot.return <- boot:::boot.return
## Now the example
m.boot <- function(data, indices) {
d <- data[indices]
mean(d, na.rm = T)
}
tot_rep <- 200
v1 <- rnorm(1000)
b <- boot2(v1, m.boot, R = tot_rep)
The increased rep_count is a local variable and lost after each function call. In the next iteration the function gets rep_count from the global environment again, i.e., its value is 1.
You can use <<-:
rep_count <<- rep_count + 1
This assigns to the rep_count first found on the search path outside the function. Of course, using <<- is usually not recommended because side effects of functions should be avoided, but here you have a legitimate use case. However, you should probably wrap the whole thing in a function to avoid a side effect on the global environment.
There might be better solutions ...
I think i found a possible solution. This merges the answer of #Roland with the convenience of the pbapply package, using its functions startpb(), closepb(), etc..
library(boot)
library(pbapply)
v1 <- rnorm(1000)
rep_count = 1
tot_rep = 200
m.boot <- function(data, indices) {
d <- data[indices]
setpb(pb, rep_count)
rep_count <<- rep_count + 1
Sys.sleep(0.01) #Just to slow down the process
mean(d, na.rm = T)
}
pb <- startpb(min = 0, max = tot_rep)
b <- boot(v1, m.boot, R = tot_rep)
closepb(pb)
rep_count = 1
As previously suggested, wrapping everything in a function avoids messing with the rep_count variable.
The progress bar from the package dplyr works well:
library(dplyr)
library(boot)
v1 <- rnorm(1000)
m.boot <- function(data, indices) {
d <- data[indices]
p$tick()$print() # update progress bar
Sys.sleep(0.01)
mean(d, na.rm = T)
}
tot_rep <- 200
p <- progress_estimated(tot_rep+1) # init progress bar
b <- boot(v1, m.boot, R = tot_rep)
You can use the package pbapply
library(boot)
library(pbapply)
v1 <- rnorm(1000)
rep_count = 1
# your m.boot function ....
m.boot <- function(data, indices) {
d <- data[indices]
mean(d, na.rm = T)
}
# ... wraped in `bootfunc`
bootfunc <- function(x) { boot(x, m.boot, R = 200) }
# apply function to v1 , returning progress bar
pblapply(v1, bootfunc)
# > b <- pblapply(v1, bootfunc)
# > |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% Elapsed time: 02s

R rfe feature selection caret

I am using R and the caret package for a classification task. For feature elimination I am using rfe, which has different options, among them, what is the metric that I want to maximize/minimize.
The problem is that rfe accepts metrics such as RMSE, kappa, and I want to use a different metric to maximize, in mi case I want to maximize ScoreQuadraticWeightedKappa from the Metrics library, but I don't know how to do that.
I have the following code:
control <- rfeControl(functions = rfFuncs, method="cv", number=2)
results <- rfe(dataset[, -59], dataset[, 59],
sizes = c(1:58), rfeControl = control)
How do I edit it, for rfe to maximize ScoreQuadraticWeightedKappa?
You need to modify the postResample function, or create your own function that's similar, and then insert it into rfFuncs$summary. The default postResample function here below:
> postResample
function (pred, obs)
{
isNA <- is.na(pred)
pred <- pred[!isNA]
obs <- obs[!isNA]
if (!is.factor(obs) & is.numeric(obs)) {
if (length(obs) + length(pred) == 0) {
out <- rep(NA, 2)
}
else {
if (length(unique(pred)) < 2 || length(unique(obs)) <
2) {
resamplCor <- NA
}
else {
resamplCor <- try(cor(pred, obs, use = "pairwise.complete.obs"),
silent = TRUE)
if (class(resamplCor) == "try-error")
resamplCor <- NA
}
mse <- mean((pred - obs)^2)
n <- length(obs)
out <- c(sqrt(mse), resamplCor^2)
}
names(out) <- c("RMSE", "Rsquared")
}
else {
if (length(obs) + length(pred) == 0) {
out <- rep(NA, 2)
}
else {
pred <- factor(pred, levels = levels(obs))
requireNamespaceQuietStop("e1071")
out <- unlist(e1071::classAgreement(table(obs, pred)))[c("diag",
"kappa")]
}
names(out) <- c("Accuracy", "Kappa")
}
if (any(is.nan(out)))
out[is.nan(out)] <- NA
out
}
More specifically, since you are doing classification, you will need to modify the portion of postResample that says:
else {
if (length(obs) + length(pred) == 0) {
out <- rep(NA, 2)
}
else {
pred <- factor(pred, levels = levels(obs))
requireNamespaceQuietStop("e1071")
out <- unlist(e1071::classAgreement(table(obs, pred)))[c("diag",
"kappa")]
}
names(out) <- c("Accuracy", "Kappa")
}
After you've edited postResample, or created your own equivalent function, you can run:
rfFuncs$summary <- function (data, lev = NULL, model = NULL) {
if (is.character(data$obs))
data$obs <- factor(data$obs, levels = lev)
postResample(data[, "pred"], data[, "obs"])
}
Just make sure postResample has been edited or replace it with the name of your equivalent function.

How to extract the p.value and estimate from cor.test()?

I perform cor.test for a dataset in a for loop, but I don't know how to extract the information like estimate and tau from my test.
Before performing for loop in the dataset, The cor.test() function returns as follows:
cor.test(armpit$Corynebacterium.1, armpit$Staphylococcus.1, alterantive="two-sided", method="kendall", exact=FALSE, continuity=TRUE)
return result
Here is my code for performing for loop. Now I want to extract estimate and tau from my test.
for (i in 1:8) {
for (j in 1:8) {
if (j != i)
cor.test( as.numeric(unlist(armpit[i])),
as.numeric(unlist(armpit[j])), alterantive="two-sided",
method="kendall", exact=FALSE, continuity=TRUE)
}
}
I have check the similar question from
similar question
Then I change my code as:
estimates = numeric(50)
pvalues = numeric(50)
for (i in 1:8) {
for (j in 1:8) {
if (j != i)
cor.test( as.numeric(unlist(armpit[i])),
as.numeric(unlist(armpit[j])), alterantive="two-sided",
method="kendall", exact=FALSE, continuity=TRUE)
estimates[i] = cor.test$estimate
pvalues[i]= cor.test$p-value
}
}
But it returns:
Error in cor.test$estimate : object of type 'closure' is not subsettable
Could anyone offer me some help about how to extract estimate and tau value from cor.test() function in a for loop? Thanks in advance.
cor.test returns a list. You can create an object to capture this list:
cor_test <- cor.test( as.numeric(unlist(armpit[i])), as.numeric(unlist(armpit[j])), alterantive="two-sided", method="kendall", exact=FALSE, continuity=TRUE)
Then use cor_test afterwards with $ to access each element of the list:
estimates[i] = cor_test$estimate
pvalues[i]= cor_test$p.value # note the ., not the -
The original error is pretty arcane, so it's understandable you were confused about this. You wrote cor.test$estimate, which asks R to access the estimate component of the cor.test function, not the result of the test.
estimates = numeric(50)
pvalues = numeric(50)
for (i in 1:8) {
for (j in 1:8) {
if (j != i)
cor_test <-
cor.test( as.numeric(unlist(armpit[i])),
as.numeric(unlist(armpit[j])), alterantive="two-sided",
method="kendall", exact=FALSE, continuity=TRUE)
estimates[i] = cor_test$estimate
pvalues[i]= cor_test$p.value
}
}
Alright, found it, we should have seen it earlier. The if (j != i) statement needs to have brackets around everything that should be done if the statement is true. With the particular formatting you had, R was not parsing it correctly. I couldn't get your data, so I made some up (which will test random rows against random columns). This works:
M <- matrix(rnorm(8*8), ncol = 8) # made up test data
estimates = numeric(50)
pvalues = numeric(50)
for (i in 1:8) {
for (j in 1:8) {
if (j != i) { # need this bracket
cor_test <- cor.test(M[i,], M[,j],
alternative="two.sided",
method="kendall", exact=FALSE, continuity=TRUE)
estimates[i] = cor_test$estimate
pvalues[i]= cor_test$p.value
} # and this bracket
}
}
estimates
pvalues
EDIT: alternative version to store all results in a data frame.
M <- matrix(rnorm(8*8), ncol = 8) # made up test data
ans <- data.frame(i = rep(NA, 64), j = rep(NA, 64), estimate = rep(NA, 64), pvalue = rep(NA, 64))
cnt <- 1
for (i in 1:8) {
for (j in 1:8) {
if (j != i) {
cor_test <- cor.test(M[i,], M[,j], alternative="two.sided", method="kendall", exact=FALSE, continuity=TRUE)
ans[cnt,1] <- i
ans[cnt,2] <- j
ans[cnt,3] <- cor_test$estimate
ans[cnt,4] <- cor_test$p.value
cnt <- cnt + 1
}
}
}
ans <- na.omit(ans)

error: replacement has 1 row, data has 0

corr <- function(directory, threshold = 0) {
setwd("c:/users/hp1/desktop")
files_full <- list.files(directory,full.names = TRUE)
cr <- data.frame()
j = 1
for(i in 1:332)
{
y <- read.csv(files_full[i])
z <- y[complete.cases(y),]
if (nrow(z) > threshold){
cr[j] <- cor(z[,'sulfate'], z[,'nitrate'], method = "pearson")
j = j+1
}
}
cr
}
it's showing the following error:
Error in [<-.data.frame(*tmp*, j, value = -0.222552560758546) :
replacement has 1 row, data has 0
I was expecting as j increments, values would get added to the cr dtaframe. however that is not happening.
please suggest necessary changes
You could try something like this. If you provide a reproducible example I can show you how to clean the result. sapply will try to simplify the result, but you can stop it by specifying simplify = FALSE and remove unwanted list elements.
setwd("c:/users/hp1/desktop") # I would use this outside a function
corr <- function(directory, threshold = 0) {
files_full <- list.files(directory, full.names = TRUE)
sapply(files_full, FUN = function(x) {
y <- read.csv(x)
z <- y[complete.cases(y),]
if (nrow(z) > threshold){
out <- cor(z[,'sulfate'], z[,'nitrate'], method = "pearson")
} else {
return(NA) # or some other way you want to handle the result
}
})
}

Resources