Loop through (subsets) using jags - r

I have a big dataframe with 10000 rows and 12 columns (discountdataset).
The columns contain different variables. The first 210 rows represents subject 1 (there is also a column with "subject1"), the next 210 rows represent subject 2, and so on.
I want to use jags and a loop function to loop through all 52 subjects in the dataframe, and assign a function to each of them. My code looks like this:
#subsetting the dataframe by the variable subjectid
subsetdiscount <- split(discountdataset, as.factor(discountdataset$subjectid))
Here my plan is to loop and assign the following jags function to all subjects in the subset), but, it doesn't work. I think my mistake is that the variables "nt", "Choice" that I want to pass on to jags are not defined right, or, are not updated.
library(rjags)
for (i in 1:length(subsetdiscount))
{
nt <- nrow (subsetdiscount)
Choice <- subsetdiscount$choice
amountSS <- subsetdiscount$val_basic
amountLL <- subsetdiscount$val_d
delayDIFF <- subsetdiscount$delay
con <- subsetdiscount$condition
data <- list("nt", "Choice", "amountSS", "amountLL", "delayDIFF", "con") # to be passed on to JAGS
myinits <- list(
list(k = (c(0.01, 0.01))),
list(temp = (c(6, 6))))
parameters <- c("k", "temp")
samples <- jags(data, inits=myinits, parameters,
model.file ="singlesubmodel_Ben_roundedchoice.txt", n.chains=2, n.iter=20000,
n.burnin=1, n.thin=1, DIC=T)

Try:
library(rjags)
library(R2jags)
subsetdiscount <- split(discountdataset, as.factor(discountdataset$subjectid))
output_models <- lapply(subsetdiscount, function(x) {
nt <- nrow(x)
Choice <- x$choice
amountSS <- x$val_basic
amountLL <- x$val_d
delayDIFF <- x$delay
con <- x$condition
data <- list("nt", "Choice", "amountSS", "amountLL", "delayDIFF", "con") # to be passed on to JAGS
myinits <- list(list(k = (c(0.01, 0.01))),
list(temp = (c(6, 6))))
parameters <- c("k", "temp")
samples <- jags(data, inits=myinits, parameters,
model.file ="singlesubmodel_Ben_roundedchoice.txt",
n.chains=2, n.iter=20000,
n.burnin=1, n.thin=1, DIC=T)
return(samples)
})
output_models should be a list containing outputs for each of the factors you split main dataset by.
Please note that it is quite hard to test this without any provided data. So, if this fails to work, you may want to provide some data for testing.
I hope it helps.

Related

Expand for-loop to accommodate list in R?

I've recently been interested in trying to develop a for-loop that would be able to run multiple generalized additive models and then produce results in a table that ranks them based on AIC, p-value of each smooth in the model, deviance explained of the overall model, etc.
I found this related question in stack overflow which is basically what I want and was able to run this well for gam() instead of gamm(), however I want to expand this to include multiple independent variables in the model, not just 1.
Ideally, the models would run all possible combinations of independent variables against the dependent variable, and it would test combinations anywhere from 1 independent variable in the model, up to all of the possible covariates in "d_pred" in the model.
I have attempted to do this so far by starting out small and finding all possible combinations of 2 independent variables (df_combinations2), which results in a list of data frames. Then I adjusted the rest of the code to run the for loop such that each iteration will run a different combination of the two variables:
library(mgcv)
## Example data
set.seed(0)
dat <- gamSim(1,n=200,scale=2)
set.seed(1)
dat2 <- gamSim(1,n=200,scale=2)
names(dat2)[1:5] <- c("y1", paste0("x", 4:7))
d <- cbind(dat[, 1:5], dat2[, 1:5])
d_resp <- d[ c("y", "y1")]
d_pred <- d[, !(colnames(d) %in% c("y", "y1"))]
df_combinations2 <- lapply(1:(ncol(combn(1:ncol(d_pred), m = 2))),
function(y) d_pred[, combn(1:ncol(d_pred), m = 2)[,y]])
## create a "matrix" list of dimensions i x j
results_m2 <-lapply(1:length(df_combinations2), matrix, data= NA, nrow=ncol(d_resp), ncol=2)
## for-loop
for(k in 1:length(df_combinations2)){
for(i in 1:ncol(d_resp)){
for(j in 1:ncol(df_combinations2[[k]])){
results_m2[i, j][[1]] <- gam(d_resp[, i] ~ s(df_combinations2[[k]][,1])+s(df_combinations2[[k]][,2]))
}
}}
However, after running the for-loop I get the error "Error in all.vars1(gp$fake.formula[-2]) : can't handle [[ in formula".
Anyone know why I am getting this error/ how to fix it?
Any insight is much appreciated. Thanks!
Personally, I would create a data.table() containing all combinations of target variables and combinations of predictors and loop through all rows. See below.
library(data.table)
library(dplyr)
# Example data
set.seed(0)
dat <- gamSim(1,n=200,scale=2)
set.seed(1)
dat2 <- gamSim(1,n=200,scale=2)
names(dat2)[1:5] <- c("y1", paste0("x", 4:7))
d <- cbind(dat[, 1:5], dat2[, 1:5])
#select names of targets and predictors
targets <- c("y", "y1")
predictors <- colnames(d)[!colnames(d) %in% targets]
#create all combinations of predictors
predictor_combinations <- lapply(1:length(predictors), FUN = function(x){
#create combination
combination <- combn(predictors, m = x) |> as.data.table()
#add s() to all for gam
combination <- sapply(combination, FUN = function(y) paste0("s(", y, ")")) |> as.data.table()
#collapse
combination <- summarize_all(combination, .funs = paste0, collapse = "+")
#unlist
combination <- unlist(combination)
#remove names
names(combination) <- NULL
#return
return(combination)
})
#merge combinations of predictors as vector
predictor_combinations <- do.call(c, predictor_combinations)
#create folder to save results to
if(!dir.exists("dev")){
dir.create("dev")
}
if(!dir.exists("dev/models")){
dir.create("dev/models")
}
#create and save hypergrid (all combinations of targets and predictors combinations)
if(!file.exists("dev/hypergrid.csv")){
#create hypergrid and save to dev
hypergrid <- expand.grid(target = targets, predictors = predictor_combinations) |> as.data.table()
#add identifier
hypergrid[, model := paste0("model", 1:nrow(hypergrid))]
#save to dev
fwrite(hypergrid, file = "dev/hypergrid.csv")
} else{
#if file exists read
hypergrid <- fread("dev/hypergrid.csv")
}
#loop through hypergrid, create GAM models
#progressbar
pb <- txtProgressBar(min = 1, max = nrow(hypergrid), style = 3)
for(i in 1:nrow(hypergrid)){
#update progressbar
setTxtProgressBar(pb, i)
#select target
target <- hypergrid[i,]$target
#select predictors
predictors <- hypergrid[i,]$predictors
#create formula
gam.formula <- as.formula(paste0(target, "~", predictors))
#run gam
gam.model <- gam(gam.formula, data = d)
#save gam model do dev/model
saveRDS(gam.model, file = paste0("dev/models/", hypergrid[i,]$model, ".RDS"))
}
#example where you extract model performances
for(i in 1:nrow(hypergrid)){
#read the right model
rel.model <- readRDS(paste0("dev/models/", hypergrid[i,]$model, ".RDS"))
#extract model performance, add to hypergrid
hypergrid[i, R2 := summary(rel.model)[["r.sq"]]]
}
#arrange hypergrid on target and r2
hypergrid <- dplyr::arrange(hypergrid, hypergrid$target, desc(hypergrid$R2))
Which would give
head(hypergrid)
target predictors model R2
1: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5) model319 0.6957242
2: y s(x0)+s(x1)+s(x2)+s(x3)+s(x4)+s(x5) model423 0.6953753
3: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5)+s(x7) model437 0.6942054
4: y s(x0)+s(x1)+s(x2)+s(x5) model175 0.6941025
5: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5)+s(x6) model435 0.6940569
6: y s(x0)+s(x1)+s(x2)+s(x3)+s(x4)+s(x5)+s(x7) model481 0.6939756
All models are saved to a folder with an identifier (for if you want to use the model or extract more information from the model).
Notably, p-hacking comes to mind using this appraoch and I would be careful by conducting your analysis like this.

Apply Machine learning process simultaneously on multiple datasets with R

I want to delete correlated variables and perform lasso regression on multiple datasets. So i divided my data in two lists: first list contains variables and the second contains targets.
I want also to divide my data into train and test before applying Lasso, making predictions and store tthe results in a final dataframe.
The main steps:
1- Correlation: (delete correlated variables)
2- divide data inton train and test
3- Perform LASSO
4- Make predictions
5- store predictions in a dataframe with their labels
Thanks!
set.seed(99)
library("caret")
# Create data frames
H <- data.frame(replicate(10,sample(0:20,10,rep=TRUE)))
C <- data.frame(replicate(5,sample(0:100,10,rep=FALSE)))
R <- data.frame(replicate(7,sample(0:30,10,rep=TRUE)))
E <- data.frame(replicate(4,sample(0:40,10,rep=FALSE)))
# Create target variables
Y_H <- data.frame(replicate(1,sample(20:35, 10, rep = TRUE)))
Y_H
names(Y_H)<-names(Y_H)[names(Y_H)=="replicate.1..sample.20.35..10..rep...TRUE.."] <-"label_1"
Y_C <- data.frame(replicate(1,sample(15:65, 10, rep = TRUE)))
names(Y_C) <- names(Y_C)[names(Y_C)=="replicate.1..sample.15.65..10..rep...TRUE.."] <-"label_2"
Y_R <- data.frame(replicate(1,sample(25:45, 10, rep = TRUE)))
names(Y_R) <-names(Y_R)[names(Y_R) == "replicate.1..sample.25.45..10..rep...TRUE.."] <- "label_3"
Y_E <- data.frame(replicate(1,sample(21:80, 10, rep = TRUE)))
names(Y_E) <-names(Y_E)[names(Y_E) == "replicate.1..sample.15.65..10..rep...TRUE.."] <- "label_4"
# Store observations and targets in lists
inputs <- list(H, C, R, E)
targets <- list(Y_H, Y_C, Y_R, Y_E)
# Perform correlation
outputs <- list()
for(df in inputs){
data.cor <- cor(df)
high.cor <- findCorrelation(data.cor, cutoff=0.40)
outputs <- append(outputs, list(df[,-high.cor]))
}
library("glmnet")
lasso_cv <- list()
lasso_model <- list()
for(i in outputs){
for(j in targets){
lasso_cv[i] <- cv.glmnet(as.matrix(outputs[[i]]), as.matrix(targets[[j]]), standardize = TRUE, type.measure="mse", alpha = 1,nfolds = 3)
lasso_model[i] <- glmnet(as.matrix(outputs[[i]]), as.matrix(targets[[j]]),lambda = lasso_cv[i]$lambda_cv, alpha = 1, standardize = TRUE)
}
}
When i run my for loop, it gives this error:
Error in h(simpleError(msg, call)) :
erreur d'�valuation de l'argument 'x' lors de la s�lection d'une
m�thode pour la fonction 'as.matrix' : invalid subscript type 'list'
It seems to me that the error is in the range of the last for loop.
You wrote for(i in outputs), and then used as.matrix(outputs[[i]]). So, at the first iteration you are basically calling as.matrix(outputs[[outputs[[1]]), which does not make sense. Similar reasoning applies to for(j in targets).
Try to replace the code I quoted by for(i in seq_len(length(outputs))) and for(i in seq_len(length(targets))). That should work. In this way, at the first iteration as.matrix(outputs[[i]]) translates to as.matrix(outputs[[1]]), and similarly for the other line, which it seems to me is the idea you were looking for.
Ps I am not sure about your code. If we check, lasso_cv[i]$lambda_cv returns NULL for every i. Maybe you can check into it.

How to store values from loop to a dataframe in R?

I am new to R and programming, I want to store values from loop to a data frame in R. I want ker, cValues, accuracyValues values to be stored a data frame from bellow code. I am not able to achieve this, Data Frame is only saving last value not all the values.
Can you please help me with this please.
# Define a vector which has different kernel methods
kerna <- c("rbfdot","polydot","vanilladot","tanhdot","laplacedot",
"besseldot","anovadot","splinedot")
# Define a for loop to calculate accuracy for different values of C and kernel
for (ker in kerna){
cValues <- c()
accuracyValues <- c()
for (c in 1:100) {
model <- ksvm(V11~V1+V2+V3+V4+V5+V6+V7+V8+V9+V10,
data = credit_card_data,
type ="C-svc",
kernel = ker,
C=c,
scaled =TRUE)
pred <- predict(model,credit_card_data[,1:10])
#pred
accuracy <- sum(pred== credit_card_data$V11)/nrow(credit_card_data)
cValues[c] <- c;
accuracyValues[c] <- accuracy;
}
for(i in 1:100) {
print(paste("kernal:",ker, "c=",cValues[i],"accuracy=",accuracyValues[i]))
}
}
Starting from your base code, set up the structure of the output data frame. Then, loop through and fill in the accuracy values on each iteration. This method also "flattens" the nested loop and gets rid of your c variable which conflicts with the built-in c() function.
kerna <- c("rbfdot","polydot","vanilladot","tanhdot","laplacedot",
"besseldot","anovadot","splinedot")
# Create dataframe to store output data
df <- data.frame(kerna = rep(kerna, each = 100),
cValues = rep(1:100, times = length(kerna)),
accuracyValues = NA,
stringsAsFactors = F)
# Define a for loop to calculate accuracy for different values of C and kernel
for (i in 1:nrow(df)){
ker <- df$kerna[i]
j <- df$cValues[i]
model <- ksvm(V11~V1+V2+V3+V4+V5+V6+V7+V8+V9+V10,
data = credit_card_data,
type ="C-svc",
kernel = ker,
C=j,
scaled =TRUE)
pred <- predict(model,credit_card_data[,1:10])
accuracy <- sum(pred== credit_card_data$V11)/nrow(credit_card_data)
# Insert accuracy into df$accuracyValues
df$accuracyValues[i] <- accuracy;
}
Consider Map to build a list of data frames from each pairing of ker and cValues (1:100) generated from expand.grid and row bind all elements together.
k_c_pairs_df <- expand.grid(kerna=kerna, c_value=1:100, stringsAsFactors = FALSE)
model_fct <- function(ker, c) {
model <- ksvm(V11~V1+V2+V3+V4+V5+V6+V7+V8+V9+V10,
data = credit_card_data,
type ="C-svc",
kernel = ker,
C=c,
scaled =TRUE)
pred <- predict(model,credit_card_data[,1:10])
accuracy <- sum(pred== credit_card_data$V11)/nrow(credit_card_data)
print(paste("kernal:",ker, "c=",cValues[i],"accuracy=",accuracyValues[i]))
return(data.frame(kernel = ker, cValues = c, accuracyValues = accuracy))
}
df_list <- Map(model_fct, k_c_pairs_df$ker, k_c_pairs_df$c_value)
final_df <- do.call(rbind, df_list)

Combining multiple function arguments inside list2serv(lapply(),)

I'm working with ten training datasets, train1 through train10, and would like to repeat the following statements for 1 through 10 with a single block of code:
train_y_1 <- c(train1$y)
train1$y <-NULL
train_x_1 <- data.matrix(train1)
olsfit_1 <- cv.glmnet(y=train_y_1, x=train_x_1, alpha=1, family="gaussian")
I've read in the forums that lapply() is preferable to for loops. My code:
# Create empty data frames and list (to be populated with values in main program)
list2env(setNames(lapply(1:10, function(i) data.frame()), paste0('train_y_', 1:10)), envir=.GlobalEnv)
list2env(setNames(lapply(1:10, function(i) data.frame()), paste0('train_x_', 1:10)), envir=.GlobalEnv)
list2env(setNames(lapply(1:10, function(i) list()), paste0('lasso_', 1:10)), envir=.GlobalEnv)
# Create y and x input matrices and run ten lasso regressions
list2env(lapply(mget(paste0('train', 1:10)), mget(paste0('train_y_', 1:10)), mget(paste0('train_x_', 1:10)), mget(paste0('lasso_', 1:10)),
function(a,b,c,d)
{
b <- c(a$y);
a$y <- NULL;
c <- data.matrix(a);
d <- cv.glmnet(y=b, x=c, alpha=1, family="gaussian");
}), envir=.GlobalEnv)
which produces the error message:
Error in match.fun(FUN) :
'mget(paste0("train_y_", 1:10))' is not a function, character or symbol
So it looks like R is confused by the four mget() functions which I intended to be reading in values for the a,b,c,d arguments, but I'm not sure how to proceed next.
Any suggestions?
You want to keep all your data in lists whenever possible, avoiding polluting the global environment with a bunch of variables. This isn't tested, and train is missing, but should be a similar list of your train data. Then, you could do something like,
trainy <- setNames(lapply(1:10, function(i) data.frame()), paste0('train_y_', 1:10))
trainx <- setNames(lapply(1:10, function(i) data.frame()), paste0('train_x_', 1:10))
lasso <- setNames(lapply(1:10, function(i) list()), paste0('lasso_', 1:10))
f <- function(a,b,c,d) {
b <- c(a$y);
a$y <- NULL;
c <- data.matrix(a);
d <- cv.glmnet(y=b, x=c, alpha=1, family="gaussian");
}
mapply(f, train, trainy, trainx, lasso, SIMPLIFY=F)
Although, since your lists are just initializing variables, you probably just want to loop (apply) over a list of your training data,
lapply(train, function(x) {
... # the statements you want to repeat
list(...) # return a list of the three data.frames
})
We can achieve this with the following code.
# Load libraries
library(dplyr);library(glmnet)
# Gather all the variables in global into a list
fit = mget(paste0("train", 1:10), envir = .GlobalEnv) %>%
# Pipe each element of the list into `cv.glmnet` function
lapply(function(dat) {cv.glmnet(y = dat$y,
x = data.matrix(dat %>% mutate(y = NULL)),
alpha = 1,
family = "gaussian")})
Your output will be neatly stored in fit, which is a list with 10 elements. You can call each element with fit[[i]]. For example coef(fit[[1]]) pulls out the coefs for train1 and lapply(fit, coef) pulls the coef for all 10 models and stores them in a list.

R Simulation Programming Efficiency

I am a relatively new R programmer and have written a script that takes some statistical results and will ultimately compare it to a vector of results in which the target variable has been randomized. The result vector contains the statistical results of n simulations. As the number of simulations increases (I would like to run 10,000 simulations at least) the run time is longer than I would like. I have tried increasing the performance in ways I know to modify the code, but would love the help of others in optimizing it. The relevant part of the code is below.
#CREATE DATA
require(plyr)
Simulations <- 10001
Variation <- c("Control", "A", "B","C")
Trials <- c(727,724,723,720)
NonResponse <- c(692,669,679,682)
Response <- c(35,55,44,38)
ConfLevel <- .95
#PERFORM INITIAL CALCS
NonResponse <- Trials-Response
Data <-data.frame(Variation, NonResponse, Response, Trials)
total <- ddply(Data,.(Variation),function(x){data.frame(value = rep(c(0,1),times = c(x$NonResponse,x$Response)))})
total <- total[sample(1:nrow(total)), ]
colnames(total) <- c("Variation","Response")
#CREATE FUNCTION TO PERFORM SIMULATIONS
targetshuffle <- function(x)
{
shuffle_target <- x[,"Response"]
shuffle_target <- data.frame(sample(shuffle_target))
revised <- cbind(x[,"Variation"], shuffle_target)
colnames(revised) <- c("Variation","Yes")
yes_variation <- data.frame(table(revised$Yes,revised$Variation))
colnames(yes_variation) <- c("Yes","Variation","Shuffled_Response")
Shuffled_Data <- subset(yes_variation, yes_variation$Yes==1)
Shuffled_Data <- Shuffled_Data[match(Variation, Shuffled_Data$Variation),]
yes_variation <- cbind(Data,Shuffled_Data)
VectorPTest_All <- yes_variation[,c("Variation","NonResponse","Response","Trials","Shuffled_Response")]
Control_Only <- yes_variation[yes_variation$Variation=="Control",]
VectorPTest_Chall <- subset(yes_variation,!(Variation=="Control"))
VectorPTest_Chall <- VectorPTest_Chall[,c("Variation","NonResponse","Response","Trials","Shuffled_Response")]
ControlResponse <- Control_Only$Response
ControlResponseRevised <- Control_Only$Shuffled_Response
ControlTotal <- Control_Only$Trials
VariationCount <- length(VectorPTest_Chall$Variation)
VP <- data.frame(c(VectorPTest_Chall,rep(ControlResponse),rep(ControlResponseRevised),rep(ControlTotal)))
names(VP) <- c("Variation","NonResponse","Response", "Trials", "ResponseShuffled", "ControlReponse",
"ControlResponseShuffled","ControlTotal")
VP1 <<- data.frame(VP[,c(5,7,4,8)])
VP2 <<- data.frame(VP[,c(3,6,4,8)])
ptest <- apply(VP1, 1, function(column) prop.test(x=c(column[1], column[2]),
n=c(column[3], column[4]), alternative="two.sided",
conf.level=ConfLevel, correct=FALSE)$p.value)
min_p_value <- min(ptest)
return(min_p_value)
}
#CALL FUNCTION
sim_result <- do.call(rbind, rlply(Simulations, targetshuffle(total)))
Offhand, one thing to look at is creating all the data frames. Each time you do that you're copying all the data in the constituent object. If the dimensions are predictable you might consider creating empty matrices at the beginning of the function and populating them as you go.

Resources