I am using R to do a ML project, I have prepared the dataset and split the data into 10 equal splits but the problem is I need to fit the model 10 times manually (10-fold CV). I have tried to create train and test data using a for loop but each time it runs, train is the whole dataset and test is null. Can someone help me, please?
# Preparing the data
data <- read.csv("./project.csv")
id <- seq(1:103342)
data[, 'id'] <- id
for (i in 3:8) {
data[,i] <- as.factor(data[,i])
}
# splitting the data into 10 equal data frames
f <- rep(seq(1, 10), each=round(103342/10), length.out=103342)
df <- split(data, f)
lapply(df, dim)
# running 10-fold cross-validation and computing error rate and AUC for each run.
results <- matrix(nrow=10, ncol=2, dimnames= list(c(), c('error_rate', 'auc')))
for (i in 1:10) {
train <- data[!(data$id %in% df$`i`$id),]
test <- df$`i`
print(dim(test)) # Here is my problem the print statement will print null 10 times
glm.fit <- glm(canceled ~ ., data=train, family=binomial)
glm.prob <- predict(glm.fit, newdata=test, type="response")
...
}
Related
I've recently been interested in trying to develop a for-loop that would be able to run multiple generalized additive models and then produce results in a table that ranks them based on AIC, p-value of each smooth in the model, deviance explained of the overall model, etc.
I found this related question in stack overflow which is basically what I want and was able to run this well for gam() instead of gamm(), however I want to expand this to include multiple independent variables in the model, not just 1.
Ideally, the models would run all possible combinations of independent variables against the dependent variable, and it would test combinations anywhere from 1 independent variable in the model, up to all of the possible covariates in "d_pred" in the model.
I have attempted to do this so far by starting out small and finding all possible combinations of 2 independent variables (df_combinations2), which results in a list of data frames. Then I adjusted the rest of the code to run the for loop such that each iteration will run a different combination of the two variables:
library(mgcv)
## Example data
set.seed(0)
dat <- gamSim(1,n=200,scale=2)
set.seed(1)
dat2 <- gamSim(1,n=200,scale=2)
names(dat2)[1:5] <- c("y1", paste0("x", 4:7))
d <- cbind(dat[, 1:5], dat2[, 1:5])
d_resp <- d[ c("y", "y1")]
d_pred <- d[, !(colnames(d) %in% c("y", "y1"))]
df_combinations2 <- lapply(1:(ncol(combn(1:ncol(d_pred), m = 2))),
function(y) d_pred[, combn(1:ncol(d_pred), m = 2)[,y]])
## create a "matrix" list of dimensions i x j
results_m2 <-lapply(1:length(df_combinations2), matrix, data= NA, nrow=ncol(d_resp), ncol=2)
## for-loop
for(k in 1:length(df_combinations2)){
for(i in 1:ncol(d_resp)){
for(j in 1:ncol(df_combinations2[[k]])){
results_m2[i, j][[1]] <- gam(d_resp[, i] ~ s(df_combinations2[[k]][,1])+s(df_combinations2[[k]][,2]))
}
}}
However, after running the for-loop I get the error "Error in all.vars1(gp$fake.formula[-2]) : can't handle [[ in formula".
Anyone know why I am getting this error/ how to fix it?
Any insight is much appreciated. Thanks!
Personally, I would create a data.table() containing all combinations of target variables and combinations of predictors and loop through all rows. See below.
library(data.table)
library(dplyr)
# Example data
set.seed(0)
dat <- gamSim(1,n=200,scale=2)
set.seed(1)
dat2 <- gamSim(1,n=200,scale=2)
names(dat2)[1:5] <- c("y1", paste0("x", 4:7))
d <- cbind(dat[, 1:5], dat2[, 1:5])
#select names of targets and predictors
targets <- c("y", "y1")
predictors <- colnames(d)[!colnames(d) %in% targets]
#create all combinations of predictors
predictor_combinations <- lapply(1:length(predictors), FUN = function(x){
#create combination
combination <- combn(predictors, m = x) |> as.data.table()
#add s() to all for gam
combination <- sapply(combination, FUN = function(y) paste0("s(", y, ")")) |> as.data.table()
#collapse
combination <- summarize_all(combination, .funs = paste0, collapse = "+")
#unlist
combination <- unlist(combination)
#remove names
names(combination) <- NULL
#return
return(combination)
})
#merge combinations of predictors as vector
predictor_combinations <- do.call(c, predictor_combinations)
#create folder to save results to
if(!dir.exists("dev")){
dir.create("dev")
}
if(!dir.exists("dev/models")){
dir.create("dev/models")
}
#create and save hypergrid (all combinations of targets and predictors combinations)
if(!file.exists("dev/hypergrid.csv")){
#create hypergrid and save to dev
hypergrid <- expand.grid(target = targets, predictors = predictor_combinations) |> as.data.table()
#add identifier
hypergrid[, model := paste0("model", 1:nrow(hypergrid))]
#save to dev
fwrite(hypergrid, file = "dev/hypergrid.csv")
} else{
#if file exists read
hypergrid <- fread("dev/hypergrid.csv")
}
#loop through hypergrid, create GAM models
#progressbar
pb <- txtProgressBar(min = 1, max = nrow(hypergrid), style = 3)
for(i in 1:nrow(hypergrid)){
#update progressbar
setTxtProgressBar(pb, i)
#select target
target <- hypergrid[i,]$target
#select predictors
predictors <- hypergrid[i,]$predictors
#create formula
gam.formula <- as.formula(paste0(target, "~", predictors))
#run gam
gam.model <- gam(gam.formula, data = d)
#save gam model do dev/model
saveRDS(gam.model, file = paste0("dev/models/", hypergrid[i,]$model, ".RDS"))
}
#example where you extract model performances
for(i in 1:nrow(hypergrid)){
#read the right model
rel.model <- readRDS(paste0("dev/models/", hypergrid[i,]$model, ".RDS"))
#extract model performance, add to hypergrid
hypergrid[i, R2 := summary(rel.model)[["r.sq"]]]
}
#arrange hypergrid on target and r2
hypergrid <- dplyr::arrange(hypergrid, hypergrid$target, desc(hypergrid$R2))
Which would give
head(hypergrid)
target predictors model R2
1: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5) model319 0.6957242
2: y s(x0)+s(x1)+s(x2)+s(x3)+s(x4)+s(x5) model423 0.6953753
3: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5)+s(x7) model437 0.6942054
4: y s(x0)+s(x1)+s(x2)+s(x5) model175 0.6941025
5: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5)+s(x6) model435 0.6940569
6: y s(x0)+s(x1)+s(x2)+s(x3)+s(x4)+s(x5)+s(x7) model481 0.6939756
All models are saved to a folder with an identifier (for if you want to use the model or extract more information from the model).
Notably, p-hacking comes to mind using this appraoch and I would be careful by conducting your analysis like this.
I would like to create replicate predictions for one integer independent variable (iv1) given some model and a data frame called training. This is my current approach. I appreciate this is not self containing but hopefully it is self explanatory:
number_of_samples <- 10
results <- NULL
for (row in 1:nrow(training)) {
fake_iv1_values <- sample(1:100, number_of_samples)
case <- training[row,]
for (iv1 in fake_iv1_values) {
case$iv1 <- iv1
case$prediction <- predict(some_model, newdata = case)
results <- rbind(results, case)
}
}
Using loops is very slow. I wonder, if this could be sped up? Thanks!
Try with this.
Reproducible fake data and model:
# create fake data
n_row <- 100
n_xs <- 100
training <- data.frame(y = rnorm(n_row), iv1 = rnorm(n_row))
training[, paste0("x",1:n_xs)] <- replicate(n_xs, list(rnorm(n_row)))
# example model
some_model <- lm(y~., training)
Rewritten code:
number_of_samples <- 10
results <- NULL
# vector of several fake_iv1_values vectors
fake_iv1_values <- as.numeric(replicate(nrow(training), sample(1:100, number_of_samples)))
# replicate each row of the original dataframe
results <- training[rep(seq_len(nrow(training)), each = number_of_samples), ]
# add fake values to the replicated dataframe
results$iv1 <- fake_iv1_values
# get predictions
results$prediction <- predict(some_model, newdata = results)
I have 9 different datasets with same variables. I added them all in a list.
Now I was to apply lm model on all the datasets in a list via for loops. I am having difficulty in splitting the data for testing.
for(i in data_list){
train = i[1: nrow(i)-444, ]
test = i[593:616,]
daily_lm <- lm(load ~ Daily, data = datatrain)
test$forecastlm_d <- list(predict(daily_lag_lm, test)
}```
How do I train and predict data so that it is in accordance with each dataset?
We could loop over the sequence of list while initiating a list to store the output of predictions
preds <- vector('list', length(data_list))
for(i in seq_along(data_list)){
tmpdat <- data_list[[i]]
train <- tmpdat[1:(nrow(tmpdat)-444),]
test <- tmpdat[593:616,]
daily_lm <- lm(load ~ Daily, data = train)
preds[[i]] <- predict(daily_lm, test)
}
Or another option is lapply
preds <- lapply(data_list, function(dat) {
train <- dat[1:(nrow(dat) - 444),]
test <- dat[593:616,]
daily_lm <- lm(load ~ Daily, data = train)
predict(daily_lm, test)
})
I have a big data set and I want to choose randomly subsets (randomly_live) from it and then run a model (logistic regression) in R. So I want to run 100 logistic regressions to count how many times coefficients were with positive sign, haw many times they were significant and show the best model by Hosmer-Lemeshow criteria.
I think it's possible to make it by loop, but I feel really confused with that.
This is a piece of code that I have for one iteration
randomRows = function(df,n){
return(df[sample(nrow(df),n),])
}
set.seed(567)
df.split <- split(full_data, full_data$ID)
df.sample <- lapply(df.split, randomRows, 1)
df.final <- do.call("rbind", df.sample)
randomly_live <- randomRows(df.final, nrow(default))
data1 <- rbind(default, randomly_live)
model = glm(default ~ log(assets)+…+H1, data = data1,
family = 'binomial')
library(ResourceSelection)
hl <- hoslem.test(model$y, fitted(model), g=10)
Can anyone please help?
Here is something that could work
myResults <- list()
for(i in 1:100){
model <- glm(vs ~ . , data = mtcars)
hl <- hoslem.test(model$y, fitted(model), g=10)
pos <- length(which(coef(model)>0))
pvals <- summary(model)$coefficients[,4]
hl_pval <- hl$p.value
myResults[[i]] <- list(pos = pos, pvals = pvals,hl_pval=hl_pval)
}
# lowest pvalue
which.min(unlist(lapply(myResults, FUN = function(x) x[[3]])))
I am new to R (day 2) and have been tasked with building a forest of random forests. Each individual random forest will be built using a different training set and we will combine all the forests at the end to make predictions. I am implementing this in R and am having some difficulty combining two forests not built using the same set. My attempt is as follows:
d1 = read.csv("../data/rr/train/10/chunk0.csv",header=TRUE)
d2 = read.csv("../data/rr/train/10/chunk1.csv",header=TRUE)
rf1 = randomForest(A55~., data=d1, ntree=10)
rf2 = randomForest(A55~., data=d2, ntree=10)
rf = combine(rf1,rf2)
This of course produces an error:
Error in rf$votes + ifelse(is.na(rflist[[i]]$votes), 0, rflist[[i]]$votes) :
non-conformable arrays
In addition: Warning message:
In rf$oob.times + rflist[[i]]$oob.times :
longer object length is not a multiple of shorter object length
I have been browsing the web for some time looking at a clue for this but haven't had any success yet. Any help here would be most appreciated.
Ah. This is either an oversight in combine or what you're trying to do is nonsensical, depending on your point of view.
The votes matrix records the number of votes in the forest for each case in the training data for each response category. Naturally, it will have the same number of rows as the number of rows in your training data.
combine is assuming that you ran your random forests twice on the same set of data, so the dimensions of those matrices will be the same. It's doing this because it wants to provide you with some "overall" error estimates for the combined forest.
But if the two data sets are different combining the votes matrices becomes simply nonsensical. You could get combine to run by simply removing one row from your larger training data set, but the resulting votes matrix in the combined forest would be gibberish, since each row would be a combination of votes for two different training cases.
So maybe this is simply something that should be an option that can be turned off in combine. Because it should still make sense to combine the actual trees and predict on the resulting object. But some of "combined" error estimates in the output from combine will be meaningless.
Long story short, make each training data set the same size, and it will run. But if you do, I wouldn't use the resulting object for anything other than making new predictions. Anything that is combined that was summarizing the performance of the forests will be nonsense.
However, I think the intended way to use combine is to fit multiple random forests on the full data set, but with a reduced number of trees and then to combine those forests.
Edit
I went ahead and modified combine to "handle" unequal training set sizes. All that means really is that I removed a large chunk of code that was trying to stitch things together that weren't going to match up. But I kept the portion that combines the forests, so you can still use predict:
my_combine <- function (...)
{
pad0 <- function(x, len) c(x, rep(0, len - length(x)))
padm0 <- function(x, len) rbind(x, matrix(0, nrow = len -
nrow(x), ncol = ncol(x)))
rflist <- list(...)
areForest <- sapply(rflist, function(x) inherits(x, "randomForest"))
if (any(!areForest))
stop("Argument must be a list of randomForest objects")
rf <- rflist[[1]]
classRF <- rf$type == "classification"
trees <- sapply(rflist, function(x) x$ntree)
ntree <- sum(trees)
rf$ntree <- ntree
nforest <- length(rflist)
haveTest <- !any(sapply(rflist, function(x) is.null(x$test)))
vlist <- lapply(rflist, function(x) rownames(importance(x)))
numvars <- sapply(vlist, length)
if (!all(numvars[1] == numvars[-1]))
stop("Unequal number of predictor variables in the randomForest objects.")
for (i in seq_along(vlist)) {
if (!all(vlist[[i]] == vlist[[1]]))
stop("Predictor variables are different in the randomForest objects.")
}
haveForest <- sapply(rflist, function(x) !is.null(x$forest))
if (all(haveForest)) {
nrnodes <- max(sapply(rflist, function(x) x$forest$nrnodes))
rf$forest$nrnodes <- nrnodes
rf$forest$ndbigtree <- unlist(sapply(rflist, function(x) x$forest$ndbigtree))
rf$forest$nodestatus <- do.call("cbind", lapply(rflist,
function(x) padm0(x$forest$nodestatus, nrnodes)))
rf$forest$bestvar <- do.call("cbind", lapply(rflist,
function(x) padm0(x$forest$bestvar, nrnodes)))
rf$forest$xbestsplit <- do.call("cbind", lapply(rflist,
function(x) padm0(x$forest$xbestsplit, nrnodes)))
rf$forest$nodepred <- do.call("cbind", lapply(rflist,
function(x) padm0(x$forest$nodepred, nrnodes)))
tree.dim <- dim(rf$forest$treemap)
if (classRF) {
rf$forest$treemap <- array(unlist(lapply(rflist,
function(x) apply(x$forest$treemap, 2:3, pad0,
nrnodes))), c(nrnodes, 2, ntree))
}
else {
rf$forest$leftDaughter <- do.call("cbind", lapply(rflist,
function(x) padm0(x$forest$leftDaughter, nrnodes)))
rf$forest$rightDaughter <- do.call("cbind", lapply(rflist,
function(x) padm0(x$forest$rightDaughter, nrnodes)))
}
rf$forest$ntree <- ntree
if (classRF)
rf$forest$cutoff <- rflist[[1]]$forest$cutoff
}
else {
rf$forest <- NULL
}
#
#Tons of stuff removed here...
#
if (classRF) {
rf$confusion <- NULL
rf$err.rate <- NULL
if (haveTest) {
rf$test$confusion <- NULL
rf$err.rate <- NULL
}
}
else {
rf$mse <- rf$rsq <- NULL
if (haveTest)
rf$test$mse <- rf$test$rsq <- NULL
}
rf
}
And then you can test it like this:
data(iris)
d <- iris[sample(150,150),]
d1 <- d[1:70,]
d2 <- d[71:150,]
rf1 <- randomForest(Species ~ ., d1, ntree=50, norm.votes=FALSE)
rf2 <- randomForest(Species ~ ., d2, ntree=50, norm.votes=FALSE)
rf.all <- my_combine(rf1,rf2)
predict(rf.all,newdata = iris)
Obviously, this comes with absolutely no warranty! :)