I have a big data set and I want to choose randomly subsets (randomly_live) from it and then run a model (logistic regression) in R. So I want to run 100 logistic regressions to count how many times coefficients were with positive sign, haw many times they were significant and show the best model by Hosmer-Lemeshow criteria.
I think it's possible to make it by loop, but I feel really confused with that.
This is a piece of code that I have for one iteration
randomRows = function(df,n){
return(df[sample(nrow(df),n),])
}
set.seed(567)
df.split <- split(full_data, full_data$ID)
df.sample <- lapply(df.split, randomRows, 1)
df.final <- do.call("rbind", df.sample)
randomly_live <- randomRows(df.final, nrow(default))
data1 <- rbind(default, randomly_live)
model = glm(default ~ log(assets)+…+H1, data = data1,
family = 'binomial')
library(ResourceSelection)
hl <- hoslem.test(model$y, fitted(model), g=10)
Can anyone please help?
Here is something that could work
myResults <- list()
for(i in 1:100){
model <- glm(vs ~ . , data = mtcars)
hl <- hoslem.test(model$y, fitted(model), g=10)
pos <- length(which(coef(model)>0))
pvals <- summary(model)$coefficients[,4]
hl_pval <- hl$p.value
myResults[[i]] <- list(pos = pos, pvals = pvals,hl_pval=hl_pval)
}
# lowest pvalue
which.min(unlist(lapply(myResults, FUN = function(x) x[[3]])))
Related
I've recently been interested in trying to develop a for-loop that would be able to run multiple generalized additive models and then produce results in a table that ranks them based on AIC, p-value of each smooth in the model, deviance explained of the overall model, etc.
I found this related question in stack overflow which is basically what I want and was able to run this well for gam() instead of gamm(), however I want to expand this to include multiple independent variables in the model, not just 1.
Ideally, the models would run all possible combinations of independent variables against the dependent variable, and it would test combinations anywhere from 1 independent variable in the model, up to all of the possible covariates in "d_pred" in the model.
I have attempted to do this so far by starting out small and finding all possible combinations of 2 independent variables (df_combinations2), which results in a list of data frames. Then I adjusted the rest of the code to run the for loop such that each iteration will run a different combination of the two variables:
library(mgcv)
## Example data
set.seed(0)
dat <- gamSim(1,n=200,scale=2)
set.seed(1)
dat2 <- gamSim(1,n=200,scale=2)
names(dat2)[1:5] <- c("y1", paste0("x", 4:7))
d <- cbind(dat[, 1:5], dat2[, 1:5])
d_resp <- d[ c("y", "y1")]
d_pred <- d[, !(colnames(d) %in% c("y", "y1"))]
df_combinations2 <- lapply(1:(ncol(combn(1:ncol(d_pred), m = 2))),
function(y) d_pred[, combn(1:ncol(d_pred), m = 2)[,y]])
## create a "matrix" list of dimensions i x j
results_m2 <-lapply(1:length(df_combinations2), matrix, data= NA, nrow=ncol(d_resp), ncol=2)
## for-loop
for(k in 1:length(df_combinations2)){
for(i in 1:ncol(d_resp)){
for(j in 1:ncol(df_combinations2[[k]])){
results_m2[i, j][[1]] <- gam(d_resp[, i] ~ s(df_combinations2[[k]][,1])+s(df_combinations2[[k]][,2]))
}
}}
However, after running the for-loop I get the error "Error in all.vars1(gp$fake.formula[-2]) : can't handle [[ in formula".
Anyone know why I am getting this error/ how to fix it?
Any insight is much appreciated. Thanks!
Personally, I would create a data.table() containing all combinations of target variables and combinations of predictors and loop through all rows. See below.
library(data.table)
library(dplyr)
# Example data
set.seed(0)
dat <- gamSim(1,n=200,scale=2)
set.seed(1)
dat2 <- gamSim(1,n=200,scale=2)
names(dat2)[1:5] <- c("y1", paste0("x", 4:7))
d <- cbind(dat[, 1:5], dat2[, 1:5])
#select names of targets and predictors
targets <- c("y", "y1")
predictors <- colnames(d)[!colnames(d) %in% targets]
#create all combinations of predictors
predictor_combinations <- lapply(1:length(predictors), FUN = function(x){
#create combination
combination <- combn(predictors, m = x) |> as.data.table()
#add s() to all for gam
combination <- sapply(combination, FUN = function(y) paste0("s(", y, ")")) |> as.data.table()
#collapse
combination <- summarize_all(combination, .funs = paste0, collapse = "+")
#unlist
combination <- unlist(combination)
#remove names
names(combination) <- NULL
#return
return(combination)
})
#merge combinations of predictors as vector
predictor_combinations <- do.call(c, predictor_combinations)
#create folder to save results to
if(!dir.exists("dev")){
dir.create("dev")
}
if(!dir.exists("dev/models")){
dir.create("dev/models")
}
#create and save hypergrid (all combinations of targets and predictors combinations)
if(!file.exists("dev/hypergrid.csv")){
#create hypergrid and save to dev
hypergrid <- expand.grid(target = targets, predictors = predictor_combinations) |> as.data.table()
#add identifier
hypergrid[, model := paste0("model", 1:nrow(hypergrid))]
#save to dev
fwrite(hypergrid, file = "dev/hypergrid.csv")
} else{
#if file exists read
hypergrid <- fread("dev/hypergrid.csv")
}
#loop through hypergrid, create GAM models
#progressbar
pb <- txtProgressBar(min = 1, max = nrow(hypergrid), style = 3)
for(i in 1:nrow(hypergrid)){
#update progressbar
setTxtProgressBar(pb, i)
#select target
target <- hypergrid[i,]$target
#select predictors
predictors <- hypergrid[i,]$predictors
#create formula
gam.formula <- as.formula(paste0(target, "~", predictors))
#run gam
gam.model <- gam(gam.formula, data = d)
#save gam model do dev/model
saveRDS(gam.model, file = paste0("dev/models/", hypergrid[i,]$model, ".RDS"))
}
#example where you extract model performances
for(i in 1:nrow(hypergrid)){
#read the right model
rel.model <- readRDS(paste0("dev/models/", hypergrid[i,]$model, ".RDS"))
#extract model performance, add to hypergrid
hypergrid[i, R2 := summary(rel.model)[["r.sq"]]]
}
#arrange hypergrid on target and r2
hypergrid <- dplyr::arrange(hypergrid, hypergrid$target, desc(hypergrid$R2))
Which would give
head(hypergrid)
target predictors model R2
1: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5) model319 0.6957242
2: y s(x0)+s(x1)+s(x2)+s(x3)+s(x4)+s(x5) model423 0.6953753
3: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5)+s(x7) model437 0.6942054
4: y s(x0)+s(x1)+s(x2)+s(x5) model175 0.6941025
5: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5)+s(x6) model435 0.6940569
6: y s(x0)+s(x1)+s(x2)+s(x3)+s(x4)+s(x5)+s(x7) model481 0.6939756
All models are saved to a folder with an identifier (for if you want to use the model or extract more information from the model).
Notably, p-hacking comes to mind using this appraoch and I would be careful by conducting your analysis like this.
I want to use mixture copula for reliability analysis, now ,with the help of a friend ,I've already finished it ‘RVMs_fitted’ 。now i want to perform the probability integral transformation (PIT),but the function of RVINEPIT can’t use,because RVINEPIT(data,RVM),this RVM not RVINEMATRIX Here is my code:
library(vineclust)
data1 <- read.csv("D:/ASTUDY/Rlanguage/Mix copula/data.csv", header = FALSE)
fit <- vcmm(data = data1, total_comp=3,is_cvine = 0)
print(fit)
summary(fit)
RVMs_fitted <- list()
RVMs_fitted[[1]] <- VineCopula::RVineMatrix(Matrix=fit$output$vine_structure[,,1],
family=fit$output$bicop_familyset[,,1],
par=fit$output$bicop_param[,,1],
par2=fit$output$bicop_param2[,,1])
RVMs_fitted[[2]] <- VineCopula::RVineMatrix(Matrix=fit$output$vine_structure[,,2],
family=fit$output$bicop_familyset[,,2],
par=fit$output$bicop_param[,,2],
par2=fit$output$bicop_param2[,,2])
RVMs_fitted[[3]] <- VineCopula::RVineMatrix(Matrix=fit$output$vine_structure[,,3],
family=fit$output$bicop_familyset[,,3],
par=fit$output$bicop_param[,,3],
par2=fit$output$bicop_param2[,,3])
RVM<-RVMs_fitted
meanx <- c(0.47,0.508,0.45,0.52,0.48)
sigmax <- c(0.318,0.322,0.296,0.29,0.279)
ux1<-pnorm(x[1],meanx[1],sigmax[1])
ux2<-pnorm(x[2],meanx[2],sigmax[2])
ux3<-pnorm(x[3],meanx[3],sigmax[3])
ux4<-pnorm(x[4],meanx[4],sigmax[4])
ux5<-pnorm(x[5],meanx[5],sigmax[5])
data <- c(ux1,ux2,ux3,ux4,ux5)
du=RVinePIT(data, RVM)
y=t(qnorm(t(du)))
Error:
In RVinePIT: RVM has to be an RVineMatrix object.
You have multiple problems here:
RVM is a list. However, you tried to fit RVinePIT to a list, while it works for one data at a time.
The same holds for the y.
I do not have your data, but try it with other data.
Here is the code (it should work):
library(vineclust)
library(VineCopula)
data1 <- read.csv("D:/ASTUDY/Rlanguage/Mix copula/data.csv", header = FALSE)
fit <- vcmm(data = data, total_comp=3,is_cvine = 0)
print(fit)
summary(fit)
RVMs_fitted <- list()
RVMs_fitted[[1]] <- RVineMatrix(Matrix=fit$output$vine_structure[,,1],
family=fit$output$bicop_familyset[,,1],
par=fit$output$bicop_param[,,1],
par2=fit$output$bicop_param2[,,1])
RVMs_fitted[[2]] <- RVineMatrix(Matrix=fit$output$vine_structure[,,2],
family=fit$output$bicop_familyset[,,2],
par=fit$output$bicop_param[,,2],
par2=fit$output$bicop_param2[,,2])
RVMs_fitted[[3]] <- RVineMatrix(Matrix=fit$output$vine_structure[,,3],
family=fit$output$bicop_familyset[,,3],
par=fit$output$bicop_param[,,3],
par2=fit$output$bicop_param2[,,3])
RVM<-RVMs_fitted
meanx <- c(0.47,0.508,0.45,0.52,0.48)
sigmax <- c(0.318,0.322,0.296,0.29,0.279)
ux1<-pnorm(x[1],meanx[1],sigmax[1])
ux2<-pnorm(x[2],meanx[2],sigmax[2])
ux3<-pnorm(x[3],meanx[3],sigmax[3])
ux4<-pnorm(x[4],meanx[4],sigmax[4])
ux5<-pnorm(x[5],meanx[5],sigmax[5])
data <- c(ux1,ux2,ux3,ux4,ux5)### This must be a matrix to work with RVinePIT
du=lapply(1:3, function(i) RVinePIT(data, RVM[[i]]))
y <-lapply(1:3, function(i) t(qnorm(t(du[[i]]))))
I want to be able to input the variable name that I'll be using in the "weights" option in the lmer function. So then I can change the dataset, and cycle through the "weights" and pull the correct variable.
I want to pull the correct column for weights within the for loop.
So for y, the equation would be:
lmer(y~x+(1|study), weights = weight.var)
And y1:
lmer(y1~x+(1|study),weights = weight.var1)
So I named the weighting variables (weight.opt), then want to use them in the formula within the for loop. I can use "as.formula" to get the formula working and connected to the dataset, but I'm not sure how to do something similar with the weights.
x <- rnorm(300,0,1)
y <- x*rnorm(300,2,0.5)
y1 <- x*rnorm(300,0.1,0.1)
study <- rep(c("a","b","c"),each = 100)
weight.var <- rep(c(0.5,2,4),each = 100)
weight.var1 <- rep(c(0.1,.2,.15),each = 100)
library(lme4)
dataset <- data.frame(x,y,y1,study,weight.var,weight.var1)
resp1 <- c("y","y1")
weight.opt <- c("weight.var","weight.var1")
for(i in 1:2){
lmer(as.formula(paste(resp1[i],"~x+(1|study)")),weights = weight.opt[i],data = dataset)
}
This seems to work fine:
res_list <- list()
for(i in 1:2){
res_list[[i]] <- lmer(as.formula(paste(resp1[i],"~x+(1|study)")),
weights = dataset[[weight.opt[i]]],data = dataset)
}
I am using R to do a ML project, I have prepared the dataset and split the data into 10 equal splits but the problem is I need to fit the model 10 times manually (10-fold CV). I have tried to create train and test data using a for loop but each time it runs, train is the whole dataset and test is null. Can someone help me, please?
# Preparing the data
data <- read.csv("./project.csv")
id <- seq(1:103342)
data[, 'id'] <- id
for (i in 3:8) {
data[,i] <- as.factor(data[,i])
}
# splitting the data into 10 equal data frames
f <- rep(seq(1, 10), each=round(103342/10), length.out=103342)
df <- split(data, f)
lapply(df, dim)
# running 10-fold cross-validation and computing error rate and AUC for each run.
results <- matrix(nrow=10, ncol=2, dimnames= list(c(), c('error_rate', 'auc')))
for (i in 1:10) {
train <- data[!(data$id %in% df$`i`$id),]
test <- df$`i`
print(dim(test)) # Here is my problem the print statement will print null 10 times
glm.fit <- glm(canceled ~ ., data=train, family=binomial)
glm.prob <- predict(glm.fit, newdata=test, type="response")
...
}
I'm trying to combine multiple lm outputs into a data frame, for further calculations. I have a dataset of 1000 observations and 62 variables. The project is to randomly split the dataset 63/37, train the model, repeat this 1000 times and save the coefficients, the fitted values, and the r2 for all 1000 runs. So I'm doing most of that here (using mtcars):
data("mtcars")
f <- function () {
fit <- lm(mpg ~ ., data = mtcars, subset = sample <- sample.int(n = nrow(mtcars), size = floor(.63*nrow(mtcars)), replace = F))
coef(fit)
}
output <- t(replicate(1000, f()))
I know I can get the rsq values with summary(fit)$r.squared and I can use predict() to get the fitted values. I'm just struggling with how to get them into the data frame with the saved coefficients.
The below should do
get_model <- function (input_data) {
fit <- lm(mpg ~ .,
data = mtcars,
subset = sample <- sample.int(n = nrow(mtcars),
size = floor(.63*nrow(mtcars)), replace = F)
)
return(fit)
}
get_results <- function(lm_model){
data <- data.frame()
data <- rbind(data, coef(lm_model))
data <- cbind(data, summary(lm_model)$r.squared)
colnames(data) <- c(names(mtcars), "rsquared")
return(data)
}
# running the above
input_data <- mtcars
general_df <- data.frame()
for(i in 1:1000){
my_model <- get_model(input_data)
final_data <- get_results(my_model)
general_df <- rbind(general_df, final_data)
}
You are very close:
library(tidyverse)
library(modelr)
data("mtcars")
get_data_lm <- function(data_df, testPCT = 0.37){
data_resample <- modelr::crossv_mc(data_df, n = 1, test = testPCT)
fit <- lm(mpg ~ ., data = as.data.frame(data_resample$train))
stats <- c(coef(fit),
"R2" = summary(fit)$r.squared,
"AdjR2" = summary(fit)$adj.r.squared)
pred_vals <- predict(fit, newdata = as.data.frame(data_resample$test))
c(stats, pred_vals)
}
output <- t(replicate(1000, get_data_lm(mtcars)))
The only thing you needed to do is concatenate the other statistics and predicted values you want. Alternatively, you could use a parallel sapply() variant to make your simulation considerably faster.
Another comment: I use the crossv_mc() function from the modelr:: package to create one testing and training partition. However, I could have used n = 1000 outside the function instead; this would have created a resample data frame in my working environment for me to apply() a function over. See the modelr:: GitHub page for more info.