Merge several confusion matrix with xtable - r

I have trained several models and want to summarise their performance with three confusion matrix. What I want to do is to combine three different confusion matrix into one table using xtable. I want to combine table 1, 2 and 3. See an example below using XGBoost.
require(xgboost)
require(xtable)
require(caTools)
require(tidyverse)
set.seed(1234)
# Loading data
x1 = c(rnorm(10000, 0,1), rnorm(10000,3,1))
x2 = rnorm(1000)
x3 = rnorm(1000)
class= factor(rep(0:1, each=10000))
df <- as.data.frame(cbind(x1, x2, x3, class))
# Preparing target variable
df$class <- as.numeric(df$class)
df$class <- df$class -1
# Creating a hold-out data
train <- sample.split(df$class, SplitRatio = 0.70)
train.df <- subset(df, train == TRUE)
test.df <- subset(df, train == FALSE)
#Labels.
labels.train <- train.df[c('class')]
labels.test <- test.df[c('class')]
# Dropping target variable.
train.df <- train.df %>%
dplyr::select(-class)
test.df <- test.df %>%
dplyr::select(-class)
# Converting to appropiate format.
train <- xgb.DMatrix(as.matrix(train.df), label = as.matrix(labels.train))
test <- xgb.DMatrix(as.matrix(test.df), label = as.matrix(labels.test))
watchlist <- list(eval = test, train = train)
# Running the model
model <- xgb.train(data=train,
watchlist = watchlist,
nround = 1000,
early_stopping_rounds = 25,
objective = "binary:logistic")
# Predictions
pred <- predict(model, test)
# Evaluating the p-distribution.
hist(pred)
# Confusion matrix
table1 <- table(pred > 0.5, labels.test$class)
table2 <- table(pred > 0.25, labels.test$class)
table3 <- table(pred > 0.75, labels.test$class)
print(xtable(table1, caption = 'Threshhold = 50%'))
print(xtable(table2, caption = 'Threshhold = 25%'))
print(xtable(table3, caption = 'Threshhold = 75%'))
The outcome now looks like this
but I want it to look something like this

A possible solution using kable() from knitr, add_header_above() and kable_styling() from kableExtra is next. Add this code after creating the confusion matrices:
#Format table
t1 <- as.data.frame.matrix(table1)
t2 <- as.data.frame.matrix(table2)
t3 <- as.data.frame.matrix(table3)
#Bind
tm <- cbind(t1,t2,t3)
Then next code produces the output you want:
kable(tm,"latex",longtable =T,booktabs =T,caption ="Longtable")%>%
add_header_above(c(" ","p=50%"=2,"p=25%"=2,"p=75%"=2))%>%
kable_styling(latex_options =c("repeat_header"))
I have run the previous code in a rmarkdown document and the result is next:
You must also add libraries knitr and kableExtra to your code.

Related

Expand for-loop to accommodate list in R?

I've recently been interested in trying to develop a for-loop that would be able to run multiple generalized additive models and then produce results in a table that ranks them based on AIC, p-value of each smooth in the model, deviance explained of the overall model, etc.
I found this related question in stack overflow which is basically what I want and was able to run this well for gam() instead of gamm(), however I want to expand this to include multiple independent variables in the model, not just 1.
Ideally, the models would run all possible combinations of independent variables against the dependent variable, and it would test combinations anywhere from 1 independent variable in the model, up to all of the possible covariates in "d_pred" in the model.
I have attempted to do this so far by starting out small and finding all possible combinations of 2 independent variables (df_combinations2), which results in a list of data frames. Then I adjusted the rest of the code to run the for loop such that each iteration will run a different combination of the two variables:
library(mgcv)
## Example data
set.seed(0)
dat <- gamSim(1,n=200,scale=2)
set.seed(1)
dat2 <- gamSim(1,n=200,scale=2)
names(dat2)[1:5] <- c("y1", paste0("x", 4:7))
d <- cbind(dat[, 1:5], dat2[, 1:5])
d_resp <- d[ c("y", "y1")]
d_pred <- d[, !(colnames(d) %in% c("y", "y1"))]
df_combinations2 <- lapply(1:(ncol(combn(1:ncol(d_pred), m = 2))),
function(y) d_pred[, combn(1:ncol(d_pred), m = 2)[,y]])
## create a "matrix" list of dimensions i x j
results_m2 <-lapply(1:length(df_combinations2), matrix, data= NA, nrow=ncol(d_resp), ncol=2)
## for-loop
for(k in 1:length(df_combinations2)){
for(i in 1:ncol(d_resp)){
for(j in 1:ncol(df_combinations2[[k]])){
results_m2[i, j][[1]] <- gam(d_resp[, i] ~ s(df_combinations2[[k]][,1])+s(df_combinations2[[k]][,2]))
}
}}
However, after running the for-loop I get the error "Error in all.vars1(gp$fake.formula[-2]) : can't handle [[ in formula".
Anyone know why I am getting this error/ how to fix it?
Any insight is much appreciated. Thanks!
Personally, I would create a data.table() containing all combinations of target variables and combinations of predictors and loop through all rows. See below.
library(data.table)
library(dplyr)
# Example data
set.seed(0)
dat <- gamSim(1,n=200,scale=2)
set.seed(1)
dat2 <- gamSim(1,n=200,scale=2)
names(dat2)[1:5] <- c("y1", paste0("x", 4:7))
d <- cbind(dat[, 1:5], dat2[, 1:5])
#select names of targets and predictors
targets <- c("y", "y1")
predictors <- colnames(d)[!colnames(d) %in% targets]
#create all combinations of predictors
predictor_combinations <- lapply(1:length(predictors), FUN = function(x){
#create combination
combination <- combn(predictors, m = x) |> as.data.table()
#add s() to all for gam
combination <- sapply(combination, FUN = function(y) paste0("s(", y, ")")) |> as.data.table()
#collapse
combination <- summarize_all(combination, .funs = paste0, collapse = "+")
#unlist
combination <- unlist(combination)
#remove names
names(combination) <- NULL
#return
return(combination)
})
#merge combinations of predictors as vector
predictor_combinations <- do.call(c, predictor_combinations)
#create folder to save results to
if(!dir.exists("dev")){
dir.create("dev")
}
if(!dir.exists("dev/models")){
dir.create("dev/models")
}
#create and save hypergrid (all combinations of targets and predictors combinations)
if(!file.exists("dev/hypergrid.csv")){
#create hypergrid and save to dev
hypergrid <- expand.grid(target = targets, predictors = predictor_combinations) |> as.data.table()
#add identifier
hypergrid[, model := paste0("model", 1:nrow(hypergrid))]
#save to dev
fwrite(hypergrid, file = "dev/hypergrid.csv")
} else{
#if file exists read
hypergrid <- fread("dev/hypergrid.csv")
}
#loop through hypergrid, create GAM models
#progressbar
pb <- txtProgressBar(min = 1, max = nrow(hypergrid), style = 3)
for(i in 1:nrow(hypergrid)){
#update progressbar
setTxtProgressBar(pb, i)
#select target
target <- hypergrid[i,]$target
#select predictors
predictors <- hypergrid[i,]$predictors
#create formula
gam.formula <- as.formula(paste0(target, "~", predictors))
#run gam
gam.model <- gam(gam.formula, data = d)
#save gam model do dev/model
saveRDS(gam.model, file = paste0("dev/models/", hypergrid[i,]$model, ".RDS"))
}
#example where you extract model performances
for(i in 1:nrow(hypergrid)){
#read the right model
rel.model <- readRDS(paste0("dev/models/", hypergrid[i,]$model, ".RDS"))
#extract model performance, add to hypergrid
hypergrid[i, R2 := summary(rel.model)[["r.sq"]]]
}
#arrange hypergrid on target and r2
hypergrid <- dplyr::arrange(hypergrid, hypergrid$target, desc(hypergrid$R2))
Which would give
head(hypergrid)
target predictors model R2
1: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5) model319 0.6957242
2: y s(x0)+s(x1)+s(x2)+s(x3)+s(x4)+s(x5) model423 0.6953753
3: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5)+s(x7) model437 0.6942054
4: y s(x0)+s(x1)+s(x2)+s(x5) model175 0.6941025
5: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5)+s(x6) model435 0.6940569
6: y s(x0)+s(x1)+s(x2)+s(x3)+s(x4)+s(x5)+s(x7) model481 0.6939756
All models are saved to a folder with an identifier (for if you want to use the model or extract more information from the model).
Notably, p-hacking comes to mind using this appraoch and I would be careful by conducting your analysis like this.

Automate Machine Learning process with R on multiple datasets

I have multiple datasets with different lengths. I want to apply a correlation function to delete correlated variables with 98%. How can I use a loop to apply a correlation function on multiple datasets in the same time and store the variables selected in new dataframes?
How can I also use lasso regression on multiple datasets, also using loop functions? Thank you
H<-data.frame(replicate(10,sample(0:20,10,rep=TRUE)))
C<-data.frame(replicate(5,sample(0:100,10,rep=FALSE)))
R<-data.frame(replicate(7,sample(0:30,10,rep=TRUE)))
E<-data.frame(replicate(4,sample(0:40,10,rep=FALSE)))
# Corrélation
library("caret")
library("dplyr")
data.cor <- cor(subset(H, select = -c(X10)))
high.cor <- findCorrelation(data.cor, cutoff=0.98)
remove <- names(H[high.cor])
remove <- c(remove)
myvars <- names(H) %in% remove
var_selected <- H[!myvars]
new_data_H <- var_selected
Here's one way (of several) to do this:
# Corrélation
library(caret)
library(dplyr)
set.seed(99)
H <- data.frame(replicate(10,sample(0:20,10,rep=TRUE)))
C <- data.frame(replicate(5,sample(0:100,10,rep=FALSE)))
R <- data.frame(replicate(7,sample(0:30,10,rep=TRUE)))
E <- data.frame(replicate(4,sample(0:40,10,rep=FALSE)))
# Combine input datasets a list
inputs <- list(H, C, R, E)
# Empty list to hold results
outputs <- list()
# Loop over each dataset, one at a time
for(df in inputs){
data.cor <- cor(df)
high.cor <- findCorrelation(data.cor, cutoff=0.40)
# Subset the dataset based on `high.cor`
# Add the subsetted dataset to a output list of datasets
outputs <- append(outputs, list(df[,-high.cor]))
}
# This is the first dataset processed by the loop
outputs[[1]]
# Second...
outputs[[2]]
# Third...
outputs[[3]]
edit: integrating your lasso routine
library(glmnet)
library(caret)
set.seed(99)
## Define data (indpendent variables)
H <- data.frame(replicate(10,sample(0:20,10,rep=TRUE)))
C <- data.frame(replicate(5,sample(0:100,10,rep=FALSE)))
R <- data.frame(replicate(7,sample(0:30,10,rep=TRUE)))
E <- data.frame(replicate(4,sample(0:40,10,rep=FALSE)))
inputs <- list(H, C, R, E)
## Define targets (dependent variables)
Y_H <- data.frame(label_1 = replicate(1,sample(20:35, 10, rep = TRUE)))
Y_C <- data.frame(label_2 = replicate(1,sample(15:65, 10, rep = TRUE)))
Y_R <- data.frame(label_3 = replicate(1,sample(25:45, 10, rep = TRUE)))
Y_E <- data.frame(label_4 = replicate(1,sample(21:80, 10, rep = TRUE)))
targets <- list(Y_H, Y_C, Y_R, Y_E)
## Remove coorelated independent variables
outputs <- list()
for(df in inputs){
data.cor <- cor(df)
high.cor <- findCorrelation(data.cor, cutoff=0.40)
outputs <- append(outputs, list(df[,-high.cor]))
}
## Do lasso regression
lasso_cv <- list()
lasso_model <- list()
for(i in 1:length(outputs)){
for(j in 1:length(targets)){
lasso_cv[[i]] <- cv.glmnet(
as.matrix(outputs[[i]]), as.matrix(targets[[j]]), standardize = TRUE, type.measure = "mse", alpha = 1, nfolds = 3)
lasso_model[[i]] <- glmnet(
as.matrix(outputs[[i]]), as.matrix(targets[[j]]), lambda = lasso_cv[[i]]$lambda_cv, standardize = TRUE, alpha = 1)
}
}
Create target variables for each dataframe
Combhine all dataframes in list
Combine all targets in list
Note: every target variable correspond to a dataframe
Correlation: delete correlated variables
Performing lasso regression for all lists
Create dataframes
set.seed(99)
H <- data.frame(replicate(10,sample(0:20,10,rep=TRUE)))
C <- data.frame(replicate(5,sample(0:100,10,rep=FALSE)))
R <- data.frame(replicate(7,sample(0:30,10,rep=TRUE)))
E <- data.frame(replicate(4,sample(0:40,10,rep=FALSE)))
Y_H <- data.frame(replicate(1,sample(20:35, 10, rep = TRUE)))
Y_H
names(Y_H)<-
names(Y_H)names(Y_H)=="replicate.1..sample.20.35..10..rep...TRUE.."] <-"label_1"
Y_C <- data.frame(replicate(1,sample(15:65, 10, rep = TRUE)))
names(Y_C) <-
names(Y_C)[names(Y_C)=="replicate.1..sample.15.65..10..rep...TRUE.."] <-"label_2"
Y_R <- data.frame(replicate(1,sample(25:45, 10, rep = TRUE)))
names(Y_R) <-names(Y_R)[names(Y_R) == "replicate.1..sample.25.45..10..rep...TRUE.."] <- "label_3"
Y_E <- data.frame(replicate(1,sample(21:80, 10, rep = TRUE)))
names(Y_E) <-names(Y_E)[names(Y_E) == "replicate.1..sample.15.65..10..rep...TRUE.."] <- "label_4"
inputs <- list(H, C, R, E)
targets <- list(Y_H, Y_C, Y_R, Y_E)
outputs <- list()
for(df in inputs){
data.cor <- cor(df)
high.cor <- findCorrelation(data.cor, cutoff=0.40)
outputs <- append(outputs, list(df[,-high.cor]))
}
library("glmnet")
lasso_cv <- list()
lasso_model <- list()
for(i in outputs){
for(j in targets){
lasso_cv[i] <- cv.glmnet(as.matrix(outputs[[i]]), as.matrix(targets[[j]]),
standardize = TRUE, type.measure="mse", alpha = 1,nfolds = 3)
lasso_model[i] <- glmnet(as.matrix(outputs[[i]]), as.matrix(targets[[j]]),lambda = lasso_cv[i]$lambda_cv, alpha = 1, standardize = TRUE)
}
}

Accessing a variable in a data frame by columns number in R?

I have a data frame as "df" and 41 variables var1 to var41. If I write this command
pcdtest(plm(var1~ 1 , data = df, model = "pooling"))[[1]]
I can see the test value. But I need to apply this test 41 times. I want to access variable by column number which is "df[1]" for "var1" and "df[41]" for "var41"
pcdtest(plm(df[1]~ 1 , data = dfp, model = "pooling"))[[1]]
But it fails. Could you please help me to do this? I will have result in for loop. And I will calculate the descriptive statistics for all the results. But it is very difficult to do test for each variable.
I think you can easily adapt the following code to your data. Since you didn't provide any of your data, I used data that comes with the plm package.
library(plm) # for pcdtest
# example data from plm package
data("Cigar" , package = "plm")
Cigar[ , "fact1"] <- c(0,1)
Cigar[ , "fact2"] <- c(1,0)
Cigar.p <- pdata.frame(Cigar)
# example for one column
p_model <- plm(formula = pop~1, data = Cigar.p, model = "pooling")
pcdtest(p_model)[[1]]
# run through multiple models
l_plm_models <- list() # store plm models in this list
l_tests <- list() # store testresults in this list
for(i in 3:ncol(Cigar.p)){ # start in the third column, since the first two are state and year
fmla <- as.formula(paste(names(Cigar.p)[i], '~ 1', sep = ""))
l_plm_models[[i]] <- plm(formula = as.formula(paste0(colnames(Cigar.p)[i], "~ 1", sep = "")),
data = Cigar.p,
model = "pooling")
l_tests[[i]] <- pcdtest(l_plm_models[[i]])[[1]]
}
testresult <- data.frame("z" = unlist(l_tests), row.names = (colnames(Cigar.p[3:11])))
> testresult
z
price 175.36476
pop 130.45774
pop16 155.29092
cpi 176.21010
ndi 175.51938
sales 99.02973
pimin 175.74600
fact1 176.21010
fact2 176.21010
# example for cipstest
matrix_results <- matrix(NA, nrow = 11, ncol = 2) # use 41 here for your df
l_ctest <- list()
for(i in 3:ncol(Cigar.p)){
l_ctest[[i]] <- cipstest(Cigar.p[, i], lags = 4, type = 'none', model = 'cmg', truncated = F)
matrix_results[i, 1] <- as.numeric(l_ctest[[i]][1])
matrix_results[i, 2] <- as.numeric(l_ctest[[i]][7])
}
res <- data.frame(matrix_results)
names(res) <- c('cips-statistic', 'p-value')
print(res)
Try using as.formula(), for example:
results <- list()
for (i in 1:41){
varName <- paste0('var',i)
frml <- paste0(varName, ' ~ 1')
results[[i]] <-
pcdtest(plm(as.formula(frml) , data = dfp, model = "pooling"))[[1]]
}
You can use reformulate to create the formula and apply the code for 41 times using lapply :
var <- paste0('var', 1:41)
result <- lapply(var, function(x) pcdtest(plm(reformulate('1', x),
data = df, model = "pooling"))[[1]])

Combining regression summary outputs from multiple samples into a single dataframe in R

I'm trying to combine multiple lm outputs into a data frame, for further calculations. I have a dataset of 1000 observations and 62 variables. The project is to randomly split the dataset 63/37, train the model, repeat this 1000 times and save the coefficients, the fitted values, and the r2 for all 1000 runs. So I'm doing most of that here (using mtcars):
data("mtcars")
f <- function () {
fit <- lm(mpg ~ ., data = mtcars, subset = sample <- sample.int(n = nrow(mtcars), size = floor(.63*nrow(mtcars)), replace = F))
coef(fit)
}
output <- t(replicate(1000, f()))
I know I can get the rsq values with summary(fit)$r.squared and I can use predict() to get the fitted values. I'm just struggling with how to get them into the data frame with the saved coefficients.
The below should do
get_model <- function (input_data) {
fit <- lm(mpg ~ .,
data = mtcars,
subset = sample <- sample.int(n = nrow(mtcars),
size = floor(.63*nrow(mtcars)), replace = F)
)
return(fit)
}
get_results <- function(lm_model){
data <- data.frame()
data <- rbind(data, coef(lm_model))
data <- cbind(data, summary(lm_model)$r.squared)
colnames(data) <- c(names(mtcars), "rsquared")
return(data)
}
# running the above
input_data <- mtcars
general_df <- data.frame()
for(i in 1:1000){
my_model <- get_model(input_data)
final_data <- get_results(my_model)
general_df <- rbind(general_df, final_data)
}
You are very close:
library(tidyverse)
library(modelr)
data("mtcars")
get_data_lm <- function(data_df, testPCT = 0.37){
data_resample <- modelr::crossv_mc(data_df, n = 1, test = testPCT)
fit <- lm(mpg ~ ., data = as.data.frame(data_resample$train))
stats <- c(coef(fit),
"R2" = summary(fit)$r.squared,
"AdjR2" = summary(fit)$adj.r.squared)
pred_vals <- predict(fit, newdata = as.data.frame(data_resample$test))
c(stats, pred_vals)
}
output <- t(replicate(1000, get_data_lm(mtcars)))
The only thing you needed to do is concatenate the other statistics and predicted values you want. Alternatively, you could use a parallel sapply() variant to make your simulation considerably faster.
Another comment: I use the crossv_mc() function from the modelr:: package to create one testing and training partition. However, I could have used n = 1000 outside the function instead; this would have created a resample data frame in my working environment for me to apply() a function over. See the modelr:: GitHub page for more info.

I am getting same predicted values for all test data while using artifical neural network code in R

I am very new to R language. Here is my code ..
apply(DataFrame,2,range)
maxValue <- apply(DataFrame,2,max)
minValue <- apply(DataFrame,2,min)
#not used ...
DataFrame<- as.data.frame(scale(DataFrame, center = minValue,scale = maxValue-minValue))
ind<- sample(1:nrow(DataFrame),2000)
trainDF <- DataFrame[ind,]
testDF <- DataFrame[-ind,]
head(trainDF)
allVars <-colnames(DataFrame)
predictorVars <- allVars[!allVars%in%"TEMP"]
predictorVars <- paste(predictorVars,collapse = "+")
form=as.formula(paste("TEMP~",predictorVars,collapse = "+"))
#form= (paste("TEMP~DEWP+SLP+STP+VISIB+WDSP+MAX+MIN"))
neuralModel <- neuralnet(formula= form, hidden = c(4,2),linear.output = FALSE ,data=trainDF)
plot(neuralModel)
head(testDF)
prediction <- compute(neuralModel, testDF[,1:11])
str(prediction)
prediction <- prediction$net.result*(max(testDF$TEMP)-min(testDF$TEMP))+min(testDF$TEMP)
actualValues <- ((testDF$TEMP)*(max(testDF$TEMP)-min(testDF$TEMP))+min(testDF$TEMP))
here when I check prediction[1] or any test data, it always give me same response.

Resources