Neural Network model doesn't run - r

for some reason, my model is not running. I created a model matrix to run a simple model with the package neuralnet. I know it might be challenging to debug other people code especially without the data but in case you think you could assist me here is the code:
library(tidyverse)
library(neuralnet)
#Activity 1 Load Data
featchannels <-read.csv("features_channel.csv")
trainTargets <-read.table("traintargets.txt")
#Activity 2 Normalize every column of the features dataset using min-max
normalization to range [0-1].
normalized <- function(x) {
return((x-min(x)) /(max(x) -min(x)))
}
featchannels <- normalized(featchannels)
#Activity 3 Add a target feature named response to the features dataset
with 0-1 values read from trainTargets.txt, with 1 indicating P300
response and 0 otherwise.
colnames(trainTargets)[1] <- "State"
featchannels <- cbind(featchannels, trainTargets)
# Changing rows to P300 and others.
featchannels <- within(featchannels, State <- factor(State, labels =
c("Other", "P300")))
featchannels$State <- as.factor(featchannels$State)
#4. Take the first 3840 rows of the dataset as the training data set, and
the remaining 960 rows as the testing data set.
training <- featchannels[1:3840,]
testing <- featchannels[3841:4800,]
enter code here
#Activitry 6
#Creating model matrix before runing the model
df_comb_training <- training
y <- model.matrix(~ df_comb_training$State + 0, data = df_comb_training[,
c('State'), drop=FALSE])
# fix up names for as.formula
y_feats <- gsub("^[^ ]+\\$", "", colnames(y))
colnames(y) <- y_feats
df_comb_training <- df_comb_training[, !(colnames(df_comb_training) ==
"State")]
feats <- colnames(df_comb_training)
df_comb_training <- cbind(y, df_comb_training)
# Concatenate strings
f <- paste(feats, collapse=' + ')
y_f <- paste(y_feats, collapse=' + ')
f <- paste(y_f, '~', f)
# Convert to formula
f <- as.formula(f)
model_h5 <- neuralnet(f, df_comb_training, stepmax = 1e+08, hidden = 5)

Related

Expand for-loop to accommodate list in R?

I've recently been interested in trying to develop a for-loop that would be able to run multiple generalized additive models and then produce results in a table that ranks them based on AIC, p-value of each smooth in the model, deviance explained of the overall model, etc.
I found this related question in stack overflow which is basically what I want and was able to run this well for gam() instead of gamm(), however I want to expand this to include multiple independent variables in the model, not just 1.
Ideally, the models would run all possible combinations of independent variables against the dependent variable, and it would test combinations anywhere from 1 independent variable in the model, up to all of the possible covariates in "d_pred" in the model.
I have attempted to do this so far by starting out small and finding all possible combinations of 2 independent variables (df_combinations2), which results in a list of data frames. Then I adjusted the rest of the code to run the for loop such that each iteration will run a different combination of the two variables:
library(mgcv)
## Example data
set.seed(0)
dat <- gamSim(1,n=200,scale=2)
set.seed(1)
dat2 <- gamSim(1,n=200,scale=2)
names(dat2)[1:5] <- c("y1", paste0("x", 4:7))
d <- cbind(dat[, 1:5], dat2[, 1:5])
d_resp <- d[ c("y", "y1")]
d_pred <- d[, !(colnames(d) %in% c("y", "y1"))]
df_combinations2 <- lapply(1:(ncol(combn(1:ncol(d_pred), m = 2))),
function(y) d_pred[, combn(1:ncol(d_pred), m = 2)[,y]])
## create a "matrix" list of dimensions i x j
results_m2 <-lapply(1:length(df_combinations2), matrix, data= NA, nrow=ncol(d_resp), ncol=2)
## for-loop
for(k in 1:length(df_combinations2)){
for(i in 1:ncol(d_resp)){
for(j in 1:ncol(df_combinations2[[k]])){
results_m2[i, j][[1]] <- gam(d_resp[, i] ~ s(df_combinations2[[k]][,1])+s(df_combinations2[[k]][,2]))
}
}}
However, after running the for-loop I get the error "Error in all.vars1(gp$fake.formula[-2]) : can't handle [[ in formula".
Anyone know why I am getting this error/ how to fix it?
Any insight is much appreciated. Thanks!
Personally, I would create a data.table() containing all combinations of target variables and combinations of predictors and loop through all rows. See below.
library(data.table)
library(dplyr)
# Example data
set.seed(0)
dat <- gamSim(1,n=200,scale=2)
set.seed(1)
dat2 <- gamSim(1,n=200,scale=2)
names(dat2)[1:5] <- c("y1", paste0("x", 4:7))
d <- cbind(dat[, 1:5], dat2[, 1:5])
#select names of targets and predictors
targets <- c("y", "y1")
predictors <- colnames(d)[!colnames(d) %in% targets]
#create all combinations of predictors
predictor_combinations <- lapply(1:length(predictors), FUN = function(x){
#create combination
combination <- combn(predictors, m = x) |> as.data.table()
#add s() to all for gam
combination <- sapply(combination, FUN = function(y) paste0("s(", y, ")")) |> as.data.table()
#collapse
combination <- summarize_all(combination, .funs = paste0, collapse = "+")
#unlist
combination <- unlist(combination)
#remove names
names(combination) <- NULL
#return
return(combination)
})
#merge combinations of predictors as vector
predictor_combinations <- do.call(c, predictor_combinations)
#create folder to save results to
if(!dir.exists("dev")){
dir.create("dev")
}
if(!dir.exists("dev/models")){
dir.create("dev/models")
}
#create and save hypergrid (all combinations of targets and predictors combinations)
if(!file.exists("dev/hypergrid.csv")){
#create hypergrid and save to dev
hypergrid <- expand.grid(target = targets, predictors = predictor_combinations) |> as.data.table()
#add identifier
hypergrid[, model := paste0("model", 1:nrow(hypergrid))]
#save to dev
fwrite(hypergrid, file = "dev/hypergrid.csv")
} else{
#if file exists read
hypergrid <- fread("dev/hypergrid.csv")
}
#loop through hypergrid, create GAM models
#progressbar
pb <- txtProgressBar(min = 1, max = nrow(hypergrid), style = 3)
for(i in 1:nrow(hypergrid)){
#update progressbar
setTxtProgressBar(pb, i)
#select target
target <- hypergrid[i,]$target
#select predictors
predictors <- hypergrid[i,]$predictors
#create formula
gam.formula <- as.formula(paste0(target, "~", predictors))
#run gam
gam.model <- gam(gam.formula, data = d)
#save gam model do dev/model
saveRDS(gam.model, file = paste0("dev/models/", hypergrid[i,]$model, ".RDS"))
}
#example where you extract model performances
for(i in 1:nrow(hypergrid)){
#read the right model
rel.model <- readRDS(paste0("dev/models/", hypergrid[i,]$model, ".RDS"))
#extract model performance, add to hypergrid
hypergrid[i, R2 := summary(rel.model)[["r.sq"]]]
}
#arrange hypergrid on target and r2
hypergrid <- dplyr::arrange(hypergrid, hypergrid$target, desc(hypergrid$R2))
Which would give
head(hypergrid)
target predictors model R2
1: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5) model319 0.6957242
2: y s(x0)+s(x1)+s(x2)+s(x3)+s(x4)+s(x5) model423 0.6953753
3: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5)+s(x7) model437 0.6942054
4: y s(x0)+s(x1)+s(x2)+s(x5) model175 0.6941025
5: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5)+s(x6) model435 0.6940569
6: y s(x0)+s(x1)+s(x2)+s(x3)+s(x4)+s(x5)+s(x7) model481 0.6939756
All models are saved to a folder with an identifier (for if you want to use the model or extract more information from the model).
Notably, p-hacking comes to mind using this appraoch and I would be careful by conducting your analysis like this.

Same R code not working with 2 different .csv files but same data structure

Hi I have 2 pieces of code, they are doing the same, one is using a dataset that comes with R, the other a .csv file that I have created, nothing on the code has changed other than more columns on the .csv file with different names, but for some reason I am getting a wrong output
the csv file for this code is located here
https://github.com/juandavidlozano/Data_1/blob/main/high_northell.csv
this is the first code
library(plyr)
library(readr)
library(dplyr)
library(caret)
library(ggplot2)
library(repr)
dat <- economics
drops <- c("date")
dat <- dat[ , !(names(dat) %in% drops)]
cols = c('pce', 'pop', 'psavert', 'uempmed')
pre_proc_val <- preProcess(dat[,cols], method = c("center", "scale"))
dat[,cols] = predict(pre_proc_val, dat[,cols])
###### Linear Regression
number_days = 10
dat <- as.data.frame(dat)
new_cols <- c('Intercept', paste0(cols, '_predict'))
dat[new_cols] <- NA
inds <- nrow(dat) - number_days
dat[(number_days+1):nrow(dat), new_cols] <- do.call(rbind, lapply(seq(inds), function(x) {
lr = lm(unemploy ~ uempmed + psavert + pop + pce, data = dat[x:(x + number_days - 1), ])
t(lr$coefficients)
}))
the dat dataframe looks like this
the first 10 rows have some NA's that is because of the variable number_days it leaves the first X rows empty and from there it fills it with the coefficients of a linear regression model for the last X days of data and so on.
this dataframe is correct, all the column have values.
the second code is this one
library(plyr)
library(readr)
library(dplyr)
library(caret)
library(ggplot2)
library(repr)
dat<-read.csv("high_northell.csv", header = TRUE, stringsAsFactors=FALSE)
drops <- c("date")
dat <- dat[ , !(names(dat) %in% drops)]
cols = c("state_covid_death","kantar_state_tv_daily","VIX", "interest_urgent_care","CPI","SPY",
"kantar_state_digital_daily", "Flu_indicator","covid_cases",
"Search.Cost","Display.Cost")
pre_proc_val <- preProcess(dat[,cols], method = c("center", "scale"))
dat[,cols] = predict(pre_proc_val, dat[,cols])
###### Linear Regression
number_days = 10
dat <- as.data.frame(dat)
new_cols <- c('Intercept', paste0(cols, '_predict'))
dat[new_cols] <- NA
inds <- nrow(dat) - number_days
dat[(number_days+1):nrow(dat), new_cols] <- do.call(rbind, lapply(seq(inds), function(x) {
lr = lm(Total.Visits~ state_covid_death + kantar_state_tv_daily + VIX+ interest_urgent_care+ CPI+ SPY +kantar_state_digital_daily+ Flu_indicator + covid_cases+ Search.Cost+ Display.Cost, data = dat[x:(x + number_days - 1), ])
t(lr$coefficients)
}))
As you can see is the same code but the dat dataframe in this case for some reason some columns are filled with NA's and some are filled NA's and data here and there, this data frame should look like the one at top, all columns should be filled except for the first X rows.
here is a pic data dataframe for this second code
Any help on what is causing this issue?
The linear regression fails to define some variables due to singularities.
For a given 10 day subset those variables are constant across all days, thus those variables are perfectly multicollinear and the X'X matrix is singular.

speed up replication of rows using model

I would like to create replicate predictions for one integer independent variable (iv1) given some model and a data frame called training. This is my current approach. I appreciate this is not self containing but hopefully it is self explanatory:
number_of_samples <- 10
results <- NULL
for (row in 1:nrow(training)) {
fake_iv1_values <- sample(1:100, number_of_samples)
case <- training[row,]
for (iv1 in fake_iv1_values) {
case$iv1 <- iv1
case$prediction <- predict(some_model, newdata = case)
results <- rbind(results, case)
}
}
Using loops is very slow. I wonder, if this could be sped up? Thanks!
Try with this.
Reproducible fake data and model:
# create fake data
n_row <- 100
n_xs <- 100
training <- data.frame(y = rnorm(n_row), iv1 = rnorm(n_row))
training[, paste0("x",1:n_xs)] <- replicate(n_xs, list(rnorm(n_row)))
# example model
some_model <- lm(y~., training)
Rewritten code:
number_of_samples <- 10
results <- NULL
# vector of several fake_iv1_values vectors
fake_iv1_values <- as.numeric(replicate(nrow(training), sample(1:100, number_of_samples)))
# replicate each row of the original dataframe
results <- training[rep(seq_len(nrow(training)), each = number_of_samples), ]
# add fake values to the replicated dataframe
results$iv1 <- fake_iv1_values
# get predictions
results$prediction <- predict(some_model, newdata = results)

The for loop is working but without any good

I am using R to do a ML project, I have prepared the dataset and split the data into 10 equal splits but the problem is I need to fit the model 10 times manually (10-fold CV). I have tried to create train and test data using a for loop but each time it runs, train is the whole dataset and test is null. Can someone help me, please?
# Preparing the data
data <- read.csv("./project.csv")
id <- seq(1:103342)
data[, 'id'] <- id
for (i in 3:8) {
data[,i] <- as.factor(data[,i])
}
# splitting the data into 10 equal data frames
f <- rep(seq(1, 10), each=round(103342/10), length.out=103342)
df <- split(data, f)
lapply(df, dim)
# running 10-fold cross-validation and computing error rate and AUC for each run.
results <- matrix(nrow=10, ncol=2, dimnames= list(c(), c('error_rate', 'auc')))
for (i in 1:10) {
train <- data[!(data$id %in% df$`i`$id),]
test <- df$`i`
print(dim(test)) # Here is my problem the print statement will print null 10 times
glm.fit <- glm(canceled ~ ., data=train, family=binomial)
glm.prob <- predict(glm.fit, newdata=test, type="response")
...
}

Optimizing using R

The data is:
name <- c("Gen1","Gen2","Gen3")
QuantityE <- c(200,100,50)
PriceE <- c(10,12,50)
QuantityAS <- c(100,50,10)
PriceAS <- c(1,5,7)
mydata <- data.frame(name, QuantityE, PriceE , QuantityAS,PriceAS )
I have the following objective function:
Minimize total cost when multiplying combinations of
((PriceE*QuantityE) + (PriceAS* QuantityAS))
Subject to constraints:
Total QuantityE = 300
Total QuantityAS = 0.06* QuantityE
What is the best approach to use, or what I can read up to solve the problem?
For completeness, after some reading, found the right way to code the LP.
There is neater ways of doing it, but this works for me.
name <- c("Gen1","Gen2","Gen3")
QuantityE <- c(200,100,50)
PriceE <- c(10,12,50)
QuantityAS <- c(100,50,10)
PriceAS <- c(1,5,7)
mydata <- data.frame(name, QuantityE, PriceE , QuantityAS,PriceAS )
#System Data
EnergyDemand <- 300 #Total QuantityE
CRRequired <- 0.06*EnergyDemand #Total Quantity AS
library(lpSolve)
#Set up Objective function, prices will be the co-ef's
obj.fun <- as.vector(stack(mydata[,c(3,5)])[1])
##Set up the constraints matrix
#This will set up individual quantityE and quantityAS coef's
D <- diag(1, NROW(obj.fun),NROW(obj.fun))
#This sets up coefficients with the ability to combine QuantityAS and QuantityE
E <- diag(1, NROW(name),NROW(name))
FA <- cbind(E,E)
#This sets up the cofficients for all quantityE
G <- matrix(c(rep(1,NROW(name)),rep(0,NROW(name))),1)
#This sets up the cofficients for all quantityAS
H <- matrix(c(rep(0,NROW(name)),rep(1,NROW(name))),1)
#This combines the above constraints into one matrix
constr <- rbind(D,FA,G,H)
#Set up directional constraints. All except the last 2 are <=
#This allows flexibility in choosing volumes
# The last two have to be equal to for Energy and AS demand
constr.dir <- c(rep("<=",NROW(constr)-2), rep("=",2))
#This sets up the rhs numbers for the matrix above
rhs <- c(QuantityE, QuantityAS, pmax(QuantityE, QuantityAS), EnergyDemand,CRRequired)
#This is the algorithm parameters
prod.sol <- lp("min", obj.fun, constr, constr.dir, rhs, compute.sens = TRUE)
a <- matrix(prod.sol$solution, nrow= length(name)) #decision variables values
rownames(a) <- name
colnames(a) <- c("Energy MW", "AS MW")
#This is the Summary of results
print(mydata) #This gives the initial dataset
a # This gives the combination of quantity used from Gen's
prod.sol #This gives the optimal minimized cost

Resources