fitting data to bnlearn model in R

I have a bnlearn model in R, learned with the gs function from 4 categorical variables and 8 numerical variables.
When I try to validate the model against a test set, I get this error when predicting some of the nodes:
Error in check.fit.vs.data(fitted = object, data = data, subset = object[[node]]$parents) :
'Keyword' has different number of levels in the node and in the data.
Is it not possible to use both numerical and categorical variables with bnlearn? And if it is possible, what am I doing wrong?
mydata$A <- as.factor(mydata$A)
mydata$B <- as.numeric(mydata$B)
mydata$C <- as.numeric(mydata$C)
mydata$D <- as.numeric(mydata$D)
mydata$E <- as.factor(mydata$E)
mydata$F <- as.numeric(mydata$F)
mydata$G <- as.numeric(mydata$G)
mydata$H <- as.numeric(mydata$H)
mydata$I <- as.numeric(mydata$I)
mydata$J <- as.numeric(mydata$J)
mydata$K <- as.numeric(mydata$K)
mydata$L <- as.numeric(mydata$L)
mydata$M <- as.numeric(mydata$M)
mydata$N <- as.numeric(mydata$N)
mydata$O <- as.numeric(mydata$O)
mydata$P <- as.numeric(mydata$P)
mydata$Q <- as.numeric(mydata$Q)
#create vectors for the blacklisted arcs
temp1=vector(mode = "character", length = 0)
for (i in 1:length(varnames)){
for (j in 1:length(varnames)){
temp1 <- c(temp1,varnames[i])
}
}
temp2=vector(mode = "character", length = 0)
for (i in 1:length(varnames)){
temp2 <- c(temp2,varnames)
}
#create the whitelisted arcs of the model
arcdata = read.csv("C:/users/asaf/desktop/in progress/whitearcs.csv", header = T)
wfrom=arcdata[,1]
wto=arcdata[,2]
whitelist = data.frame(from = wfrom,to =wto)
#block unwanted arcs
blacklist = data.frame(from = temp1, to = temp2)
#fit and plot the model
#gaussian method
model = gs(mydata, whitelist = whitelist, blacklist = blacklist)
#inference procedure
learntmodel = bn.fit(model,mydata,method = "mle",debug = F)
graphviz.plot(learntmodel)
myvalidation=read.csv("C:/users/asaf/desktop/in progress/val.csv", header = T)
#predict A
pred = predict(learntmodel, node="A", myvalidation)
myvalidation$A <- pred
#predict B
pred = predict(learntmodel, node="B", myvalidation)
myvalidation$B <- pred
At this point it throws the following error:
Error in check.fit.vs.data(fitted = object, data = data, subset = object[[node]]$parents) :
'A' has different number of levels in the node and in the data.

bnlearn can't work with mixed variables (qualitative and quantitative) at the same time; I read it is possible in the deal package.
Another possibility is to use discretize to transform your continuous variables into discrete ones:
dmydata <- discretize(mydata, breaks = 2, method = "interval")
model <- gs(dmydata, whitelist = whitelist, blacklist = blacklist)
... and continue your code.
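One caveat with that route: the validation set then has to share the training set's cut points and factor levels, or predict() will raise the same "different number of levels" error. A minimal sketch of one way to keep them in sync (my illustration; it assumes myvalidation has the same columns as mydata):
library(bnlearn)
# discretize training and validation data together so both get the same
# cut points and factor levels, then split them back apart
ntrain <- nrow(mydata)
dboth  <- discretize(rbind(mydata, myvalidation), breaks = 2, method = "interval")
dmydata       <- dboth[seq_len(ntrain), ]
dmyvalidation <- dboth[-seq_len(ntrain), ]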

Actually I had the same problem today. I resolved it by making sure that the other nodes connected to the one in question (i.e. $A) also had the same number of levels in the node and in the data.
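In practice that usually means re-levelling the validation factors against the training data before calling predict(). A small sketch along those lines (illustrative; it assumes the factor columns are A and E as in the question):
# give the validation factors exactly the levels seen in training;
# values never seen in training become NA instead of extra levels
for (v in c("A", "E")) {
  myvalidation[[v]] <- factor(myvalidation[[v]], levels = levels(mydata[[v]]))
}
pred <- predict(learntmodel, node = "A", myvalidation)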

Related

Use of PCA results as input to XGboost model throwing an error: Feature names stored in `object` and `newdata` are different

I run PCA on my training split and project the test split onto the result, after removing irrelevant columns.
data <- read.csv('bottom10.csv')
set.seed(1)
inTrain <- createDataPartition(data$cuisine, p = .8)[[1]]
dataTrain <- data[,-1][inTrain,][,-1]
dataTest <- data[,-1][-inTrain,][,-1]
cuisine.pca <- prcomp(dataTrain[,-1])
Then I extract the first 500 components and project the test dataset.
traincom <- cuisine.pca$x[,1:500]
testcom <- scale(dataTest[,-1], cuisine.pca$center) %*% cuisine.pca$rotation
Then I convert the labels to integers, and combine the components and labels into xgb.DMatrix form.
label_train <- as.integer(dataTrain$cuisine) - 1
label_test <- as.integer(dataTest$cuisine) - 1
xgb_train <- xgb.DMatrix(data = traincom, label = label_train)
xgb_test <- xgb.DMatrix(data = testcom, label = label_test)
Then I build the xgboost model as
xgb.fit <- xgboost(cuisine~., data = xgb_train, nrounds = 40, num_class = 10, early_stopping_rounds = 5)
And after I run this, there is a warning but the training can still run.
xgboost: label will be ignored
I can predict the training dataset with the model, but when I try to predict the test dataset I get an error.
xgb_pred <- predict(xgb.fit, newdata = xgb_train)
sum(label_train == xgb_pred)/length(label_train)
xgb_pred <- predict(xgb.fit, newdata = xgb_test, rescale = T)
Error in predict.xgb.Booster(xgb.fit, newdata = xgb_test, rescale = T) :
Feature names stored in `object` and `newdata` are different!
Please let me know what I am doing wrong.
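For what it's worth, one way to keep the two projections consistent (an illustration of the feature-name issue, not necessarily the whole fix) is to project the test rows with predict() on the prcomp object and keep the same 500 components, so the column names match what the booster saw during training:
# project the test data with the training PCA's centering and rotation,
# then keep the same 500 components so the feature names match traincom
testcom  <- predict(cuisine.pca, newdata = dataTest[, -1])[, 1:500]
xgb_test <- xgb.DMatrix(data = testcom, label = label_test)
xgb_pred <- predict(xgb.fit, newdata = xgb_test)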

How to cluster standard error in clubSandwich's vcovCR()?

I'm trying to specify a cluster variable after plm using vcovCR() in clubSandwich package for my simulated data (which I use for power simulation), but I get the following error message:
"Error in [.data.frame(eval(mf$data, envir), , index_names) : undefined columns selected"
I'm not sure if this is specific to vcovCR() or something general about R, but could anyone tell me what's wrong with my code? (I saw a related post here How to cluster standard errors of plm at different level rather than id or time?, but it didn't solve my problem).
My code:
N <- 100
id <- 1:N; id <- c(id, id)
gid <- 1:(N/2); gid <- c(gid, gid, gid, gid)
T <- rep(0, N); T <- c(T, T + 1)
a <- qnorm(runif(N), mean = 0, sd = 0.005)
gp <- qnorm(runif(N/2), mean = 0, sd = 0.0005)
u <- qnorm(runif(N*2), mean = 0, sd = 0.05)
a <- c(a, a); gp <- c(gp, gp, gp, gp)
Ylatent <- -0.05*T + a + u
Data <- data.frame(
Y = ifelse(Ylatent > 0, 1, 0),
id = id,gid = gid,T = T
)
library(clubSandwich)
library(plm)
fe.fit <- plm(formula = Y ~ T, data = Data, model = "within", index = "id",effect = "individual", singular.ok = FALSE)
vcovCR(fe.fit,cluster=Data$id,type = "CR2") # doesn't work, but I can run this by not specifying cluster as in the next line
vcovCR(fe.fit,type = "CR2")
vcovCR(fe.fit,cluster=Data$gid,type = "CR2") # I ultimately want to run this
Make your data a pdata.frame first. This is safer, especially if you want the time index to be created automatically (which seems to be the case, looking at your code).
Continuing what you have:
pData <- pdata.frame(Data, index = "id") # time index is created automatically
fe.fit2 <- plm(formula = Y ~ T, data = pData, model = "within", effect = "individual")
vcovCR(fe.fit2, cluster=Data$id,type = "CR2")
vcovCR(fe.fit2, type = "CR2")
vcovCR(fe.fit2,cluster=Data$gid,type = "CR2")
Your example does not work due to a bug in clubSandwich's data extraction function get_index_order (as of version 0.3.3) for plm objects: it assumes both index variables are present in the original data, which is not the case in your example, where the time index is created automatically because only the individual dimension is given in the index argument.
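As a workaround on the original (non-pdata.frame) route, you could also put an explicit time index into Data yourself, so that both index variables really are in the data when vcovCR() goes looking for them. A sketch (the column name year is my own):
# make the time dimension an explicit column instead of letting plm create it
Data$year <- rep(1:2, each = N)   # first N rows are period 1, next N are period 2
fe.fit3 <- plm(Y ~ T, data = Data, model = "within",
               index = c("id", "year"), effect = "individual")
vcovCR(fe.fit3, cluster = Data$gid, type = "CR2")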

Linear Model error or data type error?

The program generates a design matrix with location and treatment, generates some data from a normal distribution, and then tries to fit a linear model predicting yield from treatment and location. The linear model does not work. Why?
trts = paste(0:6)
locs = paste(1:6)
reps = paste(1:4)
plotsize = 4
DF = expand.grid(locs, reps, trts, plotsize, stringsAsFactors = TRUE)
colnames(DF) = c("Location","Replicate","Treatment")
vector = rnorm(1000000, mean=138.2, sd=54.89)
DF$Treatment = as.numeric(DF$Treatment)
DF$Location = as.numeric(DF$Location)
#This approach takes one set of "plotsize" values from "vector" and adds for 5 for each treatment.
DF$Yield = apply(DF, 1, function(x) (5*DF$Treatment) + mean(sample(vector, plotsize)))
DF<-t(DF)
Yield<-DF$Yield
trt=as.factor(DF$Treatment)
loc=as.factor(DF$Location)
summary(fm1 <- aov(Yield ~ loc*trt))
result1<-TukeyHSD(fm1, "trtm", ordered = TRUE)
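For comparison, here is a minimal sketch of a design along those lines that does run end to end (illustrative only; it restates the intent rather than reproducing the original code):
trts <- factor(0:6)
locs <- factor(1:6)
reps <- factor(1:4)
plotsize <- 4
DF <- expand.grid(Location = locs, Replicate = reps, Treatment = trts)
pool <- rnorm(1e6, mean = 138.2, sd = 54.89)
# yield = 5 * treatment number + the mean of `plotsize` values drawn from the pool
DF$Yield <- 5 * as.numeric(as.character(DF$Treatment)) +
  sapply(seq_len(nrow(DF)), function(i) mean(sample(pool, plotsize)))
fm1 <- aov(Yield ~ Location * Treatment, data = DF)
summary(fm1)
TukeyHSD(fm1, "Treatment", ordered = TRUE)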

R: Cross validation on a dataset with factors

Often, I want to run a cross validation on a dataset which contains some factor variables and after running for a while, the cross validation routine fails with the error: factor x has new levels Y.
For example, using package boot:
library(boot)
d <- data.frame(x=c('A', 'A', 'B', 'B', 'C', 'C'), y=c(1, 2, 3, 4, 5, 6))
m <- glm(y ~ x, data=d)
m.cv <- cv.glm(d, m, K=2) # Sometimes succeeds
m.cv <- cv.glm(d, m, K=2)
# Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels) :
# factor x has new levels B
Update: This is a toy example. The same problem occurs with larger datasets as well, where there are several occurrences of level C but none of them is present in the training partition.
The createDataPartition function from the caret package does stratified sampling for the outcome variable and correctly warns:
Also, for ‘createDataPartition’, very small class sizes (<= 3) the classes may not show up in both the training and test data.
There are two solutions which spring to mind:
First, create a subset of the data by selecting one random sample of each factor level, starting from the rarest class (by frequency) and then greedily satisfying the next rarest class, and so on. Then use createDataPartition on the rest of the dataset and merge the results to create a new training dataset which contains all levels.
Second, use createDataPartition and do rejection sampling.
So far, option 2 has worked for me because of the data sizes, but I cannot help thinking that there must be a better solution than a hand-rolled one.
Ideally, I would want a solution which just works for creating partitions and fails early if there is no way to create such partitions.
Is there a fundamental theoretical reason why packages do not offer this? Do they offer it and I just haven't been able to spot them because of a blind spot? Is there a better way of doing this stratified sampling?
Please leave a comment if I should ask this question on stats.stackoverflow.com.
Update:
This is what my hand-rolled solution (2) looks like:
get.cv.idx <- function(train.data, folds, factor.cols = NA) {
    # llply/laply below come from the plyr package
    if (is.na(factor.cols)) {
        all.cols    <- colnames(train.data)
        factor.cols <- all.cols[laply(llply(train.data[1, ], class), function (x) 'factor' %in% x)]
    }
    n        <- nrow(train.data)
    test.n   <- floor(1 / folds * n)
    cond.met <- FALSE
    n.tries  <- 0
    while (!cond.met) {
        n.tries   <- n.tries + 1
        test.idx  <- sample(nrow(train.data), test.n)
        train.idx <- setdiff(1:nrow(train.data), test.idx)
        cond.met  <- TRUE
        for (factor.col in factor.cols) {
            train.levels <- train.data[ train.idx, factor.col ]
            test.levels  <- train.data[ test.idx , factor.col ]
            if (length(unique(train.levels)) < length(unique(test.levels))) {
                cat('Factor level: ', factor.col, ' violated constraint, retrying.\n')
                cond.met <- FALSE
            }
        }
    }
    cat('Done in ', n.tries, ' attempt(s).\n')
    list( train.idx = train.idx
        , test.idx  = test.idx
        )
}
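For illustration, a call against the toy data frame from above might look like this (plyr needs to be loaded if factor.cols is left at its default, since the function uses llply/laply):
idx <- get.cv.idx(d, folds = 2, factor.cols = "x")
d[idx$train.idx, ]   # training rows
d[idx$test.idx, ]    # held-out rows, never with more factor levels than the training rows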
Everyone agrees that there surely is a more elegant solution, but personally I would just retry the cv.glm call inside a while loop until it works.
m.cv<- try(cv.glm(d, m, K=2)) #First try
class(m.cv) #Sometimes error, sometimes list
while ( inherits(m.cv, "try-error") ) {
m.cv<- try(cv.glm(d, m, K=2))
}
class(m.cv) #always list
I've tried it with 100,000 rows in the data.frame and it only takes a few seconds.
library(boot)
n <-100000
d <- data.frame(x=c(rep('A',n), rep('B', n), 'C', 'C'), y=1:(n*2+2))
m <- glm(y ~ x, data=d)
m.cv<- try(cv.glm(d, m, K=2))
class(m.cv) #Sometimes error, sometimes list
while ( inherits(m.cv, "try-error") ) {
m.cv<- try(cv.glm(d, m, K=2))
}
class(m.cv) #always list
When I call traceback I get this:
> traceback()
9: stop(sprintf(ngettext(length(m), "factor %s has new level %s",
"factor %s has new levels %s"), nm, paste(nxl[m], collapse = ", ")),
domain = NA)
8: model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels)
7: model.frame(Terms, newdata, na.action = na.action, xlev = object$xlevels)
6: predict.lm(object, newdata, se.fit, scale = 1, type = ifelse(type ==
"link", "response", type), terms = terms, na.action = na.action)
5: predict.glm(d.glm, data[j.out, , drop = FALSE], type = "response")
4: predict(d.glm, data[j.out, , drop = FALSE], type = "response")
3: mean((y - yhat)^2)
2: cost(glm.y[j.out], predict(d.glm, data[j.out, , drop = FALSE],
type = "response"))
1: cv.glm(d, m, K = 2)
And looking at the cv.glm function gives:
> cv.glm
function (data, glmfit, cost = function(y, yhat) mean((y - yhat)^2),
K = n)
{
call <- match.call()
if (!exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE))
runif(1)
seed <- get(".Random.seed", envir = .GlobalEnv, inherits = FALSE)
n <- nrow(data)
out <- NULL
if ((K > n) || (K <= 1))
stop("'K' outside allowable range")
K.o <- K
K <- round(K)
kvals <- unique(round(n/(1L:floor(n/2))))
temp <- abs(kvals - K)
if (!any(temp == 0))
K <- kvals[temp == min(temp)][1L]
if (K != K.o)
warning(gettextf("'K' has been set to %f", K), domain = NA)
f <- ceiling(n/K)
s <- sample0(rep(1L:K, f), n)
n.s <- table(s)
glm.y <- glmfit$y
cost.0 <- cost(glm.y, fitted(glmfit))
ms <- max(s)
CV <- 0
Call <- glmfit$call
for (i in seq_len(ms)) {
j.out <- seq_len(n)[(s == i)]
j.in <- seq_len(n)[(s != i)]
Call$data <- data[j.in, , drop = FALSE]
d.glm <- eval.parent(Call)
p.alpha <- n.s[i]/n
cost.i <- cost(glm.y[j.out], predict(d.glm, data[j.out,
, drop = FALSE], type = "response"))
CV <- CV + p.alpha * cost.i
cost.0 <- cost.0 - p.alpha * cost(glm.y, predict(d.glm,
data, type = "response"))
}
list(call = call, K = K, delta = as.numeric(c(CV, CV + cost.0)),
seed = seed)
}
It seems the problem has to do with your extremely small sample size and the categorical effect (with values "A", "B", and "C"). You are fitting a glm with two effects: "B:A" and "C:A". In each CV iteration a subsample of the data is used to fit a new model d.glm. Given the size, the resampled training data are guaranteed to come up, in one or more iterations, without the value "C"; the error then comes from calculating fitted probabilities for validation data that contain a level of x ("C") never observed in the training data.
Frank Harrell (often on stats.stackexchange.com) writes in Regression Modeling Strategies that one ought to avoid split-sample validation when the sample size is small and/or some cell counts are small in a categorical data analysis. Singularity (as you are seeing here) is one of many reasons why I think this is true.
Given the small sample size here, you should consider alternatives to split-sample cross-validation, such as a permutation test or a parametric bootstrap. Another important consideration is exactly why you feel model-based inference isn't appropriate. As Tukey said of the bootstrap, he'd like to call it a shotgun: it will blow the head off of any problem, as long as you're willing to reassemble the pieces.
There don't seem to be many simple solutions around the web, so here's one I worked out that should be easy to generalize to as many factors as you need. It uses pre-installed packages and caret, but you could get away with just base R if you really wanted to.
To use cross-validation when you have multiple factors, follow a two-step process: convert the factors to numerics and multiply them together, then use this new variable as the target variable in a stratified sampling function. Be sure to remove it, or keep it out of your training set, after creating your folds.
If y is your DV and x is a factor then:
#Simulated factors (which are conveniently distributed for the example)
dataset <-data.frame(x=as.factor(rep(c(1,10),1000)),y=as.factor(rep(c(1,2,3,4),250)[sample(1000)]))
#Convert the factors to numerics and multiply together in new variable
dataset$cv.variable <-as.numeric(levels(dataset$x))[dataset$x]*as.numeric(levels(dataset$y))[dataset$y]
prop.table(table(dataset$y)) #One way to view distribution of levels
ftable(dataset$x,dataset$y) #A full table of all x and y combinations
folds <- caret::createFolds(dataset$cv.variable,k=10)
testIndexes <- folds[[1]]   # take one fold; in practice loop over all folds
testData <- as.data.frame(dataset[testIndexes, ])
trainData <- as.data.frame(dataset[-testIndexes, ])
prop.table(table(testData$y))
ftable(testData$x,testData$y) #evaluate distribution
which should produce a result that is close to balanced.
Note: In real life, if your sample lacks the requisite unique combinations of factors, then your problem is harder to overcome and might be impossible. You can either drop some levels from consideration before creating folds or employ some kind of over-sampling.
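As a rough sketch of the "drop some levels" option (my own illustration, reusing the cv.variable trick from above):
# keep only rows whose x/y combination occurs at least min.n times,
# so every remaining combination can show up in every fold
min.n <- 10
combo.size <- ave(seq_len(nrow(dataset)), dataset$cv.variable, FUN = length)
dataset.reduced <- dataset[combo.size >= min.n, ]
folds <- caret::createFolds(dataset.reduced$cv.variable, k = 10)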

binning continuous variables by IV value in R

I am building a logistic regression model in R. I want to bin continuous predictors in an optimal way in relation to the target variable. There are two approaches that I know of:
bin the continuous variables such that their IV (information value) is maximized
maximize the chi-square in the two-way contingency table -- the target has two values 0 and 1, and the binned continuous variable has the binned buckets
Does anyone know of any functions in R that can perform such binning?
Your help will be greatly appreciated.
For the first point, you could bin using the weight of evidence (WoE) with the package woeBinning, which optimizes the number of bins for the IV:
library(woeBinning)
# get the bin cut points from your dataframe
cutpoints <- woe.binning(dataset, "target_name", "Variable_name")
woe.binning.plot(cutpoints)
# apply the cutpoints to your dataframe
dataset_woe <- woe.binning.deploy(dataset, cutpoints, add.woe.or.dum.var = "woe")
It returns your dataset with two extra columns:
Variable_name.binned, which contains the bin labels
Variable_name.woe.binned, which contains the replaced WoE values that you can then pass into your regression instead of Variable_name
For the second point, on chi2, the package discretization seems to handle it but I haven't tested it.
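I haven't verified it either, but assuming its ChiMerge implementation chiM() behaves as documented (numeric predictors with the class label in the last column), a call might look like:
library(discretization)
# assumption: chiM() merges intervals by chi-square, with the class in the last column
disc <- chiM(dataset[, c("Variable_name", "target_name")], alpha = 0.05)
disc$cutp             # chi-square based cut points
head(disc$Disc.data)  # the discretized data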
The methods used by regression splines to set knot locations might be considered. The rpart package probably has relevant code. You do need to penalize the inferential statistics, because this approach implicitly hides the degrees of freedom expended in moving the breaks around to get the best fit. Another common method is to specify breaks at equally spaced quantiles (quartiles or quintiles) within the subset with IV = 1. Something like this untested code:
cont.var.vec <- # names of all your continuous variables
breaks <- function(x, n) quantile(x,
                                  probs = seq(0, 1, length.out = n),
                                  na.rm = TRUE)
lapply(dfrm[ dfrm$IV == 1, cont.var.vec ], breaks, n = 5)
setwd("D:")
rm(list=ls())
options (scipen = 999)
read.csv("dummy_data.txt") -> dt
head(dt)
summary(dt)
mydata <- dt
head(mydata)
summary(mydata)
##Capping
for(i in 1:ncol(mydata)){
if(is.numeric(mydata[,i])){
val.quant <- unname(quantile(mydata[,i],probs = 0.75))
mydata[,i] = sapply(mydata[,i],function(x){if(x > (1.5*val.quant+1)){1.5*val.quant+1}else{x}})
}
}
library(randomForest)
x <- mydata[,!names(mydata) %in% c("Cust_Key","Y")]
y <- as.factor(mydata$Y)
set.seed(21)
fit <- randomForest(x,y,importance=T,ntree = 70)
mydata2 <- mydata[,!names(mydata) %in% c("Cust_Key")]
mydata2$Y <- as.factor(mydata2$Y)
fit$importance
####var reduction#####
vartoremove <- ncol(mydata2) - 20
library(rminer)
#####
for(i in 1:vartoremove){
rf <- fit(Y~.,data=mydata2,model = "randomForest", mtry = 10 ,ntree = 100)
varImportance <- Importance(rf,mydata2,method="sensg")
Z <- order(varImportance$imp,decreasing = FALSE)
IND <- Z[2]
var_to_remove <- names(mydata2[IND])
mydata2[IND] = NULL
print(i)
}
###########
library(smbinning)
as.data.frame(mydata2) -> inp
summary(inp)
attach(inp)
rm(result)
str(inp)
inp$target <- as.numeric(inp$Y) *1
table(inp$target)
ftable(inp$Y,inp$target)
inp$target <- inp$target -1
result= smbinning(df=inp, y="target", x="X37", p=0.0005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
summary(inp)
result$ivtable
boxplot(inp$X2~inp$Y,horizontal=T, frame=F, col="red",main="Distribution")
###Sample
require(caTools)
inp$Y <- NULL
sample = sample.split(inp$target, SplitRatio = .7)
train = subset(inp, sample == TRUE)
test = subset(inp, sample == FALSE)
head(train)
nrow(train)
fit1 <- glm(train$target~.,data=train,family = binomial)
summary(rf)
prediction1 <- data.frame(actual = test$target, predicted = predict(fit1,test ,type="response") )
result= smbinning(df=prediction1, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
tail(prediction1)
write.csv(prediction1 , "test_pred_logistic.csv")
predict_train <- data.frame(actual = train$target, predicted = predict(fit1,train ,type="response") )
write.csv(predict_train , "train_pred_logistic.csv")
result= smbinning(df=predict_train, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
####random forest
rf <- fit(target~.,data=train,model = "randomForest", mtry = 10 ,ntree = 200)
prediction2 <- data.frame(actual = test$target, predicted = predict(rf,train))
result= smbinning(df=prediction2, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
###########IV
library(devtools)
install_github("riv","tomasgreif")
library(woe)
##### K-fold Validation ########
library(caret)
cv_fold_count = 2
folds = createFolds(mydata2$Y,cv_fold_count,list=T);
smpl = folds[[i]];
g_train = mydata2[-smpl,!names(mydata2) %in% c("Y")];
g_test = mydata2[smpl,!names(mydata2) %in% c("Y")];
cost_train = mydata2[-smpl,"Y"];
cost_test = mydata2[smpl,"Y"];
rf <- randomForest(g_train,cost_train)
logit.data <- cbind(cost_train,g_train)
logit.fit <- glm(cost_train~.,data=logit.data,family = binomial)
prediction <- data.frame(actual = test$Y, predicted = predict(rf,test))
