Descriptive statistics of the NHANES 2003-2004 data in R

I am trying to reproduce the original missing-data percentages (n = 9643) from Table 5 of the article "A robust imputation method for missing responses and covariates in sample selection models". I downloaded the NHANES 2003-2004 data and wrote a script to read them. I was able to faithfully reproduce the results for all variables except the income variable. I've read the article several times and researched a lot, but I can't see where I'm going wrong. Does anyone know how to obtain the 24.41% missing-data value for the income variable? Below is my code!
rm(list = ls())
cat("\014")
library("tidyverse")
library(Hmisc)
mydata <- sasxport.get("https://raw.githack.com/maf335/stack/master/DEMO_C.XPT")
attach(mydata)
newdata <- mydata %>% select(seqn,ridageyr, riagendr, dmdeduc, ridreth1, indhhinc)
names(newdata) <- c("id","age","gender", "educ", "race", "income")
attach(newdata)
##################
mydata2 <- sasxport.get("https://raw.githack.com/maf335/stack/master/BMX_C.XPT")
attach(mydata2)
newdata2 <- mydata2 %>% select(seqn,bmxbmi)
names(newdata2) <- c("id","bmi")
attach(newdata2)
##############
mydata3 <- sasxport.get("https://raw.githack.com/maf335/stack/master/BPX_C.XPT")
attach(mydata3)
newdata3 <- mydata3 %>% select(seqn, bpxsy1)
names(newdata3) <- c("id", "sbp")
attach(newdata3)
#################
dt <- merge(newdata, newdata2, by="id")
data <- merge(dt, newdata3, by= "id")
attach(data)
####################
perc <- function(x, data){
  # summary(x)[[7]] is the "NA's" entry of summary() when x contains missing values
  nna <- ifelse(sum(is.na(x)) != 0, summary(x)[[7]], "x has no missing data")
  perc <- ifelse(sum(is.na(x)) != 0, (nna/length(data$id))*100, "x has no missing data")
  #perc <- (nna/length(data$id))*100
  return(perc)
}
perc(sbp,data)
perc(age,data)
perc(gender,data)
perc(bmi,data)
perc(educ,data)
perc(race,data)
perc(income,data)
hist(data$income, prob= TRUE, breaks = seq(1, 99, 0.5), xlim = c(1,10), ylim = c(0,0.35), main = "Histogram of Income", xlab = "Category")
The article "Subsample ignorable likelihood for regression
analysis with missing data" also presents, in table 1, the income variable with high value of missing data. Even considering a smaller number of observations (n = 9041).
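As a side note (not the published figure itself): the percentage of missing values can be computed directly with is.na(), without indexing summary() by position. Below is a minimal sketch, assuming the merged data frame data built above; the recode hint at the end is only an assumption about how the article may have counted special response codes, not something verified against the paper.
# Percentage of NA values per variable in the merged data frame
sapply(data[, c("age", "gender", "educ", "race", "income", "bmi", "sbp")],
       function(x) round(mean(is.na(x)) * 100, 2))
# If the paper also counts special response codes (e.g. "Refused"/"Don't know")
# as missing -- an unverified assumption -- recode them to NA first and recompute:
# data$income[data$income %in% c(77, 99)] <- NA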

Related

How to avoid spread error in two-way repeated measures ANOVA in R?

I am trying to run a two-way repeated measures ANOVA using the rstatix package; however, I keep getting the following error message, which I don't know how to interpret, although I suspect it has something to do with dat$id.
Error in `spread()`:
! Each row of output must be identified by a unique combination of keys.
Keys are shared for 72 rows:
The data I am using has two locations with three measurements per Location for each Date. Any idea how to avoid this error?
Example Data
library(dplyr)
set.seed(321)
dat <- data.frame(matrix(ncol = 3, nrow = 72))
colnames(dat)[1:3] <- c("Date","Location","Value")
dat$Value <- round(rnorm(72, 100,50),0)
dat$Location <- rep(c("Location 1","Location 2"), each = 36)
st <- as.Date("2020-01-01")
en <- as.Date("2020-12-31")
dat$Date <- rep(seq.Date(st,en,by = '1 month'),3)
dat <- dat %>% mutate(id = dense_rank(Date))
dat$Date <- as.factor(dat$Date)
View(dat)
Two-way repeated measures ANOVA that throws the error
library(rstatix)
resaov <- anova_test(
  data = dat, dv = Value, wid = id,
  within = c(Location, Date))
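One hedged reading of the error: with id = dense_rank(Date), every id/Location/Date cell contains three replicate rows, so the keys collide. A minimal sketch follows, under the assumption that the three measurements within each Location x Date cell are the repeated units you want to treat as subjects.
library(dplyr)
library(rstatix)
# Give each replicate within a Location x Date cell its own subject id,
# so every id has exactly one Value per within-cell combination.
dat_fixed <- dat %>%
  group_by(Location, Date) %>%
  mutate(id = row_number()) %>%   # replicate 1, 2, 3 within each cell
  ungroup()
resaov <- anova_test(
  data = dat_fixed, dv = Value, wid = id,
  within = c(Location, Date))
If the replicates are not meant to be separate subjects, averaging them within each cell before calling anova_test() is another option.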

randomForest error shown NA not permitted in predictors

Can I get some help and suggestions? I am trying to run randomForest on a classification problem with currency data, but I get an error saying "NA not permitted in predictors". I have tried to solve it myself but still cannot figure it out.
library(priceR)
library(tidyverse)
library(quantmod)
library(dplyr)
Get the data
a <- historical_exchange_rates("THB", to = "USD",start_date = "2010-01-01", end_date = "2021-12-31")
Set up input indicators
a.avg10 <- rollapply(a[,2],10,mean)
a.avg20 <- rollapply(a[,2],20,mean)
a.std10 <- rollapply(a[,2],20,sd)
a.std20 <- rollapply(a[,2],20,sd)
a.rsi5 <- na.omit(RSI(a[,2],5,"SMA"))
a.rsi14 <- na.omit(RSI(a[,2],14,"SMA"))
a.macd12269 <- na.omit(MACD(a[,2],12,26,9,"SMA"))
a.macd7205 <- na.omit(MACD(a[,2],7,20,5,"SMA"))
a.bbands <- na.omit(BBands(a[,2],20,"SMA",2))
Create variable direction
a.direction <- a %>% mutate(direction = ifelse(one_THB_equivalent_to_x_USD - lag(one_THB_equivalent_to_x_USD, 10) <= 0, 0, 1))
Combining variables
a.data <- cbind(a[1:4350,2],a.avg10[1:4350],a.avg20[1:4350],a.bbands[1:4350,1:4],a.std10[1:4350],a.std20[1:4350],a.rsi5[1:4350],a.rsi14[1:4350],a.macd12269[1:4350,1:2],a.macd7205[1:4350,1:2],a.direction[1:4350,3])
Train and test
a.split <- sample(c(rep(0,0.7*nrow(a.data)),rep(1,0.3*nrow(a.data))))
Building in-sample and out-sample datasets
isa.data <- a.data[a.split == 0,]
osa.data <- a.data[a.split == 1,]
Standardizing the dataset of in-sample and out-sample
ismea.data <- sapply(isa.data,mean,2)
issta.data <- apply(isa.data,2,sd)
isida.data <- matrix (1,dim(isa.data)[1],dim(isa.data)[2])
osmea.data <- sapply(osa.data,mean,2)
ossta.data <- apply(osa.data,2,sd)
osida.data <- matrix (1,dim(osa.data)[1],dim(osa.data)[2])
Normalizing the data
norma.data <- (isa.data - t(ismea.data*t(isida.data)))/t(issta.data*t(isida.data))
normosa.data <- (osa.data - t(osmea.data*t(osida.data)))/t(ossta.data*t(osida.data))
Replacing last column with variable direction
a.dm <- dim(isa.data)
norma.data[,a.dm[2]] <- a.direction[1:3045,3]
normosa.data[,a.dm[2]] <- a.direction[3046:4350,3]
Combine as dataframe
isnorma.data <- as.data.frame(norma.data)
osnorma.data <- as.data.frame(normosa.data)
colnames(isnorma.data) <- c("exchage rate", "avg10", "avg20", "down", "mavg", "up", "pctB", "std10", "std20", "rsi5", "rsi14", "macd12269", "signal12269", "macd7205", "signal7205", "Direction")
colnames(osnorma.data) <- c("exchage rate", "avg10", "avg20", "down", "mavg", "up", "pctB", "std10", "std20", "rsi5", "rsi14", "macd12269", "signal12269", "macd7205", "signal7205", "Direction")
Modelling with random forest
rfisnorma.data <- isnorma.data %>% select(-Direction)
rfosnorma.data <- osnorma.data %>% select(-Direction)
Labeling train and test data with direction
a.lagret <- (a[,2] - lag(a[,2],10))/ lag(a[,2],10)
rfa.direction <- NULL
rfa.direction[a.lagret > 0.02] <- "Up"
rfa.direction[a.lagret < -0.02] <- "Down"
rfa.direction[a.lagret < 0.02 & a.lagret > -0.02] <- "Nowhere"
isdira.data <- rfa.direction[1:3045]
osdira.data <- rfa.direction[3046:4350]
Convert labeled data into factors as only accepted by randomForest
isdira.data <- na.omit(as.factor(isdira.data))
osdira.data <- na.omit(as.factor(osdira.data))
Modelling data with input parameters
rfmodela.data <- randomForest(rfisnorma.data[11:3045,1:15], y=as.factor(isdira.data), xtest=rfosnorma.data, ytest=as.factor(osdira.data), ntree=500, importance=TRUE)
This is the step where I get the error "NA not permitted in predictors".
You have missing data somewhere between rows 2840 and 2850. If you replace the last line of code with the lines I show below, it should run. You can use the arguments xtest=xtest[index,], ytest=y[index], but I am not sure you want them, since the test data would then be the same as the training data. Please check the documentation to make sure that you are doing the right thing.
tempdata<-xtest<-rfisnorma.data[11:3045,1:15]
y<-as.factor(as.character(isdira.data))
index<-c(1:2840,2850:nrow(tempdata))
rfmodela.data <- randomForest(tempdata[index,], y=y[index], ntree=500, importance=TRUE)
summary(rfmodela.data)
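If you would rather locate the offending rows than hard-code the 2840:2850 gap, here is a small sketch under the same assumptions as the answer above (i.e. that isdira.data lines up with the rows of rfisnorma.data[11:3045, ]):
library(randomForest)
xtest <- rfisnorma.data[11:3045, 1:15]
y     <- as.factor(as.character(isdira.data))
bad_rows <- which(!complete.cases(xtest))   # rows with at least one NA
bad_rows                                    # inspect where the NAs are
colSums(is.na(xtest))                       # and which columns they come from
keep <- setdiff(seq_len(nrow(xtest)), bad_rows)
rfmodela.data <- randomForest(xtest[keep, ], y = y[keep],
                              ntree = 500, importance = TRUE)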

'factors with the same levels' in Confusion Matrix

I'm trying to build a decision tree, but this error comes up when I create the confusion matrix in the last line:
Error : `data` and `reference` should be factors with the same levels
Here's my code:
library(rpart)
library(caret)
library(dplyr)
library(rpart.plot)
library(xlsx)
library(caTools)
library(data.tree)
library(e1071)
#Loading the Excel File
library(readxl)
FINALDATA <- read_excel("Desktop/FINALDATA.xlsm")
View(FINALDATA)
df <- FINALDATA
View(df)
#Selecting the meaningful columns for prediction
#df <- select(df, City, df$`Customer type`, Gender, Quantity, Total, Date, Time, Payment, Rating)
df <- select(df, City, `Customer type`, Gender, Quantity, Total, Date, Time, Payment, Rating)
#making sure the data is in the right format
df <- mutate(df, City= as.character(City), `Customer type`= as.character(`Customer type`), Gender= as.character(Gender), Quantity= as.numeric(Quantity), Total= as.numeric(Total), Time= as.numeric(Time), Payment = as.character(Payment), Rating= as.numeric(Rating))
#Splitting into training and testing data
set.seed(123)
sample = sample.split('Customer type', SplitRatio = .70)
train = subset(df, sample==TRUE)
test = subset(df, sample == FALSE)
#Training the Decision Tree Classifier
tree <- rpart(df$`Customer type` ~., data = train)
#Predictions
tree.customertype.predicted <- predict(tree, test, type= 'class')
#confusion Matrix for evaluating the model
confusionMatrix(tree.customertype.predicted, test$`Customer type`)
So I've tried to do this as said in another topic:
confusionMatrix(table(tree.customertype.predicted, test$`Customer type`))
But I still have an error:
Error in !all.equal(nrow(data), ncol(data)) : argument type is invalid
I made a toy data set and examined your code. There were a couple of issues:
R has an easier time with variable names that follow a certain style. Your 'Customer type' variable has a space in it. In general, coding is easier when you avoid spaces, so I renamed it 'Customer_type'. For your data.frame you could simply go into the source file, or use names(df) <- gsub("Customer type", "Customer_type", names(df)).
I coded 'Customer_type' as a factor. For you this will look like df$Customer_type <- factor(df$Customer_type)
The documentation for sample.split() says the first argument 'Y' should be a vector of labels. But in your code you gave the variable name. The labels are the names of the levels of the factor. In my example these levels are High, Med and Low. To see the levels of your variable you could use levels(df$Customer_type). Input these to sample.split() as a character vector.
Adjust the rpart() call as shown below.
With these adjustments, your code might be OK.
# toy data
df <- data.frame(
  City = factor(sample(c("Paris", "Tokyo", "Miami"), 100, replace = TRUE)),
  Customer_type = factor(sample(c("High", "Med", "Low"), 100, replace = TRUE)),
  Gender = factor(sample(c("Female", "Male"), 100, replace = TRUE)),
  Quantity = sample(1:10, 100, replace = TRUE),
  Total = sample(1:10, 100, replace = TRUE),
  Date = sample(seq(as.Date('2020/01/01'), as.Date('2020/12/31'), by = "day"), 100),
  Rating = factor(sample(1:5, 100, replace = TRUE)))
library(rpart)
library(caret)
library(dplyr)
library(caTools)
library(data.tree)
library(e1071)
#Splitting into training and testing data
set.seed(123)
sample = sample.split(levels(df$Customer_type), SplitRatio = .70) # ADJUST YOUR CODE TO MATCH YOUR FACTOR LABEL NAMES
train = subset(df, sample==TRUE)
test = subset(df, sample == FALSE)
#Training the Decision Tree Classifier
tree <- rpart(Customer_type ~., data = train) # ADJUST YOUR CODE SO IT'S LIKE THIS
#Predictions
tree.customertype.predicted <- predict(tree, test, type= 'class')
#confusion Matrix for evaluating the model
confusionMatrix(tree.customertype.predicted, test$Customer_type)
Try to keep the factor levels of train and test the same as those of df.
train$`Customer type` <- factor(train$`Customer type`, unique(df$`Customer type`))
test$`Customer type` <- factor(test$`Customer type`, unique(df$`Customer type`))
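More generally, confusionMatrix() requires the predictions and the reference to be factors with an identical set of levels. A small sketch of enforcing that explicitly, using the renamed Customer_type from the toy example above:
library(caret)
# Force both vectors onto the same level set before building the matrix
pred <- factor(tree.customertype.predicted, levels = levels(df$Customer_type))
ref  <- factor(test$Customer_type, levels = levels(df$Customer_type))
confusionMatrix(data = pred, reference = ref)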

Why is my model so accurate when using knn(), where k=1?

I am currently using genomic expression levels, age, and smoking intensity to predict the number of days lung cancer patients have to live. I have a small amount of data: 173 patients and 20,438 variables, including the gene expression levels (which account for 20,436 of them). I have split my data into test and training sets using an 80:20 ratio. There are no missing values in the data.
I am using knn() to train the model. Here is what the code looks like:
prediction <- knn(train = trainData, test = testData, cl = trainAnswers, k=1)
Nothing seems out of the ordinary until you notice that k=1. "Why is k=1?" you may ask. The reason is that with k=1 the model is the most accurate, which makes no sense to me. There are quite a few concerns:
I am using knn() to predict a continuous variable. I should probably be using something else, a Cox model maybe.
The model is way too accurate. Here are a few examples of the test answers and the model's predictions. For the first patient, the number of days to death is 274; the model predicts 268. For the second patient, test: 1147, prediction: 1135. Third, test: 354, prediction: 370. Fourth, test: 995, prediction: 995. How is this possible? Over the entire test data, the model was only off by an average of 9.0625 days! The median difference was 7 days, and the mode was 6 days. Here is a graph of the results:
[Bar graph comparing the predictions with the test answers]
So I guess my main question is what does knn() do, what does k represent, and how is the model so accurate when k=1? Here is my entire code (I am unable to attach the actual data):
# install.packages(c('caret', 'skimr', 'RANN', 'randomForest', 'fastAdaboost', 'gbm', 'xgboost', 'caretEnsemble', 'C50', 'earth'))
library(caret)
# Gather the data and store it in variables
LUAD <- read.csv('/Users/username/Documents/ClinicalData.csv')
geneData <- read.csv('/Users/username/Documents/GenomicExpressionLevelData.csv')
geneData <- data.frame(geneData)
row.names(geneData) = geneData$X
geneData <- geneData[2:514]
colNamesGeneData <- gsub(".","-",colnames(geneData),fixed = TRUE)
colnames(geneData) = colNamesGeneData
# Organize the data
# Important columns are 148 (smoking), 123 (OS Month, basically how many days old), and the gene data. And column 2 (barcode).
LUAD = data.frame(LUAD$patient, LUAD$TOBACCO_SMOKING_HISTORY_INDICATOR, LUAD$OS_MONTHS, LUAD$days_to_death)[complete.cases(data.frame(LUAD$patient, LUAD$TOBACCO_SMOKING_HISTORY_INDICATOR, LUAD$OS_MONTHS, LUAD$days_to_death)), ]
rownames(LUAD)=LUAD$LUAD.patient
LUAD <- LUAD[2:4]
# intersect(rownames(LUAD),colnames(geneData))
# ind=which(colnames(geneData)=="TCGA-778-7167-01A-11R-2066-07")
gene_expression=geneData[, rownames(LUAD)]
# Merge the two datasets to use the geneomic expression levels in your model
LUAD <- data.frame(LUAD,t(gene_expression))
LUAD.days_to_death <- LUAD[,3]
LUAD <- LUAD[,c(1:2,4:20438)]
LUAD <- data.frame(LUAD.days_to_death,LUAD)
set.seed(401)
# Number of Rows in the training data (createDataPartition(dataSet, percentForTraining, boolReturnAsList))
trainRowNum <- createDataPartition(LUAD$LUAD.days_to_death, p=0.8, list=FALSE)
# Training/Test Dataset
trainData <- LUAD[trainRowNum, ]
testData <- LUAD[-trainRowNum, ]
x = trainData[, c(2:20438)]
y = trainData$LUAD.days_to_death
v = testData[, c(2:20438)]
w = testData$LUAD.days_to_death
# Imputing missing values into the data
preProcess_missingdata_model <- preProcess(trainData, method='knnImpute')
library(RANN)
if (anyNA(trainData)) {
trainData <- predict(preProcess_missingdata_model, newdata = trainData)
}
anyNA(trainData)
# Normalizing the data
preProcess_range_model <- preProcess(trainData, method='range')
trainData <- predict(preProcess_range_model, newdata = trainData)
trainData$LUAD.days_to_death <- y
apply(trainData[,1:20438], 2, FUN=function(x){c('min'=min(x), 'max'=max(x))})
preProcess_range_model_Test <- preProcess(testData, method='range')
testData <- predict(preProcess_range_model_Test, newdata = testData)
testData$LUAD.days_to_death <- w
apply(testData[,1:20438], 2, FUN=function(v){c('min'=min(v), 'max'=max(v))})
# To uncomment, select the text and press 'command' + 'shift' + 'c'
# set.seed(401)
# options(warn=-1)
# subsets <- c(1:10)
# ctrl <- rfeControl(functions = rfFuncs,
#                    method = "repeatedcv",
#                    repeats = 5,
#                    verbose = TRUE)
# lmProfile <- rfe(x = trainData[1:20437], y = trainAnswers,
#                  sizes = subsets,
#                  rfeControl = ctrl)
# lmProfile
trainAnswers <- trainData[,1]
testAnswers <- testData[,1]
library(class)
prediction <- knn(train = trainData, test = testData, cl = trainAnswers, k=1)
#install.packages("plotly")
library(plotly)
Test_Question_Number <- c(1:32)
prediction2 <- data.frame(prediction[1:32])
prediction2 <- as.numeric(as.vector(prediction2[c(1:32),]))
data <- data.frame(Test_Question_Number, prediction2, testAnswers)
names(data) <- c("Test Question Number","Prediction","Answer")
p <- plot_ly(data, x = ~Test_Question_Number, y = ~prediction2, type = 'bar', name = 'Prediction') %>%
add_trace(y = ~testAnswers, name = 'Answer') %>%
layout(yaxis = list(title = 'Days to Death'), barmode = 'group')
p
merge <- data.frame(prediction2,testAnswers)
difference <- abs((merge[,1])-(merge[,2]))
difference <- sort(difference)
meanDifference <- mean(difference)
medianDifference <- median(difference)
modeDifference <- names(table(difference))[table(difference)==max(table(difference))]
cat("Mean difference:", meanDifference, "\n")
cat("Median difference:", medianDifference, "\n")
cat("Mode difference:", modeDifference,"\n")
Lastly, for clarification purposes, ClinicalData.csv contains the age, days-to-death, and smoking-intensity data; the other .csv file contains the genomic expression data. The code above line 29 doesn't really matter, so you can just skip to the part of the code where it says "set.seed(401)".
Edit: Some samples of the data:
days_to_death OS_MONTHS
121 3.98
NACC1 2001.5708 2363.8063 1419.879
NACC2 58.2948 61.8157 43.4386
NADK 706.868 1053.4424 732.1562
NADSYN1 1628.7634 912.1034 638.6471
NAE1 832.8825 793.3014 689.7123
NAF1 140.3264 165.4858 186.355
NAGA 1523.3441 1524.4619 1858.9074
NAGK 983.6809 899.869 1168.2003
NAGLU 621.3457 510.9453 1172.511
NAGPA 346.9762 257.5654 275.5533
NAGS 460.7732 107.2116 321.9763
NAIF1 217.1219 202.5108 132.3054
NAIP 101.2305 87.8942 77.261
NALCN 13.9628 36.7031 48.0809
NAMPT 3245.6584 1257.8849 5465.6387
Because k = 1 is the most complex kNN model: it has the most flexible decision boundary, so it overfits. It will perform well on the training data but poorly on a holdout set (though not always).
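A small self-contained sketch (simulated data, not the poster's genomic data) of the pattern described above: k = 1 memorizes the training data, while a moderate k often does better on a holdout set.
library(class)
# Simulated example: compare holdout accuracy of knn() for several values of k.
set.seed(1)
n <- 300
x <- matrix(rnorm(n * 5), ncol = 5)
y <- factor(ifelse(x[, 1] + rnorm(n) > 0, "A", "B"))
train_idx <- sample(n, 0.8 * n)
for (k in c(1, 5, 15)) {
  pred <- knn(train = x[train_idx, ], test = x[-train_idx, ],
              cl = y[train_idx], k = k)
  cat("k =", k, " holdout accuracy =", round(mean(pred == y[-train_idx]), 3), "\n")
}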

binning continuous variables by IV value in R

I am building a logistic regression model in R. I want to bin continuous predictors in an optimal way with respect to the target variable. There are two approaches that I know of:
the continuous variables are binned such that their IV (information value) is maximized
maximize the chi-square of the two-way contingency table -- the target has two values, 0 and 1, and the binned continuous variable has the binned buckets
Does anyone know of any functions in R that can perform such binning?
Your help will be greatly appreciated.
For the first point, you could bin using the weight of evidence (WoE) with the package woeBinning, which optimizes the number of bins for the IV.
library(woeBinning)
# get the bin cut points from your dataframe
cutpoints <- woe.binning(dataset, "target_name", "Variable_name")
woe.binning.plot(cutpoints)
# apply the cutpoints to your dataframe
dataset_woe <- woe.binning.deploy(dataset, cutpoints, add.woe.or.dum.var = "woe")
It returns your dataset with two extra columns:
Variable_name.binned, which contains the bin labels
Variable_name.woe.binned, which contains the replacement WoE values that you can then pass into your regression instead of Variable_name
For the second point, on the chi-square criterion, the package discretization seems to handle it, but I haven't tested it; a rough sketch of the general idea is shown below.
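A minimal base-R sketch (simulated data and hypothetical names, not the discretization package itself) of scoring a candidate set of cut points by the chi-square statistic of the two-way table against a 0/1 target:
# Hypothetical sketch: score candidate cut points for a continuous predictor x
# against a binary target by the chi-square statistic of the two-way table.
score_binning <- function(x, target, breaks) {
  bins <- cut(x, breaks = breaks, include.lowest = TRUE)
  unname(chisq.test(table(bins, target))$statistic)
}
# Simulated example: compare quartile vs. decile cut points.
set.seed(1)
x      <- rnorm(1000)
target <- rbinom(1000, 1, plogis(x))
score_binning(x, target, quantile(x, probs = seq(0, 1, 0.25)))
score_binning(x, target, quantile(x, probs = seq(0, 1, 0.10)))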
The methods used by regression splines to set knot locations might be considered. The rpart package probably has relevant code. You do need to penalize the inferential statistics because this results in an implicit hiding of the degrees of freedom expended in the process of moving the breaks around to get the best fit. Another common method is to specify breaks at equally spaced quantiles (quartiles or quintiles) within the subset with IV=1. Something like this untested code:
cont.var.vec <- # names of all your continuous variables
breaks <- function(x, n) quantile(x,
                                  probs = seq(0, 1, length.out = n),
                                  na.rm = TRUE)
lapply(dfrm[dfrm$IV == 1, cont.var.vec], breaks, n = 5)
setwd("D:")
rm(list=ls())
options (scipen = 999)
read.csv("dummy_data.txt") -> dt
head(dt)
summary(dt)
mydata <- dt
head(mydata)
summary(mydata)
##Capping
for(i in 1:ncol(mydata)){
  if(is.numeric(mydata[,i])){
    val.quant <- unname(quantile(mydata[,i], probs = 0.75))
    mydata[,i] = sapply(mydata[,i], function(x){if(x > (1.5*val.quant+1)){1.5*val.quant+1}else{x}})
  }
}
library(randomForest)
x <- mydata[,!names(mydata) %in% c("Cust_Key","Y")]
y <- as.factor(mydata$Y)
set.seed(21)
fit <- randomForest(x,y,importance=T,ntree = 70)
mydata2 <- mydata[,!names(mydata) %in% c("Cust_Key")]
mydata2$Y <- as.factor(mydata2$Y)
fit$importance
####var reduction#####
vartoremove <- ncol(mydata2) - 20
library(rminer)
#####
for(i in 1:vartoremove){
  rf <- fit(Y~., data = mydata2, model = "randomForest", mtry = 10, ntree = 100)
  varImportance <- Importance(rf, mydata2, method = "sensg")
  Z <- order(varImportance$imp, decreasing = FALSE)
  IND <- Z[2]
  var_to_remove <- names(mydata2[IND])
  mydata2[IND] = NULL
  print(i)
}
###########
library(smbinning)
as.data.frame(mydata2) -> inp
summary(inp)
attach(inp)
rm(result)
str(inp)
inp$target <- as.numeric(inp$Y) *1
table(inp$target)
ftable(inp$Y,inp$target)
inp$target <- inp$target -1
result= smbinning(df=inp, y="target", x="X37", p=0.0005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
summary(inp)
result$ivtable
boxplot(inp$X2~inp$Y,horizontal=T, frame=F, col="red",main="Distribution")
###Sample
require(caTools)
inp$Y <- NULL
sample = sample.split(inp$target, SplitRatio = .7)
train = subset(inp, sample == TRUE)
test = subset(inp, sample == FALSE)
head(train)
nrow(train)
fit1 <- glm(train$target~.,data=train,family = binomial)
summary(rf)
prediction1 <- data.frame(actual = test$target, predicted = predict(fit1,test ,type="response") )
result= smbinning(df=prediction1, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
tail(prediction1)
write.csv(prediction1 , "test_pred_logistic.csv")
predict_train <- data.frame(actual = train$target, predicted = predict(fit1,train ,type="response") )
write.csv(predict_train , "train_pred_logistic.csv")
result= smbinning(df=predict_train, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
####random forest
rf <- fit(target~.,data=train,model = "randomForest", mtry = 10 ,ntree = 200)
prediction2 <- data.frame(actual = test$target, predicted = predict(rf,train))
result= smbinning(df=prediction2, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
###########IV
library(devtools)
install_github("riv","tomasgreif")
library(woe)
##### K-fold Validation ########
library(caret)
cv_fold_count = 2
folds = createFolds(mydata2$Y,cv_fold_count,list=T);
smpl = folds[[i]];
g_train = mydata2[-smpl,!names(mydata2) %in% c("Y")];
g_test = mydata2[smpl,!names(mydata2) %in% c("Y")];
cost_train = mydata2[-smpl,"Y"];
cost_test = mydata2[smpl,"Y"];
rf <- randomForest(g_train,cost_train)
logit.data <- cbind(cost_train,g_train)
logit.fit <- glm(cost_train~.,data=logit.data,family = binomial)
prediction <- data.frame(actual = test$Y, predicted = predict(rf, test))
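The K-fold block above indexes folds[[i]] without an enclosing loop; here is a sketch of one way to wrap it, assuming mydata2 with a factor response Y as earlier in the script:
library(caret)
library(randomForest)
# Loop over the folds that the block above only indexes once.
folds <- createFolds(mydata2$Y, k = cv_fold_count, list = TRUE)
fold_acc <- sapply(folds, function(smpl) {
  g_train    <- mydata2[-smpl, !names(mydata2) %in% "Y"]
  g_test     <- mydata2[smpl,  !names(mydata2) %in% "Y"]
  cost_train <- mydata2[-smpl, "Y"]
  cost_test  <- mydata2[smpl,  "Y"]
  rf <- randomForest(g_train, cost_train)
  mean(predict(rf, g_test) == cost_test)   # holdout accuracy for this fold
})
mean(fold_acc)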
