randomForest error "NA not permitted in predictors" - R

I could use some help and suggestions: I am trying to run randomForest on a classification problem with currency data, but I get the error "NA not permitted in predictors". I have tried to solve it myself but still cannot figure it out.
library(priceR)
library(tidyverse)
library(quantmod)
library(dplyr)
Get the data
a <- historical_exchange_rates("THB", to = "USD",start_date = "2010-01-01", end_date = "2021-12-31")
Set up input indicators
a.avg10 <- rollapply(a[,2],10,mean)
a.avg20 <- rollapply(a[,2],20,mean)
a.std10 <- rollapply(a[,2],10,sd)
a.std20 <- rollapply(a[,2],20,sd)
a.rsi5 <- na.omit(RSI(a[,2],5,"SMA"))
a.rsi14 <- na.omit(RSI(a[,2],14,"SMA"))
a.macd12269 <- na.omit(MACD(a[,2],12,26,9,"SMA"))
a.macd7205 <- na.omit(MACD(a[,2],7,20,5,"SMA"))
a.bbands <- na.omit(BBands(a[,2],20,"SMA",2))
Create variable direction
a.direction <- a %>% mutate(direction = ifelse(one_THB_equivalent_to_x_USD - lag(one_THB_equivalent_to_x_USD, 10) <= 0, 0, 1))
Combining variables
a.data <- cbind(a[1:4350,2],a.avg10[1:4350],a.avg20[1:4350],a.bbands[1:4350,1:4],a.std10[1:4350],a.std20[1:4350],a.rsi5[1:4350],a.rsi14[1:4350],a.macd12269[1:4350,1:2],a.macd7205[1:4350,1:2],a.direction[1:4350,3])
Train and test
a.split <- sample(c(rep(0,0.7*nrow(a.data)),rep(1,0.3*nrow(a.data))))
Building the in-sample and out-of-sample datasets
isa.data <- a.data[a.split == 0,]
osa.data <- a.data[a.split == 1,]
Standardizing the in-sample and out-of-sample datasets
ismea.data <- apply(isa.data,2,mean)
issta.data <- apply(isa.data,2,sd)
isida.data <- matrix (1,dim(isa.data)[1],dim(isa.data)[2])
osmea.data <- apply(osa.data,2,mean)
ossta.data <- apply(osa.data,2,sd)
osida.data <- matrix (1,dim(osa.data)[1],dim(osa.data)[2])
Normalizing the data
norma.data <- (isa.data - t(ismea.data*t(isida.data)))/t(issta.data*t(isida.data))
normosa.data <- (osa.data - t(osmea.data*t(osida.data)))/t(ossta.data*t(osida.data))
Replacing the last column with the direction variable
a.dm <- dim(isa.data)
norma.data[,a.dm[2]] <- a.direction[1:3045,3]
normosa.data[,a.dm[2]] <- a.direction[3046:4350,3]
Combine as dataframe
isnorma.data <- as.data.frame(norma.data)
osnorma.data <- as.data.frame(normosa.data)
colnames(isnorma.data) <- c("exchange rate", "avg10", "avg20", "down", "mavg", "up", "pctB", "std10", "std20", "rsi5", "rsi14", "macd12269", "signal12269", "macd7205", "signal7205", "Direction")
colnames(osnorma.data) <- c("exchange rate", "avg10", "avg20", "down", "mavg", "up", "pctB", "std10", "std20", "rsi5", "rsi14", "macd12269", "signal12269", "macd7205", "signal7205", "Direction")
Modelling with random forest
rfisnorma.data <- isnorma.data %>% select(-Direction)
rfosnorma.data <- osnorma.data %>% select(-Direction)
Labeling train and test data with direction
a.lagret <- (a[,2] - lag(a[,2],10))/ lag(a[,2],10)
rfa.direction <- NULL
rfa.direction[a.lagret > 0.02] <- "Up"
rfa.direction[a.lagret < -0.02] <- "Down"
rfa.direction[a.lagret < 0.02 & a.lagret > -0.02] <- "Nowhere"
isdira.data <- rfa.direction[1:3045]
osdira.data <- rfa.direction[3046:4350]
Convert the labels into factors, as required by randomForest
isdira.data <- na.omit(as.factor(isdira.data))
osdira.data <- na.omit(as.factor(osdira.data))
Modelling data with input parameters
rfmodela.data <- randomForest(rfisnorma.data[11:3045,1:15], y=as.factor(isdira.data), xtest=rfosnorma.data, ytest=as.factor(osdira.data), ntree=500, importance=TRUE)
This is the step where I get the error "NA not permitted in predictors".

You have missing data somewhere between rows 2840 and 2850. If you replace the last line of code with the lines I show below, it should run. You can add the arguments xtest=xtest[index,], ytest=y[index], but I am not sure you want them, since the test data would be the same as the train data. Please check the documentation to make sure you are doing the right thing.
tempdata<-xtest<-rfisnorma.data[11:3045,1:15]
y<-as.factor(as.character(isdira.data))
index<-c(1:2840,2850:nrow(tempdata))
rfmodela.data <- randomForest(tempdata[index,], y=y[index], ntree=500, importance=TRUE)
summary(rfmodela.data)
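Alternatively (my own untested variant, not part of the original answer), you could locate and drop the incomplete rows automatically with complete.cases() instead of hard-coding the index gap:
xtrain <- rfisnorma.data[11:3045, 1:15]
ytrain <- as.factor(as.character(isdira.data))
keep <- complete.cases(xtrain)   # assumes the rows of xtrain and the elements of ytrain are aligned
rfmodela.data <- randomForest(xtrain[keep, ], y = ytrain[keep], ntree = 500, importance = TRUE)
summary(rfmodela.data)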

Related

I am trying to run XGBoost in R but am facing some issues

I have a dataset of 25 variables and 248 rows.
There are 8 factor variables and the rest are integers and numeric variables.
I am trying to run XGBoost.
I have written the following code:
# Partition Data
set.seed(1234)
ind <- sample(2, nrow(mission), replace = T, prob = c(0.7,0.3))
train <- mission[ind == 1,]
test <- mission[ind == 2,]
# Create matrix - One-Hot Encoding for Factor variables
trainm <- sparse.model.matrix(GRL ~ .-1, data = train)
head(trainm)
train_label <- train[,"GRL"]
train_matrix <- xgb.DMatrix(data = as.matrix(trainm), label = train_label)
testm <- sparse.model.matrix(GRL~.-1, data = test)
test_label <- test[,"GRL"]
test_matrix <- xgb.DMatrix(data = as.matrix(testm),label = test_label)
The response variable here is "GRL", and I am extracting it with test_label <- test[,"GRL"].
The above code executes, but when I try to use it in xgb.DMatrix, I encounter the following error:
Error in setinfo.xgb.DMatrix(dmat, names(p), p[[1]]) :
The length of labels must equal to the number of rows in the input data
I have partitioned the data into 70:30.
test[,"GRL"] returns a data.frame, and XGBoost needs the label to be a vector.
Just use test$GRL or test[["GRL"]] instead. You also need to do the same for the training dataset.
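A minimal sketch of that change, reusing the objects from the question (and assuming GRL is already numeric, e.g. 0/1):
# Extract the label as a plain vector, not a one-column data.frame
train_label <- train$GRL   # or train[["GRL"]]
test_label <- test$GRL     # or test[["GRL"]]
train_matrix <- xgb.DMatrix(data = as.matrix(trainm), label = train_label)
test_matrix <- xgb.DMatrix(data = as.matrix(testm), label = test_label)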

Descriptive statistics of the NHANES 2003-2004 data

I am trying to reproduce the original results (n = 9643) for the missing-data percentages in Table 5 of the article "A robust imputation method for missing responses and covariates in sample selection models". I downloaded the NHANES 2003-2004 data and wrote a script to read them. I was able to faithfully reproduce the results for all variables except the income variable. I have read the article several times and researched a lot, but I cannot see where I am going wrong. Does anyone know how to arrive at the 24.41% missing-data value for the income variable? Below is my code!
rm(list = ls())
cat("\014")
library("tidyverse")
library(Hmisc)
mydata <- sasxport.get("https://raw.githack.com/maf335/stack/master/DEMO_C.XPT")
attach(mydata)
newdata <- mydata %>% select(seqn,ridageyr, riagendr, dmdeduc, ridreth1, indhhinc)
names(newdata) <- c("id","age","gender", "educ", "race", "income")
attach(newdata)
##################
mydata2 <- sasxport.get("https://raw.githack.com/maf335/stack/master/BMX_C.XPT")
attach(mydata2)
newdata2 <- mydata2 %>% select(seqn,bmxbmi)
names(newdata2) <- c("id","bmi")
attach(newdata2)
##############
mydata3 <- sasxport.get("https://raw.githack.com/maf335/stack/master/BPX_C.XPT")
attach(mydata3)
newdata3 <- mydata3 %>% select(seqn, bpxsy1)
names(newdata3) <- c("id", "sbp")
attach(newdata3)
#################
dt <- merge(newdata, newdata2, by="id")
data <- merge(dt, newdata3, by= "id")
attach(data)
####################
perc <- function(x, data){
  nna <- ifelse(sum(is.na(x)) != 0, summary(x)[[7]], "x has no missing data")
  perc <- ifelse(sum(is.na(x)) != 0, (nna/length(data$id))*100, "x has no missing data")
  #perc <- (nna/length(data$id))*100
  return(perc)
}
perc(sbp,data)
perc(age,data)
perc(gender,data)
perc(bmi,data)
perc(educ,data)
perc(race,data)
perc(income,data)
hist(data$income, prob= TRUE, breaks = seq(1, 99, 0.5), xlim = c(1,10), ylim = c(0,0.35), main = "Histogram of Income", xlab = "Category")
The article "Subsample ignorable likelihood for regression
analysis with missing data" also presents, in table 1, the income variable with high value of missing data. Even considering a smaller number of observations (n = 9041).

predict() warning: 'newdata' rows do not match variable rows

I'm a beginner in R.
I tried to build a model using part of the samples and predict the response on the remaining samples. But when I use predict(), I get a warning message:
'newdata' had 152 rows but variables found have 354 rows
I have searched for answers, but I still cannot understand what is going wrong. Please help.
library(MASS)
data(Boston)
n <- nrow(Boston)
n_train <- round(.70*n)
train_set <- sample(n,size=n_train,replace = FALSE)
x <- cbind(Boston$lstat,log(Boston$lstat))
y <- Boston$medv
x_train <- x[train_set,]
y_train <- y[train_set]
x_test <- x[-train_set,]
y_test <- y[-train_set]
lm_temp <- lm(y_train~x_train)
y_test_hat <- predict(lm_temp,newdata=data.frame(x_test))
It looks like R gets confused when you pass a matrix as the independent variables: the predict() function then requires a data frame (which is a list) whose variable names match those used in the fit.
You can solve the problem by running your lm() on a data frame:
library(MASS)
data(Boston)
n <- nrow(Boston)
n_train <- round(.70*n)
train_set <- sample(n,size=n_train,replace = FALSE)
data <- Boston[ , c('medv', 'lstat')]
data$loglstat <- log(data$lstat)
train <- data[train_set, ]
test <- data[-train_set,]
lm_temp <- lm(medv ~ ., data = train)
y_test_hat <- predict(lm_temp,newdata=test)
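As a quick sanity check (my addition, not part of the original answer), you can compare the predictions with the held-out responses, for example via the test-set RMSE:
# Untested: out-of-sample root mean squared error
sqrt(mean((test$medv - y_test_hat)^2))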

pgmm from plm package gives error for summary

I am trying to use the pgmm function from the plm package for R. The regression runs and I can call up the results; however, asking for the summary gives the following error:
Error in t(y) %*% x : non-conformable arguments
I've imported the data from the World Bank using the WDI package:
library(plm) # load package
library(WDI) # Load package
COUNTRIES <- c("AGO","BEN","BWA","BFA","BDI") # Specify countries
INDICATORS <- c("NY.GDP.PCAP.KN", "SP.DYN.TFRT.IN", "SP.DYN.CBRT.IN", "SP.POP.TOTL") # Specify indicators
LONG <- WDI(country=COUNTRIES, indicator=INDICATORS, start=2005, end=2009, extra=FALSE) # Load data
PANEL <- pdata.frame(LONG, c("iso2c","year")) # Transform to PANEL dataframe
PANEL$year <- as.numeric(as.character(PANEL$year)) # Encode year
EQ <- pgmm( log(fertility) ~ log(gdp) + lag(log(fertility), 2) | lag(log(fertility), 2), data=PANEL, effect="twoways", model="twosteps", gmm.inst=~log(fertility) ) # Run regression
Calling the results as follows works.
EQ
But the summary (below) gives the error message mentioned above.
summary(EQ)
I think the error occurs because summary.pgmm tries to run a second-order Arellano-Bond test of serial correlation on your data, but your data only have two time points (2008 and 2009), so it fails.
To fix this problem, you could patch the function so that it checks whether you only have two points in the data set and runs the test only if you have more than two points. I provide a patched function below:
summary.pgmm.patched <- function (object, robust = FALSE, time.dummies = FALSE, ...)
{
  model <- plm:::describe(object, "model")
  effect <- plm:::describe(object, "effect")
  transformation <- plm:::describe(object, "transformation")
  if (robust) {
    vv <- vcovHC(object)
  }
  else {
    vv <- vcov(object)
  }
  if (model == "onestep")
    K <- length(object$coefficients)
  else K <- length(object$coefficients[[2]])
  Kt <- length(object$args$namest)
  if (!time.dummies && effect == "twoways")
    rowsel <- -c((K - Kt + 1):K)
  else rowsel <- 1:K
  std.err <- sqrt(diag(vv))
  b <- coef(object)
  z <- b/std.err
  p <- 2 * pnorm(abs(z), lower.tail = FALSE)
  CoefTable <- cbind(b, std.err, z, p)
  colnames(CoefTable) <- c("Estimate", "Std. Error", "z-value", "Pr(>|z|)")
  object$CoefTable <- CoefTable[rowsel, , drop = FALSE]
  object$sargan <- sargan(object)
  object$m1 <- plm:::mtest(object, 1, vv)
  # The problem line:
  # object$m2 <- mtest(object, 2, vv)
  if (length(object$residuals[[1]]) > 2) object$m2 <- plm:::mtest(object, 2, vv)
  object$wald.coef <- plm:::wald(object, "param", vv)
  if (plm:::describe(object, "effect") == "twoways")
    object$wald.td <- plm:::wald(object, "time", vv)
  class(object) <- "summary.pgmm"
  object
}
You might want to write to the author of the plm package and show him this post. The author will be able to write a less 'hacky' patch.
Using your own (slightly modified) example data, here is how you would use the function:
library(WDI) # Load package
library(plm)
COUNTRIES <- c("AGO","BEN","BWA","BFA","BDI") # Specify countries
INDICATORS <- c("NY.GDP.PCAP.KN", "SP.DYN.TFRT.IN", "SP.DYN.CBRT.IN", "SP.POP.TOTL") # Specify indicators
LONG <- WDI(country=COUNTRIES, indicator=INDICATORS, start=2005, end=2009, extra=FALSE) # Load data
PANEL <- pdata.frame(LONG, c("iso2c","year")) # Transform to PANEL dataframe
PANEL$year <- as.numeric(as.character(PANEL$year)) # Encode year
names(PANEL) [c(4,5)] = c('gdp','fertility')
EQ <- pgmm( log(fertility) ~ log(gdp) + lag(log(fertility), 2) | lag(log(fertility), 2), data=PANEL, effect="twoways", model="twosteps", gmm.inst=~log(fertility) ) # Run regression
summary.pgmm.patched(EQ)
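As an aside (my own untested suggestion, separate from the patch): since the failure comes from having only two usable time points, pulling a longer window from WDI should give the stock summary() enough observations for the second-order test, data availability permitting, e.g.:
LONG <- WDI(country=COUNTRIES, indicator=INDICATORS, start=1990, end=2009, extra=FALSE) # Untested: a longer panel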

binning continuous variables by IV value in R

I am building a logistic regression model in R. I want to bin continuous predictors in an optimal way in relation to the target variable. There are two approaches that I know of:
bin the continuous variables such that their IV (information value) is maximized
maximize the chi-square in the two-way contingency table -- the target has two values, 0 and 1, and the binned continuous variable has the binned buckets
Does anyone know of any functions in R that can perform such binning?
Your help will be greatly appreciated.
For the first point, you could bin using the weight of evidence (WoE) with the package woeBinning, which optimizes the number of bins for the IV:
library(woeBinning)
# get the bin cut points from your dataframe
cutpoints <- woe.binning(dataset, "target_name", "Variable_name")
woe.binning.plot(cutpoints)
# apply the cutpoints to your dataframe
dataset_woe <- woe.binning.deploy(dataset, cutpoints, add.woe.or.dum.var = "woe")
It returns your dataset with two extra columns:
Variable_name.binned, which contains the bin labels
Variable_name.woe.binned, which contains the replacement WoE values that you can then pass into your regression instead of Variable_name
For the second point, on the chi-square criterion, the package discretization seems to handle it, but I haven't tested it.
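For reference, a minimal (untested) sketch with that package's ChiMerge implementation, assuming a data frame dataset with the numeric predictor and the 0/1 target as its last column, might look like this:
library(discretization)
# ChiMerge discretization; the class label must be the last column
dat <- dataset[, c("Variable_name", "target_name")]
disc <- chiM(dat, alpha = 0.05)
disc$cutp            # cut points chosen for Variable_name
head(disc$Disc.data) # the discretized data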
The methods used by regression splines to set knot locations might be considered. The rpart package probably has relevant code. You do need to penalize the inferential statistics because this results in an implicit hiding of the degrees of freedom expended in the process of moving the breaks around to get the best fit. Another common method is to specify breaks at equally spaced quantiles (quartiles or quintiles) within the subset with IV=1. Something like this untested code:
cont.var.vec <- # names of all your continuous variables
breaks <- function(x, n) quantile(x,
                                  probs = seq(0, 1, length.out = n),
                                  na.rm = TRUE)
lapply(dfrm[dfrm$IV == 1, cont.var.vec], breaks, n = 5)
setwd("D:")
rm(list=ls())
options (scipen = 999)
read.csv("dummy_data.txt") -> dt
head(dt)
summary(dt)
mydata <- dt
head(mydata)
summary(mydata)
##Capping
for(i in 1:ncol(mydata)){
  if(is.numeric(mydata[,i])){
    val.quant <- unname(quantile(mydata[,i],probs = 0.75))
    mydata[,i] = sapply(mydata[,i],function(x){if(x > (1.5*val.quant+1)){1.5*val.quant+1}else{x}})
  }
}
library(randomForest)
x <- mydata[,!names(mydata) %in% c("Cust_Key","Y")]
y <- as.factor(mydata$Y)
set.seed(21)
fit <- randomForest(x,y,importance=T,ntree = 70)
mydata2 <- mydata[,!names(mydata) %in% c("Cust_Key")]
mydata2$Y <- as.factor(mydata2$Y)
fit$importance
####var reduction#####
vartoremove <- ncol(mydata2) - 20
library(rminer)
#####
for(i in 1:vartoremove){
  rf <- fit(Y~.,data=mydata2,model = "randomForest", mtry = 10 ,ntree = 100)
  varImportance <- Importance(rf,mydata2,method="sensg")
  Z <- order(varImportance$imp,decreasing = FALSE)
  IND <- Z[2]
  var_to_remove <- names(mydata2[IND])
  mydata2[IND] = NULL
  print(i)
}
###########
library(smbinning)
as.data.frame(mydata2) -> inp
summary(inp)
attach(inp)
rm(result)
str(inp)
inp$target <- as.numeric(inp$Y) *1
table(inp$target)
ftable(inp$Y,inp$target)
inp$target <- inp$target -1
result= smbinning(df=inp, y="target", x="X37", p=0.0005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
summary(inp)
result$ivtable
boxplot(inp$X2~inp$Y,horizontal=T, frame=F, col="red",main="Distribution")
###Sample
require(caTools)
inp$Y <- NULL
sample = sample.split(inp$target, SplitRatio = .7)
train = subset(inp, sample == TRUE)
test = subset(inp, sample == FALSE)
head(train)
nrow(train)
fit1 <- glm(train$target~.,data=train,family = binomial)
summary(rf)
prediction1 <- data.frame(actual = test$target, predicted = predict(fit1,test ,type="response") )
result= smbinning(df=prediction1, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
tail(prediction1)
write.csv(prediction1 , "test_pred_logistic.csv")
predict_train <- data.frame(actual = train$target, predicted = predict(fit1,train ,type="response") )
write.csv(predict_train , "train_pred_logistic.csv")
result= smbinning(df=predict_train, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
####random forest
rf <- fit(target~.,data=train,model = "randomForest", mtry = 10 ,ntree = 200)
prediction2 <- data.frame(actual = test$target, predicted = predict(rf,train))
result= smbinning(df=prediction2, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
###########IV
library(devtools)
install_github("riv","tomasgreif")
library(woe)
##### K-fold Validation ########
library(caret)
cv_fold_count = 2
folds = createFolds(mydata2$Y,cv_fold_count,list=T);
smpl = folds[[i]];
g_train = mydata2[-smpl,!names(mydata2) %in% c("Y")];
g_test = mydata2[smpl,!names(mydata2) %in% c("Y")];
cost_train = mydata2[-smpl,"Y"];
cost_test = mydata2[smpl,"Y"];
rf <- randomForest(g_train,cost_train)
logit.data <- cbind(cost_train,g_train)
logit.fit <- glm(cost_train~.,data=logit.data,family = binomial)
prediction <- data.frame(actual = test$Y, predicted = predict(rf,test))
