Predict warning-----new data rows <> variable rows - r

I'm a beginner in R.
I tried to build a model using part of the samples and then predict the response for the remaining samples. But when I call predict(), I get a warning message:
'newdata' had 152 rows but variables found have 354 rows
I have searched some answers, but I still can't understand T.T. Please help
library(MASS)
data(Boston)
n <- nrow(Boston)
# Draw 70% of the row indices (without replacement) for training; the rest are held out.
n_train <- round(.70*n)
train_set <- sample(n,size=n_train,replace = FALSE)
# Predictors are assembled as an unnamed two-column matrix: lstat and log(lstat).
x <- cbind(Boston$lstat,log(Boston$lstat))
y <- Boston$medv
x_train <- x[train_set,]
y_train <- y[train_set]
x_test <- x[-train_set,]
y_test <- y[-train_set]
# The formula refers to the objects `y_train` and `x_train` by name.
lm_temp <- lm(y_train~x_train)
# data.frame(x_test) has no column called `x_train`; this is the call that produces the
# "'newdata' had 152 rows but variables found have 354 rows" warning.
y_test_hat <- predict(lm_temp,newdata=data.frame(x_test))

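It looks like R is getting confused because the model was fitted on a matrix of independent variables, while predict() expects newdata to be a data frame (which is a list) whose column names match the variables in the formula. The formula y_train ~ x_train contains a variable named x_train, but data.frame(x_test) has no column with that name, so predict() falls back to the original x_train (354 rows) and warns that the row counts differ.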
You can solve the problem by running your lm on a data frame
library(MASS)
data(Boston)

# Randomly assign 70% of the rows to the training set.
n <- nrow(Boston)
n_train <- round(0.70 * n)
train_set <- sample(n, size = n_train, replace = FALSE)

# Keep the response and the predictor in one data frame and store the
# log-transformed predictor as a named column.
data <- transform(Boston[, c("medv", "lstat")], loglstat = log(lstat))

train <- data[train_set, ]
test <- data[-train_set, ]

# Fitting from a data frame lets predict() match the columns of `newdata`
# by name.
lm_temp <- lm(medv ~ ., data = train)
y_test_hat <- predict(lm_temp, newdata = test)

Related

randomForest error shown NA not permitted in predictors

Could I get some help and suggestions? I am trying to run randomForest on a classification problem with currency data, but I get an error saying "NA not permitted in predictors". I have tried to solve it myself but still cannot figure it out.
library(priceR)
library(tidyverse)
library(quantmod)
library(dplyr)
Get the data
a <- historical_exchange_rates("THB", to = "USD",start_date = "2010-01-01", end_date = "2021-12-31")
Set up input indicators
# Rolling statistics of the exchange-rate column (column 2 of `a`).
# NOTE(review): rollapply() is called without a fill argument, so each result is presumably shorter than the input series — confirm.
a.avg10 <- rollapply(a[,2],10,mean)
a.avg20 <- rollapply(a[,2],20,mean)
# NOTE(review): a.std10 uses a window of 20, the same as a.std20 — a window of 10 was probably intended.
a.std10 <- rollapply(a[,2],20,sd)
a.std20 <- rollapply(a[,2],20,sd)
# NOTE(review): na.omit() drops missing values from each indicator separately, so the indicators may no longer
# line up row-for-row when they are combined by position below — confirm.
a.rsi5 <- na.omit(RSI(a[,2],5,"SMA"))
a.rsi14 <- na.omit(RSI(a[,2],14,"SMA"))
a.macd12269 <- na.omit(MACD(a[,2],12,26,9,"SMA"))
a.macd7205 <- na.omit(MACD(a[,2],7,20,5,"SMA"))
a.bbands <- na.omit(BBands(a[,2],20,"SMA",2))
Create variable direction
# direction is 0 when the rate minus its 10-step lag is <= 0, otherwise 1.
a.direction <- a %>% mutate(direction = ifelse(one_THB_equivalent_to_x_USD - lag(one_THB_equivalent_to_x_USD, 10) <= 0, 0, 1))
Combining variables
# The first 4350 elements/rows of every series are bound together column-wise by position.
a.data <- cbind(a[1:4350,2],a.avg10[1:4350],a.avg20[1:4350],a.bbands[1:4350,1:4],a.std10[1:4350],a.std20[1:4350],a.rsi5[1:4350],a.rsi14[1:4350],a.macd12269[1:4350,1:2],a.macd7205[1:4350,1:2],a.direction[1:4350,3])
Train and test
# Shuffled vector of 0/1 flags: 0 marks in-sample rows (70%), 1 marks out-of-sample rows (30%).
a.split <- sample(c(rep(0,0.7*nrow(a.data)),rep(1,0.3*nrow(a.data))))
Building in-sample and out-sample datasets
isa.data <- a.data[a.split == 0,]
osa.data <- a.data[a.split == 1,]
Standardizing the dataset of in-sample and out-sample
# NOTE(review): sapply() is used for the means but apply(, 2, ) for the standard deviations; the trailing 2 is
# passed to mean() as an extra argument rather than selecting columns — apply(isa.data, 2, mean) was probably intended.
ismea.data <- sapply(isa.data,mean,2)
issta.data <- apply(isa.data,2,sd)
# Matrix of ones with the same dimensions as the in-sample data.
isida.data <- matrix (1,dim(isa.data)[1],dim(isa.data)[2])
osmea.data <- sapply(osa.data,mean,2)
ossta.data <- apply(osa.data,2,sd)
osida.data <- matrix (1,dim(osa.data)[1],dim(osa.data)[2])
Normalizing the data
# Subtract the means and divide by the standard deviations, using the ones matrices to expand them.
norma.data <- (isa.data - t(ismea.data*t(isida.data)))/t(issta.data*t(isida.data))
normosa.data <- (osa.data - t(osmea.data*t(osida.data)))/t(ossta.data*t(osida.data))
Replacing last column with variable direction
a.dm <- dim(isa.data)
# NOTE(review): the split above is random (a.split), but direction is written back using the contiguous ranges
# 1:3045 and 3046:4350 — these probably do not refer to the same rows; confirm.
norma.data[,a.dm[2]] <- a.direction[1:3045,3]
normosa.data[,a.dm[2]] <- a.direction[3046:4350,3]
Combine as dataframe
isnorma.data <- as.data.frame(norma.data)
osnorma.data <- as.data.frame(normosa.data)
colnames(isnorma.data) <- c("exchage rate", "avg10", "avg20", "down", "mavg", "up", "pctB", "std10", "std20", "rsi5", "rsi14", "macd12269", "signal12269", "macd7205", "signal7205", "Direction")
colnames(osnorma.data) <- c("exchage rate", "avg10", "avg20", "down", "mavg", "up", "pctB", "std10", "std20", "rsi5", "rsi14", "macd12269", "signal12269", "macd7205", "signal7205", "Direction")
Modelling with random forest
# Predictor tables: everything except the Direction column.
rfisnorma.data <- isnorma.data %>% select(-Direction)
rfosnorma.data <- osnorma.data %>% select(-Direction)
Labeling train and test data with direction
# Relative change of the rate against its 10-step lag.
a.lagret <- (a[,2] - lag(a[,2],10))/ lag(a[,2],10)
rfa.direction <- NULL
# Strict inequalities: returns exactly equal to 0.02 or -0.02, and missing returns, receive no label.
rfa.direction[a.lagret > 0.02] <- "Up"
rfa.direction[a.lagret < -0.02] <- "Down"
rfa.direction[a.lagret < 0.02 & a.lagret > -0.02] <- "Nowhere"
isdira.data <- rfa.direction[1:3045]
osdira.data <- rfa.direction[3046:4350]
Convert labeled data into factors as only accepted by randomForest
# na.omit() removes unlabeled entries, so the label vectors need not have the same length as the row ranges used below.
isdira.data <- na.omit(as.factor(isdira.data))
osdira.data <- na.omit(as.factor(osdira.data))
Modelling data with input parameters
# This is the call reported to fail with "NA not permitted in predictors".
rfmodela.data <- randomForest(rfisnorma.data[11:3045,1:15], y=as.factor(isdira.data), xtest=rfosnorma.data, ytest=as.factor(osdira.data), ntree=500, importance=TRUE)
This step is where I get the error "NA not permitted in predictors".
You have missing data somewhere between rows 2840 and 2850. If you replace the last line of code with the lines shown below, it should run. You can also pass the arguments xtest=xtest[index,], ytest=y[index], but I am not sure you want them, since the test data would then be the same as the training data. Please check the documentation to make sure that you are doing the right thing.
# Rows 11 to 3045 and columns 1-15 of the standardized training predictors,
# the same slice the question passes to randomForest().
tempdata <- xtest <- rfisnorma.data[11:3045, 1:15]
y <- as.factor(as.character(isdira.data))
# Keep only rows in which every predictor and the label are present. This
# finds the missing rows from the data instead of hard-coding their positions.
keep <- complete.cases(tempdata) & !is.na(y[seq_len(nrow(tempdata))])
index <- which(keep)
rfmodela.data <- randomForest(tempdata[index, ], y = y[index], ntree = 500, importance = TRUE)
summary(rfmodela.data)

predict() function throws error using factors on linear model in R

I am using the "lung capacity" data set to try to set up a linear model:
library(tidyverse)
library(rvest)
h <- "https://docs.google.com/spreadsheets/d/0BxQfpNgXuWoIWUdZV1ZTc2ZscnM/edit?resourcekey=0-gqXT7Re2eUS2JGt_w1y4vA#gid=1055321634"
# Download the page and parse the first HTML table on it.
t <- rvest::read_html(h)
Nodes <- t %>% html_nodes("table")
table <- html_table(Nodes[[1]])
# The first row of the scraped table is used as the column names and then dropped.
colnames(table) <- table[1,]
table <- table[-1,]
table <- table %>% select(LungCap, Age, Height, Smoke, Gender, Caesarean)
Lung_Capacity <- table
# Convert the measurements to numeric and the yes/male columns to 0/1 indicators
# (presumably every column is read in as text — confirm).
Lung_Capacity$LungCap <- as.numeric(Lung_Capacity$LungCap)
Lung_Capacity$Age <- as.numeric(Lung_Capacity$Age)
Lung_Capacity$Height <- as.numeric(Lung_Capacity$Height)
Lung_Capacity$Smoke <- as.numeric(Lung_Capacity$Smoke == "yes")
Lung_Capacity$Gender <- as.numeric(Lung_Capacity$Gender == "male")
Lung_Capacity$Caesarean <- as.numeric(Lung_Capacity$Caesarean == "yes")
# Rename the three indicator columns by position (4 = Smoke, 5 = Gender, 6 = Caesarean).
colnames(Lung_Capacity)[4] <- "Smoker_YN"
colnames(Lung_Capacity)[5] <- "Male_YN"
colnames(Lung_Capacity)[6] <- "Caesarean_YN"
head(Lung_Capacity)
Capacity <- Lung_Capacity
I am splitting the data into a training set and a validation set:
library(caret)
set.seed(1)
y <- Capacity$LungCap
# The rows selected by createDataPartition (p = 0.2) form the test set; all other rows are used for training.
testIndex <- caret::createDataPartition(y, times = 1, p = 0.2, list = FALSE)
train <- Capacity[-testIndex,]
test <- Capacity[testIndex,]
Cross-validating to obtain my final model:
set.seed(3)
control <- trainControl(method="cv", number = 5)
LinearModel <- train(LungCap ~ ., data = train, method = "lm", trControl = control)
# LM is the finalModel component of the caret fit, not the train object itself.
LM <- LinearModel$finalModel
summary(LM)
And trying to run a prediction on the held-out test set:
# This is the call reported to fail with "object 'Smoker_YN1' not found".
lmPredictions <- predict(LM, newdata = test)
However, there is an error thrown that reads:
Error in eval(predvars, data, env) : object 'Smoker_YN1' not found
Looking through this site, I thought the column names of the test and train tables may have been off, but that is not the case: they are identical. The issue seems to be that training the model has renamed the factor predictor to "Smoker_YN1" instead of the intended column name "Smoker_YN". I tried renaming the column headers in the test set and I tried renaming the coefficient headers. Neither approach was successful.
I've run out of research and experimental approaches, can anyone please help with this issue?
I am not sure, so please go through this and tell me whether it works. My guess (and I am not an expert) is that the character column LungCap and the numeric column Lung interfere in this code:
h <- "https://docs.google.com/spreadsheets/d/0BxQfpNgXuWoIWUdZV1ZTc2ZscnM/edit?resourcekey=0-gqXT7Re2eUS2JGt_w1y4vA#gid=1055321634"
#install.packages("textreadr")
library(textreadr)
library(rvest)
# Download the page and parse the first HTML table on it.
t <- read_html(h)
t
Nodes <- t %>% html_nodes("table")
table <- html_table(Nodes[[1]])
# The first row of the scraped table is used as the column names and then dropped.
colnames(table) <- table[1,]
table <- table[-1,]
table <- table %>% select(LungCap, Age, Height, Smoke, Gender, Caesarean)
Lung_Capacity <- table
# I changed Lung_Capacity$LungCap <- as.numeric(Lung_Capacity$LungCap) to
# Lung is added as a new column, so positions 4-6 renamed below are still Smoke, Gender and Caesarean.
Lung_Capacity$Lung <- as.numeric(Lung_Capacity$LungCap)
Lung_Capacity$Age <- as.numeric(Lung_Capacity$Age)
Lung_Capacity$Height <- as.numeric(Lung_Capacity$Height)
Lung_Capacity$Smoke <- as.numeric(Lung_Capacity$Smoke == "yes")
Lung_Capacity$Gender <- as.numeric(Lung_Capacity$Gender == "male")
Lung_Capacity$Caesarean <- as.numeric(Lung_Capacity$Caesarean == "yes")
colnames(Lung_Capacity)[4] <- "Smoker_YN"
colnames(Lung_Capacity)[5] <- "Male_YN"
colnames(Lung_Capacity)[6] <- "Caesarean_YN"
head(Lung_Capacity)
# This line is the same as in the question: copy the cleaned table
Capacity <- Lung_Capacity
Capacity
library(caret)
set.seed(1)
# I changed y <- Capacity$LungCap to
y <- Capacity$Lung
testIndex <- caret::createDataPartition(y, times = 1, p = 0.2, list = FALSE)
train <- Capacity[-testIndex,]
test <- Capacity[testIndex,]
# I removed the original LungCap column from both sets, so only the numeric Lung column remains as the response
train$LungCap <- NULL
test$LungCap <- NULL
set.seed(3)
control <- trainControl(method="cv", number = 5)
# I changed LungCap to Lung
LinearModel <- train(Lung ~ ., data = train, method = "lm", trControl = control)
LM <- LinearModel$finalModel
summary(LM)
# NOTE(review): prediction is still made from the extracted finalModel; consider
# predict(LinearModel, newdata = test) on the train object instead — confirm against the caret documentation.
lmPredictions <- predict(LM, newdata = test)
lmPredictions
Output:
1 2 3 4 5 6 7
6.344355 10.231586 4.902900 7.500179 5.295711 9.434454 8.879997
8 9 10 11 12 13 14
12.227635 11.097691 7.775063 8.085810 6.399364 7.852107 9.480219
15 16 17 18 19 20
8.982051 10.115840 7.917863 12.089960 7.838881 9.653292

Error while running randomForest in R: "Error in y - ymean : non-numeric argument to binary operator"

birth <- import("smoker_data1.xlsx")
## Splitting the dataset in test and train datasets
# NOTE(review): sample.split() receives the whole data frame here rather than a single column —
# confirm that this splits rows as intended.
mysplit <- sample.split(birth, SplitRatio = 0.65)
train <- subset(birth, mysplit == T)
test <- subset(birth, mysplit == F)
## Build Random Forest model on the training set
# This is the call reported to fail with "Error in y - ymean : non-numeric argument to binary operator".
mod1 <- randomForest(smoke~., train)
Error message: Error in y - ymean : non-numeric argument to binary operator
I think the best way is to check the data type of the smoke variable first.
If possible, try converting the variable with as.factor().
library(readxl)
birth <- read_excel("smoker_data1.xlsx")

# randomForest() fits a classification forest only when the response is a
# factor; otherwise it attempts regression, which is where "y - ymean" fails
# for a non-numeric response. Convert before splitting so that the training
# and test sets share the same factor levels.
birth$smoke <- as.factor(birth$smoke)

## Splitting the dataset in test and train datasets
# sample.split() expects the vector of outcome labels. Given a whole data
# frame it returns one flag per column, which subset() then recycles across
# the rows.
mysplit <- sample.split(birth$smoke, SplitRatio = 0.65)
train <- subset(birth, mysplit == TRUE)
test <- subset(birth, mysplit == FALSE)

## Build Random Forest model on the training set
mod1 <- randomForest(smoke ~ ., data = train)
I already tried this with the data you gave; you just need to specify the data types correctly before fitting the randomForest function.
# Give every column the type randomForest() should see: measurements are
# numeric, categorical variables (including the response) are factors.
data1$baby_wt <- as.numeric(data1$baby_wt)
data1$income <- as.factor(data1$income)
data1$mother_a <- as.numeric(data1$mother_a)
data1$smoke <- as.factor(data1$smoke)
data1$gestation <- as.numeric(data1$gestation)
data1$mother_wt <- as.numeric(data1$mother_wt)

library(caret)
library(randomForest)

# Every column except the response is a predictor.
predictors <- setdiff(names(data1), "smoke")

# 70/30 train/test split built from the response.
inTrainingSet <- createDataPartition(data1$smoke, p = 0.7, list = FALSE)
train <- data1[inTrainingSet, ]
test <- data1[-inTrainingSet, ]

# mtry is derived from the number of predictors; the earlier version referred
# to an object `x` that is never defined in this script. `probability` is not
# a randomForest() argument, so it is no longer passed.
m.rf <- randomForest(
  smoke ~ .,
  data = train,
  mtry = floor(sqrt(length(predictors))),
  ntree = 5000,
  importance = TRUE,
  proximity = TRUE
)
m.rf

#############################################
# Test Performance
#############################################
# Select the predictors by name instead of dropping column 4 by position, and
# request class predictions through `type`, the argument that
# predict.randomForest() actually reads.
m.pred <- predict(m.rf, newdata = test[predictors], type = "response")
m.table <- table(m.pred, test$smoke)
confusionMatrix(m.table)

I am trying to run XGBoost in R but am facing some issues

I have a dataset of 25 variables and 248 rows.
There are 8-factor variables and the rest are integers and numbers.
I am trying to run XGBoost.
I have done the following code: -
# Partition Data
# Each row is randomly assigned to group 1 (train, probability 0.7) or group 2 (test, probability 0.3).
set.seed(1234)
ind <- sample(2, nrow(mission), replace = T, prob = c(0.7,0.3))
train <- mission[ind == 1,]
test <- mission[ind == 2,]
# Create matrix - One-Hot Encoding for Factor variables
# The formula GRL ~ .-1 uses every column except GRL as a predictor and drops the intercept column.
trainm <- sparse.model.matrix(GRL ~ .-1, data = train)
head(trainm)
# NOTE(review): depending on the class of `mission`, train[,"GRL"] may yield a one-column table
# rather than a plain vector — confirm.
train_label <- train[,"GRL"]
train_matrix <- xgb.DMatrix(data = as.matrix(trainm), label = train_label)
testm <- sparse.model.matrix(GRL~.-1, data = test)
test_label <- test[,"GRL"]
# xgb.DMatrix is where the "length of labels must equal to the number of rows" error is reported.
test_matrix <- xgb.DMatrix(data = as.matrix(testm),label = test_label)
The response variable here is "GRL" and I am running the test_label <- test[,"GRL"]
The above code is getting executed but when I am trying to use it in xgb.DMatrix, I am encountering the following error:
Error in setinfo.xgb.DMatrix(dmat, names(p), p[[1]]) :
The length of labels must equal to the number of rows in the input data
I have partitioned the data into 70:30.
test[,"GRL"] returns a data.frame, and XGBoost needs the label to be a vector.
Just use test$GRL or test[["GRL"]] instead. You also need to do the same for the training dataset.

Error: Please use column names for `x` when using caret() for logistic regression

I'd like to build a logistic regression model using the caret package.
This is my code.
library(caret)
# Toy data: a random 0/1 response and one normally distributed predictor.
df <- data.frame(response = sample(0:1, 200, replace=TRUE), predictor = rnorm(200,10,45))
outcomeName <-"response"
# Every column name except the outcome; with this data frame that is the single column "predictor".
predictors <- names(df)[!(names(df) %in% outcomeName)]
index <- createDataPartition(df$response, p=0.75, list=FALSE)
trainSet <- df[ index,]
testSet <- df[-index,]
# This is the call reported to fail with "Please use column names for x": the response is passed
# first and the single predictor column is extracted with [ , ].
model_glm <- train(trainSet[,outcomeName], trainSet[,predictors], method='glm', family="binomial", data = trainSet)
I get the error Error: Please use column names for x.
I receive the same error when I replace trainSet[,predictors] with the column name predictors.
Unfortunately, R has a nasty behavior: subsetting a single column with df[,1] drops the result to a vector, and since you have only one predictor you ran into this feature. You can keep the result as a data.frame with either
trainSet[,predictors, drop = FALSE]
or
trainSet[predictors]
BTW. there are two additional issues with the code:
First argument should be predictors, not response
For logistic regression with caret you need response to be a factor
The full code should be:
library(caret)

# Toy data: a random 0/1 response and one normally distributed predictor.
df <- data.frame(
  response = sample(0:1, 200, replace = TRUE),
  predictor = rnorm(200, 10, 45)
)
# The outcome is converted to a factor before it is handed to caret.
df$response <- as.factor(df$response)

outcomeName <- "response"
predictors <- setdiff(names(df), outcomeName)

index <- createDataPartition(df$response, p = 0.75, list = FALSE)
trainSet <- df[index, ]
testSet <- df[-index, ]

# Predictors come first as a data frame (single-bracket indexing keeps the
# column name); the outcome comes second as a plain vector.
model_glm <- train(
  trainSet[predictors],
  trainSet[[outcomeName]],
  method = "glm",
  family = "binomial",
  data = trainSet
)
*changed trainSet[,outcomeName] to trainSet[[outcomeName]] for more explicit transformation to vector
I had the same problem,
head(iris)
xx <- iris[,-5]
yy <- iris[,5]
rf.imp <- train(x = xx, y = yy, method = "rf", data = iris); rf.imp

Resources