KNN to predict the class of new data - r

How do you assign a class to new data that does not have one yet?
I am using the KNN algorithm for text classification, and here is the modeling code:
library(class)  # provides knn()
library(tm)     # provides Corpus() and DocumentTermMatrix()

train_set <- sample(1:nrow(dtm.df), 15)
test_set <- (1:nrow(dtm.df))[- train_set]
# Isolate classifier
classifier <- dtm.df[, "class"]
# Create model data by removing the "class" column
model_data <- dtm.df[,!colnames(dtm.df) %in% "class"]
# Create model: training set, test set, training set classifier
knn.pred <- knn(model_data[train_set, ], model_data[test_set, ],
                classifier[train_set], k = 5)
I tried this code for the new, unlabeled document:
newdata <- data.frame(text="bagus sekali")
newdata <- Corpus(VectorSource(newdata))
newdata <- DocumentTermMatrix(newdata)
model = knn(model_data[train_set, ], newdata, classifier[train_set], k =5)
Error in knn(model_data[train_set, ], newdata, classifier[train_set], :
dims of 'test' and 'train' differ
I know the dims are different; test is 37 x 288 while newdata is 1 x 1.
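One way to make the dimensions agree (a sketch under the assumption that model_data's column names are the training vocabulary; it has not been tested against the original dtm.df): build the DocumentTermMatrix for the new text against the training terms as a dictionary, so the new row has exactly the same columns as model_data, then pass it to knn():
library(tm)
library(class)

# Assumption: the columns of model_data are the training vocabulary
# (the terms of the original document-term matrix)
train_terms <- colnames(model_data)

new_corpus <- Corpus(VectorSource("bagus sekali"))
# Restrict the new DTM to the training vocabulary so the columns line up
new_dtm <- DocumentTermMatrix(new_corpus, control = list(dictionary = train_terms))
new_df  <- as.data.frame(as.matrix(new_dtm))

# Add any training terms that did not occur in the new text as zero columns,
# then put the columns in the same order as model_data
missing_terms <- setdiff(train_terms, colnames(new_df))
if (length(missing_terms) > 0) new_df[missing_terms] <- 0
new_df <- new_df[, train_terms, drop = FALSE]

pred_new <- knn(model_data[train_set, ], new_df, classifier[train_set], k = 5)
pred_new
If the training corpus was preprocessed (lowercasing, stopword removal, stemming), apply the same steps to new_corpus before building the DTM, otherwise the terms will not line up.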

Related

Error while running randomForest in R: "Error in y - ymean : non-numeric argument to binary operator"

library(rio)           # provides import()
library(caTools)       # provides sample.split()
library(randomForest)

birth <- import("smoker_data1.xlsx")
## Splitting the dataset into training and test sets
mysplit <- sample.split(birth, SplitRatio = 0.65)
train <- subset(birth, mysplit == TRUE)
test <- subset(birth, mysplit == FALSE)
## Build a random forest model on the training set
mod1 <- randomForest(smoke ~ ., train)
Error message: Error in y - ymean : non-numeric argument to binary operator
I think the best approach is to check the data type of the smoke variable first.
If it is not already a factor, convert it with as.factor().
library(readxl)
library(caTools)       # provides sample.split()
library(randomForest)

birth <- read_excel("smoker_data1.xlsx")
## Splitting the dataset into training and test sets
mysplit <- sample.split(birth, SplitRatio = 0.65)
train <- subset(birth, mysplit == TRUE)
test <- subset(birth, mysplit == FALSE)
## Make the response a factor so randomForest does classification
train$smoke <- as.factor(train$smoke)
## Build a random forest model on the training set
mod1 <- randomForest(smoke ~ ., train)
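A quick optional sanity check that the conversion took effect before fitting:
str(train$smoke)      # should now show a factor rather than character or numeric
levels(train$smoke)   # the smoking categories that randomForest will classify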
I already tried this with the data you gave; you just need to specify the data types correctly before fitting the randomForest function.
data1$baby_wt <- as.numeric(data1$baby_wt)
data1$income <- as.factor(data1$income)
data1$mother_a <- as.numeric(data1$mother_a)
data1$smoke <- as.factor(data1$smoke)
data1$gestation <- as.numeric(data1$gestation)
data1$mother_wt <- as.numeric(data1$mother_wt)

library(caret)
library(randomForest)

predictors <- names(data1)[!names(data1) %in% "smoke"]
inTrainingSet <- createDataPartition(data1$smoke, p = 0.7, list = FALSE)
train <- data1[inTrainingSet, ]
test <- data1[-inTrainingSet, ]

# mtry is taken as the square root of the number of predictors
m.rf <- randomForest(smoke ~ ., data = train, mtry = sqrt(length(predictors)),
                     ntree = 5000, importance = TRUE, proximity = TRUE)
m.rf

#############################################
# Test performance
#############################################
m.pred <- predict(m.rf, test[, predictors], type = "response")
m.table <- table(m.pred, test$smoke)
confusionMatrix(m.table)

R caret extractPrediction with random forest model: Error: $ operator is invalid for atomic vectors

I want to extract predictions for new, unseen data using the function caret::extractPrediction with a random forest model, but I cannot figure out why my code throws the error Error: $ operator is invalid for atomic vectors. How should the input parameters be structured to use this function?
Here is my reproducible code:
library(caret)
dat <- as.data.frame(ChickWeight)
# create column set
dat$set <- rep("train", nrow(dat))
# split into train and validation set
set.seed(1)
dat[sample(nrow(dat), 50), which(colnames(dat) == "set")] <- "validation"
# predictors and response
all_preds <- dat[which(dat$set == "train"), which(names(dat) %in% c("Time", "Diet"))]
response <- dat[which(dat$set == "train"), which(names(dat) == "weight")]
# set train control parameters
contr <- caret::trainControl(method="repeatedcv", number=3, repeats=5)
# train a random forest with caret
set.seed(1)
model <- caret::train(x = all_preds,
                      y = response,
                      method = "rf",
                      ntree = 250,
                      metric = "RMSE",
                      trControl = contr)
# validation set
vali <- dat[which(dat$set == "validation"), ]
# not working
caret::extractPrediction(models = model, testX = vali[,-c(3,5,1)], testY = vali[,1])
caret::extractPrediction(models = model, testX = vali, testY = vali)
# works without problems
caret::predict.train(model, newdata = vali)
I found a solution by looking at the documentation of extractPrediction. Basically, the models argument doesn't want a single model instance but a list of models, so I just passed list(my_rf = model) instead of model.
caret::extractPrediction(models = list(my_rf = model), testX = vali[,-c(3,5,1)], testY = vali[,1])
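For reference, a rough sketch of how the result can then be inspected; the column names (obs, pred, dataType, etc.) are from my reading of the caret docs, so treat them as an assumption:
preds <- caret::extractPrediction(models = list(my_rf = model),
                                  testX = vali[, -c(3, 5, 1)], testY = vali[, 1])
head(preds)                         # one row per observation: obs, pred, model, dataType, ...
subset(preds, dataType == "Test")   # keep only the predictions for the validation rows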

Problems with Predict() function when trying to fit Multiple Linear Regression Model

I've fitted a multiple linear regression model with lm(), using all predictors from my training set except 'lastname', and now I want to make predictions on my test set. However, when I call predict(cap.fit, test), I get an error about the variable 'lastname'.
I've tried passing in a test set that excludes the 'lastname' column, but that didn't work.
Code:
cf_df <- read.csv(file="cap_friendly_data.csv", header=TRUE, sep=",")
new_cols <- c('lastname', 'Position', 'Age.Years', 'Original.Cap.Hit', 'New.Signing.Status', 'PPG.Prior.Signing', 'PPG.Contract.Year', 'New.Cap.Hit')
new_stats <- cf_df[, new_cols]
#create training and testing datasets
set.seed(2430)
num_training_samples <- 2000
train_indices <- sample(1:nrow(new_stats), num_training_samples, replace = FALSE,)
train <- new_stats[train_indices, ]
test <- new_stats[-train_indices, ]
test_results <- test$New.Cap.Hit
#fit model
cap.fit <- lm(New.Cap.Hit ~ . - lastname, data = train)
summary(cap.fit)
predictions <- predict(cap.fit, test)
I thought I'd just get a list of predictions from the model but instead I got this error message:
predictions <- predict(cap.fit, test)
Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels) :
factor lastname has new levels Åberg, Acciari, Acolatse, Alfredsson, Anderson, Angelidis, Arnold, Backes, Balisy, Baptiste, Barch...
Can you try this?
str(new_stats)
# remove column
new_stats = subset(new_stats, select = -c(lastname))
#create training and testing datasets
set.seed(2430)
num_training_samples <- 2000
train_indices <- sample(1:nrow(new_stats), num_training_samples, replace = FALSE,)
train <- new_stats[train_indices, ]
test <- new_stats[-train_indices, ]
test_results <- test$New.Cap.Hit
#fit model
cap.fit <- lm(New.Cap.Hit ~ ., data = train)
summary(cap.fit)
# do predictions
predictions <- predict(cap.fit, test)
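As a quick optional sanity check (a sketch using the test_results vector created above), you can compare the predictions against the held-out cap hits:
# Root mean squared error on the test set
rmse <- sqrt(mean((test_results - predictions)^2, na.rm = TRUE))
rmse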

Error in root.matrix(crossprod(process)) : matrix is not positive semidefinite

I want to extend the random forest so that each leaf contains a naive Bayes regression instead of an average. As a first step, I tried using mob() with linearModel and got the following error:
Error in root.matrix(crossprod(process)) : matrix is not positive semidefinite
Here is my code:
require(data.table)
require(party)
require(randomForest)

set.seed(123)
# car.data has no header row, so read it with header = FALSE and keep strings as factors
data1 <- read.csv('https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data',
                  header = FALSE, stringsAsFactors = TRUE)
colnames(data1) <- c("BuyingPrice", "Maintenance", "NumDoors", "NumPersons",
                     "BootSpace", "Safety", "Condition")

# Split into training and validation sets
# Training set : validation set = 70 : 30 (random)
set.seed(100)
train <- sample(nrow(data1), 0.7 * nrow(data1), replace = FALSE)
TrainSet <- data1[train, ]
ValidSet <- data1[-train, ]
summary(TrainSet)
summary(ValidSet)

# Create a random forest model with default parameters
model1 <- randomForest(Condition ~ ., data = TrainSet, importance = TRUE)
model1

# Fine-tune parameters of the random forest model
model2 <- randomForest(Condition ~ ., data = TrainSet, ntree = 500, mtry = 6, importance = TRUE)
model2

fmBH <- mob(Condition ~ BuyingPrice + Maintenance | NumDoors + NumPersons + BootSpace + Safety,
            data = TrainSet, model = linearModel)

predict() R function caret package errors: "newdata" rows different, "type" not accepted

I am running a logistic regression analysis using the caret package.
The data is input as an 18x6 matrix, and everything is fine so far except the predict() function.
R tells me the type parameter is supposed to be "raw" or "prob", but "raw" just spits out an exact copy of the last column (the values of the binomial variable), and "prob" gives me the following error:
Error in dimnames(out)[[2]] <- modelFit$obsLevels :
  length of 'dimnames' [2] not equal to array extent
In addition: Warning message:
'newdata' had 7 rows but variables found have 18 rows
install.packages("pbkrtest")
install.packages("caret")
install.packages('e1071', dependencies=TRUE)
#install.packages('caret', dependencies = TRUE)
require(caret)
library(caret)
A=matrix(
c(
64830,18213,4677,24761,9845,17504,22137,12531,5842,28827,51840,4079,1000,2069,969,9173,11646,946,66161,18852,5581,27219,10159,17527,23402,11409,8115,31425,55993,0,0,1890,1430,7873,12779,627,68426,18274,5513,25687,10971,14104,19604,13438,6011,30055,57242,0,0,2190,1509,8434,10492,755,69716,18366,5735,26556,11733,16605,20644,15516,5750,31116,64330,0,0,1850,1679,9233,12000,500,73128,18906,5759,28555,11951,19810,22086,17425,6152,28469,72020,0,0,1400,1750,8599,12000,500,1,1,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1
),
nrow = 18,
ncol = 6,
byrow = FALSE) #"bycol" does NOT exist
################### data set as vectors
a<-c(64830,18213,4677,24761,9845,17504,22137,12531,5842,28827,51840,4079,1000,2069,969,9173,11646,946)
b<-c(66161,18852,5581,27219,10159,17527,23402,11409,8115,31425,55993,0,0,1890,1430,7873,12779,627)
c<-c(68426,18274,5513,25687,10971,14104,19604,13438,6011,30055,57242,0,0,2190,1509,8434,10492,755)
d<-c(69716,18366,5735,26556,11733,16605,20644,15516,5750,31116,64330,0,0,1850,1679,9233,12000,500)
e<-c(73128,18906,5759,28555,11951,19810,22086,17425,6152,28469,72020,0,0,1400,1750,8599,12000,500)
f<-c(1,1,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1)
######################
n<-nrow(A);
K<-ncol(A)-1;
Train <- createDataPartition(f, p=0.6, list=FALSE) #60% of data set is used as training.
training <- A[ Train, ]
testing <- A[ -Train, ]
nrow(training)
#this is the logistic formula:
#estimates from logistic regression characterize the relationship between the predictor and response variable on a log-odds scale
mod_fit <- train(f ~ a + b + c + d +e, data=training, method="glm", family="binomial")
mod_fit
#this is the exponential function to calculate the odds ratios for each predictor:
exp(coef(mod_fit$finalModel))
predict(mod_fit, newdata=training)
predict(mod_fit, newdata=testing, type="prob")
I'm not sure I fully understand, but A is already a matrix of (a, b, c, d, e, f), so you don't need to create both the matrix and the separate vectors.
install.packages("pbkrtest")
install.packages("caret")
install.packages('e1071', dependencies=TRUE)
#install.packages('caret', dependencies = TRUE)
require(caret)
library(caret)
A=matrix(
c(
64830,18213,4677,24761,9845,17504,22137,12531,5842,28827,51840,4079,1000,2069,969,9173,11646,946,66161,18852,5581,27219,10159,17527,23402,11409,8115,31425,55993,0,0,1890,1430,7873,12779,627,68426,18274,5513,25687,10971,14104,19604,13438,6011,30055,57242,0,0,2190,1509,8434,10492,755,69716,18366,5735,26556,11733,16605,20644,15516,5750,31116,64330,0,0,1850,1679,9233,12000,500,73128,18906,5759,28555,11951,19810,22086,17425,6152,28469,72020,0,0,1400,1750,8599,12000,500,1,1,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1
),
nrow = 18,
ncol = 6,
byrow = FALSE) #"bycol" does NOT exist
A <- data.frame(A)
colnames(A) <- c('a','b','c','d','e','f')
A$f <- as.factor(A$f)
Train <- createDataPartition(A$f, p=0.6, list=FALSE) #60% of data set is used as training.
training <- A[ Train, ]
testing <- A[ -Train, ]
nrow(training)
And to predict, you should supply only the explanatory variables, not the variable you are trying to predict:
mod_fit <- train(f ~ a + b + c + d +e, data=training, method="glm", family="binomial")
mod_fit
#this is the exponential function to calculate the odds ratios for each predictor:
exp(coef(mod_fit$finalModel))
predict(mod_fit, newdata=training[,-which(colnames(training)=="f")])
predict(mod_fit, newdata=testing[,-which(colnames(testing)=="f")])
Short answer: you should not include the explained variable, f, in the data you pass to predict. So you should do:
predict(mod_fit, newdata=training[, -ncol(training)])
predict(mod_fit, newdata=testing[, -ncol(testing)])
The warning message 'newdata' had ... rows but variables found have 18 rows appears because the formula picks up the full-length vectors a through f (18 observations each) from the workspace rather than columns of training, so the regression is run on the whole data set while predict is given only part of it (either the 11 training rows or the 7 test rows).
EDIT: To simplify the data creation and the glm process, we can do:
library(caret)
A <- data.frame(a = c(64830,18213,4677,24761,9845,17504,22137,12531,5842,28827,51840,4079,1000,2069,969,9173,11646,946),
                b = c(66161,18852,5581,27219,10159,17527,23402,11409,8115,31425,55993,0,0,1890,1430,7873,12779,627),
                c = c(68426,18274,5513,25687,10971,14104,19604,13438,6011,30055,57242,0,0,2190,1509,8434,10492,755),
                d = c(69716,18366,5735,26556,11733,16605,20644,15516,5750,31116,64330,0,0,1850,1679,9233,12000,500),
                e = c(73128,18906,5759,28555,11951,19810,22086,17425,6152,28469,72020,0,0,1400,1750,8599,12000,500),
                f = c(1,1,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1))
Train <- createDataPartition(A$f, p = 0.6, list = FALSE)  # 60% of the data set is used for training
training <- A[Train, ]
testing <- A[-Train, ]
mod_fit <- train(f ~ a + b + c + d + e, data = training, method = "glm", family = "binomial")
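For the type = "prob" call to work, caret needs to see a classification problem, i.e. a factor response. The EDIT above does not do that, so the following is only a sketch based on the as.factor() conversion shown earlier in this answer:
A$f <- as.factor(A$f)   # make caret treat the glm as a classification model
Train <- createDataPartition(A$f, p = 0.6, list = FALSE)
training <- A[Train, ]
testing  <- A[-Train, ]
mod_fit <- train(f ~ a + b + c + d + e, data = training, method = "glm", family = "binomial")

predict(mod_fit, newdata = testing[, -which(colnames(testing) == "f")])                  # predicted classes
predict(mod_fit, newdata = testing[, -which(colnames(testing) == "f")], type = "prob")   # class probabilities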
I tried to run a logistic regression model. I wrote this code:
install.packages('caret')
library(caret)
setwd('C:\\Users\\BAHOZ\\Documents\\')
D<-read.csv(file = "D.csv",header = T)
D<-read.csv(file = 'DataSet.csv',header=T)
names(D)
set.seed(111134)
Train<-createDataPartition(D$X, p=0.7,list = FALSE)
training<-D[Train,]
length(training$age)
testing<-D[-Train,]
length(testing$age)
mod_fit<-train(X~age + gender + total.Bilirubin + direct.Bilirubin + total.proteins + albumin + A.G.ratio+SGPT + SGOT + Alkphos,data=training,method="glm", family="binomial")
summary(mod_fit)
exp(coef(mod_fit$finalModel))
And I received this output from the last command:
(Intercept) age gender total.Bilirubin direct.Bilirubin total.proteins albumin A.G.ratio
0.01475027 1.01596886 1.03857883 1.00022899 1.78188072 1.00065332 1.01380334 1.00115742
SGPT SGOT Alkphos
3.93498241 0.05616662 38.29760014
By running this command I can get predictions for my data:
predict(mod_fit , newdata=testing)
But if I set type = "prob" or type = "raw",
predict(mod_fit , newdata=testing, type = "prob")
it fails with this error:
Error in dimnames(out) <- *vtmp* :
length of 'dimnames' [2] not equal to array extent
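This looks like the same situation as the answer above: the response passed to train() is numeric, so caret does not treat the fit as a classification model and type = "prob" has no class levels to report. A hedged sketch of the fix, assuming X is the 0/1 outcome column in D and mirroring the earlier as.factor() approach:
D$X <- as.factor(D$X)   # assumption: X is the binary outcome variable
Train <- createDataPartition(D$X, p = 0.7, list = FALSE)
training <- D[Train, ]
testing  <- D[-Train, ]
mod_fit <- train(X ~ age + gender + total.Bilirubin + direct.Bilirubin + total.proteins +
                   albumin + A.G.ratio + SGPT + SGOT + Alkphos,
                 data = training, method = "glm", family = "binomial")
predict(mod_fit, newdata = testing, type = "prob")   # class probabilities per row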
