r mgcv Error in predict.gam model - r

My testing data and my training data have different factor levels. I tried to merge the levels, but it doesn't work.
library(mgcv)
library(ff)
myData <- read.csv.ffdf(file = "myFile.csv")
myData$myVar <- as.factor(myData$myVar)
testData <- read.csv(file = "test.csv")
testData$myVar <- as.factor(testData$myVar)
form <- dependent ~ .
model <- gam(form, data=myData)
model$xlevels[["myVar"]] <- union(model$xlevels[["myVar"]], levels(testData$myVar))
predictedData <- predict(model, newdata=testData)
then R gives me this error:
Error in predict.gam(model, newdata = testData) : 1001, 1213,1231 not in original fit
Calls: predict -> predict.gam

Related

Error while running randomForest in R: "Error in y - ymean : non-numeric argument to binary operator"

birth <- import("smoker_data1.xlsx")
## Splitting the dataset in test and train datasets
mysplit <- sample.split(birth, SplitRatio = 0.65)
train <- subset(birth, mysplit == T)
test <- subset(birth, mysplit == F)
## Build Random Forest model on the test set
mod1 <- randomForest(smoke~., train)
Error message: Error in y - ymean : non-numeric argument to binary operator
I think the best approach is to check the data type of the smoke variable first.
If it is not already a factor, convert it using as.factor().
library(readxl)
birth <- read_excel("smoker_data1.xlsx")
## Splitting the dataset in test and train datasets
mysplit <- sample.split(birth, SplitRatio = 0.65)
train <- subset(birth, mysplit == T)
test <- subset(birth, mysplit == F)
train$smoke <- as.factor(train$smoke)
## Build Random Forest model on the test set
mod1 <- randomForest(smoke~., train)
I already tried this with the data you gave; you just need to specify the type of each variable correctly before calling the randomForest function.
data1$baby_wt <- as.numeric(data1$baby_wt)
data1$income <- as.factor(data1$income)
data1$mother_a <- as.numeric(data1$mother_a)
data1$smoke <- as.factor(data1$smoke)
data1$gestation <- as.numeric(data1$gestation)
data1$mother_wt <- as.numeric(data1$mother_wt)
library(caret)
library(randomForest)
predictors <- names(data1)[!names(data1) %in% "smoke"]
inTrainingSet <- createDataPartition(data1$smoke, p=0.7, list=F)
train<- data1[inTrainingSet,]
test<- data1[-inTrainingSet,]
library(randomForest)
m.rf = randomForest(smoke~., data=train, mtry=sqrt(ncol(x)), ntree=5000,
importance=T, proximity=T, probability=T)
m.rf
#############################################
# Test Performance
#############################################
m.pred = predict(m.rf, test[-4], response="class")
m.table <- table(m.pred, test$smoke)
library(caret)
confusionMatrix(m.table)

Problems with my code and lime package in R

I'm trying to use the "lime" package to interpret a Random Forest model with the "imports85" dataset, but when I run the explain command it returns an error:
library(lime)
library(caret)
data("imports85", package = "randomForest")
imp85 <- imports85[,-2]
imp85 <- imp85[complete.cases(imp85), ]
imp85[] <- lapply(imp85, function(x) if (is.factor(x)) x[, drop=TRUE] else x)
stopifnot(require(randomForest))
NROW(imp85)*0.7
idx <- sample(1:NROW(imp85),135)
test <- imp85[idx, c(1:4, 6:25)]
train <- imp85[-idx, c(1:4, 6:25)]
resp <- imp85[[5]][-idx]
model <- train(train, resp, method = 'rf')
explainer <- lime(train, model)
explanation <- explain(test, explainer, n_labels = 1, n_features = 2)
Error in predict.randomForest(modelFit, newdata, type = "prob") :
Type of predictors in new data do not match that of the training data.
How can I solve it?
EDIT 1:
I tried to force the factor variable levels to be the same for both train and test datasets, but it doesn't work.

Problems with Predict() function when trying to fit Multiple Linear Regression Model

I've fitted a multiple linear regression model using all predictors from my training set except for 'lastname' using lm(), and now I want to make predictions based on my test set. However, when I try to do that with predict(model.fit, test), I get an error regarding the variable 'lastname'.
I've tried passing in a test set excluding the column 'lastname', but that didn't work.
Code:
cf_df <- read.csv(file="cap_friendly_data.csv", header=TRUE, sep=",")
new_cols <- c('lastname', 'Position', 'Age.Years', 'Original.Cap.Hit', 'New.Signing.Status', 'PPG.Prior.Signing', 'PPG.Contract.Year', 'New.Cap.Hit')
new_stats <- cf_df[, new_cols]
#create training and testing datasets
set.seed(2430)
num_training_samples <- 2000
train_indices <- sample(1:nrow(new_stats), num_training_samples, replace = FALSE,)
train <- new_stats[train_indices, ]
test <- new_stats[-train_indices, ]
test_results <- test$New.Cap.Hit
#fit model
cap.fit <- lm(New.Cap.Hit ~ . - lastname, data = train)
summary(cap.fit)
predictions <- predict(cap.fit, test)
I thought I'd just get a list of predictions from the model but instead I got this error message:
predictions <- predict(cap.fit, test)
Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels) :
factor lastname has new levels Åberg, Acciari, Acolatse, Alfredsson, Anderson, Angelidis, Arnold, Backes, Balisy, Baptiste, Barch...
Can you try this?
str(new_stats)
# remove column
new_stats = subset(new_stats, select = -c(lastname))
#create training and testing datasets
set.seed(2430)
num_training_samples <- 2000
train_indices <- sample(1:nrow(new_stats), num_training_samples, replace = FALSE,)
train <- new_stats[train_indices, ]
test <- new_stats[-train_indices, ]
test_results <- test$New.Cap.Hit
#fit model
cap.fit <- lm(New.Cap.Hit ~ ., data = train)
summary(cap.fit)
# do predictions
predictions <- predict(cap.fit, test)

R New Prediction with decision tree

I have this dataset:
"Density","bodyfat","Age","state"
1.0708,12.3,23,normal
1.0853,6.1,22,slim
1.0414,25.3,22,fat
1.0751,10.4,26,normal
I wrote this code:
library(rpart)
set.seed(1234)
ind <- sample(2,nrow(mydata),replace=TRUE, prob= c(0.7,0.3))
trainData <- mydata[ind==1,]
testData <- mydata[ind==2,]
myFormula <- state ~ bodyfat
albero <- rpart(state ~ bodyfat)
newdata <- data.frame(Density=1.0515,bodyfat=11.1,Age=24)
newdata
predict(albero,newdata,type="class")
print(albero)
This code is not working, and I get these two errors:
albero <- rpart(state~bodyfat)
Error in eval(expr,envir,enclos): object "state" not found
and:
predict(albero,newdata,type="class")
Error in match.arg(type): 'arg' should be one of "response","node","prob"

Error using glm in R

I am trying to apply simple binomial logistic regression to JSON data I downloaded from Kaggle:
https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries/data
I changed the values of the interest_level column to 1 if the value is "high" and 0 otherwise.
This is my first time using glm so any help is welcome.
library(rjson)
library(dplyr)
library(purrr)
library(nnet)
json.data <- fromJSON(file = "train.json")
json.data = as.data.frame(t(do.call(rbind, json.data)))
#head(json.data)
#colnames(json.data)
x <- json.data$interest_level
for (i in 1:length(x)){
if (json.data$interest_level[i] =="high"){
json.data$interest_level[i] <- 1
}else {json.data$interest_level[i] <- 0}
}
indexes = sample(1:nrow(json.data), size=0.5*nrow(json.data))
train.data <- json.data[indexes,]
test.data <- json.data[-indexes,]
model <- glm(train.data~interest_level,family=binomial(link='logit'),data=train.data)
I'm getting this error message:
Error in model.frame.default(formula = train.data ~ interest_level, data = train.data, : invalid type (list) for variable 'train.data'

Resources