My testing data a my trining data have different factors levels. I try to merge levels but it doesnt works.
library(mgcv)
library(ff)
myData <- read.csv.ffdf(file = "myFile.csv")
myData$myVar <- as.factor(myData$myVar)
testData <- read.csv(file = "test.csv")
testData$myVar <- as.factor(testData$myVar)
form <- dependent ~ .
model <- gam(form, data=myData)
model$xlevels[["myVar"]] <- union(model$xlevels[["myVar"]], levels(testData$myVar))
predictedData <- predict(model, newdata=testData)
then R gives me this error:
Error in predict.gam(model, newdata = testData) : 1001, 1213,1231 not in original fit
Calls: predict -> predict.gam
Related
birth <- import("smoker_data1.xlsx")
## Splitting the dataset in test and train datasets
mysplit <- sample.split(birth, SplitRatio = 0.65)
train <- subset(birth, mysplit == T)
test <- subset(birth, mysplit == F)
## Build Random Forest model on the test set
mod1 <- randomForest(smoke~., train)
Error message: Error: Error in y - ymean : non-numeric argument to binary operator**
I think the best way is to check the data type for smoke variable first.
If possible try to change the variable using as.factor().
library(readxl)
birth <- read_excel("smoker_data1.xlsx")
## Splitting the dataset in test and train datasets
mysplit <- sample.split(birth, SplitRatio = 0.65)
train <- subset(birth, mysplit == T)
test <- subset(birth, mysplit == F)
train$smoke <- as.factor(train$smoke)
## Build Random Forest model on the test set
mod1 <- randomForest(smoke~., train)
I already tried with the data you gave, just need to specify the type of data correctly before fitting randomForest function.
data1$baby_wt <- as.numeric(data1$baby_wt)
data1$income <- as.factor(data1$income)
data1$mother_a <- as.numeric(data1$mother_a)
data1$smoke <- as.factor(data1$smoke)
data1$gestation <- as.numeric(data1$gestation)
data1$mother_wt <- as.numeric(data1$mother_wt)
library(caret)
library(randomForest)
predictors <- names(data1)[!names(data1) %in% "smoke"]
inTrainingSet <- createDataPartition(data1$smoke, p=0.7, list=F)
train<- data1[inTrainingSet,]
test<- data1[-inTrainingSet,]
library(randomForest)
m.rf = randomForest(smoke~., data=train, mtry=sqrt(ncol(x)), ntree=5000,
importance=T, proximity=T, probability=T)
m.rf
#############################################
# Test Performance
#############################################
m.pred = predict(m.rf, test[-4], response="class")
m.table <- table(m.pred, test$smoke)
library(caret)
confusionMatrix(m.table)
I'm trying to use the "lime" package to interpret a Random Forest model with the "import85" dataset, but when I run the explain command it returns an error:
library(lime)
library(caret)
data("imports85", package = "randomForest")
imp85 <- imports85[,-2]
imp85 <- imp85[complete.cases(imp85), ]
imp85[] <- lapply(imp85, function(x) if (is.factor(x)) x[, drop=TRUE] else x)
stopifnot(require(randomForest))
NROW(imp85)*0.7
idx <- sample(1:NROW(imp85),135)
test <- imp85[idx, c(1:4, 6:25)]
train <- imp85[-idx, c(1:4, 6:25)]
resp <- imp85[[5]][-idx]
model <- train(train, resp, method = 'rf')
explainer <- lime(train, model)
explanation <- explain(test, explainer, n_labels = 1, n_features = 2)
Error in predict.randomForest(modelFit, newdata, type = "prob") :
Type of predictors in new data do not match that of the training data.
How can I solve it?
EDIT 1:
I tried to force the factor variable levels to be the same for both train and test datasets, but it doesn't work
I've fitted a multi-linear regression model using all predictors from my training set except for 'lastname' using lm(), and now I want to make predictions based on my test set. However, when I try to do that with predict(model.fit, test), I get an error regarding the variable 'lastname'
I've tried passing in a test set excluding the column 'lastname' but that didn't work
Code:
cf_df <- read.csv(file="cap_friendly_data.csv", header=TRUE, sep=",")
new_cols <- c('lastname', 'Position', 'Age.Years', 'Original.Cap.Hit', 'New.Signing.Status', 'PPG.Prior.Signing', 'PPG.Contract.Year', 'New.Cap.Hit')
new_stats <- cf_df[, new_cols]
#create training and testing datasets
set.seed(2430)
num_training_samples <- 2000
train_indices <- sample(1:nrow(new_stats), num_training_samples, replace = FALSE,)
train <- new_stats[train_indices, ]
test <- new_stats[-train_indices, ]
test_results <- test$New.Cap.Hit
#fit model
cap.fit <- lm(New.Cap.Hit ~ . - lastname, data = train)
summary(cap.fit)
predictions <- predict(cap.fit, test)
I thought I'd just get a list of predictions from the model but instead I got this error message:
predictions <- predict(cap.fit, test)
Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels) :
factor lastname has new levels Ã…berg, Acciari, Acolatse, Alfredsson, Anderson, Angelidis, Arnold, Backes, Balisy, Baptiste, Barch...
Can you try this?
str(new_stats)
# remove column
new_stats = subset(new_stats, select = -c(lastname))
#create training and testing datasets
set.seed(2430)
num_training_samples <- 2000
train_indices <- sample(1:nrow(new_stats), num_training_samples, replace = FALSE,)
train <- new_stats[train_indices, ]
test <- new_stats[-train_indices, ]
test_results <- test$New.Cap.Hit
#fit model
cap.fit <- lm(New.Cap.Hit ~ ., data = train)
summary(cap.fit)
# do predictions
predictions <- predict(cap.fit, test)
I have this dataset:
"Density","bodyfat","Age","state"
1.0708,12.3,23,normal
1.0853,6.1,22,slim
1.0414,25.3,22,fat
1.0751,10.4,26,normal
I wrote this code:
library(rpart)
set.seed(1234)
ind <- sample(2,nrow(mydata),replace=TRUE, prob= c(0.7,0.3))
trainData <- mydata[ind==1,]
testData <- mydata[ind==2,]
myFormula <- state ~ bodyfat
albero <- rpart(state ~ bodyfat)
newdata <- data.frame(Density=1.0515,bodyfat=11.1,Age=24)
newdata
predict(albero,newdata,type="class")
print(albero)
This code not working and i obtain this 2 error:
albero <- rpart(state~bodyfat)
Error in eval(expr,envir,enclos): object "state" not find
and:
predict(albero,newdata,type="class")
Error in match.arg(type): 'arg' should be one of "responde","node","prob"
I am trying to apply simple binomial logistic regression in json data I downloaded from Kaggle:
https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries/data
I changed values of interest_level column to either 1 if the value is "high" and 0 if otherwise.
This is my first time using glm so any help is welcome.
library(rjson)
library(dplyr)
library(purrr)
library(nnet)
json.data <- fromJSON(file = "train.json")
json.data = as.data.frame(t(do.call(rbind, json.data)))
#head(json.data)
#colnames(json.data)
x <- json.data$interest_level
for (i in 1:length(x)){
if (json.data$interest_level[i] =="high"){
json.data$interest_level[i] <- 1
}else {json.data$interest_level[i] <- 0}
}
indexes = sample(1:nrow(json.data), size=0.5*nrow(json.data))
train.data <- json.data[indexes,]
test.data <- json.data[-indexes,]
model <- glm(train.data~interest_level,family=binomial(link='logit'),data=train.data)
I'm getting this error message:
Error in model.frame.default(formula = train.data ~ interest_level, data = train.data, : invalid type (list) for variable 'train.data'