Problems with Predict() function when trying to fit Multiple Linear Regression Model - r
I've fitted a multiple linear regression model with lm(), using all predictors from my training set except 'lastname', and now I want to make predictions on my test set. However, when I call predict(cap.fit, test), I get an error regarding the variable 'lastname'.
I've also tried passing in a test set that excludes the 'lastname' column, but that didn't work either.
Code:
# Read the raw CSV and keep only the columns listed in new_cols.
cf_df <- read.csv(file="cap_friendly_data.csv", header=TRUE, sep=",")
new_cols <- c('lastname', 'Position', 'Age.Years', 'Original.Cap.Hit', 'New.Signing.Status', 'PPG.Prior.Signing', 'PPG.Contract.Year', 'New.Cap.Hit')
new_stats <- cf_df[, new_cols]
# Create training and testing datasets: 2000 randomly sampled rows go to
# `train`, every remaining row goes to `test`.
set.seed(2430)  # fixed seed so the random split is reproducible
num_training_samples <- 2000
train_indices <- sample(1:nrow(new_stats), num_training_samples, replace = FALSE,)
train <- new_stats[train_indices, ]
test <- new_stats[-train_indices, ]
# Observed response values of the test rows, kept for comparison with the predictions.
test_results <- test$New.Cap.Hit
# Fit model: regress New.Cap.Hit on every other column, with `lastname`
# removed from the formula (it is still a column of `train` and `test`).
cap.fit <- lm(New.Cap.Hit ~ . - lastname, data = train)
summary(cap.fit)
# This is the call that raises the "factor lastname has new levels" error.
# NOTE(review): presumably the fitted model still records the factor levels of
# `lastname` even though it was subtracted in the formula -- confirm against
# the model.frame()/predict.lm() documentation.
predictions <- predict(cap.fit, test)
I thought I'd just get a list of predictions from the model but instead I got this error message:
predictions <- predict(cap.fit, test)
Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels) :
factor lastname has new levels Åberg, Acciari, Acolatse, Alfredsson, Anderson, Angelidis, Arnold, Backes, Balisy, Baptiste, Barch...
Can you try this?
# Inspect the column types before modelling.
str(new_stats)

# Drop the `lastname` column from the data itself, so that neither the model
# fit nor predict() ever sees it.
new_stats <- new_stats[, names(new_stats) != "lastname", drop = FALSE]

# Create training and testing datasets: a reproducible random split with
# `num_training_samples` rows for training and the remaining rows for testing.
set.seed(2430)
num_training_samples <- 2000
train_indices <- sample(
  seq_len(nrow(new_stats)),
  num_training_samples,
  replace = FALSE
)
train <- new_stats[train_indices, ]
test <- new_stats[-train_indices, ]

# Observed response values of the test rows, kept for comparison with the
# predictions.
test_results <- test$New.Cap.Hit

# Fit model: `.` now expands to every remaining column except the response.
cap.fit <- lm(New.Cap.Hit ~ ., data = train)
summary(cap.fit)

# Do predictions on the held-out rows.
predictions <- predict(cap.fit, test)
Related
Error while running randomForest in R: "Error in y - ymean : non-numeric argument to binary operator"
birth <- import("smoker_data1.xlsx") ## Splitting the dataset in test and train datasets mysplit <- sample.split(birth, SplitRatio = 0.65) train <- subset(birth, mysplit == T) test <- subset(birth, mysplit == F) ## Build Random Forest model on the test set mod1 <- randomForest(smoke~., train) Error message: Error: Error in y - ymean : non-numeric argument to binary operator**
I think the best way is to check the data type for smoke variable first. If possible try to change the variable using as.factor(). library(readxl) birth <- read_excel("smoker_data1.xlsx") ## Splitting the dataset in test and train datasets mysplit <- sample.split(birth, SplitRatio = 0.65) train <- subset(birth, mysplit == T) test <- subset(birth, mysplit == F) train$smoke <- as.factor(train$smoke) ## Build Random Forest model on the test set mod1 <- randomForest(smoke~., train) I already tried with the data you gave, just need to specify the type of data correctly before fitting randomForest function. data1$baby_wt <- as.numeric(data1$baby_wt) data1$income <- as.factor(data1$income) data1$mother_a <- as.numeric(data1$mother_a) data1$smoke <- as.factor(data1$smoke) data1$gestation <- as.numeric(data1$gestation) data1$mother_wt <- as.numeric(data1$mother_wt) library(caret) library(randomForest) predictors <- names(data1)[!names(data1) %in% "smoke"] inTrainingSet <- createDataPartition(data1$smoke, p=0.7, list=F) train<- data1[inTrainingSet,] test<- data1[-inTrainingSet,] library(randomForest) m.rf = randomForest(smoke~., data=train, mtry=sqrt(ncol(x)), ntree=5000, importance=T, proximity=T, probability=T) m.rf ############################################# # Test Performance ############################################# m.pred = predict(m.rf, test[-4], response="class") m.table <- table(m.pred, test$smoke) library(caret) confusionMatrix(m.table)
KNN for predict class from new data
How do you provide a class for new data that does not have a class? I use the KNN algorithm and here is the code for modeling. (Text Classification) train_set <- sample(1:nrow(dtm.df), 15) test_set <- (1:nrow(dtm.df))[- train_set] # Isolate classifier classifier <- dtm.df[, "class"] # Create model data and remove "category" model_data <- dtm.df[,!colnames(dtm.df) %in% "class"] # Create model: training set, test set, training set classifier knn.pred <- knn(model_data[train_set, ], model_data[test_set, ], classifier[train_set], k=5) I try this code newdata <- data.frame(text="bagus sekali") newdata <- Corpus(VectorSource(newdata)) newdata <- DocumentTermMatrix(newdata) model = knn(model_data[train_set, ], newdata, classifier[train_set], k =5) Error in knn(model_data[train_set, ], newdata, classifier[train_set], : dims of 'test' and 'train' differ I know the dims is different, test is 37 288 and newdata is 1 1.
Error: Please use column names for `x` when using caret() for logistic regression
I'd like to build a logistic regression model using the caret package. This is my code. library(caret) df <- data.frame(response = sample(0:1, 200, replace=TRUE), predictor = rnorm(200,10,45)) outcomeName <-"response" predictors <- names(df)[!(names(df) %in% outcomeName)] index <- createDataPartition(df$response, p=0.75, list=FALSE) trainSet <- df[ index,] testSet <- df[-index,] model_glm <- train(trainSet[,outcomeName], trainSet[,predictors], method='glm', family="binomial", data = trainSet) I get the error Error: Please use column names for x. I receive the same error when I replace trainSet[,predictors] with the column name predictors.
Unfortunately R has a nasty behavior when subsetting just one column like df[,1] to change outcome to a vector and as you have only one predictor you encountered this feature. You can preserve results as data.frame by either trainSet[,predictors, drop = FALSE] or trainSet[predictors] BTW. there are two additional issues with the code: First argument should be predictors, not response For logistic regression with caret you need response to be a factor The full code should be: library(caret) df <- data.frame(response = sample(0:1, 200, replace=TRUE), predictor = rnorm(200,10,45)) df$response <- as.factor(df$response) outcomeName <-"response" predictors <- names(df)[!(names(df) %in% outcomeName)] index <- createDataPartition(df$response, p=0.75, list=FALSE) trainSet <- df[ index,] testSet <- df[-index,] model_glm <- train(trainSet[predictors], trainSet[[outcomeName]], method='glm', family="binomial", data = trainSet) *changed trainSet[,outcomeName] to trainSet[[outcomeName]] for more explicit transformation to vector
I had the same problem, `head(iris) xx <- iris[,-5] yy <- iris[,5] rf.imp <- train(x = xx, y = yy, method = "rf", data = iris); rf.imp`
r mgcv Error in predict.gam model
My testing data and my training data have different factor levels. I tried to merge the levels, but it doesn't work. library(mgcv) library(ff) myData <- read.csv.ffdf(file = "myFile.csv") myData$myVar <- as.factor(myData$myVar) testData <- read.csv(file = "test.csv") testData$myVar <- as.factor(testData$myVar) form <- dependent ~ . model <- gam(form, data=myData) model$xlevels[["myVar"]] <- union(model$xlevels[["myVar"]], levels(testData$myVar)) predictedData <- predict(model, newdata=testData) Then R gives me this error: Error in predict.gam(model, newdata = testData) : 1001, 1213,1231 not in original fit Calls: predict -> predict.gam
predict() R function caret package errors: "newdata" rows different, "type" not accepted
I am running a logistic regression analysis using the caret package. Data is input as a 18x6 matrix everything is fine so far except the predict() function. R is telling me the type parameter is supposed to be raw or prob but raw just spits out an exact copy of the last column (the values of the binomial variable). prob gives me the following error: "Error in dimnames(out)[[2]] <- modelFit$obsLevels : length of 'dimnames' [2] not equal to array extent In addition: Warning message: 'newdata' had 7 rows but variables found have 18 rows" install.packages("pbkrtest") install.packages("caret") install.packages('e1071', dependencies=TRUE) #install.packages('caret', dependencies = TRUE) require(caret) library(caret) A=matrix( c( 64830,18213,4677,24761,9845,17504,22137,12531,5842,28827,51840,4079,1000,2069,969,9173,11646,946,66161,18852,5581,27219,10159,17527,23402,11409,8115,31425,55993,0,0,1890,1430,7873,12779,627,68426,18274,5513,25687,10971,14104,19604,13438,6011,30055,57242,0,0,2190,1509,8434,10492,755,69716,18366,5735,26556,11733,16605,20644,15516,5750,31116,64330,0,0,1850,1679,9233,12000,500,73128,18906,5759,28555,11951,19810,22086,17425,6152,28469,72020,0,0,1400,1750,8599,12000,500,1,1,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1 ), nrow = 18, ncol = 6, byrow = FALSE) #"bycol" does NOT exist ################### data set as vectors a<-c(64830,18213,4677,24761,9845,17504,22137,12531,5842,28827,51840,4079,1000,2069,969,9173,11646,946) b<-c(66161,18852,5581,27219,10159,17527,23402,11409,8115,31425,55993,0,0,1890,1430,7873,12779,627) c<-c(68426,18274,5513,25687,10971,14104,19604,13438,6011,30055,57242,0,0,2190,1509,8434,10492,755) d<-c(69716,18366,5735,26556,11733,16605,20644,15516,5750,31116,64330,0,0,1850,1679,9233,12000,500) e<-c(73128,18906,5759,28555,11951,19810,22086,17425,6152,28469,72020,0,0,1400,1750,8599,12000,500) f<-c(1,1,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1) ###################### n<-nrow(A); K<-ncol(A)-1; Train <- createDataPartition(f, p=0.6, list=FALSE) #60% of data set 
is used as training. training <- A[ Train, ] testing <- A[ -Train, ] nrow(training) #this is the logistic formula: #estimates from logistic regression characterize the relationship between the predictor and response variable on a log-odds scale mod_fit <- train(f ~ a + b + c + d +e, data=training, method="glm", family="binomial") mod_fit #this isthe exponential function to calculate the odds ratios for each preditor: exp(coef(mod_fit$finalModel)) predict(mod_fit, newdata=training) predict(mod_fit, newdata=testing, type="prob")
I'm not very sure to understand, but A is a matrix of (a,b,c,d,e,f). So you don't need to create two objects. install.packages("pbkrtest") install.packages("caret") install.packages('e1071', dependencies=TRUE) #install.packages('caret', dependencies = TRUE) require(caret) library(caret) A=matrix( c( 64830,18213,4677,24761,9845,17504,22137,12531,5842,28827,51840,4079,1000,2069,969,9173,11646,946,66161,18852,5581,27219,10159,17527,23402,11409,8115,31425,55993,0,0,1890,1430,7873,12779,627,68426,18274,5513,25687,10971,14104,19604,13438,6011,30055,57242,0,0,2190,1509,8434,10492,755,69716,18366,5735,26556,11733,16605,20644,15516,5750,31116,64330,0,0,1850,1679,9233,12000,500,73128,18906,5759,28555,11951,19810,22086,17425,6152,28469,72020,0,0,1400,1750,8599,12000,500,1,1,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1 ), nrow = 18, ncol = 6, byrow = FALSE) #"bycol" does NOT exist A <- data.frame(A) colnames(A) <- c('a','b','c','d','e','f') A$f <- as.factor(A$f) Train <- createDataPartition(A$f, p=0.6, list=FALSE) #60% of data set is used as training. training <- A[ Train, ] testing <- A[ -Train, ] nrow(training) And to predict a variable you must enter the explanatory variables and not the variable to predict mod_fit <- train(f ~ a + b + c + d +e, data=training, method="glm", family="binomial") mod_fit #this isthe exponential function to calculate the odds ratios for each preditor: exp(coef(mod_fit$finalModel)) predict(mod_fit, newdata=training[,-which(colnames(training)=="f")]) predict(mod_fit, newdata=testing[,-which(colnames(testing)=="f")])
Short answer: you should not include the explained variable, which is f, in the newdata you pass to predict(). So you should do: predict(mod_fit, newdata=training[, -ncol(training)]) predict(mod_fit, newdata=testing[, -ncol(testing)]) The issue with the warning message 'newdata' had 11 rows but variables found have 18 rows is because you run the regression using the whole data set (18 observations), but predict using just part of it (either 11 or 7). EDIT: To simplify the data creation and glm processes we can do: library(caret) A <- data.frame(a = c(64830,18213,4677,24761,9845,17504,22137,12531,5842,28827,51840,4079,1000,2069,969,9173,11646,946), b = c(66161,18852,5581,27219,10159,17527,23402,11409,8115,31425,55993,0,0,1890,1430,7873,12779,627), c = c(68426,18274,5513,25687,10971,14104,19604,13438,6011,30055,57242,0,0,2190,1509,8434,10492,755), d = c(69716,18366,5735,26556,11733,16605,20644,15516,5750,31116,64330,0,0,1850,1679,9233,12000,500), e = c(73128,18906,5759,28555,11951,19810,22086,17425,6152,28469,72020,0,0,1400,1750,8599,12000,500), f = c(1,1,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1)) Train <- createDataPartition(f, p=0.6, list=FALSE) #60% of data set is used as training. training <- A[ Train, ] testing <- A[ -Train, ] mod_fit <- train(f ~ a + b + c + d + e, data=training, method="glm", family="binomial")
I am trying to run a logistic regression model. I wrote this code: install.packages('caret') library(caret) setwd('C:\\Users\\BAHOZ\\Documents\\') D<-read.csv(file = "D.csv",header = T) D<-read.csv(file = 'DataSet.csv',header=T) names(D) set.seed(111134) Train<-createDataPartition(D$X, p=0.7,list = FALSE) training<-D[Train,] length(training$age) testing<-D[-Train,] length(testing$age) mod_fit<-train(X~age + gender + total.Bilirubin + direct.Bilirubin + total.proteins + albumin + A.G.ratio+SGPT + SGOT + Alkphos,data=training,method="glm", family="binomial") summary(mod_fit) exp(coef(mod_fit$finalModel)) And I received this output for the last command: (Intercept) age gender total.Bilirubin direct.Bilirubin total.proteins albumin A.G.ratio 0.01475027 1.01596886 1.03857883 1.00022899 1.78188072 1.00065332 1.01380334 1.00115742 SGPT SGOT Alkphos 3.93498241 0.05616662 38.29760014 By running this command I could predict my data: predict(mod_fit , newdata=testing) But if I set type="prob" or type="raw", as in predict(mod_fit , newdata=testing, type = "prob"), it fails with this error: Error in dimnames(out) <- *vtmp* : length of 'dimnames' [2] not equal to array extent