Getting unnacurate number of rows when using predict function in a cross validation excercise - r

I'm Performing a K-fold exercise with K = 10 for polinomials from degree 1 to 5 with the purpose of identifying which polynomial fits the best the data provided. Never the less, when I try to predict Y-Hat using the testing data (X-test) which length is 32. R shows me a warning letting me know that the predictions have been adjusted to the length of the training data that has 288 and I don't really understand why this happens.
What I believe is that after fitting the gml and then predicting I should get the 32 y predicted for the 32 points included in the x-test set.
"...Warning: 'newdata' had 32 rows but variables found have 288 rowsWarning: 'newdata' had 32 rows but variables found have 288 rowsWarning: 'newdata' had 32 rows but variables found have 288 rowsWarning..."
Here is my code:
k = 10
CVMSE = matrix(NA, nrow = k, ncol = 5)
set <- 1:320
random_x = sample(train_x, size = length(train_x))
random_y = sample(train_noisy_y, size = length(train_noisy_y))
n <- length(train_x)
k <- 10
group_sizes_x <- rep(floor(n/k), k)
groups_x <-split(random_x, rep(1:k,group_sizes_x))
n <- length(train_noisy_y)
k <- 10
group_sizes_y <- rep(floor(n/k), k)
groups_y <-split(random_y, rep(1:k,group_sizes_y))
for (deg in 1:5) {
for (i in 1:k){
x_test <- groups_x[[i]] %>% unlist()
y_test <- groups_y[[i]] %>% unlist()
x_train <- groups_x[-i] %>% unlist()
y_train <- groups_y[-i] %>% unlist()
model <- glm(y_train ~ poly(x_train, deg))
y_pred <- predict.glm(model, newdata = data.frame(x = x_test))
CVMSE[i, deg] <- mean((y_test - y_pred)^2)
}}
meanCVMSE = apply(CVMSE, 2, mean)
meanCVMSE
At the end I get the meanCVMSE but with the warning I mentioned before.

Related

predict() function throws error using factors on linear model in R

I am using the "lung capacity" data set to try to set up a linear model:
library(tidyverse)
library(rvest)
h <- "https://docs.google.com/spreadsheets/d/0BxQfpNgXuWoIWUdZV1ZTc2ZscnM/edit?resourcekey=0-gqXT7Re2eUS2JGt_w1y4vA#gid=1055321634"
t <- rvest::read_html(h)
Nodes <- t %>% html_nodes("table")
table <- html_table(Nodes[[1]])
colnames(table) <- table[1,]
table <- table[-1,]
table <- table %>% select(LungCap, Age, Height, Smoke, Gender, Caesarean)
Lung_Capacity <- table
Lung_Capacity$LungCap <- as.numeric(Lung_Capacity$LungCap)
Lung_Capacity$Age <- as.numeric(Lung_Capacity$Age)
Lung_Capacity$Height <- as.numeric(Lung_Capacity$Height)
Lung_Capacity$Smoke <- as.numeric(Lung_Capacity$Smoke == "yes")
Lung_Capacity$Gender <- as.numeric(Lung_Capacity$Gender == "male")
Lung_Capacity$Caesarean <- as.numeric(Lung_Capacity$Caesarean == "yes")
colnames(Lung_Capacity)[4] <- "Smoker_YN"
colnames(Lung_Capacity)[5] <- "Male_YN"
colnames(Lung_Capacity)[6] <- "Caesarean_YN"
head(Lung_Capacity)
Capacity <- Lung_Capacity
I am splitting the data into a training set and a validation set:
library(caret)
set.seed(1)
y <- Capacity$LungCap
testIndex <- caret::createDataPartition(y, times = 1, p = 0.2, list = FALSE)
train <- Capacity[-testIndex,]
test <- Capacity[testIndex,]
Cross-validating to obtain my final model:
set.seed(3)
control <- trainControl(method="cv", number = 5)
LinearModel <- train(LungCap ~ ., data = train, method = "lm", trControl = control)
LM <- LinearModel$finalModel
summary(LM)
And trying to run a prediction on the held-out test set:
lmPredictions <- predict(LM, newdata = test)
However, there is an error thrown that reads:
Error in eval(predvars, data, env) : object 'Smoker_YN1' not found
Looking through this site, I thought the column names of the test and train tables may have been off, but that is not the case, they are identical. The issue seems to be that training the model has renamed the factor predictors "Smoker_YN1" as opposed to the column name "Smokey_YN" that is intended. I tried renaming the column headers in the test set and I tried re-naming the coefficient headers. Neither approach was successful.
I've run out of research and experimental approaches, can anyone please help with this issue?
I am not sure. Please go through and tell me: My guess (and I am not an expert, is that LungCap character and Lung numeric interfer in this code):
h <- "https://docs.google.com/spreadsheets/d/0BxQfpNgXuWoIWUdZV1ZTc2ZscnM/edit?resourcekey=0-gqXT7Re2eUS2JGt_w1y4vA#gid=1055321634"
#install.packages("textreadr")
library(textreadr)
library(rvest)
t <- read_html(h)
t
Nodes <- t %>% html_nodes("table")
table <- html_table(Nodes[[1]])
colnames(table) <- table[1,]
table <- table[-1,]
table <- table %>% select(LungCap, Age, Height, Smoke, Gender, Caesarean)
Lung_Capacity <- table
# I changed Lung_Capacity$LungCap <- as.numeric(Lung_Capacity$LungCap) to
Lung_Capacity$Lung <- as.numeric(Lung_Capacity$LungCap)
Lung_Capacity$Age <- as.numeric(Lung_Capacity$Age)
Lung_Capacity$Height <- as.numeric(Lung_Capacity$Height)
Lung_Capacity$Smoke <- as.numeric(Lung_Capacity$Smoke == "yes")
Lung_Capacity$Gender <- as.numeric(Lung_Capacity$Gender == "male")
Lung_Capacity$Caesarean <- as.numeric(Lung_Capacity$Caesarean == "yes")
colnames(Lung_Capacity)[4] <- "Smoker_YN"
colnames(Lung_Capacity)[5] <- "Male_YN"
colnames(Lung_Capacity)[6] <- "Caesarean_YN"
head(Lung_Capacity)
# I changed to
Capacity <- Lung_Capacity
Capacity
library(caret)
set.seed(1)
# I changed y <- Capacity$LungCap to
y <- Capacity$Lung
testIndex <- caret::createDataPartition(y, times = 1, p = 0.2, list = FALSE)
train <- Capacity[-testIndex,]
test <- Capacity[testIndex,]
# I removed
train$LungCap <- NULL
test$LungCap <- NULL
set.seed(3)
control <- trainControl(method="cv", number = 5)
# I changed LungCap to Lung
LinearModel <- train(Lung ~ ., data = train, method = "lm", trControl = control)
LM <- LinearModel$finalModel
summary(LM)
lmPredictions <- predict(LM, newdata = test)
lmPredictions
Output:
1 2 3 4 5 6 7
6.344355 10.231586 4.902900 7.500179 5.295711 9.434454 8.879997
8 9 10 11 12 13 14
12.227635 11.097691 7.775063 8.085810 6.399364 7.852107 9.480219
15 16 17 18 19 20
8.982051 10.115840 7.917863 12.089960 7.838881 9.653292

Why is my model so accurate when using knn(), where k=1?

I am currently using genomic expression levels, age, and smoking intensity levels to predict the number of days Lung Cancer Patients have to live. I have a small amount of data; 173 patients and 20,438 variables, including gene expression levels (which make up for 20,436). I have split up my data into test and training, utilizing an 80:20 ratio. There are no missing values in the data.
I am using knn() to train the model. Here is what the code looks like:
prediction <- knn(train = trainData, test = testData, cl = trainAnswers, k=1)
Nothing seems out of the ordinary until you notice that k=1. "Why is k=1?" you may ask. The reason k=1 is because when k=1, the model is the most accurate. This makes no sense to me. There are quite a few concerns:
I am using knn() to predict a continuous variable. I should be using something along the lines of, cox maybe.
The model is waaaaaaay too accurate. Here are a few examples of the test answer and the model's predictions. For the first patient, the number of days to death is 274. The model predicts 268. For the second patient, test: 1147, prediction: 1135. 3rd, test: 354, prediction: 370. 4th, test: 995, prediction 995. How is this possible? Out of the entire test data, the model was only off by and average of 9.0625 days! The median difference was 7 days, and the mode was 6 days. Here is a graph of the results:
Bar Graph.
So I guess my main question is what does knn() do, what does k represent, and how is the model so accurate when k=1? Here is my entire code (I am unable to attach the actual data):
# install.packages(c('caret', 'skimr', 'RANN', 'randomForest', 'fastAdaboost', 'gbm', 'xgboost', 'caretEnsemble', 'C50', 'earth'))
library(caret)
# Gather the data and store it in variables
LUAD <- read.csv('/Users/username/Documents/ClinicalData.csv')
geneData <- read.csv('/Users/username/Documents/GenomicExpressionLevelData.csv')
geneData <- data.frame(geneData)
row.names(geneData) = geneData$X
geneData <- geneData[2:514]
colNamesGeneData <- gsub(".","-",colnames(geneData),fixed = TRUE)
colnames(geneData) = colNamesGeneData
# Organize the data
# Important columns are 148 (smoking), 123 (OS Month, basically how many days old), and the gene data. And column 2 (barcode).
LUAD = data.frame(LUAD$patient, LUAD$TOBACCO_SMOKING_HISTORY_INDICATOR, LUAD$OS_MONTHS, LUAD$days_to_death)[complete.cases(data.frame(LUAD$patient, LUAD$TOBACCO_SMOKING_HISTORY_INDICATOR, LUAD$OS_MONTHS, LUAD$days_to_death)), ]
rownames(LUAD)=LUAD$LUAD.patient
LUAD <- LUAD[2:4]
# intersect(rownames(LUAD),colnames(geneData))
# ind=which(colnames(geneData)=="TCGA-778-7167-01A-11R-2066-07")
gene_expression=geneData[, rownames(LUAD)]
# Merge the two datasets to use the geneomic expression levels in your model
LUAD <- data.frame(LUAD,t(gene_expression))
LUAD.days_to_death <- LUAD[,3]
LUAD <- LUAD[,c(1:2,4:20438)]
LUAD <- data.frame(LUAD.days_to_death,LUAD)
set.seed(401)
# Number of Rows in the training data (createDataPartition(dataSet, percentForTraining, boolReturnAsList))
trainRowNum <- createDataPartition(LUAD$LUAD.days_to_death, p=0.8, list=FALSE)
# Training/Test Dataset
trainData <- LUAD[trainRowNum, ]
testData <- LUAD[-trainRowNum, ]
x = trainData[, c(2:20438)]
y = trainData$LUAD.days_to_death
v = testData[, c(2:20438)]
w = testData$LUAD.days_to_death
# Imputing missing values into the data
preProcess_missingdata_model <- preProcess(trainData, method='knnImpute')
library(RANN)
if (anyNA(trainData)) {
trainData <- predict(preProcess_missingdata_model, newdata = trainData)
}
anyNA(trainData)
# Normalizing the data
preProcess_range_model <- preProcess(trainData, method='range')
trainData <- predict(preProcess_range_model, newdata = trainData)
trainData$LUAD.days_to_death <- y
apply(trainData[,1:20438], 2, FUN=function(x){c('min'=min(x), 'max'=max(x))})
preProcess_range_model_Test <- preProcess(testData, method='range')
testData <- predict(preProcess_range_model_Test, newdata = testData)
testData$LUAD.days_to_death <- w
apply(testData[,1:20438], 2, FUN=function(v){c('min'=min(v), 'max'=max(v))})
# To uncomment, select the text and press 'command' + 'shift' + 'c'
# set.seed(401)
# options(warn=-1)
# subsets <- c(1:10)
# ctrl <- rfeControl(functions = rfFuncs,
# method = "repeatedcv",
# repeats = 5,
# verbose = TRUE)
# lmProfile <- rfe(x=trainData[1:20437], y=trainAnswers,
# sizes = subsets,
# rfeControl = ctrl)
# lmProfile
trainAnswers <- trainData[,1]
testAnswers <- testData[,1]
library(class)
prediction <- knn(train = trainData, test = testData, cl = trainAnswers, k=1)
#install.packages("plotly")
library(plotly)
Test_Question_Number <- c(1:32)
prediction2 <- data.frame(prediction[1:32])
prediction2 <- as.numeric(as.vector(prediction2[c(1:32),]))
data <- data.frame(Test_Question_Number, prediction2, testAnswers)
names(data) <- c("Test Question Number","Prediction","Answer")
p <- plot_ly(data, x = ~Test_Question_Number, y = ~prediction2, type = 'bar', name = 'Prediction') %>%
add_trace(y = ~testAnswers, name = 'Answer') %>%
layout(yaxis = list(title = 'Days to Death'), barmode = 'group')
p
merge <- data.frame(prediction2,testAnswers)
difference <- abs((merge[,1])-(merge[,2]))
difference <- sort(difference)
meanDifference <- mean(difference)
medianDifference <- median(difference)
modeDifference <- names(table(difference))[table(difference)==max(table(difference))]
cat("Mean difference:", meanDifference, "\n")
cat("Median difference:", medianDifference, "\n")
cat("Mode difference:", modeDifference,"\n")
Lastly, for clarification purposes, ClinicalData.csv is the age, days to death, and smoking intensity data. The other .csv is the genomic expression data. The data above line 29 doesn't really matter, so you can just skip to the part of the code where it says "set.seed(401)".
Edit: Some samples of the data:
days_to_death OS_MONTHS
121 3.98
NACC1 2001.5708 2363.8063 1419.879
NACC2 58.2948 61.8157 43.4386
NADK 706.868 1053.4424 732.1562
NADSYN1 1628.7634 912.1034 638.6471
NAE1 832.8825 793.3014 689.7123
NAF1 140.3264 165.4858 186.355
NAGA 1523.3441 1524.4619 1858.9074
NAGK 983.6809 899.869 1168.2003
NAGLU 621.3457 510.9453 1172.511
NAGPA 346.9762 257.5654 275.5533
NAGS 460.7732 107.2116 321.9763
NAIF1 217.1219 202.5108 132.3054
NAIP 101.2305 87.8942 77.261
NALCN 13.9628 36.7031 48.0809
NAMPT 3245.6584 1257.8849 5465.6387
Because K = 1 is the most complex knn model. It has the most flexible decision boundary. It creates an overfit. It will perform well within the training data by poorly on a holdout set (but not always).

KNN Error for Flights Dataset

I am trying to learn how to do KNN in R, and am practicing on the flights dataset from the package nycflights13. I get an error running the below code saying
'train' and 'class' have different lengths
My code:
library(nycflights13)
library(class)
deparr <- na.omit(flights[c(4, 7, 16)])
classframe <- deparr[3]
flights %>% ggvis(~dep_time, ~arr_time, fill = ~distance) %>% layer_points()
set.seed(1234)
ind <- sample(2, nrow(deparr), replace=TRUE, prob=c(0.67, 0.33))
flights.training <- deparr[ind==1, 1:2]
flights.test <- deparr[ind==2, 1:2]
flights.trainlabels <- deparr[ind==1, 3]
flights.testlabels <- deparr[ind==2, 3]
predictions <- knn(train = flights.training, test = flights.test, cl = flights.trainlabels[,1], k = 3)
Here is code that divides up the train and test sets based on percentages. If you want to split out the two subsets in a different way, you should be able to work from this, but it proves that it works.
deparr <- na.omit(flights[c(4, 7, 16)])
set.seed(1234)
# prepare to divide up the full dataset into two groups, 65%/35%
n <- nrow(deparr)
train_n <- round(0.65 * n)
# randomize our data
deparr <- deparr[sample(n)]
# split up the actual data. We will use these as inputs to knn
flights.train <- deparr[1:train_n, ]
flights.test <- deparr[(train_n + 1):n, ]
# target variable, $distance, is in column 3, so exclude from train and test
predictions <- knn(train = flights.train[, 1:2], test = flights.test[, 1:2], cl = flights.train$distance, k = 10)
This runs and I get as a result:
> str(predictions)
Factor w/ 209 levels "80","94","96",..: 121 159 18 54 207 18 94 55 159 136 ...

Predict Logistf

I'm using a R package called logistf to make a Logistc Regression and I saw that there's no predict function for new data in this package and predict package does not work with this, so I found a code that show how making this with new data:
fit<-logistf(Tax ~ L20+L24+L28+L29+L31+L32+L33+L36+S10+S15+S16+S17+S20, data=trainData)
betas <- coef(fit)
X <- model.matrix(fit, data=testData)
probs <- 1 / (1 + exp(-X %*% betas))
I want to make a cross validation version with this using fit$predict and the probabilities that probs generate for me. Has anyone ever done something like this before?
Other thing that I want to know is about fit$predict I'm making a binary logistic regression, and this function returns many values, are these values from class 0 or 1, how can I know this? Thanks
While the code that you wrote works perfectly, there is a concise way of getting the same results seemingly:
brglm_model <- brglm(formula = response ~ predictor , family = "binomial", data = train )
brglm_pred <- predict(object = brglm_model, newdata = test , type = "response")
About the CV, you have to write a few lines of code I guess:
#Setting the number of folds, and number of instances in each fold
n_folds <- 5
fold_size <- nrow(dataset) %/% 5
residual <- nrow(dataset) %% 5
#label the instances based on the number of folds
cv_labels <- c(rep(1,fold_size),rep(2,fold_size), rep(3,fold_size), rep(4,fold_size), rep(5,fold_size), rep(5,residual))
# the error term would differ based on each threshold value
t_seq <- seq(0.1,0.9,by = 0.1)
index_mat <- matrix(ncol = (n_folds+1) , nrow = length(t_seq))
index_mat[,1] <- t_seq
# the main loop for calculation of the CV error on each fold
for (i in 1:5){
train <- dataset %>% filter(cv_labels != i)
test <- dataset %>% filter(cv_labels == i )
brglm_cv_model <- brglm(formula = response_var ~ . , family = "binomial", data = train )
brglm_cv_pred <- predict(object = brglm_model, newdata = test , type = "response")
# error formula that you want, e.g. misclassification
counter <- 0
for (treshold in t_seq ) {
counter <- counter + 1
conf_mat <- table( factor(test$response_var) , factor(brglm_cv_pred>treshold, levels = c("FALSE","TRUE") ))
sen <- conf_mat[2,2]/sum(conf_mat[2,])
# other indices can be computed as follows
#spec <- conf_mat[1,1]/sum(conf_mat[1,])
#prec <- conf_mat[2,2]/sum(conf_mat[,2])
#F1 <- (2*prec * sen)/(prec+sen)
#accuracy <- (conf_mat[1,1]+conf_mat[2,2])/sum(conf_mat)
#here I am only interested in sensitivity
index_mat[counter,(i+1)] <- sen
}
}
# final data.frame would be the mean of sensitivity over each threshold value
final_mat <- matrix(nrow = length(t_seq), ncol = 2 )
final_mat[,1] <- t_seq
final_mat[,2] <- apply(X = index_mat[,-1] , MARGIN = 1 , FUN = mean)
final_mat <- data.frame(final_mat)
colnames(final_mat) <- c("treshold","sensitivity")
#why not having a look at the CV-sensitivity of the model over threshold values?
ggplot(data = final_mat) +
geom_line(aes(x = treshold, y = sensitivity ), color = "blue")

Predict warning-----new data rows <> variable rows

I'm a beginner in R.
I tried to build a model by using a part of samples and predict response by using the rest samples. But when I use predict(), I got a warning message:
'newdata' had 152 rows but variables found have 354 rows
I have searched some answers, but I still can't understand T.T. Please help
library(MASS)
data(Boston)
n <- nrow(Boston)
n_train <- round(.70*n)
train_set <- sample(n,size=n_train,replace = FALSE)
x <- cbind(Boston$lstat,log(Boston$lstat))
y <- Boston$medv
x_train <- x[train_set,]
y_train <- y[train_set]
x_test <- x[-train_set,]
y_test <- y[-train_set]
lm_temp <- lm(y_train~x_train)
y_test_hat <- predict(lm_temp,newdata=data.frame(x_test))
It looks like R is getting confused when you pass a matrix as the independent variables, but then the predict function requires a data frame(which is a list).
You can solve the problem by running your lm on a data frame
library(MASS)
data(Boston)
n <- nrow(Boston)
n_train <- round(.70*n)
train_set <- sample(n,size=n_train,replace = FALSE)
data <- Boston[ , c('medv', 'lstat')]
data$loglstat <- log(data$lstat)
train <- data[train_set, ]
test <- data[-train_set,]
lm_temp <- lm(medv ~ ., data = train)
y_test_hat <- predict(lm_temp,newdata=test)

Resources