I have an issue running a confusionMatrix.
Here is what I do:
rf <- caret::train(tested ~.,
data = training_data,
method = "rf",
trControl = ctrlInside,
metric = "ROC",
na.action = na.exclude)
rf
After I get my model this is the next step I take:
evalResult.rf <- predict(rf, testing_data, type = "prob")
predict_rf <- as.factor(ifelse(evalResult.rf <0.5, "positive", "negative"))
And then I am running my confusion matrix.
cm_rf_forest <- confusionMatrix(predict_rf, testing_data$tested, "positive")
And the error comes after I apply the confusionMatrix:
Error in table(data, reference, dnn = dnn, ...) :
all arguments must have the same length
Nevertheless, I give you bits of my data.
train data:
structure(list(tested = structure(c(1L, 1L, 1L, 1L, 1L,
1L), .Label = c("negative", "positive"), class = "factor"), Gender = structure(c(2L,
2L, 1L, 1L, 2L, 2L), .Label = c("Female", "Male", "Other"), class = "factor"),
Age = c(63, 23, 28, 40, 31, 60), number_days_symptoms = c(1,
1, 16, 1, 14, 1), care_home_worker = structure(c(1L, 2L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
health_care_worker = structure(c(1L, 1L, 1L, 1L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), how_unwell = c(1, 1, 6, 4, 2,
1), self_diagnosis = structure(c(1L, 1L, 2L, 1L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), chills = structure(c(1L, 1L, 2L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
cough = structure(c(1L, 1L, 2L, 2L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), diarrhoea = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
fatigue = structure(c(1L, 2L, 2L, 2L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), headache = structure(c(2L, 2L,
3L, 2L, 2L, 2L), .Label = c("Headcahe", "No", "Yes"), class = "factor"),
loss_smell_taste = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), muscle_ache = structure(c(1L,
1L, 2L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
nasal_congestion = structure(c(1L, 1L, 1L, 2L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), nausea_vomiting = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
shortness_breath = structure(c(1L, 1L, 1L, 1L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), sore_throat = structure(c(1L,
1L, 1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
sputum = structure(c(1L, 1L, 2L, 2L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), temperature = structure(c(4L,
4L, 4L, 4L, 1L, 4L), .Label = c("37.5-38", "38.1-39", "39.1-41",
"No"), class = "factor"), asthma = structure(c(2L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
diabetes_type_one = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), diabetes_type_two = structure(c(2L,
1L, 1L, 1L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
obesity = structure(c(1L, 2L, 2L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), hypertension = structure(c(1L,
1L, 2L, 1L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 2L), .Label = c("No",
"Yes"), class = "factor"), lung_condition = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), kidney_disease = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor")), row.names = c(1L,
3L, 4L, 5L, 6L, 7L), class = "data.frame")
and here is my test_data:
structure(list(tested = structure(c(1L, 1L, 1L, 1L, 1L,
1L), .Label = c("negative", "positive"), class = "factor"), Gender = structure(c(1L,
2L, 1L, 1L, 1L, 2L), .Label = c("Female", "Male", "Other"), class = "factor"),
Age = c(19, 26, 30, 45, 40, 43), number_days_symptoms = c(20,
1, 1, 20, 14, 1), care_home_worker = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
health_care_worker = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), how_unwell = c(7, 6, 6, 6, 6,
2), self_diagnosis = structure(c(2L, 1L, 1L, 2L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), chills = structure(c(2L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
cough = structure(c(2L, 1L, 1L, 2L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), diarrhoea = structure(c(2L, 1L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
fatigue = structure(c(2L, 1L, 1L, 2L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), headache = structure(c(2L, 2L,
2L, 3L, 2L, 3L), .Label = c("Headcahe", "No", "Yes"), class = "factor"),
loss_smell_taste = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), muscle_ache = structure(c(2L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
nasal_congestion = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), nausea_vomiting = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
shortness_breath = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), sore_throat = structure(c(1L,
1L, 1L, 2L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
sputum = structure(c(2L, 1L, 1L, 2L, 1L, 2L), .Label = c("No",
"Yes"), class = "factor"), temperature = structure(c(4L,
4L, 4L, 1L, 1L, 4L), .Label = c("37.5-38", "38.1-39", "39.1-41",
"No"), class = "factor"), asthma = structure(c(1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
diabetes_type_one = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), diabetes_type_two = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
obesity = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), hypertension = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), lung_condition = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), kidney_disease = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor")), row.names = c(2L,
8L, 11L, 14L, 20L, 27L), class = "data.frame")
Additionally, I balance the classes with SMOTE, applied to each resample through ctrlInside.
This is my smote function:
# Custom sampling specification for caret::trainControl(sampling = ...).
# `func` receives the predictors `x` and the outcome `y` of a resample and
# returns a list with the re-balanced `x` and `y`.
# NOTE(review): per caret's documentation for custom sampling lists,
# `first = TRUE` asks for the sampling to run before pre-processing - confirm
# against the caret version in use.
smotest <- list(
  name = "SMOTE with more neighbors!",
  func = function(x, y) {
    # SMOTE() comes from DMwR; it is attached inside the function, as in the
    # original specification.
    library(DMwR)
    dat <- if (is.data.frame(x)) x else as.data.frame(x)
    # Temporarily carry the outcome as a helper column so SMOTE sees one frame.
    dat$.y <- y
    dat <- SMOTE(.y ~ ., data = dat, k = 3, perc.over = 100, perc.under = 200)
    list(
      # Remove only the helper column (exact name match, so predictors whose
      # names merely contain ".y" survive) and stay a data frame even when a
      # single predictor is left.
      x = dat[, colnames(dat) != ".y", drop = FALSE],
      y = dat$.y
    )
  },
  first = TRUE
)
And ctrlInside is this:
# Resampling configuration passed to caret::train(): repeated cross-validation
# scored with twoClassSummary on class probabilities, with the custom
# `smotest` sampling specification applied during resampling.
ctrlInside <- trainControl(
  # Resampling scheme and tuning-grid search.
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  search = "grid",
  # Performance measurement: twoClassSummary needs class probabilities.
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  # Keep hold-out predictions and apply the custom subsampling.
  savePredictions = TRUE,
  sampling = smotest
)
These functions are given just so that you have an idea of what I am doing overall. Is there a reason why this is happening?
You can use complete.cases to predict only on the rows that have no NAs; you also need to work with the matrix of predicted probabilities, as I will show below. Using an example dataset, I set 10 values of one column to NA, and train:
# Example data built from iris: pick 100 training rows, blank out ten
# Petal.Length values, and derive a two-class outcome (versicolor = positive).
# The two sample() calls are kept in their original order because they draw
# from the same random stream.
idx <- sample(nrow(iris), 100)
data <- iris
data$Petal.Length[sample(nrow(data), 10)] <- NA
data$tested <- factor(
  ifelse(data$Species == "versicolor", "positive", "negative")
)
# Column 5 is Species, which the outcome was derived from; drop it.
data <- data[, -5]
training_data <- data[idx, ]
testing_data <- data[-idx, ]

# Random forest tuned on ROC; rows containing NA are excluded when fitting.
rf <- caret::train(
  tested ~ .,
  data = training_data,
  method = "rf",
  trControl = ctrlInside,
  metric = "ROC",
  na.action = na.exclude
)
Run the evaluation and you can see I get the same error:
evalResult.rf <- predict(rf, testing_data, type = "prob")
predict_rf <- as.factor(ifelse(evalResult.rf <0.5, "positive", "negative"))
cm_rf_forest <- confusionMatrix(predict_rf, testing_data$tested, "positive")
Error in table(data, reference, dnn = dnn, ...) :
all arguments must have the same length
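So there are two sources of error. First, you have NAs, and no prediction can be made for those rows. Second, evalResult.rf holds a table of probabilities with one column per class (the first column is the probability of the negative class, the second that of the positive class), so applying ifelse to it yields one label per cell rather than one label per row: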
head(evalResult.rf)
negative positive
3 1.000 0.000
6 1.000 0.000
9 0.948 0.052
12 1.000 0.000
13 0.976 0.024
19 0.998 0.002
To get the classes, take the column with the maximum value in each row and return the corresponding column name, which is the class:
colnames(evalResult.rf)[max.col(evalResult.rf)]
We do now:
# Predict only on fully observed rows so predictions and reference line up.
testing_data <- testing_data[complete.cases(testing_data), ]
evalResult.rf <- predict(rf, testing_data, type = "prob")

# Pick the most probable class per row. max.col() breaks ties at random by
# default; "first" makes the result deterministic.
predicted_class <- colnames(evalResult.rf)[
  max.col(evalResult.rf, ties.method = "first")
]
# Use the reference's levels so a class that is never predicted is still a
# level of the prediction factor.
predict_rf <- factor(predicted_class, levels = levels(testing_data$tested))

cm_rf_forest <- confusionMatrix(
  predict_rf,
  testing_data$tested,
  positive = "positive"
)
Confusion Matrix and Statistics
Reference
Prediction negative positive
negative 33 1
positive 0 11
Accuracy : 0.9778
95% CI : (0.8823, 0.9994)
No Information Rate : 0.7333
P-Value [Acc > NIR] : 1.507e-05
Kappa : 0.9416
I have seen lots of answers regarding this particular error, but I haven't found one that addresses my specific issue. Therefore, here is my problem.
This is what I do:
shortness_breath_data <- data_categ_nosev %>%
dplyr::select(shortness_breath, obesity, asthma, diabetes_type_one, diabetes_type_two, obesity, hypertension, heart_disease, lung_condition, liver_disease, kidney_disease, Covid_tested, Gender)
And this is dput(head(shortness_breath_data)):
structure(list(shortness_breath = structure(c(1L, 2L, 1L, 1L,
1L, 2L), .Label = c("No", "Yes"), class = "factor"), obesity = structure(c(1L,
1L, 2L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
asthma = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), diabetes_type_one = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
diabetes_type_two = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), hypertension = structure(c(1L,
1L, 1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), lung_condition = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), kidney_disease = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
Covid_tested = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("negative",
"positive"), class = "factor"), Gender = structure(c(2L,
1L, 2L, 1L, 1L, 2L), .Label = c("Female", "Male", "Other"
), class = "factor")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"), problems = structure(list(row = c(2910L,
35958L), col = c("how_unwell", "how_unwell"), expected = c("a double",
"a double"), actual = c("How Unwell", "How Unwell"), file = c("'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'",
"'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'"
)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
)))
And I divide this into training and testing dataset.
shortness_breath_data$shortness_breath <- as.factor(shortness_breath_data$shortness_breath)
n <- nrow(shortness_breath_data)
set.seed(22)
trainingdx <- sample(1:n, 0.7 * n)
train <- shortness_breath_data[trainingdx,]
validate <- shortness_breath_data[-trainingdx,]
train %>% distinct(shortness_breath)
validate %>% distinct(shortness_breath)
And, in case it eases your job in finding the issue, I have also provided dput(head(train)) and dput(head(validate)).
train dataset:
structure(list(shortness_breath = structure(c(1L, 1L, 1L, 1L,
1L, 1L), .Label = c("No", "Yes"), class = "factor"), obesity = structure(c(2L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
asthma = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), diabetes_type_one = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
diabetes_type_two = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), hypertension = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), lung_condition = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), kidney_disease = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
Covid_tested = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("negative",
"positive"), class = "factor"), Gender = structure(c(1L,
1L, 1L, 2L, 1L, 2L), .Label = c("Female", "Male", "Other"
), class = "factor")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"), problems = structure(list(row = c(2910L,
35958L), col = c("how_unwell", "how_unwell"), expected = c("a double",
"a double"), actual = c("How Unwell", "How Unwell"), file = c("'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'",
"'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'"
)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
)))
validate dataset:
structure(list(shortness_breath = structure(c(1L, 2L, 2L, 1L,
1L, 1L), .Label = c("No", "Yes"), class = "factor"), obesity = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
asthma = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), diabetes_type_one = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
diabetes_type_two = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), hypertension = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), lung_condition = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), kidney_disease = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
Covid_tested = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("negative",
"positive"), class = "factor"), Gender = structure(c(2L,
1L, 2L, 2L, 1L, 1L), .Label = c("Female", "Male", "Other"
), class = "factor")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"), problems = structure(list(row = c(2910L,
35958L), col = c("how_unwell", "how_unwell"), expected = c("a double",
"a double"), actual = c("How Unwell", "How Unwell"), file = c("'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'",
"'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'"
)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
)))
And then, I build my logistic regression model with stepwise, forward method.
null_model <- glm(shortness_breath ~ 1, data = train, family = "binomial")
fm_shortness_breath <- glm(shortness_breath ~., data = train, family = "binomial")
stepmodel <- step(null_model, scope = list(lower = null_model, upper = fm_shortness_breath), direction = "forward")
Then I get my summary model and store the predictions in the source data frame.
summary(stepmodel)
validate$pred <- predict(stepmodel, validate, type = "response")
validate$real <- validate$shortness_breath
train$pred <- predict(stepmodel, train, type = "response")
train$real <- train$shortness_breath
Then I plot my ROC curve with no problem:
plot.roc(validate$real, validate$pred, col = "red", main = "ROC Validation Set", percent = TRUE, print.auc = TRUE)
Yet, when I am trying to get my confusion matrix, this is where I get my error. But this is my code:
cm_stepmodel <- confusionMatrix(stepmodel, validate)
And then, the error comes in:
Error: `data` and `reference` should be factors with the same levels.
With Show Traceback:
3.
stop("`data` and `reference` should be factors with the same levels.", call. = FALSE)
2.
confusionMatrix.default(stepmodel, validate)
1.
confusionMatrix(stepmodel, validate)
I simply do not see the problem. I tried several other options, but they did not work. I have reproduced, step by step, the exact approach I am taking, and I still do not get my answer. Also, I have tagged this issue with RMarkdown as well, alongside caret and R, just in case.
Also, libraries used are:
library(tidyverse)
library(conflicted)
library(tidymodels)
library(ggrepel)
library(corrplot)
library(dplyr)
library(corrr)
library(themis)
library(rsample)
library(caret)
library(forcats)
library(rcompanion)
library(MASS)
library(pROC)
library(ROCR)
library(data.table)
Try to convert your predicted probabilities to labels, and then run your confusionMatrix on this:
# Predicted probabilities from the stepwise logistic model.
validate$pred <- predict(stepmodel, validate, type = "response")

# Turn probabilities into labels. Reusing the levels of the observed outcome
# keeps both factors aligned even if one class is never predicted.
validate$pred_label <- factor(
  ifelse(validate$pred >= 0.5, "Yes", "No"),
  levels = levels(validate$real)
)

# confusionMatrix(data, reference): predictions go first, observed values
# second; swapping them exchanges sensitivity/specificity with the
# predictive values.
confusionMatrix(validate$pred, validate$real) # Error: `pred` is numeric, not a factor
# "Yes" is named as the event of interest; otherwise the first level ("No")
# is treated as the positive class.
confusionMatrix(validate$pred_label, validate$real, positive = "Yes") # This will work
Check that you are correctly assigning labels as in your original dataset in the validate$pred_label statement.
I'm not particularly familiar with confusionMatrix, but the general idea is that you make predictions of labels and compare to the actual labels of your data. It threw an error because you were comparing labels with probabilities -- you needed to assign the labels. Please correct me if I made a conceptual error or coding mistake above.
reshape2 is a package which allows a powerful array of data transformations, through its two-part melt/cast approach. However, like all tools it embeds assumptions which limit the cases it can handle.
What data reshaping problem can reshape2 not handle in its current form?
The ideal answer will include:
A description of the type of use cases where this data shape is typically found
Sample data
Code to accomplish the transformation (ideally using as much of the transformation with reshape2 as possible)
Example
"Wide" data is common in panel applications.
# Melt a data frame whose measure columns encode several variables in their
# names (e.g. "price_1_2") and split those names into separate columns.
#
# Arguments:
#   data          Data frame in wide format.
#   id.vars       Identifier columns, passed through to melt().
#   new.names     Names for the name components that follow the first one;
#                 one entry per separator in a wide column name.
#   sep           Separator used inside the wide column names. It is matched
#                 literally, so the default "." is not treated as a regex.
#   variable.name Name given to the leading component of each column name.
#   ...           Further arguments forwarded to melt().
#
# Returns the melted data with the `variable` column replaced by
# `variable.name` plus one character column per entry of `new.names`.
#
# Relies on melt() being available (reshape2 attached by the caller).
# NOTE(review): the name reads like an S3 method of melt() for class "wide";
# it is kept unchanged for backward compatibility.
melt.wide <- function(data, id.vars, new.names, sep = ".",
                      variable.name = "variable", ...) {
  stopifnot(is.character(sep), length(sep) == 1L, nzchar(sep))

  # Drop one trailing separator so "x_1_" is handled like "x_1".
  cols <- colnames(data)
  trailing <- endsWith(cols, sep)
  cols[trailing] <- substr(
    cols[trailing], 1L, nchar(cols[trailing]) - nchar(sep)
  )
  colnames(data) <- cols

  # Every column whose name contains the separator is treated as wide.
  wide.vars <- cols[grepl(sep, cols, fixed = TRUE)]
  if (length(wide.vars) == 0L) {
    stop("no column name contains the separator '", sep, "'", call. = FALSE)
  }

  # Split each wide name once and check that all of them have the shape
  # <variable><sep><new.names[1]><sep>...
  parts_by_var <- strsplit(wide.vars, sep, fixed = TRUE)
  n.parts <- length(new.names) + 1L
  if (any(lengths(parts_by_var) != n.parts)) {
    stop(
      "every wide column name must split into ", n.parts,
      " parts (one for `variable.name` plus one per entry of `new.names`)",
      call. = FALSE
    )
  }
  names(parts_by_var) <- wide.vars

  # Melt
  data.melt <- melt(data, id.vars = id.vars, measure.vars = wide.vars, ...)

  # Look up the pre-split components for every melted row and bind them
  # row-wise into a data frame.
  parts <- parts_by_var[as.character(data.melt$variable)]
  new <- as.data.frame(
    matrix(
      as.character(unlist(parts, use.names = FALSE)),
      ncol = n.parts,
      byrow = TRUE
    ),
    stringsAsFactors = FALSE
  )
  colnames(new) <- c(variable.name, new.names)

  keep <- setdiff(colnames(data.melt), "variable")
  data.melt <- data.melt[, keep, drop = FALSE]
  cbind(data.melt, new)
}
# Apply melt.wide() to the sample data frame `so`: every column whose name
# contains "_" is a choice variable named <variable>_<set>_<option>.
choice.vars <- grep("_", colnames(so), fixed = TRUE, value = TRUE)
melt.wide(
  so[, c("WorkerId", choice.vars)],
  id.vars = "WorkerId",
  new.names = c("set", "option"),
  sep = "_"
)
The new function returns a melted object that can then be *cast.
Where the data is:
so <- structure(list(WorkerId = c(12L, 13L, 27L, 25L, 30L, 8L), pio_1_1 = structure(c(2L,
1L, 2L, 1L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
pio_1_2 = structure(c(1L, 2L, 2L, 2L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), pio_1_3 = structure(c(1L, 1L,
1L, 1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
pio_1_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"),
pio_2_1 = structure(c(1L, 2L, 2L, 1L, 1L, 2L), .Label = c("No",
"Yes"), class = "factor"), pio_2_2 = structure(c(1L, 1L,
1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
pio_2_3 = structure(c(2L, 2L, 2L, 2L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), pio_2_4 = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = "No", class = "factor"), pio_3_1 = structure(c(2L,
2L, 2L, 2L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
pio_3_2 = structure(c(2L, 1L, 1L, 1L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), pio_3_3 = structure(c(2L, 1L,
2L, 1L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
pio_3_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"),
pio_4_1 = structure(c(2L, 1L, 2L, 2L, 1L, 2L), .Label = c("No",
"Yes"), class = "factor"), pio_4_2 = structure(c(2L, 2L,
2L, 1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
pio_4_3 = structure(c(1L, 2L, 1L, 1L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), pio_4_4 = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = "No", class = "factor"), caremgmt_1_1 = structure(c(2L,
2L, 1L, 2L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
caremgmt_1_2 = structure(c(1L, 2L, 2L, 2L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), caremgmt_1_3 = structure(c(1L,
1L, 1L, 1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
caremgmt_1_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"),
caremgmt_2_1 = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), caremgmt_2_2 = structure(c(1L,
2L, 1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
caremgmt_2_3 = structure(c(2L, 1L, 2L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), caremgmt_2_4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"), caremgmt_3_1 = structure(c(2L,
1L, 2L, 1L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
caremgmt_3_2 = structure(c(2L, 1L, 2L, 2L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), caremgmt_3_3 = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
caremgmt_3_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"),
caremgmt_4_1 = structure(c(1L, 1L, 2L, 1L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), caremgmt_4_2 = structure(c(2L,
2L, 2L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
caremgmt_4_3 = structure(c(1L, 1L, 1L, 1L, 1L, 2L), .Label = c("No",
"Yes"), class = "factor"), caremgmt_4_4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"), prev_1_1 = structure(c(1L,
1L, 2L, 1L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
prev_1_2 = structure(c(1L, 2L, 1L, 2L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), prev_1_3 = structure(c(2L, 1L,
1L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
prev_1_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"),
prev_2_1 = structure(c(1L, 1L, 2L, 1L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), prev_2_2 = structure(c(2L, 2L,
1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
prev_2_3 = structure(c(1L, 2L, 1L, 1L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), prev_2_4 = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = "No", class = "factor"), prev_3_1 = structure(c(1L,
2L, 1L, 1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
prev_3_2 = structure(c(1L, 1L, 2L, 1L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), prev_3_3 = structure(c(2L, 2L,
1L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
prev_3_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"),
prev_4_1 = structure(c(1L, 2L, 2L, 1L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), prev_4_2 = structure(c(1L, 1L,
2L, 1L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
prev_4_3 = structure(c(1L, 1L, 1L, 2L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), prev_4_4 = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = "No", class = "factor"), price_1_1 = structure(c(30L,
12L, 1L, 16L, 28L, 17L), .Label = c("$2,500", "$2,504", "$2,507",
"$2,509", "$2,512", "$2,513", "$2,515", "$2,526", "$2,547",
"$2,548", "$2,578", "$2,588", "$2,594", "$2,605", "$2,607",
"$2,617", "$2,618", "$2,622", "$2,635", "$2,649", "$2,670",
"$2,672", "$2,679", "$2,681", "$2,698", "$2,704", "$2,721",
"$2,782", "$2,851", "$2,884", "$2,919", "$2,925", "$2,935",
"$3,022"), class = "factor"), price_1_2 = structure(c(1L,
19L, 5L, 17L, 7L, 1L), .Label = c("$2,500", "$2,501", "$2,502",
"$2,504", "$2,513", "$2,515", "$2,517", "$2,532", "$2,535",
"$2,558", "$2,564", "$2,571", "$2,575", "$2,578", "$2,608",
"$2,633", "$2,634", "$2,675", "$2,678", "$2,687", "$2,730",
"$2,806", "$2,827", "$2,848", "$2,891", "$2,901", "$2,923",
"$2,933", "$2,937", "$2,958", "$2,987"), class = "factor"),
price_1_3 = structure(c(11L, 1L, 1L, 8L, 19L, 14L), .Label = c("$2,500",
"$2,504", "$2,507", "$2,513", "$2,516", "$2,518", "$2,564",
"$2,579", "$2,580", "$2,583", "$2,584", "$2,592", "$2,604",
"$2,608", "$2,639", "$2,643", "$2,646", "$2,665", "$2,667",
"$2,695", "$2,698", "$2,709", "$2,710", "$2,713", "$2,714",
"$2,750", "$2,757", "$2,876", "$2,978", "$2,984", "$3,024",
"$3,059"), class = "factor"), price_1_4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "$2,500", class = "factor"),
price_2_1 = structure(c(27L, 32L, 19L, 22L, 4L, 26L), .Label = c("$2,500",
"$2,504", "$2,505", "$2,510", "$2,511", "$2,512", "$2,515",
"$2,517", "$2,518", "$2,529", "$2,533", "$2,537", "$2,551",
"$2,553", "$2,574", "$2,593", "$2,600", "$2,605", "$2,608",
"$2,612", "$2,613", "$2,618", "$2,639", "$2,657", "$2,714",
"$2,730", "$2,747", "$2,764", "$2,771", "$2,773", "$2,813",
"$2,859", "$2,901", "$3,019", "$3,037"), class = "factor"),
price_2_2 = structure(c(12L, 2L, 1L, 27L, 1L, 7L), .Label = c("$2,500",
"$2,502", "$2,510", "$2,514", "$2,515", "$2,516", "$2,517",
"$2,518", "$2,520", "$2,521", "$2,523", "$2,536", "$2,544",
"$2,575", "$2,583", "$2,592", "$2,602", "$2,624", "$2,644",
"$2,652", "$2,662", "$2,677", "$2,720", "$2,761", "$2,765",
"$2,770", "$2,772", "$2,835", "$2,873", "$2,911", "$2,950",
"$2,962"), class = "factor"), price_2_3 = structure(c(32L,
1L, 8L, 33L, 29L, 11L), .Label = c("$2,500", "$2,506", "$2,507",
"$2,510", "$2,511", "$2,512", "$2,515", "$2,517", "$2,527",
"$2,528", "$2,540", "$2,554", "$2,562", "$2,565", "$2,568",
"$2,581", "$2,597", "$2,611", "$2,616", "$2,631", "$2,652",
"$2,663", "$2,671", "$2,672", "$2,685", "$2,727", "$2,731",
"$2,742", "$2,771", "$2,778", "$2,781", "$2,970", "$2,984",
"$2,986", "$3,030"), class = "factor"), price_2_4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "$2,500", class = "factor"),
price_3_1 = structure(c(24L, 1L, 28L, 7L, 18L, 21L), .Label = c("$2,500",
"$2,501", "$2,503", "$2,505", "$2,509", "$2,512", "$2,535",
"$2,537", "$2,542", "$2,553", "$2,556", "$2,560", "$2,561",
"$2,574", "$2,584", "$2,618", "$2,624", "$2,629", "$2,637",
"$2,664", "$2,761", "$2,840", "$2,875", "$2,883", "$2,891",
"$2,933", "$2,953", "$2,978", "$3,039", "$3,043", "$3,067"
), class = "factor"), price_3_2 = structure(c(3L, 1L, 5L,
19L, 25L, 9L), .Label = c("$2,500", "$2,501", "$2,503", "$2,504",
"$2,512", "$2,517", "$2,540", "$2,543", "$2,546", "$2,560",
"$2,567", "$2,573", "$2,586", "$2,592", "$2,594", "$2,603",
"$2,604", "$2,606", "$2,628", "$2,633", "$2,635", "$2,693",
"$2,696", "$2,714", "$2,734", "$2,739", "$2,770", "$2,791",
"$2,797", "$2,936", "$2,967", "$3,021", "$3,024"), class = "factor"),
price_3_3 = structure(c(26L, 7L, 5L, 32L, 10L, 24L), .Label = c("$2,500",
"$2,501", "$2,502", "$2,505", "$2,506", "$2,507", "$2,508",
"$2,509", "$2,512", "$2,515", "$2,519", "$2,547", "$2,556",
"$2,574", "$2,587", "$2,592", "$2,608", "$2,616", "$2,621",
"$2,635", "$2,638", "$2,667", "$2,671", "$2,688", "$2,694",
"$2,700", "$2,717", "$2,759", "$2,809", "$2,864", "$2,891",
"$2,912", "$3,011", "$3,012"), class = "factor"), price_3_4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "$2,500", class = "factor"),
price_4_1 = structure(c(29L, 13L, 16L, 24L, 33L, 19L), .Label = c("$2,500",
"$2,505", "$2,506", "$2,508", "$2,511", "$2,525", "$2,549",
"$2,562", "$2,577", "$2,582", "$2,586", "$2,591", "$2,621",
"$2,636", "$2,654", "$2,670", "$2,722", "$2,726", "$2,733",
"$2,744", "$2,745", "$2,755", "$2,768", "$2,805", "$2,817",
"$2,827", "$2,835", "$2,888", "$2,925", "$2,959", "$3,001",
"$3,027", "$3,061", "$3,071"), class = "factor"), price_4_2 = structure(c(33L,
31L, 21L, 16L, 25L, 13L), .Label = c("$2,500", "$2,502",
"$2,503", "$2,505", "$2,506", "$2,511", "$2,513", "$2,516",
"$2,529", "$2,539", "$2,547", "$2,554", "$2,557", "$2,562",
"$2,567", "$2,579", "$2,581", "$2,583", "$2,585", "$2,591",
"$2,612", "$2,629", "$2,640", "$2,670", "$2,695", "$2,726",
"$2,737", "$2,788", "$2,790", "$2,798", "$2,852", "$3,031",
"$3,063"), class = "factor"), price_4_3 = structure(c(4L,
30L, 4L, 19L, 1L, 27L), .Label = c("$2,500", "$2,504", "$2,507",
"$2,509", "$2,511", "$2,512", "$2,514", "$2,516", "$2,543",
"$2,552", "$2,562", "$2,575", "$2,578", "$2,581", "$2,594",
"$2,614", "$2,615", "$2,617", "$2,636", "$2,640", "$2,641",
"$2,652", "$2,749", "$2,755", "$2,805", "$2,812", "$2,867",
"$2,906", "$2,910", "$2,917", "$2,924", "$2,927", "$2,961",
"$3,028", "$3,053", "$3,054"), class = "factor"), price_4_4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "$2,500", class = "factor"),
plan_1_1 = structure(c(2L, 2L, 2L, 1L, 1L, 2L), .Label = c("",
"X"), class = "factor"), plan_1_2 = structure(c(1L, 1L, 1L,
2L, 1L, 1L), .Label = c("", "X"), class = "factor"), plan_1_3 = structure(c(1L,
1L, 1L, 1L, 2L, 1L), .Label = c("", "X"), class = "factor"),
plan_1_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"X"), class = "factor"), plan_2_1 = structure(c(1L, 2L, 1L,
2L, 2L, 2L), .Label = c("", "X"), class = "factor"), plan_2_2 = structure(c(1L,
1L, 2L, 1L, 1L, 1L), .Label = c("", "X"), class = "factor"),
plan_2_3 = structure(c(2L, 1L, 1L, 1L, 2L, 1L), .Label = c("",
"X"), class = "factor"), plan_2_4 = structure(c(1L, 1L, 1L,
1L, 1L, 1L), .Label = c("", "X"), class = "factor"), plan_3_1 = structure(c(1L,
2L, 1L, 1L, 2L, 1L), .Label = c("", "X"), class = "factor"),
plan_3_2 = structure(c(1L, 1L, 1L, 2L, 1L, 1L), .Label = c("",
"X"), class = "factor"), plan_3_3 = structure(c(2L, 1L, 1L,
1L, 1L, 2L), .Label = c("", "X"), class = "factor"), plan_3_4 = structure(c(1L,
1L, 2L, 1L, 1L, 1L), .Label = c("", "X"), class = "factor"),
plan_4_1 = structure(c(2L, 2L, 1L, 1L, 1L, 1L), .Label = c("",
"X"), class = "factor"), plan_4_2 = structure(c(2L, 1L, 1L,
2L, 1L, 1L), .Label = c("", "X"), class = "factor"), plan_4_3 = structure(c(1L,
1L, 1L, 1L, 2L, 2L), .Label = c("", "X"), class = "factor"),
plan_4_4 = structure(c(1L, 1L, 2L, 1L, 1L, 1L), .Label = c("",
"X"), class = "factor")), .Names = c("WorkerId", "pio_1_1",
"pio_1_2", "pio_1_3", "pio_1_4", "pio_2_1", "pio_2_2", "pio_2_3",
"pio_2_4", "pio_3_1", "pio_3_2", "pio_3_3", "pio_3_4", "pio_4_1",
"pio_4_2", "pio_4_3", "pio_4_4", "caremgmt_1_1", "caremgmt_1_2",
"caremgmt_1_3", "caremgmt_1_4", "caremgmt_2_1", "caremgmt_2_2",
"caremgmt_2_3", "caremgmt_2_4", "caremgmt_3_1", "caremgmt_3_2",
"caremgmt_3_3", "caremgmt_3_4", "caremgmt_4_1", "caremgmt_4_2",
"caremgmt_4_3", "caremgmt_4_4", "prev_1_1", "prev_1_2", "prev_1_3",
"prev_1_4", "prev_2_1", "prev_2_2", "prev_2_3", "prev_2_4", "prev_3_1",
"prev_3_2", "prev_3_3", "prev_3_4", "prev_4_1", "prev_4_2", "prev_4_3",
"prev_4_4", "price_1_1", "price_1_2", "price_1_3", "price_1_4",
"price_2_1", "price_2_2", "price_2_3", "price_2_4", "price_3_1",
"price_3_2", "price_3_3", "price_3_4", "price_4_1", "price_4_2",
"price_4_3", "price_4_4", "plan_1_1", "plan_1_2", "plan_1_3",
"plan_1_4", "plan_2_1", "plan_2_2", "plan_2_3", "plan_2_4", "plan_3_1",
"plan_3_2", "plan_3_3", "plan_3_4", "plan_4_1", "plan_4_2", "plan_4_3",
"plan_4_4"), row.names = c(NA, 6L), class = "data.frame")
... almost a year later...
This came to mind the other day, and I have a sneaking suspicion that it is what you tried to show in your example, but unfortunately, your example code doesn't run!
melt sometimes takes things a bit too far for me when making my data "long". Sometimes, even though it is not what would necessarily be called "tidy data", I prefer to have a "semi-long" data.frame. This is easily achieved using base R's reshape, but requires a few extra steps with the "reshape2" package, as demonstrated below:
Prerequisite: sample data.
# Sample data: two identifier columns (ID.1, ID.2) followed by two measures,
# V and W, each recorded three times and named <measure>.<time>.
# The seed is fixed so the draws are repeatable; the sample() calls are
# evaluated in the order written, so reordering the columns changes the values.
# NOTE(review): the printed values that follow look like they were produced
# by R's older sample() algorithm; on R >= 3.6 the draws will likely differ
# unless RNGkind(sample.kind = "Rounding") is set first - confirm.
set.seed(1)
myDf <- data.frame(
# Identifiers: a letter drawn from a-e and a running row number.
ID.1 = sample(letters[1:5], 5, replace = TRUE),
ID.2 = 1:5,
# Numeric measure V at times 1-3.
V.1 = sample(10:14, 5, replace = TRUE),
V.2 = sample(5:9, 5, replace = TRUE),
V.3 = sample(3:14, 5, replace = TRUE),
# Letter-valued measure W at times 1-3.
W.1 = sample(LETTERS, 5, replace = TRUE),
W.2 = sample(LETTERS, 5, replace = TRUE),
W.3 = sample(LETTERS, 5, replace = TRUE)
)
# Print the data frame.
myDf
# ID.1 ID.2 V.1 V.2 V.3 W.1 W.2 W.3
# 1 b 1 14 6 8 Y K M
# 2 b 2 14 5 11 F A P
# 3 c 3 13 8 14 Q J M
# 4 e 4 13 6 7 D W E
# 5 b 5 10 8 12 G I V
The "semi-long" output that I'm looking for. Easily achieved with base R's reshape.
reshape(myDf, direction = "long", idvar=1:2, varying = 3:ncol(myDf))
# ID.1 ID.2 time V W
# b.1.1 b 1 1 14 Y
# b.2.1 b 2 1 14 F
# c.3.1 c 3 1 13 Q
# e.4.1 e 4 1 13 D
# b.5.1 b 5 1 10 G
# b.1.2 b 1 2 6 K
# b.2.2 b 2 2 5 A
# c.3.2 c 3 2 8 J
# e.4.2 e 4 2 6 W
# b.5.2 b 5 2 8 I
# b.1.3 b 1 3 8 M
# b.2.3 b 2 3 11 P
# c.3.3 c 3 3 14 M
# e.4.3 e 4 3 7 E
# b.5.3 b 5 3 12 V
melt is great if you wanted the equivalent of stack, especially since stack discards all factor variables, which is frustrating when read.table and family defaults to stringsAsFactors = TRUE. (You can make it work, but you need to convert the relevant columns to character before you can use stack). But, it is not what I'm looking for, in particular because of how it has handled the "variable" column.
library(reshape2)
myDfL <- melt(myDf, id.vars=1:2)
head(myDfL)
# ID.1 ID.2 variable value
# 1 b 1 V.1 14
# 2 b 2 V.1 14
# 3 c 3 V.1 13
# 4 e 4 V.1 13
# 5 b 5 V.1 10
# 6 b 1 V.2 6
To fix this, one needs to first split the "variable" column, and then use dcast to get the same format of output as you would get from reshape.
myDfL <- cbind(myDfL, colsplit(myDfL$variable, "\\.", names=c("var", "time")))
dcast(myDfL, ID.1 + ID.2 + time ~ var, value.var="value")
# ID.1 ID.2 time V W
# 1 b 1 1 14 Y
# 2 b 1 2 6 K
# 3 b 1 3 8 M
# 4 b 2 1 14 F
# 5 b 2 2 5 A
# 6 b 2 3 11 P
# 7 b 5 1 10 G
# 8 b 5 2 8 I
# 9 b 5 3 12 V
# 10 c 3 1 13 Q
# 11 c 3 2 8 J
# 12 c 3 3 14 M
# 13 e 4 1 13 D
# 14 e 4 2 6 W
# 15 e 4 3 7 E