Group by several columns and print - r

I have a data frame where I want at the end to print the results of several columns grouped by the number of observations of each column and their frequency separately.
This is what I have done, but it gives me the count and freq of the total but I want for each column, q1a, q1a_30d,q1a_60d,q1a_90d
a<- df %>% group_by(q1a, q1a_30d, q1a_60d,q1a_90d) %>% summarise(cnt = n()) %>%mutate(freq = formattable::percent(cnt / sum(cnt),1))
and then for print
kable( a, col.names = c(" ", "cnt", "freq"),align = c("lcr"),longtable = T, booktabs = T, valign = 't', escape = F, caption = '<b> Wore a face covering or mask<b>') %>%
kable_styling(bootstrap_options = c("striped", "hold_position"),full_width = T,position = "center",html_font = "Arial") %>%
add_header_above(c("Baseline", "30 days", "60 days", "90 days"))%>%
column_spec(border_left = T, border_right = T)
dput(a[1:10, ])
structure(list(q1a = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L), .Label = c("All of the time", "Very frequently", "Somewhat frequently", "Never", "No answer"), class = "factor"), q1a_30d = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,2L), .Label = c("All of the time", "Very frequently", "Somewhat frequently", "Never", "No answer"), class = "factor"), q1a_60d = structure(c(1L, 1L, 1L, 1L, 2L, 3L, 5L, 1L, 2L, 2L), .Label = c("All of the time", "Very frequently", "Somewhat frequently", "Never", "No answer"), class = "factor"), q1a_90d = structure(c(1L, 2L, 3L, 5L,5L, 1L, 5L, 1L, 1L, 3L), .Label = c("All of the time", "Very frequently", "Somewhat frequently", "Never", "No answer"), class = "factor"), cnt = c(8L, 1L, 1L, 13L, 1L, 1L, 14L, 4L, 1L, 1L), freq = structure(c(0.347826086956522, 0.0434782608695652, 0.0434782608695652, 0.565217391304348, 1, 1, 1, 1, 0.5, 0.5), formattable = list(formatter = "formatC", format = list(format = "f", digits = 1), preproc = "percent_preproc", postproc = "percent_postproc"), class = c("formattable", "numeric"))), row.names = c(NA, -10L), groups = structure(list(q1a = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("All of the time", "Very frequently", "Somewhat frequently", "Never", "No answer"), class = "factor"), q1a_30d = structure(c(1L, 1L, 1L, 1L,2L, 2L), .Label = c("All of the time", "Very frequently","Somewhat frequently", "Never", "No answer"), class = "factor"),q1a_60d = structure(c(1L, 2L, 3L, 5L, 1L, 2L), .Label = c("All of the time","Very frequently", "Somewhat frequently", "Never", "No answer"), class = "factor"), .rows = structure(list(1:4, 5L, 6L,7L, 8L, 9:10), ptype = integer(0), class = ("vctrs_list_of","vctrs_vctr", "list"))), row.names = c(NA, -6L), class = c("tbl_df","tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", "tbl_df", "tbl", "data.frame"))

Related

cut.default error in heatmap generation R

I want to generate a heatmap from a 8*6 dataframe. The last row in the dataframe has the information to annotate the columns. Structure of the dataframe is as follows:
heatmap_try <-structure(list(BGC0000041 = structure(c(1L, 2L, 1L, 1L, 1L, 3L
), .Label = c("0", "0.447458977", "a"), class = "factor"), BGC0000128 = structure(c(1L,
1L, 1L, 3L, 2L, 4L), .Label = c("0", "1.785875195", "4.093659107",
"a"), class = "factor"), BGC0000287 = structure(c(1L, 1L, 1L,
3L, 2L, 4L), .Label = c("0", "1.785875195", "4.456229186", "b"
), class = "factor"), BGC0000294 = structure(c(3L, 1L, 2L, 4L,
1L, 5L), .Label = c("0", "2.035046947", "3.230553742", "3.286304185",
"b"), class = "factor"), BGC0000295 = structure(c(1L, 1L, 1L,
2L, 1L, 3L), .Label = c("0", "2.286304185", "c"), class = "factor"),
BGC0000308 = structure(c(4L, 2L, 3L, 5L, 1L, 6L), .Label = c("6.277728291",
"6.313707588", "6.607936616", "6.622871165", "6.64385619",
"c"), class = "factor"), BGC0000323 = structure(c(1L, 2L,
1L, 1L, 1L, 3L), .Label = c("0", "0.447458977", "c"), class = "factor"),
BGC0000328 = structure(c(1L, 2L, 1L, 1L, 1L, 3L), .Label = c("0",
"0.447458977", "c"), class = "factor")), class = "data.frame", row.names = c("Gut",
"Oral", "Anterior_nares", "Retroauricular_crease", "Vagina",
"AL"))
My code for heatmap generation is as follows (I am using pheatmap library):
library(pheatmap)
heatmap_data1 <- heatmap_try[ c(1:5), c(1:8) ]
anotation_data <- as.data.frame(t(heatmap_try[6, ]))
row.names(anotation_data) <- colnames(heatmap_data1)
pheatmap(heatmap_data1, annotation_col = anotation_data, color = colorRampPalette(c("white","blue"))(n=100),cellwidth = 40,cellheight = 6,fontsize_row = 5,cluster_rows = F,cluster_cols = F)
However, I am getting the following error:
Error in cut.default(x, breaks = breaks, include.lowest = T) :
'x' must be numeric
What I am doing wrong?
Thanks!
This is because the columns of heatmap_data1 are factors, they need to be numeric. One way to convert is with:
heatmap_data1_num <- as.data.frame(lapply(heatmap_data1,
function(x) as.numeric(as.character(x))))
# then as before
pheatmap(heatmap_data1_num, annotation_col = anotation_data, color = colorRampPalette(c("white","blue"))(n=100),cellwidth = 40,cellheight = 6,fontsize_row = 5,cluster_rows = F,cluster_cols = F)

Error in table(data, reference, dnn = dnn, ...) : all arguments must have the same length when run confusionMatrix with caret, in R

I have an issue running a confusionMatrix.
here is what I do:
rf <- caret::train(tested ~.,
data = training_data,
method = "rf",
trControl = ctrlInside,
metric = "ROC",
na.action = na.exclude)
rf
After I get my model this is the next step I take:
evalResult.rf <- predict(rf, testing_data, type = "prob")
predict_rf <- as.factor(ifelse(evalResult.rf <0.5, "positive", "negative"))
And then I am running my confusion matrix.
cm_rf_forest <- confusionMatrix(predict_rf, testing_data$tested, "positive")
And the error comes after I apply the confusionMatrix:
Error in table(data, reference, dnn = dnn, ...) :
all arguments must have the same length
Nevertheless, I give you bits of my data.
train data:
structure(list(tested = structure(c(1L, 1L, 1L, 1L, 1L,
1L), .Label = c("negative", "positive"), class = "factor"), Gender = structure(c(2L,
2L, 1L, 1L, 2L, 2L), .Label = c("Female", "Male", "Other"), class = "factor"),
Age = c(63, 23, 28, 40, 31, 60), number_days_symptoms = c(1,
1, 16, 1, 14, 1), care_home_worker = structure(c(1L, 2L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
health_care_worker = structure(c(1L, 1L, 1L, 1L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), how_unwell = c(1, 1, 6, 4, 2,
1), self_diagnosis = structure(c(1L, 1L, 2L, 1L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), chills = structure(c(1L, 1L, 2L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
cough = structure(c(1L, 1L, 2L, 2L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), diarrhoea = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
fatigue = structure(c(1L, 2L, 2L, 2L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), headache = structure(c(2L, 2L,
3L, 2L, 2L, 2L), .Label = c("Headcahe", "No", "Yes"), class = "factor"),
loss_smell_taste = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), muscle_ache = structure(c(1L,
1L, 2L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
nasal_congestion = structure(c(1L, 1L, 1L, 2L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), nausea_vomiting = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
shortness_breath = structure(c(1L, 1L, 1L, 1L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), sore_throat = structure(c(1L,
1L, 1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
sputum = structure(c(1L, 1L, 2L, 2L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), temperature = structure(c(4L,
4L, 4L, 4L, 1L, 4L), .Label = c("37.5-38", "38.1-39", "39.1-41",
"No"), class = "factor"), asthma = structure(c(2L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
diabetes_type_one = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), diabetes_type_two = structure(c(2L,
1L, 1L, 1L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
obesity = structure(c(1L, 2L, 2L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), hypertension = structure(c(1L,
1L, 2L, 1L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 2L), .Label = c("No",
"Yes"), class = "factor"), lung_condition = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), kidney_disease = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor")), row.names = c(1L,
3L, 4L, 5L, 6L, 7L), class = "data.frame")
and here is my test_data:
structure(list(tested = structure(c(1L, 1L, 1L, 1L, 1L,
1L), .Label = c("negative", "positive"), class = "factor"), Gender = structure(c(1L,
2L, 1L, 1L, 1L, 2L), .Label = c("Female", "Male", "Other"), class = "factor"),
Age = c(19, 26, 30, 45, 40, 43), number_days_symptoms = c(20,
1, 1, 20, 14, 1), care_home_worker = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
health_care_worker = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), how_unwell = c(7, 6, 6, 6, 6,
2), self_diagnosis = structure(c(2L, 1L, 1L, 2L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), chills = structure(c(2L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
cough = structure(c(2L, 1L, 1L, 2L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), diarrhoea = structure(c(2L, 1L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
fatigue = structure(c(2L, 1L, 1L, 2L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), headache = structure(c(2L, 2L,
2L, 3L, 2L, 3L), .Label = c("Headcahe", "No", "Yes"), class = "factor"),
loss_smell_taste = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), muscle_ache = structure(c(2L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
nasal_congestion = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), nausea_vomiting = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
shortness_breath = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), sore_throat = structure(c(1L,
1L, 1L, 2L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
sputum = structure(c(2L, 1L, 1L, 2L, 1L, 2L), .Label = c("No",
"Yes"), class = "factor"), temperature = structure(c(4L,
4L, 4L, 1L, 1L, 4L), .Label = c("37.5-38", "38.1-39", "39.1-41",
"No"), class = "factor"), asthma = structure(c(1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
diabetes_type_one = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), diabetes_type_two = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
obesity = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), hypertension = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), lung_condition = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), kidney_disease = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor")), row.names = c(2L,
8L, 11L, 14L, 20L, 27L), class = "data.frame")
Additionally, I perform a smote balancing class, on a subsample in ctrInside.
This is my smote function:
smotest <- list(name = "SMOTE with more neighbors!",
func = function (x, y) {
115
library(DMwR)
dat <- if (is.data.frame(x)) x else as.data.frame(x)
dat$.y <- y
dat <- SMOTE(.y ~ ., data = dat, k = 3, perc.over = 100, perc.under =
200)
list(x = dat[, !grepl(".y", colnames(dat), fixed = TRUE)],
y = dat$.y) },
first = TRUE)
And ctrlInside is this:
ctrlInside <- trainControl(method = "repeatedcv",
number = 10,
repeats = 5,
summaryFunction = twoClassSummary,
classProbs = TRUE,
savePredictions = TRUE,
search = "grid",
sampling = smotest)
Those function are given just so that you have an idea of what I am doing per whole. Is there a reason why this is happening?
You can use complete.cases to predict only those that have no nas, also you must operate on the matrix, I will show below. Using an example dataset, I make 10 of the variable in a column NAs, and train:
idx = sample(nrow(iris),100)
data = iris
data$Petal.Length[sample(nrow(data),10)] = NA
data$tested = factor(ifelse(data$Species=="versicolor","positive","negative"))
data = data[,-5]
training_data = data[idx,]
testing_data= data[-idx,]
rf <- caret::train(tested ~., data = training_data,
method = "rf",
trControl = ctrlInside,
metric = "ROC",
na.action = na.exclude)
Do the evaluation result and you can see i get the same error:
evalResult.rf <- predict(rf, testing_data, type = "prob")
predict_rf <- as.factor(ifelse(evalResult.rf <0.5, "positive", "negative"))
cm_rf_forest <- confusionMatrix(predict_rf, testing_data$tested, "positive")
Error in table(data, reference, dnn = dnn, ...) :
all arguments must have the same length
So there's two sources of error, 1.. you have NAs and they cannot predict that, and second, evalResult.rf returns a matrix of probabilities, first column is probability being negative class, 2nd being postive:
head(evalResult.rf)
negative positive
3 1.000 0.000
6 1.000 0.000
9 0.948 0.052
12 1.000 0.000
13 0.976 0.024
19 0.998 0.002
To get the classes, you do, get the column with max value for each row, and return the corresponding column name, which is the class:
colnames(evalResult.rf)[max.col(evalResult.rf)]
We do now:
testing_data = testing_data[complete.cases(testing_data),]
evalResult.rf <- predict(rf, testing_data, type = "prob")
predict_rf <- factor(colnames(evalResult.rf)[max.col(evalResult.rf)])
cm_rf_forest <- confusionMatrix(predict_rf, testing_data$tested, "positive")
Confusion Matrix and Statistics
Reference
Prediction negative positive
negative 33 1
positive 0 11
Accuracy : 0.9778
95% CI : (0.8823, 0.9994)
No Information Rate : 0.7333
P-Value [Acc > NIR] : 1.507e-05
Kappa : 0.9416

Error: `data` and `reference` should be factors with the same levels. Confusion matrix for Logistic Regression

I have seen lots of answers with regards to this particular error. I haven't found any answer to it with specifics to my particular issue. Therefore, my problem
This is what I do:
shortness_breath_data <- data_categ_nosev %>%
dplyr::select(shortness_breath, obesity, asthma, diabetes_type_one, diabetes_type_two, obesity, hypertension, heart_disease, lung_condition, liver_disease, kidney_disease, Covid_tested, Gender)
And this is put(head(shortness_breath_data)):
structure(list(shortness_breath = structure(c(1L, 2L, 1L, 1L,
1L, 2L), .Label = c("No", "Yes"), class = "factor"), obesity = structure(c(1L,
1L, 2L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
asthma = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), diabetes_type_one = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
diabetes_type_two = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), hypertension = structure(c(1L,
1L, 1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), lung_condition = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), kidney_disease = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
Covid_tested = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("negative",
"positive"), class = "factor"), Gender = structure(c(2L,
1L, 2L, 1L, 1L, 2L), .Label = c("Female", "Male", "Other"
), class = "factor")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"), problems = structure(list(row = c(2910L,
35958L), col = c("how_unwell", "how_unwell"), expected = c("a double",
"a double"), actual = c("How Unwell", "How Unwell"), file = c("'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'",
"'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'"
)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
)))
And I divide this into training and testing dataset.
shortness_breath_data$shortness_breath <- as.factor(shortness_breath_data$shortness_breath)
n <- nrow(shortness_breath_data)
set.seed(22)
trainingdx <- sample(1:n, 0.7 * n)
train <- shortness_breath_data[trainingdx,]
validate <- shortness_breath_data[-trainingdx,]
train %>% distinct(shortness_breath)
validate %>% distinct(shortness_breath)
And just to do the same in case it will ease you job in finding the issue, I provided dput(head(train)) and dput(head(validate))
train dataset:
structure(list(shortness_breath = structure(c(1L, 1L, 1L, 1L,
1L, 1L), .Label = c("No", "Yes"), class = "factor"), obesity = structure(c(2L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
asthma = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), diabetes_type_one = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
diabetes_type_two = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), hypertension = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), lung_condition = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), kidney_disease = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
Covid_tested = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("negative",
"positive"), class = "factor"), Gender = structure(c(1L,
1L, 1L, 2L, 1L, 2L), .Label = c("Female", "Male", "Other"
), class = "factor")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"), problems = structure(list(row = c(2910L,
35958L), col = c("how_unwell", "how_unwell"), expected = c("a double",
"a double"), actual = c("How Unwell", "How Unwell"), file = c("'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'",
"'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'"
)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
)))
validate dataset:
structure(list(shortness_breath = structure(c(1L, 2L, 2L, 1L,
1L, 1L), .Label = c("No", "Yes"), class = "factor"), obesity = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
asthma = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), diabetes_type_one = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
diabetes_type_two = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), hypertension = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), lung_condition = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), kidney_disease = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
Covid_tested = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("negative",
"positive"), class = "factor"), Gender = structure(c(2L,
1L, 2L, 2L, 1L, 1L), .Label = c("Female", "Male", "Other"
), class = "factor")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"), problems = structure(list(row = c(2910L,
35958L), col = c("how_unwell", "how_unwell"), expected = c("a double",
"a double"), actual = c("How Unwell", "How Unwell"), file = c("'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'",
"'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'"
)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
)))
And then, I build my logistic regression model with stepwise, forward method.
null_model <- glm(shortness_breath ~ 1, data = train, family = "binomial")
fm_shortness_breath <- glm(shortness_breath ~., data = train, family = "binomial")
stepmodel <- step(null_model, scope = list(lower = null_model, upper = fm_shortness_breath), direction = "forward")
Then I get my summary model and store the predictions in the source data frame.
summary(stepmodel)
validate$pred <- predict(stepmodel, validate, type = "response")
validate$real <- validate$shortness_breath
train$pred <- predict(stepmodel, train, type = "response")
train$real <- train$shortness_breath
Then I plot my ROC curve with no problem:
plot.roc(validate$real, validate$pred, col = "red", main = "ROC Validation Set", percent = TRUE, print.auc = TRUE)
Yet, when I am trying to get my confusion matrix, this is where I get my error. But this is my code:
cm_stepmodel <- confusionMatrix(stepmodel, validate)
And then, the error comes in:
Error: `data` and `reference` should be factors with the same levels.
With Show Traceback:
3.
stop("`data` and `reference` should be factors with the same levels.", call. = FALSE)
2.
confusionMatrix.default(stepmodel, validate)
1.
confusionMatrix(stepmodel, validate)
I simply do not see the problem. And tried several other options but did not work. I have reproduced, step by step the exact approach I am undertaking. And I do not get my answer. Also, I have tag this issue with RMarkdown as well, alongside caret and R, just in case.
Also, libraries used are:
library(tidyverse)
library(conflicted)
library(tidymodels)
library(ggrepel)
library(corrplot)
library(dplyr)
library(corrr)
library(themis)
library(rsample)
library(caret)
library(forcats)
library(rcompanion)
library(MASS)
library(pROC)
library(ROCR)
library(data.table)
Try to convert your predicted probabilities to labels, and then run your confusionMatrix on this:
validate$pred <- predict(stepmodel, validate, type = "response")
validate$pred_label <- as.factor(ifelse(validate$pred >= 0.5, "Yes", "No"))
confusionMatrix(validate$real, validate$pred) # Error
confusionMatrix(validate$real, validate$pred_label) # This will work
Check that you are correctly assigning labels as in your original dataset in the validate$pred_label statement.
I'm not particularly familiar with confusionMatrix, but the general idea is that you make predictions of labels and compare to the actual labels of your data. It threw an error because you were comparing labels with probabilities -- you needed to assign the labels. Please correct me if I made a conceptual error or coding mistake above.

Issues spreading data after removing outliers

I put my data into long format, in order to remove outliers (I grouped by grade and condition, and then removed by 1.5 * IQR), however, I'm having issues getting it back into wide format. The final two columns (condition and variable (36, 37) are what I want to spread the data by (so that BOXED_Conjunction_12 becomes a variable). Variables 1:35 should remain as they are. (There will be NAs introduced given that outliers were removed; however, outliers were only removed per condition and not completely). I think I'm having issues because of removing the outliers, but I would think that fill = NA would solve this issue. Can't figure it out.
I've tried
dat%>%spread(condition,pid.avg_rw, fill = NA)
I've also tried using reshape2:
dat%>%dcast((1:35)~ condition, value.var = "pid.avg_rw")
and I get the error
number of rows of result is not a multiple of vector length (arg 1)Aggregation function missing: defaulting to length
Here's a dput of the first ten lines.
Thanks much,
James
structure(list(pid = c("ADMIN-UCSF-bo002", "ADMIN-UCSF-bo002",
"ADMIN-UCSF-bo002", "ADMIN-UCSF-bo002", "ADMIN-UCSF-bo002", "ADMIN-UCSF-bo002",
"ADMIN-UCSF-bo002", "ADMIN-UCSF-bo002", "ADMIN-UCSF-bo002", "ADMIN-UCSF-bo002"
), timepoint = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1), District.ID = c(175420L,
175420L, 175420L, 175420L, 175420L, 175420L, 175420L, 175420L,
175420L, 175420L), School = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("Bowers", "Bracher", "Cabrillo",
"Central Park", "Laurelwood", "Millikin", "Peterson"), class = "factor"),
Ethnicity = structure(c(6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L), .Label = c("American Indian or Alaskan Native", "Asian",
"Black or African American", "Blank on Purpose", "Filipino",
"Hispanic or Latino", "Pacific Islander", "Two or More Races",
"White"), class = "factor"), Age.2018 = c(10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L), Sex = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("F", "M"), class = "factor"),
Language.Fluency = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("English Learner", "English Only",
"IFEP-Initially Fluent", "RFEP-Redesignated"), class = "factor"),
Parent.Ed.Lvl = structure(c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L), .Label = c("College Graduate", "Declined to state/Unknown",
"Grad School/post grad trng", "High School Graduate", "Not HS Graduate",
"Some College"), class = "factor"), SpEd = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"
), class = "factor"), SpEd.Dis = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "Autism (AUT)",
"Emotional Disturbance (ED)", "Hard of Hearing (HH)", "Intellectual Disability (ID)",
"Other Health Impairment (OHI)", "Specific Learning Disability (SLD)",
"Speech or Language Impairment (SLI)", "Visual Impairment (VI)"
), class = "factor"), Low.Income = structure(c(2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
grade = c("3", "3", "4", "3", "3", "4", "3", "3", "4", "3"
), gender = c("F", "F", "2", "F", "F", "2", "F", "F", "2",
"F"), Teacher = c("Keith, Susan", "Keith, Susan", "Lourdes Martin",
"Keith, Susan", "Keith, Susan", "Lourdes Martin", "Keith, Susan",
"Keith, Susan", "Lourdes Martin", "Keith, Susan"), time = structure(c(17113,
17263, 17417, 17113, 17263, 17417, 17113, 17263, 17417, 17113
), class = "Date"), ela.score = c(2424, 2424, NA, 2424, 2424,
NA, 2424, 2424, NA, 2424), School.Year = c("2017", "2017",
"2018", "2017", "2017", "2018", "2017", "2017", "2018", "2017"
), math.score = c(2440, 2440, NA, 2440, 2440, NA, 2440, 2440,
NA, 2440), basc = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
), cohort = c("3", "3", "3", "3", "3", "3", "3", "3", "3",
"3"), attendance = c(96.1, 96.1, 100, 96.1, 96.1, 100, 96.1,
96.1, 100, 96.1), tme4 = structure(c(NA, 17655, 17655, NA,
17655, 17655, NA, 17655, 17655, NA), class = "Date"), t4.minus = c(6.39692965521615,
5.97126183979046, 5.47227067367148, 6.39692965521615, 5.97126183979046,
5.47227067367148, 6.39692965521615, 5.97126183979046, 5.47227067367148,
6.39692965521615), median_grade = c(1536.4, 1536.4, 1372.4,
1192, 1192, 1054, 986.6, 986.6, 871.6, 958.4), mad_grade = c(377.17344,
377.17344, 278.13576, 167.5338, 167.5338, 161.89992, 139.66092,
139.66092, 116.23584, 143.21916), lowerq = c(1323.7, 1323.7,
1226.2, 1102.2, 1102.2, 960.6, 902.9, 902.9, 804, 873.5),
upperq = c(1964.8, 1964.8, 1655.6, 1329.3, 1329.3, 1181.6,
1091.9, 1091.9, 964.2, 1074.1), iqr = c(641.1, 641.1, 429.4,
227.1, 227.1, 221, 189, 189, 160.2, 200.6), grade.threshold.upper = c(3888.1,
3888.1, 2943.8, 2010.6, 2010.6, 1844.6, 1658.9, 1658.9, 1444.8,
1675.9), grade.threshold.lower = c(-599.6, -599.6, -61.9999999999995,
420.9, 420.9, 297.6, 335.9, 335.9, 323.4, 271.7), mad = c(377.17344,
377.17344, 278.13576, 167.5338, 167.5338, 161.89992, 139.66092,
139.66092, 116.23584, 143.21916), z_rw = c(0.350390238376874,
0.0417183791440274, 0.171148318277673, -0.108910138097997,
-0.497500239197831, -0.365723152941879, 0.512731829784946,
-0.588322005081869, -0.0970981769116109, -0.290844134905211
), condition = c("BOXED_Conjunction_12", "BOXED_Conjunction_12",
"BOXED_Conjunction_12", "BOXED_Conjunction_4", "BOXED_Conjunction_4",
"BOXED_Conjunction_4", "BOXED_Feature_12", "BOXED_Feature_12",
"BOXED_Feature_12", "BOXED_Feature_4"), pid.avg_rw = c(2140,
1845.6, 1884.4, 1242.8, 1088.4, 973.6, 1160.4, 887.6, 910.8,
929.2), avg_rw_grade = c(1805.81052631579, 1805.81052631579,
1686.41503416856, 1286.07368421053, 1286.07368421053, 1148.48656036446,
1033.36421052632, 1033.36421052632, 982.933485193622, 1001.18526315789
), sd_grade = c(953.763652869694, 953.763652869694, 1156.80345459324,
397.333847576144, 397.333847576144, 478.193844053012, 247.762635541793,
247.762635541793, 742.892271389251, 247.504606484003)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))

Bar chart showing NA bar when there are no NA values

My visualisation is showing an NA bar chart despite the fact that I have imputed all NA values in my incomeLev column and explicitly removed all NA values from the mental health (which is in my stacked bar visualisation)
brfss2013$mentalHealth <- forcats::fct_explicit_na(brfss2013$mentalHealth, na_level = "Missing")
brfss2013$incomeLev <- as.factor(brfss2013$incomeLev)
brfss2013 <- subset(brfss2013, !is.na(incomeLev))
brfss2013 %>%
add_count(incomeLev) %>%
rename(count_inc = n) %>%
count(incomeLev, mentalHealth, count_inc) %>%
rename(count_mentalHealth = n) %>%
mutate(percent= count_mentalHealth / count_inc) %>%
mutate(incomeLev = factor(incomeLev,
levels=c('0-$20k','25-$35k','35-$50k','50-$75k','>$75k')))%>%
ggplot(aes(x= incomeLev,
y= count_mentalHealth,
group= mentalHealth)) +
xlab('Annual Income')+ylab('Number of People')+
geom_bar(aes(fill=mentalHealth),
stat="identity",na.rm=TRUE)+
# Using the scales package does the percent formatting for you
geom_text(aes(label = scales::percent(percent)),position = position_stack(vjust = 0.5))+
theme_minimal()
Here is a sample of my data:
brfss2013<-structure(list(incomeLev = structure(c(5L, 1L, 1L, 5L, 4L, 1L,
1L, 4L, 1L, 3L), .Label = c(">$75k", "0-$20k", "25-$35k", "35-$50k",
"50-$75"), class = "factor"), healtheat = c(4.66, 1.68, 2.37,
1.85, 2.5, 3, 3.66, 4.27, 2.72, 1.72), X_age_g = structure(c(5L,
4L, 5L, 5L, 6L, 4L, 3L, 5L, 4L, 6L), .Label = c("Age 18 to 24",
"Age 25 to 34", "Age 35 to 44", "Age 45 to 54", "Age 55 to 64",
"Age 65 or older"), class = "factor"), employ1 = structure(c(7L,
1L, 1L, 7L, 7L, 1L, 1L, 7L, 7L, 5L), .Label = c("Employed for wages",
"Self-employed", "Out of work for 1 year or more", "Out of work for less than 1 year",
"A homemaker", "A student", "Retired", "Unable to work"), class = "factor"),
renthom1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L), .Label = c("Own", "Rent", "Other arrangement"), class = "factor"),
sex = structure(c(2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L), .Label = c("Male",
"Female"), class = "factor"), physLev = structure(c(3L, 1L,
3L, 1L, 2L, 1L, 2L, 1L, 2L, 2L), .Label = c("0-200", "200-500",
"500-1000", "1000-2000", "2000-4000", "4000-10000", ">10000"
), class = "factor"), mentalHealth = structure(c(5L, 1L,
1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L), .Label = c("Excellent",
"Good", "Ok", "Bad", "Very Bad", "Missing"), class = "factor")), row.names = c(NA,
10L), class = "data.frame")

Resources