I am using the caret package along with the confusionMatrix function, and I would like to know whether it is possible to find out exactly which observations were not classified properly.
Here is a subset of my train data
# Training subset: 11 observations with six numeric columns (corr, rdist,
# CCF2, Wcorr, Wcorr2, Wcorr3) and a character column Var holding the class
# label ("W" or "B"). Written as a structure() literal so that the row names
# and the data.frame class are set explicitly.
train_sub <- structure(
list(
corr = c(
0.629922866893549,
0.632354159559817,
0.656112138936032,
0.4469719807955,
0.598136079870775,
0.314461239093862,
0.379065842199838,
0.347331370037428,
0.310270891798492,
0.361064451331448,
0.335628455451358
),
rdist = c(
0.775733824285612,
0.834148208687529,
0.884167982488944,
0.633989717138057,
0.850225777237626,
0.626197919283803,
0.649597055761598,
0.680382136363523,
0.627828985862852,
0.713674404108905,
0.646094473468118
),
CCF2 = c(
0.634465565134314,
0.722096802135009,
0.792385621105087,
0.46497582143802,
0.739612023831014,
0.470724554509749,
0.505961260826622,
0.527876803999064,
0.461724328071479,
0.564117580569802,
0.490084457081904
),
Wcorr = c(
0.629,
0.613,
0.812,
0.424,
0.593,
0.36,
0.346,
0.286,
0.333,
0.381,
0.333
),
Wcorr2 = c(
0.735,
0.743,
0.802,
0.588,
0.691,
0.632,
0.61,
0.599,
0.599,
0.632,
0.613
),
Wcorr3 = c(
0.21,
0.301,
0.421,
-0.052,
0.169,
-0.032,
-0.042,-0.048,
-0.035,
0.006,
-0.004
),
# Class label: the first five rows are "W", the remaining six are "B".
Var = c("W", "W", "W", "W",
"W", "B", "B", "B", "B", "B", "B")
),
# Non-consecutive integer row names -- presumably the row indices in the
# full data set this subset was taken from (TODO confirm).
row.names = c(1L, 2L,
3L, 5L, 7L, 214L, 215L, 216L, 217L, 218L, 221L),
class = "data.frame"
)
and here is a subset of my test data
# Test subset: 11 observations with six numeric columns (corr, rdist, CCF2,
# Wcorr, Wcorr2, Wcorr3) and a character column Var holding the class label
# ("W" or "B"). Written as a structure() literal so that the row names and
# the data.frame class are set explicitly.
test_sub <- structure(
list(
corr = c(
0.636658204667785,
0.5637857758104,
0.540558984461647,
0.392647603023863,
0.561801911406989,
0.297187412065481,
0.278864501603015,
0.505277007007347,
0.403811785308709,
0.510158398354856,
0.459607853624603
),
rdist = c(
0.887270722679019,
0.843656768956754,
0.815806338767273,
0.732093571145576,
0.832944903081762,
0.485497073465096,
0.454461718498521,
0.69094669881886,
0.627667080657035,
0.705558894672344,
0.620838398507191
),
CCF2 = c(
0.802017782695131,
0.731763898271157,
0.689402284804853,
0.577932997250877,
0.715111899030751,
0.324826043263382,
0.298456267077388,
0.544808216945995,
0.458148923874818,
0.551160266327893,
0.461228649848996
),
Wcorr = c(
0.655,
0.536,
0.677,
0.556,
0.571,
0.29,
0.25,
0.484,
0.25,
0.515,
0.314
),
Wcorr2 = c(
0.779,
0.682,
0.734,
0.675,
0.736,
0.5,
0.529,
0.611,
0.555,
0.639,
0.572
),
Wcorr3 = c(
0.368,
0.154,
0.266,
0.103,
0.224,
-0.204,
-0.16,
-0.026,
-0.149,
0.032,
-0.097
),
# True class label: the first five rows are "W", the remaining six are "B".
Var = c("W", "W", "W", "W", "W", "B", "B", "B", "B", "B",
"B")
),
# Non-consecutive integer row names -- presumably the row indices in the
# full data set this subset was taken from (TODO confirm).
row.names = c(4L, 6L, 8L, 13L, 15L, 321L, 322L, 329L,
334L, 341L, 344L),
class = "data.frame"
)
When I use this line,
# Cross-tabulate the predicted classes (fittedTL) against the true labels of
# the test set; mode = "everything" asks caret for all of its class statistics.
confusionMatrix(
  data = fittedTL,
  reference = as.factor(test$Var),
  mode = "everything"
)
With this I train a machine-learning model using the glmnet method (it gives the best accuracy in my case):
# Resampling set-up: 10-fold cross-validation repeated 5 times, with class
# probabilities enabled and savePredictions set to "final".
classCtrl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  classProbs = TRUE,
  savePredictions = "final"
)

# Fix the RNG seed before training so the run can be repeated.
set.seed(355)

# Fit a glmnet model for Var using every other column of train_sub as a
# predictor, then print the fitted train object.
glmnetTL <- train(
  Var ~ .,
  data = train_sub,
  method = "glmnet",
  trControl = classCtrl
)
glmnetTL
And finally I compute the confusion matrix on my test set:
# Predicted class for each row of the test set.
predict_glmnet <- predict(glmnetTL, newdata = test_sub)
predict_glmnet

# Compare the predictions with the true labels of the test set.
CM_glmnet <- confusionMatrix(
  data = predict_glmnet,
  reference = as.factor(test_sub$Var),
  mode = "everything"
)
CM_glmnet
The output of the confusion matrix is a table like so
          Reference
Prediction B W
         B 4 0
         W 2 5
So here I have two predictions/classifications that are wrong.
Is there any way to trace them back to the corresponding rows of my test set?
Goal
I have brake pedal force (kg) data for many drivers, and I want to find the point in time at which each brake application started. Specifically, I need the frame at which braking starts. Following are three examples of brake pedal force and the desired brake-start frames:
Estimating Brake start
I estimated the brake start by assuming that it is a changepoint. So, I used the changepoint package in R. But I get some of them right and others wrong (the vertical red line below represents the estimated changepoint):
You can see the changepoints for participants B and C are (almost) correct, but incorrect for participant A. In my full dataset, there are many incorrect values so manually estimating them is going to be very time consuming.
Do you have any suggestions to accurately estimate the brake start? Thank you for your time.
The data and code for the above figure are provided below.
Data and Code
Data
# Example data: 127 rows stored as a tibble, one row per sampled frame, for
# three participants ("A", "B", "C"). Columns:
#   participant              - participant label
#   frames                   - frame number (30 apart within a participant)
#   ED_brake_pedal_force_kg  - brake pedal force at that frame
foo <- structure(list(participant = c("A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "C", "C", "C", "C", "C", "C", "C", "C", "C",
"C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C",
"C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C",
"C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C",
"C", "C", "C", "C"), frames = c(39614, 39644, 39674, 39704, 39734,
39764, 39794, 39824, 39854, 39884, 39914, 39944, 39974, 40004,
40034, 40064, 40094, 40124, 40154, 40184, 40214, 40244, 40274,
40304, 40334, 40364, 40394, 40424, 40454, 40484, 40514, 40544,
40574, 40604, 40634, 40664, 40694, 40724, 40754, 40784, 40814,
40844, 40874, 40904, 40934, 40964, 40994, 41024, 41054, 41084,
41114, 41144, 41174, 45296, 45326, 45356, 45386, 45416, 45446,
45476, 45506, 45536, 45566, 45596, 45626, 45656, 45686, 45716,
45746, 45776, 45806, 45836, 45866, 45896, 45926, 63792, 63822,
63852, 63882, 63912, 63942, 63972, 64002, 64032, 64062, 64092,
64122, 64152, 64182, 64212, 64242, 64272, 64302, 64332, 64362,
64392, 64422, 64452, 64482, 64512, 64542, 64572, 64602, 64632,
64662, 64692, 64722, 64752, 64782, 64812, 64842, 64872, 64902,
64932, 64962, 64992, 65022, 65052, 65082, 65112, 65142, 65172,
65202, 65232, 65262, 65292, 65322), ED_brake_pedal_force_kg = c(0.34,
0.34, 0.34, 0.33, 0.33, 0.34, 0.32, 0.34, 0.34, 0.34, 0.34, 0.32,
0.34, 0.34, 0.37, 0.32, 0.32, 0.33, 0.34, 0.32, 0.33, 0.34, 0.34,
0.72, 2.01, 2.91, 4.57, 5.73, 5.84, 5.82, 5.21, 5.23, 5.23, 4.41,
4, 3.57, 3.09, 2.28, 1.37, 0.33, 0.33, 0.65, 1.21, 3.36, 4.91,
5.2, 5.96, 6.24, 7.6, 14.13, 25.8, 32.37, 37.71, 0.32, 0.34,
0.33, 0.32, 1.72, 8.93, 18.83, 22.78, 39.5, 66.63, 9.46, 2.24,
0.33, 0.34, 1.9, 5.5, 8.55, 10.66, 12.24, 12.24, 12.24, 12.27,
0.29, 0.29, 0.31, 0.31, 0.3, 0.29, 0.3, 0.3, 0.3, 0.29, 0.3,
0.31, 0.3, 0.29, 0.29, 0.91, 2.79, 3.67, 4.24, 5.61, 5.91, 6.08,
5.4, 4.46, 3.74, 3.85, 4, 4.43, 2.08, 0.7, 0.3, 0.29, 0.31, 0.32,
0.34, 0.69, 0.83, 0.83, 0.84, 1.36, 1.68, 2.04, 3.87, 5.21, 7.28,
9.84, 13.49, 14.83, 14.79, 14.79, 14.79, 14.71)), row.names = c(NA,
-127L), class = c("tbl_df", "tbl", "data.frame"))
Code
Estimation of changepoint and plotting:
library(changepoint)
library(tidyverse)
# Per participant: run a mean/variance changepoint search on the pedal force
# (binary segmentation, Q = 8), take the frame at the first changepoint found
# as the estimated brake start, and draw it as a red vertical line on top of
# the force trace.
foo %>%
  group_by(participant) %>%
  mutate(
    brake_start_frame = frames[
      cpts(
        cpt.meanvar(
          ED_brake_pedal_force_kg,
          method = "BinSeg",
          Q = 8
        )
      )
    ][1]
  ) %>%
  ungroup() %>%
  ggplot() +
  geom_line(mapping = aes(x = frames, y = ED_brake_pedal_force_kg)) +
  geom_vline(mapping = aes(xintercept = brake_start_frame), colour = "red") +
  facet_wrap(vars(participant), scales = "free_x")
Since this is a time-series problem, you can explore the TTR::momentum function to solve it. Whenever the momentum rises above a chosen threshold, that marks the start of the event.
library(TTR)
library(data.table)
# Convert foo to a data.table by reference so that := can add columns.
setDT(foo)

# 5-period momentum of the pedal force, computed separately per participant.
foo[
  ,
  momentum := TTR::momentum(ED_brake_pedal_force_kg, n = 5),
  by = "participant"
]

# One panel per participant: pedal force plotted against momentum.
# NOTE(review): momentum is mapped to the x axis here; if the aim is to see
# where momentum crosses a threshold over time, x = frames / y = momentum may
# have been intended -- confirm.
ggplot(data = foo) +
  geom_line(mapping = aes(x = momentum, y = ED_brake_pedal_force_kg)) +
  facet_wrap(vars(participant), scales = "free_x")
I have character strings that I want to convert to tables. The identifier in each row can have white spaces and I need them removed without also removing spaces between the numbers. Is it possible to use a regular expression to achieve this?
For example, the data would look like this:
A B C 5.65 7.8
DC 5.65 7.8
D AB 7.9 12.2
D AB C 7.9 1.2
A BC 13.88 2.4
AB C 7.9 12.2
And I want to get to this:
ABC 5.65 7.8
DC 5.65 7.8
DAB 7.9 12.2
DABC 7.9 1.2
ABC 13.88 2.4
ABC 7.9 12.2
EDIT: As requested, this is an example of the data type and the form in which I receive it. This has 17 rows, each with 6 columns of data, plus a first column that holds an alphabetic identifier.
# Data as I receive it: one flat character vector. Each record is an
# alphabetic identifier, possibly split into several fragments ("A", "a"),
# followed by six numeric values stored as strings.
data <- c("A", "a", "2.07", "2.35", "39.00", "82.20", "8.8", "3.80",
"B", "2.26", "2.25", "40.00", "80.80", "8.1", "1.86", "D",
"Et", "2.07", "2.22", "41.00", "83.80", "8.8", "3.87", "F",
"2.05", "2.15", "43.00", "82.20", "8.4", "3.11", "Bc", "2.08",
"2.12", "48.00", "82.60", "8.3", "2.47", "Gf", "H", "I",
"2.08", "2.10", "46.00", "82.20", "8.1", "2.90", "J", "K",
"1.95", "2.08", "38.00", "83.40", "8.7", "1.63", "L", "M",
"1.89", "2.07", "45.00", "83.80", "9.0", "1.84", "N", "2.06",
"2.05", "41.00", "80.60", "9.0", "4.09", "O", "P", "1.86",
"2.04", "48.00", "81.60", "8.6", "2.60", "Qst", "R", "1.95",
"2.03", "44.00", "82.80", "8.8", "1.40", "S", "2.03", "2.02",
"40.00", "81.40", "8.2", "1.74", "T", "1.95", "2.01", "43.00",
"81.80", "9.0", "2.30", "Unh", "1.96", "2.00", "44.00", "82.60",
"9.2", "2.40", "V", "W", "C", "1.98", "1.97", "40.00",
"82.00", "8.1", "1.15", "Yu", "1.90", "1.96", "41.00", "82.80",
"9.6", "2.08", "Z", "a", "bi", "1.90", "1.95", "42.00",
"84.20", "9.6", "1.69")
# Required format: the fragments of each identifier are merged into a single
# element ("A", "a" -> "Aa"), so every record is exactly 7 elements long:
# one identifier followed by six numeric values (still stored as strings).
data2 <- c("Aa", "2.07", "2.35", "39.00", "82.20", "8.8", "3.80",
"B", "2.26", "2.25", "40.00", "80.80", "8.1", "1.86",
"DEt", "2.07", "2.22", "41.00", "83.80", "8.8", "3.87", "F",
"2.05", "2.15", "43.00", "82.20", "8.4", "3.11", "Bc", "2.08",
"2.12", "48.00", "82.60", "8.3", "2.47", "GfHI",
"2.08", "2.10", "46.00", "82.20", "8.1", "2.90", "JK",
"1.95", "2.08", "38.00", "83.40", "8.7", "1.63", "LM",
"1.89", "2.07", "45.00", "83.80", "9.0", "1.84", "N", "2.06",
"2.05", "41.00", "80.60", "9.0", "4.09", "OP", "1.86",
"2.04", "48.00", "81.60", "8.6", "2.60", "QstR", "1.95",
"2.03", "44.00", "82.80", "8.8", "1.40", "S", "2.03", "2.02",
"40.00", "81.40", "8.2", "1.74", "T", "1.95", "2.01", "43.00",
"81.80", "9.0", "2.30", "Unh", "1.96", "2.00", "44.00", "82.60",
"9.2", "2.40", "VWC", "1.98", "1.97", "40.00",
"82.00", "8.1", "1.15", "Yu", "1.90", "1.96", "41.00", "82.80",
"9.6", "2.08", "Zabi", "1.90", "1.95", "42.00",
"84.20", "9.6", "1.69")
# Fill a 7-column matrix row by row (one record per row) and convert it to a
# data frame. TRUE is spelled out because T is an ordinary variable that can
# be reassigned.
df <- data.frame(matrix(data2, ncol = 7, byrow = TRUE))
To do as you request within your R environment, one approach is to convert the vector to a string, apply a regular expression filter to the string, then convert the string back to a vector.
See details below, hopefully this points you in the right direction.
Solution
# Input: one flat character vector. Each record is an alphabetic identifier,
# possibly split into several fragments ("A", "a"), followed by six numeric
# values stored as strings.
data <- c("A", "a", "2.07", "2.35", "39.00", "82.20", "8.8", "3.80",
"B", "2.26", "2.25", "40.00", "80.80", "8.1", "1.86", "D",
"Et", "2.07", "2.22", "41.00", "83.80", "8.8", "3.87", "F",
"2.05", "2.15", "43.00", "82.20", "8.4", "3.11", "Bc", "2.08",
"2.12", "48.00", "82.60", "8.3", "2.47", "Gf", "H", "I",
"2.08", "2.10", "46.00", "82.20", "8.1", "2.90", "J", "K",
"1.95", "2.08", "38.00", "83.40", "8.7", "1.63", "L", "M",
"1.89", "2.07", "45.00", "83.80", "9.0", "1.84", "N", "2.06",
"2.05", "41.00", "80.60", "9.0", "4.09", "O", "P", "1.86",
"2.04", "48.00", "81.60", "8.6", "2.60", "Qst", "R", "1.95",
"2.03", "44.00", "82.80", "8.8", "1.40", "S", "2.03", "2.02",
"40.00", "81.40", "8.2", "1.74", "T", "1.95", "2.01", "43.00",
"81.80", "9.0", "2.30", "Unh", "1.96", "2.00", "44.00", "82.60",
"9.2", "2.40", "V", "W", "C", "1.98", "1.97", "40.00",
"82.00", "8.1", "1.15", "Yu", "1.90", "1.96", "41.00", "82.80",
"9.6", "2.08", "Z", "a", "bi", "1.90", "1.95", "42.00",
"84.20", "9.6", "1.69")
# Use stringi for the regular-expression substitution (the pattern below
# needs look-behind and look-ahead support).
library(stringi)

# Convert the vector to one string, elements separated by ", ", so the
# identifier fragments can be joined with a single text substitution.
data1 <- toString(data)

# Remove every ", " separator that has a non-digit on both sides, i.e. a
# separator between two fragments of the same alphabetic identifier:
#
#   (?<!\d)  - negative look-behind: not preceded by a digit
#   ", "     - the literal comma + space that toString() put between elements
#   (?!\d)   - negative look-ahead: not followed by a digit
#
# Limitation: this relies on identifiers neither starting nor ending with a
# digit and on every numeric value starting with a digit (a leading "-" or
# "." after an identifier would be glued onto it).
data3 <- stri_replace_all_regex(
  str = data1,
  pattern = "(?<!\\d), (?!\\d)",
  replacement = ""
)

# OK, check the string data...
data3

# Convert the string back to a vector. Split on the whole ", " separator
# rather than on " " alone; splitting on the space only leaves a trailing
# comma on every element except the last.
newData <- strsplit(data3, ", ", fixed = TRUE)[[1]]
newData

# Convert to a data frame: 7 columns per row (identifier + six values),
# filled row by row.
df <- data.frame(matrix(newData, ncol = 7, byrow = TRUE))
df

# Done
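Output (from a run that split `data3` on a single space, which is why every element except the last still carries a trailing comma; splitting on ", " instead removes them)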
> data <- c("A", "a", "2.07", "2.35", "39.00", "82.20", "8.8", "3.80",
+ "B", "2.26", "2.25", "40.00", "80.80", "8.1", "1.86", "D",
+ "Et", "2.07", "2.22", "41.00", "83.80", "8.8", "3.87", "F",
+ "2.05", "2.15", "43.00", "82.20", "8.4", "3.11", "Bc", "2.08",
+ "2.12", "48.00", "82.60", "8.3", "2.47", "Gf", "H", "I",
+ "2.08", "2.10", "46.00", "82.20", "8.1", "2.90", "J", "K",
+ "1.95", "2.08", "38.00", "83.40", "8.7", "1.63", "L", "M",
+ "1.89", "2.07", "45.00", "83.80", "9.0", "1.84", "N", "2.06",
+ "2.05", "41.00", "80.60", "9.0", "4.09", "O", "P", "1.86",
+ "2.04", "48.00", "81.60", "8.6", "2.60", "Qst", "R", "1.95",
+ "2.03", "44.00", "82.80", "8.8", "1.40", "S", "2.03", "2.02",
+ "40.00", "81.40", "8.2", "1.74", "T", "1.95", "2.01", "43.00",
+ "81.80", "9.0", "2.30", "Unh", "1.96", "2.00", "44.00", "82.60",
+ "9.2", "2.40", "V", "W", "C", "1.98", "1.97", "40.00",
+ "82.00", "8.1", "1.15", "Yu", "1.90", "1.96", "41.00", "82.80",
+ "9.6", "2.08", "Z", "a", "bi", "1.90", "1.95", "42.00",
+ "84.20", "9.6", "1.69")
>
> # Use stringi base regular expression engine
> require(stringi)
>
> # Convert the vector data to be a string sequence - so we can manipulate as text
> data1 <- toString(data)
>
> # Now we can apply the regular expression substitution to the data (formatted as a string...
> # Here we do a:
> #
> # (?<!\d) - Negative look behind to prevent a digit.
> # , - A literal combination of quotes, comma and space. We drop the ", " in conversion to string...
> # (?!\d) - Negative look ahead to prevent a digit.
> #
> data3 = stri_replace_all_regex(str = data1, pattern = '(?<!\\d), (?!\\d)', replacement = '')
> # OK, check the string data...
> data3
[1] "Aa, 2.07, 2.35, 39.00, 82.20, 8.8, 3.80, B, 2.26, 2.25, 40.00, 80.80, 8.1, 1.86, DEt, 2.07, 2.22, 41.00, 83.80, 8.8, 3.87, F, 2.05, 2.15, 43.00, 82.20, 8.4, 3.11, Bc, 2.08, 2.12, 48.00, 82.60, 8.3, 2.47, GfHI, 2.08, 2.10, 46.00, 82.20, 8.1, 2.90, JK, 1.95, 2.08, 38.00, 83.40, 8.7, 1.63, LM, 1.89, 2.07, 45.00, 83.80, 9.0, 1.84, N, 2.06, 2.05, 41.00, 80.60, 9.0, 4.09, OP, 1.86, 2.04, 48.00, 81.60, 8.6, 2.60, QstR, 1.95, 2.03, 44.00, 82.80, 8.8, 1.40, S, 2.03, 2.02, 40.00, 81.40, 8.2, 1.74, T, 1.95, 2.01, 43.00, 81.80, 9.0, 2.30, Unh, 1.96, 2.00, 44.00, 82.60, 9.2, 2.40, VWC, 1.98, 1.97, 40.00, 82.00, 8.1, 1.15, Yu, 1.90, 1.96, 41.00, 82.80, 9.6, 2.08, Zabi, 1.90, 1.95, 42.00, 84.20, 9.6, 1.69"
>
> # Now we convert the string back to be a vector...
> newData = strsplit(data3, " ")[[1]]
> newData
[1] "Aa," "2.07," "2.35," "39.00," "82.20," "8.8," "3.80," "B," "2.26," "2.25," "40.00," "80.80,"
[13] "8.1," "1.86," "DEt," "2.07," "2.22," "41.00," "83.80," "8.8," "3.87," "F," "2.05," "2.15,"
[25] "43.00," "82.20," "8.4," "3.11," "Bc," "2.08," "2.12," "48.00," "82.60," "8.3," "2.47," "GfHI,"
[37] "2.08," "2.10," "46.00," "82.20," "8.1," "2.90," "JK," "1.95," "2.08," "38.00," "83.40," "8.7,"
[49] "1.63," "LM," "1.89," "2.07," "45.00," "83.80," "9.0," "1.84," "N," "2.06," "2.05," "41.00,"
[61] "80.60," "9.0," "4.09," "OP," "1.86," "2.04," "48.00," "81.60," "8.6," "2.60," "QstR," "1.95,"
[73] "2.03," "44.00," "82.80," "8.8," "1.40," "S," "2.03," "2.02," "40.00," "81.40," "8.2," "1.74,"
[85] "T," "1.95," "2.01," "43.00," "81.80," "9.0," "2.30," "Unh," "1.96," "2.00," "44.00," "82.60,"
[97] "9.2," "2.40," "VWC," "1.98," "1.97," "40.00," "82.00," "8.1," "1.15," "Yu," "1.90," "1.96,"
[109] "41.00," "82.80," "9.6," "2.08," "Zabi," "1.90," "1.95," "42.00," "84.20," "9.6," "1.69"
>
> # Now we convert to a dataframe...
> df <- data.frame(matrix(newData, ncol=7, byrow=T))
> df
X1 X2 X3 X4 X5 X6 X7
1 Aa, 2.07, 2.35, 39.00, 82.20, 8.8, 3.80,
2 B, 2.26, 2.25, 40.00, 80.80, 8.1, 1.86,
3 DEt, 2.07, 2.22, 41.00, 83.80, 8.8, 3.87,
4 F, 2.05, 2.15, 43.00, 82.20, 8.4, 3.11,
5 Bc, 2.08, 2.12, 48.00, 82.60, 8.3, 2.47,
6 GfHI, 2.08, 2.10, 46.00, 82.20, 8.1, 2.90,
7 JK, 1.95, 2.08, 38.00, 83.40, 8.7, 1.63,
8 LM, 1.89, 2.07, 45.00, 83.80, 9.0, 1.84,
9 N, 2.06, 2.05, 41.00, 80.60, 9.0, 4.09,
10 OP, 1.86, 2.04, 48.00, 81.60, 8.6, 2.60,
11 QstR, 1.95, 2.03, 44.00, 82.80, 8.8, 1.40,
12 S, 2.03, 2.02, 40.00, 81.40, 8.2, 1.74,
13 T, 1.95, 2.01, 43.00, 81.80, 9.0, 2.30,
14 Unh, 1.96, 2.00, 44.00, 82.60, 9.2, 2.40,
15 VWC, 1.98, 1.97, 40.00, 82.00, 8.1, 1.15,
16 Yu, 1.90, 1.96, 41.00, 82.80, 9.6, 2.08,
17 Zabi, 1.90, 1.95, 42.00, 84.20, 9.6, 1.69
> # Done