Running Fishers discriminate test in R - r

I have tried the following code to try and run fishers discriminant on my data set but it doesn't like the negative values.
Split_user <- user_col%>%
select(-Heroin)%>%
filter(User == "Yes")
Split_user1 <- Split_user%>%
select(-User)
Split_nonuser <- user_col%>%
select(-Heroin)%>%
filter(User == "No")
Split_nonuser1 <- Split_user%>%
select(-User)
#Calculate the Mean by class
m1 = colMeans(Split_user1)
m1
m2 = colMeans(Split_nonuser1)
m2
#Calculate the covariances by class
S1 = cov(Split_user1)
S2 = cov(Split_nonuser1)
dat <- data.frame(
"User"=c(m1),
"Non User" = c(m2),
stringsAsFactors = FALSE
)
dat
test <- fisher.test(dat)
test
structure(list(Nscore = c(0.31287, -0.67825, -0.46725, -0.14882,
0.73545, -0.67825, -0.46725, -1.32828, 0.62967, -0.24649, -1.05308,
-1.32828, 2.28554, -0.79151, -0.92104, -2.05048, -1.55078, 0.52135,
1.37297, -0.34799, -0.79151, -1.1943, 0.41667, 1.60383, -0.14882
), Escore = c(-0.57545, 1.93886, 0.80523, -0.80615, -1.6334,
-0.30033, -1.09207, 1.93886, 2.57309, 0.00332, 0.80523, 0.00332,
0.16767, 0.80523, 1.45421, -1.50796, -0.80615, -1.23177, -0.15487,
-1.7625, 0.80523, 0.47617, -0.94779, -3.27393, 0.63779), Oscore = c(-0.58331,
1.43533, -0.84732, -0.01928, -0.45174, -1.55521, -0.45174, -0.84732,
-0.97631, -1.42424, -1.11902, 0.14143, 0.44585, -0.01928, 0.44585,
-1.55521, -1.68062, -0.31776, -0.17779, -2.39883, 0.7233, -1.11902,
-0.84732, -1.27553, 1.24033), Ascore = c(-0.91699, 0.76096, -1.6209,
0.59042, -0.30172, 2.03972, -0.30172, -0.30172, 0.76096, 0.59042,
-0.76096, -1.92595, -1.6209, 0.94156, -0.60633, -1.07533, 0.28783,
-0.45321, -1.92595, -1.92595, 1.61108, -0.60633, 1.11406, 0.28783,
0.76096), Cscore = c(-0.00665, -0.14277, -1.0145, 0.58489, 1.30612,
1.63088, 0.93949, 1.63088, 1.13407, 0.12331, 1.81175, -0.52745,
-0.78155, 3.46436, 1.63088, 1.13407, 0.7583, -1.38502, -1.5184,
0.7583, -1.13788, 1.81175, -0.89891, -1.0145, 1.46191), Impulsivity = c(-0.21712,
-0.71126, -1.37983, -1.37983, -0.21712, -1.37983, -0.21712, 0.19268,
-1.37983, -1.37983, 0.19268, 0.52975, 1.29221, -0.71126, 1.29221,
-0.71126, -0.21712, -1.37983, -0.71126, -1.37983, 0.19268, -0.21712,
-0.71126, -1.37983, -0.21712), SS = c(-1.18084, -0.21575, 0.40148,
-1.18084, -0.21575, -1.54858, 0.07987, -0.52593, -1.54858, -0.84637,
0.07987, 1.2247, 0.07987, -0.84637, 0.7654, -0.52593, -2.07848,
-0.84637, -0.21575, -2.07848, -0.21575, -1.18084, 0.07987, -1.54858,
-0.52593), Heroin = c("CL0", "CL0", "CL0", "CL0", "CL0", "CL0",
"CL0", "CL0", "CL0", "CL0", "CL0", "CL0", "CL0", "CL0", "CL0",
"CL0", "CL0", "CL0", "CL0", "CL0", "CL0", "CL0", "CL0", "CL1",
"CL0"), User = c("No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No")), row.names = c(NA, -25L), class = c("tbl_df",
"tbl", "data.frame"))
matrix - non negative enteries
structure(c(0.610545701853111, 0.96447451044118, 0.851040908078699,
0.589544977621028, 0.827711005318391, 0.904054295338232, 0.589544977621028,
0.456502942067452, 0.688702581355738, 0.533249474881585, 0.389562632031922,
0.456502942067452, 0.632276676717644, 0.436140184047234, 0.653498081085907,
0.166581403886779, 0.495545270523754, 0.476097331977154, 0.671587833703474,
0.533249474881585, 0.436140184047234, 0.71082275573317, 0.610545701853111,
0.550877805396231, 0.495545270523754, 0.606851082126989, 0.606851082126989,
0.529890274393643, 0.17257337601325, 0.40854553851544, 0.238467663593076,
0.327676651856099, 0.580185689487682, 0.367919991081809, 0.479144172121096,
1, 0.505317471692705, 0.453951873616486, 0.0885974551303489,
0.259420636058159, 0.505317471692705, 0.63189368241683, 0.386973070246684,
0, 0.327676651856099, 0.479144172121096, 0.327676651856099, 0.386973070246684,
0.350233306260252, 0.63189368241683, 0.535574626985272, 0.563618843854673,
0.750572944212062, 0.223135727679215, 0.4774929083129, 0.706259916342132,
0.355063144061413, 0.275029248201035, 0.667195538247031, 0.563618843854673,
0.850078930076767, 0.223135727679215, 0.796364408545283, 0.163192941968364,
0.223135727679215, 0.355063144061413, 0.163192941968364, 0.595629597576807,
0.4774929083129, 0.223135727679215, 0.535574626985272, 0.706259916342132,
0.508037245380387, 0.383103354327933, 0.414857685465647, 0.403077115158141,
0.233825658261037, 0.558377116170461, 0.712895943634026, 0.210211608610118,
0.285534960468907, 0.658493870402802, 0.429792238205629, 0.533600585795802,
0.210211608610118, 0.482206789967235, 0.684122436721568, 0.285534960468907,
0.456117618078684, 0.533600585795802, 0.508041194672498, 0.18731799331194,
0.456117618078684, 0.357934394919504, 0.712895943634026, 0.558377116170461,
0.558377116170461, 0.156244832950339, 0.581589612247721, 0.658493870402802,
0.318731025010843, 0.202385427208327, 0.202385427208327, 0.249965664305335,
0.406440653462484, 0.599528335983808, 0.498561515107706, 0.571262830706954,
0.571262830706954, 0.788549949400029, 0.694141246205002, 0.694141246205002,
0.729304611825936, 0.474472314587249, 0.429044383403209, 0.694141246205002,
0.36191629319069, 0.474472314587249, 0.154790732976724, 0.474472314587249,
0.429044383403209, 0.297072430244326, 0.474472314587249, 0.154790732976724,
0.406440653462484, 0.446013490788146, 0.757189169998879, 1, 0,
0.528084009118427, 0.446013490788146, 0.271569845659404, 0.624098434171681,
0.271569845659404, 0.528084009118427, 0.757189169998879, 0.528084009118427,
0.446013490788146, 0.367285305878396, 0.156155405657909, 0.271569845659404,
0.624098434171681, 0.757189169998879, 0.528084009118427, 0, 0.367285305878396,
0.528084009118427, 0.528084009118427, 0.156155405657909, 0.528084009118427,
0.825751648038478, 1, 0.619957452233758, 0.308011329405206, 0.619957452233758,
0.53955917314341, 0.710932676034508, 0.46565805295222, 0.53955917314341,
1, 0.825751648038478, 0.619957452233758, 0.825751648038478, 0.710932676034508,
0.619957452233758, 0.53955917314341, 1, 0.710932676034508, 0.710932676034508,
0.825751648038478, 0.825751648038478, 0.619957452233758, 0.46565805295222,
0, 0.825751648038478), dim = c(25L, 7L), dimnames = list(c("30",
"67", "115", "20", "3", "18", "16", "8", "10", "77", "84", "71",
"54", "64", "85", "105", "58", "2", "102", "17", "79", "87",
"63", "11", "39"), c("Nscore", "Escore", "Oscore", "Ascore",
"Cscore", "Impulsivity", "SS")))

Related

Cannot make a logistic curve in R

I am trying to make a logistic curve in R but the line does not appear in the plot.
My data are:
dput(los1)
structure(list(X1 = c("5.51688462301445", "2.55660506920185",
"4.17130300764484", "15.0032350113684", "0.0672790807684578",
"0", "10.7646529229551", "1.6819770192119", "4.44041933071867",
"2.69116323073877", "0", "0.740069888453036", "1.54741885767498",
"0.201837242305373", "1.81653518074882", "6.12239634993057",
"3.49851219996026", "22.4039338958996", "0.538232646147662",
"0.134558161536916", "1.2783025346007", "1.6819770192119", "16.9543283536541",
"60.0129400454734", "9.62090854989083", "0.470953565379205",
"33.7740985457708", "6.8624662383836", "0", "0", "4.50769841148758",
"62.6368241954438", "264.137671097005", "14.5995605267576", "0",
"0", "0", "6.12239634993057", "10.1591411960385", "22.9421665420477",
"0.470953565379205", "2.28748874612802", "13.8594906383046",
"11.0337692460289", "18.6363053728655", "27.2480277112295", "0.0672790807684578",
"0.470953565379205", "0", "0"), X2 = c("No", "No", "Yes", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "Yes", "Yes",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No")), row.names = c(NA, 50L), class = "data.frame")
and the code I use for the curve is:
los1 %>%
mutate(prob = ifelse(X2 == "Yes", 1, 0)) %>%
ggplot(aes(X1, prob)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "lm", se=FALSE, method.args = list(family = "binomial")) +
labs(
title = "Logistic Regression Model",
x = "Plasma Glucose Concentration",
y = "Probability of being diabete-pos"
)
Any idea how to make the curve?
Another option using stat_smooth with "glm" method and "X1" converted to numeric like this:
library(tidyverse)
los1 %>%
mutate(prob = ifelse(X2 == "Yes", 1, 0)) %>%
mutate(X1 = as.numeric(X1)) %>%
ggplot(aes(X1, prob)) +
geom_point(alpha = 0.2) +
stat_smooth(method="glm", color="green", se=FALSE, method.args = list(family=binomial)) +
labs(
title = "Logistic Regression Model",
x = "Plasma Glucose Concentration",
y = "Probability of being diabete-pos"
)
#> `geom_smooth()` using formula 'y ~ x'
Created on 2022-08-30 with reprex v2.0.2
los1 <- structure(list(X1 = c("5.51688462301445", "2.55660506920185",
"4.17130300764484", "15.0032350113684", "0.0672790807684578",
"0", "10.7646529229551", "1.6819770192119", "4.44041933071867",
"2.69116323073877", "0", "0.740069888453036", "1.54741885767498",
"0.201837242305373", "1.81653518074882", "6.12239634993057",
"3.49851219996026", "22.4039338958996", "0.538232646147662",
"0.134558161536916", "1.2783025346007", "1.6819770192119", "16.9543283536541",
"60.0129400454734", "9.62090854989083", "0.470953565379205",
"33.7740985457708", "6.8624662383836", "0", "0", "4.50769841148758",
"62.6368241954438", "264.137671097005", "14.5995605267576", "0",
"0", "0", "6.12239634993057", "10.1591411960385", "22.9421665420477",
"0.470953565379205", "2.28748874612802", "13.8594906383046",
"11.0337692460289", "18.6363053728655", "27.2480277112295", "0.0672790807684578",
"0.470953565379205", "0", "0"), X2 = c("No", "No", "Yes", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "Yes", "Yes",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No")), row.names = c(NA, 50L), class = "data.frame")
str(los1)
los1$X1 <- as.numeric(los1$X1)
los1$Y <- ifelse(los1$X2 == "Yes", 1, 0)
library(ggplot2)
los1 |>
ggplot(aes(X1, Y)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "glm", se=FALSE, method.args = list(family = "binomial")) +
labs(
title = "Logistic Regression Model",
x = "Plasma Glucose Concentration",
y = "Probability of being diabete-pos"
)

ggplotly() ignores legend and produce different plot with ggplot legend

I try to use ggplotly to run ggplot graph, but the legend label not showing the same things. Why it is? Please help. Thanks
And also any idea to ignore the warning of changing to numeric data, so it doesnt show too many warning when run it through shiny.
Thanks a lot
The code are below
structure(list(...1 = c("1", "2", "3", "4", "5", "6", "7", "8",
"9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
"20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
"31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41",
"42", "43", "44", "45", "46", "47", "48", "49", "50", "51", "52",
"53", "54", "55", "56", "57", "58", "59", "60", "61", "62", "63",
"64", "65", "66", "67", "68", "69", "70", "71", "72", "73", "74",
"75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85",
"86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96",
"97", "98", "99", "100", "101", "102", "103", "104", "105", "106",
"107", "108", "109", "110", "111", "112", "113", "114", "115",
"116", "117", "118", "119", "120", "121", "122", "123", "124",
"125", "126", "127", "128", "129", "130", "131", "132", "133",
"134", "135", "136", "137", "138", "139", "140", "141", "142",
"143", "144", "145", "146", "147", "148", "149"), indexlist = c(1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149), datainput = c("112069",
"7377.02", "Unanswered", "675900", "Unanswered", "17323000",
"1935328.98", "411079", "Unanswered", "38530.29", "96.5", "89268",
"6380000", "32185.99", "102103", "Unanswered", "Question no match",
"Unanswered", "Unanswered", "1441914.2080000001", "681325", "89340.307000000001",
"234", "9278", "9809", "259550", "675900", "Unanswered", "168322",
"Unanswered", "435708.78", "962.15899999999999", "681325", "81000",
"38759", "Unanswered", "Question no match", "Unanswered", "195747",
"Unanswered", "7070890", "10739506", "65430.91", "Unanswered",
"61900", "Unanswered", "Unanswered", "5130068", "11556", "Unanswered",
"Unanswered", "102364", "Unanswered", "103451.19", "9756559.5299999993",
"16520", "644039", "16.187999999999999", "Unanswered", "Unanswered",
"13154.44", "Question no match", "Question no match", "125131",
"Unanswered", "Unanswered", "Unanswered", "608470.29", "Question no match",
"Unanswered", "Unanswered", "Unanswered", "10496.82", "195747",
"21399", "Unanswered", "214050", "1439.18", "681104", "10587765",
"11816", "69528", "Unanswered", "26519409", "Question no match",
"1013315", "17323000", "114016", "117723", "Unanswered", "Question no match",
"555872.6", "8442.34", "1995000", "Unanswered", "7208", "152495",
"372366", "132191.5", "21399", "Unanswered", "195747", "3207.89",
"Unanswered", "77629", "195747", "Question no match", "Unanswered",
"400", "Unanswered", "555872.6", "3291303", "110296.5", "Unanswered",
"55715.991999999998", "186011", "Unanswered", "Question no match",
"Unanswered", "385000", "Unanswered", "142829.75599999999", "125131",
"Question no match", "20981", "Unanswered", "186011", "9701.8629999999994",
"Unanswered", "102103", "5138", "4395555.97", "118398.916", "1638.58",
"2749023", "Unanswered", "9394598", "20960", "17323000", "1232.19",
"240468", "6963.1", "Unanswered", "348.99400000000003", "2513000",
"4449880.6100000003", "Unanswered", "Unanswered", "27522854"),
verification = c("Yes", "no information", "no answer", "Yes",
"no answer", "Yes", "Yes", "Yes", "no information", "no information",
"no information", "no information", "Yes", "Yes", "Yes",
"no answer", "No", "no information", "no answer", "Yes",
"Yes", "no information", "no information", "Yes", "Yes",
"Yes", "Yes", "no answer", "No", "no answer", "no information",
"no information", "Yes", "no information", "Yes", "no answer",
"No", "no information", "Yes", "no answer", "Yes", "Yes",
"Yes", "no answer", "Yes", "no answer", "no answer", "No",
"No", "no answer", "no information", "Yes", "no answer",
"Yes", "Yes", "Yes", "Yes", "No", "no answer", "no answer",
"Yes", "no information", "No", "No", "no information", "no answer",
"no answer", "No", "no information", "no answer", "no answer",
"no information", "No", "Yes", "No", "no answer", "Yes",
"Yes", "Yes", "Yes", "no information", "Yes", "no answer",
"Yes", "no information", "Yes", "Yes", "Yes", "Yes", "no answer",
"no information", "No", "no information", "Yes", "no answer",
"Yes", "Yes", "Yes", "Yes", "No", "no answer", "Yes", "Yes",
"no answer", "No", "Yes", "no information", "no answer",
"no information", "no answer", "No", "Yes", "No", "no information",
"No", "no answer", "no answer", "no information", "no answer",
"Yes", "no answer", "Yes", "No", "no information", "Yes",
"no answer", "no answer", "No", "no answer", "Yes", "no information",
"Yes", "No", "Yes", "Yes", "no answer", "Yes", "Yes", "Yes",
"Yes", "No", "No", "no answer", "no information", "Yes",
"Yes", "no answer", "no answer", "Yes")), row.names = c(NA,
-149L), class = c("tbl_df", "tbl", "data.frame"))->data_a
p <- data_a%>%
select(indexlist, datainput, verification) %>%
mutate_at(c("datainput"), as.numeric)%>%
drop_na(c("datainput"))%>%
ggplot(aes(x=1:length(`datainput`), y=`datainput`, label= `indexlist`, color = `verification` == "Yes"))+
scale_colour_manual(name = 'Verification',breaks = c("TRUE", "FALSE"), values = c("green", "red"), labels = c("Verified", "Non-Verified"))+
geom_point(size=1.5, alpha = 0.4)+
geom_text(aes(label= ifelse(`datainput` > quantile(`datainput`, 0.975,na.rm = T), `indexlist`,"")), vjust = "inward", hjust = "inward", size = 2, color = "grey50")+
theme_minimal()+
labs(title = "datainput Details",
x = "",
y = "")+
theme(
axis.text.x = element_text(size = 5.5),
axis.text.y = element_text(size = 5.5),
plot.title = element_text(color = "grey40", size = 9, face = "bold"))
ggplotly(p)
I have tried scale_manual_fill and colour but it doesnt work
Try to keep data cleaning/preparation separate from plotting, see cleaned data and plot, now the ggplot and plotly look the same:
library(tidyverse)
library(plotly)
# prepare the data
plotData <- data_a %>%
select(indexlist, datainput, verification) %>%
# remove non-numeric rows before converting
filter(!grepl("^[^0-9.]+$", datainput)) %>%
# prepare data for plotting
mutate(datainput = as.numeric(datainput),
x = seq(n()),
Verification = factor(ifelse(verification == "Yes", "Verified", "Non-Verified"),
levels = c("Verified", "Non-Verified")),
label = ifelse(datainput > quantile(datainput, 0.975, na.rm = TRUE),
indexlist, ""))
# then plot with clean data
p <- ggplot(plotData, aes(x = x, y = datainput,
color = Verification, label = label)) +
scale_colour_manual(values = c("green", "red"))+
geom_point(size = 1.5, alpha = 0.4) +
geom_text(vjust = "inward", hjust = "inward", size = 2, color = "grey50") +
theme_minimal() +
labs(title = "datainput Details", x = "", y = "") +
theme(axis.text.x = element_text(size = 5.5),
axis.text.y = element_text(size = 5.5),
plot.title = element_text(color = "grey40", size = 9, face = "bold"))
# now plotly
ggplotly(p)
ggplot
plotly
I have tried cleaning your data manipulation process by preparing the data before plotting.
library(dplyr)
library(plotly)
library(ggplot2)
p1 <- data_a %>%
filter(grepl('\\d+', datainput)) %>%
mutate(datainput = as.numeric(datainput),
row = as.numeric(`...1`),
verification = ifelse(verification == 'Yes', 'verified', 'Non-Verified')) %>%
ggplot(aes(row, datainput, color = verification)) +
scale_colour_manual(name = 'Verification',
values = c("green", "red")) +
geom_point(size=1.5, alpha = 0.4)+
geom_text(aes(label= ifelse(datainput > quantile(datainput, 0.975,na.rm = TRUE), indexlist,"")),
vjust = -2, hjust = "inward", size = 2, color = "grey50") +
theme_minimal()+
labs(title = "datainput Details",
x = "",
y = "")+
theme(
axis.text.x = element_text(size = 5.5),
axis.text.y = element_text(size = 5.5),
plot.title = element_text(color = "grey40", size = 9, face = "bold"))
plotly::ggplotly(p1)

error Predictor.new() function package IML in R

I am attempting to use package 'iml' in R to create plots of SHAP values from a GBM model created in H2O.
When I try to create the R6 Predictor object using the Predictor.new() function I get an error that states Error : all(feature.class %in% names(feature.types)) is not TRUE.
From this I am guessing that there is something about one of the feature classes that is incorrect, but this is just an educated guess based upon what the error message is literally saying.
Here is a sample of anonymized data (I can't share the real data because it is confidential):
structure(list(dlr_id_cur = c(1, 2), date_eff = structure(c(16014,
15416), class = "Date"), new_vec_ind = structure(c(1L, 1L), .Label = c("NNA",
"UNA"), class = "factor"), cntrct_term = c(9587879614862828,
19), amt_financed = c(9455359, 65561175), reg_payment = c(885288,
389371), acct_stat_cd = structure(c(3L, 3L), .Label = c("11",
"22", "33"), class = "factor"), base_rental = c(1, 626266), down_pymt = c(2,
6654661), car_count = c(5, 1), dur_lease = c(3974, 6466), returned = structure(1:2, .Label = c("00",
"11"), class = "factor"), state = structure(c(10L, 1L), .Label = c("ANA",
"BNA", "CNA", "DNA", "FNA", "GNA", "HNA", "INA", "KNA", "LNA",
"MNA", "NNA", "ONA", "PNA", "QNA", "RNA", "SNA", "TNA", "UNA",
"VNA", "WNA"), class = "factor"), zip = c(34633, 45222), zip_two_digits = structure(c(71L,
36L), .Label = c("00", "01", "02", "03", "04", "05", "06", "07",
"08", "09", "110", "111", "112", "113", "114", "115", "116",
"117", "118", "119", "220", "221", "222", "223", "224", "225",
"226", "227", "228", "229", "330", "331", "332", "333", "334",
"335", "336", "337", "338", "339", "440", "441", "442", "443",
"444", "445", "446", "447", "448", "449", "550", "551", "552",
"553", "554", "555", "556", "557", "558", "559", "660", "661",
"662", "663", "664", "665", "666", "667", "668", "669", "770",
"771", "772", "773", "774", "775", "776", "777", "778", "779",
"880", "881", "882", "883", "884", "885", "886", "887", "888",
"889", "990", "991", "992", "993", "994", "995", "996", "997",
"998", "999", "ANA", "BNA", "CNA", "ENA", "GNA", "HNA", "JNA",
"KNA", "LNA", "MNA", "NNA", "PNA", "RNA", "SNA", "TNA", "VNA"
), class = "factor")
, mod_year_date = c(8156, 6278), vehic_mod_fam_code = structure(c(2L,
2L), .Label = c("BNA", "CNA", "ENA", "MNA", "SNA", "TNA", "VNA",
"XNA"), class = "factor"), mod_class_code = structure(c(4L, 2L
), .Label = c("BNA", "CNA", "ENA", "GNA", "MNA", "RNA", "SNA"
), class = "factor"), count_dl_DL_CDE_CSPS_A_NP = c(945, 337),
DL_CDE_CSPS_A_NP_avg_dl = c(3355188283749626, 8835582388327814
), count_sv_DL_CDE_CSPS_A_NP = c(6532, 8475), DL_CDE_CSPS_A_NP_avg_sv = c(4471193398278526,
6934672627789796), count_dl_NUM_CSPS_INIT_SCR = c(774, 773
), NUM_CSPS_INIT_SCR_avg_dl = c(9468453388562312, 5847816458727333
), count_sv_NUM_CSPS_INIT_SCR = c(2467, 3882), NUM_CSPS_INIT_SCR_avg_sv = c(5857936629789154,
8963457353776469), count_FFV = c(8563, 2566), average_FFV = c(25697792913881564,
13693335921646120), csps_NUM_SV = c(8, 6), avg_SV_rating = c(9817541424596360,
6218928542331853), csps_FFV_ratio = c(23125612473476952,
2), avg_DL_rating = c(2182256921592387, 7668957586431513),
has_DL_rating = c(1, 8), has_bad_DL_rating = c(2, 4), serv_has_MNT = c(7,
3), serv_has_SCP = c(5, 4), serv_has_ELW = c(9, 4), serv_has_LCP = c(7,
1), ro_count = c(6, 1), ro_tot_cust_pay = c(2, 188759), ro_tot_pay = c(3,
764372), date_eff_weekday = structure(c(4L, 3L), .Label = c("FNA",
"MNA", "SNA", "TNA", "WNA"), class = "factor"), date_eff_month_int = c(83,
7), date_eff_day = c(2, 24)), .Names = c("dlr_id_cur", "date_eff",
"new_vec_ind", "cntrct_term", "amt_financed", "reg_payment",
"acct_stat_cd", "base_rental", "down_pymt", "car_count", "dur_lease",
"returned", "state", "zip", "zip_two_digits", "mod_year_date",
"vehic_mod_fam_code", "mod_class_code", "count_dl_DL_CDE_CSPS_A_NP",
"DL_CDE_CSPS_A_NP_avg_dl", "count_sv_DL_CDE_CSPS_A_NP", "DL_CDE_CSPS_A_NP_avg_sv",
"count_dl_NUM_CSPS_INIT_SCR", "NUM_CSPS_INIT_SCR_avg_dl", "count_sv_NUM_CSPS_INIT_SCR",
"NUM_CSPS_INIT_SCR_avg_sv", "count_FFV", "average_FFV", "csps_NUM_SV",
"avg_SV_rating", "csps_FFV_ratio", "avg_DL_rating", "has_DL_rating",
"has_bad_DL_rating", "serv_has_MNT", "serv_has_SCP", "serv_has_ELW",
"serv_has_LCP", "ro_count", "ro_tot_cust_pay", "ro_tot_pay",
"date_eff_weekday", "date_eff_month_int", "date_eff_day"), row.names = 1:2, class = "data.frame")
# 1. create a data frame with just the features
features_iml <- as.data.frame(df_testR) %>% dplyr::select(-returned)
# 2. Create a vector with the actual responses
response_iml <- as.numeric(as.vector(df_testR$returned))
# 3. Create custom predict function that returns the predicted values as a
# vector (probability of customer churn in my example)
pred <- function(model, newdata) {
results <- as.data.frame(h2o.predict(model, as.h2o(newdata)))
return(results[[3L]])
}
# 4. example of prediction output
pred(GBM5, features_iml) %>% head()
# 5. create Predictor object
predictor = Predictor$new(model = GBM5, data = features_iml, y =
response_iml, predict.fun = pred, class = "classification")
Error : all(feature.class %in% names(feature.types)) is not TRUE
Here are also so basic descriptions of the dataset and model object I'm
using in the code above:
class(GBM5)
[1] "H2OBinomialModel"
attr(,"package")
[1] "h2o"
class(df_testR)
[1] "tbl_df" "tbl" "data.frame"
dim(df_testR)
[1] 47006 44
If there is anything else I can provide or if I have been unclear please let me know.
In the iml package there are specific feature classes that are acceptable, namely numeric, integer, character, factor and ordered. If you have any Date objects, or any other data type than the 5 listed here than the Predictor object can not be created.

How to use tapply to match specific condition

I have accident dataset that contains number of accidents being reported. I am trying to use tapply function, that will display me the total number of accidents being reported on "Thursday". However, instead of returning number of accidents being reported for particular day. It is displaying total number of rows I have in my dataset.I am using below tapply function.:
tapply(myfinal$VEHICLE_COUNT,myfinal$DAY_OF_WEEK=='THURSDAY',length)
My sample dataset is as follows:
> dput(tail(myfinal,5))
structure(list(CASE_NUMBER = c("1251045636", "1251045630", "1251045591",
"1251045574", "1250010434"), BARRACK = c("Frederick", "Frederick",
"Frederick", "Frederick", "Jessup"), ACC_DATE = c("2012-12-31T00:00:00",
"2012-12-31T00:00:00", "2012-12-31T00:00:00", "2012-12-31T00:00:00",
"2012-12-31T00:00:00"), ACC_TIME = c("18:12", "18:12", "12:12",
"9:12", "11:12"), ACC_TIME_CODE = c("5", "5", "4", "3", "3"),
DAY_OF_WEEK = c("MONDAY ", "MONDAY ", "MONDAY ", "MONDAY ",
"MONDAY "), ROAD = c("IS 00070 EISENHOWER MEMOR HWY", "MD 00077 ROCKY RIDGE RD",
"MD 00085 BUCKEYSTOWN PIKE", "MD 00017 MYERSVILLE RD", "IS 00070 No Name"
), INTERSECT_ROAD = c("CO 00248 MONUMENT RD", "MD 00076 MOTTERS STATION RD",
"CO 00308 MANOR WOODS RD", "CO 00941 DAWN CT", "US 00029 Columbia Pike"
), DIST_FROM_INTERSECT = c("300", "0", "400", "500", "0.25"
), DIST_DIRECTION = c("E", "U", "S", "S", "E"), CITY_NAME = c("Not Applicable",
"Not Applicable", "Not Applicable", "Not Applicable", NA),
COUNTY_CODE = c("10", "10", "10", "10", "13"), COUNTY_NAME = c("Frederick",
"Frederick", "Frederick", "Frederick", "Howard"), VEHICLE_COUNT = c(1,
2, 2, 1, 2), PROP_DEST = c("NO", "YES", "YES", "NO", "NO"
), INJURY = c("YES", "NO", "NO", "YES", "YES"), COLLISION_WITH_1 = c("FIXED OBJ",
"VEH", "VEH", "NON-COLLISION", "VEH"), COLLISION_WITH_2 = c("OTHER-COLLISION",
"OTHER-COLLISION", "OTHER-COLLISION", "OTHER-COLLISION",
"OTHER-COLLISION")), .Names = c("CASE_NUMBER", "BARRACK",
"ACC_DATE", "ACC_TIME", "ACC_TIME_CODE", "DAY_OF_WEEK", "ROAD",
"INTERSECT_ROAD", "DIST_FROM_INTERSECT", "DIST_DIRECTION", "CITY_NAME",
"COUNTY_CODE", "COUNTY_NAME", "VEHICLE_COUNT", "PROP_DEST", "INJURY",
"COLLISION_WITH_1", "COLLISION_WITH_2"), row.names = 18634:18638, class = "data.frame")
Any suggestions on how to fix it! Thanks in advance!
If can only use tapply for whatever reason, then, building off of Maurits' answer, you should be able to do this:
tapply(myfinal$VEHICLE_COUNT,trimws(myfinal$DAY_OF_WEEK)=='THURSDAY',length)
Or similar. It seems that the strings in your DAY_OF_WEEK variable have a lot of whitespaces at the end. You either need to remove them (via trimws) or modify your comparison string to include these spaces (e.g., myfinal$DAY_OF_WEEK=="THURSDAY "). With the comparison operator, R will match two string only if they match exactly character by character, so any additional whitespaces in either string will count against you.
Base R solution is to subset DAY_OF_WEEK by "THURSDAY" and then return number of rows:
nrow(df[df$DAY_OF_WEEK == "THURSDAY",])
There is really no point in using tapply here!
Method 1
Use dplyr:
require(tidyverse);
df %>% filter(trimws(DAY_OF_WEEK) == "MONDAY") %>% summarise(count = n());
# count
#1 5
Method 2
In base R, use subset and table
table(subset(df, trimws(DAY_OF_WEEK) == "MONDAY")$DAY_OF_WEEK);
#MONDAY
# 5
I've used "MONDAY" here because you've got no entries with DAY_OF_WEEK = "THURSDAY".
Sample data
df <- structure(list(CASE_NUMBER = c("1251045636", "1251045630", "1251045591",
"1251045574", "1250010434"), BARRACK = c("Frederick", "Frederick",
"Frederick", "Frederick", "Jessup"), ACC_DATE = c("2012-12-31T00:00:00",
"2012-12-31T00:00:00", "2012-12-31T00:00:00", "2012-12-31T00:00:00",
"2012-12-31T00:00:00"), ACC_TIME = c("18:12", "18:12", "12:12",
"9:12", "11:12"), ACC_TIME_CODE = c("5", "5", "4", "3", "3"),
DAY_OF_WEEK = c("MONDAY ", "MONDAY ", "MONDAY ", "MONDAY ",
"MONDAY "), ROAD = c("IS 00070 EISENHOWER MEMOR HWY", "MD 00077 ROCKY RIDGE RD",
"MD 00085 BUCKEYSTOWN PIKE", "MD 00017 MYERSVILLE RD", "IS 00070 No Name"
), INTERSECT_ROAD = c("CO 00248 MONUMENT RD", "MD 00076 MOTTERS STATION RD",
"CO 00308 MANOR WOODS RD", "CO 00941 DAWN CT", "US 00029 Columbia Pike"
), DIST_FROM_INTERSECT = c("300", "0", "400", "500", "0.25"
), DIST_DIRECTION = c("E", "U", "S", "S", "E"), CITY_NAME = c("Not Applicable",
"Not Applicable", "Not Applicable", "Not Applicable", NA),
COUNTY_CODE = c("10", "10", "10", "10", "13"), COUNTY_NAME = c("Frederick",
"Frederick", "Frederick", "Frederick", "Howard"), VEHICLE_COUNT = c(1,
2, 2, 1, 2), PROP_DEST = c("NO", "YES", "YES", "NO", "NO"
), INJURY = c("YES", "NO", "NO", "YES", "YES"), COLLISION_WITH_1 = c("FIXED OBJ",
"VEH", "VEH", "NON-COLLISION", "VEH"), COLLISION_WITH_2 = c("OTHER-COLLISION",
"OTHER-COLLISION", "OTHER-COLLISION", "OTHER-COLLISION",
"OTHER-COLLISION")), .Names = c("CASE_NUMBER", "BARRACK",
"ACC_DATE", "ACC_TIME", "ACC_TIME_CODE", "DAY_OF_WEEK", "ROAD",
"INTERSECT_ROAD", "DIST_FROM_INTERSECT", "DIST_DIRECTION", "CITY_NAME",
"COUNTY_CODE", "COUNTY_NAME", "VEHICLE_COUNT", "PROP_DEST", "INJURY",
"COLLISION_WITH_1", "COLLISION_WITH_2"), row.names = 18634:18638, class = "data.frame")

How to search for specific character that has space in the end using sqldf in R

I have a dataset that contains 18 columns and columns are related to an accident being reported. I am trying to find number of accidents that were reported during specific day of the week. For example: total number of accidents reported on Tuesday. For this I am using two variable available in my dataset:Day_of_Week and number_of_vehicle. I am running below sqldf query. However, it is showing me total number of accidents reported during entire week. I would like to use sqldf to report for particular day ex: monday. I would also like to add that there is a fair amount of space in Day_oF_WEEK column.See below for example
Day_OF_Week Number_of_vehicle
MONDAY(Space here) 50
Some sample dataset
> dput(head(myf,5))
structure(list(CASE_NUMBER = c("1363000002", "1296000023", "1283000016",
"1282000006", "1267000007"), BARRACK = c("Rockville", "Berlin",
"Prince Frederick", "Leonardtown", "Essex"), ACC_DATE = c("2012-01-01T00:00:00",
"2012-01-01T00:00:00", "2012-01-01T00:00:00", "2012-01-01T00:00:00",
"2012-01-01T00:00:00"), ACC_TIME = c("2:01", "18:01", "7:01",
"0:01", "1:01"), ACC_TIME_CODE = c("1", "5", "2", "1", "1"),
DAY_OF_WEEK = c("SUNDAY ", "SUNDAY ", "SUNDAY ", "SUNDAY ",
"SUNDAY "), ROAD = c("IS 00495 CAPITAL BELTWAY", "MD 00090 OCEAN CITY EXPWY",
"MD 00765 MAIN ST", "MD 00944 MERVELL DEAN RD", "IS 00695 BALTO BELTWAY"
), INTERSECT_ROAD = c("IS 00270 EISENHOWER MEMORIAL", "CO 00220 ST MARTINS NECK RD",
"CO 00208 DUKE ST", "MD 00235 THREE NOTCH RD", "IS 00083 HARRISBURG EXPWY"
), DIST_FROM_INTERSECT = c("0", "0.25", "100", "10", "100"
), DIST_DIRECTION = c("U", "W", "S", "E", "S"), CITY_NAME = c("Not Applicable",
"Not Applicable", "Not Applicable", "Not Applicable", "Not Applicable"
), COUNTY_CODE = c("15", "23", "4", "18", "3"), COUNTY_NAME = c("Montgomery",
"Worcester", "Calvert", "St. Marys", "Baltimore"), VEHICLE_COUNT = c("2",
"1", "1", "1", "2"), PROP_DEST = c("YES", "YES", "YES", "YES",
"YES"), INJURY = c("NO", "NO", "NO", "NO", "NO"), COLLISION_WITH_1 = c("VEH",
"FIXED OBJ", "FIXED OBJ", "FIXED OBJ", "VEH"), COLLISION_WITH_2 = c("OTHER-COLLISION",
"OTHER-COLLISION", "FIXED OBJ", "OTHER-COLLISION", "OTHER-COLLISION"
)), .Names = c("CASE_NUMBER", "BARRACK", "ACC_DATE", "ACC_TIME",
"ACC_TIME_CODE", "DAY_OF_WEEK", "ROAD", "INTERSECT_ROAD", "DIST_FROM_INTERSECT",
"DIST_DIRECTION", "CITY_NAME", "COUNTY_CODE", "COUNTY_NAME",
"VEHICLE_COUNT", "PROP_DEST", "INJURY", "COLLISION_WITH_1", "COLLISION_WITH_2"
), row.names = c(NA, 5L), class = "data.frame")
sqldf Code:
sqldf("select sum(Number_of_vehicle),DAY_OF_WEEK from accident group by DAY_OF_WEEK")
Any help is appreciated!
Thanks in advance!

Resources