Which column combination is best? - R

Below is a sample of the data set I'm working with:
structure(list(SVM_WinP = c(0.558165671393824, 0.348390523096837,
0.95493337065313, 0.0388005577521338, 0.736669323790385, 0.168896453268363,
0.705743689606604, 0.992138145290718, 0.810339570932884, 0.0145257024968556,
0.541140631037918, 0.437257022506864, 0.739497092250373, 0.625243533966904,
0.150497415992424, 0.715115299694835, 0.812625896077946, 0.948612041787568,
0.26448620625025, 0.734515435526053, 0.399712486824722, 0.22554632915551,
0.0185366382857313, 0.892516918078863, 0.230435327799452, 0.912324570011269,
0.71628846770974, 0.891120531189591, 0.326028496805855, 0.788864881180649,
0.585446980301914, 0.656930686507977, 0.811559711972135, 0.422998128997747,
0.176237445430815, 0.390207772327287, 0.799289216832336, 0.453716951109004,
0.316937325078128, 0.0975479139041335, 0.212656206249349, 0.727225917796848,
0.0589924049765094, 0.704428416779042, 0.0651626151496622, 0.935195903907177,
0.22603403070976, 0.331925528930646, 0.409096405088493, 0.0897325808718911,
0.0759214065235218, 0.99043710624311, 0.70564387477249, 0.392500549890471,
0.654295861591396, 0.344903248551428, 0.261717680022946, 0.800187975491863,
0.640075093428915, 0.0713729699969963, 0.802125102402682, 0.0162662428619326,
0.675626125667836, 0.729495125292526, 0.562858566153988, 0.866141522104534,
0.000600272455124062, 0.187123818503705, 0.408661413570358, 0.662178557425881,
0.967381518793872, 0.378339466187956, 0.383488953847531, 0.82366249439814,
0.536614974041644, 0.992170654125606, 0.831021118390595, 0.159885205835131,
0.890692277474385, 0.560003938125708, 0.961825298739267, 0.0838257843353912,
0.395496873814998, 0.162668110402946, 0.785662879106253, 0.138453575967573,
0.858532308701443, 0.707366377977707, 0.451099911093265, 0.27160115225865,
0.453597876327078, 0.646715200637795, 0.446808833127976, 0.394046228758467,
0.462929523602182, 0.924041526401814, 0.341652286189242, 0.451831445282859,
0.581440687680135, 0.0925517613786266), Boosted_WinP = c(0.89402261247966,
0.483670714594616, 0.661212451602682, 0.102014455620411, 0.289828014145476,
0.0570801901599125, 0.930966558681998, 0.48005474192123, 0.359186789690061,
0.610737858474027, 0.967551717704477, 0.199246146230633, 0.177721319572841,
0.577168222508493, 0.886607806421217, 0.634723982886352, 0.976871039022192,
0.956290271491002, 0.419917349440615, 0.133670577393046, 0.0765556605556079,
0.237098217108597, 0.979801750731868, 0.672680586055324, 0.725045977926599,
0.0740428495567435, 0.891159458339062, 0.631750415599141, 0.805786964697526,
0.962123067571387, 0.621847198501487, 0.196640907520318, 0.707504018539905,
0.221570092457165, 0.653045678639389, 0.412809228568008, 0.2212228879495,
0.662859253415819, 0.107905391793683, 0.551955923724445, 0.944560603389602,
0.988726324659146, 0.649337881350091, 0.900301333510395, 0.724575587263516,
0.965813952177899, 0.0671322402562242, 0.943316151949797, 0.322598098285156,
0.449779325602717, 0.020745717658762, 0.26438258954934, 0.374250053005224,
0.630881056727796, 0.888716928427941, 0.266639380057253, 0.00935638377937442,
0.0692444468397704, 0.949930917223125, 0.140814337621726, 0.362392597610464,
0.319108310209083, 0.604714171488092, 0.778562512066814, 0.750863433868021,
0.405964734099616, 0.689208564147027, 0.305533864346514, 0.414522044813037,
0.610709181705631, 0.539622473601807, 0.789797808326043, 0.610082648317471,
0.314523722429576, 0.789090600049371, 0.986978209407206, 0.714937815310679,
0.104676883555296, 0.901508251093595, 0.719331787482761, 0.738692573763422,
0.481365914976921, 0.577694538455519, 0.330133430304781, 0.76701513198333,
0.27169636976014, 0.675104533589495, 0.38758220529306, 0.817665918132103,
0.224569935842561, 0.137642782903388, 0.38861433169735, 0.340476517281863,
0.116094571960928, 0.0788352866831896, 0.281260238428684, 0.62599650915352,
0.955783458828903, 0.929530835778402, 0.10495691432555), PLS_WinP = c(0.39354723733149,
0.54000675808501, 0.340426806552083, 0.777179557555313, 0.507897688685752,
0.694818232188098, 0.593846935483312, 0.562915835206265, 0.853709203148761,
0.0902403931506494, 0.379814264458866, 0.376016855604251, 0.565391931175011,
0.477586831523071, 0.311687045297573, 0.47364241598787, 0.882997364084817,
0.460162443155103, 0.776359443552496, 0.600616421729718, 0.0773825441336474,
0.660293116104677, 0.0590032298475717, 0.245227484789182, 0.94438787432565,
0.842687663704992, 0.728168466460016, 0.145753700343619, 0.177708509658491,
0.634430499275636, 0.19966644589498, 0.981134498034763, 0.260532257456577,
0.56496361314877, 0.156074815970709, 0.216702434267147, 0.936767970703466,
0.411816552777972, 0.107422857533626, 0.277753722359518, 0.404330137426437,
0.130127783711268, 0.0104980713589864, 0.377720838386588, 0.133519419340816,
0.503839507649224, 0.911227329852191, 0.804976040200434, 0.0907083437738657,
0.734628812113806, 0.403969138559623, 0.644615047897493, 0.68426993586742,
0.985562440072655, 0.316277451670215, 0.893243089753064, 0.759492030077886,
0.430715961391847, 0.493800854199238, 0.887996140074659, 0.590165703452082,
0.775078446791539, 0.404126648856637, 0.17202190853721, 0.426679975602619,
0.0189556454712428, 0.725679596128594, 0.940800226706401, 0.794246618869053,
0.588497639101467, 0.826495931466981, 0.298463345807837, 0.0406542203871959,
0.588265033534172, 0.518999127258825, 0.719442596652878, 0.992038770408227,
0.0033195080311974, 0.992165758916786, 0.546313016904328, 0.776291901701705,
0.118868135467985, 0.974025520272049, 0.218512787136481, 0.63449371251762,
0.817003158588241, 0.861062799577631, 0.735049339627781, 0.178070722335702,
0.200975229673079, 0.465401012160916, 0.763034561748005, 0.145932817290429,
0.768996313890048, 0.125020279707435, 0.386656887551852, 0.275941268064707,
0.750641840531497, 0.840421843079901, 0.287144353270468), LR_WinP = c(0.262697012518672,
0.417511140329806, 0.667146472801859, 0.762697854238976, 0.389722735583148,
0.769038676622114, 0.630779526927904, 0.880964844549245, 0.933629831329103,
0.0729199623950939, 0.189643700519897, 0.671188506222618, 0.640887866294401,
0.738209297814263, 0.479252798503006, 0.092371444294318, 0.791149416899593,
0.183032329838997, 0.0528440179102034, 0.713347182772784, 0.542136778846393,
0.681831696079813, 0.195373846257483, 0.943155177669363, 0.941368682433923,
0.310042041144368, 0.345805650366193, 0.155234872569669, 0.0403703846871454,
0.196127468986564, 0.494958482900134, 0.414731832661525, 0.776416544626071,
0.124761024471987, 0.761069813681869, 0.144067383397882, 0.469516565531721,
0.562066251266583, 0.779017109874317, 0.392428193314516, 0.744143376328888,
0.623046148835947, 0.169308036706163, 0.706430604178061, 0.732834938224122,
0.502769597540655, 0.473769462148849, 0.222683307247984, 0.27422066894613,
0.569776289627385, 0.638213824757715, 0.612958021854454, 0.619062017788159,
0.547975167598928, 0.270106889117638, 0.299689849516908, 0.828034148357673,
0.860401248280072, 0.867155101493026, 0.580069315267473, 0.547248973530937,
0.076345470752778, 0.658895826263306, 0.639233259924366, 0.205808774941673,
0.294023916953547, 0.432868814998754, 0.943489557958723, 0.907515638844483,
0.828845843273485, 0.850648314796682, 0.288097834753634, 0.671366290547666,
0.879037251235933, 0.0124827413572859, 0.280493897410629, 0.0162973705316591,
0.809127046301876, 0.629451067500879, 0.901499283838544, 0.391072669964811,
0.602852074122138, 0.604805662116713, 0.13735168472145, 0.42220946510903,
0.950159416430759, 0.780990331247864, 0.253484866387938, 0.507014634599896,
0.207862903175663, 0.310823586606733, 0.557734792983327, 0.391445114349291,
0.785370274380705, 0.513497291082603, 0.0303323528945423, 0.140790018934132,
0.541288508082692, 0.16373642829569, 0.0278380611642626), Cubist_WinP = c(0.241873420019825,
0.223874146182308, 0.248511146677154, 0.899382173351828, 0.215615774430975,
0.338073130677165, 0.3284297527255, 0.724296036601866, 0.615232294867287,
0.824200769438341, 0.190423734271807, 0.940860030686527, 0.995310345365077,
0.37119648225333, 0.798263037114627, 0.187888050612689, 0.232430041208821,
0.0719966044560845, 0.369515559234538, 0.825546888953371, 0.526463485504593,
0.576413871593272, 0.597121262858026, 0.586729568038402, 0.723417581533145,
0.268487181542246, 0.00469857584664435, 0.885891705923672, 0.407971613428226,
0.221324631202003, 0.548246510306384, 0.072138700588176, 0.13829208707251,
0.755757369173496, 0.758708784488404, 0.559532866513871, 0.0446287164588287,
0.270610514785508, 0.901027762454115, 0.845730039486019, 0.810239868959125,
0.394014868798588, 0.767246370744583, 0.20191425809665, 0.519366898808556,
0.585299475139399, 0.901206599700221, 0.938186421066386, 0.817601118830569,
0.998879888878252, 0.612855023767725, 0.66661039116652, 0.780886797440356,
0.416894974234899, 0.516028546997213, 0.228493779760707, 0.180492518880357,
0.0872574331179239, 0.0562605389443052, 0.92828268169163, 0.908299573720351,
0.313083322211844, 0.317939934294433, 0.833270457777705, 0.944814684090358,
0.516292344139557, 0.860624653168591, 0.815580971336458, 0.946932968305583,
0.133250948190429, 0.692126111495591, 0.00450343070881709, 0.776433481915768,
0.642709887325145, 0.0467694562058146, 0.328734895196091, 0.316914538539486,
0.631060254330745, 0.306871357989628, 0.194087017118568, 0.985472752557697,
0.428587799376777, 0.00537361261016367, 0.929737213630118, 0.178166777298782,
0.458101672143506, 0.936443042786035, 0.604477797854701, 0.27558910997039,
0.594629580959861, 0.516862946229946, 0.334134868427636, 0.35282462084093,
0.726048651254153, 0.161202150849049, 0.769852810743947, 0.170122621575997,
0.708418305775943, 0.374084046943861, 0.656103357314548), ENET_WinP = c(0.376797188438971,
0.440082695690767, 0.560426153017069, 0.884087023955913, 0.320283166512571,
0.598969241889637, 0.874791139513396, 0.403175783073465, 0.353991872837964,
0.592363070905669, 0.386467280638169, 0.83094700743756, 0.820185487372966,
0.800519723084049, 0.260872486760217, 0.201048171817858, 0.19645691107559,
0.930233435950361, 0.728311181937606, 0.818296180693016, 0.231575380948394,
0.750337750505728, 0.580894416487517, 0.0269090570776034, 0.510045476179919,
0.519527872225584, 0.958485196488128, 0.901788608954781, 0.692735158264612,
0.661114271923073, 0.530615815921976, 0.422829277192182, 0.210410591821317,
0.0667469302069553, 0.356411085751059, 0.106243550716451, 0.0966124007470056,
0.252437838222532, 0.582525442807074, 0.575721333894169, 0.471326790899397,
0.189660715838895, 0.229731603732155, 0.678046777175715, 0.784881225360488,
0.972792580610392, 0.374258819536079, 0.714300690694055, 0.438646274627636,
0.034421452176977, 0.242648781298251, 0.627506222893452, 0.276326191040387,
0.507484881023458, 0.0266678979491796, 0.64254544706957, 0.38759581458661,
0.195993496042201, 0.772067933960612, 0.490837360249447, 0.0926671344008307,
0.522179263220861, 0.772178015111987, 0.503638988702346, 0.94633126162395,
0.22168948475807, 0.533719965320603, 0.905420058401729, 0.200220652884564,
0.0183702982451034, 0.368865442768652, 0.898313017742509, 0.423437627329978,
0.866370664887362, 0.228047803289617, 0.224019495700614, 0.358686951797117,
0.876472867126893, 0.460264495428769, 0.917804441107961, 0.047160286658086,
0.584131500836106, 0.197491826799768, 0.26336310487536, 0.831135352408695,
0.33925905484227, 0.324609712686945, 0.1523567520872, 0.23537965419749,
0.65313971868625, 0.841473584513374, 0.575787196328203, 0.969116667478611,
0.887096000069419, 0.468617730179891, 0.707310991502054, 0.808830763749894,
0.965516874651877, 0.379716995266897, 0.238551839508423), Actual_Result = c(1,
0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1,
0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1)), row.names = c(NA,
-100L), class = c("tbl_df", "tbl", "data.frame"))
Each of the six *_WinP columns represents the probability that the Actual_Result column is a 1. I'm trying to figure out which model or combination of models is best based on their performance when the _WinP value is greater than 0.5238. A few examples from the data: SVM_WinP > 0.5238 = 0.58 accurate; SVM_WinP > 0.5238 & Boosted_WinP > 0.5238 = 0.54545 accurate; Boosted_WinP > 0.5238 & PLS_WinP > 0.5238 & LR_WinP > 0.5238 = 0.3636 accurate. With there being 6 different columns, there are 63 different combinations that I could use. I know how to code each one individually, but there's gotta be a better way to loop through all of these and pull the best one without writing 63 separate blocks of code (one for each combo). Really appreciate your help on this.
EDIT 1
A sample of the ending result would hopefully be the following (real result will obviously have 63 rows rather than 3):
structure(list(Models = c("SVM_WinP", "SVM_WinP,Boosted_WinP",
"Boosted_WinP,PLS_WinP,LR_WinP"), ATS = c(0.58, 0.54545, 0.3636
)), class = "data.frame", row.names = c(NA, -3L))
A couple of notes: I have created all of the possible groups that need to be accounted for with the following code:
library(magrittr)  # for the %>% pipe

groupNames <- names(Sample_Data)[1:6]
myGroups <- Map(combn,
                list(groupNames),
                seq_along(groupNames),
                simplify = FALSE) %>%
  unlist(recursive = FALSE)
I just don't know how to loop through these and calculate the ATS. ATS is just the sum of the Actual_Result column divided by the length of that column. For example, the first row of the desired result is for the SVM_WinP column only. When that column is filtered for greater than 0.5238, there are 50 rows, and 29 of them are 1. Thus, the ATS is 29/50 = 0.58.

Very intriguing question and well supported with data and example code. Thanks very much for that. Here's my approach, step by step, and a function that combines each step at the bottom.
# Define variables
vars <- setdiff(names(dat), "Actual_Result")
head(vars)
#> [1] "SVM_WinP" "Boosted_WinP" "PLS_WinP" "LR_WinP" "Cubist_WinP"
#> [6] "ENET_WinP"
# Define all combinations of variables
x <- expand.grid(vars, vars)
head(x)
#> Var1 Var2
#> 1 SVM_WinP SVM_WinP
#> 2 Boosted_WinP SVM_WinP
#> 3 PLS_WinP SVM_WinP
#> 4 LR_WinP SVM_WinP
#> 5 Cubist_WinP SVM_WinP
#> 6 ENET_WinP SVM_WinP
# Get all cases where each var in a pair is greater than some
# performance threshold y
y <- 0.5238
which(dat$SVM_WinP > y & dat$Boosted_WinP > y)
#> [1] 1 3 7 11 14 16 17 18 24 27 28 30 31 33 42 44 46 55 59 63 64 65 70 71 75
#> [26] 76 77 79 80 81 85 87 99
# Get the "Actual_Result" for each case (note: dat is treated as a plain
# data.frame here; with a tibble, index with dat$Actual_Result[...] instead)
dat[which(dat$SVM_WinP > y & dat$Boosted_WinP > y), "Actual_Result"]
#> [1] 1 1 1 1 0 1 0 1 1 0 0 0 1 1 1 1 0 1 1 0 1 0 0 0 0 1 1 0 1 0 0 0 1
# Get percent success in "Actual_Result"
mean(dat[which(dat$SVM_WinP > y & dat$Boosted_WinP > y), "Actual_Result"])
#> [1] 0.5454545
# Where x is a pair of variables and y is a performance threshold
lcs_combo <- function(x, dat, y){
  var1 <- x[[1]]
  var2 <- x[[2]]
  ATS <- mean(dat[which(dat[, var1] > y & dat[, var2] > y), "Actual_Result"])
  return(c(var1 = paste(var1), var2 = paste(var2), ATS))
}
# Result for one pair of values
lcs_combo(x[5,], dat, 0.5238)
#> var1 var2 ATS
#> [1,] "Cubist_WinP" "SVM_WinP" "0.473684210526316"
# Apply over all combinations of values
apply(X = x, MARGIN = 1, FUN = lcs_combo, dat, 0.5238)
#> [,1] [,2] [,3] [,4]
#> [1,] "SVM_WinP" "Boosted_WinP" "PLS_WinP" "LR_WinP"
#> [2,] "SVM_WinP" "SVM_WinP" "SVM_WinP" "SVM_WinP"
#> [3,] "0.58" "0.545454545454545" "0.444444444444444" "0.538461538461538"
To format the result from apply() into a format more similar to what you have in the edit of your question, wrap it in t().
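For example, a quick sketch of that formatting step (the Models/ATS column names come from the desired output in the question, not from apply() itself):
res <- t(apply(X = x, MARGIN = 1, FUN = lcs_combo, dat, 0.5238))
res <- data.frame(Models = paste(res[, 1], res[, 2], sep = ","),
                  ATS = as.numeric(res[, 3]))
head(res)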
edit: Accommodate any number of elements in each group
y <- 0.5238
out <- c()
for(i in seq_along(myGroups)){
  groups <- myGroups[[i]]
  # rowSums() fails when there's only one group
  if(length(groups) == 1){
    ATS <- mean(dat[which(dat[, groups] > y), "Actual_Result"])
  } else {
    ATS <- mean(dat[rowSums(dat[, groups] > y) == length(groups), "Actual_Result"])
  }
  out <- rbind(out, c(paste(groups, collapse = ","), ATS))
}
out
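If you want the final table sorted so the best combination comes first, here is a compact sketch of the same idea (assuming dat is a plain data.frame and myGroups is the list built above; drop = FALSE keeps the single-column case working without the special branch):
y <- 0.5238
ats_table <- data.frame(
  Models = sapply(myGroups, paste, collapse = ","),
  ATS = sapply(myGroups, function(groups) {
    keep <- rowSums(dat[, groups, drop = FALSE] > y) == length(groups)
    mean(dat$Actual_Result[keep])
  })
)
ats_table[order(-ats_table$ATS), ]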

Related

How to add empty (0) coordinates into a partially filled data.frame for a given range?

I am not totally sure how to describe my problem, so I might just need help finding the right keywords to search for.
Here is some dummy data that resembles my own; there are x and y coordinates and a z value:
require(data.table)
example <- data.table(x = c(-3, -4, -2, -1, -1, 0, 0, 0, 1, 4, 4, 5),
                      y = c(2, -2, -2, -3, -0, 3, 4, 4, -1, 4, 4, 4),
                      z = c(10, 10, 20, 30, 40, 40, 50, 70, 70, 80, 90, 90))
There are some duplicate coordinates in there, e.g. at (4,4), so the next step is to average the z values for the duplicate points:
example <- as.data.table(aggregate(z ~ x + y, data = example, FUN = "mean"))
Next, I would like to add z = 0 values for all of the coordinates that I don't have data for, e.g. (x = 0, y = 0), (x = 1, y = 1), etc., over the range -5:5 on both the x and y axes.
How do I go about this?
To clarify: I have z values for specific x and y coordinates, and I'd like to create a data table (or matrix) that has all x,y coordinates from (-5,-5) to (5,5), with z = 0 everywhere except for the specific z values I already have.
Thanks!
Maybe this is what you are looking for.
example[, .(z = mean(z)), by = .(x, y)][
  CJ(x = -5:5, y = -5:5), on = c("x", "y")][
  is.na(z), z := 0][]
       x  y  z
  1:  -5 -5  0
  2:  -5 -4  0
  3:  -5 -3  0
  4:  -5 -2  0
  5:  -5 -1  0
 ---
117:   5  1  0
118:   5  2  0
119:   5  3  0
120:   5  4 90
121:   5  5  0
Here, example[, .(z=mean(z)), by=.(x, y)] performs the data.table equivalent of your aggregate() call. The result is then joined to the Cartesian product of -5:5 with itself (11^2 = 121 observations), CJ(x=-5:5, y=-5:5), in the second chain link [CJ(x=-5:5, y=-5:5), on=c("x", "y")]. The join fills in NA for x/y combinations not present in the aggregated data, so in the final chain link the NA values of z are set to 0 with [is.na(z), z:=0]. The trailing [] prints the output.
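For comparison, here is a base R sketch of the same idea (starting from the original, un-aggregated example):
agg  <- aggregate(z ~ x + y, data = example, FUN = mean)   # average duplicates
grid <- expand.grid(x = -5:5, y = -5:5)                    # full coordinate grid
out  <- merge(grid, agg, by = c("x", "y"), all.x = TRUE)   # left join onto the grid
out$z[is.na(out$z)] <- 0                                   # fill missing z with 0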

How to change the per-step weighting coefficient in the R package DTW

I would like to change the default step-pattern weight of the cost function, because I need to standardize my results against others in a paper that doesn't use the weight 2 for the diagonal distance. I've read the JSS paper, but I only found other step patterns that are not what I'm really looking for, I guess. For example, imagine we have two time series Q and C:
Q = array(c(0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0),dim=c(8,2))
C = array(c(0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0),dim=c(8,2))
When I calculate the dtw distance, I obtain
alignment = dtw(Q,C,keep=TRUE)
With an alignment$distance of 2.41, and a cost matrix where, for example, the [2,2] element is 2 instead of 1 because of the weight (penalization) of 2*d[i,j] on the diagonal when selecting the minimum between:
g[i,j] = min( g[i-1,j-1] + 2 * d[i,j],
              g[i  ,j-1] +     d[i,j],
              g[i-1,j  ] +     d[i,j] )
plot(asymmetricP1)
edit(asymmetricP1)
structure(c(1, 1, 1, 2, 2, 3, 3, 3, 1, 0, 0, 1, 0, 2, 1, 0, 2,
1, 0, 1, 0, 1, 0, 0, -1, 0.5, 0.5, -1, 1, -1, 1, 1), .Dim = c(8L, 4L), class = "stepPattern", npat = 3, norm = "N")
Look at the plot, and consider the branches as ordered from right to left (i.e. branch1 carries the 0.5 weights).
Everything in the script below is in the context of plot(asymmetricP1) and edit(asymmetricP1):
#first 8 digit sequence: 1,1,1,2,2,3,3,3
#branch1: "1,1,1" <- number of intervals assigned specifically to branch1 (end, joint, origin)
#branch2: "2,2" <- only 2 intervals; this is the middle diagonal line.
#branch3: "3,3,3" <- number of intervals
#note: Don't be confused by the numbers themselves, i.e. "4,4,4" <- 3 intervals; "2,2,2" <- 3 intervals
#for the next sequences consider:
#the sequence of each branch is to be read as farthest from origin -> 0,0
#each interval assignment is accounted for in this order
#next 8 digit sequence: 1, 0, 0, 1, 0, 2, 1, 0,
#branch1: 1,0,0 <- interval position in relation to the query index
#branch2: 1,0 <- interval position in relation to the query index
#branch3: 2,1,0 <- interval position in relation to the query index (again see in plot)
#next 8 digit sequence: 2, 1, 0, 1, 0, 1, 0, 0
#branch1: 2,1,0 <- interval position in relation to the REFERENCE index
#branch2: 1,0 <- interval position in relation to the reference index
#branch3: 1,0,0 <- interval position in relation to the reference index (again see in plot)
#next 8 digit sequence: -1, 0.5, 0.5, -1, 1, -1, 1, 1
#note: "-1" is a signal that indicates weighting values follow
#note: notice that for each -1 that occurs, there is one value less, for example branch 1
# .....which has 3 intervals can only contain 2 weights (0.5 and 0.5)
#branch1: -1,0.5,0.5 <- changing the first 0.5 changes weight of [-1:0] segment (query index)
#branch2: -1,1 <- weight of middle branch
#branch3: -1,1,1 <- changing the second 1 changes weight of[-1,0] segment (query index)
#.Dim=c(8L, 4L):
#8 represents the number of intervals (1,1,1,2,2,3,3,3)
#4 (from what I understand) is the (length of all the branch sequences mentioned previously)/8
#npat = 3
#3 is the number of patterns you described in the structure. ie(1,1,1,2,2,3,3,3)
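If your goal is simply a diagonal weight of 1 instead of the default 2, note that the dtw package already ships a step pattern with that recursion, and stepPattern() lets you build your own. A rough sketch (the four-column row layout — pattern id, delta-query, delta-reference, weight, with -1 marking the start of a branch — is the same one decoded above; see ?stepPattern to confirm):
library(dtw)
# symmetric1: g[i,j] = min(g[i-1,j-1], g[i,j-1], g[i-1,j]) + d[i,j]
alignment1 <- dtw(Q, C, step.pattern = symmetric1, keep = TRUE)
alignment1$distance

# Or define a custom pattern with your own diagonal weight w:
w <- 1  # hypothetical choice
myPattern <- stepPattern(c(1, 1, 1, -1,
                           1, 0, 0,  w,   # diagonal step, weight w
                           2, 0, 1, -1,
                           2, 0, 0,  1,   # horizontal step
                           3, 1, 0, -1,
                           3, 0, 0,  1))  # vertical step
alignment2 <- dtw(Q, C, step.pattern = myPattern, keep = TRUE)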
Hope this helps, good luck!

Calculate different sets of value in a variable into a dataframe

I am trying to figure out how to calculate the average, median and standard deviation for each value of each variable. Here is some of the data (thanks to @Barranka for providing the data in an easy-to-copy format):
df <- data.frame(
  gama = c(10, 1, 1, 1, 1, 1, 10, 0.1, 10),
  theta = c(1, 1, 1, 1, 0.65, 1, 0.65, 1, 1),
  detectl = c(3, 5, 1, 1, 5, 3, 5, 5, 1),
  NSMOOTH = c(10, 5, 20, 20, 5, 20, 10, 10, 40),
  NREF = c(50, 80, 80, 50, 80, 50, 10, 100, 30),
  NOBS = c(10, 40, 40, 20, 20, 20, 10, 40, 10),
  sma = c(15, 15, 15, 15, 15, 15, 15, 15, 15),
  lma = c(33, 33, 33, 33, 33, 33, 33, 33, 33),
  PosTrades = c(11, 7, 6, 3, 9, 3, 6, 6, 5),
  NegTrades = c(2, 2, 1, 0, 1, 0, 1, 5, 1),
  Acc = c(0.846154, 0.777778, 0.857143, 1, 0.9, 1, 0.857143, 0.545455, 0.833333),
  AvgWin = c(0.0451529, 0.0676022, 0.0673241, 0.13204, 0.0412913, 0.126522, 0.0630061, 0.0689745, 0.0748437),
  AvgLoss = c(-0.0194498, -0.0083954, -0.0174653, NaN, -0.00264179, NaN, -0.0161558, -0.013903, -0.0278908),
  Return = c(1.54942, 1.54916, 1.44823, 1.44716, 1.42789, 1.42581, 1.40993, 1.38605, 1.38401)
)
To save it to CSV later, I have to turn it into data frames that are supposed to look like this:
Table for gama
Value Average Median Standard Deviation
10 (Avg of 10) (median of 10) (Stdev of 10)
1 (Avg of 1) (median of 1) (Stdev of 1)
0.1 (Avg of 0.1) (median of 0.1) (Stdev of 0.1)
Table for theta
Value Average Median Standard Deviation
1 (Avg of 1) (median of 1) (Stdev of 1)
0.65 (Avg of 0.65) (median of 0.65) (Stdev of 0.65)
Table for detectl
Value Average Median Standard Deviation
3 (Avg of 3) (median of 3) (Stdev of 3)
5 (Avg of 5) (median of 5) (Stdev of 5)
...
The columns to be used as ID's are:
ids <- c("gama", "theta","detectl", "NSMOOTH", "NREF", "NOBS", "sma", "lma")
Summary statistics should be computed over the following columns:
vals <- c("PosTrades", "NegTrades", "Acc", "AvgWin", "AvgLoss", "Return")
I have tried using the data.table package, but I cannot figure out how to develop an approach with data.table without renaming values one by one; also, when pursuing this approach, my code gets very complicated.
Clever use of melt() and tapply() can help you. I made the following assumptions:
You have to get the mean, median and standard deviation of the last four columns (Acc, AvgWin, AvgLoss, Return)
You need to group the data by each of the first ten columns (gama, theta, ..., NegTrades)
For reproducibility, the input is the df defined in the question above.
And here's my proposed solution:
library(reshape)
md <- melt(df, id=colnames(df)[1:10])  # This will create one row for each
                                       # 'id' combination, and will store
                                       # the rest of the column headers
                                       # in the `variable` column, with
                                       # each value corresponding to the
                                       # variable. Like this:
head(md)
##   gama theta detectl NSMOOTH NREF NOBS sma lma PosTrades NegTrades variable    value
## 1   10  1.00       3      10   50   10  15  33        11         2      Acc 0.846154
## 2    1  1.00       5       5   80   40  15  33         7         2      Acc 0.777778
## 3    1  1.00       1      20   80   40  15  33         6         1      Acc 0.857143
## 4    1  1.00       1      20   50   20  15  33         3         0      Acc 1.000000
## 5    1  0.65       5       5   80   20  15  33         9         1      Acc 0.900000
## 6    1  1.00       3      20   50   20  15  33         3         0      Acc 1.000000
results <- list()  # Prepare the results list
for(i in unique(md$variable)) {         # For each variable you have...
  results[[i]] <- list()                # ...create a new list to hold the 'summary'
  tmp_data <- subset(md, variable==i)   # Filter the data you'll use
  for(j in colnames(tmp_data)[1:10]) {  # For each id column, use tapply() to get
                                        # what you need, and store it into a data
                                        # frame inside the results
    results[[i]][[j]] <- as.data.frame(
      t(
        rbind(
          tapply(tmp_data$value, tmp_data[,j], mean),
          tapply(tmp_data$value, tmp_data[,j], median),
          tapply(tmp_data$value, tmp_data[,j], sd)
        )
      )
    )
    colnames(results[[i]][[j]]) <- c('average', 'median', 'sd')
  }
  rm(tmp_data)  # You'll no longer need this
}
Now what? Check out the summary for results:
summary(results)
## Length Class Mode
## Acc 10 -none- list
## AvgWin 10 -none- list
## AvgLoss 10 -none- list
## Return 10 -none- list
You have a list for each variable. Now, if you check out the summary for any results "sublist", you'll see this:
summary(results$Acc)
## Length Class Mode
## gama 3 data.frame list
## theta 3 data.frame list
## detectl 3 data.frame list
## NSMOOTH 3 data.frame list
## NREF 3 data.frame list
## NOBS 3 data.frame list
## sma 3 data.frame list
## lma 3 data.frame list
## PosTrades 3 data.frame list
## NegTrades 3 data.frame list
See what happens when you peek into the results$Acc$gama list:
results$Acc$gama
## average median sd
## 0.1 0.5454550 0.545455 NA
## 1 0.9069842 0.900000 0.09556548
## 10 0.8455433 0.846154 0.01191674
So, for each variable and each "id" column, you have the data summary you want.
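Since the end goal is saving these to CSV, a small sketch of that last step (the file name is hypothetical):
acc_by_gama <- results$Acc$gama
acc_by_gama <- cbind(Value = rownames(acc_by_gama), acc_by_gama)  # Value column as in the desired tables
write.csv(acc_by_gama, "acc_by_gama.csv", row.names = FALSE)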
Hope this helps.
I have an approach involving data.table.
EDIT: I tried to submit an edit to the question, but I took some liberties so it'll probably get rejected. I made assumptions about which columns were to be used as "id" columns (columns whose values subset data), and which should be "measure" columns (columns whose values are used to calculate the summary statistics). See here for these designations:
ids <- c("gama", "theta","detectl", "NSMOOTH", "NREF", "NOBS", "sma", "lma")
vals <- c("PosTrades", "NegTrades", "Acc", "AvgWin", "AvgLoss", "Return")
Setup
# Convert to data.table
library(data.table)
df <- data.table(df)

# Helper function to convert a character vector to a call,
# useful in a data.table j
s2c <- function(x, type = "list"){
  as.call(lapply(c(type, x), as.symbol))
}
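A quick illustration of what s2c() builds (hypothetical input):
s2c(c("PosTrades", "Acc"))
#> list(PosTrades, Acc)
Evaluated inside a data.table's j, that unevaluated call selects those columns from .SD.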
# Function to compute the desired summary stats
smry <- function(x) list(Average = mean(x, na.rm = TRUE),
                         Median = median(x, na.rm = TRUE),
                         StandardDeviation = sd(x, na.rm = TRUE))

# Define some names to use later
ids <- c("gama", "theta", "detectl", "NSMOOTH", "NREF", "NOBS", "sma", "lma")
vals <- c("PosTrades", "NegTrades", "Acc", "AvgWin", "AvgLoss", "Return")
usenames <- paste(rep(c("Average", "Median", "StdDev"), each = length(vals)),
                  vals, sep = "_")
Calculations in data.table
# Compute the summary statistics
df2 <- df[, j = {
  for(i in seq_along(ids)){               # loop through each id
    t.id <- ids[i]
    t.out <- .SD[, j = {
      t.vals <- .SD[, eval(s2c(vals))]    # a data.table with each of vals as a column
      sapply(t.vals, smry)                # apply the summary statistics
    }, by = t.id]                         # this by= loops through each value of the current id (t.id)
    setnames(t.out, c("id.val", usenames))  # fix the names of the data.table returned for this i
    t.out <- cbind(id = t.id, t.out)        # add a column indicating the variable name (t.id)
    if(i == 1){ big.out <- t.out } else { big.out <- rbind(big.out, t.out) }  # accumulate the output
  }
  big.out
}]
Formatting
df2 <- data.table:::melt.data.table(df2, id.vars=c("id","id.val")) # melt into "long" format
df2[,c("val","metric"):=list(gsub(".*_","",variable),gsub("_.*","",variable))] # splice names to create id's
df2[,variable:=NULL] # delete old column that had the names we just split up
df2 <- data.table:::dcast.data.table(df2, id+id.val+val~metric) # go a bit wider, so stats in diff columns
# reshape2:::acast(df2, id+id.val~metric~val) # maybe replace the above line with this
Result
        id id.val       val  Average     Median       StdDev
  1:  NOBS     10       Acc 3.214550 0.01191674  0.006052701
  2:  NOBS     10   AvgLoss 1.000000 0.06300610  1.409930000
  3:  NOBS     10    AvgWin 1.333333 0.06100090  1.447786667
  4:  NOBS     10 NegTrades 6.000000 0.84615400 -0.019449800
  5:  NOBS     10 PosTrades 7.333333 0.84554333 -0.021165467
 ---
128: theta      1   AvgLoss 1.000000 0.06897450  1.447160000
129: theta      1    AvgWin 1.571429 0.08320849  1.455691429
130: theta      1 NegTrades 6.000000 0.84615400 -0.017465300
131: theta      1 PosTrades 5.857143 0.83712329 -0.017420860
132: theta      1    Return 1.718249 0.03285638  0.068957635

How to make a confusion matrix from comparing prediction results of two algorithms?

I applied two unsupervised algorithms to the same data and would like to make a confusion matrix out of the results. How should I achieve this in R?
Here is an example with R code:
xx.1 <- c(41, 0, 4, 0, 0, 0, 0, 0, 0, 7, 0, 11, 8, 0, 0, 0, 0, 0, 3, 0, 0, 1, 1, 0, 4)
xx.2 <- matrix(xx.1, nrow = 5)
rownames(xx.2) <- paste("Algo1", 1:5, sep = "_")
colnames(xx.2) <- paste("Algo2", 1:5, sep = "_")
xx.2
xx.2 holds the prediction results of the two algorithms; the numbers show how many observations are classified as Algo1_X and Algo2_X:
        Algo2_1 Algo2_2 Algo2_3 Algo2_4 Algo2_5
Algo1_1      41       0       0       0       0
Algo1_2       0       0      11       0       1
Algo1_3       4       0       8       0       1
Algo1_4       0       0       0       3       0
Algo1_5       0       7       0       0       4
The problem is, how should I rearrange the matrix to get a confusion matrix, using the results of Algo1 as the reference? There are two sub-problems:
Determine the correspondence between the two algorithms' classes; the idea is that the most similar classifications should be paired.
Rearrange the matrix so that the diagonal holds the largest intersection values.
Here, Algo2_1 and Algo1_1 have the largest intersection value, so they are a pair; then Algo1_2 and Algo2_3 are a pair since they have the second largest value among the remaining rows/columns, so Algo2_3 should be moved to the second column.
How could I do this easily in R? Or are there packages available for this purpose?
Thanks!
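A sketch of the greedy pairing described above (assumes xx.2 as defined; for a provably optimal assignment, solve_LSAP(xx.2, maximum = TRUE) from the clue package maximizes the diagonal sum instead):
# Greedy pairing: repeatedly take the largest remaining cell, pair its row
# (Algo1 class) with its column (Algo2 class), then rule both out.
match_greedy <- function(m) {
  ord <- integer(nrow(m))
  mm  <- m
  for (k in seq_len(nrow(m))) {
    idx <- which(mm == max(mm), arr.ind = TRUE)[1, ]
    ord[idx["row"]] <- idx["col"]
    mm[idx["row"], ] <- -Inf
    mm[, idx["col"]] <- -Inf
  }
  m[, ord]  # columns reordered so matched pairs sit on the diagonal
}
match_greedy(xx.2)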

How do I perform Koyck lag transformations in PMML?

I'm using PMML to transfer my models (that I develop in R) between different platforms. One issue I often face is that given input data I need to do a lot of pre-processing. Most times this is rather straightforward in PMML but I cannot figure out how to do it when I need a Koyck lag transformation. Now the first few lines of the input data set looks like this:
          Y Z         S       Xa       Xb       Xc
1  11.37738 1 0.8414710      0.0      0.0 581102.6
2  21.29848 2 0.9092974 700254.1      0.0  35695.1
3  14.30348 3 0.1411200      0.0 384556.3      0.0
4  18.07305 4 0.0000000 413643.2      0.0      0.0
5  29.02756 5 0.0000000 604453.3      0.0 350888.2
6  20.73336 6 0.0000000      0.0      0.0 168961.2
and is generated by:
df<-structure(list(Y = c(11.3773828021943, 21.2984762226498, 14.3034834956969,
18.0730530464578, 29.0275566937015, 20.7333617643781, 30.9707039948106,
30.2428379202751, 22.1677291047936, 19.7450403054104, 18.4642890388219,
28.4145184014117, 28.5224574661743, 40.5073319897728, 40.8853498146471,
20.7173713186907, 35.8080372291603, 37.6213598048788, 38.3123458040493,
25.143519382411),
Z = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20),
S = c(0.841470984807897, 0.909297426825682, 0.141120008059867,
0, 0, 0, 0.656986598718789, 0.989358246623382,
0.412118485241757, 0, 0, 0, 0.420167036826641, 0.99060735569487,
0.650287840157117, 0, 0, 0, 0.149877209662952, 0.912945250727628),
Xa = c(0, 700254.133201206, 0, 413643.212229974, 604453.339408554,
0, 623209.174415675, 1042574.05046884, 0, 0, 397257.053501325,
441408.09060313, 0, 0, 597980.888163467, 0, 121672.230528635,
199542.274825303, 447951.083632432, 84751.5842557032),
Xb = c(0, 0, 384556.309344495, 0, 0, 0, 0, 0, 0, 0, 0,
179488.805498654, 31956.7161910341, 785611.676606721,
65452.7295721654, 0, 231214.563631705, 0, 0,
176249.685091327),
Xc = c(581102.615208462, 35695.0974169599, 0, 0, 350888.245086195,
168961.239749307, 458076.400377529, 218707.589596171,
0, 506676.223324812, 0, 25613.8139087091, 429615.016105429,
410675.885159107, 0, 229898.803944166, 2727.64268459058,
711726.797796325, 354985.810664457, 0)),
.Names = c("Y", "Z", "S", "Xa", "Xb", "Xc"),
row.names = c(NA, -20L),
class = "data.frame")
I want to create a new variable M using Koyck lags of the variables Xa, Xb and Xc, like this:
lagIt <- function(x, d, ia = mean(x))
{
  # Recursive (Koyck) lag: y[i] = x[i] + d * y[i-1], with y[1] seeded by ia * d
  y <- x
  y[1] <- y[1] + ia * d
  for (i in 2:length(x)) y[i] <- y[i] + y[i-1] * d
  y
}
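A quick illustration of what lagIt() does (hypothetical input; ia set to 0 for clarity):
lagIt(c(1, 0, 0), 0.5, ia = 0)
#> [1] 1.00 0.50 0.25
Each element carries a d-weighted copy of the accumulated previous value forward, which is the Koyck (geometric) lag.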
df2 <- transform(df, M = lagIt(tanh(Xa/300000), 0.5) +
                     lagIt(tanh(Xb/100000), 0.7) +
                     lagIt(tanh(Xc/400000), 0.3))
> head(df2)
# Y Z S Xa Xb Xc M
# 1 11.37738 1 0.8414710 0.0 0.0 581102.6 1.460318
# 2 21.29848 2 0.9092974 700254.1 0.0 35695.1 1.637388
# 3 14.30348 3 0.1411200 0.0 384556.3 0.0 1.767136
# 4 18.07305 4 0.0000000 413643.2 0.0 0.0 1.960151
# 5 29.02756 5 0.0000000 604453.3 0.0 350888.2 2.796750
# 6 20.73336 6 0.0000000 0.0 0.0 168961.2 1.761774
and finally build a model:
fit<-lm(Y~Z+S+M, data=df2)
Using the pmml library in R I can get the PMML XML output like this.
library(pmml)
pmml(fit)
However, I want to include a section of where the creation of the variable M takes place. How can I write that section conforming to PMML? Again the input data is the df data.frame and I want all pre-processing of data to be defined in PMML.
PMML operates on single-valued data records, but you're trying to use vector-valued data records. Most certainly, you cannot do (for-)loops in PMML.
Depending on your deployment platform, you might be able to use extension functions. Basically, this involves 1) programming the Koyck lag transformation, 2) turning it into a standalone extension library, and 3) making the PMML engine aware of this extension library. The extension function can then be called by name just like all other built-in and user-defined functions.
The above should be doable using the JPMML library.
