How to calculate standard deviation every 3 columns in a dataframe? - r

I have a dataframe with 4895 rows and 75 columns.
I need to calculate the standard deviation of every 3 columns, for each row.
So at the end I should have 4895 rows and 25 columns (75/3), where each column is the SD calculated across three columns.
This is part of the data frame:
structure(list(`101_LOD` = c(-0.00647656063436054, 0.00645714072316343,
0.00174533523902105, -0.000354643362187957, -0.000599093190801188,
0.00086188829059792), `101_LOD.1` = c(0.00380625456526623, -0.00398115037246045,
0.00158673927930099, -0.00537583996746438, -0.00280048350643599,
0.00348232298529063), `101_LOD.2` = c(-0.00281100080425964, -0.00335537844222041,
0.00611652518452308, -0.000738139825060029, 0.00485039477849737,
0.00412428118507656), `107_LOD` = c(0.00264717678436649, 0.00339296025595841,
0.00392733001719888, 0.0106686039973083, 0.00886643251752075,
0.0426091484273961), `107_LOD.1` = c(0.000242380702002215, -0.00116108069669281,
0.0119784744970561, 0.00380805756323248, 0.00190407945251567,
0.00199684331869391), `107_LOD.2` = c(-0.0102716279438754, -0.00706528150567528,
-0.0108745954674186, -0.0122962259781756, -0.00590383880635847,
-0.00166664119985051), `111_LOD` = c(-0.00174374098054644, 0.00383270191075735,
-0.00118363208946644, 0.00107908760333878, -9.30127551375776e-05,
-0.00141500588842743), `111_LOD.1` = c(0.000769378300959002,
0.00253820252869653, 0.00110643824418424, -0.000338050323261079,
-0.00313666295753596, 0.0043919374295125), `111_LOD.2` = c(0.000177265973907964,
0.00199829884609846, -0.000490950219515303, -0.00100263695578483,
0.00122606902671889, 0.00934018452187161), `113_LOD` = c(0.000997977666838309,
0.0062400770296875, -0.00153620247996209, 0.00136849054508488,
-0.00145700847633675, -0.000591288575933268), `113_LOD.1` = c(-0.00114161441697546,
0.00152607521404826, 0.000811193628975422, -0.000799514037634276,
-0.000319008435039752, -0.0010086036089075), `113_LOD.2` = c(-0.000722312098377764,
0.00364767954707251, 0.000547744649351312, 0.000352509651080838,
-0.000852173274761947, 0.00360487150682726), `135_LOD` = c(-0.00634051802134062,
0.00426062889500736, 0.00484049067127332, 0.00216220020394825,
0.00165634168942681, -0.00537970105199375), `135_LOD.1` = c(-0.00209301968088832,
0.00535855274344209, -0.00119679744329422, 0.0041216882161451,
0.00512978202611836, 0.0014048506490567), `135_LOD.2` = c(0.00022377545723911,
0.00400550696583795, 0.00198972253447825, 0.00301341644871015,
0.00256802839330668, 0.00946109288597202), `137_LOD` = c(-0.0108508893475138,
-0.0231919072487789, -0.00346546003410657, -0.00154066625155414,
0.0247266017774909, -0.0254464953061609), `137_LOD.1` = c(-0.00363025194918789,
-0.00291104074373261, 0.0024998477144967, 0.000877707284759669,
0.0095477003599792, 0.0501795740749602), `137_LOD.2` = c(0.00930498343499501,
-0.011839104725282, 0.000274929503053888, 0.000715665078729413,
0.0145503185102915, 0.0890428314632625), `149_LOD` = c(-0.000194406250680231,
0.000355157226357547, -0.000353931679163222, 0.000101471293242973,
-0.000429409422518444, 0.000344585379249552), `149_LOD.1` = c(-0.000494386150759807,
0.000384907974061922, 0.000582537329068263, -0.000173285705433721,
-6.92758935962043e-05, 0.00237942557324254), `149_LOD.2` = c(0.000368606958615297,
0.000432568466833549, 3.33092313366271e-05, 0.000715304544370804,
-0.000656902381786168, 0.000855422043674721), `155_LOD` = c(-0.000696168382693618,
-0.000917607266525328, 4.77049670728094e-06, 0.000140297660927979,
-5.99898679530658e-06, 6.71169142984434e-06), `155_LOD.1` = c(-0.000213644203677328,
-3.44396001911029e-07, -0.000524232671878577, -0.000830180665933627,
1.47799998238307e-06, -5.97640014667251e-05), `155_LOD.2` = c(-0.000749882784933487,
0.000345737159390042, -0.00076916001239521, -0.000135205762575321,
-2.55352420251723e-06, -3.07199008030628e-05), `31_LOD` = c(-0.00212014938530172,
0.0247411322547065, -0.00107990654365844, -0.000409195814154659,
-0.00768439381433953, 0.001860128524035), `31_LOD.1` = c(-0.00248488588195854,
-0.011146734518705, -0.000167943850441196, -0.0021998906531997,
0.0166775965182051, -0.0156939303287719), `31_LOD.2` = c(0.00210626277375321,
-0.00327815351414411, -0.00271043947479133, 0.00118991079627845,
-0.00838520090692615, 0.0255825346347586), `33_LOD` = c(0.0335175783154054,
0.0130192144768818, 0.0890608024914352, -0.0142431454793663,
0.00961009674973182, -0.0429774973256228), `33_LOD.1` = c(0.018600175159935,
0.04588362587764, 0.0517479021554752, 0.0453766081395813, -0.0483559729403664,
0.123771869764484), `33_LOD.2` = c(0.01906507758481, -0.00984821669825455,
0.134177176083007, -0.00544320457445977, 0.0516083894733814,
-0.0941500564321804), `39_LOD` = c(-0.148517395684098, -0.21311281527214,
0.112875846920874, -0.134256453140454, 0.0429030528286934, -0.0115143877745049
), `39_LOD.1` = c(-0.0431568202849291, -0.159003698955288, 0.0429009071238143,
-0.126060096927082, -0.078848020069061, -0.0788748111534866),
`39_LOD.2` = c(-0.16276833960171, 0.0236589399437796, 0.0828435027244962,
-0.50219849047847, -0.105196237549017, -0.161206838628339
), `42_LOD` = c(-0.00643926654994104, -0.0069253267922805,
7.63419856289838e-05, -0.0185223126108671, 0.00120855708103566,
-0.00275288147011515), `42_LOD.1` = c(-0.000866169150506504,
-0.00147791175852563, -0.000670310173141084, -0.00757733007180311,
0.0151353172950393, -0.00114193461500327), `42_LOD.2` = c(0.00719928454572906,
0.00311615354837406, 0.00270759483782046, -0.0108062423259522,
0.00158765505419478, -0.0034831499672973), `45_LOD` = c(0.00557787518897268,
0.022337270533665, 0.00657118689440082, -0.00247269227623608,
0.0191646343214611, 0.0233090596023039), `45_LOD.1` = c(-0.0305395220788143,
0.077105031761457, -0.00101713990356452, 0.0147500116150713,
-5.43009569586179e-05, -0.0235006181977403), `45_LOD.2` = c(-0.0216498682456909,
-0.0413426968184435, -0.0210779895848601, -0.0147549519865421,
0.00305229143870313, -0.0483293292336662), `47_LOD` = c(-0.00467568767221499,
-0.0199796182799552, 0.00985966068611855, -0.031010117051163,
0.0319279109813341, 0.0350743318265918), `47_LOD.1` = c(0.00820166533285921,
-0.00748186905620154, -0.010483251821707, -0.00921919551377505,
0.0129546148757833, 0.000223462281435923), `47_LOD.2` = c(0.00172469728530889,
0.0181683409295075, 0.00264937907258855, -0.0569837400476351,
0.00514558635349483, 0.0963339573489031), `59_LOD` = c(-0.00664210061621158,
-0.062069664217766, 0.0104345353700492, 0.0115323589989968,
-0.000701276829098035, -0.0397759501000331), `59_LOD.1` = c(-0.00844888486350536,
0.0207426674766074, -0.0227755432761471, -0.00370561240222376,
0.0152046240483297, -0.0127327412801225), `59_LOD.2` = c(-0.000546590647534814,
0.0178115310450356, 0.00776130696191998, 0.00162470375408126,
-0.036140754156005, 0.0197791914089296), `61_LOD` = c(0.00797528044191513,
-0.00358928087671818, 0.000662870138322471, -0.0412142836466128,
-0.00571822580078707, -0.0333870884803465), `61_LOD.1` = c(0.000105849888219735,
-0.00694734283847093, -0.00656216592134899, 0.00161225110022219,
0.0125744958934939, -0.0178560868664668), `61_LOD.2` = c(0.0049288443167774,
0.0059411543659837, -0.00165857112209555, -0.0093669075333705,
0.00655185371925189, 0.00516436591134869), `69_LOD` = c(0.0140014747729604,
0.0119645827116724, 0.0059880663080946, -0.00339119330845176,
0.00406436116298777, 0.00374425148741196), `69_LOD.1` = c(0.00465076983995792,
0.00664902297016735, -0.00183936649215524, 0.00496509351837152,
-0.0224812403463345, -0.0193087796456654), `69_LOD.2` = c(-0.00934638876711703,
-0.00802183076602164, 0.00406752039394799, -0.000421337136630527,
-0.00406768983408334, -0.0046016148041856), `71_LOD` = c(-0.00206064862123214,
0.0058604630066848, -0.00353440181333921, -0.000305197461077327,
0.00266085011303462, -0.00105635261106644), `71_LOD.1` = c(3.66652318354654e-06,
0.00542612739642576, 0.000860385212430484, 0.00157520645492044,
-0.00280256517377998, -0.00474358065422048), `71_LOD.2` = c(-0.00167098030843413,
0.0059622082597603, -0.00121597491543965, -0.000791592953383716,
-0.0022790991468459, 0.00508978650148816), `75_LOD` = c(NA,
-0.00562613898652477, -0.000103076958936504, -3.76628574664693e-05,
-0.000325767611573817, 0.000117404893823389), `75_LOD.1` = c(NA,
NA, -0.000496324358203359, -0.000517476831074487, -0.00213096062838051,
-0.00111202867609916), `75_LOD.2` = c(NA, NA, -0.000169651845347418,
-4.72864955070539e-05, -0.00144880109085214, 0.00421635976535877
), `79_LOD` = c(-0.0011901810540199, 0.00731686066269579,
0.00538551997145174, -0.00578723012473479, -0.0030246805255648,
0.00146141135533218), `79_LOD.1` = c(-0.00424278455960268,
-0.010593752642875, 0.0065136497427927, -0.00427355522802769,
0.000539975609490915, -0.0206849687839064), `79_LOD.2` = c(-0.00366739576561779,
-0.00374066839898667, -0.00132764684703939, -0.00534145222725701,
0.00920940542227595, -0.0101871763957068), `85_LOD` = c(-0.0120254177480422,
0.00369546541331518, -0.00420718877886963, 0.00414911885475517,
-0.00130381692844529, -0.00812757789798261), `85_LOD.1` = c(-0.00302024868281014,
0.00537704163310547, 0.00184264538884543, -0.00159032685888543,
-0.0062127769817834, 0.00349476605688194), `85_LOD.2` = c(0.0122689407380797,
-0.00509605601025503, -0.00641413996554198, 0.000592176121486696,
0.00131237912317341, -0.00535018996837309), `87_LOD` = c(0.00613621268007298,
0.000410268892659307, -0.00239014321624482, -0.00171179729894864,
-0.00107159765522861, -0.00708388174601732), `87_LOD.1` = c(0.00144787264098156,
-0.0025946273860992, -0.00194897899110034, 0.00157863310440493,
-0.0048913305554607, -0.000585669821053749), `87_LOD.2` = c(-0.00224691693198253,
-0.00277315666829267, 0.00166487067514155, -0.00173757960229744,
-0.00362252480121682, -0.0101992979591839), `93_LOD` = c(-0.0234225447373586,
0.0390095666365413, 0.00606244490932179, 0.0264258422783391,
0.0161211132913951, -0.0617678157059), `93_LOD.1` = c(-0.0124876313221369,
-0.0309636779639578, 0.00610883313140442, -0.0192442672220773,
0.0129557286224975, -0.00869066964782635), `93_LOD.2` = c(-0.0219837540560547,
-0.00521242297372905, 0.0179965615561871, 0.0081370991723329,
1.45427765512579e-06, -0.0111199632179688), `99_LOD` = c(0.00412086456443205,
-0.00259940538393106, 0.00742537463584133, -0.00302091572866969,
-0.00320466045653491, -0.00168702410433936), `99_LOD.1` = c(0.00280546156134205,
-0.00472591065687533, 0.00518402193979284, -0.00130887074314965,
0.00148769905391341, 0.00366250488078969), `99_LOD.2` = c(-0.00240469207099292,
-9.57307699040024e-05, -0.000145493235845501, 0.000667454164326723,
-0.0057445759245933, 0.00433464631989088), H_LOD = c(-6248.9128518109,
-10081.9540490064, -6696.91582671427, -5414.20614601348,
-3933.64339240365, -13153.7509294302), H_LOD.1 = c(-6.2489128518109,
-10.0819540490064, -6.69691582671427, -5.41420614601348,
-3.93364339240365, -13.1537509294302), H_LOD.2 = c(-6248.9128518109,
-10081.9540490064, -6696.91582671427, -5414.20614601348,
-3933.64339240365, -13153.7509294302)), row.names = c(NA,
6L), class = "data.frame")
What I tried
LOD_sd=aggregate(LOD_ut,list(rep(1:(ncol(LOD_ut)%/%3),each=3,len=ncol(LOD_ut))),std)[-1];
I'm stuck because of this error:
Error in aggregate.data.frame(LOD_ut, list(rep(1:(ncol(LOD_ut)%/%3), each = 3, :
arguments must have same length
Can someone help me?
Thanks

Here is an idea via base R. We split the data frame every 3 columns and create a list. We then loop over that list and calculate the rowwise standard deviation, i.e.
# rep(..., each = 3) labels columns 1-3 as group 1, columns 4-6 as group 2, and so on;
# split.default() uses those labels to cut the data frame into a list of 3-column pieces.
# For each piece, apply(i, 1, sd, na.rm = TRUE) computes the row-wise standard deviation,
# ignoring missing values, and sapply() collects the per-group results.
# NOTE(review): assumes ncol(df) is a multiple of 3 -- confirm before using on other data.
sapply(split.default(df, rep(seq((ncol(df) / 3)), each = 3)), function(i)
apply(i, 1, sd, na.rm = TRUE))

Related

not reading the function properly

I think I did not write this code properly, since R does not find an object that is defined inside the function:
# Compute a CDS spread as (default leg) / (premium annuity).
#
# Arguments:
#   loss               scalar multiplier applied to the default leg.
#   vec_ZC_prem        numeric vector, one value per premium payment date.
#   vec_ZC_def         numeric vector, one value per default time step.
#   vec_prob_suv_prem  survival probabilities at the premium payment dates
#                      (the misspelled name is kept so existing named calls still work).
#   vec_prob_surv_def  survival probabilities at the default time steps.
#
# Returns: leg_def / annuity.
#
# Note: the function reads `tenor` from the enclosing (global) environment.
comp_spread_CDS = function(loss, vec_ZC_prem, vec_ZC_def, vec_prob_suv_prem, vec_prob_surv_def)
{
  # Local alias with the intended spelling; the original body used this name,
  # which did not match the parameter and was therefore undefined.
  vec_prob_surv_prem <- vec_prob_suv_prem

  nb_payment <- length(vec_ZC_prem)
  nb_step <- length(vec_ZC_def)

  # seq_len(n - 1) is empty when n == 1, whereas 1:(n - 1) would yield c(1, 0).
  prev_surv_prem <- c(1, vec_prob_surv_prem[seq_len(nb_payment - 1)])
  prev_surv_def <- c(1, vec_prob_surv_def[seq_len(nb_step - 1)])

  # Sum of the survival probability at the start and end of each premium period;
  # the 0.5 factor in the annuity turns this sum into an average.
  vec_prob_surv_prem_eff <- vec_prob_surv_prem + prev_surv_prem
  vec_tenor <- rep(tenor, nb_payment)

  # Probability of defaulting within each step: survival drop between steps.
  vec_prob_def <- prev_surv_def - vec_prob_surv_def

  annuity <- 0.5 * sum(vec_ZC_prem * vec_prob_surv_prem_eff * vec_tenor)
  leg_def <- loss * sum(vec_ZC_def * vec_prob_def)

  spread_CDS <- leg_def / annuity
  return(spread_CDS)
}
# Contract and market parameters for the CDS example.
base = 10000
notional = 100
maturity = 5
recovery = 40/100
loss = 1 - recovery
int_rate = 3/100
intensity = 180/base
# tenor is read as a global by comp_spread_CDS.
tenor = 3/12
time_step = 1/360
# Number of premium payments: maturity divided by tenor.
nb_payment = maturity/ tenor
nb_payment
# Number of time steps: maturity divided by time_step.
nb_step = maturity/time_step
nb_step
# NOTE(review): vec_prob_surv_prem is not assigned anywhere in the code shown -- confirm it exists before this line.
# NOTE(review): `1: nb_payment-1` parses as (1:nb_payment) - 1; presumably 1:(nb_payment - 1) was intended.
c(1, vec_prob_surv_prem[1: nb_payment-1])
# NOTE(review): in the code shown, vec_prob_surv_prem_eff is assigned only inside comp_spread_CDS,
# so it is local to that function and is not visible at top level.
vec_prob_surv_prem_eff
As a result, R tells me that the object cannot be found:
vec_prob_surv_prem_eff
Erreur : objet 'vec_prob_surv_prem_eff' introuvable
thank you for your help
sincerely,

How to create a new data file from an existing dataset to load into Rattle?

My goal is to create a decision tree model in Rattle for a school project. I've been able to determine the variables that I would need for my research question and created a new dataset from the original .csv file. After saving the new dataset as both an .xls file and an .rdata file, I received an error message when loading the file into Rattle. This is my first time creating a decision tree model, so I'm struggling a bit. Thanks in advance for your help!
Here's what I have so far:
# Build a reduced data set (38 selected variables) from pfi_pu.xlsx and save it
# in formats that can be loaded afterwards.

# The package name must be passed as a quoted string.
install.packages("readxl")
library(readxl)
library(rattle)
setwd("C:/Users/river/OneDrive/Documents/Random Data")
edu <- read_excel('pfi_pu.xlsx')

# Variables to keep, in the order they should appear in the new data set.
keep_cols <- c(
  "P1HRSWK", "P1EMPL", "P2HRSWK", "P2EMPL",
  "P1ENRL", "P2ENRL", "P1EDUC", "P2EDUC",
  "P1HISPRM", "P2HISPRM", "P1PACI", "P2PACI",
  "P1BLACK", "P2BLACK", "P1ASIAN", "P2ASIAN",
  "P1AMIND", "P2AMIND", "P1HISPAN", "P2HISPAN",
  "P1LKWRK", "P2LKWRK", "P1MTHSWRK",
  "P1REL", "P2REL", "P1SEX", "P2SEX", "P1MRSTA",
  "SEFUTUREX", "HSFUTUREX", "PARGRADEX", "TTLHHINC",
  "PAR1EMPL", "PAR2EMPL",
  "SEEXPEL", "SESUSPIN", "SESUSOUT", "SEGRADEQ"
)

# Select the columns directly. Wrapping them in c(...) concatenates every column
# into one long vector, so data.frame() would produce a single column.
eduu <- as.data.frame(edu[, keep_cols])

# save() writes R's own binary format, so the file gets an .RData extension;
# a CSV copy is written as well as a plain-text alternative.
save(eduu, file = "eduu.RData")
write.csv(eduu, "eduu.csv", row.names = FALSE)
error message
Seems your problem is about writing a file. The command save must be used to save .RData files, not Excel files. According to this post, you may try:
openxlsx::write.xlsx(eduu, 'eduu.xlsx')
xlsx::write.xlsx(eduu, 'eduu.xlsx')
writexl::write_xlsx(eduu, 'eduu.xlsx')

Creating advance lua array table from list view and call the array elements

Say I have a CE Lua form and some variables:
-- Build ONE player record that carries the player fields as named keys and
-- every list-view row as a numbered sub-table (record[1], record[2], ...),
-- instead of inserting each row as a separate top-level record.
form.Show()
list = form.CEListView1
tab_player = {}
p_name = 'Joe'
p_gen = 'Male'

local record = {player_name = p_name, player_gen = p_gen}

-- Same row range as before: from the row after the selected one to the last row.
for idx = list.ItemIndex + 1, list.Items.Count - 1 do
  local item = list.Items[idx]
  record[#record + 1] = {
    v_itemname = item.Caption,
    v_itemqty = item.SubItems[0],
    v_itemunit = item.SubItems[1],
    v_itemprice = item.SubItems[2],
    v_itemttlprice = item.SubItems[3]
  }
end

table.insert(tab_player, record)

-- check: print each record's player fields, then each nested item row
for index, data in ipairs(tab_player) do
  print(index, data.player_name, data.player_gen)
  for row, entry in ipairs(data) do
    print('\t', row)
    for key, value in pairs(entry) do
      print('\t\t', key, value)
    end
  end
end
As a result, it creates 9 tab_player record indexes (depending on how many items are in the list view).
What I want is like this structure for one record index:
tab_player =
{
player_name = p_name,
player_gen = p_gen,
{
v_itemname = mtrl_name,
v_itemqty = mtrl_qty,
v_itemunit = mtrl_unit,
v_itemprice = mtrl_price,
v_itemttlprice = mtrl_tprice},
{
v_itemname = mtrl_name,
v_itemqty = mtrl_qty,
v_itemunit = mtrl_unit,
v_itemprice = mtrl_price,
v_itemttlprice = mtrl_tprice},
{
v_itemname = mtrl_name,
v_itemqty = mtrl_qty,
v_itemunit = mtrl_unit,
v_itemprice = mtrl_price,
v_itemttlprice = mtrl_tprice}
-- and so on
}
How can a CE Lua script build the structure I want?
Once that is done, how can a CE Lua script read the data back from tab_player to fill the player name edit box, the player gen edit box, and the items in the CE List View?
EDIT:
What I want is to produce an array table with the structure below:
list = UDF1.CEListView1
tab_player = {}
player_name = 'Joe'
player_gen = 'Male'
-- this is list view items contain:
--- row 1, column 1 to 5
mtrl_name = list.Items[1].Caption -- Milk
mtrl_qty = list.Items[1].SubItems[0] -- 300
mtrl_unit = list.Items[1].SubItems[1] -- ml
mtrl_price = list.Items[1].SubItems[2] -- 3975
mtrl_tprice = list.Items[1].SubItems[3] -- 3975
--- row 2, column 1 to 5
mtrl_name = list.Items[2].Caption -- Sugar
mtrl_qty = list.Items[2].SubItems[0] -- 1
mtrl_unit = list.Items[2].SubItems[1] -- Kg
mtrl_price = list.Items[2].SubItems[2] -- 18000
mtrl_tprice = list.Items[2].SubItems[3] -- 18000
--- row 3, column 1 to 5 and so on
the tab_player should be:
tab_player = {
  -- record 1 (Lua tables are 1-based, so this is tab_player[1])
  {
    player_name = 'Joe', player_gen = 'Male',
    -- row 1, column 1 to 5
    {
      item_name = 'Milk',
      item_qty = 300,
      item_unit = 'ml',
      item_price = 3975,
      item_tprice = 3975
    },
    -- row 2, column 1 to 5
    {
      item_name = 'Sugar',
      item_qty = 2,
      item_unit = 'Kg',
      item_price = 9000,
      item_tprice = 18000
    },
    -- row 3, column 1 to 5
    {
      item_name = 'bla bla bla',
      item_qty = 1,
      item_unit = 'bla',
      item_price = 1000000,
      item_tprice = 1000000
    }
    -- and so on
  }
}
How do I create and print a multidimensional array table like the one above, and how do I read the items back from it?

I need a hint to sort out this issue

I have this form:
# Form class declaring the fields shown on the registration page.
class RegisterForm(FlaskForm):
"""Registration form."""
# Required text field, validated with Length(1, 64).
username = StringField('Username', validators=[Required(), Length(1, 64)])
# Required password field, validated with Length(4, 4).
# NOTE(review): presumably this forces exactly 4 characters -- confirm that is intended.
password = PasswordField('Password', validators=[Required(), Length(4, 4)])
# Confirmation field: EqualTo('password') ties it to the `password` field above.
password_again = PasswordField('Password again',
validators=[Required(), EqualTo('password'), Length(4, 4)])
# Required select field; each choice is a (submitted value, displayed label) pair.
tc = SelectField(validators=[Required()],
choices=[("m_", "Yes"),
("n", "No")])
submit = SubmitField('Register')
And I want to add secrets.randbelow(n) next to "m_"

R GBM versus Spark GBT performance

I'm trying to compare performance between R and Spark-ML and my initial testing tells me that Spark-ML is better than R in most cases and scales much better when the dataset gets bigger.
However, I'm having strange results when it comes to Gradient Boosted Trees, especially because R takes 3 minutes where Spark takes 15 on the same dataset, on the same computer.
Here is the R code:
# Load the training data (semicolon-separated, "." as decimal mark).
train <- read.table("c:/Path/to/file.csv", header = TRUE, sep = ";", dec = ".")

# X1..X9 are categorical predictors: convert them all to factors in one step.
factor_cols <- paste0("X", 1:9)
train[factor_cols] <- lapply(train[factor_cols], factor)

library(gbm)

# Gradient boosting with depth-1 trees and no subsampling.
# The learning-rate argument of gbm() is `shrinkage`; `shrinkY1` is not a gbm()
# argument and would be rejected as unused.
boost <- gbm(
  Freq ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + Y1,
  distribution = "gaussian",
  data = train,
  n.trees = 2000,
  bag.fraction = 1,
  shrinkage = 1,
  interaction.depth = 1,
  n.minobsinnode = 50,
  train.fraction = 1.0,
  cv.folds = 0,
  keep.data = TRUE
)
And here is the scala code for Spark
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.regression.GBTRegressor
// Local Spark configuration: 8 local threads plus explicit memory and timeout settings.
val conf = new SparkConf()
  .setAppName("GBTExample")
  .set("spark.driver.memory", "8g")
  .set("spark.executor.memory", "8g")
  .set("spark.network.timeout", "120s")
  .setMaster("local[8]")
val sc = SparkContext.getOrCreate(conf)
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._

// Read the semicolon-delimited CSV with a header row and inferred column types.
val csvOptions = Map(
  "header" -> "true",
  "delimiter" -> ";",
  "inferSchema" -> "true"
)
val sourceData = spark.read
  .format("com.databricks.spark.csv")
  .options(csvOptions)
  .load("c:/Path/to/file.csv")

// Keep the nine categorical predictors as read; cast the numeric predictor and the label to double.
val data = sourceData.select(
  $"X1", $"X2", $"X3", $"X4", $"X5", $"X6", $"X7", $"X8", $"X9",
  $"Y1".cast("double"),
  $"Freq".cast("double")
)

// One StringIndexer per categorical column, writing to "<name>Idx".
val categoricalCols = (1 to 9).map(i => s"X$i")
val indexedCols = categoricalCols.map(name => s"${name}Idx")
val indexers = categoricalCols.zip(indexedCols).map { case (in, out) =>
  new StringIndexer().setInputCol(in).setOutputCol(out)
}

// Assemble the indexed categorical columns plus Y1 into a single feature vector.
val assembler = new VectorAssembler()
  .setInputCols((indexedCols :+ "Y1").toArray)
  .setOutputCol("features")

// Gradient-boosted trees: 2000 depth-1 trees, step size 1, no subsampling.
val dt = new GBTRegressor()
  .setFeaturesCol("features")
  .setLabelCol("Freq")
  .setImpurity("variance")
  .setMaxDepth(1)
  .setMaxBins(32)
  .setMinInstancesPerNode(50)
  .setMaxIter(2000)
  .setStepSize(1)
  .setSubsamplingRate(1)

// Stage order matters: the indexers run first, then the assembler, then the regressor.
val stages = (indexers ++ Seq[org.apache.spark.ml.PipelineStage](assembler, dt))
  .toArray[org.apache.spark.ml.PipelineStage]
val pipeline = new Pipeline().setStages(stages)
val model = pipeline.fit(data)
I have the feeling that I'm not comparing the same methods here, but the documentation that I could find did not clarify the situation.

Resources