How to calculate standard deviation every 3 columns in a dataframe? - r
I have a dataframe with 4895 rows and 75 columns.
I need to calculate the standard deviation of every 3 consecutive columns, for each row.
So at the end I should have 4895 rows and 25 columns (75/3), where each column is the SD calculated across three columns.
This is part of the data frame:
structure(list(`101_LOD` = c(-0.00647656063436054, 0.00645714072316343,
0.00174533523902105, -0.000354643362187957, -0.000599093190801188,
0.00086188829059792), `101_LOD.1` = c(0.00380625456526623, -0.00398115037246045,
0.00158673927930099, -0.00537583996746438, -0.00280048350643599,
0.00348232298529063), `101_LOD.2` = c(-0.00281100080425964, -0.00335537844222041,
0.00611652518452308, -0.000738139825060029, 0.00485039477849737,
0.00412428118507656), `107_LOD` = c(0.00264717678436649, 0.00339296025595841,
0.00392733001719888, 0.0106686039973083, 0.00886643251752075,
0.0426091484273961), `107_LOD.1` = c(0.000242380702002215, -0.00116108069669281,
0.0119784744970561, 0.00380805756323248, 0.00190407945251567,
0.00199684331869391), `107_LOD.2` = c(-0.0102716279438754, -0.00706528150567528,
-0.0108745954674186, -0.0122962259781756, -0.00590383880635847,
-0.00166664119985051), `111_LOD` = c(-0.00174374098054644, 0.00383270191075735,
-0.00118363208946644, 0.00107908760333878, -9.30127551375776e-05,
-0.00141500588842743), `111_LOD.1` = c(0.000769378300959002,
0.00253820252869653, 0.00110643824418424, -0.000338050323261079,
-0.00313666295753596, 0.0043919374295125), `111_LOD.2` = c(0.000177265973907964,
0.00199829884609846, -0.000490950219515303, -0.00100263695578483,
0.00122606902671889, 0.00934018452187161), `113_LOD` = c(0.000997977666838309,
0.0062400770296875, -0.00153620247996209, 0.00136849054508488,
-0.00145700847633675, -0.000591288575933268), `113_LOD.1` = c(-0.00114161441697546,
0.00152607521404826, 0.000811193628975422, -0.000799514037634276,
-0.000319008435039752, -0.0010086036089075), `113_LOD.2` = c(-0.000722312098377764,
0.00364767954707251, 0.000547744649351312, 0.000352509651080838,
-0.000852173274761947, 0.00360487150682726), `135_LOD` = c(-0.00634051802134062,
0.00426062889500736, 0.00484049067127332, 0.00216220020394825,
0.00165634168942681, -0.00537970105199375), `135_LOD.1` = c(-0.00209301968088832,
0.00535855274344209, -0.00119679744329422, 0.0041216882161451,
0.00512978202611836, 0.0014048506490567), `135_LOD.2` = c(0.00022377545723911,
0.00400550696583795, 0.00198972253447825, 0.00301341644871015,
0.00256802839330668, 0.00946109288597202), `137_LOD` = c(-0.0108508893475138,
-0.0231919072487789, -0.00346546003410657, -0.00154066625155414,
0.0247266017774909, -0.0254464953061609), `137_LOD.1` = c(-0.00363025194918789,
-0.00291104074373261, 0.0024998477144967, 0.000877707284759669,
0.0095477003599792, 0.0501795740749602), `137_LOD.2` = c(0.00930498343499501,
-0.011839104725282, 0.000274929503053888, 0.000715665078729413,
0.0145503185102915, 0.0890428314632625), `149_LOD` = c(-0.000194406250680231,
0.000355157226357547, -0.000353931679163222, 0.000101471293242973,
-0.000429409422518444, 0.000344585379249552), `149_LOD.1` = c(-0.000494386150759807,
0.000384907974061922, 0.000582537329068263, -0.000173285705433721,
-6.92758935962043e-05, 0.00237942557324254), `149_LOD.2` = c(0.000368606958615297,
0.000432568466833549, 3.33092313366271e-05, 0.000715304544370804,
-0.000656902381786168, 0.000855422043674721), `155_LOD` = c(-0.000696168382693618,
-0.000917607266525328, 4.77049670728094e-06, 0.000140297660927979,
-5.99898679530658e-06, 6.71169142984434e-06), `155_LOD.1` = c(-0.000213644203677328,
-3.44396001911029e-07, -0.000524232671878577, -0.000830180665933627,
1.47799998238307e-06, -5.97640014667251e-05), `155_LOD.2` = c(-0.000749882784933487,
0.000345737159390042, -0.00076916001239521, -0.000135205762575321,
-2.55352420251723e-06, -3.07199008030628e-05), `31_LOD` = c(-0.00212014938530172,
0.0247411322547065, -0.00107990654365844, -0.000409195814154659,
-0.00768439381433953, 0.001860128524035), `31_LOD.1` = c(-0.00248488588195854,
-0.011146734518705, -0.000167943850441196, -0.0021998906531997,
0.0166775965182051, -0.0156939303287719), `31_LOD.2` = c(0.00210626277375321,
-0.00327815351414411, -0.00271043947479133, 0.00118991079627845,
-0.00838520090692615, 0.0255825346347586), `33_LOD` = c(0.0335175783154054,
0.0130192144768818, 0.0890608024914352, -0.0142431454793663,
0.00961009674973182, -0.0429774973256228), `33_LOD.1` = c(0.018600175159935,
0.04588362587764, 0.0517479021554752, 0.0453766081395813, -0.0483559729403664,
0.123771869764484), `33_LOD.2` = c(0.01906507758481, -0.00984821669825455,
0.134177176083007, -0.00544320457445977, 0.0516083894733814,
-0.0941500564321804), `39_LOD` = c(-0.148517395684098, -0.21311281527214,
0.112875846920874, -0.134256453140454, 0.0429030528286934, -0.0115143877745049
), `39_LOD.1` = c(-0.0431568202849291, -0.159003698955288, 0.0429009071238143,
-0.126060096927082, -0.078848020069061, -0.0788748111534866),
`39_LOD.2` = c(-0.16276833960171, 0.0236589399437796, 0.0828435027244962,
-0.50219849047847, -0.105196237549017, -0.161206838628339
), `42_LOD` = c(-0.00643926654994104, -0.0069253267922805,
7.63419856289838e-05, -0.0185223126108671, 0.00120855708103566,
-0.00275288147011515), `42_LOD.1` = c(-0.000866169150506504,
-0.00147791175852563, -0.000670310173141084, -0.00757733007180311,
0.0151353172950393, -0.00114193461500327), `42_LOD.2` = c(0.00719928454572906,
0.00311615354837406, 0.00270759483782046, -0.0108062423259522,
0.00158765505419478, -0.0034831499672973), `45_LOD` = c(0.00557787518897268,
0.022337270533665, 0.00657118689440082, -0.00247269227623608,
0.0191646343214611, 0.0233090596023039), `45_LOD.1` = c(-0.0305395220788143,
0.077105031761457, -0.00101713990356452, 0.0147500116150713,
-5.43009569586179e-05, -0.0235006181977403), `45_LOD.2` = c(-0.0216498682456909,
-0.0413426968184435, -0.0210779895848601, -0.0147549519865421,
0.00305229143870313, -0.0483293292336662), `47_LOD` = c(-0.00467568767221499,
-0.0199796182799552, 0.00985966068611855, -0.031010117051163,
0.0319279109813341, 0.0350743318265918), `47_LOD.1` = c(0.00820166533285921,
-0.00748186905620154, -0.010483251821707, -0.00921919551377505,
0.0129546148757833, 0.000223462281435923), `47_LOD.2` = c(0.00172469728530889,
0.0181683409295075, 0.00264937907258855, -0.0569837400476351,
0.00514558635349483, 0.0963339573489031), `59_LOD` = c(-0.00664210061621158,
-0.062069664217766, 0.0104345353700492, 0.0115323589989968,
-0.000701276829098035, -0.0397759501000331), `59_LOD.1` = c(-0.00844888486350536,
0.0207426674766074, -0.0227755432761471, -0.00370561240222376,
0.0152046240483297, -0.0127327412801225), `59_LOD.2` = c(-0.000546590647534814,
0.0178115310450356, 0.00776130696191998, 0.00162470375408126,
-0.036140754156005, 0.0197791914089296), `61_LOD` = c(0.00797528044191513,
-0.00358928087671818, 0.000662870138322471, -0.0412142836466128,
-0.00571822580078707, -0.0333870884803465), `61_LOD.1` = c(0.000105849888219735,
-0.00694734283847093, -0.00656216592134899, 0.00161225110022219,
0.0125744958934939, -0.0178560868664668), `61_LOD.2` = c(0.0049288443167774,
0.0059411543659837, -0.00165857112209555, -0.0093669075333705,
0.00655185371925189, 0.00516436591134869), `69_LOD` = c(0.0140014747729604,
0.0119645827116724, 0.0059880663080946, -0.00339119330845176,
0.00406436116298777, 0.00374425148741196), `69_LOD.1` = c(0.00465076983995792,
0.00664902297016735, -0.00183936649215524, 0.00496509351837152,
-0.0224812403463345, -0.0193087796456654), `69_LOD.2` = c(-0.00934638876711703,
-0.00802183076602164, 0.00406752039394799, -0.000421337136630527,
-0.00406768983408334, -0.0046016148041856), `71_LOD` = c(-0.00206064862123214,
0.0058604630066848, -0.00353440181333921, -0.000305197461077327,
0.00266085011303462, -0.00105635261106644), `71_LOD.1` = c(3.66652318354654e-06,
0.00542612739642576, 0.000860385212430484, 0.00157520645492044,
-0.00280256517377998, -0.00474358065422048), `71_LOD.2` = c(-0.00167098030843413,
0.0059622082597603, -0.00121597491543965, -0.000791592953383716,
-0.0022790991468459, 0.00508978650148816), `75_LOD` = c(NA,
-0.00562613898652477, -0.000103076958936504, -3.76628574664693e-05,
-0.000325767611573817, 0.000117404893823389), `75_LOD.1` = c(NA,
NA, -0.000496324358203359, -0.000517476831074487, -0.00213096062838051,
-0.00111202867609916), `75_LOD.2` = c(NA, NA, -0.000169651845347418,
-4.72864955070539e-05, -0.00144880109085214, 0.00421635976535877
), `79_LOD` = c(-0.0011901810540199, 0.00731686066269579,
0.00538551997145174, -0.00578723012473479, -0.0030246805255648,
0.00146141135533218), `79_LOD.1` = c(-0.00424278455960268,
-0.010593752642875, 0.0065136497427927, -0.00427355522802769,
0.000539975609490915, -0.0206849687839064), `79_LOD.2` = c(-0.00366739576561779,
-0.00374066839898667, -0.00132764684703939, -0.00534145222725701,
0.00920940542227595, -0.0101871763957068), `85_LOD` = c(-0.0120254177480422,
0.00369546541331518, -0.00420718877886963, 0.00414911885475517,
-0.00130381692844529, -0.00812757789798261), `85_LOD.1` = c(-0.00302024868281014,
0.00537704163310547, 0.00184264538884543, -0.00159032685888543,
-0.0062127769817834, 0.00349476605688194), `85_LOD.2` = c(0.0122689407380797,
-0.00509605601025503, -0.00641413996554198, 0.000592176121486696,
0.00131237912317341, -0.00535018996837309), `87_LOD` = c(0.00613621268007298,
0.000410268892659307, -0.00239014321624482, -0.00171179729894864,
-0.00107159765522861, -0.00708388174601732), `87_LOD.1` = c(0.00144787264098156,
-0.0025946273860992, -0.00194897899110034, 0.00157863310440493,
-0.0048913305554607, -0.000585669821053749), `87_LOD.2` = c(-0.00224691693198253,
-0.00277315666829267, 0.00166487067514155, -0.00173757960229744,
-0.00362252480121682, -0.0101992979591839), `93_LOD` = c(-0.0234225447373586,
0.0390095666365413, 0.00606244490932179, 0.0264258422783391,
0.0161211132913951, -0.0617678157059), `93_LOD.1` = c(-0.0124876313221369,
-0.0309636779639578, 0.00610883313140442, -0.0192442672220773,
0.0129557286224975, -0.00869066964782635), `93_LOD.2` = c(-0.0219837540560547,
-0.00521242297372905, 0.0179965615561871, 0.0081370991723329,
1.45427765512579e-06, -0.0111199632179688), `99_LOD` = c(0.00412086456443205,
-0.00259940538393106, 0.00742537463584133, -0.00302091572866969,
-0.00320466045653491, -0.00168702410433936), `99_LOD.1` = c(0.00280546156134205,
-0.00472591065687533, 0.00518402193979284, -0.00130887074314965,
0.00148769905391341, 0.00366250488078969), `99_LOD.2` = c(-0.00240469207099292,
-9.57307699040024e-05, -0.000145493235845501, 0.000667454164326723,
-0.0057445759245933, 0.00433464631989088), H_LOD = c(-6248.9128518109,
-10081.9540490064, -6696.91582671427, -5414.20614601348,
-3933.64339240365, -13153.7509294302), H_LOD.1 = c(-6.2489128518109,
-10.0819540490064, -6.69691582671427, -5.41420614601348,
-3.93364339240365, -13.1537509294302), H_LOD.2 = c(-6248.9128518109,
-10081.9540490064, -6696.91582671427, -5414.20614601348,
-3933.64339240365, -13153.7509294302)), row.names = c(NA,
6L), class = "data.frame")
What I tried
LOD_sd=aggregate(LOD_ut,list(rep(1:(ncol(LOD_ut)%/%3),each=3,len=ncol(LOD_ut))),std)[-1];
I'm stuck because of this error:
Error in aggregate.data.frame(LOD_ut, list(rep(1:(ncol(LOD_ut)%/%3), each = 3, :
arguments must have same length
Can someone help me?
Thanks
Here is an idea via base R. We split the data frame every 3 columns and create a list. We then loop over that list and calculate the rowwise standard deviation, i.e.
sapply(split.default(df, rep(seq((ncol(df) / 3)), each = 3)), function(i)
apply(i, 1, sd, na.rm = TRUE))
Related
not reading the function properly
I think i did not write this code properly since it does not find the proper code inside the function : comp_spread_CDS = function(loss, vec_ZC_prem, vec_ZC_def, vec_prob_suv_prem, vec_prob_surv_def) { nb_payment = lenght(vec__VC_prem) nb_step = lengh(vec_ZC_def) vec_prob_surv_prem_eff = vec_prob_surv_prem + c(1, vec_prob_surv_prem[1 :(nb_payment - 1)]) vec_tenor = rep(tenor, nb_payment) vec_prob_def = c(1, vec_prob_surv_def[1:(nb_step-1)]) - vec_prob_surv_def annuity = 0.5 * sum(vec_ZC_prem * vec_prob_surv_prem_eff * vec_tenor) leg_def = los * sum( vec_ZC_def * vec_prob_def) spread_CDS = leg_def / annuity return(spread_CDS) } base = 10000 notional = 100 maturity = 5 recovery = 40/100 loss = 1 - recovery int_rate = 3/100 intensity = 180/base tenor = 3/12 time_step = 1/360 nb_payment = maturity/ tenor nb_payment nb_step = maturity/time_step nb_step c(1, vec_prob_surv_prem[1: nb_payment-1]) vec_prob_surv_prem_eff As a result it is telling me that the object is not find by R. vec_prob_surv_prem_eff Erreur : objet 'vec_prob_surv_prem_eff' introuvable thank you for your help sincerely,
How to create a new data file from an existing dataset to load into Rattle?
My goal is to create a decision tree model in Rattle for a school project. I've been able to determine the variables that I would need for my research question and created a new dataset from the original .csv file. After saving the new dataset as not only an .xls file and a .rdata file, I received an error message after loading the file into Rattle. This is my first time creating a decision tree model so I'm struggling a bit. Thanks in advance for your help! Here's what I have so far: install.packages(readxl) library(readxl) library(rattle) setwd("C:/Users/river/OneDrive/Documents/Random Data") edu <- read_excel('pfi_pu.xlsx') eduu <- data.frame(c("P1HRSWK" = c(edu$P1HRSWK), "P1EMPL" = c(edu$P1EMPL), "P2HRSWK" = c(edu$P2HRSWK), "P2EMPL" = c(edu$P2EMPL), "P1ENRL" = c(edu$P1ENRL), "P2ENRL" = c(edu$P2ENRL), "P1EDUC" = c(edu$P1EDUC), "P2EDUC" = c(edu$P2EDUC), "P1HISPRM" = c(edu$P1HISPRM), "P2HISPRM" = c(edu$P2HISPRM), "P1PACI" = c(edu$P1PACI), "P2PACI" = c(edu$P2PACI), "P1BLACK" = c(edu$P1BLACK), "P2BLACK" = c(edu$P2BLACK), "P1ASIAN" = c(edu$P1ASIAN), "P2ASIAN" = c(edu$P2ASIAN), "P1AMIND" = c(edu$P1AMIND), "P2AMIND" = c(edu$P2AMIND), "P1HISPAN" = c(edu$P1HISPAN), "P2HISPAN" = c(edu$P2HISPAN), "P1LKWRK" = c(edu$P1LKWRK), "P2LKWRK" = c(edu$P2LKWRK), "P1MTHSWRK" = c(edu$P1MTHSWRK), "P1REL" = c(edu$P1REL), "P2REL" = c(edu$P2REL), "P1SEX" = c(edu$P1SEX), "P2SEX" = c(edu$P2SEX), "P1MRSTA" = c(edu$P1MRSTA), "SEFUTUREX" = c(edu$SEFUTUREX), "HSFUTUREX" = c(edu$HSFUTUREX), "PARGRADEX" = c(edu$PARGRADEX), "TTLHHINC" = c(edu$TTLHHINC), "PAR1EMPL" = c(edu$PAR1EMPL), "PAR2EMPL" = c(edu$PAR2EMPL), "SEEXPEL" = c(edu$SEEXPEL), "SESUSPIN" = c(edu$SESUSPIN), "SESUSOUT" = c(edu$SESUSOUT), "SEGRADEQ" = c(edu$SEGRADEQ) ,dim = c(14075,38,1)) save(eduu,file="eduu.xls") error message
Seems your problem is about writing a file. The command save must be used to save .RData files, not Excel files. According to this post, you may try: openxlsx::write.xlsx(eduu, 'eduu.xlsx') xlsx::write.xlsx(eduu, 'eduu.xlsx') writexl::write_xlsx(eduu, 'eduu.xlsx')
Creating advance lua array table from list view and call the array elements
Say I have a CE Lua form and some variables: form.Show() list = form.CEListView1 tab_player = {} p_name = 'Joe' p_gen = 'Male' table.insert(tab_player,{player_name = p_name, player_gen = p_gen}) -- and then add some elements from List View to same record index for idx = list.ItemIndex + 1, list.Items.Count-1 do mtrl_name = list.Items[idx].Caption mtrl_qty = list.Items[idx].SubItems[0] mtrl_unit = list.Items[idx].SubItems[1] mtrl_price = list.Items[idx].SubItems[2] mtrl_tprice = list.Items[idx].SubItems[3] table.insert(tab_player, {v_itemname = mtrl_name, v_itemqty = mtrl_qty, v_itemunit = mtrl_unit, v_itemprice = mtrl_price, v_itemttlprice = mtrl_tprice}) end -- check for index, data in ipairs(tab_player) do print(index) for key, value in pairs(data) do print('\t', key, value) end end Result, it's created 9 tab_player record indexes (depending how many items on list view). What I want is like this structure for one record index: tab_player = { player_name = p_name, player_gen = p_gen, { v_itemname = mtrl_name, v_itemqty = mtrl_qty, v_itemunit = mtrl_unit, v_itemprice = mtrl_price, v_itemttlprice = mtrl_tprice}, { v_itemname = mtrl_name, v_itemqty = mtrl_qty, v_itemunit = mtrl_unit, v_itemprice = mtrl_price, v_itemttlprice = mtrl_tprice}, { v_itemname = mtrl_name, v_itemqty = mtrl_qty, v_itemunit = mtrl_unit, v_itemprice = mtrl_price, v_itemttlprice = mtrl_tprice} -- and so on } How CE Lua script to get the structure as I want? If done, then how CE Lua script call the data from tab_player to fill player name editbox, player gen editbox and fill the items to CE List View? 
EDIT: What I want to be produce an array table with structure below: list = UDF1.CEListView1 tab_player = {} player_name = 'Joe' player_gen = 'Male' -- this is list view items contain: --- row 1, column 1 to 5 mtrl_name = list.Items[1].Caption -- Milk mtrl_qty = list.Items[1].SubItems[0] -- 300 mtrl_unit = list.Items[1].SubItems[1] -- ml mtrl_price = list.Items[1].SubItems[2] -- 3975 mtrl_tprice = list.Items[1].SubItems[3] -- 3975 --- row 2, column 1 to 5 mtrl_name = list.Items[2].Caption -- Sugar mtrl_qty = list.Items[2].SubItems[0] -- 1 mtrl_unit = list.Items[2].SubItems[1] -- Kg mtrl_price = list.Items[2].SubItems[2] -- 18000 mtrl_tprice = list.Items[2].SubItems[3] -- 18000 --- row 3, column 1 to 5 and so om the tab_player should be: tab_player = { -- index 0 or record 1 {player_name = 'Joe', player_gen = 'Male', -- row 1, column 1 to 5 { item_name = 'Milk', item_qty = 300, item_unit = 'ml', item_price = 3975, item_tprice = 3975 }, -- row 2, column 1 to 5 { item_name = 'Sugar', item_qty = 2, item_unit = 'Kg', item_price = 9000 item_tprice = 18000 }, -- row 3, column 1 to 5 { item_name = 'bla bla bla', item_qty = 1, item_unit = 'bla', item_price = 1000000 item_tprice = 1000000 } -- and so on } How to create, print multidimensional and call back the item from the array table as above?.
I need a hint to sort out this issue
I have this form: class RegisterForm(FlaskForm): """Registration form.""" username = StringField('Username', validators=[Required(), Length(1, 64)]) password = PasswordField('Password', validators=[Required(), Length(4, 4)]) password_again = PasswordField('Password again', validators=[Required(), EqualTo('password'), Length(4, 4)]) tc = SelectField(validators=[Required()], choices=[("m_", "Yes"), ("n", "No")]) submit = SubmitField('Register') And I want to add secrets.randbelow(n) next to "m_"
R GBM versus Spark GBT performance
I'm trying to compare performance between R and Spark-ML and my initial testing tells me that Spark-ML is better than R in most cases and scales much better when the dataset gets bigger. However, I'm having strange results when it comes to Gradient Boosted Trees, especially because R takes 3 minutes where Spark takes 15 on the same dataset, on the same computer. Here is the R code: train <- read.table("c:/Path/to/file.csv", header=T, sep=";",dec=".") train$X1 <- factor(train$X1) train$X2 <- factor(train$X2) train$X3 <- factor(train$X3) train$X4 <- factor(train$X4) train$X5 <- factor(train$X5) train$X6 <- factor(train$X6) train$X7 <- factor(train$X7) train$X8 <- factor(train$X8) train$X9 <- factor(train$X9) library(gbm) boost <- gbm(Freq~X1+X2+X3+X4+X5+X6+X7+X8+X9+Y1, distribution = "gaussian", data = train, n.trees = 2000, bag.fraction = 1, shrinkY1 = 1, interaction.depth = 1, n.minobsinnode = 50, train.fraction = 1.0, cv.folds = 0, keep.data = TRUE) And here is the scala code for Spark import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession import org.apache.spark.ml.regression.GBTRegressor val conf = new SparkConf() .setAppName("GBTExample") .set("spark.driver.memory", "8g") .set("spark.executor.memory", "8g") .set("spark.network.timeout", "120s") val sc = SparkContext.getOrCreate(conf.setMaster("local[8]")) val spark = new SparkSession.Builder().getOrCreate() import spark.implicits._ val sourceData = spark.read.format("com.databricks.spark.csv") .option("header", "true") .option("delimiter", ";") .option("inferSchema", "true") .load("c:/Path/to/file.csv") val data = sourceData.select($"X1", $"X2", $"X3", $"X4", $"X5", $"X6", $"X7", $"X8", $"X9", $"Y1".cast("double"), $"Freq".cast("double")) val X1Indexer = new StringIndexer().setInputCol("X1").setOutputCol("X1Idx") val X2Indexer = new 
StringIndexer().setInputCol("X2").setOutputCol("X2Idx") val X3Indexer = new StringIndexer().setInputCol("X3").setOutputCol("X3Idx") val X4Indexer = new StringIndexer().setInputCol("X4").setOutputCol("X4Idx") val X5Indexer = new StringIndexer().setInputCol("X5").setOutputCol("X5Idx") val X6Indexer = new StringIndexer().setInputCol("X6").setOutputCol("X6Idx") val X7Indexer = new StringIndexer().setInputCol("X7").setOutputCol("X7Idx") val X8Indexer = new StringIndexer().setInputCol("X8").setOutputCol("X8Idx") val X9Indexer = new StringIndexer().setInputCol("X9").setOutputCol("X9Idx") val assembler = new VectorAssembler() .setInputCols(Array("X1Idx", "X2Idx", "X3Idx", "X4Idx", "X5Idx", "X6Idx", "X7Idx", "X8Idx", "X9Idx", "Y1")) .setOutputCol("features") val dt = new GBTRegressor() .setLabelCol("Freq") .setFeaturesCol("features") .setImpurity("variance") .setMaxIter(2000) .setMinInstancesPerNode(50) .setMaxDepth(1) .setStepSize(1) .setSubsamplingRate(1) .setMaxBins(32) val pipeline = new Pipeline() .setStages(Array(X1Indexer, X2Indexer, X3Indexer, X4Indexer, X5Indexer, X6Indexer, X7Indexer, X8Indexer, X9Indexer, assembler, dt)) val model = pipeline.fit(data) I have the feeling that I'm not comparing the same methods here, but the documentation that I could find did not clarify the situation.