Table from results of sknn function (klaR package) won't output - r

I have a data set with 100 observations of 6 variables that I'm trying to run the sknn function on and then output a table of the results. I have updated the response variable to a factor to use as row and column headers in the table, and checked the data types of all other variables to make sure they are compatible (int and num). For some reason, no matter what I try, R freezes trying to pull the table of results and eventually crashes.
Here is the code I'm using:
loans_k$action_taken <- as.factor(loans_k$action_taken)
loans_k.knn <- sknn(loans_k$action_taken ~ ., data = loans_k, kn = 6)
loans_knntab <- table(loans_k$action_taken, predict(loans_k.knn)$class)
The data I'm using is:
> dput(loans_k)
structure(list(action_taken = structure(c(1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), loan_type = c(1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), loan_purpose = c(2L,
1L, 4L, 1L, 31L, 31L, 32L, 1L, 1L, 2L, 1L, 31L, 1L, 1L, 1L, 1L,
1L, 32L, 1L, 1L, 31L, 2L, 31L, 4L, 32L, 32L, 1L, 32L, 2L, 31L,
1L, 32L, 32L, 1L, 5L, 2L, 32L, 31L, 1L, 1L, 31L, 32L, 31L, 32L,
32L, 31L, 1L, 1L, 32L, 31L, 31L, 1L, 31L, 1L, 1L, 1L, 31L, 32L,
4L, 4L, 4L, 4L, 1L, 31L, 31L, 31L, 32L, 1L, 1L, 1L, 31L, 4L,
2L, 31L, 2L, 4L, 31L, 1L, 32L, 1L, 31L, 32L, 1L, 31L, 1L, 31L,
1L, 2L, 1L, 31L, 31L, 4L, 31L, 1L, 2L, 31L, 32L, 32L, 31L, 2L
), loan_to_value_ratio = c(63.2, 102.23, 55.98, 95, 75, 59.719,
24.038, 56.111, 98.02, 50, 80, 73.642, 96.5, 75, 73.723, 75,
63.141, 78.328, 66.32, 95, 61.667, 80, 81.944, 80, 58.685, 75.955,
85, 80, 50.144, 84.848, 80, 55.671, 60, 100, 64, 87.511, 48.541,
73.735, 102.16, 80, 61.62, 78.947, 44.64, 55.556, 26.846, 40.425,
102.391, 47.71, 51.333, 89.82, 78.949, 36.32, 55.441, 96.5, 95,
28.704, 74.199, 67.33, 77.84, 97.326, 26.75, 102, 80, 47.193,
76.27, 22.096, 72.759, 75, 97, 95, 67.942, 35.86, 84.04, 80,
92.25, 83.06, 85, 53.58, 75, 80, 54.415, 64.947, 100, 82, 80,
91.34, 95, 82.15, 100.514, 57.896, 71.42, 100, 59.905, 93.103,
79.91, 79.72, 60, 43.9, 79.772, 73.58), loan_term = c(60L, 360L,
240L, 360L, 348L, 360L, 180L, 360L, 360L, 60L, 360L, 312L, 360L,
360L, 360L, 360L, 240L, 360L, 180L, 360L, 360L, 360L, 360L, 240L,
180L, 360L, 360L, 360L, 120L, 360L, 360L, 360L, 360L, 360L, 240L,
240L, 360L, 360L, 360L, 360L, 240L, 360L, 60L, 180L, 120L, 180L,
360L, 360L, 120L, 360L, 360L, 360L, 360L, 360L, 360L, 180L, 360L,
360L, 360L, 60L, 360L, 360L, 360L, 180L, 360L, 120L, 360L, 360L,
360L, 360L, 180L, 120L, 360L, 360L, 60L, 240L, 360L, 240L, 360L,
360L, 360L, 360L, 360L, 360L, 360L, 360L, 360L, 120L, 360L, 360L,
360L, 360L, 360L, 360L, 240L, 360L, 240L, 240L, 360L, 361L),
debt_to_income_ratio = c(30L, 50L, 36L, 38L, 38L, 48L, 20L,
49L, 36L, 20L, 39L, 44L, 42L, 30L, 20L, 30L, 44L, 50L, 44L,
36L, 30L, 30L, 39L, 38L, 46L, 36L, 30L, 36L, 30L, 36L, 36L,
49L, 20L, 36L, 36L, 60L, 41L, 30L, 36L, 41L, 30L, 41L, 20L,
38L, 60L, 20L, 44L, 42L, 36L, 20L, 41L, 30L, 30L, 45L, 36L,
42L, 42L, 39L, 36L, 50L, 36L, 46L, 36L, 36L, 50L, 30L, 43L,
36L, 37L, 50L, 30L, 20L, 30L, 42L, 36L, 60L, 36L, 30L, 36L,
20L, 49L, 30L, 40L, 30L, 47L, 49L, 43L, 30L, 30L, 30L, 36L,
41L, 36L, 37L, 30L, 20L, 42L, 38L, 30L, 36L)), row.names = c(NA,
100L), class = "data.frame")
How can I get R to output the desired table?

Related

Gtsummary output with mgcv gam

I have the following data set:
structure(list(Age = c(83L, 26L, 26L, 20L, 20L, 77L, 32L, 21L,
15L, 75L, 27L, 81L, 81L, 15L, 24L, 16L, 35L, 27L, 30L, 31L, 24L,
24L, 31L, 79L, 30L, 19L, 20L, 42L, 62L, 83L, 79L, 18L, 26L, 66L,
23L, 83L, 77L, 80L, 57L, 42L, 32L, 76L, 85L, 29L, 65L, 79L, 9L,
34L, 20L, 16L, 34L, 22L, 19L, 23L, 25L, 14L, 53L, 28L, 79L, 22L,
22L, 21L, 82L, 81L, 16L, 19L, 77L, 15L, 18L, 15L, 78L, 24L, 16L,
14L, 29L, 18L, 50L, 17L, 43L, 8L, 14L, 85L, 31L, 20L, 30L, 23L,
78L, 29L, 6L, 61L, 14L, 22L, 10L, 83L, 15L, 13L, 15L, 15L, 29L,
8L, 9L, 15L, 8L, 9L, 15L, 9L, 34L, 8L, 9L, 9L, 16L, 8L, 25L,
21L, 23L, 13L, 56L, 10L, 7L, 27L, 8L, 8L, 8L, 8L, 80L, 80L, 6L,
15L, 42L, 25L, 23L, 21L, 8L, 11L, 43L, 69L, 34L, 34L, 14L, 12L,
10L, 22L, 78L, 16L, 76L, 12L, 10L, 16L, 6L, 13L, 66L, 11L, 26L,
12L, 16L, 13L, 24L, 76L, 10L, 65L, 20L, 13L, 25L, 14L, 12L, 15L,
43L, 51L, 27L, 15L, 24L, 34L, 63L, 17L, 15L, 9L, 12L, 17L, 82L,
75L, 24L, 44L, 69L, 11L, 10L, 12L, 10L, 10L, 70L, 54L, 45L, 42L,
84L, 54L, 23L, 23L, 14L, 81L, 17L, 42L, 44L, 16L, 15L, 43L, 45L,
50L, 53L, 23L, 53L, 49L, 13L, 69L, 14L, 65L, 14L, 13L, 22L, 67L,
59L, 52L, 54L, 44L, 78L, 62L, 69L, 10L, 63L, 57L, 22L, 12L, 62L,
9L, 82L, 53L, 54L, 66L, 49L, 63L, 51L, 9L, 45L, 49L, 77L, 49L,
61L, 62L, 57L, 67L, 16L, 65L, 75L, 45L, 16L, 55L, 17L, 64L, 67L,
56L, 52L, 63L, 10L, 62L, 14L, 66L, 68L, 15L, 13L, 43L, 47L, 55L,
69L, 21L, 67L, 34L, 52L, 15L, 31L, 64L, 55L, 13L, 48L, 71L, 64L,
13L, 25L, 34L, 50L, 61L, 70L, 33L, 57L, 51L, 46L, 57L, 69L, 46L,
8L, 11L, 46L, 71L, 33L, 38L, 56L, 17L, 29L, 28L, 6L, 8L), Sex = structure(c(1L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L,
2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L,
2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L,
2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L,
2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L,
2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L,
1L, 2L, 2L), .Label = c("Male", "Female"), class = "factor"),
mean_AD_scaled = c(3.15891332561581, -0.0551328105526693,
0.582747640515478, 1.94179165777054, 1.7064645993306, 2.37250948563045,
1.015775832203, 1.36189033704266, -1.05640048650493, 0.184814975542474,
-0.143366705302007, 1.81560178585347, 2.06325078470728, -0.473088628698217,
0.414641167726219, 0.199887349084444, -0.60620959209809,
-0.17879228399189, -1.03483709078065, -1.43497010225613,
-0.958595084469815, 1.0203965598582, -1.44731404613503, -1.17191867788498,
-2.02547709312595, -1.22395687266857, -1.09952727795348,
-1.0830246791849, 1.21072653232248, 1.69997357714829, 1.53648783201423,
0.208688735094353, 0.0862394522314924, 1.08662698958276,
-0.731299290763917, 2.29307697689102, -0.660008064083659,
-1.21425334459264, 1.10191939777498, -2.0957781638801, -1.14947514355972,
0.248845058764562, 2.6526135953958, 0.197907037232212, -0.222469162066061,
1.92880961340592, 1.23328008397287, -1.17288683034607, -0.308282675662673,
-1.02603570477074, -1.32647101621898, -1.58316343919798,
-0.0440210607151585, -0.388375288352846, -0.935491446193807,
-0.63789458173376, 0.454577456746182, -1.77391147749773,
0.709267564407921, 0.125735671950958, -0.821073428064989,
-0.126534054558056, 0.519597695894384, 0.188005477971066,
0.212319306823438, -1.45807374053215, 1.5856655763446, -1.25641198358011,
-0.910847565366061, -1.1191763722206, 0.25300371365424, -0.750772357310844,
0.37932560636146, -0.871791414947088, -1.92771569802088,
-1.1752191976387, 0.210449012296334, -0.347778895382139,
-0.132254955464496, 0.953616043508016, -0.0862677135627232,
0.838977990728951, -1.8993092246739, -0.0254281327692267,
0.298022803094927, -1.21559555595915, 0.0134079829994995,
-0.763094297724715, 0.334768589686298, -1.12568939786794,
-2.11786964276497, -0.0434709740895377, 0.388237009696492,
1.30050066962355, -0.260645173884043, -0.60620959209809,
1.05945271027717, -0.275717547426008, -0.0238878902174922,
0.496604074943496, 0.534009965485611, -0.692903244295693,
-0.566933407028871, 0.125625654625835, -0.518305749324122,
1.79381835547894, -0.790708646330802, -0.227860010997131,
0.347420582075538, 0.784189362817269, -0.660118081408782,
1.29962053102256, -0.561652575422924, -0.710395998990384,
-1.29315777017148, -0.457356151205503, -1.01756437073621,
0.146528946399368, -1.07136284272178, -1.42968927065019,
0.798601632408495, -0.799730066990963, -0.431348055546223,
0.569545561500617, 2.32168148142323, 0.472070211440872, 1.65145593676866,
-0.814142336582189, -0.544489872703603, -0.315433801795725,
0.382626126115175, -0.623812364117908, 0.216279930527897,
-0.606099574772967, -0.367207954999011, 0.719829227619811,
-0.749122097433987, 0.934693063586709, -0.79026857703031,
-0.371872689584264, 0.0769979969210905, -0.793899148759394,
1.50414273842782, 0.730280873506577, -0.290569886317732,
0.303743704001367, 0.390877425499463, -1.00359217044547,
-0.534918365417827, 0.325967203676389, 0.129036191704673,
0.34434009697207, -0.141386393449775, -0.363401355549725,
-0.395416397160769, -0.0235578382421178, -1.13583299524436,
1.16781977552417, -1.31890182425046, 0.139377820266317, 0.0160483988024708,
0.481311666751279, -1.05475022662807, 0.839858129329941,
0.652498624644007, -0.350199276534864, -0.262075399110649,
0.178543988010412, -1.13198238886502, -0.05117218684821,
-1.29678834190056, 0.429603523943066, 1.05098137624263, -0.956504755292464,
0.502765045150433, -0.81678275238516, -1.50263075720731,
-0.826684311646306, 2.40100397283753, 2.06633126981075, -0.470558230220369,
0.484942238480364, 0.822035322659877, 0.143888530596397,
0.384056351341786, -0.63580425255641, 0.358422314587926,
-0.372422776209885, 0.0607154328027556, -0.113221958218067,
1.02710761669075, -0.349649189909243, 2.27195365046724, -0.507634068787109,
-0.326105482332738, -1.0396778530861, 1.06484355920824, 1.32151397872221,
-0.185173288849074, -0.651888785489516, -0.171311105883464,
-0.104200537557911, -0.693673365571561, -1.26609350819101,
0.411230630647381, -0.929770545287362, -0.481009876107135,
0.386146680519137, 0.0482834750637615, -0.198265350538812,
0.790020281048832, 0.926001694901924, -1.08918564939184,
0.50298507980068, -0.0694350628187722, 1.04966116834114,
0.00878725534429612, 1.48742010500899, 0.750194009353997,
0.423772605711498, -0.596418050162068, -0.652636903300361,
-0.308942779613417, 0.314437388003408, 0.679562886624478,
-1.24312189070515, -0.432712270377761, 0.00427654501421597,
-0.197935298563442, 0.228821905592019, 1.06957430418856,
-1.61612462980509, 1.9499329398297, -0.263285589687014, 0.156430505660519,
-0.322254875953402, -0.451085163673446, -0.35526007349056,
0.10780284795577, 0.408700232169533, -0.957604928543701,
-1.05662052115517, 1.00345389178912, -0.238751726184391,
0.300003114947154, -0.397946795638617, -0.0802167606809086,
0.943714484246865, 1.10973062785877, 1.76279346979401, 1.62087112038423,
0.25533608094687, 0.226841593739787, 0.869672824438507, -1.44960240649761,
-0.450315042397579, -0.199629565370345, 0.29813282042005,
0.760425620590513, 1.87391096816911, -0.454275666102039,
-0.0559029318285365, -0.343048150401812, -1.01371376435687,
0.68880434193488, -0.29222014619459, 1.16132875334186, -1.95715633422403,
-0.534368278792206, -0.560112332871189, 1.84508642898666,
-1.19150176175703, -0.772203732244971, -0.3443683583033,
-1.45684154649076, -0.633823940704178, -1.77454957798344,
0.279539892474118, -0.875532004001301, 1.26001429397797,
-0.536590628759707, 2.1869102581465, 0.211109116247078, 0.130246382281038,
-0.355810160116181, -0.898085555651692, -0.429741802599415,
1.13360438741065, 1.61338994227581, 0.588688576072169, 0.454137387445685,
0.747113524250528, 0.460848444278238, -0.38177424884541,
-0.169990897981981, -0.747361820232001, -0.760123829946369,
0.208028631143609, -1.28748087619509, 2.33950428809329, -0.973029357526068,
-1.06091119683501, 0.917530360867389, -0.35041931118511,
-1.90613029883158, -1.15057531681095, 0.65348878057012, 0.43147381847017
)), row.names = c(NA, -308L), class = c("tbl_df", "tbl",
"data.frame"))
I am using this gam model:
m1 <- gam(mean_AD_scaled ~ s(Age, bs = 'ad', k = -1) + Sex + ti(Age, by = Sex, bs ='fs'),
data = DF,
method = 'REML',
family = gaussian)
Output:
Family: gaussian
Link function: identity
Formula:
mean_AD_scaled ~ s(Age, bs = "ad", k = -1) + Sex + ti(Age,
by = Sex, bs = "fs")
Parametric coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.04691 0.06976 0.672 0.502
SexFemale -0.12950 0.09428 -1.374 0.171
Approximate significance of smooth terms:
edf Ref.df F p-value
s(Age) 2.980 3.959 8.72 2.24e-06 ***
ti(Age):SexMale 2.391 2.873 23.47 < 2e-16 ***
ti(Age):SexFemale 1.000 1.000 43.40 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Rank: 48/49
R-sq.(adj) = 0.34 Deviance explained = 35.6%
-REML = 375.4 Scale est. = 0.63867 n = 308
But when I use gtsummary, I get a repeated value for each gender 'interaction':
tbl_regression(m1, tidy_fun = tidy_gam)
I see the following in a publication, which I am trying to replicate with gender and age:
I am not sure how to fix this. My goal is to print a table for a manuscript so any other gam-related information that can be added like edf and R^2.
I think you've found a bug in the handling of these types of interactions. While we work on a fix to the bug, this code should get you what you need. Thanks
library(gtsummary)
#> #BlackLivesMatter
library(mgcv)
packageVersion("gtsummary")
#> [1] ‘1.5.2’
m1 <- gam(marker ~ s(age, bs = 'ad', k = -1) + grade + ti(age, by = grade, bs ='fs'),
data = gtsummary::trial,
method = 'REML',
family = gaussian)
tbl_regression(m1, tidy_fun = gtsummary::tidy_gam) %>%
modify_table_body(
~ .x %>%
dplyr::select(-n_obs) %>%
dplyr::distinct()
) %>%
as_kable() # convert to kable to display on SO
Characteristic
Beta
95% CI
p-value
Grade
I
—
—
II
-0.39
-0.70, -0.08
0.014
III
-0.13
-0.43, 0.18
0.4
s(age)
>0.9
ti(age):gradeI
0.6
ti(age):gradeII
>0.9
ti(age):gradeIII
0.6
Created on 2022-02-21 by the reprex package (v2.0.1)

Why is prediction error discrete in adabag?

I've got the table of 55 observations with 5 variables (F,H,R,T,U) and 1 classifier variable ("Group") in which I have two groups.
I'm doing data sampling by splitting the data into the training set (70%) and test set (30%). Then I run adaboosting and check how it works.
I want to get the adaboost error distribution for 100 samplings. But the distribution occurs to be discrete, outputting only five value variants: 0, 0.0588235294117647, 0.117647058823529 0.176470588235294 and 0.235294117647059.It doesn't change with mfinal argument. I guess there should be more! How it works?
I use the folowing code:
predictions<-list()
for (i in 1:100){
train.ind<-sample(nrow(df), nrow(df) * 0.7)
assign(paste0("ada",i), do.call(boosting,
c(formula=Group~F + H + R + T + U,
data=substitute(df[train.ind,]), mfinal=50, boos=FALSE,
coeflearn='Breiman'),envir = parent.frame()))
assign(paste0("pred",i), predict(ada,df[-train.ind,]))
predictions[[i]]<-get(paste0("pred",i))$error
}
hist(100*unlist(predictions),breaks=10,
main="Error probability [%] ntrees=10. 100 sampling operations", xlab="AdaBoost error")
dput(df)
structure(list(Group = structure(c(2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("Canines", "Sled"), class = "factor"), F = c(0.263150566678734,
0.260347316635598, 0.26437277258488, 0.265710057607949, 0.254866055219663,
0.263294264681227, 0.261901194801303, 0.257318268395066, 0.26420207103455,
0.252093225560912, 0.255473253732324, 0.259067858940115, 0.259528043446917,
0.267331491048901, 0.260246447333382, 0.26035486437815, 0.254553215708594,
0.274074579975413, 0.262896904742862, 0.260504330262876, 0.258329960879536,
0.262664861154909, 0.256148832094211, 0.258509128895957, 0.256292083925698,
0.262358651734143, 0.254578103664353, 0.255386025800537, 0.264120912009577,
0.275232714712253, 0.265375720277527, 0.267601768121804, 0.262932226832642,
0.263633189245163, 0.262826186070212, 0.261058637786334, 0.262979366135887,
0.259232168979912, 0.252933156025384, 0.263963451214447, 0.258511197058683,
0.261957295373665, 0.253412282699461, 0.260748166588172, 0.263136039863289,
0.255317062006506, 0.258822015633545, 0.252757763183064, 0.260840486010478,
0.258620689655172, 0.263738813871524, 0.26241134751773, 0.26405425581719,
0.263685152057245, 0.262062787572784), H = c(0.242711147002311,
0.243850477245014, 0.245132979060713, 0.241794831140003, 0.235370262206577,
0.241392449436832, 0.236787894677703, 0.240434935369935, 0.234076675284456,
0.236978505926275, 0.23489414817613, 0.236461115627298, 0.241377100655228,
0.240778565421122, 0.238954656595734, 0.237237027626932, 0.23562891291975,
0.228247507171151, 0.235543469567304, 0.238348073568565, 0.237639956832591,
0.237993655975811, 0.23053394888479, 0.237553985998722, 0.238716430501961,
0.241044553515742, 0.23579805839771, 0.244646715997643, 0.245211405561299,
0.248463204730402, 0.237910443860818, 0.23772859908127, 0.242517289073306,
0.230376515634971, 0.239386381312522, 0.242971498213445, 0.248246377553633,
0.245227816034538, 0.237968589560153, 0.235998092571798, 0.235639593181493,
0.240320284697509, 0.239383587641388, 0.237939850635807, 0.240409493084614,
0.239705089012767, 0.235291279312896, 0.237725562711216, 0.251017166425148,
0.244410329082034, 0.247581475626206, 0.244082639531298, 0.248022977743474,
0.246127343801762, 0.246345535241663), R = c(0.23238005068085,
0.233913128793082, 0.232906768805408, 0.234580624702711, 0.23729616240706,
0.232552468336102, 0.23566425708828, 0.233370934038501, 0.23413197660754,
0.241255572873247, 0.240609653949119, 0.233790113420818, 0.239086204963073,
0.233644719452121, 0.23849468613068, 0.236846146329206, 0.239755264655663,
0.225925420024587, 0.239355887920232, 0.237429996633718, 0.23819641170916,
0.232039177131833, 0.223832380603256, 0.235838907338977, 0.236669843303285,
0.234916072348618, 0.238304558463179, 0.235904655883701, 0.232124394623714,
0.222879222527955, 0.233232723139038, 0.233871666714818, 0.235947441217151,
0.242585880964708, 0.234693056561268, 0.233941777691605, 0.229366135886539,
0.23539800906269, 0.239803390172875, 0.236505714593364, 0.24647853698133,
0.235569395017794, 0.242526379716086, 0.236207360559779, 0.234180854122081,
0.240408036487878, 0.239601762794737, 0.245058343429191, 0.234449894103222,
0.237875925051173, 0.230698942666106, 0.233475177304965, 0.231384358432554,
0.233114688928642, 0.230655428424067), T = c(0.261758235638105,
0.261889077326307, 0.257587479549, 0.257914486549337, 0.272467520166701,
0.262760817545838, 0.265646653432713, 0.268875862196498, 0.267589277073454,
0.269672695639567, 0.269022944142428, 0.270680912011768, 0.260008650934782,
0.258245224077857, 0.262304209940204, 0.265561961665713, 0.270062606715993,
0.271752492828849, 0.262203737769602, 0.263717599534841, 0.265833670578713,
0.267302305737446, 0.289484838417743, 0.268097977766344, 0.268321642269056,
0.261680722401497, 0.271319279474757, 0.264062602318119, 0.258543287805409,
0.253424858029389, 0.263481112722616, 0.260797966082108, 0.258603042876902,
0.263404414155158, 0.263094376055998, 0.262028086308617, 0.259408120423941,
0.26014200592286, 0.269294864241588, 0.263532741620391, 0.259370672778494,
0.262153024911032, 0.264677749943065, 0.265104622216242, 0.262273612930016,
0.264569812492848, 0.266284942258822, 0.264458330676529, 0.253692453461153,
0.25909305621162, 0.257980767836164, 0.260030835646007, 0.256538408006782,
0.25707281521235, 0.260936248761486), U = c(0.276642254462421,
0.275750907536407, 0.274138521440258, 0.279385339041277, 0.283770344294126,
0.273124933319108, 0.276770665567999, 0.272796198013943, 0.273326789343435,
0.278824893979485, 0.282917535762971, 0.269035729493284, 0.276381346021371,
0.275681845488406, 0.280473043309851, 0.274957072857482, 0.279453614114969,
0.265400901516186, 0.284438401450319, 0.275270067631668, 0.277080803992985,
0.268341093323935, 0.26334299428362, 0.27494270078114, 0.277070411973316,
0.276364671746617, 0.277622940087166, 0.275489489882784, 0.275412200032649,
0.267636555236813, 0.275475938484053, 0.27914367434201, 0.281161825726141,
0.287341513046201, 0.274277898463271, 0.272041104617345, 0.268317034458041,
0.277054269097656, 0.276448903327891, 0.282483963758864, 0.288513266166897,
0.280409252669039, 0.283610415243301, 0.27874587902846, 0.274619094771137,
0.275604453090517, 0.286100299160421, 0.288513039597016, 0.270078586556683,
0.280480764184118, 0.274123602187187, 0.277940178846747, 0.273784368554907,
0.282369310276287, 0.277372857201026)), na.action = structure(c(`2` = 2L,
`4` = 4L, `19` = 18L, `24` = 20L, `28` = 24L, `29` = 25L, `30` = 26L,
`32` = 28L, `33` = 29L, `42` = 38L, `54` = 46L, `69` = 54L, `74` = 58L,
`77` = 59L, `79` = 60L, `80` = 61L, `83` = 62L), class = "omit"), row.names = c(5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 20L,
25L, 26L, 27L, 31L, 41L, 44L, 46L, 47L, 48L, 50L, 51L, 52L, 55L,
57L, 64L, 65L, 66L, 67L, 68L, 70L, 71L, 72L, 85L, 86L, 87L, 88L,
89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 100L,
101L, 102L, 103L), class = "data.frame")

Nonlinear model convergence

I have a time series data set and each time series has datapoint of 30-year from different/same species. I am developing a forecasting model using the first 23 years of data from each time series data point and I am using the rest 7 years as test set to know the predictive ability of model but the nonlinear model (Model 6 and Model 7) don't give succinct result?
Data:
DD <- structure(list(Plot = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("A",
"B", "C", "D"), class = "factor"), Species = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("BD", "BG"), class = "factor"), Year = c(37L,
38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L,
51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L,
64L, 65L, 66L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L,
47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L,
60L, 61L, 62L, 63L, 64L, 65L, 66L, 37L, 38L, 39L, 40L, 41L, 42L,
43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L,
56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 37L, 38L,
39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L,
52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L,
65L, 66L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L,
48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 60L,
61L, 62L, 63L, 64L, 65L, 66L, 37L, 38L, 39L, 40L, 41L, 42L, 43L,
44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L,
57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 37L, 38L, 39L,
40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L,
53L, 54L, 55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L, 65L,
66L), Count = c(81L, 45L, 96L, 44L, 24L, 8L, 28L, 32L, 39L, 29L,
40L, 17L, 4L, 12L, 18L, 11L, 63L, 98L, 78L, 76L, 67L, 36L, 56L,
43L, 81L, 8L, 14L, 20L, 25L, 19L, 135L, 91L, 171L, 88L, 59L,
1L, 1L, 1L, 2L, 1L, 11L, 9L, 34L, 15L, 32L, 21L, 33L, 43L, 39L,
20L, 6L, 3L, 9L, 9L, 28L, 16L, 15L, 2L, 1L, 1L, 34L, 16L, 19L,
35L, 32L, 7L, 2L, 30L, 29L, 25L, 28L, 11L, 31L, 31L, 28L, 27L,
34L, 110L, 87L, 103L, 72L, 19L, 46L, 43L, 107L, 32L, 26L, 31L,
12L, 29L, 23L, 40L, 50L, 23L, 34L, 11L, 9L, 4L, 24L, 55L, 14L,
16L, 51L, 43L, 2L, 13L, 8L, 96L, 52L, 118L, 32L, 1L, 8L, 17L,
34L, 29L, 38L, 15L, 4L, 38L, 2L, 1L, 1L, 1L, 1L, 1L, 3L, 3L,
4L, 6L, 4L, 4L, 10L, 6L, 7L, 9L, 15L, 30L, 25L, 36L, 13L, 17L,
43L, 36L, 60L, 50L, 26L, 13L, 13L, 27L, 18L, 56L, 96L, 16L, 54L,
2L, 2L, 9L, 5L, 5L, 6L, 2L, 6L, 2L, 3L, 4L, 3L, 136L, 71L, 116L,
28L, 23L, 76L, 64L, 98L, 58L, 26L, 13L, 13L, 13L, 18L, 19L, 24L,
18L, 17L, 3L, 23L, 19L, 9L, 11L, 13L, 20L, 29L, 29L, 17L, 20L,
26L, 71L, 63L, 53L, 54L, 20L, 22L, 18L, 93L, 50L, 18L, 12L, 12L,
31L), LogCount = c(1.908385019, 1.653212514, 1.982271233, 1.643462676,
1.380211242, 0.903089987, 1.447158031, 1.505109978, 1.591064607,
1.462397998, 1.602059991, 1.230448921, 0.602059991, 1.079181206,
1.255272505, 1.041392685, 1.799340549, 1.991226076, 1.892094603,
1.880813592, 1.826074803, 1.556302501, 1.748188027, 1.633468456,
1.908485019, 0.903089987, 1.146128035, 1.301029996, 1.397940009,
1.278753601, 2.130333768, 1.95904139, 2.2329961, 1.94448267,
1.770852012, 0, 0, 0, 0.30102999, 0, 1.0411392685, 0.954242509,
1.531478917, 1.176031259, 1.505149978, 1.322219295, 1.51851394,
1.6334684456, 1.591064607, 1.301029996, 0.77815125, 0.477121255,
0.954242509, 0.954242509, 1.447158031, 1.204119983, 1.176091259,
0.301029996, 0, 0, 1.531478917, 1.204119983, 1.278753501, 1.544068044,
1.505149978, 0.084509804, 0.301029996, 1.477121255, 1.462397998,
1.397940009, 1.447158031, 1.041392685, 1.491361694, 1.491361694,
1.447158031, 1.431363754, 1.531478917, 2.041392685, 1.939519253,
2.012837225, 1.857332495, 1.278753601, 1.662757382, 1.633468456,
2.029383778, 1.505149978, 1.414973348, 1.491361594, 1.079181245,
1.462397998, 1.361727835, 1.602059991, 1.698970004, 1.361727836,
1.531478917, 1.041392685, 0.954242509, 0.602059991, 1.380211242,
1.740362689, 1.146128036, 1.204119983, 1.707570176, 1.633468456,
0.301029996, 1.113943352, 0.903089987, 1.982271233, 1.716003344,
2.071882007, 1.50514997, 0, 0.903089987, 1.230448921, 1.53147891,
1.2397998, 1.57978359, 1.176091259, 0.602059991, 1.57978359,
0.301029996, 0, 0, 0, 0, 0, 0.477121255, 0.477121255, 0.602059991,
0.77815125, 0.602059991, 0.602059991, 1, 0.77815125, 0.84509804,
0.95424509, 1.176091259, 1.477121255, 1.39790009, 1.555302501,
1.113943352, 1.230448921, 1.633468456, 1.555302501, 1.77815125,
1.698970004, 1.414973348, 1.113943352, 1.113943352, 1.431353754,
1.255272505, 1.748188027, 1.982271233, 1.204119983, 1.73239376,
1.431363754, 1.361727835, 0.954242509, 0.698970004, 0.698970004,
0.77815125, 0.301029996, 0.77815125, 0.301029996, 0.477121255,
0.602059991, 0.477121255, 2.133538908, 1.851258349, 2.064457989,
1.447158031, 1.361727836, 1.880813592, 1.806179974, 1.991226076,
1.763427994, 1.414973348, 1.113943352, 1.113943352, 1.113943352,
1.255272505, 1.278753601, 1.380211242, 1.255272505, 1.230446921,
0.477121255, 1.361727835, 1.278753601, 0.954242509, 1.0411392685,
1.113943352, 1.301029996, 1.462397998, 1.462397998, 1.230448921,
1.301029995, 1.414973348, 1.851258349, 1.799340549, 1.72427587,
1.73239376, 1.301029996, 1.342422681, 1.255272505, 1.968482949,
1.698970004, 1.255272505, 1.079181246, 1.079181246, 1.491361694
), Diff = c(-0.255272505, 0.329058719, -0.338818557, -0.263241434,
-0.077121255, 0.544068044, 0.057991947, 0.085910629, -0.128666609,
0.139661993, -0.37161107, -0.62838893, 0.477121255, 0.176091259,
-0.21387982, 0.757947864, 0.191885527, -0.099131473, -0.011281011,
-0.054738789, -0.269772302, 0.191885526, -0.114719571, 0.275016563,
-1.005395032, 0.243038049, 0.15490196, 0.096910013, -0.119186408,
NA, -0.171292376, 0.273954718, -0.288513438, -0.17363066, -1.770852012,
0, 0, 0.301029996, -0.301029996, 1.041392685, -0.087150176, 0.577235408,
-0.355387658, 0.329058719, -0.182930683, 0.196294545, 0.110954516,
-0.042403849, -0.290034611, -0.522878746, -0.301029995, 0.477121254,
0, 0.492915522, -0.243038048, -0.028028724, -0.875061263, -0.301029996,
0, 1.531078917, -0.32735893, 0.070633618, 0.265310043, -0.038918066,
-0.660051938, -0.544068044, 1.176091259, -0.014723257, -0.064457989,
0.049218022, -0.405765346, 0.449969009, 0, -0.044203663, -0.015794267,
0.100115153, 0.509913768, -0.101873432, 0.073317972, -0.155504729,
-0.578578895, 0.384054231, -0.029289376, 0.395915322, -0.5202338,
-0.09017663, 0.076388346, -0.412180448, 0.383216752, -0.100670162,
0.240332155, 0.096910013, -0.337242168, 0.169751081, -0.490086232,
-0.087150176, -0.352182518, 0.778151251, 0.360151447, -0.594234653,
0.057991947, 0.503450193, -0.07410172, -1.33243846, 0.812913356,
-0.210853365, 1.079181246, -0.266267889, 0.355878663, -0.566732029,
-1.505149978, 0.903089987, 0.327358934, 0.301029996, -0.069080919,
0.117385599, -0.403692338, -0.574031268, 0.977723606, -1.278753601,
-0.301029996, 0, 0, 0, 0, 0.477121255, 0, 0.124938736, 0.176091259,
-0.176091259, 0, 0.397490009, -0.2218485, 0.06690679, 0.10914469,
0.22184875, 0.301029996, -0.079181206, 0.158362092, -0.442359149,
0.116505569, 0.403019535, -0.077165955, 0.221848749, -0.079181206,
-0.283996656, -0.301029996, 0, 0.317420412, -0.176091259, 0.492915522,
0.23483206, -0.77815125, 0.528273777, -0.301029996, -0.069635928,
-0.407485327, -0.255272505, 0, 0.079181246, -0.477121254, 0.477121254,
-0.477121254, 0.176091259, 0.124938736, -0.124938736, 1.656417653,
-0.282280559, 0.21319964, -0.617299958, -0.085430195, 0.5191085756,
-0.074533518, 0.185045102, -0.227798082, -0.348454546, -0.301029996,
0, 0, 0.141329153, 0.023481096, 0.101457641, -0.124938737, -0.024823584,
-0.753327666, 0.884606581, -0.082974235, -0.324511092, 0.087150176,
0.072550667, 0.187086644, 0.161368002, 0, -0.231949077, 0.070581075,
0.113903352, 0.436285001, -0.00519178, -0.075054679, 0.00811789,
-0.431363764, 0.041392685, -0.087150176, 0.713210444, -0.269512945,
-0.443697499, -0.176091259, 0, 0.412180448, -0.148939013)), class = "data.frame", row.names = c(NA,
-210L))
Code:
for(f in 1:11){
for(b in 1:5){
for (c in 1:5){
#To select test sets 1,2,3,4, and 5 years beyond the training set:
#Calculate the mean of abundance for the training set years.
Model1<-lm(mean~1, data=DD1)
#
Output2:
2 3 0.676209994477288 1.9365051784348e-09 4.44089209850063e-16
3 53 11.9236453578109 2.06371097988267e-09 1.13686837721616e-13
4 31 1.94583877614293 1.11022302462516e-15 1.99840144432528e-15
5 4 8.06660449042397 1.48071350736245e-08 3.19744231092045e-14
6 5 10.5321102149558 9.31706267692789e-10 1.4210854715202e-14
..
First, please see the time series graph of counts for different species and plots below.
library(ggplot2)
ggplot(DD, aes(Year, Count)) +
geom_point() +
geom_line() +
facet_grid(Plot ~ Species) +
scale_y_log10()
It is seen that there is no obvious trend which can be fitted by power or log-power function using nls.
Second, as I understand you are trying to use nls to predict outside the training data set. Usually it is not quite an effective to use least square models because of auto-correlated nature of time-series.
Third, the simplest prediction algorithm is Holt-Winters (see "dirty" implementation below). You can use as well a ton of other algorithms like ARIMA, exponential smoothing state space etc.
x <- ts(DD[DD$Species == "BG" & DD$Plot == "elq1a3", ]$LogCount)
m <- HoltWinters(x, gamma = FALSE)
library(forecast)
f <- forecast(m, 2)
plot(f, main = "elq1a3 at BG")
Fourth, about your algorithm in question, it throws
Error in qr.solve(QR.B, cc) : singular matrix 'a' in solve.
The reason is that in the first step of for-loop (at f = b = c = 1 DD2 data frame contains just one row. And executing
Model6<-nls(Diff~1+Count^T,start=list(T=1),trace=TRUE,algorithm ="plinear",data=DD2)
means that you are trying to fit a curve using only one data point, which is impossible.
However if you change f value in for-loop from 1:11 to 2:11 another error thrown:
Error in nls(Diff ~ 1 + Count^T, start = list(T = 1), trace = TRUE,
algorithm = "plinear", : step factor 0.000488281 reduced below
minFactor 0.000976562.
In this case you cannot use "naive" approach used by plinear algorithm with self-starting inital value and, e.g. nls.control(min.factor = 1e-5). You must feed all initial coefficients explicitely with default Gauss-Newton algorithm. Quite exausting, please try yourself :)

dplyr n_distinct() in filter takes forever where as base length(unique()) works like charm

I have a data frame such as this:
structure(list(x = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L,
6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L, 12L,
13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L, 17L, 17L, 18L, 18L, 19L,
19L, 20L, 20L, 21L, 21L, 22L, 22L, 23L, 23L, 24L, 24L, 25L, 25L,
26L, 26L, 27L, 27L, 28L, 28L, 29L, 29L, 30L, 30L, 31L, 31L, 32L,
32L, 33L, 33L, 34L, 34L, 35L, 35L, 36L, 36L, 37L, 37L, 38L, 38L,
39L, 39L, 40L, 40L, 41L, 41L, 42L, 42L, 43L, 43L, 44L, 44L, 45L,
45L, 46L, 46L, 47L, 47L, 48L, 48L, 49L, 49L, 50L, 50L, 51L, 51L,
52L, 52L, 53L, 53L, 54L, 54L, 55L, 55L, 56L, 56L, 57L, 57L, 58L,
58L, 59L, 59L, 60L, 60L, 61L, 61L, 62L, 62L, 63L, 63L, 64L, 64L,
65L, 65L, 66L, 66L, 67L, 67L, 68L, 68L, 69L, 69L, 70L, 70L, 71L,
71L, 72L, 72L, 73L, 73L, 74L, 74L, 75L, 75L, 76L, 76L, 77L, 77L,
78L, 78L, 79L, 79L, 80L, 80L, 81L, 81L, 82L, 82L, 83L, 83L, 84L,
84L, 85L, 85L, 86L, 86L, 87L, 87L, 88L, 88L, 89L, 89L, 90L, 90L,
91L, 91L, 92L, 92L, 93L, 93L, 94L, 94L, 95L, 95L, 96L, 96L, 97L,
97L, 98L, 98L, 99L, 99L, 100L, 100L), y = structure(c(1L, 2L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L,
2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 2L), .Label = c("one", "two"), class = "factor")), class = "data.frame", row.names = c(NA,
-200L), .Names = c("x", "y"))
I am trying to filter groups of x that have two distinct y values using:
library(dplyr)
df %>% group_by(x) %>% filter(n_distinct(y) > 1)
On a large data set, this almost never finishes.
Changing to this works reasonably fast for the full data set:
library(dplyr)
df %>% group_by(x) %>% filter(length(unique(y)) > 1)
Any idea why n_distinct() is super slow to never finishing?

Select top row melted data after sorting by value

I have the following melted data:
dat.melt <- structure(list(CellTypes = structure(c(62L, 35L, 73L, 45L, 14L,
22L, 46L, 13L, 68L, 21L, 1L, 10L, 64L, 24L, 72L, 58L, 51L, 9L,
60L, 37L, 34L, 49L, 33L, 2L, 50L, 32L, 11L, 52L, 44L, 66L, 8L,
5L, 47L, 59L, 53L, 7L, 6L, 77L, 75L, 17L, 27L, 61L, 20L, 18L,
19L, 16L, 54L, 15L, 41L, 3L, 63L, 48L, 57L, 43L, 70L, 40L, 12L,
76L, 74L, 29L, 28L, 25L, 30L, 42L, 39L, 56L, 4L, 67L, 71L, 31L,
36L, 23L, 38L, 69L, 55L, 26L, 65L, 62L, 35L, 73L, 45L, 14L, 22L,
46L, 13L, 68L, 21L, 1L, 10L, 64L, 24L, 72L, 58L, 51L, 9L, 60L,
37L, 34L, 49L, 33L, 2L, 50L, 32L, 11L, 52L, 44L, 66L, 8L, 5L,
47L, 59L, 53L, 7L, 6L, 77L, 75L, 17L, 27L, 61L, 20L, 18L, 19L,
16L, 54L, 15L, 41L, 3L, 63L, 48L, 57L, 43L, 70L, 40L, 12L, 76L,
74L, 29L, 28L, 25L, 30L, 42L, 39L, 56L, 4L, 67L, 71L, 31L, 36L,
23L, 38L, 69L, 55L, 26L, 65L), .Label = c("3T3-L1", "Adipose Brown",
"Adipose White", "Adrenal Gland", "B Cells (GL7 neg; Alum)",
"B Cells (GL7 neg; KLH)", "B Cells (GL7 pos; Alum)", "B Cells (GL7 pos; KLH)",
"B Cells Marginal Zone", "B220+ Dend. Cells", "BA/F3", "Bladder",
"Bone", "Bone Marrow", "C2C12", "CD4+ SP Thymoctyes", "CD4+ T cells",
"CD4+/CD8+ DP Thymocytes", "CD8+ SP Thymocytes", "CD8+ T cells",
"CD8a+ Dend. Cells Lymphoid", "CD8a+ Dend. Cells Myeloid", "Ciliary Bodies",
"Common Myeloid Progenitor", "Cornea", "Dorsal Root Ganglia",
"Embryonic Fibroblasts", "Embryonic Stem Line Bruce4 P13", "Embryonic Stem Line V26 2 P16",
"Epidermis", "Eyecup", "Follicular B Cells", "Foxp3+ Tcells",
"Granulo Monoprogenitor", "Granulocytes", "Heart", "Hematopoietic Stem Cells",
"Iris", "Kidney", "Lacrimal Gland", "Large Intestine", "Lens",
"Liver", "Lung", "Lymph Nodes", "Macrophage Peri ", "Mammary Gland",
"Mammary Gland Non-Lactating", "Mast Cells", "Mast Cells IgE",
"Mast Cells IgE 1hr", "Mast Cells IgE 6hr", "Megaerythrocyte Progenitor",
"mIMCD-3 Cells", "MIN6 cells", "Neuro2a Neuroblastoma Cells",
"NIH 3T3", "NK Cells", "Osteoblast Day14", "Osteoblast Day21",
"Osteoblast Day5", "Osteoclasts", "Ovary", "Pancreas", "Pituitary",
"Placenta", "Prostate", "RAW 264.7 Cells", "Retinal Pigment Epithelium",
"Salivary Gland", "Skeletal Muscle", "Small Intestine", "Spleen",
"Stem Cells C3H/10T1/2", "Stomach", "Umbilical Cord", "Uterus"
), class = "factor"), variable = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L), .Label = c("LPS_IV_SP", "MPL_IV_SP"), class = "factor"),
value = c(3.647, 33.629, 17.838, 33.917, 29.66, 31.694, 32.603,
24.152, 19.969, 24.012, 40.101, 12.682, 0.323, 12.846, 5.087,
11.707, 16.682, 7.71, 22.472, 10.21, 10.109, 12.643, 12.623,
1.48, 13.075, 5.042, 12.19, 11.691, 15.24, 17.073, 5.854,
5.188, 11.983, 18.679, 6.406, 4.474, 5.445, 8.144, 0.739,
3.652, 14.232, 17.1, 2.603, 1.762, 1.993, 3.475, 10.305,
7.457, 1.189, 2.895, 4.181, 3.06, 5.885, 3.063, 2.532, 1.662,
3.86, 5.094, 5.916, 4.553, 3.703, 2.546, 0.764, 0.597, 1.39,
2.933, 0.665, 0.121, 0.257, 0.764, 0.196, 0.208, 0.232, 0.001,
0.004, 0.035, 0.036, 56.156, 53.485, 48.206, 45.975, 41.067,
40.581, 38.155, 33.009, 29.468, 29.219, 27.945, 19.165, 15.985,
15.682, 15.077, 14.72, 13.856, 13.576, 12.914, 12.77, 12.577,
12.526, 11.05, 10.532, 10.008, 9.942, 9.238, 8.67, 8.237,
7.938, 7.819, 7.55, 7.349, 7.217, 7.146, 6.158, 5.852, 5.368,
5.328, 5.126, 4.887, 4.767, 4.24, 3.858, 3.816, 3.676, 3.318,
3.118, 2.459, 2.269, 2.266, 2.201, 1.467, 1.418, 1.368, 1.267,
1.077, 1.022, 0.835, 0.667, 0.655, 0.609, 0.53, 0.452, 0.24,
0.239, 0.211, 0.124, 0.084, 0.05, 0.028, 0.024, 0.016, 0.007,
0.006, 0.003, 0.002)), row.names = c(NA, -154L), .Names = c("CellTypes",
"variable", "value"), class = "data.frame")
It looks like this:
> tail(dat.melt,n=5L)
CellTypes variable value
150 Iris MPL_IV_SP 0.016
151 Retinal Pigment Epithelium MPL_IV_SP 0.007
152 MIN6 cells MPL_IV_SP 0.006
153 Dorsal Root Ganglia MPL_IV_SP 0.003
154 Pituitary MPL_IV_SP 0.002
> head(dat.melt,n=5L)
CellTypes variable value
1 Osteoclasts LPS_IV_SP 3.647
2 Granulocytes LPS_IV_SP 33.629
3 Spleen LPS_IV_SP 17.838
4 Lymph Nodes LPS_IV_SP 33.917
5 Bone Marrow LPS_IV_SP 29.660
>
For each variable MPL_IV_SP and LPS_IV_SP I would like to select top-5 rows ('cell type') sorted descending by values. How can I do that?
You can do using data.table package as well. Below is the code:
library(data.table)
dat.melt <- data.table(dat.melt)
dat.melt[, .SD[1:5], by=variable]
The advantage of data.table is that it is faster than data.frame.
We can use top_n
library(dplyr)
dat.melt %>%
group_by(variable) %>%
top_n(5, value)
NOTE: In the other answer, there is no sorting done. But, I can understand the biased voting.

Resources