How to fix jagged line from predict_gam in ggplot2? - r

Data:
structure(list(ID = c(19903L, 28185L, 28207L, 28429L, 28522L,
29092L, 29127L, 29219L, 29304L, 30981L, 31166L, 31411L, 32010L,
33231L, 33640L, 33714L, 34093L, 34193L, 34385L, 35054L, 35337L,
35377L, 35608L, 35881L, 35940L, 37112L, 37122L, 37125L, 37170L,
37198L, 37266L, 37378L, 37589L, 37725L, 37877L, 38519L, 38522L,
38605L, 38623L, 38806L, 39040L, 39083L, 39159L, 39218L, 39593L,
39636L, 39657L, 39686L, 39700L, 39819L, 39820L, 39951L, 40151L,
40152L, 40181L, 40226L, 40248L, 40286L, 40382L, 40556L, 40623L,
40628L, 40798L, 40800L, 40815L, 40915L, 43282L, 43299L, 43450L,
43466L, 43509L, 43677L, 43740L, 43762L, 43998L, 44068L, 44130L,
44131L, 44307L, 44408L, 50679L, 50848L, 51064L, 51455L, 51690L,
51726L, 51727L, 51796L, 52126L, 52183L, 52461L, 52500L, 52502L,
52577L, 52614L, 53202L, 53320L, 53390L, 53456L, 53473L, 53474L,
53475L, 53577L, 53626L, 53851L, 53873L, 54153L, 54206L, 54532L,
54581L, 54913L, 55122L, 55267L, 55332L, 55462L, 55542L, 55612L,
55728L, 55867L, 55903L, 55920L, 55991L, 56022L, 56098L, 56307L,
56420L, 56679L, 56703L, 56746L, 56919L, 57005L, 57035L, 57405L,
57445L, 57480L, 57725L, 57808L, 57809L, 57863L, 58004L, 58060L,
58130L, 58145L, 58215L, 58229L, 58503L, 58515L, 58667L, 58999L,
59326L, 59327L, 59344L, 59361L, 59428L, 59756L, 59865L, 60099L,
60100L, 60169L, 60252L, 60280L, 60306L, 60384L, 60429L, 60472L,
60493L, 60503L, 60575L, 60603L, 60662L, 60664L, 60806L, 60846L,
60925L, 61274L, 61415L, 61727L, 61749L, 61882L, 61883L, 62081L,
62144L, 62210L, 62285L, 62411L, 62809L, 62917L, 62934L, 62937L,
62983L, 62989L, 63327L, 63329L, 63383L, 63458L, 63470L, 63589L,
64081L, 64328L, 64418L, 64507L, 64596L, 65178L, 65250L, 65302L,
65478L, 65480L, 65487L, 65565L, 65572L, 65574L, 65617L, 65802L,
65865L, 65934L, 65935L, 65974L, 65975L, 65978L, 65991L, 65995L,
66013L, 66154L, 66232L, 66237L, 66245L, 66314L, 66389L, 66396L,
66460L, 66572L, 66589L, 66735L, 67174L, 73230L, 73525L, 73539L,
73677L, 73705L, 73942L, 73953L, 74034L, 74113L, 74114L, 74425L,
74427L, 74439L, 74607L, 74618L, 74641L, 74657L, 74794L, 74800L,
74836L, 74942L, 74952L, 74962L, 74969L, 74975L, 74977L, 74985L,
74989L, 75220L, 75229L, 75377L, 75407L, 75432L, 75653L, 75732L,
75735L, 75737L, 75757L, 75895L, 75898L, 76381L, 76559L, 76574L,
76594L, 76595L, 76746L, 76751L, 76755L, 76759L, 76775L, 77088L,
77091L, 77099L, 77109L, 77134L, 77182L, 77188L, 77203L, 77204L,
77252L, 77304L, 77453L, 77528L, 77556L, 77585L, 77668L, 77733L,
77758L, 78262L, 79724L, 79730L, 79747L, 79850L, 79977L, 80052L,
80819L, 80901L, 80932L, 81064L, 81065L, 81071L, 81098L, 81112L,
81142L, 81175L, 81727L, 81938L, 82554L, 83744L, 83949L), Age = c(83L,
26L, 26L, 20L, 84L, 20L, 23L, 77L, 32L, 14L, 21L, 9L, 76L, 18L,
21L, 15L, 75L, 27L, 34L, 81L, 81L, 15L, 24L, 24L, 16L, 35L, 27L,
7L, 30L, 31L, 24L, 24L, 79L, 30L, 19L, 78L, 25L, 20L, 42L, 62L,
83L, 79L, 18L, 26L, 66L, 23L, 83L, 21L, 77L, 24L, 57L, 42L, 32L,
76L, 85L, 29L, 77L, 65L, 79L, 9L, 34L, 20L, 11L, 16L, 9L, 21L,
16L, 34L, 22L, 19L, 23L, 25L, 14L, 53L, 28L, 79L, 22L, 22L, 21L,
82L, 81L, 16L, 19L, 77L, 15L, 18L, 15L, 78L, 24L, 16L, 14L, 29L,
18L, 50L, 17L, 43L, 8L, 14L, 85L, 31L, 20L, 30L, 23L, 78L, 29L,
6L, 61L, 14L, 22L, 10L, 83L, 15L, 13L, 15L, 15L, 29L, 8L, 9L,
15L, 8L, 9L, 15L, 9L, 34L, 8L, 9L, 9L, 16L, 8L, 25L, 21L, 23L,
13L, 56L, 10L, 7L, 27L, 8L, 8L, 8L, 8L, 80L, 80L, 6L, 15L, 42L,
25L, 23L, 21L, 8L, 11L, 43L, 69L, 34L, 34L, 14L, 12L, 10L, 22L,
78L, 16L, 76L, 12L, 10L, 16L, 6L, 13L, 66L, 11L, 26L, 12L, 16L,
13L, 24L, 76L, 10L, 20L, 13L, 25L, 14L, 12L, 15L, 43L, 51L, 27L,
15L, 24L, 34L, 63L, 17L, 15L, 9L, 12L, 17L, 82L, 75L, 24L, 44L,
69L, 11L, 10L, 12L, 10L, 10L, 70L, 54L, 45L, 42L, 84L, 54L, 23L,
23L, 14L, 81L, 17L, 42L, 44L, 16L, 15L, 43L, 45L, 50L, 53L, 23L,
53L, 49L, 13L, 69L, 14L, 65L, 14L, 13L, 22L, 67L, 59L, 52L, 54L,
44L, 78L, 62L, 69L, 10L, 63L, 57L, 22L, 12L, 62L, 9L, 82L, 53L,
54L, 66L, 49L, 63L, 51L, 9L, 45L, 49L, 77L, 49L, 61L, 62L, 57L,
67L, 16L, 65L, 75L, 45L, 16L, 55L, 17L, 64L, 67L, 56L, 52L, 63L,
10L, 62L, 14L, 66L, 68L, 15L, 13L, 43L, 47L, 55L, 69L, 21L, 67L,
34L, 52L, 15L, 31L, 64L, 55L, 13L, 48L, 71L, 64L, 13L, 25L, 34L,
50L, 61L, 70L, 33L, 57L, 51L, 46L, 57L, 69L, 46L, 8L, 11L, 46L,
71L, 33L, 38L, 56L, 17L, 29L, 28L, 6L, 8L), Sex = structure(c(1L,
1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L,
1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L,
1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L,
2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L,
2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L,
2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L,
2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L,
2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L,
2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L,
1L, 2L, 2L), .Label = c("Male", "Female"), class = "factor"),
mean_FA_scaled = c(-1.52160414281774, -1.30073487609629,
-1.39164271432334, -1.83373601712535, -2.19478262184568,
-0.47769168350816, -1.66624867866514, -0.36061779499817,
-1.10976759821506, -2.01706489349897, -1.21708170925372,
-0.68001882107227, -0.770347444019124, -1.21756680205088,
-1.04908755742334, -0.654272701867476, 0.791455877697352,
0.0263414533200063, -1.48353521852673, -1.48465744813212,
0.885781086077571, 0.937258844105155, -1.76609091258925,
-1.40930154017838, -1.42620014597815, -0.395529996012095,
-1.79188771313106, -1.6968602062236, -1.6213377738768, -1.26578647412735,
-1.3364652186935, -1.52114801078458, 0.587760344033774, -1.4860765255686,
-1.41824317606643, -1.08076339305916, -1.84290933912549,
-1.42950167307528, -0.186882171702826, 0.94192876730175,
-1.96157606965602, -0.668579319288362, -1.2972378638421,
-2.10201405453099, 0.593407693015703, -1.87521507137852,
-0.399874110613579, -2.16173114991939, -1.71213049306692,
-2.03230549555918, 0.864393561856266, 1.66450706953957, -1.76062456838238,
-1.42625806750617, -0.635317881823001, -1.05738481631217,
-0.905876579394418, 0.0731565283419971, -1.15139145628828,
-0.742407546940581, -1.69348627721645, 0.153573329806466,
-1.09929828202549, -0.982123030841461, 0.725678742439884,
-0.850887328730634, -0.99078229928042, 0.215368360012574,
-0.402661584149531, 0.0241114744912448, -0.71105027970887,
0.366463906043185, 0.957024565541906, 0.669292134912623,
1.05465854121026, 1.82844671440856, -0.181835758574102, 0.736386984932541,
-1.09078381740658, 0.0590019549321627, -1.02109697900777,
0.321350275906775, -0.0449237467173357, 0.0239956314352051,
0.117669222625202, -0.725516181331811, 0.387590783388401,
0.829691326381412, 1.37355999410519, -0.459526044282955,
-0.460235583001197, -0.311304854080326, 0.578796987572713,
0.997164184459617, 0.18257029477137, 0.291839257380694, -0.863007408468775,
-1.87780705975741, 2.29568520056216, 0.00319456268509986,
0.881190804982003, 0.930713711438919, -0.525093214001351,
2.54459572703618, 0.166620153992923, 1.20602921449896, -0.289055747129726,
1.46280982859267, -0.391909900510859, 2.11139337878521, 1.59105533181948,
-0.209203680563451, -0.763585105622814, -0.373635658420616,
0.6654186327263, -1.62880965099135, -0.961003393687248, 0.201720599972912,
-0.335957704443747, 0.757593504378786, -0.162251041912412,
0.141221563956246, 0.0760670851249914, -2.24164331007099,
0.424957409152164, -0.0769326311392693, -0.0363368801884033,
0.30505984615121, -0.551628514025415, 0.33740901955026, -0.31017538428394,
0.966704700912213, -1.19032920349958, 0.711567610176064,
0.67279638735782, -0.599819225337876, 0.0996845881750585,
0.656310472445189, -0.0716472917074639, -0.483100106187007,
-0.511691620455773, 2.1239406297925, 1.29844301245453, 0.101559797644699,
-1.35720112572458, 0.307058138867893, -0.0785544339238233,
0.27531714151305, -0.660383423073563, -0.957274695320974,
-1.47069111968835, -0.526229923988739, -0.645664114765535,
-0.887580616731169, 0.119110020634694, -0.368379279752821,
-1.37513507883771, 0.756384392481372, 0.0675019391690662,
1.18129672203451, 0.788168830982229, 0.780204620879509, 0.283447876008828,
0.146224535938955, -0.389296191558966, 0.807326376374772,
0.590410253940679, -0.41226207741881, -1.02024263646948,
0.0042805913354707, -0.217414057160255, 0.302561980255357,
-0.0445038156391923, -0.782909175408415, 0.298159944125853,
0.0170233274998232, -0.0487465675666421, -0.456839933421037,
0.310127979852941, -0.787615299560023, -0.21877521306872,
-0.395986128045251, -0.266386709100983, 0.372589107631277,
-0.47845190356342, 0.546216128061583, -0.483150787524024,
-0.638590448156119, 2.21420409102033, 0.550980173741211,
0.781797462900053, 0.0321553266949922, 0.224223113608598,
0.45913835087484, 0.924827436153908, 2.19646562306427, -0.622017650951458,
0.554498906568413, -0.0470089217260485, -0.401307668432068,
-0.588777934059104, 0.462266113387909, 0.263008816808847,
-0.162403085923465, -0.062640494100388, 0.660965915259779,
0.113397509933743, 0.191685695243484, 1.14629763872856, 0.407899519150338,
0.473039517599588, 0.589070818605222, 1.07992680780889, 0.0233440142449823,
0.303792812725778, 0.560066613449315, -0.401387310533095,
-0.286101749200717, -0.673299923821975, 1.66157479218356,
1.44751130500445, 0.402802424684597, 1.46472123901732, -0.397311082998703,
-0.641768892006205, 0.839031172774602, -0.603272796446055,
1.48020076738061, -0.550643848049078, 0.299513859843316,
0.739782634512702, 0.517841819522891, 0.240976915588321,
0.407841597622318, 1.04632508136641, 0.140700270204069, 0.320249766874399,
-0.0720093012575883, 0.191207842637321, 1.89043722977174,
1.44823532410469, -0.403472485541808, 1.81747058484881, 0.510261339543303,
0.874862878045841, -0.274271277102676, 1.60814942277632,
-0.625188854610541, 0.262176194843562, 0.546426093600656,
-0.0371912227266948, -0.0447861830882888, 1.43379838324576,
-0.0424331210124857, 1.86971580312266, -0.228122299652913,
0.731789463645971, 0.0910470403091081, 0.618791802670374,
0.267229848163289, 0.199251694841068, 0.246957313356364,
1.87125072361518, -1.40312565725327, -0.190900477709198,
0.257180463051856, 1.48421907338698, 0.0556569866890196,
-0.667601893503029, 0.247688572647614, 0.188977863808559,
0.91364858124609, 1.5448556730327, 0.930329981315788, 0.312119032378622,
1.15772266013046, -0.0360834735033167, 1.78212397237474,
-0.861407326257228, 0.476608931763807, 1.38366006055364,
0.803771442592559, 0.145174708243597, -1.13023561817905,
0.570130478942752, 0.862605234678655, -0.328963679935357,
0.654840713671687, 0.852222800781108, 0.304538552399032,
0.652132882236762, -0.639712677761503, 0.046078213992748,
-0.171257839519489, 0.349420496423362, 0.184018332971865,
0.149583984564103, 1.29365724620189, 0.621419992004272, -0.866656464734021,
1.09066401106555, 0.810541021179871, 1.62963106948065, 1.03406743799922,
-0.118969180099629, -0.372665472826285, 1.40028353909531,
0.381002209576151, 0.508378889882659, 0.667424165633985,
0.4092534348678, 0.813183690895774, 1.08099111588625, 0.708867018932142,
0.0693192271106869, 1.26885235182742, -0.117571823236151,
0.174801569825717, 0.584835306868775, -0.84211945742664,
1.05460061968224, 1.61507104537468, -1.62830066556388, 0.0799550676933195
), RAVLT_DELAY = c(NA, 12L, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 5L, NA, NA, NA, NA, NA, NA, NA,
7L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 5L, 12L,
NA, NA, NA, NA, 14L, NA, NA, NA, NA, NA, 6L, 7L, NA, NA,
NA, NA, 7L, 1L, 1L, 11L, 4L, 12L, 7L, 9L, 9L, 8L, 14L, 12L,
7L, 12L, 7L, 6L, 13L, 10L, 13L, NA, 11L, 14L, 8L, 0L, 11L,
15L, 13L, 6L, 9L, 9L, 12L, 5L, 14L, 15L, 12L, 4L, 15L, 8L,
15L, 14L, 5L, 12L, 8L, 9L, 9L, 13L, 6L, 4L, 10L, NA, 4L,
13L, 9L, 14L, 8L, 15L, 14L, 9L, 15L, 14L, 11L, 11L, 15L,
12L, 9L, 13L, 14L, 7L, 13L, 9L, 12L, 10L, 6L, 9L, 10L, 11L,
15L, 11L, 11L, NA, 9L, 12L, 10L, 9L, 11L, 2L, 12L, NA, 6L,
12L, 12L, 10L, 11L, 4L, 13L, 4L, 5L, 6L, 12L, 15L, 11L, 11L,
14L, 2L, 11L, 5L, 10L, 12L, 10L, NA, 12L, 8L, 12L, 12L, 8L,
7L, 14L, 14L, 7L, 8L, NA, 9L, 6L, 15L, 7L, 14L, 8L, 14L,
11L, 13L, 6L, 12L, 11L, 14L, 15L, 10L, 6L, 13L, 7L, 4L, 12L,
14L, 7L, 13L, 3L, 13L, 7L, 10L, 6L, 8L, 3L, 15L, 11L, 15L,
11L, 11L, 8L, 4L, 7L, 10L, 5L, 7L, 8L, 9L, 14L, 12L, 14L,
12L, NA, NA, 11L, 10L, 13L, 7L, 12L, 12L, 14L, 8L, 13L, 2L,
11L, 8L, 7L, 4L, 7L, 9L, 4L, 12L, 14L, 15L, 12L, 13L, 9L,
7L, 11L, 10L, 14L, 6L, 5L, 5L, 10L, 8L, 5L, 12L, 2L, 11L,
8L, NA, 9L, 7L, 8L, 12L, 10L, 7L, 13L, 15L, 9L, 6L, 4L, 10L,
8L, 13L, 10L, 9L, 7L, 7L, 15L, 8L, 12L, 9L, 10L, 12L, 6L,
13L, 8L, 11L, 9L, 1L, 13L, 12L, NA, 8L, 2L, 11L, 9L, 7L,
6L, 10L, 13L, 15L, 6L, 5L, 7L, 5L, 5L, 11L, 11L, 13L, 9L,
4L, 10L, 2L, NA, 12L, 10L, 15L, NA, 6L)), row.names = c(NA,
-324L), class = c("tbl_df", "tbl", "data.frame"))
I am using the following model in mgcv::gam:
# Gaussian GAM for mean_FA_scaled, estimated by REML, with smooth terms in
# Age and RAVLT_DELAY plus Sex-specific ("by = Sex") terms for both.
m1 <- gam(
  mean_FA_scaled ~
    s(Age, bs = "ad", k = -1) +
    Sex +
    te(Age, by = Sex, bs = "fs") +
    te(RAVLT_DELAY, by = Sex, bs = "fs") +
    s(RAVLT_DELAY),
  data = DF,
  method = "REML",
  family = gaussian
)
I would like to reproduce the gam plot:
but in ggplot2. However, when I use predict_gam() my plot is very jagged. This doesn't happen when I plot the smooth term effect of Age.
# Plot the model predictions against RAVLT_DELAY, one colour per Sex.
m1_p <- predict_gam(m1)
# Every layer must be joined with `+`; without it after geom_line() the
# geom_smooth_ci() and theme_classic() calls are evaluated as a separate
# expression and never reach the plot.
m1_p %>%
  ggplot(aes(x = RAVLT_DELAY, y = fit)) +
  geom_line(aes(color = Sex)) +
  geom_smooth_ci(Sex, size = 1, alpha = 1) +
  theme_classic(base_size = 24)

Your fit object has predictions for each age and each sex along the length of RAVLT_DELAY. With your existing code, each Sex series tries to draw all the values from these various lines as one single line, hence the jaggies.
If we tell ggplot to treat each Age,Sex combination as a different series (aka group), we get:
# Draw one line per Age-by-Sex combination, coloured by Sex.
ggplot(m1_p, aes(x = RAVLT_DELAY, y = fit)) +
  geom_line(aes(color = Sex, group = interaction(Age, Sex)))
There are a lot of age groups here, which we could see separately with:
# Round Age to one decimal, then draw one panel per rounded Age value.
m1_p %>%
  mutate(Age = round(Age, 1)) %>%
  ggplot(aes(x = RAVLT_DELAY, y = fit, color = Sex)) +
  geom_line() +
  facet_wrap(~ Age, ncol = 10)
While wrong, I liked the aesthetic qualities that arose when I grouped by Age only:

I had the same problem and I finally managed to fix it, or that's what I think. I am a beginner, not an expert, so sorry for my dummy language in this field.
This is happening because you have more variables in your model, apart from the ones you are plotting, that cause variance.
So, what you have to do is create a new data frame in which every variable you are not plotting is held fixed: for numeric variables you can use the mean, for factors pick one level, and so on.
Then get the predictions with the function predict.gam (with a dot, not predict_gam with an underscore), which lets you supply new data through its newdata argument; that will be the data frame with your other variables fixed.
The result of predict.gam then has to be turned into a data frame to plot it, so you bind it to your new data, and then you can use ggplot2 and geom_smooth_ci with no problem.
EXAMPLE:
# Fit the model: a smooth of v4 for each level of v3 and a smooth of v2 for
# each level of v1. (`circ`, `x` and `v1`..`v4` are placeholder names.)
model <- gam(x ~ s(v4, by = v3) + s(v2, by = v1), data = circ)
# I want to plot the first smooth, so first create the prediction grid:
new <- expand.grid(
  v1 = levels(circ$v1)[1],
  v2 = mean(circ$v2),
  v3 = levels(circ$v3),
  v4 = seq(0, 23, 0.1)
)
# v3 and v4 keep all their levels/values; v1 and v2 are held at one
# arbitrary value so they no longer add variation to the predictions.
pred <- predict.gam(model, newdata = new, se.fit = TRUE)
# With se.fit = TRUE the result has `fit` and `se.fit` components; binding
# it to the grid gives the columns that ggplot() and geom_smooth_ci() use.
mew <- cbind(new, pred)
mew %>%
  ggplot(aes(v4, fit)) +
  geom_smooth_ci(v3, ci_z = 1.96, ci_alpha = 0.05)
If you then need to plot the second smooth, you should create another data frame that fixes the variables that are not part of the second smooth...
Tell me if it worked for you :)

Related

R loess regression

I think I missed something in the use of the loess function and I can't understand what I did wrong. I have a data frame in which I store the output (count) of 3 different software tools for 26 different genes on the genomes of different patients. The 3 tools were each used on the same genome but with different rates of downsampling.
I pooled the results of all the patients by genes. At the end I have a data frame with 4 columns: samplexxx (downsampling rate), software (name of the software I used), gene (the name of the gene) and count (count results given by the software).
My goal is to estimate the downsampling effect (samplexxx) on the count given by the software, and I want to do some regression to be able to compare them with each other.
# Downsampling rates: 5, 10, ..., 100.
rate <- seq(from = 5, to = 100, by = 5)
my attempts:
# Fit one loess curve (count ~ downsampling rate) per gene and collect the
# fitted values in a single long data frame for plotting.
# `genes` and `data2` must already exist; neither is created in this snippet.
gene_names <- as.character(head(genes, 22)) # original loop covered genes 1..22
datalist <- vector("list", length(gene_names))
for (i in seq_along(gene_names)) {
  name <- gene_names[i]
  print(name)
  # Subset once and reuse it, instead of re-filtering data2 for every column.
  gene_rows <- data2[data2$gene == name, ]
  mod <- loess(count ~ samplexxx, data = gene_rows)
  xfit <- unname(predict(mod, newdata = gene_rows))
  # Build the result from the subset itself so it has as many rows as the
  # gene actually has, rather than a fixed 60-row placeholder.
  df <- data.frame(
    down = gene_rows$samplexxx,
    software = gene_rows$software,
    gene = gene_rows$gene,
    loess = xfit
  )
  print(xfit)
  datalist[[i]] <- df
}
data_loess <- do.call(rbind, datalist)
ggplot(data_loess, aes(x = gene, y = loess, fill = software)) +
  geom_boxplot()
and:
# One loess fit of count against downsampling rate over all of data2, then
# one PDF per downsampling rate with a boxplot of the fitted values by software.
mod <- loess(count ~ samplexxx, data = data2)
xfit <- predict(mod, newdata = data2)
# Store the fitted values next to their rows so that aes(y = loess) refers to
# a real column; work on a copy so data2 itself is left unchanged.
data2_fit <- data2
data2_fit$loess <- as.numeric(xfit)
for (i in seq_along(rate)) {
  down <- rate[i]
  print(down)
  title <- paste("loess_downsampling", down)
  out <- paste0("loess_downsampling", down, ".pdf")
  pdf(out, width = 10)
  # NOTE(review): restricting each plot to the rows at this rate is an
  # assumption based on the per-rate title and file name -- confirm.
  rate_rows <- data2_fit[data2_fit$samplexxx == down, ]
  # The whole plot must be built inside print() for it to reach the PDF.
  print(
    ggplot(rate_rows, aes(x = factor(samplexxx), y = loess, fill = software)) +
      geom_boxplot() +
      ggtitle(title)
  )
  dev.off()
}
Sample data:
> dput(data2)
structure(list(samplexxx = c(5L, 10L, 15L, 20L, 25L, 30L, 35L,
40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L,
5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L,
70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L,
35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L,
100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L,
65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L,
30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L,
95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L,
60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L,
25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L,
90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L,
55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L,
20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L,
85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L,
50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L,
15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L,
80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L,
45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L,
5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L,
70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L,
35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L,
100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L,
65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L,
30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L,
95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L,
60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L,
25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L,
90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L,
55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L,
20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L,
85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L,
50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L,
15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L,
80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L,
45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L,
5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L,
70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L,
35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L,
100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L,
65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L,
30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L,
95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L,
60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L,
25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L,
90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L,
55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L,
20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L,
85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L,
50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L,
15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L,
80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L,
45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L,
5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L,
70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L,
35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L,
100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L,
65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L,
30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L,
95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L,
60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L,
25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L,
90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L,
55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L,
20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L,
85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L,
50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L,
15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L,
80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L,
45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L,
5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L,
70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L,
35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L,
100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L,
65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L,
30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L,
95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L,
60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L,
25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L,
90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L,
55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L,
20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L,
85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L,
50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L,
15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L,
80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L,
45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L,
5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L,
70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L,
35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L,
100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L,
65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L,
30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L,
95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L,
60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L,
25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L,
90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L,
55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L, 15L,
20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L,
85L, 90L, 95L, 100L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L,
50L, 55L, 60L, 65L, 70L, 75L, 80L, 85L, 90L, 95L, 100L, 5L, 10L,
15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L,
80L, 85L, 90L, 95L, 100L), software = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("EH", "GangSTR", "Tred"), class = "factor"),
gene = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L,
15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L,
15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L,
18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L,
18L, 18L, 18L, 18L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L,
19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 21L, 21L, 21L, 21L,
21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L,
21L, 21L, 21L, 21L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L,
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L,
15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 17L, 17L, 17L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L,
17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L,
18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L,
19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L,
19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 20L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L,
21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L,
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L,
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L,
13L, 13L, 13L, 13L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L,
15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L,
15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L,
18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L,
18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 19L, 19L, 19L, 19L,
19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L,
19L, 19L, 19L, 19L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L,
21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L,
21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 22L, 22L, 22L, 22L,
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L,
22L, 22L, 22L, 22L), .Label = c("AFF2", "AR", "ATN1", "ATXN1",
"ATXN10", "ATXN2", "ATXN3", "ATXN7", "C9ORF72", "CACNA1A",
"CBL", "CNBP", "CSTB", "DIP2B", "DMPK", "FMR1", "FXN", "HTT",
"JPH3", "NOP56", "PPP2R2B", "TBP"), class = "factor"), count = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, 24L, 24L, 24L, 24L, 24L,
24L, 24L, 24L, 24L, 24L, 24L, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L,
21L, NA, NA, NA, NA, NA, NA, NA, NA, NA, 17L, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 15L, 15L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, NA, NA, NA, NA, 20L, 34L, 31L, 33L, 34L, 34L, 34L, 34L,
34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, NA, NA, NA, NA, NA,
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L,
22L, 22L, 22L, NA, NA, NA, NA, NA, 22L, 24L, 24L, 24L, 24L,
24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, NA, NA,
NA, NA, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, NA, NA, NA, NA, 6L, 8L, 8L,
8L, 8L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, NA, NA,
NA, NA, 11L, NA, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, NA, NA, NA, 12L, 5L, NA, 12L,
12L, 5L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, NA, NA, NA, NA, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L,
15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 20L, 20L, 18L, 20L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, NA, NA, NA, NA, 27L, 24L,
21L, 14L, 27L, 14L, 21L, 27L, 27L, 14L, 27L, 27L, 27L, 27L,
27L, 27L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 68L, 73L,
78L, 54L, 79L, 76L, 87L, 72L, 62L, 63L, NA, NA, NA, NA, NA,
27L, 27L, 27L, 28L, 27L, 27L, 64L, 27L, 64L, 64L, 27L, 27L,
27L, 27L, 27L, NA, NA, NA, NA, NA, 18L, 20L, 18L, 20L, 20L,
18L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, NA, NA,
NA, NA, NA, 15L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 9L, 7L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, NA, NA, NA, NA, NA, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L,
14L, 14L, NA, NA, NA, NA, NA, 35L, 29L, 35L, 35L, 30L, 35L,
32L, 35L, 35L, 35L, 35L, 35L, 35L, 35L, 35L, 11L, 19L, 19L,
19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L,
19L, 19L, 19L, 19L, 19L, 20L, 11L, 20L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L,
20L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 33L, 33L, 32L, 33L, 33L, 33L, 33L, 33L, 33L, 33L,
33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, NA, 21L,
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L,
22L, 22L, 22L, 22L, 22L, 22L, 19L, 21L, 21L, 21L, 21L, 21L,
21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L,
21L, 19L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 8L, 8L,
7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 11L, NA, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 7L, 15L, 15L, 13L, 15L, 15L, 15L, 15L, 15L, 15L,
15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 27L, 19L, 27L, 27L, 27L,
27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L,
27L, 27L, NA, 76L, 23L, 23L, 23L, 32L, 65L, 32L, 28L, 32L,
28L, 32L, 32L, 23L, 28L, 32L, 28L, 28L, 32L, 84L, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 14L, 18L, 17L, 17L, 17L, 17L, 17L, 17L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 15L,
NA, NA, 15L, NA, 15L, NA, NA, 15L, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 9L, NA, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 14L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, NA, 28L, 36L, 36L, NA, 36L, 36L, 36L,
36L, NA, 36L, NA, 36L, 36L, 36L, 36L, 36L, NA, 36L, 36L,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1L, 8L, 18L, 16L, 15L, 14L, 15L, 16L, 15L, 16L, 14L, 15L,
14L, 14L, 14L, 14L, 16L, 16L, 16L, 16L, 31L, 28L, 31L, 31L,
32L, 32L, 32L, 33L, 31L, 33L, 32L, 31L, 32L, 32L, 32L, 32L,
32L, 32L, 32L, 32L, 7L, 18L, 22L, 22L, 22L, 22L, 22L, 22L,
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L,
19L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L,
21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 5L, 6L, 6L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 12L, 11L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 5L, 7L, 7L, 7L, 7L, 11L, 11L, 7L,
11L, 15L, 15L, 11L, 7L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
1L, 2L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 4L, 20L, 17L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 1L, 2L, 1L, 1L,
1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 1L, 15L, 6L, 22L, 13L, 14L, 13L, 14L, 13L, 14L, 14L,
27L, 27L, 14L, 14L, 27L, 14L, 27L, 14L, 27L, NA, 15L, 20L,
20L, 20L, 20L, 40L, 20L, 40L, 20L, 40L, 40L, 40L, 40L, 20L,
40L, 40L, 40L, 40L, 32L, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 15L, 14L,
17L, 17L, 17L, 19L, 17L, 13L, 17L, 17L, 17L, 17L, 17L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 5L, 3L, 1L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 5L, 3L,
1L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 12L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, NA,
2L, 3L, 2L, 29L, 33L, 33L, 35L, 33L, 35L, 35L, 33L, 35L,
35L, 33L, 35L, 35L, 35L, 35L, 35L)), class = "data.frame", row.names = c(NA,
-1320L))
I believe the LOESS fit should be done separately for each level of "software", i.e. on the data split by "software".
# Fit one LOESS curve of count on samplexxx per software and stack the
# per-software rows, each extended by its fitted value (count_pred).
software <- unique(data2$software)
data_loess <- do.call(rbind, lapply(software, \(sw) {
  # which() drops rows whose comparison is NA, as subset() does.
  rows <- data2[which(data2$software == sw), , drop = FALSE]
  smoother <- loess(count ~ samplexxx, data = rows)
  cbind(rows, count_pred = predict(smoother, newdata = rows))
}))
Note: R version 4.1.2 (2021-11-01)
Gives:
# Show the first 10 rows with samplexxx above 80, including count_pred.
head(data_loess[data_loess$samplexxx > 80, ], 10)
# samplexxx software gene count count_pred
# 17 85 EH AFF2 24 22.69004
# 18 90 EH AFF2 24 22.31879
# 19 95 EH AFF2 24 21.83428
# 20 100 EH AFF2 24 21.25618
# 37 85 EH AR 21 22.69004
# 38 90 EH AR 21 22.31879
# 39 95 EH AR 21 21.83428
# 40 100 EH AR 21 21.25618
# 57 85 EH ATN1 NA 22.69004
# 58 90 EH ATN1 NA 22.31879
And here a plot of "count" predictions on "samplexxx".
# Scatter of the LOESS predictions against samplexxx, one colour per software.
# Colours are the factor codes shifted by one.
point_cols <- as.numeric(data_loess$software) + 1
plot(
  count_pred ~ samplexxx,
  data = data_loess,
  col = point_cols,
  pch = 20,
  xlab = "Downsampling",
  ylab = "Count (LOESS)"
)
# The legend uses the same code-to-colour mapping for the unique software values.
legend_cols <- as.numeric(software) + 1
legend(
  "topleft",
  legend = software,
  pch = 19,
  col = legend_cols,
  horiz = TRUE,
  cex = .7,
  title = "Software"
)
Looks interesting, but I'm not sure if it's absolutely right.
In my answer you see something different from for loops, which is probably new to you; however, it's the R-ish way and it's much shorter to code. Here, lapply() does the looping job.
Anyway, hope this helps.

Use rbind() in nested for loop with apply() in r

How can you use rbind in a for loop that runs through a list of dataframes? I tried to follow Looping through list of data frames in R but receive the following:
Error in apply(dataFramesList, 2, function(x) { :
dim(X) must have a positive length
I have two dataframes, dfTraining and dfAccuracy (code to reproduce dataframes is below), and need to add a row for any of the crop types missing from either of two columns, CROP or CROP_LABEL. I believe my problem is in my last line of code.
My code block is:
# Attempt to add, to each data frame in the list, one filler row for every
# crop number that is absent from its CROP or CROP_LABEL column.
dataFramesList <- list(dfTraining, dfAccuracy)
# NOTE(review): apply() needs an object with dimensions; dataFramesList is a
# plain list, which matches the reported "dim(X) must have a positive length".
apply(dataFramesList, 2, function(x){
# Crop numbers to check: 1 to 23 without 3, plus 34.
cropNumbers <- seq(1,23, by = 1)
cropNumbers <- cropNumbers[-c(3)]
cropNumbers <- append(cropNumbers, 34)
# NOTE(review): the body refers to dataFrameList (no "s"), which is neither the
# function argument x nor the dataFramesList defined above.
listofCROPandCROP_LABELColumns <- list(dataFrameList$CROP, dataFrameList$CROP_LABEL)
# NOTE(review): missingCROP is initialised here, but the loop assigns missingCrop
# (different capitalisation), so this variable is never used again.
missingCROP <- NULL
# i iterates over the two columns, j over the crop numbers to check.
for (i in listofCROPandCROP_LABELColumns){
for (j in cropNumbers){
if (!j %in% i){
# If crop number is missing from CROP_LABEL, add missingCROP observation (row)
# Make row for missing crop type
missingCrop <- list(FREQUENCY = 0, AA = 1, CROP = j, CROP_LABEL = j, ACRES = 0)
dataFrameList <- rbind(dataFrameList, missingCrop)
}
}
}
})
My dfAccuracy dataframe:
structure(list(FREQUENCY = c(4L, 2L, 1L, 1L, 1L, 1L, 65L, 1L,
1L, 4L, 1L, 5L, 5L, 2L, 4L, 1L, 1L, 1L, 1L, 4L, 9L, 2L, 1L, 1L,
1L, 2L, 4L, 1L, 2L, 18L, 1L, 10L, 3L, 1L, 7L, 1L, 1L, 1L, 3L,
1L, 7L, 1L), AA = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
CROP = c(1L, 4L, 12L, 13L, 14L, 18L, 1L, 1L, 1L, 1L, 1L,
4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 13L,
13L, 13L, 13L, 14L, 14L, 14L, 18L, 18L, 18L, 18L, 18L, 19L,
19L, 21L, 21L, 21L, 21L), CROP_LABEL = c(1L, 4L, 14L, 13L,
12L, 18L, 1L, 4L, 5L, 6L, 18L, 1L, 4L, 6L, 14L, 18L, 12L,
14L, 18L, 1L, 6L, 14L, 18L, 18L, 4L, 6L, 13L, 21L, 12L, 14L,
18L, 1L, 6L, 14L, 18L, 21L, 1L, 19L, 6L, 13L, 21L, 34L),
ACRES = c(331.737184484, 193.772138572, 26.48543619, 73.2696289437,
112.470306056, 66.6556450342, 3905.71121736, 24.9581079934,
39.9287379709, 259.662359273, 85.2786247851, 306.051491303,
368.342995232, 154.82030835, 265.754349805, 70.3722566979,
35.4066607701, 139.336463432, 58.4307705147, 251.070357093,
471.031628349, 150.965736858, 28.2780117926, 35.3426930108,
34.5730542194, 67.7383953308, 144.442123948, 33.2746560126,
69.4072817311, 1219.65459596, 92.4840910734, 582.983473317,
191.957841327, 35.708775262, 319.638682538, 60.6889287642,
82.6244195055, 36.2898952104, 267.422844756, 72.8352758659,
489.746546145, 65.5392893502)), row.names = c(25L, 26L, 27L,
29L, 30L, 31L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L,
70L, 71L, 72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L, 80L, 81L, 82L,
83L, 84L, 85L, 86L, 87L, 88L, 89L, 90L, 91L, 92L, 93L, 94L, 95L
), class = "data.frame")
and my dfTraining dataframe is:
structure(list(FREQUENCY = c(7L, 1L, 1L, 4L, 2L, 6L, 1L, 107L,
1L, 21L, 1L, 1L, 1L, 2L, 1L, 19L, 3L, 1L, 1L, 12L, 1L, 2L, 32L,
2L, 2L, 29L, 2L, 18L, 1L), AA = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), CROP = c(1L, 1L, 4L, 4L, 12L, 13L, 21L,
1L, 1L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 7L, 12L, 13L, 14L, 14L,
14L, 18L, 18L, 18L, 19L, 21L, 34L), CROP_LABEL = c(1L, 4L, 1L,
4L, 12L, 13L, 21L, 1L, 6L, 4L, 6L, 1L, 5L, 14L, 18L, 6L, 14L,
1L, 12L, 13L, 1L, 6L, 14L, 6L, 14L, 18L, 19L, 21L, 34L), ACRES = c(624.940370218,
26.9188766351, 37.8773839813, 291.79294767, 140.949264214, 391.571023675,
44.5217011939, 6806.02216989, 72.7500299887, 1676.12121152, 14.8739557721,
67.0700291739, 59.7438207953, 82.6713019474, 75.62666152, 1370.78710769,
145.215281276, 41.7380537313, 66.5236760194, 679.91208779, 70.9661875374,
38.8514254734, 1749.63365551, 109.917242057, 79.7758083723, 1660.85759895,
96.8771921798, 1428.71888481, 69.473161379)), row.names = c(18L,
19L, 20L, 21L, 22L, 23L, 24L, 38L, 39L, 40L, 41L, 42L, 43L, 44L,
45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L,
58L, 59L), class = "data.frame")

Removing greater than and less than characters and number of decimals in data frame in r

I have a data frame with 2 variables:
structure(list(X1 = structure(c(17L, 27L, 6L, 1L, 28L, 1L, 1L,4L, 17L, 28L, 28L, 12L, 21L, 28L, 28L, 8L, 28L, 1L, 1L, 10L, 4L, 21L, 30L, 1L, 8L, 28L, 1L, 1L, 1L, 1L, 8L, 1L, 17L, 1L, 1L, 28L, 8L, 23L, 15L, 23L, 25L, 13L, 8L, 4L, 28L, 10L, 1L, 30L, 13L, 4L, 1L, 1L, 17L, 13L, 13L, 8L, 4L, 4L, 4L, 28L, 28L, 13L,1L, 4L, 28L, 1L, 1L, 1L, 1L, 1L, 12L, 2L, 6L, 1L, 8L, 1L, 21L, 1L, 21L, 1L, 30L,13L, 25L, 17L, 1L, 28L, 13L, 1L, 1L, 1L, 1L,8L, 30L, 25L, 28L, 4L, 1L, 13L, 17L, 4L,1L, 1L, 28L, 1L, 1L, 8L, 1L, 8L, 1L, 13L, 1L, 1L, 1L, 4L, 6L, 1L, 1L, 30L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 6L, 6L, 1L, 15L, 21L, 10L, 21L, 1L, 10L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 28L, 28L, 1L, 30L, 15L, 25L, 6L, 17L, 25L, 15L, 8L, 18L, 22L, 14L, 22L, 28L, 30L, 3L, 30L, 14L, 18L, 22L, 24L, 10L, 26L, 26L, 18L, 26L, 30L, 29L, 18L, 14L, 9L, 9L, 16L, 16L, 29L, 18L, 16L, 27L, 24L, 14L, 26L, 5L, 22L, 28L, 22L, 11L, 9L, 26L, 30L, 18L, 28L, 16L, 26L, 7L, 30L, 7L, 28L, 5L, 18L, 9L, 26L, 24L, 27L, 16L, 16L, 14L, 26L, 29L, 5L, 22L, 24L, 26L, 18L, 27L, 9L, 18L, 11L, 14L, 18L, 22L, 29L, 26L, 22L, 26L, 20L, 24L, 14L, 7L, 16L, 24L, 26L, 29L, 24L, 24L, 24L, 20L, 20L, 24L, 11L, 20L, 29L, 16L, 18L, 24L, 24L, 7L, 24L, 18L, 11L, 11L, 24L, 24L, 7L, 11L, 18L, 24L, 24L, 16L, 29L, 7L, 30L, 24L, 22L, 24L, 18L, 26L, 9L, 9L, 24L, 29L, 9L, 24L, 30L, 11L, 24L, 16L, 26L, 26L, 26L, 30L, 26L, 16L, 26L, 24L, 29L, 20L, 24L, 14L, 9L, 7L, 29L, 29L, 15L, 6L, 15L, 2L, 6L, 6L, 3L, 2L, 17L, 30L, 27L, 23L, 2L, 15L, 8L, 13L, 21L, 28L, 23L, 25L, 1L, 25L, 19L, 27L, 23L, 15L, 19L, 19L, 23L, 2L, 27L, 27L, 15L, 2L, 2L, 3L, 23L, 2L, 23L, 6L, 2L, 15L, 13L,1L, 1L, 13L, 28L, 1L, 1L, 28L, 21L, 1L, 28L, 4L, 1L, 17L, 17L, 13L, 21L, 1L, 1L, 1L, 17L, 1L, 1L, 17L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 13L, 1L, 1L, 1L, 1L, 8L,25L, 1L, 28L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 8L, 4L, 1L, 25L, 28L, 13L, 1L, 1L, 28L, 1L, 4L, 1L, 1L, 8L, 1L, 8L, 13L, 4L, 28L, 21L, 28L, 28L, 28L, 28L, 28L, 8L, 1L, 1L, 1L, 1L, 13L, 
21L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 28L, 4L, 1L, 17L, 17L, 28L, 1L, 13L, 8L, 17L, 1L, 13L, 13L, 8L, 4L, 1L, 17L, 25L, 1L, 1L, 8L, 8L, 1L, 4L, 17L, 21L),
.Label = c("<8", ">1024", "1024", "11", "11.000000000000007", "128", "128.00000000000009", "16", "16.000000000000007", "181", "181.00000000000006", "22", "23", "23.000000000000011", "256", "256.00000000000017", "32", "32.000000000000014", "362", "362.00000000000017", "45", "45.000000000000014", "512", "512.00000000000045", "64", "64.000000000000028", "724", "8", "8.0000000000000018", "90"),
class = "factor"),
X2 = structure(c(7L, 2L, 2L, 8L, 18L, 4L, 13L, 18L, 8L, 13L, 8L, 18L, 12L, 13L, 18L, 16L, 7L, 5L, 1L, 16L, 18L, 18L, 18L, 12L, 7L, 1L, 4L, 4L, 2L,16L, 12L, 12L, 2L, 2L, 13L, 13L, 18L, 2L, 16L, 2L, 16L, 16L, 2L, 12L, 16L, 2L, 12L,2L, 2L, 16L, 16L, 2L, 2L, 2L, 2L, 2L, 7L, 18L, 18L, 18L, 13L, 18L, 13L, 18L, 9L, 13L, 8L, 4L, 1L, 13L, 8L, 2L, 16L, 12L, 7L, 7L, 18L, 18L, 18L, 12L, 16L, 7L, 16L, 7L, 12L, 12L, 16L, 12L, 13L, 13L, 12L, 16L, 12L, 12L, 7L, 7L, 13L,16L, 7L, 18L, 16L, 13L, 18L, 4L, 12L, 7L, 4L, 18L, 18L, 18L, 9L, 17L, 13L, 7L, 12L, 7L, 18L, 12L, 18L, 13L, 9L, 1L, 18L, 1L, 13L, 13L, 13L, 1L, 1L, 13L, 12L, 4L, 1L,1L, 4L, 12L, 9L, 1L, 1L, 1L, 2L, 12L, 9L, 2L, 18L, 2L, 18L, 7L, 12L, 1L, 9L, 9L, 7L, 18L, 9L, 18L, 1L, 12L, 13L, 12L, 16L, 7L, 12L, 7L, 16L, 2L, 12L,7L, 16L, 12L, 16L, 2L, 12L, 2L, 15L, 7L, 7L, 2L, 7L, 3L, 12L, 16L, 1L, 17L, 2L, 18L, 5L, 7L, 1L, 16L, 7L, 10L, 1L, 12L, 18L, 16L, 16L, 13L, 12L, 7L, 2L, 1L, 9L, 18L, 12L, 13L, 2L, 2L, 12L, 2L, 2L, 2L, 16L, 2L, 1L, 18L, 12L, 7L, 2L, 2L, 12L, 7L, 12L, 4L, 2L, 18L, 13L, 2L, 16L, 7L, 2L, 2L, 12L, 2L, 14L, 12L, 12L, 16L, 1L, 2L, 4L, 2L, 2L, 2L, 17L, 2L, 2L, 2L, 18L, 16L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 12L, 2L, 2L, 1L, 2L, 12L, 18L, 2L, 15L, 16L, 16L, 2L, 2L, 2L, 2L, 11L, 12L, 14L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 16L, 16L, 12L, 2L, 12L, 2L, 2L, 2L, 12L, 2L,16L, 2L, 12L, 14L, 7L, 2L, 4L, 14L, 2L, 16L, 15L, 7L, 16L, 18L, 2L, 16L, 2L, 2L, 12L, 12L, 2L, 2L, 4L, 2L, 2L, 2L, 16L, 2L, 12L,18L, 3L, 16L, 2L, 2L, 13L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 16L, 16L, 2L, 2L, 4L, 4L, 11L, 13L, 4L, 4L, 8L, 4L, 13L, 1L, 4L, 1L, 1L, 2L, 2L, 11L, 18L, 8L, 8L, 4L, 7L, 8L, 4L, 8L, 4L, 4L, 8L, 8L, 1L, 4L, 8L, 4L, 13L, 1L, 6L, 1L, 17L, 2L, 2L, 8L, 18L, 8L, 8L, 4L, 7L, 8L, 17L, 8L, 4L, 1L, 4L, 13L, 1L, 2L, 4L, 16L, 13L, 4L, 4L, 17L, 4L, 7L, 4L, 4L, 1L, 1L, 4L, 1L, 17L, 8L, 1L, 8L, 1L, 4L, 1L, 8L, 8L, 8L, 1L, 13L, 16L, 16L, 17L, 8L, 13L, 1L, 4L, 7L, 1L, 1L, 4L, 4L, 8L, 6L, 4L, 1L, 12L, 13L, 8L, 4L, 
4L, 18L, 2L, 4L, 8L, 13L, 17L,13L, 18L, 7L, 16L, 7L, 1L, 13L, 8L, 13L, 4L, 1L, 7L),
.Label = c("<8", ">1024", "1024", "11", "128", "16", "181", "22", "23", "256", "32", "362", "45", "512", "64", "724", "8", "90"), class = "factor")),
.Names = c("X1", "X2"),
row.names = c(NA, -471L),
class = "data.frame")
I have 2 questions
1) Each column has some values prefixed with a greater-than sign (>) and some prefixed with a less-than sign (<). I want to remove the > and < characters from the data frame and keep only the numbers. I can do it in Excel, but I want to learn how to do it in R.
2) I want to reduce the values to integers/whole numbers, as some of them have many decimal places.
It may be a small question, but I am struggling to do this. I would highly appreciate any help.
You can use dplyr::mutate_all and stringr::str_replace_all.
The trailing decimals are tiny (of magnitude ~10^(-13)), so after as.numeric the values print as whole numbers; wrap the result in round() if exact integers are required.
your_df <- structure(list(X1 = structure(c(17L, 27L, 6L, 1L, 28L, 1L, 1L,4L, 17L, 28L, 28L, 12L, 21L, 28L, 28L, 8L, 28L, 1L, 1L, 10L, 4L, 21L, 30L, 1L, 8L, 28L, 1L, 1L, 1L, 1L, 8L, 1L, 17L, 1L, 1L, 28L, 8L, 23L, 15L, 23L, 25L, 13L, 8L, 4L, 28L, 10L, 1L, 30L, 13L, 4L, 1L, 1L, 17L, 13L, 13L, 8L, 4L, 4L, 4L, 28L, 28L, 13L,1L, 4L, 28L, 1L, 1L, 1L, 1L, 1L, 12L, 2L, 6L, 1L, 8L, 1L, 21L, 1L, 21L, 1L, 30L,13L, 25L, 17L, 1L, 28L, 13L, 1L, 1L, 1L, 1L,8L, 30L, 25L, 28L, 4L, 1L, 13L, 17L, 4L,1L, 1L, 28L, 1L, 1L, 8L, 1L, 8L, 1L, 13L, 1L, 1L, 1L, 4L, 6L, 1L, 1L, 30L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 6L, 6L, 1L, 15L, 21L, 10L, 21L, 1L, 10L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 28L, 28L, 1L, 30L, 15L, 25L, 6L, 17L, 25L, 15L, 8L, 18L, 22L, 14L, 22L, 28L, 30L, 3L, 30L, 14L, 18L, 22L, 24L, 10L, 26L, 26L, 18L, 26L, 30L, 29L, 18L, 14L, 9L, 9L, 16L, 16L, 29L, 18L, 16L, 27L, 24L, 14L, 26L, 5L, 22L, 28L, 22L, 11L, 9L, 26L, 30L, 18L, 28L, 16L, 26L, 7L, 30L, 7L, 28L, 5L, 18L, 9L, 26L, 24L, 27L, 16L, 16L, 14L, 26L, 29L, 5L, 22L, 24L, 26L, 18L, 27L, 9L, 18L, 11L, 14L, 18L, 22L, 29L, 26L, 22L, 26L, 20L, 24L, 14L, 7L, 16L, 24L, 26L, 29L, 24L, 24L, 24L, 20L, 20L, 24L, 11L, 20L, 29L, 16L, 18L, 24L, 24L, 7L, 24L, 18L, 11L, 11L, 24L, 24L, 7L, 11L, 18L, 24L, 24L, 16L, 29L, 7L, 30L, 24L, 22L, 24L, 18L, 26L, 9L, 9L, 24L, 29L, 9L, 24L, 30L, 11L, 24L, 16L, 26L, 26L, 26L, 30L, 26L, 16L, 26L, 24L, 29L, 20L, 24L, 14L, 9L, 7L, 29L, 29L, 15L, 6L, 15L, 2L, 6L, 6L, 3L, 2L, 17L, 30L, 27L, 23L, 2L, 15L, 8L, 13L, 21L, 28L, 23L, 25L, 1L, 25L, 19L, 27L, 23L, 15L, 19L, 19L, 23L, 2L, 27L, 27L, 15L, 2L, 2L, 3L, 23L, 2L, 23L, 6L, 2L, 15L, 13L,1L, 1L, 13L, 28L, 1L, 1L, 28L, 21L, 1L, 28L, 4L, 1L, 17L, 17L, 13L, 21L, 1L, 1L, 1L, 17L, 1L, 1L, 17L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 13L, 1L, 1L, 1L, 1L, 8L,25L, 1L, 28L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 8L, 4L, 1L, 25L, 28L, 13L, 1L, 1L, 28L, 1L, 4L, 1L, 1L, 8L, 1L, 8L, 13L, 4L, 28L, 21L, 28L, 28L, 28L, 28L, 28L, 8L, 1L, 1L, 1L, 
1L, 13L, 21L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 28L, 4L, 1L, 17L, 17L, 28L, 1L, 13L, 8L, 17L, 1L, 13L, 13L, 8L, 4L, 1L, 17L, 25L, 1L, 1L, 8L, 8L, 1L, 4L, 17L, 21L), .Label = c("<8", ">1024", "1024", "11", "11.000000000000007", "128", "128.00000000000009", "16", "16.000000000000007", "181", "181.00000000000006", "22", "23", "23.000000000000011", "256", "256.00000000000017", "32", "32.000000000000014", "362", "362.00000000000017", "45", "45.000000000000014", "512", "512.00000000000045", "64", "64.000000000000028", "724", "8", "8.0000000000000018", "90"), class = "factor"), X2 = structure(c(7L, 2L, 2L, 8L, 18L, 4L, 13L, 18L, 8L, 13L, 8L, 18L, 12L, 13L, 18L, 16L, 7L, 5L, 1L, 16L, 18L, 18L, 18L, 12L, 7L, 1L, 4L, 4L, 2L,16L, 12L, 12L, 2L, 2L, 13L, 13L, 18L, 2L, 16L, 2L, 16L, 16L, 2L, 12L, 16L, 2L, 12L,2L, 2L, 16L, 16L, 2L, 2L, 2L, 2L, 2L, 7L, 18L, 18L, 18L, 13L, 18L, 13L, 18L, 9L, 13L, 8L, 4L, 1L, 13L, 8L, 2L, 16L, 12L, 7L, 7L, 18L, 18L, 18L, 12L, 16L, 7L, 16L, 7L, 12L, 12L, 16L, 12L, 13L, 13L, 12L, 16L, 12L, 12L, 7L, 7L, 13L,16L, 7L, 18L, 16L, 13L, 18L, 4L, 12L, 7L, 4L, 18L, 18L, 18L, 9L, 17L, 13L, 7L, 12L, 7L, 18L, 12L, 18L, 13L, 9L, 1L, 18L, 1L, 13L, 13L, 13L, 1L, 1L, 13L, 12L, 4L, 1L,1L, 4L, 12L, 9L, 1L, 1L, 1L, 2L, 12L, 9L, 2L, 18L, 2L, 18L, 7L, 12L, 1L, 9L, 9L, 7L, 18L, 9L, 18L, 1L, 12L, 13L,
12L, 16L, 7L, 12L, 7L, 16L, 2L, 12L,7L, 16L, 12L, 16L, 2L, 12L, 2L, 15L, 7L, 7L, 2L, 7L, 3L, 12L, 16L, 1L, 17L, 2L, 18L, 5L, 7L, 1L, 16L, 7L, 10L, 1L, 12L, 18L, 16L, 16L, 13L, 12L, 7L, 2L, 1L, 9L, 18L, 12L, 13L, 2L, 2L, 12L, 2L, 2L, 2L, 16L, 2L, 1L, 18L, 12L, 7L, 2L, 2L, 12L, 7L, 12L, 4L, 2L, 18L, 13L, 2L, 16L, 7L, 2L, 2L, 12L, 2L, 14L, 12L, 12L, 16L, 1L, 2L, 4L, 2L, 2L, 2L, 17L, 2L, 2L, 2L, 18L, 16L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 12L, 2L, 2L, 1L, 2L, 12L, 18L, 2L, 15L, 16L, 16L, 2L, 2L, 2L, 2L, 11L, 12L, 14L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 16L, 16L, 12L, 2L, 12L, 2L, 2L, 2L, 12L, 2L,16L, 2L, 12L, 14L, 7L, 2L, 4L, 14L, 2L, 16L, 15L, 7L, 16L, 18L, 2L, 16L, 2L, 2L, 12L, 12L, 2L, 2L, 4L, 2L, 2L, 2L, 16L, 2L, 12L,18L, 3L, 16L, 2L, 2L, 13L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 16L, 16L, 2L, 2L, 4L, 4L, 11L, 13L, 4L, 4L, 8L, 4L, 13L, 1L, 4L, 1L, 1L, 2L, 2L, 11L, 18L, 8L, 8L, 4L, 7L, 8L, 4L, 8L, 4L, 4L, 8L, 8L, 1L, 4L, 8L, 4L, 13L, 1L, 6L, 1L, 17L, 2L, 2L, 8L, 18L, 8L, 8L, 4L, 7L, 8L, 17L, 8L, 4L, 1L, 4L, 13L, 1L, 2L, 4L, 16L, 13L, 4L, 4L, 17L, 4L, 7L, 4L, 4L, 1L, 1L, 4L, 1L, 17L, 8L, 1L, 8L, 1L, 4L, 1L, 8L, 8L, 8L, 1L, 13L, 16L, 16L, 17L, 8L, 13L, 1L, 4L, 7L, 1L, 1L, 4L, 4L, 8L, 6L, 4L, 1L, 12L, 13L, 8L, 4L, 4L, 18L, 2L, 4L, 8L, 13L, 17L,13L, 18L, 7L, 16L, 7L, 1L, 13L, 8L, 13L, 4L, 1L, 7L),
.Label = c("<8", ">1024", "1024", "11", "128", "16", "181", "22", "23", "256", "32", "362", "45", "512", "64", "724", "8", "90"), class = "factor")), .Names = c("X1", "X2"), row.names = c(NA, -471L), class = "data.frame")
library(dplyr)
library(stringr)
# Strip the "<" / ">" markers from every column, parse the remainder as numeric
# and round to whole numbers. as.numeric() alone would keep residues such as
# 11.000000000000007, which print as 11 but are not equal to 11.
# as.character() makes the factor-to-text conversion explicit.
mutate_all(
  your_df,
  function(x) {
    cleaned <- str_replace_all(as.character(x), pattern = "<|>", replacement = "")
    round(as.numeric(cleaned))
  }
)
#> X1 X2
#> 1 32 181
#> 2 724 1024
#> 3 128 1024
#> 4 8 22
#> 5 8 90
#> 6 8 11
#> 7 8 45
#> 8 11 90
#> 9 32 22
#> 10 8 45
#> 11 8 22
#> 12 22 90
#> 13 45 362
You can do this with base R:
# Step 1: strip the "<" / ">" markers from every column. as.character() makes
# this safe for factor columns, and assigning into my_df[] keeps the data frame
# shape even for a single row (sapply() would collapse it to a vector).
my_df[] <- lapply(my_df, function(x) gsub("<|>", "", as.character(x)))
# Step 2: parse the cleaned text as numbers and round to whole numbers, so that
# values such as "32.000000000000014" become exactly 32. Going through
# as.character() avoids returning factor codes if a column is still a factor.
my_df[] <- lapply(my_df, function(x) round(as.numeric(as.character(x))))
my_df
# X1 X2
# 1 8 23
# 2 8 90
# 3 8 8
# 4 8 362
# 5 8 45
# 6 90 362
# 7 256 724
# 8 64 181
# 9 128 362
# 10 32 181
# 11 64 724
# 12 256 1024
# 13 16 362
# 14 32.000000000000014 181
# 15 45.000000000000014 724
# 16 23.000000000000011 362
# 17 45.000000000000014 724
# 18 8 1024
# 19 90 362
# 20 1024 1024
# 21 90 64
# 22 23.000000000000011 181
# 23 32.000000000000014 181
# 24 45.000000000000014 1024
# 25 512.00000000000045 181
If you only want to round the decimals but keep the < and > signs, you can do the following (without performing the steps above):
# For each column: keep whatever remains once digits and dots are removed
# (here the "<" / ">" marker) and glue it in front of the rounded number.
sapply(my_df, function(column) {
  marker <- gsub(pattern = "\\d|\\.", replacement = "", x = column)
  number <- as.numeric(gsub(pattern = "<|>", replacement = "", x = column))
  paste0(marker, round(number))
})
# X1 X2
# [1,] "<8" "23"
# [2,] "<8" "90"
# [3,] "8" "<8"
# [4,] "8" "362"
# [5,] "<8" "45"
# [6,] "90" "362"
# [7,] "256" "724"
# [8,] "64" "181"
# [9,] "128" "362"
# [10,] "32" "181"
# [11,] "64" "724"
# [12,] "256" ">1024"
# [13,] "16" "362"
# [14,] "32" "181"
# [15,] "45" "724"
# [16,] "23" "362"
# [17,] "45" "724"
# [18,] "8" ">1024"
# [19,] "90" "362"
# [20,] "1024" ">1024"
# [21,] "90" "64"
# [22,] "23" "181"
# [23,] "32" "181"
# [24,] "45" ">1024"
# [25,] "512" "181"
How it works
sapply takes the data.frame and applies the function specified after the comma to each column of the data.frame. gsub substitutes the pattern with the replacement in x (a column of the data.frame). I used a regular expression there: \\d matches any digit (0-9), \\. matches the dot, and | combines them with OR logic.
stringr-solution
There's a shorter solution with stringr:
library(stringr)
# For each column: extract the optional "<" / ">" marker and the number, round
# the number and glue the two back together.
# The number pattern includes an optional fractional part; with "\\d+" alone
# only the integer digits were extracted, so round() had nothing to round and
# a value like "8.9" would have become 8 instead of 9.
sapply(my_df, function(x) {
  marker <- str_extract(x, "[<>]?")
  number <- as.numeric(str_extract(x, "\\d+(\\.\\d+)?"))
  str_c(marker, round(number))
})
Here the parts we want are extracted and then combined again after rounding the decimals.
Data
# Example data: a 25-row data frame with two factor columns, X1 and X2.
# Some levels carry a "<" or ">" prefix, and some X1 levels carry spurious
# trailing decimals (e.g. "32.000000000000014").
my_df <-
structure(list(X1 = structure(c(1L, 1L, 28L, 28L, 1L, 30L, 15L,
25L, 6L, 17L, 25L, 15L, 8L, 18L,
22L, 14L, 22L, 28L, 30L, 3L, 30L,
14L, 18L, 22L, 24L),
.Label = c("<8", ">1024", "1024", "11",
"11.000000000000007", "128",
"128.00000000000009", "16",
"16.000000000000007", "181",
"181.00000000000006", "22",
"23", "23.000000000000011",
"256", "256.00000000000017",
"32", "32.000000000000014",
"362", "362.00000000000017",
"45", "45.000000000000014",
"512", "512.00000000000045",
"64", "64.000000000000028",
"724", "8",
"8.0000000000000018", "90"),
class = "factor"),
X2 = structure(c(9L, 18L, 1L, 12L, 13L, 12L, 16L, 7L,
12L, 7L, 16L, 2L, 12L, 7L, 16L, 12L,
16L, 2L, 12L, 2L, 15L, 7L, 7L, 2L, 7L),
.Label = c("<8", ">1024", "1024", "11",
"128", "16", "181", "22", "23",
"256", "32", "362", "45", "512",
"64", "724", "8", "90"),
class = "factor")),
.Names = c("X1", "X2"),
row.names = c(NA, -25L),
class = "data.frame")
# X1 X2
# 1 <8 23
# 2 <8 90
# 3 8 <8
# 4 8 362
# 5 <8 45
# 6 90 362
# 7 256 724
# 8 64 181
# 9 128 362
# 10 32 181
# 11 64 724
# 12 256 >1024
# 13 16 362
# 14 32.000000000000014 181
# 15 45.000000000000014 724
# 16 23.000000000000011 362
# 17 45.000000000000014 724
# 18 8 >1024
# 19 90 362
# 20 1024 >1024
# 21 90 64
# 22 23.000000000000011 181
# 23 32.000000000000014 181
# 24 45.000000000000014 >1024
# 25 512.00000000000045 181

removing rows from dataframe in two different columns R

I have a dataframe of results. There are multiple comparisons for Cruise_Strata. I have two columns of cruise_strata (Cruise1_Strata1 and Cruise2_Strata2). The problem I found is that there are "duplicate" records in the dataframe. For example one row will have
Cruise_Strata1 Cruise_Strata2
201501.35 201502.35
and another row will have
Cruise_Strata1 Cruise_Strata2
201502.35 201501.35
The rows have the same results for the remaining columns. I would like to be able to identify rows where this happens and remove one row of each such pair from the dataset, but I do not know how to go about it. I can't use duplicated() because the rows are not exact duplicates.
Any help would be appreciated.
Here is the dataframe.
dput(result5)
structure(list(Cruise_Strata1 = structure(c(1L, 1L, 2L, 2L, 3L,
3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L,
11L, 11L, 12L, 12L, 13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L, 17L,
17L, 18L, 18L, 19L, 19L, 20L, 20L, 21L, 21L, 22L, 22L, 23L, 23L,
24L, 24L, 25L, 25L, 26L, 26L, 27L, 27L, 28L, 28L, 29L, 29L, 30L,
30L, 31L, 31L, 32L, 32L, 33L, 33L, 34L, 34L, 35L, 35L, 36L, 36L,
37L, 37L, 38L, 38L, 39L, 39L, 40L, 40L, 41L, 41L, 42L, 42L, 43L,
43L, 44L, 44L, 45L, 45L, 46L, 46L, 47L, 47L, 48L, 48L, 49L, 49L,
50L, 50L, 51L, 51L, 52L, 52L, 53L, 53L, 54L, 54L, 55L, 55L, 56L,
56L, 57L, 57L, 58L, 58L, 59L, 59L, 60L, 60L, 61L, 61L, 62L, 62L,
63L, 63L, 64L, 64L, 65L, 65L, 66L, 66L), .Label = c("201501.10",
"201501.11", "201501.13", "201501.14", "201501.15", "201501.17",
"201501.18", "201501.19", "201501.21", "201501.22", "201501.23",
"201501.24", "201501.25", "201501.26", "201501.27", "201501.29",
"201501.30", "201501.31", "201501.33", "201501.34", "201501.35",
"201501.9", "201502.10", "201502.11", "201502.13", "201502.14",
"201502.15", "201502.17", "201502.18", "201502.19", "201502.21",
"201502.22", "201502.23", "201502.24", "201502.25", "201502.26",
"201502.27", "201502.29", "201502.30", "201502.31", "201502.33",
"201502.34", "201502.35", "201502.9", "201503.10", "201503.11",
"201503.13", "201503.14", "201503.15", "201503.17", "201503.18",
"201503.19", "201503.21", "201503.22", "201503.23", "201503.24",
"201503.25", "201503.26", "201503.27", "201503.29", "201503.30",
"201503.31", "201503.33", "201503.34", "201503.35", "201503.9"
), class = "factor"), Cruise_Strata2 = structure(c(23L, 45L,
24L, 46L, 25L, 47L, 26L, 48L, 27L, 49L, 28L, 50L, 29L, 51L, 30L,
52L, 31L, 53L, 32L, 54L, 33L, 55L, 34L, 56L, 35L, 57L, 36L, 58L,
37L, 59L, 38L, 60L, 39L, 61L, 40L, 62L, 41L, 63L, 42L, 64L, 43L,
65L, 44L, 66L, 1L, 45L, 2L, 46L, 3L, 47L, 4L, 48L, 5L, 49L, 6L,
50L, 7L, 51L, 8L, 52L, 9L, 53L, 10L, 54L, 11L, 55L, 12L, 56L,
13L, 57L, 14L, 58L, 15L, 59L, 16L, 60L, 17L, 61L, 18L, 62L, 19L,
63L, 20L, 64L, 21L, 65L, 22L, 66L, 1L, 23L, 2L, 24L, 3L, 25L,
4L, 26L, 5L, 27L, 6L, 28L, 7L, 29L, 8L, 30L, 9L, 31L, 10L, 32L,
11L, 33L, 12L, 34L, 13L, 35L, 14L, 36L, 15L, 37L, 16L, 38L, 17L,
39L, 18L, 40L, 19L, 41L, 20L, 42L, 21L, 43L, 22L, 44L), .Label = c("201501.10",
"201501.11", "201501.13", "201501.14", "201501.15", "201501.17",
"201501.18", "201501.19", "201501.21", "201501.22", "201501.23",
"201501.24", "201501.25", "201501.26", "201501.27", "201501.29",
"201501.30", "201501.31", "201501.33", "201501.34", "201501.35",
"201501.9", "201502.10", "201502.11", "201502.13", "201502.14",
"201502.15", "201502.17", "201502.18", "201502.19", "201502.21",
"201502.22", "201502.23", "201502.24", "201502.25", "201502.26",
"201502.27", "201502.29", "201502.30", "201502.31", "201502.33",
"201502.34", "201502.35", "201502.9", "201503.10", "201503.11",
"201503.13", "201503.14", "201503.15", "201503.17", "201503.18",
"201503.19", "201503.21", "201503.22", "201503.23", "201503.24",
"201503.25", "201503.26", "201503.27", "201503.29", "201503.30",
"201503.31", "201503.33", "201503.34", "201503.35", "201503.9"
), class = "factor"), P_value = c(0.63, 0.6793, 0.0319, 0.0289,
0.9516, 0.8128, 0.9967, 0.3071, 0.9641, 0.0246, 0.7967, 0.2551,
0.2329, 0.3725, 0.0269, 0.3796, 0.0245, 0.5562, 0.9952, 0.5176,
0.5596, 0.9966, 0.32, 0.6402, 0.7691, 0.9671, 0.9396, 0.9, 0.9024,
0.3624, 0.0433, 0.3402, 0.5302, 0.787, 0.0295, 0.3638, 0.006,
0.701, 0.6323, 0.0366, 2e-04, 0.0011, 0.8849, 0.3, 0.63, 0.9738,
0.0319, 0.5197, 0.9516, 0.7369, 0.9967, 0.2276, 0.9641, 0.0158,
0.7967, 0.6332, 0.2329, 0.0322, 0.0269, 0.3013, 0.0245, 0.0129,
0.9952, 0.795, 0.5596, 0.7277, 0.32, 0.747, 0.7691, 0.3817, 0.9396,
0.7961, 0.9024, 0.4164, 0.0433, 0.0028, 0.5302, 0.2864, 0.0295,
0.7036, 0.006, 0, 0.6323, 0.002, 2e-04, 0.9548, 0.8849, 0.0546,
0.6793, 0.9738, 0.0289, 0.5197, 0.8128, 0.7369, 0.3071, 0.2276,
0.0246, 0.0158, 0.2551, 0.6332, 0.3725, 0.0322, 0.3796, 0.3013,
0.5562, 0.0129, 0.5176, 0.795, 0.9966, 0.7277, 0.6402, 0.747,
0.9671, 0.3817, 0.9, 0.7961, 0.3624, 0.4164, 0.3402, 0.0028,
0.787, 0.2864, 0.3638, 0.7036, 0.701, 0, 0.0366, 0.002, 0.0011,
0.9548, 0.3, 0.0546), Cruise1 = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("201501",
"201502", "201503"), class = "factor"), Cruise1_Strata1 = structure(c(1L,
1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L,
9L, 10L, 10L, 11L, 11L, 12L, 12L, 13L, 13L, 14L, 14L, 15L, 15L,
16L, 16L, 17L, 17L, 18L, 18L, 19L, 19L, 20L, 20L, 21L, 21L, 22L,
22L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L,
8L, 8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L, 12L, 13L, 13L, 14L,
14L, 15L, 15L, 16L, 16L, 17L, 17L, 18L, 18L, 19L, 19L, 20L, 20L,
21L, 21L, 22L, 22L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L,
6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L, 12L, 13L,
13L, 14L, 14L, 15L, 15L, 16L, 16L, 17L, 17L, 18L, 18L, 19L, 19L,
20L, 20L, 21L, 21L, 22L, 22L), .Label = c("10", "11", "13", "14",
"15", "17", "18", "19", "21", "22", "23", "24", "25", "26", "27",
"29", "30", "31", "33", "34", "35", "9"), class = "factor"),
Cruise2 = structure(c(2L, 3L, 2L, 3L, 2L, 3L, 2L, 3L, 2L,
3L, 2L, 3L, 2L, 3L, 2L, 3L, 2L, 3L, 2L, 3L, 2L, 3L, 2L, 3L,
2L, 3L, 2L, 3L, 2L, 3L, 2L, 3L, 2L, 3L, 2L, 3L, 2L, 3L, 2L,
3L, 2L, 3L, 2L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L,
1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L,
3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L,
1L, 3L, 1L, 3L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L), .Label = c("201501", "201502", "201503"), class = "factor"),
Cruise2_Strata2 = structure(c(1L, 1L, 2L, 2L, 3L, 3L, 4L,
4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L,
11L, 12L, 12L, 13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L, 17L,
17L, 18L, 18L, 19L, 19L, 20L, 20L, 21L, 21L, 22L, 22L, 1L,
1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L,
9L, 9L, 10L, 10L, 11L, 11L, 12L, 12L, 13L, 13L, 14L, 14L,
15L, 15L, 16L, 16L, 17L, 17L, 18L, 18L, 19L, 19L, 20L, 20L,
21L, 21L, 22L, 22L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L,
6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L,
12L, 13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L, 17L, 17L, 18L,
18L, 19L, 19L, 20L, 20L, 21L, 21L, 22L, 22L), .Label = c("10",
"11", "13", "14", "15", "17", "18", "19", "21", "22", "23",
"24", "25", "26", "27", "29", "30", "31", "33", "34", "35",
"9"), class = "factor"), adjuste_p = c(1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.792, 1, 1, 1, 0.0264,
0.1452, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.3696, 1,
1, 1, 1, 0.792, 0, 1, 0.264, 0.0264, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 0.3696, 1, 1, 1, 1, 1, 0, 1, 0.264,
0.1452, 1, 1, 1)), .Names = c("Cruise_Strata1", "Cruise_Strata2",
"P_value", "Cruise1", "Cruise1_Strata1", "Cruise2", "Cruise2_Strata2",
"adjuste_p"), row.names = c(1453L, 2905L, 1520L, 2972L, 1587L,
3039L, 1654L, 3106L, 1721L, 3173L, 1788L, 3240L, 1855L, 3307L,
1922L, 3374L, 1989L, 3441L, 2056L, 3508L, 2123L, 3575L, 2190L,
3642L, 2257L, 3709L, 2324L, 3776L, 2391L, 3843L, 2458L, 3910L,
2525L, 3977L, 2592L, 4044L, 2659L, 4111L, 2726L, 4178L, 2793L,
4245L, 2860L, 4312L, 23L, 2927L, 90L, 2994L, 157L, 3061L, 224L,
3128L, 291L, 3195L, 358L, 3262L, 425L, 3329L, 492L, 3396L, 559L,
3463L, 626L, 3530L, 693L, 3597L, 760L, 3664L, 827L, 3731L, 894L,
3798L, 961L, 3865L, 1028L, 3932L, 1095L, 3999L, 1162L, 4066L,
1229L, 4133L, 1296L, 4200L, 1363L, 4267L, 1430L, 4334L, 45L,
1497L, 112L, 1564L, 179L, 1631L, 246L, 1698L, 313L, 1765L, 380L,
1832L, 447L, 1899L, 514L, 1966L, 581L, 2033L, 648L, 2100L, 715L,
2167L, 782L, 2234L, 849L, 2301L, 916L, 2368L, 983L, 2435L, 1050L,
2502L, 1117L, 2569L, 1184L, 2636L, 1251L, 2703L, 1318L, 2770L,
1385L, 2837L, 1452L, 2904L), class = "data.frame")
R Info
R version 3.2.1 (2015-06-18)
Platform: i386-w64-mingw32/i386 (32-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1
Does this give you your desired result?
# Build an order-independent key for every row so that the pairs (a, b) and
# (b, a) produce the same string; duplicated() then returns TRUE for the
# second and any later occurrence of each unordered pair.
# Both columns are taken from result5, and as.character() makes the
# comparison use the strata labels instead of the factor codes, so the result
# does not rely on the two factors sharing an identical level set.
with(result5, {
  strata_a <- as.character(Cruise_Strata1)
  strata_b <- as.character(Cruise_Strata2)
  # pmin()/pmax() put the smaller label first and the larger one last,
  # element-wise, without a per-row apply().
  duplicated(paste(pmin(strata_a, strata_b), pmax(strata_a, strata_b)))
})
You can use the resulting logical vector to subset your data.
First you create a vector pasting the values in Cruise_Strata1 and Cruise_Strata2. Doing this you move the smaller of the two to the front and the larger one to the end (or you could do it vice versa). This is just a trick so that you can apply the duplicated function and recognize the duplicates.
Note: this approach will remove duplicates of the form:
Cruise_Strata1 Cruise_Strata2
x y
y x
As well as (if this is not desired let me know):
Cruise_Strata1 Cruise_Strata2
x y
x y
For a generic data frame df with duplicated values in Cruise_Strata1 and Cruise_Strata2:
# Drop the earlier row of every mirrored pair: row i is flagged (dupe = 1)
# when some later row j holds the same two strata in swapped order, i.e.
# Cruise_Strata1[j] == Cruise_Strata2[i] AND Cruise_Strata2[j] == Cruise_Strata1[i].
# The final expression returns df without the flagged rows (the helper column
# `dupe` is kept, as before).
n_rows <- nrow(df)
# Compare labels, not factor objects: `==` on two factors fails when their
# level sets differ.
strata1 <- as.character(df$Cruise_Strata1)
strata2 <- as.character(df$Cruise_Strata2)
df$dupe <- rep(0, n_rows)
# seq_len() yields an empty sequence for frames with fewer than two rows,
# whereas 1:(n - 1) would count backwards (1, 0) and index out of range.
for (i in seq_len(max(n_rows - 1L, 0L))) {
  later <- (i + 1L):n_rows
  # Checking only one column would also pair rows that merely share a single
  # stratum; a true mirror needs both columns to match crosswise.
  mirror <- later[which(strata1[later] == strata2[i] &
                          strata2[later] == strata1[i])]
  if (length(mirror) > 0) {
    # Show the matched pair for inspection, then flag the earlier row.
    print(df[c(i, mirror[1]), ])
    df$dupe[i] <- 1
  }
}
df[df$dupe != 1, ]

Use dplyr to find genotype frequency across SNPs

To find genotype frequency across SNPs I need to find the proportion of a certain genotype (XX, YX, or YY) in the total number of samples (XX, YX, and YY). I think I would need to start my dplyr statement with
dat %>% group_by(Assay) %>%
but I don't know how to finish it. The data, dat, is shown below, and its dput output is at the bottom.
Source: local data frame [143 x 3]
Groups: Assay
Assay Final n
1 One_apoe-83 Invalid 2
2 One_apoe-83 No Call 9
3 One_apoe-83 NTC 2
4 One_apoe-83 XX 4
5 One_apoe-83 YX 41
6 One_apoe-83 YY 134
7 One_CD9-269 Invalid 2
8 One_CD9-269 No Call 5
9 One_CD9-269 NTC 2
10 One_CD9-269 XX 99
.. ... ... ...
I could use a for loop across SNPs to get what I'm looking for with boolean patterning for each genotype but that would be very verbose.
for(i in seq(levels(dat$Assay))) {
storage_df[i,1] <- dat[dat$Assay == levels(dat$Assay)[i],]$XX / (dat[dat$Assay == levels(dat$Assay)[i],]$XX + dat[dat$Assay == levels(dat$Assay)[i],]$YX + dat[dat$Assay == levels(dat$Assay)[i],]$XY) ...
You get the point. How would I do this in dplyr? The whole object is below.
dat <- structure(list(Assay = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L,
5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L,
7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 10L,
10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 12L, 12L,
12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L,
14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L,
16L, 16L, 17L, 17L, 17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L,
18L, 19L, 19L, 19L, 19L, 19L, 19L, 20L, 20L, 20L, 20L, 20L, 20L,
21L, 21L, 21L, 21L, 21L, 21L, 22L, 22L, 22L, 22L, 22L, 22L, 23L,
23L, 23L, 23L, 23L, 23L, 24L, 24L, 24L, 24L, 24L, 24L), .Label = c("One_apoe-83",
"One_CD9-269", "One_Cytb_26", "One_E2", "One_ghsR-66", "One_IL8r-362",
"One_KPNA-422", "One_lpp1-44", "One_MHC2_190", "One_MHC2_251",
"One_Prl2", "One_redd1-414", "One_STC-410", "One_STR07", "One_sys1-230",
"One_U1004-183", "One_U1105", "One_U1201-492", "One_U1203-175",
"One_U1209-111", "One_U1212-106", "One_U401-224", "One_vamp5-255",
"One_ZNF-61"), class = "factor"), Final = structure(c(1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 6L, 1L,
2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L,
6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L,
4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L,
2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L,
6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L,
4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L,
2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L,
6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("Invalid",
"No Call", "NTC", "XX", "YX", "YY"), class = "factor"), n = c(2L,
9L, 2L, 4L, 41L, 134L, 2L, 5L, 2L, 99L, 75L, 9L, 2L, 7L, 2L,
110L, 71L, 2L, 8L, 2L, 110L, 59L, 11L, 2L, 6L, 2L, 67L, 86L,
29L, 2L, 3L, 2L, 152L, 28L, 5L, 2L, 4L, 2L, 78L, 81L, 25L, 2L,
4L, 2L, 115L, 62L, 7L, 2L, 17L, 2L, 80L, 62L, 29L, 2L, 13L, 2L,
59L, 68L, 48L, 2L, 7L, 2L, 48L, 86L, 47L, 2L, 7L, 2L, 42L, 87L,
52L, 2L, 3L, 2L, 47L, 81L, 57L, 2L, 9L, 2L, 40L, 85L, 54L, 2L,
8L, 2L, 52L, 86L, 42L, 2L, 7L, 2L, 9L, 39L, 133L, 2L, 8L, 2L,
101L, 71L, 8L, 2L, 13L, 2L, 20L, 82L, 73L, 2L, 11L, 2L, 27L,
75L, 75L, 2L, 6L, 2L, 3L, 40L, 139L, 2L, 13L, 2L, 59L, 82L, 34L,
2L, 19L, 2L, 20L, 84L, 65L, 2L, 11L, 2L, 119L, 47L, 11L, 2L,
8L, 2L, 51L, 100L, 29L)), class = "data.frame", .Names = c("Assay",
"Final", "n"), row.names = c(NA, -143L))
I hope I am not misunderstanding. Is the following what you are looking for?
Assume the data structure is:
df <- structure(list(Assay = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L), .Label = c("One_apoe-83", "One_CD9-269"), class = "factor"),
Final = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L
), .Label = c("Invalid", "No Call", "NTC", "XX", "YX", "YY"
), class = "factor"), n = c(2L, 9L, 2L, 4L, 41L, 134L, 2L,
5L, 2L, 99L)), .Names = c("Assay", "Final", "n"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"))
Code
# Within each assay, express every call count `n` as a percentage of that
# assay's total count. The result stays grouped by Assay.
df %>%
  group_by(Assay) %>%
  mutate(
    n_percent = n / sum(n) * 100
  )
# Assay Final n n_percent
# 1 One_apoe-83 Invalid 2 1.041667
# 2 One_apoe-83 No Call 9 4.687500
# 3 One_apoe-83 NTC 2 1.041667
# 4 One_apoe-83 XX 4 2.083333
# 5 One_apoe-83 YX 41 21.354167
# 6 One_apoe-83 YY 134 69.791667
# 7 One_CD9-269 Invalid 2 1.851852
# 8 One_CD9-269 No Call 5 4.629630
# 9 One_CD9-269 NTC 2 1.851852
# 10 One_CD9-269 XX 99 91.666667
Option 2
Here is the code based on the comment. A line is added to filter out the elements you don't want.
# Same per-assay percentage as above, but computed after discarding the
# non-genotype calls, so the denominator only counts the remaining rows
# (XX / YX / YY in the example data). The result stays grouped by Assay.
df %>%
  filter(!(Final %in% c("Invalid", "No Call", "NTC"))) %>%
  group_by(Assay) %>%
  mutate(
    n_percent = n / sum(n) * 100
  )
# Source: local data frame [4 x 4]
# Groups: Assay
#
# Assay Final n n_percent
# 1 One_apoe-83 XX 4 2.234637
# 2 One_apoe-83 YX 41 22.905028
# 3 One_apoe-83 YY 134 74.860335
# 4 One_CD9-269 XX 99 100.000000

Resources