Extract only significant rows from TukeyHSD output - r
After generating a very large TukeyHSD table, I want to see only the rows whose value in the adj.p.value column is below 0.05. I have tried if and ifelse, but they only produce a TRUE/FALSE table. I want to see the whole data row for each significant comparison. Thanks!
Data
structure(list(Species = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L), .Label = c("A", "B", "C", "D"), class = "factor"),
TREATMENT = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 4L,
4L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 1L, 1L, 1L, 2L, 2L, 3L, 3L,
3L, 4L, 4L), .Label = c("1", "2", "3", "4"), class = "factor"),
`CLASS 1` = c(9.3993303987076, 8.14588087483834, 8.44889021858093,
28.2773809415175, 9.49156649568952, 37.406663111623, 8.42458221212166,
35.529904738035, 33.1401135085753, 8.26977630375797, 7.87786527094827,
7.83020300515061, 35.1465417538538, 10.5560853720815, 7.54702433773332,
7.15030081390987, 7.73624654623485, 33.6461639540039, 10.3098164094602,
7.79017325570062, 8.47473750173462, 8.37179798600773, 31.7364310355766,
2.00147496567679, 9.30194886619568, 7.87886829173726, 7.93445694220837,
9.10020522660375, 8.81542855137005, 7.83313314713951, 7.84449591023115,
23.6150030864875, 9.3452854347794, 8.91047098149179, 7.76031738257155,
9.79467065201063, 24.7592334362831, 8.54842834366722, 7.60436112798701,
8.93480758329653, 8.72406315335014, 9.49850179222777), `CLASS 2` = c(10.8069912074565,
4.52426389123869, 8.13120921128287, 52.3870196313339, 7.17369219206721,
45.7325224336886, 17.8345921677786, 38.4579761235057, 37.5916934855387,
6.28803058195647, 3.72788988807285, 4.64744990904241, 29.7689968962103,
7.08515103144071, 6.44277647222835, 5.71017728280462, 6.28290843412007,
45.6123170472575, 6.98431855663527, 8.03809625184267, 4.76656440828616,
6.74640254081232, 31.3243238213156, 45.1287867136276, 7.6308508343969,
4.0127554151831, 9.11910102221636, 6.1658394708941, 10.4617259648364,
6.07502685224869, 8.08373642262043, 48.588633863193, 10.2160085507338,
7.52606530219909, 5.66373884014351, 8.51992766801391, 25.9109062123364,
5.74498954209992, 5.56377323143979, 7.76698847227212, 7.05016373786876,
7.99745310894107), `CLASS 3` = c(3.96856956332584, 2.76052305637364,
6.92715392916015, 0.687821057043984, 2.30154255462355, 2.61089063893911,
11.2199145273738, 10.7058533354417, 1.90691767773411, 3.93488282297868,
1.7034110083142, 1.69310511636903, 1.54005861925764, 50.436990190291,
3.93233520754151, 4.06684782901502, 6.10592204678281, 0.675086986967025,
3.94018776658881, 5.74129993338595, 2.02845185559621, 4.10963382465756,
14.9264019576272, 12.9672579626868, 5.1049208042632, 1.37282635713804,
3.00088572108344, 4.78878116348504, 4.79564218319094, 3.03836532949481,
3.48474205480686, 1.09218910757234, 6.2830307568812, 3.06784943090836,
4.89376208853059, 6.6321148581705, 1.01356027363186, 3.15439940439419,
40.8141653079423, 2.52825000616702, 1.65382018138259, 1.81173455682492
), `CLASS 4` = c(0, 13.4274810838142, 10.9876140536356, 3.15424686759082,
15.2632739415738, 0, 0, 2.39525969535064, 4.19386122886851,
13.15599261724, 14.5421891905919, 14.4542067660843, 0, 0,
12.2276086827261, 12.7527880016103, 12.1436697242409, 3.79216208516423,
0, 12.2283190622827, 16.0271803699645, 14.035876401479, 4.24556176551009,
0, 0, 14.4993393432366, 13.6722412691012, 0, 14.0027443968931,
13.7579074961889, 12.9935353616471, 4.66128854387559, 0,
15.1941922851023, 11.6990009190362, 0, 7.99399142573694,
14.5041748372822, 0, 15.0674109079436, 13.7134908002476,
15.4194201146961), `CLASS 5` = c(7.82638584740367, 6.56112678542475,
6.95253086439919, 2.06445951884762, 7.17086660532553, 2.58627258328855,
7.83400556063298, 1.77053879587063, 2.65292759651742, 6.94701807830366,
6.85309102458439, 6.71505104532983, 4.06818278652755, 5.79906266122279,
6.62064468061089, 6.88365856613044, 7.68403751285005, 2.38479005191691,
9.07405520739349, 7.65785587918449, 7.4385885335047, 7.30144390122309,
2.41680929257195, 4.18258704279641, 7.8906816661241, 6.75678558060943,
6.67150537517493, 8.24794113296791, 7.67443442992891, 6.89357008866252,
6.45444668132533, 2.98342694785768, 8.704729108357, 7.14382850099481,
7.15233553294014, 9.14001781571836, 3.98831954045444, 7.54093786042356,
5.79029360470226, 6.82793163574773, 6.48049736162586, 7.18554914992982
), `CLASS 6` = c(20.6189597026452, 22.8728557858066, 23.0767150659042,
4.99832103176657, 24.1726463550235, 5.56104550736533, 31.8124013284184,
4.31653191057476, 8.4695331411828, 20.63468068931, 19.7369752322083,
19.6902616040991, 11.6648564225744, 0, 25.2321582223958,
22.2981543181678, 29.3198455372777, 5.88723409877159, 30.1474816315191,
28.0835788057802, 24.0430626320328, 25.1446564854412, 5.78713327050339,
30.6155806819949, 23.8853696442419, 20.1783872969561, 17.5935515655693,
23.4169038776536, 21.1986239116884, 19.6931330316831, 22.2658181144794,
7.38944654414811, 32.1897387187698, 23.6398829158785, 25.3561697324352,
30.4118856020653, 12.6822088903071, 26.300118251779, 11.7338836812169,
23.8624555097246, 20.1037712460599, 21.8478004507985), `CLASS 7` = c(15.9129851563051,
15.2250454288061, 13.5577123002506, 2.9902563940573, 15.4408266617369,
2.67511425705514, 8.17164465017573, 2.23047357314211, 4.01010767344732,
13.4046459481448, 15.3008244637288, 15.3885729336047, 7.81496654756214,
17.8194559247092, 12.7823202355514, 13.7684066964868, 15.378473991847,
3.75026919344972, 18.2880822635935, 14.7412162942703, 17.5270089738067,
16.799718650752, 4.33839497916674, 2.21937177530762, 15.0315149187176,
15.3632530721031, 14.1580725482114, 16.4215442147509, 15.5113323256627,
14.3349000132624, 12.8504657216928, 5.06281347160092, 15.5075336560533,
15.9392345541138, 13.3981839319596, 16.6700105346756, 8.10398633871805,
15.958090408468, 16.5733149488757, 15.1802203155931, 14.2236219296677,
16.2095182295187), `CLASS 8` = c(19.9174685533413, 16.6755018156139,
13.9892072522183, 3.35339208579287, 18.98558519396, 3.42749146804023,
10.4801793890691, 2.97802997775506, 5.11270635117451, 17.0372757040089,
18.7865491767228, 18.3992789502607, 9.99639697401416, 0,
15.9270550696003, 17.1615519869107, 15.3488962066467, 4.25197658246908,
21.2560581648095, 15.7194605175531, 19.6944057250743, 17.4904702096271,
5.22494387772846, 2.88494085790995, 19.1038328534942, 19.0183655117756,
17.533290326259, 19.92632149392, 17.5400682364295, 17.664926273487,
16.3075864395099, 6.6071984352649, 17.7536737744256, 18.5784760293114,
14.706720581834, 18.8313728693457, 9.73353207739478, 18.2488613518859,
8.53356517614357, 19.8319355692553, 17.4801581342745, 20.0300225970631
), `CLASS 9` = c(11.5493095708147, 9.80732127808386, 7.92896710456816,
2.08710247204941, 0, 0, 4.22268016442976, 1.61543185032431,
2.92213933696131, 10.3276972542995, 11.4712047448286, 11.1818706700593,
0, 8.30325482025479, 9.28807709161222, 10.2081145049644,
0, 0, 0, 0, 0, 0, 0, 0, 12.0508804125665, 10.9194191312608,
10.316895230176, 11.9324634197247, 0, 10.709037767833, 9.7151732936871,
0, 0, 0, 9.36977099054923, 0, 5.81426180513736, 0, 3.38664292169246,
0, 10.5704134555229, 0), `CLASS 10` = c(44.7938508721352,
51.7310046920715, 57.5715824785637, 89.5047895292528, 58.4027215389776,
91.3111216916161, 69.2914902356924, 91.4055258029079, 85.3021190418994,
52.2833630152431, 47.5883305901355, 48.3152264007455, 78.1204536918961,
68.0782265938132, 55.3819029226251, 51.9782682455077, 61.5885922886562,
89.6129641721643, 51.3818043642034, 61.8814673089921, 55.3399967676143,
58.4083672383978, 88.0198518505328, 90.713100323986, 45.9230901490977,
47.942176704251, 51.3202365201787, 43.4717297386365, 59.2741650079789,
50.3975658567551, 54.6723278637849, 85.3465611452765, 58.0340634611641,
58.33846091558, 55.372988962717, 55.3585987802603, 72.3599002382954,
58.2521103792226, 65.716183348586, 58.1599124794039, 51.2453091189091,
56.5749100234884), `CLASS 11` = c(55.2061491278648, 48.2689953079285,
42.4284175214362, 10.4952104707472, 41.5972784610224, 8.68887830838393,
30.7085097643076, 8.59447419709211, 14.6978809581006, 47.7166369847569,
52.4116694098645, 51.6847735992545, 21.8795463081039, 31.9217734061868,
44.6180970773749, 48.0217317544923, 38.4114077113438, 10.3870358278357,
48.6181956357966, 38.1185326910079, 44.6600032323857, 41.5916327616022,
11.9801481494672, 9.28689967601398, 54.0769098509023, 52.0578232957489,
48.6797634798213, 56.5282702613635, 40.7258349920211, 49.6024341432449,
45.3276721362151, 14.6534388547235, 41.9659365388359, 41.66153908442,
44.627011037283, 44.6414012197397, 27.6400997617046, 41.7478896207774,
34.283816651414, 41.8400875205961, 48.7546908810909, 43.4250899765116
), `CLASS 12` = c(0.811392418775427, 1.07172325344784, 1.35691090645737,
8.52815575054215, 1.40400342762093, 10.5089654211764, 2.25642633809048,
10.6353831202186, 5.80370185913679, 1.09570511081795, 0.907972043744494,
0.934805805194479, 3.57047868323309, 2.13265803649301, 1.24124305047309,
1.08239054166649, 1.60339326148851, 8.62738568129464, 1.05684309531167,
1.62339583767845, 1.23914000811097, 1.40432975000493, 7.34714218491929,
9.76785617252635, 0.849218090969217, 0.920940862853288, 1.05424169822542,
0.769026356858985, 1.45544382379371, 1.01603009463636, 1.20615785649631,
5.82433666195463, 1.38288498357373, 1.40029538508808, 1.24079537651438,
1.24007305478085, 2.61793194894868, 1.3953306600253, 1.91682810629766,
1.39005236188319, 1.05108468934595, 1.30281618424025)), row.names = c(NA,
-42L), .Names = c("Species", "TREATMENT", "CLASS 1", "CLASS 2",
"CLASS 3", "CLASS 4", "CLASS 5", "CLASS 6", "CLASS 7", "CLASS 8",
"CLASS 9", "CLASS 10", "CLASS 11", "CLASS 12"), class = c("tbl_df",
"tbl", "data.frame"))
# Load the packages used below. Package names are case-sensitive in R:
# the package providing melt() is "reshape2", not "Reshape2".
library(reshape2)
library(dplyr)
library(broom)

# NOTE(review): `example` is assumed to hold the data frame shown above,
# i.e. the structure(...) output assigned as `example <- structure(...)`.
# Reshape to long format: one row per TREATMENT / Species / Class value.
melt <- melt(
  example,
  id.vars = c("TREATMENT", "Species"),
  value.name = "Percentage",
  variable.name = "Class"
)

# For every Species x Class group, fit a one-way ANOVA of Percentage on
# TREATMENT and run Tukey's HSD on the fit; the results are stored in the
# list-column `Model`.
res <- melt %>%
  group_by(Species, Class) %>%
  do(Model = TukeyHSD(aov(Percentage ~ TREATMENT, data = .)))

# Flatten all per-group TukeyHSD results into one data frame and print it.
as.data.frame(tidy(res, Model))
You can select the rows of interest using, e.g., filter from dplyr. In filter you give a logical statement that indicates which rows you want. In your case, you want all rows where adj.p.value is less than .05.
# Tidy the per-group TukeyHSD results, convert to a plain data frame, and
# keep only the comparisons whose adjusted p-value is below 0.05.
tidy(res, Model) %>%
  as.data.frame() %>%
  filter(adj.p.value < 0.05)
Species Class comparison estimate conf.low conf.high adj.p.value
1 A CLASS 3 3-1 6.410802 1.668508 11.153096 0.013375263
2 A CLASS 3 3-2 9.096133 4.353839 13.838426 0.002303284
3 A CLASS 3 4-3 -8.041984 -13.236906 -2.847061 0.006931852
4 A CLASS 9 2-1 -9.066165 -16.389642 -1.742688 0.020094567
Related
In grouped dataframe, summarize rows which contain certain value (e.g. zeros only) in set of columns with common substring in header [R]
Given the following dataframe: dframe <- structure(list(id = c("294361-7349174-75411122", "294365-7645230-95464222", "291915-7345264-75464222", "291365-7345074-75164202", "594165-7345274-78444212", "234385-7335274-75464229", "734515-1345274-95464892", "201365-8345274-78464232", "294365-7315971-75464120", "591365-7345374-75464222", "394365-7345204-75411022", "494305-7345273-75464222", "291161-7345271-75461210", "294035-7345201-75464292", "298365-7345279-78864223", "294365-7345274-15964293", "294395-7345274-69464299", "899965-1345294-95464222", "194365-7145274-75464222", "194361-7349231-75464222", "294365-7345274-75464122", "191315-1345274-13464322", "794365-7349274-75464292", "214365-8318274-75464222", "394363-8341274-39494929"), gene = structure(c(3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("ABC_1", "C_1", "XYZ_123" ), class = "factor"), group = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("KO", "WT"), class = "factor"), class_A = c(0, 1, 0, 2, 1, 0, 0, 1, 0, 1, 0, 0, 0, 2, 2, 1, 0, 0, 0, 0, 1, 1, 1, 0, 3), class_B = c(0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1)), row.names = c(NA, -25L), class = "data.frame") I would like to produce a new dataframe for data grouped per "group" and "gene". I want to output sum of rows per group in which both class_A and class_B columns contain the same, desired value - now I am interested in zeros. Based on the answers provided in this thread: Efficient way to create a dataframe with multiple summary columns based on a grouped dataframe using dplyr in R I can achieve this with following code: desired_dframe <- dframe %>% group_by(group, gene) %>% summarise(counts_zero = sum(ifelse((class_A == 0 & class_B == 0), 1, 0))) However, the above approach has one pitfall: the column names are hardcoded. 
In real life, I have dataframes with varying numbers of columns denoting classes (and with other names, e.g. "class_C", "class_Z" etc.). The common part of their names is "class_". Based on this, I would like to consider all of the columns of interest. I was playing around with rowSums(dplyr::across(dplyr::starts_with('class_')==0)) to achieve this, but to no avail. The function throws an error and I have no idea how to debug it. Also, I was trying to incorporate this column into @akrun's answer provided here: Efficient way to create a dataframe with multiple summary columns based on a grouped dataframe using dplyr in R At @akrun's request, I am putting this into a new thread.
If the goal is to count the rows in which all class_ columns are zero, use across or if_all (more correct), i.e. loop over the class_ columns in if_all and apply the condition .x == 0, which returns TRUE for a row only if all of the looped columns are 0 in that row, and FALSE otherwise. Then take the sum directly on the logical vector (TRUE -> 1 and FALSE -> 0): library(dplyr) dframe %>% group_by(group, gene) %>% summarise(counts_zero = sum(if_all(starts_with('class_'), ~ .x == 0)), .groups = 'drop') -output # A tibble: 5 × 3 group gene counts_zero <fct> <fct> <int> 1 KO ABC_1 2 2 KO XYZ_123 4 3 WT ABC_1 2 4 WT C_1 0 5 WT XYZ_123 1
Issues with pivot_wider and unique identifiers because of duplicate values
I'm trying to use pivot_wider move my dataset from long to wide so I can use it in a different programme. I have seen the other posts on this topic but the solutions don't address my problem. I have measurement variable called "rating" which has a value for each "rock" and each test ("gentest", first and second). I have an id variable called "turkcode". For each individual in the dataset, there are 18 ratings. The problem is that there are 4 ratings for rock #8 and I think this is why the data won't pivot wider the way I want them to. Here's a subset of the data structure(list(turkcode = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L), .Label = c("100879", "104655", "108505", "110324", "110600", "112445", "114083", "115814", "116573", "117411", "117817", "118651", "119324", "121548", "121883", "121918", "123275", "123718", "125491", "127450", "127825", "128062", "129061", "131404", "135358", "135594", "135671", "135945", "137951", "138675", "139469", "140924", "145730", "147222", "148533", "150851", "153455", "158882", "164468", "166907", "169260", "171463", "172398", "175565", "177108", "179000", "180270", "183953", "185574", "185880", "185948", "186371", "187787", "189220", "190014", "192550", "193904", "195308", "196755", "197493", "198368", "200155", "200297", "201915", "214519", "215994", "217903", "218771", "219302", "220434", "222740", "223223", "224721", "225118", "225223", "229856", "229874", "231301", "232576", "233842", "234215", "237581", "239567", "240609", "241098", "241423", "242108", "244633", "246055", "251597", "252929", "255252", "256652", "259936", "274962", "277053", "279422", "280317", "282602", "283750", "285737", "286259", "287544", "288507", "290503", "291401", "291835", "292160", "294117", "297863", "298061", "299347", "299499", "301399", "304875", "305231", "306312", "307410", "308979", "311157", "311524", "311630", "318956", "318988", "319995", "321405", "324288", "327086", "327559", "328345", 
"328401", "330318", "330909", "332723", "334115", "334517", "335811", "335831", "337145", "338323", "338542", "338575", "340083", "341182", "343612", "343947", "344554", "346476", "349874", "350117", "350433", "350972", "351187", "355311", "356717", "359366", "360048", "360058", "361191", "361971", "362827", "363543", "367244", "374254", "374965", "376278", "377622", "382139", "382916", "384586", "385229", "386782", "388951", "389029", "390299", "390662", "396335", "396732", "398076", "398573", "399276", "399587", "403388", "406073", "406160", "411977", "412935", "417350", "420060", "421393", "422944", "424462", "427143", "429291", "430758", "431629", "431638", "431935", "432218", "433788", "434291", "436681", "437087", "439385", "439499", "440477", "440834", "441253", "441876", "443826", "444080", "447597", "452643", "454649", "457055", "457946", "463512", "464079", "464123", "467897", "468650", "470211", "471115", "471512", "475493", "476937", "479198", "482871", "484066", "484070", "485462", "486402", "491701", "491835", "499644", "501833", "502335", "502373", "504800", "507439", "507946", "507987", "509066", "513078", "515519", "517017", "517988", "519144", "519210", "519858", "522847", "523683", "525315", "528577", "532463", "532630", "533028", "539033", "539852", "540690", "546773", "546916", "549652", "551599", "554198", "556066", "559920", "560804", "560857", "562080", "562420", "563841", "565668", "565776", "566509", "569039", "572553", "575364", "576421", "576694", "576877", "577120", "577155", "577534", "577605", "578463", "578820", "578995", "580213", "581893", "582433", "582905", "583887", "584569", "585314", "585566", "587393", "589144", "592284", "594463", "596863", "601837", "602632", "604254", "605885", "609296", "609963", "610062", "612437", "612949", "613161", "614372", "614777", "615372", "615384", "616927", "618118", "620041", "620336", "621634", "622289", "624098", "626163", "626612", "627019", "627856", "630003", "630255", "634018", "634478", 
"635801", "638606", "640012", "641078", "641366", "641436", "641821", "642076", "642446", "643329", "643942", "644015", "646792", "647254", "647700", "649516", "650792", "650810", "651229", "652387", "652671", "654778", "657964", "658894", "660500", "660607", "664469", "666754", "666796", "668996", "669712", "671682", "673516", "675712", "677835", "678008", "679262", "680295", "686455", "690471", "691175", "692489", "694023", "696001", "698716", "700133", "700641", "707812", "707953", "708010", "708881", "713657", "715255", "715386", "716764", "718936", "719956", "725348", "727753", "728436", "729588", "730513", "731928", "732013", "732438", "733366", "733559", "734672", "735174", "735675", "737044", "737127", "741264", "745262", "748173", "748414", "748943", "749221", "749963", "750363", "753518", "754512", "754970", "758639", "760838", "761642", "766250", "770646", "772574", "773054", "775271", "776762", "778208", "779453", "781378", "781861", "782257", "785763", "785860", "787011", "790280", "791735", "791903", "792178", "796650", "796822", "796970", "798621", "802731", "804701", "805606", "807848", "809142", "810539", "812182", "812321", "814029", "814545", "814774", "815079", "816572", "824215", "825063", "827763", "829973", "829983", "830126", "832112", "832666", "833066", "834756", "835270", "835340", "837413", "837746", "839882", "846097", "847975", "848746", "851745", "851975", "856622", "858918", "859174", "859182", "859726", "859850", "862222", "864356", "865028", "869700", "871576", "872256", "873350", "873597", "875873", "883140", "886308", "886592", "886706", "892144", "893930", "894959", "896820", "900374", "901373", "902879", "904147", "905194", "906305", "908049", "908798", "911505", "913314", "915390", "915833", "919057", "922432", "924120", "925640", "927671", "932006", "936810", "936916", "938349", "940727", "941945", "942271", "943188", "944548", "945783", "947164", "948322", "949181", "951414", "952632", "955090", "956428", "956985", "959916", 
"960349", "962224", "962980", "964665", "967160", "967588", "969929", "972543", "972893", "977734", "978083", "978981", "980427", "980782", "981541", "981850", "982220", "983781", "985193", "986366", "988934", "989056", "991218", "991914", "995411", "995630", "995873", "995936", "996309"), class = "factor"), aid = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("mem", "noMem"), class = "factor"), gentest = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L), .Label = c("first", "second"), class = "factor"), rocks = structure(c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 8L, 8L, 1L, 1L), .Label = c("R1", "R2", "R3", "R4", "R5", "R6", "R7", "R8"), class = "factor"), rating = c(7L, 5L, 2L, 7L, 4L, 2L, 6L, 3L, 3L, 2L, 3L, 3L, 2L, 1L, 3L, 6L, 3L, 2L, 2L, 4L), condition = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("baseline", "category", "property" ), class = "factor"), order = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("after", "before", "none"), class = "factor")), row.names = c(NA, -20L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), groups = structure(list( turkcode = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L), .Label = c("100879", "104655", "108505", "110324", "110600", "112445", "114083", "115814", "116573", "117411", "117817", "118651", "119324", "121548", "121883", "121918", "123275", "123718", "125491", "127450", "127825", "128062", "129061", "131404", "135358", "135594", "135671", "135945", "137951", "138675", "139469", "140924", "145730", "147222", "148533", "150851", "153455", "158882", "164468", "166907", "169260", "171463", "172398", "175565", "177108", "179000", "180270", "183953", "185574", "185880", "185948", "186371", "187787", "189220", "190014", "192550", "193904", 
"195308", "196755", "197493", "198368", "200155", "200297", "201915", "214519", "215994", "217903", "218771", "219302", "220434", "222740", "223223", "224721", "225118", "225223", "229856", "229874", "231301", "232576", "233842", "234215", "237581", "239567", "240609", "241098", "241423", "242108", "244633", "246055", "251597", "252929", "255252", "256652", "259936", "274962", "277053", "279422", "280317", "282602", "283750", "285737", "286259", "287544", "288507", "290503", "291401", "291835", "292160", "294117", "297863", "298061", "299347", "299499", "301399", "304875", "305231", "306312", "307410", "308979", "311157", "311524", "311630", "318956", "318988", "319995", "321405", "324288", "327086", "327559", "328345", "328401", "330318", "330909", "332723", "334115", "334517", "335811", "335831", "337145", "338323", "338542", "338575", "340083", "341182", "343612", "343947", "344554", "346476", "349874", "350117", "350433", "350972", "351187", "355311", "356717", "359366", "360048", "360058", "361191", "361971", "362827", "363543", "367244", "374254", "374965", "376278", "377622", "382139", "382916", "384586", "385229", "386782", "388951", "389029", "390299", "390662", "396335", "396732", "398076", "398573", "399276", "399587", "403388", "406073", "406160", "411977", "412935", "417350", "420060", "421393", "422944", "424462", "427143", "429291", "430758", "431629", "431638", "431935", "432218", "433788", "434291", "436681", "437087", "439385", "439499", "440477", "440834", "441253", "441876", "443826", "444080", "447597", "452643", "454649", "457055", "457946", "463512", "464079", "464123", "467897", "468650", "470211", "471115", "471512", "475493", "476937", "479198", "482871", "484066", "484070", "485462", "486402", "491701", "491835", "499644", "501833", "502335", "502373", "504800", "507439", "507946", "507987", "509066", "513078", "515519", "517017", "517988", "519144", "519210", "519858", "522847", "523683", "525315", "528577", "532463", "532630", "533028", 
"539033", "539852", "540690", "546773", "546916", "549652", "551599", "554198", "556066", "559920", "560804", "560857", "562080", "562420", "563841", "565668", "565776", "566509", "569039", "572553", "575364", "576421", "576694", "576877", "577120", "577155", "577534", "577605", "578463", "578820", "578995", "580213", "581893", "582433", "582905", "583887", "584569", "585314", "585566", "587393", "589144", "592284", "594463", "596863", "601837", "602632", "604254", "605885", "609296", "609963", "610062", "612437", "612949", "613161", "614372", "614777", "615372", "615384", "616927", "618118", "620041", "620336", "621634", "622289", "624098", "626163", "626612", "627019", "627856", "630003", "630255", "634018", "634478", "635801", "638606", "640012", "641078", "641366", "641436", "641821", "642076", "642446", "643329", "643942", "644015", "646792", "647254", "647700", "649516", "650792", "650810", "651229", "652387", "652671", "654778", "657964", "658894", "660500", "660607", "664469", "666754", "666796", "668996", "669712", "671682", "673516", "675712", "677835", "678008", "679262", "680295", "686455", "690471", "691175", "692489", "694023", "696001", "698716", "700133", "700641", "707812", "707953", "708010", "708881", "713657", "715255", "715386", "716764", "718936", "719956", "725348", "727753", "728436", "729588", "730513", "731928", "732013", "732438", "733366", "733559", "734672", "735174", "735675", "737044", "737127", "741264", "745262", "748173", "748414", "748943", "749221", "749963", "750363", "753518", "754512", "754970", "758639", "760838", "761642", "766250", "770646", "772574", "773054", "775271", "776762", "778208", "779453", "781378", "781861", "782257", "785763", "785860", "787011", "790280", "791735", "791903", "792178", "796650", "796822", "796970", "798621", "802731", "804701", "805606", "807848", "809142", "810539", "812182", "812321", "814029", "814545", "814774", "815079", "816572", "824215", "825063", "827763", "829973", "829983", "830126", 
"832112", "832666", "833066", "834756", "835270", "835340", "837413", "837746", "839882", "846097", "847975", "848746", "851745", "851975", "856622", "858918", "859174", "859182", "859726", "859850", "862222", "864356", "865028", "869700", "871576", "872256", "873350", "873597", "875873", "883140", "886308", "886592", "886706", "892144", "893930", "894959", "896820", "900374", "901373", "902879", "904147", "905194", "906305", "908049", "908798", "911505", "913314", "915390", "915833", "919057", "922432", "924120", "925640", "927671", "932006", "936810", "936916", "938349", "940727", "941945", "942271", "943188", "944548", "945783", "947164", "948322", "949181", "951414", "952632", "955090", "956428", "956985", "959916", "960349", "962224", "962980", "964665", "967160", "967588", "969929", "972543", "972893", "977734", "978083", "978981", "980427", "980782", "981541", "981850", "982220", "983781", "985193", "986366", "988934", "989056", "991218", "991914", "995411", "995630", "995873", "995936", "996309" ), class = "factor"), rocks = structure(c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 1L, 1L), .Label = c("R1", "R2", "R3", "R4", "R5", "R6", "R7", "R8"), class = "factor"), gentest = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("first", "second"), class = "factor"), .rows = list(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15:16, 17:18, 19L, 20L)), row.names = c(NA, -18L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE)) Does anyone know how I can modify the second set of ratings for rock #8 so that I can pivot the data wider or even exclude this data from the dataset altogether? 
EDIT: Here is an example of how I'd like the output to look id <- rep("100879", times = 6) aid <- rep("mem", times = 6) test <- rep(c("first", "second"), times = 3) order <- rep("after", times = 6) condition <- rep ("cat", times = 6) R1 <- sample(0:9, 6, replace=T) R2 <- sample(0:9, 6, replace=T) R3 <- sample(0:9, 6, replace=T) R4 <- sample(0:9, 6, replace=T) R5 <- sample(0:9, 6, replace=T) R6 <- sample(0:9, 6, replace=T) R7 <- sample(0:9, 6, replace=T) R8 <- sample(0:9, 6, replace=T) df <- cbind(id, aid, test, order, condition, R1, R2, R3, R4, R5, R6, R7, R8)
a data.table suggestion library( data.table ) #set data as data.table setDT( mydata ) #create rowid by group mydata[, row_id := rowidv( mydata, cols = c("turkcode", "aid", "gentest", "condition", "order", "rocks") ) ] #create new rocks-column to group on mydata[, rocks2 := paste0( rocks, ifelse( row_id == 1, "", paste0("_",row_id ) ) ) ] #now cast to wide dcast( mydata, turkcode + aid + gentest + condition + order ~ rocks2, value.var = "rating" ) # turkcode aid gentest condition order R1 R2 R3 R4 R5 R6 R7 R8 R8_2 # 1: 100879 mem first category after 7 2 4 6 3 3 2 3 6 # 2: 100879 mem second category after 5 7 2 3 2 3 1 3 2 # 3: 104655 mem first category after 2 NA NA NA NA NA NA NA NA # 4: 104655 mem second category after 4 NA NA NA NA NA NA NA NA
Another option, using pivot_wider and separate: library(dplyr) library(tidyr) #short version, but you will end up with R1-R8 in list format df %>% pivot_wider(id_cols = c("turkcode", "aid", "gentest", "condition", "order"), names_from = "rocks", values_from = "rating", values_fn = list(rating = list)) #clean version df %>% #id_cols: A set of columns that uniquely identifies each observation. #Defaults to all columns in data except for the columns specified in names_from and values_from. pivot_wider(id_cols = c("turkcode", "aid", "gentest", "condition", "order"), names_from = "rocks", values_from = "rating", values_fn = list(rating = ~paste(., collapse = ",")) #values_fn = list(rating = mean) #,values_fill = list(rating=0) ) %>% separate(R8, into = c('R8','R8_1')) # A tibble: 4 x 14 # Groups: turkcode, gentest [1,118] turkcode aid gentest condition order R1 R2 R3 R4 R5 R6 R7 R8 R8_1 <fct> <fct> <fct> <fct> <fct> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> 1 100879 mem first category after 7 2 4 6 3 3 2 3 6 2 100879 mem second category after 5 7 2 3 2 3 1 3 2 3 104655 mem first category after 2 NA NA NA NA NA NA NA NA 4 104655 mem second category after 4 NA NA NA NA NA NA NA NA
Raking Weights on Nested Data: R Output Doesn't Match Stata Output
Introduction I have multilevel survey data of teachers nested in schools. I have manually calculated design weights and non-response adjustment weights based on probability selection and response rate (oldwt below). Now I want to create post-stratification weights by raking on two marginals: the sex (male or female) of and the employment status (full-time or not full-time) of the teacher. With the help of kind people at Statalist (see here), I have seemingly done this in Stata successfully. However, in trying to replicate the results in R, I come up with vastly different output. Sample Data #Variables #school : unique school id #caseid : unique teacher id #oldwt : the product of the design weight and the non-response adjustment #gender : male or female #timecat : employment status (full-time or part-time) #scgender : a combined factor variable of school x gender #sctime : a combined factor variable of school x timecat #genderp : the school's true population for gender #fullp : the school's true population for timecat #Sample Data foo <- structure(list(caseid = 1:11, school = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), oldwt = c(1.8, 1.8, 1.8, 1.8, 1.8, 1.3, 1.3, 1.3, 1.3, 1.3, 1.3), gender = structure(c(2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L), .Label = c("Female", "Male"), class = "factor"), timecat = structure(c(2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L), .Label = c("Full-time", "Part-time"), class = "factor"), scgender = structure(c(2L, 1L, 1L, 2L, 2L, 3L, 4L, 4L, 3L, 4L, 4L), .Label = c("1.Female", "1.Male", "2.Female", "2.Male"), class = "factor"), sctime = structure(c(2L, 2L, 1L, 1L, 1L, 4L, 4L, 3L, 3L, 3L, 3L), .Label = c("1.Full-time", "1.Part-time", "2.Full-time", "2.Part-time"), class = "factor"), genderp = c(0.444, 0.556, 0.556, 0.444, 0.444, 0.25, 0.75, 0.75, 0.25, 0.75, 0.75), fullp = c(0.222, 0.222, 0.778, 0.778, 0.778, 0.375, 0.375, 0.625, 0.625, 0.625, 0.625)), .Names = c("caseid", "school", "oldwt", "gender", "timecat", "scgender", "sctime", 
"genderp", "fullp"), class = "data.frame", row.names = c(NA, -11L)) Raking Code (See here and here for in-depth examples of using anesrake in R). # extract true population proportions into a vector genderp <- c(aggregate(foo$genderp, by=list(foo$scgender), FUN=max)) fullp <- c(aggregate(foo$fullp, by=list(foo$sctime), FUN=max)) genderp <- as.vector(genderp$x) fullp <- as.vector(fullp$x) # align the levels/labels of the population total with the variables names(genderp) <- c("1.Female", "1.Male", "2.Female", "2.Male") names(fullp) <- c("1.Full-time", "1.Part-time", "2.Full-time", "2.Part-time") # create target list of true population proportions for variables targets <- list(genderp, fullp) names(targets) <- c("scgender", "sctime") # rake library(anesrake) outsave <- anesrake(targets, foo, caseid = foo$caseid, weightvec = foo$oldwt, verbose = F, choosemethod = "total", type = "nolim", nlim = 2, force1 = FALSE) outsave Comparison with Stata Output The issue is that the output from R doesn't match up with the output with Stata (even if I set force1 = TRUE), and it seems that the Stata output is the one that is right, making me think my sloppy R code is wrong. Is that the case? caseid R Stata 1 0.070 0.633 2 0.152 1.367 3 0.404 3.633 4 0.187 1.683 5 0.187 1.683 6 0.143 1.146 7 0.232 1.854 8 0.173 1.382 9 0.107 0.854 10 0.173 1.382 11 0.173 1.382
The distribution of your targets in R should sum to one and represent the distribution in your population. Look at my example. I think that the force1 option will not compute the distribution you want unless each school has the same population weight. This is what force1 is doing: targets[[1]]/sum(targets[[1]]) 1.Female 1.Male 2.Female 2.Male 0.278 0.222 0.125 0.375 Is that what you want?
Combining dataframe rows based on a value in a range [duplicate]
This question already has an answer here: Comparing multiple columns in different data sets to find values within range R (1 answer) Closed 8 years ago. I'm trying to bring together (it's not really a merge or join) data contained in two dataframes based on whether a value in one falls within a range on the second. data is at the end of the post for convenience. One data frame (df1) looks like this: Chromosome Position P.value start.range end.range name 2 4553493 8.23e-05 4453493 4653493 A 3 24548810 1.04e-04 24448810 24648810 B 1 9952003 2.09e-04 9852003 10052003 C The second df is much longer, but head(df2) looks like this: ensembl_gene_id chromosome_name start_position end_position OS01G0281600 1 10048273 10050309 OS01G0281400 1 10021423 10027120 OS01G0281301 1 10019633 10020376 OS01G0281200 1 10011875 10015468 OS01G0281100 1 10008075 10011595 OS01G0281000 1 10003952 10007742 I need to match the rows from each IF df1$Position is within 100,000 of either df2$start_position or df2$end_position (ie ((df1$Position - df2$start_position)<100000 | (df1$Position - df2$end_position)<100000). I need, as output, a list or dataframe of the rows that match. There will be multiple df2 values that match df1, and there are multiple entries per chromosome, though df1$name is unique. I've been trying various applications of ddply and custom functions, but am coming up short. Any ideas? 
data: df1 <- structure(list(Chromosome = c(2L, 3L, 1L), Position = c(4553493L, 24548810L, 9952003L), P.value = c(8.23e-05, 0.000104, 0.000209 ), start.range = c(4453493, 24448810, 9852003), end.range = c(4653493, 24648810, 10052003), name = c("A", "B", "C")), .Names = c("Chromosome", "Position", "P.value", "start.range", "end.range", "name"), class = "data.frame", row.names = c(NA, 3L)) df2 <- structure(list(ensembl_gene_id = c("OS01G0281600", "OS01G0281400", "OS01G0281301", "OS01G0281200", "OS01G0281100", "OS01G0281000", "OS01G0280500", "OS01G0280400", "OS01G0280000", "OS01G0279900", "OS01G0279800", "OS01G0279700", "OS01G0279400", "OS01G0279300", "OS01G0279200", "OS01G0279100", "OS01G0279000", "OS01G0278900", "OS01G0278950", "OS02G0183000", "OS02G0182850", "OS02G0182900", "OS02G0182700", "OS02G0182800", "OS02G0182500", "OS02G0182300", "OS02G0181900", "OS02G0182100", "OS02G0181800", "OS02G0181400", "OS02G0180900", "OS02G0180700", "OS02G0180500", "OS02G0180200", "OS02G0180400", "OS02G0180100", "OS03G0640300", "OS03G0640400", "OS03G0640000", "OS03G0640100", "OS03G0639700", "OS03G0639800", "OS03G0639600", "OS03G0639400", "OS03G0639300", "OS03G0638900", "OS03G0639100", "OS03G0638400", "OS03G0638800", "OS03G0638300", "OS03G0638200"), chromosome_name = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), start_position = c(10048273L, 10021423L, 10019633L, 10011875L, 10008075L, 10003952L, 9967185L, 9962807L, 9936850L, 9928971L, 9917593L, 9913390L, 9889550L, 9887657L, 9878384L, 9874379L, 9866730L, 9859354L, 9863216L, 4639932L, 4629617L, 4630446L, 4616832L, 4625425L, 4598883L, 4594375L, 4567630L, 4573831L, 4563073L, 4551426L, 4521670L, 4497115L, 4486531L, 4460342L, 4481872L, 4455016L, 24630180L, 24638186L, 24616417L, 24621460L, 24591421L, 24596843L, 24574540L, 24564913L, 24544511L, 24487877L, 24514494L, 24466606L, 
24476060L, 24454477L, 24449135L), end_position = c(10050309L, 10027120L, 10020376L, 10015468L, 10011595L, 10007742L, 9969073L, 9966715L, 9947933L, 9935981L, 9921565L, 9917318L, 9902737L, 9889123L, 9885517L, 9876678L, 9870864L, 9860677L, 9866617L, 4641686L, 4630180L, 4634616L, 4621974L, 4628750L, 4601382L, 4595386L, 4573049L, 4578257L, 4566597L, 4552860L, 4523668L, 4500124L, 4489409L, 4463571L, 4483470L, 4457715L, 24634746L, 24641449L, 24617859L, 24629502L, 24596437L, 24600376L, 24579212L, 24565726L, 24549550L, 24489307L, 24515219L, 24473558L, 24480927L, 24457481L, 24453890L)), .Names = c("ensembl_gene_id", "chromosome_name", "start_position", "end_position"), class = "data.frame", row.names = c(NA, -51L))
# Is this what you want?
#
# For every row of df1 (df1$name is unique, so each group `x` is one row),
# return the df2 genes on the same chromosome whose start or end lies within
# 100,000 bp of x$Position. ddply() prepends the `name` column to the result.
ddply(df1, .(name), function(x) {
  # Positions are only comparable within a chromosome.
  same_chromosome <- df2$chromosome_name == x$Chromosome

  # "Within 100,000" is a distance, so compare absolute differences: a signed
  # difference is below 100000 for every gene located after the position.
  near_start <- abs(x$Position - df2$start_position) < 100000
  near_end <- abs(x$Position - df2$end_position) < 100000

  df2[same_chromosome & (near_start | near_end), ]
})
Improve speed of 3 for loops in R
I am working with a matrix set_onco of 206 rows x 196 cols and I have a vector, genes_100 (it's a matrix but I take only the first col), with 101 names. here's a snippet of how they look > set_onco[1:10,1:10] V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 GLI1_UP.V1_DN COPZ1 C10orf46 C20orf118 TMEM181 CCNL2 YIPF1 GTDC1 OPN3 RSAD2 SLC22A1 GLI1_UP.V1_UP IGFBP6 HLA-DQB1 CCND2 PTH1R TXNDC12 M6PR PPT2 STAU1 IGJ TMOD3 E2F1_UP.V1_DN TGFB1I1 CXCL5 POU5F1 SAMD10 KLF2 STAT6 ENTPD6 VCAN HMGCS1 ANXA8 E2F1_UP.V1_UP RRP1B HES1 ADCY6 CHAF1B VPS37B GRSF1 TLX2 SSX2IP DNA2 CMA1 EGFR_UP.V1_DN NPY1R PDZK1 GFRA1 GREB1 MSMB DLC1 MYB SLC6A14 IFI44 IFI44L EGFR_UP.V1_UP FGG GBP1 TNFRSF11B FGB GJA1 DUSP6 S100A9 ADM ITGB6 DUSP4 ERB2_UP.V1_DN NPY1R PDZK1 ANXA3 GREB1 HSPB8 DLC1 NRIP1 FHL2 EGR3 IFI44 FAM18B1 ERB2_UP.V1_UP CYP1A1 CEACAM5 FAM129A TNFRSF11B DUSP4 CYP1B1 UPK2 DAB2 CEACAM6 KIAA1199 GCNP_SHH_UP_EARLY.V1_DN SRRM2 KIAA1217 DEFA1 DLK1 PITX2 CCL2 UPK3B SEZ6 TAF15 EMP1 genes_100[1:10,1] [1] AL591845.1 B3GALT6 RAP1GAP HSPG2 BX293535.1 RP1-159A19.1 IFI6 FAM76A FAM176B CSF3R 101 Levels: 5_8S_rRNA AC018470.1 AC091179.2 AC103702.3 AC138972.1 ACVR1B AL049829.5 AL137797.2 AL139260.2 AL450326.2 AL591845.1 AL607122.2 B3GALT6 BX293535.1 ... ZNF678 what I want to do is to parse through the matrix and count the frequency at which each row contains the names in genes_100 to do that I created 3 for loops: the first one moves down one row at the time, the second one moves into the row and the third one loops over the list genes_100 checking for matches. at the end I save in a matrix how many times genes_100 matched with the terms in each row, saving also the row names from the matrix (so that I know which one is which) the code works and gives me the correct output...but it's just really slow!! 
a snippet of the output is: head(result_matrix_100) freq_100 [1,] "GLI1_UP.V1_DN" "0" [2,] "GLI1_UP.V1_UP" "0" [3,] "E2F1_UP.V1_DN" "0" [4,] "E2F1_UP.V1_UP" "0" [5,] "EGFR_UP.V1_DN" "0" [6,] "EGFR_UP.V1_UP" "0" I used system.time() and I get: user system elapsed 525.38 0.06 530.34 which is way too slow since I have even bigger matrices to parse, and in some cases I have to repeat this 10k times!!! the code is: result_matrix_100 <- matrix(nrow=0, ncol=2) for (q in seq(1,nrow(set_onco),1)) { for (j in seq(1, length(set_onco[q,]),1)) { for (x in seq(1,101,1)) { if (as.character(genes_100[x,1]) == as.character(set_onco[q,j])) { freq_100 <- freq_100+1 } } } result_matrix_100 <- rbind(result_matrix_100, cbind(row.names(set_onco)[q], freq_100)) } what would you suggest? thanks in advance :)
#joran's will possibly be faster although it may not be "factor-safe". Your set_onco values are probably encoded as factor variables (because your genes_100 object clearly is.) This will be safer: set_onco[] <- lapply(set_onco, as.character) # that converts a data.frame with factor columns to character valued # at that point #joran's solution could be used safely freq100 <- apply(set_onco, 1, function(x) sum(x %in% genes_100) ) # that does a row-by-row count of the number of matches to genes_100 freq100 GLI1_UP.V1_DN GLI1_UP.V1_UP E2F1_UP.V1_DN 0 0 0 E2F1_UP.V1_UP EGFR_UP.V1_DN EGFR_UP.V1_UP 0 0 0 ERB2_UP.V1_DN ERB2_UP.V1_UP GCNP_SHH_UP_EARLY.V1_DN 0 0 0 The size of your dataset (206 rows x 196 cols) is quite small so this will be virtually immediate. These dput statements and output can be used to construct what I think your objects look like internally: dput(set_onco) structure(list(V2 = structure(c(1L, 4L, 8L, 6L, 5L, 3L, 5L, 2L, 7L), .Label = c("COPZ1", "CYP1A1", "FGG", "IGFBP6", "NPY1R", "RRP1B", "SRRM2", "TGFB1I1"), class = "factor"), V3 = structure(c(1L, 6L, 3L, 5L, 8L, 4L, 8L, 2L, 7L), .Label = c("C10orf46", "CEACAM5", "CXCL5", "GBP1", "HES1", "HLA-DQB1", "KIAA1217", "PDZK1"), class = "factor"), V4 = structure(c(3L, 4L, 8L, 1L, 7L, 9L, 2L, 6L, 5L), .Label = c("ADCY6", "ANXA3", "C20orf118", "CCND2", "DEFA1", "FAM129A", "GFRA1", "POU5F1", "TNFRSF11B"), class = "factor"), V5 = structure(c(7L, 5L, 6L, 1L, 4L, 3L, 4L, 8L, 2L), .Label = c("CHAF1B", "DLK1", "FGB", "GREB1", "PTH1R", "SAMD10", "TMEM181", "TNFRSF11B" ), class = "factor"), V6 = structure(c(1L, 8L, 5L, 9L, 6L, 3L, 4L, 2L, 7L), .Label = c("CCNL2", "DUSP4", "GJA1", "HSPB8", "KLF2", "MSMB", "PITX2", "TXNDC12", "VPS37B"), class = "factor"), V7 = structure(c(8L, 6L, 7L, 5L, 3L, 4L, 3L, 2L, 1L), .Label = c("CCL2", "CYP1B1", "DLC1", "DUSP6", "GRSF1", "M6PR", "STAT6", "YIPF1" ), class = "factor"), V8 = structure(c(2L, 5L, 1L, 7L, 3L, 6L, 4L, 8L, 9L), .Label = c("ENTPD6", "GTDC1", "MYB", "NRIP1", "PPT2", 
"S100A9", "TLX2", "UPK2", "UPK3B"), class = "factor"), V9 = structure(c(4L, 8L, 9L, 7L, 6L, 1L, 3L, 2L, 5L), .Label = c("ADM", "DAB2", "FHL2", "OPN3", "SEZ6", "SLC6A14", "SSX2IP", "STAU1", "VCAN"), class = "factor"), V10 = structure(c(8L, 6L, 4L, 2L, 5L, 7L, 3L, 1L, 9L), .Label = c("CEACAM6", "DNA2", "EGR3", "HMGCS1", "IFI44", "IGJ", "ITGB6", "RSAD2", "TAF15"), class = "factor"), V11 = structure(c(8L, 9L, 1L, 2L, 6L, 3L, 5L, 7L, 4L), .Label = c("ANXA8", "CMA1", "DUSP4", "EMP1", "IFI44", "IFI44L", "KIAA1199", "SLC22A1", "TMOD3"), class = "factor")), .Names = c("V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11"), class = "data.frame", row.names = c("GLI1_UP.V1_DN", "GLI1_UP.V1_UP", "E2F1_UP.V1_DN", "E2F1_UP.V1_UP", "EGFR_UP.V1_DN", "EGFR_UP.V1_UP", "ERB2_UP.V1_DN", "ERB2_UP.V1_UP", "GCNP_SHH_UP_EARLY.V1_DN" )) dput(factor(genes_100) ) structure(c(1L, 2L, 9L, 7L, 3L, 10L, 8L, 6L, 5L, 4L), .Label = c("AL591845.1", "B3GALT6", "BX293535.1", "CSF3R", "FAM176B", "FAM76A", "HSPG2", "IFI6", "RAP1GAP", "RP1-159A19.1"), class = "factor")
# Something like this will probably be quite fast:

# Sample data
m <- matrix(sample(letters, 206 * 196, replace = TRUE), nrow = 206, ncol = 196)
genes_100 <- letters[1:5]

# `%in%` returns a plain logical vector, so rebuild the matrix from the shape
# and dimnames of `m` itself rather than repeating hard-coded dimensions. This
# works for a matrix of any size and keeps row names on the result.
m1 <- array(m %in% genes_100, dim = dim(m), dimnames = dimnames(m))

# Number of entries in each row of `m` that occur in genes_100.
rowSums(m1)