Grouping elements from different data - r

In my work I'm trying to find which genes usually occur together. So I set up some experiments and am now trying to analyze the data. I have already written a nice script for analyzing it, but it's still not enough.
What I want to do this time is to analyze a couple of tables and establish which genes are usually together - in the same cluster.
That's my data:
First table:
> dput(tbl_col_clu1[1:20,])
structure(list(`10` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), `20` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `52.5` = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `81` = c(0, 0, 0, 0,
0, 0, 0.64209043, 0, 0, 0, 0, 0, 0, 0, 0.636411741, 0.183490041,
0, 0, 0, 0), `110` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), `140.5` = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `189` = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0.84958569, 0, 0, 0, 0, 0), `222.5` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0.37119221, 0, 0, 0, 1, 0, 0, 0, 0,
0), `278` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), `340` = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), `397` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `453.5` = c(0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `529` = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `580` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `630.5` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `683.5` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `735.5` = c(0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `784` = c(0,
0, 0, 0, 0, 0, 0, 0.399952462, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0.959211661, 1), `832` = c(0, 0.1266780707, 0, 0, 0, 0, 0, 0.2132893016,
1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0.959211661, 1), `882.5` = c(0,
0.12667807, 0, 0, 0, 1, 0, 0.08480435, 0, 0, 0, 0, 0, 1, 0, 0,
0, 0, 1, 0.70163097), `926.5` = c(0, 1, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), `973` = c(0, 0.12621196, 0,
0, 0, 0, 0, 0.11813646, 0, 0, 0, 1, 0, 0, 0.59389934, 1, 0, 0,
0, 0), `1108` = c(0, 0.092444384, 0, 0, 0, 0, 0, 0.115758222,
0, 0, 0, 0.925835779, 0, 0, 1, 0.303482426, 0.848464317, 0, 0,
0), `1200` = c(0, 0.120055749, 0, 1, 0, 0, 0, 0.150055416, 0,
0, 0, 0.558015841, 0, 0, 0.796949668, 0.276321753, 1, 0, 0, 0
), Clusters = structure(c(1L, 64L, 45L, 102L, 11L, 77L, 170L,
55L, 59L, 316L, 316L, 98L, 90L, 77L, 232L, 178L, 101L, 50L, 51L,
51L), .Label = c("10", "10,13,15", "10,15", "10,15,16", "10,20,21,22,23,24",
"10,22,23,24", "11", "11,12,13,14,15", "11,12,13,14,15,16", "11,12,13,14,15,16,17",
"12", "12,13", "12,13,14", "12,13,14,15", "12,13,14,15,16", "12,13,14,15,16,17",
"12,13,14,15,16,17,18,19,20,21,22,23,24", "12,13,15", "12,13,17",
"13", "13,14", "13,14,15", "13,14,15,16", "13,14,15,16,17", "13,15",
"13,15,16,17", "14", "14,15", "14,15,16", "14,15,16,17", "14,15,16,17,18,19,20,21,22,23,24",
"14,19", "15", "15,16", "15,16,17", "15,16,17,18,19,20,21,22,23,24",
"15,16,17,19,20,21,22,23,24", "15,17", "15,17,24", "15,22,23,24",
"15,23", "15,24", "16", "16,17", "17", "17,18,19,20", "17,18,19,20,21,22,23,24",
"17,21,22,23,24", "18", "18,19", "18,19,20", "18,19,20,21", "18,19,20,21,22",
"18,19,20,21,22,23", "18,19,20,21,22,23,24", "18,19,21", "18,19,22,23",
"18,20", "19", "19,20", "19,20,21", "19,20,21,22", "19,20,21,22,23",
"19,20,21,22,23,24", "19,20,22", "19,20,22,23", "19,20,22,23,24",
"19,20,23", "19,21", "19,22", "19,23", "19,24", "2", "2,18,19,20",
"2,19,20", "2,3,4", "20", "20,21", "20,21,22", "20,21,22,23",
"20,21,22,23,24", "20,21,23", "20,22", "20,22,23", "20,22,23,24",
"20,22,24", "20,23", "20,23,24", "20,24", "21", "21,22", "21,22,23",
"21,22,23,24", "21,23,24", "21,24", "22", "22,23", "22,23,24",
"22,24", "23", "23,24", "24", "3", "3,10", "3,18,19,20", "3,18,19,20,21,22,23,24",
"3,19,20", "3,19,20,21", "3,19,20,22,23,24", "3,20,21,22,23,24",
"3,20,22,23,24", "3,21,23,24", "3,22,23,24", "3,22,24", "3,23",
"3,23,24", "3,24", "3,4", "3,4,10", "3,4,18,19", "3,4,18,19,20",
"3,4,18,19,20,21,22,23", "3,4,18,19,20,21,22,23,24", "3,4,19,20,21",
"3,4,21", "3,4,21,22,23", "3,4,21,22,23,24", "3,4,22,23", "3,4,22,23,24",
"3,4,22,24", "3,4,23,24", "3,4,24", "3,4,5", "3,4,5,10", "3,4,5,10,23,24",
"3,4,5,20", "3,4,5,22,23,24", "3,4,5,23,24", "3,4,5,24", "3,4,5,6",
"3,4,5,6,10", "3,4,5,6,20,22,23,24", "3,4,5,6,7", "3,4,5,6,7,10",
"3,4,5,6,7,24", "3,4,5,6,7,8", "3,4,5,6,7,8,10", "3,4,5,6,7,8,10,13",
"3,4,5,6,7,8,10,22,23,24", "3,4,5,6,7,8,12", "3,4,5,6,7,8,15",
"3,4,5,6,7,8,18,19,20,21,22,23,24", "3,4,5,6,7,8,22,23,24", "3,4,5,6,7,8,9,10",
"3,4,5,6,7,8,9,10,11,12", "3,4,5,6,7,8,9,10,11,12,13,14,15",
"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17", "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24",
"3,4,5,6,7,8,9,10,11,14,15", "3,4,5,6,7,8,9,10,19,20,21,22,23,24",
"3,4,5,6,7,8,9,10,22,23,24", "3,4,6", "3,4,6,7,20,21,22,23,24",
"3,4,7", "3,4,7,8", "3,5,6,7,8", "3,5,8", "3,7", "3,7,19,20,22,23",
"4", "4,10", "4,10,24", "4,18,19,20", "4,19,20", "4,20,21,22",
"4,20,21,22,23,24", "4,20,22,23,24", "4,22,23,24", "4,23,24",
"4,24", "4,5", "4,5,10", "4,5,10,21", "4,5,10,23,24", "4,5,19,20,21,22,23",
"4,5,19,20,22,23,24", "4,5,20,21,22,23,24", "4,5,20,22,23,24",
"4,5,22,23,24", "4,5,24", "4,5,6", "4,5,6,10", "4,5,6,10,20,22,23,24",
"4,5,6,19", "4,5,6,22,23,24", "4,5,6,7", "4,5,6,7,10", "4,5,6,7,19,20,21,22,23,24",
"4,5,6,7,22,23,24", "4,5,6,7,8", "4,5,6,7,8,10", "4,5,6,7,8,10,19,20,21,22,23,24",
"4,5,6,7,8,10,20,21,22,23,24", "4,5,6,7,8,10,21,22,23,24", "4,5,6,7,8,10,22,23,24",
"4,5,6,7,8,10,23,24", "4,5,6,7,8,15", "4,5,6,7,8,17,18,19,20,21,22,23,24",
"4,5,6,7,8,19,20", "4,5,6,7,8,19,20,21,22,23,24", "4,5,6,7,8,20,21,22,23,24",
"4,5,6,7,8,21,22,23,24", "4,5,6,7,8,22,23,24", "4,5,6,7,8,9,10",
"4,5,6,7,8,9,10,11,12", "4,5,6,7,8,9,10,11,12,13,14,15", "4,5,6,7,8,9,10,11,12,13,14,15,16,17",
"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18", "4,5,6,7,8,9,10,12,13",
"4,5,6,7,8,9,14,15,16", "4,5,7,9", "4,5,8,22", "4,6", "4,6,7,22,23,24",
"4,6,7,23,24", "4,6,7,8,15,17", "4,6,7,8,23,24", "4,7", "4,7,20,21",
"4,7,21,22,23,24", "4,7,8", "4,7,8,22,23,24", "5", "5,10", "5,17",
"5,18,19,20,21,22,23", "5,19,20,21,22,23,24", "5,20", "5,22,23,24",
"5,24", "5,6", "5,6,10", "5,6,7", "5,6,7,10", "5,6,7,10,19",
"5,6,7,22,23,24", "5,6,7,8", "5,6,7,8,10", "5,6,7,8,10,15", "5,6,7,8,10,22,23,24",
"5,6,7,8,15", "5,6,7,8,18,19,20,21,22,23,24", "5,6,7,8,21,22,23,24",
"5,6,7,8,22,23,24", "5,6,7,8,9", "5,6,7,8,9,10", "5,6,7,8,9,10,11,12,13",
"5,6,7,8,9,10,11,12,13,14,15", "5,6,7,8,9,12", "5,6,7,8,9,13",
"5,7", "5,7,8", "5,8", "6", "6,10", "6,21,22,23", "6,22", "6,22,23,24",
"6,7", "6,7,10,17", "6,7,22,23,24", "6,7,23,24", "6,7,24", "6,7,8",
"6,7,8,10", "6,7,8,13,14,15,16,17", "6,7,8,15", "6,7,8,19,20",
"6,7,8,20,21,22,23,24", "6,7,8,21,22,23,24", "6,7,8,23,24", "6,7,8,9",
"6,7,8,9,10", "6,7,8,9,10,11,12", "6,7,8,9,10,11,12,13,14,15,16,17",
"6,7,8,9,10,15,16", "6,7,8,9,10,18,19,20,21,22,23,24", "6,7,8,9,15",
"6,8", "7", "7,15", "7,15,17", "7,16,18,21", "7,17", "7,19,20",
"7,19,20,21,22", "7,20,21,22,23,24", "7,20,22,23,24", "7,22,23,24",
"7,24", "7,8", "7,8,10", "7,8,10,22,23,24", "7,8,13,15", "7,8,14",
"7,8,15", "7,8,15,16", "7,8,15,23", "7,8,20", "7,8,22", "7,8,23",
"7,8,9", "7,8,9,10", "7,8,9,13", "7,8,9,15,16,17", "8", "8,10",
"8,15", "8,17", "8,22", "8,24", "8,9", "8,9,10", "9", "9,10,11,12,13,14,15,16,17"
), class = "factor")), .Names = c("10", "20", "52.5", "81", "110",
"140.5", "189", "222.5", "278", "340", "397", "453.5", "529",
"580", "630.5", "683.5", "735.5", "784", "832", "882.5", "926.5",
"973", "1108", "1200", "Clusters"), row.names = c("at1g01050.1",
"at1g01080.1", "at1g01090.1", "at1g01220.1", "at1g01320.2", "at1g01420.1",
"at1g01710.1", "at1g01800.1", "at1g01920.2", "at1g01940.1", "at1g01960.1",
"at1g02020.2", "at1g02100.2", "at1g02140.1", "at1g02150.1", "at1g02500.2",
"at1g02560.1", "at1g02880.3", "at1g02920.1", "at1g02930.2"), class = "data.frame")
Second table:
> dput(tbl_col_clu2[1:20,])
structure(list(`10` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), `20` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `52.5` = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `81` = c(0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `110` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `140.5` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `189` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `222.5` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `278` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), `340` = c(0,
0, 0, 0, 0, 0, 0.583163048, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
1, 0.218194067), `397` = c(0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
0, 0.63953839, 0, 1, 0, 0, 0, 1), `453.5` = c(0, 0.66069369,
0, 0, 0, 1, 0.57541627, 1, 1, 0, 0, 0, 1, 0.64615661, 0, 0.45209671,
0, 0, 0, 0.17022498), `529` = c(0, 0.521435654, 0, 0, 1, 0, 0.175996209,
0, 0, 0, 1, 0, 0, 0, 0, 0.886059888, 0, 0, 0, 0.17022498), `580` = c(0,
0.437291195, 0, 0, 1, 0, 0.20731698, 0, 0, 0, 1, 0, 0, 0, 0,
0.719755907, 0, 0, 0, 0.033248127), `630.5` = c(0, 0.52204783,
0, 0, 0, 0, 0.48815538, 0, 0, 0, 0, 1, 0, 0, 0, 0.82709638, 0,
0, 0, 0.09539534), `683.5` = c(0, 0.52429838, 0, 0, 0, 0, 0.59605685,
0, 0, 0, 0, 0, 0, 0, 0, 0.27845748, 0.28224351, 0, 0, 0), `735.5` = c(1,
0.3768651, 0, 1, 0, 0, 0.51381348, 0, 0, 0, 0, 0, 0, 0, 0, 0.39914361,
0.22206677, 0, 0, 0), `784` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0), `832` = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0.16189002, 0, 0, 0), `882.5` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `926.5` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0), `973` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.86100786, 0, 0, 0, 0,
0), `1108` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), `1200` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), Clusters = structure(c(168L, 32L, 246L,
168L, 81L, 44L, 8L, 44L, 27L, 318L, 81L, 132L, 15L, 3L, 219L,
32L, 156L, 318L, 1L, 6L), .Label = c("10", "10,11", "10,11,12",
"10,11,12,13", "10,11,12,13,14", "10,11,12,13,14,15", "10,11,12,13,14,15,16",
"10,11,12,13,14,15,16,17", "10,11,12,13,14,15,16,17,18,19", "10,11,12,13,14,15,16,17,18,19,20",
"10,11,12,13,14,15,16,17,18,19,20,21", "10,11,12,13,14,16", "10,11,12,13,15,16,17,18,19,20,21",
"10,11,12,13,19", "10,12", "10,12,13", "10,12,13,14", "10,12,13,14,15",
"10,12,13,14,15,16,17", "10,12,13,15", "10,12,21", "10,13", "10,13,14",
"10,17,18", "10,20", "11", "11,12", "11,12,13", "11,12,13,14",
"11,12,13,14,15", "11,12,13,14,15,16", "11,12,13,14,15,16,17",
"11,12,13,14,15,16,17,18,19", "11,12,13,14,15,16,17,18,19,20",
"11,12,13,14,15,16,17,18,19,20,21,22,23", "11,12,13,14,15,16,17,18,19,20,21,22,23,24",
"11,12,13,14,15,16,17,18,19,21,22", "11,12,13,14,15,16,18", "11,12,13,17,18,19",
"11,12,14", "11,13", "11,13,14,15,16", "11,15", "12", "12,13",
"12,13,14", "12,13,14,15", "12,13,14,15,16", "12,13,14,15,16,17",
"12,13,14,15,16,17,18", "12,13,14,15,16,17,18,19", "12,13,14,15,16,17,18,19,20",
"12,13,14,15,16,17,18,19,20,21", "12,13,14,15,16,17,18,19,20,21,22",
"12,13,14,15,16,17,18,19,20,21,22,23", "12,13,14,15,16,17,18,19,20,21,22,23,24",
"12,13,14,15,16,17,18,19,23,24", "12,13,14,15,16,17,19", "12,13,14,15,16,17,19,20,21",
"12,13,14,15,16,17,21", "12,13,14,15,16,18", "12,13,14,15,17",
"12,13,14,16,17,19", "12,13,14,18", "12,13,15", "12,13,16", "12,13,16,17,18,19",
"12,13,16,19", "12,13,17", "12,13,21,22,23", "12,14", "12,14,15",
"12,14,15,16", "12,14,15,17,19", "12,15", "12,15,16,17", "12,16,17",
"12,20", "12,21,23", "13", "13,14", "13,14,15", "13,14,15,16",
"13,14,15,16,17", "13,14,15,16,17,18", "13,14,15,16,17,18,19",
"13,14,15,16,17,18,19,20", "13,14,15,16,17,18,19,20,21", "13,14,15,16,17,18,19,20,21,22",
"13,14,15,16,17,18,19,20,21,22,23", "13,14,15,16,17,18,19,20,21,22,23,24",
"13,14,15,16,17,18,19,21", "13,14,15,16,17,18,19,21,22,23", "13,14,15,16,17,19",
"13,14,15,16,17,21", "13,14,15,16,18,23", "13,14,17", "13,14,19,20,21,22,23",
"13,14,23,24", "13,15", "13,15,16", "13,15,16,18,19", "13,15,17",
"13,16,17", "13,17", "13,17,19", "13,19", "13,21", "14", "14,15",
"14,15,16", "14,15,16,17", "14,15,16,17,18", "14,15,16,17,18,19",
"14,15,16,17,18,19,20", "14,15,16,17,18,19,20,21", "14,15,16,17,18,19,20,21,22",
"14,15,16,17,18,19,20,21,22,23", "14,15,16,17,18,19,20,21,22,23,24",
"14,15,16,17,18,19,20,22,23,24", "14,15,16,17,19", "14,15,16,17,19,20",
"14,15,16,17,19,20,21", "14,15,16,17,22", "14,15,16,19", "14,15,17",
"14,15,19", "14,17", "14,17,18,19", "14,19", "14,21", "15", "15,16",
"15,16,17", "15,16,17,18", "15,16,17,18,19", "15,16,17,18,19,20",
"15,16,17,18,19,20,21", "15,16,17,18,19,20,21,22,23", "15,16,17,18,19,20,21,22,23,24",
"15,16,17,19", "15,16,17,19,20,21", "15,16,17,19,24", "15,16,17,20,21",
"15,16,17,21", "15,16,17,23", "15,16,18,19", "15,16,19,20", "15,17",
"15,18,19,20", "15,18,19,20,21", "15,19", "16", "16,17", "16,17,18",
"16,17,18,19", "16,17,18,19,20", "16,17,18,19,20,21", "16,17,18,19,20,21,22",
"16,17,18,19,20,21,22,23", "16,17,18,19,20,21,22,23,24", "16,17,19",
"16,17,19,20", "16,17,19,20,21", "16,17,19,21", "16,17,23", "16,19",
"17", "17,18", "17,18,19", "17,18,19,20", "17,18,19,20,21", "17,18,19,20,21,22",
"17,18,19,20,21,22,23", "17,18,19,20,21,22,23,24", "17,18,19,21",
"17,19", "17,19,20", "17,19,20,21", "17,19,20,21,22,23,24", "17,19,23",
"17,20,21", "17,20,21,23", "17,21,22", "17,23", "17,24", "18",
"18,19", "18,19,20", "18,19,20,21", "18,19,20,21,22", "18,19,20,21,22,23",
"18,19,20,21,22,23,24", "18,19,20,21,23", "18,20", "19", "19,20",
"19,20,21", "19,20,21,22", "19,20,21,22,23", "19,20,21,22,23,24",
"19,20,21,23,24", "19,20,22", "19,21", "19,22", "19,23", "2",
"2,17", "2,3,4,5,6", "2,3,4,5,6,7", "20", "20,21", "20,21,22",
"20,21,22,23", "20,21,22,23,24", "20,21,23", "20,21,23,24", "21",
"21,22", "21,22,23", "21,22,23,24", "21,23", "22", "22,23", "22,23,24",
"23", "23,24", "24", "3", "3,23,24", "3,4", "3,4,23,24", "3,4,5",
"3,4,5,6", "3,4,5,6,13,14,15,16,17,18,19,20,21,22,23,24", "3,4,5,6,7",
"3,4,5,6,7,8,9", "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17", "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24",
"3,4,5,6,7,8,9,20,21,22,23,24", "3,4,5,6,7,8,9,21,22,23,24",
"3,4,5,6,8,9", "3,4,5,7,8,9,15,16,17,18,19,20,21,22,23", "3,4,6,12,13,14,15,16,17,18,19,20,21,22,23,24",
"3,8,9,10,11,12,13,14,15,16,17,18,19,20", "4", "4,17,18,19,20,21,22,23,24",
"4,19,20,21,22,23,24", "4,21", "4,22,23,24", "4,5,17,18,19,20,21,22,23,24",
"4,5,21,22,23,24", "4,5,6", "4,5,6,22,23,24", "4,5,6,7,8,9",
"4,5,6,7,8,9,10", "4,5,6,7,8,9,10,15,16,17,18,19,20,21,22,23,24",
"4,5,6,7,8,9,12,13,14,15,16,17,18,19,20,21,22,23,24", "4,5,6,7,8,9,13",
"4,5,6,7,8,9,14,15,16,17,18,19,20,21,22,23,24", "4,5,6,7,8,9,17,18,19,20,21,22,23,24",
"4,5,6,7,8,9,19,20,21,22,23,24", "4,5,6,7,8,9,19,23,24", "4,5,6,7,8,9,23,24",
"4,5,7,8,9", "4,8,9,12,13,14,15,16,17,18,19,20,21,22,23,24",
"4,8,9,23,24", "5", "5,22,23", "5,6", "5,6,15,16,17,18,19,20,21,22,23,24",
"5,6,19,20,21,22,23,24", "5,6,24", "5,6,7", "5,6,7,8", "5,6,7,8,19,20,21,22,23,24",
"5,6,7,8,9", "5,6,7,8,9,10,11,12,13", "5,6,7,8,9,10,11,12,13,14,15,16,17",
"5,6,7,8,9,15,23,24", "5,6,9", "5,7", "5,8,9", "6", "6,15,16,17,18,19,20,21,22,23,24",
"6,19,20,21,22,23,24", "6,20,21,22,23,24", "6,21,22,23,24", "6,7",
"6,7,8", "6,7,8,9", "6,7,8,9,15,16,17,18,19,20,21,22,23,24",
"6,7,8,9,23,24", "6,7,9", "6,8,15,16,17,18,19,20,21,22,23", "6,8,9",
"6,9", "7", "7,14,24", "7,8,9", "7,8,9,10,11,12,13,14,15", "7,8,9,20,21,22,23,24",
"7,8,9,23,24", "7,9", "7,9,10", "8", "8,19,20,21", "8,19,20,21,22,23,24",
"8,9", "8,9,10,11,12,13,14,15,16,17", "8,9,10,17,18,19,20,21,22",
"8,9,12,13,14,15,16,17,18,19", "8,9,14,15,16,17,18,19,20,21,22,23,24",
"8,9,15,16,17,18,19,20,21,22", "8,9,19", "8,9,19,20,21,22,23",
"8,9,21,22", "9", "9,10", "9,10,11,12,13,14", "9,10,11,12,13,14,15,16",
"9,10,11,12,13,14,15,16,17", "9,10,11,12,13,14,15,16,17,18,19",
"9,10,11,12,13,14,15,16,17,18,19,20,21", "9,10,11,12,13,14,15,16,17,18,19,20,21,22,23",
"9,10,11,12,13,14,15,16,17,19", "9,12", "9,12,13", "9,12,13,14",
"9,13", "9,13,14,15", "9,13,14,15,16,17", "9,13,14,15,18", "9,14",
"9,14,15,16", "9,15", "9,15,16,17", "9,16", "9,16,17,18,19,21,22",
"9,16,17,19", "9,17", "9,17,18", "9,19", "9,19,20", "9,19,20,21",
"9,19,21", "9,20", "9,20,21", "9,20,21,22", "9,21", "9,22", "9,23"
), class = "factor")), .Names = c("10", "20", "52.5", "81", "110",
"140.5", "189", "222.5", "278", "340", "397", "453.5", "529",
"580", "630.5", "683.5", "735.5", "784", "832", "882.5", "926.5",
"973", "1108", "1200", "Clusters"), row.names = c("at1g01050.1",
"at1g01080.1", "at1g01090.1", "at1g01220.1", "at1g01420.1", "at1g01470.1",
"at1g01800.1", "at1g01910.5", "at1g01920.2", "at1g01980.1", "at1g02020.2",
"at1g02100.2", "at1g02130.1", "at1g02140.1", "at1g02150.1", "at1g02500.2",
"at1g02560.1", "at1g02780.1", "at1g02880.3", "at1g02920.1"), class = "data.frame")
Third Table:
> dput(tbl_col_clu3[1:20,])
structure(list(`10` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), `33.95` = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `58.66` = c(0, 0, 0, 0, 0.328143363,
0.552139556, 0.495919686, 0, 0, 0, 0, 0, 0, 0, 0, 0.416266322,
0.886125103, 1, 1, 0), `84.42` = c(0, 0, 0, 0, 1, 1, 0, 0, 0,
0, 0, 0.327004551, 0, 0, 0, 0.956778355, 1, 0.175277617, 0.240402438,
0), `110.21` = c(0, 0, 0, 0, 0, 0.151581882, 0, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0.091367379, 0.029316359, 0, 0), `134.16` = c(0.190968551,
0, 0, 0, 0, 0.164736594, 0, 0, 0, 0, 0, 0.650199285, 0, 0, 0,
0, 0.097800974, 0.007393484, 0, 0), `164.69` = c(0.5342874459,
0, 0.3619993464, 0, 0, 0.1891527151, 0, 0, 0, 0, 0, 0.4926963182,
0, 0, 0, 0, 0, 0, 0, 0), `199.1` = c(0.866134859, 0, 0.405387979,
0, 0, 0.274468991, 0, 0, 0, 0, 0, 0.352737127, 0.170514318, 0,
0, 0, 0, 0, 0, 0), `234.35` = c(1, 0, 0.446118481, 0, 0, 0.338427523,
0, 0, 0, 0, 0, 0.204601923, 0.343919727, 0, 0, 0, 0, 0, 0, 0),
`257.19` = c(0.732231652, 0, 0.666653103, 0, 0, 0.403078017,
0, 0, 0, 0, 0, 0.315665123, 1, 0, 0, 0, 0, 0, 0, 0), `361.84` = c(0.660960044,
0, 1, 0, 0, 0.202578329, 0, 0, 0, 0, 0, 0.320183046, 0.424361453,
0, 0, 0, 0, 0, 0, 0), `432.74` = c(0.47961801, 0, 0.48323321,
0, 0, 0.25926071, 0, 0, 0, 0, 0, 0.36362413, 0.43039587,
0, 0, 0, 0, 0, 0, 0), `506.34` = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0.22943212, 0.19354376, 0, 0, 0, 0, 0, 0, 0), `581.46` = c(0,
0.52783556, 0, 1, 0, 0, 0, 0.64407392, 0, 0.70701938, 0,
0.2596209, 0.29757967, 0, 0, 0, 0, 0, 0, 0), `651.71` = c(0,
0.32678969, 0, 0.36428195, 0, 0, 0, 0.64951761, 0, 0.80866933,
1, 0.18614028, 0.21567888, 0.32813633, 0, 0, 0, 0, 0, 0),
`732.59` = c(0, 0.229023369, 0, 0.312832425, 0, 0, 0, 0.696041374,
0, 0.590471454, 0, 0.108699479, 0.187935709, 0.275177957,
0, 0, 0, 0, 0, 0.243080694), `817.56` = c(0, 0.25668583,
0, 0.4003249, 0, 0, 0, 0.53376606, 0, 0.85524485, 0, 0.22539659,
0.27977127, 0.55089774, 0, 0, 0, 0, 0, 1), `896.24` = c(0,
0.31675535, 0, 0.50882005, 0, 0, 0, 0.74705458, 0.12936306,
1, 0, 0.1949139, 0.21957859, 0.75063327, 0, 0, 0, 0, 0, 0.63346358
), `971.77` = c(0, 0.27811949, 0, 0.48419038, 0, 0, 0, 0.8563439,
0.39897143, 0.84491933, 0, 0.13935282, 0.17670128, 0.84111004,
0, 0, 0, 0, 0, 0), `1038.91` = c(0, 1, 0, 0.52506752, 0,
0, 0, 1, 1, 0.85617714, 0, 0.13507463, 0, 1, 0, 0, 0, 0,
0, 0), Clusters = structure(c(222L, 88L, 237L, 88L, 145L,
155L, 143L, 88L, 122L, 88L, 97L, 180L, 260L, 102L, 186L,
145L, 149L, 149L, 145L, 106L), .Label = c("10", "10,11",
"10,11,12", "10,11,12,13", "10,11,12,13,14", "10,11,12,13,14,15",
"10,11,12,13,14,15,16", "10,11,12,13,14,15,16,17,18", "10,11,12,13,14,15,16,17,18,19",
"10,11,12,13,14,15,16,17,18,19,20", "10,11,12,14", "10,11,12,14,15",
"10,11,12,14,15,16", "10,11,12,14,15,16,17,18", "10,11,12,14,15,16,17,18,19",
"10,11,12,14,15,16,17,18,19,20", "10,11,12,14,15,17,18,19",
"10,11,12,15,16,17", "10,11,14", "10,11,15", "10,11,15,16,17",
"10,11,16", "10,11,17", "10,11,20", "10,12", "10,14,15,16",
"10,14,15,16,17,18,19", "10,15", "10,15,16", "10,15,16,18",
"10,16,19", "10,18,19,20", "10,19", "10,19,20", "10,20",
"11", "11,12", "11,12,13", "11,12,13,14", "11,12,13,14,15",
"11,12,13,14,15,16", "11,12,13,14,15,16,17,18", "11,12,13,14,15,16,17,18,19",
"11,12,13,14,15,16,17,18,19,20", "11,12,13,14,15,16,18,19",
"11,12,14,15", "11,12,14,15,16,17", "11,12,14,15,16,17,18",
"11,12,14,15,16,17,18,19", "11,12,14,15,16,17,18,19,20",
"11,12,18", "11,12,19", "11,12,20", "12", "12,13", "12,13,14",
"12,13,14,15", "12,13,14,15,16", "12,13,14,15,16,17,18",
"12,13,14,15,16,17,18,19,20", "12,14", "12,14,15", "12,14,15,16",
"12,14,15,16,17", "12,14,15,16,17,18", "12,14,15,16,17,18,19",
"12,14,15,16,17,18,19,20", "12,14,15,16,20", "12,14,15,18,19,20",
"12,15", "12,16", "12,16,17,18", "12,18,19,20", "12,19,20",
"12,20", "13", "13,14", "13,14,15", "13,14,15,16,17,18,19,20",
"13,16", "13,20", "14", "14,15", "14,15,16", "14,15,16,17",
"14,15,16,17,18", "14,15,16,17,18,19", "14,15,16,17,18,19,20",
"14,15,16,18", "14,15,17", "14,15,18", "14,16", "14,16,17",
"14,16,17,18,19,20", "14,18,19,20", "14,19", "15", "15,16",
"15,16,17", "15,16,17,18", "15,16,17,18,19", "15,16,17,18,19,20",
"15,20", "16", "16,17", "16,17,18", "16,17,18,19", "16,17,18,19,20",
"16,17,18,20", "16,17,19", "16,18,19,20", "16,19,20", "17",
"17,18", "17,18,19", "17,18,19,20", "17,18,20", "17,19,20",
"17,20", "18", "18,19", "18,19,20", "19", "19,20", "2", "2,19,20",
"2,3", "2,3,4", "2,3,4,5", "2,3,4,5,11", "2,3,4,5,6", "2,3,4,5,6,7,8",
"2,3,4,5,6,7,8,11,12", "2,3,4,5,6,7,8,9", "2,3,4,5,6,7,8,9,10",
"2,3,4,5,6,7,8,9,10,11", "2,3,4,5,6,7,8,9,10,11,12", "2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20",
"2,4", "2,5", "2,5,6,7", "20", "3", "3,18", "3,4", "3,4,10",
"3,4,20", "3,4,5", "3,4,5,6", "3,4,5,6,7", "3,4,5,6,7,8",
"3,4,5,6,7,8,9", "3,4,5,6,7,8,9,10", "3,4,5,6,7,8,9,10,11",
"3,4,5,6,7,8,9,10,11,12", "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17",
"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18", "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20",
"3,4,8", "3,4,8,9", "3,5", "3,7", "3,9", "4", "4,5", "4,5,12,13",
"4,5,16", "4,5,6", "4,5,6,16,17,18,19,20", "4,5,6,20", "4,5,6,7",
"4,5,6,7,8", "4,5,6,7,8,10,11", "4,5,6,7,8,9", "4,5,6,7,8,9,10",
"4,5,6,7,8,9,10,11", "4,5,6,7,8,9,10,11,12", "4,5,6,7,8,9,10,11,12,13,14,15",
"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19", "4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20",
"4,5,6,7,8,9,10,11,12,14,15,16,17,18,19,20", "4,5,6,7,8,9,16,17",
"4,5,7,8,9,10,11,12,13,14,15,16,17,18,19,20", "4,6,7", "4,7,13",
"5", "5,11,12,14,15,16,17,18,19", "5,14", "5,14,15,16", "5,16,19",
"5,17,18,19,20", "5,18", "5,6", "5,6,7", "5,6,7,10", "5,6,7,8",
"5,6,7,8,10", "5,6,7,8,9", "5,6,7,8,9,10", "5,6,7,8,9,10,11",
"5,6,7,8,9,10,11,12", "5,6,7,8,9,10,11,12,13", "5,6,7,8,9,10,11,12,13,14",
"5,6,7,8,9,10,11,12,13,14,15,16", "5,6,7,8,9,10,11,12,13,14,15,16,17,18",
"5,6,7,8,9,10,11,12,13,14,15,16,17,18,19", "5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20",
"5,6,7,8,9,16,17,18,19,20", "5,6,8", "5,7,8,9,10", "5,7,8,9,10,14,15,16,17,18",
"5,8", "6", "6,7", "6,7,16", "6,7,8", "6,7,8,10,11,12,15,16,17,18",
"6,7,8,19", "6,7,8,9", "6,7,8,9,10", "6,7,8,9,10,11", "6,7,8,9,10,11,12",
"6,7,8,9,10,11,12,13,14", "6,7,8,9,10,11,12,13,14,15,16,17",
"6,7,8,9,10,11,12,13,14,15,16,17,18,19", "6,7,8,9,10,11,12,13,14,15,16,17,18,19,20",
"6,7,8,9,10,11,12,14,15,16", "6,7,8,9,10,18,19", "7", "7,10,11,14,15",
"7,12", "7,8", "7,8,12", "7,8,9", "7,8,9,10", "7,8,9,10,11",
"7,8,9,10,11,12", "7,8,9,10,11,12,13", "7,8,9,10,11,12,13,14,15,16",
"7,8,9,10,11,12,13,14,15,16,17,18", "7,8,9,10,11,12,13,14,15,16,17,18,19",
"7,8,9,10,11,12,13,14,15,16,17,18,19,20", "7,8,9,10,11,12,14,15,16,17,18,19",
"7,8,9,10,11,12,14,15,16,17,18,19,20", "7,8,9,10,12,15,16,17,18",
"7,9,10,11,12,13,14,15,16,17,18,19,20", "8", "8,10", "8,10,20",
"8,14,15,16,17,18,19,20", "8,16,17", "8,9", "8,9,10", "8,9,10,11",
"8,9,10,11,12", "8,9,10,11,12,13,14", "8,9,10,11,12,13,14,15",
"8,9,10,11,12,13,14,15,16", "8,9,10,11,12,13,14,15,16,17,18",
"8,9,10,11,12,13,14,15,16,17,18,19", "8,9,10,11,12,13,14,15,16,17,18,19,20",
"8,9,10,11,12,14,15,16", "8,9,10,11,12,14,15,16,17,18,19,20",
"8,9,10,14,15,16,17,18,19,20", "8,9,17", "9", "9,10", "9,10,11",
"9,10,11,12", "9,10,11,12,13,14,15,16,17", "9,10,11,12,13,14,15,16,17,18",
"9,10,11,12,13,14,15,16,17,18,19", "9,10,11,12,13,14,15,16,17,18,19,20",
"9,10,11,12,14,15,16", "9,10,11,12,14,15,16,17,18", "9,10,11,12,14,15,16,17,18,19",
"9,10,11,12,14,15,16,17,18,19,20", "9,10,11,12,16,17,18,19,20",
"9,10,11,14,15,16,17", "9,10,12,14,15,16,17", "9,10,14,15",
"9,11,12", "9,11,12,14", "9,12,14", "9,20"), class = "factor")), .Names = c("10",
"33.95", "58.66", "84.42", "110.21", "134.16", "164.69", "199.1",
"234.35", "257.19", "361.84", "432.74", "506.34", "581.46", "651.71",
"732.59", "817.56", "896.24", "971.77", "1038.91", "Clusters"
), row.names = c("at1g01050.1", "at1g01080.1", "at1g01090.1",
"at1g01320.2", "at1g01470.1", "at1g01800.1", "at1g01910.5", "at1g01960.1",
"at1g01980.1", "at1g02150.1", "at1g02470.1", "at1g02500.2", "at1g02560.1",
"at1g02780.1", "at1g02816.1", "at1g02880.2", "at1g02920.1", "at1g02930.2",
"at1g03030.1", "at1g03090.2"), class = "data.frame")
The last column (Clusters) and the row.names are what matter for us. This column says in which columns we can find any abundance for that gene. It doesn't matter to me exactly which cluster a gene is in, but which genes come together with it.
Let's use an example:
Those genes belong to the same cluster (cluster 5) in data1.
at1g09640.1
at1g07250.1
at1g08200.1
at1g09300.2 ##
at1g09490.2 ## Those
at1g09760.1 ##
at1g09780.1
If we analyze another data set (data2), we can see that some of those genes can be found together again. Maybe it's a different cluster (cluster 20) or so, but they are together and that's what is most important for me.
at1g02880.3
at1g01220.1
at1g09300.2 ##
at1g09490.2 ## Those
at1g09760.1 ##
at1g02130.1
I have about 15 similar data sets and I would like to be able to ask R: show me genes which can be found together in 15 of 15 data sets, or 13 of 15 data sets, and so on.
Any ideas?

First, you need to turn those comma-delimited lists into columns - it is much easier to work with them that way. Then, you want to find which genes have matching columns. Finally, you can aggregate to get totals of how many times each gene matches other genes.
Note that you will have both orders of genes, as well as genes matched with themselves. Also, the "Clusters" column will tell you how many times they were in the same exact set of clusters.
This will run in O(n^2) time, meaning that doubling the number of genes analyzed will quadruple the time. My quick timing tests estimate it would take 15 hours on my computer to do 15 data frames of 2300 rows.
library(plyr)
# Tables to compare; each needs gene ids as row names and a Clusters column.
frame_list <- list(tbl_col_clu1, tbl_col_clu2, tbl_col_clu3)
turn_numbers_into_columns <- function(x) {
  # Adds one indicator column (value 1) per cluster id listed in x$Clusters.
  #
  # Args:
  #   x: a data.frame (normally a single row) with a `Clusters` column that
  #      holds a comma-delimited list of cluster ids, e.g. "3,4,10".
  #      Character and factor columns are both accepted.
  #
  # Returns:
  #   x with one extra column per cluster id, each set to 1. Only the first
  #   element of x$Clusters is parsed. x is returned unchanged when that
  #   element is NA or empty.

  # strsplit() rejects factors, so coerce first.
  cluster_ids <- strsplit(as.character(x$Clusters), ",")[[1]]
  # Drop NA / empty / repeated ids: they cannot name a column.
  cluster_ids <- unique(cluster_ids[!is.na(cluster_ids) & nzchar(cluster_ids)])
  if (length(cluster_ids) > 0) {
    x[, cluster_ids] <- 1
  }
  return(x)
}
get_comparison <- function(current_table) {
  # Builds the gene-vs-gene comparison data frame for one input table.
  #
  # Args:
  #   current_table: data.frame with gene ids as row names and a `Clusters`
  #                  column of comma-delimited cluster ids.
  #
  # Returns:
  #   A data frame holding, for every pairing of gene groups (both orders and
  #   each gene with itself), the column-wise result of `x == y` plus the two
  #   gene ids in `gene` and `gene2`.
  gene_clusters <- data.frame(
    "gene" = row.names(current_table),
    "Clusters" = as.character(current_table$Clusters),
    stringsAsFactors = FALSE)

  # Expand each gene's cluster list into per-cluster columns.
  split_f <- adply(gene_clusters, 1, turn_numbers_into_columns)

  # Element-wise equality of two gene rows, labelled with both gene ids.
  compare_pair <- function(x, y) {
    output <- as.data.frame(x == y)
    output$gene <- x$gene
    output$gene2 <- y$gene
    output
  }

  # This is the slow part: every gene is compared against every gene.
  ddply(split_f, "gene", function(x) {
    ddply(split_f, "gene", function(y) compare_pair(x, y))
  })
}
# Run get_comparison() on every table in frame_list and combine the results.
combined_frame <- ldply(frame_list, get_comparison)
# For each (gene, gene2) pair, sum every remaining column, ignoring NAs.
sum_frame <- aggregate(
combined_frame[, !(names(combined_frame) %in% c("gene", "gene2"))],
by = combined_frame[, c("gene", "gene2")],
FUN = sum,
na.rm = T)
# Open the aggregated result in the data viewer.
View(sum_frame)
If you had consistently the same set of genes and groupings, you could turn everything into arrays, which run faster than data frames, cutting your time by a factor of about six. The part that runs very slowly would be replaced with something like this. It returns 3-dimensional arrays that you could add together.
# Array-based variant of the pairwise comparison: every row x of split_f is
# compared element-wise with every row y of split_f.
comparison_frame <- aaply(split_f, 1, function(x) {
# NOTE(review): print(x) runs once per outer row; it looks like leftover
# debugging output -- confirm before running on large inputs.
print(x)
output <- aaply(split_f, 1, function (y) {
# Store the element-wise equality of x and y as a 1 x length(x) array.
output <- array(x == y, c(1, length(x)))
return(output)
})
return(output)
})

Throw them into SPMF with the Apriori or FPGrowth algorithm. SPMF expects its input as a file of comma-separated sequences of integers (you may have to convert your data). Each sequence is on a separate line:
1,2,4,10
3,2,1,11,12
2,5,14,5
You invoke it like this:
java -jar spmf.jar run FPGrowth sequences.txt output.txt 35% 90%
The first number is the minimal support (how many sets should contain your group for it to be considered a group). SPMF contains different algorithms; you can try them to see which one fits you best.

Related

Loop for dynamic rows in R

I don't know If my question is well thought out. I'll try to explain as best as I can. I'm trying to create this model originaly made in Excel into R. The function basically is a sum of products between these four matrices. The left matrices are interaction matrices, so the variables are in the rows as well in the columns. The other two matrices describe the behaviour as times passes, taking data from the previous time step. In excel this is roughly the way it's built:
The cell where the function is applied is the one with the red arrow. I made the following code, starting by making the main function, then making the four matrices and then the loop for applying the function to the matrices.
### MAIN FUNCTION ### This is the main sum product function
# Computes dxit = (-xit * log(xit)) * (crossprod(alfa, ksim_evaluar) +
# crossprod(beta, dkism_evaluar)) * dt and returns it.
#   xit           - value(s) used in the -xit * log(xit) term
#   ksim          - accepted but never referenced in the body
#   alfa, beta    - first arguments of the two crossprod() calls
#   dkism_evaluar - second argument of the beta cross-product
#   dt            - multiplier applied to the final product
func_dxit <- function(xit, ksim, alfa, beta, dkism_evaluar, dt) {
xit_log <- log(xit)
prod_xit <- (-xit * xit_log)
# NOTE(review): `ksim_evaluar` is not a parameter of this function, so it is
# looked up in the enclosing environment, while the `ksim` parameter is unused.
# Presumably the second parameter was meant to be `ksim_evaluar` -- confirm.
suma_alfa <- crossprod(alfa, ksim_evaluar)
suma_beta <- crossprod(beta, dkism_evaluar)
suma_alfabeta <- (suma_alfa + suma_beta)
dxit <- (prod_xit * suma_alfabeta) * dt
return(dxit)
}
Where "xit" is the value of the element in the first row and first column of the 3rd matrix. "alfa" is the row of left most matrix, "beta" is the row of second one, and "ksim_evaluar" and "dksim_evaluar" are rows of the 3rd and 4th matrices respectively.
### INITIAL DATA ###
ini <- c(0.5,0.05,0.25,0.5,0.3,0.05,0.5,0.5,0.3,0.25,0.2,0.1) # initial values, these are initial values of ksim at time 0, or row 1.
dt <- 0.1 # constant
### ALFA AND BETA MATRICES ###
input_alfa <- matrix(c(
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 0), nrow = 12, ncol = 12, byrow = TRUE)
colnames(input_alfa) <- c("unidades de recursos", "volumen de corta",
"acuerdos de venta", "ingresos", "ganancias",
"sanciones", "trabajo comunitario",
"eficiencia de aprovechamiento",
"regeneracion artificial", "esfuerzo de reforestacion",
"regeneracion natural", "productividad")
rownames(input_alfa) <- c("unidades de recursos", "volumen de corta",
"acuerdos de venta", "ingresos", "ganancias",
"sanciones", "trabajo comunitario",
"eficiencia de aprovechamiento",
"regeneracion artificial", "esfuerzo de reforestacion",
"regeneracion natural", "productividad")
input_beta <- matrix(c(
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), nrow = 12, ncol = 12, byrow = TRUE)
colnames(input_beta) <- c("unidades de recursos", "volumen de corta",
"acuerdos de venta", "ingresos", "ganancias",
"sanciones", "trabajo comunitario",
"eficiencia de aprovechamiento",
"regeneracion artificial", "esfuerzo de reforestacion",
"regeneracion natural", "productividad")
rownames(input_beta) <- c("unidades de recursos", "volumen de corta",
"acuerdos de venta", "ingresos", "ganancias",
"sanciones", "trabajo comunitario",
"eficiencia de aprovechamiento",
"regeneracion artificial", "esfuerzo de reforestacion",
"regeneracion natural", "productividad")
Next I made the empty KSIM and DKSIM matrices, except for KSIM which first row is "ini" the initial values.
### KSIM AND DKSIM MATRICES ###
ksim <- matrix(c(
ini,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
), nrow = 26, ncol = 12, byrow = TRUE)
colnames(ksim) <- c("unidades de recursos", "volumen de corta",
"acuerdos de venta", "ingresos", "ganancias",
"sanciones", "trabajo comunitario",
"eficiencia de aprovechamiento",
"regeneracion artificial", "esfuerzo de reforestacion",
"regeneracion natural", "productividad")
rownames(ksim) <- c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
"11", "12", "13", "14", "15", "16", "17", "18", "19",
"20", "21", "22", "23", "24", "25")
dksim <- matrix(c(
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
), nrow = 26, ncol = 12, byrow = TRUE)
colnames(dksim) <- c("unidades de recursos", "volumen de corta",
"acuerdos de venta", "ingresos", "ganancias",
"sanciones", "trabajo comunitario",
"eficiencia de aprovechamiento", "regeneracion artificial",
"esfuerzo de reforestacion", "regeneracion natural", "productividad")
rownames(dksim) <- c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
"11", "12", "13", "14", "15", "16", "17", "18", "19",
"20", "21", "22", "23", "24", "25")
Next is to fill up the matrices using a loop that iterates the main function
# Calls func_dxit() once per row index i and stores the result in row i of dksim.
for (i in 1:26){
xit <- as.vector(ksim[i,])
# The next four vectors always read row 1; they do not depend on i.
alfa <- as.vector(input_alfa[1,])
beta <- as.vector(input_beta[1,])
ksim_evaluar <- as.vector(ksim[1,])
dksim_evaluar <- as.vector(dksim[1,])
# NOTE(review): the argument below is spelled `dkism_evaluar`, but the variable
# assigned above is `dksim_evaluar` -- confirm which spelling is intended.
dksim[i,] <- func_dxit(xit, ksim, alfa, beta, dkism_evaluar,dt)
# NOTE(review): return() only makes sense inside a function and no enclosing
# function is visible here -- confirm whether this line should be removed.
return (dksim)
}
This does iterate the function over the whole row in dksim. The problem is that for every iteration in the same row, it has to take the "alfa" and "beta" values of the next variable, something that my loop doesn't achieve. That is, it's correctly evaluating the same row for "ksim_evaluar" and "dksim_evaluar" (same time step) but it has to take the next row for "alfa" and "beta" (next variable) as you can see in the green arrow compared with the first image.
For the next iteration it must evaluate the 3rd row, for the next one the 4th and so on. Should this be a nested loop? I hope I explained this correctly, thanks a lot in advance.

Get edge lengths as spatial distances in a network in R

I have a list of nodes with spatial coordinates nodes_geo. The first column is the nodes' identifier, the other 2 columns are the x and y coordinates:
nodes_geo <- structure(list(grid_grid.g9.nodes = c(27, 28, 29, 40, 41, 42,
43, 55, 56, 58, 69, 81, 94, 98, 108, 110, 113, 114, 123, 124,
128, 138, 139, 141), grid_grid.coords.x1 = c(15.504078, 15.704078,
15.904078, 15.104078, 15.304078, 15.504078, 15.704078, 15.104078,
15.304078, 15.704078, 14.904078, 14.304078, 13.904078, 14.704078,
13.704078, 14.104078, 14.704078, 14.904078, 13.704078, 13.904078,
14.704078, 13.704078, 13.904078, 14.304078), grid_grid.coords.x2 = c(43.835623,
43.835623, 43.835623, 44.035623, 44.035623, 44.035623, 44.035623,
44.235623, 44.235623, 44.235623, 44.435623, 44.635623, 44.835623,
44.835623, 45.035623, 45.035623, 45.035623, 45.035623, 45.235623,
45.235623, 45.235623, 45.435623, 45.435623, 45.435623)), class = "data.frame", row.names = c(NA,
-24L))
And I have an adjacency matrix for these nodes adjacency_matrix:
adjacency_matrix <- structure(c(0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1,
1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1,
1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), .Dim = c(24L, 24L), .Dimnames = list(
c("27", "28", "29", "40", "41", "42", "43", "55", "56", "58",
"69", "81", "94", "98", "108", "110", "113", "114", "123",
"124", "128", "138", "139", "141"), c("27", "28", "29", "40",
"41", "42", "43", "55", "56", "58", "69", "81", "94", "98",
"108", "110", "113", "114", "123", "124", "128", "138", "139",
"141")))
I would like to get the spatial length of edges in a network based on the above adjacency matrix. I would also like to be able to do the same for random networks generated from the real network with the same number of edges. The goal is to compare the mean edge length of the real network and that of the random networks for hypothesis testing.
Here is a starting point using the data from above (note that we assume these
coordinates are planar with Euclidean distances, so you should really project
first):
library(spatstat)
nodes_ppp <- as.ppp(nodes_geo[,c(2:3,1)], W = bounding.box.xy(nodes_geo[,-1]))
head(as.data.frame(nodes_ppp))
#> x y marks
#> 1 15.50408 43.83562 27
#> 2 15.70408 43.83562 28
#> 3 15.90408 43.83562 29
#> 4 15.10408 44.03562 40
#> 5 15.30408 44.03562 41
#> 6 15.50408 44.03562 42
a_mat <- adjacency_matrix > 0 # Logical matrix required
typeof(adjacency_matrix)
#> [1] "double"
typeof(a_mat)
#> [1] "logical"
network <- linnet(nodes_ppp, a_mat) # Linear network
#> Warning: Network is not connected
plot(network)
segs <- as.psp(network) # Segments of linear network
rslt <- lengths_psp(segs) # Segment lengths
hist(rslt)

How can I make this replacement of values based on order more computationally efficient in R? [duplicate]

This question already has answers here:
Get value of a matrix with row-index and column-index [duplicate]
(2 answers)
Closed 2 years ago.
I have a df of 32 columns and just under a million rows. The columns are the POINTID (individual id), First (the year that an event first happened), and then 30 columns of years with binary occurrence data. I would like the first occurrence in each row (currently stored as a 1, the same as all other occurrences) to be changed to a 2, so that I can differentiate between the first event and repeat events. I've tried doing this with the tidyverse, but even then it is taking forever. I can't tell if my code is just wrong or if it's not computationally efficient enough. I tested it on a smaller dataset and it seemed to work in the long format but not the wide, so I'm thinking it's an efficiency issue, because the pivot_longer table generated is about 35 million rows long.
Can anyone help me understand why this isn't working or how to do it in a way that computes faster?
classifications %>%
pivot_longer(-c(1,32),names_to="Years", values_to="Present")%>%
group_by(POINTID)%>%
mutate(Present=replace(Present, Years==first, 2))
A reduced version of my DF is below:
> dput(classifications)
structure(list(POINTID = 2:11, first = structure(c(33L, 33L,
33L, 33L, 1L, 33L, 33L, 1L, 1L, 36L), .Label = c("X1985", "X1986",
"X1987", "X1988", "X1989", "X1990", "X1991", "X1992", "X1993",
"X1994", "X1995", "X1996", "X1997", "X1998", "X1999", "X2000",
"X2001", "X2002", "X2003", "X2004", "X2005", "X2006", "X2007",
"X2008", "X2009", "X2010", "X2011", "X2012", "X2013", "X2014",
"X2015", "X2016", "X2017", "X2018", "X2019", "X2020"), class = "factor"),
X1990 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X1991 = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0), X1992 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0), X1993 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X1994 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), X1995 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), X1996 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X1997 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), X1998 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), X1999 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2000 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), X2001 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), X2002 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2003 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), X2004 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), X2005 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2006 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), X2007 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), X2008 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2009 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), X2010 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), X2011 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2012 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), X2013 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), X2014 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X2015 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), X2016 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), X2017 = c(1, 1, 1, 1, 0, 1, 1, 0, 0, 0), X2018 = c(1,
0, 0, 0, 0, 0, 0, 0, 0, 0), X2019 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), X2020 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 1)), row.names = c(NA,
10L), class = "data.frame")
You can do this keeping the data in wide format with vectorised operations of row/column subsetting. We get the column index using match.
# Build a two-column (row, column) index matrix: row i is paired with the
# position of the column whose name equals classifications$first[i].
# seq_len() is used instead of 1:nrow() so that an empty data frame yields
# an empty index rather than c(1, 0).
mat <- cbind(seq_len(nrow(classifications)),
             match(classifications$first, names(classifications)))
# match() returns NA when `first` names a column that is not present in the
# data frame; an NA is not a usable position, so drop those pairs.
mat <- mat[!is.na(mat[, 2]), , drop = FALSE]
# Matrix indexing overwrites exactly one cell per remaining row.
if (nrow(mat) > 0) {
  classifications[mat] <- 2
}

Find the smallest distance between the profiles

I would like to find the smallest distance between the profiles stored in a data frame. I am interested especially in one row in comparison to the rest of the rows stored in the data frame.
That's a data frame:
structure(list(`10` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `34` = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 393090, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6718400,
0, 311350, 0), `59` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2164949.7,
4834137.6, 0, 0, 0, 1187816.7, 0, 0, 0, 0, 0, 0, 1340912.5, 0
), `84` = c(0, 0, 0, 0, 0, 0, 0, 0, 8607100, 0, 0, 17586713.2,
22629743.6, 0, 0, 0, 2808791.7, 0, 0, 4026222.5, 0, 0, 0, 1981900,
0), `110` = c(2296000, 0, 0, 0, 0, 2140221.7, 0, 0, 5809230.6,
0, 0, 37134898.5, 3861828.7, 2553100, 0, 12075845.8, 0, 0, 1272950,
8695273, 0, 0, 2657180, 2710080, 0), `134` = c(0, 0, 0, 1176150,
0, 1329596.7, 1471000, 0, 6511934, 6511934, 0, 18709227.3, 0,
1041211.2, 0, 6544176.9, 0, 0, 2412651.7, 7724956.9, 2878418.3,
0, 8620131.7, 2386972.8, 0), `165` = c(0, 1226610, 0, 1345098.7,
2083771.9, 0, 1808231.4, 0, 0, 10742997.7, 0, 13060798.9, 0,
538340, 538340, 2791649.5, 0, 0, 6217622, 1316097.1, 4716931.8,
0, 6615816.9, 1510532, 0), `199` = c(0, 1571525, 0, 1903038.3,
1676700, 0, 888832.2, 0, 0, 9084418.6, 0, 11189460.1, 0, 0, 1807662.5,
2564275, 0, 0, 18080359.7, 0, 0, 0, 2397710.2, 1717949.2, 0),
`234` = c(0, 1314900, 2482696, 1325684, 0, 0, 0, 0, 0, 7321432.7,
0, 9843409.2, 0, 0, 1073341.7, 2762775, 0, 0, 9335312.8,
0, 0, 0, 1950788.2, 1509100, 0), `257` = c(0, 1568700, 14604298.7,
940162.2, 0, 0, 0, 0, 0, 4779505.9, 0, 9691692.4, 0, 0, 735290,
2650165, 0, 2311383.7, 5193383.4, 0, 0, 0, 1341998.7, 1225325.6,
0), `362` = c(0, 0, 4190740.5, 288800, 0, 0, 0, 0, 0, 4846634.8,
0, 9574498.7, 0, 0, 0, 1425600, 0, 8339312.1, 3877892.5,
0, 0, 0, 1752866.7, 0, 0), `433` = c(0, 0, 773280, 0, 0,
0, 0, 0, 0, 3926582.8, 3926582.8, 5962586.5, 0, 0, 0, 1041400,
0, 1972909.3, 1895439.4, 0, 0, 0, 963891.2, 0, 1109800),
`506` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9332272, 0, 0, 0,
0, 0, 0, 2219100, 0, 0, 0, 0, 0, 0, 0), `581` = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 4371537.1, 0, 0, 0, 0, 0, 0, 2428800,
0, 0, 0, 0, 0, 0, 0), `652` = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1689871.4, 0, 0, 0, 0, 0, 0, 988399.7, 0, 0, 0, 0, 0,
0, 0), `733` = c(0, 0, 0, 0, 0, 0, 0, 1250100, 0, 0, 1754205.3,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `818` = c(0, 0,
0, 0, 0, 0, 0, 517340, 0, 0, 1149227.6, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), `896` = c(0, 0, 0, 0, 0, 0, 0, 579846.7,
0, 0, 985931.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
`972` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 858255.5, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1039` = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 848993.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0)), .Names = c("10", "34", "59", "84", "110", "134",
"165", "199", "234", "257", "362", "433", "506", "581", "652",
"733", "818", "896", "972", "1039"), row.names = c("Mark_1",
"Mark_2", "Alex_1", "Katrin_1", "Georg_1", "Martin_1",
"Tim_1", "Tom_1", "Mike_1", "Mike_2", "Mike_3",
"Hare_1", "Dea_1", "Monty_1", "Monty_2", "Niko_1",
"Lee_1", "Marq_1", "Otto_1", "Priaq_1", "Surkta_1",
"Norsa_1", "Norsa_2", "Quer_1", "Quer_2"), class = "data.frame")
So the row named Katrin_1 is the one which is interesting for me. I would like to find which rows have the smallest euclidean distance to Katrin_1. Let say 3-5 rows.
Let's get rid of the Katrin_1 row with df[!rownames(df) %in% "Katrin_1", ], subtract df["Katrin_1", ] from each of the remaining rows with sweep, find the (squared) Euclidean distances by squaring the resulting matrix element-wise and using rowSums, and use which.min to get the final result:
names(which.min(rowSums(sweep(df[!rownames(df) %in% "Katrin_1", ], 2, as.numeric(df["Katrin_1", ]), `-`)^2)))
# [1] "Mark_2"
This should be much more efficient than using dist, as dist would compute all possible distances, while we need only a few.

Remove multiple matching columns from multiple character string

I have a dataframe of over 200 variables, many of which end with a code for a given species. I want to eliminate any columns that contain one of several codes, contained in a separate vector of character strings. How can I remove these multiple columns matching the multiple codes at the same time? The column names don't match the code values exactly, but contain the codes at the end of the column name. For example:
"rev230" "rev3360" "rev3508"
Manually, I've done this (using the dplyr package):
sub = select(df, -contains("3781"), -contains("3751"), -contains("1408"),
-contains("1409"), -contains("4469"), -contains("1789"), -contains("4559"),
-contains("1453"),-contains("8"), -contains("3508"), -contains("4656"),
-contains("5131"), -contains("9999"))
This gets me what I want (eliminating all columns that contain data on the species matching these codes), but obviously this is very tedious.
I'd like something more like:
sub = select(df, -contains(species$codes))
# I realize this isn't the right syntax
I tried a loop to remove individual columns, using something like this:
foreach(i=1:length(species$codes), .combine=rbind)%do%
select(df, -contains(species$codes[i]))
but that didn't work either. Thanks in advance!
reproducible example:
Species codes (contained in a character vector):
dput(species)
c("3754", "3755", "3758", "3764", "3765", "3771", "3772", "3782",
"3761", "3762", "3763", "3767", "3768", "1790", "1412", "1413",
"1416", "1422", "1423", "1424", "1425", "1426", "1410", "1411",
"1414", "1415", "1420", "3770", "4740", "4470", "4472", "4474",
"4476", "4479", "4480", "1812", "1815", "1799", "4560", "3810",
"1440", "1441", "3302", "3295", "3560", "3360", "1940", "3840",
"570", "1050", "4710", "230")
Here are the first 10 rows of the data, with only columns for species data
dput(logsub)
structure(list(lbs3781 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 708), lbs3764 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3765 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 708), lbs3758 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3755 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3782 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), lbs3751 = c(0, 0, 4, 0, 0, 0, 21, 0, 18, 0), lbs3761 = c(0,
0, 0, 0, 0, 0, 18, 0, 0, 0), lbs3762 = c(0, 0, 4, 0, 0, 0, 3,
0, 0, 0), lbs3763 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3767 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3768 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), lbs3754 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3771 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3772 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), lbs1790 = c(0, 0, 0, 0, 0, 0, 0, 0, 18, 0), lbs1409 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 86), lbs1411 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), lbs1414 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs1415 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 86), lbs4740 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), lbs1420 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3770 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), lbs1408 = c(2508, 785, 57, 0, 132,
5003, 18, 104, 636, 0), lbs1412 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0), lbs1413 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs1416 = c(2331,
654, 57, 0, 81, 4284, 15, 104, 120, 0), lbs1422 = c(177, 0, 0,
0, 51, 719, 3, 0, 0, 0), lbs1423 = c(0, 131, 0, 0, 0, 0, 0, 0,
502, 0), lbs1424 = c(0, 0, 0, 0, 0, 0, 0, 0, 14, 0), lbs1425 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), lbs1426 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), lbs1410 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs4469 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), lbs4470 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), lbs4472 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
lbs4474 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs4476 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), lbs4479 = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), lbs4480 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0), lbs1789 = c(0, 0, 0, 863, 0, 0, 0, 0,
0, 98), lbs1812 = c(0, 0, 0, 863, 0, 0, 0, 0, 0, 27), lbs1815 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 71), lbs1799 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), lbs4559 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 12),
lbs4560 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 12), lbs3810 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), lbs1453 = c(0, 0, 5,
0, 0, 0, 21, 0, 15, 235), lbs1440 = c(0, 0, 5, 0, 0, 0, 21,
0, 15, 0), lbs1441 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3560 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3302 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 235), lbs3295 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
lbs0008 = c(0, 97, 99, 0, 0, 0, 0, 0, 0, 0), lbs1940 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3840 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), lbs1050 = c(0, 0, 31, 0, 0, 0, 0, 0, 0, 0),
lbs4710 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs570 = c(0,
97, 68, 0, 0, 0, 0, 0, 0, 0), lbs230 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), lbs3360 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3508 = c(0,
0, 5043, 0, 0, 0, 0, 0, 0, 0), lbs4656 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0), lbs9999 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
rev3781 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 1688.144979), rev3764 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), rev3765 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 1688.144979), rev3758 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0), rev3755 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev3782 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), rev3751 = c(0, 0, 7.063636364,
0, 0, 0, 33.44605263, 0, 32.53608247, 0), rev3761 = c(0,
0, 0, 0, 0, 0, 27.34105263, 0, 0, 0), rev3762 = c(0, 0, 7.063636364,
0, 0, 0, 6.105, 0, 0, 0), rev3763 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), rev3767 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev3768 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), rev3754 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), rev3771 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev3772 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), rev1790 = c(0, 0, 0, 0, 0, 0,
0, 0, 32.53608247, 0), rev1409 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 260.0068669), rev1411 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0
), rev1414 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev1415 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 260.0068669), rev4740 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0), rev1420 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0), rev3770 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev1408 = c(6349.327025,
2014.2837, 142.8362084, 0, 339.5618788, 13265.98305, 41.94345809,
235.6862428, 1835.487932, 0), rev1412 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0), rev1413 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
rev1416 = c(5841.249152, 1623.155767, 142.8362084, 0, 194.2835976,
11101.38378, 33.99320809, 235.6862428, 299.2968186, 0), rev1422 = c(508.0778723,
0, 0, 0, 145.2782813, 2164.599274, 7.95025, 0, 0, 0), rev1423 = c(0,
391.1279328, 0, 0, 0, 0, 0, 0, 1494.676782, 0), rev1424 = c(0,
0, 0, 0, 0, 0, 0, 0, 41.51433134, 0), rev1425 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0), rev1426 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0), rev1410 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev4469 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), rev4470 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), rev4472 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev4474 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), rev4476 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), rev4479 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), rev4480 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev1789 = c(0,
0, 0, 963.8520574, 0, 0, 0, 0, 0, 95.34540063), rev1812 = c(0,
0, 0, 963.8520574, 0, 0, 0, 0, 0, 30.02711217), rev1815 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 65.31828847), rev1799 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0), rev4559 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 12.94965112), rev4560 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 12.94965112
), rev3810 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev1453 = c(0,
0, 3.505617978, 0, 0, 0, 13.9460241, 0, 10.93726937, 225.778089
), rev1440 = c(0, 0, 3.505617978, 0, 0, 0, 13.9460241, 0,
10.93726937, 0), rev1441 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0
), rev3560 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev3302 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 225.778089), rev3295 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0), rev0008 = c(0, 180.3441341, 169.7750491,
0, 0, 0, 0, 0, 0, 0), rev1940 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0), rev3840 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev1050 = c(0,
0, 48.71428571, 0, 0, 0, 0, 0, 0, 0), rev4710 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0), rev570 = c(0, 180.3441341, 121.0607634,
0, 0, 0, 0, 0, 0, 0), rev230 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0), rev3360 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev3508 = c(0,
0, 2620.957866, 0, 0, 0, 0, 0, 0, 0), rev4656 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0), rev9999 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0)), .Names = c("lbs3781", "lbs3764", "lbs3765", "lbs3758",
"lbs3755", "lbs3782", "lbs3751", "lbs3761", "lbs3762", "lbs3763",
"lbs3767", "lbs3768", "lbs3754", "lbs3771", "lbs3772", "lbs1790",
"lbs1409", "lbs1411", "lbs1414", "lbs1415", "lbs4740", "lbs1420",
"lbs3770", "lbs1408", "lbs1412", "lbs1413", "lbs1416", "lbs1422",
"lbs1423", "lbs1424", "lbs1425", "lbs1426", "lbs1410", "lbs4469",
"lbs4470", "lbs4472", "lbs4474", "lbs4476", "lbs4479", "lbs4480",
"lbs1789", "lbs1812", "lbs1815", "lbs1799", "lbs4559", "lbs4560",
"lbs3810", "lbs1453", "lbs1440", "lbs1441", "lbs3560", "lbs3302",
"lbs3295", "lbs0008", "lbs1940", "lbs3840", "lbs1050", "lbs4710",
"lbs570", "lbs230", "lbs3360", "lbs3508", "lbs4656", "lbs9999",
"rev3781", "rev3764", "rev3765", "rev3758", "rev3755", "rev3782",
"rev3751", "rev3761", "rev3762", "rev3763", "rev3767", "rev3768",
"rev3754", "rev3771", "rev3772", "rev1790", "rev1409", "rev1411",
"rev1414", "rev1415", "rev4740", "rev1420", "rev3770", "rev1408",
"rev1412", "rev1413", "rev1416", "rev1422", "rev1423", "rev1424",
"rev1425", "rev1426", "rev1410", "rev4469", "rev4470", "rev4472",
"rev4474", "rev4476", "rev4479", "rev4480", "rev1789", "rev1812",
"rev1815", "rev1799", "rev4559", "rev4560", "rev3810", "rev1453",
"rev1440", "rev1441", "rev3560", "rev3302", "rev3295", "rev0008",
"rev1940", "rev3840", "rev1050", "rev4710", "rev570", "rev230",
"rev3360", "rev3508", "rev4656", "rev9999"), row.names = c(34367L,
48646L, 48715L, 48717L, 48722L, 48724L, 48743L, 48744L, 48781L,
48783L), class = "data.frame")
One option is select_if with grepl, as grepl returns a logical vector and you can include multiple patterns you want to match on via |.
A simple case where you want to remove two species would look like:
select_if(df, !grepl("3781|3751", names(df)) )
Aggregate all of your species into the right form for grepl via paste with collapse, and then use this within grepl.
select_if(df, !grepl(paste(species, collapse = "|"), names(df)) )

Resources