Cluster sequences in a network by their editing distance - in R - r

I have a data frame my_df with 10,000 different sequences of varying lengths (between 13 and 18 characters); each sequence is composed of the digits 0-3.
example of my data (60 lines) :
library(stringdist)
library(igraph)
library(reshape2)
structure(list(alfa_ch = c("2000000232003211","2000000331021", "20000003310320011", "20000003323331021",
"20000003331001","20000003331001", "20000003332021", "200000100331021",
"20000013011001","20000013301021", "2000001333331011", "20000023231031",
"200000233302001","20000023331011", "20000023331012", "20000023332021",
"200000233331021","20000030231011", "200000303323331021", "200000313301021",
"20000032031021","2000003220021", "2000003221011", "2000003231031",
"20000032311001","200000330330021", "2000003311211", "2000003331001",
"2000003331001","2000003331012", "20000033321012", "200000333231011",
"20000033323331021","20000033331021", "2000010320011", "20000103323331021",
"200001113011001","20000113011001", "20000120330021", "20000123033011",
"2000012331131","2000013011001", "2000013301021", "200001330231011",
"2000013323001","20000133231311", "20000133301001", "200001333331011",
"200001333331011","200001333331011", "200001333331011", "20000200331021",
"20000200331021","20000200331131", "20000203221011", "2000020333133011",
"20000212221111","20000213301021", "2000021331011", "200002223231011")),
row.names = c(1L,3L, 5L, 6L, 7L, 8L, 9L, 10L, 12L, 13L, 14L, 16L, 17L, 18L, 19L,20L, 21L,
23L, 24L, 27L, 29L, 31L, 32L, 33L, 34L, 35L, 38L, 41L,42L, 43L, 46L, 47L, 48L,
49L, 58L, 59L, 60L, 62L, 63L, 64L, 66L,68L, 71L, 72L, 73L, 74L, 75L, 77L, 78L,
79L, 80L, 81L, 82L, 83L,84L, 85L, 89L, 90L, 91L, 95L), class = "data.frame")
My goal is to cluster them by edit distance < 3.
# Pairwise Levenshtein (edit) distances between all sequences, as a square
# matrix with one row and one column per entry of my_df$alfa_ch.
dist_mtx <- as.matrix(
  stringdistmatrix(my_df$alfa_ch, my_df$alfa_ch, method = "lv")
)
# Label both dimensions with the sequences themselves so that melt() later
# reports each pair by sequence rather than by position.
rownames(dist_mtx) <- my_df$alfa_ch
colnames(dist_mtx) <- my_df$alfa_ch
# Blank out every pair that must not become an edge: distances above the
# cut-off, and zero distances (the diagonal and exact duplicate sequences).
# NOTE(review): `> 3` keeps distances 1, 2 and 3; use `>= 3` if the cut-off
# is meant to be strictly "distance < 3" -- confirm.
dist_mtx[dist_mtx > 3 | dist_mtx == 0] <- NA
Then I created an edge list, where the value column holds the edit distance between each pair of sequences:
# Long-format edge list: one row per remaining (non-NA) cell of dist_mtx.
# as.is = TRUE keeps the sequence labels as character strings instead of
# letting melt() convert the all-digit names to numbers.
edge_list <- melt(
  dist_mtx,
  na.rm = TRUE,
  varnames = c("seq1", "seq2"),
  as.is = TRUE
)
# The matrix is symmetric, so every pair appears as (a, b) and (b, a).
# Keep a single orientation, then drop repeats caused by duplicated
# sequences, so the undirected graph gets exactly one edge per pair.
edge_list <- edge_list[edge_list$seq1 < edge_list$seq2, ]
edge_list <- unique(edge_list)
then created the igraph object :
# Undirected graph with one vertex per distinct sequence and one edge per
# row of edge_list. Supplying the vertex table explicitly keeps sequences
# that have no neighbour within the cut-off as isolated vertices.
# unique() is required because vertex names must not repeat and my_df
# contains duplicated sequences.
igraph_obj <- igraph::graph_from_data_frame(
  edge_list,
  directed = FALSE,
  vertices = data.frame(name = unique(my_df$alfa_ch))
)
I then tried numerous approaches to cluster those sequences with the Louvain method, but I am still getting clusters whose members have an edit distance > 3 from one another. I'm aware that this might be because of the connected components.
so my questions are :
Is there a way to cluster the sequences so that, within each cluster, all members have an edit distance < 3 from each other?
Is there a way to identify the cluster centers (hubs) - I tried hubness.score() - and assign vertices to those centers while taking the edit distance into account?
this is my first post ,
i will appreciate any help

Related

Replace row names with the column value of another data frame based on column matches

I want to replace the row names of meth.kirp.cpg with anno$V1 if the existing row names matches anno$V2.
library(tidyverse)
# Attempt: for each row, look up the anno$V1 entry whose anno$V2 matches and
# use the result as the new row names.
# NOTE(review): str_which() is given two vectors of different lengths here;
# the traceback below reports exactly that ("Can't recycle `string` ... to
# match `pattern`"). The pipeline also appears to yield a whole data frame,
# not a character vector, as the value assigned to rownames() -- confirm.
# NOTE(review): this snippet uses colnames() while the traceback quotes
# rownames(); presumably the traceback reflects the code actually run.
rownames(meth.kirp.cpg) <- meth.kirp.cpg %>%
rowwise() %>%
mutate(out = anno$V1[str_which(colnames(meth.kirp.cpg), anno$V2)])
Traceback:
Error in `mutate()`:
ℹ In argument: `out = anno$V1[str_which(rownames(meth.kirp.cpg),
anno$V2)]`.
ℹ In row 1.
Caused by error in `str_detect()`:
! Can't recycle `string` (size 142513) to match `pattern` (size 365860).
Run `rlang::last_error()` to see where the error occurred.
Example data:
meth.kirp.cpg
> dput(meth.kirp.cpg[1:100,1:2])
structure(list(TCGA.2K.A9WE.01A = c(0.461440642939772, 0.143910373119058,
0.847164847154162, 0.737361955793681, 0.716794733144112, 0.351877113536983,
0.248986769373366, 0.0121360989202765, 0.876303885229884, 0.944311384947134,
0.0490407302658151, 0.0200484962577958, 0.0623434271852525, 0.489865398138095,
0.920994933496615, 0.92663932531651, 0.0149191766670711, 0.884749685210921,
0.446591784140497, 0.91113228700911, 0.912199953863369, 0.908167409366654,
0.386721526377863, 0.0386737340626713, 0.0347492896507038, 0.98309370597552,
0.0176080612232509, 0.91878387167279, 0.743683318738873, 0.939148492241393,
0.722471943330892, 0.613143449419421, 0.0111202783577944, 0.843823786705695,
0.836431557867031, 0.390282953982417, 0.027408710286304, 0.0222349236137297,
0.657221610108816, 0.861848830221141, 0.0433751011272091, 0.0281247935879252,
0.938960776959358, 0.919825831744144, 0.922071582222369, 0.874732275907705,
0.0287898761495033, 0.0266947996996682, 0.922915821025777, 0.95009866012662,
0.964858875373814, 0.106451342824246, 0.406100902807456, 0.0421684244823044,
0.0341277368595181, 0.805451068725895, 0.147595746750675, 0.602617067494429,
0.90660866745333, 0.922313274809095, 0.462291286891102, 0.502857899902497,
0.0292904155423265, 0.835117565787527, 0.146789494933407, 0.06805696389495,
0.970563583145203, 0.0379479981289824, 0.058526761439653, 0.938993650169269,
0.44761099556807, 0.558961729061086, 0.939778576056268, 0.0728795533192928,
0.812084345787681, 0.899377654465699, 0.940111049552295, 0.838186810388758,
0.715121288990262, 0.897506380407565, 0.0929678061732199, 0.99024632582796,
0.055583745670494, 0.835146654988372, 0.973309086845447, 0.651216797099359,
0.0218535991986461, 0.0999671036378156, 0.790540668893094, 0.980591855409854,
0.567883806155822, 0.774816434396113, 0.904434807209845, 0.16641097147085,
0.0102686285230525, 0.65243489007093, 0.917594420539083, 0.0147831247626457,
0.844679485594683, 0.65566679452182), TCGA.2Z.A9J1.01A = c(0.595894468074615,
0.0807243779293262, 0.867305510246114, 0.70680600651273, 0.217862460492399,
0.169408257004071, 0.173115013795265, 0.0108902025634162, 0.813866558997356,
0.938576461648791, 0.0426568318037534, 0.0133187057875756, 0.0540543120983417,
0.317547629906197, 0.89911570032979, 0.525131175543627, 0.0152198596492253,
0.586968687135673, 0.49896100615873, 0.946718072906056, 0.859306039060091,
0.91185524112895, 0.28077646371254, 0.0413484993379312, 0.169193526857136,
0.941230054689418, 0.0164701153466769, 0.928402415411224, 0.736184540407898,
0.946288965623826, 0.312150292032857, 0.403171876971832, 0.0091246246912222,
0.535149883791691, 0.801041308364712, 0.171664264695538, 0.022737572168221,
0.0164834707992085, 0.34399568227201, 0.690016503202975, 0.0390842331750004,
0.0270854886242561, 0.888936631403145, 0.911902815624012, 0.858247513475469,
0.877113632682254, 0.0342892379505875, 0.0387268488822914, 0.922299785913074,
0.926130065834329, 0.975692332236198, 0.105415153493416, 0.127593519059119,
0.0540003798276299, 0.030980833881057, 0.914299941557146, 0.0512267439881511,
0.307325891435045, 0.941037265659174, 0.927078967007025, 0.48873418258592,
0.259006924115841, 0.0278764868641079, 0.87768067729952, 0.302640875302654,
0.0706384569300761, 0.968762634771395, 0.0364352674378962, 0.0441231506131831,
0.8307385629478, 0.242575477196221, 0.513439830376976, 0.932449172188782,
0.0526229004254996, 0.81314353054328, 0.778591104943176, 0.95668645045373,
0.453172059602829, 0.250129171963381, 0.863470213940097, 0.0994627135023581,
0.989489689575077, 0.0472116225581592, 0.911407225108748, 0.825189076107663,
0.578029414148402, 0.018058167343065, 0.0855852777154159, 0.819733395638372,
0.988287891473147, 0.255899615791521, 0.643359326354994, 0.491979154678761,
0.0978562004864199, 0.0105671614378101, 0.48897100984416, 0.9024550858788,
0.0131702158217202, 0.81328537816321, 0.85890307119103)), row.names = c("cg00000029",
"cg00000165", "cg00000236", "cg00000289", "cg00000292", "cg00000321",
"cg00000363", "cg00000622", "cg00000658", "cg00000721", "cg00000734",
"cg00000769", "cg00000905", "cg00000924", "cg00000948", "cg00000957",
"cg00001245", "cg00001249", "cg00001261", "cg00001349", "cg00001364",
"cg00001446", "cg00001510", "cg00001582", "cg00001583", "cg00001687",
"cg00001747", "cg00001791", "cg00001809", "cg00001854", "cg00001874",
"cg00002033", "cg00002116", "cg00002145", "cg00002190", "cg00002224",
"cg00002236", "cg00002406", "cg00002426", "cg00002449", "cg00002464",
"cg00002490", "cg00002531", "cg00002591", "cg00002593", "cg00002597",
"cg00002660", "cg00002719", "cg00002769", "cg00002808", "cg00002809",
"cg00002810", "cg00002837", "cg00003091", "cg00003173", "cg00003181",
"cg00003287", "cg00003345", "cg00003513", "cg00003529", "cg00003578",
"cg00003625", "cg00003784", "cg00003969", "cg00003994", "cg00004055",
"cg00004067", "cg00004072", "cg00004082", "cg00004089", "cg00004105",
"cg00004121", "cg00004192", "cg00004207", "cg00004209", "cg00004429",
"cg00004533", "cg00004562", "cg00004608", "cg00004773", "cg00004818",
"cg00004883", "cg00004939", "cg00004963", "cg00004979", "cg00004996",
"cg00005010", "cg00005040", "cg00005072", "cg00005083", "cg00005112",
"cg00005166", "cg00005215", "cg00005297", "cg00005306", "cg00005390",
"cg00005437", "cg00005543", "cg00005617", "cg00005619"), class = "data.frame")
anno
> dput(anno[1:100,])
structure(list(V1 = c("TSPY4", "TTTY14", "TMSB4Y", "TBL1Y", "TMSB4Y",
"TSPY4", "RPS4Y2", "EIF1AY", "PCDH11Y", "TBL1Y", "ZFY", "FAM197Y2",
"TTTY14", "TSPY4", "ZFY", "NLGN4Y", "EIF1AY", "TSPY4", "TBL1Y",
"UTY", "PRKY", "ZFY", "CD24", "PRKY", "TSPY1", "CYorf15A", "TSPY2",
"TTTY15", "RPS4Y2", "UTY", "CYorf15A", "RPS4Y2", "TSPY2", "TBL1Y",
"TSPY3", "DDX3Y", "CYorf15A", "ZFY", "RBMY1F", "DDX3Y", "RPS4Y2",
"ZFY", "DDX3Y", "TTTY15", "BCORL2", "PCDH11Y", "KDM5D", "TTTY14",
"EIF1AY", "DDX3Y", "LOC100101121", "CYorf15A", "TTTY15", "TSPY1",
"TSPY1", "FAM197Y2", "TSPY4", "TMSB4Y", "DDX3Y", "TTTY15", "TTTY20",
"NLGN4Y", "TSPY4", "CYorf15A", "RPS4Y2", "KDM5D", "RBMY1J", "EIF1AY",
"KDM5D", "ZFY", "TGIF2LY", "HMGN5", "EBP", "UBL4A", "WDR13",
"MTM1", "BCOR", "ZCCHC12", "FTHL17", "PORCN", "NAA10", "PCDH11X",
"ARSE", "DOCK11", "PDK3", "LONRF3", "MAGIX", "PCYT1B", "SLC6A8",
"UBE2A", "TAF9B", "STARD8", "BCOR", "ZIC3", "IL1RAPL2", "TMSB4X",
"CLCN5", "LOC100133957", "SCML1", "GNL3L"), V2 = c("cg00050873",
"cg00212031", "cg00214611", "cg01707559", "cg02004872", "cg02011394",
"cg02050847", "cg02233190", "cg02494853", "cg02839557", "cg02842889",
"cg03052502", "cg03244189", "cg03443143", "cg03683899", "cg03706273",
"cg03750315", "cg04016144", "cg04042030", "cg04448376", "cg04689676",
"cg04840163", "cg05230942", "cg05480730", "cg05544622", "cg05621349",
"cg05865243", "cg05890011", "cg06322277", "cg06479204", "cg07731488",
"cg07747963", "cg08242338", "cg08921682", "cg09350919", "cg09856092",
"cg10076560", "cg10213302", "cg10267609", "cg10698069", "cg10841270",
"cg11131351", "cg14180491", "cg14741114", "cg15027426", "cg15295597",
"cg15329860", "cg15345074", "cg15422579", "cg15429127", "cg15682806",
"cg15682993", "cg15746461", "cg15810474", "cg15935877", "cg17834650",
"cg17837162", "cg18032798", "cg18077436", "cg25032547", "cg25071634",
"cg25518695", "cg25705492", "cg25756647", "cg26058907", "cg26517491",
"cg26983430", "cg26983535", "cg27049643", "cg27433982", "cg27539833",
"cg00008945", "cg00011200", "cg00011891", "cg00014152", "cg00016522",
"cg00016934", "cg00018261", "cg00021786", "cg00026186", "cg00072288",
"cg00072839", "cg00074638", "cg00112256", "cg00114625", "cg00114913",
"cg00116709", "cg00139317", "cg00140085", "cg00142683", "cg00192980",
"cg00200463", "cg00206414", "cg00240113", "cg00241296", "cg00241907",
"cg00264378", "cg00265812", "cg00266918", "cg00360365")), row.names = c(1L,
2L, 4L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 16L, 17L, 18L,
20L, 21L, 23L, 25L, 27L, 29L, 31L, 34L, 35L, 36L, 37L, 38L, 39L,
40L, 41L, 42L, 43L, 45L, 47L, 48L, 50L, 51L, 52L, 54L, 56L, 57L,
58L, 61L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 72L, 73L, 74L, 75L,
76L, 77L, 78L, 79L, 80L, 82L, 83L, 85L, 86L, 87L, 88L, 89L, 91L,
92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 100L, 101L, 102L, 103L,
104L, 105L, 106L, 107L, 108L, 109L, 110L, 111L, 112L, 113L, 114L,
115L, 116L, 117L, 118L, 119L, 120L, 122L, 123L, 124L, 125L), class = "data.frame")
There is no match between your cpg dataframe and annotation dataframe:
table(rownames(meth.kirp.cpg) %in% anno$V2)
# FALSE
# 100
The code below should work with your full data, assuming there are no duplicates and all row names are present in the annotation data frame:
# Position of each current row name (CpG id) within anno$V2; NA when the id
# is not annotated.
anno_idx <- match(rownames(meth.kirp.cpg), anno$V2)
# Use the annotated name (anno$V1) where a match exists and keep the
# original row name otherwise, because row names may not be NA.
new_names <- ifelse(
  is.na(anno_idx),
  rownames(meth.kirp.cpg),
  anno$V1[anno_idx]
)
# Row names must be unique; make.unique() appends .1, .2, ... when several
# ids map to the same anno$V1 value and leaves unique names unchanged.
rownames(meth.kirp.cpg) <- make.unique(new_names)

Plotting multiple different histograms based on vector of column names

I have the following dataframe that I want to plot a histogram for each column:
structure(list(ACTB = c(11.7087918, 13.1847403, 8.767737, 12.2949669,
12.399929, 12.130683, 9.816222, 10.700336, 11.862543, 12.479818,
12.48152, 11.798277, 12.0932696, 11.014992, 12.3496682, 11.9810211,
11.946094, 12.1517049, 11.6794028, 12.4895911, 12.787039, 12.2927522,
12.746232, 12.4428358, 11.6382198, 11.6833202, 12.3320067, 12.390378,
12.5550587, 11.597384, 11.7608624, 12.018702, 11.9211984, 11.7143178,
11.800693, 12.7543979, 12.7028472, 11.6509804, 11.5112258, 12.36468,
12.0704304, 12.5876125, 12.2929857, 11.764464, 12.3740263, 12.275172,
11.5247418, 11.9290723, 11.100383, 12.5631062, 10.647334, 12.265323,
11.457643, 12.194339, 11.468173, 12.355388, 12.3233796, 12.200504,
11.716417, 12.430028, 11.3201558, 11.43911, 12.9782049, 11.139062,
11.181185, 10.123614, 11.963833, 10.919224, 11.873896, 11.800616,
12.2159602, 11.6360763, 11.6204291, 11.5500821, 12.6783682, 11.918854,
11.8701782, 10.98058, 11.6254916, 12.1558646, 11.533709, 12.0096358,
12.2830638, 11.772724, 11.8853726, 12.041823, 12.623814, 12.3134903,
11.6714245, 12.1333082, 12.4747336, 11.5326378, 12.6222532, 10.922728,
10.9492515, 11.3410073, 12.3005053), ATP5F1 = c(8.3731175, 8.3995189,
8.871088, 8.4389342, 8.529104, 9.004405, 8.883721, 8.70097, 8.24411,
8.393635, 8.76813, 8.756177, 8.4418168, 7.986864, 8.4840108,
8.6523954, 8.5645576, 8.2452877, 8.2440872, 8.7155973, 9.028364,
8.3578703, 9.007441, 7.8892308, 9.0255621, 8.3165712, 8.3400111,
8.061171, 8.5216917, 8.337517, 8.2341439, 8.810458, 8.8794988,
8.4657149, 8.311901, 8.131606, 8.5865282, 9.0900416, 8.8407707,
7.437107, 8.3982759, 8.7610335, 8.3624475, 8.353429, 8.3630127,
8.555639, 8.6435841, 8.9587154, 8.517079, 8.9597121, 8.111514,
8.99767, 8.266991, 8.106218, 8.518875, 8.445485, 8.6409752, 8.662025,
8.697312, 8.071819, 8.3113401, 8.709276, 8.9154896, 8.138148,
6.866765, 9.391611, 8.448086, 8.29189, 8.541953, 8.801044, 8.3088083,
8.288688, 8.8357729, 8.4731257, 8.7321095, 8.383259, 8.4729561,
5.551528, 8.526436, 8.4548827, 8.242625, 8.9862422, 8.5688994,
8.848029, 8.2656363, 8.434976, 8.8023704, 8.6692361, 8.4333198,
8.2926568, 8.2141276, 8.3246346, 7.7262395, 8.0797336, 8.7005427,
8.7695946, 8.1262312), DDX5 = c(11.3122241, 11.7042284, 8.866042,
12.0376754, 12.417701, 11.479431, 10.078783, 9.043405, 11.216074,
11.846906, 11.161803, 8.713301, 11.0790887, 11.685125, 11.9599302,
12.4036502, 11.9778411, 11.9900709, 11.6069971, 11.2651929, 11.455536,
12.3741866, 11.558182, 11.498146, 12.5073231, 11.4546523, 11.8465482,
11.51445, 11.721283, 12.340818, 11.5388553, 11.920725, 11.7067172,
11.6207138, 11.638226, 11.1407525, 11.5832407, 11.981909, 11.7684202,
12.435987, 11.5253382, 10.9882446, 12.1789747, 11.956257, 12.5427815,
12.007658, 11.6360041, 12.2520109, 11.858959, 12.4740761, 6.927855,
11.117424, 7.749824, 11.518817, 11.322855, 11.74096, 11.768474,
11.497009, 11.912888, 11.570506, 11.8167398, 11.912566, 11.2631437,
11.328946, 11.072161, 12.807216, 12.127281, 12.125497, 11.524622,
11.20101, 11.5451414, 12.0747211, 11.5716524, 11.7223929, 11.8529683,
11.868865, 11.8998228, 9.859857, 12.1404707, 11.9166386, 12.613162,
12.9062351, 11.6691732, 11.984726, 11.727059, 11.421816, 11.9506736,
12.2447547, 11.8167228, 11.9021356, 12.5527606, 12.6511506, 11.8550833,
11.382018, 11.8314198, 11.8394352, 11.8128198), EEF1G = c(12.622405,
11.2945857, 8.610078, 13.1323891, 12.702769, 12.319703, 10.181874,
8.615338, 11.526551, 12.106198, 11.602801, 9.137166, 13.0991666,
13.049641, 12.2938678, 11.7442632, 12.7866184, 12.6753617, 12.9552413,
12.0861518, 13.136434, 12.64865, 13.298616, 11.8531038, 12.7791485,
13.4150478, 11.636058, 12.013313, 11.8785493, 12.771945, 12.5351321,
13.147321, 11.6760014, 12.2604174, 11.802344, 12.23351, 12.1175728,
12.7360727, 12.5730595, 11.13, 11.7737462, 11.9774565, 11.8927844,
12.17392, 12.441605, 12.221691, 12.4866463, 12.5645763, 12.070268,
12.1801377, 8.80704, 12.288168, 8.298831, 12.234659, 11.832415,
12.474423, 12.4440819, 11.888544, 11.625162, 12.161204, 12.2707656,
12.941017, 12.3491325, 12.978561, 11.833124, 11.782119, 12.273029,
12.462202, 12.538127, 12.236135, 12.2884941, 12.4195123, 12.5274317,
12.3917089, 11.912339, 12.439751, 12.0962051, 10.912737, 11.999598,
12.3776528, 11.348448, 12.4151316, 11.5389366, 11.328957, 12.4397802,
12.238454, 12.0192408, 12.2290439, 12.8381542, 11.1834666, 12.0636739,
12.4752125, 12.7681644, 12.1747129, 12.7343662, 12.3493937, 11.7971488
)), class = "data.frame", row.names = c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L,
20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L,
33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L,
46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L,
59L, 60L, 61L, 62L, 63L, 64L, 66L, 67L, 68L, 69L, 70L, 71L, 72L,
73L, 75L, 76L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L, 85L, 86L,
87L, 88L, 89L, 90L, 91L, 92L, 93L, 97L, 98L, 99L, 100L, 102L,
103L))
I want to create a grid of histograms for each column, the list of column is:
HK_GENES = c(
"ACTB", "ATP5F1", "DDX5", "EEF1G"
)
Is there a way of doing it with ggplot2?
I tried with no success the following:
ggplot(data=df_hk_genes, aes_string(x=HK_GENES)) +
geom_histogram(bins=15) +
facet_wrap(HK_GENES, nrow = 5, scale = "free_x")
In Python I could create a subfigure for each histogram and iterate over them.
I have around 20 columns in my original data frame, and I want to avoid repeating the same block of code for each column.
You can reshape the data and facet over the groups.
library(reshape2)
library(dplyr)
library(ggplot2)

# Reshape to long format: one row per (gene, value) pair, restricted to the
# columns listed in HK_GENES. The gene name ends up in `variable`, the
# measurement in `value`.
melt(df_hk_genes, measure.vars = HK_GENES) %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 15) +
  # One panel per gene, each with its own x-axis range.
  facet_wrap(~ variable, nrow = 5, scales = "free_x")

R: Loop through columns in tibble to find differences between each and create new for each difference

I have been working on this for a while now, but I can't seem to figure it out. I'm looking for a solution that can: calculate difference between col1 and col2 and create colA based on this; then calculate difference between col2 and col3 and create colB based on this, etc. etc. I have about 70 rows and 42 of these columns so it's not something I want to do by hand (at this point I am almost desperate enough).
To give a note also, some of the cells in the rows are empty (NA). An emergency solution would be to fill these with zeroes, but I'd rather not.
Also, the dataframe I use is a tibble, however, I am not bound to this so much that I can't change it to a real dataframe.
My data looks like this:
testdata
As you can see, the columns have annoyingly long names I did not know how to change also :). I use the column numbers usually, which are 77:119. I hope this is complete enough. Sorry for the noob-ness and possibly unclear explanation, this is my first question on here and I'm not that craftsy in R!
Finally, to create the 'user/intermittent_answers/n_length' columns I used the following loop, so I thought it'd be possible to reuse this for the calculations that I need now.
# For every column in positions 34:76 of testdata, add a companion column
# named "<column>_length" that holds the string length of each entry.
for (col_name in names(testdata[34:76])) {
  length_col <- paste0(col_name, "_length")
  testdata[[length_col]] <- str_length(testdata[[col_name]])
}
Then I tried something similar which I found here: FOR loop to calculate difference on dates in R
# Attempt adapted from a date-difference example: store, for each row j, the
# difference between entry j and entry j-1 of the "42_length" column.
# NOTE(review): difftime() is being handed plain lengths rather than
# date-times; the error below comes from as.POSIXct.numeric asking for an
# 'origin'. This also differences consecutive rows of one column, whereas
# the text above asks for differences between adjacent columns.
# NOTE(review): the leading `+` on the second line looks like a console
# continuation prompt, not part of the code.
for (j in 2:length(testdata$`user/intermittant_answers/42_length`))
+ testdata$lag[j] <- as.numeric(difftime(testdata$`user/intermittant_answers/42_length`[j], testdata$`user/intermittant_answers/42_length`[j-1], units=c("difference")), units = "days")
Error in as.POSIXct.numeric(time1) : 'origin' must be supplied
I figured this was because I am not working with anything time related, but I don't know/don't know how to find another 'diff' related function that is not bound to matrixes like the one from matrixStats package.
I hope someone can push me in the right direction!
Thank you!!
EDIT: #Ben, thank you for responding! If I had known this function I would've used it way sooner :'). I tried to keep a representation of NA values inside the df. Also, some people suggested using a double loop, however, I have not managed to figure this out. I hope this helps!
> dput(testdata[1:10, 95:105])
structure(list(`user/intermittant_answers/18_length` = c(NA,
24L, 34L, 33L, NA, NA, 16L, NA, 25L, 28L), `user/intermittant_answers/19_length` = c(NA,
38L, 68L, 34L, NA, 11L, 20L, 12L, 47L, 52L), `user/intermittant_answers/20_length` = c(NA,
59L, 81L, 42L, 2L, 33L, 20L, 26L, 96L, 78L), `user/intermittant_answers/21_length` = c(6L,
90L, 116L, 42L, 14L, 41L, 20L, NA, 127L, 113L), `user/intermittant_answers/22_length` = c(17L,
115L, 131L, 65L, 20L, 70L, 37L, 11L, 170L, 130L), `user/intermittant_answers/23_length` = c(40L,
138L, 188L, 65L, 38L, 113L, 22L, 24L, 200L, 136L), `user/intermittant_answers/24_length` = c(66L,
155L, 210L, 99L, 49L, 133L, 41L, 49L, 242L, 185L), `user/intermittant_answers/25_length` = c(66L,
158L, 233L, 99L, 65L, 156L, 67L, 70L, 296L, 224L), `user/intermittant_answers/26_length` = c(84L,
201L, 250L, 113L, 84L, 164L, 67L, 78L, 334L, 224L), `user/intermittant_answers/27_length` = c(89L,
237L, 285L, 130L, 97L, 167L, 84L, 86L, 412L, 232L), `user/intermittant_answers/28_length` = c(116L,
284L, 315L, 130L, 97L, 184L, 97L, 108L, 445L, 247L)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))

Is there a way to plot confidence intervals for an orthogonal/ TLS regression model?

I've found the functions onls::onls() and pracma::odregress() that calculate orthogonal regression models. I would like to plot such models in the same style as geom_smooth(), that is, the regression line surrounded by a 95% confidence interval.
Example:
example <- structure(list(y = c(-28.9143374863044, -28.5783512160246, -29.1751498307569,
-28.5613677412358, -29.2441600709021, -29.1848482932202, -29.469712350617,
-29.1212786695474, -29.3338385227209, -29.0582324840251, -29.1159002526588,
-29.1384485361936, -29.4743426548081, -29.242305699462, -29.5517891592378,
-29.1701701877517, -29.2337122509592, -29.150317639976, -29.139526754614,
-29.05974643127, -29.0540797909476, -29.0859798970361, -29.27517072563,
-29.1907525452561, -30.0965246973573, -28.9734662257987, -29.6953578711591,
-28.2014460687026, -30.0621997994278, -27.9399550295493, -29.8886842413551,
-29.6609659140518, -29.6920474706673, -30.2418230320867, -29.8334571372628,
-29.8626462112615, -29.9051818751105, -29.6518825347484, -29.5380886463871,
-29.7500527026688, -29.6095990506199, -29.6049957701729, -29.5368579894466,
-29.5861340837645, -29.5737037489314, -29.5773848425703, -28.0265409956043,
-28.0899954900073, -28.265152586989, -28.0062832808179, -27.7205565228848,
-27.4041257575861, -28.1113851658386, -26.914663492446, -27.877772497213,
-27.0684956870887, -27.9276723508022, -27.7588907638397, -27.3710663654935,
-27.3623535825255, -27.7783142763593, -28.5132310123219, -28.5193067297636,
-28.5283974320574, -28.6153706663899, -28.6816032262091, -29.1043640141426,
-28.44589108955, -28.6614098552091, -28.7403207700811), x = c(33.1158714294,
18.6527993810972, 17.0276514703819, 22.3627925702962, 18.170924813473,
32.0677953809724, 46.5216445923, 34.9911138888596, 25.0910229505442,
13.9473438263, 17.381641499988, 17.014380035215, 40.9107205320526,
52.2695803285185, 58.9499627404227, 40.5894751586832, 23.496896254444,
33.6412616569372, 14.7548102820616, 46.3057677573658, 14.280050708175,
31.2877073530984, 18.8534870545271, 16.5168182808868, 63.9908365598676,
33.7277991683148, 35.4163778417314, 32.1050571361531, 51.3240160147292,
24.4237814340378, 39.3334452128324, 53.8079129732769, 43.26844558712,
58.3003234863, 43.934151887875, 76.8046441618721, 64.8779439305438,
46.8684772359235, 66.4989547729, 41.9780584414396, 50.2248225396345,
58.8492643072032, 64.5647735596, 48.3225469025232, 60.4074024077677,
57.3789336302925, 11.2785320282, 11.3491302769043, 7.59091310831495,
18.4789668943737, 5.84773873549871, 10.6156844347299, 15.7432512138035,
11.4885938379565, 7.74754936760848, 12.1071624756, 14.9075944237136,
20.9201573163328, 30.2789412366595, 33.8582180028129, 15.4269225956373,
8.53801707561128, 10.1814249853966, 7.33018941782735, 8.42749268077253,
9.74786459733547, 10.5363144200841, 10.7873065304121, 16.7602893825786,
12.7551904319156)), class = "data.frame", row.names = c(1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L,
30L, 31L, 32L, 33L, 34L, 35L, 36L, 39L, 40L, 41L, 42L, 43L, 44L,
45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L,
58L, 59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L,
71L, 72L))
mod <- onls(y ~ a*x+b, data = example, start = list(a = 0.03, b = -28))
newData <- data.frame(x = seq(min(example$x), max(example$x), 0.1))
newData$y <- predict(mod, newdata = newData)
plot(y ~ x, data = newData, type = "l", col = "red")
points(y ~ x, data = example)
# for a regular lm() model the subsequent steps would be
conf <- predict(mod, newdata = newData, interval = "confidence", level = 0.95)
lines(newData$x, conf[,2])
lines(newData$x, conf[,3])
However, the last steps won't give any useful result when applied to the onls model. Are there any methods to calculate or estimate those confidence intervals?
Edit: As DanY mentioned, the onls package contains the confint.onls() function. This will give me the upper and lower estimate for each regression parameter at a given confidence level:
confint(mod, level = 0.95)
I could do something like
conf_a <- confint(mod, param = "a", level = 0.95)
conf_b <- confint(mod, param = "b", level = 0.95)
and calculate the extrema for each x
# Grid of x values spanning the observed range in steps of 0.1.
x <- seq(min(example$x), max(example$x), 0.1)
# One column per combination of the lower/upper confidence bound of the
# slope (conf_a) with the lower/upper bound of the intercept (conf_b);
# each row holds the four candidate y values for one x.
test <- cbind(
  conf_a[1] * x + conf_b[1],
  conf_a[1] * x + conf_b[2],
  conf_a[2] * x + conf_b[1],
  conf_a[2] * x + conf_b[2]
)
# Largest candidate y at every x, computed row-wise in one call instead of
# growing a vector inside a loop.
maxima <- apply(test, 1, max)
but this doesn't quite look like what I'd expect, and I'm not really convinced this is the correct approach.

Fuzzy Join Error: All columns in a tibble must be vectors

test <- structure(list(trip_count = 1:10, dropoff_longitude = c(-73.959862,
-73.882202, -73.934113, -73.992203, -74.00563, -73.975189, -73.97448,
-73.974838, -73.981377, -73.955093), dropoff_latitude = c(40.773617,
40.744175, 40.715923, 40.749203, 40.726158, 40.729824, 40.763599,
40.754135, 40.759987, 40.765224)), row.names = c(NA, -10L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x7fd18800f6e0>)
> dput(zip_codes)
zip_codes <- structure(list(zipcode = c("10001", "10002", "10003", "10004",
"10005", "10006", "10007", "10009", "10010", "10011", "10012",
"10013", "10014", "10016", "10017", "10018", "10019", "10020",
"10021", "10022", "10023", "10024", "10025", "10026", "10027",
"10028", "10029", "10030", "10031", "10032", "10033", "10034",
"10035", "10036", "10037", "10038", "10039", "10040", "10044",
"10065", "10069", "10075", "10103", "10110", "10111", "10112",
"10115", "10119", "10128", "10152", "10153", "10154", "10162",
"10165", "10167", "10168", "10169", "10170", "10171", "10172",
"10173", "10174", "10177", "10199", "10271", "10278", "10279",
"10280", "10282"), bounds_north = c(40.759731, 40.724136, 40.739673,
40.709044, 40.709294, 40.71369, 40.71719, 40.734975, 40.745421,
40.756703, 40.731706, 40.727557, 40.742873, 40.752197, 40.757912,
40.762526, 40.773446, 40.761094, 40.775045, 40.764898, 40.783192,
40.818099, 40.811264, 40.807546, 40.822108, 40.782213, 40.800665,
40.824032, 40.834372, 40.850517, 40.861552, 40.87765, 40.809582,
40.765558, 40.819569, 40.714451, 40.846615, 40.866336, 40.772955,
40.770517, 40.781007, 40.777677, 40.761771, 40.755516, 40.759689,
40.759899, 40.811331, 40.751522, 40.787914, 40.759059, 40.764279,
40.758432, 40.770085, 40.752801, 40.755303, 40.752119, 40.754974,
40.753811, 40.756556, 40.755928, 40.754783, 40.752116, 40.7556,
40.752723, 40.708797, 40.71628, 40.713256, 40.714767, 40.719611
), bounds_south = c(40.743451, 40.708802, 40.722933, 40.683919,
40.702879, 40.705871, 40.709806, 40.718612, 40.73231, 40.731043,
40.719867, 40.713446, 40.72428, 40.73801, 40.747251, 40.749102,
40.758645, 40.757284, 40.758133, 40.751445, 40.768436, 40.778805,
40.788476, 40.79691, 40.803047, 40.770062, 40.782531, 40.812791,
40.817221, 40.829083, 40.842958, 40.849745, 40.781075, 40.752197,
40.806636, 40.701689, 40.817912, 40.851863, 40.749415, 40.759284,
40.771612, 40.769441, 40.759787, 40.753481, 40.758538, 40.758436,
40.810373, 40.749101, 40.773108, 40.757749, 40.762964, 40.757125,
40.768355, 40.75146, 40.753994, 40.750775, 40.753811, 40.751441,
40.755243, 40.754619, 40.753481, 40.750766, 40.754678, 40.750241,
40.707694, 40.714082, 40.711995, 40.700273, 40.713378), bounds_east = c(-73.984076,
-73.973635, -73.979864, -73.995657, -74.004569, -74.009988, -74.000455,
-73.971282, -73.971566, -73.990798, -73.991794, -73.994035, -73.999555,
-73.968192, -73.964271, -73.981822, -73.973015, -73.977201, -73.947973,
-73.958599, -73.974067, -73.960687, -73.954966, -73.944667, -73.940404,
-73.944337, -73.930891, -73.936232, -73.938588, -73.934671, -73.92216,
-73.910587, -73.914228, -73.978116, -73.933219, -73.991772, -73.929107,
-73.924385, -73.940026, -73.952085, -73.986609, -73.947039, -73.975831,
-73.980395, -73.976744, -73.97845, -73.963058, -73.99111, -73.937328,
-73.970993, -73.971411, -73.971451, -73.94827, -73.977677, -73.973735,
-73.976048, -73.975209, -73.974648, -73.97282, -73.973276, -73.978332,
-73.973959, -73.975352, -73.993948, -74.009829, -74.002115, -74.007666,
-74.013754, -74.012441), bounds_west = c(-74.008621, -73.997532,
-73.999604, -74.047285, -74.012508, -74.015905, -74.013754, -73.988643,
-73.994028, -74.012359, -74.004575, -74.016381, -74.01599, -73.987746,
-73.981822, -74.007989, -74.003477, -73.98373, -73.968441, -73.977655,
-73.990149, -73.98814, -73.977092, -73.962475, -73.9659, -73.96323,
-73.955778, -73.948677, -73.960007, -73.950403, -73.944672, -73.947051,
-73.946462, -74.001702, -73.943398, -74.010542, -73.943506, -73.938947,
-73.961583, -73.972553, -73.996142, -73.965148, -73.979513, -73.984118,
-73.97845, -73.980886, -73.964424, -73.994844, -73.959921, -73.973068,
-73.973465, -73.973524, -73.951858, -73.979768, -73.975807, -73.978159,
-73.976974, -73.977107, -73.974897, -73.975352, -73.980395, -73.976048,
-73.976516, -74.00143, -74.011248, -74.00542, -74.009668, -74.019603,
-74.01831), zip = c(10001, 10002, 10003, 10004, 10005, 10006,
10007, 10009, 10010, 10011, 10012, 10013, 10014, 10016, 10017,
10018, 10019, 10020, 10021, 10022, 10023, 10024, 10025, 10026,
10027, 10028, 10029, 10030, 10031, 10032, 10033, 10034, 10035,
10036, 10037, 10038, 10039, 10040, 10044, 10065, 10069, 10075,
10103, 10110, 10111, 10112, 10115, 10119, 10128, 10152, 10153,
10154, 10162, 10165, 10167, 10168, 10169, 10170, 10171, 10172,
10173, 10174, 10177, 10199, 10271, 10278, 10279, 10280, 10282
)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 9L, 10L, 11L, 12L,
13L, 14L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L,
40L, 43L, 50L, 51L, 53L, 67L, 74L, 75L, 76L, 79L, 83L, 91L, 101L,
102L, 103L, 111L, 114L, 116L, 117L, 118L, 119L, 120L, 121L, 122L,
123L, 126L, 133L, 151L, 158L, 159L, 160L, 162L), class = "data.frame")
Hey guys, so I am trying to fuzzy-join lat & lon information to get the zip code of a specific location. I tried:
# Attempt: keep every trip in `test` and attach the zip_codes row whose
# bounding box contains the drop-off point (four comparisons, one per `by`
# pair, in the same order as match_fun).
# NOTE(review): match_fun is given quoted strings ('<=', '>=') rather than
# the operator functions themselves (backticks: `<=`, `>=`); presumably this
# is what triggers the error quoted below -- confirm against fuzzyjoin docs.
test <- test %>% fuzzy_left_join(zip_codes,by = c("dropoff_longitude" = "bounds_east", "dropoff_longitude" = "bounds_west", "dropoff_latitude" = "bounds_north","dropoff_latitude" = "bounds_south"), match_fun = list('<=', '>=' , '<=', '>='))
But unfortunately, this returns the error message Error: All columns in a tibble must be vectors. x Column "col" is NULL.
I don't know how to solve this. There is no column "col" in either of the data frames. The result should give me the corresponding zip code whenever dropoff_longitude is between bounds_east and bounds_west and dropoff_latitude is between bounds_north and bounds_south.
Thanks a lot in advance!
We could use the non-equi join from data.table as one of the dataset is data.table
library(data.table)
# Non-equi join: match rows of `test` to rows of `zip_codes` where the
# drop-off point lies inside the zip code's bounding box (west <= longitude
# <= east and south <= latitude <= north). setDT() converts `test` to a
# data.table by reference first.
# NOTE(review): with X[Y, on = ...] the result has a row for every row of Y
# (zip_codes here), with NA in the columns from `test` where no trip
# matches; if one row per trip is wanted the roles presumably need to be
# swapped or the result filtered -- confirm.
setDT(test)[zip_codes, on = .(dropoff_longitude <= bounds_east,
dropoff_longitude >= bounds_west,
dropoff_latitude <= bounds_north,
dropoff_latitude >= bounds_south)]

Resources