Getting distance between vectors of longitude and latitude points

Getting distance between vectors of longitude and latitude points - r

I have a data set that looks like this:
structure(list(Date2 = structure(c(18428, 18438, 18428, 18438,
18428, 18438, 18428, 18438, 18428, 18438, 18428, 18438, 18428,
18438, 18428, 18438, 18428, 18438, 18428, 18438, 18428, 18438,
18428, 18438, 18428, 18438, 18428, 18438, 18428, 18438, 18428,
18438, 18428, 18438, 18428, 18438, 18428, 18438, 18428, 18438,
18428, 18438, 18428, 18438), class = "Date"), Fish_ID = c("Fork1",
"Fork1", "Fork10", "Fork10", "Fork12", "Fork12", "Fork13", "Fork13",
"Fork14", "Fork14", "Fork15", "Fork15", "Fork16", "Fork16", "Fork17",
"Fork17", "Fork18", "Fork18", "Fork19", "Fork19", "Fork2", "Fork2",
"Fork20", "Fork20", "Fork21", "Fork21", "Fork22", "Fork22", "Fork23",
"Fork23", "Fork3", "Fork3", "Fork4", "Fork4", "Fork5", "Fork5",
"Fork6", "Fork6", "Fork7", "Fork7", "Fork8", "Fork8", "Fork9",
"Fork9"), Lat2 = c(32.9394, NA, 32.92935, NA, NA, 32.9047333333333,
NA, 32.9093833333333, NA, 32.9509833333333, 32.9160666666667,
NA, NA, 32.9074333333333, NA, 32.9029, NA, 32.90775, NA, 32.9094,
NA, NA, 32.9455166666667, 32.9459166666667, 32.9431, 32.9437666666667,
32.90365, 32.9044333333333, 32.9056166666667, 32.90585, NA, 32.9475333333333,
32.94325, NA, 32.9288833333333, NA, NA, NA, 32.9297, NA, NA,
NA, 32.9303, NA), Long2 = c(-95.6334, NA, -95.6406, NA, NA, -95.6531666666667,
NA, -95.6486, NA, -95.6252333333333, -95.648, NA, NA, -95.6391166666667,
NA, -95.64155, NA, -95.6393666666667, NA, -95.63895, NA, NA,
-95.6391166666667, -95.6389333333333, -95.6365, -95.6401333333333,
-95.6535666666667, -95.6532833333333, -95.6560333333333, -95.6575166666667,
NA, -95.63015, -95.6334333333333, NA, -95.6395, NA, NA, NA, -95.6398833333333,
NA, NA, NA, -95.6425166666667, NA), lag.Lat2 = c(NA, 32.9394,
NA, 32.92935, NA, NA, NA, NA, NA, NA, NA, 32.9160666666667, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32.9455166666667, NA,
32.9431, NA, 32.90365, NA, 32.9056166666667, NA, NA, NA, 32.94325,
NA, 32.9288833333333, NA, NA, NA, 32.9297, NA, NA, NA, 32.9303
), lag.Long2 = c(NA, -95.6334, NA, -95.6406, NA, NA, NA, NA,
NA, NA, NA, -95.648, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, -95.6391166666667, NA, -95.6365, NA, -95.6535666666667, NA,
-95.6560333333333, NA, NA, NA, -95.6334333333333, NA, -95.6395,
NA, NA, NA, -95.6398833333333, NA, NA, NA, -95.6425166666667)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -44L), groups = structure(list(
Fish_ID = c("Fork1", "Fork10", "Fork12", "Fork13", "Fork14",
"Fork15", "Fork16", "Fork17", "Fork18", "Fork19", "Fork2",
"Fork20", "Fork21", "Fork22", "Fork23", "Fork3", "Fork4",
"Fork5", "Fork6", "Fork7", "Fork8", "Fork9"), .rows = structure(list(
1:2, 3:4, 5:6, 7:8, 9:10, 11:12, 13:14, 15:16, 17:18,
19:20, 21:22, 23:24, 25:26, 27:28, 29:30, 31:32, 33:34,
35:36, 37:38, 39:40, 41:42, 43:44), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -22L), .drop = TRUE))
Lat2 and Long2 are the locations of each fish and lag values are for the location of each fish the prior time they were located.
I am trying to calculate the distance between each Long2 Lat2 value and each lag.Long2 lag.Lat2 value so that I can calculate the distance traveled from the last time each fish was located. I know how to do this by hand for each one using the geosphere package, but I'm wondering if there is a for loop I could write to do this calculation for each individual fish so I could automate the process?
Thanks!

using your data, we're looking for fish that moved, fishx and fishy, as matrix, with long before lat, so perhaps:
complete <- which(complete.cases(fish_df)==TRUE)
fishx <- as.matrix(fish_df[complete, 4:3])
fishx
Long2 Lat2
24 -95.63893 32.94592
26 -95.64013 32.94377
28 -95.65328 32.90443
30 -95.65752 32.90585
fishy <- as.matrix(fish_df[complete, 6:5])
geosphere::distm(fishx, fishy, fun = geosphere::distGeo)
[,1] [,2] [,3] [,4]
[1,] 47.55876 386.4671 4883.23872 4746.9509
[2,] 216.11520 347.7147 4623.08009 4484.7115
[3,] 4745.03178 4566.5523 90.82777 288.8099
[4,] 4723.80758 4574.9848 442.81633 141.1616
diag(geosphere::distm(fishx, fishy, fun = geosphere::distGeo))
[1] 47.55876 347.71468 90.82777 141.16164
# presumably in meters
You know your study area so this may or may not be correct... Which is to say, no need for loop, just get your matrices.

Related

Missing cases while using summarise(across())

I have data.frame that looks like this:
I want to quickly reshape it so that I have only one record for each ID, something that looks like this:
df can be built using this code:
df<-structure(list(ID = structure(c("05-102", "05-102", "05-102",
"01-103", "01-103", "01-103", "08-104", "08-104", "08-104", "05-105",
"05-105", "05-105", "02-106", "02-106", "02-106", "05-107", "05-107",
"05-107", "08-108", "08-108", "08-108", "02-109", "02-109", "02-109",
"05-111", "05-111", "05-111", "07-115", "07-115", "07-115"), label = "Unique Subject Identifier", format.sas = "$"),
EXSTDTC1 = structure(c(NA, NA, NA, 17022, NA, NA, 17024,
NA, NA, 17032, NA, NA, 17038, NA, NA, 17092, NA, NA, 17108,
NA, NA, 17155, NA, NA, 17247, NA, NA, 17333, NA, NA), class = "Date"),
EXSTDTC6 = structure(c(NA, 16885, NA, NA, NA, 17031, NA,
NA, 17032, NA, NA, 17041, NA, NA, 17047, NA, NA, 17100, NA,
NA, 17116, NA, 17164, NA, NA, NA, 17256, NA, 17342, NA), class = "Date"),
EXSTDTC3 = structure(c(NA, NA, 16881, NA, 17027, NA, NA,
17029, NA, NA, 17037, NA, NA, 17043, NA, NA, 17097, NA, NA,
17113, NA, NA, NA, 17160, NA, 17252, NA, NA, NA, 17338), class = "Date"),
EXDOSEA1 = c("73.8+147.6", NA, NA, "64.5+129", NA, NA, "62.7+125.4",
NA, NA, "114+57", NA, NA, "60+117.5", NA, NA, "48.6+97.2",
NA, NA, "61.2+122.4", NA, NA, "47.7+95.4", NA, NA, "51.6+103.2",
NA, NA, "68+136", NA, NA), EXDOSEA6 = c(NA, "100", NA, NA,
NA, "86", NA, NA, "83.5", NA, NA, "76", NA, NA, "39.2", NA,
NA, "32", NA, NA, "81.5", NA, "69.6", NA, NA, NA, "68", NA,
"91", NA), EXDOSEA3 = c(NA, NA, "1600", NA, "4302", NA, NA,
"4185", NA, NA, "3900", NA, NA, "3921", NA, NA, "3300", NA,
NA, "4080", NA, NA, NA, "3183", NA, "3300", NA, NA, NA, "1514"
)), row.names = c(NA, -30L), class = c("tbl_df", "tbl", "data.frame"
))
Right now my code is:
df %>%
group_by(ID) %>%
summarise(across(EXSTDTC1:EXDOSEA3, na.omit))
But it seems to remove 05-102 because it does not have a value for EXSTDTC1. I would like to see how we can address this. Is it possible to still use across?
Many thanks.

We could use an if/else condition to address those cases where there is only NA
library(dplyr)
df %>%
group_by(ID) %>%
summarise(across(EXSTDTC1:EXDOSEA3,
~ if(all(is.na(.))) NA else .[complete.cases(.)]), .groups = 'drop')
-output
# A tibble: 10 x 7
# ID EXSTDTC1 EXSTDTC6 EXSTDTC3 EXDOSEA1 EXDOSEA6 EXDOSEA3
# <chr> <date> <date> <date> <chr> <chr> <chr>
# 1 01-103 2016-08-09 2016-08-18 2016-08-14 64.5+129 86 4302
# 2 02-106 2016-08-25 2016-09-03 2016-08-30 60+117.5 39.2 3921
# 3 02-109 2016-12-20 2016-12-29 2016-12-25 47.7+95.4 69.6 3183
# 4 05-102 NA 2016-03-25 2016-03-21 73.8+147.6 100 1600
# 5 05-105 2016-08-19 2016-08-28 2016-08-24 114+57 76 3900
# 6 05-107 2016-10-18 2016-10-26 2016-10-23 48.6+97.2 32 3300
# 7 05-111 2017-03-22 2017-03-31 2017-03-27 51.6+103.2 68 3300
# 8 07-115 2017-06-16 2017-06-25 2017-06-21 68+136 91 1514
# 9 08-104 2016-08-11 2016-08-19 2016-08-16 62.7+125.4 83.5 4185
#10 08-108 2016-11-03 2016-11-11 2016-11-08 61.2+122.4 81.5 4080

Copy value in dataframe up/down x cells

How can I copy values in a dataframe up and down 5 times?
Please find below my minimal example:
structure(list(Date = structure(c(16448, 16449, 16450, 16451,
16455, 16456, 16457, 16458, 16461, 16462, 16463, 16464, 16465,
16468, 16469, 16470, 16471, 16472, 16475, 16476, 16477, 16478,
16479, 16483, 16484, 16485, 16486, 16489, 16490, 16491, 16492
), class = "Date"), Share.price = c(18.56, 18.93, 18.55, 20.25,
20.22, 20.1, 20.59, 20.65, 20.84, 20.47, 20.67, 20.75, 20.89,
21.12, 21.66, 21.52, 21.82, 22.11, 22.04, 22.28, 22.66, 22.94,
23.11, 23.49, 23.41, 23.32, 23.31, 23.37, 23.54, 23.45, 23.35
), NASDAQ100 = c(4166.2, 4145.84, 4089.65, 4142.14, 4171.21,
4192.09, 4270.36, 4278.14, 4275.72, 4165.5, 4140.38, 4181.35,
4148.43, 4188.59, 4229.15, 4221.2, 4256.18, 4228.68, 4216.09,
4281.16, 4297.28, 4347.97, 4384.03, 4385.34, 4390.91, 4411.86,
4443.05, 4449.49, 4451.03, 4440.59, 4462.27), stock_return = c(-0.0159066808059385,
0.0199353448275863, -0.0200739566825145, 0.091644204851752, -0.00148148148148154,
-0.00593471810089008, 0.0243781094527362, 0.00291403593977653,
0.00920096852300248, -0.0177543186180423, 0.00977039570102603,
0.00387034349298492, 0.00674698795180726, 0.0110100526567736,
0.0255681818181818, -0.00646352723915053, 0.0139405204460967,
0.0132905591200733, -0.00316598824061512, 0.0108892921960074,
0.0170556552962298, 0.0123565754633716, 0.00741063644289443,
0.0164430982258762, -0.00340570455512977, -0.00384451089278086,
-0.000428816466552383, 0.00257400257400267, 0.0072742832691484,
-0.0038232795242141, -0.00426439232409373), market_return = c(-0.000904083242805209,
-0.00488694733810179, -0.0135533450398472, 0.012834839167166,
0.0070181114110097, 0.00500574173920759, 0.0186708777721851,
0.00182186045204635, -0.000565666387729264, -0.0257781145631613,
-0.00603048853679028, 0.00989522700814907, -0.00787305535293627,
0.00968077079762702, 0.00968344956178559, -0.00187981036378464,
0.00828674310622583, -0.00646119290067619, -0.00297728842097301,
0.015433731253365, 0.00376533462893232, 0.0117958336436072, 0.00829352548430635,
0.00029881182382429, 0.00127014096968529, 0.00477122054426072,
0.00706958063039183, 0.00144945476643288, 0.000346107081935225,
-0.00234552451904382, 0.0048822341175385), Dividend_change = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Increase",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), alpha = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.000404446336263359,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), beta = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.976061079957424,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA,
-31L), class = "data.frame")
My goal is to copy the data of "Dividend_change", "alpha", and "beta" up/down 5 rows.
Thanks in advance for answering my question!

If we want to assign based on the non-blank elements, create an index of non-blank
i1 <- which(nzchar(value))
n <- 5
for(i in i1) value[c(i-seq_len(n), i + seq_len(n))] <- value[i]
value
#[1] "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "5" "5" "5" "5" "5" "5" "5" "5" "5" "5"
#[26] "5" "" "" "" "" "" "" "" "" "" "" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "" "" ""
#[51] "" "" "" "" "" "" "" "" "" "" "" ""
For the updated dataset
nm1 <- c("Dividend_change", "alpha", "beta")
n <- 5
for(nm in nm1) {
i1 <- which(!is.na(dfn[[nm]]))
for(i in i1) dfn[[nm]][c(i-seq_len(n), i + seq_len(n))] <- dfn[[nm]][i]
}
dfn

How to loop variable creation and str_replace dynamically in R

I am trying to parse multiple columns into each of their components. However the number of components varies across the columns. Specifically, suppose the following df:
id X1.startAll X2.startAll
1 ["1555726884484","1555727530298","1555727532509"]
2 ["1555735159384","1555735161545"]
3 ["1555730029709"]
4 ["1555735159384","1555735161545"]
5
6 ["1555735159384","1555735161545"]
now I have 40 of these columns (and another 120 very similar ones, to which I aim to generalize the process) and many more rows. I can do the first column quite simply using the following:
df1$X1.startAll1 <- str_replace(df1$X1.startAll, "\\[\"([0-9]+)\",*\"*([0-9]*)\"*,*\"*([0-9]*)\"*\\]", "\\1")
df1$X1.startAll2 <- str_replace(df1$X1.startAll, "\\[\"([0-9]+)\",*\"*([0-9]*)\"*,*\"*([0-9]*)\"*\\]", "\\2")
df1$X1.startAll3 <- str_replace(df1$X1.startAll, "\\[\"([0-9]+)\",*\"*([0-9]*)\"*,*\"*([0-9]*)\"*\\]", "\\3")
which yields my desired result:
id X1.startAll X1.startAll1 X1.startAll2 X1.startAll3
1 ["1555726884484","1555727530298","1555727532509"] 1555726884484 1555727530298 1555727532509
2
3 ["1555730029709"] 1555730029709
4 ["1555735159384","1555735161545"] 1555735159384 1555735161545
5
6
However, I have to do this for many columns and for many different 'array' lengths within each of these.
I have tried automating this using a for loop; however, I can't figure out how to (1) determine the right number of iterations (i.e. the maximum number of components in the startAll column), (2) dynamically create the variables, or (3) update the string extraction dynamically ("\\i").
Any and all help on looping this process would help a lot!
Edit 2: below is a copy-pasteable sample of the data:
structure(list(X1.startAll = list(NA, NA, NA, NA, c(1555726884484,
1555727530298, 1555727532509), NA, NA, c(1555735159384, 1555735161545
), NA, NA, NA, 1555730029709, NA, NA, NA, c(1555728423843, 1555728561054,
1555728586917), c(1555725657389, 1555725657827), c(1555703810672,
1555703823206, 1555703848659), NA, NA), X2.startAll = list(NA,
NA, NA, NA, c(1555727541885, 1555727786959, 1555727897893
), NA, NA, 1555735262052, c(1555737694350, 1555737696711),
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), X3.startAll = list(
NA, NA, NA, NA, c(1555727920770, 1555728230065, 1555728843391
), NA, NA, c(1555735331144, 1555735452321, 1555735457305),
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), X4.startAll = list(
NA, NA, NA, NA, 1555728854666, NA, NA, 1555735589629, 1555738374484,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), X5.startAll = list(
NA, NA, NA, NA, c(1555728949327, 1555728988444), NA, NA,
c(1555735646258, 1555735912372, 1555735914267, 1555736071856,
1555736074184, 1555736093411, 1555736124826, 1555736238538,
1555736248889, 1555736576754, 1555736620915, 1555736874386,
1555737698921, 1555737777400, 1555737966562, 1555738152090,
1555738354075, 1555738700232, 1555738703134, 1555738716736
), 1555738415269, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), X6.startAll = list(NA, NA, NA, NA, 1555729661240, NA,
NA, NA, 1555738960285, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), X7.startAll = list(NA, NA, NA, NA, c(1555730266934,
1555730356654, 1555730533798, 1555730535289), NA, c(1555732523945,
1555733415340, 1555733477452, 1555733748200, 1555734007271, 1555734286685,
1555734288597), NA, c(1555739871726, 1555740315324, 1555740328252,
1555740329835, 1555740538272, 1555741140561, 1555741143555, 1555741152932
), c(1555743562826, 1555743566386, 1555743593201), NA, NA, NA,
c(1555727969354, 1555727985539, 1555728064237, 1555738166838,
1555826735910), NA, NA, NA, NA, NA, NA), X8.startAll = list(
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA)), row.names = c(NA, -20L), class = "data.frame")

Column linking problem on Parent_ID and Extension in R

I have a file which contains some Order_IDs and their Extension_ID, if one exists. A new order can be a fresh order, an extension of an existing Order_ID, or an extension of an existing extension.
My problem is to add a new column named Parent_ID which marks the root of the Order_ID.
Please find the expected output as below :
A reproducible input is attached below.
df1 = structure(list(Order_ID = c("SL158", "SL159", "SL160", "SL162",
"SL164", "SL165", "SL168", "SL169", "SL170", "SL171", "SL172",
"SL176", "SL177", "SL178", "SL179", "SL180", "SL183", "SL184",
"SL189", "SL190", "SL191", "SL192", "SL193", "SL195", "SL196",
"SL199", "SL200", "SL201", "SL207", "SL208", "SL209", "SL218",
"SL219", "SL223", "SL224", "SL225", "SL226", "SL227", "SL229",
"SL232", "SL233", "SL234", "SL235", "SL239", "SL240", "SL241",
"SL242", "SL243", "SL251", "SL252", "SL257", "SL258", "SL260",
"SL261", "SL262", "SL266", "SL267", "SL268", "SL269", "SL277",
"SL278", "SL279", "SL280", "SL281", "SL287", "SL288", "SL289",
"SL300", "SL301", "SL302", "SL303", "SL304", "SL305", "SL315",
"SL316", "SL322", "SL323", "SL327", "SL328", "SL333", "SL334",
"SL335", "SL336", "SL337", "SL340", "SL341", "SL342", "SL343",
"SL344", "SL345", "SL350", "SL351", "SL352", "SL353", "SL354",
"SL355", "SL363", "SL364", "SL365", "SL366", "SL367", "SL368",
"SL369", "SL370", "SL376", "SL377", "SL378", "SL379", "SL380",
"SL381", "SL382", "SL383", "SL384", "SL385", "SL1217", "SL1452",
"SL4316", "SL4317", "SL4348", "SL4381", "SL4681", "SL4738", "SL5319",
"SL5520", "SL5703", "SL6132", "SL6244", "SL6855", "SL6997", "SLB1253161",
"SLB2970530", "SLB27287329", "SLB36502009", "SLB81913180", "SLB82838226",
"SLB90244936", "SLB99701642", "SL11995", "SLH5317239", "SLH22149557",
"SLH44727392", "SLH45803004", "SLH57801072", "SLH74470000", "SLH79063451",
"SL1134", "SL1011", "SL3686", "SL3691", "SL3695", "SL3716", "SL3718",
"SL3720", "SL3721", "SL3727", "SL5242", "SL5245", "SL5246", "SL5254",
"SL5255", "SL10126", "SL10134", "SL10143", "SL11333", "SL11338",
"SL11365", "SL11377", "SL11384", "SL10004", "SL10046", "SL10058",
"SL10070", "SL10092", "SL11335", "SL11364", "SL11366"),
Extension_Of = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, "SL1134", "SL1011", "SL3691", "SL3718", "SL3727", "SL3695",
"SL3720", "SL3716", "SL3721", "SL5242", "SL5246", "SL5245", "SL5254",
"SL5255", "SL3686", "SL11365", "SL11384", "SL11377", "SL10134",
"SL11333", "SL10143", "SL11338", "SL10126", "SL10046", "SL10070",
"SL11364", "SL11335", "SL10004", "SL10058", "SL11366", "SL10092",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "SL384", NA, NA, "SL171", NA,
NA, NA)),
row.names = c(NA, -176L),
class = c("tbl_df", "tbl", "data.frame"))
head(df1)
# Order_ID Extension_Of
#1 SL158 <NA>
#2 SL159 <NA>
#3 SL160 <NA>
#4 SL162 <NA>
#5 SL164 <NA>
#6 SL165 <NA>

Here is a solution based on igraph:
library(igraph) # 1.2.1
v <- data.frame(name = unique(unlist(df1)), stringsAsFactors = FALSE)
v <- v[!is.na(v$name), ]
g <- graph_from_data_frame(df1[!is.na(df1$Extension_Of), 2:1], vertices = v)
df1$Parent_ID <- sapply(df1$Order_ID, function(oid){
n <- ego(g, order = nrow(df1), oid, mode = 'in')[[1]]
nin <- lapply(n, function(x){ego(g, order = nrow(df1), x, mode = 'in')[[1]]})
root <- n[lengths(nin) == 1]$name
})
df1[df1$Parent_ID == 'SL384', ]
# Order_ID Extension_Of Parent_ID
# 113 SL384 <NA> SL384
# 138 SL11995 SL10046 SL384
# 170 SL10046 SL384 SL384
This answer is inspired by this answer and this function.
The rationale: Each line without NA in df1 can be treated as an edge in a graph. If B is an extension of A, we have an edge A -> B. If C is an extension of B, we get B->C. Then the problem can be rephrased as: for each node (Order_ID), find its root node. For C, its root node is A since (A->B->C).
In the code above, for Order_ID, ego finds all the nodes that are directly or indirectly upstream of it (including itself). Among those upstream nodes, we can determine the root node as the one without other upstream nodes.

Calculating the mean of 3 columns in data frame

I have 3 data frames and they are just replicates. So I want to bind them and calculate the mean of each fraction.
Three data frames:
Nr.1
> dput(head(tbl_gel1))
structure(list(Name = c("yal003w", "yal005c", "yal012w", "yal016w",
"yal035w", "yal038w"), `1_1` = c(1.08346521189121, NA, NA, NA,
NA, NA), `1_10` = c(0.267721905361376, 1.43303883148383, 1.61684304894131,
NA, NA, NA), `1_11` = c(0.189487668138674, 0.75522363065885,
1, NA, NA, NA), `1_12` = c(NA, 1.01340492119247, NA, NA, NA,
NA), `1_13` = c(0.374782308020683, 0.945489433731933, NA, NA,
NA, 0.0317297633029047), `1_14` = c(0.437488212634424, 1.18763709680314,
NA, NA, NA, 0.0278039649538794), `1_15` = c(1, 0.963283876302253,
NA, NA, NA, 0.101985769564935), `1_16` = c(0.933864874212228,
0.534233379286527, NA, NA, NA, 0.216767470594226), `1_17` = c(1,
0.665519263271478, NA, NA, 1, 1), `1_18` = c(0.666036574750145,
0.570465125348879, NA, NA, NA, 1.42894349812116), `1_19` = c(0.514337131747938,
0.23204076838128, NA, NA, 1, 1.2521214021452), `1_2` = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), `1_20` = c(NA,
NA, NA, NA, NA, 1.40803677399372), `1_21` = c(1.09990599806138,
NA, NA, NA, NA, 1.04631699593704), `1_22` = c(1.26442418472118,
NA, NA, NA, NA, 0.928872017485782), `1_23` = c(1.11596921281805,
NA, NA, NA, 1, 0.34698227364696), `1_24` = c(0.754496014447251,
NA, NA, NA, 1, 0.222234793614252), `1_3` = c(6.29254185223621,
NA, NA, 0.693642968439352, NA, NA), `1_4` = c(1.36347593974479,
NA, NA, 1, NA, NA), `1_5` = c(0.765885344543765, NA, NA, 1, NA,
NA), `1_6` = c(0.238118001668604, 0.679584207611477, NA, NA,
NA, NA), `1_7` = c(0.847897771442355, 0.277348019879946, NA,
NA, NA, NA), `1_8` = c(0.356154192700505, 1, 0.409523853881517,
NA, NA, NA), `1_9` = c(0.180109142324181, 1, 0.578310191227172,
NA, NA, 0.093113736249161)), .Names = c("Name", "1_1", "1_10",
"1_11", "1_12", "1_13", "1_14", "1_15", "1_16", "1_17", "1_18",
"1_19", "1_2", "1_20", "1_21", "1_22", "1_23", "1_24", "1_3",
"1_4", "1_5", "1_6", "1_7", "1_8", "1_9"), row.names = c(NA,
6L), class = "data.frame")
Nr. 2
> dput(head(tbl_gel2))
structure(list(Name = c("yal003w", "yal005c", "yal012w", "yal016w",
"yal035w", "yal038w"), `2_1` = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), `2_2` = c(1.0548947840373, NA,
NA, NA, NA, NA), `2_3` = c(1.61794716486303, 0.346821796129205,
NA, NA, NA, NA), `2_4` = c(1, NA, NA, 0.378254379051086, NA,
NA), `2_5` = c(0.670710809411423, NA, NA, 1, NA, NA), `2_6` = c(0.313872585645673,
NA, NA, NA, NA, NA), `2_7` = c(0.299293639466945, 0.13920907824675,
NA, NA, NA, NA), `2_8` = c(0.311431376422469, 0.511742245543671,
0.342807141055383, NA, NA, NA), `2_9` = c(0.243672215177189,
1, 0.689138745271004, NA, NA, 0.0540861571772987), `2_10` = c(0.154732102234279,
1.08973258347909, 1, NA, NA, NA), `2_11` = c(0.149365726324845,
1.1210733533474, 1.0427649268992, NA, NA, 0.0955468461925663),
`2_12` = c(0.153741630869067, 2.96276072446013, 1, NA, NA,
NA), `2_13` = c(0.629371115599316, 0.952868912207058, 0.0771105403237483,
NA, NA, 0.0885212695236819), `2_14` = c(0.907644486740723,
1.43000783337778, NA, NA, NA, 0.138102409899801), `2_15` = c(1.09683345304359,
0.423641943213571, NA, NA, NA, 0.255699738225622), `2_16` = c(0.913095779338154,
0.510977400533081, NA, NA, 0.520556617688936, 0.284898552722227
), `2_17` = c(0.935941553863477, 0.388225948821767, NA, NA,
1.14984991998928, 1), `2_18` = c(2.21746156904543, 0.642743615867438,
NA, NA, NA, 2.22716071647178), `2_19` = c(0.500618035526774,
0.282924681750454, NA, NA, NA, 1), `2_20` = c(0.701627311828743,
0.254001731153973, NA, NA, 1, 1.15996914621286), `2_21` = c(1.97359874904275,
NA, NA, NA, 1.67526802494991, 1.38709456754353), `2_22` = c(2.09198896289293,
NA, NA, NA, NA, 0.921672834103247), `2_23` = c(1.18791465369551,
NA, NA, NA, NA, 0.576309066193914), `2_24` = c(0.473199477125101,
0.176144702328764, NA, NA, 1, 0.130236848112641)), .Names = c("Name",
"2_1", "2_2", "2_3", "2_4", "2_5", "2_6", "2_7", "2_8", "2_9",
"2_10", "2_11", "2_12", "2_13", "2_14", "2_15", "2_16", "2_17",
"2_18", "2_19", "2_20", "2_21", "2_22", "2_23", "2_24"), row.names = c(NA,
6L), class = "data.frame")
Nr.3
> dput(head(tbl_gel3))
structure(list(Name = c("yal003w", "yal005c", "yal012w", "yal016w",
"yal035w", "yal038w"), `3_1` = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), `3_2` = c(1, 1.4605309655311,
NA, NA, NA, NA), `3_3` = c(1.74480713727388, 0.42825619952525,
NA, NA, NA, NA), `3_4` = c(1, 0.431712121875013, NA, 0.395182020245312,
NA, NA), `3_5` = c(2.26247329056518, 0.644462177666441, NA, 1,
NA, NA), `3_6` = c(0.619783374266709, 0.472094874244026, NA,
NA, NA, NA), `3_7` = c(0.45731912574756, 0.176354321796083, NA,
NA, NA, NA), `3_8` = c(0.271829278733367, 0.517232771669986,
0.153774052052871, NA, NA, NA), `3_9` = c(0.141017619508583,
1.41279969394534, 0.651948154271122, NA, NA, NA), `3_10` = c(NA,
1.64435171100405, 0.998807430240956, NA, NA, NA), `3_11` = c(0.110046035477971,
1.33684444261939, 1.25595310581771, NA, NA, 0.0236163735479745
), `3_12` = c(NA, 0.982250906830292, 0.39283619985401, NA, NA,
0.0688303458902568), `3_13` = c(0.136798076436642, 0.55729642483448,
0.176525038283566, NA, NA, 0.0251189412372225), `3_14` = c(0.316623893146817,
1, NA, NA, NA, 0.0727823461722849), `3_15` = c(NA, 0.607991038574375,
NA, NA, NA, 0.133968257432001), `3_16` = c(0.362994392402489,
0.547183167896534, NA, NA, NA, 0.0777347708647245), `3_17` = c(1,
0.116561118715651, NA, NA, 0.710972173471528, 1), `3_18` = c(NA,
3.63330458071475, NA, NA, NA, 3.24019081192985), `3_19` = c(NA,
NA, NA, NA, NA, 2.46635222132474), `3_20` = c(0.452303676849426,
0.0896715384025126, NA, NA, 1, 1), `3_21` = c(1.50169299468485,
0.513442106966708, NA, NA, 1.45124841710635, 1.02529618467026
), `3_22` = c(0.565232592993276, 0.748536315065533, NA, NA, 2.9089322117881,
0.782555457293307), `3_23` = c(1.62622280168665, 0.704926586534075,
NA, NA, NA, 0.584486806995139), `3_24` = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_)), .Names = c("Name",
"3_1", "3_2", "3_3", "3_4", "3_5", "3_6", "3_7", "3_8", "3_9",
"3_10", "3_11", "3_12", "3_13", "3_14", "3_15", "3_16", "3_17",
"3_18", "3_19", "3_20", "3_21", "3_22", "3_23", "3_24"), row.names = c(NA,
6L), class = "data.frame")
I used the function below to bind them. There is a different number of rows in each data frame, and in some cases different names, so the final table should have more rows than any one of them.
mylist <- list(tbl_gel1,tbl_gel2,tbl_gel3)
tbl_all <- Reduce(function(x, y) merge(x, y, all=T,by="Name",sort=F),
mylist, accumulate=F)
Everything goes fine until this moment.
Now I want to calculate the mean of each fraction (there are 24 fractions in total)
## Calculating the mean
tbl_all1 <- tbl_all[-1]
ind <- c(1, 25, 49)
tbl_mean <- cbind(tbl_all[1], sapply(0:23, function(i) rowMeans(tbl_all1[ind+i])))
There is something wrong with that function because the sum of many rows gives 0. That's definitely wrong, because tbl_gel1 and the others contain only rows with at least one number in some fraction.
If I take a look at tbl_mean, I see that the rows with a sum of 0 are at the bottom.

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Getting distance between vectors of longitude and latitude points - r

Related

Missing cases while using summarise(across())

Copy value in dataframe up/down x cells

How to loop variable creation and str_replace dynamically in R

Column linking problem on Parent_ID and Extension in R

Calculating the mean of 3 columns in data frame

Categories

Resources