How to join two dataframes containing time-varying variables in R

This seems like a simple enough thing but I can't figure it out nor find an answer online - apologies if it's something obvious. I have two separate dataframes containing the same patients with the same unique identifier. Both datasets have time-varying variables - one continuous and one categorical - and the times of the readings differ between the two sets, but both share a common start point at time 1. I have tried to modify the tmerge function from the survival package but without luck, as I don't have a dichotomous outcome variable nor a single data set with one row per patient.
A reprex for creating the datasets (df1 and df2) is below, along with an example of my desired combined output table for a single patient (ID 3); the output gets very long if done for all 4 patients.
Thanks for any possible help
df1 <- structure(list(tstart = c(1, 1, 1, 1426, 1, 560, 567), tstop = c(2049,
3426, 1426, 1707, 560, 567, 4207), category = structure(c(1L,
1L, 1L, 2L, 1L, 4L, 2L), .Label = c("none", "high", "low", "moderate"
), class = "factor"), id = c(1L, 2L, 3L, 3L, 4L, 4L, 4L)), row.names = c(NA,
-7L), class = c("tbl_df", "tbl", "data.frame"))
df2 <- structure(list(tstart = c(1, 365, 730, 1, 365, 730, 1096, 2557,
1, 365, 730, 1096, 1826, 2557, 3652, 1), tstop = c(365, 730,
1096, 365, 730, 1096, 2557, 2582, 365, 730, 1096, 1826, 2557,
3652, 4864, 365), egfr = c(66, 62, 58, 54, 50, 43, 49, 51, 106,
103, 80, 92, 97, 90, 81, 51), id = c(1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 4L)), row.names = c(NA, -16L), class = c("tbl_df",
"tbl", "data.frame"))
df_example_patient_3 <- structure(list(id = c(3L, 3L, 3L,
3L, 3L, 3L,3L, 3L, 3L), tstart = c(1, 365, 730, 1096, 1426, 1707, 1826, 2557, 3652), tstop = c(365, 730,
1096, 1426, 1707, 1826, 2557, 3652, 4864), egfr = c(106, 103, 80, 92, 92, 92, 97, 90, 81), category = c("none", "none", "none", "none", "high", "high", "high", "high", "high")), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"))
# DF1
tstart tstop category id
<dbl> <dbl> <fct> <int>
1 1 2049 none 1
2 1 3426 none 2
3 1 1426 none 3
4 1426 1707 high 3
5 1 560 none 4
6 560 567 moderate 4
7 567 4207 high 4
# DF2
tstart tstop egfr id
<dbl> <dbl> <dbl> <int>
1 1 365 66 1
2 365 730 62 1
3 730 1096 58 1
4 1 365 54 2
5 365 730 50 2
6 730 1096 43 2
7 1096 2557 49 2
8 2557 2582 51 2
9 1 365 106 3
10 365 730 103 3
11 730 1096 80 3
12 1096 1826 92 3
13 1826 2557 97 3
14 2557 3652 90 3
15 3652 4864 81 3
16 1 365 51 4
# Combined set
id tstart tstop egfr category
<int> <dbl> <dbl> <dbl> <chr>
1 3 1 365 106 none
2 3 365 730 103 none
3 3 730 1096 80 none
4 3 1096 1426 92 none
5 3 1426 1707 92 high
6 3 1707 1826 92 high
7 3 1826 2557 97 high
8 3 2557 3652 90 high
9 3 3652 4864 81 high

I had to do it this way to really work out the details.
First, I construct a full df1 with all the timestamps, including those of df2.
Then I proceed with multiple merges. This is not elegant, but it works:
library(data.table)
library(zoo)
# Proper data.tables
setDT(df1, key = c("id", "tstart"))
setDT(df2, key = c("id", "tstart"))
timestamps_by_id <- unique(rbind(
  df1[, .(id, tstart)],
  df1[, .(id, tstop)],
  df2[, .(id, tstart)],
  df2[, .(id, tstop)],
  use.names = FALSE
))
setorder(timestamps_by_id, id, tstart)
# Merge to construct full df1
df1_full <- df1[timestamps_by_id]
df1_full[, category := na.locf(category), by = id]
df1_full[, tstop := shift(tstart, -1), by = id]
setkey(df1_full, id, tstart)
# Merge with df2
result <- na.omit(df2[df1_full, roll = T])
result[, tstop := i.tstop]
print(result[id == 3, .(id, tstart, tstop, egfr, category)])
Or a more data.table-style solution using the more arcane foverlaps:
library(data.table)
# Proper data.tables
setDT(df1, key = c("id", "tstart", "tstop"))
setDT(df2, key = c("id", "tstart", "tstop"))
# We add an infinite upper range
proper_df1 <- rbind(
  df1,
  df1[, .SD[which.max(tstop)], by = .(id)][, .(id, tstart = tstop, tstop = Inf, category)]
)
setkey(proper_df1, id, tstart, tstop)
overlaps <- foverlaps(df2, proper_df1, type = "any") # Overlap join
overlaps[
  tstart %between% .(i.tstart, i.tstop) & tstart != 1,
  i.tstart := tstart
]
overlaps[tstop %between% .(i.tstart, i.tstop), i.tstop := tstop]
print(overlaps[
  id == 3,
  .(id, "tstart" = i.tstart, "tstop" = i.tstop, category, egfr)
])

This messy dplyr solution seems to work for this particular dataset, but I don't know whether it would work for all datasets; the direction of the fill may need to be altered depending on the particular dataset.
library(tidyverse)
library(magrittr)
df1 %>%
  bind_rows(df2) %>%
  group_by(id) %>%
  arrange(id, tstop) %>%
  mutate(
    tstart = case_when(tstart < lag(tstop) ~ lag(tstop), TRUE ~ tstart)
  ) %>%
  fill(egfr, category, .direction = "updown") %>%
  ungroup() %>%
  filter(id == 3)
tstart tstop category id egfr
<dbl> <dbl> <fct> <int> <dbl>
1 1 365 none 3 106
2 365 730 none 3 103
3 730 1096 none 3 80
4 1096 1426 none 3 92
5 1426 1707 high 3 92
6 1707 1826 high 3 92
7 1826 2557 high 3 97
8 2557 3652 high 3 90
9 3652 4864 high 3 81
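Whichever approach is used, it can be worth sanity-checking the result against the expected table for patient 3. A minimal sketch, assuming the dplyr pipeline above is saved as res (note that category is a factor in the pipeline output but a character column in df_example_patient_3, and the column order differs):
library(dplyr)
# `res` = the result of the dplyr pipeline above (already filtered to id == 3)
check <- res %>%
  select(id, tstart, tstop, egfr, category) %>%
  mutate(category = as.character(category))
# Compare as plain data frames so tibble/grouping attributes don't interfere
# (should return TRUE if the intervals line up)
all.equal(as.data.frame(check), as.data.frame(df_example_patient_3))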

Related

Merge two dataframes: specifically merge a selection of columns based on two conditions?

I have two datasets on the same 2 patients. With the second dataset I want to add new information to the first, but I can't seem to get the code right.
My first (incomplete) dataset has a patient ID, measurement time (either T0 or FU1), year of birth, date of the CT scan, and two outcomes (legs_mass and total_mass):
library(tidyverse)
library(dplyr)
library(magrittr)
library(lubridate)
df1 <- structure(list(ID = c(115, 115, 370, 370), time = structure(c(1L,
6L, 1L, 6L), .Label = c("T0", "T1M0", "T1M6", "T1M12", "T2M0",
"FU1"), class = "factor"), year_of_birth = c(1970, 1970, 1961,
1961), date_ct = structure(c(16651, 17842, 16651, 18535), class = "Date"),
legs_mass = c(9.1, NA, NA, NA), total_mass = c(14.5, NA,
NA, NA)), row.names = c(NA, -4L), class = c("tbl_df", "tbl",
"data.frame"))
# Which gives the following dataframe
df1
# A tibble: 4 x 6
ID time year_of_birth date_ct legs_mass total_mass
<dbl> <fct> <dbl> <date> <dbl> <dbl>
1 115 T0 1970 2015-08-04 9.1 14.5
2 115 FU1 1970 2018-11-07 NA NA
3 370 T0 1961 2015-08-04 NA NA
4 370 FU1 1961 2020-09-30 NA NA
The second dataset adds to the legs_mass and total_mass columns:
df2 <- structure(list(ID = c(115, 370), date_ct = structure(c(17842,
18535), class = "Date"), ctscan_label = c("PXE115_CT_20181107_xxxxx-3.tif",
"PXE370_CT_20200930_xxxxx-403.tif"), legs_mass = c(956.1, 21.3
), total_mass = c(1015.9, 21.3)), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame"))
# Which gives the following dataframe:
df2
# A tibble: 2 x 5
ID date_ct ctscan_label legs_mass total_mass
<dbl> <date> <chr> <dbl> <dbl>
1 115 2018-11-07 PXE115_CT_20181107_xxxxx-3.tif 956. 1016.
2 370 2020-09-30 PXE370_CT_20200930_xxxxx-403.tif 21.3 21.3
What I am trying to do is:
Add the legs_mass and total_mass column values from df2 to df1, based on ID number and date_ct.
Add the new columns of df2 (the ones that are not in df1; here ctscan_label) to df1, also based on the CT date and patient ID.
So that the final dataset df3 looks as follows:
df3 <- structure(list(ID = c(115, 115, 370, 370), time = structure(c(1L,
6L, 1L, 6L), .Label = c("T0", "T1M0", "T1M6", "T1M12", "T2M0",
"FU1"), class = "factor"), year_of_birth = c(1970, 1970, 1961,
1961), date_ct = structure(c(16651, 17842, 16651, 18535), class = "Date"),
legs_mass = c(9.1, 956.1, NA, 21.3), total_mass = c(14.5,
1015.9, NA, 21.3)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
# Corresponding to the following tibble:
# A tibble: 4 x 6
ID time year_of_birth date_ct legs_mass total_mass
<dbl> <fct> <dbl> <date> <dbl> <dbl>
1 115 T0 1970 2015-08-04 9.1 14.5
2 115 FU1 1970 2018-11-07 956. 1016.
3 370 T0 1961 2015-08-04 NA NA
4 370 FU1 1961 2020-09-30 21.3 21.3
I have tried the merge function and rbind from base R, and bind_rows from dplyr, but can't seem to get it right.
Any help?
You can join the two datasets and use coalesce to keep one non-NA value from the two datasets.
library(dplyr)
left_join(df1, df2, by = c("ID", "date_ct")) %>%
mutate(leg_mass = coalesce(legs_mass.x , legs_mass.y),
total_mass = coalesce(total_mass.x, total_mass.y)) %>%
select(-matches('\\.x|\\.y'), -ctscan_label)
# ID time year_of_birth date_ct leg_mass total_mass
# <dbl> <fct> <dbl> <date> <dbl> <dbl>
#1 115 T0 1970 2015-08-04 9.1 14.5
#2 115 FU1 1970 2018-11-07 956. 1016.
#3 370 T0 1961 2015-08-04 NA NA
#4 370 FU1 1961 2020-09-30 21.3 21.3
We can use data.table methods
library(data.table)
setDT(df1)[setDT(df2), c("legs_mass", "total_mass") :=
.(fcoalesce(legs_mass, i.legs_mass),
fcoalesce(total_mass, i.total_mass)), on = .(ID, date_ct)]
Output:
df1
ID time year_of_birth date_ct legs_mass total_mass
1: 115 T0 1970 2015-08-04 9.1 14.5
2: 115 FU1 1970 2018-11-07 956.1 1015.9
3: 370 T0 1961 2015-08-04 NA NA
4: 370 FU1 1961 2020-09-30 21.3 21.3
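Not from the original answers, but with dplyr >= 1.0 the same NA-filling can also be expressed with rows_patch(), which only overwrites missing values. A sketch (ctscan_label is dropped first, since rows_patch() requires every column of the second table to exist in the first):
library(dplyr)
df3 <- rows_patch(
  df1,
  df2 %>% select(ID, date_ct, legs_mass, total_mass),
  by = c("ID", "date_ct")
)
df3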

Divide columns of one dataframe by another

I've got 2 data frames that I'm trying to divide by each other, but it's not working for me. Both dataframes are 8 x 3, with the first column the same in both; the column names are also the same in both data frames.
bal_tier[,c(1, 3:4)]
# A tibble: 8 x 3
# Groups: hierachy_level2 [8]
hierachy_level2 `201804` `201904`
<chr> <dbl> <dbl>
1 CS 239 250
2 FNZ 87 97
3 OPS 1057 1136.
4 P&T 256 279
5 R&A 520 546
6 SPE 130 136.
7 SPP 67 66
8 TUR 46 69
dput(bal_tier[,c(1, 3:4)])
structure(list(hierachy_level2 = c("CS", "FNZ", "OPS", "P&T",
"R&A", "SPE", "SPP", "TUR"), `201804` = c(239, 87, 1057, 256,
520, 130, 67, 46), `201904` = c(250, 97, 1136.5, 279, 546, 136.5,
66, 69)), row.names = c(NA, -8L), groups = structure(list(hierachy_level2 = c("CS",
"FNZ", "OPS", "P&T", "R&A", "SPE", "SPP", "TUR"), .rows = list(
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
tier_leavers[,c(1, 3:4)]
# A tibble: 8 x 3
# Groups: hierachy_level2 [8]
hierachy_level2 `201804` `201904`
<chr> <dbl> <dbl>
1 CS 32 47
2 FNZ 1 11
3 OPS 73 76
4 P&T 48 33
5 R&A 41 33
6 SPE 28 30
7 SPP 10 12
8 TUR 2 3
dput(tier_leavers[,c(1, 3:4)])
structure(list(hierachy_level2 = c("CS", "FNZ", "OPS", "P&T",
"R&A", "SPE", "SPP", "TUR"), `201804` = c(32, 1, 73, 48, 41,
28, 10, 2), `201904` = c(47, 11, 76, 33, 33, 30, 12, 3)), row.names = c(NA,
-8L), groups = structure(list(hierachy_level2 = c("CS", "FNZ",
"OPS", "P&T", "R&A", "SPE", "SPP", "TUR"), .rows = list(1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"), .drop = FALSE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
Doing this gives me what I want:
bal_tier[,1]
# A tibble: 8 x 1
# Groups: hierachy_level2 [8]
hierachy_level2
<chr>
1 CS
2 FNZ
3 OPS
4 P&T
5 R&A
6 SPE
7 SPP
8 TUR
(tier_leavers[,c(3:4)] / bal_tier[,c(3:4)])
201804 201904
1 0.13389121 0.18800000
2 0.01149425 0.11340206
3 0.06906339 0.06687198
4 0.18750000 0.11827957
5 0.07884615 0.06043956
6 0.21538462 0.21978022
7 0.14925373 0.18181818
8 0.04347826 0.04347826
but when I combine it in a cbind I end up with this:
cbind(bal_tier[,1], tier_leavers[,c(3:4)] / bal_tier[,c(3:4)])
[,1] [,2]
201804 Character,8 Numeric,8
201904 Character,8 Numeric,8
What am I misunderstanding here?
Here's a solution using the tidyverse:
library(dplyr)
# Simulated data standing in for the question's tables
nme <- c("A","B","C","D","E")
yr_1 <- round(10*runif(n=5,min=0,max=10),0)
yr_2 <- round(10*runif(n=5,min=0,max=10),0)
data_1 <- data.frame(nme,yr_1,yr_2)
yr_1 <- round(10*runif(n=5,min=0,max=10),0)
yr_2 <- round(10*runif(n=5,min=0,max=10),0)
data_2 <- data.frame(nme,yr_1,yr_2)
data_divide <- data_1 %>%
  left_join(data_2, by = "nme") %>%
  mutate(
    result_1 = yr_1.x / yr_1.y,
    result_2 = yr_2.x / yr_2.y
  )
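The same idea applied to the data in the question might look like this (a sketch, not tested on the real data; the year columns start with digits so they need backticks, and the grouping is dropped first):
library(dplyr)
bal_tier %>%
  ungroup() %>%
  left_join(ungroup(tier_leavers), by = "hierachy_level2",
            suffix = c("_bal", "_leavers")) %>%
  transmute(
    hierachy_level2,
    `201804` = `201804_leavers` / `201804_bal`,
    `201904` = `201904_leavers` / `201904_bal`
  )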
What I ended up doing feels like cheating but I got a clue from Zeus's answer:
a <- bal_tier[, 1]
b <- tier_leavers[,c(3:4)] / bal_tier[,c(3:4)]
tier_to <- data.frame(a, b)
tier_to
> tier_to
hierachy_level2 X201804 X201904
1 CS 0.13389121 0.18800000
2 FNZ 0.01149425 0.11340206
3 OPS 0.06906339 0.06687198
4 P&T 0.18750000 0.11827957
5 R&A 0.07884615 0.06043956
6 SPE 0.21538462 0.21978022
7 SPP 0.14925373 0.18181818
8 TUR 0.04347826 0.04347826
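As a side note on the column names: base data.frame() repairs them to syntactic names (hence X201804), while dplyr::bind_cols() keeps them as-is, so an alternative to the cbind call might be (a sketch):
library(dplyr)
bind_cols(bal_tier[, 1], tier_leavers[, 3:4] / bal_tier[, 3:4])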

Subsetting rows based on multiple columns using data.table - fastest way

I was wondering if there was a more elegant, less clunky and faster way to do this. I have millions of rows of clinical data with ICD coding. A short example is provided below. I want to subset the dataset based on either of the columns meeting a specific set of diagnosis codes. The code below works but takes ages in R, and I was wondering if there is a faster way.
dat <- structure(list(eid = 1:10, mc1 = structure(c(4L, 3L, 5L, 2L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("345", "410", "413.9", "I20.1",
"I23.4"), class = "factor"), oc1 = c(350, 323, 12, 35, 413.1,
345, 345, 345, 345, 345), oc2 = structure(c(5L, 6L, 4L, 1L, 1L,
2L, 2L, 2L, 3L, 2L), .Label = c("", "345", "I20.3", "J23.6",
"K50.1", "K51.4"), class = "factor")), .Names = c("eid", "mc1",
"oc1", "oc2"), class = c("data.table", "data.frame"), row.names = c(NA,
-10L))
The code below subsets all rows where any of the columns match either "I20" or "413" (this includes all codes that have, for example, been coded as "I20.4" or "413.9", etc.).
dat2 <- dat[substr(dat$mc1, 1, 3) == "413" |
            substr(dat$oc1, 1, 3) == "413" |
            substr(dat$oc2, 1, 3) == "413" |
            substr(dat$mc1, 1, 3) == "I20" |
            substr(dat$oc1, 1, 3) == "I20" |
            substr(dat$oc2, 1, 3) == "I20"]
Is there a faster way to do this? For example, can I loop through each of the columns looking for the specific codes "I20" or "413" and subset those rows?
We can specify the columns of interest in .SDcols, loop through the Subset of Data.table (.SD), get the first 3 characters with substr, check whether it is %in% a vector of values and Reduce it to a single logical vector for subsetting the rows
dat[dat[,Reduce(`|`, lapply(.SD, function(x)
substr(x, 1, 3) %chin% c('413', 'I20'))), .SDcols = 2:4]]
# eid mc1 oc1 oc2
#1: 1 I20.1 350.0 K50.1
#2: 2 413.9 323.0 K51.4
#3: 5 345 413.1
#4: 9 345 345.0 I20.3
For larger data it could help if we don't check all rows:
minem <- function(dt, colsID = 2:4) {
  cols <- colnames(dt)[colsID]
  x <- c('413', 'I20')
  set(dt, j = "inn", value = FALSE)
  for (i in cols) {
    dt[inn == FALSE, inn := substr(get(i), 1, 3) %chin% x]
  }
  dt[inn == TRUE][, inn := NULL][]
}
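The akrun() function and the dts object used in the benchmark below are not defined in the answer; presumably akrun() is the first (Reduce/lapply) approach wrapped as a function and dts is the example data from the question as a data.table. A sketch of that assumed setup:
library(data.table)
# Assumed: `dat` is the example data from the question
dts <- as.data.table(dat)
# Assumed wrapper around the first approach
akrun <- function(dt, colsID = 2:4) {
  dt[dt[, Reduce(`|`, lapply(.SD, function(x)
    substr(x, 1, 3) %chin% c("413", "I20"))), .SDcols = colsID]]
}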
n <- 1e7
set.seed(13)
dt <- dts[sample(.N, n, replace = T)]
dt <- cbind(dt, dts[sample(.N, n, replace = T), 2:4])
setnames(dt, make.names(colnames(dt), unique = T))
dt
# eid mc1 oc1 oc2 mc1.1 oc1.1 oc2.1
# 1: 8 345 345.0 345 345 345 345
# 2: 3 I23.4 12.0 J23.6 413.9 323 K51.4
# 3: 4 410 35.0 413.9 323 K51.4
# 4: 1 I20.1 350.0 K50.1 I23.4 12 J23.6
# 5: 10 345 345.0 345 345 345 345
# ---
# 9999996: 3 I23.4 12.0 J23.6 I20.1 350 K50.1
# 9999997: 5 345 413.1 I20.1 350 K50.1
# 9999998: 4 410 35.0 345 345 345
# 9999999: 4 410 35.0 410 35
# 10000000: 10 345 345.0 345 345 345 I20.3
system.time(r1 <- akrun(dt, 2:ncol(dt))) # 22.88 sec
system.time(r2 <- minem(dt, 2:ncol(dt))) # 17.72 sec
all.equal(r1, r2)
# [1] TRUE

New data.table columns based on grouping and a function of multiple columns

Let's say I have a data.frame
sample_df = structure(list(AE = c(148, 1789, 1223, 260, 1825, 37, 1442, 484,
10, 163, 1834, 254, 445, 837, 721, 1904, 1261, 382, 139, 213),
FW = structure(c(1L, 3L, 2L, 3L, 3L, 1L, 2L, 3L, 2L, 2L,
3L, 2L, 3L, 2L, 1L, 3L, 1L, 1L, 1L, 3L), .Label = c("LYLR",
"OCXG", "BIYX"), class = "factor"), CP = c("WYB/NXO", "HUK/NXO",
"HUK/WYB", "HUK/NXO", "WYB/NXO", "HUK/WYB", "HUK/NXO", "HUK/NXO",
"WYB/NXO", "HUK/NXO", "WYB/NXO", "HUK/NXO", "HUK/WYB", "WYB/NXO",
"HUK/WYB", "WYB/NXO", "WYB/NXO", "HUK/WYB", "WYB/NXO", "WYB/NXO"
), SD = c(1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, -1,
-1, 1, -1, 1, 1, 1)), .Names = c("AE", "FW", "CP", "SD"), row.names = c(NA, -20L), class = "data.frame")
Or in human readable format:
AE FW CP SD
1 148 LYLR WYB/NXO 1
2 1789 BIYX HUK/NXO 1
3 1223 OCXG HUK/WYB -1
4 260 BIYX HUK/NXO 1
5 1825 BIYX WYB/NXO 1
6 37 LYLR HUK/WYB 1
7 1442 OCXG HUK/NXO 1
8 484 BIYX HUK/NXO -1
9 10 OCXG WYB/NXO 1
10 163 OCXG HUK/NXO 1
11 1834 BIYX WYB/NXO -1
12 254 OCXG HUK/NXO -1
13 445 BIYX HUK/WYB 1
14 837 OCXG WYB/NXO -1
15 721 LYLR HUK/WYB -1
16 1904 BIYX WYB/NXO 1
17 1261 LYLR WYB/NXO -1
18 382 LYLR HUK/WYB 1
19 139 LYLR WYB/NXO 1
20 213 BIYX WYB/NXO 1
Now suppose that for each unique value (fw, cp) of (FW, CP), I would like to get:
the sum of all values of AE for (FW, CP) = (fw, cp)
the mean of all values of SD for (FW, CP) = (fw, cp)
In R, one could do something like:
unique_keys <- unique(sample_df[, c('FW', 'CP')])
slow_version <- function(ind, sample_df, unique_keys){
  index <- which(sample_df$FW == unique_keys$FW[ind] & sample_df$CP == unique_keys$CP[ind])
  c(ind = ind,
    sum_ae = sum(sample_df$AE[index]),
    mean_sd = mean(sample_df$SD[index]))
}
intermed_result <- t(sapply(1:nrow(unique_keys), slow_version,
                            sample_df = sample_df,
                            unique_keys = unique_keys))
colnames(intermed_result) <- c('ind', 'sum', 'mean')
result <- data.frame(unique_keys[intermed_result[, 'ind'], ],
                     'sum' = intermed_result[, 'sum'],
                     'mean' = intermed_result[, 'mean'])
but this gets pretty slow as the size of sample_df grows.
Thanks to this answer, I suspect it is possible to use data.table magic to get the same result much faster. But doing:
library(data.table)
sample_dt = data.table(sample_df)
setkey(sample_dt, FW, CP)
f <- function(AE, SD) {list('sum' = sum(AE), 'mean' = mean(SD))}
sample_dt[,c("col1","col2"):=f(AE, SD), by=.(FW, CP)][]
does not yield the desired result. What is the correct way?
I would try:
library(data.table)
sample_dt = data.table(sample_df)
setkey(sample_dt, FW, CP)
f <- function(AE, SD) {list('sum' = sum(AE), 'mean' = mean(SD))}
sample_dt[, f(AE, SD), by=.(FW, CP)]
# FW CP sum mean
# 1: LYLR HUK/WYB 1140 0.3333333
# 2: LYLR WYB/NXO 1548 0.3333333
# 3: OCXG HUK/NXO 1859 0.3333333
# 4: OCXG HUK/WYB 1223 -1.0000000
# 5: OCXG WYB/NXO 847 0.0000000
# 6: BIYX HUK/NXO 2533 0.3333333
# 7: BIYX HUK/WYB 445 1.0000000
# 8: BIYX WYB/NXO 5776 0.5000000
You didn't get the desired output because := assigns the resulting sum and mean columns by group back to the original data.table. However, I also prefer the syntax suggested by Frank, which should be the right way to go. For the current named-list approach, adding verbose = TRUE shows:
Making each group and running j (GForce FALSE) ... The result of j is
a named list. It's very inefficient to create the same names over and
over again for each group. When j=list(...), any names are detected,
removed and put back after grouping has completed, for efficiency.
Using j=transform(), for example, prevents that speedup (consider
changing to :=). This message may be upgraded to warning in future.
When we have many groups and the functions in j are basic functions like sum and mean, using
sample_dt[, .(sum.AE = sum(AE), mean.SD = mean(SD)), by = .(FW, CP)]
would be very fast, because those functions are replaced with GForce functions like gmean internally. See ?GForce and Frank's benchmark for more information.
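For contrast, if the goal really were to keep one row per original row and just attach the per-group aggregates as new columns (which is what the original := attempt does), a sketch would be:
library(data.table)
sample_dt <- data.table(sample_df)
# := adds the aggregates as columns, keeping all 20 original rows
sample_dt[, c("sum.AE", "mean.SD") := .(sum(AE), mean(SD)), by = .(FW, CP)]
head(sample_dt)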

How to draw multiple lines in R under leaflet?

I am having trouble drawing multiple lines in R using leaflet. I have a base map of New York City stations and would like to add more information from the existing data set. The data set has the columns start_lng, start_lat, end_lng, end_lat, and total_trip. For each row, I would like to draw a line that connects the start point and the end point, so that the two stations are connected, representing a trip - one line per row. Also, for coloring, the darkness of each line segment should be based on total_trip. How would I be able to do that? Thanks.
leaflet(sample) %>%
addTiles() %>%
setView(-73.9,40.7, zoom = 11) %>%
addCircles(data = master_stations,lng = ~long, lat = ~lat, weight = 1, popup = ~name)
Here's part of my data set:
start.station.id start.station.longitude start.station.latitude end.station.longitude end.station.latitude total_trip
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 72 -73.99393 40.76727 -74.00859 40.73620 2
2 72 -73.99393 40.76727 -73.99074 40.73455 2
3 72 -73.99393 40.76727 -73.97722 40.76341 2
4 72 -73.99393 40.76727 -73.98192 40.76527 2
5 79 -74.00667 40.71912 -73.98163 40.75206 2
6 79 -74.00667 40.71912 -73.98658 40.75514 2
7 79 -74.00667 40.71912 -73.98317 40.75527 2
8 79 -74.00667 40.71912 -73.98722 40.75300 2
9 83 -73.97632 40.68383 -73.97493 40.68981 4
10 83 -73.97632 40.68383 -73.98657 40.70149 2
# ... with 899 more rows
This is the full data set:
structure(list(start.station.id = c(72, 72, 72, 72, 79, 79),
end.station.id = c(238, 285, 352, 468, 153, 465), total_trip = c(2L,
2L, 2L, 2L, 2L, 2L), start.station.name = c("\"W 52 St & 11 Ave\"",
"\"W 52 St & 11 Ave\"", "\"W 52 St & 11 Ave\"", "\"W 52 St & 11 Ave\"",
"\"Franklin St & W Broadway\"", "\"Franklin St & W Broadway\""
), start.station.longitude = c(-73.99392888, -73.99392888,
-73.99392888, -73.99392888, -74.00666661, -74.00666661),
start.station.latitude = c(40.76727216, 40.76727216, 40.76727216,
40.76727216, 40.71911552, 40.71911552), end.station.name = c("\"Bank St & Washington St\"",
"\"Broadway & E 14 St\"", "\"W 56 St & 6 Ave\"", "\"Broadway & W 55 St\"",
"\"E 40 St & 5 Ave\"", "\"Broadway & W 41 St\""), end.station.longitude = c(-74.00859207,
-73.99074142, -73.97722479, -73.98192338, -73.9816324043,
-73.98658032), end.station.latitude = c(40.7361967, 40.73454567,
40.76340613, 40.7652654, 40.752062307, 40.75513557)), .Names = c("start.station.id",
"end.station.id", "total_trip", "start.station.name", "start.station.longitude",
"start.station.latitude", "end.station.name", "end.station.longitude",
"end.station.latitude"), row.names = c(NA, -6L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), vars = list(start.station.id), drop = TRUE, indices = list(
0:3, 4:5), group_sizes = c(4L, 2L), biggest_group_size = 4L, labels = structure(list(
start.station.id = c(72, 79)), row.names = c(NA, -2L), class = "data.frame", vars = list(
start.station.id), drop = TRUE, .Names = "start.station.id"))
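No answer is reproduced here, but one possible approach (a sketch, not from the original thread) is to add one polyline per row and map total_trip to a sequential palette with colorNumeric(); below, trips stands for the trip-level data shown above:
library(leaflet)
# `trips` = the trip-level data above (one start/end station pair per row)
pal <- colorNumeric("Blues", domain = trips$total_trip)
m <- leaflet() %>%
  addTiles() %>%
  setView(-73.9, 40.7, zoom = 11)
# One line segment per row, shaded by total_trip
for (i in seq_len(nrow(trips))) {
  m <- addPolylines(
    m,
    lng = c(trips$start.station.longitude[i], trips$end.station.longitude[i]),
    lat = c(trips$start.station.latitude[i], trips$end.station.latitude[i]),
    color = pal(trips$total_trip[i]),
    weight = 2,
    opacity = 0.8
  )
}
m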
