Efficient way to join with timeintervals in R - r

I have a table (1) like this:
START;END;CATEGORY
20.05.2017 19:23:00;20.05.2017 19:27:00;A
20.05.2017 19:27:00;20.05.2017 19:32:00;B
20.05.2017 19:32:00;20.05.2017 19:38:00;A
and a table (2) like this:
TIMESTAMP;VALUES
20.05.2017 19:24:09;323
20.05.2017 19:23:12;2322
20.05.2017 19:27:55;23333
20.05.2017 19:36:12;123123
Now I want to join the category from table 1 to table 2. The key are the timstamps. If the TIMESTAMP from table 2 is between START and END of table1 add category. I want basically a table like this:
TIMESTAMP;VALUES;CATEGORY
20.05.2017 19:24:09;323;A
20.05.2017 19:23:12;2322;A
20.05.2017 19:27:55;23333;B
20.05.2017 19:36:12;123123;B
These are my tries but they aren't efficient:
I)
for(j in seq(dim(table1)[1])){
for(i in seq(dim(table2)[1])){
table2[table2$TIMESTAMP[i]>=table1$START[j] & table2$TIMESTAMP[i]<=table1$END[j]] <- table1$CATEGORY[j]
}
II)
mapped_df <- data.frame()
for(i in seq(dim(table1)[1])){
start <- as.POSIXct(table1$START[i])
end <- as.POSIXct(table1$END[i])
cat <- table1$CATEGORY[i]
mapped_df <- rbind(mapped_df, data.frame(TIMESTAMP=seq(from=start, by=1, to=end), CATEGORY=cat))
}
merge(table2 , mapped_df)
Thanks in advance!

I have a preference for using SQL to do this. The sqldf package comes in handy.
Table1 <-
structure(
list(START = structure(c(1495322580, 1495322820, 1495323120),
class = c("POSIXct", "POSIXt"),
tzone = ""),
END = structure(c(1495322820, 1495323120, 1495323480),
class = c("POSIXct", "POSIXt"),
tzone = ""),
CATEGORY = c("A", "B", "A")),
class = "data.frame",
.Names = c("START", "END", "CATEGORY"),
row.names = c(NA, -3L)
)
Table2 <-
structure(
list(TIMESTAMP = structure(c(1495322649, 1495322592, 1495322875, 1495323372),
class = c("POSIXct", "POSIXt"),
tzone = ""),
VALUES = c(323L, 2322L, 23333L, 123123L)),
class = "data.frame",
.Names = c("TIMESTAMP", "VALUES"),
row.names = c(NA, -4L))
library(sqldf)
sqldf("SELECT T2.TIMESTAMP, T2.[VALUES], T1.CATEGORY
FROM Table2 T2
LEFT JOIN Table1 T1
ON T2.TIMESTAMP > T1.START AND T2.TIMESTAMP < T1.END")

Related

Populate a new column in one table based on start and end dates in another table

I have a larger data table (called raw.data) and a smaller one (called balldrop.times) listing the start and end times of an event.
I want to create a new column in the larger data table that will fill up the times between the event start and end date that are located in the smaller table. The times that aren't between the event start/end time can be labeled something else, it doesn't really matter.
#the dput of the smaller table
> dput(balldrop.times)
structure(list(Stage = 6:14,
BallStart = structure(c(1635837081, 1635847841, 1635856675, 1635866152, 1635878326, 1635886132, 1635895547, 1635902934, 1635911136), tzone = "", class = c("POSIXct", "POSIXt")),
BallEnd = structure(c(1635837364, 1635848243, 1635857005, 1635866475, 1635878704, 1635886465, 1635895905, 1635903786, 1635911457), tzone = "", class = c("POSIXct", "POSIXt"))),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -9L))
#here is part of the larger table just in case
> dput(head(raw.data, 5))
structure(list(DateTime = structure(c(1635825603.6576, 1635825604.608, 1635825605.6448, 1635825606.6816, 1635825607.632), class = c("POSIXct", "POSIXt"), tzone = "GMT"),
Press.Well = c(1154.2561461, 1154.0308849, 1149.7247783, 1152.0544566, 1155.7363779),
row.names = c(NA, -5L),
class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x0000020725b51ef0>)
My desired output is something like the following, with "Event Active" only for the times between the listed DateTime vales in the balldrop.times table:
DateTime
Press.Well
Event Status
2021-11-02 02:11:20
10
Event Not Active
2021-11-02 02:11:21
10
Event Active
2021-11-02 02:11:22
15
Event Active
...
...
...
2021-11-02 02:16:04
25
Event Active
2021-11-02 02:16:05
30
Event Not Active
I am thinking I can use mutate() to create a new column in the raw.data table and set conditions for the DateTime, but I am not sure how to do this for multiple separate start/end DateTimes.
Any help would be appericated. Thank you.
Your code isn't working. Neither do the times in your example table correspond with the ones in your expected output.
tmp <- structure(list(Stage = 6:14,
BallStart = structure(c(1635837081, 1635847841, 1635856675, 1635866152, 1635878326, 1635886132, 1635895547, 1635902934, 1635911136), tzone = "", class = c("POSIXct", "POSIXt")),
BallEnd = structure(c(1635837364, 1635848243, 1635857005, 1635866475, 1635878704, 1635886465, 1635895905, 1635903786, 1635911457), tzone = "", class = c("POSIXct", "POSIXt"))
),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -9L))
tmp1 <- structure(list(DateTime = structure(c(1635825603.6576, 1635825604.608, 1635825605.6448, 1635825606.6816, 1635825607.632), class = c("POSIXct", "POSIXt"), tzone = "GMT"),
Press.Well = c(1154.2561461, 1154.0308849, 1149.7247783, 1152.0544566, 1155.7363779) ), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -5L))
So note this isn't a clean solution.
tmp1 %>%
mutate(`Event Status` = case_when(
DateTime >= (tmp[1,] %>% pull(BallStart)) & DateTime <= (tmp[1,] %>% pull(BallEnd)) ~ "Event Active",
DateTime >= (tmp[2,] %>% pull(BallStart)) & DateTime <= (tmp[2,] %>% pull(BallEnd)) ~ "Event Active",
DateTime >= (tmp[3,] %>% pull(BallStart)) & DateTime <= (tmp[3,] %>% pull(BallEnd)) ~ "Event Active",
DateTime >= (tmp[4,] %>% pull(BallStart)) & DateTime <= (tmp[4,] %>% pull(BallEnd)) ~ "Event Active",
DateTime >= (tmp[5,] %>% pull(BallStart)) & DateTime <= (tmp[5,] %>% pull(BallEnd)) ~ "Event Active",
TRUE ~ "Event Not Active"
))
Because you want to compare multiple conditions, case_when is the preferred option rather than ifelse. With that I compare it to every row in your reference table.
Now, like said it isn't a clean solution as you have many rows to specify it. With a bigger reference table to check the code will increase exponentionally. But you can clean it up into a function.

Find value of a row by comparing two columns and a value with a range of a different dataset

I have 2 different datasets. One with an object that comes from a StationX and goes to StationY and arrives at a specific date and time as the following.
df1<-structure(list(From = c("Station1", "Station5", "Station6", "Station10"), To = c("Station15", "Station2", "Station2", "Station7"),
Arrival = structure(c(971169720, 971172720, 971178120, 971179620), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA, -4L),class = c("tbl_df","tbl", "data.frame"))
In the Dataset2 are e.g. trucks which wait for the specific object at StationY between the time&date "Arrival" and "Departure" and leave at "Departure to a specifc region "TOID".
As in the following:
df2<-structure(list(TOID = c(2, 4, 7, 20), Station = c("Station15",
"Station2", "Station2","Station7"), Arrival = structure(c(971169600, 971172000, 971177700, 971179500), class = c("POSIXct", "POSIXt"), tzone = "UTC"), Departure1 = structure(c(971170200, 971173200, 971178600, 971179800), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"))
I want to look for the TOID in Dataset2 and add it to Dataset1 if "TO"(Dataset1)="Station"(Dataset2) and "Arrival"(Dataset2)<="Arrival"(Dataset1)<="Departure"(Dataset2) and has therefore the following outcome:
df1outcome<-structure(list(From = c("Station1", "Station5", "Station6", "Station10"
), To = c("Station15", "Station2", "Station2", "Station7"), `TO_ID` = c(2, 4, 7, 20), Arrival = structure(c(971169720, 971172720, 971178120, 971179620), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"))
I need a solution which looks in dataset2 for the ID that matches the conditions regardless the roworder.
Would be awesome if you guys could help me how to code this in R.
Best,
J
Perhaps you could use tidyverse, use a left_join based on the station, and then filter based on dates:
library(tidyverse)
df1 %>%
left_join(df2, by = c("To" = "Station"), suffix = c("1","2")) %>%
filter(Arrival1 >= Arrival2 & Arrival1 <= Departure1) %>%
select(-c(Arrival2, Departure1))
# A tibble: 4 x 4
From To Arrival1 TOID
<chr> <chr> <dttm> <dbl>
1 Station1 Station15 2000-10-10 09:22:00 2
2 Station5 Station2 2000-10-10 10:12:00 4
3 Station6 Station2 2000-10-10 11:42:00 7
4 Station10 Station7 2000-10-10 12:07:00 20
Im pretty new to R, so this code is probably longer then it should be. But does this work?
#renaming variables so its easier to merge the objects and to compare them
df1 <- df1 %>% rename(Arrival_Package = Arrival)
df2 <- df2 %>% rename(Arrival_Truck = Arrival)
#merge objects
df1outcome <- merge(df1, df2, by.x = "To", by.y = "Station")
#subset from object and select relevant columns
df1outcome <- subset(df1outcome, Arrival_Package <= Departure1)
df1outcome <- subset(df1outcome, Arrival_Truck <= Arrival_Package)
df1outcome <- df1outcome %>% select(From, To, TOID, Arrival_Package)

lapply date format to many dataframes

I'm having difficulties cycling through a list and apply the same format to the same variable in many data frames:
df1 <- structure(list(datetime = structure(c(1446336120, 1446336180,
1446336240, 1446336300, 1446336360), class = c("POSIXct", "POSIXt"
), tzone = "UTC")), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"), .Names = "datetime")
df2 <- structure(list(datetime = structure(c(1446336120, 1446336180,
1446336240), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame"), .Names = "datetime")
I want to apply the same format to all dataframes:
df1$datetime <- format(df1$datetime, "%m/%d/%Y %H:%M:%S")
df2$datetime <- format(df2$datetime, "%m/%d/%Y %H:%M:%S")
I tried this:
list_df <- mget(ls(pattern = "df"))
lapply(seq_along(list_df),
function(i) format(list_df[[i]]$datetime, "%m/%d/%Y %H:%M:%S"))
but not sure how to assign it back to each dataframe.
I think your current approach is not far off, but you never make an assignment back to the data frame. Add each data frame to a list and then use lapply:
lst <- list(df1, df2)
lapply(lst, function(x) {
x$datetime <- format(x$datetime, "%m/%d/%Y %H:%M:%S")
return(x)
})
At this point you have a list of data frames in the format you want. If you then later wanted to export each data frame to a CSV file, you could try this:
for (i in 1:length(lst)) {
filename <- paste0("out", i, ".csv")
write.csv(lst[[i]], file=filename)
}
library(dplyr)
lapply(list_df, function(x) x %>%
mutate(datetime=format(datetime, "%m/%d/%Y %H:%M:%S")))

optimisation of a condition for loop in r

I have 2 datasets, one of which contains measurements of temperature at 30 min intervals
ordered.temp<-structure(list(time = structure(c(1385244720, 1385246520, 1385248320,
1385250120, 1385251920, 1385253720, 1385255520, 1385257320, 1385259120,
1385260920), class = c("POSIXct", "POSIXt"), tzone = ""), temp = c(30.419,
29.34, 28.965, 28.866, 28.891, 28.866, 28.692, 28.419, 28.122,
27.85), hoboID = c(2392890L, 2392890L, 2392890L, 2392890L, 2392890L,
2392890L, 2392890L, 2392890L, 2392890L, 2392890L)), .Names = c("time",
"temp", "hoboID"), row.names = c(NA, 10L), class = "data.frame")
, the other I created to be able to assign each measurement into 3-hour bins
df<-structure(list(start = structure(c(1385182800, 1385193600, 1385204400,
1385215200, 1385226000, 1385236800, 1385247600, 1385258400, 1385269200,
1385280000), class = c("POSIXct", "POSIXt"), tzone = ""), end = structure(c(1385193600,
1385204400, 1385215200, 1385226000, 1385236800, 1385247600, 1385258400,
1385269200, 1385280000, 1385290800), class = c("POSIXct", "POSIXt"
), tzone = ""), b = 1:10), .Names = c("start", "end", "b"), row.names = c(NA,
10L), class = "data.frame")
For simplicity, I created a subset of the data, but in reality the temp dataframe is 460k rows long and growing bigger every year. I wrote a for loop to compare each line in temp with lines in bin and assign it the corresponding b value from the bin dataframe.
m <- length(ordered.temp$time)
b <- numeric(m)
n <- length(df$start)
for (i in 1:m){
for (j in 1:n){
if (df$start[j] < ordered.temp$time[i] & ordered.temp$time[i] <= df$end[j]){
b[i] <- df$b[j]
print(i/dim(ordered.temp)[1]*100)
}
}
}
Running this loop with 460k rows takes a very long time (i ran the loop for 1 minute and calculated that it would take ±277 hours to complete it. Therefore, it is imperative to speed this loop up or find alternative methods if this is not possible. I however have no idea how I achieve the desired result. Any help would be greatly appreciated. thanks.

xts merge memory performance

I am trying to improve the memory performance for the following example:
basline df with 4 rows
df <- structure(list(sessionid = structure(c(1L, 2L, 3L, 4L), .Label =
c("AAA1", "AAA2","AAA3", "AAA4"), class = "factor"), bitrateinbps = c(10000000,
10000000, 10000000, 10000000), startdate = structure(c(1326758507, 1326758671,
1326759569, 1326760589), class = c("POSIXct", "POSIXt"), tzone = ""), enddate =
structure(c(1326765780, 1326758734, 1326760629, 1326761592), class = c("POSIXct",
"POSIXt"), tzone = "")), .Names = c("sessionid", "bitrateinbps", "startdate",
"enddate"), row.names = c(NA, 4L), class =
"data.frame")
alternate df with 8 rows
df <- structure(list(sessionid = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L),
.Label = c("AAA1", "AAA2", "AAA3", "AAA4", "AAA5", "AAA6", "AAA7", "AAA8"),
class = "factor"), bitrateinbps =c(10000000, 10000000, 10000000, 10000000,
10000000, 10000000, 10000000, 10000000), startdate = structure(c(1326758507,
1326758671, 1326759569, 1326760589, 1326761589, 1326762589, 1326763589, 1326764589),
class = c("POSIXct",
"POSIXt"), tzone = ""), enddate = structure(c(1326765780, 1326758734, 1326760629,
1326761592, 1326767592,
1326768592, 1326768700, 1326769592), class = c("POSIXct", "POSIXt"), tzone = "")),
.Names = c("sessionid",
"bitrateinbps", "startdate", "enddate"), row.names = c(NA, 8L), class =
"data.frame")
try df analysis memory usage and again for alternate df
library(xts)
fun0 <- function(i, d) {
idx0 <- seq(d$startdate[i],d$enddate[i],1) # create sequence for index
dat0 <- rep(1,length(idx0)) # create data over sequence
xts(dat0, idx0, dimnames=list(NULL,d$sessionid[i])) # xts object
}
# loop over each row and put each row into its own xts object
xl0 <- lapply(1:NROW(df), fun0, d=df)
# merge all the xts objects
xx0 <- do.call(merge, xl0)
# apply a function (e.g. colMeans) to each 15-minute period
xa0 <- period.apply(xx0, endpoints(xx0, 'minutes', 15), colSums, na.rm=TRUE)/900
xa1 <- t(xa0)
# convert from atomic vector to data frame
xa1 = as.data.frame(xa1)
# bind to df
out1 = cbind(df, xa1)
# print aggregate memory usage statistics
print(paste('R is using', memory.size(), 'MB out of limit', memory.limit(), 'MB'))
# create function to return matrix of memory consumption
object.sizes <- function()
{
return(rev(sort(sapply(ls(envir=.GlobalEnv), function (object.name)
object.size(get(object.name))))))
}
# print to console in table format
object.sizes()
results as follows:
4 row df:
xx0 = 292104 Bytes .... do.call(merge, xl0)
xl0 = 154648 Bytes .... lapply(1:NROW(df), fun0, d=df)
8 row df:
xx0 = 799480 Bytes .... do.call(merge, xl0)
xl0 = 512808 Bytes .... lapply(1:NROW(df), fun0, d=df)
I'm looking for something a little more memory efficient for the merge and lapply functions, so I can scale out the number of rows, if anyone has any suggestions and can show the comparative results for alternatives.

Resources