time average for specific time range in r - r

I am trying to extract average values of all variables between 0 to 40 minutes every hour.
dput(head(df))
structure(list(DateTime = structure(c(1563467460, 1563468060,
1563468660, 1563469260, 1563469860, 1563470460), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), date = structure(c(1563467460, 1563468060,
1563468660, 1563469260, 1563469860, 1563470460), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), Date = structure(c(18095, 18095, 18095,
18095, 18095, 18095), class = "Date"), TimeCtr = structure(c(1563467460,
1563468060, 1563468660, 1563469260, 1563469860, 1563470460), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), MassConc = c(0.397627, 0.539531, 0.571902,
0.608715, 0.670382, 0.835773), VolConc = c(175.038, 160.534,
174.386, 183.004, 191.074, 174.468), NumbConc = c(234.456, 326.186,
335.653, 348.996, 376.018, 488.279), MassD = c(101.426, 102.462,
101.645, 102.145, 101.255, 101.433)), .Names = c("DateTime",
"date", "Date", "TimeCtr", "MassConc", "VolConc", "NumbConc",
"MassD"), row.names = c(NA, 6L), class = "data.frame")
What I've tried so far..
hourly_mean<-mydata %>%
filter(between(as.numeric(format(DateTime, "%M")), 0, 40)) %>%
group_by(DateTime=format(DateTime, "%Y-%m-%d %H")) %>%
summarise(variable1_mean=mean(variable1))
But it gives me a single average value for the whole period. Any help is very much welcomed.

We can convert DateTime , use ceiling_date with hourly unit to round Datetime, extract minutes from DateTime and filter to keep minutes which are less than 40, group_by hour and take mean of values.
library(lubridate)
library(dplyr)
df %>%
dplyr::mutate(DateTime = ymd_hm(DateTime),
hour = ceiling_date(DateTime, "hour"),
minutes = minute(DateTime)) %>%
filter(minutes <= 40) %>%
group_by(hour) %>%
summarise_at(vars(ends_with("Conc")), mean)
data
df <- structure(list(DateTime = structure(1:7, .Label = c("2019-08-0810:07",
"2019-08-0810:17", "2019-08-0810:27", "2019-08-0810:37", "2019-08-0810:47",
"2019-08-0810:57", "2019-08-0811:07"), class = "factor"), MassConc = c(0.556398,
1.06868, 0.777654, 0.87289, 0.789704, 0.51948, 0.416676), NumbConc = c(588.069,
984.018, 964.634, 997.678, 1013.52, 924.271, 916.357), VolConc = c(582.887,
979.685, 963.3, 994.178, 1009.52, 922.104, 916.856), Conc = c(281.665,
486.176, 420.058, 422.101, 429.841, 346.539, 330.282)), class =
"data.frame", row.names = c(NA, -7L))

Related

Add a column in dataframe by datetime conditions of another dataframe

I have the following data example.
first data:
structure(list(cycle_rounded = structure(c(1604188800, 1604190600,
1604192400, 1604194200, 1604196000, 1604197800, 1604199600, 1604201400,
1604203200, 1604205000, 1604206800, 1604208600, 1604210400, 1604212200,
1604214000, 1604215800, 1604217600, 1604219400, 1604221200, 1604223000,
1604224800, 1604226600, 1604228400, 1604230200, 1604232000, 1604233800,
1604235600, 1604237400, 1604239200, 1604241000, 1604242800, 1604244600,
1604246400, 1604248200, 1604250000, 1604251800, 1604253600, 1604255400,
1604257200, 1604259000, 1604260800, 1604262600, 1604264400, 1604266200,
1604268000, 1604269800, 1604271600, 1604273400, 1604275200, 1604277000,
1604278800, 1604280600, 1604282400, 1604284200, 1604286000, 1604287800,
1604289600, 1604291400, 1604293200, 1604295000, 1604296800, 1604298600,
1604300400, 1604302200, 1604304000, 1604305800, 1604307600, 1604309400,
1604311200, 1604313000, 1604314800, 1604316600, 1604318400, 1604320200,
1604322000, 1604323800, 1604325600, 1604327400, 1604329200, 1604331000,
1604332800, 1604334600, 1604336400, 1604338200, 1604340000, 1604341800,
1604343600, 1604345400, 1604347200, 1604349000, 1604350800, 1604352600,
1604354400, 1604356200, 1604358000, 1604359800, 1604361600, 1604363400,
1604365200, 1604367000), tzone = "UTC", class = c("POSIXct",
"POSIXt"))), row.names = c(NA, -100L), class = c("tbl_df", "tbl",
"data.frame"))
second data:
structure(list(data_inicio_dia = structure(c(1604206740, 1604293080,
1604379480), tzone = "UTC", class = c("POSIXct", "POSIXt")),
data_fim_dia = structure(c(1604252160, 1604338560, 1604424960
), tzone = "UTC", class = c("POSIXct", "POSIXt"))), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame"))
I would to add a column daynight in the first data, where the categories will day and night. To be day the value in cycle_rounded column has to be between the date_start_day and date_end_day present in the second data otherwise it will be night.
Thanks
library(dplyr)
left_join(
df1 %>% mutate(date = lubridate::as_date(cycle_rounded)),
df2 %>% mutate(date = lubridate::as_date(data_inicio_dia))) %>%
mutate(daynight = if_else(
cycle_rounded %>% between(data_inicio_dia, data_fim_dia), "day", "night"))
This should work. Although, there might be a more elegant way.
tab1 %>%
dplyr::group_by(cycle_rounded) %>%
dplyr::summarise(
daynight = case_when(
cycle_rounded >= (tab2 %>% dplyr::filter(lubridate::as_date(tab2$data_inicio_dia) == lubridate::as_date(cycle_rounded)) %>% dplyr::pull(data_inicio_dia)) &
cycle_rounded <= (tab2 %>% dplyr::filter(lubridate::as_date(tab2$data_inicio_dia) == lubridate::as_date(cycle_rounded)) %>% dplyr::pull(data_fim_dia))
~ "Day",
TRUE ~ "Night"
)
)

Populate a new column in one table based on start and end dates in another table

I have a larger data table (called raw.data) and a smaller one (called balldrop.times) listing the start and end times of an event.
I want to create a new column in the larger data table that will fill up the times between the event start and end date that are located in the smaller table. The times that aren't between the event start/end time can be labeled something else, it doesn't really matter.
#the dput of the smaller table
> dput(balldrop.times)
structure(list(Stage = 6:14,
BallStart = structure(c(1635837081, 1635847841, 1635856675, 1635866152, 1635878326, 1635886132, 1635895547, 1635902934, 1635911136), tzone = "", class = c("POSIXct", "POSIXt")),
BallEnd = structure(c(1635837364, 1635848243, 1635857005, 1635866475, 1635878704, 1635886465, 1635895905, 1635903786, 1635911457), tzone = "", class = c("POSIXct", "POSIXt"))),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -9L))
#here is part of the larger table just in case
> dput(head(raw.data, 5))
structure(list(DateTime = structure(c(1635825603.6576, 1635825604.608, 1635825605.6448, 1635825606.6816, 1635825607.632), class = c("POSIXct", "POSIXt"), tzone = "GMT"),
Press.Well = c(1154.2561461, 1154.0308849, 1149.7247783, 1152.0544566, 1155.7363779),
row.names = c(NA, -5L),
class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x0000020725b51ef0>)
My desired output is something like the following, with "Event Active" only for the times between the listed DateTime vales in the balldrop.times table:
DateTime
Press.Well
Event Status
2021-11-02 02:11:20
10
Event Not Active
2021-11-02 02:11:21
10
Event Active
2021-11-02 02:11:22
15
Event Active
...
...
...
2021-11-02 02:16:04
25
Event Active
2021-11-02 02:16:05
30
Event Not Active
I am thinking I can use mutate() to create a new column in the raw.data table and set conditions for the DateTime, but I am not sure how to do this for multiple separate start/end DateTimes.
Any help would be appericated. Thank you.
Your code isn't working. Neither do the times in your example table correspond with the ones in your expected output.
tmp <- structure(list(Stage = 6:14,
BallStart = structure(c(1635837081, 1635847841, 1635856675, 1635866152, 1635878326, 1635886132, 1635895547, 1635902934, 1635911136), tzone = "", class = c("POSIXct", "POSIXt")),
BallEnd = structure(c(1635837364, 1635848243, 1635857005, 1635866475, 1635878704, 1635886465, 1635895905, 1635903786, 1635911457), tzone = "", class = c("POSIXct", "POSIXt"))
),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -9L))
tmp1 <- structure(list(DateTime = structure(c(1635825603.6576, 1635825604.608, 1635825605.6448, 1635825606.6816, 1635825607.632), class = c("POSIXct", "POSIXt"), tzone = "GMT"),
Press.Well = c(1154.2561461, 1154.0308849, 1149.7247783, 1152.0544566, 1155.7363779) ), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -5L))
So note this isn't a clean solution.
tmp1 %>%
mutate(`Event Status` = case_when(
DateTime >= (tmp[1,] %>% pull(BallStart)) & DateTime <= (tmp[1,] %>% pull(BallEnd)) ~ "Event Active",
DateTime >= (tmp[2,] %>% pull(BallStart)) & DateTime <= (tmp[2,] %>% pull(BallEnd)) ~ "Event Active",
DateTime >= (tmp[3,] %>% pull(BallStart)) & DateTime <= (tmp[3,] %>% pull(BallEnd)) ~ "Event Active",
DateTime >= (tmp[4,] %>% pull(BallStart)) & DateTime <= (tmp[4,] %>% pull(BallEnd)) ~ "Event Active",
DateTime >= (tmp[5,] %>% pull(BallStart)) & DateTime <= (tmp[5,] %>% pull(BallEnd)) ~ "Event Active",
TRUE ~ "Event Not Active"
))
Because you want to compare multiple conditions, case_when is the preferred option rather than ifelse. With that I compare it to every row in your reference table.
Now, like said it isn't a clean solution as you have many rows to specify it. With a bigger reference table to check the code will increase exponentionally. But you can clean it up into a function.

How can I transform data from tidy to a unique format?

I have a dataset in which there are dates describing a time period of interest, as well as events ("Tests" in my toy example) that can fall inside or outside the period of the interest. The events also have a time and some dichotomous characteristics.
My collaborator has asked me to transform the data from this format:
structure(list(ID = c(1, 1, 2, 3), StartDate = structure(c(315878400,
315878400, 357696000, 323481600), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), EndDate = structure(c(316137600, 316310400,
357955200, 323654400), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
TestDateTime = structure(c(316135500, 315797700, 357923700,
323422560), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
TestName = c("Test1", "Test2", "Test1", "Test3"), Characteristic = c("Fast",
"Slow", "Fast", "Slow")), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
current state
to this format:
desired state
I am unsure how to accomplish this transformation or set of transformations using R, but I believe it is possible.
try the following
library(dplyr)
data %>%
select(-c(StartDate,EndDate)) %>% # Remove extra columns
tidyr::spread(TestDate, TestTime) %>% # Spread df to long form
select(-Characteristic, everything()) %>% # Move Characteristic to the end of the df
group_by(ID) %>% # Group by ID and
group_split() # split it
Take on count that the date columns of the final df are not exact as the "desire" state.
Hope this can help you.

Find value of a row by comparing two columns and a value with a range of a different dataset

I have 2 different datasets. One with an object that comes from a StationX and goes to StationY and arrives at a specific date and time as the following.
df1<-structure(list(From = c("Station1", "Station5", "Station6", "Station10"), To = c("Station15", "Station2", "Station2", "Station7"),
Arrival = structure(c(971169720, 971172720, 971178120, 971179620), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA, -4L),class = c("tbl_df","tbl", "data.frame"))
In the Dataset2 are e.g. trucks which wait for the specific object at StationY between the time&date "Arrival" and "Departure" and leave at "Departure to a specifc region "TOID".
As in the following:
df2<-structure(list(TOID = c(2, 4, 7, 20), Station = c("Station15",
"Station2", "Station2","Station7"), Arrival = structure(c(971169600, 971172000, 971177700, 971179500), class = c("POSIXct", "POSIXt"), tzone = "UTC"), Departure1 = structure(c(971170200, 971173200, 971178600, 971179800), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"))
I want to look for the TOID in Dataset2 and add it to Dataset1 if "TO"(Dataset1)="Station"(Dataset2) and "Arrival"(Dataset2)<="Arrival"(Dataset1)<="Departure"(Dataset2) and has therefore the following outcome:
df1outcome<-structure(list(From = c("Station1", "Station5", "Station6", "Station10"
), To = c("Station15", "Station2", "Station2", "Station7"), `TO_ID` = c(2, 4, 7, 20), Arrival = structure(c(971169720, 971172720, 971178120, 971179620), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"))
I need a solution which looks in dataset2 for the ID that matches the conditions regardless the roworder.
Would be awesome if you guys could help me how to code this in R.
Best,
J
Perhaps you could use tidyverse, use a left_join based on the station, and then filter based on dates:
library(tidyverse)
df1 %>%
left_join(df2, by = c("To" = "Station"), suffix = c("1","2")) %>%
filter(Arrival1 >= Arrival2 & Arrival1 <= Departure1) %>%
select(-c(Arrival2, Departure1))
# A tibble: 4 x 4
From To Arrival1 TOID
<chr> <chr> <dttm> <dbl>
1 Station1 Station15 2000-10-10 09:22:00 2
2 Station5 Station2 2000-10-10 10:12:00 4
3 Station6 Station2 2000-10-10 11:42:00 7
4 Station10 Station7 2000-10-10 12:07:00 20
Im pretty new to R, so this code is probably longer then it should be. But does this work?
#renaming variables so its easier to merge the objects and to compare them
df1 <- df1 %>% rename(Arrival_Package = Arrival)
df2 <- df2 %>% rename(Arrival_Truck = Arrival)
#merge objects
df1outcome <- merge(df1, df2, by.x = "To", by.y = "Station")
#subset from object and select relevant columns
df1outcome <- subset(df1outcome, Arrival_Package <= Departure1)
df1outcome <- subset(df1outcome, Arrival_Truck <= Arrival_Package)
df1outcome <- df1outcome %>% select(From, To, TOID, Arrival_Package)

Histograms or Density plots of time of day

I have the data.frame in which every row is an episode with a start and an end timestamp.
test.DF<-dput(head(test.DF, n=50))
structure(list(start = structure(c(1189494920, 1189495400, 1189496120,
1189496840, 1189497440, 1189498040, 1189498640, 1189501760, 1189503560,
1190453600, 1247458520, 1247480840, 1247482880, 1247483840, 1247485040,
1247486600, 1247487320, 1247488040, 1247488760, 1247490920, 1247491280,
1247492480, 1247493680, 1247502440, 1247503160, 1247503520, 1247548040,
1247549360, 1247550680, 1247552600, 1247553920, 1247557400, 1247558000,
1247558480, 1247559440, 1247560400, 1247563760, 1247564960, 1247566640,
1247567120, 1194935549, 1194936029, 1195722629, 1195724309, 1199691029,
1199692349, 1202560229, 1208063669, 1208322989, 1188188112), class = c("POSIXct",
"POSIXt"), tzone = ""), end = structure(c(1189495280, 1189495520,
1189496360, 1189497080, 1189497560, 1189498160, 1189498760, 1189501880,
1189503920, 1190453720, 1247458640, 1247480960, 1247483480, 1247484080,
1247485640, 1247486840, 1247487560, 1247488640, 1247490440, 1247491160,
1247491520, 1247492600, 1247493920, 1247502680, 1247503400, 1247504120,
1247549240, 1247550560, 1247551280, 1247552720, 1247554400, 1247557880,
1247558240, 1247559080, 1247559560, 1247560760, 1247563880, 1247565080,
1247566760, 1247567240, 1194935669, 1194936269, 1195722749, 1195724429,
1199691269, 1199692469, 1202560349, 1208063789, 1208323109, 1188204792
), class = c("POSIXct", "POSIXt"), tzone = "")), .Names = c("start",
"end"), row.names = c(NA, 50L), class = "data.frame")
I would like to see the distribution of these episodes within a 24 hour cycle. That is either a histogram or a density plot, with the 24H day cycle in the x axis. Is this possible? I would like to ignore the dates of the episodes.
By converting to a POSIXltformat, you can easily extract the hour of the time:
par(mar=c(6,4,1,1))
Hour <- as.POSIXlt(test.DF$start)$hour
hist(Hour, breaks=seq(0, 23), main="Start time (hour)")
Edit: Adding a value for ever minute between start and end
fun <- function(start.time, end.time){
seq.POSIXt(
as.POSIXlt(
paste0("2000-01-01 ", as.POSIXlt(start.time)$hour, ":", as.POSIXlt(start.time)$min)
),
as.POSIXlt(
paste0("2000-01-01 ", as.POSIXlt(end.time)$hour, ":", as.POSIXlt(end.time)$min)
),
by="min"
)
}
HM <- vector(mode="list", dim(test.DF)[1])
for(i in seq(HM)){
HM[[i]] <- fun(test.DF$start[i], test.DF$end[i])
}
HM2 <- as.POSIXlt(unlist(HM), origin="1970-01-01")
Hour <- HM2$hour
hist(Hour, breaks=seq(0, 23))
HourMinute <- HM2$hour + HM2$min/60
hist(HourMinute, breaks=seq(0, 23, by=1/60))

Resources