I have the following data example.
first data:
structure(list(cycle_rounded = structure(c(1604188800, 1604190600,
1604192400, 1604194200, 1604196000, 1604197800, 1604199600, 1604201400,
1604203200, 1604205000, 1604206800, 1604208600, 1604210400, 1604212200,
1604214000, 1604215800, 1604217600, 1604219400, 1604221200, 1604223000,
1604224800, 1604226600, 1604228400, 1604230200, 1604232000, 1604233800,
1604235600, 1604237400, 1604239200, 1604241000, 1604242800, 1604244600,
1604246400, 1604248200, 1604250000, 1604251800, 1604253600, 1604255400,
1604257200, 1604259000, 1604260800, 1604262600, 1604264400, 1604266200,
1604268000, 1604269800, 1604271600, 1604273400, 1604275200, 1604277000,
1604278800, 1604280600, 1604282400, 1604284200, 1604286000, 1604287800,
1604289600, 1604291400, 1604293200, 1604295000, 1604296800, 1604298600,
1604300400, 1604302200, 1604304000, 1604305800, 1604307600, 1604309400,
1604311200, 1604313000, 1604314800, 1604316600, 1604318400, 1604320200,
1604322000, 1604323800, 1604325600, 1604327400, 1604329200, 1604331000,
1604332800, 1604334600, 1604336400, 1604338200, 1604340000, 1604341800,
1604343600, 1604345400, 1604347200, 1604349000, 1604350800, 1604352600,
1604354400, 1604356200, 1604358000, 1604359800, 1604361600, 1604363400,
1604365200, 1604367000), tzone = "UTC", class = c("POSIXct",
"POSIXt"))), row.names = c(NA, -100L), class = c("tbl_df", "tbl",
"data.frame"))
second data:
structure(list(data_inicio_dia = structure(c(1604206740, 1604293080,
1604379480), tzone = "UTC", class = c("POSIXct", "POSIXt")),
data_fim_dia = structure(c(1604252160, 1604338560, 1604424960
), tzone = "UTC", class = c("POSIXct", "POSIXt"))), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame"))
I would to add a column daynight in the first data, where the categories will day and night. To be day the value in cycle_rounded column has to be between the date_start_day and date_end_day present in the second data otherwise it will be night.
Thanks
library(dplyr)
left_join(
df1 %>% mutate(date = lubridate::as_date(cycle_rounded)),
df2 %>% mutate(date = lubridate::as_date(data_inicio_dia))) %>%
mutate(daynight = if_else(
cycle_rounded %>% between(data_inicio_dia, data_fim_dia), "day", "night"))
This should work. Although, there might be a more elegant way.
tab1 %>%
dplyr::group_by(cycle_rounded) %>%
dplyr::summarise(
daynight = case_when(
cycle_rounded >= (tab2 %>% dplyr::filter(lubridate::as_date(tab2$data_inicio_dia) == lubridate::as_date(cycle_rounded)) %>% dplyr::pull(data_inicio_dia)) &
cycle_rounded <= (tab2 %>% dplyr::filter(lubridate::as_date(tab2$data_inicio_dia) == lubridate::as_date(cycle_rounded)) %>% dplyr::pull(data_fim_dia))
~ "Day",
TRUE ~ "Night"
)
)
Related
I have a larger data table (called raw.data) and a smaller one (called balldrop.times) listing the start and end times of an event.
I want to create a new column in the larger data table that will fill up the times between the event start and end date that are located in the smaller table. The times that aren't between the event start/end time can be labeled something else, it doesn't really matter.
#the dput of the smaller table
> dput(balldrop.times)
structure(list(Stage = 6:14,
BallStart = structure(c(1635837081, 1635847841, 1635856675, 1635866152, 1635878326, 1635886132, 1635895547, 1635902934, 1635911136), tzone = "", class = c("POSIXct", "POSIXt")),
BallEnd = structure(c(1635837364, 1635848243, 1635857005, 1635866475, 1635878704, 1635886465, 1635895905, 1635903786, 1635911457), tzone = "", class = c("POSIXct", "POSIXt"))),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -9L))
#here is part of the larger table just in case
> dput(head(raw.data, 5))
structure(list(DateTime = structure(c(1635825603.6576, 1635825604.608, 1635825605.6448, 1635825606.6816, 1635825607.632), class = c("POSIXct", "POSIXt"), tzone = "GMT"),
Press.Well = c(1154.2561461, 1154.0308849, 1149.7247783, 1152.0544566, 1155.7363779),
row.names = c(NA, -5L),
class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x0000020725b51ef0>)
My desired output is something like the following, with "Event Active" only for the times between the listed DateTime vales in the balldrop.times table:
DateTime
Press.Well
Event Status
2021-11-02 02:11:20
10
Event Not Active
2021-11-02 02:11:21
10
Event Active
2021-11-02 02:11:22
15
Event Active
...
...
...
2021-11-02 02:16:04
25
Event Active
2021-11-02 02:16:05
30
Event Not Active
I am thinking I can use mutate() to create a new column in the raw.data table and set conditions for the DateTime, but I am not sure how to do this for multiple separate start/end DateTimes.
Any help would be appericated. Thank you.
Your code isn't working. Neither do the times in your example table correspond with the ones in your expected output.
tmp <- structure(list(Stage = 6:14,
BallStart = structure(c(1635837081, 1635847841, 1635856675, 1635866152, 1635878326, 1635886132, 1635895547, 1635902934, 1635911136), tzone = "", class = c("POSIXct", "POSIXt")),
BallEnd = structure(c(1635837364, 1635848243, 1635857005, 1635866475, 1635878704, 1635886465, 1635895905, 1635903786, 1635911457), tzone = "", class = c("POSIXct", "POSIXt"))
),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -9L))
tmp1 <- structure(list(DateTime = structure(c(1635825603.6576, 1635825604.608, 1635825605.6448, 1635825606.6816, 1635825607.632), class = c("POSIXct", "POSIXt"), tzone = "GMT"),
Press.Well = c(1154.2561461, 1154.0308849, 1149.7247783, 1152.0544566, 1155.7363779) ), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -5L))
So note this isn't a clean solution.
tmp1 %>%
mutate(`Event Status` = case_when(
DateTime >= (tmp[1,] %>% pull(BallStart)) & DateTime <= (tmp[1,] %>% pull(BallEnd)) ~ "Event Active",
DateTime >= (tmp[2,] %>% pull(BallStart)) & DateTime <= (tmp[2,] %>% pull(BallEnd)) ~ "Event Active",
DateTime >= (tmp[3,] %>% pull(BallStart)) & DateTime <= (tmp[3,] %>% pull(BallEnd)) ~ "Event Active",
DateTime >= (tmp[4,] %>% pull(BallStart)) & DateTime <= (tmp[4,] %>% pull(BallEnd)) ~ "Event Active",
DateTime >= (tmp[5,] %>% pull(BallStart)) & DateTime <= (tmp[5,] %>% pull(BallEnd)) ~ "Event Active",
TRUE ~ "Event Not Active"
))
Because you want to compare multiple conditions, case_when is the preferred option rather than ifelse. With that I compare it to every row in your reference table.
Now, like said it isn't a clean solution as you have many rows to specify it. With a bigger reference table to check the code will increase exponentionally. But you can clean it up into a function.
I have a dataset in which there are dates describing a time period of interest, as well as events ("Tests" in my toy example) that can fall inside or outside the period of the interest. The events also have a time and some dichotomous characteristics.
My collaborator has asked me to transform the data from this format:
structure(list(ID = c(1, 1, 2, 3), StartDate = structure(c(315878400,
315878400, 357696000, 323481600), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), EndDate = structure(c(316137600, 316310400,
357955200, 323654400), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
TestDateTime = structure(c(316135500, 315797700, 357923700,
323422560), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
TestName = c("Test1", "Test2", "Test1", "Test3"), Characteristic = c("Fast",
"Slow", "Fast", "Slow")), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
current state
to this format:
desired state
I am unsure how to accomplish this transformation or set of transformations using R, but I believe it is possible.
try the following
library(dplyr)
data %>%
select(-c(StartDate,EndDate)) %>% # Remove extra columns
tidyr::spread(TestDate, TestTime) %>% # Spread df to long form
select(-Characteristic, everything()) %>% # Move Characteristic to the end of the df
group_by(ID) %>% # Group by ID and
group_split() # split it
Take on count that the date columns of the final df are not exact as the "desire" state.
Hope this can help you.
I am trying to plot the time and NDVI for each region on the same plot. I think to do this I have to convert the date column from characters to time and then plot each layer. However I cannot figure out how to do this. Any thoughts?
list(structure(list(observation = 1L, HRpcode = NA_character_,
timeseries = NA_character_), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(observation = 1:6, time = c("2014-01-01",
"2014-02-01", "2014-03-01", "2014-04-01", "2014-05-01", "2014-06-01"
), ` NDVI` = c("0.3793765496776215", "0.21686891782421552", "0.3785652933528299",
"0.41027240624704164", "0.4035578030242673", "0.341299793064468"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
)), structure(list(observation = 1:6, time = c("2014-01-01",
"2014-02-01", "2014-03-01", "2014-04-01", "2014-05-01", "2014-06-01"
), ` NDVI` = c("0.4071076986818826", "0.09090719657570319", "0.35214166081795284",
"0.4444311032927228", "0.5220702877666005", "0.5732370503295022"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
)), structure(list(observation = 1:6, time = c("2014-01-01",
"2014-02-01", "2014-03-01", "2014-04-01", "2014-05-01", "2014-06-01"
), ` NDVI` = c("0.3412131556625801", "0.18815996897460135", "0.5218904976415136",
"0.6970128777711452", "0.7229657162729096", "0.535967435470161"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
)))
111
First we need to clean your data. The first element in this list is empty
df = df[-1]
Now we need to make a data.frame
df = do.call(rbind, df)
I am going to add a region variable, change the name of NDVI to remove the space,
change ndvi into a numeric vector, and change time into a Date object
library(dplyr)
df = df %>%
mutate(region = factor(rep(1:3, rep(6, 3)))) %>%
rename(ndvi = ' NDVI') %>%
mutate(ndvi = as.numeric(ndvi)) %>%
mutate(time = as.Date(time))
Now we can use ggplot2 to plot the data by region
library(ggplot2)
g = df %>%
ggplot(aes(x = time, y = ndvi, col = region)) +
geom_line()
g
Which gives this plot:
Here's an approach with lubridate to handle dates and dplyr to make the binding of the data.frames easier to understand.
Note that the group names are taken from the names of the list, and since those don't exist in the data you provided, we have to set them in advance.
library(lubridate)
library(ggplot2)
library(dplyr)
names(data) <- 1:3
data <- bind_rows(data, .id = "group")
data$time <- ymd(data$time)
setnames(data," NDVI","NDVI")
data$NDVI <- as.numeric(data$NDVI)
ggplot(data, aes(x=time,y=NDVI,color=Group)) + geom_line()
I have 2 different datasets. One with an object that comes from a StationX and goes to StationY and arrives at a specific date and time as the following.
df1<-structure(list(From = c("Station1", "Station5", "Station6", "Station10"), To = c("Station15", "Station2", "Station2", "Station7"),
Arrival = structure(c(971169720, 971172720, 971178120, 971179620), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA, -4L),class = c("tbl_df","tbl", "data.frame"))
In the Dataset2 are e.g. trucks which wait for the specific object at StationY between the time&date "Arrival" and "Departure" and leave at "Departure to a specifc region "TOID".
As in the following:
df2<-structure(list(TOID = c(2, 4, 7, 20), Station = c("Station15",
"Station2", "Station2","Station7"), Arrival = structure(c(971169600, 971172000, 971177700, 971179500), class = c("POSIXct", "POSIXt"), tzone = "UTC"), Departure1 = structure(c(971170200, 971173200, 971178600, 971179800), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"))
I want to look for the TOID in Dataset2 and add it to Dataset1 if "TO"(Dataset1)="Station"(Dataset2) and "Arrival"(Dataset2)<="Arrival"(Dataset1)<="Departure"(Dataset2) and has therefore the following outcome:
df1outcome<-structure(list(From = c("Station1", "Station5", "Station6", "Station10"
), To = c("Station15", "Station2", "Station2", "Station7"), `TO_ID` = c(2, 4, 7, 20), Arrival = structure(c(971169720, 971172720, 971178120, 971179620), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"))
I need a solution which looks in dataset2 for the ID that matches the conditions regardless the roworder.
Would be awesome if you guys could help me how to code this in R.
Best,
J
Perhaps you could use tidyverse, use a left_join based on the station, and then filter based on dates:
library(tidyverse)
df1 %>%
left_join(df2, by = c("To" = "Station"), suffix = c("1","2")) %>%
filter(Arrival1 >= Arrival2 & Arrival1 <= Departure1) %>%
select(-c(Arrival2, Departure1))
# A tibble: 4 x 4
From To Arrival1 TOID
<chr> <chr> <dttm> <dbl>
1 Station1 Station15 2000-10-10 09:22:00 2
2 Station5 Station2 2000-10-10 10:12:00 4
3 Station6 Station2 2000-10-10 11:42:00 7
4 Station10 Station7 2000-10-10 12:07:00 20
Im pretty new to R, so this code is probably longer then it should be. But does this work?
#renaming variables so its easier to merge the objects and to compare them
df1 <- df1 %>% rename(Arrival_Package = Arrival)
df2 <- df2 %>% rename(Arrival_Truck = Arrival)
#merge objects
df1outcome <- merge(df1, df2, by.x = "To", by.y = "Station")
#subset from object and select relevant columns
df1outcome <- subset(df1outcome, Arrival_Package <= Departure1)
df1outcome <- subset(df1outcome, Arrival_Truck <= Arrival_Package)
df1outcome <- df1outcome %>% select(From, To, TOID, Arrival_Package)
I am trying to extract average values of all variables between 0 to 40 minutes every hour.
dput(head(df))
structure(list(DateTime = structure(c(1563467460, 1563468060,
1563468660, 1563469260, 1563469860, 1563470460), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), date = structure(c(1563467460, 1563468060,
1563468660, 1563469260, 1563469860, 1563470460), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), Date = structure(c(18095, 18095, 18095,
18095, 18095, 18095), class = "Date"), TimeCtr = structure(c(1563467460,
1563468060, 1563468660, 1563469260, 1563469860, 1563470460), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), MassConc = c(0.397627, 0.539531, 0.571902,
0.608715, 0.670382, 0.835773), VolConc = c(175.038, 160.534,
174.386, 183.004, 191.074, 174.468), NumbConc = c(234.456, 326.186,
335.653, 348.996, 376.018, 488.279), MassD = c(101.426, 102.462,
101.645, 102.145, 101.255, 101.433)), .Names = c("DateTime",
"date", "Date", "TimeCtr", "MassConc", "VolConc", "NumbConc",
"MassD"), row.names = c(NA, 6L), class = "data.frame")
What I've tried so far..
hourly_mean<-mydata %>%
filter(between(as.numeric(format(DateTime, "%M")), 0, 40)) %>%
group_by(DateTime=format(DateTime, "%Y-%m-%d %H")) %>%
summarise(variable1_mean=mean(variable1))
But it gives me a single average value for the whole period. Any help is very much welcomed.
We can convert DateTime , use ceiling_date with hourly unit to round Datetime, extract minutes from DateTime and filter to keep minutes which are less than 40, group_by hour and take mean of values.
library(lubridate)
library(dplyr)
df %>%
dplyr::mutate(DateTime = ymd_hm(DateTime),
hour = ceiling_date(DateTime, "hour"),
minutes = minute(DateTime)) %>%
filter(minutes <= 40) %>%
group_by(hour) %>%
summarise_at(vars(ends_with("Conc")), mean)
data
df <- structure(list(DateTime = structure(1:7, .Label = c("2019-08-0810:07",
"2019-08-0810:17", "2019-08-0810:27", "2019-08-0810:37", "2019-08-0810:47",
"2019-08-0810:57", "2019-08-0811:07"), class = "factor"), MassConc = c(0.556398,
1.06868, 0.777654, 0.87289, 0.789704, 0.51948, 0.416676), NumbConc = c(588.069,
984.018, 964.634, 997.678, 1013.52, 924.271, 916.357), VolConc = c(582.887,
979.685, 963.3, 994.178, 1009.52, 922.104, 916.856), Conc = c(281.665,
486.176, 420.058, 422.101, 429.841, 346.539, 330.282)), class =
"data.frame", row.names = c(NA, -7L))