Daily minimum values in R - r

I am trying to extract the daily minimum zenith angle in a dataset which consists of 24h values (1 zenith angle value every hour) over ~31 days for 12 months. It looks like this:
JulianDay Azimuth Zenith Date (YYMMDD HH:MM:SS)
2455928 174.14066 70.04650 2012-01-01 13:00:00
2455928 188.80626 70.30747 2012-01-01 14:00:00
2455928 203.03458 73.12297 2012-01-01 15:00:00
2455928 216.28061 78.20131 2012-01-01 16:00:00
2455928 228.35929 85.10759 2012-01-01 17:00:00
....
2456293 146.33844 77.03456 2012-12-31 11:00:00
2456293 159.80472 72.38003 2012-12-31 12:00:00
Is there a function that can extract the maximum and minimum solar zenith angle from each day (i.e., 365 outputs)?

You can do a summary grouped by day, here is one way, suppose your data frame is called df:
library(data.table)
setDT(df)[, .(maxZenith = max(Zenith), minZenith = min(Zenith)), .(JulianDay)]
If you want to use the Date column instead of JulianDay, do something like:
setDT(df)[, .(maxZenith = max(Zenith), minZenith = min(Zenith)), .(as.Date(Date))]
Assuming you renamed your Date (YYMMDD HH:MM:SS) as Date. Just FYI, even though allowed, don't consider it as a good practice to contain space in the column name.

In base R:
my.data <- read.table(text = '
JulianDay Azimuth Zenith Date.YYMMDD Date.HHMMSS
2455928 174.14066 70.04650 2012-01-01 13:00:00
2455928 188.80626 70.30747 2012-01-01 14:00:00
2455928 203.03458 73.12297 2012-01-01 15:00:00
2455928 216.28061 78.20131 2012-01-01 16:00:00
2455928 228.35929 85.10759 2012-01-01 17:00:00
2455929 160.00000 70.04650 2012-01-02 13:00:00
2455929 188.80626 70.30747 2012-01-02 14:00:00
2455929 203.03458 73.12297 2012-01-02 15:00:00
2455929 216.28061 78.20131 2012-01-02 16:00:00
2455929 228.35929 85.10759 2012-01-02 17:00:00
', header = TRUE)
with(my.data, aggregate(Azimuth ~ JulianDay, FUN = function(x) c(Min = min(x), Max = max(x))))
One problem with aggregate is that the output is not is a form that is easy to use. It requires a bit of post processing:
my.min.max <- with(my.data, aggregate(my.data$Azimuth, by = list(my.data$JulianDay),
FUN = function(x) c(MIN = min(x), MAX = max(x)) ))
# to convert output of aggregate into a data frame:
my.min.max2 <- do.call(data.frame, my.min.max)
# combine output from aggregate with original data set
colnames(my.min.max2) <- c('JulianDay', 'my.min', 'my.max')
my.data2 <- merge(my.data, my.min.max2, by = 'JulianDay')
my.data2
# JulianDay Azimuth Zenith Date.YYMMDD Date.HHMMSS my.min my.max
#1 2455928 174.1407 70.04650 2012-01-01 13:00:00 174.1407 228.3593
#2 2455928 188.8063 70.30747 2012-01-01 14:00:00 174.1407 228.3593
#3 2455928 203.0346 73.12297 2012-01-01 15:00:00 174.1407 228.3593
#4 2455928 216.2806 78.20131 2012-01-01 16:00:00 174.1407 228.3593
#5 2455928 228.3593 85.10759 2012-01-01 17:00:00 174.1407 228.3593
#6 2455929 160.0000 70.04650 2012-01-02 13:00:00 160.0000 228.3593
#7 2455929 188.8063 70.30747 2012-01-02 14:00:00 160.0000 228.3593
#8 2455929 203.0346 73.12297 2012-01-02 15:00:00 160.0000 228.3593
#9 2455929 216.2806 78.20131 2012-01-02 16:00:00 160.0000 228.3593
#10 2455929 228.3593 85.10759 2012-01-02 17:00:00 160.0000 228.3593
You can use by also, but the output from by also requires a bit of post-processing:
by.min.max <- as.data.frame(do.call("rbind", by(my.data$Azimuth, my.data$JulianDay,
FUN = function(x) c(Min = min(x), Max = max(x)))))
by.min.max <- cbind(JulianDay = rownames(by.min.max), by.min.max)
my.data2 <- merge(my.data, by.min.max, by = 'JulianDay')
my.data2
You can also use tapply:
my.data$Date_Time <- as.POSIXct(paste(my.data$Date.YYMMDD, my.data$Date.HHMMSS),
format = "%Y-%d-%m %H:%M:%S")
ty.min.max <- as.data.frame(do.call("rbind", tapply(my.data$Azimuth, my.data$JulianDay,
FUN = function(x) c(Min = min(x), Max = max(x)))))
ty.min.max <- cbind(JulianDay = rownames(ty.min.max), ty.min.max)
my.data2 <- merge(my.data, ty.min.max, by = 'JulianDay')
my.data2
You can also use a combination of split and sapply:
sy.min.max <- t(sapply(split(my.data$Azimuth, my.data$JulianDay),
function(x) c(Min = min(x), Max = max(x)) ))
sy.min.max <- data.frame(JulianDay = rownames(sy.min.max), sy.min.max,
stringsAsFactors = FALSE)
my.data2 <- merge(my.data, sy.min.max, by = 'JulianDay')
my.data2
You can also use a combination of split and lapply:
ly.min.max <- lapply(split(my.data$Azimuth, my.data$JulianDay),
function(x) c(Min = min(x), Max = max(x)))
ly.min.max <- as.data.frame(do.call("rbind", ly.min.max))
ly.min.max <- cbind(JulianDay = rownames(ly.min.max), ly.min.max)
my.data2 <- merge(my.data, ly.min.max, by = 'JulianDay')
my.data2
You can also use ave, although I have not figured out how to use two functions in one ave statement:
my.min <- ave(my.data$Azimuth, my.data$JulianDay, FUN = min)
my.max <- ave(my.data$Azimuth, my.data$JulianDay, FUN = max)
my.data2 <- data.frame(my.data, my.min, my.max)
my.data2

With dplyr
library(dplyr)
df %>%
group_by(JulianDay) %>% #if you need `Date` class, use `as.Date(JulianDay)`
summarise(MaxZenith = max(Zenith), minZenith = min(Zenith))
where 'JulianDay' is the renamed column name for (YYMMDD HH:MM:SS)

Related

Split out time interval in time series in r

I have a dataset - time series
Data below:
Col 1(End):
2018.01.01 01:00:00
2018.01.01 02:00:00
2018.01.01 03:00:00
2018.01.01 04:00:00
2018.01.01 05:00:00
2018.01.01 06:00:00
2018.01.01 07:00:00
2018.01.01 08:00:00
2018.01.01 09:00:00
2018.01.01 10:00:00
2018.01.01 11:00:00
2018.01.02 01:00:00
2018.01.02 02:00:00
2018.01.02 03:00:00
2018.01.02 04:00:00
Col 2(Price-indexed)
55.09
44.02
44.0
33
43
43
33
33
I wish to select from the data the time of 11:00 every day
I have tried doing a sequence but with daylight saving in GMT it changes to 12 in October fro 2019 and 2020 which is not correct
datos_2019_2020<-read.csv("DayaheadPricesfull_2019_2020.csv")
#price variable changed to numeric
datos_2019_2020$Price_indexed=as.numeric(datos_2019_2020$Price)
time_index_2019_2020 <- seq(from = as.POSIXct("2019-01-01 00:00"), to = as.POSIXct("2020-12-31 23:00"), by = "hour",tz="GMT")
eventdata_2019_2020 <- as.xts(datos_2019_2020$Price_indexed, drop = FALSE,order.by = time_index_2019_2020)
df.new_2019_2020 = eventdata_2019_2020[seq(12, nrow(eventdata_2019_2020), 24), ]
Using the xts object x shown reproducibly in the Note at the end:
x[format(time(x), format = "%H:%M:%S") == "11:00:00"]
giving this xts object:
[,1]
2018-01-01 11:00:00 NA
Time zone problems are often specific to a particular installation but often the problem is between local time and GMT or due to the switch between standard and daylight savings time. In these cases it often easiest to just set the entire session to GMT making the local time GMT. In that case there will be no confusion between local and GMT since they are both GMT and GMT does not have daylight savings time.
Sys.setenv(TZ = 'GMT')
Note
Lines1 <- "
2018.01.01 01:00:00
2018.01.01 02:00:00
2018.01.01 03:00:00
2018.01.01 04:00:00
2018.01.01 05:00:00
2018.01.01 06:00:00
2018.01.01 07:00:00
2018.01.01 08:00:00
2018.01.01 09:00:00
2018.01.01 10:00:00
2018.01.01 11:00:00
2018.01.02 01:00:00
2018.01.02 02:00:00
2018.01.02 03:00:00
2018.01.02 04:00:00"
Lines2 <- "
55.09
44.02
44.0
33
43
43
33
33"
library(xts)
col1 <- read.table(text = Lines1, sep = ",")
col2 <- read.table(text = Lines2)
# merge col1 and col2 using NA's to fill in
m <- merge(col1, col2, by = 0, all.x = TRUE)
z <- read.zoo(m[-1], tz = "", format = "%Y.%m.%d %H:%M:%S")
x <- as.xts(z)

I want to assign "day" and"night" variables based on maximum duration inside and outside "08:00:00-20:00:00"

I'm trying to add a new variable in a DateTime database, I can assign "day" and "night" when it doesn't intercept "08:00:00"/"20:00:00" but when it intercepts these two timepoints I want to assign "day" or "night" based the maximum time spent inside 08:00-20:00 (day) or outside 20:00-08:00 (night).
#Current input
pacman::p_load(pacman,lubridate,chron)
id<-c("m1","m1","m1","m2","m2","m2","m3","m4","m4")
x<-c("1998-01-03 10:00:00","1998-01-03 16:00:00","1998-01-03 19:20:00","1998-01-04 00:50:00","1998-01-06 11:20:00","1998-01-06 20:50:00","1998-01-06 22:00:00","1998-01-07 06:30:00","1998-01-07 07:50:00")
start<-as.POSIXct(x,"%Y-%m-%d %H:%M:%S",tz="UTC")
y<-c("1998-01-03 16:00:00","1998-01-03 19:20:00","1998-01-04 00:50:00","1998-01-06 11:20:00","1998-01-06 20:50:00","1998-01-06 22:00:00","1998-01-07 07:40:00","1998-01-07 07:50:00","1998-01-07 08:55:00")
end<-as.POSIXct(y,"%Y-%m-%d %H:%M:%S",tz="UTC")
mydata<-data.frame(id,start,end)
#Current output
df1 <- mydata %>%
mutate(start1 = as.POSIXct(sub("\\d+-\\d+-\\d+", Sys.Date(), start)),
end1 = as.POSIXct(sub("\\d+-\\d+-\\d+", Sys.Date(), end)),
day.night = case_when(start1 >= as.POSIXct('08:00:00', format = "%T") &
end1 >= as.POSIXct('08:00:00', format = "%T") &
end1 < as.POSIXct('20:00:00', format = "%T") ~ "day",
start1 >= as.POSIXct('20:00:00', format = "%T") &
(start1 < as.POSIXct('08:00:00', format = "%T") | end1 < as.POSIXct('23:00:00', format = "%T"))|
(start1 < as.POSIXct('08:00:00', format = "%T") & end1 < as.POSIXct('08:00:00', format = "%T")) ~ "night",
difftime(as.POSIXct('20:00:00', format = "%T"), start1) > difftime(end1, as.POSIXct('20:00:00', format = "%T")) ~ "day",
difftime(as.POSIXct('20:00:00', format = "%T"), start1) < difftime(end1, as.POSIXct('20:00:00', format = "%T")) ~ "night",
TRUE ~ "mixed"))
The current output is misassigning any periods that intercept 08:00-20:00
i.e. row 3 should = "night" because 4hrs50mins are "night" and 40 mins are "day"
row 4 should = "night" because 31hrs50mins are "night" and 28hrs20mins are "day"
#Current table
id start end start1 end1 day.night
1 m1 1998-01-03 10:00:00 1998-01-03 16:00:00 2019-09-03 10:00:00 2019-09-03 16:00:00 day
2 m1 1998-01-03 16:00:00 1998-01-03 19:20:00 2019-09-03 16:00:00 2019-09-03 19:20:00 day
3 m1 1998-01-03 19:20:00 1998-01-04 00:50:00 2019-09-03 19:20:00 2019-09-03 00:50:00 day
4 m2 1998-01-04 00:50:00 1998-01-06 11:20:00 2019-09-03 00:50:00 2019-09-03 11:20:00 day
5 m2 1998-01-06 11:20:00 1998-01-06 20:50:00 2019-09-03 11:20:00 2019-09-03 20:50:00 day
6 m2 1998-01-06 20:50:00 1998-01-06 22:00:00 2019-09-03 20:50:00 2019-09-03 22:00:00 night
7 m3 1998-01-06 22:00:00 1998-01-07 07:40:00 2019-09-03 22:00:00 2019-09-03 07:40:00 night
8 m4 1998-01-07 06:30:00 1998-01-07 07:50:00 2019-09-03 06:30:00 2019-09-03 07:50:00 night
9 m4 1998-01-07 07:50:00 1998-01-07 08:55:00 2019-09-03 07:50:00 2019-09-03 08:55:00 day
library(dplyr)
library(lubridate)
library(chron)
id<-c("m1","m1","m1","m2","m2","m2","m3","m4","m4")
x<-c("1998-01-03 10:00:00","1998-01-03 16:00:00","1998-01-03 19:20:00","1998-01-04 00:50:00","1998-01-06 11:20:00","1998-01-06 20:50:00","1998-01-06 22:00:00","1998-01-07 06:30:00","1998-01-07 07:50:00")
start<-as.POSIXct(x,"%Y-%m-%d %H:%M:%S",tz="UTC")
y<-c("1998-01-03 16:00:00","1998-01-03 19:20:00","1998-01-04 00:50:00","1998-01-06 11:20:00","1998-01-06 20:50:00","1998-01-06 22:00:00","1998-01-07 07:40:00","1998-01-07 07:50:00","1998-01-07 08:55:00")
end<-as.POSIXct(y,"%Y-%m-%d %H:%M:%S",tz="UTC")
mydata<-data.frame(id,start,end)
#Current output
df1 <- mydata %>%
mutate(i = interval(start, end),
total_interval_length = time_length(i, unit = "hour")) %>%
# Calculate daytime hours on first and last days
mutate(first_day = floor_date(start, unit = "day"),
last_day = floor_date(end, unit = "day")) %>%
mutate(first_day_daytime =
interval(update(first_day, hour = 8), update(first_day, hour = 20)),
last_day_daytime =
interval(update(last_day, hour = 8), update(last_day, hour = 20))) %>%
mutate(first_day_overlap =
coalesce(as.numeric(as.duration(intersect(first_day_daytime, i)), "hour"),0),
last_day_overlap =
coalesce(as.numeric(as.duration(intersect(last_day_daytime, i)), "hour"),0)
) %>%
# Calculate total daytime hours
# For rows of one date only, that is just first_day_overlap (or last_day_overlap since it's the same day)
# For rows in multiple dates, it's the first_day_overlap plus last_day_overlap plus 12 hours for each day in between
mutate(daytime_length =
ifelse(first_day == last_day,
first_day_overlap,
first_day_overlap + last_day_overlap +
12*(as.numeric(as.duration(interval(first_day, last_day)), "day")-1))
) %>%
# Assign day or night classification
mutate(day_night = ifelse(daytime_length >= total_interval_length - daytime_length, "day", "night"))

Using xts with timespans crossing calendar dates: How to use period.apply (xts) or POSIXct datetime arguments in these cases in R?

I have a problem applying a function (min) to a specific repeating time-period. Basically my data looks like in that sample:
library(xts)
start <- as.POSIXct("2018-05-18 00:00")
tseq <- seq(from = start, length.out = 1440, by = "10 mins")
Measurings <- data.frame(
Time = tseq,
Temp = sample(10:37,1440, replace = TRUE, set.seed(seed = 10)))
)
Measurings_xts <- xts(Measurings[,-1], Measurings$Time)
with much appreciated help (here), I managed to find out that min and max functions (contrary to mean, which works right away in period.apply) must be defined by a helper function and can then be calculated for logical datetime arguments(hours, days, years...) by using this solution:
colMin <- function(x, na.rm = FALSE) {
apply(x, 2, min, na.rm = na.rm)
}
epHours <- endpoints(Measurings_xts, "hours")
Measurings_min <- period.apply(Measurings_xts, epHours, colMin)
For meteorological analyses I need to calculate further minima for a less intuitive timespan, crossing the calendar day, that I fail to define in code:
I need to output the minimum nighttime temperature from e.g. 2018-05-18 19:00 to 2018-05-19 7:00 in the morning for each night in my dataset.
I have tried to move the timespan by manipulating(moving) the time column up or down, to include the nighttime in one calendar day. Since this solution is error-prone and doesn´t work for my real data, where some observations are missing. How do I use the POSIXct datetime and/or xts functionalities to calculate minima in this case?
You could solve this by creating your own "end points" when you use period.apply
# Choose the appropriate time ranges
z <- Measurings_xts["T19:00/T07:00"]
# Creating your own "endpoints":
epNights <- which(diff.xts(index(z), units = "mins") > 10) - 1
Subtract one off each index because the jumps are recorded at the start of the next "night interval" in the output from which().
Then add the last data point in the data set to your end points vector, and you can then use this in period.apply
epNights <- c(epNights, nrow(z))
Measurings_min <- period.apply(z, epNights, colMin)
Measurings_min
# [,1]
# 2018-05-18 07:00:00 10
# 2018-05-19 07:00:00 10
# 2018-05-20 07:00:00 10
# 2018-05-21 07:00:00 10
# 2018-05-22 07:00:00 10
# 2018-05-23 07:00:00 10
# 2018-05-24 07:00:00 11
# 2018-05-25 07:00:00 10
# 2018-05-26 07:00:00 10
# 2018-05-27 07:00:00 10
# 2018-05-27 23:50:00 12
here is one approach that works by defining a new group for each night interval
# define the time interval, e.g. from 19:00 to 7:00
from <- 19
to <- 7
hours <- as.numeric(strftime(index(Measurings_xts), format="%H"))
y <- rle(as.numeric(findInterval(hours, c(to,from)) != 1))
y$values[c(TRUE, FALSE)] <- cumsum(y$values[c(TRUE, FALSE)])
grp <- inverse.rle(y)
# grp is a grouping variable that is 0 for everything outside the
# defined interval , 1 for the first night, 2 for the second...
s <- split(Measurings_xts, grp); s$`0` <- NULL
# min_value will contain the minimum value for each night interval
min_value <- sapply(s, min)
# to see the date interval for each value
start <- sapply(s, function(x) as.character(index(x)[1]))
end <- sapply(s, function(x) as.character(index(x)[length(x)]))
data.frame(start, end, min_value)
# start end min_value
#1 2018-05-18 2018-05-18 06:50:00 10
#2 2018-05-18 19:00:00 2018-05-19 06:50:00 10
#3 2018-05-19 19:00:00 2018-05-20 06:50:00 10
#4 2018-05-20 19:00:00 2018-05-21 06:50:00 10
#5 2018-05-21 19:00:00 2018-05-22 06:50:00 10
#6 2018-05-22 19:00:00 2018-05-23 06:50:00 10
#7 2018-05-23 19:00:00 2018-05-24 06:50:00 11
#8 2018-05-24 19:00:00 2018-05-25 06:50:00 10
#9 2018-05-25 19:00:00 2018-05-26 06:50:00 10
#10 2018-05-26 19:00:00 2018-05-27 06:50:00 10
#11 2018-05-27 19:00:00 2018-05-27 23:50:00 12

In R, how do I create a time histogram of intervals defined by a start and stop time for each entry?

I have a dataframe in which each row is the working hours of an employee defined by a start and a stop time:
DF < - EmployeeNum Start_datetime End_datetime
123 2012-02-01 07:30:00 2012-02-01 17:45:00
342 2012-02-01 08:00:00 2012-02-01 17:45:00
876 2012-02-01 10:45:00 2012-02-01 18:45:00
I'd like to find the number of employees working during each hour on each day in a timespan:
Date Hour NumberofEmployeesWorking
2012-02-01 00:00 ? (number of employees working between 00:00 and 00:59)
2012-02-01 01:00 ?
2012-02-01 02:00 ?
2012-02-01 03:00 ?
2012-02-01 04:00 ?
2012-02-01 05:00 ?
2012-02-01 06:00 ?
How do I put my working hours into bins like this?
Your data, in a more consumable format, plus one row to span midnight (for example). I changed the format to include a "T" here, to make consumption easier, otherwise the middle space makes it less trivial to do it with read.table(text='...'). (You can skip this since you already have your real data.)
x <- read.table(text='EmployeeNum Start_datetime End_datetime
123 2012-02-01T07:30:00 2012-02-01T17:45:00
342 2012-02-01T08:00:00 2012-02-01T17:45:00
876 2012-02-01T10:45:00 2012-02-01T18:45:00
877 2012-02-01T22:45:00 2012-02-02T05:45:00',
header=TRUE, stringsAsFactors=FALSE)
In case you haven't done it with your own data, convert all times to POSIXt, otherwise skip this, too.
x[c('Start_datetime','End_datetime')] <- lapply(x[c('Start_datetime','End_datetime')],
as.POSIXct, format='%Y-%m-%dT%H:%M:%S')
We need to generate a sequence of hourly timestamps:
startdate <- trunc(min(x$Start_datetime), units = "hours")
enddate <- round(max(x$End_datetime), units = "hours")
c(startdate, enddate)
# [1] "2012-02-01 07:00:00 PST" "2012-02-02 06:00:00 PST"
timestamps <- seq(startdate, enddate, by = "hour")
head(timestamps)
# [1] "2012-02-01 07:00:00 PST" "2012-02-01 08:00:00 PST" "2012-02-01 09:00:00 PST"
# [4] "2012-02-01 10:00:00 PST" "2012-02-01 11:00:00 PST" "2012-02-01 12:00:00 PST"
(Assumption: all end timestamps are after their start timestamps ...)
Now it's just a matter of tallying:
counts <- mapply(function(st,en) sum(st <= x$End_datetime & x$Start_datetime <= en),
timestamps[-length(timestamps)], timestamps[-1])
data.frame(
start = timestamps[ -length(timestamps) ],
count = counts
)
# start count
# 1 2012-02-01 07:00:00 2
# 2 2012-02-01 08:00:00 2
# 3 2012-02-01 09:00:00 2
# 4 2012-02-01 10:00:00 3
# 5 2012-02-01 11:00:00 3
# 6 2012-02-01 12:00:00 3
# 7 2012-02-01 13:00:00 3
# 8 2012-02-01 14:00:00 3
# 9 2012-02-01 15:00:00 3
# 10 2012-02-01 16:00:00 3
# 11 2012-02-01 17:00:00 3
# 12 2012-02-01 18:00:00 1
# 13 2012-02-01 19:00:00 0
# 14 2012-02-01 20:00:00 0
# 15 2012-02-01 21:00:00 0
# 16 2012-02-01 22:00:00 1
# 17 2012-02-01 23:00:00 1
# 18 2012-02-02 00:00:00 1
# 19 2012-02-02 01:00:00 1
# 20 2012-02-02 02:00:00 1
# 21 2012-02-02 03:00:00 1
# 22 2012-02-02 04:00:00 1
# 23 2012-02-02 05:00:00 1
I did not see #r2evans answer before posting. I came up with this independently, though it looks similar. I posted it here, so it may be helpful. Feel free to accept #r2evans answer.
Data:
df1 <- read.table(text="EmployeeNum Start_datetime End_datetime
123 '2012-02-01 07:30:00' '2012-02-01 17:45:00'
342 '2012-02-01 08:00:00' '2012-02-01 17:45:00'
876 '2012-02-01 10:45:00' '2012-02-01 18:45:00'", header = TRUE )
df1 <- within(df1, Start_datetime <- as.POSIXct( Start_datetime))
df1 <- within(df1, End_datetime <- as.POSIXct( End_datetime))
Code:
Find datetime sequence by 1 hour for each employee and count the number by Start_datetime.
Also, with this code, it is assumed that you separate original data by each single day and then apply the following code. If your data has multiple days mixed in it, with IDateTime() function from data.table package, it is possible to separate days from time and group by them while making the datetime sequence.
library('data.table')
setDT(df1) # assign data.table class by reference
df2 <- df1[, Map( f = function(x, y) seq( from = trunc(x, "hour"),
to = round(y, "hour"),
by = "1 hour" ),
x = Start_datetime, y = End_datetime ),
by = EmployeeNum ]
colnames(df2)[ colnames(df2) == "V1" ] <- "Start_datetime" # for some reason I can't assign column name properly during the column creation step.
Output:
df2[, .N, by = .( Start_datetime, End_datetime = Start_datetime + 3599 ) ]
# Start_datetime End_datetime N
# 1: 2012-02-01 07:00:00 2012-02-01 07:59:59 1
# 2: 2012-02-01 08:00:00 2012-02-01 08:59:59 2
# 3: 2012-02-01 09:00:00 2012-02-01 09:59:59 2
# 4: 2012-02-01 10:00:00 2012-02-01 10:59:59 3
# 5: 2012-02-01 11:00:00 2012-02-01 11:59:59 3
# 6: 2012-02-01 12:00:00 2012-02-01 12:59:59 3
# 7: 2012-02-01 13:00:00 2012-02-01 13:59:59 3
# 8: 2012-02-01 14:00:00 2012-02-01 14:59:59 3
# 9: 2012-02-01 15:00:00 2012-02-01 15:59:59 3
# 10: 2012-02-01 16:00:00 2012-02-01 16:59:59 3
# 11: 2012-02-01 17:00:00 2012-02-01 17:59:59 3
# 12: 2012-02-01 18:00:00 2012-02-01 18:59:59 3
# 13: 2012-02-01 19:00:00 2012-02-01 19:59:59 1
Graph:
binwidth = 3600 the value indicates 1 hour = 60 min * 60 sec = 3600 seconds
library('ggplot2')
ggplot( data = df2,
mapping = aes( x = Start_datetime ) ) +
geom_histogram(binwidth = 3600, color = "red", fill = "white" ) +
scale_x_datetime( date_breaks = "1 hour", date_labels = "%H:%M" ) +
ylab("Number of Employees") +
xlab( "Working Hours: 2012-02-01" ) +
theme( axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid = element_blank(),
panel.background = element_rect( fill = "white", color = "black") )
Thank you both for your answers. I came up with a solution which is pretty similar to yours, but I was wondering if you could have a look and let me know what you think of it.
I started a new empty dataframe, and then made two nested loops, to look at each start and end time in each row, and generate a sequence of hours in between. Then I each hour in the sequence to the new empty dataframe. This way, I can simply do a count later.
staffDetailHours <- data.frame("personnelNum"=integer(0),
"workDate"=character(0),
"Hour"=integer(0))
for (i in 1:dim(DF)[1]){
hoursList <- seq(as.POSIXlt(DF[i,]$START)$hour,
as.POSIXlt(DF[i,]$END)$hour)
for (j in 1:length(hoursList)) {
staffDetailHours[nrow(staffDetailHours)+1,] = list(
DF[i,]$EmployeeNum,
DF[i,]$Date,
hoursList[j]
)
}
}

calculating difference between subsequent days without for loop

I have the following data frame:
time <- c("2004-01-01 01:30:00","2004-01-01 04:30:00","2004-01-01 07:30:00",
"2004-01-01 10:30:00","2004-01-01 13:30:00","2004-01-01 16:30:00",
"2004-01-01 19:30:00","2004-01-01 22:30:00","2004-01-02 01:30:00",
"2004-01-02 04:30:00","2004-01-02 07:30:00","2004-01-02 10:30:00",
"2004-01-02 13:30:00","2004-01-02 16:30:00","2004-01-02 19:30:00",
"2004-01-02 22:30:00","2004-01-03 01:30:00","2004-01-03 04:30:00",
"2004-01-03 07:30:00","2004-01-03 10:30:00")
d <- c(0.00, 0.00,152808.30, 739872.84, 82641.22, 83031.04, 83031.04, 82641.22, 0.00,
0.00, 267024.71,1247414.7, 151638.85, 151249.03, 151249.03, 152028.67, 0.00, 0.00,
296650.81,1355783.85)
dat <- data.frame(time = time, dat = d)
which demonstrate the accumulation (per day) of solar radiation from a forecast model for 3 days.
To convert the units of solar radiation from J/m2 to W/m2, I need to calculate the difference between the different forecast times per day and divide by 10800 (the forecast time). Here is my attempt:
itime <- as.numeric(as.Date(dat$time))
utime <- unique(itime)
l <- list()
for(i in 1:length(utime)){
idx <- itime == utime[i]
dat2 <- dat[idx,]
dat3 <- dat2[1,2]/10800
for(ii in 2:nrow(dat2)){
dat3[ii] <- (abs(dat2[ii,2] - dat2[ii-1,2]))/10800
}
df <- data.frame(dateTime = dat2$time,
dd = dat3)
l[[i]] <- df
}
df1 <- do.call(rbind.data.frame, l)
df1[,1] <- as.POSIXct(df1[,1])
which performs as expected. However, the actual data on which I intend to use this code has a length of >100 days. Thus, it is not optimal to run a loop.
Is there another method I can use instead of a loop?
I have tried:
dat2 <- c(dat[1,2]/10800,rev(abs(diff(rev(dat[,2])))/10800))
df2 <- data.frame(time = as.POSIXct(dat[,1]), dd = dat2)
which gives nearly the same answer (as the loop), but it also calculates the difference between time steps in different days, instead of isolating the calculation to individual days.
plot(df1, type = 'l')
lines(df2, col = 'red')
As you can see, there is a mismatch during the early hours.
Can anyone suggest another method?
For your list l you can have the same result by
dat <- data.frame(
time = c("2004-01-01 01:30:00","2004-01-01 04:30:00","2004-01-01 07:30:00",
"2004-01-01 10:30:00","2004-01-01 13:30:00","2004-01-01 16:30:00",
"2004-01-01 19:30:00","2004-01-01 22:30:00","2004-01-02 01:30:00",
"2004-01-02 04:30:00","2004-01-02 07:30:00","2004-01-02 10:30:00",
"2004-01-02 13:30:00","2004-01-02 16:30:00","2004-01-02 19:30:00",
"2004-01-02 22:30:00","2004-01-03 01:30:00","2004-01-03 04:30:00",
"2004-01-03 07:30:00","2004-01-03 10:30:00"),
dat = c(0.00, 0.00,152808.30, 739872.84, 82641.22, 83031.04, 83031.04, 82641.22, 0.00,
0.00, 267024.71,1247414.7, 151638.85, 151249.03, 151249.03, 152028.67, 0.00, 0.00,
296650.81,1355783.85)
)
dat$itime <- as.numeric(as.Date(dat$time))
utime <- unique(dat$itime)
daydat <- function(u) {
dat2 <- dat[dat$itime==u,]
data.frame(dateTime = dat2$time, dd = c(dat2$dat[1], abs(diff(dat2$dat)))/10800)
}
l <- lapply(utime, daydat)
Here is a version with split():
dat$itime <- as.numeric(as.Date(dat$time))
daydat <- function(d) data.frame(dateTime = d$time, dd = c(d$dat[1], abs(diff(d$dat)))/10800)
L <- split(dat, dat$itime)
l <- lapply(L, daydat)
or without creating dat$itime:
daydat <- function(d) data.frame(dateTime = d$time, dd = c(d$dat[1], abs(diff(d$dat)))/10800)
l <- lapply(split(dat, as.Date(dat$time)), FUN=daydat)
or using by()
l2 <- unclass(by(dat, as.Date(dat$time), FUN=daydat))
If you want to have the result in the original dataframe you can use ave()
dat$dd <- ave(dat$dat, as.Date(dat$time), FUN=function(x) c(x[1], abs(diff(x)))/10800)
Use can use lag() from dplyr with group_by()
library(dplyr)
df <- dat %>%
mutate(date = as.Date(time)) %>%
group_by(date) %>%
mutate(before.dat = lag(dat, order_by=date)) %>%
mutate(diff = abs(dat - before.dat)/10800) %>%
select(time, date, dat, before.dat, diff)
df
#Source: local data frame [20 x 5]
#Groups: date [3]
# time date dat before.dat diff
# <fctr> <date> <dbl> <dbl> <dbl>
#1 2004-01-01 01:30:00 2004-01-01 0.00 NA NA
#2 2004-01-01 04:30:00 2004-01-01 0.00 0.00 0.00000000
#3 2004-01-01 07:30:00 2004-01-01 152808.30 0.00 14.14891667
#4 2004-01-01 10:30:00 2004-01-01 739872.84 152808.30 54.35782778
#5 2004-01-01 13:30:00 2004-01-01 82641.22 739872.84 60.85477963
#6 2004-01-01 16:30:00 2004-01-01 83031.04 82641.22 0.03609444
#7 2004-01-01 19:30:00 2004-01-01 83031.04 83031.04 0.00000000
#8 2004-01-01 22:30:00 2004-01-01 82641.22 83031.04 0.03609444
#9 2004-01-02 01:30:00 2004-01-02 0.00 NA NA
#10 2004-01-02 04:30:00 2004-01-02 0.00 0.00 0.00000000
#11 2004-01-02 07:30:00 2004-01-02 267024.71 0.00 24.72451019
#12 2004-01-02 10:30:00 2004-01-02 1247414.70 267024.71 90.77685093
#13 2004-01-02 13:30:00 2004-01-02 151638.85 1247414.70 101.46072685
#14 2004-01-02 16:30:00 2004-01-02 151249.03 151638.85 0.03609444
#15 2004-01-02 19:30:00 2004-01-02 151249.03 151249.03 0.00000000
#16 2004-01-02 22:30:00 2004-01-02 152028.67 151249.03 0.07218889
#17 2004-01-03 01:30:00 2004-01-03 0.00 NA NA
#18 2004-01-03 04:30:00 2004-01-03 0.00 0.00 0.00000000
#19 2004-01-03 07:30:00 2004-01-03 296650.81 0.00 27.46766759
#20 2004-01-03 10:30:00 2004-01-03 1355783.85 296650.81 98.06787407
Simplified code based on GGamba's comment
dat %>%
mutate(time = as.Date(time)) %>%
group_by(time) %>%
mutate(diff = (dat-lag(dat)) / 10800)

Resources