Histograms or Density plots of time of day - r

I have the data.frame in which every row is an episode with a start and an end timestamp.
test.DF<-dput(head(test.DF, n=50))
structure(list(start = structure(c(1189494920, 1189495400, 1189496120,
1189496840, 1189497440, 1189498040, 1189498640, 1189501760, 1189503560,
1190453600, 1247458520, 1247480840, 1247482880, 1247483840, 1247485040,
1247486600, 1247487320, 1247488040, 1247488760, 1247490920, 1247491280,
1247492480, 1247493680, 1247502440, 1247503160, 1247503520, 1247548040,
1247549360, 1247550680, 1247552600, 1247553920, 1247557400, 1247558000,
1247558480, 1247559440, 1247560400, 1247563760, 1247564960, 1247566640,
1247567120, 1194935549, 1194936029, 1195722629, 1195724309, 1199691029,
1199692349, 1202560229, 1208063669, 1208322989, 1188188112), class = c("POSIXct",
"POSIXt"), tzone = ""), end = structure(c(1189495280, 1189495520,
1189496360, 1189497080, 1189497560, 1189498160, 1189498760, 1189501880,
1189503920, 1190453720, 1247458640, 1247480960, 1247483480, 1247484080,
1247485640, 1247486840, 1247487560, 1247488640, 1247490440, 1247491160,
1247491520, 1247492600, 1247493920, 1247502680, 1247503400, 1247504120,
1247549240, 1247550560, 1247551280, 1247552720, 1247554400, 1247557880,
1247558240, 1247559080, 1247559560, 1247560760, 1247563880, 1247565080,
1247566760, 1247567240, 1194935669, 1194936269, 1195722749, 1195724429,
1199691269, 1199692469, 1202560349, 1208063789, 1208323109, 1188204792
), class = c("POSIXct", "POSIXt"), tzone = "")), .Names = c("start",
"end"), row.names = c(NA, 50L), class = "data.frame")
I would like to see the distribution of these episodes within a 24 hour cycle. That is either a histogram or a density plot, with the 24H day cycle in the x axis. Is this possible? I would like to ignore the dates of the episodes.

By converting to a POSIXltformat, you can easily extract the hour of the time:
par(mar=c(6,4,1,1))
Hour <- as.POSIXlt(test.DF$start)$hour
hist(Hour, breaks=seq(0, 23), main="Start time (hour)")
Edit: Adding a value for ever minute between start and end
fun <- function(start.time, end.time){
seq.POSIXt(
as.POSIXlt(
paste0("2000-01-01 ", as.POSIXlt(start.time)$hour, ":", as.POSIXlt(start.time)$min)
),
as.POSIXlt(
paste0("2000-01-01 ", as.POSIXlt(end.time)$hour, ":", as.POSIXlt(end.time)$min)
),
by="min"
)
}
HM <- vector(mode="list", dim(test.DF)[1])
for(i in seq(HM)){
HM[[i]] <- fun(test.DF$start[i], test.DF$end[i])
}
HM2 <- as.POSIXlt(unlist(HM), origin="1970-01-01")
Hour <- HM2$hour
hist(Hour, breaks=seq(0, 23))
HourMinute <- HM2$hour + HM2$min/60
hist(HourMinute, breaks=seq(0, 23, by=1/60))

Related

unexpected result using aggregate on date_time

i have a date_time POSIXct object in a large (6 month) dataframe in 5 second increments that i want to aggregate into 30s 'blocks'. 6x 5s is 30s so nrow(df)/6 gives the correct sequence length.
I tried the following:
Date_time_30s <- aggregate(Mn$Date_time, list(seq(0, length.out = nrow(Mn)) %/% 6), FUN = mean)
The first 6 date_times look like this:
"","Date_time","Depth","Temperature","Light_Level","Date"
"1",2013-10-14 12:30:00,
"2",2013-10-14 12:30:05,
"3",2013-10-14 12:30:10,
"4",2013-10-14 12:30:15,
"5",2013-10-14 12:30:20,
"6",2013-10-14 12:30:25,
and so the mean should be 2013-10-14 12:30:12.5 but it comes out as 2013-10-14 11:30:12.
no decimal second (a simple formatting issue solved by options(digits.secs=3) ) but the hour is wrong.
What's going wrong?
dput(head(Mn))
structure(list(Date_time = structure(c(1381721400, 1381721405,
1381721410, 1381721415, 1381721420, 1381721425), class = c("POSIXct",
"POSIXt"), tzone = "Asia/Tokyo"), Depth = c(64.4476273148148,
65.9476334145628, 65.9476395143109, 66.4476456140589, 67.9476517138069,
66.9476578135549), Temperature = c(27.549999, 27.5, 27.400002,
27.35, 27.25, 27.200001), Light_Level = c(148L, 148L, 148L, 148L,
147L, 147L), Date = structure(c(15992, 15992, 15992, 15992, 15992,
15992), class = "Date"), vv = c(0, 0.300001, 1e-06, 0.100001,
0.300001, -0.199999), vv_abs = c(0, 0.300001, 1e-06, 0.100001,
0.300001, 0.199999)), row.names = c(NA, 6L), class = "data.frame")
Run this before the code:
options(digits.secs=3)
Can you run this command? It will give you the result in different time zones. Tokyo should be the correct time.
library(lubridate)
library(dplyr)
Date_time_30s <- aggregate(Mn$Date_time, list(seq(0, length.out = nrow(Mn)) %/% 6), FUN = mean) %>%
mutate(Tokyo = with_tz(x, tzone = "Asia/Tokyo"),
GMT = with_tz(x, tzone = "GMT"))

time average for specific time range in r

I am trying to extract average values of all variables between 0 to 40 minutes every hour.
dput(head(df))
structure(list(DateTime = structure(c(1563467460, 1563468060,
1563468660, 1563469260, 1563469860, 1563470460), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), date = structure(c(1563467460, 1563468060,
1563468660, 1563469260, 1563469860, 1563470460), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), Date = structure(c(18095, 18095, 18095,
18095, 18095, 18095), class = "Date"), TimeCtr = structure(c(1563467460,
1563468060, 1563468660, 1563469260, 1563469860, 1563470460), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), MassConc = c(0.397627, 0.539531, 0.571902,
0.608715, 0.670382, 0.835773), VolConc = c(175.038, 160.534,
174.386, 183.004, 191.074, 174.468), NumbConc = c(234.456, 326.186,
335.653, 348.996, 376.018, 488.279), MassD = c(101.426, 102.462,
101.645, 102.145, 101.255, 101.433)), .Names = c("DateTime",
"date", "Date", "TimeCtr", "MassConc", "VolConc", "NumbConc",
"MassD"), row.names = c(NA, 6L), class = "data.frame")
What I've tried so far..
hourly_mean<-mydata %>%
filter(between(as.numeric(format(DateTime, "%M")), 0, 40)) %>%
group_by(DateTime=format(DateTime, "%Y-%m-%d %H")) %>%
summarise(variable1_mean=mean(variable1))
But it gives me a single average value for the whole period. Any help is very much welcomed.
We can convert DateTime , use ceiling_date with hourly unit to round Datetime, extract minutes from DateTime and filter to keep minutes which are less than 40, group_by hour and take mean of values.
library(lubridate)
library(dplyr)
df %>%
dplyr::mutate(DateTime = ymd_hm(DateTime),
hour = ceiling_date(DateTime, "hour"),
minutes = minute(DateTime)) %>%
filter(minutes <= 40) %>%
group_by(hour) %>%
summarise_at(vars(ends_with("Conc")), mean)
data
df <- structure(list(DateTime = structure(1:7, .Label = c("2019-08-0810:07",
"2019-08-0810:17", "2019-08-0810:27", "2019-08-0810:37", "2019-08-0810:47",
"2019-08-0810:57", "2019-08-0811:07"), class = "factor"), MassConc = c(0.556398,
1.06868, 0.777654, 0.87289, 0.789704, 0.51948, 0.416676), NumbConc = c(588.069,
984.018, 964.634, 997.678, 1013.52, 924.271, 916.357), VolConc = c(582.887,
979.685, 963.3, 994.178, 1009.52, 922.104, 916.856), Conc = c(281.665,
486.176, 420.058, 422.101, 429.841, 346.539, 330.282)), class =
"data.frame", row.names = c(NA, -7L))

R: How to print only the time on the x axis leaving out the date

I have an xts object, a snapshot of the data is as follow:
ts <- structure(c(620519.432512971, 619086.596917204, 620478.784694651,
620997.044378227, 620885.262877848, 620275.545603053), index = structure(c(1519954199.9845,
1519955999.7115, 1519957799.9675, 1519959599.9935, 1519961399.9365,
1519963199.2225), tzone = "", tclass = c("POSIXct", "POSIXt")), .indexCLASS = c("POSIXct",
"POSIXt"), .indexTZ = "", tclass = c("POSIXct", "POSIXt"), tzone = "", class = c("xts",
"zoo"), .Dim = c(6L, 1L), .Dimnames = list(NULL, "yy"))
All the timestamps are within the same date, and I am trying to add more tick marks on the x axis (time), which I managed to achieve. It looks as follow:
ggplot(data=ts) +
geom_line(aes(x=Index,y=yy,colour = "yy"),na.rm=T) +
scale_x_datetime(date_breaks ="20 min")
As you can see the timestamps above are correct, but i want to remove the date part and just have the time on the x-axis. So I tried the following code:
ggplot(data=ts) +
geom_line(aes(x=Index,y=yy,colour = "yy"),na.rm=T) +
scale_x_datetime(date_breaks ="20 min",labels = date_format("%H:%M:%S"))
But the times are all incorrect. What should I do to fix this to make it print only the time, leaving out the date, since the xts data are all within the same date ?
I believe your times are being converted to UTC times when you plot them. You can convert to your timezone. Here is how I convert to using the "US/Pacific" time zone. I added the tz argument to date_format
ggplot(data=ts) +
geom_line(aes(x=Index,y=yy,colour = "yy"),na.rm=T) +
scale_x_datetime(date_breaks ="20 min",
labels = date_format("%H:%M:%S", tz = "US/Pacific"))
You may have to change the timezone (see OlsonNames()
It is very simple,
ggplot(data=ts) +
geom_line(aes(x = Index,y = yy,colour = "yy"), na.rm=T) +
scale_x_datetime(date_breaks = "20 min", date_labels = "%H:%M:%S")
You just have to use parameter date_labels to set the formatting option of your datetime object.
Sources:
- http://ggplot2.tidyverse.org/reference/scale_date.html
- https://www.stat.berkeley.edu/~s133/dates.html

optimisation of a condition for loop in r

I have 2 datasets, one of which contains measurements of temperature at 30 min intervals
ordered.temp<-structure(list(time = structure(c(1385244720, 1385246520, 1385248320,
1385250120, 1385251920, 1385253720, 1385255520, 1385257320, 1385259120,
1385260920), class = c("POSIXct", "POSIXt"), tzone = ""), temp = c(30.419,
29.34, 28.965, 28.866, 28.891, 28.866, 28.692, 28.419, 28.122,
27.85), hoboID = c(2392890L, 2392890L, 2392890L, 2392890L, 2392890L,
2392890L, 2392890L, 2392890L, 2392890L, 2392890L)), .Names = c("time",
"temp", "hoboID"), row.names = c(NA, 10L), class = "data.frame")
, the other I created to be able to assign each measurement into 3-hour bins
df<-structure(list(start = structure(c(1385182800, 1385193600, 1385204400,
1385215200, 1385226000, 1385236800, 1385247600, 1385258400, 1385269200,
1385280000), class = c("POSIXct", "POSIXt"), tzone = ""), end = structure(c(1385193600,
1385204400, 1385215200, 1385226000, 1385236800, 1385247600, 1385258400,
1385269200, 1385280000, 1385290800), class = c("POSIXct", "POSIXt"
), tzone = ""), b = 1:10), .Names = c("start", "end", "b"), row.names = c(NA,
10L), class = "data.frame")
For simplicity, I created a subset of the data, but in reality the temp dataframe is 460k rows long and growing bigger every year. I wrote a for loop to compare each line in temp with lines in bin and assign it the corresponding b value from the bin dataframe.
m <- length(ordered.temp$time)
b <- numeric(m)
n <- length(df$start)
for (i in 1:m){
for (j in 1:n){
if (df$start[j] < ordered.temp$time[i] & ordered.temp$time[i] <= df$end[j]){
b[i] <- df$b[j]
print(i/dim(ordered.temp)[1]*100)
}
}
}
Running this loop with 460k rows takes a very long time (i ran the loop for 1 minute and calculated that it would take ±277 hours to complete it. Therefore, it is imperative to speed this loop up or find alternative methods if this is not possible. I however have no idea how I achieve the desired result. Any help would be greatly appreciated. thanks.

Access First Graph

So I am using the PerformanceAnalytics package to plot performance summary of a simple PnL series so
library(xts)
library(PerformanceAnalytics)
dates <- structure(c(14008, 14011, 14012, 14013, 14014, 14015, 14018, 14019, 14020, 14021),
class = "Date")
PnL.xts = structure(c(0, -0.00510803851321091, -0.0102109843849305, -0.00138369232677364,
-0.00255257489213331, -0.00200279255353461, 0.0104232666033935,
0.00181846800788812, 4.72633257030091e-05, 0.0138334493571853),
.Dim = c(10L, 1L),
index = structure(c(1210291200, 1210550400, 1210636800, 1210723200,
1210809600, 1210896000, 1211155200, 1211241600,
1211328000, 1211414400),
tzone = "UTC", tclass = "Date"),
.indexCLASS = "Date", tclass = "Date",
.indexTZ = "UTC", tzone = "UTC", .Dimnames = list(NULL, "PnL"),
class = c("xts", "zoo"))
PnL.cum = cumsum(PnL.xts)
ret.ann = Return.annualized(PnL.xts, geometric = FALSE)
ret.cum = Return.cumulative(PnL.xts, geometric = FALSE)
ret.min = min(PnL.cum)
stdev = StdDev.annualized(PnL.xts)
sharpe = SharpeRatio.annualized(PnL.xts, geometric = FALSE)
stats = paste(paste("Annualized Return:", percent(round(ret.ann, 5))),
paste("Cumulative Return:", percent(round(ret.cum, 5))),
paste("Standard Deviation:", round(stdev, 5)),
paste("Sharpe Ratio:", round(sharpe, 5)), sep = '\n' )
lag = 1
descr = paste("Following fitted Granger model - ", lag, " day lag", sep = "")
charts.PerformanceSummary(R = PnL.xts, geometric = FALSE)
text(midrange(dates),ret.min, labels = stats, cex = 1)
mtext(descr, side = 3, line = 31)
However, I want to add some descriptive text into the cumulative PnL chart such as annualized return, cumulative return, standard deviation, and sharpe. How can I paste this into the whitespace in the first graph?
If I plotted the graph by itself, I could just do it with the above code. However, since the charts.PerformanceSummary function automatically plots 3 graphs, I can only access the 3rd graph it seems. Is there any way to access the first of 3 graphs printed by a function, so that I can write text on it relative to its own coordinates?
Here is an example of what I want: http://i.imgur.com/QXUb2Aq.png. But in this case, I had to manually, test values of the y coordinate until I found somthing that worked. Thanks!

Resources