I have a date_time POSIXct column in a large (6-month) dataframe recorded in 5-second increments that I want to aggregate into 30-second 'blocks'. Since 6 × 5 s = 30 s, nrow(df)/6 gives the correct sequence length.
I tried the following:
Date_time_30s <- aggregate(Mn$Date_time, list(seq(0, length.out = nrow(Mn)) %/% 6), FUN = mean)
The first 6 date_times look like this:
"","Date_time","Depth","Temperature","Light_Level","Date"
"1",2013-10-14 12:30:00,
"2",2013-10-14 12:30:05,
"3",2013-10-14 12:30:10,
"4",2013-10-14 12:30:15,
"5",2013-10-14 12:30:20,
"6",2013-10-14 12:30:25,
So the mean should be 2013-10-14 12:30:12.5, but it comes out as 2013-10-14 11:30:12.
The missing decimal seconds are a simple formatting issue, solved by options(digits.secs=3), but the hour is wrong.
What's going wrong?
dput(head(Mn))
structure(list(Date_time = structure(c(1381721400, 1381721405,
1381721410, 1381721415, 1381721420, 1381721425), class = c("POSIXct",
"POSIXt"), tzone = "Asia/Tokyo"), Depth = c(64.4476273148148,
65.9476334145628, 65.9476395143109, 66.4476456140589, 67.9476517138069,
66.9476578135549), Temperature = c(27.549999, 27.5, 27.400002,
27.35, 27.25, 27.200001), Light_Level = c(148L, 148L, 148L, 148L,
147L, 147L), Date = structure(c(15992, 15992, 15992, 15992, 15992,
15992), class = "Date"), vv = c(0, 0.300001, 1e-06, 0.100001,
0.300001, -0.199999), vv_abs = c(0, 0.300001, 1e-06, 0.100001,
0.300001, 0.199999)), row.names = c(NA, 6L), class = "data.frame")
Run this before the code:
options(digits.secs=3)
Then run the command below. It shows the aggregated result in two different time zones side by side; the Tokyo column should display the correct time.
library(lubridate)
library(dplyr)
Date_time_30s <- aggregate(Mn$Date_time, list(seq(0, length.out = nrow(Mn)) %/% 6), FUN = mean) %>%
  mutate(Tokyo = with_tz(x, tzone = "Asia/Tokyo"),
         GMT   = with_tz(x, tzone = "GMT"))
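If the stored instants are right and only the displayed zone is off, an alternative that sidesteps the row-index arithmetic is to group on a 30-second floor of the timestamp itself. A minimal sketch, assuming clock-aligned 30-second blocks are acceptable (this groups by clock time rather than by row count, so it differs from the %/% 6 approach when there are gaps in the data):
library(dplyr)
library(lubridate)
# floor each timestamp to its 30-second block and average within blocks;
# floor_date() preserves the POSIXct tzone, so the result stays in Asia/Tokyo
Date_time_30s <- Mn %>%
  group_by(block = floor_date(Date_time, "30 seconds")) %>%
  summarise(Date_time = mean(Date_time))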
Related
I am trying to extract the average values of all variables between 0 and 40 minutes past each hour.
dput(head(df))
structure(list(DateTime = structure(c(1563467460, 1563468060,
1563468660, 1563469260, 1563469860, 1563470460), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), date = structure(c(1563467460, 1563468060,
1563468660, 1563469260, 1563469860, 1563470460), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), Date = structure(c(18095, 18095, 18095,
18095, 18095, 18095), class = "Date"), TimeCtr = structure(c(1563467460,
1563468060, 1563468660, 1563469260, 1563469860, 1563470460), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), MassConc = c(0.397627, 0.539531, 0.571902,
0.608715, 0.670382, 0.835773), VolConc = c(175.038, 160.534,
174.386, 183.004, 191.074, 174.468), NumbConc = c(234.456, 326.186,
335.653, 348.996, 376.018, 488.279), MassD = c(101.426, 102.462,
101.645, 102.145, 101.255, 101.433)), .Names = c("DateTime",
"date", "Date", "TimeCtr", "MassConc", "VolConc", "NumbConc",
"MassD"), row.names = c(NA, 6L), class = "data.frame")
What I've tried so far:
hourly_mean <- mydata %>%
  filter(between(as.numeric(format(DateTime, "%M")), 0, 40)) %>%
  group_by(DateTime = format(DateTime, "%Y-%m-%d %H")) %>%
  summarise(variable1_mean = mean(variable1))
But it gives me a single average value for the whole period. Any help is very much welcomed.
We can convert DateTime, use ceiling_date with an hourly unit to round it, extract the minutes from DateTime, filter to keep rows whose minutes are at most 40, group by hour, and take the mean of the values. (An adaptation to the question's own POSIXct data follows the example data below.)
library(lubridate)
library(dplyr)
df %>%
  dplyr::mutate(DateTime = ymd_hm(DateTime),
                hour = ceiling_date(DateTime, "hour"),
                minutes = minute(DateTime)) %>%
  filter(minutes <= 40) %>%
  group_by(hour) %>%
  summarise_at(vars(ends_with("Conc")), mean)
data
df <- structure(list(DateTime = structure(1:7, .Label = c("2019-08-08 10:07",
"2019-08-08 10:17", "2019-08-08 10:27", "2019-08-08 10:37", "2019-08-08 10:47",
"2019-08-08 10:57", "2019-08-08 11:07"), class = "factor"), MassConc = c(0.556398,
1.06868, 0.777654, 0.87289, 0.789704, 0.51948, 0.416676), NumbConc = c(588.069,
984.018, 964.634, 997.678, 1013.52, 924.271, 916.357), VolConc = c(582.887,
979.685, 963.3, 994.178, 1009.52, 922.104, 916.856), Conc = c(281.665,
486.176, 420.058, 422.101, 429.841, 346.539, 330.282)), class =
"data.frame", row.names = c(NA, -7L))
I have a date/time data table imported from Excel, with the date/time column in number format (e.g., 43596.22). I used the following code (convertToDateTime from the openxlsx package) to convert the numbers to a date/time format with UTC time zone:
info_dt1$Date_time<-convertToDateTime(info_dt1$date_time, origin = "1900-01-01",tx="UTC")
I am using the foverlaps function from data.table to merge this data table with another data table by date and time. When I first ran the following code:
info_dt3 = foverlaps(info_dt2, info_access3, by.x=c("Date_time", "dummy"), nomatch=NA)[, dummy := NULL]
I got an error message stating the two date time fields had different time zones. The time zone for the other data table was also specified as UTC.
I used the attr function to set both data tables date times columns to UTC:
#make sure all date/times have same time zone
attr(info_access2$Start_time, "tzone") <- "UTC"
attr(info_access2$End_time, "tzone") <- "UTC"
attr(info_dt1$Date_time, "tzone") <- "UTC"
When I do this, the times in the info_dt1 data table move forward 4 hours and the resulting merge is off. I would like to know what I am doing incorrectly when setting the format and time zone of the two data tables so that the merge works correctly.
Some example data and code:
#first data table reduced example
info_dt1<-
structure(list(date_time = c(NA, 43596.2284722222, 43596.2285069444,
43596.2285416667, 43596.2285763889, 43596.2286111111, 43596.2286458333,
43596.2286805556, 43596.2287152778, 43596.22875), Temp = c(NA,
22.75, 22.66, 22.57, 22.49, 22.37, 22.28, 22.16, 22.08, 21.99
), Depth = c(NA, 0.19, 0.27, 0.7, 0.27, 0.27, 0.27, 0.19, 0.19,
0.19), Angle = c(NA, -3, -4, -3, -1, 1, -1, -2, 1, -6)), .Names = c("date_time",
"Temp", "Depth", "Angle"), row.names = c(NA, 10L), class = "data.frame")
#convert date time to POSIXct (convertToDateTime is from openxlsx)
library(openxlsx)
info_dt1$Date_time<-convertToDateTime(info_dt1$date_time, origin = "1900-01-01",tx="UTC")
#second example data set
info_access2<-
structure(list(Tow = 201905001:201905010, Start_time = structure(c(1557554271,
1557564948, 1557569853, 1557573081, 1557577149, 1557582317, 1557586050,
1557588636, 1557590697, 1557593679), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), End_time = structure(c(1557555117, 1557565710,
1557570765, 1557573846, 1557577974, 1557583210, 1557586797, 1557589428,
1557591441, 1557594511), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
time_interval = structure(c(846, 762, 912, 765, 825, 893,
747, 792, 744, 832), start = structure(c(1557554271, 1557564948,
1557569853, 1557573081, 1557577149, 1557582317, 1557586050,
1557588636, 1557590697, 1557593679), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), tzone = "UTC", class = structure("Interval", package = "lubridate"))), .Names = c("Tow",
"Start_time", "End_time", "time_interval"), row.names = c(NA,
10L), class = "data.frame")
library(data.table)
#make info_dt2 and info_access2 data.tables
info_access3<-as.data.table(info_access2)
info_dt2<-as.data.table(info_dt1)
#remove NA from info_dt2
info_dt2<-info_dt2[complete.cases(info_dt2),]
#set dummy column for info_dt2
info_dt2[, dummy := Date_time]
#define setkey for info_access2
setkey(info_access3, Start_time, End_time)
#if I run the code like this I get the error message about different time zones
#use foverlaps to merge info_access3 and info_dt2
info_dt3 = foverlaps(info_dt2, info_access3, by.x=c("Date_time", "dummy"), nomatch=NA)[, dummy := NULL]
#if I run this chunk of code the times in info_dt1 are moved forward 4 hours
#make sure all date/times have same time zone
attr(info_access2$Start_time, "tzone") <- "UTC"
attr(info_access2$End_time, "tzone") <- "UTC"
attr(info_dt1$Date_time, "tzone") <- "UTC"
#make info_dt2 and info_access2 data.tables
info_access3<-as.data.table(info_access2)
info_dt2<-as.data.table(info_dt1)
#remove NA from info_dt2
info_dt2<-info_dt2[complete.cases(info_dt2),]
#but the foverlaps to merge info_access2 and info_dt2 doesn't give an error message
info_dt3 = foverlaps(info_dt2, info_access3, by.x=c("Date_time", "dummy"), nomatch=NA)[, dummy := NULL]
You can use lubridate::force_tz() to change a timestamp which had an inaccurate timezone when it was read in:
lubridate::force_tz(Sys.time(), "UTC")
#[1] "2019-06-25 14:04:32 UTC"
This will change the underlying timestamp double, whereas merely altering the tzone attribute won't: the attribute only changes how the same instant is displayed.
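A minimal illustration of the difference (the example instant is hypothetical; 2019-05-11 falls in EDT, UTC-4, which matches the 4-hour shift you saw):
library(lubridate)
t <- as.POSIXct("2019-05-11 05:29:00", tz = "America/New_York")
force_tz(t, "UTC")  # "2019-05-11 05:29:00 UTC" -- same clock time, new instant
with_tz(t, "UTC")   # "2019-05-11 09:29:00 UTC" -- same instant, new clock time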
I have a dataframe with dates. Here are the first 3 rows with dput:
df.cv <- structure(list(ds = structure(c(1448064000, 1448150400, 1448236800
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), y = c(10.4885204292416,
10.456538985014, 10.4264986311659), yhat = c(10.4851491194439,
10.282089547027, 10.4354960430083), yhat_lower = c(10.4169914076864,
10.2162549984153, 10.368531352493), yhat_upper = c(10.5506038959764,
10.3556867861042, 10.5093092789713), cutoff = structure(c(1447977600,
1447977600, 1447977600), class = c("POSIXct", "POSIXt"), tzone = "UTC")),.Names = c("ds",
"y", "yhat", "yhat_lower", "yhat_upper", "cutoff"), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame"))
I'm trying to plot the data with ggplot + geom_line so that similar day/month combinations appear in one plot. So, for example, I want the y-value of 2016-01-01 to appear at the same x-value as 2017-01-01. I found a way to do this, but it seems a very complex workaround:
library(tidyverse)
library(lubridate)
p <- df.cv %>%
  mutate(jaar = as.factor(year(ds))) %>%
  mutate(x = as_date(as.POSIXct(
    ifelse(jaar == 2016, ds + years(1), ds),
    origin = "1970-01-01")))
ggplot(p %>% filter(jaar != 2015), aes(x = x, group = jaar, color = jaar)) +
  geom_line(aes(y = y))
It works, but as you can see I first have to extract the year, then use an ifelse to add one year to only the 2016 dates, convert back to POSIXct with an explicit origin because ifelse strips the class, and finally drop the time component with as_date.
Isn't there a simpler, more elegant way to do this?
Use year<- to replace the year with any fixed leap year:
p <- df.cv %>%
  mutate(jaar = as.factor(year(ds)),
         x = `year<-`(as_date(ds), 2000))
ggplot(p, aes(x = x, y = y, color = jaar)) +
  geom_line()
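Here `year<-` is lubridate's replacement function for year() called in prefix form, and 2000 is picked because it is a leap year, so Feb 29 dates survive the replacement. If the inline call reads oddly, an equivalent two-step sketch:
# same effect as `year<-`(as_date(ds), 2000)
x <- as_date(df.cv$ds)
year(x) <- 2000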
I have a data.frame in which every row is an episode with a start and an end timestamp.
test.DF<-dput(head(test.DF, n=50))
structure(list(start = structure(c(1189494920, 1189495400, 1189496120,
1189496840, 1189497440, 1189498040, 1189498640, 1189501760, 1189503560,
1190453600, 1247458520, 1247480840, 1247482880, 1247483840, 1247485040,
1247486600, 1247487320, 1247488040, 1247488760, 1247490920, 1247491280,
1247492480, 1247493680, 1247502440, 1247503160, 1247503520, 1247548040,
1247549360, 1247550680, 1247552600, 1247553920, 1247557400, 1247558000,
1247558480, 1247559440, 1247560400, 1247563760, 1247564960, 1247566640,
1247567120, 1194935549, 1194936029, 1195722629, 1195724309, 1199691029,
1199692349, 1202560229, 1208063669, 1208322989, 1188188112), class = c("POSIXct",
"POSIXt"), tzone = ""), end = structure(c(1189495280, 1189495520,
1189496360, 1189497080, 1189497560, 1189498160, 1189498760, 1189501880,
1189503920, 1190453720, 1247458640, 1247480960, 1247483480, 1247484080,
1247485640, 1247486840, 1247487560, 1247488640, 1247490440, 1247491160,
1247491520, 1247492600, 1247493920, 1247502680, 1247503400, 1247504120,
1247549240, 1247550560, 1247551280, 1247552720, 1247554400, 1247557880,
1247558240, 1247559080, 1247559560, 1247560760, 1247563880, 1247565080,
1247566760, 1247567240, 1194935669, 1194936269, 1195722749, 1195724429,
1199691269, 1199692469, 1202560349, 1208063789, 1208323109, 1188204792
), class = c("POSIXct", "POSIXt"), tzone = "")), .Names = c("start",
"end"), row.names = c(NA, 50L), class = "data.frame")
I would like to see the distribution of these episodes within a 24-hour cycle, as either a histogram or a density plot with the 24-hour cycle on the x axis. Is this possible? I would like to ignore the dates of the episodes.
By converting to POSIXlt format, you can easily extract the hour of the day:
par(mar=c(6,4,1,1))
Hour <- as.POSIXlt(test.DF$start)$hour
hist(Hour, breaks=seq(0, 24), right=FALSE, main="Start time (hour)")
Edit: Adding a value for ever minute between start and end
# build, for one episode, the sequence of clock minutes it covers, mapped onto
# a fixed dummy date (2000-01-01) so that only the time of day is retained
# (assumes an episode does not cross midnight)
fun <- function(start.time, end.time){
seq.POSIXt(
as.POSIXlt(
paste0("2000-01-01 ", as.POSIXlt(start.time)$hour, ":", as.POSIXlt(start.time)$min)
),
as.POSIXlt(
paste0("2000-01-01 ", as.POSIXlt(end.time)$hour, ":", as.POSIXlt(end.time)$min)
),
by="min"
)
}
HM <- vector(mode="list", dim(test.DF)[1])
for(i in seq(HM)){
HM[[i]] <- fun(test.DF$start[i], test.DF$end[i])
}
HM2 <- as.POSIXlt(unlist(HM), origin="1970-01-01")
Hour <- HM2$hour
hist(Hour, breaks=seq(0, 24), right=FALSE)
HourMinute <- HM2$hour + HM2$min/60
hist(HourMinute, breaks=seq(0, 24, by=1/60), right=FALSE)
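Since the question also allows a density plot, a base-graphics sketch over the same per-minute values (from/to simply clamp the estimate to the 24-hour cycle):
plot(density(HourMinute, from = 0, to = 24),
     main = "Episode coverage by time of day", xlab = "Hour of day")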
I am trying to improve the memory performance for the following example:
baseline df with 4 rows
df <- structure(list(sessionid = structure(c(1L, 2L, 3L, 4L), .Label =
c("AAA1", "AAA2","AAA3", "AAA4"), class = "factor"), bitrateinbps = c(10000000,
10000000, 10000000, 10000000), startdate = structure(c(1326758507, 1326758671,
1326759569, 1326760589), class = c("POSIXct", "POSIXt"), tzone = ""), enddate =
structure(c(1326765780, 1326758734, 1326760629, 1326761592), class = c("POSIXct",
"POSIXt"), tzone = "")), .Names = c("sessionid", "bitrateinbps", "startdate",
"enddate"), row.names = c(NA, 4L), class =
"data.frame")
alternate df with 8 rows
df <- structure(list(sessionid = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L),
.Label = c("AAA1", "AAA2", "AAA3", "AAA4", "AAA5", "AAA6", "AAA7", "AAA8"),
class = "factor"), bitrateinbps =c(10000000, 10000000, 10000000, 10000000,
10000000, 10000000, 10000000, 10000000), startdate = structure(c(1326758507,
1326758671, 1326759569, 1326760589, 1326761589, 1326762589, 1326763589, 1326764589),
class = c("POSIXct",
"POSIXt"), tzone = ""), enddate = structure(c(1326765780, 1326758734, 1326760629,
1326761592, 1326767592,
1326768592, 1326768700, 1326769592), class = c("POSIXct", "POSIXt"), tzone = "")),
.Names = c("sessionid",
"bitrateinbps", "startdate", "enddate"), row.names = c(NA, 8L), class =
"data.frame")
Run the analysis and check memory usage for the baseline df, then again for the alternate df:
library(xts)
fun0 <- function(i, d) {
idx0 <- seq(d$startdate[i],d$enddate[i],1) # create sequence for index
dat0 <- rep(1,length(idx0)) # create data over sequence
xts(dat0, idx0, dimnames=list(NULL,d$sessionid[i])) # xts object
}
# loop over each row and put each row into its own xts object
xl0 <- lapply(1:NROW(df), fun0, d=df)
# merge all the xts objects
xx0 <- do.call(merge, xl0)
# apply a function (e.g. colMeans) to each 15-minute period
xa0 <- period.apply(xx0, endpoints(xx0, 'minutes', 15), colSums, na.rm=TRUE)/900
xa1 <- t(xa0)
# convert from atomic vector to data frame
xa1 = as.data.frame(xa1)
# bind to df
out1 = cbind(df, xa1)
# print aggregate memory usage statistics
print(paste('R is using', memory.size(), 'MB out of limit', memory.limit(), 'MB'))
# create function to return matrix of memory consumption
object.sizes <- function()
{
return(rev(sort(sapply(ls(envir=.GlobalEnv), function (object.name)
object.size(get(object.name))))))
}
# print to console in table format
object.sizes()
results as follows:
4 row df:
xx0 = 292104 Bytes .... do.call(merge, xl0)
xl0 = 154648 Bytes .... lapply(1:NROW(df), fun0, d=df)
8 row df:
xx0 = 799480 Bytes .... do.call(merge, xl0)
xl0 = 512808 Bytes .... lapply(1:NROW(df), fun0, d=df)
I'm looking for something a little more memory-efficient for the merge and lapply steps, so that I can scale up the number of rows. If anyone has suggestions and can show comparative results for the alternatives, that would be great. One possible direction is sketched below.
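One direction worth trying (a hedged sketch, not benchmarked): skip the per-second expansion and the merge entirely, and compute each session's overlap with each 15-minute bin by interval arithmetic, so memory scales with rows × bins rather than with total seconds. The lubridate-based bin alignment is an assumption; xts::endpoints aligns its bins slightly differently.
library(lubridate)
# for each 15-minute bin, the fraction covered by each session is
# max(0, min(end, bin_end) - max(start, bin_start)) / 900,
# computed vectorised over rows -- no per-second objects are materialised
bins <- seq(floor_date(min(df$startdate), "15 minutes"),
            ceiling_date(max(df$enddate), "15 minutes"),
            by = "15 mins")
frac <- sapply(seq_len(length(bins) - 1), function(j)
  pmax(0, pmin(as.numeric(df$enddate),   as.numeric(bins[j + 1])) -
          pmax(as.numeric(df$startdate), as.numeric(bins[j]))) / 900)
colnames(frac) <- format(head(bins, -1))
out2 <- cbind(df, frac)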