I have date time data table imported from Excel and the date/time column in a number format (i.e., 43596.22). I used the following code to convert the number to a date time format with UTC time zone:
info_dt1$Date_time<-convertToDateTime(info_dt1$date_time, origin = "1900-01-01",tx="UTC")
I am using the forverlaps function from data.table to merge this data table with another data table by date and time. When I first ran the following code:
info_dt3 = foverlaps(info_dt2, info_access3, by.x=c("Date_time", "dummy"), nomatch=NA)[, dummy := NULL]
I got an error message stating the two date time fields had different time zones. The time zone for the other data table was also specified as UTC.
I used the attr function to set both data tables date times columns to UTC:
#make sure all date/times have same time zone
attr(info_access2$Start_time, "tzone") <- "UTC"
attr(info_access2$End_time, "tzone") <- "UTC"
attr(info_dt1$Date_time, "tzone") <- "UTC"
When I do this, the info_dt1 data table time moves forward 4 hours and the resulting merge is off. I would like to know what I am doing incorrect when setting the format and time zone for both data tables for the merge to work correctly.
Some example data and code:
#first data table reduced example
info_dt1<-
structure(list(date_time = c(NA, 43596.2284722222, 43596.2285069444,
43596.2285416667, 43596.2285763889, 43596.2286111111, 43596.2286458333,
43596.2286805556, 43596.2287152778, 43596.22875), Temp = c(NA,
22.75, 22.66, 22.57, 22.49, 22.37, 22.28, 22.16, 22.08, 21.99
), Depth = c(NA, 0.19, 0.27, 0.7, 0.27, 0.27, 0.27, 0.19, 0.19,
0.19), Angle = c(NA, -3, -4, -3, -1, 1, -1, -2, 1, -6)), .Names = c("date_time",
"Temp", "Depth", "Angle"), row.names = c(NA, 10L), class = "data.frame")
#convert date time to POSIXct
info_dt1$Date_time<-convertToDateTime(info_dt1$date_time, origin = "1900-01-01",tx="UTC")
#second example data set
info_access2<-
structure(list(Tow = 201905001:201905010, Start_time = structure(c(1557554271,
1557564948, 1557569853, 1557573081, 1557577149, 1557582317, 1557586050,
1557588636, 1557590697, 1557593679), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), End_time = structure(c(1557555117, 1557565710,
1557570765, 1557573846, 1557577974, 1557583210, 1557586797, 1557589428,
1557591441, 1557594511), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
time_interval = structure(c(846, 762, 912, 765, 825, 893,
747, 792, 744, 832), start = structure(c(1557554271, 1557564948,
1557569853, 1557573081, 1557577149, 1557582317, 1557586050,
1557588636, 1557590697, 1557593679), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), tzone = "UTC", class = structure("Interval", package = "lubridate"))), .Names = c("Tow",
"Start_time", "End_time", "time_interval"), row.names = c(NA,
10L), class = "data.frame")
library(data.table)
#make info_dt2 and info_access2 data.tables
info_access3<-as.data.table(info_access2)
info_dt2<-as.data.table(info_dt1)
#remove NA from info_dt2
info_dt2<-info_dt2[complete.cases(info_dt2),]
#set dummy column for info_dt2
info_dt2[, dummy := Date_time]
#define setkey for info_access2
setkey(info_access3, Start_time, End_time)
#if I run the code like this I get the error message about different time zones
#use foverlaps to merge info_access3 and info_dt2
info_dt3 = foverlaps(info_dt2, info_access3, by.x=c("Date_time", "dummy"), nomatch=NA)[, dummy := NULL]
#if I run this chunk of code the times in info_dt1 are moved forward 4 hours
#make sure all date/times have same time zone
attr(info_access2$Start_time, "tzone") <- "UTC"
attr(info_access2$End_time, "tzone") <- "UTC"
attr(info_dt1$Date_time, "tzone") <- "UTC"
#make info_dt2 and info_access2 data.tables
info_access3<-as.data.table(info_access2)
info_dt2<-as.data.table(info_dt1)
#remove NA from info_dt2
info_dt2<-info_dt2[complete.cases(info_dt2),]
#but the foverlaps to merge info_access2 and info_dt2 doesn't give an error message
info_dt3 = foverlaps(info_dt2, info_access3, by.x=c("Date_time", "dummy"), nomatch=NA)[, dummy := NULL]
You can use lubridate::force_tz() to change a timestamp which had an inaccurate timezone when it was read in:
lubridate::force_tz(Sys.time(), "UTC")
#[1] "2019-06-25 14:04:32 UTC"
This will change the underlying timestamp double whereas merely altering the attribute won't.
Related
I'm using R and moveVis package of R to do some movement visualization. Below is the csv from where I import the data using read.csv
I'm having trouble converting the data.frame to moveStack using df2move
trackId,x,y,time,x1,x2,optional,sensor,timestamps
A34,19.00094708496841,72.8264388198447,2021-12-23 10:00:00,19.00094708496841,72.8264388198447,FALSE,unknown,2021-12-23 10:00:00
A34,18.986663359819435,72.84012881354482,2021-12-23 10:02:00,18.986663359819435,72.84012881354482,FALSE,unknown,2021-12-23 10:02:00
raw_data <- read.csv("mdata2.csv", header = TRUE)
m <- df2move(raw_data, proj = "+init=epsg:4326 +proj=longlat +datum=WGS84 +no_defs", x = "x1", y = "x2", time = as.POSIXct(raw_data$timestamps, format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), track_id = "trackId")
Getting this error on running above code
Error: Column named '1640233800' cannot be found in 'df'
The problem is with your time argument. The format of time in your dataset and the one you are specifying in your code do not match. That's why you are getting an error.
In case you are using excel, it formats timestamps to its own default. You'll need to change it first (if it's the case).
This is what it does:
So, please check the format in your csv and what you are specifying in your code. You can change the format in excel by selecting the timestamp values and pressing Ctrl + 1 key.
All you need is this:
raw_data$timestamps <- as.POSIXct(raw_data$timestamps, format = "%Y-%m-%d %H:%M", tz = "UTC")
m <- df2move(raw_data, proj = "+init=epsg:4326 +proj=longlat +datum=WGS84 +no_defs", x = "x1", y = "x2", time = "timestamps", track_id = "trackId")
You have to specify a "character" for time within the df2move-function. Therefore, you have to do the transformation before applying the function (as #Vishal A. suggested as well). However, the transformation to Timestamps of class POSIXct was not correct, so NAs were introduced. See the solution:
raw_data <- structure(list(trackId = c("vipin", "vipin"), x = c(72.8409492130316, 72.8363572715711), y = c(18.9968003664781, 18.9958569245008), time = c("2021-12-23 10:00:00", "2021-12-23 10:02:00"), x1 = c(72.8409492130316, 72.8363572715711), x2 = c(18.9968003664781, 18.9958569245008 ), optional = c(FALSE, FALSE), sensor = c("unknown", "unknown" ), timestamps = structure(c(NA_real_, NA_real_), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA, -2L), class = "data.frame")
raw_data$timestamps <- as.POSIXct(raw_data$time, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
m <- moveVis::df2move(raw_data, proj = "+init=epsg:4326 +proj=longlat +datum=WGS84 +no_defs", x = "x1", y = "x2", time = "timestamps", track_id = "trackId")
i have a date_time POSIXct object in a large (6 month) dataframe in 5 second increments that i want to aggregate into 30s 'blocks'. 6x 5s is 30s so nrow(df)/6 gives the correct sequence length.
I tried the following:
Date_time_30s <- aggregate(Mn$Date_time, list(seq(0, length.out = nrow(Mn)) %/% 6), FUN = mean)
The first 6 date_times look like this:
"","Date_time","Depth","Temperature","Light_Level","Date"
"1",2013-10-14 12:30:00,
"2",2013-10-14 12:30:05,
"3",2013-10-14 12:30:10,
"4",2013-10-14 12:30:15,
"5",2013-10-14 12:30:20,
"6",2013-10-14 12:30:25,
and so the mean should be 2013-10-14 12:30:12.5 but it comes out as 2013-10-14 11:30:12.
no decimal second (a simple formatting issue solved by options(digits.secs=3) ) but the hour is wrong.
What's going wrong?
dput(head(Mn))
structure(list(Date_time = structure(c(1381721400, 1381721405,
1381721410, 1381721415, 1381721420, 1381721425), class = c("POSIXct",
"POSIXt"), tzone = "Asia/Tokyo"), Depth = c(64.4476273148148,
65.9476334145628, 65.9476395143109, 66.4476456140589, 67.9476517138069,
66.9476578135549), Temperature = c(27.549999, 27.5, 27.400002,
27.35, 27.25, 27.200001), Light_Level = c(148L, 148L, 148L, 148L,
147L, 147L), Date = structure(c(15992, 15992, 15992, 15992, 15992,
15992), class = "Date"), vv = c(0, 0.300001, 1e-06, 0.100001,
0.300001, -0.199999), vv_abs = c(0, 0.300001, 1e-06, 0.100001,
0.300001, 0.199999)), row.names = c(NA, 6L), class = "data.frame")
Run this before the code:
options(digits.secs=3)
Can you run this command? It will give you the result in different time zones. Tokyo should be the correct time.
library(lubridate)
library(dplyr)
Date_time_30s <- aggregate(Mn$Date_time, list(seq(0, length.out = nrow(Mn)) %/% 6), FUN = mean) %>%
mutate(Tokyo = with_tz(x, tzone = "Asia/Tokyo"),
GMT = with_tz(x, tzone = "GMT"))
I have a dataset in which there are dates describing a time period of interest, as well as events ("Tests" in my toy example) that can fall inside or outside the period of the interest. The events also have a time and some dichotomous characteristics.
My collaborator has asked me to transform the data from this format:
structure(list(ID = c(1, 1, 2, 3), StartDate = structure(c(315878400,
315878400, 357696000, 323481600), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), EndDate = structure(c(316137600, 316310400,
357955200, 323654400), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
TestDateTime = structure(c(316135500, 315797700, 357923700,
323422560), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
TestName = c("Test1", "Test2", "Test1", "Test3"), Characteristic = c("Fast",
"Slow", "Fast", "Slow")), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
current state
to this format:
desired state
I am unsure how to accomplish this transformation or set of transformations using R, but I believe it is possible.
try the following
library(dplyr)
data %>%
select(-c(StartDate,EndDate)) %>% # Remove extra columns
tidyr::spread(TestDate, TestTime) %>% # Spread df to long form
select(-Characteristic, everything()) %>% # Move Characteristic to the end of the df
group_by(ID) %>% # Group by ID and
group_split() # split it
Take on count that the date columns of the final df are not exact as the "desire" state.
Hope this can help you.
I am having trouble with a function I wrote when trying to apply it to a dataframe to mutate in a new column
I want to add a column to a dataframe that calculates the sunrise/sunset time for all rows based on existing columns for Latitude, Longitude and Date. The sunrise/sunset calculation is derived from the "sunriseset" function from the maptools package.
Below is my function:
library(maptools)
library(tidyverse)
sunrise.set2 <- function (lat, long, date, timezone = "UTC", direction = c("sunrise", "sunset"), num.days = 1)
{
lat.long <- matrix(c(long, lat), nrow = 1)
day <- as.POSIXct(date, tz = timezone)
sequence <- seq(from = day, length.out = num.days, by = "days")
sunrise <- sunriset(lat.long, sequence, direction = "sunrise",
POSIXct = TRUE)
sunset <- sunriset(lat.long, sequence, direction = "sunset",
POSIXct = TRUE)
ss <- data.frame(sunrise, sunset)
ss <- ss[, -c(1, 3)]
colnames(ss) <- c("sunrise", "sunset")
if (direction == "sunrise") {
return(ss[1,1])
} else {
return(ss[1,2])
}
}
When I run the function for a single input I get the expected output:
sunrise.set2(41.2, -73.2, "2018-12-09 07:34:0", timezone="EST",
direction = "sunset", num.days = 1)
[1] "2018-12-09 16:23:46 EST"
However, when I try to do this on a dataframe object to mutate in a new column like so:
df <- df %>%
mutate(set = sunrise.set2(Latitude, Longitude, LocalDateTime, timezone="UTC", num.days = 1, direction = "sunset"))
I get the following error:
Error in mutate_impl(.data, dots) :
Evaluation error: 'from' must be of length 1.
The dput of my df is below. I suspect I'm not doing something right in order to properly vectorize my function but I'm not sure what.
Thanks
dput(df):
structure(list(Latitude = c(20.666, 20.676, 20.686, 20.696, 20.706,
20.716, 20.726, 20.736, 20.746, 20.756, 20.766, 20.776), Longitude = c(-156.449,
-156.459, -156.469, -156.479, -156.489, -156.499, -156.509, -156.519,
-156.529, -156.539, -156.549, -156.559), LocalDateTime = structure(c(1534318440,
1534404840, 1534491240, 1534577640, 1534664040, 1534750440, 1534836840,
1534923240, 1535009640, 1535096040, 1535182440, 1535268840), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), .Names = c("Latitude", "Longitude",
"LocalDateTime"), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"), spec = structure(list(cols = structure(list(
Latitude = structure(list(), class = c("collector_double",
"collector")), Longitude = structure(list(), class = c("collector_double",
"collector")), LocalDateTime = structure(list(format = "%m/%d/%Y %H:%M"), .Names = "format", class = c("collector_datetime",
"collector"))), .Names = c("Latitude", "Longitude", "LocalDateTime"
)), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"))
The problem is indeed that your function as it is now is not vectorized, it breaks if you give it more than one value. A workaround (as Suliman suggested) is using rowwise() or a variant of apply, but that would give your function a lot of unnecessary work.
So better to make it vectorized, as maptools::sunriset is also vectorized. First suggestion: Debug or rewrite it with vectors as input, and then you easily see the lines where something unexpected happens. Let's go at it line by line, I've outcommented your lines where I replace it with something else:
library(maptools)
library(tidyverse)
# sunrise.set2 <- function (lat, long, date, timezone = "UTC", direction = c("sunrise", "sunset"), num.days = 1)
sunrise.set2 <- function (lat, long, date, timezone = "UTC", direction = c("sunrise", "sunset")
# Why an argument saying how many days? You have the length of your dates
{
#lat.long <- matrix(c(long, lat), nrow = 1)
lat.long <- cbind(lon, lat)
day <- as.POSIXct(date, tz = timezone)
# sequence <- seq(from = day, length.out = num.days, by = "days") # Your days object is fine
sunrise <- sunriset(lat.long, day, direction = "sunrise",
POSIXct = TRUE)
sunset <- sunriset(lat.long, day, direction = "sunset",
POSIXct = TRUE)
# I've replaced sequence with day here
ss <- data.frame(sunrise, sunset)
ss <- ss[, -c(1, 3)]
colnames(ss) <- c("sunrise", "sunset")
if (direction == "sunrise") {
#return(ss[1,1])
return(ss[,1])
} else {
#return(ss[1,2])
return(ss[,2])
}
}
But looking at your function, I think there is still a lot of extra work done that doesn't serve any purpose.
You're calculating both sunrise and sunset, only to use one of them. And you can just pass one your direction-argument, without even looking at it.
Is it useful to ask for a seperate date and timezone? When your users give you a POSIXt-object, the timezone is included. And it's nice if you can input a string as a date, but that only works if it's in the right format. To keep it simple, I'd just ask for a POSIXct as input (which is in your example-data.frame)
Why are you making a data.frame and assigning names before returning? As soon as you're subsetting, it all gets dropped again.
Which means your function can be a lot shorter:
sunrise.set2 <- function(lat, lon, date, direction = c("sunrise", "sunset")) {
lat.long <- cbind(lon, lat)
sunriset(lat.long, date, direction=direction, POSIXct.out=TRUE)[,2]
}
If you have no control over your input you might need to add some checks, but usually I find it most useful to keep focused on just the thing you want to accomplish.
I have the data.frame in which every row is an episode with a start and an end timestamp.
test.DF<-dput(head(test.DF, n=50))
structure(list(start = structure(c(1189494920, 1189495400, 1189496120,
1189496840, 1189497440, 1189498040, 1189498640, 1189501760, 1189503560,
1190453600, 1247458520, 1247480840, 1247482880, 1247483840, 1247485040,
1247486600, 1247487320, 1247488040, 1247488760, 1247490920, 1247491280,
1247492480, 1247493680, 1247502440, 1247503160, 1247503520, 1247548040,
1247549360, 1247550680, 1247552600, 1247553920, 1247557400, 1247558000,
1247558480, 1247559440, 1247560400, 1247563760, 1247564960, 1247566640,
1247567120, 1194935549, 1194936029, 1195722629, 1195724309, 1199691029,
1199692349, 1202560229, 1208063669, 1208322989, 1188188112), class = c("POSIXct",
"POSIXt"), tzone = ""), end = structure(c(1189495280, 1189495520,
1189496360, 1189497080, 1189497560, 1189498160, 1189498760, 1189501880,
1189503920, 1190453720, 1247458640, 1247480960, 1247483480, 1247484080,
1247485640, 1247486840, 1247487560, 1247488640, 1247490440, 1247491160,
1247491520, 1247492600, 1247493920, 1247502680, 1247503400, 1247504120,
1247549240, 1247550560, 1247551280, 1247552720, 1247554400, 1247557880,
1247558240, 1247559080, 1247559560, 1247560760, 1247563880, 1247565080,
1247566760, 1247567240, 1194935669, 1194936269, 1195722749, 1195724429,
1199691269, 1199692469, 1202560349, 1208063789, 1208323109, 1188204792
), class = c("POSIXct", "POSIXt"), tzone = "")), .Names = c("start",
"end"), row.names = c(NA, 50L), class = "data.frame")
I would like to see the distribution of these episodes within a 24 hour cycle. That is either a histogram or a density plot, with the 24H day cycle in the x axis. Is this possible? I would like to ignore the dates of the episodes.
By converting to a POSIXltformat, you can easily extract the hour of the time:
par(mar=c(6,4,1,1))
Hour <- as.POSIXlt(test.DF$start)$hour
hist(Hour, breaks=seq(0, 23), main="Start time (hour)")
Edit: Adding a value for ever minute between start and end
fun <- function(start.time, end.time){
seq.POSIXt(
as.POSIXlt(
paste0("2000-01-01 ", as.POSIXlt(start.time)$hour, ":", as.POSIXlt(start.time)$min)
),
as.POSIXlt(
paste0("2000-01-01 ", as.POSIXlt(end.time)$hour, ":", as.POSIXlt(end.time)$min)
),
by="min"
)
}
HM <- vector(mode="list", dim(test.DF)[1])
for(i in seq(HM)){
HM[[i]] <- fun(test.DF$start[i], test.DF$end[i])
}
HM2 <- as.POSIXlt(unlist(HM), origin="1970-01-01")
Hour <- HM2$hour
hist(Hour, breaks=seq(0, 23))
HourMinute <- HM2$hour + HM2$min/60
hist(HourMinute, breaks=seq(0, 23, by=1/60))