Make datetime derived from one column - r

I want to create a new column, datetime, containing the recorded date-times derived from the path column. The path column holds values like data/aklbus2017/2017-03-09-05-14.csv, and I need a dttm column that turns that path into 2017-03-09 05:14:00. How can I do it?
The path column looks like
#> # A tibble: 43,793 x 5
#> path delay stop.id stop.sequence route
#> <chr> <dbl> <dbl> <dbl> <chr>
#> 1 data/aklbus2017/2017-… 150 5050 28 15401-201702141…
#> 2 data/aklbus2017/2017-… 97 3093 6 83401-201702141…
#> 3 data/aklbus2017/2017-… 50 4810 13 98105-201702141…
#> 4 data/aklbus2017/2017-… 58 6838 5 36201-201702141…
#> 5 data/aklbus2017/2017-… -186 2745 11 37301-201702141…
#> 6 data/aklbus2017/2017-… 183 2635 14 03301-201702141…
#> 7 data/aklbus2017/2017-… -144 3360 4 10001-201702141…
#> 8 data/aklbus2017/2017-… -151 2206 20 38011-201702141…
#> 9 data/aklbus2017/2017-… -46 2419 38 38011-201702141…
#> 10 data/aklbus2017/2017-… -513 6906 42 38012-201702141…
#> # … with 43,783 more rows
which i want is
#> # A tibble: 43,793 x 5
#> datetime delay stop.id stop.sequence route
#> <dttm> <dbl> <dbl> <dbl> <chr>
#> 1 2017-03-09 05:14:00 150 5050 28 15401
#> 2 2017-03-09 05:14:00 97 3093 6 83401
#> 3 2017-03-09 05:14:00 50 4810 13 98105
#> 4 2017-03-09 05:14:00 58 6838 5 36201
#> 5 2017-03-09 05:14:00 -186 2745 11 37301
#> 6 2017-03-09 05:14:00 183 2635 14 03301
#> 7 2017-03-09 05:14:00 -144 3360 4 10001
#> 8 2017-03-09 05:14:00 -151 2206 20 38011
#> 9 2017-03-09 05:14:00 -46 2419 38 38011
#> 10 2017-03-09 05:14:00 -513 6906 42 38012
#> # … with 43,783 more rows

We could use the parse_date_time() function from lubridate after
using str_sub() from the stringr package.
# Example data
# Attach the packages first: tribble(), str_extract() and parse_date_time()
# all come from tidyverse/lubridate, so the example data cannot be built
# before they are loaded.
library(tidyverse)
library(lubridate)

# Example data
df <- tribble(
  ~path,
  "data/aklbus2017/2017-03-09-05-14.csv",
  "data/aklbus2017/2017-03-09-06-14.csv",
  "data/aklbus2017/2017-03-09-07-14.csv",
  "data/aklbus2017/2017-03-09-08-14.csv",
  "data/aklbus2017/2017-03-09-09-14.csv"
)

# The code: pull the "YYYY-MM-DD-HH-MM" stamp out of the path by pattern
# rather than by fixed character positions (str_sub(path, 17, 32) breaks as
# soon as the folder prefix changes length), then parse it as a date-time.
df %>%
  mutate(
    datetime = parse_date_time(
      str_extract(path, "\\d{4}-\\d{2}-\\d{2}-\\d{2}-\\d{2}"),
      "ymd_hm"
    )
  )
Output:
path datetime
<chr> <dttm>
1 data/aklbus2017/2017-03-09-05-14.csv 2017-03-09 05:14:00
2 data/aklbus2017/2017-03-09-06-14.csv 2017-03-09 06:14:00
3 data/aklbus2017/2017-03-09-07-14.csv 2017-03-09 07:14:00
4 data/aklbus2017/2017-03-09-08-14.csv 2017-03-09 08:14:00
5 data/aklbus2017/2017-03-09-09-14.csv 2017-03-09 09:14:00

Here's a way using basename() and tools::file_path_sans_ext()
library(tools)

# Example paths; the file name itself carries the timestamp.
df <- data.frame(
  path = c(
    "data/abc/2017-03-09-05-14.csv",
    "data/xyz/2017-03-10-05-14.csv"
  )
)

# Strip the directory and the ".csv" extension, leaving "YYYY-MM-DD-HH-MM",
# then parse that stamp as a UTC date-time.
stamp <- file_path_sans_ext(basename(df$path))
df$datetime <- as.POSIXct(stamp, format = "%Y-%m-%d-%H-%M", tz = "UTC")
df
path datetime
1 data/abc/2017-03-09-05-14.csv 2017-03-09 05:14:00
2 data/xyz/2017-03-10-05-14.csv 2017-03-10 05:14:00

With a view to these problems, read up on how to work with strings. There are many ways to break up strings in parts, replace parts and/or reassemble them differently.
Using the {tidyverse}, you can do the following.
library(tidyverse)
library(lubridate) # for time parsing - ymd_hm()

#-------- the data
df <- data.frame(path = "data/aklbus2017/2017-03-09-05-14.csv") %>%
  #--------- tidyr to break up path into new columns
  separate(
    col  = path,
    into = c("folder", "sub-folder", "file"),
    sep  = "/"
  ) %>%
  #----------- string operation
  # Anchor and escape the pattern: a bare ".csv" is a regex in which "."
  # matches ANY character, so it would also strip e.g. "xcsv" mid-string.
  mutate(
    dttm  = str_remove(string = file, pattern = "\\.csv$"),
    dttm2 = ymd_hm(dttm)
  )
This gives you:
str(df)
'data.frame': 1 obs. of 5 variables:
$ folder : chr "data"
$ sub-folder: chr "aklbus2017"
$ file : chr "2017-03-09-05-14.csv"
$ dttm : chr "2017-03-09-05-14"
$ dttm2 : POSIXct, format: "2017-03-09 05:14:00"
There is no need to keep all columns. You can combine the string operation into one mutate() call. I just put it here to give you an idea on how to "step" through a series of steps to handle your string problem.

library(stringr)
library(rebus)
#>
#> Attaching package: 'rebus'
#> The following object is masked from 'package:stringr':
#>
#> regex
library(tidyverse)
library(chron)
# The raw paths; the timestamp lives in the file-name part.
datetime <-
c('data/aklbus2017/2017-03-09-05-14.csv',
'data/aklbus2017/2017-03-09-05-15.csv',
'data/aklbus2017/2017-03-09-05-16.csv')
# The first str_match() keeps whatever sits between the ".../2017/" folder
# and the ".csv" suffix; the second splits that stamp into a date part
# ("YYYY-MM-DD") and a time part ("HH-MM"). rebus builds the regexes:
# %R% concatenates pieces, capture() adds a capture group, DGT is a digit.
date_separated <-
str_match(datetime, '2017/' %R% capture('.*') %R% '\\.csv$')[, 2] %>%
str_match(capture(one_or_more(DGT) %R% '-' %R% one_or_more(DGT) %R% '-' %R% one_or_more(DGT)) %R% '-' %R% capture('.*$')) %>%
`[`(, 2:3)
date_separated
#> [,1] [,2]
#> [1,] "2017-03-09" "05-14"
#> [2,] "2017-03-09" "05-15"
#> [3,] "2017-03-09" "05-16"
# Turn "HH-MM" into "HH:MM:00" so chron() can parse it with the h:m:s format.
date_separated[, 2] <- date_separated[, 2] %>% str_replace('-', ':') %>% str_c(':00')
# NOTE(review): the printed result shows 02:14 for an input of "05-14" —
# as.POSIXct() on the chron object appears to apply a timezone offset;
# confirm the intended tz before relying on these values.
chron(dates=date_separated[,1],times=date_separated[,2],format=c('y-m-d','h:m:s')) %>% as.POSIXct() %>% tibble(datetime = .)
#> # A tibble: 3 x 1
#> datetime
#> <dttm>
#> 1 2017-03-09 02:14:00
#> 2 2017-03-09 02:15:00
#> 3 2017-03-09 02:16:00
#bind_cols(datetime, data)
Created on 2021-06-06 by the reprex package (v2.0.0)

Related

R: Accessing first three elements of splitted dataframe

For example,
# Three cut-off dates and the pool of candidate days to sample from.
dateIntervals <- as.Date(c("2020-08-10", "2020-11-11", "2021-07-05"))
possibleDates <- seq(as.Date("2020-01-02"), dateIntervals[3], by = "day")

# Build one data frame of 100 random (Date, Value) rows.
genDF <- function() {
  data.frame(Date = sample(possibleDates, 100), Value = runif(100))
}

# Two independent replicates, kept as a plain list of data frames.
listdf <- replicate(2, genDF(), simplify = FALSE)
Yes, listdf has two dataframe elements(each 100 random dates in possibleDates and values)
and listdf[[1]] is like this
A data.frame: 100 × 2
Date Value
<date> <dbl>
2020-07-24 0.63482411
2020-02-26 0.25989280
2020-10-26 0.21721077
2020-10-11 0.34774192
2020-08-18 0.67758312
2020-02-03 0.22929624
2020-06-10 0.30279353
2020-05-29 0.95549488
...
# Split each data frame into date-interval groups. split() itself has no
# argument for keeping only the first n groups — subset the result
# afterwards, e.g. head(split(...), 2), to keep just the first two.
lapply(listdf, function(x) split(x, findInterval(x$Date, dateIntervals)))
This makes listdf a 2×3 list, split by date.
1.$`0`
A data.frame: 43 × 2
Date Value
<date> <dbl>
1 2020-07-24 0.63482411
2 2020-02-26 0.25989280
6 2020-02-03 0.22929624
7 2020-06-10 0.30279353
...
$`1`
A data.frame: 15 × 2
Date Value
<date> <dbl>
3 2020-10-26 0.21721077
4 2020-10-11 0.34774192
5 2020-08-18 0.67758312
31 2020-11-09 0.59149301
...
$`2`
A data.frame: 42 × 2
Date Value
<date> <dbl>
9 2021-06-28 0.10055644
10 2021-05-17 0.63942936
12 2021-04-22 0.63589801
13 2021-02-01 0.70106156
...
2.$`0`
A data.frame: 43 × 2
Date Value
<date> <dbl>
2 2020-07-16 0.81376364
4 2020-07-03 0.05152627
7 2020-01-21 0.98677433
8 2020-03-23 0.13513921
...
$`1`
A data.frame: 18 × 2
Date Value
<date> <dbl>
5 2020-11-01 0.02740125
12 2020-09-04 0.82042568
15 2020-08-12 0.54190868
16 2020-09-19 0.05933666
18 2020-10-05 0.04983061
...
$`2`
A data.frame: 38 × 2
Date Value
<date> <dbl>
1 2021-04-13 0.46199245
3 2021-06-12 0.71461155
6 2021-01-24 0.56527997
9 2021-04-17 0.72634151
13 2021-04-20 0.55489499
...
I want only first two of the splitted ones.($'0' and $'1' for 1. and 2.)
is there any parameter in the split function which does things like this?
(getting only first or last n elements)
I want something like this...
lapply(listdf, function(x) split(x, findInterval(x$Date, dateIntervals), some parameter=2))
yes this "2". Getting only the first two ones. is there a function parameter in split which can do this?

Is there a way to group data according to time in R?

I'm working with trip ticket data and it includes a column with dates and times. I'm want to group trips according to Morning(05:00 - 10:59), Lunch(11:00-12:59), Afternoon(13:00-17:59), Evening(18:00-23:59), and Dawn/Graveyard(00:00-04:59) and then count the number of trips (by means of counting the unique values in the trip_id column) for each of those categories.
Only I don't know how to group/summarize according to time values. Is this possible in R?
trip_id start_time end_time day_of_week
1 CFA86D4455AA1030 2021-03-16 08:32:30 2021-03-16 08:36:34 Tuesday
2 30D9DC61227D1AF3 2021-03-28 01:26:28 2021-03-28 01:36:55 Sunday
3 846D87A15682A284 2021-03-11 21:17:29 2021-03-11 21:33:53 Thursday
4 994D05AA75A168F2 2021-03-11 13:26:42 2021-03-11 13:55:41 Thursday
5 DF7464FBE92D8308 2021-03-21 09:09:37 2021-03-21 09:27:33 Sunday
Here's a solution with hour() and case_when().
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
# Demo data: one trip every 15 minutes starting 2022-01-01 01:00.
trip <- tibble(start_time = mdy_hm("1/1/2022 1:00") + minutes(seq(0, 700, 15)))
# hour() pulls the hour of day; case_when() is evaluated top-down and the
# first matching condition wins, so the ranges must not overlap.
# NOTE(review): the question labels 11:00-12:59 as "Lunch"; the literal
# "afternoon" here should be renamed to match those categories.
trip <- trip %>%
mutate(
hr = hour(start_time),
time_of_day = case_when(
hr >= 5 & hr < 11 ~ "morning",
hr >= 11 & hr < 13 ~ "afternoon",
TRUE ~ "fill in the rest yourself :)"
)
)
print(trip)
#> # A tibble: 47 x 3
#> start_time hr time_of_day
#> <dttm> <int> <chr>
#> 1 2022-01-01 01:00:00 1 fill in the rest yourself :)
#> 2 2022-01-01 01:15:00 1 fill in the rest yourself :)
#> 3 2022-01-01 01:30:00 1 fill in the rest yourself :)
#> 4 2022-01-01 01:45:00 1 fill in the rest yourself :)
#> 5 2022-01-01 02:00:00 2 fill in the rest yourself :)
#> 6 2022-01-01 02:15:00 2 fill in the rest yourself :)
#> 7 2022-01-01 02:30:00 2 fill in the rest yourself :)
#> 8 2022-01-01 02:45:00 2 fill in the rest yourself :)
#> 9 2022-01-01 03:00:00 3 fill in the rest yourself :)
#> 10 2022-01-01 03:15:00 3 fill in the rest yourself :)
#> # ... with 37 more rows
# Count trips per time-of-day bucket.
trips <- trip %>%
count(time_of_day)
print(trips)
#> # A tibble: 3 x 2
#> time_of_day n
#> <chr> <int>
#> 1 afternoon 7
#> 2 fill in the rest yourself :) 16
#> 3 morning 24
Created on 2022-03-21 by the reprex package (v2.0.1)

R convert "Y-m-d" or "m/d/Y" to the same format

I have a huge (~10.000.000 rows) dataframe with a column that consists dates, i.e:
# Example data: the same column mixes ISO ("Y-m-d H:M:S") and US
# ("m/d/Y H:M:S") date-time strings.
iso_rows <- c(
  "2014-08-20 11:59:38",
  "2014-08-21 16:17:44",
  "2014-08-22 19:02:10"
)
us_rows <- c(
  "9/1/2014 08:05:13",
  "9/2/2014 15:13:28",
  "9/3/2014 00:22:01"
)
df <- data.frame(StartDate = as.character(c(iso_rows, us_rows)))
The problem is that date formats are mixed - I would like to standardise them so as to get:
StartDate
1 2014-08-20
2 2014-08-21
3 2014-08-22
4 2014-09-01
5 2014-09-02
6 2014-09-03
1. as.Date() approach
# A single hard-coded format cannot cover both styles: strings that do not
# match the supplied format come back as NA rather than raising an error
# (first call), while matching strings parse fine (second call).
as.Date("2014-08-31 23:59:38", "%m/%d/%Y")
as.Date("9/1/2014 00:00:28", "%m/%d/%Y")
gives
[1] NA
[1] "2014-09-01"
2. lubridate approach
# lubridate's dmy()/mdy() expect a date-only layout; with the trailing
# time component none of these orders matches, so every call returns NA
# with an "All formats failed to parse" warning (shown below).
dmy("9/1/2014 00:00:28")
mdy("9/1/2014 00:00:28")
dmy("2014-08-31 23:59:38")
mdy("2014-08-31 23:59:38")
in each case returns
[1] NA
Warning message:
All formats failed to parse. No formats found.
Is there any neat solution to that?
Easier maybe to use parse_date
library(parsedate)
# parse_date() guesses the format for each element individually, so mixed
# formats in one column are fine; as.Date() then drops the time of day.
df$StartDate <- as.Date(parse_date(df$StartDate))
-output
> df$StartDate
[1] "2014-08-20" "2014-08-21" "2014-08-22" "2014-09-01" "2014-09-02" "2014-09-03"
I have just found out that anytime::anydate extracts the dates directly and straightforwardly:
library(anytime)
library(tidyverse)
# anydate() recognises many common date formats automatically and returns
# a Date, discarding the time component (output shown below).
df %>%
mutate(Date = anydate(StartDate))
#> StartDate Date
#> 1 2014-08-20 11:59:38 2014-08-20
#> 2 2014-08-21 16:17:44 2014-08-21
#> 3 2014-08-22 19:02:10 2014-08-22
#> 4 9/1/2014 08:05:13 2014-09-01
#> 5 9/2/2014 15:13:28 2014-09-02
#> 6 9/3/2014 00:22:01 2014-09-03
Another solution, based on lubridate:
library(tidyverse)
library(lubridate)

# Strings containing "/" are in m/d/Y form, all others in ISO Y-m-d form;
# parse each row with the matching lubridate parser and keep only the date.
# quiet = TRUE suppresses the "failed to parse" warnings produced by the
# rows handled by the other branch. TRUE is spelled out in full because
# T is an ordinary variable that can be reassigned.
df %>%
  mutate(
    Date = if_else(
      !str_detect(StartDate, "/"),
      date(ymd_hms(StartDate, quiet = TRUE)),
      date(mdy_hms(StartDate, quiet = TRUE))
    )
  )
#> StartDate Date
#> 1 2014-08-20 11:59:38 2014-08-20
#> 2 2014-08-21 16:17:44 2014-08-21
#> 3 2014-08-22 19:02:10 2014-08-22
#> 4 9/1/2014 08:05:13 2014-09-01
#> 5 9/2/2014 15:13:28 2014-09-02
#> 6 9/3/2014 00:22:01 2014-09-03

Cannot filter column when name of that column comes from variable

As default I set the argument cut.points as NA and if it's on default then it shouldn't do anything with the data.
But if user decides to put for example cut.points = c("2012-01-01", "2013-01-01") then the data should be filtered by the column that has dates in it. And it should return only dates between 2012 to 2013.
The problem is that I'm reading data from the function so in theory i won't know what is the name of this date column that uses provides. So i find the column with dates and store it's name in the variable.
But the condition which i wrote that should filter based od this variable doesn't work:
# Intended behaviour: filter input.data to rows whose date column lies
# between the two cut points. As written it fails — see inline notes.
modifier <- function(input.data, cut.points = c(NA, NA)) {
# Flags, per column, whether any value parses as a "%Y-%m-%d" date.
date_check <- sapply(input.data, function(x) !all(is.na(as.Date(as.character(x),format="%Y-%m-%d"))))
# missing() is TRUE only when the caller supplied no argument at all; an
# explicit cut.points = c(NA, NA) still takes the else branch.
if (missing(cut.points)) {
input.data
} else {
cols <- colnames(select_if(input.data, date_check == TRUE))
cut.points <- as.Date(cut.points)
# BUG: `cols` holds the column *name* (a character string such as
# "Order.Date"), not the column's values. Comparing that string against a
# Date forces a coercion via as.Date(), which is the likely source of the
# reported "character string is not in a standard unambiguous format"
# error. Reference the column instead, e.g. .data[[cols]] or !!sym(cols).
input.data <- filter(input.data, cols > cut.points[1] & cols < cut.points[2])
}
}
for ex. when i try to run this:
modifier(ex_data, cut.points = c("2012-01-01", "2013-01-01"))
On sample like this:
ex_data
Row.ID Order.ID Order.Date
1 32298 CA-2012-124891 2012-07-31
2 26341 IN-2013-77878 2013-02-05
3 25330 IN-2013-71249 2013-10-17
4 13524 ES-2013-1579342 2013-01-28
5 47221 SG-2013-4320 2013-11-05
6 22732 IN-2013-42360 2013-06-28
7 30570 IN-2011-81826 2011-11-07
8 31192 IN-2012-86369 2012-04-14
9 40155 CA-2014-135909 2014-10-14
10 40936 CA-2012-116638 2012-01-28
11 34577 CA-2011-102988 2011-04-05
12 28879 ID-2012-28402 2012-04-19
13 45794 SA-2011-1830 2011-12-27
14 4132 MX-2012-130015 2012-11-13
15 27704 IN-2013-73951 2013-06-06
16 13779 ES-2014-5099955 2014-07-31
17 36178 CA-2014-143567 2014-11-03
18 12069 ES-2014-1651774 2014-09-08
19 22096 IN-2014-11763 2014-01-31
20 49463 TZ-2014-8190 2014-12-05
the error is:
character string is not in a standard unambiguous format
I've added lubridate as a dependency so I could get access to %within% and is.Date(). I've also changed the check condition, because I don't think your original one would work with c(NA, NA).
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
ex_data <- read_table(" Row.ID Order.ID Order.Date
1 32298 CA-2012-124891 2012-07-31
2 26341 IN-2013-77878 2013-02-05
3 25330 IN-2013-71249 2013-10-17
4 13524 ES-2013-1579342 2013-01-28
5 47221 SG-2013-4320 2013-11-05
6 22732 IN-2013-42360 2013-06-28
7 30570 IN-2011-81826 2011-11-07
8 31192 IN-2012-86369 2012-04-14
9 40155 CA-2014-135909 2014-10-14
10 40936 CA-2012-116638 2012-01-28
11 34577 CA-2011-102988 2011-04-05
12 28879 ID-2012-28402 2012-04-19
13 45794 SA-2011-1830 2011-12-27
14 4132 MX-2012-130015 2012-11-13
15 27704 IN-2013-73951 2013-06-06
16 13779 ES-2014-5099955 2014-07-31
17 36178 CA-2014-143567 2014-11-03
18 12069 ES-2014-1651774 2014-09-08
19 22096 IN-2014-11763 2014-01-31
20 49463 TZ-2014-8190 2014-12-05")
#> Warning: Missing column names filled in: 'X1' [1]
#' Filter a data frame on its (unknown-in-advance) date column.
#'
#' @param input.data A data frame containing exactly one Date column.
#' @param cut.points NULL (no filtering) or a length-2 vector of dates /
#'   date strings giving the inclusive interval to keep.
#' @return The filtered data frame, or `input.data` unchanged when no
#'   valid pair of cut points is supplied.
modifier <- function(input.data, cut.points = NULL) {
  if (length(cut.points) == 2) {
    # The date column's name is not known in advance: take the first
    # column whose class is Date. (Taking [1] also keeps this working if
    # more than one column happens to be a Date.)
    date_col <- colnames(input.data)[sapply(input.data, is.Date)][1]
    # Filter via the .data pronoun instead of renaming to a temporary
    # "Date" column — the rename round-trip would collide with any
    # pre-existing column called "Date" and needs two extra steps.
    input.data %>%
      filter(.data[[date_col]] %within% interval(cut.points[1], cut.points[2]))
  } else {
    # No (usable) cut points: return the data untouched.
    input.data
  }
}
modifier(ex_data, cut.points = c("2012-01-01", "2013-01-01"))
#> # A tibble: 5 x 4
#> X1 Row.ID Order.ID Order.Date
#> <dbl> <dbl> <chr> <date>
#> 1 1 32298 CA-2012-124891 2012-07-31
#> 2 8 31192 IN-2012-86369 2012-04-14
#> 3 10 40936 CA-2012-116638 2012-01-28
#> 4 12 28879 ID-2012-28402 2012-04-19
#> 5 14 4132 MX-2012-130015 2012-11-13

Set up data in order to use Prophet() in R

I want to use the Prophet() function in R, but I cannot transform my column "YearWeek" to a as.Date() column.
I have a column "YearWeek" that stores values from 201401 up to 201937 i.e. starting in 2014 week 1 up to 2019 week 37.
I don't know how to declare this column as a date in the form yyyy-ww needed to use the Prophet() function.
Does anyone know how to do this?
Thank you in advance.
One solution could be to append a 01 to the end of your yyyy-ww formatted dates.
Data:
library(tidyverse)
# cross2() builds every (year, week) pair; str_pad() zero-pads the week
# number to two digits; the pairs are pasted into "yyyyww" strings and
# sorted ascending.
# NOTE(review): purrr::cross2() is superseded in newer purrr releases —
# confirm it is still available in your version.
df <- cross2(2014:2019, str_pad(1:52, width = 2, pad = 0)) %>%
map_df(set_names, c("year", "week")) %>%
transmute(date = paste(year, week, sep = "")) %>%
arrange(date)
head(df)
#> # A tibble: 6 x 1
#> date
#> <chr>
#> 1 201401
#> 2 201402
#> 3 201403
#> 4 201404
#> 5 201405
#> 6 201406
Now let's append the 01 and convert to date:
# Append "01" (strftime weekday code, Sunday = 0... Saturday = 6) to each
# "yyyyww" string, then parse with %Y (year), %U (week of the year,
# Sunday as the first day) and %w (weekday). Each week thus maps to the
# Sunday that starts it, as the output below shows.
# NOTE(review): %U/%W handling is platform- and locale-dependent,
# especially around week 00 — verify the mapping on your target platform.
df %>%
mutate(date = paste(date, "01", sep = ""),
new_date = as.Date(date, "%Y%U%w"))
#> # A tibble: 312 x 2
#> date new_date
#> <chr> <date>
#> 1 20140101 2014-01-05
#> 2 20140201 2014-01-12
#> 3 20140301 2014-01-19
#> 4 20140401 2014-01-26
#> 5 20140501 2014-02-02
#> 6 20140601 2014-02-09
#> 7 20140701 2014-02-16
#> 8 20140801 2014-02-23
#> 9 20140901 2014-03-02
#> 10 20141001 2014-03-09
#> # ... with 302 more rows
Created on 2019-10-10 by the reprex package (v0.3.0)
More info about a numeric week of the year can be found here.

Resources