I am new here, please be gentle ;)
I have two time columns in a dataframe in R that use the HMM/HHMM format as a numeric. For example, 03:13 would be 313 and 14:14 would be 1414. Examples are sched_arr_time and sched_dep_time in the flights dataset of the nycflights13 package.
I need to calculate the time difference in minutes. My SQL knowledge tells me I would substring this with a CASE WHEN and then glue it back together as a time format somehow, but I was hoping there is a more elegant way to deal with this in R.
Many thanks for your help!
This would explain the data:
library(nycflights13)
flights %>% select(sched_dep_time, sched_arr_time)
We can convert to a time class with as.ITime after changing the format to HH:MM with str_pad and str_replace, and then take the difference using difftime:
library(dplyr)
library(stringr)
library(data.table)
flights %>%
  head %>%
  select(sched_dep_time, sched_arr_time) %>%
  mutate_all(~ str_pad(., width = 4, pad = '0') %>%
               str_replace('^(..)', '\\1:') %>%
               as.ITime) %>%
  mutate(diff = difftime(sched_arr_time, sched_dep_time, units = 'mins'))
# A tibble: 6 x 3
# sched_dep_time sched_arr_time diff
# <ITime> <ITime> <drtn>
#1 05:15:00 08:19:00 184 mins
#2 05:29:00 08:30:00 181 mins
#3 05:40:00 08:50:00 190 mins
#4 05:45:00 10:22:00 277 mins
#5 06:00:00 08:37:00 157 mins
#6 05:58:00 07:28:00 90 mins
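One caveat worth noting (my addition, not part of the original answer): for overnight flights whose scheduled arrival falls past midnight, the difference comes out negative. A common fix is to wrap negative values around by a day, e.g. on top of the converted columns above:
flights %>%
  head %>%
  select(sched_dep_time, sched_arr_time) %>%
  mutate_all(~ str_pad(., width = 4, pad = '0') %>%
               str_replace('^(..)', '\\1:') %>%
               as.ITime) %>%
  mutate(diff = as.numeric(difftime(sched_arr_time, sched_dep_time, units = 'mins')),
         diff = if_else(diff < 0, diff + 24 * 60, diff))  # wrap past-midnight arrivals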
If we want to attach a 'Date' as well, we can paste today's date onto the time and parse it with ymd_hms:
library(lubridate)
flights %>%
  head %>%
  select(sched_dep_time, sched_arr_time) %>%
  mutate_all(~ str_pad(., width = 4, pad = '0') %>%
               str_replace("^(..)(..)", "\\1:\\2:00") %>%
               str_c(Sys.Date(), ., sep = ' ') %>%
               ymd_hms) %>%
  mutate(diff = difftime(sched_arr_time, sched_dep_time, units = 'mins'))
Here is another option using strptime:
as_time <- function(x)
  as.POSIXct(strptime(if_else(nchar(x) == 3, paste0("0", x), as.character(x)), "%H%M"))

flights %>%
  select(sched_dep_time, sched_arr_time) %>%
  mutate(diff_in_mins = difftime(as_time(sched_arr_time), as_time(sched_dep_time), units = "mins"))
# A tibble: 336,776 x 3
# sched_dep_time sched_arr_time diff_in_mins
# <int> <int> <drtn>
# 1 515 819 184 mins
# 2 529 830 181 mins
# 3 540 850 190 mins
# 4 545 1022 277 mins
# 5 600 837 157 mins
# 6 558 728 90 mins
# 7 600 854 174 mins
# 8 600 723 83 mins
# 9 600 846 166 mins
#10 600 745 105 mins
# … with 336,766 more rows
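As an aside, and not from either answer above: since the values are plain numerics, one could skip time classes entirely and convert HHMM to minutes past midnight with integer arithmetic (a minimal sketch; it assumes no overnight legs, which would need a +1440 correction):
library(dplyr)
library(nycflights13)

# HHMM stored as a numeric -> minutes past midnight: hours * 60 + minutes
to_minutes <- function(x) (x %/% 100) * 60 + x %% 100

flights %>%
  select(sched_dep_time, sched_arr_time) %>%
  mutate(diff_in_mins = to_minutes(sched_arr_time) - to_minutes(sched_dep_time))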
I would like to calculate the number of days between rows of a data.frame grouped by a couple of fields. So, if I have the following data.frame:
da <- read.table(text="i j data date
2 682 147 2008-05-26
2 682 317 2010-11-13
2 682 217 2019-08-05
3 682 147 2008-05-26
3 682 317 2010-11-13
10 682 220 2019-08-08", header=TRUE)
require(dplyr)
da %>% count(i,j)
I would like to calculate two periods for the first group, one period for the second, and none for the last.
I can calculate the interval between the first and last dates:
require(lubridate)
da %>%
  group_by(i, j) %>%
  summarize(fini = ymd(min(date)), fend = ymd(max(date)), deltaD = as.numeric(fend - fini))
`summarise()` regrouping output by 'i' (override with `.groups` argument)
# A tibble: 3 x 5
# Groups: i [3]
i j fini fend deltaD
<int> <int> <date> <date> <dbl>
1 2 682 2008-05-26 2019-08-05 4088
2 3 682 2008-05-26 2010-11-13 901
3 10 682 2019-08-08 2019-08-08 0
That is fine if I have two rows in a group, but I can't figure out how to do it if I have three or more.
Do you need something like this?
library(dplyr)
da %>%
  mutate(date = as.Date(date)) %>%
  arrange(i, j, date) %>%
  group_by(i, j) %>%
  transmute(fini = date, fend = lead(date), deltaD = as.numeric(fend - fini)) %>%
  na.omit() %>%
  ungroup
# i j fini fend deltaD
# <int> <int> <date> <date> <dbl>
#1 2 682 2008-05-26 2010-11-13 901
#2 2 682 2010-11-13 2019-08-05 3187
#3 3 682 2008-05-26 2010-11-13 901
The following seems to work. I used lead() (after arrange() to order by date), plus some tweaks to avoid dropping groups with only one date:
da %>%
  group_by(i, j) %>%
  dplyr::arrange(date) %>%
  dplyr::mutate(lead_date = dplyr::lead(date),
                one_in_group = n() == 1) %>%          # to retain the row with i = 10
  dplyr::filter(!is.na(lead_date) | one_in_group) %>%
  dplyr::mutate(lead_date = ifelse(one_in_group, date, lead_date),  # handles solo dates
                date = lubridate::ymd(date),
                lead_date = lubridate::ymd(lead_date),
                deltaD = as.numeric(lead_date - date)) %>%
  dplyr::select(-one_in_group) %>%
  dplyr::arrange(i, j, date)
# A tibble: 4 x 6
# Groups: i, j [3]
i j data date lead_date deltaD
<int> <int> <int> <date> <date> <dbl>
1 2 682 147 2008-05-26 2010-11-13 901
2 2 682 317 2010-11-13 2019-08-05 3187
3 3 682 147 2008-05-26 2010-11-13 901
4 10 682 220 2019-08-08 2019-08-08 0
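For completeness, a terser variant of the same idea using diff() (my own sketch; summarise() returning more than one row per group requires dplyr >= 1.0):
library(dplyr)

da %>%
  mutate(date = as.Date(date)) %>%
  arrange(i, j, date) %>%
  group_by(i, j) %>%
  summarise(deltaD = if (n() == 1) 0 else as.numeric(diff(date)),  # all consecutive gaps; solo groups get 0
            .groups = "drop")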
How can I subset the data, e.g. everything from the previous year, and store it as a new object? This was my attempt:
library(quantmod)
library(dplyr)

mtdl <- na.omit(getSymbols("MTDL.JK", auto.assign = FALSE, src = "yahoo", periodicity = "weekly"))
week.year.mtdl <- mtdl %>%
  filter(DATE >= as.Date("2018-01-01") & DATE <= as.Date("2018-12-31"))
Here are a few ways to go about this if you want to use dplyr.
1. Transform the xts object into a data.frame:
df_mtdl <- data.frame(date = index(mtdl), coredata(mtdl))
week.year.mtdl <- df_mtdl %>%
  filter(date >= as.Date("2018-01-01") & date <= as.Date("2018-12-31"))
head(week.year.mtdl)
date MTDL.JK.Open MTDL.JK.High MTDL.JK.Low MTDL.JK.Close MTDL.JK.Volume MTDL.JK.Adjusted
1 2018-01-01 650 650 620 630 78200 609.6684
2 2018-01-08 630 650 610 610 291800 590.3138
3 2018-01-15 610 750 600 700 9390700 677.4093
4 2018-01-22 700 730 640 700 6816200 677.4093
5 2018-01-29 700 745 685 685 119900 662.8934
6 2018-02-05 695 715 630 635 1533000 614.5070
2. Use tidyquant. This returns a tibble instead of an xts object; tidyquant is built on top of quantmod and a number of other packages.
library(tidyquant)
tq_mtdl <- tq_get("MTDL.JK", complete_cases = TRUE, periodicity = "weekly")
week.year.mtdl <- tq_mtdl %>%
  filter(date >= as.Date("2018-01-01") & date <= as.Date("2018-12-31"))
head(week.year.mtdl)
# A tibble: 6 x 7
date open high low close volume adjusted
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2018-01-04 645 645 620 625 137000 605.
2 2018-01-11 620 660 600 645 1460000 624.
3 2018-01-18 645 750 635 660 13683700 639.
4 2018-01-25 680 745 665 685 1359700 663.
5 2018-02-01 700 715 675 700 922200 677.
6 2018-02-08 695 695 630 690 673700 668.
Or use packages timetk (used as part of tidyquant) or tsbox to transform the data from xts to data.frame or tibble.
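For instance, a minimal sketch with timetk (assuming tk_tbl() and its rename_index argument; check the package docs):
library(timetk)
library(dplyr)

# Convert the xts object to a tibble, keeping the index as a 'date' column
tbl_mtdl <- tk_tbl(mtdl, rename_index = "date")
week.year.mtdl <- tbl_mtdl %>%
  filter(date >= as.Date("2018-01-01") & date <= as.Date("2018-12-31"))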
This will return all of the 2018 observations of the xts object:
mtdl["2018"]
All of these also work:
subset(mtdl, time(.) >= "2018-01-01" & time(.) <= "2018-12-31")
subset(mtdl, start = "2018-01-01", end = "2018-12-31")
window(mtdl, start = "2018-01-01", end = "2018-12-31")
dates <- seq(as.Date("2008-01-01"), as.Date("2008-12-31"), "day")
window(mtdl, dates)
mtdl[dates] # dates is from above
mtdl[ format(time(mtdl), "%Y") == 2018 ]
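The ISO-8601 range form also works, and is handy when the window doesn't line up with a calendar year:
mtdl["2018-01-01/2018-12-31"]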
I am using tidyr::nest to deliver a grouped table to the boot and boot.ci functions from the boot package, in order to calculate the mean and a confidence interval for a non-parametric statistic. This works fine for non-overlapping groups, like below:
library(dplyr)
library(tidyr)
library(purrr)
library(lubridate)
library(broom)
library(boot)
#toy example
set.seed(1)
Sys.setenv(TZ="America/Chicago")
df <- data.frame(date = mdy("01-01-2018") + ddays(sample(0:364, 100, replace = TRUE)),
                 score = sample(0:10, 100, replace = TRUE, prob = c(0.15, 0.15, rep(0.15/7, 7), 0.25, 0.3)))
# the statistic of interest
net_promoter_score <- function(data,col_name='score') {
return(
(sum(data[[col_name]]>=9,na.rm=TRUE)-
sum(data[[col_name]]<=6,na.rm=TRUE))/sum(!is.na(data[[col_name]]))*100
)
}
# boot needs to resample the statistic by index
nps_boot <- function(d,i) net_promoter_score(d[i,])
#do NPS confidence intervals by month - this works fine!
by_month = df %>%
  mutate(month = lubridate::month(date, label = TRUE, abbr = TRUE)) %>%
  nest(-month) %>%
  mutate(boots = map(data, ~ boot::boot(.x, nps_boot, R = 4999)),
         CI = map(boots, ~ boot::boot.ci(.x, conf = 0.9)$bca),
         tidied_NPS = map(boots, broom::tidy),
         tidied_CI = map(CI, broom::tidy)) %>%
  unnest(tidied_NPS, tidied_CI, .drop = TRUE) %>%
  select(month, mean = statistic, CI10 = V4, CI90 = V5)
by_month %>% head
# A tibble: 6 x 4
month mean CI10 CI90
<ord> <dbl> <dbl> <dbl>
1 Apr 0 -100 33.3
2 May 6.67 -46.7 33.3
3 Jul 60 -100 60
4 Nov -20 -80 20
5 Mar -11.1 -66.7 33.3
6 Dec 0 -100 50
But I would like to do this for a sliding window, kind of like a moving average, except sliding a different statistic over the data. I can do this with lapply, but I would like to use the tidyverse.
# do a 50-sample sliding window; I would like to solve this with the tidyverse
window_size = 50
results = lapply(1:(nrow(df) - window_size), function(x) {
  boot_df = df %>% arrange(date) %>% slice(x:(x + window_size - 1))
  boot = boot::boot(boot_df, nps_boot, R = 999)
  CI = boot.ci(boot, conf = 0.9)$bca[4:5]
  return(c(x, mean(boot$t), CI))
})
by_slide = as.data.frame(do.call(rbind, results)) %>%
  select(date = V1, mean = V2, CI10 = V3, CI90 = V4) %>%
  mutate(date = mdy("01-01-2018") + ddays((window_size %/% 2) + date))
by_slide %>% head
date mean CI10 CI90
1 2018-01-27 15.40541 -8.00000 38
2 2018-01-28 15.94194 -8.00000 36
3 2018-01-29 15.83383 -8.00000 36
4 2018-01-30 15.24525 -8.00000 38
5 2018-01-31 15.79780 -10.00000 36
6 2018-02-01 15.82583 -10.92218 36
You can use purrr::map_dfr():
results <- purrr::map_dfr(1:(nrow(df) - window_size), function(x) {
  boot_df = df %>% arrange(date) %>% slice(x:(x + window_size - 1))
  boot = boot::boot(boot_df, nps_boot, R = 999)
  CI = boot.ci(boot, conf = 0.9)$bca[4:5]
  list(date = boot_df$date[1],
       mean = mean(boot$t),
       ci_lo = CI[1],
       ci_hi = CI[2])
})
results
# A tibble: 50 x 4
date mean ci_lo ci_hi
<date> <dbl> <dbl> <dbl>
1 2018-01-05 15.6 -8 38
2 2018-01-09 16.3 -8 36
3 2018-01-22 16.2 -10 36
4 2018-01-23 15.6 -10 36
5 2018-01-26 15.2 -10 36
6 2018-01-31 16.5 -10 36
7 2018-02-06 19.7 -4.75 40
8 2018-02-09 19.5 -8 40
9 2018-02-14 16.3 -10 36
10 2018-02-15 16.1 -10 36
# … with 40 more rows
Then you can use results directly in computing by_slide:
by_slide = results %>%
  mutate(date = mdy("01-01-2018") + ddays(window_size %/% 2))
Although I admit I don't understand how adding date inside the ddays() duration is supposed to work; this doesn't reproduce your provided output. But I'm assuming that's a syntax issue, separate from your question about how to replace lapply.
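If the intent was to reproduce your original index-based date labels, one sketch (my assumption about the intent, reusing your offset formula) is to carry the window index along:
by_slide <- results %>%
  mutate(idx = row_number(),   # window start index, as in the lapply version
         date = mdy("01-01-2018") + ddays((window_size %/% 2) + idx)) %>%
  select(-idx)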
Suppose I have a daily rain data.frame like this:
df.meteoro = data.frame(Dates = seq(as.Date("2017/1/19"), as.Date("2018/1/18"), "days"),
rain = rnorm(length(seq(as.Date("2017/1/19"), as.Date("2018/1/18"), "days"))))
I'm trying to sum the accumulated rain over 14-day intervals with this code:
library(tidyverse)
library(lubridate)
df.rain <- df.meteoro %>%
  mutate(TwoWeeks = round_date(Dates, "14 days")) %>%
  group_by(TwoWeeks) %>%
  summarise(sum_rain = sum(rain))
The problem is that it isn't starting on 2017-01-19 but on 2017-01-15, and I was expecting my output dates to be:
"2017-02-02" "2017-02-16" "2017-03-02" "2017-03-16" "2017-03-30" "2017-04-13"
"2017-04-27" "2017-05-11" "2017-05-25" "2017-06-08" "2017-06-22" "2017-07-06" "2017-07-20"
"2017-08-03" "2017-08-17" "2017-08-31" "2017-09-14" "2017-09-28" "2017-10-12" "2017-10-26"
"2017-11-09" "2017-11-23" "2017-12-07" "2017-12-21" "2018-01-04" "2018-01-18"
TL;DR I have a year long daily rain data.frame and want to sum the accumulate rain for the dates above.
Please help.
Using round_date in the way you have shown will not give you 14-day periods as you might expect. I have taken a different approach in this solution: generate a sequence of dates between your first and last dates, group these into 14-day periods, and then join the dates to your observations.
startdate = min(df.meteoro$Dates)
enddate = max(df.meteoro$Dates)

dateseq = data.frame(Dates = seq.Date(startdate, enddate, by = 1)) %>%
  mutate(group = as.numeric(Dates - startdate) %/% 14) %>%
  group_by(group) %>%
  mutate(starts = min(Dates))

df.rain <- df.meteoro %>%
  right_join(dateseq) %>%
  group_by(starts) %>%
  summarise(sum_rain = sum(rain))
head(df.rain)
# A tibble: 6 x 2
starts sum_rain
<date> <dbl>
1 2017-01-19 6.09
2 2017-02-02 5.55
3 2017-02-16 -3.40
4 2017-03-02 2.55
5 2017-03-16 -0.12
6 2017-03-30 8.95
The right join to the date sequence ensures that, if there were missing observation days spanning a complete period, that period would still be listed in the result (though in your case you have a complete year of dates anyway).
round_date rounds to the nearest multiple of unit (here, 14 days) since some epoch (probably the Unix epoch of 1970-01-01 00:00:00), which doesn't line up with your purpose.
To get what you want, you can do the following:
df.rain = df.meteoro %>%
  mutate(days_since_start = as.numeric(Dates - as.Date("2017/1/18")),
         TwoWeeks = as.Date("2017/1/18") + 14 * ceiling(days_since_start / 14)) %>%
  group_by(TwoWeeks) %>%
  summarise(sum_rain = sum(rain))
This computes days_since_start as the days since 2017/1/18 and then manually rounds to the next multiple of two weeks.
Assuming you want to round each date to the closest of the dates you have specified, I guess the following will work:
library(lubridate)
library(plyr)

targetDates <- seq(ymd("2017-02-02"), ymd("2018-01-18"), by = '14 days')
df.meteoro$Dates = targetDates[sapply(df.meteoro$Dates, function(x) which.min(abs(interval(targetDates, x))))]
sum_rain = ddply(df.meteoro, .(Dates), summarize, sum_rain = sum(rain, na.rm = TRUE))
As you can see, not all dates get the same number of observations. Date "2017-02-02", for instance, takes all the records from "2017-01-19" through "2017-02-09", which is 22 records. From "2017-02-10" on, dates are rounded to "2017-02-16", and so on.
This may be a cheat, but assuming each row/observation is a separate day, why not just group every 14 rows and sum?
# Assign interval groups, each 14 rows
df.meteoro$my_group <-rep(1:100, each=14, length.out=nrow(df.meteoro))
# Grab interval names
my_interval_names <- df.meteoro %>%
  select(-rain) %>%
  group_by(my_group) %>%
  slice(1)

# Summarise
df.meteoro %>%
  group_by(my_group) %>%
  summarise(rain = sum(rain)) %>%
  left_join(., my_interval_names)
#> Joining, by = "my_group"
#> # A tibble: 27 x 3
#> my_group rain Dates
#> <int> <dbl> <date>
#> 1 1 3.86 2017-01-19
#> 2 2 -0.581 2017-02-02
#> 3 3 -0.876 2017-02-16
#> 4 4 1.80 2017-03-02
#> 5 5 3.79 2017-03-16
#> 6 6 -3.50 2017-03-30
#> 7 7 5.31 2017-04-13
#> 8 8 2.57 2017-04-27
#> 9 9 -1.33 2017-05-11
#> 10 10 5.41 2017-05-25
#> # ... with 17 more rows
Created on 2018-03-01 by the reprex package (v0.2.0).
Let's say I have several years' worth of data, which looks like the following:
# load date package and set random seed
library(lubridate)
set.seed(42)
# create data.frame of dates and income
date <- seq(dmy("26-12-2010"), dmy("15-01-2011"), by = "days")
df <- data.frame(date = date,
                 wday = wday(date),
                 wday.name = wday(date, label = TRUE, abbr = TRUE),
                 income = round(runif(21, 0, 100)),
                 week = format(date, format = "%Y-%U"),
                 stringsAsFactors = FALSE)
# date wday wday.name income week
# 1 2010-12-26 1 Sun 91 2010-52
# 2 2010-12-27 2 Mon 94 2010-52
# 3 2010-12-28 3 Tues 29 2010-52
# 4 2010-12-29 4 Wed 83 2010-52
# 5 2010-12-30 5 Thurs 64 2010-52
# 6 2010-12-31 6 Fri 52 2010-52
# 7 2011-01-01 7 Sat 74 2011-00
# 8 2011-01-02 1 Sun 13 2011-01
# 9 2011-01-03 2 Mon 66 2011-01
# 10 2011-01-04 3 Tues 71 2011-01
# 11 2011-01-05 4 Wed 46 2011-01
# 12 2011-01-06 5 Thurs 72 2011-01
# 13 2011-01-07 6 Fri 93 2011-01
# 14 2011-01-08 7 Sat 26 2011-01
# 15 2011-01-09 1 Sun 46 2011-02
# 16 2011-01-10 2 Mon 94 2011-02
# 17 2011-01-11 3 Tues 98 2011-02
# 18 2011-01-12 4 Wed 12 2011-02
# 19 2011-01-13 5 Thurs 47 2011-02
# 20 2011-01-14 6 Fri 56 2011-02
# 21 2011-01-15 7 Sat 90 2011-02
I would like to sum 'income' for each week (Sunday through Saturday). Currently I do the following:
Weekending 2011-01-01 = sum(df$income[1:7]) = 487
Weekending 2011-01-08 = sum(df$income[8:14]) = 387
Weekending 2011-01-15 = sum(df$income[15:21]) = 443
However I would like a more robust approach which will automatically sum by week. I can't work out how to automatically subset the data into weeks. Any help would be much appreciated.
First use format to convert your dates to week numbers, then plyr::ddply() to calculate the summaries:
library(plyr)
df$week <- format(df$date, format="%Y-%U")
ddply(df, .(week), summarize, income=sum(income))
     week income
1 2010-52    413
2 2011-00     74
3 2011-01    387
4 2011-02    443
For more information on format.Date, see ?strptime, particularly the bit that defines %U as the week number.
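To see what %U does at a year boundary (days before the first Sunday of the year land in week 00):
format(as.Date(c("2011-01-01", "2011-01-02")), "%Y-%U")
# [1] "2011-00" "2011-01"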
EDIT:
Given the modified data and requirement, one way is to divide the date by 7 to get a numeric value indicating the week. (More precisely, divide the number of days since the epoch, which is 1970-01-01 by default, by 7 to get the number of whole weeks since the epoch.)
In code:
df$week <- as.Date("1970-01-01") + 7 * (as.numeric(df$date) %/% 7)  # as.numeric() on a Date gives days since the epoch
library(plyr)
ddply(df, .(week), summarize, income=sum(income))
        week income
1 2010-12-23    297
2 2010-12-30    386
3 2011-01-06    441
4 2011-01-13    193
I have not checked that the week boundaries fall on Sunday; in fact they fall on Thursday, because 1970-01-01 was a Thursday. You will have to insert an appropriate offset into the formula, as sketched below.
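For example, since day 0 of the epoch (1970-01-01) was a Thursday and day 3 (1970-01-04) a Sunday, shifting by 3 days anchors the weeks on Sunday (my sketch, not part of the original answer):
# Shift by 3 days so each week runs Sunday through Saturday
df$week <- as.Date("1970-01-01") + 3 + 7 * ((as.numeric(df$date) - 3) %/% 7)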
This is now simple using dplyr. I would also suggest using cut(breaks = "week") rather than format() to cut the dates into weeks:
library(dplyr)
df %>%
  group_by(week = cut(date, "week")) %>%
  mutate(weekly_income = sum(income))
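Note that cut.Date() starts weeks on Monday by default; for the Sunday-through-Saturday weeks in the question, and one row per week rather than one per day, a variant would be:
df %>%
  group_by(week = cut(date, "week", start.on.monday = FALSE)) %>%
  summarise(weekly_income = sum(income))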
I Googled "group week days into weeks R" and came across this SO question. You mention you have multiple years, so I think we need to keep up with both the week number and the year, so I modified the answers there to use format(date, format = "%y%U").
In use it looks like this:
library(plyr) #for aggregating
df <- transform(df, weeknum = format(date, format = "%y%U"))
ddply(df, "weeknum", summarize, suminc = sum(income))
#----
  weeknum suminc
1    1052    413
2    1100     74
3    1101    387
4    1102    443
See ?strptime for all the format abbreviations.
Try rollapply from the zoo package:
library(zoo)
rollapply(df$income, width = 7, FUN = sum, by = 7)
# [1] 487 387 443
Or, use period.sum from the xts package:
library(xts)
period.sum(xts(df$income, order.by = df$date), which(df$wday %in% 7))
# [,1]
# 2011-01-01 487
# 2011-01-08 387
# 2011-01-15 443
Or, to get the output in the format you want:
data.frame(income = period.sum(xts(df$income, order.by=df$date),
which(df$wday %in% 7)),
week = df$week[which(df$wday %in% 7)])
# income week
# 2011-01-01 487 2011-00
# 2011-01-08 387 2011-01
# 2011-01-15 443 2011-02
Note that the first week shows as 2011-00 because that's how it is entered in your data. You could also use week = df$week[which(df$wday %in% 1)] which would match your output.
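xts also provides endpoints() with period.apply(), which does the same job without building the index vector by hand (a sketch; endpoints() picks its own week boundaries, so check that they match the Sunday-to-Saturday weeks you want):
library(xts)
x <- xts(df$income, order.by = df$date)
period.apply(x, endpoints(x, on = "weeks"), sum)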
This solution is influenced by @Andrie and @Chase.
# load plyr
library(plyr)
# format weeks as per requirement (replace "00" with "52" and adjust corresponding year)
tmp <- list()
tmp$y <- format(df$date, format="%Y")
tmp$w <- format(df$date, format="%U")
tmp$y[tmp$w=="00"] <- as.character(as.numeric(tmp$y[tmp$w=="00"]) - 1)
tmp$w[tmp$w=="00"] <- "52"
df$week <- paste(tmp$y, tmp$w, sep = "-")
# get summary
df2 <- ddply(df, .(week), summarize, income=sum(income))
# include week ending date
tmp$week.ending <- lapply(df2$week, function(x) rev(df[df$week==x, "date"])[[1]])
df2$week.ending <- sapply(tmp$week.ending, as.character)
# week income week.ending
# 1 2010-52 487 2011-01-01
# 2 2011-01 387 2011-01-08
# 3 2011-02 443 2011-01-15
The same idea in Python/pandas:
df = df.set_index('date')          # use the date column as the index
df.resample('W')['income'].sum()   # sum income by week
With dplyr:
df %>%
  arrange(date) %>%
  mutate(week = as.numeric(date - date[1]) %/% 7) %>%
  group_by(week) %>%
  summarise(weekincome = sum(income))
Instead of date[1] you can use any date from which you want your weekly grouping to start.
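For example, anchoring the weeks at the first Sunday in the sample data (my illustration of the same idea):
start <- as.Date("2010-12-26")  # first Sunday in the data

df %>%
  arrange(date) %>%
  mutate(week = as.numeric(date - start) %/% 7) %>%
  group_by(week) %>%
  summarise(weekincome = sum(income))
# weeks 0, 1, 2 -> incomes 487, 387, 443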