compare date variable with a list of dates - r

I have a df with a datetime variable (made with lubridate)
str(raw_data$date)
POSIXct[1:37166], format: "2016-11-04 09:12:38" "2016-11-04 09:04:08" "2016-11-04 09:04:14" "2016-11-04 09:08:01" "2016-11-04 09:11:56" ...
and a list of dates for a school term
vsdate<- c("2017/01/30","2017/03/31","2017/04/18","2017/06/30","2017/07/17","2017/09/22","2017/10/09","2017/12/22","2018/01/30","2018/03/29","2018/04/16","2018/06/29","2018/07/16","2018/09/21","2018/10/08","2018/12/21")
vsdate <- as_date(vsdate)
I want to compare if the dates in the list are between the dates in raw_data. I have done this below, but I can't get it to work in the tidyverse:
vsdate<- c("2017/01/30","2017/03/31","2017/04/18","2017/06/30","2017/07/17","2017/09/22","2017/10/09","2017/12/22","2018/01/30","2018/03/29","2018/04/16","2018/06/29","2018/07/16","2018/09/21","2018/10/08","2018/12/21")
vsdate <- as.Date(vsdate)
raw_data$Vic.School.Term=0
raw_data[raw_data$date<=vsdate[2]& raw_data$date>=vsdate[1],"Vic.School.Term"]<-1
raw_data[raw_data$date<vsdate[4]& raw_data$date>=vsdate[3],"Vic.School.Term"]<-1
raw_data[raw_data$date<vsdate[6]& raw_data$date>=vsdate[5],"Vic.School.Term"]<-1
raw_data[raw_data$date<vsdate[8]& raw_data$date>=vsdate[7],"Vic.School.Term"]<-1
raw_data[raw_data$date<=vsdate[10]& raw_data$date>=vsdate[9],"Vic.School.Term"]<-1
raw_data[raw_data$date<vsdate[12]& raw_data$date>=vsdate[11],"Vic.School.Term"]<-1
raw_data[raw_data$date<vsdate[14]& raw_data$date>=vsdate[13],"Vic.School.Term"]<-1
raw_data[raw_data$date<vsdate[16]& raw_data$date>=vsdate[15],"Vic.School.Term"]<-1
and here is my failed attempt in the tidyverse:
raw_data<- raw_data <- mutate(school.term=case_when(
between(date,vsdate[1],vsdate[2] ~ 1)))
Error in between(date, vsdate[1], vsdate[2] ~ 1) :
Expecting a single value: [extent=3].
Thanks!

Your between function is not closed properly. The proper signature for it is between(value,left, right) and you have between(value, left, right ~1). See below for the 1st few cases:
library(dplyr)
library(lubridate)
raw_data <- data.frame( date = c("2016-11-04 09:12:38", "2016-11-04 09:04:08",
"2016-11-04 09:04:14", "2016-11-04 09:08:01",
"2016-11-04 09:11:56", "2017-02-15 09:10:01",
"2017-05-01 10:00:00")
)
raw_data %>% mutate(date = ymd_hms(date)) -> raw_data
str(raw_data)
vsdate<- ymd(c("2017/01/30","2017/03/31","2017/04/18","2017/06/30",
"2017/07/17","2017/09/22","2017/10/09","2017/12/22",
"2018/01/30","2018/03/29","2018/04/16","2018/06/29",
"2018/07/16","2018/09/21","2018/10/08","2018/12/21"))
str(vsdate)
# Flag each row as inside (1) or outside (0) a school term.
# as.Date() converts the POSIXct column to Date so it can be compared with
# the Date values in vsdate; between() includes both end points.
raw_data %>%
  mutate(school.term = case_when(
    between(as.Date(date), vsdate[1], vsdate[2]) ~ 1,
    between(as.Date(date), vsdate[3], vsdate[4]) ~ 1,
    TRUE ~ 0
  ))
date school.term
1 2016-11-04 09:12:38 0
2 2016-11-04 09:04:08 0
3 2016-11-04 09:04:14 0
4 2016-11-04 09:08:01 0
5 2016-11-04 09:11:56 0
6 2017-02-15 09:10:01 1
7 2017-05-01 10:00:00 1
Also, note the as.Date function in the between. This allows the comparison between POSIXct and regular date format in R

Related

How to get date type format?

I have a date in yyyymmdd format dataframe
ex.
df= data.frame(dat = seq.Date(from= as.Date("2021-01-01") , to = as.Date("2021-01-07"), by =1))
I want to create a column of strings in this format:
example : 2021-01-07 should look like 07-JAN-21
toupper(format(date_column, "%d-%b-%y"))
here is the premise
> df$dat <- toupper(format(df$dat, "%d-%b-%y"))
> df
dat
1 01-JAN-21
2 02-JAN-21
3 03-JAN-21
4 04-JAN-21
5 05-JAN-21
6 06-JAN-21
7 07-JAN-21

Selecting Date Range from Column in R

I have a dataset in R which I read in using read.table (Table Name a) . I want to select dates from '2007-02-01' to '2007-02-02'. The Current Date Column is of class "Character".
Date
1 16/12/2006
2 16/12/2006
3 16/12/2006
4 16/12/2006
5 16/12/2006
6 16/12/2006
I tried the following:
1. as.Date(a$Date) returns date in the format "0016-12-20"
2. a[a$Date >= '2007-02-01' & a$Date <= '2007-02-01'] returns all rows with 0 variables
3. strptime(a$Date,'%d%b%Y') returns NA values
Convert date to date class and subset :
df$Date <- as.Date(df$Date, '%d/%m/%Y')
subset(df, Date >= as.Date('2007-02-01') & Date <= as.Date('2007-02-02'))
You can also use :
library(dplyr)
df %>%
mutate(Date = lubridate::dmy(Date)) %>%
filter(Date >= as.Date('2007-02-01') & Date <= as.Date('2007-02-02'))

How to change the date format & remove rows from dataframe before certain date R Studio

I have a dataframe with over 8.8 million observations and I need to remove rows from the dataframe before a certain date. Currently the date format is in MM/DD/YYYY but I would like to convert it to R date format (I believe YYYY-MM-DD).
When I run the code that I have below, it puts them in the correct R format, but it does not keep the correct date. For some reason, it makes the dates 2020. None of the dates in my data frame have the year 2020
> dates <- nyc_call_data_sample$INCIDENT_DATETIME
> date <- as.Date(dates,
+ format = "%m/%d/%y")
> head(nyc_call_data_sample$INCIDENT_DATETIME)
[1] "07/01/2015" "04/24/2016" "04/01/2013" "02/07/2015" "06/27/2016" "05/04/2017"
> head(date)
[1] "2020-07-01" "2020-04-24" "2020-04-01" "2020-02-07" "2020-06-27" "2020-05-04"
> nyc_call_data_sample$INCIDENT_DATETIME <- strptime(as.character(nzd$date), "%d/%m/%y")
Also, I have data that goes back as far as 2013. How would I go about removing all rows from the dataframe that are before 01/01/2017
Thanks!
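as.Date and basic ?Extract are your friends here. The year came out as 2020 because %y reads only a two-digit year (the "20" of "2015"); use %Y for a four-digit year.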
# Sample of the raw MM/DD/YYYY strings from the question.
dat <- data.frame(
  unformatted = c("07/01/2015", "04/24/2016", "04/01/2013", "02/07/2015", "06/27/2016", "05/04/2017")
)
# %Y parses a four-digit year; %y would read only the first two digits.
dat$date <- as.Date(dat$unformatted, format = "%m/%d/%Y")
dat
# unformatted date
# 1 07/01/2015 2015-07-01
# 2 04/24/2016 2016-04-24
# 3 04/01/2013 2013-04-01
# 4 02/07/2015 2015-02-07
# 5 06/27/2016 2016-06-27
# 6 05/04/2017 2017-05-04
# Drop everything before 2017-01-01; >= keeps rows dated exactly 2017-01-01.
dat[dat$date >= as.Date("2017-01-01"), ]
# unformatted date
# 6 05/04/2017 2017-05-04
(Feel free to remove the unformatted column with dat$unformatted <- NULL.)
With tidyverse:
library(dplyr)
# Parse the MM/DD/YYYY strings, drop the raw column, and keep only rows dated
# on or after 2017-01-01 (>= so that 2017-01-01 itself is not removed).
dat %>%
  mutate(date = as.Date(unformatted, format = "%m/%d/%Y")) %>%
  select(-unformatted) %>%
  filter(date >= as.Date("2017-01-01"))
# date
# 1 2017-05-04

Problems with anomalize function

I need to check the data array with function "Anomalize".
First I hooked up some libraries
library(tidyverse)
library(anomalize)
library(dplyr)
library(zoo)
library(ggplot2)
library(forecast)
library(anytime)
Then I delete all column that i do not need for this task
trash1 <- ASD[, -2]
trash2 <- trash1[,-2]
trash3 <- trash2[,-2]
trash4 <- trash3[,-2]
trash5 <- trash4[,-2]
trash6 <- trash5[,-2]
trash7 <- trash6[,-4]
trash8 <- trash7[,-4]
view(trash8)
Change class from Factor to Date:
trash8$DMY <- as.Date(trash8$DMY, format="%d.%m.%y")
Then I tried to anomalize this
trash_tbl <- as_tibble(trash8)
trash_tbl %>%
time_decompose(Qp) %>%
anomalize(remainder) %>%
time_recompose() %>%
plot_anomalies(time_recomposed = TRUE, ncol = 3 , alpha_dots = 0.5)
As the result I have this error:
Converting from tbl_df to tbl_time.
Auto-index message: index = DMY
Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
Error: Only year, quarter, month, week, and day periods are allowed for an index of class Date
Please help me with it or say, what can I read to solve that problem??
This is my data. DMY - Date, MCC - Factor, Art - Numeric, Qp - Numeric , Ql - Factor
1 DMY MCC Art Qp Ql
1 2016-01-01 UA0000468 1801 3520 440
2 2016-01-01 UA0000468 3102 3024 604,8
3 2016-01-01 UA0000468 4419 270 521,1
4 2016-01-01 UA0000468 5537 1080 2084,4
5 2016-01-03 UA0010557 3528 180 36
6 2016-01-03 UA0010557 3529 198 39,6
...

R: converting start/end dates into data series

I have the following data frame representing user subscriptions:
User StartDate EndDate
1 2015-09-03 2015-10-17
2 2015-10-27 2015-12-25
...
How can I transform it into a time series that gives me the count of active monthly subscriptions over time (assuming it is active in the month if at least for one day in that month). Something like this (based on the example above, assuming only 2 records):
Month Count
2015-08 0
2015-09 1
2015-10 2
2015-11 1
2015-12 1
2016-01 0
Rem: I took some arbitrary start and end dates for the time series, to make the example clear.
Prepare the data and make sure that the date columns are actually stored as dates:
# Read the two example subscriptions from inline text.
data <- read.table(text = "User StartDate EndDate
1 2015-09-03 2015-10-17
2 2015-10-27 2015-12-25", header = TRUE)
# Convert both columns to Date. They must be referenced through `data$`,
# because no free-standing StartDate/EndDate objects exist.
data$StartDate <- as.Date(data$StartDate)
data$EndDate <- as.Date(data$EndDate)
This function returns a vector with all month that are within a subscription:
library(lubridate)
# Return the "YYYY-MM" label of every calendar month touched by a
# subscription running from `start` to `end`.
subscr_month <- function(start, end) {
  # Step from the first day of the start month so the last month is not skipped.
  month_starts <- seq(floor_date(start, "month"), end, by = "1 month")
  format(month_starts, format = "%Y-%m")
}
It uses the function floor_date() from the lubridate package. It is necessary to round the start date down to the first of its month, because otherwise the last month might be missing. For example, for user 2, if you add two months to the start date, you end up on 2015-12-27, which is after the end date, so no date from December would be included in seq. The last line converts the Dates to character strings that include only the year and month.
Now, you can apply this function to each start and end date from your data using mapply(). Afterwards, table() creates a table of counts of all dates in the resulting list:
all_month <- mapply(subscr_month, data$StartDate, data$EndDate, SIMPLIFY = FALSE)
table(unlist(all_month))
## 2015-09 2015-10 2015-11 2015-12
## 1 2 1 1
You can also convert the table to a data frame:
as.data.frame(table(unlist(all_month)))
## Var1 Freq
## 1 2015-09 1
## 2 2015-10 2
## 3 2015-11 1
## 4 2015-12 1
Your example output also includes the counts for months that do not appear in the data set. If you want to have this, you can convert the vector of months to a factor and set the levels to all the months you want to include:
month_list <- format(seq(as.Date("2015-08-01"), as.Date("2016-01-01"), by = "1 month"), format = "%Y-%m")
all_month_factor <- factor(unlist(all_month), levels = month_list)
table(all_month_factor)
## all_month_factor
## 2015-08 2015-09 2015-10 2015-11 2015-12 2016-01
## 0 1 2 1 1 0
read the data frame mentioned.
# Example subscriptions from the question, built directly as Date columns.
# The first EndDate is 2015-10-17, matching the question's data.
df <- data.frame(
  StartDate = as.Date(c("2015-09-03", "2015-10-27")),
  EndDate = as.Date(c("2015-10-17", "2015-12-25"))
)
Could make good use of do in dplyr package and seq
# Count active subscriptions per month: expand each row into the months it
# touches, then tally the rows per month.
df %>%
  rowwise() %>%
  do({
    # Step by whole months from the first day of the start month. A fixed
    # 15-day step from StartDate can jump over a month in which the
    # subscription is active for only a few days.
    first_of_month <- as.Date(format(.$StartDate, "%Y-%m-01"))
    w <- seq(first_of_month, .$EndDate, by = "month")
    data.frame(Month = format(w, "%Y-%m"))
  }) %>%
  group_by(Month) %>%
  summarise(Count = length(Month))

Resources