Select a time period by day and month - r

I have a dataframe organized by year.
For example:
date <- seq(as.Date("2001-07-20"),as.Date("2010-12-31"),by = 1)
Now I want to select a subset by using two time periods:
June 23 to July 13 AND July 20 to Aug 9 for 2004-2008.
Could you provide some clue? Thanks!
Yes, it can be solved by:
test[date %between% c("2004-07-20", "2004-08-09")]...
but there are many years in my data, the code can be very repetitive.
I wonder if it can be solved like:
df$md <- format(as.Date(df$date), "%m-%d")
df <- df[df$md %in% c(as.Date(06-23):Date(07-13), Date(07-20):Date(08-09)) & year %in% (2004:2008),]
It doesn't work: Error in as.Date.numeric(6 - 23) : 'origin' must be supplied

You can construct the ranges of interest and subset:
library(lubridate)

# Full daily calendar that will be filtered.
date <- seq(as.Date("2001-07-20"), as.Date("2010-12-31"), by = "day")

# Both windows are written out once for 2004; adding 0 to 4 years to every
# day reproduces them for 2004 through 2008.
year_offsets <- 0:4
window1_2004 <- seq(as.Date("2004-06-23"), as.Date("2004-07-13"), by = "1 day")
window2_2004 <- seq(as.Date("2004-07-20"), as.Date("2004-08-09"), by = "1 day")

# unlist() drops the Date class, so the day counts are converted back to
# dates with an explicit origin.
range1 <- as.Date(
  unlist(lapply(year_offsets, function(y) window1_2004 + years(y))),
  origin = "1970-01-01"
)
range2 <- as.Date(
  unlist(lapply(year_offsets, function(y) window2_2004 + years(y))),
  origin = "1970-01-01"
)

# Keep only the days that fall inside either window.
date[date %in% range1 | date %in% range2]
Alternative
Alternative answer using %between% as suggested in OP
# %between% is provided by data.table, not by lubridate; lubridate is only
# needed for years().
library(data.table)
library(lubridate)

dates <- seq(as.Date("2001-07-20"), as.Date("2010-12-31"), by = 1)

# Start and end of each window, written for the first year (2004).
r1 <- c(as.Date("2004-06-23"), as.Date("2004-07-13"))
r2 <- c(as.Date("2004-07-20"), as.Date("2004-08-09"))

# Shift both windows by 0 to 4 years to cover 2004 through 2008.
ranges <- lapply(0:4, function(y) list(r1 = r1 + years(y), r2 = r2 + years(y)))

# Combine the per-year results with c() instead of unlist(): unlist() strips
# the Date class, and as.Date() on the resulting numbers fails without an
# `origin` on R versions before 4.3.
do.call(c, lapply(ranges, function(r) {
  dates[dates %between% r$r1 | dates %between% r$r2]
}))

Related

How to calculate average of values from specific days each year for multiple years in R?

I wanted to calculate the average temperature (t) of specific time period for each year.
I have weather data that gives me values for each day. My real data is from 2011-2019 and has all days in all years and I would like for example average temperature for 20th of April - 15th of May for each year.
Example data:
# Example weather data: one row per day, with the temperature in `t`.
x <- c("year", "month", "day", "t")
df <- data.frame(
  year = rep(c(2011, 2012), each = 4),
  month = rep(c(3, 4), each = 2, times = 2),
  day = rep(c(1, 2, 3, 4), times = 2),
  t = c(1, 3, 6, 1, 2, 7, 1, -9)
)
colnames(df) <- x
I did manage to do this with very ugly and time-consuming code, but my lack of knowledge has stopped me in my tracks.
Thank you in advance.
With tidyverse you could do something similar:
library(tidyverse)
# `df` is the data frame from the question, with numeric columns
# year, month, day and t.
df %>%
  # Keep 20 April through 15 May: the tail of April or the head of May.
  filter((month == 4 & day >= 20) |
    (month == 5 & day <= 15)) %>%
  # One mean temperature per year.
  group_by(year) %>%
  summarise(mean_temp = mean(t))
Similar to @Ben's answer, but in base R:
# Rows dated between 20 April and 15 May (inclusive), in any year.
spring_window <- subset(
  df,
  (month == 4 & day >= 20) | (month == 5 & day <= 15)
)
# Mean of t within each year.
aggregate(t ~ year, data = spring_window, FUN = mean)
you can actually add quite complex calculations to the group_by function in the dplyr package. Maybe you want to look into something like this.
library(dplyr)
library(lubridate)

# Example data from the question: one row per day, temperature in `t`.
df <- data.frame(
  year = c(2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012),
  month = c(3, 3, 4, 4, 3, 3, 4, 4),
  day = c(1, 2, 3, 4, 1, 2, 3, 4),
  t = c(1, 3, 6, 1, 2, 7, 1, -9)
)

df %>%
  mutate(
    # Day of the year of each observation.
    doy = yday(make_date(year, month, day)),
    # The window limits (3 April and 15 May) are converted to a day of the
    # year within each row's own year. Using one fixed reference year would
    # shift the window by a day whenever the reference year and the data year
    # differ in being leap years.
    doy_start = yday(make_date(year, 4, 3)),
    doy_end = yday(make_date(year, 5, 15))
  ) %>%
  # Keep only the days inside the window ...
  filter(doy >= doy_start, doy <= doy_end) %>%
  # ... and average them separately for every year.
  group_by(year) %>%
  summarise(mean_t = mean(t))
I am using the yday function from lubridate to be able to select days over multiple years.
Hope this helps!!
Try the code below; I like to use a for loop to deal with this kind of trouble.
# `zz` is the input data frame with numeric columns year, month, day and t.

# Create a vector of all years
year_u <- unique(zz$year)
# Create the initial and final period (both ends inclusive)
inicial_day <- 20
inicial_month <- 4
final_day <- 15
final_month <- 5
# Encode (month, day) as month * 100 + day so that calendar order becomes
# numeric order. Testing day and month separately cannot work here, because
# no day is both >= 20 and <= 15.
month_day <- zz$month * 100 + zz$day
in_period <- month_day >= inicial_month * 100 + inicial_day &
  month_day <= final_month * 100 + final_day
# One single-row data.frame per year, combined once after the loop
yearly <- vector("list", length(year_u))
# Open a loop over the distinct years
for (i in seq_along(year_u)) {
  # Rows of this year that fall inside the period; which() drops NA matches
  rows <- which(zz$year == year_u[i] & in_period)
  # Store the year and its mean temperature (NaN when no row is in the period)
  yearly[[i]] <- data.frame(year = year_u[i], t_mean = mean(zz$t[rows]))
}
# Combine the yearly results; stays an empty data.frame when zz has no rows
averages <- if (length(yearly) > 0) do.call(rbind, yearly) else data.frame()

I want to return a season and year value from a continuous list of dates

I have a continuous list of dates (yyyy-mm-dd) from 1985 to 2018 in one column (Colname = date). What I wish to do is generate another column which outputs a water season and year given the date.
To make it clearer I have two water season:
Summer = yyyy-04-01 to yyyy-09-30;
Winter = yyyy-10-01 to yyyy(+1)-03-31.
So for 2018 - Summer = 2018-04-01 to 2018-09-30; Winter 2018-10-01 to 2019-03-31.
What I would like to output is something like the following:
Many thanks.
A tidyverse approach:
library(tidyverse)

# Monthly sample dates covering 2000 and 2001.
df <- tibble(
  date = seq(from = as.Date("2000-01-01"), to = as.Date("2001-12-31"), by = "1 month")
)
df

# Label every date with its water season and the year the season started in.
df %>%
  mutate(
    water_season_year = {
      mon <- lubridate::month(date)
      yr <- lubridate::year(date)
      case_when(
        # April to September: summer of the same year.
        mon %in% 4:9 ~ str_c("Su_", yr),
        # October to December: winter starting this year.
        mon %in% 10:12 ~ str_c("Wi_", yr),
        # January to March: winter that started in the previous year.
        mon %in% 1:3 ~ str_c("Wi_", yr - 1),
        TRUE ~ "Error"
      )
    }
  )
You can compare just the month part of the data to get the season, in base R consider doing
# Label each date in df$date with a water season ("Su" or "Wi") and a year.
# Month as an integer and year as a character string, taken from the date.
month <- as.integer(format(df$date, "%m"))
year <- format(df$date, "%Y")
# TRUE for months 4 to 9, which are labelled as summer.
inds <- month >= 4 & month <= 9
# Start with an all-NA column, then fill summer and winter rows separately.
df$water_season_year <- NA
df$water_season_year[inds] <- paste("Su", year[inds], sep = "_")
df$water_season_year[!inds] <- paste("Wi", year[!inds], sep = "_")
# Months 1 to 3 were labelled with their own year by the line above;
# overwrite them with the previous year.
df$water_season_year[month <= 3] <- paste("Wi",
as.integer(year[month <= 3]) - 1, sep = "_")
# Print the result.
df
# date water_season_year
#1 2019-01-03 Wi_2018
#2 2000-06-01 Su_2000
Make sure that date variable is of "Date" class.
data
# Two sample dates: one in January (winter) and one in June (summer).
sample_days <- c("2019-01-03", "2000-06-01")
df <- data.frame(date = as.Date(sample_days))

Remove all dates prior to specific date in all years

How to remove all dates prior to a specific date, say, 15th of March?
dates <- seq(as.Date("1990/1/1"), as.Date("1999/1/1"), "days")
I would need to subset a data frame based on this vector.
I would use lubridate for readability:
library(lubridate)
# A date is kept when it falls on or after 15 March of its own year:
# either any month after March, or March from the 15th onwards.
after_march <- month(dates) > 3
march_15_onwards <- month(dates) == 3 & day(dates) >= 15
dates[after_march | march_15_onwards]
Base version of the same logic:
# Format each date as month and day ("%m%d") and read it as one integer, so
# that 15 March becomes 315; every date on or after 15 March is then >= 315.
dates[as.integer(format(dates, "%m%d")) >= 315]
Adding up Gregor's answer, but using dplyr:
library(dplyr)
library(lubridate)
# Keep the rows dated 15 March or later within their year.
# Assumes `df` has a Date column named `dates` - confirm against your data.
# day() extracts the day of the month; days() builds a Period object instead
# and must not be used for this comparison.
filtered_df <- df %>%
  filter((month(dates) == 3 & day(dates) >= 15) | month(dates) > 3)

Most effective way to group data in quarters and fiscal years in R

I have a large database (POY) with data from 2011 to 2017 which contains a date column. I would need to do two things: make it possible to split by quarters and by fiscal year.
Our fiscal year unfortunately does not run in parallel with calendar years but goes from July to June. Which also means that my Quarter 1 runs from July to September.
I've written some code that seems to work fine but it seems rather lengthy (especially the second part). Does anyone have any advice for this beginner to make it more efficient?
#Copy of date column and splitting it in 3 columns for year, month and day
library(tidyr)
POY$Date2 <- POY$Date
POY<-separate(POY, Date2, c("year","month","day"), sep = "-", convert=TRUE)
#Making a quarter variable
POY$quarter[POY$month<=3] <- "Q3"
POY$quarter[POY$month>3 & POY$month <=6] <- "Q4"
POY$quarter[POY$month>6 & POY$month <=9] <- "Q1"
POY$quarter[POY$month>9 & POY$month <=12] <- "Q2"
POY$quarter <- as.factor(POY$quarter)
For the Fiscal Year variable: it runs July - June, so:
June'15 should become FY1415
July'15 should become FY1516
Or: Q1 and Q2 in 2015 should become FY1516, while Q3 and Q4 of 2015 are actually FY1415.
#Making a FY variable
for (i in 1:nrow(POY)) {
if (POY$quarter[i] == "Q1" | POY$quarter[i] == "Q2") {
year1 <- as.character(POY$year[i])
year2 <- as.character(POY$year[i] + 1)
} else {
year1 <- as.character(POY$year[i]- 1)
year2 <- as.character(POY$year[i])
}
POY$FY[i] <- paste0("FY", substr(year1, start=3, stop=4), substr(year2, start=3, stop=4))
}
POY$FY <- as.factor(POY$FY)
summary(POY$FY)
Any suggestions?
Thank you!
Not sure if this was available at the time but the lubridate package contains a quarter function which allows you to create your fiscal quarter and year columns.
The documentation is here.
Examples for your case would be:
# A date in July, the first month of a July-June fiscal year.
x <- ymd("2011-07-01")
# Quarter of the calendar year (NOTE(review): expected to be 3 for July - confirm).
quarter(x)
# Same quarter, reported together with its year.
quarter(x, with_year = TRUE)
# Year and quarter counted from a fiscal year that starts in month 7 (July).
# NOTE(review): newer lubridate releases reportedly prefer
# `type = "year.quarter"` over `with_year` - confirm for the installed version.
quarter(x, with_year = TRUE, fiscal_start = 7)
You can then use dplyr and paste function to mutate your own columns in creating fiscal quarters and years.
I've used a combination of base R, tidyr and dplyr:
# separate() comes from tidyr; %>%, mutate() and case_when() from dplyr.
library(dplyr)
library(tidyr)

# make a data frame with sequential dates: the first day of every month
# from July 2011 to July 2015
df <- data.frame(
  date = seq(as.Date("2011-07-01"), as.Date("2015-07-01"), by = "month")
)

# similar to original poster, separate year/month/day into integer columns
# while keeping the original date column
df <- df %>%
  separate(
    col = date, into = c("yr", "mnth", "dy"),
    sep = "-", convert = TRUE, remove = FALSE
  )

# extract last 2 digits of year
df$yr_small <- as.numeric(strftime(x = df$date, format = "%y", tz = "GMT"))

df <- df %>%
  mutate(
    # fiscal quarters: the fiscal year starts in July, so Jul-Sep is Q1
    quarter = case_when(
      mnth >= 7 & mnth <= 9 ~ "Q1",
      mnth >= 10 & mnth <= 12 ~ "Q2",
      mnth >= 1 & mnth <= 3 ~ "Q3",
      mnth >= 4 & mnth <= 6 ~ "Q4"
    ),
    # two-digit calendar year in which the fiscal year STARTS: Q1/Q2 (Jul-Dec)
    # start in the current year, Q3/Q4 (Jan-Jun) started the year before.
    # The modulo keeps the value in 0-99 when the previous year wraps (00 -> 99).
    financial_year = case_when(
      quarter == "Q1" | quarter == "Q2" ~ yr_small,
      quarter == "Q3" | quarter == "Q4" ~ (yr_small - 1) %% 100
    ),
    # full label, e.g. July 2015 -> "FY1516" and June 2015 -> "FY1415";
    # %02d zero-pads single-digit years such as 2009 -> "09"
    FY = sprintf("FY%02d%02d", financial_year, (financial_year + 1) %% 100)
  )
Should give you this:
You could use this to replace the for-loop, I think. If you'd supply some data I could test it.
# Build the fiscal-year label (e.g. "FY1516") for every row at once.
# Q1 and Q2 belong to the fiscal year that starts in the row's calendar year;
# all other quarters belong to the one that started the year before.
starts_this_year <- (POY$quarter == "Q1") | (POY$quarter == "Q2")

# Default: fiscal year runs from (year - 1) to year ...
POY$year1 <- as.character(POY$year - 1)
POY$year2 <- as.character(POY$year)
# ... except for Q1/Q2 rows, where it runs from year to (year + 1).
POY$year1[starts_this_year] <- as.character(POY$year[starts_this_year])
POY$year2[starts_this_year] <- as.character(POY$year[starts_this_year] + 1)

# Glue the last two digits of both years together and store as a factor.
POY$FY <- as.factor(
  paste0("FY", substr(POY$year1, 3, 4), substr(POY$year2, 3, 4))
)
summary(POY$FY)

Subsetting timeseries data

ggplot(Price.data['2000-01/2015-12'],aes(x=Demand,y=Price))+geom_point()+geom_smooth(method=lm)
indexClass(Price.data)
[1] "Date"
How to plot only March, April and June data from year 2010-2014?
head(Price.data)
Dry_Gas_Y Power_Gas_Y Price_Gas_Y
1990-01-01 52.16720 5.469179 2.39
1990-02-01 51.45287 5.470755 1.90
1990-03-01 49.29829 6.908609 1.55
1990-04-01 48.29243 7.721371 1.49
1990-05-01 47.25959 9.154057 1.47
1990-06-01 47.48744 11.525595 1.47
library(tidyverse)
# Derive numeric year and month columns from Date, keep March, April and
# June of 2010-2014, then plot Price against Demand with a linear fit.
Price.data %>%
  mutate(
    year = as.numeric(format(Date, "%Y")),
    month = as.numeric(format(Date, "%m"))
  ) %>%
  filter(year > 2009, year < 2015) %>%
  filter(month %in% c(3, 4, 6)) %>%
  ggplot(aes(Demand, Price)) +
  geom_point() +
  geom_smooth(method = lm)
From your example I didn't see the dates having a column name and it looks like the dates are the row names. For this reason, this example creates a 'Date' column, then 'Month' & 'Year' columns for you to then filter the dates.
library(lubridate)
library(dplyr)
library(ggplot2)
# Assumes Price.data is a data frame whose row names are the dates
# (e.g. "1990-01-01") - confirm; an xts object needs converting first.
plot_months <- Price.data %>%
  # Turn the row names into a Date column and derive month and year from it.
  mutate(
    Date = row.names(.),
    Month = month(Date),
    Year = year(Date)
  ) %>%
  # March, April and June of 2010-2014, as asked in the question.
  filter(
    Month %in% c(3, 4, 6),
    Year %in% 2010:2014
  )
ggplot(plot_months, aes(x = Demand, y = Price)) +
  geom_point() +
  geom_smooth(method = lm)
You can use data.table, which is likely to be the fastest solution
library(data.table)
library(ggplot2)
# convert your dataset into a data.table (modifies df by reference)
setDT(df)
# If necessary, get date column into date format
# df[ , Date := as.Date(df$Date, "%m-%d-%y") ]
# Create separate columns for year and month
df[, year := year(Date)][, month := month(Date)]
# filter dataset: March, April and June of 2010-2014, as asked in the question
df <- df[month %in% c(3, 4, 6) & year %in% 2010:2014, ]
# subset(df, month %in% c(3, 4, 6) & year %in% 2010:2014) # you could also use a simple subset, but this is likely to be slower
# Plot
ggplot(data = df, aes(x = Demand, y = Price)) +
  geom_point() +
  geom_smooth(method = lm)
Since Price.data is an xts object, you can use the .indexmon function to extract the months you want to plot. Then use range-based subsetting to extracting the range of years you want.
Note that .indexmon returns the months starting with January = 0, like the $mon element of POSIXlt objects.
# .indexmon() counts months from 0, so March, April and June are 2, 3 and 5.
wanted_months <- c(2, 3, 5)
# First keep the wanted months, then cut the result down to 2010-2014 with
# range-based subsetting.
selected <- Price.data[.indexmon(Price.data) %in% wanted_months]
ggplot(selected["2010/2014"], aes(x = Dry_Gas_Y, y = Price_Gas_Y)) +
  geom_point() +
  geom_smooth(method = lm)

Resources