I have the following dataframe:
Data <- data.frame(
date = c("2001-01-01", "2001-02-01", "2001-03-01", "2001-04-01", "2001-05-01", "2001-06-01"),
qtr = c("NA", "NA","NA","NA","NA","NA")
)
I want to fill Data$qtr with Year/Quarter, e.g. 01/01 (I need this format!).
I wrote a function:
# Map a single date string to a "YY/QQ" label.
# x: one date string, compared against six hard-coded 2001 dates.
# Returns "01/01" for the January-March literals and "01/02" for the
# April-June literals.
fun <- function(x) {
if(x == "2001-01-01" | x == "2001-02-01" | x == "2001-03-01") y <- "01/01"
if(x == "2001-04-01" | x == "2001-05-01" | x == "2001-06-01") y <- "01/02"
# If x matched none of the literals above, neither branch assigned y, so
# this line fails with "object 'y' not found".
return(y)
}
n$qtr <- sapply(n$date, fun)
But it does not work. I always get the error message:
Error in FUN(X[[1L]], ...) : Object 'y' not found
Why?
You need to explicitly Vectorize your function:
fun_v <- Vectorize(fun, "x")
fun_v(Data$date)
#[1] "01/01" "01/01" "01/01" "01/02" "01/02" "01/02"
However, when it comes to more or less standard tasks (such as datetime manipulations), there's always a solution already available:
library(zoo)
yq <- as.yearqtr(Data$date, format = "%Y-%m-%d")
yq
#[1] "2001 Q1" "2001 Q1" "2001 Q1" "2001 Q2" "2001 Q2" "2001 Q2"
To convert to your specific format, use
format(yq, format = "%y/0%q")
#[1] "01/01" "01/01" "01/01" "01/02" "01/02" "01/02"
I have been loving the lubridate package for working with dates. Super slick. The quarter function finds the quarter (of course) and then just pair that with the year.
library(dplyr)      # provides %>% and mutate()
library(lubridate)  # provides year() and quarter()
# Build "YY/0Q": the last two digits of the year, then the zero-padded quarter.
Data <- Data %>%
  mutate(qtr = paste0(substring(year(date),3,4),"/0",quarter(date)))
If you are not familiar with the %>% from magrittr the first line basically says "use data frame called Data" and the second line says "mutate (or add) a column called qtr"
EDIT 2021-Q2
If the "YY/QQ" format is not critical, then a quick and safe way to get the year and the quarter is:
library(dplyr)      # provides %>% and mutate()
library(lubridate)  # provides quarter()
# Spell out TRUE: T is an ordinary variable that can be reassigned.
Data %>%
  mutate(qtr = quarter(date, with_year = TRUE))
Using base functions:
Data$date <- as.Date(Data$date)
Data$qtr <- paste(format(Data$date, "%y"),
sprintf("%02i", (as.POSIXlt(Data$date)$mon) %/% 3L + 1L),
sep="/")
# date qtr
# 1 2001-01-01 01/01
# 2 2001-02-01 01/01
# 3 2001-03-01 01/01
# 4 2001-04-01 01/02
# 5 2001-05-01 01/02
# 6 2001-06-01 01/02
another option would be:
# Spell out TRUE: T is an ordinary variable that can be reassigned.
Data$qtr <- lubridate::quarter(Data$date, with_year = TRUE)
I made a similar format using quarters() and sub() in R:
Data$qtr <- paste(format(Data$date, "%y/"), 0,
sub( "Q", "", quarters(Data$date) ), sep = "")
The data.table package and its IDate class have some nice convenience functions (e.g. quarter(), year()), similar to those of lubridate available. paste0() them together as you please.
Data <- data.frame(
  date = c("2001-01-01", "2001-02-01", "2001-03-01",
           "2001-04-01", "2001-05-01", "2001-06-01")
)
# library() stops with an error if data.table is missing; require() would only
# return FALSE and let the script fail later at setDT().
library(data.table)
setDT(Data)
Data[ , date := as.IDate(date) ] # data.table's integer-based date class
# four-digit year and quarter, e.g. "2001/1"
Data[ , qtr := paste0(year(date), '/', quarter(date)) ]
# your specific format: two-digit year and zero-padded quarter, e.g. "01/01"
Data[ , qt2 := paste0(substr(year(date),3,4), '/', '0', quarter(date)) ]
tidyverse's clock gives an alternative, with the advantage of not losing the original day, with calendar_narrow and date precision:
library(clock)
library(dplyr)
#Function to convert a date to year-quarter
toQ <- . %>%
date_parse() %>%
as_year_quarter_day() %>%
calendar_narrow("quarter")
Data %>%
mutate(qtr = toQ(date))
output
date qtr
1 2001-01-01 2001-Q1
2 2001-02-01 2001-Q1
3 2001-03-01 2001-Q1
4 2001-04-01 2001-Q2
5 2001-05-01 2001-Q2
6 2001-06-01 2001-Q2
Another (longer) way
of doing it using if statements is this:
# Classify the first element of `date` into a "YYYY-Qn" label.
# Only element [1] is used, so the result is always a single string.
month <- as.numeric(format(date, format = "%m"))[1]
# `if` needs a single TRUE/FALSE, so combine the bounds with the scalar
# operator && rather than the element-wise &.
if (month < 4) {
  quarter <- paste( format(date, format = "%Y")[1], "Q1", sep="-")
} else if (month > 3 && month < 7) {
  quarter <- paste( format(date, format = "%Y")[1], "Q2", sep="-")
} else if (month > 6 && month < 10) {
  quarter <- paste( format(date, format = "%Y")[1], "Q3", sep="-")
} else if (month > 9) {
  quarter <- paste( format(date, format = "%Y")[1], "Q4", sep="-")
}
Returns a string in the format:
> quarter
[1] "2001-Q1"
Then you could extend that using a loop.
# Format dates as a year/quarter label such as "2013Q2" or "13/02".
#   x        Date vector (anything format() can render with "%m" and `prefix`).
#   prefix   strftime format for the part before the quarter; NULL omits it.
#   combine  separator placed between the prefix part and the quarter number.
# Returns a character vector with one label per element of x.
yq <- function(x, prefix = "%Y", combine = "Q") {
  # Plain if/else keeps the whole vector; ifelse() on this length-1 test would
  # return only the first formatted element.
  lead <- if (is.null(prefix)) "" else format(x, prefix)
  # Months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4.
  qtr <- (as.numeric(format(x, "%m")) - 1) %/% 3 + 1
  # paste() honours sep; paste0() would treat sep as one more string to append.
  paste(lead, qtr, sep = combine)
}
this gives the flexibility of returning any format back that has the quarter in it
no need for chron or zoo
as for your example
yq(as.Date("2013-04-30"),prefix="%y",combine="/0")
> [1] "13/02"
In case someone is looking for a format such as 1Q21 for the 1st quarter of 2021, I used @Roland's answer above and made a small change:
# "%i" prints the quarter without padding; "%2i" would pad to width 2 and give " 1Q21".
paste(sprintf("%i", (as.POSIXlt(Notional$Date)$mon) %/% 3L + 1L), format(Notional$Date, "%y"),sep="Q")
Related
I have a data frame where dates are represented by the string "ABC202003" with the format "ABCYYYYMM". How can I remove the "ABC" part and convert it to Date format month-year in R?
Does this work:
> library(dplyr)
> library(stringr)
> str <- c('ABC202003','DEF202004')
> df <- data.frame(str = str)
> df
str
1 ABC202003
2 DEF202004
> df %>% mutate(date = str_extract_all(str, '\\d+')) %>%
+ mutate(date = str_replace_all(date, '(\\d{4})(\\d{2})','\\1-\\2'))
str date
1 ABC202003 2020-03
2 DEF202004 2020-04
>
In month-year format:
> df %>% mutate(date = str_extract_all(str, '\\d+')) %>%
+ mutate(date = str_replace_all(date, '(\\d{4})(\\d{2})','\\2-\\1'))
str date
1 ABC202003 03-2020
2 DEF202004 04-2020
>
The data in the question, corrected.
x <- "ABC022003"
If there are always 3 characters at the beginning of the string, first run this:
date <- as.Date(paste0("01", substring(x, 4)), "%d%m%Y")
If there could be a different number of non-numeric digits, run this:
date <- as.Date(paste0("01", gsub("[^[:digit:]]", "", x)), "%d%m%Y")
Now date is an object of class "Date". Any of the following will create a month-year string.
format(date, "%m-%Y")
#[1] "02-2003"
format(date, "%b-%Y")
#[1] "Feb-2003"
zoo::as.yearmon(date)
#[1] "Feb 2003"
We can get the digits with parse_number and then use ymd with truncated to convert to Date class. If needed to change the format to month-Year, then use format
library(dplyr)
library(lubridate)
df %>%
mutate(date = format(ymd(readr::parse_number(str), truncated = 2), '%m-%Y'))
# str date
#1 ABC202003 03-2020
#2 DEF202004 04-2020
if it needs to be Date class, remove the format
df %>%
mutate(date = ymd(readr::parse_number(str), truncated = 2))
# str date
#1 ABC202003 2020-03-01
#2 DEF202004 2020-04-01
data
df <- structure(list(str = c("ABC202003", "DEF202004")),
class = "data.frame", row.names = c(NA,
-2L))
First get rid of the letters at the beginning using gsub
x <- c('ABC202003','DEF202004')
x <- gsub("[^0-9.-]", "", x)
Then use parse_date_time from lubridate to parse it as a date
x <- lubridate::parse_date_time(x, orders = 'ym', truncated = 1)
then finally use format to format them as you wish
format(x, '%Y-%m')
This is the end result:
[1] "2020-03" "2020-04"
I have a continuous list of dates (yyyy-mm-dd) from 1985 to 2018 in one column (Colname = date). What I wish to do is generate another column which outputs a water season and year given the date.
To make it clearer I have two water seasons:
Summer = yyyy-04-01 to yyyy-09-30;
Winter = yyyy-10-01 to yyyy(+1)-03-31.
So for 2018 - Summer = 2018-04-01 to 2018-09-30; Winter 2018-10-01 to 2019-03-31.
What I would like to output is something like the following:
Many thanks.
A tidy verse approach
library(tidyverse)
df <-tibble(date = seq(from = as.Date('2000-01-01'), to = as.Date('2001-12-31'), by = '1 month'))
df
df %>%
mutate(water_season_year = case_when(
lubridate::month(date) %in% c(4:9) ~str_c('Su_', lubridate::year(date)),
lubridate::month(date) %in% c(10:12) ~str_c('Wi_', lubridate::year(date)),
lubridate::month(date) %in% c(1:3)~str_c('Wi_', lubridate::year(date) -1),
TRUE ~ 'Error'))
You can compare just the month part of the data to get the season, in base R consider doing
# Derive a "<season>_<year>" label from df$date using only the month number.
month <- as.integer(format(df$date, "%m"))
year <- format(df$date, "%Y")
# TRUE for April-September (summer); everything else is winter.
inds <- month >= 4 & month <= 9
df$water_season_year <- NA
df$water_season_year[inds] <- paste("Su", year[inds], sep = "_")
df$water_season_year[!inds] <- paste("Wi", year[!inds], sep = "_")
# January-March belong to the winter that started the previous year, so
# overwrite those rows with year - 1.
df$water_season_year[month <= 3] <- paste("Wi",
as.integer(year[month <= 3]) - 1, sep = "_")
df
# date water_season_year
#1 2019-01-03 Wi_2018
#2 2000-06-01 Su_2000
Make sure that date variable is of "Date" class.
data
df <-data.frame(date = as.Date(c("2019-01-03", "2000-06-01")))
I have created a function to derive the month: if an invoice is created before the 15th of a month, it counts towards the previous month; otherwise it counts towards the current month. The output is stored in a matrix (2 columns and 4500 rows): one column holds the month as an integer and the other the year as an integer. The program and its output are below. I want the month and year to be in Date format instead of integers, so that I can slice and dice the data in visualizations. Your help is appreciated.
# If the date is before 15th of a month, will consider previous month. Else current month
# Month and year an invoice date is booked to, as c(month, year).
# x: a single date in "%d-%m-%Y" form (the function is applied per element).
myDateFun <- function(x){
  invoice_day <- as.Date(x, format='%d-%m-%Y')
  # Days 1-14 count towards the previous month; going back 14 days from any
  # of them lands in that month.
  dd <- if (day(invoice_day) < 15) invoice_day - 14 else invoice_day
  return(c(month(dd), year(dd)))
}
# sapply method used to absorb the function and create matrix of month and year
mat = t(sapply(CI3$invoice_date, FUN=myDateFun, simplify='matrix'))
# Output [,1] is month. [,2] is year
mat
[,1] [,2]
[1,] 3 2016
[2,] 4 2016
[3,] 5 2016
[4,] 6 2016
If you adjust your function slightly you don't need to use sapply.
# Return the "YYYY-MM" month each invoice date is booked to.
#   x  vector of dates in "%d-%m-%Y" form.
# Dates before the 15th are attributed to the previous month.
# Returns a character vector with one entry per element of x.
myDateFun <- function(x){
  x <- as.Date(x, format='%d-%m-%Y')
  day_of_month <- as.integer(format(x, "%d"))
  # Assigning inside ifelse() is not element-wise: each branch is evaluated
  # for the whole vector and the last assignment wins for every row.
  # Shift arithmetically instead: 14 days back from day 1-14 always lands in
  # the previous month, and later days are shifted by 0.
  dd <- x - 14L * (day_of_month < 15)
  format(dd, "%Y-%m")
}
# add year month to CI3
# year_month will be a character vector due to format function.
CI3$year_month <- myDateFun(CI3$invoice_date)
edit based on comment:
I edited the function so it can take an extra argument specifying year or month. Default is year. Very simple error handling to make sure it is one of these values.
# Return the year or the abbreviated month name each invoice date is booked to.
#   x       vector of dates in "%d-%m-%Y" form.
#   period  "year" (default) or "month"; anything else is an error.
# Dates before the 15th are attributed to the previous month.
# Returns a character vector with one entry per element of x.
myDateFun <- function(x, period = "year"){
  # error handling
  if(!(period %in% c("year", "month"))) stop("period should be year or month")
  x <- as.Date(x, format='%d-%m-%Y')
  day_of_month <- as.integer(format(x, "%d"))
  # Assigning inside ifelse() is not element-wise: each branch is evaluated
  # for the whole vector and the last assignment wins for every row.
  # Shift arithmetically instead: 14 days back from day 1-14 always lands in
  # the previous month, and later days are shifted by 0.
  dd <- x - 14L * (day_of_month < 15)
  if(period == "year"){
    out <- format(dd, "%Y")
  } else {
    # "%b" is the abbreviated month name in the current locale
    out <- format(dd, "%b")
  }
  return(out)
}
CI3$year <- myDateFun(CI3$invoice_date, "year")
CI3$month <- myDateFun(CI3$invoice_date, "month")
Here is a solution using the lubridate and purrr packages. I often just need the month and year of a date so I just default the day to 1 and ignore it.
Here is some sample data in your format:
library(tidyverse)
library(lubridate)
x <- data_frame(date = c("03/01/2018", "01/02/2015", "03/04/2006", "25/12/2006", "15/01/2014"))
This is your function using lubridate:
# Return the first day of the month each invoice date is booked to.
#   x  character vector of dates in day/month/year order.
# Dates before the 15th are attributed to the previous month.
AltDateFun <- function(x) {
  x <- dmy(x)
  # `if` only accepts a single TRUE/FALSE, but map_df() passes the whole
  # column, so select the early-month rows by index instead.
  early <- which(day(x) < 15)
  if (length(early) > 0) {
    # Days 1-14 exist in every month, so stepping back one month is always valid.
    x[early] <- x[early] - months(1)
  }
  day(x) <- 1
  x
}
And assuming your invoice dates are a character column in a data frame with dates in the format dmy:
z <- map_df(x, AltDateFun)
# A tibble: 5 x 1
x
<date>
1 2017-12-01
2 2015-01-01
3 2006-03-01
4 2006-12-01
5 2014-01-01
EDIT:
To get the month and year in a separate column do this:
z %>% mutate(m = month(x), y = year(x))
# A tibble: 5 x 3
x m y
<date> <dbl> <dbl>
1 2017-12-01 12.0 2017
2 2015-01-01 1.00 2015
3 2006-03-01 3.00 2006
4 2006-12-01 12.0 2006
5 2014-01-01 1.00 2014
Worked. Thanks all for your answers. Just sharing the code I used. Updated code below
# Return the year or the abbreviated month name each invoice date is booked to.
#   x       vector of dates in "%d-%m-%Y" form.
#   period  "year" (default) or "month"; anything else is an error.
# Dates before the 15th are attributed to the previous month.
# Returns a character vector with one entry per element of x.
myDateFun <- function(x, period = "year") {
  # error handling
  if (!(period %in% c("year", "month")))
    stop("period should be year or month")
  x <- as.Date(x, format = '%d-%m-%Y')
  day_of_month <- as.integer(format(x, "%d"))
  # Assigning inside ifelse() is not element-wise: each branch is evaluated
  # for the whole vector and the last assignment wins for every row.
  # Shift arithmetically instead: 14 days back from day 1-14 always lands in
  # the previous month, and later days are shifted by 0.
  dd <- x - 14L * (day_of_month < 15)
  if (period == "year") {
    out <- format(dd, "%Y")
  } else {
    # "%b" is the abbreviated month name in the current locale
    out <- format(dd, "%b")
  }
  return(out)
}
CI3$invyr <- myDateFun(CI3$invoice_date, "year")
CI3$invmon <- myDateFun(CI3$invoice_date, "month")
CI3$date_m_Y = paste(CI3$invmon, CI3$invyr, sep = "-")
ggplot(Price.data['2000-01/2015-12'],aes(x=Demand,y=Price))+geom_point()+geom_smooth(method=lm)
indexClass(Price.data)
[1] "Date"
How to plot only March, April and June data from year 2010-2014?
head(Price.data)
Dry_Gas_Y Power_Gas_Y Price_Gas_Y
1990-01-01 52.16720 5.469179 2.39
1990-02-01 51.45287 5.470755 1.90
1990-03-01 49.29829 6.908609 1.55
1990-04-01 48.29243 7.721371 1.49
1990-05-01 47.25959 9.154057 1.47
1990-06-01 47.48744 11.525595 1.47
library(tidyverse)
Price.data %>%
mutate(year = as.numeric(format(Date, "%Y")),
month = as.numeric(format(Date, "%m"))) %>%
filter(year > 2009 & year < 2015, month == 3 | month == 4 | month ==6) %>%
ggplot(aes(Demand,Price))+geom_point()+geom_smooth(method=lm)
From your example I didn't see the dates having a column name and it looks like the dates are the row names. For this reason, this example creates a 'Date' column, then 'Month' & 'Year' columns for you to then filter the dates.
library(lubridate)
library(dplyr)
# Turn the row names into a Date column, derive month and year from it, then
# keep March, April and June of 2010-2014 as asked in the question.
plot_months <- Price.data %>%
  mutate(Date = row.names(.),
         Month = month(Date),
         Year = year(Date)) %>%
  filter(Month %in% c(3, 4, 6),
         Year %in% 2010:2014)
ggplot(plot_months, aes(x=Demand,y=Price))+
  geom_point()+
  geom_smooth(method=lm)
You can use data.table, which is likely to be the fastest solution
library(data.table)
# convert your dataset into a data.table
setDT(df)
# If necessary, get date column into date format
# df[ , Date := as.Date(df$Date, "%m-%d-%y") ]
# Create separate columns for year and month
df[, year := year(Date)][, month := month(Date)]
# filter dataset: March, April and June of 2010-2014, as asked in the question
df <- df[ month %in% c(3,4,6) & year %in% 2010:2014, ]
# subset(df, month %in% c(3,4,6) & year %in% 2010:2014 ) # you could also use a simple subset, but this is likely to be slower
Plot
ggplot(data=df, aes(x=Demand, y=Price)) + geom_point() + geom_smooth(method=lm)
Since Price.data is an xts object, you can use the .indexmon function to extract the months you want to plot. Then use range-based subsetting to extracting the range of years you want.
Note that .indexmon returns the months starting with January = 0, like the $mon element of POSIXlt objects.
ggplot(Price.data[.indexmon(Price.data) %in% c(2, 3, 5)]['2010/2014'],
aes(x=Dry_Gas_Y, y=Price_Gas_Y)) + geom_point() + geom_smooth(method=lm)
I don't often have to work with dates in R, but I imagine this is fairly easy. I have a column that represents a date in a dataframe. I simply want to create a new dataframe that summarizes a 2nd column by Month/Year using the date. What is the best approach?
I want a second dataframe so I can feed it to a plot.
Any help you can provide will be greatly appreciated!
EDIT: For reference:
> str(temp)
'data.frame': 215746 obs. of 2 variables:
$ date : POSIXct, format: "2011-02-01" "2011-02-01" "2011-02-01" ...
$ amount: num 1.67 83.55 24.4 21.99 98.88 ...
> head(temp)
date amount
1 2011-02-01 1.670
2 2011-02-01 83.550
3 2011-02-01 24.400
4 2011-02-01 21.990
5 2011-02-03 98.882
6 2011-02-03 24.900
I'd do it with lubridate and plyr, rounding dates down to the nearest month to make them easier to plot:
library(lubridate)
df <- data.frame(
date = today() + days(1:300),
x = runif(300)
)
df$my <- floor_date(df$date, "month")
library(plyr)
ddply(df, "my", summarise, x = mean(x))
There is probably a more elegant solution, but splitting into months and years with strftime() and then aggregate()ing should do it. Then reassemble the date for plotting.
x <- as.POSIXct(c("2011-02-01", "2011-02-01", "2011-02-01"))
mo <- strftime(x, "%m")
yr <- strftime(x, "%Y")
amt <- runif(3)
dd <- data.frame(mo, yr, amt)
dd.agg <- aggregate(amt ~ mo + yr, dd, FUN = sum)
dd.agg$date <- as.POSIXct(paste(dd.agg$yr, dd.agg$mo, "01", sep = "-"))
A bit late to the game, but another option would be using data.table:
library(data.table)
# Mean amount per year and month number; month() returns the number, whereas
# months() returns the month name.
setDT(temp)[, .(mn_amt = mean(amount)), by = .(yr = year(date), mon = month(date))]
# or if you want to apply the 'mean' function to several columns:
# setDT(temp)[, lapply(.SD, mean), by=.(year(date), month(date))]
this gives:
yr mon mn_amt
1: 2011 2 42.610
2: 2011 3 23.195
3: 2011 4 61.891
If you want names instead of numbers for the months, you can use:
setDT(temp)[, date := as.IDate(date)
][, .(mn_amt = mean(amount)), by = .(yr = year(date), mon = months(date))]
this gives:
yr mon mn_amt
1: 2011 februari 42.610
2: 2011 maart 23.195
3: 2011 april 61.891
As you see this will give the month names in your system language (which is Dutch in my case).
Or using a combination of lubridate and dplyr:
temp %>%
group_by(yr = year(date), mon = month(date)) %>%
summarise(mn_amt = mean(amount))
Used data:
# example data (modified the OP's data a bit)
temp <- structure(list(date = structure(1:6, .Label = c("2011-02-01", "2011-02-02", "2011-03-03", "2011-03-04", "2011-04-05", "2011-04-06"), class = "factor"),
amount = c(1.67, 83.55, 24.4, 21.99, 98.882, 24.9)),
.Names = c("date", "amount"), class = c("data.frame"), row.names = c(NA, -6L))
You can do it as:
short.date = strftime(temp$date, "%Y/%m")
aggr.stat = aggregate(temp$amount ~ short.date, FUN = sum)
Just use xts package for this.
library(xts)
ts <- xts(temp$amount, as.Date(temp$date, "%Y-%m-%d"))
# convert daily data
ts_m = apply.monthly(ts, FUN)
ts_y = apply.yearly(ts, FUN)
ts_q = apply.quarterly(ts, FUN)
where FUN is a function which you aggregate data with (for example sum)
Here's a dplyr option:
library(dplyr)
df %>%
mutate(date = as.Date(date)) %>%
mutate(ym = format(date, '%Y-%m')) %>%
group_by(ym) %>%
summarize(ym_mean = mean(x))
I have a function monyr that I use for this kind of stuff:
# Snap each date to the first day of its month.
#   x  anything as.POSIXlt() accepts (Date, POSIXct, ...).
# Returns a Date vector.
monyr <- function(x) {
  broken_down <- as.POSIXlt(x)
  # Overwrite the day-of-month field; year and month stay untouched.
  broken_down$mday <- 1
  first_of_month <- as.Date(broken_down)
  first_of_month
}
n <- as.Date(1:500, "1970-01-01")
nn <- monyr(n)
You can change the as.Date at the end to as.POSIXct to match the date format in your data. Summarising by month is then just a matter of using aggregate/by/etc.
One more solution:
rowsum(temp$amount, format(temp$date,"%Y-%m"))
For plot you could use barplot:
barplot(t(rowsum(temp$amount, format(temp$date,"%Y-%m"))), las=2)
Also, given that your time series seem to be in xts format, you can aggregate your daily time series to a monthly time series using the mean function like this:
# Collapse a daily series to monthly means.
#   x  a time-indexed series whose index zoo::index() can extract.
d2m <- function(x) {
  # Label every observation with its "YYYY-MM" month, then average per label.
  month_key <- format(as.Date(zoo::index(x)), "%Y-%m")
  aggregate(x, month_key, FUN=mean)
}