ggplot(Price.data['2000-01/2015-12'],aes(x=Demand,y=Price))+geom_point()+geom_smooth(method=lm)
indexClass(Price.data)
[1] "Date"
How to plot only March, April and June data from year 2010-2014?
head(Price.data)
Dry_Gas_Y Power_Gas_Y Price_Gas_Y
1990-01-01 52.16720 5.469179 2.39
1990-02-01 51.45287 5.470755 1.90
1990-03-01 49.29829 6.908609 1.55
1990-04-01 48.29243 7.721371 1.49
1990-05-01 47.25959 9.154057 1.47
1990-06-01 47.48744 11.525595 1.47
library(tidyverse)
Price.data %>%
mutate(year = as.numeric(format(Date, "%Y")),
month = as.numeric(format(Date, "%m"))) %>%
filter(year > 2009 & year < 2015, month == 3 | month == 4 | month ==6) %>%
ggplot(aes(Demand,Price))+geom_point()+geom_smooth(method=lm)
From your example I didn't see the dates having a column name and it looks like the dates are the row names. For this reason, this example creates a 'Date' column, then 'Month' & 'Year' columns for you to then filter the dates.
library(lubridate)
library(dplyr
plot_months <- Price.data%>%
mutate(Date = row.names(.),
Month = month(Date),
Year = year(Date))%>%
filter(Month %in% c(3,4,6),
Year %in% c(2009:2014))
ggplot(plot_months, aes(x=Demand,y=Price))+
geom_point()+
geom_smooth(method=lm)
You can use data.table, which is likely to be the fastest solution
library(data.table)
# convert your dataset into a data.table
setDT(df)
# If necessary, get date column into date format
# df[ , Date := as.Date(df$Date, "%m-%d-%y") ]
# Create separate columns for year and month
df[, year := year(Date)][, month := month(Date)]
# filter dataset
df <- df[ month %in% c(3,4,6) & year %in% c(2009:2014), ]
# subset(df, month %in% c(3,4,6) & year %in% c(2009:2014) ) # you could also use a simple subset, but this is likely to be slower
Plot
ggplot(data=df, aes(x=Demand, y=Price)) + geom_point() + geom_smooth(method=lm)
Since Price.data is an xts object, you can use the .indexmon function to extract the months you want to plot. Then use range-based subsetting to extracting the range of years you want.
Note that .indexmon returns the months starting with January = 0, like the $mon element of POSIXlt objects.
ggplot(Price.data[.indexmon(Price.data) %in% c(2, 3, 5)]['2010/2014'],
aes(x=Dry_Gas_Y, y=Price_Gas_Y)) + geom_point() + geom_smooth(method=lm)
Related
let's say I have a list of dates from March 1st to July 15th:
daterange = as.data.frame(seq(as.Date("2020-3-1"), as.Date("2020-7-15"), "days"))
I want to group the dates by 1-15 and 16-30/31 for each month. So the dates in March will be separated into two groups: Mar 1-15 and Mar 16-31. Then keep doing this for every month.
I know the lubridate package can sort by week, but I don't know how to set a custom range.
Thanks
We can create a logical vector on day as well as a group on yearmon
library(dplyr)
library(zoo)
library(lubridate)
library(stringr)
daterange2 <- daterange %>%
set_names('Date') %>%
group_by(yearmon = as.yearmon(Date),
Daygroup = (day(Date) > 15) + 1) %>%
mutate(Label = str_c(format(Date, '%b'),
str_c(min(day(Date)), max(day(Date)), sep='-'), sep= ' '))
Using base R, you can create two groups in each month by pasting the month value from each date and assign value 1/2 based on the date.
newdaterange <- transform(daterange, group = paste0(format(date, "%b"), '-group-',
ifelse(as.integer(format(date, "%d")) > 15, 1, 2)))
I have a continuous list of dates (yyyy-mm-dd) from 1985 to 2018 in one column (Colname = date). What I wish to do is generate another column which outputs a water season and year given the date.
To make it clearer I have two water season:
Summer = yyyy-04-01 to yyyy-09-31;
Winter = yyyy-10-01 to yyyy(+1)-03-31.
So for 2018 - Summer = 2018-04-01 to 2018-09-31; Winter 2018-10-01 to 2019-03-31.
What I would like to output is something like the following:
Many thanks.
A tidy verse approach
library(tidyverse)
df <-tibble(date = seq(from = as.Date('2000-01-01'), to = as.Date('2001-12-31'), by = '1 month'))
df
df %>%
mutate(water_season_year = case_when(
lubridate::month(date) %in% c(4:9) ~str_c('Su_', lubridate::year(date)),
lubridate::month(date) %in% c(10:12) ~str_c('Wi_', lubridate::year(date)),
lubridate::month(date) %in% c(1:3)~str_c('Wi_', lubridate::year(date) -1),
TRUE ~ 'Error'))
You can compare just the month part of the data to get the season, in base R consider doing
month <- as.integer(format(df$date, "%m"))
year <- format(df$date, "%Y")
inds <- month >= 4 & month <= 9
df$water_season_year <- NA
df$water_season_year[inds] <- paste("Su", year[inds], sep = "_")
df$water_season_year[!inds] <- paste("Wi", year[!inds], sep = "_")
#To add previous year for month <= 3 do
df$water_season_year[month <= 3] <- paste("Wi",
as.integer(year[month <= 3]) - 1, sep = "_")
df
# date water_season_year
#1 2019-01-03 Wi_2019
#2 2000-06-01 Su_2000
Make sure that date variable is of "Date" class.
data
df <-data.frame(date = as.Date(c("2019-01-03", "2000-06-01")))
I have dataframe that has 6000 locations. For each location, I have 36 years daily data of rainfall in wide format.
A sample data:
set.seed(123)
mat <- matrix(round(rnorm(6000*36*365), digits = 2),nrow = 6000*36, ncol = 365)
dat <- data.table(mat)
names(dat) <- rep(paste0("d_",1:365))
dat$loc.id <- rep(1:6000, each = 36)
dat$year <- rep(1980:2015, times = 6000)
What I want to do is for each location, generate the long term average rainfall for each month. For e.g. for loc.id = 1, mean rainfall in Jan, Feb, March... Dec.
Let' say this data is called df which is a data table
library(dplyr)
Here's what I did:
loc.list <- unique(dat$loc.id)
my.list <- list() # a list to store results
ptm <- proc.time()
for(i in seq_along(loc.list)){
n <- loc.list[i]
df1 <- dat[dat$loc.id == n,]
df2 <- gather(df1, day, rain, -year) # this melts the data in long format
df3 <- df2 %>% mutate(day = gsub("d_","", day)) %>% # since the day column was in "d_1" format, I converted into integer (1,2,3..365)
mutate(day = as.numeric(as.character(day))) %>% # ensure that day column is numeric. For some reasonson, some NA.s appear.
arrange(year,day) %>% # ensure that they are arranged in order
mutate(month = strptime(paste(year, day), format = "%Y %j")$mon + 1) %>% # assing each day to a month
group_by(year,month) %>% # group by year and month
summarise(month.rain = sum(rain)) %>% # calculate for each location, year and month, total rainfall
group_by(month) %>% # group by month
summarise(month.mean = round(mean(month.rain), digits = 2)) # calculate for each month, the long term mean
my.list[[i]] <- df3
}
proc.time() - ptm
user system elapsed
1036.17 0.20 1040.68
I wanted to ask if there are more efficient and faster way to achieve this task
Another data.table alternative:
# change column names to month, grabbed from 365 dates of a non-leap year
setnames(dat, c(format(as.Date("2017-01-01") + 0:364, "%b"),
"loc.id", "year"))
# melt to long format
d <- melt(dat, id.vars = c("loc.id", "year"),
variable.name = "month", value.name = "rain")
# calculate mean rain by location and month
d2 <- d[ , .(mean_rain = mean(rain)), by = .(loc, month)]
This seems ~7 times faster than the answer by caw5cs. The result by Martin Morgan is in a different format though, which prevents a direct comparison of timings.
If you rather have unique column names in 'dat', you may use %b_%d (month-day) instead of %b only. Then use substr in by to grab the month part:
# change column names to month_day, using 365 dates of a non-leap year
setnames(dat, c(format(as.Date("2017-01-01") + 0:364, "%b_%d"),
"loc.id", "year"))
# melt to long format
d <- melt(dat, id.vars = c("loc.id", "year"),
variable.name = "month_day", value.name = "rain")
# calculate mean rain by location and month
d2 <- d[ , .(mean_rain = mean(rain)), by = .(loc.id, month = substr(month_day, 1, 3))]
Use the cryptically named rowsum() to sum daily rainfall at each site, over all years
loc.id = rep(1:6000, each = 36)
daily.by.loc = rowsum(mat, loc.id)
and use the same trick on the transposed matrix to sum by month (since there are 365 columns leap years must be ignored).
month = factor(
months(as.Date(0:364, origin="1970-01-01")),
levels = month.name
)
loc.by.month = rowsum(t(daily.by.loc), month)
Calculate the average by dividing by number of observations; R's column-major matrix representation and recycling rules apply. Transpose so the orientation is the same as the data.
days.per.month = tabulate(month)
ans = t(loc.by.month / (36 * days.per.month))
The result is a 6000 x 12 matrix
> dim(ans)
[1] 6000 12
> head(ans, 3)
January February March April May June
1 0.01554659 0.002043651 -0.02950717 -0.02700926 0.003521505 -0.011268519
2 0.04953405 0.032926587 -0.04959677 0.02808333 0.022051971 0.009768519
3 -0.01125448 -0.023343254 -0.02672939 0.04012963 0.018530466 0.035583333
July August September October November December
1 0.009874552 -0.030824373 -0.04958333 -0.03366487 -0.07390741 -0.07899642
2 -0.011630824 -0.003369176 -0.00100000 -0.00594086 -0.02817593 -0.01161290
3 0.031810036 0.059641577 -0.01109259 0.04646953 -0.01601852 0.03103943
in less than a second.
Grossly misread the question the first time, oops! Seems to be working as intended this time.
library(data.table)
set.seed(123)
mat <- matrix(round(rnorm(6000*36*365), digits = 2),nrow = 6000*36, ncol = 365)
dat <- data.table(mat)
names(dat) <- rep(paste0("d_",1:365))
dat$loc.id <- rep(1:6000, each = 36)
dat$year <- rep(1980:2015, times = 6000)
system.time({
# convert to long format with month # as column name
date_cols <- colnames(dat)[1:365]
setnames(dat, date_cols, as.character(1:365))
dat.long <- melt(dat, measure.vars=as.character(1:365), variable="day", value="rainfall")
# R date starts at 0 for Jan 1, so we offset the day by 1
dat.long[, day := as.numeric(day) - 1]
setkey(dat.long, year, day)
# Make table for merging year/day/month
months <- CJ(year=1980:2015, day=0:365)
months[, date := as.Date(day, origin=paste(year, "-01-01", sep=""))]
months[, month := tstrsplit(date, "-")[2]]
setkey(months, year, day)
# Merge tables to get month column
dat.merge <- merge(dat.long, months)
# aggregate by location an dmonth
dat.ag <- dat.merge[, list(mean_rainfall = mean(rainfall)), by=list(loc.id, month)]
})
Yielding
user system elapsed
14.420 4.205 18.626
> dat.ag
loc.id month mean_rainfall
1: 1 01 0.015546595
2: 2 01 0.049534050
3: 3 01 -0.011254480
4: 4 01 -0.019453405
5: 5 01 0.005860215
---
71996: 5996 12 0.027407407
71997: 5997 12 0.020334237
71998: 5998 12 0.043360434
71999: 5999 12 -0.006856369
72000: 6000 12 0.040542005
I need to aggregate multiple months from original data with dataframe in R, e.g: data frame with datetime include 2017 and 2018.
date category amt
1 2017-08-05 A 0.1900707
2 2017-08-06 B 0.2661277
3 2017-08-07 c 0.4763196
4 2017-08-08 A 0.5183718
5 2017-08-09 B 0.3021019
6 2017-08-10 c 0.3393616
What I want is to sum based on 6 month period and category:
period category sum
1 2017_secondPeriod A 25.00972
2 2018_firstPeriod A 25.59850
3 2017_secondPeriod B 24.96924
4 2018_firstPeriod B 24.79649
5 2017_secondPeriod c 20.17096
6 2018_firstPeriod c 27.01794
What I did:
1. select the last 6 months of 2017, like wise 2018
2. add a new column for each subset to indicate the period
3. Combine 2 subset again
4. aggregate
as following:
library(lubridate)
df <- data.frame(
date = today() + days(1:300),
category = c("A","B","c"),
amt = runif(300)
)
df2017_secondHalf <- subset(df, month(df$date) %in% c(7,8,9,10,11,12) & year(df$date) == 2017)
f2018_firstHalf <- subset(df, month(df$date) %in% c(1,2,3,4,5,6) & year(df$date) == 2018)
sum1 <- aggregate(df2017_secondHalf$amt, by=list(Category=df2017_secondHalf$Category), FUN=sum)
sum2 <- aggregate(df2018_firstHalf$amt, by=list(Category=df2018_secondHalf$Category), FUN=sum)
df2017_secondHalf$period <- '2017_secondPeriod'
df2018_firstHalf$period <- '2018_firstPeriod'
aggregate(x = df$amt, by = df[c("period", "category")], FUN = sum)
I try to figure out but did not know how to aggregate multple months e.g, 3 months, or 6 months.
Thanks in advance
Any suggesstion?
With lubridate and tidyverse (dplyr & magrittr)
First, let's create groups with Semesters, Quarter, and "Trimonthly".
library(tidyverse)
library(lubridate)
df <- df %>% mutate(Semester = semester(date, with_year = TRUE),
Quarter = quarter(date, with_year = TRUE),
Trimonthly = round_date(date, unit = "3 months" ))
Lubridate's semester() breaks by semsters and gives you a 1 (Jan-Jun) or 2 (Jul-Aug); quarter() does a similar thing with quarters.
I add a third, the more basic round_date function, where you can specify your time frame in the form of size and time units. It yields the first date of such time frame. I deliberately name it "Trimonthly" so you can see how it compares to quarter()
Pivot.Semester <- df %>%
group_by(Semester, category) %>%
summarise(Semester.sum = sum(amt))
Pivot.Quarter <- df %>%
group_by(Quarter, category) %>%
summarise(Quarter.sum = sum(amt))
Pivot.Trimonthly <- df %>%
group_by(Trimonthly, category) %>%
summarise(Trimonthly.sum = sum(amt))
Pivot.Semester
Pivot.Quarter
Pivot.Trimonthly
Optional: If you want to join the summarised data to the original DF.
df <- df %>% left_join(Pivot.Semester, by = c("category", "Semester")) %>%
left_join(Pivot.Quarter, by = c("category", "Quarter")) %>%
left_join(Pivot.Trimonthly, by = c("category", "Trimonthly"))
df
Here is a 3 line solution that uses no package. Let k be the number of months in a period. For half year periods k is 6. For quarter year periods k would be 3, etc. Replace 02 in the sprintf format with 1 if you want one digit suffices (but not for monthly since those must be two digit). Further modify the sprintf format if you want it to exactly match the question.
k <- 6
period <- with(as.POSIXlt(DF$date), sprintf("%d-%02d", year + 1900, (mon %/% k) + 1))
aggregate(amt ~ category + period, DF, sum)
giving:
category period amt
1 A 2017-02 0.7084425
2 B 2017-02 0.5682296
3 c 2017-02 0.8156812
At the expense of using one package we can simplify the quarterly and monthly calculations by replacing the formula for period with one of these:
library(zoo)
# quarterly
period <- as.yearqtr(DF$date)
# monthly
period <- as.yearmon(DF$date)
Note: The input in reproducible form is:
Lines <- "date category amt
1 2017-08-05 A 0.1900707
2 2017-08-06 B 0.2661277
3 2017-08-07 c 0.4763196
4 2017-08-08 A 0.5183718
5 2017-08-09 B 0.3021019
6 2017-08-10 c 0.3393616"
DF <- read.table(text = Lines)
DF$date <- as.Date(DF$date)
I don't often have to work with dates in R, but I imagine this is fairly easy. I have a column that represents a date in a dataframe. I simply want to create a new dataframe that summarizes a 2nd column by Month/Year using the date. What is the best approach?
I want a second dataframe so I can feed it to a plot.
Any help you can provide will be greatly appreciated!
EDIT: For reference:
> str(temp)
'data.frame': 215746 obs. of 2 variables:
$ date : POSIXct, format: "2011-02-01" "2011-02-01" "2011-02-01" ...
$ amount: num 1.67 83.55 24.4 21.99 98.88 ...
> head(temp)
date amount
1 2011-02-01 1.670
2 2011-02-01 83.550
3 2011-02-01 24.400
4 2011-02-01 21.990
5 2011-02-03 98.882
6 2011-02-03 24.900
I'd do it with lubridate and plyr, rounding dates down to the nearest month to make them easier to plot:
library(lubridate)
df <- data.frame(
date = today() + days(1:300),
x = runif(300)
)
df$my <- floor_date(df$date, "month")
library(plyr)
ddply(df, "my", summarise, x = mean(x))
There is probably a more elegant solution, but splitting into months and years with strftime() and then aggregate()ing should do it. Then reassemble the date for plotting.
x <- as.POSIXct(c("2011-02-01", "2011-02-01", "2011-02-01"))
mo <- strftime(x, "%m")
yr <- strftime(x, "%Y")
amt <- runif(3)
dd <- data.frame(mo, yr, amt)
dd.agg <- aggregate(amt ~ mo + yr, dd, FUN = sum)
dd.agg$date <- as.POSIXct(paste(dd.agg$yr, dd.agg$mo, "01", sep = "-"))
A bit late to the game, but another option would be using data.table:
library(data.table)
setDT(temp)[, .(mn_amt = mean(amount)), by = .(yr = year(date), mon = months(date))]
# or if you want to apply the 'mean' function to several columns:
# setDT(temp)[, lapply(.SD, mean), by=.(year(date), month(date))]
this gives:
yr mon mn_amt
1: 2011 februari 42.610
2: 2011 maart 23.195
3: 2011 april 61.891
If you want names instead of numbers for the months, you can use:
setDT(temp)[, date := as.IDate(date)
][, .(mn_amt = mean(amount)), by = .(yr = year(date), mon = months(date))]
this gives:
yr mon mn_amt
1: 2011 februari 42.610
2: 2011 maart 23.195
3: 2011 april 61.891
As you see this will give the month names in your system language (which is Dutch in my case).
Or using a combination of lubridate and dplyr:
temp %>%
group_by(yr = year(date), mon = month(date)) %>%
summarise(mn_amt = mean(amount))
Used data:
# example data (modified the OP's data a bit)
temp <- structure(list(date = structure(1:6, .Label = c("2011-02-01", "2011-02-02", "2011-03-03", "2011-03-04", "2011-04-05", "2011-04-06"), class = "factor"),
amount = c(1.67, 83.55, 24.4, 21.99, 98.882, 24.9)),
.Names = c("date", "amount"), class = c("data.frame"), row.names = c(NA, -6L))
You can do it as:
short.date = strftime(temp$date, "%Y/%m")
aggr.stat = aggregate(temp$amount ~ short.date, FUN = sum)
Just use xts package for this.
library(xts)
ts <- xts(temp$amount, as.Date(temp$date, "%Y-%m-%d"))
# convert daily data
ts_m = apply.monthly(ts, FUN)
ts_y = apply.yearly(ts, FUN)
ts_q = apply.quarterly(ts, FUN)
where FUN is a function which you aggregate data with (for example sum)
Here's a dplyr option:
library(dplyr)
df %>%
mutate(date = as.Date(date)) %>%
mutate(ym = format(date, '%Y-%m')) %>%
group_by(ym) %>%
summarize(ym_mean = mean(x))
I have a function monyr that I use for this kind of stuff:
monyr <- function(x)
{
x <- as.POSIXlt(x)
x$mday <- 1
as.Date(x)
}
n <- as.Date(1:500, "1970-01-01")
nn <- monyr(n)
You can change the as.Date at the end to as.POSIXct to match the date format in your data. Summarising by month is then just a matter of using aggregate/by/etc.
One more solution:
rowsum(temp$amount, format(temp$date,"%Y-%m"))
For plot you could use barplot:
barplot(t(rowsum(temp$amount, format(temp$date,"%Y-%m"))), las=2)
Also, given that your time series seem to be in xts format, you can aggregate your daily time series to a monthly time series using the mean function like this:
d2m <- function(x) {
aggregate(x, format(as.Date(zoo::index(x)), "%Y-%m"), FUN=mean)
}