How to apply for loop with ddply function? - r

I want to calculate the number of days in each month with rainfall >= 2.5 mm for every column. I was able to calculate it for a single column after taking help from this post like
require(seas)
library (zoo)
data(mscdata)
dat.int <- (mksub(mscdata, id=1108447))
dat.int$yearmon <- as.yearmon(dat.int$date, "%b %y")
require(plyr)
rainydays_by_yearmon <- ddply(dat.int, .(yearmon), summarize, rainy_days=sum(rain >= 1.0) )
print.data.frame(rainydays_by_yearmon)
Now I want to apply it for all the columns. I have tried the following code
for(i in 1:length(dat.int)){
y1 <- dat.int[[i]]
rainydays <- ddply(dat.int, .(yearmon), summarize, rainy_days=sum(y1 >= 2.5))
if(i==1){
m1 <- rainydays
}
else{
m1 <- cbind(rainydays, m1)
}
print(i)
}
m1
But I am unable to get the desired results. Please help me out!!!

I would use dplyr and tidyr from the tidyverse instead. pivot_longer puts the data into long form, which is easier to manipulate. pivot_wider makes it wide again (probably unnecessary, depending on your next step).
library(seas)
library(tidyverse)
library(zoo)
data(mscdata)
dat.int <- mksub(mscdata, id = 1108447)
# Count, for every month and every measured variable, the days whose value
# reaches the 2.5 threshold.
dat.int %>%
  as_tibble() %>% # for easier viewing
  mutate(yearmon = as.yearmon(dat.int$date, "%b %y")) %>%
  select(-date, -year, -yday) %>%
  # Long form: one row per (yearmon, variable, value).
  pivot_longer(cols = -yearmon, names_to = "variable", values_to = "value") %>%
  group_by(yearmon, variable) %>%
  # `>=` (not `>`) so that days with exactly 2.5 are counted, as the question
  # asks. A month containing missing values yields NA; pass na.rm = TRUE to
  # sum() to count only the observed days instead.
  summarise(rainy_days = sum(value >= 2.5)) %>%
  # Back to wide form: one column of counts per variable.
  pivot_wider(names_from = "variable", values_from = "rainy_days")

if you don't mind using the data.table library, see the solution below.
library('data.table')
library('seas')
setDT(mscdata)
mscdata[id == 1108447 & rain >= 2.5, .(rain_ge_2.5mm = .N),
by = .(year, month = format(date, "%m"))]
Output
# year month rain_ge_2.5mm
# 1: 1975 01 12
# 2: 1975 02 8
# 3: 1975 03 10
# 4: 1975 04 2
# 5: 1975 05 4
# ---
# 350: 2004 07 2
# 351: 2004 08 5
# 352: 2004 10 10
# 353: 2004 11 14
# 354: 2004 12 14
If you want to process all ids, then you can group data by id as below.
For rain only:
mscdata[, .(rain_ge_2.5mm = sum(rain >= 2.5)),
by = .(id, year, month = format(date, "%m"))]
For rain, snow, and precip
mscdata[, .(rain_ge_2.5mm = sum(rain >= 2.5),
snow_ge_2 = sum(snow >= 2.0),
precip_ge_2 = sum(precip >= 2.0)),
by = .(id, year, month = format(date, "%m"))]
# id year month rain_ge_2.5mm snow_ge_2 precip_ge_2
# 1: 1096450 1975 01 1 10 9
# 2: 1096450 1975 02 0 5 3
# 3: 1096450 1975 03 1 9 9
# 4: 1096450 1975 04 1 2 3
# 5: 1096450 1975 05 5 1 6
# ---
# 862: 2100630 2000 07 NA NA 3
# 863: 2100630 2000 08 NA NA 8
# 864: 2100630 2000 09 NA NA 6
# 865: 2100630 2000 11 NA NA NA
# 866: 2100630 2001 01 NA NA NA

Related

Convert dataframe into long format one column in R

I have data as given in input section (dput below), need to convert to output with all values of two rows in one long column. I tried using transpose but cells were getting trimmed.
I don't want to hardcode since in future I might have data in 3 or 4 rows in a similar way.
P.S. - I also tried pivot_longer but it didn't help
structure(list(Header = c("Sat 12/3 \n358a-947a\n1017a-229p HRS 10.02",
"Sat 12/10 \n559a-1106a\n1134a-227p HRS 8.00"), X = c("Sun 12/4 ",
"Sun 12/11 "), X.1 = c("Mon 12/5 \n548a-1121a\n1149a-618p\n650p-845p HRS 13.95",
"Mon 12/12 \n500a-1121a\n1151a-547p\n616p-830p HRS 14.53"),
X.2 = c("Tue 12/6 \n359a-1120a\n1150a-400p HRS 11.53",
"Tue 12/13 \n548a-1120a\n1148a-449p HRS 10.54"), X.3 = c("Wed 12/7 \n548a-1119a\n1149a-515p HRS 10.95",
"Wed 12/14 \n429a-1120a\n1150a-432p HRS 11.56"), X.4 = c("Thu 12/8 \n549a-1120a\n1149a-447p HRS 10.48",
"Thu 12/15 \n429a-1121a\n1152a-431p HRS 11.52"), X.5 = c("Fri 12/9 \n548a-1120a\n1148a-218p HRS 8.03",
"Fri 12/16 \n430a-1120a\n1150a-432p HRS 11.55")), class = "data.frame", row.names = c(NA,
-2L))
My try (with a little help)
pivot_longer(df, cols = c(1:7)) %>%
select(value) %>%
mutate(value=str_replace(value,"HRS","")) %>%
separate(.,value,into=c("day","entry1","entry2","entry3"),sep="\n") %>%
separate(.,entry1,into=c("time_in1","time_out1"),sep="-") %>%
separate(.,entry2,into=c("time_in2","time_out2"),sep="-") %>%
separate(.,time_out2,into=c("time_out2","duration1"),remove = FALSE,sep=" ",extra = "merge") %>%
separate(.,entry3,into=c("time_in3","time_out3"),sep="-") %>%
separate(.,time_out3,into=c("time_out3","duration2"),remove = FALSE,sep=" ") %>%
mutate(duration=coalesce(duration1,duration2)) %>%
select(day, duration, time_in1,time_out1,time_in2,time_out2,time_in3,time_out3) %>%
separate(.,day,into=c("date","day"),extra="merge") %>%
mutate(day=mdy(paste0(day,"2021")),
duration=str_trim(duration))
Approach
The key was tidyr::separate_rows(), which not only separates the cell by "\n" but also splits the components into rows rather than columns.
Here, it is much better to split into rows than into columns. Suppose that most cells have 2 or 3 entries separated by "\n"; but there is a "rogue" cell, with an unusually large number (say 9) of entries, generated by someone who repeatedly clocked in and out throughout the day.
While splitting into columns would create arbitrarily many time_in* | time_out* columns, which remain empty (NA) in all rows except the "rogue"
date day duration time_in1 time_out1 time_in2 time_out2 time_in3 time_out3 time_in4 time_out4 time_in5 time_out5 time_in6 time_out6 time_in7 time_out7 time_in8 time_out8 time_in9 time_out9
<chr> <date> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# ... ... ... ... ... ... ... ... ... NA NA NA NA NA NA NA NA NA NA NA NA
splitting into rows will maintain a tame (and stable) columnar structure
date day duration time_in time_out
<date> <chr> <dbl> <chr> <chr>
# ... ... ... ... ...
# ... ... ... ... ...
# ... ... ... ... ...
without any "extraneous" columns (or rows).
Solution
Given your sample data df
df <- structure(list(Header = c("Sat 12/3 \n358a-947a\n1017a-229p HRS 10.02", "Sat 12/10 \n559a-1106a\n1134a-227p HRS 8.00"),
X = c("Sun 12/4 ", "Sun 12/11 "),
X.1 = c("Mon 12/5 \n548a-1121a\n1149a-618p\n650p-845p HRS 13.95", "Mon 12/12 \n500a-1121a\n1151a-547p\n616p-830p HRS 14.53"),
X.2 = c("Tue 12/6 \n359a-1120a\n1150a-400p HRS 11.53", "Tue 12/13 \n548a-1120a\n1148a-449p HRS 10.54"),
X.3 = c("Wed 12/7 \n548a-1119a\n1149a-515p HRS 10.95", "Wed 12/14 \n429a-1120a\n1150a-432p HRS 11.56"),
X.4 = c("Thu 12/8 \n549a-1120a\n1149a-447p HRS 10.48", "Thu 12/15 \n429a-1121a\n1152a-431p HRS 11.52"),
X.5 = c("Fri 12/9 \n548a-1120a\n1148a-218p HRS 8.03", "Fri 12/16 \n430a-1120a\n1150a-432p HRS 11.55")),
class = "data.frame", row.names = c(NA, -2L))
the following workflow
library(tidyverse)
library(stringr)
# ...
# Code to generate 'df'.
# ...
year_observed <- 2016
results <- df %>%
mutate(id = row_number()) %>%
pivot_longer(!id, names_to = "column") %>%
separate(value, into = c("date", "entries"), sep = "\n", fill = "right", extra = "merge", remove = TRUE) %>%
separate(entries, into = c("times", "duration"), sep = "HRS", fill = "right", extra = "warn", remove = TRUE) %>%
mutate(across(date:duration, trimws),
date = as.Date(paste(str_extract(date, "\\d{1,2}/\\d{1,2}$"), year_observed, sep = "/"), format = "%m/%d/%Y"),
duration = as.numeric(duration),
duration = if_else(is.na(duration), 0, duration),
day = format(date, format = "%a")) %>%
separate_rows(times, sep = "\n") %>%
separate(times, into = c("time_in", "time_out"), sep = "-", fill = "warn", extra = "warn", remove = TRUE) %>%
# ...Further Transformations... %>%
select(id, date, day, duration, time_in, time_out)
# View results.
results
should yield results like
# A tibble: 28 x 6
id date day duration time_in time_out
<int> <date> <chr> <dbl> <chr> <chr>
1 1 2016-12-03 Sat 10.0 358a 947a
2 1 2016-12-03 Sat 10.0 1017a 229p
3 1 2016-12-04 Sun 0 NA NA
4 1 2016-12-05 Mon 14.0 548a 1121a
5 1 2016-12-05 Mon 14.0 1149a 618p
6 1 2016-12-05 Mon 14.0 650p 845p
7 1 2016-12-06 Tue 11.5 359a 1120a
8 1 2016-12-06 Tue 11.5 1150a 400p
9 1 2016-12-07 Wed 11.0 548a 1119a
10 1 2016-12-07 Wed 11.0 1149a 515p
# ... with 18 more rows
where id identifies (by row number) the original record in df.
To pivot into your newly specified output, simply execute this code, or append it to the existing workflow:
wide_results <- results %>%
group_by(id, date) %>% mutate(entry = row_number()) %>% ungroup() %>%
pivot_wider(id_cols = c(date, day, duration), names_from = entry, names_glue = "{.value}_{entry}", values_from = c(time_in, time_out)) %>%
# Select so as to alternate between 'time_in_*' and 'time_out_*'.
select(order(as.numeric(str_extract(colnames(.), "\\d+$")), str_extract(colnames(.), "^time_(in|out)"), na.last = FALSE))
# View results.
wide_results
You should obtain wide_results like:
# A tibble: 14 x 9
date day duration time_in_1 time_out_1 time_in_2 time_out_2 time_in_3 time_out_3
<date> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2016-12-03 Sat 10.0 358a 947a 1017a 229p NA NA
2 2016-12-04 Sun 0 NA NA NA NA NA NA
3 2016-12-05 Mon 14.0 548a 1121a 1149a 618p 650p 845p
4 2016-12-06 Tue 11.5 359a 1120a 1150a 400p NA NA
5 2016-12-07 Wed 11.0 548a 1119a 1149a 515p NA NA
6 2016-12-08 Thu 10.5 549a 1120a 1149a 447p NA NA
7 2016-12-09 Fri 8.03 548a 1120a 1148a 218p NA NA
8 2016-12-10 Sat 8 559a 1106a 1134a 227p NA NA
9 2016-12-11 Sun 0 NA NA NA NA NA NA
10 2016-12-12 Mon 14.5 500a 1121a 1151a 547p 616p 830p
11 2016-12-13 Tue 10.5 548a 1120a 1148a 449p NA NA
12 2016-12-14 Wed 11.6 429a 1120a 1150a 432p NA NA
13 2016-12-15 Thu 11.5 429a 1121a 1152a 431p NA NA
14 2016-12-16 Fri 11.6 430a 1120a 1150a 432p NA NA
Note
You must supply the year_observed (here 2016) to correctly contextualize the dates written in m/d format. Otherwise, they will calibrate to the year 2021, which will skew the days of the week.
Warning
These dates (12/3, etc.) are in December, and close to the end of the calendar year. If any of these entries "cross over" (from 2016) into the next year (ex. 1/1/2017), they will be incorrectly calibrated to the former year (ex. 1/1/2016), and thus have an incorrect date and weekday.
However, if your dates do cross over, that's a good indication that the full date (12/3/2016) should have been notated in the original cells, in which case
results <- df %>%
# ... %>%
mutate(
# ...
date = as.Date(str_extract(date, "(\\d{1,2}/){2,2}\\d{4,4}$"), format = "%m/%d/%Y")
# ...
) # ... %>%
would have sufficed to properly parse the dates.

Missing data in R - How to skip grouping of days with missing information?

I have hourly values of temperature measurements and I wish to calculate the average per day only for complete (i.e. with 24 measurements) days. Incomplete days would then be summarized as "NA".
I have grouped the values together per year, month and day and call summarize().
I have three months of data missing which appears as a gap in my ggplot function and which is what I want to achieve with the rest. The problem is that when I call summarize() to calculate the mean of my values, days with only 1 or 2 measurements also get averaged. Only those with all missing values (24) appear as "NA".
Date TempUrb TempRur UHI
1 2011-03-21 22:00:00 10.1 11.67000 -1.570000
2 2011-03-21 23:00:00 9.9 11.67000 -1.770000
3 2011-03-22 00:00:00 10.9 11.11000 -0.210000
4 2011-03-22 01:00:00 10.7 10.56000 0.140000
5 2011-03-22 02:00:00 9.7 10.00000 -0.300000
6 2011-03-22 03:00:00 9.5 10.00000 -0.500000
7 2011-03-22 04:00:00 9.4 8.89000 0.510000
8 2011-03-22 05:00:00 8.4 8.33500 0.065000
9 2011-03-22 06:00:00 8.2 7.50000 0.700000
AvgUHI <- UHI %>% group_by(year(Date), add = TRUE) %>%
group_by(month(Date), add = TRUE) %>%
group_by(day(Date), add = TRUE, .drop = TRUE) %>%
summarize(AvgUHI = mean(UHI, na.rm = TRUE))
# A tibble: 2,844 x 4
# Groups: year(Date), month(Date) [95]
`year(Date)` `month(Date)` `day(Date)` AvgUHI
<int> <int> <int> <dbl>
1476 2015 4 4 0.96625000
1477 2015 4 5 -0.11909722
1478 2015 4 6 -0.60416667
1479 2015 4 7 -0.92916667
1480 2015 4 8 NA
1481 2015 4 9 NA
AvgUHI<- AvgUHI %>% group_by(`year(Date)`, add = TRUE) %>%
group_by(`month(Date)`, add = TRUE) %>%
summarize(AvgUHI= mean(AvgUHI, na.rm = TRUE))
# A tibble: 95 x 3
# Groups: year(Date) [9]
`year(Date)` `month(Date)` AvgUHI
<int> <int> <dbl>
50 2015 4 0.580887346
51 2015 5 0.453815051
52 2015 6 0.008479618
As you can see above on the final table, I have an average for 04-2015, while I am missing data on that month (08 - 09/04/2015 on this example represented on the second table).
The same happens when I calculate AvgUHI and I'm missing hourly data.
I simply would like to see on the last table the AvgUHI for 04-2015 be NA.
E.g: of my graph1
The following will give a dataframe aggregated by day, where only the complete days, with 24 observations, are not NA. Then you can group by month to have the final dataframe.
UHI %>%
mutate(Day = as.Date(Date)) %>%
group_by(Day) %>%
mutate(n = n(), tmpUHI = if_else(n == 24, UHI, NA_real_)) %>%
summarize(AvgUHI = mean(tmpUHI)) %>%
full_join(data.frame(Day = seq(min(.$Day), max(.$Day), by = "day"))) %>%
arrange(Day) -> AvgUHI
For hours look at Rui Barradas' answer. For months the following code worked:
# Monthly mean of the daily averages. Months with more than 10 missing daily
# values are blanked out before averaging.
AvgUHI <- AvgUHI %>%
  # A single group_by() replaces the chained calls that relied on the `add`
  # argument, which dplyr 1.0.0 deprecated in favour of `.add`.
  group_by(year(Day), month(Day)) %>%
  mutate(
    n_missing = sum(is.na(AvgUHI)),
    tmpUHI = if_else(n_missing <= 10, AvgUHI, NA_real_)
  ) %>%
  summarise(AvgUHI = mean(tmpUHI, na.rm = TRUE))

How do I group my date variable into month/year in R?

I have a "date" vector, that contains dates in mm/dd/yyyy format:
head(Entered_Date,5)
[1] 1/5/1998 1/5/1998 1/5/1998 1/5/1998 1/5/1998
I am trying to plot a frequency variable against the date, but I want to group the dates that it is by month or year. As it is now, there is a frequency per day, but I want to plot the frequency by month or year. So instead of having a frequency of 1 for 1/5/1998, 1 for 1/7/1998, and 3 for 1/8/1998, I would like to display it as 5 for 1/1998. It is a relatively large data set, with dates from 1998 to present, and I would like to find some automated way to accomplish this.
> dput(head(Entered_Date))
structure(c(260L, 260L, 260L, 260L, 260L, 260L), .Label = c("1/1/1998",
"1/1/1999", "1/1/2001", "1/1/2002", "1/10/2000", "1/10/2001",
"1/10/2002", "1/10/2003", "1/10/2005", "1/10/2006", "1/10/2007",
"1/10/2008", "1/10/2011", "1/10/2012", "1/10/2013", "1/11/1999",
"1/11/2000", "1/11/2001", "1/11/2002", "1/11/2005", "1/11/2006",
"1/11/2008", "1/11/2010", "1/11/2011", "1/11/2012", "1/11/2013",
"1/12/1998", "1/12/1999", "1/12/2001", "1/12/2004", "1/12/2005", ...
The floor_date() function from the lubridate package does this nicely.
data %>%
group_by(month = lubridate::floor_date(date, "month")) %>%
summarize(summary_variable = sum(value))
Thanks to Roman Cheplyaka
https://ro-che.info/articles/2017-02-22-group_by_month_r
See more on how to use the function: https://lubridate.tidyverse.org/reference/round_date.html
Here is an example using dplyr. You simply use the corresponding date format string for month %m or year %Y in the format statement.
set.seed(123)
df <- data.frame(date = seq.Date(from =as.Date("01/01/1998", "%d/%m/%Y"),
to=as.Date("01/01/2000", "%d/%m/%Y"), by="day"),
value = sample(seq(5), 731, replace = TRUE))
head(df)
date value
1 1998-01-01 2
2 1998-01-02 4
3 1998-01-03 3
4 1998-01-04 5
5 1998-01-05 5
6 1998-01-06 1
library(dplyr)
df %>%
mutate(month = format(date, "%m"), year = format(date, "%Y")) %>%
group_by(month, year) %>%
summarise(total = sum(value))
Source: local data frame [25 x 3]
Groups: month [?]
month year total
(chr) (chr) (int)
1 01 1998 105
2 01 1999 91
3 01 2000 3
4 02 1998 74
5 02 1999 77
6 03 1998 96
7 03 1999 86
8 04 1998 91
9 04 1999 95
10 05 1998 93
.. ... ... ...
Just to add to @cdeterman's answer, you can use lubridate along with dplyr to make this even easier:
df <- data.frame(date = seq.Date(from =as.Date("01/01/1998", "%d/%m/%Y"),
to=as.Date("01/01/2000", "%d/%m/%Y"), by="day"),
value = sample(seq(5), 731, replace = TRUE))
library(dplyr)
library(lubridate)
df %>%
mutate(month = month(date), year = year(date)) %>%
group_by(month, year) %>%
summarise(total = sum(value))
Maybe you just add a column in your data like this:
# Entered_Date is in mm/dd/yyyy form, so the month comes first in the format
# string; with "%d/%m/%Y" every date whose day is above 12 would parse to NA.
Year <- format(as.Date(Entered_Date, "%m/%d/%Y"), "%Y")
Don't need dplyr. Look at ?as.POSIXlt
# Convert to POSIXlt so the date components can be read as list fields.
df$date<-as.POSIXlt(df$date)
# $mon is the zero-based month (0 = January, ..., 11 = December).
mon<-df$date$mon
# $year counts years since 1900 (e.g. 98 for 1998).
yr<-df$date$year
# Build a "month/year" label per row and store it as a factor.
monyr<-as.factor(paste(mon,yr,sep="/"))
# Overwrites the original date column with the month/year factor.
df$date<-monyr
Don't need to use ggplot2 but its nice for this kind of thing.
c <- ggplot(df, aes(factor(date)))
c + geom_bar()
If you want to see the actual numbers
aggregate(. ~ date,data = df,FUN=length )
df2<-aggregate(. ~ date,data = df,FUN=length )
df2
date value
1 0/98 31
2 0/99 31
3 1/98 28
4 1/99 28
5 10/98 30
6 10/99 30
7 11/97 1
8 11/98 31
9 11/99 31
10 2/98 31
11 2/99 31
12 3/98 30
13 3/99 30
14 4/98 31
15 4/99 31
16 5/98 30
17 5/99 30
18 6/98 31
19 6/99 31
20 7/98 31
21 7/99 31
22 8/98 30
23 8/99 30
24 9/98 31
25 9/99 31
There is a super easy way using the cut() function:
# Named `dates` rather than `list` so that base::list() is not shadowed.
dates <- as.Date(c("1998-5-2", "1993-4-16", "1998-5-10"))
# Each date is mapped to the first day of its month, returned as a factor.
cut(dates, breaks = "month")
and you will get this:
[1] 1998-05-01 1993-04-01 1998-05-01
62 Levels: 1993-04-01 1993-05-01 1993-06-01 1993-07-01 1993-08-01 ... 1998-05-01
Another solution is slider::slide_period:
library(slider)
library(dplyr)
# Collapse the rows of one period into a single row: the period is labelled
# "YYYY-MM" from its latest date, and `value` is the total over the period.
monthly_summary <- function(data) {
  data %>%
    summarise(
      date = format(max(date), "%Y-%m"),
      value = sum(value)
    )
}
slide_period_dfr(df, df$date, "month", monthly_summary)
date value
1 1998-01 92
2 1998-02 82
3 1998-03 113
4 1998-04 94
5 1998-05 92
6 1998-06 74
7 1998-07 89
8 1998-08 92
9 1998-09 91
10 1998-10 100
...
There is also group_by(month_yr = cut(date, breaks = "1 month")), which relies on base R's cut() and does not need lubridate or other date packages.

Can I count dates grouped by year?

I've got some data that looks about like so:
demo <- read.table(text = "
date num
'12/31/2010' 35
'04/01/2013' 34
'06/02/2015' 34
'06/15/2015' 34
'01/30/2015' 33
'04/15/2014' 33
'05/28/2014' 33
'06/02/2014' 33
'06/17/2015' 33
'06/25/2015' 33
'06/24/2015' 32
'07/31/2013' 32
'08/31/2013' 32
'04/27/2015' 31
'05/07/2015' 31
'12/30/2013' 31
'11/21/2014' 30
'12/20/2013' 30
",header = TRUE, sep = "")
How do I group and count these by year?
2010 1
2013 5
etc.
I can use plyr to count each date: count(demo, vars = 'date'), but not group them.
I'd convert the dates to a date format first, rather than treating them as strings.
library(lubridate)
# Convert string to date format
demo$date <- as.Date(demo$date, "%m/%d/%Y")
# Table of counts by year
table(year(demo$date))
# 2010 2013 2014 2015
# 1 5 4 8
I like data.table for this. First we need to convert to "Date" class in the date column, then find the number of observations by year.
library(data.table)
demo$date <- as.Date(demo$date, "%m/%d/%Y")
as.data.table(demo)[, .N, keyby = year(date)]
# year N
# 1: 2010 1
# 2: 2013 5
# 3: 2014 4
# 4: 2015 8
We use keyby here so we get a nice ordered result. Alternatively, and to change your entire table to a data.table, you can use setDT() instead of as.data.table(). This is the preferred method.
setDT(demo)[, .N, keyby = year(date)]
table(substr(demo$date, 7,10))
2010 2013 2014 2015
1 5 4 8
substr allows you isolate the year, and table tallies the amounts.
demo$date <- as.Date(demo$date, format = "%m/%d/%Y")
demo$year <- format(demo$date, format = "%Y")
aggregate(num ~ year, demo, FUN = length)
## year num
## 1 2010 1
## 2 2013 5
## 3 2014 4
## 4 2015 8
Date formats can be modified using Date and POSIXct classes. This allows you to handle dates that looks like '1/1/2010'.
dates <- as.Date(demo$date, format = "%m/%d/%Y")
head(dates)
# [1] "2010-12-31" "2013-04-01" "2015-06-02" "2015-06-15" "2015-01-30"
# [6] "2014-04-15"
table(format(dates, format = "%Y"))
#
# 2010 2013 2014 2015
# 1 5 4 8

How to subset data.frame by weeks and then sum?

Let's say I have several years worth of data which look like the following
# load date package and set random seed
library(lubridate)
set.seed(42)
# create data.frame of dates and income
date <- seq(dmy("26-12-2010"), dmy("15-01-2011"), by = "days")
df <- data.frame(date = date,
wday = wday(date),
wday.name = wday(date, label = TRUE, abbr = TRUE),
income = round(runif(21, 0, 100)),
week = format(date, format="%Y-%U"),
stringsAsFactors = FALSE)
# date wday wday.name income week
# 1 2010-12-26 1 Sun 91 2010-52
# 2 2010-12-27 2 Mon 94 2010-52
# 3 2010-12-28 3 Tues 29 2010-52
# 4 2010-12-29 4 Wed 83 2010-52
# 5 2010-12-30 5 Thurs 64 2010-52
# 6 2010-12-31 6 Fri 52 2010-52
# 7 2011-01-01 7 Sat 74 2011-00
# 8 2011-01-02 1 Sun 13 2011-01
# 9 2011-01-03 2 Mon 66 2011-01
# 10 2011-01-04 3 Tues 71 2011-01
# 11 2011-01-05 4 Wed 46 2011-01
# 12 2011-01-06 5 Thurs 72 2011-01
# 13 2011-01-07 6 Fri 93 2011-01
# 14 2011-01-08 7 Sat 26 2011-01
# 15 2011-01-09 1 Sun 46 2011-02
# 16 2011-01-10 2 Mon 94 2011-02
# 17 2011-01-11 3 Tues 98 2011-02
# 18 2011-01-12 4 Wed 12 2011-02
# 19 2011-01-13 5 Thurs 47 2011-02
# 20 2011-01-14 6 Fri 56 2011-02
# 21 2011-01-15 7 Sat 90 2011-02
I would like to sum 'income' for each week (Sunday thru Saturday). Currently I do the following:
Weekending 2011-01-01 = sum(df$income[1:7]) = 487
Weekending 2011-01-08 = sum(df$income[8:14]) = 387
Weekending 2011-01-15 = sum(df$income[15:21]) = 443
However I would like a more robust approach which will automatically sum by week. I can't work out how to automatically subset the data into weeks. Any help would be much appreciated.
First use format to convert your dates to week numbers, then plyr::ddply() to calculate the summaries:
library(plyr)
df$week <- format(df$date, format="%Y-%U")
ddply(df, .(week), summarize, income=sum(income))
week income
1 2011-52 413
2 2012-01 435
3 2012-02 379
For more information on format.Date, see ?strptime, in particular the bit that defines %U as the week number.
EDIT:
Given the modified data and requirement, one way is to divide the date by 7 to get a numeric number indicating the week. (Or more precisely, divide by the number of seconds in a week to get the number of weeks since the epoch, which is 1970-01-01 by default.)
In code:
# Label each row with the start date of its 7-day block counted from the
# epoch. as.POSIXct() makes the numeric value seconds since the epoch whether
# df$date is a Date (stored as days) or already a POSIXct, so dividing by the
# number of seconds in a week is correct for both classes.
df$week <- as.Date("1970-01-01") +
  7 * trunc(as.numeric(as.POSIXct(df$date)) / (3600 * 24 * 7))
library(plyr)
ddply(df, .(week), summarize, income=sum(income))
week income
1 2010-12-23 298
2 2010-12-30 392
3 2011-01-06 294
4 2011-01-13 152
I have not checked that the week boundaries are on Sunday. You will have to check this, and insert an appropriate offset into the formula.
This is now simple using dplyr. Also I would suggest using cut(breaks = "week") rather than format() to cut the dates into weeks.
library(dplyr)
# cut() starts weeks on Monday by default; start.on.monday = FALSE gives the
# Sunday-to-Saturday weeks the question asks for. mutate() keeps every row and
# adds the weekly total alongside it.
df %>%
  group_by(week = cut(date, "week", start.on.monday = FALSE)) %>%
  mutate(weekly_income = sum(income))
I Googled "group week days into weeks R" and came across this SO question. You mention you have multiple years, so I think we need to keep up with both the week number and also the year, so I modified the answers there like so: format(date, format = "%y%U")
In use it looks like this:
library(plyr) #for aggregating
df <- transform(df, weeknum = format(date, format = "%y%U"))
ddply(df, "weeknum", summarize, suminc = sum(income))
#----
weeknum suminc
1 1152 413
2 1201 435
3 1202 379
See ?strptime for all the format abbreviations.
Try rollapply from the zoo package:
rollapply(df$income, width=7, FUN = sum, by = 7)
# [1] 487 387 443
Or, use period.sum from the xts package:
period.sum(xts(df$income, order.by=df$date), which(df$wday %in% 7))
# [,1]
# 2011-01-01 487
# 2011-01-08 387
# 2011-01-15 443
Or, to get the output in the format you want:
data.frame(income = period.sum(xts(df$income, order.by=df$date),
which(df$wday %in% 7)),
week = df$week[which(df$wday %in% 7)])
# income week
# 2011-01-01 487 2011-00
# 2011-01-08 387 2011-01
# 2011-01-15 443 2011-02
Note that the first week shows as 2011-00 because that's how it is entered in your data. You could also use week = df$week[which(df$wday %in% 1)] which would match your output.
This solution is influenced by @Andrie and @Chase.
# load plyr
library(plyr)
# format weeks as per requirement (replace "00" with "52" and adjust corresponding year)
tmp <- list()
tmp$y <- format(df$date, format="%Y")
tmp$w <- format(df$date, format="%U")
tmp$y[tmp$w=="00"] <- as.character(as.numeric(tmp$y[tmp$w=="00"]) - 1)
tmp$w[tmp$w=="00"] <- "52"
df$week <- paste(tmp$y, tmp$w, sep = "-")
# get summary
df2 <- ddply(df, .(week), summarize, income=sum(income))
# include week ending date
tmp$week.ending <- lapply(df2$week, function(x) rev(df[df$week==x, "date"])[[1]])
df2$week.ending <- sapply(tmp$week.ending, as.character)
# week income week.ending
# 1 2010-52 487 2011-01-01
# 2 2011-01 387 2011-01-08
# 3 2011-02 443 2011-01-15
# NOTE(review): this snippet appears to be Python (pandas), not R.
df.index = df['week'] # use the 'week' column as the index
df.resample('W').sum() # sum using resample
With dplyr:
df %>%
arrange(date) %>%
mutate(week = as.numeric(date - date[1])%/%7) %>%
group_by(week) %>%
summarise(weekincome= sum(income))
Instead of date[1] you can have any date from when you want to start your weekly study.

Resources