Here is an example of my data:
> d
customer date revenue
1: A 2016-01-01 32
2: A 2016-01-03 88
3: A 2016-01-04 80
4: A 2016-02-01 38
5: B 2016-01-13 44
6: B 2016-01-24 11
7: B 2016-01-25 50
8: B 2016-02-26 46
> dput(d)
structure(list(customer = c("A", "A", "A", "A", "B", "B", "B",
"B"), date = structure(c(16801, 16803, 16804, 16832, 16813, 16824,
16825, 16857), class = "Date"), revenue = c(32, 88, 80, 38, 44,
11, 50, 46)), .Names = c("customer", "date", "revenue"), row.names = c(NA,
-8L), class = c("data.table", "data.frame"))
What I want to do is create a column, let's call it roll_sum_3days.
This column is the rolling sum of revenue that occurs afterward, with the window size conditioned on the date column: roll_sum_3days is the sum of the revenue that occurs after the current row's date and no more than 3 days later.
The expected outcome would look like this:
customer date revenue roll_sum_3days
1: A 2016-01-01 32 168
2: A 2016-01-03 88 80
3: A 2016-01-04 80 0
4: A 2016-02-01 38 0
5: B 2016-01-13 44 0
6: B 2016-01-24 11 96
7: B 2016-01-25 50 46
8: B 2016-01-26 46 0
A possible solution:
library(data.table)
library(lubridate) # for the '%m+%' function
d[, roll_sum_3d := .SD[.SD[, .(date, date2 = date %m+% days(3), revenue)]
, on = .(date > date, date <= date2)
][, sum(revenue, na.rm = TRUE), by = date]$V1
, by = customer][]
which gives:
customer date revenue roll_sum_3d
1: A 2016-01-01 32 168
2: A 2016-01-03 88 80
3: A 2016-01-04 80 0
4: A 2016-02-01 38 0
5: B 2016-01-13 44 0
6: B 2016-01-24 11 96
7: B 2016-01-25 50 46
8: B 2016-01-26 46 0
What this does:
Group d by customer with by = customer.
Add roll_sum_3d by reference with :=.
Calculate roll_sum_3d by joining .SD (the Subset of Data for each group) with a date-window version of that group (.SD[, .(date, date2 = date %m+% days(3), revenue)]) using the non-equi join on = .(date > date, date <= date2), summarising the revenue for each date, and returning that.
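To see what the non-equi join operates on, here is the date-window table that .SD builds, shown for customer A (a sketch run on the data above):
d[customer == "A", .(date, date2 = date %m+% days(3), revenue)]
#          date      date2 revenue
# 1: 2016-01-01 2016-01-04      32
# 2: 2016-01-03 2016-01-06      88
# 3: 2016-01-04 2016-01-07      80
# 4: 2016-02-01 2016-02-04      38
Each row's roll_sum_3d is then the sum of revenue over the other rows whose date falls in the interval (date, date2].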
An alternative based on @Arun's comment:
d[, roll_sum_3d := d[d[, .(customer, date, date2 = date %m+% days(3), revenue)]
, on = .(customer, date > date, date <= date2)
, sum(revenue, na.rm = TRUE), by=.EACHI]$V1][]
Hi, I guess there is another mistake in your example: observation number 8 won't add to the count of the two previous observations, as it is from February. Never mind, I've got a solution if you want, using apply() and the POSIXct() function:
df <- data.frame(customer = c("A", "A", "A", "A", "B", "B", "B", "B"),
date = structure(c(16801, 16803, 16804, 16832, 16813, 16824,
16825, 16857), class = "Date"),
revenue = c(32, 88, 80, 38, 44, 11, 50, 46))
df$date <- as.POSIXct(df$date)
calc <- function(x){
date <- as.POSIXct(unlist(x["date"]),origin = "1970-01-01")
customer <- unlist(x["customer"])
# Here you choose what to sum (conditions: strictly after the current day, at most 3 days later, same customer)
# 86400 is the number of seconds in a day
output <- sum(df[df$date > date & df$date <= (date+86400*3) & df$customer==customer,"revenue"])
return(output)
}
df$sum <- apply(df,1,calc)
# to get back to your original date format
df$date <- as.Date(df$date)
df
customer date revenue sum
1 A 2016-01-01 32 168
2 A 2016-01-03 88 80
3 A 2016-01-04 80 0
4 A 2016-02-01 38 0
5 B 2016-01-13 44 0
6 B 2016-01-24 11 50
7 B 2016-01-25 50 0
8 B 2016-02-26 46 0
I couldn't keep your Date format, as apply() coerces the rows and the > operator no longer worked with it.
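The coercion comes from apply(), which turns each row into a character vector. A sketch that iterates over row indices instead, so df$date keeps its Date class (assuming df still holds the original Date column):
# index the rows instead of passing them through apply()
df$sum <- vapply(seq_len(nrow(df)), function(i) {
  sum(df$revenue[df$date > df$date[i] &
                 df$date <= df$date[i] + 3 &   # Date + 3 adds 3 days
                 df$customer == df$customer[i]])
}, numeric(1))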
I am brand new to R, and am having trouble figuring out how to set up a simple time series.
Illustration: say I have three variables: Event (0 or 1), HR (heart rate), DT (datetime):
df = data.frame(Event = c(1,0,0,0,1,0,0),
HR= c(100,120,115,105,105,115,100),
DT= c("2020-01-01 09:00:00","2020-01-01 09:15:00","2020-01-01 10:00:00","2020-01-01 10:30:00",
"2020-01-01 11:00:00","2020-01-01 12:00:00","2020-01-01 13:00:00"),
stringsAsFactors = F
)
Event HR DT
1 1 100 2020-01-01 09:00:00
2 0 120 2020-01-01 09:15:00
3 0 115 2020-01-01 10:00:00
4 0 105 2020-01-01 10:30:00
5 1 105 2020-01-01 11:00:00
6 0 115 2020-01-01 12:00:00
7 0 100 2020-01-01 13:00:00
What I would like to do is calculate the elapsed time after each new event: so row1 = 0 min, row2 = 15, row3 = 60, ..., row5 = 0, row6 = 60. Then I can do things like plot HR vs. elapsed time.
What might be a simple way to calculate elapsed time?
Apologies for such a low-level question, but I would be very grateful for any help!
Here is a one-line approach using data.table.
Data:
df <- structure(list(Event = c(1, 0, 0, 0, 1, 0, 0), HR = c(100, 120,
115, 105, 105, 115, 100), DT = structure(c(1577869200, 1577870100,
1577872800, 1577874600, 1577876400, 1577880000, 1577883600), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -7L), class = "data.frame")
Code:
library(data.table)
dt <- as.data.table(df)
dt[, mins_since_last_event := as.numeric(difftime(DT,DT[1],units = "mins")), by = .(cumsum(Event))]
Output:
dt
Event HR DT mins_since_last_event
1: 1 100 2020-01-01 09:00:00 0
2: 0 120 2020-01-01 09:15:00 15
3: 0 115 2020-01-01 10:00:00 60
4: 0 105 2020-01-01 10:30:00 90
5: 1 105 2020-01-01 11:00:00 0
6: 0 115 2020-01-01 12:00:00 60
7: 0 100 2020-01-01 13:00:00 120
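The trick is that by = .(cumsum(Event)) turns the running count of events into a group id: each event row and the rows following it (until the next event) share a group, so DT[1] within each group is the time of the most recent event. A quick illustration:
cumsum(c(1, 0, 0, 0, 1, 0, 0))
# [1] 1 1 1 1 2 2 2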
The following uses the chron package and converts your date/time column to time objects so the package can run calculations and conversions on them.
Example Data:
df <- data.frame(
Event=c(1,0,0,0,1,0,0),
HR=c(100,125,115,105,105,115,100),
DT=c("2020-01-01 09:00:00"
,"2020-01-01 09:15:00"
,"2020-01-01 10:00:00"
,"2020-01-01 10:30:00"
,"2020-01-01 11:00:00"
,"2020-01-01 12:00:00"
,"2020-01-01 13:00:00"))
Code:
library(chron)
Dates <- lapply(strsplit(as.character(df$DT)," "),head,n=1)
Times <- lapply(strsplit(as.character(df$DT)," "),tail,n=1)
df$DT <- chron(as.character(Dates),as.character(Times),format=c(dates="y-m-d",times="h:m:s"))
df$TimeElapsed[1] <- 0
for(i in 1:nrow(df)){
if(df$Event[i]==1){TimeStart <- df$DT[i]}
df$TimeElapsed[i] <- (df$DT[i]-TimeStart)*24*60
}
output:
> df
Event HR DT TimeElapsed
1 1 100 (20-01-01 09:00:00) 0
2 0 125 (20-01-01 09:15:00) 15
3 0 115 (20-01-01 10:00:00) 60
4 0 105 (20-01-01 10:30:00) 90
5 1 105 (20-01-01 11:00:00) 0
6 0 115 (20-01-01 12:00:00) 60
7 0 100 (20-01-01 13:00:00) 120
Welcome to Stack Overflow @greyguy.
Here is an approach with the dplyr library, which is pretty good with large data sets:
library(dplyr)
library(zoo) # for na.locf()
# Your data
df = data.frame(Event = c(1,0,0,0,1,0,0),
HR= c(100,120,115,105,105,115,100),
DT= c("2020-01-01 09:00:00","2020-01-01 09:15:00","2020-01-01 10:00:00","2020-01-01 10:30:00",
"2020-01-01 11:00:00","2020-01-01 12:00:00","2020-01-01 13:00:00"),
stringsAsFactors = F
)
# Convert DT to a time format (not a string) and order by time if not already ordered
df = df %>%
mutate(DT = as.POSIXct(DT, format = "%Y-%m-%d %H:%M:%S")) %>%
arrange(DT) %>%
mutate( # little trick to carry forward the last event's DT
last_DT = case_when(Event==1 ~ DT),
last_DT = na.locf(last_DT),
Elapsed_min = as.numeric(difftime(DT, last_DT, units = "mins"))
) %>%
select(-last_DT)
The output:
# Event HR DT Elapsed_min
# 1 100 2020-01-01 09:00:00 0
# 0 120 2020-01-01 09:15:00 15
# 0 115 2020-01-01 10:00:00 60
# 0 105 2020-01-01 10:30:00 90
# 1 105 2020-01-01 11:00:00 0
# 0 115 2020-01-01 12:00:00 60
# 0 100 2020-01-01 13:00:00 120
I'm trying to use R to measure how many days supply of a prescription an individual already has on-hand when they make a refill, taking into account all previous prescriptions. For example, if I had this table...
member rx_id fill_date to_date days_supply
1 A 1 2018-10-01 2018-10-02 2
2 B 1 2016-11-07 2016-11-10 4
3 B 2 2016-11-07 2016-12-04 28
4 B 3 2016-11-08 2016-11-09 2
5 B 4 2016-11-10 2016-12-03 24
I'd expect the following output
member rx_id fill_date to_date days_supply_on_hand
1 A 1 2018-10-01 2018-10-02 0
2 B 1 2016-11-07 2016-11-10 0
3 B 2 2016-11-07 2016-12-04 4
4 B 3 2016-11-08 2016-11-09 30
5 B 4 2016-11-10 2016-12-03 26
For member B, when the second script is filled on the same day as the first script, the individual already has 4 days' worth of RX on hand. When the third script is filled, the individual has 3 days left from the first script and 27 left from the second (30 total). When the fourth script is filled, the third script is depleted, but there is 1 day left from the first script and 25 from the second script (26 total).
I know how to do rolling totals in both dplyr and data.table, but I can't figure out how to take into account variable levels of depletion based on previous records on an individual by individual basis. Below is code to remake the original table, thanks in advance for any suggestions!
structure(list(member = structure(c(1L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"),
rx_id = c(1, 1, 2, 3, 4),
fill_date = structure(c(17805, 17112, 17112, 17113, 17115), class = "Date"),
to_date = structure(c(17806, 17115, 17139, 17114, 17138), class = "Date"),
days_supply = c(2, 4, 28, 2, 24)),
.Names = c("member", "rx_id", "fill_date", "to_date", "days_supply"),
row.names = c(NA, -5L), class = "data.frame")
library(data.table)
dt = as.data.table(your_df) # or setDT to convert in place
# merge on relevant days, then compute sum of supply - days elapsed
dt[dt, on = .(member, fill_date <= fill_date, to_date >= fill_date, rx_id < rx_id), by = .EACHI,
sum(days_supply, na.rm = TRUE) - sum(i.fill_date - x.fill_date, na.rm = TRUE)]
# member fill_date to_date rx_id V1
#1: A 2018-10-01 2018-10-01 1 0 days
#2: B 2016-11-07 2016-11-07 1 0 days
#3: B 2016-11-07 2016-11-07 2 4 days
#4: B 2016-11-08 2016-11-08 3 30 days
#5: B 2016-11-10 2016-11-10 4 26 days
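The expression above returns a summary table rather than modifying dt. To store the result as a column, one could assign V1 back by reference; a sketch, relying on the fact that by = .EACHI returns one row per row of the inner dt used as i, so V1 lines up with dt's rows:
dt[, days_supply_on_hand := as.numeric(
     dt[dt, on = .(member, fill_date <= fill_date, to_date >= fill_date, rx_id < rx_id),
        by = .EACHI,
        sum(days_supply, na.rm = TRUE) - sum(i.fill_date - x.fill_date, na.rm = TRUE)]$V1)]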
Using a simple loop
dt$days_supply_on_hand <- 0
for (a in unique(dt$member)) {
I <- which(.subset2(dt,1) == a)
flDate <- as.integer(.subset2(dt,3)[I])
toDate <- as.integer(.subset2(dt,4)[I])
V <- vapply(seq_along(I), function (k) sum(toDate[1:(k-1)] - flDate[k] + 1), numeric(1))
dt$days_supply_on_hand[I] <- c(0,V[-1])
}
dt
member rx_id fill_date to_date days_supply days_supply_on_hand
1 A 1 2018-10-01 2018-10-02 2 0
2 B 1 2016-11-07 2016-11-10 4 0
3 B 2 2016-11-07 2016-12-04 28 4
4 B 3 2016-11-08 2016-11-09 2 30
5 B 4 2016-11-10 2016-12-03 24 26
where dt is the data frame provided above. (Note that .subset2 and as.integer are used for efficiency; they can be replaced for readability, as sketched below.)
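A more readable sketch of the same loop, using plain column access; pmax() is added here on the assumption that a script depleted more than a day before a fill should contribute zero rather than negative days (the example data never hits that case):
dt$days_supply_on_hand <- 0
for (a in unique(dt$member)) {
  I <- which(dt$member == a)
  flDate <- as.integer(dt$fill_date[I])
  toDate <- as.integer(dt$to_date[I])
  # days on hand at fill k: leftover days from every earlier script
  V <- vapply(seq_along(I),
              function(k) sum(pmax(toDate[1:(k - 1)] - flDate[k] + 1, 0)),
              numeric(1))
  dt$days_supply_on_hand[I] <- c(0, V[-1])
}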
I am using a dataset grouped with the group_by function of the dplyr package.
Each group has its own time index, which is supposed to consist of a 12-month sequence.
This means it can start in January and end in December, or in other cases start in June of one year and end in May of the next.
Here is the dataset example:
ID DATE
8 2017-01-31
8 2017-02-28
8 2017-03-31
8 2017-04-30
8 2017-05-31
8 2017-06-30
8 2017-07-31
8 2017-08-31
8 2017-09-30
8 2017-10-31
8 2017-11-30
8 2017-12-31
32 2017-01-31
32 2017-02-28
32 2017-03-31
32 2017-04-30
32 2017-05-31
32 2017-06-30
32 2017-07-31
32 2017-08-31
32 2017-09-30
32 2017-10-31
32 2017-11-30
32 2017-12-31
45 2016-09-30
45 2016-10-31
45 2016-11-30
45 2016-12-31
45 2017-01-31
45 2017-02-28
45 2017-03-31
45 2017-04-30
45 2017-05-31
45 2017-06-30
45 2017-07-31
45 2017-08-31
The problem is that, because of the dataset's dimensions, I can't visually confirm or validate whether there are so-called "jumps", in other words whether the dates are consistent. Is there any simple way in R to do that, perhaps some modification/combination of functions from the tibbletime package?
Any help will be appreciated.
Thank you in advance.
Here's how I would typically approach this problem using data.table -- the cut.Date() and seq.Date() functions from base are the meat of the logic, so you could use the same approach with dplyr if desired.
library(data.table)
## Convert to data.table
setDT(df)
## Convert DATE to a date in case it wasn't already
df[,DATE := as.Date(DATE)]
## Order by ID and Date
setkey(df,ID,DATE)
## Create a column with the month of each date
df[,Month := as.Date(cut.Date(DATE, breaks = "months"))]
## Generate a sequence of Dates by month for the number of observations
## in each group -- .N
df[,ExpectedMonth := seq.Date(from = min(Month),
by = "months",
length.out = .N), by = .(ID)]
## Create a summary table to test whether an ID had 12 observations where
## the actual month was equal to the expected month
Test <- df[Month == ExpectedMonth, .(Valid = ifelse(.N == 12L,TRUE,FALSE)), by = .(ID)]
print(Test)
# ID Valid
# 1: 8 TRUE
# 2: 32 TRUE
# 3: 45 TRUE
## Do a no-copy join of Test to df based on ID
## and create a column in df based on the 'Valid' column in Test
df[Test, Valid := i.Valid, on = "ID"]
## The final output:
head(df)
# ID DATE Month ExpectedMonth Valid
# 1: 8 2017-01-31 2017-01-01 2017-01-01 TRUE
# 2: 8 2017-02-28 2017-02-01 2017-02-01 TRUE
# 3: 8 2017-03-31 2017-03-01 2017-03-01 TRUE
# 4: 8 2017-04-30 2017-04-01 2017-04-01 TRUE
# 5: 8 2017-05-31 2017-05-01 2017-05-01 TRUE
# 6: 8 2017-06-30 2017-06-01 2017-06-01 TRUE
You could also do things a little more compactly if you really wanted to, using a self-join and skipping the creation of Test:
setDT(df)
df[,DATE := as.Date(DATE)]
setkey(df,ID,DATE)
df[,Month := as.Date(cut.Date(DATE, breaks = "months"))]
df[,ExpectedMonth := seq.Date(from = min(Month), by = "months", length.out = .N), keyby = .(ID)]
df[df[Month == ExpectedMonth,.(Valid = ifelse(.N == 12L,TRUE,FALSE)),keyby = .(ID)], Valid := i.Valid]
You can use the summarise function from dplyr to return a logical value of whether there are any day differences greater than 31 within each ID. You do this by first constructing a temporary date using only the year and month and attaching "-01" as the fake day:
library(dplyr)
library(lubridate)
df %>%
group_by(ID) %>%
mutate(DATE2 = ymd(paste0(sub('\\-\\d+$', '', DATE),'-01')),
DATE_diff = c(0, diff(DATE2))) %>%
summarise(Valid = !any(DATE_diff > 31))
Result:
# A tibble: 3 x 2
ID Valid
<int> <lgl>
1 8 TRUE
2 32 TRUE
3 45 TRUE
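If an ID comes back invalid, one could list the offending rows instead of just the flag; a sketch reusing the same DATE_diff construction:
# show the rows where the gap to the previous month exceeds 31 days
df %>%
  group_by(ID) %>%
  mutate(DATE2 = ymd(paste0(sub('\\-\\d+$', '', DATE), '-01')),
         DATE_diff = c(0, diff(DATE2))) %>%
  filter(DATE_diff > 31)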
You can also visually check if there are any gaps by plotting your dates for each ID:
library(ggplot2)
df %>%
mutate(DATE = ymd(paste0(sub('\\-\\d+$', '', DATE),'-01')),
ID = as.factor(ID)) %>%
ggplot(aes(x = DATE, y = ID, group = ID)) +
geom_point(aes(color = ID)) +
scale_x_date(date_breaks = "1 month",
date_labels = "%b-%Y") +
labs(title = "Time Line by ID")
I have a table of gas refuelings, a, like this:
a = setDT(structure(list(date = structure(c(NA, 16837, 16843, 16847, 16852,
16854, 16858, 16862, 16867, 16871, 16874), class = "Date"), km = c(NA,
NA, 421, 351, 286, 350, 414, 332, 401, 321, 350)), .Names = c("date",
"km"), class = c("data.table", "data.frame"), row.names = c(NA,
-11L)), key = "date")
It has dates of refuels and km driven on each refuel. I'm also given a different table, actions, with dates of tire pressure adjustments and oil changes:
actions = setDT(structure(list(date = structure(c(16841, 16843, 16858, 16869), class = "Date"),
action = structure(c(1L, 2L, 2L, 2L), .Label = c("oil", "tires"
), class = "factor")), .Names = c("date", "action"), row.names = c(NA,
-4L), class = c("data.table", "data.frame")), key = "action")
I need to relate the fuel consumption (in the real version of a I also have gallons) to the days elapsed since the last tire pressure check and since the last oil change. There must be a simple way to achieve this, but after some hours of trying I'm stuck.
This is what I've tried:
library(data.table)
library(lubridate)
library(reshape2)
b <- dcast(actions, date ~ action, value.var = "date")
d <- seq(min(a$date, b$date, na.rm = TRUE), max(a$date, b$date, na.rm = TRUE), by = "day")
d <- data.table(date=d)
d <- b[d,]
d$daysOil <- as.double(difftime(d$date, d$date[! is.na(d$oil)], units = "days"))
d$daysOil[which(d$daysOil < 0)] <- NA
The thing gets a lot more complicated if I try to calculate the number of days elapsed since the last "tires" event (the closest one before the refuel date), and that's where I'm stuck.
My expected output is:
expected
date km daysoil daysTires
1 <NA> NA NA NA
2 2016-02-06 NA NA NA
3 2016-02-12 421 2 0
4 2016-02-16 351 6 4
5 2016-02-21 286 11 9
6 2016-02-23 350 13 11
7 2016-02-27 414 17 0
8 2016-03-02 332 21 4
9 2016-03-07 401 26 9
10 2016-03-11 321 30 2
11 2016-03-14 350 33 5
I'd appreciate any solution, but preferably by using data.table or dplyr packages.
########## EDIT ##########
If you can think of a better information (tables) structure to facilitate this task, it will be highly appreciated too!
Here's one option:
actions[, date.copy := date]
cbind(a,
dcast(actions[, .SD[a, .(days = date - date.copy, N = .I), roll = T, on = 'date']
, by = action],
N ~ action, value.var = 'days'))
# date km N oil tires
# 1: <NA> NA 1 NA days NA days
# 2: 2016-02-06 NA 2 NA days NA days
# 3: 2016-02-12 421 3 2 days 0 days
# 4: 2016-02-16 351 4 6 days 4 days
# 5: 2016-02-21 286 5 11 days 9 days
# 6: 2016-02-23 350 6 13 days 11 days
# 7: 2016-02-27 414 7 17 days 0 days
# 8: 2016-03-02 332 8 21 days 4 days
# 9: 2016-03-07 401 9 26 days 9 days
#10: 2016-03-11 321 10 30 days 2 days
#11: 2016-03-14 350 11 33 days 5 days
Several simple things are going on in the above; run it in pieces to understand. For instance:
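The inner rolling join on its own shows, for each refuel date and action type, the days since the most recent action:
# roll each refuel date back to the most recent action date, per action type
actions[, .SD[a, .(days = date - date.copy, N = .I), roll = TRUE, on = 'date'],
        by = action]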
Let's say I have several years worth of data which look like the following
# load date package and set random seed
library(lubridate)
set.seed(42)
# create data.frame of dates and income
date <- seq(dmy("26-12-2010"), dmy("15-01-2011"), by = "days")
df <- data.frame(date = date,
wday = wday(date),
wday.name = wday(date, label = TRUE, abbr = TRUE),
income = round(runif(21, 0, 100)),
week = format(date, format="%Y-%U"),
stringsAsFactors = FALSE)
# date wday wday.name income week
# 1 2010-12-26 1 Sun 91 2010-52
# 2 2010-12-27 2 Mon 94 2010-52
# 3 2010-12-28 3 Tues 29 2010-52
# 4 2010-12-29 4 Wed 83 2010-52
# 5 2010-12-30 5 Thurs 64 2010-52
# 6 2010-12-31 6 Fri 52 2010-52
# 7 2011-01-01 7 Sat 74 2011-00
# 8 2011-01-02 1 Sun 13 2011-01
# 9 2011-01-03 2 Mon 66 2011-01
# 10 2011-01-04 3 Tues 71 2011-01
# 11 2011-01-05 4 Wed 46 2011-01
# 12 2011-01-06 5 Thurs 72 2011-01
# 13 2011-01-07 6 Fri 93 2011-01
# 14 2011-01-08 7 Sat 26 2011-01
# 15 2011-01-09 1 Sun 46 2011-02
# 16 2011-01-10 2 Mon 94 2011-02
# 17 2011-01-11 3 Tues 98 2011-02
# 18 2011-01-12 4 Wed 12 2011-02
# 19 2011-01-13 5 Thurs 47 2011-02
# 20 2011-01-14 6 Fri 56 2011-02
# 21 2011-01-15 7 Sat 90 2011-02
I would like to sum 'income' for each week (Sunday thru Saturday). Currently I do the following:
Weekending 2011-01-01 = sum(df$income[1:7]) = 487
Weekending 2011-01-08 = sum(df$income[8:14]) = 387
Weekending 2011-01-15 = sum(df$income[15:21]) = 443
However I would like a more robust approach which will automatically sum by week. I can't work out how to automatically subset the data into weeks. Any help would be much appreciated.
First use format to convert your dates to week numbers, then plyr::ddply() to calculate the summaries:
library(plyr)
df$week <- format(df$date, format="%Y-%U")
ddply(df, .(week), summarize, income=sum(income))
week income
1 2010-52    413
2 2011-00     74
3 2011-01    387
4 2011-02    443
For more information on format.Date, see ?strptime, particularly the bit that defines %U as the week number.
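A quick check of how %U behaves (days before a year's first Sunday fall in week "00", which is why 2011-01-01 shows up as 2011-00 in the data above):
# %U: week of the year, weeks starting on Sunday
format(as.Date(c("2010-12-26", "2011-01-01", "2011-01-02")), "%Y-%U")
# [1] "2010-52" "2011-00" "2011-01"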
EDIT:
Given the modified data and requirement, one way is to divide the date by 7 to get a number indicating the week (more precisely, as.numeric() on a Date gives days since the epoch, 1970-01-01 by default, so dividing by 7 gives the number of weeks since the epoch).
In code:
df$week <- as.Date("1970-01-01") + 7*trunc(as.numeric(df$date)/7)
library(plyr)
ddply(df, .(week), summarize, income=sum(income))
week income
1 2010-12-23    297
2 2010-12-30    386
3 2011-01-06    441
4 2011-01-13    193
I have not checked that the week boundaries are on Sunday. You will have to check this, and insert an appropriate offset into the formula.
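Since the epoch 1970-01-01 was a Thursday, the weeks above break on Thursdays. A sketch of the offset adjustment for Sunday-to-Saturday weeks, using the fact that 1970-01-04 was the first Sunday after the epoch:
# shift by the 3-day offset so week boundaries fall on Sundays
df$week <- as.Date("1970-01-04") + 7*trunc((as.numeric(df$date) - 3)/7)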
This is now simple using dplyr. Also I would suggest using cut(breaks = "week") rather than format() to cut the dates into weeks.
library(dplyr)
df %>% group_by(week = cut(date, "week")) %>% mutate(weekly_income = sum(income))
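Note that cut.Date() starts weeks on Monday by default, while the question asks for Sunday through Saturday; cut.Date() takes a start.on.monday argument for that. A sketch, also switching mutate() to summarise() to get one row per week:
df %>%
  group_by(week = cut(date, "week", start.on.monday = FALSE)) %>%
  summarise(weekly_income = sum(income))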
I Googled "group week days into weeks R" and came across this SO question. You mention you have multiple years, so I think we need to keep up with both the week number and the year, so I modified the answers there like so: format(date, format = "%y%U")
In use it looks like this:
library(plyr) #for aggregating
df <- transform(df, weeknum = format(date, format = "%y%U"))
ddply(df, "weeknum", summarize, suminc = sum(income))
#----
weeknum suminc
1    1052    413
2    1100     74
3    1101    387
4    1102    443
See ?strptime for all the format abbreviations.
Try rollapply from the zoo package:
rollapply(df$income, width=7, FUN = sum, by = 7)
# [1] 487 387 443
Or, use period.sum from the xts package:
period.sum(xts(df$income, order.by=df$date), which(df$wday %in% 7))
# [,1]
# 2011-01-01 487
# 2011-01-08 387
# 2011-01-15 443
Or, to get the output in the format you want:
data.frame(income = period.sum(xts(df$income, order.by=df$date),
which(df$wday %in% 7)),
week = df$week[which(df$wday %in% 7)])
# income week
# 2011-01-01 487 2011-00
# 2011-01-08 387 2011-01
# 2011-01-15 443 2011-02
Note that the first week shows as 2011-00 because that's how it is entered in your data. You could also use week = df$week[which(df$wday %in% 1)] which would match your output.
This solution is influenced by @Andrie and @Chase.
# load plyr
library(plyr)
# format weeks as per requirement (replace "00" with "52" and adjust corresponding year)
tmp <- list()
tmp$y <- format(df$date, format="%Y")
tmp$w <- format(df$date, format="%U")
tmp$y[tmp$w=="00"] <- as.character(as.numeric(tmp$y[tmp$w=="00"]) - 1)
tmp$w[tmp$w=="00"] <- "52"
df$week <- paste(tmp$y, tmp$w, sep = "-")
# get summary
df2 <- ddply(df, .(week), summarize, income=sum(income))
# include week ending date
tmp$week.ending <- lapply(df2$week, function(x) rev(df[df$week==x, "date"])[[1]])
df2$week.ending <- sapply(tmp$week.ending, as.character)
# week income week.ending
# 1 2010-52 487 2011-01-01
# 2 2011-01 387 2011-01-08
# 3 2011-02 443 2011-01-15
In pandas (Python), the same idea works by resampling on a datetime index:
df.index = pd.to_datetime(df['date'])  # set the date variable as the index
df['income'].resample('W').sum()       # sum income using resample
With dplyr:
df %>%
arrange(date) %>%
mutate(week = as.numeric(date - date[1])%/%7) %>%
group_by(week) %>%
summarise(weekincome= sum(income))
Instead of date[1] you can use any date from which you want to start your weekly count, as in the sketch below.
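For example, a sketch anchoring the weekly bins at the first Sunday in the data (2010-12-26 here) instead of date[1]:
df %>%
  arrange(date) %>%
  mutate(week = as.numeric(date - as.Date("2010-12-26")) %/% 7) %>%
  group_by(week) %>%
  summarise(weekincome = sum(income))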