I have two data frames: one (df1) that records the daily occurrence of different activities, and another (df2) that records properties of the activity that occurred during the day.
From df1 it is possible to identify repeated occurrences of an activity as well as their duration. The Date variable specifies on which day the record starts.
For example:
For id 12 the occurrence starts at day1 and ends at day7. In this case the occurrence count is 7 and the duration is 11.
For id 123 the week starts at day5 and ends at day7; the occurrences are not consecutive because there is a gap day at day6, and the duration is 6. Id 123 (starting at day6 and ending at day7) occurred 2 times consecutively with a duration of 6.
In df1 the variable Date defines the day when the record started. For example, the record for id 12 started at day1, and so on.
I would like to identify whether, during a consecutive occurrence, there are records of the activity's properties in df2.
For example, id 12 occurred 7 times with a duration of 11; there is a record for Wednesday (day3 in df1), and this record corresponds to the 3rd day of the consecutive occurrence. For id 123 there is no data (e.g. no consecutive occurrence), but for id 10, with a 6-day occurrence and a duration of 18, there is a record on the 6th day.
Df1:
id day1 day2 day3 day4 day5 day6 day7 Date
12 2 1 2 1 1 3 1 Monday
123 0 3 0 3 3 0 3 Friday
10 0 3 3 3 3 3 3 Saturday
Df2:
id c1 c2 Date
12 3 3 Wednesday
123 3 2 Friday
10 3 1 Saturday
Outcome:
id c1 c2 Occurrence Position
12 3 3 7 3
123 0 0 0 0
10 3 1 2 1
Sample data: df1
structure(list(id = c(12L, 123L, 10L), day1 = c(2L, 0L, 3L),
day2 = c(1L, 3L, 3L), day3 = c(2L, 0L, 3L), day4 = c(1L,
3L, 3L), day5 = c(1L, 3L, 3L), day6 = c(3L, 0L, 3L), day7 = c(1L,
3L, 3L), Date = c("Monday", "Friday", "Saturday")), row.names = c(NA,
-3L), class = c("data.table", "data.frame"))
df2:
structure(list(id = c(12, 123, 10), c1 = c(3, 3, 3), c2 = c(3,
2, 1), Date = structure(c(3L, 1L, 2L), .Label = c("Friday", "Saturday",
"Wednesday"), class = "factor")), row.names = c(NA, -3L), class = "data.frame")
A solution with dplyr (maybe not the shortest one):
# library
library(tidyverse)
# get data
df1 <- structure(list(id = c(12L, 123L, 10L),
day1 = c(2L, 0L, 3L),
day2 = c(1L, 3L, 3L),
day3 = c(2L, 0L, 3L),
day4 = c(1L,3L, 3L),
day5 = c(1L, 3L, 3L),
day6 = c(3L, 0L, 3L),
day7 = c(1L,3L, 3L),
Date = c("Monday", "Friday", "Saturday")),
row.names = c(NA,-3L), class = c("data.table", "data.frame"))
df2 <- structure(list(id = c(12, 123, 10),
c1 = c(3, 3, 3),
c2 = c(3, 2, 1),
Date = structure(c(3L, 1L, 2L), .Label = c("Friday", "Saturday","Wednesday"),
class = "factor")), row.names = c(NA, -3L), class = "data.frame")
# convert the weekday names to numbers (will help you later)
df1 %>% mutate(
Date_nr_df1=case_when(
Date=="Monday" ~ 1,
Date=="Tuesday" ~2,
Date=="Wednesday" ~3,
Date=="Thursday" ~4,
Date=="Friday" ~5,
Date=="Saturday" ~6,
Date=="Sunday" ~7)) -> df1
df2 %>% mutate(
Date_nr_df2=case_when(
Date=="Monday" ~ 1,
Date=="Tuesday" ~2,
Date=="Wednesday" ~3,
Date=="Thursday" ~4,
Date=="Friday" ~5,
Date=="Saturday" ~6,
Date=="Sunday" ~7)) -> df2
# combine data by the id column
left_join(df1,df2, by=c("id")) -> df
# adjust data
df %>%
group_by(id) %>% # one row per id, so the mutates below act rowwise
mutate(days=paste0(day1,day2,day3,day4,day5,day6,day7)) %>% # pastes the daily values together
mutate(days_correct=substring(days,Date_nr_df1)) %>% # applies the start day
mutate(Occurrence_seq=str_split(days_correct, fixed("0"))[[1]][1]) %>% # extracts the run of days before the first 0
mutate(Occurrence=nchar(Occurrence_seq)) %>% # counts these days
mutate(Occurrence=case_when(Occurrence==1 ~ 0, TRUE ~ as.numeric(Occurrence))) %>% # sets Occurrence to 0 if there is no consecutive occurrence
mutate(Position=Date_nr_df2-Date_nr_df1+1) %>% # calculates the position you wanted
mutate(c1=case_when(Occurrence==0 ~ 0, TRUE ~ c1),
c2=case_when(Occurrence==0 ~ 0, TRUE ~ c2), # fixed: was "TRUE ~ c1", which overwrote c2
Position=case_when(Occurrence==0 ~ 0, TRUE ~ as.numeric(Position))) %>%
ungroup() %>% # ungroups the df
select(id,c1,c2,Occurrence,Position) # selects the wanted variables
#> # A tibble: 3 x 5
#> id c1 c2 Occurrence Position
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 12 3 3 7 3
#> 2 123 0 0 0 0
#> 3 10 3 1 2 1
Created on 2020-04-10 by the reprex package (v0.2.1)
I have a large dataset; the example below is a much abbreviated version.
There are two data frames, df1 and df2. I would like to map to each row of df1 a derived value, using conditions from df2 with arguments from df1.
I hope the example below makes this clearer.
year <- rep(1996:1997, each=3)
age_group <- rep(c("20-24","25-29","30-34"),2)
df1 <- as.data.frame(cbind(year,age_group))
df1 is a database with all permutations of year and age group.
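(As a side note, a sketch of an equivalent construction: expand.grid builds the full grid directly. Note that as.data.frame(cbind(...)) coerces both columns to character first.)
# hypothetical alternative: build the year x age_group grid directly
df1 <- expand.grid(age_group = c("20-24","25-29","30-34"),
                   year = 1996:1997)[, c("year", "age_group")]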
df2 <- as.data.frame(rbind(c(111,1997,"20-24"),c(222,1997,"30-34")))
names(df2) <- c("id","year","age.group")
df2 is a database where each row represents an individual at a particular year
I would like to use arguments from df1 conditional on values from df2 and then to map to df1. The arguments are as follows:
each_yr <- map(df1, function(year,age_group) case_when(
as.character(df1$year) == as.character(df2$year) & as.character(df1$age_group)
== as.character(df2$age.group)~ 0,
TRUE ~ 1))
The output I get is wrong, as shown below:
structure(list(year = c(1, 1, 1, 1, 1, 0), age_group = c(1, 1,
1, 1, 1, 0)), .Names = c("year", "age_group"))
The output I would ideally like is something like this (a data frame as an example, but I would be happy with a list):
structure(list(year = structure(c(1L, 1L, 1L, 2L, 2L, 2L), .Label = c("1996",
"1997"), class = "factor"), age_group = structure(c(1L, 2L, 3L,
1L, 2L, 3L), .Label = c("20-24", "25-29", "30-34"), class = "factor"),
v1 = structure(c(2L, 2L, 2L, 1L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), v2 = structure(c(2L, 2L, 2L, 2L,
2L, 1L), .Label = c("0", "1"), class = "factor")), .Names = c("year",
"age_group", "v1", "v2"), row.names = c(NA, -6L), class = "data.frame")
I have used map before when 'df1' is a vector, but in this scenario it is a data frame where both columns are used as arguments. Can Map handle this?
In the desired output (call it df3), the column v1 is the result of conditions based on df1 and df2, mapped to df1 for patient '111'. Likewise, column v2 is the outcome for patient '222'.
Thanks in advance
Looks like some work for pmap instead. And a touch of tidyr to get the suggested result.
purrr::pmap(list(df2$id,as.character(df2$year),as.character(df2$age.group)),
function(id,x,y)
data.frame(df1,
key=paste0("v",id),
value=1-as.integer((x==df1$year)&(y==df1$age_group)),
stringsAsFactors=FALSE
)) %>%
replyr::replyr_bind_rows() %>% tidyr::spread(key,value)
# year age_group v1 v2
#1 1996 20-24 1 1
#2 1996 25-29 1 1
#3 1996 30-34 1 1
#4 1997 20-24 0 1
#5 1997 25-29 1 1
#6 1997 30-34 1 0
Within the tidyverse you can do it this way:
library(tidyverse)
#library(dplyr)
#library(tidyr)
df2 %>%
mutate(tmp = 0) %>%
spread(id, tmp, fill = 1, sep = "_") %>%
right_join(df1, by = c("year", "age.group" = "age_group")) %>%
mutate_at(vars(-c(1, 2)), coalesce, 1)
# year age.group id_111 id_222
# 1 1996 20-24 1 1
# 2 1996 25-29 1 1
# 3 1996 30-34 1 1
# 4 1997 20-24 0 1
# 5 1997 25-29 1 1
# 6 1997 30-34 1 0
#Warning messages:
# 1: Column `year` joining factors with different levels, coercing to character vector
# 2: Column `age.group`/`age_group` joining factors with different levels, coercing to
# character vector
I would like to create a new variable "Count" that is a count of the unique values of a factor "Period", by grouping variable "ID". The following data includes a column with the values I would want in "Count":
structure(list(ID = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L
), .Label = c("a", "b"), class = "factor"), Period = c(1.1, 1.1,
1.2, 1.3, 1.2, 1.3, 1.5, 1.5), Count = c(1L, 1L, 2L, 3L, 1L,
2L, 3L, 3L)), .Names = c("ID", "Period", "Count"), class = "data.frame", row.names = c(NA,
-8L))
I tried to use mutate with Count = 1:length(Period) but it creates a cumulative count of each value of "Period", whereas I want a cumulative count of only unique values. This is what I tried:
library(plyr)
samp1<-ddply(samp, .(ID, Period), mutate, Count = 1:length(Period))
Could anyone provide the correct function to use?
Edit - new answer
Now that I come to think of it some more, my initial approach won't return correct results if each group's elements aren't contiguous. For example, for
v <- c(1, 3, 2, 2, 1, 2)
my function will put the non-consecutive 1s and 2s in different groups:
myrleid(v)
## [1] 1 2 3 3 4 5
Thus, the best approach seems to be
match(v, unique(v))
## [1] 1 2 3 3 1 3
This will both preserve the appearance order and keep non-contiguous values in the same group.
Thus, I would recommend just doing
library(data.table)
setDT(df)[, Count2 := match(Period, unique(Period)), by = ID]
or (with base R)
with(df, ave(Period, ID, FUN = function(x) match(x, unique(x))))
Old answer
Looks like a good candidate for the rleid function from the data.table development version on GitHub.
### Devel version installation instructions
# library(devtools)
# install_github("Rdatatable/data.table", build_vignettes = FALSE)
library(data.table) # v 1.9.5+
setDT(df)[, Count2 := rleid(Period), by = ID]
df
# ID Period Count Count2
# 1: a 1.1 1 1
# 2: a 1.1 1 1
# 3: a 1.2 2 2
# 4: a 1.3 3 3
# 5: b 1.2 1 1
# 6: b 1.3 2 2
# 7: b 1.5 3 3
# 8: b 1.5 3 3
Or, if you don't want to load external packages, we could define this function on our own:
myrleid <- function(x) {
temp <- rle(x)$lengths
rep.int(seq_along(temp), temp)
}
with(df, ave(Period, ID, FUN = myrleid))
## [1] 1 1 2 3 1 2 3 3
Or if the groups are in increasing order, you could try ranking them too
library(data.table) ## V1.9.5+
setDT(df)[, Count2 := frank(Period, ties.method = "dense"), by = ID]
Or
library(dplyr)
df %>%
group_by(ID) %>%
mutate(Count2 = dense_rank(Period))
samp <- structure(list(ID = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L
), .Label = c("a", "b"), class = "factor"), Period = c(1.1, 1.1,
1.2, 1.3, 1.2, 1.3, 1.5, 1.5), Count = c(1L, 1L, 2L, 3L, 1L,
2L, 3L, 3L)), .Names = c("ID", "Period", "Count"), class = "data.frame", row.names = c(NA,
-8L))
library(dplyr)
select(samp, -Count) %>%
arrange(ID, Period) %>%
group_by(ID) %>%
mutate(dup = !duplicated(Period),
Count = cumsum(dup))
The key steps are to arrange by ID and Period, and then to flag the first appearance of each Period value as "not duplicated".
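For reference, on samp this should reproduce the Count column from the question (a sketch of the result; the exact print format depends on your dplyr version):
# ID Period   dup Count
# a     1.1  TRUE     1
# a     1.1 FALSE     1
# a     1.2  TRUE     2
# a     1.3  TRUE     3
# b     1.2  TRUE     1
# b     1.3  TRUE     2
# b     1.5  TRUE     3
# b     1.5 FALSE     3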
A solution in base R with transform:
transform(df, Count2 = unlist(
tapply(df$Period, df$ID, function(x)
as.numeric(factor(x)))
))
ID Period Count Count2
a1 a 1.1 1 1
a2 a 1.1 1 1
a3 a 1.2 2 2
a4 a 1.3 3 3
b1 b 1.2 1 1
b2 b 1.3 2 2
b3 b 1.5 3 3
b4 b 1.5 3 3
As David suggested, this solution does not work well if the Period values are not monotonically increasing.
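A quick sketch of that failure mode, using a hypothetical vector: factor() numbers values by sorted level, while match() keeps first-appearance order:
v <- c(1.3, 1.1, 1.3)
as.numeric(factor(v)) # 2 1 2 - ranked by sorted value
match(v, unique(v))   # 1 2 1 - first-appearance order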
Say I have data that looks like
date, user, items_bought, event_number
2013-01-01, x, 2, 1
2013-01-02, x, 1, 2
2013-01-03, x, 0, 3
2013-01-04, x, 0, 4
2013-01-04, x, 1, 5
2013-01-04, x, 2, 6
2013-01-05, x, 3, 7
2013-01-06, x, 1, 8
2013-01-01, y, 1, 1
2013-01-02, y, 1, 2
2013-01-03, y, 0, 3
2013-01-04, y, 5, 4
2013-01-05, y, 6, 5
2013-01-06, y, 1, 6
To get the cumulative sum per user per data point, I was doing:
data.frame(cum_items_bought=unlist(tapply(as.numeric(data$items_bought), data$user, FUN = cumsum)))
The output from this looks like:
date, user, items_bought
2013-01-01, x, 2
2013-01-02, x, 3
2013-01-03, x, 3
2013-01-04, x, 3
2013-01-04, x, 4
2013-01-04, x, 6
2013-01-05, x, 9
2013-01-06, x, 10
2013-01-01, y, 1
2013-01-02, y, 2
2013-01-03, y, 2
2013-01-04, y, 7
2013-01-05, y, 13
2013-01-06, y, 14
However, I want to restrict my sum to only those purchases that happened within 3 days of each row (relative to the user), i.e. the output needs to look like this:
date, user, cum_items_bought_3_days
2013-01-01, x, 2
2013-01-02, x, 3
2013-01-03, x, 3
2013-01-04, x, 1
2013-01-04, x, 2
2013-01-04, x, 4
2013-01-05, x, 6
2013-01-06, x, 7
2013-01-01, y, 1
2013-01-02, y, 2
2013-01-03, y, 2
2013-01-04, y, 6
2013-01-05, y, 11
2013-01-06, y, 12
Here's a dplyr solution which will produce the desired result (14 rows) as specified in the question. Note that it takes care of duplicate date entries, for example, 2013-01-04 for user x.
# define a custom function to be used in the dplyr chain
myfunc <- function(x){
with(x, sapply(event_number, function(y)
sum(items_bought[event_number <= event_number[y] & date[y] - date <= 2])))
}
require(dplyr) # load dplyr into your session
df %>%
mutate(date = as.Date(as.character(date))) %>%
group_by(user) %>%
do(data.frame(., cum_items_bought_3_days = myfunc(.))) %>%
select(-c(items_bought, event_number))
# date user cum_items_bought_3_days
#1 2013-01-01 x 2
#2 2013-01-02 x 3
#3 2013-01-03 x 3
#4 2013-01-04 x 1
#5 2013-01-04 x 2
#6 2013-01-04 x 4
#7 2013-01-05 x 6
#8 2013-01-06 x 7
#9 2013-01-01 y 1
#10 2013-01-02 y 2
#11 2013-01-03 y 2
#12 2013-01-04 y 6
#13 2013-01-05 y 11
#14 2013-01-06 y 12
In my answer I use a custom function myfunc inside a dplyr chain. This is done using the do operator from dplyr. The custom function receives the per-user subsets of df. It then uses sapply to iterate over each event_number and calculate the sums of items_bought. The last line of the dplyr chain deselects the undesired columns.
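As a minimal illustration of the do() mechanics (a toy summary, not part of the solution), each per-user subset arrives inside do() as the placeholder `.`:
# sketch: do() hands each group's rows to the expression as `.`
df %>% group_by(user) %>% do(data.frame(n_rows = nrow(.)))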
Let me know if you'd like a more detailed explanation.
Edit after comment by OP:
If you need more flexibility to also conditionally sum up other columns, you can adjust the code as follows. I assume here that the other columns should be summed up the same way as items_bought. If that is not correct, please specify how you want to sum up the other columns.
I first create two additional columns with random numbers in the data (I'll post a dput of the data at the bottom of my answer):
set.seed(99) # for reproducibility only
df$newCol1 <- sample(0:10, 14, replace=T)
df$newCol2 <- runif(14)
df
# date user items_bought event_number newCol1 newCol2
#1 2013-01-01 x 2 1 6 0.687800094
#2 2013-01-02 x 1 2 1 0.640190769
#3 2013-01-03 x 0 3 7 0.357885360
#4 2013-01-04 x 0 4 10 0.102584999
#5 2013-01-04 x 1 5 5 0.097790922
#6 2013-01-04 x 2 6 10 0.182886256
#7 2013-01-05 x 3 7 7 0.227903474
#8 2013-01-06 x 1 8 3 0.080524150
#9 2013-01-01 y 1 1 3 0.821618422
#10 2013-01-02 y 1 2 1 0.591113977
#11 2013-01-03 y 0 3 6 0.773389019
#12 2013-01-04 y 5 4 5 0.350085977
#13 2013-01-05 y 6 5 2 0.006061323
#14 2013-01-06 y 1 6 7 0.814506223
Next, you can modify myfunc to take 2 arguments, instead of 1. The first argument will remain the subsetted data.frame as before (represented by . inside the dplyr chain and x in the function definition of myfunc), while the second argument to myfunc will specify the column to sum up (colname).
myfunc <- function(x, colname){
with(x, sapply(event_number, function(y)
sum(x[event_number <= event_number[y] & date[y] - date <= 2, colname])))
}
Then, you can use myfunc several times if you want to conditionally sum up several columns:
df %>%
mutate(date = as.Date(as.character(date))) %>%
group_by(user) %>%
do(data.frame(., cum_items_bought_3_days = myfunc(., "items_bought"),
newCol1Sums = myfunc(., "newCol1"),
newCol2Sums = myfunc(., "newCol2"))) %>%
select(-c(items_bought, event_number, newCol1, newCol2))
# date user cum_items_bought_3_days newCol1Sums newCol2Sums
#1 2013-01-01 x 2 6 0.6878001
#2 2013-01-02 x 3 7 1.3279909
#3 2013-01-03 x 3 14 1.6858762
#4 2013-01-04 x 1 18 1.1006611
#5 2013-01-04 x 2 23 1.1984520
#6 2013-01-04 x 4 33 1.3813383
#7 2013-01-05 x 6 39 0.9690510
#8 2013-01-06 x 7 35 0.6916898
#9 2013-01-01 y 1 3 0.8216184
#10 2013-01-02 y 2 4 1.4127324
#11 2013-01-03 y 2 10 2.1861214
#12 2013-01-04 y 6 12 1.7145890
#13 2013-01-05 y 11 13 1.1295363
#14 2013-01-06 y 12 14 1.1706535
Now you created conditional sums of the columns items_bought, newCol1 and newCol2. You can also leave out any of the sums in the dplyr chain or add more columns to sum up.
Edit #2 after comment by OP:
To calculate the cumulative sum of distinct (unique) items bought per user, you could define a second custom function myfunc2 and use it inside the dplyr chain. This function is as flexible as myfunc, in that you can specify the column to which you want to apply it.
The code would then be:
myfunc <- function(x, colname){
with(x, sapply(event_number, function(y)
sum(x[event_number <= event_number[y] & date[y] - date <= 2, colname])))
}
myfunc2 <- function(x, colname){
cumsum(sapply(seq_along(x[[colname]]), function(y)
ifelse(!y == 1 & x[y, colname] %in% x[1:(y-1), colname], 0, 1)))
}
require(dplyr) # load dplyr into your session
df %>%
mutate(date = as.Date(as.character(date))) %>%
group_by(user) %>%
do(data.frame(., cum_items_bought_3_days = myfunc(., "items_bought"),
newCol1Sums = myfunc(., "newCol1"),
newCol2Sums = myfunc(., "newCol2"),
distinct_items_bought = myfunc2(., "items_bought"))) %>%
select(-c(items_bought, event_number, newCol1, newCol2))
Here is the data I used:
dput(df)
structure(list(date = structure(c(1L, 2L, 3L, 4L, 4L, 4L, 5L,
6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("2013-01-01", "2013-01-02",
"2013-01-03", "2013-01-04", "2013-01-05", "2013-01-06"), class = "factor"),
user = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c(" x", " y"), class = "factor"),
items_bought = c(2L, 1L, 0L, 0L, 1L, 2L, 3L, 1L, 1L, 1L,
0L, 5L, 6L, 1L), event_number = c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L), newCol1 = c(6L, 1L, 7L,
10L, 5L, 10L, 7L, 3L, 3L, 1L, 6L, 5L, 2L, 7L), newCol2 = c(0.687800094485283,
0.640190769452602, 0.357885359786451, 0.10258499882184, 0.0977909218054265,
0.182886255905032, 0.227903473889455, 0.0805241498164833,
0.821618422167376, 0.591113976901397, 0.773389018839225,
0.350085976999253, 0.00606132275424898, 0.814506222726777
)), .Names = c("date", "user", "items_bought", "event_number",
"newCol1", "newCol2"), row.names = c(NA, -14L), class = "data.frame")
I'd like to propose an additional data.table approach, combined with the rollapplyr function from the zoo package.
First, we aggregate the items_bought column per user per unique date (as you pointed out, there can be more than one row per date for a user):
library(data.table)
data <- setDT(data)[, lapply(.SD, sum), by = c("user", "date"), .SDcols = "items_bought"]
Next, we compute the rolling sums with rollapplyr, using sum and partial = TRUE to handle the shorter windows at the margins (thanks for the advice @G. Grothendieck), over 3-day intervals. Note that the window is 3 rows wide, which corresponds to 3 days here because, after aggregation, each user has one row per consecutive date:
library(zoo)
data[, cum_items_bought_3_days := lapply(.SD, rollapplyr, 3, sum, partial = TRUE), .SDcols = "items_bought", by = user]
# user date items_bought cum_items_bought_3_days
# 1: x 2013-01-01 2 2
# 2: x 2013-01-02 1 3
# 3: x 2013-01-03 0 3
# 4: x 2013-01-04 0 1
# 5: x 2013-01-05 3 3
# 6: x 2013-01-06 1 4
# 7: y 2013-01-01 1 1
# 8: y 2013-01-02 1 2
# 9: y 2013-01-03 0 2
# 10: y 2013-01-04 5 6
# 11: y 2013-01-05 6 11
# 12: y 2013-01-06 1 12
This is the data set I've used:
data <- structure(list(
  date = structure(c(15706, 15707, 15708, 15709, 15710, 15711,
                     15706, 15707, 15708, 15709, 15710, 15711), class = "Date"),
  user = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L),
                   .Label = c(" x", " y"), class = "factor"),
  items_bought = c(2L, 1L, 0L, 0L, 3L, 1L, 1L, 1L, 0L, 5L, 6L, 1L)),
  .Names = c("date", "user", "items_bought"),
  row.names = c(NA, -12L), class = "data.frame")
Here is a fairly simple method:
# replicate your data, shifting the days ahead by your required window,
# and rbind into a single data frame
d <- do.call(rbind,lapply(0:2, function(x) transform(data,date=date+x)))
# use aggregate to add it together, subsetting out "future" days
aggregate(items_bought~date+user,subset(d,date<=max(data$date)),sum)
date user items_bought
1 2013-01-01 x 2
2 2013-01-02 x 3
3 2013-01-03 x 3
4 2013-01-04 x 1
5 2013-01-05 x 3
6 2013-01-06 x 4
7 2013-01-01 y 1
8 2013-01-02 y 2
9 2013-01-03 y 2
10 2013-01-04 y 6
11 2013-01-05 y 11
12 2013-01-06 y 12
The following looks valid:
unlist(lapply(split(data, data$user),
function(x) {
ave(x$items_bought,
cumsum(c(0, diff(x$date)) >= 3), FUN = cumsum)
}))
#x1 x2 x3 x4 y1 y2 y3 y4
# 2 3 3 4 1 6 6 7
Where data:
data = structure(list(date = structure(c(15706, 15707, 15710, 15711,
15706, 15707, 15710, 15711), class = "Date"), user = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c(" x", " y"), class = "factor"),
items_bought = c(2L, 1L, 3L, 1L, 1L, 5L, 6L, 1L)), .Names = c("date",
"user", "items_bought"), row.names = c(NA, -8L), class = "data.frame")
Here is an approach that doesn't use cumsum but a nested lapply instead. The outer lapply goes over the users; for each user, the inner lapply constructs the desired data frame by summing, for each date, all items bought on that date and the 2 days before it. Note that if data$date were not sorted, it would have to be sorted in ascending order first.
data <- structure(list(
date = structure(c(15706, 15707, 15708, 15709, 15710, 15711,
15706, 15707, 15708, 15709, 15710, 15711), class = "Date"),
user = c("x", "x", "x", "x", "x", "x", "y", "y", "y", "y", "y", "y"),
items_bought = c(2L, 1L, 0L, 0L, 3L, 1L, 1L, 1L, 0L, 5L, 6L, 1L)),
.Names = c("date", "user", "items_bought"),
row.names = c(NA, -12L),
class = "data.frame")
do.call(rbind, lapply(unique(data$user),
function(u) {
subd <- subset(data, user == u)
do.call(rbind, lapply(subd$date,
function(x) data.frame(date = x,
user = u, items_bought =
sum(subd[subd$date %in% (x - 2):x, "items_bought"]))))
}))
Edit
To deal with the issue of having several timestamps for each day (more than 1 row per date), I would first aggregate by summing all items bought within the same day. You can do that e.g. using the built-in function aggregate, but if your data is too large you can also use data.table for speed. I'll call your original data frame (with more than 1 row per date) predata and the aggregated one (1 row per date) data. So by calling
predt <- data.table(predata)
setkey(predt, date, user)
data <- predt[, list(items_bought = sum(items_bought)), by = key(predt)]
you get a data frame containing one row per date, with columns date, user and items_bought. Now, I think the following will be faster than the nested lapply above, but I am not sure since I cannot test it on your data. I am using data.table because it is meant to be fast (if used the right way, which I am not sure this is). The inner loop is replaced by a function f. I do not know if there is a neater way that avoids this function and replaces the double loop with a single data.table call (one possibility is sketched after the code below), or how to write a data.table call that would execute faster.
library(data.table)
dt <- data.table(data)
setkey(dt, user)
f <- function(d, u) {
do.call(rbind, lapply(d$date, function(x) data.frame(date = x,
items_bought = d[date %in% (x - 2):x, sum(items_bought)])))
}
data <- dt[, f(.SD, user), by = user]
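As a sketch of that single data.table call (this assumes a data.table version with non-equi join support, i.e. 1.9.8 or later): a self-join on user and a 3-day date window replaces the double loop.
# sketch: non-equi self-join; window_start is a helper column introduced here
dt2 <- data.table(data)
dt2[, window_start := date - 2]
dt2[dt2,
    on = .(user, date >= window_start, date <= date),
    .(cum_items_bought_3_days = sum(items_bought)),
    by = .EACHI]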
Another way, which doesn't use data.table, assuming that you have enough RAM (again, I don't know the size of your data), is to store items bought 1 day before in a vector, then items bought 2 days before in another vector, etc, and to sum them up in the end. Something like
sumlist <- vector("list", 2) # each element will hold a vector of items
                             # bought 1 or 2 days ago, respectively
for (i in 1:2) {
# tmpstr will be used to find the items that a given user bought i days ago
tmpstr <- paste(data$date - i, data$user, sep = "|")
tmpv <- data$items_bought[
match(tmpstr, paste(data$date, data$user, sep = "|"))]
# if a date is not in the original data, assume no purchases
tmpv[is.na(tmpv)] <- 0
sumlist[[i]] <- tmpv
}
# finally, add up items bought in the past as well as the present day
data$cum_items_bought_3_days <-
rowSums(as.data.frame(sumlist)) + data$items_bought
A final thing I would try is to parallelize the lapply calls, e.g. by using mclapply instead, or by rewriting the code using the parallel functionality of foreach or plyr (a sketch follows below). Depending on the strength of your PC and the size of the task, this may outperform the single-core data.table performance...
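For instance, a sketch of the parallel variant of the nested lapply above (this assumes a Unix-alike, since mclapply relies on forking; on Windows, parallel::parLapply would be needed instead):
library(parallel)
res <- mclapply(unique(data$user), function(u) {
  subd <- subset(data, user == u)
  do.call(rbind, lapply(subd$date, function(x)
    data.frame(date = x, user = u,
               items_bought = sum(subd[subd$date %in% (x - 2):x,
                                       "items_bought"]))))
}, mc.cores = max(1, detectCores() - 1))
do.call(rbind, res)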
It seems like the packages xts and zoo contain functions that do what you want, although you may have the same problems with the size of your actual dataset as with @alexis_laz's answer. Using the functions from the xts answer to this question seems to do the trick.
First I took the code from the answer linked above and made sure it worked for just one user. I include the apply.daily function because I believe, from your edits/comments, that you have multiple observations on some days for some users - I added an extra row to the toy dataset to reflect this.
# Make dataset with two observations for one date for "y" user
dat <- structure(list(
date = structure(c(15706, 15707, 15708, 15709, 15710, 15711,
15706, 15707, 15708, 15709, 15710, 15711, 15711), class = "Date"),
user = c("x", "x", "x", "x", "x", "x", "y", "y", "y", "y", "y", "y", "y"),
items_bought = c(2L, 1L, 0L, 0L, 3L, 1L, 1L, 1L, 0L, 5L, 6L, 1L, 0L)),
.Names = c("date", "user", "items_bought"),
row.names = c(NA, -13L),
class = "data.frame")
# Load xts package (also loads zoo)
require(xts)
# See if this works for one user
dat1 = subset(dat, user == "y")
# Create "xts" object for use with apply.daily()
dat1.1 = xts(dat1$items_bought, dat1$date)
dat2 = apply.daily(dat1.1, sum)
# Now use rollapply with a 3-day window
# The "partial" argument appears to only work with zoo objects, not xts
sum.itemsbought = rollapply(zoo(dat2), 3, sum, align = "right", partial = TRUE)
I thought the output could look nicer (more like example output from your question). I haven't worked with zoo objects much, but the answer to this question gave me some pointers for putting the info into a data.frame.
data.frame(Date=time(sum.itemsbought), sum.itemsbought, row.names=NULL)
Once I had this worked out for one user, it was straightforward to expand this to the entire toy dataset. This is where speed could become an issue. I use lapply and do.call for this step.
allusers = lapply(unique(dat$user), function(x) {
dat1 = dat[dat$user == x,]
dat1.1 = xts(dat1$items_bought, dat1$date)
dat2 = apply.daily(dat1.1, sum)
sum.itemsbought = rollapply(zoo(dat2), 3, sum, align = "right", partial = TRUE)
data.frame(Date=time(sum.itemsbought), user = x, sum.itemsbought, row.names=NULL)
} )
do.call(rbind, allusers)
I like James' answer better, but here's an alternative:
with(data,{
sapply(split(data,user),function(x){
sapply(x$date,function(y) sum(x$items_bought[x$date %in% c(y,y-1,y-2)]))
})
})