Age groups into monthly buckets - r

I'm struggling to find a solution for the following problem. I have a df with ids/dobs and another month-bucket df, as follows:
set.seed(33)
df <- data.frame(dob = sample(seq(as.Date('1940/01/01'), as.Date('2010/01/01'), by = "day"), 10),
                 id = seq(1:10))
monthbucket <- data.frame(month = format(seq(as.Date("2010-01-01"), as.Date("2011-01-01"), by = "months"), '%Y-%m'),
                          startmonth = seq(as.Date("2010-01-01"), as.Date("2011-01-01"), by = "months"),
                          endmonth = seq(as.Date("2010-02-01"), as.Date("2011-02-01"), by = "months") - 1)
I want to get an output which gives me the count of members within age groups (<19, 19-64, >64) for each of my monthly buckets. The count obviously switches over the year when people have birthdays.
I got the age calculation working with something like this (interval() and as.period() are from lubridate):
age.fct <- function(dob, bucketdate) {
  period <- as.period(interval(dob, bucketdate), unit = "year")
  period$year
}
I guess the general approach would be to calculate the age for each month bucket, assign it to one of the three age groups (<19, 19-64, >64), and count by month. Any suggestions?
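For concreteness, this is roughly what I had in mind, as a minimal, untested sketch (the break points and labels are just for illustration):
library(lubridate)  # interval()/as.period() used inside age.fct()
counts_by_month <- sapply(seq_len(nrow(monthbucket)), function(i) {
  age <- age.fct(df$dob, monthbucket$startmonth[i])                 # completed years at bucket start
  table(cut(age, c(-Inf, 18, 64, Inf), c("<19", "19-64", ">64")))   # bin into the three groups
})
colnames(counts_by_month) <- monthbucket$month
counts_by_month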
EDIT 1.
Thanks for all the different approaches. I just ran a brief benchmark on the solutions to determine which answer to accept. Somehow the data.table solution didn't work on my test data set, but I will check it as soon as I have a few minutes in the next few days.
set.seed(33)
df <- data.frame(dob = sample(seq(as.Date('1940/01/01'), as.Date('2010/01/01'), by = "day"), 10000),
                 id = seq(1:10000))
monthbucket <- data.frame(month = format(seq(as.Date("2010-01-01"), as.Date("2011-01-01"), by = "months"), '%Y-%m'),
                          startmonth = seq(as.Date("2010-01-01"), as.Date("2011-01-01"), by = "months"),
                          endmonth = seq(as.Date("2010-02-01"), as.Date("2011-02-01"), by = "months") - 1)
birth_days <- df$dob
month_bucket <- monthbucket$startmonth
and the benchmark
microbenchmark::microbenchmark(
MM= monthbucket %>% group_by_all %>% expand(id=df$id) %>% left_join(.,{df %>% mutate(birth_month =cut(dob, "month"))},by="id") %>% mutate(age=time_length(difftime(startmonth, birth_month),"years")) %>%
mutate(age_cat=case_when(age<19 ~ "<19", age>64 ~ ">64",TRUE ~ "19-64")) %>% group_by(month) %>% count(age_cat) %>% gather(variable, count, n) %>%
unite(variable, age_cat) %>% spread(variable, count)
,
AkselA = {ages <- as.data.frame(t(unclass(outer(monthbucket$startmonth, df$dob, "-")/365.25)))
ages <- do.call(data.frame, lapply(ages, cut, c(0, 19, 64, Inf), c("0-19", "19-64", "64+")))
ages <- sapply(ages, table)
colnames(ages) <- monthbucket$month
},
Cole1 ={t(table(apply(X = outer(month_bucket, birth_days, `-`) / 365.25, MARGIN = 2, FUN = cut, c(0,19,65, Inf)), rep(format(month_bucket,'%Y-%m'), length(birth_days))))
},
# cole2={ cast(CJ(month_bucket, birth_days)[, .N, by = .(month_bucket , cut(as.numeric(month_bucket - birth_days)/365.25, c(0,19,65,Inf)))], month_bucket ~ cut, value.var = 'N')
# },
#
Cole3={crossing(month_bucket, birth_days)%>%count(month_bucket, age_range = cut(as.numeric(month_bucket - birth_days) / 365.25, c(0,19,65,Inf)))%>%spread(age_range, n)
},
Cole4={all_combos <- expand.grid(month_bucket = month_bucket, birth_days = birth_days)
all_combos$age <- as.numeric(all_combos$month_bucket - all_combos$birth_days) / 365.25
all_combos$cut_r <- cut(all_combos$age, c(0,19,65,Inf))
reshape(data = aggregate( all_combos$month_bucket, by = list(bucket = all_combos$month_bucket,age_group = all_combos$cut_r), FUN = length), timevar = 'age_group' , idvar = 'bucket', direction = 'wide' )
},
times = 1L)
Unit: milliseconds
expr min lq mean median uq max neval
MM 4249.02810 4249.02810 4249.02810 4249.02810 4249.02810 4249.02810 1
AkselA 17.12697 17.12697 17.12697 17.12697 17.12697 17.12697 1
Cole1 3237.94534 3237.94534 3237.94534 3237.94534 3237.94534 3237.94534 1
Cole3 23.63945 23.63945 23.63945 23.63945 23.63945 23.63945 1
Cole4 877.92782 877.92782 877.92782 877.92782 877.92782 877.92782 1
Based on speed, AkselA's approach seems to be the fastest, but I get a different result for M-M's approach compared to all the others (once the 64 in AkselA's cut breaks c(0, 19, 64, Inf) is changed to 65). I will accept an answer based on speed but will look into the differences in the results!
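(A small illustration of why that break point matters: with an upper break of 64, someone aged 64.5 lands in the oldest bucket, while with 65 they stay in the middle one.)
cut(64.5, c(0, 19, 64, Inf))   # (64,Inf]  -> counted as >64
cut(64.5, c(0, 19, 65, Inf))   # (19,65]   -> counted as 19-64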

Not very sophisticated, but I joined the two tables (after expanding monthbucket on df$id) and then calculated the age (since you work with whole months, I just took the difftime between the first day of the birth month and startmonth). Then, for each month bucket, I counted the number of people in each age group, and at the end converted from long to wide format for better readability.
library(lubridate)
library(tidyverse)
monthbucket %>%
  group_by_all %>%
  expand(id = df$id) %>%
  left_join(., {df %>% mutate(birth_month = cut(dob, "month"))}, by = "id") %>%
  mutate(age = time_length(difftime(startmonth, birth_month), "years")) %>%
  mutate(age_cat = case_when(age < 19 ~ "<19",
                             age > 64 ~ ">64",
                             TRUE ~ "19-64")) %>%
  group_by(month) %>%
  count(age_cat) %>%
  gather(variable, count, n) %>%
  unite(variable, age_cat) %>%
  spread(variable, count)
#> # A tibble: 13 x 4
#> # Groups: month [13]
#> month `<19` `>64` `19-64`
#> <fct> <int> <int> <int>
#> 1 2010-01 3 2 5
#> 2 2010-02 3 2 5
#> 3 2010-03 3 2 5
#> 4 2010-04 3 2 5
#> 5 2010-05 3 2 5
#> 6 2010-06 3 2 5
#> 7 2010-07 3 2 5
#> 8 2010-08 3 2 5
#> 9 2010-09 3 2 5
#> 10 2010-10 3 2 5
#> 11 2010-11 3 2 5
#> 12 2010-12 3 2 5
#> 13 2011-01 3 2 5
Created on 2019-07-03 by the reprex package (v0.3.0)

Assuming I understand your request.
ages <- as.data.frame(t(unclass(outer(monthbucket$startmonth, df$dob, "-")/365.25)))
ages <- do.call(data.frame,
lapply(ages, cut, c(0, 19, 64, Inf), c("0-19", "19-64", "64+")))
ages <- sapply(ages, table)
colnames(ages) <- monthbucket$month
ages
# 2010-01 2010-02 2010-03 2010-04 2010-05 2010-06 2010-07 2010-08 2010-09 2010-10 2010-11 2010-12 2011-01
# 0-19 2 2 2 2 2 2 2 2 2 2 2 2 2
# 19-64 7 7 7 7 7 7 7 7 7 7 7 7 7
# 64+ 1 1 1 1 1 1 1 1 1 1 1 1 1
#
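In case the outer() step looks opaque: it subtracts every date of birth from every bucket start, giving a buckets-by-people matrix of day differences, and dividing by 365.25 turns those into approximate ages in years. A small illustration using the objects from the question:
ages_yrs <- unclass(outer(monthbucket$startmonth, df$dob, "-")) / 365.25
dim(ages_yrs)                  # 13 bucket starts x 10 people
round(ages_yrs[1:2, 1:3], 1)   # ages of the first three people at the first two bucket starts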

There are some similarities to @AkselA's answer, as this also depends on outer(), cut(), and table().
set.seed(33)
birth_days <- sample(seq(as.Date('1940/01/01'), as.Date('2010/01/01'), by="day"), 10)
month_bucket <- seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months")
t(
table(
apply(
X = outer(month_bucket, birth_days, `-`) / 365.25
, MARGIN = 2
, FUN = cut, c(0,19,65, Inf)
)
, rep(format(month_bucket,'%Y-%m'), length(birth_days))
)
)
(0,19] (19,65] (65,Inf]
2010-01 2 7 1
2010-02 2 7 1
2010-03 2 7 1
2010-04 2 7 1
2010-05 2 7 1
2010-06 2 7 1
2010-07 2 7 1
2010-08 2 7 1
2010-09 2 7 1
2010-10 2 7 1
2010-11 2 7 1
2010-12 2 7 1
2011-01 2 7 1
I felt weird having such a similar solution so here is data.table:
library(data.table)
dcast(
CJ(month_bucket, birth_days
)[, .N
, by = .(month_bucket
, cut(as.numeric(month_bucket - birth_days)/365.25, c(0,19,65,Inf)))
]
, month_bucket ~ cut
, value.var = 'N')
dplyr and tidyr:
library(dplyr)
library(tidyr)
crossing(month_bucket, birth_days)%>%
count(month_bucket
, age_range = cut(as.numeric(month_bucket - birth_days) / 365.25, c(0,19,65,Inf))
)%>%
spread(age_range, n)
And a similar approach in base that I'm not completely happy with.
all_combos <- expand.grid(month_bucket = month_bucket, birth_days = birth_days)
all_combos$age <- as.numeric(all_combos$month_bucket - all_combos$birth_days) / 365.25
all_combos$cut_r <- cut(all_combos$age, c(0,19,65,Inf))
reshape(
data = aggregate(
all_combos$month_bucket
, by = list(bucket = all_combos$month_bucket
,age_group = all_combos$cut_r)
, FUN = length)
, timevar = 'age_group'
, idvar = 'bucket'
, direction = 'wide'
)
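One small follow-up note: aggregate() names the counted column x, so reshape() spreads it into columns like x.(0,19]. If the result is assigned (say to wide, which the snippet above does not do), the prefix can be stripped afterwards:
# assumes the reshape() result above was assigned to `wide`
names(wide) <- sub("^x\\.", "", names(wide))
wide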

Related

remove rows with overlapping dates and keep longest time interval in R using dplyr or sqldf

I need to remove rows with overlapped dates and keep the x value which is maximum among the overlapped dates. Here is a data frame:
data.frame(time_left = c("2011-08-05",
"2011-07-25",
"2017-08-20",
"2017-08-20",
"2017-10-09",
"2019-06-01"),
time_right= c("2011-09-14",
"2011-09-01",
"2017-09-12",
"2017-09-26",
"2017-10-15",
"2019-11-05"),
x = c(114,20,10,1,5,100) ) -> df
so my input is:
time_left time_right x
1 2011-08-05 2011-09-14 114
2 2011-07-25 2011-09-01 20
3 2017-08-20 2017-09-12 10
4 2017-08-20 2017-09-26 1
5 2017-10-09 2017-10-15 5
6 2019-06-01 2019-11-05 100
and my desired output is:
time_left time_right x
1 2011-08-05 2011-09-14 114
2 2011-07-25 2011-09-01 20
4 2017-08-20 2017-09-26 10
5 2017-10-09 2017-10-15 5
6 2019-06-01 2019-11-05 100
I appreciate any help.
@Maël brought this issue to my attention over on the ivs issue page: https://github.com/DavisVaughan/ivs/issues/20.
I think this can be very elegantly and efficiently solved with ivs, but it is a bit hard to come up with the solution, so I'll probably add a helper to do this more easily.
This solution works with "recursive" containers too, i.e. where range A contains range B, but then range C also contains range A, so you really only want to list range C. I've described this in more detail with examples here https://github.com/DavisVaughan/ivs/issues/20#issuecomment-1234479783.
library(ivs)
library(dplyr)
library(vctrs)
df <- tibble(
time_left = as.Date(c(
"2011-08-05", "2011-07-25", "2017-08-20",
"2017-08-20", "2017-10-09", "2019-06-01"
)),
time_right = as.Date(c(
"2011-09-14", "2011-09-01", "2017-09-12",
"2017-09-26", "2017-10-15", "2019-11-05"
)),
x = c(114, 20, 10, 1, 5, 100)
)
df <- df %>%
mutate(range = iv(time_left, time_right), .keep = "unused")
df
#> # A tibble: 6 × 2
#> x range
#> <dbl> <iv<date>>
#> 1 114 [2011-08-05, 2011-09-14)
#> 2 20 [2011-07-25, 2011-09-01)
#> 3 10 [2017-08-20, 2017-09-12)
#> 4 1 [2017-08-20, 2017-09-26)
#> 5 5 [2017-10-09, 2017-10-15)
#> 6 100 [2019-06-01, 2019-11-05)
iv_locate_max_containment <- function(x) {
# Find all locations where the range "contains" any other range
# (including itself)
locs <- iv_locate_overlaps(x, x, type = "contains")
# Find the "top" ranges, i.e. the containers that aren't contained
# by any other containers
top <- !vec_duplicate_detect(locs$haystack)
top <- vec_slice(locs$haystack, top)
top <- vec_in(locs$needles, top)
locs <- vec_slice(locs, top)
locs
}
# i.e. row 4 "contains" rows 3 and 4
locs <- iv_locate_max_containment(df$range)
locs
#> needles haystack
#> 1 1 1
#> 2 2 2
#> 3 4 3
#> 4 4 4
#> 5 5 5
#> 6 6 6
iv_align(df$range, df$x, locations = locs) %>%
rename(range = needles) %>%
group_by(range) %>%
summarise(x = max(haystack))
#> # A tibble: 5 × 2
#> range x
#> <iv<date>> <dbl>
#> 1 [2011-07-25, 2011-09-01) 20
#> 2 [2011-08-05, 2011-09-14) 114
#> 3 [2017-08-20, 2017-09-26) 10
#> 4 [2017-10-09, 2017-10-15) 5
#> 5 [2019-06-01, 2019-11-05) 100
Created on 2022-09-01 with reprex v2.0.2
This may be a little verbose, but it could also be a solution:
First we identify those observations that are potentially overlapped.
Then we group the similar ones.
In each group we choose the minimum time_left and maximum time_right and x.
library(tidyverse)
df %>%
mutate(across(starts_with('time'), ymd),
intv = interval(time_left, time_right),
id = row_number()) %>%
mutate(id2 = map2(intv, id, ~ if (any(.x %within% intv[intv != .x])) {
id[which(.x %within% intv[intv != .x]) + 1]
} else {
.y
})) %>%
group_by(id2) %>%
summarise(time_left = min(time_left),
across(c(time_right, x), max)) %>%
select(!(id2))
# A tibble: 4 × 3
time_left time_right x
<date> <date> <dbl>
1 2011-08-05 2011-09-14 114
2 2017-08-20 2017-09-26 10
3 2017-10-09 2017-10-15 5
4 2019-06-01 2019-11-05 100
I combined Anoushiravan's solution with this question:
How do I determine in R if a date interval overlaps another date interval for the same individual in a data frame?
and I think it is working now.
df %>%
mutate(id = row_number(), days = as.numeric(as.Date(time_right) - as.Date(time_left)) ) %>%
mutate(Int = interval(time_left, time_right),
within = map(seq_along(Int), function(x){
y = setdiff(seq_along(Int), x)
if(any(id[which((Int[x] %within% Int[y]))+1])){
return(id[days == max(days[which((Int[x] %within% Int[y]))+1])])
}else{ return(0)}
})
) %>%
mutate(within = ifelse(within > 0 , within, id)) %>%
group_by(within) %>%
summarise(time_left = min(time_left), time_right = max(time_right), x = max(x)) %>%
select(!within)
But it still has some bugs. For the following df, this code will not work unless I change the order of the records:
df = data.frame(time_left = c("2014-01-01", "2014-01-01", "2014-12-01", "2014-12-26"),
time_right = c("2014-04-23", "2014-12-31", "2014-12-31", "2014-12-31"),
x = c(10,100,200,20))

manipulate data with multiple values in the cell in r

I have a data set that looks like this:
universityies <- c("UNI.1;UNI.1;UNI.2;UNI.3","UNI.5", "UNI.3;UNI.4" )
papers <- c(1,1,1)
cited <- c(10,5,20)
df <- data.frame(universityies, papers, cited )
df
and I want to get something like
#total papers total cited
#UNI.1 1 10
#UNI.2 1 10
#UNI.3 2 30
#UNI.4 1 20
#UNI.5 1 5
many thanks in advance,
We can split the data on ";", get unique rows, group_by universityies, count the distinct papers, and sum the citations.
library(dplyr)
df %>%
mutate(row = row_number()) %>%
tidyr::separate_rows(universityies, sep = ";") %>%
distinct() %>%
group_by(universityies) %>%
summarise(total_papers = n_distinct(row),
total_cited = sum(cited))
# universityies total_papers total_cited
# <chr> <int> <dbl>
#1 UNI.1 1 10
#2 UNI.2 1 10
#3 UNI.3 2 30
#4 UNI.4 1 20
#5 UNI.5 1 5
We can use cSplit from splitstackshape and data.table methods
library(data.table)
library(splitstackshape)
unique(cSplit(setDT(df, keep.rownames = TRUE), "universityies", ";",
"long"))[, .(total_papers = uniqueN(rn), total_cited = sum(cited)),.(universityies)]
# universityies total_papers total_cited
#1: UNI.1 1 10
#2: UNI.2 1 10
#3: UNI.3 2 30
#4: UNI.5 1 5
#5: UNI.4 1 20
You may use strsplit in a first step, then aggregate
tmp <- do.call(rbind, apply(df, 1, function(x)
setNames(data.frame(strsplit(x[1], ";"), as.numeric(x[2]), as.numeric(x[3]),
row.names=NULL, stringsAsFactors=FALSE), names(df))))
res <- aggregate(cbind(total.papers=papers, total.cited=cited) ~ universityies,
unique(tmp), sum)
res[order(res$universityies), ]
# universityies total.papers total.cited
# 1 UNI.1 1 10
# 2 UNI.2 1 10
# 3 UNI.3 2 30
# 4 UNI.4 1 20
# 5 UNI.5 1 5

Create ranges by accumulating values

I have the DF1:
KEY <- c(11,12,22,33,44,55,66,77,88,99,1010,1111,1212,1313,1414,1515,1616,1717,1818,1919,2020)
PRICE <- c(0,0,1,5,7,10,20,80,110,111,200,1000,2500,2799,3215,4999,7896,8968,58914,78422,96352)
DF1 <- data.frame(KEY,PRICE)
I want to group DF1 into ranges, accumulating the values of the two columns (count the KEY column and sum the PRICE column). This is the result I hope for:
INTERVAL <-c('0','UP_TO_10','UP_TO_100','UP_TO_1000','UP_TO_5000','UP_TO_10000','UP_TO_100000')
COUNT_KEY <-c(2,6,8,12,16,18,21)
SUM_PRICE <- c(0,23,123,1544,15057,31921,265609)
DF2 <- data.frame(INTERVAL,COUNT_KEY,SUM_PRICE)
How do I make this table?
If you have a vector of limits or thresholds, such as:
LIMITS <- c(0, 10, 100, 1000, 5000, 10000, 100000)
You could obtain a count of rows where PRICE is below each limit:
unlist(lapply(LIMITS, function(x) sum(DF1$PRICE <= x)))
[1] 2 6 8 12 16 18 21
And a sum of these prices as well:
unlist(lapply(LIMITS, function(x) sum(DF1$PRICE[DF1$PRICE <= x])))
[1] 0 23 123 1544 15057 31921 265609
Is this what you had in mind?
This is everything all together:
LIMITS <- c(0, 10, 100, 1000, 5000, 10000, 100000)
COUNT_KEY <- unlist(lapply(LIMITS, function(x) sum(DF1$PRICE <= x)))
SUM_PRICE <- unlist(lapply(LIMITS, function(x) sum(DF1$PRICE[DF1$PRICE <= x])))
data.frame(INTERVAL = c(0, paste("UP_TO", LIMITS[-1], sep="_")), COUNT_KEY, SUM_PRICE)
INTERVAL COUNT_KEY SUM_PRICE
1 0 2 0
2 UP_TO_10 6 23
3 UP_TO_100 8 123
4 UP_TO_1000 12 1544
5 UP_TO_5000 16 15057
6 UP_TO_10000 18 31921
7 UP_TO_100000 21 265609
You have to manually define your boundaries first:
X = c(-Inf,0,10,100,1000,5000,10000,100000)
Then use cut to assign entries to your labels. We first summarize the counts and total price within each interval.
library(dplyr)
DF1 %>%
mutate(LABELS = cut(DF1$PRICE,X,INTERVAL,include.lowest =TRUE)) %>%
group_by(LABELS) %>%
summarise(COUNT_KEY=n(),SUM_PRICE=sum(PRICE))
# A tibble: 7 x 3
LABELS COUNT_KEY SUM_PRICE
<fct> <int> <dbl>
1 0 2 0
2 UP_TO_10 4 23
3 UP_TO_100 2 100
4 UP_TO_1000 4 1421
5 UP_TO_5000 4 13513
6 UP_TO_10000 2 16864
7 UP_TO_100000 3 233688
This is close to what you want, except the sum_price and counts should be cumulative. This can be achieved by adding mutate_if(is.numeric, cumsum):
DF1 %>%
mutate(LABELS = cut(DF1$PRICE,X,INTERVAL,include.lowest =TRUE)) %>% group_by(LABELS) %>%
summarise(COUNT_KEY=n(),SUM_PRICE=sum(PRICE)) %>%
mutate_if(is.numeric,cumsum)
To give:
# A tibble: 7 x 3
LABELS COUNT_KEY SUM_PRICE
<fct> <int> <dbl>
1 0 2 0
2 UP_TO_10 6 23
3 UP_TO_100 8 123
4 UP_TO_1000 12 1544
5 UP_TO_5000 16 15057
6 UP_TO_10000 18 31921
7 UP_TO_100000 21 265609
Okay, here's an all-in-one, tidy way to handle this using dplyr ;)
library(dplyr)
DF1 %>%
mutate(
INTERVAL =
factor(
case_when( # create discrete variable
PRICE == 0 ~ '0',
PRICE <= 10 ~ 'UP_TO_10',
PRICE <= 100 ~ 'UP_TO_100',
PRICE <= 1000 ~ 'UP_TO_1000',
PRICE <= 5000 ~ 'UP_TO_5000',
PRICE <= 10000 ~ 'UP_TO_10000',
PRICE <= 100000 ~ 'UP_TO_100000'
),
levels = # set the factor levels
c(
'0',
'UP_TO_10',
'UP_TO_100',
'UP_TO_1000',
'UP_TO_5000',
'UP_TO_10000',
'UP_TO_100000'
)
)
) %>%
group_by(INTERVAL) %>% # create desired group
summarise( # and summary variables
COUNT_KEY = n(),
SUM_PRICE = sum(PRICE)
) %>%
mutate( # cumulative totals
COUNT_KEY_CUM = cumsum(COUNT_KEY),
SUM_PRICE_CUM = cumsum(SUM_PRICE)
)

break long characters by "-" and identify the unique components by group

Suppose you have a transaction data set with products purchased by customers. You know the total skus (which unique products they bought in each order), but you want to know the unique skus over the lifetime of the user. Say I buy "apple" for the first time ever today; then apple is a NEW sku. But when I later buy "apple" and "banana" again, apple is not a new sku but banana is (if it's a first-time purchase).
data
user_id<-c(1,1,1,2,3,4,4)
order_date<-c("2/9/2016",
"11/19/2015",
"12/30/2016",
"9/27/2016",
"12/10/2016",
"11/5/2016",
"1/1/2017")
sku<-c("262-264-280","280-123","510","6251-16990","9227-14572","9227-14572","280")
dt<-data.frame(user_id,order_date,sku)
Output
Update: I originally typed "order_id" where I meant "user_id".
A data.table possibility, using the strings split by -, and checking for new unique values in each row via a set union and Reduce(..., accumulate=TRUE). The count of new values is then a difference between each successive row:
library(data.table)
setDT(dt)
dt[, sku := as.character(sku)]
dt[,
total := lengths(Reduce(union, strsplit(sku, "\\-"), accumulate=TRUE)),
by=user_id
]
dt[, new := c(total[1], diff(total)), by=user_id]
dt
# user_id order_date sku total new
#1: 1 2/9/2016 262-264-280 3 3
#2: 1 11/19/2015 280-123 4 1
#3: 1 12/30/2016 510 5 1
#4: 2 9/27/2016 6251-16990 2 2
#5: 3 12/10/2016 9227-14572 2 2
#6: 4 11/5/2016 9227-14572 2 2
#7: 4 1/1/2017 280 3 1
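To see what the Reduce(union, ..., accumulate = TRUE) step produces before lengths() counts it, here it is applied to user 1's skus alone (illustrative only):
Reduce(union, strsplit(c("262-264-280", "280-123", "510"), "-"), accumulate = TRUE)
# [[1]] "262" "264" "280"
# [[2]] "262" "264" "280" "123"
# [[3]] "262" "264" "280" "123" "510"
# lengths() of this list is 3 4 5, the running totals in the `total` column above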
A possible tidyverse way:
library(dplyr)
library(tidyr)
dt %>%
separate_rows(sku, sep = "-") %>%
mutate(order_date = as.Date(as.character(order_date), "%m/%d/%Y")) %>%
group_by(user_id, sku) %>%
arrange(user_id, order_date) %>%
mutate(idx = row_number() * (row_number() == 1)) %>%
group_by(user_id, order_date) %>%
summarise(sku = paste(sku, collapse = "-"),
          `number of new sku purchase` = sum(idx)) %>%
group_by(user_id) %>%
mutate(`total number of sku` = cumsum(`number of new sku purchase`))
Here's a base solution that is similar to @thelatemail's:
dt$sku <- as.character(dt$sku)
dt$n_skus <- ave(dt$sku
, dt$user_id
, FUN = function (sku_f) {
sapply(
Reduce(union, strsplit(sku_f, '-', fixed = T), accumulate = T)
, length)
})
dt$n_skus <- as.integer(dt$n_skus)
dt$n_new <- ave(dt$n_skus
, dt$user_id
, FUN = function(n) c(n[1], diff(n)))
dt
user_id order_date sku n_skus n_new
1 1 2/9/2016 262-264-280 3 3
2 1 11/19/2015 280-123 4 1
3 1 12/30/2016 510 5 1
4 2 9/27/2016 6251-16990 2 2
5 3 12/10/2016 9227-14572 2 2
6 4 11/5/2016 9227-14572 2 2
7 4 1/1/2017 280 3 1
And here's a crazy, inefficient data.table solution that doesn't include the original sku field:
library(data.table)
setDT(dt)
dt[, strsplit(as.character(sku), '-'), by = .(user_id, order_date)
][, .SD[!duplicated(V1), .(n_new = .N), by = order_date], by = .(user_id)
][, n_total := cumsum(n_new), by = .(user_id)][]
Performance
@thelatemail's should scale up the best.
Unit: microseconds
expr min lq mean median uq max neval
argonaut_dplyr 10020.601 10247.851 10720.0709 10474.451 10770.751 16021.3 100
thelatemail_dt 1954.501 2072.101 2385.8019 2303.001 2436.202 5807.0 100
#base is always fastest with 7 rows, it would be outstripped by #thelatemail very quickly
cole_base 651.501 751.601 834.0299 772.201 794.752 6309.2 100
cole_bad_dt 7006.400 7355.200 7757.9991 7668.401 7958.651 12708.0 100
purrr_A_sul 14575.501 14862.552 15566.4809 15033.201 15401.601 33263.3 100
Here is one option: it uses unlist(strsplit(dt$sku, '-')) to split the current sku and then compares it with the previously seen skus.
library(dplyr)
library(purrr)
library(tidyr)
dt %>%
nest(-user_id) %>%
mutate(NNSP = map(data, ~map_dbl(1:length(.x$sku), function(y) {
#browser()
ynow <- unlist(strsplit(as.character(.x$sku)[y],'-'))
yprev <- unique(unlist(strsplit(as.character(.x$sku[1:y-1]),'-')))
length(ynow[!ynow %in% yprev])
})),
TotNNSP = map(NNSP, ~cumsum(.x))) %>%
unnest()
# A tibble: 7 x 5
user_id NNSP TotNNSP order_date sku
<dbl> <dbl> <dbl> <fct> <fct>
1 1 3 3 2/9/2016 262-264-280
2 1 1 4 11/19/2015 280-123
3 1 1 5 12/30/2016 510
4 2 2 2 9/27/2016 6251-16990
5 3 2 2 12/10/2016 9227-14572
6 4 2 2 11/5/2016 9227-14572
7 4 1 3 1/1/2017 280
Using @thelatemail's Reduce and lengths combination, we can do:
library(dplyr)
setdiff2 <- function(x, y) y[!y %in% x]
dt %>%
group_by(user_id) %>%
mutate(sku = as.character(sku),
NNSP = lengths(Reduce(setdiff2, strsplit(sku,'-'), accumulate = TRUE)),
Tot_NNSP = cumsum(NNSP))

sum by group including intermediate groups

I have:
df <- data.frame(group=c(1,1,2,4,4,5), value=c(3,1,5,2,3,6))
aggregate(value ~ group, data = df, FUN = 'sum')
group value
1 1 4
2 2 5
3 4 5
4 5 6
Is there a way to include intermediate groups to return the below? I realise this could be done by creating a data frame with all the desired groups and matching in the results from aggregate(), but I am hoping there is a cleaner way to do this. It would need to be as fast as using aggregate and only use base R packages, due to restrictions in my workplace.
group value
1 1 4
2 2 5
3 3 0
4 4 5
5 5 6
You can try this:
library(tidyr)
library(dplyr)
df %>%
  mutate(group = factor(group, 1:5)) %>%
  complete(group) %>%
  group_by(group) %>%
  dplyr::summarise(value = sum(value, na.rm = TRUE))
group value
<fctr> <dbl>
1 1 4
2 2 5
3 3 0
4 4 5
5 5 6
You can do this easily with the tidyverse:
library(dplyr)
library(tidyr)
df %>%
group_by(group) %>%
summarise(valuesum = sum(value)) %>%
full_join(., expand(df, group = 1:5)) %>%
complete(group, fill = list(valuesum = 0))
The result:
# A tibble: 5 x 2
group valuesum
<dbl> <dbl>
1 1 4
2 2 5
3 3 0
4 4 5
5 5 6
Or a bit more difficult to understand with data.table:
library(data.table)
setDT(df)[.(group = 1:5), on = 'group', sum(value, na.rm = TRUE), by = .EACHI]
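If the data.table one-liner reads as cryptic, here is the exact same call spelled out with comments (no change in behaviour):
library(data.table)
setDT(df)[
  .(group = 1:5),             # i: a one-column table of every group 1..5, joined onto df
  on = 'group',               # join key
  sum(value, na.rm = TRUE),   # groups with no match contribute NA; na.rm makes that 0
  by = .EACHI                 # evaluate the sum once per row of i, i.e. per group
]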
You can use merge from base R. I've changed the name of your data.frame to dat, since df is the name of an R function.
dat <- read.table(text = "
group value
1 4
2 5
4 5
5 6
", header = TRUE)
str(dat)
res <- aggregate(value ~ group, data = dat, FUN = 'sum')
merge(res, data.frame(group = seq(from = min(res$group), to = max(res$group))), all = TRUE)
Note that there will be an NA, not a zero. I believe you should solve that by leaving it as a missing value.
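If a zero is preferred over the NA (my addition; the answer above deliberately keeps the missing value), it can be filled in after the merge:
out <- merge(res, data.frame(group = seq(from = min(res$group), to = max(res$group))), all = TRUE)
out$value[is.na(out$value)] <- 0
out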
