data.table alternative for slow group_by() and case_when() function - r

In my data I have customer IDs, order dates and an indicator whether an order contained a certain type of product.
I want to flag each customer according to whether his first order contained this type of product. But because my data is pretty big, I cannot use group_by and case_when; it is way too slow. I think I could speed things up a lot by using data.table.
Could you point me to a solution? I haven't had any contact with data.table until now...
# generate data
id <- round(rnorm(3000, mean = 5000, sd = 400), 0)
date <- seq.Date(as.Date("2018-01-01"), as.Date("2018-12-31"), "day")
date <- sample(date, length(id), replace = TRUE)
indicator <- rbinom(length(id), 1, 0.5)
df <- data.frame(id, date, indicator)
df$id <- as.factor(df$id)
# Does the first order contain X?
library(dplyr)
df <- df %>%
  group_by(id) %>%
  mutate(First_Order_contains_x = case_when(
    date == min(date) & indicator == 1 ~ 1,
    TRUE ~ 0
  )) %>%
  ungroup()
# If the first order contains X ==> all orders of that customer get flagged
df <- df %>%
  group_by(id) %>%
  mutate(Customer_type = case_when(
    sum(First_Order_contains_x) > 0 ~ "Customer with X in first order",
    TRUE ~ "Customer without x in first order"
  )) %>%
  ungroup()

Another way:
library(data.table)
DT = data.table(df[, 1:3])
lookupDT = DT[, .(date = min(date)), by=id]
lookupDT[, fx := DT[copy(.SD), on=.(id, date), max(indicator), by=.EACHI]$V1]
DT[, v := "Customer without x in first order"]
DT[lookupDT[fx == 1L], on=.(id), v := "Customer with X in first order"]
# check results
fsetequal(DT[, .(id, v)], data.table(id = df$id, v = df$Customer_type))
# [1] TRUE
If you want more speed improvements, maybe see ?IDate.
The copy on .SD is needed due to an open issue.
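For instance, a minimal sketch of the IDate idea (an assumption on my part that your dates are plain Date values; as.IDate stores them as integers, which tends to make grouping and joining on them cheaper):
library(data.table)
DT = data.table(df[, 1:3])
DT[, date := as.IDate(date)]                    # integer-backed dates
lookupDT = DT[, .(date = min(date)), by = id]   # same lookup as above, now on IDate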

Here's how you can improve your existing code using dplyr more efficiently:
lookup = data.frame(First_Order_contains_x = c(TRUE, FALSE),
                    Customer_Type = c("Customer with X in first order",
                                      "Customer without x in first order"))
df %>%
  group_by(id) %>%
  mutate(First_Order_contains_x = any(date == min(date) & indicator == 1)) %>%
  ungroup() %>%
  left_join(lookup, by = "First_Order_contains_x")
# A tibble: 3,000 x 5
id date indicator First_Order_contains_x Customer_Type
<fct> <date> <dbl> <lgl> <fct>
1 5056 2018-03-10 1 TRUE Customer with X in first order
2 5291 2018-12-28 0 FALSE Customer without x in first order
3 5173 2018-04-19 0 FALSE Customer without x in first order
4 5159 2018-11-13 0 TRUE Customer with X in first order
5 5252 2018-05-30 0 TRUE Customer with X in first order
6 5200 2018-01-20 0 FALSE Customer without x in first order
7 4578 2018-12-18 1 FALSE Customer without x in first order
8 5308 2018-03-24 1 FALSE Customer without x in first order
9 5234 2018-05-29 1 TRUE Customer with X in first order
10 5760 2018-06-12 1 TRUE Customer with X in first order
# … with 2,990 more rows

Another data.table approach. Sort the data first so that the first row of each id holds the earliest date; we can then use the first indicator to test the condition. Then convert the logical to an integer (FALSE -> 1 and TRUE -> 2) and map it onto the desired output using a character vector.
library(data.table)
setDT(df)
setorder(df, id, date)
map <- c("Customer without x in first order", "Customer with X in first order")
df[, idx := 1L+any(indicator[1L]==1L), by=.(id)][,
First_Order_contains_x := map[idx]]
If the original order is important, we can store the original row order with df[, rn := .I] before sorting and finally restore it with setorder(df, rn), as in the sketch below.
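A minimal sketch of that bookkeeping (just restating the idea above; assumes df is the data.frame from the data block below):
library(data.table)
setDT(df)
df[, rn := .I]              # remember the original row order
setorder(df, id, date)      # sort for the by-group logic above
# ... compute idx / First_Order_contains_x as shown above ...
setorder(df, rn)            # restore the original order
df[, rn := NULL]            # drop the helper column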
data:
set.seed(0L)
id <- round(rnorm(3000, mean = 5000, 5),0)
date <- seq.Date(as.Date("2018-01-01"), as.Date("2018-12-31"), "day")
date <- sample(date, length(id), replace = TRUE)
indicator <- rbinom(length(id), 1, 0.5)
df <- data.frame(id, date, indicator)
df$id <- as.factor(df$id)

Related

How to impute data from one dataframe to another in r

I have two data frames df1 and df2
df1 <- data.frame(ts = c('2020-01-15', '2020-01-16', '2020-01-17', '2020-01-20', '2020-01-22',
                         '2020-01-24', '2020-01-27', '2020-01-30', '2020-01-31'),
                  lla = c(12, 13, 14, 15, 16, 17, 18, 19, 20),
                  llb = c(1, 2, 3, 4, 6, 5, 9, 8, 7),
                  llc = c(0.6, 1.6, 2.6, 3.6, 4.6, 5.6, 6.6, 7.6, 8.6),
                  lld = c(10, 11, 12, 13, 14, 15, 16, 154, 167))
df2 <- data.frame(ts = c('2020-01-17', '2020-01-24', '2020-01-31'),
                  lla = NA, llb = NA, llc = NA, lld = NA)
If a date in df2$ts matches a date in df1$ts, I want to look back 4 days and impute the max value of every column over that window into df2.
For example:
the 1st value of df2$ts is "2020-01-17", which matches df1$ts. Looking back 4 days means filtering the rows of df1 with ts between 2020-01-13 and 2020-01-17, so we get
# code
df1[(as.Date(df1$ts) >= (as.Date(df2[1, 1]) - 2)) &
    (as.Date(df1$ts) <= (as.Date(df2[1, 1]))), ]
# I am writing this in a loop so that it would iterate over every date of df2
df1 <- data.frame(ts = c('2020-01-15', '2020-01-16', '2020-01-17'),
                  lla = c(12, 13, 14),
                  llb = c(1, 2, 3),
                  llc = c(0.6, 1.6, 2.6),
                  lld = c(10, 11, 12))
So now we have to get the max of every column, which we would achieve with this code:
# would return the maximum of every column
apply(ohlc[(as.Date(ohlc$ts) >= (as.Date(cls[1, 1]) - 2)) &
           (as.Date(ohlc$ts) <= (as.Date(cls[1, 1]))), ], 2, max)
But I don't know how to write this result into df2 for the matching date, i.e. "2020-01-17", and so on for the other dates of df2.
Try:
library(dplyr)
df1 <- df1 %>% mutate(ts = as.Date(ts))
df2 <- df2 %>% mutate(ts = as.Date(ts))
my_function <- function(x, df){
  df %>%
    filter(ts >= (x$ts - 3) & ts <= x$ts) %>%
    summarise(across(.cols = lla:lld, .fns = max)) %>%
    mutate(ts = x$ts)
}
lapply(split(df2, df2$ts), my_function, df = df1) %>% do.call(rbind, .)
Here is an option with a rolling join (roll) after creating a new column shifted 4 days back:
library(data.table)
library(lubridate)
# // convert columns to Date class
df1$ts <- as.Date(df1$ts)
df2$ts <- as.Date(df2$ts)
nm1 <- names(df2)[-1]
# // change the type of the NA columns from logical to numeric
setDT(df2)[, (nm1) := lapply(.SD, as.numeric), .SDcols = nm1]
# // subtract 4 days from ts to create ts1
setDT(df1)[, ts1 := ts %m-% days(4)]
# // do a rolling join while getting the max for each column
df2[df1, (nm1) := lapply(mget(paste0("i.", nm1)), max),
    on = .(ts = ts1), roll = -Inf, by = .EACHI]
Alternate approach using runner:
df1$ts <- as.Date(df1$ts)
df2$ts <- as.Date(df2$ts)
library(runner)
library(dplyr)
df2 %>%
  mutate(across(!ts, ~ max_run(x = df1[[cur_column()]],
                               k = 4,
                               idx = df1$ts,
                               at = cur_data()[[1]])))
#> ts lla llb llc lld
#> 1 2020-01-17 14 3 2.6 12
#> 2 2020-01-24 17 6 5.6 15
#> 3 2020-01-31 20 8 8.6 167
Created on 2021-06-06 by the reprex package (v2.0.0)

Select rows from grouped dataframe based on duplicate values

I have a dataframe with 3 columns: the id of each individual, the group they belong to (gr) and location codes (loc). What I am trying to do is identify which individuals visit 2 locations with the following sequence: Location 1 -> Location 2 -> Location 1.
Dummy dataset:
id <- c(1,1,1,1,1,1,1,2,2,2,2,2,4,4,4,4,4,4,4,4)
gr <- c(1,1,1,1,1,1,1,1,1,1,1,1,1,4,4,4,4,4,4,4)
loc <- c(5,5,4,4,5,5,5,3,3,3,3,2,2,2,2,3,3,2,2,2)
df <- data.frame(id, gr, loc)
I have tried using the diff function to identify differences between the locations:
dif <- diff(as.numeric(df$loc))
But I can't find any way to move forward from there. In addition, this approach doesn't account for the groups of each individual (and the ids repeat between groups). I was thinking of maybe using a lag function but I'm not sure how, or whether it helps at all. Any recommendations? Many thanks in advance, I'm still pretty new to R.
Desired output:
id<- c(1,4)
gr<- c(1,4)
out<- data.frame(cbind(id, gr))
A possible data.table option
unique(
  setDT(df)[
    , q := rleid(loc), .(id, gr)
  ][
    , .SD[uniqueN(q) == 3 & first(loc) == last(loc)], .(id, gr)
  ][
    , .(id, gr)
  ]
)
gives
id gr
1: 1 1
2: 4 4
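For intuition, rleid() assigns one id per consecutive run of loc values, so uniqueN(q) == 3 together with first(loc) == last(loc) captures the Location 1 -> Location 2 -> Location 1 pattern. A quick illustration on the loc values of id 1 from the dummy data:
library(data.table)
rleid(c(5, 5, 4, 4, 5, 5, 5))
# [1] 1 1 2 2 3 3 3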
Maybe this works:
library(dplyr)
library(data.table)
df %>%
  group_by(id) %>%
  filter(n_distinct(rleid(loc)) > 2) %>%
  slice_tail(n = 1) %>%
  select(-loc) %>%
  ungroup()
# A tibble: 2 x 2
# id gr
# <dbl> <dbl>
#1 1 1
#2 4 4

dplyr lag of different group

I am trying to use dplyr to mutate both a column containing the same-group lag of a variable as well as the lag of (one of) the other group(s).
Edit: Sorry, in the first version I messed up the order a bit by rearranging by date at the last second.
This is what my desired result would look like:
Here is a minimal code example:
library(tidyverse)
set.seed(2)
df <- data.frame(
  x = sample(seq(as.Date('2000/01/01'), as.Date('2015/01/01'), by = "day"), 10),
  group = sample(c("A", "B"), 10, replace = T),
  value = sample(1:10, size = 10)
) %>% arrange(x)
df <- df %>%
  group_by(group) %>%
  mutate(own_lag = lag(value))
df %>% data.frame(other_lag = c(NA, 1, 2, 7, 7, 9, 10, 10, 8, 6))
Thank you very much!
A solution with data.table:
library(data.table)
# to create own lag:
setDT(df)[, own_lag := c(NA, head(value, -1)), by = group]
# to create the other-group lag (the function actually works outside of data.table, in base R, see N.B. below):
df[, other_lag := sapply(1:.N, function(ind) {
  gp_cur <- group[ind]
  if (any(group[1:ind] != gp_cur)) tail(value[1:ind][group[1:ind] != gp_cur], 1) else NA
})]
df
# x group value own_lag other_lag
#1: 2001-12-08 B 1 NA NA
#2: 2002-07-09 A 2 NA 1
#3: 2002-10-10 B 7 1 2
#4: 2007-01-04 A 5 2 7
#5: 2008-03-27 A 9 5 7
#6: 2008-08-06 B 10 7 9
#7: 2010-07-15 A 4 9 10
#8: 2012-06-27 A 8 4 10
#9: 2014-02-21 B 6 10 8
#10: 2014-02-24 A 3 8 6
Explanation of the other_lag determination: the idea is, for each observation, to look at the group values of the preceding rows; if any of them differs from the current group, take the last such value, else put NA.
N.B.: other_lag can be created without the need for data.table:
df$other_lag <- with(df, sapply(1:nrow(df), function(ind) {
  gp_cur <- group[ind]
  if (any(group[1:ind] != gp_cur)) tail(value[1:ind][group[1:ind] != gp_cur], 1) else NA
}))
Another data.table approach similar to #Cath's:
library(data.table)
DT = data.table(df)
DT[, vlag := shift(value), by=group]
DT[, volag := .SD[.(chartr("AB", "BA", group), x - 1), on=.(group, x), roll=TRUE, x.value]]
This assumes that A and B are the only groups. If there are more...
DT[, volag := DT[!.BY, on=.(group)][.(.SD$x - 1), on=.(x), roll=TRUE, x.value], by=group]
How it works:
:= creates a new column
DT[, col := ..., by=] does each assignment separately per by= group, essentially as a loop.
The grouping values for the current iteration of the loop are in the named list .BY.
The subset of data used by the current iteration of the loop is the data.table .SD.
x[!i, on=] is an anti-join, looking up rows of i in x and returning x with the matched rows dropped.
x[i, on=, roll=TRUE, x.v] ...
looks up each row of i in x using the on= condition
when no exact on= match is found, it "rolls" to the nearest previous value of the final on= column
it returns v from the x table
For more details and intuition, review the startup messages shown when you type library(data.table).
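As a small self-contained illustration of that lookup-with-roll pattern (toy tables; the names are made up for the example):
library(data.table)
x = data.table(group = c("A", "A", "B"),
               day   = as.IDate(c("2020-01-01", "2020-01-05", "2020-01-03")),
               value = c(10, 20, 30))
i = data.table(group = c("A", "B"),
               day   = as.IDate(c("2020-01-04", "2020-01-10")))
# no exact day match exists, so roll=TRUE falls back to the nearest previous day
x[i, on = .(group, day), roll = TRUE, x.value]
# [1] 10 30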
I am not entirely sure whether I got your question correctly, but if "own" and "other" refer to groups A and B, then this might do the trick. I strongly assume there are more elegant ways to do this:
df.x <- df %>%
  dplyr::group_by(group) %>%
  mutate(value.lag = lag(value)) %>%
  mutate(index = seq_along(group)) %>%
  arrange(group)
df.a <- df.x %>%
  filter(group == "A") %>%
  rename(value.lag.a = value.lag)
df.b <- df.x %>%
  filter(group == "B") %>%
  rename(value.lag.b = value.lag)
df.a.b <- left_join(df.a, df.b[, c("index", "value.lag.b")], by = c("index"))
df.b.a <- left_join(df.b, df.a[, c("index", "value.lag.a")], by = c("index"))
df.x <- bind_rows(df.a.b, df.b.a)
Try this: (Pipe-Only approach)
library(zoo)
df %>%
  mutate(groupLag = lag(group),
         dupLag = group == groupLag) %>%
  group_by(dupLag) %>%
  mutate(valueLagHelp = lag(value)) %>%
  ungroup() %>%
  mutate(helper = ifelse(dupLag == T, NA, valueLagHelp)) %>%
  mutate(helper = case_when(is.na(helper) ~ na.locf(helper, na.rm = F),
                            TRUE ~ helper)) %>%
  mutate(valAfterLag = lag(dupLag)) %>%
  mutate(otherLag = ifelse(is.na(lag(valueLagHelp)), lag(value), helper)) %>%
  mutate(otherLag = ifelse((valAfterLag | is.na(valAfterLag)) & !dupLag,
                           lag(value), otherLag)) %>%
  select(c(x, group, value, own_lag, otherLag))
Sorry for the mess.
What it does is first create a group lag and a helper variable for the case when the group is equal to its lag (i.e. when two "A"s are consecutive). Then it groups by this helper variable and assigns the correct value to all rows where dupLag == FALSE. Now we need to take care of the ones where dupLag == TRUE.
So, ungroup. We need a new lagged-value helper that assigns NA to all dupLag == TRUE rows, because they are not correctly assigned yet.
Next, we fill all NAs in our helper with the last non-NA value.
This is not all, because we still need to take care of some dupLag == FALSE data points (you see that when you look at the complete tibble). First, we basically just fix the second data point with the first mutate(otherLag = ...) operation. The next operation finalizes everything, and then we select the variables we'd like to have in the end.

groupby summarise outside of groupby dplyr

I'm trying to group ids with date in this dataset, but I want to summarise based on one of the features outside of the group.
library(dplyr)
library(lubridate)
set.seed(100)
df <- data.frame(
  ids = sample(c('436247', '2465347', '346654645'), 10000, replace = TRUE),
  date = sample(seq.Date(ymd('2018-03-01'), ymd('2018-05-01'), by = 1), 10000, replace = TRUE))
new_df <- df %>%
  group_by(ids, date) %>%
  summarise(events = length(ids[date >= date - 30 & date <= date]))
I'm trying to take this dataframe and answer the question - "for each of the ids, and each date, how many other records within that id, are within the past 30 days of that date". Unfortunately, when I group_by both the ids and date, it only looks within the grouped date. I've created the solution below, but not sure if there is a better one with dplyr?
groupby_function <- function(df, spec_date){
  result <- df %>%
    group_by(ids) %>%
    summarise(events = length(ids[date >= spec_date - 30 & date <= spec_date])) %>%
    mutate(date = spec_date)
  return(result)
}
date_vector <- seq.Date(ymd('2018-03-01'), ymd('2018-05-01'), by = 1)
list_results <- lapply(date_vector, groupby_function, df = df)
x <- do.call(rbind, list_results)
"for each of the ids, and each date, how many other records within that id, are within the past 30 days of that date"
For that, a "join by" condition makes sense, but isn't yet included in dplyr. Until it is, you could use data.table inside your dplyr chain:
# enumerate id-date combos of interest
grid_df = expand.grid(
  id = unique(df$ids),
  d = seq(min(df$date), max(df$date), by = "day")
)
# helper function
library(data.table)
count_matches = function(DF, targetDF, ...){
  onexpr = substitute(list(...))
  data.table(targetDF)[DF, on = eval(onexpr), .N, by = .EACHI]$N
}
# use a non-equi join to count matching rows
res = grid_df %>%
  mutate(d_dn = d - 30) %>%
  mutate(n = count_matches(., df, ids = id, date >= d_dn, date <= d)) %>%
  as.tibble
# A tibble: 186 x 4
id d d_dn n
<fctr> <date> <date> <int>
1 436247 2018-03-01 2018-01-30 72
2 2465347 2018-03-01 2018-01-30 69
3 346654645 2018-03-01 2018-01-30 51
4 436247 2018-03-02 2018-01-31 123
5 2465347 2018-03-02 2018-01-31 120
6 346654645 2018-03-02 2018-01-31 100
7 436247 2018-03-03 2018-02-01 170
8 2465347 2018-03-03 2018-02-01 166
9 346654645 2018-03-03 2018-02-01 154
10 436247 2018-03-04 2018-02-02 228
# ... with 176 more rows
It should work fine for equality conditions to write either ids = id or ids == id, I think.
If you're interested, the syntax is x[i, on=, j, by=.EACHI] where x and i are tables. For each row of i, we look up rows of x based on the on= criteria (left-hand side refers to columns in x; right-hand to columns in i); then we do j for each ("by each row of i" so by=.EACHI). In this case, j = .N means that we count matched rows of x, returned as a column of counts N.
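A minimal sketch of that syntax with made-up tables (names are illustrative only):
library(data.table)
x = data.table(id = c(1, 1, 2, 2, 2), val = 1:5)
i = data.table(id = c(1, 2))
# for each row of i, count the rows of x that match on id
x[i, on = .(id), .N, by = .EACHI]
#    id N
# 1:  1 2
# 2:  2 3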
You can look at the "ungrouped" data by just going back to the original data frame (calling df$date or df$ids). So I think what you are after is
test_df <- df %>%
  group_by(ids, date) %>%
  summarise(events = length(df$ids[df$date >= date[1] - 30 & df$date <= date[1] & df$ids == ids[1]]))
Also, I ran your proposed function, but I did not see any difference in the result from your original group_by solution, so I don't think that is what you want.
If a 'non dplyr' solution is acceptable, this gives you what you want.
df$diff <- as.vector(
  sapply(unique(df$ids), function(x)
    sapply(df$date[df$ids == x], function(y)
      sum(abs(y - df$date[df$ids == x]) >= 30)
    )
  )
)
Alternatively, in dplyr, you can get a result like the above using:
f <- function(x) {
  sapply(x, function(y) sum(abs(y - x) >= 30))
}
df$diff <- unlist(
  df %>%
    group_by(ids) %>%
    do(diff = f(.$date)) %>%
    .$diff
)
Here's an answer. But it assumes there's a continuous sequence of dates in each id.
df %>%
  group_by(ids, date) %>%
  count() %>%
  arrange(ids, date) %>%
  group_by(ids) %>%
  mutate(
    events = cumsum(n) - cumsum(lag(n, 30, 0))
  )

Apply function to each row of data.frame and preserve column classes

I wonder if there is a way to apply a function to each row of a data.frame such that the column classes are preserved? Let's look at an example to clarify what I mean:
test <- data.frame(startdate = as.Date(c("2010-03-07", "2013-09-13", "2011-11-12")),
                   enddate = as.Date(c("2010-03-23", "2013-12-01", "2012-01-05")),
                   nEvents = c(123, 456, 789))
Suppose I would like to expand the data.frame test by inserting all days between startdate and enddate and distribute the number of events over those days. My first try to do so was this:
eventsPerDay1 <- function(row) {
  n_days <- as.numeric(row$enddate - row$startdate) + 1
  data.frame(date = seq(row$startdate, row$enddate, by = "1 day"),
             nEvents = rmultinom(1, row$nEvents, rep(1/n_days, n_days)))
}
apply(test, 1, eventsPerDay1)
This, however, is not possible because apply calls as.matrix on test and thus it gets converted to a character matrix and all column classes are lost.
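A quick way to see the coercion (just inspecting what apply() hands to the function):
# each row arrives as a character vector, so the Date and numeric classes are gone
apply(test, 1, class)
# [1] "character" "character" "character"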
I already found two workarounds which you can find below, so my question is more of a philosophical nature.
library(magrittr)
############# Workaround 1
eventsPerDay2 <- function(startdate, enddate, nEvents) {
  n_days <- as.numeric(enddate - startdate) + 1
  data.frame(date = seq(startdate, enddate, by = "1 day"),
             nEvents = rmultinom(1, nEvents, rep(1/n_days, n_days)))
}
mapply(eventsPerDay2, test$startdate, test$enddate, test$nEvents, SIMPLIFY = F) %>%
  do.call(rbind, .)
############# Workaround 2
seq_len(nrow(test)) %>%
  lapply(function(i) test[i, ]) %>%
  lapply(eventsPerDay1) %>%
  do.call(rbind, .)
My "problem" with the workarounds is the following:
Workaround 1: It may not be the best reason, but I simply do not like mapply. It has a different signature than the other *apply functions (as the order of arguments differs) and I always feel that a for loop would just have been clearer.
Workaround 2: While being very flexible, I think it is not clear at first sight what is happening.
So does anyone know a function whose call would look like apply(test, 1, eventsPerDay1) and that will work?
Another idea:
library(dplyr)
library(tidyr)
test %>%
  mutate(id = row_number()) %>%
  group_by(startdate) %>%
  complete(startdate = seq.Date(startdate, enddate, 1), nesting(id)) %>%
  group_by(id) %>%
  mutate(nEvents = rmultinom(1, first(nEvents), rep(1/n(), n()))) %>%
  select(startdate, nEvents)
Which gives:
#Source: local data frame [152 x 3]
#Groups: id [3]
#
# id startdate nEvents
# <int> <date> <int>
#1 1 2010-03-07 6
#2 1 2010-03-08 6
#3 1 2010-03-09 6
#4 1 2010-03-10 7
#5 1 2010-03-11 12
#6 1 2010-03-12 5
#7 1 2010-03-13 8
#8 1 2010-03-14 5
#9 1 2010-03-15 5
#10 1 2010-03-16 9
## ... with 142 more rows
We can do this with data.table
library(data.table)
res <- setDT(test)[, n_days := as.numeric(enddate - startdate) + 1
  ][, .(date = seq(startdate, enddate, by = "1 day"),
        nEvents = c(rmultinom(1, nEvents, rep(1/n_days, n_days)))),
    by = 1:nrow(test)][, nrow := NULL]
str(res)
#Classes ‘data.table’ and 'data.frame': 152 obs. of 2 variables:
# $ date : Date, format: "2010-03-07" "2010-03-08" "2010-03-09" "2010-03-10" ...
# $ nEvents: int 5 9 7 11 6 6 10 7 12 3 ...
The above can be wrapped in a function
eventsPerDay <- function(dat){
  as.data.table(dat)[, n_days := as.numeric(enddate - startdate) + 1
    ][, .(date = seq(startdate, enddate, by = "1 day"),
          nEvents = c(rmultinom(1, nEvents, rep(1/n_days, n_days)))), 1:nrow(dat)
    ][, nrow := NULL][]
}
eventsPerDay(test)
I have asked myself the same question.
I either end up splitting the df into a list (the base way)
xy <- data.frame()                         # placeholder: your actual data.frame
xy.list <- split(xy, seq_len(nrow(xy)))    # one list element per row
out <- lapply(xy.list, function(x) ...)    # row-wise function goes here
answer <- unlist(out)
or try the hadleyverse dplyr way using rowwise (the blackbox way)
xy %>%
  rowwise() %>%
  mutate(newcol = f(...))   # f() is a placeholder for the row-wise computation
I agree that there should be a base implementation of apply(xy, 1, function(x) ...) that doesn't coerce to character, but I imagine the R ancients implemented the matrix conversion for an advanced reason my primitive mind can't understand.
