Data:
set.seed(42)
df1 = data.frame(
Date = seq.Date(as.Date("2018-01-01"),as.Date("2018-01-30"),1),
value = sample(1:30),
Y = sample(c("yes", "no"), 30, replace = TRUE)
)
df2 = data.frame(
Date = seq.Date(as.Date("2018-01-01"),as.Date("2018-01-30"),7)
)
For a sum-if, where data falls within a range, this works (from my previous question):
library(data.table)
df1$start <- df1$Date
df1$end <- df1$Date
df2$start <- df2$Date
df2$end <- df2$Date + 6
setDT(df1, key = c("start", "end"))
setDT(df2, key = c("start", "end"))
d = foverlaps(df1, df2)[, list(mySum = sum(value)), by = Date ]
How can I do a count-if? When I try
d = foverlaps(df1, df2)[, list(mySum = count(value)), by = Date ]
I get the error:
no applicable method for 'groups' applied to an object of class "c('double', 'numeric')"
We can use .N (the count() you tried most likely comes from dplyr, which expects a data frame rather than a vector, hence the 'groups' method error):
foverlaps(df1, df2)[, list(myCount = .N), by = Date ]
# Date myCount
# 1: 2018-01-01 7
# 2: 2018-01-08 7
# 3: 2018-01-15 7
# 4: 2018-01-22 7
# 5: 2018-01-29 2
Or, assigning the result:
d = foverlaps(df1, df2)[, .N, by = Date]
If you want to count the number of rows per Date, you can try .N:
foverlaps(df1, df2)[, .(mysum = .N), by = Date ]
Date mysum
1: 2018-01-01 7
2: 2018-01-08 7
3: 2018-01-15 7
4: 2018-01-22 7
5: 2018-01-29 2
If you want the count of unique values per Date, you can try uniqueN():
foverlaps(df1, df2)[, .(mysum = uniqueN(value)), by = Date ]
Date mysum
1: 2018-01-01 7
2: 2018-01-08 7
3: 2018-01-15 7
4: 2018-01-22 7
5: 2018-01-29 2
Both .N and uniqueN() are from {data.table}.
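And if countif means counting only the rows that meet a condition (like Excel's COUNTIF), summing a logical inside j works as well; a minimal sketch using the Y column from the sample data:
foverlaps(df1, df2)[, .(nYes = sum(Y == "yes")), by = Date ]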
Instead of list(mySum = count(value)), try c(mySum = count(value)). The code then runs for me.
d2 <- foverlaps(df1, df2)[, c(mySum = count(value)), by = Date ]
I'm trying to cross join a data.table by three variables (group, id, and date). The R code below accomplishes exactly what I want to do, i.e., each id within each group is expanded to include all of the dates_wanted. But is there a way to do the same thing more efficiently using the excellent data.table package?
library(data.table)
data <- data.table(
group = c(rep("A", 10), rep("B", 10)),
id = c(rep("frank", 5), rep("tony", 5), rep("arthur", 5), rep("edward", 5)),
date = seq(as.IDate("2020-01-01"), as.IDate("2020-01-20"), by = "day")
)
data
dates_wanted <- seq(as.IDate("2020-01-01"), as.IDate("2020-01-31"), by = "day")
names_A <- data[group == "A"][["id"]]
names_B <- data[group == "B"][["id"]]
names_A <- CJ(group = "A", id = names_A, date = dates_wanted, unique = TRUE)
names_B <- CJ(group = "B", id = names_B, date = dates_wanted, unique = TRUE)
alldates <- rbind(names_A, names_B)
alldates
data[alldates, on = .(group, id, date)]
You can also do this (for each (group, id) pair, j returns the full dates_wanted vector, so every pair expands to all 31 dates):
data[, .(date=dates_wanted), .(group,id)]
Output:
group id date
1: A frank 2020-01-01
2: A frank 2020-01-02
3: A frank 2020-01-03
4: A frank 2020-01-04
5: A frank 2020-01-05
---
120: B edward 2020-01-27
121: B edward 2020-01-28
122: B edward 2020-01-29
123: B edward 2020-01-30
124: B edward 2020-01-31
We can use do.call with CJ on the id and date, grouped by group:
out <- data[, do.call(CJ, c(.(id = id, date = dates_wanted),
unique = TRUE)), group]
... checking:
> dim(out)
[1] 124 3
> out0 <- data[alldates, on = .(group, id, date)]
> dim(out0)
[1] 124 3
> all.equal(out, out0)
[1] TRUE
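Since the question is about efficiency, a quick timing harness can compare the three approaches; a sketch, assuming the microbenchmark package (any timing tool would do):
library(microbenchmark)
microbenchmark(
  grouped = data[, .(date = dates_wanted), .(group, id)],
  cj = data[, do.call(CJ, c(.(id = id, date = dates_wanted), unique = TRUE)), group],
  join = data[alldates, on = .(group, id, date)]
)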
Here are some data:
library(data.table)
library(lubridate)
foo <- data.table(
date = seq.Date(from = as_date('2020-01-01'), to = as_date('2020-03-01'), by = '1 month'),
a = rep(1:3, 2),
group = c(rep('a', 3), rep('b', 3))
)
> foo
date a group
1: 2020-01-01 1 a
2: 2020-02-01 2 a
3: 2020-03-01 3 a
4: 2020-01-01 1 b
5: 2020-02-01 2 b
6: 2020-03-01 3 b
The desired output is the following:
date a group diff
1: 2020-01-01 1 a 1
2: 2020-02-01 2 a 1
3: 2020-03-01 3 a 1
4: 2020-04-01 0 a -3
5: 2020-01-01 1 b 1
6: 2020-02-01 2 b 1
7: 2020-03-01 3 b 1
8: 2020-04-01 0 b -3
Here is my own solution.
bar <- foo[foo[, .I[which.max(date)], by = group]$V1]
bar <- bar[a != 0][, c('date', 'a') := .(date %m+% months(1), 0)]
foo <- rbindlist(list(foo, bar))
foo[, diff := a - shift(a, fill = 0), by = group]
foo[order(group, date)]
I wonder if a more compact data.table solution exists, such as a fill option that would let shift() look back at the past from the viewpoint of a future row that does not exist yet.
This is more compact but largely similar; it reproduces the desired output above:
foo[
order(date),
.(date = c(date, date[.N] %m+% months(1)), a = c(a, 0)),
by = group
][ , diff := a - shift(a, fill=0)][]
I guess we could also do things in one query (the appended row's diff is 0 - a[.N], hence the -a[.N]):
foo[
order(date),
.(
date = c(date, date[.N] %m+% months(1)),
a = c(a, 0),
diff = c(a - shift(a, fill=0), -a[.N])
),
by = group
]
Another option that may be more palatable:
foo[
order(date),
{
out <- rbind(
.SD,
data.table(
date = date[.N] %m+% months(1),
a = 0
)
)
out[ , diff := a - shift(a, fill=0)]
out
},
by = group
]
Data:
set.seed(42)
df1 = data.frame(
Date = seq.Date(as.Date("2018-01-01"),as.Date("2018-01-30"),1),
value = sample(1:30),
Y = sample(c("yes", "no"), 30, replace = TRUE)
)
df2 = data.frame(
Date = seq.Date(as.Date("2018-01-01"),as.Date("2018-01-30"),7)
)
For each date in df2$Date, I want to calculate the sum of df1$value if the date in df1$Date falls between df2$Date and df2$Date + 6.
In short, I need to calculate weekly sums.
Using data.table: create start/end ranges, merge on overlaps, then sum by group:
library(data.table)
df1$start <- df1$Date
df1$end <- df1$Date
df2$start <- df2$Date
df2$end <- df2$Date + 6
setDT(df1, key = c("start", "end"))
setDT(df2, key = c("start", "end"))
foverlaps(df1, df2)[, list(mySum = sum(value)), by = Date ]
# Date mySum
# 1: 2018-01-01 138
# 2: 2018-01-08 96
# 3: 2018-01-15 83
# 4: 2018-01-22 109
# 5: 2018-01-29 39
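As a sanity check, the five weekly sums add back up to the grand total: 138 + 96 + 83 + 109 + 39 = 465 = sum(1:30) = sum(df1$value).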
Check out the libraries lubridate and dplyr; those two are quite common.
library(lubridate)
library(dplyr)
df1$last_week_day <- ceiling_date(df1$Date, "week") + 1
df1 %>% group_by(last_week_day) %>% summarize(week_value = sum(value))
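Note that lubridate weeks start on Sunday by default, so the label produced by ceiling_date() may not line up with df2's Mondays. If each week should be keyed by its starting Monday instead, floor_date() with week_start may be closer; a sketch, assuming a lubridate version that has the week_start argument (recent versions do):
df1 %>%
  group_by(week_start_day = floor_date(Date, "week", week_start = 1)) %>%
  summarize(week_value = sum(value))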
We can use fuzzyjoin:
library(dplyr)
library(fuzzyjoin)
df2$EndDate <- df2$Date+6
fuzzy_left_join(
df1, df2,
by = c(
"Date" = "Date",
"Date" = "EndDate"
), match_fun = list(`>=`, `<=`)) %>%
group_by(Date.y) %>% summarise(Sum=sum(value))
# A tibble: 5 x 2
Date.y Sum
<date> <int>
1 2018-01-01 138
2 2018-01-08 96
3 2018-01-15 83
4 2018-01-22 109
5 2018-01-29 39
I am trying to find the max date across rows of a data.table using lapply. I have some rows where all values in the row are NA and in this case I want to return a specific date. I wrote a function to do this but I am not getting the results that I expected.
library(data.table)
my.max = function(x){
if(all(is.na(x))){
return(as.Date("9999-12-01")) #we can use this to identify which BPIDs have no end date
}else{
return(max(x, na.rm = T))
}
}
DT = data.table("Date1" = c(as.Date("2015-12-30"),NA, NA), "Date2" = c(as.Date("2013-02-04"), as.Date("2014-01-01"), NA))
DT[ , "Row" := 1:.N]
DT[ , "Max_Date" := lapply(.SD, my.max), by = .(Row), .SDcols = c("Date1", "Date2")]
This returns
> DT
Date1 Date2 Row Max_Date
1: 2015-12-30 2013-02-04 1 2015-12-30
2: <NA> 2014-01-01 2 9999-12-01
3: <NA> <NA> 3 9999-12-01
So, it does work if all values are NA, but if one of the values is NA it also returns 9999-12-01. I put print functions into my.max to find out what was happening and it looks like it passes in one value of x at a time. This explains why the all(is.na(x)) would be true, but I expected it to pass in a vector of both dates in the row. Otherwise, how would it know what values to take the max of?
How can I change my function so it returns 9999-12-01 only if both of the other dates are NA?
Here is one method that will work. It encapsulates multiple statements in {} to form a single code block:
DT[, "this" := {temp=pmax(Date1, Date2, na.rm=TRUE);
temp[is.na(temp)] = as.Date("9999-12-01"); temp}]
which returns
DT
Date1 Date2 this
1: 2015-12-30 2013-02-04 2015-12-30
2: <NA> 2014-01-01 2014-01-01
3: <NA> <NA> 9999-12-01
data
DT = data.table("Date1" = c(as.Date("2015-12-30"),NA, NA),
"Date2" = c(as.Date("2013-02-04"), as.Date("2014-01-01"), NA))
This way, you don't have to loop through each row, which can be quite slow.
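If there were more date columns, the same idea generalizes with do.call over .SD; a minimal sketch, assuming data.table >= 1.12.4 for fifelse():
DT[, Max_Date := {
  temp <- do.call(pmax, c(.SD, na.rm = TRUE)) # row-wise max across all .SDcols
  fifelse(is.na(temp), as.Date("9999-12-01"), temp)
}, .SDcols = c("Date1", "Date2")]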
While I don't recommend by-row processing, note that lapply(.SD, my.max) applies my.max to each column of .SD separately, which is why your function saw one value at a time. Collapsing each one-row group into a vector with unlist(.SD) instead...
DT[ , "Row" := 1:.N]
DT[ , "Max_Date" := my.max(unlist(.SD)), by = .(Row), .SDcols = c("Date1", "Date2")]
will produce the same output for this example.
Try this out:
library(data.table)
my.max <- function(x){
if(all(is.na(x))){
return("9999-12-01")
}else{
return(max(x, na.rm = T))
}
}
DT <- data.table("Date1" = c(as.Date("2015-12-30"),NA, NA), "Date2" = c(as.Date("2013-02-04"), as.Date("2014-01-01"), NA))
print(DT)
DT[ , "Max_Date" ] <- apply(DT, 1, my.max)
print(DT)
> DT <- data.table("Date1" = c(as.Date("2015-12-30"),NA, NA), "Date2" = c(as.Date("2013-02-04"), as.Date("2014-01-01"), NA))
> print(DT)
Date1 Date2
1: 2015-12-30 2013-02-04
2: <NA> 2014-01-01
3: <NA> <NA>
> DT[ , "Max_Date" ] <- apply(DT, 1, my.max)
> print(DT)
Date1 Date2 Max_Date
1: 2015-12-30 2013-02-04 2015-12-30
2: <NA> 2014-01-01 2014-01-01
3: <NA> <NA> 9999-12-01
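One caveat with this approach: apply() coerces the table to a matrix first, so the Date columns become character strings and max() compares them lexicographically, which happens to order ISO-8601 dates correctly. The resulting Max_Date is therefore character, not Date:
> class(DT$Max_Date)
[1] "character"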
I have a set of data along these lines
d1 <- data.frame(
cat1 = sample(c('a', 'b', 'c'), 100, replace = TRUE),
date = rep(Sys.Date() - sample(1:100)),
val = rnorm(100, 50, 5)
)
require(data.table)
d2 <- data.table(d1)
I can get a daily sum without problem
d2[ , list(.N, sum(val)), by = c("cat1", "date")]
I want to get a sum over 2 days (and then 7 days)
This works:
d.list <- sort(unique(d2$date))
o.list <- list()
for(i in seq_along(d.list)){
o.list[[i]] <- d2[d2$date >= d.list[i] - 1 & d2$date <= d.list[i], list(.N, sum(val), max(date)), by = c("cat1")]
}
do.call(rbind, o.list)
But it slows down on a bigger data set and doesn't seem to be the best use of data.table.
Is there a more efficient way?
This is a bit faster:
First, we join for exact matches and obtain the last index (in case of multiple matches):
setkey(d2, cat1, date)
tmp1 = d2[unique(d2, by=key(d2)), which=TRUE, mult="last", allow.cartesian=TRUE]
Then, we construct a copy of d2 and change date to date-1 by reference. Next, we perform a join with roll = -Inf, which is "next observation carried backward": if there's no exact match, it fills in the next available value.
d3 = copy(d2)[, date := date-1]
setkey(d3, cat1, date)
tmp2 = d2[unique(d3, by=key(d2)), roll=-Inf, which=TRUE, allow.cartesian=TRUE]
From here, we put together the indices:
idx1 = tmp1-tmp2+1L
idx2 = data.table:::vecseq(tmp2, idx1, sum(idx1))
Subset d2 from idx2 and generate unique ids from idx1:
ans1 = d2[idx2][, grp := rep(seq_along(idx1), idx1)]
Finally, aggregate by grp to get the desired result:
ans1 = ans1[, list(cat1=cat1[1L], date=date[.N],
N = .N, val=sum(val)), by=grp][, grp:=NULL]
> head(ans1, 10L)
# cat1 date N val
# 1: a 2014-01-20 1 47.69178
# 2: a 2014-01-25 1 52.01006
# 3: a 2014-02-01 1 46.82132
# 4: a 2014-02-06 1 44.62404
# 5: a 2014-02-11 1 49.63218
# 6: a 2014-02-14 1 48.80676
# 7: a 2014-02-22 1 49.27800
# 8: a 2014-02-23 2 96.17617
# 9: a 2014-02-26 1 49.20623
# 10: a 2014-02-28 1 46.72708
The results are identical to those from your solution. This one took 0.02 seconds on my laptop, whereas yours took 0.58 seconds.
For 7 days, just change:
d3 = copy(d2)[, date := date-1]
to
d3 = copy(d2)[, date := date-6]
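On data.table >= 1.9.8, a non-equi self-join can express the same window without the manual index bookkeeping; a sketch, where win is an illustrative helper table (not part of the original approach), and date - 6 would give the 7-day version:
win <- unique(d2[, .(cat1, start = date - 1, end = date)])
agg <- d2[win, on = .(cat1, date >= start, date <= end),
          .(N = .N, val = sum(val)), by = .EACHI]
ans2 <- cbind(win, agg[, .(N, val)]) # by = .EACHI preserves win's row order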
What you want isn't explained very clearly in the OP, but this seems to be it:
# generate the [date-1,date] sequences for each date
# adjust length.out to suit your needs
dates = d2[, list(date.seq = seq(date, by = -1, length.out = 2)), by = date]
setkey(dates, date.seq)
setkey(d2, date)
# merge and extract info needed
dates[d2][, list(.N, sum(val), date.seq[.N]), by = list(date, cat1)][, !"date"]
# cat1 N V2 V3
# 1: a 1 38.95774 2014-01-21
# 2: a 1 38.95774 2014-01-21
# 3: c 1 55.68445 2014-01-22
# 4: c 2 102.20806 2014-01-23
# 5: c 1 46.52361 2014-01-23
# ---
#164: c 1 50.17986 2014-04-27
#165: b 1 51.43489 2014-04-28
#166: b 2 100.91982 2014-04-29
#167: b 1 49.48493 2014-04-29
#168: c 1 54.93311 2014-04-30
Would it be possible to set up a binned date, and then do by on that?
d2$day7 <- as.integer(d2$date) %/% 7
d2[ , list(.N, sum(val)), by = c("cat1", "day7")]
That would give a binned value. If you want a sliding 7-day window, I'd need to think again. Also, for a binned approach, you might need to subtract an offset before doing the %/% if you want to choose the day of the week the groups start at.
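A sketch of that offset idea, anchoring the 7-day bins to the earliest date in the data:
offset <- as.integer(min(d2$date)) # day 0 of the first bin
d2[, day7 := (as.integer(date) - offset) %/% 7]
d2[ , list(.N, sum(val)), by = c("cat1", "day7")]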