R data.table cross-join by three variables

I'm trying to cross-join a data.table by three variables (group, id, and date). The R code below accomplishes exactly what I want: each id within each group is expanded to include all of the dates_wanted. But is there a way to do the same thing more efficiently using the excellent data.table package?
library(data.table)
data <- data.table(
  group = c(rep("A", 10), rep("B", 10)),
  id = c(rep("frank", 5), rep("tony", 5), rep("arthur", 5), rep("edward", 5)),
  date = seq(as.IDate("2020-01-01"), as.IDate("2020-01-20"), by = "day")
)
data
dates_wanted <- seq(as.IDate("2020-01-01"), as.IDate("2020-01-31"), by = "day")
names_A <- data[group == "A"][["id"]]
names_B <- data[group == "B"][["id"]]
names_A <- CJ(group = "A", id = names_A, date = dates_wanted, unique = TRUE)
names_B <- CJ(group = "B", id = names_B, date = dates_wanted, unique = TRUE)
alldates <- rbind(names_A, names_B)
alldates
data[alldates, on = .(group, id, date)]

You can also do this:
data[, .(date=dates_wanted), .(group,id)]
Output:
group id date
1: A frank 2020-01-01
2: A frank 2020-01-02
3: A frank 2020-01-03
4: A frank 2020-01-04
5: A frank 2020-01-05
---
120: B edward 2020-01-27
121: B edward 2020-01-28
122: B edward 2020-01-29
123: B edward 2020-01-30
124: B edward 2020-01-31
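As a quick sanity check (mine, not part of the original answer), the one-liner matches the join-based result, assuming data, dates_wanted, and alldates from the question are in scope:
# fsetequal() compares the two tables as sets of rows, ignoring row order
out1 <- data[, .(date = dates_wanted), .(group, id)]
out2 <- data[alldates, on = .(group, id, date)]
fsetequal(out1, out2)
# [1] TRUE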

We can use do.call with CJ on the id values and dates_wanted, grouped by group:
out <- data[, do.call(CJ, c(.(id = id, date = dates_wanted), unique = TRUE)), group]
... checking:
> dim(out)
[1] 124 3
> out0 <- data[alldates, on = .(group, id, date)]
> dim(out0)
[1] 124 3
> all.equal(out, out0)
[1] TRUE
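For readers unfamiliar with the do.call idiom: since the argument list here is fixed, the same thing can be written as a direct call (my equivalent sketch, same output):
# CJ() runs once per group; unique = TRUE de-duplicates the repeated ids
data[, CJ(id = id, date = dates_wanted, unique = TRUE), by = group]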

reshape from long to wide, simple data.table issue

I know there must be a one-line data.table solution for this, probably with dcast, but I can't figure it out.
I have data like this:
library(data.table)
data1 <- data.table(
  id = 1:5,
  code = c("A", "A", "B", "A", "B"),
  date = as.Date(c("2021-08-11", "2021-01-05", "2021-02-18", "2021-02-13", "2021-12-13"))
)
data2 <- data.table(
  id = 1:5,
  code = c("B", "B", "A", "B", "A"),
  date = as.Date(c("2021-08-13", "2021-01-05", "2021-02-19", "2021-02-14", "2021-12-13"))
)
data3 <- rbind(data1, data2)
I simply wish to reshape to a wide format like this:
data_want <- data.table(
  id = 1:5,
  code1 = c("A", "A", "B", "A", "B"),
  date1 = c("2021-08-11", "2021-01-05", "2021-02-18", "2021-02-13", "2021-12-13"),
  code2 = c("B", "B", "A", "B", "A"),
  date2 = c("2021-08-13", "2021-01-05", "2021-02-19", "2021-02-14", "2021-12-13")
)
How to do it with dcast?
You could also make use of rowid, as follows:
dcast(data3, id ~ rowid(id), value.var = c("code", "date"))
# id code_1 code_2 date_1 date_2
#1: 1 A B 2021-08-11 2021-08-13
#2: 2 A B 2021-01-05 2021-01-05
#3: 3 B A 2021-02-18 2021-02-19
#4: 4 A B 2021-02-13 2021-02-14
#5: 5 B A 2021-12-13 2021-12-13
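For reference, rowid() (also from data.table) numbers rows within each group of its argument; that is what turns the two stacked occurrences of each id into the _1 and _2 columns. A minimal illustration (my example):
rowid(c(1, 1, 2, 2, 2))
# [1] 1 2 1 2 3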
# load package
library(data.table)
# create batch number
data3[, batch := 1:.N, id]
# long to wide
data4 <- dcast(data3
, id ~ batch
, value.var = c('code', 'date')
); data4
id code_1 code_2 date_1 date_2
1: 1 A B 2021-08-11 2021-08-13
2: 2 A B 2021-01-05 2021-01-05
3: 3 B A 2021-02-18 2021-02-19
4: 4 A B 2021-02-13 2021-02-14
5: 5 B A 2021-12-13 2021-12-13
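For completeness (my sketch, not part of either answer), melt() with patterns() reverses this reshape back to the long format:
# pairs the code_* and date_* columns back into two value columns
melt(data4, id.vars = "id",
     measure.vars = patterns("^code_", "^date_"),
     value.name = c("code", "date"),
     variable.name = "batch")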

Multiple first and last non-NA values by group

I have the following data.table:
require(data.table)
dt = data.table(
  id = c(rep('Grp 1', 31), rep('Grp 2', 31)),
  date = rep(as.IDate(as.IDate('2020-01-01'):as.IDate('2020-01-31')), 2),
  change = c(rep(NA, 5), rep('yes', 5), rep(NA, 10), rep('yes', 3), rep(NA, 8),
             rep(NA, 2), rep('yes', 8), rep(NA, 8), rep('yes', 5), rep(NA, 8))
)
For every group id I'd like to filter the first and last date of each series, where a series is defined by the second column change being 'yes' (i.e., non-NA). I can do the following, which gives me the first and last non-NA row by group. The problem, however, is that a series can occur more than once per group.
dt[ !is.na(change),
.(head(date, 1),
tail(date, 1)),
.(id) ]
These are the row indices I'd like to have filtered:
dt[c(6,10,21,23,34,41,50,54)]
One way is to give a unique group id to each streak, identified by the combination of id and change. We can use rleid to generate such run-length type ids. Consider something like this:
dt[,
gid := rleid(id, change)
][!is.na(change),
as.list(range(date)),
by = .(id, gid)
][,
gid := NULL
]
Note that I also assume you want the range of dates rather than literally the first and last elements; your method would fail if the dates were not in chronological order. The output looks like this:
id V1 V2
1: Grp 1 2020-01-06 2020-01-10
2: Grp 1 2020-01-21 2020-01-23
3: Grp 2 2020-01-03 2020-01-10
4: Grp 2 2020-01-19 2020-01-23
rleid works like this:
> rleid(c(1, 1, 2, 3, 3), c("a", "b", "b", "d", "d"))
[1] 1 2 3 4 4
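As a cosmetic follow-up (my addition, not in the original answer), the default V1/V2 names can be replaced via setnames() if the result is saved to a variable:
res <- dt[, gid := rleid(id, change)
          ][!is.na(change), as.list(range(date)), by = .(id, gid)
          ][, gid := NULL]
setnames(res, c("V1", "V2"), c("first_date", "last_date"))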
Here is an option with dplyr:
library(dplyr)
library(data.table)
dt %>%
group_by(grp = rleid(id, change), id) %>%
filter(!is.na(change)) %>%
summarise(V1 = min(date, na.rm = TRUE),
V2 = max(date, na.rm = TRUE), .groups = 'drop')
Output:
# A tibble: 4 x 4
# grp id V1 V2
# <int> <chr> <date> <date>
#1 2 Grp 1 2020-01-06 2020-01-10
#2 4 Grp 1 2020-01-21 2020-01-23
#3 7 Grp 2 2020-01-03 2020-01-10
#4 9 Grp 2 2020-01-19 2020-01-23

How to do Countifs in R

Data:
set.seed(42)
df1 = data.frame(
  Date = seq.Date(as.Date("2018-01-01"), as.Date("2018-01-30"), 1),
  value = sample(1:30),
  Y = sample(c("yes", "no"), 30, replace = TRUE)
)
df2 = data.frame(
  Date = seq.Date(as.Date("2018-01-01"), as.Date("2018-01-30"), 7)
)
For a sum-if where the data falls within a range, this works (from my previous question):
library(data.table)
df1$start <- df1$Date
df1$end <- df1$Date
df2$start <- df2$Date
df2$end <- df2$Date + 6
setDT(df1, key = c("start", "end"))
setDT(df2, key = c("start", "end"))
d = foverlaps(df1, df2)[, list(mySum = sum(value)), by = Date ]
How can I do a count-if? When I try
d = foverlaps(df1, df2)[, list(mySum = count(value)), by = Date ]
I get the error:
no applicable method for 'groups' applied to an object of class "c('double', 'numeric')"
We can use .N, data.table's built-in count of the number of rows in the current group:
foverlaps(df1, df2)[, list(myCount = .N), by = Date ]
# Date myCount
# 1: 2018-01-01 7
# 2: 2018-01-08 7
# 3: 2018-01-15 7
# 4: 2018-01-22 7
# 5: 2018-01-29 2
Or, keeping the question's assignment form:
d = foverlaps(df1, df2)[, .N, by = Date]
If you want to count the number of rows per Date, you can try .N:
foverlaps(df1, df2)[, .(mysum = .N), by = Date ]
Date mysum
1: 2018-01-01 7
2: 2018-01-08 7
3: 2018-01-15 7
4: 2018-01-22 7
5: 2018-01-29 2
If you want the count of unique values per Date, you can try uniqueN():
foverlaps(df1, df2)[, .(mysum = uniqueN(value)), by = Date ]
Date mysum
1: 2018-01-01 7
2: 2018-01-08 7
3: 2018-01-15 7
4: 2018-01-22 7
5: 2018-01-29 2
Both .N and uniqueN() are from {data.table}.
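The difference in a nutshell (my toy example): .N counts rows in the group, while uniqueN() counts distinct values:
x <- c(5, 5, 7)
length(x)   # 3 -- what .N reports for a group with these rows
uniqueN(x)  # 2 -- distinct values only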
Instead of list(mySum = count(value)), try c(mySum = count(value)). The code then runs for me:
d2 <- foverlaps(df1, df2)[, c(mySum = count(value)), by = Date ]

lapply to find max value across rows not working as expected

I am trying to find the max date across rows of a data.table using lapply. I have some rows where all values in the row are NA and in this case I want to return a specific date. I wrote a function to do this but I am not getting the results that I expected.
library(data.table)
my.max = function(x) {
  if (all(is.na(x))) {
    return(as.Date("9999-12-01")) # we can use this to identify which BPIDs have no end date
  } else {
    return(max(x, na.rm = TRUE))
  }
}
DT = data.table("Date1" = c(as.Date("2015-12-30"), NA, NA),
                "Date2" = c(as.Date("2013-02-04"), as.Date("2014-01-01"), NA))
DT[ , "Row" := 1:.N]
DT[ , "Max_Date" := lapply(.SD, my.max), by = .(Row), .SDcols = c("Date1", "Date2")]
This returns
> DT
Date1 Date2 Row Max_Date
1: 2015-12-30 2013-02-04 1 2015-12-30
2: <NA> 2014-01-01 2 9999-12-01
3: <NA> <NA> 3 9999-12-01
So, it does work if all values are NA, but if one of the values is NA it also returns 9999-12-01. I put print functions into my.max to find out what was happening and it looks like it passes in one value of x at a time. This explains why the all(is.na(x)) would be true, but I expected it to pass in a vector of both dates in the row. Otherwise, how would it know what values to take the max of?
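A small probe (mine, not in the original question) confirms this: with by = Row, each column of .SD is a length-1 vector, so my.max() only ever sees single values:
# every Date1/Date2 entry below is 1 -- my.max() runs once per column, per row
DT[, lapply(.SD, length), by = .(Row), .SDcols = c("Date1", "Date2")]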
How can I change my function so it returns 9999-12-01 only if both of the other dates are NA?
Here is one method that will work. It encapsulates multiple statements in {} to form a single code block:
DT[, "this" := {temp=pmax(Date1, Date2, na.rm=TRUE);
temp[is.na(temp)] = as.Date("9999-12-01"); temp}]
which returns
DT
Date1 Date2 this
1: 2015-12-30 2013-02-04 2015-12-30
2: <NA> 2014-01-01 2014-01-01
3: <NA> <NA> 9999-12-01
data
DT = data.table("Date1" = c(as.Date("2015-12-30"),NA, NA),
"Date2" = c(as.Date("2013-02-04"), as.Date("2014-01-01"), NA))
This way, you don't have to loop through each row, which can be quite slow.
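An equivalent one-liner (my sketch; assumes data.table >= 1.12.4 for fcoalesce()):
# pmax() yields NA when both dates are NA; fcoalesce() then substitutes the sentinel
DT[, Max_Date := fcoalesce(pmax(Date1, Date2, na.rm = TRUE), as.Date("9999-12-01"))]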
While I don't recommend by-row processing...
DT[ , "Row" := 1:.N]
DT[ , "Max_Date" := my.max(unlist(.SD)), by = .(Row), .SDcols = c("Date1", "Date2")]
will produce the same output for this example.
Try this out:
library(data.table)
my.max <- function(x) {
  if (all(is.na(x))) {
    return("9999-12-01")
  } else {
    return(max(x, na.rm = TRUE))
  }
}
DT <- data.table("Date1" = c(as.Date("2015-12-30"),NA, NA), "Date2" = c(as.Date("2013-02-04"), as.Date("2014-01-01"), NA))
print(DT)
DT[ , "Max_Date" ] <- apply(DT, 1, my.max)
print(DT)
> DT <- data.table("Date1" = c(as.Date("2015-12-30"),NA, NA), "Date2" = c(as.Date("2013-02-04"), as.Date("2014-01-01"), NA))
> print(DT)
Date1 Date2
1: 2015-12-30 2013-02-04
2: <NA> 2014-01-01
3: <NA> <NA>
> DT[ , "Max_Date" ] <- apply(DT, 1, my.max)
> print(DT)
Date1 Date2 Max_Date
1: 2015-12-30 2013-02-04 2015-12-30
2: <NA> 2014-01-01 2014-01-01
3: <NA> <NA> 9999-12-01
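One caveat worth knowing (my note, not the answerer's): apply() coerces the data.table to a character matrix first, so max() compares strings rather than Date objects. That still gives the right answer here, because ISO "YYYY-MM-DD" strings sort in chronological order:
class(apply(DT[, .(Date1, Date2)], 1, my.max))
# [1] "character"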

Reordering each row of a data.table

I am searching for a way to reorder each row of a data.table into alphabetical order in an efficient way. Assume that every column carries the same kind of information and the values are comparable. The example below will make this clearer:
test <- data.table(A = c("A", "b", "c"),
                   B = c(1, "a", "d"),
                   C = c("F", 0, 1))
Expected result:
result <- data.table(t(apply(test,1, sort)))
names(result) <- colnames(test)
In this solution I have to loop through all the rows; can this be prevented?
For 2 columns I found an efficient way to solve this problem:
result <- data.table(A = pmin(test$A, test$B), B = pmax(test$A, test$B))
But this solution does not work well for more than 2 columns.
EDIT:
Let's add a benchmark of the different solutions on two columns:
test <- data.table(A = sample(c("A", "B", "C", "D"), 1000000, replace = TRUE),
                   B = sample(c("A", "B", "C", "D"), 1000000, replace = TRUE))
OptionOne <- function(test) {
  result <- data.table(A = pmin(test$A, test$B), B = pmax(test$A, test$B))
}
OptionTwo <- function(test) {
  test[, names(test) := as.list(sort(unlist(.SD))), 1:nrow(test)][]
}
OptionThree <- function(test) {
  test[, id := .I]
  test <- melt(test, id.vars = "id")
  setorder(test, id, value)
  test[, variable1 := seq_len(.N), by = id]
  dcast(test, id ~ variable1, value.var = "value")
}
system.time(OptionOne(test))
#user system elapsed
#0.13 0.00 0.12
system.time(OptionTwo(test))
# user system elapsed
# 17.58 0.00 18.27
system.time(OptionThree(test))
#user system elapsed
# 0.23 0.00 0.24
It seems that for two columns pmin/pmax is the most efficient approach, while for more columns the reshape does a good job.
Your data.table is conceptually in the wrong shape: sorting over rows (i.e., over variables) does not usually make sense. Thus, to do this efficiently, you need to reshape:
library(data.table)
test <- data.table(A = c("A", "b", "c"),
                   B = c(1, "a", "d"),
                   C = c("F", 0, 1))
test[, id := .I]
test <- melt(test, id.vars = "id")
setorder(test, id, value)
# id variable value
#1: 1 B 1
#2: 1 A A
#3: 1 C F
#4: 2 C 0
#5: 2 B a
#6: 2 A b
#7: 3 C 1
#8: 3 A c
#9: 3 B d
If you must, you can then reshape again, though I would not recommend that.
test[, variable1 := seq_len(.N), by = id]
dcast(test, id ~ variable1, value.var = "value")
# id 1 2 3
#1: 1 1 A F
#2: 2 0 a b
#3: 3 1 c d
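If the original column names are wanted back (my follow-up, assuming three columns as in test):
res <- dcast(test, id ~ variable1, value.var = "value")
setnames(res, old = c("1", "2", "3"), new = c("A", "B", "C"))
res[, id := NULL][]  # drop the helper id and print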
We can try:
test[, names(test) := as.list(sort(unlist(.SD))), 1:nrow(test)][]
