Subset data.table columns independently

I'm starting with the table dt below and trying to subset its rows by the list keys:
library(data.table)
set.seed(123)
randomchar <- function(n, w){
  chararray <- replicate(w, sample(c(letters, LETTERS), n, replace = TRUE))
  apply(chararray, 1, paste0, collapse = "")
}
dt <- data.table(x = randomchar(1000, 3),
                 y = randomchar(1000, 3),
                 z = randomchar(1000, 3),
                 key = c("x", "y", "z"))
keys <- with(dt, list(x = sample(x, 501),
                      y = sample(y, 500),
                      z = sample(z, 721)))
I can get the result I want by using a loop:
desired <- copy(dt)
for(i in seq_along(keys)){
  keyname <- names(keys)[i]
  desired <- desired[get(keyname) %in% keys[[i]]]
}
desired
The question is: is there a more data.table-idiomatic way to do this subset?
I tried using CJ: dt[CJ(keys)], but it takes a very long time.

What about building a logical mask and filtering dt on it:
dt[Reduce(`&`, Map(function(key, col) col %in% key, keys, dt)),]
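Since the set of columns is fixed here, the same mask can also be spelled out directly; a minimal sketch of the idea, equivalent to the Reduce/Map one-liner above:
dt[x %in% keys$x & y %in% keys$y & z %in% keys$z]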


How to deal with data.table with list column and NULL values

I have a fairly large data.table that comes from an SQL table.
Columns that contain missing values in SQL come through as NULLs in the data.table, so those columns are actually lists holding both values and NULLs.
I would like an efficient way to replace the NULLs with NAs and then convert each list column into a regular atomic column.
Here is an example that reproduces my case:
library(data.table)
n = 10^6
l1 = as.list(rnorm(n, 10, 25))
l2 = as.list(rnorm(n, 0, 200))
l3 = as.list(rnorm(n))
df = data.table(a = runif(n),
                b = l1,
                c = l2,
                d = rnorm(n, 88, 0.5),
                e = l3)
# create an index vector to set to NULL
id1 = sample(1:n, 0.26*n)
id2 = sample(1:n, 0.60*n)
id3 = sample(1:n, 0.09*n)
# set to NULL
df$b[id1] = list(NULL)
df$c[id2] = list(NULL)
df$e[id3] = list(NULL)
This is what I have done, but it takes a bit too long:
type = data.frame(type = sapply(df, class))
col = names(df)[which(type$type == "list")]
# ----------- FIRST WAY -----------------------------------------------------------------------
system.time(
  df[, (col) := lapply(.SD, function(i) unlist(lapply(i, function(x) ifelse(is.null(x), NA, x)))), .SDcols = col]
)
# ----------- SECOND WAY (a little bit faster) ---------------------------
system.time(
  for (i in col) {
    df[, eval(i) := unlist(lapply(get(i), function(x) ifelse(is.null(x), NA, x)))]
  }
)
Why is the first solution slower than the second? Does anybody have a better way?
We may use set here; it avoids the repeated overhead of [.data.table and the element-by-element ifelse:
library(data.table)
df1 <- copy(df)
system.time({
  for(nm in col) {
    i1 <- which(lengths(df1[[nm]]) == 0)
    set(df1, i = i1, j = nm, value = list(NA))
    df1[[nm]] <- unlist(df1[[nm]])
  }
})
#   user  system elapsed
#  0.158   0.004   0.161
Compared with OP's second method
system.time(
  for (i in col) {
    df[, eval(i) := unlist(lapply(get(i), function(x) ifelse(is.null(x), NA, x)))]
  }
)
#   user  system elapsed
#  5.618   0.157   5.756
Checking the output:
> all.equal(df, df1)
[1] TRUE
One solution based on the lengths() function:
cols = which(sapply(df, is.list))
df[, (cols) := lapply(.SD, \(x) {x[lengths(x)==0L] = NA; as.numeric(x)}), .SDcols=cols]
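Both answers exploit the fact that a NULL element of a list has length zero, so lengths() can locate all the missing entries in one vectorised pass instead of inspecting each element. A minimal illustration of the idea:
x <- list(1.5, NULL, 2.5)
lengths(x)               # 1 0 1
x[lengths(x) == 0] <- NA
unlist(x)                # 1.5  NA 2.5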

R fast cosine distance between consecutive rows of a data.table

How can I efficiently calculate distances between (almost) consecutive rows of a large-ish (~4m row) data.table? I've outlined my current approach below, but it is very slow. My actual data has up to a few hundred columns. I need to calculate lags and leads for future use, so I create those first and use them to calculate distances.
library(data.table)
library(proxy)
set_shift_col <- function(df, shift_dir, shift_num, data_cols, byvars = NULL){
  df[, (paste0(data_cols, "_", shift_dir, shift_num)) := shift(.SD, shift_num, fill = NA, type = shift_dir),
     byvars, .SDcols = data_cols]
}
set_shift_dist <- function(dt, shift_dir, shift_num, data_cols){
  stopifnot(shift_dir %in% c("lag", "lead"))
  shift_str <- paste0(shift_dir, shift_num)
  dt[, (paste0("dist", "_", shift_str)) := as.numeric(
    proxy::dist(
      rbindlist(list(
        .SD[, data_cols, with = FALSE],
        .SD[, paste0(data_cols, "_", shift_str), with = FALSE]
      ), use.names = FALSE),
      method = "cosine")
  ), 1:nrow(dt)]
}
n <- 10000
test_data <- data.table(a = rnorm(n), b = rnorm(n), c = rnorm(n), d = rnorm(n))
cols <- c("a", "b", "c", "d")
set_shift_col(test_data, "lag", 1, cols)
set_shift_col(test_data, "lag", 2, cols)
set_shift_col(test_data, "lead", 1, cols)
set_shift_col(test_data, "lead", 2, cols)
set_shift_dist(test_data, "lag", 1, cols)
I'm sure this is a very inefficient approach; any suggestions would be appreciated!
You aren't using the vectorised efficiency of proxy::dist: rather than calling it once for each row, you can get all the distances you need from a single call.
Try this replacement function and compare the speed:
set_shift_dist2 <- function(dt, shift_dir, shift_num, data_cols){
  stopifnot(shift_dir %in% c("lag", "lead"))
  shift_str <- paste0(shift_dir, shift_num)
  dt[, (paste0("dist2", "_", shift_str)) := proxy::dist(
    .SD[, data_cols, with = FALSE],
    .SD[, paste0(data_cols, "_", shift_str), with = FALSE],
    method = "cosine",
    pairwise = TRUE
  )]
}
You could also do it in one go, without storing shifted copies of the data in the table:
test_data[, dist_lag1 := proxy::dist(
  .SD,
  as.data.table(shift(.SD, 1)),
  pairwise = TRUE,
  method = 'cosine'
), .SDcols = c('a', 'b', 'c', 'd')]
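If you want to drop the proxy dependency entirely, pairwise cosine distance is just row-wise arithmetic on the data matrix and its shifted copy. A sketch of that idea (not from the original answers; proxy registers cosine as a similarity and reports 1 - similarity as the distance, so the results should agree up to floating-point error), using the same test_data and cols:
A <- as.matrix(test_data[, ..cols])
B <- rbind(NA, A[-nrow(A), ])  # lag-1 rows of A; first row has no predecessor
test_data[, dist_lag1_manual :=
  1 - rowSums(A * B) / (sqrt(rowSums(A^2)) * sqrt(rowSums(B^2)))]
Comparing dist_lag1_manual against the dist_lag1 column created above is a handy correctness check.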

data.table, apply function to portion of a table

I want to apply a function to a portion of a table.
With a data.frame, there's no problem:
df <- data.frame(name = paste("a", 1:10, sep = "-"),
                 x = 1:10,
                 y = rep(1:5),
                 z = rep(1:2, each = 5))
df[2:5, -1] <- scale(df[2:5, -1], center = c(1,2,3), scale = c(4,5,6))
But data.table complains:
dt <- data.table(name = paste("a", 1:10, sep = "-"),
                 x = 1:10,
                 y = rep(1:5),
                 z = rep(1:2, each = 5))
dt[2:5, -1] <- scale(dt[2:5, -1], center = c(1,2,3), scale = c(4,5,6))
Error in `[<-.data.table`(`*tmp*`, 2:5, -1, value = c(0.25, 0.5, 0.75, :
  Item 1 of column numbers in j is -1 which is outside range [1,ncol=4]. Use column names
  instead in j to add new columns.
What is the correct way in data.table? Thanks!
data.table needs a bit more work to apply scale:
library(data.table)
cols <- names(dt)[-1]
# scale() returns numeric values, so convert the integer columns first
dt[, (cols) := lapply(.SD, as.numeric), .SDcols = cols]
# then scale rows 2:5 of each column with its own center and scale value
dt[2:5, (cols) := Map(scale, .SD, c(1,2,3), c(4,5,6)), .SDcols = cols]
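As a quick sanity check (assuming df and dt above were built from the same values, which are deterministic here), the data.table result should match the data.frame one:
all.equal(as.data.frame(dt)[-1], df[-1], check.attributes = FALSE)
# expected: TRUE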

split join data.table R

Objective
Join DT1 (as i in data.table syntax) to DT2 on key column(s), within each group of DT2 defined by the Date column.
I cannot simply run DT2[DT1, on = 'key'], as that would be incorrect: the key column repeats across dates and is only unique within a single date.
Reproducible example with a working solution
DT3 is my expected output. Is there any way to achieve this without the split manoeuvre, which does not feel very data.table-y?
library(data.table)
set.seed(1)
DT1 <- data.table(
  Segment = sample(paste0('S', 1:10), 100, TRUE),
  Activity = sample(paste0('A', 1:5), 100, TRUE),
  Value = runif(100)
)
dates <- seq(as.Date('2018-01-01'), as.Date('2018-11-30'), by = '1 day')
DT2 <- data.table(
  Date = rep(dates, each = 5),
  Segment = sample(paste0('S', 1:10), 3340, TRUE),
  Total = runif(3340, 1, 2)
)
rm(dates)
# To ensure that each Date Segment combination is unique
DT2 <- unique(DT2, by = c('Date', 'Segment'))
iDT2 <- split(DT2, by = 'Date')
iDT2 <- lapply(
  iDT2,
  function(x) {
    x[DT1, on = 'Segment', nomatch = 0]
  }
)
DT3 <- rbindlist(iDT2, use.names = TRUE)
You can achieve the same result with a cartesian merge:
DT4 <- merge(DT2, DT1, by = 'Segment', allow.cartesian = TRUE)
Here is the proof:
> all(DT3[order(Segment, Date, Total, Activity, Value),
          c('Segment','Date','Total','Activity','Value')] ==
      DT4[order(Segment, Date, Total, Activity, Value),
          c('Segment','Date','Total','Activity','Value')])
[1] TRUE
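The same rows can also be obtained with data.table's own join syntax instead of merge(); a sketch, where nomatch = 0 drops Segments with no match (as in the split version) and the result agrees up to row and column order:
DT5 <- DT2[DT1, on = 'Segment', allow.cartesian = TRUE, nomatch = 0]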

randomize observations by groups (blocks)

I have a data frame with I observations, and each observation belongs to one of g categories.
set.seed(9782)
I <- 500
g <- 10
library(dplyr)
anon_id <- function(n = 1, length = 12) {
  randomString <- c(1:n)
  for (i in 1:n) {
    randomString[i] <- paste(sample(c(0:9, letters, LETTERS),
                                    length, replace = TRUE),
                             collapse = "")
  }
  return(randomString)
}
df <- data.frame(id = anon_id(n = I, length = 16),
                 group = sample(1:g, I, T))
I want to randomly assign each observation to one of J "urns", given some vector of probabilities p; that is, the probability of being assigned to urn 1 is p[1]. The added complexity is that I want to do this block by block.
If I ignore the blocks, I can do this easily:
J <- 3
p <- c(0.25, 0.5, 0.25)
df1 <- df %>% mutate(urn = sample(x = c(1:J), size = I, replace = T, prob = p))
I thought of this method to do it by "block":
# Block randomization
randomize_block <- function(g) {
  df1 <- df %>% filter(group == g)
  size <- nrow(df1)
  df1 <- df1 %>% mutate(urn = sample(x = c(1:J),
                                     size = size,
                                     replace = T,
                                     prob = p))
  return(df1)
}
df2 <- lapply(1:g, randomize_block)
df2 <- data.table::rbindlist(df2)
Is there a better way?
Not sure if this is better, but here is a base R technique for a data.frame df that has the group column "group", urn labels 1:J, and assignment probabilities in a vector p of length J.
# get urn assignments per group
urnAssignment <- lapply(unique(df$group),
                        function(i) sample(1:J, sum(df$group == i), replace = TRUE, prob = p))
# get a list that collects the row positions of each group's observations
obsOrder <- lapply(unique(df$group),
                   function(i) which(df$group == i))
df$urnAssignment[unlist(obsOrder)] <- unlist(urnAssignment)
randomizr::block_ra does exactly what you want.
library(randomizr)
library(janitor) # just for the tabyl function
block_rand <- tibble(value = randomizr::block_ra(blocks = df$group,
                                                 conditions = c("urn_1", "urn_2", "urn_3")))
df2 <- as_tibble(bind_cols(df, block_rand))
df2 %>% janitor::tabyl(group, value)
This does the trick using dplyr:
randomize <- function(data, groups = 2, block_id = NULL, p = NULL, seed = 9782) {
  set.seed(seed)  # use the seed argument
  if (is.null(p)) p <- rep(1/groups, groups)
  if (is.null(block_id)) {
    df1 <- data %>%
      mutate(Treatment = sample(x = c(1:groups),
                                size = n(),
                                replace = T,
                                prob = p))
    return(df1)
  } else {
    df1 <- data %>%
      group_by(.data[[block_id]]) %>%  # group_by_() is deprecated
      mutate(Treatment = sample(x = c(1:groups),
                                size = n(),
                                replace = T,
                                prob = p))
    return(df1)
  }
}
df1 <- randomize(data = df, groups = J, block_id = "group", p = p, seed = 9782)
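For completeness, the same block randomization is a one-liner in data.table (a sketch, not from the original thread; it draws urns within each group just like randomize_block, using the df, J, and p defined above):
library(data.table)
dt <- as.data.table(df)
dt[, urn := sample(J, .N, replace = TRUE, prob = p), by = group]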
