Issue with split and data.table - r

I have a data.table that I want to split into a list and then modify. I'm discovering some weird behavior when I try to delete a column on one of the data.tables in the list after calling split. Here's a MWE (that throws an error and causes my R session to crash):
library(data.table)
d = data.table(level = c(1, 1, 2, 2), value = 1:4)
list = split(d, f = d$level)
list[[1]][, level := NULL]
list
I get:
Error in .shallow(x, cols = cols, retain.key = TRUE) : Internal error: length(names)>0 but <length(dt)

I recommend to use l name for a variable instead of list.
This seems to be a bug caused by split.data.frame method utilized in the process.
I've quite recently proposed a new split.data.table method defined below. It seems to address your problem.
Update 2016-03-30:
split.data.table has been implemented in data.table 1.9.7. Now use can simply use:
library(data.table)
d = data.table(level = c(1, 1, 2, 2), value = 1:4)
l = split(d, by = "level")
l[[1L]][, level := NULL]
l
#$`1`
# value
#1: 1
#2: 2
#
#$`2`
# level value
#1: 2 3
#2: 2 4
The old answer below, it may be useful if you stuck with 1.9.6 or below. Be aware that it won't handle factor levels the same way as split.data.frame, this isn't the case for method developed in data.table 1.9.7 which is consistent to data.frame method.
library(data.table)
split.data.table = function(x, f, drop = FALSE, by, flatten = FALSE, ...){
if(missing(by) && !missing(f)) by = f
stopifnot(!missing(by), is.character(by), is.logical(drop), is.logical(flatten), !".ll" %in% names(x), by %in% names(x))
if(!flatten){
.by = by[1L]
tmp = x[, list(.ll=list(.SD)), by = .by, .SDcols = if(drop) setdiff(names(x), .by) else names(x)]
setattr(ll <- tmp$.ll, "names", tmp[[.by]])
if(length(by) > 1L) return(lapply(ll, split.data.table, drop = drop, by = by[-1L])) else return(ll)
} else {
tmp = x[, list(.ll=list(.SD)), by=by, .SDcols = if(drop) setdiff(names(x), by) else names(x)]
setattr(ll <- tmp$.ll, 'names', tmp[, .(nm = paste(.SD, collapse = ".")), by = by, .SDcols = by]$nm)
return(ll)
}
}
d = data.table(level = c(1, 1, 2, 2), value = 1:4)
l = split.data.table(d, by = "level")
# below setattr to be addressed in split.data.table
invisible(lapply(l, setattr, ".data.table.locked", NULL))
l[[1]][, level := NULL]
l
#$`1`
# value
#1: 1
#2: 2
#
#$`2`
# level value
#1: 2 3
#2: 2 4
I've also filled a bug report describing your case, you can find it in data.table#1481.

Related

How to avoid storing objects while performing large matrix and vector operations in R

As given in the code below, we are performing matrix operations over three variables V1, V2 , and V3 in dataframe DF5. While the end result we need is matV1, matV2, and matV3 for V1, V2 , and V3, respectively, we are storing a lot of objects while using the code. This process takes a lot of time when DF5 consists of a million row of data. How can we perform the following operation while storing least number of objects?
install.packages("data.table")
library(data.table)
# STEP 1
#V1
setDT(DF5)
numvec <- max(DF5[,k])
dlV1 <- lapply(1:numvec, function(i) DF5[k == i, sort(V1)])
dmatV1 <- CJ(x=1:numvec, y=1:numvec)[, .(z = sum(findInterval(dlV1[[y]],dlV1[[x]]))), .(x,y)]
matV1 <- as.matrix(dcast(dmatV1, x~y, value.var = 'z')[, -'x'])
diag(matV1) <- 0
#V2
dlV2 <- lapply(1:numvec, function(i) DF5[k == i, sort(V2)])
dmatV2 <- CJ(x=1:numvec, y=1:numvec)[, .(z = sum(findInterval(dlV2[[y]],dlV2[[x]]))), .(x,y)]
matV2 <- as.matrix(dcast(dmatV2, x~y, value.var = 'z')[, -'x'])
diag(matV2) <- 0
#V3
dlV3 <- lapply(1:numvec, function(i) DF5[k == i, sort(V3)])
dmatV3 <- CJ(x=1:numvec, y=1:numvec)[, .(z = sum(findInterval(dlV3[[y]],dlV3[[x]]))), .(x,y)]
matV3 <- as.matrix(dcast(dmatV3, x~y, value.var = 'z')[, -'x'])
diag(matV3) <- 0
#STEP 2: Divide
matV12<-sweep(matV1, 2, 10, FUN = '/')
matV22<-sweep(matV2, 2, 10, FUN = '/')
matV32<-sweep(matV3, 2, 10, FUN = '/')
Lists <- list(matV12, matV22, matV32)
E<-Reduce("*", Lists)
#STEP 3
RESULT<-data.frame(rowSums(E))

metaprogramming map on data.table list-columns

I cannot map over a nested column using data.table.
I made it an example.
library(data.table)
library(purrr)
DT <- setDT(list(
gp = c("A", "B"),
data = list(
setDT(list(d1 = c(1, 2, 3), d2 = c(2, 2, 4), d3 = c(0.2, 0.2, 0.4))),
setDT(list(d1 = c(10, 20, 30), d2 = c(20, 20, 40), d3 = c(0.2, 0.2, 0.4)))
),
metric = c("max", "min")
))
choose_a and choose_b are two of the n columns nested.
calc_name is the name of the calculated new column, that has been opereted by
the calc_metric_mean function
calc_metric_mean <- function(a, b, metric){
if(metric == "max"){
return(mean(c(max(a), max(b))))
}
if(metric == "min"){
return(mean(c(min(a), min(b))))
}
if(metric == "q74"){
return(mean(c(quantile(a, 74), quantile(b, 74))))
}
}
choose_a <- c("d1", "d2", "d2")
choose_b <- c("d3", "d1", "d2")
calc_name <- paste(choose_a, choose_b, sep = '')
metric <- "max"
for(i in 1:length(calc_name)){
DT[, calc_name[[i]] := map_dbl(
.x = data,
~calc_metric_mean(
a = choose_a[[i]],
b = choose_b[[i]],
metric = "max"
)
)]
}
The result would be
gp data d1d3 d2d1 d2d2
1: A <data.table[3x3]> 1.7 3.5 4
2: B <data.table[3x3]> 15.2 35.0 40
ADDED 2021-03-18
Second quiz: How about if you have the parameter "metric" in a column, outside the nested data?
The result would be
gp data metric d1d3 d2d1 d2d2
1: A <data.table[3x3]> max 1.7 3.5 4
2: B <data.table[3x3]> min 5.1 15 20
Sorry, if I haven't understood the question correctly, but if you're trying to produce the desired output using DT, using a for() loop with set() is an option:
for(i in 1:length(calc_name)){
set(DT, NULL, j = calc_name[i],
value = lapply(DT$data, function(x){
calc_metric_mean(a = x[[choose_a[i]]], b = x[[choose_b[i]]], metric = "max")
}
)
)
}
DT
This approach is in someways a nested for-loop, which isn't the most elegant, but it gets the job done and looping with set() can still be quite fast since it's updating by reference. One note is that this approach takes advantage of the fact that a data.table is a list with x[[choose_a[i]].
To get my code to work, I had to make two small changes to your example set up. First, because you created DT with structure, you need setDT(DT) to use set(). Second, I edited calc_metric_mean() to be more explicit about what it returns. Otherwise, it returned NULL for me:
calc_metric_mean <- function(a, b, metric){
if(metric == "max"){
return(mean(c(max(a), max(b))))
}
if(metric == "min"){
return(mean(c(min(a), min(b))))
}
if(metric == "q74"){
return(mean(c(quantile(a, 74), quantile(b, 74))))
}
}
There's another answer thanks to wonderful #diaggy 's answer.
for(i in 1:length(calc_name)){
DT[, calc_name[i] := lapply(DT$data, function(x){
calc_metric_mean(a = x[[choose_a[i]]], b = x[[choose_b[i]]], metric = "max")
})][]
}
This leads to the desired result too.
> DT
gp data d1d3 d2d1 d2d2
1: A <data.table[3x3]> 1.7 3.5 4
2: B <data.table[3x3]> 15.2 35 40
There're some comments to do:
The final empty [] is neccesary to list off the := result in the data.table (see 2.23 in faqs).
The double call x[[ is neccesary to assess the inner columns in a list-column. For some reason, x[, choose_a[i]] returns the character choose_a[i] and this won't work.
In the comparison, it is better #diaggy 's solution:
expr min lq mean median uq max neval
eval(diaggys_set) 3.589102 3.849702 4.487934 4.054001 4.516901 10.4261 100
eval(direct) 4.749001 5.127901 5.844534 5.386051 5.985651 12.9724 100
First Variation: using variables from the nested objetive
lapply is enough. See the #diaggy's Answer.
Second Variation: using variables from and Outside the nested objetive
If you have to load a parameter from other column, it is neccesary pass from lapply, to mapply.
for(i in 1:length(calc_name)){
set(DT, NULL, j = calc_name[i],
value = mapply(function(x, m){
calc_metric_mean(a = x[[choose_a[i]]], b = x[[choose_b[i]]], metric = m)
}, x = DT$data, m = DT$metric, SIMPLIFY = FALSE
)
)
}
> DT
gp data metric d1d3 d2d1 d2d2
1: A <data.table[3x3]> max 1.7 3.5 4
2: B <data.table[3x3]> min 5.1 15 20
SIMPLIFY = FALSE is required if it will return a list instead a vector.

Extracting vector with certain qualities from longer vector

I have the following data.table:
DT <- data.table(A = c(rep("aa",2),rep("bb",2)),
B = c(rep("H",2),rep("Na",2)),
Low = c(0,3,1,1),
High = c(8,10,9,8),
Time =c("0,1,2,3,4,5,6,7,8,9,10","0,1,2,3,4,5,6,7,8,9,10","0,1,2,3,4,5,6,7,8,9,10","0,1,2,3,4,5,6,7,8,9,10"),
Intensity = c("0,0,0,0,561464,0,0,0,0,0,0","0,0,0,6548,5464,5616,0,0,0,68716,0","5658,12,6548,6541,8,5646854,54565,56465,546,65,0","0,561464,0,0,0,0,0,0,0,0,0")
)
and use this code to extract the the highest number of consecutive intensity values above a certain value For a more detailed explanation on how this calculation works please see Reading and counting of consecutive points:
newCols <- do.call(rbind, Map(function(u, v, x, y) {
u1 <- as.numeric(u)
v1 <- as.numeric(v)
lb <- which.min(abs(x - u1))
ub <- which.min(abs(y - u1))
v3 <- as.numeric(v[(lb+1):(ub-1)])
i3 = with(rle(v3 > min(as.numeric(v[c(lb, ub)]))),
pmax(max(lengths[values]), 0))
data.frame(Consec.Points.base = i3)
},
strsplit(DT$Time, ","), strsplit(DT$Intensity, ","), DT$Low, DT$High))
DT <- cbind(DT, newCols)
I was wondering how it would be possible to instead of getting the length of the Consec.Points.base, to extract their actual points (Time and Intensity) as two vectors?
Thanks a lot in advance!
I think this answers your question, but let me know if I made a mistake, or something needs more thought/clarification.
DT <- data.table(A = c(rep("aa",2),rep("bb",2)),
B = c(rep("H",2),rep("Na",2)),
Low = c(0,3,1,1),
High = c(8,10,9,8),
Time =c("0,1,2,3,4,5,6,7,8,9,10","0,1,2,3,4,5,6,7,8,9,10","0,1,2,3,4,5,6,7,8,9,10","0,1,2,3,4,5,6,7,8,9,10"),
Intensity = c("0,0,0,0,561464,0,0,0,0,0,0","0,0,0,6548,5464,5616,0,0,0,68716,0","5658,12,6548,6541,8,5646854,54565,56465,546,65,0","0,561464,0,0,0,0,0,0,0,0,0")
)
# unique identifier
DT[, i := .I]
# re-structure
DT2 <- DT[, .(Time = as.numeric(strsplit(Time, ",")[[1]]),
Intensity = as.numeric(strsplit(Intensity, ",")[[1]])), by = i]
DT2 <- merge(DT2, DT[, .(i,A,B,Low,High)], by="i")
DT2 <- DT2[between(Time, Low, High, incbounds = FALSE),]
DT2[, IntensityGood := Intensity != min(Intensity), by=i]
# encode each part of sequence with its own value, if not FALSE
encoder <- function(x){
rle.response <- rle(x)
v2 <- rep(0, length(rle.response$values))
v2[rle.response$values!=FALSE] <- which(rle.response$values != FALSE)
rep(v2, rle.response$lengths)
}
DT2[, encodeI := encoder(IntensityGood), by = i]
# remove ones which are all 0, easily handle seperately
DT3 <- DT2[, test := all(encodeI==0), by=i][test==FALSE,][, test:=NULL]
# get count - can infer missing are 0
count <- DT3[encodeI!=0, .(max(table(encodeI))), by = i]
# get sequence
findMaxDt <- DT3[encodeI != 0, .N, by=.(i, encodeI)]
DT3 <- merge(DT3, findMaxDt, by=c("i", "encodeI"))
DT3 <- DT3[, Best := N==max(N), by=i]
DT3[Best==TRUE, .(list(Intensity)), by=i]

R custom data.table function with multiple variable inputs

I am writing a custom aggregation function with data.table (v 1.9.6) and struggle to pass function arguments to it. there have been similar questions on this but none deals with multiple (variable) inputs and none seems to have a conclusive answer but rather "little hacks".
pass variables and names to data.table function
eval and quote in data.table
How can one work fully generically in data.table in R with column names in variables
I would like to take a data table sum and order defined variables and create new variables on top (2 steps). the crucial think is that everything should be parameterized i.e. variables to sum, variables to group by, variables to order by. and they can all be one or more variables. a small example.
dt <- data.table(a=rep(letters[1:4], 5),
b=rep(letters[5:8], 5),
c=rep(letters[3:6], 5),
x=sample(1:100, 20),
y=sample(1:100, 20),
z=sample(1:100, 20))
temp <-
dt[, .(x_sum = sum(x, na.rm = T),
y_sum = sum(y, na.rm = T)),
by = .(a, b)][order(a, b)]
temp2 <-
temp[, `:=` (x_sum_del = (x_sum - shift(x = x_sum, n = 1, type = "lag")),
y_sum_del = (y_sum - shift(x = y_sum, n = 1, type = "lag")),
x_sum_del_rel = ((x_sum - shift(x = x_sum, n = 1, type = "lag")) /
(shift(x = x_sum, n = 1, type = "lag"))),
y_sum_del_rel = ((y_sum - shift(x = y_sum, n = 1, type = "lag")) /
(shift(x = y_sum, n = 1, type = "lag")))
)
]
how to programmatically pass following function arguments (i.e. not single inputs but vectors/list of inputs):
x and y --> var_list
new names of x and y (e.g. x_sum, y_sum) --> var_name_list
group by arguments a, b --> by_var_list
order by arguments a, b --> order_var_list
temp 2 should work on all pre-defined parameters, I was also thinking about using an apply function but again struggled to pass a list of variables.
I have played around with variations of get(), as.name(), eval(), quote() but as soon as I pass more than one variable, they don't work anymore. I hope the question is clear, otherwise I am happy to adjust where you deem necessary. a function call would look as follows:
fn_agg(dt, var_list, var_name_list, by_var_list, order_var_list)
Looks like a question to me :)
I prefer computing on the language over get/mget.
fn_agg = function(dt, var_list, var_name_list, by_var_list, order_var_list) {
j_call = as.call(c(
as.name("."),
sapply(setNames(var_list, var_name_list), function(var) as.call(list(as.name("sum"), as.name(var), na.rm=TRUE)), simplify=FALSE)
))
order_call = as.call(c(
as.name("order"),
lapply(order_var_list, as.name)
))
j2_call = as.call(c(
as.name(":="),
c(
sapply(setNames(var_name_list, paste0(var_name_list,"_del")), function(var) {
substitute(.var - shift(x = .var, n = 1, type = "lag"), list(.var=as.name(var)))
}, simplify=FALSE),
sapply(setNames(var_name_list, paste0(var_name_list,"_del_rel")), function(var) {
substitute((.var - shift(x = .var, n = 1, type = "lag")) / (shift(x = .var, n = 1, type = "lag")), list(.var=as.name(var)))
}, simplify=FALSE)
)
))
dt[eval(order_call), eval(j_call), by=by_var_list
][, eval(j2_call)
][]
}
ans = fn_agg(dt, var_list=c("x","y"), var_name_list=c("x_sum","y_sum"), by_var_list=c("a","b"), order_var_list=c("a","b"))
all.equal(temp2, ans)
#[1] TRUE
Some extra notes:
make strict input validation as debugging issues is more difficuilt against meta programming.
optimization of step2 is possible as shift is computed multiple times, easy way is just to compute _del in step2 and _del_rel in step3.
if order variables is always the same as by variables you can put them into keyby argument.
Here's an option using mget, as commented:
fn_agg <- function(DT, var_list, var_name_list, by_var_list, order_var_list) {
temp <- DT[, setNames(lapply(.SD, sum, na.rm = TRUE), var_name_list),
by = by_var_list, .SDcols = var_list]
setorderv(temp, order_var_list)
cols1 <- paste0(var_name_list, "_del")
cols2 <- paste0(cols1, "_rel")
temp[, (cols1) := lapply(mget(var_name_list), function(x) {
x - shift(x, n = 1, type = "lag")
})]
temp[, (cols2) := lapply(mget(var_name_list), function(x) {
xshift <- shift(x, n = 1, type = "lag")
(x - xshift) / xshift
})]
temp[]
}
fn_agg(dt,
var_list = c("x", "y"),
var_name_list = c("x_sum", "y_sum"),
by_var_list = c("a", "b"),
order_var_list = c("a", "b"))
# a b x_sum y_sum x_sum_del y_sum_del x_sum_del_rel y_sum_del_rel
#1: a e 254 358 NA NA NA NA
#2: b f 246 116 -8 -242 -0.031496063 -0.6759777
#3: c g 272 242 26 126 0.105691057 1.0862069
#4: d h 273 194 1 -48 0.003676471 -0.1983471
Instead of mget, you could also make use of data.table's .SDcols argument as in
temp[, (cols1) := lapply(.SD, function(x) {
x - shift(x, n = 1, type = "lag")
}), .SDcols = var_name_list]
Also, there are probably ways to improve the function by avoiding duplicated computation of shift(x, n = 1, type = "lag") but I only wanted to demonstrate a way to use data.table in functions.

Pass a list of columns to data.table’s CJ as a vector

I have the following code:
main_cols <- c('num', 'let')
dt <- data.table(num = 1:5, let = letters[1:5])
dt
new_dt <- dt[CJ(num = num
, let = let
, unique = TRUE)
, on = main_cols
]
head(new_dt, 10)
The thing is: I want to pass the columns to cross-join on as a vector. How do I “unpack” main_cols inside the CJ function? Thanks.
I think you'll want to use do.call, as #AnandaMahto suggested:
m = dt[, do.call(CJ, .SD), .SDcols=main_cols]
dt[m, on=main_cols]
You could also create m this way:
m = do.call(CJ, dt[,main_cols,with=FALSE])
If you have repeating values in the columns, use the unique option to CJ:
m = dt[, do.call(CJ, c(.SD, unique=TRUE)), .SDcols=main_cols]
# or
m = do.call(CJ, c(dt[,main_cols,with=FALSE], unique=TRUE))

Resources