I'm having a hard time solving this issue: for a given data.table, can I filter all rows that pass a criteria an all columns?
example:
dt <-data.table(col_a = c(1,1,0,0,1),
col_b = c(50,0,0,1,0),
col_c = c(0,0,0,0,0),
col_d = c(0,0,0,0,0),
col_e = c(1,0,0,0,10))
I want to return the rows that pass the filter<-c(T,F,F,F,T) - so row number 5
I've tried dt[, filter] - tells me that 'filter' is not found
tried dt[,c(T,F,F,F,T)] this returns a string [1] TRUE FALSE FALSE FALSE TRUE
Can I solve this by only using data.table?
It is unclear from the description of the post. Based on the comments, the OP wants to select the rows that matches the values in filter. In order to do that, first convert the columns to logical, replicate the filter to make the dimensions same before doing the comparison ==, get the rowSums, check if it equal to ncol of original dataset for subsetting the rows
dt[rowSums(dt[, lapply(.SD, as.logical)] == filter[col(dt)])== ncol(dt)]
# col_a col_b col_c col_d col_e
#1: 1 0 0 0 10
Or another option is to paste to single string and then compare
dt[dt[, do.call(paste0, lapply(.SD, function(x) +(as.logical(x))))]
== paste(+(filter), collapse = "")]
Or another approach is to loop through the columns, store the boolean comparison output as a list of vectors and Reduce
lst1 <- vector('list', ncol(dt))
for(j in seq_along(dt)) lst1[[j]] <- as.logical(dt[[j]]) == filter[j]
dt[Reduce(`&`, lst1)]
Or a similar approach with Map/Reduce
dt[dt[, Reduce(`&`, Map(`==`, lapply(.SD, as.logical), filter))]]
Considering the size of your actual dataset, you might be better off to convert it into a long format and then perform the filtering:
ans <- melt(DT[, rn := .I], id.vars="rn")[,
value := as.logical(value)][,
if (all(value==filter)) rn, rn]$V1
one timing:
library(data.table)
set.seed(0L)
nc <- 392
nr <- 2e6
filter <- sample(c(1,0), nc, TRUE)
loc <- which(filter>0L)
M <- matrix(sample(c(1,0), nc*nr, TRUE), nrow=nr)
DT <- as.data.table(M)
system.time({
ans <- melt(DT[, rn := .I], id.vars="rn")[,
value := as.logical(value)][,
if (all(value==filter)) rn, rn]$V1
})
# user system elapsed
# 2.20 0.84 1.72
some other options but not as fast as converting into a long format:
library(Matrix)
library(data.table)
library(microbenchmark)
set.seed(0L)
nc <- 392
nr <- 1e5
filter <- sample(c(1,0), nc, TRUE)
loc <- which(filter>0L)
M <- matrix(sample(c(1,0), nc*nr, TRUE), nrow=nr)
DT <- as.data.table(M)
# filter <- c(T,F,F,F,T)
# DT <- data.table(c(1,1,0,0,1), c(50,0,0,1,0), c(0,0,0,0,0), c(0,0,0,0,0), c(1,0,0,0,10))
# M <- as.matrix(DT)
loc <- which(filter>0L)
sumF <- sum(filter)
DTo_f <- copy(DT)
DTj_f <- copy(DT)
#Spare matrix
sm_f <- function() {
sM <- as(M, "dgTMatrix")
ixDT <- data.table(R=sM#i+1L, C=sM#j+1L, I=1L)
univ <- data.table(R=rep(1:nr, each=length(loc)), C=rep(loc, nr), U=1L)
mgDT <- merge(univ, ixDT, by=c("R", "C"), all=TRUE)
mgDT[, if(!(anyNA(U) | anyNA(I))) R, R]$V1
}
#melt
m_f <- function() {
melt(DT[, rn := .I], id.vars="rn")[,
value := as.logical(value)][,
if (all(value==filter)) rn, rn]$V1
}
#order
o_f <- function() {
non0 <- DTo_f[, {
m <- as.matrix(.SD)
ri <- replace(col(.SD), .SD==0L, NA_integer_)
as.data.table(matrix(ri[order(row(.SD), ri, na.last=TRUE)], nrow=.N, byrow=TRUE))
}]
non0[setNames(as.list(c(loc, rep(NA_integer_, nc - length(loc)))), names(DTo_f)),
on=.NATURAL, which=TRUE]
}
#join
j_f <- function() {
setindexv(DTj_f, names(DTj_f))
DTj_f[, names(DTj_f) := lapply(DTj_f, as.logical)]
DTj_f[as.list(as.logical(filter)), on=names(DTj_f), which=TRUE]
}
microbenchmark(sm_f(), m_f(), o_f(), j_f(), times=1L)
timings:
Unit: seconds
expr min lq mean median uq max neval
sm_f() 9.134432 9.134432 9.134432 9.134432 9.134432 9.134432 1
m_f() 2.020081 2.020081 2.020081 2.020081 2.020081 2.020081 1
o_f() 3.413685 3.413685 3.413685 3.413685 3.413685 3.413685 1
j_f() 7.149763 7.149763 7.149763 7.149763 7.149763 7.149763 1
You can use which(colSums((df>0)==filter)==nrow(df)) to get index
> which(colSums((df>0)==filter)==nrow(df))
col_e
5
such that
> df[which(colSums((df>0)==filter)==nrow(df))]
col_a col_b col_c col_d col_e
1: 1 0 0 0 10
If I understand the question correctly, this should answer the question.
Reproduce your data:
library(data.table)
dt <-data.table(col_a = c(1,1,0,0,1),
col_b = c(50,0,0,1,0),
col_c = c(0,0,0,0,0),
col_d = c(0,0,0,0,0),
col_e = c(1,0,0,0,10))
filter<-c(T,F,F,F,T)
Now create a variable that is checking for non-zero values in each row and subset accordingly
to_subset = apply(dt, 1, function(x) {
all((x > 0) == filter)
})
# the output you are looking for
dt[to_subset]
# col_a col_b col_c col_d col_e
# 1: 1 0 0 0 10
The code can be collapsed to be more concise.
dt[apply(dt, 1, function(x) all((x > 0) == filter))]
# col_a col_b col_c col_d col_e
# 1: 1 0 0 0 10
Related
I have a data.table with which I'd like to perform the same operation on certain columns. The names of these columns are given in a character vector. In this particular example, I'd like to multiply all of these columns by -1.
Some toy data and a vector specifying relevant columns:
library(data.table)
dt <- data.table(a = 1:3, b = 1:3, d = 1:3)
cols <- c("a", "b")
Right now I'm doing it this way, looping over the character vector:
for (col in 1:length(cols)) {
dt[ , eval(parse(text = paste0(cols[col], ":=-1*", cols[col])))]
}
Is there a way to do this directly without the for loop?
This seems to work:
dt[ , (cols) := lapply(.SD, "*", -1), .SDcols = cols]
The result is
a b d
1: -1 -1 1
2: -2 -2 2
3: -3 -3 3
There are a few tricks here:
Because there are parentheses in (cols) :=, the result is assigned to the columns specified in cols, instead of to some new variable named "cols".
.SDcols tells the call that we're only looking at those columns, and allows us to use .SD, the Subset of the Data associated with those columns.
lapply(.SD, ...) operates on .SD, which is a list of columns (like all data.frames and data.tables). lapply returns a list, so in the end j looks like cols := list(...).
EDIT: Here's another way that is probably faster, as #Arun mentioned:
for (j in cols) set(dt, j = j, value = -dt[[j]])
I would like to add an answer, when you would like to change the name of the columns as well. This comes in quite handy if you want to calculate the logarithm of multiple columns, which is often the case in empirical work.
cols <- c("a", "b")
out_cols = paste("log", cols, sep = ".")
dt[, c(out_cols) := lapply(.SD, function(x){log(x = x, base = exp(1))}), .SDcols = cols]
UPDATE: Following is a neat way to do it without for loop
dt[,(cols):= - dt[,..cols]]
It is a neat way for easy code readability. But as for performance it stays behind Frank's solution according to below microbenchmark result
mbm = microbenchmark(
base = for (col in 1:length(cols)) {
dt[ , eval(parse(text = paste0(cols[col], ":=-1*", cols[col])))]
},
franks_solution1 = dt[ , (cols) := lapply(.SD, "*", -1), .SDcols = cols],
franks_solution2 = for (j in cols) set(dt, j = j, value = -dt[[j]]),
hannes_solution = dt[, c(out_cols) := lapply(.SD, function(x){log(x = x, base = exp(1))}), .SDcols = cols],
orhans_solution = for (j in cols) dt[,(j):= -1 * dt[, ..j]],
orhans_solution2 = dt[,(cols):= - dt[,..cols]],
times=1000
)
mbm
Unit: microseconds
expr min lq mean median uq max neval
base_solution 3874.048 4184.4070 5205.8782 4452.5090 5127.586 69641.789 1000
franks_solution1 313.846 349.1285 448.4770 379.8970 447.384 5654.149 1000
franks_solution2 1500.306 1667.6910 2041.6134 1774.3580 1961.229 9723.070 1000
hannes_solution 326.154 405.5385 561.8263 495.1795 576.000 12432.400 1000
orhans_solution 3747.690 4008.8175 5029.8333 4299.4840 4933.739 35025.202 1000
orhans_solution2 752.000 831.5900 1061.6974 897.6405 1026.872 9913.018 1000
as shown in below chart
My Previous Answer:
The following also works
for (j in cols)
dt[,(j):= -1 * dt[, ..j]]
None of above solutions seems to work with calculation by group. Following is the best I got:
for(col in cols)
{
DT[, (col) := scale(.SD[[col]], center = TRUE, scale = TRUE), g]
}
dplyr functions work on data.tables, so here's a dplyr solution that also "avoids the for-loop" :)
dt %>% mutate(across(all_of(cols), ~ -1 * .))
I benchmarked it using orhan's code (adding rows and columns) and you'll see dplyr::mutate with across mostly executes faster than most of the other solutions and slower than the data.table solution using lapply.
library(data.table); library(dplyr)
dt <- data.table(a = 1:100000, b = 1:100000, d = 1:100000) %>%
mutate(a2 = a, a3 = a, a4 = a, a5 = a, a6 = a)
cols <- c("a", "b", "a2", "a3", "a4", "a5", "a6")
dt %>% mutate(across(all_of(cols), ~ -1 * .))
#> a b d a2 a3 a4 a5 a6
#> 1: -1 -1 1 -1 -1 -1 -1 -1
#> 2: -2 -2 2 -2 -2 -2 -2 -2
#> 3: -3 -3 3 -3 -3 -3 -3 -3
#> 4: -4 -4 4 -4 -4 -4 -4 -4
#> 5: -5 -5 5 -5 -5 -5 -5 -5
#> ---
#> 99996: -99996 -99996 99996 -99996 -99996 -99996 -99996 -99996
#> 99997: -99997 -99997 99997 -99997 -99997 -99997 -99997 -99997
#> 99998: -99998 -99998 99998 -99998 -99998 -99998 -99998 -99998
#> 99999: -99999 -99999 99999 -99999 -99999 -99999 -99999 -99999
#> 100000: -100000 -100000 100000 -100000 -100000 -100000 -100000 -100000
library(microbenchmark)
mbm = microbenchmark(
base_with_forloop = for (col in 1:length(cols)) {
dt[ , eval(parse(text = paste0(cols[col], ":=-1*", cols[col])))]
},
franks_soln1_w_lapply = dt[ , (cols) := lapply(.SD, "*", -1), .SDcols = cols],
franks_soln2_w_forloop = for (j in cols) set(dt, j = j, value = -dt[[j]]),
orhans_soln_w_forloop = for (j in cols) dt[,(j):= -1 * dt[, ..j]],
orhans_soln2 = dt[,(cols):= - dt[,..cols]],
dplyr_soln = (dt %>% mutate(across(all_of(cols), ~ -1 * .))),
times=1000
)
library(ggplot2)
ggplot(mbm) +
geom_violin(aes(x = expr, y = time)) +
coord_flip()
Created on 2020-10-16 by the reprex package (v0.3.0)
To add example to create new columns based on a string vector of columns. Based on Jfly answer:
dt <- data.table(a = rnorm(1:100), b = rnorm(1:100), c = rnorm(1:100), g = c(rep(1:10, 10)))
col0 <- c("a", "b", "c")
col1 <- paste0("max.", col0)
for(i in seq_along(col0)) {
dt[, (col1[i]) := max(get(col0[i])), g]
}
dt[,.N, c("g", col1)]
library(data.table)
(dt <- data.table(a = 1:3, b = 1:3, d = 1:3))
Hence:
a b d
1: 1 1 1
2: 2 2 2
3: 3 3 3
Whereas (dt*(-1)) yields:
a b d
1: -1 -1 -1
2: -2 -2 -2
3: -3 -3 -3
data1=data.frame("StudentID"=c(1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5,6,6,6,6,6,6),
"Time"=c(1,2,3,4,5,6,1,2,3,4,5,6,1,2,3,4,5,6,1,2,3,4,5,6,1,2,3,4,5,6,1,2,3,4,5,6),
"var1"=c(0,0,0,NA,1,2,0,1,2,2,2,2,0,0,NA,1,1,1,NA,0,0,0,0,1,0,0,0,NA,0,0,0,0,0,1,NA,NA))
library(dplyr)
data2 <- group_by(data1, StudentID) %>%
slice(seq_len(min(which(var1 == 1), n())))
After much attempt I am able to obtain 'data2' from 'data1'. The rule is simple that in data1 FOR EACH STUDENTID if var1 equals to 1, keep that row and delete everything after.
If we want a similar option in data.table, either use the condition in .SD
library(data.table)
setDT(data1)[, .SD[c(seq_len(min(which(var1 == 1), .N)))],.(StudentID)]
or use row index with .I, and extract the column as $V1 to subset the dataset
setDT(data1)[data1[, .I[c(seq_len(min(which(var1 == 1), .N)))],.(StudentID)]$V1]
Or with match
setDT(data1)[, .SD[seq_len(min(match(1, var1), .N, na.rm = TRUE))], .(StudentID)]
Another option is to find the rows where var1 == 1L and use unique to select the top row then perform a non-equi inner join to filter the rows:
library(data.table)
setDT(data1)
f <- unique(data1[var1==1L | c(diff(StudentID) != 0L, TRUE)], by="StudentID")[, var1 := NULL]
f[data1, on=.(StudentID, Time>=Time), nomatch=0L]
timing code:
library(data.table)
setDT(data1)
DT <- rbindlist(replicate(2e5, data1, simplify=FALSE))
DT[, StudentID:=c(1L, 1L+cumsum(diff(StudentID)!=0L))]
microbenchmark::microbenchmark(times=1L,
mtd0 = a1 <- {
DT[DT[, .I[c(seq_len(min(which(var1 == 1), .N)))],.(StudentID)]$V1]
},
mtd1 = a2 <- {
f <- unique(DT[var1==1L | c(diff(StudentID) != 0L, TRUE)], by="StudentID")[, var1 := NULL]
f[DT, on=.(StudentID, Time>=Time), nomatch=0L]
}
)
fsetequal(a1, a2)
#[1] TRUE
timings:
Unit: seconds
expr min lq mean median uq max neval
mtd0 2.830089 2.830089 2.830089 2.830089 2.830089 2.830089 1
mtd1 1.153433 1.153433 1.153433 1.153433 1.153433 1.153433 1
I need to unite several columns with delimiter in my huge data.table. So I use unite from tidyr package for it.
Do you know if there is any data.table optimized version for that?
library(data.table)
data <- data.table(id=1:10, col1=11:20, col2=21:30, col3=31:40)
print(data)
library(tidyr)
data <- unite(data, "col_test", col1, col2, col3)
print(data)
We can use do.call with paste
data[, .(id, col_test=do.call(paste, c(.SD, sep="_"))), .SDcols= col1:col3]
# id col_test
# 1: 1 11_21_31
# 2: 2 12_22_32
# 3: 3 13_23_33
# 4: 4 14_24_34
# 5: 5 15_25_35
# 6: 6 16_26_36
# 7: 7 17_27_37
# 8: 8 18_28_38
# 9: 9 19_29_39
#10: 10 20_30_40
Benchmarks
microbenchmark(
tidyr_unite = {
unite(data1, "col_test", col1, col2, col3)
},
dt_docallPaste = {
data1[, .(id = data1[["id"]], col_test = do.call(paste, c(.SD, sep="_"))),
.SDcols= col1:col3]
},
apply_Paste = {
cbind.data.frame(id = data1$id,
col_test = apply(data1[, -1, with = FALSE], 1,
paste, collapse = "_"))
},
times = 10
)
# Unit: seconds
# expr min lq mean median uq max neval cld
# tidyr_unite 7.501491 7.521328 7.720600 7.647506 7.756273 8.219710 10 a
# dt_docallPaste 7.530711 7.558436 7.910604 7.618165 8.429796 8.497932 10 a
# apply_Paste 44.743782 45.797092 46.791288 46.325188 47.330887 51.155663 10 b
Compared to base apply, it looks like tidyr and data.table versions are equally efficient. That is to be expected as unite is simply a wrapper around do.call("paste", ...)
As you can see from the source code:
unite_.data.frame <- function(data, col, from, sep = "_", remove = TRUE) {
united <- do.call("paste", c(data[from], list(sep = sep)))
first_col <- which(names(data) %in% from)[1]
data2 <- data
if (remove) {
data2 <- data2[setdiff(names(data2), from)]
}
append_col(data2, united, col, after = first_col - 1)
}
Consider a data set consisting of a grouping variable (here id) and an ordered variable (here date)
(df <- data.frame(
id = rep(1:2,2),
date = 4:1
))
# id date
# 1 1 4
# 2 2 3
# 3 1 2
# 4 2 1
I'm wondering what the easiest way is in data.table to do the equivalent of this dplyr code:
library(dplyr)
df %>%
group_by(id) %>%
filter(min_rank(date)==1)
# Source: local data frame [2 x 2]
# Groups: id
#
# id date
# 1 1 2
# 2 2 1
i.e. for each id get the first according to date.
Based on a similar stackoverflow question (Create an "index" for each element of a group with data.table), I came up with this
library(data.table)
dt <- data.table(df)
setkey(dt, id, date)
for(k in unique(dt$id)){
dt[id==k, index := 1:.N]
}
dt[index==1,]
But it seems like there should be a one-liner for this. Being unfamiliar with data.table I thought something like this
dt[,,mult="first", by=id]
should work, but alas! The last bit of code seems like it should group by id and then take the first (which within id would be determined by date since I've set the keys in this way.)
EDIT
Thanks to Ananda Mahto, this one-liner will now be in my data.table repertoire
dt[,.SD[1], by=id]
# id date
# 1: 1 2
# 2: 2 1
Working directly with your source data.frame, you can try:
setkey(as.data.table(df), id, date)[, .SD[1], by = id]
# id date
# 1: 1 2
# 2: 2 1
Extending your original idea, you can just do:
dt <- data.table(df)
setkey(dt, id, date)
dt[, index := sequence(.N), by = id][index == 1]
# id date index
# 1: 1 2 1
# 2: 2 1 1
It might be that at a certain scale, David is correct about head vs [1], but I'm not sure what scale that would be.
set.seed(1)
nrow <- 10000
ncol <- 20
df <- data.frame(matrix(sample(10, nrow * ncol, TRUE), nrow = nrow, ncol = ncol))
fun1 <- function() setkey(as.data.table(df), X1, X2)[, head(.SD, 1), by = X1]
fun2 <- function() setkey(as.data.table(df), X1, X2)[, .SD[1], by = X1]
library(microbenchmark)
microbenchmark(fun1(), fun2())
# Unit: milliseconds
# expr min lq mean median uq max neval
# fun1() 12.178189 12.496777 13.400905 12.808523 13.483545 30.28425 100
# fun2() 4.474345 4.554527 4.948255 4.620596 4.965912 8.17852 100
Here's another option using data.tables binary search
setkey(dt[, indx := seq_len(.N), by = id], indx)[J(1)]
# id date indx
# 1: 1 2 1
# 2: 2 1 1
Some benchmarks:
It seems that all the methods perform more or less the same, but on huge data set (1e+06*1e+2) binrary search wins
set.seed(1)
nrow <- 1e6
ncol <- 1e2
df <- data.frame(matrix(sample(10, nrow * ncol, TRUE), nrow = nrow, ncol = ncol))
library(data.table)
funAM1 <- function() setkey(as.data.table(df), X1, X2)[, .SD[1], by = X1]
funAM2 <- function() setkey(as.data.table(df), X1, X2)[, index := sequence(.N), by = X1][index == 1]
funDA1 <- function() setkey(as.data.table(df), X1, X2)[, head(.SD, 1), by = X1]
funDA2 <- function() setkey(as.data.table(df)[, indx := seq_len(.N), by = X1], X1)[J(1)]
library(microbenchmark)
Res <- microbenchmark(funAM1(), funAM2(), funDA1(), funDA2())
Res
# Unit: milliseconds
# expr min lq median uq max neval
# funAM1() 737.5690 758.3015 771.9344 794.1417 910.1019 100
# funAM2() 631.7822 693.8286 704.6912 729.6960 806.5556 100
# funDA1() 757.0327 772.4353 784.3107 810.0759 938.6344 100
# funDA2() 564.7291 578.1089 587.6470 611.7269 740.4077 100
boxplot(Res)
I was playing around with data.table and I came across a distinction that I'm not sure I quite understand. Given the following dataset:
library(data.table)
set.seed(400)
DT <- data.table(x = sample(LETTERS[1:5], 20, TRUE), key = "x"); DT
Can you please explain to me the difference between the following expressions?
1) DT[J("E"), .I]
2) DT[ , .I[x == "E"] ]
3) DT[x == "E", .I]
set.seed(400)
library(data.table)
DT <- data.table(x = sample(LETTERS[1:5], 20, TRUE), key = "x"); DT
1)
DT[ , .I[x == "E"] ] # [1] 18 19 20
is a data.table where .I is a vector representing the row number of E in the ORIGINAL dataset DT
2)
DT[J("E") , .I] # [1] 1 2 3
DT["E" , .I] # [1] 1 2 3
DT[x == "E", .I] # [1] 1 2 3
are all the same, producing a vector where .Is are vectors representing the row numbers of the Es in the NEW subsetted data