Remove trailing (last) rows with NAs in all columns - r

I am trying to exclude rows that have missing values (NA) in all columns AND for which all subsequent rows also contain only missing values (or that are themselves the last, empty row), i.e. I want to remove trailing "all-NA" rows.
I came up with the solution below, which works but is too slow (I am using this function on thousands of tables), probably because of the while loop.
library(data.table)
library(magrittr)

## Aux function to remove NA rows below table
remove_empty_row_last <- function(dt){
  dt[, row_empty := rowSums(is.na(dt)) == ncol(dt)]
  while (dt[.N, row_empty] == TRUE) {
    dt <- dt[1:(.N - 1)]
  }
  dt %>% return()
}
d <- data.table(a = c(1,NA,3,NA,5,NA,NA), b = c(1,NA,3,4,5,NA,NA))
remove_empty_row_last(d)
#EDIT2: adding more test cases
d2 <- data.table(A = c(1,NA,3,NA,5,1 ,NA), B = c(1,NA,3,4,5,NA,NA))
remove_empty_row_last(d2)
d3 <- data.table(A = c(1,NA,3,NA,5,NA,NA), B = c(1,NA,3,4,5,1,NA))
remove_empty_row_last(d3)
#Edit3: adding a test case with no all-NA rows
d4 <- data.table(A = c(1,2,3,NA,5,NA,NA), B = c(1,2,3,4,5,1,7))
d4 %>% remove_empty_row_last()

This seems to work with all test cases.
The idea is to use a reverse cumsum to filter out the NA rows at the end.
library(data.table)
remove_empty_row_last_new <- function(d) {
  d[d[, is.na(rev(cumsum(rev(ifelse(rowSums(!is.na(.SD)) == 0, 1, NA)))))]]
}
d <- data.table(a=c(1,NA,3,NA,5,NA,NA),b=c(1,NA,3,4,5,NA,NA))
remove_empty_row_last_new(d)
#> a b
#> 1: 1 1
#> 2: NA NA
#> 3: 3 3
#> 4: NA 4
#> 5: 5 5
d2 <- data.table(A=c(1,NA,3,NA,5,1 ,NA),B=c(1,NA,3,4,5,NA,NA))
remove_empty_row_last_new(d2)
#> A B
#> 1: 1 1
#> 2: NA NA
#> 3: 3 3
#> 4: NA 4
#> 5: 5 5
#> 6: 1 NA
d3 <- data.table(A=c(1,NA,3,NA,5,NA,NA),B=c(1,NA,3,4,5,1,NA))
remove_empty_row_last_new(d3)
#> A B
#> 1: 1 1
#> 2: NA NA
#> 3: 3 3
#> 4: NA 4
#> 5: 5 5
#> 6: NA 1
d4 <- data.table(A=c(1,2,3,NA,5,NA,NA),B=c(1,2,3,4,5,1,7))
remove_empty_row_last_new(d4)
#> A B
#> 1: 1 1
#> 2: 2 2
#> 3: 3 3
#> 4: NA 4
#> 5: 5 5
#> 6: NA 1
#> 7: NA 7
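For intuition, here is what the intermediate vectors of the reverse-cumsum trick look like for the two-column d defined above (a minimal sketch):
flag <- d[, ifelse(rowSums(!is.na(.SD)) == 0, 1, NA)]  # 1 marks an all-NA row, NA otherwise
flag
#> [1] NA  1 NA NA NA  1  1
rev(cumsum(rev(flag)))  # cumsum propagates NA through the reversed vector, so only the trailing all-NA rows stay non-NA
#> [1] NA NA NA NA NA  2  1
is.na(rev(cumsum(rev(flag))))  # TRUE = keep the row
#> [1]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE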
You'll have to check performance on your real dataset, but it seems a bit faster:
> microbenchmark::microbenchmark(remove_empty_row_last(d),remove_empty_row_last_new(d))
Unit: microseconds
expr min lq mean median uq max neval cld
remove_empty_row_last(d) 384.701 411.800 468.5251 434.251 483.7515 1004.401 100 b
remove_empty_row_last_new(d) 345.201 359.301 416.1650 382.501 450.5010 1104.401 100 a

Maybe this will be fast enough?
d[!d[,any(rowSums(is.na(.SD)) == ncol(.SD)) & rleid(rowSums(is.na(.SD)) == ncol(.SD)) == max(rleid(rowSums(is.na(.SD)) == ncol(.SD))),]]
a b
1: 1 1
2: NA NA
3: 3 3
4: NA 4
5: 5 5
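For intuition, here are the intermediate pieces of that filter for the two-column d (a sketch):
d[, rowSums(is.na(.SD)) == ncol(.SD)]         # which rows are entirely NA
#> [1] FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE
d[, rleid(rowSums(is.na(.SD)) == ncol(.SD))]  # run ids; the trailing all-NA rows form the last run
#> [1] 1 2 3 3 3 4 4
The condition flags the rows of that last run (here rows 6 and 7), and the outer ! keeps everything else.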

Here's another approach that relies on Rcpp.
library(Rcpp)
library(data.table)
Rcpp::cppFunction("
IntegerVector which_end_cont(LogicalVector x) {
  const int n = x.size();
  int consecutive = 0;
  for (int i = n - 1; i >= 0; i--) {
    if (x[i]) consecutive++; else break;
  }
  IntegerVector out(consecutive);
  if (consecutive == 0)
    return(out);
  else
    return(seq(1, n - consecutive));
}
")
remove_empty_row_last3 <- function(dt) {
  lgl = rowSums(is.na(dt)) == length(dt)
  ind = which_end_cont(lgl)
  if (length(ind)) return(dt[ind]) else return(dt)
}
Basically, it:
- uses R to find out which rows are completely NA;
- uses Rcpp to loop through that logical vector and count how many consecutive empty rows there are at the end, which keeps the memory allocated to a minimum;
- if there are no empty rows at the end, avoids allocating anything new and simply returns the input; otherwise, it builds the index sequence in Rcpp and returns it to subset the data.table.
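A quick check on the two-column d from the question (a sketch):
remove_empty_row_last3(d)
#> a b
#> 1: 1 1
#> 2: NA NA
#> 3: 3 3
#> 4: NA 4
#> 5: 5 5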
Using microbenchmark, this is about 3 times faster for cases in which there are empty rows at the end, and about 15 times faster when there are none.
Edit
If you have taken the time to add Rcpp, the nice thing is that data.table has exported some of its internal functions so that they can be called directly from C. That can further simplify things and make it very, very quick, mainly because we can skip the non-standard evaluation performed during [.data.table; this is why all cases are now roughly 15 times faster than the OP's original function.
Rcpp::cppFunction("
SEXP mysub2(SEXP dt, LogicalVector x) {
  const int n = x.size();
  int consecutive = 0;
  for (int i = n - 1; i >= 0; i--) {
    if (x[i]) consecutive++; else break;
  }
  if (consecutive == 0)
    return(dt);
  else
    return(DT_subsetDT(dt, wrap(seq(1, n - consecutive)), wrap(seq_len(LENGTH(dt)))));
}",
include = "#include <datatableAPI.h>",
depends = "data.table")
remove_empty_row_last4 <- function(dt) {
  lgl = rowSums(is.na(dt)) == length(dt)
  return(mysub2(dt, lgl))
}
dt = copy(d)
dt2 = copy(d2)
dt3 = copy(d3)
dt4 = copy(d4)
microbenchmark::microbenchmark(original = remove_empty_row_last(d3),
                               rcpp_subset = remove_empty_row_last4(dt3),
                               rcpp_ind_only = remove_empty_row_last3(dt3),
                               waldi = remove_empty_row_last_new(dt3),
                               ian = dt3[!dt3[, any(rowSums(is.na(.SD)) == ncol(.SD)) & rleid(rowSums(is.na(.SD)) == ncol(.SD)) == max(rleid(rowSums(is.na(.SD)) == ncol(.SD))), ]],
                               andrew = remove_empty_row_last_andrew(dt3))
## Unit: microseconds
## expr min lq mean median uq max neval
## original 498.0 519.00 539.602 537.65 551.85 621.6 100
## rcpp_subset 34.0 39.95 43.422 43.30 46.70 59.0 100
## rcpp_ind_only 116.9 129.75 139.943 140.15 146.35 177.7 100
## waldi 370.9 387.70 408.910 400.55 417.90 683.4 100
## ian 432.0 445.30 461.310 456.25 473.35 554.1 100
## andrew 120.0 131.40 143.153 141.60 151.65 197.5 100

I am late to the party but here is another option that should be relatively memory efficient and only uses base R.
library(data.table)
d <- data.table(a=c(1,NA,3,NA,5,NA,NA),b=c(1,NA,3,4,5,NA,NA))
remove_empty_row_last_andrew(d)
#> a b
#> 1: 1 1
#> 2: NA NA
#> 3: 3 3
#> 4: NA 4
#> 5: 5 5
d2 <- data.table(A=c(1,NA,3,NA,5,1 ,NA),B=c(1,NA,3,4,5,NA,NA))
remove_empty_row_last_andrew(d2)
#> A B
#> 1: 1 1
#> 2: NA NA
#> 3: 3 3
#> 4: NA 4
#> 5: 5 5
#> 6: 1 NA
d3 <- data.table(A=c(1,NA,3,NA,5,NA,NA),B=c(1,NA,3,4,5,1,NA))
remove_empty_row_last_andrew(d3)
#> A B
#> 1: 1 1
#> 2: NA NA
#> 3: 3 3
#> 4: NA 4
#> 5: 5 5
#> 6: NA 1
d4 <- data.table(A=c(1,2,3,NA,5,NA,NA),B=c(1,2,3,4,5,1,7))
remove_empty_row_last_andrew(d4)
#> A B
#> 1: 1 1
#> 2: 2 2
#> 3: 3 3
#> 4: NA 4
#> 5: 5 5
#> 6: NA 1
#> 7: NA 7
Created on 2021-02-01 by the reprex package (v0.3.0)
Function:
remove_empty_row_last_andrew = function(x) {
  idx = do.call(pmin.int, lapply(x, is.na))
  length_idx = length(idx)
  if (idx[length_idx] == 0) {
    return(x)
  }
  start_idx = length_idx - which.min(idx[length_idx:1L]) + 2
  x = x[-(start_idx:length_idx), ]
  x
}
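To see what the index vector does, here is a sketch on the two-column d from the question:
idx <- do.call(pmin.int, lapply(d, is.na))  # non-zero only where every column of the row is NA
which.min(idx[length(idx):1L])              # position, counted from the end, of the first row that is not all-NA
#> [1] 3
So start_idx becomes 7 - 3 + 2 = 6, and rows 6:7 are dropped.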

Related

How to update both data.tables in a join

Suppose I would like to track which rows from one data.table were merged to another data.table. Is there a way to do this at once / while merging? Please see my example below and the way I usually do it. However, this seems rather inefficient.
Example
library(data.table)
# initial data
DT = data.table(x = c(1,1,1,2,2,1,1,2,2),
y = c(1,3,6))
# data to merge
DTx <- data.table(x = 1:3,
y = 1,
k = "X")
# regular update join
copy(DT)[DTx,
on = .(x, y),
k := i.k][]
#> x y k
#> 1: 1 1 X
#> 2: 1 3 <NA>
#> 3: 1 6 <NA>
#> 4: 2 1 X
#> 5: 2 3 <NA>
#> 6: 1 6 <NA>
#> 7: 1 1 X
#> 8: 2 3 <NA>
#> 9: 2 6 <NA>
# DTx remains the same
DTx
#> x y k
#> 1: 1 1 X
#> 2: 2 1 X
#> 3: 3 1 X
What I usually do:
# set an Id variable
DTx[, Id := .I]
# assign the Id in merge
DT[DTx,
on = .(x, y),
`:=`(k = i.k,
matched_id = i.Id)][]
#> x y k matched_id
#> 1: 1 1 X 1
#> 2: 1 3 <NA> NA
#> 3: 1 6 <NA> NA
#> 4: 2 1 X 2
#> 5: 2 3 <NA> NA
#> 6: 1 6 <NA> NA
#> 7: 1 1 X 1
#> 8: 2 3 <NA> NA
#> 9: 2 6 <NA> NA
# use matched_id to find merged rows
DTx[, matched := fifelse(Id %in% DT$matched_id, TRUE, FALSE)]
DTx
#> x y k Id matched
#> 1: 1 1 X 1 TRUE
#> 2: 2 1 X 2 TRUE
#> 3: 3 1 X 3 FALSE
Following Jan's comment:
This will provide you indices of matching rows but you will have to call merge again to perform actual merging, unless you manually use provided indices to match/update those tables.
You can pull the indices:
merge_metaDT = DT[DTx, on=.(x, y), .(irow = .GRP, xrow = .I), by=.EACHI]
x y irow xrow
1: 1 1 1 1
2: 1 1 1 7
3: 2 1 2 4
4: 3 1 3 0
Then apply edits to each table using indices rather than merging or matching a second time:
rowDT = merge_metaDT[xrow != 0L]
DT[rowDT$xrow, k := DTx[rowDT$irow, k]]
DTx[, matched := FALSE][rowDT$irow, matched := TRUE]
How it works:
- When joining, x[i], the symbol .I indexes the rows of x.
- When grouping in a join with by=.EACHI, .GRP indexes each group, which here means each row of i.
- We drop the non-matching values of .I, which are coded as zeros.
On this last point, we might expect NAs instead of zeros, as returned by DT[DTx, on=.(x, y), which=TRUE]. I'm not sure why these differ.
Suppose I would like to track which rows from one data.table were merged to another data.table. is there a way to do this at once/while merging? [...] seems rather inefficient.
I expect this is more efficient than multiple merges or %in% when the merge is costly enough.
It still requires multiple steps. I doubt there's any way around that, since it would be hard to come up with logic and syntax for the update that is easy to follow.
Update logic is already complex in base R, with multiple edits on a single index allowed:
> x = c(1, 2, 3)
> x[c(1, 1)] = c(4, 5)
> x
[1] 5 2 3
And there is the question of how to match and edit multiple indices at once:
> x = c(1, 1, 3)
> x[match(c(1, 3), x)] = c(4, 5)
> x
[1] 4 1 5
In data.table updates, the latter issue is handled with mult=. In the update-two-tables use case, these questions would get much more complicated.
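For reference, mult= chooses which of several matching rows of x are used for each row of i. A minimal sketch on the tables above (a plain join, not an update; output omitted):
# keep only the first matching row of DT for each row of DTx
DT[DTx, on = .(x, y), mult = "first"]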

data.table frollmean very slow

I'm trying to calculate the rolling mean of a column in a large data.table (~30M rows) aggregated by two other columns.
The rolling mean should include only the preceding N row values, not the row value itself.
For this purpose, I had to define my own rolling-mean function based on the frollmean function (with N = 3).
Applying the function to the column is really really slow, rendering it rather useless.
Here is sample data:
require(data.table)
DT <- data.table(ID=c('A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C')
, value_type =c('type 1', 'type 1','type 2','type 1','type 2','type 2','type 1','type 1','type 2','type 1','type 1','type 1')
, value=c(1,4,7,2,3,5,1,6,8,2,2,3))
DT
ID value_type value
1: A type 1 1
2: A type 1 4
3: A type 2 7
4: A type 1 2
5: A type 2 3
6: A type 2 5
7: B type 1 1
8: B type 1 6
9: B type 2 8
10: C type 1 2
11: C type 1 2
12: C type 1 3
# this is the customised rolling function
lrollmean <- function(x){
  head(frollmean(c(NA, NA, NA, x), n = 3, fill = NA, algo = "exact",
                 align = "right", na.rm = TRUE)[-(1:2)], -1)
}
> DT[, roll_mean := lrollmean(value), by=.(ID, value_type)]
> DT
ID value_type value roll_mean
1: A type 1 1 NaN
2: A type 1 4 1.0
3: A type 2 7 NaN
4: A type 1 2 2.5
5: A type 2 3 7.0
6: A type 2 5 5.0
7: B type 1 1 NaN
8: B type 1 6 1.0
9: B type 2 8 NaN
10: C type 1 2 NaN
11: C type 1 2 2.0
12: C type 1 3 2.0
This operation takes more than 30 minutes! I've got a reasonable machine with ample RAM, and I suspect the long run time has more to do with my code than with the machine.
Can you try and see if it's fast enough:
n <- 3L
DT[, roll_mean := {
  v <- if (.N - n >= 1L) c(seq.int(n), rep(n, .N - n)) else seq.int(min(n, .N))
  shift(frollmean(value, v, adaptive = TRUE))
}, .(ID, value_type)]
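To see what the adaptive window vector v looks like, here is a sketch for a hypothetical group of six rows with n = 3 (N stands in for .N, the group size):
n <- 3L; N <- 6L
v <- if (N - n >= 1L) c(seq.int(n), rep(n, N - n)) else seq.int(min(n, N))
v
#> [1] 1 2 3 3 3 3
# frollmean(value, v, adaptive = TRUE) averages the last v[i] values ending at row i;
# shift() then moves the result down one row, so each row only sees the rows before it.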
But if you have a large number of small groups, you can try:
setorder(DT[, rn := .I], ID, value_type)        # remember the original row order, then sort by group
rid <- DT[, rowid(ID, value_type)]              # within-group row counter
DT[, roll_mean := shift(frollmean(value, n))]   # ungrouped rolling mean of the 3 preceding rows (valid from a group's 4th row on)
ix <- DT[rid == 3L, which = TRUE]
set(DT, ix, "roll_mean", DT[, shift(frollmean(value, n - 1L))][ix])  # 3rd row of a group: mean of its 2 preceding rows
ix <- DT[rid == 2L, which = TRUE]
set(DT, ix, "roll_mean", DT[, shift(value)][ix])                     # 2nd row of a group: just the preceding value
DT[rid == 1L, roll_mean := NA_real_]                                 # 1st row of a group: nothing precedes it
setorder(DT, rn)[]                                                   # restore the original row order
You can try frollapply, since frollmean doesn't completely suit your needs. You can also optimize the function you apply to the window, since you don't need a very complicated operation. I've tried a few modifications to your function that should cut your time down by around 50%.
library(data.table)
library(stringi)
N=1e6
set.seed(123)
DT <- data.table(ID=stri_rand_strings(N,3),
value=rnorm(N,5,5))
head(DT)
#> ID value
#> 1: HmP 12.2667538
#> 2: sw2 -2.2397053
#> 3: WtY 7.0911933
#> 4: SxS 0.4029431
#> 5: gZ6 8.6800795
#> 6: tF2 0.8228594
DT[,.(.N),by=ID][order(N)]
#> ID N
#> 1: HoR 1
#> 2: eNM 1
#> 3: I9h 1
#> 4: xjb 1
#> 5: eFH 1
#> ---
#> 234823: 34Y 15
#> 234824: Xcm 15
#> 234825: IOu 15
#> 234826: tob 16
#> 234827: f70 16
# Your function
lrollmean <- function(x){
  head(frollmean(c(NA, NA, NA, x), n = 3, fill = NA, algo = "exact",
                 align = "right", na.rm = TRUE)[-(1:2)], -1)
}
# Possible modifications:
lrollmean1 <- function(x, n){
  frollapply(c(rep(NA, n), x), n + 1, weighted.mean, c(rep(1, n), 0), na.rm = TRUE)[-(1:3)]
}
lrollmean2 <- function(x, n){
  frollapply(c(rep(NA, n), x), n + 1, function(x) sum(x * c(rep(1, n), 0) / n, na.rm = TRUE))[-(1:3)]
}
lrollmean3 <- function(x){ # more optimized, assuming n = 3
  frollapply(c(NA, NA, NA, x), 4, function(x) sum(x[1:3] / 3, na.rm = TRUE))[-(1:3)]
}
library(rbenchmark)
benchmark(original = {DT[, roll_mean := lrollmean(value), by = .(ID)]},
          a = {DT[, roll_mean := lrollmean1(value, 3), by = .(ID)]},
          b = {DT[, roll_mean := lrollmean2(value, 3), by = .(ID)]},
          c = {DT[, roll_mean := lrollmean3(value), by = .(ID)]},
          replications = 1, order = 'relative')
#> test replications elapsed relative user.self sys.self user.child
#> 4 c 1 6.740 1.000 6.829 0.000 0
#> 3 b 1 8.038 1.193 8.085 0.012 0
#> 1 original 1 13.599 2.018 13.692 0.000 0
#> 2 a 1 14.180 2.104 14.233 0.008 0
#> sys.child
#> 4 0
#> 3 0
#> 1 0
#> 2 0
Created on 2020-02-17 by the reprex package (v0.3.0)

Faster index finding of flanking non-NA values

This is a speed optimization question.
Here is my sample data. The real data has over 100k rows and >300 columns.
library(data.table)
dt <- data.table(ref=1:20, tgt1=11:30, tgt2=21:40)
dt[c(3,8,9,15,16,17), "tgt1"] = NA
dt[c(4,5,15,17), "tgt2"] = NA
dt
#> ref tgt1 tgt2
#> 1: 1 11 21
#> 2: 2 12 22
#> 3: 3 NA 23
#> 4: 4 14 NA
#> 5: 5 15 NA
#> 6: 6 16 26
#> 7: 7 17 27
#> 8: 8 NA 28
#> 9: 9 NA 29
#> 10: 10 20 30
#> 11: 11 21 31
#> 12: 12 22 32
#> 13: 13 23 33
#> 14: 14 24 34
#> 15: 15 NA NA
#> 16: 16 NA 36
#> 17: 17 NA NA
#> 18: 18 28 38
#> 19: 19 29 39
#> 20: 20 30 40
Some columns have NA at some positions, and my goal is to get the positions of the nearest non-NA flanking values. For instance, for the second column tgt1, I am using the following code
tgt = dt[, tgt1]
tgt.na = which(is.na(tgt))
tgt.non.na = which(!is.na(tgt))
start = sapply(tgt.na, function(x) max(tgt.non.na[tgt.non.na < x]))
stop = sapply(tgt.na, function(x) min(tgt.non.na[tgt.non.na > x]))
data.frame(start, stop)
#> start stop
#> 1 2 4
#> 2 7 10
#> 3 7 10
#> 4 14 18
#> 5 14 18
#> 6 14 18
Here for the tgt1 column, I get what I want. For example, for the NA in the 3rd row, the closest flanking non-NA values are at positions 2 and 4, and so on for the others. My issue is that the sapply calls are very slow. Imagine running this for >300 columns and 100k rows: in its current form it takes a few hours to finish. Ultimately, once these positions are found, they are used to index values from the ref column to compute the missing values in tgt1 and the other columns. But that is a topic for another time.
Is there any way I can make it faster? Is there a data.table solution for it?
Edit: All great solutions; here is my benchmark, and you can see that all proposed methods work lightning fast compared to my original sapply. I selected the lapply one, not only because it is the fastest but also because it aligns well with my current code syntax.
Unit: milliseconds
expr min lq mean median uq max neval
sapply 3755.118949 3787.288609 3850.322669 3819.458269 3897.924530 3976.390790 3
dt.thelatemail 9.145551 9.920238 10.242885 10.694925 10.791552 10.888180 3
lapply.andrew 2.626525 3.038480 3.446682 3.450434 3.856760 4.263086 3
zoo.chinsoon 6.457849 6.578099 6.629839 6.698349 6.715834 6.733318 3
Here is a base R alternative using rle. I used lapply because I was not sure how you wanted to save all the output dataframes. Hope this helps!
dt <- data.table(ref=1:20, tgt1=11:30, tgt2=21:40)
dt[c(3,8,9,15,16,17), "tgt1"] = NA
dt[c(4,5,15,17), "tgt2"] = NA
lapply(dt[, -1], function(x) {
  na_loc <- which(is.na(x))                       # positions of the NAs
  rle_x  <- rle(is.na(x))
  reps   <- rle_x$lengths[rle_x$values == TRUE]   # length of each NA run
  start  <- na_loc - 1
  start  <- start[!start %in% na_loc]             # last non-NA position before each run
  end    <- na_loc + 1
  end    <- end[!end %in% na_loc]                 # first non-NA position after each run
  data.frame(start = rep(start, reps),            # repeat each flank once per NA in its run
             end   = rep(end, reps))
})
$tgt1
start end
1: 2 4
2: 7 10
3: 7 10
4: 14 18
5: 14 18
6: 14 18
$tgt2
start end
1: 3 6
2: 3 6
3: 14 16
4: 16 18
It also scales fairly well on my laptop for a sample data frame with 300 columns:
df1 <- data.frame(ref = 1:1e5)
df1[paste0("tgt", 1:300)] <- replicate(300, sample(c(1:50, rep(NA, 5)), 1e5, replace = T))
microbenchmark::microbenchmark(
  base = {
    lapply(df1[, -1], function(x) {
      na_loc <- which(is.na(x))
      rle_x <- rle(is.na(x))
      reps <- rle_x$lengths[rle_x$values == TRUE]
      start <- na_loc - 1
      start <- start[!start %in% na_loc]
      end <- na_loc + 1
      end <- end[!end %in% na_loc]
      data.frame(start = rep(start, reps),
                 end = rep(end, reps))
    })
  },
  times = 5
)
Unit: seconds
expr min lq mean median uq max neval
base 1.863319 1.888617 1.897651 1.892166 1.898196 1.945954 5
You should be able to take advantage of rleid to calculate the prior value to a run of NAs and then match it up. E.g.:
dt[, a := rleid(is.na(tgt1))]
dt[, rev(ref)[match((a - 1)[is.na(tgt1)], rev(a))] ]
#[1] 2 7 7 14 14 14
dt[, ref[match((a + 1)[is.na(tgt1)], a)] ]
#[1] 4 10 10 18 18 18
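To see how this works, here is the run-id column a that the first line creates for tgt1 (a sketch):
dt$a
#>  [1] 1 1 2 3 3 3 3 4 4 5 5 5 5 5 6 6 6 7 7 7
# For the rows of an NA run, matching (a - 1) against rev(a) picks the last row of the
# preceding non-NA run, and matching (a + 1) against a picks the first row of the next run.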
Seems pretty quick to process 100k rows:
dt <- dt[rep(1:20,5e3),]
dt[, ref := 1:1e5]
system.time({
  dt[, a := rleid(is.na(tgt1))]
  dt[, rev(ref)[match((a - 1)[is.na(tgt1)], rev(a))]]
  dt[, ref[match((a + 1)[is.na(tgt1)], a)]]
})
# user system elapsed
# 0.02 0.00 0.02
Another possibility using zoo package:
library(zoo)
for (j in paste0("tgt", 1L:2L)) {
  print(dt[, {
    k <- is.na(get(j))
    x <- replace(ref, k, NA_integer_)
    .(start = na.locf0(x)[k],
      end   = na.locf0(x, fromLast = TRUE)[k])
  }])
}
output:
start end
1: 2 4
2: 7 10
3: 7 10
4: 14 18
5: 14 18
6: 14 18
start end
1: 3 6
2: 3 6
3: 14 16
4: 16 18
timing code:
library(data.table)
library(zoo)
sz <- 100e3
nc <- 400
dt <- data.table(ref = 1L:sz,
                 as.data.table(matrix(sample(c(NA_integer_, 1L), sz*nc, replace=TRUE), ncol=nc)))
library(microbenchmark)
microbenchmark(
  mtd0 = for (j in paste0("V", 1L:nc)) {
    k <- dt[, is.na(get(j))]
    dt[, a := rleid(k)][,
      .(start = rev(ref)[match((a - 1)[k], rev(a))], end = ref[match((a + 1)[k], a)])]
  },
  mtd1 = for (j in paste0("V", 1L:nc)) {
    dt[, {
      k <- is.na(get(j))
      x <- replace(ref, k, NA_integer_)
      .(start = na.locf0(x)[k], end = na.locf0(x, fromLast = TRUE)[k])
    }]
  },
  times = 3L)
timings:
Unit: seconds
expr min lq mean median uq max neval cld
mtd0 6.638253 6.698023 6.730352 6.757794 6.776402 6.795010 3 b
mtd1 4.832264 4.835764 4.854799 4.839264 4.866066 4.892867 3 a
Not much difference in timings, given the number of rows.

Determine whether column values are unique in data.table

I am using a data.table to store data. I am trying to figure out whether the values in certain columns of each row are unique. I want to add a column to the data.table that will hold the value "Duplicated Values" if there are duplicated values and be NA if there are none. The names of the columns that I want to check for duplication are stored in a character vector. For example, I create my data.table:
tmpdt<-data.table(a=c(1,2,3,4,5), b=c(2,2,3,4,5), c=c(4,2,2,4,4), d=c(3,3,1,4,5))
> tmpdt
a b c d
1: 1 2 4 3
2: 2 2 2 3
3: 3 3 2 1
4: 4 4 4 4
5: 5 5 4 5
I have another variable that indicates which columns I need to check for duplicates. It is important that I be able to store the column names in a character vector and not need to "know" them (because they will be passed as an argument to a function).
dupcheckcols<-c("a", "c", "d")
I want the output to be:
> tmpdt
a b c d Dups
1: 1 2 4 3 <NA>
2: 2 2 2 3 Has Dups
3: 3 3 2 1 <NA>
4: 4 4 4 4 Has Dups
5: 5 5 4 5 Has Dups
If I were using a data.frame, this is easy. I could simply use:
tmpdt<-data.frame(a=c(1,2,3,4,5), b=c(2,2,3,4,5), c=c(4,2,2,4,4), d=c(3,3,1,4,5))
tmpdt$Dups<-NA
tmpdt$Dups[apply(tmpdt[,dupcheckcols], 1, function(x) {return(sum(duplicated(x))>0)})]<-"Has Dups"
> tmpdt
a b c d Dups
1 1 2 4 3 <NA>
2 2 2 2 3 Has Dups
3 3 3 2 1 <NA>
4 4 4 4 4 Has Dups
5 5 5 4 5 Has Dups
But I can't figure out how to accomplish the same task with a data.table. Any help is greatly appreciated.
I'm sure there are other ways
tmpdt[, dups := tmpdt[, dupcheckcols, with=FALSE][, apply(.SD, 1, function(x){sum(duplicated(x))>0})] ]
# a b c d dups
#1: 1 2 4 3 FALSE
#2: 2 2 2 3 TRUE
#3: 3 3 2 1 FALSE
#4: 4 4 4 4 TRUE
#5: 5 5 4 5 TRUE
A more convoluted, but slightly quicker (in computational terms) method would be to construct the filter condition in i, then update in j by reference
expr <- paste(apply(t(combn(dupcheckcols,2)), 1, FUN=function(x){ paste0(x, collapse="==") }), collapse = "|")
# [1] "a==c|a==d|c==d"
expr <- parse(text=expr)
tmpdt[ eval(expr), dups := TRUE ]
# a b c d dups
#1: 1 2 4 3 NA
#2: 2 2 2 3 TRUE
#3: 3 3 2 1 NA
#4: 4 4 4 4 TRUE
#5: 5 5 4 5 TRUE
I was interested in speed benefits, so I've benchmarked these two plus Ananda's solution:
library(microbenchmark)
tmpdt<-data.table(a=c(1,2,3,4,5), b=c(2,2,3,4,5), c=c(4,2,2,4,4), d=c(3,3,1,4,5))
t1 <- tmpdt
t2 <- tmpdt
t3 <- tmpdt
expr <- paste(apply(t(combn(dupcheckcols,2)), 1, FUN=function(x){ paste0(x, collapse="==") }), collapse = "|")
expr <- parse(text=expr)
microbenchmark(
  # Ananda's solution
  t1[, dups := any(duplicated(unlist(.SD))), by = 1:nrow(tmpdt), .SDcols = dupcheckcols],
  t2[, dups := t2[, dupcheckcols, with = FALSE][, apply(.SD, 1, function(x){ sum(duplicated(x)) > 0 })] ],
  t3[ eval(expr), dups := TRUE ]
)
# min lq mean median uq max neval cld
# 531.416 552.5760 577.0345 565.182 573.2015 1761.863 100 b
#1277.569 1333.2615 1389.5857 1358.021 1387.9860 2694.951 100 c
# 265.872 283.3525 293.9362 292.487 301.1640 520.436 100 a
You should be able to do something like this:
tmpdt[, dups := any(duplicated(unlist(.SD, use.names = FALSE))),
by = 1:nrow(tmpdt), .SDcols = dupcheckcols]
tmpdt
# a b c d dups
# 1: 1 2 4 3 FALSE
# 2: 2 2 2 3 TRUE
# 3: 3 3 2 1 FALSE
# 4: 4 4 4 4 TRUE
# 5: 5 5 4 5 TRUE
Adjust accordingly if you really want the words "Has Dups", but note that it would probably be easier to use logical values, as in my answer here.
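If you do want the literal labels afterwards, one way (a sketch building on the dups column created above) is:
tmpdt[, Dups := fifelse(dups, "Has Dups", NA_character_)]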
I found a way to do this with Rcpp, following an example by Hadley (under "Sets"):
// [[Rcpp::plugins(cpp11)]]
#include <Rcpp.h>
#include <unordered_set>
using namespace Rcpp;

// [[Rcpp::export]]
LogicalVector anyDupCols(IntegerMatrix x) {
  int nr = x.nrow();
  int nc = x.ncol();
  LogicalVector out(nr, false);
  std::unordered_set<int> seen;
  for (int i = 0; i < nr; i++) {
    seen.clear();
    for (int j = 0; j < nc; j++){
      int xij = x(i, j);
      if (seen.count(xij)) { out[i] = true; break; }
      else seen.insert(xij);
    }
  }
  return out;
}
To use it, put it in a .cpp file and run
library(Rcpp)
sourceCpp("anyDupCols.cpp")
anyDupCols(as.matrix(DT))
It does pretty well in benchmarks:
nc = 30
nv = nc^2
n = 1e4
set.seed(1)
DT = setDT( replicate(nc, sample(nv, n, replace = TRUE), simplify=FALSE) )
library(microbenchmark)
microbenchmark(
  ananda = DT[, any(duplicated(unlist(.SD, use.names = FALSE))), by = 1:nrow(DT)]$V1,
  tospig = {
    expr = parse(text = paste(apply(t(combn(names(DT), 2)), 1, FUN =
      function(x){ paste0(x, collapse = "==") }), collapse = "|"))
    DT[, eval(expr)]
  },
  cpp = anyDupCols(as.matrix(DT)),
  alex = ff(DT),
  tscharf = apply(DT, 1, function(row) any(duplicated(row))),
  unit = "relative", times = 10
)
Unit: relative
expr min lq mean median uq max neval cld
ananda 2.462739 2.596990 2.774660 2.659898 2.869048 3.352547 10 c
tospig 3.118158 3.253102 3.606263 3.424598 3.885561 4.583268 10 d
cpp 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
alex 1.295415 1.927802 1.914883 1.982580 2.029868 2.538143 10 b
tscharf 2.112286 2.204654 2.385318 2.234963 2.322206 2.978047 10 bc
If I go to nc = 50, @tospig's expr becomes too long for R to handle and I get a node stack overflow, which is fun.
A one-liner with some elegance:
- define the columns
- loop down the rows
- see if there are any dupes
tmpdt[,dups:=apply(.SD,1,function(row) any(duplicated(row))),.SDcols = dupcheckcols]
> tmpdt
a b c d dups
1: 1 2 4 3 FALSE
2: 2 2 2 3 TRUE
3: 3 3 2 1 FALSE
4: 4 4 4 4 TRUE
5: 5 5 4 5 TRUE
Another way is to tabulate "tmpdt" along its rows and find which rows have more than one of an element:
tmpdt2 = tmpdt[, dupcheckcols, with = FALSE] # subset tmpdt
colSums(table(unlist(tmpdt2), row(tmpdt2)) > 1L) > 0L
# 1 2 3 4 5
#FALSE TRUE FALSE TRUE TRUE
Peeking at table, we could speed it up significantly with something like:
ff = function(x)
{
  # pooled set of distinct values across all columns
  lvs = Reduce(union, lapply(x, function(X) if (is.factor(X)) levels(X) else unique(X)))
  # recode every column as integer positions into that pooled set
  x = lapply(x, function(X) match(X, lvs))
  nr = length(lvs); nc = length(x[[1L]])          # nr = distinct values, nc = rows of the table
  # one tabulate() call counts each value per row: bin = code + (row - 1) * nr
  tabs = "dim<-"(tabulate(unlist(x, use.names = FALSE) + (0:(nc - 1L)) * nr, nr * nc),
                 c(nr, nc))
  colSums(tabs > 1L) > 0L                         # TRUE where some value occurs more than once in a row
}
ff(tmpdt2)
#[1] FALSE TRUE FALSE TRUE TRUE

R: Data.table on subset excluding by value

Using data.table in R, I'm trying to perform an operation on a subset that excludes the selected element. I'm using the by operator, but I don't know if this is the right approach.
Here's an example: the value for Delta in IAH:SNA is (3+3)/2, which is the mean of Stops in IAH:SNA once Delta has been excluded.
library(data.table)
s1 <- "Market Carrier Stops
IAH:SNA Delta 1
IAH:SNA Delta 1
IAH:SNA Southwest 3
IAH:SNA Southwest 3
MSP:CLE Southwest 2
MSP:CLE Southwest 2
MSP:CLE American 2
MSP:CLE JetBlue 1"
d <- data.table(read.table(textConnection(s1), header=TRUE))
setkey(d, Carrier, Market)
f <- function(x, y){
  subset(d, !(Carrier %in% x) & Market == y, Stops)[, mean(Stops)]
}
d[, s := f(.BY[[1]], .BY[[2]]), by = list(Carrier, Market)]
##     Market   Carrier Stops        s
## 1: MSP:CLE  American     2 1.666667
## 2: IAH:SNA     Delta     1 3.000000
## 3: IAH:SNA     Delta     1 3.000000
## 4: MSP:CLE   JetBlue     1 2.000000
## 5: IAH:SNA Southwest     3 1.000000
## 6: IAH:SNA Southwest     3 1.000000
## 7: MSP:CLE Southwest     2 1.500000
## 8: MSP:CLE Southwest     2 1.500000
The above solution performs very poorly on large data sets (it's essentially an mapply), but I'm not sure how to do it in a fast data.table-like way.
Perhaps one could (dynamically) generate a factor that does this? I'm just not sure how...
Is there a way to improve it?
Edit: Just for the heck of it, here's a way to get a bigger version of the above
library(data.table)
dl.dta <- function(...){
  ## input years ..
  years <- gsub("\\.", "_", c(...))
  baseurl <- "http://www.transtats.bts.gov/Download/"
  names <- paste("Origin_and_Destination_Survey_DB1BMarket", years, sep="_")
  info <- t(sapply(names, function(x) file.exists(paste(x, c("zip", "csv"), sep="."))))
  to.download <- paste(baseurl, names, ".zip", sep="")[!apply(info, 1, any)]
  if (length(to.download) > 0){
    message("starting download...")
    sapply(to.download,
           function(x) download.file(x, rev(strsplit(x, "/")[[1]])[1]))}
  to.unzip <- paste(names, "zip", sep=".")[!info[, 2]]
  if (length(to.unzip) > 0){
    message("starting to unzip...")
    sapply(to.unzip, unzip)}
  paste(names, "csv", sep=".")}
countWords.split <- function(x, s=":"){
  ## Faster on my machine than grep for some reason
  sapply(strsplit(as.character(x), s), length)}
countWords.grep <- function(x){
  sapply(gregexpr("\\W+", x), length) + 1}
fname <- dl.dta(2013.1)
cols <- rep("NULL", 41)
## Columns to keep: 9 is Origin, 18 is Dest, 24 is groups of airports in travel
## 30 is RPcarrier (reporting carrier).
## For more columns: 35 is market fare and 36 is distance.
cols[9] <- cols[18] <- cols[24] <- cols[30] <- NA
d <- data.table(read.csv(file=fname, colClasses=cols))
d[, Market := paste(Origin, Dest, sep=":")]
## should probably
d[, Stops := -2 + countWords.split(AirportGroup)]
d[, Carrier := RPCarrier]
d[, c("RPCarrier", "Origin", "Dest", "AirportGroup") := NULL]
Use a tiny bit of elementary maths:
d[, c("tmp.mean", "N") := list(mean(Stops), .N), by = Market]
d[, exep.mean := (tmp.mean * N - sum(Stops)) / (N - .N), by = list(Market,Carrier)]
# Market Carrier Stops tmp.mean N exep.mean
# 1: IAH:SNA Delta 1 2.00 4 3.000000
# 2: IAH:SNA Delta 1 2.00 4 3.000000
# 3: IAH:SNA Southwest 3 2.00 4 1.000000
# 4: IAH:SNA Southwest 3 2.00 4 1.000000
# 5: MSP:CLE Southwest 2 1.75 4 1.500000
# 6: MSP:CLE Southwest 2 1.75 4 1.500000
# 7: MSP:CLE American 2 1.75 4 1.666667
# 8: MSP:CLE JetBlue 1 1.75 4 2.000000
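As a quick sanity check of the formula, take the Delta rows in IAH:SNA: the market mean is 2 over N = 4 rows, and Delta contributes 2 stops over 2 rows, so
(2 * 4 - 2) / (4 - 2)
#> [1] 3
which matches the exep.mean shown above.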
@Roland's answer will work for some functions (and when it does it will be best) but not in general. Unfortunately you can't apply the split-apply-combine strategy to the data as is to do the task, but you can if you make the data larger. Let's start with a simpler example:
dt = data.table(a = c(1,1,2,2,3,3), b = c(1:6), key = 'a')
# now let's extend this table the following way
# take the unique a's and construct all the combinations excluding one element
combinations = dt[, combn(unique(a), 2)]
# now combine this into a data.table with the excluded element as the index
# and merge it back into the original data.table
extension = rbindlist(apply(combinations, 2,
function(x) data.table(a = x, index = setdiff(c(1,2,3), x))))
setkey(extension, a)
dt.extended = extension[dt, allow.cartesian = TRUE]
dt.extended[order(index)]
# a index b
# 1: 2 1 3
# 2: 2 1 4
# 3: 3 1 5
# 4: 3 1 6
# 5: 1 2 1
# 6: 1 2 2
# 7: 3 2 5
# 8: 3 2 6
# 9: 1 3 1
#10: 1 3 2
#11: 2 3 3
#12: 2 3 4
# Now we have everything we need:
dt.extended[, mean(b), by = list(a = index)]
# a V1
#1: 3 2.5
#2: 2 3.5
#3: 1 4.5
Going back to the original data (and doing some operations slightly differently, to simplify the expressions):
extension = d[, {Carrier.uniq = unique(Carrier);
.SD[, rbindlist(combn(Carrier.uniq, length(Carrier.uniq)-1,
function(x) data.table(Carrier = x,
index = setdiff(Carrier.uniq, x)),
simplify = FALSE))]}, by = Market]
setkey(extension, Market, Carrier)
extension[d, allow.cartesian = TRUE][, mean(Stops), by = list(Market, Carrier = index)]
# Market Carrier V1
#1: IAH:SNA Southwest 1.000000
#2: IAH:SNA Delta 3.000000
#3: MSP:CLE JetBlue 2.000000
#4: MSP:CLE Southwest 1.500000
#5: MSP:CLE American 1.666667
