Efficiently locate group-wise constant columns in a data.frame

How can I efficiently extract group-wise constant columns from a data frame? I've included a plyr implementation below to make precise what I'm trying to do, but it's slow. How can I do it as efficiently as possible? (Ideally without splitting the data frame at all.)
library(plyr)

base <- data.frame(group = 1:1000, a = sample(1000), b = sample(1000))
df <- data.frame(
  base[rep(seq_len(nrow(base)), length = 1e6), ],
  c = runif(1e6),
  d = runif(1e6)
)
is.constant <- function(x) length(unique(x)) == 1
constant_cols <- function(x) head(Filter(is.constant, x), 1)
system.time(constant <- ddply(df, "group", constant_cols))
# user system elapsed
# 20.531 1.670 22.378
stopifnot(identical(names(constant), c("group", "a", "b")))
stopifnot(nrow(constant) == 1000)
In my real use case (deep inside ggplot2) there may be an arbitrary number of constant and non-constant columns. The size of the data in the example is about the right order of magnitude.

(Edited to possibly address the issue of consecutive groups with the same value)
I'm tentatively submitting this answer, but I haven't completely convinced myself that it will correctly identify within-group constant columns in all cases. But it's definitely faster (and can probably be improved):
constant_cols1 <- function(df, grp){
  df <- df[order(df[, grp]), ]
  # Adjust values based on max diff in data
  rle_group <- rle(df[, grp])
  vec <- rep(rep(c(0, ceiling(diff(range(df)))),
                 length.out = length(rle_group$lengths)),
             times = rle_group$lengths)
  m <- matrix(vec, nrow = length(vec), ncol = ncol(df) - 1)
  df_new <- df
  df_new[, -1] <- df[, -1] + m
  rles <- lapply(df_new, FUN = rle)
  nms <- names(rles)
  tmp <- sapply(rles[nms != grp],
                FUN = function(x){identical(x$lengths, rles[[grp]]$lengths)})
  return(tmp)
}
My basic idea was to use rle, obviously.
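To make the rle idea concrete, here's a toy illustration of my own (not part of the answer's benchmark): after sorting by group, a group-wise constant column reproduces the group column's run lengths exactly. The offset added via vec above guards against adjacent groups that happen to share a value, which would otherwise merge two runs.
g <- c(1, 1, 2, 2, 2, 3)
a <- c(5, 5, 9, 9, 9, 2)   # constant within each group
x <- c(1, 2, 3, 4, 5, 6)   # varies within groups
identical(rle(a)$lengths, rle(g)$lengths)  # TRUE
identical(rle(x)$lengths, rle(g)$lengths)  # FALSE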

I'm not sure if this is exactly what you are looking for, but it identifies columns a and b.
require(data.table)

is.constant <- function(x) identical(var(x), 0)

dtOne <- data.table(df)
system.time({
  dtTwo <- dtOne[, lapply(.SD, is.constant), by = group]
  result <- apply(X = dtTwo[, list(a, b, c, d)], 2, all)
  result <- result[result == TRUE]
})
stopifnot(identical(names(result), c("a", "b")))
result
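If, like the plyr version, you also want one row per group for the flagged columns, a possible follow-up (a sketch reusing dtOne and result from above, with data.table's first()) is:
constant <- dtOne[, lapply(.SD, first), by = group, .SDcols = names(result)]
stopifnot(identical(names(constant), c("group", "a", "b")), nrow(constant) == 1000)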

(edit: better answer)
What about something like
is.constant <- function(x) length(which(x == x[1])) == length(x)
This seems to be a nice improvement. Compare the following (is.constantOld refers to the question's original length(unique(x)) == 1 version).
> a<-rnorm(5000000)
> system.time(is.constant(a))
user system elapsed
0.039 0.010 0.048
>
> system.time(is.constantOld(a))
user system elapsed
1.049 0.084 1.125

A bit slower than what hadley suggested above, but I think it should handle the case of equal adjacent groups:
findBreaks <- function(x) cumsum(rle(x)$lengths)

constantGroups <- function(d, groupColIndex = 1) {
  d <- d[order(d[, groupColIndex]), ]
  breaks <- lapply(d, findBreaks)
  groupBreaks <- breaks[[groupColIndex]]
  numBreaks <- length(groupBreaks)
  isSubset <- function(x) length(x) <= numBreaks && length(setdiff(x, groupBreaks)) == 0
  unlist(lapply(breaks[-groupColIndex], isSubset))
}
The intuition is that if a column is constant groupwise then the breaks in the column values (sorted by the group value) will be a subset of the breaks in the group value.
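A tiny illustration of that intuition, using the findBreaks defined above (my own toy data):
g <- c(1, 1, 2, 2, 2, 3)
a <- c(5, 5, 9, 9, 9, 2)  # group-wise constant
x <- c(1, 1, 2, 3, 3, 4)  # changes inside group 2
findBreaks(g)  # 2 5 6
findBreaks(a)  # 2 5 6   -> subset of the group breaks
findBreaks(x)  # 2 3 5 6 -> contains a break (3) that the group column lacks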
Now, compare it with hadley's (with small modification to ensure n is defined)
# df defined as in the question
n <- nrow(df)
changed <- function(x) c(TRUE, x[-1] != x[-n])
constant_cols2 <- function(df, grp){
  df <- df[order(df[, grp]), ]
  changes <- lapply(df, changed)
  vapply(changes[-1], identical, changes[[1]], FUN.VALUE = logical(1))
}
> system.time(constant_cols2(df, 1))
user system elapsed
1.779 0.075 1.869
> system.time(constantGroups(df))
user system elapsed
2.503 0.126 2.614
> df$f <- 1
> constant_cols2(df, 1)
a b c d f
TRUE TRUE FALSE FALSE FALSE
> constantGroups(df)
a b c d f
TRUE TRUE FALSE FALSE TRUE

Inspired by #Joran's answer, here's a similar strategy that's a little faster (1 s vs 1.5 s on my machine)
n <- nrow(df)  # changed() looks n up in its enclosing (global) environment
changed <- function(x) c(TRUE, x[-1] != x[-n])

constant_cols2 <- function(df, grp){
  df <- df[order(df[, grp]), ]
  changes <- lapply(df, changed)
  vapply(changes[-1], identical, changes[[1]], FUN.VALUE = logical(1))
}
system.time(cols <- constant_cols2(df, "group")) # about 1 s
system.time(constant <- df[changed(df$group), cols])
# user system elapsed
# 1.057 0.230 1.314
stopifnot(identical(names(constant), c("group", "a", "b")))
stopifnot(nrow(constant) == 1000)
It has the same flaws though, in that it won't detect columns that have the same values for adjacent groups (e.g. df$f <- 1)
With a bit more thinking plus #David's ideas:
constant_cols3 <- function(df, grp) {
  # If col == TRUE and group == FALSE, not constant
  matching_breaks <- function(group, col) {
    !any(col & !group)
  }
  n <- nrow(df)
  changed <- function(x) c(TRUE, x[-1] != x[-n])
  df <- df[order(df[, grp]), ]
  changes <- lapply(df, changed)
  vapply(changes[-1], matching_breaks, group = changes[[1]],
         FUN.VALUE = logical(1))
}
system.time(x <- constant_cols3(df, "group"))
# user system elapsed
# 1.086 0.221 1.413
And that gives the correct result.
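As a quick check (a sketch, adding the df$f <- 1 column from the earlier answer), constant_cols3 should now also flag a column that is constant across adjacent groups:
df$f <- 1
constant_cols3(df, "group")
#    a    b     c     d    f
# TRUE TRUE FALSE FALSE TRUE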

How fast does is.unsorted(x) fail for non-constant x? Sadly I don't have access to R at the moment. Also, it seems that's not your bottleneck, though.
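In case it helps, a quick sketch one could run to answer that (my own guess: I'd expect is.unsorted() to bail out at the first out-of-order pair, so it should be very cheap on shuffled input):
x_const <- rep(1, 1e7)
x_rand  <- runif(1e7)
system.time(is.unsorted(x_const))  # has to scan the whole vector
system.time(is.unsorted(x_rand))   # should return almost immediately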

Related

Quickly apply & operation to pairs of columns in R

Let’s say I have two large data.tables and need to combine their columns pairwise using the & operation. The combinations are dictated by grid (combine dt1 column1 with dt2 column2, etc.)
Right now I'm using a mclapply loop and the script takes hours when I run the full dataset. I tried converting the data to a matrix and using a vectorized approach but that took even longer. Is there a faster and/or more elegant way to do this?
library(data.table)

mx1 <- replicate(10, sample(c(T, F), size = 1e6, replace = T)) # 1e6 rows x 10 columns
mx1 <- as.data.table(mx1)
colnames(mx1) <- LETTERS[1:10]
mx2 <- replicate(10, sample(c(T, F), size = 1e6, replace = T)) # 1e6 rows x 10 columns
mx2 <- as.data.table(mx2)
colnames(mx2) <- letters[1:10]
grid <- expand.grid(col1 = colnames(mx1), col2 = colnames(mx2)) # the combinations I want to evaluate

out <- new_layer <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) { # <--- mclapply loop
  mx1[[col1]] & mx2[[col2]]
}, SIMPLIFY = F)
setDT(out) # convert output into data table
colnames(out) <- paste(grid$col1, grid$col2, sep = "_")
For context, this data is from a gene expression matrix where 1 row = 1 cell
This can be done directly with no mapply; just ensure that the with argument is FALSE, i.e.:
mx1[, grid$col1, with = FALSE] & mx2[, grid$col2, with=FALSE]
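If you then want the same named output as the mapply version, a possible follow-up (a sketch reusing the question's objects; the element-wise & of two data.tables comes back as a logical matrix here):
out <- as.data.table(mx1[, grid$col1, with = FALSE] & mx2[, grid$col2, with = FALSE])
setnames(out, paste(grid$col1, grid$col2, sep = "_"))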
After some digging around I found a package called bit that is specifically designed for fast boolean operations. Converting each column of my data.table from logical to bit gave me a 100-fold increase in compute speed.
# Load libraries.
library(data.table)
library(bit)

# Create data set.
mx1 <- replicate(10, sample(c(T, F), size = 5e6, replace = T)) # 5e6 rows x 10 columns
colnames(mx1) <- LETTERS[1:10]
mx2 <- replicate(10, sample(c(T, F), size = 5e6, replace = T)) # 5e6 rows x 10 columns
colnames(mx2) <- letters[1:10]
grid <- expand.grid(col1 = colnames(mx1), col2 = colnames(mx2)) # combinations I want to evaluate

# Single operation with logical matrix.
system.time({
  out <- mx1[, grid$col1] & mx2[, grid$col2]
}) # 26.014s

# Loop with logical matrix.
system.time({
  out <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) {
    mx1[, col1] & mx2[, col2]
  })
}) # 31.914s

# Single operation with logical data.table.
mx1.dt <- as.data.table(mx1)
mx2.dt <- as.data.table(mx2)
system.time({
  out <- mx1.dt[, grid$col1, with = F] & mx2.dt[, grid$col2, with = F]
}) # 32.349s

# Loop with logical data.table.
system.time({
  out <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) {
    mx1.dt[[col1]] & mx2.dt[[col2]]
  })
}) # 15.031s <---- SECOND FASTEST TIME, ~2X IMPROVEMENT

# Loop with bit data.table.
mx1.bit <- mx1.dt[, lapply(.SD, as.bit)]
mx2.bit <- mx2.dt[, lapply(.SD, as.bit)]
system.time({
  out <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) {
    mx1.bit[[col1]] & mx2.bit[[col2]]
  })
}) # 0.383s <---- FASTEST TIME, ~100X IMPROVEMENT

# Convert back to logical table.
out <- setDT(out)
colnames(out) <- paste(grid$col1, grid$col2, sep = "_")
out <- out[, lapply(.SD, as.logical)]
There are also special functions like sum.bit and ri that you can use to aggregate data without converting it back to logical.
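For example (a small sketch reusing the bit columns built above), sum() on a bit vector counts the TRUEs directly, without a round trip through logical:
sum(mx1.bit[["A"]])                   # number of TRUEs in column A
sum(mx1.bit[["A"]] & mx2.bit[["a"]])  # rows where both columns are TRUE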

Fast ways to subset categorical data in R with multiple conditions

I have a large dataset in R (say >40,000 rows and >20 categorical columns) that I repeatedly subset, so I would like to speed this up as much as possible. It needs to be a general function (each categorical column has a discrete number of possible values, say in string format).
Each time I subset, I need to identify the subset of rows that satisfy multiple logical set-membership conditions (e.g. >10 conditions). That is, I need to check several columns and test whether the values in each column belong to a certain set (hence the use of %in%).
# simple dataset example
library(dplyr)
library(lazyeval)  # for interp(), used below

num_col <- 15
num_row <- 100000
dat_list <- list()
for (i in 1:num_col) {
  dat_list[[i]] <- data_frame(sample(letters[1:10], size = num_row, r = T))
}
dat <- bind_cols(dat_list)
names(dat) <- paste0("col", seq(15))
I've looked around the internet and SO a lot, but haven't found the discussion of performance I'm looking for. I mostly code using dplyr, so apologies if there's a clear performance improvement here in data.table; I've tried some simple benchmarks between the two (but without using any data.table indexing, etc.) and it's not obvious whether one is faster.
Example options I've considered (since I'm not great at data.table, I've excluded data.table options from here):
base_filter <- function(dat) {
  for (i in 1:7) {
    col_name <- paste0('col', i)
    dat <- dat[dat[[col_name]] %in% sample(letters[1:10], size = 4), ]
  }
  dat
}

dplyr_filter1 <- function(dat) {
  for (i in 1:7) {
    col_name <- paste0('col', i)
    dat <- filter_(dat,
                   .dots = interp(~ colname %in% vals,
                                  colname = as.name(col_name),
                                  vals = sample(letters[1:10], size = 4)))
  }
  dat
}

dplyr_filter2 <- function(dat) {
  dots_filter <- list()
  for (i in 1:7) {
    col_name <- paste0('col', i)
    dots_filter[[i]] <- interp(~ colname %in% vals,
                               colname = as.name(col_name),
                               vals = sample(letters[1:10], size = 4))
  }
  filter_(dat, .dots = dots_filter)
}
Note: In practice, on my real datasets, dplyr_filter2 actually works fastest. I've also tried dtplyr or converting my data to a data.table, but this seems slower than without.
Note: On the other hand, in practice, the base R function outperforms the dplyr examples when data has fewer rows and fewer columns (perhaps due to copying speed?).
Thus, I'd like to ask SO what the most efficient general way(s) to subset a categorical data frame under multiple set-membership conditions are, and, if possible, to explain the mechanics of why. Does the answer differ for smaller datasets? Does it depend on copying time or search time?
Useful related links
fast lookup for one key
using hash tables in R for key-value pairs
I understand that you prefer not to use data.table; I'm just providing some timings for reference below. With indexing, subsetting can be performed much faster, and an inner join of the 2 tables can also be done easily in data.table.
# simple dataset example
library(dplyr)
library(lazyeval)
set.seed(0L)
num_col <- 15
num_row <- 100000
dat_list <- list()
for (i in 1:num_col) {
  dat_list[[i]] <- data_frame(sample(letters[1:10], size = num_row, r = T))
}
dat <- bind_cols(dat_list)
names(dat) <- paste0("col", seq(15))
selection <- lapply(1:7, function(n) sample(letters[1:10], size = 4))
base_filter <- function(df) {
  for (i in 1:7) {
    col_name <- paste0('col', i)
    df <- df[df[[col_name]] %in% selection[[i]], ]
  }
  df
}

dplyr_filter1 <- function(df) {
  for (i in 1:7) {
    col_name <- paste0('col', i)
    df <- filter_(df,
                  .dots = interp(~ colname %in% vals,
                                 colname = as.name(col_name),
                                 vals = selection[[i]]))
  }
  df
}

dplyr_filter2 <- function(df) {
  dots_filter <- list()
  for (i in 1:7) {
    col_name <- paste0('col', i)
    dots_filter[[i]] <- interp(~ colname %in% vals,
                               colname = as.name(col_name),
                               vals = selection[[i]])
  }
  filter_(df, .dots = dots_filter)
}
library(data.table)
#convert data.frame into data.table
dt <- data.table(dat, key=names(dat)[1:7])
#create the sets of selection
dtSelection <- data.table(expand.grid(selection, stringsAsFactors=FALSE))
library(microbenchmark)
microbenchmark(
  base_filter(dat),
  dplyr_filter1(dat),
  dplyr_filter2(dat),
  dt[dtSelection, nomatch=0], # perform inner join between dataset and selection
  times=5L)
#Unit: milliseconds
# expr min lq mean median uq max neval
# base_filter(dat) 27.084801 27.870702 35.849261 32.045900 32.872601 59.372301 5
# dplyr_filter1(dat) 23.130100 24.114301 26.922081 24.860701 29.804301 32.701002 5
# dplyr_filter2(dat) 29.641101 30.686002 32.363681 31.103000 31.884701 38.503601 5
# dt[dtSelection, nomatch = 0] 3.626001 3.646201 3.829341 3.686601 3.687001 4.500901 5
In addition to chinsoon12's alternatives, one thing to consider is to avoid subsetting the data.frame in each iteration. So, instead of
f0 = function(x, cond)
{
  for(j in seq_along(x)) x = x[x[[j]] %in% cond[[j]], ]
  return(x)
}
one alternative is to accumulate a logical vector of whether to include each row in the final subset:
f1 = function(x, cond)
{
  i = rep_len(TRUE, nrow(x))
  for(j in seq_along(x)) i = i & (x[[j]] %in% cond[[j]])
  return(x[i, ])
}
or, as another alternative, to iteratively reduce the number of comparisons by shrinking the row indices instead of the data.frame itself:
f2 = function(x, cond)
{
  i = 1:nrow(x)
  for(j in seq_along(x)) i = i[x[[j]][i] %in% cond[[j]]]
  return(x[i, ])
}
And a comparison with data:
set.seed(1821)
dat = as.data.frame(replicate(30, sample(c(letters, LETTERS), 5e5, TRUE), FALSE),
                    stringsAsFactors = FALSE)
conds = replicate(ncol(dat), sample(c(letters, LETTERS), 48), FALSE)
system.time({ ans0 = f0(dat, conds) })
# user system elapsed
# 3.44 0.28 3.86
system.time({ ans1 = f1(dat, conds) })
# user system elapsed
# 0.66 0.01 0.68
system.time({ ans2 = f2(dat, conds) })
# user system elapsed
# 0.34 0.01 0.39
identical(ans0, ans1)
#[1] TRUE
identical(ans1, ans2)
#[1] TRUE

Faster %in% operator

The fastmatch package implements a much faster version of match for repeated matches (e.g. in a loop):
set.seed(1)
library(fastmatch)
table <- 1L:100000L
x <- sample(table, 10000, replace=TRUE)
system.time(for(i in 1:100) a <- match(x, table))
system.time(for(i in 1:100) b <- fmatch(x, table))
identical(a, b)
Is there a similar implementation for %in% I could use to speed up repeated lookups?
Look at the definition of %in%:
R> `%in%`
function (x, table)
match(x, table, nomatch = 0L) > 0L
<bytecode: 0x1fab7a8>
<environment: namespace:base>
It's easy to write your own %fin% function:
`%fin%` <- function(x, table) {
  stopifnot(require(fastmatch))
  fmatch(x, table, nomatch = 0L) > 0L
}
system.time(for(i in 1:100) a <- x %in% table)
# user system elapsed
# 1.780 0.000 1.782
system.time(for(i in 1:100) b <- x %fin% table)
# user system elapsed
# 0.052 0.000 0.054
identical(a, b)
# [1] TRUE
Matching is almost always better done by putting both vectors in data frames and merging (see the various joins in dplyr).
For example, something like this would give you all the info you need:
library(dplyr)

data = data_frame(data.ID = 1L:100000L,
                  data.extra = 1:2)

sample =
  data %>%
  sample_n(10000, replace = TRUE) %>%
  mutate(sample.ID = 1:n(),
         sample.extra = 3:4)

# join table not strictly necessary in this case
# but necessary in many-to-many matches
data__sample = inner_join(data, sample)

# check whether a data.ID made it into sample
data__sample %>% filter(data.ID == 1)
or left_join, right_join, full_join, semi_join, anti_join, depending on what info is most useful to you
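For the plain %in% use case, a short sketch with the objects above: semi_join keeps the rows of data whose data.ID occurs in sample, and anti_join keeps the ones that don't.
in_sample     <- semi_join(data, sample, by = "data.ID")
not_in_sample <- anti_join(data, sample, by = "data.ID")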

Progressively find most frequent item in list in R

I would like to go through a list, and check to see if that item is the most frequent item in the list up until that point. The solution I currently have is incredibly slow compared to Python. Is there an effective way to speed it up?
library(data.table)

dat <- data.table(sample(1:50, 10000, replace = T))
k <- 1
correct <- 0 # total correct predictions
for (i in 2:(nrow(dat) - 1)) {
  if (dat[i, V1] %in% dat[1:(i - 1), .N, by = V1][order(-N), head(.SD, k)][, V1]) {
    correct <- correct + 1
  }
}
More generally, I would eventually like to see if an item is one of the k most
frequent items up until a point, or if it has one of the k highest values up until a point.
For comparison, here is a very fast implementation in Python:
# Python 2 (uses iteritems/has_key)
import random
import heapq
import operator

dat = [random.randint(1, 50) for i in range(10000)]
correct = 0
k = 1
list = {}
for i in dat:
    toplist = heapq.nlargest(k, list.iteritems(), key=operator.itemgetter(1))
    toplist = [j[0] for j in toplist]
    if i in toplist:
        correct += 1
    if list.has_key(i):
        list[i] = list[i] + 1
    else:
        list[i] = 1
Here's what I've got so far (my solution is f3):
set.seed(10)
dat <- data.table(sample(1:3, 100, replace = T))
k <- 1

f3 <- function(dat) {
  correct <- 0 # total correct predictions
  vf <- factor(dat$V1)
  v <- as.integer(vf)
  tabs <- integer(max(v))
  for (i in 2:(nrow(dat) - 1)) {
    tabs[v[i - 1]] <- tabs[v[i - 1]] + 1
    #print(tabs)
    #print(v[1:i])
    if (match(v[i], order(tabs, decreasing = T)) <= k) {
      correct <- correct + 1
    }
    #print(correct)
    #print('')
  }
  correct
}
f1 <- function(dat) {
  correct <- 0 # total correct predictions
  for (i in 2:(nrow(dat) - 1)) {
    if (dat[i, V1] %in% dat[1:(i - 1), .N, by = V1][order(-N), head(.SD, k)]) {
      correct <- correct + 1
    }
  }
  correct
}
library(rbenchmark)
print(f1(dat) == f3(dat))
benchmark(f1(dat), f3(dat), replications = 10)
The benchmark results:
test replications elapsed relative user.self sys.self user.child sys.child
1 f1(dat) 10 2.939 163.278 2.931 0.008 0 0
2 f3(dat) 10 0.018 1.000 0.018 0.000 0 0
are encouraging, but f3 has two problems: (1) it doesn't always provide the same answer as the OP's algorithm, because ties are treated differently; (2) there is a lot of room for improvement, because tabs is re-sorted from scratch on every iteration.
The condition is automatically true until k+1 distinct values have been observed:
startrow <- dat[,list(.I,.GRP),by=V1][.GRP==k+1]$.I[1]
correct <- rep(0L,length(v))
correct[1:(startrow-1)] <- 1L
You can precompute the number of appearances a value of V1 has had so far:
ct <- dat[,ct:=1:.N,by=V1]$ct
During the loop, we can check if the kth most frequent value is knocked out by the current value.
Grab the first k values and their counts up until startrow: topk <- sort(tapply(ct[1:(startrow-1)],v[1:(startrow-1)],max))
Note that the first item is the threshold for joining the top-k club: thresh <- unname(topk[1])
Loop from startrow to length(v), updating correct (here a vector, not a running sum) whenever the threshold is met, and updating the top-k club whenever the threshold is strictly exceeded and the value is not already in the club.
That's it; the rest is just details. Here's my function:
ff <- function(dat){
  vf <- factor(dat$V1)
  v <- as.integer(vf)
  ct <- dat[, ct := 1:.N, by = V1]$ct
  n <- length(v)
  ct <- setNames(ct, v)
  startrow <- dat[, list(.I, .GRP), by = V1][.GRP == k + 1]$.I[1]
  topk <- sort(tapply(ct[1:(startrow - 1)], v[1:(startrow - 1)], max))
  thresh <- unname(topk[1])
  correct <- rep(0L, n)
  correct[1:(startrow - 1)] <- 1L
  for (i in startrow:n) {
    cti = ct[i]
    if (cti >= thresh){
      correct[i] <- 1L
      if (cti > thresh & !(names(cti) %in% names(topk))){
        topk <- sort(c(cti, topk))[-1]
        thresh <- unname(topk[1])
      }
    }
  }
  sum(correct)
}
It is very fast, but differs from #MaratTalipov's and the OP's in its results:
set.seed(1)
dat <- data.table(sample(1:50,10000,replace=T))
k <- 5
f1(dat) # 1012
f3(dat) # 1015
ff(dat) # 1719
Here's my benchmark (excluding the OP's approach as encapsulated in f1(), since I'm impatient):
> benchmark(f3(dat),ff(dat),replications=10)[,1:5]
test replications elapsed relative user.self
1 f3(dat) 10 2.68 2.602 2.67
2 ff(dat) 10 1.03 1.000 1.03
My function gives more matches than #Marat's and the OP's because it allows ties at the threshold to count as "correct", while theirs only count matches for at most k values selected by whatever algorithm R's order function uses.
[New Solution]
There is a lightning-fast and very easy dplyr solution for k=1. The fC1 below treats ties equally, i.e., no tie-breaking. You'll see that you can impose any tie-breaking rule on it, and it is really fast.
library(dplyr)

fC1 <- function(dat){
  dat1 <- tbl_df(dat) %>%
    group_by(V1) %>%
    mutate(count = row_number() - 1) %>% ungroup() %>% slice(2:n() - 1) %>%
    filter(count != 0) %>%
    mutate(z = cummax(count)) %>%
    filter(count == z)
  z <- dat1$z
  length(z)
}
set.seed(1234)
dat<-data.table(sample(1:5000, 100000, replace=T))
system.time(a1 <- fC1(dat))[3] #returns 120
elapsed
0.04
system.time(a3m <- f3m(dat, 1))[3] # returns 29, the same as the Python result, which runs in about 60 s
elapsed
89.72
system.time(a3 <- f3(dat, 1))[3] #returns 31.
elapsed
95.07
You can freely impose some tie-breaking rule on the result of fC1 to arrive at different solutions. For instance, in order to arrive at the f3m or f3 solutions, we restrict the selection of some rows as follows
fC1_ <- function(dat){
  b <- tbl_df(dat) %>%
    group_by(V1) %>%
    mutate(count = row_number() - 1) %>%
    ungroup() %>%
    mutate(L = cummax(count + 1))# %>%
  b1 <- b %>% slice(2:(n() - 1)) %>%
    group_by(L) %>%
    slice(1) %>%
    filter(count + 1 >= L & count > 0)
  b2 <- b %>% group_by(L) %>%
    slice(1) %>%
    ungroup() %>%
    select(-L) %>%
    mutate(L = count)
  semi_join(b1, b2, by = c("V1", "L")) %>% nrow
}
set.seed(1234)
dat <- data.table(sample(1:50,10000,replace=T))
fC1_(dat)
#[1] 218
f3m(dat, 1)
#[1] 217
f3(dat, 1)
#[1] 218
and for the earlier example:
set.seed(1234)
dat<-data.table(sample(1:5000, 100000, replace=T))
system.time(fC1_(dat))[3];fC1_(dat)
#elapsed
# 0.05
#[1] 29
Somehow, I couldn't extend the solution for general k>1, so I resorted to Rcpp.
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
std::vector<int> countrank(std::vector<int> y, int k) {
  std::vector<int> v(y.begin(), y.begin() + k);
  std::make_heap(v.begin(), v.end());
  std::vector<int> count(y.size());
  for(int i = 0; i < y.size(); i++){
    if(y[i] == 0){count[i] = 0;}
    else{
      v.push_back(y[i]); std::push_heap(v.begin(), v.end());
      std::pop_heap(v.begin(), v.end()); v.pop_back();
      std::vector<int>::iterator it = std::find(v.begin(), v.end(), y[i]);
      if (it != v.end()) {count[i] = 1;};
    }
  }
  return count;
}
For k=1, it is worth noting that fC1 is at least as fast as the following Rcpp version fCpp.
fCpp <- function(dat, k) {
  dat1 <- tbl_df(dat) %>%
    group_by(V1) %>%
    mutate(count = row_number())
  x <- dat1$V1
  y <- dat1$count - 1
  z <- countrank(-y, k)
  sum(z[2:(nrow(dat) - 1)])
}
Again, you can impose any tie-breaking rule with minimal effort.
[f3, f3m functions]
f3 is from #Marat Talipov and f3m is some amendment to it (seems superfluous though).
f3m <- function(dat, k){
  n <- nrow(dat)
  dat1 <- tbl_df(dat) %>%
    group_by(V1) %>%
    mutate(count = row_number())
  x <- dat1$V1
  y <- dat1$count
  rank <- rep(NA, n)
  tablex <- numeric(max(x))
  for(i in 2:(n - 1)){
    if(y[i] == 1){rank[i] = NA} # this condition was originally missing
    else{
      tablex[x[i - 1]] = y[i - 1]
      rank[i] = match(x[i], order(tablex, decreasing = T))
    }
  }
  rank <- rank[2:(n - 1)]
  sum(rank <= k, na.rm = T)
}
Refer to the edit history for the earlier solution.
How about this solution:
# unique values
unq_vals <- sort(dat[, unique(V1)])
# cumulative count for each unique value by row
cum_count <- as.data.table(lapply(unq_vals, function(x) cumsum(dat$V1==x)))
# running ranking for each unique value by row
cum_ranks <- t(apply(-cum_count, 1, rank, ties.method='max'))
Now the rank of (e.g.) the 2nd unique value as of the 8th observation is stored in:
cum_ranks[8, 2]
You can get the rank of each item by row (and present it in a readable table) like this. If rank <= k for row i, then the i-th item of V1 is among the k most frequent items as of observation i.
dat[, .(V1, rank=sapply(1:length(V1), function(x) cum_ranks[x, V1[x]]))]
The first code block takes only 0.6883929 secs on my machine (according to a crude now <- Sys.time(); [code block in here]; Sys.time() - now timing), with dat <- data.table(sample(1:50, 10000, replace=T))
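To connect this back to the OP's running count of correct predictions, one possible tally from cum_ranks (a sketch; it looks up the rank as of the previous row to mirror the OP's loop, with k as defined in the question, and ties are resolved by ties.method='max', so the total can differ slightly from the OP's):
v_idx   <- match(dat$V1, unq_vals)
correct <- sum(sapply(2:(nrow(dat) - 1),
                      function(i) cum_ranks[i - 1, v_idx[i]] <= k))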

Speed up quantile calculation

I am using the Hmisc package to calculate the quantiles of two continuous variables and compare the results in a crosstable. You will find my code below.
My problem is that the calculation of the quantiles takes a considerable amount of time if the number of observations increases.
Is there any possibility to speed up this procedure by using the data.table, ddply or any other package?
Thanks.
library(Hmisc)
# Set seed
set.seed(123)
# Generate some data
a <- sample(1:25, 1e7, replace=TRUE)
b <- sample(1:25, 1e7, replace=TRUE)
c <- data.frame(a,b)
# Calculate quantiles
c$a.quantile <- cut2(a, g=5)
c$b.quantile <- cut2(b, g=5)
# Output some descriptives
summaryM(a.quantile ~ b.quantile, data=c, overall=TRUE)
# Time spent for calculation:
#    user  system elapsed
#   25.13    3.47   28.73
As stated by jlhoward and Ricardo Saporta, data.table doesn't seem to speed things up much in this case. The cut2 function is clearly the bottleneck here. I used another function to calculate the quantiles (see Is there a better way to create quantile "dummies" / factors in R?) and was able to cut the calculation time in half:
qcut <- function(x, n) {
  if(n <= 2)
  {
    stop("The sample must be split in at least 3 parts.")
  }
  else{
    break.values <- quantile(x, seq(0, 1, length = n + 1), type = 7)
    break.labels <- c(
      paste0(">=", break.values[1], " & <=", break.values[2]),
      sapply(break.values[3:(n)], function(x){paste0(">", break.values[which(break.values == x) - 1], " & <=", x)}),
      paste0(">", break.values[(n)], " & <=", break.values[(n + 1)]))
    cut(x, break.values, labels = break.labels, include.lowest = TRUE)
  }
}
c$a.quantile.2 <- qcut(c$a, 5)
c$b.quantile.2 <- qcut(c$b, 5)
summaryM(a.quantile.2 ~ b.quantile.2, data=c, overall=TRUE)
# Time spent for calculation:
#    user  system elapsed
#   10.22    1.47   11.70
Using data.table would reduce the calculation time by another second, but I like the summary by the Hmisc package better.
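For reference, the data.table variant alluded to might look roughly like this (a sketch of my own, applying the qcut() defined above by reference):
library(data.table)
DT <- data.table(a, b)
DT[, c("a.quantile.2", "b.quantile.2") := lapply(.SD, qcut, n = 5), .SDcols = c("a", "b")]
summaryM(a.quantile.2 ~ b.quantile.2, data = DT, overall = TRUE)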
You can use data.table's built-in .N variable to tabulate quickly.
library(data.table)
library(Hmisc)
DT <- data.table(a,b)
DT[, paste0(c("a", "b"), ".quantile") := lapply(.SD, cut2, g=5), .SDcols=c("a", "b")]
DT[, .N, keyby=list(b.quantile, a.quantile)][, setNames(as.list(N), as.character(b.quantile)), by=a.quantile]
You can break that last line down into two steps, to see what is going on. The second "[ " simply reshapes the data in a clean format.
DT.tabulated <- DT[, .N, keyby=list(b.quantile, a.quantile)]
DT.tabulated
DT.tabulated[, setNames(as.list(N), as.character(b.quantile)), by=a.quantile]
Data tables don't seem to improve things here:
library(Hmisc)
set.seed(123)
a <- sample(1:25, 1e7, replace=TRUE)
b <- sample(1:25, 1e7, replace=TRUE)
library(data.table)
# original approach
system.time({
  c <- data.frame(a, b)
  c$a.quantile <- cut2(a, g=5)
  c$b.quantile <- cut2(b, g=5)
  smry.1 <- summaryM(a.quantile ~ b.quantile, data=c, overall=TRUE)
})
   user  system elapsed
  72.79    6.22   79.02

# original data.table approach
system.time({
  DT <- data.table(a, b)
  DT[, paste0(c("a", "b"), ".quantile") := lapply(.SD, cut2, g=5), .SDcols=c("a", "b")]
  smry.2 <- DT[, .N, keyby=list(b.quantile, a.quantile)][, setNames(as.list(N), as.character(b.quantile)), by=a.quantile]
})
   user  system elapsed
  66.86    5.11   71.98

# different data.table approach (simpler, and uses table(...))
system.time({
  dt <- data.table(a, b)
  smry.3 <- table(dt[, lapply(dt, cut2, g=5)])
})
   user  system elapsed
  67.24    5.02   72.26
