Progressively find the most frequent item in a list in R

I would like to go through a list and, for each item, check whether it is the most frequent item in the list up to that point. The solution I currently have is incredibly slow compared to Python. Is there an effective way to speed it up?
library(data.table)
dat <- data.table(sample(1:50, 10000, replace=T))
k <- 1
correct <- 0 # total correct predictions
for (i in 2:(nrow(dat)-1)) {
  if (dat[i, V1] %in% dat[1:(i-1), .N, by=V1][order(-N), head(.SD, k)][, V1]) {
    correct <- correct + 1
  }
}
More generally, I would eventually like to see if an item is one of the k most
frequent items up until a point, or if it has one of the k highest values up until a point.
For comparison, here is a very fast implementation in Python:
import random, heapq, operator  # imports needed by the snippet (Python 2: iteritems/has_key)

dat = [random.randint(1, 50) for i in range(10000)]
correct = 0
k = 1
list = {}
for i in dat:
    toplist = heapq.nlargest(k, list.iteritems(), key=operator.itemgetter(1))
    toplist = [j[0] for j in toplist]
    if i in toplist:
        correct += 1
    if list.has_key(i):
        list[i] = list[i] + 1
    else:
        list[i] = 1

Here's what I've got so far (my solution is f3):
set.seed(10)
dat <- data.table(sample(1:3, 100, replace=T))
k <- 1
f3 <- function(dat) {
  correct <- 0 # total correct predictions
  vf <- factor(dat$V1)
  v <- as.integer(vf)
  tabs <- integer(max(v))
  for (i in 2:(nrow(dat)-1)) {
    tabs[v[i-1]] <- tabs[v[i-1]] + 1
    #print(tabs)
    #print(v[1:i])
    if (match(v[i], order(tabs, decreasing = T)) <= k) {
      correct <- correct + 1
    }
    #print(correct)
    #print('')
  }
  correct
}
f1 <- function(dat) {
  correct <- 0 # total correct predictions
  for (i in 2:(nrow(dat)-1)) {
    if (dat[i, V1] %in% dat[1:(i-1), .N, by=V1][order(-N), head(.SD, k)][, V1]) {
      correct <- correct + 1
    }
  }
  correct
}
library(rbenchmark)
print(f1(dat) == f3(dat))
benchmark(f1(dat), f3(dat), replications=10)
The benchmark results:
     test replications elapsed relative user.self sys.self user.child sys.child
1 f1(dat)           10   2.939  163.278     2.931    0.008          0         0
2 f3(dat)           10   0.018    1.000     0.018    0.000          0         0
are encouraging, but f3 has two problems:
1. It doesn't always give the same answer as the OP's algorithm, because ties are treated differently.
2. There is a lot of room for improvement, because tabs is sorted anew on every iteration (a sketch that avoids this for k = 1 follows below).
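For the k = 1 case, one way to address the second point is to keep the running counts and only track the current leader's count, so nothing is sorted inside the loop. This is just a sketch (not one of the answers below), and it counts a tie with the leader as correct, so its tie handling differs from f3:
f_k1 <- function(dat) {
  v <- dat$V1                      # assumes V1 holds small positive integers, as in the example
  counts <- integer(max(v))
  best <- 0L                       # highest count seen so far
  correct <- 0L
  for (i in 2:(length(v)-1)) {
    counts[v[i-1]] <- counts[v[i-1]] + 1L   # counts now cover dat[1:(i-1)]
    best <- max(best, counts[v[i-1]])
    if (counts[v[i]] == best) correct <- correct + 1L
  }
  correct
}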

The condition is automatically true until k+1 distinct values have been observed:
startrow <- dat[,list(.I,.GRP),by=V1][.GRP==k+1]$.I[1]
correct <- rep(0L,length(v))
correct[1:(startrow-1)] <- 1L
You can precompute the number of appearances a value of V1 has had so far:
ct <- dat[,ct:=1:.N,by=V1]$ct
During the loop, we can check whether the kth most frequent value is knocked out by the current value:
1. Grab the first k values and their counts up until startrow: topk <- sort(tapply(ct[1:(startrow-1)], v[1:(startrow-1)], max))
2. Note that the first item is the threshold for joining the top-k club: thresh <- unname(topk[1])
3. Loop from startrow to length(v), updating correct (here a vector, not a running sum) whenever the threshold is met, and updating the top-k club if the threshold is exceeded and the value is not already in the club.
That's it; the rest is just details. Here's my function:
ff <- function(dat){
  vf <- factor(dat$V1)
  v <- as.integer(vf)
  ct <- dat[, ct := 1:.N, by=V1]$ct
  n <- length(v)
  ct <- setNames(ct, v)
  startrow <- dat[, list(.I, .GRP), by=V1][.GRP == k+1]$.I[1]
  topk <- sort(tapply(ct[1:(startrow-1)], v[1:(startrow-1)], max))
  thresh <- unname(topk[1])
  correct <- rep(0L, n)
  correct[1:(startrow-1)] <- 1L
  for (i in startrow:n) {
    cti = ct[i]
    if ( cti >= thresh ){
      correct[i] <- 1L
      if ( cti > thresh & !( names(cti) %in% names(topk) ) ){
        topk <- sort(c(cti, topk))[-1]
        thresh <- unname(topk[1])
      }
    }
  }
  sum(correct)
}
It is very fast, but differs from #MaratTalipov's and the OP's in its results:
set.seed(1)
dat <- data.table(sample(1:50,10000,replace=T))
k <- 5
f1(dat) # 1012
f3(dat) # 1015
ff(dat) # 1719
Here's my benchmark (excluding the OP's approach as encapsulated in f1(), since I'm impatient):
> benchmark(f3(dat),ff(dat),replications=10)[,1:5]
     test replications elapsed relative user.self
1 f3(dat)           10    2.68    2.602      2.67
2 ff(dat)           10    1.03    1.000      1.03
My function gives more matches than #Marat's and the OP's because it counts ties at the threshold as "correct", while theirs count matches for at most k values selected by whatever tie-breaking R's order function applies.
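A tiny illustration of the difference (hypothetical counts, k = 1): with two values tied for the lead, the order()-based check rewards only the value that happens to be ranked first, while the threshold check rewards both.
tabs <- c(3, 3)                               # hypothetical tied counts for values 1 and 2
match(2, order(tabs, decreasing = TRUE)) <= 1 # FALSE: the order()-based check picks only value 1
tabs[2] >= max(tabs)                          # TRUE: the threshold-based check also counts value 2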

[New Solution]
There is a lightning-fast and very easy dplyr solution for k=1. fC1 below treats ties equally, i.e., no tie-breaking, and you'll see that you can impose any tie-breaking rule on it.
library(dplyr)
fC1 <- function(dat){
  dat1 <- tbl_df(dat) %>%
    group_by(V1) %>%
    mutate(count = row_number() - 1) %>%
    ungroup() %>%
    slice(2:n()-1) %>%
    filter(count != 0) %>%
    mutate(z = cummax(count)) %>%
    filter(count == z)
  z <- dat1$z
  length(z)
}
set.seed(1234)
dat<-data.table(sample(1:5000, 100000, replace=T))
system.time(a1 <- fC1(dat))[3] # returns 120
elapsed
   0.04
system.time(a3m <- f3m(dat, 1))[3] # returns 29, same as the Python result, which runs in about 60 s
elapsed
  89.72
system.time(a3 <- f3(dat, 1))[3] # returns 31
elapsed
  95.07
You can freely impose some tie-breaking rule on the result of fC1 to arrive at different solutions. For instance, to arrive at the f3m or f3 solutions, we restrict the selection of rows as follows:
fC1_ <- function(dat){
  b <- tbl_df(dat) %>%
    group_by(V1) %>%
    mutate(count = row_number() - 1) %>%
    ungroup() %>%
    mutate(L = cummax(count + 1))
  b1 <- b %>% slice(2:(n()-1)) %>%
    group_by(L) %>%
    slice(1) %>%
    filter(count + 1 >= L & count > 0)
  b2 <- b %>% group_by(L) %>%
    slice(1) %>%
    ungroup() %>%
    select(-L) %>%
    mutate(L = count)
  semi_join(b1, b2, by = c("V1", "L")) %>% nrow
}
set.seed(1234)
dat <- data.table(sample(1:50,10000,replace=T))
fC1_(dat)
#[1] 218
f3m(dat, 1)
#[1] 217
f3(dat, 1)
#[1] 218
and for the earlier example
set.seed(1234)
dat<-data.table(sample(1:5000, 100000, replace=T))
system.time(fC1_(dat))[3];fC1_(dat)
#elapsed
# 0.05
#[1] 29
Somehow, I couldn't extend the solution for general k>1, so I resorted to Rcpp.
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
std::vector<int> countrank(std::vector<int> y, int k) {
  std::vector<int> v(y.begin(), y.begin() + k);
  std::make_heap(v.begin(), v.end());
  std::vector<int> count(y.size());
  for (int i = 0; i < y.size(); i++) {
    if (y[i] == 0) { count[i] = 0; }
    else {
      v.push_back(y[i]); std::push_heap(v.begin(), v.end());
      std::pop_heap(v.begin(), v.end()); v.pop_back();
      std::vector<int>::iterator it = std::find(v.begin(), v.end(), y[i]);
      if (it != v.end()) { count[i] = 1; };
    }
  }
  return count;
}
For k=1, it is worth noting that fC1 is at least as fast as the following Rcpp version fCpp.
fCpp <- function(dat, k) {
  dat1 <- tbl_df(dat) %>%
    group_by(V1) %>%
    mutate(count = row_number())
  x <- dat1$V1
  y <- dat1$count - 1
  z <- countrank(-y, k)
  sum(z[2:(nrow(dat)-1)])
}
Again, you can impose any tie-breaking rule with minimal effort.
[f3, f3m functions]
f3 is #Marat Talipov's function and f3m is a small amendment to it (which may be superfluous).
f3m <- function(dat, k){
  n <- nrow(dat)
  dat1 <- tbl_df(dat) %>%
    group_by(V1) %>%
    mutate(count = row_number())
  x <- dat1$V1
  y <- dat1$count
  rank <- rep(NA, n)
  tablex <- numeric(max(x))
  for (i in 2:(n-1)) {
    if (y[i] == 1) { rank[i] = NA } # this condition was originally missing
    else {
      tablex[x[i-1]] = y[i-1]
      rank[i] = match(x[i], order(tablex, decreasing = T))
    }
  }
  rank <- rank[2:(n-1)]
  sum(rank <= k, na.rm = T)
}

How about this solution:
# unique values
unq_vals <- sort(dat[, unique(V1)])
# cumulative count for each unique value by row
cum_count <- as.data.table(lapply(unq_vals, function(x) cumsum(dat$V1==x)))
# running ranking for each unique value by row
cum_ranks <- t(apply(-cum_count, 1, rank, ties.method='max'))
Now the rank of (e.g.) the 2nd unique value as of the 8th observation is stored in:
cum_ranks[8, 2]
You can get the rank of each item by row (and present it in a readable table) like this. If rank <= k for row i, then the i-th item of V1 is among the k most frequent items as of observation i.
dat[, .(V1, rank=sapply(1:length(V1), function(x) cum_ranks[x, V1[x]]))]
The first code block takes only 0.6883929 secs on my machine (according to a crude now <- Sys.time(); [code block in here]; Sys.time() - now timing), with dat <- data.table(sample(1:50, 10000, replace=T))
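To connect this back to the OP's running tally, a hedged sketch (not part of this answer; it compares observation i against the ranking as of row i-1, and the ties.method='max' ranks may treat ties differently than the OP's code):
col_idx <- match(dat$V1, unq_vals)   # column of cum_ranks for each observed value
correct <- sum(sapply(2:(nrow(dat)-1),
                      function(i) cum_ranks[i-1, col_idx[i]] <= k))
correct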

Related

Using a custom function expecting a vector with mutate and group_by

I would like to run a custom function that uses specific columns of a dataframe split by groups. Here is my sample data & function code:
my_data = data.frame(N = c(12, 12, 24, 24, 12, 12),
                     p = rep(c(.125, .125, .025), 2),
                     group = rep(c("dogs", "cats"), each = 3))
power.sequential <- function(d, nseq, pseq){
  decvec <- NULL
  nvec <- NULL
  for (i in 1:100){
    decvec[i] <- 0
    nvec[i] <- 0
    j <- 1
    x <- NULL
    while (decvec[i] == 0 & nvec[i] < sum(nseq)){
      x <- c(x, rnorm(nseq[j], mean = d))
      p <- t.test(x)$p.value
      nvec[i] <- nvec[i] + nseq[j]
      if (p < pseq[j]) decvec[i] <- 1
      j <- j + 1
    }
  }
  power <- mean(decvec == 1)
  meanN <- mean(nvec)
  return(list("power" = power, "mean_N" = meanN))
}
Now I want to run this function on each group in my dataframe. This is how the function is called normally:
power.sequential(d = .5,
                 nseq = c(12, 12, 24),
                 pseq = c(.125, .125, .025))
The function returns two values, and ideally they would each be saved in a separate column of my dataframe.
And this is my best try, but it gives an error message:
my_data %>% group_by(group) %>%
mutate(result = power.sequential(d=.5,nseq=N,pseq=p))
I probably need to reshape my dataframe so that each group is a single row, but I'm stuck on how to proceed.
Here is my desired output; the function returns two values (power and meanN), and each should get its own column.
group power meanN
 dogs   .94 20.28
 cats   .95 27.36
You can do:
my_data %>%
  group_by(group) %>%
  do(data.frame(power.sequential(d = .5, nseq = .$N, pseq = .$p)[c(1, 2)])) %>%
  data.frame()
That gives:
  group power mean_N
1  cats  0.96  27.24
2  dogs  0.94  21.12
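As a side note, on dplyr versions that provide group_modify (0.8.1 or later), a similar result can be obtained without do(); a sketch under that assumption:
library(dplyr)
my_data %>%
  group_by(group) %>%
  group_modify(~ as.data.frame(power.sequential(d = .5, nseq = .x$N, pseq = .x$p)))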
The task can be simplified with data.table. One can call the function directly in the j section and both values will appear as separate columns.
library(data.table)
setDT(my_data)
set.seed(1)
my_data[,power.sequential(0.5, N, p), by=group]
#    group power mean_N
# 1:  dogs  0.90  24.48
# 2:  cats  0.94  27.72
Note: set.seed(1) has been used to keep the result consistent.

How can I convert these nested loops into an R loop function like sapply or tapply or

I have this code which I run over a data frame t.
for (i in years){
  for (j in type){
    x <- rbind(x, cbind(i, j,
                        sum(t[(t$year == i) & (t$type == j), ]$Emissions,
                            na.rm = TRUE)))
  }
}
Basically, I have two vectors years and type. I'm finding the sum of each category and merging that into a data frame. The above code works, but I cannot figure out how to use one of the loop functions.
Yes, there are ways to do this using the apply functions. I'm going to suggest a high performance approach using dplyr, though.
library(dplyr)
x <- t %>%
  group_by(year, type) %>%
  summarize(SumEmmissions = sum(Emissions, na.rm = TRUE))
I think you will find that it is much faster than either a loop or apply approach.
=================== Proof, as requested ===============
library(dplyr)
N <- 1000000
Nyear <- 50
Ntype <- 40
myt <- data.frame(year = sample.int(50, N, replace=TRUE),
                  type = sample.int(4, N, replace=TRUE),
                  Emissions = rnorm(N)
)
years <- 1:Nyear
type <- 1:Ntype
v1 <- function(){
  x <- myt %>%
    group_by(year, type) %>%
    summarize(SumEmmissions = sum(Emissions, na.rm=TRUE))
}
v2 <- function(){
  x <- data.frame()
  for (i in years){
    for (j in type){
      x <- rbind(x, cbind(i, j,
                          sum(myt[(myt$year == i) & (myt$type == j), ]$Emissions, na.rm = TRUE)))
    }
  }
}
v3 <- function(){
  t0 <- myt[myt$year %in% years & myt$type %in% type, ]
  x <- aggregate(Emissions ~ year + type, t0, sum, na.rm = TRUE)
}
system.time(v1())
   user  system elapsed
  0.051   0.000   0.051
system.time(v2())
   user  system elapsed
176.482   0.402 177.231
system.time(v3())
   user  system elapsed
  7.758   0.011   7.783
As the size of the data and the number of groups increase, so does the performance spread.
Pick out all rows for which year is in years and type is in type, giving t0. Then aggregate Emissions by year and type.
t0 <- t[t$year %in% years & t$type %in% type, ]
aggregate(Emissions ~ year + type, t0, sum, na.rm = TRUE)
If the years and type vectors contain all years and types, then the first line could be omitted and t0 in the second line replaced with t.
Next time please make your example reproducible.
Update: some corrections.
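For completeness, the same aggregation can also be written with tapply, one of the apply-family functions the question asks about. This is only a sketch, assuming t, years and type as defined in the question; note that combinations with no rows come out as NA rather than 0.
t0 <- t[t$year %in% years & t$type %in% type, ]
res <- tapply(t0$Emissions, list(year = t0$year, type = t0$type), sum, na.rm = TRUE)
# reshape the year x type matrix into a long data frame
x <- as.data.frame(as.table(res), responseName = "Emissions")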

How to vectorize a function in R

I need some help vectorizing the following code because I believe it will become more efficient. However, I do not know how to begin... I created a loop that goes through z. z has 3 columns and 112847 rows, which might be a reason it takes a long time. The 3 columns contain numbers that are used in the MACD() function...
library(quantmod)
library(TTR)
# get stock data
getSymbols('LUNA')
#Choose the Adjusted Close of a Symbol
stock <- Ad(LUNA)
#Create matrix for returns only
y <- stock
#Create a "MATRIX" by choosing the Adjusted Close
Nudata3 <- stock
#Sharpe Ratio Matrix
SR1<- matrix(NA, nrow=1)
# I want to create a table with all possible combinations from the ranges below
i = c(2:50)
k = c(4:50)
j = c(2:50)
# stores possible combinations into z
z <- expand.grid(i,k,j)
colnames(z)<- c("one","two","three")
n = 1
stretches <- length(z[,1])
while (n < stretches){
  # I am trying to go through all the values in "z"
  Nuw <- MACD((stock), nFast=z[n,1], nSlow=z[n,2], nSig=z[n,3], maType="EMA")
  colnames(Nuw) <- c("MACD","Signal") # change the col names to create signals
  x <- na.omit(merge((stock), Nuw))
  x$sig <- NA
  # Create trading signals
  sig1 <- Lag(ifelse((x$MACD <= x$Signal), -1, 0)) # short when MACD < SIGNAL
  sig2 <- Lag(ifelse((x$MACD >= x$Signal), 1, 0))  # long when MACD > SIGNAL
  x$sig <- sig1 + sig2
  # calculate Returns
  ret <- na.omit(ROC(Ad(x)) * x$sig)
  colnames(ret) <- c(paste(z[n,1], z[n,2], z[n,3], sep=","))
  x <- merge(ret, x)
  y <- merge(y, ret)                # This creates a MATRIX with RETURNs ONLY
  Nudata3 <- merge(Nudata3, x)
  ((mean(ret)/sd(ret)) * sqrt(252)) -> ANNUAL # Creates a Ratio
  ANNUAL -> Shrat                   # stores Ratio into ShRat
  SR1 <- cbind(SR1, Shrat)          # binds all ratios as it loops
  n <- (n+1)
}
I would like to know how to vectorize the MACD() function, to speed up the process since the length of stretches is approx. 112847. It takes my computer quite some time to go through the loop itself.
First and foremost, a case-specific optimization: remove the cases where nFast > nSlow, as they don't make sense technically.
Secondly, you are creating objects and copying them over and over again. This is very expensive.
Thirdly, you can perhaps code this better by creating a matrix of signals in one loop and doing the rest of the operations in a vectorized manner.
I would code what you are doing something like this.
Please read help pages of mapply, do.call, merge and sapply if you don't understand.
require(quantmod)
getSymbols("LUNA")
#Choose the Adjusted Close of a Symbol
stock <- Ad(LUNA)
# I want to create a table with all possible combinations from the ranges below
i = c(2:50)
k = c(4:50)
j = c(2:50)
# stores possible combinations into z
z <- expand.grid(i,k,j)
IMO: this is where your first optimization should be. Remove the cases where i > k:
z <- z[z[,1] < z[,2], ]
This reduces the number of cases from 112847 to 57575.
#Calculate only once. No need to calculate this in every iteration.
stockret <- ROC(stock)
getStratRet <- function(nFast, nSlow, nSig, stock, stockret) {
  x <- MACD((stock), nFast=nFast, nSlow=nSlow, nSig=nSig, maType="EMA")
  x <- na.omit(x)
  sig <- Lag(ifelse((x$macd <= x$signal), -1, 0)) + Lag(ifelse((x$macd >= x$signal), 1, 0))
  return(na.omit(stockret * sig))
}
RETURNSLIST <- do.call(merge, mapply(FUN = getStratRet, nFast = z[,1], nSlow = z[,2], nSig = z[,3], MoreArgs = list(stock = stock, stockret = stockret), SIMPLIFY = TRUE))
getAnnualSharpe <- function(ret) {
  ret <- na.omit(ret)
  return((mean(ret)/sd(ret)) * sqrt(252))
}
SHARPELIST <- sapply(RETURNSLIST, FUN = getAnnualSharpe)
Results will be as below. Which column belongs to which combo of i, j, k is trivial.
head(RETURNSLIST[, 1:3])
## LUNA.Adjusted LUNA.Adjusted.1 LUNA.Adjusted.2
## 2007-01-10 0.012739026 -0.012739026 0
## 2007-01-11 -0.051959739 0.051959739 0
## 2007-01-12 -0.007968170 -0.007968170 0
## 2007-01-16 -0.007905180 -0.007905180 0
## 2007-01-17 -0.005235614 -0.005235614 0
## 2007-01-18 0.028315920 -0.028315920 0
SHARPELIST
## LUNA.Adjusted LUNA.Adjusted.1 LUNA.Adjusted.2 LUNA.Adjusted.3 LUNA.Adjusted.4 LUNA.Adjusted.5 LUNA.Adjusted.6
## 0.04939150 -0.07428392 NaN 0.02626382 -0.06789803 -0.22584987 -0.07305477
## LUNA.Adjusted.7 LUNA.Adjusted.8 LUNA.Adjusted.9
## -0.05831643 -0.08864845 -0.08221986
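To pick the best parameter set from here, a sketch (not part of the original answer; it assumes the columns of RETURNSLIST keep the order of the rows of z):
best <- which.max(SHARPELIST) # column with the highest annualized Sharpe ratio
z[best, ]                     # the corresponding nFast, nSlow, nSig combination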
system.time(
+ RETURNSLIST <- do.call(merge, mapply(FUN = getStratRet, nFast = z[1:100,1], nSlow = z[1:100,2], nSig = z[1:100,3], MoreArgs = list(stock = stock, stockret = stockret), SIMPLIFY = TRUE)),
+ SHARPELIST <- sapply(RETURNSLIST, FUN = getAnnualSharpe)
+ )
user system elapsed
2.28 0.00 2.29

Efficiently counting numbers falling within each range of numbers

I'm looking for a faster solution to the problem below. I'll illustrate the problem with a small example and then provide the code to simulate a large data as that's the point of this question. My actual problem size is of list length = 1 million entries.
Say, I've two lists as shown below:
x <- list(c(82, 18), c(35, 50, 15))
y <- list(c(1,2,3,55,90), c(37,38,95))
Properties of x and y:
Each element of the list x always sums up to 100.
Each element of y will always be sorted and will be always between 1 and 100.
The problem:
Now, what I'd like is this. Taking x[[1]] and y[[1]], I'd like to find the count of numbers in y[[1]] that are 1) <= 82 and 2) > 82 and <= 100. That would be c(4, 1), because the numbers <= 82 are c(1,2,3,55) and the only number between 83 and 100 is c(90). Similarly for x[[2]] and y[[2]], the answer is c(0, 2, 1). That is, the answer should be:
[[1]]
[1] 4 1
[[2]]
[1] 0 2 1
Let me know if this is still unclear.
Simulated data with 1 million entries
set.seed(1)
N <- 100
n <- 1e6
len <- sample(2:3, n, TRUE)
x <- lapply(seq_len(n), function(ix) {
  probs <- sample(100:1000, len[ix])
  probs <- probs/sum(probs)
  oo <- round(N * probs)
  if (sum(oo) != 100) {
    oo[1] <- oo[1] + (100 - sum(oo))
  }
  oo
})
require(data.table)
ss <- sample(1:10, n, TRUE)
dt <- data.table(val=sample(1:N, sum(ss), TRUE), grp=rep(seq_len(n), ss))
setkey(dt, grp, val)
y <- dt[, list(list(val)),by=grp]$V1
What I've done so far:
Using mapply (slow):
I thought of using rank with ties.method="first" and mapply (the obvious choice with 2 lists), and tried this:
tt1 <- mapply(y, x, FUN=function(a, b) {
  tt <- rank(c(a, cumsum(b)), ties="first")[-(1:length(a))]
  c(tt[1]-1, diff(tt)-1)
})
Although this works just fine, it takes a lot of time on 1M entries. I think the overhead of computing rank and diff that many times adds to it. This takes 241 seconds!
Therefore, I decided to try and overcome the usage of rank and diff by using data.table and sorting with a "group" column. I came up with a longer but much faster solution shown below:
Using data.table (faster):
xl <- sapply(x, length)
yl <- sapply(y, length)
xdt <- data.table(val=unlist(x, use.names=FALSE), grp=rep(seq_along(xl), xl), type = "x")
xdt[, cumval := cumsum(val), by=grp]
ydt <- data.table(val=unlist(y, use.names=FALSE), grp=rep(seq_along(yl), yl), type = "y")
tt2 <- rbindlist(list(ydt, xdt[, list(cumval, grp, type)]))
setkey(tt2, grp, val)
xdt.pos <- which(tt2$type == "x")
tt2[, type.x := 0L][xdt.pos, type.x := xdt.pos]
tt2 <- tt2[xdt.pos][tt2[, .N, by = grp][, N := cumsum(c(0, head(N, -1)))]][, sub := type.x - N]
tt2[, val := xdt$val]
# time consuming step
tt2 <- tt2[, c(sub[1]-1, sub[2:.N] - sub[1:(.N-1)] - 1), by = grp]
tt2 <- tt2[, list(list(V1)),by=grp]$V1
This takes 26 seconds. So it's about 9 times faster. I'm wondering if it's possible to get much more speedup as I'll have to recursively compute this on 5-10 such 1 million elements. Thank you.
Here's another data.table approach. Edit: I added a (dirty?) hack that speeds this up and makes it ~2x faster than the OP's data.table solution.
# compile the data.table's, set appropriate keys
xl <- sapply(x, length)
yl <- sapply(y, length)
xdt <- data.table(val=unlist(x, use.names=FALSE), grp=rep(seq_along(xl), xl))
xdt[, cumval := cumsum(val), by=grp]
ydt <- data.table(val=unlist(y, use.names=FALSE), grp=rep(seq_along(yl), yl))
# hack #0, set key but prevent sorting, since we know data is already sorted
setattr(ydt, 'sorted', c('grp', 'val'))
# by setting the key in y to val and in x to cumval we can
# leverage the rolling joins
setattr(xdt, 'sorted', c('grp', 'cumval')) # hack #1 set key, but prevent sorting
vals = xdt[, cumval.copy := cumval][ydt, roll = -Inf]
# hack #2, same deal as above
# we know that the order of cumval and cumval.copy is the same
# so let's convince data.table in that
setattr(vals, 'sorted', c('grp', 'cumval.copy'))
# compute the counts and fill in the missing 0's
# for when there is no y in the appropriate x interval
tt2 = vals[, .N, keyby = list(grp, cumval.copy)][xdt][is.na(N), N := 0L]
# convert to list
tt2 = tt2[order(grp, cumval.copy), list(list(N)), by = grp]$V1
This is about 25% faster but outputs a matrix rather than a list. You may be able to use apply/sapply to make it work with a list (saving as a list was slowing it down).
c <- matrix(0, length(x), 100)
for (j in 1:length(x)){
  a <- -1
  b <- 0
  for (i in 1:length(x[[j]])){
    a <- b
    b <- b + x[[j]][i]
    c[j, i] <- sum((a <= y[[j]]) * (y[[j]] <= b))
  }
}
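As an aside, the per-pair counting itself can be written quite compactly in base R with findInterval and tabulate. This is a sketch, not from the thread; the left.open argument (R >= 3.3.0) makes each bin right-closed, matching the <= boundaries in the question:
counts <- mapply(function(xs, ys) {
  brks <- cumsum(xs)   # e.g. c(82, 100) for x[[1]]
  tabulate(findInterval(ys, brks, left.open = TRUE) + 1L, nbins = length(xs))
}, x, y, SIMPLIFY = FALSE)
counts[1:2] # list(c(4, 1), c(0, 2, 1)) for the small example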

Efficiently locate group-wise constant columns in a data.frame

How can I efficiently extract group-wise constant columns from a data frame? I've included a plyr implementation below to make precise what I'm trying to do, but it's slow. How can I do it as efficiently as possible? (Ideally without splitting the data frame at all.)
library(plyr)
base <- data.frame(group = 1:1000, a = sample(1000), b = sample(1000))
df <- data.frame(
  base[rep(seq_len(nrow(base)), length = 1e6), ],
  c = runif(1e6),
  d = runif(1e6)
)
is.constant <- function(x) length(unique(x)) == 1
constant_cols <- function(x) head(Filter(is.constant, x), 1)
system.time(constant <- ddply(df, "group", constant_cols))
#    user  system elapsed
#  20.531   1.670  22.378
stopifnot(identical(names(constant), c("group", "a", "b")))
stopifnot(nrow(constant) == 1000)
In my real use case (deep inside ggplot2) there may be an arbitrary number of constant and non-constant columns. The size of the data in the example is about the right order of magnitude.
(Edited to possibly address the issue of consecutive groups with the same value)
I'm tentatively submitting this answer, but I haven't completely convinced myself that it will correctly identify within group constant columns in all cases. But it's definitely faster (and can probably be improved):
constant_cols1 <- function(df, grp){
  df <- df[order(df[,grp]),]
  # Adjust values based on max diff in data
  rle_group <- rle(df[,grp])
  vec <- rep(rep(c(0, ceiling(diff(range(df)))),
                 length.out = length(rle_group$lengths)),
             times = rle_group$lengths)
  m <- matrix(vec, nrow = length(vec), ncol = ncol(df) - 1)
  df_new <- df
  df_new[,-1] <- df[,-1] + m
  rles <- lapply(df_new, FUN = rle)
  nms <- names(rles)
  tmp <- sapply(rles[nms != grp],
                FUN = function(x){identical(x$lengths, rles[[grp]]$lengths)})
  return(tmp)
}
My basic idea was to use rle, obviously.
I'm not sure if this is exactly what you are looking for, but it identifies columns a and b.
require(data.table)
is.constant <- function(x) identical(var(x), 0)
dtOne <- data.table(df)
system.time({
  dtTwo <- dtOne[, lapply(.SD, is.constant), by=group]
  result <- apply(X=dtTwo[, list(a, b, c, d)], 2, all)
  result <- result[result == TRUE]
})
stopifnot(identical(names(result), c("a", "b")))
result
(edit: better answer)
What about something like
is.constant <- function(x) length(which(x == x[1])) == length(x)
This seems to be a nice improvement. Compare the following.
> a <- rnorm(5000000)
> system.time(is.constant(a))
   user  system elapsed
  0.039   0.010   0.048
>
> system.time(is.constantOld(a))
   user  system elapsed
  1.049   0.084   1.125
A bit slower than what hadley suggested above, but I think it should handle the case of equal adjacent groups
findBreaks <- function(x) cumsum(rle(x)$lengths)
constantGroups <- function(d, groupColIndex = 1) {
  d <- d[order(d[, groupColIndex]), ]
  breaks <- lapply(d, findBreaks)
  groupBreaks <- breaks[[groupColIndex]]
  numBreaks <- length(groupBreaks)
  isSubset <- function(x) length(x) <= numBreaks && length(setdiff(x, groupBreaks)) == 0
  unlist(lapply(breaks[-groupColIndex], isSubset))
}
The intuition is that if a column is constant groupwise then the breaks in the column values (sorted by the group value) will be a subset of the breaks in the group value.
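A tiny illustration of that intuition (hypothetical values):
group <- c(1, 1, 1, 2, 2)  # findBreaks(group) is c(3, 5)
a     <- c(7, 7, 7, 9, 9)  # constant within groups: breaks c(3, 5), a subset of the group breaks
cc    <- c(7, 8, 7, 9, 9)  # not constant: breaks c(1, 2, 3, 5), not a subset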
Now, compare it with hadley's (with a small modification to ensure n is defined):
# df defined as in the question
n <- nrow(df)
changed <- function(x) c(TRUE, x[-1] != x[-n])
constant_cols2 <- function(df, grp){
  df <- df[order(df[,grp]),]
  changes <- lapply(df, changed)
  vapply(changes[-1], identical, changes[[1]], FUN.VALUE = logical(1))
}
> system.time(constant_cols2(df, 1))
   user  system elapsed
  1.779   0.075   1.869
> system.time(constantGroups(df))
   user  system elapsed
  2.503   0.126   2.614
> df$f <- 1
> constant_cols2(df, 1)
    a     b     c     d     f
 TRUE  TRUE FALSE FALSE FALSE
> constantGroups(df)
    a     b     c     d     f
 TRUE  TRUE FALSE FALSE  TRUE
Inspired by #Joran's answer, here's a similar strategy that's a little faster (1 s vs 1.5 s on my machine):
changed <- function(x) c(TRUE, x[-1] != x[-n])
constant_cols2 <- function(df, grp){
  df <- df[order(df[,grp]),]
  n <- nrow(df)
  changes <- lapply(df, changed)
  vapply(changes[-1], identical, changes[[1]], FUN.VALUE = logical(1))
}
system.time(cols <- constant_cols2(df, "group")) # about 1 s
system.time(constant <- df[changed(df$group), cols])
# user system elapsed
# 1.057 0.230 1.314
stopifnot(identical(names(constant), c("group", "a", "b")))
stopifnot(nrow(constant) == 1000)
It has the same flaw, though, in that it won't detect columns that have the same values for adjacent groups (e.g. df$f <- 1).
With a bit more thinking plus #David's ideas:
constant_cols3 <- function(df, grp) {
  # If col == TRUE and group == FALSE, not constant
  matching_breaks <- function(group, col) {
    !any(col & !group)
  }
  n <- nrow(df)
  changed <- function(x) c(TRUE, x[-1] != x[-n])
  df <- df[order(df[,grp]),]
  changes <- lapply(df, changed)
  vapply(changes[-1], matching_breaks, group = changes[[1]],
         FUN.VALUE = logical(1))
}
system.time(x <- constant_cols3(df, "group"))
# user system elapsed
# 1.086 0.221 1.413
And that gives the correct result.
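For comparison, a hedged data.table sketch (not from the thread) that also handles the df$f <- 1 case, assuming data.table's uniqueN:
library(data.table)
dt <- as.data.table(df)
# TRUE per group and column when the column takes a single value within that group
const_by_group <- dt[, lapply(.SD, function(x) uniqueN(x) == 1L), by = group]
# a column is group-wise constant when that holds in every group
sapply(const_by_group[, !"group"], all)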
How fast does is.unsorted(x) fail for non-constant x? Sadly, I don't have access to R at the moment. Also, it seems that's not your bottleneck, though.
