Speed up quantile calculation in R

I am using the Hmisc package to calculate the quantiles of two continuous variables and compare the results in a cross table. You can find my code below.
My problem is that calculating the quantiles takes a considerable amount of time as the number of observations increases.
Is there any way to speed this up using data.table, ddply, or any other package?
Thanks.
library(Hmisc)
# Set seed
set.seed(123)
# Generate some data
a <- sample(1:25, 1e7, replace=TRUE)
b <- sample(1:25, 1e7, replace=TRUE)
c <- data.frame(a,b)
# Calculate quantiles
c$a.quantile <- cut2(a, g=5)
c$b.quantile <- cut2(b, g=5)
# Output some descriptives
summaryM(a.quantile ~ b.quantile, data=c, overall=TRUE)
# Time spent for calculation:
# user system elapsed
# 25.13 3.47 28.73

As stated by jlhoward and Ricardo Saporta, data.table doesn't seem to speed things up much in this case. The cut2 function is clearly the bottleneck here. I used another function to calculate the quantiles (see Is there a better way to create quantile "dummies" / factors in R?) and was able to cut the calculation time in half:
qcut <- function(x, n) {
  if (n <= 2) {
    stop("The sample must be split in at least 3 parts.")
  } else {
    break.values <- quantile(x, seq(0, 1, length = n + 1), type = 7)
    break.labels <- c(
      paste0(">=", break.values[1], " & <=", break.values[2]),
      sapply(break.values[3:(n)], function(x) {
        paste0(">", break.values[which(break.values == x) - 1], " & <=", x)
      }),
      paste0(">", break.values[(n)], " & <=", break.values[(n + 1)]))
    cut(x, break.values, labels = break.labels, include.lowest = TRUE)
  }
}
c$a.quantile.2 <- qcut(c$a, 5)
c$b.quantile.2 <- qcut(c$b, 5)
summaryM(a.quantile.2 ~ b.quantile.2, data=c, overall=TRUE)
# Time spent for calculation:
# user system elapsed
# 10.22 1.47 11.70
Using data.table would reduce the calculation time by another second, but I like the summary by the Hmisc package better.
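For reference, a minimal sketch (mine, not benchmarked in the original post) of how qcut could be combined with data.table for that last bit of speed:
library(data.table)
DT <- data.table(a, b)
# cut both columns into quintiles by reference
DT[, c("a.quantile.2", "b.quantile.2") := lapply(.SD, qcut, n = 5), .SDcols = c("a", "b")]
# cross-tabulate the two quantile factors
DT[, .N, keyby = .(a.quantile.2, b.quantile.2)]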

You can use data.table's built-in .N variable to tabulate quickly.
library(data.table)
library(Hmisc)
DT <- data.table(a,b)
DT[, paste0(c("a", "b"), ".quantile") := lapply(.SD, cut2, g=5), .SDcols=c("a", "b")]
DT[, .N, keyby=list(b.quantile, a.quantile)][, setNames(as.list(N), as.character(b.quantile)), by=a.quantile]
You can break that last line down into two steps to see what is going on. The second "[" simply reshapes the data into a clean format.
DT.tabulated <- DT[, .N, keyby=list(b.quantile, a.quantile)]
DT.tabulated
DT.tabulated[, setNames(as.list(N), as.character(b.quantile)), by=a.quantile]
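Equivalently, the reshape in that second step can be written with dcast(), which may read more clearly (it should give the same wide table):
dcast(DT.tabulated, a.quantile ~ b.quantile, value.var = "N")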

Data tables don't seem to improve things here:
library(Hmisc)
set.seed(123)
a <- sample(1:25, 1e7, replace=TRUE)
b <- sample(1:25, 1e7, replace=TRUE)
library(data.table)
# original approach
system.time({
  c <- data.frame(a, b)
  c$a.quantile <- cut2(a, g=5)
  c$b.quantile <- cut2(b, g=5)
  smry.1 <- summaryM(a.quantile ~ b.quantile, data=c, overall=TRUE)
})
user system elapsed
72.79 6.22 79.02
# original data.table approach
system.time({
  DT <- data.table(a, b)
  DT[, paste0(c("a", "b"), ".quantile") := lapply(.SD, cut2, g=5), .SDcols=c("a", "b")]
  smry.2 <- DT[, .N, keyby=list(b.quantile, a.quantile)][, setNames(as.list(N), as.character(b.quantile)), by=a.quantile]
})
user system elapsed
66.86 5.11 71.98
# different data.table approach (simpler, and uses table(...))
system.time({
  dt <- data.table(a, b)
  smry.3 <- table(dt[, lapply(dt, cut2, g=5)])
})
user system elapsed
67.24 5.02 72.26

Related

Selecting rows from data.table

I'm using the R package data.table to read large amounts of data and analyse them. What I was wondering is why selecting rows from a data.table is so much slower than from a matrix.
require(data.table)
## create some random data
n = 1000
p = 1000
set.seed(1)
data.raw <- matrix(rnorm(n*p), nrow = n, ncol = p)
rownames(data.raw) <- lapply(1:n, FUN = function(x, length)paste(sample(c(letters, LETTERS), length, replace=TRUE), collapse=""), length = 10)
colnames(data.raw) <- paste0("X", 1:p)
# build the same data as a data.table
data.t <- data.table(data.raw)
data.t[, id := rownames(data.raw)]
setkey(data.t, id)
## now select one row after the other in both matrix and data.table
system.time(for(r in rownames(data.raw)) y <- data.raw[r, ])
# user system elapsed
# 0.016 0.000 0.017
system.time(for(r in data.t$id) y <- data.t[r])
# user system elapsed
# 30.580 0.000 30.608
Even for this relatively small example, data.table is extremely slow, even though setkey is used. Is there any way to improve this performance?
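A common remedy (my sketch, not from the original thread) is to avoid paying the per-call overhead of [.data.table a thousand times and instead look up all the ids in one keyed join; the per-row cost then becomes negligible:
# one vectorized join on the key instead of 1000 single-row subsets
system.time(res <- data.t[J(rownames(data.raw))])
If genuinely row-at-a-time access is needed, a matrix (as timed above) is the better structure; data.table pays off when the work is expressed as vectorized joins or grouped operations.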

R efficient way to sort a Matrix by row

I have a matrix "multiOrderPairsFlat" of 2M+ rows and 2 columns, where each cell contains a SKU description (e.g. "Pipe2mSteel" or "Bushing1inS"), and I would like to sort every row alphabetically, so that in every row, e.g., "Bushing1inS" is in the first column and "Pipe2mSteel" in the second.
However, if I run:
for (i in 1:nrow(multiOrderPairsFlat)){
  multiOrderPairsFlat[i,] <- sort(multiOrderPairsFlat[i,])
}
It takes forever, and I doubt this is the quickest way of dealing with the problem. Do you have any advice on how to solve it more efficiently, e.g. by vectorizing the operation?
Thanks for helping out;)
It may be better to use pmin/pmax after converting to data.frame (as there are only two columns)
system.time({
  df1 <- as.data.frame(multiOrderPairsFlat, stringsAsFactors=FALSE)
  res <- data.frame(First = do.call(pmin, df1), Second = do.call(pmax, df1))
})
# user system elapsed
# 0.49 0.02 0.50
system.time({
  for (i in 1:nrow(multiOrderPairsFlat)){
    multiOrderPairsFlat[i,] <- sort(multiOrderPairsFlat[i,])
  }
})
# user system elapsed
# 11.99 0.00 12.00
all.equal(as.matrix(res), multiOrderPairsFlat, check.attributes=FALSE)
#[1] TRUE
Checking the memory allocation
library(profvis)
profvis({
  df1 <- as.data.frame(multiOrderPairsFlat, stringsAsFactors=FALSE)
  res <- data.frame(First = do.call(pmin, df1), Second = do.call(pmax, df1))
})
#3.3 MB
profvis({
  for (i in 1:nrow(multiOrderPairsFlat)){
    multiOrderPairsFlat[i,] <- sort(multiOrderPairsFlat[i,])
  }
})
#12.8 MB
data
set.seed(24)
multiOrderPairsFlat <- cbind(sample(c("Pipe2mSteel", "Bushing1inS"), 1e6, replace=TRUE),
                             sample(c("Pipe2mSteel", "Bushing1inS"), 1e6, replace=TRUE))

Return factor associated with a numeric range defined in two columns

Using a database with numeric ranges defined by two columns, start and end, I am trying to look up the factor, code, associated with each numeric value in a separate vector, identityCodes.
database <- data.frame(start = seq(1, 150000000, 1000),
                       end = seq(1000, 150000000, 1000),
                       code = paste0(sample(LETTERS, 15000, replace = TRUE),
                                     sample(LETTERS, 15000, replace = TRUE)))
identityCodes <- sample(1:15000000, 1000)
I've come up with a method for finding the corresponding codes using a for loop and subsetting:
fun <- function(x, y) {
  z <- rep(NA, length(x))
  for (i in 1:length(x)){
    z[i] <- as.character(y[y["start"] <= x[i] & y["end"] >= x[i], "code"])
  }
  return(z)
}
a <- fun(identityCodes, database)
But the method is slow, especially if I am to scale it:
system.time(fun(identityCodes, database))
user system elapsed
15.36 0.00 15.50
How can I identify the factor associated with each of the identityCodes faster? Is there a better way to go about this than using a for loop and subsetting?
Here's my attempt using data.table. It's very fast, even though I am sure I am not leveraging it efficiently.
Using the function given in the question:
# method 1
system.time(result1 <- fun(identityCodes, database))
user system elapsed
8.99 0.00 8.98
Using data.table
# method 2
require(data.table)
# x: a data.frame with columns start, end, code
# y: a vector with lookup codes
dt_comb <- function(x, y) {
  # convert x to a data.table and set 'start' and 'end' as keys
  DT <- setDT(x)
  setkey(DT, start, end)
  # create a lookup data.table where start and end are the identityCodes
  DT2 <- data.table(start=y, end=y)
  # overlap join where DT2 start & end are within DT start and end
  res <- foverlaps(DT2, DT[, .(start, end)], type="within")
  # store i as row number and key on it (for sorting later)
  res[, i := seq_len(nrow(res))]
  setkey(res, i)
  # merge the joined table back to the original to get the codes
  final <- merge(res, DT, by=c("start", "end"))[order(i), .(code)]
  # return the codes as a character vector
  as.character(final[[1]])
}
system.time(result2 <- dt_comb(x=database, y=identityCodes))
user system elapsed
0.08 0.00 0.08
identical(result1, result2)
[1] TRUE
Edit: trimmed a couple of lines from the function.
This is about 45% faster on my machine:
result = lapply(identityCodes, function(x) {
  data.frame(identityCode = x,
             code = database[database$start <= x & database$end >= x, "code"])
})
result = do.call(rbind, result)
Here's a sample of the output:
identityCode code
1 6836845 OK
2 14100352 RB
3 2313115 NK
4 8440671 XN
5 11349271 TI
6 14467193 VL
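As an aside (not from the original answers): because the example ranges are sorted, contiguous, and non-overlapping, a base-R findInterval() lookup on the start column avoids the join entirely:
# index of the range whose start is the largest start value <= each identity code
idx <- findInterval(identityCodes, database$start)
result3 <- as.character(database$code[idx])
identical(result1, result3)  # should be TRUE for this data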

R data.table sliding window

What is the best (fastest) way to implement a sliding window function with the data.table package?
I'm trying to calculate a rolling median but have multiple rows per date (due to 2 additional factors), which I think means that the zoo rollapply function wouldn't work. Here is an example using a naive for loop:
library(data.table)
df <- data.frame(
  id=30000,
  date=rep(as.IDate(as.IDate("2012-01-01")+0:29, origin="1970-01-01"), each=1000),
  factor1=rep(1:5, each=200),
  factor2=1:5,
  value=rnorm(30, 100, 10)
)
dt = data.table(df)
setkeyv(dt, c("date", "factor1", "factor2"))
get_window <- function(date, factor1, factor2) {
  criteria <- data.table(
    date=as.IDate((date - 7):(date - 1), origin="1970-01-01"),
    factor1=as.integer(factor1),
    factor2=as.integer(factor2)
  )
  return(dt[criteria][, value])
}
output <- data.table(unique(dt[, list(date, factor1, factor2)]))[, window_median:=as.numeric(NA)]
for(i in nrow(output):1) {
  print(i)
  output[i, window_median:=median(get_window(date, factor1, factor2))]
}
data.table doesn't currently have any special features for rolling windows. Further detail is in my answer to another, similar question:
Is there a fast way to run a rolling regression inside data.table?
Rolling median is interesting. It would need a specialized function to be done efficiently (same link as in the earlier comment):
Rolling median algorithm in C
The data.table solutions in the question and answers here are all very inefficient, relative to a proper specialized rolling-median function (which isn't available for R, as far as I know).
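(Side note, not from the original answer: for a single, ungrouped numeric series with an odd window width, base R's stats::runmed() is such a specialized running median; it doesn't address the grouping and multiple-rows-per-date structure in this question, but it shows what the specialized approach buys you.)
# running median with window 7 over one plain numeric vector
x <- rnorm(1e6)
system.time(rmed <- runmed(x, k = 7))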
I managed to get the example down to 1.4s by creating a lagged dataset and doing a huge join.
df <- data.frame(
  id=30000,
  date=rep(as.IDate(as.IDate("2012-01-01")+0:29, origin="1970-01-01"), each=1000),
  factor1=rep(1:5, each=200),
  factor2=1:5,
  value=rnorm(30, 100, 10)
)
dt <- data.table(df)
setkeyv(dt, c("date", "factor1", "factor2"))
unique_set <- data.table(unique(dt[, list(original_date=date, factor1, factor2)]))
output2 <- data.table()
for(i in 1:7) {
  output2 <- rbind(output2, unique_set[, date:=original_date-i])
}
setkeyv(output2, c("date", "factor1", "factor2"))
output2 <- output2[dt]
output2 <- output2[, median(value), by=c("original_date", "factor1", "factor2")]
That works pretty well on this test dataset, but on my real one it fails with 8GB of RAM. I'm going to try moving up to one of the high-memory EC2 instances (with 17, 34, or 68GB of RAM) to get it working. Any ideas on how to do this in a less memory-intensive way would be appreciated.
This solution works but it takes a while.
df <- data.frame(
  id=30000,
  date=rep(seq.Date(from=as.Date("2012-01-01"), to=as.Date("2012-01-30"), by="d"), each=1000),
  factor1=rep(1:5, each=200),
  factor2=1:5,
  value=rnorm(30, 100, 10)
)
myFun <- function(dff, df){
  median(df$value[df$date > as.Date(dff[2])-8 & df$date < as.Date(dff[2])-1 &
                  df$factor1 == dff[3] & df$factor2 == dff[4]])
}
week_Med <- apply(df,1,myFun,df=df)
week_Med_df <- cbind(df,week_Med)
I address this in a related thread: https://stackoverflow.com/a/62399700/7115566
I suggest looking into the frollapply function. For instance, see below
library(data.table)
set.seed(17)
dt <- data.table(i = 1:100,
                 x = sample(1:10, 100, replace = TRUE),
                 y = sample(1:10, 100, replace = TRUE))
dt$index <- dt$x == dt$y
dt[, MA := frollapply(index, 10, mean)]
head(dt,12)
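Applied to the structure of the original question, the same idea would look roughly like this sketch (mine, untimed; it assumes one row per date within each factor1/factor2 group, so data with several rows per date would need to be aggregated to daily level first):
library(data.table)
dt <- data.table(df)   # df as defined in the question
setkey(dt, factor1, factor2, date)
# rolling 7-observation median within each factor combination
dt[, window_median := frollapply(value, n = 7, FUN = median), by = .(factor1, factor2)]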

Efficiently locate group-wise constant columns in a data.frame

How can I efficiently extract group-wise constant columns from a data frame? I've included a plyr implementation below to make precise what I'm trying to do, but it's slow. How can I do it as efficiently as possible? (Ideally without splitting the data frame at all.)
library(plyr)
base <- data.frame(group = 1:1000, a = sample(1000), b = sample(1000))
df <- data.frame(
  base[rep(seq_len(nrow(base)), length = 1e6), ],
  c = runif(1e6),
  d = runif(1e6)
)
is.constant <- function(x) length(unique(x)) == 1
constant_cols <- function(x) head(Filter(is.constant, x), 1)
system.time(constant <- ddply(df, "group", constant_cols))
# user system elapsed
# 20.531 1.670 22.378
stopifnot(identical(names(constant), c("group", "a", "b")))
stopifnot(nrow(constant) == 1000)
In my real use case (deep inside ggplot2) there may be an arbitrary number of constant and non-constant columns. The size of the data in the example is about the right order of magnitude.
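For comparison, here is a sketch (mine, not one of the posted answers) using data.table's uniqueN, which makes the "exactly one distinct value per group" test fairly direct:
library(data.table)
dt <- as.data.table(df)
# number of distinct values of every column within each group
n_distinct <- dt[, lapply(.SD, uniqueN), by = group]
# a column is group-wise constant if every group has exactly one distinct value
names(which(sapply(n_distinct[, !"group"], function(x) all(x == 1L))))
# should return "a" and "b" for the example data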
(Edited to possibly address the issue of consecutive groups with the same value)
I'm tentatively submitting this answer, but I haven't completely convinced myself that it will correctly identify within group constant columns in all cases. But it's definitely faster (and can probably be improved):
constant_cols1 <- function(df, grp){
  df <- df[order(df[,grp]),]
  # Adjust values based on max diff in data
  rle_group <- rle(df[,grp])
  vec <- rep(rep(c(0, ceiling(diff(range(df)))),
                 length.out = length(rle_group$lengths)),
             times = rle_group$lengths)
  m <- matrix(vec, nrow = length(vec), ncol = ncol(df)-1)
  df_new <- df
  df_new[,-1] <- df[,-1] + m
  rles <- lapply(df_new, FUN = rle)
  nms <- names(rles)
  tmp <- sapply(rles[nms != grp],
                FUN = function(x){identical(x$lengths, rles[[grp]]$lengths)})
  return(tmp)
}
My basic idea was to use rle, obviously.
I'm not sure if this is exactly what you are looking for, but it identifies columns a and b.
require(data.table)
is.constant <- function(x) identical(var(x), 0)
dtOne <- data.table(df)
system.time({
  dtTwo <- dtOne[, lapply(.SD, is.constant), by=group]
  result <- apply(X=dtTwo[, list(a, b, c, d)], 2, all)
  result <- result[result == TRUE]
})
stopifnot(identical(names(result), c("a", "b")))
result
(Edit: better answer)
What about something like
is.constant <- function(x) length(which(x == x[1])) == length(x)
This seems to be a nice improvement. Compare the following.
> a<-rnorm(5000000)
> system.time(is.constant(a))
user system elapsed
0.039 0.010 0.048
>
> system.time(is.constantOld(a))
user system elapsed
1.049 0.084 1.125
A bit slower than what hadley suggested above, but I think it should handle the case of equal adjacent groups
findBreaks <- function(x) cumsum(rle(x)$lengths)
constantGroups <- function(d, groupColIndex=1) {
  d <- d[order(d[, groupColIndex]), ]
  breaks <- lapply(d, findBreaks)
  groupBreaks <- breaks[[groupColIndex]]
  numBreaks <- length(groupBreaks)
  isSubset <- function(x) length(x) <= numBreaks && length(setdiff(x, groupBreaks)) == 0
  unlist(lapply(breaks[-groupColIndex], isSubset))
}
The intuition is that if a column is constant groupwise then the breaks in the column values (sorted by the group value) will be a subset of the breaks in the group value.
Now, compare it with hadley's (with small modification to ensure n is defined)
# df defined as in the question
n <- nrow(df)
changed <- function(x) c(TRUE, x[-1] != x[-n])
constant_cols2 <- function(df, grp){
  df <- df[order(df[,grp]),]
  changes <- lapply(df, changed)
  vapply(changes[-1], identical, changes[[1]], FUN.VALUE = logical(1))
}
> system.time(constant_cols2(df, 1))
user system elapsed
1.779 0.075 1.869
> system.time(constantGroups(df))
user system elapsed
2.503 0.126 2.614
> df$f <- 1
> constant_cols2(df, 1)
a b c d f
TRUE TRUE FALSE FALSE FALSE
> constantGroups(df)
a b c d f
TRUE TRUE FALSE FALSE TRUE
Inspired by #Joran's answer, here's a similar strategy that's a little faster (1 s vs 1.5 s on my machine):
n <- nrow(df)   # changed() below looks this up in the global environment
changed <- function(x) c(TRUE, x[-1] != x[-n])
constant_cols2 <- function(df, grp){
  df <- df[order(df[,grp]),]
  changes <- lapply(df, changed)
  vapply(changes[-1], identical, changes[[1]], FUN.VALUE = logical(1))
}
system.time(cols <- constant_cols2(df, "group")) # about 1 s
system.time(constant <- df[changed(df$group), cols])
# user system elapsed
# 1.057 0.230 1.314
stopifnot(identical(names(constant), c("group", "a", "b")))
stopifnot(nrow(constant) == 1000)
It has the same flaw, though, in that it won't detect columns that have the same values for adjacent groups (e.g. df$f <- 1).
With a bit more thinking plus #David's ideas:
constant_cols3 <- function(df, grp) {
  # If col == TRUE and group == FALSE, not constant
  matching_breaks <- function(group, col) {
    !any(col & !group)
  }
  n <- nrow(df)
  changed <- function(x) c(TRUE, x[-1] != x[-n])
  df <- df[order(df[,grp]),]
  changes <- lapply(df, changed)
  vapply(changes[-1], matching_breaks, group = changes[[1]],
         FUN.VALUE = logical(1))
}
system.time(x <- constant_cols3(df, "group"))
# user system elapsed
# 1.086 0.221 1.413
And that gives the correct result.
How fast does is.unsorted(x) fail for non-constant x? Sadly, I don't have access to R at the moment. It also seems that's not your bottleneck, though.
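(To make that idea concrete, a sketch: for a numeric vector without NAs, x is constant exactly when it is both non-decreasing and non-increasing, and is.unsorted() can return as soon as it sees the first violation.)
is.constant.sorted <- function(x) !is.unsorted(x) && !is.unsorted(rev(x))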
