R efficient way to sort a Matrix by row - r

I have a matrix "multiOrderPairsFlat" of 2m+ rows and 2 columns where each cell contains a SKU description (e.g. "Pipe2mSteel" or "Bushing1inS") and would like to sort every row alphabetically, so that in every row, e.g. "Bushings1inS" is in the first column and "Pipe2mSteel" in the second.
However, if I run:
for (i in 1:length(multiOrderPairsFlat)){
multiOrderPairsFlat[i,] <- sort(multiOrderPairsFlat[i,])
}
It takes forever and I doubt this is the quickest way of dealing with this problem. Do you have any advice on how to solve this more efficiently, e.g. by vectorizing the operation?
Thanks for helping out;)
Best
seulberg1

It may be better to use pmin/pmax after converting to data.frame (as there are only two columns)
system.time({
df1 <- as.data.frame(multiOrderPairsFlat, stringsAsFactors=FALSE)
res <- data.frame(First = do.call(pmin, df1), Second = do.call(pmax, df1))
})
# user system elapsed
# 0.49 0.02 0.50
system.time({
for (i in 1:nrow(multiOrderPairsFlat)){
multiOrderPairsFlat[i,] <- sort(multiOrderPairsFlat[i,])
}
})
# user system elapsed
# 11.99 0.00 12.00
all.equal(as.matrix(res), multiOrderPairsFlat, check.attributes=FALSE)
#[1] TRUE
Checking the memory allocation
library(profvis)
profvis({
df1 <- as.data.frame(multiOrderPairsFlat, stringsAsFactors=FALSE)
res <- data.frame(First = do.call(pmin, df1), Second = do.call(pmax, df1))
})
#3.3 MB
profvis({
for (i in 1:nrow(multiOrderPairsFlat)){
multiOrderPairsFlat[i,] <- sort(multiOrderPairsFlat[i,])
}
})
#12.8 MB
data
set.seed(24)
multiOrderPairsFlat <- cbind(sample(c("Pipe2mSteel" , "Bushing1inS"), 1e6, replace=TRUE),
sample(c("Pipe2mSteel" , "Bushing1inS"), 1e6, replace=TRUE))

Related

Speeding up calculation across columns

I have a few moderately large data frames and need to do a calculation across different columns in the data; for example I want to compare column i in one data frame with i - 1 in another. I currently use a for loop. The calculation involves element-wise comparison of each pair of values so is somewhat slow: e.g. I take each column of data, turn it into a matrix and compare with the transpose of itself (with some additional complications). In my application (in which the data have about 100 columns and 3000 rows) this currently takes about 95 seconds. I am looking for ways to make this more efficient. If I were comparing the SAME column of each data frame I would try using mapply, but because I need to make comparisons across different columns I don't see how this could work. The current code is something like this:
d1 <- as.data.frame(matrix(rnorm(100000), nrow=1000))
d2 <- as.data.frame(matrix(rnorm(100000), nrow=1000))
r <- list()
ptm2 <- proc.time()
for(i in 2:100){
t <- matrix(0 + d1[,i] > 0,1000,1000)
u <- matrix(d1[,i],1000,1000)*t(matrix(d2[,i-1],1000,1000))
r[[i]] <- t * u
}
proc.time() - ptm2
This takes about 3 seconds on my computer; as mentioned the actual calculation is a bit more complicated than this MWE suggests. Obviously one could also improve efficiency in the calculation itself but I am looking for a solution to the 'compare column i to column i-1' issue.
Based on your example, if you align the d1 and d2 matrices ahead of time based on which columns you are comparing, then here is how you could use mapply. It appears to be only marginally faster, so parallel computing would be a better way to achieve speed gains.
d1 <- as.data.frame(matrix(rnorm(100000), nrow=1000))
d2 <- as.data.frame(matrix(rnorm(100000), nrow=1000))
r <- list()
ptm2 <- proc.time()
for(i in 2:100){
t <- matrix(0 + d1[,i] > 0,1000,1000)
u <- matrix(d1[,i],1000,1000)*t(matrix(d2[,i-1],1000,1000))
r[[i]] <- t * u
}
proc.time() - ptm2
#user system elapsed
#0.90 0.87 1.79
#select last 99 columns of d1 and first 99 columns of d2 based on your calcs
d1_99 <- as.data.frame(d1[,2:100]) #have to convert to data.frame for mapply to loop across columns; a data.frame is simply a list of vectors of equal length
d2_99 <- as.data.frame(d2[,1:99])
ptm3 <- proc.time()
r_test <- mapply(function(x, y) {
t <- matrix(x > 0, 1000, 1000) #didn't understand why you were adding 0 in your example
u <- matrix(x,1000,1000)*t(matrix(y,1000,1000))
t * u
}, x=d1_99, y=d2_99, SIMPLIFY = FALSE)
proc.time() - ptm3
#user system elapsed
#0.91 0.83 1.75
class(r_test)
#[1] "list"
length(r_test)
#[1] 99
#test for equality
all.equal(r[[2]], r_test[[1]])
#[1] TRUE
all.equal(r[[100]], r_test[[99]])
#[1] TRUE

Selecting rows from data.table

I'm using the R package data.table to read large amounts of data and analyse them. What I was wondering is why is selecting rows from a data.table so much slower than from a matrix.
require(data.table)
## create some random data
n = 1000
p = 1000
set.seed(1)
data.raw <- matrix(rnorm(n*p), nrow = n, ncol = p)
rownames(data.raw) <- lapply(1:n, FUN = function(x, length)paste(sample(c(letters, LETTERS), length, replace=TRUE), collapse=""), length = 10)
colnames(data.raw) <- paste0("X", 1:n)
#do the same thing as data.table
data.t <- data.table(data.raw)
data.t[, id := rownames(data.raw)]
setkey(data.t, id)
## now select one row after the other in both matrix and data.table
system.time(for(r in rownames(data.raw)) y <- data.raw[r, ])
# user system elapsed
# 0.016 0.000 0.017
system.time(for(r in data.t$id) y <- data.t[r])
# user system elapsed
# 30.580 0.000 30.608
Even for this relatively small example, data.table is extremely slow even though using the setkey. Is there any way to improve the performance of this?

Return factor associated with a numeric range defined in two columns

Using a database with a numeric range defined by two columns start and end, I am trying to look up the factor, code, associated with a numeric value in a separate vector identityCodes.
database <- data.frame(start = seq(1, 150000000, 1000),
end = seq(1000, 150000000, 1000),
code = paste0(sample(LETTERS, 15000, replace = TRUE),
sample(LETTERS, 15000, replace = TRUE)))
identityCodes <- sample(1:15000000, 1000)
I've come up with a method for finding the corresponding codes using a for loop and subsetting:
fun <- function (x, y) {
z <- rep(NA, length(x))
for (i in 1:length(x)){
z[i] <- as.character(y[y["start"] <= x[i] & y["end"] >= x[i], "code"])
}
return(z)
}
a <- fun(identityCodes, database)
But the method is slow, especially if I am to scale it:
system.time(fun(identityCodes, database))
user system elapsed
15.36 0.00 15.50
How can I identify the factors associated with each identityCodes faster? Is there a better way to go about this than using a for loop and subsetting?
Here's my attempt using data.table. Very fast - even though I am sure I am not leveraging it efficiently.
Given function:
# method 1
system.time(result1 <- fun(identityCodes, database))
user system elapsed
8.99 0.00 8.98
Using data.table
# method 2
require(data.table)
# x: a data.frame with columns start, end, code
# y: a vector with lookup codes
dt_comb <- function(x, y) {
# convert x to a data.table and set 'start' and 'end' as keys
DT <- setDT(x)
setkey(DT, start, end)
# create a lookup data.table where start and end are the identityCodes
DT2 <- data.table(start=y, end=y)
# overlap join where DT2 start & end are within DT start and end
res <- foverlaps(DT2, DT[, .(start, end)], type="within")
# store i as row number and key (for sorting later)
res[, i:=seq_len(nrow(res))]
setkey(res, i)
# merge the joined table to the original to get codes
final <- merge(res, DT, by=c("start", "end"))[order(i), .(code)]
# export as character the codes
as.character(final[[1]])
}
system.time(result2 <- dt_comb(x=database, y=identityCodes))
user system elapsed
0.08 0.00 0.08
identical(result1, result2)
[1] TRUE
edit: trimmed a couple lines from the function
This is about 45% faster on my machine:
result = lapply(identityCodes, function(x) {
data.frame(identityCode=x,
code=database[database$start <= x & database$end >= x, "code"])
})
result = do.call(rbind, result)
Here's a sample of the output:
identityCode code
1 6836845 OK
2 14100352 RB
3 2313115 NK
4 8440671 XN
5 11349271 TI
6 14467193 VL

How to append rows to an R data frame

I have looked around StackOverflow, but I cannot find a solution specific to my problem, which involves appending rows to an R data frame.
I am initializing an empty 2-column data frame, as follows.
df = data.frame(x = numeric(), y = character())
Then, my goal is to iterate through a list of values and, in each iteration, append a value to the end of the list. I started with the following code.
for (i in 1:10) {
df$x = rbind(df$x, i)
df$y = rbind(df$y, toString(i))
}
I also attempted the functions c, append, and merge without success. Please let me know if you have any suggestions.
Update from comment:
I don't presume to know how R was meant to be used, but I wanted to ignore the additional line of code that would be required to update the indices on every iteration and I cannot easily preallocate the size of the data frame because I don't know how many rows it will ultimately take. Remember that the above is merely a toy example meant to be reproducible. Either way, thanks for your suggestion!
Update
Not knowing what you are trying to do, I'll share one more suggestion: Preallocate vectors of the type you want for each column, insert values into those vectors, and then, at the end, create your data.frame.
Continuing with Julian's f3 (a preallocated data.frame) as the fastest option so far, defined as:
# pre-allocate space
f3 <- function(n){
df <- data.frame(x = numeric(n), y = character(n), stringsAsFactors = FALSE)
for(i in 1:n){
df$x[i] <- i
df$y[i] <- toString(i)
}
df
}
Here's a similar approach, but one where the data.frame is created as the last step.
# Use preallocated vectors
f4 <- function(n) {
x <- numeric(n)
y <- character(n)
for (i in 1:n) {
x[i] <- i
y[i] <- i
}
data.frame(x, y, stringsAsFactors=FALSE)
}
microbenchmark from the "microbenchmark" package will give us more comprehensive insight than system.time:
library(microbenchmark)
microbenchmark(f1(1000), f3(1000), f4(1000), times = 5)
# Unit: milliseconds
# expr min lq median uq max neval
# f1(1000) 1024.539618 1029.693877 1045.972666 1055.25931 1112.769176 5
# f3(1000) 149.417636 150.529011 150.827393 151.02230 160.637845 5
# f4(1000) 7.872647 7.892395 7.901151 7.95077 8.049581 5
f1() (the approach below) is incredibly inefficient because of how often it calls data.frame and because growing objects that way is generally slow in R. f3() is much improved due to preallocation, but the data.frame structure itself might be part of the bottleneck here. f4() tries to bypass that bottleneck without compromising the approach you want to take.
Original answer
This is really not a good idea, but if you wanted to do it this way, I guess you can try:
for (i in 1:10) {
df <- rbind(df, data.frame(x = i, y = toString(i)))
}
Note that in your code, there is one other problem:
You should use stringsAsFactors if you want the characters to not get converted to factors. Use: df = data.frame(x = numeric(), y = character(), stringsAsFactors = FALSE)
Let's benchmark the three solutions proposed:
# use rbind
f1 <- function(n){
df <- data.frame(x = numeric(), y = character())
for(i in 1:n){
df <- rbind(df, data.frame(x = i, y = toString(i)))
}
df
}
# use list
f2 <- function(n){
df <- data.frame(x = numeric(), y = character(), stringsAsFactors = FALSE)
for(i in 1:n){
df[i,] <- list(i, toString(i))
}
df
}
# pre-allocate space
f3 <- function(n){
df <- data.frame(x = numeric(1000), y = character(1000), stringsAsFactors = FALSE)
for(i in 1:n){
df$x[i] <- i
df$y[i] <- toString(i)
}
df
}
system.time(f1(1000))
# user system elapsed
# 1.33 0.00 1.32
system.time(f2(1000))
# user system elapsed
# 0.19 0.00 0.19
system.time(f3(1000))
# user system elapsed
# 0.14 0.00 0.14
The best solution is to pre-allocate space (as intended in R). The next-best solution is to use list, and the worst solution (at least based on these timing results) appears to be rbind.
Suppose you simply don't know the size of the data.frame in advance. It can well be a few rows, or a few millions. You need to have some sort of container, that grows dynamically. Taking in consideration my experience and all related answers in SO I come with 4 distinct solutions:
rbindlist to the data.frame
Use data.table's fast set operation and couple it with manually doubling the table when needed.
Use RSQLite and append to the table held in memory.
data.frame's own ability to grow and use custom environment (which has reference semantics) to store the data.frame so it will not be copied on return.
Here is a test of all the methods for both small and large number of appended rows. Each method has 3 functions associated with it:
create(first_element) that returns the appropriate backing object with first_element put in.
append(object, element) that appends the element to the end of the table (represented by object).
access(object) gets the data.frame with all the inserted elements.
rbindlist to the data.frame
That is quite easy and straight-forward:
create.1<-function(elems)
{
return(as.data.table(elems))
}
append.1<-function(dt, elems)
{
return(rbindlist(list(dt, elems),use.names = TRUE))
}
access.1<-function(dt)
{
return(dt)
}
data.table::set + manually doubling the table when needed.
I will store the true length of the table in a rowcount attribute.
create.2<-function(elems)
{
return(as.data.table(elems))
}
append.2<-function(dt, elems)
{
n<-attr(dt, 'rowcount')
if (is.null(n))
n<-nrow(dt)
if (n==nrow(dt))
{
tmp<-elems[1]
tmp[[1]]<-rep(NA,n)
dt<-rbindlist(list(dt, tmp), fill=TRUE, use.names=TRUE)
setattr(dt,'rowcount', n)
}
pos<-as.integer(match(names(elems), colnames(dt)))
for (j in seq_along(pos))
{
set(dt, i=as.integer(n+1), pos[[j]], elems[[j]])
}
setattr(dt,'rowcount',n+1)
return(dt)
}
access.2<-function(elems)
{
n<-attr(elems, 'rowcount')
return(as.data.table(elems[1:n,]))
}
SQL should be optimized for fast record insertion, so I initially had high hopes for RSQLite solution
This is basically copy&paste of Karsten W. answer on similar thread.
create.3<-function(elems)
{
con <- RSQLite::dbConnect(RSQLite::SQLite(), ":memory:")
RSQLite::dbWriteTable(con, 't', as.data.frame(elems))
return(con)
}
append.3<-function(con, elems)
{
RSQLite::dbWriteTable(con, 't', as.data.frame(elems), append=TRUE)
return(con)
}
access.3<-function(con)
{
return(RSQLite::dbReadTable(con, "t", row.names=NULL))
}
data.frame's own row-appending + custom environment.
create.4<-function(elems)
{
env<-new.env()
env$dt<-as.data.frame(elems)
return(env)
}
append.4<-function(env, elems)
{
env$dt[nrow(env$dt)+1,]<-elems
return(env)
}
access.4<-function(env)
{
return(env$dt)
}
The test suite:
For convenience I will use one test function to cover them all with indirect calling. (I checked: using do.call instead of calling the functions directly doesn't makes the code run measurable longer).
test<-function(id, n=1000)
{
n<-n-1
el<-list(a=1,b=2,c=3,d=4)
o<-do.call(paste0('create.',id),list(el))
s<-paste0('append.',id)
for (i in 1:n)
{
o<-do.call(s,list(o,el))
}
return(do.call(paste0('access.', id), list(o)))
}
Let's see the performance for n=10 insertions.
I also added a 'placebo' functions (with suffix 0) that don't perform anything - just to measure the overhead of the test setup.
r<-microbenchmark(test(0,n=10), test(1,n=10),test(2,n=10),test(3,n=10), test(4,n=10))
autoplot(r)
For 1E5 rows (measurements done on Intel(R) Core(TM) i7-4710HQ CPU # 2.50GHz):
nr function time
4 data.frame 228.251
3 sqlite 133.716
2 data.table 3.059
1 rbindlist 169.998
0 placebo 0.202
It looks like the SQLite-based sulution, although regains some speed on large data, is nowhere near data.table + manual exponential growth. The difference is almost two orders of magnitude!
Summary
If you know that you will append rather small number of rows (n<=100), go ahead and use the simplest possible solution: just assign the rows to the data.frame using bracket notation and ignore the fact that the data.frame is not pre-populated.
For everything else use data.table::set and grow the data.table exponentially (e.g. using my code).
Update with purrr, tidyr & dplyr
As the question is already dated (6 years), the answers are missing a solution with newer packages tidyr and purrr. So for people working with these packages, I want to add a solution to the previous answers - all quite interesting, especially .
The biggest advantage of purrr and tidyr are better readability IMHO.
purrr replaces lapply with the more flexible map() family,
tidyr offers the super-intuitive method add_row - just does what it says :)
map_df(1:1000, function(x) { df %>% add_row(x = x, y = toString(x)) })
This solution is short and intuitive to read, and it's relatively fast:
system.time(
map_df(1:1000, function(x) { df %>% add_row(x = x, y = toString(x)) })
)
user system elapsed
0.756 0.006 0.766
It scales almost linearly, so for 1e5 rows, the performance is:
system.time(
map_df(1:100000, function(x) { df %>% add_row(x = x, y = toString(x)) })
)
user system elapsed
76.035 0.259 76.489
which would make it rank second right after data.table (if your ignore the placebo) in the benchmark by #Adam Ryczkowski:
nr function time
4 data.frame 228.251
3 sqlite 133.716
2 data.table 3.059
1 rbindlist 169.998
0 placebo 0.202
A more generic solution for might be the following.
extendDf <- function (df, n) {
withFactors <- sum(sapply (df, function(X) (is.factor(X)) )) > 0
nr <- nrow (df)
colNames <- names(df)
for (c in 1:length(colNames)) {
if (is.factor(df[,c])) {
col <- vector (mode='character', length = nr+n)
col[1:nr] <- as.character(df[,c])
col[(nr+1):(n+nr)]<- rep(col[1], n) # to avoid extra levels
col <- as.factor(col)
} else {
col <- vector (mode=mode(df[1,c]), length = nr+n)
class(col) <- class (df[1,c])
col[1:nr] <- df[,c]
}
if (c==1) {
newDf <- data.frame (col ,stringsAsFactors=withFactors)
} else {
newDf[,c] <- col
}
}
names(newDf) <- colNames
newDf
}
The function extendDf() extends a data frame with n rows.
As an example:
aDf <- data.frame (l=TRUE, i=1L, n=1, c='a', t=Sys.time(), stringsAsFactors = TRUE)
extendDf (aDf, 2)
# l i n c t
# 1 TRUE 1 1 a 2016-07-06 17:12:30
# 2 FALSE 0 0 a 1970-01-01 01:00:00
# 3 FALSE 0 0 a 1970-01-01 01:00:00
system.time (eDf <- extendDf (aDf, 100000))
# user system elapsed
# 0.009 0.002 0.010
system.time (eDf <- extendDf (eDf, 100000))
# user system elapsed
# 0.068 0.002 0.070
Lets take a vector 'point' which has numbers from 1 to 5
point = c(1,2,3,4,5)
if we want to append a number 6 anywhere inside the vector then below command may come handy
i) Vectors
new_var = append(point, 6 ,after = length(point))
ii) columns of a table
new_var = append(point, 6 ,after = length(mtcars$mpg))
The command append takes three arguments:
the vector/column to be modified.
value to be included in the modified vector.
a subscript, after which the values are to be appended.
simple...!!
Apologies in case of any...!
My solution is almost the same as the original answer but it doesn't worked for me.
So, I gave names for the columns and it works:
painel <- rbind(painel, data.frame("col1" = xtweets$created_at,
"col2" = xtweets$text))

Speed up quantile calculation

I am using the Hmisc Package to calculate the quantiles of two continous variables and compare the results in a crosstable. You find my code below.
My problem is that the calculation of the quantiles takes a considerable amount of time if the number of observations increases.
Is there any possibility to speed up this procedure by using the data.table, ddply or any other package?
Thanks.
library(Hmisc)
# Set seed
set.seed(123)
# Generate some data
a <- sample(1:25, 1e7, replace=TRUE)
b <- sample(1:25, 1e7, replace=TRUE)
c <- data.frame(a,b)
# Calculate quantiles
c$a.quantile <- cut2(a, g=5)
c$b.quantile <- cut2(b, g=5)
# Output some descriptives
summaryM(a.quantile ~ b.quantile, data=c, overall=TRUE)
# Time spent for calculation:
# User System verstrichen
# 25.13 3.47 28.73
As stated by jlhoward and Ricardo Saporta data.table doesn't seem to speed up things too much in this case. The cut2 function is clearly the bottleneck here. I used another function to calculate the quantiles (see Is there a better way to create quantile "dummies" / factors in R?) and was able to decrease the calculation time by half:
qcut <- function(x, n) {
if(n<=2)
{
stop("The sample must be split in at least 3 parts.")
}
else{
break.values <- quantile(x, seq(0, 1, length = n + 1), type = 7)
break.labels <- c(
paste0(">=",break.values[1], " & <=", break.values[2]),
sapply(break.values[3:(n)], function(x){paste0(">",break.values[which(break.values == x)-1], " & <=", x)}),
paste0(">",break.values[(n)], " & <=", break.values[(n+1)]))
cut(x, break.values, labels = break.labels,include.lowest = TRUE)
}
}
c$a.quantile.2 <- qcut(c$a, 5)
c$b.quantile.2 <- qcut(c$b, 5)
summaryM(a.quantile.2 ~ b.quantile.2, data=c, overall=TRUE)
# Time spent for calculation:
# User System verstrichen
# 10.22 1.47 11.70
Using data.table would reduce the calculation time by another second, but I like the summary by the Hmisc package better.
You can use data.table's .N built in variable, to quickly tabulate.
library(data.table)
library(Hmisc)
DT <- data.table(a,b)
DT[, paste0(c("a", "b"), ".quantile") := lapply(.SD, cut2, g=5), .SDcols=c("a", "b")]
DT[, .N, keyby=list(b.quantile, a.quantile)][, setNames(as.list(N), as.character(b.quantile)), by=a.quantile]
You can break that last line down into two steps, to see what is going on. The second "[ " simply reshapes the data in a clean format.
DT.tabulated <- DT[, .N, keyby=list(b.quantile, a.quantile)]
DT.tabulated
DT.tabulated[, setNames(as.list(N), as.character(b.quantile)), by=a.quantile]
Data tables don't seem to improve things here:
library(Hmisc)
set.seed(123)
a <- sample(1:25, 1e7, replace=TRUE)
b <- sample(1:25, 1e7, replace=TRUE)
library(data.table)
# original approach
system.time({
c <- data.frame(a,b)
c$a.quantile <- cut2(a, g=5)
c$b.quantile <- cut2(b, g=5)
smry.1 <-summaryM(a.quantile ~ b.quantile, data=c, overall=TRUE)
})
user system elapsed
72.79 6.22 79.02
# original data.table approach
system.time({
DT <- data.table(a,b)
DT[, paste0(c("a", "b"), ".quantile") := lapply(.SD, cut2, g=5), .SDcols=c("a", "b")]
smry.2 <- DT[, .N, keyby=list(b.quantile, a.quantile)][, setNames(as.list(N), as.character(b.quantile)), by=a.quantile]
})
user system elapsed
66.86 5.11 71.98
# different data.table approach (simpler, and uses table(...))
system.time({
dt <- data.table(a,b)
smry.3 <- table(dt[,lapply(dt,cut2,g=5)])
})
user system elapsed
67.24 5.02 72.26

Resources