Faster way to compare rows in a data frame - R

Consider the data frame below. I want to compare each row with the rows below it and then keep the pairs of rows that match in more than 3 values.
I wrote the code below, but it is very slow if you have a large data frame.
How could I do that faster?
data <- as.data.frame(matrix(c(10,11,10,13,9,10,11,10,14,9,10,10,8,12,9,10,11,10,13,9,13,13,10,13,9), nrow=5, byrow=T))
rownames(data)<-c("sample_1","sample_2","sample_3","sample_4","sample_5")
>data
V1 V2 V3 V4 V5
sample_1 10 11 10 13 9
sample_2 10 11 10 14 9
sample_3 10 10 8 12 9
sample_4 10 11 10 13 9
sample_5 13 13 10 13 9
output <- data.frame(sample = NA, duplicate = NA, matches = NA)
dfrow <- 1
for(i in 1:nrow(data)) {
  sample <- data[i, ]
  for(j in (i+1):nrow(data)) if(i+1 <= nrow(data)) {
    matches <- 0
    for(V in 1:ncol(data)) {
      if(data[j,V] == sample[,V]) {
        matches <- matches + 1
      }
    }
    if(matches > 3) {
      duplicate <- data[j, ]
      pair <- cbind(rownames(sample), rownames(duplicate), matches)
      output[dfrow, ] <- pair
      dfrow <- dfrow + 1
    }
  }
}
>output
sample duplicate matches
1 sample_1 sample_2 4
2 sample_1 sample_4 5
3 sample_2 sample_4 4

Here is an Rcpp solution. However, if the result matrix gets too big (i.e., there are too many hits), this will throw an error. I run the loops twice, first to get the necessary size of the result matrix and then to fill it. There is probably a better possibility. Also, obviously, this will only work with integers. If your matrix is numeric, you'll have to deal with floating point precision.
library(Rcpp)
library(inline)
#C++ code:
body <- '
const IntegerMatrix M(as<IntegerMatrix>(MM));
const int m=M.ncol(), n=M.nrow();
long count1;
int count2;
count1 = 0;
for (int i=0; i<(n-1); i++)
{
  for (int j=(i+1); j<n; j++)
  {
    count2 = 0;
    for (int k=0; k<m; k++) {
      if (M(i,k)==M(j,k)) count2++;
    }
    if (count2>3) count1++;
  }
}
IntegerMatrix R(count1,3);
count1 = 0;
for (int i=0; i<(n-1); i++)
{
  for (int j=(i+1); j<n; j++)
  {
    count2 = 0;
    for (int k=0; k<m; k++) {
      if (M(i,k)==M(j,k)) count2++;
    }
    if (count2>3) {
      count1++;
      R(count1-1,0) = i+1;
      R(count1-1,1) = j+1;
      R(count1-1,2) = count2;
    }
  }
}
return wrap(R);
'
fun <- cxxfunction(signature(MM = "matrix"), body, plugin = "Rcpp")
#with your data
fun(as.matrix(data))
# [,1] [,2] [,3]
# [1,] 1 2 4
# [2,] 1 4 5
# [3,] 2 4 4
#Benchmarks
set.seed(42)
mat1 <- matrix(sample(1:10,250*26,TRUE),ncol=26)
mat2 <- matrix(sample(1:10,2500*26,TRUE),ncol=26)
mat3 <- matrix(sample(1:10,10000*26,TRUE),ncol=26)
mat4 <- matrix(sample(1:10,25000*26,TRUE),ncol=26)
library(microbenchmark)
microbenchmark(
fun(mat1),
fun(mat2),
fun(mat3),
fun(mat4),
times=3
)
# Unit: milliseconds
# expr min lq median uq max neval
# fun(mat1) 2.675568 2.689586 2.703603 2.732487 2.761371 3
# fun(mat2) 272.600480 274.680815 276.761151 276.796217 276.831282 3
# fun(mat3) 4623.875203 4643.634249 4663.393296 4708.067638 4752.741979 3
# fun(mat4) 29041.878164 29047.151348 29052.424532 29235.839275 29419.254017 3
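Regarding the floating point caveat above, here is a minimal sketch (an assumption of mine, not part of the original answer) of one way to deal with it: scale and round a numeric matrix to integers at a chosen tolerance before calling fun(), so that the exact equality test inside the C++ code stays meaningful.
#hypothetical numeric data; digits is an assumed tolerance (here, agreement to 1e-6)
num <- matrix(runif(5*5), nrow = 5)
digits <- 6
int_equiv <- round(num * 10^digits)
storage.mode(int_equiv) <- "integer"  #fun() compares integer entries
fun(int_equiv)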

EDIT: Not sure what I was thinking last night when I subtracted rows, considering I could've directly tested for equality. Removed that unnecessary step from the code below.
Here is one approach that may either be slightly clever or poorly thought out... but hopefully the former. The idea is that instead of doing a series of comparisons row-by-row you can instead perform some vectorized operations by subtracting the row from the rest of the data frame and then looking at the number of elements that are equal to zero. Here is a simple implementation of the approach:
> library(data.table)
> data <- as.data.frame(matrix(c(10,11,10,13,9,10,11,10,14,9,10,10,8,12,9,10,11,10,13,9,13,13,10,13,9), nrow=5, byrow=T))
> rownames(data)<-c("sample_1","sample_2","sample_3","sample_4","sample_5")
>
> findMatch <- function(i,n){
+ tmp <- colSums(t(data[-(1:i),]) == unlist(data[i,]))
+ tmp <- tmp[tmp > n]
+ if(length(tmp) > 0) return(data.table(sample=rownames(data)[i],duplicate=names(tmp),match=tmp))
+ return(NULL)
+ }
>
> system.time(tab <- rbindlist(lapply(1:(nrow(data)-1),findMatch,n=3)))
user system elapsed
0.003 0.000 0.003
> tab
sample duplicate match
1: sample_1 sample_2 4
2: sample_1 sample_4 5
3: sample_2 sample_4 4
EDIT: Here is version 2, which uses matrices and pre-transposes the data so you only need to do that once. It should scale better to your example with a non-trivial amount of data.
library(data.table)
data <- matrix(round(runif(26*250000,0,25)),ncol=26)
tdata <- t(data)
findMatch <- function(i,n){
tmp <- colSums(tdata[,-(1:i)] == data[i,])
j <- which(tmp > n)
if(length(j) > 0) return(data.table(sample=i,duplicate=j+i,match=tmp[j]))
return(NULL)
}
tab <- rbindlist(lapply(1:(nrow(data)-1),findMatch,n=3))
I ran that on my machine for a bit and got through the first 1,500 iterations of a full 250,000 x 26 matrix in under 15 minutes, and it required about 600 MB of memory. Since previous iterations do not impact future iterations, you could certainly chunk this into parts and run them separately if needed.
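A minimal sketch of that chunking idea (my assumption, not code from the answer): split the outer index range into blocks, run each block through findMatch separately, and bind the pieces at the end.
#chunk_size is a hypothetical tuning knob; each chunk's result could also be written to disk
chunk_size <- 50000
idx <- 1:(nrow(data)-1)
chunks <- split(idx, ceiling(seq_along(idx)/chunk_size))
tab <- rbindlist(lapply(chunks, function(ix) rbindlist(lapply(ix, findMatch, n=3))))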

This is not a complete answer, just a quick workaround that comes to mind: use matrices instead of a data.frame (data frames are honestly quite slow). Matrices are quite fast in R, and doing at least some of the operations on them and only attaching the vector of sample names afterwards results in a significant speed increase.
Just a quick demo:
data <- matrix(c(10,11,10,13,9,10,11,10,14,9,10,10,8,12,9,10,11,10,13,9,13,13,10,13,9), nrow=5, byrow=T)
rownames(data) <- c("sample_1","sample_2","sample_3","sample_4","sample_5")
mu<-c("sample_1","sample_2","sample_3","sample_4","sample_5")
t=proc.time()
tab <- data.frame(sample = NA, duplicate = NA, matches = NA)
dfrow <- 1
for(i in 1:nrow(data)) {
  sample <- data[i, ]
  for(j in (i+1):nrow(data)) if(i+1 <= nrow(data)) {
    matches <- 0
    for(V in 1:ncol(data)) {
      if(data[j,V] == sample[V]) {
        matches <- matches + 1
      }
    }
    if(matches > 3) {
      duplicate <- data[j, ]
      pair <- cbind(mu[i], mu[j], matches)
      tab[dfrow, ] <- pair
      dfrow <- dfrow + 1
    }
  }
}
proc.time()-t
On average, on my machine, this yields
user system elapsed
0.00 0.06 0.06
While in your case I get
user system elapsed
0.02 0.06 0.08
I'm not sure whether there's anything quicker than matrices. You can also play around with parallelisation (a sketch follows below), but for loops like this, inlining C++ code (package Rcpp) is quite often used.
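Here is a minimal sketch of that parallelisation idea (my own illustration, not part of the answer): parallelise the outer loop over rows with the base parallel package; compare_row and the core count are assumptions.
library(parallel)
#hypothetical helper: compare row i of the matrix data against all later rows
compare_row <- function(i) {
  matches <- colSums(t(data[-(1:i), , drop = FALSE]) == data[i, ])
  hits <- which(matches > 3)
  if (length(hits)) cbind(sample = i, duplicate = i + hits, matches = matches[hits])
}
#mclapply forks on Unix-alikes; on Windows use parLapply with a cluster instead
res <- do.call(rbind, mclapply(1:(nrow(data)-1), compare_row, mc.cores = 2))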

library(data.table)
#creating the data
dt <- data.table(read.table(textConnection(
"Sample V1 V2 V3 V4 V5
sample_1 10 11 10 13 9
sample_2 10 11 10 14 9
sample_3 10 10 8 12 9
sample_4 10 11 10 13 9
sample_5 13 13 10 13 9"), header= TRUE))
# some constants which will be used frequently
nr = nrow(dt)
nc = ncol(dt)-1
#list into which we will insert the no. of matches for each sample
#for example's sake; I still suggest you possibly write the output to a file
totalmatches <- vector(mode = "list", length = (nr-1))
#looping over each sample
for ( i in 1:(nr-1))
{
# all combinations of i with i+1 to nr
samplematch <- cbind(dt[i],dt[(i+1):nr])
# renaming the comparison sample columns
setnames(samplematch,append(colnames(dt),paste0(colnames(dt),"2")))
#calculating number of matches
samplematch[,noofmatches := 0]
for (j in 1:nc)
{
samplematch[,noofmatches := noofmatches+1*(get(paste0("V",j)) == get(paste0("V",j,"2")))]
}
# removing individual value columns and matches < 3
samplematch <- samplematch[noofmatches >= 3,list(Sample,Sample2,noofmatches)]
# adding to the list
totalmatches[[i]] <- samplematch
}
The output -
rbindlist(totalmatches)
Sample Sample2 noofmatches
1: sample_1 sample_2 4
2: sample_1 sample_4 5
3: sample_1 sample_5 3
4: sample_2 sample_4 4
5: sample_4 sample_5 3
The performance of the matrix approach seems to be better, though; this method clocked -
user system elapsed
0.17 0.01 0.19

Everything that has been said in the comments is very valid; in particular, I also don't necessarily think R is the best place to do this. That said, this works a lot quicker for me than what you've posted on a much larger dataset (~9.7 sec vs. unfinished after two minutes):
data <- matrix(sample(1:30, 10000, replace=TRUE), ncol=5)
#Pre-prepare
x <- 1
#pre-allocate the output; this line is an added assumption (the original snippet omitted it)
output <- matrix(NA, nrow = nrow(data)^2, ncol = 3)
#Loop
for(i in seq(nrow(data)-2)){
  #Find the number of matches on that row
  sums <- apply(data[seq(from=-1,to=-i),], 1, function(x) sum(x==data[i,]))
  #Find how many are greater than/equal to 3
  matches <- which(sums >= 3)
  #Prepare output
  output[seq(from=x, length.out=length(matches)),1] <- rep(i, length(matches))
  output[seq(from=x, length.out=length(matches)),2] <- matches
  output[seq(from=x, length.out=length(matches)),3] <- sums[matches]
  #Alter the counter of how many we've made...
  x <- x + length(matches)
}
#Cleanup output
output <- output[!is.na(output[,1]),]
...I'm fairly certain my weird x variable and the assignment of output could be improved/turned into an apply-type problem, but it's late and I'm tired! Good luck!
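For reference, here is a minimal sketch of that apply-type rewrite (an assumption of how it might look, keeping the same >= 3 threshold; duplicate is recorded as i + hits so it refers to the original row numbers):
rows <- lapply(seq(nrow(data)-1), function(i) {
  sums <- apply(data[-(1:i), , drop = FALSE], 1, function(r) sum(r == data[i, ]))
  hits <- which(sums >= 3)
  if (length(hits)) cbind(sample = i, duplicate = i + hits, matches = sums[hits])
})
output2 <- do.call(rbind, rows)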

Well, I took a stab at it; the following code runs about 3 times faster than the original.
f <- function(ind, mydf){
res <- NULL
matches <- colSums(t(mydf[-(1:ind),])==mydf[ind,])
Ndups <- sum(matches > 3)
if(Ndups > 0){
res <- data.frame(sample=rep(ind,Ndups),duplicate=which(matches > 3),
matches= matches[matches > 3],stringsAsFactors = F)
rownames(res) <- NULL
return(as.matrix(res))
}
return(res)
}
f(1,mydf=as.matrix(data))
f(2,mydf=as.matrix(data))
system.time(
for(i in 1:1000){
tab <- NULL
for(j in 1:(dim(data)[1]-1))
tab <- rbind(tab,f(j,mydf=as.matrix(data)))
}
)/1000
tab

Assuming that all the entries in your dataset are of the same mode (numeric), turn it into a matrix. By transposing, you can take advantage of how == can be vectorized.
data <- as.matrix(data)
data <- t(data)
output <- lapply(seq_len(ncol(data) - 1), function(x) {
tmp <- data[,x] == data[, (x+1):ncol(data)]
n_matches <- {
if (x == ncol(data) - 1) {
setNames(sum(tmp),colnames(data)[ncol(data)])
} else {
colSums(tmp)
}
}
good_matches <- n_matches[n_matches >= 3]
})
The big question is how to output the results. As it stands I have your data in a list. I would think that this is the least memory-intensive way of storing your data.
[[1]]
sample_2 sample_4 sample_5
4 5 3
[[2]]
sample_4
4
[[3]]
named numeric(0)
[[4]]
sample_5
3
If you want a data frame output, then you'll want to tweak the return value of the function within lapply. Perhaps add in the last line of the function:
return(data.frame(
sample = colnames(data)[x],
duplicate = names(good_matches),
noofmatches = good_matches,
stringsAsFactors = FALSE))
And then use:
newoutput <- do.call(rbind, output)
## or, using plyr
# require(plyr)
# newoutput <- rbind.fill(output)

Related

R: Logical Conditions Not Being Respected

I am working with the R programming language. I am trying to build a loop that performs the following:
Step 1: Keep generating two random numbers "a" and "b" until both "a" and "b" are greater than 12
Step 2: Track how many random numbers had to be generated until it took for Step 1 to be completed
Step 3: Repeat Step 1 and Step 2 100 times
Since I do not know how to keep generating random numbers until a condition is met, I tried to generate a large number of random numbers, hoping that the condition would be met (there is probably a better way to write this):
results <- list()
for (i in 1:100){
# do until break
repeat {
# repeat many random numbers
a = rnorm(10000,10,1)
b = rnorm(10000,10,1)
# does any pair meet the requirement
if (any(a > 12 & b > 12)) {
# put it in a data.frame
d_i = data.frame(a,b)
# end repeat
break
}
}
# select all rows until the first time the requirement is met
# it must be met, otherwise the loop would not have ended
d_i <- d_i[1:which(d_i$a > 10 & d_i$b > 10)[1], ]
# prep other variables and only keep last row (i.e. the row where the condition was met)
d_i$index = seq_len(nrow(d_i))
d_i$iteration = as.factor(i)
e_i = d_i[nrow(d_i),]
results[[i]] <- e_i
}
results_df <- do.call(rbind.data.frame, results)
Problem: When I look at the results, I noticed that the loop is incorrectly considering the condition to be met, for example:
head(results_df)
a b index iteration
4 10.29053 10.56263 4 1
5 10.95308 10.32236 5 2
3 10.74808 10.50135 3 3
13 11.87705 10.75067 13 4
1 10.17850 10.58678 1 5
14 10.14741 11.07238 1 6
For instance, in each one of these rows - both "a" and "b" are smaller than 12.
Does anyone know why this is happening and can someone please show me how to fix this problem?
Thanks!
How about this way? Since you tagged while-loop, I tried using it.
res <- matrix(0, nrow = 0, ncol = 3)
for (j in 1:100){
a <- rnorm(1, 10, 1)
b <- rnorm(1, 10, 1)
i <- 1
while(a < 12 | b < 12) {
a <- rnorm(1, 10, 1)
b <- rnorm(1, 10, 1)
i <- i + 1
}
x <- c(a,b,i)
res <- rbind(res, x)
}
head(res)
[,1] [,2] [,3]
x 12.14232 12.08977 399
x 12.27158 12.01319 1695
x 12.57345 12.42135 302
x 12.07494 12.64841 600
x 12.03210 12.07949 82
x 12.34006 12.00365 782
dim(res)
[1] 100 3

Loop calculation with previous value not using for in R

I'm a beginning R programmer. I have trouble in a loop calculation with a previous value like recursion.
An example of my data:
dt <- data.table(a = c(0:4), b = c( 0, 1, 2, 1, 3))
The calculated value y is y[n] = (y[n-1] + b[n]) * a[n], with initial value 0 (y[1] = 0).
I used the for loop and the code and result is as below.
dt$y <- 0
for (i in 2:nrow(dt)) {
dt$y[i] <- (dt$y[i - 1] + dt$b[i]) * dt$a[i]
}
a b y
1: 0 0 0
2: 1 1 1
3: 2 2 6
4: 3 1 21
5: 4 3 96
This result is what I want. However, my data has over 1,000,000 rows and several columns, so I'm trying to find other ways to do this without a for loop. I tried to use "Reduce()", but it only works with a single vector (e.g. y[n] = y[n-1] + b[n]). As shown above, my function uses two vectors, a and b, so I can't find a solution.
Is there a more efficient way to be faster without using a for loop, such as using a recursive function or any good package functions?
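As a side note (a minimal sketch of my own, not taken from the question or the answers below): Reduce() can in fact fold over the row indices, carrying y as the accumulator, so the two-vector recurrence is expressible; it is still sequential, though, so it will not necessarily beat the loop.
dt$y <- Reduce(function(y_prev, i) (y_prev + dt$b[i]) * dt$a[i],
               2:nrow(dt), init = 0, accumulate = TRUE)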
This kind of computation cannot make use of R's advantage of vectorization because of the iterative dependencies. But the slow-down appears to really be coming from indexing performance on a data.frame or data.table.
Interestingly, I was able to speed up the loop considerably by accessing a, b, and y directly as numeric vectors (1000+ fold advantage for 2*10^5 rows) or as matrix "columns" (100+ fold advantage for 2*10^5 rows) versus as columns in a data.table or data.frame.
This old discussion may still shed some light on this rather surprising result: https://stat.ethz.ch/pipermail/r-help/2011-July/282666.html
Please note that I also made a different toy data.frame, so I could test a larger example without returning Inf as y grew with i:
Option data.frame (numeric vectors embedded in a data.frame or data.table per your example):
vec_length <- 200000
dt <- data.frame(a=seq(from=0, to=1, length.out = vec_length), b=seq(from=0, to=-1, length.out = vec_length), y=0)
system.time(for (i in 2:nrow(dt)) {
dt$y[i] <- (dt$y[i - 1] + dt$b[i]) * dt$a[i]
})
#user system elapsed
#79.39 146.30 225.78
#NOTE: Sorry, I didn't have the patience to let the data.table version finish for vec_length=2*10^5.
tail(dt$y)
#[1] -554.1953 -555.1842 -556.1758 -557.1702 -558.1674 -559.1674
Option vector (numeric vectors extracted in advance of loop):
vec_length <- 200000
dt <- data.frame(a=seq(from=0, to=1, length.out = vec_length), b=seq(from=0, to=-1, length.out = vec_length), y=0)
y <- as.numeric(dt$y)
a <- as.numeric(dt$a)
b <- as.numeric(dt$b)
system.time(for (i in 2:length(y)) {
y[i] <- (y[i - 1] + b[i]) * a[i]
})
#user system elapsed
#0.03 0.00 0.03
tail(y)
#[1] -554.1953 -555.1842 -556.1758 -557.1702 -558.1674 -559.1674
Option matrix (data.frame converted to matrix before loop):
vec_length <- 200000
dt <- as.matrix(data.frame(a=seq(from=0, to=1, length.out = vec_length), b=seq(from=0, to=-1, length.out = vec_length), y=0))
system.time(for (i in 2:nrow(dt)) {
dt[i, 3] <- (dt[i - 1, 3] + dt[i, 2]) * dt[i, 1]
})
#user system elapsed
#0.67 0.01 0.69
tail(dt[,3])
#[1] -554.1953 -555.1842 -556.1758 -557.1702 -558.1674 -559.1674
#NOTE: a matrix is actually a vector but with an additional attribute (its "dim") that says how the "matrix" should be organized into rows and columns
Option data.frame with matrix style indexing:
vec_length <- 200000
dt <- data.frame(a=seq(from=0, to=1, length.out = vec_length), b=seq(from=0, to=-1, length.out = vec_length), y=0)
system.time(for (i in 2:nrow(dt)) {
dt[i, 3] <- (dt[(i - 1), 3] + dt[i, 2]) * dt[i, 1]
})
#user system elapsed
#110.69 0.03 112.01
tail(dt[,3])
#[1] -554.1953 -555.1842 -556.1758 -557.1702 -558.1674 -559.1674
An option is to use Rcpp, since this recursive equation is easy to code in C++:
library(Rcpp)
cppFunction("
NumericVector func(NumericVector b, NumericVector a) {
int len = b.size();
NumericVector y(len);
for (int i = 1; i < len; i++) {
y[i] = (y[i-1] + b[i]) * a[i];
}
return(y);
}
")
func(c( 0, 1, 2, 1, 3), c(0:4))
#[1] 0 1 6 21 96
timing code:
vec_length <- 1e7
dt <- data.frame(a=1:vec_length, b=1:vec_length, y=0)
y <- as.numeric(dt$y)
a <- as.numeric(dt$a)
b <- as.numeric(dt$b)
system.time(for (i in 2:length(y)) {
y[i] <- (y[i - 1] + b[i]) * a[i]
})
# user system elapsed
# 19.22 0.06 19.44
system.time(func(b, a))
# user system elapsed
# 0.09 0.02 0.09
Here is a base R solution.
According to the information from @ThetaFC, one indication for speedup is to use a matrix or vector (rather than a data.frame or data.table). Thus, it is better to have the following preprocessing before calculating df$y, i.e.,
a <- as.numeric(df$a)
b <- as.numeric(df$b)
Then, you have two approaches to get df$y:
writing your customized recursion function
f <- function(k) {
if (k == 1) return(0)
c(f(k-1),(tail(f(k-1),1) + b[k])*a[k])
}
df$y <- f(nrow(df))
Or a non-recursive function (I guess this will be much faster than the recursive approach)
g <- Vectorize(function(k) sum(rev(cumprod(rev(a[2:k])))*b[2:k]))
df$y <- g(seq(nrow(df)))
such that
> df
a b y
1 0 0 0
2 1 1 1
3 2 2 6
4 3 1 21
5 4 3 96
I don't think this will be any faster, but here's one way to do it without an explicit loop
dt[, y := purrr::accumulate2(a, b, function(last, a, b) (last + b)*a, .init = 0)[-1]]
dt
# a b y
# 1: 0 0 0
# 2: 1 1 1
# 3: 2 2 6
# 4: 3 1 21
# 5: 4 3 96

Merge-sorting in R using lists instead of vectors

So I've written this basic code that sorts a list using the well-known merge-sort algorithm. I've defined two functions: mergelists, which compares and merges the elements, and mergesort, which divides the list into single elements:
mergelists <- function(a,b) {
al <- length(a)
bl <- length(b)
r <- numeric(al+bl)
ai <- 1
bi <- 1
j <- 1
while((ai<=al) && (bi<=bl)) {
if(a[ai]<b[bi]) {
r[j] <- a[ai]
ai <- ai+1
} else {
r[j] <- b[bi]
bi <- bi+1
}
j <- j+1
}
if(ai<=al) r[j:(al+bl)] <- a[ai:al]
else if(bi<=bl) r[j:(al+bl)] <- b[bi:bl]
return(r)
}
mergesort <- function(x) {
l <- length(x)
if(l>1) {
p <- ceiling(l/2)
a <- mergesort(x[1:p])
b <- mergesort(x[(p+1):l])
return(mergelists(a,b))
}
return(x)
}
This seems to work fine for the examples I've used so far, for example:
> mergesort(c(11,10,9,15,6,12,17,8,19,7))
[1] 6 7 8 9 10 11 12 15 17 19
Now, for the sake of some research I'm doing, I want to change this code to work with R lists rather than vectors. The lists are usually defined as follows:
> list(number=10,data=c(10,5,8,2))
$number
[1] 10
$data
[1] 10 5 8 2
Here, data represents the vector and number is the number of comparisons.
After the change, I imagine that the program should give me something like this:
>mergelists(list(number=8,data=c(1,3,5,8,9,10)),list(number=5,data=c(2,4,6,7)))
$number
[1] 20
$data
[1] 1 2 3 4 5 6 7 8 9 10
> mergesort(c(11,10,9,15,6,12,17,8,19,7))
$number
[1] 22
$data
[1] 6 7 8 9 10 11 12 15 17 19
The 20 here is basically 8 + 5 + 7, because 7 comparisons would be necessary to merge the two sorted lists, but I don't know how to do this because I have little experience with R lists. I would appreciate your help. Thanks.
The starting point for any vector vec is list(number = 0, data = vec), where number is 0 because it has taken 0 comparisons to start with an unsorted vector.
You first need to modify mergelists to deal with two lists, simply by adding the indexing and then reforming the list at the end.
mergelists <- function(a,b) {
firstn <- a$number + b$number
a <- a$data
b <- b$data
al <- length(a)
bl <- length(b)
r <- numeric(al+bl)
ai <- 1
bi <- 1
j <- 1
while((ai<=al) && (bi<=bl)) {
if(a[ai]<b[bi]) {
r[j] <- a[ai]
ai <- ai+1
} else {
r[j] <- b[bi]
bi <- bi+1
}
j <- j+1
}
if(ai<=al) r[j:(al+bl)] <- a[ai:al]
else if(bi<=bl) r[j:(al+bl)] <- b[bi:bl]
return(list(number = firstn + j - 1L, data = r))
}
mergelists(list(number=8,data=c(1,3,5,8,9,10)), list(number=5,data=c(2,4,6,7)))
# $number
# [1] 20
# $data
# [1] 1 2 3 4 5 6 7 8 9 10
Now that you have the "base function" defined, you need the calling function to generate the enhanced vector (list) and pass it accordingly. This function can easily be improved for efficiency, but I think its recursive properties are sound.
mergesort <- function(x) {
# this first guarantees that if called with a vector, it is list-ified,
# but if called with a list (i.e., every other time in the recursion),
# the argument is untouched
if (! is.list(x)) x <- list(number = 0, data = x)
l <- length(x$data)
if (l > 1) {
p <- ceiling(l/2)
# the `within(...)` trick is a sneaky trick, can easily be
# handled with pre-assignment/subsetting
a <- mergesort(within(x, { data <- data[1:p]; }))
b <- mergesort(within(x, { data <- data[(p+1):l]; }))
return(mergelists(a,b))
}
return(x)
}
mergesort(c(11,10,9,15,6,12,17,8,19,7))
# $number
# [1] 22
# $data
# [1] 6 7 8 9 10 11 12 15 17 19
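As a minimal sketch of the "pre-assignment/subsetting" alternative to the within() trick mentioned in the comments (my own illustration; mergesort2 is a hypothetical name), the half-lists can simply be built explicitly:
mergesort2 <- function(x) {
  if (!is.list(x)) x <- list(number = 0, data = x)
  l <- length(x$data)
  if (l > 1) {
    p <- ceiling(l/2)
    a <- mergesort2(list(number = x$number, data = x$data[1:p]))
    b <- mergesort2(list(number = x$number, data = x$data[(p+1):l]))
    return(mergelists(a, b))
  }
  return(x)
}
mergesort2(c(11,10,9,15,6,12,17,8,19,7))  # same result as mergesort() above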

Memorize the last "correct" value of a sequence (for removing outliers)

I have a little problem in a function.
The aim of it is to remove outliers I've detected in my data.frame. They are detected when there is too big a difference from the previous correct value (e.g. c(1,2,3,20,30,4,5,6): "20" and "30" are the outliers). But my data is much more complex than this.
My idea is to consider the first two numeric values of my column as "correct". Then, I want to test each next value:
if the difference between the tested value and the previous one is <20, then it's a new correct one, and the test must start again from this new correct value (and not from the previous correct one)
if the same difference is >20, then it's a wrong one. An index must be put next to the wrong value, and the test must still continue from this same correct value, until a new correct value is detected
Here's an example with my function and a fake DF:
myts <- data.frame(x=c(12,12,35,39,46,45,33,5,26,28,29,34,15,15),z=NA)
test <- function(x){
st1 = NULL
temp <- st1[1] <- x[1]
st1 <- numeric(length(x))
for (i in 2:(length(x))){
if((!is.na(x[i])) & (!is.na(x[i-1]))& (abs((x[i])-(temp)) > 20)){
st1[i] <- 1
} }
return(st1)
}
myts[,2] <- apply(as.data.frame(myts[,1]),2,test)
myts[,2] <- as.numeric(myts[,2])
It nearly does the job, but the problem is that the last correct value is not memorized; it still does the test from the first correct value.
Because of this, rows 9 to 11 in my example are not detected. I'll let you imagine the problem on a 500,000-row data.frame.
How can I solve this little problem? The rest of the function may be OK.
You just need to update temp for any indices that aren't outliers:
test <- function(x) {
temp <- x[1]
st1 <- numeric(length(x))
for (i in 2:(length(x))){
if(!is.na(x[i]) & !is.na(x[i-1]) & abs(x[i]-temp) > 20) {
st1[i] <- 1
} else {
temp <- x[i]
}
}
return(st1)
}
myts[,2] <- apply(as.data.frame(myts[,1]),2,test)
myts[,2] <- as.numeric(myts[,2])
myts
# x z
# 1 12 0
# 2 12 0
# 3 35 1
# 4 39 1
# 5 46 1
# 6 45 1
# 7 33 1
# 8 5 0
# 9 26 1
# 10 28 1
# 11 29 1
# 12 34 1
# 13 15 0
# 14 15 0
One thing to note is that for loops in R will be quite slow compared to vectorized functions. However, because each element in your vector depends in a complicated way on the previous ones, it's tough to use R's built-in vectorized functions to efficiently compute your vector. You can convert this code nearly verbatim to C++ and use the Rcpp package to regain the efficiency:
library(Rcpp)
test2 <- cppFunction(
"IntegerVector test2(NumericVector x) {
const int n = x.length();
IntegerVector st1(n, 0);
double temp = x[0];
for (int i=1; i < n; ++i) {
if (!R_IsNA(x[i]) && !R_IsNA(x[i-1]) && fabs(x[i] - temp) > 20.0) {
st1[i] = 1;
} else {
temp = x[i];
}
}
return st1;
}")
all.equal(test(myts[,1]), test2(myts[,1]))
# [1] TRUE
# Benchmark on large vector with some NA values:
set.seed(144)
large.vec <- c(0, sample(c(1:50, NA), 1000000, replace=T))
all.equal(test(large.vec), test2(large.vec))
# [1] TRUE
library(microbenchmark)
microbenchmark(test(large.vec), test2(large.vec))
# Unit: milliseconds
# expr min lq mean median uq max neval
# test(large.vec) 2343.684164 2468.873079 2667.67970 2604.22954 2747.23919 3753.54901 100
# test2(large.vec) 9.596752 9.864069 10.97127 10.23011 11.68708 16.67855 100
The Rcpp code is about 250x faster on a vector of length 1 million. Depending on your use case this speedup may or may not be important.

Aggregate rows in a large matrix by rowname

I would like to aggregate the rows of a matrix by adding the values in rows that have the same rowname. My current approach is as follows:
> M
a b c d
1 1 1 2 0
1 2 3 4 2
2 3 0 1 2
3 4 2 5 2
> index <- as.numeric(rownames(M))
> M <- cbind(M,index)
> Dfmat <- data.frame(M)
> Dfmat <- aggregate(. ~ index, data = Dfmat, sum)
> M <- as.matrix(Dfmat)
> rownames(M) <- M[,"index"]
> M <- subset(M, select= -index)
> M
a b c d
1 3 4 6 2
2 3 0 1 2
3 4 2 5 2
The problem with this approach is that I need to apply it to a number of very large matrices (up to 1,000 rows and 30,000 columns). In these cases the computation time is very high (same problem when using ddply). Is there a more efficient way to come up with the solution? Does it help that the original input matrices are DocumentTermMatrix objects from the tm package? As far as I know they are stored in a sparse matrix format.
Here's a solution using by and colSums, but requires some fiddling due to the default output of by.
M <- matrix(1:9,3)
rownames(M) <- c(1,1,2)
t(sapply(by(M,rownames(M),colSums),identity))
V1 V2 V3
1 3 9 15
2 3 6 9
There is now an aggregate function in Matrix.utils. This can accomplish what you want with a single line of code and is about 10x faster than the combineByRow solution and 100x faster than the by solution:
N <- 10000
m <- matrix( runif(N*100), nrow=N)
rownames(m) <- sample(1:(N/2),N,replace=T)
> microbenchmark(a<-t(sapply(by(m,rownames(m),colSums),identity)),b<-combineByRow(m),c<-aggregate.Matrix(m,row.names(m)),times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
a <- t(sapply(by(m, rownames(m), colSums), identity)) 6000.26552 6173.70391 6660.19820 6419.07778 7093.25002 7723.61642 10
b <- combineByRow(m) 634.96542 689.54724 759.87833 732.37424 866.22673 923.15491 10
c <- aggregate.Matrix(m, row.names(m)) 42.26674 44.60195 53.62292 48.59943 67.40071 70.40842 10
> identical(as.vector(a),as.vector(c))
[1] TRUE
EDIT: Frank is right, rowsum is somewhat faster than any of these solutions. You would want to consider using another one of these other functions only if you were using a Matrix, especially a sparse one, or if you were performing an aggregation besides sum.
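For completeness, a minimal sketch (my addition) of the base-R rowsum() route mentioned above, applied to the benchmark matrix m; it sums rows grouped by their rowname in a single call:
d <- rowsum(m, group = rownames(m))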
The answer by James works as expected, but is quite slow for large matrices. Here is a version that avoids creating new objects:
combineByRow <- function(m) {
  m <- m[ order(rownames(m)), ]
  ## keep track of previous row name
  prev <- rownames(m)[1]
  i.start <- 1
  i.end <- 1
  ## cache the rownames -- profiling shows that it takes
  ## forever to look at them
  m.rownames <- rownames(m)
  stopifnot(all(!is.na(m.rownames)))
  ## go through matrix in a loop, as we need to combine some unknown
  ## set of rows
  for (i in 2:(1+nrow(m))) {
    curr <- m.rownames[i]
    ## if we found a new row name (or are at the end of the matrix),
    ## combine all rows and mark invalid rows
    if (prev != curr || is.na(curr)) {
      if (i.start < i.end) {
        m[i.start,] <- apply(m[i.start:i.end,], 2, max)
        m.rownames[(1+i.start):i.end] <- NA
      }
      prev <- curr
      i.start <- i
    } else {
      i.end <- i
    }
  }
  m[ which(!is.na(m.rownames)),]
}
Testing shows that it is about 10x faster than the answer using by (2 vs. 20 seconds in this example):
N <- 10000
m <- matrix( runif(N*100), nrow=N)
rownames(m) <- sample(1:(N/2),N,replace=T)
start <- proc.time()
m1 <- combineByRow(m)
print(proc.time()-start)
start <- proc.time()
m2 <- t(sapply(by(m,rownames(m),function(x) apply(x, 2, max)),identity))
print(proc.time()-start)
all(m1 == m2)
