Get all combinations of shared elements in a list

For a list:
terms <- list(Item1 = c("a", "b", "c", "d"),
              Item2 = c("a", "e", "f", "g"),
              Item3 = c("b", "e", "h", "i"),
              Item4 = c("j", "k"))
I would like to get the number of shared letters between each pair of items in the list. The expected output is therefore:
[,1] [,2] [,3] [,4]
[1,] 4 1 1 0
[2,] 1 4 1 0
[3,] 1 1 4 0
[4,] 0 0 0 2
From a previous StackOverflow answer, I found one possible solution:
overlapLength <- function(x, y) mapply(function(x, y)
length(intersect(x, y)), terms[x], terms[y])
s <- seq_along(terms)
outer(s, s, overlapLength)
But this is very slow for my list, which is very large (~9,000 items).
Is there a faster way to do this?
Thanks everyone for your input. I timed all answers with the first 100 items of my list.
> system.time(f_crossprod(go))
user system elapsed
0.024 0.001 0.025
> system.time(f_crossprod2(go))
user system elapsed
0.007 0.000 0.008
> system.time(f_mapply(go))
user system elapsed
2.018 0.032 2.059
> system.time(f_outer(go))
user system elapsed
1.950 0.016 1.979
> system.time(f_combn(go))
user system elapsed
1.056 0.005 1.062
> system.time(f_Rcpp(go))
user system elapsed
163.236 84.226 249.240
I then timed the outer and Matrix::crossprod solutions with the entire list of ~9,000 elements. The outer solution ran in about 55 minutes. The Matrix::crossprod solution ran in about 0.1 seconds!
It is possible I have made an error in my implementation of the Rcpp function. However, @alexis_laz, if you make your comment an answer I will accept it.
By the way, sorry I was not clear: I am not interested in the values on the diagonal.
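For reference, the f_crossprod and f_crossprod2 wrappers timed above presumably correspond to the table/stack cross-product idea from @alexis_laz's comment (the same expressions reappear as alexis1/alexis2 in the benchmarks below), and the remaining f_* wrappers to the mapply approach above and the outer, combn and Rcpp answers below. A minimal sketch of the cross-product versions:
library(Matrix)
f_crossprod <- function(terms) crossprod(table(stack(terms)))
f_crossprod2 <- function(terms) {
  # sparse variant: build a letters-by-items incidence table and take its
  # cross-product, so cell (i, j) counts the letters shared by items i and j
  Matrix::crossprod(xtabs(~ values + ind, stack(terms), sparse = TRUE))
}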

We can use outer
outer(names(terms), names(terms), FUN = function(x,y)
lengths(Map(intersect, terms[x], terms[y])))
# [,1] [,2] [,3] [,4]
#[1,] 4 1 1 0
#[2,] 1 4 1 0
#[3,] 1 1 4 0
#[4,] 0 0 0 2
Or more compactly
outer(terms, terms, FUN = function(...) lengths(Map(intersect, ...)))
# Item1 Item2 Item3 Item4
#Item1 4 1 1 0
#Item2 1 4 1 0
#Item3 1 1 4 0
#Item4 0 0 0 2
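Since the question notes that the diagonal (each item intersected with itself) is not of interest, a small addition (not part of the original answer) is to blank it out afterwards:
res <- outer(terms, terms, FUN = function(...) lengths(Map(intersect, ...)))
diag(res) <- 0  # or NA, to mark the self-comparisons explicitly
res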
We could also implement this in Rcpp. Below is the test1.cpp file:
#include <Rcpp.h>
#include <math.h>
using namespace Rcpp;

//[[Rcpp::export]]
List foo(List xs) {
  List x(xs);
  List x1 = Rcpp::clone(xs);
  List y1 = Rcpp::clone(xs);
  int n = x1.size();
  NumericVector res;
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < n; j++) {
      CharacterVector xd = x1[i];
      CharacterVector yd = y1[j];
      res.push_back(intersect(xd, yd).length());
    }
  }
  return wrap(res);
}
We call it in R using
library(Rcpp)
sourceCpp("test1.cpp")
`dim<-`(unlist(foo(terms)), c(4, 4))
# [,1] [,2] [,3] [,4]
#[1,] 4 1 1 0
#[2,] 1 4 1 0
#[3,] 1 1 4 0
#[4,] 0 0 0 2
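As a side note, the `dim<-` call above simply reshapes the flat result column-wise; an equivalent, perhaps more familiar form is:
matrix(unlist(foo(terms)), nrow = length(terms), ncol = length(terms))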
Benchmarks
In addition to the functions above, we included another version based on an RcppEigen implementation (the funCPr function used below) that was posted elsewhere
n <- 100
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
replace = TRUE)), paste0("Item", seq_len(n)))
library(Matrix)
library(inline)
library(Rcpp)
alexis1 <- function() {crossprod(table(stack(terms1)))}
alexis2 <- function() {Matrix::crossprod(xtabs( ~ values + ind,
stack(terms1), sparse = TRUE)) }
akrun1 <- function(){outer(terms1, terms1, FUN = function(...) lengths(Map(intersect, ...)))}
akrun2 <- function() {`dim<-`(unlist(foo(terms1)), c(n, n))}
akrun3 <- function() {tbl <- table(stack(terms1))
funCPr(tbl, tbl)[[1]]}
db <- function() {do.call(rbind, lapply(1:length(terms1), function(i)
sapply(terms1, function(a)
sum(unlist(terms1[i]) %in% unlist(a)))))}
lmo <- function() { setNames(data.frame(t(combn(names(terms1), 2)),
combn(seq_along(terms1), 2,
function(x) length(intersect(terms1[[x[1]]], terms1[[x[2]]])))),
c("col1", "col2", "counts"))}
and the benchmark output for n = 100 is
library(microbenchmark)
microbenchmark(alexis1(), alexis2(), akrun1(), akrun2(),akrun3(), db(), lmo(),
unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 1.035975 1.032101 1.031239 1.010472 1.044217 1.129092 10 a
# alexis2() 3.896928 3.656585 3.461980 3.386301 3.335469 3.288161 10 a
# akrun1() 218.456708 207.099841 198.391784 189.356065 188.542712 214.415661 10 d
# akrun2() 84.239272 79.073087 88.594414 75.719853 78.277769 129.731990 10 b
# akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
# db() 86.921164 82.201117 80.358097 75.113471 73.311414 105.761977 10 b
# lmo() 125.128109 123.203318 118.732911 113.271352 113.164333 138.075212 10 c
With a slightly higher n at 200
n <- 200
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
replace = TRUE)), paste0("Item", seq_len(n)))
microbenchmark(alexis1(), alexis2(), akrun3(), db(), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 1.117234 1.164198 1.181280 1.166070 1.230077 1.229899 10 a
# alexis2() 3.428904 3.425942 3.337112 3.379675 3.280729 3.164852 10 b
# akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
# db() 219.971285 219.577403 207.793630 213.232359 196.122420 187.433635 10 c
With n set at 9000
n <- 9000
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
replace = TRUE)), paste0("Item", seq_len(n)))
microbenchmark(alexis1(),alexis2(), akrun3(), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 2.048708 2.021709 2.009396 2.085750 2.141060 1.767329 10 b
# alexis2() 3.520220 3.518339 3.419368 3.616512 3.515993 2.952927 10 c
# akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
Checking the output
res1 <- alexis1()
res2 <- akrun3()
res3 <- alexis2()
all.equal(res1, res2, check.attributes = FALSE)
#[1] TRUE
all.equal(res1, as.matrix(res3), check.attributes = FALSE)
#[1] TRUE
Based on the comments from @alexis_laz, we included 3 more functions that replace the table/stack part, to compare the efficiency for n = 9000
alexis3 <- function() {
unlt = unlist(terms1, use.names = FALSE)
u = unique(unlt)
tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
crossprod(tab, tab)
}
alexis4 <- function() {
unlt = unlist(terms1, use.names = FALSE)
u = unique(unlt)
tab = sparseMatrix(x = 1L, i = match(unlt, u),
j = rep(seq_along(terms1), lengths(terms1)), dimnames = list(u, names(terms1)))
Matrix::crossprod(tab, tab, sparse = TRUE)
}
akrun4 <- function() {
unlt = unlist(terms1, use.names = FALSE)
u = unique(unlt)
tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
funCPr(tab, tab)[[1]]
}
and the benchmarks are
microbenchmark(alexis1(),alexis2(), alexis3(), alexis4(),
akrun3(), akrun4(), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 2.1888254 2.2897883 2.204237 2.169618 2.162955 2.122552 10 b
# alexis2() 3.7651292 3.9178071 3.672550 3.616577 3.587886 3.426039 10 c
# alexis3() 2.1776887 2.2410663 2.197293 2.137106 2.192834 2.241645 10 b
# alexis4() 4.1640895 4.3431379 4.262192 4.187449 4.388335 4.172607 10 d
# akrun3() 1.0000000 1.0000000 1.000000 1.000000 1.000000 1.000000 10 a
# akrun4() 0.9364288 0.9692772 1.043292 1.063931 1.090301 1.171245 10 a

This uses combn to produce a data.frame of shared-element counts for each pair of distinct terms. setNames adds the variable names.
result <- setNames(data.frame(t(combn(names(terms), 2)),
combn(seq_along(terms), 2,
function(x) length(intersect(terms[[x[1]]], terms[[x[2]]])))),
c("col1", "col2", "counts"))
which returns
result
col1 col2 counts
1 Item1 Item2 1
2 Item1 Item3 1
3 Item1 Item4 0
4 Item2 Item3 1
5 Item2 Item4 0
6 Item3 Item4 0
If desired, you could use lengths to get each term's own length and then rbind the results, like this:
temp <- lengths(terms)
rbind(result, data.frame(col1=names(temp), col2=names(temp), counts=temp, row.names=NULL))
col1 col2 counts
1 Item1 Item2 1
2 Item1 Item3 1
3 Item1 Item4 0
4 Item2 Item3 1
5 Item2 Item4 0
6 Item3 Item4 0
7 Item1 Item1 4
8 Item2 Item2 4
9 Item3 Item3 4
10 Item4 Item4 2

I'm not sure if this is necessarily faster or more efficient, but it sure was interesting.
do.call(rbind, lapply(1:length(terms), function(i)
sapply(terms, function(a)
sum(unlist(terms[i]) %in% unlist(a)))))
# Item1 Item2 Item3 Item4
#[1,] 4 1 1 0
#[2,] 1 4 1 0
#[3,] 1 1 4 0
#[4,] 0 0 0 2

Generate series 1, 2,1, 3,2,1, 4,3,2,1, 5,4,3,2,1

I am trying to generate a vector containing decreasing sequences of increasing length, such as 1, 2,1, 3,2,1, 4,3,2,1, 5,4,3,2,1, i.e.
c(1, 2:1, 3:1, 4:1, 5:1)
I tried to use a loop for this, but I don't know how to stack or concatenate the results.
for (i in 1:11) {
  x = rev(seq(i:1))
  print(x)
}
[1] 1
[1] 2 1
[1] 3 2 1
[1] 4 3 2 1
[1] 5 4 3 2 1
[1] 6 5 4 3 2 1
[1] 7 6 5 4 3 2 1
[1] 8 7 6 5 4 3 2 1
[1] 9 8 7 6 5 4 3 2 1
[1] 10 9 8 7 6 5 4 3 2 1
[1] 11 10 9 8 7 6 5 4 3 2 1
I have also been experimenting with rep, rev and seq, which would be my favourite option, but I did not get far.
With sequence:
rev(sequence(5:1))
# [1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1
(sequence(5:1) concatenates 1:5, 1:4, ..., 1:1, so reversing it yields the decreasing runs in increasing order of length.)
From R 4.0.0 sequence takes arguments from and by:
sequence(1:5, from = 1:5, by = -1)
# [1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1
Far from the golf minimalism of rev... However, if you wake up one morning and want to create such a sequence with n = 1000 (like in the answer below), the latter is in fact faster (but I can hear Brian Ripley in fortunes::fortune(98))
n = 1000
microbenchmark(
f_rev = rev(sequence(n:1)),
f_seq4.0.0 = sequence(1:n, from = 1:n, by = -1))
# Unit: microseconds
# expr min lq mean median uq max neval
# f_rev 993.7 1040.3 1128.391 1076.95 1133.3 1904.7 100
# f_seq4.0.0 136.4 141.5 153.778 148.25 150.1 304.7 100
We can do this with lapply
unlist(lapply(1:11, function(x) rev(seq(x))))
Or, as @zx8754 mentioned in the comments, : can be used in place of rev(seq(...)):
unlist(lapply(1:11, function(x) x:1))
Or, as @BrodieG suggested, we can make this more compact by removing the anonymous function call:
unlist(lapply(1:11, ":", 1))
And for fun, using matrices (and ignoring the warning: the length-6 vector c(1:5, 0) does not recycle evenly into the 25 cells, but that recycling is exactly what shifts each row by one, so the upper triangle read column-wise gives the desired runs ;) )
m <- matrix(c(1:5,0), ncol = 5, nrow = 5, byrow = T)
m[ upper.tri(m, diag = T) ]
# [1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1
And we can simplify the upper.tri into its component parts
m[ row(m) <= col(m)]
# [1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1
And if you can handle even more fun, then how about some benchmarking:
library(microbenchmark)
maxValue <- 1000
vec2 <- maxValue:1
m2 <- matrix(c(1:maxValue,0), ncol = maxValue, nrow = maxValue, byrow = T)
microbenchmark(
henrik = {
rev(sequence(maxValue:1))
},
henrik_4.0.0 = {
sequence(1:maxValue, from = 1:maxValue, by = -1)
},
akrun = {
unlist(lapply(1:maxValue, function(x) x:1))
},
symbolix1 = {
m <- matrix(c(1:maxValue,0), ncol = maxValue, nrow = maxValue, byrow = T)
m[ row(m) <= col(m) ]
},
symbolix2 = {
m2[ row(m2) <= col(m2) ]
},
lmo1 = {
unlist(lapply(1:maxValue, tail, x=maxValue:1))
},
lmo2 = {
vec <- maxValue:1
unlist(lapply(rev(vec), tail, x=vec))
},
lmo3 = {
unlist(lapply(rev(vec2), tail, x=vec2))
}
)
# Unit: microseconds
# expr min lq mean median uq max neval
# henrik 1018.7 1068.20 1176.430 1103.65 1223.20 2348.4 100
# henrik_4.0.0 139.9 147.90 166.092 151.40 162.70 379.0 100
# akrun 3420.1 3637.75 3825.336 3729.10 3897.00 4960.6 100
# symbolix1 6999.5 7483.20 7807.747 7618.30 7810.70 12138.7 100
# symbolix2 4791.2 5043.00 5677.742 5190.50 5393.65 29318.7 100
# lmo1 7530.1 7967.05 10918.201 8161.10 8566.45 132324.1 100
# lmo2 7385.7 8017.95 12271.158 8213.90 8500.70 143798.2 100
# lmo3 7539.5 7959.05 14355.810 8177.85 8500.85 131154.2 100
In this example, henrik_4.0.0 is the winner! (for benchmarks with the pre-R 4.0.0 sequence only, see previous edits)
But I know what you're thinking, 'why end all the fun there!'
Well, let's write our own C++ function and see how that performs
library(Rcpp)
cppFunction('NumericVector reverseSequence(int maxValue, int vectorLength){
NumericVector out(vectorLength);
int counter = 0;
for(int i = 1; i <= maxValue; i++){
for(int j = i; j > 0; j--){
out[counter] = j;
counter++;
}
}
return out;
}')
maxValue <- 5
reverseSequence(maxValue, sum(1:maxValue))
[1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1
library(microbenchmark)
maxValue <- 1000
microbenchmark(
akrun = {
unlist(sapply(1:maxValue, function(x) x:1))
},
symbolix3 = {
reverseSequence(maxValue, sum(1:maxValue))
}
)
# Unit: microseconds
# expr min lq mean median uq max neval
# akrun 1522.250 1631.6030 3148.922 1829.9370 3357.493 45576.148 100
# symbolix3 338.626 495.3825 1293.720 950.6635 2169.656 3816.091 100
Another alternative is to use tail within lapply, to successively select the number of elements to keep from the initial vector:
unlist(lapply(1:5, tail, x=5:1))
[1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1
Or, it may be faster to construct the base vector first and then call on it:
vec <- 5:1
unlist(lapply(rev(vec), tail, x=vec))
[1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1

Efficient implementation in computing pairwise differences

Suppose I have a data frame as follows:
> foo = data.frame(x = 1:9, id = c(1, 1, 2, 2, 2, 3, 3, 3, 3))
> foo
x id
1 1 1
2 2 1
3 3 2
4 4 2
5 5 2
6 6 3
7 7 3
8 8 3
9 9 3
I want a very efficient implementation of h(a, b) that computes the sum of all (a - xi)*(b - xj) for xi, xj belonging to the same id class. For example, my current implementation is
h <- function(a, b, foo) {
  a.diff = a - foo$x
  b.diff = b - foo$x
  prod = a.diff %*% t(b.diff)
  id.indicator = as.matrix(ifelse(dist(foo$id, diag = T, upper = T), 0, 1)) + diag(nrow(foo))
  return(sum(prod * id.indicator))
}
For example, with (a, b) = (0, 1), here is the output from each step in the function
> a.diff
[1] -1 -2 -3 -4 -5 -6 -7 -8 -9
> b.diff
[1] 0 -1 -2 -3 -4 -5 -6 -7 -8
> prod
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] 0 1 2 3 4 5 6 7 8
[2,] 0 2 4 6 8 10 12 14 16
[3,] 0 3 6 9 12 15 18 21 24
[4,] 0 4 8 12 16 20 24 28 32
[5,] 0 5 10 15 20 25 30 35 40
[6,] 0 6 12 18 24 30 36 42 48
[7,] 0 7 14 21 28 35 42 49 56
[8,] 0 8 16 24 32 40 48 56 64
[9,] 0 9 18 27 36 45 54 63 72
> id.indicator
1 2 3 4 5 6 7 8 9
1 1 1 0 0 0 0 0 0 0
2 1 1 0 0 0 0 0 0 0
3 0 0 1 1 1 0 0 0 0
4 0 0 1 1 1 0 0 0 0
5 0 0 1 1 1 0 0 0 0
6 0 0 0 0 0 1 1 1 1
7 0 0 0 0 0 1 1 1 1
8 0 0 0 0 0 1 1 1 1
9 0 0 0 0 0 1 1 1 1
In reality, there can be up to 1000 id clusters, each with at least 40 members, making this method too inefficient because of the sparse entries in id.indicator and the wasted computation in prod on the off-block-diagonal entries, which are never used.
I played around a bit. First, your implementation:
foo = data.frame(x = 1:9, id = c(1, 1, 2, 2, 2, 3, 3, 3, 3))
h <- function(a, b, foo){
a.diff = a - foo$x
b.diff = b - foo$x
prod = a.diff%*%t(b.diff)
id.indicator = as.matrix(ifelse(dist(foo$id, diag = T, upper = T),0,1)) +
diag(nrow(foo))
return(sum(prod*id.indicator))
}
h(a = 1, b = 0, foo = foo)
#[1] 891
Next, I tried a variant using a proper sparse matrix implementation (via the Matrix package) and functions for the index matrix. I also use tcrossprod which I often find to be a bit faster than a %*% t(b).
library("Matrix")
h2 <- function(a, b, foo) {
a.diff <- a - foo$x
b.diff <- b - foo$x
prod <- tcrossprod(a.diff, b.diff) # the same as a.diff%*%t(b.diff)
id.indicator <- do.call(bdiag, lapply(table(foo$id), function(n) matrix(1,n,n)))
return(sum(prod*id.indicator))
}
h2(a = 1, b = 0, foo = foo)
#[1] 891
Note that this function relies on foo$id being sorted.
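If that is not already the case, the rows can simply be ordered by id first, e.g.:
foo <- foo[order(foo$id), ]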
Lastly, I tried to avoid creating the full n-by-n matrix.
h3 <- function(a, b, foo) {
a.diff <- a - foo$x
b.diff <- b - foo$x
ids <- unique(foo$id)
res <- 0
for (i in seq_along(ids)) {
indx <- which(foo$id == ids[i])
res <- res + sum(tcrossprod(a.diff[indx], b.diff[indx]))
}
return(res)
}
h3(a = 1, b = 0, foo = foo)
#[1] 891
Benchmarking on your example:
library("microbenchmark")
microbenchmark(h(a = 1, b = 0, foo = foo),
h2(a = 1, b = 0, foo = foo),
h3(a = 1, b = 0, foo = foo))
# Unit: microseconds
# expr min lq mean median uq max neval
# h(a = 1, b = 0, foo = foo) 248.569 261.9530 493.2326 279.3530 298.2825 21267.890 100
# h2(a = 1, b = 0, foo = foo) 4793.546 4893.3550 5244.7925 5051.2915 5386.2855 8375.607 100
# h3(a = 1, b = 0, foo = foo) 213.386 227.1535 243.1576 234.6105 248.3775 334.612 100
Now, in this example, h3 is the fastest and h2 is really slow. But I guess both will beat h for larger examples, and h3 will probably still win. While there is plenty of room for more optimization, h3 should be faster and more memory efficient, so I think you should go for a variant of h3 that does not create unnecessarily large matrices; one possible sketch follows.
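For instance (a sketch, not part of the original answer), the double sum factorizes within each id block as (sum of a - xi) times (sum of b - xj), so the per-group tcrossprod can be avoided entirely:
h4 <- function(a, b, foo) {
  # per-group sums of the two difference vectors, multiplied and summed
  sum(tapply(a - foo$x, foo$id, sum) * tapply(b - foo$x, foo$id, sum))
}
h4(a = 1, b = 0, foo = foo)
#[1] 891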
tapply lets you apply a function across groups of a vector, and will simplify the results to a matrix or vector if it can. Using tcrossprod to multiply all the combinations for each group, it performs well even on suitably large data:
# setup
set.seed(47)
foo = data.frame(x = 1:9, id = c(1, 1, 2, 2, 2, 3, 3, 3, 3))
foo2 <- data.frame(id = sample(1000, 40000, TRUE), x = rnorm(40000))
h_OP <- function(a, b, foo){
a.diff = a - foo$x
b.diff = b - foo$x
prod = a.diff %*% t(b.diff)
id.indicator = as.matrix(ifelse(dist(foo$id, diag = T, upper = T),0,1)) + diag(nrow(foo))
return(sum(prod * id.indicator))
}
h3_AEBilgrau <- function(a, b, foo) {
a.diff <- a - foo$x
b.diff <- b - foo$x
ids <- unique(foo$id)
res <- 0
for (i in seq_along(ids)) {
indx <- which(foo$id == ids[i])
res <- res + sum(tcrossprod(a.diff[indx], b.diff[indx]))
}
return(res)
}
h_d.b <- function(a, b, foo){
sum(sapply(split(foo, foo$id), function(d) sum(outer(a-d$x, b-d$x))))
}
h_alistaire <- function(a, b, foo){
sum(tapply(foo$x, foo$id, function(x){sum(tcrossprod(a - x, b - x))}))
}
All return the same thing, and are not that different on small data:
h_OP(0, 1, foo)
#> [1] 891
h3_AEBilgrau(0, 1, foo)
#> [1] 891
h_d.b(0, 1, foo)
#> [1] 891
h_alistaire(0, 1, foo)
#> [1] 891
# small data test
microbenchmark::microbenchmark(
h_OP(0, 1, foo),
h3_AEBilgrau(0, 1, foo),
h_d.b(0, 1, foo),
h_alistaire(0, 1, foo)
)
#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> h_OP(0, 1, foo) 143.749 157.8895 189.5092 189.7235 214.3115 262.258 100 b
#> h3_AEBilgrau(0, 1, foo) 80.970 93.8195 112.0045 106.9285 125.9835 225.855 100 a
#> h_d.b(0, 1, foo) 355.084 381.0385 467.3812 437.5135 516.8630 2056.972 100 c
#> h_alistaire(0, 1, foo) 148.735 165.1360 194.7361 189.9140 216.7810 287.990 100 b
On bigger data, the differences become more stark, though. The original threatened to crash my laptop, but here are benchmarks for the fastest two:
# on 1k groups, 40k rows
microbenchmark::microbenchmark(
h3_AEBilgrau(0, 1, foo2),
h_alistaire(0, 1, foo2)
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> h3_AEBilgrau(0, 1, foo2) 336.98199 403.04104 412.06778 410.52391 423.33008 443.8286 100 b
#> h_alistaire(0, 1, foo2) 14.00472 16.25852 18.07865 17.22296 18.09425 96.9157 100 a
Another possibility is to use a data.frame to summarize by group, then sum the appropriate column. In base R you'd do this with aggregate, but dplyr and data.table are popular for making such an approach simpler with more complicated aggregations.
aggregate is slower than tapply. dplyr is faster than aggregate, but still slower than tapply. data.table, which is designed for speed, is almost exactly as fast as tapply.
library(dplyr)
library(data.table)
h_aggregate <- function(a, b, foo){sum(aggregate(x ~ id, foo, function(x){sum(tcrossprod(a - x, b - x))})$x)}
tidy_h <- function(a, b, foo){foo %>% group_by(id) %>% summarise(x = sum(tcrossprod(a - x, b - x))) %>% select(x) %>% sum()}
h_dt <- function(a, b, foo){setDT(foo)[, .(x = sum(tcrossprod(a - x, b - x))), by = id][, sum(x)]}
microbenchmark::microbenchmark(
h_alistaire(1, 0, foo2),
h_aggregate(1, 0, foo2),
tidy_h(1, 0, foo2),
h_dt(1, 0, foo2)
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> h_alistaire(1, 0, foo2) 13.30518 15.52003 18.64940 16.48818 18.13686 62.35675 100 a
#> h_aggregate(1, 0, foo2) 93.08401 96.61465 107.14391 99.16724 107.51852 143.16473 100 c
#> tidy_h(1, 0, foo2) 39.47244 42.22901 45.05550 43.94508 45.90303 90.91765 100 b
#> h_dt(1, 0, foo2) 13.31817 15.09805 17.27085 16.46967 17.51346 56.34200 100 a
sum(sapply(split(foo, foo$id), function(d) sum(outer(a-d$x, b-d$x))))
#[1] 891
#TESTING
# a and b assumed to be defined as in the question, e.g. a <- 0; b <- 1
foo = data.frame(x = sample(1:9, 10000, replace = TRUE),
                 id = sample(1:3, 10000, replace = TRUE))
system.time(sum(sapply(split(foo, foo$id), function(d) sum(outer(a-d$x, b-d$x)))))
#  user  system elapsed
#  0.15    0.01    0.17

Most efficient way to turn factor matrix into binary (indicator) matrix in R

I can think of several ways to turn a matrix (data frame) of this type:
dat = data.frame(
x1 = rep(c('a', 'b'), 100),
x2 = rep(c('x', 'y'), 100)
)
head(dat)
x1 x2
1 a x
2 b y
3 a x
4 b y
5 a x
6 b y
Into a binary (indicator) matrix (or data frame) like this:
a b x y
1 0 1 0
0 1 0 1
...
(This structure is, of course, trivial and only for illustrative purposes!)
Many thanks!
We can use table
tbl <- table(rep(1:nrow(dat),2),unlist(dat))
head(tbl, 2)
# a b x y
# 1 1 0 1 0
# 2 0 1 0 1
Or a possibly efficient option would be
library(Matrix)
sM <- sparse.model.matrix(~ -1 + x1 +x2, dat,
contrasts.arg = lapply(dat, contrasts, contrasts = FALSE))
colnames(sM) <- sub(".*\\d", "", colnames(sM))
head(sM, 2)
# 2 x 4 sparse Matrix of class "dgCMatrix"
# a b x y
#1 1 . 1 .
#2 . 1 . 1
It can be converted to an ordinary dense 0/1 matrix with as.matrix
head(as.matrix(sM),2)
# a b x y
#1 1 0 1 0
#2 0 1 0 1
There are some good solutions posted already, but none are optimal for performance. We can optimize performance by looping over each input column, and then looping over each factor level index within each input column and doing a straight integer comparison of the factor indexes. It's not the most concise or elegant piece of code, but it's fairly straightforward and fast:
do.call(cbind,lapply(dat,function(col)
`colnames<-`(do.call(cbind,lapply(seq_along(levels(col)),function(i)
as.integer(as.integer(col)==i)
)),levels(col))
));
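A side note (not in the original answer): this relies on the columns being factors, since it loops over levels(col). From R 4.0.0 onwards data.frame() defaults to stringsAsFactors = FALSE, so with newer R the example data may need converting first, e.g.:
dat[] <- lapply(dat, factor)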
Performance:
library(Matrix);
library(data.table);
library(microbenchmark);
bgoldst <- function(dat) do.call(cbind,lapply(dat,function(col) `colnames<-`(do.call(cbind,lapply(seq_along(levels(col)),function(i) as.integer(as.integer(col)==i))),levels(col))));
akrun1 <- function(dat) table(rep(1:nrow(dat),2),unlist(dat));
akrun2 <- function(dat) sparse.model.matrix(~-1+x1+x2,dat,contrasts.arg=lapply(dat,contrasts,contrasts=FALSE));
davidar <- function(dat) { dat[,rowid:=.I]; dcast(melt(dat,id='rowid'),rowid~value,length); }; ## requires a data.table
dataminer <- function(dat) t(apply(dat,1,function(x) as.numeric(unique(unlist(dat))%in%x)));
N <- 100L; dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
identical(unname(bgoldst(dat)),matrix(as.vector(akrun1(dat)),ncol=4L));
## [1] TRUE
identical(unname(bgoldst(dat)),unname(matrix(as.integer(as.matrix(akrun2(dat))),ncol=4L)));
## [1] TRUE
identical(bgoldst(dat),as.matrix(davidar(datDT)[,rowid:=NULL]));
## [1] TRUE
identical(unname(bgoldst(dat)),matrix(as.integer(dataminer(dat)),ncol=4L));
## [1] TRUE
N <- 100L;
dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
microbenchmark(bgoldst(dat),akrun1(dat),akrun2(dat),davidar(datDT),dataminer(dat));
## Unit: microseconds
## expr min lq mean median uq max neval
## bgoldst(dat) 67.570 92.374 106.2853 99.6440 121.2405 188.596 100
## akrun1(dat) 581.182 652.386 773.6300 690.6605 916.4625 1192.299 100
## akrun2(dat) 4429.208 4836.119 5554.5902 5145.3135 5977.0990 11263.537 100
## davidar(datDT) 5064.273 5498.555 6104.7621 5664.9115 6203.9695 11713.856 100
## dataminer(dat) 47577.729 49529.753 55217.3726 53190.8940 60041.9020 74346.268 100
N <- 1e4L;
dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
microbenchmark(bgoldst(dat),akrun1(dat),akrun2(dat),davidar(datDT));
## Unit: milliseconds
## expr min lq mean median uq max neval
## bgoldst(dat) 1.775617 1.820949 2.299493 1.84725 1.972124 8.362336 100
## akrun1(dat) 38.954524 41.109257 48.409613 45.60304 52.147633 162.365472 100
## akrun2(dat) 16.915832 17.762799 21.288200 19.20164 23.775180 46.494055 100
## davidar(datDT) 36.151684 38.366715 42.875940 42.38794 45.916937 58.695008 100
N <- 1e5L;
dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
microbenchmark(bgoldst(dat),akrun1(dat),akrun2(dat),davidar(datDT));
## Unit: milliseconds
## expr min lq mean median uq max neval
## bgoldst(dat) 17.16473 22.97654 35.01815 26.76662 31.75562 152.6188 100
## akrun1(dat) 501.72644 626.14494 671.98315 680.91152 727.88262 828.8313 100
## akrun2(dat) 212.12381 242.65505 298.90254 272.28203 357.65106 429.6023 100
## davidar(datDT) 368.04924 461.60078 500.99431 511.54921 540.39358 638.3840 100
If you have a data.frame as you are showing (not a matrix), you could just as well recast the data
library(data.table)
setDT(dat)[, rowid := .I] # Creates a row index
res <- dcast(melt(dat, id = "rowid"), rowid ~ value, length) # long/wide format
head(res)
# rowid a b x y
# 1 1 1 0 1 0
# 2 2 0 1 0 1
# 3 3 1 0 1 0
# 4 4 0 1 0 1
# 5 5 1 0 1 0
# 6 6 0 1 0 1
Some benchmarks
dat = data.frame(
x1 = rep(c('a', 'b'), 1e3),
x2 = rep(c('x', 'y'), 1e3)
)
library(data.table)
library(Matrix)
library(microbenchmark)
dat2 <- copy(dat)
microbenchmark("akrun1 : " = table(rep(1:nrow(dat),2),unlist(dat)),
"akrun2 : " = sparse.model.matrix(~ -1 + x1 +x2, dat, contrasts.arg = lapply(dat, contrasts, contrasts = FALSE)),
"DatamineR : " = t(apply(dat,1, function(x) as.numeric(unique(unlist(dat)) %in% x))),
"David Ar : " = {setDT(dat2)[, rowid := .I] ; dcast(melt(dat2, id = "rowid"), rowid ~ value, length)},
times = 10L)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# akrun1 : 3.826075 4.061904 6.654399 5.165376 11.26959 11.82029 10 a
# akrun2 : 5.269531 5.713672 8.794434 5.943422 13.34118 20.01961 10 a
# DatamineR : 3199.336286 3343.774160 3410.618547 3385.756972 3517.22133 3625.70909 10 b
# David Ar : 8.092769 8.254682 11.030785 8.465232 15.44893 19.83914 10 a
The apply solution is highly inefficient and will take forever on a bigger data set. Comparing for a bigger data set while excluding the apply solution
dat = data.frame(
x1 = rep(c('a', 'b'), 1e4),
x2 = rep(c('x', 'y'), 1e4)
)
dat2 <- copy(dat)
microbenchmark("akrun1 : " = table(rep(1:nrow(dat),2),unlist(dat)),
"akrun2 : " = sparse.model.matrix(~ -1 + x1 +x2, dat, contrasts.arg = lapply(dat, contrasts, contrasts = FALSE)),
#"DatamineR : " = t(apply(dat,1, function(x) as.numeric(unique(unlist(dat)) %in% x))),
"David Ar : " = {setDT(dat2)[, rowid := .I] ; dcast(melt(dat2, id = "rowid"), rowid ~ value, length)},
times = 100L)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# akrun1 : 38.66744 41.27116 52.97982 42.72534 47.17203 161.0420 100 b
# akrun2 : 17.02006 18.93534 27.27582 19.35580 20.72022 153.2397 100 a
# David Ar : 34.15915 37.91659 46.11050 38.58536 41.40412 149.0038 100 b
Seems like the Matrix package shines for bigger data sets. It's probably worth comparing different scenarios with more columns/unique values too; a sketch of such a test case follows.
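For instance (a hypothetical test case, not benchmarked here), a wider data set with more levels per column could be generated like this, keeping the columns as factors since some of the solutions rely on factor levels:
set.seed(1)
datWide <- data.frame(replicate(5, sample(letters[1:10], 2e4, replace = TRUE)),
                      stringsAsFactors = TRUE)
str(datWide)  # 5 factor columns with (up to) 10 levels each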
One alternative using apply
head(t(apply(dat,1, function(x) as.numeric(unique(unlist(dat)) %in% x))))
[,1] [,2] [,3] [,4]
[1,] 1 0 1 0
[2,] 0 1 0 1
[3,] 1 0 1 0
[4,] 0 1 0 1
[5,] 1 0 1 0
[6,] 0 1 0 1

How to delete rows where all the columns are zero

I have the following data frame
dat <- data.frame(a = c(0,0,2,3), b= c(1,0,0,0), c=c(0,0,1,3))
Which prints:
> dat
a b c
1 0 1 0
2 0 0 0
3 2 0 1
4 3 0 3
I want to remove rows where all the columns are zeros,
resulting in this:
a b c
1 0 1 0
3 2 0 1
4 3 0 3
How can I achieve that?
I tried this but failed:
> row_sub = apply(dat, 1, function(row) all(row !=0 ))
> dat[row_sub,]
[1] a b c
<0 rows> (or 0-length row.names)
You can use (1)
dat[as.logical(rowSums(dat != 0)), ]
This works for both positive and negative values.
Another, even faster, possibility for large datasets is (2)
dat[rowSums(!as.matrix(dat)) < ncol(dat), ]
A faster approach for short and long data frames is to use matrix multiplication (3):
dat[as.logical(abs(as.matrix(dat)) %*% rep(1L, ncol(dat))), ]
Some benchmarks:
# the original dataset
dat <- data.frame(a = c(0,0,2,3), b= c(1,0,0,0), c=c(0,0,1,3))
Codoremifa <- function() dat[rowSums(abs(dat)) != 0,]
Marco <- function() dat[!apply(dat, 1, function(x) all(x == 0)), ]
Sven <- function() dat[as.logical(rowSums(dat != 0)), ]
Sven_2 <- function() dat[rowSums(!as.matrix(dat)) < ncol(dat), ]
Sven_3 <- function() dat[as.logical(abs(as.matrix(dat)) %*% rep(1L,ncol(dat))), ]
library(microbenchmark)
microbenchmark(Codoremifa(), Marco(), Sven(), Sven_2(), Sven_3())
# Unit: microseconds
# expr min lq median uq max neval
# Codoremifa() 267.772 273.2145 277.1015 284.0995 1190.197 100
# Marco() 192.509 198.4190 201.2175 208.9925 265.594 100
# Sven() 143.372 147.7260 150.0585 153.9455 227.031 100
# Sven_2() 152.080 155.1900 156.9000 161.5650 214.591 100
# Sven_3() 146.793 151.1460 153.3235 157.9885 187.845 100
# a data frame with 10.000 rows
set.seed(1)
dat <- dat[sample(nrow(dat), 10000, TRUE), ]
microbenchmark(Codoremifa(), Marco(), Sven(), Sven_2(), Sven_3())
# Unit: milliseconds
# expr min lq median uq max neval
# Codoremifa() 2.426419 2.471204 3.488017 3.750189 84.268432 100
# Marco() 36.268766 37.840246 39.406751 40.791321 119.233175 100
# Sven() 2.145587 2.184150 2.205299 2.270764 83.055534 100
# Sven_2() 2.007814 2.048711 2.077167 2.207942 84.944856 100
# Sven_3() 1.814994 1.844229 1.861022 1.917779 4.452892 100
Why use sum? It is much more efficient to simply check whether all elements are zero.
I would do
dat = dat[!apply(dat, 1, function(x) all(x == 0)), ]
If you need to keep track of which rows were removed:
indremoved = which(apply(dat, 1, function(x) all(x == 0)) )
dat = dat[ -indremoved, ]
Try dat[rowSums(abs(dat)) != 0,].
Shorter and more efficient (at least on my machine) is to use Reduce and `|`, which ORs the columns element-wise so that a row is kept whenever any of its entries is nonzero:
dat <- data.frame(a = c(0,0,2,3), b= c(1,0,0,0), c=c(0,0,1,3))
dat[Reduce(`|`,dat),]
# a b c
# 1 0 1 0
# 3 2 0 1
# 4 3 0 3
Handling NAs
The current solutions don't handle NAs. To adapt mine (using the example from: How to remove rows with all zeros without using rowSums in R?):
dat2 <- data.frame(a=c(0,0,0,0),b=c(0,-1,NA,1),c=c(0,1,0,-1),d=c(0,NA,0,0), e=c(0,0,NA,1))
# a b c d e
# 1 0 0 0 0 0
# 2 0 -1 1 NA 0
# 3 0 NA 0 0 NA
# 4 0 1 -1 0 1
If you want to remove rows containing NAs AND zeros
dat2[Reduce(`|`,`[<-`(dat2,is.na(dat2),value=0)),]
# a b c d e
# 2 0 -1 1 NA 0
# 4 0 1 -1 0 1
If you want to keep them:
dat2[Reduce(`|`,`[<-`(dat2,is.na(dat2),value=1)),]
# a b c d e
# 2 0 -1 1 NA 0
# 3 0 NA 0 0 NA
# 4 0 1 -1 0 1
Updated benchmark (all methods assuming no NAs)
dat <- data.frame(a = c(0,0,2,3), b= c(1,0,0,0), c=c(0,0,1,3))
mm <- function() dat[Reduce(`|`,dat),]
microbenchmark(Codoremifa(), Marco(), Sven(), Sven_2(), Sven_3(),mm(),unit='relative',times=50)
# Unit: relative
# expr min lq mean median uq max neval
# Codoremifa() 4.060050 4.020630 3.979949 3.921504 3.814334 4.517048 50
# Marco() 2.473624 2.358608 2.397922 2.444411 2.431119 2.365830 50
# Sven() 1.932279 1.937906 1.954935 2.013045 1.999980 1.960975 50
# Sven_2() 1.857111 1.834460 1.871929 1.885606 1.898201 2.595113 50
# Sven_3() 1.781943 1.731038 1.814738 1.800647 1.766469 3.346325 50
# mm() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 50
# a data frame with 10.000 rows
set.seed(1)
dat <- dat[sample(nrow(dat), 10000, TRUE), ]
library(microbenchmark)
microbenchmark(Codoremifa(), Marco(), Sven(), Sven_2(), Sven_3(),mm(),unit='relative',times=50)
# Unit: relative
# expr min lq mean median uq max neval
# Codoremifa() 1.395990 1.496361 3.224857 1.520903 3.146186 26.793544 50
# Marco() 35.794446 36.015642 29.930283 35.625356 34.414162 13.379470 50
# Sven() 1.347117 1.363027 1.473354 1.375143 1.408369 1.175388 50
# Sven_2() 1.268169 1.281210 1.466629 1.299255 1.355403 2.605840 50
# Sven_3() 1.067669 1.124846 1.380731 1.122851 1.191207 2.384538 50
# mm() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 50
dat[as.logical(rowSums(abs(dat))), ], or, if you want to handle NAs in the all-zero rows as well, dat[as.logical(rowSums(abs(dat), na.rm = TRUE)), ]
We can use
dat[t(sapply( data.frame(t(dat)) , \(x) any(x != 0))) , ]
output
a b c
1 0 1 0
3 2 0 1
4 3 0 3
