I have the following data frame
dat <- data.frame(a = c(0,0,2,3), b= c(1,0,0,0), c=c(0,0,1,3))
Which prints:
> dat
a b c
1 0 1 0
2 0 0 0
3 2 0 1
4 3 0 3
I want to remove rows where all the columns are zeros,
resulting in this:
a b c
1 0 1 0
3 2 0 1
4 3 0 3
How can I achieve that?
I tried this but failed:
> row_sub = apply(dat, 1, function(row) all(row !=0 ))
> dat[row_sub,]
[1] a b c
<0 rows> (or 0-length row.names)
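The attempt fails because all(row != 0) keeps only rows in which every value is nonzero, so any row containing a single zero gets dropped. The condition needed is "at least one value is nonzero":
row_sub = apply(dat, 1, function(row) any(row != 0))
dat[row_sub, ]
#   a b c
# 1 0 1 0
# 3 2 0 1
# 4 3 0 3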
You can use (1)
dat[as.logical(rowSums(dat != 0)), ]
This works for both positive and negative values.
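To see why (1) works: dat != 0 gives a logical matrix with TRUE wherever an entry is nonzero, rowSums() counts the TRUEs per row, and as.logical() turns every nonzero count into TRUE:
rowSums(dat != 0)
# 1 2 3 4
# 1 0 2 3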
Another, even faster, possibility for large datasets is (2)
dat[rowSums(!as.matrix(dat)) < ncol(dat), ]
A faster approach for short and long data frames is to use matrix multiplication (3):
dat[as.logical(abs(as.matrix(dat)) %*% rep(1L, ncol(dat))), ]
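The matrix product in (3) is just a fast way to compute row sums of the absolute values, so the logic is the same as filtering on rowSums(abs(dat)):
abs(as.matrix(dat)) %*% rep(1L, ncol(dat))
#   [,1]
# 1    1
# 2    0
# 3    3
# 4    6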
Some benchmarks:
# the original dataset
dat <- data.frame(a = c(0,0,2,3), b= c(1,0,0,0), c=c(0,0,1,3))
Codoremifa <- function() dat[rowSums(abs(dat)) != 0,]
Marco <- function() dat[!apply(dat, 1, function(x) all(x == 0)), ]
Sven <- function() dat[as.logical(rowSums(dat != 0)), ]
Sven_2 <- function() dat[rowSums(!as.matrix(dat)) < ncol(dat), ]
Sven_3 <- function() dat[as.logical(abs(as.matrix(dat)) %*% rep(1L,ncol(dat))), ]
library(microbenchmark)
microbenchmark(Codoremifa(), Marco(), Sven(), Sven_2(), Sven_3())
# Unit: microseconds
# expr min lq median uq max neval
# Codoremifa() 267.772 273.2145 277.1015 284.0995 1190.197 100
# Marco() 192.509 198.4190 201.2175 208.9925 265.594 100
# Sven() 143.372 147.7260 150.0585 153.9455 227.031 100
# Sven_2() 152.080 155.1900 156.9000 161.5650 214.591 100
# Sven_3() 146.793 151.1460 153.3235 157.9885 187.845 100
# a data frame with 10,000 rows
set.seed(1)
dat <- dat[sample(nrow(dat), 10000, TRUE), ]
microbenchmark(Codoremifa(), Marco(), Sven(), Sven_2(), Sven_3())
# Unit: milliseconds
# expr min lq median uq max neval
# Codoremifa() 2.426419 2.471204 3.488017 3.750189 84.268432 100
# Marco() 36.268766 37.840246 39.406751 40.791321 119.233175 100
# Sven() 2.145587 2.184150 2.205299 2.270764 83.055534 100
# Sven_2() 2.007814 2.048711 2.077167 2.207942 84.944856 100
# Sven_3() 1.814994 1.844229 1.861022 1.917779 4.452892 100
Why use sums? It is more direct to simply check whether all elements are zero.
I would do
dat = dat[!apply(dat, 1, function(x) all(x == 0)), ]
If you need to keep track of which rows were removed:
indremoved = which(apply(dat, 1, function(x) all(x == 0)) )
dat = dat[ -indremoved, ]
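One caution worth adding here: if no rows are all-zero, which() returns integer(0), and dat[-integer(0), ] selects zero rows rather than all of them. A guard avoids that edge case:
if (length(indremoved) > 0) dat = dat[-indremoved, ]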
Try dat[rowSums(abs(dat)) != 0,].
Shorter and more efficient (at least on my machine) is to use Reduce and |
dat <- data.frame(a = c(0,0,2,3), b= c(1,0,0,0), c=c(0,0,1,3))
dat[Reduce(`|`,dat),]
# a b c
# 1 0 1 0
# 3 2 0 1
# 4 3 0 3
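Reduce(`|`, dat) ORs the columns elementwise; the numeric values are coerced to logical along the way (0 is FALSE, anything else is TRUE), so the result flags every row with at least one nonzero entry:
Reduce(`|`, dat)
# [1]  TRUE FALSE  TRUE  TRUE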
Handling NAs
The current solutions don't handle NAs. To adapt mine, using the example from How to remove rows with all zeros without using rowSums in R?:
dat2 <- data.frame(a=c(0,0,0,0),b=c(0,-1,NA,1),c=c(0,1,0,-1),d=c(0,NA,0,0), e=c(0,0,NA,1))
# a b c d e
# 1 0 0 0 0 0
# 2 0 -1 1 NA 0
# 3 0 NA 0 0 NA
# 4 0 1 -1 0 1
If you want to remove rows that contain only NAs and zeros:
dat2[Reduce(`|`, `[<-`(dat2, is.na(dat2), value = 0)), ]
# a b c d e
# 2 0 -1 1 NA 0
# 4 0 1 -1 0 1
If you want to keep them:
dat2[Reduce(`|`, `[<-`(dat2, is.na(dat2), value = 1)), ]
# a b c d e
# 2 0 -1 1 NA 0
# 3 0 NA 0 0 NA
# 4 0 1 -1 0 1
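The `[<-`(dat2, is.na(dat2), value = 0) construct is the functional form of dat2[is.na(dat2)] <- 0; it returns a modified copy that is used only to build the row filter, leaving dat2 itself untouched:
tmp <- dat2
tmp[is.na(tmp)] <- 0
identical(tmp, `[<-`(dat2, is.na(dat2), value = 0))
# [1] TRUE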
Updated benchmark (all methods assuming no NAs)
dat <- data.frame(a = c(0,0,2,3), b= c(1,0,0,0), c=c(0,0,1,3))
mm <- function() dat[Reduce(`|`,dat),]
microbenchmark(Codoremifa(), Marco(), Sven(), Sven_2(), Sven_3(),mm(),unit='relative',times=50)
# Unit: relative
# expr min lq mean median uq max neval
# Codoremifa() 4.060050 4.020630 3.979949 3.921504 3.814334 4.517048 50
# Marco() 2.473624 2.358608 2.397922 2.444411 2.431119 2.365830 50
# Sven() 1.932279 1.937906 1.954935 2.013045 1.999980 1.960975 50
# Sven_2() 1.857111 1.834460 1.871929 1.885606 1.898201 2.595113 50
# Sven_3() 1.781943 1.731038 1.814738 1.800647 1.766469 3.346325 50
# mm() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 50
# a data frame with 10,000 rows
set.seed(1)
dat <- dat[sample(nrow(dat), 10000, TRUE), ]
library(microbenchmark)
microbenchmark(Codoremifa(), Marco(), Sven(), Sven_2(), Sven_3(),mm(),unit='relative',times=50)
# Unit: relative
# expr min lq mean median uq max neval
# Codoremifa() 1.395990 1.496361 3.224857 1.520903 3.146186 26.793544 50
# Marco() 35.794446 36.015642 29.930283 35.625356 34.414162 13.379470 50
# Sven() 1.347117 1.363027 1.473354 1.375143 1.408369 1.175388 50
# Sven_2() 1.268169 1.281210 1.466629 1.299255 1.355403 2.605840 50
# Sven_3() 1.067669 1.124846 1.380731 1.122851 1.191207 2.384538 50
# mm() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 50
dat[as.logical(rowSums(abs(dat))), ], or, if you also want to handle NAs in otherwise all-zero rows, dat[as.logical(rowSums(abs(dat), na.rm = TRUE)), ]
We can use
dat[t(sapply( data.frame(t(dat)) , \(x) any(x != 0))) , ]
Output:
a b c
1 0 1 0
3 2 0 1
4 3 0 3
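The double transpose is only there to make sapply() iterate over rows; a plainer spelling of the same logic (not from the original answer) is:
dat[apply(dat, 1, \(x) any(x != 0)), ]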
I want to determine the number of duplicate values per row for certain columns in a big data table. A simple example with the desired output:
test <- data.table(a=c(1,2,3),b=c(1,4,6),c=c(5,6,9),duplicatercds=c(1,0,0))
Is there a command for this?
You can melt into long format before calculating the dupes:
library(data.table)
DT <- data.table(a=c(1,2,3),b=c(1,4,6),c=c(5,6,9))
stat <- melt(DT[, rn:=.I], id.vars="rn")[,
.(duplicatercds=.N - uniqueN(value)), by=.(rn)]
DT[stat, duplicatercds := duplicatercds, on=.(rn)]
Be careful when using uniqueN on double values because of machine precision.
There is a GitHub feature request for this: https://github.com/Rdatatable/data.table/issues/1063
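A quick illustration of the pitfall:
x <- c(0.1 + 0.2, 0.3)
x           # both print as 0.3
uniqueN(x)  # 2 -- the values differ in their last bits, so neither counts as a duplicate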
EDIT:
Another cleaner method
DT[, duplicatercds := apply(.SD, 1, function(x) length(x) - uniqueN(x))]
EDIT: Added some timings:
library(data.table)
set.seed(0L)
ncols <- 10L
nrows <- 1e4L
uniqVal <- seq_len(1000L)
test <- as.data.table(matrix(sample(uniqVal, nrows*ncols, replace=TRUE), nrow=nrows))
test[, duplicatercds := NA_real_]
f1 <- function() test[, apply(.SD, 1, function(x) { y <- table(x); sum(y) - length(y) }) ]
f2 <- function() test[, sum(table(unlist(.SD)) > 1), by=.(1:nrows)]$V1
f3 <- function() test[, apply(test, 1, function(x) sum(diff(sort(x))==0))]
f4 <- function() test[, apply(.SD, 1, function(x) length(x) - uniqueN(x))]
f5 <- function() test[, ncols - vapply(transpose(.SD), uniqueN, 1L) + 1L]
identical(f2(), f1())
# [1] FALSE
identical(f3(), f1())
# [1] TRUE
identical(f4(), f1())
# [1] TRUE
identical(unname(f5()), f1())
# [1] TRUE
library(microbenchmark)
microbenchmark(f1(), f2(), f3(), f4(), f5(), times=5L)
# Unit: milliseconds
# expr min lq mean median uq max neval
# f1() 1883.7817 1903.7626 1940.5378 1922.6539 1981.1139 2011.3771 5
# f2() 1821.0437 1901.1188 1933.8926 1908.4297 1999.6216 2039.2491 5
# f3() 657.4502 666.6721 679.5539 672.6617 686.4095 714.5760 5
# f4() 167.8048 168.5211 174.3660 169.9920 180.1304 185.3816 5
# f5() 146.0255 154.6341 159.4630 160.1968 164.3369 172.1219 5
Let's assume you don't have that last column; then you can get your desired result (modulo your clarifying comment) with:
test[ ,duplicatercds := apply(.SD, 1, function(x) {sum(table(x))-length(table(x))}),
by=1:nrow(test) ]
> test
a b c duplicatercds
1: 1 1 5 1
2: 2 4 6 0
3: 3 6 9 0
And a 'test' with a more complex example:
> test <- data.table(a=c(1,2,3),b=c(1,4,6),c=c(5,6,9), d=c(1,2,3), c=c(5,6,9))
> test
a b c d c
1: 1 1 5 1 5
2: 2 4 6 2 6
3: 3 6 9 3 9
> test[ , duplicatercds := apply(.SD, 1, function(x) {sum(table(x))-length(table(x))}), by=1:nrow(test) ]
> test
a b c d c duplicatercds
1: 1 1 5 1 5 3
2: 2 4 6 2 6 2
3: 3 6 9 3 9 2
Or maybe:
test[ , duplicatercds := apply(.SD, 1,
function(x) {sum(table(x))-length(table(x))}) ]
You can build a table and count how many values have a frequency of more than 1:
test = test[, 1:3]  # remove your duplicatercds column
test[, duplicatercds:=sum(table(unlist(.SD))>1),by=.(1:nrow(test))][]
a b c duplicatercds
1: 1 1 5 1
2: 2 4 6 0
3: 3 6 9 0
One solution is to use diff as part of an apply call.
test <- data.table(a=c(1,2,3),b=c(1,4,6),c=c(5,6,9))
test$dup <- apply(test,1,function(x)sum(diff(sort(x))==0))
test
# a b c dup
# 1: 1 1 5 1
# 2: 2 4 6 0
# 3: 3 6 9 0
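This works because sort() places equal values next to each other, so diff() is zero exactly once per adjacent duplicate pair:
x <- c(1, 1, 5)
diff(sort(x))            # 0 4
sum(diff(sort(x)) == 0)  # 1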
For a list:
terms <- list(Item1 = c("a", "b", "c", "d"),
Item2 = c("a", "e", "f", "g"),
Item3 = c("b", "e", "h", "i"),
Item4 = c("j", "k"))
I would like to get the number of shared letters between each pair of items in the list. The expected output is therefore:
[,1] [,2] [,3] [,4]
[1,] 4 1 1 0
[2,] 1 4 1 0
[3,] 1 1 4 0
[4,] 0 0 0 2
From a previous StackOverflow answer, I found one possible solution:
overlapLength <- function(x, y) mapply(function(x, y)
length(intersect(x, y)), terms[x], terms[y])
s <- seq_along(terms)
outer(s, s, overlapLength)
But this is very slow for my list, which is very large (~9,000 items).
Is there a faster way to do this?
Thanks everyone for your input. I timed all answers with the first 100 items of my list.
> system.time(f_crossprod(go))
user system elapsed
0.024 0.001 0.025
> system.time(f_crossprod2(go))
user system elapsed
0.007 0.000 0.008
> system.time(f_mapply(go))
user system elapsed
2.018 0.032 2.059
> system.time(f_outer(go))
user system elapsed
1.950 0.016 1.979
> system.time(f_combn(go))
user system elapsed
1.056 0.005 1.062
> system.time(f_Rcpp(go))
user system elapsed
163.236 84.226 249.240
I then timed the outer and Matrix::crossprod solutions with the entire list of ~9,000 elements. The outer solution ran in about 55 minutes. The Matrix::crossprod solution ran in about 0.1 seconds!
It is possible I have made an error in my implementation of the Rcpp function. However, @alexis_laz, if you make your comment an answer I will accept it.
By the way, sorry I was not clear, I am not interested in the values on the diagonal.
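For reference, the crossprod idea from the comments builds a binary incidence table (letters x items) and multiplies it with itself; entry (i, j) of the result counts the letters shared by items i and j, assuming no letter is repeated within an item. A minimal sketch:
m <- table(stack(terms))  # letters x items incidence table
crossprod(m)              # t(m) %*% m: pairwise shared-letter counts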
We can use outer
outer(names(terms), names(terms), FUN = function(x,y)
lengths(Map(intersect, terms[x], terms[y])))
# [,1] [,2] [,3] [,4]
#[1,] 4 1 1 0
#[2,] 1 4 1 0
#[3,] 1 1 4 0
#[4,] 0 0 0 2
Or more compactly
outer(terms, terms, FUN = function(...) lengths(Map(intersect, ...)))
# Item1 Item2 Item3 Item4
#Item1 4 1 1 0
#Item2 1 4 1 0
#Item3 1 1 4 0
#Item4 0 0 0 2
We could also implement this in Rcpp. Below is the test1.cpp file
#include <Rcpp.h>
#include <math.h>
using namespace Rcpp;
//[[Rcpp::export]]
List foo(List xs) {
  List x1 = Rcpp::clone(xs);
  List y1 = Rcpp::clone(xs);
  int n = x1.size();
  NumericVector res;
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < n; j++) {
      CharacterVector xd = x1[i];
      CharacterVector yd = y1[j];
      // push_back copies 'res' on every call, which is slow for large n
      res.push_back(intersect(xd, yd).length());
    }
  }
  return wrap(res);
}
We call it in R using
library(Rcpp)
sourceCpp("test1.cpp")
`dim<-`(unlist(foo(terms)), c(4, 4))
# [,1] [,2] [,3] [,4]
#[1,] 4 1 1 0
#[2,] 1 4 1 0
#[3,] 1 1 4 0
#[4,] 0 0 0 2
Benchmarks
In addition to the functions above, we included another version with an RcppEigen implementation that was posted here
n <- 100
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
replace = TRUE)), paste0("Item", seq_len(n)))
library(Matrix)
library(inline)
library(Rcpp)
alexis1 <- function() {crossprod(table(stack(terms1)))}
alexis2 <- function() {Matrix::crossprod(xtabs( ~ values + ind,
stack(terms1), sparse = TRUE)) }
akrun1 <- function(){outer(terms1, terms1, FUN = function(...) lengths(Map(intersect, ...)))}
akrun2 <- function() {`dim<-`(unlist(foo(terms1)), c(n, n))}
akrun3 <- function() {tbl <- table(stack(terms1))
funCPr(tbl, tbl)[[1]]}
db <- function() {do.call(rbind, lapply(1:length(terms1), function(i)
sapply(terms1, function(a)
sum(unlist(terms1[i]) %in% unlist(a)))))}
lmo <- function() { setNames(data.frame(t(combn(names(terms1), 2)),
combn(seq_along(terms1), 2,
function(x) length(intersect(terms1[[x[1]]], terms1[[x[2]]])))),
c("col1", "col2", "counts"))}
and the benchmark output for n at 100 is
library(microbenchmark)
microbenchmark(alexis1(), alexis2(), akrun1(), akrun2(),akrun3(), db(), lmo(),
unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 1.035975 1.032101 1.031239 1.010472 1.044217 1.129092 10 a
# alexis2() 3.896928 3.656585 3.461980 3.386301 3.335469 3.288161 10 a
# akrun1() 218.456708 207.099841 198.391784 189.356065 188.542712 214.415661 10 d
# akrun2() 84.239272 79.073087 88.594414 75.719853 78.277769 129.731990 10 b
# akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
# db() 86.921164 82.201117 80.358097 75.113471 73.311414 105.761977 10 b
# lmo() 125.128109 123.203318 118.732911 113.271352 113.164333 138.075212 10 c
With a slightly higher n at 200
n <- 200
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
replace = TRUE)), paste0("Item", seq_len(n)))
microbenchmark(alexis1(), alexis2(), akrun3(), db(), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 1.117234 1.164198 1.181280 1.166070 1.230077 1.229899 10 a
# alexis2() 3.428904 3.425942 3.337112 3.379675 3.280729 3.164852 10 b
# akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
# db() 219.971285 219.577403 207.793630 213.232359 196.122420 187.433635 10 c
With n set at 9000
n <- 9000
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
replace = TRUE)), paste0("Item", seq_len(n)))
microbenchmark(alexis1(),alexis2(), akrun3(), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 2.048708 2.021709 2.009396 2.085750 2.141060 1.767329 10 b
# alexis2() 3.520220 3.518339 3.419368 3.616512 3.515993 2.952927 10 c
# akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
Checking the output
res1 <- alexis1()
res2 <- akrun3()
res3 <- alexis2()
all.equal(res1, res2, check.attributes = FALSE)
#[1] TRUE
all.equal(res1, as.matrix(res3), check.attributes = FALSE)
#[1] TRUE
Based on the comments from @alexis_laz, we included 3 more functions that replace the table/stack part, to compare efficiency for n at 9000:
alexis3 <- function() {
unlt = unlist(terms1, use.names = FALSE)
u = unique(unlt)
tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
crossprod(tab, tab)
}
alexis4 <- function() {
unlt = unlist(terms1, use.names = FALSE)
u = unique(unlt)
tab = sparseMatrix(x = 1L, i = match(unlt, u),
j = rep(seq_along(terms1), lengths(terms1)), dimnames = list(u, names(terms1)))
Matrix::crossprod(tab, tab, sparse = TRUE)
}
akrun4 <- function() {
unlt = unlist(terms1, use.names = FALSE)
u = unique(unlt)
tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
funCPr(tab, tab)[[1]]
}
and the benchmarks are
microbenchmark(alexis1(),alexis2(), alexis3(), alexis4(),
akrun3(), akrun4(), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 2.1888254 2.2897883 2.204237 2.169618 2.162955 2.122552 10 b
# alexis2() 3.7651292 3.9178071 3.672550 3.616577 3.587886 3.426039 10 c
# alexis3() 2.1776887 2.2410663 2.197293 2.137106 2.192834 2.241645 10 b
# alexis4() 4.1640895 4.3431379 4.262192 4.187449 4.388335 4.172607 10 d
# akrun3() 1.0000000 1.0000000 1.000000 1.000000 1.000000 1.000000 10 a
# akrun4() 0.9364288 0.9692772 1.043292 1.063931 1.090301 1.171245 10 a
This uses combn to produce a data.frame of all pairs of distinct terms. setNames adds the variable names.
result <- setNames(data.frame(t(combn(names(terms), 2)),
combn(seq_along(terms), 2,
function(x) length(intersect(terms[[x[1]]], terms[[x[2]]])))),
c("col1", "col2", "counts"))
which returns
result
col1 col2 counts
1 Item1 Item2 1
2 Item1 Item3 1
3 Item1 Item4 0
4 Item2 Item3 1
5 Item2 Item4 0
6 Item3 Item4 0
If desired, you could use lengths to get own-term lengths and then rbind the results like
temp <- lengths(terms)
rbind(result, data.frame(col1=names(temp), col2=names(temp), counts=temp, row.names=NULL))
col1 col2 counts
1 Item1 Item2 1
2 Item1 Item3 1
3 Item1 Item4 0
4 Item2 Item3 1
5 Item2 Item4 0
6 Item3 Item4 0
7 Item1 Item1 4
8 Item2 Item2 4
9 Item3 Item3 4
10 Item4 Item4 2
I'm not sure if this is necessarily faster or more efficient, but it sure was interesting.
do.call(rbind, lapply(1:length(terms), function(i)
sapply(terms, function(a)
sum(unlist(terms[i]) %in% unlist(a)))))
# Item1 Item2 Item3 Item4
#[1,] 4 1 1 0
#[2,] 1 4 1 0
#[3,] 1 1 4 0
#[4,] 0 0 0 2
I can think of several ways to turn a matrix (data frame) of this type:
dat = data.frame(
x1 = rep(c('a', 'b'), 100),
x2 = rep(c('x', 'y'), 100)
)
head(dat)
x1 x2
1 a x
2 b y
3 a x
4 b y
5 a x
6 b y
Into a binary (indicator) matrix (or data frame) like this:
a b x y
1 0 1 0
0 1 0 1
...
(This structure is, of course, trivial and only for illustrative purposes!)
Many thanks!
We can use table
tbl <- table(rep(1:nrow(dat),2),unlist(dat))
head(tbl, 2)
# a b x y
# 1 1 0 1 0
# 2 0 1 0 1
Or a possibly efficient option would be
library(Matrix)
sM <- sparse.model.matrix(~ -1 + x1 +x2, dat,
contrasts.arg = lapply(dat, contrasts, contrasts = FALSE))
colnames(sM) <- sub(".*\\d", "", colnames(sM))
head(sM, 2)
# 2 x 4 sparse Matrix of class "dgCMatrix"
# a b x y
#1 1 . 1 .
#2 . 1 . 1
It can be converted to an ordinary 0/1 matrix with as.matrix:
head(as.matrix(sM),2)
# a b x y
#1 1 0 1 0
#2 0 1 0 1
There are some good solutions posted already, but none are optimal for performance. We can optimize performance by looping over each input column, and then looping over each factor level index within each input column and doing a straight integer comparison of the factor indexes. It's not the most concise or elegant piece of code, but it's fairly straightforward and fast:
do.call(cbind,lapply(dat,function(col)
`colnames<-`(do.call(cbind,lapply(seq_along(levels(col)),function(i)
as.integer(as.integer(col)==i)
)),levels(col))
));
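One assumption worth flagging: this code relies on the input columns being factors (it calls levels()). Since R 4.0, data.frame() no longer converts strings to factors by default, so on newer R convert first:
dat[] <- lapply(dat, factor)  # ensure factor columns before running the above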
Performance:
library(Matrix);
library(data.table);
library(microbenchmark);
bgoldst <- function(dat) do.call(cbind,lapply(dat,function(col) `colnames<-`(do.call(cbind,lapply(seq_along(levels(col)),function(i) as.integer(as.integer(col)==i))),levels(col))));
akrun1 <- function(dat) table(rep(1:nrow(dat),2),unlist(dat));
akrun2 <- function(dat) sparse.model.matrix(~-1+x1+x2,dat,contrasts.arg=lapply(dat,contrasts,contrasts=FALSE));
davidar <- function(dat) { dat[,rowid:=.I]; dcast(melt(dat,id='rowid'),rowid~value,length); }; ## requires a data.table
dataminer <- function(dat) t(apply(dat,1,function(x) as.numeric(unique(unlist(dat))%in%x)));
N <- 100L; dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
identical(unname(bgoldst(dat)),matrix(as.vector(akrun1(dat)),ncol=4L));
## [1] TRUE
identical(unname(bgoldst(dat)),unname(matrix(as.integer(as.matrix(akrun2(dat))),ncol=4L)));
## [1] TRUE
identical(bgoldst(dat),as.matrix(davidar(datDT)[,rowid:=NULL]));
## [1] TRUE
identical(unname(bgoldst(dat)),matrix(as.integer(dataminer(dat)),ncol=4L));
## [1] TRUE
N <- 100L;
dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
microbenchmark(bgoldst(dat),akrun1(dat),akrun2(dat),davidar(datDT),dataminer(dat));
## Unit: microseconds
## expr min lq mean median uq max neval
## bgoldst(dat) 67.570 92.374 106.2853 99.6440 121.2405 188.596 100
## akrun1(dat) 581.182 652.386 773.6300 690.6605 916.4625 1192.299 100
## akrun2(dat) 4429.208 4836.119 5554.5902 5145.3135 5977.0990 11263.537 100
## davidar(datDT) 5064.273 5498.555 6104.7621 5664.9115 6203.9695 11713.856 100
## dataminer(dat) 47577.729 49529.753 55217.3726 53190.8940 60041.9020 74346.268 100
N <- 1e4L;
dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
microbenchmark(bgoldst(dat),akrun1(dat),akrun2(dat),davidar(datDT));
## Unit: milliseconds
## expr min lq mean median uq max neval
## bgoldst(dat) 1.775617 1.820949 2.299493 1.84725 1.972124 8.362336 100
## akrun1(dat) 38.954524 41.109257 48.409613 45.60304 52.147633 162.365472 100
## akrun2(dat) 16.915832 17.762799 21.288200 19.20164 23.775180 46.494055 100
## davidar(datDT) 36.151684 38.366715 42.875940 42.38794 45.916937 58.695008 100
N <- 1e5L;
dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
microbenchmark(bgoldst(dat),akrun1(dat),akrun2(dat),davidar(datDT));
## Unit: milliseconds
## expr min lq mean median uq max neval
## bgoldst(dat) 17.16473 22.97654 35.01815 26.76662 31.75562 152.6188 100
## akrun1(dat) 501.72644 626.14494 671.98315 680.91152 727.88262 828.8313 100
## akrun2(dat) 212.12381 242.65505 298.90254 272.28203 357.65106 429.6023 100
## davidar(datDT) 368.04924 461.60078 500.99431 511.54921 540.39358 638.3840 100
If you have a data.frame as you are showing (not a matrix), you could just as well recast the data:
library(data.table)
setDT(dat)[, rowid := .I] # Creates a row index
res <- dcast(melt(dat, id = "rowid"), rowid ~ value, length) # long/wide format
head(res)
# rowid a b x y
# 1 1 1 0 1 0
# 2 2 0 1 0 1
# 3 3 1 0 1 0
# 4 4 0 1 0 1
# 5 5 1 0 1 0
# 6 6 0 1 0 1
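To see what dcast is counting, look at the molten intermediate: one row per (rowid, value) pair, which dcast then tabulates per rowid:
head(melt(dat, id = "rowid"), 3)
#    rowid variable value
# 1:     1       x1     a
# 2:     2       x1     b
# 3:     3       x1     a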
Some benchmarks
dat = data.frame(
x1 = rep(c('a', 'b'), 1e3),
x2 = rep(c('x', 'y'), 1e3)
)
library(data.table)
library(Matrix)
library(microbenchmark)
dat2 <- copy(dat)
microbenchmark("akrun1 : " = table(rep(1:nrow(dat),2),unlist(dat)),
"akrun2 : " = sparse.model.matrix(~ -1 + x1 +x2, dat, contrasts.arg = lapply(dat, contrasts, contrasts = FALSE)),
"DatamineR : " = t(apply(dat,1, function(x) as.numeric(unique(unlist(dat)) %in% x))),
"David Ar : " = {setDT(dat2)[, rowid := .I] ; dcast(melt(dat2, id = "rowid"), rowid ~ value, length)},
times = 10L)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# akrun1 : 3.826075 4.061904 6.654399 5.165376 11.26959 11.82029 10 a
# akrun2 : 5.269531 5.713672 8.794434 5.943422 13.34118 20.01961 10 a
# DatamineR : 3199.336286 3343.774160 3410.618547 3385.756972 3517.22133 3625.70909 10 b
# David Ar : 8.092769 8.254682 11.030785 8.465232 15.44893 19.83914 10 a
The apply solution is highly inefficient and will take forever on a bigger data set. Comparing on a bigger data set, excluding the apply solution:
dat = data.frame(
x1 = rep(c('a', 'b'), 1e4),
x2 = rep(c('x', 'y'), 1e4)
)
dat2 <- copy(dat)
microbenchmark("akrun1 : " = table(rep(1:nrow(dat),2),unlist(dat)),
"akrun2 : " = sparse.model.matrix(~ -1 + x1 +x2, dat, contrasts.arg = lapply(dat, contrasts, contrasts = FALSE)),
#"DatamineR : " = t(apply(dat,1, function(x) as.numeric(unique(unlist(dat)) %in% x))),
"David Ar : " = {setDT(dat2)[, rowid := .I] ; dcast(melt(dat2, id = "rowid"), rowid ~ value, length)},
times = 100L)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# akrun1 : 38.66744 41.27116 52.97982 42.72534 47.17203 161.0420 100 b
# akrun2 : 17.02006 18.93534 27.27582 19.35580 20.72022 153.2397 100 a
# David Ar : 34.15915 37.91659 46.11050 38.58536 41.40412 149.0038 100 b
Seems like the Matrix package shines for bigger data sets.
It's probably worth comparing different scenarios when there are more columns/unique values too.
One alternative using apply
head(t(apply(dat,1, function(x) as.numeric(unique(unlist(dat)) %in% x))))
[,1] [,2] [,3] [,4]
[1,] 1 0 1 0
[2,] 0 1 0 1
[3,] 1 0 1 0
[4,] 0 1 0 1
[5,] 1 0 1 0
[6,] 0 1 0 1
My sample data frame is as below:
p<-c("name1","name2","name3","name4","name5")
x<-c(seq(0,4,by=1))
y<-c(0,0,1,1,2)
z<-c(11,2,1,0,1)
df<-data.frame(p,x,y,z)
I want to convert the above data frame into the format below:
p<-c("name1","name2","name3","name4","name5")
x<-c(0,1,1,1,1)
y<-c(0,0,1,1,1)
z<-c(1,1,1,0,1)
df<-data.frame(p,x,y,z)
i.e., I want all values greater than 1 to become 1 and all zeros to stay zero. Please help.
You can use the function sign for this purpose:
df[c("x","y","z")] <- sign(df[c("x","y","z")])
df
# p x y z
# 1 name1 0 0 1
# 2 name2 1 0 1
# 3 name3 1 1 1
# 4 name4 1 1 0
# 5 name5 1 1 1
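One caveat: sign() maps negative values to -1, so this matches the desired output only when the data are non-negative; for mixed-sign data use one of the comparison-based approaches below:
sign(c(-2, 0, 3))
# [1] -1  0  1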
You may also do
df[-1] <- (df[-1]!=0)+0L
Or
df[-1] <- (!!df[-1])+0L
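The double negation works because the first ! coerces the numbers to logical and inverts them (0 becomes TRUE, nonzero becomes FALSE), the second ! flips them back, and + 0L converts TRUE/FALSE to integer 1/0:
(!!c(0, 2, -3)) + 0L
# [1] 0 1 1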
Benchmarks
set.seed(24)
df2 <- as.data.frame(matrix(sample(0:10, 5000*5000, replace=TRUE), ncol=5000))
system.time((df2!=0)+0L)
# user system elapsed
# 0.801 0.061 0.861
system.time(sign(df2))
# user system elapsed
#1.315 0.022 1.336
system.time((!!df2)+0L)
# user system elapsed
# 0.602 0.044 0.647
library(microbenchmark)
microbenchmark(pascal=sign(df2), akrun=(!!df2)+0L, times=20L, unit='relative')
#Unit: relative
# expr min lq mean median uq max neval cld
# pascal 2.184227 2.164029 2.163411 2.142952 2.138964 2.196735 20 b
# akrun 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20 a
Use ifelse to conditionally assign 1 or 0 to each element:
df[, 2:4] <- ifelse(df[, 2:4] == 0, 0, 1)