I am trying to take the results of a which(..., arr.ind = TRUE) function and remove the rows that are not the first to "connect" with one another.
Examples:
# example 1        # example 2        # example 3
  row col            row col            row col
    1   4              2   3              1   3
    2   4              2   4              2   5
    4   5              3   5              3   5
    3   6              2   7              4   6
    4   6              3   7              5   6
    3   7              4   7              6   8
    4   7              5   7              9  10
# should become (trimmed.mtx)
  row col            row col            row col
    1   4              2   3              1   3
    4   5              3   5              3   5
                       5   7              5   6
                                          6   8
These examples can be read in using:
example1 <- structure(list(row = c(1L, 2L, 4L, 3L, 4L, 3L, 4L), col = c(4L, 4L, 5L, 6L, 6L, 7L, 7L)), .Names = c("row", "col"), class = "data.frame", row.names = c(NA, -7L))
example2 <- structure(list(row = c(2L, 2L, 3L, 2L, 3L, 4L, 5L), col = c(3L, 4L, 5L, 7L, 7L, 7L, 7L)), .Names = c("row", "col"), class = "data.frame", row.names = c(NA, -7L))
example3 <- structure(list(row = c(1L, 2L, 3L, 4L, 5L, 6L, 9L), col = c(3L, 5L, 5L, 6L, 6L, 8L, 10L)), .Names = c("row", "col"), class = "data.frame", row.names = c(NA, -7L))
The purpose of this is to take a dist matrix of Euclidean distances and turn it into a sequence of point-to-point distances that skip distances below a certain threshold. While there may be other ways to solve this problem, I am very interested in figuring out the best way to do this by filtering out rows from the which-matrix.
Reproducible example of my intended use:
set.seed(81417) # Aug 14th, 2017
# Generate fake location data (temporally sequential)
x <- as.matrix(cbind(x = rnorm(10, 10, 3), y = rnorm(10, 10, 3)))
# Find euclidean point-to-point distances and remove distances that are less than:
value = 5
# I attempted to do so by calculating an entire Euclidean distance matrix (dist())
# and then finding a path from point-to-nearest-point
# using distances that are greater than the value
d <- as.matrix(dist(x[,c("x","y")]))
d[lower.tri(d)] <- 0
mtx <- which(d > value, arr.ind = T)
mtx
# Change from EVERY point-to-point distance (mtx) > value
# to only the "connecting" points that exceed the skipping value
trimmed.mtx <- {?}
# final result
cbind(x[unique(c(trimmed.mtx)),],d[trimmed.mtx])
This is a perfect problem for Rcpp. Observe:
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
IntegerMatrix findConnections(IntegerMatrix m) {
    int i = 0, j = 0, k = 1, n = m.nrow();

    // initialize matrix with same dimensions as m
    IntegerMatrix myConnections(n, 2);

    while (i < n) {
        // Populate with "connected" row
        myConnections(j, _) = m(i, _);
        // Search for next connection
        while (k < n && m(i, 1) != m(k, 0)) {k++;}
        i = k;
        j++;
    }

    // Subset matrix and output result
    IntegerMatrix subMatrix(j, 2);
    for (i = 0; i < j; i++) {subMatrix(i, _) = myConnections(i, _);}
    return subMatrix;
}
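To use the function, the C++ code has to be compiled and loaded first. A minimal sketch (the file name findConnections.cpp is only illustrative):
library(Rcpp)
sourceCpp("findConnections.cpp")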
findConnections(as.matrix(example3))
[,1] [,2]
[1,] 1 3
[2,] 3 5
[3,] 5 6
[4,] 6 8
Here are the benchmarks on example3 provided by the OP:
microbenchmark(get_path(example3),
foo(example3),
f(example3),
findConnections(as.matrix(example3)))
Unit: microseconds
expr min lq mean median uq max neval cld
get_path(example3) 3345.999 3519.0255 6361.76978 3714.014 3892.9930 202511.942 100 b
foo(example3) 215.514 239.3230 360.81086 257.180 278.3200 10256.384 100 a
f(example3) 936.355 1034.4645 1175.60323 1073.668 1142.4270 9676.755 100 a
findConnections(as.matrix(example3)) 52.135 60.3445 71.62075 67.528 80.4585 103.858 100 a
Here are some benchmarks on a larger example (get_path is not included as it was taking a very long time):
set.seed(6221)
x <- as.matrix(cbind(x = rnorm(1000, 10, 3), y = rnorm(1000, 10, 3)))
value = 5
d <- as.matrix(dist(x[,c("x","y")]))
d[lower.tri(d)] <- 0
mtxLarge <- which(d > value, arr.ind = T)
## Pre-build the data.frame input for foo, as we don't want to include the
## time it takes to create a data.frame in every benchmark iteration.
mtxLargeFoo <- data.frame(mtxLarge, row.names = NULL)
microbenchmark(foo(mtxLargeFoo),
f(mtxLarge),
findConnections(as.matrix(mtxLarge)), times = 10, unit = "relative")
Unit: relative
expr min lq mean median uq max neval cld
foo(mtxLargeFoo) 3168.479 3376.909 2660.377 3424.276 2319.434 1960.161 10 b
f(mtxLarge) 8307.009 8436.569 6420.919 8319.151 5184.557 4610.922 10 c
findConnections(as.matrix(mtxLarge)) 1.000 1.000 1.000 1.000 1.000 1.000 10 a
Test for equality:
a <- findConnections(as.matrix(mtxLarge))
b <- foo(mtxLargeFoo)
c <- f(mtxLarge)
sapply(1:2, function(x) identical(a[,x], b[,x], c[, x]))
[1] TRUE TRUE
UPDATE
If Rcpp isn't your flavor, here is a Base R translation of the above code that is still faster than the other solutions:
findConnectionsBase <- function(m) {
    n <- nrow(m)
    myConnections <- matrix(integer(0), nrow = n, ncol = 2)
    i <- j <- 1L
    k <- 2L
    while (i <= n) {
        myConnections[j, ] <- m[i, ]
        while (k <= n && m[i, 2] != m[k, 1]) {k <- k + 1L}
        i <- k
        j <- j + 1L
    }
    myConnections[!is.na(myConnections[, 1]), ]
}
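For example, on the first example this should give the expected two connecting rows:
findConnectionsBase(as.matrix(example1))
#      [,1] [,2]
# [1,]    1    4
# [2,]    4    5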
microbenchmark(get_path(example3),
foo(example3),
f(example3),
BaseR = findConnectionsBase(as.matrix(example3)),
Rcpp = findConnections(as.matrix(example3)))
Unit: microseconds
expr min lq mean median uq max neval cld
get_path(example3) 3128.844 3204.3765 6057.18995 3406.137 3849.274 188685.016 100 b
foo(example3) 239.734 251.4325 399.71418 267.648 301.309 12455.441 100 a
f(example3) 899.409 961.3950 1145.72695 1014.555 1127.237 9583.982 100 a
BaseR 79.638 89.2850 103.63571 97.905 111.657 212.230 100 a
Rcpp 48.850 55.8290 64.24807 61.781 69.170 123.151 100 a
And for the larger example:
microbenchmark(foo(mtxLargeFoo),
f(mtxLarge),
BaseR = findConnectionsBase(as.matrix(mtxLarge)),
Rcpp = findConnections(as.matrix(mtxLarge)), times = 10, unit = "relative")
Unit: relative
expr min lq mean median uq max neval cld
foo(mtxLargeFoo) 2651.9626 2555.0515 1606.2785 1703.0256 1711.4850 671.9115 10 c
f(mtxLarge) 6812.7195 6433.2009 3976.6135 4218.1703 4105.1138 1642.2768 10 d
BaseR 787.9947 733.4528 440.2043 478.9412 435.4744 167.7491 10 b
Rcpp 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 10 a
Here is an idea using the igraph package along with zoo:
get_path <- function(df){
    # build a directed graph from the (row, col) pairs
    g1 <- graph_from_data_frame(df)
    # enumerate all simple paths starting from the first vertex
    l1 <- all_simple_paths(g1, 1)
    # keep the longest path and convert the vertex names back to numbers
    ind1 <- as.numeric(names(l1[[which.max(lengths(l1))]]))
    # turn the path into consecutive (row, col) pairs
    final_df <- setNames(as.data.frame(rollapply(ind1, 2, c)),
                         c('row', 'col'))
    return(final_df)
}
which gives the following:
library(igraph)
library(zoo)
get_path(example1)
row col
1 1 4
2 4 5
get_path(example2)
row col
1 2 3
2 3 5
3 5 7
get_path(example3)
row col
1 1 3
2 3 5
3 5 6
4 6 8
FUNCTION
foo = function(df){
    # Initiate with a value of 1 (first row)
    inds = 1
    while(TRUE){
        # Look for the first index where the 'row' is equal to the value
        # in 'col' at the index specified by the last value of 'inds'
        temp = tail(inds, 1)
        ind = temp + which(df[["row"]][(temp+1):NROW(df)] == df[["col"]][temp])[1]
        # Append 'ind' to 'inds'
        inds = c(inds, ind)
        # Iterate until the end of the rows or until NA is encountered
        if (ind == NROW(df) | is.na(ind)){
            # Return the subset of the df with the appropriate rows
            return(df[inds[!is.na(inds)],])
        }
    }
}
USAGE
foo(example1)
# row col
#1 1 4
#3 4 5
foo(example2)
# row col
#1 2 3
#3 3 5
#7 5 7
foo(example3)
# row col
#1 1 3
#3 3 5
#5 5 6
#6 6 8
foo(data.frame(mtx, row.names = NULL))
# row col
#1 1 3
#5 3 4
#11 4 7
This function is applicable for matrices and data.frames with two columns.
f <- function(x){
    res <- x[1, ]       # first row as defined
    tmpCol <- x[1, 2]   # the target column for the "connection"
    while (TRUE){       # loop until the return below is reached
        connectingRow <- x[which(x[, 1] == tmpCol)[1], ]  # get first matching row
        # if no connecting row is found, the subset above is all NA:
        # return the accumulated result; otherwise continue
        if (any(is.na(connectingRow))) return(res)
        # append the connecting match and set a new tmpCol for the next iteration
        res <- rbind(res, connectingRow)
        tmpCol <- res[nrow(res), 2]
    }
}
f(example1)
# row col
# 1 1 4
# 3 4 5
f(example2)
# row col
# 1 2 3
# 3 3 5
# 7 5 7
Benchmarking
Benchmark comparison between @d.b's foo() and the f() proposed above:
microbenchmark(f(mtx), foo(mtx))
# Unit: microseconds
# expr min lq mean median uq max neval cld
# f(mtx) 18.204 19.058 22.61003 20.053 20.7640 64.851 100 a
# foo(mtx) 14.506 15.075 73.97871 15.360 15.9285 5740.151 100 a
For all you fans of functional programming, here's a recursive solution. R is not optimized for this, but it most closely represents the abstract process that the OP is describing.
connected_rows <- function(df, next.row.val = NULL){
    if(is.null(next.row.val)){
        return(
            rbind(
                head(df, 1),
                Recall(
                    df = tail(df, -1),
                    next.row.val = head(df$col, 1)
                )
            )
        )
    } else {
        next.row <- match(next.row.val, df$row)
        if(is.na(next.row)){
            return(NULL)
        } else {
            return(
                rbind(
                    df[next.row, ],
                    Recall(
                        df = tail(df, -next.row),
                        next.row.val = df$col[next.row]
                    )
                )
            )
        }
    }
}
connected_rows(example1)
# row col
# 1 1 4
# 3 4 5
connected_rows(example2)
# row col
# 1 2 3
# 3 3 5
# 7 5 7
connected_rows(example3)
# row col
# 1 1 3
# 3 3 5
# 5 5 6
# 6 6 8
Related
Suppose we have a matrix like the one below:
A <- matrix(c(1,7,13,19,9,5,8,14,20,10,3,4,15,21,1,2,4,16,22,2,8,3,17,23,1,6,3,18,24,2), nrow=5)
A
     [,1] [,2] [,3] [,4] [,5] [,6]
[1,]    1    5    3    2    8    6
[2,]    7    8    4    4    3    3
[3,]   13   14   15   16   17   18
[4,]   19   20   21   22   23   24
[5,]    9   10    1    2    1    2
The dist function can calculate the maximum absolute distance between each pair of rows of the matrix A and return the distance matrix D using dist(A, method = "maximum"), where D[i,j] = max_k |A[i,k] - A[j,k]|. For example,
D[1,2] = max( abs( A[1,] - A[2,] ) ) = max(6, 3, 1, 2, 5, 3) = 6
However, in my case I first need to remove the i-th and j-th elements, i.e. D[i,j] = max_{k != i, j} |A[i,k] - A[j,k]|. In the above example the answer becomes
D[1,2] = max( abs( A[1,-c(1,2)] - A[2,-c(1,2)] ) ) = max(1, 2, 5, 3) = 5
I have no idea how to do this in an efficient way. I know I can use a for loop, but the data set is large and a for loop is extremely slow.
Assuming that your real matrix also has more columns than rows, here is a base R implementation of the function you want:
max_dist <- function(mat, i, j) {
    # keep only rows i and j, and drop columns i and j
    mat <- mat[c(i, j), -c(i, j)]
    max(abs(mat[1L, ] - mat[2L, ]))
}

dist1 <- function(mat) {
    n <- nrow(mat)
    # all (row, col) index pairs of the lower triangle, in dist order
    ids <- do.call(rbind, lapply(2:n, function(i, e) cbind(i:e, rep.int(i - 1L, e - i + 1L)), n))
    out <- apply(ids, 1L, function(i) max_dist(mat, i[[1L]], i[[2L]]))
    # give the result the attributes of a "dist" object
    attributes(out) <- list(
        Size = n, Labels = dimnames(mat)[[1L]], Diag = FALSE,
        Upper = FALSE, method = "dist1", call = match.call(),
        class = "dist"
    )
    out
}
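As a quick sanity check against the worked example above (D[1,2] should be 5 once elements 1 and 2 are dropped):
max(abs(A[1, ] - A[2, ]))   # unmodified maximum distance: 6
as.matrix(dist1(A))[1, 2]   # with elements 1 and 2 removed: 5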
If you think R is not fast enough for your case, then you can use the package parallelDist, which allows user-defined C++ distance functions. Consider the following implementation:
library(parallelDist)
library(RcppXPtrUtils)
library(RcppArmadillo)
mydist_ptr <- cppXPtr("double mydist(const arma::mat &A, const arma::mat &B) {
arma::uvec ids = {0, (unsigned int)A(0, 0), (unsigned int)B(0, 0)};
arma::mat A_ = A, B_ = B;
A_.shed_cols(ids); B_.shed_cols(ids);
return abs((A_ - B_)).max();
}", depends = "RcppArmadillo")
dist2 <- function(mat) {
# prepend row numbers to the matrix
# this later allows cpp function `mydist` to identify which rows to drop
parDist(cbind(seq_len(nrow(mat)), mat), method = "custom", func = mydist_ptr)
}
Test with the following matrices (small_m is the example in your post):
small_m <- matrix(c(1,5,3,2,8,6,7,8,4,4,3,3,13,14,15,16,17,18,19,20,21,22,23,24,9,10,1,2,1,2), 5, 6, byrow = TRUE)
large_m <- matrix(rnorm(1000000), 10, 100000)
Benchmark
# no real difference between these two implementations when the input matrix is small
> microbenchmark::microbenchmark(dist1(small_m), dist2(small_m))
Unit: microseconds
expr min lq mean median uq max neval cld
dist1(small_m) 77.4 87.10 112.403 106.5 125.95 212.2 100 a
dist2(small_m) 145.5 160.25 177.786 170.2 183.80 286.7 100 b
# `dist2` is faster with large matrix input. However, the efficiency of `dist1` is also acceptable IMO.
> microbenchmark::microbenchmark(dist1(large_m), dist2(large_m))
Unit: milliseconds
expr min lq mean median uq max neval cld
dist1(large_m) 129.7531 139.3909 152.13154 143.0549 149.5870 322.0173 100 b
dist2(large_m) 48.8025 52.5081 55.84333 55.5175 58.6095 67.6470 100 a
The output is as follows:
> dist1(small_m)
   1  2  3  4
2  5
3 14 15
4 18 21  6
5  5  3 16 22
> dist2(small_m)
   1  2  3  4
2  5
3 14 15
4 18 21  6
5  5  3 16 22
Here is a base R option using dist + combn + as.dist
r <- diag(0, nrow(m))
# for each pair of rows k = c(i, j): take the row difference, drop the i-th and
# j-th entries, and keep the maximum absolute value
r[lower.tri(r)] <- combn(1:nrow(m), 2, function(k) max(abs(do.call(`-`, asplit(m[k, ], 1)))[-k]))
out <- as.dist(r)
which gives
   1  2  3  4
2  5
3 14 15
4 18 21  6
5  5  3 16 22
Data
> dput(m)
structure(c(1, 7, 13, 19, 9, 5, 8, 14, 20, 10, 3, 4, 15, 21,
1, 2, 4, 16, 22, 2, 8, 3, 17, 23, 1, 6, 3, 18, 24, 2), .Dim = 5:6)
I'm trying to create a df that has the column number, row number, and cell value based on the lowest value according to a linear program with constraints. column is the column with the lowest value according to the lp output, and row is the corresponding row with the lowest value; now I want the actual value, and I'm having a hard time. If you're curious, the time value is gmapsdistance output in seconds.
What I'm getting:
> h
column row time.1 time.2 time.3 time.4 time.5
1 1 1 8262 8262 8262 66357 66357
2 1 2 21386 21386 21386 73307 73307
3 1 3 30698 30698 30698 52547 52547
4 2 4 32711 32711 32711 53006 53006
5 2 5 66156 66156 66156 65205 65205
What I want is one "time" column with the minimum time corresponding to column and row within aa.
> h
column row time
1 1 1 8262
2 1 2 21386
3 1 3 30698
4 2 4 53006
5 2 5 65205
Here is a reproducible example:
library(lpSolve)
aa <- matrix(c(8262, 21386, 30698, 32711, 66156, 66357, 73307, 52547, 53006, 65205),
nrow=5,
ncol=2)
aa
#Run aa through a linear program with a lower constraint of 2 and an upper constraint of 8
gwide <- aa
k <- ncol(gwide)
n <- nrow(gwide)
dir <- "min"
objective.in <- c(gwide)
A <- t(rep(1, k)) %x% diag(n)
B <- diag(k) %x% t(rep(1, n))
const.mat <- rbind(A, B, B)
const.dir <- c(rep("==", n), rep(">=", k), rep("<=", k))
const.rhs <- c(rep(1, n), rep(2, k), rep(8, k))
res <- lp(dir, objective.in, const.mat, const.dir, const.rhs, all.bin = TRUE)
res
#create a matrix from LM
soln <- matrix(res$solution, n, k)
soln
column <- apply(soln, 1, which.max)
h <- as.data.frame(column)
h$row = 1:nrow(h)
h$time <- aa[h$row,c(h$column)] #this seems to be where the problem is
h
I thought h$time <- aa[h$row,c(h$column)] would return a new column named "time" with the value from aa based on the row and column from h but that didn't work out so well. I've been racking my brain for hours and have come up with nothing. Any thoughts?
aa[h$row, c(h$column)] selects the whole submatrix formed by those rows and columns rather than element-wise (row, column) pairs. Instead, you can loop through the rows of h and extract the values in aa using the row and column indices from h.
h$time <- apply( h, 1, function(x) aa[x[2], x[1]] )
h
# column row time
# 1 1 1 8262
# 2 1 2 21386
# 3 1 3 30698
# 4 2 4 53006
# 5 2 5 65205
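For what it's worth, a vectorized alternative (not part of the original answer, just a sketch) uses R's matrix indexing, which picks out element-wise (row, column) pairs:
h$time <- aa[cbind(h$row, h$column)]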
Data:
aa <- structure(c(8262, 21386, 30698, 32711, 66156, 66357, 73307, 52547,
53006, 65205), .Dim = c(5L, 2L))
h <- structure(list(column = c(1L, 1L, 1L, 2L, 2L), row = 1:5), .Names = c("column",
"row"), row.names = c(NA, -5L), class = "data.frame")
For a list:
terms <- list(Item1 = c("a", "b", "c", "d"),
Item2 = c("a", "e", "f", "g"),
Item3 = c("b", "e", "h", "i"),
Item4 = c("j", "k"))
I would like to get the number of shared letters between each pair of items in the list. The expected output is therefore:
[,1] [,2] [,3] [,4]
[1,] 4 1 1 0
[2,] 1 4 1 0
[3,] 1 1 4 0
[4,] 0 0 0 2
From a previous StackOverflow answer, I found one possible solution:
overlapLength <- function(x, y) mapply(function(x, y)
length(intersect(x, y)), terms[x], terms[y])
s <- seq_along(terms)
outer(s, s, overlapLength)
But this is very slow for my list, which is very large (~9,000 items).
Is there a faster way to do this?
Thanks everyone for your input. I timed all answers with the first 100 items of my list.
> system.time(f_crossprod(go))
user system elapsed
0.024 0.001 0.025
> system.time(f_crossprod2(go))
user system elapsed
0.007 0.000 0.008
> system.time(f_mapply(go))
user system elapsed
2.018 0.032 2.059
> system.time(f_outer(go))
user system elapsed
1.950 0.016 1.979
> system.time(f_combn(go))
user system elapsed
1.056 0.005 1.062
> system.time(f_Rcpp(go))
user system elapsed
163.236 84.226 249.240
I then timed the outer and Matrix::crossprod solutions with the entire list of ~9,000 elements. The outer solution ran in about 55 minutes. The Matrix::crossprod solution ran in about 0.1 seconds!
It is possible I have made an error in the implementation of the Rcpp function. However, @alexis_laz, if you make your comment an answer I will accept it.
By the way, sorry I was not clear, I am not interested in the values on the diagonal.
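For reference, the Matrix::crossprod idea mentioned above (alexis_laz's comment; it reappears below as alexis1()/alexis2() in the benchmarks) is, in sketch form:
library(Matrix)
# dense: letters-by-items incidence table, then item-by-item cross-product
crossprod(table(stack(terms)))
# sparse variant
Matrix::crossprod(xtabs(~ values + ind, stack(terms), sparse = TRUE))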
We can use outer
outer(names(terms), names(terms), FUN = function(x,y)
lengths(Map(intersect, terms[x], terms[y])))
# [,1] [,2] [,3] [,4]
#[1,] 4 1 1 0
#[2,] 1 4 1 0
#[3,] 1 1 4 0
#[4,] 0 0 0 2
Or more compactly
outer(terms, terms, FUN = function(...) lengths(Map(intersect, ...)))
# Item1 Item2 Item3 Item4
#Item1 4 1 1 0
#Item2 1 4 1 0
#Item3 1 1 4 0
#Item4 0 0 0 2
We could also implement this in Rcpp. Below is the test1.cpp file
#include <Rcpp.h>
#include <math.h>
using namespace Rcpp;

//[[Rcpp::export]]
List foo(List xs) {
    List x(xs);
    List x1 = Rcpp::clone(xs);
    List y1 = Rcpp::clone(xs);
    int n = x1.size();
    NumericVector res;
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            CharacterVector xd = x1[i];
            CharacterVector yd = y1[j];
            res.push_back(intersect(xd, yd).length());
        }
    }
    return wrap(res);
}
We call it in R using
library(Rcpp)
sourceCpp("test1.cpp")
`dim<-`(unlist(foo(terms)), c(4, 4))
# [,1] [,2] [,3] [,4]
#[1,] 4 1 1 0
#[2,] 1 4 1 0
#[3,] 1 1 4 0
#[4,] 0 0 0 2
Benchmarks
In addition to the functions above, we included another version with an RcppEigen implementation that was posted here
n <- 100
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
replace = TRUE)), paste0("Item", seq_len(n)))
library(Matrix)
library(inline)
library(Rcpp)
alexis1 <- function() {crossprod(table(stack(terms1)))}
alexis2 <- function() {Matrix::crossprod(xtabs(~ values + ind,
                                               stack(terms1), sparse = TRUE))}
akrun1 <- function() {outer(terms1, terms1, FUN = function(...) lengths(Map(intersect, ...)))}
akrun2 <- function() {`dim<-`(unlist(foo(terms1)), c(n, n))}
akrun3 <- function() {tbl <- table(stack(terms1))
                      funCPr(tbl, tbl)[[1]]}
db <- function() {do.call(rbind, lapply(1:length(terms1), function(i)
                      sapply(terms1, function(a)
                          sum(unlist(terms1[i]) %in% unlist(a)))))}
lmo <- function() {setNames(data.frame(t(combn(names(terms1), 2)),
                                       combn(seq_along(terms1), 2,
                                             function(x) length(intersect(terms1[[x[1]]], terms1[[x[2]]])))),
                            c("col1", "col2", "counts"))}
and the benchmark output for n at 100 is
library(microbenchmark)
microbenchmark(alexis1(), alexis2(), akrun1(), akrun2(),akrun3(), db(), lmo(),
unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 1.035975 1.032101 1.031239 1.010472 1.044217 1.129092 10 a
# alexis2() 3.896928 3.656585 3.461980 3.386301 3.335469 3.288161 10 a
# akrun1() 218.456708 207.099841 198.391784 189.356065 188.542712 214.415661 10 d
# akrun2() 84.239272 79.073087 88.594414 75.719853 78.277769 129.731990 10 b
# akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
# db() 86.921164 82.201117 80.358097 75.113471 73.311414 105.761977 10 b
# lmo() 125.128109 123.203318 118.732911 113.271352 113.164333 138.075212 10 c
With a slightly higher n at 200
n <- 200
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
replace = TRUE)), paste0("Item", seq_len(n)))
microbenchmark(alexis1(), alexis2(), akrun3(), db(), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 1.117234 1.164198 1.181280 1.166070 1.230077 1.229899 10 a
# alexis2() 3.428904 3.425942 3.337112 3.379675 3.280729 3.164852 10 b
# akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
# db() 219.971285 219.577403 207.793630 213.232359 196.122420 187.433635 10 c
With n set at 9000
n <- 9000
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
replace = TRUE)), paste0("Item", seq_len(n)))
microbenchmark(alexis1(),alexis2(), akrun3(), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 2.048708 2.021709 2.009396 2.085750 2.141060 1.767329 10 b
# alexis2() 3.520220 3.518339 3.419368 3.616512 3.515993 2.952927 10 c
# akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
Checking the output
res1 <- alexis1()
res2 <- akrun3()
res3 <- alexis2()
all.equal(res1, res2, check.attributes = FALSE)
#[1] TRUE
all.equal(res1, as.matrix(res3), check.attributes = FALSE)
#[1] TRUE
Based on the comments from @alexis_laz, I included 3 more functions that replace the table/stack part, to compare the efficiency for n at 9000:
alexis3 <- function() {
    unlt = unlist(terms1, use.names = FALSE)
    u = unique(unlt)
    tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
    tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
    crossprod(tab, tab)
}
alexis4 <- function() {
    unlt = unlist(terms1, use.names = FALSE)
    u = unique(unlt)
    tab = sparseMatrix(x = 1L, i = match(unlt, u),
                       j = rep(seq_along(terms1), lengths(terms1)),
                       dimnames = list(u, names(terms1)))
    Matrix::crossprod(tab, tab, sparse = TRUE)
}
akrun4 <- function() {
    unlt = unlist(terms1, use.names = FALSE)
    u = unique(unlt)
    tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
    tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
    funCPr(tab, tab)[[1]]
}
and the benchmarks are
microbenchmark(alexis1(),alexis2(), alexis3(), alexis4(),
akrun3(), akrun4(), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 2.1888254 2.2897883 2.204237 2.169618 2.162955 2.122552 10 b
# alexis2() 3.7651292 3.9178071 3.672550 3.616577 3.587886 3.426039 10 c
# alexis3() 2.1776887 2.2410663 2.197293 2.137106 2.192834 2.241645 10 b
# alexis4() 4.1640895 4.3431379 4.262192 4.187449 4.388335 4.172607 10 d
# akrun3() 1.0000000 1.0000000 1.000000 1.000000 1.000000 1.000000 10 a
# akrun4() 0.9364288 0.9692772 1.043292 1.063931 1.090301 1.171245 10 a
This uses combn to produce a data.frame of all pairwise combinations of distinct terms; setNames adds the variable names.
result <- setNames(data.frame(t(combn(names(terms), 2)),
                              combn(seq_along(terms), 2,
                                    function(x) length(intersect(terms[[x[1]]], terms[[x[2]]])))),
                   c("col1", "col2", "counts"))
which returns
result
col1 col2 counts
1 Item1 Item2 1
2 Item1 Item3 1
3 Item1 Item4 0
4 Item2 Item3 1
5 Item2 Item4 0
6 Item3 Item4 0
If desired, you could use lengths to get own-term lengths and then rbind the results like
temp <- lengths(terms)
rbind(result, data.frame(col1=names(temp), col2=names(temp), counts=temp, row.names=NULL))
col1 col2 counts
1 Item1 Item2 1
2 Item1 Item3 1
3 Item1 Item4 0
4 Item2 Item3 1
5 Item2 Item4 0
6 Item3 Item4 0
7 Item1 Item1 4
8 Item2 Item2 4
9 Item3 Item3 4
10 Item4 Item4 2
I'm not sure if this is necessarily faster or more efficient, but it sure was interesting.
do.call(rbind, lapply(1:length(terms), function(i)
    sapply(terms, function(a)
        sum(unlist(terms[i]) %in% unlist(a)))))
# Item1 Item2 Item3 Item4
#[1,] 4 1 1 0
#[2,] 1 4 1 0
#[3,] 1 1 4 0
#[4,] 0 0 0 2
I have a dataframe as follows:
Destination User User_Price
A a 5
A b 4
B c 6
B a 5
C b 4
C d 7
I want to convert this into a matrix that shows which destinations each user has hit, like this:
User User_Price A B C
a 5 1 1 0
b 4 1 0 1
c 6 0 1 0
d 7 0 0 1
One way with the dplyr and tidyr packages would be:
library(dplyr)
library(tidyr)
count(foo, User, User_Price, Destination) %>%
  spread(key = Destination, value = n, fill = 0)
# User User_Price A B C
# (fctr) (int) (dbl) (dbl) (dbl)
#1 a 5 1 1 0
#2 b 4 1 0 1
#3 c 6 0 1 0
#4 d 7 0 0 1
If you need a matrix, you can convert this outcome (a data frame) to a matrix.
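For example, a minimal sketch of that conversion (keeping only the indicator columns, with users as row names; the intermediate name res is just illustrative):
res <- count(foo, User, User_Price, Destination) %>%
  spread(key = Destination, value = n, fill = 0)
m <- as.matrix(res[c("A", "B", "C")])
rownames(m) <- as.character(res$User)
m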
DATA
foo <- structure(list(Destination = structure(c(1L, 1L, 2L, 2L, 3L,
3L), .Label = c("A", "B", "C"), class = "factor"), User = structure(c(1L,
2L, 3L, 1L, 2L, 4L), .Label = c("a", "b", "c", "d"), class = "factor"),
User_Price = c(5L, 4L, 6L, 5L, 4L, 7L)), .Names = c("Destination",
"User", "User_Price"), class = "data.frame", row.names = c(NA,
-6L))
Here is an option using data.table
library(data.table)
dcast(setDT(foo),User + User_Price ~ Destination, length, value.var="Destination")
# User User_Price A B C
#1: a 5 1 1 0
#2: b 4 1 0 1
#3: c 6 0 1 0
#4: d 7 0 0 1
This looks very similar to a normal reshaping operation, except with some idiosyncrasies that require several lines of code to achieve in base R.
First, for reference and comparison purposes, here's what the minimalist reshape() call produces:
df <- data.frame(Destination=c('A','A','B','B','C','C'),User=c('a','b','c','a','b','d'),User_Price=c(5L,4L,6L,5L,4L,7L),stringsAsFactors=F);
reshape(df,dir='w',idvar='User',timevar='Destination');
## User User_Price.A User_Price.B User_Price.C
## 1 a 5 5 NA
## 2 b 4 NA 4
## 3 c NA 6 NA
## 6 d NA NA 7
Clearly there are several issues that must be addressed before we can arrive at the required output:
We must compute the required singular User_Price column from the multiple widened columns.
We must replace NA prices with 0.
We must replace non-NA prices with 1.
We must fix the column names to omit the User_Price. prefix.
Here's a complete solution, using df from above:
res <- reshape(df,dir='w',idvar='User',timevar='Destination');
pre <- '^User_Price\\.';
cis <- grep(pre,names(res));
res$User_Price <- do.call(pmax,c(res[cis],na.rm=T));
names(res)[cis] <- sub(pre,'',names(res)[cis]);
nas <- is.na(res[cis]);
res[cis][nas] <- 0;
res[cis][!nas] <- 1;
res;
User A B C User_Price
1 a 1 1 0 5
2 b 1 0 1 4
3 c 0 1 0 6
6 d 0 0 1 7
Benchmarking
library(microbenchmark);
library(dplyr);
library(tidyr);
library(data.table);
bgoldst <- function(df) { res <- reshape(df,dir='w',idvar='User',timevar='Destination'); pre <- '^User_Price\\.'; cis <- grep(pre,names(res)); res$User_Price <- do.call(pmax,c(res[cis],na.rm=T)); names(res)[cis] <- sub(pre,'',names(res)[cis]); nas <- is.na(res[cis]); res[cis][nas] <- 0; res[cis][!nas] <- 1; res; };
thelatemail <- function(df) { x <- table(df[,c('User','Destination')]); data.frame(User=rownames(x),User_Price=df[match(rownames(x),df$User),'User_Price'],unclass(x)); };
jazzurro <- function(foo) { count(foo, User, User_Price, Destination) %>% spread(key = Destination, value = n, fill = 0); };
akrun <- function(foo) dcast(setDT(foo),User + User_Price ~ Destination, length, value.var="Destination");
## OP's test case
df <- data.frame(Destination=c('A','A','B','B','C','C'),User=c('a','b','c','a','b','d'),User_Price=c(5L,4L,6L,5L,4L,7L));
dt <- as.data.table(df);
ex <- bgoldst(df); o <- names(ex); us <- ex$User;
all.equal(ex,thelatemail(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,jazzurro(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,as.data.frame(akrun(dt))[us,o],check.attributes=F);
## [1] TRUE
microbenchmark(bgoldst(df),thelatemail(df),jazzurro(df),akrun(dt));
## Unit: microseconds
## expr min lq mean median uq max neval
## bgoldst(df) 1767.488 1897.281 2021.7741 1943.894 2035.6260 5227.196 100
## thelatemail(df) 473.412 536.063 574.4233 578.186 608.1225 738.129 100
## jazzurro(df) 2707.468 2914.666 3145.7258 3032.270 3160.3515 5677.514 100
## akrun(dt) 4403.964 4721.069 5026.5023 4875.238 5028.1230 7703.303 100
## scale test
set.seed(1L);
ND <- 1e3L; NU <- 1e3L; NR <- 1e4L;
dests <- sample(make.unique(rep(LETTERS,len=ND)),NR,T);
us <- make.unique(rep(letters,len=NU));
users <- ave(dests,dests,FUN=function(x) sample(us,length(x)));
prices <- ave(seq_along(users),users,FUN=function(x) rep(sample(1:9,1L),len=length(x)));
df <- data.frame(Destination=dests,User=users,User_Price=prices);
dt <- as.data.table(df);
ex <- bgoldst(df); o <- names(ex); us <- ex$User;
all.equal(ex,thelatemail(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,jazzurro(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,as.data.frame(akrun(dt))[us,o],check.attributes=F);
## [1] TRUE
microbenchmark(bgoldst(df),thelatemail(df),jazzurro(df),akrun(dt),times=10L);
## Unit: milliseconds
## expr min lq mean median uq max neval
## bgoldst(df) 1381.46461 1418.13922 1445.20568 1437.82683 1474.79075 1538.37153 10
## thelatemail(df) 31.84727 37.56498 57.47417 44.54106 82.39749 92.63933 10
## jazzurro(df) 79.18924 91.20755 117.20360 126.22693 136.13885 168.26623 10
## akrun(dt) 52.06625 59.02158 79.59568 70.09136 106.93019 130.31208 10
## scale test 2
set.seed(1L);
ND <- 1e4L; NU <- 1e4L; NR <- 1e6L;
dests <- sample(make.unique(rep(LETTERS,len=ND)),NR,T);
us <- make.unique(rep(letters,len=NU));
users <- ave(dests,dests,FUN=function(x) sample(us,length(x)));
prices <- ave(seq_along(users),users,FUN=function(x) rep(sample(1:9,1L),len=length(x)));
df <- data.frame(Destination=dests,User=users,User_Price=prices);
dt <- as.data.table(df);
ex <- bgoldst(df); o <- names(ex); us <- ex$User;
all.equal(ex,thelatemail(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,jazzurro(df)[us,o],check.attributes=F);
## [1] TRUE
all.equal(ex,as.data.frame(akrun(dt))[us,o],check.attributes=F);
## [1] TRUE
microbenchmark(bgoldst(df),thelatemail(df),jazzurro(df),akrun(dt),times=1L);
## Unit: seconds
## expr min lq mean median uq max neval
## bgoldst(df) 485.849043 485.849043 485.849043 485.849043 485.849043 485.849043 1
## thelatemail(df) 3.377981 3.377981 3.377981 3.377981 3.377981 3.377981 1
## jazzurro(df) 12.858542 12.858542 12.858542 12.858542 12.858542 12.858542 1
## akrun(dt) 4.132785 4.132785 4.132785 4.132785 4.132785 4.132785 1
Another way to achieve the same result is with dcast:
a <- dcast(foo,User + User_Price ~ Destination,fill=0)
Afterwards, change the values in the Destination columns to 0/1 indicators.
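A sketch of that last step (assuming reshape2::dcast, which picks User_Price as the value column, so the Destination columns initially hold prices rather than indicators):
library(reshape2)
a <- dcast(foo, User + User_Price ~ Destination, fill = 0)
# convert the non-zero Destination entries to 1
a[c("A", "B", "C")] <- lapply(a[c("A", "B", "C")], function(v) as.integer(v != 0))
a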
I want to omit rows where NA appears in both of two columns.
I'm familiar with na.omit, is.na, and complete.cases, but can't figure out how to use these to get what I want. For example, I have the following dataframe:
(df <- structure(list(x = c(1L, 2L, NA, 3L, NA),
y = c(4L, 5L, NA, 6L, 7L),
z = c(8L, 9L, 10L, 11L, NA)),
.Names = c("x", "y", "z"),
class = "data.frame",
row.names = c(NA, -5L)))
x y z
1 4 8
2 5 9
NA NA 10
3 6 11
NA 7 NA
and I want to remove only those rows where NA appears in both the x and y columns (excluding anything in z), to give
x y z
1 4 8
2 5 9
3 6 11
NA 7 NA
Does anyone know an easy way to do this? Using na.omit, is.na, or complete.cases is not working.
df[!with(df,is.na(x)& is.na(y)),]
# x y z
#1 1 4 8
#2 2 5 9
#4 3 6 11
#5 NA 7 NA
I benchmarked on a slightly bigger dataset. Here are the results:
set.seed(237)
df <- data.frame(x=sample(c(NA,1:20), 1e6, replace=T), y= sample(c(NA, 1:10), 1e6, replace=T), z= sample(c(NA, 5:15), 1e6,replace=T))
f1 <- function() df[!with(df,is.na(x)& is.na(y)),]
f2 <- function() df[rowSums(is.na(df[c("x", "y")])) != 2, ]
f3 <- function() df[ apply( df, 1, function(x) sum(is.na(x))>1 ), ]
library(microbenchmark)
microbenchmark(f1(), f2(), f3(), unit="relative")
Unit: relative
#expr min lq median uq max neval
# f1() 1.000000 1.000000 1.000000 1.000000 1.000000 100
# f2() 1.044812 1.068189 1.138323 1.129611 0.856396 100
# f3() 26.205272 25.848441 24.357665 21.799930 22.881378 100
dplyr solution
require("dplyr")
df %>% filter_at(.vars = vars(x, y), .vars_predicate = any_vars(!is.na(.)))
This can be modified to take any number of columns using the .vars argument.
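For example (an illustrative sketch that keeps a row as long as at least one of the listed columns is non-NA):
df %>% filter_at(.vars = vars(x, y, z), .vars_predicate = any_vars(!is.na(.)))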
Update: dplyr 1.0.4
df %>%
filter(!if_all(c(x, y), is.na))
See similar answer: https://stackoverflow.com/a/66136167/6105259
You can use apply to flag the rows to drop:
sel <- apply( df, 1, function(x) sum(is.na(x))>1 )
Then you can select with that (negated, since sel marks the rows to remove):
df[ !sel, ]
To ignore the z column, just omit it from the apply:
sel <- apply( df[,c("x","y")], 1, function(x) sum(is.na(x))>1 )
If all of them have to be NA, just change the function up a little:
sel <- apply( df[,c("x","y")], 1, function(x) all(is.na(x)) )
The other solutions here are more specific to this particular problem, but apply is worth learning as it solves many other problems. The cost is speed (usual caveats about small datasets and speed testing apply):
> microbenchmark( df[!with(df,is.na(x)& is.na(y)),], df[rowSums(is.na(df[c("x", "y")])) != 2, ], df[ apply( df, 1, function(x) sum(is.na(x))>1 ), ] )
Unit: microseconds
expr min lq median uq max neval
df[!with(df, is.na(x) & is.na(y)), ] 67.148 71.5150 76.0340 86.0155 1049.576 100
df[rowSums(is.na(df[c("x", "y")])) != 2, ] 132.064 139.8760 145.5605 166.6945 498.934 100
df[apply(df, 1, function(x) sum(is.na(x)) > 1), ] 175.372 184.4305 201.6360 218.7150 321.583 100
Use rowSums with is.na, like this:
> df[rowSums(is.na(df[c("x", "y")])) != 2, ]
x y z
1 1 4 8
2 2 5 9
4 3 6 11
5 NA 7 NA
Jumping on the benchmarking wagon, and demonstrating what I was referring to about this being a fairly easy-to-generalize solution, consider the following:
## Sample data with 10 columns and 1 million rows
set.seed(123)
df <- data.frame(replicate(10, sample(c(NA, 1:20),
1e6, replace = TRUE)))
First, here's what things look like if you're just interested in two columns. Both solutions are pretty legible and short. Speed is quite close.
f1 <- function() {
    df[!with(df, is.na(X1) & is.na(X2)), ]
}
f2 <- function() {
    df[rowSums(is.na(df[1:2])) != 2, ]
}
library(microbenchmark)
microbenchmark(f1(), f2(), times = 20)
# Unit: milliseconds
# expr min lq median uq max neval
# f1() 745.8378 1100.764 1128.047 1199.607 1310.236 20
# f2() 784.2132 1101.695 1125.380 1163.675 1303.161 20
Next, let's look at the same problem, but this time, we are considering NA values across the first 5 columns. At this point, the rowSums approach is slightly faster and the syntax does not change much.
f1_5 <- function() {
    df[!with(df, is.na(X1) & is.na(X2) & is.na(X3) &
                 is.na(X4) & is.na(X5)), ]
}
f2_5 <- function() {
    df[rowSums(is.na(df[1:5])) != 5, ]
}
microbenchmark(f1_5(), f2_5(), times = 20)
# Unit: seconds
# expr min lq median uq max neval
# f1_5() 1.275032 1.294777 1.325957 1.368315 1.572772 20
# f2_5() 1.088564 1.169976 1.193282 1.225772 1.275915 20
This is also the very basic dplyr solution:
library(dplyr)
df %>%
filter(!(is.na(x) & is.na(y)))
x y z
1 1 4 8
2 2 5 9
3 3 6 11
4 NA 7 NA