R Sum every k columns in matrix - r

I have a matrix temp1 (dimensions Nx16) (generally, NxM)
I would like to sum every k columns in each row to one value.
Here is what I got to so far:
cbind(rowSums(temp1[,c(1:4)]), rowSums(temp1[,c(5:8)]), rowSums(temp1[,c(9:12)]), rowSums(temp1[,c(13:16)]))
There must be a more elegant (and generalized) method to do it.
I have noticed similar question here:
sum specific columns among rows
couldn't make it work with Ananda's solution;
Got following error:
sapply(split.default(temp1, 0:(length(temp1)-1) %/% 4), rowSums)
Error in FUN(X[[1L]], ...) :
'x' must be an array of at least two dimensions
Please advise.

You can use by:
do.call(cbind, by(t(temp1), (seq(ncol(temp1)) - 1) %/% 4, FUN = colSums))

If the dimensions are equal for the sub matrices, you could change the dimensions to an array and then do the rowSums
m1 <- as.matrix(temp1)
n <- 4
dim(m1) <- c(nrow(m1), ncol(m1)/n, n)
res <- matrix(rowSums(apply(m1, 2, I)), ncol=n)
identical(res[,1],rowSums(temp1[,1:4]))
#[1] TRUE
Or if the dimensions are unequal
t(sapply(seq(1,ncol(temp2), by=4), function(i) {
indx <- i:(i+3)
rowSums(temp2[indx[indx <= ncol(temp2)]])}))
data
set.seed(24)
temp1 <- as.data.frame(matrix(sample(1:20, 16*4, replace=TRUE), ncol=16))
set.seed(35)
temp2 <- as.data.frame(matrix(sample(1:20, 17*4, replace=TRUE), ncol=17))

Another possibility:
x1<-sapply(1:(ncol(temp1)/4),function(x){rowSums(temp1[,1:4+(x-1)*4])})
## check
x0<-cbind(rowSums(temp1[,c(1:4)]), rowSums(temp1[,c(5:8)]), rowSums(temp1[,c(9:12)]), rowSums(temp1[,c(13:16)]))
identical(x1,x0)
# TRUE

Here's another approach. Convert the matrix to an array and then use apply with sum.
n <- 4
apply(array(temp1, dim=c(dim(temp1)/c(1,n), n)), MARGIN=c(1,3), FUN=sum)
Using #akrun's data
set.seed(24)
temp1 <- matrix(sample(1:20, 16*4, replace=TRUE), ncol=16)

a function which sums matrix columns with each group of size n columns
set.seed(1618)
mat <- matrix(rnorm(24 * 16), 24, 16)
f <- function(mat, n = 4) {
if (ncol(mat) %% n != 0)
stop()
cols <- split(colSums(mat), rep(1:(ncol(mat) / n), each = n))
## or use this to have n mean the number of groups you want
# cols <- split(colSums(mat), rep(1:n, each = ncol(mat) / n))
sapply(cols, sum)
}
f(mat, 4)
# 1 2 3 4
# -17.287137 -1.732936 -5.762159 -4.371258
c(sum(mat[,1:4]), sum(mat[,5:8]), sum(mat[,9:12]), sum(mat[,13:16]))
# [1] -17.287137 -1.732936 -5.762159 -4.371258
More examples:
## first 8 and last 8 cols
f(mat, 8)
# 1 2
# -19.02007 -10.13342
## each group is 16 cols, ie, the entire matrix
f(mat, 16)
# 1
# -29.15349
sum(mat)
# [1] -29.15349

Related

R: Find set of columns which contain most 1s in matrix of 0 and 1

I have a matrix of 1s and 0s where the rows are individuals and the columns are events. A 1 indicates that an event happened to an individual and a 0 that it did not.
I want to find which set of (in the example) 5 columns/events that cover the most rows/individuals.
Test Data
#Make test data
set.seed(123)
d <- sapply(1:300, function(x) sample(c(0,1), 30, T, c(0.9,0.1)))
colnames(d) <- 1:300
rownames(d) <- 1:30
My attempt
My initial attempt was just based on combining the set of 5 columns with the highest colMeans:
#Get top 5 columns with highest row coverage
col_set <- head(sort(colMeans(d), decreasing = T), 5)
#Have a look the set
col_set
>
197 199 59 80 76
0.2666667 0.2666667 0.2333333 0.2333333 0.2000000
#Check row coverage of the column set
sum(apply(d[,colnames(d) %in% names(col_set)], 1, sum) > 0) / 30 #top 5
>
[1] 0.7
However this set does not cover the most rows. I tested this by pseudo-random sampling 10.000 different sets of 5 columns, and then finding the set with the highest coverage:
#Get 5 random columns using colMeans as prob in sample
##Random sample 10.000 times
set.seed(123)
result <- lapply(1:10000, function(x){
col_set2 <- sample(colMeans(d), 5, F, colMeans(d))
cover <- sum(apply(d[,colnames(d) %in% names(col_set2)], 1, sum) > 0) / 30 #random 5
list(set = col_set2, cover = cover)
})
##Have a look at the best set
result[which.max(sapply(result, function(x) x[["cover"]]))]
>
[[1]]
[[1]]$set
59 169 262 68 197
0.23333333 0.10000000 0.06666667 0.16666667 0.26666667
[[1]]$cover
[1] 0.7666667
The reason for supplying the colMeans to sample is that the columns with the highest coverages are the ones I am most interested in.
So, using pseudo-random sampling I can collect a set of columns with higher coverage than when just using the top 5 columns. However, since my actual data sets are larger than the example I am looking for a more efficient and rational way of finding the set of columns with the highest coverage.
EDIT
For the interested, I decided to microbenchmark the 3 solutions provided:
#Defining G. Grothendieck's coverage funciton outside his solutions
coverage <- function(ix) sum(rowSums(d[, ix]) > 0) / 30
#G. Grothendieck top solution
solution1 <- function(d){
cols <- tail(as.numeric(names(sort(colSums(d)))), 20)
co <- combn(cols, 5)
itop <- which.max(apply(co, 2, coverage))
co[, itop]
}
#G. Grothendieck "Older solution"
solution2 <- function(d){
require(lpSolve)
ones <- rep(1, 300)
res <- lp("max", colSums(d), t(ones), "<=", 5, all.bin = TRUE, num.bin.solns = 10)
m <- matrix(res$solution[1:3000] == 1, 300)
cols <- which(rowSums(m) > 0)
co <- combn(cols, 5)
itop <- which.max(apply(co, 2, coverage))
co[, itop]
}
#user2554330 solution
bestCols <- function(d, n = 5) {
result <- numeric(n)
for (i in seq_len(n)) {
result[i] <- which.max(colMeans(d))
d <- d[d[,result[i]] != 1,, drop = FALSE]
}
result
}
#Benchmarking...
microbenchmark::microbenchmark(solution1 = solution1(d),
solution2 = solution2(d),
solution3 = bestCols(d), times = 10)
>
Unit: microseconds
expr min lq mean median uq max neval
solution1 390811.850 497155.887 549314.385 578686.3475 607291.286 651093.16 10
solution2 55252.890 71492.781 84613.301 84811.7210 93916.544 117451.35 10
solution3 425.922 517.843 3087.758 589.3145 641.551 25742.11 10
This looks like a relatively hard optimization problem, because of the ways columns interact. An approximate strategy would be to pick the column with the highest mean; then delete the rows with ones in that column, and repeat. You won't necessarily find the best solution this way, but you should get a fairly good one.
For example,
set.seed(123)
d <- sapply(1:300, function(x) sample(c(0,1), 30, T, c(0.9,0.1)))
colnames(d) <- 1:300
rownames(d) <- 1:30
bestCols <- function(d, n = 5) {
result <- numeric(n)
for (i in seq_len(n)) {
result[i] <- which.max(colMeans(d))
d <- d[d[,result[i]] != 1,, drop = FALSE]
}
cat("final dim is ", dim(d))
result
}
col_set <- bestCols(d)
sum(apply(d[,colnames(d) %in% col_set], 1, sum) > 0) / 30 #top 5
This gives 90% coverage.
The following provides a heuristic to find an approximate solution. Find the N=20 columns, say, with the most ones, cols, and then use brute force to find every subset of 5 columns out of those 20. The subset having the highest coverage is shown below and its coverage is 93.3%.
coverage <- function(ix) sum(rowSums(d[, ix]) > 0) / 30
N <- 20
cols <- tail(as.numeric(names(sort(colSums(d)))), N)
co <- combn(cols, 5)
itop <- which.max(apply(co, 2, coverage))
co[, itop]
## [1] 90 123 197 199 286
coverage(co[, itop])
## [1] 0.9333333
Repeating this for N=5, 10, 15 and 20 we get coverages of 83.3%, 86.7%, 90% and 93.3%. The higher the N the better the coverage but the lower the N the less the run time.
Older solution
We can approximate the problem with a knapsack problem that chooses the 5 columns with largest numbers of ones using integer linear programming.
We get the 10 best solutions to this approximate problem, get all columns which are in at least one of the 10 solutions. There are 14 such columns and we then use brute force to find which subset of 5 of the 14 columns has highest coverage.
library(lpSolve)
ones <- rep(1, 300)
res <- lp("max", colSums(d), t(ones), "<=", 5, all.bin = TRUE, num.bin.solns = 10)
coverage <- function(ix) sum(rowSums(d[, ix]) > 0) / 30
# each column of m is logical 300-vector defining possible soln
m <- matrix(res$solution[1:3000] == 1, 300)
# cols is the set of columns which are in any of the 10 solutions
cols <- which(rowSums(m) > 0)
length(cols)
## [1] 14
# use brute force to find the 5 best columns among cols
co <- combn(cols, 5)
itop <- which.max(apply(co, 2, coverage))
co[, itop]
## [1] 90 123 197 199 286
coverage(co[, itop])
## [1] 0.9333333
You can try to test if there is a better column and exchange this with the one currently in the selection.
n <- 5 #Number of columns / events
i <- rep(1, n)
for(k in 1:10) { #How many times itterate
tt <- i
for(j in seq_along(i)) {
x <- +(rowSums(d[,i[-j]]) > 0)
i[j] <- which.max(colSums(x == 0 & d == 1))
}
if(identical(tt, i)) break
}
sort(i)
#[1] 90 123 197 199 286
mean(rowSums(d[,i]) > 0)
#[1] 0.9333333
Taking into account, that the initial condition influences the result you can take random starts.
n <- 5 #Number of columns / events
x <- apply(d, 2, function(x) colSums(x == 0 & d == 1))
diag(x) <- -1
idx <- which(!apply(x==0, 1, any))
x <- apply(d, 2, function(x) colSums(x != d))
diag(x) <- -1
x[upper.tri(x)] <- -1
idx <- unname(c(idx, which(apply(x==0, 1, any))))
res <- sample(idx, n)
for(l in 1:100) {
i <- sample(idx, n)
for(k in 1:10) { #How many times itterate
tt <- i
for(j in seq_along(i)) {
x <- +(rowSums(d[,i[-j]]) > 0)
i[j] <- which.max(colSums(x == 0 & d == 1))
}
if(identical(tt, i)) break
}
if(sum(rowSums(d[,i]) > 0) > sum(rowSums(d[,res]) > 0)) res <- i
}
sort(res)
#[1] 90 123 197 199 286
mean(rowSums(d[,res]) > 0)
#[1] 0.9333333

Extract the combinations of cells without repeating the index

I am trying to calculate the combinations of elements of a matrix but each element should appear only once.
The (real) matrix is symmetric, and can have more then 5 elements (up to ~2000):
o <- matrix(runif(25), ncol = 5, nrow = 5)
dimnames(o) <- list(LETTERS[1:5], LETTERS[1:5])
# A B C D E
# A 0.4400317 0.1715681 0.7319108946 0.3994685 0.4466997
# B 0.5190471 0.1666164 0.3430245044 0.3837903 0.9322599
# C 0.3249180 0.6122229 0.6312876740 0.8017402 0.0141673
# D 0.1641411 0.1581701 0.0001703419 0.7379847 0.8347536
# E 0.4853255 0.5865909 0.6096330935 0.8749807 0.7230507
I desire to calculate the product of all the combinations of pairs (If possible it should appear all elements:AB, CD, EF if the matrix is of 6 elements), where for each pair one letter is the column, the other one is the row. Here are some combinations:
AB, CD, E
AC, BD, E
AD, BC, E
AE, BC, D
AE, BD, C
Where the value of the single element is just 1.
Combinations not desired:
AB, BC: Element B appears twice
AB, AC: Element A appears twice
Things I tried:
I thought about removing the unwanted part of the matrix:
out <- which(upper.tri(o), arr.ind = TRUE)
out <- cbind.data.frame(out, value = o[upper.tri(o)])
out[, 1] <- colnames(o)[out[, 1]]
out[, 2] <- colnames(o)[out[, 2]]
# row col value
# 1 A B 0.1715681
# 2 A C 0.7319109
# 3 B C 0.3430245
# 4 A D 0.3994685
# 5 B D 0.3837903
# 6 C D 0.8017402
# 7 A E 0.4466997
# 8 B E 0.9322599
# 9 C E 0.0141673
# 10 D E 0.8347536
My attempt involves the following process:
Make a copy of the matrix (out)
Store first value of the first row.
Remove all the pairs that involve any of the pair.
Select the next pair of the resulting matrix
Repeat until all rows are removed of the matrix
Repeat 2:5 starting from a different row
However, this method has one big problem, it doesn't guarantee that all the combinations are stored, and it could store several times the same combination.
My expected output is a vector, where each element is the product of the values in the cell selected by the combination:
AB, CD: 0.137553
How can I extract all those combinations efficiently?
This might work. I tested this on N elements = 5 and 6.
Note that this is not optimised, and hopefully can provide a framework for you to work from. With a much larger array, I can see steps involving apply and combn being a bottleneck.
The idea here is to generate a collection of unique sets first before calculating the product of the sets from another data.frame that stores values of sets.
Unique sets are identified by counting the number of unique elements in all combination pairs. For example, if N elements = 6, we expect length(unlist(combination)) == 6. The same is true if N elements = 7 (there will only be 3 pairs plus a remainder element). In cases where N elements is odd, we can ignore the remaining, unpaired element since it is constrained by the other elements.
library(dplyr)
library(reshape2)
## some functions
unique_by_n <- function(inlist, N){
## select unique combinations by count
## if unique, expect n = 6 if n elements = 6)
if(N %% 2) N <- N - 1 ## for odd numbers
return(length(unique(unlist(inlist))) == N)
}
get_combs <- function(x,xall){
## format and catches remainder if matrix of odd elements
xu <- unlist(x)
remainder <- setdiff(xall,xu) ## catch remainder if any
xset <- unlist(lapply(x, paste0, collapse=''))
finalset <- c(xset, remainder)
return(finalset)
}
## make dataset
set.seed(0) ## set reproducible example
#o <- matrix(runif(25), ncol = 5, nrow = 5) ## uncomment to test 5
#dimnames(o) <- list(LETTERS[1:5], LETTERS[1:5])
o <- matrix(runif(36), ncol = 6, nrow = 6)
dimnames(o) <- list(LETTERS[1:6], LETTERS[1:6])
o[lower.tri(o)] <- t(o)[lower.tri(o)] ## make matrix symmetric
n_elements = nrow(o)
#### get matrix
dat <- melt(o, varnames = c('Rw', 'Cl'), as.is = TRUE)
dat$Set <- apply(dat, 1, function(x) paste0(sort(unique(x[1:2])), collapse = ''))
## get unique sets (since your matrix is symmetric)
dat <- subset(dat, !duplicated(Set))
#### get sets
elements <- rownames(o)
allpairs <- expand.grid(Rw = elements, Cl = elements) %>%
filter(Rw != Cl) ## get all pairs
uniqpairsgrid <- unique(t(apply(allpairs,1,sort)))
uniqpairs <- split(uniqpairsgrid, seq(nrow(uniqpairsgrid))) ## get unique pairs
allpaircombs <- combn(uniqpairs,floor(n_elements/2)) ## get combinations of pairs
uniqcombs <- allpaircombs[,apply(allpaircombs, 2, unique_by_n, N = n_elements)] ## remove pairs with repeats
finalcombs <- apply(uniqcombs, 2, get_combs, xall=elements)
#### calculate results
res <- apply(finalcombs, 2, function(x) prod(subset(dat, Set %in% x)$value)) ## calculate product
names(res) <- apply(finalcombs, 2, paste0, collapse=',') ## add names
resdf <- data.frame(Sets = names(res), Products = res, stringsAsFactors = FALSE, row.names = NULL)
print(resdf)
#> Sets Products
#> 1 AB,CD,EF 0.130063454
#> 2 AB,CE,DF 0.171200062
#> 3 AB,CF,DE 0.007212619
#> 4 AC,BD,EF 0.012494787
#> 5 AC,BE,DF 0.023285088
#> 6 AC,BF,DE 0.001139712
#> 7 AD,BC,EF 0.126900247
#> 8 AD,BE,CF 0.158919605
#> 9 AD,BF,CE 0.184631344
#> 10 AE,BC,DF 0.042572488
#> 11 AE,BD,CF 0.028608495
#> 12 AE,BF,CD 0.047056905
#> 13 AF,BC,DE 0.003131029
#> 14 AF,BD,CE 0.049941770
#> 15 AF,BE,CD 0.070707311
Created on 2018-07-23 by the [reprex package](http://reprex.tidyverse.org) (v0.2.0.9000).
Maybe the following does what you want.
Note that I was more interested in being right than in performance.
Also, I have set the RNG seed, to have reproducible results.
set.seed(9840) # Make reproducible results
o <- matrix(runif(25), ncol = 5, nrow = 5)
dimnames(o) <- list(LETTERS[1:5], LETTERS[1:5])
cmb <- combn(LETTERS[1:5], 2)
n <- ncol(cmb)
res <- NULL
nms <- NULL
for(i in seq_len(n)){
for(j in seq_len(n)[-seq_len(i)]){
x <- unique(c(cmb[, i], cmb[, j]))
if(length(x) == 4){
res <- c(res, o[cmb[1, i], cmb[2, i]] * o[cmb[1, j], cmb[2, j]])
nms <- c(nms, paste0(cmb[1, i], cmb[2, i], '*', cmb[1, j], cmb[2, j]))
}
}
}
names(res) <- nms
res

Multiply values of column with itself in R

I am trying to multiply elements of column with itself but am unable to do it.
I have column A with values a, b, c, I want answer as (a*b + a*c + b*c).
For example, with
A <- c(2, 3, 5) the expected output is sum(6 + 10 + 15) = 31.
I am trying to run for loop to execute but was failing. Can anyone please provide R code to do this.
example data :
df1 <- data.frame(A=c(2,3,5))
combn will give you the combinations
combinations <- combn(df1$A,2)
# [,1] [,2] [,3]
# [1,] 2 2 3
# [2,] 3 5 5
apply with margin 2 (by columns), will do the multiplication
multiplied_terms <- apply(combinations,2,function(x) x[1]*x[2])
# [1] 6 10 15
Or shorter and more general, thanks to #zacdav :
multiplied_terms <- apply(combinations,2,prod)
then we can sum them
output <- sum(multiplied_terms)
# [1] 31
Piped for a compact solution:
library(magrittr)
df1$A %>% combn(2) %>% apply(2,prod) %>% sum
Here's another way. Approach by #Moody_Mudskipper maybe easier to extend to groups of 3 etc. But, I think this should be much faster since there isn't the need to actually find the combinations.
Using for loop
It just goes through the vector A multiplying the rest of the elements until the last one.
len <- length(A)
res <- numeric(0)
for (j in seq_len(len - 1))
res <- res + sum(A[j] * A[(j+1) : len]))
res
#[1] 31
Using lapply or sapply
The for loop can be replaced by using lapply
res <- sum(unlist(lapply(1 : (len - 1), function(j) sum(A[j] * A[(j+1) : len]))))
or sapply,
res <- sum(sapply(1 : (len - 1), function(j) sum(A[j] * A[(j+1) : len])))
I didn't check which of these is the fastest.
# If you need to store the pairwise multiplications, then use the following;
# res <- NULL
# for (j in 1 : (len-1))
# res <- c(res, A[j] * A[(j+1) : len])
# res
# [1] 6 10 15
# sum(res)
# [1] 31

compare the information between two matrices R

I have two matrices, one is generated out of the other by deleting some rows. For example:
m = matrix(1:18, 6, 3)
m1 = m[c(-1, -3, -6),]
Suppose I do not know which rows in m were eliminated to create m1, how should I find it out by comparing the two matrices? The result I want looks like this:
1, 3, 6
The actual matrix I am dealing with is very big. I was wondering if there is any efficient way of conducting it.
Here are some approaches:
1) If we can assume that there are no duplicated rows in m -- this is the case in the example in the question -- then:
which(tail(!duplicated(rbind(m1, m)), nrow(m)))
## [1] 1 3 6
2) Transpose m and m1 giving tm and tm1 since it is more efficient to work on columns than rows.
Define match_indexes(i) which returns a vector r such that each row in m[r, ] matches m1[i, ].
Apply that to each i in 1:n1 and remove the result from 1:n.
n <- nrow(m); n1 <- nrow(m1)
tm <- t(m); tm1 <- t(m1)
match_indexes <- function(i) which(colSums(tm1[, i] == tm) == n1)
setdiff(1:n, unlist(lapply(1:n1, match_indexes)))
## [1] 1 3 6
3) Calculate an interaction vector for each matrix and then use setdiff and finally match to get the indexes:
i <- interaction(as.data.frame(m))
i1 <- interaction(as.data.frame(m1))
match(setdiff(i, i1), i)
## [1] 1 3 6
Added If there can be duplicates in m then (1) and (3) will only return the first of any multiply occurring row in m not in m1.
m <- matrix(1:18, 6, 3)
m1 <- m[c(2, 4, 5),]
m <- rbind(m, m[1:2, ])
# 1
which(tail(!duplicated(rbind(m1, m)), nrow(m)))
## 1 3 6
# 2
n <- nrow(m); n1 <- nrow(m1)
tm <- t(m); tm1 <- t(m1)
match_indexes <- function(i) which(colSums(tm1[, i] == tm) == n1)
setdiff(1:n, unlist(lapply(1:n1, match_indexes)))
## 1 3 6 7
# 3
i <- interaction(as.data.frame(m))
i1 <- interaction(as.data.frame(m1))
match(setdiff(i, i1), i)
## 1 3 6
A possible way is to represent each row as a string:
x1 <- apply(m, 1, paste0, collapse = ';')
x2 <- apply(m1, 1, paste0, collapse = ';')
which(!x1 %in% x2)
# [1] 1 3 6
Some benchmark with a large matrix using my solution and G. Grothendieck's solutions:
set.seed(123)
m <- matrix(rnorm(20000 * 5000), nrow = 20000)
m1 <- m[-sample.int(20000, 1000), ]
system.time({
which(tail(!duplicated(rbind(m1, m)), nrow(m)))
})
# user system elapsed
# 339.888 2.368 342.204
system.time({
x1 <- apply(m, 1, paste0, collapse = ';')
x2 <- apply(m1, 1, paste0, collapse = ';')
which(!x1 %in% x2)
})
# user system elapsed
# 395.428 0.568 395.955
system({
n <- nrow(m); n1 <- nrow(m1)
tm <- t(m); tm1 <- t(m1)
match_indexes <- function(i) which(colSums(tm1[, i] == tm) == n1)
setdiff(1:n, unlist(lapply(1:n1, match_indexes)))
})
# > 15 min, not finish
system({
i <- interaction(as.data.frame(m))
i1 <- interaction(as.data.frame(m1))
match(setdiff(i, i1), i)
})
# run out of memory. My 32G RAM machine crashed.
We can also use do.call
which(!do.call(paste, as.data.frame(m)) %in% do.call(paste, as.data.frame(m1)))
#[1] 1 3 6

extract unique rows with a condition in r

I have this kind of data:
x <- matrix(c(2,2,3,3,3,4,4,20,33,2,3,45,6,9,45,454,7,4,6,7,5), nrow = 7, ncol = 3)
In the real dataset, I have a huge matrix with a lot of columns.
I want to extract unique rows with respect to the first column(Id) and minimum of the third column. For instance, for this matrix I would expect
y <- matrix(c(2,3,4,20,3,9,45,4,5), nrow = 3, ncol = 3)
I tried a lot of things but I couldn't figure out.
Any help is appreciated.
Thanks in advance,
Zeray
Here's a version that is more complicated, but somewhat faster that Chase's ddply solution - some 200x faster :-)
uniqueMin <- function(m, idCol = 1L, minCol = ncol(m)) {
t(vapply(split(1:nrow(m), m[,idCol]), function(i, x, minCol) x[i, , drop=FALSE][which.min(x[i,minCol]),], m[1,], x=m, minCol=minCol))
}
And the following test code:
nRows <- 10000
nCols <- 100
ids <- nRows/5
m <- cbind(sample(ids, nRows, T), matrix(runif(nRows*nCols), nRows))
system.time( a<-uniqueMin(m, minCol=3L) ) # 0.07
system.time(ddply(as.data.frame(m), "V1", function(x) x[which.min(x$V3) ,])) # 15.72
You can use package plyr. Convert to a data.frame so you can group on the first column, then use which.min to extract the min row by group:
library(plyr)
ddply(as.data.frame(x), "V1", function(x) x[which.min(x$V3) ,])
V1 V2 V3
1 2 20 45
2 3 3 4
3 4 9 5

Resources