Related
This is a combination problem. I have 10 shops. I want to find the best 8 shops which minimize the sum of distances from my 100 observations.
From the combination matrix 'test_comb_matrix', I wish to extract combinations of 8 at each iteration of choose(10, 8).
I then apply those indices to the distance matrix 'test_dist_matrix' and record distances. I use pmin() to find the closest shop for each observation, then record the minimum in myminCol.
Below is my code (scroll down for reproducible code). I want to remove the 'a to h' bit.
for(i in 1:nrow(testDat))
{
print(i)
# get indices from combination matrix
a <- test_comb_matrix[1, i]
b <- test_comb_matrix[2, i]
c <- test_comb_matrix[3, i]
d <- test_comb_matrix[4, i]
e <- test_comb_matrix[5, i]
f <- test_comb_matrix[6, i]
g <- test_comb_matrix[7, i]
h <- test_comb_matrix[8, i]
# find the minimum
myminCol <- as.vector(pmin(test_dist_matrix[, a], test_dist_matrix[, b],
test_dist_matrix[, c], test_dist_matrix[, d],
test_dist_matrix[, e], test_dist_matrix[, f],
test_dist_matrix[, g], test_dist_matrix[, h]))
# sum distances
mySum <- sum(myminCol)
testDat[i, 1] <- mySum
}
Reproducible code:
# number of combinations from 10 choose 8
n <- choose(10, 8)
# get combination matrix
test_comb_matrix <- combn(1:10, 8)
# view first 5 combinations
test_comb_matrix[, 1:5]
# create distance matrix for 100 observations and 10 columns
test_dist_matrix <- data.frame(matrix(rnorm(100), nrow = 100, ncol = 10))
testDat <- data.frame(matrix(NA, nrow = n, ncol = 1))
names(testDat) <- "min"
Try using this :
result_vec <- sapply(seq_len(nrow(testDat)), function(i)
sum(matrixStats::rowMins(as.matrix(test_dist_matrix[, test_comb_matrix[, i]]))))
I have a piece of working code that is taking too many hours (days?) to compute.
I have a sparse matrix of 1s and 0s, I need to subtract each row from any other row, in all possible combinations, multiply the resulting vector by another vector, and finally average the values in it so to get a single scalar which I need to insert in a matrix. What I have is:
m <- matrix(
c(0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0), nrow=4,ncol=4,
byrow = TRUE)
b <- c(1,2,3,4)
for (j in 1:dim(m)[1]){
for (i in 1:dim(m)[1]){
a <- m[j,] - m[i,]
a[i] <- 0L
a[a < 0] <- 0L
c <- a*b
d[i,j] <- mean(c[c > 0])
}
}
The desired output is matrix with the same dimensions of m, where each entry is the result of these operations.
This loop works, but are there any ideas on how to make this more efficient? Thank you
My stupid solution is to use apply or sapply function, instead of for loop to do the iterations:
sapply(1:dim(m)[1], function(k) {z <- t(apply(m, 1, function(x) m[k,]-x)); diag(z) <- 0; z[z<0] <- 0; apply(t(apply(z, 1, function(x) x*b)),1,function(x) mean(x[x>0]))})
I tried to compare your solution and this in terms of running time in my computer, yours takes
t1 <- Sys.time()
d1 <- m
for (j in 1:dim(m)[1]){
for (i in 1:dim(m)[1]){
a <- m[j,] - m[i,]
a[i] <- 0L
a[a < 0] <- 0L
c <- a*b
d1[i,j] <- mean(c[c > 0])
}
}
Sys.time()-t1
Yours needs Time difference of 0.02799988 secs. For mine, it is reduced a bit but not too much, i.e., Time difference of 0.01899815 secs, when you run
t2 <- Sys.time()
d2 <- sapply(1:dim(m)[1], function(k) {z <- t(apply(m, 1, function(x) m[k,]-x)); diag(z) <- 0; z[z<0] <- 0; apply(t(apply(z, 1, function(x) x*b)),1,function(x) mean(x[x>0]))})
Sys.time()-t2
You can try it on your own computer with larger matrix, good luck!
1) create test sparse matrix:
nc <- nr <- 100
p <- 0.001
require(Matrix)
M <- Matrix(0L, nr, nc, sparse = T) # 0 matrix
n1 <- ceiling(p * (prod(dim(M)))) # 1 count
M[1:n1] <- 1L # fill only first column, to approximate max non 0 row count
# (each row has at maximum 1 positive element)
sum(M)/(prod(dim(M)))
b <- 1:ncol(M)
sum(rowSums(M))
So, if the proportion given is correct then we have at most 10 rows that contain non 0 elements
Based on this fact and your supplied calculations:
# a <- m[j, ] - m[i, ]
# a[i] <- 0L
# a[a < 0] <- 0L
# c <- a*b
# mean(c[c > 0])
we can see that the result will be meaningful only form[, j] rows which have at least 1 non 0 element
==> we can skip calculations for all m[, j] which contain only 0s, so:
minem <- function() { # write as function
t1 <- proc.time() # timing
require(data.table)
i <- CJ(1:nr, 1:nr) # generate all combinations
k <- rowSums(M) > 0L # get index where at least 1 element is greater that 0
i <- i[data.table(V1 = 1:nr, k), on = 'V1'] # merge
cat('at moust', i[, sum(k)/.N*100], '% of rows needs to be calculated \n')
i[k == T, rowN := 1:.N] # add row nr for 0 subset
i2 <- i[k == T] # subset only those indexes who need calculation
a <- M[i2[[1]],] - M[i2[[2]],] # operate on all combinations at once
a <- drop0(a) # clean up 0
ids <- as.matrix(i2[, .(rowN, V2)]) # ids for 0 subset
a[ids] <- 0L # your line: a[i] <- 0L
a <- drop0(a) # clean up 0
a[a < 0] <- 0L # the same as your line
a <- drop0(a) # clean up 0
c <- t(t(a)*b) # multiply each row with vector
c <- drop0(c) # clean up 0
c[c < 0L] <- 0L # for mean calculation
c <- drop0(c) # clean up 0
r <- rowSums(c)/rowSums(c > 0L) # row means
i[k == T, result := r] # assign results to data.table
i[is.na(result), result := NaN] # set rest to NaN
d2 <- matrix(i$result, nr, nr, byrow = F) # create resulting matrix
t2 <- proc.time() # timing
cat(t2[3] - t1[3], 'sec \n')
d2
}
d2 <- minem()
# at most 10 % of rows needs to be calculated
# 0.05 sec
Test on smaller example if results matches
d <- matrix(NA, nrow(M), ncol(M))
for (j in 1:dim(M)[1]) {
for (i in 1:dim(M)[1]) {
a <- M[j, ] - M[i, ]
a[i] <- 0L
a[a < 0] <- 0L
c <- a*b
d[i, j] <- mean(c[c > 0])
}
}
all.equal(d, d2)
Can we get results for your real data size?:
# generate data:
nc <- nr <- 6663L
b <- 1:nr
p <- 0.0001074096 # proportion of 1s
M <- Matrix(0L, nr, nc, sparse = T) # 0 matrix
n1 <- ceiling(p * (prod(dim(M)))) # 1 count
M[1:n1] <- 1L
object.size(as.matrix(M))/object.size(M)
# storing this data in usual matrix uses 4000+ times more memory
# calculation:
d2 <- minem()
# at most 71.57437 % of rows needs to be calculated
# 28.33 sec
So you need to convert your matrix to sparse one with
M <- Matrix(m, sparse = T)
I have a classic dice simulation problem, which I'm struggling to implement since I'm new with R syntax. The function (which I have called simu) works as follows:
Start with 0 points
Simulate n random draws of three six-sided dice
For each draw:
If sum of three dice >12 --> +1 point
If sum of three dice <6 --> -1 point
Otherwise (ie sum between 6 and 12):
If three dice have same number --> +5 points
Otherwise --> 0 points
Return total # of points obtained at the end of n simulations
Having tried a number of different methods I seem to be pretty close with:
simu <- function(n){
k <- 0
for(i in 1:n) {
a <- sample(y,1,replace=TRUE)
b <- sample(y,1,replace=TRUE)
c <- sample(y,1,replace=TRUE)
if ((a + b + c) > 12) {
k <- k+1
} else if ((a + b + c) < 6) {
k <- k-1
} else if ((a == b) & (b == c)) {
k <- k+5
} else k <- 0
}
return(k)
}
The problem seems to be that I am failing to iterate over new simulations (for a, b, c) for each "i" in the function.
I have commented the only issue I have found... The last else that always re-initialize k to 0. Instead it should have been k <- k + 0 but it does not change anything to remove it.
y <- seq(1,6) # 6-sided dice
simu <- function(n){
k <- 0
for(i in 1:n) {
a <- sample(y,1,replace=TRUE)
b <- sample(y,1,replace=TRUE)
c <- sample(y,1,replace=TRUE)
if ((a + b + c) > 12) {
k <- k+1
} else if ((a + b + c) < 6) {
k <- k-1
} else if ((a == b) & (b == c)) {
k <- k+5
} #else k <- 0
}
return(k)
}
The results look quite fine :
> simu(1000)
[1] 297
> simu(100)
[1] 38
If you are going to use R, then you should learn to create vectorized operations instead of 'for' loops. Here is a simulation of 1 million rolls of the dice that took less than 1 second to calculate. I am not sure how long the 'for' loop approach would have taken.
n <- 1000000 # trials
start <- proc.time() # time how long it takes
result <- matrix(0L, ncol = 6, nrow = n)
colnames(result) <- c('d1', 'd2', 'd3', 'sum', 'same', 'total')
# initial the roll of three dice
result[, 1:3] <- sample(6L, n * 3L, replace = TRUE)
# compute row sum
result[, 'sum'] <- as.integer(rowSums(result[, 1:3]))
# check for being the same
result[, 'same'] <- result[,1L] == result[, 2L] & result[, 2L] == result[, 3L]
result[, 'total'] <- ifelse(result[, 'sum'] > 12L,
1L,
ifelse(result[, 'sum'] < 6L,
-1L,
ifelse(result[, 'same'] == 1L,
5L,
0L
)
)
)
table(result[, 'total'])
-1 0 1 5
46384 680762 259083 13771
cat("simulation took:", proc.time() - start, '\n')
simulation took: 0.7 0.1 0.8 NA NA
I am not sure that's what you need, but you can try something like that:
# Draw the dice(s) - returns vector of length == n_dices
draw <- function(sides = 6, dices = 3){
sample(1:sides, dices, replace = T)
}
# test simulation x and return -1, 0, 1, 1 or 5
test <- function(x){
(sum(x) > 12)*1 + (sum(x) < 6)*(-1) + (sum(x) >= 6 &
sum(x) <= 12 &
var(x) == 0)*5
}
# simulate n draws of x dices with y sides
simu <- function(sides = 6, dices = 3, n = 100){
sum(replicate(n, test(draw(sides, dices))))
}
# run simulations of 100 draws for 1, 2, ..., 11, 12-side dices (3 dices each simulation)
dt <- lapply(1:12, function(side) replicate(100, simu(side, 3, 100)))
# plot dicstribution of scores
par(mfrow = c(3,4))
lapply(1:length(dt), function(i) hist(dt[[i]],
main = sprintf("%i sides dice", i),
xlab = "Score"
)
)
I have a data frame with sequences as columns and amino acid sites as rows. I would like to compare the difference between these sequences at each site.
seq1 seq2 seq3 seq4 seq5 seq6 seq7 seq8
1 K E K K A A A A
2 V D A A T A A A
3 W W W W W W W W
4 R R R R R R S R
5 F S F F F Y F F
6 P P P P P P P P
7 N N N C N N N N
8 V I D D Q Q Q Q
9 Q Q Q Q Q Q Q Q
10 E E G G L I S F
11 L L Q L L L L L
12 N N Y Y V V S S
13 N N N N Q Q P P
14 L L L L L L L L
15 T T T T T T T I
Ideally, I would like to be able to have an additional column in my data frame that shows me the sites that are the same in all sequences and those that are the same only between seq1-4 or seq 5-8.
I am not sure what the best way to do this is, and any help is greatly appreciated.
Also, is there a way to add another column that shows the types of amino acids observed at each site?
Thanks in advance!
I am first getting an array where all columns are same:
allsame <- apply(df,1,function(x){
val <- ifelse(length(unique(x)) == 1,1,0)
})
Next I am getting an an array where either of the column sets are same
startfour <- apply(df[,1:4],1,function(x){
val <- ifelse(length(unique(x)) == 1,1,0)
})
lastfour <- apply(df[,5:8],1,function(x){
val <- ifelse(length(unique(x)) == 1,1,0)
})
gen <- startfour + lastfour
eithersame <- ifelse(gen == 0,0,1)
Finally you can just create a column vector as required and join it to the dataframe using the above 2 arrays
output <- as.character(length(allsame))
for(i in 1:length(allsame)){
if(allsame[i] == 1){
output[i] <- "all same"
}
else if(eithersame[i] == 1){
output[i] <- "either same"
}
else{
output[i] <- "none same"
}
}
df <- cbind(df,output)
Here is a quick and dirty way to create the flags that you mentioned. Assuming the dataframe is called amino:
amino$first_flag<-with(amino,ifelse(seq1==seq2 & seq2==seq3 & seq3 == seq4,"same","diff"))
amino$second_flag<-with(amino,ifelse(seq5==seq6 & seq6==seq7 & seq7 == seq8,"same","diff"))
amino$total_flag<-with(amino,ifelse(first_flag=="same" & second_flag=="same" & seq1==seq5,"same","diff"))
Hopefully that works.
edit: and for your last question, I'm not sure what you mean but if you just want the letters that appear in each row then something like this could work:
for(i in 1:nrow(amino)) amino$types[i]<-paste(unique(amino[i,1:4,drop=TRUE]),collapse=",")
It will give you a column containing a comma separated list of the letters that appeared in each row.
edit2: If you have significantly more than 8 sequences, then a modified form of Ganesh's solution might work better (his output code isn't actually necessary):
amino$first_flag <- apply(amino[,1:4],1,function(x){
ifelse(length(unique(x)) == 1,"same","diff")
})
amino$second_flag <- apply(amino[,5:8],1,function(x){
ifelse(length(unique(x)) == 1,"same","diff")
})
amino$total_flag <- apply(amino[,1:8],1,function(x){
ifelse(length(unique(x)) == 1,"same","diff")
})
amino$types <- apply(amino[,1:8],1,function(x) paste(unique(x),collapse=","))
And for your new question-
amino$one_diff <- apply(amino[,1:8],1,function(x){
ifelse(7 %in% as.data.frame(table(x))[,2,drop=TRUE],"1 diff",NA)
})
This uses the table() function which normally gives you a count based on a vector or a column like table(amino$seq1). Using apply, we instead stick a row of the 8 sequences into it, it returns the counts, then we use as.data.frame and the brackets [] to get rid of some extra table() output that we don't need. The "7 %in%" part means if there are 7 of the same letters then there must be 1 different one. Anything else (i.e., all 8 same or more than 1 difference) will get NA.
Suppose that I have a 20 X 5 matrix, I would like to select subsets of the matrix and do some computation with them. Further suppose that each sub-matrix is 7 X 5. I could of course do
ncomb <- combn(20, 7)
which gives me all possible combinations of 7 row indices, and I can use these to obtain sub-matrices. But with a small, 20 X 5 matrix, there are already 77520 possible combination. So I would like to instead randomly sample some of the combinations, e.g., 5000 of them.
One possibility is the following:
ncomb <- combn(20, 7)
ncombsub <- ncomb[, sample(77520, 5000)]
In other words, I obtain all possible combinations, and then randomly select only 5000 of the combinations. But I imagine it would be problematic to compute all possible combinations if I had a larger matrix - say, 100 X 7.
So I wonder if there is a way to get subsets of combinations without first obtaining all possible combinations.
Your approach:
op <- function(){
ncomb <- combn(20, 7)
ncombsub <- ncomb[, sample(choose(20,7), 5000)]
return(ncombsub)
}
A different strategy that simply samples seven rows from the original matrix 5000 times (replacing any duplicate samples with a new sample until 5000 unique row combinations are found):
me <- function(){
rowsample <- replicate(5000,sort(sample(1:20,7,FALSE)),simplify=FALSE)
while(length(unique(rowsample))<5000){
rowsample <- unique(rowsample)
rowsample <- c(rowsample,
replicate(5000-length(rowsample),
sort(sample(1:20,7,FALSE)),simplify=FALSE))
}
return(do.call(cbind,rowsample))
}
This should be more efficient because it prevents you from having to calculate all of the combinations first, which will get costly as the matrix gets larger.
And yet, some benchmarking reveals that is not the case. At least on this matrix:
library(microbenchmark)
microbenchmark(op(),me())
Unit: milliseconds
expr min lq median uq max neval
op() 184.5998 201.9861 206.3408 241.430 299.9245 100
me() 411.7213 422.9740 429.4767 474.047 490.3177 100
I ended up doing what #Roland suggested, by modifying combn(), and byte-compiling the code:
combn_sub <- function (x, m, nset = 5000, seed=123, simplify = TRUE, ...) {
stopifnot(length(m) == 1L)
if (m < 0)
stop("m < 0", domain = NA)
if (is.numeric(x) && length(x) == 1L && x > 0 && trunc(x) ==
x)
x <- seq_len(x)
n <- length(x)
if (n < m)
stop("n < m", domain = NA)
m <- as.integer(m)
e <- 0
h <- m
a <- seq_len(m)
len.r <- length(r <- x[a] )
count <- as.integer(round(choose(n, m)))
if( count < nset ) nset <- count
dim.use <- c(m, nset)
##-----MOD 1: Change the output matrix size--------------
out <- matrix(r, nrow = len.r, ncol = nset)
if (m > 0) {
i <- 2L
nmmp1 <- n - m + 1L
##----MOD 2: Select a subset of indices
set.seed(seed)
samp <- sort(c(1, sample( 2:count, nset - 1 )))
##----MOD 3: Start a counter.
counter <- 2L
while (a[1L] != nmmp1 ) {
if (e < n - h) {
h <- 1L
e <- a[m]
j <- 1L
}
else {
e <- a[m - h]
h <- h + 1L
j <- 1L:h
}
a[m - h + j] <- e + j
#-----MOD 4: Whenever the counter matches an index in samp,
#a combination of row indices is produced and stored in the matrix `out`
if(samp[i] == counter){
out[, i] <- x[a]
if( i == nset ) break
i <- i + 1L
}
#-----Increase the counter by 1 for each iteration of the while-loop
counter <- counter + 1L
}
}
array(out, dim.use)
}
library("compiler")
comb_sub <- cmpfun(comb_sub)