Finding similar elements within two arrays in R

Is there a faster way to do this? N^2 time just seems terrible.
mergeData <- function(p, c) {
  for (i in 1:length(p[[1]])) {
    for (k in 1:length(c[[1]])) {
      if (toString(c[[k, 46]]) == toString(p[[i, 1]])) {
        # Do stuff here with pairs found
        print(i)
      }
    }
  }
}

row1 <- c[, 46]
row2 <- p[, 1]
x <- data.frame(row = row1, nr1 = seq_along(row1))
y <- data.frame(row = row2, nr2 = seq_along(row2))
same_pairs <- merge(x, y)[c("nr1", "nr2")]
same_pairs now contains the indices of the rows with matching elements.
Complexity: roughly O(length(row1) + length(row2)), instead of the O(n^2) nested loops.
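For illustration, here is a self-contained toy version of that idea; the vectors key_p and key_c below are hypothetical stand-ins for the key columns p[, 1] and c[, 46] of the original data:

# Hypothetical stand-ins for the key columns: p[, 1] and c[, 46]
key_p <- c("a", "b", "c", "d")   # p[, 1]
key_c <- c("b", "d", "x")        # c[, 46]

x <- data.frame(row = key_c, nr1 = seq_along(key_c))
y <- data.frame(row = key_p, nr2 = seq_along(key_p))

same_pairs <- merge(x, y)[c("nr1", "nr2")]
same_pairs
#   nr1 nr2
# 1   1   2    row 1 of c matches row 2 of p ("b")
# 2   2   4    row 2 of c matches row 4 of p ("d")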

Related

R - nested loop alternatives/optimization

I'm currently trying to implement an algorithm in R that requires looping through the rows and columns of a matrix and computing, for every cell, a value based on the values of previously computed cells.
Here is the code that does what I described above; it is part of the Needleman-Wunsch algorithm:
globalSequenceAlignment <- function(seq1, seq2, match, mismatch, gap) {
  # splitting the sequences in order to use them as rows and columns names
  seq1_split <- unlist(strsplit(toString(seq1), ""))
  seq2_split <- unlist(strsplit(toString(seq2), ""))
  len1 <- length(seq1_split)
  len2 <- length(seq2_split)
  # creating the alignment matrix
  alignment_matrix <- matrix(0, nrow = len2 + 1, ncol = len1 + 1)
  colnames(alignment_matrix) <- c("-", seq1_split)
  rownames(alignment_matrix) <- c("-", seq2_split)
  # filling first row and column of the alignment matrix
  for (i in 2:ncol(alignment_matrix)) {
    alignment_matrix[1, i] <- (alignment_matrix[1, i] + (i - 1)) * gap
  }
  for (j in 2:nrow(alignment_matrix)) {
    alignment_matrix[j, 1] <- (alignment_matrix[j, 1] + (j - 1)) * gap
  }
  for (i in 2:ncol(alignment_matrix)) {
    for (j in 2:nrow(alignment_matrix)) {
      horizontal_score <- alignment_matrix[j, i - 1] + gap
      vertical_score <- alignment_matrix[j - 1, i] + gap
      if (colnames(alignment_matrix)[i] == rownames(alignment_matrix)[j]) {
        diagonal_score <- alignment_matrix[j - 1, i - 1] + match
      } else {
        diagonal_score <- alignment_matrix[j - 1, i - 1] + mismatch
      }
      scores <- c(horizontal_score, vertical_score, diagonal_score)
      alignment_matrix[j, i] <- max(scores)
    }
  }
  return(alignment_matrix)
}
a <- 'GAATC'
b <- 'CATACG'
globalSequenceAlignment(a, b, 10,-5,-4)
Using this code I get the result that I want.
The problem is that with matrices of dimensions greater than 500x500 the nested loops become far too slow (running this code on a 500x500 matrix takes roughly 2 minutes).
I know that *apply functions could improve this, but I couldn't manage to use them, since computing each cell requires that the previous cells have already been computed.
I was wondering if there is a way to achieve the same result using *apply functions, or to vectorize this kind of code so that it runs faster in R.
In case anyone ever needs this, I wrote my own solution to this problem using the Rcpp package. The runtime dropped from about 3 minutes for sequences of 500 characters to about 0.3 s.
I am posting the code for the part with the two nested loops shown in the question; I hope it will be useful to someone.
library(Rcpp)
cppFunction('IntegerMatrix rcpp_compute_matrices(IntegerMatrix Am, StringMatrix Dm,
                                                 StringVector seq1, StringVector seq2,
                                                 int gap, int miss, int match) {
  int nrow = Am.nrow(), ncol = Am.ncol();
  for (int i = 1; i < nrow; i++) {
    for (int j = 1; j < ncol; j++) {
      int vertical_score   = Am(i - 1, j) + gap;
      int horizontal_score = Am(i, j - 1) + gap;
      int diagonal_score   = 0;
      if (seq1[j - 1] == seq2[i - 1]) {
        diagonal_score = Am(i - 1, j - 1) + match;
      } else {
        diagonal_score = Am(i - 1, j - 1) + miss;
      }
      IntegerVector score = {vertical_score, horizontal_score, diagonal_score};
      int max_score = max(score);
      Am(i, j) = max_score;
    }
  }
  return Am;
}')
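For completeness, a hedged usage sketch of the compiled function. It assumes the cppFunction() call above has been run; the StringMatrix argument Dm is not used in the shown fragment, so a placeholder matrix is passed, and the first row and column are pre-filled with gap penalties exactly as in the R version:

seq1 <- unlist(strsplit("GAATC", ""))    # columns of the matrix
seq2 <- unlist(strsplit("CATACG", ""))   # rows of the matrix
gap <- -4L; mismatch <- -5L; match_score <- 10L

Am <- matrix(0L, nrow = length(seq2) + 1L, ncol = length(seq1) + 1L)
Am[1, ] <- (seq_len(ncol(Am)) - 1L) * gap   # gap penalties along the first row
Am[, 1] <- (seq_len(nrow(Am)) - 1L) * gap   # gap penalties along the first column
Dm <- matrix("", nrow = nrow(Am), ncol = ncol(Am))  # placeholder, unused in the fragment

filled <- rcpp_compute_matrices(Am, Dm, seq1, seq2, gap, mismatch, match_score)
filled[nrow(filled), ncol(filled)]  # optimal global alignment score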

How to best combine unique and match in R?

I found myself often writing code such as
#' @param x input vector
#' @param ... passed to [slow_fun()]
fast_fun <- function(x, ...) {
  u <- unique(x)
  i <- match(x, u)
  v <- slow_fun(u, ...)
  v[i]
}
in order to accelerate a slow, vectorized, "pure" function where each input entry could in principle be computed individually and where the input is expected to contain many duplicates.
Now I wonder whether this is the best way to achieve such a speedup, or whether there is some function (preferably in base R or the tidyverse) that does something like unique and match at the same time?
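For context, a tiny self-contained illustration of why the pattern pays off; slow_fun here is just an artificial stand-in that sleeps on every element (names and timings are illustrative):

# Artificially slow element-wise function: 10 ms per element
slow_fun <- function(x) {
  vapply(x, function(v) { Sys.sleep(0.01); v^2 }, numeric(1))
}

x <- sample(1:5, 1000, replace = TRUE)   # many duplicates, few distinct values

system.time(brute <- slow_fun(x))   # ~10 s: 1000 slow evaluations
system.time(fast  <- fast_fun(x))   # ~0.05 s: only 5 slow evaluations
identical(brute, fast)              # TRUE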
Benchmarks so far
Thanks for the provided answers. I've written a small benchmark suite to compare the approaches:
method <- list(
  brute = slow_fun,
  unique_match = function(x, ...) {
    u <- unique(x)
    i <- match(x, u)
    v <- slow_fun(u, ...)
    v[i]
  },
  unique_factor = function(x, ...) {
    if (is.character(x)) {
      x <- factor(x)
      i <- as.integer(x)
      u <- levels(x)
    } else {
      u <- unique(x)
      i <- as.integer(factor(x, levels = u))
    }
    v <- slow_fun(u, ...)
    v[i]
  },
  unique_match_df = function(x, ...) {
    u <- unique(x)
    i <- if (is.numeric(x)) {
      match(data.frame(t(round(x, 10))), data.frame(t(round(u, 10))))
    } else {
      match(data.frame(t(x)), data.frame(t(u)))
    }
    v <- slow_fun(u, ...)
    v[i]
  },
  rcpp_uniquify = function(x, ...) {
    iu <- uniquify(x)
    v <- slow_fun(iu[["u"]], ...)
    v[iu[["i"]]]
  }
)
exprs <- lapply(method, function(fun) substitute(fun(x), list(fun = fun)))

settings$bench <- lapply(seq_len(nrow(settings)), function(i) {
  cat("\rBenchmark ", i, " / ", nrow(settings), sep = "")
  x <- switch(
    settings$type[i],
    integer = sample.int(
      n = settings$n_distinct[i],
      size = settings$n_total[i],
      replace = TRUE
    ),
    double = sample(
      x = runif(n = settings$n_distinct[i]),
      size = settings$n_total[i],
      replace = TRUE
    ),
    character = sample(
      x = stringi::stri_rand_strings(
        n = settings$n_distinct[i],
        length = 20L
      ),
      size = settings$n_total[i],
      replace = TRUE
    )
  )
  microbenchmark::microbenchmark(list = exprs)
})
library(tidyverse)
settings %>%
  mutate(bench = map(bench, summary)) %>%
  unnest(bench) %>%
  group_by(n_distinct, n_total, type) %>%
  mutate(score = median / min(median)) %>%
  group_by(expr) %>%
  summarise(mean_score = mean(score)) %>%
  arrange(mean_score)
Currently, the rcpp-based approach is best in all tested settings on my machine but barely manages to exceed the unique-then-match method.
I suspect a greater advantage in performance the longer x becomes, because unique-then-match needs two passes over the data while uniquify() only needs one pass.
|expr | mean_score|
|:---------------|----------:|
|rcpp_uniquify | 1.018550|
|unique_match | 1.027154|
|unique_factor | 5.024102|
|unique_match_df | 36.613970|
|brute | 45.106015|
Maybe you can try factor + as.integer like below
as.integer(factor(x))
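One caveat worth noting: factor() sorts its levels by default, so the resulting index refers to the sorted unique values rather than to unique(x); fixing the levels restores the unique()/match() correspondence, which is also what the unique_factor method in the benchmark above does:

x <- c("b", "a", "b", "c")

as.integer(factor(x))                      # 2 1 2 3  (levels sorted: "a" "b" "c")
match(x, unique(x))                        # 1 2 1 3  (order of first appearance)
as.integer(factor(x, levels = unique(x)))  # 1 2 1 3  (consistent with unique(x))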
I recently found a cool and fast approach:
match(data.frame(t(x)), data.frame(t(y)))
As always, beware when working with floats. I recommend something like
match(data.frame(t(round(x, 10))), data.frame(t(round(y, 10))))
in such cases.
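To illustrate the transpose trick on a small, hypothetical example (it works because each column of data.frame(t(x)) is one row of x, and match() compares list elements by their character representation):

x <- matrix(c(1, 2,
              3, 4,
              1, 2), ncol = 2, byrow = TRUE)
y <- unique(x)   # the distinct rows of x

# Each row of x is matched against the rows of y
match(data.frame(t(round(x, 10))), data.frame(t(round(y, 10))))
# [1] 1 2 1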
I've finally managed to beat unique() and match() by hand-coding the algorithm in C++ with Rcpp, using a std::unordered_map as the core bookkeeping data structure.
Here is the source code; it can be used from R by writing it to a file and running Rcpp::sourceCpp() on it.
#include <Rcpp.h>
using namespace Rcpp;

template <int T>
List uniquify_impl(Vector<T> x) {
  IntegerVector idxes(x.length());
  typedef typename Rcpp::traits::storage_type<T>::type storage_t;
  std::unordered_map<storage_t, int> unique_map;
  int n_unique = 0;

  // 1. Single pass over x: assign each distinct value an id in order of first appearance
  for (int i = 0; i < x.length(); i++) {
    storage_t curr = x[i];
    int idx = unique_map[curr];
    if (idx == 0) {
      unique_map[curr] = ++n_unique;
      idx = n_unique;
    }
    idxes[i] = idx;
  }

  // 2. Write the unique values back out in the order their ids were assigned
  Vector<T> uniques(unique_map.size());
  for (auto &pair : unique_map) {
    uniques[pair.second - 1] = pair.first;
  }

  return List::create(
    _["u"] = uniques,
    _["i"] = idxes
  );
}

// [[Rcpp::export]]
List uniquify(RObject x) {
  switch (TYPEOF(x)) {
    case INTSXP: {
      return uniquify_impl(as<IntegerVector>(x));
    }
    case REALSXP: {
      return uniquify_impl(as<NumericVector>(x));
    }
    case STRSXP: {
      return uniquify_impl(as<CharacterVector>(x));
    }
    default: {
      warning(
        "Invalid SEXPTYPE %d (%s).\n",
        TYPEOF(x), type2name(x)
      );
      return R_NilValue;
    }
  }
}
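A brief usage sketch, assuming the C++ above has been saved to a file (say uniquify.cpp) and compiled with Rcpp::sourceCpp():

# Rcpp::sourceCpp("uniquify.cpp")

x <- c("b", "a", "b", "c", "a")
res <- uniquify(x)
res$u                        # "b" "a" "c"  (unique values, in order of first appearance)
res$i                        # 1 2 1 3 2    (index of each element into res$u)
identical(res$u[res$i], x)   # TRUE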

Double sampling method in R

My initial code for double sampling is the following. I did only one sample.
# Data
samples <- matrix(NA, nrow = 12, ncol = 2000)
for (i in 1:12) {
  samples[i, ] <- rbinom(2000, 1, prob = 0.05)
}
# Double Sampling Plan
accept <- rep(0, 12)
for (i in 1:12) {
  if (sum(samples[i, 1:80]) <= 5) {
    accept[i] <- 1
  } else if (sum(samples[i, 1:80]) <= 8 & sum(samples[i, 1:80]) > 5) {
    if (sum(samples[i, 1:160]) <= 12) {
      accept[i] <- 1
    }
  }
}
sum(accept)
Since I generate the data randomly from a Bernoulli distribution, the results will differ every time the code is run.
I want 100 repetitions of this double sampling.
My solution:
nm <- double(100)
for (j in 1:100) {
  # Data
  samples <- matrix(NA, nrow = 12, ncol = 2000)
  for (i in 1:12) {
    samples[i, ] <- rbinom(2000, 1, prob = 0.05)
  }
  # Double Sampling Plan
  accept <- rep(0, 12)
  for (i in 1:12) {
    if (sum(samples[i, 1:80]) <= 5) {
      accept[i] <- 1
    } else if (sum(samples[i, 1:80]) <= 8 & sum(samples[i, 1:80]) > 5) {
      if (sum(samples[i, 1:160]) <= 12) {
        accept[i] <- 1
      }
    }
  }
  nm[j] <- sum(accept)
}
mean(nm)
What do you think?
If we follow @Onyambu's suggestion, we can embed one simulation inside a function and call it in a loop like this:
one_double_sampling <- function() {
  # Data
  samples <- matrix(NA, nrow = 12, ncol = 2000)
  for (i in 1:12) {
    samples[i, ] <- rbinom(2000, 1, prob = 0.05)
  }
  # Double Sampling Plan
  accept <- rep(0, 12)
  for (i in 1:12) {
    if (sum(samples[i, 1:80]) <= 5) {
      accept[i] <- 1
    } else if (sum(samples[i, 1:80]) <= 8 & sum(samples[i, 1:80]) > 5) {
      if (sum(samples[i, 1:160]) <= 12) {
        accept[i] <- 1
      }
    }
  }
  return(sum(accept))
}
set.seed(123)
# number of simulations
n <- 100
# store the results
res <- rep(0, n)
for (i in 1:n) {
  res[i] <- one_double_sampling()
}
# mean acceptance count over the simulations
mean(res)
Definitely, your code is correct. For people interested in the double sampling method, I advise you to see this.
Edit 1
In one line, based on @Onyambu's advice:
mean(replicate(n, one_double_sampling()))
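As a further optional refactor (not part of the original answer), the defect counts in each sample can be drawn directly with rbinom(), which avoids simulating 2000 Bernoulli trials per lot; the sample sizes (80, then another 80) and acceptance numbers (5, 8, 12) are taken from the code above:

double_sampling_fast <- function(n_lots = 12, p = 0.05) {
  d1 <- rbinom(n_lots, size = 80, prob = p)  # defects in the first sample of 80
  d2 <- rbinom(n_lots, size = 80, prob = p)  # defects in the additional 80 items
  accept_first  <- d1 <= 5
  accept_second <- d1 > 5 & d1 <= 8 & (d1 + d2) <= 12
  sum(accept_first | accept_second)
}

set.seed(123)
mean(replicate(100, double_sampling_fast()))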

Manipulate the full row of a tibble with 2 loops

First, this is a homework question.
It's easy to manipulate the full row with:
testMan[2,] = apply(testMan[2,], 2, function(x) 100)
But we have to do this in a loop and it must be a function with a parameter.
manipulateRow = function(rowNumber) {
  i = 1
  for (row in testMan) {
    # print(i)
    if (i == rowNumber) {
      for (price in row) {
        price = 100
      }
      break
    }
    i = i + 1
  }
}
test = manipulateRow(2)
The goal is to replace the entire 2nd row with the value 100.
There are more than 600 columns, so we have to do it with a loop.
We are working with tibble and the tidyverse.
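No answer is reproduced here, but a minimal sketch under the stated constraints might look like the following; testMan, the column types, and the exact function signature are assumptions (the assignment's required signature may differ). The function loops over the 600+ columns and overwrites one cell per iteration:

library(tibble)

# Hypothetical example tibble standing in for testMan (numeric columns assumed)
testMan <- as_tibble(as.data.frame(matrix(rnorm(5 * 600), nrow = 5)))

manipulateRow <- function(data, rowNumber, value = 100) {
  for (colName in names(data)) {        # loop over all columns
    data[rowNumber, colName] <- value   # overwrite one cell at a time
  }
  data
}

testMan <- manipulateRow(testMan, 2)
all(testMan[2, ] == 100)   # TRUE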

R function keeps returning NULL

I am having a very odd problem in R. The task was to write functions for global and semi-global alignment. The algorithms themselves work, in the sense that they are able to "print out" the correct alignment. However, "returning" the alignment seems to be a problem for the semi-global algorithm.
Below are the functions for both alignments; each consists of two functions, one computing the score matrix and one producing the alignment output. As you can see, the output function for the semi-global case was modelled on the global one, but although it is able to print the values of A and B, it returns NULL when they are returned.
It also came to my attention that when A and B are defined they contain a NULL part, as seen by printing the structures of A and B at the end. This is also the case in the global alignment, but it does not seem to be a problem there.
Global Alignment Algorithm
########### GLOBAL ALIGNMENT ALGORITHM ############
GA_score = function(v, w, score.gap = -3, score.match = 8, score.mismatch = -5) {
  v = strsplit(v, split = "")[[1]]
  w = strsplit(w, split = "")[[1]]
  S = matrix(0, nrow = (length(v) + 1), ncol = (length(w) + 1))
  S[1, 1] = 0
  for (j in 2:dim(S)[2]) {
    S[1, j] = score.gap * (j - 1)
  }
  for (i in 2:dim(S)[1]) {
    S[i, 1] = score.gap * (i - 1)
    for (j in 2:dim(S)[2]) {
      if (v[i - 1] == w[j - 1]) {diag = S[i - 1, j - 1] + score.match} else {diag = S[i - 1, j - 1] + score.mismatch}
      down = S[i - 1, j] + score.gap
      right = S[i, j - 1] + score.gap
      S[i, j] = max(diag, down, right)
    }
  }
  return(S)
}
GA_output = function(v, w, S, score.gap = -3, score.match = 8, score.mismatch = -5) {
  v = strsplit(v, split = "")[[1]]
  w = strsplit(w, split = "")[[1]]
  A = c()
  B = c()
  GA_rec = function(A, B, S, i, j, v, w, score.gap, score.match, score.mismatch) {
    if (i == 1 | j == 1) {
      if (i > 1) {
        for (i1 in seq(i - 1, 1, -1)) {
          A = c(v[i1], A)
          B = c("-", B)
        }
      }
      if (j > 1) {
        for (j1 in seq(j - 1, 1, -1)) {
          A = c("-", A)
          B = c(w[j1], B)
        }
      }
      return(list(v = A, w = B))
    }
    if (v[i - 1] == w[j - 1]) {diag = score.match} else {diag = score.mismatch}
    if (S[i, j] == (S[i - 1, j - 1] + diag)) {
      A.temp = c(v[i - 1], A)
      B.temp = c(w[j - 1], B)
      GA_rec(A.temp, B.temp, S, i - 1, j - 1, v, w, score.gap, score.match, score.mismatch)
    } else if (S[i, j] == (S[i - 1, j] + score.gap)) {
      A.temp <- c(v[i - 1], A)
      B.temp <- c("-", B)
      GA_rec(A.temp, B.temp, S, i - 1, j, v, w, score.gap, score.match, score.mismatch)
    } else {
      A.temp = c("-", A)
      B.temp = c(w[j - 1], B)
      GA_rec(A.temp, B.temp, S, i, j - 1, v, w, score.gap, score.match, score.mismatch)
    }
  }
  return(GA_rec(A, B, S, length(v) + 1, length(w) + 1, v, w, score.gap, score.match, score.mismatch))
}
Semi-Global Alignment Algorithm
########### SEMI-GLOBAL ALIGNMENT ALGORITHM ############
SGA_score = function(sequence1, sequence2, score.gap = -1, score.match = 1, score.mismatch = -1) {
  v = sequence2
  w = sequence1
  v = strsplit(v, split = "")[[1]]
  w = strsplit(w, split = "")[[1]]
  S = matrix(0, nrow = length(v) + 1, ncol = length(w) + 1)
  for (i in 1:(length(w) + 1)) {
    for (j in 1:(length(v) + 1)) {
      if (i == 1 | j == 1) {
        S[i, j] = 0
      } else {
        if ((i == length(w) + 1) | (j == length(v) + 1)) {
          from.top = S[i, j - 1]
          from.left = S[i - 1, j]
        } else {
          from.top = max(S[i, j - 1] + score.gap)  # Max is artifact from max(0, ...)
          from.left = max(S[i - 1, j] + score.gap)
        }
        if (w[i - 1] == v[j - 1]) {
          from.diag = S[i - 1, j - 1] + score.match
        } else {
          from.diag = S[i - 1, j - 1] + score.mismatch
        }
        S[i, j] = max(from.top, from.left, from.diag)
      }
    }
  }
  return(S)
}
SGA_output = function(v, w, S, score.gap = -1, score.match = 1, score.mismatch = -1) {
  v = strsplit(v, split = "")[[1]]
  w = strsplit(w, split = "")[[1]]
  A = c()
  B = c()
  print(str(A))
  print(str(B))
  SGA_rec = function(A, B, S, i, j, v, w, score.gap, score.match, score.mismatch) {
    if (i == 1 | j == 1) {
      if (i > 1) {
        for (i1 in seq(i - 1, 1, -1)) {
          A = c(v[i1], A)
          B = c("-", B)
        }
      }
      if (j > 1) {
        for (j1 in seq(j - 1, 1, -1)) {
          A = c("-", A)
          B = c(w[j1], B)
        }
      }
      print(A)
      print(B)
      out = list(v = A, w = B)
      # print(out)
      print(str(A))
      print(str(B))
      print(str(out))
      return(out)
    }
    if (v[i - 1] == w[j - 1]) {diag = score.match} else {diag = score.mismatch}
    if (S[i, j] == (S[i - 1, j - 1] + diag)) {
      A.temp = c(v[i - 1], A)
      B.temp = c(w[j - 1], B)
      SGA_rec(A.temp, B.temp, S, i - 1, j - 1, v, w, score.gap, score.match, score.mismatch)
    }
    #####
    if (j == length(w) + 1) {  # Are we in the last row?
      score.temp = score.gap
      score.gap = 0
    } else {
      score.temp = score.gap
    }
    if (S[i, j] == (S[i - 1, j] + score.gap)) {
      A.temp <- c(v[i - 1], A)
      B.temp <- c("-", B)
      score.gap = score.temp
      SGA_rec(A.temp, B.temp, S, i - 1, j, v, w, score.gap, score.match, score.mismatch)
    }
    score.gap = score.temp
    ####
    if (i == length(v) + 1) {
      score.temp = score.gap
      score.gap = 0
    } else {
      score.temp = score.gap
    }
    if (S[i, j] == (S[i, j - 1] + score.gap)) {
      A.temp = c("-", A)
      B.temp = c(w[j - 1], B)
      score.gap = score.temp
      SGA_rec(A.temp, B.temp, S, i, j - 1, v, w, score.gap, score.match, score.mismatch)
    }
  }
  return(SGA_rec(A, B, S, length(v) + 1, length(w) + 1, v, w, score.gap, score.match, score.mismatch))
}
S1 = SGA_score("ACGTCAT","TCATGCA")
S1
align = SGA_output("ACGTCAT","TCATGCA",S1)
align
I am surprised that the global alignment works but the semi-global one doesn't, even though they both have this NULL part (can someone explain what it is? Does it have something to do with internal objects in a function?) and the semi-global version knows what A and B are.
Any help is greatly appreciated!
SGA_rec seems to be missing a return value. You need an else { return(<something>) } after the last if.
Illustration:
fun <- function() if (FALSE) 1
x <- fun()
x
#NULL
Read help("NULL") to learn what it means.

Resources