I want to reduce time and memory usage (I previously used outer for this but it consumes more memory than I have) by reducing the iterations to create a symmetric matrix, that is sol[i, j] is the same as sol[j, i].
My code so far:
# Prepare input
subss <- list(a = c(1, 2, 4), b = c(1, 2, 3), c = c(4, 5))
A <- matrix(runif(25), ncol = 5, nrow = 5)
# Pre allocate memory
sol <- matrix(nrow = length(subss), ncol = length(subss),
dimnames = list(names(subss), names(subss)))
x <- 0
for (i in seq_along(subss)) {
# Omit for the subsets I already calculated ?
for (j in seq_along(subss)) {
x <- x + 1
message(x)
# The function I use here might result in a NA
sol[i, j] <- mean(A[subss[[i]], subss[[j]]])
sol[j, i] <- sol[i, j] # Will overwrite when it shouldn't
}
}
Will use 9 iterations, how can I avoid them and do just 6 iterations?
I need to calculate the symmetric values, so this question doesn't apply. Also this other one doesn't work either because there might be many combinations and at some point it can't allocate the vector in memory.
A for loop will usually be slower than outer. Try byte-compiling the loop or implement it in Rcpp.
subss <- list(a = c(1, 2, 4), b = c(1, 2, 3), c = c(4, 5))
set.seed(42)
A <- matrix(runif(25), ncol = 5, nrow = 5)
#all combinations of indices
ij <- combn(seq_along(subss), 2)
#add all i = j
ij <- matrix(c(ij, rep(seq_along(subss), each = 2)), nrow = 2)
#preallocate
res <- numeric(ncol(ij))
#only one loop
for (k in seq_len(ncol(ij))) {
message(k)
res[k] <- mean(A[subss[[ij[1, k]]], subss[[ij[2, k]]]])
}
#1
#2
#3
#4
#5
#6
#create symmetric sparse matrix
library(Matrix)
sol <- sparseMatrix(i = ij[1,], j = ij[2,],
x = res, dims = rep(length(subss), 2),
symmetric = TRUE, index1 = TRUE)
#3 x 3 sparse Matrix of class "dsCMatrix"
#
#[1,] 0.7764715 0.6696987 0.7304413
#[2,] 0.6696987 0.6266553 0.6778936
#[3,] 0.7304413 0.6778936 0.5161089
I found a way with plain for loops:
x <- 0
for (i in seq_along(subss)) {
for (j in seq_len(i)) { # or for (j in 1:i) as proposed below
x <- x + 1
message(x)
sol[i, j] <- mean(A[subss[[i]], subss[[j]]])
sol[j, i] <- sol[i, j]
}
}
for (i in 1:length(subss)) {
for (j in 1:i) {
message(i, ' ', j, ' - ', mean(A[subss[[i]], subss[[j]]]) ) # Check iterations and value
sol2[i, j] <- sol2[j, i] <- mean(A[subss[[i]], subss[[j]]])
}
}
I checked your script values and aren't symmetric:
1 1 - 0.635455905252861
1 2 - 0.638608284398086
1 3 - 0.488700995299344
2 1 - 0.568414432255344
2 2 - 0.602851431118324
2 3 - 0.516099992596234
3 1 - 0.595461705311512
3 2 - 0.656920690399905
3 3 - 0.460815121419728
Mine values (same as #Llopis):
1 2 - 0.638608284398086
1 3 - 0.488700995299344
2 2 - 0.602851431118324
2 3 - 0.516099992596234
3 2 - 0.656920690399905
3 3 - 0.460815121419728
Related
In an earlier question (R: Logical Conditions Not Being Respected), I learned how to make the following simulation :
Step 1: Keep generating two random numbers "a" and "b" until both "a" and "b" are greater than 12
Step 2: Track how many random numbers had to be generated until it took for Step 1 to be completed
Step 3: Repeat Step 1 and Step 2 100 times
res <- matrix(0, nrow = 0, ncol = 3)
for (j in 1:100){
a <- rnorm(1, 10, 1)
b <- rnorm(1, 10, 1)
i <- 1
while(a < 12 | b < 12) {
a <- rnorm(1, 10, 1)
b <- rnorm(1, 10, 1)
i <- i + 1
}
x <- c(a,b,i)
res <- rbind(res, x)
}
head(res)
[,1] [,2] [,3]
x 12.14232 12.08977 399
x 12.27158 12.01319 1695
x 12.57345 12.42135 302
x 12.07494 12.64841 600
x 12.03210 12.07949 82
x 12.34006 12.00365 782
Question: Now, I am trying to make a slight modification to the above code - Instead of "a" and "b" being produced separately, I want them to be produced "together" (in math terms: "a" and "b" were being produced from two independent univariate normal distributions, now I want them to come from a bivariate normal distribution).
I tried to modify this code myself:
library(MASS)
Sigma = matrix(
c(1,0.5, 0.5, 1), # the data elements
nrow=2, # number of rows
ncol=2, # number of columns
byrow = TRUE) # fill matrix by rows
res <- matrix(0, nrow = 0, ncol = 3)
for (j in 1:100){
e_i = data.frame(mvrnorm(n = 1, c(10,10), Sigma))
e_i$i <- 1
while(e_i$X1 < 12 | e_i$X2 < 12) {
e_i = data.frame(mvrnorm(n = 1, c(10,10), Sigma))
e_i$i <- i + 1
}
x <- c(e_i$X1, e_i$X2 ,i)
res <- rbind(res, x)
}
res = data.frame(res)
But this is producing the following error:
Error in while (e_i$X1 < 12 | e_i$X2 < 12) { : argument is of length
zero
If I understand your code correctly you are trying to see how many samples occur before both values are >=12 and doing that for 100 trials? This is the approach I would take:
library(MASS)
for(i in 1:100){
n <- 1
while(any((x <- mvrnorm(1, mu=c(10,10), Sigma=diag(0.5, nrow=2)+0.5))<12)) n <- n+1
if(i==1) res <- data.frame("a"=x[1], "b"=x[2], n)
else res <- rbind(res, data.frame("a"=x[1], "b"=x[2], n))
}
Here I am assigning the results of a mvrnorm to x within the while() call. In that same call, it evaluates whether either are less than 12 using the any() function. If that evaluates to FALSE, n (the counter) is increased and the process repeated. Once TRUE, the values are appended to your data.frame and it goes back to the start of the for-loop.
Regarding your code, the mvrnorm() function is returning a vector, not a matrix, when n=1 so both values go into a single variable in the data.frame:
data.frame(mvrnorm(n = 1, c(10,10), Sigma))
Returns:
mvrnorm.n...1..c.10..10...Sigma.
1 9.148089
2 10.605546
The matrix() function within your data.frame() calls, along with some tweaks to your use of i, will fix your code:
library(MASS)
Sigma = matrix(
c(1,0.5, 0.5, 1), # the data elements
nrow=2, # number of rows
ncol=2, # number of columns
byrow = TRUE) # fill matrix by rows
res <- matrix(0, nrow = 0, ncol = 3)
for (j in 1:10){
e_i = data.frame(matrix(mvrnorm(n = 1, c(10,10), Sigma), ncol=2))
i <- 1
while(e_i$X1[1] < 12 | e_i$X2[1] < 12) {
e_i = data.frame(matrix(mvrnorm(n = 1, c(10,10), Sigma), ncol=2))
i <- i + 1
}
x <- c(e_i$X1, e_i$X2 ,i)
res <- rbind(res, x)
}
res = data.frame(res)
Nh<-matrix(c(17,26,30,17,23, 17 ,24, 23), nrow=2, ncol=4); Nh
Sh<-matrix(c(8.290133, 6.241174, 6.096808, 7.4449672, 6.894924, 7.692115,
4.540521, 7.409122), nrow=2, ncol=4); Sh
NhSh<-as.matrix(Nh*Sh); NhSh
rh<-c( 0.70710678, 0.40824829, 0.28867513, 0.22360680, 0.18257419,
0.15430335, 0.13363062, 0.11785113, 0.10540926, 0.09534626); rh
pv <- c()
for (j in 1:2) {
for (i in 1:4) {
pv <- rbind(pv, NhSh[j,i]*rh)
}
}
pv
row.names(pv) <- rep(c(1:2), each = 4)
lst<-lapply(split(seq_len(nrow(pv)), as.numeric(row.names(pv))), function(i)
pv[i,])
data<-40
nlargest <- function(x, data)
{
res <- order(x)[seq_len(data)];
pos <- arrayInd(res, dim(x), useNames = TRUE);
list(values = pv[res], position = pos)
}
out <- lapply(lst, nlargest, data = 40)
In continuation of above code Is there any brief way of repeating the following steps for each out$âkâ$position for k in 1:2?
s1<-c(1,1,1,1); ch<-c(5,7,10,5); C<-150; a<-out$'1'$position
for (j in a[40:1, "row"] )
{
s1[j] <- s1[j]+1;
cost1 <- sum(ch*s1);
if (cost1>=C) break
}
s1; cost1
#Output [1] 5 6 6 5
# [1] 152
I have to get 2 values for 's' and 'cost' for out$k$position. I tried
mat = replicate (2,{x = matrix(data = rep(NA, 80), ncol = 2)}); mat
for (k in 1:2)
{
mat[,,k]<-out$'k'$position
}
mat
Error in mat[, , k] <- out$k$position :number of items to replace is not a multiple of replacement length
for (k in 1:2)
{
for (j in mat[,,k][40:1] ) {
s[j] <- s[j]+1
cost <- sum(ch*s)
if (cost>=C) break
}
}
s; cost
Error : Error in s[j] <- s[j] + 1 : NAs are not allowed in subscripted assignments
Please anyone help in resolving these errors.
We could apply the function directly by looping over the list. Note that each element of the list is a matrix
sapply(lst, is.matrix)
# 1 2
#TRUE TRUE
so, there is no need to unlist and create a matrix
out <- lapply(lst, nlargest, data = 40)
-checking with the OP's results
out1 <- nlargest(sub1, 40)
identical(out[[1]], out1)
#[1] TRUE
Update2
Based on the second update, we need to initialize 'cost' and 'sl' with the same length as 'k' elements. Here, we initialize 'sl' as a list of vectors
sl <- rep(list(c(1, 1, 1, 1)), 2)
C <- 150
cost <- numeric(2)
for (k in 1:2){
for (j in mat[,,k][40:1, 1] ) {
sl[[k]][j] <- sl[[k]][j]+1
cost[k] <- sum(ch*sl[[k]])
if (cost[k] >=C) break
}
}
sl
#[[1]]
#[1] 5 7 6 4
#[[2]]
#[1] 6 5 5 7
cost
#[1] 154 150
I have two data frames of same number of columns (but not rows) df1 and df2. For each row in df2, I was able to find the best (and second best) matching rows from df1 in terms of hamming distance, in my previous post. In that post, we have been using the following example data:
set.seed(0)
df1 <- as.data.frame(matrix(sample(1:10), ncol = 2)) ## 5 rows 2 cols
df2 <- as.data.frame(matrix(sample(1:6), ncol = 2)) ## 3 rows 2 cols
I now need to compute the number of bits equal to 1 for:
each row in df2
the best matching rows in df1
the second matching rows in df1
The number of bits equal to 1 of an integer a maybe computed as
sum(as.integer(intToBits(a)))
And I have applied this to #ZheyuanLi's original function, so I have got item 1>. However I'm unable to apply the same logic to get item 2> and 3>, by simple modification of #ZheyuanLi's function.
Below are the functions from #ZheyuanLi's with modification:
hmd <- function(x,y) {
rawx <- intToBits(x)
rawy <- intToBits(y)
nx <- length(rawx)
ny <- length(rawy)
if (nx == ny) {
## quick return
return (sum(as.logical(xor(rawx,rawy))))
} else if (nx < ny) {
## pivoting
tmp <- rawx; rawx <- rawy; rawy <- tmp
tmp <- nx; nx <- ny; ny <- tmp
}
if (nx %% ny) stop("unconformable length!") else {
nc <- nx / ny ## number of cycles
return(unname(tapply(as.logical(xor(rawx,rawy)), rep(1:nc, each=ny), sum)))
}
}
foo <- function(df1, df2, p = 2) {
## check p
if (p > nrow(df2)) p <- nrow(df2)
## transpose for CPU cache friendly code
xt <- t(as.matrix(df1))
yt <- t(as.matrix(df2))
## after transpose, we compute hamming distance column by column
## a for loop is decent; no performance gain from apply family
n <- ncol(yt)
id <- integer(n * p)
d <- numeric(n * p)
sb <- integer(n)
k <- 1:p
for (i in 1:n) {
set.bits <- sum(as.integer(intToBits(yt[,i])))
distance <- hmd(xt, yt[,i])
minp <- order(distance)[1:p]
id[k] <- minp
d[k] <- distance[minp]
sb[i] <- set.bits
k <- k + p
}
## recode "id", "d" and "sb" into data frame and return
id <- as.data.frame(matrix(id, ncol = p, byrow = TRUE))
colnames(id) <- paste0("min.", 1:p)
d <- as.data.frame(matrix(d, ncol = p, byrow = TRUE))
colnames(d) <- paste0("mindist.", 1:p)
sb <- as.data.frame(matrix(sb, ncol = 1)) ## no need for byrow as you have only 1 column
colnames(sb) <- "set.bits.1"
list(id = id, d = d, sb = sb)
}
Running these gives:
> foo(df1, df2)
$id
min1 min2 ## row id for best/second best match in df1
1 1 4
2 2 3
3 5 2
$d
mindist.1 mindist.2 ## minimum 2 hamming distance
1 2 2
2 1 3
3 1 3
$sb
set.bits.1 ## number of bits equal to 1 for each row of df2
1 3
2 2
3 4
OK, after reading through while re-editing your question (many times!), I think I know what you want. Essentially we need change nothing to hmd(). Your required items 1>, 2>, 3> can all be computed after the for loop in foo().
To get item 1>, which you called sb, we can use a tapply(). However, your computation of sb along the for loop is fine, so I will not change it. In the following, I will demonstrate the basic procedure to get item 2> and item 3>.
The id vector inside foo() stores all matching rows in df1:
id <- c(1, 4, 2, 3, 5, 2)
so we can simply extract those rows of df1 (actually, columns of xt), to compute the number of bits equal to 1. As you can see, there are lots of duplicity in id, so we can only computes on unique(id):
id0 <- sort(unique(id))
## [1] 1 2 3 4 5
We now extract those subset columns of xt:
sub_xt <- xt[, id0]
## [,1] [,2] [,3] [,4] [,5]
## V1 9 3 10 5 6
## V2 2 4 8 7 1
To compute the number of bits equal to 1 for each column of sub_xt, we again use tapply() and vectorized approach.
rawbits <- as.integer(intToBits(as.numeric(sub_xt))) ## convert sub_xt to binary
sbxt0 <- unname(tapply(X = rawbits,
INDEX = rep(1:length(id0), each = length(rawbits) / length(id0)),
FUN = sum))
## [1] 3 3 3 5 3
Now we need to map sbxt0 to sbxt:
sbxt <- sbxt0[match(id, id0)]
## [1] 3 5 3 3 3 3
Then we can convert sbxt to a data frame sb1:
sb1 <- as.data.frame(matrix(sbxt, ncol = p, byrow = TRUE))
colnames(sb1) <- paste(paste0("min.", 1:p), "set.bits.1", sep = ".")
## min.1.set.bits.1 min.2.set.bits.1
## 1 3 5
## 2 3 3
## 3 3 3
Finally we can assemble these things up:
foo <- function(df1, df2, p = 2) {
## check p
if (p > nrow(df2)) p <- nrow(df2)
## transpose for CPU cache friendly code
xt <- t(as.matrix(df1))
yt <- t(as.matrix(df2))
## after transpose, we compute hamming distance column by column
## a for loop is decent; no performance gain from apply family
n <- ncol(yt)
id <- integer(n * p)
d <- numeric(n * p)
sb2 <- integer(n)
k <- 1:p
for (i in 1:n) {
set.bits <- sum(as.integer(intToBits(yt[,i])))
distance <- hmd(xt, yt[,i])
minp <- order(distance)[1:p]
id[k] <- minp
d[k] <- distance[minp]
sb2[i] <- set.bits
k <- k + p
}
## compute "sb1"
id0 <- sort(unique(id))
sub_xt <- xt[, id0]
rawbits <- as.integer(intToBits(as.numeric(sub_xt))) ## convert sub_xt to binary
sbxt0 <- unname(tapply(X = rawbits,
INDEX = rep(1:length(id0), each = length(rawbits) / length(id0)),
FUN = sum))
sbxt <- sbxt0[match(id, id0)]
sb1 <- as.data.frame(matrix(sbxt, ncol = p, byrow = TRUE))
colnames(sb1) <- paste(paste0("min.", 1:p), "set.bits.1", sep = ".")
## recode "id", "d" and "sb2" into data frame and return
id <- as.data.frame(matrix(id, ncol = p, byrow = TRUE))
colnames(id) <- paste0("min.", 1:p)
d <- as.data.frame(matrix(d, ncol = p, byrow = TRUE))
colnames(d) <- paste0("mindist.", 1:p)
sb2 <- as.data.frame(matrix(sb2, ncol = 1)) ## no need for byrow as you have only 1 column
colnames(sb2) <- "set.bits.1"
list(id = id, d = d, sb1 = sb1, sb2 = sb2)
}
Now, running foo(df1, df2) gives:
> foo(df1,df2)
$id
min.1 min.2
1 1 4
2 2 3
3 5 2
$d
mindist.1 mindist.2
1 2 2
2 1 3
3 1 3
$sb1
min.1.set.bits.1 min.2.set.bits.1
1 3 5
2 3 3
3 3 3
$sb2
set.bits.1
1 3
2 2
3 4
Note that I have renamed the sb you used to sb2.
Let's say I have a matrix that looks like this, and I convert it into a dist class object (without diagonal), and then into a vector for later purposes.
m = matrix(c(0,1,2,3, 1,0,3,4, 2,3,0,5, 3,4,5,0), nrow=4)
#m:
[,1] [,2] [,3] [,4]
[1,] 0 1 2 3
[2,] 1 0 3 4
[3,] 2 3 0 5
[4,] 3 4 5 0
md = as.dist(m, diag=F)
# md:
1 2 3
2 1
3 2 3
4 3 4 5
mdv = as.vector(md)
# 1 2 3 3 4 5
I can access the original matrix as usual with [], and I could easily access the one-dimensional index (of, for example row 3, col 2) using m[ 3+((2-1)*4) ]. The dist object (and the vector) is one-dimensional, but composes only of the lower triangle of the original matrix (and also lacks one element from each original col/row, since the diagonal was removed).
How can I later access the equivalent element in the vector mdv? So e.g. how could I access the equivalent of m[3,2] (value 3) in the object mdv? (Not by the value, since there can be duplicate values, but by the index) Related Q&A resolve similar problems with as.matrix on the dist object, but that doesn't do it for me (since I need to deal with the vector).
Having the lower.tri(, diag = FALSE) distances-vector ("mdv") you could (1) find the respective dimensions of the distances-matrix ("m") and (2) convert the i + (j - 1)*nrow indices accordingly by subtracting the equivalent missing "upper.tri".
ff = function(x, i, j)
{
#assumes that 'x' is a valid distances vector that results in correct 'n'
n = (1 + sqrt(1 + 8 * length(x))) / 2
#make sure i >= j
ii = pmax(i, j); jj = pmin(i, j)
#insert 0s to handle 'i == j'
x = c(unlist(lapply(split(x, rep(seq_len(n - 1), (n - 1):1)),
function(X) c(0, X)), FALSE, FALSE), 0)
#subtract the missing `upper.tri` elements
x[(ii + (jj - 1L) * n) - cumsum(0:(n - 1))[jj]]
}
E.g.:
n = 3
m = matrix(0, n, n); m[lower.tri(m)] = runif(choose(n, 2)); m = m + t(m); x = c(as.dist(m))
m
# [,1] [,2] [,3]
#[1,] 0.0000000 0.3796833 0.5199015
#[2,] 0.3796833 0.0000000 0.4770344
#[3,] 0.5199015 0.4770344 0.0000000
m[cbind(c(2, 2, 3, 1), c(3, 2, 1, 2))]
#[1] 0.4770344 0.0000000 0.5199015 0.3796833
ff(x, c(2, 2, 3, 1), c(3, 2, 1, 2))
#[1] 0.4770344 0.0000000 0.5199015 0.3796833
n = 23
m = matrix(0, n, n); m[lower.tri(m)] = runif(choose(n, 2)); m = m + t(m); x = c(as.dist(m))
i = sample(seq_len(n), 25, TRUE); j = sample(seq_len(n), 25, TRUE)
all.equal(m[cbind(i, j)], ff(x, i, j))
#[1] TRUE
etc...
How about this function:
fun <- function(r, c){
stopifnot(r != c)
if(r > c) (r-2)*(r-1)/2 + c
else (c-2)*(c-1)/2 + r
}
mdv[fun(1, 2)] # 1
mdv[fun(2, 3)] # 3
mdv[fun(3, 4)] # 5
mdv[fun(2, 1)] # 1
mdv[fun(3, 2)] # 3
mdv[fun(1, 1)] # stop
Cases with r == c should be handled before applying fun. For convenience, You can write another function for handling this case.
I wrote the following code, and I need to repeat this for 100 times, and I know I need to user another for loop, but I don't know how to do it. Here is the code:
mean <- c(5,5,10,10,5,5,5)
x <- NULL
u <- NULL
delta1 <- NULL
w1 <- NULL
for (i in 1:7 ) {
x[i] <- rexp(1, rate = mean[i])
u[i] <- (1/1.2)*runif(1, min=0, max=1)
y1 <- min(x,u)
if (y1 == min(x)) {
delta1 <- 1
}
else {
delta1 <- 0
}
if (delta1 == 0)
{
w1 <- NULL
}
else {
if(y1== x[[1]])
{
w1 <- "x1"
}
}
}
output <- cbind(delta1,w1)
output
I want the final output to be 100 rows* 3 columns matrix representing run number, delta1, and w1.
Any thought will be truly appreciated.
Here's what I gather you're trying to achieve from your code:
Given two vectors drawn from different distributions (Exponential and Uniform)
Find out which distribution the smallest number comes from
Repeat this 100 times.
Theres a couple of problems with your code if you want to achieve this, so here's a cleaned up example:
rates <- c(5, 5, 10, 10, 5, 5, 5) # 'mean' is an inbuilt function
# Initialise the output data frame:
output <- data.frame(number=rep(0, 100), delta1=rep(1, 100), w1=rep("x1", 100))
for (i in 1:100) {
# Generating u doesn't require a for loop. Additionally, can bring in
# the (1/1.2) out the front.
u <- runif(7, min=0, max=5/6)
# Generating x doesn't need a loop either. It's better to use apply functions
# when you can!
x <- sapply(rates, function(x) { rexp(1, rate=x) })
y1 <- min(x, u)
# Now we can store the output
output[i, "number"] <- y1
# Two things here:
# 1) use all.equal instead of == to compare floating point numbers
# 2) We initialised the data frame to assume they always came from x.
# So we only need to overwrite it where it comes from u.
if (isTRUE(all.equal(y1, min(u)))) {
output[i, "delta1"] <- 0
output[i, "w1"] <- NA # Can't use NULL in a character vector.
}
}
output
Here's an alternative, more efficient approach with replicate:
Mean <- c(5, 5, 10, 10, 5, 5, 5)
n <- 100 # number of runs
res <- t(replicate(n, {
x <- rexp(n = length(Mean), rate = Mean)
u <- runif(n = length(Mean), min = 0, max = 1/1.2)
mx <- min(x)
delta1 <- mx <= min(u)
w1 <- delta1 & mx == x[1]
c(delta1, w1)
}))
output <- data.frame(run = seq.int(n), delta1 = as.integer(res[ , 1]),
w1 = c(NA, "x1")[res[ , 2] + 1])
The result:
head(output)
# run delta1 w1
# 1 1 1 <NA>
# 2 2 1 <NA>
# 3 3 1 <NA>
# 4 4 1 x1
# 5 5 1 <NA>
# 6 6 0 <NA>