Taking into account your past answers, I've changed my code to the following:
n <- 100
B <- 20
S <- 50
alpha <- 0.3
beta <- 1.2
theta <- alpha*beta
for (i in 1:S) {
###
sim_original_samples <- rgamma(n, alpha, beta) # for each S, we have a sample of 100 observations
sim_original_samples_X_bar <- mean(sim_original_samples) # for each dataset, compute the sample mean and input it
sim_bs_samples_X_bar <- matrix(0,B,1)
# in the same loop we are going to compute the sample mean per bootstrap per original sample i
####
####
for (j in 1:B) {
sim_bs_samples <- sample(sim_original_samples,n,replace=TRUE)
# for each original sample, we are going to draw B times a bootstrap sample
sim_bs_samples_X_bar[j] <- mean(sim_bs_samples)
# all the elements of this matrix should be the bootstrap sample mean
var_sim_bs_samples <- matrix(0,B,1)
var_sim_bs_samples[j] <- (sim_bs_samples_X_bar[j] - sim_original_samples_X_bar)^2
se_sim_bs_samples <- sqrt((1/B*sum(var_sim_bs_samples)))
}
####
####
# now we want to compute the asymptotic CI of i)
z <- 1.96
var_gamma <- alpha*beta^2/n
CI_sim_asy_norm <- matrix(ncol = 3, nrow = S) # create a matrix for the CI
names <- c("Lower bound", "Upper bound", "theta covered")
colnames(CI_sim_asy_norm) <- names
#
CI_sim_asy_norm[i,1] <- theta - z*sqrt(var_gamma)
CI_sim_asy_norm[i,2] <- theta + z*sqrt(var_gamma)
CI_sim_asy_norm[i,3] <- theta >= CI_sim_asy_norm[i,1] & theta <= CI_sim_asy_norm[i,2]
# check whether the true parameter of interest is covered
####
####
# do the same for the asymptotic BS CI of ii)
CI_sim_asy_bs <- matrix(ncol = 3, nrow = S)
colnames(CI_sim_asy_bs) <- names
CI_sim_asy_bs[i,1] <- sim_original_samples_X_bar - z*se_sim_bs_samples
CI_sim_asy_bs[i,2] <- sim_original_samples_X_bar + z*se_sim_bs_samples
CI_sim_asy_bs[i,3] <- theta >= CI_sim_asy_bs[i,1] & theta <= CI_sim_asy_bs[i,2]
####
####
# do the same for the percentile BS CI of iii) assuming B = 1000 for simplicity
sim_bs_samples_X_bar_sorted <- sort(sim_bs_samples_X_bar, decreasing=FALSE)
CI_sim_percentile <- matrix(ncol = 3, nrow = S)
colnames(CI_sim_percentile) <- names
CI_sim_percentile[i,1] <- sim_bs_samples_X_bar_sorted[1000*(0.05/2)]
CI_sim_percentile[i,2] <- sim_bs_samples_X_bar_sorted[1000*((1-0.05)/2)]
CI_sim_percentile[i,3] <- theta >= CI_sim_percentile[i,1] & theta <= CI_sim_percentile[i,2]
####
}
The issue I have now is that only the last row of each CI matrix is filled (when it is filled at all), whereas all rows should be filled.
Where is the issue? I cannot see it.
That is, for each original sample i, I draw B bootstrap samples.
For each original sample i, I want to construct confidence intervals.
For each confidence interval, I want to know whether the true parameter (theta) is contained in it.
Hence, I'd have 50 confidence intervals.
The bootstrap CI is based on the estimates from the 20 bootstrap replications (per original sample).
Many thanks
The Question
Let's divide your question into two parts:
How to create the data: samples, re-samples, means, etc.
How to create a multi-dimensional object
1. Creating the data
Configuration from your question
S <- 5
n <- 100
B <- 2
alpha <- 0.3
beta <- 1.2
Sample and re-sample
require( tidyverse )
U <- map_dfc( 1:S, ~rgamma( n, alpha, beta ))
Ubar <- map_dfc( 1:S, ~mean( U[[ . ]] ))
V <- map_dfc( 1:S, ~sample( U[[ . ]], n, replace = TRUE ))
Vbar <- map_dfc( 1:S, ~mean( V[[ . ]] ))
What shape are these?
dim( U )
# 100 5
dim( V )
# 100 5
dim( Ubar )
# 1 5
dim( Vbar )
# 1 5
Now, what did you want to stack? (and why?)
2. How to create a multi-dimensional object
Sometimes it can be helpful to pack data into a multi-dimensional object, in order to facilitate slicing along axes, or to select specific elements.
Define the object
multi_dimensional <- array(
data = 0:( S * B * n - 1 )
, dim = c( S, B, n )
, dimnames = list( # <---- names are optional
paste0( 'X', 1:S )
, paste0( 'Y', 1:B )
, paste0( 'Z', 1:n )
)
)
dim( multi_dimensional )
# [1] 5 2 100
Slice, dice, and chop
multi_dimensional[ 1, 1, 1 ]
# [1] 0
multi_dimensional[ S, B, n ]
# [1] 999
multi_dimensional[ 1, 2, 1:10]
# Z1 Z2 Z3 Z4 Z5 Z6 Z7 Z8 Z9 Z10
# 5 15 25 35 45 55 65 75 85 95
multi_dimensional[ , , 1:2 ]
# , , Z1
#
# Y1 Y2
# X1 0 5
# X2 1 6
# X3 2 7
# X4 3 8
# X5 4 9
#
# , , Z2
#
# Y1 Y2
# X1 10 15
# X2 11 16
# X3 12 17
# X4 13 18
# X5 14 19
multi_dimensional[ S, B, 99:100 ]
# Z99 Z100
# 989 999
Use the named dimensions, if you wish
multi_dimensional[ , , c( 'Z1', 'Z2' ) ]
# , , Z1
#
# Y1 Y2
# X1 0 5
# X2 1 6
# X3 2 7
# X4 3 8
# X5 4 9
#
# , , Z2
#
# Y1 Y2
# X1 10 15
# X2 11 16
# X3 12 17
# X4 13 18
# X5 14 19
multi_dimensional[ c( 'X1', 'X3', 'X5' ), 'Y2' , c( 'Z1', 'Z2' ) ]
# Z1 Z2
# X1 5 15
# X3 7 17
# X5 9 19
Assign new values to specific elements
multi_dimensional[ 5, 1, 29:30 ] <- c( 124.76, -5.0002 )
Now show the new values
multi_dimensional[ 5, 1, 29:30 ]
# Z29 Z30
# 124.8 -5.0
multi_dimensional[ 1:3, , 91:100 ] # slice off a particular 3 x 10 block
# (not shown, due to size)
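To connect this back to the simulation itself (a sketch of my own, using the configuration above; boot_means is an illustrative name): an S x B matrix is enough to hold one bootstrap mean per original sample and per replication, filled inside the two loops:
boot_means <- matrix(NA_real_, nrow = S, ncol = B)
for (s in 1:S) {
  orig <- rgamma(n, alpha, beta)  # original sample s
  for (b in 1:B) {
    # bootstrap mean b for original sample s
    boot_means[s, b] <- mean(sample(orig, n, replace = TRUE))
  }
}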
Initial Answer
I think you misunderstood the purpose of the matrix datatype: in R, matrices can't store complex objects such as other matrices; they are limited to double/numeric, integer, logical, character, complex and raw.
Since you know the sizes of most of those data structures, you should declare them beforehand AND outside the loops.
What you seem to want is to store a list of B matrices of arbitrary size (1 by n) that are generated in the second loop. You can declare an empty list and add the matrices to it in the second loop with something like this:
#You should declare this outside the loops.
matrix_j <- vector(mode='list', length=B)
#Then on the inner loop you can use [[]] to add elements to a list
for (j in 1:B) {
matrix_j[[j]] <- sample(sim_original_samples,n,replace=TRUE)
Alternatively, if you want an empty list of length 0, you can do matrix_j <- list() instead.
Next, I didn't get whether you want to compute the mean of each sample inside the list or the mean of the whole set of numbers, so:
The first one would require you to use the list-apply function lapply, like this: lapply(matrix_j, mean), which returns a list in which each element is the mean of the element in the same position of matrix_j.
For the second possibility, I think it would be more appropriate to combine the list elements into one simpler data structure and then compute the mean.
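For instance (a minimal sketch of both options; the use of unlist is my own suggestion):
# Option 1: one mean per bootstrap sample in the list
per_sample_means <- lapply(matrix_j, mean)
# Option 2: flatten the list into one numeric vector, then take a single overall mean
overall_mean <- mean(unlist(matrix_j))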
For your last problem, it seems to me that using lists (lists of lists) would solve your issue.
I'd create a big empty list and then add other lists as elements, as lists are allowed to contain other lists; see the sketch below.
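A minimal sketch, assuming the outer loop runs over i in 1:S (the name results is illustrative):
# Declare the outer list once, before the loops
results <- vector(mode = 'list', length = S)
# Inside the outer loop: store the list of B bootstrap samples for sample i
results[[i]] <- matrix_j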
Answer to edits:
You are redeclaring lots of matrices inside loops. This is a bad practice as every time you do this, you're assigning the initial values to them, so don't do that if you want to keep data from previous iterations.
Consider this part of your code:
for (j in 1:B) {
sim_bs_samples <- sample(sim_original_samples,n,replace=TRUE)
# for each original sample, we are going to draw B times a bootstrap sample
sim_bs_samples_X_bar[j] <- mean(sim_bs_samples)
# all the elements of this matrix should be the bootstrap sample mean
var_sim_bs_samples <- matrix(0,B,1)
var_sim_bs_samples[j] <- (sim_bs_samples_X_bar[j] - sim_original_samples_X_bar)^2
se_sim_bs_samples <- sqrt((1/B*sum(var_sim_bs_samples)))
}
Every time the var_sim_bs_samples <- matrix(0,B,1) line runs inside the loop, it replaces the current matrix with a new matrix full of zeros, and then the following line assigns something to its j-th position.
This declaration shouldn't happen inside the loop, to avoid this behavior; hence why I told you to create a list and store each new matrix inside it OR move the declaration outside the loop and keep adding things to each line. So, to fix that, you could move the declaration outside the loop like I've done here:
var_sim_bs_samples <- matrix(0,B,1)
for (j in 1:B) {
sim_bs_samples <- sample(sim_original_samples,n,replace=TRUE)
# for each original sample, we are going to draw B times a bootstrap sample
sim_bs_samples_X_bar[j] <- mean(sim_bs_samples)
# all the elements of this matrix should be the bootstrap sample mean
var_sim_bs_samples[j] <- (sim_bs_samples_X_bar[j] - sim_original_samples_X_bar)^2
se_sim_bs_samples <- sqrt((1/B*sum(var_sim_bs_samples)))
}
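As a minor aside (my own suggestion, not strictly needed for the fix): se_sim_bs_samples only depends on the completed var_sim_bs_samples vector, so it can be computed once, after the loop:
var_sim_bs_samples <- matrix(0,B,1)
for (j in 1:B) {
  sim_bs_samples <- sample(sim_original_samples,n,replace=TRUE)
  sim_bs_samples_X_bar[j] <- mean(sim_bs_samples)
  var_sim_bs_samples[j] <- (sim_bs_samples_X_bar[j] - sim_original_samples_X_bar)^2
}
# the bootstrap standard error, computed once from all B squared deviations
se_sim_bs_samples <- sqrt(sum(var_sim_bs_samples)/B)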
The reason only the last row of the CI matrices is filled is that you're redeclaring them as empty matrices every time with <- matrix(ncol = 3, nrow = S). You're effectively emptying the matrix and then assigning something to the i-th row; the last iteration happens to fill the last row, since i goes from 1 to S.
Second Edit:
Just move the declarations outside of the loops where you're using them, like this:
n <- 100
B <- 20
S <- 50
alpha <- 0.3
beta <- 1.2
theta <- alpha*beta
CI_sim_asy_norm <- matrix(ncol = 3, nrow = S)
CI_sim_asy_bs <- matrix(ncol = 3, nrow = S)
CI_sim_percentile <- matrix(ncol = 3, nrow = S)
for (i in 1:S) {
sim_original_samples <- rgamma(n, alpha, beta)
sim_original_samples_X_bar <- mean(sim_original_samples)
sim_bs_samples_X_bar <- matrix(0,B,1)
var_sim_bs_samples <- matrix(0,B,1)
for (j in 1:B) {
sim_bs_samples <- sample(sim_original_samples,n,replace=TRUE)
Your CI_sim_percentile will still be full of NAs, because sim_bs_samples_X_bar_sorted has only B = 20 elements, so sim_bs_samples_X_bar_sorted[1000*(0.05/2)] (index 25) tries to access a position where there is no data, and R returns NA for it.
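One way to fix it (a sketch of my own, using base R's quantile rather than manual indexing) is to take the percentile bounds directly from the B bootstrap means:
# 2.5% and 97.5% percentiles of the B = 20 bootstrap means
ci_bounds <- quantile(sim_bs_samples_X_bar, probs = c(0.025, 0.975))
CI_sim_percentile[i,1] <- ci_bounds[1]
CI_sim_percentile[i,2] <- ci_bounds[2]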
I have two large sparse matrices (about 41,000 x 55,000 in size). The density of nonzero elements is around 10%. They both have the same row index and column index for nonzero elements.
I now want to modify the values in the first sparse matrix if values in the second matrix are below a certain threshold.
library(Matrix)
# Generating the example matrices.
set.seed(42)
# Rows with values.
i <- sample(1:41000, 227000000, replace = TRUE)
# Columns with values.
j <- sample(1:55000, 227000000, replace = TRUE)
# Values for the first matrix.
x1 <- runif(227000000)
# Values for the second matrix.
x2 <- sample(1:3, 227000000, replace = TRUE)
# Constructing the matrices.
m1 <- sparseMatrix(i = i, j = j, x = x1)
m2 <- sparseMatrix(i = i, j = j, x = x2)
I now get the rows, columns and values from the first matrix in a new matrix. This way, I can simply subset them and only the ones I am interested in remain.
# Getting the positions and values from the matrices.
position_matrix_from_m1 <- rbind(i = m1@i, j = summary(m1)$j, x = m1@x)
position_matrix_from_m2 <- rbind(i = m2@i, j = summary(m2)$j, x = m2@x)
# Subsetting to get the elements of interest.
position_matrix_from_m1 <- position_matrix_from_m1[,position_matrix_from_m1[3,] > 0 & position_matrix_from_m1[3,] < 0.05]
# We add 1 to the values, since the sparse matrix is 0-based.
position_matrix_from_m1[1,] <- position_matrix_from_m1[1,] + 1
position_matrix_from_m1[2,] <- position_matrix_from_m1[2,] + 1
Now I am getting into trouble. Overwriting the values in the second matrix takes too long. I let it run for several hours and it did not finish.
# This takes hours.
m2[position_matrix_from_m1[1,], position_matrix_from_m1[2,]] <- 1
m1[position_matrix_from_m1[1,], position_matrix_from_m1[2,]] <- 0
I thought about pasting the row and column information together. Then I have a unique identifier for each value. This also takes too long and is probably just very bad practice.
# We would get the unique identifiers after the subsetting.
m1_identifiers <- paste0(position_matrix_from_m1[1,], "_", position_matrix_from_m1[2,])
m2_identifiers <- paste0(position_matrix_from_m2[1,], "_", position_matrix_from_m2[2,])
# Now, I could use which and get the position of the values I want to change.
# This also uses too much memory.
m2_identifiers_of_interest <- which(m2_identifiers %in% m1_identifiers)
# Then I would modify the x values in the position_matrix_from_m2 matrix and overwrite m2@x in the sparse matrix object.
Is there a fundamental error in my approach? What should I do to run this efficiently?
Is there a fundamental error in my approach?
Yes. Here it is.
# This takes hours.
m2[position_matrix_from_m1[1,], position_matrix_from_m1[2,]] <- 1
m1[position_matrix_from_m1[1,], position_matrix_from_m1[2,]] <- 0
The syntax mat[rn, cn] (whether mat is a dense or sparse matrix) selects all rows in rn and all columns in cn, so you get a length(rn) x length(cn) matrix. Here is a small example:
A <- matrix(1:9, 3, 3)
# [,1] [,2] [,3]
#[1,] 1 4 7
#[2,] 2 5 8
#[3,] 3 6 9
rn <- 1:2
cn <- 2:3
A[rn, cn]
# [,1] [,2]
#[1,] 4 7
#[2,] 5 8
What you intend to do is to select (rn[1], cn[1]), (rn[2], cn[2]) ..., only. The correct syntax is then mat[cbind(rn, cn)]. Here is a demo:
A[cbind(rn, cn)]
#[1] 4 8
So you need to fix your code to:
m2[cbind(position_matrix_from_m1[1,], position_matrix_from_m1[2,])] <- 1
m1[cbind(position_matrix_from_m1[1,], position_matrix_from_m1[2,])] <- 0
Oh wait... Based on your construction of position_matrix_from_m1, this is just
ij <- t(position_matrix_from_m1[1:2, ])
m2[ij] <- 1
m1[ij] <- 0
Now, let me explain how you can do better. You have underused summary(). It returns a 3-column data frame giving the (i, j, x) triplets, where both i and j are indices starting from 1. You could have worked with this nice output directly, as follows:
# Getting (i, j, x) triplet (stored as a data.frame) for both `m1` and `m2`
position_matrix_from_m1 <- summary(m1)
# you never seem to use `position_matrix_from_m2` so I skip it
# Subsetting to get the elements of interest.
position_matrix_from_m1 <- subset(position_matrix_from_m1, x > 0 & x < 0.05)
Now you can do:
ij <- as.matrix(position_matrix_from_m1[, 1:2])
m2[ij] <- 1
m1[ij] <- 0
Is there an even better solution? Yes! Note that the nonzero elements in m1 and m2 are located in the same positions. So basically, you just need to change m2@x according to m1@x.
ind <- m1@x > 0 & m1@x < 0.05
m2@x[ind] <- 1
m1@x[ind] <- 0
A complete R session
I don't have enough RAM to create your large matrix, so I reduced your problem size a little bit for testing. Everything worked smoothly.
library(Matrix)
# Generating the example matrices.
set.seed(42)
## reduce problem size to what my laptop can bear with
squeeze <- 0.1
# Rows with values.
i <- sample(1:(41000 * squeeze), 227000000 * squeeze ^ 2, replace = TRUE)
# Columns with values.
j <- sample(1:(55000 * squeeze), 227000000 * squeeze ^ 2, replace = TRUE)
# Values for the first matrix.
x1 <- runif(227000000 * squeeze ^ 2)
# Values for the second matrix.
x2 <- sample(1:3, 227000000 * squeeze ^ 2, replace = TRUE)
# Constructing the matrices.
m1 <- sparseMatrix(i = i, j = j, x = x1)
m2 <- sparseMatrix(i = i, j = j, x = x2)
## give me more usable RAM
rm(i, j, x1, x2)
##
## fix to your code
##
m1a <- m1
m2a <- m2
# Getting (i, j, x) triplet (stored as a data.frame) for both `m1` and `m2`
position_matrix_from_m1 <- summary(m1)
# Subsetting to get the elements of interest.
position_matrix_from_m1 <- subset(position_matrix_from_m1, x > 0 & x < 0.05)
ij <- as.matrix(position_matrix_from_m1[, 1:2])
m2a[ij] <- 1
m1a[ij] <- 0
##
## the best solution
##
m1b <- m1
m2b <- m2
ind <- m1@x > 0 & m1@x < 0.05
m2b@x[ind] <- 1
m1b@x[ind] <- 0
##
## they are identical
##
all.equal(m1a, m1b)
#[1] TRUE
all.equal(m2a, m2b)
#[1] TRUE
Caveat:
I know that some people may propose
m1c <- m1
m2c <- m2
logi <- m1 > 0 & m1 < 0.05
m2c[logi] <- 1
m1c[logi] <- 0
It looks completely natural in R's syntax. But trust me, it is extremely slow for large matrices, likely because m1 < 0.05 is TRUE for every structural zero, so the intermediate logical matrix is effectively dense.
I have two matrices, one is generated out of the other by deleting some rows. For example:
m = matrix(1:18, 6, 3)
m1 = m[c(-1, -3, -6),]
Suppose I do not know which rows in m were eliminated to create m1; how can I find out by comparing the two matrices? The result I want looks like this:
1, 3, 6
The actual matrix I am dealing with is very big, so I was wondering if there is an efficient way of doing this.
Here are some approaches:
1) If we can assume that there are no duplicated rows in m -- this is the case in the example in the question -- then stack m1 on top of m: every row of m that already appears in m1 is marked as a duplicate, so the non-duplicated rows among the last nrow(m) rows are exactly the deleted ones:
which(tail(!duplicated(rbind(m1, m)), nrow(m)))
## [1] 1 3 6
2) Transpose m and m1 giving tm and tm1 since it is more efficient to work on columns than rows.
Define match_indexes(i) which returns a vector r such that each row in m[r, ] matches m1[i, ].
Apply that to each i in 1:n1 and remove the result from 1:n.
n <- nrow(m); n1 <- nrow(m1)
tm <- t(m); tm1 <- t(m1)
match_indexes <- function(i) which(colSums(tm1[, i] == tm) == n1)
setdiff(1:n, unlist(lapply(1:n1, match_indexes)))
## [1] 1 3 6
3) Calculate an interaction vector for each matrix and then use setdiff and finally match to get the indexes:
i <- interaction(as.data.frame(m))
i1 <- interaction(as.data.frame(m1))
match(setdiff(i, i1), i)
## [1] 1 3 6
Added: If there can be duplicates in m, then (1) and (3) will only return the first of any multiply occurring row in m that is not in m1.
m <- matrix(1:18, 6, 3)
m1 <- m[c(2, 4, 5),]
m <- rbind(m, m[1:2, ])
# 1
which(tail(!duplicated(rbind(m1, m)), nrow(m)))
## 1 3 6
# 2
n <- nrow(m); n1 <- nrow(m1)
tm <- t(m); tm1 <- t(m1)
match_indexes <- function(i) which(colSums(tm1[, i] == tm) == n1)
setdiff(1:n, unlist(lapply(1:n1, match_indexes)))
## 1 3 6 7
# 3
i <- interaction(as.data.frame(m))
i1 <- interaction(as.data.frame(m1))
match(setdiff(i, i1), i)
## 1 3 6
A possible way is to represent each row as a string:
x1 <- apply(m, 1, paste0, collapse = ';')
x2 <- apply(m1, 1, paste0, collapse = ';')
which(!x1 %in% x2)
# [1] 1 3 6
Some benchmarks with a large matrix, using my solution and G. Grothendieck's solutions:
set.seed(123)
m <- matrix(rnorm(20000 * 5000), nrow = 20000)
m1 <- m[-sample.int(20000, 1000), ]
system.time({
which(tail(!duplicated(rbind(m1, m)), nrow(m)))
})
# user system elapsed
# 339.888 2.368 342.204
system.time({
x1 <- apply(m, 1, paste0, collapse = ';')
x2 <- apply(m1, 1, paste0, collapse = ';')
which(!x1 %in% x2)
})
# user system elapsed
# 395.428 0.568 395.955
system.time({
n <- nrow(m); n1 <- nrow(m1)
tm <- t(m); tm1 <- t(m1)
match_indexes <- function(i) which(colSums(tm1[, i] == tm) == n1)
setdiff(1:n, unlist(lapply(1:n1, match_indexes)))
})
# > 15 min, not finish
system.time({
i <- interaction(as.data.frame(m))
i1 <- interaction(as.data.frame(m1))
match(setdiff(i, i1), i)
})
# run out of memory. My 32G RAM machine crashed.
We can also use do.call, which pastes the columns of each row into a single key string:
which(!do.call(paste, as.data.frame(m)) %in% do.call(paste, as.data.frame(m1)))
#[1] 1 3 6
I have a matrix temp1 (dimensions Nx16) (generally, NxM)
I would like to sum every k columns in each row to one value.
Here is what I got to so far:
cbind(rowSums(temp1[,c(1:4)]), rowSums(temp1[,c(5:8)]), rowSums(temp1[,c(9:12)]), rowSums(temp1[,c(13:16)]))
There must be a more elegant (and generalized) method to do it.
I have noticed similar question here:
sum specific columns among rows
I couldn't make it work with Ananda's solution;
I got the following error:
sapply(split.default(temp1, 0:(length(temp1)-1) %/% 4), rowSums)
Error in FUN(X[[1L]], ...) :
'x' must be an array of at least two dimensions
Please advise.
You can use by:
do.call(cbind, by(t(temp1), (seq(ncol(temp1)) - 1) %/% 4, FUN = colSums))
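A related base-R idiom (my own addition, assuming ncol(temp1) is divisible by 4) uses rowsum() on the transpose, which sums rows by a grouping vector:
n <- 4
grp <- (seq_len(ncol(temp1)) - 1) %/% n + 1  # column group labels: 1,1,1,1,2,2,...
t(rowsum(t(as.matrix(temp1)), group = grp))  # one summed column per group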
If the dimensions are equal for the sub matrices, you could change the dimensions to an array and then do the rowSums
m1 <- as.matrix(temp1)
n <- 4
dim(m1) <- c(nrow(m1), ncol(m1)/n, n)
res <- matrix(rowSums(apply(m1, 2, I)), ncol=n)
identical(res[,1],rowSums(temp1[,1:4]))
#[1] TRUE
Or if the dimensions are unequal
t(sapply(seq(1,ncol(temp2), by=4), function(i) {
indx <- i:(i+3)
rowSums(temp2[indx[indx <= ncol(temp2)]])}))
data
set.seed(24)
temp1 <- as.data.frame(matrix(sample(1:20, 16*4, replace=TRUE), ncol=16))
set.seed(35)
temp2 <- as.data.frame(matrix(sample(1:20, 17*4, replace=TRUE), ncol=17))
Another possibility:
x1<-sapply(1:(ncol(temp1)/4),function(x){rowSums(temp1[,1:4+(x-1)*4])})
## check
x0<-cbind(rowSums(temp1[,c(1:4)]), rowSums(temp1[,c(5:8)]), rowSums(temp1[,c(9:12)]), rowSums(temp1[,c(13:16)]))
identical(x1,x0)
# TRUE
Here's another approach. Convert the matrix to an array and then use apply with sum.
n <- 4
apply(array(temp1, dim=c(dim(temp1)/c(1,n), n)), MARGIN=c(1,3), FUN=sum)
Using #akrun's data
set.seed(24)
temp1 <- matrix(sample(1:20, 16*4, replace=TRUE), ncol=16)
A function which sums matrix columns in groups of n columns:
set.seed(1618)
mat <- matrix(rnorm(24 * 16), 24, 16)
f <- function(mat, n = 4) {
if (ncol(mat) %% n != 0)
stop()
cols <- split(colSums(mat), rep(1:(ncol(mat) / n), each = n))
## or use this to have n mean the number of groups you want
# cols <- split(colSums(mat), rep(1:n, each = ncol(mat) / n))
sapply(cols, sum)
}
f(mat, 4)
# 1 2 3 4
# -17.287137 -1.732936 -5.762159 -4.371258
c(sum(mat[,1:4]), sum(mat[,5:8]), sum(mat[,9:12]), sum(mat[,13:16]))
# [1] -17.287137 -1.732936 -5.762159 -4.371258
More examples:
## first 8 and last 8 cols
f(mat, 8)
# 1 2
# -19.02007 -10.13342
## each group is 16 cols, ie, the entire matrix
f(mat, 16)
# 1
# -29.15349
sum(mat)
# [1] -29.15349
I've got a set of objects, let's say with the IDs 'A' to 'J'. And I've got two data frames which look the following way (as you can see, the second data frame is symmetric):
df1 <- data.frame(ID = LETTERS[1:5], Var = c(9,13,15,11,28))
df2 <- as.data.frame(matrix(data = c(NA,42,83,74,84,42,NA,26,69,9,83,26,NA,67,95,74,69,67,NA,6,84,9,95,6,NA), ncol = 5, nrow = 5, dimnames = list(df1$ID, df1$ID)))
For example, take the objects 'B' and 'E'. I want to know: Is 13+28 (from df1) less than 9 (from df2)? I'd like to know this for all pairs of objects. The output should be
(a) a logical data frame structured like df2 and
(b) the number of "TRUE" values.
Most of the time I will only need result (b), but sometimes I would also need (a). So if (b) can be calculated without (a) and if this would be significantly faster, then I'd like to have both algorithms in order to select the suitable one dependent on which output I need to answer a particular question.
I'm comparing around 2000 objects, so the algorithm should be reasonably fast. So far I've been only able to implement this with two nested for-loops which is awfully slow. I bet there is a much nicer way to do this, maybe exploiting vectorisation.
This is what it currently looks like:
df3 <- as.data.frame(matrix(data = NA, ncol = nrow(df1), nrow = nrow(df1),
dimnames = list(df1$ID, df1$ID)))
for (i in 2:nrow(df3)){
for (j in 1:(i-1)){
sum.val <- df1[df1$ID == rownames(df3)[i], "Var"] + df1[df1$ID == names(df3)[j], "Var"]
df3[i,j] <- sum.val <= df2[i,j]
}
}
#
Is this what you want?
df3 <- outer(df1$Var, df1$Var, "+")
df3
df4 <- df3 < df2
df4
sum(df4, na.rm = TRUE)
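Note that df4 is symmetric, so sum(df4, na.rm = TRUE) counts each unordered pair twice. If you want each pair counted once (a detail I'm adding, not part of the original answer), restrict the count to the upper triangle:
# count each unordered pair once
sum(df4[upper.tri(df4)], na.rm = TRUE)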
Here's one way to do it...
# Get row and column indices
ind <- t( combn( df1$ID , 2 ) )
# Get totals
tot <- with( df1 , Var[ match( ind[,1] , ID ) ] + Var[ match( ind[,2] , ID ) ] )
# Make df2 a matrix
m <- as.matrix( df2 )
# Total number of values is simply
sum( m[ ind ] > tot )
#[1] 7
# Find which values in upper triangle part of the matrix exceed those from df1 (1 = TRUE)
m[upper.tri(m)] <- m[ ind ] > tot
# A B C D E
#A NA 1 1 1 0
#B 42 NA 1 0 1
#C 83 26 NA 1 1
#D 74 69 67 NA 0
#E 84 9 95 6 NA
This will do what you want.
# Generate the data
df1 <- data.frame(ID = LETTERS[1:5], Var = c(9,13,15,11,28))
df2 <- as.data.frame(matrix(data = c(NA,42,83,74,84,42,NA,26,
69,9,83,26,NA,67,95,74,69,
67,NA,6,84,9,95,6,NA),
ncol = 5, nrow = 5,
dimnames = list(df1$ID, df1$ID)))
# Define a pairwise comparison index matrix using 'combn'
idx <- combn(nrow(df1), 2)
# Create a results matrix
res <- matrix(NA, ncol = ncol(df2), nrow = nrow(df2))
# Loop through 'idx' for each possible comparison (without repeats)
for(i in 1:ncol(idx)){
logiTest <- (df1$Var[idx[1,i]] + df1$Var[idx[2,i]]) < df2[idx[1,i], idx[2,i]]
res[idx[1,i], idx[2, i]] <- logiTest
res[idx[2,i], idx[1, i]] <- logiTest
}
# Count the number of 'true' comparisons
nTrues <- sum(res, na.rm = TRUE)/2
The code simply uses a pairwise comparison index (idx) to define which elements in both df1 and df2 are to be used in each iteration of the 'for loop'. It then uses this same index to define where in the 'res' matrix the answer to the logical test is to be written.
N.B. This code will break down if the order of elements in df1 and df2 are not the same. In such cases, it would be appropriate to use the actual letters to define which values to compare.
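A minimal sketch of that idea (my own illustration, assuming rownames(df2) carry the IDs): align df1's values to df2's row/column order by ID before comparing, so the element order no longer matters.
# Align Var to df2's ID order, then compare pairwise sums against df2
v <- df1$Var[match(rownames(df2), df1$ID)]
res <- outer(v, v, "+") < as.matrix(df2)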