Distribute `n` among `k` units without repetition and without zero entries in R

I was wondering if there might be a way in R to distribute n among k units without repetition (e.g., 3 5 2 is treated as the same as 5 3 2, 2 3 5, and 5 2 3) and without allowing zero entries (i.e., no 9 1 0), and to see the make-up of this distribution.
For example if n = 9 and k = 3 then we expect the make-up to be:
(Note: k will always be the # of columns)
3 3 3
4 3 2
4 1 4
5 2 2
5 1 3
6 2 1
7 1 1
makeup <- function(n, k){
# your suggested solution #
}

These are called integer partitions (more specifically restricted integer partitions) and can efficiently be generated with the packages partitions or arrangements like so:
partitions::restrictedparts(9, 3, include.zero = FALSE)
[1,] 7 6 5 4 5 4 3
[2,] 1 2 3 4 2 3 3
[3,] 1 1 1 1 2 2 3
arrangements::partitions(9, 3)
[,1] [,2] [,3]
[1,] 1 1 7
[2,] 1 2 6
[3,] 1 3 5
[4,] 1 4 4
[5,] 2 2 5
[6,] 2 3 4
[7,] 3 3 3
They are much faster than the other solutions provided so far:
library(microbenchmark)
microbenchmark(arrangePack = arrangements::partitions(20, 5),
partsPack = partitions::restrictedparts(20, 5, include.zero = FALSE),
myfun2(20, 5, 20),
myfun1(20, 5, 20),
makeup(20, 5),
mycomb(20, 5), times = 3, unit = "relative")
Unit: relative
expr min lq mean median uq max neval
arrangePack 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 3
partsPack 3.070203 2.755573 2.084231 2.553477 1.854912 1.458389 3
myfun2(20, 5, 20) 10005.679667 8528.784033 6636.284386 7580.133387 5852.625112 4872.050067 3
myfun1(20, 5, 20) 12770.400243 10574.957696 8005.844282 9164.764625 6897.696334 5610.854109 3
makeup(20, 5) 15422.745155 12560.083171 9248.916738 10721.316721 7812.997976 6162.166646 3
mycomb(20, 5) 1854.125325 1507.150003 1120.616461 1284.278219 950.015812 760.280469 3
In fact, for the example below, the other functions will error out because of memory:
system.time(arrangements::partitions(100, 10))
user system elapsed
0.068 0.031 0.099
arrangements::npartitions(100, 10)
[1] 2977866

You may try gtools::combinations for this, with the repeats.allowed = TRUE option, like below:
m <- gtools::combinations(9, 3, repeats.allowed = TRUE)
m[rowSums(m) == 9,]
A possible function is shown below; with options(expressions = 500000) it can go up to n = 500 (it ran successfully on my machine for n = 500, r = 3):
mycomb <- function(n, r, sumval){
  library(gtools)                                  # for combinations()
  m <- combinations(n, r, repeats.allowed = TRUE)
  m[rowSums(m) == sumval, ]
}
mycomb(9,3,9)
Output:
# [,1] [,2] [,3]
#[1,] 1 1 7
#[2,] 1 2 6
#[3,] 1 3 5
#[4,] 1 4 4
#[5,] 2 2 5
#[6,] 2 3 4
#[7,] 3 3 3

Here's a base solution using expand.grid. I'm not going to recommend it for large n, but it works:
makeup <- function(n, k) {
  x <- expand.grid(rep(list(1:n), k))      # generate all k-tuples of values 1..n
  x <- x[rowSums(x) == n, ]                # keep only rows that sum to n
  x <- as.data.frame(t(apply(x, 1, sort))) # sort each row
  unique(x)                                # drop duplicates
}
A little rethinking simplifies this greatly. If we have a vector of n objects, we can break it apart at n - 1 different spots. Starting from this, we can reduce the work substantially:
makeup <- function(n, k) {
  splits <- combn(n - 1, k - 1)                    # locations at which to split up the data
  bins <- rbind(rep(0, ncol(splits)), splits)      # add an extra "split" before the 1st element
  x <- apply(bins, 2, function(x) c(x[-1], n) - x) # count how many items fall in each bin
  x <- as.data.frame(t(apply(x, 2, sort)))         # sort each partition
  unique(x)                                        # drop duplicates
}
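A quick sanity check of this combn-based version (the row names are simply whichever rows unique() happened to keep):
makeup(9, 3)
#    V1 V2 V3
# 1   1  1  7
# 2   1  2  6
# 3   1  3  5
# 4   1  4  4
# 9   2  2  5
# 10  2  3  4
# 16  3  3  3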

Using a matrix in base R:
myfun1 <- function( n, k){
x <- as.matrix(expand.grid( rep(list(seq_len(n)), k)))
x <- x[rowSums(x) == n,]
x[ ! duplicated( t( apply(x, 1, sort)) ),]
}
myfun1( n = 9, k = 3 )
Maybe this, using data.table:
myfun2 <- function( n, k){
require('data.table')
dt <- do.call(CJ, rep(list(seq_len(n)), k))
dt <- dt[rowSums(dt) == n,]
dt[which(!duplicated(dt[, transpose(lapply( transpose(.SD), sort ))])),]
}
myfun2( n = 9, k = 3 )
# V1 V2 V3
# 1: 7 1 1
# 2: 6 2 1
# 3: 5 3 1
# 4: 4 4 1
# 5: 5 2 2
# 6: 4 3 2
# 7: 3 3 3

How to find the columns with the most similar values

Just starting to use R and am feeling a bit confused. Suppose I have three columns
data = data.frame(id=c(101, 102, 103),column1=c(2, 4, 9),
column2=c(3, 4, 2), column3=c(5, 15, 7))
How can I create a new column (e.g., colmean) that is the mean of the two columns closest in value? I thought about doing a bunch of ifelse statements, but that seemed unnecessarily messy.
In this case, for instance, colmean=c(2.5, 4, 8).
Borrowing the function findClosest() created here by @Cole, we can do the following:
findClosest <- function(x, n) {
x <- sort(x)
x[seq.int(which.min(diff(x, lag = n - 1L)), length.out = n)]
}
colMeans(apply(data[-1], 1, function(i)findClosest(i, 2)))
#[1] 2.5 4.0 8.0
A vectorized function using the Rfast package:
library(Rfast)
fClosest <- function(m, n) {
m <- colSort(t(m))
matrix(
m[
sequence(
rep(n, ncol(m)),
seq(0, nrow(m)*(ncol(m) - 1), nrow(m)) + colMins(diff(m, lag = n - 1))
)
],
ncol(m), n, TRUE
)
}
m <- matrix(sample(10, 24, 1), 4)
m
#> [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,] 4 2 6 2 5 3
#> [2,] 3 4 7 3 4 7
#> [3,] 4 2 7 6 10 2
#> [4,] 8 1 10 8 2 9
fClosest(m, 3L)
#> [,1] [,2] [,3]
#> [1,] 2 2 3
#> [2,] 3 3 4
#> [3,] 2 2 4
#> [4,] 8 8 9
rowMeans(fClosest(m, 3L))
#> [1] 2.333333 3.333333 2.666667 8.333333
Here is a version with a loop:
data = data.frame(id=c(101, 102, 103),column1=c(2, 4, 9),
column2=c(3, 4, 2), column3=c(5, 15, 7))
data$colmean <- NaN # set up empty column for results
for(i in seq(nrow(data))){
data.i <- data[i,-1] # get ith row
d <- as.matrix(dist(c(data.i))) # get distances between values
diag(d) <- NaN # replace diagonal of distance matrix with NaN
hit <- which.min(d) # identify value of lowest distance
pos <- c(row(d)[hit], col(d)[hit]) # get the position (i.e. the values that are closest)
data$colmean[i] <- mean(unlist(data.i[pos])) # calculate mean
}
data
# id column1 column2 column3 colmean
# 1 101 2 3 5 2.5
# 2 102 4 4 15 4.0
# 3 103 9 2 7 8.0
Here's a self-contained solution, based on the tidyverse, that is independent of the number of columns to be compared.
library(tidyverse)
data %>%
# Add the means of smallest pairwise differences to the input data
bind_cols(
data %>%
# Make the data tidy (and hence independent of the number of "column"s)
pivot_longer(starts_with("column")) %>%
# For each id/row (replace with rowwise() if appropriate)
group_by(id) %>%
group_map(
function(.x, .y) {
# Form a tibble of all pairwise combinations of values
as_tibble(t(combn(.x$value, 2))) %>%
# Calculate pairwise differences
mutate(difference = abs(V1 - V2)) %>%
# Find the smallest pairwise difference
arrange(difference) %>%
head(1) %>%
# Calculate the mean of this pair
pivot_longer(starts_with("V")) %>%
summarise(colmean=mean(value))
}
) %>%
# Convert list of values to column
bind_rows()
)
id column1 column2 column3 colmean
1 101 2 3 5 2.5
2 102 4 4 15 4.0
3 103 9 2 7 8.0

Generate series 1, 2,1, 3,2,1, 4,3,2,1, 5,4,3,2,1

I am trying to generate a vector containing decreasing sequences of increasing length, such as 1, 2,1, 3,2,1, 4,3,2,1, 5,4,3,2,1, i.e.
c(1, 2:1, 3:1, 4:1, 5:1)
I tried to use a loop for this, but I don't know how to stack or concatenate the results.
for (i in 1:11)
{
x = rev(seq(i:1))
print(x)
}
[1] 1
[1] 2 1
[1] 3 2 1
[1] 4 3 2 1
[1] 5 4 3 2 1
[1] 6 5 4 3 2 1
[1] 7 6 5 4 3 2 1
[1] 8 7 6 5 4 3 2 1
[1] 9 8 7 6 5 4 3 2 1
[1] 10 9 8 7 6 5 4 3 2 1
[1] 11 10 9 8 7 6 5 4 3 2 1
I have also been experimenting with rep, rev and seq, which would be my favourite option, but I did not get far.
With sequence:
rev(sequence(5:1))
# [1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1
From R 4.0.0 sequence takes arguments from and by:
sequence(1:5, from = 1:5, by = -1)
# [1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1
Far from the golf minimalism of rev... However, if you wake up one morning and want to create such a sequence with n = 1000 (like in the answer below), the latter is in fact faster (but I can hear Brian Ripley in fortunes::fortune(98))
n = 1000
microbenchmark(
f_rev = rev(sequence(n:1)),
f_seq4.0.0 = sequence(1:n, from = 1:n, by = -1))
# Unit: microseconds
# expr min lq mean median uq max neval
# f_rev 993.7 1040.3 1128.391 1076.95 1133.3 1904.7 100
# f_seq4.0.0 136.4 141.5 153.778 148.25 150.1 304.7 100
We can do this with lapply
unlist(lapply(1:11, function(x) rev(seq(x))))
Or, as @zx8754 mentioned in the comments, x:1 can be used in place of rev(seq(x)):
unlist(lapply(1:11, function(x) x:1))
Or, as @BrodieG suggested, we can make this more compact by removing the anonymous function call:
unlist(lapply(1:11, ":", 1))
And for fun, using matrices (and ignoring the warning ;) )
m <- matrix(c(1:5,0), ncol = 5, nrow = 5, byrow = T)
m[ upper.tri(m, diag = T) ]
# [1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1
And we can simplify the upper.tri into its component parts
m[ row(m) <= col(m)]
# [1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1
And if you can handle even more fun, then how about some benchmarking:
library(microbenchmark)
maxValue <- 1000
vec2 <- maxValue:1
m2 <- matrix(c(1:maxValue,0), ncol = maxValue, nrow = maxValue, byrow = T)
microbenchmark(
henrik = {
rev(sequence(maxValue:1))
},
henrik_4.0.0 = {
sequence(1:maxValue, from = 1:maxValue, by = -1)
},
akrun = {
unlist(lapply(1:maxValue, function(x) x:1))
},
symbolix1 = {
m <- matrix(c(1:maxValue,0), ncol = maxValue, nrow = maxValue, byrow = T)
m[ row(m) <= col(m) ]
},
symbolix2 = {
m2[ row(m2) <= col(m2) ]
},
lmo1 = {
unlist(lapply(1:maxValue, tail, x=maxValue:1))
},
lmo2 = {
vec <- maxValue:1
unlist(lapply(rev(vec), tail, x=vec))
},
lmo3 = {
unlist(lapply(rev(vec2), tail, x=vec2))
}
)
# Unit: microseconds
# expr min lq mean median uq max neval
# henrik 1018.7 1068.20 1176.430 1103.65 1223.20 2348.4 100
# henrik_4.0.0 139.9 147.90 166.092 151.40 162.70 379.0 100
# akrun 3420.1 3637.75 3825.336 3729.10 3897.00 4960.6 100
# symbolix1 6999.5 7483.20 7807.747 7618.30 7810.70 12138.7 100
# symbolix2 4791.2 5043.00 5677.742 5190.50 5393.65 29318.7 100
# lmo1 7530.1 7967.05 10918.201 8161.10 8566.45 132324.1 100
# lmo2 7385.7 8017.95 12271.158 8213.90 8500.70 143798.2 100
# lmo3 7539.5 7959.05 14355.810 8177.85 8500.85 131154.2 100
In this example, henrik_4.0.0 is the winner! (For a benchmark with only the pre-R 4.0.0 sequence, see previous edits.)
But I know what you're thinking, 'why end all the fun there!'
Well, lets write our own C++ function and see how that performs
library(Rcpp)
cppFunction('NumericVector reverseSequence(int maxValue, int vectorLength){
NumericVector out(vectorLength);
int counter = 0;
for(int i = 1; i <= maxValue; i++){
for(int j = i; j > 0; j--){
out[counter] = j;
counter++;
}
}
return out;
}')
maxValue <- 5
reverseSequence(maxValue, sum(1:maxValue))
[1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1
library(microbenchmark)
maxValue <- 1000
microbenchmark(
akrun = {
unlist(sapply(1:maxValue, function(x) x:1))
},
symbolix3 = {
reverseSequence(maxValue, sum(1:maxValue))
}
)
# Unit: microseconds
# expr min lq mean median uq max neval
# akrun 1522.250 1631.6030 3148.922 1829.9370 3357.493 45576.148 100
# symbolix3 338.626 495.3825 1293.720 950.6635 2169.656 3816.091 100
Another alternative is to use tail within lapply, to successively select the number of elements to keep from the initial vector:
unlist(lapply(1:5, tail, x=5:1))
[1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1
Or, it may be faster to construct the base vector first and then call on it:
vec <- 5:1
unlist(lapply(rev(vec), tail, x=vec))
[1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1

Create N random integers with no gaps

For a clustering algorithm that I'm implementing, I would like to initialize the cluster assignments at random. However, I need there to be no gaps. That is, this is not OK:
set.seed(2)
K <- 10 # initial number of clusters
N <- 20 # number of data points
z_init <- sample(K,N, replace=TRUE) # initial assignments
z_init
# [1] 2 8 6 2 10 10 2 9 5 6 6 3 8 2 5 9 10 3 5 1
sort(unique(z_init))
# [1] 1 2 3 5 6 8 9 10
where labels 4 and 7 have not been used.
Instead, I would like this vector to be:
# [1] 2 6 5 2 8 8 2 7 4 5 5 3 6 2 4 7 8 3 4 1
where the label 5 has become 4 and so forth to fill the lower empty labels.
More examples:
The vector 1 2 3 5 6 8 should be 1 2 3 4 5 6
The vector 15, 5, 7, 7, 10 should be 1 2 3 3 4
Can it be done avoiding for loops? I don't need it to be fast, I prefer it to be elegant and short, since I'm doing it only once in the code (for label initialization).
My solution using a for loop
z_init <- c(3,2,1,3,3,7,9)
idx <- order(z_init)
for (i in 2:length(z_init)){
if(z_init[idx[i]] > z_init[idx[i-1]]){
z_init[idx[i]] <- z_init[idx[i-1]]+1
}
else{
z_init[idx[i]] <- z_init[idx[i-1]]
}
}
z_init
# 3 2 1 3 3 4 5
Edit: @GregSnow came up with the current shortest answer. I'm 100% convinced that this is the shortest possible way.
For fun, I decided to golf the code, i.e. write it as short as possible:
z <- c(3, 8, 4, 4, 8, 2, 3, 9, 5, 1, 4)
# solution by hand: 1 2 3 3 4 4 4 5 6 6 7
sort(c(factor(z))) # 18 bits, as proposed by @GregSnow in the comments
# [1] 1 2 3 3 4 4 4 5 6 6 7
Some other (functioning) attempts:
y=table(z);rep(seq(y),y) # 24 bits
sort(unclass(factor(z))) # 24 bits, based on @GregSnow's answer
diffinv(diff(sort(z))>0)+1 # 26 bits
sort(as.numeric(factor(z))) # 27 bits, @GregSnow's original answer
rep(seq(unique(z)),table(z)) # 28 bits
cumsum(c(1,diff(sort(z))>0)) # 28 bits
y=rle(sort(z))$l;rep(seq(y),y) # 30 bits
Edit2: Just to show that bits isn't everything:
z <- sample(1:10,10000,replace=T)
Unit: microseconds
expr min lq mean median uq max neval
sort(c(factor(z))) 2550.128 2572.2340 2681.4950 2646.6460 2729.7425 3140.288 100
{ y = table(z) rep(seq(y), y) } 2436.438 2485.3885 2580.9861 2556.4440 2618.4215 3070.812 100
sort(unclass(factor(z))) 2535.127 2578.9450 2654.7463 2623.9470 2708.6230 3167.922 100
diffinv(diff(sort(z)) > 0) + 1 551.871 572.2000 628.6268 626.0845 666.3495 940.311 100
sort(as.numeric(factor(z))) 2603.814 2672.3050 2762.2030 2717.5050 2790.7320 3558.336 100
rep(seq(unique(z)), table(z)) 2541.049 2586.0505 2733.5200 2674.0815 2760.7305 5765.815 100
cumsum(c(1, diff(sort(z)) > 0)) 530.159 545.5545 602.1348 592.3325 632.0060 844.385 100
{ y = rle(sort(z))$l rep(seq(y), y) } 661.218 684.3115 727.4502 724.1820 758.3280 857.412 100
z <- sample(1:100000,replace=T)
Unit: milliseconds
expr min lq mean median uq max neval
sort(c(factor(z))) 84.501189 87.227377 92.13182 89.733291 94.16700 150.08327 100
{ y = table(z) rep(seq(y), y) } 78.951701 82.102845 85.54975 83.935108 87.70365 106.05766 100
sort(unclass(factor(z))) 84.958711 87.273366 90.84612 89.317415 91.85155 121.99082 100
diffinv(diff(sort(z)) > 0) + 1 9.784041 9.963853 10.37807 10.090965 10.34381 17.26034 100
sort(as.numeric(factor(z))) 85.917969 88.660145 93.42664 91.542263 95.53720 118.44512 100
rep(seq(unique(z)), table(z)) 86.568528 88.300325 93.01369 90.577281 94.74137 118.03852 100
cumsum(c(1, diff(sort(z)) > 0)) 9.680615 9.834175 10.11518 9.963261 10.16735 14.40427 100
{ y = rle(sort(z))$l rep(seq(y), y) } 12.842614 13.033085 14.73063 13.294019 13.66371 133.16243 100
It seems to me that you are trying to randomly assign elements of a set (the numbers 1 to 20) to clusters, subject to the requirement that each cluster be assigned at least one element.
One approach that I could think of would be to select a random reward r_ij for assigning element i to cluster j. Then I would define binary decision variables x_ij that indicate whether element i is assigned to cluster j. Finally, I would use mixed integer optimization to select the assignment from elements to clusters that maximizes the collected reward subject to the following conditions:
Every element is assigned to exactly one cluster
Every cluster has at least one element assigned to it
This is equivalent to randomly selecting an assignment, keeping it if all clusters have at least one element, and otherwise discarding it and trying again until you get a valid random assignment.
In terms of implementation, this is pretty easy to accomplish in R using the lpSolve package:
library(lpSolve)
N <- 20
K <- 10
set.seed(144)
r <- matrix(rnorm(N*K), N, K)
mod <- lp(direction = "max",
objective.in = as.vector(r),
const.mat = rbind(t(sapply(1:K, function(j) rep((1:K == j) * 1, each=N))),
t(sapply(1:N, function(i) rep((1:N == i) * 1, K)))),
const.dir = c(rep(">=", K), rep("=", N)),
const.rhs = rep(1, N+K),
all.bin = TRUE)
(assignments <- apply(matrix(mod$solution, nrow=N), 1, function(x) which(x > 0.999)))
# [1] 6 5 3 3 5 6 6 9 2 1 3 4 7 6 10 2 10 6 6 8
sort(unique(assignments))
# [1] 1 2 3 4 5 6 7 8 9 10
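For comparison, the rejection-sampling equivalent described above is also short to write directly (a small sketch; z_rand is my own name, and N and K are as in the question):
z_rand <- sample(K, N, replace = TRUE)
while (length(unique(z_rand)) < K) {
  z_rand <- sample(K, N, replace = TRUE)  # redraw until all K labels are used
}
sort(unique(z_rand))
# [1]  1  2  3  4  5  6  7  8  9 10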
You could do like this:
un <- sort(unique(z_init))
(z <- unname(setNames(1:length(un), un)[as.character(z_init)]))
# [1] 2 6 5 2 8 8 2 7 4 5 5 3 6 2 4 7 8 3 4 1
sort(unique(z))
# [1] 1 2 3 4 5 6 7 8
Here I replace elements of un in z_init with corresponding elements of 1:length(un).
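An equivalent one-liner (a sketch of the same relabelling idea) uses match() to look up each value's position among the sorted unique labels:
match(z_init, sort(unique(z_init)))
# [1] 2 6 5 2 8 8 2 7 4 5 5 3 6 2 4 7 8 3 4 1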
A simple (but possibly inefficient) approach is to convert to a factor then back to numeric. Creating the factor will code the information as integers from 1 to the number of unique values, then add labels with the original values. Converting to numeric then drops the labels and leaves the numbers:
> x <- c(1,2,3,5,6,8)
> (x2 <- as.numeric(factor(x)))
[1] 1 2 3 4 5 6
>
> xx <- c(15,5,7,7,10)
> (xx2 <- as.numeric(factor(xx)))
[1] 4 1 2 2 3
> (xx3 <- as.numeric(factor(xx, levels=unique(xx))))
[1] 1 2 3 3 4
The levels = portion in the last example sets the numbers to match the order in which they appear in the original vector.

How to select/find coordinates within a distance from a list (X/Y) using R

I have a data frame with a list of X/Y locations (>2000 rows). What I want is to select or find all the rows/locations that lie within a maximum distance of each other. For example, from the data frame select all the locations that are between 1 and 100 km from each other. Any suggestions on how to do this?
You need to somehow determine the distance between each pair of rows.
The simplest way is with a corresponding distance matrix
library(data.table)
# assuming thresh is your threshold
thresh <- 10
# create some sample data
set.seed(123)
DT <- data.table(X=sample(-10:10, 5, TRUE), Y=sample(-10:10, 5, TRUE))
# create the distance matrix
distTable <- matrix(apply(createTable(DT), 1, distance), nrow=nrow(DT))
# remove the lower.triangle since we have symmetry (we don't want duplicates)
distTable[lower.tri(distTable)] <- NA
# Show which rows are above the threshold
pairedRows <- which(distTable >= thresh, arr.ind=TRUE)
colnames(pairedRows) <- c("RowA", "RowB") # clean up the names
Starting with:
> DT
X Y
1: -4 -10
2: 6 1
3: -2 8
4: 8 1
5: 9 -1
We get:
> pairedRows
RowA RowB
[1,] 1 2
[2,] 1 3
[3,] 2 3
[4,] 1 4
[5,] 3 4
[6,] 1 5
[7,] 3 5
These are the two functions used for creating the distance matrix
# pair-up all of the rows
createTable <- function(DT)
expand.grid(apply(DT, 1, list), apply(DT, 1, list))
# simple cartesian/pythagorean distance
distance <- function(CoordPair)
sqrt(sum((CoordPair[[2]][[1]] - CoordPair[[1]][[1]])^2, na.rm=FALSE))
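As a side note, base R's dist() computes all of these pairwise Euclidean distances in one call, so a more compact sketch of the same idea (reusing DT and thresh from above) would be:
distTable2 <- as.matrix(dist(DT))            # full symmetric distance matrix
distTable2[lower.tri(distTable2)] <- NA      # keep each pair only once
which(distTable2 >= thresh, arr.ind = TRUE)  # pairs at or above the threshold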
I'm not entirely clear from your question, but assuming you mean you want to take each row of coordinates and find all the other rows whose coordinates fall within a certain distance:
# Create data set for example
set.seed(42)
x <- sample(-100:100, 10)
set.seed(456)
y <- sample(-100:100, 10)
coords <- data.frame(
"x" = x,
"y" = y)
# Loop through all rows
lapply(1:nrow(coords), function(i) {
dis <- sqrt(
(coords[i,"x"] - coords[, "x"])^2 + # insert your preferred
(coords[i,"y"] - coords[, "y"])^2 # distance calculation here
)
names(dis) <- 1:nrow(coords) # replace this part with an index or
# row names if you have them
dis[dis > 0 & dis <= 100] # change numbers to preferred threshold
})
[[1]]
2 6 7 9 10
25.31798 95.01579 40.01250 30.87070 73.75636
[[2]]
1 6 7 9 10
25.317978 89.022469 51.107729 9.486833 60.539243
[[3]]
5 6 8
70.71068 91.78780 94.86833
[[4]]
5 10
40.16217 99.32774
[[5]]
3 4 6 10
70.71068 40.16217 93.40771 82.49242
[[6]]
1 2 3 5 7 8 9 10
95.01579 89.02247 91.78780 93.40771 64.53681 75.66373 97.08244 34.92850
[[7]]
1 2 6 9 10
40.01250 51.10773 64.53681 60.41523 57.55867
[[8]]
3 6
94.86833 75.66373
[[9]]
1 2 6 7 10
30.870698 9.486833 97.082439 60.415230 67.119297
[[10]]
1 2 4 5 6 7 9
73.75636 60.53924 99.32774 82.49242 34.92850 57.55867 67.11930

Extracting off-diagonal slice of large matrix

I've got a large nxn matrix and would like to take off-diagonal slices of varying sizes. For example:
1 2 3 4 5 6
1 2 3 4 5 6
1 2 3 4 5 6
1 2 3 4 5 6
1 2 3 4 5 6
1 2 3 4 5 6
I'd like an R function which, when given the matrix and "width of diagonal slice" would return an nxn matrix of just those values. So for the matrix above and, say, 3, I'd get:
1 x x x x x
1 2 x x x x
1 2 3 x x x
x 2 3 4 x x
x x 3 4 5 x
x x x 4 5 6
At the moment I'm using (forgive me) a for loop which is incredibly slow:
getDiags<-function(ndiags, cormat){
resmat=matrix(ncol=ncol(cormat),nrow=nrow(cormat))
dimnames(resmat)<-dimnames(cormat)
for(j in 1:ndiags){
resmat[row(resmat) == col(resmat) + j] <-
cormat[row(cormat) == col(cormat) + j]
}
return(resmat)
}
I realise that this is a very "un-R" way to go about solving this problem. Is there a better way to do it, probably using diag or lower.tri?
size <- 6
mat <- matrix(seq_len(size ^ 2), ncol = size)
low <- 0
high <- 3
delta <- rep(seq_len(ncol(mat)), nrow(mat)) -
rep(seq_len(nrow(mat)), each = ncol(mat))
#or Ben Bolker's better alternative
delta <- row(mat) - col(mat)
mat[delta < low | delta > high] <- NA
mat
This works with 5000 x 5000 matrices on my machine.
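If you'd rather have this wrapped up as a function like the question's getDiags(), a sketch along the same lines (the function and argument names here are my own) could be:
offDiagSlice <- function(mat, low = 0, high = 3) {
  delta <- row(mat) - col(mat)           # 0 on the diagonal, positive below it
  mat[delta < low | delta > high] <- NA  # blank out everything outside the band
  mat
}
offDiagSlice(matrix(1:6, ncol = 6, nrow = 6, byrow = TRUE), 0, 3)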
If you want to use upper.tri and lower.tri you could write functions like these:
cormat <- mapply(rep, 1:6, 6)
u.diags <- function(X, n) {
X[n:nrow(X),][lower.tri(X[n:nrow(X),])] <- NA
return(X)
}
or
l.diags <- function(X, n) {
X[,n:ncol(X)][upper.tri(X[,n:ncol(X)])] <- NA
return(X)
}
or
n.diags <- function(X, n.u, n.l) {
X[n.u:nrow(X),][lower.tri(X[n.u:nrow(X),])] <- NA
X[,n.l:ncol(X)][upper.tri(X[,n.l:ncol(X)])] <- NA
return(X)
}
l.diags(cormat, 3)
u.diags(cormat, 3)
n.diags(cormat, 3, 1)
You can do:
matrix:
m <- matrix(1:6, ncol = 6, nrow = 6, byrow = TRUE)
function:
n_diag <- function (x, n) {
d <- dim(x)
ndiag <- .row(d) - n >= .col(d)
x[upper.tri(x) | ndiag] <- NA
return(x)
}
call:
n_diag(m,3)
# [,1] [,2] [,3] [,4] [,5] [,6]
#[1,] 1 NA NA NA NA NA
#[2,] 1 2 NA NA NA NA
#[3,] 1 2 3 NA NA NA
#[4,] NA 2 3 4 NA NA
#[5,] NA NA 3 4 5 NA
#[6,] NA NA NA 4 5 6
just for fun:
#lapply(1:6, n_diag, x = m)
