Generate all combinations given a constraint - r

How can I generate all of the 6 combinations of 2 treatments (A,B) in blocks of 4, such that in each block there is an equal number of A's and B's, using R?
"AABB","ABAB","ABBA","BBAA","BABA","BAAB"
P.S. The number of combinations is calculated as follows:
If
T = #treatments
n = #treatments in each block = k*T,
The number of combinations equals n! / [k!*k! (T times)]
Thank you

Something like this should work:
library(gtools)
t <- c('A','B')
k <- 2
n <- k * length(t)
t2 <- rep(t, k)
m <- permutations(n,n)
res <- unique(apply(m,MARGIN=1,function(x) paste(t2[x],collapse='')))
--------------------------------------------------------------------
res
[1] "ABAB" "ABBA" "AABB" "BAAB" "BABA" "BBAA"

The multicool package implements an algorithm for permuting multisets --- exactly the task you want to have performed. Here's an example of what it can do:
library(multicool)
# Create a simple convenience function
enumAllPartitions <- function(multiset) {
m1 <- initMC(multiset) # Initialize the permutation object
N <- fact(length(multiset))/ # Calculate number of permutations
prod(fact(table(multiset)))
sapply(seq_len(N), function(X) paste(nextPerm(m1), collapse=""))
}
# Try it out with a few different multisets
x <- c("A", "A", "B", "B")
y <- c("G", "L", "L", "L")
z <- c("X", "X", "Y", "Z", "Z")
lapply(list(x,y,z), enumAllPartitions)
[[1]]
[1] "BBAA" "ABBA" "BABA" "ABAB" "AABB" "BAAB"
[[2]]
[1] "LLLG" "GLLL" "LGLL" "LLGL"
[[3]]
[1] "ZZYXX" "XZZYX" "ZXZYX" "ZZXYX" "XZZXY" "ZXZXY" "XZXZY" "XXZZY" "ZXXZY"
[10] "ZZXXY" "YZZXX" "ZYZXX" "XZYZX" "ZXYZX" "YZXZX" "XYZZX" "YXZZX" "ZYXZX"
[19] "XZYXZ" "ZXYXZ" "XZXYZ" "XXZYZ" "ZXXYZ" "YZXXZ" "XYZXZ" "YXZXZ" "XYXZZ"
[28] "XXYZZ" "YXXZZ" "ZYXXZ"

The expected solution can also be achieved using the new iterpc package.
I <- iterpc(c(2, 2), labels=c("A", "B"), ordered=TRUE)
getall(I)
# [,1] [,2] [,3] [,4]
# [1,] "A" "A" "B" "B"
# [2,] "A" "B" "A" "B"
# [3,] "A" "B" "B" "A"
# [4,] "B" "A" "A" "B"
# [5,] "B" "A" "B" "A"
# [6,] "B" "B" "A" "A"

Related

Using mvrnorm from MASS package

I need to generate a random sample with a multivariate normal distribution using seed(12346) with 100 columns and 5000 rows.
So far I have got this:
set.seed(12346)
Preg1 <- data.frame(MASS::mvrnorm(n=5000,mu=c(0,0,0),Sigma = diag(3)))
The above gives me three columns, how can I get 100?
I cannot figure out how to get the vector of mu with 100 zeros without typing them in and the Sigma would then be Sigma = diag(100)
You can use mu = rep(0, 100). The rep function is used to repeat values.
set.seed(12346)
ncol = 100
Preg1<-data.frame(mvrnorm(n = 5000, mu = rep(0, ncol), Sigma = diag(ncol)))
dim(Preg1)
# [1] 5000 100
The rep function is quite useful, it can be used in various ways that aren't applicable here but are good to know about:
rep(c("A", "B", "C"), times = 3)
# [1] "A" "B" "C" "A" "B" "C" "A" "B" "C"
rep(c("A", "B", "C"), times = 1:3)
# [1] "A" "B" "B" "C" "C" "C"
rep(c("A", "B", "C"), each = 3)
# [1] "A" "A" "A" "B" "B" "B" "C" "C" "C"
In this particular case, because your Sigma is an identity matrix, each column is actually independent. So it would be equivalent to generate each column (or even each draw) independently, which we could do either of these ways:
x = replicate(n = ncol, rnorm(5000))
dim(x)
# [1] 5000 100
z = matrix(rnorm(5000 * ncol), ncol = ncol)
dim(z)
# [1] 5000 100

Calculate probability of observing sequence using markovchain package

Let's use the dataset from this question:
dat<-data.frame(replicate(20,sample(c("A", "B", "C","D"), size = 100, replace=TRUE)))
Then we can build the transition matrix and the markov chain:
# Build transition matrix
trans.matrix <- function(X, prob=T)
{
tt <- table( c(X[,-ncol(X)]), c(X[,-1]) )
if(prob) tt <- tt / rowSums(tt)
tt
}
trans.mat <- trans.matrix(as.matrix(dat))
attributes(trans.mat)$class <- 'matrix'
# Build markovchain
library(markovchain)
chain <- new('markovchain', transitionMatrix = trans.mat)
If I now encounter a new sequence, let's say AAABCAD can I then calculate the probability of observing this sequence given this markovchain?
I cannot see a function in markovchain exactly for that, but it can be easily done manually too. There's one caveat though: the transition matrix does not provide the probability of observing the first A, which needs to be provided by you. Let it be 0.25, as it would be if all four states were equally likely (which is true in your example).
Then the transitions in the observed chain can be obtained with
cbind(head(obs, -1), obs[-1])
# [,1] [,2]
# [1,] "A" "A"
# [2,] "A" "A"
# [3,] "A" "B"
# [4,] "B" "C"
# [5,] "C" "A"
# [6,] "A" "D"
Probabilities for each of those transitions then are
trans.mat[cbind(head(obs, -1), obs[-1])]
# [1] 0.2268722 0.2268722 0.2268722 0.2926316 0.2791165 0.2665198
and the final answer is 0.25 * (the product of the above vector):
0.25 * prod(trans.mat[cbind(head(obs, -1), obs[-1])])
# [1] 6.355069e-05
For comparison, we may estimate this probability by generating many chains of length 7:
dat <- replicate(2000000, paste(sample(c("A", "B", "C", "D"), size = 7, replace = TRUE), collapse = ""))
mean(dat == "AAABCAD")
# [1] 6.55e-05
Looks close enough!

R: How to extract all labels in a certain node of a dendrogram

I am writing a program that (as a part of it) automatically creates dendrograms from an input dataset.
For each node/split I want to extract all the labels that are under that node and the location of that node on the dendrogram plot (for further plotting purposes).
So, let's say my data looks like this:
> Ltrs <- data.frame("A" = c(3,1), "B" = c(1,1), "C" = c(2,4), "D" = c(6,6))
> dend <- as.dendrogram(hclust(dist(t(Ltrs))))
> plot(dend)
The dendrogram
Now I can extract the location of the splits/nodes:
> library(dendextend)
> nodes <- get_nodes_xy(dend)
> nodes <- nodes[nodes[,2] != 0, ]
> nodes
[,1] [,2]
[1,] 1.875 7.071068
[2,] 2.750 3.162278
[3,] 3.500 2.000000
Now I want to get all the labels under a node, for each node (/row from the 'nodes' variable).
This should look something like this:
$`1`
[1] "D" "C" "B" "A"
$`2`
[1] "C" "B" "A"
$`3 `
[1] "B" "A"
Can anybody help me out? Thanks in advance :)
How about something like this?
library(tidyverse)
library(dendextend)
Ltrs <- data.frame("A" = c(3,1), "B" = c(1,1), "C" = c(2,4), "D" = c(6,6))
dend <- as.dendrogram(hclust(dist(t(Ltrs))))
accumulator <- list();
myleaves <- function(anode){
if(!is.list(anode))return(attr(anode,"label"))
accumulator[[length(accumulator)+1]] <<- (reduce(lapply(anode,myleaves),c))
}
myleaves(dend);
ret <- rev(accumulator); #generation was depth first, so root was found last.
Better test this. I am not very trustworthy. In particular, I really hope the list ret is in an order that makes sense, otherwise it's going to be a pain associating the entries with the correct nodes! Good luck.
Function partition_leaves() extracts all leaf labels per each node and makes a list ordered in the same fashion as get_nodes_xy() output. With your example,
Ltrs <- data.frame("A" = c(3,1), "B" = c(1,1), "C" = c(2,4), "D" = c(6,6))
dend <- as.dendrogram(hclust(dist(t(Ltrs))))
plot(dend)
partition_leaves(dend)
yields:
[[1]]
[1] "D" "C" "A" "B"
[[2]]
[1] "D"
[[3]]
[1] "C" "A" "B"
[[4]]
[1] "C"
[[5]]
[1] "A" "B"
[[6]]
[1] "A"
[[7]]
[1] "B"
filtering list by vector length will give output similar to the desired one.

Extracting names of vector by time bin

I have written this loop to extract the names of each element of a vector that occurs within a time interval (bin). I was wondering if I am missing a faster way to do this... I want to implement a randomization aspect to vectors that are 1000s in length and as such do not want to rely on a loop.
mydata <- structure(c(1199.91666666667, 1200.5, 1204.63333333333, 1205.5,
1206.3, 1208.73333333333, 1209.06666666667, 1209.93333333333,
1210.98333333333, 1214.56666666667, 1216.06666666667, 1216.63333333333,
1216.91666666667, 1219.13333333333, 1221.35, 1221.51666666667,
1225.35, 1225.53333333333, 1225.96666666667, 1227.61666666667,
1228.91666666667, 1230.31666666667, 1233.53333333333, 1235.8,
1237.51666666667, 1239.41666666667, 1241.6, 1247.08333333333,
1247.45, 1252.7, 1253.26666666667), .Names = c("B", "A", "B",
"E", "A", "A", "B", "G", "G", "C", "A", "D", "E", "B", "B", "E",
"E", "G", "F", "A", "C", "A", "F", "B", "A", "F", "F", "G", "F",
"G", "F"))
mydata
B A B E A A B G G C A D E B B E E
1199.917 1200.500 1204.633 1205.500 1206.300 1208.733 1209.067 1209.933 1210.983 1214.567 1216.067 1216.633 1216.917 1219.133 1221.350 1221.517 1225.350
G F A C A F B A F F G F G F
1225.533 1225.967 1227.617 1228.917 1230.317 1233.533 1235.800 1237.517 1239.417 1241.600 1247.083 1247.450 1252.700 1253.267
These represent consecutive times in seconds of events. Say we want to make our intervals 5s long. My approach is to make a vector of the beginning of each interval and then use a loop to find the names of elements occurring within that interval:
N=5
ints <- seq(mydata[1], mydata[length(mydata)], N)
out<-list()
for(i in 1:length(ints)){
out[[i]] <- names(mydata[mydata>=ints[i] & mydata<ints[i]+N])
}
out
[[1]]
[1] "B" "A" "B"
[[2]]
[1] "E" "A" "A" "B"
[[3]]
[1] "G" "G" "C"
[[4]]
[1] "A" "D" "E" "B"
[[5]]
[1] "B" "E"
[[6]]
[1] "E" "G" "F" "A" "C"
[[7]]
[1] "A" "F"
[[8]]
[1] "B" "A" "F"
[[9]]
[1] "F"
[[10]]
[1] "G" "F"
[[11]]
[1] "G" "F"
This is fine for small samples - but I can see this would get slow when dealing with very large samples that are permuted 1000s of times.
My suggestion is to use findInterval (based on an answer to this earlier question of mine):
mydata2 = c(-Inf, mydata)
ints <- seq(mydata[1], mydata[length(mydata)]+5, N)
idx = findInterval(ints-1e-10, mydata2)
out<-list()
for(i in 1:(length(ints)-1)){
out[[i]] <- names(mydata2[(idx[i]+1):(idx[i+1])])
}
As you can see I have to do a little tinkering with the beginning (adding a first value that is smaller than the first breakpoint, adding an epsilon). Here's the result, it is identical to yours:
> out
[[1]]
[1] "B" "A" "B"
[[2]]
[1] "E" "A" "A" "B"
[[3]]
[1] "G" "G" "C"
[[4]]
[1] "A" "D" "E" "B"
[[5]]
[1] "B" "E"
[[6]]
[1] "E" "G" "F" "A" "C"
[[7]]
[1] "A" "F"
[[8]]
[1] "B" "A" "F"
[[9]]
[1] "F"
[[10]]
[1] "G" "F"
[[11]]
[1] "G" "F"
In terms of speed for the example there is some improvement:
> microbenchmark( jalapic = {out<-list(); for(i in 1:length(ints)){out[[i]] <- names(mydata[mydata>=ints[i] & mydata<ints[i]+N])}},
+ mts = {idx = findInterval(ints2-1e-10, mydata2); out<-list(); for(i in 1:(length(ints)-1)){out[[i]] <- names(mydata2[(idx[i]+1):(idx[i+1])])}},
+ alexis = {split(names(mydata), findInterval(mydata, ints))},
+ R_Yoda = {dt[, groups := cut2(data,ints)]; result <- dt[, paste0(names, collapse=", "), by=groups]})
Unit: microseconds
expr min lq mean median uq max neval
jalapic 67.177 76.9725 85.73347 82.8035 95.866 119.890 100
mts 43.851 52.7150 62.72116 58.3130 73.007 96.099 100
alexis 75.573 86.5360 95.72593 91.4340 100.531 234.649 100
R_Yoda 2032.066 2158.4870 2303.68887 2191.3750 2281.409 8719.314 100
For larger vectors (I chose length 2000) this is clearer:
set.seed(123)
mydata = sort(runif(n = 2000, min = 0, max = 100))
names(mydata) = sample(LETTERS[1:7], size = 2000, replace = T)
mydata2 = c(-Inf, mydata)
ints2 <- seq(mydata[1], mydata[length(mydata)]+5, N)
dt <- data.table(data=mydata, names=names(mydata) )
> microbenchmark( jalapic = {out<-list(); for(i in 1:length(ints)){out[[i]] <- names(mydata[mydata>=ints[i] & mydata<ints[i]+N])}},
+ mts = {idx = findInterval(ints2-1e-10, mydata2); out<-list(); for(i in 1:(length(ints)-1)){out[[i]] <- names(mydata2[(idx[i]+1):(idx[i+1])])}},
+ alexis = {split(names(mydata), findInterval(mydata, ints))},
+ R_Yoda = {dt[, groups := cut2(data,ints)]; result <- dt[, paste0(names, collapse=", "), by=groups]})
Unit: microseconds
expr min lq mean median uq max neval
jalapic 804.243 846.9275 993.9957 862.0890 883.3140 7140.218 100
mts 77.439 88.8685 100.6148 100.0640 106.5955 188.466 100
alexis 187.066 204.7930 220.1689 215.5225 225.3190 299.026 100
R_Yoda 3831.348 4066.4640 4366.5382 4140.1700 4248.8635 11829.923 100
For performance reasons I am using data.table:
Edit: This solution works, but is NOT very fast (as proved by the answer of mts)
library(Hmisc)
library(data.table)
# assuming that your mydata vector from the question is loaded
N=5 # code from your question...
ints <- seq(mydata[1], mydata[length(mydata)], N) # code from your question...
dt <- data.table(data=mydata, names=names(mydata) )
dt[, groups := cut2(data,ints)] # attention: shall the interval ends be included in the group or not?
groups <- dt[ , .(result=list(names)), by=groups] # the elements of a data.table can be a list itself!
# to get the result as list:
out <- groups[,result]
out
Edit: You could replace cut2 by findInterval and do it all in one line, but it is still slower:
out <- dt[, .(result=list(names)), by = findInterval(data,ints) ]
This is the result:
[[1]]
[1] "B" "A" "B"
[[2]]
[1] "E" "A" "A" "B"
[[3]]
[1] "G" "G" "C"
[[4]]
[1] "A" "D" "E" "B"
[[5]]
[1] "B" "E"
[[6]]
[1] "E" "G" "F" "A" "C"
[[7]]
[1] "A" "F"
[[8]]
[1] "B" "A" "F"
[[9]]
[1] "F"
[[10]]
[1] "G" "F"
[[11]]
[1] "G" "F"

Replace Items In Matrix Based on Value

I have a matrix that contains integer values that represent the index of the item in an array and I'd like to switch out item 1 for the values[1] and so on for each item in the values array.
Some code to demonstrate what I'd like
> m = matrix(1:3, ncol=3, nrow=3)
> m
[,1] [,2] [,3]
[1,] 1 1 1
[2,] 2 2 2
[3,] 3 3 3
> replace(m, 1="a", 2="b", 3="c")
> m
[,1] [,2] [,3]
[1,] "a" "a" "a"
[2,] "b" "b" "b"
[3,] "c" "c" "c"
Basically it takes 1 and turns it into "a" and so on. It seems like if I try to do this with a for loop it changes after the first iteration from int to string and since I'd like to do this with any object type that's not great behavior.
I can think of three possibilities to solve this
m <- matrix(1:3, 3, 3) # Your data
1
Either define a function that will get a vector in the correct matching order (the first entry will match the first unique value in m, etc.)
vec <- c("Ralf", "Jhons", "Pete")
Then you can define a simple function such as
Match_func <- function(x, y) "dim<-"(y[match(unique(x), seq_along(y))], dim(x))
Test
Match_func(m, vec)
# [,1] [,2] [,3]
# [1,] "Ralf" "Ralf" "Ralf"
# [2,] "Jhons" "Jhons" "Jhons"
# [3,] "Pete" "Pete" "Pete"
2
The second option will be to define your manual replace function, something like
Match_func2 <- function(x, ...) {
temp <- list(...)[[1]]
"dim<-"(temp[match(x, as.numeric(names(temp)))], dim(x))
}
Test
Match_func2(m, c("1" = "a", "2" = "b", "3" = "c"))
# [,1] [,2] [,3]
# [1,] "a" "a" "a"
# [2,] "b" "b" "b"
# [3,] "c" "c" "c"
3
You can also make a use of plyr::revalue
library(plyr)
Match_func3 <- function(x, ...) {
temp <- list(...)[[1]]
"dim<-"(revalue(as.character(x), temp), dim(x))
}
Test
Match_func3(m, c("1" = "a", "2" = "b", "3" = "c"))
# [,1] [,2] [,3]
# [1,] "a" "a" "a"
# [2,] "b" "b" "b"
# [3,] "c" "c" "c"
Note: The last approach is the safest in case you don't want to replace all the unique values
Here's an option, starting with a character matrix so that you don't need to worry about making a copy or coercion of the original matrix.
m = matrix(as.character(1:3), ncol=3, nrow=3)
old <- as.character(1:3)
new <- c("a", "b", "c")
for (i in 1:length(old)) {
m <- ifelse(m == old[i], new[i], m)
}

Resources