Split string with n repetitive elements into n sub-strings - r

I have a string that is a concatenation of m possible types of elements - for the sake of simplicity m = 4 with A, B, C and D.
Whenever there are single elements more than once, I would have to split the string so that there are no repetitions left. However, I would like to generate all possible strings without repetitions.
To make this a little bit clearer, here is an example:
For A B A C D
String: A B C D
String: B A C D
This gets more complicated when there are several different elements that show up more than once:
For A B A C B D
String: A B C D
String: A C B D
String: B A C D
String: A C B D
Is there a smart way to compute this in R?

vec <- c("A","B","A","C","B","D")
combs <- lapply(setNames(nm = unique(vec)), function(a) which(vec == a))
eg <- do.call(expand.grid, combs)
out <- t(apply(eg, 1, function(r) names(eg)[order(r)]))
# [,1] [,2] [,3] [,4]
# [1,] "A" "B" "C" "D"
# [2,] "B" "A" "C" "D"
# [3,] "A" "C" "B" "D"
# [4,] "A" "C" "B" "D"
out
First vector:
vec <- c("A","B","A","C","D")
# ...
# [,1] [,2] [,3] [,4]
# [1,] "A" "B" "C" "D"
# [2,] "B" "A" "C" "D"
If you are starting and ending with strings vice vectors, then know that you can wrap the above with:
strsplit("ABACBD", "")[[1]]
# [1] "A" "B" "A" "C" "B" "D"
apply(out, 1, paste, collapse = "")
# [1] "ABCD" "BACD" "ACBD" "ACBD"

Related

I want to check if a string like A B C D exists in each cell of its corresponding row or not. If it does not exists then NA should be returned

I have put the data and output here. In first row if anything is not A,B,C or D then it should return NA , in second row if anything is not A,C,B or E then return NA
Here is a example showing one option to make it
> t(mapply(function(a, b) b[match(a, b)], asplit(x, 1), strsplit(y, "")))
[,1] [,2] [,3] [,4]
[1,] NA "B" "C" "A"
[2,] NA "B" "C" NA
Data
> x <- rbind(c("E", "B", "C", "A"), c("S", "B", "C", "D"))
> y <- c("ABCD", "ACBE")
> x
[,1] [,2] [,3] [,4]
[1,] "E" "B" "C" "A"
[2,] "S" "B" "C" "D"
> y
[1] "ABCD" "ACBE"

How to perform a check on a permutation "on-the-fly" without storing the result in R

Assume we have the following permutations of the letters, "a", "b", and "c":
library(combinat)
do.call(rbind, permn(letters[1:3]))
# [,1] [,2] [,3]
# [1,] "a" "b" "c"
# [2,] "a" "c" "b"
# [3,] "c" "a" "b"
# [4,] "c" "b" "a"
# [5,] "b" "c" "a"
# [6,] "b" "a" "c"
Is it possible to perform some function on a given permutation "on-the-fly" (i.e., a particular row) without storing the result?
That is, if the row == "a" "c" "b" or row == "b" "c" "a", do not store the result. The desired result in this case would be:
# [,1] [,2] [,3]
# [1,] "a" "b" "c"
# [2,] "c" "a" "b"
# [3,] "c" "b" "a"
# [4,] "b" "a" "c"
I know I can apply a function to all the permutations on the fly within combinat::permn with the fun argument such as:
permn(letters[1:3], fun = function(x) {
res <- paste0(x, collapse = "")
if (res == "acb" | res == "bca") {
return(NA)
} else {
return(res)
}
})
But this stills stores an NA and the returned list has 6 elements instead of the desired 4 elements:
# [[1]]
# [1] "abc"
#
# [[2]]
# [1] NA
#
# [[3]]
# [1] "cab"
#
# [[4]]
# [1] "cba"
#
# [[5]]
# [1] NA
#
# [[6]]
# [1] "bac"
Note, I am not interested in subsequently removing the NA values; I am specifically interested in not appending to the result list "on-the-fly" for a given permutation.
We could use a magrittr pipeline where we rbind the input matrix to the Rows to be checked and omit the duplicate rows.
library(combinat)
library(magrittr)
Rows <- rbind(c("a", "c", "b"), c("b", "c", "a"))
do.call(rbind, permn(letters[1:3])) %>%
subset(tail(!duplicated(rbind(Rows, .)), -nrow(Rows)))
giving:
[,1] [,2] [,3]
[1,] "a" "b" "c"
[2,] "c" "a" "b"
[3,] "c" "b" "a"
[4,] "b" "a" "c"
You can return NULL for the particular condition that you want to ignore and rbind the result which will ignore the NULL elements and bind only the combinations that you need.
do.call(rbind, combinat::permn(letters[1:3], function(x)
if(!all(x == c("a", "c", "b") | x == c("b", "c", "a")))
return(x)
))
# [,1] [,2] [,3]
#[1,] "a" "b" "c"
#[2,] "c" "a" "b"
#[3,] "c" "b" "a"
#[4,] "b" "a" "c"
Similarly,
do.call(rbind, permn(letters[1:3],function(x) {
res <- paste0(x, collapse = "")
if (!res %in% c("acb","bca"))
return(res)
}))
# [,1]
#[1,] "abc"
#[2,] "cab"
#[3,] "cba"
#[4,] "bac"

sample unique pairs from two vectors

Given are two vectors, a and b
a = letters[1:6]
b = letters[7:11]
The goal is to sample a two column matrix using a and b. The first column should contain elements from a such that each element of a is repeated two times. The second column should contain elements from b such that each element of b is also repeated at least two times. One more condition is that the pairs have to be unique.
I have figured out how to sample the 12 pairs but have not figured out how I can ensure they will always be unique. For example, in the solution presented below, row 3 and row 11 are the same.
The desired output should have no duplicate rows.
set.seed(42)
m = cbind(sample(c(a, a)), sample(c(b, b, sample(b, 2, replace = TRUE))))
m
# [,1] [,2]
# [1,] "e" "g"
# [2,] "f" "k"
# [3,] "c" "k"
# [4,] "b" "h"
# [5,] "f" "j"
# [6,] "d" "i"
# [7,] "e" "h"
# [8,] "a" "g"
# [9,] "d" "h"
#[10,] "a" "i"
#[11,] "c" "k"
#[12,] "b" "j"
You can make it a function and throw replace in there, i.e.
f1 <- function(a, b){
m <- cbind(sample(c(a, a)), sample(c(b, b, sample(b, 2, replace = TRUE))))
m[,2] <-replace(m[,2], duplicated(m), sample(b[!b %in% m[duplicated(m),2]], 1))
return(m)
}
#which seems stable
sum(duplicated(f1(a, b)))
#[1] 0
sum(duplicated(f1(a, b)))
#[1] 0
sum(duplicated(f1(a, b)))
#[1] 0
sum(duplicated(f1(a, b)))
#[1] 0
Another way that doesn't require replacement
m = rbind(
c(1,1,0,0,0),
c(1,1,0,0,0),
c(0,0,1,1,0),
c(0,0,1,1,0),
c(0,0,0,0,1),
c(0,0,0,0,1)
)
# One "free" selection in each of the last two rows
m[5, sample(4,1)] = 1
m[6, sample(4,1)] = 1
# Scramble it while preserving row/column sums
m = m[sample(6), sample(5)]
> as.matrix(expand.grid(a=a,b=b))[as.logical(m),]
# a b
# [1,] "a" "g"
# [2,] "b" "g"
# [3,] "e" "g"
# [4,] "c" "h"
# [5,] "d" "h"
# [6,] "f" "h"
# [7,] "d" "i"
# [8,] "f" "i"
# [9,] "b" "j"
#[10,] "c" "j"
#[11,] "a" "k"
#[12,] "e" "k"
Definitely not elegant, but would work.
a = letters[1:6]
b = letters[7:11]
asamp <- sample(c(a,a))
finished <- F
while(!finished) {
bsamp <- sample(c(b, b, sample(b, 2, replace = TRUE)))
if(length(unique(paste(asamp,bsamp)))==12) finished <- T
}
cbind(asamp,bsamp)

Creating an edgelist from Patent data in R

I am trying to create an edgelist out of patent data of the form:
PatentID InventorIDs CoinventorIDs
1 A ; B C,D,E ; F,G,H,C
2 J ; K ; L M,O ; N ; P, Q
What I would like is the edgelist below showing the connections between inventors and patents. (the semicolons separate the coinventors associated with each primary inventor):
1 A B
1 A C
1 A D
1 A E
1 B F
1 B G
1 B H
1 B C
2 J K
2 J L
2 J M
2 J O
2 K N
2 L P
2 L Q
Is there an easy way to do this with igraph in R?
I'm confused by the edges going between the inventorIds. But, here is a kind of brute force function that you could just apply by row. There may be a way with igraph, it being a massive library, that is better, but once you have the data in an this form it should be simple to convert to an igraph data structure.
Note that this leaves out the edges between primary inventors.
## A function to make the edges for each row
rowFunc <- function(row) {
tmp <- lapply(row[2:3], strsplit, '\\s*;\\s*')
tmp2 <- lapply(tmp[[2]], strsplit, ',')
do.call(rbind, mapply(cbind, row[[1]], unlist(tmp[[1]]), unlist(tmp2, recursive=FALSE)))
}
## Apply the function by row
do.call(rbind, apply(dat, 1, rowFunc))
# [,1] [,2] [,3]
# [1,] "1" "A" "C"
# [2,] "1" "A" "D"
# [3,] "1" "A" "E"
# [4,] "1" "B" "F"
# [5,] "1" "B" "G"
# [6,] "1" "B" "H"
# [7,] "1" "B" "C"
# [8,] "2" "J" "M"
# [9,] "2" "J" "O"
# [10,] "2" "K" "N"
# [11,] "2" "L" "P"
# [12,] "2" "L" " Q"

Combination without repetition in R

I am trying to get all the possible combinations of length 3 of the elements of a variable. Although it partly worked with combn() I did not quite get the output I was looking for. Here's my example
x <- c("a","b","c","d","e")
t(combn(c(x,x), 3))
The output I get looks like this
[,1] [,2] [,3]
[1,] "a" "b" "c"
[2,] "a" "b" "d"
[3,] "a" "b" "e"
I am not really happy with this command for 2 reasons. I wanted to get an output that says "a+b+c" "a+b+b"...., unfortunately I wasn't able to edit the output with paste() or something.
I was also looking forward for one combination of each set of letters, that is I either get "a+b+c" or "b+a+c" but not both.
Try something like:
x <- c("a","b","c","d","e")
d1 <- combn(x,3) # All combinations
d1
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# [1,] "a" "a" "a" "a" "a" "a" "b" "b" "b" "c"
# [2,] "b" "b" "b" "c" "c" "d" "c" "c" "d" "d"
# [3,] "c" "d" "e" "d" "e" "e" "d" "e" "e" "e"
nrow(unique(t(d1))) == nrow(t(d1))
# [1] TRUE
d2 <- expand.grid(x,x,x) # All permutations
d2
# Var1 Var2 Var3
# 1 a a a
# 2 b a a
# 3 c a a
# 4 d a a
# 5 e a a
# 6 a b a
# 7 b b a
# 8 c b a
# 9 d b a
# ...
nrow(unique(d2)) == nrow(d2)
# [1] TRUE
try this
x <- c("a","b","c","d","e")
expand.grid(rep(list(x), 3))

Resources