Creating an edgelist from Patent data in R - r

I am trying to create an edgelist out of patent data of the form:
PatentID InventorIDs CoinventorIDs
1 A ; B C,D,E ; F,G,H,C
2 J ; K ; L M,O ; N ; P, Q
What I would like is the edgelist below showing the connections between inventors and patents. (the semicolons separate the coinventors associated with each primary inventor):
1 A B
1 A C
1 A D
1 A E
1 B F
1 B G
1 B H
1 B C
2 J K
2 J L
2 J M
2 J O
2 K N
2 L P
2 L Q
Is there an easy way to do this with igraph in R?

I'm confused by the edges going between the inventorIds. But, here is a kind of brute force function that you could just apply by row. There may be a way with igraph, it being a massive library, that is better, but once you have the data in an this form it should be simple to convert to an igraph data structure.
Note that this leaves out the edges between primary inventors.
## A function to make the edges for each row
rowFunc <- function(row) {
tmp <- lapply(row[2:3], strsplit, '\\s*;\\s*')
tmp2 <- lapply(tmp[[2]], strsplit, ',')
do.call(rbind, mapply(cbind, row[[1]], unlist(tmp[[1]]), unlist(tmp2, recursive=FALSE)))
}
## Apply the function by row
do.call(rbind, apply(dat, 1, rowFunc))
# [,1] [,2] [,3]
# [1,] "1" "A" "C"
# [2,] "1" "A" "D"
# [3,] "1" "A" "E"
# [4,] "1" "B" "F"
# [5,] "1" "B" "G"
# [6,] "1" "B" "H"
# [7,] "1" "B" "C"
# [8,] "2" "J" "M"
# [9,] "2" "J" "O"
# [10,] "2" "K" "N"
# [11,] "2" "L" "P"
# [12,] "2" "L" " Q"

Related

R - Expand Grid Without Duplicates

I need a function similar to expand.grid but without the combinations of duplicate elements.
Here is a simplified version of my problem.
X1 = c("x","y","z")
X2 = c("A","B","C")
X3 = c("y","C","G")
d <- expand.grid(X1,X2,X3)
d
Var1 Var2 Var3
1 x A y
2 y A y
3 z A y
4 x B y
. . . .
. . . .
. . . .
23 y B G
24 z B G
25 x C G
26 y C G
27 z C G
d has 27 rows. But 6 of these contain duplicate values which I do not need Rows: 2, 5, 8, 16, 17 & 18
Is there a way to get the other 21 rows which does not contain any duplicates.
Note that vectors have more than 3 elements (c("x","y","z","k","m"...), up to 50) and number of vectors is more than 3 in the real case. (X4, X5, X6... up to 11 ). Because of this expanded object is getting real large and RAM cannot handle it.
In RcppAlgos*, there is a function called comboGrid that does the trick:
library(RcppAlgos) ## as of v2.4.3
comboGrid(X1, X2, X3, repetition = F)
# Var1 Var2 Var3
# [1,] "x" "A" "C"
# [2,] "x" "A" "G"
# [3,] "x" "A" "y"
# [4,] "x" "B" "C"
# [5,] "x" "B" "G"
# [6,] "x" "B" "y"
# [7,] "x" "C" "G"
# [8,] "x" "C" "y"
# [9,] "y" "A" "C"
# [10,] "y" "A" "G"
# [11,] "y" "B" "C"
# [12,] "y" "B" "G"
# [13,] "y" "C" "G"
# [14,] "z" "A" "C"
# [15,] "z" "A" "G"
# [16,] "z" "A" "y"
# [17,] "z" "B" "C"
# [18,] "z" "B" "G"
# [19,] "z" "B" "y"
# [20,] "z" "C" "G"
# [21,] "z" "C" "y"
Large Test
set.seed(42)
rnd_lst <- lapply(1:11, function(x) {
sort(sample(LETTERS, sample(26, 1)))
})
## Number of results that expand.grid would return if your machine
## had enough memory... over 300 trillion!!!
prettyNum(prod(lengths(rnd_lst)), big.mark = ",")
# [1] "365,634,846,720"
exp_grd_test <- expand.grid(rnd_lst)
# Error: vector memory exhausted (limit reached?)
system.time(cmb_grd_test <- comboGrid(rnd_lst, repetition=FALSE))
# user system elapsed
# 9.866 0.330 10.196
dim(cmb_grd_test)
# [1] 3036012 11
head(cmb_grd_test)
# Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 Var11
# [1,] "A" "E" "C" "B" "D" "G" "F" "H" "J" "I" "K"
# [2,] "A" "E" "C" "B" "D" "G" "F" "H" "J" "I" "L"
# [3,] "A" "E" "C" "B" "D" "G" "F" "H" "J" "I" "M"
# [4,] "A" "E" "C" "B" "D" "G" "F" "H" "J" "I" "N"
# [5,] "A" "E" "C" "B" "D" "G" "F" "H" "J" "I" "O"
# [6,] "A" "E" "C" "B" "D" "G" "F" "H" "J" "I" "P"
* I am the author of RcppAlgos
(Sorry, I just realized that your problem is as much a size problem, so removing them post-generation may not be feasible. For that, this may not be the best answer, but I'll keep it around for smaller-and-related questions.)
base R
I hard-code "3", but you can use ncol(d) and/or ncol(d)-1 for programmatic use.
d[lengths(apply(d, 1, unique)) > 2, ]
# Var1 Var2 Var3
# 1 x A y
# 3 z A y
# 4 x B y
# 6 z B y
# 7 x C y
# 9 z C y
# 10 x A C
# 11 y A C
# 12 z A C
# 13 x B C
# 14 y B C
# 15 z B C
# 19 x A G
# 20 y A G
# 21 z A G
# 22 x B G
# 23 y B G
# 24 z B G
# 25 x C G
# 26 y C G
# 27 z C G
(The row names are not reset, you can see the gaps to verify it is not 27 rows.)
And to verify, here are the rows with dupes:
d[lengths(apply(d, 1, unique)) < 3, ]
# Var1 Var2 Var3
# 2 y A y
# 5 y B y
# 8 y C y
# 16 x C C
# 17 y C C
# 18 z C C

Split string with n repetitive elements into n sub-strings

I have a string that is a concatenation of m possible types of elements - for the sake of simplicity m = 4 with A, B, C and D.
Whenever there are single elements more than once, I would have to split the string so that there are no repetitions left. However, I would like to generate all possible strings without repetitions.
To make this a little bit clearer, here is an example:
For A B A C D
String: A B C D
String: B A C D
This gets more complicated when there are several different elements that show up more than once:
For A B A C B D
String: A B C D
String: A C B D
String: B A C D
String: A C B D
Is there a smart way to compute this in R?
vec <- c("A","B","A","C","B","D")
combs <- lapply(setNames(nm = unique(vec)), function(a) which(vec == a))
eg <- do.call(expand.grid, combs)
out <- t(apply(eg, 1, function(r) names(eg)[order(r)]))
# [,1] [,2] [,3] [,4]
# [1,] "A" "B" "C" "D"
# [2,] "B" "A" "C" "D"
# [3,] "A" "C" "B" "D"
# [4,] "A" "C" "B" "D"
out
First vector:
vec <- c("A","B","A","C","D")
# ...
# [,1] [,2] [,3] [,4]
# [1,] "A" "B" "C" "D"
# [2,] "B" "A" "C" "D"
If you are starting and ending with strings vice vectors, then know that you can wrap the above with:
strsplit("ABACBD", "")[[1]]
# [1] "A" "B" "A" "C" "B" "D"
apply(out, 1, paste, collapse = "")
# [1] "ABCD" "BACD" "ACBD" "ACBD"

R function for manipulative field experiment design - Plant ecology

I am a plant ecologist and I would like to design a field manipulative experiment. To achieve a good degree of randomisation in the disposition of plants inside experimental plots, I would like to use R (with which I'm familiar).
The picture is a schematisation of the design: in total there would be 6 plots, each containing 24 different plant species. In total I have 36 plant species, from which I would like to randomly sample 24 species per plot (with high variation between plots). Each species should be present in 4 out of the 6 plots but in different positions, twice along the border (in two of the warm colours) and twice in the core area (once per cold colour); in the picture I visually explain what I mean using 4 different species (A, B, C, D).
Could anyone suggest a way to do it or some insight to write the function?
I borrowed some insight from this thread and this thread. Basically, you make a circular matrix and shuffle groups.
coords <- list(c(1,1),c(1,2),c(2,1),c(3,1),
c(1,3),c(1,4),c(2,4),c(3,4),
c(2,2),c(3,2),c(2,3),c(3,3),
c(4,1),c(5,1),c(6,1),c(6,2),
c(4,2),c(5,2),c(4,3),c(5,3),
c(4,4),c(5,4),c(6,3),c(6,4))
Matrix <- matrix(c(LETTERS,0:9)[1:36][matrix(1:36,36+1,36+1,byrow=T)[c(1,36:2),1:36]],36,36)
PlotLayouts <- Matrix[(1:6*6),1:24][,unlist(lapply(split(1:24,rep(1:6,each=4)),sample,4))]
PlotLayouts <- split(PlotLayouts,sample(1:6,6))
Result <- lapply(PlotLayouts,function(Vector){
Layout <- matrix(NA,nrow=6,ncol=4)
for(i in 1:24){
Layout[coords[[i]][1],coords[[i]][2]] <- Vector[i]
}
Layout
})
#Species Counts
table(unlist(Result))
0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
Result
$`1`
[,1] [,2] [,3] [,4]
[1,] "I" "H" "M" "L"
[2,] "K" "R" "S" "N"
[3,] "J" "P" "Q" "O"
[4,] "U" "Z" "0" "2"
[5,] "V" "Y" "X" "1"
[6,] "T" "W" "4" "3"
$`2`
[,1] [,2] [,3] [,4]
[1,] "0" "Z" "4" "3"
[2,] "2" "9" "A" "5"
[3,] "1" "7" "8" "6"
[4,] "C" "H" "I" "K"
[5,] "D" "G" "F" "J"
[6,] "B" "E" "M" "L"
...

sample unique pairs from two vectors

Given are two vectors, a and b
a = letters[1:6]
b = letters[7:11]
The goal is to sample a two column matrix using a and b. The first column should contain elements from a such that each element of a is repeated two times. The second column should contain elements from b such that each element of b is also repeated at least two times. One more condition is that the pairs have to be unique.
I have figured out how to sample the 12 pairs but have not figured out how I can ensure they will always be unique. For example, in the solution presented below, row 3 and row 11 are the same.
The desired output should have no duplicate rows.
set.seed(42)
m = cbind(sample(c(a, a)), sample(c(b, b, sample(b, 2, replace = TRUE))))
m
# [,1] [,2]
# [1,] "e" "g"
# [2,] "f" "k"
# [3,] "c" "k"
# [4,] "b" "h"
# [5,] "f" "j"
# [6,] "d" "i"
# [7,] "e" "h"
# [8,] "a" "g"
# [9,] "d" "h"
#[10,] "a" "i"
#[11,] "c" "k"
#[12,] "b" "j"
You can make it a function and throw replace in there, i.e.
f1 <- function(a, b){
m <- cbind(sample(c(a, a)), sample(c(b, b, sample(b, 2, replace = TRUE))))
m[,2] <-replace(m[,2], duplicated(m), sample(b[!b %in% m[duplicated(m),2]], 1))
return(m)
}
#which seems stable
sum(duplicated(f1(a, b)))
#[1] 0
sum(duplicated(f1(a, b)))
#[1] 0
sum(duplicated(f1(a, b)))
#[1] 0
sum(duplicated(f1(a, b)))
#[1] 0
Another way that doesn't require replacement
m = rbind(
c(1,1,0,0,0),
c(1,1,0,0,0),
c(0,0,1,1,0),
c(0,0,1,1,0),
c(0,0,0,0,1),
c(0,0,0,0,1)
)
# One "free" selection in each of the last two rows
m[5, sample(4,1)] = 1
m[6, sample(4,1)] = 1
# Scramble it while preserving row/column sums
m = m[sample(6), sample(5)]
> as.matrix(expand.grid(a=a,b=b))[as.logical(m),]
# a b
# [1,] "a" "g"
# [2,] "b" "g"
# [3,] "e" "g"
# [4,] "c" "h"
# [5,] "d" "h"
# [6,] "f" "h"
# [7,] "d" "i"
# [8,] "f" "i"
# [9,] "b" "j"
#[10,] "c" "j"
#[11,] "a" "k"
#[12,] "e" "k"
Definitely not elegant, but would work.
a = letters[1:6]
b = letters[7:11]
asamp <- sample(c(a,a))
finished <- F
while(!finished) {
bsamp <- sample(c(b, b, sample(b, 2, replace = TRUE)))
if(length(unique(paste(asamp,bsamp)))==12) finished <- T
}
cbind(asamp,bsamp)

R: transposing and splitting a row with a delimiter.

I have a table
rawData <- as.data.frame(matrix(c(1,2,3,4,5,6,"a,b,c","d,e","f"),nrow=3,ncol=3))
1 4 a,b,c
2 5 d,e
3 6 f
I would like to convert to
1 2 3
4 5 6
a d f
b e
c
so far I can transpose and split the third column, however, I'm lost as to how to reconstruct a new table with the format outline above?
new = t(rawData)
for (e in 1:ncol(new)){
s<-strsplit(new[3:3,e], split=",")
print(s)
}
I tried creating new vectors for each iteration but I'm not sure how to efficiently put each one back into a dataframe. Would be grateful for any help. thanks!
You can use stri_list2matrix from the stringi package:
library(stringi)
rawData <- as.data.frame(matrix(c(1,2,3,4,5,6,"a,b,c","d,e","f"),nrow=3,ncol=3),stringsAsFactors = F)
d1 <- t(rawData[,1:2])
rownames(d1) <- NULL
d2 <- stri_list2matrix(strsplit(rawData$V3,split=','))
rbind(d1,d2)
# [,1] [,2] [,3]
# [1,] "1" "2" "3"
# [2,] "4" "5" "6"
# [3,] "a" "d" "f"
# [4,] "b" "e" NA
# [5,] "c" NA NA
You can also use cSplit from my "splitstackshape" package.
By default, it just creates additional columns after splitting the input:
library(splitstackshape)
cSplit(rawData, "V3")
# V1 V2 V3_1 V3_2 V3_3
# 1: 1 4 a b c
# 2: 2 5 d e NA
# 3: 3 6 f NA NA
You can just transpose that to get your desired output.
t(cSplit(rawData, "V3"))
# [,1] [,2] [,3]
# V1 "1" "2" "3"
# V2 "4" "5" "6"
# V3_1 "a" "d" "f"
# V3_2 "b" "e" NA
# V3_3 "c" NA NA

Resources