link records in r with least number of steps - r

I have a dataset with 2 columns like below.
INBOUND OUTBOUND
a1 a2
a1 a3
a1 a6
a2 a50
a4 a63
a8 a9
a10 a11
a50 a51
I wanted to give a similar id to those which are interlinked
Ex: a1 is linked to a2 , a2 is linked to a50, a50 to a51... So I would want to create a variable "id" and give same id to all these records.
I am right now doing a merge operation and marking these records. But since this is not an exhaustive search, I am able to mark only a1 --> a2, and a2 --> a50 with the same id.
So I am assuming there should be a better way to do this in one shot.
Help needed on this. Thanks
#
> dat <- read.csv("C:\\Users\\abc\\Documents\\Book1.csv", stringsAsFactors=FALSE, header = TRUE, sep = ",")
> dat
INBOUND OUTBOUND
1 a1 a2
2 a1 a3
3 a1 a6
4 a2 a50
5 a4 a63
6 a8 a9
7 a10 a11
8 a50 a51
> g <- graph.data.frame(dat)
> clusters(g)
$membership
[1]
1 2 3 4 5 6 1 1 1 2 3 4 5 6
$csize
[1]
4 2 2 2 2 2
$no
[1]
6
The clusters(g)$membership i got is this
1 2 3 4 5 6 1 1 1 2 3 4 5 6
And the one you got for the same data is
1 1 2 3 4 1 1 1 2 3 4 1

Edit: I added a much simpler answer using igraph below
Coincidentally, I happened to have a similar problem this week. I ended up using an algorithm from Numerical Recipes (section 8.6 on page 345) (the code in this edition contains some errors, by the way). However, the algorithm itself is written in C++, so I hope you have the tools installed to compile it.
The code is given below. Using the function equivalence on your dataset:
> relations <- "INBOUND OUTBOUND
+ a1 a2
+ a1 a3
+ a1 a6
+ a2 a50
+ a4 a63
+ a8 a9
+ a10 a11
+ a50 a51"
> relations <- read.table(textConnection(relations), stringsAsFactors=FALSE, header=TRUE)
>
> source("equivalence.R")
> objects <- unique(c(relations[[1]], relations[[2]]))
> groups <- equivalence(objects, relations)
>
> data.frame(objects, groups)
objects groups
1 a1 12
2 a2 12
3 a4 9
4 a8 10
5 a10 11
6 a50 12
7 a3 12
8 a6 12
9 a63 9
10 a9 10
11 a11 11
12 a51 12
equivalence.cpp
The c++ file with the algorithm
#include <R.h>
#include <Rinternals.h>
#include <string>
extern "C" {
/*
 * Group n objects into equivalence classes from a list of pairwise relations.
 *
 * ra, rb : integer vectors of equal length; pair l states that object a[l]
 *          and object b[l] belong to the same class. Indices are 0-based and
 *          must lie in [0, n).
 * rn     : integer vector whose first element is n, the number of objects.
 *
 * Returns an integer vector of length n. Element j holds the 0-based index of
 * the representative ("highest ancestor") of the class that object j is in,
 * so two objects are in the same class exactly when their entries are equal.
 *
 * Errors are raised with R's error(), which does not return. All checks run
 * before anything is allocated, and no C++ exceptions or objects with
 * destructors are used, so the non-local exit cannot skip any cleanup.
 */
SEXP equivalence(SEXP ra, SEXP rb, SEXP rn) {
  if (LENGTH(ra) != LENGTH(rb))
    error("Lengths of a and b do not match.");
  if (LENGTH(rn) < 1)
    error("n must contain at least one value.");

  int* a = INTEGER(ra);
  int* b = INTEGER(rb);
  int m = LENGTH(ra);
  int n = INTEGER(rn)[0];

  // A negative n (this also covers NA_INTEGER) cannot size the result.
  if (n < 0)
    error("n must be a non-negative integer.");

  // Every index is used to subscript cls[] below; reject anything outside
  // [0, n) up front instead of reading or writing out of bounds.
  for (int l = 0; l < m; l++) {
    if (a[l] < 0 || a[l] >= n || b[l] < 0 || b[l] >= n)
      error("Element index out of range.");
  }

  SEXP classes = PROTECT(allocVector(INTSXP, n));
  int* cls = INTEGER(classes);

  // Initialize each element as its own class.
  for (int k = 0; k < n; k++) cls[k] = k;

  // For each piece of input information...
  for (int l = 0; l < m; l++) {
    // Track first element up to its ancestor.
    int j = a[l];
    while (cls[j] != j) j = cls[j];
    // Track second element up to its ancestor.
    int k = b[l];
    while (cls[k] != k) k = cls[k];
    // If they are not already related, make them so.
    if (j != k) {
      cls[j] = k;
    }
  }

  // Final sweep: point every element directly at its highest ancestor.
  for (int j = 0; j < n; j++) {
    while (cls[j] != cls[cls[j]]) cls[j] = cls[cls[j]];
  }

  UNPROTECT(1);
  return classes;
}
}
equivalence.R
The code that loads the shared library (change extension from .so to .dll if you are working under windows)
dyn.load("equivalence.so")
# Assign an equivalence-class id to every element of `x`.
#
# x     : vector of objects (may contain duplicates and NA).
# rules : list / data frame whose first two components are parallel vectors;
#         each pair states that the two objects belong to the same class.
#
# Returns a vector as long as `x` with the 1-based class id of each element
# (NA where `x` is NA). Pairs that mention a value not present in `x` are
# dropped with a warning. The grouping itself is done by the compiled
# "equivalence" routine, which works on 0-based positions.
equivalence <- function(x, rules) {
  # Distinct, non-missing objects; their positions act as node ids.
  nodes <- unique(x)
  nodes <- nodes[!is.na(nodes)]

  from <- match(rules[[1]], nodes)
  to <- match(rules[[2]], nodes)

  # Keep only pairs where both ends were found among the objects.
  known <- !(is.na(from) | is.na(to))
  if (!all(known)) {
    warning("Not all values in rules are present in x.")
    from <- from[known]
    to <- to[known]
  }

  # Shift to 0-based ids for the compiled code, and back to 1-based after.
  classes <- .Call(
    "equivalence",
    as.integer(from) - 1L,
    as.integer(to) - 1L,
    as.integer(length(nodes))
  )
  classes[match(x, nodes)] + 1L
}
Using igraph
You can also use igraph, which is much simpler (should have thought of that before). The groups can be obtained using the clusters function and the corresponding nodes/vertices can be obtained using the V function:
> library(igraph)
> g <- graph.data.frame(relations)
> cl <- clusters(g)
> data.frame(object = V(g)$name, groups = cl$membership)
object groups
1 a1 1
2 a2 1
3 a4 2
4 a8 3
5 a10 4
6 a50 1
7 a3 1
8 a6 1
9 a63 2
10 a9 3
11 a11 4
12 a51 1

Related

Dependents and Precedents in R

Need help in flagging number of Dependents and Precedents in R. My data frame contains some formulas (strings) and I want to add "col3" which should contain: 0 for A1, 1 for A2 (Because A2 is dependent on A1 - One dependency) and 2 for A3 (Because A3 is dependent on A2/A1).
col1 <- c('A1','A2','A3', 'A6','A4','A7')
col2 <- c('X1+Y1','A1+Y2', 'A4+Y3+A2', 'Y5+A1','A2+A1+A3','A2+A1')
df <- data.frame(col1, col2, stringsAsFactors=F)
My Output should look like:
col1 col2 col3
1 A1 Y1 0
2 A2 A1+Y2 1
3 A3 A4+Y3+A2 5
4 A6 Y5+A1 1
5 A4 A2+A1+Y3 3
6 A7 A2+A1 3
I have a data frame with 100+ rows of this format. Appreciate if you could help with this.
Below code produces the correct output.
col0 <- c('A1','A2','A3', 'A6','A4','A7')
col2 <- c('X1+Y1','A1+Y2', 'A1+Y3+A2', 'Y5+A2','A2+A1+A3','A2+A3')
df <- data.frame(col0, col2, stringsAsFactors=F)
library(tidyr)
library(dplyr)
df1 <- df %>%
separate(col2, into = as.character(c(1:4)),sep = "\\+") %>%
replace(is.na(.),"")
df1$OOE <- 0
for (i in 1:nrow(df1)) {
for (j in 2:ncol(df1)) {
for (k in 1:nrow(df1)) {
if (df1[i,j] == df1$col0[k]) df1$OOE[i]=df1$OOE[k]+df1$OOE[i]+1
}
}
}
col0 1 2 3 4 OOE
1 A1 X1 Y1 0
2 A2 A1 Y2 1
3 A3 A1 Y3 A2 3
4 A6 Y5 A2 2
5 A4 A2 A1 A3 7
6 A7 A2 A3 6
If AX can have a dependency on AY where Y>X, we need a tree-like structure to find the dependencies. I knew about the igraph package, but it seems too complex for the task. We just need some reference semantics and, after some research, the data.tree package seems appropriate. Here is the code:
col1 <- c('A1','A2','A3', 'A6','A4','A7')
col2 <- c('X1+Y1','A1+Y2', 'A1+Y3+A2', 'Y5+A2','A2+A1+A3','A2+A3')
df <- data.frame(col1, col2, stringsAsFactors=F)
require(data.tree)
# Create the graph/forest based on the data
# Create the graph/forest based on the data.
#
# data : data frame with a character column `col1` (the name of each item)
#        and a character column `col2` (a formula string). Every token in
#        `col2` matching "A<digits>" is treated as a dependency of the item.
#
# Returns an environment holding one data.tree Node per distinct name seen in
# either column; each item's node has the nodes of its dependencies attached
# through AddChildNode().
getForest <- function(data) {
  res <- new.env()

  # Fetch the node stored under `name`, creating it on first use.
  # inherits = FALSE restricts the lookup to `res` itself; without it a
  # variable of the same name in an enclosing environment (for example a
  # global `A1`) would be picked up instead of a node.
  node_for <- function(name) {
    if (!exists(name, envir = res, inherits = FALSE)) {
      assign(name, Node$new(name), envir = res)
    }
    get(name, envir = res, inherits = FALSE)
  }

  # seq_len() so that an empty data frame produces an empty forest.
  for (i in seq_len(nrow(data))) {
    par <- node_for(data$col1[i])
    # Add the children
    deps <- unlist(regmatches(data$col2[i], gregexpr("A\\d+", data$col2[i])))
    for (ch in deps) {
      par$AddChildNode(node_for(ch))
    }
  }

  # Return the nodes
  res
}
f <- getForest(df)
# Function to get the dependency level
# Function to get the dependency level of a node.
#
# node : an object exposing `$count` (number of children) and `$children`
#        (list of child nodes), such as the nodes built by getForest().
#
# Returns 0 for a node without children; otherwise the number of direct
# children plus the sum of the levels of those children (computed
# recursively).
getLevel <- function(node) {
  if (node$count == 0) {
    return(0)
  }
  # R is case-sensitive: the recursion must call getLevel itself.
  # vapply() guarantees a numeric result for every child.
  length(node$children) + sum(vapply(node$children, getLevel, numeric(1)))
}
#Add dependency level to data frame
df$col3 <- sapply(df$col1, function(x) {getLevel(get(x,f))})
df
# col1 col2 col3
#1 A1 X1+Y1 0
#2 A2 A1+Y2 1
#3 A3 A1+Y3+A2 3
#4 A6 Y5+A2 2
#5 A4 A2+A1+A3 7
#6 A7 A2+A3 6

cbind with partially nested list

I'm trying to cbind or unnest or as.data.table a partially nested list.
id <- c(1,2)
A <- c("A1","A2","A3")
B <- c("B1")
AB <- list(A=A,B=B)
ABAB <- list(AB,AB)
nested_list <- list(id=id,ABAB=ABAB)
The length of id is the same as ABAB (2 in this case). I don't know how to unlist a part of this list (ABAB) and cbind another part (id). Here's my desired result as a data.table:
data.table(id=c(1,1,1,2,2,2),A=c("A1","A2","A3","A1","A2","A3"),B=rep("B1",6))
id A B
1: 1 A1 B1
2: 1 A2 B1
3: 1 A3 B1
4: 2 A1 B1
5: 2 A2 B1
6: 2 A3 B1
I haven't tested for more general cases, but this works for the OP example:
library(data.table)
as.data.table(nested_list)[, lapply(ABAB, as.data.table)[[1]], id]
# id A B
#1: 1 A1 B1
#2: 1 A2 B1
#3: 1 A3 B1
#4: 2 A1 B1
#5: 2 A2 B1
#6: 2 A3 B1
Or another option (which is probably faster, but is more verbose):
rbindlist(lapply(nested_list$ABAB, as.data.table),
idcol = 'id')[, id := nested_list$id[id]]
This is some super ugly base R, but produces the desired output.
Reduce(rbind, Map(function(x, y) setNames(data.frame(x, y), c("id", "A", "B")),
as.list(nested_list[[1]]),
lapply(unlist(nested_list[-1], recursive=FALSE),
function(x) Reduce(cbind, x))))
id A B
1 1 A1 B1
2 1 A2 B1
3 1 A3 B1
4 2 A1 B1
5 2 A2 B1
6 2 A3 B1
lapply takes a list of two elements (each containing the A and B variables) extracted with unlist and recursive=FALSE. It returns a list of character matrices with the B variable filled in by recycling. A list of the individual id variables from as.list(nested_list[[1]]) and the list of matrices are fed to Map, which converts corresponding pairs to a data.frame, gives the columns the desired names and returns a list of data.frames. Finally, this list of data.frames is fed to Reduce, which rbinds the results into a single data.frame.
The final Reduce(rbind, could be replaced by data.tables rbindlist if desired.
Here's another hideous solution
max_length = max(unlist(lapply(nested_list, function(x) lapply(x, lengths))))
data.frame(id = do.call(c, lapply(nested_list$id, rep, max_length)),
do.call(rbind, lapply(nested_list$ABAB, function(x)
do.call(cbind, lapply(x, function(y) {
if(length(y) < max_length) {
rep(y, max_length)
} else {
y
}
})))))
# id A B
#1 1 A1 B1
#2 1 A2 B1
#3 1 A3 B1
#4 2 A1 B1
#5 2 A2 B1
#6 2 A3 B1
And one more, also inelegant- but I`d gone too far by the time I saw the other answers.
# Flatten a partially nested list into a data frame.
# nested_l: a list with an `id` component and, as its second component, a
# list holding one inner list of vectors per id (e.g. list(A = ..., B = ...)).
# Returns a data frame with an "id" column followed by one column per inner
# component.
restructure <- function(nested_l) {
# Despite its name, `ids` is the largest element length found after
# flattening one level, i.e. how many rows each id has to be repeated for.
ids <- as.numeric(max(unlist(lapply(unlist(nested_l, recursive = FALSE), function(x){
lapply(x, length)
}))))
# First column: every id repeated `ids` times. Remaining columns: for each x,
# the x-th component of every inner list, concatenated across inner lists.
# NOTE(review): x runs over 1..length(id) but is used as the position of a
# component inside each inner list, so this only lines up when the number of
# ids equals the number of inner components (2 and 2 in the example) - confirm.
# NOTE(review): shorter columns appear to rely on data.frame() recycling to
# reach the full row count - confirm for lengths that do not divide evenly.
temp = data.frame(rep(nested_l$id, each = ids),
sapply(1:length(nested_l$id), function(x){
out <-unlist(lapply(nested_l[[2]], function(y){
return(y[x])
}))
}))
# Column names are "id" plus the unique first characters of the values
# themselves (e.g. "A1" -> "A"), not the names of the inner list.
names(temp) <- c("id", unique(substring(unlist(nested_l[2]), first = 1, last = 1)))
return(temp)
}
> restructure(nested_list)
id A B
1 1 A1 B1
2 1 A2 B1
3 1 A3 B1
4 2 A1 B1
5 2 A2 B1
6 2 A3 B1
Joining the party:
library(tidyverse)
temp <- map(nested_list,~map(.x,~expand.grid(.x)))
df <- map_df(1:2,~cbind(temp$id[[.x]],temp$ABAB[[.x]]))
Var1 A B
1 1 A1 B1
2 1 A2 B1
3 1 A3 B1
4 2 A1 B1
5 2 A2 B1
6 2 A3 B1

Paste a string of variable names into function in R

I've got two strings of variable names that looks like this
> names_a = paste(paste0('a', seq(0,6,1)), collapse = ", ")
> names_a
[1] "a0, a1, a2, a3, a4, a5, a6"
> names_b = paste(paste0('b', seq(0,6,1)), collapse = ", ")
> names_b
[1] "b0, b1, b2, b3, b4, b5, b6"
Each a and b variable contains a vector of ids, for example:
> head(a3)
[1] "1234" "56567" "457659"...
I aim to get all possible pairs of a and b ids. For this purpose I try to paste the variables' names right into the function rbind and then into expand.grid
pairs = expand.grid(rbind(parse(text = names_a), rbind(parse(text = names_b))
I mean I try to collapse all a0 to a6 vectors into a single vector using rbind, let it be named a, the same for all b's vectors and then find all pairs of values in a and b
surprisingly nothing works. Can it be fixed?
Something like this?
a1 = 1:2
a2 = 3:4
b1 = 5:6
b2 = 7:8
expand.grid(do.call(rbind, mget(paste("a", 1:2, sep = ""))),
do.call(rbind, mget(paste("b", 1:2, sep = ""))))
# Var1 Var2
#1 1 5
#2 3 5
#3 2 5
#4 4 5
#5 1 7
#6 3 7
#7 2 7
#8 4 7
#9 1 6
#10 3 6
#11 2 6
#12 4 6
#13 1 8
#14 3 8
#15 2 8
#16 4 8
Collapse all of a0 through a6 into one vector:
a <- as.vector(sapply(strsplit(gsub(" ","",names_a),",")[[1]],function(x) get(x)))
(or if you don't have the names as a single string you need to parse):
a <- as.vector(sapply(paste0("a",0:6),function(x) get(x)))
Do the same with b and then
merge(a,b) #all pairs
This will generate duplicates if any of the a or b variables has duplicates, so you may want to add unique to the collapsing of a and b

Comparing two columns of two dataframes (logical operators)

I would like to compare two columns simultaneously. My data looks like this:
a <- data.frame("a1" = c(1,1,1,3,4), "a2" = c(2,1,2,1,2))
b <- data.frame("b1" = c(1,1,3,1,3), "b2" = c(2,2,1,2,1))
cbind(a, b)
# a1 a2 b1 b2
# 1 1 2 1 2
# 2 1 1 1 2
# 3 1 2 3 1
# 4 3 1 1 2
# 5 4 2 3 1
I would like to identify all rows of a where a1 is not in b1 or where a1 is in b1 but a2 for the special a1 is not in b2 for the special b2. So the second question is: When a1 is in b1 is then a2 for this row for a1 also in b2 for this row for b1.
Example for line 2: I am checking, if a1 = 1 is anywhere in b1 = c(1,1,3,1,3). It is, so I want to check if a2 = 1 in line 2 (where a1 = 1) is anywhere in b2 where b1 = a1 = 1, so here b2 = c(2, 2, 2). For line 2 a2 = 1 is not in b2 = c(2, 2, 2), so the result should show me this line.
The first question is easy to answer with the following code:
a[which(!(a$a1 %in% b$b1)), ]
# a1 a2
# 5 4 2
But I can't fix the second problem. Maybe I am working in a wrong way with the logical operators. My result should look like this:
a1 a2
2 1 1
4 4 2
Following the explanation in your edit, you want the rows where either the specific a1 from a is not in b1 from b or where the specific a1 from a is equal to b1 of the same row in b and a2 from a is not among the values of b2 from b of the rows for which b1 equals the value of the specific a1.
In R, you can write this like that:
# Build a logical mask over the rows of `a`: TRUE when the row's a1 does not
# occur in b$b1, or when its a2 is not among the b2 values of the rows of `b`
# whose b1 equals that a1. The mask is then used to print the flagged rows.
cond <- sapply(seq(nrow(a)), # check each row, one by one
function (i){
!(a$a1[i] %in% b$b1) | # a1 of the specific row is not in b1 or
!(a$a2[i] %in% b$b2[b$b1==a$a1[i]]) # a2 of the specific row is not in the values of b2 for which b1 equals a1 of the specific row
})
a[cond, ]
# a1 a2
#2 1 1
#5 4 2
Obviously not a nice solution, but it works with my data (unequal dimension of rows of the two datasets, not the same position of the values in the variables) - here with new example data, because I chose the first really bad.
a <- data.frame("a1" = c(1,1,1,3,4), "a2" = c(2,1,2,1,2))
b <- data.frame("b1" = c(1,3,1,1), "b2" = c(2,1,2,2))
# Print the rows of `data1` that have no counterpart in `data2`.
#
# data1 : data frame with columns a1 and a2.
# data2 : data frame with columns b1 and b2.
#
# First, for every a1 value that also occurs in b1, prints the distinct
# (a1, a2) rows whose a2 is not among the b2 values recorded for that key.
# Then prints the distinct rows for every a1 value that never occurs in b1.
# Called for its printed output only.
test <- function(data1, data2) {
  has_match <- data1$a1 %in% data2$b1

  for (key in unique(data1[has_match, "a1"])) {
    rows_a <- data1[data1$a1 == key, c("a1", "a2")]
    rows_b <- data2[data2$b1 == key, c("b1", "b2")]
    allowed <- unique(rows_b$b2)
    for (val in unique(rows_a$a2)) {
      # Values that do occur for this key need no report.
      if (val %in% allowed) {
        next
      }
      print(unique(rows_a[rows_a$a1 == key & rows_a$a2 == val, ]))
    }
  }

  for (key in unique(data1[which(!has_match), "a1"])) {
    print(unique(data1[data1$a1 == key, c("a1", "a2")]))
  }
}
test(a, b)
a1 a2
2 1 1
a1 a2
5 4 2
Based on your answer I improved the function test(). This version returns a dataframe:
a <- data.frame(a1=c(1,1,1,3,4), a2=c(2,1,2,1,2))
b <- data.frame(b1=c(1,1,3,1,3), b2=c(2,2,1,2,1))
# Return the rows of `a` that have no counterpart in `b`.
#
# a : data frame with columns a1 and a2.
# b : data frame with columns b1 and b2.
#
# A row is kept when its a1 never occurs in b$b1, or when its a1 does occur
# but its a2 is not among the b2 values of the rows of `b` with that b1.
# Rows found for shared keys are placed in front of the rows collected so
# far, so the most recently found group comes first in the result.
test <- function(a, b) {
  # Rows whose key is entirely absent from b.
  out <- subset(a, !a1 %in% b$b1)

  shared_keys <- unique(a$a1[a$a1 %in% b$b1])
  for (key in shared_keys) {
    a_rows <- subset(a, a1 == key)
    b_rows <- subset(b, b1 == key)
    b_vals <- unique(b_rows$b2)
    for (val in unique(a_rows$a2)) {
      if (val %in% b_vals) {
        next
      }
      out <- rbind(subset(a_rows, a2 == val), out)
    }
  }

  out
}
test(a, b)

Generating a 96 or 384 well plate layout in R

I am trying to write some code which will take a .csv file which contains some sample names as input and will output a data.frame containing the sample names and either a 96 well plate or 384 well plate format (A1, B1, C1...). For those who do not know, a 96 well plate has eight alphabetically labeled rows (A, B, C, D, E, F, G, H) and 12 numerically labeled columns (1:12) and a 384 well plate has 16 alphabetically labeled rows (A:P) and 24 numerically labeled columns (1:24). I am trying to write some code that will generate either of these formats (there CAN be two different functions to do this) allowing for the samples to be labeled either DOWN (A1, B1, C1, D1, E1, F1, G1, H1, A2...) or ACROSS (A1, A2, A3, A4, A5 ...).
So far, I have figured out how to get the row names fairly easily
rowLetter <- rep(LETTERS[1:8], length.out = variable)
#variable will be based on how many samples I have
I just cannot figure out how to get the numeric column names to apply correctly... I have tried:
colNumber <- rep(1:12, times = variable)
but it isn't that simple. All 8 rows must be filled before the col number increases by 1 if you're going 'DOWN' or all 12 columns must be filled before the row letter increases by 1 if you're going 'ACROSS'.
EDIT:
Here is a clunky version. It takes the number of samples that you have, a 'plate format' which IS NOT functional yet, and a direction and will return a data.frame with the wells and plate numbers. Next, I am going to a) fix the plate format so that it will work correctly and b) give this function the ability to take a list of samples names or ID's or whatever and return the sample names, well positions, and plate numbers!
# Assign consecutive samples to well positions on 96- or 384-well plates.
#
# numOfSamples : number of samples to place (a single non-negative number).
# plateFormat  : 96 (8 rows x 12 columns) or 384 (16 rows x 24 columns).
# direction    : "DOWN" fills A1, B1, C1, ... (a whole column before moving
#                on); "ACROSS" fills A1, A2, A3, ... (a whole row first).
#                Matching is case-insensitive.
#
# Returns a data frame with one row per sample and the columns `plateLayout`
# (well name such as "A1") and `plateNumber` (1 for the first plate, 2 for
# the next, ...). Wells are assumed to be filled in order without gaps.
# Stops with an error for an unsupported plateFormat or direction.
plateLayout <- function(numOfSamples, plateFormat = 96, direction = "DOWN"){
  stopifnot(is.numeric(numOfSamples), length(numOfSamples) == 1L,
            !is.na(numOfSamples), numOfSamples >= 0)

  # Plate geometry follows the requested format instead of a fixed 8 x 12.
  plateDims <- switch(as.character(plateFormat),
                      "96" = c(8L, 12L),
                      "384" = c(16L, 24L),
                      stop("plateFormat must be 96 or 384", call. = FALSE))
  numberOfRows <- plateDims[1]
  numberOfColumns <- plateDims[2]
  wellsPerPlate <- numberOfRows * numberOfColumns
  rowLetters <- LETTERS[seq_len(numberOfRows)]
  colNumbers <- seq_len(numberOfColumns)

  # Well names of one full plate in filling order; rep() with times/each
  # replaces the element-by-element loops.
  wells <- switch(toupper(direction),
                  "DOWN" = paste0(rep(rowLetters, times = numberOfColumns),
                                  rep(colNumbers, each = numberOfRows)),
                  "ACROSS" = paste0(rep(rowLetters, each = numberOfColumns),
                                    rep(colNumbers, times = numberOfRows)),
                  stop("direction must be \"DOWN\" or \"ACROSS\"", call. = FALSE))

  #Calculate the number of plates required
  platesRequired <- ceiling(numOfSamples / wellsPerPlate)

  # The variable names below become the column names of the result.
  plateNumber <- as.numeric(rep(seq_len(platesRequired), each = wellsPerPlate))
  plateLayout <- rep(wells, times = platesRequired)
  allWells <- data.frame(plateLayout, plateNumber)

  # seq_len() keeps the result empty for zero samples (1:0 would not).
  allWells[seq_len(numOfSamples), ]
}
Does anybody have any thoughts on what else might make this cool? I'm going to use this function to generate .csv or .txt files to use as sample name imports for different instruments so I will be kind of constrained in terms of 'cool features', but I think it would be cool to use ggplot to make a graphic which shows the plates and sample names?
You don't need for loops. Here is a start:
#some sample ids
ids <- c(LETTERS, letters)
#plate size:
n <- 96
nrow <- 8
samples <- character(n)
samples[seq_along(ids)] <- ids
samples <- matrix(samples, nrow=nrow)
colnames(samples) <- seq_len(n/nrow)
rownames(samples) <- LETTERS[seq_len(nrow)]
# 1 2 3 4 5 6 7 8 9 10 11 12
# A "A" "I" "Q" "Y" "g" "o" "w" "" "" "" "" ""
# B "B" "J" "R" "Z" "h" "p" "x" "" "" "" "" ""
# C "C" "K" "S" "a" "i" "q" "y" "" "" "" "" ""
# D "D" "L" "T" "b" "j" "r" "z" "" "" "" "" ""
# E "E" "M" "U" "c" "k" "s" "" "" "" "" "" ""
# F "F" "N" "V" "d" "l" "t" "" "" "" "" "" ""
# G "G" "O" "W" "e" "m" "u" "" "" "" "" "" ""
# H "H" "P" "X" "f" "n" "v" "" "" "" "" "" ""
library(reshape2)
samples <- melt(samples)
samples$position <- paste0(samples$Var1, samples$Var2)
# Var1 Var2 value position
# 1 A 1 A A1
# 2 B 1 B B1
# 3 C 1 C C1
# 4 D 1 D D1
# 5 E 1 E E1
# 6 F 1 F F1
# 7 G 1 G G1
# 8 H 1 H H1
# 9 A 2 I A2
# 10 B 2 J B2
# 11 C 2 K C2
# 12 D 2 L D2
# 13 E 2 M E2
# 14 F 2 N F2
# 15 G 2 O G2
# 16 H 2 P H2
# 17 A 3 Q A3
# 18 B 3 R B3
# 19 C 3 S C3
# 20 D 3 T D3
# 21 E 3 U E3
# 22 F 3 V F3
# 23 G 3 W G3
# 24 H 3 X H3
# 25 A 4 Y A4
# 26 B 4 Z B4
# 27 C 4 a C4
# 28 D 4 b D4
# 29 E 4 c E4
# 30 F 4 d F4
# 31 G 4 e G4
# 32 H 4 f H4
# 33 A 5 g A5
# 34 B 5 h B5
# 35 C 5 i C5
# 36 D 5 j D5
# 37 E 5 k E5
# 38 F 5 l F5
# 39 G 5 m G5
# 40 H 5 n H5
# 41 A 6 o A6
# 42 B 6 p B6
# 43 C 6 q C6
# 44 D 6 r D6
# 45 E 6 s E6
# 46 F 6 t F6
# 47 G 6 u G6
# 48 H 6 v H6
# 49 A 7 w A7
# 50 B 7 x B7
# 51 C 7 y C7
# 52 D 7 z D7
# 53 E 7 E7
# 54 F 7 F7
# 55 G 7 G7
# 56 H 7 H7
# 57 A 8 A8
# 58 B 8 B8
# 59 C 8 C8
# 60 D 8 D8
# 61 E 8 E8
# 62 F 8 F8
# 63 G 8 G8
# 64 H 8 H8
# 65 A 9 A9
# 66 B 9 B9
# 67 C 9 C9
# 68 D 9 D9
# 69 E 9 E9
# 70 F 9 F9
# 71 G 9 G9
# 72 H 9 H9
# 73 A 10 A10
# 74 B 10 B10
# 75 C 10 C10
# 76 D 10 D10
# 77 E 10 E10
# 78 F 10 F10
# 79 G 10 G10
# 80 H 10 H10
# 81 A 11 A11
# 82 B 11 B11
# 83 C 11 C11
# 84 D 11 D11
# 85 E 11 E11
# 86 F 11 F11
# 87 G 11 G11
# 88 H 11 H11
# 89 A 12 A12
# 90 B 12 B12
# 91 C 12 C12
# 92 D 12 D12
# 93 E 12 E12
# 94 F 12 F12
# 95 G 12 G12
# 96 H 12 H12
Use the byrow argument to fill the matrix in the other direction:
samples <- matrix(samples, nrow=nrow, byrow=TRUE)
To fill more than one plate, you can use basically the same idea, but use an array instead of a matrix.
I've never written this code in R before but it should be the same as Perl, Python or Java
For Row major order (going across) the pseudocode algorithm is simply:
for each( i : 0..totalNumWells - 1){
column = (i % numColumns)
row = ((i % totalNumWells) / numColumns)
}
Where numColumns is 12 for a 96-well plate or 24 for a 384-well plate, and totalNumWells is 96 or 384 respectively. This will give you a column and row index in 0-based coordinates, which is perfect for accessing arrays.
wellName = ABCs[row], column + 1
Where ABCs is an array of all the valid letters in your plate (or A-Z). +1 is to convert 0-based into 1-based, otherwise the first well will be A0 instead of A1.
I also want to point out that often 384 wells aren't in row major order. I've seen most often sequencing centers preferring a "checker board" pattern A01, A03, A05... then A02, A04, A06..., B01, B03... etc to be able to combine 4 96-well plates into a single 384 well without changing the layout and simplifying the picking robot's work. that's a much harder algorithm to compute the ith well for
The following code does what I set out to do. You can use it to make as many plates as you need, with the assumptions that whatever your import list is will be in order. It can make as many plates as you need and will add a column for "plateNumber" which will indicate which batch it's on. It can only handle 96 or 384 well plates, but that is all I deal in so that is fine.
# Assign consecutive samples to well positions on 96- or 384-well plates.
#
# numOfSamples : number of samples to place (a single non-negative number).
# plateFormat  : 96 (8 rows x 12 columns) or 384 (16 rows x 24 columns).
# direction    : "DOWN" fills A1, B1, C1, ... (a whole column before moving
#                on); "ACROSS" fills A1, A2, A3, ... (a whole row first).
#                Matching is case-insensitive.
#
# Returns a data frame with one row per sample and the columns `plateNumber`
# (1 for the first plate, 2 for the next, ...) and `plateLayout` (well name
# such as "A1"). Wells are assumed to be filled in order without gaps.
# Stops with an error for an unsupported plateFormat or direction.
plateLayout <- function(numOfSamples, plateFormat = 96, direction = "DOWN"){
  stopifnot(is.numeric(numOfSamples), length(numOfSamples) == 1L,
            !is.na(numOfSamples), numOfSamples >= 0)

  #define the number of columns and number of rows based on plate format
  #(96 or 384 well plate); anything else is rejected explicitly
  plateDims <- switch(as.character(plateFormat),
                      "96" = c(8L, 12L),
                      "384" = c(16L, 24L),
                      stop("plateFormat must be 96 or 384", call. = FALSE))
  numberOfRows <- plateDims[1]
  numberOfColumns <- plateDims[2]
  wellsPerPlate <- numberOfRows * numberOfColumns
  rowLetters <- LETTERS[seq_len(numberOfRows)]
  colNumbers <- seq_len(numberOfColumns)

  # Well names of one full plate in filling order; rep() with times/each
  # replaces growing the vectors inside nested loops.
  wells <- switch(toupper(direction),
                  "DOWN" = paste0(rep(rowLetters, times = numberOfColumns),
                                  rep(colNumbers, each = numberOfRows)),
                  "ACROSS" = paste0(rep(rowLetters, each = numberOfColumns),
                                    rep(colNumbers, times = numberOfRows)),
                  stop("direction must be \"DOWN\" or \"ACROSS\"", call. = FALSE))

  #Calculate the number of plates required
  platesRequired <- ceiling(numOfSamples / wellsPerPlate)

  # The variable names below become the column names of the result.
  plateNumber <- as.numeric(rep(seq_len(platesRequired), each = wellsPerPlate))
  plateLayout <- rep(wells, times = platesRequired)
  allWells <- data.frame(plateNumber, plateLayout)

  # seq_len() keeps the result empty for zero samples (1:0 would not).
  allWells[seq_len(numOfSamples), ]
}
An example of how to use this would be as follows
#load whatever data you're going to use to get a plate layout on (sample ID's or names or whatever)
thisData <- read.csv("data.csv")
#make a data.frame containing your sample names and the function's output
#alternatively you can use length() if you have a list
#(the data.frame() call needs its own closing parenthesis after plateLayout(...))
plateLayoutDataFrame <- data.frame(thisData$sampleNames,
                                   plateLayout(nrow(thisData), plateFormat = 96, direction = "DOWN"))
#It will return something similar to the following, depending on your selections
#data plateNumber plateLayout
#sample1 1 A1
#sample2 1 B1
#sample3 1 C1
#sample4 1 D1
#sample5 1 E1
#sample6 1 F1
#sample7 1 G1
#sample8 1 H1
#sample9 1 A2
#sample10 1 B2
#sample11 1 C2
#sample12 1 D2
#sample13 1 E2
#sample14 1 F2
#sample15 1 G2
That sums up this function for now. Roland offered a good method of doing this which is less verbose, but I wanted to avoid the use of external packages if possible. I'm working on a shiny app now which actually uses this! I want it to be able to automatically subset based on the 'plateNumber' and write each plate as its own file... for more on this, go to: Automatic multi-file download in R-Shiny
Here's how I'd do it.
# Attach plate and well positions to a list of samples.
#
# sample_list : data frame with one row per sample (a plain vector is also
#               accepted and treated as one sample per element).
# nwells      : plate size, 96 (rows A-H, columns 1-12) or 384 (rows A-P,
#               columns 1-24).
# direction   : "across" fills a whole row before moving to the next one,
#               "down" fills a whole column first (case-insensitive).
#
# Returns `sample_list` with four extra columns: `plate` (1, 2, ...), `row`
# (letter), `col` (number) and `well` (row letter plus zero-padded column,
# e.g. "A01"). Samples are placed in order and fill as many plates as needed;
# an empty sample list yields an empty result with the same columns.
put_samples_in_plates = function(sample_list, nwells=96, direction="across")
{
  if (!nwells %in% c(96, 384)) {
    stop("Invalid plate size")
  }

  # NROW() also covers a plain vector of samples.
  nsamples <- NROW(sample_list)
  nplates <- ceiling(nsamples / nwells)

  # nwells was validated above, so these two cases are exhaustive.
  if (nwells == 96) {
    rows <- LETTERS[1:8]
    cols <- 1:12
  } else {
    rows <- LETTERS[1:16]
    cols <- 1:24
  }
  nrows <- length(rows)
  ncols <- length(cols)

  # times/each on rep() is what distinguishes "down" from "across".
  if (tolower(direction) == "down") {
    single_plate_df <- data.frame(row = rep(rows, times = ncols),
                                  col = rep(cols, each = nrows))
  } else if (tolower(direction) == "across") {
    single_plate_df <- data.frame(row = rep(rows, each = ncols),
                                  col = rep(cols, times = nrows))
  } else {
    stop("Unrecognized direction")
  }
  single_plate_df <- transform(single_plate_df,
                               well = sprintf("%s%02d", row, col))

  # Repeat the single-plate layout once per plate. Indexing with seq_len()
  # keeps everything empty (instead of counting 1, 0) when there are no
  # samples and therefore no plates.
  layout_rows <- rep(seq_len(nwells), times = nplates)
  toobig_plate_df <- data.frame(plate = rep(seq_len(nplates), each = nwells),
                                single_plate_df[layout_rows, , drop = FALSE])
  rownames(toobig_plate_df) <- NULL

  res <- cbind(sample_list,
               toobig_plate_df[seq_len(nsamples), , drop = FALSE])
  return(res)
}
# Quick test
a_sample_list = data.frame(x=1:386, y=rnorm(386))
r.096.across = put_samples_in_plates(sample_list = a_sample_list,
nwells= 96,
direction="across")
r.096.down = put_samples_in_plates(sample_list = a_sample_list,
nwells= 96,
direction="down")
r.384.across = put_samples_in_plates(sample_list = a_sample_list,
nwells=384,
direction="across")
r.384.down = put_samples_in_plates(sample_list = a_sample_list,
nwells=384,
direction="down")
Two points worth noting in the function above:
the use of the times and each parameters within the rep function to differentiate "across" and "down" directions, and
the use of replicate to repeat the individual plate as many times as needed along with the use of a call to rbind from do.call.

Resources