Split multiple strings in multiple columns in a matrix in R - r

I have following matrix with numbers of 0 and 1 with always the same number of strings per column, but also containing columns with only one string. I would like to to split each number into separate columns, that only one number per column and row occurs. But I would like to leave the columns with only one string as it is:
r1 <- c("0","001","0001","01","100")
r2 <- c("1","001","0001","10","100")
r3 <- c("0","100","1000","10","010")
r4 <- c("0","010","0100","10","001")
r5<- c("0","010","0010","10","001")
n.mat <- rbind(r1,r2,r3,r4,r5)
The output:
r1 <- c("0","0","0","1","0","0","0","1","0","1","1","0","0")
r2 <- c("1","0","0","1","0","0","0","1","1","0","1","0","0")
r3 <- c("0","1","0","0","1","0","0","0","1","0","0","1","0")
r4 <- c("0","0","1","0","0","1","0","0","1","0","0","0","1")
r5 <- c("0","0","1","0","0","0","1","0","1","0","0","0","1")
n.mat_new <- rbind(r1,r2,r3,r4,r5)
My code, but it crashes, because of the columns with only one string:
n.mat <- do.call(cbind, apply(n.mat, 2, function(x) {
tmp <-strsplit(x, '')
t(sapply(tmp, `[`, 1:max(lengths(tmp))))
}))

There's no need for apply or paste for this specific problem. Simply transpose the matrix, split all the strings, and re-construct the matrix according to the number of rows in the original matrix.
matrix(unlist(strsplit(t(n.mat), "")), nrow = nrow(n.mat), byrow = TRUE)
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
# [1,] "0" "0" "0" "1" "0" "0" "0" "1" "0" "1" "1" "0" "0"
# [2,] "1" "0" "0" "1" "0" "0" "0" "1" "1" "0" "1" "0" "0"
# [3,] "0" "1" "0" "0" "1" "0" "0" "0" "1" "0" "0" "1" "0"
# [4,] "0" "0" "1" "0" "0" "1" "0" "0" "1" "0" "0" "0" "1"
# [5,] "0" "0" "1" "0" "0" "0" "1" "0" "1" "0" "0" "0" "1"
If you want further optimizations, you can do something like the following, which will retain the rownames
matrix(unlist(strsplit(t(n.mat), "", TRUE), use.names = FALSE),
nrow = nrow(n.mat), byrow = TRUE,
dimnames = list(rownames(n.mat), NULL))
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
# r1 "0" "0" "0" "1" "0" "0" "0" "1" "0" "1" "1" "0" "0"
# r2 "1" "0" "0" "1" "0" "0" "0" "1" "1" "0" "1" "0" "0"
# r3 "0" "1" "0" "0" "1" "0" "0" "0" "1" "0" "0" "1" "0"
# r4 "0" "0" "1" "0" "0" "1" "0" "0" "1" "0" "0" "0" "1"
# r5 "0" "0" "1" "0" "0" "0" "1" "0" "1" "0" "0" "0" "1"
By avoiding apply, you're only calling strsplit once, so you're going to notice much better performance if you have a lot of rows to process.
On my Chromebook (so these times are likely to be slow to begin with) testing with 10,000 rows, I get the following:
nrow(n.mat)
# [1] 10000
bench::mark(am_opt(), am(), gki(), jay(), check = FALSE)
# # A tibble: 4 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
# <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
# 1 am_opt() 28.3ms 40.1ms 27.4 2.75MB 0 14 0 511ms
# 2 am() 36.1ms 41.2ms 24.6 2.75MB 0 13 0 528ms
# 3 gki() 220.3ms 229.4ms 4.39 3.43MB 0 3 0 683ms
# 4 jay() 975.8ms 975.8ms 1.02 3.51MB 1.02 1 1 976ms
# # … with 4 more variables: result <list>, memory <list>, time <list>, gc <list>
I didn't benchmark Karthik's answer because just running it once took more than 1 minute.
system.time(karthik())
# user system elapsed
# 81.341 0.000 81.343
Where the functions are directly copied from the other answers:
am_opt <- function() {
matrix(unlist(strsplit(t(n.mat), "", TRUE), use.names = FALSE),
nrow = nrow(n.mat), byrow = TRUE,
dimnames = list(rownames(n.mat), NULL))
}
am <- function() matrix(unlist(strsplit(t(n.mat), "")), nrow = nrow(n.mat), byrow = TRUE)
gki <- function() matrix(unlist(apply(n.mat, 1, strsplit, split = "")), nrow(n.mat), byrow=TRUE)
jay <- function() t(apply(n.mat, 1, function(x) el(strsplit(Reduce(paste0, x), ""))))
karthik <- function() bind_rows(apply(n.mat, 2, strsplit, split = '')) %>% t

Collapse paste0 using Reduce and use strsplit on "".
t(apply(n.mat, 1, function(x) el(strsplit(Reduce(paste0, x), ""))))
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
# r1 "0" "0" "0" "1" "0" "0" "0" "1" "0" "1" "1" "0" "0"
# r2 "1" "0" "0" "1" "0" "0" "0" "1" "1" "0" "1" "0" "0"
# r3 "0" "1" "0" "0" "1" "0" "0" "0" "1" "0" "0" "1" "0"
# r4 "0" "0" "1" "0" "0" "1" "0" "0" "1" "0" "0" "0" "1"
# r5 "0" "0" "1" "0" "0" "0" "1" "0" "1" "0" "0" "0" "1"

Does this work:
library(dplyr)
bind_rows(apply(n.mat, 2, strsplit, split = '')) %>% t
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
r1 "0" "0" "0" "1" "0" "0" "0" "1" "0" "1" "1" "0" "0"
r2 "1" "0" "0" "1" "0" "0" "0" "1" "1" "0" "1" "0" "0"
r3 "0" "1" "0" "0" "1" "0" "0" "0" "1" "0" "0" "1" "0"
r4 "0" "0" "1" "0" "0" "1" "0" "0" "1" "0" "0" "0" "1"
r5 "0" "0" "1" "0" "0" "0" "1" "0" "1" "0" "0" "0" "1"

You can use strsplit in apply, unlist the result and create with this a matrix.
matrix(unlist(apply(n.mat, 1, strsplit, split = "")), nrow(n.mat), byrow=TRUE)
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
#[1,] "0" "0" "0" "1" "0" "0" "0" "1" "0" "1" "1" "0" "0"
#[2,] "1" "0" "0" "1" "0" "0" "0" "1" "1" "0" "1" "0" "0"
#[3,] "0" "1" "0" "0" "1" "0" "0" "0" "1" "0" "0" "1" "0"
#[4,] "0" "0" "1" "0" "0" "1" "0" "0" "1" "0" "0" "0" "1"
#[5,] "0" "0" "1" "0" "0" "0" "1" "0" "1" "0" "0" "0" "1"

Related

In a list object ordering characters in each row

I have a list object. I used strsplit(data, ";") to unlist. It is characters "A";"B" so on and each row has different length. Therefore, I wrote a for loop to create a matrix. I want to have column 1 all same object "A".
Here is the code I wrote but it does not work as I wanted.
myList <- list()
myList[[1]] <- c("A", "C", 0, 0)
myList[[2]] <- c("A", "B", "C")
myList[[3]] <- c("A", 0, 0, 0, 0)
myList[[4]] <- c("B", "A")
myList[[5]] <- c("Aa", "A", "B", 0, 0)
myList[[6]] <- c("Aa", "A", "C", 0, 0)
myList[[7]] <- c("C", "A", 0, 0)
myList
TD=TD2=matrix(0,length(myList),5)
for(i in 1:length(myList))
{
m1=length(myList[[i]])
TD[i,1:m1]=matrix( myList[[i]] , ncol = m1 , byrow = TRUE )
}
for(j in 1:length(myList)){
TD2[j,]=TD[j,order(TD[j,],decreasing = T)]
}
Desired output to be
[,1] [,2] [,3] [,4] [,5]
[1,] "A" "C" "0" "0" "0"
[2,] "A" "C" "B" "0" "0"
[3,] "A" "0" "0" "0" "0"
[4,] "A" "B" "0" "0" "0"
[5,] "A" "B" "Aa" "0" "0"
[6,] "A" "C" "Aa" "0" "0"
[7,] "A" "C" "0" "0" "0"
You can define a factor object with custom order with factor(..., ordered = T) and sort it.
ord <- names(sort(table(unlist(myList))[-1], dec = T))
len <- max(lengths(myList))
t(sapply(myList, function(x){
y <- sort(factor(x, levels = c(ord, "0"), ordered = T))[1:len]
replace(y, is.na(y), "0")
}))
# [,1] [,2] [,3] [,4] [,5]
# [1,] "A" "C" "0" "0" "0"
# [2,] "A" "C" "B" "0" "0"
# [3,] "A" "0" "0" "0" "0"
# [4,] "A" "B" "0" "0" "0"
# [5,] "A" "B" "Aa" "0" "0"
# [6,] "A" "C" "Aa" "0" "0"
# [7,] "A" "C" "0" "0" "0"
Not exactly what you want, but since you can create a frequency table, this should also work:
TD = matrix(0,length(myList),5)
for (i in 1:length(myList)) {
myList[[i]] = sort(myList[[i]][which(myList[[i]] != "0")])
for (j in 1:length(myList[[i]])) TD[i, j] = myList[[i]][j]
}
> TD
[,1] [,2] [,3] [,4] [,5]
[1,] "A" "C" "0" "0" "0"
[2,] "A" "B" "C" "0" "0"
[3,] "A" "0" "0" "0" "0"
[4,] "A" "B" "0" "0" "0"
[5,] "A" "Aa" "B" "0" "0"
[6,] "A" "Aa" "C" "0" "0"
[7,] "A" "C" "0" "0" "0"

Make 0/1 character matrix from random phylogenetic tree in R?

Is it possible to generate 0/1 character matrices like those shown below right from bifurcating phylogenetic trees like those on the left. The 1 in the matrix indicates presence of a shared character that unites the clades.
This code generates nice random trees but I have no idea where to begin to turn the results into a character matrix.
library(ape) # Other package solutions are acceptable
forest <- rmtree(N = 2, n = 10, br = NULL)
plot(forest)
To be clear, I can use the following code to generate random matrices, and then plot the trees.
library(ape)
library(phangorn)
ntaxa <- 10
nchar <- ntaxa - 1
char_mat <- array(0, dim = c(ntaxa, ntaxa - 1))
for (i in 1:nchar) {
char_mat[,i] <- replace(char_mat[,i], seq(1, (ntaxa+1)-i), 1)
}
char_mat <- char_mat[sample.int(nrow(char_mat)), # Shuffle rows
sample.int(ncol(char_mat))] # and cols
# Ensure all branch lengths > 0
dist_mat <- dist.gene(char_mat) + 0.5
upgma_tree <- upgma(dist_mat)
plot.phylo(upgma_tree, "phylo")
What I want is to generate random trees, and then make the matrices from those trees. This solution does not make the right type of matrix.
Edit for clarity: I am generating binary character matrices that students can use to draw phylogenetic trees using simple parsimony. The 1 character represents homologies that unite taxa into clades. So, all rows must share one character (a 1 across all rows in one column) and some characters must be shared by only two taxa. (I'm discounting autapomorphies.)
Examples:
you can have a look at the rTraitDisc function in ape that is pretty straight forward:
library(ape)
## You'll need to simulate branch length!
forest <- rmtree(N = 2, n = 10)
## Generate on equal rate model character
(one_character <- rTraitDisc(forest[[1]], type = "ER", states = c(0,1)))
# t10 t7 t5 t9 t1 t4 t2 t8 t3 t6
# 0 0 0 1 0 0 0 0 0 0
# Levels: 0 1
## Generate a matrix of ten characters
(replicate(10, rTraitDisc(forest[[1]], type = "ER", states = c(0,1))))
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# t10 "0" "0" "0" "0" "1" "0" "0" "0" "0" "0"
# t7 "0" "0" "0" "0" "1" "0" "0" "0" "0" "0"
# t5 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t9 "0" "0" "1" "0" "0" "0" "0" "0" "0" "0"
# t1 "0" "0" "1" "0" "0" "0" "0" "0" "0" "0"
# t4 "0" "0" "1" "0" "0" "0" "0" "0" "0" "0"
# t2 "0" "0" "1" "0" "0" "0" "0" "0" "0" "0"
# t8 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t3 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t6 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
To apply it to multiple tree, the best would be to create a lapply function like so:
## Lapply wrapper function
generate.characters <- function(tree) {
return(replicate(10, rTraitDisc(tree, type = "ER", states = c(0,1))))
}
## Generate 10 character matrices for each tree
lapply(forest, generate.characters)
# [[1]]
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# t10 "0" "0" "0" "1" "0" "0" "0" "0" "0" "0"
# t7 "0" "0" "0" "1" "0" "0" "0" "0" "0" "0"
# t5 "0" "0" "0" "1" "0" "0" "0" "0" "0" "0"
# t9 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t1 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t4 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t2 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t8 "0" "0" "0" "1" "0" "1" "0" "0" "0" "1"
# t3 "0" "0" "0" "0" "0" "1" "0" "0" "0" "0"
# t6 "0" "0" "0" "0" "0" "1" "0" "0" "0" "0"
# [[2]]
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# t7 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t9 "1" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t5 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t2 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t4 "0" "1" "0" "0" "1" "0" "0" "0" "0" "0"
# t6 "0" "1" "0" "0" "1" "0" "0" "0" "0" "0"
# t10 "0" "1" "1" "0" "1" "1" "0" "0" "0" "1"
# t8 "0" "1" "1" "0" "1" "0" "0" "0" "0" "0"
# t3 "0" "1" "0" "0" "0" "0" "0" "0" "0" "0"
# t1 "0" "1" "0" "0" "0" "0" "0" "0" "0" "0"
Another option is to use the sim.morpho from the dispRity package. This function reuses the rTraitDisc function but has a bit more models implemented and alllows the rates to be provided as distributions from which to sample. It also allows characters to look a bit more "realistic" without to much invariant data and insuring that the generated character "looks" like a real morphological character (like with the right amount of homoplasy, etc...).
library(dispRity)
## You're first tree
tree <- forest[[1]]
## Setting up the parameters
my_rates = c(rgamma, rate = 10, shape = 5)
my_substitutions = c(runif, 2, 2)
## HKY binary (15*50)
matrixHKY <- sim.morpho(tree, characters = 50, model = "HKY",
rates = my_rates, substitution = my_substitutions)
## Mk matrix (15*50) (for Mkv models)
matrixMk <- sim.morpho(tree, characters = 50, model = "ER", rates = my_rates)
## Mk invariant matrix (15*50) (for Mk models)
matrixMk <- sim.morpho(tree, characters = 50, model = "ER", rates = my_rates,
invariant = FALSE)
## MIXED model invariant matrix (15*50)
matrixMixed <- sim.morpho(tree, characters = 50, model = "MIXED",
rates = my_rates, substitution = my_substitutions, invariant = FALSE,
verbose = TRUE)
I suggest you have a read at the sim.morpho function for the proper references on how the model work or at the relevant section in the dispRity package manual.
I figured out how to make the matrix using Descendants from the phangorn package. I still have to tweak it with suitable node labels to match the example matrix in the original question, but the framework is there.
library(ape)
library(phangorn)
ntaxa <- 8
nchar <- ntaxa - 1
tree <- rtree(ntaxa, br = NULL)
# Gets descendants, but removes the first ntaxa elements,
# which are the individual tips
desc <- phangorn::Descendants(tree)[-seq(1, ntaxa)]
char_mat <- array(0, dim = c(ntaxa, nchar))
for (i in 1:nchar) {
char_mat[,i] <- replace(char_mat[,i], y <- desc[[i]], 1)
}
rownames(char_mat) <- tree$tip.label
char_mat
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7]
#> t6 1 1 0 0 0 0 0
#> t3 1 1 1 0 0 0 0
#> t7 1 1 1 1 0 0 0
#> t2 1 1 1 1 1 0 0
#> t5 1 1 1 1 1 0 0
#> t1 1 0 0 0 0 1 1
#> t8 1 0 0 0 0 1 1
#> t4 1 0 0 0 0 1 0
plot(tree)
Created on 2019-01-28 by the reprex package (v0.2.1)

Row number in dataframe based on multiple parameters in R

I wish to find the row number, based on multiple parameters. I have made this test matrix:
data=
[,1] [,2] [,3]
[1,] "1" "a" "0"
[2,] "2" "b" "0"
[3,] "3" "c" "0"
[4,] "4" "a" "0"
[5,] "1" "b" "0"
[6,] "2" "c" "0"
[7,] "3" "a" "0"
[8,] "4" "b" "0"
Then I want to get the row number where
data[,1]==1 and data[,2]=='b'

Enumerate all possible combined probabilities of a series of Bernoulli trials with different probabilities

Suppose I have a series of n probabilities for success of independent Bernoulli trials, p1 to pn such that p1 != p2 != ... != pn. Give each trial a unique name.
p <- c(0.5, 0.12, 0.7, 0.8, .02)
a <- c("A","B","C","D","E")
I know from searching stack exchange (e.g., here and here) that I can find the cdf, pmf, etc. using the Poisson Binomial distribution function.
What I'm interested in is the exact probability of every possible combination of success and failures. (E.g. If I drew a probability tree, I want to know the probability at the end of each branch.)
all <- prod(p)
all
[1] 0.000672
o1 <- (0.5 * (1-0.12) * 0.7 * 0.8 * .02)
o1
[1] 0.004928
o2 <- (0.5 * 0.12 * (1-0.7) * 0.8 * .02)
o2
[1] 0.000288
...for all 2^5 possible combinations of success/failure.
What's an efficient way to go about this in R?
In the case of my actual data set, the number of trials is 19, so we're talking about 2^19 total paths on the probability tree.
The key to making this computation fast is to do it in log-probability space so that the product for each branch of the tree is a sum that can be computed as the inner sum of a matrix multiply. In this manner, all the branches can be computed together in vectorized fashion.
First, we construct an enumeration of all branches. For this, we use the intToBin function from R.utils package:
library(R.utils)
enum.branches <- unlist(strsplit(intToBin(seq_len(2^n)-1),split=""))
where n is the number of Bernoulli variables. For your example, n=5:
matrix(enum.branches, nrow=n)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17]
##[1,] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "1"
##[2,] "0" "0" "0" "0" "0" "0" "0" "0" "1" "1" "1" "1" "1" "1" "1" "1" "0"
##[3,] "0" "0" "0" "0" "1" "1" "1" "1" "0" "0" "0" "0" "1" "1" "1" "1" "0"
##[4,] "0" "0" "1" "1" "0" "0" "1" "1" "0" "0" "1" "1" "0" "0" "1" "1" "0"
##[5,] "0" "1" "0" "1" "0" "1" "0" "1" "0" "1" "0" "1" "0" "1" "0" "1" "0"
## [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32]
##[1,] "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1"
##[2,] "0" "0" "0" "0" "0" "0" "0" "1" "1" "1" "1" "1" "1" "1" "1"
##[3,] "0" "0" "0" "1" "1" "1" "1" "0" "0" "0" "0" "1" "1" "1" "1"
##[4,] "0" "1" "1" "0" "0" "1" "1" "0" "0" "1" "1" "0" "0" "1" "1"
##[5,] "1" "0" "1" "0" "1" "0" "1" "0" "1" "0" "1" "0" "1" "0" "1"
results in a matrix where each column is the outcomes from a branch of the probability tree.
Now, use that to construct a matrix of log probabilities of the same size as enum.branches where the value is log(p) if enum.branches=="1" and log(1-p) otherwise. For your data, with p <- c(0.5, 0.12, 0.7, 0.8, .02), this is:
logp <- matrix(ifelse(enum.branches == "1", rep(log(p), 2^n), rep(log(1-p), 2^n)), nrow=n)
Then, sum the log-probabilities and take the exponential to get the product of the probabilities:
result <- exp(rep(1,n) %*% logp)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##[1,] 0.025872 0.000528 0.103488 0.002112 0.060368 0.001232 0.241472 0.004928 0.003528 7.2e-05
[,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20]
##[1,] 0.014112 0.000288 0.008232 0.000168 0.032928 0.000672 0.025872 0.000528 0.103488 0.002112
[,21] [,22] [,23] [,24] [,25] [,26] [,27] [,28] [,29] [,30]
##[1,] 0.060368 0.001232 0.241472 0.004928 0.003528 7.2e-05 0.014112 0.000288 0.008232 0.000168
[,31] [,32]
##[1,] 0.032928 0.000672
The result will be in the same order as the numeration of branches in enum.branches.
We can encapsulate the computation into a function:
enum.prob.product <- function(n, p) {
enum.branches <- unlist(strsplit(intToBin(seq_len(2^n)-1),split=""))
exp(rep(1,n) %*% matrix(ifelse(enum.branches == "1", rep(log(p), 2^n), rep(log(1-p), 2^n)), nrow=n))
}
Timing this with 19 independent Bernoulli variables:
n <- 19
p <- runif(n)
system.time(enum.prob.product(n,p))
## user system elapsed
## 24.064 1.470 26.082
This is on my 2 GHz MacBook (circa 2009). It should be noted that the computation itself is quite fast; it is the enumeration of the branches of the probability tree (I would guess the unlist within that) that is taking the bulk of the time. Any suggestions from the community on another approach to do that will be appreciated.
Just try this in base R:
p <- c(0.5, 0.12, 0.7, 0.8, .02)
a <- c("A","B","C","D","E")
n <- length(p)
apply(expand.grid(replicate(n,list(0:1)))[n:1], 1,
function(x) prod(p[which(x==1)])*prod(1-p[which(x==0)]))
#[1] 0.025872 0.000528 0.103488 0.002112 0.060368 0.001232 0.241472 0.004928 0.003528 0.000072 0.014112 0.000288 0.008232 0.000168 0.032928 0.000672 0.025872
#[18] 0.000528 0.103488 0.002112 0.060368 0.001232 0.241472 0.004928 0.003528 0.000072 0.014112 0.000288 0.008232 0.000168 0.032928 0.000672

Convert row data to binary columns

I am attempting to format a column of data into many binary columns to eventually use for association rule mining. I have had some success using a for loop and a simple triplet matrix, but I am unsure how to aggregate by the levels in the first column thereafter--similar to a group by statement in SQL. I have provided an example below, albeit with a much smaller data set--if successful my actual data set will be 4,200 rows by 3,902 columns so any solution needs to be scaleable. Any suggestions or alternative approaches would be greatly appreciated!
> data <- data.frame(a=c('sally','george','andy','sue','sue','sally','george'), b=c('green','yellow','green','yellow','purple','brown','purple'))
> data
a b
1 sally green
2 george yellow
3 andy green
4 sue yellow
5 sue purple
6 sally brown
7 george purple
x <- data[,1]
for(i in as.numeric(2:ncol(data)))
x <- cbind(x, simple_triplet_matrix(i=1:nrow(data), j=as.numeric(data[,i]),
v = rep(1,nrow(data)), dimnames = list(NULL, levels(data[,i]))) )
##Looks like this:
> as.matrix(x)
name brown green purple yellow
[1,] "sally" "0" "1" "0" "0"
[2,] "george" "0" "0" "0" "1"
[3,] "andy" "0" "1" "0" "0"
[4,] "sue" "0" "0" "0" "1"
[5,] "sue" "0" "0" "1" "0"
[6,] "sally" "1" "0" "0" "0" ##Need to aggregate by Name
##Would like it to look like this:
name brown green purple yellow
[1,] "sally" "1" "1" "0" "0"
[2,] "george" "0" "0" "0" "1"
[3,] "andy" "0" "1" "0" "0"
[4,] "sue" "0" "0" "1" "1"
This should do the trick:
## Get a contingency table of counts
X <- with(data, table(a,b))
## Massage it into the format you're wanting
cbind(name = rownames(X), apply(X, 2, as.character))
# name brown green purple yellow
# [1,] "andy" "0" "1" "0" "0"
# [2,] "george" "0" "0" "1" "1"
# [3,] "sally" "1" "1" "0" "0"
# [4,] "sue" "0" "0" "1" "1"

Resources