Make 0/1 character matrix from random phylogenetic tree in R? - r
Is it possible to generate 0/1 character matrices like those shown below right from bifurcating phylogenetic trees like those on the left. The 1 in the matrix indicates presence of a shared character that unites the clades.
This code generates nice random trees but I have no idea where to begin to turn the results into a character matrix.
library(ape) # Other package solutions are acceptable
forest <- rmtree(N = 2, n = 10, br = NULL)
plot(forest)
To be clear, I can use the following code to generate random matrices, and then plot the trees.
library(ape)
library(phangorn)
ntaxa <- 10
nchar <- ntaxa - 1
char_mat <- array(0, dim = c(ntaxa, ntaxa - 1))
for (i in 1:nchar) {
char_mat[,i] <- replace(char_mat[,i], seq(1, (ntaxa+1)-i), 1)
}
char_mat <- char_mat[sample.int(nrow(char_mat)), # Shuffle rows
sample.int(ncol(char_mat))] # and cols
# Ensure all branch lengths > 0
dist_mat <- dist.gene(char_mat) + 0.5
upgma_tree <- upgma(dist_mat)
plot.phylo(upgma_tree, "phylo")
What I want is to generate random trees, and then make the matrices from those trees. This solution does not make the right type of matrix.
Edit for clarity: I am generating binary character matrices that students can use to draw phylogenetic trees using simple parsimony. The 1 character represents homologies that unite taxa into clades. So, all rows must share one character (a 1 across all rows in one column) and some characters must be shared by only two taxa. (I'm discounting autapomorphies.)
Examples:
you can have a look at the rTraitDisc function in ape that is pretty straight forward:
library(ape)
## You'll need to simulate branch length!
forest <- rmtree(N = 2, n = 10)
## Generate on equal rate model character
(one_character <- rTraitDisc(forest[[1]], type = "ER", states = c(0,1)))
# t10 t7 t5 t9 t1 t4 t2 t8 t3 t6
# 0 0 0 1 0 0 0 0 0 0
# Levels: 0 1
## Generate a matrix of ten characters
(replicate(10, rTraitDisc(forest[[1]], type = "ER", states = c(0,1))))
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# t10 "0" "0" "0" "0" "1" "0" "0" "0" "0" "0"
# t7 "0" "0" "0" "0" "1" "0" "0" "0" "0" "0"
# t5 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t9 "0" "0" "1" "0" "0" "0" "0" "0" "0" "0"
# t1 "0" "0" "1" "0" "0" "0" "0" "0" "0" "0"
# t4 "0" "0" "1" "0" "0" "0" "0" "0" "0" "0"
# t2 "0" "0" "1" "0" "0" "0" "0" "0" "0" "0"
# t8 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t3 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t6 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
To apply it to multiple tree, the best would be to create a lapply function like so:
## Lapply wrapper function
generate.characters <- function(tree) {
return(replicate(10, rTraitDisc(tree, type = "ER", states = c(0,1))))
}
## Generate 10 character matrices for each tree
lapply(forest, generate.characters)
# [[1]]
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# t10 "0" "0" "0" "1" "0" "0" "0" "0" "0" "0"
# t7 "0" "0" "0" "1" "0" "0" "0" "0" "0" "0"
# t5 "0" "0" "0" "1" "0" "0" "0" "0" "0" "0"
# t9 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t1 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t4 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t2 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t8 "0" "0" "0" "1" "0" "1" "0" "0" "0" "1"
# t3 "0" "0" "0" "0" "0" "1" "0" "0" "0" "0"
# t6 "0" "0" "0" "0" "0" "1" "0" "0" "0" "0"
# [[2]]
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# t7 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t9 "1" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t5 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t2 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t4 "0" "1" "0" "0" "1" "0" "0" "0" "0" "0"
# t6 "0" "1" "0" "0" "1" "0" "0" "0" "0" "0"
# t10 "0" "1" "1" "0" "1" "1" "0" "0" "0" "1"
# t8 "0" "1" "1" "0" "1" "0" "0" "0" "0" "0"
# t3 "0" "1" "0" "0" "0" "0" "0" "0" "0" "0"
# t1 "0" "1" "0" "0" "0" "0" "0" "0" "0" "0"
Another option is to use the sim.morpho from the dispRity package. This function reuses the rTraitDisc function but has a bit more models implemented and alllows the rates to be provided as distributions from which to sample. It also allows characters to look a bit more "realistic" without to much invariant data and insuring that the generated character "looks" like a real morphological character (like with the right amount of homoplasy, etc...).
library(dispRity)
## You're first tree
tree <- forest[[1]]
## Setting up the parameters
my_rates = c(rgamma, rate = 10, shape = 5)
my_substitutions = c(runif, 2, 2)
## HKY binary (15*50)
matrixHKY <- sim.morpho(tree, characters = 50, model = "HKY",
rates = my_rates, substitution = my_substitutions)
## Mk matrix (15*50) (for Mkv models)
matrixMk <- sim.morpho(tree, characters = 50, model = "ER", rates = my_rates)
## Mk invariant matrix (15*50) (for Mk models)
matrixMk <- sim.morpho(tree, characters = 50, model = "ER", rates = my_rates,
invariant = FALSE)
## MIXED model invariant matrix (15*50)
matrixMixed <- sim.morpho(tree, characters = 50, model = "MIXED",
rates = my_rates, substitution = my_substitutions, invariant = FALSE,
verbose = TRUE)
I suggest you have a read at the sim.morpho function for the proper references on how the model work or at the relevant section in the dispRity package manual.
I figured out how to make the matrix using Descendants from the phangorn package. I still have to tweak it with suitable node labels to match the example matrix in the original question, but the framework is there.
library(ape)
library(phangorn)
ntaxa <- 8
nchar <- ntaxa - 1
tree <- rtree(ntaxa, br = NULL)
# Gets descendants, but removes the first ntaxa elements,
# which are the individual tips
desc <- phangorn::Descendants(tree)[-seq(1, ntaxa)]
char_mat <- array(0, dim = c(ntaxa, nchar))
for (i in 1:nchar) {
char_mat[,i] <- replace(char_mat[,i], y <- desc[[i]], 1)
}
rownames(char_mat) <- tree$tip.label
char_mat
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7]
#> t6 1 1 0 0 0 0 0
#> t3 1 1 1 0 0 0 0
#> t7 1 1 1 1 0 0 0
#> t2 1 1 1 1 1 0 0
#> t5 1 1 1 1 1 0 0
#> t1 1 0 0 0 0 1 1
#> t8 1 0 0 0 0 1 1
#> t4 1 0 0 0 0 1 0
plot(tree)
Created on 2019-01-28 by the reprex package (v0.2.1)
Related
Split multiple strings in multiple columns in a matrix in R
I have following matrix with numbers of 0 and 1 with always the same number of strings per column, but also containing columns with only one string. I would like to to split each number into separate columns, that only one number per column and row occurs. But I would like to leave the columns with only one string as it is: r1 <- c("0","001","0001","01","100") r2 <- c("1","001","0001","10","100") r3 <- c("0","100","1000","10","010") r4 <- c("0","010","0100","10","001") r5<- c("0","010","0010","10","001") n.mat <- rbind(r1,r2,r3,r4,r5) The output: r1 <- c("0","0","0","1","0","0","0","1","0","1","1","0","0") r2 <- c("1","0","0","1","0","0","0","1","1","0","1","0","0") r3 <- c("0","1","0","0","1","0","0","0","1","0","0","1","0") r4 <- c("0","0","1","0","0","1","0","0","1","0","0","0","1") r5 <- c("0","0","1","0","0","0","1","0","1","0","0","0","1") n.mat_new <- rbind(r1,r2,r3,r4,r5) My code, but it crashes, because of the columns with only one string: n.mat <- do.call(cbind, apply(n.mat, 2, function(x) { tmp <-strsplit(x, '') t(sapply(tmp, `[`, 1:max(lengths(tmp)))) }))
There's no need for apply or paste for this specific problem. Simply transpose the matrix, split all the strings, and re-construct the matrix according to the number of rows in the original matrix. matrix(unlist(strsplit(t(n.mat), "")), nrow = nrow(n.mat), byrow = TRUE) # [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] # [1,] "0" "0" "0" "1" "0" "0" "0" "1" "0" "1" "1" "0" "0" # [2,] "1" "0" "0" "1" "0" "0" "0" "1" "1" "0" "1" "0" "0" # [3,] "0" "1" "0" "0" "1" "0" "0" "0" "1" "0" "0" "1" "0" # [4,] "0" "0" "1" "0" "0" "1" "0" "0" "1" "0" "0" "0" "1" # [5,] "0" "0" "1" "0" "0" "0" "1" "0" "1" "0" "0" "0" "1" If you want further optimizations, you can do something like the following, which will retain the rownames matrix(unlist(strsplit(t(n.mat), "", TRUE), use.names = FALSE), nrow = nrow(n.mat), byrow = TRUE, dimnames = list(rownames(n.mat), NULL)) # [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] # r1 "0" "0" "0" "1" "0" "0" "0" "1" "0" "1" "1" "0" "0" # r2 "1" "0" "0" "1" "0" "0" "0" "1" "1" "0" "1" "0" "0" # r3 "0" "1" "0" "0" "1" "0" "0" "0" "1" "0" "0" "1" "0" # r4 "0" "0" "1" "0" "0" "1" "0" "0" "1" "0" "0" "0" "1" # r5 "0" "0" "1" "0" "0" "0" "1" "0" "1" "0" "0" "0" "1" By avoiding apply, you're only calling strsplit once, so you're going to notice much better performance if you have a lot of rows to process. On my Chromebook (so these times are likely to be slow to begin with) testing with 10,000 rows, I get the following: nrow(n.mat) # [1] 10000 bench::mark(am_opt(), am(), gki(), jay(), check = FALSE) # # A tibble: 4 x 13 # expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time # <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> # 1 am_opt() 28.3ms 40.1ms 27.4 2.75MB 0 14 0 511ms # 2 am() 36.1ms 41.2ms 24.6 2.75MB 0 13 0 528ms # 3 gki() 220.3ms 229.4ms 4.39 3.43MB 0 3 0 683ms # 4 jay() 975.8ms 975.8ms 1.02 3.51MB 1.02 1 1 976ms # # … with 4 more variables: result <list>, memory <list>, time <list>, gc <list> I didn't benchmark Karthik's answer because just running it once took more than 1 minute. system.time(karthik()) # user system elapsed # 81.341 0.000 81.343 Where the functions are directly copied from the other answers: am_opt <- function() { matrix(unlist(strsplit(t(n.mat), "", TRUE), use.names = FALSE), nrow = nrow(n.mat), byrow = TRUE, dimnames = list(rownames(n.mat), NULL)) } am <- function() matrix(unlist(strsplit(t(n.mat), "")), nrow = nrow(n.mat), byrow = TRUE) gki <- function() matrix(unlist(apply(n.mat, 1, strsplit, split = "")), nrow(n.mat), byrow=TRUE) jay <- function() t(apply(n.mat, 1, function(x) el(strsplit(Reduce(paste0, x), "")))) karthik <- function() bind_rows(apply(n.mat, 2, strsplit, split = '')) %>% t
Collapse paste0 using Reduce and use strsplit on "". t(apply(n.mat, 1, function(x) el(strsplit(Reduce(paste0, x), "")))) # [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] # r1 "0" "0" "0" "1" "0" "0" "0" "1" "0" "1" "1" "0" "0" # r2 "1" "0" "0" "1" "0" "0" "0" "1" "1" "0" "1" "0" "0" # r3 "0" "1" "0" "0" "1" "0" "0" "0" "1" "0" "0" "1" "0" # r4 "0" "0" "1" "0" "0" "1" "0" "0" "1" "0" "0" "0" "1" # r5 "0" "0" "1" "0" "0" "0" "1" "0" "1" "0" "0" "0" "1"
Does this work: library(dplyr) bind_rows(apply(n.mat, 2, strsplit, split = '')) %>% t [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] r1 "0" "0" "0" "1" "0" "0" "0" "1" "0" "1" "1" "0" "0" r2 "1" "0" "0" "1" "0" "0" "0" "1" "1" "0" "1" "0" "0" r3 "0" "1" "0" "0" "1" "0" "0" "0" "1" "0" "0" "1" "0" r4 "0" "0" "1" "0" "0" "1" "0" "0" "1" "0" "0" "0" "1" r5 "0" "0" "1" "0" "0" "0" "1" "0" "1" "0" "0" "0" "1"
You can use strsplit in apply, unlist the result and create with this a matrix. matrix(unlist(apply(n.mat, 1, strsplit, split = "")), nrow(n.mat), byrow=TRUE) # [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] #[1,] "0" "0" "0" "1" "0" "0" "0" "1" "0" "1" "1" "0" "0" #[2,] "1" "0" "0" "1" "0" "0" "0" "1" "1" "0" "1" "0" "0" #[3,] "0" "1" "0" "0" "1" "0" "0" "0" "1" "0" "0" "1" "0" #[4,] "0" "0" "1" "0" "0" "1" "0" "0" "1" "0" "0" "0" "1" #[5,] "0" "0" "1" "0" "0" "0" "1" "0" "1" "0" "0" "0" "1"
Compare rows of a data frame with a matrix rows in R
I have created a matrix like this: > head(matrix) Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 Var11 [1,] "0" "0" "1" "0" "1" "1" "0" "0" "0" "0" "NA" [2,] "1" "0" "1" "0" "1" "1" "0" "0" "0" "0" "NA" [3,] "0" "1" "1" "0" "1" "1" "0" "0" "0" "0" "NA" [4,] "1" "1" "1" "0" "1" "1" "0" "0" "0" "0" "NA" [5,] "0" "0" "2" "0" "1" "1" "0" "0" "0" "0" "NA" [6,] "1" "0" "2" "0" "1" "1" "0" "0" "0" "0" "NA" Now, I want to compare the matrix above with the following data frame: > head(df) cod Var11 Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 Var12 1 C000354 B 1 1 4 0 1 2 0 0 0 1 51520.72 2 C000404 A 1 0 1 0 4 4 0 0 1 1 21183.25 3 C000444 A 1 0 4 1 3 3 0 0 0 1 67504.74 4 C000480 A 1 1 2 0 2 3 0 0 1 1 26545.92 5 C000983 C 1 0 1 0 3 4 0 0 0 0 10379.37 6 C000985 C 1 0 3 1 3 4 0 0 0 0 18660.99 Matrix contains all possible combinations of the variables Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10, so basically when a row of df (only column from VAR1 to VAR10) match with a row of matrix and this row in df had a Var12>=90000, I would like it to be written "A" in corresponding column VAR11 of matrix. I have tried with this: for (i in 1 : nrow(matrix)) { for (j in 1 : 10) { ifelse(matrix[i,j]==df[,(j+2)] && df$Var12[] >= 90000, matrix[i,"Var11"] <- "A", matrix[i,"Var11"] <- "NA") } } But this writes NA in all rows of matrix. Does anyone know why this happen or how to solve it? Thanks in advance.
I don't understand why you used 1:10 and j+2 in your loop. #Some dummy data col_to_match<-paste0("V",1:10) set.seed(123) mat <- cbind(matrix(sample(0:4, 100, replace=TRUE), ncol=10), "NA") colnames(mat)<-c(col_to_match,"V11") set.seed(123) df<- data.frame("cod"=paste0("C",1:20), "V12"= runif(20,min=88000,max=95000)) set.seed(1) df <- cbind(df, rbind(mat[3:10,col_to_match], matrix(sample(0:4, 120, replace=TRUE), ncol=10)) ) From the dummy data, we expect the rows of the matrix c(3:10)[df[1:8,"V12"]>=90000] to match the dummy data. Those are rows 3 4 5 6 7 9 10. Run the following to check for every row in matrix, find whether there are any matching rows in df, and whether the V12 value is greater than 90000. for(i in 1:nrow(mat)){ hasMatch<-any(sapply(1:nrow(df), function(j) all( df[j,col_to_match] == mat[i, col_to_match] ) && df[j,"V12"]>=90000 )) if(hasMatch) mat[i, "V11"]<-"A" } Output > mat V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 [1,] "1" "4" "4" "4" "0" "0" "3" "3" "1" "0" "NA" [2,] "3" "2" "3" "4" "2" "2" "0" "3" "3" "3" "NA" [3,] "2" "3" "3" "3" "2" "3" "1" "3" "2" "1" "A" [4,] "4" "2" "4" "3" "1" "0" "1" "0" "3" "3" "A" [5,] "4" "0" "3" "0" "0" "2" "4" "2" "0" "1" "A" [6,] "0" "4" "3" "2" "0" "1" "2" "1" "2" "0" "A" [7,] "2" "1" "2" "3" "1" "0" "4" "1" "4" "3" "A" [8,] "4" "0" "2" "1" "2" "3" "4" "3" "4" "0" "NA" [9,] "2" "1" "1" "1" "1" "4" "3" "1" "4" "2" "A" [10,] "2" "4" "0" "1" "4" "1" "2" "0" "0" "2" "A"
How to reshape this dataframe in R program [duplicate]
This question already has answers here: R counting strings variables in each row of a dataframe (2 answers) Closed 6 years ago. I have a dataframe like below sample: A B C [1,] "A1" "B3" "C1" [2,] "A2" "B1" "C2" [3,] "A3" "B3" "C3" [4,] "A1" "B2" "C3" [5,] "A3" "B3" "C2" [6,] "A1" "B1" "C1" And I would like to reshape it like this, to expand every unique value of variables to a single variable, and mark 1/0 in the value field. Above data frame shall be reshaped to this: A B1 B2 B3 C1 C2 C3 [1,] "A1" "0" "0" "1" "1" "0" "0" [2,] "A2" "1" "0" "0" "0" "1" "0" [3,] "A3" "0" "0" "1" "0" "0" "1" [4,] "A1" "0" "1" "0" "0" "0" "1" [5,] "A3" "0" "0" "1" "0" "1" "0" [6,] "A1" "1" "0" "0" "1" "0" "0" The real data is in huge amount (>100 thousand per day, and much more fields and unique values. So I need a high efficiency program instead of using for... I believe you could help... I am a beginner, only know for... :(
You can try this too (with base R): df <- cbind(as.character(df$A), model.matrix(~B+C+0,df,list(B=contrasts(df$B, contrasts=F), C=contrasts(df$C, contrasts=F)))) dimnames(df) <- list(NULL, c('A', paste0('B',1:3), paste0('C',1:3))) df # A B1 B2 B3 C1 C2 C3 #[1,] "A1" "0" "0" "1" "1" "0" "0" #[2,] "A2" "1" "0" "0" "0" "1" "0" #[3,] "A3" "0" "0" "1" "0" "0" "1" #[4,] "A1" "0" "1" "0" "0" "0" "1" #[5,] "A3" "0" "0" "1" "0" "1" "0" #[6,] "A1" "1" "0" "0" "1" "0" "0"
We can use library(qdapTools) cbind(df1[1], mtabulate(as.data.frame(t(df1[-1])))) # A B3 C1 B1 C2 C3 B2 #V1 A1 1 1 0 0 0 0 #V2 A2 0 0 1 1 0 0 #V3 A3 1 0 0 0 1 0 #V4 A1 0 0 0 0 1 1 #V5 A3 1 0 0 1 0 0 #V6 A1 0 1 1 0 0 0
Removing rows with less than 4 non zero entries, without using loop
The dataset is like this: "1" 10 40 "r" "q" "0" "r" "r" "0" "r" "0" "0" "0" "0" "0" "t" "q" "0" "0" "s" "0" "r" 0 "0" 0 "0" "0" 0 0 0 "0" "2" 10 173 "s" "s" "s" "0" "0" "s" "s" "0" "t" "t" "s" "t" "t" "r" "s" "0" "q" "0" "0" 0 "0" 0 "0" "0" 0 0 0 "0" "3" 10 2107 "t" "0" "0" "s" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" 0 "0" 0 "0" "0" 0 0 0 "0" "4" 10 993 "s" "0" "q" "s" "s" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" 0 "0" 0 "0" "0" 0 0 0 "0" "5" 10 1712 "t" "0" "s" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "s" "0" "t" "0" 0 "0" 0 "0" "0" 0 0 0 "0" "6" 776 1872 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" 0 "r" 0 "0" "0" 0 0 0 "s" Output should be: "1" 10 40 "r" "q" "0" "r" "r" "0" "r" "0" "0" "0" "0" "0" "t" "q" "0" "0" "s" "0" "r" 0 "0" 0 "0" "0" 0 0 0 "0" "2" 10 173 "s" "s" "s" "0" "0" "s" "s" "0" "t" "t" "s" "t" "t" "r" "s" "0" "q" "0" "0" 0 "0" 0 "0" "0" 0 0 0 "0" "4" 10 993 "s" "0" "q" "s" "s" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" 0 "0" 0 "0" "0" 0 0 0 "0" "5" 10 1712 "t" "0" "s" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "s" "0" "t" "0" 0 "0" 0 "0" "0" 0 0 0 "0" The code that I have tried is: x=read.table("sample.txt") nrowx=nrow(x) for(i in 1:nrowx) { count=0 for(j in 3:30) { if(x[i,j]!=0) count = count+1 } if(count<4) x[i,]=NA } x=x[complete.cases(x),] Please suggest some method that doesn't involve loop.
It looks like none of your rows have less than four non-zero entries: For example, printing the number of nonzero entries per row with tab being your table: apply(tab, 1, function(x)sum(x!="0")) [1] 12 16 5 7 7 5 To for example eliminate all rows which have less than 5 nonzero entries, you could do tab[-which(apply(tab, 1, function(x)sum(x!="0"))<=5),] I am not sure if the first column in your data is treated as a column in your data frame, however. Does this help?
Convert row data to binary columns
I am attempting to format a column of data into many binary columns to eventually use for association rule mining. I have had some success using a for loop and a simple triplet matrix, but I am unsure how to aggregate by the levels in the first column thereafter--similar to a group by statement in SQL. I have provided an example below, albeit with a much smaller data set--if successful my actual data set will be 4,200 rows by 3,902 columns so any solution needs to be scaleable. Any suggestions or alternative approaches would be greatly appreciated! > data <- data.frame(a=c('sally','george','andy','sue','sue','sally','george'), b=c('green','yellow','green','yellow','purple','brown','purple')) > data a b 1 sally green 2 george yellow 3 andy green 4 sue yellow 5 sue purple 6 sally brown 7 george purple x <- data[,1] for(i in as.numeric(2:ncol(data))) x <- cbind(x, simple_triplet_matrix(i=1:nrow(data), j=as.numeric(data[,i]), v = rep(1,nrow(data)), dimnames = list(NULL, levels(data[,i]))) ) ##Looks like this: > as.matrix(x) name brown green purple yellow [1,] "sally" "0" "1" "0" "0" [2,] "george" "0" "0" "0" "1" [3,] "andy" "0" "1" "0" "0" [4,] "sue" "0" "0" "0" "1" [5,] "sue" "0" "0" "1" "0" [6,] "sally" "1" "0" "0" "0" ##Need to aggregate by Name ##Would like it to look like this: name brown green purple yellow [1,] "sally" "1" "1" "0" "0" [2,] "george" "0" "0" "0" "1" [3,] "andy" "0" "1" "0" "0" [4,] "sue" "0" "0" "1" "1"
This should do the trick: ## Get a contingency table of counts X <- with(data, table(a,b)) ## Massage it into the format you're wanting cbind(name = rownames(X), apply(X, 2, as.character)) # name brown green purple yellow # [1,] "andy" "0" "1" "0" "0" # [2,] "george" "0" "0" "1" "1" # [3,] "sally" "1" "1" "0" "0" # [4,] "sue" "0" "0" "1" "1"