I have following matrix with numbers of 0 and 1 with always the same number of strings per column, but also containing columns with only one string. I would like to to split each number into separate columns, that only one number per column and row occurs. But I would like to leave the columns with only one string as it is:
r1 <- c("0","001","0001","01","100")
r2 <- c("1","001","0001","10","100")
r3 <- c("0","100","1000","10","010")
r4 <- c("0","010","0100","10","001")
r5<- c("0","010","0010","10","001")
n.mat <- rbind(r1,r2,r3,r4,r5)
The output:
r1 <- c("0","0","0","1","0","0","0","1","0","1","1","0","0")
r2 <- c("1","0","0","1","0","0","0","1","1","0","1","0","0")
r3 <- c("0","1","0","0","1","0","0","0","1","0","0","1","0")
r4 <- c("0","0","1","0","0","1","0","0","1","0","0","0","1")
r5 <- c("0","0","1","0","0","0","1","0","1","0","0","0","1")
n.mat_new <- rbind(r1,r2,r3,r4,r5)
My code, but it crashes, because of the columns with only one string:
n.mat <- do.call(cbind, apply(n.mat, 2, function(x) {
tmp <-strsplit(x, '')
t(sapply(tmp, `[`, 1:max(lengths(tmp))))
}))
There's no need for apply or paste for this specific problem. Simply transpose the matrix, split all the strings, and re-construct the matrix according to the number of rows in the original matrix.
matrix(unlist(strsplit(t(n.mat), "")), nrow = nrow(n.mat), byrow = TRUE)
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
# [1,] "0" "0" "0" "1" "0" "0" "0" "1" "0" "1" "1" "0" "0"
# [2,] "1" "0" "0" "1" "0" "0" "0" "1" "1" "0" "1" "0" "0"
# [3,] "0" "1" "0" "0" "1" "0" "0" "0" "1" "0" "0" "1" "0"
# [4,] "0" "0" "1" "0" "0" "1" "0" "0" "1" "0" "0" "0" "1"
# [5,] "0" "0" "1" "0" "0" "0" "1" "0" "1" "0" "0" "0" "1"
If you want further optimizations, you can do something like the following, which will retain the rownames
matrix(unlist(strsplit(t(n.mat), "", TRUE), use.names = FALSE),
nrow = nrow(n.mat), byrow = TRUE,
dimnames = list(rownames(n.mat), NULL))
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
# r1 "0" "0" "0" "1" "0" "0" "0" "1" "0" "1" "1" "0" "0"
# r2 "1" "0" "0" "1" "0" "0" "0" "1" "1" "0" "1" "0" "0"
# r3 "0" "1" "0" "0" "1" "0" "0" "0" "1" "0" "0" "1" "0"
# r4 "0" "0" "1" "0" "0" "1" "0" "0" "1" "0" "0" "0" "1"
# r5 "0" "0" "1" "0" "0" "0" "1" "0" "1" "0" "0" "0" "1"
By avoiding apply, you're only calling strsplit once, so you're going to notice much better performance if you have a lot of rows to process.
On my Chromebook (so these times are likely to be slow to begin with) testing with 10,000 rows, I get the following:
nrow(n.mat)
# [1] 10000
bench::mark(am_opt(), am(), gki(), jay(), check = FALSE)
# # A tibble: 4 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
# <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
# 1 am_opt() 28.3ms 40.1ms 27.4 2.75MB 0 14 0 511ms
# 2 am() 36.1ms 41.2ms 24.6 2.75MB 0 13 0 528ms
# 3 gki() 220.3ms 229.4ms 4.39 3.43MB 0 3 0 683ms
# 4 jay() 975.8ms 975.8ms 1.02 3.51MB 1.02 1 1 976ms
# # … with 4 more variables: result <list>, memory <list>, time <list>, gc <list>
I didn't benchmark Karthik's answer because just running it once took more than 1 minute.
system.time(karthik())
# user system elapsed
# 81.341 0.000 81.343
Where the functions are directly copied from the other answers:
am_opt <- function() {
matrix(unlist(strsplit(t(n.mat), "", TRUE), use.names = FALSE),
nrow = nrow(n.mat), byrow = TRUE,
dimnames = list(rownames(n.mat), NULL))
}
am <- function() matrix(unlist(strsplit(t(n.mat), "")), nrow = nrow(n.mat), byrow = TRUE)
gki <- function() matrix(unlist(apply(n.mat, 1, strsplit, split = "")), nrow(n.mat), byrow=TRUE)
jay <- function() t(apply(n.mat, 1, function(x) el(strsplit(Reduce(paste0, x), ""))))
karthik <- function() bind_rows(apply(n.mat, 2, strsplit, split = '')) %>% t
Collapse paste0 using Reduce and use strsplit on "".
t(apply(n.mat, 1, function(x) el(strsplit(Reduce(paste0, x), ""))))
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
# r1 "0" "0" "0" "1" "0" "0" "0" "1" "0" "1" "1" "0" "0"
# r2 "1" "0" "0" "1" "0" "0" "0" "1" "1" "0" "1" "0" "0"
# r3 "0" "1" "0" "0" "1" "0" "0" "0" "1" "0" "0" "1" "0"
# r4 "0" "0" "1" "0" "0" "1" "0" "0" "1" "0" "0" "0" "1"
# r5 "0" "0" "1" "0" "0" "0" "1" "0" "1" "0" "0" "0" "1"
Does this work:
library(dplyr)
bind_rows(apply(n.mat, 2, strsplit, split = '')) %>% t
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
r1 "0" "0" "0" "1" "0" "0" "0" "1" "0" "1" "1" "0" "0"
r2 "1" "0" "0" "1" "0" "0" "0" "1" "1" "0" "1" "0" "0"
r3 "0" "1" "0" "0" "1" "0" "0" "0" "1" "0" "0" "1" "0"
r4 "0" "0" "1" "0" "0" "1" "0" "0" "1" "0" "0" "0" "1"
r5 "0" "0" "1" "0" "0" "0" "1" "0" "1" "0" "0" "0" "1"
You can use strsplit in apply, unlist the result and create with this a matrix.
matrix(unlist(apply(n.mat, 1, strsplit, split = "")), nrow(n.mat), byrow=TRUE)
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
#[1,] "0" "0" "0" "1" "0" "0" "0" "1" "0" "1" "1" "0" "0"
#[2,] "1" "0" "0" "1" "0" "0" "0" "1" "1" "0" "1" "0" "0"
#[3,] "0" "1" "0" "0" "1" "0" "0" "0" "1" "0" "0" "1" "0"
#[4,] "0" "0" "1" "0" "0" "1" "0" "0" "1" "0" "0" "0" "1"
#[5,] "0" "0" "1" "0" "0" "0" "1" "0" "1" "0" "0" "0" "1"
Is it possible to generate 0/1 character matrices like those shown below right from bifurcating phylogenetic trees like those on the left. The 1 in the matrix indicates presence of a shared character that unites the clades.
This code generates nice random trees but I have no idea where to begin to turn the results into a character matrix.
library(ape) # Other package solutions are acceptable
forest <- rmtree(N = 2, n = 10, br = NULL)
plot(forest)
To be clear, I can use the following code to generate random matrices, and then plot the trees.
library(ape)
library(phangorn)
ntaxa <- 10
nchar <- ntaxa - 1
char_mat <- array(0, dim = c(ntaxa, ntaxa - 1))
for (i in 1:nchar) {
char_mat[,i] <- replace(char_mat[,i], seq(1, (ntaxa+1)-i), 1)
}
char_mat <- char_mat[sample.int(nrow(char_mat)), # Shuffle rows
sample.int(ncol(char_mat))] # and cols
# Ensure all branch lengths > 0
dist_mat <- dist.gene(char_mat) + 0.5
upgma_tree <- upgma(dist_mat)
plot.phylo(upgma_tree, "phylo")
What I want is to generate random trees, and then make the matrices from those trees. This solution does not make the right type of matrix.
Edit for clarity: I am generating binary character matrices that students can use to draw phylogenetic trees using simple parsimony. The 1 character represents homologies that unite taxa into clades. So, all rows must share one character (a 1 across all rows in one column) and some characters must be shared by only two taxa. (I'm discounting autapomorphies.)
Examples:
you can have a look at the rTraitDisc function in ape that is pretty straight forward:
library(ape)
## You'll need to simulate branch length!
forest <- rmtree(N = 2, n = 10)
## Generate on equal rate model character
(one_character <- rTraitDisc(forest[[1]], type = "ER", states = c(0,1)))
# t10 t7 t5 t9 t1 t4 t2 t8 t3 t6
# 0 0 0 1 0 0 0 0 0 0
# Levels: 0 1
## Generate a matrix of ten characters
(replicate(10, rTraitDisc(forest[[1]], type = "ER", states = c(0,1))))
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# t10 "0" "0" "0" "0" "1" "0" "0" "0" "0" "0"
# t7 "0" "0" "0" "0" "1" "0" "0" "0" "0" "0"
# t5 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t9 "0" "0" "1" "0" "0" "0" "0" "0" "0" "0"
# t1 "0" "0" "1" "0" "0" "0" "0" "0" "0" "0"
# t4 "0" "0" "1" "0" "0" "0" "0" "0" "0" "0"
# t2 "0" "0" "1" "0" "0" "0" "0" "0" "0" "0"
# t8 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t3 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t6 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
To apply it to multiple tree, the best would be to create a lapply function like so:
## Lapply wrapper function
generate.characters <- function(tree) {
return(replicate(10, rTraitDisc(tree, type = "ER", states = c(0,1))))
}
## Generate 10 character matrices for each tree
lapply(forest, generate.characters)
# [[1]]
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# t10 "0" "0" "0" "1" "0" "0" "0" "0" "0" "0"
# t7 "0" "0" "0" "1" "0" "0" "0" "0" "0" "0"
# t5 "0" "0" "0" "1" "0" "0" "0" "0" "0" "0"
# t9 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t1 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t4 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t2 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t8 "0" "0" "0" "1" "0" "1" "0" "0" "0" "1"
# t3 "0" "0" "0" "0" "0" "1" "0" "0" "0" "0"
# t6 "0" "0" "0" "0" "0" "1" "0" "0" "0" "0"
# [[2]]
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# t7 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t9 "1" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t5 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t2 "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# t4 "0" "1" "0" "0" "1" "0" "0" "0" "0" "0"
# t6 "0" "1" "0" "0" "1" "0" "0" "0" "0" "0"
# t10 "0" "1" "1" "0" "1" "1" "0" "0" "0" "1"
# t8 "0" "1" "1" "0" "1" "0" "0" "0" "0" "0"
# t3 "0" "1" "0" "0" "0" "0" "0" "0" "0" "0"
# t1 "0" "1" "0" "0" "0" "0" "0" "0" "0" "0"
Another option is to use the sim.morpho from the dispRity package. This function reuses the rTraitDisc function but has a bit more models implemented and alllows the rates to be provided as distributions from which to sample. It also allows characters to look a bit more "realistic" without to much invariant data and insuring that the generated character "looks" like a real morphological character (like with the right amount of homoplasy, etc...).
library(dispRity)
## You're first tree
tree <- forest[[1]]
## Setting up the parameters
my_rates = c(rgamma, rate = 10, shape = 5)
my_substitutions = c(runif, 2, 2)
## HKY binary (15*50)
matrixHKY <- sim.morpho(tree, characters = 50, model = "HKY",
rates = my_rates, substitution = my_substitutions)
## Mk matrix (15*50) (for Mkv models)
matrixMk <- sim.morpho(tree, characters = 50, model = "ER", rates = my_rates)
## Mk invariant matrix (15*50) (for Mk models)
matrixMk <- sim.morpho(tree, characters = 50, model = "ER", rates = my_rates,
invariant = FALSE)
## MIXED model invariant matrix (15*50)
matrixMixed <- sim.morpho(tree, characters = 50, model = "MIXED",
rates = my_rates, substitution = my_substitutions, invariant = FALSE,
verbose = TRUE)
I suggest you have a read at the sim.morpho function for the proper references on how the model work or at the relevant section in the dispRity package manual.
I figured out how to make the matrix using Descendants from the phangorn package. I still have to tweak it with suitable node labels to match the example matrix in the original question, but the framework is there.
library(ape)
library(phangorn)
ntaxa <- 8
nchar <- ntaxa - 1
tree <- rtree(ntaxa, br = NULL)
# Gets descendants, but removes the first ntaxa elements,
# which are the individual tips
desc <- phangorn::Descendants(tree)[-seq(1, ntaxa)]
char_mat <- array(0, dim = c(ntaxa, nchar))
for (i in 1:nchar) {
char_mat[,i] <- replace(char_mat[,i], y <- desc[[i]], 1)
}
rownames(char_mat) <- tree$tip.label
char_mat
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7]
#> t6 1 1 0 0 0 0 0
#> t3 1 1 1 0 0 0 0
#> t7 1 1 1 1 0 0 0
#> t2 1 1 1 1 1 0 0
#> t5 1 1 1 1 1 0 0
#> t1 1 0 0 0 0 1 1
#> t8 1 0 0 0 0 1 1
#> t4 1 0 0 0 0 1 0
plot(tree)
Created on 2019-01-28 by the reprex package (v0.2.1)
I have created a matrix like this:
> head(matrix)
Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 Var11
[1,] "0" "0" "1" "0" "1" "1" "0" "0" "0" "0" "NA"
[2,] "1" "0" "1" "0" "1" "1" "0" "0" "0" "0" "NA"
[3,] "0" "1" "1" "0" "1" "1" "0" "0" "0" "0" "NA"
[4,] "1" "1" "1" "0" "1" "1" "0" "0" "0" "0" "NA"
[5,] "0" "0" "2" "0" "1" "1" "0" "0" "0" "0" "NA"
[6,] "1" "0" "2" "0" "1" "1" "0" "0" "0" "0" "NA"
Now, I want to compare the matrix above with the following data frame:
> head(df)
cod Var11 Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 Var12
1 C000354 B 1 1 4 0 1 2 0 0 0 1 51520.72
2 C000404 A 1 0 1 0 4 4 0 0 1 1 21183.25
3 C000444 A 1 0 4 1 3 3 0 0 0 1 67504.74
4 C000480 A 1 1 2 0 2 3 0 0 1 1 26545.92
5 C000983 C 1 0 1 0 3 4 0 0 0 0 10379.37
6 C000985 C 1 0 3 1 3 4 0 0 0 0 18660.99
Matrix contains all possible combinations of the variables Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10, so basically when a row of df (only column from VAR1 to VAR10) match with a row of matrix and this row in df had a Var12>=90000, I would like it to be written "A" in corresponding column VAR11 of matrix.
I have tried with this:
for (i in 1 : nrow(matrix)) {
for (j in 1 : 10) {
ifelse(matrix[i,j]==df[,(j+2)]
&& df$Var12[] >= 90000,
matrix[i,"Var11"] <- "A",
matrix[i,"Var11"] <- "NA")
}
}
But this writes NA in all rows of matrix.
Does anyone know why this happen or how to solve it?
Thanks in advance.
I don't understand why you used 1:10 and j+2 in your loop.
#Some dummy data
col_to_match<-paste0("V",1:10)
set.seed(123)
mat <- cbind(matrix(sample(0:4, 100, replace=TRUE), ncol=10), "NA")
colnames(mat)<-c(col_to_match,"V11")
set.seed(123)
df<- data.frame("cod"=paste0("C",1:20), "V12"= runif(20,min=88000,max=95000))
set.seed(1)
df <- cbind(df, rbind(mat[3:10,col_to_match], matrix(sample(0:4, 120, replace=TRUE), ncol=10)) )
From the dummy data, we expect the rows of the matrix
c(3:10)[df[1:8,"V12"]>=90000] to match the dummy data. Those are rows 3 4 5 6 7 9 10.
Run the following to check for every row in matrix, find whether there are any matching rows in df, and whether the V12 value is greater than 90000.
for(i in 1:nrow(mat)){
hasMatch<-any(sapply(1:nrow(df), function(j) all( df[j,col_to_match] == mat[i, col_to_match] ) && df[j,"V12"]>=90000 ))
if(hasMatch) mat[i, "V11"]<-"A"
}
Output
> mat
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
[1,] "1" "4" "4" "4" "0" "0" "3" "3" "1" "0" "NA"
[2,] "3" "2" "3" "4" "2" "2" "0" "3" "3" "3" "NA"
[3,] "2" "3" "3" "3" "2" "3" "1" "3" "2" "1" "A"
[4,] "4" "2" "4" "3" "1" "0" "1" "0" "3" "3" "A"
[5,] "4" "0" "3" "0" "0" "2" "4" "2" "0" "1" "A"
[6,] "0" "4" "3" "2" "0" "1" "2" "1" "2" "0" "A"
[7,] "2" "1" "2" "3" "1" "0" "4" "1" "4" "3" "A"
[8,] "4" "0" "2" "1" "2" "3" "4" "3" "4" "0" "NA"
[9,] "2" "1" "1" "1" "1" "4" "3" "1" "4" "2" "A"
[10,] "2" "4" "0" "1" "4" "1" "2" "0" "0" "2" "A"
This question already has answers here:
R counting strings variables in each row of a dataframe
(2 answers)
Closed 6 years ago.
I have a dataframe like below sample:
A B C
[1,] "A1" "B3" "C1"
[2,] "A2" "B1" "C2"
[3,] "A3" "B3" "C3"
[4,] "A1" "B2" "C3"
[5,] "A3" "B3" "C2"
[6,] "A1" "B1" "C1"
And I would like to reshape it like this, to expand every unique value of variables to a single variable, and mark 1/0 in the value field. Above data frame shall be reshaped to this:
A B1 B2 B3 C1 C2 C3
[1,] "A1" "0" "0" "1" "1" "0" "0"
[2,] "A2" "1" "0" "0" "0" "1" "0"
[3,] "A3" "0" "0" "1" "0" "0" "1"
[4,] "A1" "0" "1" "0" "0" "0" "1"
[5,] "A3" "0" "0" "1" "0" "1" "0"
[6,] "A1" "1" "0" "0" "1" "0" "0"
The real data is in huge amount (>100 thousand per day, and much more fields and unique values. So I need a high efficiency program instead of using for...
I believe you could help... I am a beginner, only know for... :(
You can try this too (with base R):
df <- cbind(as.character(df$A), model.matrix(~B+C+0,df,list(B=contrasts(df$B, contrasts=F),
C=contrasts(df$C, contrasts=F))))
dimnames(df) <- list(NULL, c('A', paste0('B',1:3), paste0('C',1:3)))
df
# A B1 B2 B3 C1 C2 C3
#[1,] "A1" "0" "0" "1" "1" "0" "0"
#[2,] "A2" "1" "0" "0" "0" "1" "0"
#[3,] "A3" "0" "0" "1" "0" "0" "1"
#[4,] "A1" "0" "1" "0" "0" "0" "1"
#[5,] "A3" "0" "0" "1" "0" "1" "0"
#[6,] "A1" "1" "0" "0" "1" "0" "0"
We can use
library(qdapTools)
cbind(df1[1], mtabulate(as.data.frame(t(df1[-1]))))
# A B3 C1 B1 C2 C3 B2
#V1 A1 1 1 0 0 0 0
#V2 A2 0 0 1 1 0 0
#V3 A3 1 0 0 0 1 0
#V4 A1 0 0 0 0 1 1
#V5 A3 1 0 0 1 0 0
#V6 A1 0 1 1 0 0 0