Convert matrix values in vector with some conditions - r

I'd like to convert matrix values in vector with some conditions. In my example:
# Create a 10 x 3 example matrix of non-negative values.
# Draw exactly nrow * ncol numbers: when more data is supplied than the
# matrix can hold, the surplus is discarded (recent R versions also warn
# about the length mismatch).
mymatrix <- matrix(
  # Absolute values of standard-normal draws, one per cell
  abs(rnorm(10 * 3)),
  # No of rows
  nrow = 10,
  # No of columns
  ncol = 3,
  # By default matrices are filled column-wise;
  # byrow = TRUE fills them row by row instead
  byrow = TRUE
)
# Naming rows (labels are stored as character strings)
rownames(mymatrix) <- 1:10
# Naming columns
colnames(mymatrix) <- c("1", "2", "3")
mymatrix
# 1 2 3
#1 0.85882558 1.38755611 0.369197570
#2 1.58785948 1.13064411 1.542977629
#3 0.35293056 1.44036121 1.806414543
#4 0.02709663 1.25620400 0.794001157
#5 0.34426152 0.32365824 2.026024465
#6 0.03608507 1.12315562 1.072635275
#7 0.39055300 0.49463748 0.645037388
#8 0.33406392 0.63543332 0.005055208
#9 1.04796081 0.04062249 2.330948193
#10 0.42538451 0.24574490 0.268357588
I'd like to convert my matrix to vector (myvector) using a custom rule:
If column 1 (mymatrix[,1]) holds the maximum value in a row and that value is >= 0.95, then the result for that row is "1"; if column 1 holds the maximum but the value is < 0.95, then the result is "misclassified". If the maximum is in column 2 or column 3 (mymatrix[,2] or mymatrix[,3]), the result is simply "2" or "3", respectively. My desired output is:
myvector
#[1] "2" "1" "3" "2" "3" "2" "3" "2" "1" "misclassified"
Please, any ideas?

Here's a vectorised option -
# Column index of the largest value in each row
res <- max.col(mymatrix)
# Rows where column 1 wins but its value is below the 0.95 threshold
low_first <- res == 1 & mymatrix[, 1] < 0.95
# Relabel those rows; assigning a string makes res a character vector
res[low_first] <- 'misclassified'
res
#[1] "2" "1" "3" "2" "3"
#[6] "2" "3" "2" "3" "misclassified"

It looks like you want to apply a function over your rows. So apply would be appropriate here:
# Classify each row by the position of its maximum; a maximum in column 1
# only counts as "1" when the value reaches 0.95, otherwise "misclassified".
apply(mymatrix, 1, function(row) {
  top <- which.max(row)
  # Columns other than the first are reported as-is
  if (top != 1) {
    return(as.character(top))
  }
  if (row[top] >= 0.95) "1" else "misclassified"
})
[1] "2" "1" "3" "2" "3"
[6] "2" "3" "2" "3" "misclassified"

You can try apply + ifelse
# Label every row with the column name of its maximum value.
# The 0.95 threshold applies ONLY when the maximum sits in column 1: such a
# row is "misclassified" if that value is below 0.95. Rows whose maximum is
# in any other column keep that column's name regardless of the value.
apply(
  mymatrix,
  1,
  function(x) {
    best <- which.max(x)
    # which.max() returns integer(0) when the row has no non-missing value
    if (length(best) == 0) {
      return(NA_character_)
    }
    # Plain if/else: the condition is a single value, so ifelse() is not needed
    if (best == 1 && x[best] < 0.95) {
      "misclassified"
    } else {
      colnames(mymatrix)[best]
    }
  }
)

Related

Creating unique object names for list entries

I have example data as follows:
listoflists=
list(
list(a = c(1,'a',3,4) , b = c(5,'b',6,7) ),
list(a = c(1,'a',2,6) , b = c(5,'b',0,8) ),
list(d = c(1,'a',2,6) , b = c(5,'b',0,8) ),
list(d = c(1,'a',2,3) , b = c(5,'b',0,8) , a = c(5,'b',0,8)),
list(d = c(1,'a',1,1))
)
I would like to rename the names (such as a and b), but only if they have already been used, for example by adding a number at the end.
I am having some trouble thinking of the right way to approach this..
Any suggestions?
Maybe there is a more elegant way but just going through the list and renaming items while keeping count of each name's uses works fine:
# Distinct element names across all sub-lists
names.used <- unique(unlist(sapply(listoflists, names)))
# Per-name tally of occurrences seen so far, initialised to zero
names.n <- setNames(rep(0, length(names.used)), names.used)
for (i in seq_along(listoflists)) {
  for (j in seq_along(listoflists[[i]])) {
    name.ij <- names(listoflists[[i]])[j]
    seen <- names.n[name.ij]
    # The first occurrence keeps its name; later ones get the tally appended
    if (seen > 0) {
      names(listoflists[[i]])[j] <- paste0(name.ij, seen)
    }
    # Record this occurrence
    names.n[name.ij] <- seen + 1
  }
}
# [[1]]
# [[1]]$a
# [1] "1" "a" "3" "4"
#
# [[1]]$b
# [1] "5" "b" "6" "7"
#
#
# [[2]]
# [[2]]$a1
# [1] "1" "a" "2" "6"
#
# [[2]]$b1
# [1] "5" "b" "0" "8"
#
#
# [[3]]
# [[3]]$d
# [1] "1" "a" "2" "6"
#
# [[3]]$b2
# [1] "5" "b" "0" "8"
#
#
# [[4]]
# [[4]]$d1
# [1] "1" "a" "2" "3"
#
# [[4]]$b3
# [1] "5" "b" "0" "8"
#
# [[4]]$a2
# [1] "5" "b" "0" "8"
#
#
# [[5]]
# [[5]]$d2
# [1] "1" "a" "1" "1"
Another approach would be to alter the elements names using make.unique:
# Element names of every sub-list, kept in the same nested shape
nms <- lapply(listoflists, names)
# Flatten the names and make duplicates unique by appending 1, 2, ...
# directly (empty separator)
unique_flat <- make.unique(unlist(nms), sep = "")
# Restore the nested shape so each sub-list gets its own names back
nms <- relist(unique_flat, skeleton = nms)
ll2 <- Map(setNames, listoflists, nms)
str(ll2)

rbind named vector to matrix with different lengths

I am trying to bind a named vector onto a matrix. The named vector has fewer elements than the matrix has columns:
> m <- matrix(data = c("1", "2", "3"),
nrow = 1, ncol = 3,
dimnames = list(c(),
c("column 1", "column 2", "column 3")))
> named_vec <- c("4", "5")
> names(named_vec) <- c("column 1", "column 2")
> rbind(m, named_vec)
I get the following:
Warning message:
In rbind(m, named_vec) :
number of columns of result is not a multiple of vector length (arg 2)
This has the undesired effect of repeating the shorter vector.
Also, plyr's rbind.fill function does not work here, since both arguments need to be data frames:
> plyr::rbind.fill(data.frame(m), data.frame(named_vec))
Error: All inputs to rbind.fill must be data.frames
My desired output is a matrix that fills in missing values with NA's instead of repeating the vector, like this:
column 1 column 2 column 3
[1,] "1" "2" "3"
[2,] "4" "5" NA
Below is a base R solution
do.call(rbind,lapply(u<-list(m,named_vec),`length<-`,max(lengths(u))))
such that
column 1 column 2 column 3
[1,] "1" "2" "3"
[2,] "4" "5" NA
If it is ok to convert the matrices to dataframe, you can use bind_rows.
dplyr::bind_rows(data.frame(m), data.frame(t(named_vec)))
# column.1 column.2 column.3
#1 1 2 3
#2 4 5 <NA>
We can use rbindlist
library(data.table)
rbindlist(list(as.data.frame(m), as.data.frame(t(named_vec))), fill = TRUE)

Append value to more than one position in vector [duplicate]

This question already has answers here:
Insert elements into a vector at given indexes
(8 answers)
insert elements in a vector in R
(6 answers)
Closed 4 years ago.
How do we append a single value to multiple positions in a vector?
x=c(1,2,3)
append(x, "a", c(1,3))
[1] "1" "a" "2" "3"
Warning messages:
1: In if (!after) c(values, x) else if (after >= lengx) c(x, values) else c(x[1L:after], :
條件的長度 > 1,因此只能用其第一元素
2: In if (after >= lengx) c(x, values) else c(x[1L:after], values, :
條件的長度 > 1,因此只能用其第一元素
3: In 1L:after : numerical expression has 2 elements: only the first used
4: In (after + 1L):lengx :
numerical expression has 2 elements: only the first used
With the above code, only the first position is registered, with a warning message.
lapply(c(1,3), function(y) append(x, 'a', y))
yields this result:
[[1]]
[1] "1" "a" "2" "3"
[[2]]
[1] "1" "2" "3" "a"
Expected output:
1 a 2 3 a
You can use the `Reduce` function:
x=1:10
pos=c(3,5,7,10)
Reduce(function(i,j)append(i,"a",j),cumsum(c(pos[1],diff(pos)+1)),init=x)
[1] "1" "2" "3" "a" "4" "5" "a" "6" "7" "a" "8" "9" "10" "a"

Selecting specific elements in a vector in R

I have a vector,
myvector <- c("a","b","c","cat","4","dog","cat","f"). I would like to select out those elements that immediately follow elements containing the string "cat".
I.e., I want myvector2 containing only "4" and "f". I'm not sure where to begin.
myvector <- c("a","b","c","cat","4","dog","cat","f")
where_is_cat <- which(myvector == "cat")
# [1] 4 7
myvector[where_is_cat + 1]
# [1] "4" "f"
myvector2 <- myvector[where_is_cat + 1]
Try this:
x[grep('cat',x)+1]
#[1] "4" "f"
You can subset list minus its first element (list[-1]) by indices where list minus its last element (list[-length(list)]) equals "cat"
list[-1][list[-length(list)]=="cat"]
# [1] "4" "f"

including missing values after random sampling in R (merge vectors and include missing values as 0)

I am trying to do many random sampling trials, and in these samplings I may not get everything every time.
Right now, what I do is
test <- sample(rownames(data), size=10000, replace=T, prob=data$refFraction)
Not every rowname(data) is represented in this, but I need it to be for the next step.
I would like to have it so each time I sample I have the same length (and order) vector so that I can combine each sampling into a matrix (which I also am unsure of how best to do - how can I make thousands of test vectors and merge them at once with one of the apply functions?)
edit: Based on answers, I came up with this:
#' Draw repeated weighted samples with replacement.
#'
#' @param fractions Sampling weights, passed to `sample()` as `prob`.
#' @param kmers Vector of items to sample from.
#' @param times Number of independent trials to run.
#' @param ref_size Number of draws per trial.
#' @return A list of length `times`; each element holds the `ref_size` items
#'   drawn in one trial.
trials <- function(fractions, kmers, times, ref_size) {
  # TRUE/FALSE are spelled out because T and F are ordinary variables that
  # can be reassigned, which would silently change the sampling behaviour.
  replicate(
    times,
    sample(kmers, size = ref_size, replace = TRUE, prob = fractions),
    simplify = FALSE
  )
}
result <- trials(data$refFraction, rownames(data), 100, 1000)
mat <- matrix(result, nrow=100)
But I still just want the count of the number of times each item is seen in each row, including zero counts, so that I end up with an even (rectangular) matrix of counts.
The desired result is something like:
"A" "B" "C"
Trial1 2 5 6
Trial2 3 7 12
Trial3 0 5 14
dput(head(data)):
structure(list(refCount = c(3142L, 4102L, 1975L, 2009L, 2363L,
2437L), refFraction = c(0.00300290255094, 0.00392040301208, 0.00188756605287,
0.00192006086086, 0.00225838915591, 0.00232911314979), readCount = c(147L,
719L, 356L, 418L, 745L, 766L), readFraction = c(0.00029577107721,
0.00144666261574, 0.000716289139367, 0.000841036124312, 0.00149897586749,
0.00154122887852), foldChange = c(2.31774884958, 0.996935198459,
0.968959564031, 0.825477549838, 0.409869676355, 0.412907501432
), p_value = c(5.05923221341436e-321, 4.46023836252119e-170,
2.29230878162415e-77, 1.73499617494115e-59, 2.80547347576314e-15,
4.32620038741552e-16)), .Names = c("refCount", "refFraction",
"readCount", "readFraction", "foldChange", "p_value"), row.names = c("AAAAA",
"AAAAT", "AAAAG", "AAAAC", "AAATA", "AAATT"), class = "data.frame")
It's not exactly clear what you are trying to do, but it seems like this might help.
replicate is great for repeated sampling. Here I create a 5 row data frame d, and then sample the row names ten separate times. When used this way, replicate results in a matrix, so it sounds like you may want this method.
> d <- data.frame(x = 1:5, y = 6:10)
> replicate(10, sample(rownames(d)))
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# [1,] "5" "1" "1" "3" "4" "1" "4" "5" "3" "1"
# [2,] "4" "5" "2" "2" "3" "5" "1" "2" "1" "2"
# [3,] "1" "4" "5" "5" "5" "4" "3" "3" "2" "3"
# [4,] "2" "3" "3" "1" "1" "2" "2" "4" "4" "5"
# [5,] "3" "2" "4" "4" "2" "3" "5" "1" "5" "4"
This is how I ended up doing it:
#' Repeatedly subsample `counts` without replacement.
#'
#' Reads two free variables, `num_trials` and `trial_size`, from the
#' enclosing environment; they are not defined in this function and must be
#' set before it is called.
#'
#' @param counts Vector to sample from.
#' @return A list of `num_trials` vectors, each holding `trial_size` elements
#'   drawn from `counts`.
trial_fn <- function(counts) {
  # FALSE is spelled out because F is an ordinary variable that can be
  # reassigned.
  replicate(
    num_trials,
    sample(counts, size = trial_size, replace = FALSE),
    simplify = FALSE
  )
}
#' Relative frequency of each of the values 1..1024 in `x`.
#'
#' @param x Vector of observations; values outside 1..1024 are not counted.
#' @return An unnamed numeric vector of length 1024 whose entries are the
#'   per-level counts divided by the total count.
tableize <- function(x) {
  # Fixing the factor levels keeps zero-count levels in the tabulation
  level_counts <- as.vector(table(factor(x, levels = 1:1024)))
  level_counts / sum(level_counts)
}
# Expand the per-row read counts into one flat vector of row indices: index i
# appears data$readCount[i] times. A single vectorised rep() replaces growing
# the vector with c() inside a loop, which re-copies it on every iteration.
counts <- rep(seq_len(1024), times = data$readCount[seq_len(1024)])
# Draw the subsamples, then turn each one into a length-1024 frequency vector
trials <- trial_fn(counts)
# vapply() pins the result to a 1024-row numeric matrix (one column per
# trial), even when there are zero trials
trial_table <- vapply(trials, tableize, numeric(1024))
Using factor with levels and then using table on the result is the answer to the original question.

Resources