simplifying DNA/ternary encryption process - r

I wanted to simplify a DNA/ternary encryption algorithm. On one hand I have the encrypiton process where I transform a set of base-3 numbers triads (0,1,2) to DNA nucleotides (A,T,C,G). And the de-encryption process that just go the other way around.
The main point of this encryption is to not have two identical consecutive nucleotides together. So if even if the original vector is c(0,0,0,0) the DNA encrypted won´t be c("A","A","A","A"). This is achived by following the tables's algorithm.
Where to deal with the first triad/nucleotide you assume the previus nucleotide is "A". Lest's see a couple of examples.
Encrypting: c(0,0,0,0) -> c("C","G","T","A")
De-encrypting c("T","A","C","G") -> c(2,0,0,0)
This is how I have automated the processes:
Encryption
> S4
[1] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "2" "0" "2" "1" "0" "2" "1" "2"
[31] "1" "1" "0" "1" "0" "1" "1" "0" "0" "0" "1" "0" "1" "2" "1" "0" "0" "0" "0" "0"
S5 <-c(1:length(S4))
for (i in 1:length(S4)) {
if (i == 1 ) {
if (S4[1] == "0") {
S5[1] <- "C"
}
if (S4[1] == "1") {
S5[1] <- "G"
}
if (S4[1] == "2") {
S5[1] <- "T"
}
}
if (i != 1) {
if (S5[i-1] == "A" && S4[i] == "0") {
S5[i] <- "C"
}
if (S5[i-1] == "A" && S4[i] == "1") {
S5[i] <- "G"
}
if (S5[i-1] == "A" && S4[i] == "2") {
S5[i] <- "T"
}
if (S5[i-1] == "C" && S4[i] == "0") {
S5[i] <- "G"
}
if (S5[i-1] == "C" && S4[i] == "1") {
S5[i] <- "T"
}
if (S5[i-1] == "C" && S4[i] == "2") {
S5[i] <- "A"
}
if (S5[i-1] == "G" && S4[i] == "0") {
S5[i] <- "T"
}
if (S5[i-1] == "G" && S4[i] == "1") {
S5[i] <- "A"
}
if (S5[i-1] == "G" && S4[i] == "2") {
S5[i] <- "C"
}
if (S5[i-1] == "T" && S4[i] == "0") {
S5[i] <- "A"
}
if (S5[i-1] == "T" && S4[i] == "1") {
S5[i] <- "C"
}
if (S5[i-1] == "T" && S4[i] == "2") {
S5[i] <- "G"
}
}
}
> S5
[1] "CGTACGTACGTACGTACGTACGCGCTATCAGACTAGACGTCGATCGTACG"
De-encryption
adn <- S5
adn <- unlist(strsplit(adn, split = ""))
adn
s4 <- c()
for (i in 1:length(adn)) { #Loops to transform DNA sequence to terminary according to manual
if (i != 1) {
if (adn[i-1] == "A" && adn[i] == "C") {
s4[i] <- 0
}
if (adn[i-1] == "A" && adn[i] == "G") {
s4[i] <- 1
}
if (adn[i-1] == "A" && adn[i] == "T") {
s4[i] <- 2
}
if (adn[i-1] == "C" && adn[i] == "G") {
s4[i] <- 0
}
if (adn[i-1] == "C" && adn[i] == "T") {
s4[i] <- 1
}
if (adn[i-1] == "C" && adn[i] == "A") {
s4[i] <- 2
}
if (adn[i-1] == "G" && adn[i] == "T") {
s4[i] <- 0
}
if (adn[i-1] == "G" && adn[i] == "A") {
s4[i] <- 1
}
if (adn[i-1] == "G" && adn[i] == "C") {
s4[i] <- 2
}
if (adn[i-1] == "T" && adn[i] == "A") {
s4[i] <- 0
}
if (adn[i-1] == "T" && adn[i] == "C") {
s4[i] <- 1
}
if (adn[i-1] == "T" && adn[i] == "G") {
s4[i] <- 2
}
}
if (i == 1 ) {
if (adn[1] == "C") {
s4[1] <- 0
}
if (adn[1] == "G") {
s4[1] <- 1
}
if (adn[1] == "T") {
s4[1] <- 2
}
}
}
> s4
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 1 0 2 1 2 1 1 0 1 0 1 1 0 0 0 1 0 1 2 1 0 0 0 0 0
Question
How do I make this couple of steps more elegant?

We can simplify this by making a data.frame containing the encodings, which lets us get access them without testing for each combination:
enc <- data.frame(A=c('C','G','T'),
C=c('G','T','A'),
G=c('T','T','C'),
T=c('A','C','G'),
row.names = c(0,1,2))
The decode function is simple, since we have all the information we need upfront. We just go through the string, getting the current base, the previous base (defaulting to 'A' if absent), then using them to look up where in the table matches up with those values, returning the rowname which has the value for the trit:
decode <- function(d) {
sapply(seq_along(d), function(i) {
base = d[i]
prev = ifelse(length(d[i-1]) == 0, 'A', d[i-1])
rownames(enc)[which(enc[[prev]] == base)]
})
}
decode(c('T', 'A', 'C', 'G'))
[1] 2 0 0 0
Encoding is a bit harder since we need to look at the previously encoded values, but it's pretty much the same. The only difference is that we need to use a for loop and store each value in a list as we encode them, so we can look back and find the previous trit.
encode <- function(e) {
res <- list()
for (i in seq_along(e)) {
trit = e[i]
prev = ifelse(length(res[i-1]) == 0, 'A', res[[i-1]])
res[[i]] <- enc[as.character(trit), prev] # Select row by name, not index
}
return(unlist(res)) # unlist to return vector, not list
}
encode(c(0,0,0,0))
[1] "C" "G" "T" "A"

Related

How to use NA in if..else if statement in R

I created this small example. I want to print some values, for example, B for NA values using the if else statement.
x = c(1,7,NA, 3, NA, NA)
for(i in 1: length(x)){
y = x[i]
if(y == 1){
print("A")
}
else if(y == 'NA'){
print("B")
}
else{
print("C")
}
}
I am getting an error message Error in if (y == 1) { : missing value where TRUE/FALSE needed Why can't I print B for NA values? How to use NA within the if else statement?
The issue is also that == with NA returns NA and not FALSE. Make sure to add a condition to check NA as well. Also, y == 'NA' should be replaced with is.na(y)
for(i in 1:length(x)){
y = x[i]
if(y == 1 & !is.na(y)){
print("A")
}
else if(is.na(y)){
print("B")
}
else{
print("C")
}
}
-output
[1] "A"
[1] "C"
[1] "B"
[1] "C"
[1] "B"
[1] "B"
Or this can be done in a vectorized way
c("C", "B", "A")[1 + is.na(x) + 2 *(x %in% 1)]
#[1] "A" "C" "B" "C" "B" "B"
To avoid repetition, ensure that the first block checks for NA:
x = c(1,7,NA, 3, NA, NA)
for(i in 1: length(x)){
y = x[i]
if(is.na(y)){
print("B")
}
else if(y == 1){
print("A")
}
else{
print("C")
}
}
[1] "A"
[1] "C"
[1] "B"
[1] "C"
[1] "B"
[1] "B"
You can use vectorized way using case_when or nested ifelse -
dplyr::case_when(is.na(x) ~ 'B',
x == 1 ~ 'A',
TRUE ~ 'C')
#[1] "A" "C" "B" "C" "B" "B"

R - subscript out of bounds on if statement

I have a ncol=10, nrow=343 matrix "E" containing letters "a", "b",..,or "f" standing for technologies, where "a" stands for gas. Whenever [row, col]="a" and [row+1,col] !="a" I add resale values for "a" in [row,col] from s_house_gas dim(ncol=10,nrow=10) where the rows are the years like in E. Additionally, resale values for "a" shall be added in E[5,] to account for a resale in the last year.
The code below works, however, I want to add the condition that resale values shall not be added when [row+1] =="d" or "e" but I cannot seem to make it work.
Maybe I have looked at it for too long but I would really appreciate your support here, I feel like I am very close.
HV <- rep.int(0, ncol(E))
R<- rbind(E,HV)
i <- 0
for(col in 1:ncol(R)){
for(row in 1:nrow(R)){
if(R[row,col] == "a" && row <= 10){
i = i+1
R[row,col] = i
} else if (R[row,col] != "a" || row == 11){
i = 0
}
}
}
R <- R[-11,]
Y <- R
for(row in 1:nrow(Y)){
for(col in 1:ncol(Y)){
if(Y[row,col] == "a" || Y[row,col] == "b"|| Y[row,col] == "c" || Y[row,col] == "d"|| Y[row,col] == "e"|| Y[row,col] == "f"){
Y[row,col] = 0
}
}
}
Y <- apply(Y, 2,as.numeric)
L <- rbind(HV,Y,HV)
M <- matrix(0L, nrow = dim(L)[1], ncol = dim(L)[2])
for(col in 1:ncol(L)){
for (row in 2:nrow(L)) {
if(L[row,col] == 0 && L[row-1,col] != 0) {
M[row,col] = row-1-L[row-1,col]
}
}
}
I <- M
N <- matrix(0, nrow = dim(I)[1], ncol = dim(I)[2])
i <- 0
j <- 0
for(row in 2:nrow(I)){
for(col in 1:ncol(I)){
if(I[row,col] != 0){
i = L[row-1,col]
j = M[row,col]
N[row-1,col] = s_house_gas[i, j]
}
}
}
resale_gas <- N[c(-1,-12),]
resale_gas <- matrix(resale_gas, ncol=343, nrow=10)
EDIT
library(xts)
library(stringi)
library(gtools)
t <- c("a","b","c","d","e","f")
E <- t(permutations(6,5, v=t,repeats.allowed=T))
s_house_gas <- 15:11
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,] "a" "a" "a" "a" "a" "a" "a" "a"
[2,] "a" "a" "a" "a" "a" "a" "a" "a"
[3,] "a" "a" "a" "a" "a" "a" "a" "a"
[4,] "a" "a" "a" "a" "a" "a" "b" "b"
[5,] "a" "b" "c" "d" "e" "f" "a" "b"
This is what I tried:
HV <- rep.int(0, ncol(E))
R<- rbind(E,HV)
i <- 0
for(col in 1:ncol(R)){
for(row in 1:(nrow(R)-1)){
if(R[row,col] == "a" && R[row+1,col] != "d" && row <= 10 || R[row,col] == "a" && R[row+1,col] != "e" && row <= 10){
i = i+1
R[row,col] = i
} else if (R[row,col] != "a" || row == 11){
i = 0
}
}
}
R <- R[-11,]
Y <- R
for(row in 1:nrow(Y)){
for(col in 1:ncol(Y)){
if(Y[row,col] == "a" || Y[row,col] == "b"|| Y[row,col] == "c" || Y[row,col] == "d"|| Y[row,col] == "e"|| Y[row,col] == "f"){
Y[row,col] = 0
}
}
}
Y <- apply(Y, 2,as.numeric)
L <- rbind(HV,Y,HV)
M <- matrix(0L, nrow = dim(L)[1], ncol = dim(L)[2])
for(col in 1:ncol(L)){
for (row in 2:nrow(L)) {
if(L[row,col] == 0 && L[row-1,col] != 0) {
M[row,col] = row-1-L[row-1,col]
}
}
}
I <- M
N <- matrix(0, nrow = dim(I)[1], ncol = dim(I)[2])
i <- 0
j <- 0
for(row in 2:nrow(I)){
for(col in 1:ncol(I)){
if(I[row,col] != 0){
i = L[row-1,col]
j = M[row,col]
N[row-1,col] = s_house_gas[i, j]
}
}
}
resale_gas <- N[c(-1,-12),]
resale_gas <- matrix(resale_gas, ncol=343, nrow=10)

Creating a loop for segments within a vector that give return values for each corresponding segment

I've created a vector "numGrades" of 100 random numbers to represent values within a grading system. I need to write a "for" loop that takes segments of numerical grades and returns a vector of corresponding letter grades i.e.: 90+ = "A", 80-89 = "B", 70-79 = "C", 60-69 = "D", 0-59 = "F". I want to be able to run numGrades to return the corresponding letter grade for example: numGrades = [72, 65, 93] returning = ["C", "D", "A"] with the loop handling vectors of any length. This is what I have tried so far individually. All of these loops have returned warnings:
set.seed(43)
numGrades <- sample(0:100, 100, replace=FALSE)
for (i in numGrades )
if(91 <= numGrades[i]) {
numGrades[i] <- "A"
} else if (80 <= numGrades[i] & numGrades[i] <= 90) {
numGrades[i] =="B"
} else if (70 <= numGrades[i] & numGrades[i] <= 79) {
numGrades[i] =="C"
} else if (60 <= numGrades[i] & numGrades[i] <= 69) {
numGrades[i] =="D"
} else if (0 <= numGrades[i] & numGrades[i] <= 59) {
numGrades[i] =="F"
}
It's saying:
Error in if (91 <= numGrades[i]) { :
missing value where TRUE/FALSE needed
New Edit (Returns for grades >= 91 only):
numGrades <- (0:100)
for (i in 1:length(numGrades ))
if(91 <= numGrades[i]) {
numGrades[i] <- "A"
} else if (80 <= numGrades[i] & numGrades[i] <= 90) {
numGrades[i] =="B"
} else if (70 <= numGrades[i] & numGrades[i] <= 79) {
numGrades[i] =="C"
} else if (60 <= numGrades[i] & numGrades[i] <= 69) {
numGrades[i] =="D"
} else if (0 <= numGrades[i] & numGrades[i] <= 59) {
numGrades[i] =="F"
}
Working Rough Draft
ltrGrades <- (0:100)
numGrades <- character(length(ltrGrades))
for (i in 1:length(ltrGrades ))
if(any(ltrGrades[i] == 91:100)) {
numGrades[i] <- "A"
} else if (any(ltrGrades[i] == 80:90)) {
numGrades[i] <- "B"
} else if (any(ltrGrades[i] == 70:79)) {
numGrades[i] <- "C"
} else if (any(ltrGrades[i] == 60:69)) {
numGrades[i] <- "D"
} else if (any(ltrGrades[i] == 0:59)) {
numGrades[i] <- "F"
}
There are some fundamental R syntax problems with your code.
(numGrades[i]) c(80:90))
(numGrades[i]) >=80 && <=89)
(numGrades[i]) ==80:89 )
Several alternatives, most inefficient:
(80 <= numGrades[i] & numGrades[i] < 90) # the most basic
(dplyr::between(numGrades[i], 80, 90)) # if dplyr is loaded
(data.table::between(numGrades[i], 80, 90)) # if data.table is available
(numGrades[i] %in% 80:89) # works only if all grades are perfectly integral
(any(numGrades[i] == 80:89)) # ditto
Why are they wrong?
(numGrades[i]) c(80:90)), because there is not operator
(numGrades[i]) >=80 && <=89), R does not infer them as you suggest, every time you do an (in)equality test you need to specify both the LHS and RHS for each one; similarly, unlikely many languages, R does not "chain" them, so (80 <= numGrades[i] <= 89) will not work
(numGrades[i]) ==80:89 ) is getting closer, but if/else statements require a single comparison; in this case, you are comparing one number with a sequence (range) of 10, so the reply from this is length 10. It must be length 1.
Bottom line, though, is that you do not need a loop.
# set.seed(43)
numGrades <- sample(0:100, 100, replace=FALSE)
cut(numGrades, c(-1, 60, 70, 80, 90, 101), labels=c("F","D","C","B","A"))
# [1] F A F D F F D F F C F F F F F B F A F F C A F F F F A F A F C C C B F
# [36] F F F B A F F F A B B C D F F F B D F F B D D B F F F F F F D F A F F
# [71] F F F F B F D F A F F F F F A B F F F C F F F D D C C F F F
# Levels: F D C B A
or if you don't like or understand what a factor is, then
as.character(cut(numGrades, c(-1, 60, 70, 80, 90, 101), labels=c("F","D","C","B","A")))
# [1] "F" "A" "F" "D" "F" "F" "D" "F" "F" "C" "F" "F" "F" "F" "F" "B" "F"
# [18] "A" "F" "F" "C" "A" "F" "F" "F" "F" "A" "F" "A" "F" "C" "C" "C" "B"
# [35] "F" "F" "F" "F" "B" "A" "F" "F" "F" "A" "B" "B" "C" "D" "F" "F" "F"
# [52] "B" "D" "F" "F" "B" "D" "D" "B" "F" "F" "F" "F" "F" "F" "D" "F" "A"
# [69] "F" "F" "F" "F" "F" "F" "B" "F" "D" "F" "A" "F" "F" "F" "F" "F" "A"
# [86] "B" "F" "F" "F" "C" "F" "F" "F" "D" "D" "C" "C" "F" "F" "F"
EDIT
I just noticed something else about your code.
When you start, numGrades is either numeric or integer. However, since it is a vector, the first time you assign a letter to one of its elements, the entire vector is converted to a character vector. The second pass through the for loop will try to compare a number with a string, which will not do a numeric comparison, try 8 < "75" for why this will fail.
As a workaround for this:
ltrGrades <- character(length(numGrades))
for (i in 1:length(numGrades ))
if(91 <= numGrades[i]) {
ltrGrades[i] <- "A"
} else if (80 <= numGrades[i] & numGrades[i] <= 90) {
ltrGrades[i] <- "B"
} else if (70 <= numGrades[i] & numGrades[i] <= 79) {
ltrGrades[i] <- "C"
} else if (60 <= numGrades[i] & numGrades[i] <= 69) {
ltrGrades[i] <- "D"
} else if (0 <= numGrades[i] & numGrades[i] <= 59) {
ltrGrades[i] <- "F"
}

how to name the elements of a list on the fly?

I would like to name the elements of a list on the fly with the content of the variable, how should I do?
DT <- data.table(A=LETTERS[1:3], B=letters[1:3], C= 1:9)
lapply(unique(DT$A), function(xA){
RTN <-
lapply(unique(DT$B), function(xB){
output <- DT[A == xA & B == xB]$C
if(length(output)== 0L) {
}else{
c(xA, xB, output)
}
})
})
the result is
[[1]]
[[1]][[1]]
[1] "A" "a" "1" "4" "7"
[[1]][[2]]
NULL
[[1]][[3]]
NULL
[[2]]
[[2]][[1]]
NULL
[[2]][[2]]
[1] "B" "b" "2" "5" "8"
[[2]][[3]]
NULL
I would like to make it as following
[[A]]
[[A]][[a]]
[1] "A" "a" "1" "4" "7"
[[A]][[b]]
NULL
[[A]][[c]]
NULL
[[B]]
[[B]][[a]]
NULL
[[B]][[B]]
[1] "B" "b" "2" "5" "8"
[[B]][[c]]
NULL
Besides, how can I remove the NULL, and make it a complete case matrix? Many thanks.
Here are two solutions:
1) Use sapply and set USE.NAMES = TRUE
2) Capture the names before each lapply and set them after.
DT <- data.table(A=LETTERS[1:3], B=letters[1:3], C= 1:9)
outer_list_names <- unique(DT$A)
outer_list <- lapply(unique(DT$A), function(xA){
RTN_names = unique(DT$B)
RTN <-
lapply(unique(DT$B), function(xB){
output <- DT[A == xA & B == xB]$C
if(length(output)== 0L) {
}else{
c(xA, xB, output)
}
})
names(RTN) <- RTN_names
})
names(outer_list) <- outer_list_names
outer_list
We could create a named vector to name the list
A_vec <- setNames(unique(DT$A), unique(DT$A))
B_vec <- setNames(unique(DT$B), unique(DT$B))
lapply(A_vec, function(xA){
RTN <- lapply(B_vec, function(xB){
output <- DT[A == xA & B == xB]$C
if(length(output) > 0L) {
c(xA, xB, output)
}
})
})
#$A
#$A$a
#[1] "A" "a" "1" "4" "7"
#$A$b
#NULL
#$A$c
#NULL
#$B
#$B$a
#NULL
#$B$b
#[1] "B" "b" "2" "5" "8"
#$B$c
#NULL
If you want to remove the NULL values we could have a Filter to remove them
lapply(A_vec, function(xA){
RTN <- lapply(B_vec, function(xB){
output <- DT[A == xA & B == xB]$C
if(length(output) > 0L) {
c(xA, xB, output)
}
})
Filter(Negate(is.null), RTN)
})
#$A
#$A$a
#[1] "A" "a" "1" "4" "7"
#$B
#$B$b
#[1] "B" "b" "2" "5" "8"
#$C
#$C$c
#[1] "C" "c" "3" "6" "9"

An efficient way of converting a vector of characters into integers based on frequency in R

I have a vector of characters consisting of only 'a' or 'g', I want to convert them to integers based on frequency, i.e. the more frequent one should be coded to 0, and the other to 1, for example:
set.seed(17)
x = sample(c('g', 'a'), 10, replace=T)
x
# [1] "g" "a" "g" "a" "g" "a" "g" "g" "a" "g"
x[x == names(which.max(table(x)))] = 0
x[x != 0] = 1
x
# [1] "0" "1" "0" "1" "0" "1" "0" "0" "1" "0"
This works, but I wonder if there is a more efficient way to do it.
(We don't have to consider the 50%-50% case here, because it should never happen in our study.)
Use this:
ag.encode <- function(x)
{
result <- x == "a"
if( sum(result) > length(result) %/% 2 ) 1-result else as.numeric(result)
}
If you want to keep the labels in a factor structure, use this instead:
ag.encode2factor <- function(x)
{
result <- x == "a"
if( sum(result) > length(result) %/% 2 )
{
factor(2-result, labels=c("a","g"))
}
else
{
factor(result+1, labels=c("g","a"))
}
}
You can convert your character vector to a factor one. This solution is more general in the sense you don't need to know the name of the 2 characters used to create x.
y <- as.integer(factor(x))-1
if(sum(y)>length(y)/2) y <- as.integer(!y)

Resources