Avoid the use of nested ifelse() in multiple column comparison - r

I have a dataframe like this:
df <- data.frame(Patient.ID = rep(paste("Pat", seq(1:3), sep = ""), 2),
                 Gene = c(rep("Gene1", 3), rep("Gene2", 3)),
                 Ref = c("A", "C", "G", "T", "A", "T"),
                 Tum1 = c("A", "A", "T", "T", "A", "T"),
                 Tum2 = c("A", "C", "G", "G", "C", "C"))
What I would like to do is determine the change that occurs between the Ref column and either Tum column. In other words, if Tum1 is different from Tum2, take the base that differs from the Ref column and store that change in a separate column, so the dataframe above would become:
df <- data.frame(Patient.ID = rep(paste("Pat", seq(1:3), sep = ""), 2),
                 Gene = c(rep("Gene1", 3), rep("Gene2", 3)),
                 Ref = c("A", "C", "G", "T", "A", "T"),
                 Tum1 = c("A", "A", "T", "T", "A", "T"),
                 Tum2 = c("A", "C", "G", "G", "C", "C"),
                 BaseChange = c("NoCh", "C.A", "G.T", "T.G", "A.C", "T.C"))
I'm aware I could use a nested ifelse() statement like below (but extended) to solve this, but my actual dataframe has many more combinations and I figure there has to be a "safer" method of doing so.
df$BaseChange <- as.factor(ifelse(df$Ref == "C" & df$Tum1 == "A" | df$Ref == "C" & df$Tum2 == "A", "C.A",
                                  ifelse(df$Ref == "G" & df$Tum1 == "T" | df$Ref == "G" & df$Tum2 == "T", "G.T", ...)))
Any help would be greatly appreciated.

It's not pretty, but it works:
library(dplyr)
df <- df %>%
  mutate(BaseChange2 = ifelse(as.character(Ref) == as.character(Tum1) & as.character(Ref) == as.character(Tum2), "NoCh",
                              ifelse(as.character(Ref) == as.character(Tum1), paste(Ref, Tum2, sep = "."), paste(Ref, Tum1, sep = "."))))
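If you prefer, the same logic can be written with case_when() instead of nested ifelse() calls; a sketch assuming dplyr >= 1.0 (for across()):
library(dplyr)
df %>%
  # convert to character up front (in case the columns were read as factors)
  # so the comparisons below work on plain strings
  mutate(across(c(Ref, Tum1, Tum2), as.character),
         BaseChange2 = case_when(
           Ref == Tum1 & Ref == Tum2 ~ "NoCh",
           Ref == Tum1               ~ paste(Ref, Tum2, sep = "."),
           TRUE                      ~ paste(Ref, Tum1, sep = ".")
         ))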

It seems that you need to paste the unique bases in each row together, i.e.
apply(df[3:5], 1, function(i) paste0(unique(i), collapse = '.'))
#[1] "A" "C.A" "G.T" "T.G" "A.C" "T.C"
To replace the single-base entries (where nothing changed) with a label,
v2 <- apply(df[3:5], 1, function(i) paste0(unique(i), collapse = '.'))
replace(v2, nchar(v2) == 1, 'NoChange')
#[1] "NoChange" "C.A" "G.T" "T.G" "A.C" "T.C"

Related

R function to search and replace text in a column

I'm seeking to write code that takes a column with character values (e.g. ALA3=VAL20) and converts each three-letter code to its corresponding single letter (e.g. A3=V20). I attempted the following function, but seem to be having trouble:
substitute_codes <- function(data, col_name) {
  # Create a dictionary of the code substitutions
  code_dict <- c("ALA" = "A", "ARG" = "R", "ASN" = "N", "ASP" = "D",
                 "CYS" = "C", "GLU" = "E", "GLN" = "Q", "GLY" = "G",
                 "HIS" = "H", "ILE" = "I", "LEU" = "L", "LYS" = "K",
                 "MET" = "M", "PHE" = "F", "PRO" = "P", "SER" = "S",
                 "THR" = "T", "TRP" = "W", "TYR" = "Y", "VAL" = "V")
  # Apply the substitutions using gsub()
  data[[col_name]] <- gsub(paste(names(code_dict), collapse = "|"),
                           paste(code_dict, collapse = ""),
                           data[[col_name]])
  return(data)
}
But I get results like the following: ALA3=VAL20 becomes ARNDCEQGHILKMFPSTWYV3=ARNDCEQGHILKMFPSTWYV20.
As Darren Tsai points out in the comments, we can just use the code_dict in str_replace_all():
library(stringr)
set.seed(123)
x <- sample(names(code_dict), 10)
y <- sample(names(code_dict), 10)
my_string <- paste0(x, sample(10), "=", y, sample(10))
my_string
#> [1] "PRO4=PHE9" "TYR1=CYS10" "PHE3=HIS2" "ASN7=ASN7" "ILE5=GLY3"
#> [6] "ARG10=ILE4" "GLU8=GLN1" "LEU2=PRO6" "CYS9=TRP5" "ASP6=THR8"
str_replace_all(my_string, code_dict)
#> [1] "P4=F9" "Y1=C10" "F3=H2" "N7=N7" "I5=G3" "R10=I4" "E8=Q1" "L2=P6"
#> [9] "C9=W5" "D6=T8"
For more complex replacements, we could use a custom function inside str_replace_all() as the replacement argument. The custom function below, shorten_str(), just uses a classic lookup with base R's match(), but we could add any kind of complexity here.
library(stringr)
# our dictionary
code_dict <- c("ALA" = "A", "ARG" = "R", "ASN" = "N", "ASP" = "D",
"CYS" = "C", "GLU" = "E", "GLN" = "Q", "GLY" = "G",
"HIS" = "H", "ILE" = "I", "LEU" = "L", "LYS" = "K",
"MET" = "M", "PHE" = "F", "PRO" = "P", "SER" = "S",
"THR" = "T", "TRP" = "W", "TYR" = "Y", "VAL" = "V")
# let's create a toy string
set.seed(123)
x <- sample(names(code_dict), 10)
y <- sample(names(code_dict), 10)
my_string <- paste0(x, sample(10), "=", y, sample(10))
my_string
#> [1] "PRO4=PHE9" "TYR1=CYS10" "PHE3=HIS2" "ASN7=ASN7" "ILE5=GLY3"
#> [6] "ARG10=ILE4" "GLU8=GLN1" "LEU2=PRO6" "CYS9=TRP5" "ASP6=THR8"
# custom function to replace string
shorten_str <- function(abr) {
  code_dict[match(abr, names(code_dict))]
}
# implementation with `str_replace_all()`
str_replace_all(my_string,
                paste(names(code_dict), collapse = "|"),
                shorten_str)
#> [1] "P4=F9" "Y1=C10" "F3=H2" "N7=N7" "I5=G3" "R10=I4" "E8=Q1" "L2=P6"
#> [9] "C9=W5" "D6=T8"
Created on 2023-02-16 by the reprex package (v2.0.1)

R convert dataframe to list of unique memberships per column for each row

This is what I have:
> miniDF
      site1 site2 site3 site4 site5
Alpha     G     T     A     C     T
Beta      G     T     A     T     T
Delta     G     T     G     C     T
Gamma     G     C     A     T     T
Eps       G     T     A     T     T
Pi        A     T     A     T     T
Omi       G     T     A     C     A
miniDF = structure(list(site1 = c("G", "G", "G", "G", "G", "A", "G"),
                        site2 = c("T", "T", "T", "C", "T", "T", "T"),
                        site3 = c("A", "A", "G", "A", "A", "A", "A"),
                        site4 = c("C", "T", "C", "T", "T", "T", "C"),
                        site5 = c("T", "T", "T", "T", "T", "T", "A")),
                   row.names = c("Alpha", "Beta", "Delta", "Gamma", "Eps", "Pi", "Omi"),
                   class = "data.frame")
I'd like to convert it to a list structure (for a Venn diagram or UpSet plot) where, if a row holds the unique value in a column, that site is added to the entry for that row name:
myList = list('Alpha'=c('site4'), 'Beta'=c(), 'Delta'=c('site3', 'site4'), 'Gamma'=c('site2'), 'Eps'=c(), 'Pi'=c('site1'), 'Omi'=c('site4','site5'))
Alpha has only one unique site (a column where its cell is unique), Beta has none, but Delta and Omi each have two unique sites.
Unique in this context means that the cell is different from the other cells in that column. So for site1, A is the unique value (all the other values are G), so Pi includes that site in its vector.
For columns where more than one cell has a differing value, like site4, I take the value of the first row to be the unique value, hence Alpha, Delta, and Omi include site4 in their vectors.
Assume I have a few hundred columns.
How can I do this?
We create a function to find the "unique" values, then apply it to every column, and finally go through each row to see which columns hold the unique values.
I've used just base R. The code could probably be a bit more concise if we switched to purrr functions, or possibly more efficient if we used a matrix instead of a data frame.
pseudo_unique = function(x) {
  tx = sort(table(x))
  if(tx[1] == 1) return(names(tx[1])) else return(x[1])
}
u_vals = lapply(miniDF, pseudo_unique)
result = lapply(
  row.names(miniDF),
  \(row) names(miniDF)[which(unlist(Map("==", u_vals, miniDF[row, ])))]
)
names(result) = row.names(miniDF)
result
# $Alpha
# [1] "site4"
#
# $Beta
# character(0)
#
# $Delta
# [1] "site3" "site4"
#
# $Gamma
# [1] "site2"
#
# $Eps
# character(0)
#
# $Pi
# [1] "site1"
#
# $Omi
# [1] "site4" "site5"
Here's the matrix version for the same result. With a few hundred columns, I'd recommend this version.
miniMat = as.matrix(miniDF)
u_vals = apply(miniMat, 2, pseudo_unique)
result = apply(miniMat, 1, \(row) colnames(miniMat)[row == u_vals], simplify = FALSE)
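If the end goal is the UpSet plot mentioned in the question, this list should drop straight into UpSetR (assuming that is the package in use); the empty sets such as Beta and Eps carry no memberships, so it is safest to drop them first:
library(UpSetR)
# keep only the letters that have at least one "unique" site
non_empty <- result[lengths(result) > 0]
upset(fromList(non_empty))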
Here's a solution in the tidyverse.
Solution
First import the tidyverse and generate your dataset miniDF.
library(tidyverse)
# ...
# Code to generate 'miniDF'.
# ...
Then define the custom function are_unique() to properly identify which values in each column you consider "unique".
are_unique <- function(x) {
  # Return an empty logical vector for an empty input...
  if(length(x) < 1) {
    return(logical(0))
  }
  # ...and otherwise identify which input values are strictly unique.
  are_unique <- !x %in% x[duplicated(x)]
  # If unique values actually exist, return that identification as is...
  if(any(are_unique)) {
    return(are_unique)
  }
  # ...and otherwise default to treating the first value as "unique"...
  token_unique <- x[1]
  # ...and identify its every occurrence.
  x == token_unique
}
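For example, applied to a single column it flags only the value that deviates, and for a column like site4 (where no value is strictly unique) it falls back to every occurrence of the first value:
are_unique(miniDF$site1)
#> [1] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
are_unique(miniDF$site4)
#> [1]  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE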
Finally, apply this tidy workflow:
miniDF %>%
  # Make the letters (row names) a column of their own.
  rownames_to_column("letter") %>%
  # In every other column, identify which values you consider "unique".
  mutate(across(!letter, are_unique)) %>%
  # Pivot into 'col_name | is_unique' format for easy filtration.
  pivot_longer(!letter, names_to = "col_name", values_to = "is_unique") %>%
  # Split by letter into a list, with the subset of rows for each letter.
  split(.$letter) %>%
  # Convert each subset into the vector of 'col_name's that filter as "unique".
  sapply(function(x){x$col_name[x$is_unique]})
Result
Given a miniDF like your sample here
miniDF <- structure(
list(
site1 = c("G", "G", "G", "G", "G", "A", "G"),
site2 = c("T", "T", "T", "C", "T", "T", "T"),
site3 = c("A", "A", "G", "A", "A", "A", "A"),
site4 = c("C", "T", "C", "T", "T", "T", "C"),
site5 = c("T", "T", "T", "T", "T", "T", "A")
),
row.names = c("Alpha", "Beta", "Delta", "Gamma", "Eps", "Pi", "Omi"),
class = "data.frame"
)
this solution should produce the following list:
list(
Alpha = "site4",
Beta = character(0),
Delta = c("site3", "site4"),
Eps = character(0),
Gamma = "site2",
Omi = c("site4", "site5"),
Pi = "site1"
)
Note
The answer here by @GregorThomas should likely supersede my own. While my answer was technically posted first, I deleted that answer to fix an error, and Gregor's functional solution was posted before I finally undeleted mine.
Gregor's is likely more elegant anyway.

Subset column values into separate vectors using 'for' loop in R

I have this data.frame and vector:
df <- data.frame(fruit = c(rep("apple", 5), rep("banana", 3), rep("cherry", 6), rep("date", 4)),
                 letter = c("a", "b", "c", "d", "e", "a", "d", "f", "b", "c", "f", "p", "q",
                            "r", "d", "p", "x", "y"))
my_vector <- c("apple", "banana", "date")
Now I would like to use a for loop that produces one vector per element of my_vector, named after that element and containing the corresponding values from the letter column.
So expected outcome is like this:
apple <- c("a", "b", "c", "d", "e")
banana <- c("a", "d", "f")
date <- c("d", "p", "x", "y")
Thank you.
We can subset the data to keep only the fruits in my_vector and split the letter column into a list of vectors.
list2env(with(subset(df, fruit %in% my_vector),split(letter, fruit)), .GlobalEnv)
apple
#[1] "a" "b" "c" "d" "e"
banana
#[1] "a" "d" "f"
date
#[1] "d" "p" "x" "y"
list2env() does write the list of vectors as separate vectors in the global environment, but it is usually good practice to keep the data in the list rather than separating it into individual vectors.
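For example, keeping everything in one list object (named fruit_letters here just for illustration) still gives easy access to each fruit:
# if fruit is a factor (pre-R 4.0 defaults), wrap it in droplevels() to avoid an empty "cherry" element
fruit_letters <- with(subset(df, fruit %in% my_vector), split(letter, fruit))
fruit_letters$apple
#[1] "a" "b" "c" "d" "e"
fruit_letters[["date"]]
#[1] "d" "p" "x" "y"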
A for loop solution would use assign() -
for(vec in my_vector) {
  assign(vec, df$letter[df$fruit == vec])
}
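If you later need the separately assigned vectors as a single object again, base R's mget() collects them back into a named list:
mget(my_vector)
#$apple
#[1] "a" "b" "c" "d" "e"
#
#$banana
#[1] "a" "d" "f"
#
#$date
#[1] "d" "p" "x" "y"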

Minimize a function of characters inputs in R

I have the following function that I want to minimize:
model <- Create(parameter1 = list(model = "a", "b"),
                parameter2 = list(distribution = "x", "y"))
The four character inputs of this function have the following possible values:
parameter1: "a", "b", "c", "d", "e"
parameter2: "x", "y", "z", "w", "t", "v"
I've tried the optim function a few times without success.
Any help is appreciated.
Evaluate the function at every possible combination of input values and take the smallest.
# test function
Create <- function(parameter1, parameter2) {
  sum(match(unlist(parameter1), p1), match(unlist(parameter2), p2))
}
p1 <- c("a", "b", "c", "d", "e")
p2 <- c("x", "y", "z", "w", "t", "v")
g <- expand.grid(p1, p1, p2, p2, stringsAsFactors = FALSE)
obj <- function(x) Create(x[1:2], x[3:4])
ix <- which.min(apply(g, 1, obj))
g[ix, ]
## Var1 Var2 Var3 Var4
## 1 a a x x
obj(g[ix, ])
## [1] 4

how to get all possible sequences from IUPAC notation using R

I have a vector of DNA sequences with IUPAC notation (https://www.bioinformatics.org/sms/iupac.html). For example, given the sequence and the notation:
seq <- "AATCRVTAA"
iuapc <- data.table(code = c("A", "C", "G", "T", "R", "Y", "S", "W", "K", "M", "B", "D", "H", "V", "N"),
base = c("A", "C", "G", "T", "AG", "CT", "GC", "AT", "GT", "AC", "CGT", "AGT", "ACT", "ACG", "ACGT"))
Where "R" and "V" are ambiguous values of DNA nucleotides, and "R" represents either "A" or "G" and "V" represents "A", "C" or "G".
How can I generate all the different combinations of sequences that could be represented by the above ambiguous sequence?
The output for this example sequence would be:
"AATCAATAA"
"AATCACTAA"
"AATCAGTAA"
"AATCGATAA"
"AATCGCTAA"
"AATCGGTAA"
The vector of sequences is quite large, so performance is important. Any help will be greatly appreciated!
This question has already been asked for Python here: how to extend ambiguous dna sequence
Here is something very raw:
library(data.table)
library(magrittr)
# Convert iuapc$base to list of vectors
iuapc[, base := list(strsplit(base, ''))]
setkey(iuapc, code)
tstrsplit(seq, '') %>%
  lapply(function(x) iuapc[x, base[[1]]]) %>%
  do.call(CJ, .) %>%
  .[, paste(.SD, collapse = ''), by = 1:nrow(.)] %>%
  .[, V1]
# [1] "AATCAATAA" "AATCACTAA" "AATCAGTAA" "AATCGATAA" "AATCGCTAA" "AATCGGTAA"
Leveraging your earlier question today (https://stackoverflow.com/a/66274136/6851825), here's a kludgey tidyverse/base approach:
library(tidyverse)
tibble(seq) %>%
  separate_rows(seq, sep = '(?<=.)(?=.)') %>%
  left_join(iuapc, by = c("seq" = "code")) %>%
  pull(base) %>%
  str_split("") %>%
  expand.grid(stringsAsFactors = FALSE)
#  Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9
#1    A    A    T    C    A    A    T    A    A
#2    A    A    T    C    G    A    T    A    A
#3    A    A    T    C    A    C    T    A    A
#4    A    A    T    C    G    C    T    A    A
#5    A    A    T    C    A    G    T    A    A
#6    A    A    T    C    G    G    T    A    A
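The grid above still has one column per position; if you save it (say as seq_grid, a name of my choosing), each row can be pasted back into a full sequence:
# seq_grid <- the expand.grid() result from the pipeline above
unname(apply(seq_grid, 1, paste0, collapse = ""))
# [1] "AATCAATAA" "AATCGATAA" "AATCACTAA" "AATCGCTAA" "AATCAGTAA" "AATCGGTAA"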
library(stringr)
all.seq.iuapc <- function(seq, dictio_replace){
  seq <- toupper(seq)
  vec <- strsplit(seq, "")[[1]]
  vec2 <- str_replace_all(string = vec, pattern = dictio_replace)
  tmp <- expand.grid(strsplit(vec2, ""), stringsAsFactors = FALSE)
  strings <- apply(tmp, 1, paste0, collapse = "")
  return(strings)
}
dictio_replace= c("A" = "A",
"C" = "C",
"G" = "G",
"T" = "T",
"R" = "AG",
"Y" = "CT",
"S" = "GC",
"W" = "AT",
"K" = "GT",
"M" = "AC",
"B" = "CGT",
"D" = "AGT",
"H" = "ACT",
"V" = "ACG",
"N" = "ACGT")
