Minimize a function of characters inputs in R - r

I have the following function that I want to find the minimum:
model <- Create(parameter1 = list(model = "a" , "b"),
parameter2 = list(distribution = "x" , "y"))
The four inputs of this function are characters, and have as possible values:
parameter1: "a", "b", "c", "d", "e"
parameter2: "x", "y", "z", "w", "t", "v"
I've tried the optim function a few times without success.
Any help is appreciated.

Evaluate the function at every possible set of input values and take the least.
# test function
Create <- function(parameter1, parameter2) {
sum(match(unlist(parameter1), p1), match(unlist(parameter2), p2))
}
p1 <- c("a", "b", "c", "d", "e")
p2 <- c("x", "y", "z", "w", "t", "v")
g <- expand.grid(p1, p1, p2, p2, stringsAsFactors = FALSE)
obj <- function(x) Create(x[1:2], x[3:4])
ix <- which.min(apply(g, 1, obj))
g[ix, ]
## Var1 Var2 Var3 Var4
## 1 a a x x
obj(g[ix, ])
## [1] 4

Related

R function to to search and replace text in a column

#I'm seeking to write code that takes a column with character values (e.g. ALA3=VAL20) and converts it to a specific single letter corresponding to that three letter code (e.g. A3=V20). I attempted using the following function, but seem to be having trouble:
substitute_codes <- function(data, col_name) {
# Create a dictionary of the code substitutions
code_dict <- c("ALA" = "A", "ARG" = "R", "ASN" = "N", "ASP" = "D",
"CYS" = "C", "GLU" = "E", "GLN" = "Q", "GLY" = "G",
"HIS" = "H", "ILE" = "I", "LEU" = "L", "LYS" = "K",
"MET" = "M", "PHE" = "F", "PRO" = "P", "SER" = "S",
"THR" = "T", "TRP" = "W", "TYR" = "Y", "VAL" = "V")
# Apply the substitutions using gsub()
data[[col_name]] <- gsub(paste(names(code_dict), collapse = "|"),
paste(code_dict, collapse = ""),
data[[col_name]])
return(data)
}
But I get results like the following from ALA3=VAL20 to ARNDCEQGHILKMFPSTWYV3=ARNDCEQGHILKMFPSTWYV20
As Darren Tsai points out in the comments, we can just use the cdoe_dict in str_replace_all():
library(stringr)
set.seed(123)
x <- sample(names(code_dict), 10)
y <- sample(names(code_dict), 10)
my_string <- paste0(x, sample(10), "=", y, sample(10))
my_string
#> [1] "PRO4=PHE9" "TYR1=CYS10" "PHE3=HIS2" "ASN7=ASN7" "ILE5=GLY3"
#> [6] "ARG10=ILE4" "GLU8=GLN1" "LEU2=PRO6" "CYS9=TRP5" "ASP6=THR8"
str_replace_all(my_string,
code_dict)
#> [1] "P4=F9" "Y1=C10" "F3=H2" "N7=N7" "I5=G3" "R10=I4" "E8=Q1" "L2=P6"
#> [9] "C9=W5" "D6=T8"
For more complex replacements, we could a custom function inside str_replace_all() as replacement argument. The custom function below shorten_str just uses a classical lookup with base R's match(), but we could add any kind of complexity here.
library(stringr)
# our dictionary
code_dict <- c("ALA" = "A", "ARG" = "R", "ASN" = "N", "ASP" = "D",
"CYS" = "C", "GLU" = "E", "GLN" = "Q", "GLY" = "G",
"HIS" = "H", "ILE" = "I", "LEU" = "L", "LYS" = "K",
"MET" = "M", "PHE" = "F", "PRO" = "P", "SER" = "S",
"THR" = "T", "TRP" = "W", "TYR" = "Y", "VAL" = "V")
# let's create a toy string
set.seed(123)
x <- sample(names(code_dict), 10)
y <- sample(names(code_dict), 10)
my_string <- paste0(x, sample(10), "=", y, sample(10))
my_string
#> [1] "PRO4=PHE9" "TYR1=CYS10" "PHE3=HIS2" "ASN7=ASN7" "ILE5=GLY3"
#> [6] "ARG10=ILE4" "GLU8=GLN1" "LEU2=PRO6" "CYS9=TRP5" "ASP6=THR8"
# custom function to replace string
shorten_str <- function(abr) {
code_dict[match(abr, names(code_dict))]
}
# implementation with `str_replace_all()`
str_replace_all(my_string,
paste(names(code_dict), collapse = "|"),
shorten_str)
#> [1] "P4=F9" "Y1=C10" "F3=H2" "N7=N7" "I5=G3" "R10=I4" "E8=Q1" "L2=P6"
#> [9] "C9=W5" "D6=T8"
Created on 2023-02-16 by the reprex package (v2.0.1)

R convert dataframe to list of unique memberships per column for each row

This is what I have:
> miniDF
site1 site2 site3 site4 site5
Alpha G T A C T
Beta G T A T T
Delta G T G C T
Gamma G C A T T
Eps G T A T T
Pi A T A T T
Omi G T A C A
miniDF = structure(list(site1 = c("G", "G", "G", "G", "G", "A", "G"),
site2 = c("T", "T", "T", "C", "T", "T", "T"), site3 = c("A",
"A", "G", "A", "A", "A", "A"), site4 = c("C", "T", "C", "T",
"T", "T", "C"), site5 = c("T", "T", "T", "T", "T", "T", "A"
)), row.names = c("Alpha", "Beta", "Delta", "Gamma", "Eps",
"Pi", "Omi"), class = "data.frame")
I'd like to convert it to a list structure for a venn diagram or upset plot where the presence of a unique letter in that column puts that site into the list row name:
myList = list('Alpha'=c('site4'), 'Beta'=c(), 'Delta'=c('site3', 'site4'), 'Gamma'=c('site2'), 'Eps'=c(), 'Pi'=c('site1'), 'Omi'=c('site4','site5'))
Alpha only has one unique site (a column with a unique cell) , Beta has none, but Delta and Omi have two unique sites.
Unique in this context means that cell is different from the other cells in that column. So for site1, A is the unique value (all the other values are G), so Pi includes that site in it's array.
For columns where there is more than one cell with a different value, like site4, I take the value of the first row to be the unique value, hence Alpha, Delta, and Omi include site4 in their arrays.
Assume I have a few hundred columns.
How can I do this?
We create a function to find the "unique" values, then apply it to every column, and finally go through each row see which columns have the unique values.
I've used just base R. The code could probably be a bit more concise if we switched to purrr functions, or possibly more efficient if we used a matrix instead of a data frame.
pseudo_unique = function(x) {
tx = sort(table(x))
if(tx[1] == 1) return(names(tx[1])) else return(x[1])
}
u_vals = lapply(miniDF, pseudo_unique)
result = lapply(
row.names(miniDF),
\(row) names(miniDF)[which(unlist(Map("==", u_vals, miniDF[row, ])))]
)
names(result) = row.names(miniDF)
result
# $Alpha
# [1] "site4"
#
# $Beta
# character(0)
#
# $Delta
# [1] "site3" "site4"
#
# $Gamma
# [1] "site2"
#
# $Eps
# character(0)
#
# $Pi
# [1] "site1"
#
# $Omi
# [1] "site4" "site5"
Here's the matrix version for the same result. With a few hundred columns, I'd recommend this version.
miniMat = as.matrix(miniDF)
u_vals = apply(miniMat, 2, pseudo_unique)
result = apply(miniMat, 1, \(row) colnames(miniMat)[row == u_vals], simplify = FALSE)
Here's a solution in the tidyverse.
Solution
First import the tidyverse and generate your dataset miniDF.
library(tidyverse)
# ...
# Code to generate 'miniDF'.
# ...
Then define the custom function are_unique() to properly identify which values in each column you consider "unique".
are_unique <- function(x) {
# Return an empty logical vector for an empty input...
if(length(x) < 1) {
return(logical(0))
}
# ...and otherwise identify which input values are strictly unique.
are_unique <- !x %in% x[duplicated(x)]
# If unique values actually exist, return that identification as is...
if(any(are_unique)) {
return(are_unique)
}
# ...and otherwise default to treating the first value as "unique"...
token_unique <- x[1]
# ...and identify its every occurrence.
x == token_unique
}
Finally, apply this tidy workflow:
miniDF %>%
# Make the letters (row names) a column of their own.
rownames_to_column("letter") %>%
# In every other column, identify which values you consider "unique".
mutate(across(!letter, are_unique)) %>%
# Pivot into 'col_name | is_unique' format for easy filtration.
pivot_longer(!letter, names_to = "col_name", values_to = "is_unique") %>%
# Split by letter into a list, with the subset of rows for each letter.
split(.$letter) %>%
# Convert each subset into the vector of 'col_name's that filter as "unique".
sapply(function(x){x$col_name[x$is_unique]})
Result
Given a miniDF like your sample here
miniDF <- structure(
list(
site1 = c("G", "G", "G", "G", "G", "A", "G"),
site2 = c("T", "T", "T", "C", "T", "T", "T"),
site3 = c("A", "A", "G", "A", "A", "A", "A"),
site4 = c("C", "T", "C", "T", "T", "T", "C"),
site5 = c("T", "T", "T", "T", "T", "T", "A")
),
row.names = c("Alpha", "Beta", "Delta", "Gamma", "Eps", "Pi", "Omi"),
class = "data.frame"
)
this solution should produce the following list:
list(
Alpha = "site4",
Beta = character(0),
Delta = c("site3", "site4"),
Eps = character(0),
Gamma = "site2",
Omi = c("site4", "site5"),
Pi = "site1"
)
Note
The answer here by #GregorThomas should likely supersede my own. While my answer was technically posted first, I deleted that answer to fix an error, and Gregor's functional solution was posted before I finally undeleted mine.
Gregor's is likely more elegant anyway.

Subset column values into separate vectors using 'for' loop in R

I have this data.frame and vector:
df <- data.frame (fruit = c(rep("apple", 5), rep("banana", 3), rep("cherry", 6), rep("date", 4)),
letter = c("a", "b", "c", "d", "e", "a", "d", "f", "b", "c", "f", "p", "q", "r", "d", "p",
"x", "y")
)
my_vector <- c("apple", "banana", "date")
Now I would like to use a for loop, which results in vectors with as names the elements in my_vector and as elements those listed in the letter column.
So expected outcome is like this:
apple <- c("a", "b", "c", "d", "e")
banana <- c("a", "d", "f")
date <- c("d", "p", "x", "y")
Thanks you.
We can subset to keep only fruit in my_vector in the data and split it into list of vectors.
list2env(with(subset(df, fruit %in% my_vector),split(letter, fruit)), .GlobalEnv)
apple
#[1] "a" "b" "c" "d" "e"
banana
#[1] "a" "d" "f"
date
#[1] "d" "p" "x" "y"
list2env does write the list of vectors as separate vectors in global environment but usually it is good practice to keep data in the list and not separate them in individual vectors.
A for loop solution would be with assign -
for(vec in my_vector) {
assign(vec, df$letter[df$fruit == vec])
}

Avoid the use of nested loop in multiple column comparison

I have a dataframe like this:
df <- data.frame(Patient.ID = rep(paste("Pat", seq(1:3), sep = ""), 2),
Gene = c(rep("Gene1", 3), rep("Gene2", 3)),
Ref = c("A", "C", "G", "T", "A", "T"),
Tum1 = c("A", "A", "T", "T", "A", "T"),
Tum2 = c("A", "C", "G", "G", "C", "C"))
What I would like to do is determine the change that is occurring between the Ref or either Tum column. In other words, if Tum1 is different from Tum2 take the character string which is different to the Ref column and store that in a separate column as the change so the dataframe above would become:
df <- data.frame(Patient.ID = rep(paste("Pat", seq(1:3), sep = ""), 2),
Gene = c(rep("Gene1", 3), rep("Gene2", 3)),
Ref = c("A", "C", "G", "T", "A", "T"),
Tum1 = c("A", "A", "T", "T", "A", "T"),
Tum2 = c("A", "C", "G", "G", "C", "C"),
BaseChange = c("NoCh", "C.A", "G.T", "T.G", "A.C", "T.C"))
I'm aware I could use a nested ifelse() statement like below (but extended) to solve this, but my actual dataframe has many more combinations and I figure there has to be a "safer" method of doing so.
df$BaseChange <- as.factor(ifelse(df$Ref == "C" & df$Tum1 == "A" | df$Ref== "C" & df$Tum2 == "A", "C.A",
ifelse((df$Ref == "G" & df$Tum1 == "T" | df$Ref == "G" & df$Tum2 == "T"), "G.T",...)))
Any help would be greatly appreciated.
It's not pretty, but it works:
df <- df %>%
mutate(BaseChange2 = ifelse( (as.character(Ref)==as.character(Tum1) & as.character(Ref) == as.character(Tum2)), "NoCh",
ifelse(as.character(Ref)==as.character(Tum1),paste(Ref,Tum2, sep="."),paste(Ref,Tum1, sep="."))))
It seems tha you need to paste unique Tums together, i.e.
apply(df[3:5], 1, function(i) paste0(unique(i), collapse = '.'))
#[1] "A" "C.A" "G.T" "T.G" "A.C" "T.C"
To replace the first A,
v2 <- apply(df[3:5], 1, function(i) paste0(unique(i), collapse = '.'))
replace(v2, nchar(v2) == 1, 'NoChange')
#[1] "NoChange" "C.A" "G.T" "T.G" "A.C" "T.C"

Recursive function to check all possible paths (from raw material to product)

I am struggling with a recursive function, who's goal is to determine which raw materials belong to which product. I clouldn't figure out, how to handle multiple possible paths in data frame "db". The wanted function should give: A-B-C-E, A-B-C-F, A-B-D-F for db. My function works for "da". I added it to show what I am after, and it is a bit like bill of materials explosion, but not exactly.
da <- data.frame(parent = c("A", "B", "B", "C", "D"),
child = c("B", "C", "D", "E", "F"),
stringsAsFactors = FALSE)
db <- data.frame(parent = c("A", "B", "B", "C", "D", "C"),
child = c("B", "C", "D", "E", "F", "F"),
stringsAsFactors = FALSE)
my_path <- function(a, df) {
b <- df$parent[df$child == a]
if (length(b) == 0) {
return(a)
} else {
return(c(my_path(b, df), a))
}
}
end_points <- da$child[is.na(match(da$child, da$parent))]
lapply(end_points, function(x) my_path(x, da)) # -> ok
end_points <- db$child[is.na(match(db$child, db$parent))]
lapply(end_points, function(x) my_path(x, db)) # -> not ok
Thx & kind regards
This is a job for igraph:
#the data
db <- data.frame(parent = c("A", "B", "B", "C", "D", "C"),
child = c("B", "C", "D", "E", "F", "F"),
stringsAsFactors = FALSE)
#create a graph
library(igraph)
g <- graph_from_data_frame(db)
#plot the graph
plot(g)
#find all vertices that have no ingoing resp. outgoing edges
starts <- V(g)[degree(g, mode = "in") == 0]
finals <- V(g)[degree(g, mode = "out") == 0]
#find paths, you need to loop if starts is longer than 1
res <- all_simple_paths(g, from = starts[[1]], to = finals)
#[[1]]
#+ 4/6 vertices, named, from 4b85bd1:
#[1] A B C E
#
#[[2]]
#+ 4/6 vertices, named, from 4b85bd1:
#[1] A B C F
#
#[[3]]
#+ 4/6 vertices, named, from 4b85bd1:
#[1] A B D F
#coerce to vectors
lapply(res, as_ids)

Resources