Create a dataframe of combinations of data with proper labels in R

I am trying to create a data frame that pulls combinations of values from a data frame, applies a function to the data, and finally assigns the labels of the combinations as identifiers in the resulting data frame.
To create a simple mockup of the data:
A = (1:20)
B = seq(1, 40, 2)
To create a mockup of each identifier:
L = c("q", "r", "q", "p", "q", "p", "r", "r", "r", "r", "r", "r", "r", "r", "o", "p", "q", "r", "r", "q")
The mockup data frame is then created with:
dat.example = data.frame(A, B, L)
Here, the value in each row is subtracted from the value in every other row, for all possible pairs of rows.
diffA = combn(dat.example$A, 2, FUN = diff)
diffB = combn(dat.example$B, 2, FUN = diff)
After all possible combinations of differences within columns A and B are calculated, a function is applied to the generated output.
calc.diff = sqrt((diffA^2) + (diffB^2))
Now comes the tricky part: pulling out the identifiers for the combinations.
(L.list = combn(dat.example$L, 2, simplify = F))
L.list.1 = sapply(L.list, "[[", 1)
L.list.2 = sapply(L.list, "[[", 2)
L.dat = data.frame(
  X = L.list.1,
  Y = L.list.2
)
L.dat$comp = with(
  L.dat,
  interaction(X, Y, sep = "_")
)
And combine them into a master dataset:
master = data.frame(
  calc.diff,
  L.dat$comp
)
Is there a simpler way to perform this that I am missing?
Update: I just realised the labels I created in this fashion are duplicated, as p_q is equivalent to q_p, because only the magnitude of the value matters after the calc.diff calculation. Therefore, this question still needs a better answer. My present solution is to simply substitute each occurrence with its twin in the dataset, but this becomes substantially more difficult the more labels there are.
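One lighter way around the duplication (a sketch, not the approach taken in Update 2 below) is to sort each label pair before pasting, so that p_q and q_p collapse to a single identifier:
# sort each pair of labels before pasting, so "q_p" becomes "p_q"
L.dat.canonical <- combn(dat.example$L, 2,
                         FUN = function(x) paste(sort(x), collapse = "_"))
head(L.dat.canonical)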
Update 2: I have worked out a pure R solution for properly assigning unique combinations of labels to the dataset. Below is the entire code for generating the above dataset and assigning the right labels. Improvements are of course welcome.
A = (1:20)
B = seq(1, 40, 2)
L = c("q", "r", "q", "p", "q", "p", "r", "r", "r", "r", "r", "r", "r", "r", "o", "p", "q", "r", "r", "q")
dat.example = data.frame(A, B, L)
diffA = combn(dat.example$A, 2, FUN = diff)
diffB = combn(dat.example$B, 2, FUN = diff)
calc.diff = sqrt((diffA^2) + (diffB^2))
L.dat.comp <- combn(dat.example$L, 2, paste0, collapse = "_", simplify=F)
master <- data.frame(
  calc.diff,
  id = unlist(L.dat.comp)
)
A helper function for generating the combinations:
expand.grid.unique <- function(x, y, include.equals = FALSE) {
  x <- unique(x)
  y <- unique(y)
  g <- function(i) {
    z <- setdiff(y, x[seq_len(i - include.equals)])
    if (length(z)) cbind(x[i], z, deparse.level = 0)
  }
  do.call(rbind, lapply(seq_along(x), g))
}
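As a quick illustration (a toy call, not part of the original code), the helper returns each unordered pair exactly once, optionally including self-pairs:
expand.grid.unique(c("p", "q", "r"), c("p", "q", "r"), include.equals = TRUE)
#      [,1] [,2]
# [1,] "p"  "p"
# [2,] "p"  "q"
# [3,] "p"  "r"
# [4,] "q"  "q"
# [5,] "q"  "r"
# [6,] "r"  "r"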
Create a data frame with the unique combinations of id labels:
dat.combo = as.data.frame(
  expand.grid.unique(L, L, include.equals = TRUE)
)
dat.combo$c1 = with(dat.combo, interaction(V1, V2, sep = "_"))
dat.combo$c2 = with(dat.combo, interaction(V2, V1, sep = "_"))
master$id2 = master$id
Where ids match dat.combo$c2, replace them with dat.combo$c1:
master$id2[] = setNames(dat.combo$c1, dat.combo$c2)[
  as.character(unlist(master$id2))
]
Where NAs are present in id2, replace with values in id
master$id2[is.na(master$id2)] = master$id[is.na(master$id2)]
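As an aside (a sketch, under the assumption that calc.diff is meant to be the Euclidean distance between rows), base R's dist() computes the same pairwise values in the same order as the combn(..., 2) pairs, so the numeric part could also be written as:
# pairwise Euclidean distances over columns A and B; same ordering as combn(..., 2)
calc.diff.alt <- as.vector(dist(dat.example[, c("A", "B")]))
all.equal(calc.diff.alt, calc.diff)  # should be TRUE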

Based on Jay's comment for the "tricky part":
L.dat.comp <- combn(dat.example$L, 2, paste0, collapse = "_", simplify=F)
master <- data.frame(cbind(calc.diff,unlist(L.dat.comp)))
So the whole thing looks like:
A = (1:20)
B = seq(1, 40, 2)
L = c("q", "r", "q", "p", "q", "p", "r", "r", "r", "r", "r", "r", "r", "r", "o", "p", "q", "r", "r", "q")
dat.example = data.frame(A, B, L)
diffA = combn(dat.example$A, 2, FUN = diff)
diffB = combn(dat.example$B, 2, FUN = diff)
calc.diff = sqrt((diffA^2) + (diffB^2))
L.dat.comp <- combn(dat.example$L, 2, paste0, collapse = "_", simplify=F)
master <- data.frame(cbind(calc.diff,unlist(L.dat.comp)))
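A small caveat about the cbind() form: combining the numeric calc.diff with the character labels via cbind() produces a character matrix, so the distance column in master ends up as text. Calling data.frame() on the pieces directly, as in Update 2 above, keeps the distances numeric:
# keeps calc.diff numeric and the labels as a separate id column
master <- data.frame(calc.diff, id = unlist(L.dat.comp))
str(master)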

Related

R function to search and replace text in a column

I'm seeking to write code that takes a column with character values (e.g. ALA3=VAL20) and converts each three-letter code to its corresponding single letter (e.g. A3=V20). I attempted using the following function, but seem to be having trouble:
substitute_codes <- function(data, col_name) {
  # Create a dictionary of the code substitutions
  code_dict <- c("ALA" = "A", "ARG" = "R", "ASN" = "N", "ASP" = "D",
                 "CYS" = "C", "GLU" = "E", "GLN" = "Q", "GLY" = "G",
                 "HIS" = "H", "ILE" = "I", "LEU" = "L", "LYS" = "K",
                 "MET" = "M", "PHE" = "F", "PRO" = "P", "SER" = "S",
                 "THR" = "T", "TRP" = "W", "TYR" = "Y", "VAL" = "V")
  # Apply the substitutions using gsub()
  data[[col_name]] <- gsub(paste(names(code_dict), collapse = "|"),
                           paste(code_dict, collapse = ""),
                           data[[col_name]])
  return(data)
}
But I get results like the following: ALA3=VAL20 becomes ARNDCEQGHILKMFPSTWYV3=ARNDCEQGHILKMFPSTWYV20.
As Darren Tsai points out in the comments, we can just use the code_dict directly in str_replace_all(), which treats a named vector as pairs of patterns and replacements:
library(stringr)
set.seed(123)
x <- sample(names(code_dict), 10)
y <- sample(names(code_dict), 10)
my_string <- paste0(x, sample(10), "=", y, sample(10))
my_string
#> [1] "PRO4=PHE9" "TYR1=CYS10" "PHE3=HIS2" "ASN7=ASN7" "ILE5=GLY3"
#> [6] "ARG10=ILE4" "GLU8=GLN1" "LEU2=PRO6" "CYS9=TRP5" "ASP6=THR8"
str_replace_all(my_string,
code_dict)
#> [1] "P4=F9" "Y1=C10" "F3=H2" "N7=N7" "I5=G3" "R10=I4" "E8=Q1" "L2=P6"
#> [9] "C9=W5" "D6=T8"
For more complex replacements, we could use a custom function inside str_replace_all() as the replacement argument. The custom function below, shorten_str, just does a classical lookup with base R's match(), but we could add any kind of complexity here.
library(stringr)
# our dictionary
code_dict <- c("ALA" = "A", "ARG" = "R", "ASN" = "N", "ASP" = "D",
"CYS" = "C", "GLU" = "E", "GLN" = "Q", "GLY" = "G",
"HIS" = "H", "ILE" = "I", "LEU" = "L", "LYS" = "K",
"MET" = "M", "PHE" = "F", "PRO" = "P", "SER" = "S",
"THR" = "T", "TRP" = "W", "TYR" = "Y", "VAL" = "V")
# let's create a toy string
set.seed(123)
x <- sample(names(code_dict), 10)
y <- sample(names(code_dict), 10)
my_string <- paste0(x, sample(10), "=", y, sample(10))
my_string
#> [1] "PRO4=PHE9" "TYR1=CYS10" "PHE3=HIS2" "ASN7=ASN7" "ILE5=GLY3"
#> [6] "ARG10=ILE4" "GLU8=GLN1" "LEU2=PRO6" "CYS9=TRP5" "ASP6=THR8"
# custom function to replace string
shorten_str <- function(abr) {
code_dict[match(abr, names(code_dict))]
}
# implementation with `str_replace_all()`
str_replace_all(my_string,
paste(names(code_dict), collapse = "|"),
shorten_str)
#> [1] "P4=F9" "Y1=C10" "F3=H2" "N7=N7" "I5=G3" "R10=I4" "E8=Q1" "L2=P6"
#> [9] "C9=W5" "D6=T8"
Created on 2023-02-16 by the reprex package (v2.0.1)
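A base-R alternative (a sketch, assuming the same code_dict and input format as above; the helper name shorten_base is made up here) is to loop over the dictionary and apply gsub() once per code:
# loop over the dictionary, replacing each three-letter code in turn
shorten_base <- function(x, dict) {
  for (code in names(dict)) {
    x <- gsub(code, dict[[code]], x, fixed = TRUE)
  }
  x
}
shorten_base(my_string, code_dict)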

How to find optimal string content that minimizes the MSE of character count vectors with its reference string

I have the following reference sequence:
ref_seq <- "MGHQQLYWSHPRKFGQGSRSCRVTSNRHGLIRKYGLNMSRQSFR"
and this seed pattern string:
seed_pattern <- "FKDHKHIDVKDRHRTRHLAK??????????"
There are 10 wildcards (?) in that pattern.
Given these functions:
aa_count_normalized <- function(x) {
  AADict <- c(
    "A", "R", "N", "D", "C", "E", "Q", "G", "H",
    "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"
  )
  AAC <- summary(factor(strsplit(x, split = "")[[1]], levels = AADict),
                 maxsum = 21) / nchar(x)
  AAC
}
aa_count <- function(x) {
  AADict <- c(
    "A", "R", "N", "D", "C", "E", "Q", "G", "H",
    "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"
  )
  AAC <- summary(factor(strsplit(x, split = "")[[1]], levels = AADict),
                 maxsum = 21)
  AAC
}
I can get:
# we need to normalize refseq_aa content with respect
# to the length of seed_pattern, to accommodate the length
# difference between the two.
> refseq_aa_content <- aa_count_normalized(ref_seq) * nchar(seed_pattern)
> refseq_aa_content
A R N D C E
0.0000000 4.7727273 1.3636364 0.0000000 0.6818182 0.0000000
Q G H I L K
2.7272727 3.4090909 2.0454545 0.6818182 2.0454545 1.3636364
M F P S T W
1.3636364 1.3636364 0.6818182 4.0909091 0.6818182 0.6818182
Y V
1.3636364 0.6818182
What I want to do is replace the wildcards of the seed pattern - while keeping the non-wildcards as they are - with combinations of residues taken from:
AADict <- c(
"A", "R", "N", "D", "C", "E", "Q", "G", "H",
"I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"
)
such that the mean squared error (MSE) of the final amino acid count of the seed sequence and the normalized reference sequence count is minimized.
With this MSE function:
mse <- function (ref, new_seq) {
return(mean((ref - new_seq)^2))
}
and with this final seed sequences:
seed_final.1 <- aa_count("FKDHKHIDVKDRHRTRHLAKRQQGGGSSSY")
seed_final.2 <- aa_count("FKDHKHIDVKDRHRTRHLAKRQQQGGGSSY")
seed_final.3 <- aa_count("FKDHKHIDVKDRHRTRHLAKSSSGGGRRQQ") # onyambu's
I get
> mse(refseq_aa_content, seed_final.1 )
[1] 1.501446
> mse(refseq_aa_content, seed_final.2 )
[1] 1.63781
> mse(refseq_aa_content, seed_final.3 )
[1] 1.560537
The seed_final.1 is the optimal exact solution, because it has the lowest MSE. Namely, the 10 ?s are to be replaced with:
G Q R S Y
3 2 1 3 1 (total 10)
How can I create efficient R code to return FKDHKHIDVKDRHRTRHLAKRQQGGGSSSY as the answer?
You can model your problem as an integer quadratic program where you want to minimize:
sum(r^2) - 2 sum(z * r)
with constraints:
sum(r) = k
r[i] nonnegative integer
where:
r[i]  how many of the ith letter of AADict you need to add to seed_pattern
z[i]  = n(y)/n(x) * x[i] - y[i]
x[i]  count of the ith letter of AADict in ref_seq
y[i]  count of the ith letter of AADict in seed_pattern
n(x)  number of characters in ref_seq
n(y)  number of characters in seed_pattern
k     number of wildcard characters in seed_pattern
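To spell out why this objective is equivalent to minimizing the MSE (a short derivation from the definitions above): the normalized reference count is n(y)/n(x) * x and the count vector of a finished seed is y + r, so
ref - new_seq = (n(y)/n(x) * x - y) - r = z - r
and the MSE is mean((z - r)^2) = mean(z^2) - 2 * mean(z * r) + mean(r^2). The term mean(z^2) is a constant and the divisor (the dictionary length) is fixed, so minimizing the MSE is the same as minimizing sum(r^2) - 2 * sum(z * r).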
I didn't manage to find a free mixed-integer quadratic solver in R, so here is a heuristic using DEoptimR:
ref_seq <- "MGHQQLYWSHPRKFGQGSRSCRVTSNRHGLIRKYGLNMSRQSFR"
seed_pattern <- "FKDHKHIDVKDRHRTRHLAK??????????"
AADict <- c(
"A", "R", "N", "D", "C", "E", "Q", "G", "H",
"I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"
)
aminoSummary <- function(x) {
  f <- factor(strsplit(x, split = "")[[1]], levels = AADict)
  list(
    l = nchar(x),
    k = sum(is.na(f)),
    z = table(f)
  )
}
x <- aminoSummary(ref_seq)
y <- aminoSummary(seed_pattern)
M <- length(AADict)
res <- DEoptimR::JDEoptim(
  lower = rep(0, M),
  upper = rep(y$k, M) + 1,
  fn = function(r, z, k) {
    r <- floor(r)
    sum(r * r) - 2 * sum(z * r)
  },
  constr = function(r, z, k) sum(floor(r)) - k,
  meq = 1,
  z = as.vector(x$z * y$l / x$l - y$z),
  k = y$k
)
rep(AADict, floor(res$par))
[1] "R" "Q" "Q" "G" "G" "G" "S" "S" "S" "Y"

Efficient way of replacing character string with numeric values based on data frame "dictionary"

This is related to this question:
How to convert a string of text into a vector based on given values numeric values to replace each letter with
For ease, I will put the same information here:
aminoacid <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")
aminoacid1 <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")
aminoacid2 <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")
df <- expand.grid(aminoacid, aminoacid1, aminoacid2)
df <- transform(df, newname = paste(df$Var1, df$Var2, df$Var3, sep=""))
ptuples <- df[,4]
I want to convert each of these strings (length 3) into a vector of length 15 by defining each amino acid (letter) with 5 numbers, as shown in this table.
key <-
read.table(
text = " pah pss ms cc ec
A -0.59145974 -1.30209266 -0.7330651 1.5703918 -0.14550842
C -1.34267179 0.46542300 -0.8620345 -1.0200786 -0.25516894
D 1.05015062 0.30242411 -3.6559147 -0.2590236 -3.24176791
E 1.35733226 -1.45275578 1.4766610 0.1129444 -0.83715681
F -1.00610084 -0.59046634 1.8909687 -0.3966186 0.41194139
G -0.38387987 1.65201497 1.3301017 1.0449765 2.06385566
H 0.33616543 -0.41662780 -1.6733690 -1.4738898 -0.07772917
I -1.23936304 -0.54652238 2.1314349 0.3931618 0.81630366
K 1.83146558 -0.56109831 0.5332237 -0.2771101 1.64762794
L -1.01895162 -0.98693471 -1.5046185 1.2658296 -0.91181195
M -0.66312569 -1.52353917 2.2194787 -1.0047207 1.21181214
N 0.94535614 0.82846219 1.2991286 -0.1688162 0.93339498
P 0.18862522 2.08084151 -1.6283286 0.4207004 -1.39177378
Q 0.93056541 -0.17926549 -3.0048731 -0.5025910 -1.85303476
R 1.53754853 -0.05472897 1.5021086 0.4403185 2.89744417
S -0.22788299 1.39869991 -4.7596375 0.6701745 -2.64747356
T -0.03181782 0.32571153 2.2134612 0.9078985 1.31337035
V -1.33661279 -0.27854634 -0.5440132 1.2419935 -1.26225362
W -0.59533918 0.00907760 0.6719274 -2.1275244 -0.18358096
Y 0.25999617 0.82992312 3.0973596 -0.8380164 1.51150958"
)
The method proposed in this post was:
output <- t(sapply(as.character(ptuples),
function(x) sapply(1:3, function(i) key[substr(x,i,i),])))
However, this is very inefficient and computationally heavy when working with character vectors of length 10^9.
How can this be done efficiently? I was thinking of something with the hashmap package, but I wasn't sure how to do it. I would still like the output in a data frame, as in the proposed solution above.
Thanks!
Here are a couple of methods that seem to be faster than the current approach.
1) This method uses only a single loop, splitting 'ptuples' with strsplit:
t(sapply(strsplit(as.character(ptuples), ""), function(x) c(t(key[x,]))))
2) We paste 'ptuples' into a single string, do the split only once, and cbind the pieces by subsetting:
m1 <- key[strsplit(paste(ptuples, collapse=""), "")[[1]],]
output3 <- cbind(m1[c(TRUE, FALSE, FALSE),], m1[c(FALSE, TRUE, FALSE),],
m1[c(FALSE, FALSE, TRUE),])
Benchmarks
Based on the dataset provided by the OP, the system.time results are:
system.time({
output <- t(sapply(as.character(ptuples),
function(x) sapply(1:3, function(i) key[substr(x,i,i),])))
})
# user system elapsed
# 3.13 0.00 3.28
system.time({
output2 <- t(sapply(strsplit(as.character(ptuples), ""), function(x) c(t(key[x,]))))
})
#user system elapsed
# 1.50 0.01 1.52
system.time({
m1 <- key[strsplit(paste(ptuples, collapse=""), "")[[1]],]
output3 <- cbind(m1[c(TRUE, FALSE, FALSE),], m1[c(FALSE, TRUE, FALSE),],
m1[c(FALSE, FALSE, TRUE),])
})
#user system elapsed
# 0.01 0.00 0.02
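As an optional tidy-up (a sketch, assuming output3 from method 2 above), the 15 columns can be labelled by position within the 3-mer and descriptor name:
# name columns pos1_pah ... pos3_ec so the wide output is self-describing
colnames(output3) <- paste0(rep(paste0("pos", 1:3), each = 5), "_", colnames(key))
head(output3, 2)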

Subsetting a data frame under a specific condition

How can I get the rows of a data frame that have the same value in at least one element as another data frame?
I have written this, but it didn't work:
# example of two data frame
df1 <- data.frame(V1 = c("a", "g", "h", "l", "n", "e"), V2 = c("b", "n", "i", "m", "i", "f"), stringsAsFactors = F)
df2 <- data.frame(V1 = c("a", "c", "f","h"), V2 = c("b", "d", "e","z"), stringsAsFactors = F)
# finding joint values in each element of two data frames
res1<-intersect(df1$V1,df2$V1)
res2<-intersect(df1$V2,df2$V2)
res3<-intersect(df1$V1,df2$V2)
res4<-intersect(df1$V1,df2$V2)
# Getting rows that has joint value at least in one element of df1
ress1<-df1[apply(df1, MARGIN = 1, function(x) all(x== res1)), ]
ress2<-df1[apply(df1, MARGIN = 1, function(x) all(x== res2)), ]
ress3<-df1[apply(df1, MARGIN = 1, function(x) all(x== res3)), ]
ress4<-df1[apply(df1, MARGIN = 1, function(x) all(x== res4)), ]
# Getting rows that has joint value at least in one element of df2
resss1<-df2[apply(df2, MARGIN = 1, function(x) all(x== res1)), ]
resss2<-df2[apply(df2, MARGIN = 1, function(x) all(x== res2)), ]
resss3<-df2[apply(df2, MARGIN = 1, function(x) all(x== res3)), ]
resss4<-df2[apply(df2, MARGIN = 1, function(x) all(x== res4)), ]
# then combine above results
final.res<-rbind(ress1,ress2,ress3,ress4,resss1,resss2,resss3,resss4)
My desired result is:
a b
h z
h i
f e
e f
This should work
#Import data
df1 <- data.frame(V1 = c("a", "g", "h", "l", "n", "e"), V2 = c("b", "n", "i", "m", "i", "f"), stringsAsFactors = F)
df2 <- data.frame(V1 = c("a", "c", "f","h"), V2 = c("b", "d", "e","z"), stringsAsFactors = F)
# Get the intersects
vals <- intersect(c(df1$V1, df1$V2), c(df2$V1, df2$V2))
#Get the subsets and rbind them
full <- rbind(
subset(df1, df1$V1 %in% vals),
subset(df1, df1$V2 %in% vals),
subset(df2, df2$V1 %in% vals),
subset(df2, df2$V2 %in% vals)
)
#Remove duplicates
full <- full[!duplicated(full),]
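Equivalently (a sketch of a more compact variant, reusing the same vals as above), the two subset() calls per data frame can be collapsed into a single logical condition each; the resulting set of rows is the same, though the row order may differ:
# keep rows of each data frame where either column matches a shared value,
# then drop duplicates
full2 <- unique(rbind(
  df1[df1$V1 %in% vals | df1$V2 %in% vals, ],
  df2[df2$V1 %in% vals | df2$V2 %in% vals, ]
))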

ggsave() Error: Unknown input R

I am having trouble with ggsave() from the ggplot2 library. I wrote a function that I pass arguments to, and that is supposed to produce a plot and then save it with ggsave().
Here is some example data and code to reproduce the error:
example.df.1 <- data.frame(matrix(1:100, nrow = 20, ncol = 5))
colnames(example.df.1) <- c("var1", "var2", "var3", "var4", "var5")
rownames(example.df.1) <- c("A", "B", "C", "D", "E", "F", "G", "H",
"I", "J", "K", "L", "M", "N", "O", "P",
"Q", "R", "S", "T")
example.df.2 <- data.frame(matrix(ncol = 2, nrow = 24))
example.df.2[,1] <- c("A", "B", "C", "D", "E", "F", "G", "H",
"I", "J", "K", "L", "M", "N", "O", "P",
"Q", "R", "S", "T", "U", "V", "W", "X")
example.df.2[,2] <- rnorm(24, 10, 2)
problematic_func <- function(data1, col, title, var, data2) {
  # only include rows without missing values
  loc1 <- subset(data1, rowSums(is.na(data1)) == 0)
  loc1 <- cbind(loc1, rank(-as.data.frame(loc1[,1]), ties.method = "first"))
  # reduce data2 to only those rows that correspond to rows in data1
  loc2 <- data2[data2[,1] %in% rownames(loc1),]
  # order loc2
  loc2.ordered <- loc2[order(loc2[,1]),]
  # correlation between loc1 and loc2.ordered
  corr <- cor(loc1[,1], loc2.ordered[,2])
  # creating the plot
  i <- ggplot(loc1, aes_q(x = loc1[,1], y = loc2.ordered))
  i <- i + geom_point(colour = col, size = 4)
  i <- i + ggtitle(title)
  i <- i + xlab(var)
  i <- i + ylab("y-axis")
  i <- i + coord_cartesian(xlim = c(0, max(loc1[,1])),
                           ylim = c(0, max(loc2.ordered[,2])*1.2))
  i <- i + annotate("text", x = max(loc1[,1])*.5, y = 1,
                    label = paste("Correlation coef: ", as.character(corr)), size = 3)
  # saving the plot - this is where the error occurs according
  # to the debugger
  ggsave(filename = paste("my_example_plot_", var, ".png", sep = ""),
         plot = i, device = png, width = 625, height = 625, limitsize = FALSE)
}
for (i in 1:ncol(example.df.1)) {
  sv <- as.data.frame(example.df.1[,i])
  rownames(sv) <- rownames(example.df.1)
  problematic_func(sv, "orange", colnames(example.df.1[i]),
                   colnames(example.df.1[i]), data2 = example.df.2)
}
Edit: Sorry, I forgot to add the error I get:
Error in FUN(X[[2L]], ...) : Unknown input:data.frame
I found the mistake myself. The error was not in the ggsave() function, but in the ggplot() function with which I created the plot in the first place. The correct code must be:
i <- ggplot(loc1, aes_q(x = loc1[,1], y = loc2.ordered[,2]))
The difference is that the y aesthetic needs to be passed a column of loc2.ordered, not the entire dataframe.
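One more note on the ggsave() call itself (a general observation about ggsave() defaults, not part of the fix above): width and height are interpreted in inches by default, which is why 625 required limitsize = FALSE. If pixel dimensions are what is intended, a sketch of the call inside problematic_func could look like this:
# save a 625 x 625 pixel PNG (units = "px" needs a reasonably recent ggplot2;
# older versions only accept "in", "cm", or "mm")
ggsave(filename = paste0("my_example_plot_", var, ".png"),
       plot = i, device = "png", width = 625, height = 625, units = "px")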
