Adding tick names to persp3d plot - r

I have a persp3d plot showing default rates of companies based on the rating (AAA to CCC-) and time (years 1 to 20).
I managed to show a line plot with the axes as numbers, however I would like to have them with the labels AAA to CCC- instead of 5,10,15. I also tried using surface3d() and bbox3d(), but I was not successful.
The code I am using now is:
Y <- c(1:dim(aggregate_row)[2]) # Tenors
X <- c(1:dim(aggregate_row)[1]) # Ratings
Z <- as.matrix.data.frame(aggregate_row)
cc <- colorRamp(rev(rainbow(10)))
Zsc <- (Z-min(Z))/diff(range(Z))
rgbvec2col <- function(x) do.call(rgb,c(as.list(x),list(max=500)))
colvec <- apply(cc(Zsc),1,rgbvec2col)
#surface3d(X,Y,Z,col=colvec)
#bbox3d(color=c("white","black"), xlab=c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "s", "t", "v"))
persp3d(X,Y,Z,col=colvec,xlab="Ratings",ylab="tenor",zlab="Default Rate",front="lines",back="lines")

You didn't include your data, so I'll fake some:
aggregate_row <- matrix(1:400, 20, 20)
The way to get the axes you want is to tell persp3d not to draw axes, and then draw them yourself. For example,
library(rgl)
persp3d(X,Y,Z,col=colvec,xlab="Ratings",ylab="tenor",zlab="Default Rate",front="lines",back="lines",axes=FALSE)
box3d()
axis3d("x") # the default
axis3d("y", at = c(5, 10, 15), labels = c("AAA", "BBB", "CCC"))
axis3d("z-+") # move to a different edge
This produces the following picture:

Related

How to maximize correlation by sampling using R

I have the following reference sequence:
reference_seq <- "KPAACQHRQDKWKNSHWNRFKAYFVVIKKK"
With this function that simply calculate the composition of amino acid sequence:
calculate_aa_content <- function (x)
{
AADict <- c("A", "R", "N", "D", "C", "E", "Q", "G", "H",
"I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")
AAC <- summary(factor(strsplit(x, split = "")[[1]], levels = AADict),
maxsum = 21)/nchar(x)
AAC
}
I can get the AA composition of the reference sequence:
> calculate_aa_content(reference_seq)
A R N D C E
0.10000000 0.06666667 0.06666667 0.03333333 0.03333333 0.00000000
Q G H I L K
0.06666667 0.00000000 0.06666667 0.03333333 0.00000000 0.23333333
M F P S T W
0.00000000 0.06666667 0.03333333 0.03333333 0.00000000 0.06666667
Y V
0.03333333 0.06666667
Then I have the seed sequence:
seed <- "FKDHKHIDVKDRHRTRHLAK??????????"
What I want to do is to sample 10aa represented by 10 question marks (?) from 20 amino acid vector here:
AADict <- c("A", "R", "N", "D", "C", "E", "Q", "G", "H",
"I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")
So that the Pearson correlation of final constructed sequence from seed with reference sequence is maximized (allowing certain number of iterations if not possible to get absolute max).
For example one of the sample from seed sampling, could be:
> sample_1 <- "FKDHKHIDVKDRHRTRHLAKHHHHHHHHHH"
> cor(calculate_aa_content(reference_seq), calculate_aa_content(sample_1))
[1] 0.2955238
But the correlation is low. What I'd like to do is to find max correlation of constructed string from seed with reference sequence after certain number of iteration.
An additional feature with early stopping if the difference with current maximum is within certain threshold, say 0.01, would be appreciated.
Note that the final sequence should not be the same with reference squence.
How can achieve that efficiently with R?
Write a function to simulate new sequences and compute the correlation. Then call the function R times, get the cor components and find the maximum.
calculate_aa_content <- function (x) {
AADict <- c("A", "R", "N", "D", "C", "E", "Q", "G", "H",
"I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")
AAC <- summary(factor(strsplit(x, split = "")[[1]], levels = AADict),
maxsum = 21)/nchar(x)
AAC
}
fun <- function(x, ref, dict = AADict) {
i <- regexpr("\\?", x)
n <- nchar(x) - i + 1L
new <- sample(dict, n, TRUE)
new <- paste(new, collapse = "")
substring(x, i) <- new
list(
new = x,
cor = cor(calculate_aa_content(ref), calculate_aa_content(x))
)
}
reference_seq <- "KPAACQHRQDKWKNSHWNRFKAYFVVIKKK"
seed <- "FKDHKHIDVKDRHRTRHLAK??????????"
AADict <- c("A", "R", "N", "D", "C", "E", "Q", "G", "H",
"I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")
set.seed(2022)
R <- 1e4
res <- replicate(R, fun(seed, reference_seq), simplify = FALSE)
cor_vec <- sapply(res, `[[`, 'cor')
res[which.max(cor_vec)]
#> [[1]]
#> [[1]]$new
#> [1] "FKDHKHIDVKDRHRTRHLAKKKWYKEKWFN"
#>
#> [[1]]$cor
#> [1] 0.7902037
Created on 2022-10-21 with reprex v2.0.2
In the event that there are more than one value equal to the maximum, use instead
res[cor_vec == max(cor_vec)]

How to find optimal string content that minimizes the MSE of character count vectors with its reference string

I have the following reference sequence:
ref_seq <- "MGHQQLYWSHPRKFGQGSRSCRVTSNRHGLIRKYGLNMSRQSFR"
and this seed pattern string:
seed_pattern <- "FKDHKHIDVKDRHRTRHLAK??????????"
There are 10 wildcards (?) in that pattern.
Given this functions:
aa_count_normalized <- function(x) {
AADict <- c(
"A", "R", "N", "D", "C", "E", "Q", "G", "H",
"I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"
)
AAC <- summary(factor(strsplit(x, split = "")[[1]], levels = AADict),
maxsum = 21
) / nchar(x)
AAC
}
aa_count <- function(x) {
AADict <- c(
"A", "R", "N", "D", "C", "E", "Q", "G", "H",
"I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"
)
AAC <- summary(factor(strsplit(x, split = "")[[1]], levels = AADict),
maxsum = 21
)
AAC
}
I can get :
# we need to normalize refseq_aa content with respect
# to the length of seed_pattern, to accommodate the length
# difference between the two.
> refseq_aa_content <- aa_count_normalized(ref_seq) * nchar(seed_pattern)
> refseq_aa_content
A R N D C E
0.0000000 4.7727273 1.3636364 0.0000000 0.6818182 0.0000000
Q G H I L K
2.7272727 3.4090909 2.0454545 0.6818182 2.0454545 1.3636364
M F P S T W
1.3636364 1.3636364 0.6818182 4.0909091 0.6818182 0.6818182
Y V
1.3636364 0.6818182
What I want to do is to replace the wild cards of the seed pattern - while keeping the non-wildcards as it is - with combinations of residue taken from:
AADict <- c(
"A", "R", "N", "D", "C", "E", "Q", "G", "H",
"I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"
)
such that the mean squared error (MSE) of the final amino acid count of the seed sequence and the normalized reference sequence count is minimized.
With this MSE function:
mse <- function (ref, new_seq) {
return(mean((ref - new_seq)^2))
}
and with this final seed sequences:
seed_final.1 <- aa_count("FKDHKHIDVKDRHRTRHLAKRQQGGGSSSY")
seed_final.2 <- aa_count("FKDHKHIDVKDRHRTRHLAKRQQQGGGSSY")
seed_final.3 <- aa_count("FKDHKHIDVKDRHRTRHLAKSSSGGGRRQQ") # onyambu's
I get
> mse(refseq_aa_content, seed_final.1 )
[1] 1.501446
> mse(refseq_aa_content, seed_final.2 )
[1] 1.63781
> mse(refseq_aa_content, seed_final.3 )
[1] 1.560537
The seed_final.1 is the optimal exact solution, because it has the lowest MSE. Namely the 10 ?s is to be replaced with:
G Q R S Y
3 2 1 3 1 (total 10)
How can I create an efficient R code to return FKDHKHIDVKDRHRTRHLAKRQQGGGSSSY
as the answer.
You can model your problem as integer quadratic problem where you want to minimize:
sum(r^2) - 2 sum(z * r)
with constraints:
sum(r) = k
r[i] nonegative integer
where:
r[i] how many ith letters of AADict you need to add to seed_pattern
z[i] = n(y)/n(x) * x[i] - y[i]
x[i] counts of ith letter of AADict in ref_seq
y[i] counts of ith letter of AADict in seed_pattern
n(x) number of characters in ref_seq
n(y) number of charecters in seed_pattern
k number of wild card characters in seed_pattern
I din't manage to find mixed-integer quadratic solver in R (free one) so here is heuristic using DEoptimR:
ref_seq <- "MGHQQLYWSHPRKFGQGSRSCRVTSNRHGLIRKYGLNMSRQSFR"
seed_pattern <- "FKDHKHIDVKDRHRTRHLAK??????????"
AADict <- c(
"A", "R", "N", "D", "C", "E", "Q", "G", "H",
"I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"
)
aminoSummary <- function(x){
f <- factor(strsplit(x, split = "")[[1]], levels = AADict)
list(
l = nchar(x),
k = sum(is.na(f)),
z = table(f)
)
}
x <- aminoSummary(ref_seq)
y <- aminoSummary(seed_pattern)
M <- length(AADict)
res <- DEoptimR::JDEoptim(
lower = rep(0, M),
upper = rep(y$k, M) + 1,
fn = function(r, z, k){
r <- floor(r)
sum(r * r) - 2 * sum(z * r)
},
constr = function(r, z, k) sum(floor(r)) - k,
meq = 1,
z = as.vector(x$z * y$l / x$l - y$z),
k = y$k
)
rep(AADict, floor(res$par))
[1] "R" "Q" "Q" "G" "G" "G" "S" "S" "S" "Y"

Apply multiple functions to a list of matrices and output answers in a data frame

I have the following matrix:
mat<- matrix(c(1,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,
2,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,
0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,1,0,0,1,0,1,1,0,0,1,0,1,
1,1,0,0,0,0,0,0,1,0,1,2,1,0,0,0), nrow=16, ncol=6)
dimnames(mat)<- list(c("a", "c", "f", "h", "i", "j", "l", "m", "p", "q", "s", "t", "u", "v","x", "z"),
c("1", "2", "3", "4", "5", "6"))
I have created a list of matrices using the function below:
lapply(seq_len(ncol(mat) - 1), function(j) do.call(cbind,
lapply(seq_len(ncol(mat) - j), function(i) rowSums(mat[, i:(i + j)]))))
In this function, columns in the original matrix are combined using a moving window method. First, the window size is 2 such that the data in two columns are combined. The window shifts 1 step (1 column) and the next set of two columns are combined. The output is a matrix for each window size. The window size continues to increase such that, the window increases to 3 columns and the results from 3 columns are output into a new matrix. This continues until the window is the size of the maximum number of columns.
I need to run a series of functions on each matrix within the list and output the answers into a data frame. The functions I need to apply are:
Calculate total frequency for each row (ie. row totals ). I attempted this function:
freq <- rowSums(mat[i:(i + j),])
Calculate mean frequency for each row (ie. row totals/length of row). I attempted this function:
mean_freq <- rowSums(mat[i:(i + j),])/length(mat[i:(i + j),])
Multiply the window size * pi * 25.
total_window_size <- length(ncol(mat) - j))*pi*25
Divide the mean frequency for each row by the total window size.
density <- mean_freq/total_window_size
Below are the expected results for the functions above for each matrix in this example list (i.e., result_mat1, result_mat2...). The data frame result_df combines all the results for each sub data frame and is the final output I need:
df for window size 2
result_mat1 <- data.frame( window_size= rep("2",80),
combined_cols= c(rep("1_2",16), rep("2_3",16), rep("3_4",16), rep("4_5",16), rep("5_6",16)),
row_names= c("a", "c", "f", "h", "i", "j", "l", "m", "p", "q", "s", "t", "u", "v","x", "z"),
freq=c(6,3,2,2,6,2,1,2,1,2,3,2,1,2,3,2),
mean_freq=(c(6,3,2,2,6,2,1,2,1,2,3,2,1,2,3,2)/5),
total_window_size= rep(157.08, 16))
result_mat1$density<- result_mat1$mean_freq/result_mat1$total_window_size
df for window size 3
result_mat2 <- data.frame( window_size= rep("3",64),
combined_cols= c(rep("1_2_3",16), rep("2_3_4",16), rep("3_4_5",16), rep("4_5_6",16)),
row_names= c("a", "c", "f", "h", "i", "j", "l", "m", "p", "q", "s", "t", "u", "v","x", "z"),
freq=c(6,4,3,3,7,3,1,2,1,2,3,2,1,2,4,2),
mean_freq=(c(6,4,3,3,7,3,1,2,1,2,3,2,1,2,4,2)/5),
total_window_size= rep(235.62, 16))
result_mat2$density <- result_mat2$mean_freq/result_mat2$total_window_size
df for window size 4
result_mat3 <- data.frame( window_size= rep("4",48),
combined_cols= c(rep("1_2_3_4",16), rep("2_3_4_5",16), rep("3_4_5_6",16)),
row_names= c("a", "c", "f", "h", "i", "j", "l", "m", "p", "q", "s", "t", "u", "v","x", "z"),
freq=c(6,3,3,3,7,3,1,2,1,2,3,2,1,2,4,2),
mean_freq=(c(6,3,3,3,7,3,1,2,1,2,3,2,1,2,4,2)/5),
total_window_size= rep(314, 16))
result_mat3$density <- result_mat3$mean_freq/result_mat3$total_window_size
df for window size 5
result_mat4 <- data.frame( window_size= rep("5",32),
combined_cols= c(rep("1_2_3_4_5",16), rep("2_3_4_5_6",16)),
row_names= c("a", "c", "f", "h", "i", "j", "l", "m", "p", "q", "s", "t", "u", "v","x", "z"),
freq=c(6,3,2,2,6,2,1,2,1,2,3,2,1,2,4,2),
mean_freq=(c(6,3,2,2,6,2,1,2,1,2,3,2,1,2,4,2)/5),
total_window_size= rep(392.5, 16))
result_mat4$density <- result_mat4$mean_freq/result_mat4$total_window_size
df for window size 6
result_mat5 <- data.frame( window_size= rep("6",16),
combined_cols= c(rep("1_2_3_4_5_6",16)),
row_names= c("a", "c", "f", "h", "i", "j", "l", "m", "p", "q", "s", "t", "u", "v","x", "z"),
freq=c(4,2,1,1,3,1,1,1,1,1,2,2,1,1,3,1),
mean_freq=(c(4,2,1,1,3,1,1,1,1,1,2,2,1,1,3,1)/5),
total_window_size= rep(471, 16))
result_mat5$density <- result_mat5$mean_freq/result_mat5$total_window_size
Final data frame with results for all sub data frames combined
result_df <- rbind(result_mat1, result_mat2, result_mat3, result_mat4, result_mat5)
I need help applying those 4 functions to each element of the list and output the results to one data frame.
Here is a start. I'm not sure how the combined_cols is supposed to be added to each data.frame since it is a different size (seems each is longer than all of the other data.frame columns.) I'm not sure if all of these calculations are exactly right, but this at least demonstrates the crux of the question 'how to iterate through a list, assemble some data.frames with results, and merge them into one big data.frame.)
myList <- lapply(seq_len(ncol(mat) - 1), function(j) do.call(cbind, lapply(seq_len(ncol(mat) - j), function(i) rowSums(mat[, i:(i + j)]))))
myListOutput <- list()
for (i in 1:length(myList)) {
print(i)
myMat = myList[[i]]
freq <- rowSums(myMat)
window_size = rep(as.character(i + 1), length(freq))
# your final data sample shows dividing by 5 on each one,
# but your pseudo code shows something to do with the columns
mean_freq <- rowSums(myMat)/(ncol(myMat))
total_window_size <- rep((i+1)*pi*25, length(freq))
density <- mean_freq/total_window_size
myDf = data.frame(window_size, freq, mean_freq, total_window_size, density)
myListOutput[[i]] <- myDf
}
result_df = do.call(rbind, myListOutput)

Efficient way of replacing character string with numeric values based on data frame "dictionary"

This is related to this question:
How to convert a string of text into a vector based on given values numeric values to replace each letter with
For ease, I will put the same information here:
aminoacid <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")
aminoacid1 <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")
aminoacid2 <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")
df <- expand.grid(aminoacid, aminoacid1, aminoacid2)
df <- transform(df, newname = paste(df$Var1, df$Var2, df$Var3, sep=""))
ptuples <- df[,4]
I want to convert each of these strings (length 3) into a vector of length 15 by defining each amino acid (letter) with 5 numbers, as shown in this table.
key <-
read.table(
text = " pah pss ms cc ec
A -0.59145974 -1.30209266 -0.7330651 1.5703918 -0.14550842
C -1.34267179 0.46542300 -0.8620345 -1.0200786 -0.25516894
D 1.05015062 0.30242411 -3.6559147 -0.2590236 -3.24176791
E 1.35733226 -1.45275578 1.4766610 0.1129444 -0.83715681
F -1.00610084 -0.59046634 1.8909687 -0.3966186 0.41194139
G -0.38387987 1.65201497 1.3301017 1.0449765 2.06385566
H 0.33616543 -0.41662780 -1.6733690 -1.4738898 -0.07772917
I -1.23936304 -0.54652238 2.1314349 0.3931618 0.81630366
K 1.83146558 -0.56109831 0.5332237 -0.2771101 1.64762794
L -1.01895162 -0.98693471 -1.5046185 1.2658296 -0.91181195
M -0.66312569 -1.52353917 2.2194787 -1.0047207 1.21181214
N 0.94535614 0.82846219 1.2991286 -0.1688162 0.93339498
P 0.18862522 2.08084151 -1.6283286 0.4207004 -1.39177378
Q 0.93056541 -0.17926549 -3.0048731 -0.5025910 -1.85303476
R 1.53754853 -0.05472897 1.5021086 0.4403185 2.89744417
S -0.22788299 1.39869991 -4.7596375 0.6701745 -2.64747356
T -0.03181782 0.32571153 2.2134612 0.9078985 1.31337035
V -1.33661279 -0.27854634 -0.5440132 1.2419935 -1.26225362
W -0.59533918 0.00907760 0.6719274 -2.1275244 -0.18358096
Y 0.25999617 0.82992312 3.0973596 -0.8380164 1.51150958"
)
The method proposed in this post was:
output <- t(sapply(as.character(ptuples),
function(x) sapply(1:3, function(i) key[substr(x,i,i),])))
However, this is very inefficient and computationally heavy when working with character vectors with length 10^9
How can this be done efficiently? I was thinking something with package hashmap, but I wasn't sure how to do it. I would still like the output in a data frame like in the proposed solution above.
Thanks!
Here are couple of methods that seems to be faster than the current approach.
1) This method uses only a single loop and with strsplit splits the 'ptuples'
t(sapply(strsplit(as.character(ptuples), ""), function(x) c(t(key[x,])
2) We paste into a single string and then do the split once and cbind by subsetting
m1 <- key[strsplit(paste(ptuples, collapse=""), "")[[1]],]
output3 <- cbind(m1[c(TRUE, FALSE, FALSE),], m1[c(FALSE, TRUE, FALSE),],
m1[c(FALSE, FALSE, TRUE),])
Benchmarks
Based on the dataset provided by OP, the system.time are
system.time({
output <- t(sapply(as.character(ptuples),
function(x) sapply(1:3, function(i) key[substr(x,i,i),])))
})
# user system elapsed
# 3.13 0.00 3.28
system.time({
output2 <- t(sapply(strsplit(as.character(ptuples), ""), function(x) c(t(key[x,]))))
})
#user system elapsed
# 1.50 0.01 1.52
system.time({
m1 <- key[strsplit(paste(ptuples, collapse=""), "")[[1]],]
output3 <- cbind(m1[c(TRUE, FALSE, FALSE),], m1[c(FALSE, TRUE, FALSE),],
m1[c(FALSE, FALSE, TRUE),])
})
#user system elapsed
# 0.01 0.00 0.02

ggave() Error: Unknown input R

I am having trouble with ggsave() from the ggplot2 library. I wrote a function that I pass arguments to, and that is supposed to produce and then save the results with ggsave().
Here is some example data and code to reproduce the error:
example.df.1 <- data.frame(matrix(1:100, nrow = 20, ncol = 5))
colnames(example.df.1) <- c("var1", "var2", "var3", "var4", "var5")
rownames(example.df.1) <- c("A", "B", "C", "D", "E", "F", "G", "H",
"I", "J", "K", "L", "M", "N", "O", "P",
"Q", "R", "S", "T")
example.df.2 <- data.frame(matrix(ncol = 2, nrow = 24))
example.df.2[,1] <- c("A", "B", "C", "D", "E", "F", "G", "H",
"I", "J", "K", "L", "M", "N", "O", "P",
"Q", "R", "S", "T", "U", "V", "W", "X")
example.df.2[,2] <- rnorm(24, 10, 2)
problematic_func <- function(data1, col, title, var, data2) {
# only include rows without missing values
loc1 <- subset(data1, rowSums(is.na(data1)) == 0)
loc1 <- cbind(loc1, rank(-as.data.frame(loc1[,1]), ties.method = "first"))
# reduce data2 to only those rows that correspond to rows in data1
loc2 <- data2[data2[,1] %in% rownames(loc1),]
# order loc2
loc2.ordered <- loc2[order(loc2[,1]),]
# correlation between loc1 and loc2.ordered
corr <- cor(loc1[,1], loc2.ordered[,2])
# creating the plot
i <- ggplot(loc1, aes_q(x = loc1[,1], y = loc2.ordered))
i <- i + geom_point(colour = col, size = 4)
i <- i + ggtitle(title)
i <- i + xlab(var)
i <- i + ylab("y-axis")
i <- i + coord_cartesian(xlim = c(0, max(loc1[,1])),
ylim = c(0, max(loc2.ordered[,2])*1.2))
i <- i + annotate("text", x = max(loc1[,1])*.5, y = 1,
label = paste("Correlation coef: ", as.character(corr)), size = 3)
# saving the plot - this is where the error occurs according
# to the debugger
ggsave(filename = paste("my_example_plot_", var, ".png", sep = ""),
plot = i, device = png, width = 625, height = 625, limitsize = FALSE)
}
for (i in 1:ncol(example.df.1)) {
sv <- as.data.frame(example.df.1[,i])
rownames(sv) <- rownames(example.df.1)
problematic_func(sv, "orange", colnames(example.df.1[i]),
colnames(example.df.1[i]), data2 = example.df.2)
}
Edit: Sorry, I forgot to add the error I get:
Error in FUN(X[[2L]], ...) : Unknown input:data.frame
I found the mistake myself. The error was not in the ggsave() function, but in the ggplot() function with which I created the plot in the first place. The correct code must be:
i <- ggplot(loc1, aes_q(x = loc1[,1], y = loc2.ordered[,2]))
The difference is that the y aesthetic needs to be passed a column of loc2.ordered, not the entire dataframe.

Resources