I'm seeking to write code that takes a column with character values (e.g. ALA3=VAL20) and converts each three-letter code to its corresponding single letter (e.g. A3=V20). I attempted the following function, but I'm having trouble:
substitute_codes <- function(data, col_name) {
  # Create a dictionary of the code substitutions
  code_dict <- c("ALA" = "A", "ARG" = "R", "ASN" = "N", "ASP" = "D",
                 "CYS" = "C", "GLU" = "E", "GLN" = "Q", "GLY" = "G",
                 "HIS" = "H", "ILE" = "I", "LEU" = "L", "LYS" = "K",
                 "MET" = "M", "PHE" = "F", "PRO" = "P", "SER" = "S",
                 "THR" = "T", "TRP" = "W", "TYR" = "Y", "VAL" = "V")
  # Apply the substitutions using gsub()
  data[[col_name]] <- gsub(paste(names(code_dict), collapse = "|"),
                           paste(code_dict, collapse = ""),
                           data[[col_name]])
  return(data)
}
But I get results like the following: ALA3=VAL20 becomes ARNDCEQGHILKMFPSTWYV3=ARNDCEQGHILKMFPSTWYV20.
As Darren Tsai points out in the comments, we can just use code_dict in str_replace_all():
library(stringr)
set.seed(123)
x <- sample(names(code_dict), 10)
y <- sample(names(code_dict), 10)
my_string <- paste0(x, sample(10), "=", y, sample(10))
my_string
#> [1] "PRO4=PHE9" "TYR1=CYS10" "PHE3=HIS2" "ASN7=ASN7" "ILE5=GLY3"
#> [6] "ARG10=ILE4" "GLU8=GLN1" "LEU2=PRO6" "CYS9=TRP5" "ASP6=THR8"
str_replace_all(my_string, code_dict)
#> [1] "P4=F9" "Y1=C10" "F3=H2" "N7=N7" "I5=G3" "R10=I4" "E8=Q1" "L2=P6"
#> [9] "C9=W5" "D6=T8"
For more complex replacements, we could use a custom function inside str_replace_all() as the replacement argument. The custom function below, shorten_str, just uses a classic lookup with base R's match(), but we could add any kind of complexity here.
library(stringr)
# our dictionary
code_dict <- c("ALA" = "A", "ARG" = "R", "ASN" = "N", "ASP" = "D",
               "CYS" = "C", "GLU" = "E", "GLN" = "Q", "GLY" = "G",
               "HIS" = "H", "ILE" = "I", "LEU" = "L", "LYS" = "K",
               "MET" = "M", "PHE" = "F", "PRO" = "P", "SER" = "S",
               "THR" = "T", "TRP" = "W", "TYR" = "Y", "VAL" = "V")
# let's create a toy string
set.seed(123)
x <- sample(names(code_dict), 10)
y <- sample(names(code_dict), 10)
my_string <- paste0(x, sample(10), "=", y, sample(10))
my_string
#> [1] "PRO4=PHE9" "TYR1=CYS10" "PHE3=HIS2" "ASN7=ASN7" "ILE5=GLY3"
#> [6] "ARG10=ILE4" "GLU8=GLN1" "LEU2=PRO6" "CYS9=TRP5" "ASP6=THR8"
# custom function to replace string
shorten_str <- function(abr) {
  code_dict[match(abr, names(code_dict))]
}
# implementation with `str_replace_all()`
str_replace_all(my_string,
                paste(names(code_dict), collapse = "|"),
                shorten_str)
#> [1] "P4=F9" "Y1=C10" "F3=H2" "N7=N7" "I5=G3" "R10=I4" "E8=Q1" "L2=P6"
#> [9] "C9=W5" "D6=T8"
Created on 2023-02-16 by the reprex package (v2.0.1)
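For completeness, if you'd rather stay in base R, here is a minimal sketch of the original function with the gsub() call repaired: each code is substituted in turn with fixed (non-regex) matching, which avoids the single-pattern/single-replacement problem above.
# Base-R sketch: apply one substitution per code with fixed = TRUE.
substitute_codes <- function(data, col_name) {
  code_dict <- c("ALA" = "A", "ARG" = "R", "ASN" = "N", "ASP" = "D",
                 "CYS" = "C", "GLU" = "E", "GLN" = "Q", "GLY" = "G",
                 "HIS" = "H", "ILE" = "I", "LEU" = "L", "LYS" = "K",
                 "MET" = "M", "PHE" = "F", "PRO" = "P", "SER" = "S",
                 "THR" = "T", "TRP" = "W", "TYR" = "Y", "VAL" = "V")
  # Fold over the codes, replacing one three-letter code at a time
  data[[col_name]] <- Reduce(
    function(s, abr) gsub(abr, code_dict[[abr]], s, fixed = TRUE),
    names(code_dict),
    init = data[[col_name]]
  )
  data
}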
This is what I have:
> miniDF
site1 site2 site3 site4 site5
Alpha G T A C T
Beta G T A T T
Delta G T G C T
Gamma G C A T T
Eps G T A T T
Pi A T A T T
Omi G T A C A
miniDF = structure(list(site1 = c("G", "G", "G", "G", "G", "A", "G"),
                        site2 = c("T", "T", "T", "C", "T", "T", "T"),
                        site3 = c("A", "A", "G", "A", "A", "A", "A"),
                        site4 = c("C", "T", "C", "T", "T", "T", "C"),
                        site5 = c("T", "T", "T", "T", "T", "T", "A")),
                   row.names = c("Alpha", "Beta", "Delta", "Gamma", "Eps", "Pi", "Omi"),
                   class = "data.frame")
I'd like to convert it to a list structure for a venn diagram or upset plot, where the presence of a unique letter in a column puts that site into that row name's entry in the list:
myList = list('Alpha'=c('site4'), 'Beta'=c(), 'Delta'=c('site3', 'site4'), 'Gamma'=c('site2'), 'Eps'=c(), 'Pi'=c('site1'), 'Omi'=c('site4','site5'))
Alpha only has one unique site (a column with a unique cell), Beta has none, but Delta and Omi have two unique sites each.
Unique in this context means that the cell is different from the other cells in that column. So for site1, A is the unique value (all the other values are G), so Pi includes that site in its array.
For columns where more than one cell has a different value, like site4, I take the value of the first row to be the unique value; hence Alpha, Delta, and Omi include site4 in their arrays.
Assume I have a few hundred columns.
How can I do this?
We create a function to find the "unique" values, then apply it to every column, and finally go through each row to see which columns hold the unique values.
I've used just base R. The code could probably be a bit more concise if we switched to purrr functions (a sketch follows the matrix version below), or possibly more efficient if we used a matrix instead of a data frame.
pseudo_unique = function(x) {
  tx = sort(table(x))
  if(tx[1] == 1) return(names(tx[1])) else return(x[1])
}
u_vals = lapply(miniDF, pseudo_unique)
result = lapply(
  row.names(miniDF),
  \(row) names(miniDF)[which(unlist(Map("==", u_vals, miniDF[row, ])))]
)
names(result) = row.names(miniDF)
result
# $Alpha
# [1] "site4"
#
# $Beta
# character(0)
#
# $Delta
# [1] "site3" "site4"
#
# $Gamma
# [1] "site2"
#
# $Eps
# character(0)
#
# $Pi
# [1] "site1"
#
# $Omi
# [1] "site4" "site5"
Here's the matrix version for the same result. With a few hundred columns, I'd recommend this version.
miniMat = as.matrix(miniDF)
u_vals = apply(miniMat, 2, pseudo_unique)
result = apply(miniMat, 1, \(row) colnames(miniMat)[row == u_vals], simplify = FALSE)
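As mentioned above, a purrr version can be a little more concise. A sketch, assuming the same pseudo_unique() from above and that the purrr package is available:
library(purrr)
# "unique" value per column, then the matching columns per row
u_vals <- map_chr(miniDF, pseudo_unique)
result <- map(
  set_names(rownames(miniDF)),
  \(r) names(miniDF)[unlist(miniDF[r, ]) == u_vals]
)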
Here's a solution in the tidyverse.
Solution
First import the tidyverse and generate your dataset miniDF.
library(tidyverse)
# ...
# Code to generate 'miniDF'.
# ...
Then define the custom function are_unique() to properly identify which values in each column you consider "unique".
are_unique <- function(x) {
  # Return an empty logical vector for an empty input...
  if(length(x) < 1) {
    return(logical(0))
  }
  # ...and otherwise identify which input values are strictly unique.
  are_unique <- !x %in% x[duplicated(x)]
  # If unique values actually exist, return that identification as is...
  if(any(are_unique)) {
    return(are_unique)
  }
  # ...and otherwise default to treating the first value as "unique"...
  token_unique <- x[1]
  # ...and identify its every occurrence.
  x == token_unique
}
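Just to show what are_unique() returns, here is a quick check on two toy vectors (my own illustration, not part of the original workflow):
are_unique(c("G", "G", "G", "A", "G"))   # only the "A" is strictly unique
#> [1] FALSE FALSE FALSE  TRUE FALSE
are_unique(c("C", "T", "C", "T", "T"))   # no strict uniques, so the first value "C" is treated as "unique"
#> [1]  TRUE FALSE  TRUE FALSE FALSE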
Finally, apply this tidy workflow:
miniDF %>%
  # Make the letters (row names) a column of their own.
  rownames_to_column("letter") %>%
  # In every other column, identify which values you consider "unique".
  mutate(across(!letter, are_unique)) %>%
  # Pivot into 'col_name | is_unique' format for easy filtration.
  pivot_longer(!letter, names_to = "col_name", values_to = "is_unique") %>%
  # Split by letter into a list, with the subset of rows for each letter.
  split(.$letter) %>%
  # Convert each subset into the vector of 'col_name's that filter as "unique".
  sapply(function(x) {x$col_name[x$is_unique]})
Result
Given a miniDF like your sample here
miniDF <- structure(
list(
site1 = c("G", "G", "G", "G", "G", "A", "G"),
site2 = c("T", "T", "T", "C", "T", "T", "T"),
site3 = c("A", "A", "G", "A", "A", "A", "A"),
site4 = c("C", "T", "C", "T", "T", "T", "C"),
site5 = c("T", "T", "T", "T", "T", "T", "A")
),
row.names = c("Alpha", "Beta", "Delta", "Gamma", "Eps", "Pi", "Omi"),
class = "data.frame"
)
this solution should produce the following list:
list(
Alpha = "site4",
Beta = character(0),
Delta = c("site3", "site4"),
Eps = character(0),
Gamma = "site2",
Omi = c("site4", "site5"),
Pi = "site1"
)
Note
The answer here by @GregorThomas should likely supersede my own. While my answer was technically posted first, I deleted it to fix an error, and Gregor's functional solution was posted before I finally undeleted mine.
Gregor's is likely more elegant anyway.
I have the following matrix:
mat <- matrix(c(1,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,
                2,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,
                0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
                0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
                0,0,0,0,1,0,0,1,0,1,1,0,0,1,0,1,
                1,1,0,0,0,0,0,0,1,0,1,2,1,0,0,0), nrow = 16, ncol = 6)
dimnames(mat) <- list(c("a", "c", "f", "h", "i", "j", "l", "m", "p", "q", "s", "t", "u", "v", "x", "z"),
                      c("1", "2", "3", "4", "5", "6"))
I have created a list of matrices using the function below:
lapply(seq_len(ncol(mat) - 1), function(j) do.call(cbind,
lapply(seq_len(ncol(mat) - j), function(i) rowSums(mat[, i:(i + j)]))))
In this function, columns of the original matrix are combined using a moving-window method. First, the window size is 2, so the data in two adjacent columns are summed. The window then shifts by one column and the next pair of columns is summed, and so on. The output is one matrix per window size. The window size keeps increasing (next, three columns are combined into a new matrix) until the window spans every column.
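To make that structure concrete, here is a quick check of the list's dimensions (my own illustration; win_list is just a name used here):
win_list <- lapply(seq_len(ncol(mat) - 1), function(j) do.call(cbind,
  lapply(seq_len(ncol(mat) - j), function(i) rowSums(mat[, i:(i + j)]))))
sapply(win_list, dim)   # 16 rows each; 5, 4, 3, 2, 1 combined columns for window sizes 2..6
#>      [,1] [,2] [,3] [,4] [,5]
#> [1,]   16   16   16   16   16
#> [2,]    5    4    3    2    1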
I need to run a series of functions on each matrix within the list and output the answers into a data frame. The functions I need to apply are:
Calculate the total frequency for each row (i.e. row totals). I attempted this function:
freq <- rowSums(mat[i:(i + j),])
Calculate the mean frequency for each row (i.e. row totals / length of row). I attempted this function:
mean_freq <- rowSums(mat[i:(i + j),])/length(mat[i:(i + j),])
Multiply the window size * pi * 25.
total_window_size <- length(ncol(mat) - j)*pi*25
Divide the mean frequency for each row by the total window size.
density <- mean_freq/total_window_size
Below are the expected results for the functions above for each matrix in this example list (i.e., result_mat1, result_mat2...). The data frame result_df combines all the results for each sub data frame and is the final output I need:
df for window size 2
result_mat1 <- data.frame( window_size= rep("2",80),
combined_cols= c(rep("1_2",16), rep("2_3",16), rep("3_4",16), rep("4_5",16), rep("5_6",16)),
row_names= c("a", "c", "f", "h", "i", "j", "l", "m", "p", "q", "s", "t", "u", "v","x", "z"),
freq=c(6,3,2,2,6,2,1,2,1,2,3,2,1,2,3,2),
mean_freq=(c(6,3,2,2,6,2,1,2,1,2,3,2,1,2,3,2)/5),
total_window_size= rep(157.08, 16))
result_mat1$density<- result_mat1$mean_freq/result_mat1$total_window_size
df for window size 3
result_mat2 <- data.frame( window_size= rep("3",64),
combined_cols= c(rep("1_2_3",16), rep("2_3_4",16), rep("3_4_5",16), rep("4_5_6",16)),
row_names= c("a", "c", "f", "h", "i", "j", "l", "m", "p", "q", "s", "t", "u", "v","x", "z"),
freq=c(6,4,3,3,7,3,1,2,1,2,3,2,1,2,4,2),
mean_freq=(c(6,4,3,3,7,3,1,2,1,2,3,2,1,2,4,2)/5),
total_window_size= rep(235.62, 16))
result_mat2$density <- result_mat2$mean_freq/result_mat2$total_window_size
df for window size 4
result_mat3 <- data.frame( window_size= rep("4",48),
combined_cols= c(rep("1_2_3_4",16), rep("2_3_4_5",16), rep("3_4_5_6",16)),
row_names= c("a", "c", "f", "h", "i", "j", "l", "m", "p", "q", "s", "t", "u", "v","x", "z"),
freq=c(6,3,3,3,7,3,1,2,1,2,3,2,1,2,4,2),
mean_freq=(c(6,3,3,3,7,3,1,2,1,2,3,2,1,2,4,2)/5),
total_window_size= rep(314, 16))
result_mat3$density <- result_mat3$mean_freq/result_mat3$total_window_size
df for window size 5
result_mat4 <- data.frame( window_size= rep("5",32),
combined_cols= c(rep("1_2_3_4_5",16), rep("2_3_4_5_6",16)),
row_names= c("a", "c", "f", "h", "i", "j", "l", "m", "p", "q", "s", "t", "u", "v","x", "z"),
freq=c(6,3,2,2,6,2,1,2,1,2,3,2,1,2,4,2),
mean_freq=(c(6,3,2,2,6,2,1,2,1,2,3,2,1,2,4,2)/5),
total_window_size= rep(392.5, 16))
result_mat4$density <- result_mat4$mean_freq/result_mat4$total_window_size
df for window size 6
result_mat5 <- data.frame( window_size= rep("6",16),
combined_cols= c(rep("1_2_3_4_5_6",16)),
row_names= c("a", "c", "f", "h", "i", "j", "l", "m", "p", "q", "s", "t", "u", "v","x", "z"),
freq=c(4,2,1,1,3,1,1,1,1,1,2,2,1,1,3,1),
mean_freq=(c(4,2,1,1,3,1,1,1,1,1,2,2,1,1,3,1)/5),
total_window_size= rep(471, 16))
result_mat5$density <- result_mat5$mean_freq/result_mat5$total_window_size
Final data frame with results for all sub data frames combined
result_df <- rbind(result_mat1, result_mat2, result_mat3, result_mat4, result_mat5)
I need help applying those four functions to each element of the list and outputting the results to one data frame.
Here is a start. I'm not sure how combined_cols is supposed to be added to each data.frame since it is a different size (it seems each is longer than all of the other data.frame columns). I'm also not sure if all of these calculations are exactly right, but this at least demonstrates the crux of the question: how to iterate through a list, assemble some data.frames with results, and merge them into one big data.frame.
myList <- lapply(seq_len(ncol(mat) - 1), function(j) do.call(cbind,
  lapply(seq_len(ncol(mat) - j), function(i) rowSums(mat[, i:(i + j)]))))
myListOutput <- list()
for (i in 1:length(myList)) {
  print(i)
  myMat = myList[[i]]
  freq <- rowSums(myMat)
  window_size = rep(as.character(i + 1), length(freq))
  # your final data sample shows dividing by 5 on each one,
  # but your pseudo code shows something to do with the columns
  mean_freq <- rowSums(myMat)/(ncol(myMat))
  total_window_size <- rep((i+1)*pi*25, length(freq))
  density <- mean_freq/total_window_size
  myDf = data.frame(window_size, freq, mean_freq, total_window_size, density)
  myListOutput[[i]] <- myDf
}
result_df = do.call(rbind, myListOutput)
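If you also need the combined_cols labels, one option (just a sketch of one possible reading, given the length mismatch noted above) is to name each window's columns after the original columns they span and let data.frame() recycle the length-16 vectors across those label blocks, which reproduces the shape of your expected result_mat* data frames:
# Sketch: label each window's combined columns ("1_2", "2_3", ...) and carry
# the labels into the per-window data frames.
myList <- lapply(seq_len(ncol(mat) - 1), function(j) {
  cols <- seq_len(ncol(mat) - j)
  out  <- do.call(cbind, lapply(cols, function(i) rowSums(mat[, i:(i + j)])))
  colnames(out) <- vapply(cols, function(i)
    paste(colnames(mat)[i:(i + j)], collapse = "_"), character(1))
  out
})
myListOutput <- lapply(seq_along(myList), function(k) {
  myMat <- myList[[k]]
  freq  <- rowSums(myMat)
  data.frame(
    window_size       = as.character(k + 1),
    combined_cols     = rep(colnames(myMat), each = nrow(myMat)),
    row_names         = rownames(myMat),     # recycled across the label blocks
    freq              = freq,                # recycled likewise
    # the expected output divides by 5 throughout; ncol(myMat) would be the
    # other reading of "length of row"
    mean_freq         = freq / 5,
    total_window_size = (k + 1) * pi * 25
  )
})
result_df <- do.call(rbind, myListOutput)
result_df$density <- result_df$mean_freq / result_df$total_window_size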
I am trying to create a data frame that pulls combinations of values from a data frame, applies a function to the data, and finally assigns the labels of the combinations as identifiers in the data frame.
To create a simple mockup of the data:
A = (1:20)
B = seq(1, 40, 2)
To create a mockup of each identifier:
L = c("q", "r", "q", "p", "q", "p", "r", "r", "r", "r", "r", "r", "r", "r", "o", "p", "q", "r", "r", "q")
The mockup data frame is then created with:
dat.example = data.frame(A, B, L)
Here, for each column, the value in each row is subtracted from the value in every later row, i.e. differences are taken for all possible pairwise combinations of rows.
diffA = combn(dat.example$A, 2, FUN = diff)
diffB = combn(dat.example$B, 2, FUN = diff)
After all possible combination of differences within columns A and B are calculated, a function is applied on the generated output.
calc.diff = sqrt((diffA^2) + (diffB^2))
Now comes the tricky part, where I pull out the identifiers for the combinations.
(L.list = combn(dat.example$L, 2, simplify = F))
L.list.1 = sapply(L.list, "[[", 1)
L.list.2 = sapply(L.list, "[[", 2)
L.dat = data.frame(
X = L.list.1,
Y = L.list.2
)
L.dat$comp = with(
L.dat,
interaction(X, Y, sep = "_")
)
And combine them into a master dataset
master = data.frame(
calc.diff,
L.dat$comp
)
Is there a simpler way to perform this that I am missing?
Update: I just realised the labels I created in this fashion are duplicated, as p_q is equivalent to q_p since only the magnitude of the value matters after the calc.diff calculation. Therefore, this question still needs a better answer. My present solution is to simply substitute each occurrence with its twin in the dataset, but this becomes substantially more difficult the more labels there are.
Update 2: I have solved the issue with a pure R solution that properly assigns unique combinations of labels to the dataset. Below I paste the entire code for generating the above dataset and assigning the right labels. Improvements are of course welcome.
A = (1:20)
B = seq(1, 40, 2)
L = c("q", "r", "q", "p", "q", "p", "r", "r", "r", "r", "r", "r", "r", "r", "o", "p", "q", "r", "r", "q")
dat.example = data.frame(A, B, L)
diffA = combn(dat.example$A, 2, FUN = diff)
diffB = combn(dat.example$B, 2, FUN = diff)
calc.diff = sqrt((diffA^2) + (diffB^2))
L.dat.comp <- combn(dat.example$L, 2, paste0, collapse = "_", simplify=F)
master <- data.frame(
calc.diff,
id = unlist(L.dat.comp))
Function to help with combinations
expand.grid.unique <- function(x, y, include.equals = FALSE) {
  x <- unique(x)
  y <- unique(y)
  g <- function(i) {
    z <- setdiff(y, x[seq_len(i - include.equals)])
    if(length(z)) cbind(x[i], z, deparse.level = 0)
  }
  do.call(rbind, lapply(seq_along(x), g))
}
Create dataframe with unique combinations of id labels.
dat.combo = as.data.frame(
  expand.grid.unique(L, L, include.equals = T)
)
dat.combo$c1 = with(dat.combo, interaction(V1, V2, sep = "_"))
dat.combo$c2 = with(dat.combo, interaction(V2, V1, sep = "_"))
master$id2 = master$id
Where ids match dat.combo$c2, replace with dat.combo$c1
master$id2[] = setNames(dat.combo$c1,
                        dat.combo$c2)[as.character(unlist(master$id2))]
Where NAs are present in id2, replace with values in id
master$id2[is.na(master$id2)] = master$id[is.na(master$id2)]
Based on Jay's comment for the "tricky part":
L.dat.comp <- combn(dat.example$L, 2, paste0, collapse = "_", simplify=F)
master <- data.frame(cbind(calc.diff,unlist(L.dat.comp)))
So the whole thing looks like:
A = (1:20)
B = seq(1, 40, 2)
L = c("q", "r", "q", "p", "q", "p", "r", "r", "r", "r", "r", "r", "r", "r", "o", "p", "q", "r", "r", "q")
dat.example = data.frame(A, B, L)
diffA = combn(dat.example$A, 2, FUN = diff)
diffB = combn(dat.example$B, 2, FUN = diff)
calc.diff = sqrt((diffA^2) + (diffB^2))
L.dat.comp <- combn(dat.example$L, 2, paste0, collapse = "_", simplify=F)
master <- data.frame(cbind(calc.diff,unlist(L.dat.comp)))
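Following up on the Update in the question (p_q and q_p should count as the same label), one sketch is to sort each pair before pasting; building the data frame directly, rather than via cbind() (which coerces calc.diff to character), also keeps the distances numeric. The column names here are my own choice:
# Sketch: sort each label pair so "q_p" and "p_q" collapse to the same id,
# and build the data frame directly to keep calc.diff numeric.
L.dat.comp <- combn(dat.example$L, 2,
                    function(p) paste(sort(p), collapse = "_"),
                    simplify = FALSE)
master <- data.frame(dist = calc.diff, id = unlist(L.dat.comp))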
I am having trouble with ggsave() from the ggplot2 library. I wrote a function that I pass arguments to, which is supposed to produce a plot and then save it with ggsave().
Here is some example data and code to reproduce the error:
example.df.1 <- data.frame(matrix(1:100, nrow = 20, ncol = 5))
colnames(example.df.1) <- c("var1", "var2", "var3", "var4", "var5")
rownames(example.df.1) <- c("A", "B", "C", "D", "E", "F", "G", "H",
"I", "J", "K", "L", "M", "N", "O", "P",
"Q", "R", "S", "T")
example.df.2 <- data.frame(matrix(ncol = 2, nrow = 24))
example.df.2[,1] <- c("A", "B", "C", "D", "E", "F", "G", "H",
"I", "J", "K", "L", "M", "N", "O", "P",
"Q", "R", "S", "T", "U", "V", "W", "X")
example.df.2[,2] <- rnorm(24, 10, 2)
problematic_func <- function(data1, col, title, var, data2) {
  # only include rows without missing values
  loc1 <- subset(data1, rowSums(is.na(data1)) == 0)
  loc1 <- cbind(loc1, rank(-as.data.frame(loc1[,1]), ties.method = "first"))
  # reduce data2 to only those rows that correspond to rows in data1
  loc2 <- data2[data2[,1] %in% rownames(loc1),]
  # order loc2
  loc2.ordered <- loc2[order(loc2[,1]),]
  # correlation between loc1 and loc2.ordered
  corr <- cor(loc1[,1], loc2.ordered[,2])
  # creating the plot
  i <- ggplot(loc1, aes_q(x = loc1[,1], y = loc2.ordered))
  i <- i + geom_point(colour = col, size = 4)
  i <- i + ggtitle(title)
  i <- i + xlab(var)
  i <- i + ylab("y-axis")
  i <- i + coord_cartesian(xlim = c(0, max(loc1[,1])),
                           ylim = c(0, max(loc2.ordered[,2])*1.2))
  i <- i + annotate("text", x = max(loc1[,1])*.5, y = 1,
                    label = paste("Correlation coef: ", as.character(corr)), size = 3)
  # saving the plot - this is where the error occurs according
  # to the debugger
  ggsave(filename = paste("my_example_plot_", var, ".png", sep = ""),
         plot = i, device = png, width = 625, height = 625, limitsize = FALSE)
}
for (i in 1:ncol(example.df.1)) {
  sv <- as.data.frame(example.df.1[,i])
  rownames(sv) <- rownames(example.df.1)
  problematic_func(sv, "orange", colnames(example.df.1[i]),
                   colnames(example.df.1[i]), data2 = example.df.2)
}
Edit: Sorry, I forgot to add the error I get:
Error in FUN(X[[2L]], ...) : Unknown input:data.frame
I found the mistake myself. The error was not in the ggsave() function, but in the ggplot() function with which I created the plot in the first place. The correct code must be:
i <- ggplot(loc1, aes_q(x = loc1[,1], y = loc2.ordered[,2]))
The difference is that the y aesthetic needs to be passed a column of loc2.ordered, not the entire dataframe.
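To illustrate the underlying issue with a hypothetical toy example (df1 and df2 are made-up names): aes()/aes_q() expects column vectors, not whole data frames.
library(ggplot2)
df1 <- data.frame(x = 1:5)
df2 <- data.frame(id = letters[1:5], y = (1:5)^2)
# Mapping a whole data frame errors (older ggplot2 versions report the
# "Unknown input: data.frame" message seen above):
# ggplot(df1, aes(x = x, y = df2)) + geom_point()
# Mapping a single column works:
ggplot(df1, aes(x = x, y = df2$y)) + geom_point()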