I am trying to create a nested for loop, but the dimensions of mdata are mostly incorrect. For instance, the values of length(mdata[[i]]) for i in 1:5 are almost all 81, when they should be 81, 50, 60, etc. After the first five, the lengths all become 108. Is there something wrong with my nested loop?
# Build one summary row per range in olap[[i]] and stack the rows per element.
#
# Reads (defined elsewhere): olap, all_list, chr.
# Writes: mdata[[i]]  - list of one-row data frames for element i
#         res_df[[i]] - chr[[i]] column-bound to the stacked rows
mdata <- vector("list", 24)
res_df <- vector("list", 24)
for (i in 1:24) {
  l <- length(olap[[i]])
  # Start from a fresh list of exactly l slots for every i. Re-using a single
  # list created before the loop keeps the rows written for an earlier, longer
  # element, which is why the lengths stuck at 81 and later at 108.
  chrdata <- vector("list", l)
  for (j in seq_len(l)) {
    # for each row in granges object
    sub_olap <- as.data.frame(
      mcols(subsetByOverlaps(all_list, olap[[i]][j]))
    )
    # make metadata a single row and determine median
    chrdata[[j]] <- data.frame(
      med_log2Ratio = median(sub_olap$log2Ratio),
      pvalue = paste0(sub_olap$pvalue, collapse = ","),
      length = paste0(sub_olap$length, collapse = ","),
      sample = paste0(sub_olap$sample, collapse = ","),
      stringsAsFactors = FALSE
    )
  }
  # Store and bind only after the inner loop has filled every row.
  mdata[[i]] <- chrdata
  # rbind keeps med_log2Ratio numeric; matrix(unlist(...)) would coerce every
  # column to character.
  mdf <- do.call(rbind, chrdata)
  res_df[[i]] <- cbind(chr[[i]], mdf)
}
I want to do bootstrapping while comparing two data frames column-wise, where the data frames have different numbers of rows.
I have two data frames in which the rows represent values from experiments and the columns are the dataset names (data1, data2, data3, data4).
# Example data: rows are experimental values, columns are datasets.
# emp.data1 has 14 rows and is the frame that gets resampled.
emp.data1 <- data.frame(
  data1 = c(234, 0, 34, 0, 46, 0, 0, 0, 2.26, 0, 5, 8, 93, 56),
  data2 = c(1.40, 1.21, 0.83, 1.379, 2.60, 9.06, 0.88, 1.16, 0.64, 8.28,
            5, 8, 93, 56),
  data3 = c(0, 34, 43, 0, 0, 56, 0, 0, 0, 45, 5, 8, 93, 56),
  data4 = c(45, 0, 545, 34, 0, 35, 0, 35, 0, 534, 5, 8, 93, 56),
  stringsAsFactors = FALSE
)
# emp.data2 has 6 rows and is used unchanged (only its column sums).
emp.data2 <- data.frame(
  data1 = c(45, 0, 0, 45, 45, 53),
  data2 = c(23, 0, 45, 12, 90, 78),
  data3 = c(72, 45, 756, 78, 763, 98),
  data4 = c(1, 3, 65, 78, 9, 45),
  stringsAsFactors = FALSE
)
I am trying to do bootstrapping (n = 1000). Values are sampled at random with replacement from emp.data1 (14 × 4), while emp.data2 (6 × 4) is left unchanged. For example, for the first column (data1): take the column sum of the 6 values in emp.data2, take the sum of 6 randomly selected non-zero values from the data1 column of emp.data1, divide the former by the latter, and store the result in a temporary object. Repeat this 1000 times and take the median at the end. I want to do this for each column of the data frame. The sample code below works, but I am not able to restrict the random draws from emp.data1 to non-zero values.
# Row bootstrap: draw n_data2 whole rows of emp.data1 with replacement and
# divide the column sums of emp.data2 by the column sums of the drawn rows.
# Zeros in emp.data1 are NOT excluded here; a drawn column that sums to zero
# yields Inf (or NaN) for that replicate.
nboot <- 1e3
n_data1 <- nrow(emp.data1)
n_data2 <- nrow(emp.data2)
# Loop-invariant numerator, computed once.
emp2_totals <- colSums(emp.data2)
# Preallocate one row per replicate instead of growing with rbind().
boot_temp_emp <- matrix(
  NA_real_,
  nrow = nboot,
  ncol = ncol(emp.data2),
  dimnames = list(NULL, colnames(emp.data2))
)
for (j in seq_len(nboot)) {
  boot <- sample(x = seq_len(n_data1), size = n_data2, replace = TRUE)
  boot_temp_emp[j, ] <- emp2_totals / colSums(emp.data1[boot, ])
}
# Per-column median of the bootstrap ratios.
boot_data <- apply(boot_temp_emp, 2, median)
With the script above I get output, but in each column the sampled rows emp.data1[boot, ] include zero values, which are then summed. I want the column sum of individually, randomly selected non-zero values for each column, so I tried the script below, but I was not able to remove the zero values. I cannot get the desired output; could someone please help me correct my script?
# Asker's second attempt (kept as posted; it does not yet exclude zeros).
# NOTE(review): the closing braces of both for loops are missing from this
# listing.
nboot <- 1e3
boot_temp_emp<- c()
# Outer loop: one pass per column name of emp.data2.
for (i in colnames(emp.data2)){
for (j in seq_len(nboot)){
# NOTE(review): data1 and data2 are not defined in the code shown; the frames
# defined above are emp.data1 and emp.data2 -- presumably those are meant.
n_data1 <- nrow(data1); n_data2 <- nrow(data2)
boot <- sample(x = seq_len(n_data1), size = n_data2, replace = TRUE)
# Whole rows are drawn (data1[boot, , drop = FALSE]), so every column --
# zeros included -- enters the denominator; only the numerator is restricted
# to column i.
value <- colSums(data2[i])/colSums(data1[boot, ,drop = FALSE])
# Appends one row per (i, j) combination.
boot_temp_emp <- rbind(boot_temp_emp, value)
# Column-wise median over all accumulated rows.
boot_data<- apply(boot_temp_emp, 2, median)
Thank you
Here is a solution.
Write a function to make the code clearer. This function takes the following arguments.
x the input data.frame emp.data1;
s2 the columns sums of emp.data2;
n = 6 the number of vector elements to sample from emp.data1's columns with a default value of 6.
Then create a results matrix, pre-compute the column sums of emp.data2 and call the function in a loop.
# One bootstrap replicate of the ratio s2 / colSums(resampled rows of x).
#
# x   data.frame or matrix to resample (e.g. emp.data1).
# s2  numeric vector of column sums to be divided (e.g. colSums(emp.data2)).
# n   number of rows to draw from x with replacement (default 6).
#
# Returns a numeric vector with one ratio per column of x.
# Stops if a column of x has no non-zero entry, because its sum could then
# never be non-zero and the resampling loop would not terminate.
boot_fun <- function(x, s2, n = 6) {
  if (any(colSums(x != 0, na.rm = TRUE) == 0)) {
    stop("every column of 'x' needs at least one non-zero value",
         call. = FALSE)
  }
  nrx <- nrow(x)
  # the loop makes sure there is no divide by zero
  repeat {
    i <- sample(nrx, n, replace = TRUE)
    # drop = FALSE keeps a one-column input two-dimensional for colSums()
    s1 <- colSums(x[i, , drop = FALSE])
    if (all(s1 != 0)) break
  }
  s2 / s1
}
# Run boot_fun() nboot times and collect one row of ratios per replicate.
nboot <- 1e3
sums2 <- colSums(emp.data2)
results <- matrix(nrow = nboot, ncol = ncol(emp.data1))
for (i in seq_len(nboot)) {
  results[i, ] <- boot_fun(emp.data1, sums2)
}
# Median bootstrap ratio per column.
ratios_medians <- apply(results, 2, median)

# One histogram per column with its median marked; the previous graphics
# settings are restored afterwards.
old_par <- par(mfrow = c(2, 2))
for (j in 1:4) {
  main <- paste0("data", j)
  hist(results[, j], main = main, xlab = "ratios", freq = FALSE)
  abline(v = ratios_medians[j], col = "blue", lty = "dashed")
}
par(old_par)
Created on 2022-02-24 by the reprex package (v2.0.1)
Following the comments here is a revised version of the bootstrap function. It makes sure there are no zeros in the sampled vectors, before computing their sums.
# One bootstrap replicate where each column is resampled separately and a
# draw is accepted only if none of the sampled values is zero.
#
# x   data.frame or matrix to resample (e.g. emp.data1).
# s2  numeric vector of column sums to be divided (e.g. colSums(emp.data2)).
# n   number of values to draw per column, with replacement (default 6).
#
# Returns s2 / s1, where s1[j] is the sum of n non-zero values drawn from
# column j of x. Stops if a column has no non-zero entry, because no draw
# could ever be accepted for it.
boot_fun2 <- function(x, s2, n = 6) {
  if (any(colSums(x != 0, na.rm = TRUE) == 0)) {
    stop("every column of 'x' needs at least one non-zero value",
         call. = FALSE)
  }
  nrx <- nrow(x)
  ncx <- ncol(x)
  s1 <- numeric(ncx)
  for (j in seq_len(ncx)) {
    # redraw until the sampled vector for column j contains no zeros
    repeat {
      i <- sample(nrx, n, replace = TRUE)
      if (all(x[i, j] != 0)) {
        s1[j] <- sum(x[i, j])
        break
      }
    }
  }
  s2 / s1
}
# Run boot_fun2() nboot times and collect one row of ratios per replicate.
nboot <- 1e3
sums2 <- colSums(emp.data2)
results2 <- matrix(nrow = nboot, ncol = ncol(emp.data1))
for (i in seq_len(nboot)) {
  results2[i, ] <- boot_fun2(emp.data1, sums2)
}
# Median bootstrap ratio per column.
ratios_medians2 <- apply(results2, 2, median)

# One histogram per column with its median marked; the previous graphics
# settings are restored afterwards.
old_par <- par(mfrow = c(2, 2))
for (j in 1:4) {
  main <- paste0("data", j)
  hist(results2[, j], main = main, xlab = "ratios", freq = FALSE)
  abline(v = ratios_medians2[j], col = "blue", lty = "dashed")
}
par(old_par)
Created on 2022-02-27 by the reprex package (v2.0.1)
Hi all, I am trying to create a distance matrix from randomly created sequences.
# Asker's attempt (kept as posted; the solutions further down replace it).
# NOTE(review): the closing braces of the for loops are missing from this
# listing.
# Alphabet the random sequences are drawn from.
DNA <- c("A","G","T","C")
randomDNA <- c()
# Create a vector of 64 random 6-letter sequences.
for (i in 1:64){
# paste0() has no 'sep' argument; sep = "" is pasted as an extra empty string.
randomDNA[i] <- paste0(sample(DNA, 6, replace = T), sep = "", collapse = "")
sizeofDNA <- length(randomDNA)
# Compare every sequence with every other sequence.
split_vector <- c()
# DNAdiff starts as c(), which has no dimensions, so the two-index
# assignment DNAdiff[i,j] below has nothing to index into.
DNAdiff <- c()
for (i in 1:length(randomDNA)){
split_vector <- strsplit(randomDNA[i], "")[[1]]
for (j in 1:length(randomDNA)){
split_vector2 <- strsplit(randomDNA[j], "")[[1]]
# setdiff() returns the distinct letters of split_vector that do not occur
# in split_vector2 (zero or more values); it does not compare positions and
# does not return a single number.
DNAdiff[i,j] <- setdiff(split_vector,split_vector2)
# Earlier single-pair experiment, left commented out by the asker:
#DNAdiff[i] <- length(setdiff(strsplit(randomDNA[22], "")[[1]],strsplit(randomDNA[33], "")[[1]]))
What does not work:
A: setdiff does not work as I expect.
B: no array is created.
Question: how do I export the results of setdiff (if it can be made to work) to an array, so that I end up with an array that serves as the distance matrix?
Any recommendation is highly welcomed.
Thank you all
EDIT: So there are two solutions:
A. Using the "adist" function, as mentioned in the comments by @ThomasIsCoding; this calculates the Levenshtein distances:
# Solution A: Levenshtein distance matrix of 64 random 6-letter sequences.
DNA <- c("A", "G", "T", "C")
randomDNA <- character(64)
for (i in seq_along(randomDNA)) {
  randomDNA[i] <- paste0(sample(DNA, 6, replace = TRUE), collapse = "")
}
# adist() already returns a square matrix of pairwise edit distances.
dm <- adist(randomDNA)
rownames(dm) <- randomDNA
colnames(dm) <- randomDNA
heatmap(dm, Rowv = NA, Colv = NA)
# write.csv() always writes the header; passing col.names is ignored with a
# warning, so it is not set here.
write.csv(dm, "distance_matrix.csv", row.names = TRUE)
B. Another method to calculate the Hamming distance will be:
# Solution B: Hamming distance matrix of 96 random 6-letter sequences.
DNA <- c("A", "G", "T", "C")
randomDNA <- character(96)
for (i in seq_along(randomDNA)) {
  randomDNA[i] <- paste0(sample(DNA, 6, replace = TRUE), collapse = "")
}
# Split every sequence into single letters once, outside the double loop.
split_seqs <- strsplit(randomDNA, "")
Humm <- matrix(nrow = length(randomDNA), ncol = length(randomDNA))
for (i in seq_along(randomDNA)) {
  split_vector <- split_seqs[[i]]
  for (j in seq_along(randomDNA)) {
    split_vector2 <- split_seqs[[j]]
    # Hamming distance is calculated as the number of differing positions:
    Humm[i, j] <- sum(split_vector != split_vector2)
  }
}
rownames(Humm) <- randomDNA
colnames(Humm) <- randomDNA
heatmap(Humm, Rowv = NA, Colv = NA)
# write.csv() always writes the header; passing col.names is ignored with a
# warning, so it is not set here.
write.csv(Humm, "distance_matrix.csv", row.names = TRUE)
I think you might need adist to get the distance matrix, e.g., adist(randomDNA).
I have a for loop that takes each sample file on a list, creates a matrix for that sample, and then stores it into one big list of all the sample matrices.
Here is what I have done so far:
# load in data ------------------------------------------------------------------
# Download the supplementary files of GSE124395, unpack the tar archive and
# gunzip every .gz file in the download directory.
filePaths = getGEOSuppFiles("GSE124395")
tarF <- list.files(path = "./GSE124395/", pattern = "*.tar", full.names = TRUE)
untar(tarF, exdir = "./GSE124395/")
gzipF <- list.files(path = "./GSE124395/", pattern = "*.gz", full.names = TRUE)
# ldply is used here only to call gunzip on each path.
ldply(.data = gzipF, .fun = gunzip)
#running test loop -------------------------------------------------------------
# NOTE(review): the c( ... ) literal below is truncated in this listing (it
# ends with a trailing comma and no closing parenthesis), and the closing
# brace of the for loop is also missing.
testlist <- c("./GSE124395//GSM3531672_P301_3_CRYOMIXED11.coutt.csv",
LoopList_test <- list()
for (i in 1:length(testlist)){
# Read the sample file, move its first column into the row names, and convert
# the remaining columns to a matrix.
matrix_test <- read.delim(file =testlist[i])
matrix_test <- data.frame(matrix_test[,-1], row.names=matrix_test[,1])
matrix_test <- as.matrix(matrix_test) # convert the data frame into a matrix
# This first read is overwritten by the read.table() call on the next line.
colname_test <- read.delim(file =testlist[i])
# The barcode file does not depend on i, yet it is re-read on every iteration.
colname_test <- read.table(file = './GSE124395//GSE124395_celseq_barcodes.192.txt', header = FALSE, row.names = 1)
colname_test <- data.frame(colname_test[,-1], col=colname_test[,1])
colname_test <- as.matrix(colname_test)
# Use the first column of the barcode matrix as the matrix's column names.
colnames(matrix_test) <- colname_test[,1]
This is the output:
part of output in the one big list
I would like the loop to store the result of each iteration into its own matrix, so I have multiple matrices instead of one giant list of matrices, if that makes sense. I think it involves either splitting this one giant list into sublists, or storing the results of the loop into a matrix/array/vector instead of a list, or somehow having it store each iteration into its own variable within the loop. I'm not sure how to go about doing any of those.
Thanks for reading!
So the whole point of this was to create matrices to then combine them into one matrix. Then turn this one matrix into a Seurat object which I could then perform clustering on.
So here is what I have done so far: essentially, I wrote one loop for each group within the dataset, added whatever information I needed, and stored the results in a list. The function I think I need actually takes a list as input, so that works out well for me. Here's the code I decided on at the moment:
mylist <- list.files(path = "./GSE124395/", pattern = "\\.csv$",
                     full.names = TRUE)

# The barcode file is the same for every sample, so it is read once here
# instead of once per loop iteration.
colname_input <- read.table(
  file = './GSE124395//GSE124395_celseq_barcodes.192.txt',
  header = FALSE, row.names = 1
)
colname_input <- data.frame(colname_input[, -1], col = colname_input[, 1])
colname_input <- as.matrix(colname_input)

# Read one count file into a matrix: the first column becomes the row names,
# the columns are named "<barcode>_<treatment>".
read_count_matrix <- function(path, treatment = "Colorectal_Metastasis") {
  counts <- read.delim(file = path)
  counts <- data.frame(counts[, -1], row.names = counts[, 1])
  counts <- as.matrix(counts)
  colnames(counts) <- colname_input[, 1]
  colnames(counts) <- paste(colnames(counts), treatment, sep = "_")
  counts
}

LoopList <- list()

# P301 loop -------------------------------------------------------------------------
for (i in 1:30) {
  matrix_input <- read_count_matrix(mylist[i])
  P301_pdat <- data.frame("samples" = colnames(matrix_input),
                          "treatment" = "Colorectal_Metastasis")
  sobj <- CreateSeuratObject(counts = matrix_input, min.cells = 0,
                             min.features = 1,
                             project = "Patient301_Colorectal_Metastasis")
  # Attach the treatment label so it can be used for grouping/splitting later.
  sobj$treatment <- "Colorectal_Metastasis"
  # Keep every object; without this only the last sobj would survive the loop.
  LoopList[[i]] <- sobj
}

# P304 loop -------------------------------------------------------------------------
for (i in 31:56) {
  matrix_input <- read_count_matrix(mylist[i])
  P304_pdat <- data.frame("samples" = colnames(matrix_input),
                          "treatment" = "Colorectal_Metastasis")
  sobj <- CreateSeuratObject(counts = matrix_input, min.cells = 0,
                             min.features = 1,
                             project = "Patient304_Colorectal_Metastasis")
  sobj$treatment <- "Colorectal_Metastasis"
  LoopList[[i]] <- sobj
}
and so on. Then, following https://satijalab.org/seurat/articles/integration_large_datasets.html
# LoopList is already a list with one Seurat object per sample, so it is
# passed to lapply() directly; SplitObject() expects a single Seurat object
# and its result was not used afterwards.

# Normalise each object and find its variable features.
joined <- lapply(X = LoopList, FUN = function(x) {
  x <- NormalizeData(x, verbose = FALSE)
  FindVariableFeatures(x, verbose = FALSE)
})
# Features shared across objects, then scale + PCA on those features.
features <- SelectIntegrationFeatures(object.list = joined)
joined <- lapply(X = joined, FUN = function(x) {
  x <- ScaleData(x, features = features, verbose = FALSE)
  RunPCA(x, features = features, verbose = FALSE)
})
anchors <- FindIntegrationAnchors(object.list = joined, reduction = "rpca",
                                  dims = 1:50)
joined.integrated <- IntegrateData(anchorset = anchors, dims = 1:50)
joined.integrated <- ScaleData(joined.integrated, verbose = FALSE)
joined.integrated <- RunPCA(joined.integrated, verbose = FALSE)
joined.integrated <- RunUMAP(joined.integrated, dims = 1:50)
DimPlot(joined.integrated, group.by = "orig.ident")
# NOTE(review): split.by = "treatment" requires a "treatment" metadata column
# on the objects in LoopList -- confirm it is set when they are created.
DimPlot(joined.integrated, reduction = "umap", split.by = "treatment")
I don't know for sure whether this works, but I thought I would update this question to reflect what I've learned so far! I guess the lesson I've learned is: see whether you can find a function that takes a list as input, heh.
I am trying to output the number of groups of 0s in multiple data frames in a list. I believe the package I need for this is the raster R package. Here is my attempt...
# Asker's attempt (kept as posted; it produces the error quoted below).
# NOTE(review): the closing braces of the for loop and of the lapply()
# function are missing from this listing.
# Two random 15 x 15 matrices of 0s (prob 0.8) and 1s (prob 0.2), each also
# converted to a data.frame.
output_1 <- matrix(sample(c(0,1), 225, prob=c(0.8,0.2), replace=TRUE), nrow = 15)
df_output_1 <- data.frame(output_1)
output_2 <- matrix(sample(c(0,1), 225, prob=c(0.8,0.2), replace=TRUE), nrow = 15)
df_output_2 <- data.frame(output_2)
# NOTE(review): df_output_2 is listed twice and df_output_1 is never used --
# presumably list(df_output_1, df_output_2) was intended.
output_list <- list(df_output_2, df_output_2)
lapply(output_list, function (onedf) {
# raster() is called on a data.frame here; the answer below passes matrices
# instead.
Rastermat <- raster(onedf)
Clumps <- as.matrix(clump(Rastermat, directions = 8))
#turning the clumps into a list
tot <- max(Clumps, na.rm=TRUE)
res <- vector("list", tot)
# One list element per clump id, holding the row/column indices of its cells.
for (i in 1:tot){
res[i] <- list(which(Clumps == i, arr.ind = TRUE))
But I get the following error:
Error in .local(x, ...) : list has no "x"
stop("list has no \"x\"")
.local(x, ...)
FUN(X[[i]], ...)
1.lapply(df_list, function(onedf) {
Rastermat <- raster(onedf)
Clumps <- as.matrix(clump(Rastermat, directions = 8))
tot <- max(Clumps, na.rm = TRUE) ...
Can someone please help me? I am really stuck on what to do.
The only problem is that you need to use matrices instead of data.frames.
This should work:
# Two random 15 x 15 matrices of 0s (prob 0.8) and 1s (prob 0.2).
output_1 <- matrix(sample(c(0, 1), 225, prob = c(0.8, 0.2), replace = TRUE),
                   nrow = 15)
output_2 <- matrix(sample(c(0, 1), 225, prob = c(0.8, 0.2), replace = TRUE),
                   nrow = 15)
# Use both matrices (output_2 was previously listed twice).
output_list <- list(output_1, output_2)

# For every matrix, return a list with one element per clump; each element
# holds the row/column indices of the cells belonging to that clump.
lapply(output_list, function(onemat) {
  Rastermat <- raster(onemat)
  Clumps <- as.matrix(clump(Rastermat, directions = 8))
  # No clumps at all: every cell is NA, so max() would warn and return -Inf.
  tot <- if (all(is.na(Clumps))) 0 else max(Clumps, na.rm = TRUE)
  # turning the clumps into a list
  lapply(seq_len(tot), function(k) which(Clumps == k, arr.ind = TRUE))
})
EDIT to answer your follow-up questions:
Provided your inputs are data.frames containing 0s and 1s and you want to count the number of clumps of 0s, you could use the following code to return the number of clumps in each data.frame:
# Number of clumps of 0s in each data.frame of list_of_dfs.
vapply(list_of_dfs, function(df) {
  # -1 because the clump function counts non-zero values
  ras <- raster(as.matrix(df) - 1)
  # gaps = FALSE to prevent having missing numbers in the chunk numbers
  rc <- clump(ras, directions = 8, gaps = FALSE)
  # return the highest chunk number (slots are accessed with @)
  rc@data@max
}, numeric(1))
my code is as follows:
# Asker's code (kept as posted).
# NOTE(review): n_boot, V_boot and df are used below but are not defined in
# the code shown, and the closing braces of the for loop and of funct() are
# missing from this listing.
# 10 x 2 data frame of standard-normal draws with columns z and m.
x <- data.frame(matrix(rnorm(20), nrow=10))
colnames(x) <- c("z", "m")
bs <- list()
# Each bootstrap sample is 10 rows of x drawn with replacement.
for (i in 1:n_boot) {
bs[[i]] <- x[sample(nrow(x), 10, replace = TRUE), ]
# Lay the samples side by side: columns z, m, z, m, ... (2 per sample).
bt<-matrix(unlist(bs), ncol = 2*n_boot, byrow = FALSE)
colnames(bt) <- rep(c("z","m"),times=n_boot)
# Columns 2, 4, 6, 8 are the "m" columns of the first four samples, so this
# needs n_boot >= 4.
M_to_boot <- bt[,seq(2,8,by=2)]
# NOTE(review): the argument M_boot_max is not used in the lines shown.
funct<-function(M_boot_max) {
rows.combined <- nrow(M_to_boot)
cols.combined <- ncol(M_to_boot) + ncol(V_boot)
# Interleave M_to_boot (odd columns) and V_boot (even columns).
matrix.combined <- matrix(NA, nrow=rows.combined, ncol=cols.combined)
matrix.combined[, seq(1, cols.combined, 2)] <- M_to_boot
matrix.combined[, seq(2, cols.combined, 2)] <- V_boot
colnames(matrix.combined) <- rep(c("M_boot","V_boot"),times=n_boot)
# NOTE(review): df is presumably matrix.combined converted to a data.frame --
# confirm. start holds one two-column (M_boot, V_boot) piece per pair.
start0 <- seq(1, by = 2, length = ncol(df) / 2)
start <- lapply(start0, function(i, df) df[i:(i+1)], df = df)
# Split every piece into 5 bins of M_boot; tests is a list (one entry per
# pair) of lists (one data frame per bin).
tests<-lapply(start, function(xy) split(xy, cut(xy$M_boot,breaks=5)))
# NOTE(review): a per-bin sum of V_boot could then be obtained with something
# like lapply(tests, function(bins) sapply(bins, function(b) sum(b$V_boot)))
# -- untested suggestion.
Now I want to perform some calculations on the V_boot values within the sublists. Specifically, for each subsample I want to calculate the sum of V_boot within each bin. For example, for the M_boot bin "[[4]]$(0.811,1.25]" I want the value of sum(V_boot) for that bin. But I cannot figure out how to get at the V_boot values.
Please help me.