I am trying trying to create a nested for loop, but somehow, the dimensions of mdata are mostly incorrect. For instance, the values for length(mdata[[1:5]]) are almost all 81, when they should be 81, 50, 60, etc. After the first five, length(mdata) becomes all 108s. Is there something wrong with my nested loop?
Code:
chrdata <- list()
mdata <- list()
res_df <- list()
for(i in 1:24) {
l <- length(olap[[i]])
for (j in 1:l) {
#for each row in granges object
sub_olap <- as.data.frame(mcols(subsetByOverlaps(all_list, olap[[i]][j])))
chrdata[[j]] <- data.frame(median(sub_olap$log2Ratio), #make metadata a single row and determine median
paste0(sub_olap$pvalue, collapse = ","),
paste0(sub_olap$length, collapse = ","),
paste0(sub_olap$sample, collapse = ","), stringsAsFactors = F)
colnames(chrdata[[j]]) <- c("med_log2Ratio", "pvalue", "length", "sample")
}
mdata[[i]] <- chrdata
mdf <- data.frame(matrix(unlist(mdata[[i]]), nrow = l, byrow = T))
colnames(mdf) <- c("med_log2Ratio", "pvalue", "length", "sample")
res_df[[i]] <- cbind(chr[[i]], mdf)
}
Related
Want to do Bootstrapping while comparing two dataframe column wise with the different number of rows.
I have two dataframe in which row represent values from experiments and column with the dataset names (data1, data2, data3, data4)
emp.data1 <- data.frame(
data1 = c(234,0,34,0,46,0,0,0,2.26,0, 5,8,93,56),
data2 = c(1.40,1.21,0.83,1.379,2.60,9.06,0.88,1.16,0.64,8.28, 5,8,93,56),
data3 =c(0,34,43,0,0,56,0,0,0,45,5,8,93,56),
data4 =c(45,0,545,34,0,35,0,35,0,534, 5,8,93,56),
stringsAsFactors = FALSE
)
emp.data2 <- data.frame(
data1 = c(45, 0, 0, 45, 45, 53),
data2 = c(23, 0, 45, 12, 90, 78),
data3 = c(72, 45, 756, 78, 763, 98),
data4 = c(1, 3, 65, 78, 9, 45),
stringsAsFactors = FALSE
)
I am trying to do bootstrapping(n=1000). Values are selected at random replacement from emp.data1(14 * 4) without change in the emp.data2(6 * 4). For example from emp.data2 first column (data1) select 6 values colSum and from emp.data1(data1) select 6 random non zero values colSum Divide the values and store in temp repeat the same 1000 times and take a median value et the end. like this i want to do it for each column of the dataframe. sample code I am providing which is working fine but i am not able get the non-zero random values for emp.data1
nboot <- 1e3
boot_temp_emp<- c()
n_data1 <- nrow(emp.data1); n_data2 <- nrow(emp.data2)
for (j in seq_len(nboot)) {
boot <- sample(x = seq_len(n_data1), size = n_data2, replace = TRUE)
value <- colSums(emp.data2)/colSums(emp.data1[boot,])
boot_temp_emp <- rbind(boot_temp_emp, value)
}
boot_data<- apply(boot_temp_emp, 2, median)
From the above script i am able get the output but each column emp.data1[boot,] data has zero values and taken sum. I want indivisual ramdomly selected non-zero values column sum so I tried below script not able remove zero values. Not able get desired output please some one help me to correct my script
nboot <- 1e3
boot_temp_emp<- c()
for (i in colnames(emp.data2)){
for (j in seq_len(nboot)){
data1=emp.data1[i]
data2=emp.data2[i]
n_data1 <- nrow(data1); n_data2 <- nrow(data2)
boot <- sample(x = seq_len(n_data1), size = n_data2, replace = TRUE)
value <- colSums(data2[i])/colSums(data1[boot, ,drop = FALSE])
boot_temp_emp <- rbind(boot_temp_emp, value)
}
}
boot_data<- apply(boot_temp_emp, 2, median)
Thank you
Here is a solution.
Write a function to make the code clearer. This function takes the following arguments.
x the input data.frame emp.data1;
s2 the columns sums of emp.data2;
n = 6 the number of vector elements to sample from emp.data1's columns with a default value of 6.
The create a results matrix, pre-compute the column sums of emp.data2 and call the function in a loop.
boot_fun <- function(x, s2, n = 6){
# the loop makes sure ther is no divide by zero
nrx <- nrow(x)
repeat{
i <- sample(nrx, n, replace = TRUE)
s1 <- colSums(x[i, ])
if(all(s1 != 0)) break
}
s2/s1
}
set.seed(2022)
nboot <- 1e3
sums2 <- colSums(emp.data2)
results <- matrix(nrow = nboot, ncol = ncol(emp.data1))
for(i in seq_len(nboot)){
results[i, ] <- boot_fun(emp.data1, sums2)
}
ratios_medians <- apply(results, 2, median)
old_par <- par(mfrow = c(2, 2))
for(j in 1:4) {
main <- paste0("data", j)
hist(results[, j], main = main, xlab = "ratios", freq = FALSE)
abline(v = ratios_medians[j], col = "blue", lty = "dashed")
}
par(old_par)
Created on 2022-02-24 by the reprex package (v2.0.1)
Edit
Following the comments here is a revised version of the bootstrap function. It makes sure there are no zeros in the sampled vectors, before computing their sums.
boot_fun2 <- function(x, s2, n = 6){
nrx <- nrow(x)
ncx <- ncol(x)
s1 <- numeric(ncx)
for(j in seq.int(ncx)) {
repeat{
i <- sample(nrx, n, replace = TRUE)
if(all(x[i, j] != 0)) {
s1[j] <- sum(x[i, j])
break
}
}
}
s2/s1
}
set.seed(2022)
nboot <- 1e3
sums2 <- colSums(emp.data2)
results2 <- matrix(nrow = nboot, ncol = ncol(emp.data1))
for(i in seq_len(nboot)){
results2[i, ] <- boot_fun2(emp.data1, sums2)
}
ratios_medians2 <- apply(results2, 2, median)
old_par <- par(mfrow = c(2, 2))
for(j in 1:4) {
main <- paste0("data", j)
hist(results2[, j], main = main, xlab = "ratios", freq = FALSE)
abline(v = ratios_medians2[j], col = "blue", lty = "dashed")
}
par(old_par)
Created on 2022-02-27 by the reprex package (v2.0.1)
Hi all I am trying to create a distance matrix from a random created sequence.
#set the code
DNA <- c("A","G","T","C")
randomDNA <- c()
#create the vector of 64 elements
for (i in 1:64){
randomDNA[i] <- paste0(sample(DNA, 6, replace = T), sep = "", collapse = "")
warnings()
}
sizeofDNA <- length(randomDNA)
#this part that I want to iterate between vector's components
split_vector <- c()
DNAdiff <- c()
for (i in 1:length(randomDNA)){
split_vector <- strsplit(randomDNA[i], "")[[1]]
#print(split_vector)
for (j in 1:length(randomDNA)){
split_vector2 <- strsplit(randomDNA[j], "")[[1]]
#print(split_vector2)
DNAdiff[i,j] <- setdiff(split_vector,split_vector2)
#or
#DNAdiff[i] <- lenght(setdiff(strsplit(randomDNA[22], "")[[1]],strsplit(randomDNA[33], "")[[1]]))
}
}
What it does not work is
A: the setdiff does not work as I expect
B: no array is created
Question how do I export the results of the setdiff (if it will work) to an array so that I will have the distance matrix like array?
Any recommendation is highly welcomed.
Thank you all
EDIT: So there are 2 solutions:
A. Using, as mentioned in the comments by #ThomasIsCoding , the "adist" function; this will calculate the Levenshtein distances:
DNA <- c("A","G","T","C")
randomDNA <- c()
for (i in 1:64){
randomDNA[i] <- paste0(sample(DNA, 6, replace = T), sep = "", collapse = "")
}
dm <-as.matrix(adist(randomDNA))
rownames(dm) <- randomDNA
colnames(dm) <- randomDNA
pdf("heatmap.pdf")
heatmap(dm, Rowv = NA, Colv = NA)
dev.off()
write.csv(dm,"distance_matrix.csv", row.names = T, col.names = T )
B. Another method to calculate the Hamming distance will be:
DNA <- c("A","G","T","C")
randomDNA <- c()
for (i in 1:96){
randomDNA[i] <- paste0(sample(DNA, 6, replace = T), sep = "", collapse = "")
}
Humm <- matrix(nrow=length(randomDNA), ncol=length(randomDNA))
for (i in 1:length(randomDNA)){
split_vector <- strsplit(randomDNA[i], "")[[1]]
for (j in 1:length(randomDNA)){
split_vector2 <- strsplit(randomDNA[j], "")[[1]]
#Hamming distance is calculated as:
Humm[i,j] <- sum(split_vector != split_vector2)
}
}
rownames(Humm) <- randomDNA
colnames(Humm) <- randomDNA
pdf("heatmap.pdf")
heatmap(Humm, Rowv = NA, Colv = NA)
dev.off()
write.csv(Humm,"distance_matrix.csv", row.names = T, col.names = T )
I think you you might need adist to get the distance matrix, e.g.,
adist(randomDNA)
I have a for loop that takes each sample file on a list, creates a matrix for that sample, and then stores it into one big list of all the sample matrices.
Here is what I have done so far:
# load in data ------------------------------------------------------------------
filePaths = getGEOSuppFiles("GSE124395")
tarF <- list.files(path = "./GSE124395/", pattern = "*.tar", full.names = TRUE)
untar(tarF, exdir = "./GSE124395/")
gzipF <- list.files(path = "./GSE124395/", pattern = "*.gz", full.names = TRUE)
ldply(.data = gzipF, .fun = gunzip)
#running test loop -------------------------------------------------------------
testlist <- c("./GSE124395//GSM3531672_P301_3_CRYOMIXED11.coutt.csv",
"./GSE124395//GSM3531673_P301_4_CRYOMIXED12.coutt.csv",
"./GSE124395//GSM3531674_P301_5_HEP1_1_5.coutt.csv")
LoopList_test <- list()
for (i in 1:length(testlist)){
matrix_test <- read.delim(file =testlist[i])
matrix_test <- data.frame(matrix_test[,-1], row.names=matrix_test[,1])
matrix_test <- as.matrix(matrix_test) #<- makes the excel file into a matrix
colname_test <- read.delim(file =testlist[i])
colname_test <- read.table(file = './GSE124395//GSE124395_celseq_barcodes.192.txt', header = FALSE, row.names = 1)
colname_test <- data.frame(colname_test[,-1], col=colname_test[,1])
colname_test <- as.matrix(colname_test)
colnames(matrix_test) <- colname_test[,1]
LoopList_test[[i]]<-matrix_test
}
This is the output:
part of output in the one big list
I would like the loop to store the result of each iteration into its own matrix, so I have multiple matrices instead of one giant list of matrices, if that makes sense. I think it involves either splitting this one giant list into sublists, or storing the results of the loop into a matrix/array/vector instead of a list, or somehow having it store each iteration into its own variable within the loop. I'm not sure how to go about doing any of those.
Thanks for reading!
UPDATE:
So the whole point of this was to create matrices to then combine them into one matrix. Then turn this one matrix into a Seurat object which I could then perform clustering on.
So here is what I have done so far: essentially, I made multiple loops of each group within the dataset, added whatever information I needed, and then took the list and the function I think I need actually takes a list so that's good for me. Here's the code I decided on at the moment:
mylist<-list.files(path = "./GSE124395/", pattern = "\\.csv$",full.names = TRUE)
LoopList <- list()
for (i in 1:30){
matrix_input <- read.delim(file =mylist[i])
matrix_input <- data.frame(matrix_input[,-1], row.names=matrix_input[,1])
matrix_input <- as.matrix(matrix_input) #<- makes the excel file into a matrix
colname_input <- read.delim(file =mylist[i])
colname_input <- read.table(file = './GSE124395//GSE124395_celseq_barcodes.192.txt', header = FALSE, row.names = 1)
colname_input <- data.frame(colname_input[,-1], col=colname_input[,1])
colname_input <- as.matrix(colname_input)
colnames(matrix_input) <- colname_input[,1]
colnames(matrix_input) <- paste(colnames(matrix_input), "Colorectal_Metastasis", sep = "_")
P301_pdat <- data.frame("samples" = colnames(matrix_input), "treatment" = "Colorectal_Metastasis")
sobj <- CreateSeuratObject(counts = matrix_input, min.cells = 0, min.features = 1,
project = "Patient301_Colorectal_Metastasis")
LoopList[[i]]<-sobj
#LoopList <- assign(paste0("Patient301", i), sobj )
}
# P304 loop -------------------------------------------------------------------------
for (i in 31:56){
matrix_input <- read.delim(file =mylist[i])
matrix_input <- data.frame(matrix_input[,-1], row.names=matrix_input[,1])
matrix_input <- as.matrix(matrix_input) #<- makes the excel file into a matrix
colname_input <- read.delim(file =mylist[i])
colname_input <- read.table(file = './GSE124395//GSE124395_celseq_barcodes.192.txt', header = FALSE, row.names = 1)
colname_input <- data.frame(colname_input[,-1], col=colname_input[,1])
colname_input <- as.matrix(colname_input)
colnames(matrix_input) <- colname_input[,1]
colnames(matrix_input) <- paste(colnames(matrix_input), "Colorectal_Metastasis", sep = "_")
P304_pdat <- data.frame("samples" = colnames(matrix_input), "treatment" = "Colorectal_Metastasis")
sobj <- CreateSeuratObject(counts = matrix_input, min.cells = 0, min.features = 1,
project = "Patient304_Colorectal_Metastasis")
LoopList[[i]]<-sobj
}
and so on. Then, following https://satijalab.org/seurat/articles/integration_large_datasets.html
sobj.list <- SplitObject(LoopList, split.by = "orig.ident")
joined <- lapply(X = LoopList, FUN = function(x) {
x <- NormalizeData(x, verbose = FALSE)
x <- FindVariableFeatures(x, verbose = FALSE)
})
features <- SelectIntegrationFeatures(object.list = joined)
joined <- lapply(X = joined, FUN = function(x) {
x <- ScaleData(x, features = features, verbose = FALSE)
x <- RunPCA(x, features = features, verbose = FALSE)
})
anchors <- FindIntegrationAnchors(object.list = joined, reduction = "rpca",
dims = 1:50)
joined.integrated <- IntegrateData(anchorset = anchors, dims = 1:50)
joined.integrated <- ScaleData(joined.integrated, verbose = FALSE)
joined.integrated <- RunPCA(joined.integrated, verbose = FALSE)
joined.integrated <- RunUMAP(joined.integrated, dims = 1:50)
DimPlot(joined.integrated, group.by = "orig.ident")
DimPlot(joined.integrated, reduction = "umap", split.by = "treatment")
I don't know if this works for sure, but I thought I would update this question to reflect what I've learned so far! I guess lesson I've learned is see if you can find a function that takes a list as input heh.
I am trying to output the number of groups on 0s in multiple data frames in a list. I believe the package I need to do this is the raster R package. Here is my attempt...
set.seed(12345)
output_1 <- matrix(sample(c(0,1), 225, prob=c(0.8,0.2), replace=TRUE), nrow = 15)
df_output_1 <- data.frame(output_1)
set.seed(99999)
output_2 <- matrix(sample(c(0,1), 225, prob=c(0.8,0.2), replace=TRUE), nrow = 15)
df_output_2 <- data.frame(output_2)
output_list <- list(df_output_2, df_output_2)
install.packages("raster")
library(raster)
lapply(output_list, function (onedf) {
Rastermat <- raster(onedf)
Clumps <- as.matrix(clump(Rastermat, directions = 8))
#turning the clumps into a list
tot <- max(Clumps, na.rm=TRUE)
res <- vector("list", tot)
for (i in 1:tot){
res[i] <- list(which(Clumps == i, arr.ind = TRUE))
}
res
})
But I get the following error:
Error in .local(x, ...) : list has no "x"
stop("list has no \"x\"")
.local(x, ...)
raster(onedf)
raster(onedf)
FUN(X[[i]], ...)
1.lapply(df_list, function(onedf) {
Rastermat <- raster(onedf)
Clumps <- as.matrix(clump(Rastermat, directions = 8))
tot <- max(Clumps, na.rm = TRUE) ...
Can someone please help me? I am really stuck on what to do.
The only problem is that you need to use matrices instead of data.frames.
This should work:
library(raster)
set.seed(12345)
output_1 <- matrix(sample(c(0,1), 225, prob=c(0.8,0.2), replace=TRUE), nrow = 15)
set.seed(99999)
output_2 <- matrix(sample(c(0,1), 225, prob=c(0.8,0.2), replace=TRUE), nrow = 15)
output_list <- list(output_2, output_2)
lapply(output_list, function (onedf) {
Rastermat <- raster(onedf)
Clumps <- as.matrix(clump(Rastermat, directions = 8))
tot <- max(Clumps, na.rm=TRUE)
res <- vector("list", tot)
for (i in 1:tot){
res[i] <- list(which(Clumps == i, arr.ind = TRUE))
}
res
})
EDIT to answer your follow-up questions:
Provided your input are data.frames containing 0s and 1s and you want to count the number of clumps of 0s you could have following code to return a list of the number of clumps in each data.frame:
sapply(list_of_dfs, function(df) {
rm <- raster(as.matrix(df)-1) # -1 because the clump function counts non-zero values
rc <- clump(rm, directions = 8, gaps = F) # gaps = F to prevent having missing numbers in the chunk numbers
rc#data#max # return the highest chunk number
})
my code is as follows:
x <- data.frame(matrix(rnorm(20), nrow=10))
colnames(x) <- c("z", "m")
n_boot<-4
bs <- list()
for (i in 1:n_boot) {
bs[[i]] <- x[sample(nrow(x), 10, replace = TRUE), ]
}
bt<-matrix(unlist(bs), ncol = 2*n_boot, byrow = FALSE)
colnames(bt) <- rep(c("z","m"),times=n_boot)
M_to_boot <- bt[,seq(2,8,by=2)]
funct<-function(M_boot_max) {
od<-(1/((10*((10^((16-M_boot_max-25)/5))^3)/3)*((max(M_boot_max)-min(M_boot_max))/50)))
}
V_boot<-apply(M_to_boot,2,funct)
rows.combined <- nrow(M_to_boot)
cols.combined <- ncol(M_to_boot) + ncol(V_boot)
matrix.combined <- matrix(NA, nrow=rows.combined, ncol=cols.combined)
matrix.combined[, seq(1, cols.combined, 2)] <- M_to_boot
matrix.combined[, seq(2, cols.combined, 2)] <- V_boot
colnames(matrix.combined) <- rep(c("M_boot","V_boot"),times=n_boot)
df<-as.data.frame(matrix.combined)
start0 <- seq(1, by = 2, length = ncol(df) / 2)
start <- lapply(start0, function(i, df) df[i:(i+1)], df = df)
tests<-lapply(start, function(xy) split(xy, cut(xy$M_boot,breaks=5)))
Now I want to prepare some calculations on values V_boot from a sublists. To be specific I want to for each subsample calculate the sum of V_boot. So, for example I want for a bin M_boot "[[4]]$(0.811,1.25]" to have a value of sum(V_boot) for that bin. But I cannot figure out how to get to that each V_boot values.
Please help me.