apply statement to sample columns, across rows of different lengths - r

I'm trying to write a simple R function to sample 5-element substrings across two columns of a single data frame. The length of the strings are equal for each row, but they differ down the columns. The function works when I specify a row and col to act on, but I can't get the apply statement to work on on each row and each column. As written, it will only pull random samples based on the length of the first instance, so if the first instance is shorter than any of the other strings, the output for the other rows is sometimes less than 5-elements.
example df:
BP TF
1 CGTCTCTATTCTAGGCAAGA TTTFFFFTFFFTFFFTFTTT
2 AAGTCACTCGAATTCGGATGCCCCCTAGGC TTFFFFFTFFFFTTFTFFTTTFTTTTFTFF
3 TGCTCATGACGGGAC FFFTFTFFFFTFTFT
'intended output:'
1 CTATT FFTFF
2 CCTAG TTTFT
3 TCATG TFTFF
'reproducible example code:'
#make fake data frame
BaseP1 <- paste(sample(size = 20, x = c("A","C","T","G"), replace = TRUE), collapse = "")
BaseP2 <- paste(sample(size = 30, x = c("A","C","T","G"), replace = TRUE), collapse = "")
BaseP3 <- paste(sample(size = 15, x = c("A","C","T","G"), replace = TRUE), collapse = "")
TrueFalse1 <- paste(sample(size = 20, x = c("T","F"), replace = TRUE), collapse = "")
TrueFalse2 <- paste(sample(size = 30, x = c("T","F"), replace = TRUE), collapse = "")
TrueFalse3 <- paste(sample(size = 15, x = c("T","F"), replace = TRUE), collapse = "")
my_df <- data.frame(c(BaseP1,BaseP2,BaseP3), c(TrueFalse1, TrueFalse2, TrueFalse3))
Fragment = function(string) {
nStart = sample(1:nchar(string) -5, 1)
substr(string, nStart, nStart + 4)
}
Fragment(string = my_df[1,1])#works for the first row, first col.
but this does not work:
apply(my_df, c(1,2), function(x) Fragment(string = my_df[1:nrow(my_df),1:ncol(my_df)]))

There was an error in your function:
Fragment = function(string) {
nStart = sample(1:(nchar(string) -5), 1)
substr(string, nStart, nStart + 4)
}
It was missing parentheses between nchar(string) - 5, which made the subsetting go wrong.
You can then simply use apply(my_df, c(1,2), Fragment) as suggested in the comments.
To show that this works now:
for(i in 1:10000){
stopifnot(all(5 == sapply(apply(my_df, c(1,2), Fragment), nchar)))
}
This shows that in 10000 tries, it always produced 5 characters as output.

Related

R bootstrapping for the two dataframe individual column wise

Want to do Bootstrapping while comparing two dataframe column wise with the different number of rows.
I have two dataframe in which row represent values from experiments and column with the dataset names (data1, data2, data3, data4)
emp.data1 <- data.frame(
data1 = c(234,0,34,0,46,0,0,0,2.26,0, 5,8,93,56),
data2 = c(1.40,1.21,0.83,1.379,2.60,9.06,0.88,1.16,0.64,8.28, 5,8,93,56),
data3 =c(0,34,43,0,0,56,0,0,0,45,5,8,93,56),
data4 =c(45,0,545,34,0,35,0,35,0,534, 5,8,93,56),
stringsAsFactors = FALSE
)
emp.data2 <- data.frame(
data1 = c(45, 0, 0, 45, 45, 53),
data2 = c(23, 0, 45, 12, 90, 78),
data3 = c(72, 45, 756, 78, 763, 98),
data4 = c(1, 3, 65, 78, 9, 45),
stringsAsFactors = FALSE
)
I am trying to do bootstrapping(n=1000). Values are selected at random replacement from emp.data1(14 * 4) without change in the emp.data2(6 * 4). For example from emp.data2 first column (data1) select 6 values colSum and from emp.data1(data1) select 6 random non zero values colSum Divide the values and store in temp repeat the same 1000 times and take a median value et the end. like this i want to do it for each column of the dataframe. sample code I am providing which is working fine but i am not able get the non-zero random values for emp.data1
nboot <- 1e3
boot_temp_emp<- c()
n_data1 <- nrow(emp.data1); n_data2 <- nrow(emp.data2)
for (j in seq_len(nboot)) {
boot <- sample(x = seq_len(n_data1), size = n_data2, replace = TRUE)
value <- colSums(emp.data2)/colSums(emp.data1[boot,])
boot_temp_emp <- rbind(boot_temp_emp, value)
}
boot_data<- apply(boot_temp_emp, 2, median)
From the above script i am able get the output but each column emp.data1[boot,] data has zero values and taken sum. I want indivisual ramdomly selected non-zero values column sum so I tried below script not able remove zero values. Not able get desired output please some one help me to correct my script
nboot <- 1e3
boot_temp_emp<- c()
for (i in colnames(emp.data2)){
for (j in seq_len(nboot)){
data1=emp.data1[i]
data2=emp.data2[i]
n_data1 <- nrow(data1); n_data2 <- nrow(data2)
boot <- sample(x = seq_len(n_data1), size = n_data2, replace = TRUE)
value <- colSums(data2[i])/colSums(data1[boot, ,drop = FALSE])
boot_temp_emp <- rbind(boot_temp_emp, value)
}
}
boot_data<- apply(boot_temp_emp, 2, median)
Thank you
Here is a solution.
Write a function to make the code clearer. This function takes the following arguments.
x the input data.frame emp.data1;
s2 the columns sums of emp.data2;
n = 6 the number of vector elements to sample from emp.data1's columns with a default value of 6.
The create a results matrix, pre-compute the column sums of emp.data2 and call the function in a loop.
boot_fun <- function(x, s2, n = 6){
# the loop makes sure ther is no divide by zero
nrx <- nrow(x)
repeat{
i <- sample(nrx, n, replace = TRUE)
s1 <- colSums(x[i, ])
if(all(s1 != 0)) break
}
s2/s1
}
set.seed(2022)
nboot <- 1e3
sums2 <- colSums(emp.data2)
results <- matrix(nrow = nboot, ncol = ncol(emp.data1))
for(i in seq_len(nboot)){
results[i, ] <- boot_fun(emp.data1, sums2)
}
ratios_medians <- apply(results, 2, median)
old_par <- par(mfrow = c(2, 2))
for(j in 1:4) {
main <- paste0("data", j)
hist(results[, j], main = main, xlab = "ratios", freq = FALSE)
abline(v = ratios_medians[j], col = "blue", lty = "dashed")
}
par(old_par)
Created on 2022-02-24 by the reprex package (v2.0.1)
Edit
Following the comments here is a revised version of the bootstrap function. It makes sure there are no zeros in the sampled vectors, before computing their sums.
boot_fun2 <- function(x, s2, n = 6){
nrx <- nrow(x)
ncx <- ncol(x)
s1 <- numeric(ncx)
for(j in seq.int(ncx)) {
repeat{
i <- sample(nrx, n, replace = TRUE)
if(all(x[i, j] != 0)) {
s1[j] <- sum(x[i, j])
break
}
}
}
s2/s1
}
set.seed(2022)
nboot <- 1e3
sums2 <- colSums(emp.data2)
results2 <- matrix(nrow = nboot, ncol = ncol(emp.data1))
for(i in seq_len(nboot)){
results2[i, ] <- boot_fun2(emp.data1, sums2)
}
ratios_medians2 <- apply(results2, 2, median)
old_par <- par(mfrow = c(2, 2))
for(j in 1:4) {
main <- paste0("data", j)
hist(results2[, j], main = main, xlab = "ratios", freq = FALSE)
abline(v = ratios_medians2[j], col = "blue", lty = "dashed")
}
par(old_par)
Created on 2022-02-27 by the reprex package (v2.0.1)

How can I keep a conditional Object available within a function?

I am trying to make a simulation script to generate data from a Stan model for different numbers of subjects. I first use the model to simulate responses, then I use those simulated responses to re-fit the model.
My current approach is to generate the simulated responses on the first run of the function (when seed == 1) and assign the result to the global environment.
Is there a way to skip assigning the output of to the global environment but have that result be accessible for the second and later runs?
sim_data_fit <- function(nSim, nSubj) {
set.seed(nSim)
X <- as.matrix(data.frame(
Var1 = sample(c(0,1), nSubj, replace = TRUE),
Var2 = sample(c(0,1), nSubj, replace = TRUE),
Var3 = scale(rnorm(nSubj, mean = 50, sd = 2), center = TRUE, scale = TRUE)))
if (nSim == 1) {
sim_out <<- sampling(Sim_Mod,
data = list(n = nSubj,
k = ncol(X),
X = X,
Y = rnorm(nSubj, mean = 65, sd = 10),
run_estimation = 0))
}
sim_data <- sim_out %>%
as.data.frame %>%
select(contains("y_sim")) %>%
.[, sample(ncol(.), nSubj)] %>%
apply(., 2, sample, size = 1) %>%
as_tibble() %>%
rename("y_sim" = value)
sampling(Sim_Mod,
data = list(n = nSubj,
k = ncol(X),
X = X,
Y = sim_data$y_sim,
run_estimation = 1))
}
sim_out is what I'm assigning only on the first run (I think). If I don't, I get an error that sim_out doesn't exist.

For loop list of tibbles R

I want to create a list of random tibbles using a for loop. I have a large data set where I will need to apply functions to lists of tibbles and create lists of tibbles as the outputs. I understand there might be better ways to do this and would also appreciate hearing those but am trying to wrap my head around how for loops work.
I can create a list of random tibbles with each tibble in the list named:
tibble_random1 <- tibble(Number = sample((1:100), 10, replace = TRUE),
Letter = sample((LETTERS), 10, replace = TRUE),
Logical = sample(c("True", "False"), 10, replace = TRUE))
tibble_random2 <- tibble(Number = sample((1:100), 10, replace = TRUE),
Letter = sample((LETTERS), 10, replace = TRUE),
Logical = sample(c("True", "False"), 10, replace = TRUE))
tibble_random3 <- tibble(Number = sample((1:100), 10, replace = TRUE),
Letter = sample((LETTERS), 10, replace = TRUE),
Logical = sample(c("True", "False"), 10, replace = TRUE))
tibble_random <- list(tibble1 = tibble_random1,
tibble2 = tibble_random2,
tibble3 = tibble_random3)
I cannot figure out how to do this with a for loop or if a for loop is completely inappropriate for this.
Thanks.
Initialise a list and fill 1 tibble in every iteration using for loop.
tibble_random <- vector('list', 3)
for(i in seq_along(tibble_random)) {
tibble_random[[i]] <- tibble(Number = sample((1:100), 10, replace = TRUE),
Letter = sample((LETTERS), 10, replace = TRUE),
Logical = sample(c("True", "False"), 10, replace = TRUE))
}
You can also use replicate or lapply to do this without for loop.
tibble_random <- replicate(3, tibble(Number = sample((1:100), 10, replace = TRUE),
Letter = sample((LETTERS), 10, replace = TRUE),
Logical = sample(c("True", "False"), 10, replace = TRUE)), simplify = FALSE)
To assign the names of the list you can use :
names(tibble_random) <- paste0('tibble', seq_along(tibble_random))

build matrix in a for loop automatically in R

Suppose I have a code like this
probv=c(0.5,0.1,0.2,0.3)
N=c(1,2,3,4)
g1=matrix(rbinom(n = 10, size = N[1], prob = probv[1]), nrow=5)
g2=matrix(rbinom(n = 10, size = N[2], prob = probv[2]), nrow=5)
g3=matrix(rbinom(n = 10, size = N[3], prob = probv[3]), nrow=5)
g4=matrix(rbinom(n = 10, size = N[4], prob = probv[4]), nrow=5)
I want to use a for loop
for i in (1:J)
{......} J=4 in this case
use one line function to return the same output like this, I want to know
how I create a matrix g_ in the loop
which is also benefit for me when I increase the length
of my vector into 5,6,7......
for example N=c(1,2,3,4,5) probv=c(0.5,0.1,0.2,0.3,0.5)
I do not change my code to create another matrix called g5.The code can create it and I just need to change my input to achieve my goal
Thanks Akrun
what is my N is a three dimensional array, I want to map the last dimension of it? How to change in the map method?
probv=c(0.5,0.1,0.2,0.3)
N=array(1:24,c(3,2,4))
g1=matrix(rbinom(n = 10, size = N[,,1], prob = probv[1]), nrow=5)
g2=matrix(rbinom(n = 10, size = N[,,2], prob = probv[2]), nrow=5)
g3=matrix(rbinom(n = 10, size = N[,,3], prob = probv[3]), nrow=5)
g4=matrix(rbinom(n = 10, size = N[,,4], prob = probv[4]), nrow=5)
We can use Map to loop over the 'N' and 'probv' vector, get the corresponding values into rbinom and create a matrix. It returns a list of matrices
lst1 <- Map(function(x, y) matrix(rbinom(n = 10,
size = x, prob = y), nrow = 5), N, probv)
Or using for loop
lst2 <- vector('list', length(N))
for(i in seq_along(N)) {
lst2[[i]] <- matrix(rbinom(n = 10, size = N[i], prob = probv[i]), nrow = 5)
}
names(lst2) <- paste0("g", seq_along(lst2))
For the updated question to extract from an array
mnLength <- min(length(probv), dim(N)[3])
lst2 <- vector('list', mnLength)
for(i in seq_len(mnLength)) {
lst2[[i]] <- matrix(rbinom(n = 10, size = N[,,i], prob = probv[i]), nrow = 5)
}
names(lst2) <- paste0("g", seq_along(lst2))
lst2$g1
lst2$g2

function to sample variable number of substrings given string length

I'm trying to write an R function that will sample a variable number of 5-element substrings, based on the length of the original string in each row of a data frame. I first calculated the number of times I'd like each draw to repeat, and would like to add this into the function so that the number of samples taken for each row is based on the "num_draws" column for that row. my thought was to use a generalized instance, and then use an apply statement outside of the function to act on each row, but I can't figure out how to set up the function to call col 3 as a generalized instance (without calling either the value of just the first row, or the value of all rows).
example data frame:
BP TF num_draws
1 CGGCGCATGTTCGGTAATGA TFTTTFTTTFFTTFTTTTTF 6
2 ATAAGATGCCCAGAGCCTTTTCATGTACTA TFTFTFTFFFFFFTTFTTTTFTTTTFFTTT 9
3 TCTTAGGAAGGATTC FTTTTTTTTTFFFFF 4
desired output:
[1]GGCGC FTTTF
AATGA TTTTF
TTFFT TGTTC
TAATG TTTTT
AATGA TTTTF
CGGCG TFTTT
[2]AGATG FTFTF
ATAAG TFTFT
ATGCC FTFFF
GCCCA FFFFF
ATAAG TFTFT
GTACT TFFTT
GCCCA FFFFF
TGCCC TFFFF
AGATG FTFTF
[3]TTAGG TTTTT
CTTAG TTTTT
GGAAG TTTTT
GGATT TTFFF
example code:
#make example data frame
BaseP1 <- paste(sample(size = 20, x = c("A","C","T","G"), replace = TRUE), collapse = "")
BaseP2 <- paste(sample(size = 30, x = c("A","C","T","G"), replace = TRUE), collapse = "")
BaseP3 <- paste(sample(size = 15, x = c("A","C","T","G"), replace = TRUE), collapse = "")
TrueFalse1 <- paste(sample(size = 20, x = c("T","F"), replace = TRUE), collapse = "")
TrueFalse2 <- paste(sample(size = 30, x = c("T","F"), replace = TRUE), collapse = "")
TrueFalse3 <- paste(sample(size = 15, x = c("T","F"), replace = TRUE), collapse = "")
my_df <- data.frame(c(BaseP1,BaseP2,BaseP3), c(TrueFalse1, TrueFalse2, TrueFalse3))
#calculate number of draws by length
frag_length<- 5
my_df<- cbind(my_df, (round((nchar(my_df[,1]) / frag_length) * 1.5, digits = 0)))
colnames(my_df) <- c("BP", "TF", "num_draws")
#function to sample x number of draws per row (this does not work)
Fragment = function(string) {
nStart = sample(1:(nchar(string) -5), 1)
samp<- substr(string, nStart, nStart + 4)
replicate(n= string[,3], expr = samp)
}
apply(my_df[,1:2], c(1,2), Fragment)
One option would be to change the function to have another argument n and create the nStart inside the replicate call
Fragment = function(string, n) {
replicate(n= n, {nStart <- sample(1:(nchar(string) -5), 1)
samp <- substr(string, nStart, nStart + 4)
})
}
apply(my_df, 1, function(x) data.frame(lapply(x[1:2], Fragment, n = x[3])))
$`1`
# BP TF
#1 GGCGC FFTTF
#2 GGTAA TFFTT
#3 GCGCA TTFTT
#4 CGCAT TFFTT
#5 GGCGC FTTTF
#6 TGTTC FTTFT
#$`2`
# BP TF
#1 GTACT TTTTF
#2 ATAAG FTTFT
#3 GTACT TFTFF
#4 TAAGA TTTTF
#5 CCTTT FFTTF
#6 TCATG TTTTF
#7 CCAGA TFTFT
#8 TTCAT TFTFT
#9 CCCAG FTFTF
#$`3`
# BP TF
#1 AAGGA TTTFF
#2 AGGAT TTTTT
#3 CTTAG TFFFF
#4 TAGGA TTTFF

Resources