R bootstrapping for the two dataframe individual column wise - r

Want to do Bootstrapping while comparing two dataframe column wise with the different number of rows.
I have two dataframe in which row represent values from experiments and column with the dataset names (data1, data2, data3, data4)
emp.data1 <- data.frame(
data1 = c(234,0,34,0,46,0,0,0,2.26,0, 5,8,93,56),
data2 = c(1.40,1.21,0.83,1.379,2.60,9.06,0.88,1.16,0.64,8.28, 5,8,93,56),
data3 =c(0,34,43,0,0,56,0,0,0,45,5,8,93,56),
data4 =c(45,0,545,34,0,35,0,35,0,534, 5,8,93,56),
stringsAsFactors = FALSE
)
emp.data2 <- data.frame(
data1 = c(45, 0, 0, 45, 45, 53),
data2 = c(23, 0, 45, 12, 90, 78),
data3 = c(72, 45, 756, 78, 763, 98),
data4 = c(1, 3, 65, 78, 9, 45),
stringsAsFactors = FALSE
)
I am trying to do bootstrapping(n=1000). Values are selected at random replacement from emp.data1(14 * 4) without change in the emp.data2(6 * 4). For example from emp.data2 first column (data1) select 6 values colSum and from emp.data1(data1) select 6 random non zero values colSum Divide the values and store in temp repeat the same 1000 times and take a median value et the end. like this i want to do it for each column of the dataframe. sample code I am providing which is working fine but i am not able get the non-zero random values for emp.data1
nboot <- 1e3
boot_temp_emp<- c()
n_data1 <- nrow(emp.data1); n_data2 <- nrow(emp.data2)
for (j in seq_len(nboot)) {
boot <- sample(x = seq_len(n_data1), size = n_data2, replace = TRUE)
value <- colSums(emp.data2)/colSums(emp.data1[boot,])
boot_temp_emp <- rbind(boot_temp_emp, value)
}
boot_data<- apply(boot_temp_emp, 2, median)
From the above script i am able get the output but each column emp.data1[boot,] data has zero values and taken sum. I want indivisual ramdomly selected non-zero values column sum so I tried below script not able remove zero values. Not able get desired output please some one help me to correct my script
nboot <- 1e3
boot_temp_emp<- c()
for (i in colnames(emp.data2)){
for (j in seq_len(nboot)){
data1=emp.data1[i]
data2=emp.data2[i]
n_data1 <- nrow(data1); n_data2 <- nrow(data2)
boot <- sample(x = seq_len(n_data1), size = n_data2, replace = TRUE)
value <- colSums(data2[i])/colSums(data1[boot, ,drop = FALSE])
boot_temp_emp <- rbind(boot_temp_emp, value)
}
}
boot_data<- apply(boot_temp_emp, 2, median)
Thank you

Here is a solution.
Write a function to make the code clearer. This function takes the following arguments.
x the input data.frame emp.data1;
s2 the columns sums of emp.data2;
n = 6 the number of vector elements to sample from emp.data1's columns with a default value of 6.
The create a results matrix, pre-compute the column sums of emp.data2 and call the function in a loop.
boot_fun <- function(x, s2, n = 6){
# the loop makes sure ther is no divide by zero
nrx <- nrow(x)
repeat{
i <- sample(nrx, n, replace = TRUE)
s1 <- colSums(x[i, ])
if(all(s1 != 0)) break
}
s2/s1
}
set.seed(2022)
nboot <- 1e3
sums2 <- colSums(emp.data2)
results <- matrix(nrow = nboot, ncol = ncol(emp.data1))
for(i in seq_len(nboot)){
results[i, ] <- boot_fun(emp.data1, sums2)
}
ratios_medians <- apply(results, 2, median)
old_par <- par(mfrow = c(2, 2))
for(j in 1:4) {
main <- paste0("data", j)
hist(results[, j], main = main, xlab = "ratios", freq = FALSE)
abline(v = ratios_medians[j], col = "blue", lty = "dashed")
}
par(old_par)
Created on 2022-02-24 by the reprex package (v2.0.1)
Edit
Following the comments here is a revised version of the bootstrap function. It makes sure there are no zeros in the sampled vectors, before computing their sums.
boot_fun2 <- function(x, s2, n = 6){
nrx <- nrow(x)
ncx <- ncol(x)
s1 <- numeric(ncx)
for(j in seq.int(ncx)) {
repeat{
i <- sample(nrx, n, replace = TRUE)
if(all(x[i, j] != 0)) {
s1[j] <- sum(x[i, j])
break
}
}
}
s2/s1
}
set.seed(2022)
nboot <- 1e3
sums2 <- colSums(emp.data2)
results2 <- matrix(nrow = nboot, ncol = ncol(emp.data1))
for(i in seq_len(nboot)){
results2[i, ] <- boot_fun2(emp.data1, sums2)
}
ratios_medians2 <- apply(results2, 2, median)
old_par <- par(mfrow = c(2, 2))
for(j in 1:4) {
main <- paste0("data", j)
hist(results2[, j], main = main, xlab = "ratios", freq = FALSE)
abline(v = ratios_medians2[j], col = "blue", lty = "dashed")
}
par(old_par)
Created on 2022-02-27 by the reprex package (v2.0.1)

Related

Using app function from {terra} package on raster stacks? (in parallel)

I have four high resolution rasters for a country. I have split each raster into tiles and done some other processing to them. I now want to apply a function to each cell, of each 'stack' of the raster tiles, to produce one set of output tiles. The function is a little complex. I have tried to synthesise some data below to reproduce my current approach. It works (ish) but I'm convinced that there's a better way to do this. To use parallel processing on my unix box, I simply swap mapply for mcmapply, but I haven't done that in the example below as I presume many will be working on Windows machines. I'd welcome ideas on my approach and particularly optimisation.
library("terra")
library("glue")
## Make some toy data
dir.create("temp_folder")
dir.create("result_folder")
x <- rast(ncols = 10, nrows = 10)
a <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(a) <- some_values
a_tiles <- makeTiles(a, x, glue("temp_folder/tile_a_{1:100}.tif"), overwrite = TRUE)
b <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(b) <- some_values
b_tiles <- makeTiles(b, x, glue("temp_folder/tile_b_{1:100}.tif"), overwrite = TRUE)
c <-rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(c) <- some_values
c_tiles <- makeTiles(c, x, glue("temp_folder/tile_c_{1:100}.tif"), overwrite = TRUE)
d <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(d) <- some_values
d_tiles <- makeTiles(d, x, glue("temp_folder/tile_d_{1:100}.tif"), overwrite = TRUE)
## Outer function so that this can be used in parallel ? But maybe this is a silly way to do it?
outer_function <- function(a_tiles, b_tiles, c_tiles, d_tiles, output_files) {
one_a_tile <- rast(unlist(a_tiles))
one_b_tile <- rast(unlist(b_tiles))
one_c_tile <- rast(unlist(c_tiles))
one_d_tile <- rast(unlist(d_tiles))
output_file <- output_files
# I replace any NAs with 0 as an NA will break my 'if' statement of the inner_function.
# I get Error in if (z["a"] <= z["b"]) { : missing value where TRUE/FALSE needed
one_a_tile[is.na(one_a_tile)] <- 0
one_b_tile[is.na(one_b_tile)] <- 0
one_c_tile[is.na(one_c_tile)] <- 0
one_d_tile[is.na(one_d_tile)] <- 0
z <- sds(one_a_tile, one_b_tile, one_c_tile, one_d_tile)
## Inner function that actually does the work I want doing
inner_function <- function(z) {
names(z) <- c('a', 'b', 'c', 'd')
if (z['a'] <= z['b']) {
y <- rowSums(cbind((z['c'] + z['a'] * 10),
(z['c'] + z['a'] * 20)))
}
if (z['a'] >= z['b']) {
y <- rowSums(cbind((z['c'] + z['a'] * 40),
(z['c'] + z['a'] * 10)))
}
if (z['a'] == z['b']) {
y <- rowSums(cbind((z['c'] + z['a'] * 60),
(z['c'] + z['a'] * 10)))
}
y <- ifelse(y == 0, NA, y)
return(y)
}
app(z,
inner_function,
filename = output_file,
overwrite = TRUE,
wopt = list(datatype = "INT4U"))
return(output_file)
}
results <- mapply(outer_function,
a_tiles = a_tiles,
b_tiles = b_tiles,
c_tiles = c_tiles,
d_tiles = d_tiles,
output_files = output_files <- glue("result_folder/result_tile_{1:length(d_tiles)}.tif"))
names(results) <- NULL
unlink("temp_folder", recursive = TRUE)
unlink("result_folder", recursive = TRUE)

Error in `*tmp*`[[i]]: for loop error in creating nested list in R

I am getting this error: Error in *tmp*[[i]] : subscript out of bounds
When I run a code that iterates through i and j.
Example data below:
#Make example data
A1 <- rnorm(1:100, mean = 10, sd = 1)
B1 <- rep(c(2, 4, 5, 7), each = 25)
AB1 <- rbind(A1, B1)
colnames(AB1) <- rep(c("A","b"), each = 50)
A2 <- rnorm(1:100, mean = 50, sd = 1)
B2 <- rep(c(2, 4, 5, 7), each = 25)
AB2 <- rbind(A2, B2)
colnames(AB2) <- rep(c("A","b"), each = 50)
A3 <- rnorm(1:100, mean = 100, sd = 1)
B3 <- rep(c(2, 4, 5, 7), each = 25)
AB3 <- rbind(A2, B2)
colnames(AB3) <- rep(c("A","b"), each = 50)
data <- list(AB1, AB2, AB3)
# for loop showing error
histlist = NULL
for (i in seq_along(data)) {
columns <- unique(colnames(data[[i]]))
for (j in columns) {
columnsub <- data[[i]][, c(rep(c("A"), each = 50))]
histlist[[i]][[j]] <- hist(columnsub) #error from this line
}
}
I want the loop to create histlist with each i as a nested list having A and B with a nested hist() values. Basically, what you see in histlist after the first iteration.
Any help is greatly appreciated
We can correct the issue by turning histlist = NULL to :
histlist = vector('list', length(data))
This will pre-allocate a vector and allow for assigning to the list correctly.
You can do this with nested lapply call which will generate nested list automatically.
histlist <- lapply(data, function(x) {
cols <- unique(colnames(x))
lapply(cols, function(y) hist(x[, colnames(x) == y]))
})

Why does function return NULL?

A beginner in R over here, so apologies for the basic question.
Why does ATE return a null vector instead of saving the values of the difference of the means?
fun.cluster <- function(M, N){
set.seed(02139)
J <- 1:M # vector J_i
df <- as.data.frame(matrix(data=1:N, nrow = N, ncol = 1)) #data frame of all original values
df$cluster <- cut(df$V1, M, labels = 1:M) #breaking the dataframe into clusters
df$cluster <- as.numeric(df$cluster)
Y1 <- as.vector(sample(J, 5)) # assigning treatment
df$treatment <- ifelse(df$cluster %in% Y1, df$treatment <- 1, df$treatment <- 0)
#Inducing intracluster correlation:
mu_0j <- runif(n = 50, min = -1, max = 1)
df$V1[df$treatment==0] <- mu_0j
mu_1j <- runif(n=50, min = -0.5, max = 1.5)
df$V1[df$treatment==0] <- mu_1j
# drawing values
y_0i <- rnorm(n = 50, mean = mu_0j, sd = 1)
y_1i <- rnorm(n = 50, mean = mu_1j, sd = 1)
D_i <- as.vector(c(y_0i, y_1i))
# calculating ATE:
ATE[i] <- mean(y_1i - y_0i)
}
ATE <- c()
for(i in 1:10){
fun.cluster(M = 10, N = 100)
}

Performing t-Test Selection manually

I’m trying to write simulation code, that generates data and runs t-test selection (discarding those predictors whose t-test p-value exceeds 0.05, retaining the rest) on it. The simulation is largely an adaptation of Applied Econometrics with R by Kleiber and Zeileis (2008, pp. 183–189).
When running the code, it usually fails. Yet with certain seeds (e.g. 1534) it produces plausible output. If it does not produce output (e.g. 1911), it fails due to: "Error in x[, ii] : subscript out of bounds", which traces back to na.omit.data.frame(). So, for some reason, the way I attempt to handle the NAs seems to fail, but I'm unable to figure out in how so.
coef <- rep(coef[,3], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
The first block is unlikely to the cause of the error. It merely generates the data and works well on its own and with other methods, like PCA, as well. The second block pulls the p-values from the regression output; removes the p-value of the intercept (beta_0); and fills the vector with as many 7s as necessary to have the same length as the number of variables, to ensure the same dimension for matrix calculations. Seven is arbitrary and could be any number larger than 0.05 to not pass the test of the loop. This becomes – I believe – necessary, if R discards predictors due to multicollinearity.
The final block creates an empty matrix of the original dimensions; inserts the original data, if the t-test p-value is lower than 0.05, else retains the NA; while the penultimate line removes all columns containing NAs ((exclusively NA or one NA is the same here) taken from mnel’s answer to Remove columns from dataframe where ALL values are NA); lastly, the modified data is again put in the shape of a linear regression.
Does anyone know what causes this behavior or how it would work as intended? I would expect it to either work or not, but not kind of both. Ideally, the former.
A working version of the code is:
set.seed(1534)
Sim_TTS <- function(nobs = c(1000, 15000), pdim = pdims, coef = coef100,
model = c("MLC", "MHC"), ...){
DGP_TTS <- function(nobs = 1000, model = c("MLC", "MHC"), coef = coef100,
sd = 1, pdim = pdims, ALPHA = 0.05)
{
model <- match.arg(model)
if(model == "MLC") {
coef <- rep(coef[,1], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
} else {
coef <- rep(coef[,2], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
}
return(TTR)
}
PG_TTS <- function(nrep = 1, ...)
{
rsq <- matrix(rep(NA, nrep), ncol = 1)
rsqad <- matrix(rep(NA, nrep), ncol = 1)
pastr <- matrix(rep(NA, nrep), ncol = 1)
vmat <- cbind(rsq, rsqad, pastr)
colnames(vmat) <- c("R sq.", "adj. R sq.", "p*")
for(i in 1:nrep) {
vmat[i,1] <- summary(DGP_TTS(...))$r.squared
vmat[i,2] <- summary(DGP_TTS(...))$adj.r.squared
vmat[i,3] <- length(DGP_TTS(...)$coefficients)-1
}
return(c(mean(vmat[,1]), mean(vmat[,2]), round(mean(vmat[,3]))))
}
SIM_TTS <- function(...)
{
prs <- expand.grid(pdim = pdim, nobs = nobs, model = model)
nprs <- nrow(prs)
pow <- matrix(rep(NA, 3 * nprs), ncol = 3)
for(i in 1:nprs) pow[i,] <- PG_TTS(pdim = prs[i,1],
nobs = prs[i,2], model = as.character(prs[i,3]), ...)
rval <- rbind(prs, prs, prs)
rval$stat <- factor(rep(1:3, c(nprs, nprs, nprs)),
labels = c("R sq.", "adj. R sq.", "p*"))
rval$power <- c(pow[,1], pow[,2], pow[,3])
rval$nobs <- factor(rval$nobs)
return(rval)
}
psim_TTS <- SIM_TTS()
tab_TTS <- xtabs(power ~ pdim + stat + model + nobs, data = psim_TTS)
ftable(tab_TTS, row.vars = c("model", "nobs", "stat"), col.vars = "pdim")}
FO_TTS <- Sim_TTS()
FO_TTS
}
Preceeded by:
pdims <- seq(12, 100, 4)
coefLC12 <- c(0, rep(0.2, 4), rep(0.1, 4), rep(0, 4))/1.3
rtL <- c(0.2, rep(0, 3))/1.3
coefLC100 <- c(coefLC12, rep(rtL, 22))
coefHC12 <- c(0, rep(0.8, 4), rep(0.4, 4), rep(0, 4))/1.1
rtH <- c(0.8, rep(0, 3))/1.1
coefHC100 <- c(coefHC12, rep(rtH, 22))
coef100 <- cbind(coefLC100, coefHC100)
I’m aware that model selection via the significance of individual predictors is not recommended, but that is the whole point – it is meant to be compared to more sophisticated methods.

How to do calculations on elements from a sublist in R

my code is as follows:
x <- data.frame(matrix(rnorm(20), nrow=10))
colnames(x) <- c("z", "m")
n_boot<-4
bs <- list()
for (i in 1:n_boot) {
bs[[i]] <- x[sample(nrow(x), 10, replace = TRUE), ]
}
bt<-matrix(unlist(bs), ncol = 2*n_boot, byrow = FALSE)
colnames(bt) <- rep(c("z","m"),times=n_boot)
M_to_boot <- bt[,seq(2,8,by=2)]
funct<-function(M_boot_max) {
od<-(1/((10*((10^((16-M_boot_max-25)/5))^3)/3)*((max(M_boot_max)-min(M_boot_max))/50)))
}
V_boot<-apply(M_to_boot,2,funct)
rows.combined <- nrow(M_to_boot)
cols.combined <- ncol(M_to_boot) + ncol(V_boot)
matrix.combined <- matrix(NA, nrow=rows.combined, ncol=cols.combined)
matrix.combined[, seq(1, cols.combined, 2)] <- M_to_boot
matrix.combined[, seq(2, cols.combined, 2)] <- V_boot
colnames(matrix.combined) <- rep(c("M_boot","V_boot"),times=n_boot)
df<-as.data.frame(matrix.combined)
start0 <- seq(1, by = 2, length = ncol(df) / 2)
start <- lapply(start0, function(i, df) df[i:(i+1)], df = df)
tests<-lapply(start, function(xy) split(xy, cut(xy$M_boot,breaks=5)))
Now I want to prepare some calculations on values V_boot from a sublists. To be specific I want to for each subsample calculate the sum of V_boot. So, for example I want for a bin M_boot "[[4]]$(0.811,1.25]" to have a value of sum(V_boot) for that bin. But I cannot figure out how to get to that each V_boot values.
Please help me.

Resources