Using rollapply and lm over multiple columns of data - r

I have a data frame similar to the following with a total of 500 columns:
Probes <- data.frame(Days=seq(0.01, 4.91, 0.01), B1=5:495,B2=-100:390, B3=10:500,B4=-200:290)
I would like to calculate a rolling window linear regression where my window size is 12 data points and each sequential regression is separated by 6 data points. For each regression, "Days" will always be the x component of the model, and the y's would be each of the other columns (B1, followed by B2, B3, etc). I would then like to save the co-efficients as a dataframe with the existing column titles (B1, B2, etc).
I think my code is close, but is not quite working. I used rollapply from the zoo library.
slopedata<-rollapply(zoo(Probes), width=12, function(Probes) {
coef(lm(formula=y~Probes$Days, data = Probes))[2]
}, by = 6, by.column=TRUE, align="right")
If possible, I would also like to have the "xmins" saved to a vector to add to the dataframe. This would mean the smallest x value used in each regression (basically it would be every 6 numbers in the "Days" column.)
Thanks for your help.

1) Define a zoo object z whose data contains Probes and whose index is taken from the first column of Probes, i.e. Days. Noting that lm allows y to be a matrix define a coefs function which computes the regression coefficients. Finally rollapply over z. Note that the index of the returned object gives xmin.
library(zoo)
z <- zoo(Probes, Probes[[1]])
coefs <- function(z) c(unlist(as.data.frame(coef(lm(z[,-1] ~ z[,1])))))
rz <- rollapply(z, 12, by = 6, coefs, by.column = FALSE, align = "left")
giving:
> head(rz)
B11 B12 B21 B22 B31 B32 B41 B42
0.01 4 100 -101 100 9 100 -201 100
0.07 4 100 -101 100 9 100 -201 100
0.13 4 100 -101 100 9 100 -201 100
0.19 4 100 -101 100 9 100 -201 100
0.25 4 100 -101 100 9 100 -201 100
0.31 4 100 -101 100 9 100 -201 100
Note that DF <- fortify.zoo(rz) could be used if you needed a data frame representation of rz.
2) An alternative somewhat similar approch would be to rollaplly over the row numbers:
library(zoo)
y <- as.matrix(Probes[-1])
Days <- Probes$Days
n <- nrow(Probes)
coefs <- function(ix) c(unlist(as.data.frame(coef(lm(y ~ Days, subset = ix)))),
xmins = Days[ix][1])
r <- rollapply(1:n, 12, by = 6, coefs)

try this:
# here are the xmin values you wanted
xmins <- Probes$Days[seq(1,nrow(Probes),6)]
# here we build a function that will run regressions across the columns
# y1 vs x, y2 vs x, y3 vs x...
# you enter the window and by (12/6) in order to limit the interval being
# regressed. this is later called in do.call
runreg <- function(Probes,m,window=12,by=6){
# beg,end are used to specify the interval
beg <- seq(1,nrow(Probes),by)[m]
end <- beg+window-1
# this is used to go through all the columns
N <- ncol(Probes)-1
tmp <- numeric(N)
# go through each column and store the coefficients in tmp
for(i in 1:N){
y <- Probes[[i+1]][beg:end]
x <- Probes$Days[beg:end]
tmp[i] <- coef(lm(y~x))[2][[1]]
}
# put all our column regressions into a dataframe
res <- rbind('coeff'=tmp)
colnames(res) <- colnames(Probes)[-1]
return(res)
}
# now that we've built the function to do the column regressions
# we just need to go through all the window-ed regressions (row regressions)
res <- do.call(rbind,lapply(1:length(xmins),function(m) runreg(Probes,m)))
# these rownames are the index of the xmin values
rownames(res) <- seq(1,nrow(Probes),6)
res <- data.frame(res,xmins)

You can also use the rollRegres package as follows
# setup data
Probes <- data.frame(
# I changed the days to be intergers
Days=seq(1L, 491L, 1L),
B1=5:495, B2=-100:390, B3=10:500 , B4=-200:290)
# setup grp argument
grp_arg <- as.integer((Probes$Days - 1L) %/% 6)
# estimate coefs. width argument is realtive in grp units
library(rollRegres)
X <- cbind(1, Probes$Days / 100)
Ys <- as.matrix(Probes[, 2:5])
out <- lapply(1:ncol(Ys), function(i)
roll_regres.fit(x = X, y = Ys[, i], width = 2L, grp = grp_arg)$coefs)
out <- do.call(cbind, out)
# only keep the complete.cases and the unique values
colnames(out) <- sapply(1:4, function(i) paste0("B", i, 0:1))
out <- out[c(T, grp_arg[-1] != head(grp_arg, -1)), ]
out <- out[complete.cases(out), ]
head(out)
#R B10 B11 B20 B21 B30 B31 B40 B41
#R [1,] 4 100 -101 100 9 100 -201 100
#R [2,] 4 100 -101 100 9 100 -201 100
#R [3,] 4 100 -101 100 9 100 -201 100
#R [4,] 4 100 -101 100 9 100 -201 100
#R [5,] 4 100 -101 100 9 100 -201 100
#R [6,] 4 100 -101 100 9 100 -201 100
The solution is a lot faster than e.g., the zoo solution
library(zoo) coefs <- function(z) c(unlist(as.data.frame(coef(lm(z[,-1] ~ z[,1]))))) microbenchmark::microbenchmark( rollapply = {
z <- zoo(Probes, Probes[[1]])
rz <- rollapply(z, 12, by = 6, coefs, by.column = FALSE, align = "left") }, roll_regres = {
grp_arg <- as.integer((Probes$Days - 1L) %/% 6)
X <- cbind(1, Probes$Days / 100)
Ys <- as.matrix(Probes[, 2:5])
out <- lapply(1:ncol(Ys), function(i)
roll_regres.fit(x = X, y = Ys[, i], width = 2L, grp = grp_arg)$coefs)
out <- do.call(cbind, out)
colnames(out) <- sapply(1:4, function(i) paste0("B", i, 0:1))
out <- out[c(T, grp_arg[-1] != head(grp_arg, -1)), ]
out <- out[complete.cases(out), ]
head(out) } )
#R Unit: microseconds
#R expr min lq mean median uq max neval
#R rollapply 53392.614 56330.492 59793.106 58363.2825 60902.938 119206.76 100
#R roll_regres 865.186 920.297 1074.161 983.9015 1047.705 5071.41 100
At present you though need to install the package from Github due to an error in the validation in version 0.1.0. Thus, run
devtools::install_github("boennecd/rollRegres", upgrade_dependencies = FALSE,
build_vignettes = TRUE)

Related

How to optimzie my function by dropping loops

I have the following function that uses nested loops and honestly I'm not sure how to proceed with making the code run more efficient. It runs fine for 100 sims in my opinion but when I ran for 2000 sims it took almost 12 seconds.
This code will generate any n Brownian Motion simulations and works well, the issue is once the simulation size is increased to say 500+ then it starts to bog down, and when it hits 2k then it's pretty slow ie 12.
Here is the function:
ts_brownian_motion <- function(.time = 100, .num_sims = 10, .delta_time = 1,
.initial_value = 0) {
# TidyEval ----
T <- as.numeric(.time)
N <- as.numeric(.num_sims)
delta_t <- as.numeric(.delta_time)
initial_value <- as.numeric(.initial_value)
# Checks ----
if (!is.numeric(T) | !is.numeric(N) | !is.numeric(delta_t) | !is.numeric(initial_value)){
rlang::abort(
message = "All parameters must be numeric values.",
use_cli_format = TRUE
)
}
# Initialize empty data.frame to store the simulations
sim_data <- data.frame()
# Generate N simulations
for (i in 1:N) {
# Initialize the current simulation with a starting value of 0
sim <- c(initial_value)
# Generate the brownian motion values for each time step
for (t in 1:(T / delta_t)) {
sim <- c(sim, sim[t] + rnorm(1, mean = 0, sd = sqrt(delta_t)))
}
# Bind the time steps, simulation values, and simulation number together in a data.frame and add it to the result
sim_data <- rbind(
sim_data,
data.frame(
t = seq(0, T, delta_t),
y = sim,
sim_number = i
)
)
}
# Clean up
sim_data <- sim_data %>%
dplyr::as_tibble() %>%
dplyr::mutate(sim_number = forcats::as_factor(sim_number)) %>%
dplyr::select(sim_number, t, y)
# Return ----
attr(sim_data, ".time") <- .time
attr(sim_data, ".num_sims") <- .num_sims
attr(sim_data, ".delta_time") <- .delta_time
attr(sim_data, ".initial_value") <- .initial_value
return(sim_data)
}
Here is some output of the function:
> ts_brownian_motion(.time = 10, .num_sims = 25)
# A tibble: 275 × 3
sim_number t y
<fct> <dbl> <dbl>
1 1 0 0
2 1 1 -2.13
3 1 2 -1.08
4 1 3 0.0728
5 1 4 0.562
6 1 5 0.255
7 1 6 -1.28
8 1 7 -1.76
9 1 8 -0.770
10 1 9 -0.536
# … with 265 more rows
# ℹ Use `print(n = ...)` to see more rows
As suggested in the comments, if you want speed, you should use cumsum. You need to be clear what type of Brownian Motion you want (arithmetic, geometric). For geometric Brownian motion, you'll need to correct the approximation error by adjusting the mean. As an example, the NMOF package (which I maintain), contains a function gbm that implements geometric Brownian Motion through cumsum. Here is an example call for 2000 paths with 100 timesteps each.
library("NMOF")
library("zoo") ## for plotting
timesteps <- 100
system.time(b <- NMOF::gbm(2000, tau = 1, timesteps = 100, r = 0, v = 1))
## user system elapsed
## 0.013 0.000 0.013
dim(b) ## each column is one path, starting at time zero
## [1] 101 2000
plot(zoo(b[, 1:5], 0:timesteps), plot.type = "single")

Counter loops for summary data Confidence Intervals in R

I am trying to use a for-loop as a repeat counter to add summary data to a test sample. I have tried to use a data.frame, matrix, and a vector push my data out of the for loop and populate a table. The best I have got is filling one complete column in a vector and completing all columns but one row in a data frame.
#try empty vector to populate
large.sample.df <- vector(mode = "double", length = 1000)
#try matrix to populate
large.matrix <- matrix(nrow = 1000, ncol = 3)
matrix.names <- c("mean", "lwr", "upr")
colnames(large.matrix) <- matrix.names
#Try dataframe to populate
large.df <- data.frame(mean="", lwr="", upr="")
#set total length
n <- length(large.sample.df)
#use functions to calculate confidence interval
lwr.ci <- function(a) (mean(a) - 1.96 * (sd(a)/sqrt(length(a))))
upp.ci <- function(a) (mean(a) + 1.96 * (sd(a)/sqrt(length(a))))
#Start new seed count
set.seed(1234)
#begin for loop for mean, lwr, upr CI
for (i in 1:n) {
large.sample <- rgamma(n = 1000, shape = 4, rate = 2)
large.df$mean[i] <- mean(large.sample)
large.df$lwr[i] <- lwr.ci(large.sample)
large.df$upr[i] <- upp.ci(large.sample)
}
Here are two ways to get what you want. First we should distinguish between the sample size and the number of samples:
set.seed(1234)
n <- 1000
samples <- 10 # Keep this small for testing and then increase it
s <- 4
r <- 2
First your loop approach:
results <- data.frame(mean=NA, lwr=NA, upr=NA) # Not "" which makes the variables character strings
set.seed(1234)
for (i in 1:samples) {
x <- rgamma(n, shape = s, rate = r)
mn <- mean(x)
sder <- sd(x)/sqrt(n)
lwr <- mn - 1.96 * sder
upr <- mn + 1.96 * sder
results[i, ] <- c(mn, lwr, upr)
}
results
# mean lwr upr
# 1 2.015193688 1.952431714 2.077955663
# 2 2.024218250 1.962404608 2.086031891
# 3 2.008401293 1.948363928 2.068438658
# 4 1.993061142 1.932020588 2.054101696
# 5 1.975824831 1.912961486 2.038688176
# 6 1.983761126 1.923583927 2.043938325
# 7 1.983166350 1.924890819 2.041441880
# 8 1.975453269 1.915336118 2.035570420
# 9 1.976118333 1.915025748 2.037210918
# 10 2.044088839 1.983435628 2.104742050
Now using replicate
confint <- function(n, s, r) {
x <- rgamma(n, shape = s, rate = r)
mn <- mean(x)
sder <- sd(x)/sqrt(n)
lwr <- mn - 1.96 * sder
upr <- mn + 1.96 * sder
return(c(mean=mn, lwr=lwr, upr=upr))
}
confint(n, s, r) # Test the function
# mean lwr upr
# 1.974328366 1.914003710 2.034653023
set.seed(1234)
results <- replicate(samples, confint(n, s, r))
results <- t(results)
results
# mean lwr upr
# [1,] 2.015193688 1.952431714 2.077955663
# [2,] 2.024218250 1.962404608 2.086031891
# [3,] 2.008401293 1.948363928 2.068438658
# [4,] 1.993061142 1.932020588 2.054101696
# [5,] 1.975824831 1.912961486 2.038688176
# [6,] 1.983761126 1.923583927 2.043938325
# [7,] 1.983166350 1.924890819 2.041441880
# [8,] 1.975453269 1.915336118 2.035570420
# [9,] 1.976118333 1.915025748 2.037210918
# [10,] 2.044088839 1.983435628 2.104742050
Both approaches agree.

BTYD Individual Level Estimations For All Observations

I am using BTYD BG NBD in R and did the individual level estimates.
For instance following the documentation in page 20 of:
BTYD Walkthrough
Code for Data Prep:
system.file("data/cdnowElog.csv", package = "BTYD")%>%
dc.ReadLines(., cust.idx = 2, date.idx = 3, sales.idx = 5)%>%
dc.MergeTransactionsOnSameDate()%>%
mutate(date = parse_date_time(date, "%Y%m%d")) -> elog
end.of.cal.period <- as.Date("1997-09-30")
elog.cal <- elog[which(elog$date <= end.of.cal.period), ]
split.data <- dc.SplitUpElogForRepeatTrans(elog.cal);
birth.periods <- split.data$cust.data$birth.per
last.dates <- split.data$cust.data$last.date
clean.elog <- split.data$repeat.trans.elog;
freq.cbt <- dc.CreateFreqCBT(clean.elog);
tot.cbt <- dc.CreateFreqCBT(elog)
cal.cbt <- dc.MergeCustomers(tot.cbt, freq.cbt)
cal.cbs.dates <- data.frame(birth.periods, last.dates, end.of.cal.period)
cal.cbs <- dc.BuildCBSFromCBTAndDates(cal.cbt, cal.cbs.dates,per="week")
params <- pnbd.EstimateParameters(cal.cbs);
one could get estimates for a particular observation.
Code for Individual Level Estimation:
cal.cbs["1516",]
# x t.x T.cal
# 26.00 30.86 31.00
x <- cal.cbs["1516", "x"]
t.x <- cal.cbs["1516", "t.x"]
T.cal <- cal.cbs["1516", "T.cal"]
bgnbd.ConditionalExpectedTransactions(params, T.star = 52,
x, t.x, T.cal)
# [1] 25.76
My question is, is it possible to recursively run this such that I could get a data frame containing the expectations for each row instead of hard coding a particular ID number such as "1516" in this case?
Thanks!
Yes, it is straightforward with dplyr's mutate()
cal.cbs%>%
data.frame()%>%
mutate(`Conditional Expectation` = bgnbd.ConditionalExpectedTransactions(params, T.star = 52, x, t.x, T.cal))
x t.x T.cal Conditional Expectation
1 2 30.428571 38.85714 2.3224971
2 1 1.714286 38.85714 1.0646350
3 0 0.000000 38.85714 0.5607707
4 0 0.000000 38.85714 0.5607707
5 0 0.000000 38.85714 0.5607707
6 7 29.428571 38.85714 6.0231497

Large raster frequency table / counts

I try to calculate the frequency/count of pixel values of a raster in R using freq().
Create two example rasters for comparison:
library(raster)
RastSmall <- raster(nrow=70, ncol=70)
RastBig <- raster(nrow=7000, ncol=7000)
set.seed(0)
RastSmall[] <- round(runif(1:ncell(r_hr), 1, 5))
RastBig[] <- round(runif(1:ncell(r_hr), 1, 5))
Get the pixel count using freq()
freq(RastSmall)
value count
[1,] 1 6540000
[2,] 2 12150000
[3,] 3 12140000
[4,] 4 11720000
[5,] 5 6450000
However, it is a fairly large file and takes extremely long, i.e. up to hours. Is there a faster way in R?
Here the speed difference for a small and a large raster:
system.time(freq(RastSmall))
user system elapsed
0.008 0.000 0.004
system.time(freq(RastBig))
user system elapsed
40.484 0.964 41.445
Is there a way to speed this up? Alternatively can this be done in the command line using something like gdal tools?
I did exactly that last week, however I couldn't find other faster ways to do it in R. I've tried to do it with the rqgis package by calling the r.report of GRASS. It works but was slower than the R native function. Maybe you'll have a better luck. Here is my code with grass in case you want to try it:
library(RQGIS)
monqgis <- set_env("C:\\Mrnmicro\\Applic\\OSGeo4W")
find_algorithms(search_term = "report", qgis_env = monqgis)
get_usage(alg = "grass7:r.report", qgis_env = monqgis)
params <- get_args_man(alg = "grass7:r.report", qgis_env = monqgis)
get_usage(alg = "grass7:r.report", qgis_env = monqgis)
params$map <- classif
params$units <- 5
params$rawoutput <- "C:\\temp\\outputRQGIS_raw"
params$html <- "C:\\temp\\outputRQGIS"
system.time(asas <- run_qgis(alg = "grass7:r.report", params=params,load_output = params$OUTPUT, qgis_env = monqgis))
not an amazing saving but if you getValues from your raster and then run the base::table function, it saves about 20%. My raster was c.500m cells.
# read in raster to obtain frequency table
r <- raster("./path/myraster.tif")
# perform tests; traditional freq() vs. getValues() & table()
require(microbenchmark)
mbm <- microbenchmark(
Freq = {freqf <- freq(r,useNA="no");
freq.df <- data.frame(CODE=freqf[,1], N=freqf[,2]},
GetVals = {v <- getValues(r);
vt <- table(v);
getval.df <- data.frame(CODE=as.numeric(names(vt)),N=as.numeric(as.matrix(vt)))},
times=5
)
mbm
Unit: seconds
expr min lq mean median uq max neval
Freq 191.1649 191.8001 198.8567 192.5256 193.0986 225.6942 5
GetVals 153.5552 154.8776 156.9173 157.0539 159.0400 160.0598 5
# check the routines have identical results
identical(freq.df,getval.df)
[1] TRUE
bit of a saving i guess
(N.B. the reason i make the data frames is that I go on to process the data that comes out of the frequency analysis)
I think the most effective way to calculate that is by using GetHistogram( ) from GDAL. Unfortunately, I can't find a way to use it from R. The closest approach is by using gdalUtilities::gdalinfo from R, and use the flag -hist, or hist = TRUE, but is limited the calculations between 0 - 255.
Another option is using rasterDT::freqDT, which is faster than regular options. Here an example:
library(gdalUtilities)
library(raster)
library(rasterDT)
library(microbenchmark)
RastBig <- raster(nrow=7000, ncol=7000)
set.seed(0)
RastBig[] <- round(runif(1:ncell(RastBig), 1, 5))
writeRaster(RastBig, filename = 'C:/temp/RastBig.tif')
mbm <- microbenchmark(times = 50,
freq1 = freq(RastBig),
freq2 = table(RastBig[]),
freq3 = freqDT(RastBig),
freq4 = ({
gdalLog <- capture.output(gdalUtilities::gdalinfo(datasetname = 'C:/temp/RastBig.tif', hist = TRUE));
(bucxml <- as.numeric(sub('buckets.+', '', grep('buckets ', gdalLog, value = TRUE))));
(minxml <- as.numeric(gsub('.+from | to.+', '', grep('buckets ', gdalLog, value = TRUE)) ));
(maxxml <- as.numeric(gsub('.+to |:', '', grep('buckets ', gdalLog, value = TRUE))));
(histxml <- as.numeric(strsplit(split = '[[:space:]]', gsub("^ |^ ", "", gdalLog[grep('buckets', gdalLog)+1]))[[1]]));
labs <- seq(from = minxml, to = maxxml, length.out = bucxml);
df <- data.frame(labs, nwlab = c(ceiling(labs[1]),
round(labs[2:(bucxml-1)]),
floor(labs[bucxml])),
val = histxml);
hist <- aggregate(df$val, by = list(df$nwlab), sum)})
)
Results:
> freq1
value count
[1,] 1 6127755
[2,] 2 12251324
[3,] 3 12249376
[4,] 4 12248938
[5,] 5 6122607
> freq2
1 2 3 4 5
6127755 12251324 12249376 12248938 6122607
> freq3
ID freq
1: 1 6127755
2: 2 12251324
3: 3 12249376
4: 4 12248938
5: 5 6122607
> freq4
Group.1 x
1 1 6127755
2 2 12251324
3 3 12249376
4 4 12248938
5 5 6122607
Unit: milliseconds
expr min lq mean median uq max neval
freq1 58628.486301 59100.539302 59400.304887 59383.913701 59650.412 60841.3975 50
freq2 55912.170401 56663.025202 56954.032395 56919.905051 57202.001 58307.9500 50
freq3 3785.767301 4006.858102 4288.699531 4292.447250 4536.382 4996.0598 50
freq4 7.892201 8.883102 9.255641 9.154001 9.483 15.6072 50
EDIT: using this is quite faster than option 3:
rB <- raster('C:/temp/RastBig.tif')
freq3B <- freqDT(rB)

view values used by function boot to bootstrap estimates

I have written the code below to obtain a bootstrap estimate of a mean. My objective is to view the numbers selected from the data set, ideally in the order they are selected, by the function boot in the boot package.
The data set only contains three numbers: 1, 10, and 100 and I am only using two bootstrap samples.
The estimated mean is 23.5 and the R code below indicates that the six numbers included one '1', four '10' and one '100'. However, there are 30 possible combinations of those numbers that would have resulted in a mean of 23.5.
Is there a way for me to determine which of those 30 possible combinations is the combination that actually appeared in the two bootstrap samples?
library(boot)
set.seed(1234)
dat <- c(1, 10, 100)
av <- function(dat, i) { sum(dat[i])/length(dat[i]) }
av.boot <- boot(dat, av, R = 2)
av.boot
#
# ORDINARY NONPARAMETRIC BOOTSTRAP
#
#
# Call:
# boot(data = dat, statistic = av, R = 2)
#
#
# Bootstrap Statistics :
# original bias std. error
# t1* 37 -13.5 19.09188
#
mean(dat) + -13.5
# [1] 23.5
# The two samples must have contained one '1', four '10' and one '100',
# but there are 30 possibilities.
# Which of these 30 possible sequences actual occurred?
# This code shows there must have been one '1', four '10' and one '100'
# and shows the 30 possible combinations
my.combos <- expand.grid(V1 = c(1, 10, 100),
V2 = c(1, 10, 100),
V3 = c(1, 10, 100),
V4 = c(1, 10, 100),
V5 = c(1, 10, 100),
V6 = c(1, 10, 100))
my.means <- apply(my.combos, 1, function(x) {( (x[1] + x[2] + x[3])/3 + (x[4] + x[5] + x[6])/3 ) / 2 })
possible.samples <- my.combos[my.means == 23.5,]
dim(possible.samples)
n.1 <- rowSums(possible.samples == 1)
n.10 <- rowSums(possible.samples == 10)
n.100 <- rowSums(possible.samples == 100)
n.1[1]
n.10[1]
n.100[1]
length(unique(n.1)) == 1
length(unique(n.10)) == 1
length(unique(n.100)) == 1
I think you can determine the numbers sampled and the order in which they are sampled with the code below. You have to extract the function ordinary.array from the boot package and paste that function into your R code. Then specify the values for n, R and strata, where n is the number of observations in the data set and R is the number of replicate samples you want.
I do not know how general this approach is, but it worked with a couple of simple examples I tried, including the example below.
library(boot)
set.seed(1234)
dat <- c(1, 10, 100, 1000)
av <- function(dat, i) { sum(dat[i])/length(dat[i]) }
av.boot <- boot(dat, av, R = 3)
av.boot
#
# ORDINARY NONPARAMETRIC BOOTSTRAP
#
#
# Call:
# boot(data = dat, statistic = av, R = 3)
#
#
# Bootstrap Statistics :
# original bias std. error
# t1* 277.75 -127.5 132.2405
#
#
mean(dat) + -127.5
# [1] 150.25
# boot:::ordinary.array
ordinary.array <- function (n, R, strata)
{
inds <- as.integer(names(table(strata)))
if (length(inds) == 1L) {
output <- sample.int(n, n * R, replace = TRUE)
dim(output) <- c(R, n)
}
else {
output <- matrix(as.integer(0L), R, n)
for (is in inds) {
gp <- seq_len(n)[strata == is]
output[, gp] <- if (length(gp) == 1)
rep(gp, R)
else bsample(gp, R * length(gp))
}
}
output
}
# I think the function ordinary.array determines which elements
# of the data are sampled in each of the R samples
set.seed(1234)
ordinary.array(n=4,R=3,1)
# [,1] [,2] [,3] [,4]
# [1,] 1 3 1 3
# [2,] 3 4 1 3
# [3,] 3 3 3 3
#
# which equals:
((1+100+1+100) / 4 + (100+1000+1+100) / 4 + (100+100+100+100) / 4) / 3
# [1] 150.25

Resources