Operations between list objects in R with an apply function

I have a 1x5 matrix A and a list B of 5 matrices (double [100x100]) in R. Each element of A corresponds to one element of B, and I want to compute
C = (max(A[,i], A[,j]) * B[[i]] * B[[j]]) / 2
For example, to calculate C between 1 and 2 I can use the following:
set.seed(123)
A <- matrix(c(5.2, 6.9, 32, 40, 8.3), ncol = 5)
B <- list(matrix(rnorm(100 * 100, mean = 0, sd = 1), 100, 100),
          matrix(rnorm(100 * 100, mean = 0, sd = 1), 100, 100),
          matrix(rnorm(100 * 100, mean = 0, sd = 1), 100, 100),
          matrix(rnorm(100 * 100, mean = 0, sd = 1), 100, 100),
          matrix(rnorm(100 * 100, mean = 0, sd = 1), 100, 100))
C_1.2 <- max(A[,1], A[,2]) * (unlist(B[[1]]) * unlist(B[[2]]))
I want to create a list in R that does the above for all possible combinations, using an apply function, with names of the form C_i.j. I managed to do it as follows:
combinations <- combn(1:5, 2)
result_list <- lapply(1:ncol(combinations), function(i) {
  j <- combinations[1, i]
  k <- combinations[2, i]
  C <- (max(A[, j], A[, k]) * B[[j]] * B[[k]]) / 2
  list(C)
})
names(result_list) <- paste0("C_", combinations[1, ], ".", combinations[2, ])
However, this can't produce C_1.1, C_2.2, C_3.3, C_4.4, C_5.5. How can I fix that? My idea is that when i = j, I should calculate C_i.i <- (A[,i] * B[[i]]^2) / 2.
A solution based on the answers is
out2 <- apply(expand.grid(seq_along(B), seq_along(B)), 1, \(i)
  (max(A[, i]) * (B[[i[1]]] * B[[i[2]]])), simplify = FALSE)
names(out2) <- paste0("C_", do.call(paste,
  c(expand.grid(seq_along(B), seq_along(B)), sep = ".")))
or
C <- lapply(1:5, function(i) {
  lapply(1:5, function(j) {
    if (i == j) {
      C_i.i <- (A[,i] * B[[i]]^2)
    } else {
      C_i.j <- max(A[,i], A[,j]) * (unlist(B[[i]]) * unlist(B[[j]]))
    }
  })
})
This, however, creates all possible scenarios and doubles the computational time, since we know that C_1.2 = C_2.1 and no extra calculation is needed.

We may do this directly in combn
out <- combn(seq_along(B), 2, FUN = function(i)
  (max(A[, i]) * (B[[i[1]]] * B[[i[2]]])), simplify = FALSE)
names(out) <- paste0("C_", combn(seq_along(B), 2, FUN = paste, collapse = "."))
Checking the output against the OP's output:
> C_1.2<- max(A[,1],A[,2]) * (unlist(B[[1]])*unlist(B[[2]]))
> all.equal(out[[1]], C_1.2)
[1] TRUE
If we want all combinations, use expand.grid
out2 <- apply(expand.grid(seq_along(B), seq_along(B)), 1, \(i)
  (max(A[, i]) * (B[[i[1]]] * B[[i[2]]])), simplify = FALSE)
names(out2) <- paste0("C_", do.call(paste,
  c(expand.grid(seq_along(B), seq_along(B)), sep = ".")))
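To also get the diagonal entries C_1.1 through C_5.5 without recomputing the symmetric pairs, the i = j formula proposed in the question can simply be appended to the combn result. A minimal sketch, using the OP's proposed C_i.i formula:
## diagonal terms, using the OP's formula for i = j
diag_terms <- lapply(seq_along(B), function(i) (A[, i] * B[[i]]^2) / 2)
names(diag_terms) <- paste0("C_", seq_along(B), ".", seq_along(B))
## combine with the unique off-diagonal pairs from combn
out_full <- c(out, diag_terms)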

Related

Using app function from {terra} package on raster stacks? (in parallel)

I have four high-resolution rasters for a country. I have split each raster into tiles and done some other processing to them. I now want to apply a function to each cell of each 'stack' of raster tiles, to produce one set of output tiles. The function is a little complex. I have tried to synthesise some data below to reproduce my current approach. It works (ish), but I'm convinced there's a better way to do this. To use parallel processing on my unix box, I simply swap mapply for mcmapply, but I haven't done that in the example below as I presume many will be working on Windows machines. I'd welcome ideas on my approach and particularly on optimisation.
library("terra")
library("glue")
## Make some toy data
dir.create("temp_folder")
dir.create("result_folder")
x <- rast(ncols = 10, nrows = 10)
a <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(a) <- some_values
a_tiles <- makeTiles(a, x, glue("temp_folder/tile_a_{1:100}.tif"), overwrite = TRUE)
b <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(b) <- some_values
b_tiles <- makeTiles(b, x, glue("temp_folder/tile_b_{1:100}.tif"), overwrite = TRUE)
c <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(c) <- some_values
c_tiles <- makeTiles(c, x, glue("temp_folder/tile_c_{1:100}.tif"), overwrite = TRUE)
d <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(d) <- some_values
d_tiles <- makeTiles(d, x, glue("temp_folder/tile_d_{1:100}.tif"), overwrite = TRUE)
## Outer function so that this can be used in parallel ? But maybe this is a silly way to do it?
outer_function <- function(a_tiles, b_tiles, c_tiles, d_tiles, output_files) {
  one_a_tile <- rast(unlist(a_tiles))
  one_b_tile <- rast(unlist(b_tiles))
  one_c_tile <- rast(unlist(c_tiles))
  one_d_tile <- rast(unlist(d_tiles))
  output_file <- output_files
  # I replace any NAs with 0 as an NA will break my 'if' statement of the inner_function.
  # I get Error in if (z["a"] <= z["b"]) { : missing value where TRUE/FALSE needed
  one_a_tile[is.na(one_a_tile)] <- 0
  one_b_tile[is.na(one_b_tile)] <- 0
  one_c_tile[is.na(one_c_tile)] <- 0
  one_d_tile[is.na(one_d_tile)] <- 0
  z <- sds(one_a_tile, one_b_tile, one_c_tile, one_d_tile)
  ## Inner function that actually does the work I want doing
  inner_function <- function(z) {
    names(z) <- c('a', 'b', 'c', 'd')
    if (z['a'] <= z['b']) {
      y <- rowSums(cbind((z['c'] + z['a'] * 10),
                         (z['c'] + z['a'] * 20)))
    }
    if (z['a'] >= z['b']) {
      y <- rowSums(cbind((z['c'] + z['a'] * 40),
                         (z['c'] + z['a'] * 10)))
    }
    if (z['a'] == z['b']) {
      y <- rowSums(cbind((z['c'] + z['a'] * 60),
                         (z['c'] + z['a'] * 10)))
    }
    y <- ifelse(y == 0, NA, y)
    return(y)
  }
  app(z,
      inner_function,
      filename = output_file,
      overwrite = TRUE,
      wopt = list(datatype = "INT4U"))
  return(output_file)
}
output_files <- glue("result_folder/result_tile_{1:length(d_tiles)}.tif")
results <- mapply(outer_function,
                  a_tiles = a_tiles,
                  b_tiles = b_tiles,
                  c_tiles = c_tiles,
                  d_tiles = d_tiles,
                  output_files = output_files)
names(results) <- NULL
unlink("temp_folder", recursive = TRUE)
unlink("result_folder", recursive = TRUE)

Convert for loops into foreach loops

I want to make the code below more efficient by using the foreach package. I have tried for a very long time, but I don't manage to get the same result as with the for-loops. I would like to use a nested foreach loop with parallelization, and as output I would like two matrices of dimension R x length(b1). I would be very grateful for some suggestions!
library(doParallel) # for makeCluster(), registerDoParallel()
library(doRNG)      # for the %dorng% operator
n <- c(100, 300, 500)
R <- 100
b0 <- 110
b1 <- seq(0.01, 0.1, length.out = 100)
## all combinations of n and b1
grid <- expand.grid(n, b1)
names(grid) <- c("n", "b1")
calcPower <- function(R, b0, grid) {
  cl <- makeCluster(3)
  registerDoParallel(cl)
  ## n and b1 coefficients
  n <- grid$n
  b1 <- grid$b1
  ## ensures reproducibility
  set.seed(2020)
  x <- runif(n, 18, 80)
  x.dich <- factor(ifelse(x < median(x), 0, 1))
  ## enables storing two outputs
  solution <- list()
  ## .options.RNG ensures reproducibility
  res <- foreach(i = 1:R, .combine = rbind, .inorder = TRUE, .options.RNG = 666) %dorng% {
    p.val <- list()
    p.val.d <- list()
    for (j in seq_along(b1)) {
      y <- b0 + b1[j] * x + rnorm(n, 0, sd = 10)
      mod.lm <- lm(y ~ x)
      mod.lm.d <- lm(y ~ x.dich)
      p.val <- c(p.val, ifelse(summary(mod.lm)$coef[2, 4] <= 0.05, 1, 0))
      p.val.d <- c(p.val.d, ifelse(summary(mod.lm.d)$coef[2, 4] <= 0.05, 1, 0))
    }
    solution[[1]] <- p.val
    solution[[2]] <- p.val.d
    return(solution)
  }
  dp.val <- matrix(unlist(res[, 1], use.names = FALSE), R, length(b1), byrow = TRUE)
  dp.val.d <- matrix(unlist(res[, 2], use.names = FALSE), R, length(b1), byrow = TRUE)
  stopCluster(cl)
  df <- data.frame(
    effectS = b1,
    power = apply(dp.val, 2, function(x) mean(x) * 100),
    power.d = apply(dp.val.d, 2, function(x) mean(x) * 100),
    n = factor(n))
  return(df)
}
## simulation for different n
tmp <- with(grid,
            by(grid, n,
               calcPower, R = R, b0 = b0))
## combines the 3 results
df.power <- rbind(tmp[[1]], tmp[[2]], tmp[[3]])
I created a foreach loop in the following code. Some changes had to be made: it is a lot easier to return a list than a matrix from foreach, since the results are combined with rbind, especially when you want to return several objects. My solution here is to save everything in a list and afterwards transform it into a matrix with length(b1) = 100 columns.
Note: there is one mistake in your code. summary(mod.lm.d)$coef[2,4] does not exist. I changed it to [2]. Adjust to your needs.
## assumes a registered backend, e.g. cl <- makeCluster(3); registerDoParallel(cl)
solution <- list()
df2 <- foreach(i = 1:R, .combine = rbind, .inorder = TRUE) %dopar% {
  set.seed(i)
  p.val <- list()
  p.val.d <- list()
  counter <- list()
  for (j in seq_along(b1)) {
    x <- sort(runif(n, 18, 80))
    x.dich <- factor(ifelse(x < median(x), 0, 1))
    y <- b0 + b1[j] * x + rnorm(n, 0, sd = 10)
    mod.lm <- lm(y ~ x)
    mod.lm.d <- lm(y ~ x.dich)
    p.val <- c(p.val, ifelse(summary(mod.lm)$coef[2] <= 0.05, 1, 0))
    p.val.d <- c(p.val.d, ifelse(summary(mod.lm.d)$coef[2] <= 0.05, 1, 0))
    counter <- c(counter, j)
  }
  solution[[1]] <- p.val
  solution[[2]] <- p.val.d
  solution[[3]] <- counter
  return(solution)
}
dp.val <- unlist(df2[, 1], use.names = FALSE)
dp.val.d <- unlist(df2[, 2], use.names = FALSE)
dp.val.matr <- matrix(dp.val, R, length(b1))
dp.val.d.matr <- matrix(dp.val.d, R, length(b1))
stopCluster(cl)
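If needed, the power summaries from the question's final data.frame can then be recovered from these matrices in the same way. A sketch, mirroring the question's own apply calls:
power <- apply(dp.val.matr, 2, function(x) mean(x) * 100)
power.d <- apply(dp.val.d.matr, 2, function(x) mean(x) * 100)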
Regarding your comment: a foreach does work with a normal for loop inside it. Minimal reproducible example:
df <- foreach(i = 1:R, .combine = cbind, .inorder = TRUE) %dopar% {
  x <- list()
  for (j in 1:3) {
    x <- c(x, j)
  }
  return(x)
}
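Note that %dopar% needs a parallel backend registered before any of the above will actually run in parallel. A minimal setup sketch (the cluster size is arbitrary):
library(doParallel)
cl <- makeCluster(2)
registerDoParallel(cl)
## ... run the foreach examples above ...
stopCluster(cl)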

Count with Varying Levels Parameters in R

I want R to count how many times my simulated ARIMA data conform to ARIMA(1,0,0) which I have achieved with:
library(forecast)
cnt <- 0
num <- 60
phi <- 0.8
for (i in 1:10) {
  epselon <- rnorm(num, mean = 0, sd = 1)
  ar1 <- arima.sim(n = num, model = list(ar = phi, order = c(1, 0, 0)),
                   sd = 1)
  ar2 <- auto.arima(ar1)
  if (all(arimaorder(ar2) == c(1, 0, 0))) cnt <- cnt + 1
}
cnt
The above is just for a single case when sd=1, n=60, and ar=0.8.
I want a case where I have varying levels of N <- c(15, 20), SD <- c(1, 2)^2, and phi = c(0.8, 0.9) for the sample size, standard deviation and AR parameter respectively.
I have tried this:
library(forecast)
N <- c(15, 20)
SD <- c(1, 2)^2
phi <- c(0.8, 0.9)
## generate all combos
all_combos <- expand.grid(N = N, SD = SD, phi = phi)
epselon <- function(n) rnorm(n, mean = 0, sd = SD)
## create function
fx_arima <- function(n, SD, phi) {
  cnt <- 0
  num <- 60
  phi <- 0.8
  for (i in 1:10) {
    epselon <- rnorm(num, mean = 0, sd = 1)
    ar1 <- arima.sim(n = num, model = list(ar = phi, order = c(1, 0, 0)), sd = 1)
    ar2 <- auto.arima(ar1)
    if (all(arimaorder(ar2) == c(1, 0, 0))) cnt <- cnt + 1
  }
  cnt
}
## find arima for all combos using Map
set.seed(123L)
res <- Map(fx_arima, all_combos[["N"]], all_combos[["SD"]],
           all_combos[["phi"]])
## or a little bit more work:
set.seed(123L)
res2 <- by(all_combos, all_combos["N"],
           function(DF) {
             res <- mapply(fx_arima, DF[["N"]], DF[["SD"]], DF[["phi"]])
             colnames(res) <- paste("SD", DF[["SD"]], "phi", DF[["phi"]], sep = "_")
             res
           })
res2
## write to csv
Map(function(file, DF) write.csv(DF, paste0("N_", file, ".csv")),
    names(res2), res2)
which I mirrored from the questions "arima.sim() function with varying sample sizes, phi values and sd values" and "R Count How Many Times auto.arima() Confirms arima.sim() to be True".
I got this error message:
Error in `colnames<-`(`*tmp*`, value = c("SD_1_phi_0.2", "SD_4_phi_0.2", : attempt to set 'colnames' on an object with less than two dimensions
How can I solve this so that my results show the labels in the first row and the counts themselves in the second row? The results should go into two sheets: the first for N = 15 and the second for N = 20.
If I understood your problem correctly, the error comes from the function colnames, because your function does not return a "pure" matrix-like object. If, instead, you use the function names in your last chunk of code as follows:
res2 <- by(all_combos, all_combos["N"],
           function(DF) {
             res <- mapply(fx_arima, DF[["N"]], DF[["SD"]], DF[["phi"]])
             names(res) <- paste("SD", DF[["SD"]], "phi", DF[["phi"]], sep = "_")
             return(res)
           })
res2
You will get:
> res2
N: 15
SD_1_phi_0.8 SD_4_phi_0.8 SD_1_phi_0.9 SD_4_phi_0.9
1 3 7 5
---------------------------------------------------------------------------
N: 20
SD_1_phi_0.8 SD_4_phi_0.8 SD_1_phi_0.9 SD_4_phi_0.9
3 4 5 2
With elements accessible by name and index:
> res2$`15`["SD_1_phi_0.8"]
SD_1_phi_0.8
1
> res2$`15`[1]
SD_1_phi_0.8
1
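Note that fx_arima as posted never uses its n, SD and phi arguments: num, phi and sd are hard-coded in the body, so every combination currently returns counts for the same setting. A sketch of a version that actually varies them (assuming the SD values c(1, 2)^2 are variances, so the innovation sd is sqrt(SD)):
fx_arima <- function(n, SD, phi) {
  cnt <- 0
  for (i in 1:10) {
    # simulate an AR(1) series of length n with the given phi and innovation sd
    ar1 <- arima.sim(n = n, model = list(ar = phi, order = c(1, 0, 0)),
                     sd = sqrt(SD))
    ar2 <- auto.arima(ar1)
    if (all(arimaorder(ar2) == c(1, 0, 0))) cnt <- cnt + 1
  }
  cnt
}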

subtracting unique pair-wise objects from for loop in R

I'm trying to subtract each unique pair of ps produced by the for loop in my function below. To do so, I first find the unique pairs using combn(p, 2) and then use outer to subtract each unique pair from the other.
In both steps, I get an error. Is there a fix for the error?
prop <- function(n, yes, a, b = a) {
  p <- list()
  for (i in 1:length(n)) {
    p[[i]] <- rbeta(2, a[i] + yes[i], b[i] + (n[i] - yes[i]))
  }
  outer(combn(p, 2), FUN = "-") # Gives Error
}
prop(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3))
By default, simplify = TRUE in combn. So, even though the output elements come from a list, they are simplified into a matrix with a dim attribute, with each list element becoming an element of that matrix. As m is 2, there are 2 list elements for each combination; extract those elements using [[ and subtract:
combn(p, 2, FUN = function(x) x[[1]] - x[[2]])
The full function:
prop <- function(n, yes, a, b = a) {
  p <- list()
  for (i in 1:length(n)) {
    p[[i]] <- rbeta(2, a[i] + yes[i], b[i] + (n[i] - yes[i]))
  }
  combn(p, 2, FUN = function(x) x[[1]] - x[[2]])
}
prop(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3))
If we wanted to include another argument, how, to control the direction of subtraction:
prop <- function(n, yes, a, b = a, how = "one.two") {
  delta <- switch(how,
                  one.two = function(x) x[[1]] - x[[2]],
                  two.one = function(x) x[[2]] - x[[1]])
  p <- list()
  for (i in 1:length(n)) {
    p[[i]] <- rbeta(2, a[i] + yes[i], b[i] + (n[i] - yes[i]))
  }
  out <- combn(p, 2, FUN = delta)
  nm1 <- paste0("p", combn(seq_along(p), 2, FUN = paste, collapse = "-"))
  colnames(out) <- nm1
  out
}
prop(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3), how = "one.two")
prop(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3), how = "two.one")
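A quick shape check: with three groups there are choose(3, 2) = 3 unique pairs, and each rbeta() call draws 2 values, so the result is a 2 x 3 matrix:
res <- prop(n = c(10, 20, 30), yes = rep(5, 3), a = rep(1, 3))
dim(res)
# [1] 2 3
colnames(res)
# [1] "p1-2" "p1-3" "p2-3"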

Split a vector into chunks such that sum of each chunk is approximately constant

I have a large data frame with more than 100 000 records where the values are sorted.
For example, consider the following dummy data set:
df <- data.frame(values = c(1,1,2,2,3,4,5,6,6,7))
I want to create 3 groups of the above values (in sequence only) such that the sum of each group is more or less the same.
So for the above values, if I decide to divide the sorted df into 3 groups as follows, their sums will be:
1. 1 + 1 + 2 +2 + 3 + 4 = 13
2. 5 + 6 = 11
3. 6 + 7 = 13
How can I create this optimization in R? Any logic?
So, let's use pruning. I think the other solutions give a good split, but not the best one.
First, we want to minimize
D(p, q) = (S_p - S*)^2 + (S_q - S_p - S*)^2 + (S_n - S_q - S*)^2,
where S_n is the cumulative sum of the first n elements and S* = S_n / 3 is the ideal group sum.
computeD <- function(p, q, S) {
  n <- length(S)
  S.star <- S[n] / 3
  if (all(p < q)) {
    (S[p] - S.star)^2 + (S[q] - S[p] - S.star)^2 + (S[n] - S[q] - S.star)^2
  } else {
    stop("You shouldn't be here!")
  }
}
I think the other solutions optimize over p and q independently, which won't give a global minimum (as expected for some particular cases).
optiCut <- function(v) {
  S <- cumsum(v)
  n <- length(v)
  S_star <- S[n] / 3
  # good starting values
  p_star <- which.min((S - S_star)^2)
  q_star <- which.min((S - 2 * S_star)^2)
  print(min <- computeD(p_star, q_star, S))
  count <- 0
  for (q in 2:(n - 1)) {
    S3 <- S[n] - S[q] - S_star
    if (S3 * S3 < min) {
      count <- count + 1
      D <- computeD(seq_len(q - 1), q, S)
      ind <- which.min(D)
      if (D[ind] < min) {
        # Update optimal values
        p_star <- ind
        q_star <- q
        min <- D[ind]
      }
    }
  }
  c(p_star, q_star, computeD(p_star, q_star, S), count)
}
This is as fast as the other solutions because it prunes a lot of iterations based on the condition S3 * S3 < min. But it gives the optimal solution; see optiCut(c(1, 2, 3, 3, 5, 10)).
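Worked through by hand for that example: S = (1, 3, 6, 9, 14, 24) and S* = 8, so the greedy starting cut is p = 4, q = 5 with D = 1 + 9 + 4 = 14, while the pruned search finds p = 3, q = 5 with D = 4 + 0 + 4 = 8:
optiCut(c(1, 2, 3, 3, 5, 10))
# [1] 14        (greedy starting value of D, printed)
# [1] 3 5 8 1   (p_star, q_star, optimal D, prune count)
# i.e. groups (1, 2, 3), (3, 5), (10) with sums 6, 9, 10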
For the solution with K >= 3, I basically reimplemented trees with nested tibbles, that was fun!
optiCut_K <- function(v, K) {
  S <- cumsum(v)
  n <- length(v)
  S_star <- S[n] / K
  # good starting values
  p_vec_first <- sapply(seq_len(K - 1), function(i) which.min((S - i * S_star)^2))
  min_first <- sum((diff(c(0, S[c(p_vec_first, n)])) - S_star)^2)
  compute_children <- function(level, ind, val) {
    # leaf
    if (level == 1) {
      val <- val + (S[ind] - S_star)^2
      if (val > min_first) {
        return(NULL)
      } else {
        return(val)
      }
    }
    P_all <- val + (S[ind] - S[seq_len(ind - 1)] - S_star)^2
    inds <- which(P_all < min_first)
    if (length(inds) == 0) return(NULL)
    node <- tibble::tibble(
      level = level - 1,
      ind = inds,
      val = P_all[inds]
    )
    node$children <- purrr::pmap(node, compute_children)
    node <- dplyr::filter(node, !purrr::map_lgl(children, is.null))
    `if`(nrow(node) == 0, NULL, node)
  }
  compute_children(K, n, 0)
}
This gives you all the solutions that are at least as good as the greedy one:
v <- sort(sample(1:1000, 1e5, replace = TRUE))
test <- optiCut_K(v, 9)
You need to unnest this:
full_unnest <- function(tbl) {
  tmp <- try(tidyr::unnest(tbl), silent = TRUE)
  `if`(identical(class(tmp), "try-error"), tbl, full_unnest(tmp))
}
print(test <- full_unnest(test))
And finally, to get the best solution:
test[which.min(test$children), ]
Here is one approach:
splitter <- function(values, N) {
  inds <- c(0, sapply(1:N, function(i) which.min(abs(cumsum(as.numeric(values)) - sum(as.numeric(values)) / N * i))))
  dif <- diff(inds)
  re <- rep(1:length(dif), times = dif)
  return(split(values, re))
}
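A quick check against the dummy data from the question reproduces the grouping asked for there (sums 13, 11, 13):
splitter(c(1, 1, 2, 2, 3, 4, 5, 6, 6, 7), 3)
# $`1`
# [1] 1 1 2 2 3 4
# $`2`
# [1] 5 6
# $`3`
# [1] 6 7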
How good is it?
# I calculate the mean and sd of the maximal difference of the sums in the
# splits over 100 runs.
# split into 15 parts:
set.seed(5)
z1 <- as.data.frame(matrix(1:15, nrow = 1))
repeat {
  values <- sort(sample(1:1000, 1000000, replace = TRUE))
  z <- splitter(values, 15)
  z <- lapply(z, sum)
  z <- unlist(z)
  z1 <- rbind(z1, z)
  if (nrow(z1) > 101) {
    break
  }
}
z1 <- z1[-1, ]
mean(apply(z1, 1, function(x) max(x) - min(x)))
# [1] 1004.158
sd(apply(z1, 1, function(x) max(x) - min(x)))
# [1] 210.6653
# with fewer splits (4)
set.seed(5)
z1 <- as.data.frame(matrix(1:4, nrow = 1))
repeat {
  values <- sort(sample(1:1000, 1000000, replace = TRUE))
  z <- splitter(values, 4)
  z <- lapply(z, sum)
  z <- unlist(z)
  z1 <- rbind(z1, z)
  if (nrow(z1) > 101) {
    break
  }
}
z1 <- z1[-1, ]
mean(apply(z1, 1, function(x) max(x) - min(x)))
# [1] 632.7723
sd(apply(z1, 1, function(x) max(x) - min(x)))
# [1] 260.9864
library(microbenchmark)
With 1M values:
values <- sort(sample(1:1000, 1000000, replace = TRUE))
microbenchmark(
  sp_27 = splitter(values, 27),
  sp_3 = splitter(values, 3)
)
Unit: milliseconds
  expr      min       lq      mean    median        uq       max neval cld
 sp_27 897.7346 934.2360 1052.0972 1078.6713 1118.6203 1329.3044   100   b
  sp_3 108.3283 116.2223  209.4777  173.0522  291.8669  409.7050   100  a
By the way, F. Privé is correct: this function does not give the globally optimal split. It is greedy, which is not a good characteristic for such a problem. It will give splits with sums closer to sum / N in the initial part of the vector, but behaving this way compromises the splits in the later part of the vector.
Here is a test comparison of the three functions posted so far:
db <- function(values, N) {
  temp <- floor(sum(values) / N)
  inds <- c(0, which(c(0, diff(cumsum(values) %% temp)) < 0)[1:(N - 1)], length(values))
  dif <- diff(inds)
  re <- rep(1:length(dif), times = dif)
  return(split(values, re))
}
# had to change it a bit since the posted one would not work - the core
# which calculates the splitting positions is the same

missuse <- function(values, N) {
  inds <- c(0, sapply(1:N, function(i) which.min(abs(cumsum(as.numeric(values)) - sum(as.numeric(values)) / N * i))))
  dif <- diff(inds)
  re <- rep(1:length(dif), times = dif)
  return(split(values, re))
}
prive <- function(v, N) { # added dummy N argument because of the tester function
  dummy <- N
  computeD <- function(p, q, S) {
    n <- length(S)
    S.star <- S[n] / 3
    if (all(p < q)) {
      (S[p] - S.star)^2 + (S[q] - S[p] - S.star)^2 + (S[n] - S[q] - S.star)^2
    } else {
      stop("You shouldn't be here!")
    }
  }
  optiCut <- function(v, N) {
    S <- cumsum(v)
    n <- length(v)
    S_star <- S[n] / 3
    # good starting values
    p_star <- which.min((S - S_star)^2)
    q_star <- which.min((S - 2 * S_star)^2)
    print(min <- computeD(p_star, q_star, S))
    count <- 0
    for (q in 2:(n - 1)) {
      S3 <- S[n] - S[q] - S_star
      if (S3 * S3 < min) {
        count <- count + 1
        D <- computeD(seq_len(q - 1), q, S)
        ind <- which.min(D)
        if (D[ind] < min) {
          # Update optimal values
          p_star <- ind
          q_star <- q
          min <- D[ind]
        }
      }
    }
    c(p_star, q_star, computeD(p_star, q_star, S), count)
  }
  z3 <- optiCut(v)
  inds <- c(0, z3[1:2], length(v))
  dif <- diff(inds)
  re <- rep(1:length(dif), times = dif)
  return(split(v, re))
} # added output to be more in line with the other two
Function for testing:
tester <- function(split, seed) {
  set.seed(seed)
  z1 <- as.data.frame(matrix(1:3, nrow = 1))
  repeat {
    values <- sort(sample(1:1000, 1000000, replace = TRUE))
    z <- split(values, 3)
    z <- lapply(z, sum)
    z <- unlist(z)
    z1 <- rbind(z1, z)
    if (nrow(z1) > 101) {
      break
    }
  }
  m <- mean(apply(z1, 1, function(x) max(x) - min(x)))
  s <- sd(apply(z1, 1, function(x) max(x) - min(x)))
  return(c("mean" = m, "sd" = s))
} # tests 100 random 1M-length vectors with elements drawn from 1:1000
tester(db, 5)
#     mean       sd
# 779.5686 349.5717
tester(missuse, 5)
#     mean       sd
# 481.4804 216.9158
tester(prive, 5)
#     mean       sd
# 451.6765 174.6303
prive is the clear winner; however, it takes quite a bit longer than the other two, and it can only handle splitting into 3 parts.
microbenchmark(
  missuse(values, 3),
  prive(values, 3),
  db(values, 3)
)
Unit: milliseconds
               expr        min        lq      mean    median        uq       max neval cld
 missuse(values, 3)  100.85978  111.1552  185.8199  120.1707  304.0303  393.4031   100  a
   prive(values, 3) 1932.58682 1980.0515 2096.7516 2043.7133 2211.6294 2671.9357   100   b
      db(values, 3)   96.86879  104.5141  194.0085  117.6270  306.7143  500.6455   100  a
The idea: cumsum(df$values) %% temp drops each time the running total crosses a multiple of the target sum temp, and those drop positions are used as cut points.
N <- 3
temp <- floor(sum(df$values) / N)
inds <- c(0, which(c(0, diff(cumsum(df$values) %% temp)) < 0)[1:(N - 1)], NROW(df))
split(df$values, rep(1:N, if (N == 1) NROW(df) else diff(inds)))
#$`1`
#[1] 1 1 2 2 3 4
#$`2`
#[1] 5 6
#$`3`
#[1] 6 7
