R - Vectorized implementation of a ternary function

I have three vectors X, Y and Z of equal length n. I need to create an n x n x n array of a function f(X[i], Y[j], Z[k]). The straightforward way to do this is to loop sequentially through each element of each of the 3 vectors. However, the time required to compute the array grows cubically with n (there are n^3 elements to fill). Is there a way to implement this using vectorized operations?
EDIT: As mentioned in the comments, I have added a simple example of what's needed.
set.seed(1)
X = rnorm(10)
Y = seq(11,20)
Z = seq(21,30)
F = array(0, dim=c( length(X),length(Y),length(Z) ) )
for (i in 1:length(X))
  for (j in 1:length(Y))
    for (k in 1:length(Z))
      F[i,j,k] = X[i] * (Y[j] + Z[k])
Thanks.

You can use nested outer calls:
set.seed(1)
X = rnorm(10)
Y = seq(11,20)
Z = seq(21,30)
F = array(0, dim = c( length(X),length(Y),length(Z) ) )
for (i in 1:length(X))
  for (j in 1:length(Y))
    for (k in 1:length(Z))
      F[i,j,k] = X[i] * (Y[j] + Z[k])
F2 <- outer(X, outer(Y, Z, "+"), "*")
> identical(F, F2)
[1] TRUE
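Why this works: outer(Y, Z, "+") is a length(Y) x length(Z) matrix M with M[j, k] = Y[j] + Z[k], and outer(X, M, "*") prepends X's dimension. A quick shape check (my addition, using the vectors defined above):
M <- outer(Y, Z, "+")        # 10 x 10 matrix, M[j, k] = Y[j] + Z[k]
dim(M)                       # [1] 10 10
dim(outer(X, M, "*"))        # [1] 10 10 10 -- X's dimension comes first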
A microbenchmark including the expand.grid solution proposed by Nick K:
X = rnorm(100)
Y = 1:100
Z = 101:200
forLoop <- function(X, Y, Z) {
  F = array(0, dim = c(length(X), length(Y), length(Z)))
  for (i in 1:length(X))
    for (j in 1:length(Y))
      for (k in 1:length(Z))
        F[i,j,k] = X[i] * (Y[j] + Z[k])
  return(F)
}
nestedOuter <- function(X, Y, Z) {
  outer(X, outer(Y, Z, "+"), "*")
}
expandGrid <- function(X, Y, Z) {
  df <- expand.grid(X = X, Y = Y, Z = Z)
  G <- df$X * (df$Y + df$Z)
  dim(G) <- c(length(X), length(Y), length(Z))
  return(G)
}
library(microbenchmark)
mbm <- microbenchmark(
  forLoop = F1 <- forLoop(X, Y, Z),
  nestedOuter = F2 <- nestedOuter(X, Y, Z),
  expandGrid = F3 <- expandGrid(X, Y, Z),
  times = 50L)
> mbm
Unit: milliseconds
        expr         min         lq        mean      median          uq        max neval
     forLoop 3261.872552 3339.37383 3458.812265 3388.721159 3524.651971 4074.40422    50
 nestedOuter    3.293461    3.36810    9.874336    3.541637    5.126789   54.24087    50
  expandGrid   53.907789   57.15647   85.612048   88.286431  103.516819  235.45443    50

Here, as an additional option, is a possible Rcpp implementation (in case you like your loops). I wasn't able to outperform @Julien's solution, though (maybe someone can); the two have more or less the same timing. Note that this version assumes X, Y and Z all have the same length.
library(Rcpp)
cppFunction('NumericVector RCPP(NumericVector X, NumericVector Y, NumericVector Z){
  int nrow = X.size(), ncol = 3, indx = 0;
  double temp;
  NumericVector out(pow(nrow, ncol));
  IntegerVector dim(ncol);
  for (int l = 0; l < ncol; l++){
    dim[l] = nrow;
  }
  // Fill in column-major order (first index fastest), so that
  // out[i, j, k] = X[i] * (Y[j] + Z[k])
  for (int k = 0; k < nrow; k++) {
    for (int j = 0; j < nrow; j++) {
      temp = Y[j] + Z[k];
      for (int i = 0; i < nrow; i++) {
        out[indx] = X[i] * temp;
        indx += 1;
      }
    }
  }
  out.attr("dim") = dim;
  return out;
}')
Validating
identical(RCPP(X, Y, Z), F)
## [1] TRUE
A quick benchmark
set.seed(123)
X = rnorm(100)
Y = 1:100
Z = 101:200
nestedOuter <- function(X, Y, Z) outer(X, outer(Y, Z, "+"), "*")
library(microbenchmark)
microbenchmark(
  nestedOuter = nestedOuter(X, Y, Z),
  RCPP = RCPP(X, Y, Z),
  unit = "relative",
  times = 1e4)
# Unit: relative
#        expr      min       lq     mean   median       uq       max neval
# nestedOuter 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000 10000
#        RCPP 1.164254 1.141713 1.081235 1.100596 1.080133 0.7092394 10000

You could use expand.grid as follows:
df <- expand.grid(X = X, Y = Y, Z = Z)
G <- df$X * (df$Y + df$Z)
dim(G) <- c(length(X), length(Y), length(Z))
all.equal(F, G)
If you had a vectorised function, this would work just as well. If not, you could use plyr::daply.
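To illustrate the generic pattern (a sketch of my own; the helper name ternary_array is made up), this works for any vectorised f because expand.grid varies its first argument fastest, matching R's column-major array filling:
ternary_array <- function(X, Y, Z, f) {  # f must accept vector arguments
  df <- expand.grid(X = X, Y = Y, Z = Z)
  out <- f(df$X, df$Y, df$Z)
  dim(out) <- c(length(X), length(Y), length(Z))
  out
}
# For a scalar-only f, wrap it first: f_vec <- Vectorize(f)
G2 <- ternary_array(X, Y, Z, function(x, y, z) x * (y + z))
all.equal(F, G2)  # should be TRUE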

Related

Optimise matrix calculation in R - possible to avoid aperm?

I have a matrix calculation that I'd like to speed up.
Some toy data and example code:
n = 2 ; d = 3
mu <- matrix(runif(n*d), nrow=n, ncol=d)
sig <- matrix(runif(n*d), nrow=n, ncol=d)
x_i <- c(0, 0, 1, 1)
not_missing <- !is.na(x_i)
calc1 <- function(n, d, mu, sig, x_i, not_missing){
  z <- array(rep(0, length(x_i)*n*d),
             dim = c(length(x_i), n, d))
  subtract_term <- 0.5*log(2*pi*sig)
  for(i in 1:length(x_i)){
    if( not_missing[i] ){
      z[i, , ] <- ((-(x_i[i] - mu)^2 / (2*sig)) - subtract_term )
    }
  }
  z <- aperm(z, c(2, 1, 3))
  return(z)
}
microbenchmark(
z1 <- calc1(n, d, mu, sig, x_i, not_missing)
)
In profiling with real data, both the z[i, , ] <- line and the aperm() line are the slow points. I've been trying to optimise it to avoid calling aperm altogether by transposing the 2D matrices earlier to avoid a 3D transpose, but then I cannot put the 3D array together properly. Any help much appreciated.
Edit: I have a partial solution from @G. Grothendieck, which eliminates the aperm; for some reason it has not resulted in much speed improvement, though. The new solution from his answer is:
calc2 <- function(n, d, mu, sig, x_i, not_missing){
  nx <- length(x_i)
  z <- array(0, dim = c(n, nx, d))
  subtract_term <- 0.5*log(2*pi*sig)
  for(i in 1:nx){
    if( not_missing[i] ) {
      z[, i, ] <- ((-(x_i[i] - mu)^2 / (2*sig)) - subtract_term )
    }
  }
  return(z)
}
Speed comparison:
> microbenchmark(
+ z1 <- calc1(n, d, mu, sig, x_i, not_missing),
+ z2 <- calc2(n, d, mu, sig, x_i, not_missing), times = 1000
+ )
Unit: microseconds
                                         expr    min      lq     mean  median     uq      max neval cld
 z1 <- calc1(n, d, mu, sig, x_i, not_missing) 13.586 14.2975 24.41132 14.5020 14.781 9125.591  1000   a
 z2 <- calc2(n, d, mu, sig, x_i, not_missing)  9.094  9.5615 19.98271  9.8875 10.202 9655.254  1000   a
This eliminates the aperm.
calc2 <- function(n, d, mu, sig, x_i, not_missing){
  nx <- length(x_i)
  z <- array(0, dim = c(n, nx, d))
  subtract_term <- 0.5*log(2*pi*sig)
  for(i in 1:nx){
    if( not_missing[i] ) {
      z[, i, ] <- ((-(x_i[i] - mu)^2 / (2*sig)) - subtract_term )
    }
  }
  return(z)
}
z1 <- calc1(n, d, mu, sig, x_i, not_missing)
z2 <- calc2(n, d, mu, sig, x_i, not_missing)
identical(z1, z2)
## [1] TRUE
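A further idea (a hedged sketch of my own, not from the thread): since d is typically small, loop over the d columns instead of over x_i, filling a whole n x nx slab per iteration with a single vectorized outer() call, then zero out the slabs for missing x_i afterwards.
calc3 <- function(n, d, mu, sig, x_i, not_missing){
  nx <- length(x_i)
  z <- array(0, dim = c(n, nx, d))
  subtract_term <- 0.5*log(2*pi*sig)
  for (j in 1:d) {
    # entry (r, i) is -(x_i[i] - mu[r, j])^2 / (2*sig[r, j]) - subtract_term[r, j];
    # the division and subtraction recycle column-wise over the n rows
    z[, , j] <- -outer(mu[, j], x_i, function(m, x) (x - m)^2) / (2*sig[, j]) -
      subtract_term[, j]
  }
  z[, !not_missing, ] <- 0  # slabs for missing x_i stay zero, as in calc2
  z
}
z3 <- calc3(n, d, mu, sig, x_i, not_missing)
all.equal(z2, z3)  # should be TRUE (all x_i are non-missing in the toy data)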

Generate functional data from Gaussian Process in R

Model:
X(t) = 4*t + e(t),   t ∈ [0, 1]
where e(t) is a Gaussian process with zero mean and covariance function f(s, t) = exp(-|t - s|).
The final result over 100 runs (= 100 gray lines) with 50 sampled points each should look like the gray area in the picture.
The green line is what I get from the code below.
library(MASS)
kernel_1 <- function(x, y){
  exp(-abs(x - y))
}
cov_matrix <- function(x, kernel_fn, ...) {
  outer(x, x, function(a, b) kernel_fn(a, b, ...))
}
draw_samples <- function(x, N = 1, kernel_fn, ...) {
  set.seed(100)
  Y <- matrix(NA, nrow = length(x), ncol = N)
  for (n in 1:N) {
    K <- cov_matrix(x, kernel_fn, ...)
    Y[, n] <- mvrnorm(1, mu = rep(0, times = length(x)), Sigma = K)
  }
  Y
}
x <- seq(0, 1, length.out = 51) # x-coordinates
model1 <- function(obs, x) {
  model1_data <- matrix(NA, nrow = obs, ncol = length(x))
  for(i in 1:obs){
    e <- draw_samples(x, 1, kernel_fn = kernel_1)
    X <- c()
    for (p in 1:length(x)){
      t <- x[p]
      val <- (4*t) + e[p,]
      X = c(X, val)
    }
    model1_data[i,] <- X
  }
  model1_data
}
# model1(100, x)
Because you have set.seed in draw_samples, you are getting the same random numbers with each draw. If you remove it, then you can do:
a <- model1(100, x)
matplot(t(a), type = "l", col = 'gray')
to get the 100 gray sample paths.
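As a side note, here is a hedged sketch (my own, under the same model) of a loop-free variant: mvrnorm can draw all 100 sample paths in one call, so the double loop in model1 is unnecessary.
K <- cov_matrix(x, kernel_1)                                # 51 x 51 covariance matrix
e <- MASS::mvrnorm(100, mu = rep(0, length(x)), Sigma = K)  # 100 x 51, one row per path
a2 <- sweep(e, 2, 4 * x, "+")                               # add the 4*t trend to every row
matplot(x, t(a2), type = "l", lty = 1, col = "gray")        # the gray band
lines(x, 4 * x, col = "green", lwd = 2)                     # the mean function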

How to speed up a Rcpp double for loop

Since I am not an expert in C++ (or Rcpp) I was wondering if there are other / better ways to write this little function below, which uses 2 for loops to fill up a matrix based on the values of 2 other matrices.
Here is some example data:
library(Rcpp)
sizem <- 50000
n <- 500
k <- 5
d <- matrix(nrow = n, ncol = k, 0)
x <- matrix(nrow = n, ncol = sizem, runif(sizem*n, 0, 500))
rownames(x) <- paste0("p-",1:n)
v <- matrix(nrow = k, ncol = sizem, sample(0:300, size = sizem*k, T))
and the function currently looks like this:
cppFunction("
NumericMatrix compdistc(NumericMatrix d, NumericMatrix x, NumericMatrix v, int k, int n) {
NumericMatrix out = clone(d);
int i,j;
for (j=0; j < k; ++j) {
for (i=0; i < n; ++i) {
out(i,j) = sum(pow((x(i,_) - v(j,_)), 2));
}
}
return(out);
}
")
out <- compdistc(d, x, v, k, n)
and this is my current benchmark:
mc <- microbenchmark(times = 10,
out = compdistc(d, x, v, k, n)
); mc
Unit: seconds
 expr      min       lq     mean   median       uq      max neval
  out 2.123477 2.135013 2.174556 2.148467 2.204108 2.354089    10
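A pure-R alternative worth sketching (my own, under the same inputs; a sketch rather than a tested drop-in): out[i, j] is the squared Euclidean distance between row i of x and row j of v, and ||a - b||^2 = ||a||^2 + ||b||^2 - 2*a.b, so the whole matrix reduces to one matrix product plus two rowSums calls.
compdistr <- function(x, v) {
  xx <- rowSums(x^2)  # length n: squared norms of the rows of x
  vv <- rowSums(v^2)  # length k: squared norms of the rows of v
  outer(xx, vv, "+") - 2 * tcrossprod(x, v)  # n x k result
}
all.equal(compdistr(x, v), compdistc(d, x, v, k, n),
          check.attributes = FALSE)  # should be TRUE up to rounding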

Vectorizing nested ifelse

I'm trying to speed up my function in R. It consists of three ifelse statements, one of which is nested. For the single one I applied vectorization, which reduced my computation time. Unfortunately I don't see how I can vectorize the nested one; every way I try returns an error. Furthermore, is there any other trick I can use to speed it up?
cont.run <- function(reps=10000, n=10000, d=0.005, l=10, s=0.1) {
  r <- rep(0, reps)
  theta <- rep(0, n)
  for (t in 1:reps) {
    epsilon <- rnorm(1, 0, d)
    Zt = sum(ifelse(epsilon > theta, 1,
                    ifelse(epsilon < -theta, -1, 0)))
    r[t] <- Zt / (l * n)
    theta <- ifelse(runif(n) < s, abs(r[t]), theta)
  }
  return(mean(r))
}
system.time(cont.run())
I got:
cont.run <- function(reps=10000, n=10000, d=0.005, l=10, s=0.1) {
  r <- rep(0, reps)
  theta <- rep(0, n)
  for (t in 1:reps) {
    epsilon <- rnorm(1, 0, d)
    Zt = rep(NA, length(theta))
    Zt = sum(Zt[epsilon > theta, 1])
    Zt = sum(Zt[epsilon < -theta, -1])
    r[t] <- Zt / (l * n)
    theta = rep(theta, length(s))
    theta[runif(n) < s] = abs(r[t])
  }
  return(mean(r))
}
system.time(cont.run())
Here's slightly improved code.
The main change is that we don't use the double ifelse, but instead take the difference of two logical sums (sum(epsilon > theta) - sum(epsilon < -theta)); the zeroes don't matter here. I added a couple of other improvements (e.g., replaced rep with numeric and moved some operations outside the for loop).
contRun <- function(reps = 1e4, n = 1e4, d = 5e-3, l = 10, s = 0.1) {
  # Replace rep with numeric
  r <- numeric(reps)
  theta <- numeric(n)
  # Define before loop
  ln <- l * n
  # Don't use t as it's a function in base R
  for (i in 1:reps) {
    epsilon <- rnorm(1, 0, d)
    # Sum two TRUE vectors
    r[i] <- (sum(epsilon > theta) - sum(epsilon < -theta)) / ln
    # Define before ifelse
    absr <- abs(r[i])
    theta <- ifelse(runif(n) < s, absr, theta)
  }
  return(mean(r))
}
library(microbenchmark)
microbenchmark(cont.run(), contRun())
Unit: seconds
       expr       min        lq      mean    median        uq       max neval
 cont.run() 13.652324 13.749841 13.769848 13.766342 13.791573 13.853786   100
  contRun()  6.533654  6.559969  6.581068  6.577265  6.596459  6.770318   100
PS. For this kind of computation you might want to set a seed (set.seed() before the for loop) to make sure you can reproduce your results.
Furthermore, is there any other trick I can use to speed it up?
In addition to PoGibas' answer, you can avoid calling ifelse and get a faster function as follows:
contRun <- function(reps = 1e4, n = 1e4, d = 5e-3, l = 10, s = 0.1) {
  # Replace rep with numeric
  r <- numeric(reps)
  theta <- numeric(n)
  # Define before loop
  ln <- l * n
  # Don't use t as it's a function in base R
  for (i in 1:reps) {
    epsilon <- rnorm(1, 0, d)
    # Sum two TRUE vectors
    r[i] <- (sum(epsilon > theta) - sum(epsilon < -theta)) / ln
    # Define before ifelse
    absr <- abs(r[i])
    theta <- ifelse(runif(n) < s, absr, theta)
  }
  mean(r)
}
contRun2 <- function(reps = 1e4, n = 1e4, d = 5e-3, l = 10, s = 0.1) {
  r <- numeric(reps)
  theta <- numeric(n)
  ln <- l * n
  for (i in 1:reps) {
    epsilon <- rnorm(1, 0, d)
    r[i] <- (sum(epsilon > theta) - sum(epsilon < -theta)) / ln
    absr <- abs(r[i])
    # avoid ifelse
    theta[runif(n) < s] <- absr
  }
  mean(r)
}
contRun3 <- function(reps = 1e4, n = 1e4, d = 5e-3, l = 10, s = 0.1) {
  r <- numeric(reps)
  theta <- numeric(n)
  ln <- l * n
  for (i in 1:reps) {
    epsilon <- rnorm(1, 0, d)
    r[i] <- (sum(epsilon > theta) - sum(epsilon < -theta)) / ln
    absr <- abs(r[i])
    # replace runif with sample
    theta[sample(c(TRUE, FALSE), prob = c(s, 1 - s), size = n, replace = TRUE)] <- absr
  }
  mean(r)
}
# gives the same
set.seed(1)
o1 <- contRun()
set.seed(1)
o2 <- contRun2()
set.seed(1)
o3 <- contRun3()
all.equal(o1, o2)
#R [1] TRUE
all.equal(o1, o3) # likely will not match
#R [1] "Mean relative difference: 0.1508537"
# but distribution is the same
set.seed(1)
c1 <- replicate(10000, contRun2(reps = 100, n = 100))
c2 <- replicate(10000, contRun3(reps = 100, n = 100))
par(mfcol = c(1, 2), mar = c(5, 4, 2, .5))
hist(c1, breaks = seq(-.015, .015, length.out = 26))
hist(c2, breaks = seq(-.015, .015, length.out = 26))
# the latter is faster
microbenchmark::microbenchmark(
contRun = {set.seed(1); contRun ()},
contRun2 = {set.seed(1); contRun2()},
contRun3 = {set.seed(1); contRun3()},
times = 5)
#R Unit: seconds
#R      expr      min       lq     mean   median       uq      max neval
#R  contRun  7.121264 7.371242 7.388159 7.384997 7.443940 7.619352     5
#R contRun2 3.811267 3.887971 3.892523 3.892158 3.921148 3.950070     5
#R contRun3 1.920594 1.920754 1.998829 1.999755 2.009035 2.144005     5
The only bottleneck now is runif in contRun2. Replacing it with sample yields quite an improvement.

Split a vector into chunks such that sum of each chunk is approximately constant

I have a large data frame with more than 100 000 records where the values are sorted
For example, consider the following dummy data set
df <- data.frame(values = c(1,1,2,2,3,4,5,6,6,7))
I want to create 3 groups of the above values (keeping the sequence) such that the sum of each group is more or less the same.
So for the above values, if I divide the sorted df into 3 groups as follows, their sums will be:
1. 1 + 1 + 2 + 2 + 3 + 4 = 13
2. 5 + 6 = 11
3. 6 + 7 = 13
How can I create this optimization in R? Any logic?
So, let's use pruning. I think the other solutions give a good solution, but not the best one.
First, we want to minimize
D(p, q) = (S_p - S*)^2 + (S_q - S_p - S*)^2 + (S_n - S_q - S*)^2,   with S* = S_n / 3,
where S_n is the cumulative sum of the first n elements (this is exactly what computeD evaluates below).
computeD <- function(p, q, S) {
  n <- length(S)
  S.star <- S[n] / 3
  if (all(p < q)) {
    (S[p] - S.star)^2 + (S[q] - S[p] - S.star)^2 + (S[n] - S[q] - S.star)^2
  } else {
    stop("You shouldn't be here!")
  }
}
I think the other solutions optimize over p and q independently, which won't give a global minimum (as expected for some particular cases).
optiCut <- function(v) {
  S <- cumsum(v)
  n <- length(v)
  S_star <- S[n] / 3
  # good starting values
  p_star <- which.min((S - S_star)^2)
  q_star <- which.min((S - 2*S_star)^2)
  print(min <- computeD(p_star, q_star, S))
  count <- 0
  for (q in 2:(n-1)) {
    S3 <- S[n] - S[q] - S_star
    if (S3*S3 < min) {
      count <- count + 1
      D <- computeD(seq_len(q - 1), q, S)
      ind <- which.min(D)
      if (D[ind] < min) {
        # Update optimal values
        p_star <- ind
        q_star <- q
        min <- D[ind]
      }
    }
  }
  c(p_star, q_star, computeD(p_star, q_star, S), count)
}
This is as fast as the other solutions because it prunes a lot of the iterations based on the condition S3*S3 < min. But it gives the optimal solution; see optiCut(c(1, 2, 3, 3, 5, 10)).
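To make the difference concrete (a worked example of mine): for v = c(1, 2, 3, 3, 5, 10) we have S* = 24/3 = 8. The greedy starting cut is (1,2,3,3 | 5 | 10) with sums 9, 5, 10 and D = 1 + 9 + 4 = 14, while the pruned search finds (1,2,3 | 3,5 | 10) with sums 6, 8, 10 and D = 4 + 0 + 4 = 8.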
For the solution with K >= 3, I basically reimplemented trees with nested tibbles; that was fun!
optiCut_K <- function(v, K) {
  S <- cumsum(v)
  n <- length(v)
  S_star <- S[n] / K
  # good starting values
  p_vec_first <- sapply(seq_len(K - 1), function(i) which.min((S - i*S_star)^2))
  min_first <- sum((diff(c(0, S[c(p_vec_first, n)])) - S_star)^2)
  compute_children <- function(level, ind, val) {
    # leaf
    if (level == 1) {
      val <- val + (S[ind] - S_star)^2
      if (val > min_first) {
        return(NULL)
      } else {
        return(val)
      }
    }
    P_all <- val + (S[ind] - S[seq_len(ind - 1)] - S_star)^2
    inds <- which(P_all < min_first)
    if (length(inds) == 0) return(NULL)
    node <- tibble::tibble(
      level = level - 1,
      ind = inds,
      val = P_all[inds]
    )
    node$children <- purrr::pmap(node, compute_children)
    node <- dplyr::filter(node, !purrr::map_lgl(children, is.null))
    `if`(nrow(node) == 0, NULL, node)
  }
  compute_children(K, n, 0)
}
This gives you all the solutions that are at least as good as the greedy one:
v <- sort(sample(1:1000, 1e5, replace = TRUE))
test <- optiCut_K(v, 9)
You need to unnest this:
full_unnest <- function(tbl) {
  tmp <- try(tidyr::unnest(tbl), silent = TRUE)
  `if`(identical(class(tmp), "try-error"), tbl, full_unnest(tmp))
}
print(test <- full_unnest(test))
And finally, to get the best solution:
test[which.min(test$children), ]
Here is one approach:
splitter <- function(values, N){
  inds = c(0, sapply(1:N, function(i) which.min(abs(cumsum(as.numeric(values)) - sum(as.numeric(values))/N*i))))
  dif = diff(inds)
  re = rep(1:length(dif), times = dif)
  return(split(values, re))
}
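Applied to the question's toy data (a quick check of mine), this reproduces the grouping from the question, since the cut points fall where the cumulative sum is closest to sum/3 and 2*sum/3:
splitter(df$values, 3)
# $`1`
# [1] 1 1 2 2 3 4
# $`2`
# [1] 5 6
# $`3`
# [1] 6 7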
How good is it?
# I calculate the mean and sd of the maximal difference of the sums in the
# splits of 100 runs
# split into 15 parts
set.seed(5)
z1 = as.data.frame(matrix(1:15, nrow = 1))
repeat {
  values = sort(sample(1:1000, 1000000, replace = T))
  z = splitter(values, 15)
  z = lapply(z, sum)
  z = unlist(z)
  z1 = rbind(z1, z)
  if (nrow(z1) > 101) {
    break
  }
}
z1 = z1[-1,]
mean(apply(z1, 1, function(x) max(x) - min(x)))
# [1] 1004.158
sd(apply(z1, 1, function(x) max(x) - min(x)))
# [1] 210.6653

# with fewer splits (4)
set.seed(5)
z1 = as.data.frame(matrix(1:4, nrow = 1))
repeat {
  values = sort(sample(1:1000, 1000000, replace = T))
  z = splitter(values, 4)
  z = lapply(z, sum)
  z = unlist(z)
  z1 = rbind(z1, z)
  if (nrow(z1) > 101) {
    break
  }
}
z1 = z1[-1,]
mean(apply(z1, 1, function(x) max(x) - min(x)))
# [1] 632.7723
sd(apply(z1, 1, function(x) max(x) - min(x)))
# [1] 260.9864
library(microbenchmark)
Benchmark on a 1M-element vector:
values = sort(sample(1:1000, 1000000, replace = T))
microbenchmark(
  sp_27 = splitter(values, 27),
  sp_3 = splitter(values, 3)
)
Unit: milliseconds
  expr      min       lq      mean    median        uq       max neval cld
 sp_27 897.7346 934.2360 1052.0972 1078.6713 1118.6203 1329.3044   100   b
  sp_3 108.3283 116.2223  209.4777  173.0522  291.8669  409.7050   100   a
By the way, F. Privé is correct: this function does not give the globally optimal split. It is greedy, which is not a good characteristic for such a problem. It will give splits with sums closer to sum/N in the initial part of the vector, but behaving that way compromises the splits in the later part of the vector.
Here is a test comparison of the three functions posted so far:
db = function(values, N){
  temp = floor(sum(values)/N)
  inds = c(0, which(c(0, diff(cumsum(values) %% temp)) < 0)[1:(N-1)], length(values))
  dif = diff(inds)
  re = rep(1:length(dif), times = dif)
  return(split(values, re))
} # had to change it a bit since the posted one would not work; the core
  # which calculates the splitting positions is the same
missuse <- function(values, N){
  inds = c(0, sapply(1:N, function(i) which.min(abs(cumsum(as.numeric(values)) - sum(as.numeric(values))/N*i))))
  dif = diff(inds)
  re = rep(1:length(dif), times = dif)
  return(split(values, re))
}
prive = function(v, N){ # added dummy N argument because of the tester function
  dummy = N
  computeD <- function(p, q, S) {
    n <- length(S)
    S.star <- S[n] / 3
    if (all(p < q)) {
      (S[p] - S.star)^2 + (S[q] - S[p] - S.star)^2 + (S[n] - S[q] - S.star)^2
    } else {
      stop("You shouldn't be here!")
    }
  }
  optiCut <- function(v, N) {
    S <- cumsum(v)
    n <- length(v)
    S_star <- S[n] / 3
    # good starting values
    p_star <- which.min((S - S_star)^2)
    q_star <- which.min((S - 2*S_star)^2)
    print(min <- computeD(p_star, q_star, S))
    count <- 0
    for (q in 2:(n-1)) {
      S3 <- S[n] - S[q] - S_star
      if (S3*S3 < min) {
        count <- count + 1
        D <- computeD(seq_len(q - 1), q, S)
        ind <- which.min(D)
        if (D[ind] < min) {
          # Update optimal values
          p_star <- ind
          q_star <- q
          min <- D[ind]
        }
      }
    }
    c(p_star, q_star, computeD(p_star, q_star, S), count)
  }
  z3 = optiCut(v)
  inds = c(0, z3[1:2], length(v))
  dif = diff(inds)
  re = rep(1:length(dif), times = dif)
  return(split(v, re))
} # added output to be more in line with the other two
Function for testing:
tester = function(split, seed){
  set.seed(seed)
  z1 = as.data.frame(matrix(1:3, nrow = 1))
  repeat {
    values = sort(sample(1:1000, 1000000, replace = T))
    z = split(values, 3)
    z = lapply(z, sum)
    z = unlist(z)
    z1 = rbind(z1, z)
    if (nrow(z1) > 101) {
      break
    }
  }
  m = mean(apply(z1, 1, function(x) max(x) - min(x)))
  s = sd(apply(z1, 1, function(x) max(x) - min(x)))
  return(c("mean" = m, "sd" = s))
} # tests 100 random 1M-length vectors with elements drawn from 1:1000
tester(db, 5)
#     mean       sd
# 779.5686 349.5717
tester(missuse, 5)
#     mean       sd
# 481.4804 216.9158
tester(prive, 5)
#     mean       sd
# 451.6765 174.6303
prive is the clear winner; however, it takes quite a bit longer than the other two, and it can only split into 3 groups.
microbenchmark(
  missuse(values, 3),
  prive(values, 3),
  db(values, 3)
)
Unit: milliseconds
               expr        min         lq      mean    median        uq       max neval cld
 missuse(values, 3)  100.85978  111.1552  185.8199  120.1707  304.0303  393.4031   100   a
   prive(values, 3) 1932.58682 1980.0515 2096.7516 2043.7133 2211.6294 2671.9357   100   b
      db(values, 3)   96.86879  104.5141  194.0085  117.6270  306.7143  500.6455   100   a
N = 3
temp = floor(sum(df$values)/N)
inds = c(0, which(c(0, diff(cumsum(df$values) %% temp)) < 0)[1:(N-1)], NROW(df))
split(df$values, rep(1:N, ifelse(N == 1, NROW(df), diff(inds))))
#$`1`
#[1] 1 1 2 2 3 4
#$`2`
#[1] 5 6
#$`3`
#[1] 6 7
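(For reference: temp is the target group sum; cumsum(df$values) %% temp drops each time the cumulative sum crosses a multiple of temp, so diff(...) < 0 marks the cut points.)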
