Group numeric vector by predefined maximal group sum - r

I have a numeric vector like this x <- c(1, 23, 7, 10, 9, 2, 4) and I want to group the elements from left to right with the constrain that each group sum must not exceed 25. Thus, here the first group is c(1, 23), the second is c(7, 10) and the last c(9, 2, 4). the expected output is a dataframe with a second column containing the groups:
data.frame(x= c(1, 23, 7, 10, 9, 2, 4), group= c(1, 1, 2, 2, 3, 3, 3))
I have tried different things with cumsum but am not able to kind of dynamically restart cumsum for the new group once the limit sum of 25 for the last group is reached.

I think cpp function is the fastest way:
library(Rcpp)
cppFunction(
"IntegerVector GroupBySum(const NumericVector& x, const double& max_sum = 25)
{
double sum = 0;
int cnt = 0;
int period = 1;
IntegerVector res(x.size());
for (int i = 0; i < x.size(); ++i)
{
++cnt;
sum += x[i];
if (sum > max_sum)
{
sum = x[i];
if (cnt > 1)
++period;
cnt = 1;
}
res[i] = period;
}
return res;
}"
)
GroupBySum(c(1, 23, 7, 10, 9, 2, 4), 25)

We can try this as a programming practice if you like :)
f1 <- function(x) {
group <- c()
while (length(x)) {
idx <- cumsum(x) <= 25
x <- x[!idx]
group <- c(group, rep(max(group, 0) + 1, sum(idx)))
}
group
}
or
f2 <- function(x) {
group <- c()
g <- 0
while (length(x)) {
cnt <- s <- 0
for (i in seq_along(x)) {
s <- s + x[i]
if (s <= 25) {
cnt <- cnt + 1
} else {
break
}
}
g <- g + 1
group <- c(group, rep(g, cnt))
x <- x[-(1:cnt)]
}
group
}
or
f3 <- function(x) {
s <- cumsum(x)
r <- c()
grp <- 1
while (length(s)) {
idx <- (s <= 25)
r <- c(r, rep(grp, sum(idx)))
grp <- grp + 1
s <- s[!idx] - tail(s[idx], 1)
}
r
}
which gives
[1] 1 1 2 2 3 3 3
and benchmarking among them looks like
set.seed(1)
set.seed(1)
x <- runif(1e3, 0, 25)
bm <- microbenchmark(
f1(x),
f2(x),
f3(x),
check = "equivalent"
)
autoplot(bm)
Recursion version
Another option is using recursion (based on f1())
f <- function(x, res = c()) {
if (!length(x)) {
return(res)
}
idx <- cumsum(x) <= 25
Recall(x[!idx], res = c(res, list(x[idx])))
}
and you will see
> f(x)
[[1]]
[1] 1 23
[[2]]
[1] 7 10
[[3]]
[1] 9 2 4

You can use the cumsumbinning built-in function from the MESS package:
# install.packages("MESS")
MESS::cumsumbinning(x, 25, cutwhenpassed = F)
# [1] 1 1 2 2 3 3 3
Or it can be done with purrr::accumulate:
cumsum(x == accumulate(x, ~ifelse(.x + .y <= 25, .x + .y, .y)))
# [1] 1 1 2 2 3 3 3
output
group <- MESS::cumsumbinning(x, 25, cutwhenpassed = F)
data.frame(x= c(1, 23, 7, 10, 9, 2, 4),
group = group)
x group
1 1 1
2 23 1
3 7 2
4 10 2
5 9 3
6 2 3
7 4 3
Quick benchmark:
x<- c(1, 23, 7, 10, 9, 2, 4)
bm <- microbenchmark(
fThomas(x),
fThomasRec(x),
fJKupzig(x),
fCumsumbinning(x),
fAccumulate(x),
fReduce(x),
fRcpp(x),
times = 100L,
setup = gc(FALSE)
)
autoplot(bm)
Егор Шишунов's Rcpp is the fastest, closely followed by MESS::cumsumbinning and ThomasIsCoding's both functions.
With n = 100, the gap gets bigger but Rcpp and cumsumbinning are still the top choices and the while loop option is no longer efficient (I had to remove ThomasIsCoding's functions because the execution time was too long):
x = runif(100, 1, 50)

In base R you could also use Reduce:
do.call(rbind, Reduce(\(x,y) if((z<-x[1] + y) > 25) c(y, x[2]+1)
else c(z, x[2]), x[-1], init = c(x[1], 1), accumulate = TRUE))
[,1] [,2]
[1,] 1 1
[2,] 24 1
[3,] 7 2
[4,] 17 2
[5,] 9 3
[6,] 11 3
[7,] 15 3
Breaking it down:
f <- function(x, y){
z <- x[1] + y
if(z > 25) c(y, x[2] + 1)
else c(z, x[2])
}
do.call(rbind, Reduce(f, x[-1], init = c(x[1], 1), accumulate = TRUE))
if using accumulate
library(tidyverse)
accumulate(x[-1], f, .init = c(x[1], 1)) %>%
invoke(rbind, .)
[,1] [,2]
[1,] 1 1
[2,] 24 1
[3,] 7 2
[4,] 17 2
[5,] 9 3
[6,] 11 3
[7,] 15 3

Here is a solution using base R and cumsum (and lapply for iteration):
id <- c(seq(1, length(x),1)[!duplicated(cumsum(x) %/% 25)], length(x)+1)
id2 <- 1:length(id)
group <- unlist(lapply(1:(length(id)-1), function(x) rep(id2[x], diff(id)[x])))
data.frame(x=x, group=group)
x group
1 1 1
2 23 1
3 7 2
4 10 2
5 9 3
6 2 3
7 4 3
Edit: New Approach using recursive function
Here is a new more efficient approach that should also cover the special case which #ЕгорШишунов considered and should work efficiently because it's written as a recursive function.
recursiveFunction<- function(x, maxN=25, sumX=0, period=1, period2return=c()){
sumX <- sumX + x[1]
if (sumX >= maxN) { sumX=x[1]; period = period + 1}
period2return <- c(period2return, period)
if (length(x) == 1) { return(period2return)}
return(recursiveFunction(x[-1], 25, sumX, period, period2return))
}
recursiveFunction(x, maxN=25)
Note that you should not change the entries for the last three function parameters (sumX=0, period=1, period2return=c()) because they are only important during the recursive call of the function.

Related

create sequence of numbers which are cyclic

I have a vector of months
m_vec <- c(3, 7, 11)
These months represent the start month of a season. All the months in each season are shown below:
season1 <- c(3,4,5,6)
season2 <- c(7,8,9,10)
season3 <- c(11,12,1,2)
I want to create a small function that takes a vector of start months and
generate the vector of months in each season. Some more examples are show below:
m_vec <- c(9,12,4,8)
season1 <- c(9,10,11)
season2 <- c(12,1,2,3)
season3 <- c(4,5,6,7,8)
m_vec <- c(12, 5, 9)
season1 <- c(12, 1, 2,3,4)
season2 <- c(5,6,7,8)
season3 <- c(9,10,11)
My for loop is not complete and I can't seem to even know where to get started with the logic
n_season <- length(m_vec)
temp_list <- list()
for(m in seq_along(m_vec)){
month_start <- m_vec[m]
month_start_next <- m_vec[m + 1]
month_start:month_start_next
}
First we can create some helper functions
cycle <- function(n) { function(x) (x-1) %% n + 1 }
split_at <- function(b) { function(x) split(x, cumsum(x %in% b)) }
The cycle() helper will return a function that will keep values in the range from 1 to the n you pass in. It does that using the modulus % operator. The split_at helper will return a function that takes a vector and splits it up when the values you pass in are found. It does that by using cumsum() to count when each of the break points are found.
Then we can take your input, create a vector of 12 months from your first starting month, wrap in in a cycler to keep it from 1-12, and then we can use split it up using your season breakpoints. Here's what that would look like:
month_cycle <- cycle(12)
season_splitter <- split_at(m_vec)
m_vec <- c(12, 5, 9)
seq(m_vec[1], length.out=12) |>
month_cycle() |>
season_splitter()
# $`1`
# [1] 12 1 2 3 4
# $`2`
# [1] 5 6 7 8
# $`3`
# [1] 9 10 11
m_vec <- c(12, 5, 9)
Map(function(x, y) head((((x:(x + ((y - x) %% 12))) - 1) %% 12) + 1, -1),
m_vec,
c(m_vec[-1], m_vec[1]))
#[[1]]
#[1] 12 1 2 3 4
#[[2]]
#[1] 5 6 7 8
#[[3]]
#[1] 9 10 11
One option is to convert to Date class, get the sequence and extract the months
library(lubridate)
fn1 <- function(mvec) {
new <- pmax(mvec-1, 1)
out <- Map(function(i, j) {
date1 <- mdy(i, truncated = 2)
date2 <- mdy(j, truncated = 2)
if(date1 > date2) {
date2 <- date2 + years(1)}
month(seq(date1, date2, by = "month"))
}, mvec, c(new[-1], new[1]))
if(length(out[[length(out)]]) < 2) {
out[[length(out)-1]] <- c(out[[length(out)-1]], out[[length(out)]])
out[[length(out)]] <- NULL
}
names(out) <- paste0("season", seq_along(out))
return(out)
}
-testing
> fn1(m_vec)
$season1
[1] 3 4 5 6
$season2
[1] 7 8 9 10
$season3
[1] 11 12 1 2
> fn1(c(9, 12, 4, 8))
$season1
[1] 9 10 11
$season2
[1] 12 1 2 3
$season3
[1] 4 5 6 7 8
> fn1(c(1, 5, 11))
$season1
[1] 1 2 3 4
$season2
[1] 5 6 7 8 9 10
$season3
[1] 11 12 1

Producing equal results for column entries which are the same

I have the piece of code below. What i want is alter the code
such that when the column entries for mat are the same, i get the same result in their respective positions in summation without it performing the fit operation?
So instead of getting
1 3 5 1 7 1
2 4 6 2 8 2
3 9 12 5 16 4
I want
1 3 5 1 7 1
2 4 6 2 8 2
3 9 12 3 16 3
set.seed(123)
fit = function(A){
x = A[1]
y = A[2]
z = sum(sample((x+y),2))
return(z)
}
mat= matrix(c(1,2,3,4,5,6,1,2,7,8,1,2),nrow=2,ncol=6)
summation=apply(mat, 2, FUN = 'fit')
newmat=rbind(mat,summation)
newmat
You can find out columns that are duplicates and replace the corresponding summation value with the first value of summation so that you get the same value.
fit = function(A){
x = A[1]
y = A[2]
z = sum(sample((x+y),2))
return(z)
}
mat= matrix(c(1,2,3,4,5,6,1,2,7,8,1,2),nrow=2,ncol=6)
summation=apply(mat, 2, FUN = 'fit')
vals <- apply(mat, 2, paste0, collapse = '-')
summation <- ave(summation, match(vals, unique(vals)), FUN = function(x) x[1])
newmat=rbind(mat,summation)
newmat
To pass only unique columns to fit function we can do :
fit = function(A){
x = A[1]
y = A[2]
z = sum(sample((x+y),2))
return(z)
}
mat= matrix(c(1,2,3,4,5,6,1,2,7,8,1,2),nrow=2,ncol=6)
vals <- apply(mat, 2, paste0, collapse = '-')
summation <- apply(mat[, !duplicated(vals)], 2, fit)
summation <- summation[match(vals, unique(vals))]
newmat=rbind(mat,summation)
newmat

Cumulative sum in R by group and start over when sum of values in group larger than maximum value

The function below groups values in a vector based on whether the cumulative sum has reached a certain max value and then starts over.
cs_group <- function(x, threshold) {
cumsum <- 0
group <- 1
result <- numeric()
for (i in 1:length(x)) {
cumsum <- cumsum + x[i]
if (cumsum > threshold) {
group <- group + 1
cumsum <- x[i]
}
result = c(result, group)
}
return (result)
}
Example
The max value in the example is 10. The first group only included 9; because summing it with the next value would result in a sum of 12. The next group includes 3, 2, 2 (+8 would result in a value higher then 10).
test <- c(9, 3, 2, 2, 8, 5, 4, 9, 1)
cs_group(test, 10)
[1] 1 2 2 2 3 4 4 5 5
However, I prefer to include in each group the value that results in the cumulative sum to be higher than the maximum value of 10.
Ideal result:
[1] 1 1 2 2 2 3 3 3 4
You can write your own custom function or use the code written by others.
I had the exact same problem few days back and this has been included in the MESS package.
devtools::install_github("ekstroem/MESS")
MESS::cumsumbinning(test, 10, cutwhenpassed = TRUE)
#[1] 1 1 2 2 2 3 3 3 4
One purrr approach could be:
cumsum(c(FALSE, diff(accumulate(test, ~ ifelse(.x >= 10, .y, .x + .y))) <= 0))
[1] 0 0 1 1 1 2 2 2 3
For your purpose, your cs_group can be written like below (if I understand the logic behind in a correct way):
cs_group <- function(x, threshold) {
group <- 1
r <- c()
repeat {
if (length(x)==0) break
cnt <- (idx <- max(which(cumsum(x) <= threshold)))+ifelse(idx==length(x),0,1)
r <- c(r,rep(group, cnt))
x <- x[-(1:cnt)]
group <- group + 1
}
r
}
such that
test <- c(9, 3, 2, 2, 8, 5, 4, 9, 1)
> cs_group(test, 10)
[1] 1 1 2 2 2 3 3 3 4

Remove duplicate and small vectors from list

I have a list of vectors, say:
li <- list( c(1, 2, 3),
c(1, 2, 3, 4),
c(2, 3, 4),
c(5, 6, 7, 8, 9, 10, 11, 12),
numeric(0),
c(5, 6, 7, 8, 9, 10, 11, 12, 13)
)
And I would like to remove all the vectors that are already contained in others (bigger or equal), as well as all the empty vectors
In this case, I would be left with only the list
1 2 3 4
5 6 7 8 9 10 11 12 13
Is there any useful function for achieving this?
Thanks in advance
First you should sort the list by vector length, such that in the excision loop it is guaranteed that each lower-index vector is shorter than each higher-index vector, so a one-way setdiff() is all you need.
l <- list(1:3, 1:4, 2:4, 5:12, double(), 5:13 );
ls <- l[order(sapply(l,length))];
i <- 1; while (i <= length(ls)-1) if (length(ls[[i]]) == 0 || any(sapply((i+1):length(ls),function(i2) length(setdiff(ls[[i]],ls[[i2]]))) == 0)) ls[[i]] <- NULL else i <- i+1;
ls;
## [[1]]
## [1] 1 2 3 4
##
## [[2]]
## [1] 5 6 7 8 9 10 11 12 13
Here's a slight alternative, replacing the any(sapply(...)) with a second while-loop. The advantage is that the while-loop can break prematurely if it finds any superset in the remainder of the list.
l <- list(1:3, 1:4, 2:4, 5:12, double(), 5:13 );
ls <- l[order(sapply(l,length))];
i <- 1; while (i <= length(ls)-1) if (length(ls[[i]]) == 0 || { j <- i+1; res <- F; while (j <= length(ls)) if (length(setdiff(ls[[i]],ls[[j]])) == 0) { res <- T; break; } else j <- j+1; res; }) ls[[i]] <- NULL else i <- i+1;
ls;
## [[1]]
## [1] 1 2 3 4
##
## [[2]]
## [1] 5 6 7 8 9 10 11 12 13
x is contained in y if
length(setdiff(x, y)) == 0
You can apply it to each pair of vectors using functions like expand.grid or combn.

Data frame subset with specified sum of elements

Having a data frame like this:
df <- data.frame(a=c(31, 18, 0, 1, 20, 2),
b=c(1, 0, 0, 3, 1, 1),
c=c(12, 0, 9, 8, 10, 3))
> df
a b c
1 31 1 12
2 18 0 0
3 0 0 9
4 1 3 8
5 20 1 10
6 2 1 3
How can I do a random subset so the sum of rows and columns is equal to a value, i.e , 100?
As I understand your question, you're trying to sample a subset of the rows and columns of your matrix so that they sum to a target value.
You can use integer optimization to accomplish this. You'll have a binary decision variable for each row, column, and cell, and constraints to force the cell values to be equal to the product of the row and column values. I'll use the lpSolve package to do this, because it has a convenient mechanism to get multiple optimal solutions. We can then use the sample function to select between them:
library(lpSolve)
get.subset <- function(dat, target) {
nr <- nrow(dat)
nc <- ncol(dat)
nvar <- nr + nc + nr*nc
# Cells upper bounded by row and column variable values (r and c) and lower bounded by r+c-1
mat <- as.matrix(do.call(rbind, apply(expand.grid(seq(nr), seq(nc)), 1, function(x) {
r <- x[1]
c <- x[2]
pos <- nr + nc + (r-1)*nc + c
ltc <- rep(0, nvar)
ltc[nr + c] <- 1
ltc[pos] <- -1
ltr <- rep(0, nvar)
ltr[r] <- 1
ltr[pos] <- -1
gtrc <- rep(0, nvar)
gtrc[nr + c] <- 1
gtrc[r] <- 1
gtrc[pos] <- -1
return(as.data.frame(rbind(ltc, ltr, gtrc)))
})))
dir <- rep(c(">=", ">=", "<="), nr*nc)
rhs <- rep(c(0, 0, 1), nr*nc)
# Sum of selected cells equals target
mat <- rbind(mat, c(rep(0, nr+nc), as.vector(t(dat))))
dir <- c(dir, "=")
rhs <- c(rhs, target)
res <- lp(objective.in=rep(0, nvar), # Feasibility problem
const.mat=mat,
const.dir=dir,
const.rhs=rhs,
all.bin=TRUE,
num.bin.solns=100 # Number of feasible solutions to get
)
if (res$status != 0) {
return(list(rows=NA, cols=NA, subset=NA, num.sol=0))
}
sol.num <- sample(res$num.bin.solns, 1)
vals <- res$solution[seq((sol.num-1)*nvar+1, sol.num*nvar)]
rows <- which(vals[seq(nr)] >= 0.999)
cols <- which(vals[seq(nr+1, nr+nc)] >= 0.999)
return(list(rows=rows, cols=cols, subset=dat[rows,cols], num.sol=res$num.bin.solns))
}
The function returns the number of subset with that sum and returns the randomly selected subset:
set.seed(144)
get.subset(df, 1)
# $rows
# [1] 1
# $cols
# [1] 2
# $subset
# [1] 1
# $num.sol
# [1] 14
get.subset(df, 100)
# $rows
# [1] 1 2 4 5
# $cols
# [1] 1 3
# $subset
# a c
# 1 31 12
# 2 18 0
# 4 1 8
# 5 20 10
# $num.sol
# [1] 2
get.subset(df, 10000)
# $rows
# [1] NA
# $cols
# [1] NA
# $subset
# [1] NA
# $num.sol
# [1] 0

Resources