Two cumsum conditions with reset in R - r

I have a dataframe that has two types of value. I'd like to slice it in groups.
This groups are expected to provide two conditions. Each group should be;
Conditions 1: max cumulative value of w <= 75
Conditions 1: max cumulative value of n <= 15
If one of these criteria reach the max cumulative value, it should reset the cumulative sums
and start over again for both.
id<- sample(1:33)
w <- c(2,1,32,5,1,1,12,1,2,32,32,32,1,3,2,12,1,1,1,1,1,1,5,3,5,1,1,1,2,7,2,32,1)
n <- c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)
df <- data.frame(id, w, n)
the expected result (made manully)
w cumsum_w n cumsum_n group
2 2 1 1 1
1 3 1 2 1
32 35 1 3 1
5 40 1 4 1
1 41 1 5 1
1 42 1 6 1
12 54 1 7 1
1 55 1 8 1
2 57 1 9 1
32 32 1 2 2
32 64 1 3 2
32 32 1 1 3
1 33 1 2 3
3 36 1 3 3
2 38 1 4 3
12 50 1 5 3
1 51 1 6 3
1 52 1 7 3
1 53 1 8 3
1 54 1 9 3
1 55 1 10 3
1 56 1 11 3
5 61 1 12 3
3 64 1 13 3
5 69 1 14 3
1 70 1 15 3
1 1 1 1 4
1 2 1 2 4
2 4 1 3 4
7 11 1 4 4
2 13 1 5 4
32 45 1 6 4
1 46 1 7 4
I tried to solve some methods:
Method 1
library(BBmisc)
chunk(df, chunk.size = 75, n.chunks = 15)
Error in chunk(df, chunk.size = 75, n.chunks = 15) :
You must provide exactly one of 'chunk.size', 'n.chunks' or 'props'
Method 2
cumsum_with_reset_group <- function(w, n, threshold_w, threshold_n) {
cumsum_w <- 0
cumsum_n <- 0
group <- 1
result <- numeric()
for (i in 1:length(w)) {
cumsum_w <- cumsum_w + w[i]
cumsum_n <- cumsum_n + n[i]
if (cumsum_w > threshold_w | cumsum_n > threshold_n) {
group <- group + 1
cumsum_w <- cumsum_w + w[i]
cumsum_n <- cumsum_n + n[i]
}
result = c(result, group)
}
return (result)
}
# cumsum with reset
cumsum_w_with_reset <- function(w, threshold_w) {
cumsum_w <- 0
group <- 1
result <- numeric()
for (i in 1:length(w)) {
cumsum_w <- cumsum_w + w[i]
if (cumsum_w > threshold_w) {
group <- group + 1
cumsum_w <- w[i]
}
result = c(result, cumsum_w)
}
return (result)
}
# cumsum with reset
cumsum_n_with_reset <- function(n, threshold_n) {
cumsum_n <- 0
group <- 1
result <- numeric()
for (i in 1:length(n)) {
cumsum_n <- cumsum_n + n[i]
if (cumsum_n > threshold_n | cumsum_w > threshold_w) {
group <- group + 1
cumsum_n <- n[i]
}
result = c(result, cumsum_n)
}
return (result)
}
# use functions above as window functions inside mutate statement
y<-df %>% group_by() %>%
mutate(
cumsum_w = cumsum_w_with_reset(w, 75),
cumsum_n =cumsum_n_with_reset(n, 15),
group = cumsum_with_reset_group(w, n, 75, 15)
) %>%
ungroup()
Error in mutate_impl(.data, dots) :
Evaluation error: object 'cumsum_w' not found
Thanks!

Here is a hack, which is done by repeated subsetting and binding. As such, this will be very slow with large data sets. This takes the whole data set as an input.
library(dplyr)
cumsumdf <- function(df){
cumsum_75 <- function(x) {cumsum(x) %/% 76}
cumsum_15 <- function(x) {cumsum(x) %/% 16}
cumsum_w75 <- function(x) {cumsum(x) %% 76}
cumsum_n15 <- function(x) {cumsum(x) %% 16}
m <- nrow(df)
df$grp <- 0
df <- df %>%
group_by(grp) %>%
mutate(cumsum_w = numeric(m), cumsum_n = numeric(m))
n = 0
df2 <- df[0,]
while(nrow(df) >0 ){
df$cumsum_w = cumsum_75(df$w)
df$cumsum_n = cumsum_15(df$n)
n <- n + 1
df1 <- df[df$cumsum_n == 0 & df$cumsum_w == 0,]
df <- df[df$cumsum_n != 0 | df$cumsum_w != 0,]
df1$grp <- n
df1 <- df1 %>% group_by(grp) %>%
mutate(cumsum_w = cumsum_w75(w), cumsum_n = cumsum_n15(n))
df2 <- rbind(df2,df1)
}
return(df2)
}
cumsumdf(df)

Related

Creating a function to run a conditional Sum in R

I have a dataframe like this:
dat<- data.frame (
'Ones'=c(0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0),
'Thats'=c(0,5,3,6,8,4,5,6,8,3,1,3,4,5,6,7,4,3,4,5))
I have to create a function (gap1) that detects each 1 in Ones and than sums n-1, n and n+1 in Thats, with n being in the same row as 1.
For example in this dataset I have two 1.
dat<- data.frame (
'Ones'=c(0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0),
'Thats'=c(0,5,3,6,8,4,5,6,8,3,1,3,4,5,6,7,4,3,4,5))
dat
This should be the output:
Ones Thats gap1
1 4 17 #(8+4+5)
1 1 7 #(3+1+3)
I would like to extend this gap at will, for example:
Ones Thats gap1 gap2 gap3 ...
1 4 17 29 #(6+8+4+5+6)
1 1 7 9 #(8+3+1+3+4)
There is another problem I have to consider:
Suppose we have this data frame:
dat<- data.frame (
'Ones'=c(1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0),
'Thats'=c(0,5,3,6,8,4,5,6,8,3,1,NA,4,5,6,7,4,3,4,5))
In case there is a 1 at the beginning (or at the end), or if there is an NA, the function should use available data.
In this case, for example:
Ones Thats gap1 gap2
1 0 5 (0+5) 8 #(0+5+3)
1 4 17 (8+4+5) 29 #(6+8+4+5+6)
1 1 4 (3+1+NA) 16 #(8+3+1+NA+4)
Do you have any advice?
Using tidyverse / collapse
For arbitrary number of lead and lags the collapse package offers a nice function flag, which has further arguments to specify columns (cols), or grouping variables g.
library(dplyr)
f <- function(df, n){
df %>%
collapse::flag(-n:n) %>%
transmute(Ones, Thats, gap = rowSums(., na.rm = T) - 1) %>%
filter(Ones == 1)
}
x <- data.frame (
'Ones'=c(1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0),
'Thats'=c(0,5,3,6,8,4,5,6,8,3,1,NA,4,5,6,7,4,3,4,5))
# we can now specify how many lags to count:
f(x, 1)
Ones Thats gap
1 1 0 5
2 1 4 17
3 1 1 4
f(x, 2)
Ones Thats gap
1 1 0 8
2 1 4 29
3 1 1 16
Or if you want to specify the number of gaps to compute, we can simplify the function to
f <- function(df, n){
df %>%
collapse::flag(-n:n) %>%
rowSums(na.rm = T) - 1
}
x %>%
mutate(gap1 = f(., 1),
gap2 = f(., 2)) %>%
filter(Ones == 1)
Ones Thats gap1 gap2
1 1 0 5 8
2 1 4 17 29
3 1 1 4 16
Base R
If you like terse functions:
f <- Vectorize(\(df, n) rowSums(collapse::flag(df, -n:n), na.rm = T) - 1, "n")
x[paste0("gap", 1:2)] <- f(x, 1:2) ; subset(x, Ones == 1)
Ones Thats gap1 gap2
1 1 0 5 8
6 1 4 17 29
11 1 1 4 16
With BaseR,
myfun <- function(data,gap=1) {
points <- which(data["Ones"]==1)
sapply(points, function(x) {
bottom <- ifelse(x-gap<=0,1,x -gap)
top <- ifelse(x+ gap > nrow(data),nrow(data),x +gap)
sum(data[bottom:top,"Thats"], na.rm=T)
})
}
#> myfun(dat,1)
#[1] 5 17 4
#> myfun(dat,2)
#[1] 8 29 16
Another base R solution
f <- function(dat, width = 1)
{
dat$gaps <- sapply(seq(nrow(dat)), function(x) {
if(dat$Ones[x] == 0) return(0)
i <- x + seq(2 * width + 1) - (width + 1)
i <- i[i > 0]
i <- i[i < nrow(dat)]
sum(dat$Thats[i])
})
dat[dat$Ones == 1,]
}
f(dat, 1)
#> Ones Thats gaps
#> 6 1 4 17
#> 11 1 1 7
f(dat, 2)
#> Ones Thats gaps
#> 6 1 4 29
#> 11 1 1 19

R: How to fill out values in a DF which are dependent on previous rows

I have a dataframe, and I want to do some calculations depending on the previous rows (like dragging informations down in excel). My DF looks like this:
set.seed(1234)
df <- data.frame(DA = sample(1:3, 6, rep = TRUE) ,HB = sample(0:600, 6, rep = TRUE), D = sample(1:5, 6, rep = TRUE), AD = sample(1:14, 6, rep = TRUE), GM = sample(30:31, 6, rep = TRUE), GL = NA, R =NA, RM =0 )
df$GL[1] = 646
df$R[1] = 60
df$DA[5] = 2
df
# DA HB D AD GM GL R RM
# 1 2 399 4 13 30 646 60 0
# 2 2 97 4 10 31 NA NA 0
# 3 1 102 5 5 31 NA NA 0
# 4 3 325 4 2 31 NA NA 0
# 5 2 78 3 14 30 NA NA 0
# 6 1 269 4 8 30 NA NA 0
I want to fill out the missing values in my GL, R and RM columns, and the values are dependent on each other. So eg.
attach(df)
#calc GL and R for the 2nd row
df$GL[2] <- GL[1]+HB[2]+RM[1]
df$R[2] <- df$GL[2]*D[2]/GM[2]*AD[2]
#calc GL and R for the 3rd row
df$GL[3] <- df$GL[2]+HB[3]+df$RM[2]
df$R[3] <-df$GL[3]*D[3]/GM[3]*AD[3]
#and so on..
Is there a way to do all the calculations at once, instead of row by row?
In addition, each time the column 'DA' = 1, the previous values for 'R' should be summed up for the same row for 'RM', but only from the last occurence. So that
attach(df)
df$RM[3] <-R[1]+R[2]+R[3]
#and RM for the 6th row is calculated by
#df$RM[6] <-R[4]+R[5]+R[6]
Thanks a lot in advance!
You can use a for loop to calculate GL values and once you have them you can do the calculation for R columns directly.
for(i in 2:nrow(df)) {
df$GL[i] <- with(df, GL[i-1]+HB[i]+RM[i-1])
}
df$R <- with(df, (GL* D)/(GM *AD))
You can use indexing to solve the first two problems:
> # Original code from question~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> set.seed(1234)
> df <- data.frame(DA = sample(1:3, 6, rep = TRUE), HB = sample(0:600, 6, rep = TRUE),
+ D = sample(1:5, 6, rep = TRUE), AD = sample(1:14, 6, rep = TRUE),
+ GM = sample(30:31, 6, rep = TRUE), GL = NA, R =NA, RM =0 )
> df$GL[1] = 646
> df$R[1] = 60
> df$DA[5] = 2
> #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> # View df
> df
DA HB D AD GM GL R RM
1 2 399 4 13 30 646 60 0
2 2 97 4 10 31 NA NA 0
3 1 102 5 5 31 NA NA 0
4 3 325 4 2 31 NA NA 0
5 2 78 3 14 30 NA NA 0
6 1 269 4 8 30 NA NA 0
> # Solution below, based on indexing
> # 1. GL column
> df$GL <- cumsum(c(df$GL[1], df$HB[-1] + df$RM[-nrow(df)]))
> # 2. R column
> df$R[-1] <- (df$GL * df$D / df$GM * df$AD)[-1]
> # May be more clear like this (same result)
> df$R[-1] <- df$GL[-1] * df$D[-1] / df$GM[-1] * df$AD[-1]
> # Or did you mean this for last *?
> df$R[-1] <- (df$GL * df$D / (df$GM * df$AD))[-1]
The third problem can be solved with a loop.
> df$RM[1] <- df$R[1]
> for (i in 2:nrow(df)) {
+ df$RM[i] <- df$R[i] + df$RM[i-1] * (df$DA[i] != 2)
+ }
> df
DA HB D AD GM GL R RM
1 2 399 4 13 30 646 60.000000 60.000000
2 2 97 4 10 31 743 9.587097 9.587097
3 1 102 5 5 31 845 27.258065 36.845161
4 3 325 4 2 31 1170 75.483871 112.329032
5 2 78 3 14 30 1248 8.914286 8.914286
6 1 269 4 8 30 1517 25.283333 34.197619
Do these results look correct?
Update: Assuming RM should = R unless DA = 1, and in that case RM = sum of current row and previous R up to (not including) the above row with DA = 1, try the following loop.
df$RM[1] <- cs <- df$R[1]
for (i in 2:nrow(df)) {
df$RM[i] <- df$R[i] + cs * (df$DA[i] == 1)
cs <- cs * (df$DA[i] != 1) + df$R[i]
}

Create a new column per group based on condition in a data frame

although I searched long for solutions, e.g.
Assign value to group based on condition in column
I am not able to solve the following problem and would appreciate greatly any help!
I have the following data frame (in reality, many more with thousands of rows):
df <- data.frame(ID1 = c(1,1,1,2,2,2,2,3,3,4,4,4,5,5,5,6,6,6,7,7),
ID2 = c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20),
Percentage = c(0,10,NA,65,79,81,52,0,0,11,12,35,0,24,89,76,0,NA,59,16),
Group_expected_result = c(6,6,6,7,7,7,7,1,1,3,3,3,4,4,4,5,5,5,2,2))
What I want to do is to assign a group type from 1 to 7 to each group as indicated by ID1. Which group type should be assigned is dependent on the conditions of column 3, Percentage (can have values from 0-100) and is split into seven types:
Type 1 has a percentage of 0, i.e.
Type 1 = 0
Type 2 > 0 & < 10
Type 3 > 9 & < 20
Type 4 > 19 & < 30
Type 5 > 29 & < 40
Type 6 > 39 & < 50
Type 7 > 49
The combination of these types (above) defines the group type (G1-G7) below:
G1 = only T7
G2 = only T7 & T2-T6
G3 = only T2-T6
G4 = at least one T1, & one T2-T6, & one T7 (= all)
G5 = only T7 & T1
G6 = only T2-T6 & T1
G7 = only T1
The expected result is in the last column of the sample data frame, e.g.
the first group consists of types T1 and T2, therefore should be group type G6.
So, the question is how to get the expected result in the last column? I hope I made the problem clear! Thanks in advance!
Try this:
myType <- function(x) {
if (is.na(x) || x==0) {
return(1L)
} else if (x < 50) {
return(2L)
} else {
return(3L)
}
}
myGroup <- function(myDf) {
myIds <- unique(myDf$ID1)
myGs <- list(G1=1L, G2=2:3, G3=2L, G4=1:3, G5=c(1L,3L), G6=1:2, G7=3L)
assignG <- vector(mode = "integer", length=nrow(myDf))
vT <- vapply(myDf[,"Percentage"], function(x) myType(x), 1L)
for (i in myIds) {
myV <- which(myDf[,1L]==i)
testV <- sort(unique(vT[myV]))
assignG[myV] <- which(vapply(myGs, function(x) identical(x,testV), TRUE, USE.NAMES = FALSE))
}
myDf$myResult <- assignG
myDf
}
Calling it, we obtain:
myGroup(df,7)
ID1 ID2 Percentage Group_expected_result myResult
1 1 1 0 6 6
2 1 2 10 6 6
3 1 3 NA 6 6
4 2 4 65 7 7
5 2 5 79 7 7
6 2 6 81 7 7
7 2 7 52 7 7
8 3 8 0 1 1
9 3 9 0 1 1
10 4 10 11 3 3
11 4 11 12 3 3
12 4 12 35 3 3
13 5 13 0 4 4
14 5 14 24 4 4
15 5 15 89 4 4
16 6 16 76 5 5
17 6 17 0 5 5
18 6 18 NA 5 5
19 7 19 59 2 2
20 7 20 16 2 2
Here is a less intuitive, but more efficient solution.
myGroup2 <- function(myDf) {
myIds <- unique(myDf$ID1)
AltGs <- c(G1=2L, G2=7L, G3=3L, G4=9L, G5=6L, G6=5L, G7=4L)
assignG <- vector(mode = "integer", length=nrow(myDf))
vT <- vapply(myDf[,"Percentage"], function(x) myType(x), 1L)
for (i in myIds) {
myV <- which(myDf[,1L]==i)
testV <- unique(vT[myV])
assignG[myV] <- which(AltGs==(length(testV)+sum(testV)))
}
myDf$myResult <- assignG
myDf
}
It is about twice as fast.
microbenchmark(t1=myGroup(df,7), t2=myGroup2(df,7))
Unit: microseconds
expr min lq mean median uq max neval
t1 692.117 728.4470 779.6459 748.562 819.170 1018.060 100
t2 320.608 340.3115 390.7098 351.395 414.203 1781.195 100
You can obtain AltGs above by running the following:
myGs <- list(G1=1L, G2=2:3, G3=2L, G4=1:3, G5=c(1L,3L), G6=1:2, G7=3L)
AltGs <- vapply(myGs, function(x) length(x)+sum(x), 2L, USE.NAMES = FALSE)

data.table: “group counter” for a specific combination of columns

I would like to add a counter column in a data frame based on a set of identical rows. To do this, I used the package data.table. In my case, the comparison between rows need doing from the combination of columns "z" AND ("x" OR "y").
I tested:
DF[ , Index := .GRP, by = c("x","y","z") ]
but the result is the combination of "z" AND "x" AND "y".
How can I have the combination of "z" AND ("x" OR "y") ?
Here is a data example:
DF = data.frame(x=c("a","a","a","b","c","d","e","f","f"), y=c(1,3,2,8,8,4,4,6,0), z=c("M","M","M","F","F","M","M","F","F"))
DF <- data.table(DF)
I would like to have this output:
> DF
x y z Index
1: a 1 M 1
2: a 3 M 1
3: a 2 M 1
4: b 8 F 2
5: c 8 F 2
6: d 4 M 3
7: e 4 M 3
8: f 6 F 4
9: f 0 F 4
The new group starts if the value for z is changing or the values both for x and y are changing.
Try this example.
require(data.table)
DF <- data.table(x = c("a","a","a","b","c","d","e","f","f"),
y = c(1,3,2,8,8,4,4,6,0),
z=c("M","M","M","F","F","M","M","F","F"))
# The functions to compare if value is not equal with the previous value
is.not.eq.with.lag <- function(x) c(T, tail(x, -1) != head(x, -1))
DF[, x1 := is.not.eq.with.lag(x)]
DF[, y1 := is.not.eq.with.lag(y)]
DF[, z1 := is.not.eq.with.lag(z)]
DF
DF[, Index := cumsum(z1 | (x1 & y1))]
DF
I know a lot of people warn against a for loop in R, but in this instance I think it is a very direct way of approaching the problem. Plus, the result isn't growing in size so performance issues aren't a large issue. The for loop approach would be:
dt$grp <- rep(NA,nrow(dt))
for (i in 1:nrow(dt)){
if (i == 1){
dt$grp[i] = 1
}
else {
if(dt$z[i-1] == dt$z[i] & (dt$x[i-1] == dt$x[i] | dt$y[i-1] == dt$y[i])){
dt$grp[i] = dt$grp[i-1]
}else{
dt$grp[i] = dt$grp[i-1] + 1
}
}
}
Trying this on OPs original problem, the result is:
DF = data.frame(x=c("a","a","a","b","c","d","e","f","f"), y=c(1,3,2,8,8,4,4,6,0), z=c("M","M","M","F","F","M","M","F","F"))
dt <- data.table(DF)
dt$grp <- rep(NA,nrow(dt))
for (i in 1:nrow(dt)){
if (i == 1){
dt$grp[i] = 1
}
else {
if(dt$z[i-1] == dt$z[i] & (dt$x[i-1] == dt$x[i] | dt$y[i-1] == dt$y[i])){
dt$grp[i] = dt$grp[i-1]
}else{
dt$grp[i] = dt$grp[i-1] + 1
}
}
}
dt
x y z grp
1: a 1 M 1
2: a 3 M 1
3: a 2 M 1
4: b 8 F 2
5: c 8 F 2
6: d 4 M 3
7: e 4 M 3
8: f 6 F 4
9: f 0 F 4
Trying this on the data.table in #Frank's comment, gives the expected result as well:
dt<-data.table(x = c("b", "a", "a"), y = c(1, 1, 2), z = c("F", "F", "F"))
dt$grp <- rep(NA,nrow(dt))
for (i in 1:nrow(dt)){
if (i == 1){
dt$grp[i] = 1
}
else {
if(dt$z[i-1] == dt$z[i] & (dt$x[i-1] == dt$x[i] | dt$y[i-1] == dt$y[i])){
dt$grp[i] = dt$grp[i-1]
}else{
dt$grp[i] = dt$grp[i-1] + 1
}
}
}
dt
x y z grp
1: b 1 F 1
2: a 1 F 1
3: a 2 F 1
EDITED TO ADD: This solution is in some ways a more verbose version of the one advocated by djhurio above. I think it shows what is happening a bit more so I'll leave it.
I think this is a task easier to do if it is broken down a little bit. The below code creates TWO indices at first, one for changes in x (nested in z) and one for changes in y (nested in z). We then find the first row from each of these indices. Taking the cumulative sum of the case where both FIRST.x and FIRST.y is true should give your desired index.
library(data.table)
dt_example <- data.table(x = c("a","a","a","b","c","d","e","f","f"),
y = c(1,3,2,8,8,4,4,6,0),
z = c("M","M","M","F","F","M","M","F","F"))
dt_example[,Index_x := .GRP,by = c("z","x")]
dt_example[,Index_y := .GRP,by = c("z","y")]
dt_example[,FIRST.x := !duplicated(Index_x)]
dt_example[,FIRST.y := !duplicated(Index_y)]
dt_example[,Index := cumsum(FIRST.x & FIRST.y)]
dt_example
x y z Index_x Index_y FIRST.x FIRST.y Index
1: a 1 M 1 1 TRUE TRUE 1
2: a 3 M 1 2 FALSE TRUE 1
3: a 2 M 1 3 FALSE TRUE 1
4: b 8 F 2 4 TRUE TRUE 2
5: c 8 F 3 4 TRUE FALSE 2
6: d 4 M 4 5 TRUE TRUE 3
7: e 4 M 5 5 TRUE FALSE 3
8: f 6 F 6 6 TRUE TRUE 4
9: f 0 F 6 7 FALSE TRUE 4
This approach looks for changes in x & z | y & z. The extra columns are left in the data.table to show the calculations.
DF[, c("Ix", "Iy", "Iz", "dx", "dy", "min.change", "Index") :=
#Create index of values based on consecutive order
list(ix <- rleid(x), iy <- rleid(y), iz <- rleid(z),
#Determine if combinations of x+z OR y+z change
ix1 <- c(0, diff(rleid(ix+iz))),
iy1 <- c(0, diff(rleid(iy+iz))),
#Either combination is constant (no change)?
change <- pmin(ix1, iy1),
#New index based on change
cumsum(change) + 1
)]
x y z Ix Iy Iz dx dy min.change Index
1: a 1 M 1 1 1 0 0 0 1
2: a 3 M 1 2 1 0 1 0 1
3: a 2 M 1 3 1 0 1 0 1
4: b 8 F 2 4 2 1 1 1 2
5: c 8 F 3 4 2 1 0 0 2
6: d 4 M 4 5 3 1 1 1 3
7: e 4 M 5 5 3 1 0 0 3
8: f 6 F 6 6 4 1 1 1 4
9: f 0 F 6 7 4 0 1 0 4

Bin formation in a R data.frame

I have a data.frame with two columns:
category quantity
a 20
b 30
c 100
d 10
e 1
f 23
g 3
h 200
I need to write a function with two parameters: dataframe, bin_size which runs a cumsum over the quantity column, does a split of the subsequent row if the the cumsum exceeds the bin_size and adds a running bin number as an additional column.
Say, by entering this:
function(dataframe, 50)
in the above example should give me:
category quantity cumsum bin_nbr
a 20 20 1
b 30 50 1
c 50 50 2
c 50 50 3
d 10 10 4
e 1 11 4
f 23 34 4
g 3 37 4
h 13 50 4
h 50 50 5
h 50 50 6
h 50 50 7
h 37 37 8
Explanation:
row a + b sum up to 50 --> bin_nbr 1
row c is 100 -> split into 2 rows # 50 -> bin nbr 2, bin_nbr 3
row d,e,f,g sum up to 37 -> bin_nbr 4
I need another 13 from row h to fill in bin_nbr 4 to 50
The rest of the remaining quantity from h will be spitted into 4 bins -> bin_nbr 5, 6, 7, 8
I couldn't think of a clean way to do this with apply/data.table etc since you have an inter-row dependency and a changing size data frame. You can probably do it in an iterative/recursive manner, but I felt it would be quicker to figure out to just write the loop. One challenge is that it is difficult to know the final size of your object, so this is likely to be slow. You can mitigate the problem somewhat by switching from a df to a matrix (code should work fine, except transform bits) if performance is an issue in this application.
fun <- function(df, binsize){
df$cumsum <- cumsum(df$quantity)
df$bin <- 1
i <- 1
repeat {
if((extra <- (df[i, "cumsum"] - binsize)) > 0) { # Bin finished halfway through
top <- if(i > 1L) df[1L:(i - 1L), ] else df[0L, ]
mid <- transform(df[i, ], quantity=quantity-extra, cumsum=cumsum-extra)
bot <- transform(df[i, ], quantity=extra, cumsum=extra, bin=bin + 1L)
end <- if(i >= nrow(df)) df[0L, ] else df[(i + 1L):nrow(df), ]
end <- transform(end, cumsum=cumsum(end$quantity) + extra, bin=bin + 1L)
df <- rbind(top, mid, bot, end)
} else if (extra == 0 && nrow(df) > i) { # Bin finished cleanly
df[(i + 1L):nrow(df), ]$cumsum <- df[(i + 1L):nrow(df), ]$cumsum - binsize
df[(i + 1L):nrow(df), ]$bin <- df[(i + 1L):nrow(df), ]$bin + 1L
}
if(nrow(df) < (i <- i + 1)) break
}
rownames(df) <- seq(len=nrow(df))
df
}
fun(df, binsize)
# category quantity cumsum bin
# 1 a 20 20 1
# 2 b 30 50 1
# 3 c 50 50 2
# 4 c 50 50 3
# 5 d 10 10 4
# 6 e 1 11 4
# 7 f 23 34 4
# 8 g 3 37 4
# 9 h 13 50 4
# 10 h 50 50 5
# 11 h 50 50 6
# 12 h 50 50 7
# 13 h 37 37 8
Another solution with a loop:
DF <- read.table(text="category quantity
a 20
b 30
c 100
d 10
e 1
f 23
g 3
h 200", header=TRUE)
bin_size <- 50
n_bin <- ceiling(sum(DF$quantity)/bin_size)
DF$bin <- findInterval(cumsum(DF$quantity)-1, c(0, seq_len(n_bin)*50))
DF$cumsum <- cumsum(DF$quantity)
result <- lapply(seq_along(DF[,1]), function(i, df) {
if (i==1) {
d <- df[i, "bin"]
} else {
d <- df[i, "bin"]-df[i-1, "bin"]
}
if (d > 1) {
res <- data.frame(
category = df[i, "category"],
bin_nbr = df[i, "bin"]-seq_len(d+1)+1
)
res[,"quantity"] <- bin_size
if (i!=1) {
res[nrow(res),"quantity"] <- df[i-1, "bin"]*bin_size-df[i-1, "cumsum"]
} else {
res[nrow(res),"quantity"] <- 0
}
res[1,"quantity"] <- df[i, "quantity"]-sum(res[-1,"quantity"])
return(res[res$quantity > 0,])
} else {
return(data.frame(
category = df[i, "category"],
quantity = df[i, "quantity"],
bin_nbr = df[i, "bin"]
))
}
}, df=DF)
res <- do.call(rbind, result)
res <- res[order(res$category, res$bin_nbr),]
library(plyr)
res <- ddply(res, .(bin_nbr), transform, cumsum=cumsum(quantity))
res
# category quantity bin_nbr cumsum
# 1 a 20 1 20
# 2 b 30 1 50
# 3 c 50 2 50
# 4 c 50 3 50
# 5 d 10 4 10
# 6 e 1 4 11
# 7 f 23 4 34
# 8 g 3 4 37
# 9 h 13 4 50
# 10 h 50 5 50
# 11 h 50 6 50
# 12 h 50 7 50
# 13 h 37 8 37
This amounts to merging the bin boundaries with the data which gives this loop-free solution:
library(zoo)
fun <- function(DF, binsize = 50) {
nr <- nrow(DF)
DF2 <- data.frame(cumsum = seq(0, sum(DF$quantity), binsize) + binsize, bin_nbr = 1:nr)
DF.cs <- transform(DF, cumsum = cumsum(DF$quantity))
m <- na.locf(merge(DF.cs, DF2, all = TRUE), fromLast = TRUE)
m$bin_nbr <- as.numeric(m$bin_nbr)
cs <- as.numeric(m$cumsum)
m$quantity <- c(cs[1], diff(cs))
m$cumsum <- ave(m$quantity, m$bin_nbr, FUN = cumsum)
na.omit(m)[c("category", "quantity", "cumsum", "bin_nbr")]
}
giving:
> fun(DF)
category quantity cumsum bin_nbr
1 a 20 20 1
2 b 30 50 1
3 c 50 50 2
4 c 50 50 3
5 d 10 10 4
6 e 1 11 4
7 f 23 34 4
8 g 3 37 4
9 h 13 50 4
10 h 50 50 5
11 h 50 50 6
12 h 50 50 7
13 h 37 37 8
Note: For purposes of reproducing the result above this is the input we used:
Lines <- "category quantity
a 20
b 30
c 100
d 10
e 1
f 23
g 3
h 200
"
DF <- read.table(text = Lines, header = TRUE, as.is = TRUE)
REVISION An error in the code was corrected.

Resources