I have a dataframe like this:
dat<- data.frame (
'Ones'=c(0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0),
'Thats'=c(0,5,3,6,8,4,5,6,8,3,1,3,4,5,6,7,4,3,4,5))
I have to create a function (gap1) that detects each 1 in Ones and than sums n-1, n and n+1 in Thats, with n being in the same row as 1.
For example in this dataset I have two 1.
dat<- data.frame (
'Ones'=c(0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0),
'Thats'=c(0,5,3,6,8,4,5,6,8,3,1,3,4,5,6,7,4,3,4,5))
dat
This should be the output:
Ones Thats gap1
1 4 17 #(8+4+5)
1 1 7 #(3+1+3)
I would like to extend this gap at will, for example:
Ones Thats gap1 gap2 gap3 ...
1 4 17 29 #(6+8+4+5+6)
1 1 7 9 #(8+3+1+3+4)
There is another problem I have to consider:
Suppose we have this data frame:
dat<- data.frame (
'Ones'=c(1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0),
'Thats'=c(0,5,3,6,8,4,5,6,8,3,1,NA,4,5,6,7,4,3,4,5))
In case there is a 1 at the beginning (or at the end), or if there is an NA, the function should use available data.
In this case, for example:
Ones Thats gap1 gap2
1 0 5 (0+5) 8 #(0+5+3)
1 4 17 (8+4+5) 29 #(6+8+4+5+6)
1 1 4 (3+1+NA) 16 #(8+3+1+NA+4)
Do you have any advice?
Using tidyverse / collapse
For arbitrary number of lead and lags the collapse package offers a nice function flag, which has further arguments to specify columns (cols), or grouping variables g.
library(dplyr)
f <- function(df, n){
df %>%
collapse::flag(-n:n) %>%
transmute(Ones, Thats, gap = rowSums(., na.rm = T) - 1) %>%
filter(Ones == 1)
}
x <- data.frame (
'Ones'=c(1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0),
'Thats'=c(0,5,3,6,8,4,5,6,8,3,1,NA,4,5,6,7,4,3,4,5))
# we can now specify how many lags to count:
f(x, 1)
Ones Thats gap
1 1 0 5
2 1 4 17
3 1 1 4
f(x, 2)
Ones Thats gap
1 1 0 8
2 1 4 29
3 1 1 16
Or if you want to specify the number of gaps to compute, we can simplify the function to
f <- function(df, n){
df %>%
collapse::flag(-n:n) %>%
rowSums(na.rm = T) - 1
}
x %>%
mutate(gap1 = f(., 1),
gap2 = f(., 2)) %>%
filter(Ones == 1)
Ones Thats gap1 gap2
1 1 0 5 8
2 1 4 17 29
3 1 1 4 16
Base R
If you like terse functions:
f <- Vectorize(\(df, n) rowSums(collapse::flag(df, -n:n), na.rm = T) - 1, "n")
x[paste0("gap", 1:2)] <- f(x, 1:2) ; subset(x, Ones == 1)
Ones Thats gap1 gap2
1 1 0 5 8
6 1 4 17 29
11 1 1 4 16
With BaseR,
myfun <- function(data,gap=1) {
points <- which(data["Ones"]==1)
sapply(points, function(x) {
bottom <- ifelse(x-gap<=0,1,x -gap)
top <- ifelse(x+ gap > nrow(data),nrow(data),x +gap)
sum(data[bottom:top,"Thats"], na.rm=T)
})
}
#> myfun(dat,1)
#[1] 5 17 4
#> myfun(dat,2)
#[1] 8 29 16
Another base R solution
f <- function(dat, width = 1)
{
dat$gaps <- sapply(seq(nrow(dat)), function(x) {
if(dat$Ones[x] == 0) return(0)
i <- x + seq(2 * width + 1) - (width + 1)
i <- i[i > 0]
i <- i[i < nrow(dat)]
sum(dat$Thats[i])
})
dat[dat$Ones == 1,]
}
f(dat, 1)
#> Ones Thats gaps
#> 6 1 4 17
#> 11 1 1 7
f(dat, 2)
#> Ones Thats gaps
#> 6 1 4 29
#> 11 1 1 19
Related
appreciate your guidance as im new to R programme. basically i've created a function to check whether the value is even or odd.
i wish to create a new result column whereby 'even' results in the original value * 2, and 'odd' results in the original value - 5.
not sure where i've gone wrong with the second part of the code but i am trying to figure out where can i include my 'check' column in the second function to specify it should be checked for even or odd.
i only learnt about ifelse(check(df$check) but it doesnt seem to work in my instance.
much appreciated!
## print 'odd' or 'even' results in df
check = function(df,col){
df['check'] =
ifelse(df[,col] %% 2 ==0, 'even', 'odd')
return(df)
}
# multiplication and subtraction for odd_even results
checkresult = function(df,col){
df['res'] =
ifelse(check(df) == 'even', df[,col] * 2, df[,col]-5)
return(df)
}
checkresult(df)
The simplest way to do it is by not implementing a new function, just use ifelse() as it was intended:
set.seed(100)
df <- data.frame(x = sample(1:15, size = 100, replace = T))
df$res <- ifelse(df$x %% 2 == 0, df$x * 2, df$x - 5)
df |> head()
#> x res
#> 1 10 20
#> 2 7 2
#> 3 6 12
#> 4 3 -2
#> 5 9 4
#> 6 10 20
If you need to implement a function that returns a dataframe:
set.seed(100)
df <- data.frame(x = sample(1:15, size = 100, replace = T))
my_func <- function(dframe, column){
vec <- dframe[,column]
dframe$res <- ifelse(vec %% 2 == 0, vec * 2, vec - 5)
return(dframe)
}
my_func(df, "x") |> head()
#> x res
#> 1 10 20
#> 2 7 2
#> 3 6 12
#> 4 3 -2
#> 5 9 4
#> 6 10 20
Edit 1
If you want the parity of the number, one extra line is required:
set.seed(100)
df <- data.frame(x = sample(1:15, size = 100, replace = T))
my_func <- function(dframe, column){
vec <- dframe[,column]
dframe$parity <- ifelse(vec %% 2 == 0, "EVEN", "ODD")
dframe$res <- ifelse(vec %% 2 == 0, vec * 2, vec - 5)
return(dframe)
}
my_func(df, "x") |> head()
#> x parity res
#> 1 10 EVEN 20
#> 2 7 ODD 2
#> 3 6 EVEN 12
#> 4 3 ODD -2
#> 5 9 ODD 4
#> 6 10 EVEN 20
i tried to create new column in a dataframe using different equations and inputs from other column in the dataframe. the equation i want to apply will also slightly differs conditioned on another column. here is the dummy dataframe
set.seed(123)
df <-
data.frame(
N = (c(0,0,rep(10,18))),
I0 = runif(20, 0,10),
Dt = c(1:20),
Isolator = rep(1:10,each=2)
)
I want to create new column name Pcol using this equation 1-exp(-(x)*((df$I0/df$N)*df$Dt))
and x variables changes based on Isolator. I managed to create column Pcol using ifelese() and mutate() based on isolators but the input is not taken from the same row. to illustrate
df1<-mutate(df, Pcol = ifelse(Isolator %in% 1:4, 1-exp(-(0.5])*((df$I0/df$N)*df$Dt)),
ifelse(Isolator %in% 5:7, 1-exp(-(0.7)*((df$I0/df$N)*df$Dt)),
ifelse(Isolator %in% 8:10, 1-exp(-(0.9)*((df$I0/df$N)*df$Dt)), NA))))
I also calculated Pcol seperately by subseting dataframe based on isolators
col1<- df %>% filter(Isolator <= 4)
col2<- df %>% filter(Isolator >= 5 & Isolator < 8)
col3<- df %>% filter(Isolator >=9 )
Pcol1<-1-exp(-(0.5)*((col1$I0/col1$N)*col1$Dt))
Pcol2<-1-exp(-(0.7)*(((col2$I0/col2$N)*col2$Dt))
Pcol3<-1-exp(-(0.9)*((col3$I0/col3$N)*col3$Dt))
and the Pcol in dataframe differs drom Pcol calculated from subset group. i think ifelse() apply in the dataframe taking in input wrongly when it calculate Pcol but i don't know how to fix it or maybe there is a simpler way to apply equations into dataframe
Please help! thank you
Apply group of functions are normally used for iterative calculations or for manipulating lists, whereas,
Will this do?
df %>% mutate(Pcol = case_when(Isolator %in% 1:4 ~ 0.5,
Isolator %in% 5:7 ~ 0.7,
TRUE ~ 0.9),
Pcol = 1-exp(-(Pcol)*((I0/N)*Dt)))
N I0 Dt Isolator Pcol
1 0 2.8757752 1 1 1.0000000
2 0 7.8830514 2 1 1.0000000
3 10 4.0897692 3 2 0.4585288
4 10 8.8301740 4 2 0.8289903
5 10 9.4046728 5 3 0.9047422
6 10 0.4555650 6 3 0.1277415
7 10 5.2810549 7 4 0.8425062
8 10 8.9241904 8 4 0.9718350
9 10 5.5143501 9 5 0.9690084
10 10 4.5661474 10 5 0.9590868
11 10 9.5683335 11 6 0.9993686
12 10 4.5333416 12 6 0.9778076
13 10 6.7757064 13 7 0.9979002
14 10 5.7263340 14 7 0.9963455
15 10 1.0292468 15 8 0.7507959
16 10 8.9982497 16 8 0.9999976
17 10 2.4608773 17 9 0.9768357
18 10 0.4205953 18 9 0.4940738
19 10 3.2792072 19 10 0.9963296
20 10 9.5450365 20 10 1.0000000
Your manual calculations (using subsets)
col1<- df %>% filter(Isolator <= 4)
col2<- df %>% filter(Isolator >= 5 & Isolator < 8)
col3<- df %>% filter(Isolator >=8 )
Pcol1 <- 1-exp(-(0.5)*((col1$I0/col1$N)*col1$Dt))
Pcol2 <- 1-exp(-(0.7)*((col2$I0/col2$N)*col2$Dt))
Pcol3 <- 1-exp(-(0.9)*((col3$I0/col3$N)*col3$Dt))
Pcol2
Pcol <- c(Pcol1, Pcol2, Pcol3)
df %>% mutate(Pcol = case_when(Isolator %in% 1:4 ~ 0.5,
Isolator %in% 5:7 ~ 0.7,
TRUE ~ 0.9),
Pcol = 1-exp(-(Pcol)*((I0/N)*Dt))) -> df1
Pcol - df1$Pcol
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
The input vector is as below,
data=c(1,1,1,1,11,1,1,1,1,12,1,1,2,1,1,1)
I want the output as 1,1,1,1,11,11,11,11,11,12,12,12,2,2,2,2 where the 1's proceeding the non 1's should be imputed the non 1 value in R.
I tried the following code
data=c(1,1,1,1,11,1,1,1,1,12,1,1,2,1,1,1)
sapply(data, function(x) ifelse (lag(x)!=1,lag(x),x))
but it didn't yield expected output
You can convert every 1 after the first non-1 value to NA then use zoo::na.locf():
library(zoo)
x <- c(1,1,1,1,11,1,1,1,1,12,1,1,2,1,1,1)
data[seq_along(x) > which.max(x!= 1) & x== 1] <- NA
na.locf(x)
[1] 1 1 1 1 11 11 11 11 11 12 12 12 2 2 2 2
Or using replace() to add the NA values:
na.locf(replace(x, seq_along(x) > which.max(x != 1) & x == 1, NA))
In response to your comment about applying it to groups, you can use ave():
df <- data.frame(x = c(x, rev(x)), grp = rep(1:2, each = length(x)))
ave(df$x, df$grp, FUN = function(y)
na.locf(replace(y, seq_along(y) > which.max(y != 1) & y == 1, NA))
)
You can write your custom fill function:
x <- c(1,1,1,1,11,1,1,1,1,12,1,1,2,1,1,1)
myfill <- function(x) {
mem <- x[1]
for (i in seq_along(x)) {
if (x[i] == 1) {
x[i] <- mem
} else {
mem <- x[i]
}
}
x
}
myfill(x)
# 1 1 1 1 11 11 11 11 11 12 12 12 2 2 2 2
You could match unique 1 and non-1 values with the cumsum of non-1 values.
(c(1, x[x != 1]))[match(cumsum(x != 1), 0:3)]
# [1] 1 1 1 1 11 11 11 11 11 12 12 12 2 2 2 2
Data
x <- c(1, 1, 1, 1, 11, 1, 1, 1, 1, 12, 1, 1, 2, 1, 1, 1)
You can use rle from base to overwrite 1 with the value before.
x <- rle(data)
y <- c(FALSE, (x$values == 1)[-1])
x$values[y] <- x$values[which(y)-1]
inverse.rle(x)
# [1] 1 1 1 1 11 11 11 11 11 12 12 12 2 2 2 2
In R, I try systematically to avoid "for" loops and use lapply() family instead.
But how to do so when an iteration contains an increment step ?
For example : is it possible to obtain the same result as below with a lapply approach ?
a <- c()
b <- c()
set.seed(1L) # required for reproducible data
for (i in 1:10){
a <- c(a, sample(c(0,1), 1))
b <- c(b, (paste(a, collapse = "-")))
}
data.frame(a, b)
> data.frame(a, b)
> a b
> 1 0 0
> 2 1 0-1
> 3 0 0-1-0
> 4 0 0-1-0-0
> 5 1 0-1-0-0-1
> 6 0 0-1-0-0-1-0
> 7 0 0-1-0-0-1-0-0
> 8 0 0-1-0-0-1-0-0-0
> 9 1 0-1-0-0-1-0-0-0-1
> 10 1 0-1-0-0-1-0-0-0-1-1
EDIT
My question was very badly redacted. The below new example is much more illustrative : is it anyway to use lapply family if each iteration is calculated from the previous one ?
a <- c()
b <- c()
for (i in 1:10){
a <- c(a, sample(c(0,1), 1))
b <- c(b, (paste(a, collapse = "-")))
}
data.frame(a, b)
> data.frame(a, b)
a b
1 0 0
2 1 0-1
3 0 0-1-0
4 1 0-1-0-1
5 1 0-1-0-1-1
6 1 0-1-0-1-1-1
7 1 0-1-0-1-1-1-1
8 0 0-1-0-1-1-1-1-0
9 1 0-1-0-1-1-1-1-0-1
10 1 0-1-0-1-1-1-1-0-1-1
For the sake of completeness, there is also the accumulate() function from the purrr package.
So, building on the answers of Sotos and ThomasIsCoding:
df <- data.frame(a = 1:10)
df$b <- purrr::accumulate(df$a, paste, sep = "-")
df
a b
1 1 1
2 2 1-2
3 3 1-2-3
4 4 1-2-3-4
5 5 1-2-3-4-5
6 6 1-2-3-4-5-6
7 7 1-2-3-4-5-6-7
8 8 1-2-3-4-5-6-7-8
9 9 1-2-3-4-5-6-7-8-9
10 10 1-2-3-4-5-6-7-8-9-10
The difference to Reduce() is
that accumulate() is a function verb on its own (no additional parameter accumulate = TRUE required)
and that additional arguments like sep = "-" can be passed on to the mapped function which may help to avoid the creation of an anonymous function.
EDIT
If I understand correctly OP's edit of the question, the OP is asking if a for loop which computes a result iteratively can be replaced by lapply().
This is difficult to answer for me. Here are some thoughts and observations:
First, accumulate() still will work:
set.seed(1L) # required for reproducible data
df <- data.frame(a = sample(0:1, 10L, TRUE))
df$b <- purrr::accumulate(df$a, paste, sep = "-")
df
a b
1 0 0
2 1 0-1
3 0 0-1-0
4 0 0-1-0-0
5 1 0-1-0-0-1
6 0 0-1-0-0-1-0
7 0 0-1-0-0-1-0-0
8 0 0-1-0-0-1-0-0-0
9 1 0-1-0-0-1-0-0-0-1
10 1 0-1-0-0-1-0-0-0-1-1
This is possible because the computation of a can be pulled out off the loop as it does not depend on b.
IMHO, accumulate() and Reduce() do what the OP is looking for but is not called lapply(): They take the result of the previous iteration and combine it with the actual value, for instance
Reduce(`+`, 1:3)
returns the sum of 1, 2, and 3 by iteratively computing (((0 + 1) + 2) + 3). This can be visualised by using the accumulate parameter
Reduce(`+`, 1:3, accumulate = TRUE)
[1] 1 3 6
Second, there is a major difference between a for loop and functions of the lapply() family: lapply(X, FUN, ...) requires a function FUN to be called on each element of X. So, scoping rules for functions apply.
When we transplant the body of the loop into an anonymous function within lapply()
a <- c()
b <- c()
set.seed(1L) # required for reproducible data
lapply(1:10, function(i) {
a <- c(a, sample(c(0,1), 1))
b <- c(b, (paste(a, collapse = "-")))
})
we get
[[1]]
[1] "0"
[[2]]
[1] "1"
[[3]]
[1] "0"
[[4]]
[1] "0"
[[5]]
[1] "1"
[[6]]
[1] "0"
[[7]]
[1] "0"
[[8]]
[1] "0"
[[9]]
[1] "1"
[[10]]
[1] "1"
data.frame(a, b)
data frame with 0 columns and 0 rows data.frame(a, b)
Due to the scoping rules, a and b inside the function are considered as local to the function. No reference is made to a and b defined outside of the function.
This can be fixed by global assignment using the global assignment operator <<-:
a <- c()
b <- c()
set.seed(1L) # required for reproducible data
lapply(1:10, function(i) {
a <<- c(a, sample(c(0,1), 1))
b <<- c(b, (paste(a, collapse = "-")))
})
data.frame(a, b)
a b
1 0 0
2 1 0-1
3 0 0-1-0
4 0 0-1-0-0
5 1 0-1-0-0-1
6 0 0-1-0-0-1-0
7 0 0-1-0-0-1-0-0
8 0 0-1-0-0-1-0-0-0
9 1 0-1-0-0-1-0-0-0-1
10 1 0-1-0-0-1-0-0-0-1-1
However, global assignment is considered bad programming practice and should be avoided, see, e.g., the 6th Circle of Patrick Burns' The R Inferno and many questions on SO.
Third, the way the loop is written grows vectors in the loop. This also is considered bad practice as it requires to copy the data over and over again which may slow down tremendously with increasing size. See, e.g., the 2nd Circle of Patrick Burns' The R Inferno.
However, the original code
a <- c()
b <- c()
set.seed(1L) # required for reproducible data
for (i in 1:10) {
a <- c(a, sample(c(0,1), 1))
b <- c(b, (paste(a, collapse = "-")))
}
data.frame(a, b)
can be re-written as
a <- integer(10)
b <- character(10)
set.seed(1L) # required for reproducible data
for (i in seq_along(a)) {
a[i] <- sample(c(0,1), 1)
b[i] <- if (i == 1L) a[1] else paste(b[i-1], a[i], sep = "-")
}
data.frame(a, b)
Here, vectors are pre-allocated with the required size to hold the result. Elements to update are identified by subscripting.
Calculation of b[i] still depends only the value of the previous iteration b[i-1] and the actual value a[i] as requested by the OP.
Another way is to use Reduce with accumulate = TRUE, i.e.
df$new <- do.call(rbind, Reduce(paste, split(df, seq(nrow(df))), accumulate = TRUE))
which gives,
a new
1 1 1
2 2 1 2
3 3 1 2 3
4 4 1 2 3 4
5 5 1 2 3 4 5
6 6 1 2 3 4 5 6
7 7 1 2 3 4 5 6 7
8 8 1 2 3 4 5 6 7 8
9 9 1 2 3 4 5 6 7 8 9
10 10 1 2 3 4 5 6 7 8 9 10
You can use sapply (lapply would work too but it returns a list) and iterate over every value of a in df and create a sequence and paste the value together.
df <- data.frame(a = 1:10)
df$b <- sapply(df$a, function(x) paste(seq(x), collapse = "-"))
df
# a b
#1 1 1
#2 2 1-2
#3 3 1-2-3
#4 4 1-2-3-4
#5 5 1-2-3-4-5
#6 6 1-2-3-4-5-6
#7 7 1-2-3-4-5-6-7
#8 8 1-2-3-4-5-6-7-8
#9 9 1-2-3-4-5-6-7-8-9
#10 10 1-2-3-4-5-6-7-8-9-10
If there could be non-numerical values in data on which we can not use seq like
df <- data.frame(a =letters[1:10])
In those case, we can use
df$b <- sapply(seq_along(df$a), function(x) paste(df$a[seq_len(x)], collapse = "-"))
df
# a b
#1 a a
#2 b a-b
#3 c a-b-c
#4 d a-b-c-d
#5 e a-b-c-d-e
#6 f a-b-c-d-e-f
#7 g a-b-c-d-e-f-g
#8 h a-b-c-d-e-f-g-h
#9 i a-b-c-d-e-f-g-h-i
#10 j a-b-c-d-e-f-g-h-i-j
Another way of using Reduce, different to the approach by #Sotos
df$b <- Reduce(function(...) paste(...,sep = "-"), df$a, accumulate = T)
such that
> df
a b
1 1 1
2 2 1-2
3 3 1-2-3
4 4 1-2-3-4
5 5 1-2-3-4-5
6 6 1-2-3-4-5-6
7 7 1-2-3-4-5-6-7
8 8 1-2-3-4-5-6-7-8
9 9 1-2-3-4-5-6-7-8-9
10 10 1-2-3-4-5-6-7-8-9-10
I have a dataframe that has two types of value. I'd like to slice it in groups.
This groups are expected to provide two conditions. Each group should be;
Conditions 1: max cumulative value of w <= 75
Conditions 1: max cumulative value of n <= 15
If one of these criteria reach the max cumulative value, it should reset the cumulative sums
and start over again for both.
id<- sample(1:33)
w <- c(2,1,32,5,1,1,12,1,2,32,32,32,1,3,2,12,1,1,1,1,1,1,5,3,5,1,1,1,2,7,2,32,1)
n <- c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)
df <- data.frame(id, w, n)
the expected result (made manully)
w cumsum_w n cumsum_n group
2 2 1 1 1
1 3 1 2 1
32 35 1 3 1
5 40 1 4 1
1 41 1 5 1
1 42 1 6 1
12 54 1 7 1
1 55 1 8 1
2 57 1 9 1
32 32 1 2 2
32 64 1 3 2
32 32 1 1 3
1 33 1 2 3
3 36 1 3 3
2 38 1 4 3
12 50 1 5 3
1 51 1 6 3
1 52 1 7 3
1 53 1 8 3
1 54 1 9 3
1 55 1 10 3
1 56 1 11 3
5 61 1 12 3
3 64 1 13 3
5 69 1 14 3
1 70 1 15 3
1 1 1 1 4
1 2 1 2 4
2 4 1 3 4
7 11 1 4 4
2 13 1 5 4
32 45 1 6 4
1 46 1 7 4
I tried to solve some methods:
Method 1
library(BBmisc)
chunk(df, chunk.size = 75, n.chunks = 15)
Error in chunk(df, chunk.size = 75, n.chunks = 15) :
You must provide exactly one of 'chunk.size', 'n.chunks' or 'props'
Method 2
cumsum_with_reset_group <- function(w, n, threshold_w, threshold_n) {
cumsum_w <- 0
cumsum_n <- 0
group <- 1
result <- numeric()
for (i in 1:length(w)) {
cumsum_w <- cumsum_w + w[i]
cumsum_n <- cumsum_n + n[i]
if (cumsum_w > threshold_w | cumsum_n > threshold_n) {
group <- group + 1
cumsum_w <- cumsum_w + w[i]
cumsum_n <- cumsum_n + n[i]
}
result = c(result, group)
}
return (result)
}
# cumsum with reset
cumsum_w_with_reset <- function(w, threshold_w) {
cumsum_w <- 0
group <- 1
result <- numeric()
for (i in 1:length(w)) {
cumsum_w <- cumsum_w + w[i]
if (cumsum_w > threshold_w) {
group <- group + 1
cumsum_w <- w[i]
}
result = c(result, cumsum_w)
}
return (result)
}
# cumsum with reset
cumsum_n_with_reset <- function(n, threshold_n) {
cumsum_n <- 0
group <- 1
result <- numeric()
for (i in 1:length(n)) {
cumsum_n <- cumsum_n + n[i]
if (cumsum_n > threshold_n | cumsum_w > threshold_w) {
group <- group + 1
cumsum_n <- n[i]
}
result = c(result, cumsum_n)
}
return (result)
}
# use functions above as window functions inside mutate statement
y<-df %>% group_by() %>%
mutate(
cumsum_w = cumsum_w_with_reset(w, 75),
cumsum_n =cumsum_n_with_reset(n, 15),
group = cumsum_with_reset_group(w, n, 75, 15)
) %>%
ungroup()
Error in mutate_impl(.data, dots) :
Evaluation error: object 'cumsum_w' not found
Thanks!
Here is a hack, which is done by repeated subsetting and binding. As such, this will be very slow with large data sets. This takes the whole data set as an input.
library(dplyr)
cumsumdf <- function(df){
cumsum_75 <- function(x) {cumsum(x) %/% 76}
cumsum_15 <- function(x) {cumsum(x) %/% 16}
cumsum_w75 <- function(x) {cumsum(x) %% 76}
cumsum_n15 <- function(x) {cumsum(x) %% 16}
m <- nrow(df)
df$grp <- 0
df <- df %>%
group_by(grp) %>%
mutate(cumsum_w = numeric(m), cumsum_n = numeric(m))
n = 0
df2 <- df[0,]
while(nrow(df) >0 ){
df$cumsum_w = cumsum_75(df$w)
df$cumsum_n = cumsum_15(df$n)
n <- n + 1
df1 <- df[df$cumsum_n == 0 & df$cumsum_w == 0,]
df <- df[df$cumsum_n != 0 | df$cumsum_w != 0,]
df1$grp <- n
df1 <- df1 %>% group_by(grp) %>%
mutate(cumsum_w = cumsum_w75(w), cumsum_n = cumsum_n15(n))
df2 <- rbind(df2,df1)
}
return(df2)
}
cumsumdf(df)