How to do calculation based on previous row results in dplyr - r

I performing some calculations where the result of a row is the input to the next.
I'm using a for loop which is quite slow, is there a way I can use dplyr for these types of calculations? example below
df <- data.frame(beginning_on_hand = c(10,0,0,0,0,0,0,0,0,0,0,0),
sales = c(10,9,4,7,3,7,2,6,1,5,7,1),
ship = c(10,9,4,7,3,7,2,6,1,5,7,1))
dfb <- df %>%
mutate(receipts = 0) %>%
mutate(ending_on_hand = 0) %>%
mutate(receipts = lag(ship, 2)) %>%
mutate(receipts = if_else(is.na(receipts), 0, receipts))
> dfb
beginning_on_hand sales ship receipts ending_on_hand
10 10 10 0 0
0 9 9 0 0
0 4 4 10 0
0 7 7 9 0
0 3 3 4 0
0 7 7 7 0
0 2 2 3 0
0 6 6 7 0
0 1 1 2 0
0 5 5 6 0
0 7 7 1 0
0 1 1 5 0
for(i in 1:(nrow(dfb)- 2)) {
dfb$ending_on_hand[i] <- dfb$beginning_on_hand[i] + dfb$receipts[i] - dfb$sales[i]
dfb$beginning_on_hand[i+1] <- dfb$ending_on_hand[i]
}
> dfb
beginning_on_hand sales ship receipts ending_on_hand
1 10 10 10 0 0
2 0 9 9 0 -9
3 -9 4 4 10 -3
4 -3 7 7 9 -1
5 -1 3 3 4 0
6 0 7 7 7 0
7 0 2 2 3 1
8 1 6 6 7 2
9 2 1 1 2 3
10 3 5 5 6 4
11 4 7 7 1 0
12 0 1 1 5 0

I don't have a dplyr solution for this, but here is a data.table solution for this.
df <- data.frame(beginning_on_hand = c(10,0,0,0,0,0,0,0,0,0,0,0),
sales = c(10,9,4,7,3,7,2,6,1,5,7,1),
ship = c(10,9,4,7,3,7,2,6,1,5,7,1))
dfb <- df %>%
mutate(ending_on_hand = 0) %>%
mutate(receipts = lag(ship, 2)) %>%
mutate(receipts = if_else(is.na(receipts), 0, receipts))
dfb<-data.table(dfb)
df.end <- dfb[, ending_on_hand := cumsum(beginning_on_hand + receipts - sales)][,
beginning_on_hand := beginning_on_hand + lag(ending_on_hand, default = 0)]
>df.end
beginning_on_hand sales ship ending_on_hand receipts
1: 10 10 10 0 0
2: 0 9 9 -9 0
3: -9 4 4 -3 10
4: -3 7 7 -1 9
5: -1 3 3 0 4
6: 0 7 7 0 7
7: 0 2 2 1 3
8: 1 6 6 2 7
9: 2 1 1 3 2
10: 3 5 5 4 6
11: 4 7 7 -2 1
12: -2 1 1 2 5
To explain, data.table uses basically lists to comprise the data and displays it in typically a flat-file manner. It uses SQL type instructions to organize and process data. The functions of note used here are cumsum and lag. cumsum calculates all values prior to a particular index, and lag looks for a value above or prior to a given index.

Related

Count consecutive numbers

I have some time series with corresponding number for each date as 0 or 1. For example:
date value
1 0
2 0
3 1
4 1
5 1
6 0
7 1
8 1
So I want to count the consecutive 1´s like for date 3-5 the sum should be 3 and then start at date 7 again to count. And if this sum is below 6 the 1´s should be transformed to 0´s.
library(dplyr)
data.frame(
date = 1:8,
value = c(0,0,1,1,1,0,1,1)
) %>%
mutate(
count = rle(value) %>%
{list(.$lengths * .$values, .$lengths)} %>%
{rep(x = .[[1]], times = .[[2]])},
count_1 = ifelse(count < 6, 0, count)
)
gives:
date value count count_1
1 1 0 0 0
2 2 0 0 0
3 3 1 3 0
4 4 1 3 0
5 5 1 3 0
6 6 0 0 0
7 7 1 2 0
8 8 1 2 0
I would first create a grouping variable and then use this to aggregate the dataset.
d = data.frame("date"=1:12,
"value"=c(1,1,0,0,1,1,1,1,0,0,1,0))
d$group = 1
for(i in 2:dim(d)[1]){
if(d$value[i]==d$value[i-1]){
d$group[i]=d$group[i-1]
} else {
d$group[i]=d$group[i-1]+1
}
}
nd = data.frame("Group"=unique(d$group),
"Start"=aggregate(d$date~d$group,FUN=min)[,2],
"End"=aggregate(d$date~d$group,FUN=max)[,2],
"Count"=aggregate(d$value~d$group,FUN=sum)[,2])
The output for this data would be:
> d ## Input data
date value
1 1 1
2 2 1
3 3 0
4 4 0
5 5 1
6 6 1
7 7 1
8 8 1
9 9 0
10 10 0
11 11 1
12 12 0
> nd ## All groups
Group Start End Count
1 1 1 2 2
2 2 3 4 0
3 3 5 8 4
4 4 9 10 0
5 5 11 11 1
6 6 12 12 0
> nd[nd$Count>0,] ## Just the groups with 1 in them:
Group Start End Count
1 1 1 2 2
3 3 5 8 4
5 5 11 11 1
Another solution which looks like what you expected :
d = data.frame("date"=1:20,"value"=c(1,1,0,0,1,1,1,1,0,0,1,0,1,1,1,1,1,1,1,0))
repl <- rle(d$value)
rep_lengths <- rep(repl$lengths, repl$lengths)
rep_lengths[rep_lengths < 6] <- 0
d$value <- rep_lengths
returns
> d
date value
1 1 0
2 2 0
3 3 0
4 4 0
5 5 0
6 6 0
7 7 0
8 8 0
9 9 0
10 10 0
11 11 0
12 12 0
13 13 7
14 14 7
15 15 7
16 16 7
17 17 7
18 18 7
19 19 7
20 20 0
You can use rle to count the consecutive and use ifelse to set those lower 6 to 0:
y <- rle(x$value)
y[[2]] <- y[[1]] * y[[2]]
y[[2]] <- ifelse(y[[2]] < 6, 0, y[[2]])
inverse.rle(y)
#[1] 0 0 0 0 0 0 0 0
Data:
x <- data.frame(date = 1:8, value = c(0,0,1,1,1,0,1,1))

What is the R function for detecting successive differences in a data frame?

I use the following code in R and it works very well. More precisely, I compare each time cluster_id with the last cluster_ref to see when they differ 2 periods in a row (data is organized by fund_numbers). However, I would like to adapt it to 5 periods. But it is impossible to make it work. Do you have any idea how I can modify this code to solve my problem?
get_output <- function(mon, ref){
exp <- !is.na(Cluster_id) & !map2_lgl(Cluster_id, last(Cluster_ref), identical)
as.integer(exp & lag(exp, default = FALSE))
}
df %>%
arrange(Fund_number, rolling_window) %>%
group_by(Fund_number) %>%
mutate(Deviation = get_output(Cluster_id, Cluster_ref)) %>%
ungroup()
rolling_window Fund_number Cluster_id Cluster_ref Expected_output
1 1 10 10 0
2 1 10 10 0
3 1 8 9 0
4 1 8 8 0
5 1 7 7 0
6 1 8 8 0
7 1 8 NA 1
8 1 7 NA 1
9 1 7 10 1
10 1 10 10 0
1 2 NA NA 0
2 2 NA 3 0
3 2 3 3 0
4 2 2 5 0
5 2 2 NA 0
6 2 2 4 0
7 2 2 4 1
8 2 5 5 0
9 2 4 5 0
10 2 3 5 0
This is what I want.
So as you can see, the data is organized by fund_number. Then I look at the last cluster_ref for each fund (so every 8 rows) and compare it to each cluster_id for each fund. As soon as it is different at least 5 periods in a row I have 1 if not 0. So for each fund, I compare the 8th cluster_ref and the cluster_id of rows 1 to 8.
The code above makes this but with 2 time periods.
Thank you very much,
Vanie
In data.table we can use rleid over Cluster_id values.
library(data.table)
setDT(df)[, temp := rleid(last(Cluster_ref) != Cluster_id), Fund_number]
df[, output := +(seq_along(Cluster_ref) >= 5), .(Fund_number, temp)]
df[, temp := NULL]
df
# rolling_window Fund_number Cluster_id Cluster_ref Expected_output output
# 1: 1 1 10 10 0 0
# 2: 2 1 10 10 0 0
# 3: 3 1 8 9 0 0
# 4: 4 1 8 8 0 0
# 5: 5 1 7 7 0 0
# 6: 6 1 8 8 0 0
# 7: 7 1 8 NA 1 1
# 8: 8 1 7 NA 1 1
# 9: 9 1 7 10 1 1
#10: 10 1 10 10 0 0
#11: 1 2 NA NA 0 0
#12: 2 2 NA 3 0 0
#13: 3 2 3 3 0 0
#14: 4 2 2 5 0 0
#15: 5 2 2 NA 0 0
#16: 6 2 2 4 0 0
#17: 7 2 2 4 1 1
#18: 8 2 5 5 0 0
#19: 9 2 4 5 0 0
#20: 10 2 3 5 0 0

Using R to filter special rows

I have a question which has bothered me for a long time.I have a data frame as below...
ll <- data.frame(id=1:10,
A=c(rep(0,5),3,4,5,0,2),
B=c(1,4,2,0,3,0,3,24,0,0),
C=c(0,3,4,5,0,4,0,6,0,5),
D=c(0,1,2,0,42,4,0,3,8,0))
> ll
id A B C D
1 1 0 1 0 0
2 2 0 4 3 1
3 3 0 2 4 2
4 4 0 0 5 0
5 5 0 3 0 42
6 6 3 0 4 4
7 7 4 3 0 0
8 8 5 24 6 3
9 9 0 0 0 8
10 10 2 0 5 0
I want to filter out some special rows which have more than one "0" such as...
id A B C D
1 1 0 1 0 0
I want the final output as...
id A B C D
2 2 0 4 3 1
3 3 0 2 4 2
6 6 3 0 4 4
8 8 5 24 6 3
You can just use rowSums:
> ll[rowSums(ll == 0) <= 1, ]
id A B C D
2 2 0 4 3 1
3 3 0 2 4 2
6 6 3 0 4 4
8 8 5 24 6 3
If there are any columns that shouldn't be included, you can drop them in the rowSums step. For example, I assume "id" would not be included. If that's the case, then you can do:
ll[rowSums(ll[-1] == 0) <= 1, ]

Sum rows in a group, starting when a specific value occurs

I want to accumulate the values of a column till the end of the group, though starting the addition when a specific value occurs in another column. I am only interested in the first instance of the specific value within a group. So if that value occurs again within the group, the addition column should continue to add the values. I know this sounds like a rather strange problem, so hopefully the example table makes sense.
The following data frame is what I have now:
> df = data.frame(group = c(1,1,1,1,2,2,2,2,2,3,3,3,4,4,4),numToAdd = c(1,1,3,2,4,2,1,3,2,1,2,1,2,3,2),occurs = c(0,0,1,0,0,1,0,0,0,0,1,1,0,0,0))
> df
group numToAdd occurs
1 1 1 0
2 1 1 0
3 1 3 1
4 1 2 0
5 2 4 0
6 2 2 1
7 2 1 0
8 2 3 0
9 2 2 0
10 3 1 0
11 3 2 1
12 3 1 1
13 4 2 0
14 4 3 0
15 4 2 0
Thus, whenever a 1 occurs within a group, I want a cumulative sum of the values from the column numToAdd, until a new group starts. This would look like the following:
> finalDF = data.frame(group = c(1,1,1,1,2,2,2,2,2,3,3,3,4,4,4),numToAdd = c(1,1,3,2,4,2,1,3,2,1,2,1,2,3,2),occurs = c(0,0,1,0,0,1,0,0,0,0,1,1,0,0,0),added = c(0,0,3,5,0,2,3,6,8,0,2,3,0,0,0))
> finalDF
group numToAdd occurs added
1 1 1 0 0
2 1 1 0 0
3 1 3 1 3
4 1 2 0 5
5 2 4 0 0
6 2 2 1 2
7 2 1 0 3
8 2 3 0 6
9 2 2 0 8
10 3 1 0 0
11 3 2 1 2
12 3 1 1 3
13 4 2 0 0
14 4 3 0 0
15 4 2 0 0
Thus, the added column is 0 until a 1 occurs within the group, then accumulates the values from numToAdd until it moves to a new group, turning the added column back to 0. In group three, a value of 1 is found a second time, yet the cumulated sum continues. Additionally, in group 4, a value of 1 is never found, thus the value within the added column remains 0.
I've played around with dplyr, but can't get it to work. The following solution only outputs the total sum, and not the increasing cumulated number at each row.
library(dplyr)
df =
df %>%
mutate(added=ifelse(occurs == 1,cumsum(numToAdd),0)) %>%
group_by(group)
Try
df %>%
group_by(group) %>%
mutate(added= cumsum(numToAdd*cummax(occurs)))
# group numToAdd occurs added
# 1 1 1 0 0
# 2 1 1 0 0
# 3 1 3 1 3
# 4 1 2 0 5
# 5 2 4 0 0
# 6 2 2 1 2
# 7 2 1 0 3
# 8 2 3 0 6
# 9 2 2 0 8
# 10 3 1 0 0
# 11 3 2 1 2
# 12 3 1 1 3
# 13 4 2 0 0
# 14 4 3 0 0
# 15 4 2 0 0
Or using data.table
library(data.table)#v1.9.5+
i1 <-setDT(df)[, .I[(rleid(occurs) + (occurs>0))>1], group]$V1
df[, added:=0][i1, added:=cumsum(numToAdd), by = group]
Or a similar option as in dplyr
setDT(df)[,added := cumsum(numToAdd * cummax(occurs)) , by = group]
You can use split-apply-combine in base R with something like:
df$added <- unlist(lapply(split(df, df$group), function(x) {
y <- rep(0, nrow(x))
pos <- cumsum(x$occurs) > 0
y[pos] <- cumsum(x$numToAdd[pos])
y
}))
df
# group numToAdd occurs added
# 1 1 1 0 0
# 2 1 1 0 0
# 3 1 3 1 3
# 4 1 2 0 5
# 5 2 4 0 0
# 6 2 2 1 2
# 7 2 1 0 3
# 8 2 3 0 6
# 9 2 2 0 8
# 10 3 1 0 0
# 11 3 2 1 2
# 12 3 1 1 3
# 13 4 2 0 0
# 14 4 3 0 0
# 15 4 2 0 0
To add another base R approach:
df$added <- unlist(lapply(split(df, df$group), function(x) {
c(x[,'occurs'][cumsum(x[,'occurs']) == 0L],
cumsum(x[,'numToAdd'][cumsum(x[,'occurs']) != 0L]))
}))
# group numToAdd occurs added
# 1 1 1 0 0
# 2 1 1 0 0
# 3 1 3 1 3
# 4 1 2 0 5
# 5 2 4 0 0
# 6 2 2 1 2
# 7 2 1 0 3
# 8 2 3 0 6
# 9 2 2 0 8
# 10 3 1 0 0
# 11 3 2 1 2
# 12 3 1 1 3
# 13 4 2 0 0
# 14 4 3 0 0
# 15 4 2 0 0
Another base R:
df$added <- unlist(lapply(split(df,df$group),function(x){
cumsum((cumsum(x$occurs) > 0) * x$numToAdd)
}))

cumulative counter in dataframe R

I have a dataframe with many rows, but the structure looks like this:
year factor
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 1
10 0
11 0
12 0
13 0
14 0
15 0
16 0
17 1
18 0
19 0
20 0
I would need to add a counter as a third column. It should count the cumulative cells that contains zero until it set again to zero once the value 1 is encountered. The result should look like this:
year factor count
1 0 0
2 0 1
3 0 2
4 0 3
5 0 4
6 0 5
7 0 6
8 0 7
9 1 0
10 0 1
11 0 2
12 0 3
13 0 4
14 0 5
15 0 6
16 0 7
17 1 0
18 0 1
19 0 2
20 0 3
I would be glad to do it in a quick way, avoiding loops, since I have to do the operations for hundreds of files.
You can copy my dataframe, pasting the dataframe in "..." here:
dt <- read.table( text="...", , header = TRUE )
Perhaps a solution like this with ave would work for you:
A <- cumsum(dt$factor)
ave(A, A, FUN = seq_along) - 1
# [1] 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 0 1 2 3
Original answer:
(Missed that the first value was supposed to be "0". Oops.)
x <- rle(dt$factor == 1)
y <- sequence(x$lengths)
y[dt$factor == 1] <- 0
y
# [1] 1 2 3 4 5 6 7 8 0 1 2 3 4 5 6 7 0 1 2 3

Resources