Conditional cumsum with reset when accumulating and subtracting at once - r

I have a data frame with three variables i.e. V1, V2 and V3.
# Example data: V1 is a signed indicator series, V2 holds additions and
# V3 holds subtractions for the running total.
ts <- c(-2, 4, 3, -5, -5, -7, -8, -2, -3, -5, -7, -8, -9, -2, 1, 2, 4)
x <- c(6, 0, 0, 1, 0, 2, 3, 5, 7, 7, 8, 2, 0, 0, 0, 0, 0)
y <- c(0, 5, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 9, 12, 0)
ve <- data.frame(V1 = ts, V2 = x, V3 = y)
I applied conditional cumsum with the code given below:
# Running total: add V2 every row; on rows where V1 is positive, also
# subtract V3. NOTE(review): this plain cumsum never resets when the
# total goes negative, which is the behavior the OP wants to change.
ve$yt <- cumsum(ifelse(ve$V1 > 0, ve$V2 - ve$V3, ve$V2))
I have to admit, this code did its job only partially for me, until I encountered a negative value. As such, I have a different desired output (DO). I want to restart accumulating the value, as shown in the table below, once a negative value is encountered in column yt.
View(ve)
V1 V2 V3 yt DO
-2 6 0 6 6
4 0 5 1 1
3 0 8 -7 0
-5 1 0 -6 1
-5 0 0 -6 1
-7 2 0 -4 3
-8 3 0 -1 6
-2 5 0 4 11
-3 7 0 11 18
-5 7 0 18 25
-7 8 0 26 33
-8 2 0 28 35
-9 0 0 28 35
-2 0 7 28 35
1 0 9 19 26
2 0 12 7 14
4 0 0 7 14
I searched for similar problem but I was unable to get any answer to solve my problem. These are some of the links I tried to solve my problem:
Conditional cumsum with reset
resetting cumsum if value goes to negative in r
I sincerely request to help me solve my problem.

Here is one way you might do this:
# Scan over the per-row delta (V2, minus V3 only when V1 > 0), clamping the
# running total at zero so it "resets" whenever it would go negative.
ve$DO <- Reduce(
  function(total, delta) pmax(total + delta, 0),
  with(ve, V2 - V3 * (V1 > 0)),
  accumulate = TRUE
)
ve
V1 V2 V3 DO
1 -2 6 0 6
2 4 0 5 1
3 3 0 8 0
4 -5 1 0 1
5 -5 0 0 1
6 -7 2 0 3
7 -8 3 0 6
8 -2 5 0 11
9 -3 7 0 18
10 -5 7 0 25
11 -7 8 0 33
12 -8 2 0 35
13 -9 0 0 35
14 -2 0 7 35
15 1 0 9 26
16 2 0 12 14
17 4 0 0 14
Equivalent using purrr/dplyr:
# Same clamped scan, expressed with purrr::accumulate() inside dplyr::mutate().
library(purrr)
library(dplyr)
ve %>%
  mutate(DO = accumulate(V2 - V3 * (V1 > 0),
                         .f = function(total, delta) pmax(total + delta, 0)))

Related

Cumulative sum of a subset of data based on condition

I have what I think is a simple R task, but I'm having trouble. Basically I need to do a cumulative sum of values based on the criteria of another column.
Here's the catch, it should do the cumulative sum for the previous rows until it hits another condition. In the example i'm providing, it accumulates all values from the duration column, 1 and 2 in the condition column. Example is shown below.
# Toy data: accum_sum is the desired result -- the running sum of duration
# within each stretch, reported only on rows where condition is non-zero.
duration <- c(2, 3, 2, 4, 5, 10, 2, 9, 7, 5, 8, 9, 10, 12, 4, 5, 6)
condition <- c(0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2)
accum_sum <- c(0, 5, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 39, 0, 0, 0, 27)
df <- data.frame(duration, condition, accum_sum)
df
row duration condition accum_sum
1 2 0 0
2 3 1 5
3 2 0 0
4 4 0 0
5 5 0 0
6 10 0 0
7 2 0 0
8 9 2 32
9 7 0 0
10 5 0 0
11 8 0 0
12 9 0 0
13 10 1 39
14 12 0 0
15 4 0 0
16 5 0 0
17 6 2 27
Using data.table:
# Fill the zero conditions backwards so each run ends at its non-zero
# condition row, cumulative-sum durations within those runs, then blank
# the rows where condition is zero.
setDT(df)
df[, accum_sum := cumsum(duration),
   by = rev(cumsum(rev(condition)))][condition == 0, accum_sum := 0]
# duration condition accum_sum
# 1: 2 0 0
# 2: 3 1 5
# 3: 2 0 0
# 4: 4 0 0
# 5: 5 0 0
# 6: 10 0 0
# 7: 2 0 0
# 8: 9 2 32
# 9: 7 0 0
#10: 5 0 0
#11: 8 0 0
#12: 9 0 0
#13: 10 1 39
#14: 12 0 0
#15: 4 0 0
#16: 5 0 0
#17: 6 2 27
We create runs by filling the zeros backwards with rev(cumsum(rev(condition))) and then group by this "filled" condition.
#cumulative sum
# Group id: cumulative count of the condition column, shifted down one row
# so each terminating (non-zero) row stays inside its own group; then take
# the cumulative sum of duration within each group.
df$cum_sum <- ave(df$duration, c(0, cumsum(df$condition[-nrow(df)])), FUN = cumsum)
# Keep the cumulative sum only on condition rows; zero it elsewhere.
df$cum_sum[df$condition == 0] <- 0
which gives
duration condition cum_sum
1 2 0 0
2 3 1 5
3 2 0 0
4 4 0 0
5 5 0 0
6 10 0 0
7 2 0 0
8 9 2 32
9 7 0 0
10 5 0 0
11 8 0 0
12 9 0 0
13 10 1 39
14 12 0 0
15 4 0 0
16 5 0 0
17 6 2 27
Sample data:
# Sample data (equivalent to the structure() dump from dput()).
df <- data.frame(
  duration = c(2, 3, 2, 4, 5, 10, 2, 9, 7, 5, 8, 9, 10, 12, 4, 5, 6),
  condition = c(0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2),
  cum_sum = c(0, 5, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 39, 0, 0, 0, 27)
)
Using dplyr, we can use cumsum() on condition to keep track of how many conditions have been seen. Then add within those subsets:
library(dplyr)
df %>%
  # A new group starts on the row AFTER each non-zero condition.
  mutate(condition_group = cumsum(lag(condition, default = 0) != 0) + 1) %>%
  group_by(condition_group) %>%
  # Report the group's total duration on condition rows, zero elsewhere.
  mutate(accum_sum = ifelse(condition != 0, sum(duration), 0))
Output:
# A tibble: 17 x 4
# Groups: condition_group [4]
duration condition accum_sum condition_group
<dbl> <dbl> <dbl> <dbl>
1 2 0 0 1
2 3 1 5 1
3 2 0 0 2
4 4 0 0 2
5 5 0 0 2
6 10 0 0 2
7 2 0 0 2
8 9 2 32 2
9 7 0 0 3
10 5 0 0 3
11 8 0 0 3
12 9 0 0 3
13 10 1 39 3
14 12 0 0 4
15 4 0 0 4
16 5 0 0 4
17 6 2 27 4
If you shift condition by 1, you can simply use tapply.
# Shift condition down one row (keeping the first element in place) so each
# non-zero condition row closes its own group, then cumulative-sum the
# durations within each group and zero out the non-condition rows.
duration <- c(2, 3, 2, 4, 5, 10, 2, 9, 7, 5, 8, 9, 10, 12, 4, 5, 6)
condition <- c(0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2)
accum_sum <- c(0, 5, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 39, 0, 0, 0, 27)
df <- data.frame(duration, condition, accum_sum)
df$want <- unlist(tapply(
  df$duration,
  INDEX = cumsum(c(df$condition[1], head(df$condition, -1))),
  cumsum
)) * ifelse(df$condition == 0, 0, 1)
df

Print 1 below specific value until meets a higher value

I have data where I wish to print 1s when below a certain value until we meet a higher value.
Take this data for example:
data long_entry long_exit
1 80.000000 0 1
2 7.692308 1 0
3 7.692308 1 0
4 8.333333 1 0
5 9.090909 1 0
6 20.000000 1 0
7 27.272727 0 0
8 50.000000 0 0
9 50.000000 0 0
10 21.428571 1 0
11 58.333333 0 0
12 46.666667 0 0
13 78.064516 0 1
14 86.153846 0 1
15 42.857143 0 0
16 44.186047 0 0
17 20.000000 1 0
18 25.000000 0 0
19 40.000000 0 0
20 45.000000 0 0
21 78.000000 0 1
22 55.000000 0 0
My goal is to print 1s when the data column is below 25 and continue to print 1 until we meet a data value over 70 (first instance).
Code used to make long / exit signals:
# Raw signals: enter when data dips below 25, exit at or above 70.
df$long_entry <- as.numeric(df$data < 25)
df$long_exit <- as.numeric(df$data >= 70)
I have tried writing a few for loops using base and dplyr:
# NOTE(review): this loop is broken -- ifelse() takes exactly three
# arguments (test, yes, no), so the four-argument call below raises an
# "unused argument" error; the nested ifelse() was presumably intended to
# be the third argument. Shown here as the question's failed attempt.
df$final.signal[[1]] = ifelse(df$long_entry[[1]] == 1, 1, 0)
for (i in 2:nrow(df)){
df$final.signal[i] = ifelse(df$long_entry[i] ==1, 1, 0,
ifelse(df$long_exit[i] == 1, 0,
df$long_exit[i-1]))
}
# NOTE(review): both branches of the nested ifelse() resolve to 0, so this
# collapses to final.signal = long_entry and never "latches" the signal
# between an entry and the following exit -- hence it does not do as
# intended. Shown here as the question's failed attempt.
df <- df %>%
dplyr::mutate(final.signal = ifelse(long_entry == 1, 1,
ifelse(long_exit ==1, 0, 0)))
This however does not do as intended. The desired output is to be like this:
data desired.output
1 80.000000 0
2 7.692308 1
3 7.692308 1
4 8.333333 1
5 9.090909 1
6 20.000000 1
7 27.272727 1
8 50.000000 1
9 50.000000 1
10 21.428571 1
11 58.333333 1
12 46.666667 1
13 78.064516 1 (1 on first instance over 70)
14 86.153846 0
15 42.857143 0
16 44.186047 0
17 20.000000 1 (back to 1 when under 25)
18 25.000000 1
19 40.000000 1
20 45.000000 1
21 78.000000 1 ( stay 1 until first instance over 70)
22 85.000000 0
We see we print 1 < 25 until we meet the first instance of >70.
Which is the best method to approach this task?
Maybe this can help you:
# NOTE(review): rnorm() is unseeded, so this example data is not
# reproducible; call set.seed() first if you need stable output.
dataa <- data.frame(abs(rnorm(mean = 30, sd = 40, n = 100)))
names(dataa) <- c("v1")
# Flag rows once a value below 25 has been seen and no value over 70 has
# occurred yet. Inside mutate(), refer to columns bare (data masking)
# rather than via dataa$v1 -- the dataa$ form silently ignores any
# grouping/filtering applied earlier in a pipeline.
dataa %>%
  mutate(v2 = as.numeric(cumsum(as.numeric(v1 > 70)) <= 0 &
                           cumsum(as.numeric(v1 < 25)) >= 1))

Get Max Value per Run or Series in Sequence

I am trying to get a a max value per stretch of an indicator, or repeating value.
Here is an example:
A <- c(28, 20, 23, 30, 26, 23, 25, 26, 27, 25, 30, 26, 25, 22, 24, 25, 24, 27, 29)
B <- c(0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1)
# data.frame(A, B) builds the frame directly and keeps each column's type;
# as.data.frame(cbind(A, B)) first coerces everything into a single matrix
# type, a classic anti-pattern (harmless here but brittle with mixed types).
df <- data.frame(A, B)
df
A B
28 0
20 1
23 1
30 0
26 0
23 1
25 1
26 1
27 0
25 0
30 1
26 1
25 1
22 0
24 1
25 0
24 0
27 0
29 1
For each group or stretch of 1's in column B I want to find the max in column A. The max column could be an indicator that A it is a max or the actual value in A, and be NA or 0 for other values of B.
The output I am hoping for looks something like this:
A B max
28 0 0
20 1 0
23 1 1
30 0 0
26 0 0
23 1 0
25 1 0
26 1 1
27 0 0
25 0 0
30 1 1
26 1 0
25 1 0
22 0 0
24 1 1
25 0 0
24 0 0
27 0 0
29 1 1
I've tried to generate groups per section of column B that = 1 but I did not get very far because most grouping functions require unique values between groups.
Also, please let me know if there are any improvements to the title for this problem.
One option would be data.table
library(data.table)
# Within each run of identical B values (rleid), flag the row(s) where A
# equals the run maximum -- but only inside runs of ones; the logical flag
# is coerced to integer 0/1.
setDT(df)[, Max := as.integer(A == max(A) & B), by = rleid(B)]
df
# A B Max
# 1: 28 0 0
# 2: 20 1 0
# 3: 23 1 1
# 4: 30 0 0
# 5: 26 0 0
# 6: 23 1 0
# 7: 25 1 0
# 8: 26 1 1
# 9: 27 0 0
#10: 25 0 0
#11: 30 1 1
#12: 26 1 0
#13: 25 1 0
#14: 22 0 0
#15: 24 1 1
#16: 25 0 0
#17: 24 0 0
#18: 27 0 0
#19: 29 1 1
Or as #Frank mentioned, for better efficiency, we can make use gmax by first assigning column and then replace
# Two-pass variant: first assign each run's maximum as a helper column
# (grouped max, letting data.table use its optimized gmax), then flag the
# qualifying rows in a second step.
DT[, MA := max(A), by=rleid(B)][A == MA & B, Max := 1L][]
Solution using dplyr
library(dplyr)
df %>%
group_by(with(rle(B), rep(seq_along(lengths), lengths))) %>%
mutate(MAX = ifelse(B == 0, 0, as.numeric(A == max(A)))) %>%
.[, c(1, 2, 4)]
A B MAX
<dbl> <dbl> <dbl>
1 28 0 0
2 20 1 0
3 23 1 1
4 30 0 0
5 26 0 0
6 23 1 0
7 25 1 0
8 26 1 1
9 27 0 0
10 25 0 0
11 30 1 1
12 26 1 0
13 25 1 0
14 22 0 0
15 24 1 1
16 25 0 0
17 24 0 0
18 27 0 0
19 29 1 1

Subtraction matrix in R

I want to get the subtraction matrix, the matrix obtained by subtracting each row from other rows. My MWE is below (not working as expected). The resulting matrix should be 36*3 containing subtracted values of each row from other rows. Thanks
X <-
  matrix(
    data = c(
      5, 9, 20,
      6, 11, 2,
      4, 5, 20,
      6, 9, 46,
      5, 7, 1,
      3, 1, 12
    ),
    nrow = 6,
    ncol = 3,
    byrow = TRUE
  )
# All pairwise row differences X[i, ] - X[j, ], stored as a 36 x 3 matrix
# (row (i - 1) * nrow(X) + j holds X[i, ] - X[j, ]).
XSub <-
  matrix(data = NA, nrow = nrow(X)^2, ncol = ncol(X))
for (i in seq_len(nrow(X))) {
  for (j in seq_len(nrow(X))) {
    # BUG FIX: the original indexed XSub[i + j - 1, ], which collides for
    # different (i, j) pairs (e.g. (1, 2) and (2, 1)), overwriting earlier
    # results and leaving most of the 36 rows NA.
    XSub[(i - 1) * nrow(X) + j, ] <- X[i, ] - X[j, ]
  }
}
XSub
I believe this is what you might want (avoids using a loop):
# Vectorized pairwise row differences: enumerate every ordered pair of row
# indices, subtract in one step, and label each result row with its pair.
X <- matrix(
  c(
    5, 9, 20,
    6, 11, 2,
    4, 5, 20,
    6, 9, 46,
    5, 7, 1,
    3, 1, 12
  ),
  nrow = 6,
  ncol = 3,
  byrow = TRUE
)
comb <- expand.grid(x1 = seq_len(nrow(X)), x2 = seq_len(nrow(X)))
XSub <- X[comb$x1, ] - X[comb$x2, ]
rownames(XSub) <- paste(comb$x1, comb$x2, sep = "-")
Results in the following:
> XSub
[,1] [,2] [,3]
1-1 0 0 0
2-1 1 2 -18
3-1 -1 -4 0
4-1 1 0 26
5-1 0 -2 -19
6-1 -2 -8 -8
1-2 -1 -2 18
2-2 0 0 0
3-2 -2 -6 18
4-2 0 -2 44
5-2 -1 -4 -1
6-2 -3 -10 10
1-3 1 4 0
2-3 2 6 -18
3-3 0 0 0
4-3 2 4 26
5-3 1 2 -19
6-3 -1 -4 -8
1-4 -1 0 -26
2-4 0 2 -44
3-4 -2 -4 -26
4-4 0 0 0
5-4 -1 -2 -45
6-4 -3 -8 -34
1-5 0 2 19
2-5 1 4 1
3-5 -1 -2 19
4-5 1 2 45
5-5 0 0 0
6-5 -2 -6 11
1-6 2 8 8
2-6 3 10 -10
3-6 1 4 8
4-6 3 8 34
5-6 2 6 -11
6-6 0 0 0

Calculating change from baseline with data in long format

Here is a small reproducible example of my data:
> mydata <- structure(list(subject = c(1, 1, 1, 2, 2, 2), time = c(0, 1, 2, 0, 1, 2), measure = c(10, 12, 8, 7, 0, 0)), .Names = c("subject", "time", "measure"), row.names = c(NA, -6L), class = "data.frame")
> mydata
subject time measure
1 0 10
1 1 12
1 2 8
2 0 7
2 1 0
2 2 0
I would like to generate a new variable that is the "change from baseline". That is, I would like
subject time measure change
1 0 10 0
1 1 12 2
1 2 8 -2
2 0 7 0
2 1 0 -7
2 2 0 -7
Is there an easy way to do this, other than looping through all the records programatically or reshaping to wide format first ?
There are many possibilities. My favorites:
library(plyr)
# Per-subject change from the first (baseline) measurement.
# NOTE(review): plyr is superseded by dplyr (group_by()/mutate()) -- prefer
# the dplyr/data.table alternatives below for new code.
ddply(mydata,.(subject),transform,change=measure-measure[1])
subject time measure change
1 1 0 10 0
2 1 1 12 2
3 1 2 8 -2
4 2 0 7 0
5 2 1 0 -7
6 2 2 0 -7
library(data.table)
myDT <- as.data.table(mydata)
# Reference semantics: := adds the column in place, grouped by subject;
# measure[1] is each subject's baseline (time == 0 row comes first).
myDT[, change := measure - measure[1], by = subject]
print(myDT)
subject time measure change
1: 1 0 10 0
2: 1 1 12 2
3: 1 2 8 -2
4: 2 0 7 0
5: 2 1 0 -7
6: 2 2 0 -7
data.table is preferable if your dataset is large.
What about:
# Split measure by subject, subtract each group's first (baseline) value,
# then concatenate the per-subject pieces back together in order.
mydata$change <- do.call("c", with(mydata, lapply(split(measure, subject),
                                                  function(v) v - v[1])))
alternatively you could also use the ave function:
# ave() applies the function within each subject and returns the result
# aligned with the original row order.
with(mydata, ave(measure, subject, FUN = function(v) v - v[1]))
# [1] 0 2 -2 0 -7 -7
or
# Same ave() computation, but within() attaches it as a new column.
within(mydata, change <- ave(measure, subject, FUN = function(v) v - v[1]))
# subject time measure change
# 1 1 0 10 0
# 2 1 1 12 2
# 3 1 2 8 -2
# 4 2 0 7 0
# 5 2 1 0 -7
# 6 2 2 0 -7
you can use tapply:
# tapply() splits measure by subject and subtracts each subject's baseline;
# unlist()/as.vector() flattens the per-subject results (dropping the
# composite names) back into a single column.
mydata$change <- as.vector(unlist(tapply(mydata$measure, mydata$subject,
                                         FUN = function(x) x - rep(x[1], length(x)))))

Resources