Iterative replacing of value with lagged values using dplyr - r

I have the following data frame -
x <- c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)
y <- c(0,0,0,1,0,-1,0,-1,0,1,0,-1,0,1,0,0,0)
data <- data.frame(x,y)
and I would like to create a type of momentum indicator. Effectively, if y is non-zero, x takes y's value and if y is 0, x takes on the value of the lagged x value. Essentially, I am replacing x's value row by row. Doing this in a for loop is simple -
for (i in 1:nrow(data)) {
data$x[i] <-
ifelse(data$y[i] == 1, 1, ifelse(data$y[i] == -1, -1, data$x[i-1]))}
Giving me this output (what I am looking for)
x y
1 NA 0
2 NA 0
3 NA 0
4 1 1
5 1 0
6 -1 -1
7 -1 0
8 -1 -1
9 -1 0
10 1 1
11 1 0
12 -1 -1
13 -1 0
14 1 1
15 1 0
16 1 0
17 1 0
However, on really large datasets, this for loop is extremely inefficient. I'd like to implement this in dplyr, however the best solution I have managed to come up with does not do the trick
data2 <- data.frame(x,y)
data2 <-
data2 %>%
mutate(x = ifelse(y == 1, 1, ifelse(y == -1, 0, Lag(x))))
which returns this
x y
1 NA 0
2 1 0
3 1 0
4 1 1
5 1 0
6 0 -1
7 1 0
8 0 -1
9 1 0
10 1 1
11 1 0
12 0 -1
13 1 0
14 1 1
15 1 0
16 1 0
17 1 0
My guess is that the way I am currently attempting to do this in dplyr does not control for the iterative nature of what I want to do, namely replace x as I move down the rows. Does anyone have ideas as to how I could do this through dplyr?

One option is to replace 0 with NA, and then do a forward fill:
library(dplyr); library(tidyr)
data %>% mutate(x = na_if(y, 0)) %>% fill(x)
# x y
#1 NA 0
#2 NA 0
#3 NA 0
#4 1 1
#5 1 0
#6 -1 -1
#7 -1 0
#8 -1 -1
#9 -1 0
#10 1 1
#11 1 0
#12 -1 -1
#13 -1 0
#14 1 1
#15 1 0
#16 1 0
#17 1 0

Here is another option using na.locf from zoo
library(zoo)
data$x <- with(data, na.locf(y*(NA^!y), na.rm=FALSE))

Related

Create an index variable for blocks of values

I have a dataframe "data" with a grouping variable "grp" and a binary classification variable "classif". For each group in grp, I want to create a "result" variable creating an index of separate blocks of 0 in the classif variable. For the time being, I don't know how to reset the count for each level of the grouping variable and I don't find a way to only create the index for blocks of 0s (ignoring the 1s).
Example data:
grp <- c(1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3)
classif <- c(0,1,0,0,1,0,0,1,1,0,0,0,0,1,0,1,1,1,0,0,1,1,0,0,0,1,0,1,0)
result <- c(1,0,2,2,0,3,3,0,0,1,1,1,1,0,2,0,0,0,3,3,0,0,1,1,1,0,2,0,3)
wrong_result <- c(1,2,3,3,4,5,5,1,1,2,2,2,2,3,4,5,5,5,6,6,1,1,2,2,2,3,4,5,6)
Data <- data.frame(grp,classif,result, wrong_result)
I have tried using rleid but the following command produces "wrong_result", which is not what I'm after.
data[, wrong_result:= rleid(classif)]
data[, wrong_result:= rleid(classif), by=grp]
With dplyr, use cumsum() and lag() to number the blocks of zeroes, restarting the count within each group via the .by argument. (The .by argument requires dplyr 1.1.0 or later; with older versions use group_by(grp) before mutate() instead.)
library(dplyr)
Data %>%
mutate(
result2 = ifelse(
classif == 0,
cumsum(classif == 0 & lag(classif, default = 1) == 1),
0
),
.by = grp
)
grp classif result result2
1 1 0 1 1
2 1 1 0 0
3 1 0 2 2
4 1 0 2 2
5 1 1 0 0
6 1 0 3 3
7 1 0 3 3
8 2 1 0 0
9 2 1 0 0
10 2 0 1 1
11 2 0 1 1
12 2 0 1 1
13 2 0 1 1
14 2 1 0 0
15 2 0 2 2
16 2 1 0 0
17 2 1 0 0
18 2 1 0 0
19 2 0 3 3
20 2 0 3 3
21 3 1 0 0
22 3 1 0 0
23 3 0 1 1
24 3 0 1 1
25 3 0 1 1
26 3 1 0 0
27 3 0 2 2
28 3 1 0 0
29 3 0 3 3
Use rle and sequentially number the runs produced and then convert back and zero out the runs of 1's. No packages are used.
# Number the runs of zeros in `x` consecutively and zero out everything else.
#
# x: a numeric (or logical) vector.
# Returns a vector the same length as `x`: every element of the k-th run of
# zeros is k, every non-zero element is 0, and missing values stay NA.
seq0 <- function(x) {
  r <- rle(x)
  # which() discards NA comparisons, so a missing value in `x` neither breaks
  # the subscripted assignment below nor gets counted as a run of zeros.
  zero_runs <- which(r$values == 0)
  r$values[zero_runs] <- seq_along(zero_runs)
  # `!x` is TRUE exactly where x is 0, so the product keeps the run numbers
  # on the zero positions and turns every non-zero position into 0.
  inverse.rle(r) * !x
}
transform(Data, result2 = ave(classif, grp, FUN = seq0))

Conditioning error, progression of logic in mutate/elseif_ pipeline

I'm trying to work out why a code like this won't give me the expected results. I understand there are better ways of achieving the results (cut, etc.) but I am specifically trying to understand why the mutate>ifelse pipeline progression to replace values doesn't work.
A <- c(1,0,0,0,NA,0,1,0,1,0,0,1,1,1,NA,NA,NA,1,0,0,0,1,1,1,0,1,NA)
B <- c(1,0,0,NA,0,1,1,1,0,1,NA,1,0,1,NA,NA,1,0,01,0,0,0,NA,0,1,0,1)
C <- c(0,NA,0,1,0,1,NA,1,0,1,NA,0,1,0,NA,NA,1,0,01,NA,0,0,NA,1,NA,NA,1)
df <- data.frame(A, B, C)
df$D <- NA
df <- df %>%
mutate(D=ifelse(A==0 & B==0 & C==0,0,D)) %>% #assign 0 to d IF all 3 variables 0
mutate(D=ifelse(A==0 | B==0 | C==0,0,D)) %>% #now assign 0 to d IF ANY of 3 variables 0
mutate(D=ifelse(A==1 | B==1 | C==1,1,D)) #now reassign d to 1 if any of the variables has the value 1
> summary(as.factor(df$D))
0 1 NA's
2 19 6
But looking at the cross tabulation, my aim is to get 0=2 and NA=2, with the rest assigned 1. I can't figure out why my code's logic is not working.
> ftable(xtabs(~A+B+C, df, addNA = TRUE, na.action = NULL)) #matches AV variable
C 0 1 NA
A B
0 0 2 0 2
1 0 4 1
NA 0 1 1
1 0 3 2 1
1 3 0 1
NA 0 0 1
NA 0 1 0 0
1 0 2 0
NA 0 0 2
Edit: corrected typo
Look at your code step by step, specifically the two mutate commands with the OR conditions. For rows that contain missing values and 1s (but no zeroes), R can't check whether the row contains a zero, because it does not know what NA might be. So the second mutate returns NA for any row that has only 1s and NAs. The third step does the same, just with 1s: any row that contains only 0s and NAs will then return NA.
You can verify this by:
x <- c(0, 0, NA)
any(x == 0)
[1] TRUE
any(x == 1)
[1] NA
You can do:
library(tidyverse)
df2 <- df %>%
mutate(D = case_when(A == 0 & B == 0 & C == 0 ~ 0,
is.na(A) & is.na(B) & is.na(C) ~ NA_real_,
TRUE ~ 1))
which gives:
A B C D
1 1 1 0 1
2 0 0 NA 1
3 0 0 0 0
4 0 NA 1 1
5 NA 0 0 1
6 0 1 1 1
7 1 1 NA 1
8 0 1 1 1
9 1 0 0 1
10 0 1 1 1
11 0 NA NA 1
12 1 1 0 1
13 1 0 1 1
14 1 1 0 1
15 NA NA NA NA
16 NA NA NA NA
17 NA 1 1 1
18 1 0 0 1
19 0 1 1 1
20 0 0 NA 1
21 0 0 0 0
22 1 0 0 1
23 1 NA NA 1
24 1 0 1 1
25 0 1 NA 1
26 1 0 NA 1
27 NA 1 1 1
And then
df2 %>% count(D)
D n
1 0 2
2 1 23
3 NA 2

column-wise operations depending on data on a data frame in R

I have a data frame with negative values in one column. something like this
df <- data.frame("a" = 1:6,"b"= -(5:10), "c" = rep(8:6,2))
a b c
1 1 -5 8
2 2 -6 7
3 3 -7 6
4 4 -8 8
5 5 -9 7
6 6 -10 6
I want to convert this to a data frame with no negative values in "b" keeping row totals unchanged. I can use column "a" only if "c" is not big enough to absorb the negative values in "b".
The end result should look like this
a b c
1 1 0 3
2 2 0 1
3 2 0 0
4 4 0 0
5 3 0 0
6 2 0 0
I feel that sapply could be used. But I don't know how ?
You can use pmin and pmax to get the new values for a, b and c.
df$c <- df$c + pmin(0, df$b)
df$b <- pmax(0, df$b)
df$a <- df$a + pmin(0, df$c)
df$c <- pmax(0, df$c)
df
# a b c
#1 1 0 3
#2 2 0 1
#3 2 0 0
#4 4 0 0
#5 3 0 0
#6 2 0 0
You could use dplyr:
df %>%
mutate(total=rowSums(.)) %>%
rowwise() %>%
mutate(c=max(b+c, 0),
b=max(b,0),
a=total - c - b) %>%
select(-total)
which returns
# A tibble: 6 x 3
# Rowwise:
a b c
<dbl> <dbl> <dbl>
1 1 0 3
2 2 0 1
3 2 0 0
4 4 0 0
5 3 0 0
6 2 0 0
Here is a base R solution.
df2 <- df
df2$c <- df$c + df$b
df2$a <- ifelse(df2$c < 0, df2$a + df2$c, df2$a)
df2[df2 < 0 ] <- 0
df2
# a b c
# 1 1 0 3
# 2 2 0 1
# 3 2 0 0
# 4 4 0 0
# 5 3 0 0
# 6 2 0 0

Filtering on a Column Whose Number is Specified in Another Column

I'm looking for a better way to achieve what the code below does with a for loop. The goal is to create a dataframe (or matrix) where each row is a possible n-length sequence of 1s and 0s, followed by an n+1th column which contains a number corresponding to one of the previous columns that contains a 0.
So in the n == 3 case for example, we want to include a row like this:
1 0 0 2
but not this:
1 0 0 1
Here's the code I have now (assuming n == 3 for simplicity):
library(tidyverse)
df <- expand.grid(x = 0:1, y = 0:1, z = 0:1, target = 1:3, keep = FALSE)
for (row in 1:nrow(df)) {
df$keep[row] <- df[row, df$target[row]] == 0
}
df <- df %>%
filter(keep == TRUE) %>%
select(-keep)
head(df)
# x y z target
# 1 0 0 0 1
# 2 0 1 0 1
# 3 0 0 1 1
# 4 0 1 1 1
# 5 0 0 0 2
# 6 1 0 0 2
# 7 0 0 1 2
# 8 1 0 1 2
# 9 0 0 0 3
# 10 1 0 0 3
# 11 0 1 0 3
# 12 1 1 0 3
Seems like there has to be a better way to do this, especially with dplyr. But I can't figure out how to use the value of target to specify the column to filter on.
Using base R, we can create a row/column index to filter values from the dataframe and keep rows where the extracted value is 0.
df[df[cbind(seq_len(nrow(df)), df$target)] == 0, ]
# x y z target
#1 0 0 0 1
#3 0 1 0 1
#5 0 0 1 1
#7 0 1 1 1
#9 0 0 0 2
#10 1 0 0 2
#13 0 0 1 2
#14 1 0 1 2
#17 0 0 0 3
#18 1 0 0 3
#19 0 1 0 3
#20 1 1 0 3
data
df <- expand.grid(x = 0:1, y = 0:1, z = 0:1, target = 1:3)

Flag observations before and after a specific value in another column

Say I have a df:
df <- data.frame(flag = c(rep(0, 20)),
include = c(rep(1, 20)))
df[c(4,8,16), ]$flag <- 1
df
flag include
1 0 1
2 0 1
3 0 1
4 1 1
5 0 1
6 0 1
7 0 1
8 1 1
9 0 1
10 0 1
11 0 1
12 0 1
13 0 1
14 0 1
15 0 1
16 1 1
17 0 1
18 0 1
19 0 1
20 0 1
What I wish to do is change the include flag to 0 if the row is within +/- two rows of a row where flag == 1. The result would look like:
flag include
1 0 1
2 0 0
3 0 0
4 1 1
5 0 0
6 0 0
7 0 0
8 1 1
9 0 0
10 0 0
11 0 1
12 0 1
13 0 1
14 0 0
15 0 0
16 1 1
17 0 0
18 0 0
19 0 1
20 0 1
I've thought of some 'innovative' (read: inefficient and over complicated) ways to do it but was thinking there must be a simple way I'm overlooking.
Would be nice if the answer was such that I could generalize this to +/- n rows, since I have a lot more data and would be looking to potentially search within +/- 10 rows...
Another option with data.table:
library(data.table)
n = 2
# find the row number where flag is one
flag_one = which(df$flag == 1)
# find the index where include needs to be updated
idx = setdiff(outer(flag_one, -n:n, "+"), flag_one)
# update include in place
setDT(df)[idx[idx >= 1 & idx <= nrow(df)], include := 0][]
# or, as @Frank commented, the last step with base R would be
# df$include[idx[idx >= 1 & idx <= nrow(df)]] = 0
# flag include
# 1: 0 1
# 2: 0 0
# 3: 0 0
# 4: 1 1
# 5: 0 0
# 6: 0 0
# 7: 0 0
# 8: 1 1
# 9: 0 0
#10: 0 0
#11: 0 1
#12: 0 1
#13: 0 1
#14: 0 0
#15: 0 0
#16: 1 1
#17: 0 0
#18: 0 0
#19: 0 1
#20: 0 1
Put in a function:
# Set `include` to 0 on every row that lies within `n` rows of a row whose
# `flag` is 1, leaving the flagged rows themselves untouched.
#
# df: a data frame with columns `flag` and `include`.
# n:  how many rows before and after each flagged row to clear.
# Returns the modified copy of `df`.
update_n <- function(df, n) {
  flagged_rows <- which(df$flag == 1)
  # Every row index inside a +/- n window around a flagged row ...
  window_rows <- outer(flagged_rows, -n:n, "+")
  # ... minus the flagged rows themselves.
  neighbour_rows <- setdiff(window_rows, flagged_rows)
  # Windows may run past either end of the data; keep only valid row numbers.
  in_range <- neighbour_rows >= 1 & neighbour_rows <= nrow(df)
  df$include[neighbour_rows[in_range]] <- 0
  df
}
There must be another simpler way but the first way which I could think of is using sapply and which
df$include[sapply(which(df$flag == 1) , function(x) c(x-2, x-1, x+1, x+2))] <- 0
df
# flag include
#1 0 1
#2 0 0
#3 0 0
#4 1 1
#5 0 0
#6 0 0
#7 0 0
#8 1 1
#9 0 0
#10 0 0
#11 0 1
#12 0 1
#13 0 1
#14 0 0
#15 0 0
#16 1 1
#17 0 0
#18 0 0
#19 0 1
#20 0 1
We first find all the indices where flag is 1, then create the required sequence of row numbers around each of them, and set include to 0 at those positions.
For variable n we can do
n = 2
df$include[sapply(which(df$flag == 1),function(x) setdiff(seq(x-n, x+n),x))] <- 0
replace(x = df$include,
list = sapply(1:NROW(df), function(i)
any(df$flag[c(max(1, i-2):max(1, i-1),
min(i+1, NROW(df)):min(i+2, NROW(df)))] == 1)), values = 0)
# [1] 1 0 0 1 0 0 0 1 0 0 1 1 1 0 0 1 0 0 1 1
For n rows,
replace(x = df$include,
list = sapply(1:NROW(df), function(i)
any(df$flag[c(max(1, i-n):max(1, i-1),
min(i+1, NROW(df)):min(i+n, NROW(df)))] == 1)), values = 0)
Another way is to use zoo::rollapply. To determine if a row is within +/- two rows of a row where flag == 1, we check if the maximum flag in a window is 1.
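We need rollapply rather than rollmax because we need to specify partial = TRUE, so that the windows at the start and end of the vector are evaluated on the rows that are available instead of being dropped.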
# For each position, report whether any `flag` value within `n` positions
# (before or after, the position itself included) equals 1.
#
# flag: a numeric vector of 0/1 flags.
# n:    half-width of the window; the full window spans 2 * n + 1 positions.
# Returns the element-wise result of comparing the rolling maximum with 1.
is_within_flag_window <- function(flag, n) {
  # partial = TRUE keeps the shortened windows at both ends of the vector.
  # The literal TRUE is used because `T` is an ordinary variable that can be
  # reassigned.
  zoo::rollapply(flag, width = (2 * n) + 1, partial = TRUE, FUN = max) == 1
}
df %>%
mutate(include = ifelse(flag == 1, 1,
ifelse(is_within_flag_window(flag, 2), 0,
1)))
Use which and outer.
df$include[outer(which(df$flag==1), -2:2, `+`)] <- 0
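Because the offsets -2:2 include 0, the line above also sets include to 0 on the flagged rows themselves (and on any flagged row that lies within one or two positions of another flagged row), so restore the ones that were overwritten. Note this step is critical in case the "flag" windows overlap in a particular range.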
df$include[which(df$flag==1)] <- 1
flag include
1 0 1
2 0 0
3 0 0
4 1 1
5 0 0
6 0 0
7 0 0
8 1 1
9 0 0
10 0 0
11 0 1
12 0 1
13 0 1
14 0 0
15 0 0
16 1 1
17 0 0
18 0 0
19 0 1
20 0 1
If flag = 1 within one or two rows of the beginning or end of the dataset, R will throw errors. Use this:
## assign i for convenience/readability
i <- pmax(1, pmin(nrow(df), outer(which(df$flag==1), -2:2, `+`)))
df$include[i] <- 0
Restore 1s as before

Resources