find a sequence of rules in a dataframe, with break rules

find a sequence of rules in a dataframe, with break rules - r

I showed how I see the implementation of this algorithm, I divided it into two steps
step one sequence search
step two check break rules
set.seed(123)
dat <- as.data.frame(matrix(sample(10,60,replace = T),ncol = 3))
colnames(dat) <- LETTERS[1:ncol(dat)]
dat
rule <- c("A==0","A==10 & B==4","C==9","A>10","B<0","C==0","A==5","A>10",
"B<0","C==0","A==9 & B==9","A>10","B<0","A==10","A==7 & B==5")
action <- c("break","next","next",rep("break",3),"next",rep("break",3),
"next",rep("break",3) ,"next")
rule <- cbind(rule,action)

I think this works -
seq_rule <- function(dat, rule, res.only = TRUE) {
value = rule$action
rule <- rule$rule
m <- with(dat, lapply(rule, function(r) eval(str2expression(r))))
fu <- function(x, y) {
k <- which(y)
ifelse(all(k <= x), NA, min(k[k > x]))
}
idx <- Reduce(fu , m,init = 0, accumulate = TRUE)[-1]
if (!res.only) {
idx <- na.omit(idx)
fidx <- head(idx, length(rule))
debug.vec <- replace(rep("no", nrow(dat)), fidx, rule[seq_along(fidx)])
return(cbind(dat, debug.vec))
}
if(any(value[!is.na(idx)] == 'break')) return(FALSE)
idx <- na.omit(idx)
length(idx) >= length(rule)
}
Here are some checks -
rule <- data.frame(rule= c("A==9","B==4","C==4","A==4", "B==10","C==4") ,
action= c(rep("next",3),"break","break","next"))
seq_rule(dat = dat,rule = rule)
#[1] FALSE
rule <- data.frame(rule= c("C==9","B==3","C==4"),
action= c(rep("next",3)))
seq_rule(dat = dat,rule = rule)
#[1] TRUE
seq_rule(dat = dat,rule = rule, res.only = FALSE)
# A B C debug.vec
#1 3 5 9 C==9
#2 3 3 3 B==3
#3 10 9 4 C==4
#4 2 9 1 no
#5 6 9 7 no
#6 5 3 5 no
#7 4 8 10 no
#8 6 10 7 no
#9 9 7 9 no
#10 10 10 9 no
rule <- data.frame(rule= c("C==9","B==3","C==4", "A == 1"),
action= c(rep("next",3), 'break'))
seq_rule(dat = dat,rule = rule)
#[1] FALSE
rule <- data.frame(rule= c("C==9","B==3","C==4", "A == 6"),
action= c(rep("next",3), 'break'))
seq_rule(dat = dat,rule = rule)
#[1] FALSE

Since the logic of your question is a bit complicated, I guess a straightforward way, e.g., using loops, might be more efficient and readable. Here is one version of seq_rule
seq_rule <- function(dat, rule, res.only = TRUE) {
m <- with(dat, as.data.frame(sapply(rule$rule, function(r) eval(str2expression(r)))))
rule_next <- with(rule, rule[action == "next"])
m_next <- m[rule_next]
idx <- na.omit(
Reduce(
function(x, y) {
k <- which(y)
ifelse(all(k <= x), NA, min(k[k > x]))
}, m_next,
init = 0, accumulate = TRUE
)
)[-1]
fidx <- head(idx, length(rule_next))
debug.vec <- replace(rep("no", nrow(dat)), fidx, rule_next[seq_along(fidx)])
trgs <- do.call(
rbind,
Map(
function(p, q) {
u <- as.matrix(m[p, ][q[q %in% with(rule, rule[action == "break"])]])
k <- which(u, arr.ind = TRUE)
data.frame(breakRowID = row.names(u)[k[, "row"]], breakTrigger = colnames(u)[k[, "col"]])
},
split(1:nrow(dat), cut(1:nrow(dat), c(0, idx, Inf))),
split.default(names(m), cumsum(rule$action != "break"))
)
)
triggerBreaks <- replace(rep("no", nrow(dat)), debug.vec != "no", NA)
if (!res.only) {
cbind(dat, debug.vec, trigger.break = with(trgs, replace(triggerBreaks, as.numeric(breakRowID), breakTrigger)))
} else {
nrow(trgs) == 0
}
}
and you will see
> seq_rule(dat = dat, rule = rule)
[1] FALSE
> seq_rule(dat = dat, rule = rule, res.only = FALSE)
A B C debug.vec trigger.break
1 3 9 2 no no
2 3 3 1 no no
3 10 4 9 A==10 & B==4 <NA>
4 2 1 9 C==9 <NA>
5 6 7 6 no no
6 5 5 5 A==5 <NA>
7 4 10 9 no no
8 6 7 10 no no
9 9 9 4 A==9 & B==9 <NA>
10 10 9 6 no A==10
11 5 10 8 no no
12 3 7 6 no no
13 9 5 6 no no
14 9 7 7 no no
15 9 5 1 no no
16 3 6 6 no no
17 8 9 2 no no
18 10 2 1 no A==10
19 7 5 2 A==7 & B==5 <NA>
20 10 8 4 no no

I want to say a huge thank you to everyone who tried to help me, as well as for unlimited patience ..
But it was impossible to help me because I myself did not fully understand what I wanted. Instead of breaking the question into several parts and asking separately (as it should be), I asked a big difficult question that I could hardly explain to myself.
I am very very sorry for that.
Here is my answer, this is what I wanted to get in the end.
seq_rule2 <- function(dat , rule ,res.only = TRUE){
# This is a fast function written by Thomas here
# https://stackoverflow.com/questions/68625542/match-all-logic-rules-with-a-dataframe-need-super-fast-function
# as an answer to my earlier question.
# It takes the rules as a vector and looks for the sequence
seq_rule <- function(dat, rule, res.only = TRUE) {
m <- with(dat, lapply(rule, function(r) eval(str2expression(r))))
fu <- function(x, y) {
k <- which(y)
ifelse(all(k <= x), NA, min(k[k > x]))
}
idx <- na.omit(Reduce( fu, m,init = 0, accumulate = TRUE ))[-1]
if (!res.only) {
fidx <- head(idx, length(rule))
debug.vec <- replace(rep("no", nrow(dat)), fidx, rule[seq_along(fidx)])
return(cbind(dat, debug.vec))
}
length(idx) >= length(rule)
}
#if there is only one next rule, then there is no point in continuing to return the FALSE and finish completely
if( length(rule$rule[rule$action=="next"]) <= 1 ) return(FALSE)
# STEP 1
# run seq_rule
yes.next.rule.seq <- seq_rule(dat = dat , rule = rule$rule[rule$action=="next"] , res.only = T)
if(res.only==FALSE & yes.next.rule.seq==FALSE) {
Next <- rep("no",nrow(dat))
Break <- rep("no",nrow(dat))
dat <- cbind(dat,Next=Next, Break=Break)
return(dat)
}
if(res.only==TRUE & yes.next.rule.seq==FALSE) return(FALSE)
# if the seq_rule found the sequence (TRUE) but there are no "break rules" in the "rule",
# then there is no point in searching for "break rules". Return TRUE and finish completely
if( length(rule$rule[rule$action=="break"]) == 0 & yes.next.rule.seq == TRUE) return(TRUE)
# STEP 2
#looking for break rules in the range between next rules
if(yes.next.rule.seq){
#get indices where the "next rules" triggered in dat
deb.vec <- seq_rule(dat = dat , rule = rule$rule[rule$action=="next"] , res.only = F)[,"debug.vec"]
idx.next.rules <- which(deb.vec!="no")
#get indices where the "break rules" triggered in dat
m <- with(dat, lapply(rule$rule[rule$action=="break"], function(r) eval(str2expression(r))))
idx.break.rules <- unlist(lapply(m,which))
# RES the final result is equal to TRUE,
# but if a "break rule" is found between the "next rules",
# then the RES will be false
RES <- TRUE
# sliding window of two "next rules" http://prntscr.com/1qhnzae
for(i in 2:length(idx.next.rules)){
temp.range <- idx.next.rules[ (i-1):i ]
# Check if there is any "break rule" index between the "next rule" indexes
break.detect <- any( idx.break.rules > temp.range[1] & idx.break.rules < temp.range[2] )
if( break.detect ) RES <- FALSE ; break
}
}
if(!res.only) {
Next <- rep("no",nrow(dat)) ; Next[idx.next.rules] <- "yes"
Break <- rep("no",nrow(dat)) ; Break[idx.break.rules] <- "yes"
dat <- cbind(dat,Next=Next, Break=Break)
return(dat)
}
return(RES)
}
data for to check
set.seed(963)
dat <- as.data.frame(matrix(sample(10,30,replace = T),ncol = 3))
colnames(dat) <- LETTERS[1:ncol(dat)]
rule <- cbind.data.frame(rule= c("A==9","B==4","C==4","A==4") ,
action= c("next","break","break","next"))
rule <- as.data.frame(rule,stringsAsFactors = F)
seq_rule2(dat = dat, rule = rule)
dat
rule
for example no breaks set.seed(963)
http://prntscr.com/1qhprxq
with break set.seed(930) http://prntscr.com/1qhpv2h

Related

Error in if (a[i][j] > 4) { : missing value where TRUE/FALSE needed

Find the number of entries in each row which are greater than 4.
set.seed(75)
aMat <- matrix( sample(10, size=60, replace=T), nr=6)
rowmax=function(a)
{
x=nrow(a)
y=ncol(a)
i=1
j=1
z=0
while (i<=x) {
for(j in 1:y) {
if(!is.na(a[i][j])){
if(a[i][j]>4){
z=z+1
}
}
j=j+1
}
print(z)
i=i+1
}
}
rowmax(aMat)
It is showing the error. I don't want to apply in built function

You could do this easier counting the x that are greater than 4 using length.
rowmax2 <- function(x) apply(x, 1, function(x) {x <- na.omit(x);length(x[x > 4])})
rowmax2(aMat)
# [1] 8 7 8 7 4 3

If you wanted to do this absolutely without any shortcut you could use two for loops. 1 for each row and another for each value in the row.
rowmax = function(a) {
y=nrow(a)
result <- numeric(y)
for(j in seq_len(y)) {
count = 0
for(val in a[j, ]) {
if(!is.na(val) && val > 4)
count = count + 1
}
result[j] <- count
}
return(result)
}
rowmax(aMat)
#[1] 8 7 8 7 4 3
If you wanted to do this using in-built functions in base R you could use rowSums.
rowSums(aMat > 4, na.rm = TRUE)
#[1] 8 7 8 7 4 3

There are several errors in you code:
You should put z <- 0 inside while loop
You should use a[i,j] for the matrix indexing, rather than a[i][j]
Below is a version after fixing the problems
rowmax <- function(a) {
x <- nrow(a)
y <- ncol(a)
i <- 1
j <- 1
while (i <= x) {
z <- 0
for (j in 1:y) {
if (!is.na(a[i, j])) {
if (a[i, j] > 4) {
z <- z + 1
}
}
j <- j + 1
}
print(z)
i <- i + 1
}
}
and then we get
> rowmax(aMat)
[1] 8
[1] 7
[1] 8
[1] 7
[1] 4
[1] 3
A concise approach to make it is using rowSums, e.g.,
rowSums(aMat, na.rm = TRUE)

how to implement dot placeholder in custom pipe?

I'm writing my custom pipe and I would like it to work similar to %>%:
using this:
abc <- c(1,2,3,4,5)
abc %mypipe% data.frame(a = ., b= 2 *., c = 3*.) %mypipe% filter(b>6)
I'd like to get this:
a b c
1 4 8 12
2 5 10 15
3 6 12 18
4 7 14 21
I was able to define %mypipe%, but my code fails when I use dot.
`%mypipe%` <-
function (lhs, rhs) {
rhs_call <- substitute(rhs)
eval(rhs_call, envir = list(. = lhs), enclos = parent.frame())
}

Implementing the dot placeholder is harder than implementing the pipe, since you need to recursively replace captured dots in expressions within the pipe calls. It all gets a bit difficult to reason about. You even have to recursively build and eval calls to your own dot-replacing function inside itself.
The following is a (roughly) working implementation:
replace_dot <- function(x, y) {
x <- match.call()$x
y <- match.call()$y
if(is.symbol(x))
if(as.character(x) == ".")
return(y)
else return(x)
if(is.call(x))
{
x_list <- as.list(x)[-1]
if(length(x_list) > 0)
if(as.character(x_list[[1]])[1] == "." &
as.character(as.list(x)[[1]])[1] != "$")
x_list <- x_list[-1]
for(i in seq_along(x_list)) {
x_list[[i]] <- eval(as.call(list(quote(replace_dot),
x = x_list[[i]], y = substitute(y))))
}
return(as.call(c(x[[1]], x_list)))
}
else return(x)
}
`%mypipe%` <- function(a, b)
{
mc <- as.list(match.call())
mc$b <- eval(as.call(list(quote(replace_dot), x = mc$b, y = substitute(a))))
mc$b <- as.call(append(as.list(mc$b), mc$a, 1))
eval(as.call(mc$b), envir = parent.frame())
}
So now we can do:
abc %mypipe% data.frame(a = ., b= 2 *., c = 3*.) %mypipe% select(c,b)
#> c b
#> 1 3 2
#> 2 6 4
#> 3 9 6
#> 4 12 8
#> 5 15 10

How to get rid of that inner condition?

If argument negation is true then the condition should be negated. Is there a more convenient way to write this?
foo <- function (x, type, negation){
if(type == 1){
condition <- x > 1
if(negation){
condition <- !condition
}
}
if(type == 2){
condition <- x == 5
if(negation){
condition <- !condition
}
}
x[condition]
}
EDIT:
example:
x <- 1:10
foo(x, 1, T) # 1
foo(x, 1, F) # 2 3 4 5 6 7 8 9 10
foo(x, 2, T) # 1 2 3 4 6 7 8 9 10
foo(x, 2, F) # 5

If there will be many types in future, consider using S3 OOP system.
If not:
foo <- function(x, type, negation) {
condition <- switch(
type,
`1` = x > 1,
`2` = x == 5
)
x[xor(negation, condition)]
}

(after #PoGibas comment):
foo <- function (x, type, negation){
if(type == 1){
condition <- x > 1
}
if(type == 2){
condition <- x == 5
}
if(negation){
condition <- !condition
}
x[condition]
}
any other ideas to improve it more?

Merge-sorting in R using lists instead of vectors

So i've written this basic code that sorts a list using the well-known merge-sorting algorithm, i've defined two functions mergelists that compares and merges the elements and mergesort that divides the list into single elements:
mergelists <- function(a,b) {
al <- length(a)
bl <- length(b)
r <- numeric(al+bl)
ai <- 1
bi <- 1
j <- 1
while((ai<=al) && (bi<=bl)) {
if(a[ai]<b[bi]) {
r[j] <- a[ai]
ai <- ai+1
} else {
r[j] <- b[bi]
bi <- bi+1
}
j <- j+1
}
if(ai<=al) r[j:(al+bl)] <- a[ai:al]
else if(bi<=bl) r[j:(al+bl)] <- b[bi:bl]
return(r)
}
mergesort <- function(x) {
l <- length(x)
if(l>1) {
p <- ceiling(l/2)
a <- mergesort(x[1:p])
b <- mergesort(x[(p+1):l])
return(mergelists(a,b))
}
return(x)
}
this seems to work fine for the examples i used so far, for example:
> mergesort(c(11,10,9,15,6,12,17,8,19,7))
[1] 6 7 8 9 10 11 12 15 17 19
now for the sake of some research i'm doing, i want to change this code to work with R-lists and not vectors, the lists are usually defined as following:
> list(number=10,data=c(10,5,8,2))
$number
[1] 10
$data
[1] 10 5 8 2
data represents here the vector and number is the number of comparaisons.
After the change i imagine that the program should give me something like this:
>mergelists(list(number=8,data=c(1,3,5,8,9,10)),list(number=5,data=c(2,4,6,7)))
$number
[1] 20
$data
[1] 1 2 3 4 5 6 7 8 9 10
> mergesort(c(11,10,9,15,6,12,17,8,19,7))
$number
[1] 22
$data
[1] 6 7 8 9 10 11 12 15 17 19
the 20 here is basically 8 + 5 + 7, because 7 comparaisons would be necessary to merge the two sorted lists, but i don't know how to do this because i have a little experience with R-lists. i would appreciate your help. Thanks.

The starting point for any vector vec is list(number = 0, data = vec), where number is 0 because it as taken 0 comparisons to start with an unsorted vector.
You first need to modify mergelists to deal with two lists, simply by adding the indexing and then reforming the list at the end.
mergelists <- function(a,b) {
firstn <- a$number + b$number
a <- a$data
b <- b$data
al <- length(a)
bl <- length(b)
r <- numeric(al+bl)
ai <- 1
bi <- 1
j <- 1
while((ai<=al) && (bi<=bl)) {
if(a[ai]<b[bi]) {
r[j] <- a[ai]
ai <- ai+1
} else {
r[j] <- b[bi]
bi <- bi+1
}
j <- j+1
}
if(ai<=al) r[j:(al+bl)] <- a[ai:al]
else if(bi<=bl) r[j:(al+bl)] <- b[bi:bl]
return(list(number = firstn + j - 1L, data = r))
}
mergelists(list(number=8,data=c(1,3,5,8,9,10)), list(number=5,data=c(2,4,6,7)))
# $number
# [1] 20
# $data
# [1] 1 2 3 4 5 6 7 8 9 10
Now that you have the "base function" defined, you need the calling function to generate the enhanced vector (list) and pass it accordingly. This function can easily be improved for efficiency, but I think its recursive properties are sound.
mergesort <- function(x) {
# this first guarantees that if called with a vector, it is list-ified,
# but if called with a list (i.e., every other time in the recursion),
# the argument is untouched
if (! is.list(x)) x <- list(number = 0, data = x)
l <- length(x$data)
if (l > 1) {
p <- ceiling(l/2)
# the `within(...)` trick is a sneaky trick, can easily be
# handled with pre-assignment/subsetting
a <- mergesort(within(x, { data <- data[1:p]; }))
b <- mergesort(within(x, { data <- data[(p+1):l]; }))
return(mergelists(a,b))
}
return(x)
}
mergesort(c(11,10,9,15,6,12,17,8,19,7))
# $number
# [1] 22
# $data
# [1] 6 7 8 9 10 11 12 15 17 19

Double sliding window

I want to create a double sliding window in a for loop. An example data set might look like:
a <- structure(list(a = c(0.0961136, 0.1028192, 0.1106424, 0.1106424,
0.117348, 0.117348, 0.117348, 0.122936, 0.1307592, 0.1307592,
0.1318768, 0.1318768, 0.1385824, 0.1385824, 0.1318768, 0.1251712,
0.1251712, 0.1251712, 0.1251712, 0.1251712)), .Names = "a", row.names = c(NA,
-20L), class = "data.frame")
The code I have so far looks like this:
windowSize <- 5
windowStep <- 1
dat <- list()
for (i in seq(from = 1, to = nrow(a), by = windowStep)){
window1 <- a[i:windowSize, ]
window2 <- a[i:windowSize + windowSize, ]
if (median(window1) <= 0.12 && (median(window1) >= 0.08)) {
p <- "True"
} else
p <- "not"
dat[[i]] <- c(p)
}
result <- as.data.frame(do.call(rbind, dat))
This example shows that I require two windows of size 5 (data points) to slide one in front of the other by 1 data point at a time. This example does not utilize window 2 because it doesn't work!(I will need it to work eventually) However using just window1 to calculate the median (in this case) at each step works but the output is incorrect. The if statements ask that if the median of window 1 is between 0.08 and 0.12 then output "True" else "not."
Output for my for loop =
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
10 not
11 not
12 not
13 not
14 not
15 not
16 not
17 not
18 not
19 not
20 not
Correct output as checked using rollapply (and obviously can be seen by eye)
rollapply(a, 5, FUN = median, by = 1, by.column = TRUE, partial = TRUE, align = c("left"))
should be:
1 True
2 True
3 True
4 not
5 not
6 not
7 not
8 not
9 not
10 not
11 not
12 not
13 not
14 not
15 not
16 not
17 not
18 not
19 not
20 not
Could the solution remain as a for loop if possible as I have much more to add but need to get this right first. Thanks.

This gets close..modified from: https://stats.stackexchange.com/questions/3051/mean-of-a-sliding-window-in-r
windowSize <- 10
windowStep <- 1
Threshold <- 0.12
a <- as.vector(a)
data <- a
slideFunct <- function(data, windowSize, WindowStep){
total <- length(data)
dataLength <- seq(from=1, to=(total-windowSize), by=windowStep)
result <- vector(length = length(dataLength))
for(i in 1:length(dataLength)){
result[i] <- if (median(data[dataLength[i]:(dataLength[i]+windowSize)]) <= Threshold)
result[i] <- "True"
else
result[i] <- "not"
}
return(result)
}

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

find a sequence of rules in a dataframe, with break rules - r

Related

Error in if (a[i][j] > 4) { : missing value where TRUE/FALSE needed

how to implement dot placeholder in custom pipe?

How to get rid of that inner condition?

Merge-sorting in R using lists instead of vectors

Double sliding window

Categories

Resources