how to implement dot placeholder in custom pipe? - r

I'm writing my custom pipe and I would like it to work similar to %>%:
using this:
abc <- c(1,2,3,4,5)
abc %mypipe% data.frame(a = ., b= 2 *., c = 3*.) %mypipe% filter(b>6)
I'd like to get this:
a b c
1 4 8 12
2 5 10 15
3 6 12 18
4 7 14 21
I was able to define %mypipe%, but my code fails when I use dot.
`%mypipe%` <-
function (lhs, rhs) {
rhs_call <- substitute(rhs)
eval(rhs_call, envir = list(. = lhs), enclos = parent.frame())
}

Implementing the dot placeholder is harder than implementing the pipe, since you need to recursively replace captured dots in expressions within the pipe calls. It all gets a bit difficult to reason about. You even have to recursively build and eval calls to your own dot-replacing function inside itself.
The following is a (roughly) working implementation:
replace_dot <- function(x, y) {
x <- match.call()$x
y <- match.call()$y
if(is.symbol(x))
if(as.character(x) == ".")
return(y)
else return(x)
if(is.call(x))
{
x_list <- as.list(x)[-1]
if(length(x_list) > 0)
if(as.character(x_list[[1]])[1] == "." &
as.character(as.list(x)[[1]])[1] != "$")
x_list <- x_list[-1]
for(i in seq_along(x_list)) {
x_list[[i]] <- eval(as.call(list(quote(replace_dot),
x = x_list[[i]], y = substitute(y))))
}
return(as.call(c(x[[1]], x_list)))
}
else return(x)
}
`%mypipe%` <- function(a, b)
{
mc <- as.list(match.call())
mc$b <- eval(as.call(list(quote(replace_dot), x = mc$b, y = substitute(a))))
mc$b <- as.call(append(as.list(mc$b), mc$a, 1))
eval(as.call(mc$b), envir = parent.frame())
}
So now we can do:
abc %mypipe% data.frame(a = ., b= 2 *., c = 3*.) %mypipe% select(c,b)
#> c b
#> 1 3 2
#> 2 6 4
#> 3 9 6
#> 4 12 8
#> 5 15 10

Related

Assign values to slice of data.frame column returned from function in R

I have a function that does some (complex) selecting of rows, and returns the corresponding values from one column. I then want to overwrite these values.
MWE
x = data.frame(a=c(1,2,3),b=c(4,5,6))
f = function(x,i){ return(x[x$a==i,'b']) }
f(x,2) <- 3
throws:
Error in f(x, 2) = 3 : could not find function "f<-"
Is there a way to assign these values from the function return?
No tidyverse please. Only base R.
The function should be
f <- function(x, i, val) {
if(missing(val)) {
x<- x[x$a==i,]
} else {
x$b[x$a ==i] <- val
}
return(x)
}
Then, when we run the code
> f(x, 2, 3)
a b
1 1 4
2 2 3
3 3 6
> f(x, 2) # missing the val
a b
2 2 5
If we want to update the object, use <-
x <- f(x, 2, 3)
An alternative would be to write your function in two ways: the original one, and a specific assignment function, so that the R parser will work on your original syntax:
f <- function(x, i) {
return(x[x$a == i, 'b'])
}
`f<-` <- function(x, i, value) {
x[x$a == i, 'b'] <- value
return(x)
}
So now you can do:
f(x, 2)
#> [1] 5
f(x, 2) <- 3
x
#> a b
#> 1 1 4
#> 2 2 3
#> 3 3 6
f(x, 2)
#> [1] 3

find a sequence of rules in a dataframe, with break rules

I showed how I see the implementation of this algorithm, I divided it into two steps
step one sequence search
step two check break rules
set.seed(123)
dat <- as.data.frame(matrix(sample(10,60,replace = T),ncol = 3))
colnames(dat) <- LETTERS[1:ncol(dat)]
dat
rule <- c("A==0","A==10 & B==4","C==9","A>10","B<0","C==0","A==5","A>10",
"B<0","C==0","A==9 & B==9","A>10","B<0","A==10","A==7 & B==5")
action <- c("break","next","next",rep("break",3),"next",rep("break",3),
"next",rep("break",3) ,"next")
rule <- cbind(rule,action)
I think this works -
seq_rule <- function(dat, rule, res.only = TRUE) {
value = rule$action
rule <- rule$rule
m <- with(dat, lapply(rule, function(r) eval(str2expression(r))))
fu <- function(x, y) {
k <- which(y)
ifelse(all(k <= x), NA, min(k[k > x]))
}
idx <- Reduce(fu , m,init = 0, accumulate = TRUE)[-1]
if (!res.only) {
idx <- na.omit(idx)
fidx <- head(idx, length(rule))
debug.vec <- replace(rep("no", nrow(dat)), fidx, rule[seq_along(fidx)])
return(cbind(dat, debug.vec))
}
if(any(value[!is.na(idx)] == 'break')) return(FALSE)
idx <- na.omit(idx)
length(idx) >= length(rule)
}
Here are some checks -
rule <- data.frame(rule= c("A==9","B==4","C==4","A==4", "B==10","C==4") ,
action= c(rep("next",3),"break","break","next"))
seq_rule(dat = dat,rule = rule)
#[1] FALSE
rule <- data.frame(rule= c("C==9","B==3","C==4"),
action= c(rep("next",3)))
seq_rule(dat = dat,rule = rule)
#[1] TRUE
seq_rule(dat = dat,rule = rule, res.only = FALSE)
# A B C debug.vec
#1 3 5 9 C==9
#2 3 3 3 B==3
#3 10 9 4 C==4
#4 2 9 1 no
#5 6 9 7 no
#6 5 3 5 no
#7 4 8 10 no
#8 6 10 7 no
#9 9 7 9 no
#10 10 10 9 no
rule <- data.frame(rule= c("C==9","B==3","C==4", "A == 1"),
action= c(rep("next",3), 'break'))
seq_rule(dat = dat,rule = rule)
#[1] FALSE
rule <- data.frame(rule= c("C==9","B==3","C==4", "A == 6"),
action= c(rep("next",3), 'break'))
seq_rule(dat = dat,rule = rule)
#[1] FALSE
Since the logic of your question is a bit complicated, I guess a straightforward way, e.g., using loops, might be more efficient and readable. Here is one version of seq_rule
seq_rule <- function(dat, rule, res.only = TRUE) {
m <- with(dat, as.data.frame(sapply(rule$rule, function(r) eval(str2expression(r)))))
rule_next <- with(rule, rule[action == "next"])
m_next <- m[rule_next]
idx <- na.omit(
Reduce(
function(x, y) {
k <- which(y)
ifelse(all(k <= x), NA, min(k[k > x]))
}, m_next,
init = 0, accumulate = TRUE
)
)[-1]
fidx <- head(idx, length(rule_next))
debug.vec <- replace(rep("no", nrow(dat)), fidx, rule_next[seq_along(fidx)])
trgs <- do.call(
rbind,
Map(
function(p, q) {
u <- as.matrix(m[p, ][q[q %in% with(rule, rule[action == "break"])]])
k <- which(u, arr.ind = TRUE)
data.frame(breakRowID = row.names(u)[k[, "row"]], breakTrigger = colnames(u)[k[, "col"]])
},
split(1:nrow(dat), cut(1:nrow(dat), c(0, idx, Inf))),
split.default(names(m), cumsum(rule$action != "break"))
)
)
triggerBreaks <- replace(rep("no", nrow(dat)), debug.vec != "no", NA)
if (!res.only) {
cbind(dat, debug.vec, trigger.break = with(trgs, replace(triggerBreaks, as.numeric(breakRowID), breakTrigger)))
} else {
nrow(trgs) == 0
}
}
and you will see
> seq_rule(dat = dat, rule = rule)
[1] FALSE
> seq_rule(dat = dat, rule = rule, res.only = FALSE)
A B C debug.vec trigger.break
1 3 9 2 no no
2 3 3 1 no no
3 10 4 9 A==10 & B==4 <NA>
4 2 1 9 C==9 <NA>
5 6 7 6 no no
6 5 5 5 A==5 <NA>
7 4 10 9 no no
8 6 7 10 no no
9 9 9 4 A==9 & B==9 <NA>
10 10 9 6 no A==10
11 5 10 8 no no
12 3 7 6 no no
13 9 5 6 no no
14 9 7 7 no no
15 9 5 1 no no
16 3 6 6 no no
17 8 9 2 no no
18 10 2 1 no A==10
19 7 5 2 A==7 & B==5 <NA>
20 10 8 4 no no
I want to say a huge thank you to everyone who tried to help me, as well as for unlimited patience ..
But it was impossible to help me because I myself did not fully understand what I wanted. Instead of breaking the question into several parts and asking separately (as it should be), I asked a big difficult question that I could hardly explain to myself.
I am very very sorry for that.
Here is my answer, this is what I wanted to get in the end.
seq_rule2 <- function(dat , rule ,res.only = TRUE){
# This is a fast function written by Thomas here
# https://stackoverflow.com/questions/68625542/match-all-logic-rules-with-a-dataframe-need-super-fast-function
# as an answer to my earlier question.
# It takes the rules as a vector and looks for the sequence
seq_rule <- function(dat, rule, res.only = TRUE) {
m <- with(dat, lapply(rule, function(r) eval(str2expression(r))))
fu <- function(x, y) {
k <- which(y)
ifelse(all(k <= x), NA, min(k[k > x]))
}
idx <- na.omit(Reduce( fu, m,init = 0, accumulate = TRUE ))[-1]
if (!res.only) {
fidx <- head(idx, length(rule))
debug.vec <- replace(rep("no", nrow(dat)), fidx, rule[seq_along(fidx)])
return(cbind(dat, debug.vec))
}
length(idx) >= length(rule)
}
#if there is only one next rule, then there is no point in continuing to return the FALSE and finish completely
if( length(rule$rule[rule$action=="next"]) <= 1 ) return(FALSE)
# STEP 1
# run seq_rule
yes.next.rule.seq <- seq_rule(dat = dat , rule = rule$rule[rule$action=="next"] , res.only = T)
if(res.only==FALSE & yes.next.rule.seq==FALSE) {
Next <- rep("no",nrow(dat))
Break <- rep("no",nrow(dat))
dat <- cbind(dat,Next=Next, Break=Break)
return(dat)
}
if(res.only==TRUE & yes.next.rule.seq==FALSE) return(FALSE)
# if the seq_rule found the sequence (TRUE) but there are no "break rules" in the "rule",
# then there is no point in searching for "break rules". Return TRUE and finish completely
if( length(rule$rule[rule$action=="break"]) == 0 & yes.next.rule.seq == TRUE) return(TRUE)
# STEP 2
#looking for break rules in the range between next rules
if(yes.next.rule.seq){
#get indices where the "next rules" triggered in dat
deb.vec <- seq_rule(dat = dat , rule = rule$rule[rule$action=="next"] , res.only = F)[,"debug.vec"]
idx.next.rules <- which(deb.vec!="no")
#get indices where the "break rules" triggered in dat
m <- with(dat, lapply(rule$rule[rule$action=="break"], function(r) eval(str2expression(r))))
idx.break.rules <- unlist(lapply(m,which))
# RES the final result is equal to TRUE,
# but if a "break rule" is found between the "next rules",
# then the RES will be false
RES <- TRUE
# sliding window of two "next rules" http://prntscr.com/1qhnzae
for(i in 2:length(idx.next.rules)){
temp.range <- idx.next.rules[ (i-1):i ]
# Check if there is any "break rule" index between the "next rule" indexes
break.detect <- any( idx.break.rules > temp.range[1] & idx.break.rules < temp.range[2] )
if( break.detect ) RES <- FALSE ; break
}
}
if(!res.only) {
Next <- rep("no",nrow(dat)) ; Next[idx.next.rules] <- "yes"
Break <- rep("no",nrow(dat)) ; Break[idx.break.rules] <- "yes"
dat <- cbind(dat,Next=Next, Break=Break)
return(dat)
}
return(RES)
}
data for to check
set.seed(963)
dat <- as.data.frame(matrix(sample(10,30,replace = T),ncol = 3))
colnames(dat) <- LETTERS[1:ncol(dat)]
rule <- cbind.data.frame(rule= c("A==9","B==4","C==4","A==4") ,
action= c("next","break","break","next"))
rule <- as.data.frame(rule,stringsAsFactors = F)
seq_rule2(dat = dat, rule = rule)
dat
rule
for example no breaks set.seed(963)
http://prntscr.com/1qhprxq
with break set.seed(930) http://prntscr.com/1qhpv2h

functional programing, iterations of a loop within child frame

I would like to build a function to use within a loop function, however the child cannot find the i from the parent function's for (i in 1:3). How would I avoid this problem? I was thinking it could be be solved with assign(X, envir=sys.frame(which=-1)) , although I'm not sure where to place it or the frame to select in this case...
#simplified issue
parent.fxn = function(X) {
for (i in 1:3)
child.fxn(X)
}
child.fxn = function(X) {
return(X[i])
}
#ex. run_script
#parent.fxn(1:10)
#Error in child.fxn(X) : object 'i' not found
For reference, here is a version of the actual script I tried to write (below)... the error from my script (below) was very slightly different from my simplified example (above), although I'm not sure why.
library(cluster)
library(data.table)
library(magrittr)
data = data.table(x=runif(300), y=runif(300), z=runif(300))
master.fxn = function(DT, nlvls=3, data.cols=c("x", "y", "z")) {
Lvls = paste0("Level_", 1:nlvls);
for (i in 1:nlvls) {
core.fxn(DT, Lvls, return.k=TRUE)
}
}
core.fxn = function(X, mkey, ...) {
X[, c(mkey[i]):=cluster.fxn(.SD, ...), .SDcols=data.cols, by=c(mkey[0:(i-1)])] %>%
setkeyv(c(mkey[1:i]))
}
cluster.fxn = function(X, return.ac=FALSE, return.k=FALSE) {
a = agnes(X, metric="euclidean", method="ward", stand=TRUE)
if (return.ac) return(a$ac)
if (return.k) return(cutree(a, k=2))
}
#ex. run_script
#master.fxn(data)
#Error in eval(bysub, parent.frame(), parent.frame()) : object 'i' not found
And finally, here is a working version of the actual script, which does not use a function that utilizes [i] within the loop, but achieves the desired outcome. These scripts are condensed to exemplify the error. It would really simplify things in the final product if I could get the core.fxn to work within the master.fxn loop.
library(cluster)
library(data.table)
library(magrittr)
data = data.table(x=runif(300), y=runif(300), z=runif(300))
master.fxn = function(DT, nlvls=3, data.cols=c("x", "y", "z")) {
Lvls = paste0("Level_", 1:nlvls);
for (i in 1:nlvls) {
DT[, c(Lvls[i]):=cluster.fxn(.SD, return.k=TRUE), .SDcols=data.cols, by=c(Lvls[0:(i-1)])] %>%
setkeyv(c(Lvls[1:i]))
}
}
cluster.fxn = function(X, return.ac=FALSE, return.k=FALSE) {
a = agnes(X, metric="euclidean", method="ward", stand=TRUE)
if (return.ac) return(a$ac)
if (return.k) return(cutree(a, k=2))
}
#ex. run_script
#> data
# x y z
# 1: 0.1934689 0.67631296 0.3083592
# 2: 0.5267910 0.93186454 0.9583132
# 3: 0.5533244 0.37712457 0.4022132
# 4: 0.1886627 0.07535931 0.1171205
# 5: 0.7499003 0.90682684 0.6104284
# ---
#296: 0.7196245 0.80206991 0.6657839
#297: 0.2453930 0.06807955 0.8126690
#298: 0.3481978 0.23024162 0.4734052
#299: 0.2123976 0.27191432 0.1753336
#300: 0.7312911 0.89491793 0.5417281
#
#>master.fxn(data)
#data[, .SD[1], by=.(Level_1, Level_2, Level_3)]
#Level_1 Level_2 Level_3 x y z
#1: 1 1 1 0.0584953 0.77933040 0.76432541
#2: 1 1 2 0.1814877 0.65263178 0.41425295
#3: 1 2 1 0.9932725 0.99409350 0.96849477
#4: 1 2 2 0.9102010 0.76071068 0.69283525
#5: 2 1 1 0.9040033 0.85361443 0.30636660
#6: 2 1 2 0.8026868 0.08595128 0.43176372
#7: 2 2 1 0.2167962 0.73551203 0.01174373
#8: 2 2 2 0.5592571 0.84508641 0.37382253
#>

Merge-sorting in R using lists instead of vectors

So i've written this basic code that sorts a list using the well-known merge-sorting algorithm, i've defined two functions mergelists that compares and merges the elements and mergesort that divides the list into single elements:
mergelists <- function(a,b) {
al <- length(a)
bl <- length(b)
r <- numeric(al+bl)
ai <- 1
bi <- 1
j <- 1
while((ai<=al) && (bi<=bl)) {
if(a[ai]<b[bi]) {
r[j] <- a[ai]
ai <- ai+1
} else {
r[j] <- b[bi]
bi <- bi+1
}
j <- j+1
}
if(ai<=al) r[j:(al+bl)] <- a[ai:al]
else if(bi<=bl) r[j:(al+bl)] <- b[bi:bl]
return(r)
}
mergesort <- function(x) {
l <- length(x)
if(l>1) {
p <- ceiling(l/2)
a <- mergesort(x[1:p])
b <- mergesort(x[(p+1):l])
return(mergelists(a,b))
}
return(x)
}
this seems to work fine for the examples i used so far, for example:
> mergesort(c(11,10,9,15,6,12,17,8,19,7))
[1] 6 7 8 9 10 11 12 15 17 19
now for the sake of some research i'm doing, i want to change this code to work with R-lists and not vectors, the lists are usually defined as following:
> list(number=10,data=c(10,5,8,2))
$number
[1] 10
$data
[1] 10 5 8 2
data represents here the vector and number is the number of comparaisons.
After the change i imagine that the program should give me something like this:
>mergelists(list(number=8,data=c(1,3,5,8,9,10)),list(number=5,data=c(2,4,6,7)))
$number
[1] 20
$data
[1] 1 2 3 4 5 6 7 8 9 10
> mergesort(c(11,10,9,15,6,12,17,8,19,7))
$number
[1] 22
$data
[1] 6 7 8 9 10 11 12 15 17 19
the 20 here is basically 8 + 5 + 7, because 7 comparaisons would be necessary to merge the two sorted lists, but i don't know how to do this because i have a little experience with R-lists. i would appreciate your help. Thanks.
The starting point for any vector vec is list(number = 0, data = vec), where number is 0 because it as taken 0 comparisons to start with an unsorted vector.
You first need to modify mergelists to deal with two lists, simply by adding the indexing and then reforming the list at the end.
mergelists <- function(a,b) {
firstn <- a$number + b$number
a <- a$data
b <- b$data
al <- length(a)
bl <- length(b)
r <- numeric(al+bl)
ai <- 1
bi <- 1
j <- 1
while((ai<=al) && (bi<=bl)) {
if(a[ai]<b[bi]) {
r[j] <- a[ai]
ai <- ai+1
} else {
r[j] <- b[bi]
bi <- bi+1
}
j <- j+1
}
if(ai<=al) r[j:(al+bl)] <- a[ai:al]
else if(bi<=bl) r[j:(al+bl)] <- b[bi:bl]
return(list(number = firstn + j - 1L, data = r))
}
mergelists(list(number=8,data=c(1,3,5,8,9,10)), list(number=5,data=c(2,4,6,7)))
# $number
# [1] 20
# $data
# [1] 1 2 3 4 5 6 7 8 9 10
Now that you have the "base function" defined, you need the calling function to generate the enhanced vector (list) and pass it accordingly. This function can easily be improved for efficiency, but I think its recursive properties are sound.
mergesort <- function(x) {
# this first guarantees that if called with a vector, it is list-ified,
# but if called with a list (i.e., every other time in the recursion),
# the argument is untouched
if (! is.list(x)) x <- list(number = 0, data = x)
l <- length(x$data)
if (l > 1) {
p <- ceiling(l/2)
# the `within(...)` trick is a sneaky trick, can easily be
# handled with pre-assignment/subsetting
a <- mergesort(within(x, { data <- data[1:p]; }))
b <- mergesort(within(x, { data <- data[(p+1):l]; }))
return(mergelists(a,b))
}
return(x)
}
mergesort(c(11,10,9,15,6,12,17,8,19,7))
# $number
# [1] 22
# $data
# [1] 6 7 8 9 10 11 12 15 17 19

R - Looping through datasets and change column names

I'm trying to loop through a bunch of datasets and change columns in R.
I have a bunch of datasets, say a,b,c,etc, and all of them have three columns, say X, Y, Z.
I would like to change their names to be a_X, a_Y, a_Z for dataset a, and b_X, b_Y, b_Z for dataset b, and so on.
Here's my code:
name.list = ("a","b","c")
for(i in name.list){
names(i) = c(paste(i,"_X",sep = ""),paste(i,"_Y",sep = ""),paste(i,"_Y",sep = ""));
}
However, the code above doesn't work since i is in text format.
I've considered assign function but doesn't seem to fit as well.
I would appreciate if any ideas.
Something like this :
list2env(lapply(mget(name.list),function(dat){
colnames(dat) <- paste(nn,colnames(dat),sep='_')
dat
}),.GlobalEnv)
for ( i in name.list) {
assign(i, setNames( get(i), paste(i, names(get(i)), sep="_")))
}
> a
a_X a_Y a_Z
1 1 3 A
2 2 4 B
> b
b_X b_Y b_Z
1 1 3 A
2 2 4 B
> c
c_X c_Y c_Z
1 1 3 A
2 2 4 B
Here's some free data:
a <- data.frame(X = 1, Y = 2, Z = 3)
b <- data.frame(X = 4, Y = 5, Z = 6)
c <- data.frame(X = 7, Y = 8, Z = 9)
And here's a method that uses mget and a custom function foo
name.list <- c("a", "b", "c")
foo <- function(x, i) setNames(x, paste(name.list[i], names(x), sep = "_"))
list2env(Map(foo, mget(name.list), seq_along(name.list)), .GlobalEnv)
a
# a_X a_Y a_Z
# 1 1 2 3
b
# b_X b_Y b_Z
# 1 4 5 6
c
# c_X c_Y c_Z
# 1 7 8 9
You could also avoid get or mget by putting a, b, and c into their own environment (or even a list). You also wouldn't need the name.list vector if you go this route, because it's the same as ls(e)
e <- new.env()
e$a <- a; e$b <- b; e$c <- c
bar <- function(x, y) setNames(x, paste(y, names(x), sep = "_"))
list2env(Map(bar, as.list(e), ls(e)), .GlobalEnv)
Another perk of doing it this way is that you still have the untouched data frames in the environment e. Nothing was overwritten (check a versus e$a).

Resources