I want to write a decorator function that adds a counter to a function, counting the number of times it was called. E.g.
foo <- function(x) {x}
foo <- counter_decorator(foo)
foo(1)
foo(1)
# => the counter gets incremented with each call and has the value 2 now
The approach below basically works, but:
I want the inner function (which is returned by the decorator) to have the same formal arguments as the original function, and not just the ellipsis (...). I am not sure how to accomplish that. Any ideas?
Not sure if the whole approach is a good one. Alternatives or improvements are appreciated.
Here is what I did so far:
# Init or reset counter
counter_init <- function() {
  .counters <<- list()
}
# Decorate a function with a counter
#
# Each time the function is called, the counter is incremented
#
# fun: function to be decorated
# fun_name: name under which the call count is stored in the .counters list
#
counter_decorator <- function(fun, fun_name = NULL)
{
  # use function name if no name is passed explicitly
  if (is.null(fun_name)) {
    fun_name <- deparse(substitute(fun))
  }
  force(fun) # force the promise now so the wrapper calls the original, not itself
  function(...) { # ==> ellipsis not optimal!
    n <- .counters[[fun_name]]
    if (is.null(n)) {
      n <- 0
    }
    .counters[[fun_name]] <<- n + 1
    fun(...)
  }
}
Now let's create some functions and decorate them.
library(dplyr) # for pipe
# Create functions and decorate them with a counter
# create and decorate in second call
add_one <- function(x) {
  x + 1
}
add_one <- counter_decorator(add_one)
# create and decorate the piping way by passing the fun_name arg
add_two <- {function(x) {
  x + 2
}} %>% counter_decorator(fun_name = "add_two")
mean <- counter_decorator(mean)
counter_init()
for (i in 1:100) {
  add_one(1)
  add_two(1)
  mean(1)
}
What we get in the .counters list is
> .counters
$add_one
[1] 100
$add_two
[1] 100
$mean
[1] 100
which is basically what I want.
1) The trace command can be used. Use untrace to undo the trace or set .counter to any desired value to start over again from that value.
f <- function(x) x
trace(f, quote(.counter <<- .counter + 1), print = FALSE)
.counter <- 0
f(1)
## [1] 1
f(1)
## [1] 1
.counter
## [1] 2
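To stop counting, remove the tracer with untrace; the counter keeps its last value (a quick sketch continuing the example above):
untrace(f)
f(1)
## [1] 1
.counter
## [1] 2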
2) This variation stores the counter in an attribute of f.
f <- function(x) x
trace(f, quote(attr(f, "counter") <<- attr(f, "counter") + 1), print = FALSE)
attr(f, "counter") <- 0
f(1)
## [1] 1
f(1)
## [1] 1
attr(f, "counter")
## [1] 2
3) This variation stores the counter in an option.
f <- function(x) x
trace(f, quote(options(counter = getOption("counter", 0) + 1)), print = FALSE)
f(1)
## [1] 1
f(1)
## [1] 1
getOption("counter")
## [1] 2
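To reset this variant, remove the option again; getOption then falls back to its default (a sketch):
options(counter = NULL) # drop the option
getOption("counter", 0)
## [1] 0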
This method stores the counter within the wrapper function itself instead of somewhere in the user's environment or a package environment. (There's nothing wrong with a package environment; writing into the user's environment can be problematic or at least annoying/discourteous.)
The biggest side effect (liability?) of this is that when the package is detached or reloaded (i.e., during development), the counters are cleared/re-initialized.
counter_decorator <- function(fun) {
  .counter <- 0L                       # stored in the wrapper's closure
  fun2 <- function(...) {
    .counter <<- .counter + 1L
    cl <- match.call()
    cl[[1]] <- fun                     # swap the wrapper for the real function
    eval.parent(cl)
  }
  formals(fun2) <- formals(args(fun))  # give the wrapper the original formals
  fun2
}
Demo:
foo <- function(x, y) x + y
foo2 <- counter_decorator(foo)
get(".counter", envir = environment(foo2))
# [1] 0
foo2(5, 9)
# [1] 14
foo2(5, 11)
# [1] 16
foo2(5, 13)
# [1] 18
get(".counter", envir = environment(foo2))
# [1] 3
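Since the counter lives in the wrapper's closure environment, it can be reset the same way the demo reads it. A minimal sketch (counter_reset is just a hypothetical convenience helper, not part of the answer):
counter_reset <- function(f) assign(".counter", 0L, envir = environment(f))
counter_reset(foo2)
get(".counter", envir = environment(foo2))
# [1] 0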
Same formals:
formals(foo)
# $x
# $y
formals(foo2)
# $x
# $y
Edited (twice) to better track primitives where formals(.) is NULL; in that case, we can use formals(args(fun)).
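For example, with the primitive sum (a quick illustration):
formals(sum)       # NULL: primitives have no formals
formals(args(sum)) # list with $... and $na.rm, which the wrapper can take over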
Adapted for your preferred methodology, albeit with a little poetic liberty:
counters <- local({
  .counters <- list()
  function(init = FALSE) {
    out <- .counters # will return counters *before* initialization
    if (init) .counters <<- list()
    out
  }
})
counter_decorator <- function(fun, fun_name) {
  if (missing(fun_name)) {
    fun_name <- deparse(substitute(fun))
  }
  count <- get(".counters", envir = environment(counters))
  count[[fun_name]] <- 0L
  assign(".counters", count, envir = environment(counters))
  fun2 <- function(...) {
    .count <- get(".counters", envir = environment(counters))
    .count[[fun_name]] <- if (is.null(.count[[fun_name]])) 1L else .count[[fun_name]] + 1L
    assign(".counters", .count, envir = environment(counters))
    cl <- match.call()
    cl[[1]] <- fun
    eval.parent(cl)
  }
  formals(fun2) <- formals(args(fun))
  fun2
}
add_one <- function(x) {
  x + 1
}
add_one <- counter_decorator(add_one)
add_two <- {function(x) {
  x + 2
}} %>% counter_decorator(fun_name = "add_two")
new_mean <- counter_decorator(mean)
for (i in 1:100) {
  add_one(1)
  add_two(1)
  new_mean(1)
}
counters()
# $add_one
# [1] 100
# $add_two
# [1] 100
# $mean
# [1] 100
formals(new_mean)
# $x
# $...
Initialization is not strictly required. Re-initialization returns the counters before reinitializing, so you don't need a double call to get the values and then reset (and if you don't care about the previous values, just ignore the return value).
counters(TRUE)
# $add_one
# [1] 100
# $add_two
# [1] 100
# $mean
# [1] 100
counters()
# list()
add_one(10)
# [1] 11
counters()
# $add_one
# [1] 1
Is it possible to retrieve the function components of a function call? That is, is it possible to use as.list(match.call()) on another function call?
The background is that I want a function that takes a function call and returns the components of said call.
get_formals <- function(x) {
  # something here which would behave as if x were a function that returns
  # as.list(match.call())
}
get_formals(mean(1:10))
# expected to get:
# [[1]]
# mean
#
# $x
# 1:10
The expected result is to have get_formals return as if match.call() had been called within the supplied function call.
mean2 <- function(...) {
  as.list(match.call())
}
mean2(x = 1:10)
# [[1]]
# mean2
#
# $x
# 1:10
Another Example
The motivation behind this question is to check whether a memoised function already contains cached values. memoise has the function has_cache(), but it needs to be called in a specific way, has_cache(foo)(vals), e.g.,
library(memoise)
foo <- function(x) mean(x)
foo_cached <- memoise(foo)
foo_cached(1:10) # not yet cached
foo_cached(1:10) # cached
has_cache(foo_cached)(1:10) # TRUE
has_cache(foo_cached)(1:3) # FALSE
My goal is to log something if the function call is cached or not.
cache_wrapper <- function(f_call) {
  is_cached <- has_cache()() # INSERT SOLUTION HERE
  # I need to deconstruct the function call to pass it to has_cache,
  # basically
  #   has_cache(substitute(expr)[[1L]])(substitute(expr)[[2L]])
  # but names etc. do not get passed correctly
  if (is_cached) print("Using Cache") else print("New Evaluation of f_call")
  f_call
}
cache_wrapper(foo_cached(1:10))
#> [1] "Using Cache" # From the log-functionality
#> 5.5 # The result from the function-call
You can use match.call() to do argument matching.
get_formals <- function(expr) {
  call <- substitute(expr)
  call_matched <- match.call(eval(call[[1L]]), call)
  as.list(call_matched)
}
get_formals(mean(1:10))
# [[1]]
# mean
#
# $x
# 1:10
library(ggplot2)
get_formals(ggplot(mtcars, aes(x = mpg, y = hp)))
# [[1]]
# ggplot
#
# $data
# mtcars
#
# $mapping
# aes(x = mpg, y = hp)
library(dplyr)
get_formals(iris %>% select(Species))
# [[1]]
# `%>%`
#
# $lhs
# iris
#
# $rhs
# select(Species)
Edit: thanks to @KonradRudolph for the suggestion!
The function above cannot ensure it finds the right function: it searches in the scope of the parent of get_formals(), not in that of the caller. The much safer way is:
get_formals <- function(expr) {
  call <- substitute(expr)
  call_matched <- match.call(eval.parent(bquote(match.fun(.(call[[1L]])))), call)
  as.list(call_matched)
}
The match.fun() is important to correctly resolve functions that are shadowed by a non-function object of the same name. For example, if mean is overwritten by a vector
mean <- 1:5
the first version of get_formals() will raise an error, while the updated version still works.
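For instance, continuing from the assignment above (a sketch):
get_formals(mean(1:10)) # the match.fun() version still resolves base::mean
# [[1]]
# mean
#
# $x
# 1:10
rm(mean) # remove the shadowing binding again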
Here's a way to do it that also gets the default values from the function if you didn't supply all the arguments:
get_formals <- function(call)
{
  f_list <- as.list(match.call()$call)
  func_name <- f_list[[1]]
  p_list <- formals(eval(func_name))
  f_list <- f_list[-1]
  ss <- na.omit(match(names(p_list), names(f_list)))
  if(length(ss) > 0) {
    p_list[na.omit(match(names(f_list), names(p_list)))] <- f_list[ss]
    f_list <- f_list[-ss]
  }
  unnamed <- which(!nzchar(sapply(p_list, as.character)))
  if(length(unnamed) > 0)
  {
    i <- 1
    while(length(f_list) > 0)
    {
      p_list[[unnamed[i]]] <- f_list[[1]]
      f_list <- f_list[-1]
      i <- i + 1
    }
  }
  c(func_name, p_list)
}
Which gives:
get_formals(rnorm(1))
[[1]]
rnorm
$n
[1] 1
$mean
[1] 0
$sd
[1] 1
get_formals(ggplot2::ggplot())
[[1]]
ggplot2::ggplot
$data
NULL
$mapping
aes()
$...
$environment
parent.frame()
To get this to work one level in, you could do something like:
foo <- function(f_call) {
  eval(as.call(list(get_formals, call = match.call()$f_call)))
}
foo(mean(1:10))
[[1]]
mean
$x
1:10
$...
This answer is mostly based on Allen's answer, but implements Konrad's comment regarding the eval and eval.parent functions.
Additionally, some do.call is thrown in to finalise the cache_wrapper from the example above:
library(memoise)
foo <- function(x) mean(x)
foo_cached <- memoise(foo)
foo_cached(1:10) # not yet cached
#> [1] 5.5
foo_cached(1:10) # cached
#> [1] 5.5
has_cache(foo_cached)(1:10)
#> [1] TRUE
has_cache(foo_cached)(1:3)
#> [1] FALSE
# As answered by Allen, with Konrad's comment
get_formals <- function(call) {
  f_list <- as.list(match.call()$call)
  func_name <- f_list[[1]]
  # changed eval to eval.parent as suggested by Konrad...
  p_list <- formals(eval.parent(eval.parent(bquote(match.fun(.(func_name))))))
  f_list <- f_list[-1]
  ss <- na.omit(match(names(p_list), names(f_list)))
  if(length(ss) > 0) {
    p_list[na.omit(match(names(f_list), names(p_list)))] <- f_list[ss]
    f_list <- f_list[-ss]
  }
  unnamed <- which(!nzchar(sapply(p_list, as.character)))
  if(length(unnamed) > 0) {
    i <- 1
    while(length(f_list) > 0) {
      p_list[[unnamed[i]]] <- f_list[[1]]
      f_list <- f_list[-1]
      i <- i + 1
    }
  }
  c(func_name, p_list)
}
# check if the function works with has_cache
fmls <- get_formals(foo_cached(x = 1:10))
do.call(has_cache(eval(parse(text = fmls[1]))),
        fmls[2])
#> [1] TRUE
# implement a small wrapper around has_cache that reports if its using cache
cache_wrapper <- function(f_call) {
  fmls <- eval(as.call(list(get_formals, call = match.call()$f_call)))
  is_cached <- do.call(has_cache(eval(parse(text = fmls[1]))),
                       fmls[2])
  if (is_cached) print("Using Cache") else print("New Evaluation of f_call")
  f_call
}
cache_wrapper(foo_cached(x = 1:10))
#> [1] "Using Cache"
#> [1] 5.5
cache_wrapper(foo_cached(x = 1:30))
#> [1] "New Evaluation of f_call"
#> [1] 5.5
I would like to apply the same function a certain number of times to a vector, using the output from the function every time.
A simplified example with a simple function just to demonstrate:
# sample vector
a <- c(1,2,3)
# function to be applied n times
f1 <- function(x) {
  x^2 + x^3
}
I would like to apply f1 to a, n times; here, let's say 3 times.
I heard purrr::reduce or purrr::map() might be a good idea for this, but I couldn't make it work.
The desired output if n = 3 would be equal to f1(f1(f1(a))).
Let's use Reduce (no external library requirements, generally good performance). I'll modify the function slightly to accept a second (ignored) argument:
f1 <- function(x, ign) x^2 + x^3
Reduce(f1, 1:3, init = a)
# [1] 1.872000e+03 6.563711e+09 1.102629e+14
Here's what's happening. Reduce:
uses a binary function to successively combine the elements of a given vector and a possibly given initial value.
The first argument is the function to use, and it should accept two arguments. The first is the value from the previous execution of the function in this reduction. On the first call of the function, it uses the init= value provided.
First call:
f1(c(1,2,3), 1)
# [1] 2 12 36
Second call:
f1(c(2,12,36), 2)
# [1] 12 1872 47952
Third call:
f1(c(12,1872,47952), 3)
# [1] 1.872000e+03 6.563711e+09 1.102629e+14
The second argument 1:3 is used just for its length. Anything of the proper length will work.
If you don't want to redefine f1 just for this reduction, you can always do
Reduce(function(a,ign) f1(a), ...)
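Spelled out with the original one-argument f1, that looks like this (a sketch; the second argument to Reduce merely supplies the number of iterations):
f1 <- function(x) x^2 + x^3 # original one-argument version
Reduce(function(acc, ign) f1(acc), 1:3, init = a)
# [1] 1.872000e+03 6.563711e+09 1.102629e+14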
Benchmark:
library(microbenchmark)
library(magrittr) # %>% and extract2, used by the "accum" entry
library(purrr)    # accumulate
r <- Reduce(function(a,b) call("f1", a), 1:3, init=quote(a))
triple_f1 <- function(a) f1(f1(f1(a)))
microbenchmark::microbenchmark(
  base = Reduce(function(a,ign) f1(a), 1:3, a),
  accum = a %>% accumulate(~ .x %>% f1, .init = f1(a)) %>% extract2(3),
  reduc = purrr::reduce(1:3, function(a,ign) f1(a), .init=a),
  whil = {
    i <- 1
    a <- c(1,2,3)
    while (i < 10) {
      i <- i + 1
      a <- f1(a)
    }
  },
  forloop = {
    out <- a
    for(i in seq_len(3)) out <- f1(out)
  },
  evaluated = {
    r <- Reduce(function(a,b) call("f1", a), 1:3, init=quote(a))
    eval(r)
  },
  precompiled = eval(r),
  anotherfun = triple_f1(a)
)
# Unit: microseconds
# expr min lq mean median uq max neval
# base 5.101 7.3015 18.28691 9.3010 10.8510 848.302 100
# accum 294.201 328.4015 381.21204 356.1520 402.6510 823.602 100
# reduc 27.000 38.1005 57.55694 45.2510 54.2005 747.401 100
# whil 1717.300 1814.3510 1949.03100 1861.8510 1948.9510 2931.001 100
# forloop 1110.001 1167.1010 1369.87696 1205.5010 1292.6500 9935.501 100
# evaluated 6.702 10.2505 22.18598 13.3015 15.5510 715.301 100
# precompiled 2.300 3.2005 4.69090 4.0005 4.5010 26.800 100
# anotherfun 1.400 2.0515 12.85201 2.5010 3.3505 1017.801 100
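A plain while loop also works; as written, it applies f nine times, so adjust the bound to the count you need: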
i <- 1
while (i < 10) {
  i <- i + 1
  x <- f(x)
}
Here is an option with accumulate
library(tidyverse)
library(magrittr) # for extract2
n <- 3
a %>%
  accumulate(~ .x %>% f1, .init = f1(a)) %>%
  extract2(n)
#[1] 1.872000e+03 6.563711e+09 1.102629e+14
NOTE: accumulate is similar to the base R option Reduce with accumulate = TRUE
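For comparison, a base R sketch of the same idea, keeping all intermediate results (the last element is f1(f1(f1(a)))):
Reduce(function(x, ign) f1(x), 1:3, init = a, accumulate = TRUE)
# a list of 4 elements: a, f1(a), f1(f1(a)), f1(f1(f1(a)))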
checking with the OP's output
f1(f1(f1(a)))
#[1] 1.872000e+03 6.563711e+09 1.102629e+14
Or use a for loop (no external libraries used)
out <- a
for(i in seq_len(n)) out <- f1(out)
out
#[1] 1.872000e+03 6.563711e+09 1.102629e+14
Here's another way to do it with Reduce:
setting the stage
a <- 1:3
f1 <- function(x) x^2 + x^3
constructing a call and evaluating it
N <- 3 # how many times?
r <- Reduce(function(a,b) call("f1", a), rep(NA, N), init=a)
# f1(f1(f1(1:3)))
eval(r)
# [1] 1.872000e+03 6.563711e+09 1.102629e+14
alternative 2
# N defined as above
Reduce(function(x,y) y(x), replicate(N,f1), init=a)
# [1] 1.872000e+03 6.563711e+09 1.102629e+14
alternative 3 (recursive with a global-like variable)
doit <- function(N) {
  i <- 0
  function(fun, x){
    i <<- i + 1
    if(i < N) Recall(fun, fun(x)) else fun(x)
  }
}
doit(3)(f1, a)
# [1] 1.872000e+03 6.563711e+09 1.102629e+14
... or even
doit <- function(N, fun, x) (function(fun, x)
  if((N <<- N - 1) > 0)
    Recall(fun, fun(x))
  else
    fun(x))(fun, x)
doit(3, f1, a)
# [1] 1.872000e+03 6.563711e+09 1.102629e+14
I am trying to achieve functionality similar to unlist, with the exception that types are not coerced to a vector, but a list with preserved types is returned instead. For instance:
flatten(list(NA, list("TRUE", list(FALSE), 0L)))
should return
list(NA, "TRUE", FALSE, 0L)
instead of
c(NA, "TRUE", "FALSE", "0")
which would be returned by unlist(list(NA, list("TRUE", list(FALSE), 0L))).
As can be seen from the example above, the flattening should be recursive. Is there a function in the standard R library which achieves this, or at least some other function which can be used to easily and efficiently implement it?
UPDATE: I don't know if it is clear from the above, but non-lists should not be flattened, i.e. flatten(list(1:3, list(4, 5))) should return list(c(1, 2, 3), 4, 5).
Interesting non-trivial problem!
MAJOR UPDATE With all that's happened, I've rewritten the answer and removed some dead ends. I also timed the various solutions on different cases.
Here's the first, rather simple but slow, solution:
flatten1 <- function(x) {
  y <- list()
  rapply(x, function(x) y <<- c(y,x))
  y
}
rapply lets you traverse a list and apply a function to each leaf element. Unfortunately, it treats the return values exactly as unlist does. So I ignore the result from rapply and instead append values to the variable y via <<-.
Growing y in this manner is not very efficient (it's quadratic in time). So if there are many thousands of elements this will be very slow.
A more efficient approach is the following, with simplifications from @JoshuaUlrich:
flatten2 <- function(x) {
  len <- sum(rapply(x, function(x) 1L))
  y <- vector('list', len)
  i <- 0L
  rapply(x, function(x) { i <<- i+1L; y[[i]] <<- x })
  y
}
Here I first find out the result length and pre-allocate the vector. Then I fill in the values.
As you will see, this solution is much faster.
Here's a version of @JoshO'Brien's great solution based on Reduce, but extended so it handles arbitrary depth:
flatten3 <- function(x) {
  repeat {
    if(!any(vapply(x, is.list, logical(1)))) return(x)
    x <- Reduce(c, x)
  }
}
Now let the battle begin!
# Check correctness on original problem
x <- list(NA, list("TRUE", list(FALSE), 0L))
dput( flatten1(x) )
#list(NA, "TRUE", FALSE, 0L)
dput( flatten2(x) )
#list(NA, "TRUE", FALSE, 0L)
dput( flatten3(x) )
#list(NA_character_, "TRUE", FALSE, 0L)
# Time on a huge flat list
x <- as.list(1:1e5)
#system.time( flatten1(x) ) # Long time
system.time( flatten2(x) ) # 0.39 secs
system.time( flatten3(x) ) # 0.04 secs
# Time on a huge deep list
x <- 'leaf'; for(i in 1:11) { x <- list(left=x, right=x, value=i) }
#system.time( flatten1(x) ) # Long time
system.time( flatten2(x) ) # 0.05 secs
system.time( flatten3(x) ) # 1.28 secs
...So what we observe is that the Reduce solution is faster when the depth is low, and the rapply solution is faster when the depth is large!
As correctness goes, here are some tests:
> dput(flatten1( list(1:3, list(1:3, 'foo')) ))
list(1L, 2L, 3L, 1L, 2L, 3L, "foo")
> dput(flatten2( list(1:3, list(1:3, 'foo')) ))
list(1:3, 1:3, "foo")
> dput(flatten3( list(1:3, list(1:3, 'foo')) ))
list(1L, 2L, 3L, 1:3, "foo")
Unclear what result is desired, but I lean towards the result from flatten2...
For lists that are only a few nestings deep, you could use Reduce() and c() to do something like the following. Each application of c() removes one level of nesting. (For a fully general solution, see the EDITs below.)
L <- (list(NA, list("TRUE", list(FALSE), 0L)))
Reduce(c, Reduce(c, L))
[[1]]
[1] NA
[[2]]
[1] "TRUE"
[[3]]
[1] FALSE
[[4]]
[1] 0
# TIMING TEST
x <- as.list(1:4e3)
system.time(flatten(x)) # Using the improved version
# user system elapsed
# 0.14 0.00 0.13
system.time(Reduce(c, x))
# user system elapsed
# 0.04 0.00 0.03
EDIT Just for fun, here's a version of @Tommy's version of @JoshO'Brien's solution that does work for already flat lists. FURTHER EDIT Now @Tommy's solved that problem as well, but in a cleaner way. I'll leave this version in place.
flatten <- function(x) {
  x <- list(x)
  repeat {
    x <- Reduce(c, x)
    if(!any(vapply(x, is.list, logical(1)))) return(x)
  }
}
flatten(list(3, TRUE, 'foo'))
# [[1]]
# [1] 3
#
# [[2]]
# [1] TRUE
#
# [[3]]
# [1] "foo"
How about this? It builds on Josh O'Brien's solution, but does the recursion with a while loop instead, using unlist with recursive=FALSE.
flatten4 <- function(x) {
  while(any(vapply(x, is.list, logical(1)))) {
    # this next line gives behavior like Tommy's answer;
    # removing it gives behavior like Josh's
    x <- lapply(x, function(x) if(is.list(x)) x else list(x))
    x <- unlist(x, recursive=FALSE)
  }
  x
}
Keeping the commented line in gives results like this (which Tommy prefers, and so do I, for that matter).
> x <- list(1:3, list(1:3, 'foo'))
> dput(flatten4(x))
list(1:3, 1:3, "foo")
Output from my system, using Tommy's tests:
dput(flatten4(foo))
#list(NA, "TRUE", FALSE, 0L)
# Time on a huge flat list
x <- as.list(1:1e5)
system.time( x2 <- flatten2(x) ) # 0.48 secs
system.time( x3 <- flatten3(x) ) # 0.07 secs
system.time( x4 <- flatten4(x) ) # 0.07 secs
identical(x2, x4) # TRUE
identical(x3, x4) # TRUE
# Time on a huge deep list
x <- 'leaf'; for(i in 1:11) { x <- list(left=x, right=x, value=i) }
system.time( x2 <- flatten2(x) ) # 0.05 secs
system.time( x3 <- flatten3(x) ) # 1.45 secs
system.time( x4 <- flatten4(x) ) # 0.03 secs
identical(x2, unname(x4)) # TRUE
identical(unname(x3), unname(x4)) # TRUE
EDIT: As for getting the depth of a list, maybe something like this would work; it gets the index for each element recursively.
depth <- function(x) {
  foo <- function(x, i=NULL) {
    if(is.list(x)) { lapply(seq_along(x), function(xi) foo(x[[xi]], c(i,xi))) }
    else { i }
  }
  flatten4(foo(x))
}
It's not super fast but it seems to work fine.
x <- as.list(1:1e5)
system.time(d <- depth(x)) # 0.327 s
x <- 'leaf'; for(i in 1:11) { x <- list(left=x, right=x, value=i) }
system.time(d <- depth(x)) # 0.041s
I'd imagined it being used this way:
> x[[ d[[5]] ]]
[1] "leaf"
> x[[ d[[6]] ]]
[1] 1
But you could also get a count of how many nodes are at each depth too.
> table(sapply(d, length))
1 2 3 4 5 6 7 8 9 10 11
1 2 4 8 16 32 64 128 256 512 3072
Edited to address a flaw pointed out in the comments. Sadly, it just makes it even less efficient. Ah well.
Another approach, although I'm not sure it will be more efficient than anything @Tommy has suggested:
l <- list(NA, list("TRUE", list(FALSE), 0L))
flatten <- function(x){
  obj <- rapply(x, identity, how = "unlist")
  cl <- rapply(x, class, how = "unlist")
  len <- rapply(x, length, how = "unlist")
  cl <- rep(cl, times = len)
  mapply(function(obj, cl){rs <- as(obj, cl); rs}, obj, cl,
         SIMPLIFY = FALSE, USE.NAMES = FALSE)
}
> flatten(l)
[[1]]
[1] NA
[[2]]
[1] "TRUE"
[[3]]
[1] FALSE
[[4]]
[1] 0
purrr::flatten achieves that. Though it is not recursive (by design).
So applying it twice should work:
library(purrr)
l <- list(NA, list("TRUE", list(FALSE), 0L))
flatten(flatten(l))
Here is an attempt at a recursive version:
flatten_recursive <- function(x) {
  stopifnot(is.list(x))
  if (any(vapply(x, is.list, logical(1)))) Recall(purrr::flatten(x)) else x
}
flatten_recursive(l)
hack_list <- function(.list) {
  # unlist() cannot put a function into an atomic vector, so inserting a dummy
  # function element forces it to return a (flattened) list rather than a vector
  .list[['_hack']] <- function() NULL
  .list <- unlist(.list)
  .list$`_hack` <- NULL # drop the dummy element again
  .list
}
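A quick check (a sketch; note that, unlike some of the solutions above, atomic vectors such as 1:3 would still be split into separate elements):
hack_list(list(NA, list("TRUE", list(FALSE), 0L)))
# essentially list(NA, "TRUE", FALSE, 0L)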
You can also use rrapply in the rrapply package (an extended version of base rapply) by setting how = "flatten":
library(rrapply)
rrapply(list(NA, list("TRUE", list(FALSE), 0L)), how = "flatten")
#> [[1]]
#> [1] NA
#>
#> [[2]]
#> [1] "TRUE"
#>
#> [[3]]
#> [1] FALSE
#>
#> [[4]]
#> [1] 0
Computation times
Below are some benchmark timings against the flatten2 and flatten3 functions in Tommy's response for two large nested lists:
flatten2 <- function(x) {
  len <- sum(rapply(x, function(x) 1L))
  y <- vector('list', len)
  i <- 0L
  rapply(x, function(x) { i <<- i+1L; y[[i]] <<- x })
  y
}
flatten3 <- function(x) {
  repeat {
    if(!any(vapply(x, is.list, logical(1)))) return(x)
    x <- Reduce(c, x)
  }
}
## large deeply nested list (1E6 elements, 6 layers)
deep_list <- rrapply(replicate(10, 1, simplify = FALSE), classes = c("list", "numeric"),
                     condition = function(x, .xpos) length(.xpos) < 6,
                     f = function(x) replicate(10, 1, simplify = FALSE), how = "recurse")
system.time(flatten2(deep_list))
#> user system elapsed
#> 1.715 0.012 1.727
## system.time(flatten3(deep_list)), not run takes more than 10 minutes
system.time(rrapply(deep_list, how = "flatten"))
#> user system elapsed
#> 0.105 0.016 0.121
## large shallow nested list (1E6 elements, 2 layers)
shallow_list <- lapply(replicate(1000, 1, simplify = FALSE), function(x) replicate(1000, 1, simplify = FALSE))
system.time(flatten2(shallow_list))
#> user system elapsed
#> 1.308 0.040 1.348
system.time(flatten3(shallow_list))
#> user system elapsed
#> 5.246 0.012 5.259
system.time(rrapply(shallow_list, how = "flatten"))
#> user system elapsed
#> 0.09 0.00 0.09
I have a vector, such as c(1, 3, 4, 5, 9, 10, 17, 29, 30), and I would like to group together the 'neighboring' elements that form a regular, consecutive sequence (i.e., an increase by 1) into a ragged vector, resulting in:
L1: 1
L2: 3,4,5
L3: 9,10
L4: 17
L5: 29,30
Naive code (of an ex-C programmer):
partition.neighbors <- function(v)
{
result <<- list() #jagged array
currentList <<- v[1] #current series
for(i in 2:length(v))
{
if(v[i] - v [i-1] == 1)
{
currentList <<- c(currentList, v[i])
}
else
{
result <<- c(result, list(currentList))
currentList <<- v[i] #next series
}
}
return(result)
}
Now I understand that a) R is not C (despite the curly brackets), b) global variables are pure evil, and c) this is a horribly inefficient way of achieving the result, so any better solutions are welcome.
Making heavy use of some R idioms:
> split(v, cumsum(c(1, diff(v) != 1)))
$`1`
[1] 1
$`2`
[1] 3 4 5
$`3`
[1] 9 10
$`4`
[1] 17
$`5`
[1] 29 30
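To see why this works: diff(v) != 1 flags the positions where a new run starts, and cumsum turns those flags into group IDs for split(). Step by step:
v <- c(1, 3, 4, 5, 9, 10, 17, 29, 30)
c(1, diff(v) != 1)
# [1] 1 1 0 0 1 0 1 1 0
cumsum(c(1, diff(v) != 1))
# [1] 1 2 2 2 3 3 4 5 5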
daroczig writes "you could write a lot neater code based on diff"...
Here's one way:
split(v, cumsum(diff(c(-Inf, v)) != 1))
EDIT (added timings):
Tommy discovered this could be faster by being careful with types; the reason it got faster is that split is faster on integers, and is actually faster still on factors.
Here's Joshua's solution; the result from the cumsum is a numeric because it's being c'd with 1, so it's the slowest.
# timings use a large vector, e.g. v <- rep(c(1:5, 19), len = 1e6) as in Tommy's benchmark below
system.time({
  a <- cumsum(c(1, diff(v) != 1))
  split(v, a)
})
# user system elapsed
# 1.839 0.004 1.848
Just c()ing with 1L so the result is an integer speeds it up considerably.
system.time({
  a <- cumsum(c(1L, diff(v) != 1))
  split(v, a)
})
# user system elapsed
# 0.744 0.000 0.746
This is Tommy's solution, for reference; it's also splitting on an integer.
system.time({
  a <- cumsum(c(TRUE, diff(v) != 1L))
  split(v, a)
})
# user system elapsed
# 0.742 0.000 0.746
Here's my original solution; it also is splitting on an integer.
system.time({
  a <- cumsum(diff(c(-Inf, v)) != 1)
  split(v, a)
})
# user system elapsed
# 0.750 0.000 0.754
Here's Joshua's, with the result converted to an integer before the split.
system.time({
  a <- cumsum(c(1, diff(v) != 1))
  a <- as.integer(a)
  split(v, a)
})
# user system elapsed
# 0.736 0.002 0.740
All the versions that split on an integer vector are about the same; it could be even faster if that integer vector were already a factor, as the conversion from integer to factor actually takes about half the time. Here I make it into a factor directly; this is not recommended in general because it depends on the internal structure of the factor class. It's done here for comparison purposes only.
system.time({
  a <- cumsum(c(1L, diff(v) != 1))
  a <- structure(a, class = "factor", levels = 1L:a[length(a)])
  split(v, a)
})
# user system elapsed
# 0.356 0.000 0.357
Joshua and Aaron were spot on. However, their code can still be made more than twice as fast by careful use of the correct types, integers and logicals:
split(v, cumsum(c(TRUE, diff(v) != 1L)))
v <- rep(c(1:5, 19), len = 1e6) # Huge vector...
system.time( split(v, cumsum(c(1, diff(v) != 1))) ) # Joshua's code
# user system elapsed
# 2.64 0.00 2.64
system.time( split(v, cumsum(c(TRUE, diff(v) != 1L))) ) # Modified code
# user system elapsed
# 1.09 0.00 1.12
You could define the cut-points easily:
which(diff(v) != 1)
Based on that try:
v <- c(1,3,4,5,9,10,17,29,30)
cutpoints <- c(0, which(diff(v) != 1), length(v))
ragged.vector <- vector("list", length(cutpoints)-1)
for (i in 2:length(cutpoints)) ragged.vector[[i-1]] <- v[(cutpoints[i-1]+1):cutpoints[i]]
Which results in:
> ragged.vector
[[1]]
[1] 1
[[2]]
[1] 3 4 5
[[3]]
[1] 9 10
[[4]]
[1] 17
[[5]]
[1] 29 30
This algorithm is not a nice one but you could write a lot neater code based on diff :) Good luck!
You can create a data.frame and assign the elements to groups using diff, ifelse and cumsum, then aggregate using tapply:
v.df <- data.frame(v = v)
v.df$group <- cumsum(ifelse(c(1, diff(v) - 1), 1, 0))
tapply(v.df$v, v.df$group, function(x) x)
$`1`
[1] 1
$`2`
[1] 3 4 5
$`3`
[1] 9 10
$`4`
[1] 17
$`5`
[1] 29 30