Parallel power set generation in Erlang?

There are a lot of example implementations of generating the power set of a set in Java, Python, and other languages, but I still cannot understand how the actual algorithm works.
What are the steps taken by an algorithm to generate a power set P(S) of a set S?
(For example, the power set of {1,2,3,4} is {{}, {1}, {2}, {1,2}, {3}, {1,3}, {2,3}, {1,2,3}, {4}, {1,4}, {2,4}, {1,2,4}, {3,4}, {1,3,4}, {2,3,4}, {1,2,3,4}}.)
UPD: I have found this explanation, but I still don't get it. I am trying to understand the algorithm for generating a power set because I would like to write a parallel implementation of it. The following sequential Erlang implementation uses an enormous amount of memory and cannot handle a set of more than 30 elements on a machine with 8 GB RAM:
powerset(Lst) ->
    N = length(Lst),
    Max = trunc(math:pow(2, N)),
    [[lists:nth(Pos + 1, Lst) || Pos <- lists:seq(0, N - 1),
                                 I band (1 bsl Pos) =/= 0]
     || I <- lists:seq(0, Max - 1)].
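The idea behind this implementation: number the subsets from 0 to 2^N - 1 and treat the bits of the counter I as a membership mask, so element Pos belongs to subset I exactly when bit Pos of I is set. A minimal Python sketch of the same bit-mask enumeration (an illustration of the algorithm, not a translation of the Erlang):

import itertools  # not needed for the algorithm; stdlib only

def powerset(lst):
    n = len(lst)
    result = []
    for i in range(2 ** n):            # one counter value per subset
        # element pos is in subset i exactly when bit pos of i is set
        result.append([lst[pos] for pos in range(n) if i & (1 << pos)])
    return result

print(powerset([1, 2, 3]))
# [[], [1], [2], [1, 2], [3], [1, 3], [2, 3], [1, 2, 3]]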
UPD2:
This snippet returns all subsets of the set [a,b,c], except [a,b,c] itself:
generate_all_subsets([], Full_list, Result) ->
    Result;
generate_all_subsets([Element|Rest_of_list], Full_list, Result) ->
    Filtered_list = [X || X <- Full_list, X =/= Element],
    ?DBG("*Current accumulated result: ~w ~n", [Result]),
    Result2 = generate_subsets(Element, Filtered_list, [], []),
    ?DBG("Generated new result: ~w ~n", [Result2]),
    New_result = lists:append(Result, Result2),
    ?DBG("Got new accumulated result: ~w ~n", [New_result]),
    generate_all_subsets(Rest_of_list, Full_list, New_result).

generate_subsets(Main_element, [], Accumulated_list, Result) ->
    Result;
generate_subsets(Main_element, [Element|Rest_of_set], Accumulated_list, Result) ->
    ?DBG("*Generating a subset for ~w ~n", [Main_element]),
    New_accumulated_list = lists:flatten([Element|Accumulated_list]),
    New_result = [New_accumulated_list|Result],
    ?DBG("Added ~w to the result: ~w ~n", [New_accumulated_list, New_result]),
    generate_subsets(Main_element, Rest_of_set, New_accumulated_list, New_result).
I am not sure if this snippet is correct.

Here is a pretty simple version which performs far better than the version from Rosetta Code:
generate([]) -> [[]];
generate([H|T]) ->
    PT = generate(T),
    [[H|X] || X <- PT] ++ PT.
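This works from the identity P({H} ∪ T) = P(T) ∪ { {H} ∪ S : S ∈ P(T) }: compute the power set of the tail once, then emit every tail subset both with and without the head. The same recursion as a Python sketch, for comparison:

def generate(lst):
    if not lst:
        return [[]]                    # the power set of the empty set
    head, tail = lst[0], lst[1:]
    pt = generate(tail)                # power set of the tail, computed once
    return [[head] + s for s in pt] + pt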
If you want even better performance, you can try this:
generate([]) -> [[]];
generate([H|T]) ->
    PT = generate(T),
    generate(H, PT, PT).

generate(_, [], Acc) -> Acc;
generate(X, [H|T], Acc) -> generate(X, T, [[X|H]|Acc]).
In any case, I doubt you will be able to construct the power set of a 30-element set. By my calculation it could consume more than 16 GB: there are 2^30 subsets, and even with perfect tail sharing each subset needs at least one fresh cons cell (16 bytes on a 64-bit machine), before counting the spine of the outer result list. There is some reuse of list tails in my second version, but it will not help enough. I think you may run into an even bigger issue if you implement it as a parallel algorithm, because sending results between processes copies the lists.


Recursion and Multi-Argument Functions in z3 in C#

I'm new to z3 and trying to use it to solve logic puzzles. The puzzle type I'm working on, Skyscrapers, includes given constraints on the number of times that a new maximum value is found while reading a series of integers.
For example, if the constraint given was 3, then the series [2,3,1,5,4] would satisfy the constraint as we'd detect the maximums '2', '3', '5'.
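For concreteness, the counting rule can be written as a tiny function; this Python illustration is mine, separate from the solver encoding below:

def count_new_maxima(series):
    count, cur_max = 0, 0
    for height in series:
        if height > cur_max:       # a new maximum is found
            count += 1
            cur_max = height
    return count

assert count_new_maxima([2, 3, 1, 5, 4]) == 3   # detects 2, 3, 5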
I've implemented a recursive solution, but the rule does not apply correctly and the resulting solutions are invalid.
for (int i = 0; i < clues.Length; ++i)
{
    IntExpr clue = c.MkInt(clues[i].count);
    IntExpr[] orderedCells = GetCells(clues[i].x, clues[i].y, clues[i].direction, cells, size);
    IntExpr numCells = c.MkInt(orderedCells.Length);
    ArrayExpr localCells = c.MkArrayConst(string.Format("clue_{0}", i), c.MkIntSort(), c.MkIntSort());
    for (int j = 0; j < orderedCells.Length; ++j)
    {
        c.MkStore(localCells, c.MkInt(j), orderedCells[j]);
    }

    // numSeen counter_i(index, localMax)
    FuncDecl counter = c.MkFuncDecl(String.Format("counter_{0}", i),
        new Sort[] { c.MkIntSort(), c.MkIntSort() }, c.MkIntSort());
    IntExpr index = c.MkIntConst(String.Format("index_{0}", i));
    IntExpr localMax = c.MkIntConst(String.Format("localMax_{0}", i));

    s.Assert(c.MkForall(new Expr[] { index, localMax }, c.MkImplies(
        c.MkAnd(c.MkAnd(index >= 0, index < numCells), c.MkAnd(localMax >= 0, localMax <= numCells)),
        c.MkEq(c.MkApp(counter, index, localMax),
            c.MkITE(c.MkOr(c.MkGe(index, numCells), c.MkLt(index, c.MkInt(0))),
                c.MkInt(0),
                c.MkITE(c.MkOr(c.MkEq(localMax, c.MkInt(0)), (IntExpr)localCells[index] >= localMax),
                    1 + (IntExpr)c.MkApp(counter, index + 1, (IntExpr)localCells[index]),
                    c.MkApp(counter, index + 1, localMax)))))));

    s.Assert(c.MkEq(clue, c.MkApp(counter, c.MkInt(0), c.MkInt(0))));
}
Or as an example of how the first assertion is stored:
(forall ((index_3 Int) (localMax_3 Int))
(let ((a!1 (ite (or (= localMax_3 0) (>= (select clue_3 index_3) localMax_3))
(+ 1 (counter_3 (+ index_3 1) (select clue_3 index_3)))
(counter_3 (+ index_3 1) localMax_3))))
(let ((a!2 (= (counter_3 index_3 localMax_3)
(ite (or (>= index_3 5) (< index_3 0)) 0 a!1))))
(=> (and (>= index_3 0) (< index_3 5) (>= localMax_3 0) (<= localMax_3 5))
a!2))))
From reading questions here, I get the sense that defining functions via Assert should work. However, I didn't see any examples where the function had two arguments. Any ideas what is going wrong? I realize that I could define all primitive assertions and avoid recursion, but I want a general solver not dependent on the size of the puzzle.
Stack Overflow works best if you post entire code segments that can be run independently to debug; unfortunately, posting only chosen parts makes it really difficult for people to understand what the problem might be.
Having said that, I wonder why you are coding this in C/C# to start with. Programming z3 through these lower-level interfaces, while certainly possible, is a terrible idea unless you have some other integration requirement. For personal projects and learning purposes, it's much better to use a higher-level API; the one you are using is extremely low-level, and you end up dealing with API-centric issues instead of your original problem.
In Python
Based on this, I'd strongly recommend using a higher-level API, such as the Python or Haskell one. (There are bindings available in many languages, but I think the Python and Haskell ones are the easiest to use. Of course, this is my personal bias.)
The "skyscraper" constraint can easily be coded in the Python API as follows:
from z3 import *
def skyscraper(clue, xs):
    # If the list is empty, the clue has to be 0
    if not xs:
        return clue == 0
    # Otherwise count the visible ones:
    visible = 1  # First one is always visible!
    curMax = xs[0]
    for i in xs[1:]:
        visible = visible + If(i > curMax, 1, 0)
        curMax = If(i > curMax, i, curMax)
    # Clue must equal number of visibles
    return clue == visible
To use this, let's create a row of skyscrapers. We'll make the size based on a constant you can set, which I'll call N:
s = Solver()

N = 5  # configure size
row = [Int("v%d" % i) for i in range(N)]

# Make sure row is distinct and each element is between 1-N
s.add(Distinct(row))
for i in row:
    s.add(And(1 <= i, i <= N))

# Add the clue, let's say we want 3 for this row:
s.add(skyscraper(3, row))

# solve
if s.check() == sat:
    m = s.model()
    print([m[i] for i in row])
else:
    print("Not satisfiable")
When I run this, I get:
[3, 1, 2, 4, 5]
which indeed has 3 skyscrapers visible.
To solve the entire grid, you'd create NxN variables and add all the skyscraper assertions for all rows/columns. This is a bit of coding, but you can see that it's quite high-level and a lot easier to use than the C-encoding you're attempting.
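As a rough sketch of that setup (the grid variable names and the clue placements below are my own assumptions, and the skyscraper function defined above is assumed to be in scope):

from z3 import *

N = 5
grid = [[Int("c_%d_%d" % (r, c)) for c in range(N)] for r in range(N)]
s = Solver()

for r in range(N):
    s.add(Distinct(grid[r]))                          # each row distinct
    s.add(Distinct([grid[c][r] for c in range(N)]))   # each column distinct
    for c in range(N):
        s.add(And(1 <= grid[r][c], grid[r][c] <= N))  # heights between 1 and N

# One skyscraper assertion per clue; e.g. a clue of 3 read from the left of
# row 0, and a clue of 2 read from the right of the same row (arbitrary values):
s.add(skyscraper(3, grid[0]))
s.add(skyscraper(2, list(reversed(grid[0]))))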
In Haskell
For reference, here's the same problem encoded using the Haskell SBV library, which is built on top of z3:
import Data.SBV

skyscraper :: SInteger -> [SInteger] -> SBool
skyscraper clue []     = clue .== 0
skyscraper clue (x:xs) = clue .== visible xs x 1
  where visible []     _      sofar = sofar
        visible (x:xs) curMax sofar = ite (x .> curMax)
                                          (visible xs x (1+sofar))
                                          (visible xs curMax sofar)

row :: Integer -> Integer -> IO SatResult
row clue n = sat $ do xs <- mapM (const free_) [1..n]
                      constrain $ distinct xs
                      constrain $ sAll (`inRange` (1, literal n)) xs
                      constrain $ skyscraper (literal clue) xs
Note that this is even shorter than the Python encoding (about 15 lines of code, as opposed to Python's 30 or so), and, if you're familiar with Haskell, quite a natural description of the problem without getting lost in low-level details. When I run this, I get:
*Main> row 3 5
Satisfiable. Model:
s0 = 1 :: Integer
s1 = 4 :: Integer
s2 = 5 :: Integer
s3 = 3 :: Integer
s4 = 2 :: Integer
which tells me the heights should be 1 4 5 3 2, again giving a row with 3 visible skyscrapers.
Summary
Once you're familiar with the Python/Haskell APIs and have a good idea of how to solve your problem, you can code it in C# if you like. I'd advise against it, though, unless you have a really good reason to do so. Sticking to Python or Haskell is your best bet for not getting lost in the details of the API.

F# Recursive Objects

I'm new to F# and functional languages, so this might be a stupid question or a duplicate of Recursive objects in F#?, but I don't know.
Here is a simple Fibonacci function:
let rec fib n =
    match n with
    | 0 -> 1
    | 1 -> 1
    | _ -> fib (n - 1) + fib (n - 2)
Its signature is int -> int.
It can be rewritten as:
let rec fib =
    fun n ->
        match n with
        | 0 -> 1
        | 1 -> 1
        | _ -> fib (n - 1) + fib (n - 2)
Its signature is (int -> int) (in Visual Studio for Mac).
So what's the difference with the previous one?
If I add one more line like this:
let rec fib =
    printfn "fib" // <-- this line
    fun n ->
        match n with
        | 0 -> 1
        | 1 -> 1
        | _ -> fib (n - 1) + fib (n - 2)
The IDE gives me a warning:
warning FS0040: This and other recursive references to the object(s) being defined will be checked for initialization-soundness at runtime through the use of a delayed reference. This is because you are defining one or more recursive objects, rather than recursive functions. This warning may be suppressed by using '#nowarn "40"' or '--nowarn:40'.
How does this line affect the initialization?
What does "recursive object" mean? I can't find it in the documentation.
Update
Thanks for your replies, really nice explanation.
After reading your answers, I have some ideas about the Recursive Object.
First, I made a mistake about the signature: the first two code snippets above have the same signature, int -> int, while the last one has the signature (int -> int). (Note: the signatures are displayed differently in VS Code with the Ionide extension.)
I think the difference between the two signatures is that the first one means it's just a function, while the other one means it's a reference to a function, that is, an object.
And every let rec something with no parameter list is an object rather than a function (see the function definition), while the second snippet is an exception, possibly optimized by the compiler into a function.
One example:
let rec x = (fun () -> x + 1)() // same warning, says `x` is a recursive object
The only reason I can think of is that the compiler is not smart enough; it throws a warning just because it's a recursive object, as the warning indicates:
This is because you are defining one or more recursive objects, rather than recursive functions
even though this pattern would never cause any problem.
let rec fib =
    // do something here; if fib were invoked here directly, it would definitely be an error, not a warning.
    fun n ->
        match n with
        | 0 -> 1
        | 1 -> 1
        | _ -> fib (n - 1) + fib (n - 2)
What do you think about this?
"Recursive objects" are just like recursive functions, except they are, well, objects. Not functions.
A recursive function is a function that references itself, e.g.:
let rec f x = f (x-1) + 1
A recursive object is similar, in that it references itself, except it's not a function, e.g.:
let rec x = x + 1
The above will actually not compile. The F# compiler is able to correctly determine the problem and issue an error: The value 'x' will be evaluated as part of its own definition. Clearly, such definition is nonsensical: in order to calculate x, you need to already know x. Does not compute.
But let's see if we can be more clever. How about if I close x in a lambda expression?
let rec x = (fun() -> x + 1) ()
Here, I wrap the x in a function, and immediately call that function. This compiles, but with a warning - the same warning that you're getting, something about "checking for initialization-soundness at runtime".
So let's go to runtime:
> let rec x = (fun() -> x + 1) ()
System.InvalidOperationException: ValueFactory attempted to access the Value property of this instance.
Not surprisingly, we get an error: turns out, in this definition, you still need to know x in order to calculate x - same as with let rec x = x + 1.
But if this is the case, why does it compile at all? Well, it just so happens that, in general, it is impossible to strictly prove that x will or will not access itself during initialization. The compiler is just smart enough to notice that it might happen (and this is why it issues the warning), but not smart enough to prove that it will definitely happen.
So in cases like this, in addition to issuing a warning, the compiler will install a runtime guard, which will check whether x has already been initialized when it's being accessed. The compiled code with such guard might look something like this:
let mutable x_initialized = false
let rec x =
let x_temp =
(fun() ->
if not x_initialized then failwith "Not good!"
else x + 1
) ()
x_initialized <- true
x_temp
(the actual compiled code looks different, of course; use ILSpy to look if you're curious)
In certain special cases, the compiler can prove one way or another. In other cases it can't, so it installs runtime protection:
// Definitely bad => compile-time error
let rec x = x + 1
// Definitely good => no errors, no warnings
let rec x = fun() -> x() + 1
// Might be bad => compile-time warning + runtime guard
let rec x = (fun() -> x+1) ()
// Also might be bad: no way to tell what the `printfn` call will do
let rec x =
    printfn "a"
    fun() -> x() + 1
There's a major difference between the last two versions. Notice that adding a printfn call to the first version generates no warning, and "fib" will be printed each time the function recurses:
let rec fib n =
    printfn "fib"
    match n with
    | 0 -> 1
    | 1 -> 1
    | _ -> fib (n - 1) + fib (n - 2)
> fib 10;;
fib
fib
fib
...
val it : int = 89
The printfn call is part of the recursive function's body. But the 3rd/final version only prints "fib" once, when the function is defined, and then never again.
What's the difference? In the 3rd version you're not defining just a recursive function, because there are other expressions creating a closure over the lambda, resulting in a recursive object. Consider this version:
let rec fib3 =
    let x = 1
    let y = 2
    fun n ->
        match n with
        | 0 -> x
        | 1 -> x
        | _ -> fib3 (n - x) + fib3 (n - y)
fib3 is not a plain recursive function; there's a closure over the function capturing x and y (and the same goes for the printfn version, although there it's just a side effect). This closure is the "recursive object" referred to in the warning. x and y will not be redefined on each recursion; they're part of the root-level closure/recursive object.
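A rough Python analogy (mine, not from the original answers) makes the distinction visible: the side effect belongs to the construction of the closure, so it runs once, while the function body runs on every call:

def make_fib():
    print("fib")                      # runs once, when the closure is built
    def fib(n):
        return 1 if n <= 1 else fib(n - 1) + fib(n - 2)
    return fib

fib = make_fib()                      # prints "fib" exactly once
fib(10)                               # prints nothing more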
From the linked question/answer:
because [the compiler] cannot guarantee that the reference won't be accessed before it is initialized
Although it doesn't apply in your particular example, it's impossible for the compiler to know whether you're doing something harmless or potentially referencing/invoking the lambda in the fib3 definition before fib3 has a value/has been initialized. Here's another good answer explaining the same.

How can I have all cores participate in parallelization with Async?

The following functions parallelize the processing of a list by first decomposing a list into large chunks and then processing each chunk.
let chunkList chunkSize (xs : list<'T>) =
    query {
        for idx in 0..(xs.Length - 1) do
        groupBy (idx / chunkSize) into g
        select (g |> Seq.map (fun idx -> xs.[idx]))
    }

let par (foo: 'T -> 'S) (xs: list<'T>) =
    xs
    |> List.map (fun x -> async { return foo x })
    |> Async.Parallel
    |> Async.RunSynchronously
    |> Array.toList

let parChunks chunkSize (f: 'T -> 'S) (xs: list<'T>) =
    chunkList chunkSize xs |> Seq.map List.ofSeq |> List.ofSeq
    |> par (List.map f)
    |> List.concat
This function was used to test parChunks:
let g i = [1..1000000] |> List.map (fun x -> sqrt (float (1000 * x + 1))) |> List.head
Running the standard List.map and parChunks with a chunk size equal to 1/2 the list size, there was a performance gain:
List.map g [1..100];;
// Real: 00:00:28.979, CPU: 00:00:29.562
parChunks 50 g [1..100];;
// Real: 00:00:23.027, CPU: 00:00:24.687
However, with a chunk size equal to 1/4 the size of the list, the performance was almost the same. I did not expect this, since my processor (Intel 6700HQ) has four cores.
parChunks 25 g [1..100];;
// Real: 00:00:21.695, CPU: 00:00:24.437
Looking at the Performance tab in Task Manager, one sees that the four cores are never all in use.
Is there a way to make all four cores participate in this computation?
I think you are overcomplicating this problem.
The primary use of async workflows is not for CPU-bound work, it's for IO-bound work to avoid blocking threads while awaiting results that will arrive with some latency.
Although you can parallelise CPU-bound work using async, doing so is suboptimal.
What you want can be achieved far more easily by using the Array.Parallel module on arrays rather than lists.
let g i =
    [|1..1000000|]
    |> Array.Parallel.map (fun x -> sqrt (float (1000 * x + 1)))
    |> Array.head
No need to write your own chunking and merging code; that's all handled for you and, by my measurements, it's much, much faster.
In F#, async workflows run on the .NET ThreadPool class, which has GetMinThreads and GetMaxThreads methods. These use two out parameters to return the minimum or maximum number of threads the thread pool is allowed to use, but in F# that gets converted to a function returning a tuple:
F# Interactive for F# 4.1
Freely distributed under the Apache 2.0 Open Source License
For help type #help;;
> open System.Threading ;;
> ThreadPool.GetMinThreads() ;;
val it : int * int = (4, 4)
> ThreadPool.GetMaxThreads() ;;
val it : int * int = (400, 200)
The two numbers are for "worker" threads and "asynchronous I/O" threads, respectively. My CPU has four cores, so the minimum number of both kinds of threads in the pool is 4. I don't know for certain that this is your problem, but try running ThreadPool.GetMinThreads() on your system and make sure that it's 4. If it's 2 for some reason, that could explain why you're not getting better performance.
See also https://stackoverflow.com/a/26041852/2314532 for an explanation of another possible performance problem with using async workflows for parallel processing. That could also be what's happening here.
Finally, there's one more thing I want to mention. As it currently stands, I'm genuinely surprised that you're getting any benefit out of your parallelism. That's because there's a cost to dividing up the list and concatenating it again. Since the F# list type is a singly-linked list, that cost is O(N), and those steps (divide and reassemble) cannot be parallelized.
The answer to that problem is to use a different data structure, like an RRB Tree, for any list of items that you plan to process in parallel: it's designed to be split and concatenated efficiently (effectively O(1) splits and joins, though the constant factor in joins is rather large). Unfortunately, there's currently no implementation of RRB trees in F#. I'm currently working on one, and estimate it may be ready in another month or so. You can subscribe to this GitHub issue if you want to find out when I've released the code I've been working on.
Good answers here, but I will add some comments about performance and parallelism.
For performance in general, we like to avoid dynamic allocations because we don't want to waste precious cycles allocating objects (quite fast in .NET, slow in C/C++) or collecting them (quite slow).
We also like to minimize the memory footprint of objects and make sure they lay sequentially in memory (Arrays are our friends here) in order to make as efficient use of the CPU cache and prefetcher as possible. A cache miss might cost several hundred cycles.
I think it is important to always compare against a trivial, sequential yet efficiently implemented loop in order to have some sanity check of the parallel performance. Otherwise we might trick ourselves into thinking our parallel masterpiece is doing well when in reality it's outclassed by a simple loop.
Also, vary the size of the input data, both because of caching effects and because there is overhead in starting up a parallel computation.
With that said, I have prepared different versions of the following code:
module SequentialFold =
    let compute (vs : float []) : float =
        vs |> Array.fold (fun s v -> s + sqrt (1000. * v + 1.)) 0.
Then I compare the performance of the different versions to see which does best across varying sizes, in terms of both speed and GC pressure.
The performance test is done in such a way that the total amount of work is always the same regardless of input size in order to make times comparable.
Here is the code:
open System
open System.Threading.Tasks

let clock =
    let sw = System.Diagnostics.Stopwatch ()
    sw.Start ()
    fun () -> sw.ElapsedMilliseconds

let timeIt n a =
    let r = a () // Warm-up
    GC.Collect (2, GCCollectionMode.Forced, true)
    let inline cc g = GC.CollectionCount g
    let bcc0, bcc1, bcc2 = cc 0, cc 1, cc 2
    let before = clock ()
    for i = 1 to n do
        a () |> ignore
    let after = clock ()
    let acc0, acc1, acc2 = cc 0, cc 1, cc 2
    after - before, acc0 - bcc0, acc1 - bcc1, acc2 - bcc2, r

// compute implemented using tail recursion
module TailRecursion =
    let compute (vs : float []) : float =
        let rec loop s i =
            if i < vs.Length then
                let v = vs.[i]
                loop (s + sqrt (1000. * v + 1.)) (i + 1)
            else
                s
        loop 0. 0

// compute implemented using Array.fold
module SequentialFold =
    let compute (vs : float []) : float =
        vs |> Array.fold (fun s v -> s + sqrt (1000. * v + 1.)) 0.

// compute implemented using Array.map + Array.fold
module SequentialArray =
    let compute (vs : float []) : float =
        vs |> Array.map (fun v -> sqrt (1000. * v + 1.)) |> Array.fold (+) 0.

// compute implemented using Array.Parallel.map + Array.fold
module ParallelArray =
    let compute (vs : float []) : float =
        vs |> Array.Parallel.map (fun v -> sqrt (1000. * v + 1.)) |> Array.fold (+) 0.

// compute implemented using Parallel.For
module ParallelFor =
    let compute (vs : float []) : float =
        let lockObj = obj ()
        let mutable sum = 0.
        let options = ParallelOptions()
        let init () = 0.
        let body i pls s =
            let v = i |> float
            s + sqrt (1000. * v + 1.)
        let localFinally ls =
            lock lockObj <| fun () -> sum <- sum + ls
        let pls = Parallel.For ( 0
                               , vs.Length
                               , options
                               , Func<float> init
                               , Func<int, ParallelLoopState, float, float> body
                               , Action<float> localFinally
                               )
        sum

// compute implemented using Parallel.For with batches of size 100
module ParallelForBatched =
    let compute (vs : float []) : float =
        let inner = 100
        let outer = vs.Length / inner + (if vs.Length % inner = 0 then 0 else 1)
        let lockObj = obj ()
        let mutable sum = 0.
        let options = ParallelOptions()
        let init () = 0.
        let rec loop e s i =
            if i < e then
                let v = vs.[i]
                loop e (s + sqrt (1000. * v + 1.)) (i + 1)
            else
                s
        let body i pls s =
            let b = i * inner
            let e = b + inner |> min vs.Length
            loop e s b
        let localFinally ls =
            lock lockObj <| fun () -> sum <- sum + ls
        let pls = Parallel.For ( 0
                               , outer
                               , options
                               , Func<float> init
                               , Func<int, ParallelLoopState, float, float> body
                               , Action<float> localFinally
                               )
        sum

[<EntryPoint>]
let main argv =
    let count = 100000000
    let outers =
        [|
            //10000000
            100000
            1000
            10
        |]
    for outer in outers do
        let inner = count / outer
        let vs = Array.init inner float
        let testCases =
            [|
                "TailRecursion"       , fun () -> TailRecursion.compute vs
                "Fold.Sequential"     , fun () -> SequentialFold.compute vs
                "Array.Sequential"    , fun () -> SequentialArray.compute vs
                "Array.Parallel"      , fun () -> ParallelArray.compute vs
                "Parallel.For"        , fun () -> ParallelFor.compute vs
                "Parallel.For.Batched", fun () -> ParallelForBatched.compute vs
            |]
        printfn "Using outer = %A, inner = %A, total is: %A" outer inner count
        for nm, a in testCases do
            printfn "  Running test case: %A" nm
            let tm, cc0, cc1, cc2, r = timeIt outer a
            printfn "    it took %A ms with GC collects (%A, %A, %A), result is: %A" tm cc0 cc1 cc2 r
    0
And here are the results (Intel I5, 4 cores):
Using outer = 100000, inner = 1000, total is: 100000000
Running test case: "TailRecursion"
it took 389L ms with GC collects (0, 0, 0), result is: 666162.111
Running test case: "Fold.Sequential"
it took 388L ms with GC collects (0, 0, 0), result is: 666162.111
Running test case: "Array.Sequential"
it took 628L ms with GC collects (255, 0, 0), result is: 666162.111
Running test case: "Array.Parallel"
it took 993L ms with GC collects (306, 2, 0), result is: 666162.111
Running test case: "Parallel.For"
it took 711L ms with GC collects (54, 2, 0), result is: 666162.111
Running test case: "Parallel.For.Batched"
it took 490L ms with GC collects (52, 2, 0), result is: 666162.111
Using outer = 1000, inner = 100000, total is: 100000000
Running test case: "TailRecursion"
it took 389L ms with GC collects (0, 0, 0), result is: 666661671.1
Running test case: "Fold.Sequential"
it took 388L ms with GC collects (0, 0, 0), result is: 666661671.1
Running test case: "Array.Sequential"
it took 738L ms with GC collects (249, 249, 249), result is: 666661671.1
Running test case: "Array.Parallel"
it took 565L ms with GC collects (249, 249, 249), result is: 666661671.1
Running test case: "Parallel.For"
it took 157L ms with GC collects (0, 0, 0), result is: 666661671.1
Running test case: "Parallel.For.Batched"
it took 110L ms with GC collects (0, 0, 0), result is: 666661671.1
Using outer = 10, inner = 10000000, total is: 100000000
Running test case: "TailRecursion"
it took 387L ms with GC collects (0, 0, 0), result is: 6.666666168e+11
Running test case: "Fold.Sequential"
it took 390L ms with GC collects (0, 0, 0), result is: 6.666666168e+11
Running test case: "Array.Sequential"
it took 811L ms with GC collects (3, 3, 3), result is: 6.666666168e+11
Running test case: "Array.Parallel"
it took 567L ms with GC collects (4, 4, 4), result is: 6.666666168e+11
Running test case: "Parallel.For"
it took 151L ms with GC collects (0, 0, 0), result is: 6.666666168e+11
Running test case: "Parallel.For.Batched"
it took 102L ms with GC collects (0, 0, 0), result is: 6.666666168e+11
TailRecursion and Fold.Sequential have similar performance.
Array.Sequential does worse because the job is split into two operations, map and fold. In addition, we get GC pressure because it allocates an extra array.
Array.Parallel is the same as Array.Sequential but uses Array.Parallel.map instead of Array.map. Here we see there's an overhead to starting many small parallel computations: smaller input sizes generate more parallel computations, and this costs significantly more performance. In addition, performance is poor even though we use multiple cores, because the computation per element is very small and any benefit of spreading the job over several cores is consumed by the overhead of managing the distribution. Comparing the single-threaded 390 ms with the parallel 990 ms, one might think it is 3x worse, but in reality it's 12x worse, since all 4 cores are used to produce an answer 3x more slowly.
Parallel.For does better, as it allows the parallel computation to take place without allocating a new array, and the internal overhead is likely lower. Here we manage to gain performance for larger sizes, but it still lags behind the sequential algorithms at smaller sizes because of the overhead of starting parallel computations.
Parallel.For.Batched tries to reduce that overhead by increasing the cost of each individual computation, folding several array values in each parallel computation; it is essentially a combination of the TailRecursion algorithm and Parallel.For. Thanks to this we manage to hit an efficiency of 95% for larger sizes, which can be considered decent.
For a simple computation like this, AVX could be used as well, leading to a potential speedup of around 16x; the cost is that the code would get even hairier.
With a batched parallel for, we reached 95% of the expected performance speedup.
The point of this is that it's important to continuously measure performance of your parallel algorithms and compare them against trivial sequential implementations.

Unique array of random numbers using functional programming

I'm trying to write some code in a functional paradigm for practice. There is one case I'm having some problems wrapping my head around: I am trying to create an array of 5 unique integers from 1 to 100. I have been able to solve this without functional programming:
let uniqueArray = [];
while (uniqueArray.length < 5) {
    const newNumber = getRandom1to100();
    if (uniqueArray.indexOf(newNumber) < 0) {
        uniqueArray.push(newNumber);
    }
}
I have access to lodash so I can use that. I was thinking along the lines of:
const uniqueArray = [
    getRandom1to100(),
    getRandom1to100(),
    getRandom1to100(),
    getRandom1to100(),
    getRandom1to100()
].map((currentVal, index, array) => {
    return array.indexOf(currentVal) > -1 ? getRandom1to100 : currentVal;
});
But this obviously wouldn't work, because the check will always be true: the current value's index is, of course, in the array (with more work I could remove that defect). More importantly, it doesn't re-check that all values are unique after a replacement. However, I'm not quite sure how to functionally mimic a while loop.
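One way to mimic the while loop functionally is to recurse on an accumulator until it is full, as the answers below do. A minimal Python sketch of the idea (the function names here are placeholders of mine):

import random

def get_random_1_to_100():
    return random.randint(1, 100)

def unique_numbers(acc=()):
    if len(acc) == 5:
        return list(acc)
    n = get_random_1_to_100()
    # keep the accumulator unchanged on a duplicate, extend it otherwise
    return unique_numbers(acc if n in acc else acc + (n,))

print(unique_numbers())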
Here's an example in OCaml; the key point is that you use accumulators and recursion.
let make () =
  Random.self_init ();
  let rec make_list prev current max accum =
    let number = Random.int 100 in
    if current = max then accum
    else if number <> prev
    then number :: make_list number (current + 1) max accum
    else make_list prev current max accum (* drew the previous number again: retry *)
  in
  make_list 0 0 5 [] |> Array.of_list
This won't guarantee that the array will be unique, since it's only checking against the previous number. You could fix that by hiding a hash table in a closure shared between make and make_list and doing a constant-time lookup.
Here is a stream-based Python approach.
Python's version of a lazy stream is a generator. They can be produced in various ways, including by something which looks like a function definition but uses the keyword yield rather than return. For example:
import random

def randNums(a, b):
    while True:
        yield random.randint(a, b)
Normally generators are used in for-loops, but this last generator has an infinite loop, hence it would hang if you tried to iterate over it. Instead, you can use the built-in function next() to get the next item from the stream. It is convenient to write a function which works something like Haskell's take:
def take(n, stream):
    items = []
    for i in range(n):
        try:
            items.append(next(stream))
        except StopIteration:
            return items
    return items
In Python, StopIteration is raised when a generator is exhausted. If this happens before n items have been produced, this code just returns however much has been generated, so perhaps I should call it takeAtMost. If you ditch the error handling then it will crash if there are not enough items -- which maybe you want. In any event, this is used like:
>>> s = randNums(1,10)
>>> take(5,s)
[6, 6, 8, 7, 2]
of course, this allows for repeats.
To make things unique (and to do so in a functional way) we can write a function which takes a stream as input and returns a stream consisting of unique items as output:
def unique(stream):
    def f(s):
        items = set()
        while True:
            try:
                x = next(s)
                if not x in items:
                    items.add(x)
                    yield x
            except StopIteration:
                return  # ends this generator as well
    return f(stream)
This creates a stream in a closure that contains a set, which keeps track of the items that have been seen, yielding only items which are unique. When the underlying generator has no more elements, there are no more unique elements either, so the inner generator simply returns. (Note that under PEP 479 a generator must return rather than re-raise StopIteration; an escaping StopIteration would become a RuntimeError in modern Python.)
Used like this:
>>> take(5,unique(randNums(1,10)))
[7, 2, 5, 1, 6]
take(10, unique(randNums(1,10))) will yield a random permutation of 1-10. take(11, unique(randNums(1,10))) will never terminate, since an eleventh unique value in that range does not exist.
This is a very good question. It's actually quite common. It's even sometimes asked as an interview question.
Here's my solution to generating 5 integers from 0 to 100.
let rec take lst n =
  if n = 0 then []
  else
    match lst with
    | [] -> []
    | x :: xs -> x :: take xs (n - 1)

let shuffle d =
  let nd = List.map (fun c -> (Random.bits (), c)) d in
  let sond = List.sort compare nd in
  List.map snd sond

let rec range a b =
  if a >= b then []
  else a :: range (a + 1) b;;

let _ =
  print_endline
    (String.concat "\t"
       ("5 random integers:" :: List.map string_of_int (take (shuffle (range 0 101)) 5)))
How's this:
const addUnique = (ar) => {
    const el = getRandom1to100();
    return ar.includes(el) ? ar : ar.concat([el]);
};

const uniqueArray = (numberOfElements, baseArray) => {
    if (numberOfElements < baseArray.length) throw 'invalid input';
    return baseArray.length === numberOfElements
        ? baseArray
        : uniqueArray(numberOfElements, addUnique(baseArray));
};

const myArray = uniqueArray(5, []);

iterative version of recursive algorithm to make a binary tree

Given this algorithm, I would like to know if there exists an iterative version. Also, I want to know if the iterative version can be faster.
This is some kind of pseudo-Python; the algorithm returns a reference to the root of the tree.
make_tree(array a)
    if len(a) == 0
        return None
    node = pick a random point from the array
    calculate distances of the point against the others
    calculate median of such distances
    node.left = make_tree(subset of the array such that the distance of the points is lower than the median of distances)
    node.right = make_tree(subset such that the distance is greater than or equal to the median)
    return node
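For reference, here is a rough, runnable Python rendering of that pseudocode; the Node class and the dist distance-function parameter are my own assumptions, since the pseudocode doesn't specify them:

import random
from statistics import median

class Node:
    def __init__(self, point):
        self.point, self.left, self.right = point, None, None

def make_tree(points, dist):
    if not points:
        return None
    i = random.randrange(len(points))            # pick a random point
    node = Node(points[i])
    rest = points[:i] + points[i + 1:]
    if not rest:
        return node
    distances = [dist(node.point, p) for p in rest]
    m = median(distances)
    node.left = make_tree([p for p, d in zip(rest, distances) if d < m], dist)
    node.right = make_tree([p for p, d in zip(rest, distances) if d >= m], dist)
    return node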
A recursive function with only one recursive call can usually be turned into a tail-recursive function without too much effort, and then it's trivial to convert it into an iterative function. The canonical example here is factorial:
# naïve recursion
def fac(n):
    if n <= 1:
        return 1
    else:
        return n * fac(n - 1)

# tail-recursive with accumulator
def fac(n):
    def fac_helper(m, k):
        if m <= 1:
            return k
        else:
            return fac_helper(m - 1, m * k)
    return fac_helper(n, 1)

# iterative with accumulator
def fac(n):
    k = 1
    while n > 1:
        n, k = n - 1, n * k
    return k
However, your case here involves two recursive calls, and unless you significantly rework your algorithm, you need to keep a stack. Managing your own stack may be a little faster than using Python's function call stack, but the added speed and depth will probably not be worth the complexity. The canonical example here would be the Fibonacci sequence:
# naïve recursion
def fib(n):
    if n <= 1:
        return 1
    else:
        return fib(n - 1) + fib(n - 2)

# tail-recursive with accumulator and stack
def fib(n):
    def fib_helper(m, k, stack):
        if m <= 1:
            if stack:
                m = stack.pop()
                return fib_helper(m, k + 1, stack)
            else:
                return k + 1
        else:
            stack.append(m - 2)
            return fib_helper(m - 1, k, stack)
    return fib_helper(n, 0, [])

# iterative with accumulator and stack
def fib(n):
    k, stack = 0, []
    while 1:
        if n <= 1:
            k = k + 1
            if stack:
                n = stack.pop()
            else:
                break
        else:
            stack.append(n - 2)
            n = n - 1
    return k
Now, your case is a lot tougher than this: a simple accumulator will have difficulties expressing a partly-built tree with a pointer to where a subtree needs to be generated. You'll want a zipper -- not easy to implement in a not-really-functional language like Python.
Making an iterative version is simply a matter of using your own stack instead of the normal language call stack. I doubt the iterative version would be faster, as the normal call stack is optimized for this purpose.
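To make the explicit-stack idea concrete, here is a rough Python sketch of the same tree construction; each stack entry records the points of a pending subtree plus where to attach the resulting node (the dummy-parent attachment scheme is my own):

import random
from statistics import median

class Node:
    def __init__(self, point):
        self.point, self.left, self.right = point, None, None

def make_tree_iter(points, dist):
    holder = Node(None)                      # dummy node; its .left becomes the root
    stack = [(points, holder, "left")]       # (pending points, parent, side)
    while stack:
        pts, parent, side = stack.pop()
        if not pts:
            continue
        i = random.randrange(len(pts))
        node = Node(pts[i])
        setattr(parent, side, node)
        rest = pts[:i] + pts[i + 1:]
        if not rest:
            continue
        distances = [dist(node.point, p) for p in rest]
        m = median(distances)
        stack.append(([p for p, d in zip(rest, distances) if d < m], node, "left"))
        stack.append(([p for p, d in zip(rest, distances) if d >= m], node, "right"))
    return holder.left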
The data you're getting is random, so the tree can be an arbitrary binary tree. For this case you can use a threaded binary tree, which can be traversed and built without recursion and with no stack: the nodes carry a flag that indicates whether a link points to a child node or to the "next node" in a traversal. See http://en.wikipedia.org/wiki/Threaded_binary_tree
Depending on how you define "iterative", there is another solution not mentioned by the previous answers. If "iterative" just means "not subject to a stack overflow exception" (but "allowed to use 'let rec'"), then in a language that supports tail calls, you can write a version using continuations (rather than an "explicit stack"). The F# code below illustrates this. It is similar to your original problem, in that it builds a BST out of an array. If the array is shuffled randomly, the tree is relatively balanced and the recursive version does not create too deep a stack. But turn off shuffling, and the tree gets unbalanced, and the recursive version stack-overflows whereas the iterative-with-continuations version continues along happily.
#light
open System

let printResults = false
let MAX = 20000
let shuffleIt = true

// handy helper function
let rng = new Random(0)
let shuffle (arr : array<'a>) = // '
    let n = arr.Length
    for x in 1..n do
        let i = n-x
        let j = rng.Next(i+1)
        let tmp = arr.[i]
        arr.[i] <- arr.[j]
        arr.[j] <- tmp

// Same random array
let sampleArray = Array.init MAX (fun x -> x)
if shuffleIt then
    shuffle sampleArray
if printResults then
    printfn "Sample array is %A" sampleArray

// Tree type
type Tree =
    | Node of int * Tree * Tree
    | Leaf

// MakeTree1 is recursive
let rec MakeTree1 (arr : array<int>) lo hi = // [lo,hi)
    if lo = hi then
        Leaf
    else
        let pivot = arr.[lo]
        // partition
        let mutable storeIndex = lo + 1
        for i in lo + 1 .. hi - 1 do
            if arr.[i] < pivot then
                let tmp = arr.[i]
                arr.[i] <- arr.[storeIndex]
                arr.[storeIndex] <- tmp
                storeIndex <- storeIndex + 1
        Node(pivot, MakeTree1 arr (lo+1) storeIndex, MakeTree1 arr storeIndex hi)

// MakeTree2 has all tail calls (uses continuations rather than a stack, see
// http://lorgonblog.spaces.live.com/blog/cns!701679AD17B6D310!171.entry
// for more explanation)
let MakeTree2 (arr : array<int>) lo hi = // [lo,hi)
    let rec MakeTree2Helper (arr : array<int>) lo hi k =
        if lo = hi then
            k Leaf
        else
            let pivot = arr.[lo]
            // partition
            let storeIndex = ref(lo + 1)
            for i in lo + 1 .. hi - 1 do
                if arr.[i] < pivot then
                    let tmp = arr.[i]
                    arr.[i] <- arr.[!storeIndex]
                    arr.[!storeIndex] <- tmp
                    storeIndex := !storeIndex + 1
            MakeTree2Helper arr (lo+1) !storeIndex (fun lacc ->
                MakeTree2Helper arr !storeIndex hi (fun racc ->
                    k (Node(pivot,lacc,racc))))
    MakeTree2Helper arr lo hi (fun x -> x)

// MakeTree2 never stack overflows
printfn "calling MakeTree2..."
let tree2 = MakeTree2 sampleArray 0 MAX
if printResults then
    printfn "MakeTree2 yields"
    printfn "%A" tree2

// MakeTree1 might stack overflow
printfn "calling MakeTree1..."
let tree1 = MakeTree1 sampleArray 0 MAX
if printResults then
    printfn "MakeTree1 yields"
    printfn "%A" tree1

printfn "Trees are equal: %A" (tree1 = tree2)
Yes, it is possible to make any recursive algorithm iterative. Implicitly, when you create a recursive algorithm, each call places the prior call onto the stack. What you want to do is make the implicit call stack into an explicit one. The iterative version won't necessarily be faster, but you won't have to worry about a stack overflow. (Do I get a badge for using the name of the site in my answer?)
While it is true in the general sense that directly converting a recursive algorithm into an iterative one will require an explicit stack, there is a specific subset of algorithms which render directly in iterative form (without the need for a stack). These renderings may not have the same performance guarantees (iterating over a functional list vs. recursive deconstruction), but they do often exist.
Here is a stack-based iterative solution (Java):
// Assumes simple custom Stack, Node, and Tree helper classes (not java.util.Stack).
public static Tree builtBSTFromSortedArray(int[] inputArray) {
    Stack toBeDone = new Stack("sub trees to be created under these nodes");
    // initialize start and end
    int start = 0;
    int end = inputArray.length - 1;
    // keep memory of the position (in the array) of the previously created node
    int previous_end = end;
    int previous_start = start;
    // Create the result tree
    Node root = new Node(inputArray[(start + end) / 2]);
    Tree result = new Tree(root);
    while (root != null) {
        System.out.println("Current root=" + root.data);
        // calculate last middle (last node position using the last start and last end)
        int last_mid = (previous_start + previous_end) / 2;

        // *********** add left node to the previously created node ***********
        // calculate new start and new end positions
        // end is the previous index position minus 1
        end = last_mid - 1;
        // start will not change for left node generation
        start = previous_start;
        // check if the index exists in the array and add the left node
        if (end >= start) {
            root.left = new Node(inputArray[(start + end) / 2]);
            System.out.println("\tCurrent root.left=" + root.left.data);
        } else
            root.left = null;
        // save previous_end value (to be used in right node creation)
        int previous_end_bck = previous_end;
        // update previous end
        previous_end = end;

        // *********** add right node to the previously created node ***********
        // get the initial value (inside the current iteration) of previous end
        end = previous_end_bck;
        // start is the previous index position plus one
        start = last_mid + 1;
        // check if the index exists in the array and add the right node
        if (start <= end) {
            root.right = new Node(inputArray[(start + end) / 2]);
            System.out.println("\tCurrent root.right=" + root.right.data);
            // save the created node and its index position (start & end) in the array on the toBeDone stack
            toBeDone.push(root.right);
            toBeDone.push(new Node(start));
            toBeDone.push(new Node(end));
        }

        // *********** update the value of root ***********
        if (root.left != null) {
            root = root.left;
        } else {
            if (toBeDone.top != null) previous_end = toBeDone.pop().data;
            if (toBeDone.top != null) previous_start = toBeDone.pop().data;
            root = toBeDone.pop();
        }
    }
    return result;
}
