Julia: type-stability with DataFrames

How can I access the columns of a DataFrame in a type-stable way?
Let's assume I have the following data:
df = DataFrame(x = fill(1.0, 1000000), y = fill(1, 1000000), z = fill("1", 1000000))
And now I want to do some recursive computation (so I cannot use transform)
function foo!(df::DataFrame)
    for i in 1:nrow(df)
        if (i > 1) df.x[i] += df.x[i-1] end
    end
end
This has terrible performance:
julia> @time foo!(df)
0.144921 seconds (6.00 M allocations: 91.529 MiB)
A quick fix in this simplified example would be the following:
function bar!(df::DataFrame)
    x::Vector{Float64} = df.x
    for i in 1:length(x)
        if (i > 1) x[i] += x[i-1] end
    end
end
julia> @time bar!(df)
0.000004 seconds
However, I'm looking for a solution that is generalisable, e.g. when the recursive computation is just specified as a function:
function foo2!(df::DataFrame, fn::Function)
    for i in 1:nrow(df)
        if (i > 1) fn(df, i) end
    end
end

function my_fn(df::DataFrame, i::Int64)
    x::Vector{Float64} = df.x
    x[i] += x[i-1]
end
While this (almost) doesn't allocate, it is still very slow.
julia> @time foo2!(df, my_fn)
0.050465 seconds (1 allocation: 16 bytes)
Is there an approach that is performant and allows this kind of flexibility / generalisability?
EDIT: I should also mention that in practice it is not known a priori which columns the function fn depends on. I.e. I'm looking for an approach that allows performant access to / updating of arbitrary columns inside fn. The needed columns could be specified together with fn, as a Vector{Symbol} for example, if necessary.
EDIT 2: I tried using barrier functions as follows, but it's not performant
function foo3!(df::DataFrame, fn::Function, colnames::Vector{Symbol})
    cols = map(cname -> df[!,cname], colnames)
    for i in 1:nrow(df)
        if (i > 1) fn(cols..., i) end
    end
end

function my_fn1(x::Vector{Float64}, i::Int64)
    x[i] += x[i-1]
end

function my_fn2(x::Vector{Float64}, y::Vector{Int64}, i::Int64)
    x[i] += x[i-1] * y[i-1]
end
@time foo3!(df, my_fn1, [:x])
@time foo3!(df, my_fn2, [:x, :y])

This issue is intended (to avoid excessive compilation for wide data frames), and the ways to handle it are explained in https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/11_performance.ipynb.
In general you should reduce the number of times you index into a data frame. So in this case do:
julia> function foo3!(x::AbstractVector, fn::Function)
           for i in 2:length(x)
               fn(x, i)
           end
       end
foo3! (generic function with 1 method)
julia> function my_fn(x::AbstractVector, i::Int64)
           x[i] += x[i-1]
       end
my_fn (generic function with 1 method)
julia> @time foo3!(df.x, my_fn)
0.010746 seconds (16.60 k allocations: 926.036 KiB)
julia> @time foo3!(df.x, my_fn)
0.002301 seconds
(I am using the version where you want to have a custom function passed)
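To also cover the case from the edit, where the columns fn needs are only known at runtime as a Vector{Symbol}, one option is to extract the columns once into a Tuple and pass that through a function barrier, so the inner loop specializes on the concrete tuple type. This is a sketch of my own (the helper names apply_rec! and _apply_rec! are made up, not from DataFrames):

function apply_rec!(df::DataFrame, fn::Function, colnames::Vector{Symbol})
    cols = Tuple(df[!, c] for c in colnames)  # concretely typed Tuple of column vectors
    _apply_rec!(fn, cols, nrow(df))           # function barrier: one dynamic dispatch per call
end

function _apply_rec!(fn::Function, cols::Tuple, n::Int)
    for i in 2:n
        fn(cols..., i)  # splatting a concretely typed Tuple compiles away
    end
end

Usage would be e.g. apply_rec!(df, my_fn2, [:x, :y]), with my_fn2 as defined in the question.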

My current approach involves wrapping the DataFrame in a struct and overloading getindex / setindex!. Some additional trickery using generated functions is needed to get the ability to access columns by name. While this is performant, it is also quite hacky, and I was hoping there was a more elegant solution using only DataFrames.
For simplicity this assumes all (relevant) columns are of Float64 type.
struct DataFrameWrapper{colnames}
    cols::Vector{Vector{Float64}}
end

function df_to_vectors(df::AbstractDataFrame, colnames::Vector{Symbol})::Vector{Vector{Float64}}
    res = Vector{Vector{Float64}}(undef, length(colnames))
    for i in 1:length(colnames)
        res[i] = df[!,colnames[i]]
    end
    res
end

function DataFrameWrapper{colnames}(df::AbstractDataFrame) where colnames
    DataFrameWrapper{colnames}(df_to_vectors(df, collect(colnames)))
end

get_colnames(::Type{DataFrameWrapper{colnames}}) where colnames = colnames

@generated function get_col_index(x::DataFrameWrapper, ::Val{col})::Int64 where col
    id = findfirst(y -> y == col, get_colnames(x))
    :($id)
end

Base.@propagate_inbounds Base.getindex(x::DataFrameWrapper, col::Val)::Vector{Float64} = x.cols[get_col_index(x, col)]
Base.@propagate_inbounds Base.getindex(x::DataFrameWrapper, col::Symbol)::Vector{Float64} = getindex(x, Val(col))
Base.@propagate_inbounds Base.setindex!(x::DataFrameWrapper, value::Float64, row::Int64, col::Val) = setindex!(x.cols[get_col_index(x, col)], value, row)
Base.@propagate_inbounds Base.setindex!(x::DataFrameWrapper, value::Float64, row::Int64, col::Symbol) = setindex!(x, value, row, Val(col))
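A hypothetical usage sketch (the function body is my own): accessing a column through Val lets get_col_index resolve the position at compile time, so the loop body stays type-stable:

dfw = DataFrameWrapper{(:x,)}(df)

function my_fn(d::DataFrameWrapper, i::Int64)
    x = d[Val(:x)]  # index resolved at compile time by the generated function
    x[i] += x[i-1]
end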

Related

Type-stability in Julia's product iterator

I am trying to make A in the following code type-stable.
using Primes: factor

function f(n::T, p::T, k::T) where {T<:Integer}
    return rand(T, n * p^k)
end

function g(m::T, n::T) where {T<:Integer}
    i = 0
    for A in Iterators.product((f(n, p, T(k)) for (p, k) in factor(m))...)
        i = sum(A)
    end
    return i
end
Note that f is type-stable. The variable A is not type-stable because the product iterator will return different sized tuples depending on the values of n and m. If there was an iterator like the product iterator that returned a Vector instead of a Tuple, I believe that the type-instability would go away.
Does anyone have any suggestions to make A type-stable in the above code?
Edit: I should add that f returns a variable-sized Vector of type T.
One way I have solved the type-stability is by doing this.
function g(m::T, n::T) where {T<:Integer}
    B = Vector{T}[T[]]
    for (p, k) in factor(m)
        C = Vector{T}[]
        for (b, r) in Iterators.product(B, f(n, p, T(k)))
            c = copy(b)
            push!(c, r)
            push!(C, c)
        end
        B = C
    end
    i = 0
    for A in B
        i = sum(A)
    end
    return i
end
This (and in particular, A) is now type-stable, but at the cost of a lot of memory. I'm not sure of a better way to do this.
It's not easy to get this completely type stable, but you can isolate the type instability with a function barrier. Convert the factorization to a tuple in an outer function, which you pass to an inner function which is type stable. This gives just one dynamic dispatch, instead of many:
# inner, type stable
function _g(n, tup)
    i = 0
    for A in Iterators.product((f(n, p, k) for (p, k) in tup)...)
        i += sum(A) # or i = sum(A), whatever
    end
    return i
end

# outer function
g(m::T, n::T) where {T<:Integer} = _g(n, Tuple(factor(m)))
Some benchmarks:
julia> @btime g(7, 210); # OP version
149.600 μs (7356 allocations: 172.62 KiB)
julia> @btime g(7, 210); # my version
1.140 μs (6 allocations: 11.91 KiB)
You should expect to hit compilation occasionally, whenever you encounter a number with a new count of distinct prime factors.
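To see why the Tuple conversion helps: the number of factors is encoded in the tuple's type, so _g compiles a fully typed specialization per factor count. A quick illustration, assuming Primes.jl's factor iterates prime => exponent pairs:

julia> using Primes

julia> Tuple(factor(210))  # 210 = 2 * 3 * 5 * 7, so a length-4 tuple type
(2 => 1, 3 => 1, 5 => 1, 7 => 1)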

StaticArrays and StatsBase

I want to use StaticArrays with StatsBase. Consider the following functions:
using StatsBase

function update_weights_1(N, M)
    weights_vector_to_update = ones(N) / N
    M_vector = 1:N   # sample from indices 1:N (as in update_weights_3 below)
    wvector = Weights(weights_vector_to_update, 1)
    res = [0.0]
    for m in 1:M
        sample!(M_vector, wvector, res)
    end
end

function update_weights_2(N, M)
    weights_vector_to_update = ones(N) / N
    M_vector = 1:N
    res = [0.0]
    for m in 1:M
        sample!(M_vector, Weights(weights_vector_to_update, 1), res)
    end
end
update_weights_1 requires substantially less memory allocation than update_weights_2, because each call to Weights(weights_vector_to_update, 1) allocates. However, suppose I have a list of small vectors, say z,
z = [ones(3) / 3 for i in 1:10000]
and this function
using Random

function update_weights_3(z, M)
    N = size(z[1], 1)
    M_vector = 1:N
    for i in 1:size(z, 1)
        rand!(z[i])
        res = [0.0]
        for m in 1:M
            sample!(M_vector, Weights(z[i]), res)
        end
    end
end
update_weights_3(z, 1000) allocates a lot of memory. I know that using StaticArrays for z can significantly speed up the code and reduce memory allocation. However, following the procedure in this post, whenever I wrap Weights around a StaticArray, it allocates memory.
Would you know how to apply StaticArray in this case? Essentially I have a collection of small arrays that I would like to transform into Weights.
Weights is a mutable type, which can cause unnecessary heap allocations (sometimes they are stack allocated... I don't fully understand when this optimization happens). You can define your own immutable weights type, though:
using StaticArrays, StatsBase

struct StaticWeights{S<:Real, T<:Real, N, V<:StaticVector{N,T}} <: AbstractWeights{S, T, V}
    values::V
    sum::S
end

StaticWeights(values) = StaticWeights(values, sum(values))
Used in your example:
function update_weights_3(z, M)
    N = size(z[1], 1)
    M_vector = 1:N
    for i in 1:size(z, 1)
        rand!(z[i])
        res = [0.0]
        for m in 1:M
            sample!(M_vector, StaticWeights(z[i]), res)
        end
    end
end
With this change I don't see any allocations in the inner loop.
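One caveat (my addition, not spelled out above): rand! mutates its argument in place, so the small vectors in z need to be mutable static vectors. A minimal sketch using StaticArrays' MVector:

using StaticArrays, Random

# mutable fixed-size vectors: the length is fixed in the type, but rand! can write into them
z = [MVector(1/3, 1/3, 1/3) for i in 1:10000]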

Julia function to return non-unique elements of an array

Julia base has the unique function that returns a vector containing only the unique elements of an array (or any iterable). I was looking for a nonunique function to return an array containing all the elements that appear at least twice in its input. As far as I can tell Julia does not have such a function, which I found a bit surprising.
My first attempt was as follows:
function nonunique(x::AbstractArray)
    uniqueindexes = indexin(unique(x), x)
    nonuniqueindexes = setdiff(1:length(x), uniqueindexes)
    unique(x[nonuniqueindexes])
end
But inspired by Bogumił Kamiński's answer to indices of unique elements of vector in Julia I wrote a second version:
function nonunique(x::AbstractArray{T}) where T
    uniqueset = Set{T}()
    duplicatedset = Set{T}()
    duplicatedvector = Vector{T}()
    for i in x
        if i in uniqueset
            if !(i in duplicatedset)
                push!(duplicatedset, i)
                push!(duplicatedvector, i)
            end
        else
            push!(uniqueset, i)
        end
    end
    duplicatedvector
end
In my tests, this version is about 4 times faster. It has the nice property that the output is ordered by where the second occurrence (i.e. the first repeat) of each duplicated element appears in the input. I believe that in is faster when checking membership of a Set than of an Array, which accounts for keeping the two variables duplicatedset and duplicatedvector.
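That membership claim is easy to check; a sketch with BenchmarkTools (timings omitted, since they are machine-dependent): in hashes into a Set but scans a Vector linearly:

using BenchmarkTools
s = Set(1:10_000); v = collect(1:10_000)
@btime 10_000 in $s  # hash lookup, O(1)
@btime 10_000 in $v  # linear scan; the worst case is the last element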
Is it really necessary for me to "roll my own" nonunique function and can the second version be improved?
You can get higher performance by sorting the list and then searching for duplicates:
function nonunique2(x::AbstractArray{T}) where T
    xs = sort(x)
    duplicatedvector = T[]
    for i in 2:length(xs)
        if isequal(xs[i], xs[i-1]) && (length(duplicatedvector) == 0 || !isequal(duplicatedvector[end], xs[i]))
            push!(duplicatedvector, xs[i])
        end
    end
    duplicatedvector
end
Here are sample results:
julia> x = rand(1:1000,1000);
julia> using BenchmarkTools
julia> nn = @btime nonunique($x);
42.240 μs (39 allocations: 71.23 KiB)
julia> nn2s = @btime nonunique2($x);
26.453 μs (10 allocations: 16.33 KiB)
julia> sort(nn) == sort(nn2s)
true
It will be much better if you can do in-place sorting:
function nonunique2!(x::AbstractArray{T}) where T
    sort!(x)
    duplicatedvector = T[]
    for i in 2:length(x)
        if isequal(x[i], x[i-1]) && (length(duplicatedvector) == 0 || !isequal(duplicatedvector[end], x[i]))
            push!(duplicatedvector, x[i])
        end
    end
    duplicatedvector
end
Here are the results (on the same data):
julia> nn2 = @btime nonunique2!($x)
9.813 μs (9 allocations: 8.39 KiB)
julia> sort(nn) == sort(nn2)
true
To add to the answer above (its limitations are that the type T must be sortable and that it is not order-preserving), I have two possible solutions.
Here is another non-order preserving solution that uses StatsBase.jl. It can be faster than the sorting solution or slower depending on the density of the duplicates (also it does more work, but in some applications this information might be useful):
nonunique3(x) = [k for (k, v) in countmap(x) if v > 1]
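For example (a small usage sketch; since countmap returns a Dict, the output order is unspecified):

using StatsBase
nonunique3([1, 2, 2, 3, 3, 3])  # returns [2, 3] in some order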
If you want to speed up the order-preserving approach you could do something like:
function nonunique4(x::AbstractArray{T}) where T
    status = Dict{T, Bool}()
    duplicatedvector = Vector{T}()
    for i in x
        if haskey(status, i)
            if status[i]
                push!(duplicatedvector, i)
                status[i] = false
            end
        else
            status[i] = true
        end
    end
    duplicatedvector
end
In general, benchmarking them is tricky, as performance will depend on:
the density of duplicates, and of elements repeated more than twice, in x
the size of the type T (e.g. if it were a very large immutable type, things might change vs. the standard situation)
Not really an answer (excellent answers are above) but a comment that the original implementation can be cleaned a little to:
function nonunique1(x::AbstractArray{T}) where T
    uniqueset = Set{T}()
    duplicatedset = Set{T}()
    for i in x
        if i in uniqueset
            push!(duplicatedset, i)
        else
            push!(uniqueset, i)
        end
    end
    collect(duplicatedset)
end
i.e. you don't need to check for existence before pushing to a set, and you don't need to fill a vector and set separately. It's still not as fast as the sorting implementation.

Recursive call signature keeps changing

I am going to implement a program that uses recursion quite a bit. So, before I started getting stack overflow exceptions, I figured it would be nice to have a trampoline implemented and to use thunks in case they were needed.
A first try I did was with factorial. Here the code:
callable(f) = !isempty(methods(f))

function trampoline(f, arg1, arg2)
    v = f(arg1, arg2)
    while callable(v)
        v = v()
    end
    return v
end

function factorial(n, continuation)
    if n == 1
        continuation(1)
    else
        (() -> factorial(n-1, (z -> (() -> continuation(n*z)))))
    end
end

function cont(x)
    x
end
Also, I implemented a naive factorial to check if, as a matter of fact, I would be preventing stack overflows:
function factorial_overflow(n)
    if n == 1
        1
    else
        n*factorial_overflow(n-1)
    end
end
The results are:
julia> factorial_overflow(140000)
ERROR: StackOverflowError:
#JITing with a small input
julia> trampoline(factorial, 10, cont)
3628800
#Testing
julia> trampoline(factorial, 140000, cont)
0
So, yes, I am avoiding stack overflows. And yes, I know the result is nonsense, as I am getting integer overflows, but here I just cared about the stack. A production version would of course have that fixed.
(Also, I know there is a built-in for the factorial case; I wouldn't use either of these. I made them to test my trampoline.)
The trampoline version takes a lot of time on the first run, and then it gets quick, but only when computing the same or lower values.
If I run trampoline(factorial, 150000, cont), I get compilation time again.
It seems to me (educated guess) that I am JITing many different signatures for factorial: one for every thunk generated.
My question is: can I avoid this?
I think the problem is that every closure is its own type, which is specialized on the captured variables. To avoid this specialization, one can instead use functors that are not fully specialized:
struct L1
    f
    n::Int
    z::Int
end
(o::L1)() = o.f(o.n*o.z)

struct L2
    f
    n::Int
end
(o::L2)(z) = L1(o.f, o.n, z)

struct Factorial
    f
    c
    n::Int
end
(o::Factorial)() = o.f(o.n-1, L2(o.c, o.n))

callable(f) = false
callable(f::Union{Factorial, L1, L2}) = true

function myfactorial(n, continuation)
    if n == 1
        continuation(1)
    else
        Factorial(myfactorial, continuation, n)
    end
end

function cont(x)
    x
end

function trampoline(f, arg1, arg2)
    v = f(arg1, arg2)
    while callable(v)
        v = v()
    end
    return v
end
Note that the function fields are untyped. Now the function runs much faster on the first call:
julia> @time trampoline(myfactorial, 10, cont)
0.020673 seconds (4.24 k allocations: 264.427 KiB)
3628800
julia> @time trampoline(myfactorial, 10, cont)
0.000009 seconds (37 allocations: 1.094 KiB)
3628800
julia> @time trampoline(myfactorial, 14000, cont)
0.001277 seconds (55.55 k allocations: 1.489 MiB)
0
julia> @time trampoline(myfactorial, 14000, cont)
0.001197 seconds (55.55 k allocations: 1.489 MiB)
0
I just translated every closure in your code into a corresponding functor. This might not all be needed, and there are probably better solutions, but it works and hopefully demonstrates the approach.
Edit:
To make the reason for the slowdown clearer, one can use:
function factorial(n, continuation)
    if n == 1
        continuation(1)
    else
        tmp = (z -> (() -> continuation(n*z)))
        @show typeof(tmp)
        (() -> factorial(n-1, tmp))
    end
end
This outputs:
julia> trampoline(factorial, 10, cont)
typeof(tmp) = ##31#34{Int64,#cont}
typeof(tmp) = ##31#34{Int64,##31#34{Int64,#cont}}
typeof(tmp) = ##31#34{Int64,##31#34{Int64,##31#34{Int64,#cont}}}
typeof(tmp) = ##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,#cont}}}}
typeof(tmp) = ##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,#cont}}}}}
typeof(tmp) = ##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,#cont}}}}}}
typeof(tmp) = ##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,#cont}}}}}}}
typeof(tmp) = ##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,#cont}}}}}}}}
typeof(tmp) = ##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,##31#34{Int64,#cont}}}}}}}}}
3628800
tmp is a closure. Its automatically created type ##31#34 looks similar to
struct Tmp{T,F}
    n::T
    continuation::F
end
The specialization on the type F of the continuation field is the reason for the long compilation times.
By using L2 instead, which is not specialized on the corresponding field f, the continuation argument to factorial always has the type L2, and the problem is avoided.
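You can see this at the REPL: L2 has no type parameters, so wrapping continuations does not grow the type:

julia> typeof(L2(cont, 5))
L2

julia> typeof(L2(L2(cont, 5), 4))  # nesting leaves the type unchanged
L2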

Julia pmap performance

I am trying to port some of my R code to Julia. Basically, I have rewritten the following R code:
library(parallel)
eps_1<-rnorm(1000000)
eps_2<-rnorm(1000000)
large_matrix<-ifelse(cbind(eps_1,eps_2)>0,1,0)
matrix_to_compare = expand.grid(c(0,1),c(0,1))
indices<-seq(1,1000000,4)
large_matrix<-lapply(indices,function(i)(large_matrix[i:(i+3),]))
function_compare<-function(x){
    which((rowSums(x==matrix_to_compare)==2) %in% TRUE)
}
> system.time(lapply(large_matrix,function_compare))
   user  system elapsed
 38.812   0.024  38.828
> system.time(mclapply(large_matrix,function_compare,mc.cores=11))
   user  system elapsed
 63.128   1.648   6.108
As one can notice I am getting significant speed-up when going from one core to 11. Now I am trying to do the same in Julia:
# Define cluster:
addprocs(11);
using Distributions;
@everywhere using Iterators;
d = Normal();
eps_1 = rand(d, 1000000);
eps_2 = rand(d, 1000000);

# Create a large matrix:
large_matrix = hcat(eps_1, eps_2) .>= 0;
indices = collect(1:4:1000000)

# Split large matrix:
large_matrix = [large_matrix[i:(i+3), :] for i in indices];

# Define the function to apply:
@everywhere function function_split(x)
    matrix_to_compare = transpose(reinterpret(Int, collect(product([0,1], [0,1])), (2,4)));
    matrix_to_compare = matrix_to_compare .> 0;
    find(sum(x .== matrix_to_compare, 2) .== 2)
end

@time map(function_split, large_matrix)
@time pmap(function_split, large_matrix)

5.167820 seconds (22.00 M allocations: 2.899 GB, 12.83% gc time)
18.569198 seconds (40.34 M allocations: 2.082 GB, 5.71% gc time)
As one can notice I am not getting any speed up with pmap. Maybe somebody can suggest alternatives.
I think that some of the problem here is that @parallel and pmap don't always handle moving data to and from the workers very well. Thus, they tend to work best in situations where what you are executing doesn't require very much data movement at all. I also suspect that there are probably things that could be done to improve their performance, but I'm not certain on the details.
For situations in which you do need more data moving around, it may be best to stick with options that directly call functions on workers, with those functions then accessing objects within the memory space of those workers. I give one example below, which speeds up your function using multiple workers. It uses perhaps the simplest option, which is @everywhere, but @spawn, remotecall(), etc. are also worth considering, depending on your situation.
addprocs(11);
using Distributions;
@everywhere using Iterators;
d = Normal();
eps_1 = rand(d, 1000000);
eps_2 = rand(d, 1000000);

# Create a large matrix:
large_matrix = hcat(eps_1, eps_2) .>= 0;
indices = collect(1:4:1000000);

# Split large matrix:
large_matrix = [large_matrix[i:(i+3), :] for i in indices];
large_matrix = convert(Array{BitArray}, large_matrix);

function sendto(p::Int; args...)
    for (nm, val) in args
        @spawnat(p, eval(Main, Expr(:(=), nm, val)))
    end
end

getfrom(p::Int, nm::Symbol; mod=Main) = fetch(@spawnat(p, getfield(mod, nm)))

@everywhere function function_split(x::BitArray)
    matrix_to_compare = transpose(reinterpret(Int, collect(product([0,1], [0,1])), (2,4)));
    matrix_to_compare = matrix_to_compare .> 0;
    find(sum(x .== matrix_to_compare, 2) .== 2)
end

function distribute_data(X::Array, WorkerName::Symbol)
    size_per_worker = floor(Int, size(X,1) / nworkers())
    StartIdx = 1
    EndIdx = size_per_worker
    for (idx, pid) in enumerate(workers())
        if idx == nworkers()
            EndIdx = size(X,1)
        end
        @spawnat(pid, eval(Main, Expr(:(=), WorkerName, X[StartIdx:EndIdx])))
        StartIdx = EndIdx + 1
        EndIdx = EndIdx + size_per_worker - 1
    end
end

distribute_data(large_matrix, :large_matrix)

function parallel_split()
    @everywhere begin
        if myid() != 1
            result = map(function_split, large_matrix);
        end
    end
    results = cell(nworkers())
    for (idx, pid) in enumerate(workers())
        results[idx] = getfrom(pid, :result)
    end
    vcat(results...)
end

## results given after running once to compile
@time a = map(function_split, large_matrix); ## 6.499737 seconds (22.00 M allocations: 2.899 GB, 13.99% gc time)
@time b = parallel_split(); ## 1.097586 seconds (1.50 M allocations: 64.508 MB, 3.28% gc time)
julia> a == b
true
Note: even with this, the speedup is not perfect from the multiple processes. But, this is to be expected, since there is still a moderate amount of data to be returned as a result of your function, and that data's got to be moved, taking time.
P.S. See this post (Julia: How to copy data to another processor in Julia) or this package (https://github.com/ChrisRackauckas/ParallelDataTransfer.jl) for more on the sendto and getfrom functions I used here.
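On current Julia versions the same principle applies. A minimal sketch of my own, assuming Julia 1.x with the Distributed standard library, where each worker creates and reduces its own data so almost nothing crosses process boundaries:

using Distributed
addprocs(4)

@everywhere chunk_sum(n) = sum(rand(n))  # the data is generated on the worker itself

# only the integer n and one Float64 per worker are moved between processes
futures = [remotecall(chunk_sum, p, 1_000_000) for p in workers()]
total = sum(fetch.(futures))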
