This:
function draw1(n)
    return rand(Normal(0,1), Int(n))
end
is somewhat faster than this:
function draw2(n)
    result = zeros(Float64, Int(n))
    for i = 1:Int(n)
        result[i] = rand(Normal(0,1))
    end
    return result
end
Just curious why that is, and whether the explicit loop can be sped up (I tried @inbounds and @simd and didn't get a speedup). Is it the initial allocation of zeros()? I timed that separately at about 0.25 seconds, which doesn't fully account for the difference (plus, doesn't the first way pre-allocate an array under the hood?).
Example:
@time x = draw1(1e08)
1.169986 seconds (6 allocations: 762.940 MiB, 4.53% gc time)
@time y = draw2(1e08)
1.824750 seconds (6 allocations: 762.940 MiB, 3.05% gc time)
Try this implementation:
function draw3(n)
    d = Normal(0,1)
    result = Vector{Float64}(undef, Int(n))
    @inbounds for i = 1:Int(n)
        result[i] = rand(d)
    end
    return result
end
What is the difference:
uses @inbounds
creates Normal(0,1) only once
performs faster initialization of result (see the sketch just below)
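On that last point, a minimal sketch (size illustrative): zeros must write every element, while the undef constructor only allocates:
n = 10^8
@time zeros(Float64, n)          # allocates and zero-fills every element
@time Vector{Float64}(undef, n)  # allocates only; contents are uninitialized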
When I test it, it has essentially the same performance as draw1 (I have not tested it on a 10e8 vector size though (not enough memory) - if you can run such a @benchmark, that would be nice):
julia> using BenchmarkTools
julia> @benchmark draw1(10e5)
BenchmarkTools.Trial:
memory estimate: 7.63 MiB
allocs estimate: 2
--------------
minimum time: 12.296 ms (0.00% GC)
median time: 13.012 ms (0.00% GC)
mean time: 14.510 ms (8.49% GC)
maximum time: 84.253 ms (81.30% GC)
--------------
samples: 345
evals/sample: 1
julia> @benchmark draw2(10e5)
BenchmarkTools.Trial:
memory estimate: 7.63 MiB
allocs estimate: 2
--------------
minimum time: 20.374 ms (0.00% GC)
median time: 21.622 ms (0.00% GC)
mean time: 22.787 ms (5.95% GC)
maximum time: 92.265 ms (77.18% GC)
--------------
samples: 220
evals/sample: 1
julia> @benchmark draw3(10e5)
BenchmarkTools.Trial:
memory estimate: 7.63 MiB
allocs estimate: 2
--------------
minimum time: 12.415 ms (0.00% GC)
median time: 12.956 ms (0.00% GC)
mean time: 14.456 ms (8.67% GC)
maximum time: 84.342 ms (83.74% GC)
--------------
samples: 346
evals/sample: 1
EDIT: actually, defining the loop in a separate function (exactly as rand does) gives draw4 slightly better performance than draw3:
function g!(d, v)
    @inbounds for i = 1:length(v)
        v[i] = rand(d)
    end
end
function draw4(n)
    result = Vector{Float64}(undef, Int(n))
    g!(Normal(0,1), result)
    return result
end
A shorter answer is that the built-in implementation is fastest, which is fortunately often the case.
Instead of draw4 above, you could just use the built-in rand!:
function draw5(n)
    result = Vector{Float64}(undef, Int(n))
    rand!(Normal(0,1), result) # rand! fills result in place and returns it
end
Filling an existing vector with something like rand! will always be inbounds.
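For reference, rand! comes from the Random standard library and is extended by Distributions for distribution objects; a minimal self-contained usage sketch:
using Random, Distributions

v = Vector{Float64}(undef, 10)
rand!(Normal(0, 1), v) # fills v in place with N(0,1) draws and returns v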
I am a Julia newbie, and have a baby assignment to write a function which converts a vector of vectors to a matrix. This was pretty easy to do by iterating over the elements.
However, I have read that broadcasting tends to be more efficient, but I wasn't sure how to do it here: a plain .= cannot work, because it would read the vector of vectors as a 1-by-n array and thus try to broadcast two arrays of different lengths.
Is there a way to broadcast?
My code is below
function vecvec_to_matrix(vecvec)
    dim1 = length(vecvec)
    dim2 = length(vecvec[1])
    my_array = zeros(Int64, dim1, dim2)
    for i in 1:dim1
        for j in 1:dim2
            my_array[i,j] = vecvec[i][j]
        end
    end
    return my_array
end
If your vectors are short and of fixed size (e.g., a list of points in 3 dimensions), then you should strongly consider using the StaticArrays package and then calling reinterpret. Demo:
julia> using StaticArrays
julia> A = rand(3, 8)
3×8 Array{Float64,2}:
0.153872 0.361708 0.39703 0.405625 0.0881371 0.390133 0.185328 0.585539
0.467841 0.846298 0.884588 0.798848 0.14218 0.156283 0.232487 0.22629
0.390566 0.897737 0.569882 0.491681 0.499163 0.377012 0.140902 0.513979
julia> reinterpret(SVector{3,Float64}, A)
1×8 reinterpret(SArray{Tuple{3},Float64,1,3}, ::Array{Float64,2}):
[0.153872, 0.467841, 0.390566] [0.361708, 0.846298, 0.897737] [0.39703, 0.884588, 0.569882] … [0.390133, 0.156283, 0.377012] [0.185328, 0.232487, 0.140902] [0.585539, 0.22629, 0.513979]
julia> B = vec(copy(ans))
8-element Array{SArray{Tuple{3},Float64,1,3},1}:
[0.1538721224514592, 0.467840786943454, 0.39056612358281706]
[0.3617079493961777, 0.8462982350893753, 0.8977366743282564]
[0.3970299970547111, 0.884587972864584, 0.5698823030478959]
[0.40562472747685074, 0.7988484677138279, 0.49168126614394647]
[0.08813706434793178, 0.14218012559727544, 0.499163319341982]
[0.3901332827772166, 0.15628284837250006, 0.3770117394226711]
[0.18532803309577517, 0.23248748941275688, 0.14090166962667428]
[0.5855387782654986, 0.22628968661452897, 0.5139790762185006]
julia> reshape(reinterpret(Float64, B), (3, 8))
3×8 reshape(reinterpret(Float64, ::Array{SArray{Tuple{3},Float64,1,3},1}), 3, 8) with eltype Float64:
0.153872 0.361708 0.39703 0.405625 0.0881371 0.390133 0.185328 0.585539
0.467841 0.846298 0.884588 0.798848 0.14218 0.156283 0.232487 0.22629
0.390566 0.897737 0.569882 0.491681 0.499163 0.377012 0.140902 0.513979
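Wrapped as a reusable function, a minimal sketch (svecs_to_matrix is a hypothetical name; it assumes the data is already a Vector of SVector{N,T} as above):
using StaticArrays

# view a Vector of SVector{N,T} as an N×M matrix without copying;
# the result shares memory with v, as in the reshape/reinterpret demo above
function svecs_to_matrix(v::Vector{SVector{N,T}}) where {N,T}
    reshape(reinterpret(T, v), (N, length(v)))
end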
Your way is intuitive and already fast. You can improve performance with some @inbounds, and that's about it. vcat is also fast. I think broadcasting is not necessary in your case.
Here are some benchmarks of the various ways I can think of:
function vecvec_to_matrix(vecvec)
    dim1 = length(vecvec)
    dim2 = length(vecvec[1])
    my_array = zeros(Int64, dim1, dim2)
    for i in 1:dim1
        for j in 1:dim2
            my_array[i,j] = vecvec[i][j]
        end
    end
    return my_array
end
function vecvec_to_matrix2(vecvec::AbstractVector{T}) where T <: AbstractVector
    dim1 = length(vecvec)
    dim2 = length(vecvec[1])
    my_array = Array{eltype(vecvec[1]), 2}(undef, dim1, dim2)
    @inbounds @fastmath for i in 1:dim1, j in 1:dim2
        my_array[i,j] = vecvec[i][j]
    end
    return my_array
end
function vecvec_to_matrix3(vecvec::AbstractVector{T}) where T <: AbstractVector
    dim1 = length(vecvec)
    dim2 = length(vecvec[1])
    my_array = Array{eltype(vecvec[1]), 2}(undef, dim1, dim2)
    Threads.@threads for i in 1:dim1
        for j in 1:dim2
            my_array[i,j] = vecvec[i][j]
        end
    end
    return my_array
end
using Tullio
function using_tullio(vecvec::AbstractVector{T}) where T <: AbstractVector
    dim1 = length(vecvec)
    dim2 = length(vecvec[1])
    my_array = Array{eltype(vecvec[1]), 2}(undef, dim1, dim2)
    @tullio my_array[i, j] = vecvec[i][j]
    my_array
end
function using_vcat(vecvec::AbstractVector{T}) where T <: AbstractVector
    # note: this returns one flat dim1*dim2 vector, not a matrix
    vcat(vecvec...)
end
using BenchmarkTools
vecvec = [rand(Int, 100) for i in 1:100];
@benchmark vecvec_to_matrix(vecvec)
@benchmark vecvec_to_matrix2(vecvec)
@benchmark vecvec_to_matrix3(vecvec)
@benchmark using_tullio(vecvec)
@benchmark using_vcat(vecvec)
with results
julia> @benchmark vecvec_to_matrix(vecvec)
BenchmarkTools.Trial:
memory estimate: 78.20 KiB
allocs estimate: 2
--------------
minimum time: 12.701 μs (0.00% GC)
median time: 15.001 μs (0.00% GC)
mean time: 24.465 μs (10.98% GC)
maximum time: 3.884 ms (98.30% GC)
--------------
samples: 10000
evals/sample: 1
julia> @benchmark vecvec_to_matrix2(vecvec)
BenchmarkTools.Trial:
memory estimate: 78.20 KiB
allocs estimate: 2
--------------
minimum time: 8.600 μs (0.00% GC)
median time: 9.800 μs (0.00% GC)
mean time: 19.532 μs (12.37% GC)
maximum time: 3.834 ms (98.82% GC)
--------------
samples: 10000
evals/sample: 1
julia> @benchmark vecvec_to_matrix3(vecvec)
BenchmarkTools.Trial:
memory estimate: 83.28 KiB
allocs estimate: 32
--------------
minimum time: 8.399 μs (0.00% GC)
median time: 14.600 μs (0.00% GC)
mean time: 28.178 μs (11.82% GC)
maximum time: 8.269 ms (0.00% GC)
--------------
samples: 10000
evals/sample: 1
julia> @benchmark using_tullio(vecvec)
BenchmarkTools.Trial:
memory estimate: 78.20 KiB
allocs estimate: 2
--------------
minimum time: 8.299 μs (0.00% GC)
median time: 10.101 μs (0.00% GC)
mean time: 19.476 μs (12.15% GC)
maximum time: 3.661 ms (98.74% GC)
--------------
samples: 10000
evals/sample: 1
julia> @benchmark using_vcat(vecvec)
BenchmarkTools.Trial:
memory estimate: 78.20 KiB
allocs estimate: 2
--------------
minimum time: 5.540 μs (0.00% GC)
median time: 7.480 μs (0.00% GC)
mean time: 16.236 μs (15.30% GC)
maximum time: 876.400 μs (97.85% GC)
--------------
samples: 10000
evals/sample: 5
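One caveat: as the comment in using_vcat above notes, vcat(vecvec...) concatenates everything into one flat vector rather than a matrix. A shape-correct variant (a sketch, not benchmarked here; using_hcat is a hypothetical name):
# reduce(hcat, vecvec) hits a fast specialized method in Base and builds a
# dim2×dim1 matrix whose columns are the inner vectors; permutedims flips it
# to the dim1×dim2 layout produced by vecvec_to_matrix
function using_hcat(vecvec::AbstractVector{<:AbstractVector})
    permutedims(reduce(hcat, vecvec))
end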
I'm using Julia 1.0. Please consider the following code:
using LinearAlgebra
using Distributions
## create random data
const data = rand(Uniform(-1,2), 100000, 2)
function test_function_1(data)
    theta = [1 2]
    coefs = theta * data[:,1:2]'
    res = coefs' .* data[:,1:2]
    return sum(res, dims = 1)'
end
function test_function_2(data)
    theta = [1 2]
    sum_all = zeros(2)
    for i = 1:size(data)[1]
        sum_all .= sum_all + (theta * data[i,1:2])[1] * data[i,1:2]
    end
    return sum_all
end
After running both once to compile, I timed them:
julia> @time test_function_1(data)
0.006292 seconds (16 allocations: 5.341 MiB)
2×1 Adjoint{Float64,Array{Float64,2}}:
150958.47189289227
225224.0374366073
julia> @time test_function_2(data)
0.038112 seconds (500.00 k allocations: 45.777 MiB, 15.61% gc time)
2-element Array{Float64,1}:
150958.4718928927
225224.03743660534
test_function_1 is significantly superior, both in allocations and speed, even though it is not devectorized. I would have expected test_function_2 to perform better. Note that both functions compute the same thing.
I have a hunch that it's because in test_function_2 I use sum_all .= sum_all + ..., but I'm not sure why that's a problem. Can I get a hint?
So first, let me show how I would write your function if I wanted to use a loop:
function test_function_3(data)
    theta = (1, 2)
    sum_all = zeros(2)
    for row in eachrow(data)
        sum_all .+= dot(theta, row) .* row
    end
    return sum_all
end
Next, here is a benchmark comparison of the three options:
julia> @benchmark test_function_1($data)
BenchmarkTools.Trial:
memory estimate: 5.34 MiB
allocs estimate: 16
--------------
minimum time: 1.953 ms (0.00% GC)
median time: 1.986 ms (0.00% GC)
mean time: 2.122 ms (2.29% GC)
maximum time: 4.347 ms (8.00% GC)
--------------
samples: 2356
evals/sample: 1
julia> @benchmark test_function_2($data)
BenchmarkTools.Trial:
memory estimate: 45.78 MiB
allocs estimate: 500002
--------------
minimum time: 16.316 ms (7.44% GC)
median time: 16.597 ms (7.63% GC)
mean time: 16.845 ms (8.01% GC)
maximum time: 34.050 ms (4.45% GC)
--------------
samples: 297
evals/sample: 1
julia> @benchmark test_function_3($data)
BenchmarkTools.Trial:
memory estimate: 96 bytes
allocs estimate: 1
--------------
minimum time: 777.204 μs (0.00% GC)
median time: 791.458 μs (0.00% GC)
mean time: 799.505 μs (0.00% GC)
maximum time: 1.262 ms (0.00% GC)
--------------
samples: 6253
evals/sample: 1
Next you can go a bit faster if you explicitly implement the dot in the loop:
julia> function test_function_4(data)
           theta = (1, 2)
           sum_all = zeros(2)
           for row in eachrow(data)
               @inbounds sum_all .+= (theta[1]*row[1] + theta[2]*row[2]) .* row
           end
           return sum_all
       end
test_function_4 (generic function with 1 method)
julia> @benchmark test_function_4($data)
BenchmarkTools.Trial:
memory estimate: 96 bytes
allocs estimate: 1
--------------
minimum time: 502.367 μs (0.00% GC)
median time: 502.547 μs (0.00% GC)
mean time: 505.446 μs (0.00% GC)
maximum time: 806.631 μs (0.00% GC)
--------------
samples: 9888
evals/sample: 1
To understand the differences let us have a look at this line of your code:
sum_all .= sum_all + (theta * data[i,1:2])[1] * data[i,1:2]
Let us count the memory allocations you do in this expression:
sum_all .=
    sum_all
    +               # allocation of a new vector as a result of addition
    (theta
     *              # allocation of a new vector as a result of multiplication
     data[i,1:2]    # allocation of a new vector via getindex
    )[1]
    *               # allocation of a new vector as a result of multiplication
    data[i,1:2]     # allocation of a new vector via getindex
So you can see that in each iteration of the loop you allocate five times.
Allocations are expensive, and you can see this in the benchmarks: you get 500002 allocations in the process:
1 allocation of sum_all
1 allocation of theta
500000 allocations in the loop (5 * 100000)
Additionally, indexing like data[i,1:2] performs bounds checking, which is also a small cost (but marginal in comparison to the allocations).
Now in function test_function_3 I use eachrow(data). This time I also get rows of the data matrix, but they are returned as views (not new vectors), so no allocation happens inside the loop. Next I use the dot function to avoid the allocation that was earlier caused by matrix multiplication (I have changed theta to a Tuple from a Matrix as dot is then a bit faster, but this is secondary). Finally I write sum_all .+= dot(theta, row) .* row, and in this case all operations are broadcasted, so Julia can do broadcast fusion (again - no allocations happen).
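A minimal sketch of the slicing-versus-views distinction this relies on (sizes illustrative):
A = rand(10, 2)
r1 = A[1, :]            # getindex slice: allocates a new Vector
r2 = @view A[1, :]      # view into A: no allocation
r3 = first(eachrow(A))  # eachrow yields exactly such views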
In test_function_4 I just replace dot by an unrolled loop, as we know we have two elements to compute the dot product over. Actually, if you fully unroll everything and use @simd it gets even faster:
julia> function test_function_5(data)
           theta = (1, 2)
           s1 = 0.0
           s2 = 0.0
           @inbounds @simd for i in axes(data, 1)
               r1 = data[i, 1]
               r2 = data[i, 2]
               mul = theta[1]*r1 + theta[2]*r2
               s1 += mul * r1
               s2 += mul * r2
           end
           return [s1, s2]
       end
test_function_5 (generic function with 1 method)
julia> @benchmark test_function_5($data)
BenchmarkTools.Trial:
memory estimate: 96 bytes
allocs estimate: 1
--------------
minimum time: 22.721 μs (0.00% GC)
median time: 23.146 μs (0.00% GC)
mean time: 24.306 μs (0.00% GC)
maximum time: 100.109 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
So you can see that this way you are around 100x faster than with test_function_1. Still, test_function_3 is already relatively fast and fully generic, so normally I would probably write something like test_function_3 unless I really needed to be super fast and knew that the dimensions of my data are fixed and small.
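A quick sanity check (a sketch, using data as defined in the question) that the variants agree:
# all variants should agree up to floating-point roundoff
res1 = vec(test_function_1(data)) # 2×1 Adjoint flattened to a Vector
res3 = test_function_3(data)
res5 = test_function_5(data)
isapprox(res1, res3) && isapprox(res3, res5) # expect true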
Consider
x = rand([missing, rand(Int, 100)...], 1_000_000)
which yields typeof(x) = Array{Union{Missing, Int64},1}.
What's the most efficient way to count the number of missings in x?
The cleanest way is probably just
count(ismissing, x)
Simple, easy to remember, and fast
Since you're asking for the "most efficient" way, let me give some benchmark results. It is slightly faster than @xiaodai's answer, and as fast as a simple loop implementation:
julia> @btime count($ismissing, $x);
278.499 μs (0 allocations: 0 bytes)
julia> @btime mapreduce($ismissing, $+, $x);
293.901 μs (0 allocations: 0 bytes)
julia> @btime count_missing($x)
278.499 μs (0 allocations: 0 bytes)
where
julia> function count_missing(x)
           c = 0
           @inbounds for i in eachindex(x)
               if ismissing(x[i])
                   c += 1
               end
           end
           return c
       end
Abstraction for no cost, just the way you'd want it to be.
If you know that your number of missing elements is less than 4 billion (or less than 65k), you can be several times faster than @crstnbr's answer with the following code:
function count_missing(x, T)
    c = zero(T)
    for i in 1:length(x)
        c += @inbounds ismissing(x[i])
    end
    return Int(c) # we want to have a stable result type
    # this could be further combined with a barrier function
    # that could check the size of `x` at runtime
end
Now the benchmarks.
This is the original time on my laptop:
julia> @btime count_missing($x, Int)
227.799 μs (0 allocations: 0 bytes)
9971
Slash the time by half if you know there is less than 4 billion matching elements:
julia> @btime count_missing($x, UInt32)
113.899 μs (0 allocations: 0 bytes)
9971
Slash the time by 8x if you know there is less than 65k matching elements:
julia> @btime count_missing($x, UInt16)
29.200 μs (0 allocations: 0 bytes)
9971
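A sketch of the barrier-function idea from the comment in the code above (count_missing_auto is a hypothetical name; length(x) is a safe upper bound on the number of missings):
function count_missing_auto(x)
    n = length(x)
    # pick the narrowest counter type that length(x) proves safe,
    # then call the type-stable kernel above
    n <= typemax(UInt16) && return count_missing(x, UInt16)
    n <= typemax(UInt32) && return count_missing(x, UInt32)
    return count_missing(x, Int)
end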
This is an unsafe answer and is not guaranteed to work in the future if Julia changes the memory layout, but it's fun:
x = Vector{Union{Missing, Float64}}(missing, 100_000_000)
x[rand(1:100_000_000, 90_000_000)] .= rand.()
using BenchmarkTools
@benchmark count($ismissing, $x)
# BenchmarkTools.Trial:
# memory estimate: 0 bytes
# allocs estimate: 0
# --------------
# minimum time: 48.468 ms (0.00% GC)
# median time: 51.755 ms (0.00% GC)
# mean time: 66.863 ms (0.00% GC)
# maximum time: 91.449 ms (0.00% GC)
# --------------
# samples: 76
# evals/sample: 1
function unsafe_count_missing(x::Vector{Union{Missing, T}}) where T
    @assert isbitstype(T)
    l = length(x)
    # the per-element type-tag bytes of the Union array are stored in a block
    # right after the data; wrap them as a UInt8 vector without copying
    GC.@preserve x begin
        y = unsafe_wrap(Vector{UInt8}, Ptr{UInt8}(pointer(x) + sizeof(T)*l), l)
        res = reduce(-, y; init = l) # l minus the sum of the tag bytes
    end
    res
end
@time count(ismissing, x) == unsafe_count_missing(x)
@benchmark unsafe_count_missing($x)
# BenchmarkTools.Trial:
# memory estimate: 80 bytes
# allocs estimate: 1
# --------------
# minimum time: 9.190 ms (0.00% GC)
# median time: 9.718 ms (0.00% GC)
# mean time: 9.845 ms (0.00% GC)
# maximum time: 15.691 ms (0.00% GC)
# --------------
# samples: 508
# evals/sample: 1
This question shows how to repeat individual characters in strings in Python.
>>> s = '123abc'
>>> n = 3
>>> ''.join([c*n for c in s])
'111222333aaabbbccc'
How would you do that in Julia?
EDIT
As a newcomer to Julia I am amazed at what the language has to offer.
For example, I would have thought that the Python code above is about as simple as the code could get in any language. However, as shown by my answer below, the Julia equivalent join([c^n for c in s]) is arguably simpler, and may be reaching the optimum of simplicity for any language.
On the other hand, @niczky12 has shown that by splatting the generator into the string function with the ... operator, the speed can be substantially increased over what the somewhat simpler join achieves.
In one case Julia shines for simplicity; in the other, Julia shines for speed.
To a Python programmer the first case should be almost immediately readable once they notice that c^n is just c*n in Python. When they see the speed increase from the ... operator, the extra complexity might not deter them from learning Julia. Readers might be starting to think that I hope many Python programmers will take Julia seriously; they would not be wrong.
Thanks to @rickhg12hs for suggesting benchmarking. I have learned a lot.
In addition to the answers above, I found that the string function runs even faster. Here are my benchmarks:
julia> n = 2;
julia> s = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
julia> string((c^n for c in s)...) # proof that it works
"AABBCCDDEEFFGGHHIIJJKKLLMMNNOOPPQQRRSSTTUUVVWWXXYYZZ"
julia> n = 26000;
julia> @benchmark join(c^n for c in s)
BenchmarkTools.Trial:
memory estimate: 1.44 MiB
allocs estimate: 36
--------------
minimum time: 390.616 μs (0.00% GC)
median time: 425.861 μs (0.00% GC)
mean time: 484.638 μs (6.54% GC)
maximum time: 45.006 ms (98.99% GC)
--------------
samples: 10000
evals/sample: 1
julia> @benchmark string((c^n for c in s)...)
BenchmarkTools.Trial:
memory estimate: 1.29 MiB
allocs estimate: 31
--------------
minimum time: 77.480 μs (0.00% GC)
median time: 101.667 μs (0.00% GC)
mean time: 126.455 μs (0.00% GC)
maximum time: 832.524 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
As you can see, it's about 3 times faster than the join solution proposed by @Julia Learner.
I tested the above on 0.7 and got no deprecation warnings, so I'm assuming it works fine on 1.0 too. Even TIO says so.
You can do it with either a Julia comprehension or a generator.
julia> VERSION
v"1.0.0"
julia> s = "123abc"
"123abc"
# n is number of times to repeat each character.
julia> n = 3
3
# Using a Julia comprehension with [...]
julia> join([c^n for c in s])
"111222333aaabbbccc"
# Using a Julia generator without the [...]
julia> join(c^n for c in s)
"111222333aaabbbccc"
For small strings there should be little practical difference in speed.
Edit
TL;DR: In general, the generator is somewhat faster than the comprehension. However, see case 3 for the opposite. The memory estimates were very similar.
@rickhg12hs has suggested it would be nice to have benchmarks.
Using the great BenchmarkTools package, the results are below.
n = the number of times to repeat each character
s = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" in each case
In each case, the comprehension median time, C, is listed first, vs the generator median time, G, second. The times were rounded as seemed appropriate and the original figures are below the numbered summaries. Smaller, of course, is better.
The memory estimates were not very different.
1. n = 26, C=3.8 vs. G=2.8 μs, G faster
julia> using BenchmarkTools
julia> n = 26;
julia> @benchmark join([c^n for c in s])
BenchmarkTools.Trial:
memory estimate: 3.55 KiB
allocs estimate: 39
--------------
minimum time: 3.688 μs (0.00% GC)
median time: 3.849 μs (0.00% GC)
mean time: 4.956 μs (16.27% GC)
maximum time: 5.211 ms (99.85% GC)
--------------
samples: 10000
evals/sample: 8
julia> @benchmark join(c^n for c in s)
BenchmarkTools.Trial:
memory estimate: 3.19 KiB
allocs estimate: 36
--------------
minimum time: 2.661 μs (0.00% GC)
median time: 2.756 μs (0.00% GC)
mean time: 3.622 μs (19.94% GC)
maximum time: 4.638 ms (99.89% GC)
--------------
samples: 10000
evals/sample: 9
2. n = 260, C=10.7 vs. G=8.1 μs, G faster
julia> n = 260;
julia> @benchmark join([c^n for c in s])
BenchmarkTools.Trial:
memory estimate: 19.23 KiB
allocs estimate: 39
--------------
minimum time: 8.125 μs (0.00% GC)
median time: 10.691 μs (0.00% GC)
mean time: 18.559 μs (35.36% GC)
maximum time: 43.930 ms (99.92% GC)
--------------
samples: 10000
evals/sample: 1
julia> @benchmark join(c^n for c in s)
BenchmarkTools.Trial:
memory estimate: 18.88 KiB
allocs estimate: 36
--------------
minimum time: 7.270 μs (0.00% GC)
median time: 8.126 μs (0.00% GC)
mean time: 10.872 μs (18.04% GC)
maximum time: 10.592 ms (99.87% GC)
--------------
samples: 10000
evals/sample: 4
3. n = 2,600, C=62.3 vs. G=63.7 μs, C faster
julia> n = 2600;
julia> @benchmark join([c^n for c in s])
BenchmarkTools.Trial:
memory estimate: 150.16 KiB
allocs estimate: 39
--------------
minimum time: 51.746 μs (0.00% GC)
median time: 63.293 μs (0.00% GC)
mean time: 77.315 μs (2.79% GC)
maximum time: 3.721 ms (96.85% GC)
--------------
samples: 10000
evals/sample: 1
julia> @benchmark join(c^n for c in s)
BenchmarkTools.Trial:
memory estimate: 149.80 KiB
allocs estimate: 36
--------------
minimum time: 47.897 μs (0.00% GC)
median time: 63.720 μs (0.00% GC)
mean time: 88.716 μs (17.58% GC)
maximum time: 42.457 ms (99.83% GC)
--------------
samples: 10000
evals/sample: 1
4. n = 26,000, C=667 vs. G=516 μs, G faster
julia> n = 26000;
julia> @benchmark join([c^n for c in s])
BenchmarkTools.Trial:
memory estimate: 1.44 MiB
allocs estimate: 39
--------------
minimum time: 457.589 μs (0.00% GC)
median time: 666.710 μs (0.00% GC)
mean time: 729.592 μs (10.91% GC)
maximum time: 42.673 ms (98.76% GC)
--------------
samples: 6659
evals/sample: 1
julia> @benchmark join(c^n for c in s)
BenchmarkTools.Trial:
memory estimate: 1.44 MiB
allocs estimate: 36
--------------
minimum time: 475.977 μs (0.00% GC)
median time: 516.176 μs (0.00% GC)
mean time: 659.001 μs (10.36% GC)
maximum time: 42.268 ms (98.41% GC)
--------------
samples: 7548
evals/sample: 1
Code tested in Version 1.0.0 (2018-08-08).
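Yet another option, a sketch with a hypothetical repeat_chars helper (not benchmarked above): write directly into an IOBuffer, avoiding the intermediate per-character strings entirely:
function repeat_chars(s::AbstractString, n::Integer)
    io = IOBuffer()
    for c in s, _ in 1:n
        print(io, c)  # append c to the buffer n times
    end
    return String(take!(io))
end
repeat_chars("123abc", 3) # "111222333aaabbbccc"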
When I try to write map(x -> x^3, "123abc"), I get an error:
julia> map(x -> x^3, "123abc")
ERROR: ArgumentError: map(f, s::AbstractString) requires f to return AbstractChar; try map(f, collect(s)) or a comprehension instead
So there is another way to do it:
julia> map(x -> x^3, collect("123abc"))
6-element Array{String,1}:
"111"
"222"
"333"
"aaa"
"bbb"
"ccc"
julia> join(map(x -> x^3, collect("123abc")))
"111222333aaabbbccc"
And maybe repeat is more convenient:
julia> repeat(collect("123abc"), inner=3)
18-element Array{Char,1}:
'1'
'1'
'1'
'2'
'2'
'2'
'3'
'3'
'3'
'a'
'a'
'a'
'b'
'b'
'b'
'c'
'c'
'c'
julia> join(repeat(collect("123abc"), inner=3))
"111222333aaabbbccc"
I have a program in which the main() function takes four arguments. When I run @code_warntype on the function there seems to be nothing untoward. All the variables have specified types, and there are no instances of UNION or other obvious warning signs.
Apologies, the program is rather long, but I'm not sure how to shorten it while retaining the problem:
function main(n::Int, dice::Int=6, start::Int=1, modal::Int=3)::Tuple{String, Vector{String}, Vector{Float64}}
    board = String["GO", "A1", "CC1", "A2", "T1", "R1", "B1", "CH1", "B2", "B3",
                   "JAIL", "C1", "U1", "C2", "C3", "R2", "D1", "CC2", "D2", "D3",
                   "FP", "E1", "CH2", "E2", "E3", "R3", "F1", "F2", "U2", "F3",
                   "G2J", "G1", "G2", "CC3", "G3", "R4", "CH3", "H1", "T2", "H2"]
    cc_cards = shuffle(collect(1:16))
    ch_cards = shuffle(collect(1:16))
    function take_cc_card(square::Int, cards::Vector{Int})::Tuple{Int, Vector{Int}}
        if cards[1] == 1
            square = findfirst(board, "GO")
        elseif cards[1] == 2
            square = findfirst(board, "JAIL")
        end
        p = pop!(cards)
        unshift!(cards, p)
        return square, cards
    end
    function take_ch_card(square::Int, cards::Vector{Int})::Tuple{Int, Vector{Int}}
        if cards[1] == 1
            square = findfirst(board, "GO")
        elseif cards[1] == 2
            square = findfirst(board, "JAIL")
        elseif cards[1] == 3
            square = findfirst(board, "C1")
        elseif cards[1] == 4
            square = findfirst(board, "E3")
        elseif cards[1] == 5
            square = findfirst(board, "H2")
        elseif cards[1] == 6
            square = findfirst(board, "R1")
        elseif cards[1] == 7 || cards[1] == 8
            if board[square] == "CH1"
                square = findfirst(board, "R2")
            elseif board[square] == "CH2"
                square = findfirst(board, "R3")
            elseif board[square] == "CH3"
                square = findfirst(board, "R1")
            end
        elseif cards[1] == 9
            if board[square] == "CH1"
                square = findfirst(board, "U1")
            elseif board[square] == "CH2"
                square = findfirst(board, "U2")
            elseif board[square] == "CH3"
                square = findfirst(board, "U1")
            end
        elseif cards[1] == 10
            square = (square - 3) % 40 + ((square - 3) % 40 == 0 ? 40 : 0)
        end
        p = pop!(cards)
        unshift!(cards, p)
        return square, cards
    end
    result = zeros(Int, 40)
    consec_doubles = 0
    square = 1
    for i = 1:n
        throw_1 = rand(collect(1:dice))
        throw_2 = rand(collect(1:dice))
        if throw_1 == throw_2
            consec_doubles += 1
        else
            consec_doubles = 0
        end
        if consec_doubles != 3
            move = throw_1 + throw_2
            square = (square + move) % 40 + ((square + move) % 40 == 0 ? 40 : 0)
            if board[square] == "G2J"
                square = findfirst(board, "JAIL")
            elseif board[square][1:2] == "CC"
                square, cc_cards = take_cc_card(square, cc_cards)
            elseif board[square][1:2] == "CH"
                square, ch_cards = take_ch_card(square, ch_cards)
                if board[square][1:2] == "CC"
                    square, cc_cards = take_cc_card(square, cc_cards)
                end
            end
        else
            square = findfirst(board, "JAIL")
            consec_doubles = 0
        end
        if i >= start
            result[square] += 1
        end
    end
    result_tuple = Vector{Tuple{Float64, Int}}()
    for i = 1:40
        percent = result[i] * 100 / sum(result)
        push!(result_tuple, (percent, i))
    end
    sort!(result_tuple, lt = (x, y) -> isless(x[1], y[1]), rev = true)
    modal_squares = Vector{String}()
    modal_string = ""
    modal_percents = Vector{Float64}()
    for i = 1:modal
        push!(modal_squares, board[result_tuple[i][2]])
        push!(modal_percents, result_tuple[i][1])
        k = result_tuple[i][2] - 1
        modal_string *= (k < 10 ? ("0" * string(k)) : string(k))
    end
    return modal_string, modal_squares, modal_percents
end
@code_warntype main(1_000_000, 4, 101, 5)
However, when I change the last three arguments to keywords by inserting a semi-colon rather than a comma after the first argument...
function main(n::Int; dice::Int=6, start::Int=1, modal::Int=3) ::Tuple{String, Vector{String}, Vector{Float64}}
...I seem to run into type stability problems.
@code_warntype main(1_000_000, dice=4, start=101, modal=5)
I'm now getting a temporary variable with an ANY type and an instance of UNION in the main text when I run @code_warntype.
Curiously this doesn't seem to come with a performance hit, as on an average of three benchmark tests the 'argument' version runs in 431.594 ms and the 'keyword' version runs in 413.149 ms. However, I'm curious to know:
(a) why this is happening;
(b) whether, as a general rule, the appearance of temporary variables with an ANY type is a cause for concern; and
(c) whether, as a general rule, there is any advantage from a performance perspective from using keywords rather than normal function arguments.
Here is my take on the three questions. In the answer I assume Julia 0.6.3 unless I explicitly state that I refer to Julia 0.7 at the end of the post.
(a) The code with the Any variable is the part of the code responsible for handling keyword arguments (e.g. making sure that a passed keyword argument is allowed by the function signature). The reason is that keyword arguments are received as a Vector{Any} inside the function. The vector holds ([argument name], [argument value]) tuples.
The actual "work" the function does happens after this part with the Any variable.
You can see this by comparing calls:
@code_warntype main(1_000_000, dice=4, start=101, modal=5)
and
@code_warntype main(1_000_000)
for the function with keyword arguments. The second call has only the last line of the report produced by the first call above; all the other lines are responsible for handling the passed keyword arguments.
(b) as a general rule this can be a concern of course, but in this case it cannot be helped. The variable with Any holds information about the name of the keyword argument.
(c) in general you can assume that positional arguments are never slower than keyword arguments, but can be faster. Here is an MWE (actually, if you run @code_warntype f(a=10) you will see this Any variable too):
julia> using BenchmarkTools
julia> f(;a::Int=1) = a+1
f (generic function with 1 method)
julia> g(a::Int=1) = a+1
g (generic function with 2 methods)
julia> @benchmark f()
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 1.865 ns (0.00% GC)
median time: 1.866 ns (0.00% GC)
mean time: 1.974 ns (0.00% GC)
maximum time: 14.463 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark f(a=10)
BenchmarkTools.Trial:
memory estimate: 96 bytes
allocs estimate: 1
--------------
minimum time: 52.994 ns (0.00% GC)
median time: 54.413 ns (0.00% GC)
mean time: 65.207 ns (10.65% GC)
maximum time: 3.466 μs (94.78% GC)
--------------
samples: 10000
evals/sample: 986
julia> @benchmark g()
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 1.865 ns (0.00% GC)
median time: 1.866 ns (0.00% GC)
mean time: 1.954 ns (0.00% GC)
maximum time: 13.062 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark g(10)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 1.865 ns (0.00% GC)
median time: 1.866 ns (0.00% GC)
mean time: 1.949 ns (0.00% GC)
maximum time: 13.063 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
Now you can see that the penalty for a keyword argument is actually paid when it is passed (and this is exactly the case where you get the Any variable in @code_warntype, as Julia has to do more work then). Note that the penalty is small; it is visible in functions doing very little work, and for functions that do a lot of computation it can be ignored most of the time.
Additionally, note that if you do not specify the type of the keyword argument, the penalty is much bigger when explicitly passing a keyword argument value, because Julia does not dispatch on keyword argument types (you can also run @code_warntype to witness this):
julia> h(;a=1) = a+1
h (generic function with 1 method)
julia> @benchmark h()
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 1.865 ns (0.00% GC)
median time: 1.866 ns (0.00% GC)
mean time: 1.960 ns (0.00% GC)
maximum time: 13.996 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark h(a=10)
BenchmarkTools.Trial:
memory estimate: 96 bytes
allocs estimate: 1
--------------
minimum time: 75.433 ns (0.00% GC)
median time: 77.355 ns (0.00% GC)
mean time: 89.037 ns (7.87% GC)
maximum time: 2.128 μs (89.73% GC)
--------------
samples: 10000
evals/sample: 971
In Julia 0.7 keyword arguments are received as a Base.Iterators.Pairs holding a NamedTuple, so Julia knows the types of the passed arguments at compile time. This means that using keyword arguments is faster than in Julia 0.6.3 (but again - you should not expect them to be faster than positional arguments). You can see this by running similar benchmarks to the above under Julia 0.7 (I have only changed what the function does a bit, to give the Julia compiler a bit more work); you can also have a look at @code_warntype on those functions to see that type inference works better in Julia 0.7:
julia> using BenchmarkTools
julia> f(;a::Int=1) = [a]
f (generic function with 1 method)
julia> g(a::Int=1) = [a]
g (generic function with 2 methods)
julia> h(;a=1) = [a]
h (generic function with 1 method)
julia> @benchmark f()
BenchmarkTools.Trial:
memory estimate: 96 bytes
allocs estimate: 1
--------------
minimum time: 31.724 ns (0.00% GC)
median time: 34.523 ns (0.00% GC)
mean time: 50.576 ns (22.80% GC)
maximum time: 53.465 μs (99.89% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark f(a=10)
BenchmarkTools.Trial:
memory estimate: 96 bytes
allocs estimate: 1
--------------
minimum time: 31.724 ns (0.00% GC)
median time: 34.057 ns (0.00% GC)
mean time: 50.739 ns (22.83% GC)
maximum time: 55.303 μs (99.89% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark g()
BenchmarkTools.Trial:
memory estimate: 96 bytes
allocs estimate: 1
--------------
minimum time: 31.724 ns (0.00% GC)
median time: 34.523 ns (0.00% GC)
mean time: 50.529 ns (22.77% GC)
maximum time: 54.501 μs (99.89% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark g(10)
BenchmarkTools.Trial:
memory estimate: 96 bytes
allocs estimate: 1
--------------
minimum time: 31.724 ns (0.00% GC)
median time: 34.523 ns (0.00% GC)
mean time: 50.899 ns (23.27% GC)
maximum time: 56.246 μs (99.90% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark h()
BenchmarkTools.Trial:
memory estimate: 96 bytes
allocs estimate: 1
--------------
minimum time: 31.257 ns (0.00% GC)
median time: 34.057 ns (0.00% GC)
mean time: 50.924 ns (22.87% GC)
maximum time: 55.724 μs (99.88% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark h(a=10)
BenchmarkTools.Trial:
memory estimate: 96 bytes
allocs estimate: 1
--------------
minimum time: 31.724 ns (0.00% GC)
median time: 34.057 ns (0.00% GC)
mean time: 50.864 ns (22.60% GC)
maximum time: 53.389 μs (99.83% GC)
--------------
samples: 10000
evals/sample: 1000