Mergesort implementation in Julia - algorithm

I'm trying to implement the merge sort algorithm in Julia, but I cannot seem to understand the recursion step needed for the algorithm. My code is the following:
mₐ = [1, 10, 7, 4, 3, 6, 8, 2, 9]
b₁(t, z, half₁, half₂) = ((t <= length(half₁)) && (z <= length(half₂))) && (half₁[t] < half₂[z])
b₂(t, z, half₁, half₂) = ((z <= length(half₂)) && (t <= length(half₁))) && (half₁[t] > half₂[z])

function Merge(m₁, m₂)
    N = length(m₁) + length(m₂)
    B = zeros(N)
    i = 1
    j = 1
    for k in 1:N
        if b₁(i, j, m₁, m₂)
            B[k] = m₁[i]
            i += 1
        elseif b₂(i, j, m₁, m₂)
            B[k] = m₂[j]
            j += 1
        elseif j >= length(m₂)
            B[k] = m₁[i]
            i += 1
        elseif i >= length(m₁)
            B[k] = m₂[j]
            j += 1
        end
    end
    return B
end

function MergeSort(M)
    if length(M) == 1
        return M
    elseif length(M) == 0
        return nothing
    end
    n = length(M)
    i₁ = n ÷ 2
    i₂ = n - i₁
    h₁ = M[1:i₁]
    h₂ = M[i₂:end]
    C = MergeSort(h₁)
    D = MergeSort(h₂)
    return Merge(C, D)
end

MergeSort(mₐ)
It always gets stuck once C becomes a single element, because the function returns it and then splits it again. The only fix I have found is to switch to a loop once the input is a single element, but that would no longer be a recursive approach.
Solution
Taking @Sundar R's answer and suggestions, this is a working implementation:
# implementation of MergeSort in Julia
# merge function: joins two ordered arrays into one single ordered array
function merge(m₁, m₂)
    N = length(m₁) + length(m₂)
    # create a zeros array with the same element type as the input (here Int64)
    B = zeros(eltype(m₁), N)
    i = 1
    j = 1
    for k in 1:N
        if !checkbounds(Bool, m₁, i)
            B[k] = m₂[j]
            j += 1
        elseif !checkbounds(Bool, m₂, j)
            B[k] = m₁[i]
            i += 1
        elseif m₁[i] < m₂[j]
            B[k] = m₁[i]
            i += 1
        else
            B[k] = m₂[j]
            j += 1
        end
    end
    return B
end

# mergeSort: recursively sorts an array M by sorting and merging its two halves
function mergeSort(M)
    # base cases
    if length(M) == 1
        return M
    elseif length(M) == 0
        return nothing
    end
    # dividing the array in two
    n = length(M)
    i₁ = n ÷ 2
    # be careful with the indices, thank you @Sundar R
    i₂ = i₁ + 1
    h₁ = M[1:i₁]
    h₂ = M[i₂:end]
    # recursively sorting the halves
    C = mergeSort(h₁)
    D = mergeSort(h₂)
    return merge(C, D)
end

# test the function
mₐ = [1, 10, 7, 4, 3, 6, 8, 2, 9]
b = mergeSort(mₐ)
println(b)

The issue is with the indices used for splitting, specifically i₂. n - i₁ is the number of elements in the second half of the array, but not necessarily the index where the second half starts - for that you just want i₂ = i₁ + 1.
With i₂ = n - i₁, when n is 2 i.e. when you come down to [1, 10] as the array to sort, i₁ = n ÷ 2 is 1, and i₂ is (2 - 1) = 1 also. So instead of splitting it into [1], [10], you end up "splitting" it into [1], and [1, 10], hence the infinite looping.
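You can see this concretely in the REPL (my own check, not part of the original answer):
julia> M = [1, 10]; i₁ = length(M) ÷ 2; i₂ = length(M) - i₁;
julia> M[1:i₁], M[i₂:end]  # the second "half" still contains the whole array
([1], [1, 10])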
Once you fix that, there's a BoundsError from Merge because of a minor mistake: the elseif conditions should check for >, not >= (since Julia uses 1-based indexing, j is still a valid index when j == length(m₂)).
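To make the off-by-one concrete (again my own check): with 1-based indexing, j == length(m₂) still points at a valid element, so the >= test fires one step too early:
julia> m₂ = [4, 5]; j = 2;
julia> j >= length(m₂), checkbounds(Bool, m₂, j)  # branch fires although m₂[j] is still valid
(true, true)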
Some other suggestions:
zeros(N) returns a Float64 array, so the result here will always be a float array. I'd suggest zeros(eltype(m₁), N) instead.
It feels like b₁ and b₂ only complicate the code and make it less clear. I'd suggest a simple nested if there: an outer one to check the indices - look up checkbounds, e.g. checkbounds(Bool, m₁, i) - and an inner one to see which element is greater (see the snippet just after this list).
Julia convention is to use lowercase names for functions, so merge and mergesort instead of Merge and MergeSort.
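A quick REPL illustration of the eltype and checkbounds suggestions (my own example, not from the original answer):
julia> a = [10, 20, 30];
julia> checkbounds(Bool, a, 3), checkbounds(Bool, a, 4)  # bounds test that returns a Bool instead of throwing
(true, false)
julia> eltype(zeros(3)), eltype(zeros(eltype(a), 3))  # default Float64 vs. matching the input's element type
(Float64, Int64)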

To add to the previous answers, which deal with some of the problems in your existing code, here is for reference a relatively efficient and straightforward Julia implementation of mergesort:
# Top-level function will allocate temporary arrays for convenience
function mergesort(A)
    S = similar(A)
    return mergesort!(copy(A), S)
end

# Efficient in-place version
# S is a temporary working (scratch) array
function mergesort!(A, S, n=length(A))
    width = 1
    swapcount = 0
    while width < n
        # A is currently full of sorted runs of length `width` (starting with width=1)
        for i = 1:2*width:n
            # Merge two sorted lists, left and right:
            # left = A[i:i+width-1], right = A[i+width:i+2*width-1]
            merge!(A, i, min(i+width, n+1), min(i+2*width, n+1), S)
        end
        # Swap the pointers of `A` and `S` such that `A` now contains merged
        # runs of length 2*width.
        S, A = A, S
        swapcount += 1
        # Double the width and continue
        width *= 2
    end
    # Optional, if it is important that `A` be sorted in-place:
    if isodd(swapcount)
        # If we've swapped A and S an odd number of times, copy `A` back to `S`
        # since `S` will by now refer to the memory initially provided as input
        # array `A`, which the user will expect to have been sorted in-place
        copyto!(S, A)
    end
    return A
end

# Merge two sorted subarrays, left and right:
# left = A[iₗ:iᵣ-1], right = A[iᵣ:iₑ-1]
@inline function merge!(A, iₗ, iᵣ, iₑ, S)
    left, right = iₗ, iᵣ
    @inbounds for n = iₗ:(iₑ-1)
        if (left < iᵣ) && (right >= iₑ || A[left] <= A[right])
            S[n] = A[left]
            left += 1
        else
            S[n] = A[right]
            right += 1
        end
    end
end
This is enough to get us in the same ballpark as Base's implementation of the same algorithm:
julia> using BenchmarkTools
julia> @benchmark mergesort!(A,B) setup = (A = rand(50); B = similar(A))
BenchmarkTools.Trial: 10000 samples with 194 evaluations.
Range (min … max): 497.062 ns … 1.294 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 501.438 ns ┊ GC (median): 0.00%
Time (mean ± σ): 526.171 ns ± 49.011 ns ┊ GC (mean ± σ): 0.00% ± 0.00%
█▅ ▁ ▁ ▃▇▄ ▁ ▂
█████▇▇▆▇█▇████▇▅▆▅▅▅▆█▆██▄▅▅▄▆██▆▆▄▄▆██▅▃▄██▄▅▅▃▃▃▃▄▅▁▄▄▃▁█ █
497 ns Histogram: log(frequency) by time 718 ns <
Memory estimate: 0 bytes, allocs estimate: 0.
julia> issorted(mergesort(rand(50)))
true
julia> issorted(mergesort(rand(10_000)))
true
julia> @benchmark Base.sort!(A, alg=MergeSort) setup=(A = rand(50))
BenchmarkTools.Trial: 10000 samples with 216 evaluations.
Range (min … max): 344.690 ns … 11.294 μs ┊ GC (min … max): 0.00% … 95.73%
Time (median): 352.917 ns ┊ GC (median): 0.00%
Time (mean ± σ): 401.700 ns ± 378.399 ns ┊ GC (mean ± σ): 3.57% ± 3.76%
█▇▄▄▄▂▁▂▁▂▃▁▁ ▃▂ ▁ ▁▁ ▁
████████████████▇██████▆▆▆▅▆▆▆▆▅▃▅▅▄▅▃▅▅▄▆▅▄▅▄▅▃▄▄██▇▅▆▆▇▆▄▅▅ █
345 ns Histogram: log(frequency) by time 741 ns <
Memory estimate: 336 bytes, allocs estimate: 3.
though in most numeric cases both cost a good bit more, in both time and memory (the latter due to the need for the working array), than a similarly efficient pure-Julia implementation of quicksort!:
julia> @benchmark VectorizedStatistics.quicksort!(A) setup = (A = rand(50))
BenchmarkTools.Trial: 10000 samples with 993 evaluations.
Range (min … max): 28.854 ns … 175.821 ns ┊ GC (min … max): 0.00% … 0.00%
Time (median): 35.268 ns ┊ GC (median): 0.00%
Time (mean ± σ): 38.703 ns ± 7.478 ns ┊ GC (mean ± σ): 0.00% ± 0.00%
▂ ▃█▁ ▃▃ ▃▆▂ ▂ ▃ ▂ ▁ ▂ ▂
█▆▃▅▁▁▄▅███▆███▆▆███▁▇█▇▅▇█▆▇█▁▆▅▃▅▄▄██▅▆▅▇▅▄▃▁▄▃▁▄▁▃▃▃▁▄▄▇█ █
28.9 ns Histogram: log(frequency) by time 68.7 ns <
Memory estimate: 0 bytes, allocs estimate: 0.

Related

Find index of maximum element satisfying condition (Julia)

In Julia I can use argmax(X) to find the max element. If I want to find all elements satisfying condition C I can use findall(C, X). But how can I combine the two? What's the most efficient/idiomatic/concise way to find the index of the maximum element satisfying some condition in Julia?
If you'd like to avoid allocations, filtering the array lazily would work:
idx_filtered = (i for (i, el) in pairs(X) if C(el))
argmax(i -> X[i], idx_filtered)
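For example, with a small vector (my own data, chosen to match the example further down):
julia> X = [5, 4, -3, -5]; C = <(0);
julia> argmax(i -> X[i], (i for (i, el) in pairs(X) if C(el)))
3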
Unfortunately, this is about twice as slow as a hand-written version. (edit: in my benchmarks, it's 2x slower on Intel Xeon Platinum but nearly equal on Apple M1)
function byhand(C, X)
    start = findfirst(C, X)
    isnothing(start) && return nothing
    imax, max = start, X[start]
    for i = start:lastindex(X)
        if C(X[i]) && X[i] > max
            imax, max = i, X[i]
        end
    end
    imax, max
end
You can store the indices returned by findall and subset them with the result of argmax applied to the elements fulfilling the condition.
X = [5, 4, -3, -5]
C = <(0)
i = findall(C, X);
i[argmax(X[i])]
#3
Or combine both:
argmax(i -> X[i], findall(C, X))
#3
This assumes that findall returns a non-empty result; otherwise it needs to be tested, e.g. with isempty, as in the sketch below.
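A minimal sketch of such a guard (the function name is mine):
function safe_argmax_where(C, X)
    idx = findall(C, X)
    isempty(idx) && return nothing  # no element satisfies C
    return argmax(i -> X[i], idx)
end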
Benchmark
#Functions
function August(C, X)
    idx_filtered = (i for (i, el) in pairs(X) if C(el))
    argmax(i -> X[i], idx_filtered)
end

function byhand(C, X)
    start = findfirst(C, X)
    isnothing(start) && return nothing
    imax, max = start, X[start]
    for i = start:lastindex(X)
        if C(X[i]) && X[i] > max
            imax, max = i, X[i]
        end
    end
    imax, max
end

function GKi1(C, X)
    i = findall(C, X);
    i[argmax(X[i])]
end

GKi2(C, X) = argmax(i -> X[i], findall(C, X))
#Data
using Random
Random.seed!(42)
n = 100000
X = randn(n)
C = <(0)
#Benchmark
using BenchmarkTools
suite = BenchmarkGroup()
suite["August"] = #benchmarkable August(C, $X)
suite["byhand"] = #benchmarkable byhand(C, $X)
suite["GKi1"] = #benchmarkable GKi1(C, $X)
suite["GKi2"] = #benchmarkable GKi2(C, $X)
tune!(suite);
results = run(suite)
#Results
results
#4-element BenchmarkTools.BenchmarkGroup:
# tags: []
# "August" => Trial(641.061 μs)
# "byhand" => Trial(261.135 μs)
# "GKi2" => Trial(259.260 μs)
# "GKi1" => Trial(339.570 μs)
results.data["August"]
#BenchmarkTools.Trial: 7622 samples with 1 evaluation.
# Range (min … max): 641.061 μs … 861.379 μs ┊ GC (min … max): 0.00% … 0.00%
# Time (median): 643.640 μs ┊ GC (median): 0.00%
# Time (mean ± σ): 653.027 μs ± 18.123 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
#
# ▄█▅▄▃ ▂▂▃▁ ▁▃▃▂▂ ▁▃ ▁▁ ▁
# ██████▇████████████▇▆▆▇████▇▆██▇▇▇▆▆▆▅▇▆▅▅▅▅▆██▅▆▆▆▇▆▇▇▆▇▆▆▆▅ █
# 641 μs Histogram: log(frequency) by time 718 μs <
#
# Memory estimate: 16 bytes, allocs estimate: 1.
results.data["byhand"]
#BenchmarkTools.Trial: 10000 samples with 1 evaluation.
# Range (min … max): 261.135 μs … 621.141 μs ┊ GC (min … max): 0.00% … 0.00%
# Time (median): 261.356 μs ┊ GC (median): 0.00%
# Time (mean ± σ): 264.382 μs ± 11.638 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
#
# █ ▁▁▁▁ ▂ ▁▁ ▂ ▁ ▁ ▁
# █▅▂▂▅████▅▄▃▄▆█▇▇▆▄▅███▇▄▄▅▆▆█▄▇█▅▄▅▅▆▇▇▅▄▅▄▄▄▃▄▃▃▃▄▅▆▅▄▇█▆▅▄ █
# 261 μs Histogram: log(frequency) by time 292 μs <
#
# Memory estimate: 32 bytes, allocs estimate: 1.
results.data["GKi1"]
#BenchmarkTools.Trial: 10000 samples with 1 evaluation.
# Range (min … max): 339.570 μs … 1.447 ms ┊ GC (min … max): 0.00% … 0.00%
# Time (median): 342.579 μs ┊ GC (median): 0.00%
# Time (mean ± σ): 355.167 μs ± 52.935 μs ┊ GC (mean ± σ): 1.90% ± 6.85%
#
# █▆▄▅▃▂▁▁ ▁ ▁
# ████████▇▆▆▅▅▅▆▄▄▄▄▁▃▁▁▃▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ █
# 340 μs Histogram: log(frequency) by time 722 μs <
#
# Memory estimate: 800.39 KiB, allocs estimate: 11.
results.data["GKi2"]
#BenchmarkTools.Trial: 10000 samples with 1 evaluation.
# Range (min … max): 259.260 μs … 752.773 μs ┊ GC (min … max): 0.00% … 54.40%
# Time (median): 260.692 μs ┊ GC (median): 0.00%
# Time (mean ± σ): 270.300 μs ± 40.094 μs ┊ GC (mean ± σ): 1.31% ± 5.60%
#
# █▁▁▅▄▂▂▄▃▂▁▁▁ ▁ ▁
# █████████████████▇██▆▆▇▆▅▄▆▆▆▄▅▄▆▅▇▇▆▆▅▅▄▅▃▃▅▃▄▁▁▁▃▁▃▃▃▄▃▃▁▃▃ █
# 259 μs Histogram: log(frequency) by time 390 μs <
#
# Memory estimate: 408.53 KiB, allocs estimate: 9.
versioninfo()
#Julia Version 1.8.0
#Commit 5544a0fab7 (2022-08-17 13:38 UTC)
#Platform Info:
# OS: Linux (x86_64-linux-gnu)
# CPU: 8 × Intel(R) Core(TM) i7-2600K CPU @ 3.40GHz
# WORD_SIZE: 64
# LIBM: libopenlibm
# LLVM: libLLVM-13.0.1 (ORCJIT, sandybridge)
# Threads: 1 on 8 virtual cores
In this example argmax(i -> X[i], findall(C, X)) is close to the performance of the hand-written function by @August but uses more memory; it can, however, show better performance in case the data is sorted:
sort!(X)
results = run(suite)
#4-element BenchmarkTools.BenchmarkGroup:
# tags: []
# "August" => Trial(297.519 μs)
# "byhand" => Trial(270.486 μs)
# "GKi2" => Trial(242.320 μs)
# "GKi1" => Trial(319.732 μs)
From what I understand from your question, you can use findmax() (requires Julia >= v1.7) on the result of findall() to find the maximum index:
julia> v = [10, 20, 30, 40, 50]
5-element Vector{Int64}:
10
20
30
40
50
julia> findmax(findall(x -> x > 30, v))[1]
5
Performance of the above function:
julia> v = collect(10:1:10_000_000);
julia> @btime findmax(findall(x -> x > 30, v))[1]
33.471 ms (10 allocations: 77.49 MiB)
9999991
Update: the solutions suggested by @dan-getz of using last() and findlast() perform better than findmax(), and findlast() is the clear winner:
julia> @btime last(findall(x -> x > 30, v))
19.961 ms (9 allocations: 77.49 MiB)
9999991
julia> @btime findlast(x -> x > 30, v)
81.422 ns (2 allocations: 32 bytes)
Update 2: Looks like the OP wanted to find the max element and not only the index. In that case, the solution would be:
julia> v[findmax(findall(x -> x > 30, v))[1]]
50

High GC time for simple mapreduce problem

I have simulation program written in Julia that does something equivalent to this as a part of its main loop:
# Some fake data
M = [randn(100,100) for m=1:100, n=1:100]
W = randn(100,100)
work = zip(W,M)
result = mapreduce(x -> x[1]*x[2], +, work)
In other words, a simple sum of weighted matrices. Timing the above code yields
0.691084 seconds (79.03 k allocations: 1.493 GiB, 70.59% gc time, 2.79% compilation time)
I am surprised about the large number of memory allocations, as this problem should be possible to do in-place. To see if it was my use of mapreduce that was wrong I also tested the following equivalent implementation:
@time begin
    res = zeros(100,100)
    for m=1:100
        for n=1:100
            res += W[m,n] * M[m,n]
        end
    end
end
which gave
0.442521 seconds (50.00 k allocations: 1.491 GiB, 70.81% gc time)
So, if I wrote this in C++ or Fortran it would be simple to do all of this in-place. Is this impossible in Julia? Or am I missing something here...?
It is possible to do it in place like this:
function ws(W, M)
    res = zeros(100,100)
    for m=1:100
        for n=1:100
            @. res += W[m,n] * M[m,n]
        end
    end
    return res
end
and the timing is:
julia> @time ws(W, M);
0.100328 seconds (2 allocations: 78.172 KiB)
Note that in order to perform this operation in-place I used broadcasting (I could also use loops, but it would be the same).
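For reference, here is a sketch of what the equivalent explicit-loop version could look like (my own illustration, assuming the 100×100 sizes from the question; the function name is hypothetical):
function ws_loops(W, M)
    res = zeros(100, 100)
    for m = 1:100, n = 1:100
        w = W[m, n]    # scalar weight
        Mmn = M[m, n]  # the corresponding 100×100 matrix
        for j = 1:100, i = 1:100
            res[i, j] += w * Mmn[i, j]
        end
    end
    return res
end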
The problem with your code is that in line:
res += W[m,n] * M[m,n]
You get two allocations:
When you do multiplication W[m,n] * M[m,n] a new matrix is allocated.
When you do addition res += ... again a matrix is allocated
By using broadcasting with @. you perform an in-place operation, see https://docs.julialang.org/en/v1/manual/mathematical-operations/#man-dot-operators for more explanations.
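As a minimal illustration of the difference between the two forms (my own example):
A = zeros(2, 2); B = ones(2, 2)
A += B     # same as A = A + B: allocates a new matrix and rebinds A to it
@. A += B  # same as A .= A .+ B: writes into the existing A, no new matrix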
Additionally, note that I have wrapped the code inside a function. If you do not, then access to both W and M is type unstable, which also causes allocations; see https://docs.julialang.org/en/v1/manual/performance-tips/#Avoid-global-variables.
I'd like to add something to Bogumił's answer. The missing broadcast is the main problem, but in addition, the loop and the mapreduce variant differ in a fundamental semantic way.
The purpose of mapreduce is to reduce by an associative operation with identity element init in an unspecified order. This in particular also includes the (theoretical) option of running parts in parallel and doesn't really play well with mutation. From the docs:
The associativity of the reduction is implementation-dependent. Additionally, some implementations may reuse the return value of f for elements that appear multiple times in itr. Use mapfoldl or mapfoldr instead for guaranteed left or right associativity and invocation of f for every value.
and
It is unspecified whether init is used for non-empty collections.
What the loop variant really corresponds to is a fold, which has a well-defined order and initial (not necessarily identity) element and can thus use an in-place reduction operator:
Like reduce, but with guaranteed left associativity. If provided, the keyword argument init will be used exactly once.
julia> @benchmark foldl((acc, (m, w)) -> (@. acc += m * w), $work; init=$(zero(W)))
BenchmarkTools.Trial: 45 samples with 1 evaluation.
Range (min … max): 109.967 ms … 118.251 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 112.639 ms ┊ GC (median): 0.00%
Time (mean ± σ): 112.862 ms ± 1.154 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
▄▃█ ▁▄▃
▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄███▆███▄▁▄▁▁▄▁▁▄▁▁▁▁▁▄▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ ▁
110 ms Histogram: frequency by time 118 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
julia> @benchmark mapreduce(Base.splat(*), +, $work)
BenchmarkTools.Trial: 12 samples with 1 evaluation.
Range (min … max): 403.100 ms … 458.882 ms ┊ GC (min … max): 4.53% … 3.89%
Time (median): 445.058 ms ┊ GC (median): 4.04%
Time (mean ± σ): 440.042 ms ± 16.792 ms ┊ GC (mean ± σ): 4.21% ± 0.92%
▁ ▁ ▁ ▁ ▁ ▁ ▁▁▁ █ ▁
█▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁█▁▁▁▁▁▁█▁█▁▁▁▁███▁▁▁▁▁█▁▁▁█ ▁
403 ms Histogram: frequency by time 459 ms <
Memory estimate: 1.49 GiB, allocs estimate: 39998.
Think of it that way: if you would write the function as a parallel for loop with (+) reduction, iteration also would have an unspecified order, and you'd have memory overhead for the necessary copying of the individual results to the accumulating thread.
Thus, there is a trade-off. In your example, allocation/copying dominates. In other cases, the mapped operation might dominate, and parallel reduction (with unspecified order, but copying overhead) be worth it.

Extract lower triangle portion of a matrix

I was wondering if there is a command or a package in Julia that permits us to extract directly the lower triangle portion of a matrix, excluding the diagonal. I can call R commands for that (like lowerTriangle of the gdata package), obviously, but I'd like to know if Julia has something similar. For example, imagine I have the matrix
1.0 0.751 0.734
0.751 1.0 0.948
0.734 0.948 1.0
I don't want to create a lower triangular matrix like
NA NA NA
0.751 NA NA
0.734 0.948 NA
but extract the lower portion of the matrix as an array: 0.751 0.734 0.948
If you're OK with creating a lower triangular matrix as an intermediate step, you can use logical indexing and tril! with an extra argument to get what you need.
julia> M = [1.0 0.751 0.734
0.751 1.0 0.948
0.734 0.948 1.0];
julia> v = M[tril!(trues(size(M)), -1)]
3-element Array{Float64, 1}:
0.751
0.734
0.948
The trues call returns an array of M's shape filled with boolean true values. tril! then prunes this down to just the part of the matrix that we want: its second argument says which diagonal to keep up to, and -1 here means everything strictly below the main diagonal, avoiding the values on the diagonal itself.
We use the result of that for indexing into M, and that returns an array with the required values.
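For reference, the intermediate mask looks like this (my own REPL check; on Julia >= 0.7 tril! requires using LinearAlgebra, and older versions print the type as BitArray{2} rather than BitMatrix):
julia> tril!(trues(3, 3), -1)
3×3 BitMatrix:
 0  0  0
 1  0  0
 1  1  0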
Using comprehensions:
julia> [M[m, n] for m in 2:size(M, 1) for n in 1:m-1]
3-element Array{Float64,1}:
0.751
0.734
0.948
But it is much slower than the sundar/Matt B. solution:
lower_triangular_1(M) = [M[m, n] for m in 2:size(M, 1) for n in 1:m-1]
lower_triangular_2(M) = [M[m, n] for n in 1:size(M, 2) for m in n+1:size(M, 1)]
lower_triangular_3(M) = M[tril!(trues(size(M)), -1)]
using BenchmarkTools
using LinearAlgebra # avoid warning in 0.7
M=rand(100, 100)
Testing with Julia Version 0.7.0-alpha.0:
julia> #btime lower_triangular_1(M);
73.179 μs (10115 allocations: 444.34 KiB)
julia> #btime lower_triangular_2(M);
71.157 μs (10117 allocations: 444.41 KiB)
julia> #btime lower_triangular_3(M);
16.325 μs (6 allocations: 40.19 KiB)
Not elegant, but faster (with @views):
function lower_triangular_4(M)
    # works only for square matrices
    res = similar(M, ((size(M, 1)-1) * size(M, 2)) ÷ 2)
    start_idx = 1
    for n = 1:size(M, 2)-1
        @views column = M[n+1:end, n]
        last_idx = start_idx - 1 + length(column)
        @views res[start_idx:last_idx] = column[:]
        start_idx = last_idx + 1
    end
    return res
end
julia> @btime lower_triangular_4(M);
4.272 μs (101 allocations: 44.95 KiB)

Julia: best way to sample from successively shrinking range?

I would like to sample k numbers, where the first number is sampled from 1:n, the second from 1:n-1, the third from 1:n-2, and so on.
I have the below implementation
function shrinksample(n,k)
    [rand(1:m) for m in n:-1:n-k+1]
end
Are there faster solutions in Julia?
The following takes ideas from the implementation of randperm. Since n and k are of the same order, this is appropriate, as the same type of randomness is needed (both have an output space of size n factorial):
function fastshrinksample(r::AbstractRNG, n, k)
    a = Vector{typeof(n)}(k)
    @assert n <= Int64(2)^52
    k == 0 && return a
    mask = (1 << (64-leading_zeros(n))) - 1
    nextmask = mask >> 1
    nn = n
    for i = 1:k
        a[i] = 1 + Base.Random.rand_lt(r, nn, mask)
        nn -= 1
        if nn == nextmask
            mask, nextmask = nextmask, nextmask >> 1
        end
    end
    return a
end

fastshrinksample(n, k) = fastshrinksample(Base.Random.GLOBAL_RNG, n, k)
Benchmarking gives a 3x improvement for one tested instance:
julia> using BenchmarkTools
julia> @btime shrinksample(10000,10000);
310.277 μs (2 allocations: 78.20 KiB)
julia> @btime fastshrinksample(10000,10000);
91.815 μs (2 allocations: 78.20 KiB)
The trick is mainly to use the internal Base.Random.rand_lt instead of the regular rand(1:n). (Note that this is Julia 0.6-era internal API; in later versions these internals were reorganized into the Random standard library.)
If this is not very sensitive to randomness (you're not doing cryptography), the following should be amazingly fast and very simple:
blazingshrinksample(n,k) = (Int)[trunc(Int,(n-m)rand()+1) for m in 0:k-1]
Testing this along with your implementation and with Dan's, I got this:
using BenchmarkTools
@btime shrinksample(10000,10000);
259.414 μs (2 allocations: 78.20 KiB)
@btime fastshrinksample(10000,10000);
66.713 μs (2 allocations: 78.20 KiB)
@btime blazingshrinksample(10000,10000);
33.614 μs (2 allocations: 78.20 KiB)

Performance of Prime Testing with Haskell

I have two ways of testing for primes: one of them is called isPrime and the other isBigPrime. What I originally wanted was to test "big" primes with "small" primes that I have already computed, so that the testing becomes faster. Here are my implementations:
intSqrt :: Integer -> Integer
intSqrt n = round $ sqrt $ fromIntegral n

isPrime' :: Integer -> Integer -> Bool
isPrime' 1 m = False
isPrime' n m = do
  if (m > (intSqrt n))
    then True
    else if (rem n (m+1) == 0)
      then False
      else do isPrime' n (m+1)

isPrime :: Integer -> Bool
isPrime 2 = True
isPrime 3 = True
isPrime n = isPrime' n 1

isBigPrime' :: Integer -> Int -> Bool
isBigPrime' n i =
  if ((smallPrimes !! i) > intSqrt n)
    then True
    else if (rem n (smallPrimes !! i) == 0)
      then False
      else do isBigPrime' n (i+1)

smallPrimes = [2,3, List of Primes until 1700]

-- Start at 1 because we only go through uneven numbers
isBigPrime n = isBigPrime' n 1

primes m = [2] ++ [k | k <- [3,5..m], isPrime k]
bigPrimes m = smallPrimes ++ [k | k <- [1701,1703..m], isBigPrime k]

main = do
  print $ (sum $ [Enter Method] 2999999)
I have chosen 1700 as the upper limit because I wanted to have primes up to 3e6 and sqrt(3e6) ~ 1700. I took the sum of them to compare the two algorithms. I thought that bigPrimes would be way faster than primes because, first of all, it does way fewer calculations, and it has a head start, which is not too big but helps anyway. However, to my surprise, bigPrimes was slower than primes. Here are the results:
For primes
Performance counter stats for './p10':
16768,627686 task-clock (msec) # 1,000 CPUs utilized
58 context-switches # 0,003 K/sec
1 cpu-migrations # 0,000 K/sec
6.496 page-faults # 0,387 K/sec
53.416.641.157 cycles # 3,186 GHz
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
160.411.056.099 instructions # 3,00 insns per cycle
34.512.352.987 branches # 2058,150 M/sec
10.673.742 branch-misses # 0,03% of all branches
16,773316435 seconds time elapsed
and for bigPrimes
Performance counter stats for './p10':
19111,667046 task-clock (msec) # 0,999 CPUs utilized
259 context-switches # 0,014 K/sec
3 cpu-migrations # 0,000 K/sec
6.278 page-faults # 0,328 K/sec
61.027.453.425 cycles # 3,193 GHz
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
198.207.905.034 instructions # 3,25 insns per cycle
34.632.138.061 branches # 1812,094 M/sec
106.102.114 branch-misses # 0,31% of all branches
19,126843560 seconds time elapsed
I was wondering why that would be the case. I suspect that the (!!) indexing into smallPrimes makes bigPrimes somewhat slower, but I am not entirely sure.
A common antipattern brought from other languages is to iterate over indices and use (!!) to index into a list. In Haskell, it is instead idiomatic to simply iterate over the list itself. So:
isBigPrime' :: Integer -> [Integer] -> Bool
isBigPrime' n [] = True
isBigPrime' n (p:ps) = p > intSqrt n || (rem n p /= 0 && isBigPrime' n ps)

isBigPrime n = isBigPrime' n (drop 1 smallPrimes)
On my machine, your primes takes 25.3s; your bigPrimes takes 20.9s; and my bigPrimes takes 3.2s. There are several other pieces of low-hanging fruit (e.g. using p*p > n instead of p > intSqrt n), but this is by far the most significant one.
