Fused operators on views much slower than nested loop in Julia - performance

I am trying to make a function that computes a diffusion kernel as fast as possible by using views and fused broadcast operators. Is it possible to get the second function as fast as the first? Currently, diff takes 59.6 ms, whereas diff_view takes 384.3 ms.
using BenchmarkTools
function diff(
    at::Array{Float64, 3}, a::Array{Float64, 3},
    visc::Float64, dxidxi::Float64, dyidyi::Float64, dzidzi::Float64,
    itot::Int64, jtot::Int64, ktot::Int64)

    for k in 2:ktot-1
        for j in 2:jtot-1
            @simd for i in 2:itot-1
                @inbounds at[i, j, k] += visc * (
                    (a[i-1, j  , k  ] - 2. * a[i, j, k] + a[i+1, j  , k  ]) * dxidxi +
                    (a[i  , j-1, k  ] - 2. * a[i, j, k] + a[i  , j+1, k  ]) * dyidyi +
                    (a[i  , j  , k-1] - 2. * a[i, j, k] + a[i  , j  , k+1]) * dzidzi )
            end
        end
    end
end
function diff_view(
    at::Array{Float64, 3}, a::Array{Float64, 3},
    visc::Float64, dxidxi::Float64, dyidyi::Float64, dzidzi::Float64,
    itot::Int64, jtot::Int64, ktot::Int64)

    at_c = view(at, 2:itot-1, 2:jtot-1, 2:ktot-1)
    a_c  = view(a,  2:itot-1, 2:jtot-1, 2:ktot-1)
    a_w  = view(a,  1:itot-2, 2:jtot-1, 2:ktot-1)
    a_e  = view(a,  3:itot  , 2:jtot-1, 2:ktot-1)
    a_s  = view(a,  2:itot-1, 1:jtot-2, 2:ktot-1)
    a_n  = view(a,  2:itot-1, 3:jtot  , 2:ktot-1)
    a_b  = view(a,  2:itot-1, 2:jtot-1, 1:ktot-2)
    a_t  = view(a,  2:itot-1, 2:jtot-1, 3:ktot  )

    at_c .+= visc .* ( (a_w .- 2. .* a_c .+ a_e) .* dxidxi .+
                       (a_s .- 2. .* a_c .+ a_n) .* dyidyi .+
                       (a_b .- 2. .* a_c .+ a_t) .* dzidzi )
end
itot = 384
jtot = 384
ktot = 384
a = rand(Float64, (itot, jtot, ktot))
at = zeros(Float64, (itot, jtot, ktot))
visc = 0.1
dxidxi = 0.1
dyidyi = 0.1
dzidzi = 0.1
@btime diff(
at, a,
visc, dxidxi, dyidyi, dzidzi,
itot, jtot, ktot)
@btime diff_view(
at, a,
visc, dxidxi, dyidyi, dzidzi,
itot, jtot, ktot)

You can accomplish this using LoopVectorization.jl's @turbo macro, which will make sure that the broadcast compiles to efficient SIMD instructions wherever possible.
using LoopVectorization

function diff_view_lv!(
    at::Array{Float64, 3}, a::Array{Float64, 3},
    visc::Float64, dxidxi::Float64, dyidyi::Float64, dzidzi::Float64,
    itot::Int64, jtot::Int64, ktot::Int64)

    at_c = view(at, 2:itot-1, 2:jtot-1, 2:ktot-1)
    a_c  = view(a,  2:itot-1, 2:jtot-1, 2:ktot-1)
    a_w  = view(a,  1:itot-2, 2:jtot-1, 2:ktot-1)
    a_e  = view(a,  3:itot  , 2:jtot-1, 2:ktot-1)
    a_s  = view(a,  2:itot-1, 1:jtot-2, 2:ktot-1)
    a_n  = view(a,  2:itot-1, 3:jtot  , 2:ktot-1)
    a_b  = view(a,  2:itot-1, 2:jtot-1, 1:ktot-2)
    a_t  = view(a,  2:itot-1, 2:jtot-1, 3:ktot  )

    @turbo at_c .+= visc .* ( (a_w .- 2. .* a_c .+ a_e) .* dxidxi .+
                              (a_s .- 2. .* a_c .+ a_n) .* dyidyi .+
                              (a_b .- 2. .* a_c .+ a_t) .* dzidzi )
    # Could also use @turbo @. to apply the broadcast to every operator,
    # so you don't have to type `.` before each one.
end
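For illustration, the @turbo @. form mentioned in the comment would replace the broadcast statement above with something like this (a sketch; I have not benchmarked it separately):

@turbo @. at_c += visc * ( (a_w - 2.0 * a_c + a_e) * dxidxi +
                           (a_s - 2.0 * a_c + a_n) * dyidyi +
                           (a_b - 2.0 * a_c + a_t) * dzidzi )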
As a stylistic aside, since all of these functions mutate `at`, their names should end with ! to denote that they mutate one of their arguments.
And, as the comments noted, we want to be sure to interpolate any global variables into the benchmark with $. But other than that, using the same setup as in your question above (on what seems to be a slightly slower CPU):
julia> @benchmark diff!(
$at, $a,
$visc, $dxidxi, $dyidyi, $dzidzi,
$itot, $jtot, $ktot)
BenchmarkTools.Trial: 50 samples with 1 evaluation.
Range (min … max): 100.575 ms … 101.855 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 100.783 ms ┊ GC (median): 0.00%
Time (mean ± σ): 100.798 ms ± 173.505 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▆▁▁█▄
▄▄▄▄▄▆▇█████▇▆▄▆▁▁▁▄▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ ▁
101 ms Histogram: frequency by time 102 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
julia> @benchmark diff_view!(
$at, $a,
$visc, $dxidxi, $dyidyi, $dzidzi,
$itot, $jtot, $ktot)
BenchmarkTools.Trial: 13 samples with 1 evaluation.
Range (min … max): 397.203 ms … 397.800 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 397.427 ms ┊ GC (median): 0.00%
Time (mean ± σ): 397.436 ms ± 173.079 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▁ ▁ ▁ ▁ ▁ ▁ ▁ ▁ █ ▁ ▁ ▁
█▁█▁▁▁▁█▁▁▁▁▁█▁▁█▁▁█▁▁█▁█▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
397 ms Histogram: frequency by time 398 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
julia> @benchmark diff_view_lv!(
$at, $a,
$visc, $dxidxi, $dyidyi, $dzidzi,
$itot, $jtot, $ktot)
BenchmarkTools.Trial: 61 samples with 1 evaluation.
Range (min … max): 82.226 ms … 83.015 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 82.364 ms ┊ GC (median): 0.00%
Time (mean ± σ): 82.395 ms ± 115.205 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▁▄ ▁▁▁▁▄▄▄█▁ ▄ ▁█ ▁▁ ▁ ▁ ▁
▆▁▁▆▁▁▁██▆▁█████████▆▆█▆▆██▁██▁█▆█▁▁▁▁▁▁█▆▆▆▁▁▁▆▁▁▁▁▁▁▁▁▁▁▁▆ ▁
82.2 ms Histogram: frequency by time 82.7 ms <
Memory estimate: 1008 bytes, allocs estimate: 41.
With this, the broadcasted version is now faster than the original looped version! However, as the comments have noted, the simple looping approach is arguably cleaner and more readable, and (as you might guess from the name) you can apply LoopVectorization to the looped version just as well:
using LoopVectorization
function diff_lv!(
    at::Array{Float64, 3}, a::Array{Float64, 3},
    visc::Float64, dxidxi::Float64, dyidyi::Float64, dzidzi::Float64,
    itot::Int64, jtot::Int64, ktot::Int64)

    @turbo for k in 2:ktot-1
        for j in 2:jtot-1
            for i in 2:itot-1
                at[i, j, k] += visc * (
                    (a[i-1, j  , k  ] - 2. * a[i, j, k] + a[i+1, j  , k  ]) * dxidxi +
                    (a[i  , j-1, k  ] - 2. * a[i, j, k] + a[i  , j+1, k  ]) * dyidyi +
                    (a[i  , j  , k-1] - 2. * a[i, j, k] + a[i  , j  , k+1]) * dzidzi )
            end
        end
    end
end
julia> @benchmark diff_lv!(
$at, $a,
$visc, $dxidxi, $dyidyi, $dzidzi,
$itot, $jtot, $ktot)
BenchmarkTools.Trial: 56 samples with 1 evaluation.
Range (min … max): 89.489 ms … 90.166 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 89.657 ms ┊ GC (median): 0.00%
Time (mean ± σ): 89.660 ms ± 103.127 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▁ ▁ ▁ █▃ ▆ ▁
▄▁▁▄▁▁▄█▄▁▁▄█▄█▄▁▄▇▇▇▇██▄▄▇█▇█▁▁▁▄▄▁▁▁▁▄▁▄▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ ▁
89.5 ms Histogram: frequency by time 89.9 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
Finally, if you want to multithread, you can just add another t to the name of the macro (@tturbo instead of @turbo):
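A sketch of that threaded variant, diff_lvt!, for reference (my reconstruction: identical to diff_lv! above except for the macro):

function diff_lvt!(
    at::Array{Float64, 3}, a::Array{Float64, 3},
    visc::Float64, dxidxi::Float64, dyidyi::Float64, dzidzi::Float64,
    itot::Int64, jtot::Int64, ktot::Int64)

    @tturbo for k in 2:ktot-1
        for j in 2:jtot-1
            for i in 2:itot-1
                at[i, j, k] += visc * (
                    (a[i-1, j  , k  ] - 2. * a[i, j, k] + a[i+1, j  , k  ]) * dxidxi +
                    (a[i  , j-1, k  ] - 2. * a[i, j, k] + a[i  , j+1, k  ]) * dyidyi +
                    (a[i  , j  , k-1] - 2. * a[i, j, k] + a[i  , j  , k+1]) * dzidzi )
            end
        end
    end
end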
julia> @benchmark diff_lvt!(
$at, $a,
$visc, $dxidxi, $dyidyi, $dzidzi,
$itot, $jtot, $ktot)
BenchmarkTools.Trial: 106 samples with 1 evaluation.
Range (min … max): 47.225 ms … 47.560 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 47.434 ms ┊ GC (median): 0.00%
Time (mean ± σ): 47.432 ms ± 67.185 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▁ ▁▁ ▄▂ █ ▂
▃▁▁▃▁▁▅▁▁▅▁▁▁▁▁▁▁▁▃▃▃▃▃▃▁▅▅▃▅▅▃█▃██▃██▃▃▆▃▃▅▆█▆▅██▆▆▅▅▃▃▁▃▆ ▃
47.2 ms Histogram: frequency by time 47.5 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
which should provide some additional speedup as long as you have started Julia with multiple threads.
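As a quick sanity check (my addition, not part of the original answer), you can verify the thread count before expecting a speedup from @tturbo:

# start Julia with e.g. `julia --threads 4`, or set the JULIA_NUM_THREADS environment variable
Threads.nthreads()   # must be > 1 for @tturbo to give any threading benefit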

Related

Generate array of complex numbers with absolute value one in Julia?

In Julia, I would like to randomly generate an array of arbitrary size, where all the elements of the array are complex numbers with absolute value one. Is there perhaps any way to do this within Julia?
I've got four options so far:
f1(n) = exp.((2*im*π).*rand(n))
f2(n) = map(x->(z = x[1]+im*x[2] ; z ./ abs(z) ),
eachcol(randn(2,n)))
f3(n) = [im*x[1]+x[2] for x in sincos.(2π*rand(n))]
f4(n) = cispi.(2 .*rand(n))
We have:
julia> using BenchmarkTools
julia> begin
           @btime f1(1_000);
           @btime f2(1_000);
           @btime f3(1_000);
           @btime f4(1_000);
       end;
29.390 μs (2 allocations: 23.69 KiB)
15.559 μs (2 allocations: 31.50 KiB)
25.733 μs (4 allocations: 47.38 KiB)
27.662 μs (2 allocations: 23.69 KiB)
Not a crucial difference.
One way is:
randcomplex() = (c = Complex(rand(2)...); c / abs(c))
randcomplex(numwanted) = [randcomplex() for _ in 1:numwanted]
or
randcomplex(dims...) = (a = zeros(Complex, dims...); for i in eachindex(a) a[i] = randcomplex() end; a)
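As a side note (my addition, not part of the original answer): Complex is an abstract type, so zeros(Complex, dims...) creates an array with an abstract element type; a concretely typed variant (hypothetical name randcomplex_typed) is generally friendlier to the compiler:

randcomplex_typed(dims...) = (a = zeros(ComplexF64, dims...); for i in eachindex(a) a[i] = randcomplex() end; a)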
If you are looking for something faster, here are two options. They return a perhaps slightly unfamiliar type, but it is equivalent to a regular Vector:
function f5(n)
    r = rand(2, n)
    for i in 1:n
        a = sqrt(r[1, i]^2 + r[2, i]^2)
        r[1, i] /= a
        r[2, i] /= a
    end
    return reinterpret(reshape, ComplexF64, r)
end

using LoopVectorization: @turbo

function f5t(n)
    r = rand(2, n)
    @turbo for i in 1:n
        a = sqrt(r[1, i]^2 + r[2, i]^2)
        r[1, i] /= a
        r[2, i] /= a
    end
    return reinterpret(reshape, ComplexF64, r)
end
julia> @btime f5(1000);
4.186 μs (1 allocation: 15.75 KiB)
julia> @btime f5t(1000);
2.900 μs (1 allocation: 15.75 KiB)
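If the ReinterpretArray wrapper returned by f5/f5t is inconvenient downstream, collect turns it into an ordinary Vector{ComplexF64} at the cost of one extra copy (my addition, not timed above):

z  = f5(1000)     # ReinterpretArray view of the underlying 2×1000 matrix
zv = collect(z)   # plain Vector{ComplexF64}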

IPC performance: Anonymous Pipe vs Socket

This question is similar to IPC performance: Named Pipe vs Socket but focuses on anonymous instead of named pipes: what is the performance difference between an anonymous pipe and a TCP connection on different operating systems and with different transfer sizes?
I tried to benchmark it using BenchmarkDotNet with the code attached at the end of this post. When the program starts, it initializes BenchmarkDotNet, which in turn invokes the GlobalSetup() method once and the two benchmarked methods (Pipe() and Tcp()) many times.
In GlobalSetup(), two child processes are started: one for pipe communication and one for TCP communication. Once the child processes are ready, they wait for a trigger signal and the number of values N to be transferred (provided via stdin) and then start sending data.
When the benchmarked methods (Pipe() and Tcp()) are invoked, they send the trigger signal and the number of values N and wait for the incoming data.
It turned out to be important to set TcpClient.NoDelay = true to disable the Nagle algorithm, which collects small messages until a certain threshold or timeout is reached. Interestingly, this only affects the Linux tests with N = 10000. With NoDelay = false (the default), the average time for this test jumps from ~40 µs to ~40 ms.
Here are the results:
Legends
N : Number of int32 values to be transmitted
Mean : Arithmetic mean of all measurements
Error : Half of 99.9% confidence interval
StdDev : Standard deviation of all measurements
Median : Value separating the higher half of all measurements (50th percentile)
Ratio : Mean of the ratio distribution ([Current]/[Baseline])
RatioSD : Standard deviation of the ratio distribution ([Current]/[Baseline])
1 μs : 1 Microsecond (0.000001 sec)
Virtual Machine (Ubuntu 20.04)
BenchmarkDotNet=v0.13.0, OS=ubuntu 20.04
AMD Opteron(tm) Processor 4334, 4 CPU, 4 logical and 4 physical cores
.NET SDK=5.0.102
[Host] : .NET 5.0.2 (5.0.220.61120), X64 RyuJIT
DefaultJob : .NET 5.0.2 (5.0.220.61120), X64 RyuJIT
| Method |        N |         Mean |      Error |     StdDev |       Median | Ratio | RatioSD |
|------- |--------- |-------------:|-----------:|-----------:|-------------:|------:|--------:|
|   Pipe |        1 |     27.33 μs |   1.660 μs |   4.895 μs |     30.75 μs |  1.00 |    0.00 |
|    Tcp |        1 |     31.42 μs |   0.620 μs |   0.713 μs |     31.24 μs |  1.39 |    0.21 |
|   Pipe |      100 |     26.72 μs |   1.990 μs |   5.867 μs |     26.63 μs |  1.00 |    0.00 |
|    Tcp |      100 |     38.95 μs |   2.146 μs |   6.327 μs |     43.34 μs |  1.53 |    0.43 |
|   Pipe |    10000 |     42.45 μs |   2.804 μs |   8.268 μs |     47.09 μs |  1.00 |    0.00 |
|    Tcp |    10000 |     46.97 μs |   3.057 μs |   9.013 μs |     53.93 μs |  1.16 |    0.34 |
|   Pipe |  1000000 |  1,621.87 μs | 116.924 μs | 344.752 μs |  1,893.49 μs |  1.00 |    0.00 |
|    Tcp |  1000000 |  1,707.25 μs |   8.066 μs |   7.545 μs |  1,707.24 μs |  0.94 |    0.13 |
|   Pipe | 10000000 | 21,013.86 μs | 166.250 μs | 129.797 μs | 21,007.89 μs |  1.00 |    0.00 |
|    Tcp | 10000000 | 20,548.03 μs | 407.779 μs | 814.379 μs | 20,713.44 μs |  0.96 |    0.03 |
Notebook (Ubuntu 20.04 on Windows 10 + WSL2):
BenchmarkDotNet=v0.13.0, OS=ubuntu 20.04
Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
.NET SDK=5.0.301
[Host] : .NET 5.0.7 (5.0.721.25508), X64 RyuJIT
DefaultJob : .NET 5.0.7 (5.0.721.25508), X64 RyuJIT
| Method |        N |         Mean |        Error |        StdDev |       Median | Ratio | RatioSD |
|------- |--------- |-------------:|-------------:|--------------:|-------------:|------:|--------:|
|   Pipe |        1 |     44.66 μs |     0.882 μs |      1.051 μs |     44.45 μs |  1.00 |    0.00 |
|    Tcp |        1 |     54.42 μs |     0.411 μs |      0.364 μs |     54.34 μs |  1.21 |    0.03 |
|   Pipe |      100 |     45.07 μs |     0.895 μs |      1.496 μs |     44.63 μs |  1.00 |    0.00 |
|    Tcp |      100 |     55.27 μs |     0.735 μs |      0.614 μs |     55.17 μs |  1.21 |    0.05 |
|   Pipe |    10000 |     52.30 μs |     1.018 μs |      1.131 μs |     52.32 μs |  1.00 |    0.00 |
|    Tcp |    10000 |     55.47 μs |     0.590 μs |      0.523 μs |     55.32 μs |  1.06 |    0.03 |
|   Pipe |  1000000 |  4,034.01 μs |    77.978 μs |     65.115 μs |  4,035.58 μs |  1.00 |    0.00 |
|    Tcp |  1000000 |  1,398.62 μs |    24.230 μs |     21.479 μs |  1,395.20 μs |  0.35 |    0.01 |
|   Pipe | 10000000 | 69,767.35 μs | 4,993.492 μs | 14,723.423 μs | 64,169.46 μs |  1.00 |    0.00 |
|    Tcp | 10000000 | 24,660.43 μs | 1,746.809 μs |  4,955.406 μs | 23,947.15 μs |  0.38 |    0.14 |
Notebook (Windows 10):
BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19043.1083 (21H1/May2021Update)
Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
.NET SDK=5.0.203
[Host] : .NET 5.0.6 (5.0.621.22011), X64 RyuJIT
DefaultJob : .NET 5.0.6 (5.0.621.22011), X64 RyuJIT
| Method |        N |         Mean |      Error |     StdDev |       Median | Ratio | RatioSD |
|------- |--------- |-------------:|-----------:|-----------:|-------------:|------:|--------:|
|   Pipe |        1 |     22.60 μs |   0.441 μs |   1.013 μs |     22.21 μs |  1.00 |    0.00 |
|    Tcp |        1 |     27.42 μs |   0.535 μs |   1.019 μs |     27.51 μs |  1.21 |    0.08 |
|   Pipe |      100 |     21.93 μs |   0.146 μs |   0.122 μs |     21.94 μs |  1.00 |    0.00 |
|    Tcp |      100 |     26.06 μs |   0.506 μs |   0.474 μs |     25.99 μs |  1.19 |    0.02 |
|   Pipe |    10000 |     29.59 μs |   0.126 μs |   0.099 μs |     29.58 μs |  1.00 |    0.00 |
|    Tcp |    10000 |     33.25 μs |   0.655 μs |   0.919 μs |     33.01 μs |  1.14 |    0.04 |
|   Pipe |  1000000 |  1,675.35 μs |  32.862 μs |  43.870 μs |  1,685.37 μs |  1.00 |    0.00 |
|    Tcp |  1000000 |  2,553.07 μs |  58.100 μs | 167.631 μs |  2,505.34 μs |  1.63 |    0.10 |
|   Pipe | 10000000 | 23,421.61 μs | 141.337 μs | 132.207 μs | 23,380.19 μs |  1.00 |    0.00 |
|    Tcp | 10000000 | 28,182.91 μs | 375.644 μs | 313.679 μs | 28,114.22 μs |  1.20 |    0.01 |
Benchmark code:
Benchmark.csproj
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net5.0</TargetFramework>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="BenchmarkDotNet" Version="0.13.0" />
</ItemGroup>
</Project>
Program.cs
using BenchmarkDotNet.Running;
using System;
using System.IO;
using System.Linq;
using System.Net.Sockets;
using System.Runtime.InteropServices;
namespace Benchmark
{
public class Program
{
public const int MIN_LENGTH = 1;
public const int MAX_LENGTH = 10_000_000;
static void Main(string[] args)
{
if (!args.Any())
{
var summary = BenchmarkRunner.Run<PipeVsTcp>();
}
else
{
var data = MemoryMarshal
.AsBytes<int>(
Enumerable
.Range(0, MAX_LENGTH)
.ToArray())
.ToArray();
using var readStream = Console.OpenStandardInput();
if (args[0] == "pipe")
{
using var pipeStream = Console.OpenStandardOutput();
RunChildProcess(readStream, pipeStream, data);
}
else if (args[0] == "tcp")
{
var tcpClient = new TcpClient()
{
NoDelay = true
};
tcpClient.Connect("localhost", 55555);
var tcpStream = tcpClient.GetStream();
RunChildProcess(readStream, tcpStream, data);
}
else
{
throw new Exception("Invalid argument (args[0]).");
}
}
}
static void RunChildProcess(Stream readStream, Stream writeStream, byte[] data)
{
// wait for start signal
Span<byte> buffer = stackalloc byte[4];
while (true)
{
var length = readStream.Read(buffer);
if (length == 0)
throw new Exception($"The host process terminated early.");
var N = BitConverter.ToInt32(buffer);
// write
writeStream.Write(data, 0, N * sizeof(int));
}
}
}
}
PipeVsTcp.cs
using BenchmarkDotNet.Attributes;
using System;
using System.Buffers;
using System.Diagnostics;
using System.IO;
using System.Net;
using System.Net.Sockets;
using System.Reflection;
using System.Runtime.InteropServices;
namespace Benchmark
{
[MemoryDiagnoser]
public class PipeVsTcp
{
private Process _pipeProcess;
private Process _tcpProcess;
private TcpClient _tcpClient;
[GlobalSetup]
public void GlobalSetup()
{
// assembly path
// under Linux the Location property is an empty
// string (why?), therefore I replaced it
// with a hard-coded string
var assemblyPath = Assembly.GetExecutingAssembly().Location;
// run pipe process
var pipePsi = new ProcessStartInfo("dotnet")
{
Arguments = $"{assemblyPath} pipe",
UseShellExecute = false,
RedirectStandardInput = true,
RedirectStandardOutput = true,
RedirectStandardError = true
};
_pipeProcess = new Process() { StartInfo = pipePsi };
_pipeProcess.Start();
// run tcp process
var tcpPsi = new ProcessStartInfo("dotnet")
{
Arguments = $"{assemblyPath} tcp",
UseShellExecute = false,
RedirectStandardInput = true,
RedirectStandardOutput = true,
RedirectStandardError = true
};
_tcpProcess = new Process() { StartInfo = tcpPsi };
_tcpProcess.Start();
var tcpListener = new TcpListener(IPAddress.Parse("127.0.0.1"), 55555);
tcpListener.Start();
_tcpClient = tcpListener.AcceptTcpClient();
_tcpClient.NoDelay = true;
}
[GlobalCleanup]
public void GlobalCleanup()
{
_pipeProcess?.Kill();
_tcpProcess?.Kill();
}
[Params(Program.MIN_LENGTH, 100, 10_000, 1_000_000, Program.MAX_LENGTH)]
public int N;
[Benchmark(Baseline = true)]
public Memory<byte> Pipe()
{
var pipeReadStream = _pipeProcess.StandardOutput.BaseStream;
var pipeWriteStream = _pipeProcess.StandardInput.BaseStream;
using var owner = MemoryPool<byte>.Shared.Rent(N * sizeof(int));
return ReadFromStream(pipeReadStream, pipeWriteStream, owner.Memory);
}
[Benchmark()]
public Memory<byte> Tcp()
{
var tcpReadStream = _tcpClient.GetStream();
var pipeWriteStream = _tcpProcess.StandardInput.BaseStream;
using var owner = MemoryPool<byte>.Shared.Rent(N * sizeof(int));
return ReadFromStream(tcpReadStream, pipeWriteStream, owner.Memory);
}
private Memory<byte> ReadFromStream(Stream readStream, Stream writeStream, Memory<byte> buffer)
{
// trigger
var Nbuffer = BitConverter.GetBytes(N);
writeStream.Write(Nbuffer);
writeStream.Flush();
// receive data
var remaining = N * sizeof(int);
var offset = 0;
while (remaining > 0)
{
var span = buffer.Slice(offset, remaining).Span;
var readBytes = readStream.Read(span);
if (readBytes == 0)
throw new Exception("The child process terminated early.");
remaining -= readBytes;
offset += readBytes;
}
var intBuffer = MemoryMarshal.Cast<byte, int>(buffer.Span);
// validate first 3 values
for (int i = 0; i < Math.Min(N, 3); i++)
{
if (intBuffer[i] != i)
throw new Exception($"Invalid data received. Data is {intBuffer[i]}, index = {i}.");
}
return buffer;
}
}
}

Broadcasting assignment slow Julia

I have something like this (simple example):
using BenchmarkTools
function assign()
e = zeros(100, 90000)
e2 = ones(100) * 0.16
e[:, 100:end] .= e2[:]
end
@benchmark assign()
and I need to do this for thousands of time steps. This gives
BenchmarkTools.Trial:
memory estimate: 68.67 MiB
allocs estimate: 6
--------------
minimum time: 16.080 ms (0.00% GC)
median time: 27.811 ms (0.00% GC)
mean time: 31.822 ms (12.31% GC)
maximum time: 43.439 ms (27.66% GC)
--------------
samples: 158
evals/sample: 1
Is there a faster way of doing this?
First of all I will assume that you meant
function assign1()
e = zeros(100, 90000)
e2 = ones(100) * 0.16
e[:, 100:end] .= e2[:]
return e # <- important!
end
Since otherwise the function returns the result of the broadcast assignment (only columns 100:end) rather than e itself, i.e. you lose the first 99 columns(!):
julia> size(assign())
(100, 89901)
Secondly, don't do this:
e[:, 100:end] .= e2[:]
e2[:] makes a copy of e2 and assigns that, but the copy is unnecessary. Just assign e2 directly:
e[:, 100:end] .= e2
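You can see the cost of that extra copy directly (a small sketch of mine, using the sizes from the question; not part of the original answer):

using BenchmarkTools
e  = zeros(100, 90000);
e2 = ones(100) * 0.16;
@btime $e[:, 100:end] .= $e2[:];   # allocates a copy of e2 on every call
@btime $e[:, 100:end] .= $e2;      # no copy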
Ok, but let's try a few different versions. Notice that there is no need to make e2 a vector, just assign a scalar:
function assign2()
e = zeros(100, 90000)
e[:, 100:end] .= 0.16 # Just broadcast a scalar!
return e
end
function assign3()
e = fill(0.16, 100, 90000) # use fill instead of writing all those zeros that you will throw away
e[:, 1:99] .= 0
return e
end
function assign4()
# only write exactly the values you need!
e = Matrix{Float64}(undef, 100, 90000)
e[:, 1:99] .= 0
e[:, 100:end] .= 0.16
return e
end
Time to benchmark
julia> #btime assign1();
14.550 ms (5 allocations: 68.67 MiB)
julia> #btime assign2();
14.481 ms (2 allocations: 68.66 MiB)
julia> #btime assign3();
9.636 ms (2 allocations: 68.66 MiB)
julia> #btime assign4();
10.062 ms (2 allocations: 68.66 MiB)
Versions 1 and 2 are equally fast; you'll notice that version 2 has 2 allocations instead of 5, but, of course, the single big allocation dominates.
Versions 3 and 4 are faster, not dramatically so, but they avoid some duplicate work, such as writing values into the matrix twice. Version 3 is the fastest here, though not by much, and this changes if the assignment is a bit more balanced, in which case version 4 is faster:
function assign3_()
e = fill(0.16, 100, 90000)
e[:, 1:44999] .= 0
return e
end
function assign4_()
e = Matrix{Float64}(undef, 100, 90000)
e[:, 1:44999] .= 0
e[:, 45000:end] .= 0.16
return e
end
julia> #btime assign3_();
11.576 ms (2 allocations: 68.66 MiB)
julia> #btime assign4_();
8.658 ms (2 allocations: 68.66 MiB)
The lesson is to avoid doing unnecessary work.

Why is 'all(itr) do' block slower than a for loop in this case?

What my code does
The goal was to build a function in Julia that checks whether all brackets open and close correctly in a given string. So,
"{abc()([[def]])()}"
should return true, while something like
"{(bracket order mixed up here!})[and this bracket doesn't close!"
should return false.
Question
I have two versions of the function. Why is version I faster by about 10%?
Version I
function matching_brackets_old(s::AbstractString)
close_open_map = Dict('}' => '{', ')' => '(', ']' => '[')
order_arr = []
for char in s
if char in values(close_open_map)
push!(order_arr, char)
elseif (char in keys(close_open_map)) &&
(isempty(order_arr) || (close_open_map[char] != pop!(order_arr)))
return false
end
end
return isempty(order_arr)
end
Version II
Here I replace the for loop with a do block:
function matching_brackets(s::AbstractString)
close_open_map = Dict('}' => '{', ')' => '(', ']' => '[')
order_arr = []
all_correct = all(s) do char
if char in values(close_open_map)
push!(order_arr, char)
elseif (char in keys(close_open_map)) &&
(isempty(order_arr) || (close_open_map[char] != pop!(order_arr)))
return false
end
return true
end
return all_correct && isempty(order_arr)
end
Timings
Using BenchmarkTools' @benchmark for the strings "{()()[()]()}" and "{()()[())]()}", I get a slowdown of about 10% for both strings when comparing minimum execution times.
Additional Info
Version Info:
Julia Version 1.3.1
Commit 2d5741174c (2019-12-30 21:36 UTC)
Platform Info:
OS: macOS (x86_64-apple-darwin18.6.0)
CPU: Intel(R) Core(TM) i5-4260U CPU @ 1.40GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-6.0.1 (ORCJIT, haswell)
Timing Code:
using BenchmarkTools
benchmark_strings = ["{()()[()]()}", "{()()[())]()}"]
for s in benchmark_strings
b_old = @benchmark matching_brackets_old("$s") samples=100000 seconds=30
b_new = @benchmark matching_brackets("$s") samples=100000 seconds=30
println("For String=", s)
println(b_old)
println(b_new)
println(judge(minimum(b_new), minimum(b_old)))
println("Result: ", matching_brackets(s))
end
With Result:
For String={()()[()]()}
Trial(8.177 μs)
Trial(9.197 μs)
TrialJudgement(+12.48% => regression)
Result: true
For String={()()[())]()}
Trial(8.197 μs)
Trial(9.202 μs)
TrialJudgement(+12.27% => regression)
Result: false
Edit
I mixed up the order in the TrialJudgement, so Version I is faster, as François Févotte suggests. My question remains: why?
Now that the mistake with judge is resolved, the answer is probably the usual caveat: function calls, in this case resulting from the closure passed to all, are quite optimized, but not free.
To get a real improvement I suggest, other than making the stack type stable (which isn't that big a deal here), getting rid of the iterations you implicitly do by calling in on values and keys of the dictionary. It suffices to do that only once, without a dictionary:
const MATCHING_PAIRS = ('{' => '}', '(' => ')', '[' => ']')
function matching_brackets(s::AbstractString)
stack = Vector{eltype(s)}()
for c in s
for (open, close) in MATCHING_PAIRS
if c == open
push!(stack, c)
elseif c == close
if isempty(stack) || (pop!(stack) != open)
return false
end
end
end
end
return isempty(stack)
end
Even a bit more time can be squeezed out by unrolling the inner loop over the tuple:
function matching_brackets_unrolled(s::AbstractString)
stack = Vector{eltype(s)}()
for c in s
if (c == '(') || (c == '[') || (c == '{')
push!(stack, c)
elseif (c == ')')
if isempty(stack) || (pop!(stack) != '(')
return false
end
elseif (c == ']')
if isempty(stack) || (pop!(stack) != '[')
return false
end
elseif (c == '}')
if isempty(stack) || (pop!(stack) != '{')
return false
end
end
end
return isempty(stack)
end
This is somewhat ugly and certainly not nicely extendable, though. My benchmarks (matching_brackets_new is your second version, matching_brackets my first one):
julia> versioninfo()
Julia Version 1.3.1
Commit 2d5741174c (2019-12-30 21:36 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: Intel(R) Core(TM) i7 CPU 960 @ 3.20GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-6.0.1 (ORCJIT, nehalem)
# NOT MATCHING
julia> @benchmark matching_brackets_new("{()()[())]()}")
BenchmarkTools.Trial:
memory estimate: 784 bytes
allocs estimate: 16
--------------
minimum time: 674.844 ns (0.00% GC)
median time: 736.200 ns (0.00% GC)
mean time: 800.935 ns (6.54% GC)
maximum time: 23.831 μs (96.16% GC)
--------------
samples: 10000
evals/sample: 160
julia> @benchmark matching_brackets_old("{()()[())]()}")
BenchmarkTools.Trial:
memory estimate: 752 bytes
allocs estimate: 15
--------------
minimum time: 630.743 ns (0.00% GC)
median time: 681.725 ns (0.00% GC)
mean time: 753.937 ns (6.41% GC)
maximum time: 23.056 μs (94.19% GC)
--------------
samples: 10000
evals/sample: 171
julia> @benchmark matching_brackets("{()()[())]()}")
BenchmarkTools.Trial:
memory estimate: 112 bytes
allocs estimate: 2
--------------
minimum time: 164.883 ns (0.00% GC)
median time: 172.900 ns (0.00% GC)
mean time: 186.523 ns (4.33% GC)
maximum time: 5.428 μs (96.54% GC)
--------------
samples: 10000
evals/sample: 759
julia> @benchmark matching_brackets_unrolled("{()()[())]()}")
BenchmarkTools.Trial:
memory estimate: 112 bytes
allocs estimate: 2
--------------
minimum time: 134.459 ns (0.00% GC)
median time: 140.292 ns (0.00% GC)
mean time: 150.067 ns (5.84% GC)
maximum time: 5.095 μs (96.56% GC)
--------------
samples: 10000
evals/sample: 878
# MATCHING
julia> @benchmark matching_brackets_old("{()()[()]()}")
BenchmarkTools.Trial:
memory estimate: 800 bytes
allocs estimate: 18
--------------
minimum time: 786.358 ns (0.00% GC)
median time: 833.873 ns (0.00% GC)
mean time: 904.437 ns (5.43% GC)
maximum time: 29.355 μs (96.88% GC)
--------------
samples: 10000
evals/sample: 106
julia> @benchmark matching_brackets_new("{()()[()]()}")
BenchmarkTools.Trial:
memory estimate: 832 bytes
allocs estimate: 19
--------------
minimum time: 823.597 ns (0.00% GC)
median time: 892.506 ns (0.00% GC)
mean time: 981.381 ns (5.98% GC)
maximum time: 47.308 μs (97.84% GC)
--------------
samples: 10000
evals/sample: 77
julia> @benchmark matching_brackets("{()()[()]()}")
BenchmarkTools.Trial:
memory estimate: 112 bytes
allocs estimate: 2
--------------
minimum time: 206.062 ns (0.00% GC)
median time: 214.481 ns (0.00% GC)
mean time: 227.385 ns (3.38% GC)
maximum time: 6.890 μs (96.22% GC)
--------------
samples: 10000
evals/sample: 535
julia> @benchmark matching_brackets_unrolled("{()()[()]()}")
BenchmarkTools.Trial:
memory estimate: 112 bytes
allocs estimate: 2
--------------
minimum time: 160.186 ns (0.00% GC)
median time: 164.752 ns (0.00% GC)
mean time: 180.794 ns (4.95% GC)
maximum time: 5.751 μs (97.03% GC)
--------------
samples: 10000
evals/sample: 800
Update: if you insert breaks in the first version, to really avoid unnecessary looping, the timings are almost indistinguishable, with nice code:
function matching_brackets(s::AbstractString)
stack = Vector{eltype(s)}()
for c in s
for (open, close) in MATCHING_PAIRS
if c == open
push!(stack, c)
break
elseif c == close
if isempty(stack) || (pop!(stack) != open)
return false
end
break
end
end
end
return isempty(stack)
end
with
julia> @benchmark matching_brackets_unrolled("{()()[())]()}")
BenchmarkTools.Trial:
memory estimate: 112 bytes
allocs estimate: 2
--------------
minimum time: 137.574 ns (0.00% GC)
median time: 144.978 ns (0.00% GC)
mean time: 165.365 ns (10.44% GC)
maximum time: 9.344 μs (98.02% GC)
--------------
samples: 10000
evals/sample: 867
julia> @benchmark matching_brackets("{()()[())]()}") # with breaks
BenchmarkTools.Trial:
memory estimate: 112 bytes
allocs estimate: 2
--------------
minimum time: 148.255 ns (0.00% GC)
median time: 155.231 ns (0.00% GC)
mean time: 175.245 ns (9.62% GC)
maximum time: 9.602 μs (98.31% GC)
--------------
samples: 10000
evals/sample: 839
I don't observe the same on my machine: in my tests, version I is faster for both strings:
julia> versioninfo()
Julia Version 1.3.0
Commit 46ce4d7933 (2019-11-26 06:09 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-6.0.1 (ORCJIT, skylake)
Environment:
JULIA_PROJECT = @.
julia> @btime matching_brackets_old("{()()[()]()}")
716.443 ns (18 allocations: 800 bytes)
true
julia> @btime matching_brackets("{()()[()]()}")
761.434 ns (19 allocations: 832 bytes)
true
julia> @btime matching_brackets_old("{()()[())]()}")
574.847 ns (15 allocations: 752 bytes)
false
julia> @btime matching_brackets("{()()[())]()}")
612.793 ns (16 allocations: 784 bytes)
false
I would think (but this is a wild guess) that the difference between for loops and higher-order functions gets less and less significant when the string size increases.
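One way to test that guess (my sketch, not from the original answer) would be to time both versions on a much longer input, e.g.:

using BenchmarkTools
long_s = repeat("{abc()([[def]])()}", 1_000)
@btime matching_brackets_old($long_s)
@btime matching_brackets($long_s)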
However, I would encourage you to look more closely at the order_arr variable: as it is currently written, it is of type Vector{Any}, which - like any container of abstractly typed values - hurts performance. The following version performs better by concretely typing the elements of order_arr:
function matching_brackets_new(s::AbstractString)
close_open_map = Dict('}' => '{', ')' => '(', ']' => '[')
# Make sure the compiler knows about the type of elements in order_arr
order_arr = eltype(s)[] # or order_arr = Char[]
for char in s
if char in values(close_open_map)
push!(order_arr, char)
elseif (char in keys(close_open_map)) &&
(isempty(order_arr) || (close_open_map[char] != pop!(order_arr)))
return false
end
end
return isempty(order_arr)
end
yielding:
julia> @btime matching_brackets_new("{()()[()]()}")
570.641 ns (18 allocations: 784 bytes)
true
julia> @btime matching_brackets_new("{()()[())]()}")
447.758 ns (15 allocations: 736 bytes)
false

Counting swaps for sorting statistics - what with swaps with only two assignments instead of three

While helping out a student with his classes, I implemented the dual pivot quicksort algorithm to prepare a session and got intrigued. After running some statistics, then solving the worst case situation, then running stats again, and again solving the next worst case situation, and repeating this process several times, the resulting code is no more than 80 lines of simple straightforward Python code (a bit less than Vladimir's code). The novel part is how the 3 partitions are constructed in combination with some very simple yet effective post processing of them. Now I need some help on how to test and make statistics properly.
Especially about how to count the swaps: most of the swaps only perform two assignments instead of three. So must I count them as full swaps, or is it fair to count them only as a '2/3' swap?
Counting every swap as 1, the Cn in Cn * N * log2(N) is around 0.48 on short lists (<100 elements) and around 0.55 on longer lists of several million elements. The latter is just the theoretical minimum as calculated by Vladimir Yaroslavskiy.
Counting the lighter swaps as 2/3 instead, the swap coefficient Cn is almost the same for any list size, around 0.36 (stdev around 0.015).
The Cn for the number of comparisons is on average around 1.3 for lists of 2 million records, which is less than the theoretical 1.38 (from 2*N*ln(N) = 2*ln(2)*N*log2(N) ≈ 1.386*N*log2(N)), and lower for shorter lists, e.g. around 1.21 for 1024 elements.
That is for lists of 100% unique numbers, randomly ordered with Python's random.shuffle().
So my question is:
Is it ok to count the lighter swaps as such, and is the result indeed promising or not?
Also interesting is:
the more equal elements in the list, the faster it sorts. Cn is 0.03 and 0.1 for swaps and comparisons respectively for a 2-million-element list of all equal elements.
Cn for sorted and reverse-sorted lists is almost the same for all sizes: 0.3 and 1 for the swaps (counted as 2/3) and comparisons respectively.
I will shortly post a list with more statistics, which includes the maximum stack depth and the number of recursive calls besides the swaps and comparisons. Are there other things I should count?
Also, are there some 'standard' test suites with files of all kinds of situations (with equals, partially sorted etc.) one can use to test a sorting algorithm, and to make the results comparable with other sorting algorithms?
Added May 5:
I improved the algorithm especially for sorted lists.
Here are the results of 20 runs for each.
Are these good results?
New statistics:
Random.shuffle(), unique numbers
Length Swaps/Nlog2(N) Comparisons/Nlog2(N) Maximum Stack/log2(N)
16 0.367 0.922 0.250
64 0.360 1.072 0.500
256 0.342 1.122 0.625
1024 0.358 1.156 0.800
4096 0.359 1.199 0.917
16384 0.359 1.244 1.071
65536 0.360 1.244 1.125
262144 0.360 1.269 1.167
1048576 0.362 1.275 1.200
Sorted, unique numbers
Length Swaps/Nlog2(N) Comparisons/Nlog2(N) Maximum Stack/log2(N)
16 0.172 0.531 0.250
64 0.117 0.586 0.333
256 0.087 0.609 0.375
1024 0.075 0.740 0.500
4096 0.060 0.732 0.500
16384 0.051 0.726 0.500
65536 0.044 0.722 0.500
262144 0.041 0.781 0.556
1048576 0.036 0.774 0.550
2097152 0.035 0.780 0.571
Reversed order, unique numbers
Length Swaps/Nlog2(N) Comparisons/Nlog2(N) Maximum Stack/log2(N)
16 0.344 0.828 0.250
64 0.279 0.812 0.333
256 0.234 0.788 0.375
1024 0.210 0.858 0.500
4096 0.190 0.865 0.500
16384 0.172 0.855 0.500
65536 0.158 0.846 0.500
262144 0.153 0.900 0.556
1048576 0.143 0.892 0.550
2097152 0.140 0.895 0.571
I have chosen to count the assignments executed on the elements to be sorted, instead of 'swaps'. Assignments and comparisons of indexes are not counted.
I converted the code Vladimir Yaroslavskiy included in his document (Last updated: September 22, 2009) to Python and added the counters the same way as I did in my own implementation. The code is included at the end.
Any comments are welcome.
Here are the results, the averages of 10 runs.
The columns labeled VY are the results for the implementation by Vladimir; the columns labeled JB are those of my own implementation.
Length F Function calls Assignments Comparisons Maximum Stack
of list per N per N.log2(N) per N.log2(N) per log2(N)
Random.shuffle(), unique numbers
Version VY JB VY JB VY JB VY JB
64 1 0.170 0.266 1.489 1.029 1.041 1.028 0.417 0.633
256 1 0.171 0.270 1.463 1.016 1.066 1.138 0.575 0.812
1024 1 0.167 0.275 1.451 1.046 1.089 1.165 0.690 1.010
4096 1 0.164 0.273 1.436 1.069 1.119 1.189 0.800 1.075
16384 1 0.166 0.273 1.444 1.077 1.117 1.270 0.843 1.221
65536 1 0.166 0.273 1.440 1.108 1.126 1.258 0.919 1.281
262144 1 0.166 0.273 1.423 1.102 1.134 1.278 0.950 1.306
1048576 1 0.166 0.273 1.426 1.085 1.131 1.273 0.990 1.290
Sorted, unique numbers
Version VY JB VY JB VY JB VY JB
64 1 0.203 0.203 1.036 0.349 0.643 0.586 0.333 0.333
256 1 0.156 0.156 0.904 0.262 0.643 0.609 0.375 0.375
1024 1 0.118 0.355 0.823 0.223 0.642 0.740 0.400 0.500
4096 1 0.131 0.267 0.840 0.181 0.679 0.732 0.500 0.500
16384 1 0.200 0.200 0.926 0.152 0.751 0.726 0.500 0.500
65536 1 0.150 0.150 0.866 0.131 0.737 0.722 0.500 0.500
262144 1 0.113 0.338 0.829 0.124 0.728 0.781 0.500 0.556
1048576 1 0.147 0.253 0.853 0.108 0.750 0.774 0.550 0.550
Reversed order, unique numbers
Version VY JB VY JB VY JB VY JB
64 1 0.203 0.203 1.320 0.836 0.841 0.802 0.333 0.333
256 1 0.156 0.156 1.118 0.703 0.795 0.783 0.375 0.375
1024 1 0.118 0.312 1.002 0.631 0.768 0.852 0.400 0.500
4096 1 0.125 0.267 0.977 0.569 0.776 0.861 0.500 0.500
16384 1 0.200 0.200 1.046 0.516 0.834 0.852 0.500 0.500
65536 1 0.150 0.150 0.974 0.475 0.813 0.844 0.500 0.500
262144 1 0.113 0.338 0.925 0.459 0.795 0.896 0.500 0.556
1048576 1 0.145 0.253 0.938 0.430 0.811 0.890 0.550 0.550
Random, with increasing frequency of the numbers.
The last row is a list of the same number
Version VY JB VY JB VY JB VY JB
65536 1 0.166 0.273 1.429 1.051 1.113 1.251 0.881 1.156
65536 2 0.167 0.270 1.404 1.075 1.112 1.238 0.894 1.194
65536 4 0.168 0.273 1.373 1.039 1.096 1.213 0.906 1.238
65536 8 0.151 0.245 1.302 1.029 1.069 1.199 0.900 1.262
65536 16 0.132 0.127 1.264 0.970 1.020 1.150 0.912 1.188
65536 32 0.090 0.064 1.127 0.920 0.950 1.099 0.856 1.119
65536 64 0.051 0.032 1.000 0.845 0.879 0.993 0.819 1.019
65536 128 0.026 0.016 0.884 0.792 0.797 0.923 0.725 0.931
65536 256 0.013 0.008 0.805 0.704 0.728 0.840 0.675 0.856
65536 512 0.006 0.004 0.690 0.615 0.652 0.728 0.588 0.669
65536 1024 0.003 0.002 0.635 0.557 0.579 0.654 0.519 0.625
65536 2048 0.002 0.001 0.541 0.487 0.509 0.582 0.438 0.463
65536 4096 0.001 0.000 0.459 0.417 0.434 0.471 0.369 0.394
65536 8192 0.000 0.000 0.351 0.359 0.357 0.405 0.294 0.300
65536 16384 0.000 0.000 0.247 0.297 0.253 0.314 0.206 0.194
65536 32768 0.000 0.000 0.231 0.188 0.209 0.212 0.125 0.081
65536 65536 0.000 0.000 0.063 0.125 0.063 0.125 0.062 0.000
Here is the code of Vladimir's sort in Python:
DIST_SIZE = 13
TINY_SIZE = 17

def dualPivotQuicksort(a, left, right, nesting=0):
    global assignements, comparisons, oproepen, maxnesting
    oproepen += 1
    maxnesting = max(maxnesting, nesting)
    length = right - left
    if length < TINY_SIZE: # insertion sort on tiny array
        # note by JB: rewritten to minimize the assignements
        for i in xrange(left+1, right+1):
            key = a[i]
            assignements += 1
            while i > left:
                comparisons += 1
                if key < a[i - 1]:
                    assignements += 1
                    a[i] = a[i-1]
                    i -= 1
                else:
                    break
            assignements += 1
            a[i] = key
        return
    # median indexes
    sixth = length / 6
    m1 = left + sixth
    m2 = m1 + sixth
    m3 = m2 + sixth
    m4 = m3 + sixth
    m5 = m4 + sixth
    assignements += 9*3
    comparisons += 9
    ## 5-element sorting network
    if a[m1] > a[m2]: a[m1],a[m2] = a[m2],a[m1]
    if a[m4] > a[m5]: a[m4],a[m5] = a[m5],a[m4]
    if a[m1] > a[m3]: a[m1],a[m3] = a[m3],a[m1]
    if a[m2] > a[m3]: a[m2],a[m3] = a[m3],a[m2]
    if a[m1] > a[m4]: a[m1],a[m4] = a[m4],a[m1]
    if a[m3] > a[m4]: a[m3],a[m4] = a[m4],a[m3]
    if a[m2] > a[m5]: a[m2],a[m5] = a[m5],a[m2]
    if a[m2] > a[m3]: a[m2],a[m3] = a[m3],a[m2]
    if a[m4] > a[m5]: a[m4],a[m5] = a[m5],a[m4]
    # pivots: [ < pivot1 | pivot1 <= && <= pivot2 | > pivot2 ]
    assignements += 2
    pivot1 = a[m2]
    pivot2 = a[m4]
    comparisons += 1
    diffPivots = pivot1 != pivot2
    assignements += 2
    a[m2] = a[left]
    a[m4] = a[right]
    # center part pointers
    less = left + 1
    great = right - 1
    # sorting
    if (diffPivots):
        k = less
        while k <= great:
            assignements += 1
            x = a[k]
            comparisons += 2
            if (x < pivot1):
                comparisons -= 1
                assignements += 2
                a[k] = a[less]
                a[less] = x
                less += 1
            elif (x > pivot2):
                while k < great:
                    comparisons += 1
                    if a[great] > pivot2:
                        great -= 1
                    else:
                        break
                assignements += 3
                a[k] = a[great]
                a[great] = x
                great -= 1
                x = a[k]
                comparisons += 1
                if (x < pivot1):
                    assignements += 2
                    a[k] = a[less]
                    a[less] = x
                    less += 1
            k += 1
    else:
        k = less
        while k <= great:
            assignements += 1
            x = a[k]
            comparisons += 1
            if (x == pivot1):
                k += 1
                continue
            comparisons += 1
            if (x < pivot1):
                assignements += 2
                a[k] = a[less]
                a[less] = x
                less += 1
            else:
                while k < great:
                    comparisons += 1
                    if a[great] > pivot2:
                        great -= 1
                    else:
                        break
                assignements += 3
                a[k] = a[great]
                a[great] = x
                great -= 1
                x = a[k]
                comparisons += 1
                if (x < pivot1):
                    assignements += 2
                    a[k] = a[less]
                    a[less] = x
                    less += 1
            k += 1
    # swap
    assignements += 2
    a[left] = a[less - 1]
    a[less - 1] = pivot1
    assignements += 2
    a[right] = a[great + 1]
    a[great + 1] = pivot2
    # left and right parts
    dualPivotQuicksort(a, left, less - 2, nesting+1)
    dualPivotQuicksort(a, great + 2, right, nesting+1)
    # equal elements
    if (great - less > length - DIST_SIZE and diffPivots):
        k = less
        while k <= great:
            assignements += 1
            x = a[k]
            comparisons += 2
            if (x == pivot1):
                comparisons -= 1
                assignements += 2
                a[k] = a[less]
                a[less] = x
                less += 1
            elif (x == pivot2):
                assignements += 3
                a[k] = a[great]
                a[great] = x
                great -= 1
                x = a[k]
                comparisons += 1
                if (x == pivot1):
                    assignements += 2
                    a[k] = a[less]
                    a[less] = x
                    less += 1
            k += 1
    # center part
    if (diffPivots):
        dualPivotQuicksort(a, less, great, nesting+1)
This code is about 190 lines, my current implementation written with the same formatting is about 110 lines.
So any remarks are welcome.
