Getting element from a `Vec<Vec<u8>>` very slow - performance

I'm running the following snippet of code (the std::time objects are just there for benchmarking) that gets u8 elements from a vector of vector of u8 in a given order and creates a new vector with these objects in this order.
for idx in cur_prefix_ref.iter() {
let now = std::time::Instant::now();
let elapsed_first = now.elapsed();
unsafe {
val = *data.get_unchecked(*idx as usize).get_unchecked(j);
let elapsed_second = now.elapsed();
if val == 0 {
zero_tot += 1;
} else if val == 1 {
one_tot += 1;
if (ct == 0) || (ct_extra == fm_gap) {
ct_extra = 0;
ct += 1;
ct_extra += 1;
let elapsed_third = now.elapsed();
In my full code this inner loop ends up running hundreds of millions of times, so I'm trying to optimise it as much as possible. According to my benchmarking, I seem to be spending the vast majority of the loop time looking up values from my `Vec<Vec<u8>>`, on the line `val = *data.get_unchecked(*idx as usize).get_unchecked(j);`. See below for some elapsed_first, elapsed_second and elapsed_third times from different iterations of this loop (the i-th element of each list is from the same run):
First: [27ns, 23ns, 21ns, 24ns, 27ns, 23ns, 28ns, 23ns, 26ns, 23ns, 21ns, 22ns, 27ns, 27ns, 28ns, 23ns, 25ns, 24ns, 26ns, 25ns, 22ns, 24ns, 24ns, 28ns, 28ns, 28ns, 26ns, 22ns, 22ns, 21ns]
Second: [538ns, 695ns, 550ns, 486ns, 627ns, 615ns, 562ns, 570ns, 661ns, 521ns, 617ns, 358ns, 444ns, 560ns, 540ns, 471ns, 656ns, 336ns, 233ns, 209ns, 433ns, 373ns, 1.427µs, 542ns, 708ns, 288ns, 304ns, 608ns, 297ns, 252ns]
Third: [612ns, 736ns, 587ns, 525ns, 665ns, 658ns, 608ns, 614ns, 701ns, 560ns, 656ns, 395ns, 482ns, 606ns, 578ns, 510ns, 696ns, 374ns, 270ns, 246ns, 470ns, 416ns, 1.47µs, 583ns, 751ns, 327ns, 348ns, 645ns, 334ns, 289ns]
I've been trying to understand why this simple vector lookup is the bit that takes by far the most time compared to everything else and still haven't figured it out. Any help is much appreciated!
EDIT: Here is the full function which this code comes from:
pub fn spaced_pbwt(vcf: &VCFData, pbwt_cols: &Vec<SiteRow>, fm_gap: u32) -> SpacedPbwt {
let now = std::time::Instant::now();
let data_positions: Vec<u32> = vcf.positions.clone();
let mut pbwt_positions: Vec<u32> = Vec::new();
let mut insert_positions: Vec<u32> = Vec::new();
let data: &Vec<Vec<u8>> = &vcf.vcf_data;
let mut col_set: HashSet<u32> = HashSet::new();
let mut n: usize = 0;
for col in pbwt_cols {
let pos = col.position;
n += 1;
let m = data.len();
let n_full = data[0].len();
let n_other = n_full-n;
let mut is_pbwt_col :Vec<u8> = Vec::with_capacity(n_full+1);
let mut pbwt_positions: Vec<u32> = Vec::new();
let mut inserted_positions: Vec<u32> = Vec::new();
let mut prefixes : Vec<Vec<u32>> = Vec::with_capacity(n+1);
let mut divergences : Vec<Vec<u32>> = Vec::with_capacity(n+1);
let mut binaries: Vec<Vec<u8>> = Vec::with_capacity(n_full+1);
let cur_prefix : Vec<u32> = Vec::from_iter(0..m as u32);
let cur_divergence : Vec<u32> = vec![0; m];
let mut j: usize = 0;
let mut j_pbwt = 0;
let mut count_vec: Vec<u32> = Vec::new();
let mut occ_vec : Vec<Vec<Vec<u32>>> = Vec::new();
let mut cur_prefix_ref: &Vec<u32> = &(prefixes[prefixes.len()-1]);
let mut cur_divergence_ref: &Vec<u32> = &divergences[divergences.len()-1];
let mut ct: i32 = 0;
let mut ct_extra: u32 = 0;
let mut zero_tot: u32 = 0;
let mut one_tot: u32 = 0;
let mut occ_positions: Vec<Vec<u32>> = vec![Vec::new(),Vec::new()];
let mut new_add: Vec<u8> = Vec::with_capacity(m);
let mut a: Vec<u32> = Vec::with_capacity(m);
let mut b: Vec<u32> = Vec::with_capacity(m);
let mut d: Vec<u32> = Vec::with_capacity(m);
let mut e: Vec<u32> = Vec::with_capacity(m);
let mut bin_values: Vec<u8> = Vec::with_capacity(m);
let mut elapse1 = Vec::new();
let mut elapse2 = Vec::new();
let mut elapse3 = Vec::new();
for col in &vcf.positions {
if !col_set.contains(&col) {
ct = 0;
ct_extra = 0;
zero_tot = 0;
one_tot = 0;
occ_positions = vec![Vec::new(),Vec::new()];
new_add = Vec::with_capacity(m);
let mut val: u8;
for idx in cur_prefix_ref.iter() {
let now = std::time::Instant::now();
let elapsed_first = now.elapsed();
unsafe {
val = *data.get_unchecked(*idx as usize).get_unchecked(j);
let elapsed_second = now.elapsed();
if val == 0 {
zero_tot += 1;
} else if val == 1 {
one_tot += 1;
if (ct == 0) || (ct_extra == fm_gap) {
ct_extra = 0;
ct += 1;
ct_extra += 1;
let elapsed_third = now.elapsed();
} else {
a = Vec::with_capacity(m);
b = Vec::with_capacity(m);
d = Vec::with_capacity(m);
e = Vec::with_capacity(m);
bin_values = Vec::with_capacity(m);
let mut p: u32 = j_pbwt+1;
let mut q: u32 = j_pbwt+1;
occ_positions = vec![Vec::new(),Vec::new()];
ct = 0;
ct_extra = 0;
zero_tot = 0;
one_tot = 0;
let mut cur_allele: u8;
for (idx,start_point) in
cur_prefix_ref.iter().zip(cur_divergence_ref.iter()) {
let idx_val = *idx;
unsafe {
cur_allele = *data.get_unchecked(idx_val as usize).get_unchecked(j);
let st = *start_point;
if st > p {
p = st;
if st > q {
q = st;
if cur_allele == 0 {
p = 0;
zero_tot += 1;
if cur_allele == 1 {
q = 0;
one_tot += 1;
if (ct == 0) || (ct_extra == fm_gap) {
ct_extra = 0;
ct += 1;
ct_extra += 1;
let mut new_prefix = a;
new_prefix.append(&mut b);
let mut new_divergence = d;
new_divergence.append(&mut e);
cur_prefix_ref = &(prefixes[prefixes.len()-1]);
cur_divergence_ref = &divergences[divergences.len()-1];
j += 1;
let elapsed = now.elapsed();
println!("Calc Time: {:.4?}", elapsed);
println!("First: {:?}", &elapse1[500..530]);
println!("Second: {:?}", &elapse2[500..530]);
println!("Third: {:?}", &elapse3[500..530]);
return SpacedPbwt {
num_samples: m as u32,
num_pbwt_sites: n as u32,
num_inserted_sites: n_other as u32,
num_total_sites: n_full as u32,
pbwt_positions: pbwt_positions,
inserted_positions: inserted_positions,
all_positions: data_positions,
pbwt_col_flags: is_pbwt_col,
bin_pbwt: binaries,
count: count_vec,
occ_list: occ_vec,
fm_gap: fm_gap,
Here is a modified version of the file that everybody should be able to run on their machine and does exhibit the behaviour I'm concerned about. It only uses the rand crate as a dependency:
use rand::{seq::IteratorRandom, thread_rng}; // 0.6.1
use rand::distributions::{Distribution, Uniform};
use std::collections::HashSet;
pub fn spaced_pbwt(data: &Vec<Vec<u8>>, fm_gap: u32) -> () {
let now = std::time::Instant::now();
let m = data.len();
let n = data[0].len();
let half_n = n/2;
let mut rng = thread_rng();
let sample: Vec<u32> = (0u32..n as u32).collect();
let perm = sample.iter().choose_multiple(&mut rng, half_n);
let mut cols_to_permute: Vec<u32> = Vec::new();
for i in perm {
let mut col_set: HashSet<u32> = HashSet::new();
let mut n: usize = 0;
for col in &cols_to_permute {
n += 1;
let m = data.len();
let n_full = data[0].len();
let n_other = n_full-n;
let mut is_pbwt_col :Vec<u8> = Vec::with_capacity(n_full+1);
let mut pbwt_positions: Vec<u32> = Vec::new();
let mut inserted_positions: Vec<u32> = Vec::new();
let mut prefixes : Vec<Vec<u32>> = Vec::with_capacity(n+1);
let mut divergences : Vec<Vec<u32>> = Vec::with_capacity(n+1);
let mut binaries: Vec<Vec<u8>> = Vec::with_capacity(n_full+1);
let cur_prefix : Vec<u32> = Vec::from_iter(0..m as u32);
let cur_divergence : Vec<u32> = vec![0; m];
let mut j: usize = 0;
let mut j_pbwt = 0;
let mut count_vec: Vec<u32> = Vec::new();
let mut occ_vec : Vec<Vec<Vec<u32>>> = Vec::new();
let mut cur_prefix_ref: &Vec<u32> = &(prefixes[prefixes.len()-1]);
let mut cur_divergence_ref: &Vec<u32> = &divergences[divergences.len()-1];
let mut ct: i32 = 0;
let mut ct_extra: u32 = 0;
let mut zero_tot: u32 = 0;
let mut one_tot: u32 = 0;
let mut occ_positions: Vec<Vec<u32>> = vec![Vec::new(),Vec::new()];
let mut new_add: Vec<u8> = Vec::with_capacity(m);
let mut a: Vec<u32> = Vec::with_capacity(m);
let mut b: Vec<u32> = Vec::with_capacity(m);
let mut d: Vec<u32> = Vec::with_capacity(m);
let mut e: Vec<u32> = Vec::with_capacity(m);
let mut bin_values: Vec<u8> = Vec::with_capacity(m);
let mut elapse1 = Vec::new();
let mut elapse2 = Vec::new();
let mut elapse3 = Vec::new();
for col in 0..n {
if !col_set.contains(&(col as u32)) {
ct = 0;
ct_extra = 0;
zero_tot = 0;
one_tot = 0;
occ_positions = vec![Vec::new(),Vec::new()];
new_add = Vec::with_capacity(m);
let mut val: u8;
for idx in cur_prefix_ref.iter() {
let now = std::time::Instant::now();
let elapsed_first = now.elapsed();
unsafe {
val = *data.get_unchecked(*idx as usize).get_unchecked(j);
let elapsed_second = now.elapsed();
if val == 0 {
zero_tot += 1;
} else if val == 1 {
one_tot += 1;
if (ct == 0) || (ct_extra == fm_gap) {
ct_extra = 0;
ct += 1;
ct_extra += 1;
let elapsed_third = now.elapsed();
inserted_positions.push(col as u32);
} else {
a = Vec::with_capacity(m);
b = Vec::with_capacity(m);
d = Vec::with_capacity(m);
e = Vec::with_capacity(m);
bin_values = Vec::with_capacity(m);
let mut p: u32 = j_pbwt+1;
let mut q: u32 = j_pbwt+1;
occ_positions = vec![Vec::new(),Vec::new()];
ct = 0;
ct_extra = 0;
zero_tot = 0;
one_tot = 0;
let mut cur_allele: u8;
for (idx,start_point) in
cur_prefix_ref.iter().zip(cur_divergence_ref.iter()) {
let idx_val = *idx;
unsafe {
cur_allele = *data.get_unchecked(idx_val as usize).get_unchecked(j);
let st = *start_point;
if st > p {
p = st;
if st > q {
q = st;
if cur_allele == 0 {
p = 0;
zero_tot += 1;
if cur_allele == 1 {
q = 0;
one_tot += 1;
if (ct == 0) || (ct_extra == fm_gap) {
ct_extra = 0;
ct += 1;
ct_extra += 1;
let mut new_prefix = a;
new_prefix.append(&mut b);
let mut new_divergence = d;
new_divergence.append(&mut e);
cur_prefix_ref = &(prefixes[prefixes.len()-1]);
cur_divergence_ref = &divergences[divergences.len()-1];
pbwt_positions.push(col as u32);
j += 1;
let elapsed = now.elapsed();
println!("Calc Time: {:.4?}", elapsed);
println!("First: {:?}", &elapse1[500..530]);
println!("Second: {:?}", &elapse2[500..530]);
println!("Third: {:?}", &elapse3[500..530]);
fn main() {
let m = 4000;
let n = 50000;
let step: Uniform<u8> = Uniform::new(0,2);
let mut rng = rand::thread_rng();
let mut data = Vec::new();
for _ in 0..m {
let choices: Vec<u8> = step.sample_iter(&mut rng).take(n).collect();
let fm = 2;

When I ran your code (on an i7-7700HQ), I got these numbers
First: [16ns, 17ns, 16ns, 16ns, 16ns, 17ns, 15ns, 15ns, 16ns, 17ns, 16ns, 16ns, 16ns, 16ns, 16ns, 16ns, 16ns, 15ns, 16ns, 15ns, 16ns, 16ns, 17ns, 16ns, 17ns, 16ns, 15ns, 16ns, 16ns, 16ns]
Second: [107ns, 104ns, 171ns, 109ns, 101ns, 112ns, 116ns, 169ns, 184ns, 177ns, 103ns, 108ns, 105ns, 79ns, 110ns, 112ns, 109ns, 165ns, 157ns, 104ns, 104ns, 409ns, 104ns, 107ns, 111ns, 104ns, 104ns, 104ns, 106ns, 117ns]
Third: [132ns, 126ns, 202ns, 132ns, 133ns, 140ns, 147ns, 197ns, 216ns, 207ns, 136ns, 138ns, 405ns, 105ns, 149ns, 139ns, 142ns, 198ns, 182ns, 126ns, 135ns, 434ns, 128ns, 136ns, 136ns, 127ns, 128ns, 129ns, 136ns, 147ns]
These have vastly different proportions from your results. Since you said there is a C program that runs faster, it should not be a problem with your system.
The next thing I can think of is that you need to run cargo clean and recompile the whole thing. Sometimes (I am on the nightly compiler) I have had an issue that made recompiled binaries slow, maybe because of some code-layout issue or other compiler behaviour; I don't know exactly. A clean build usually fixed it.
Next, you can try using link time optimization. Add this to your Cargo.toml:
[profile.lto]
inherits = "release"
lto = true
Then run the profile with
cargo run --profile lto
Third, use a single array, like some comments said. The ndarray crate is perfect for this. For me it brings down the times to
First: [18ns, 16ns, 17ns, 16ns, 17ns, 16ns, 18ns, 17ns, 17ns, 17ns, 17ns, 17ns, 17ns, 25ns, 16ns, 17ns, 18ns, 18ns, 17ns, 17ns, 18ns, 17ns, 17ns, 16ns, 17ns, 16ns, 16ns, 17ns, 17ns, 18ns]
Second: [51ns, 49ns, 48ns, 50ns, 51ns, 51ns, 49ns, 48ns, 48ns, 49ns, 50ns, 48ns, 53ns, 66ns, 49ns, 53ns, 52ns, 50ns, 50ns, 49ns, 53ns, 51ns, 47ns, 50ns, 52ns, 50ns, 48ns, 48ns, 48ns, 50ns]
Third: [77ns, 77ns, 75ns, 74ns, 83ns, 81ns, 75ns, 72ns, 81ns, 74ns, 82ns, 79ns, 552ns, 99ns, 81ns, 76ns, 79ns, 74ns, 77ns, 73ns, 86ns, 76ns, 75ns, 80ns, 85ns, 75ns, 74ns, 73ns, 74ns, 76ns]
use ndarray::Array2;
use std::collections::HashSet;
pub fn spaced_pbwt(data: &Array2<u8>, fm_gap: u32) -> () {
let now = std::time::Instant::now();
let (m, n) = data.dim();
let half_n = n/2;
unsafe {
val = *data.uget((idx as usize, j));
fn main() {
let m = 4000;
let n = 50000;
let step: Uniform<u8> = Uniform::new(0,2);
let mut rng = rand::thread_rng();
let mut data = Vec::new();
let mut data2 = Vec::with_capacity(m*n);
for _ in 0..m {
let choices: Vec<u8> = step.sample_iter(&mut rng).take(n).collect();
let fm = 2;
spaced_pbwt(&Array2::from_shape_vec((m, n), data2).unwrap(),fm);


Reading EXR file

I'm trying to create an IWICBitmap from an EXR file (error checks removed).
#pragma pack(push,1)
struct fl
float r, g, b, a;
#pragma pack(pop)
HRESULT Open(const char* f,IWICBitmap** d)
exr_context_initializer_t ctxtinit = EXR_DEFAULT_CONTEXT_INITIALIZER;
exr_context_t myfile = {};
exr_result_t rv = exr_start_read(&myfile, f, &ctxtinit);
int part_index = 0;
const exr_attr_chlist_t* chl = 0;
exr_get_channels(myfile, part_index, &chl);
int32_t ck = 0;
rv = exr_get_chunk_count(myfile, part_index, &ck);
int32_t sl = 0;
rv = exr_get_scanlines_per_chunk(myfile, part_index, &sl);
int y = 0;
int wi = 0;
int he = 0;
std::vector<fl> data; // put here the floats
exr_decode_pipeline_t dec = {};
for (int32_t cuk = 0; cuk < ck; cuk++)
exr_chunk_info_t ch = {};
exr_read_scanline_chunk_info(myfile, part_index, y, &ch);
wi = ch.width;
he += ch.height;
y += sl;
bool first = 0;
if (dec.decompress_fn == 0)
rv = exr_decoding_initialize(myfile, part_index, &ch, &dec);
rv = exr_decoding_choose_default_routines(myfile, part_index, &dec);
first = 1;
if (!first)
rv = exr_decoding_update(myfile, part_index,&ch,&dec);
rv = exr_decoding_run(myfile, part_index, &dec);
int NumPixels = (wi * ch.height);
auto BytesPerPixel = ch.unpacked_size / NumPixels;
if (true)
// RGB(A)
if (chl->entries[0].pixel_type == EXR_PIXEL_HALF)
if (BytesPerPixel == chl->num_channels * 2)
auto ds = data.size();
data.resize(ds + NumPixels);
auto p = + ds;
char* x = (char*)dec.unpacked_buffer;
for (int j = 0; j < NumPixels; j++)
uint16_t* u = (uint16_t*)x;
p->a = 1.0f;
for (int jH = 0; jH < chl->num_channels; jH++)
half ha(Imath_3_2::half::FromBits,*u);
if (strcmp(chl->entries[jH].name.str, "R") == 0) p->r = ha.operator float();
if (strcmp(chl->entries[jH].name.str, "G") == 0) p->g = ha.operator float();
if (strcmp(chl->entries[jH].name.str, "B") == 0) p->b = ha.operator float();
if (strcmp(chl->entries[jH].name.str, "A") == 0) p->a = ha.operator float();
x += BytesPerPixel;
if (chl->entries[0].pixel_type == EXR_PIXEL_FLOAT)
// code removed for simplicity, I guess the same issue happens here unless it's a problem of the half-float
rv = exr_decoding_destroy(myfile, &dec);
CComPtr<IWICImagingFactory2> wbfact = 0;
CoCreateInstance(CLSID_WICImagingFactory2, 0, CLSCTX_INPROC_SERVER,
__uuidof(IWICImagingFactory2), (void**)&wbfact);
return wbfact->CreateBitmapFromMemory(wi, he, GUID_WICPixelFormat128bppPRGBAFloat, wi * 16,(UINT)data.size()*16, (BYTE*), d);
What am I doing wrong? The pixel number I'm reading is correct (in this image 800x800).
My result:
Is there a problem with the half-float? I'm just using the OpenEXR's IMath implementation.

trying to implement selection sort in rust without using vectors

I'm currently learning Rust and implementing selection sort in it. I'm trying to implement it using arrays, but I'm not able to resolve the following error.
fn selectionsort(arr: &[i32]) {
let len = arr.len();
let newarr: [i32; 5];
let mut moved = 0;
let mut max = 0;
while moved < len {
for x in 0..5 {
if max < arr[x] {
if !found(arr[x], &newarr) {
max = arr[x];
newarr[moved] = max;
moved += 1;
println!("{:?}", newarr);
//return &newarr;
/// Returns `true` if `x` occurs anywhere in the slice `b`, `false` otherwise.
///
/// An empty slice never contains `x`, so the result is `false` in that case.
fn found(x: i32, b: &[i32]) -> bool {
    // `contains` performs the same linear scan as the hand-written index loop,
    // without per-element bounds checks.
    b.contains(&x)
}
the error:
error[E0381]: borrow of possibly-uninitialized variable: `newarr`
--> src/
17 | if !found(arr[x], &newarr) {
| ^^^^^^^ use of possibly-uninitialized `newarr`
error: aborting due to previous error
Is it possible to implement the selection sort purely using arrays (like in C)?
Edit: I'm coming from a C background and trying to implement the selection sort using purely primitives, before moving to generics. The suggested 'similar question' had solutions which were using vectors!
Thanks #Prime_Aqasix for the solution for the compiler error. As promised, here's the final debugged code. It's still primitive and does not deal with repeating numbers, i.e. [1, 2, 3, 3, 4], and no zeros.
fn main() {
println!("Hello, world!");
let a: [i32; 5] = [5, 4, 1, 3, 2];
//let a: [i32; 5] = [1, 2, 3, 4, 5];
fn selectionsort(arr: &[i32]) {
let len = arr.len();
let mut newarr = [0; 5];
let mut moved = 0;
let mut max;
while moved < len {
max = find_max(&arr, &newarr, moved);
newarr[moved] = max;
moved += 1;
println!("{:?}", newarr);
//return &newarr;
/// Finds the next value to place when selecting elements of `a` in descending order.
///
/// * `a` - the input values being sorted.
/// * `b` - the output buffer filled so far; `b[i - 1]` holds the value placed
///   most recently (for `i == 0` the function reads `b[0]` instead).
/// * `i` - number of values already placed.
///
/// Returns the largest element of `a` that is strictly smaller than the
/// previously placed value. If that previous value is `0`, no upper limit is
/// applied and the overall maximum of `a` is returned. The result is never
/// below `0`, because `0` is the starting candidate.
///
/// # Panics
///
/// Panics if `b` is empty, or if `i - 1` is out of bounds for `b`.
fn find_max(a: &[i32], b: &[i32], mut i: usize) -> i32 {
    // Step back to the slot of the most recently placed value; on the very
    // first call there is none, so slot 0 is read as-is.
    if i != 0 {
        i -= 1;
    }
    let temp_m = b[i];

    a.iter()
        .copied()
        // Anything >= the previously placed value has already been moved.
        .filter(|&value| temp_m == 0 || temp_m > value)
        // Same as starting from 0 and keeping any strictly larger candidate.
        .fold(0, i32::max)
}

Keeping track of total coins collected Swift 3

I am trying to save the coins collected and add that amount to the total amount of coins collected by the user(in SpriteKit). With the current code, the coins currently do not save and nothing is added to the total. I am not sure why the coins are not saving as I do not see any notable mistakes in the code. Any help or explanations as to why this is not working the way it should would be greatly appreciated.
var totalCoins = 0
var coin = 0
let totalCoinDefault = UserDefaults.standard()
totalCoins = totalCoinDefault.integer(forKey: "Totalcoin")
totalCoinLabel.text = "\(totalCoins)"
if ( coin > 0) {
totalCoins += self.coin
totalCoinLabel.text = String(format: "Totalcoin : %i", totalCoins)
let totalcoinDefault = UserDefaults.standard()
totalcoinDefault.setValue(totalCoins, forKey: "Totalcoin")
func updateCoinTotal(){
coinLabel.text = String(self.coin)
totalCoinLabel.text = String(self.totalCoins)
let totalCoinDefault = UserDefaults.standard()
totalCoins = totalCoinDefault.integer(forKey: "")
totalCoinLabel.text = "\(totalCoins)"
if (self.coin > 0) {
totalCoins += self.coin
totalCoinLabel.text = NSString(format: "%i", totalCoins) as String
let totalcoinDefault = UserDefaults.standard()
totalcoinDefault.setValue(totalCoins, forKey: "")
This is the updated code that you had which should work for your coins:
totalCoins = NSUserDefaults.standardUserDefaults().integerForKey("Total Coins")
totalCoinLabel.text = "\(totalCoins)"
if ( coin > 0) {
totalCoins += coin
totalCoinLabel.text = String(format: "Total Coins: \(totalCoins)")
NSUserDefaults.standardUserDefaults().setInteger(totalCoins, forKey: "Total Coins")
func updateCoinTotal() {
coinLabel.text = String(coin)
totalCoinLabel.text = String(totalCoins)
totalCoins = NSUserDefaults.standardUserDefaults().integerForKey("Total Coins")
totalCoinLabel.text = "\(totalCoins)"
if (coin > 0) {
totalCoins += coin
totalCoinLabel.text = NSString(format: "%i", totalCoins) as String
NSUserDefaults.standardUserDefaults().setInteger(totalCoins, forKey: "Total Coins")
But the `coin` Int will always equal zero, so totalCoins will never be updated.
This is the code that I would use for collecting coins:
func colledCoin() {
totalCoins += 1
coin += 1
totalCoinLabel.text = String(totalCoins)
coinLabel.text = String(coin)
NSUserDefaults.standardUserDefaults().setInteger(totalCoins, forKey: "Total Coins")
func updateCoinLabels() {
totalCoins = NSUserDefaults.standardUserDefaults().integerForKey("Total Coins")
totalCoinLabel.text = String(totalCoins)

codility GenomicRangeQuery algorithm comparison speed Java vs Swift

I rewrote code that solves the GenomicRangeQuery task from Java to Swift. The code in Java gets a 100/100 score, but the code in Swift fails all performance tests. I'm trying to understand why, because the logic in the code is the same. I'm wondering why the Swift code takes so long to execute. Am I using some very slow parts in my Swift code that I'm not aware of? Please take a look at this Java code copied from here.
class Solution {
public int[] solveGenomicRange(String S, int[] P, int[] Q) {
//used jagged array to hold the prefix sums of each A, C and G genoms
//we don't need to get prefix sums of T, you will see why.
int[][] genoms = new int[3][S.length()+1];
//if the char is found in the index i, then we set it to be 1 else they are 0
// 3 short values are needed for this reason
short a, c, g;
for (int i=0; i<S.length(); i++) {
a = 0; c = 0; g = 0;
if ('A' == (S.charAt(i))) {
if ('C' == (S.charAt(i))) {
if ('G' == (S.charAt(i))) {
//here we calculate prefix sums. To learn what's prefix sums look at here
genoms[0][i+1] = genoms[0][i] + a;
genoms[1][i+1] = genoms[1][i] + c;
genoms[2][i+1] = genoms[2][i] + g;
int[] result = new int[P.length];
//here we go through the provided P[] and Q[] arrays as intervals
for (int i=0; i<P.length; i++) {
int fromIndex = P[i];
//we need to add 1 to Q[i],
//because our genoms[0][0], genoms[1][0] and genoms[2][0]
//have 0 values by default, look above genoms[0][i+1] = genoms[0][i] + a;
int toIndex = Q[i]+1;
if (genoms[0][toIndex] - genoms[0][fromIndex] > 0) {
result[i] = 1;
} else if (genoms[1][toIndex] - genoms[1][fromIndex] > 0) {
result[i] = 2;
} else if (genoms[2][toIndex] - genoms[2][fromIndex] > 0) {
result[i] = 3;
} else {
result[i] = 4;
return result;
And here the same code rewritten to Swift 2.1
public func solution(inout S:String, inout _ P:[Int], inout _ Q:[Int]) -> [Int] {
let len = S.characters.count
//used jagged array to hold the prefix sums of each A, C and G genoms
//we don't need to get prefix sums of T, you will see why.
var genoms = [[Int]](count: 3, repeatedValue: [Int](count: len+1, repeatedValue: 0))
//if the char is found in the index i, then we set it to be 1 else they are 0
// 3 short values are needed for this reason
var a,c,g:Int
for i in 0..<len {
a=0; c=0; g=0
let char = S[S.startIndex.advancedBy(i)]
switch char {
case "A": a=1;
case "C": c=1;
case "G": g=1;
default: ()
//here we calculate prefix sums. To learn what's prefix sums look at here
genoms[0][i+1] = genoms[0][i] + a
genoms[1][i+1] = genoms[1][i] + c
genoms[2][i+1] = genoms[2][i] + g
var result: [Int] = [Int](count: P.count, repeatedValue: 0)
//here we go through the provided P[] and Q[] arrays as intervals
for i in 0..<P.count {
let fromIndex = P[i]
//we need to add 1 to Q[i],
//because our genoms[0][0], genoms[1][0] and genoms[2][0]
//have 0 values by default, look above genoms[0][i+1] = genoms[0][i] + a;
let toIndex = Q[i] + 1
if (genoms[0][toIndex] - genoms[0][fromIndex] > 0) {
result[i] = 1;
} else if (genoms[1][toIndex] - genoms[1][fromIndex] > 0) {
result[i] = 2;
} else if (genoms[2][toIndex] - genoms[2][fromIndex] > 0) {
result[i] = 3;
} else {
result[i] = 4;
return result
Does anybody know why this Swift code fails all performance tests when Java code passes all tests? I suppose I'm touching some sensitive bottleneck in Swift but I'm not aware where.
If someone is not aware of codility this is the link to the task.
This Java code for the GenomicRangeQuery problem scored 100% at codility.
It uses 4 simple Arrays to do the prefix sums.
I post it here as an alternative approach.
Time Complexity is O(n+m)
public int[] solution4(String S, int[] P, int[] Q){
int n=chars.length;
int[]contaA=new int[n+1];
int[]contaC=new int[n+1];
int[]contaG=new int[n+1];
int[]contaT=new int[n+1];
for (int i=1;i<n+1;i++){
if (chars[i-1]=='A')contaA[i]+=1;
if (chars[i-1]=='C')contaC[i]+=1;
if (chars[i-1]=='G')contaG[i]+=1;
if (chars[i-1]=='T')contaT[i]+=1;
int[] arrayContadores=new int[P.length];
for (int i=0;i<P.length;i++){
int primeiro=P[i];
int ultimo=Q[i];
int A=contaFatia(contaA,primeiro,ultimo);
int C=contaFatia(contaC,primeiro,ultimo);
int G=contaFatia(contaG,primeiro,ultimo);
int T=contaFatia(contaT,primeiro,ultimo);
if (A>0){arrayContadores[i]=1;
}else if (C>0) {
arrayContadores[i] = 2;
}else if(G>0){
}else if (T>0){
return arrayContadores;
public int contaFatia(int[]P,int x,int y){
return P[y+1]-P[x];
public func solution(_ S : inout String, _ P : inout [Int], _ Q : inout [Int]) -> [Int] {
var retArr = [Int]()
var chrArr = [Character]()
for chr in S {
for i in 0..<P.count {
var minFactor = 4
if P[i] - Q[i] == 0 {
if chrArr[P[i]] == "A"{
minFactor = 1
}else if chrArr[P[i]] == "C"{
minFactor = 2
}else if chrArr[P[i]] == "G"{
minFactor = 3
}else {
for j in P[i]...Q[i] {
if chrArr[j] == "A"{
minFactor = 1
}else if chrArr[j] == "C"{
minFactor = 2
}else if chrArr[j] == "G"{
if minFactor > 2 {
minFactor = 3
return retArr
I have been playing with things in Swift for a while trying to come up with the right solution. This is the closest I have come.
public func solution(_ S : inout String, _ P : inout [Int], _ Q : inout [Int]) -> [Int] {
let N = S.count + 1
var outerImpacts: ContiguousArray<ContiguousArray<Int>> = []
for i in 0..<N {
if i > 0 {
var innerImpacts = outerImpacts[i - 1]
switch S[S.index(S.startIndex, offsetBy: i - 1)] {
case "A":
innerImpacts[0] += 1
case "C":
innerImpacts[1] += 1
case "G":
innerImpacts[2] += 1
case "T":
innerImpacts[3] += 1
} else {
outerImpacts.append(ContiguousArray<Int>(repeating: 0, count: 4))
let M: Int = P.count
var minimalImpacts: [Int] = []
for i in 0..<M {
for j in 0..<4 where (outerImpacts[Q[i] + 1][j] - outerImpacts[P[i]][j]) > 0 {
minimalImpacts.append(j + 1)
return minimalImpacts

Making parallel prime sieve with shared memory faster

I have a prime sieve whose sequential version runs great. I finally figured out how to make the inner loop run in parallel, but (as I feared based on prior experience with other languages) the single threaded version is faster.
Can this parallel version in Rust be optimized?
extern crate crossbeam;
fn main() {
let residues = [1, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67,
71, 73, 79, 83, 89, 97,101,103,107,109,113,121,127,131,137,139,
let val = 1_000_000;
let md = 210;
let rescnt = 48;
println!("val = {}, mod = {}, rescnt = {}", val, md, rescnt);
let mut posn = [0; 210];
for i in 1..rescnt {posn[residues[i]] = i - 1;}
posn[1] = rescnt - 1;
let mut modk; let mut r; let mut k;
let num = val - 1 | 1;
k = num / md; modk = md * k; r = 1;
while num >= modk + residues[r] {r += 1;}
let maxpcs = k * rescnt + r - 1;
let prms: Vec<u8> = vec![0; maxpcs];
println!("num = {}, k = {}, modk = {}, maxpcs = {}", num, k, modk, maxpcs);
let sqrt_n = (num as f32).sqrt() as usize;
modk = 0; r = 0; k = 0;
// sieve to identify/eliminate nonprimes/locations in prms array
for i in 0..maxpcs {
r += 1; if r > rescnt {r = 1; modk += md; k += 1;};
if prms[i] == 1 {continue;}
let prm_r = residues[r];
let prime = modk + prm_r;
if prime > sqrt_n {break;}
let prmstep = prime * rescnt;
for ri in &residues[1..rescnt + 1] {
let prms = &mut prms;
crossbeam::scope(|scope| {
scope.spawn(move || {
let prod = prm_r * ri;
let mut np = (k * (prime + ri) + (prod - 2) / md) * rescnt + posn[prod % md];
while np < maxpcs {prms[np] = 1; np += prmstep;}
// the prms array now has all the positions for primes r1..N
// extract prime numbers and count from prms into prims array
let mut prmcnt = 4;
modk = 0; r = 0;
for i in 0..maxpcs {
r += 1; if r > rescnt {r = 1; modk += md;};
if prms[i] == 0 {prmcnt += 1;}
println!("{}", prmcnt);
Using Rust 1.6 on Linux.
