I'm running the following snippet of code (the std::time objects are just there for benchmarking) that gets u8 elements from a vector of vector of u8 in a given order and creates a new vector with these objects in this order.
// Excerpt of the hot inner loop: for every index in `cur_prefix_ref`, read one byte
// from `data` (row `*idx`, column `j`), append it to `new_add`, and keep running
// counts of zeros and ones. `val`, `data`, `j`, the counters and the `elapse*`
// vectors are declared outside this excerpt.
for idx in cur_prefix_ref.iter() {
// Benchmark scaffolding: the timer is restarted on every iteration, so all three
// elapsed_* readings are measured from the top of the current iteration.
let now = std::time::Instant::now();
// Taken immediately after `now`, so this reading only captures the cost of the timer itself.
let elapsed_first = now.elapsed();
unsafe {
// Unchecked double index: row `*idx` of `data`, then element `j` of that row.
// No bounds check is performed; both indices must already be in range.
val = *data.get_unchecked(*idx as usize).get_unchecked(j);
}
// Time from the top of the iteration until the lookup has completed.
let elapsed_second = now.elapsed();
new_add.push(val);
// Values other than 0 and 1 are stored in `new_add` but counted in neither total.
if val == 0 {
zero_tot += 1;
} else if val == 1 {
one_tot += 1;
}
// Snapshot both running totals on the first iteration and then each time
// `ct_extra` reaches `fm_gap`.
if (ct == 0) || (ct_extra == fm_gap) {
occ_positions[0].push(zero_tot);
occ_positions[1].push(one_tot);
ct_extra = 0;
}
ct += 1;
ct_extra += 1;
// Time from the top of the iteration until all bookkeeping is done.
let elapsed_third = now.elapsed();
// One Duration per reading is stored for every iteration of the loop.
elapse1.push(elapsed_first);
elapse2.push(elapsed_second);
elapse3.push(elapsed_third);
}
In my full code this inner loop ends up running hundreds of millions of times, so I'm trying to optimise it as much as possible. According to my benchmarking, I seem to be spending the vast majority of the loop time looking up values from my Vec<Vec<u8>>, on the line val = *data.get_unchecked(*idx as usize).get_unchecked(j);. The lists below show some elapsed_first, elapsed_second and elapsed_third times from different iterations of this loop (the i-th element of each list comes from the same iteration):
First: [27ns, 23ns, 21ns, 24ns, 27ns, 23ns, 28ns, 23ns, 26ns, 23ns, 21ns, 22ns, 27ns, 27ns, 28ns, 23ns, 25ns, 24ns, 26ns, 25ns, 22ns, 24ns, 24ns, 28ns, 28ns, 28ns, 26ns, 22ns, 22ns, 21ns]
Second: [538ns, 695ns, 550ns, 486ns, 627ns, 615ns, 562ns, 570ns, 661ns, 521ns, 617ns, 358ns, 444ns, 560ns, 540ns, 471ns, 656ns, 336ns, 233ns, 209ns, 433ns, 373ns, 1.427µs, 542ns, 708ns, 288ns, 304ns, 608ns, 297ns, 252ns]
Third: [612ns, 736ns, 587ns, 525ns, 665ns, 658ns, 608ns, 614ns, 701ns, 560ns, 656ns, 395ns, 482ns, 606ns, 578ns, 510ns, 696ns, 374ns, 270ns, 246ns, 470ns, 416ns, 1.47µs, 583ns, 751ns, 327ns, 348ns, 645ns, 334ns, 289ns]
I've been trying to understand why this simple vector lookup is the bit that takes by far the most time compared to everything else and still haven't figured it out. Any help is much appreciated!
EDIT: Here is the full function which this code comes from:
/// Sweeps every column listed in `vcf.positions` and builds a `SpacedPbwt`.
///
/// Columns whose position appears in `pbwt_cols` take the "PBWT" branch, which
/// re-sorts the current prefix ordering by the allele in that column. All other
/// columns take the "inserted" branch, which only reads the column in the current
/// prefix order. Both branches record the column's bytes, the number of zeros,
/// and occurrence snapshots.
/// (Presumably PBWT = positional Burrows-Wheeler transform, judging by the names; confirm.)
///
/// `fm_gap` is the spacing between occurrence snapshots: a snapshot is taken at
/// the first row of a column and then whenever `ct_extra` reaches `fm_gap`.
///
/// Also prints the total run time and timings 500..530 of the benchmarked inner loop.
pub fn spaced_pbwt(vcf: &VCFData, pbwt_cols: &Vec<SiteRow>, fm_gap: u32) -> SpacedPbwt {
let now = std::time::Instant::now();
let data_positions: Vec<u32> = vcf.positions.clone();
// NOTE(review): this `pbwt_positions` is shadowed by the declaration further down,
// and `insert_positions` is never used in this function.
let mut pbwt_positions: Vec<u32> = Vec::new();
let mut insert_positions: Vec<u32> = Vec::new();
let data: &Vec<Vec<u8>> = &vcf.vcf_data;
// Set of positions that take the PBWT branch; `n` counts the entries of
// `pbwt_cols` (duplicates included), not the size of the set.
let mut col_set: HashSet<u32> = HashSet::new();
let mut n: usize = 0;
for col in pbwt_cols {
let pos = col.position;
col_set.insert(pos);
n += 1;
}
// `m` = number of rows of `data`, `n_full` = length of its first row.
// Indexing `data[0]` panics if `data` is empty.
let m = data.len();
let n_full = data[0].len();
let n_other = n_full-n;
let mut is_pbwt_col :Vec<u8> = Vec::with_capacity(n_full+1);
let mut pbwt_positions: Vec<u32> = Vec::new();
let mut inserted_positions: Vec<u32> = Vec::new();
// Every prefix/divergence array produced by a PBWT column is kept in these
// vectors; neither is part of the returned struct.
let mut prefixes : Vec<Vec<u32>> = Vec::with_capacity(n+1);
let mut divergences : Vec<Vec<u32>> = Vec::with_capacity(n+1);
let mut binaries: Vec<Vec<u8>> = Vec::with_capacity(n_full+1);
// Initial ordering is the identity 0..m with all divergence values 0.
let cur_prefix : Vec<u32> = Vec::from_iter(0..m as u32);
let cur_divergence : Vec<u32> = vec![0; m];
// `j` is the index of the column currently being processed.
let mut j: usize = 0;
// NOTE(review): `j_pbwt` is never modified in this function, so `p` and `q`
// below always start at 1; confirm whether it was meant to advance per PBWT column.
let mut j_pbwt = 0;
let mut count_vec: Vec<u32> = Vec::new();
let mut occ_vec : Vec<Vec<Vec<u32>>> = Vec::new();
prefixes.push(cur_prefix);
divergences.push(cur_divergence);
// References to the most recently pushed prefix / divergence arrays.
let mut cur_prefix_ref: &Vec<u32> = &(prefixes[prefixes.len()-1]);
let mut cur_divergence_ref: &Vec<u32> = &divergences[divergences.len()-1];
// The per-column working state below is re-initialised at the top of each
// branch, so these initial values and allocations are overwritten before use.
let mut ct: i32 = 0;
let mut ct_extra: u32 = 0;
let mut zero_tot: u32 = 0;
let mut one_tot: u32 = 0;
let mut occ_positions: Vec<Vec<u32>> = vec![Vec::new(),Vec::new()];
let mut new_add: Vec<u8> = Vec::with_capacity(m);
let mut a: Vec<u32> = Vec::with_capacity(m);
let mut b: Vec<u32> = Vec::with_capacity(m);
let mut d: Vec<u32> = Vec::with_capacity(m);
let mut e: Vec<u32> = Vec::with_capacity(m);
let mut bin_values: Vec<u8> = Vec::with_capacity(m);
// Benchmark scaffolding: one Duration per reading per inner-loop iteration.
let mut elapse1 = Vec::new();
let mut elapse2 = Vec::new();
let mut elapse3 = Vec::new();
for col in &vcf.positions {
if !col_set.contains(&col) {
// "Inserted" column: read it in the current prefix order without
// changing the ordering.
ct = 0;
ct_extra = 0;
zero_tot = 0;
one_tot = 0;
occ_positions = vec![Vec::new(),Vec::new()];
new_add = Vec::with_capacity(m);
let mut val: u8;
for idx in cur_prefix_ref.iter() {
// The timer restarts every iteration; `elapsed_first` only captures timer overhead.
let now = std::time::Instant::now();
let elapsed_first = now.elapsed();
unsafe {
// Unchecked read of row `*idx`, column `j`; no bounds check is performed.
val = *data.get_unchecked(*idx as usize).get_unchecked(j);
}
let elapsed_second = now.elapsed();
new_add.push(val);
// Values other than 0/1 are stored but counted in neither total.
if val == 0 {
zero_tot += 1;
} else if val == 1 {
one_tot += 1;
}
// Snapshot the running totals at the first row and every time
// `ct_extra` reaches `fm_gap`.
if (ct == 0) || (ct_extra == fm_gap) {
occ_positions[0].push(zero_tot);
occ_positions[1].push(one_tot);
ct_extra = 0;
}
ct += 1;
ct_extra += 1;
let elapsed_third = now.elapsed();
elapse1.push(elapsed_first);
elapse2.push(elapsed_second);
elapse3.push(elapsed_third);
}
binaries.push(new_add);
is_pbwt_col.push(0);
inserted_positions.push(*col);
count_vec.push(zero_tot);
occ_vec.push(occ_positions);
} else {
// PBWT column: rows with allele 0 go to `a`/`d`, rows with allele 1
// to `b`/`e`; the new ordering is `a` followed by `b`.
a = Vec::with_capacity(m);
b = Vec::with_capacity(m);
d = Vec::with_capacity(m);
e = Vec::with_capacity(m);
bin_values = Vec::with_capacity(m);
let mut p: u32 = j_pbwt+1;
let mut q: u32 = j_pbwt+1;
occ_positions = vec![Vec::new(),Vec::new()];
ct = 0;
ct_extra = 0;
zero_tot = 0;
one_tot = 0;
let mut cur_allele: u8;
for (idx,start_point) in
cur_prefix_ref.iter().zip(cur_divergence_ref.iter()) {
let idx_val = *idx;
unsafe {
// Unchecked read of row `idx_val`, column `j`; no bounds check is performed.
cur_allele = *data.get_unchecked(idx_val as usize).get_unchecked(j);
}
bin_values.push(cur_allele);
// `p` / `q` track the largest divergence value seen since the last
// 0-allele / 1-allele row respectively.
let st = *start_point;
if st > p {
p = st;
}
if st > q {
q = st;
}
// A row whose allele is neither 0 nor 1 is dropped from the new ordering.
if cur_allele == 0 {
a.push(idx_val);
d.push(p);
p = 0;
zero_tot += 1;
}
if cur_allele == 1 {
b.push(idx_val);
e.push(q);
q = 0;
one_tot += 1;
}
if (ct == 0) || (ct_extra == fm_gap) {
occ_positions[0].push(zero_tot);
occ_positions[1].push(one_tot);
ct_extra = 0;
}
ct += 1;
ct_extra += 1;
}
// Concatenate the 0-allele block and the 1-allele block.
let mut new_prefix = a;
new_prefix.append(&mut b);
let mut new_divergence = d;
new_divergence.append(&mut e);
prefixes.push(new_prefix);
divergences.push(new_divergence);
binaries.push(bin_values);
cur_prefix_ref = &(prefixes[prefixes.len()-1]);
cur_divergence_ref = &divergences[divergences.len()-1];
count_vec.push(zero_tot);
occ_vec.push(occ_positions);
is_pbwt_col.push(1);
pbwt_positions.push(*col);
}
j += 1;
}
let elapsed = now.elapsed();
println!("Calc Time: {:.4?}", elapsed);
// These slices panic if fewer than 530 inner-loop iterations were timed.
println!("First: {:?}", &elapse1[500..530]);
println!("Second: {:?}", &elapse2[500..530]);
println!("Third: {:?}", &elapse3[500..530]);
return SpacedPbwt {
num_samples: m as u32,
num_pbwt_sites: n as u32,
num_inserted_sites: n_other as u32,
num_total_sites: n_full as u32,
pbwt_positions: pbwt_positions,
inserted_positions: inserted_positions,
all_positions: data_positions,
pbwt_col_flags: is_pbwt_col,
bin_pbwt: binaries,
count: count_vec,
occ_list: occ_vec,
fm_gap: fm_gap,
};
}
EDIT EDIT:
Here is a modified version of the file that everybody should be able to run on their machine and does exhibit the behaviour I'm concerned about. It only uses the rand crate as a dependency:
use rand::{seq::IteratorRandom, thread_rng}; // 0.6.1
use rand::distributions::{Distribution, Uniform};
use std::collections::HashSet;
pub fn spaced_pbwt(data: &Vec<Vec<u8>>, fm_gap: u32) -> () {
let now = std::time::Instant::now();
let m = data.len();
let n = data[0].len();
let half_n = n/2;
let mut rng = thread_rng();
let sample: Vec<u32> = (0u32..n as u32).collect();
let perm = sample.iter().choose_multiple(&mut rng, half_n);
let mut cols_to_permute: Vec<u32> = Vec::new();
for i in perm {
cols_to_permute.push(*i);
}
let mut col_set: HashSet<u32> = HashSet::new();
let mut n: usize = 0;
for col in &cols_to_permute {
col_set.insert(*col);
n += 1;
}
let m = data.len();
let n_full = data[0].len();
let n_other = n_full-n;
let mut is_pbwt_col :Vec<u8> = Vec::with_capacity(n_full+1);
let mut pbwt_positions: Vec<u32> = Vec::new();
let mut inserted_positions: Vec<u32> = Vec::new();
let mut prefixes : Vec<Vec<u32>> = Vec::with_capacity(n+1);
let mut divergences : Vec<Vec<u32>> = Vec::with_capacity(n+1);
let mut binaries: Vec<Vec<u8>> = Vec::with_capacity(n_full+1);
let cur_prefix : Vec<u32> = Vec::from_iter(0..m as u32);
let cur_divergence : Vec<u32> = vec![0; m];
let mut j: usize = 0;
let mut j_pbwt = 0;
let mut count_vec: Vec<u32> = Vec::new();
let mut occ_vec : Vec<Vec<Vec<u32>>> = Vec::new();
prefixes.push(cur_prefix);
divergences.push(cur_divergence);
let mut cur_prefix_ref: &Vec<u32> = &(prefixes[prefixes.len()-1]);
let mut cur_divergence_ref: &Vec<u32> = &divergences[divergences.len()-1];
let mut ct: i32 = 0;
let mut ct_extra: u32 = 0;
let mut zero_tot: u32 = 0;
let mut one_tot: u32 = 0;
let mut occ_positions: Vec<Vec<u32>> = vec![Vec::new(),Vec::new()];
let mut new_add: Vec<u8> = Vec::with_capacity(m);
let mut a: Vec<u32> = Vec::with_capacity(m);
let mut b: Vec<u32> = Vec::with_capacity(m);
let mut d: Vec<u32> = Vec::with_capacity(m);
let mut e: Vec<u32> = Vec::with_capacity(m);
let mut bin_values: Vec<u8> = Vec::with_capacity(m);
let mut elapse1 = Vec::new();
let mut elapse2 = Vec::new();
let mut elapse3 = Vec::new();
for col in 0..n {
if !col_set.contains(&(col as u32)) {
ct = 0;
ct_extra = 0;
zero_tot = 0;
one_tot = 0;
occ_positions = vec![Vec::new(),Vec::new()];
new_add = Vec::with_capacity(m);
let mut val: u8;
for idx in cur_prefix_ref.iter() {
let now = std::time::Instant::now();
let elapsed_first = now.elapsed();
unsafe {
val = *data.get_unchecked(*idx as usize).get_unchecked(j);
}
let elapsed_second = now.elapsed();
new_add.push(val);
if val == 0 {
zero_tot += 1;
} else if val == 1 {
one_tot += 1;
}
if (ct == 0) || (ct_extra == fm_gap) {
occ_positions[0].push(zero_tot);
occ_positions[1].push(one_tot);
ct_extra = 0;
}
ct += 1;
ct_extra += 1;
let elapsed_third = now.elapsed();
elapse1.push(elapsed_first);
elapse2.push(elapsed_second);
elapse3.push(elapsed_third);
}
binaries.push(new_add);
is_pbwt_col.push(0);
inserted_positions.push(col as u32);
count_vec.push(zero_tot);
occ_vec.push(occ_positions);
} else {
a = Vec::with_capacity(m);
b = Vec::with_capacity(m);
d = Vec::with_capacity(m);
e = Vec::with_capacity(m);
bin_values = Vec::with_capacity(m);
let mut p: u32 = j_pbwt+1;
let mut q: u32 = j_pbwt+1;
occ_positions = vec![Vec::new(),Vec::new()];
ct = 0;
ct_extra = 0;
zero_tot = 0;
one_tot = 0;
let mut cur_allele: u8;
for (idx,start_point) in
cur_prefix_ref.iter().zip(cur_divergence_ref.iter()) {
let idx_val = *idx;
unsafe {
cur_allele = *data.get_unchecked(idx_val as usize).get_unchecked(j);
}
bin_values.push(cur_allele);
let st = *start_point;
if st > p {
p = st;
}
if st > q {
q = st;
}
if cur_allele == 0 {
a.push(idx_val);
d.push(p);
p = 0;
zero_tot += 1;
}
if cur_allele == 1 {
b.push(idx_val);
e.push(q);
q = 0;
one_tot += 1;
}
if (ct == 0) || (ct_extra == fm_gap) {
occ_positions[0].push(zero_tot);
occ_positions[1].push(one_tot);
ct_extra = 0;
}
ct += 1;
ct_extra += 1;
}
let mut new_prefix = a;
new_prefix.append(&mut b);
let mut new_divergence = d;
new_divergence.append(&mut e);
prefixes.push(new_prefix);
divergences.push(new_divergence);
binaries.push(bin_values);
cur_prefix_ref = &(prefixes[prefixes.len()-1]);
cur_divergence_ref = &divergences[divergences.len()-1];
count_vec.push(zero_tot);
occ_vec.push(occ_positions);
is_pbwt_col.push(1);
pbwt_positions.push(col as u32);
}
j += 1;
}
let elapsed = now.elapsed();
println!("Calc Time: {:.4?}", elapsed);
println!("First: {:?}", &elapse1[500..530]);
println!("Second: {:?}", &elapse2[500..530]);
println!("Third: {:?}", &elapse3[500..530]);
}
/// Builds a random 4000 x 50000 matrix of 0/1 bytes (one `Vec<u8>` per row)
/// and runs the benchmark on it with a snapshot gap of 2.
fn main() {
    let rows = 4000;
    let cols = 50000;
    // Uniform over the half-open range [0, 2), i.e. each cell is 0 or 1.
    let allele_dist: Uniform<u8> = Uniform::new(0, 2);
    let mut rng = rand::thread_rng();
    let mut data = Vec::new();
    for _ in 0..rows {
        let row: Vec<u8> = allele_dist.sample_iter(&mut rng).take(cols).collect();
        data.push(row);
    }
    spaced_pbwt(&data, 2);
}
When I ran your code (on an i7-7700HQ), I got these numbers
First: [16ns, 17ns, 16ns, 16ns, 16ns, 17ns, 15ns, 15ns, 16ns, 17ns, 16ns, 16ns, 16ns, 16ns, 16ns, 16ns, 16ns, 15ns, 16ns, 15ns, 16ns, 16ns, 17ns, 16ns, 17ns, 16ns, 15ns, 16ns, 16ns, 16ns]
Second: [107ns, 104ns, 171ns, 109ns, 101ns, 112ns, 116ns, 169ns, 184ns, 177ns, 103ns, 108ns, 105ns, 79ns, 110ns, 112ns, 109ns, 165ns, 157ns, 104ns, 104ns, 409ns, 104ns, 107ns, 111ns, 104ns, 104ns, 104ns, 106ns, 117ns]
Third: [132ns, 126ns, 202ns, 132ns, 133ns, 140ns, 147ns, 197ns, 216ns, 207ns, 136ns, 138ns, 405ns, 105ns, 149ns, 139ns, 142ns, 198ns, 182ns, 126ns, 135ns, 434ns, 128ns, 136ns, 136ns, 127ns, 128ns, 129ns, 136ns, 147ns]
These have vastly different proportions from your results. Since you said there is a C program that runs faster, it should not be a problem with your system.
The next thing I can think about is you need a cargo clean and recompile the whole thing. Sometimes (I am on the nightly compiler) I had an issue, that made recompiled binaries slow, maybe because of some code-layout issue, compiler stuff idk. A clean build usually fixed it.
Next, you can try using link time optimization. Add this to your Cargo.toml:
[profile.lto]
inherits = "release"
lto = true
Then run the profile with
cargo run --profile lto
Third, use a single array, like some comments said. The ndarray crate is perfect for this. For me it brings down the times to
First: [18ns, 16ns, 17ns, 16ns, 17ns, 16ns, 18ns, 17ns, 17ns, 17ns, 17ns, 17ns, 17ns, 25ns, 16ns, 17ns, 18ns, 18ns, 17ns, 17ns, 18ns, 17ns, 17ns, 16ns, 17ns, 16ns, 16ns, 17ns, 17ns, 18ns]
Second: [51ns, 49ns, 48ns, 50ns, 51ns, 51ns, 49ns, 48ns, 48ns, 49ns, 50ns, 48ns, 53ns, 66ns, 49ns, 53ns, 52ns, 50ns, 50ns, 49ns, 53ns, 51ns, 47ns, 50ns, 52ns, 50ns, 48ns, 48ns, 48ns, 50ns]
Third: [77ns, 77ns, 75ns, 74ns, 83ns, 81ns, 75ns, 72ns, 81ns, 74ns, 82ns, 79ns, 552ns, 99ns, 81ns, 76ns, 79ns, 74ns, 77ns, 73ns, 86ns, 76ns, 75ns, 80ns, 85ns, 75ns, 74ns, 73ns, 74ns, 76ns]
use ndarray::Array2;
use std::collections::HashSet;
// ndarray variant of the benchmark: `data` is an `Array2<u8>` instead of a Vec of Vecs.
// The `...` lines stand for code the author elided; only the parts that differ
// from the Vec-of-Vec version are shown.
pub fn spaced_pbwt(data: &Array2<u8>, fm_gap: u32) -> () {
let now = std::time::Instant::now();
// Row and column counts are taken from the array's shape.
let (m, n) = data.dim();
let half_n = n/2;
...
unsafe {
// Unchecked element read at (row `idx`, column `j`).
// NOTE(review): the code defining `idx` and `j` is elided, so whether both
// are guaranteed in bounds cannot be checked from here.
val = *data.uget((idx as usize, j));
}
...
}
/// Generates the same random 0/1 data in two layouts, a Vec of rows and one
/// flat row-major buffer, then benchmarks the ndarray version on the flat one.
fn main() {
    let (rows, cols) = (4000, 50000);
    // Uniform over the half-open range [0, 2), i.e. each cell is 0 or 1.
    let allele_dist: Uniform<u8> = Uniform::new(0, 2);
    let mut rng = rand::thread_rng();
    // The nested copy is only needed by the commented-out comparison call below.
    let mut nested = Vec::new();
    let mut flat = Vec::with_capacity(rows * cols);
    for _ in 0..rows {
        let row: Vec<u8> = allele_dist.sample_iter(&mut rng).take(cols).collect();
        flat.extend_from_slice(&row);
        nested.push(row);
    }
    let gap = 2;
    let matrix = Array2::from_shape_vec((rows, cols), flat).unwrap();
    spaced_pbwt(&matrix, gap);
    //spaced_pbwt_vecs(&nested,gap);
}
As the title suggests, I am using Google OR-Tools to tackle a bin-packing problem. I would like to require that all orders packed into a given truck have the same delivery destination. The following is my attempt at implementing this, which doesn't seem to be working:
# Excerpts from the asker's model (shown out of order: `data` is filled in
# after the loop that reads it, and `solver` is created elsewhere).
# x[i, j] = 1 if item i is packed in bin j
x = {}
for i in data['orders']:
for j in data['trucks']:
x[(i, j)] = solver.IntVar(0, 1, 'x_%i_%i' % (i, j))
# Sample data: truck ids, order ids, and the destination county of each order
# (indexed by order id).
data['trucks'] = [0, 1, 2, 3, ...]
data['orders'] = [0, 1, 2, 3, ...]
data['delivery_county_id'] = [8, 8, 8, 1, 3, 2, ...]
from itertools import groupby
# Checks if all elements of list are equal
# Returns a plain Python bool: True for an empty iterable or a single run of
# equal elements, False otherwise.
def all_equal(iterable):
g = groupby(iterable)
return next(g, True) and not next(g, False)
# Attempted constraint. Everything inside `solver.Add(...)` is evaluated by
# Python while the model is being built, before any solve happens:
#   * `x[(i, j)] == 1` is applied to a solver variable, not to a solved value;
#   * `all_equal(...)` returns a bool, so `solver.Add` receives the result of
#     `<bool> == True` rather than a relation between decision variables.
# NOTE(review): presumably this is why no error is raised and the constraint has
# no effect; confirm against the solver's handling of a boolean argument.
for j in data['trucks']:
solver.Add( all_equal ( [ x[(i, j)] * data['delivery_county_id'][i] for i in data['orders'] if x[(i, j)] == 1 ] ) == True )
Strangely, I am not getting any errors when I execute the code, but my constraint is not being obeyed. I'm not sure why that is. Any assistance or suggestions would be profoundly appreciated!
I don't have a working Python installation, but this is how it could be done in c#:
// Builds the CP-SAT model for "every truck carries orders for at most one county".
// Fills the fields deliveryCountyId, x and y, then adds three constraint groups:
// each order on exactly one truck, y linked to x, and at most one county per truck.
public void initModel(CpModel model)
{
    // Make up some data for the counties of the orders
    deliveryCountyId = new int[nOrders];
    for (int order = 0; order < nOrders; order++)
    {
        deliveryCountyId[order] = order % nCounties;
    }

    // x[order, truck] is true when the order is shipped by that truck
    x = new IntVar[nOrders, nTrucks];
    for (int order = 0; order < nOrders; order++)
    {
        for (int truck = 0; truck < nTrucks; truck++)
        {
            string label = $"Item {order} (county {deliveryCountyId[order]}) in truck {truck}";
            x[order, truck] = model.NewBoolVar(label);
        }
    }

    // y[truck, county] is true when the truck carries at least one order for that county
    y = new IntVar[nTrucks, nCounties];
    for (int truck = 0; truck < nTrucks; truck++)
    {
        for (int county = 0; county < nCounties; county++)
        {
            string label = $"Truck {truck} has item for county {county}";
            y[truck, county] = model.NewBoolVar(label);
        }
    }

    // Each item must be shipped by exactly one truck
    for (int order = 0; order < nOrders; order++)
    {
        var assignments = new List<IntVar>();
        for (int truck = 0; truck < nTrucks; truck++)
        {
            assignments.Add(x[order, truck]);
        }
        model.Add(LinearExpr.Sum(assignments) == 1);
    }

    // Link y to x: y[truck, county] is the maximum of the x variables of that
    // county's orders on the truck, or fixed to 0 when the county has no orders.
    for (int truck = 0; truck < nTrucks; truck++)
    {
        for (int county = 0; county < nCounties; county++)
        {
            var carried = new List<IntVar>();
            for (int order = 0; order < nOrders; order++)
            {
                if (deliveryCountyId[order] != county)
                {
                    continue;
                }
                carried.Add(x[order, truck]);
            }

            if (carried.Count == 0)
            {
                model.Add(y[truck, county] == 0);
                continue;
            }
            model.AddMaxEquality(y[truck, county], carried);
        }
    }

    // Each truck may carry items for only one county
    for (int truck = 0; truck < nTrucks; truck++)
    {
        var served = new List<IntVar>();
        for (int county = 0; county < nCounties; county++)
        {
            served.Add(y[truck, county]);
        }
        model.Add(LinearExpr.Sum(served) <= 1);
    }
}
}
You can easily express the equivalent method calls in Python.
I have used Google-ortool's CP-SAT solver (python). Please see the code below, I have added the desired constraint
import random
from ortools.sat.python import cp_model as cp
# --- Problem data ---------------------------------------------------------
trucks = list(range(1, 9))
orders = list(range(1, 51))
delivery_county_id = [random.randint(1, 8) for _ in orders]
# Each order is carried around as an (order_id, county_id) pair.
order_country = list(zip(orders, delivery_county_id))
# Distinct counties that actually occur, in a stable order.
counties = sorted(set(delivery_county_id))

model = cp.CpModel()

# dv_order_truck[(order, county), truck] is true when that order rides on that truck.
dv_order_truck = {}
for ord_cntry in order_country:
    for trck in trucks:
        dv_order_truck[(ord_cntry, trck)] = model.NewBoolVar(
            "order_%i_truck_%i" % (ord_cntry[0], trck)
        )

# one order in one truck only
for ord_cntry in order_country:
    model.Add(sum(dv_order_truck[(ord_cntry, trck)] for trck in trucks) == 1)

# all orders packed into a given truck have the same delivery destination
# dv_truck_country[truck, county] is true when the truck carries at least one
# order bound for that county.
dv_truck_country = {}
for trck in trucks:
    for cntry in counties:
        dv_truck_country[trck, cntry] = model.NewBoolVar(
            "truck_%i_county_%i" % (trck, cntry)
        )

for trck in trucks:
    for cntry in counties:
        # Look the variables up directly instead of scanning the whole dict for
        # every (truck, county) pair. The list is never empty because `counties`
        # only contains counties that have at least one order.
        orders_inTruck_cntry = [
            dv_order_truck[(ord_cntry, trck)]
            for ord_cntry in order_country
            if ord_cntry[1] == cntry
        ]
        model.AddMaxEquality(dv_truck_country[trck, cntry], orders_inTruck_cntry)

for trck in trucks:
    # "At most one county per truck". Using `== 1` here would additionally force
    # every truck to carry something, which the requirement does not ask for.
    model.Add(sum(dv_truck_country[trck, cntry] for cntry in counties) <= 1)

solver = cp.CpSolver()
status = solver.Solve(model)

# inspect the solution
solution = []
if status in (cp.OPTIMAL, cp.FEASIBLE):
    # Variable values are only meaningful when the solver found a solution.
    solution = [
        (ord_cntry, trck)
        for ord_cntry in order_country
        for trck in trucks
        if solver.Value(dv_order_truck[(ord_cntry, trck)]) > 0
    ]
else:
    print("No feasible assignment found (status: %s)" % solver.StatusName(status))

# Group the assignments by county, highest county id first, and show them.
solution_by_county = sorted(solution, key=lambda x: x[0][1], reverse=True)
print(solution_by_county)
I rewrote code that solves the GenomicRangeQuery task from Java to Swift. The Java code gets a 100/100 score, but the Swift code fails all performance tests. I'm trying to understand why, because the logic in both is the same, and I'm wondering why the Swift code takes so long to execute. Am I using some very slow parts of Swift that I'm not aware of? Please take a look at this Java code copied from here.
class Solution {
public int[] solveGenomicRange(String S, int[] P, int[] Q) {
//used jagged array to hold the prefix sums of each A, C and G genoms
//we don't need to get prefix sums of T, you will see why.
int[][] genoms = new int[3][S.length()+1];
//if the char is found in the index i, then we set it to be 1 else they are 0
// 3 short values are needed for this reason
short a, c, g;
for (int i=0; i<S.length(); i++) {
a = 0; c = 0; g = 0;
if ('A' == (S.charAt(i))) {
a=1;
}
if ('C' == (S.charAt(i))) {
c=1;
}
if ('G' == (S.charAt(i))) {
g=1;
}
//here we calculate prefix sums. To learn what's prefix sums look at here https://codility.com/media/train/3-PrefixSums.pdf
genoms[0][i+1] = genoms[0][i] + a;
genoms[1][i+1] = genoms[1][i] + c;
genoms[2][i+1] = genoms[2][i] + g;
}
int[] result = new int[P.length];
//here we go through the provided P[] and Q[] arrays as intervals
for (int i=0; i<P.length; i++) {
int fromIndex = P[i];
//we need to add 1 to Q[i],
//because our genoms[0][0], genoms[1][0] and genoms[2][0]
//have 0 values by default, look above genoms[0][i+1] = genoms[0][i] + a;
int toIndex = Q[i]+1;
if (genoms[0][toIndex] - genoms[0][fromIndex] > 0) {
result[i] = 1;
} else if (genoms[1][toIndex] - genoms[1][fromIndex] > 0) {
result[i] = 2;
} else if (genoms[2][toIndex] - genoms[2][fromIndex] > 0) {
result[i] = 3;
} else {
result[i] = 4;
}
}
return result;
}
}
And here the same code rewritten to Swift 2.1
/// Answers GenomicRangeQuery: for each query i, the minimal impact factor
/// (A=1, C=2, G=3, T=4) among the characters of S between positions P[i] and
/// Q[i], both inclusive.
///
/// The string is walked exactly once. Indexing it with
/// `S[S.startIndex.advancedBy(i)]` inside the loop has to step through i
/// characters on every access, which makes the prefix pass quadratic.
public func solution(inout S:String, inout _ P:[Int], inout _ Q:[Int]) -> [Int] {
    // prefixX[i] = number of X's among the first i characters of S.
    // T needs no array: it is the answer when A, C and G are all absent.
    var prefixA = [0]
    var prefixC = [0]
    var prefixG = [0]
    var a = 0, c = 0, g = 0
    for char in S.characters {
        switch char {
        case "A": a += 1
        case "C": c += 1
        case "G": g += 1
        default: break
        }
        prefixA.append(a)
        prefixC.append(c)
        prefixG.append(g)
    }

    var result: [Int] = []
    result.reserveCapacity(P.count)
    for i in 0..<P.count {
        let fromIndex = P[i]
        // The prefix arrays are shifted by one (index 0 holds the empty prefix),
        // so the inclusive end Q[i] maps to index Q[i] + 1.
        let toIndex = Q[i] + 1
        if prefixA[toIndex] - prefixA[fromIndex] > 0 {
            result.append(1)
        } else if prefixC[toIndex] - prefixC[fromIndex] > 0 {
            result.append(2)
        } else if prefixG[toIndex] - prefixG[fromIndex] > 0 {
            result.append(3)
        } else {
            result.append(4)
        }
    }
    return result
}
Does anybody know why this Swift code fails all performance tests when Java code passes all tests? I suppose I'm touching some sensitive bottleneck in Swift but I'm not aware where.
If someone is not aware of codility this is the link to the task.
This Java code for the GenomicRangeQuery problem scored 100% at codility.
It uses 4 simple Arrays to do the prefix sums.
I post it here as an alternative approach.
Time Complexity is O(n+m)
/**
 * Answers GenomicRangeQuery with four prefix-count arrays, one per nucleotide.
 * For each query i the result is 1, 2, 3 or 4 for the first of A, C, G, T that
 * occurs between positions P[i] and Q[i] (inclusive); the entry stays 0 if none
 * of the four occurs.
 */
public int[] solution4(String S, int[] P, int[] Q){
    final char[] sequence = S.toCharArray();
    final int length = sequence.length;
    // seenX[i] = number of X's among the first i characters.
    int[] seenA = new int[length + 1];
    int[] seenC = new int[length + 1];
    int[] seenG = new int[length + 1];
    int[] seenT = new int[length + 1];
    for (int i = 0; i < length; i++) {
        char base = sequence[i];
        seenA[i + 1] = seenA[i] + (base == 'A' ? 1 : 0);
        seenC[i + 1] = seenC[i] + (base == 'C' ? 1 : 0);
        seenG[i + 1] = seenG[i] + (base == 'G' ? 1 : 0);
        seenT[i + 1] = seenT[i] + (base == 'T' ? 1 : 0);
    }

    int[] impacts = new int[P.length];
    for (int query = 0; query < P.length; query++) {
        int from = P[query];
        // Inclusive end position Q maps to prefix index Q + 1.
        int to = Q[query] + 1;
        int inA = seenA[to] - seenA[from];
        int inC = seenC[to] - seenC[from];
        int inG = seenG[to] - seenG[from];
        int inT = seenT[to] - seenT[from];
        if (inA > 0) {
            impacts[query] = 1;
        } else if (inC > 0) {
            impacts[query] = 2;
        } else if (inG > 0) {
            impacts[query] = 3;
        } else if (inT > 0) {
            impacts[query] = 4;
        }
    }
    return impacts;
}
/**
 * Counts the occurrences inside the slice x..y (both inclusive), given the
 * prefix-count array P in which P[i] is the count over the first i elements.
 */
public int contaFatia(int[]P,int x,int y){
    final int throughEnd = P[y + 1];
    final int beforeStart = P[x];
    return throughEnd - beforeStart;
}
/// Brute-force GenomicRangeQuery: for every query, scans the characters between
/// P[i] and Q[i] (inclusive) and reports the smallest impact factor found
/// (A=1, C=2, G=3, anything else leaves the default 4).
public func solution(_ S : inout String, _ P : inout [Int], _ Q : inout [Int]) -> [Int] {
    let nucleotides = Array(S)
    var answers = [Int]()
    for query in 0..<P.count {
        var lowest = 4
        // A single-position query (P == Q) is just a one-element scan.
        scan: for position in P[query]...Q[query] {
            switch nucleotides[position] {
            case "A":
                // Nothing can beat A, so stop scanning this range.
                lowest = 1
                break scan
            case "C":
                lowest = 2
            case "G":
                // Only improves on the default; never overrides an earlier C.
                lowest = min(lowest, 3)
            default:
                break
            }
        }
        answers.append(lowest)
    }
    return answers
}
I have been playing with things in Swift for a while trying to come up with the right solution. This is the closest I have come.
/// Answers GenomicRangeQuery with prefix counts: for each query i, appends the
/// impact factor (A=1, C=2, G=3, T=4) of the first nucleotide, in that order,
/// that occurs between positions P[i] and Q[i] (inclusive).
///
/// The string is iterated once. Looking characters up with
/// `S.index(S.startIndex, offsetBy:)` inside the loop walks the string from the
/// start on every access, which makes the prefix pass quadratic. The counts live
/// in one flat buffer instead of one small array per position.
public func solution(_ S : inout String, _ P : inout [Int], _ Q : inout [Int]) -> [Int] {
    // prefix[4 * i + k] = occurrences of nucleotide k (0=A, 1=C, 2=G, 3=T)
    // among the first i characters of S.
    var running = [0, 0, 0, 0]
    var prefix = ContiguousArray<Int>(repeating: 0, count: 4)
    // The UTF-8 length is an upper bound on the character count.
    prefix.reserveCapacity(4 * (S.utf8.count + 1))
    for nucleotide in S {
        switch nucleotide {
        case "A":
            running[0] += 1
        case "C":
            running[1] += 1
        case "G":
            running[2] += 1
        case "T":
            running[3] += 1
        default:
            break
        }
        prefix.append(contentsOf: running)
    }

    let M: Int = P.count
    var minimalImpacts: [Int] = []
    minimalImpacts.reserveCapacity(M)
    for i in 0..<M {
        let from = 4 * P[i]
        // Inclusive end position Q[i] maps to prefix row Q[i] + 1.
        let to = 4 * (Q[i] + 1)
        for j in 0..<4 where (prefix[to + j] - prefix[from + j]) > 0 {
            minimalImpacts.append(j + 1)
            break
        }
    }
    return minimalImpacts
}
I have a prime sieve whose sequential version runs great. I finally figured out how to make the inner loop run in parallel, but (as I feared based on prior experience with other languages) the single threaded version is faster.
Can this parallel version in Rust be optimized?
extern crate crossbeam;
/// Modulus of the wheel: 2 * 3 * 5 * 7.
const MD: usize = 210;
/// Number of residues coprime to `MD` in one wheel turn.
const RESCNT: usize = 48;
/// Residue table: index 0 is a placeholder, indices 1..=48 are the residues
/// 11..=211 coprime to `MD` (211 stands for residue 1 of the next turn).
static RESIDUES: [usize; 49] = [
    1, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101,
    103, 107, 109, 113, 121, 127, 131, 137, 139, 143, 149, 151, 157, 163, 167, 169, 173, 179,
    181, 187, 191, 193, 197, 199, 209, 211,
];

/// Counts the primes up to `val` (2, 3, 5 and 7 are always included) with a
/// mod-210 wheel sieve whose inner loop runs on several threads.
///
/// For each sieving prime the 48 residue tracks are split into chunks, one
/// scoped thread per chunk. Flags are atomic bytes so the threads can mark
/// composites in the same buffer without `&mut` aliasing; the scope joins all
/// threads before the outer loop reads the flags again.
///
/// Spawning threads per sieving prime is still expensive compared with the tiny
/// amount of work each one does, so for small `val` this is not expected to beat
/// a sequential loop.
///
/// NOTE(review): as in the original, `val` is assumed to be at least 11; smaller
/// values still report the four wheel primes.
fn count_primes_parallel(val: usize) -> usize {
    use std::sync::atomic::{AtomicU8, Ordering};

    // posn[residue] = slot of that residue inside one wheel turn.
    let mut posn = [0usize; MD];
    for i in 1..RESCNT {
        posn[RESIDUES[i]] = i - 1;
    }
    posn[1] = RESCNT - 1;

    // Largest odd number <= val, and the number of candidate slots up to it.
    let num = (val - 1) | 1;
    let k_max = num / MD;
    let modk_max = MD * k_max;
    let mut r_max = 1;
    while num >= modk_max + RESIDUES[r_max] {
        r_max += 1;
    }
    let maxpcs = k_max * RESCNT + r_max - 1;
    println!("num = {}, k = {}, modk = {}, maxpcs = {}", num, k_max, modk_max, maxpcs);

    // f64 keeps the square root exact for far larger inputs than f32 does.
    let sqrt_n = (num as f64).sqrt() as usize;

    // One flag per candidate: 0 = still considered prime, 1 = composite.
    let prms: Vec<AtomicU8> = (0..maxpcs).map(|_| AtomicU8::new(0)).collect();
    let flags: &[AtomicU8] = &prms;
    let posn_ref = &posn;
    let tracks_all: &'static [usize] = &RESIDUES[1..=RESCNT];

    let workers = std::thread::available_parallelism()
        .map(|w| w.get())
        .unwrap_or(1)
        .min(RESCNT);
    let chunk = (RESCNT + workers - 1) / workers;

    // sieve to identify/eliminate nonprimes/locations in prms array
    let (mut modk, mut r, mut k) = (0usize, 0usize, 0usize);
    for flag in flags.iter() {
        r += 1;
        if r > RESCNT {
            r = 1;
            modk += MD;
            k += 1;
        }
        if flag.load(Ordering::Relaxed) == 1 {
            continue;
        }
        let prm_r = RESIDUES[r];
        let prime = modk + prm_r;
        if prime > sqrt_n {
            break;
        }
        let prmstep = prime * RESCNT;
        // A single scope per prime: all chunks run concurrently and are joined
        // when the scope ends, so the next outer iteration sees every mark.
        std::thread::scope(|scope| {
            for tracks in tracks_all.chunks(chunk) {
                scope.spawn(move || {
                    for &ri in tracks {
                        let prod = prm_r * ri;
                        // Slot of prime * (modk + ri), the first multiple on this track.
                        let mut np = (k * (prime + ri) + (prod - 2) / MD) * RESCNT
                            + posn_ref[prod % MD];
                        while np < maxpcs {
                            flags[np].store(1, Ordering::Relaxed);
                            np += prmstep;
                        }
                    }
                });
            }
        });
    }

    // the prms array now has all the positions for primes r1..N
    // Start at 4 to account for the wheel primes 2, 3, 5 and 7.
    4 + flags.iter().filter(|f| f.load(Ordering::Relaxed) == 0).count()
}

fn main() {
    let val = 1_000_000;
    println!("val = {}, mod = {}, rescnt = {}", val, MD, RESCNT);
    let prmcnt = count_primes_parallel(val);
    println!("{}", prmcnt);
}
Using Rust 1.6 on Linux.
What's the most efficient algorithm anyone can think of that, given a natural number n, returns the least natural number x with n positive divisors (including 1 and x)? For example, given 4 the algorithm should result in 6 (divisors: 1,2,3,6); i.e. 6 is the smallest number having 4 distinct factors. Similarly, given 6, the algorithm should result in 12 (divisors: 1,2,3,4,6,12); i.e. 12 is the smallest number having 6 distinct factors
In terms of real-world performance, I'm looking for a scalable algorithm which can give answers of the order of $10^{20}$ within 2 seconds on a machine which can do $10^{7}$ computations per second.
http://www.primepuzzles.net/problems/prob_019.htm
b) Jud McCranie, T.W.A. Baumann & Enoch Haga sent basically the same
procedure to find N(d) for a given d:
Factorize d as a product of its prime divisors: $d = p_1^{a_1} \cdot p_2^{a_2} \cdot p_3^{a_3} \cdots$
Convert this factorization into another arithmetically equivalent factorization, composed of non-powered, monotonically decreasing and not
necessarily prime factors... (uf!...) $d = p_1^{a_1} \cdot p_2^{a_2} \cdot p_3^{a_3} \cdots =
b_1 \cdot b_2 \cdot b_3 \cdots$ such that $b_1 \ge b_2 \ge b_3 \ge \cdots$
You must realize that for every given d, there are several
arithmetically equivalent factorizations that can be done: by example:
if $d = 16 = 2^4$ then there are 5 equivalent factorizations:
d = 2*2*2*2 = 4*2*2 = 4*4 = 8*2 = 16
N is the minimal number resulting from computing $2^{b_1-1} \cdot 3^{b_2-1} \cdot 5^{b_3-1} \cdots$ for all the equivalent factorizations of d. Working the same example:
$N(16) = \min\{2 \cdot 3 \cdot 5 \cdot 7,\; 2^3 \cdot 3 \cdot 5,\; 2^3 \cdot 3^3,\; 2^7 \cdot 3,\; 2^{15}\} = 2^3 \cdot 3 \cdot 5 = 120$
Update: With numbers around $10^{20}$, pay attention to the notes by Christian Bau quoted on the same page.
//What is the smallest number with X factors?
/**
 * Returns the smallest natural number that has exactly `factorCount` positive
 * divisors (1 and the number itself included), and logs it together with the
 * list of its divisors.
 *
 * Method: write factorCount as a product b1 >= b2 >= ... >= bk of integers >= 2
 * in every possible way; each such product yields the candidate
 * 2^(b1-1) * 3^(b2-1) * 5^(b3-1) * ... and the answer is the smallest candidate.
 * All such products are enumerated, including ones like 4*4*2*2 that need more
 * than one group of prime factors merged (required e.g. for 64 -> 7560).
 *
 * No built-in prototypes are modified.
 *
 * @param {number} factorCount positive integer number of divisors wanted
 * @returns {number} the smallest number with that many divisors; exact only
 *   while the result does not exceed Number.MAX_SAFE_INTEGER
 * @throws {RangeError} if factorCount is not a positive integer, or if every
 *   candidate overflows the range of Number
 */
function smallestNumberWithThisManyFactors(factorCount) {
  if (!Number.isInteger(factorCount) || factorCount < 1) {
    throw new RangeError('factorCount must be a positive integer');
  }

  // Lazily grown list of primes; primeAt(0) === 2, primeAt(1) === 3, ...
  const primes = [2];
  function primeAt(index) {
    let candidate = primes[primes.length - 1];
    while (primes.length <= index) {
      candidate += 1;
      const isPrime = primes.every(function (p) { return candidate % p !== 0; });
      if (isPrime) primes.push(candidate);
    }
    return primes[index];
  }

  let best = Infinity;
  let bestExponents = [];

  // Depth-first enumeration of non-increasing factorizations of `remaining`.
  // `value` is the candidate built so far; it only grows, so any branch that
  // already reaches `best` can be cut.
  function search(remaining, maxFactor, primeIndex, value, exponents) {
    if (remaining === 1) {
      if (value < best) {
        best = value;
        bestExponents = exponents.slice();
      }
      return;
    }
    const prime = primeAt(primeIndex);
    for (let factor = Math.min(remaining, maxFactor); factor >= 2; factor--) {
      if (remaining % factor !== 0) continue;
      const next = value * Math.pow(prime, factor - 1);
      if (next >= best) continue;
      exponents.push(factor - 1);
      search(remaining / factor, factor, primeIndex + 1, next, exponents);
      exponents.pop();
    }
  }
  search(factorCount, factorCount, 0, 1, []);

  if (!Number.isFinite(best)) {
    throw new RangeError('The result does not fit in a Number');
  }

  // The prime factorization of the answer is known, so its divisors can be
  // built directly instead of trial-dividing every number up to half of it.
  let divisors = [1];
  bestExponents.forEach(function (exponent, index) {
    const prime = primeAt(index);
    const expanded = [];
    divisors.forEach(function (divisor) {
      let multiple = divisor;
      for (let e = 0; e <= exponent; e++) {
        expanded.push(multiple);
        multiple *= prime;
      }
    });
    divisors = expanded;
  });
  divisors.sort(function (a, b) { return a - b; });

  const smallestNumberWithFactorCount = best;
  console.log('The smallest number with ' + factorCount + ' factors is: ')
  console.log(smallestNumberWithFactorCount)
  console.log('With these factors being: ')
  console.log(divisors)
  return smallestNumberWithFactorCount;
}
smallestNumberWithThisManyFactors(10)