Rust performance issue - High complexity code

Rust performance issue - High complexity code - performance

I bought RUST by its performance so I decided to translate one project where performance matters a lot, from JAVA 11 to Rust.
The thing is the Version written in JAVA performance pretty much better more than 3x in single thread, +10X in multi thread
For context purpose: The most complex code is a function that trys to find an assigment between 2 sets, imagine that you have houses and stores, the stores have a fixed capacity and houses have necesities, you want to find the best assignment to walk less.
With all this in mind, I guess that the problem is how I use the variables, maybe clone() is called too much automatically, maybe reference access cause some unknown behavior.
Any upgrade that reduce while loop time will be great because it iterate over 5000 times. Sorry for the long code but I think everything is relevant in this case. You can't copy and paste this code if you want I can send you the git project link.
PD: I'm running with cargo run --release
pub fn evaluate(elem: &Element) -> EvaluatedElement {
let p1 = properties::get_cast::<f64>("p1");
let p2 = properties::get_cast::<usize>("p2");
let p3 = properties::get_cast::<usize>("p3");
let p4 = properties::get_cast::<f64>("p4");
let p5 = properties::get_array::<usize>("p5");
let mut kinds1 = kind1::get_map(); //almost 300 elements
let kinds2 = = kind2::get_map(); //almost 300 elements
let usables = elem.usables();
for (i, &a) in usables.iter().enumerate() {
if !a {
&kinds1.remove(&(i + 1));
}
}
let mut assignations = HashMap::new();
for k in (1..=p2).rev() {
let mut kinds2_sub = HashMap::with_capacity((&kinds2).len());
for (_, p) in kinds2.iter() {
if p.val1[k - 1] == 0 {
continue;
}
&kinds2_sub.insert(p.id, Kind2Sub {
parent: p.clone(),
val2: p.val1[k - 1],
val3: std::f64::MAX,
kind1_id: std::usize::MAX,
});
}
let mut opt_kind1_id: Option<usize> = Option::None;
while !&kinds2_sub.is_empty() {//arround 5500 times loop
for mut l in kinds2_sub.values_mut() {
match opt_kind1_id {
None => (),
Some(id) => if !l.kind1_id == id { continue; },
}
l.val3 = std::f64::MAX;
l.kind1_id = std::usize::MAX;
for b in kinds1.values_mut() {
let dist_b_l = calc_dist(b.id, l.id);
if dist_b_l > p4
|| (p1 as usize).min(l.val2) > p4 + b.val3
|| b.val2 < k
|| (l.val2 < (2 * p4) && (b.val3 as i16 - l.val2 as i16) < 0)
{ continue; }
let tmp = dist_b_l * p1.min(l.val2 as f64);
if l.val3 > tmp {
l.val3 = tmp;
l.kind1_id = b.id;
}
}
}
let lc = kinds2_sub.values_mut().min_by(|x, y| x.val3.partial_cmp(&y.val3).unwrap()).unwrap();
let obc = kinds1.get_mut(&lc.kind1_id);
let bc = obc.unwrap_or_else(|| {
panic!("No assignation able")
});
let b_c_id = (*bc).id;
let l_c_id = (*lc).id;
let time = if lc.val2 < (2usize * p1 as usize) { lc.val2 } else { p1 as usize };
let val = (*bc).val3 as i16 - time as i16;
let assignation = Assignation { kind1_id: (*bc).id, kind2_id: lc.id, val3: k, val4: 0 };
let assignation_id = assignation.id();//id() = fn concatenate first 3 values
if !assignations.contains_key(&assignation_id) {
assignations.insert(assignation.id(), assignation);
}
let mut assignation = assignations.get_mut(&assignation_id).unwrap_or_else(|| panic!("Assignation not found {}", assignation_id));
if val >= 0 {
assignation.val4 += time;
lc.val2 -= time;
(*bc).val3 -= time;
} else {
assignation.val4 += (*bc).val3;
lc.val2 -= (*bc).val3;
(*bc).val3 = 0;
}
if (*bc).val3 < p4 {
&kinds1.remove(&b_c_id);
}
if lc.val2 == 0 {
&kinds2_sub.remove(&l_c_id);
}
opt_kind1_id = Some(b_c_id);
}
}
let assignations_values = assignations.iter().map(|(_, v)| v.clone()).collect();
EvaluatedElement::evaluation(assignations_values)
}

Now I have a 4X increase.
Step Value Time Used Stores
RUST -> BI 90 2672540 28057 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0]
Java -> BI 90 2672625 4704 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0]
FIX: -> BI 90 2672540 1093 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0]
The "bug"
match opt_kind1_id {
None => (),
Some(id) => if !l.kind1_id == id { continue; },
}
The fix
if let Some(id) = opt_kind1_id {
if l.kind1_id != id {
continue;
}
}
This continue skips 90% of finding a new value

Related

Integer Linear Program, Bipartite Matching with Constraints How To?

The following program produces a matching between two sets of vertices, one represents meets between two teams and the other time slots when the meets could happen. The adjacency map represents both teams' availability to meet at any given time slot, days[][] represents which time slots are on the same date, weekDays[][] represents what day of the week is a given date, and teamToGames maps every meet that includes a given team. The decision variables are in a map match[][], with a value of 1 where a meet is matched to a time slot. Constraints are added so that only 1 meet can be matched to a time slot, only 1 time slot can be matched to a meet, only if allowed by the respective adj[][] value, and so that meets involving the same team cannot be matched to a game slot on the same date nor the following or previous date, excluding thursday-sunday.
What I don't know how to do now, is how can I make it prefer assigning two or more meets on the same date rather than one meet on each of two separate dates? So that there is the least possible amount of dates with only one meet. Kind of like constraining the number of meets on a date to either 0 or >=2 but only if possible.
Thank you for reading and for any help you can offer.
// [START program]
// [START import]
import com.google.ortools.linearsolver.MPConstraint;
import com.google.ortools.linearsolver.MPObjective;
import com.google.ortools.linearsolver.MPSolver;
import com.google.ortools.linearsolver.MPVariable;
// [END import]
/** MIP example that solves an assignment problem. */
public class GameMatching {
static {
System.loadLibrary("jniortools");
}
public static void main(String[] args) {
// Data
// [START data_model]
// Adjacency matrix represents which games can happen on which dates
int[][] adj = {
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1},
{1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1},
{1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1},
{1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
};
int numGames = adj.length;
int numDates = adj[0].length;
//represents which game time slots are on a given day (4 games on sundays, 2 on weekdays)
int[][] days = {
{0, 1, 2, 3},
{4, 5},
{6, 7},
{8, 9, 10, 11},
{12, 13},
{14, 15},
{16, 17},
{18, 19},
{20, 21, 22, 23},
{24, 25},
{26, 27},
{28, 29},
{30, 31},
{32, 33, 34, 35},
{36, 37},
{38, 39},
{40, 41},
{42, 43},
{44, 45, 46, 47},
{48, 49},
{50, 51},
{52, 53, 54, 55},
{56, 57},
{58, 59},
{60, 61},
{62, 63},
{64, 65, 66, 67},
{68, 69},
{70, 71},
{72, 73},
{74, 75},
{76, 77, 78, 79}
};
//represents what day of the week is a day, a team can play thursday and sunday, but not sunday and monday 0 is sunday, 1 is monday...
int[] weekDays = {0, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0};
// teamToGames[i][j] represents a team i's, games j
int[][] teamToGames = {
{1, 3, 9, 16, 18, 26},
{0, 8, 12, 16, 23, 28},
{1, 5, 7, 13, 21, 27},
{2, 5, 14, 17, 22, 26},
{7, 15, 19, 21, 24, 28},
{3, 10, 14, 20, 27, 29},
{2, 6, 9, 13, 23, 29},
{6, 8, 11, 18, 19, 25},
{8, 4, 10, 11, 17, 24},
{4, 12, 15, 20, 22, 25},
};
// [END data_model]
// Solver
// [START solver]
// Create the linear solver with the CBC backend.
MPSolver solver = new MPSolver("AssignmentMip", MPSolver.OptimizationProblemType.CBC_MIXED_INTEGER_PROGRAMMING);
// [END solver]
// Variables
// [START variables]
// x[i][j] is an array of 0-1 variables, which will be 1
// if a game i is assigned to date j.
MPVariable[][] match = new MPVariable[numGames][numDates];
for (int i = 0; i < numGames; ++i) {
for (int j = 0; j < numDates; ++j) {
match[i][j] = solver.makeIntVar(0, 1, "");
}
}
// [END variables]
// Constraints
// [START constraints]
// Each game is assigned to at most one date.
for (int i = 0; i < numGames; ++i) {
MPConstraint constraint = solver.makeConstraint(0, 1, "");
for (int j = 0; j < numDates; ++j) {
constraint.setCoefficient(match[i][j], 1);
}
}
// Each date is assigned to at most one game.
for (int j = 0; j < numDates; ++j) {
MPConstraint constraint = solver.makeConstraint(0, 1, "");
for (int i = 0; i < numGames; ++i) {
constraint.setCoefficient(match[i][j], 1);
}
}
// Can only assign respecting adj matrix
for (int i = 0; i < numGames; ++i) {
for (int j = 0; j < numDates; ++j) {
MPConstraint constraint = solver.makeConstraint(0, adj[i][j], "");
constraint.setCoefficient(match[i][j], 1);
}
}
// Cannot assign team to consecutive dates
for (int i = 0; i < teamToGames.length; ++i) {
for (int j = 0; j < days.length - 1; ++j) {
if (weekDays[j] != 4) {
MPConstraint constraint = solver.makeConstraint(0, 1, "");
for (int k = 0; k < teamToGames[i].length; ++k) {
for (int l = 0; l < days[j].length; ++l) {
constraint.setCoefficient(match[teamToGames[i][k]][l], 1);
}
for (int l = 0; l < days[j+1].length; ++l) {
constraint.setCoefficient(match[teamToGames[i][k]][l], 1);
}
}
}
}
}
// [END constraints]
// Objective
// [START objective]
MPObjective objective = solver.objective();
for (int i = 0; i < numGames; ++i) {
for (int j = 0; j < numDates; ++j) {
objective.setCoefficient(match[i][j], 1);
}
}
objective.setMaximization();
// [END objective]
// Solve
// [START solve]
MPSolver.ResultStatus resultStatus = solver.solve();
// [END solve]
// Print solution.
// [START print_solution]
// Check that the problem has a feasible solution.
if (resultStatus == MPSolver.ResultStatus.OPTIMAL || resultStatus == MPSolver.ResultStatus.FEASIBLE) {
System.out.println("Total matches: " + objective.value() + "\n");
for (int i = 0; i < numGames; ++i) {
for (int j = 0; j < numDates; ++j) {
// Test if x[i][j] is 0 or 1 (with tolerance for floating point
// arithmetic).
if (match[i][j].solutionValue() > 0.5) {
System.out.println("Game " + i + " assigned to date " + j);
}
}
}
} else {
System.err.println("No solution found.");
}
// [END print_solution]
}
// private GameMatching() {
// }
}
// [END program]

Borrowing a page from the facility location playbook, make a new array of 0-1 variables canHaveMatches[j], add constraints match[i][j] <= canHaveMatches[j], minimize sum_j canHaveMatches[j].

ECLiPSe CLP : slow combined occurrence/3 constraint behaviour

As a subset of a larger problem, I'm trying to write the 2 following constraints for an NxN board (containing N² cells):
Each row/col contains exactly N occurrences of integer K given by pre-defined hints
No 2x2 block (anywhere on the board) contains more than 1 occurrence of integer K
On the board, several cells will already be filled in on beforehand and should be ignored for the constraints in this SO question, therefore we use integer 2 to represent these cells and model the unknown cells to have a finite domain of binary boolean values:
model(Board,N,Hints) :-
dim(Board,[N,N]),
( foreach(Row-Col,Hints), param(Board)
do
2 is Board[Row,Col]
),
( multifor([I,J],1,N), param(Board)
do
Cell is Board[I,J],
( var(Cell) -> Cell :: 0..1 ; true )
).
The constraints in code respectively:
hint_constraints(Board,N,RowHints,ColHints) :-
( for(I,1,N), foreach(RH,RowHints), foreach(CH,ColHints), param(Board,N)
do
Row is Board[I,1..N],
Col is Board[1..N,I],
ic_global:occurrences(1,Row,RH), % Here, K=1 and N=RH
ic_global:occurrences(1,Col,CH) % Here, K=1 and N=CH
).
block_constraints(Board,N) :-
( multifor([I,J],1,(N-1)), param(Board)
do
Block is Board[I..I+1,J..J+1],
flatten(Block,BlockFlat),
Sum #:: [0,1],
ic_global:occurrences(1,BlockFlat,Sum) % Here, K=1
).
For a simple execution of a puzzle:
solve(BT) :-
puzzle(N,_,RowHints,ColHints,Hints),
model(N,RowHints,ColHints,Hints,Board),
hint_constraints(Board,N,RowHints,ColHints),
block_constraints(Board,N),
once search(Board,0,most_constrained,indomain_max,complete,[backtrack(BT)]).
For the 8x8 puzzle, the first solution is found almost instantly:
?- solve(BT).
[](0, 0, 0, 0, 0, 0, 1, 2)
[](2, 1, 0, 2, 1, 0, 0, 2)
[](0, 0, 0, 0, 0, 0, 1, 0)
[](0, 0, 0, 1, 0, 0, 0, 0)
[](1, 0, 0, 0, 2, 0, 0, 0)
[](2, 2, 1, 0, 1, 2, 1, 2)
[](1, 2, 0, 2, 0, 0, 2, 0)
[](0, 0, 0, 0, 1, 0, 0, 1)
BT = 0
Yes (0.01s cpu)
for the 20x20 instance however, I have left it running for around 5 minutes without getting any result.
To investigate whether one constraint would be significantly more costly than the other, I ran both of them separately:
When we use hint_constraints/4, but not block_constraints/2, we get:
?- solve(BT).
[](1, 1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0)
[](1, 1, 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0)
[](2, 1, 1, 1, 2, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0)
[](1, 1, 0, 0, 0, 0, 0, 2, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2)
[](1, 0, 1, 1, 1, 2, 1, 1, 2, 1, 0, 0, 0, 0, 2, 2, 0, 0, 2, 0)
[](2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0)
[](2, 0, 0, 0, 1, 2, 1, 1, 1, 1, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0)
[](1, 0, 0, 0, 2, 1, 1, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0)
[](2, 0, 0, 0, 1, 1, 0, 1, 1, 2, 1, 0, 2, 0, 2, 0, 2, 0, 0, 2)
[](2, 0, 0, 0, 1, 0, 2, 1, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0, 2, 0)
[](0, 0, 0, 0, 2, 0, 2, 0, 0, 1, 2, 1, 2, 1, 1, 0, 0, 1, 0, 2)
[](0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 2, 0, 0)
[](0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 1, 1, 1, 0, 1, 0, 2, 0, 0)
[](2, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1)
[](0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 1, 0, 1, 0, 1)
[](0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 2, 2, 2, 1)
[](0, 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 1, 1, 1)
[](0, 0, 2, 0, 0, 0, 0, 0, 2, 2, 0, 2, 0, 1, 0, 0, 1, 2, 1, 1)
[](2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2)
[](0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 2, 2, 0, 1, 2, 1, 1, 1, 2, 1)
BT = 0
Yes (0.04s cpu)
and can verify that all row/col occurrences are satisfied. The other way around, when we use block_constraints/2, but not hint_constraints/2:
?- solve(BT).
[](0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0)
[](0, 1, 2, 1, 0, 2, 1, 2, 1, 0, 1, 0, 1, 2, 1, 0, 1, 2, 2, 0)
[](2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0)
[](0, 1, 0, 1, 0, 1, 0, 2, 2, 1, 2, 1, 0, 1, 0, 1, 0, 0, 2, 2)
[](0, 0, 0, 0, 0, 2, 0, 1, 2, 0, 0, 0, 0, 0, 2, 2, 0, 1, 2, 1)
[](2, 1, 0, 1, 0, 1, 0, 0, 0, 1, 2, 1, 0, 1, 2, 1, 0, 0, 0, 0)
[](2, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 1)
[](0, 1, 0, 1, 2, 1, 0, 0, 0, 2, 1, 0, 1, 0, 2, 1, 0, 2, 0, 0)
[](2, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 2, 0, 2, 0, 2, 1, 0, 2)
[](2, 1, 0, 1, 0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 1, 2, 0, 2, 1)
[](0, 0, 0, 0, 2, 0, 2, 1, 0, 0, 2, 0, 2, 0, 0, 0, 0, 1, 0, 2)
[](0, 1, 0, 1, 2, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 2, 0, 0)
[](0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 1, 0)
[](2, 1, 0, 1, 2, 2, 0, 0, 0, 2, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0)
[](0, 0, 2, 0, 0, 1, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0)
[](0, 1, 0, 1, 0, 2, 0, 0, 0, 1, 0, 1, 2, 1, 0, 1, 2, 2, 2, 0)
[](0, 2, 0, 0, 2, 1, 2, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 1)
[](0, 1, 2, 1, 0, 0, 0, 0, 2, 2, 1, 2, 1, 0, 1, 0, 0, 2, 0, 0)
[](2, 0, 0, 0, 0, 2, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 2)
[](0, 1, 2, 1, 0, 0, 0, 0, 2, 0, 2, 2, 1, 0, 2, 0, 0, 0, 2, 0)
BT = 0
Yes (0.01s cpu)
we can once more verify that the 2x2 block constraint successfully holds. Unfortunately, when using both constraints together the program seems not to finish anywhere within 5 minutes. I'm a bit confused by this behaviour since both constraints separately appear to work very fast. Though I understand that a lot of checks will have to occur internally to make sure the correct occurrences for each row/col are present while still satisfying the block constraint throughout the process, the fact that it takes over 5 minutes made me think something else must be wrong with the way I have written the block constraint.
Does anyone have an idea on how to optimise my implementation of the block occurrence constraint?
Thanks in advance!
Puzzle instances
puzzle(8, easy, [1,2,1,1,1,3,1,2],
[2,1,1,1,3,0,3,1],
[1-8, 2-1, 2-4, 2-8, 5-5, 6-1, 6-2, 6-6, 6-8, 7-2, 7-4, 7-7]).
puzzle(20,medium,[5,2,6,2,7,1,6,3,5,4,5,3,4,2,4,3,5,4,4,5],
[5,4,3,3,6,3,4,5,2,4,4,4,2,7,1,5,3,6,3,6],
[1-6, 1-15, 2-3, 2-6, 2-8, 2-14, 2-18, 2-19, 3-1, 3-5, 3-11, 4-8, 4-9, 4-11, 4-19, 4-20,
5-6, 5-9, 5-15, 5-16, 5-19, 6-1, 6-11, 6-15, 7-1, 7-6, 7-15, 8-5, 8-10, 8-15, 8-18,
9-1, 9-10, 9-13, 9-15, 9-17, 9-20, 10-1, 10-7, 10-17, 10-19, 11-5, 11-7, 11-11, 11-13, 11-20,
12-5, 12-18, 13-6, 13-11, 13-18, 14-1, 14-5, 14-6, 14-10, 15-3, 15-12, 16-6, 16-13, 16-17, 16-18, 16-19,
17-2, 17-5, 17-7, 17-15, 18-3, 18-9, 18-10, 18-12, 18-18, 19-1, 19-6, 19-20, 20-3, 20-9, 20-11, 20-12, 20-15, 20-19]).

Matrix columns permutation with cublas

I have an input matrix A of size 10x20, I want to permute its columns as follows:
p=[1 4 2 3 5 11 7 13 6 12 8 14 17 9 15 18 10 16 19 20] ;%rearrange the columns of A
A=A(:,p);
To do so, I constructed a permutation matrix I corresponding to the permutation vector p and permuted A can be obtained by performing the following multiplication:
A=A*I
I tested the permutation in Matlab and everything is ok. Now, I want to test it in cuda using cublas.
The input matrix A is entered in column major. The permuation matrix I in column major as well. The following code is to simply test the permutation:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cublas_v2.h>
#define cudacall(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) \
{ \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
\
} \
while(0)
__global__ void sgemm_kernel(float *A_d, float *I_d)
{
int m=10,n=20,k=20;
int lda=k, ldb=k;
cublasHandle_t hdl;
cublasStatus_t status = cublasCreate_v2(&hdl);
const float alpha=1.0F, beta=0.0f;
status=cublasSgemm(hdl,CUBLAS_OP_N,CUBLAS_OP_N,k,n,k,&alpha,A_d,lda,I_d,ldb,&beta,A_d,lda);
}
int main(int argc, char* argv[])
{float A[10*20]={-0.0614, -0.0199, 0.0024, -0.0414, 0.1736, -0.0595, -0.2794, 0.1946, -0.0647, -0.0025,
-0.0036, 0.0628, -0.0827, 0.3679, -0.1913, 0.0500, -0.0245, 0.3855, -0.1298, -0.0334,
-0.0241, -0.0564, 0.0098, -0.2862, -0.0474, 0.0333, -0.3049, 0.2851, -0.1242, 0.0162,
0.0241, 0.0270, -0.0670, 0.3129, -0.2428, 0.0947, -0.1878, 0.0889, -0.0208, 0.0075,
-0.1559, 0.1437, -0.1916, 0.2297, -0.0833, -0.1805, 0.2522, -0.1738, 0.1027, -0.1273,
0.0716, 0.1882, -0.0963, 0.1081, 0.0958, -0.0713, 0.1931, 0.0874, -0.4186, 0.0345,
-0.1912, 0.0501, -0.1396, -0.0989, -0.0338, 0.1773, 0.1088, 0.0389, -0.0117, 0.0014,
0.1648, -0.1705, -0.0575, -0.0133, -0.0570, 0.2124, -0.0193, 0.1535, 0.0857, -0.1308,
0.1971, 0.0882, -0.2577, 0.1662, -0.2498, -0.0365, -0.1805, 0.0921, 0.0912, 0.0178,
-0.0379, 0.0080, 0.0572, -0.0067, 0.0591, -0.0136, 0.0471, -0.0163, 0.0082, -0.0338,
-0.2436, 0.1116, 0.0732, -0.0319, 0.0550, 0.2821, 0.0240, 0.0109, -0.0034, 0.1212,
-0.0061, 0.2497, -0.0542, -0.0939, 0.0651, 0.0063, -0.1367, 0.0580, 0.7389, -0.1143,
-0.3786, 0.1288, 0.0001, 0.2604, -0.1094, -0.3624, -0.0184, 0.0538, 0.0329, 0.0040,
0.0603, 0.1422, 0.1037, -0.1846, 0.4046, -0.3738, -0.3487, 0.3846, -0.0849, 0.0135,
-0.1850, 0.3571, -0.0543, -0.0025, -0.2880, 0.0600, 0.2605, -0.0474, 0.0010, -0.0333,
-0.1974, 0.4788, -0.2441, 0.3847, -0.1235, -0.3503, -0.1785, -0.1095, 0.3158, 0.0062,
-0.0509, -0.0502, 0.2154, 0.2237, -0.0671, 0.0377, 0.0519, 0.1530, -0.1675, 0.1856,
-0.0380, -0.0026, 0.4700, 0.0097, -0.2394, 0.0717, -0.2101, 0.2841, -0.1799, -0.0924,
-0.2678, 0.4485, 0.0044, 0.0030, -0.0439, 0.4337, 0.1819, -0.0180, -0.5443, 0.0864,
0.0390, -0.0235, -0.0706, 0.0138, 0.0633, -0.0147, 0.0444, -0.0334, 0.0557, 0.0507}
float I[20*20]={1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
float *A_d, *I_d;
cudacall(cudaMalloc(&A_d,10*20*sizeof( float )));
cudacall(cudaMalloc(&I_d, 20*20*sizeof(float )));
cudacall(cudaMemcpy(A_d, A, 10*20*sizeof(float), cudaMemcpyHostToDevice));
cudacall(cudaMemcpy(I_d, I, 20*20*sizeof(float), cudaMemcpyHostToDevice));
sgemm_kernel<<<1,1>>>(A_d, I_d);
cudacall(cudaDeviceSynchronize());
cudacall(cudaMemcpy(A, A_d, 10*20*sizeof(float), cudaMemcpyDeviceToHost));
cudacall(cudaFree(A_d));
cudacall(cudaFree(I_d));
return 0;
}
I couldn't get a correct result.

CUBLAS doesn't support in-place operations (in fact no parallel BLAS I am aware of supports it). You cannot pass A_d and use it in the multiplication and as the matrix in the operation. You must use a different memory allocation to hold the result.
So
C <- 1*(A * B) + 0*C
is legal, whereas
A <- 1*(A * B) + 0*A
is not.

cublasSgemm is a host function, so it should be called from a function without the __global__ qualifier.

Opencv Filter Performance

I am trying to process a real time video which consists of frames with 2048*2000 resolution.
When i profile my code, i see that the bottleneck is the below code :
filter_gauss->apply(src, proc_img);
cv::filter2D(proc_img, frame_seq[0], -1, kernel[0]);
cv::filter2D(proc_img, frame_seq[1], -1, kernel[1]);
cv::filter2D(proc_img, frame_seq[2], -1, kernel[2]);
cv::filter2D(proc_img, frame_seq[3], -1, kernel[3]);
cv::threshold(frame_seq[0], frame_seq[0], threshold, 255, cv::THRESH_BINARY);
cv::threshold(frame_seq[1], frame_seq[1], threshold, 255, cv::THRESH_BINARY);
cv::threshold(frame_seq[2], frame_seq[2], threshold, 255, cv::THRESH_BINARY);
cv::threshold(frame_seq[3], frame_seq[3], threshold, 255, cv::THRESH_BINARY);
proc_img = frame_seq[0] | frame_seq[1] | frame_seq[2] | frame_seq[3];
Is there any possible ways to improve this part of code in terms of performance ?
The kernels used above are like these :
kernel[0]
-1, -1, 4, -1, -1
kernel[1]
-1, -1, 4, -1, -1
kernel[2]
-1, 0, 0, 0, 0, 0, 0,
0, -1, 0, 0, 0, 0, 0,
0, 0, -1, 0, 0, 0, 0,
0, 0, 0, 6 , 0, 0, 0,
0, 0, 0, 0, -1, 0, 0,
0, 0, 0, 0, 0, -1, 0,
0, 0, 0, 0, 0, 0, -1
kernel[3]
0, 0, 0, 0, 0, 0, -1,
0, 0, 0, 0, 0, -1, 0,
0, 0, 0, 0, -1, 0, 0,
0, 0, 0, 6 , 0, 0, 0,
0, 0, -1, 0, 0, 0, 0,
0, -1, 0, 0, 0, 0, 0,
-1, 0, 0, 0, 0, 0, 0
filter_gauss
cv::Mat gaussian_kernel = cv::getGaussianKernel(7, 5.0);
cv::Ptr<cv::FilterEngine> filter_gauss = cv::createSeparableLinearFilter(CV_8UC1, CV_8UC1, gaussian_kernel, gaussian_kernel);

Get matrix from textfile in Mathematica

What's the easiest way to turn this text file into matrix? It has one row per line, where O means 0 and X means 1

$url = "http://hyperpublic.com/challenge2input.txt";
StringCases[Import[$url, "Lines"], {"O" -> 0, "X" -> 1}]

I first saved that text in a file tmp.txt.
In[180]:= words = ReadList["~danl/tmp.txt", Word];
vals = Map[Characters, words] /. {"O" -> 0, "X" -> 1};
In[182]:= vals[[1]]
Out[182]= {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
Daniel Lichtblau
Wolfram Research

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

Rust performance issue - High complexity code - performance

Related

Integer Linear Program, Bipartite Matching with Constraints How To?

ECLiPSe CLP : slow combined occurrence/3 constraint behaviour

Matrix columns permutation with cublas

Opencv Filter Performance

Get matrix from textfile in Mathematica

Categories

Resources