Optimizing N-queen with openmp - parallel-processing

I am learning OPENMP and wrote the following code to solve nqueens problem.
//Full Code: https://github.com/Shafaet/Codes/blob/master/OPENMP/Parallel%20N- Queen%20problem.cpp
int n;
int call(int col,int rowmask,int dia1,int dia2)
{
if(col==n)
{
return 1;
}
int row,ans=0;
for(row=0;row<n;row++)
{
if(!(rowmask & (1<<row)) & !(dia1 & (1<<(row+col))) & !(dia2 & (1<<((row+n-1)-col))))
{
ans+=call(col+1,rowmask|1<<row,dia1|(1<<(row+col)), dia2|(1<<((row+n-1)-col)));
}
}
return ans;
}
double parallel()
{
double st=omp_get_wtime();
int ans=0;
int i;
int rowmask=0,dia1=0,dia2=0;
#pragma omp parallel for reduction(+:ans) shared(i,rowmask)
for(i=0;i<n;i++)
{
rowmask=0;
dia1=0,dia2=0;
int col=0,row=i;
ans+=call(1,rowmask|1<<row,dia1|(1<<(row+col)), dia2|(1<<((row+n-1)-col)));
}
printf("Found %d configuration for n=%d\n",ans,n);
double en=omp_get_wtime();
printf("Time taken using openmp %lf\n",en-st);
return en-st;
}
double serial()
{
double st=omp_get_wtime();
int ans=0;
int i;
int rowmask=0,dia1=0,dia2=0;
for(i=0;i<n;i++)
{
rowmask=0;
dia1=0,dia2=0;
int col=0,row=i;
ans+=call(1,rowmask|1<<row,dia1|(1<<(row+col)), dia2|(1<<((row+n-1)-col)));
}
printf("Found %d configuration for n=%d\n",ans,n);
double en=omp_get_wtime();
printf("Time taken without openmp %lf\n",en-st);
return en-st;
}
int main()
{
double average=0;
int count=0;
for(int i=2;i<=13;i++)
{
count++;
n=i;
double stime=serial();
double ptime=parallel();
printf("OpenMP is %lf times faster for n=%d\n",stime/ptime,n);
average+=stime/ptime;
puts("===============");
}
printf("On average OpenMP is %lf times faster\n",average/count);
return 0;
}
Parallel code is already faster than normal one but i wonder how can i optimize it more using openmp pragmas. I want to know what i should do for better performance and what i should not do.
Thanks in advance.
(Please dont suggest any optimizations which are non-related to parallel programming)

Your code seems to use classic backtracking N-Queens recursive algorithm, which is not the fastest possible for N-Queens solving, but (due to simplicity) is the most vivid one in terms of practicing with parallelism basics.
That's being said: this is very simple, thus you don't expect it to naturally demonstrate lots of advanced OpenMP means except basic "parallel for" and reduction.
But, as far as you're looking for learning parallelism and probably for more clearness and better learning curve, there is one more (out of many possible) implementation available, which uses the same algorithm but tends to be more readable and vivid from educational perspective:
void setQueen(int queens[], int row, int col) {
//check all previously placed rows for attacks
for(int i=0; i<row; i++) {
// vertical attacks
if (queens[i]==col) {
return;
}
// diagonal attacks
if (abs(queens[i]-col) == (row-i) ) {
return;
}
}
// column is ok, set the queen
queens[row]=col;
if(row==size-1) {
#pragma omp atomic
nrOfSolutions++; //Placed final queen, found a solution
}
else {
// try to fill next row
for(int i=0; i<size; i++) {
setQueen(queens, row+1, i);
}
}
}
//Function to find all solutions for nQueens problem on size x size chessboard.
void solve() {
#pragma omp parallel for
for(int i=0; i<size; i++) {
// try all positions in first row
int * queens = new int[size]; //array representing queens placed on a chess board. Index is row position, value is column.
setQueen(queens, 0, i);
delete[](queens);
}
}
This given code is one of Intel Advisor XE samples (for both C++ and Fortran); the parallelization aspects for given sample are discussed in very detailed manner in Chapter 10 of given Parallel Programming Book (in fact, given chapter just uses N-Queens to demonstrate how to use tools in order to parallelize serial code in general).
Given Advisor n-queens sample uses essentially the same algorithm as yours, but it replaces explicit reduction with combination of simple parallel for + atomic. This code is expected to be less efficient, but more "procedural-style" and more "educational", since it demonstrates "hidden" data race. In case you upload given samplecode, you will actually find 4 equialent N-Queens parallel implementatons using TBB, Cilk Plus and OpenMP (OMP is for C++ and Fortran).

I know I am a little late for the party, but you can use task queueing for further optimization.(about 7-10% faster results).No idea why. Here's the code,that i am using :
#include <iostream> // std::cout, cin, cerr ...
#include <iomanip> // modify std::out
#include <omp.h>
using namespace std;
int nrOfSolutions=0;
int size=0;
void print(int queens[]) {
cerr << "Solution " << nrOfSolutions << endl;
for(int row=0; row<size; row++) {
for(int col=0; col<size; col++) {
if(queens[row]==col) {
cout << "Q";
}
else {
cout << "-";
}
}
cout << endl;
}
}
void setQueen(int queens[], int row, int col, int id) {
for(int i=0; i<row; i++) {
// vertical attacks
if (queens[i]==col) {
return;
}
// diagonal attacks
if (abs(queens[i]-col) == (row-i) ) {
return;
}
}
// column is ok, set the queen
queens[row]=col;
if(row==size-1) {
// only one thread should print allowed to print at a time
{
// increasing the solution counter is not atomic
#pragma omp critical
nrOfSolutions++;
#ifdef _DEBUG
#pragma omp critical
print(queens);
#endif
}
}
else {
// try to fill next row
for(int i=0; i<size; i++) {
setQueen(queens, row+1, i, id);
}
}
}
void solve() {
int myid=0 ;
#pragma omp parallel
#pragma omp single
{
for(int i=0; i<size; i++) {
/*
#ifdef _OMP //(???)
myid = omp_get_thread_num();
#endif
#ifdef _DEBUG
cout << "ThreadNum: " << myid << endl ;
#endif
*/
// try all positions in first row
// create separate array for each recursion
// started here
#pragma omp task
setQueen(new int[size], 0, i, myid);
}
}
}
int main(int argc, char*argv[]) {
if(argc !=2) {
cerr << "Usage: nq-openmp-taskq boardSize.\n";
return 0;
}
size = atoi(argv[1]);
cout << "Starting OpenMP Task Queue solver for size " << size << "...\n";
double st=omp_get_wtime();
solve();
double en=omp_get_wtime();
printf("Time taken using openmp %lf\n",en-st);
cout << "Number of solutions: " << nrOfSolutions << endl;
return 0;
}

Related

How to access map in vector for building trie

#include <string>
#include <iostream>
#include <vector>
#include <map>
using std::map;
using std::vector;
using std::string;
typedef map<char, int> edges;
typedef vector<edges> trie;
trie build_trie(vector<string> & patterns) {
trie t;
// write your code here
for(int j=0;j<patterns.size();j++){
//current NOde = root;
int currNI=0;
for(int i=0;i<patterns[j].size();i++){
char currS = patterns[j][i];
auto it = t[currNI].begin();
//auto mit = it->edges.begin();
if(t[currNI].edges.find(currS)!=t[currNI].edges.end()){
currNI = t[currNI].edges.find(currS)->second;
}else{
t.push_back(edges.insert(currS,t.size()));
t[currNI].edges.insert(currS,t.size());
currNI = t.size();
}
}
}
return t;
}
int main() {
size_t n;
std::cin >> n;
vector<string> patterns;
for (size_t i = 0; i < n; i++) {
string s;
std::cin >> s;
patterns.push_back(s);
}
trie t = build_trie(patterns);
for (size_t i = 0; i < t.size(); ++i) {
for (const auto & j : t[i]) {
std::cout << i << "->" << j.second << ":" << j.first << "\n";
}
}
return 0;
}
Hi I was trying to build trie using vector and map but I am not able to access elements of map.
I have also added image of pseudo code for better clarity.
I have used iterators and many other tricks that are already present on stackoverflow and on other platform but somehow I am not able to access what I want.
Thank You
To answer your questions: in your code, edges is a type, not an object.
t[currNI] is of type edges, thus is a map<char, int>.
You should try t[currNI].find(currS) and t[currNI].end() directly.
NB: after you fix this, there are still other errors in your code.

Matrix Multiplication OpenMP Counter-Intuitive Results

I am currently porting some code over to OpenMP at my place of work. One of the tasks I am doing is figuring out how to speed up matrix multiplication for one of our applications.
The matrices are stored in row-major format, so A[i*cols +j] gives the A_i_j element of the matrix A.
The code looks like this (uncommenting the pragma parallelises the code):
#include <omp.h>
#include <iostream>
#include <iomanip>
#include <stdio.h>
#define NUM_THREADS 8
#define size 500
#define num_iter 10
int main (int argc, char *argv[])
{
// omp_set_num_threads(NUM_THREADS);
int *A = new int [size*size];
int *B = new int [size*size];
int *C = new int [size*size];
for (int i=0; i<size; i++)
{
for (int j=0; j<size; j++)
{
A[i*size+j] = j*1;
B[i*size+j] = i*j+2;
C[i*size+j] = 0;
}
}
double total_time = 0;
double start = 0;
for (int t=0; t<num_iter; t++)
{
start = omp_get_wtime();
int i, k;
// #pragma omp parallel for num_threads(10) private(i, k) collapse(2) schedule(dynamic)
for (int j=0; j<size; j++)
{
for (i=0; i<size; i++)
{
for (k=0; k<size; k++)
{
C[i*size+j] += A[i*size+k] * B[k*size+j];
}
}
}
total_time += omp_get_wtime() - start;
}
std::setprecision(5);
std::cout << total_time/num_iter << std::endl;
delete[] A;
delete[] B;
delete[] C;
return 0;
}
What is confusing me is the following: why is dynamic scheduling faster than static scheduling for this task? Timing the runs and taking an average shows that static scheduling is slower, which to me is a bit counterintuitive since each thread is doing the same amount of work.
Also, am I correctly speeding up my matrix multiplication code?
Parallel matrix multiplication is non-trivial (have you even considered cache-blocking?). Your best bet is likely to be to use a BLAS Library for this, rather than writing it yourself. (Remember, "The best code is the code I do not have to write").
Wikipedia: Basic Linear Algebra Subprograms points to many implementations, a lot of which (including Intel Math Kernel Library) have free licenses.

C++11 app that uses dispatch_apply not working under Mac OS Sierra

I had a completely functioning codebase written in C++11 that used Grand Central Dispatch parallel processing, specifically dispatch_apply to do the basic parallel for loop for some trivial game calculations.
Since upgrading to Sierra, this code still runs, but each block is run in serial -- the cout statement shows that they are being executed in serial order, and CPU usage graph shows no parallel working on.
Queue is defined as:
workQueue = dispatch_queue_create("workQueue", DISPATCH_QUEUE_CONCURRENT);
And the relevant program code is:
case Concurrency::Parallel: {
dispatch_apply(stateMap.size(), workQueue, ^(size_t stateIndex) {
string thisCode = stateCodes[stateIndex];
long thisCount = stateCounts[stateIndex];
GameResult sliceResult = playStateOfCode(thisCode, thisCount);
results[stateIndex] = sliceResult;
if ((stateIndex + 1) % updatePeriod == 0) {
cout << stateIndex << endl;
}
});
break;
}
I strongly suspect that this either a bug, but if this is GCD forcing me to use new C++ methods for this, I'm all ears.
I'm not sure if it is a bug in Sierra or not. But it seems to work if you explicitly associate a global concurrent queue as target:
dispatch_queue_t target =
dispatch_get_global_queue(QOS_CLASS_USER_INITIATED, 0);
dispatch_queue_t workQueue =
dispatch_queue_create_with_target("workQueue", DISPATCH_QUEUE_CONCURRENT, target);
// ^~~~~~~~~~~ ^~~~~~
Here is a working example
#include <iostream>
#include <fstream>
#include <vector>
#include <cmath>
#include <sstream>
#include <dispatch/dispatch.h>
void load_problem(const std::string, std::vector<std::pair<double,double>>&);
int main() {
// n-factor polynomial - test against a given problem provided as a set of space delimited x y values in 2d.txt
std::vector<std::pair<double,double>> problem;
std::vector<double> test = {14.1333177226503,-0.0368874860476915,
0.0909424058436257,2.19080982673558,1.24632025036125,0.0444549880462031,
1.06824631867947,0.551482840616757, 1.04948148731933};
load_problem("weird.txt",problem); //a list of space delimited doubles representing x, y.
size_t a_count = test.size();
dispatch_queue_t queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
__block double diffs = 0.0; //sum of all values..
dispatch_apply(problem.size(), queue, ^(size_t i) {
double g = 0;
for (size_t j=0; j < a_count - 1; j++) {
g += test[j]*pow(problem[i].first,a_count - j - 1);
}
g += test[a_count - 1];
diffs += pow(g - problem[i].second,2);
});
double delta = 1/(1+sqrt(diffs));
std::cout << "test: fit delta: " << delta << std::endl;
}
void load_problem(const std::string file, std::vector<std::pair<double,double>>& repo) {
repo.clear();
std::ifstream ifs(file);
if (ifs.is_open()) {
std::string line;
while(getline(ifs, line)) {
double x= std::nan("");
double y= std::nan("");
std::istringstream istr(line);
istr >> std::skipws >> x >> y;
if (!isnan(x) && !isnan(y)) {
repo.push_back({x, y});
};
}
ifs.close();
}
}

How to iterate a string using while loop in C++?

number = 100010001111111
for (int i=0; number.length(); i++) {
while number[i] == 1 {
k++;
}
}
I would like to implement a while-loop as a replacement for the for-loop as shown above.
How could I convert this to a while-loop?
Here's a solution for the problem you mentioned in your comment (Problem - 96A)
#include <iostream>
using namespace std;
int main()
{
cout << "Please enter your players situation" << endl;
std::string str;
cin >> str;
std::string::size_type i = 0;
int NumbersofAppearances = 0;
int ConsectiveNumberSequence = 7; //You can change that to whatever sequence you like
bool IsDangerous=false;
while (i < str.size())
{
if(str[i]=='1' )
{
++NumbersofAppearances;
//We need to check if we reached the consecutive number or not and save it on a different bool variable
if(NumbersofAppearances>=ConsectiveNumberSequence)
IsDangerous=true;
}
else
{
NumbersofAppearances=0;
}
++i;
}
//print out the end result
if (IsDangerous)
cout <<"YES , this is dangerous"<< endl;
else
cout <<"No, this is not dangerous"<< endl;
return 0;
}
And here's a link to Coding ground

Save state of c++11 random generator without using iostream

What is the best way to store the state of a C++11 random generator without using the iostream interface. I would like to do like the first alternative listed here[1]? However, this approach requires that the object contains the PRNG state and only the PRNG state. In partucular, it fails if the implementation uses the pimpl pattern(at least this is likely to crash the application when reloading the state instead of loading it with bad data), or there are more state variables associated with the PRNG object that does not have to do with the generated sequence.
The size of the object is implementation defined:
g++ (tdm64-1) 4.7.1 gives sizeof(std::mt19937)==2504 but
Ideone http://ideone.com/41vY5j gives 2500
I am missing member functions like
size_t state_size();
const size_t* get_state() const;
void set_state(size_t n_elems,const size_t* state_new);
(1) shall return the size of the random generator state array
(2) shall return a pointer to the state array. The pointer is managed by the PRNG.
(3) shall copy the buffer std::min(n_elems,state_size()) from the buffer pointed to by state_new
This kind of interface allows more flexible state manipulation. Or are there any PRNG:s whose state cannot be represented as an array of unsigned integers?
[1]Faster alternative than using streams to save boost random generator state
I've written a simple (-ish) test for the approach I mentioned in the comments of the OP. It's obviously not battle-tested, but the idea is represented - you should be able to take it from here.
Since the amount of bytes read is so much smaller than if one were to serialize the entire engine, the performance of the two approaches might actually be comparable. Testing this hypothesis, as well as further optimization, are left as an exercise for the reader.
#include <iostream>
#include <random>
#include <chrono>
#include <cstdint>
#include <fstream>
using namespace std;
struct rng_wrap
{
// it would also be advisable to somehow
// store what kind of RNG this is,
// so we don't deserialize an mt19937
// as a linear congruential or something,
// but this example only covers mt19937
uint64_t seed;
uint64_t invoke_count;
mt19937 rng;
typedef mt19937::result_type result_type;
rng_wrap(uint64_t _seed) :
seed(_seed),
invoke_count(0),
rng(_seed)
{}
rng_wrap(istream& in) {
in.read(reinterpret_cast<char*>(&seed), sizeof(seed));
in.read(reinterpret_cast<char*>(&invoke_count), sizeof(invoke_count));
rng = mt19937(seed);
rng.discard(invoke_count);
}
void discard(unsigned long long z) {
rng.discard(z);
invoke_count += z;
}
result_type operator()() {
++invoke_count;
return rng();
}
static constexpr result_type min() {
return mt19937::min();
}
static constexpr result_type max() {
return mt19937::max();
}
};
ostream& operator<<(ostream& out, rng_wrap& wrap)
{
out.write(reinterpret_cast<char*>(&(wrap.seed)), sizeof(wrap.seed));
out.write(reinterpret_cast<char*>(&(wrap.invoke_count)), sizeof(wrap.invoke_count));
return out;
}
istream& operator>>(istream& in, rng_wrap& wrap)
{
wrap = rng_wrap(in);
return in;
}
void test(rng_wrap& rngw, int count, bool quiet=false)
{
uniform_int_distribution<int> integers(0, 9);
uniform_real_distribution<double> doubles(0, 1);
normal_distribution<double> stdnorm(0, 1);
if (quiet) {
for (int i = 0; i < count; ++i)
integers(rngw);
for (int i = 0; i < count; ++i)
doubles(rngw);
for (int i = 0; i < count; ++i)
stdnorm(rngw);
} else {
cout << "Integers:\n";
for (int i = 0; i < count; ++i)
cout << integers(rngw) << " ";
cout << "\n\nDoubles:\n";
for (int i = 0; i < count; ++i)
cout << doubles(rngw) << " ";
cout << "\n\nNormal variates:\n";
for (int i = 0; i < count; ++i)
cout << stdnorm(rngw) << " ";
cout << "\n\n\n";
}
}
int main(int argc, char** argv)
{
rng_wrap rngw(123456790ull);
test(rngw, 10, true); // this is just so we don't start with a "fresh" rng
uint64_t seed1 = rngw.seed;
uint64_t invoke_count1 = rngw.invoke_count;
ofstream outfile("rng", ios::binary);
outfile << rngw;
outfile.close();
cout << "Test 1:\n";
test(rngw, 10); // test 1
ifstream infile("rng", ios::binary);
infile >> rngw;
infile.close();
cout << "Test 2:\n";
test(rngw, 10); // test 2 - should be identical to 1
return 0;
}

Resources