Parallel Matrix Multiplication in C++ - parallel-processing

I am trying to implement Parallel Multi-threaded Matrix multiplication in C++. The method i follow involves dividing Arrays into 4 sub-arrays and carry out parallel Multiplication using 4 threads on these 4 sub arrays.
I have written a C++ code but it is throwing error and terminates explicitly. Error :
"terminate called after throwing an instance of std::system_error
what():invalid Argument"
Here is my complete code. I am relatively new to C++ and multi-threading.
#include <iostream>
#include <thread>
#include <mutex>
#include <vector>
#include <algorithm>
#include <string>
#define N 4
using namespace std;
mutex mu;
void stage_1_multiply(int *a,int *b,int *d){
int *xij;
int *yij;
int *zij;
int COLS = N,ROWS = N;
cout<< " thread "<< this_thread::get_id() << " "<<endl;
for(int i = 0;i<(N/2);++i){
for(int j = 0;j < (N/2); j++){
for(int k = 0; k<(N/2);k++){
mu.lock();
xij = a + ((COLS * i) + k);
yij = b + ((COLS * k) + j);
zij = d + ((COLS * i) + j);
*zij += ( (*xij) * (*yij) );
mu.unlock();
}
}
}
}
int main(){
int A[4][4],B[4][4],C[4][4],D_1[4][4],D_2[4][4];
for(int i = 0;i<4;i++){
for(int j = 0;j<4;j++){
A[i][j] = i + 1;
B[i][j] = i + 1;
C[i][j] = 0;
D_1[i][j] = 0;
D_2[i][j] = 0;
}
}
for(int i = 0;i<4;i++){
for(int j = 0;j< 4;j++){
cout << A[i][j] << " ";
}
cout << endl;
}
for(int i = 0;i<4;i++){
for(int j = 0;j< 4;j++){
cout << B[i][j] << " ";
}
cout << endl;
}
vector< thread> threads(8);
int th = 0;
threads[th++] = thread(stage_1_multiply,&A[0][0],&B[0][0],&D_1[0][0]);
threads[th++] = thread(stage_1_multiply,&A[0][2],&B[2][0],&D_2[0][0]);
threads[th++] = thread(stage_1_multiply,&A[2][0],&B[0][2],&D_1[2][2]);
threads[th++] = thread(stage_1_multiply,&A[2][2],&B[2][2],&D_2[2][2]);
for( auto& t : threads){
t.join();
}
threads[th++] = thread(stage_1_multiply,&A[0][0],&B[0][2],&D_1[0][2]);
threads[th++] = thread(stage_1_multiply,&A[0][2],&B[2][2],&D_2[0][2]);
threads[th++] = thread(stage_1_multiply,&A[2][0],&B[0][0],&D_1[2][0]);
threads[th++] = thread(stage_1_multiply,&A[2][2],&B[2][0],&D_2[2][0]);
for( auto& t : threads){
t.join();
}
// code to add The Matrices D_1 and D_2 goes here.
for(int i = 0;i<4;i++){
for(int j = 0;j< 4;j++){
cout << D_1[i][j] << " ";
}
cout << endl;
}
cout << " Main Close "<<endl;
return 0;
}
What am doing wrong? is it anything related to parallel access of shared memory? If so how can i correct it?
PS: This is a homework Assignment.

Related

Why does my won't my code populate the matrix?

My code is trying to find the beginning and ending indices of a section of a matrix that when added, together would equal 20. For each instance this occurs, it would then populate a matrix with said beginning and end indices in the format {beginning row index, beginning column index, ending row index, ending column index} for each row. Each row would represent separate instances. It works fine for one instance but when introduced to other instances it wouldn't populate the matrix. Please help.
#include <cstddef> // size_t
#include <iostream>
using namespace std;
// Populates matrix
void filler(int bIndr, int bIndc, int eIndr, int eIndc, size_t**matrix, const size_t kIndices_size2, const size_t kIndices_size) {
int t = 0;
int matrix2[4] = {0,0,0,0};
for(int i = 0 ; i < kIndices_size2; i++) {
for (int j = 0; j < 2; j++) {
for (int ii = t; ii < kIndices_size; ii++) {
if(j == 0) {
matrix2[ii] = bIndr;
matrix2[ii+1] = bIndc;
cout << matrix2[ii+1] << endl;
break;
}
if(j == 1) {
matrix2[ii] = eIndr;
matrix2[ii+1] = eIndc;
cout << matrix2[ii+1] << endl;
break;
}
}
t = 2;
}
}
for(int i = 0 ; i < kIndices_size; i++) {
matrix[kIndices_size2-1][i] = matrix2[i];
}
}
int main()
{
int goal = 20;
int array[2][8] = {{10,0,0,10,0,0,1,0},{0,0,10,0,0,0,10,0}};
int inst = 0;
int t=0;
int bIndr = 0;
int bIndc = 0;
int eIndr = 0;
int eIndc = 0;
const size_t kIndices_size = 4;
size_t**matrix;
for(int ii = 0; ii < 2; ii++) {
bIndc =0;
for(int j = bIndc; j < 8; j++) {
t = 0;
bIndr = ii;
bIndc = j;
for(int i = j; i < 8; i++) {
t += array[ii][i];
if((goal-t) == 0) {
inst++;
eIndc = i;
eIndr = ii;
matrix=new size_t*[inst];
matrix[inst-1]=new size_t [kIndices_size];
cout << bIndr << bIndc << eIndr << eIndc << endl;
filler(bIndr, bIndc, eIndr, eIndc, matrix, inst, kIndices_size);
break;
}
}
}
}
size_t actual_size = static_cast<size_t>(-1);
cout << actual_size << endl;
size_t* sums_found = &actual_size;
*sums_found = inst;
cout << actual_size << endl;
cout << matrix[0][0] << endl;
for(int i = 0; i < inst; i++) {
for(int ii = 0; ii < kIndices_size; ii++) {
cout << matrix[i][ii] << " ";
}
cout << endl;
}
return 0;
}

Access an matrix as its tranpose in tiled matrix mutliplication in CUDA

I'm currently experimenting with CUDA and i came across this kernel from an answer for matrix multiplication: https://stackoverflow.com/a/18856054/7867026
I want instead of doing A*B to do A_Transpose*A but without saving A_Transpose (only matrix A as an input to kernel). I have to properly set the indexes but I'm confused by this matrix representation. Any help would be appreciated.
most of what you need is here and here.
In the first link it is identified that AxAT involves taking inner products of rows of matrix A, and similarly ATxA will involve taking inner products of columns of matrix A. Also note the symmetry statement. In the second link (scroll down from that point a bit in the programming guide) you will find a complete tiled matrix multiply. You just need to index into both tiles by column.
Here is a worked example, using the code from the SO answer you linked:
$ cat t1654.cu
#include <iostream>
#include <cstdio>
#include <cstdlib>
const int TILE_DIM = 32;
template <typename T>
__global__ void ATA(const T * __restrict__ A, T * __restrict__ C, int ARows, int ACols)
{
T CValue = 0;
int Row = blockIdx.y*TILE_DIM + threadIdx.y;
int Col = blockIdx.x*TILE_DIM + threadIdx.x;
__shared__ T As[TILE_DIM][TILE_DIM];
__shared__ T Bs[TILE_DIM][TILE_DIM];
for (int k = 0; k < (TILE_DIM + ARows - 1)/TILE_DIM; k++) {
if (k*TILE_DIM + threadIdx.y < ARows && blockIdx.y*blockDim.y+threadIdx.x < ACols)
As[threadIdx.y][threadIdx.x] = A[(k*TILE_DIM + threadIdx.y)*ACols + blockIdx.y*blockDim.y+threadIdx.x];
else
As[threadIdx.y][threadIdx.x] = 0.0;
if (k*TILE_DIM + threadIdx.y < ARows && Col < ACols)
Bs[threadIdx.y][threadIdx.x] = A[(k*TILE_DIM + threadIdx.y)*ACols + Col];
else
Bs[threadIdx.y][threadIdx.x] = 0.0;
__syncthreads();
for (int n = 0; n < TILE_DIM; ++n)
CValue += As[n][threadIdx.y] * Bs[n][threadIdx.x];
__syncthreads();
}
if (Row < ACols && Col < ACols)
C[((blockIdx.y * blockDim.y + threadIdx.y)*ACols) +
(blockIdx.x * blockDim.x)+ threadIdx.x] = CValue;
}
template <typename T>
__global__ void transpose_naive(const T * __restrict__ in, T * __restrict__ out, const int dim){
int col = threadIdx.x+blockDim.x*blockIdx.x;
int row = threadIdx.y+blockDim.y*blockIdx.y;
if ((col < dim) && (row < dim)) out[col*dim+row] = in[row*dim+col];
}
template <typename T>
__global__ void mm_naive(const T * __restrict__ A, const T * __restrict__ B, T * __restrict__ C, const int rowA, const int colA, const int colB){
int col = threadIdx.x+blockDim.x*blockIdx.x;
int row = threadIdx.y+blockDim.y*blockIdx.y;
if ((row < rowA) && (col < colB)){
T Cval = 0;
for (int i = 0; i < colA; i++) Cval += A[row*colA+i]*B[i*colB+col];
C[row*colB+col] = Cval;}
}
typedef float mt;
int main(){
mt *d_A, *d_B, *d_C, *h_A, *h_C, *h_C1;
int m = 64;
int n = 64;
h_A = new mt[m*n];
h_C = new mt[n*n];
h_C1 = new mt[n*n];
cudaMalloc(&d_A, m*n*sizeof(d_A[0]));
cudaMalloc(&d_B, m*n*sizeof(d_A[0]));
cudaMalloc(&d_C, n*n*sizeof(d_C[0]));
// test 1
for (int i = 0; i < m; i++)
for (int j = 0; j < n; j++)
h_A[i*n+j] = (i==j)?1.0f:0.0f;
cudaMemcpy(d_A, h_A, m*n*sizeof(d_A[0]), cudaMemcpyHostToDevice);
dim3 block(TILE_DIM, TILE_DIM);
dim3 grid((n+block.x-1)/block.x, (n+block.y-1)/block.y);
ATA<<<grid,block>>>(d_A, d_C, m, n);
cudaMemcpy(h_C, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
#ifdef DEBUG
for (int i = 0; i < n; i++){
for (int j = 0; j < n; j++)
std::cout << h_C[i*n+j] << " ";
std::cout << std::endl;}
std::cout << std::endl;
#endif
// test 2
for (int i = 0; i < m; i++)
for (int j = 0; j < n; j++)
h_A[i*n+j] = rand()%10;
cudaMemcpy(d_A, h_A, m*n*sizeof(d_A[0]), cudaMemcpyHostToDevice);
ATA<<<grid,block>>>(d_A, d_C, m, n);
cudaMemcpy(h_C, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
#ifdef DEBUG
for (int i = 0; i < n; i++){
for (int j = 0; j < n; j++)
std::cout << h_C[i*n+j] << " ";
std::cout << std::endl;}
std::cout << std::endl;
#endif
transpose_naive<<<grid,block>>>(d_A, d_B, n);
mm_naive<<<grid,block>>>(d_B, d_A, d_C, n, n, n);
cudaMemcpy(h_C1, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
#ifdef DEBUG
for (int i = 0; i < n; i++){
for (int j = 0; j < n; j++)
std::cout << h_C1[i*n+j] << " ";
std::cout << std::endl;}
std::cout << std::endl;
#endif
for (int i = 0; i < n*n; i++) if (h_C[i] != h_C1[i]) {std::cout << "mismatch at: " << i << " was: " << h_C[i] << " should be: " << h_C1[i] << std::endl; return 0;}
}
$ nvcc -o t1654 t1654.cu
$ cuda-memcheck ./t1654
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
Note that loading the Bs tile is identical in both cases. The main changes are in loading the As tile, and also note the indexing change when computing Cvalue. These changes are necessary to index in both cases by column.
There may still be bugs. I have not tested the non-square case, nor have I tested the case where the matrix size is not a multiple of block size. Furthermore I've taken no advantage of the symmetry in the output. However this should help with the indexing.

C++ ScopeTimer doesn't work

I'm trying to implement a timer class which prints the time needed for a given scope. Somehow I can't get it to work properly. My code so far:
Main.cpp:
#include "scopetimer.hpp"
#include <cstdlib>
#include <cmath>
#include <string>
#include <chrono>
#include <iostream>
void work01()
{
double numbers[10000];
for (int i = 0; i < 10000; ++i)
{
numbers[i] = double(std::rand()) / double(RAND_MAX);
}
for (int n = 10000; n > 1; n = n - 1) {
for (int i = 0; i < n - 1; i = i + 1) {
if (numbers[i] > numbers[i + 1]) {
double tmp = numbers[i];
numbers[i] = numbers[i + 1];
numbers[i + 1] = tmp;
}
}
}
}
void work02()
{
int* buf[1024];
for (int i = 2; i < 1024; ++i)
buf[i] = new int[i];
for (int i = 2; i < 1024; ++i)
delete[] buf[i];
}
// counts the number of primes in an interval
int work03(int n0, int n1)
{
int freq = n1 - n0 + 1;
for (int i = n0; i <= n1; ++i)
{
// Have fun: use the alternative iteration direction and see how fast
// it gets!
// for(int j = 2; j < i; ++j)
for (int j = i - 1; j > 1; --j)
{
if (i%j == 0)
{
--freq;
break;
}
}
}
return freq;
}
int main(int, char**)
{
{ ScopeTimer("work01");
work01();
}
{
ScopeTimer("work02");
work02();
}
{
ScopeTimer("work03");
work03(0, 10000);
}
std::cout << std::endl << "Tests" << std::endl << std::endl;
{
clock_t start_(std::clock());
work01();
clock_t end_(std::clock());
std::cout << "Test Timer: " << end_ - start_ << "ns" << std::endl;
}
{
clock_t start_(std::clock());
work02();
clock_t end_(std::clock());
std::cout << "Test Timer: " << end_ - start_ << "ns" << std::endl;
}
{
clock_t start_(std::clock());
work03(0,10000);
clock_t end_(std::clock());
std::cout << "Test Timer: " << end_ - start_ << "ns" << std::endl;
}
system("Pause");
}
scopetimer.cpp
#include "scopetimer.hpp"
#include <cmath>
#include <string>
#include <chrono>
#include <iostream>
ScopeTimer::ScopeTimer(const std::string& name)
:name_(name),
start_(std::clock()) {
}
ScopeTimer::~ScopeTimer() {
double elapsed = (double(std::clock() - start_) / double(CLOCKS_PER_SEC));
std::cout << name_ << ": " << int(elapsed) << "ns" << std::endl;
}
I tested the clock functions outside of ScopeTimer(), which works fine. So the only issues, as far as I can tell, is that I can't get ScopeTimer() to work. It always prints 0ns. I mostly followed the turorial: https://felix.abecassis.me/2011/09/cpp-timer-raii/
Kind regards
In ~ScopeTimer() you print how many complete seconds have passed not how many nanoseconds, while in the second part of main, you print the number of clock ticks, which may or may not be the same as a nanosecond.
I came across the same problem
The solution for me is to define a ScopeTimer instance instead of just call its constructor, I mean:
{
ScopeTimer _scopetimer("work01");
work01();
}
That should work
I guess compiler seems to ignore (optimize) that, 'cause when you just call ScopeTimer("work01").

C++ : Why it dont work?

I was trying to solve
In which you have given a array of int and you have to return its sum.
#include <cmath>
#include <cstdio>
#include <vector>
#include <iostream>
#include <algorithm>
#include <cstdlib>
using namespace std;
int main(){
int n;
cin >> n;
int k;
vector<int> arr(n);
for(int arr_i = 0;arr_i < n;arr_i++){
cin >> arr[arr_i];
k = k + arr[arr_i];
//cout << "arr = " << arr[arr_i] << " k " << k << endl; // [0]
if (arr_i == (n-1))
{
cout << k;
}
}
return 0;
}
This return a afkard no. instead of sum.
But when uncomment out [0] line. code starts working as it should.
P.S. i found out the solution by changing cout to cerr.
but wanted to know why it doesn't work.
as the other answer, initialize k, move if outside of loop
vector<int> arr(n);
int k = 0;
for(int arr_i = 0;arr_i < n;arr_i++){
cin >> arr[arr_i];
k = k + arr[arr_i];
//cout << "arr = " << arr[arr_i] << " k " << k << endl; // [0]
}
cout << k;
as your question, there is no need std::vector anymore
int sum = 0, num;
for(int arr_i = 0;arr_i < n;arr_i++){
cin >> num;
sum = sum + num;
}
std::cout << sum << std::endl;
Initialize k before you use, else it will contain junk value int k = 0;
As the other answers (+1) suggested, you need to initialize k. However, I think it is slightly better to assign it the first value initially and reduce the number of iterations therefore:
vector<int> arr(n);
if (n > 0) {
int k = arr[0];
for(int arr_i = 1;arr_i < n;arr_i++){
//Your cycle
}
}

Why am I keep getting WA for this solution for SPOJ FISHER?

I am trying to solve this problem, I think I have come up with a correct answer, but I am keep getting WA (wrong answer) response from the judge.
http://www.spoj.com/problems/FISHER/
The problem distilled, is, given a complete graph with a time and a toll associated with each edge, find a path from the first node to the last node within time constraint and minimize toll.
As with any problems, there are many ways to solve it. My idea is to extend the Floyd-Warshall algorithm to keep track of all non-dominated paths. At the end of the algorithm, we extract the path with minimal cost, and if there are multiple paths with the same cost, choose the one that spent least time.
Complexity aside, the bad thing is, wrong answer. I have no idea what is wrong. I have generated some random graphs and used a brute force solver (one that try all possible paths) and they matches exactly on small (i.e. less than 11 nodes) graphs. Without further ado, here is the code:
#include "stdafx.h"
// http://www.spoj.com/problems/FISHER/
// #define LOG
#include <iostream>
#include <vector>
#include <map>
#include <algorithm>
using namespace std;
int main()
{
while (true)
{
int num_cities;
int time_budget;
vector<vector<int> > distances;
vector<vector<int> > tolls;
cin >> num_cities;
cin >> time_budget;
if (num_cities == 0 && time_budget == 0)
{
break;
}
distances.resize(num_cities);
tolls.resize(num_cities);
for (int i = 0; i < num_cities; i++)
{
distances[i].resize(num_cities);
tolls[i].resize(num_cities);
}
for (int i = 0; i < num_cities; i++)
{
for (int j = 0; j < num_cities; j++)
{
int distance;
cin >> distance;
distances[i][j] = distance;
}
}
for (int i = 0; i < num_cities; i++)
{
for (int j = 0; j < num_cities; j++)
{
int toll;
cin >> toll;
tolls[i][j] = toll;
}
}
// Try Floyd Warshall
// Denote the set of shortest paths from i to j going through {0,1,...k - 1} be shortest_paths[i][j][k],
// It is a set of shortest paths because there can be multiple shortest paths with different time used.
// We should record if using longer time can lead to lower cost, or similarly higher cost but less time
// The first element in the pair is the cost, the second element in the pair is time used
vector<vector<vector<vector<pair<int, int> > > > > shortest_paths;
shortest_paths.resize(num_cities);
for (int i = 0; i < num_cities; i++)
{
shortest_paths[i].resize(num_cities);
for (int j = 0; j < num_cities; j++)
{
shortest_paths[i][j].resize(num_cities + 1);
}
}
// Initialization - there is only one path without going through any node
#ifdef LOG
cout << "k = " << 0 << endl;
cout << "<table border='1'>" << endl;
#endif
for (int i = 0; i < num_cities; i++)
{
#ifdef LOG
cout << "<tr>" << endl;
#endif
for (int j = 0; j < num_cities; j++)
{
#ifdef LOG
cout << "<td>(" << tolls[i][j] << ", " << distances[i][j] << ")</td>";
#endif
shortest_paths[i][j][0].push_back(pair<int, int>(tolls[i][j], distances[i][j]));
}
#ifdef LOG
cout << "</tr>" << endl;
#endif
}
#ifdef LOG
cout << "</table>" << endl;
#endif
// Iteration - the shortest path
for (int k = 1; k <= num_cities; k++)
{
#ifdef LOG
cout << "k = " << k << endl;
cout << "<table border='1'>" << endl;
#endif
for (int i = 0; i < num_cities; i++)
{
#ifdef LOG
cout << "<tr>";
#endif
for (int j = 0; j < num_cities; j++)
{
// Step 1: Generate all candidate shortest paths
map<pair<int, int>, bool> candidates;
for (vector<pair<int, int> >::iterator pi = shortest_paths[i][j][k - 1].begin(); pi != shortest_paths[i][j][k - 1].end(); pi++)
{
candidates.insert(pair<pair<int, int>, bool>(*pi, false));
}
for (vector<pair<int, int> >::iterator fi = shortest_paths[i][k - 1][k - 1].begin(); fi != shortest_paths[i][k - 1][k - 1].end(); fi++)
{
for (vector<pair<int, int> >::iterator si = shortest_paths[k - 1][j][k - 1].begin(); si != shortest_paths[k - 1][j][k - 1].end(); si++)
{
int first_path_cost = fi->first;
int first_path_time_used = fi->second;
int second_path_cost = si->first;
int second_path_time_used = si->second;
int new_path_cost = first_path_cost + second_path_cost;
int new_path_time_used = first_path_time_used + second_path_time_used;
if (new_path_time_used <= time_budget)
{
candidates.insert(pair<pair<int, int>, bool>(pair<int, int>(new_path_cost, new_path_time_used), false));
}
}
}
vector<pair<pair<int, int>, bool> > candidates_list;
for (map<pair<int,int>, bool>::iterator ci = candidates.begin(); ci != candidates.end(); ci++)
{
candidates_list.push_back(*ci);
}
// Eliminate the bad ones
for (unsigned int p = 0; p < candidates_list.size(); p++)
{
for (unsigned int q = 0; q < candidates_list.size(); q++)
{
if (p != q)
{
int first_path_cost = candidates_list[p].first.first;
int first_path_time_used = candidates_list[p].first.second;
int second_path_cost = candidates_list[q].first.first;
int second_path_time_used = candidates_list[q].first.second;
// First take less time and less cost than second, second is eliminated
if (first_path_time_used <= second_path_time_used && first_path_cost <= second_path_cost)
{
candidates_list[q].second = true;
}
}
}
}
#ifdef LOG
cout << "<td>";
#endif
for (unsigned int p = 0; p < candidates_list.size(); p++)
{
if (candidates_list[p].second == false)
{
#ifdef LOG
cout << "(" << candidates_list[p].first.first << ", " << candidates_list[p].first.second << ")<br>";
#endif
shortest_paths[i][j][k].push_back(candidates_list[p].first);
}
}
#ifdef LOG
cout << "</td>";
#endif
}
#ifdef LOG
cout << "</tr>" << endl;;
#endif
}
#ifdef LOG
cout << "</table>" << endl;
#endif
}
bool first = true;
int best_cost = -1;
int best_cost_time = -1;
for (vector<pair<int, int> >::iterator pi = shortest_paths[0][num_cities - 1][num_cities].begin(); pi != shortest_paths[0][num_cities - 1][num_cities].end(); pi++)
{
if (first)
{
best_cost = pi->first;
best_cost_time = pi->second;
first = false;
}
else
{
if (pi->first < best_cost)
{
best_cost = pi->first;
best_cost_time = pi->second;
}
if (pi->first == best_cost && pi->second < best_cost_time)
{
best_cost_time = pi->second;
}
}
}
cout << best_cost << " " << best_cost_time << endl;
}
return 0;
}
/*
4 7
0 5 2 3
5 0 2 3
3 1 0 2
3 3 2 0
0 2 2 7
2 0 1 2
2 2 0 5
7 2 5 0
0 0
*/
Turn on the LOG you will be able to see the Floyd Warshall table for each iteration, each cell has set of a (cost, time) pair. They are supposed to be the cost/time pairs of all non-dominated paths.
I would really appreciate if someone can tell me what's wrong. Thanks a lot in advance!
Try this test:
4 10
0 1 1 1000
1 0 1 1
1 1 0 1
1000 1 1 0
0 1 1 1
1 0 1 1
1 1 0 1
1 1 1 0
Basically you need to ensure distances[i][j] <= time_budget before
shortest_paths[i][j][0].push_back(pair<int, int>(tolls[i][j], distances[i][j]));

Resources