OpenCL Matrix Multiply runs, but answer is always zero - c++11

I am trying to learn/teach myself OpenCL and started with a program to do matrix multiplication. No matter what I do, I end up with an answer of zero.
I know that a 1x3 times a 3x1 should yield a 1x1 answer, and it should be non-zero given the way I create random floats. Here are my main body and the kernel. Other than the warnings, what am I missing? I have been over this for hours and can't see the problem.
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#include <iostream>
#include <fstream>
#include <sstream>
#include "./cl.hpp"

int main()
{
    int nX = 1;
    int nY = 3;
    int nZ = 1;

    // Get all platforms
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    if(platforms.empty()){
        throw std::runtime_error("No Platforms found, check OpenCL installation.");
    }
    cl::Platform platform = platforms[0];
    std::cout << "Using Platform: " << platform.getInfo<CL_PLATFORM_NAME>() << std::endl;

    std::vector<cl::Device> devices;
    platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
    if(devices.empty()){
        throw std::runtime_error("No Devices Found, check installation.");
    }
    cl::Device device = devices[0];

    // Create an execution context
    cl::Context context(device);
    // Create a command queue
    cl::CommandQueue queue(context, device);

    // Load the kernel sources, use global memory
    std::ifstream fs("mCrossProd.cl");
    if(!fs.is_open()) {
        throw std::runtime_error("Can not open kernel source file.");
    }
    std::stringstream ss;
    ss << fs.rdbuf();
    std::string code = ss.str();
    cl::Program::Sources sources;
    sources.push_back({code.c_str(), code.length()});

    // Build the kernel
    cl::Program program(context, sources);
    try{
        program.build({device});
    } catch(std::exception &err){
        throw std::runtime_error(program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
    }

    // Create matrix arrays and fill with random float values
    float *A = new float[nX*nY];
    float *B = new float[nY*nZ];
    float *C = new float[nX*nZ];
    for(int i = 0; i < nX; i++){
        for(int j = 0; j < nY; j++)
        {
            A[j + i*nY] = rand()/(float)RAND_MAX * 10 + 1;
            std::cout << " A[" << std::to_string(j + i*nY) << "] = ";
            std::cout << A[j + i*nY] << ' ';
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;

    for(int i = 0; i < nY; i++){
        for(int j = 0; j < nZ; j++)
        {
            B[j + i*nY] = rand()/(float)RAND_MAX * 10 + 1;
            std::cout << " B[" + std::to_string(j + i*nY) + "] = ";
            std::cout << B[j + i*nY] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;

    // Fill matrix C with random values
    for(int i = 0; i < nX; i++){
        for(int j = 0; j < nZ; j++)
        {
            C[j + i*nX] = rand()/(float)RAND_MAX * 10 + 1;
            std::cout << " C[" + std::to_string(j + i*nX) + "] = ";
            std::cout << B[j + i*nX] << " ";
        }
        std::cout << std::endl;
    }

    // Create data/memory buffers, and enqueue them
    cl::Buffer bufA(context, CL_MEM_READ_ONLY, sizeof(float) * nX * nY);
    cl::Buffer bufB(context, CL_MEM_READ_ONLY, sizeof(float) * nY * nZ);
    cl::Buffer bufC(context, CL_MEM_READ_WRITE, sizeof(float) * nX * nZ);
    queue.enqueueWriteBuffer(bufA, CL_TRUE, 0, sizeof(float) * nX * nY, A);
    queue.enqueueWriteBuffer(bufA, CL_TRUE, 0, sizeof(float) * nY * nZ, B);

    // Select kernel, pass arguments
    cl::Kernel kernel = cl::Kernel(program, "mCrossProd");
    kernel.setArg(0, nX);
    kernel.setArg(1, nY);
    kernel.setArg(2, nZ);
    kernel.setArg(3, bufA);
    kernel.setArg(4, bufB);
    kernel.setArg(5, bufC);

    // Execute the kernel
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(nX, nY), cl::NDRange(16,16));

    // Retrieve results from global memory
    queue.enqueueReadBuffer(bufC, CL_TRUE, 0, sizeof(float) * nX * nZ, C);
    queue.finish();
    fs.close();

    std::cout << "\nThe solution is" << std::endl;
    for(int i = 0; i < nX; i++){
        for(int j = 0; j < nZ; j++)
        {
            std::cout << "C[" + std::to_string(j*nZ+i) + "] = ";
            std::cout << C[j*nZ+i] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
This is my Kernel function:
__kernel void mCrossProd(const int nX, const int nY, const int nZ, __global float* A, __global float* B, __global float* C) {
    int i = get_global_id(0);
    int j = get_global_id(1);
    for(int k = 0; k < nX; k++){
        C[j*nY+i] += A[j*nX+k] * B[k*nY+i];
    }
}

The problem is in the following line of code:
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(nX, nY), cl::NDRange(16,16));
Try this instead:
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(M,N), cl::NDRange(1,1));
The arguments for enqueueNDRangeKernel set the global and local work sizes, i.e. the number of workers/threads running this kernel (suggest reading the docs). The local work-group size must evenly divide the global size in each dimension, and (16,16) does not divide (1,3), so the launch fails. Try playing around with the numbers and benchmark. Also, you should check for errors, which makes debugging easier.
if( queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(M,N), cl::NDRange(1,1)) != CL_SUCCESS ) { throw std::runtime_error("kernel launch failed"); }
By adding this, you will find out that your code currently does not even compute the matrix multiplication and fails to queue up the kernel.
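If you do not want to choose a local size yourself, you can also pass cl::NullRange as the local size and let the OpenCL runtime pick a suitable work-group size. A minimal sketch, using the kernel and queue from above:

queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(M, N), cl::NullRange);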
There is also a bug in the way you are accessing your arrays. You only allocate 3 floats for matrix A and matrix B, but you assign values to indices that are out of bounds (in your B loop, B[j + i*nY] runs up to index 6 even though B only holds 3 floats), e.g. float *A = new float[3]; A[5] = 10.0f;. This is undefined behaviour, which is why it does not necessarily crash, but it is quite dangerous. You also do not free the memory. If you use the new keyword, remember to match it with delete (or delete[] in this case) when you are done using the memory; otherwise you get memory leaks. Alternatively, you can look into using vectors, smart pointers, statically sized arrays, etc., as sketched below.
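For instance, with std::vector the allocation, the size, and the cleanup are all handled for you. A minimal sketch of the host-side matrices, reusing the nX/nY/nZ dimensions, queue, and bufA from the question (illustrative only; needs #include <vector>):

std::vector<float> A(nX * nY), B(nY * nZ), C(nX * nZ);
for (int i = 0; i < nX; i++)
    for (int j = 0; j < nY; j++)
        A[i*nY + j] = rand()/(float)RAND_MAX * 10 + 1;   // element (i,j) of an nX x nY matrix
// .data() exposes the raw pointer OpenCL needs; no delete[] is required,
// the vector frees its own storage when it goes out of scope.
queue.enqueueWriteBuffer(bufA, CL_TRUE, 0, sizeof(float) * A.size(), A.data());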
Also,
try{
    program.build({device});
} catch(std::exception &err){
    throw std::runtime_error(program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
}
should probably be replaced with:
if(program.build({device}) != CL_SUCCESS)
{
    std::cerr << "Failed to compile kernel code" << std::endl;
    exit(1);
}
Or at the very least do catch(...) instead of catching a specific exception. This is because I do not think program.build throws an exception here (the C++ wrapper only throws when exceptions are explicitly enabled). The documentation states that it returns an error code, which is something you should check for.
One last thing: try to make the code simpler so it is easier to debug. For instance, you can try creating a simple kernel that just adds two arrays rather than multiplying matrices, as sketched below.
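Such a kernel could look like this (a minimal sketch; the name vadd is just illustrative, and you would launch it with a 1D NDRange equal to the array length):

__kernel void vadd(__global const float* a, __global const float* b, __global float* out) {
    int i = get_global_id(0);   // one work-item per array element
    out[i] = a[i] + b[i];
}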
Anyway, I have modified your code. Hopefully it makes sense:
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#include <iostream>
#include <fstream>
#include <sstream>
#include <CL/cl.hpp>

int main()
{
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    if(platforms.empty()){
        throw std::runtime_error("No Platforms found, check OpenCL installation.");
    }
    cl::Platform platform = platforms[0];
    std::cout << "Using Platform: " << platform.getInfo<CL_PLATFORM_NAME>() << std::endl;

    std::vector<cl::Device> devices;
    platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
    if(devices.empty()){
        throw std::runtime_error("No Devices Found, check installation.");
    }
    cl::Device device = devices[0];

    // Create an execution context
    cl::Context context(device);

    // Load the kernel sources, use global memory
    std::ifstream fs("mCrossProd.cl");
    if(!fs.is_open()){
        throw std::runtime_error("Cannot open kernel source file.");
    }
    // Extract kernel code
    std::stringstream ss;
    ss << fs.rdbuf();
    auto code = ss.str();
    cl::Program::Sources sources;
    sources.push_back({code.c_str(), code.length()});
    fs.close();

    // Build the kernel
    cl::Program program(context, sources);
    if(program.build({device}) != CL_SUCCESS){
        std::cout << " Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << "\n";
        exit(1);
    }

    // Matrix dimensions: A is M x K, B is K x N, C is M x N
    int M = 4, N = 3, K = 6;
    int A_dims = M * K;
    int B_dims = N * K;
    int C_dims = M * N;

    // Create buffers for the device
    cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(float)*A_dims);
    cl::Buffer buffer_B(context, CL_MEM_READ_WRITE, sizeof(float)*B_dims);
    cl::Buffer buffer_C(context, CL_MEM_READ_WRITE, sizeof(float)*C_dims);

    float A[] = {2.0f, 1.0f, 2.0f, 2.0f, 4.0f, 1.0f,
                 4.0f, 2.0f, 1.0f, 1.0f, 0.0f, 0.0f,
                 3.0f, 2.0f, 5.0f, 1.0f, 1.0f, 1.0f,
                 0.0f, 0.0f, 0.0f, 2.0f, 1.0f, 1.0f};
    float B[] = {3.0f, 2.0f, 4.0f,
                 1.0f, 1.0f, 2.0f,
                 4.0f, 2.0f, 1.0f,
                 0.0f, 0.0f, 1.0f,
                 9.0f, 2.0f, 1.0f,
                 2.0f, 1.0f, 0.0f};
    float C[] = {0.0f, 0.0f, 0.0f,
                 0.0f, 0.0f, 0.0f,
                 0.0f, 0.0f, 0.0f,
                 0.0f, 0.0f, 0.0f};

    cl::CommandQueue queue(context, device);

    // Write arrays A and B to the device; C is written too, so the
    // kernel's += accumulation starts from zero rather than from
    // whatever happened to be in the uninitialized device buffer
    queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(float)*A_dims, A);
    queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(float)*B_dims, B);
    queue.enqueueWriteBuffer(buffer_C, CL_TRUE, 0, sizeof(float)*C_dims, C);

    // Select kernel, pass arguments
    cl::Kernel kernel = cl::Kernel(program, "mCrossProd");
    kernel.setArg(0, M);
    kernel.setArg(1, N);
    kernel.setArg(2, K);
    kernel.setArg(3, buffer_A);
    kernel.setArg(4, buffer_B);
    kernel.setArg(5, buffer_C);

    // Execute kernel
    if( queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(M, N), cl::NDRange(1, 1)) != CL_SUCCESS )
    {
        std::cout << "Failed to launch kernel" << std::endl;
        exit(1);
    }
    queue.finish();

    // Read result C from the device back into array C
    queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(float)*C_dims, C);

    std::cout << sizeof(C) / sizeof(float) << std::endl;
    std::cout << C_dims << std::endl;
    std::cout << M << " " << N << std::endl;
    std::cout << "\nThe solution is" << std::endl;
    for(int i = 0; i < M; i++) {
        for(int j = 0; j < N; j++) {
            std::cout << "C[" + std::to_string(i*N+j) + "] = ";
            std::cout << C[i*N+j] << " ";
        }
        std::cout << std::endl;
    }
}
Kernel source:
__kernel void mCrossProd(const int M, const int N, const int K, __global float* A, __global float* B, __global float* C) {
    int const i = get_global_id(0);
    int const j = get_global_id(1);
    int const debug_elem_id = 3; // purely for debug purposes
    for(int k = 0; k < K; k++){
        C[i*N+j] += A[i*K+k] * B[N*k+j];
        if((i*N+j) == debug_elem_id)
        {
            printf("PROD, %.2f\n", A[i*K+k] * B[N*k+j]);
        }
    }
    if((i*N+j) == debug_elem_id)
    {
        printf("SUM: %.2f\n", C[i*N+j]);
    }
}
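To sanity-check the GPU result, you can also compare it against a plain host-side multiply after the read-back. A minimal sketch using the same row-major layout and the M, N, K and host arrays from the code above (needs <cmath> for fabsf):

for (int i = 0; i < M; i++) {
    for (int j = 0; j < N; j++) {
        float ref = 0.0f;                        // reference value for C[i][j]
        for (int k = 0; k < K; k++)
            ref += A[i*K + k] * B[k*N + j];
        if (fabsf(C[i*N + j] - ref) > 1e-4f)
            std::cout << "mismatch at " << (i*N + j) << std::endl;
    }
}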
Edit: Made corrections to the example code and explanation. Credit to @mogu.

Related

How can I change a 3D Libtorch tensor into a 3D Eigen nested array (or matrix)

I need a 3D Eigen nested array, which can be transferred from a Libtorch tensor. The example code is as follows:
#include <iostream>
#include <torch/torch.h>
#include <Eigen/Dense>

typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXf_rm;
typedef Eigen::Array<MatrixXf_rm, Eigen::Dynamic, 1> DMVector;

int main()
{
    std::cout << "Testing Libtorch to Eigen" << std::endl;
    auto T = torch::rand({3, 3, 2});
    std::cout << "Libtorch:" << std::endl;
    std::cout << T << std::endl;
    DMVector result = DMVector::Constant(3, 1, MatrixXf_rm());
    for (int i = 0; i < 3; i++)
    {
        float* data = T[i].data_ptr<float>();
        Eigen::Map<MatrixXf_rm> E(data, T[i].size(0), T[i].size(1));
        result(i) = E;
    }
    return 0;
}
I can get the right result from the above code.
But I do not want to do it with a for loop (for (int i = 0; i < 3; i++)). Is there a solution that gets the same result as the above code without the for loop?

How can I place n circles randomly inside a rectangle without overlapping?

Suppose I have n circles of radius r. I want to place them randomly inside a rectangle of size AxA.
It is guaranteed that they fit. One can suppose that the sum of the area of all circles is about 60% of the area of the rectangle.
I can try backtracking (trying to place, going back, and so on), but there should be a better way to do it.
One possibility is to generate random points inside the rectangle without further constraints, and then move the points/centres iteratively (in small steps) to avoid overlapping. If two points are too near each other, each point puts pressure on the other to push it away a little. The higher the pressure, the larger the move.
This process was implemented in C++. In the following simple code, to simplify the implementation, points and vectors are represented by the std::complex type.
Note that I used srand and rand for test purposes. You may use better random algorithms, depending on your constraints.
According to the tests that I have performed, convergence seems guaranteed for a density of 60%. I also made some tests with a density of 70%: sometimes convergence, sometimes not.
Complexity is O(n^2 * n_iter), where n is the number of circles and n_iter the number of iterations.
n_iter is generally between 100 and 300 for a density of 60%. It could be decreased by relaxing the convergence criteria.
This may seem like high complexity compared to other proposals in the comments. In practice, for n = 15, the work is performed in less than 30 ms on my PC. That may be huge or fast enough, depending on the context. I have included a figure to illustrate the algorithm.
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <vector>
#include <ctime>
#include <complex>
#include <cmath>
#include <tuple>
#include <ios>
#include <iomanip>

using dcomplex = std::complex<double>;

void print (const std::vector<dcomplex>& centers) {
    std::cout << std::setprecision (9);
    std::cout << "\ncenters:\n";
    for (auto& z: centers) {
        std::cout << real(z) << ", " << imag(z) << "\n";
    }
}

std::tuple<bool, int, double> process (double A, double R, std::vector<dcomplex>& centers, int n_iter_max = 100) {
    bool check = true;
    int n = centers.size();
    std::vector<dcomplex> moves (n, 0.0);
    double acceleration = 1.0001;   // to accelerate the convergence, if density not too large
                                    // could be made dependent on the iteration index
    double dmin;
    auto limit = [&] (dcomplex& z) {
        double zx = real(z);
        double zi = imag(z);
        if (zx < R) zx = R;
        if (zx > A-R) zx = A-R;
        if (zi < R) zi = R;
        if (zi > A-R) zi = A-R;
        return dcomplex(zx, zi);
    };

    int iter;
    for (iter = 0; iter < n_iter_max; ++iter) {
        for (int i = 0; i < n; ++i) moves[i] = 0.0;
        dmin = A;
        for (int i = 0; i < n; ++i) {
            for (int j = i+1; j < n; ++j) {
                auto vect = centers[i] - centers[j];
                double dist = std::abs(vect);
                if (dist < dmin) dmin = dist;
                double x = std::max (0.0, 2*R*acceleration - dist) / 2.0;
                double coef = x / (dist + R/10000);
                moves[i] += coef * vect;
                moves[j] -= coef * vect;
            }
        }
        std::cout << "iteration " << iter << " dmin = " << dmin << "\n";
        if (dmin/R >= 2.0 - 1.0e-6) break;
        for (int i = 0; i < n; ++i) {
            centers[i] += moves[i];
            centers[i] = limit (centers[i]);
        }
    }

    dmin = A;
    for (int i = 0; i < n; ++i) {
        for (int j = i+1; j < n; ++j) {
            auto vect = centers[i] - centers[j];
            double dist = std::abs(vect);
            if (dist < dmin) dmin = dist;
        }
    }
    std::cout << "Final: dmin/R = " << dmin/R << "\n";
    check = dmin/R >= 2.0 - 1.0e-6;
    return {check, iter, dmin};
}

int main() {
    int n = 15;             // number of circles
    double R = 1.0;         // radius of each circle
    double density = 0.6;   // area of all circles over total area A*A
    double A;               // side of the square
    int n_iter = 1000;
    A = sqrt (n*M_PI*R*R/density);
    std::cout << "number of circles = " << n << "\n";
    std::cout << "density = " << density << "\n";
    std::cout << "A = " << A << std::endl;

    std::vector<dcomplex> centers (n);
    std::srand(std::time(0));
    for (int i = 0; i < n; ++i) {
        double x = R + (A - 2*R) * (double) std::rand()/RAND_MAX;
        double y = R + (A - 2*R) * (double) std::rand()/RAND_MAX;
        centers[i] = {x, y};
    }
    auto [check, n_iter_eff, dmin] = process (A, R, centers, n_iter);
    std::cout << "check = " << check << "\n";
    std::cout << "Relative min distance = " << std::setprecision (9) << dmin/R << "\n";
    std::cout << "nb iterations = " << n_iter_eff << "\n";
    print (centers);
    return 0;
}

Value of sum from thrust::reduce not correct

I have been trying to implement some code that calls reduce on a thrust::device_ptr, and the results are not consistent with the CPU implementation when dealing with large values. I have to deal with large values, so is there a way around this?
My code:
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>

#define NZ 412
#define NX 402

using namespace std;
using real = double;

void allocate_array_2d(real**& preal, const int dim1, const int dim2) {
    // Contiguous allocation of 2D arrays
    preal = new real * [dim1];
    preal[0] = new real[dim1 * dim2];
    for (int i = 1; i < dim1; i++) preal[i] = preal[i - 1] + dim2;
    for (int i = 0; i < dim1; i++) {
        for (int j = 0; j < dim2; j++) {
            preal[i][j] = 0;
        }
    }
}

#define cudaCheckError(code)                                            \
    {                                                                   \
        if ((code) != cudaSuccess) {                                    \
            fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__,    \
                    __LINE__, cudaGetErrorString(code));                \
        }                                                               \
    }

int main()
{
    real** a;
    std::cout.precision(30);
    allocate_array_2d(a, NZ, NX);   // input array
    for (int i = 0; i < NZ; i++) {
        for (int j = 0; j < NX; j++) {
            a[i][j] = 2.14748e+09;
        }
    }
    real* da;
    cudaCheckError(cudaMalloc(&da, NZ * NX * sizeof(real)));
    cudaCheckError(cudaMemcpy(da, a[0], NZ * NX * sizeof(real), cudaMemcpyHostToDevice));

    ///************************
    // CUDA KERNELS ARE HERE
    // REMOVED FOR CLEAR QUESTION
    ///*************************

    real sum1 = 0;
    thrust::device_ptr<real> dev_ptr = thrust::device_pointer_cast(da);
    sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0, thrust::plus<real>());
    cout << " \nsum gpu " << sum1 << "\n";

    real sum2 = 0;
    //////// CPU PART DOING THE SAME THING //////
    for (int i = 0; i < NZ; i++) {
        for (int j = 0; j < NX; j++) {
            sum2 += a[i][j];
        }
    }
    cout << "\nsum cpu " << sum2 << "\n";
    if((sum2 - sum1) < 0.001)
        std::cout << "\nSUCCESS " << "\n";
    else
        std::cout << "\nFailure & by " << sum2 - sum1 << "\n";
}
The compiler that I am using is nvcc and my graphics card is an NVIDIA 1650 with compute capability 7.5.
According to the documentation, thrust expects the type for summation to be reflected in the init value:
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0, thrust::plus<real>());
                                              ^
The type of that constant you have is an integral type. If you change that to a double-precision constant:
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0.0, thrust::plus<real>());
you get matching results, between CPU and GPU, according to my testing. (You could alternatively cast your constant to real type: (real)0 and use that, and there are other ways to address this as well, such as dropping the use of the init value and the binary op.)
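As a minimal sketch of that last variant: the overload without an init value and binary op sums using the iterator's value type, which here is real (double), so the integral-constant pitfall cannot occur:

sum1 = thrust::reduce(dev_ptr, dev_ptr + NZ*NX);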

Access a matrix as its transpose in tiled matrix multiplication in CUDA

I'm currently experimenting with CUDA and I came across this kernel from an answer for matrix multiplication: https://stackoverflow.com/a/18856054/7867026
I want to compute A_transpose*A instead of A*B, but without storing A_transpose (only matrix A as an input to the kernel). I have to set the indexes properly but I'm confused by this matrix representation. Any help would be appreciated.
Most of what you need is here and here.
In the first link it is identified that A*A^T involves taking inner products of rows of matrix A, and similarly A^T*A involves taking inner products of columns of matrix A. Also note the symmetry statement. In the second link (scroll down from that point a bit in the programming guide) you will find a complete tiled matrix multiply. You just need to index into both tiles by column.
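Written out for a row-major A with ARows rows and ACols columns, the entry being computed is

(A^T A)[i][j] = sum over k of A[k][i] * A[k][j],  for k = 0 .. ARows-1

so both factors walk down columns of A; the modified tile loads and the inner product in the kernel below are arranged to do exactly that.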
Here is a worked example, using the code from the SO answer you linked:
$ cat t1654.cu
#include <iostream>
#include <cstdio>
#include <cstdlib>

const int TILE_DIM = 32;

template <typename T>
__global__ void ATA(const T * __restrict__ A, T * __restrict__ C, int ARows, int ACols)
{
    T CValue = 0;
    int Row = blockIdx.y*TILE_DIM + threadIdx.y;
    int Col = blockIdx.x*TILE_DIM + threadIdx.x;
    __shared__ T As[TILE_DIM][TILE_DIM];
    __shared__ T Bs[TILE_DIM][TILE_DIM];

    for (int k = 0; k < (TILE_DIM + ARows - 1)/TILE_DIM; k++) {
        if (k*TILE_DIM + threadIdx.y < ARows && blockIdx.y*blockDim.y+threadIdx.x < ACols)
            As[threadIdx.y][threadIdx.x] = A[(k*TILE_DIM + threadIdx.y)*ACols + blockIdx.y*blockDim.y+threadIdx.x];
        else
            As[threadIdx.y][threadIdx.x] = 0.0;
        if (k*TILE_DIM + threadIdx.y < ARows && Col < ACols)
            Bs[threadIdx.y][threadIdx.x] = A[(k*TILE_DIM + threadIdx.y)*ACols + Col];
        else
            Bs[threadIdx.y][threadIdx.x] = 0.0;
        __syncthreads();
        for (int n = 0; n < TILE_DIM; ++n)
            CValue += As[n][threadIdx.y] * Bs[n][threadIdx.x];
        __syncthreads();
    }

    if (Row < ACols && Col < ACols)
        C[((blockIdx.y * blockDim.y + threadIdx.y)*ACols) +
          (blockIdx.x * blockDim.x) + threadIdx.x] = CValue;
}

template <typename T>
__global__ void transpose_naive(const T * __restrict__ in, T * __restrict__ out, const int dim){
    int col = threadIdx.x+blockDim.x*blockIdx.x;
    int row = threadIdx.y+blockDim.y*blockIdx.y;
    if ((col < dim) && (row < dim)) out[col*dim+row] = in[row*dim+col];
}

template <typename T>
__global__ void mm_naive(const T * __restrict__ A, const T * __restrict__ B, T * __restrict__ C, const int rowA, const int colA, const int colB){
    int col = threadIdx.x+blockDim.x*blockIdx.x;
    int row = threadIdx.y+blockDim.y*blockIdx.y;
    if ((row < rowA) && (col < colB)){
        T Cval = 0;
        for (int i = 0; i < colA; i++) Cval += A[row*colA+i]*B[i*colB+col];
        C[row*colB+col] = Cval;}
}

typedef float mt;

int main(){
    mt *d_A, *d_B, *d_C, *h_A, *h_C, *h_C1;
    int m = 64;
    int n = 64;
    h_A  = new mt[m*n];
    h_C  = new mt[n*n];
    h_C1 = new mt[n*n];
    cudaMalloc(&d_A, m*n*sizeof(d_A[0]));
    cudaMalloc(&d_B, m*n*sizeof(d_A[0]));
    cudaMalloc(&d_C, n*n*sizeof(d_C[0]));
    // test 1
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
            h_A[i*n+j] = (i==j)?1.0f:0.0f;
    cudaMemcpy(d_A, h_A, m*n*sizeof(d_A[0]), cudaMemcpyHostToDevice);
    dim3 block(TILE_DIM, TILE_DIM);
    dim3 grid((n+block.x-1)/block.x, (n+block.y-1)/block.y);
    ATA<<<grid,block>>>(d_A, d_C, m, n);
    cudaMemcpy(h_C, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
#ifdef DEBUG
    for (int i = 0; i < n; i++){
        for (int j = 0; j < n; j++)
            std::cout << h_C[i*n+j] << " ";
        std::cout << std::endl;}
    std::cout << std::endl;
#endif
    // test 2
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
            h_A[i*n+j] = rand()%10;
    cudaMemcpy(d_A, h_A, m*n*sizeof(d_A[0]), cudaMemcpyHostToDevice);
    ATA<<<grid,block>>>(d_A, d_C, m, n);
    cudaMemcpy(h_C, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
#ifdef DEBUG
    for (int i = 0; i < n; i++){
        for (int j = 0; j < n; j++)
            std::cout << h_C[i*n+j] << " ";
        std::cout << std::endl;}
    std::cout << std::endl;
#endif
    transpose_naive<<<grid,block>>>(d_A, d_B, n);
    mm_naive<<<grid,block>>>(d_B, d_A, d_C, n, n, n);
    cudaMemcpy(h_C1, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
#ifdef DEBUG
    for (int i = 0; i < n; i++){
        for (int j = 0; j < n; j++)
            std::cout << h_C1[i*n+j] << " ";
        std::cout << std::endl;}
    std::cout << std::endl;
#endif
    for (int i = 0; i < n*n; i++)
        if (h_C[i] != h_C1[i]) {std::cout << "mismatch at: " << i << " was: " << h_C[i] << " should be: " << h_C1[i] << std::endl; return 0;}
}
$ nvcc -o t1654 t1654.cu
$ cuda-memcheck ./t1654
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
Note that loading the Bs tile is identical in both cases. The main changes are in loading the As tile; also note the indexing change when computing CValue. These changes are necessary to index by column in both cases.
There may still be bugs. I have not tested the non-square case, nor the case where the matrix size is not a multiple of the block size. Furthermore, I've taken no advantage of the symmetry in the output (though you can at least use it as a cheap consistency check, sketched below). However, this should help with the indexing.
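Since A^T*A is symmetric, the result must satisfy C[i][j] == C[j][i]; a minimal host-side check on the n x n result h_C from above (the integer-valued test data makes exact float comparison safe here):

for (int i = 0; i < n; i++)
    for (int j = 0; j < i; j++)
        if (h_C[i*n + j] != h_C[j*n + i])
            std::cout << "symmetry violated at (" << i << "," << j << ")" << std::endl;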

Parallel Matrix Multiplication in C++

I am trying to implement parallel multi-threaded matrix multiplication in C++. The method I follow involves dividing the arrays into 4 sub-arrays and carrying out parallel multiplication using 4 threads on these 4 sub-arrays.
I have written a C++ program, but it throws an error and terminates:
"terminate called after throwing an instance of 'std::system_error'
what(): Invalid argument"
Here is my complete code. I am relatively new to C++ and multi-threading.
#include <iostream>
#include <thread>
#include <mutex>
#include <vector>
#include <algorithm>
#include <string>
#define N 4
using namespace std;

mutex mu;

void stage_1_multiply(int *a, int *b, int *d){
    int *xij;
    int *yij;
    int *zij;
    int COLS = N, ROWS = N;
    cout << " thread " << this_thread::get_id() << " " << endl;
    for(int i = 0; i < (N/2); ++i){
        for(int j = 0; j < (N/2); j++){
            for(int k = 0; k < (N/2); k++){
                mu.lock();
                xij = a + ((COLS * i) + k);
                yij = b + ((COLS * k) + j);
                zij = d + ((COLS * i) + j);
                *zij += ( (*xij) * (*yij) );
                mu.unlock();
            }
        }
    }
}

int main(){
    int A[4][4], B[4][4], C[4][4], D_1[4][4], D_2[4][4];
    for(int i = 0; i < 4; i++){
        for(int j = 0; j < 4; j++){
            A[i][j] = i + 1;
            B[i][j] = i + 1;
            C[i][j] = 0;
            D_1[i][j] = 0;
            D_2[i][j] = 0;
        }
    }
    for(int i = 0; i < 4; i++){
        for(int j = 0; j < 4; j++){
            cout << A[i][j] << " ";
        }
        cout << endl;
    }
    for(int i = 0; i < 4; i++){
        for(int j = 0; j < 4; j++){
            cout << B[i][j] << " ";
        }
        cout << endl;
    }

    vector<thread> threads(8);
    int th = 0;
    threads[th++] = thread(stage_1_multiply, &A[0][0], &B[0][0], &D_1[0][0]);
    threads[th++] = thread(stage_1_multiply, &A[0][2], &B[2][0], &D_2[0][0]);
    threads[th++] = thread(stage_1_multiply, &A[2][0], &B[0][2], &D_1[2][2]);
    threads[th++] = thread(stage_1_multiply, &A[2][2], &B[2][2], &D_2[2][2]);
    for( auto& t : threads){
        t.join();
    }
    threads[th++] = thread(stage_1_multiply, &A[0][0], &B[0][2], &D_1[0][2]);
    threads[th++] = thread(stage_1_multiply, &A[0][2], &B[2][2], &D_2[0][2]);
    threads[th++] = thread(stage_1_multiply, &A[2][0], &B[0][0], &D_1[2][0]);
    threads[th++] = thread(stage_1_multiply, &A[2][2], &B[2][0], &D_2[2][0]);
    for( auto& t : threads){
        t.join();
    }
    // code to add the matrices D_1 and D_2 goes here.
    for(int i = 0; i < 4; i++){
        for(int j = 0; j < 4; j++){
            cout << D_1[i][j] << " ";
        }
        cout << endl;
    }
    cout << " Main Close " << endl;
    return 0;
}
What am I doing wrong? Is it anything related to parallel access of shared memory? If so, how can I correct it?
PS: This is a homework assignment.
