I'm currently experimenting with CUDA and i came across this kernel from an answer for matrix multiplication: https://stackoverflow.com/a/18856054/7867026
I want instead of doing A*B to do A_Transpose*A but without saving A_Transpose (only matrix A as an input to kernel). I have to properly set the indexes but I'm confused by this matrix representation. Any help would be appreciated.
most of what you need is here and here.
In the first link it is identified that AxAT involves taking inner products of rows of matrix A, and similarly ATxA will involve taking inner products of columns of matrix A. Also note the symmetry statement. In the second link (scroll down from that point a bit in the programming guide) you will find a complete tiled matrix multiply. You just need to index into both tiles by column.
Here is a worked example, using the code from the SO answer you linked:
$ cat t1654.cu
#include <iostream>
#include <cstdio>
#include <cstdlib>
const int TILE_DIM = 32;
template <typename T>
__global__ void ATA(const T * __restrict__ A, T * __restrict__ C, int ARows, int ACols)
T CValue = 0;
int Row = blockIdx.y*TILE_DIM + threadIdx.y;
int Col = blockIdx.x*TILE_DIM + threadIdx.x;
__shared__ T As[TILE_DIM][TILE_DIM];
__shared__ T Bs[TILE_DIM][TILE_DIM];
for (int k = 0; k < (TILE_DIM + ARows - 1)/TILE_DIM; k++) {
if (k*TILE_DIM + threadIdx.y < ARows && blockIdx.y*blockDim.y+threadIdx.x < ACols)
As[threadIdx.y][threadIdx.x] = A[(k*TILE_DIM + threadIdx.y)*ACols + blockIdx.y*blockDim.y+threadIdx.x];
As[threadIdx.y][threadIdx.x] = 0.0;
if (k*TILE_DIM + threadIdx.y < ARows && Col < ACols)
Bs[threadIdx.y][threadIdx.x] = A[(k*TILE_DIM + threadIdx.y)*ACols + Col];
Bs[threadIdx.y][threadIdx.x] = 0.0;
for (int n = 0; n < TILE_DIM; ++n)
CValue += As[n][threadIdx.y] * Bs[n][threadIdx.x];
if (Row < ACols && Col < ACols)
C[((blockIdx.y * blockDim.y + threadIdx.y)*ACols) +
(blockIdx.x * blockDim.x)+ threadIdx.x] = CValue;
template <typename T>
__global__ void transpose_naive(const T * __restrict__ in, T * __restrict__ out, const int dim){
int col = threadIdx.x+blockDim.x*blockIdx.x;
int row = threadIdx.y+blockDim.y*blockIdx.y;
if ((col < dim) && (row < dim)) out[col*dim+row] = in[row*dim+col];
template <typename T>
__global__ void mm_naive(const T * __restrict__ A, const T * __restrict__ B, T * __restrict__ C, const int rowA, const int colA, const int colB){
int col = threadIdx.x+blockDim.x*blockIdx.x;
int row = threadIdx.y+blockDim.y*blockIdx.y;
if ((row < rowA) && (col < colB)){
T Cval = 0;
for (int i = 0; i < colA; i++) Cval += A[row*colA+i]*B[i*colB+col];
C[row*colB+col] = Cval;}
typedef float mt;
int main(){
mt *d_A, *d_B, *d_C, *h_A, *h_C, *h_C1;
int m = 64;
int n = 64;
h_A = new mt[m*n];
h_C = new mt[n*n];
h_C1 = new mt[n*n];
cudaMalloc(&d_A, m*n*sizeof(d_A[0]));
cudaMalloc(&d_B, m*n*sizeof(d_A[0]));
cudaMalloc(&d_C, n*n*sizeof(d_C[0]));
// test 1
for (int i = 0; i < m; i++)
for (int j = 0; j < n; j++)
h_A[i*n+j] = (i==j)?1.0f:0.0f;
cudaMemcpy(d_A, h_A, m*n*sizeof(d_A[0]), cudaMemcpyHostToDevice);
dim3 block(TILE_DIM, TILE_DIM);
dim3 grid((n+block.x-1)/block.x, (n+block.y-1)/block.y);
ATA<<<grid,block>>>(d_A, d_C, m, n);
cudaMemcpy(h_C, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
#ifdef DEBUG
for (int i = 0; i < n; i++){
for (int j = 0; j < n; j++)
std::cout << h_C[i*n+j] << " ";
std::cout << std::endl;}
std::cout << std::endl;
// test 2
for (int i = 0; i < m; i++)
for (int j = 0; j < n; j++)
h_A[i*n+j] = rand()%10;
cudaMemcpy(d_A, h_A, m*n*sizeof(d_A[0]), cudaMemcpyHostToDevice);
ATA<<<grid,block>>>(d_A, d_C, m, n);
cudaMemcpy(h_C, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
#ifdef DEBUG
for (int i = 0; i < n; i++){
for (int j = 0; j < n; j++)
std::cout << h_C[i*n+j] << " ";
std::cout << std::endl;}
std::cout << std::endl;
transpose_naive<<<grid,block>>>(d_A, d_B, n);
mm_naive<<<grid,block>>>(d_B, d_A, d_C, n, n, n);
cudaMemcpy(h_C1, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
#ifdef DEBUG
for (int i = 0; i < n; i++){
for (int j = 0; j < n; j++)
std::cout << h_C1[i*n+j] << " ";
std::cout << std::endl;}
std::cout << std::endl;
for (int i = 0; i < n*n; i++) if (h_C[i] != h_C1[i]) {std::cout << "mismatch at: " << i << " was: " << h_C[i] << " should be: " << h_C1[i] << std::endl; return 0;}
$ nvcc -o t1654 t1654.cu
$ cuda-memcheck ./t1654
========= ERROR SUMMARY: 0 errors
Note that loading the Bs tile is identical in both cases. The main changes are in loading the As tile, and also note the indexing change when computing Cvalue. These changes are necessary to index in both cases by column.
There may still be bugs. I have not tested the non-square case, nor have I tested the case where the matrix size is not a multiple of block size. Furthermore I've taken no advantage of the symmetry in the output. However this should help with the indexing.
I am trying to implement a parallel reduction sum of even and odd number Separately in CUDA.
I'm new in CUDA programming and I'm trying so hard but I can't find a solution.
I have for example the array : [5, 8, 0, -6, 2]. And the result need to be [4, 5] (Even : 8+0-6+2=4, Odd : 5=5).
But the result of my following code is [8, 5].
I think that my problem is in the notion of "shared" but I do not understand why.
__global__ void sumEvenOdd(int *a, int *b, int N){
int column = blockIdx.x * blockIdx.x + threadIdx.x;
__shared__ int s_data[2];
if (column < N){
if (a[column] % 2 == 0){
s_data[0] += a[column];
s_data[1] += a[column];
b[0] = s_data[0];
b[1] = s_data[1];
void initArray(int *a, int N){
for (unsigned int i = 0; i < N; i++){
a[i] = rand() % 100;
void verify_result(int *a, int *b, int N){
int *verify_b;
verify_b = (int*)malloc(2 * sizeof(int));
verify_b[0] = 0;
verify_b[1] = 0;
for (unsigned int i = 0; i < N; i++){
if (a[i] % 2 == 0){
verify_b[0] += a[i];
verify_b[1] += a[i];
for (unsigned int i = 0; i < 2; i++){
assert(verify_b[i] == b[i]);
void printResult(int *a, int *b, int N){
for (unsigned int i = 0; i < N; i++){
printf("%d, ", a[i]);
for (unsigned int i = 0; i < 2; i++){
printf("%d, ", b[i]);
int main(){
//Array sizes;
int N = 5;
//Size (in bytes) of matrix
size_t bytes = N * sizeof(int);
//Host pointers
int *a, *b;
// Allocate host memory
a = (int*)malloc(bytes);
b = (int*)malloc(2 * sizeof(int));
// Initialize array
initArray(a, N);
// Device pointers
int *d_a, *d_b;
// Allocated device memory
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, 2 * sizeof(int));
// Copy data to the device
cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
//Number of threads
int THREADS = 128;
//Number of blocks
// Launch kernel
sumEvenOdd<<<BLOCKS, THREADS>>>(d_a, d_b, N);
// Copy back to the host
cudaMemcpy(b, d_b, 2 * sizeof(int), cudaMemcpyDeviceToHost);
// Check result
verify_result(a, b, N);
printResult(a, b, N);
return 0;
you cannot just use
s_data[1] += a[column];
remember all units are going to execute this line at the same time, and store in the same position, so all threads are storing into s_data at the same time.
instead you should use atomic add
atomicAdd(&s_data[1], a[column]);
and you should also be initializing s_data to zeros.
I'm relatively new to Thrust and I'm trying to perform a segmented scan. Here is my code, which you should be able to run as-is:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <chrono>
// Sequential scan for CPU
float* test_seqScan(float* in, int s, int m) {
float* out = new float[s * m];
for (unsigned int i = 0; i < s; i++) {
out[i * m] = 0;
for (unsigned int i = 0; i < s; i++) {
for (unsigned int j = 1; j < m; j++) {
out[i * m + j] = out[i * m + j - 1] + in[i * m + j - 1];
return out;
void test_sumScan(thrust::device_vector<float> dev_in, thrust::device_vector<int> dev_keys, int s, int m) {
// Allocate device memory for output
thrust::device_vector<float> dev_out(s * m);
thrust::exclusive_scan_by_key(thrust::device, dev_keys.begin(), dev_keys.end(), dev_in.begin(), dev_out.begin());
int main(){
int s = 100;
int m = 100000;
float* seq_in = new float[s * m];
for (int i = 0; i < s; i++) {
for (int j = 0; j < m; j++) {
seq_in[i * m + j] = j + 1;
thrust::host_vector<float> par_in(s * m);
for (int i = 0; i < s; i++) {
for (int j = 0; j < m; j++) {
par_in[i * m + j] = j + 1;
thrust::host_vector<int> keys(s * m);
for (int i = 0; i < s; i++) {
for (int j = 0; j < m; j++) {
keys[i * m + j] = i;
thrust::device_vector<float> dev_in = par_in;
thrust::device_vector<int> dev_keys = keys;
auto t1 = std::chrono::high_resolution_clock::now();
test_seqScan(seq_in, s, m);
auto t2 = std::chrono::high_resolution_clock::now();
auto duration1 = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
std::cout << "Sequential duration: " << duration1 << "\n\n";
auto t3 = std::chrono::high_resolution_clock::now();
test_sumScan(dev_in, dev_keys, s, m);
auto t4 = std::chrono::high_resolution_clock::now();
auto duration2 = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
std::cout << "Parallel duration: " << duration2 << "\n\n";
My issue is that both these snippets of code take exactly the same amount of time to run regardless of how small or large I set s and m. I assume that I'm doing something wrong, but I don't know what; can anyone point out the issue?
I have been trying to implement some code requiring to call reduce on thrust::device_ptr, and the results are not consistent with CPU implementation while dealing with large values. I have to deal with large values. So is there a way around:
My code:
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>
#define NZ 412//
#define NX 402//
using namespace std;
using real =double;
void allocate_array_2d(real**& preal, const int dim1, const int dim2) {
// Contiguous allocation of 2D arrays
preal = new real * [dim1];
preal[0] = new real[dim1 * dim2];
for (int i = 1; i < dim1; i++) preal[i] = preal[i - 1] + dim2;
for (int i = 0; i < dim1; i++) {
for (int j = 0; j < dim2; j++) {
preal[i][j] = 0;
#define cudaCheckError(code) \
{ \
if ((code) != cudaSuccess) { \
fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \
cudaGetErrorString(code)); \
} \
int main()
real** a;
allocate_array_2d(a, NZ, NX);//input array
for (int i = 0; i < NZ; i++) {
for (int j = 0; j < NX; j++) {
a[i][j] = 2.14748e+09;
real* da;
cudaCheckError(cudaMalloc(&da, NZ * NX * sizeof(real)));
cudaCheckError(cudaMemcpy(da,a[0], NZ * NX * sizeof(real),cudaMemcpyHostToDevice));
real sum1=0;
thrust::device_ptr<real> dev_ptr = thrust::device_pointer_cast(da);
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0, thrust::plus<real>());
cout<<" \nsum gpu "<< sum1<<"\n";
real sum2=0;
for (int i = 0; i < NZ; i++) {
for (int j = 0; j < NX; j++) {
sum2 += a[i][j];
cout<<"\nsum cpu "<< sum2<<"\n";
std::cout << "\nSUCESS "<< "\n";
std::cout << "\nFailure & by "<<sum2-sum1<< "\n";
The compiler that I am using is nvcc and my graphics card is nvidia 1650 with compute capability 7.5.
According to the documentation, thrust expects the type for summation to be reflected in the init value:
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0, thrust::plus<real>());
The type of that constant you have is an integral type. If you change that to a double-precision constant:
sum1 = thrust::reduce(dev_ptr, dev_ptr+NZ*NX, 0.0, thrust::plus<real>());
you get matching results, between CPU and GPU, according to my testing. (You could alternatively cast your constant to real type: (real)0 and use that, and there are other ways to address this as well, such as dropping the use of the init value and the binary op.)
I'm currently working on a MPI-program and I'm trying to send blocks of a matrix with scatterv to all processes.
Process description
The matrix is given as an array.
First I produce a datatype with MPI_Type_vector to create the necessary block out of the original array.
Second I create a MPI_Type_struct that should hold rows of blocks.
#include <math.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define n 16
int main(int argc, char *argv[])
MPI_Init(&argc, &argv);
int p,r;
MPI_Comm_size(comm, &p);
MPI_Comm_rank(comm, &r);
int *arr;
arr = NULL;
if (r == 0){
arr = (int *) malloc(n * n * sizeof(int));
for (int i = 0; i < n * n; i++) arr[i] = i;
for (int i = 0; i < n; i++){
for (int j = 0; j < n; j++)
printf("%4d", arr[i * n + j]);
int ps = sqrt(p);
int ns = n / ps;
if (r == 0) {
printf("ps: %d ns: %d\n", ps, ns);
/* create datatype */
MPI_Datatype block;
MPI_Type_vector(ns, ns, n, MPI_INT, &block);
int blocks[ps];
MPI_Aint displs[ps];
for (int i = 0; i < ps; i++) {
blocks[i] = 1;
displs[i] = i * sizeof(int);
MPI_Datatype types[ps];
//for (int i = 0; i < ps - 1; i++) types[i] = block;
//types[ps - 1] = MPI_UB;
types[0] = block;
for (int i = 1; i < ps; i++) types[i] = MPI_UB;
//types[0] = block;
//types[1] = MPI_UB;
if (r == 0) {
for(int i = 0; i < ps; i++) printf("%3ld", displs[i]);
MPI_Datatype row;
MPI_Type_struct(ps, blocks, displs, types, &row);
/* prepare scatter */
int sdispl[p]; int sendcounts[p];
for (int i = 0; i < p; i++) {
sdispl[i] = (i % ps) + (i / ps) * (ns * ps);
sendcounts[i] = 1;
if (r == 0) {
printf("sdispl: \n");
for (int i = 0; i < 4; i++) printf("%3d", sdispl[i]);
int rcv[ns * ns];
MPI_Scatterv(arr, sendcounts, sdispl, row, rcv, ns * ns, MPI_INT, 0, comm);
int result = 1;
if (r == result) {
printf("result for %d:\n", result);
for (int i = 0; i < ns * ns; i++) {
printf("%4d", rcv[i]);
if ((i+1) % ns == 0) printf("\n");
if (arr != NULL) free(arr);
return 0;
So far the structure of the blocks is correct.
The problem
The block, that was sent to process r = 1 starts with 3 instead of 4. The block for process r = 2 also starts with 6 and the one for process r = 3 starts with 9.
For r == 4 it jumps to 48.
What it should do
r start
0 0
1 4
2 8
3 12
4 64
5 68
6 ...
15 204
The help I would need
I think, that I'm making some mistake with displ and sdispl.
Compiling and Running the example
The code is compiled with the folowing command:
mpicc -o main main.c -lm
I run the code with:
mpirun -np 16 ./main
Thanks for any help in advance!
With the hint of Zulan I was able to solve my problem.
The following code is based on the excellent answer to subarrays.
#include <math.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define n 8
void print_arr(int *arr, int x) {
for (int i = 0; i < x*x; i++){
if (i % x == 0) printf("\n");
printf("%4d", arr[i]);
int main(int argc, char *argv[])
MPI_Init(&argc, &argv);
int p, r;
MPI_Comm_size(comm, &p);
MPI_Comm_rank(comm, &r);
/* number of proceses in dim x and dim y */
int ps = sqrt(p);
/* number of elements in dim x and dim y in sarr */
int ns = n/ps;
/* array of data - distributed by process 0 */
int *arr = NULL;
if (r==0) {
arr = (int *) malloc(n * n * sizeof(int));
for (int i = 0; i < n*n; i++) arr[i] = i;
print_arr(arr, n);
MPI_Datatype type, resizedtype;
int sizes[2] = {n,n};
int subsizes[2] = {ns,ns};
int starts[2] = {0,0};
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &type);
MPI_Type_create_resized(type, 0, ns*sizeof(int), &resizedtype);
int counts[p];
for (int i = 0; i < p; i++) counts[i] = 1;
int displs[p];
for (int i = 0; i < p; i++) displs[i] = i%ps + i/ps * ns * ps;
/* subarray to store distributed data */
int sarr[ns * ns];
/* send submatrices to all processes */
MPI_Scatterv(arr, counts, displs, resizedtype, sarr, ns*ns, MPI_INT, 0, comm);
/* print received data for process pr */
int pr = 3;
if (r == pr)
print_arr(sarr, ns);
/* free arr */
if (arr != NULL) free(arr);
return 0;
You can compile the example with
mpicc -o main main.c
and run it with
mpirun -np 4 ./main
I am trying to implement Parallel Multi-threaded Matrix multiplication in C++. The method i follow involves dividing Arrays into 4 sub-arrays and carry out parallel Multiplication using 4 threads on these 4 sub arrays.
I have written a C++ code but it is throwing error and terminates explicitly. Error :
"terminate called after throwing an instance of std::system_error
what():invalid Argument"
Here is my complete code. I am relatively new to C++ and multi-threading.
#include <iostream>
#include <thread>
#include <mutex>
#include <vector>
#include <algorithm>
#include <string>
#define N 4
using namespace std;
mutex mu;
void stage_1_multiply(int *a,int *b,int *d){
int *xij;
int *yij;
int *zij;
int COLS = N,ROWS = N;
cout<< " thread "<< this_thread::get_id() << " "<<endl;
for(int i = 0;i<(N/2);++i){
for(int j = 0;j < (N/2); j++){
for(int k = 0; k<(N/2);k++){
xij = a + ((COLS * i) + k);
yij = b + ((COLS * k) + j);
zij = d + ((COLS * i) + j);
*zij += ( (*xij) * (*yij) );
int main(){
int A[4][4],B[4][4],C[4][4],D_1[4][4],D_2[4][4];
for(int i = 0;i<4;i++){
for(int j = 0;j<4;j++){
A[i][j] = i + 1;
B[i][j] = i + 1;
C[i][j] = 0;
D_1[i][j] = 0;
D_2[i][j] = 0;
for(int i = 0;i<4;i++){
for(int j = 0;j< 4;j++){
cout << A[i][j] << " ";
cout << endl;
for(int i = 0;i<4;i++){
for(int j = 0;j< 4;j++){
cout << B[i][j] << " ";
cout << endl;
vector< thread> threads(8);
int th = 0;
threads[th++] = thread(stage_1_multiply,&A[0][0],&B[0][0],&D_1[0][0]);
threads[th++] = thread(stage_1_multiply,&A[0][2],&B[2][0],&D_2[0][0]);
threads[th++] = thread(stage_1_multiply,&A[2][0],&B[0][2],&D_1[2][2]);
threads[th++] = thread(stage_1_multiply,&A[2][2],&B[2][2],&D_2[2][2]);
for( auto& t : threads){
threads[th++] = thread(stage_1_multiply,&A[0][0],&B[0][2],&D_1[0][2]);
threads[th++] = thread(stage_1_multiply,&A[0][2],&B[2][2],&D_2[0][2]);
threads[th++] = thread(stage_1_multiply,&A[2][0],&B[0][0],&D_1[2][0]);
threads[th++] = thread(stage_1_multiply,&A[2][2],&B[2][0],&D_2[2][0]);
for( auto& t : threads){
// code to add The Matrices D_1 and D_2 goes here.
for(int i = 0;i<4;i++){
for(int j = 0;j< 4;j++){
cout << D_1[i][j] << " ";
cout << endl;
cout << " Main Close "<<endl;
return 0;
What am doing wrong? is it anything related to parallel access of shared memory? If so how can i correct it?
PS: This is a homework Assignment.