Matrix Multiplication: Performance decreases after coalescing global memory access in CUDA

I have recently started working with GPUs using CUDA. As a starter programme I tried to implement a simple matrix multiplication efficiently:
C = AB
Starting from the naive matrix multiplication (each thread loads all the elements of A and B needed for one element of C), the tiled implementation (threads collaboratively load a tile of elements from A and B into shared memory to reduce global memory traffic) already provides a good speed-up.
However, even in the tiled implementation the access to global memory is not in coalesced order. So, to increase performance, it is better to transpose matrix B and then multiply. Below is my code:
#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include <time.h>
#include <sys/time.h>
#include <math.h> // for fabs() used in verify()
void querydeviceprop();
void allocate_matrix(float *h_a, float *h_b, int matDim);
void verify(float *h_c, float *h_c_check, int matDim);
void print_matrix(float *ha, int m,int n);
void transpose_matrix(float *ha, int matDim);
void mat_mul();
#define TILE_WIDTH 16 //should be equal to numThread for tiling implementation
__global__ void MatrixMult_tiling(float *d_a,float *d_b,float *d_c, int dim){
__shared__ float ta[TILE_WIDTH][TILE_WIDTH]; //to load one tile of A
__shared__ float tb[TILE_WIDTH][TILE_WIDTH]; //to load one tile of B
int bx,by,tx,ty,i,j;
float res;
int row, col;
bx=blockIdx.x; by=blockIdx.y;
tx=threadIdx.x; ty=threadIdx.y;
row=by*TILE_WIDTH+ty;
col=bx*TILE_WIDTH+tx;
res=0;
for(i=0;i<dim/TILE_WIDTH;i++){
//collaboratively load the elements. Each thread loads a single element.
ta[ty][tx]=d_a[row*dim+TILE_WIDTH*i+tx];
tb[ty][tx]=d_b[(ty+i*TILE_WIDTH)*dim+col];
__syncthreads();
for(j=0;j<TILE_WIDTH;j++){
res=res+ta[ty][j]*tb[j][tx];
}
__syncthreads();
}
d_c[row*dim+col]=res;
}
__global__ void MatrixMult_tiling_coalesced(float *d_a,float *d_b,float *d_c, int dim){
__shared__ float ta[TILE_WIDTH][TILE_WIDTH]; //to load one tile of A
__shared__ float tb[TILE_WIDTH][TILE_WIDTH]; //to load one tile of B
int bx,by,tx,ty,i,j;
float res;
int row, col;
bx=blockIdx.x; by=blockIdx.y;
tx=threadIdx.x; ty=threadIdx.y;
row=by*TILE_WIDTH+ty;
col=bx*TILE_WIDTH+tx;
res=0;
for(i=0;i<dim/TILE_WIDTH;i++){
//collaboratively load the elements. Each thread loads a single element.
ta[ty][tx]=d_a[row*dim+TILE_WIDTH*i+tx];
tb[ty][tx]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
__syncthreads();
for(j=0;j<TILE_WIDTH;j++){
res=res+ta[ty][j]*tb[tx][j];
}
__syncthreads();
}
d_c[row*dim+col]=res;
}
__global__ void MatrixMult_naive(float *d_a,float *d_b,float *d_c, int dim){
int row,col,i;
col=blockIdx.y*blockDim.y+threadIdx.y;
row=blockIdx.x*blockDim.x+threadIdx.x;
float res;
if(row<dim && col<dim){
res=0;
for(i=0;i<dim;i++){
res=res+(d_a[row*dim+i]*d_b[i*dim+col]);
}
d_c[row*dim+col]=res;
}
}
int main(){
mat_mul();
return 0;
}
void mat_mul(){
cudaSetDevice(0);
time_t t;
cudaError_t err = cudaSuccess;
srand((unsigned) time(&t));
cudaEvent_t start, stop;
float milliseconds=0;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int matDim = 8192;
float *h_a, *h_b, *h_c, *h_c_check;
/*declare the host memories*/
h_a=(float *)malloc(matDim*matDim*sizeof(float));
h_b=(float *)malloc(matDim*matDim*sizeof(float));
h_c=(float *)malloc(matDim*matDim*sizeof(float));
h_c_check=(float *)malloc(matDim*matDim*sizeof(float));
// Verify that allocations succeeded
if (h_a == NULL || h_b == NULL || h_c == NULL || h_c_check ==NULL)
{
fprintf(stderr, "Failed to allocate host vectors!\n");
exit(EXIT_FAILURE);
}
allocate_matrix(h_a,h_b,matDim); // allocate memory to hold the matrix
//allocate cuda memory
float *d_a=NULL;
float *d_b=NULL;
float *d_c=NULL;
err=cudaMalloc((void **)&d_a, matDim*matDim*sizeof(float));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device matrix A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err=cudaMalloc((void **)&d_b, matDim*matDim*sizeof(float));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device matrix A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err=cudaMalloc((void **)&d_c, matDim*matDim*sizeof(float));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device matrix A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
printf("Matrix dimension is : %d\n",matDim);
// Copy the host input matrix A and B in host memory to the device matrix in device memory
//printf("Copy input data from the host memory to the CUDA device\n");
cudaEventRecord(start);
err = cudaMemcpy(d_a, h_a, matDim*matDim*sizeof(float), cudaMemcpyHostToDevice);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
//printf("GPU memcpy matrix A %10.10f ms\n",milliseconds);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
cudaEventRecord(start);
err = cudaMemcpy(d_b, h_b, matDim*matDim*sizeof(float), cudaMemcpyHostToDevice);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
//printf("GPU memcpy matrix B %10.10f ms\n",milliseconds);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
/*constants for kernel launch*/
int numThread=16; //number of threads per Block axis
int numBlocks=matDim/numThread;
if(matDim%numThread)
numBlocks++;
dim3 dimGrid(numBlocks,numBlocks);
dim3 dimBlock(numThread,numThread);
//-------------------------------------------------------------
//-------transpose and copy to GPU-------
transpose_matrix(h_b, matDim);//transpose first the b matrix
err = cudaMemcpy(d_b, h_b, matDim*matDim*sizeof(float), cudaMemcpyHostToDevice);
cudaEventSynchronize(stop);
if (err != cudaSuccess){
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
//--------transpose and copy ends-------------
cudaEventRecord(start);
MatrixMult_tiling_coalesced<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, matDim);
cudaEventRecord(stop);
err = cudaGetLastError();
if (err != cudaSuccess){
fprintf(stderr, "Failed to launch vector matrix kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
printf("GPU time tiled & coalesced %10.10f ms\n",milliseconds);
//printf("Copy output data from the CUDA device to the host memory\n");
cudaEventRecord(start);
err = cudaMemcpy(h_c_check, d_c, matDim*matDim*sizeof(float), cudaMemcpyDeviceToHost);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
//printf("GPU memcpy time tiled & coalesced %10.10f ms\n",milliseconds);
//------------transpose back the original B matrix----------------
transpose_matrix(h_b, matDim);//transpose first the b matrix
err = cudaMemcpy(d_b, h_b, matDim*matDim*sizeof(float), cudaMemcpyHostToDevice);
cudaEventSynchronize(stop);
if (err != cudaSuccess){
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
//------------transpose back the original matrix ends-------------
//-------------------------------------------------------------
cudaEventRecord(start);
MatrixMult_tiling<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, matDim);
cudaEventRecord(stop);
err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch vector matrix kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
printf("GPU time tiled %10.10f ms\n",milliseconds);
//printf("Copy output data from the CUDA device to the host memory\n");
cudaEventRecord(start);
err = cudaMemcpy(h_c, d_c, matDim*matDim*sizeof(float), cudaMemcpyDeviceToHost);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
//printf("GPU memcpy time tiled %10.10f ms\n",milliseconds);
//-------------------------------------------------------------
/*
cudaEventRecord(start);
MatrixMult_naive<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, matDim);
cudaEventRecord(stop);
err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch vector matrix kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
printf("GPU time naive %10.10f ms\n",milliseconds);
printf("Copy output data from the CUDA device to the host memory\n");
cudaEventRecord(start);
err = cudaMemcpy(h_c, d_c, matDim*matDim*sizeof(float), cudaMemcpyDeviceToHost);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
printf("GPU memcpy time naive %10.10f ms\n",milliseconds);
*/
//-------------------------------------------------------------
verify(h_c, h_c_check, matDim);
// Free device global memory
err = cudaFree(d_a);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_b);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_c);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Free host memory
free(h_a);
free(h_b);
free(h_c);
free(h_c_check);
printf("Done\n");
}
void allocate_matrix(float *h_a, float *h_b, int matDim){
int i,j;
// Initialize the host input vectors
for (i = 0; i < matDim; i++)
{
for(j=0;j< matDim;j++){
h_a[i*matDim+j] = rand()%10;
h_b[i*matDim+j] = rand()%10;
}
}
}
void print_matrix(float *ha, int m,int n){
int i, j;
for(i=0;i<m;i++){
for(j=0;j<n;j++){
printf(" %.1f ",ha[i*m+j]);
}
printf("\n");
}
}
void transpose_matrix(float *h_a, int matDim){
int i, j;
int temp;
for(i=0;i<matDim;i++)
{
for(j=0;j<i;j++)
{
temp=h_a[i*matDim+j];
h_a[i*matDim+j]=h_a[j*matDim+i];
h_a[j*matDim+i]=temp;
}
}
}
void verify(float *h_c, float *h_c_check, int matDim){
int i,j;
//check the code
for (i = 0; i < matDim; i++)
{
for(j=0;j<matDim;j++){
if (fabs(h_c[i*matDim+j] - h_c_check[i*matDim+j]) > 1e-5)
{
printf("cpu : %f , gpu : %f\t",h_c[i*matDim+j],h_c_check[i*matDim+j]);
fprintf(stderr, "Result verification failed at element %d,%d !\n\n", i,j);
exit(EXIT_FAILURE);
}
}
}
printf("Test PASSED\n");
}
MatrixMult_tiling_coalesced and MatrixMult_tiling are the kernels with and without coalesced memory access to the elements of B, respectively.
Now, the problem is that the time taken by MatrixMult_tiling_coalesced is almost double the time taken by MatrixMult_tiling.
I understand that in MatrixMult_tiling the elements are loaded into the tiles in a coalesced manner (i.e. in row-major order) for each tile, but the tiles of B are arranged along a column, whereas the tiles in MatrixMult_tiling_coalesced are arranged along a row, so MatrixMult_tiling_coalesced should be faster than the other one.
But in practice I can see that the opposite is true.
I would appreciate it if someone could point out the reason.
Thanks in advance.
EDIT 1:
After Robert's answer (see below) I understand that the problem was in the shared memory load during the elementwise multiplication. I changed
tb[ty][tx]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
to
tb[tx][ty]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
and
res=res+ta[ty][j]*tb[tx][j];
to
res=res+ta[ty][j]*tb[j][tx];
This increased the performance of the MatrixMult_tiling_coalesced function from 1500 ms to 1000 ms. However, the MatrixMult_tiling function takes only 879 ms, so the coalesced routine is still slower. I don't understand where the problem is now.
EDIT 2 :
I realized that in EDIT 1 I had just moved the bank conflict problem from the elementwise multiplication to the element loading section. The following changes to the code have no bank conflicts:
tb[tx][ty]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
to
tb[ty][tx]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
And
res=res+ta[ty][j]*tb[j][tx];
to
res=res+ta[ty][j]*tb[ty][j];
But it is still slightly slower than the MatrixMult_tiling function: MatrixMult_tiling_coalesced takes 982 ms vs 870 ms for MatrixMult_tiling. It should be at least on par with MatrixMult_tiling, if not faster.
FINAL EDIT :
Edit 2 will not produce the correct result (with the tb[ty][tx] load, reading tb[ty][j] in the inner product makes each thread use the column of B indexed by ty instead of its own column index tx). So the code with Edit 1 is the optimum. Transposing one of the multiplicand matrices is probably not a good idea. :-(
Thanks all for helping.

B certainly isn't the matrix I would transpose in C=AB. But that is neither here nor there.
I'm not sure why you think:
in the tiled implementation too the access to the global memory is not in coalesced order
I don't see any lines of code in your MatrixMult_tiling that result in uncoalesced access.
Just to make sure we don't trip over terminology, "coalesced" or "uncoalesced" are terms that we apply to access patterns to global memory (not shared memory). Your global memory access patterns are in these lines in your tiled kernel:
ta[ty][tx]=d_a[row*dim+TILE_WIDTH*i+tx];
tb[ty][tx]=d_b[(ty+i*TILE_WIDTH)*dim+col];
...
d_c[row*dim+col]=res;
and none of those patterns to global memory are uncoalesced. In each of the generated indices into d_a, d_b and d_c, if you perform the substitutions, you will find that the threadIdx.x variable is present in all of them and is not multiplied by any value, constant or otherwise. Therefore these patterns will all coalesce (nicely).
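For example, performing that substitution on the three accesses (my own worked illustration of the point above, not additional code from the question):
// substituting row = by*TILE_WIDTH + ty and col = bx*TILE_WIDTH + tx:
//   d_a[(by*TILE_WIDTH + ty)*dim + TILE_WIDTH*i + tx]
//   d_b[(ty + i*TILE_WIDTH)*dim + bx*TILE_WIDTH + tx]
//   d_c[(by*TILE_WIDTH + ty)*dim + bx*TILE_WIDTH + tx]
// within a group of threads that share ty (and hence by and i), only tx varies,
// and it appears unscaled, so consecutive threads touch consecutive floats.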
I will appreciate if someone can point out the reason.
You have done something bad when it comes to shared memory.
In your tiled kernel, your multiplication operation looks like this:
res=res+ta[ty][j]*tb[j][tx];
For this case:
ta[ty][j]
we have a situation where all threads in the warp (which have linearly increasing tx values but the same ty value) are reading the same location in shared memory. This is an "optimal" access pattern - it does not present any bank conflicts, and will be serviced in the shortest possible time.
For this case:
tb[j][tx]
we have a situation where adjacent threads in the warp are reading adjacent locations in shared memory. This is also an "optimal", un-bank-conflicted pattern, and will be serviced in the shortest possible time.
However in your MatrixMult_tiling_coalesced kernel, the corresponding multiplication operation is:
res=res+ta[ty][j]*tb[tx][j];
Again, with this case:
ta[ty][j]
we have a shared memory "broadcast" pattern (all threads in a warp read from the same location) which is optimal and fast. But in this case:
tb[tx][j]
you have actually created columnar access into shared memory. This is the worst possible access pattern for shared memory, and it will result in a 32-way serialization (or possibly 16-way serialization, in the case of your 16x16 threadblocks) of the load process, and definitely worse performance. Why? Remember that for a given load, j is constant across the warp, and tx increases linearly across the warp. Therefore, let's say j is 1 on a particular loop iteration. The threads in warp 0 will read:
tb[0][1], tb[1][1], tb[2][1], tb[3][1], ...
and these locations all belong to a particular "column" of shared memory, i.e. they all belong to the same shared memory bank. This is the worst-case pattern for shared memory.
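(As an aside that is not part of the original answer: a common way to mitigate this kind of columnar shared memory access, if the transposed-B layout is kept, is to pad the tile with one extra column so that consecutive rows of a column fall into different banks. A minimal sketch:)
__shared__ float tb[TILE_WIDTH][TILE_WIDTH+1]; // +1 column skews the bank mapping; the extra column is never read
...
tb[ty][tx]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx]; // load unchanged
...
res=res+ta[ty][j]*tb[tx][j]; // tb[0][j], tb[1][j], ... now map to different banks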
For completeness, I claim that all of your global memory access patterns in your MatrixMult_tiling_coalesced kernel are also coalesced:
ta[ty][tx]=d_a[row*dim+TILE_WIDTH*i+tx];
tb[ty][tx]=d_b[bx*TILE_WIDTH*dim + TILE_WIDTH*i+ty*dim+tx];
...
d_c[row*dim+col]=res;
so there should be no major difference in the global memory access pattern/activity/efficiency between your two kernel implementations.
As a side note, I assume this is all a learning exercise. If you are interested in high-performance matrix multiply on the GPU, I would encourage you to consider using CUBLAS.
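For reference, a minimal sketch of a cuBLAS call for the square, row-major float matrices used above (assuming d_a, d_b, d_c are the device buffers already allocated in the question's code, and linking with -lcublas; cuBLAS expects column-major storage, so the operands are passed in swapped order to obtain a row-major C = A*B):
#include <cublas_v2.h>

cublasHandle_t handle;
cublasCreate(&handle);
const float alpha = 1.0f, beta = 0.0f;
// Row-major C = A*B is computed as column-major C^T = B^T * A^T,
// which amounts to passing B first and A second with no transposition flags.
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
            matDim, matDim, matDim,
            &alpha, d_b, matDim,
            d_a, matDim,
            &beta, d_c, matDim);
cublasDestroy(handle);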

Related

When I try to use multidimensional blocks or grids, part of my resulting array contains zeros

I slightly modified the CUDA 10.1 Runtime project to get acquainted with multidimensional blocks and grids. I use Visual Studio 2015 and an NVIDIA Quadro P400 video card. But in the resulting array, zero values follow after some correct results. What is wrong in the following program? It uses multidimensional blocks. The same happens with grids.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
const int arraySize = 448 * 1024;
int a[arraySize];
int b[arraySize];
int c[arraySize] = { 0 };
__global__ void addKernel(int *c, const int *a, const int *b)
{
int i = 256*(blockIdx.y*blockDim.x + threadIdx.y) + blockIdx.x*blockDim.x + threadIdx.x;
c[i] = a[i] + b[i];
}
int main()
{
int i;
for(i = 0; i < arraySize; i++)
{
a[i] = i;
b[i] = i;
}
// Add vectors in parallel.
cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
dim3 threads(256, 4, 1);
dim3 blocks(size >> 10, 1, 1);
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
// Launch a kernel on the GPU with one thread for each element.
addKernel << < blocks, threads >> >(dev_c, dev_a, dev_b);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
This calculation is incorrect:
int i = 256*(blockIdx.y*blockDim.x + threadIdx.y) + blockIdx.x*blockDim.x + threadIdx.x;
I'm not sure how you came up with that.
The calculation should take your thread index in x and add to it the row index in y times the grid width in x.
It should be:
int i = (blockIdx.y*blockDim.y+threadIdx.y)*(gridDim.x*blockDim.x) + blockIdx.x*blockDim.x + threadIdx.x;
// the row in y * grid width in x + thread index in x
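As a quick sanity check (my own arithmetic, not part of the original answer), the corrected formula covers the whole array exactly once with the launch configuration from the question:
// arraySize = 448*1024 = 458752, threads = (256,4,1), blocks = (448,1,1)
// grid width in x = gridDim.x*blockDim.x = 448*256 = 114688
// i = (blockIdx.y*blockDim.y + threadIdx.y)*114688 + blockIdx.x*256 + threadIdx.x
// with blockIdx.y == 0 and threadIdx.y in 0..3, the y term contributes 0, 114688,
// 229376 or 344064, and the x part spans 0..114687, so i covers 0..458751 with
// no gaps and no repeats.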

Why do I get "Unspecified Launch failure" in CUDA program, multiplying 2 matrices

I am new to CUDA. Multiplying a 1024x1024 matrix works when I launch the kernel with:
multiplyKernel << <dim3(32,32, 1), dim3(32, 32, 1) >> >(dev_c, dev_a, dev_b, size);
But when I multiply a 2048 x 2048 matrix, with
dim3(64,64,1)
I get this error:
cudaDeviceSynchronize returned error code 4 after launching addKernel!
unspecified launch failure
From tinkering with the code, I think that the error is in this statement
result += a[row * size + ind] * b[col + size * ind];
in the part
b[col+size*ind]
If I take that out, I don't get a kernel launch error (just the wrong answer, obviously). I cannot figure out what's wrong. Any suggestions would be most appreciated.
I am using Visual Studio 2013. I am using the debugger, but this does not help me find the error.
This seems to be a similar problem:
cudaDeviceSynchronize returned error code 4 after launching
Many thanks. Here is the code:
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size);
__global__ void multiplyKernel(int *c, const int *a, const int *b, unsigned int size)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row > size || col > size) return;
// target field in 1-D
int z = row * size + col;
int result = 0;
for (int ind = 0; ind < size ; ++ind) {
result += a[row * size + ind] * b[col + size * ind];
}
c[z] = result;
}
int main(){
const int sizeMatrix = 2048;
int* a = new int[sizeMatrix * sizeMatrix];
int* b = new int[sizeMatrix * sizeMatrix];
int* c = new int[sizeMatrix * sizeMatrix];
for (int i = 0; i < sizeMatrix * sizeMatrix; i++) {
a[i] = rand() % 2;
b[i] = rand() % 2;
}
cudaError_t cudaStatus = multiplyWithCuda(c, a, b, sizeMatrix);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
int *dev_a ;
int *dev_b;
int *dev_c;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
fprintf(stdout, "device set");
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for c allocated \n");
cudaStatus = cudaMalloc((void**)&dev_a, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for a allocated \n");
cudaStatus = cudaMalloc((void**)&dev_b, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for b allocated \n");
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
fprintf(stdout, "cudaMemcpy a done \n");
cudaStatus = cudaMemcpy(dev_b, b, size * size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
fprintf(stdout, "cudaMemcpy b done\n");
fprintf(stdout, "about to launch kernel \n");
// Launch a kernel on the GPU with one thread for each element.
multiplyKernel << <dim3(64,64, 1), dim3(32, 32, 1) >> >(dev_c, dev_a, dev_b, size);
fprintf(stdout, "kernel launched\n");
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
; fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
fprintf(stderr, " %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
On Windows, I right-clicked the Nsight Monitor icon in the system tray and chose Options > General. There is a WDDM TDR delay setting; it was at 2, and I increased it to 10. Then I ran my program again, and it worked fine.
This was according to Robert's link (see above)
http://http.developer.nvidia.com/NsightVisualStudio/2.2/Documentation/UserGuide/HTML/Content/Timeout_Detection_Recovery.htm

Cuda Matrix Multiplication -- not working for some non-square matrices

I am experimenting with CUDA programming at the moment. As part of this, I am attempting to develop a matrix multiplication algorithm to run on the GPU. The algorithm works for square matrices but fails for non-square matrices.
Here is my kernel
float* multiply_gpu(float* matrix1 , float* matrix2);
__global__ void mult(int rowsA , int columnsA, int rowsB,int columnsB, float *a,
float *b, float *result) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int result_size = rowsA*columnsB;
int value = 0;//the final result
//indices of values from input matrices
if (index < result_size) {
int index1 = (index/rowsA)*rowsA; //get nearest row
int index2 = index%columnsB; //get start column
int k = 0;
while (k<columnsA) { //columnsA == rowsB
value += a[index1]*b[index2]; //v = sum a_ik * b_kj
index1 ++;
index2 += columnsB;
k++;
}
result[index] = value;
}
}
My supervisor did a brief sanity check and did not see any issues with it.
I believe the problem lies in this function:
float* multiply_gpu(float* matrix1 , float* matrix2) {
//the dimensions of the matrices
size_t available, total;
cudaError_t error;
cudaError err = cudaMemGetInfo(&available, &total);
if(err != cudaSuccess){
printf("There was an error: %s\n", cudaGetErrorString(err));
}
int height1 = matrix1[0];
int width1 = matrix1[1];
int height2 = matrix2[0];
int width2 = matrix2[1];
if (width1!=height2) {
return NULL;
}
//this array contains the result of the operation
float* result = (float *) malloc(height1*width2*sizeof(float));
//pointers for device matrices
float *d_matrix1;
float *d_matrix2;
float *d_result;
//allocate memory for matrices
error = cudaMalloc((void **)&d_matrix1,(size_t)height1*width1*sizeof(float));
if (error != cudaSuccess) {
fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
error = cudaMalloc((void **)&d_matrix2,height2*width2*sizeof(float));
if (error != cudaSuccess) {
fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
error = cudaMalloc((void **)&d_result,height1*width2*sizeof(float));
if (error != cudaSuccess) {
fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//now copy matrices onto device -- note the offset of 2
error = cudaMemcpy(d_matrix1 , matrix1+2 , height1*width1*sizeof(float), cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
error = cudaMemcpy(d_matrix2 , matrix2+2 , height2*width2*sizeof(float), cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//launch multiplication kernel
//note I have tried adjusting the kernel values between <<< , >>> to no avail
mult<<<height1,width2>>>(height1,width1,height2,width2,d_matrix1,d_matrix2,d_result);
printf("%d %d %d %d\n",height1,width1,height2,width2);
//make the host block until mult is finished running
//printf("finished multiplying\n");
cudaDeviceSynchronize();
//copy result back
error = cudaMemcpy(result,d_result,height1*width2*sizeof(float),cudaMemcpyDeviceToHost);
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//free now unneeded cuda memory
cudaFree(d_matrix1);
cudaFree(d_matrix2);
cudaFree(d_result);
printf("GOT RESULT\n");
for (int i=0;i<height1*width2;i++) {
printf("%f ",result[i]);
}
printf("\n");
//result ready to be returned
return result;
}
Note that the matrices which are parameters to multiply_gpu have their height at index 0 and width at index 1. The result matrix does not have this information.
An example of incorrect computation:
When I feed the following arrays into multiply_gpu -- {2,3,1,2,3,4,5,6} and {3,2,1,2,3,4,5,6} -- the answer should be {22,28,49,64}, but instead my unit tests generate {22,28,40,52}. So close! Note that for the dot product (1,2,3)*(1,2,3) (which is not square) the algorithm is happy... What could be the error here? Thanks for any assistance. I will post a solution if I find one independently.
This line is wrong:
int index1 = (index/rowsA)*rowsA; //get nearest row
It should be something like this:
int index1 = (index/columnsB)*columnsA; //get nearest row
Why is this formulation correct? The index1 is used to index through the row elements in A that correspond to the row indicated by the output matrix position we are computing. The output matrix position is just the thread index. If we (integer) divide the thread index by the number of columns in the output matrix i.e. C, we get the row number in question. Then, to find the first element of that row in A, we multiply by the number of columns in A. This correctly indexes us to the first element of the relevant row in A.
Here's a complete app along with my test cases - the only change I made to your code was the change indicated above.
$ cat t290.cu
#include <stdio.h>
__global__ void mult(int rowsA , int columnsA, int rowsB,int columnsB, float *a, float *b, float *result) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int result_size = rowsA*columnsB;
int value = 0;//the final result
//indices of values from input matrices
if (index < result_size) {
int index1 = (index/columnsB)*columnsA; //get nearest row
int index2 = index%columnsB; //get start column
int k = 0;
while (k<columnsA) { //columnsA == rowsB
value += a[index1]*b[index2]; //v = sum a_ik * b_kj
index1 ++;
index2 += columnsB;
k++;
}
result[index] = value;
}
}
float* multiply_gpu(float* matrix1 , float* matrix2) {
//the dimensions of the matrices
size_t available, total;
cudaError_t error;
cudaError err = cudaMemGetInfo(&available, &total);
if(err != cudaSuccess){
printf("There was an error: %s\n", cudaGetErrorString(err));
}
int height1 = matrix1[0];
int width1 = matrix1[1];
int height2 = matrix2[0];
int width2 = matrix2[1];
if (width1!=height2) {
printf("fail!\n");
return NULL;
}
//this array contains the result of the operation
float* result = (float *) malloc(height1*width2*sizeof(float));
//pointers for device matrices
float *d_matrix1;
float *d_matrix2;
float *d_result;
//allocate memory for matrices
error = cudaMalloc((void **)&d_matrix1,(size_t)height1*width1*sizeof(float));
if (error != cudaSuccess) {
fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
error = cudaMalloc((void **)&d_matrix2,height2*width2*sizeof(float));
if (error != cudaSuccess) {
fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
error = cudaMalloc((void **)&d_result,height1*width2*sizeof(float));
if (error != cudaSuccess) {
fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//now copy matrices onto device -- note the offset of 2
error = cudaMemcpy(d_matrix1 , matrix1+2 , height1*width1*sizeof(float), cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
error = cudaMemcpy(d_matrix2 , matrix2+2 , height2*width2*sizeof(float), cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//launch multiplication kernel
//note I have tried adjusting the kernel values between <<< , >>> to no avail
mult<<<height1,width2>>>(height1,width1,height2,width2,d_matrix1,d_matrix2,d_result);
printf("%d %d %d %d\n",height1,width1,height2,width2);
error = cudaGetLastError();
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//make the host block until mult is finished running
//printf("finished multiplying\n");
error = cudaDeviceSynchronize();
if (error != cudaSuccess) {
fprintf(stderr, "kernel fail (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//copy result back
error = cudaMemcpy(result,d_result,height1*width2*sizeof(float),cudaMemcpyDeviceToHost);
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//free now unneeded cuda memory
cudaFree(d_matrix1);
cudaFree(d_matrix2);
cudaFree(d_result);
printf("GOT RESULT\n");
for (int i=0;i<height1*width2;i++) {
printf("%f ",result[i]);
}
printf("\n");
//result ready to be returned
return result;
}
int main(){
float m1[8] = {2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
float m2[6] = {2.0, 2.0, 1.0, 1.0, 2.0, 2.0};
float *my_result1 = multiply_gpu(m2, m1);
float m3[8] = {2,3,1,2,3,4,5,6};
float m4[8] = {3,2,1,2,3,4,5,6};
float *my_result2 = multiply_gpu(m3, m4);
float *my_result3 = multiply_gpu(m4, m3);
float m5[12] = {2,5,1,1,1,1,1,1,1,1,1,1};
float m6[22] = {5,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
float *my_result4 = multiply_gpu(m5, m6);
return 0;
}
$ nvcc -arch=sm_20 -o t290 t290.cu
t290.cu: In function 'float* multiply_gpu(float*, float*)':
t290.cu:30: warning: converting to 'int' from 'float'
t290.cu:31: warning: converting to 'int' from 'float'
t290.cu:32: warning: converting to 'int' from 'float'
t290.cu:33: warning: converting to 'int' from 'float'
$ cuda-memcheck ./t290
========= CUDA-MEMCHECK
2 2 2 3
GOT RESULT
5.000000 7.000000 9.000000 10.000000 14.000000 18.000000
2 3 3 2
GOT RESULT
22.000000 28.000000 49.000000 64.000000
3 2 2 3
GOT RESULT
9.000000 12.000000 15.000000 19.000000 26.000000 33.000000 29.000000 40.000000 51.000000
2 5 5 4
GOT RESULT
5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000
========= ERROR SUMMARY: 0 errors
$
So after carefully reviewing my matrix code I discovered a simple problem with
the mathematics of my operation.
It is true that this line was wrong:
int index1 = (index/rowsA)*rowsA; //get nearest row
I note that since my matrix is row-ordered, the formula for acquiring the correct index from element at (i,j) is
index = i*rowLength + j
Therefore the assignment to index1 should be
int index1 = (index/rowsA)*columnsA
Why? Well, it's obvious that to navigate to the index of row n we have to move by n row lengths (where the row length is the number of columns in the matrix). My code worked for square matrices but not for other rectangular ones, because in those matrices the number of columns does not match the number of rows.
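As a concrete check (my own arithmetic, based on the data given in the question), the original formula reproduces exactly the wrong values the unit test reported:
// A = {1,2,3, 4,5,6} (2x3), B = {1,2, 3,4, 5,6} (3x2), both row-major
// take index = 2, i.e. C[1][0]; rowsA = 2, columnsA = 3, columnsB = 2
// wrong:   index1 = (2/rowsA)*rowsA       = 2 -> starts at a[2] = 3 (middle of row 0)
//          3*1 + 4*3 + 5*5 = 40           (the reported wrong value)
// correct: index1 = (2/columnsB)*columnsA = 3 -> starts at a[3] = 4 (start of row 1)
//          4*1 + 5*3 + 6*5 = 49           (the expected value)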

memory fragmentation in cuda

I am having a memory allocation problem which I can't understand. I am trying to allocate a char array on the GPU (I am guessing it is probably a memory fragmentation issue).
Here is my code:
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<cuda.h>
inline void gpuAssert(cudaError_t code, char *file, int line,
int abort=1)
{
if (code != cudaSuccess) {
printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
__global__ void calc(char *k,char *i)
{
*i=*k;
}
int main()
{
char *dev_o=0;
char *i;
i = (char*)malloc(10*sizeof(char));
cudaMalloc((void**)&dev_o,10*sizeof(char)); //Line 31
calc<<<1,1>>>("arun",dev_o);
gpuErrchk(cudaMemcpy(&i,dev_o,10*sizeof(char),cudaMemcpyDeviceToHost));
cudaFree(dev_o);
printf("string : %s \n",i);
return 0;
}
but I'm getting this output:
GPUassert: out of memory sample2.cu 31
In the same code, I tried to allocate an integer on the GPU and it works properly.
My GPU device information is given below:
--- General Information for device 0 ---
Name:GeForce GTX 460 SE
Compute capability:2.1
Clock rate:1296000
Device copy overlap:Enabled
Kernel execition timeout :Enabled
--- Memory Information for device 0 ---
Total global mem:1073283072
Total constant Mem:65536
Max mem pitch:2147483647
Texture Alignment:512
--- MP Information for device 0 ---
Multiprocessor count:6
Shared mem per mp:49152
Registers per mp:32768
Threads in warp:32
Max threads per block:1024
Max thread dimensions:(1024, 1024, 64)
Max grid dimensions:(65535, 65535, 65535)
Can anyone tell me what the problem is and how I can overcome it?
Several things were wrong in your code.
cudaMemcpy(&i, ...) should be cudaMemcpy(i, ...).
Check the return error of your kernel call as explained in this post. If you don't, the error will seem to appear later in your code.
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
Your char *k argument to your kernel is a host pointer. You should create another device array and copy your data to the device before calling your kernel.
You were also not doing any parallel work on your threads in your calc() kernel since you were not using the thread indices, threadIdx.x. This was probably for testing though.
Here is what you would get if you fix these issues:
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<cuda.h>
inline void gpuAssert(cudaError_t code, char *file, int line,
int abort=1)
{
if (code != cudaSuccess) {
printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
__global__ void calc(char* k, char *i)
{
i[threadIdx.x] = k[threadIdx.x];
}
int main()
{
const char* msg = "arun";
char *dev_i, *dev_k;
char *i, *k;
k = (char*)malloc(10*sizeof(char));
i = (char*)malloc(10*sizeof(char));
sprintf(k, msg);
cudaMalloc((void**)&dev_i, 10*sizeof(char));
cudaMalloc((void**)&dev_k, 10*sizeof(char));
gpuErrchk(cudaMemcpy(dev_k, k, 10*sizeof(char), cudaMemcpyHostToDevice));
calc<<<1,5>>>(dev_k, dev_i);
gpuErrchk(cudaPeekAtLastError());
// Synchronization will be done in the next synchronous cudaMemCpy call, else
// you would need cudaDeviceSynchronize() to detect execution errors.
//gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(i, dev_i, 10*sizeof(char), cudaMemcpyDeviceToHost));
printf("string : %s\n", i);
cudaFree(dev_i);
cudaFree(dev_k);
free(i);
free(k);
return 0;
}

CUDA : How to allocate memory for data member of a class

suppose I have this class :
class Particle
{
double *_w;
};
And I want to send nParticles objects of Particle to my kernel. Allocating space for these objects is easy :
Particle *dev_p;
cudaStatus = cudaMalloc((void**)&dev_P, nParticles * sizeof(Particle));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
Also suppose that nParticles is 100. Now I need to allocate 300 doubles for each _w in a Particle object. How can I do this? I tried this code:
for( int i = 0; i < nParticles; i++){
cudaStatus = cudaMalloc((void**)&(dev_P[i]._w), 300 * sizeof(double));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
}
But debugging with Nsight stops when I access dev_p[i]._w[j].
Perhaps you should include a complete simple example. (If I compile your code above and run it by itself, on linux, I get a seg fault at the second cudaMalloc operation). One wrinkle I see is that since you have in the first step allocated the particle objects in device memory, when you go to allocate the _w pointers, you are passing a pointer to cudaMalloc that is already in device memory. You're supposed to pass a host-based pointer to cudaMalloc, which it will then assign to the allocated area in device (global) memory.
One possible solution that I think conforms to what I see in your example is like this:
#include <stdio.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
class Particle
{
public:
double *_w;
};
__global__ void test(Particle *p){
int idx=threadIdx.x + blockDim.x*blockIdx.x;
if (idx == 2){
printf("dev_p[2]._w[2] = %f\n", p[idx]._w[2]);
}
}
int main() {
int nParticles=100;
Particle *dev_p;
double *w[nParticles];
cudaMalloc((void**)&dev_p, nParticles * sizeof(Particle));
cudaCheckErrors("cudaMalloc1 fail");
for( int i = 0; i < nParticles; i++){
cudaMalloc((void**)&(w[i]), 300 * sizeof(double));
cudaCheckErrors("cudaMalloc2 fail");
cudaMemcpy(&(dev_p[i]._w), &(w[i]), sizeof(double *), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy1 fail");
}
double testval = 32.7;
cudaMemcpy(w[2]+2, &testval, sizeof(double), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy2 fail");
test<<<1, 32>>>(dev_p);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
printf("Done!\n");
}
Here we are creating a separate set of pointers on the host to use for cudaMalloc purposes, then copying those allocated pointers down to the device for use as device pointers (this is legal with UVA).
Another approach would be to allocate the _w pointers on the device side. This may serve your purposes as well.
All of the above I am assuming cc 2.0 or greater.
Using a methodology similar to what is described here, it may be possible to collapse the device-side allocations done in a loop down to a single allocation:
cudaMalloc(&(w[0]), nParticles*300*sizeof(double));
cudaCheckErrors("cudaMalloc2 fail");
cudaMemcpy(&(dev_p[0]._w), &(w[0]), sizeof(double *), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy1 fail");
for( int i = 1; i < nParticles; i++){
w[i] = w[i-1] + 300;
cudaMemcpy(&(dev_p[i]._w), &(w[i]), sizeof(double *), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy1 fail");
}
The cudaMemcpy operations still have to be done individually.
There are two ways of doing it. The first: you allocate the memory on the host, filling up a host array of Particle objects; once complete, you copy the host array to the device with cudaMemcpy.
The second: on Fermi and higher you can call malloc in the kernel, filling the dev_P array from the kernel.
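A minimal sketch of what that second, in-kernel approach might look like (my own illustration with a hypothetical alloc_w kernel; device-side malloc requires cc 2.0+, the allocations live on the device heap whose size may need to be raised with cudaDeviceSetLimit, and that memory cannot be filled directly from the host with cudaMemcpy):
__global__ void alloc_w(Particle *p, int nParticles, int n)
{
    int idx = threadIdx.x + blockDim.x*blockIdx.x;
    if (idx < nParticles)
        p[idx]._w = (double*)malloc(n*sizeof(double)); // per-object device-heap allocation
}

// host side, before the launch:
// cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128*1024*1024); // enlarge heap if needed
// alloc_w<<<(nParticles+255)/256, 256>>>(dev_p, nParticles, 300);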
