Why is this not vectorizing?
__attribute__((num_simd_work_items(4)))
__attribute__((num_compute_units(2)))
__attribute__((reqd_work_group_size(16,16,1)))
__kernel void matrix_multiplication(const int fDIM,const int gDIM, const int hDIM,
__global float* A, __global float* B, __global float* C) {
int k;
int i = get_global_id(0);
int j = get_global_id(1);
float temp_result;
if((i < gDIM) && (j<fDIM)){
temp_result= 0.0f;
for(k = 0; k<hDIM;k++) {
temp_result+= A[i*gDIM+k] * B[k*hDIM+j];
}
C[i*gDIM+j] = temp_result;
}
}
Compiler Warning:
Kernel Vectorization: branching is thread ID dependent ... cannot vectorize.
Q: Why is this not vectorizing?
The culprit is the "branching … cannot vectorize" warning; it points at this statement:
if( ( i < gDIM ) && ( j < fDIM ) ){ ... }
Efficient SIMD vectorisation requires that all code-execution flows are non-divergent (not branched) and execute the very same instruction on their data: the data elements are "glued" into vectors, placed in wide, SIMD-friendly CPU registers, and computed at once by a single SIMD instruction that is identical for every element in the pack. An if(){...}else{...} construct diverges the flow into different instruction sequences for different data elements, which breaks exactly that.
It is fundamentally impossible to perform different operations on different parts of the data packed into one SIMD register: one and only one SIMD instruction can be executed at a time for all vector components stored in that register.
Hardware details of integer and float SIMD instructions vary, as do the resulting micro-op latencies, and the processor-specific details known to the compiler matter a lot, yet the principle of avoiding divergent paths is common to all automated SIMD vectorisation in the compilation phase. For more details on SIMD instructions and their further performance-limiting properties, read Agner Fog's optimisation manuals.
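In practice, the usual cure is to remove the guard itself: pad the global work sizes (and the buffers) so that every work-item is guaranteed to be in range, and the thread-ID-dependent if() disappears. Below is a minimal sketch of that guard-free structure, written as a CUDA kernel purely to illustrate the principle (matmul_padded and the M, N, K parameters are illustrative names, and the assumption is that all dimensions are exact multiples of the block size); CUDA hardware masks divergent lanes rather than refusing to vectorise, but the branch-free shape is exactly what the OpenCL vectoriser wants to see:

__global__ void matmul_padded(const float* A, const float* B, float* C,
                              const int M, const int N, const int K)
{
    // No "if (i < M && j < N)" guard: the padded launch configuration
    // guarantees that every thread is in range (M is implied by the grid size).
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    float acc = 0.0f;
    for (int k = 0; k < K; k++)              // identical trip count for every thread
        acc += A[i * K + k] * B[k * N + j];
    C[i * N + j] = acc;
}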
I am trying to implement matrix multiplication using CUDA. I have two matrices of order M×w and w×N. I launch (w*w) threads in each block, with grid dimensions (M/w, N/w). I created a matrix in shared memory of size 32*32. I want to implement matrix multiplication using only one matrix in shared memory. Here's my code:
#include<stdio.h>
#include<cuda.h>
#include<stdlib.h>
#include<stdlib.h>
#include<unistd.h>
#include<math.h>
__global__ void add(int *a,int *b, int *c,int *p,int *q){
// __shared__ int aTile[*p][*p];
//const int a=*p;
__shared__ int aTile[32][32];
int row = blockIdx.x*blockDim.x+threadIdx.x;
int col = blockIdx.y*blockDim.y+threadIdx.y;
int sum=0;
aTile[threadIdx.x][threadIdx.y] = a[row*(*p)+threadIdx.y];
__syncthreads();
if(row< *q && col< *q)
{
for(int k=0;k<*p;k++)
{
sum+= aTile[threadIdx.x][k]*b[col+(*q)*k];
// __syncthreads();
}
c[col+(*q)*row]=sum;
//__syncthreads();
}
}
int main(){
printf("Enter the number of rows of matrix 1\n");
int row_1;
scanf("%d",&row_1);
printf("Enter the number of columns of matrix 1\n");
int col_1;
scanf("%d",&col_1);
/*printf("Enter the values of matrix 1 \n");
*/
int a[row_1][col_1];
for(int i=0;i<row_1;i++)
{
for(int j=0;j<col_1;j++)
{
//scanf("%d",&a[i][j]);
a[i][j]=1;
}
}
printf("Enter the number of rows of matrix 2\n");
int row_2;
scanf("%d",&row_2);
printf("Enter the number of columns of matrix 2\n");
int col_2;
scanf("%d",&col_2);
/* printf("Enter the values of matrix 2 \n");
*/
int b[row_2][col_2];
for(int i=0;i<row_2;i++)
{
for(int j=0;j<col_2;j++)
{
// scanf("%d",&b[i][j]);
b[i][j]=1;
}
}
int c[row_1][col_2];
//dim3 dimBlock(col_1, col_1);// in one block u have row_1*col_2 threads;
dim3 dimBlock(col_1,col_1);
//dim3 dimGrid((row_1/col_1)+1,(col_2/col_1)+1); // in one grid you have 1*1 blocks
dim3 dimGrid(ceil(row_1/col_1),ceil(col_2/col_1));
int *p;
int *q;
int *dev_a,*dev_b,*dev_c;
int size_a=row_1*col_1*sizeof(int);
int size_b=row_2*col_2*sizeof(int);
int size_c = row_1*col_2*sizeof(int);
cudaMalloc((void**)&dev_a,size_a);
cudaMalloc((void**)&dev_b,size_b);
cudaMalloc((void**)&dev_c,size_c);
cudaMalloc((void**)&p,sizeof(int));
cudaMalloc((void**)&q,sizeof(int));
cudaMemcpy(dev_a,a,size_a,cudaMemcpyHostToDevice);
cudaMemcpy(dev_b,b,size_b,cudaMemcpyHostToDevice);
cudaMemcpy(dev_c,c,size_c,cudaMemcpyHostToDevice);
cudaMemcpy(p,&col_1,sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(q,&col_2,sizeof(int),cudaMemcpyHostToDevice);
add<<<dimGrid,dimBlock>>>(dev_a,dev_b,dev_c,p,q);
cudaMemcpy(c,dev_c,size_c,cudaMemcpyDeviceToHost);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
printf("output matrix is : \n");
for(int i=0;i<10;i++)
{
for(int j=0;j<10;j++)
{
printf("%d ",c[i][j]);
}
printf("\n");
}
}
I get the correct output when I multiply matrices of size 32*32 and 32*32, but when I multiply matrices of size 33*33 and 33*33 (and above), the resulting matrix contains all zeros. I have tried to increase the size of the matrix in shared memory, but then I get the following error:
ptxas error : Entry function '_Z3addPiS_S_S_S_' uses too much shared data (0x10038 bytes, 0x4000 max)
I am pretty new to CUDA. Sorry if this is too basic a question.
This is a basic question and has been answered many times over.
First of all, use proper cuda error checking any time you are having trouble with a CUDA code. In this case, you would have received an error that would have been instructive.
CUDA kernels have a limit on the maximum number of threads per threadblock. That limit (under CUDA 7, 7.5RC, currently) is 1024 threads per block, on all supported devices. The number of threads per block is specified (in this case) by your dimBlock variable, and it is the product of the terms in each dimension:
dim3 dimBlock(col_1,col_1);
add<<<dimGrid,dimBlock>>>(dev_a,dev_b,dev_c,p,q);
Therefore, when col_1 is 32, you are requesting 32x32 threads (1024) which is the maximum. Any value above 32x32 will fail for this reason. (Your kernel will not launch. No kernel code will get executed when you specify 33x33 here.)
Rather than rewrite this code to fix all the issues, I suggest you study any of the dozens of questions already asked about matrix multiplication, here on the cuda tag. In fact, if you want to see a shared memory optimized code for naive matrix multiplication in CUDA, there is a full example in the programming guide (including both the non-shared version and the shared version for comparison).
And again, I suggest you implement proper cuda error checking before asking for help here. Even if you don't understand the error results, it will be useful information for those who are trying to help you.
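For reference, catching this particular failure takes only a few lines around the launch. A minimal, self-contained sketch (the oversized 33x33 block is deliberate, and error handling is reduced to printf for brevity):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void dummy() {}

int main() {
    dim3 bad(33, 33);                      // 1089 threads per block, over the 1024 limit
    dummy<<<1, bad>>>();
    cudaError_t err = cudaGetLastError();  // reports the invalid launch configuration
    if (err != cudaSuccess)
        printf("launch error: %s\n", cudaGetErrorString(err));
    err = cudaDeviceSynchronize();         // reports errors that occur during kernel execution
    if (err != cudaSuccess)
        printf("execution error: %s\n", cudaGetErrorString(err));
    return 0;
}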
You have an overflow in this line:
aTile[threadIdx.x][threadIdx.y] = a[row*(*p)+threadIdx.y];
knowing that aTile is defined as __shared__ int aTile[32][32];
If you want to do tiling, you'll have to loop over the number of tiles needed to cover your matrices.
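To make that concrete, here is a minimal sketch of such a tile loop that keeps only one tile (of A) in shared memory, as in the question; matMulOneTile and the M/K/N parameter names are illustrative, not the asker's code:

#define TILE 32

__global__ void matMulOneTile(const int* A, const int* B, int* C,
                              int M, int K, int N)   // A is MxK, B is KxN, C is MxN
{
    __shared__ int aTile[TILE][TILE];
    int row = blockIdx.y * TILE + threadIdx.y;       // row of C this thread computes
    int col = blockIdx.x * TILE + threadIdx.x;       // column of C this thread computes
    int sum = 0;

    for (int t = 0; t < (K + TILE - 1) / TILE; ++t) {
        int aCol = t * TILE + threadIdx.x;
        // stage one TILE-wide slice of A, padding with zeros at the edges
        aTile[threadIdx.y][threadIdx.x] = (row < M && aCol < K) ? A[row * K + aCol] : 0;
        __syncthreads();
        for (int k = 0; k < TILE; ++k) {
            int bRow = t * TILE + k;
            if (bRow < K && col < N)                 // B is read straight from global memory
                sum += aTile[threadIdx.y][k] * B[bRow * N + col];
        }
        __syncthreads();                             // finish with this tile before reloading it
    }
    if (row < M && col < N)
        C[row * N + col] = sum;
}

Launched with dim3 block(TILE, TILE) and dim3 grid((N + TILE - 1) / TILE, (M + TILE - 1) / TILE), this stays within the 1024-threads-per-block limit no matter how large the matrices are.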
I'm playing around with OpenCL, Geforce GTX550 and driver version 331.38 from Ubuntu 14.04. What stumps me is the speed of copying from global to local memory. As far as I know, the following code should do coalesced access to global memory:
void toLocal(__local float* target, const __global float* source, int count) {
const int iterations = (count + get_local_size(0) - 1) / get_local_size(0);
for (int i = 0; i < iterations; i++) {
int idx = i * get_local_size(0) + get_local_id(0);
if (idx < count)
target[idx] = source[idx];
}
}
In practice, the following code (which should use all threads to copy the same float over and over again) is measurably faster:
void toLocal(__local float* target, const __global float* source, int count) {
for (int i = 0; i < count; i++)
target[i] = source[i];
}
Both source and target point directly at the beginning of a buffer, so I would guess they are correctly aligned. The work-group size is 16 by 16; trying to use all threads makes the code more complex but doesn't affect speed. The optimal coalescing granularity would be 128 bytes, or 32 floats, but as far as I know, on compute capability 2.x cards (which the GTX 550 is) the penalty for using only part of a transaction, or even permuting the elements, shouldn't be that bad. Adding a local memory fence to the first version only makes it slower. Is there anything else I missed?
EDIT: Changing the group size to 32 by 32 made the parallel version roughly as fast as the sequential 16-by-16 version, and made the sequential version slightly slower. Still not the speed improvement I was expecting.
When using cub::BlockRadixSort to do the sorting within a block, if the number of elements is too large, how do we deal with that? If we set the tile size too large, the shared memory for the temporary storage will soon not be able to hold it. If we split it into multiple tiles, how do we post-process them after we have sorted each tile?
Caveat: I am not a cub expert (far from it).
You might want to review this question/answer as I'm building on some of the work I did there.
Certainly if the problem size is large enough, then a device-wide sort would seem to be something you might want to consider. But your question seems focused on block sorting.
From my testing, cub doesn't really have requirements around where your original data is located, or where you place the temp storage. Therefore, one possible solution would be simply to place your temp storage in global memory. To analyze this, I created a code that has 3 different test cases:
Test a version of cub block sort with the temp storage in global memory.
Test the original version of cub block sort adapted from the example here
Test a version of cub block sort derived from my previous answer, where there is no copying of data to/from global memory, i.e. it is assumed that the data is already resident "on-chip", i.e. in shared memory.
None of this is extensively tested, but since I am building on cub building blocks, and testing my results in the first two cases, hopefully I have not made any grievous errors. Here's the full test code, and I will make additional comments below:
$ cat t10.cu
#include <cub/cub.cuh>
#include <stdio.h>
#include <stdlib.h>
#include <thrust/sort.h>
#define nTPB 512
#define ELEMS_PER_THREAD 2
#define RANGE (nTPB*ELEMS_PER_THREAD)
#define DSIZE (nTPB*ELEMS_PER_THREAD)
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
using namespace cub;
// GLOBAL CUB BLOCK SORT KERNEL
// Specialize BlockRadixSort collective types
typedef BlockRadixSort<int, nTPB, ELEMS_PER_THREAD> my_block_sort;
__device__ int my_val[DSIZE];
__device__ typename my_block_sort::TempStorage sort_temp_stg;
// Block-sorting CUDA kernel (nTPB threads each owning ELEMS_PER THREAD integers)
__global__ void global_BlockSortKernel()
{
// Collectively sort the keys
my_block_sort(sort_temp_stg).Sort(*static_cast<int(*)[ELEMS_PER_THREAD]>(static_cast<void*>(my_val+(threadIdx.x*ELEMS_PER_THREAD))));
}
// ORIGINAL CUB BLOCK SORT KERNEL
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void BlockSortKernel(int *d_in, int *d_out)
{
// Specialize BlockLoad, BlockStore, and BlockRadixSort collective types
typedef cub::BlockLoad<int*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE> BlockLoadT;
typedef cub::BlockStore<int*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_TRANSPOSE> BlockStoreT;
typedef cub::BlockRadixSort<int, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT;
// Allocate type-safe, repurposable shared memory for collectives
__shared__ union {
typename BlockLoadT::TempStorage load;
typename BlockStoreT::TempStorage store;
typename BlockRadixSortT::TempStorage sort;
} temp_storage;
// Obtain this block's segment of consecutive keys (blocked across threads)
int thread_keys[ITEMS_PER_THREAD];
int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);
BlockLoadT(temp_storage.load).Load(d_in + block_offset, thread_keys);
__syncthreads(); // Barrier for smem reuse
// Collectively sort the keys
BlockRadixSortT(temp_storage.sort).Sort(thread_keys);
__syncthreads(); // Barrier for smem reuse
// Store the sorted segment
BlockStoreT(temp_storage.store).Store(d_out + block_offset, thread_keys);
}
// SHARED MEM CUB BLOCK SORT KERNEL
// Block-sorting CUDA kernel (nTPB threads each owning ELEMS_PER THREAD integers)
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void shared_BlockSortKernel(int *d_out)
{
__shared__ int my_val[BLOCK_THREADS*ITEMS_PER_THREAD];
// Specialize BlockRadixSort collective types
typedef BlockRadixSort<int, BLOCK_THREADS, ITEMS_PER_THREAD> my_block_sort;
// Allocate shared memory for collectives
__shared__ typename my_block_sort::TempStorage sort_temp_stg;
// need to extend synthetic data for ELEMS_PER_THREAD > 1
my_val[threadIdx.x*ITEMS_PER_THREAD] = (threadIdx.x + 5); // synth data
my_val[threadIdx.x*ITEMS_PER_THREAD+1] = (threadIdx.x + BLOCK_THREADS + 5); // synth data
__syncthreads();
// printf("thread %d data = %d\n", threadIdx.x, my_val[threadIdx.x*ITEMS_PER_THREAD]);
// Collectively sort the keys
my_block_sort(sort_temp_stg).Sort(*static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(my_val+(threadIdx.x*ITEMS_PER_THREAD))));
__syncthreads();
// printf("thread %d sorted data = %d\n", threadIdx.x, my_val[threadIdx.x*ITEMS_PER_THREAD]);
if (threadIdx.x == clock()){ // dummy to prevent compiler optimization
d_out[threadIdx.x*ITEMS_PER_THREAD] = my_val[threadIdx.x*ITEMS_PER_THREAD];
d_out[threadIdx.x*ITEMS_PER_THREAD+1] = my_val[threadIdx.x*ITEMS_PER_THREAD+1];}
}
int main(){
int *h_data, *h_result;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
h_data=(int *)malloc(DSIZE*sizeof(int));
h_result=(int *)malloc(DSIZE*sizeof(int));
if (h_data == 0) {printf("malloc fail\n"); return 1;}
if (h_result == 0) {printf("malloc fail\n"); return 1;}
for (int i = 0 ; i < DSIZE; i++) h_data[i] = rand()%RANGE;
// first test sorting directly out of global memory
global_BlockSortKernel<<<1,nTPB>>>(); //warm up run
cudaDeviceSynchronize();
cudaMemcpyToSymbol(my_val, h_data, DSIZE*sizeof(int));
cudaCheckErrors("memcpy to symbol fail");
cudaEventRecord(start);
global_BlockSortKernel<<<1,nTPB>>>(); //timing run
cudaEventRecord(stop);
cudaDeviceSynchronize();
cudaCheckErrors("cub 1 fail");
cudaEventSynchronize(stop);
float et;
cudaEventElapsedTime(&et, start, stop);
cudaMemcpyFromSymbol(h_result, my_val, DSIZE*sizeof(int));
cudaCheckErrors("memcpy from symbol fail");
if(!thrust::is_sorted(h_result, h_result+DSIZE)) { printf("sort 1 fail!\n"); return 1;}
printf("global Elapsed time: %fms\n", et);
printf("global Kkeys/s: %d\n", (int)(DSIZE/et));
// now test original CUB block sort copying global to shared
int *d_in, *d_out;
cudaMalloc((void **)&d_in, DSIZE*sizeof(int));
cudaMalloc((void **)&d_out, DSIZE*sizeof(int));
cudaCheckErrors("cudaMalloc fail");
BlockSortKernel<nTPB, ELEMS_PER_THREAD><<<1, nTPB>>>(d_in, d_out); // warm up run
cudaMemcpy(d_in, h_data, DSIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaEventRecord(start);
BlockSortKernel<nTPB, ELEMS_PER_THREAD><<<1, nTPB>>>(d_in, d_out); // timing run
cudaEventRecord(stop);
cudaDeviceSynchronize();
cudaCheckErrors("cub 2 fail");
cudaEventSynchronize(stop);
cudaEventElapsedTime(&et, start, stop);
cudaMemcpy(h_result, d_out, DSIZE*sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy D to H fail");
if(!thrust::is_sorted(h_result, h_result+DSIZE)) { printf("sort 2 fail!\n"); return 1;}
printf("CUB Elapsed time: %fms\n", et);
printf("CUB Kkeys/s: %d\n", (int)(DSIZE/et));
// now test shared memory-only version of block sort
shared_BlockSortKernel<nTPB, ELEMS_PER_THREAD><<<1, nTPB>>>(d_out); // warm-up run
cudaEventRecord(start);
shared_BlockSortKernel<nTPB, ELEMS_PER_THREAD><<<1, nTPB>>>(d_out); // timing run
cudaEventRecord(stop);
cudaDeviceSynchronize();
cudaCheckErrors("cub 3 fail");
cudaEventSynchronize(stop);
cudaEventElapsedTime(&et, start, stop);
printf("shared Elapsed time: %fms\n", et);
printf("shared Kkeys/s: %d\n", (int)(DSIZE/et));
return 0;
}
$ nvcc -O3 -arch=sm_20 -o t10 t10.cu
$ ./t10
global Elapsed time: 0.236960ms
global Kkeys/s: 4321
CUB Elapsed time: 0.042816ms
CUB Kkeys/s: 23916
shared Elapsed time: 0.040192ms
shared Kkeys/s: 25477
$
For this test, I am using CUDA 6.0RC, cub v1.2.0 (which is pretty recent), RHEL5.5/gcc4.1.2, and a Quadro5000 GPU (cc2.0, 11SMs, approximately 40% slower than a GTX480). Here are some observations that occur to me:
The speed ratio of the original cub sort(2) to the global memory sort(1) is approximately 6:1, which is approximately the bandwidth ratio of shared memory (~1TB/s) to global memory (~150GB/s).
The original cub sort (2) has a throughput that, when scaled by the number of SMs (11), yields 263 MKeys/s, a sizeable fraction of the best device-wide sort I have seen on this device (thrust sort, ~480 MKeys/s).
The shared-memory-only sort is not much faster than the original cub sort, which copies input/output from/to global memory, indicating that the copies between global memory and shared memory are not a large fraction of the overall processing time.
The 6:1 penalty is a large one to pay. So my recommendation would be, if possible, to use a device-wide sort on problem sizes larger than what can be handled easily by cub block sorting. This lets you tap into the expertise of some of the best GPU code writers for your sorting, and achieve throughputs much closer to what the device as a whole is capable of.
Note that, so I could test under similar conditions, the problem size here (512 threads, 2 elements per thread) does not exceed what you can do in a CUB block sort. But it's not difficult to extend the data set to larger sizes (say, 1024 elements per thread) that can only be handled (in this context, among these choices) using the first approach. If I do larger problem sizes like that, I observe a throughput of around 6 MKeys/s for the global-memory block sort on my cc2.0 device.
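For completeness, here is a minimal sketch of the device-wide alternative mentioned above, using cub::DeviceRadixSort (d_in, d_out and n are placeholder names, and error checking is omitted):

#include <cub/cub.cuh>

void deviceSortKeys(int *d_in, int *d_out, int n)
{
    void   *d_temp_storage = NULL;
    size_t  temp_storage_bytes = 0;
    // first call only computes the required temp storage size
    cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_in, d_out, n);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    // second call performs the actual sort
    cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_in, d_out, n);
    cudaFree(d_temp_storage);
}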
Let me take hardware with compute capability 1.3 as an example.
30 SMs are available, so at most 240 blocks can be running at the same time (considering the limits on registers and shared memory, the actual number of resident blocks may be much lower). Blocks beyond the first 240 have to wait for hardware resources to become available.
My question is when the blocks beyond 240 get assigned to SMs: as soon as some of the first 240 blocks complete, or only when all of the first 240 blocks are finished?
I wrote such a piece of code.
#include<stdio.h>
#include<string.h>
#include<cuda_runtime.h>
#include<cutil_inline.h>
const int BLOCKNUM = 1024;
const int N=240;
__global__ void kernel ( volatile int* mark ) {
if ( blockIdx.x == 0 ) while ( mark[N] == 0 );
if ( threadIdx.x == 0 ) mark[blockIdx.x] = 1;
}
int main() {
int * mark;
cudaMalloc ( ( void** ) &mark, sizeof ( int ) *BLOCKNUM );
cudaMemset ( mark, 0, sizeof ( int ) *BLOCKNUM );
kernel <<< BLOCKNUM, 1>>> ( mark );
cudaFree ( mark );
return 0;
}
This code causes a deadlock and fails to terminate. But if I change N from 240 to 239, the code is able to terminate. So I want to know some details about the scheduling of blocks.
On the GT200, it has been demonstrated through micro-benchmarking that new blocks are scheduled whenever an SM has retired all the blocks it was currently running. So the answer is: when some blocks are finished, and the scheduling granularity is the SM level. There seems to be a consensus that Fermi GPUs have a finer scheduling granularity than previous generations of hardware.
I can't find any reference about this for compute capabilities < 1.3.
Fermi architectures introduce a new block dispatcher called GigaThread engine.
GigaThread enables immediate replacement of blocks on an SM when one completes executing and also enables concurrent kernel execution.
While there is no official answer to this, you can measure through atomic operations when your blocks begin their work and when they end.
Try playing with the following code:
#include <stdio.h>
const int maxBlocks=60; //Number of blocks of size 512 threads on current device required to achieve full occupancy
__global__ void emptyKernel() {}
__global__ void myKernel(int *control, int *output) {
if (threadIdx.x==1) {
//register that we enter
int enter=atomicAdd(control,1);
output[blockIdx.x]=enter;
//some intensive and long task
int &var=output[blockIdx.x+gridDim.x]; //var references global memory
var=1;
for (int i=0; i<12345678; ++i) {
var+=1+tanhf(var);
}
//register that we quit
var=atomicAdd(control,1);
}
}
int main() {
int *gpuControl;
cudaMalloc((void**)&gpuControl, sizeof(int));
int cpuControl=0;
cudaMemcpy(gpuControl,&cpuControl,sizeof(int),cudaMemcpyHostToDevice);
int *gpuOutput;
cudaMalloc((void**)&gpuOutput, sizeof(int)*maxBlocks*2);
int cpuOutput[maxBlocks*2];
for (int i=0; i<maxBlocks*2; ++i) //clear the host array just to be on the safe side
cpuOutput[i]=-1;
// play with these values
const int thr=479;
const int p=13;
const int q=maxBlocks;
//I found that this may actually affect the scheduler! Try with and without this call.
emptyKernel<<<p,thr>>>();
cudaEvent_t timerStart;
cudaEvent_t timerStop;
cudaEventCreate(&timerStart);
cudaEventCreate(&timerStop);
cudaThreadSynchronize();
cudaEventRecord(timerStart,0);
myKernel<<<q,512>>>(gpuControl, gpuOutput);
cudaEventRecord(timerStop,0);
cudaEventSynchronize(timerStop);
cudaMemcpy(cpuOutput,gpuOutput,sizeof(int)*maxBlocks*2,cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
float thisTime;
cudaEventElapsedTime(&thisTime,timerStart,timerStop);
cudaEventDestroy(timerStart);
cudaEventDestroy(timerStop);
printf("Elapsed time: %f\n",thisTime);
for (int i=0; i<q; ++i)
printf("%d: %d-%d\n",i,cpuOutput[i],cpuOutput[i+q]);
}
What you get in the output is the block ID, followed by the enter "time" and exit "time". This way you can learn in which order those events occurred.
On Fermi, I'm sure that a block is scheduled on an SM as soon as there is room for it, i.e., whenever an SM finishes executing one block, it will execute another block if any blocks are left. (However, the actual order is not deterministic.)
On older architectures, I don't know, but you can verify the behaviour by using the built-in clock() function.
For example, I used the following OpenCL kernel code (you can easily convert it to CUDA):
__kernel void test(uint* start, uint* end, float* buffer)
{
int id = get_global_id(0);
start[id] = clock();
__do_something_here;
end[id] = clock();
}
Then output the timestamps to a file and build a graph; you will see very visually in what order the blocks were scheduled.
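For reference, a rough CUDA equivalent of that kernel might look like the sketch below (the workload in the middle is whatever you want to profile; note that clock() returns a per-SM cycle counter, so only compare timestamps taken on the same SM):

__global__ void test(unsigned int *start, unsigned int *end, float *buffer)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    start[id] = (unsigned int)clock();
    // ... do something here ...
    end[id] = (unsigned int)clock();
}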