Sparse plus dense matrix operation using cuSPARSE

Is it possible to add a sparse matrix and a dense matrix using cuSPARSE? In cuBLAS, I'd just treat the matrices as vectors and use axpy. cuSPARSE does have axpy for a sparse vector plus a dense vector, but it cannot be used for matrices because sparse vectors and sparse matrices have different storage layouts.

cuSPARSE has dense-to-sparse and sparse-to-dense conversion routines. You could either:
convert the sparse matrix to dense (e.g. with cusparse<t>csr2dense), then add the two with cublas<t>geam, producing a dense matrix result, or
convert the dense matrix to sparse (e.g. with cusparse<t>dense2csr), then use cusparse<t>csrgeam to produce a sparse result.
Note that using cusparse<t>csrgeam is a little more involved than a single function call, but the usage methodology is given in the documentation. Also, when using cusparse<t>dense2csr, you will likely want to call cusparse<t>nnz first to size the storage allocations you need.
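As a rough sketch of the first option, using the same legacy cuSPARSE/cuBLAS API generation as the worked example below (handle and array names here are placeholders, error checking omitted):
// --- Sketch only: d_csrVal/d_csrRowPtr/d_csrColInd hold A in CSR form,
// --- d_B is the dense operand; both matrices are M x N, column-major, lda = M
double *d_Adense, *d_C;
cudaMalloc(&d_Adense, M * N * sizeof(double));
cudaMalloc(&d_C, M * N * sizeof(double));
// --- CSR -> dense conversion of A
cusparseDcsr2dense(cusparseHandle, M, N, descrA, d_csrVal, d_csrRowPtr, d_csrColInd, d_Adense, M);
// --- C = 1.0 * A + 1.0 * B with cuBLAS geam (requires a separate cuBLAS handle)
const double alpha = 1.0, beta = 1.0;
cublasDgeam(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, &alpha, d_Adense, M, &beta, d_B, M, d_C, M);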

Here is a fully worked example with a custom kernel adding a sparse matrix A, stored in CSR format, to a dense matrix B and producing a dense matrix C. The kernel explicitly deals with the mapping between the CSR and dense indices.
#include <stdio.h>
#include <assert.h>
#include <cusparse.h>
#define BLOCKSIZEX 16
#define BLOCKSIZEY 16
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
if (CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %d, error %s\nterminating!\n", file, line,
_cusparseGetErrorEnum(err));
assert(0);
}
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/*****************************/
/* SETUP DESCRIPTOR FUNCTION */
/*****************************/
void setUpDescriptor(cusparseMatDescr_t &descrA, cusparseMatrixType_t matrixType, cusparseIndexBase_t indexBase) {
cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSafeCall(cusparseSetMatType(descrA, matrixType));
cusparseSafeCall(cusparseSetMatIndexBase(descrA, indexBase));
}
/********************************************************/
/* DENSE TO SPARSE CONVERSION FOR REAL DOUBLE PRECISION */
/********************************************************/
void dense2SparseD(const double * __restrict__ d_A_dense, int **d_nnzPerVector, double **d_A,
int **d_A_RowIndices, int **d_A_ColIndices, int &nnz, cusparseMatDescr_t descrA,
const cusparseHandle_t handle, const int M, const int N) {
const int lda = M; // --- Leading dimension of dense matrix
gpuErrchk(cudaMalloc(&d_nnzPerVector[0], M * sizeof(int)));
// --- Compute the number of nonzero elements per row and the total number of nonzero elements in the dense d_A_dense
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, M, N, descrA, d_A_dense,
lda, d_nnzPerVector[0], &nnz));
// --- Device side sparse matrix
gpuErrchk(cudaMalloc(&d_A[0], nnz * sizeof(double)));
gpuErrchk(cudaMalloc(&d_A_RowIndices[0], (M + 1) * sizeof(int)));
gpuErrchk(cudaMalloc(&d_A_ColIndices[0], nnz * sizeof(int)));
cusparseSafeCall(cusparseDdense2csr(handle, M, N, descrA, d_A_dense, lda, d_nnzPerVector[0],
d_A[0], d_A_RowIndices[0], d_A_ColIndices[0]));
}
/********************************/
/* SPARSE + DENSE CUSTOM KERNEL */
/********************************/
__global__ void sparsePlusDense(const double * __restrict__ d_A, const int * __restrict__ d_A_RowIndices,
const int * __restrict__ d_A_ColIndices, const double * __restrict__ d_B,
double * __restrict__ d_C, const int M, const int N) {
const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
const int tidy = threadIdx.y + blockIdx.y * blockDim.y;
if ((tidx >= N) || (tidy >= M)) return;
// --- Thread (tidx, tidy) handles the tidx-th nonzero (if any) of row tidy of A
const int row = tidy;
const int nnzRow = d_A_RowIndices[tidy + 1] - d_A_RowIndices[tidy];
if (tidx >= nnzRow) return;
// --- Map the CSR entry to its dense (row, col) position and accumulate it into C,
// --- which has already been initialized with the dense operand B
const int col = d_A_ColIndices[d_A_RowIndices[tidy] + tidx];
d_C[row * N + col] = d_C[row * N + col] + d_A[d_A_RowIndices[tidy] + tidx];
}
/********/
/* MAIN */
/********/
int main() {
cusparseHandle_t handle;
// --- Initialize cuSPARSE
cusparseSafeCall(cusparseCreate(&handle));
// --- Initialize matrix descriptors
cusparseMatDescr_t descrA;
setUpDescriptor(descrA, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO);
/**************************/
/* SETTING UP THE PROBLEM */
/**************************/
const int M = 5; // --- Number of rows
const int N = 4; // --- Number of columns
// --- Host side dense matrix
double *h_A_dense = (double*)malloc(M * N * sizeof(*h_A_dense));
// --- Column-major storage
h_A_dense[0] = 0.4612; h_A_dense[5] = 0.0; h_A_dense[10] = 1.3; h_A_dense[15] = 0.0;
h_A_dense[1] = 0.0; h_A_dense[6] = 1.443; h_A_dense[11] = 0.0; h_A_dense[16] = 0.0;
h_A_dense[2] = -0.0006; h_A_dense[7] = 0.4640; h_A_dense[12] = 0.0723; h_A_dense[17] = 0.0;
h_A_dense[3] = 0.3566; h_A_dense[8] = 0.0; h_A_dense[13] = 0.7543; h_A_dense[18] = 0.0;
h_A_dense[4] = 0.; h_A_dense[9] = 0.0; h_A_dense[14] = 0.0; h_A_dense[19] = 0.1;
// --- Create device array and copy host array to it
double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, M * N * sizeof(double)));
gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, M * N * sizeof(*d_A_dense), cudaMemcpyHostToDevice));
/*******************************/
/* FROM DENSE TO SPARSE MATRIX */
/*******************************/
int nnz = 0; // --- Number of nonzero elements in dense matrix
int *d_nnzPerVector; // --- Device side number of nonzero elements per row
double *d_A; // --- Sparse matrix values - array of size nnz
int *d_A_RowIndices; // --- "Row indices"
int *d_A_ColIndices; // --- "Column indices"
dense2SparseD(d_A_dense, &d_nnzPerVector, &d_A, &d_A_RowIndices, &d_A_ColIndices, nnz, descrA,
handle, M, N);
/*************************/
/* DENSE MATRIX OPERANDS */
/*************************/
// --- Host side dense matrix
double *h_B_dense = (double*)malloc(M * N * sizeof(*h_B_dense));
// --- B is filled with the same index pattern as A, but note that the custom kernel and the printouts below interpret this buffer as row-major
h_B_dense[0] = 1.5; h_B_dense[5] = -0.2; h_B_dense[10] = -0.9; h_B_dense[15] = 1.1;
h_B_dense[1] = 2.1; h_B_dense[6] = 2.0; h_B_dense[11] = 1.1; h_B_dense[16] = -0.009;
h_B_dense[2] = -2; h_B_dense[7] = -0.82; h_B_dense[12] = 1.2; h_B_dense[17] = 1.21;
h_B_dense[3] = -0.001; h_B_dense[8] = -1.1; h_B_dense[13] = 0.887; h_B_dense[18] = 1.1143;
h_B_dense[4] = 1.1; h_B_dense[9] = 2.1; h_B_dense[14] = -1.1213; h_B_dense[19] = 5.4334;
// --- Create device array and copy host array to it
double *d_B_dense; gpuErrchk(cudaMalloc(&d_B_dense, M * N * sizeof(double)));
gpuErrchk(cudaMemcpy(d_B_dense, h_B_dense, M * N * sizeof(*d_B_dense), cudaMemcpyHostToDevice));
// --- Allocate space for the result and initialize it with the dense operand B
double *d_C_dense; gpuErrchk(cudaMalloc(&d_C_dense, M * N * sizeof(double)));
gpuErrchk(cudaMemcpy(d_C_dense, d_B_dense, M * N * sizeof(double), cudaMemcpyDeviceToDevice));
/*********************************/
/* RUN THE SPARSE-DENSE ADDITION */
/*********************************/
dim3 GridDim(iDivUp(N, BLOCKSIZEX), iDivUp(M, BLOCKSIZEY));
dim3 BlockDim(BLOCKSIZEX, BLOCKSIZEY);
sparsePlusDense<<<GridDim, BlockDim>>>(d_A, d_A_RowIndices, d_A_ColIndices, d_B_dense, d_C_dense, M, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
/*******************************************************/
/* CHECKING THE RESULT OF THE SPARSE + DENSE ADDITION  */
/*******************************************************/
double *h_C_dense = (double *)malloc(M * N * sizeof(double));
gpuErrchk(cudaMemcpy(h_C_dense, d_C_dense, M * N * sizeof(double), cudaMemcpyDeviceToHost));
printf("\nSparse operand matrix A (dense view, column-major storage) \n");
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n++)
printf("%f\t", h_A_dense[n * M + m]);
printf("\n");
}
printf("\nDense operand matrix B (row-major storage) \n");
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n++)
printf("%f\t", h_B_dense[n + m * N]);
printf("\n");
}
printf("\nReference result matrix A + B (A read as column-major, B as row-major) \n");
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n++)
printf("%f\t", h_A_dense[n * M + m] + h_B_dense[n + m * N]);
printf("\n");
}
printf("\nResult matrix C = A + B (row-major storage) \n");
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n++)
printf("%f\t", h_C_dense[n + m * N]);
printf("\n");
}
return 0;
}
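For reference, the example above needs only the cuSPARSE library at link time, so with a toolkit that still ships the legacy cusparse<t>dense2csr API a build line along the lines of nvcc -o sparse_plus_dense sparse_plus_dense.cu -lcusparse should work (the source file name is arbitrary).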

Related

Some CUDA computations fail with larger block dimension (< 1024)

I am learning CUDA with a GTX 960 4GB. I wrote a program which performs an element-wise matrix multiplication. When I increase the block dimensions for x and y to, say, (32 x 32) in combination with a large matrix (say 15000 x 15000 elements), some but not all multiplication results are wrong (value 0 instead of 6).
When I then decrease the block dimensions to e.g. (8 x 8), all results are right again. When I decrease the matrix size, the results are right again, too.
So in this example, there seem to be combinations of total threads and threads per block that do not work.
I am surprised I can't find any threads regarding this topic. All I can find is about increasing performance and occupancy, but not about cases where some, but not all, of the calculations are silently skipped.
The grid dimensions are calculated as follows:
dim3 blocks(ceil<int>(COLS / threads.x), ceil<int>(ROWS / threads.y));
Why do some multiplications fail while others are successful?
Some Examples
Block dim : (8, 8)
Matrix shape : (15000, 15000)
Verification : 0 elements have failed, total length 225000000, shape: (15000, 15000)
Block dim : (16, 16)
Matrix shape : (15000, 15000)
Verification : 239936 elements have failed, total length 225000000, shape: (15000, 15000)
Block dim : (32, 32)
Matrix shape : (15000, 15000)
Verification : 719424 elements have failed, total length 225000000, shape: (15000, 15000).
Block dim : (32, 32)
Matrix shape : (10000, 10000)
Verification : 0 elements have failed, total length 100000000, shape: (10000, 10000).
Driver Version
$ cat /proc/driver/nvidia/version
NVRM version: NVIDIA UNIX x86_64 Kernel Module 470.82.00 Thu Oct 14 10:24:40 UTC 2021
Complete Code
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define ROWS 10000
#define COLS 10000
#define MAX_ERR 1e-6
typedef struct {
int width;
int height;
float* elements;
} Matrix;
size_t ij(int i, int j){
return j * ROWS + i;
}
__global__ void matrix_multi_elemwise(const Matrix OUT, const Matrix A, const Matrix B) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
if (col < A.width && row < A.height) {
int index = row * A.height + col; // linearisation of index
OUT.elements[index] = A.elements[index] * B.elements[index];
}
}
int main(){
Matrix A, B, OUT;
Matrix dev_A, dev_B, dev_OUT;
size_t SIZE = ROWS * COLS * sizeof(float);
// Allocate host memory
A.elements = (float*) malloc(SIZE);
B.elements = (float*) malloc(SIZE);
OUT.elements = (float*) malloc(SIZE);
// Initialize host matrices
A.height = ROWS; A.width = COLS;
B.height = ROWS; B.width = COLS;
OUT.height = ROWS; OUT.width = COLS;
for (int j = 0; j < ROWS; j++) {
for(int i = 0; i < COLS; i++){
A.elements[ij(i, j)] = 2.0f;
B.elements[ij(i, j)] = 3.0f;
}
}
// Allocate device memory
cudaMalloc((void**) &dev_A.elements, SIZE);
cudaMalloc((void**) &dev_B.elements, SIZE);
cudaMalloc((void**) &dev_OUT.elements, SIZE);
dev_A.height = A.height; dev_A.width = A.width;
dev_B.height = A.height; dev_B.width = B.width;
dev_OUT.height = A.height; dev_OUT.width = OUT.width;
// Transfer data from host to device memory
cudaMemcpy(dev_A.elements, A.elements, SIZE, cudaMemcpyHostToDevice);
cudaMemcpy(dev_B.elements, B.elements, SIZE, cudaMemcpyHostToDevice);
// Executing kernel
dim3 threads(16, 16);
dim3 blocks(ceil<int>(COLS / threads.x), ceil<int>(ROWS / threads.y));
matrix_multi_elemwise<<<blocks, threads>>>(dev_OUT, dev_A, dev_B);
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) {
printf("CUDA Runtime API Error reported : %s in file %s on line.\n", cudaGetErrorString(err), __FILE__);
}
// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize();
// Transfer data back to host memory
cudaMemcpy(OUT.elements, dev_OUT.elements, SIZE, cudaMemcpyDeviceToHost);
// Verification
int count = 0, length = 0, i = 0, j = 0;
for (j = 0; j < ROWS; j++) {
for(i = 0; i < COLS; i++){
//assert(fabs(OUT.elements[ij(i, j)] / A.elements[ij(i, j)] - B.elements[ij(i, j)]) < MAX_ERR);
if (fabs(OUT.elements[ij(i, j)] / A.elements[ij(i, j)] - B.elements[ij(i, j)]) > MAX_ERR) {
count++;
}
length++;
}
}
printf("Verification: %i elements have failed, total length %i, shape: (%i, %i).\n", count, length, i, j);
// Deallocate device memory
cudaFree(dev_A.elements);
cudaFree(dev_B.elements);
cudaFree(dev_OUT.elements);
// Deallocate host memory
free(A.elements);
free(B.elements);
free(OUT.elements);
}
The number of blocks is wrong. Indeed, COLS and threads.x are both integers, so the division is an integer division and the result is truncated; ceil<int> cannot round up a result that has already been truncated. This causes some blocks not to be launched: 15000 is divisible by 8 but not by 16 or 32. You need to either cast COLS to a floating-point number or compute the ceiling manually (safer). Here is an example:
dim3 blocks((COLS + threads.x - 1) / threads.x, (ROWS + threads.y - 1) / threads.y);
As pointed out in the comments, note that row * A.height + col is wrong: it should be row * A.width + col instead. This causes issues for non-square matrices.
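Putting the two fixes together, the kernel and the launch might look like the following sketch (the kernel at file scope, the launch inside main; the rest of the program is unchanged):
// --- Corrected kernel: the row stride is the matrix width, not its height
__global__ void matrix_multi_elemwise(const Matrix OUT, const Matrix A, const Matrix B) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col < A.width && row < A.height) {
        int index = row * A.width + col;
        OUT.elements[index] = A.elements[index] * B.elements[index];
    }
}
// --- Corrected launch: manual ceiling division so partial blocks are not dropped
dim3 threads(16, 16);
dim3 blocks((COLS + threads.x - 1) / threads.x, (ROWS + threads.y - 1) / threads.y);
matrix_multi_elemwise<<<blocks, threads>>>(dev_OUT, dev_A, dev_B);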

How to expand the product of a sequence of binomials efficiently?

The product of a sequence of binomials reads
P(x) = (a_1 + b_1 x) (a_2 + b_2 x) ... (a_d + b_d x),
where {a_i} and {b_i} are the coefficients of the binomials.
I need to expand it into a polynomial
P(x) = c_0 + c_1 x + c_2 x^2 + ... + c_d x^d
and use all the coefficients {c_k} of the polynomial afterwards.
How can I expand it efficiently? Speed has priority over memory occupation because the expansion will be used many times.
What I tried
At present I have come up with an update scheme that expands the polynomial right after absorbing one binomial: if c^(i-1) holds the coefficients of the product of the first i-1 binomials, then absorbing (a_i + b_i x) gives c_j^(i) = a_i * c_j^(i-1) + b_i * c_{j-1}^(i-1).
This scheme needs two arrays: one for the results up to i-1 and the other for the results up to i.
Here is the C++ code for my naive scheme, although I think the question is independent of the language used.
#include <iostream>
#include <vector>
int main()
{
using namespace std;
// just an example, the coefficients are actually real numbers in [0,1]
unsigned d = 3;
vector<double> a;
vector<double> b;
a.resize(d, 1); b.resize(d, 1);
// given two arrays, a[] and b[], of length d
vector< vector<double> > coefficients(2);
coefficients[0].resize(d + 1);
coefficients[1].resize(d + 1);
if (d > 0) {
auto &coeff = coefficients[0]; // i = 0
coeff[0] = a[0];
coeff[1] = b[0];
for (unsigned i = 1; i < d; ++i) {// i : [1, d-1]
const auto ai = a[i];
const auto bi = b[i];
const auto &oldCoeff = coefficients[(i-1)%2];
auto &coeff = coefficients[i%2];
coeff[0] = oldCoeff[0] * ai; // j = 0
for (unsigned j = 1; j <= i; ++j) { // j : [1, i]
coeff[j] = oldCoeff[j] * ai + oldCoeff[j-1] * bi;
}
coeff[i+1] = oldCoeff[i] * bi; // j = i + 1
}
}
const auto &coeff = coefficients[(d-1)%2];
for (unsigned i = 0; i < d; ++i) {
cout << coeff[i] << "\t";
}
cout << coeff[d] << '\n';
}

OpenCL Kernel Error -11

I'm new to OpenCL and I'm trying to parallelise an edge-detection program. I'm trying to write a kernel from the edge-detection function.
The original function:
void edgeDetection(float *out, float *in, int w, int h) {
int r,c;
for (r = 0; r < h-2; r++) {
for (c = 0; c < w-2; c++) {
float G;
float* pOut = &out[r*w + c];
float Gx = 0.0;
float Gy = 0.0;
int fr,fc;
/* run the 2d-convolution filter */
for (fr = 0; fr < 3; fr++) {
for (fc = 0; fc < 3; fc++) {
float p = in[(r+fr)*w + (c+fc)];
/* X-directional edges */
Gx += p * F[fr*3 + fc];
/* Y-directional edges */
Gy += p * F[fc*3 + fr];
}
}
/* all edges, pythagoral sum */
G = sqrtf(Gx*Gx + Gy*Gy);
*pOut = G;
}
}
}
My OpenCL Kernel:
__kernel
void edgeDetection(__global float *out,
__global float *in, int w, int h)
{
// Get the work-item’s unique ID
const int r = get_global_id(0);
const int c = get_global_id(1);
if(r>=0 && c>=0 && r<h-2 && c<w-2){
float G;
float* pOut = &out[r*w + c];
float Gx = 0.0;
float Gy = 0.0;
int fr,fc;
for (fr = 0; fr < 3; fr++) {
for (fc = 0; fc < 3; fc++) {
float p = in[(r+fr)*w + (c+fc)];
Gx += p * F[fr*3 + fc];
Gy += p * F[fc*3 + fr];
}
}
G = sqrtf(Gx*Gx + Gy*Gy);
*pOut = G;
}
}
When I try to build the program from the .cl file using this(chk is a function to check if there are any failures/errors):
status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
chk(status, "clBuildProgram");
I get an error saying "clBuildProgram failed (-11)". From my research, it is commonly thought that this error is caused by a syntax error. However, after checking many times I cannot see anything particularly wrong with my kernel. Can somebody help me figure out what's wrong with it?
There are many errors in the code:
1)
float* pOut = &out[r*w + c];
This is invalid, it should be:
__global float* pOut = &out[r*w + c];
2) You are using F in the kernel, which is never defined there.
3) sqrtf is not defined in CL, did you mean sqrt instead?
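Applying those three fixes, a corrected kernel might look like the sketch below; here the filter coefficients are passed in explicitly as a __constant buffer (called f in this sketch), since a host-side global array F is not visible inside an OpenCL program:
__kernel
void edgeDetection(__global float *out,
                   __global const float *in,
                   __constant float *f,   // 3x3 filter coefficients, supplied by the host
                   int w, int h)
{
    const int r = get_global_id(0);
    const int c = get_global_id(1);
    if (r < h - 2 && c < w - 2) {
        __global float *pOut = &out[r * w + c];   // pointer into global memory needs __global
        float Gx = 0.0f;
        float Gy = 0.0f;
        for (int fr = 0; fr < 3; fr++) {
            for (int fc = 0; fc < 3; fc++) {
                float p = in[(r + fr) * w + (c + fc)];
                Gx += p * f[fr * 3 + fc];
                Gy += p * f[fc * 3 + fr];
            }
        }
        *pOut = sqrt(Gx * Gx + Gy * Gy);   // sqrt, not sqrtf, in OpenCL C
    }
}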

Kernel crashes while trying to do a simple value assignment

I am learning CUDA and still at the very beginner level. I am trying a simple assignment but my code crashes when I run it and I am not sure why. Any help would be appreciated.
EDIT: It crashes on the cudaMemcpy calls; in the Image structure, pixelVal is of type int**. Is that the cause?
Original C++ code:
void Image::reflectImage(bool flag, Image& oldImage)
/*Reflects the Image based on users input*/
{
int rows = oldImage.N;
int cols = oldImage.M;
Image tempImage(oldImage);
for(int i = 0; i < rows; i++)
{
for(int j = 0; j < cols; j++)
tempImage.pixelVal[rows - (i + 1)][j] = oldImage.pixelVal[i][j];
}
oldImage = tempImage;
}
My CUDA kernel & code:
#define NTPB 512
__global__ void fliph(int* a, int* b, int r, int c)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i >= r || j >= c)
return;
a[(r - i * c) + j] = b[i * c + j];
}
void Image::reflectImage(bool flag, Image& oldImage)
/*Reflects the Image based on users input*/
{
int rows = oldImage.N;
int cols = oldImage.M;
Image tempImage(oldImage);
if(flag == true) //horizontal reflection
{
//Allocate device memory
int* dpixels;
int* oldPixels;
int n = rows * cols;
cudaMalloc((void**)&dpixels, n * sizeof(int));
cudaMalloc((void**)&oldPixels, n * sizeof(int));
cudaMemcpy(dpixels, tempImage.pixelVal, n * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(oldPixels, oldImage.pixelVal, n * sizeof(int), cudaMemcpyHostToDevice);
int nblks = (n + NTPB - 1) / NTPB;
fliph<<<nblks, NTPB>>>(dpixels, oldPixels, rows, cols);
cudaMemcpy(tempImage.pixelVal, dpixels, n * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(dpixels);
cudaFree(oldPixels);
}
oldImage = tempImage;
}
You have to create a 2D grid in order to process the image using the 2D indices i and j. In the current 1D launch, the y index is always zero, so the kernel processes only a single column of the image.
To create a 2D indexing mechanism, create a 2D block and 2D grid like this:
const int BLOCK_DIM = 16;
dim3 Block(BLOCK_DIM,BLOCK_DIM);
dim3 Grid;
Grid.x = (cols + Block.x - 1)/Block.x;
Grid.y = (rows + Block.y - 1)/Block.y;
fliph<<<Grid, Block>>>(dpixels, oldPixels, rows, cols);
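For completeness, with this 2D launch the kernel can consume both indices directly, taking the row from the y dimension and the column from the x dimension to match the grid above. A sketch that also mirrors the CPU version's reflection index (rows - i - 1), assuming pixelVal is copied as one contiguous int buffer as in the question:
__global__ void fliph(int* a, int* b, int r, int c)
{
    int j = blockIdx.x * blockDim.x + threadIdx.x;   // column index
    int i = blockIdx.y * blockDim.y + threadIdx.y;   // row index
    if (i >= r || j >= c)
        return;
    a[(r - i - 1) * c + j] = b[i * c + j];           // reflect rows, as in the CPU code
}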

FIR filter in CUDA (as a 1D convolution)

I'm trying to implement a FIR (Finite Impulse Response) filter in CUDA. My approach is quite simple and looks somewhat like this:
#include <cuda.h>
__global__ void filterData(const float *d_data,
const float *d_numerator,
float *d_filteredData,
const int numeratorLength,
const int filteredDataLength)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
float sum = 0.0f;
if (i < filteredDataLength)
{
for (int j = 0; j < numeratorLength; j++)
{
// The first (numeratorLength-1) elements contain the filter state
sum += d_numerator[j] * d_data[i + numeratorLength - j - 1];
}
d_filteredData[i] = sum; // store inside the bounds check to avoid an out-of-bounds write
}
}
int main(void)
{
// (Skipping error checks to make code more readable)
int dataLength = 18042;
int filteredDataLength = 16384;
int numeratorLength= 1659;
// Pointers to data, filtered data and filter coefficients
// (Skipping how these are read into the arrays)
float *h_data = new float[dataLength];
float *h_filteredData = new float[filteredDataLength];
float *h_filter = new float[numeratorLength];
// Create device pointers
float *d_data = nullptr;
cudaMalloc((void **)&d_data, dataLength * sizeof(float));
float *d_numerator = nullptr;
cudaMalloc((void **)&d_numerator, numeratorLength * sizeof(float));
float *d_filteredData = nullptr;
cudaMalloc((void **)&d_filteredData, filteredDataLength * sizeof(float));
// Copy data to device
cudaMemcpy(d_data, h_data, dataLength * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_numerator, h_filter, numeratorLength * sizeof(float), cudaMemcpyHostToDevice);
// Launch the kernel
int threadsPerBlock = 256;
int blocksPerGrid = (filteredDataLength + threadsPerBlock - 1) / threadsPerBlock;
filterData<<<blocksPerGrid,threadsPerBlock>>>(d_data, d_numerator, d_filteredData, numeratorLength, filteredDataLength);
// Copy results to host
cudaMemcpy(h_filteredData, d_filteredData, filteredDataLength * sizeof(float), cudaMemcpyDeviceToHost);
// Clean up
cudaFree(d_data);
cudaFree(d_numerator);
cudaFree(d_filteredData);
// Do stuff with h_filteredData...
// Clean up some more
delete [] h_data;
delete [] h_filteredData;
delete [] h_filter;
}
The filter works, but as I'm new to CUDA programming I'm not sure how to optimize it.
A slight problem that I see is that dataLength, filteredDataLength, and numeratorLength are not known beforehand in the application I intend to use the filter in. Also, even though dataLength is a multiple of 32 in the above code, it is not guaranteed to be so in the final application.
When I compare my code above to ArrayFire, my code takes about three times longer to execute.
Does anyone have any ideas on how to speed things up?
EDIT: Have changed all filterLength to numeratorLength.
I can suggest the following to speed up your code:
Use shared memory: it is a small, cache-like memory, but much faster than global card memory. You can find more about it by looking for the __shared__ keyword in the CUDA documentation. For example, you can pre-fetch the filter numerator and big chunks of data into shared memory, which will significantly enhance your performance. You need to pay extra attention to data alignment in this case, as it really matters and can slow down your code. (A minimal sketch of this suggestion is given after this list.)
Think about unrolling the for-loop of the numerator sum. You can check the reduce-vector example in the CUDA documentation.
You can also think about parallelizing the numerator loop itself. This can be done by adding an extra dimension (say 'y') to your thread block. You will need to make sum a shared vector as well, with dimension numeratorLength. You can also check the reduce-vector example on how to quickly take the sum of this vector at the end.
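A minimal sketch of the shared-memory suggestion, staging only the numerator coefficients (staging chunks of the signal as well is omitted for brevity; the kernel signature mirrors the one in the question):
__global__ void filterDataShared(const float *d_data, const float *d_numerator,
                                 float *d_filteredData, const int numeratorLength,
                                 const int filteredDataLength)
{
    extern __shared__ float s_numerator[];                  // sized at launch time
    for (int j = threadIdx.x; j < numeratorLength; j += blockDim.x)
        s_numerator[j] = d_numerator[j];                    // cooperative load into shared memory
    __syncthreads();
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < filteredDataLength) {
        float sum = 0.0f;
        for (int j = 0; j < numeratorLength; j++)
            sum += s_numerator[j] * d_data[i + numeratorLength - j - 1];
        d_filteredData[i] = sum;
    }
}
The third launch parameter supplies the dynamic shared-memory size, e.g. filterDataShared<<<blocksPerGrid, threadsPerBlock, numeratorLength * sizeof(float)>>>(...).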
You are attempting to calculate the filter output by directly evaluating the 1D convolution through a CUDA kernel.
In the case when the filter impulse response duration is long, one thing you can do to evaluate the filtered input is to perform the calculations directly in the conjugate (frequency) domain using FFTs. Below I'm reporting a sample code using CUDA Thrust and the cuFFT library. It is a direct translation of the Matlab-based example reported at
Low-Pass Filtering by FFT Convolution
Let me disclaim that some optimizations are possible with this code, but I preferred to leave it as it is so that it can be more easily compared to its Matlab counterpart.
#include <stdio.h>
#include <math.h>
#include <cufft.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#define pi_f 3.14159265358979f // Greek pi in single precision
/****************/
/* SIN OPERATOR */
/****************/
class sin_op {
float fk_, Fs_;
public:
sin_op(float fk, float Fs) { fk_ = fk; Fs_ = Fs; }
__host__ __device__ float operator()(float x) const { return sin(2.f*pi_f*x*fk_/Fs_); }
};
/*****************/
/* SINC OPERATOR */
/*****************/
class sinc_op {
float fc_, Fs_;
public:
sinc_op(float fc, float Fs) { fc_ = fc; Fs_ = Fs; }
__host__ __device__ float operator()(float x) const
{
if (x==0) return (2.f*fc_/Fs_);
else return (2.f*fc_/Fs_)*sin(2.f*pi_f*fc_*x/Fs_)/(2.f*pi_f*fc_*x/Fs_);
}
};
/********************/
/* HAMMING OPERATOR */
/********************/
class hamming_op {
int L_;
public:
hamming_op(int L) { L_ = L; }
__host__ __device__ float operator()(int x) const
{
return 0.54-0.46*cos(2.f*pi_f*x/(L_-1));
}
};
/*********************************/
/* MULTIPLY CUFFTCOMPLEX NUMBERS */
/*********************************/
struct multiply_cufftComplex {
__device__ cufftComplex operator()(const cufftComplex& a, const cufftComplex& b) const {
cufftComplex r;
r.x = a.x * b.x - a.y * b.y;
r.y = a.x * b.y + a.y * b.x;
return r;
}
};
/********/
/* MAIN */
/********/
int main(){
// Signal parameters:
int M = 256; // signal length
const int N = 4;
float f[N] = { 440, 880, 1000, 2000 }; // frequencies
float Fs = 5000.; // sampling rate
// Generate a signal by adding up sinusoids:
thrust::device_vector<float> d_x(M,0.f); // pre-allocate 'accumulator'
thrust::device_vector<float> d_n(M); // discrete-time grid
thrust::sequence(d_n.begin(), d_n.end(), 0, 1);
thrust::device_vector<float> d_temp(M);
for (int i=0; i<N; i++) {
float fk = f[i];
thrust::transform(d_n.begin(), d_n.end(), d_temp.begin(), sin_op(fk,Fs));
thrust::transform(d_temp.begin(), d_temp.end(), d_x.begin(), d_x.begin(), thrust::plus<float>());
}
// Filter parameters:
int L = 257; // filter length
float fc = 600.f; // cutoff frequency
// Design the filter using the window method:
thrust::device_vector<float> d_hsupp(L);
thrust::sequence(d_hsupp.begin(), d_hsupp.end(), -(L-1)/2, 1);
thrust::device_vector<float> d_hideal(L);
thrust::transform(d_hsupp.begin(), d_hsupp.end(), d_hideal.begin(), sinc_op(fc,Fs));
thrust::device_vector<float> d_l(L);
thrust::sequence(d_l.begin(), d_l.end(), 0, 1);
thrust::device_vector<float> d_h(L);
thrust::transform(d_l.begin(), d_l.end(), d_h.begin(), hamming_op(L));
// h is our filter
thrust::transform(d_hideal.begin(), d_hideal.end(), d_h.begin(), d_h.begin(), thrust::multiplies<float>());
// --- Choose the next power of 2 greater than L+M-1
int Nfft = pow(2,(ceil(log2((float)(L+M-1))))); // or 2^nextpow2(L+M-1)
// Zero pad the signal and impulse response:
thrust::device_vector<float> d_xzp(Nfft,0.f);
thrust::device_vector<float> d_hzp(Nfft,0.f);
thrust::copy(d_x.begin(), d_x.end(), d_xzp.begin());
thrust::copy(d_h.begin(), d_h.end(), d_hzp.begin());
// Transform the signal and the filter:
cufftHandle plan;
cufftPlan1d(&plan, Nfft, CUFFT_R2C, 1);
thrust::device_vector<cufftComplex> d_X(Nfft/2+1);
thrust::device_vector<cufftComplex> d_H(Nfft/2+1);
cufftExecR2C(plan, (cufftReal*)thrust::raw_pointer_cast(d_xzp.data()), (cufftComplex*)thrust::raw_pointer_cast(d_X.data()));
cufftExecR2C(plan, (cufftReal*)thrust::raw_pointer_cast(d_hzp.data()), (cufftComplex*)thrust::raw_pointer_cast(d_H.data()));
thrust::device_vector<cufftComplex> d_Y(Nfft/2+1);
thrust::transform(d_X.begin(), d_X.end(), d_H.begin(), d_Y.begin(), multiply_cufftComplex());
cufftPlan1d(&plan, Nfft, CUFFT_C2R, 1);
thrust::device_vector<float> d_y(Nfft);
cufftExecC2R(plan, (cufftComplex*)thrust::raw_pointer_cast(d_Y.data()), (cufftReal*)thrust::raw_pointer_cast(d_y.data()));
getchar();
}
Besides my other answer which I expect will be more convenient for convolution kernels with long duration, below I'm reporting a different implementation, which is more compliant with the OP's initial attempt and I expect will be more convenient for convolution kernels with short duration. Such an implementation is based on a hand-written kernel exploiting caching in shared memory. More details can be found in the book by D.B. Kirk and W.-m. W. Hwu
Programming Massively Parallel Processors, Second Edition: A Hands-on Approach
#include <stdio.h>
#include <stdlib.h>
#include "TimingGPU.cuh"
#include "Utilities.cuh"
#define RG 10
#define BLOCKSIZE 8
/****************/
/* CPU FUNCTION */
/****************/
void h_convolution_1D(const float * __restrict__ h_Signal, const float * __restrict__ h_ConvKernel, float * __restrict__ h_Result_CPU,
const int N, const int K) {
for (int i = 0; i < N; i++) {
float temp = 0.f;
int N_start_point = i - (K / 2);
for (int j = 0; j < K; j++) if (N_start_point + j >= 0 && N_start_point + j < N) {
temp += h_Signal[N_start_point+ j] * h_ConvKernel[j];
}
h_Result_CPU[i] = temp;
}
}
/********************/
/* BASIC GPU KERNEL */
/********************/
__global__ void d_convolution_1D_basic(const float * __restrict__ d_Signal, const float * __restrict__ d_ConvKernel, float * __restrict__ d_Result_GPU,
const int N, const int K) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
float temp = 0.f;
int N_start_point = i - (K / 2);
for (int j = 0; j < K; j++) if (N_start_point + j >= 0 && N_start_point + j < N) {
temp += d_Signal[N_start_point+ j] * d_ConvKernel[j];
}
d_Result_GPU[i] = temp;
}
/***************************/
/* GPU KERNEL WITH CACHING */
/***************************/
__global__ void d_convolution_1D_caching(const float * __restrict__ d_Signal, const float * __restrict__ d_ConvKernel, float * __restrict__ d_Result_GPU,
const int N, const int K) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ float d_Tile[BLOCKSIZE];
d_Tile[threadIdx.x] = d_Signal[i];
__syncthreads();
float temp = 0.f;
int N_start_point = i - (K / 2);
for (int j = 0; j < K; j++) if (N_start_point + j >= 0 && N_start_point + j < N) {
if ((N_start_point + j >= blockIdx.x * blockDim.x) && (N_start_point + j < (blockIdx.x + 1) * blockDim.x))
// --- The signal element is in the tile loaded in the shared memory
temp += d_Tile[threadIdx.x + j - (K / 2)] * d_ConvKernel[j];
else
// --- The signal element is not in the tile loaded in the shared memory
temp += d_Signal[N_start_point + j] * d_ConvKernel[j];
}
d_Result_GPU[i] = temp;
}
/********/
/* MAIN */
/********/
int main(){
const int N = 15; // --- Signal length
const int K = 5; // --- Convolution kernel length
float *h_Signal = (float *)malloc(N * sizeof(float));
float *h_Result_CPU = (float *)malloc(N * sizeof(float));
float *h_Result_GPU = (float *)malloc(N * sizeof(float));
float *h_ConvKernel = (float *)malloc(K * sizeof(float));
float *d_Signal; gpuErrchk(cudaMalloc(&d_Signal, N * sizeof(float)));
float *d_Result_GPU; gpuErrchk(cudaMalloc(&d_Result_GPU, N * sizeof(float)));
float *d_ConvKernel; gpuErrchk(cudaMalloc(&d_ConvKernel, K * sizeof(float)));
for (int i=0; i < N; i++) { h_Signal[i] = (float)(rand() % RG); }
for (int i=0; i < K; i++) { h_ConvKernel[i] = (float)(rand() % RG); }
gpuErrchk(cudaMemcpy(d_Signal, h_Signal, N * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_ConvKernel, h_ConvKernel, K * sizeof(float), cudaMemcpyHostToDevice));
h_convolution_1D(h_Signal, h_ConvKernel, h_Result_CPU, N, K);
d_convolution_1D_basic<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_Signal, d_ConvKernel, d_Result_GPU, N, K);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(h_Result_GPU, d_Result_GPU, N * sizeof(float), cudaMemcpyDeviceToHost));
for (int i = 0; i < N; i++) if (h_Result_CPU[i] != h_Result_GPU[i]) {printf("mismatch at %d, cpu: %f, gpu %f\n", i, h_Result_CPU[i], h_Result_GPU[i]); return 1;}
printf("Test basic passed\n");
d_convolution_1D_caching<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_Signal, d_ConvKernel, d_Result_GPU, N, K);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(h_Result_GPU, d_Result_GPU, N * sizeof(float), cudaMemcpyDeviceToHost));
for (int i = 0; i < N; i++) if (h_Result_CPU[i] != h_Result_GPU[i]) {printf("mismatch at %d, cpu: %f, gpu %f\n", i, h_Result_CPU[i], h_Result_GPU[i]); return 1;}
printf("Test caching passed\n");
return 0;
}
