Use CUSP matrix inside CUDA function? [duplicate] - c++11

This question already has an answer here: How to get raw pointer from cusp library matrix format
I want to write a kernel function that takes as input two CUSP matrices A and B, then fills data into B in parallel.
#include <cusp/coo_matrix.h>
#include <cusp/print.h>
#include <iostream>

__global__ void kernel_example(cusp::coo_matrix<int,float,cusp::host_memory>* A,
                               cusp::coo_matrix<int,float,cusp::host_memory>* B){
    printf("hello from kernel...");
    // actual operations go here.
}

int main(void)
{
    // allocate storage
    cusp::coo_matrix<int,float,cusp::host_memory> A(4,3,6);
    cusp::coo_matrix<int,float,cusp::host_memory> B(4,3,6);

    // initialize matrix entries on host
    A.row_indices[0] = 0; A.column_indices[0] = 0; A.values[0] = 10;
    A.row_indices[1] = 0; A.column_indices[1] = 2; A.values[1] = 20;
    A.row_indices[2] = 2; A.column_indices[2] = 2; A.values[2] = 30;
    A.row_indices[3] = 3; A.column_indices[3] = 0; A.values[3] = 40;
    A.row_indices[4] = 3; A.column_indices[4] = 1; A.values[4] = 50;
    A.row_indices[5] = 3; A.column_indices[5] = 2; A.values[5] = 60;

    kernel_example<<<1,1>>>(A,B);
    cudaDeviceSynchronize();
    return 0;
}
The following error ensues:
error: no suitable conversion function from "cusp::coo_matrix<int, float, cusp::host_memory>" to "cusp::coo_matrix<int, float, cusp::host_memory> *" exists
How do I go about it?

The error occurs because the kernel's parameters are pointers while you are passing objects. You can pass by reference instead and it will build.
It should be:
__global__ void kernel_example(cusp::coo_matrix<int, float, cusp::host_memory>& A,
                               cusp::coo_matrix<int, float, cusp::host_memory>& B) {
    printf("hello from kernel...");
    // actual operations go here.
}
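Note that this only fixes the compile error: a __global__ kernel still cannot usefully dereference host_memory data. The usual pattern (see the duplicate linked above) is to build the matrices in cusp::device_memory and hand the kernel raw pointers to their index and value arrays via thrust::raw_pointer_cast. A minimal, untested sketch; the kernel name fill_B, the block size, and the "double each value" operation are only illustrative:

__global__ void fill_B(const int* A_row, const int* A_col, const float* A_val,
                       int* B_row, int* B_col, float* B_val, int num_entries)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < num_entries) {                 // one thread per nonzero
        B_row[i] = A_row[i];
        B_col[i] = A_col[i];
        B_val[i] = 2.0f * A_val[i];        // placeholder for the real operation
    }
}

// host side: copy A to the device, allocate B on the device, then launch
cusp::coo_matrix<int,float,cusp::device_memory> dA(A), dB(4,3,6);
int threads = 256;
int blocks  = (dA.num_entries + threads - 1) / threads;
fill_B<<<blocks,threads>>>(thrust::raw_pointer_cast(&dA.row_indices[0]),
                           thrust::raw_pointer_cast(&dA.column_indices[0]),
                           thrust::raw_pointer_cast(&dA.values[0]),
                           thrust::raw_pointer_cast(&dB.row_indices[0]),
                           thrust::raw_pointer_cast(&dB.column_indices[0]),
                           thrust::raw_pointer_cast(&dB.values[0]),
                           dA.num_entries);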

Related

Getting "calling a __host__ function from a __global__ function is not allowed" when using cudaMallocManaged

I have written code that I am trying to modify so that it uses CUDA, and I am having plenty of trouble. Currently, I am trying to turn the functions I want to be kernel functions into void functions, and I am getting some errors.
Here is the list of errors I am getting:
black_scholes.cu(54): error: calling a __host__ function("cudaMallocManaged<double> ") from a __global__ function("black_scholes_iterate") is not allowed
black_scholes.cu(54): error: identifier "cudaMallocManaged<double> " is undefined in device code
black_scholes.cu(56): error: calling a __host__ function("init_gaussrand_state") from a __global__ function("black_scholes_iterate") is not allowed
black_scholes.cu(56): error: identifier "init_gaussrand_state" is undefined in device code
black_scholes.cu(65): error: calling a __host__ function("spawn_prng_stream") from a __global__ function("black_scholes_iterate") is not allowed
black_scholes.cu(65): error: identifier "spawn_prng_stream" is undefined in device code
black_scholes.cu(66): error: calling a __host__ function("gaussrand1") from a __global__ function("black_scholes_iterate") is not allowed
black_scholes.cu(66): error: identifier "gaussrand1" is undefined in device code
black_scholes.cu(66): error: identifier "uniform_random_double" is undefined in device code
black_scholes.cu(73): error: calling a __host__ function("free_prng_stream") from a __global__ function("black_scholes_iterate") is not allowed
black_scholes.cu(73): error: identifier "free_prng_stream" is undefined in device code
black_scholes.cu(74): error: calling a __host__ function("cudaFree") from a __global__ function("black_scholes_iterate") is not allowed
black_scholes.cu(74): error: identifier "cudaFree" is undefined in device code
I am particularly posting about the first 2 errors: while learning CUDA via an NVIDIA introductory course, it was common to call cudaMallocManaged inside a __global__ function, and I don't get what is different here.
Here is my .cu code:
#include "black_scholes.h"
#include "gaussian.h"
#include "random.h"
#include "util.h"
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
__managed__ double stddev;
__global__ void black_scholes_stddev (void* the_args)
{
    black_scholes_args_t* args = (black_scholes_args_t*) the_args;
    const double mean = args->mean;
    const int M = args->M;
    double variance = 0.0;
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k < M)
    {
        const double diff = args->trials[k] - mean;
        variance += diff * diff / (double) M;
    }
    args->variance = variance;
    stddev = sqrt(variance);
}

__global__ void black_scholes_iterate (void* the_args)
{
    black_scholes_args_t* args = (black_scholes_args_t*) the_args;
    const int S = args->S;
    const int E = args->E;
    const int M = args->M;
    const double r = args->r;
    const double sigma = args->sigma;
    const double T = args->T;
    double* trials = args->trials;
    double mean = 0.0;
    gaussrand_state_t gaussrand_state;
    void* prng_stream = NULL;
    double* randnumbs;
    cudaMallocManaged(&randnumbs, M * sizeof (double));
    init_gaussrand_state (&gaussrand_state);
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    //for (int i = 0; i < M; i++)
    if (i < M)
    {
        prng_stream = spawn_prng_stream(i % 4);
        const double gaussian_random_number = gaussrand1 (&uniform_random_double, prng_stream, &gaussrand_state);
        randnumbs[i] = gaussian_random_number;
        const double current_value = S * exp ( (r - (sigma*sigma) / 2.0) * T + sigma * sqrt (T) * randnumbs[k]);
        trials[k] = exp (-r * T) * ((current_value - E < 0.0) ? 0.0 : current_value - E);
        mean += trials[k] / (double) M; //needs to be shared
        args->mean = mean;
    }
    free_prng_stream (prng_stream);
    cudaFree(randnumbs);
}

void black_scholes (confidence_interval_t* interval,
                    const double S,
                    const double E,
                    const double r,
                    const double sigma,
                    const double T,
                    const int M,
                    const int n)
{
    black_scholes_args_t args;
    double mean = 0.0;
    double conf_width = 0.0;
    double* trials = NULL;
    assert (M > 0);
    trials = (double*) malloc (M * sizeof (double));
    assert (trials != NULL);
    args.S = S;
    args.E = E;
    args.r = r;
    args.sigma = sigma;
    args.T = T;
    args.M = M;
    args.trials = trials;
    args.mean = 0.0;
    args.variance = 0.0;
    (void)black_scholes_iterate<<<1,1>>>(&args);
    mean = args.mean;
    black_scholes_stddev<<<1,1>>> (&args);
    cudaDeviceSynchronize();
    conf_width = 1.96 * stddev / sqrt ((double) M);
    interval->min = mean - conf_width;
    interval->max = mean + conf_width;
    deinit_black_scholes_args (&args);
}

void deinit_black_scholes_args (black_scholes_args_t* args)
{
    if (args != NULL)
        if (args->trials != NULL)
        {
            free (args->trials);
            args->trials = NULL;
        }
}
Any help in understanding what is going on would be appreciated; it seems to be a recurrent theme.
Currently, it's not possible to call cudaMallocManaged in CUDA device code. It has never been possible. I don't believe there is NVIDIA training material that demonstrates using cudaMallocManaged in device code.
If you wish to make an in-kernel allocation, I suggest using the methods described in the programming guide. Also, new and delete work similarly to malloc() and free() for in-kernel usage.
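For reference, a minimal sketch of what in-kernel allocation looks like with device-side malloc()/free(); the heap-size value and array length here are arbitrary, and new/delete can be used the same way:

#include <cstdio>

__global__ void alloc_demo(int n)
{
    // device-side malloc allocates from the device heap, not from managed memory
    double* buf = (double*)malloc(n * sizeof(double));
    if (buf == NULL) return;               // allocation can fail if the heap is too small
    for (int i = 0; i < n; ++i) buf[i] = (double)i;
    printf("buf[%d] = %f\n", n - 1, buf[n - 1]);
    free(buf);                             // must be released with device-side free
}

int main()
{
    // optionally enlarge the device heap before the first kernel launch
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, 64 * 1024 * 1024);
    alloc_demo<<<1,1>>>(1000);
    cudaDeviceSynchronize();
    return 0;
}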

Eigen JacobiSVD cuda compile error

I've got an error regarding calling JacobiSVD in my CUDA function.
This is the part of the code that is causing the error:
Eigen::JacobiSVD<Eigen::Matrix3d> svd( cov_e, Eigen::ComputeThinU | Eigen::ComputeThinV);
And this is the error message.
CUDA_voxel_building.cu(43): error: calling a __host__
function("Eigen::JacobiSVD , (int)2> ::JacobiSVD") from a __global__
function("kernel") is not allowed
I've used the following command to compile it.
nvcc -std=c++11 -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__ -ptx CUDA_voxel_building.cu
I'm using CUDA 8.0 with Eigen 3 on Ubuntu 16.04.
It seems like other functions, such as eigenvalue decomposition, also give the same error.
Does anyone know a solution? I'm enclosing my code below.
//nvcc -ptx CUDA_voxel_building.cu
#include </usr/include/eigen3/Eigen/Core>
#include </usr/include/eigen3/Eigen/SVD>
/*
#include </usr/include/eigen3/Eigen/Sparse>
#include </usr/include/eigen3/Eigen/Dense>
#include </usr/include/eigen3/Eigen/Eigenvalues>
*/
__global__ void kernel(double *p, double *breaks, double *ind, double *mu, double *cov,
                       double *e, double *v, int *n, char *isgood, int minpts, int maxgpu){
    bool debuginfo = false;
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if(debuginfo)printf("Thread %d got pointer\n",idx);
    if( idx < maxgpu){
        int s_ind = breaks[idx];
        int e_ind = breaks[idx+1];
        int diff = e_ind-s_ind;
        if(diff > minpts){
            int cnt = 0;
            Eigen::MatrixXd local_p(3,diff) ;
            for(int k = s_ind; k < e_ind; k++){
                int temp_ind = ind[k];
                //Eigen::Matrix<double, 3, diff> local_p;
                local_p(1,cnt) = p[temp_ind*3];
                local_p(2,cnt) = p[temp_ind*3+1];
                local_p(3,cnt) = p[temp_ind*3+2];
                cnt++;
            }
            Eigen::Matrix3d centered = local_p.rowwise() - local_p.colwise().mean();
            Eigen::Matrix3d cov_e = (centered.adjoint() * centered) / double(local_p.rows() - 1);
            Eigen::JacobiSVD<Eigen::Matrix3d> svd( cov_e, Eigen::ComputeThinU | Eigen::ComputeThinV);
            /* Eigen::Matrix3d Cp = svd.matrixU() * svd.singularValues().asDiagonal() * svd.matrixV().transpose();
            mu[idx]=p[ind[s_ind]*3];
            mu[idx+1]=p[ind[s_ind+1]*3];
            mu[idx+2]=p[ind[s_ind+2]*3];
            e[idx]=svd.singularValues()(0);
            e[idx+1]=svd.singularValues()(1);
            e[idx+2]=svd.singularValues()(2);
            n[idx] = diff;
            isgood[idx] = 1;
            for(int x = 0; x < 3; x++)
            {
                for(int y = 0; y < 3; y++)
                {
                    v[x+ 3*y +idx*9]=svd.matrixV()(x, y);
                    cov[x+ 3*y +idx*9]=cov_e(x, y);
                    //if(debuginfo)printf("%f ",R[x+ 3*y +i*9]);
                    if(debuginfo)printf("%f ",Rm(x, y));
                }
            }
            */
        } else {
            mu[idx]=0;
            mu[idx+1]=0;
            mu[idx+2]=0;
            e[idx]=0;
            e[idx+1]=0;
            e[idx+2]=0;
            n[idx] = 0;
            isgood[idx] = 0;
            for(int x = 0; x < 3; x++)
            {
                for(int y = 0; y < 3; y++)
                {
                    v[x+ 3*y +idx*9]=0;
                    cov[x+ 3*y +idx*9]=0;
                }
            }
        }
    }
}
First of all, Ubuntu 16.04 provides Eigen 3.3-beta1, which is not really recommended to be used. I would suggest upgrading to a more recent version. Furthermore, to include Eigen, write (e.g.):
#include <Eigen/Eigenvalues>
and compile with -I /usr/include/eigen3 (if you use the version provided by the OS), or better -I /path/to/local/eigen-version.
Then, as talonmies noted, you can't call host functions from kernels (I'm not sure at the moment why JacobiSVD is not marked as a device function), but in your case it would make much more sense to use Eigen::SelfAdjointEigenSolver anyway. Since the matrix you are decomposing is a fixed-size 3x3, you should actually use the optimized computeDirect method:
Eigen::SelfAdjointEigenSolver<Eigen::Matrix3d> eig; // default constructor
eig.computeDirect(cov_e); // works for 2x2 and 3x3 matrices, does not require loops
It seems computeDirect even works on the beta version provided by Ubuntu (I'd still recommend updating).
Some unrelated notes:
The following is wrong, since you should start with index 0:
local_p(1,cnt) = p[temp_ind*3];
local_p(2,cnt) = p[temp_ind*3+1];
local_p(3,cnt) = p[temp_ind*3+2];
Also, you can write this in one line:
local_p.col(cnt) = Eigen::Vector3d::Map(p+temp_ind*3);
This line will not fit (unless diff==3):
Eigen::Matrix3d centered = local_p.rowwise() - local_p.colwise().mean();
What you probably mean is (local_p is actually 3xn not nx3)
Eigen::Matrix<double, 3, Eigen::Dynamic> centered = local_p.colwise() - local_p.rowwise().mean();
And when computing cov_e you need to .adjoint() the second factor, not the first.
You can avoid both 'big' matrices local_p and centered by directly accumulating Eigen::Matrix3d sum2 and Eigen::Vector3d sum, with sum2 += v*v.adjoint() and sum += v, and then computing
Eigen::Vector3d mu = sum / diff;
Eigen::Matrix3d cov_e = (sum2 - mu*mu.adjoint()*diff)/(diff-1);
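Putting these notes together, an untested sketch of how the inner loop could look with the accumulation approach and the direct 3x3 eigensolver (variable names follow the original kernel; eig.eigenvalues()/eig.eigenvectors() then take the place of the SVD results):

Eigen::Vector3d sum  = Eigen::Vector3d::Zero();
Eigen::Matrix3d sum2 = Eigen::Matrix3d::Zero();
for (int k = s_ind; k < e_ind; k++) {
    Eigen::Vector3d pt = Eigen::Vector3d::Map(p + ((int)ind[k]) * 3);
    sum  += pt;                    // running sum of the points
    sum2 += pt * pt.adjoint();     // running sum of outer products
}
Eigen::Vector3d mean  = sum / diff;
Eigen::Matrix3d cov_e = (sum2 - mean * mean.adjoint() * diff) / (diff - 1);

Eigen::SelfAdjointEigenSolver<Eigen::Matrix3d> eig;
eig.computeDirect(cov_e);          // closed-form solver for 2x2 and 3x3 matrices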

OpenACC call to cuMemFreeHost returned error

I'm using the PGI C compiler (pgcc 16.10.0, 64-bit) to learn how to program with OpenACC.
Here is my code to simulate the process of particle transport:
typedef struct {
    double position;
    double direction;
    double weight;
    int cell;
    int group;
    int alive;
} Particle;

int size = 100000;              // number of particles to be simulated
int tot = (int) (1.3 * size);   // this variable limits the maximum of next generation particles
int capacity = 0;               // this variable indicates the actual number of next generation particles

/* particles to be simulated */
Particle *par = (Particle *) malloc(size * sizeof(Particle));
/* next generation particles produced */
Particle *next = (Particle *) malloc(tot * sizeof(Particle));

/* initialization */
for (int i = 0; i < size; i++){
    par[i].position = rand1() * 100.0;   // random number between 0.0~1.0
    par[i].direction = rand2();          // random number between -1.0~1.0
    par[i].weight = 1.0;
    par[i].cell = 2;
    par[i].group = rand1() > 0.5 ? 1 : 2;
    par[i].alive = 1;
}

/* some parameters used in simulation */
double keff = 1.0;
double tracklength, collision, absorption;

/* start simulating */
int generation;
for (generation = 1; generation <= 100; generation++){
    int CellID, MatID, GroupID;
    int k;   // k-th particle to be simulated
    #pragma acc parallel copy(capacity) copyin(par[0:size],size, keff) copyout(next[0:tot])
    #pragma acc loop reduction(+:tracklength, collision, absorption)
    for (k = 0; k < size; k++){
        /* do some calculating with par[k] */
        /* secondary particle produced under certain circumstances */
        if (condition){
            next[capacity].position = par[k].position;
            next[capacity].direction = rand2();
            next[capacity].weight = 1.0;
            next[capacity].cell = par[k].cell;
            next[capacity].group = rand1() < 0.9 ? 1 : 2;
            next[capacity].alive = 1;
            capacity++;
        }
    }
    /* after simulation of current generation, update the parameters */
    keff = ........   // one formula to update keff
    size = capacity;
    capacity = 0;
    tot = (int) (1.3 * size);
    free(par);
    par = next;
    next = (Particle *) malloc(tot * sizeof(Particle));
}
free(par);
free(next);
I compiled the code with:
pgcc -acc -Minfo=accel -ta=tesla:cc30,time -O0 main.c -o test
and got the information below:
Loop carried dependence of par->alive, par->cell, par->direction, par->position, par->weight, par->group prevents parallelization
Loop carried dependence of par->direction, par->group, par->position prevents vectorization
Loop carried reuse of next->position prevents parallelization
Then I ran the executable ./test, and an error occurred:
call to cuMemFreeHost returned error 700: Illegal address during kernel execution
I have no idea how to work it out.
BTW, the code runs well and returns the correct result when compiled with gcc, which ignores the #pragma directives.
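One likely culprit (beyond the data clauses) is that every loop iteration increments the shared capacity counter, which is a race once the loop actually runs in parallel, and is part of why the compiler reports loop-carried dependences. A hedged sketch of claiming a slot atomically with OpenACC's atomic capture; this addresses only the index race, not the rand1()/rand2() calls or the copyin/copyout clauses:

if (condition){
    int my_idx;
    #pragma acc atomic capture
    {
        my_idx = capacity;          // grab the current slot
        capacity = capacity + 1;    // and reserve it atomically
    }
    next[my_idx].position = par[k].position;
    /* ... fill the remaining fields of next[my_idx] as before ... */
}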

"Warning : Non-POD class type passed through ellipsis" for simple thrust program

Despite reading many answers to similar questions on SO, I am not able to figure out a solution in my case. I have written the following code to implement a Thrust program that performs a simple copy and display operation.
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

int main(void)
{
    // H has storage for 4 integers
    thrust::host_vector<int> H(4);
    H[0] = 14;
    H[1] = 20;
    H[2] = 38;
    H[3] = 46;

    // H.size() returns the size of vector H
    printf("\nSize of vector : %d",H.size());
    printf("\nVector Contents : ");
    for (int i = 0; i < H.size(); ++i) {
        printf("\t%d",H[i]);
    }

    thrust::device_vector<int> D = H;
    printf("\nDevice Vector Contents : ");
    for (int i = 0; i < D.size(); i++) {
        printf("%d",D[i]); // This is where I get the warning.
    }
    return 0;
}
Thrust implements certain operations to facilitate using elements of a device_vector in host code, but this apparently isn't one of them.
There are many approaches to addressing this issue. The following code demonstrates 3 possible approaches:
1. Explicitly copy D[i] to a host variable; thrust has an appropriate method defined for that.
2. Copy the thrust device_vector back to a host_vector before print-out.
3. Use thrust::copy to directly copy the elements of the device_vector to a stream.
Code:
#include <stdio.h>
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>

int main(void)
{
    // H has storage for 4 integers
    thrust::host_vector<int> H(4);
    H[0] = 14;
    H[1] = 20;
    H[2] = 38;
    H[3] = 46;

    // H.size() returns the size of vector H
    printf("\nSize of vector : %d",H.size());
    printf("\nVector Contents : ");
    for (int i = 0; i < H.size(); ++i) {
        printf("\t%d",H[i]);
    }

    thrust::device_vector<int> D = H;
    printf("\nDevice Vector Contents : ");

    // method 1
    for (int i = 0; i < D.size(); i++) {
        int q = D[i];
        printf("\t%d",q);
    }
    printf("\n");

    // method 2
    thrust::host_vector<int> Hnew = D;
    for (int i = 0; i < Hnew.size(); i++) {
        printf("\t%d",Hnew[i]);
    }
    printf("\n");

    // method 3
    thrust::copy(D.begin(), D.end(), std::ostream_iterator<int>(std::cout, ","));
    std::cout << std::endl;
    return 0;
}
Note that for methods like these, thrust generates various kinds of device->host copy operations to facilitate the use of device_vector in host code. This has performance implications, so for large vectors you might want to use the explicit bulk copy operations rather than per-element access.
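For example, instead of reading D[i] element by element (one device-to-host transfer per element), the whole vector can be moved in a single bulk copy; a small sketch using a plain host array as the destination:

int h_out[4];                                  // ordinary host storage
thrust::copy(D.begin(), D.end(), h_out);       // one bulk device-to-host copy
for (int i = 0; i < 4; i++) {
    printf("\t%d", h_out[i]);
}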

Julia Set - CUDA, failed to improve the performance

Recently I have been learning from the examples in the book CUDA by Example by Jason Sanders.
The Julia Set example gives a bad performance of 7032 ms.
Here is the program:
#include <cuda.h>
#include <cuda_runtime.h>
#include <cpu_bitmap.h>
#include <book.h>

#define DIM 1024

struct cuComplex{
    float r;
    float i;
    __device__ cuComplex(float a, float b) : r(a), i(b){
    }
    __device__ float magnitude2(void){
        return r*r + i*i;
    }
    __device__ cuComplex operator *(const cuComplex& a){
        return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
    }
    __device__ cuComplex operator +(const cuComplex& a){
        return cuComplex(r+a.r, i+a.i);
    }
};

__device__ int julia(int x, int y){
    const float scale = 1.5;
    float jx = scale * (float)(DIM/2 - x)/(DIM/2);
    float jy = scale * (float)(DIM/2 - y)/(DIM/2);
    cuComplex c(-0.8, 0.156);
    cuComplex a(jx, jy);
    int i = 0;
    for(i = 0; i < 200; i++){
        a = a*a + c;
        if(a.magnitude2() > 1000){
            return 0;
        }
    }
    return 1;
}

__global__ void kernel(unsigned char *ptr){
    int x = blockIdx.x;
    int y = blockIdx.y;
    int offset = x + y*gridDim.x;
    int juliaValue = julia(x, y);
    ptr[offset*4 + 0] = 255*juliaValue;
    ptr[offset*4 + 1] = 0;
    ptr[offset*4 + 2] = 1;
    ptr[offset*4 + 3] = 255;
}

int main(void){
    CPUBitmap bitmap(DIM, DIM);
    unsigned char * dev_bitmap;
    dim3 grid(DIM, DIM);
    dim3 blocks(DIM/16, DIM/16);
    dim3 threads(16, 16);
    dim3 thread(DIM, DIM);
    cudaEvent_t start, stop;
    cudaEvent_t bitmapCpy_start, bitmapCpy_stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventCreate(&bitmapCpy_start));
    HANDLE_ERROR(cudaEventCreate(&bitmapCpy_stop));
    HANDLE_ERROR(cudaMalloc((void **)&dev_bitmap, bitmap.image_size()));
    HANDLE_ERROR(cudaEventRecord(start, 0));
    kernel<<<grid,1>>>(dev_bitmap);
    HANDLE_ERROR(cudaMemcpy(bitmap.get_ptr(), dev_bitmap, bitmap.image_size(), cudaMemcpyDeviceToHost));
    //HANDLE_ERROR(cudaEventRecord(bitmapCpy_stop,0));
    //HANDLE_ERROR(cudaEventSynchronize(bitmapCpy_stop));
    //float copyTime;
    //HANDLE_ERROR(cudaEventElapsedTime(&copyTime,bitmapCpy_start,bitmapCpy_stop));
    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    float elapsedTime;
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
    //printf("Total time is %3.1f ms, time for copying is %3.1f ms \n",elapsedTime,copyTime);
    printf("Total time is %3.1f ms\n", elapsedTime);
    bitmap.display_and_exit();
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaEventDestroy(bitmapCpy_start));
    HANDLE_ERROR(cudaEventDestroy(bitmapCpy_stop));
    HANDLE_ERROR(cudaFree(dev_bitmap));
}
I think the main factor that influences the performance is that the program above runs just 1 thread in every block:
kernel<<<grid,1>>>(dev_bitmap);
So I changed the kernel to the following:
__global__ void kernel(unsigned char *ptr){
    int x = threadIdx.x + blockIdx.x*blockDim.x;
    int y = threadIdx.y + blockIdx.y*blockDim.y;
    int offset = x + y*gridDim.x*blockIdx.x;
    int juliaValue = julia(x, y);
    ptr[offset*4 + 0] = 255*juliaValue;
    ptr[offset*4 + 1] = 0;
    ptr[offset*4 + 2] = 1;
    ptr[offset*4 + 3] = 255;
}
and called the kernel:
dim3 blocks(DIM/16,DIM/16);
dim3 threads(16,16);
kernel<<<blocks,threads>>>(dev_bitmap);
I think this change is not a big deal, but when I ran it, it seemed to run into an endless loop: no image appeared, and I couldn't do anything with my screen; it just blocked there.
toolkit: cuda 5.5
system: ubuntu 12.04
When I run the original code you have posted here, I get a correct display and a time of ~340ms.
When I make your kernel change, I get an "unspecified launch error" on the kernel launch.
In your modified kernel, you have the following, which is an incorrect computation:
int offset = x + y*gridDim.x*blockIdx.x;
When I change it to:
int offset = x + y*gridDim.x*blockDim.x;
I get normal execution and results, and an indicated time of ~10ms.
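For reference, the modified kernel with that one change applied would look like this (the bounds check is an added precaution; with DIM=1024 and 16x16 blocks the grid covers the image exactly):

__global__ void kernel(unsigned char *ptr){
    int x = threadIdx.x + blockIdx.x*blockDim.x;
    int y = threadIdx.y + blockIdx.y*blockDim.y;
    if (x >= DIM || y >= DIM) return;            // guard against oversized grids
    int offset = x + y*gridDim.x*blockDim.x;     // row width = gridDim.x * blockDim.x
    int juliaValue = julia(x, y);
    ptr[offset*4 + 0] = 255*juliaValue;
    ptr[offset*4 + 1] = 0;
    ptr[offset*4 + 2] = 1;
    ptr[offset*4 + 3] = 255;
}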
