I'm attempting to compile the example code for the ODE solver, gsl/gsl_odeiv2, using g++. The code below is from the GSL website:
http://www.gnu.org/software/gsl/manual/html_node/ODE-Example-programs.html
and compiles fine under gcc, but g++ throws the error
invalid conversion from 'void*' to 'int (*)(double, const double*, double*, double*,
void*)' [-fpermissive]
in the code:
#include <stdio.h>
#include <gsl/gsl_errno.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_odeiv2.h>

int func (double t, const double y[], double f[], void *params)
{
  double mu = *(double *)params;
  f[0] = y[1];
  f[1] = -y[0] - mu*y[1]*(y[0]*y[0] - 1);
  return GSL_SUCCESS;
}

int * jac;

int main ()
{
  double mu = 10;
  gsl_odeiv2_system sys = {func, jac, 2, &mu};

  gsl_odeiv2_driver * d = gsl_odeiv2_driver_alloc_y_new (&sys, gsl_odeiv2_step_rkf45, 1e-6, 1e-6, 0.0);
  int i;
  double t = 0.0, t1 = 100.0;
  double y[2] = { 1.0, 0.0 };

  for (i = 1; i <= 100; i++)
    {
      double ti = i * t1 / 100.0;
      int status = gsl_odeiv2_driver_apply (d, &t, ti, y);

      if (status != GSL_SUCCESS)
        {
          printf ("error, return value=%d\n", status);
          break;
        }

      printf ("%.5e %.5e %.5e\n", t, y[0], y[1]);
    }

  gsl_odeiv2_driver_free (d);
  return 0;
}
The error is given on the line
gsl_odeiv2_system sys = {func, jac, 2, &mu};
Any help in solving this issue would be fantastic. I'm hoping to include some stdlib elements, hence wanting to compile it as C++. Also, if I can get it to compile with g++-4.7, I could more easily multithread it using C++11's additions to the language. Thank you very much.
It looks like you have a problem with the Jacobian. In your particular case you could just use NULL instead of jac in the definition of your system, i.e.
gsl_odeiv2_system sys = {func, NULL, 2, &mu};
In general, your Jacobian must be a function with a particular signature (see the GSL manual), which is why your compiler is complaining: jac is declared as int *, not as a function of the required type.
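The required signature is int (*)(double t, const double y[], double *dfdy, double dfdt[], void *params). For the van der Pol system above, a Jacobian would look roughly like the following sketch, based on the analogous example in the GSL manual (double-check the entries against your own system):

int jac (double t, const double y[], double *dfdy, double dfdt[], void *params)
{
  double mu = *(double *)params;
  /* dfdy is a 2x2 row-major matrix of partials df_i/dy_j */
  gsl_matrix_view dfdy_mat = gsl_matrix_view_array (dfdy, 2, 2);
  gsl_matrix * m = &dfdy_mat.matrix;
  gsl_matrix_set (m, 0, 0, 0.0);
  gsl_matrix_set (m, 0, 1, 1.0);
  gsl_matrix_set (m, 1, 0, -2.0*mu*y[0]*y[1] - 1.0);
  gsl_matrix_set (m, 1, 1, -mu*(y[0]*y[0] - 1.0));
  /* the right-hand side does not depend explicitly on t */
  dfdt[0] = 0.0;
  dfdt[1] = 0.0;
  return GSL_SUCCESS;
}

It is only needed if you use one of the implicit steppers; for gsl_odeiv2_step_rkf45 the NULL above is enough.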
Also, you may want to link the gsl library manually:
-L/usr/local/lib -lgsl
if you are on a Linux system.
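For example, a full command line might look like this (assuming the file is named vdp.cpp and GSL is installed under /usr/local; -lgslcblas and -lm are usually needed as well):

g++ -Wall vdp.cpp -o vdp -L/usr/local/lib -lgsl -lgslcblas -lm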
I have written code that I am trying to modify in order to make it use CUDA, and I am having plenty of trouble. Currently, I was trying to make the functions I want to be kernel functions return void, and I got some errors.
Here is the list of errors I am getting:
black_scholes.cu(54): error: calling a __host__ function("cudaMallocManaged<double> ") from a __global__ function("black_scholes_iterate") is not allowed
black_scholes.cu(54): error: identifier "cudaMallocManaged<double> " is undefined in device code
black_scholes.cu(56): error: calling a __host__ function("init_gaussrand_state") from a __global__ function("black_scholes_iterate") is not allowed
black_scholes.cu(56): error: identifier "init_gaussrand_state" is undefined in device code
black_scholes.cu(65): error: calling a __host__ function("spawn_prng_stream") from a __global__ function("black_scholes_iterate") is not allowed
black_scholes.cu(65): error: identifier "spawn_prng_stream" is undefined in device code
black_scholes.cu(66): error: calling a __host__ function("gaussrand1") from a __global__ function("black_scholes_iterate") is not allowed
black_scholes.cu(66): error: identifier "gaussrand1" is undefined in device code
black_scholes.cu(66): error: identifier "uniform_random_double" is undefined in device code
black_scholes.cu(73): error: calling a __host__ function("free_prng_stream") from a __global__ function("black_scholes_iterate") is not allowed
black_scholes.cu(73): error: identifier "free_prng_stream" is undefined in device code
black_scholes.cu(74): error: calling a __host__ function("cudaFree") from a __global__ function("black_scholes_iterate") is not allowed
black_scholes.cu(74): error: identifier "cudaFree" is undefined in device code
I am posting mainly about the first two errors: while learning CUDA via an NVIDIA introductory course, it seemed common to call cudaMallocManaged inside a __global__ function, and I don't get what is different here.
Here is my .cu code:
#include "black_scholes.h"
#include "gaussian.h"
#include "random.h"
#include "util.h"
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
__managed__ double stddev;
__global__ void black_scholes_stddev (void* the_args)
{
    black_scholes_args_t* args = (black_scholes_args_t*) the_args;
    const double mean = args->mean;
    const int M = args->M;
    double variance = 0.0;
    int k = blockIdx.x * blockDim.x + threadIdx.x;

    if (k < M)
    {
        const double diff = args->trials[k] - mean;
        variance += diff * diff / (double) M;
    }

    args->variance = variance;
    stddev = sqrt (variance);
}

__global__ void black_scholes_iterate (void* the_args)
{
    black_scholes_args_t* args = (black_scholes_args_t*) the_args;
    const int S = args->S;
    const int E = args->E;
    const int M = args->M;
    const double r = args->r;
    const double sigma = args->sigma;
    const double T = args->T;
    double* trials = args->trials;
    double mean = 0.0;

    gaussrand_state_t gaussrand_state;
    void* prng_stream = NULL;
    double *randnumbs;
    cudaMallocManaged(&randnumbs, M * sizeof (double));

    init_gaussrand_state (&gaussrand_state);

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.x * blockDim.x + threadIdx.x;

    //for (int i = 0; i < M; i++)
    if (i < M)
    {
        prng_stream = spawn_prng_stream(i%4);
        const double gaussian_random_number = gaussrand1 (&uniform_random_double, prng_stream, &gaussrand_state);
        randnumbs[i] = gaussian_random_number;
        const double current_value = S * exp ( (r - (sigma*sigma) / 2.0) * T + sigma * sqrt (T) * randnumbs[k]);
        trials[k] = exp (-r * T) * ((current_value - E < 0.0) ? 0.0 : current_value - E);
        mean += trials[k] / (double) M; //needs to be shared
        args->mean = mean;
    }

    free_prng_stream (prng_stream);
    cudaFree(randnumbs);
}

void black_scholes (confidence_interval_t* interval,
                    const double S,
                    const double E,
                    const double r,
                    const double sigma,
                    const double T,
                    const int M,
                    const int n)
{
    black_scholes_args_t args;
    double mean = 0.0;
    double conf_width = 0.0;
    double* trials = NULL;

    assert (M > 0);
    trials = (double*) malloc (M * sizeof (double));
    assert (trials != NULL);

    args.S = S;
    args.E = E;
    args.r = r;
    args.sigma = sigma;
    args.T = T;
    args.M = M;
    args.trials = trials;
    args.mean = 0.0;
    args.variance = 0.0;

    (void)black_scholes_iterate<<<1,1>>>(&args);
    mean = args.mean;
    black_scholes_stddev<<<1,1>>> (&args);
    cudaDeviceSynchronize();

    conf_width = 1.96 * stddev / sqrt ((double) M);
    interval->min = mean - conf_width;
    interval->max = mean + conf_width;

    deinit_black_scholes_args (&args);
}

void deinit_black_scholes_args (black_scholes_args_t* args)
{
    if (args != NULL)
        if (args->trials != NULL)
        {
            free (args->trials);
            args->trials = NULL;
        }
}
Any help in understanding what is going on would be appreciated; it seems to be a recurring theme.
Currently, it's not possible to call cudaMallocManaged in CUDA device code. It has never been possible. I don't believe there is NVIDIA training material that demonstrates using cudaMallocManaged in device code.
If you wish to make an in-kernel allocation, I suggest using the methods described in the programming guide. Also, new and delete work similarly to malloc() and free(), for in-kernel usage.
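For illustration, here is a minimal, self-contained sketch (not part of the code above; the kernel name, sizes, and heap limit are made up) showing device-side allocation with in-kernel malloc()/free(), which is allowed in __global__/__device__ code, unlike cudaMallocManaged():

#include <cstdio>

__global__ void device_alloc_demo (int n)
{
    // Each thread allocates its own scratch buffer from the device heap.
    double *scratch = (double *) malloc (n * sizeof (double));
    if (scratch == NULL)
    {
        printf ("thread %d: device malloc failed\n", threadIdx.x);
        return;
    }
    for (int i = 0; i < n; ++i)
        scratch[i] = i;
    free (scratch);   // memory from in-kernel malloc must also be freed in device code
}

int main ()
{
    // The device heap is small by default; enlarge it if the kernel allocates a lot.
    cudaDeviceSetLimit (cudaLimitMallocHeapSize, 8 * 1024 * 1024);
    device_alloc_demo<<<1, 32>>>(16);
    cudaDeviceSynchronize ();
    return 0;
}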
I've got an error regarding calling JacobiSVD in my CUDA function.
This is the part of the code that is causing the error.
Eigen::JacobiSVD<Eigen::Matrix3d> svd( cov_e, Eigen::ComputeThinU | Eigen::ComputeThinV);
And this is the error message.
CUDA_voxel_building.cu(43): error: calling a __host__
function("Eigen::JacobiSVD , (int)2> ::JacobiSVD") from a __global__
function("kernel") is not allowed
I've used the following command to compile it.
nvcc -std=c++11 -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__ -ptx CUDA_voxel_building.cu
I'm using CUDA 8.0 with eigen3 on Ubuntu 16.04.
It seems like other functions, such as eigenvalue decomposition, also give the same error.
Anyone knows a solution? I'm enclosing my code below.
//nvcc -ptx CUDA_voxel_building.cu
#include </usr/include/eigen3/Eigen/Core>
#include </usr/include/eigen3/Eigen/SVD>
/*
#include </usr/include/eigen3/Eigen/Sparse>
#include </usr/include/eigen3/Eigen/Dense>
#include </usr/include/eigen3/Eigen/Eigenvalues>
*/
__global__ void kernel(double *p, double *breaks, double *ind, double *mu, double *cov, double *e, double *v, int *n, char *isgood, int minpts, int maxgpu){

    bool debuginfo = false;
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if(debuginfo)printf("Thread %d got pointer\n",idx);

    if( idx < maxgpu){
        int s_ind = breaks[idx];
        int e_ind = breaks[idx+1];
        int diff = e_ind-s_ind;

        if(diff > minpts){
            int cnt = 0;
            Eigen::MatrixXd local_p(3,diff);
            for(int k = s_ind; k < e_ind; k++){
                int temp_ind = ind[k];
                //Eigen::Matrix<double, 3, diff> local_p;
                local_p(1,cnt) = p[temp_ind*3];
                local_p(2,cnt) = p[temp_ind*3+1];
                local_p(3,cnt) = p[temp_ind*3+2];
                cnt++;
            }

            Eigen::Matrix3d centered = local_p.rowwise() - local_p.colwise().mean();
            Eigen::Matrix3d cov_e = (centered.adjoint() * centered) / double(local_p.rows() - 1);
            Eigen::JacobiSVD<Eigen::Matrix3d> svd( cov_e, Eigen::ComputeThinU | Eigen::ComputeThinV);

            /* Eigen::Matrix3d Cp = svd.matrixU() * svd.singularValues().asDiagonal() * svd.matrixV().transpose();

            mu[idx]=p[ind[s_ind]*3];
            mu[idx+1]=p[ind[s_ind+1]*3];
            mu[idx+2]=p[ind[s_ind+2]*3];

            e[idx]=svd.singularValues()(0);
            e[idx+1]=svd.singularValues()(1);
            e[idx+2]=svd.singularValues()(2);

            n[idx] = diff;
            isgood[idx] = 1;

            for(int x = 0; x < 3; x++)
            {
                for(int y = 0; y < 3; y++)
                {
                    v[x+ 3*y +idx*9]=svd.matrixV()(x, y);
                    cov[x+ 3*y +idx*9]=cov_e(x, y);
                    //if(debuginfo)printf("%f ",R[x+ 3*y +i*9]);
                    if(debuginfo)printf("%f ",Rm(x, y));
                }
            }
            */
        } else {
            mu[idx]=0;
            mu[idx+1]=0;
            mu[idx+2]=0;

            e[idx]=0;
            e[idx+1]=0;
            e[idx+2]=0;

            n[idx] = 0;
            isgood[idx] = 0;

            for(int x = 0; x < 3; x++)
            {
                for(int y = 0; y < 3; y++)
                {
                    v[x+ 3*y +idx*9]=0;
                    cov[x+ 3*y +idx*9]=0;
                }
            }
        }
    }
}
First of all, Ubuntu 16.04 provides Eigen 3.3-beta1, which is not really recommended. I would suggest upgrading to a more recent version. Furthermore, to include Eigen, write (e.g.):
#include <Eigen/Eigenvalues>
and compile with -I /usr/include/eigen3 (if you use the version provided by the OS), or better -I /path/to/local/eigen-version.
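For instance, your existing nvcc invocation would then become something like (assuming the OS-provided Eigen under /usr/include/eigen3):

nvcc -std=c++11 -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__ -I /usr/include/eigen3 -ptx CUDA_voxel_building.cu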
Then, as talonmies noted, you can't call host functions from kernels (I'm not sure at the moment why JacobiSVD is not marked as a device function), but in your case it would make much more sense to use Eigen::SelfAdjointEigenSolver anyway. Since the matrix you are decomposing is a fixed-size 3x3 matrix, you should actually use the optimized computeDirect method:
Eigen::SelfAdjointEigenSolver<Eigen::Matrix3d> eig; // default constructor
eig.computeDirect(cov_e); // works for 2x2 and 3x3 matrices, does not require loops
It seems computeDirect even works on the beta version provided by Ubuntu (I'd still recommend updating).
Some unrelated notes:
The following is wrong, since you should start with index 0:
local_p(1,cnt) = p[temp_ind*3];
local_p(2,cnt) = p[temp_ind*3+1];
local_p(3,cnt) = p[temp_ind*3+2];
Also, you can write this in one line:
local_p.col(cnt) = Eigen::Vector3d::Map(p+temp_ind*3);
This line will not fit (unless diff==3):
Eigen::Matrix3d centered = local_p.rowwise() - local_p.colwise().mean();
What you probably mean is the following (local_p is actually 3xn, not nx3):
Eigen::Matrix<double, 3, Eigen::Dynamic> centered = local_p.colwise() - local_p.rowwise().mean();
And when computing cov_e you need to .adjoint() the second factor, not the first.
You can avoid both 'big' matrices local_p and centered by directly accumulating Eigen::Matrix3d sum2 and Eigen::Vector3d sum with sum2 += v*v.adjoint() and sum += v, and then computing:
Eigen::Vector3d mu = sum / diff;
Eigen::Matrix3d cov_e = (sum2 - mu*mu.adjoint()*diff)/(diff-1);
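Put together, the loop could look roughly like this sketch (not from the original answer; the point vector is named pt and the local mean is named mean so they do not clash with the kernel's v and mu output pointers):

Eigen::Vector3d sum  = Eigen::Vector3d::Zero();
Eigen::Matrix3d sum2 = Eigen::Matrix3d::Zero();
for (int k = s_ind; k < e_ind; k++) {
    int temp_ind = ind[k];
    Eigen::Vector3d pt = Eigen::Vector3d::Map(p + temp_ind * 3);
    sum  += pt;                  // running sum of the points
    sum2 += pt * pt.adjoint();   // running sum of outer products
}
Eigen::Vector3d mean  = sum / diff;
Eigen::Matrix3d cov_e = (sum2 - mean * mean.adjoint() * diff) / (diff - 1);

Eigen::SelfAdjointEigenSolver<Eigen::Matrix3d> eig;
eig.computeDirect(cov_e);        // eigen-decomposition of the 3x3 covariance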
I visited the GNU GSL website and I don't find the example there for solving a differential equation to be intuitive at all (especially because it uses a 2nd order differential equation). https://www.gnu.org/software/gsl/manual/html_node/ODE-Example-programs.html#ODE-Example-programs
Can somebody point me to a descriptive guide on how to solve a very simple first order differential equation?
For example, suppose my function is y'=x+2y (or any such function); how do I write code in GSL to solve it with a given fixed step size and initial condition?
For y'=f(x,y)=x+2y the arrays all have dimension 1, which normally is something to avoid, but here it is instructional. For the explicit solvers, i.e., those not containing imp in the name, you do not need the Jacobian:
#include <stdio.h>
#include <gsl/gsl_errno.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_odeiv2.h>

int odefunc (double x, const double y[], double f[], void *params)
{
  f[0] = x + 2*y[0];
  return GSL_SUCCESS;
}

int * jac; /* unused: the Jacobian slot in sys below is NULL */

int main ()
{
  int dim = 1;
  gsl_odeiv2_system sys = {odefunc, NULL, dim, NULL};

  gsl_odeiv2_driver * d = gsl_odeiv2_driver_alloc_y_new (&sys, gsl_odeiv2_step_rkf45, 1e-6, 1e-6, 0.0);

  int i;
  double x0 = 0.0, xf = 100.0; /* start and end of integration interval */
  double x = x0;
  double y[1] = { 1.0 };       /* initial value */

  for (i = 1; i <= 100; i++)
    {
      double xi = x0 + i * (xf - x0) / 100.0;
      int status = gsl_odeiv2_driver_apply (d, &x, xi, y);

      if (status != GSL_SUCCESS)
        {
          printf ("error, return value=%d\n", status);
          break;
        }

      printf ("%.8e %.8e\n", x, y[0]);
    }

  gsl_odeiv2_driver_free (d);
  return 0;
}
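Since the question asked for a fixed step size: the driver above chooses its internal steps adaptively and only reports at the xi points. If you really want fixed steps, GSL also provides gsl_odeiv2_driver_apply_fixed_step; a rough sketch (the step size h and the step count are made up):

double h = 1e-3;   /* fixed step size */
int status = gsl_odeiv2_driver_apply_fixed_step (d, &x, h, 1000, y);  /* 1000 steps of size h */
if (status != GSL_SUCCESS)
  printf ("error, return value=%d\n", status);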
You may want to look up the book "Introduction to Computational Modeling Using C and Open-Source Tools" by Jose M. Garrido.
Lutzl, Please review:
#include <stdio.h>
#include <gsl/gsl_errno.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_odeiv2.h>

int odefunc (double x, const double y[], double f[], void *params)
{
  f[0] = x + 2*y[0];
  return GSL_SUCCESS;
}

int jac (double x, const double y[], double *dfdy, double dfdx[], void *params)
{
  gsl_matrix_view dfdy_mat = gsl_matrix_view_array (dfdy, 1, 1);
  gsl_matrix *m = &dfdy_mat.matrix;
  gsl_matrix_set (m, 0, 0, x);
  dfdx[0] = 2;
  return GSL_SUCCESS;
}

int main ()
{
  int dim = 1;
  gsl_odeiv2_system sys = {odefunc, jac, dim, NULL};

  gsl_odeiv2_driver * d = gsl_odeiv2_driver_alloc_y_new (&sys, gsl_odeiv2_step_rk1imp, 1e-7, 1e-7, 0.0);

  int i;
  double x0 = 0.0, xf = 1.0; /*al value */

  while (x < xf)
    {
      double xi = x0 + 0.25;
      int status = gsl_odeiv2_driver_apply (d, &x, xi, y);

      if (status != GSL_SUCCESS)
        {
          printf ("error, return value=%d\n", status);
          break;
        }

      printf ("%.8e %.8e\n", x, y[0]);
    }

  gsl_odeiv2_driver_free (d);
  return 0;
}
I have a few for loops that do saturated arithmetic operations.
For instance, the implementation of saturated add in my case is as follows:
static void addsat(Vector &R, Vector &A, Vector &B)
{
    int32_t a, b, r;
    int32_t max_add;
    int32_t min_add;

    const int32_t SAT_VALUE  = (1<<(16-1))-1;
    const int32_t SAT_VALUE2 = (-SAT_VALUE - 1);
    const int32_t sat_cond   = (SAT_VALUE <= 0x7fffffff);
    const uint32_t SAT       = 0xffffffff >> 16;

    for (int i=0; i<R.length; i++)
    {
        a = static_cast<uint32_t>(A.data[i]);
        b = static_cast<uint32_t>(B.data[i]);

        max_add = (int32_t)0x7fffffff - a;
        min_add = (int32_t)0x80000000 - a;

        r = (a>0 && b>max_add) ? 0x7fffffff : a + b;
        r = (a<0 && b<min_add) ? 0x80000000 : a + b;

        if ( sat_cond == 1)
        {
            std_max(r,r,SAT_VALUE2);
            std_min(r,r,SAT_VALUE);
        }
        else
        {
            r = static_cast<uint16_t> (static_cast<int32_t> (r));
        }

        R.data[i] = static_cast<uint16_t>(r);
    }
}
I see that x86 has saturating packed-add instructions (e.g. paddsw) that could be the perfect solution for this loop. I do get the code auto-vectorized, but as a combination of multiple operations mirroring my code. I would like to know the best way to write this loop so that the auto-vectorizer recognizes it as a saturating add.
The Vector structure is:
struct V {
    static constexpr int length = 32;
    unsigned short data[32];
};
The compiler used is clang 3.8 and the code was compiled for the AVX2 Haswell x86-64 architecture.
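For reference, a minimal sketch of the widen-add-clamp idiom that auto-vectorizers commonly recognize as a 16-bit saturating add (whether clang 3.8 actually emits a packed saturating add such as vpaddsw for this exact code is an assumption to verify):

#include <algorithm>
#include <cstdint>

static void addsat16(uint16_t *r, const uint16_t *a, const uint16_t *b, int n)
{
    for (int i = 0; i < n; i++)
    {
        // widen to 32 bits, add, then clamp back to the signed 16-bit range
        int32_t s = static_cast<int16_t>(a[i]) + static_cast<int16_t>(b[i]);
        s = std::min<int32_t>(std::max<int32_t>(s, INT16_MIN), INT16_MAX);
        r[i] = static_cast<uint16_t>(s);
    }
}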
I'm trying to learn how to exploit vectorization with gcc. I followed this tutorial by Erik Holk (with source code here).
I just modified it to use double. I used this dot product to compute the multiplication of randomly generated 1200x1200 square matrices of doubles (300x300 double4). I checked that the results are the same. But what really surprised me is that the simple dot product was actually 10x faster than my manually vectorized one.
Maybe double4 is too big for SSE (it would need AVX2?). But I would expect that even when gcc cannot find a suitable instruction for dealing with double4 at once, it would still be able to exploit the explicit information that the data are in big chunks for auto-vectorization.
Details:
The results were:
dot_simple:
time elapsed 1.90000 [s] for 1.728000e+09 evaluations => 9.094737e+08 [ops/s]
dot_SSE:
time elapsed 15.78000 [s] for 1.728000e+09 evaluations => 1.095057e+08 [ops/s]
I used gcc 4.6.3 on an Intel® Core™ i5 CPU 750 @ 2.67GHz × 4 with the options -std=c99 -O3 -ftree-vectorize -unroll-loops --param max-unroll-times=4 -ffast-math, or with just -O2 (the result was the same).
I did it using python/scipy.weave() for convenience, but I hope it doesn't change anything
The code:
double dot_simple( int n, double *a, double *b ){
    double dot = 0;
    for (int i=0; i<n; i++){
        dot += a[i]*b[i];
    }
    return dot;
}
and this one explicitly using gcc vector extensions:
double dot_SSE( int n, double *a, double *b ){
    const int VECTOR_SIZE = 4;
    typedef double double4 __attribute__ ((vector_size (sizeof(double) * VECTOR_SIZE)));
    double4 sum4 = {0};
    double4* a4 = (double4 *)a;
    double4* b4 = (double4 *)b;
    for (int i=0; i<n; i++){
        sum4 += *a4 * *b4;
        a4++; b4++;
        //sum4 += a4[i] * b4[i];
    }
    union { double4 sum4_; double sum[VECTOR_SIZE]; };
    sum4_ = sum4;
    return sum[0]+sum[1]+sum[2]+sum[3];
}
Then I used it for the multiplication of 300x300 random matrices to measure performance:
void mmul( int n, double* A, double* B, double* C ){
    int n4 = n*4;
    for (int i=0; i<n4; i++){
        for (int j=0; j<n4; j++){
            double* Ai = A + n4*i;
            double* Bj = B + n4*j;
            C[ i*n4 + j ] = dot_SSE( n, Ai, Bj );
            //C[ i*n4 + j ] = dot_simple( n4, Ai, Bj );
            ijsum++;
        }
    }
}
scipy weave code:
def mmul_2(A, B, C, __force__=0 ):
    code = r''' mmul( NA[0]/4, A, B, C ); '''
    weave_options = {
        'extra_compile_args': ['-std=c99 -O3 -ftree-vectorize -unroll-loops --param max-unroll-times=4 -ffast-math'],
        'compiler' : 'gcc', 'force' : __force__ }
    return weave.inline(code, ['A','B','C'], verbose=3, headers=['"vectortest.h"'], include_dirs=['.'], **weave_options )
One of the main problems is that in your function dot_SSE you loop over n items when you should only loop over n/2 items (or n/4 with AVX).
To fix this with GCC's vector extensions you can do this:
double dot_double2(int n, double *a, double *b ) {
    typedef double double2 __attribute__ ((vector_size (16)));
    double2 sum2 = {};
    int i;
    double2* a2 = (double2*)a;
    double2* b2 = (double2*)b;
    for(i=0; i<n/2; i++) {
        sum2 += a2[i]*b2[i];
    }
    double dot = sum2[0] + sum2[1];
    for(i*=2; i<n; i++) dot += a[i]*b[i];
    return dot;
}
The other problem with your code is that it has a dependency chain. Your CPU can do a simultaneous SSE addition and multiplication but only for independent data paths. To fix this you need to unroll the loop. The following code unrolls the loop by 2 (but you probably need to unroll by three for the best results).
double dot_double2_unroll2(int n, double *a, double *b ) {
    typedef double double2 __attribute__ ((vector_size (16)));
    double2 sum2_v1 = {};
    double2 sum2_v2 = {};
    int i;
    double2* a2 = (double2*)a;
    double2* b2 = (double2*)b;
    for(i=0; i<n/4; i++) {
        sum2_v1 += a2[2*i+0]*b2[2*i+0];
        sum2_v2 += a2[2*i+1]*b2[2*i+1];
    }
    double dot = sum2_v1[0] + sum2_v1[1] + sum2_v2[0] + sum2_v2[1];
    for(i*=4; i<n; i++) dot += a[i]*b[i];
    return dot;
}
Here is a version using double4 which I think is really what you wanted with your original dot_SSE function. It's ideal for AVX (though it still needs to be unrolled) but it will still work with SSE2 as well. In fact with SSE it seems GCC breaks it into two chains which effectively unrolls the loop by 2.
double dot_double4(int n, double *a, double *b ) {
    typedef double double4 __attribute__ ((vector_size (32)));
    double4 sum4 = {};
    int i;
    double4* a4 = (double4*)a;
    double4* b4 = (double4*)b;
    for(i=0; i<n/4; i++) {
        sum4 += a4[i]*b4[i];
    }
    double dot = sum4[0] + sum4[1] + sum4[2] + sum4[3];
    for(i*=4; i<n; i++) dot += a[i]*b[i];
    return dot;
}
If you compile this with FMA it will generate FMA3 instructions. I tested all these functions here (you can edit and compile the code yourself as well) http://coliru.stacked-crooked.com/a/273268902c76b116
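For example, a command line along these lines (the file name is made up) should enable FMA code generation:

gcc -O3 -mfma -ffast-math -c dot.c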
Note that using SSE/AVX for a single dot product in matrix multiplication is not the optimal use of SIMD. You should do two (four) dot products at once with SSE (AVX) for double floating point.
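As a rough illustration of that last point (a sketch only, not from the answer): with SSE, the two 64-bit lanes of a 128-bit register can each accumulate a different output element, so the two dot products do not share a dependency chain:

typedef double double2 __attribute__ ((vector_size (16)));

/* Computes dot(a, b0) and dot(a, b1) simultaneously: lane 0 accumulates the
   first product, lane 1 the second. b0 and b1 are two rows/columns of B. */
void dot2(int n, const double *a, const double *b0, const double *b1,
          double *out0, double *out1) {
    double2 acc = {0.0, 0.0};
    for (int i = 0; i < n; i++) {
        double2 ai = {a[i], a[i]};     /* broadcast a[i] to both lanes */
        double2 bi = {b0[i], b1[i]};   /* one element from each vector */
        acc += ai * bi;
    }
    *out0 = acc[0];
    *out1 = acc[1];
}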