I am creating a ruby wrapper for the fftw3 library for the Scientific Ruby Foundation which uses nmatrix objects instead of regular ruby arrays.
I have a curious problem in returning the transformed array in that I am not sure how to do this so I can check the transform has been computed correctly against octave or (something like this) in my specs
I have an idea that I might be best to cast the output array out which is an fftw_complex type to a VALUE to pass it to the nmatrix object before returning but I am not sure whether I should be using a wisdom and getting the values from that with fftw.
Here is the method and the link to the spec output on travis-ci
static VALUE
fftw_r2c_one(VALUE self, VALUE nmatrix)
{
VALUE cNMatrix = rb_define_class("NMatrix", rb_cObject);
fftw_plan plan;
VALUE shape = rb_funcall(nmatrix, rb_intern("shape"), 0);
const int size = NUM2INT(rb_funcall(cNMatrix, rb_intern("size"), 1, shape));
double* in = ALLOC_N(double, size);
for (int i = 0; i < size; i++)
{
in[i] = NUM2DBL(rb_funcall(nmatrix, rb_intern("[]"), 1, INT2FIX(i)));
printf("IN[%d]: in[%.2f] \n", i, in[i]);
}
fftw_complex* out = (fftw_complex *) fftw_malloc(sizeof(fftw_complex) * size + 1);
plan = fftw_plan_dft_r2c(1,&size, in, out, FFTW_ESTIMATE);
fftw_execute(plan);
fftw_destroy_plan(plan);
xfree(in);
fftw_free(out);
return nmatrix;
}
Feel free to clone the repo from github and have a play about, if you like.
Note: I am pretty new to fftw3 and have not used C (or ruby) much, before starting this project. I had got more used to java, python and javascript to date so haven't quite got my head around lower level concepts like memory management but am getting the with this project. Please bear that in mind in your answers, and try to see that they are clear for someone and who up to recently has mainly got used to an object orientated approach up to now by avoiding jargon (or taking care to point it out) as that would really help.
Thank you.
I got some advice from Colin Fuller and after some pointers from him I came up with this solution:
VALUE fftw_complex_to_nm_complex(fftw_complex* in) {
double real = ((double (*)) in)[1];
double imag = ((double (*)) in)[2];
VALUE mKernel = rb_define_module("Kernel");
return rb_funcall(mKernel,
rb_intern("Complex"),
2,
rb_float_new(real),
rb_float_new(imag));
}
/**
fftw_r2c
#param self
#param nmatrix
#return nmatrix
With FFTW_ESTIMATE as a flag in the plan,
the input and and output are not overwritten at runtime
The plan will use a heuristic approach to picking plans
rather than take measurements
*/
static VALUE
fftw_r2c_one(VALUE self, VALUE nmatrix)
{
/**
Define and initialise the NMatrix class:
The initialisation rb_define_class will
just retrieve the NMatrix class that already exists
or define a new class altogether if it does not
find NMatrix. */
VALUE cNMatrix = rb_define_class("NMatrix", rb_cObject);
fftw_plan plan;
const int rank = rb_iv_set(self, "#rank", 1);
// shape is a ruby array, e.g. [2, 2] for a 2x2 matrix
VALUE shape = rb_funcall(nmatrix, rb_intern("shape"), 0);
// size is the number of elements stored for a matrix with dimensions = shape
const int size = NUM2INT(rb_funcall(cNMatrix, rb_intern("size"), 1, shape));
double* in = ALLOC_N(double, size);
fftw_complex* out = (fftw_complex *) fftw_malloc(sizeof(fftw_complex) * size * size);
for (int i = 0; i < size; i++)
{
in[i] = NUM2DBL(rb_funcall(nmatrix, rb_intern("[]"), 1, INT2FIX(i)));;
}
plan = fftw_plan_dft_r2c(1,&size, in, out, FFTW_ESTIMATE);
fftw_execute(plan);
for (int i = 0; i < 2; i++)
{
rb_funcall(nmatrix, rb_intern("[]="), 2, INT2FIX(i), fftw_complex_to_nm_complex(out + i));
}
// INFO: http://www.fftw.org/doc/New_002darray-Execute-Functions.html#New_002darray-Execute-Functions
fftw_destroy_plan(plan);
xfree(in);
fftw_free(out);
return nmatrix;
}
The only problem which remains it getting the specs to recognise the output types which I am looking at solving in the ruby core Complex API
If you want to see any performance benefit from using FFTW then you'll need to re-factor this code so that plan generation is performed only once for a given FFT size, since plan generation is quite costly, while executing the plan is where the performance gains come from.
You could either
a) have two entry points - an initialisation routine which generates the plan and then a main entry point which executes the plan
b) use a memorization technique so that you only generate the plan once, the first time you are called for a given FFT dimension, and then you cache the plan for subsequent re-use.
The advantage of b) is that it is a cleaner implementation with a single entry point; the disadvantage being that it breaks if you call the function with dimensions that change frequently.
Related
I've been trying to create a generalized Gradient Noise generator (which doesn't use the hash method to get gradients). The code is below:
class GradientNoise {
std::uint64_t m_seed;
std::uniform_int_distribution<std::uint8_t> distribution;
const std::array<glm::vec2, 4> vector_choice = {glm::vec2(1.0, 1.0), glm::vec2(-1.0, 1.0), glm::vec2(1.0, -1.0),
glm::vec2(-1.0, -1.0)};
public:
GradientNoise(uint64_t seed) {
m_seed = seed;
distribution = std::uniform_int_distribution<std::uint8_t>(0, 3);
}
// 0 -> 1
// just passes the value through, origionally was perlin noise activation
double nonLinearActivationFunction(double value) {
//return value * value * value * (value * (value * 6.0 - 15.0) + 10.0);
return value;
}
// 0 -> 1
//cosine interpolation
double interpolate(double a, double b, double t) {
double mu2 = (1 - cos(t * M_PI)) / 2;
return (a * (1 - mu2) + b * mu2);
}
double noise(double x, double y) {
std::mt19937_64 rng;
//first get the bottom left corner associated
// with these coordinates
int corner_x = std::floor(x);
int corner_y = std::floor(y);
// then get the respective distance from that corner
double dist_x = x - corner_x;
double dist_y = y - corner_y;
double corner_0_contrib; // bottom left
double corner_1_contrib; // top left
double corner_2_contrib; // top right
double corner_3_contrib; // bottom right
std::uint64_t s1 = ((std::uint64_t(corner_x) << 32) + std::uint64_t(corner_y) + m_seed);
std::uint64_t s2 = ((std::uint64_t(corner_x) << 32) + std::uint64_t(corner_y + 1) + m_seed);
std::uint64_t s3 = ((std::uint64_t(corner_x + 1) << 32) + std::uint64_t(corner_y + 1) + m_seed);
std::uint64_t s4 = ((std::uint64_t(corner_x + 1) << 32) + std::uint64_t(corner_y) + m_seed);
// each xy pair turns into distance vector from respective corner, corner zero is our starting corner (bottom
// left)
rng.seed(s1);
corner_0_contrib = glm::dot(vector_choice[distribution(rng)], {dist_x, dist_y});
rng.seed(s2);
corner_1_contrib = glm::dot(vector_choice[distribution(rng)], {dist_x, dist_y - 1});
rng.seed(s3);
corner_2_contrib = glm::dot(vector_choice[distribution(rng)], {dist_x - 1, dist_y - 1});
rng.seed(s4);
corner_3_contrib = glm::dot(vector_choice[distribution(rng)], {dist_x - 1, dist_y});
double u = nonLinearActivationFunction(dist_x);
double v = nonLinearActivationFunction(dist_y);
double x_bottom = interpolate(corner_0_contrib, corner_3_contrib, u);
double x_top = interpolate(corner_1_contrib, corner_2_contrib, u);
double total_xy = interpolate(x_bottom, x_top, v);
return total_xy;
}
};
I then generate an OpenGL texture to display with like this:
int width = 1024;
int height = 1024;
unsigned char *temp_texture = new unsigned char[width*height * 4];
double octaves[5] = {2,4,8,16,32};
for( int i = 0; i < height; i++){
for(int j = 0; j < width; j++){
double d_noise = 0;
d_noise += temp_1.noise(j/octaves[0], i/octaves[0]);
d_noise += temp_1.noise(j/octaves[1], i/octaves[1]);
d_noise += temp_1.noise(j/octaves[2], i/octaves[2]);
d_noise += temp_1.noise(j/octaves[3], i/octaves[3]);
d_noise += temp_1.noise(j/octaves[4], i/octaves[4]);
d_noise/=5;
uint8_t noise = static_cast<uint8_t>(((d_noise * 128.0) + 128.0));
temp_texture[j*4 + (i * width * 4) + 0] = (noise);
temp_texture[j*4 + (i * width * 4) + 1] = (noise);
temp_texture[j*4 + (i * width * 4) + 2] = (noise);
temp_texture[j*4 + (i * width * 4) + 3] = (255);
}
}
Which give good results:
But gprof is telling me that the Mersenne twister is taking up 62.4% of my time and growing with larger textures. Nothing else individual takes any where near as much time. While the Mersenne twister is fast after initialization, the fact that I initialize it every time I use it seems to make it pretty slow.
This initialization is 100% required for this to make sure that the same x and y generates the same gradient at each integer point (so you need either a hash function or seed the RNG each time).
I attempted to change the PRNG to both the linear congruential generator and Xorshiftplus, and while both ran orders of magnitude faster, they gave odd results:
LCG (one time, then running 5 times before using)
Xorshiftplus
After one iteration
After 10,000 iterations.
I've tried:
Running the generator several times before utilizing output, this results in slow execution or simply different artifacts.
Using the output of two consecutive runs after initial seed to seed the PRNG again and use the value after wards. No difference in result.
What is happening? What can i do to get faster results that are of the same quality as the mersenne twister?
OK BIG UPDATE:
I don't know why this works, I know it has something to do with the prime number utilized, but after messing around a bit, it appears that the following works:
Step 1, incorporate the x and y values as seeds separately (and incorporate some other offset value or additional seed value with them, this number should be a prime/non trivial factor)
Step 2, Use those two seed results into seeding the generator again back into the function (so like geza said, the seeds made were bad)
Step 3, when getting the result, instead of using modulo number of items (4) trying to get, or & 3, modulo the result by a prime number first then apply & 3. I'm not sure if the prime being a mersenne prime matters or not.
Here is the result with prime = 257 and xorshiftplus being used! (note I used 2048 by 2048 for this one, the others were 256 by 256)
LCG is known to be inadequate for your purpose.
Xorshift128+'s results are bad, because it needs good seeding. And providing good seeding defeats the whole purpose of using it. I don't recommend this.
However, I recommend using an integer hash. For example, one from Bob's page.
Here's a result of the first hash of that page, it looks OK to me, and it is fast (I think it is much faster than Mersenne Twister):
Here's the code I've written to generate this:
#include <cmath>
#include <stdio.h>
unsigned int hash(unsigned int a) {
a = (a ^ 61) ^ (a >> 16);
a = a + (a << 3);
a = a ^ (a >> 4);
a = a * 0x27d4eb2d;
a = a ^ (a >> 15);
return a;
}
unsigned int ivalue(int x, int y) {
return hash(y<<16|x)&0xff;
}
float smooth(float x) {
return 6*x*x*x*x*x - 15*x*x*x*x + 10*x*x*x;
}
float value(float x, float y) {
int ix = floor(x);
int iy = floor(y);
float fx = smooth(x-ix);
float fy = smooth(y-iy);
int v00 = ivalue(iy+0, ix+0);
int v01 = ivalue(iy+0, ix+1);
int v10 = ivalue(iy+1, ix+0);
int v11 = ivalue(iy+1, ix+1);
float v0 = v00*(1-fx) + v01*fx;
float v1 = v10*(1-fx) + v11*fx;
return v0*(1-fy) + v1*fy;
}
unsigned char pic[1024*1024];
int main() {
for (int y=0; y<1024; y++) {
for (int x=0; x<1024; x++) {
float v = 0;
for (int o=0; o<=9; o++) {
v += value(x/64.0f*(1<<o), y/64.0f*(1<<o))/(1<<o);
}
int r = rint(v*0.5f);
pic[y*1024+x] = r;
}
}
FILE *f = fopen("x.pnm", "wb");
fprintf(f, "P5\n1024 1024\n255\n");
fwrite(pic, 1, 1024*1024, f);
fclose(f);
}
If you want to understand, how a hash function work (or better yet, which properties a good hash have), check out Bob's page, for example this.
You (unknowingly?) implemented a visualization of PRNG non-random patterns. That looks very cool!
Except Mersenne Twister, all your tested PRNGs do not seem fit for your purpose. As I have not done further tests myself, I can only suggest to try out and measure further PRNGs.
The randomness of LCGs are known to be sensitive to the choice of their parameters. In particular, the period of a LCG is relative to the m parameter - at most it will be m (your prime factor) & for many values it can be less.
Similarly, the careful parameters selection is required to get a long period from Xorshift PRNGs.
You've noted that some PRNGs give good procedural generation results while other do not. In order to isolate the cause, I would factor out the proc gen stuff & examine the PRNG output directly. An easy way to visualize the data is to build a grey scale image where each pixel value is a (possibly scaled) random value. For image based stuff, I find this to be an easy way to find stuff that may lead to visual artifacts. Any artifacts you see with this are likely to cause issues with your proc gen output.
Another option is to try something like the Diehard tests. If the aforementioned image test failed to reveal any problems, I might use this just to be sure my PRNG techniques were trustworthy.
Note that your code seeds the PRNG, then generates one pseudorandom number from the PRNG. The reason for the nonrandomness in xorshift128+ that you discovered is that xorshift128+ simply adds the two halves of the seed (and uses the result mod 264 as the generated number) before changing its state (review its source code). This makes that PRNG considerably different from a hash function.
What you see is the practical demonstration of quality of PRNG. Mersenne Twister is one of the best PRNGs with good performance, it passes DIEHARD tests. One should know that generating a random numbers is not an easy computational task, so looking for a better performance will inevitably result in poor quality. LCG is known to be simplest and worst PRNG ever designed and it clearly shows two-dimensional correlation as in your picture. The quality of Xorshift generators largely depend on bitness and parameters. They are definitely worse than Mersenne Twister, but some (xorshift128+) may work good enough to pass BigCrush battery of TestU01 tests.
In other words, if you are making an important physical modelling numerical experiment, you better continue to use Mersenne Twister as known to be a good trade-off between speed and quality and it comes in many standard libraries. On a less important case you may try to use xorshift128+ generator. For an ultimate results you need to use cryptographical-quality PRNG (none of mentioned here may be used for cryptographical purposes).
I was trying to explore the option of "solveInPlace()" function while using LLT in Eigen3.3.7 to speed up the matrix inverse computation in my application.
I used the following code to test it.
int main()
{
const int M=3;
Eigen::Matrix<MyType,Eigen::Dynamic,Eigen::Dynamic> R = Eigen::Matrix<MyType,Eigen::Dynamic,Eigen::Dynamic>::Zero(M,M);
// to make sure full rank
for(int i=0; i<M*2; i++)
{
const Eigen::Matrix<MyType, Eigen::Dynamic,1> tmp = Eigen::Matrix<MyType,Eigen::Dynamic,1>::Random(M);
R += tmp*tmp.transpose();
}
std::cout<<"R \n";
std::cout<<R<<std::endl;
decltype (R) R0 = R; // saving for later comparison
Eigen::LLT<Eigen::Ref<Eigen::Matrix<MyType,Eigen::Dynamic,Eigen::Dynamic> > > myllt(R);
const Eigen::Matrix<MyType,Eigen::Dynamic,Eigen::Dynamic> I = Eigen::Matrix<MyType,Eigen::Dynamic,Eigen::Dynamic>::Identity(R.rows(), R.cols());
myllt.solveInPlace(I);
std::cout<<"I: "<<I<<std::endl;
std::cout<<"Prod InPlace: \n"<<R0*I<<std::endl;
return 0;
}
After reading the Eigen documentation, I thought that the input matrix (here "R") will be modified while computing the transform. To my surprise, I found that the results is store in "I". This was not expected as I defined "I" as a constant. Please provide an explanation for this behaviour.
The simple non-compiler answer would be that you're asking for the LLT to solve in-place (i.e. in the passed parameter) so what would you expect the result to be? Apparently, you would expect it to be a compiler error, as the "in-place" means change the parameter, but you're passing a const object.
So, if we search the Eigen docs for solveInPlace, we find the only item that takes a const reference to have the following note:
"in-place" version of TriangularView::solve() where the result is written in other
Warning
The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here. This function will const_cast it, so constness isn't honored here.
The non-in-place option would be:
R = myllt.solve(I);
but that won't really speed up the calculation. In any case, benchmark before you decide that you need the in-place option.
You're question is in place, as what const_cast is meant to do is strip references/pointers of their const-ness iff the underlying variable is not const qualified* (cppref). If you were to write some examples
const int i = 4;
int& iRef = const_cast<int&>(i); // UB, i is actually const
std::cout << i; // Prints "I want coffee", or it can as we like UB
int j = 4;
const int& jRef = j;
const_cast<int&>(jRef)++; // Legal. Underlying variable is not const.
std::cout << j; // Prints 5
The case with i may well work as expected or not, we're dependent on each implementation/compiler. It may work with gcc but not with clang or MSVC. There are no guarantees. As you are indirectly invoking UB in your example, the compiler can choose to do what you expect or something else entirely.
*Technically it's the modification that's UB, not the const_cast itself.
I am writing a function in RcppEigen for weighted covariances. In one of the steps I want to take column i and column j of a matrix, X, and compute the cwiseProduct, which should return some kind of vector. The output of cwiseProduct will go into an intermediate variable which can be reused many times. From the docs it seems cwiseProduct returns a CwiseBinaryOp, which itself takes two types. My cwiseProduct operates on two column vectors, so I thought the correct return type should be Eigen::CwiseBinaryOp<Eigen::ColXpr, Eigen::ColXpr>, but I get the error no member named ColXpr in namespace Eigen
#include <RcppEigen.h>
// [[Rcpp::depends(RcppEigen)]]
Rcpp::List Crossprod_sparse(Eigen::MappedSparseMatrix<double> X, Eigen::Map<Eigen::MatrixXd> W) {
int K = W.cols();
int p = X.cols();
Rcpp::List crossprods(W.cols());
for (int i = 0; i < p; i++) {
for (int j = i; j < p; j++) {
Eigen::CwiseBinaryOp<Eigen::ColXpr, Eigen::ColXpr> prod = X.col(i).cwiseProduct(X.col(j));
for (int k = 0; k < K; k++) {
//double out = prod.dot(W.col(k));
}
}
}
return crossprods;
}
I have also tried saving into a SparseVector
Eigen::SparseVector<double> prod = X.col(i).cwiseProduct(X.col(j));
as well as computing, but not saving at all
X.col(i).cwiseProduct(X.col(j));
If I don't save the product at all, the functions returns very quickly, hinting that cwiseProduct is not an expensive function. When I save it into a SparseVector, the function is extremely slow, making me think that SparseVector is not the right return type and Eigen is doing extra work to get it into that type.
Recall that Eigen relies on expression templates, so if you don't assign an expression then this expression is essentially a no-op. In your case, assigning it to a SparseVector is the right thing to do. Regarding speed, make sure to compile with compiler optimizations ON (like -O3).
Nonetheless, I believe there is a faster way to write your overall computations. For instance, are you sure that all X.col(i).cwiseProduct(X.col(j)) are non empty? If not, then the second loop should be rewritten to iterate over the sparse set of overlapping columns only. Loops could also be interchanged to leverage efficient matrix products.
I have an C++11 application where I commonly iterate over several different structure of arrays for various algorithms. Raw CPU performance is important for this app.
The array elements are fundamental types (int, double, ..) or simple struct. The array are typically tens of thousands of elements long. I often need to iterate several arrays at once in a given loop. So typically I would need one pointer for each array of whatever type. So times I need to increment five individual pointers which is verbose.
Based on these answers about tuples,
Why is std::pair faster than std::tuple
C++11 tuple performance
I hoped there was no overhead to using tuples to pack the pointers together into a single object.
I thought it might be nice to implement a cursor like object to assist in iterating, since missing the increment on a particular pointer would be an annoying bug.
auto pts = std::make_tuple(p1, p2, p3...);
allow you to bundle a bunch of variables together in a typesafe way. Then you can implement a variadic template function to increment each pointer in the tuple in a type safe way.
However...
When I measure performance, the tuple version was slower then using raw pointers. But when I look at the generated assembly I see additional mov instructions in the tuple loop increment. Maybe due to the fact the std::get<> returns a reference? I had hoped that would be compiled away...
Am I missing something or are raw pointers just going to beat tuples when used like this? Here is a simple test harness. I threw away the fancy cursor code and just use a std::tuple<> for this test
On my machine, the tuple loop is consistently twice as slow as the raw pointer version for various data sizes.
My system config is Visual C++ 2013 x64 on Windows 8 with a release build. I did try turning on various optimization in Visual Studio such as
Inline Function Expansion : Any Suitable (/Ob2)
but it did not seem to change the time result for my case.
I did need to do two extra things to avoid aggressive optimization by VS
1) I forced the test data array to allocated on the heap, not the stack. That made a big difference when I timed things, possibly due to memory cache effects.
2) I forced a side effect by writing to static variable at the end so the compiler would not just skip my loop.
struct forceHeap
{
__declspec(noinline) int* newData(int M)
{
int* data = new int[M];
return data;
}
};
void timeSumCursor()
{
static int gIntStore;
int maxCount = 20;
int M = 10000000;
// compiler might place array on stack which changes the timing
// int* data = new int[N];
forceHeap fh;
int* data = fh.newData(M);
int *front = data;
int *end = data + M;
int j = 0;
for (int* p = front; p < end; ++p)
{
*p = (++j) % 1000;
}
{
BEGIN_TIMING_BLOCK("raw pointer loop", maxCount);
int* p = front;
int sum = 0;
int* cursor = front;
while (++cursor != end)
{
sum += *cursor;
}
gIntStore = sum;// force a side effect
END_TIMING_BLOCK();
}
printf("%d\n", gIntStore);
{
// just use a simple tuple to show the issue
// rather full blown cursor object
BEGIN_TIMING_BLOCK("tuple loop", maxCount);
int sum = 0;
auto cursor = std::make_tuple(front);
while (++std::get<0>(cursor) != end)
{
sum += *std::get<0>(cursor);
}
gIntStore = sum; // force a side effect
END_TIMING_BLOCK();
}
printf("%d\n", gIntStore);
delete[] data;
}
I've been doing a lot of OpenGL and shaders before, and now, I decided to give a try to OpenCL. I watched some online tutorials, and started reading books on the subject. In order to better understand, and because I believe that the best way to learn is by intelligently trying and learning from the issues that arose while doing so, I decided to start implementing a kernel for a fully-connected perceptron.
For those who don't know what that is, I'll explain the basic idea. It is a neural network in which each neuron of a layer is connected to every neurons of the next layer. Each neuron has but one action to perform: performing the sum of all the neurons from the previous layer, weighted by a different value for each neuron.
This seemed simple enough to implement, and after reading the paper "Parallel Neural Network Training with OpenCL" I implemented it in the following way
Each layer being dependent on the previous one, they're being run sequentially by the host
For computing a layer, I run my kernel with a global work size of the number of neurons within the layer (which can be quite huge, tens of thousand for instance). That makes it so that all the neurons are performing its sum independently to one another.
Each neuron (identified by its global_work_id) performs the weighted sum with all the neurons from the previous layer.
Here is my fully functional opencl kernel:
/**
* #brief Computes one layer of the perceptron given the previous one and the
* weights
* The kernel is run once for each layer.
* The work items are each tasked with computing the output of a single neuron
* of the out layer.
*
* #param out_layer_size
* Size of the output layer (number of elements in the output array that will
* contain the result for each neuron).
* #param in_layer_size
* Number of elements of the input layer
* #param in_value
* Values of the neuron in the previous layer
* #param in_weights
* Array containing the weights for each input neuron. It is organised as a
* two dimensional matrix, written by concatenating each line in the array
* [ w11, w12, w13, ...
* w21, w22, w23, ...
* ..., ..., ..., ...
* ]
* Where wij is the weight linking the neuron i of the input layer to the
* neuron j of the output layer
* #param out_values
* Computed values for the current layer
*/
void kernel perceptron(global const int* in_layer_size, global const int* out_layer_size, global const float *in_value, global const float* in_weights, global float* out_values)
{
private const int global_id = get_global_id(0);
private const int out_layer_s = *out_layer_size;
private const int in_layer_s = *in_layer_size;
private const int offset = out_layer_s * global_id;
private float sum = 0.;
for(int i=0; i < in_layer_s; i++) {
sum += in_weights[i*out_layer_s+global_id] * in_value[i];
}
//out_values[global_id] = sigma(sum);
out_values[global_id] = sum;
}
And here is how I invoke it:
queue.enqueueNDRangeKernel(kernel, cl::NullRange,cl::NDRange(number of neurons within layer),cl::NullRange);
I realize that the bottleneck of this kernel is the implementation of the weighted sum. It would be really helpful if someone could explain how I could improve upon this to make it faster.
I probably don't make proper use of the different memory regions, I'm thinking essentially of the local memory that I don't even use.
Just to give you an idea of performance (that is on an Nvidia GTX 660M), I'll show you some of the times I achieved. Each value is the number of neurons per layer:
2500, 10 000, 2500 : 0.018s ~ 60FPS. It's about 4 to 5 times faster than on my processor (Intel Core i7 running at 2.40GHz)
100 000, 100 000, 500: 140s -> which I guess isn't surpsising since each neuron in the second layer has to perform the weighted sum of 100 000 elements. Running this on my processor yields about the same results.
As you told, bottleneck is the weighted summ. That's not hard to be, as at each layer every WI (Work Item) is doing a lot of IO operations in comparison to number of arithmetic operations. I have no experience in neural networks, but for me problem looks like poor memory access pattern on GPU.
Potentially, that can be solved by organizing your WI into local WGs (Work Groups). As every WI needs to process all data from prev. layer, I guess that all WI in WG can load some amount of data into local memory, process them and than to next bunch of data. This will make your algorithm much more cache friendly. Pseudo-code of kernel looks like:
void kernel Kernel(
__global const int in_layer_size,
__global const int out_layer_size,
__global const float *in_value,
__global const float *in_weights,
__global float *out_values){
__local float buffer[SOME_SIZE];
__global const float* p_in = in_value;
__global float* p_out = out_values;
const int
global_id = get_global_id(0),
local_id = get_local_id(0),
num_buffers = in_layer_size / SOME_SIZE,
offset = out_layer_size * global_id;
float sum = 0.0f;
for(int i=0; i < num_buffers; i++){
buffer[local_id] = p_in[local_id];
barrier(CLK_LOCAL_MEM_FENCE);
//Process all data inside buffer by every WI in WG
//...
p_in += SOME_SIZE;
out_values += SOME_SIZE;
}
//...
return;
}
So, you're sliding with the window of fixed size & calculating data within & then going to next window. Al data operations are done independently, Work Items are only using same data at same time. Optimal size of local group is Device- and Kernel- dependent.
You can do it in many ways.
But the most generic way, without changing how your kernel behaves is to do it is reusing your workgroup size (whatever you selected, or default) and reuse the memory accesses from the group.
I would suggest something like this:
NOTE: I removed thouse ugly pointers for single values. OpenCL supports this, and it is much easier. There is no need to create a memory zone, just do clSetKernelArg(kernel, arg_index, sizeof(cl_float), &size); Where cl_float size = the_size;.
#define IN_LOCAL_SIZE 4096 //Because 16KB/4B (for each float)
void kernel perceptron(global const int in_layer_size, global const int out_layer_size, global const float *in_value, global const float* in_weights, global float* out_values)
{
const int global_id = get_global_id(0);
__local float in_buffer[IN_LOCAL_SIZE];
float sum = 0.0f;
event_t ev;
int j;
//For each full buffer
for(j=0; j < (in_layer_size/IN_LOCAL_SIZE)-1; i++) {
ev = async_work_group_copy(in_buffer, in_value+j*IN_LOCAL_SIZE, IN_LOCAL_SIZE, ev);
wait_group_events(1,&ev);
barrier(CLK_LOCAL_MEM_FENCE);
for(int i=0; i < IN_LOCAL_SIZE; i++) {
sum += in_weights[(i+j*IN_LOCAL_SIZE)*out_layer_size+global_id] * in_buffer[i];
}
}
//Last one
ev = async_work_group_copy(in_buffer, in_value+j*IN_LOCAL_SIZE, in_layer_size%IN_LOCAL_SIZE, ev);
wait_group_events(1,&ev);
barrier(CLK_LOCAL_MEM_FENCE);
for(int i=0; i < in_layer_size%IN_LOCAL_SIZE; i++) {
sum += in_weights[(i+j*IN_LOCAL_SIZE)*out_layer_size+global_id] * in_buffer[i];
}
out_values[global_id] = sum;
}
However, if the output size is small (100k, 250k, 500), then you will have just 500 work items, which is not optimal. In that case you should reshape the algorithm.
One possible way to do it is that each workitem works in the inner layer, performing sums, and the whole work group creates one output out of all the work items. That would be easy, since you can control the sums inside the workgroup easily.
But maybe other approaches fit better your problem.
You can make large improvements by caching in_values in local memory. The fewer times you have to read each element of in_values from global memory, the better.
I have come up with a solution that caches the maximum number of input values, and reads each element from global memory only once per work group. This is done by copying a block of in_values at a time, processing it against all out_values, and moving on to the next block. There is also a local array of floats used to reduce the work items' sums of each block.
pseudocode:
output elements assumed to be set to 0 already
for each block of input values:
cache the input block
for each target output value:
reset local sum to 0
for each element this work item is responsible for:
read the weight, multiply, and add to sum
reduce sums to a single value, ADD value to output element
I haven't had a chance to run this through a profiler or debugger yet, but I will give it a try when I am back at my home PC. (no opencl tools at my office workstation). Make sure to queue kernel with group size equal to the GROUP_SIZE constant. Also, only create a single group per compute unit on your device.
real code:
//experiment with GROUP_SIZE to discover the optimal value for your device
//this needs to be equal to local_work_size passed into clEnqueueNDRangeKernel
//use a multiple of CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
//max. for most devices is 256
#define GROUP_SIZE = 64;
// IN_VALUE_CACHE_SIZE is the number of floats from in_value to copy to local memory at a time
//assuming GROUP_SIZE can be up to 256, sizeof(float)=4, and local memory size is 32kb, full saturation can be achieved with the following:
//(32768 - (256 * 4)) /4 = 7936
//try another multiple of 1024 (6144, 4096... )if there is trouble with this value
#define IN_VALUE_CACHE_SIZE = 7936;
void kernel perceptron(global const int* in_layer_size, global const int* out_layer_size, global const float *in_value, global const float* in_weights, global float* out_values)
{
private const int global_id = get_global_id(0);
private const int out_layer_s = *out_layer_size;
private const int in_layer_s = *in_layer_size;
private const int offset = out_layer_s * global_id;
private const int item_id = get_local_id(0);
private const int group_id = get_group_id(0);
private const int group_count = get_num_groups(0);
local float result_buffer[GROUP_SIZE];
local float in_value_cache[IN_VALUE_CACHE_SIZE];
int i,j,k;
//init the block to 0, in case there are fewer than IN_VALUE_CACHE_SIZE values in total
for(i=item_id; i<IN_VALUE_CACHE_SIZE; i+= GROUP_SIZE){
in_value_cache[i] = 0.0;
}
barrier(CL_LOCAL_MEM_FENCE);
private float sum = 0.0;
event_t e;
int copy_total = 0;
int copy_offset;
for(i=0; i<in_layer_s; i+=IN_VALUE_CACHE_SIZE){
//cap the number of values to copy to local memory if loop is near the end of the input data
copy_total = IN_VALUE_CACHE_SIZE;
if((copy_total + i*IN_VALUE_CACHE_SIZE) > in_layer_s){
copy_total = in_layer_s - i*IN_VALUE_CACHE_SIZE;
}
//copy the next block of values
e = async_work_group_copy(in_value_cache, in_value + i * 4, copy_total, 0);
wait_group_events(1, &e);
for(j=group_id; j<out_layer_s; j+=group_count){
sum = 0.0;
//need to reset result_buffer[item_id] as well
//this is in case there are fewer than GROUP_SIZE input values remaining ie copy_total < GROUP_SIZE
result_buffer[item_id] = 0.0;
for(k=item_id; k<copy_total; k+=GROUP_SIZE){
sum += in_value_cache[k] * in_weights[(k+i) + j * out_layer_s];
}
result_buffer[item_id] = sum;
//simple O(n) reduction can be optimized further
if(item_id == 0){
for(k=1;k<GROUP_SIZE;k++){
sum += result_buffer[k];
}
out_values[j] += sum;
}
barrier(CL_LOCAL_MEM_FENCE);
}
}
}
This will handle input of any size, so you can try it with as many elements as you have global memory for.