I have a vector which stores a list of coordinate points. I would like to count the occurrences of each point within a defined tolerance limit.
Let us say tolerance limit has been defined as 10 for both X and Y value. and the vector contains { (100, 200), ( 110, 205 ), ( 115, 215 ), ( 120, 220 ), ( 100, 200 ), ( 150, 160 ) }. Now my desired output is
( 100, 200 ) 3
( 110, 205 ) 4
( 115, 215 ) 3
( 120,220 ) 2
( 150, 160 ) 1
The first co-ordinate count 3 includes (100, 200), ( 110, 205 ) {reason being the value is within the range 100 +- 10 and 200 +- 10 } and ( 100, 200 )
Here efficiency is high priority
I am posting the solution of this problem using boost. I am posting the code here in case someone needs similar solution in future.
#include<set>
#include <iostream>
#include <boost/geometry.hpp>
#include <boost/geometry/geometries/point.hpp>
#include <boost/geometry/index/rtree.hpp>
using namespace std;
namespace bg = boost::geometry;
namespace bgi = boost::geometry::index;
typedef bg::model::point<int, 2, bg::cs::cartesian> point;
typedef std::pair<point, unsigned> value;
/// Comparator for `point` that orders lexicographically: first by dimension 0
/// (X), then by dimension 1 (Y).
///
/// The earlier form `x1 < x2 || y1 < y2` is not a strict weak ordering: for
/// (1,2) and (2,1) each point compares "less" than the other. Using such a
/// comparator with std::set is undefined behaviour and can drop or duplicate
/// elements.
struct ltstr
{
    bool operator()(const point &p1, const point &p2) const
    {
        const int x1 = p1.get < 0 >();
        const int x2 = p2.get < 0 >();
        if (x1 != x2)
        {
            return x1 < x2;
        }
        // X coordinates are equal: fall back to comparing Y.
        return p1.get < 1 >() < p2.get < 1 >();
    }
};
void main()
{
vector<point> candidatePoints{ point(457, 184), point(457, 184), point(457, 184), point(457, 184), point(457, 184),
point(456, 184), point(456, 184), point(456, 184), point(456, 184), point(456, 184),
point(456, 184), point(457, 184), point(457, 184), point(457, 184), point(458, 184), point(459, 185) };
bgi::rtree< value, bgi::quadratic<16> > rtree;
set<point, ltstr> uniqueCandidatePoints;
for (int i = 0; i < candidatePoints.size(); ++i)
{
int x = candidatePoints[i].get < 0 >();
int y = candidatePoints[i].get < 1 >();
uniqueCandidatePoints.insert(point(x, y));
rtree.insert(make_pair(candidatePoints[i], i));
}
for (auto it = uniqueCandidatePoints.begin(); it != uniqueCandidatePoints.end(); ++it)
{
std::vector<value> returnedValues;
point currentItem = *it;
rtree.query(bgi::satisfies([&](value const& v) {return bg::distance(v.first, currentItem) < 5; }),
std::back_inserter(returnedValues));
cout << "Current Item: " << currentItem.get < 0 >() << "," << currentItem.get < 1 >() << "Count: " << returnedValues.size() << endl;
}
getchar();
}
Related
Currently I'm learning how to create games (at a low level) in my degree. I'm programming on Ubuntu MATE 16.04 with Code::Blocks 13.12, and this happens:
-------------- Build: Debug in s04 (compiler: GNU GCC Compiler)---------------
g++ -o bin/Debug/s04 obj/Debug/main.o obj/Debug/Pantalla.o
obj/Debug/main.o: file not recognized: File format not recognized
collect2: error: ld returned 1 exit status
Process terminated with status 1 (0 minute(s), 0 second(s))
0 error(s), 0 warning(s) (0 minute(s), 0 second(s))
I must create a new window where my "game" will run... (I add code)
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "Pantalla.h"
//Ej.1
/* A bullet: integer position (x, y) and the integer step (vx, vy) that
 * mueve_bala() adds to the position. `Bala` is a pointer to this record. */
typedef struct BalaRep
{
    int x;   /* horizontal position */
    int y;   /* vertical position */
    int vx;  /* horizontal step */
    int vy;  /* vertical step */
} * Bala;
//Ej.2
Bala crea_bala ( double x, double y, double vx, double vy )
{
Bala b=malloc(sizeof(struct BalaRep));
b->x = x;
b->y = y;
b->vx = vx;
b->vy = vy;
return b;
}
//Ej.3
/* Releases a bullet created by crea_bala(). A NULL bullet is accepted and
 * ignored, matching the behaviour of free(NULL). */
void libera_bala( Bala b )
{
    if (b != NULL)
    {
        free(b);
    }
}
//Ej.4
/* Advances the bullet one step: adds (vx, vy) to (x, y). `b` must not be NULL. */
void mueve_bala( Bala b )
{
    b->x += b->vx;
    b->y += b->vy;
}
//Ej.5
/* Draws the bullet as a 7x7 rectangle at its current position. `b` must not
 * be NULL. */
void dibuja_bala( Bala b )
{
    const int lado = 7; /* side length of the bullet rectangle */
    Pantalla_DibujaRectangulo( b->x, b->y, lado, lado);
}
//Ej.6
/*
double get_x_bala( Bala b )
{
return b->x;
}
*/
//Ej.7
/*
double get_y_bala( Bala b )
{
return b->y;
}
*/
/*
 * Game entry point: opens a 640x480 window and runs the frame loop until the
 * window is no longer active.
 *
 * Each frame: SPACE spawns a bullet at the player's position (replacing any
 * existing one), LEFT/RIGHT move the player, the enemy bounces horizontally
 * between the window borders, and the bullet (if any) moves and is destroyed
 * once it reaches the top of the window.
 */
int main( int argc, char *argv[] )
{
    Pantalla_Crea("Ejemplo 3", 640,480);
    Pantalla_ColorTrazo(255,0,0, 255);
    int x = 280;
    int y = 425;
    int x2 = 200;
    int y2 = 100;
    int vx2 = 5;
    Bala b = NULL;
    while ( Pantalla_Activa() )
    {
        /* Create a bullet (any previous bullet is released first). */
        if (Pantalla_TeclaPulsada(SDL_SCANCODE_SPACE))
        {
            libera_bala(b);
            b=NULL;
            b=crea_bala(x,y,0,-10);
        }
        /* Player rectangle movement. */
        if (Pantalla_TeclaPulsada(SDL_SCANCODE_RIGHT))
        {
            x = x + 5;
        }
        if (Pantalla_TeclaPulsada(SDL_SCANCODE_LEFT))
        {
            x = x - 5;
        }
        /*if (Pantalla_TeclaPulsada(SDL_SCANCODE_UP))
        {
        y = y - 5;
        }
        if (Pantalla_TeclaPulsada(SDL_SCANCODE_DOWN))
        {
        y = y + 5;
        }*/
        /* Keep the player inside the window (player is 80x40). */
        if (x > 640-80)
        {
            x = 640 - 80;
        }
        if (x < 0)
        {
            x = 0;
        }
        if (y > 480-40)
        {
            y = 480 - 40;
        }
        if (y < 0)
        {
            y = 0;
        }
        /* Enemy movement. */
        x2 = x2 + vx2;
        /* Keep the enemy inside the window, reversing direction at a border. */
        if (x2 > 640-80)
        {
            x2 = 640 - 80;
            vx2 = vx2 * (-1);
        }
        if (x2 < 0)
        {
            x2 = 0;
            vx2 = vx2 * (-1);
        }
        /* Bullet: move it, then destroy it once it reaches the top edge. */
        if (b!=NULL)
        {
            mueve_bala(b);
        }
        if ((b!=NULL) && ((b->y) <= 0))
        {
            libera_bala(b);
            b=NULL;
        }
        Pantalla_DibujaRellenoFondo( 255,255,255, 255 );
        Pantalla_DibujaRectangulo( x, y, 80,40 );
        Pantalla_DibujaRectangulo( x2, y2, 80,40 );
        if (b!=NULL)
        {
            dibuja_bala(b);
        }
        Pantalla_Actualiza();
        Pantalla_Espera(40);
    }
    /* Release a bullet that is still in flight when the window closes. */
    libera_bala(b);
    b = NULL;
    Pantalla_Libera();
    return 0;
}
There is a file that our teachers give us so that it runs properly. Furthermore, my classmate ran the same code (the code I added above) on his laptop and it works. Excuse me, I know my English is bad...
Obviously the object files are incompatible, either because your build process is broken or, if your professor just gives you the compiled object file, because it is not ABI-compatible with your implementation, i.e. compiler, OS, architecture.
I'm using SDL2 with Crystal to make a 16bit RPG style tile-based game. I've seen this question asked a ton, but even with all the answers I've come across, I'm still not getting the movement I'm looking for. Have you ever played Final Fantasy IV, V, or VI on the SNES? I'm looking for movement like that. No diagonal, character is always over a tile, and never stops between 2 tiles.
# main game loop
# Runs once per frame: polls one SDL event, moves the character right when the
# event's key symbol is "right", then draws the character and presents the frame.
loop do
# NOTE(review): `Time.monotonic` yields a time span and `.milliseconds` is
# presumably only its 0-999 millisecond component, so `ticks` would cycle
# between 0.0 and 0.999 instead of being the time elapsed since the previous
# frame - confirm against the Crystal Time::Span documentation.
ticks = Time.monotonic.milliseconds / 1000.0
# Only a single event is polled per iteration.
case event = SDL::Event.poll
when SDL::Event::Keyboard
case event.sym
when .right?
character.move_right(ticks)
end
end
character.draw(renderer)
renderer.present
#other code handling break and stuff omitted
end
# character.cr
VELOCITY = 100

# Faces the character east and advances its x position by VELOCITY scaled by
# `delta_ticks`.
#
# The instance-variable sigil `@` had been garbled to `#`, which turned both
# statements into comments and left the method with an empty body.
def move_right(delta_ticks)
  @direction_facing = "east"
  @x += VELOCITY * delta_ticks
end

# Copies the sprite for the current facing direction to `renderer`, into a
# 64x64 destination rectangle at the character's (truncated) x/y position.
def draw(renderer)
  sprite = @directions[@direction_facing]
  renderer.copy(sprite, dstrect: SDL::Rect[@x.to_i, @y.to_i, 64, 64])
end
The way my current movement works, the character starts walking slowly, then picks up speed, then drops back down to walking slowly, as if it were shifting gears. I know my line @x += VELOCITY * delta_ticks is wrong, but I wasn't able to find one that worked how I wanted. This also doesn't take into account stopping directly over a tile (in this case 64x64).
EDIT: I've tried to transpose the suggestion @genpfault gave. It still doesn't do what I want, but since I don't know C++, I may have missed some things. That code update is here
Make a little "tasklet" helper (I know zero about Crystal; in C++ I'd just have this be a class/struct with member data & functions) that encapsulates the character's current tile x/y position (and fine, sub-tile x/y position)
When you handle the left/right/up/down input, check if a current tasklet is still doing its thing; if not, make a new tasklet with the desired direction
Each frame while a tasklet is active, process it: increment/decrement (1px/frame? up to you) the character's fine x/y position until it hits the goal tile position; if the tasklet hits the goal position this frame, remove it (and update the character's tile position)
This way you prevent new input from interfering with character motion while it's in progress, as well as smoothly animating tile transitions.
Something like this:
#include <SDL2/SDL.h>
#include <memory>
/// Position of a character on the tile grid.
///
/// The tile coordinates give the tile the character occupies; the fine
/// coordinates are the offset from that tile in 16ths of a tile. All members
/// default to 0 so a default-constructed Character never holds indeterminate
/// values.
struct Character
{
    int m_TileX = 0;
    int m_TileY = 0;
    int m_FineX = 0; // in 16ths of a tile
    int m_FineY = 0; // in 16ths of a tile
};
/// Interface for a unit of work that is advanced by repeated calls to Run().
class ITask
{
public:
    /// Virtual so that implementations can be destroyed through an ITask pointer.
    virtual ~ITask() = default;

    /// Advances the task by one step.
    /// @return true once the task has finished, false while it still has work.
    virtual bool Run() = 0;
};
/// Task that slides a Character by (dx, dy) fine units per Run() call until a
/// fine coordinate reaches a full tile (16 units), at which point the tile
/// coordinate is adjusted and the fine coordinate is reset to 0.
class CharacterAnimator : public ITask
{
public:
    /// @param c  character to move; held by reference, so it must outlive this task.
    /// @param dx change applied to the fine X coordinate on every Run().
    /// @param dy change applied to the fine Y coordinate on every Run().
    CharacterAnimator( Character& c, int dx, int dy )
        : m_C( c )
        , m_Dx( dx )
        , m_Dy( dy )
    {}

    ~CharacterAnimator() override {}

    /// Applies one step of movement.
    /// @return true when either axis completed a tile transition on this step.
    bool Run() override
    {
        m_C.m_FineX += m_Dx;
        m_C.m_FineY += m_Dy;

        const bool xArrived = SnapAxis( m_C.m_FineX, m_C.m_TileX );
        const bool yArrived = SnapAxis( m_C.m_FineY, m_C.m_TileY );
        return xArrived || yArrived;
    }

private:
    /// Converts a full tile of fine offset on one axis into a tile step.
    /// @return true if the axis crossed into a neighbouring tile.
    static bool SnapAxis( int& fine, int& tile )
    {
        if( fine <= -16 )
        {
            --tile;
            fine = 0;
            return true;
        }
        if( fine >= 16 )
        {
            ++tile;
            fine = 0;
            return true;
        }
        return false;
    }

    Character& m_C;
    int m_Dx;
    int m_Dy;
};
int main( int argc, char** argv )
{
SDL_Init( SDL_INIT_EVERYTHING );
SDL_Window * window = SDL_CreateWindow
(
"SDL2",
SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED,
640, 480,
SDL_WINDOW_SHOWN
);
SDL_Renderer* renderer = SDL_CreateRenderer
(
window,
0,
SDL_RENDERER_ACCELERATED | SDL_RENDERER_PRESENTVSYNC
);
SDL_RenderSetLogicalSize( renderer, 320, 240 );
Character c;
c.m_TileX = 9;
c.m_TileY = 7;
c.m_FineX = 0;
c.m_FineY = 0;
std::unique_ptr< ITask > movementTask;
bool running = true;
while( running )
{
if( movementTask && movementTask->Run() )
{
movementTask.reset();
}
SDL_Event ev;
while( SDL_PollEvent( &ev ) )
{
if ( ev.type == SDL_QUIT )
running = false;
if( ev.type == SDL_KEYUP && ev.key.keysym.sym == SDLK_ESCAPE )
running = false;
if( ev.type == SDL_KEYDOWN && ev.key.keysym.sym == SDLK_UP && !movementTask )
movementTask = std::unique_ptr< ITask >( new CharacterAnimator( c, 0, -1 ) );
if( ev.type == SDL_KEYDOWN && ev.key.keysym.sym == SDLK_DOWN && !movementTask )
movementTask = std::unique_ptr< ITask >( new CharacterAnimator( c, 0, 1 ) );
if( ev.type == SDL_KEYDOWN && ev.key.keysym.sym == SDLK_LEFT && !movementTask )
movementTask = std::unique_ptr< ITask >( new CharacterAnimator( c, -1, 0 ) );
if( ev.type == SDL_KEYDOWN && ev.key.keysym.sym == SDLK_RIGHT && !movementTask )
movementTask = std::unique_ptr< ITask >( new CharacterAnimator( c, 1, 0 ) );
}
SDL_SetRenderDrawColor( renderer, 0, 0, 0, 255 );
SDL_RenderClear( renderer );
// draw character
SDL_SetRenderDrawColor( renderer, 255, 0, 0, 255 );
SDL_Rect r =
{
c.m_TileX * 16 + c.m_FineX,
c.m_TileY * 16 + c.m_FineY,
16,
16
};
SDL_RenderFillRect( renderer, &r );
SDL_RenderPresent( renderer );
}
SDL_DestroyRenderer( renderer );
SDL_DestroyWindow( window );
SDL_Quit();
return 0;
}
Could anyone help me, please? I am writing a program in C++ that randomly generates some simple code from the following BNF grammar:
<prog> ::= “int main() { <stat_list> return 0; }”
<stat_list> ::= <stat>
| <stat_list> <stat>
<stat> ::= <cmpd_stat>
| <if_stat>
| <iter_stat>
| <assgn_stat>
| <decl_stat>
<cmpd_stat> ::= { <stat_list> }
<if_stat> ::= if ( <exp> ) <stat>
| if ( <exp> ) <cmpd_stat>
| if ( <exp> ) <stat> else <stat>
| if ( <exp> ) <cmpd_stat> else <stat>
| if ( <exp> ) <stat> else <cmpd_stat>
| if ( <exp> ) <cmpd_stat> else <cmpd_stat>
<iter_stat> ::= while ( <exp> ) <stat>
| while ( <exp> ) <cmpd_stat>
<assgn_stat> ::= <id> = <exp> ;
<decl_stat> ::= <type> <id> ;
| <type> <assgn_stat>
<exp> ::= <exp> <op> <exp>
| <id>
| <const>
<op> ::=+|-|*|/
<type> ::=int
| double
<id> ::= <char><chardigit_seq>
<const> ::= <digit><digit_seq>
<chardigit_seq>::= [empty]
| <char><chardigit_seq>
| <digit><chardigit_seq>
<char> ::= [A-Z] | [a-z] | _
<digit> ::= [0-9]
The output code should create the simple complete program like this (it need to be correct in syntax, but not in semantic):
int main()
{
int F0Z = 0262;
if (22682 / 525)
double S1;
else
S = U;
while (8 - 594873)
{
while (97942 / 6871573097 * 7261055)
{
while (9307 * M / 4 / 2 + 4 - 7 / K)
{
double A;
}
}
}
return 0;
}
Here is the code I am working with. Could someone please check it and tell me what is wrong with it? Thank you very much for your help.
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <string>
#include <fstream>
using namespace std;
class Production {
private:
string lhs;
vector<string> rhs_options; // list of options for expansion
vector<double> trans_probs; // list of probabilities associated
// with each choice
public:
Production();
Production(string);
void add_rhs(string, double); // adds new rhs to the production
string expand() const; // returns one of the rhs choices using
//a random number generator
};
Production::Production() {
lhs = "<prog>"; //first lhs option, all C++ programs start with this
rhs_options.push_back("int main() { <stat_list> return 0; }"); //only
rhs option for <prog>
trans_probs.push_back(1.0); //100% chance of the rhs result for
<prog>, since it is the only option
}
Production::Production(string s) {
lhs = s; //set the lhs to whatever is passed.
}
void Production::add_rhs(string s, double num) {
rhs_options.push_back(s); //each rhs option and probability is added
trans_probs.push_back(num);
}
string Production::expand() const { //returns one of the options from
rhs vector
srand(time(0));
double random = (rand() + 0.0) / RAND_MAX; //generates random number
between 0.0 <= x <= 1.0
for (int i = 0; i < (int)trans_probs.size(); i++) {
if (trans_probs[i] >= random)
return rhs_options[i];
}
return rhs_options[0];
}
//method prototypes
string progm();
string stat_listm();
string statm();
string cmpd_statm();
string if_statm();
string iter_statm();
string assgn_statm();
string decl_statm();
string expm();
string opm();
string typem();
string idm();
string constm();
string char_digit_seqm();
string digit_seqm();
string charm();
string digitm();
string progm() {
Production prog("<prog>");
prog.add_rhs("int main() { " + stat_listm() + " return 0; }", 1.0);
//only rhs option for <prog>, therefore 100% chance
return prog.expand();
}
string stat_listm() {
Production stat_list("<stat_list>");
stat_list.add_rhs(statm(), 0.5);
stat_list.add_rhs(stat_listm() + " " + statm(), 1.0);
return stat_list.expand();
}
string statm() {
Production stat("<stat>");
stat.add_rhs(cmpd_statm(), 0.05); //5%
stat.add_rhs(if_statm(), 0.2); //15% (5+15=20)
stat.add_rhs(iter_statm(), 0.35); //15% (20+15=35)
stat.add_rhs(assgn_statm(), 0.65); //30% (35+30=65)
stat.add_rhs(decl_statm(), 1.0); //35% (65+35=100)
return stat.expand();
}
string cmpd_statm() {
Production cmpd_stat("<cmpd_stat>");
cmpd_stat.add_rhs("{ " + stat_listm() + " }", 1.0);
return cmpd_stat.expand();
}
string if_statm() {
Production if_stat("<if_stat>");
if_stat.add_rhs("if ( " + expm() + " ) " + statm(), 0.3);
if_stat.add_rhs("if ( " + expm() + " ) " + cmpd_statm(), 0.5);
if_stat.add_rhs("if ( " + expm() + " ) " + statm() + " else " +
statm(), 0.7);
if_stat.add_rhs("if ( " + expm() + " ) " + cmpd_statm() + " else " +
statm(), 0.85);
if_stat.add_rhs("if ( " + expm() + " ) " + statm() + " else " +
cmpd_statm(), 0.95);
if_stat.add_rhs("if ( " + expm() + " ) " + cmpd_statm() + " else " +
cmpd_statm(), 1.0);
return if_stat.expand();
}
string iter_statm() {
Production iter_stat("<iter_stat>");
iter_stat.add_rhs("while ( " + expm() + " ) " + statm(), 0.75);
iter_stat.add_rhs("while ( " + expm() + " ) " + cmpd_statm(), 1.0);
return iter_stat.expand();
}
string assgn_statm() {
Production assgn_stat("<assgn_stat>");
assgn_stat.add_rhs(idm() + " = " + expm() + " ;", 1.0);
return assgn_stat.expand();
}
string decl_statm() {
Production decl_stat("<decl_stat>");
decl_stat.add_rhs(typem() + " " + idm() + " ;", 0.75);
decl_stat.add_rhs(typem() + " " + assgn_statm(), 1.0);
return decl_stat.expand();
}
string expm() {
Production exp("<exp>");
exp.add_rhs(expm() + " " + opm() + " " + expm(), 0.5);
exp.add_rhs(idm(), 0.75);
exp.add_rhs(constm(), 1.0);
return exp.expand();
}
string opm() {
Production op("<op>");
op.add_rhs("+", 0.25);
op.add_rhs("-", 0.50);
op.add_rhs("*", 0.75);
op.add_rhs("/", 1.0);
return op.expand();
}
string typem() {
Production type("<type>");
type.add_rhs("int", 0.5);
type.add_rhs("double", 1.0);
return type.expand();
}
string idm() {
Production id("<id>");
id.add_rhs(charm() + char_digit_seqm(), 1.0);
return id.expand();
}
string constm() {
Production constant("<const>");
constant.add_rhs(digitm() + digit_seqm(), 1.0);
return constant.expand();
}
string char_digit_seqm() {
Production char_digit_seq("<char_digit_seq>");
char_digit_seq.add_rhs("", 0.25);
char_digit_seq.add_rhs(charm() + char_digit_seqm(), 0.5);
char_digit_seq.add_rhs(digitm() + char_digit_seqm(), 1.0);
return char_digit_seq.expand();
}
string digit_seqm() {
Production digit_seq("<digit_seq>");
digit_seq.add_rhs("", 0.25);
digit_seq.add_rhs(digitm() + digit_seqm(), 1.0);
return digit_seq.expand();
}
string charm() {
Production character("<char>");
char alpha[]={'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','_'};
for (int i = 0; i < 53; i++)
{
string letter(1, alpha[i]);
character.add_rhs(letter, (1.0/53 + i/53.0));
}
return character.expand();
}
string digitm()
{
Production digit("<digit>");
char nums[] = {'0','1','2','3','4','5','6','7','8','9'};
for (int i = 0; i < 10; i++)
{
string num(1, nums[i]);
digit.add_rhs(num, (1.0/10 + i/10.0));
}
return digit.expand();
}
int main()
{
...... // what should to write here?
cout << progm(); // cannot, caused lot of errors.
return 0;
}
I'm building a CUDA kernel to compute the numerical N*N jacobian of a function, using finite differences; in the example I provided, it is the square function (each entry of the vector is squared). The host coded allocates in linear memory, while I'm using a 2-dimensional indexing in the kernel.
My issue is that I haven't found a way to sum on the diagonal of the matrices cudaMalloc'ed. My attempt has been to use the statement threadIdx.x == blockIdx.x as a condition for the diagonal, but instead it evaluates to true only for them both at 0.
Here is the kernel and EDIT: I posted the whole code as an answer, based on the suggestions in the comments (the main() is basically the same, while the kernel is not)
// Finite-difference Jacobian kernel (version from the question).
//
// Each thread copies un[t] into its temp_sx/temp_dx entry, the thread with
// t == b perturbs that entry by +/- temp_du[t], d_func is applied to row b of
// both arrays, and the central difference divided by 2*temp_du[t] is stored in
// J[tid].
//
// Parameters: J receives the result; t0/tn select how the perturbation is
// computed (tn == t0 uses u0, otherwise un - un_old); h is not used by the
// active code; u0, un, un_old are input vectors indexed by threadIdx.x.
//
// NOTE(review): temp_sx/temp_dx have their __shared__ qualifier commented out,
// so they are automatic arrays of the calling thread; values written by other
// threads would not be visible through them - confirm this is intended.
// NOTE(review): their first index is b = blockIdx.x while the first dimension
// is BLOCK_SIZE, which looks out of range whenever the grid has more than
// BLOCK_SIZE blocks - confirm against the launch configuration.
template <typename T>
__global__ void jacobian_kernel (
T * J,
const T t0,
const T tn,
const T h,
const T * u0,
const T * un,
const T * un_old)
{
// cgamma is only referenced by the commented-out scaling near the end.
T cgamma = 2 - sqrtf(2);
const unsigned int t = threadIdx.x;
const unsigned int b = blockIdx.x;
// Global linear index of this thread.
const unsigned int tid = t + b * blockDim.x;
/*__shared__*/ T temp_sx[BLOCK_SIZE][BLOCK_SIZE];
/*__shared__*/ T temp_dx[BLOCK_SIZE][BLOCK_SIZE];
__shared__ T sm_temp_du[BLOCK_SIZE];
T* temp_du = &sm_temp_du[0];
// NOTE(review): tid is a global index over all launched threads, so this guard
// admits only the first N of them - the accompanying text says N*N was meant.
if (tid < N )
{
temp_sx[b][t] = un[t];
temp_dx[b][t] = un[t];
// Intended as the "diagonal" test: thread index equals block index.
if ( t == b )
{
if ( tn == t0 )
{
// First step: perturbation is 0.001 * u0[t].
temp_du[t] = u0[t]*0.001;
temp_sx[b][t] += temp_du[t]; //(*)
temp_dx[b][t] -= temp_du[t];
// Nudge values that are (nearly) zero away from zero.
temp_sx[b][t] += ( abs( temp_sx[b][t] ) < 10e-6 ? 0.1 : 0 );
temp_dx[b][t] += ( abs( temp_dx[b][t] ) < 10e-6 ? 0.1 : 0 );
temp_sx[b][t] = ( temp_sx[b][t] == 0 ? 0.1 : temp_sx[b][t] );
temp_dx[b][t] = ( temp_dx[b][t] == 0 ? 0.1 : temp_dx[b][t] );
}
else
{
// Later steps: perturbation is the change since the previous iterate,
// but at least 10e-6.
temp_du[t] = MAX( un[t] - un_old[t], 10e-6 );
temp_sx[b][t] += temp_du[t];
temp_dx[b][t] -= temp_du[t];
}
}
// NOTE(review): this barrier sits inside `if (tid < N)`; if only part of a
// block passes the guard, not every thread reaches it - confirm.
__syncthreads();
//J = f(tn, un + du)
d_func(tn, (temp_sx[b]), (temp_sx[b]), 1.f);
d_func(tn, (temp_dx[b]), (temp_dx[b]), 1.f);
__syncthreads();
// Central difference. NOTE(review): temp_du[t] is written only by threads
// with t == b, yet every thread reads it here - confirm.
J[tid] = (temp_sx[b][t] - temp_dx[b][t]) * powf((2 * temp_du[t]), -1);
//J[tid]*= - h*cgamma/2;
//J[tid]+= ( t == b ? 1 : 0);
//J[tid] = temp_J[tid];
}
}
The general procedure for computing the jacobian is
Copy un into every row of temp_sx and temp_dx
Compute du as 0.001 times u0 (a small perturbation relative to the magnitude of u0)
Sum du to the diagonal of temp_sx, subtract du from the diagonal of temp_dx
Compute the square function on each entry of temp_sx and temp_dx
Subtract them and divide every entry by 2*du
This procedure can be summarized with (f(un + du*e_i) - f(un - du*e_i))/(2*du).
My problem is to sum du to the diagonal of the matrices of temp_sx and temp_dx like I tried in (*). How can I achieve that?
EDIT: Now calling 1D blocks and threads; in fact, .y axis wasn't used at all in the kernel. I'm calling the kernel with a fixed amount of shared memory
Note that in int main() I'm calling the kernel with
#define REAL sizeof(float)
#define N 32
#define BLOCK_SIZE 16
#define NUM_BLOCKS ((N*N + BLOCK_SIZE - 1)/ BLOCK_SIZE)
...
// 1D grid of NUM_BLOCKS blocks, each with BLOCK_SIZE threads.
dim3 dimGrid(NUM_BLOCKS);
dim3 dimBlock(BLOCK_SIZE);
// Third launch parameter: bytes of dynamically allocated shared memory.
size_t shm_size = N*N*REAL;
jacobian_kernel <<< dimGrid, dimBlock, shm_size >>> (...);
So that I attempt to deal with block-splitting the function calls. In the kernel to sum on the diagonal I used if(threadIdx.x == blockIdx.x){...}. Why isn't this correct? I'm asking it because while debugging and making the code print the statement, It only evaluates true if they both are 0. Thus du[0] is the only numerical value and the matrix becomes nan. Note that this approach worked with the first code I built, where instead I called the kernel with
jacobian_kernel <<< N, N >>> (...)
So that when threadIdx.x == blockIdx.x the element is on the diagonal. This approach doesn't fit anymore though, since now I need to deal with larger N (possibly larger than 1024, which is the maximum number of threads per block).
What statement should I put there that works even if the matrices are split into blocks and threads?
Let me know if I should share some other info.
Here is how I managed to solve my problem, based on the suggestion in the comments on the answer. The example is compilable, provided you put helper_cuda.h and helper_string.h in the same directory or you add -I directive to the CUDA examples include path, installed along with the CUDA toolkit. The relevant changes are only in the kernel; there's a minor change in the main() though, since I was calling double the resources to execute the kernel, but the .y axis of the grid of thread blocks wasn't even used at all, so it didn't generate any error.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include "helper_string.h"
#include <fstream>
#ifndef MAX
// Larger of a and b. Each argument is parenthesised so that operators inside
// an argument cannot bind to the comparison. The selected argument is still
// evaluated twice, so do not pass expressions with side effects.
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
#endif
#define REAL sizeof(float)
#define N 128
#define BLOCK_SIZE 128
#define NUM_BLOCKS ((N*N + BLOCK_SIZE - 1)/ BLOCK_SIZE)
template <typename T>
inline void printmatrix( T mat, int rows, int cols);
template <typename T>
__global__ void jacobian_kernel ( const T * A, T * J, const T t0, const T tn, const T h, const T * u0, const T * un, const T * un_old);
template<typename T>
__device__ void d_func(const T t, const T u[], T res[], const T h = 1);
// Host driver: builds the input vectors, copies them to the device, launches
// jacobian_kernel on a 1D grid, copies the N*N result back and writes it out
// with printmatrix.
//
// `main` cannot be a function template, so the stray `template<typename T>`
// header has been removed. All host and device buffers are released before
// returning.
int main ()
{
    float t0 = 0.; //float tn = 0.;
    float h = 0.1;
    float* u0 = (float*)malloc(REAL*N); for(int i = 0; i < N; ++i){u0[i] = i+1;}
    float* un = (float*)malloc(REAL*N); memcpy(un, u0, REAL*N);
    float* un_old = (float*)malloc(REAL*N); memcpy(un_old, u0, REAL*N);
    float* J = (float*)malloc(REAL*N*N);
    float* A = (float*)malloc(REAL*N*N); host_heat_matrix(A);
    float *d_u0;
    float *d_un;
    float *d_un_old;
    float *d_J;
    float *d_A;
    checkCudaErrors(cudaMalloc((void**)&d_u0, REAL*N)); //printf("1: %p\n", d_u0);
    checkCudaErrors(cudaMalloc((void**)&d_un, REAL*N)); //printf("2: %p\n", d_un);
    checkCudaErrors(cudaMalloc((void**)&d_un_old, REAL*N)); //printf("3: %p\n", d_un_old);
    checkCudaErrors(cudaMalloc((void**)&d_J, REAL*N*N)); //printf("4: %p\n", d_J);
    checkCudaErrors(cudaMalloc((void**)&d_A, REAL*N*N)); //printf("4: %p\n", d_J);
    checkCudaErrors(cudaMemcpy(d_u0, u0, REAL*N, cudaMemcpyHostToDevice)); assert(d_u0 != NULL);
    checkCudaErrors(cudaMemcpy(d_un, un, REAL*N, cudaMemcpyHostToDevice)); assert(d_un != NULL);
    checkCudaErrors(cudaMemcpy(d_un_old, un_old, REAL*N, cudaMemcpyHostToDevice)); assert(d_un_old != NULL);
    checkCudaErrors(cudaMemcpy(d_J, J, REAL*N*N, cudaMemcpyHostToDevice)); assert(d_J != NULL);
    checkCudaErrors(cudaMemcpy(d_A, A, REAL*N*N, cudaMemcpyHostToDevice)); assert(d_A != NULL);
    dim3 dimGrid(NUM_BLOCKS); std::cout << "NUM_BLOCKS \t" << dimGrid.x << "\n";
    dim3 dimBlock(BLOCK_SIZE); std::cout << "BLOCK_SIZE \t" << dimBlock.x << "\n";
    size_t shm_size = N*REAL; //std::cout << shm_size << "\n";
    //HERE IS A RELEVANT CHANGE OF THE MAIN, SINCE I WAS CALLING
    //THE KERNEL WITH A 2D GRID BUT WITHOUT USING THE .y AXIS,
    //WHILE NOW THE GRID IS 1D
    jacobian_kernel <<< dimGrid, dimBlock, shm_size >>> (d_A, d_J, t0, t0, h, d_u0, d_un, d_un_old);
    // A kernel launch returns no status itself; pick up launch errors here.
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaMemcpy(J, d_J, REAL*N*N, cudaMemcpyDeviceToHost)); //printf("4: %p\n", d_J);
    printmatrix( J, N, N);
    // Release device buffers before resetting the device.
    checkCudaErrors(cudaFree(d_u0));
    checkCudaErrors(cudaFree(d_un));
    checkCudaErrors(cudaFree(d_un_old));
    checkCudaErrors(cudaFree(d_J));
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaDeviceReset());
    free(u0);
    free(un);
    free(un_old);
    free(J);
    free(A);
    return 0;
}
// Finite-difference Jacobian kernel (version from the answer).
//
// Each thread copies un[t] into its temp_sx/temp_dx entry; the thread with
// t == b perturbs that entry by +/- *temp_du; matvec_dev is applied to row b
// of both arrays; the central difference divided by 2 * *temp_du is stored in
// J[tid].
//
// Parameters: A is passed to matvec_dev; J receives the result; t0/tn select
// how the perturbation is computed (tn == t0 uses u0, otherwise un - un_old);
// h is not used by the active code; u0, un, un_old are input vectors indexed
// by threadIdx.x.
//
// NOTE(review): temp_sx/temp_dx have their __shared__ qualifier commented out,
// so they are automatic arrays of the calling thread, each
// BLOCK_SIZE x BLOCK_SIZE elements - confirm this is intended.
// NOTE(review): matvec_dev is not defined in the visible code.
template <typename T>
__global__ void jacobian_kernel (
const T * A,
T * J,
const T t0,
const T tn,
const T h,
const T * u0,
const T * un,
const T * un_old)
{
// cgamma is not referenced by any active statement below.
T cgamma = 2 - sqrtf(2);
const unsigned int t = threadIdx.x;
const unsigned int b = blockIdx.x;
// Global linear index of this thread.
const unsigned int tid = t + b * blockDim.x;
/*__shared__*/ T temp_sx[BLOCK_SIZE][BLOCK_SIZE];
/*__shared__*/ T temp_dx[BLOCK_SIZE][BLOCK_SIZE];
// A single perturbation value shared by the whole block.
__shared__ T sm_temp_du;
T* temp_du = &sm_temp_du;
//HERE IS A RELEVANT CHANGE (*)
if ( t < BLOCK_SIZE && b < NUM_BLOCKS )
{
temp_sx[b][t] = un[t]; //printf("temp_sx[%d] = %f\n", t,(temp_sx[b][t]));
temp_dx[b][t] = un[t];
//printf("t = %d, b = %d, t + b * blockDim.x = %d \n",t, b, tid);
//HERE IS A NOTE (**)
// "Diagonal" test: thread index equals block index. Only this thread writes
// *temp_du.
if ( t == b )
{
//printf("t = %d, b = %d \n",t, b);
if ( tn == t0 )
{
// First step: perturbation is 0.001 * u0[t].
*temp_du = u0[t]*0.001;
temp_sx[b][t] += *temp_du;
temp_dx[b][t] -= *temp_du;
// Nudge values that are (nearly) zero away from zero.
temp_sx[b][t] += ( abs( temp_sx[b][t] ) < 10e-6 ? 0.1 : 0 );
temp_dx[b][t] += ( abs( temp_dx[b][t] ) < 10e-6 ? 0.1 : 0 );
temp_sx[b][t] = ( temp_sx[b][t] == 0 ? 0.1 : temp_sx[b][t] );
temp_dx[b][t] = ( temp_dx[b][t] == 0 ? 0.1 : temp_dx[b][t] );
}
else
{
// Later steps: perturbation is the change since the previous iterate,
// but at least 10e-6.
*temp_du = MAX( un[t] - un_old[t], 10e-6 );
temp_sx[b][t] += *temp_du;
temp_dx[b][t] -= *temp_du;
}
;
}
//printf("du[%d] %f\n", tid, (*temp_du));
// Barrier so that *temp_du written above is visible before it is read below.
__syncthreads();
//printf("temp_sx[%d][%d] = %f\n", b, t, temp_sx[b][t]);
//printf("temp_dx[%d][%d] = %f\n", b, t, temp_dx[b][t]);
//d_func(tn, (temp_sx[b]), (temp_sx[b]), 1.f);
//d_func(tn, (temp_dx[b]), (temp_dx[b]), 1.f);
// Input and output are the same row of each array.
matvec_dev( tn, A, (temp_sx[b]), (temp_sx[b]), N, N, 1.f );
matvec_dev( tn, A, (temp_dx[b]), (temp_dx[b]), N, N, 1.f );
__syncthreads();
//printf("temp_sx_later[%d][%d] = %f\n", b, t, (temp_sx[b][t]));
//printf("temp_sx_later[%d][%d] - temp_dx_later[%d][%d] = %f\n", b,t,b,t, (temp_sx[b][t] - temp_dx[b][t]) / 2 * *temp_du);
//if (t == b ) printf( "2du[%d]^-1 = %f\n",t, powf((2 * *temp_du), -1));
// Central difference for this thread's entry.
J[tid] = (temp_sx[b][t] - temp_dx[b][t]) / (2 * *temp_du);
}
}
// Element-wise scaled square: each thread computes
// res[threadIdx.x] = h * u[threadIdx.x]^2. `t` is accepted but not used.
//
// The temporary is a plain local of type T. It used to be a `__shared__ float`
// scalar that every thread of the block wrote and then read back, which is a
// data race between threads and also forced the value through float
// regardless of T.
template<typename T>
__device__ void d_func(const T t, const T u[], T res[], const T h )
{
    const T temp_u = u[threadIdx.x];
    res[threadIdx.x] = h * temp_u * temp_u;
}
/// Writes a rows x cols matrix to "heat_matrix.txt", one matrix row per line.
///
/// @param mat  indexable storage; element (i, j) is read from mat[i + rows*j].
///             The stride is taken from `rows` rather than from the global N,
///             so the function also works for matrices whose size is not N.
/// @param rows number of rows.
/// @param cols number of columns.
///
/// Non-negative values are prefixed with a space so columns line up with
/// negative ones. If the file cannot be opened, an error is reported on
/// std::cerr and nothing is written.
template <typename T>
inline void printmatrix( T mat, int rows, int cols)
{
    std::ofstream matrix_out;
    matrix_out.open( "heat_matrix.txt", std::ofstream::out);
    if( !matrix_out )
    {
        std::cerr << "printmatrix: cannot open heat_matrix.txt\n";
        return;
    }
    for( int i = 0; i < rows; i++)
    {
        for( int j = 0; j <cols; j++)
        {
            double next = mat[i + rows*j];
            matrix_out << ( (next >= 0) ? " " : "") << next << " ";
        }
        matrix_out << "\n";
    }
}
The relevant change is on (*). Before I used if (tid < N) which has two downsides:
First, it is wrong, since it should be tid < N*N, as my data is 2D, while tid is a global index which tracks all the data.
Even if I wrote tid < N*N, since I'm splitting the function calls into blocks, the t < BLOCK_SIZE && b < NUM_BLOCKS seems clearer to me in how the indexing is arranged in the code.
Moreover, the statement t == b in (**) is actually the right one to operate on the diagonal elements of the matrix. The fact that it was evaluated true only on 0 was because of my error right above.
Thanks for the suggestions!
I'm trying to give a better error report (possible bug) for this case (about judySArray give incorrect result, but I don't know which key that give incorrect result).
The code here from this folder, note on this blog. Dependencies: judySArray.h and cedar.h
// judy.cpp
#include "deps/judySArray.h"
#include <string>
#include <iostream>
#include <cstdlib>
#include <cstring>
using namespace std;
typedef judySArray<double> MSD;
const int MAX_DATA = 12000000;
const char i2ch[] = {'0','1','2','3','4','5','6','7','8','9','a','B','c','D','e','F'};
// Returns the leading decimal digit of |d| (0 when |d| < 1).
// The loop condition is `>= 10`: with `> 10`, exact powers of ten such as 10
// or 100 stopped at 10 and returned 10 instead of 1.
int get_first_digit(double d) {
  if(d < 0) d = -d;
  while(d >= 10) d /= 10;
  return static_cast<int>(d);
}
// Encodes v in base 16 with the least significant digit first, using the
// digit characters from i2ch. Returns an empty string when v <= 0.
string to_rhex(int v) {
  string digits;
  for(; v > 0; v /= 16) {
    digits.push_back(i2ch[v % 16]);
  }
  return digits;
}
// Stores `set` under `key` when find() yields 0; otherwise stores the found
// value plus `inc` and increments `ctr`.
// Note: a find() result of 0 is treated as "key absent", so a key whose
// stored value is 0 cannot be told apart from a missing key here.
void add_or_inc(MSD &m, const string& key,double set, double inc, int& ctr) {
  const char* raw = key.c_str();
  const double current = m.find(raw);
  if(current != 0) {
    m.insert(raw, current + inc);
    ++ctr;
  } else {
    m.insert(raw, set);
  }
}
int main() {
MSD m(64);
int dup1 = 0, dup2 = 0, dup3 = 0;
for(int z=MAX_DATA;z>0;--z) {
int val2 = MAX_DATA-z;
int val3 = MAX_DATA*2-z;
string key1 = to_string(z);
string key2 = to_string(val2);
string key3 = to_rhex(val3);
add_or_inc(m,key1,z,val2,dup1);
add_or_inc(m,key2,val2,val3,dup2);
add_or_inc(m,key3,val3,z,dup3);
}
cout << dup1 << ' ' << dup2 << ' ' << dup3 << endl;
int total = 0, verify = 0, count = 0;
for(auto &it = m.begin();m.success(); m.next()) {
total += get_first_digit(it.value);
verify += strlen((const char *) it.key);
count += 1;
}
cout << total << ' ' << verify << ' ' << count << endl;
}
other implementation (map, unordered_map, hat-trie and cedar) give correct result:
6009354 6009348 611297
36186112 159701682 23370001
but judy didn't:
6009354 6009348 611297
36186112 159701681 23370000
The problem is, which key that have incorrect result?
I've tried to build a code that insert those keys on another data structure (that is cedar), but that incorrect keys still not detected:
// judy.cpp
#include "deps/judySArray.h"
#include <string>
#include <iostream>
#include <cstdlib>
#include <cstring>
#include <vector>
using namespace std;
typedef judySArray<double> MSD;
const int MAX_DATA = 12000000;
const char i2ch[] = {'0','1','2','3','4','5','6','7','8','9','a','B','c','D','e','F'};
// Returns the leading decimal digit of |d| (0 when |d| < 1).
// The loop condition is `>= 10`: with `> 10`, exact powers of ten such as 10
// or 100 stopped at 10 and returned 10 instead of 1.
int get_first_digit(double d) {
  if(d < 0) d = -d;
  while(d >= 10) d /= 10;
  return static_cast<int>(d);
}
// Encodes v in base 16 with the least significant digit first, using the
// digit characters from i2ch. Returns an empty string when v <= 0.
string to_rhex(int v) {
  string digits;
  for(; v > 0; v /= 16) {
    digits.push_back(i2ch[v % 16]);
  }
  return digits;
}
// Stores `set` under `key` when find() yields 0; otherwise stores the found
// value plus `inc` and increments `ctr`.
// Note: a find() result of 0 is treated as "key absent", so a key whose
// stored value is 0 cannot be told apart from a missing key here.
void add_or_inc(MSD &m, const string& key,double set, double inc, int& ctr) {
  const char* raw = key.c_str();
  const double current = m.find(raw);
  if(current != 0) {
    m.insert(raw, current + inc);
    ++ctr;
  } else {
    m.insert(raw, set);
  }
}
#include "deps/cedar.h"
// String -> double map built from a cedar trie: the trie maps each key to an
// index into `data`, and `data` holds the values.
class MSD2 {
public:
  vector<double> data;
  typedef cedar::da<int> CI;
  CI da;

  // Looks `key` up; when present, copies its value into `old`.
  // Returns whether the key was found (`old` is untouched otherwise).
  bool exists(const string& key,double &old) {
    int slot = -1;
    const bool found = da.exactMatchExists(key.c_str(),key.size(),&slot);
    if(found) {
      old = data[slot];
    }
    return found;
  }

  // Registers `key` in the trie with the index of the next free slot in
  // `data`, then appends `val` there.
  void insert(const string& key,double val) {
    da.update(key.c_str(),key.size(),data.size());
    data.push_back(val);
  }

  // Overwrites the value of an existing key, or inserts the key if absent.
  void update(const string& key,double val) {
    int slot = -1;
    const bool found = da.exactMatchExists(key.c_str(),key.size(),&slot);
    if(!found) {
      insert(key,val);
      return;
    }
    data[slot] = val;
  }
};
// MSD2 counterpart of add_or_inc: inserts `set` when `key` is absent,
// otherwise stores the previous value plus `inc` and increments `ctr`.
void add_or_inc(MSD2 &m, const string& key,double set, double inc, int& ctr) {
  double previous;
  if(m.exists(key,previous)) {
    m.update(key,previous+inc);
    ++ctr;
  } else {
    m.insert(key,set);
  }
}
// Runs the same insert/increment workload against the judy array (m) and the
// cedar-backed map (m2), prints the duplicate counters of both, then walks
// every key in cedar, looks it up in judy and reports value mismatches, and
// finally prints the digit-sum / key-length / entry-count totals of each.
int main() {
  MSD m(64);
  MSD2 m2;
  // dup*: update counters for judy; vup*: the same counters for cedar.
  int dup1 = 0, dup2 = 0, dup3 = 0;
  int vup1 = 0, vup2 = 0, vup3 = 0;
  for(int z=MAX_DATA;z>0;--z) {
    int val2 = MAX_DATA-z;
    int val3 = MAX_DATA*2-z;
    string key1 = to_string(z);
    string key2 = to_string(val2);
    string key3 = to_rhex(val3);
    add_or_inc(m,key1,z,val2,dup1);
    add_or_inc(m,key2,val2,val3,dup2);
    add_or_inc(m,key3,val3,z,dup3);
    add_or_inc(m2,key1,z,val2,vup1);
    add_or_inc(m2,key2,val2,val3,vup2);
    add_or_inc(m2,key3,val3,z,vup3);
  }
  cout << dup1 << ' ' << dup2 << ' ' << dup3 << endl;
  cout << vup1 << ' ' << vup2 << ' ' << vup3 << endl;
  // total/verify/count accumulate over judy; xotal/xerify/xount over cedar.
  int total = 0, verify = 0, count = 0;
  int xotal = 0, xerify = 0, xount = 0;
  // b.i receives the iteration result and b.x is used as the index into
  // m2.data; both members name the same int.
  union { int i; int x; } b;
  size_t from = 0, p = 0;
  char key[256] = {0};
  for (b.i = m2.da.begin(from, p); b.i != MSD2::CI::CEDAR_NO_PATH; b.i = m2.da.next(from, p)) {
    double it2 = m2.data[b.x]; // <-- find cedar's
    xotal += get_first_digit(it2);
    // Rebuild the current key's text into `key`.
    m2.da.suffix(key,p,from);
    xerify += strlen(key);
    xount += 1;
    double it = m.find(key); // <-- find judy's
    // NOTE(review): this compares values only. If judy reports 0 for a key it
    // does not hold and cedar holds that key with the value 0 (the first
    // iteration stores key "0" with value 0), both sides are 0 and the missing
    // key is not reported - confirm against judySArray's find() contract.
    if(it != it2) { // if value doesn't match, print:
      cout << "mismatch value for " << key << " : " << it2 << " vs " << it << endl;
    }
  }
  for(auto &it = m.begin();m.success(); m.next()) {
    total += get_first_digit(it.value);
    verify += strlen((const char *) it.key);
    count += 1;
  }
  cout << total << ' ' << verify << ' ' << count << endl;
  cout << xotal << ' ' << xerify << ' ' << xount << endl;
}
compile with: clang++ -std=c++11 judy-findbug.cpp (or g++ -std=c++11)
the output would be:
6009354 6009348 611297
6009354 6009348 611297
36186112 159701681 23370000 <-- judy's
36186112 159701682 23370001 <-- cedar's
cedar has one more value than judy (cedar's count is the correct one), but the missing key is not detected by the code above.
How to find that incorrect key(s)?
The bug on the code is someone (me) uncomment the assert(value != 0).
The bug was Karl's Judy implementation should not store null values (0 value).
Solution: use Doug Baskins' Judy implementation.