how to change openmp to tbb? - openmp

hellow,
I want change my code openmp to TBB
this is my code.
for (long y = 0; y < _src.rows; y++)
{
#pragma omp parallel
{
#pragma omp for firstprivate(i)
for (long x = 0; x < _src.cols; x++)
{
const uchar* data = _src.ptr((int)y, (int)x);
i = (y * _src.cols + x);
//update model+ background subtract
// cout << "parallel: " << i << endl;
uchar include = 0;
int result = _cvCheckPixelBackgroundNP(i, data, nchannels,
m_nN, m_aModel, m_fTb, m_nkNN, m_fTau, m_bShadowDetection, include);
_cvUpdatePixelBackgroundNP(i, data, nchannels,
m_nN, m_aModel,
m_nNextLongUpdate,
m_nNextMidUpdate,
m_nNextShortUpdate,
m_aModelIndexLong,
m_aModelIndexMid,
m_aModelIndexShort,
m_nLongCounter,
m_nMidCounter,
m_nShortCounter,
m_nLongUpdate,
m_nMidUpdate,
m_nShortUpdate,
include
);
switch (result)
{
case 0:
//foreground
*_dst.ptr((int)y, (int)x) = 255;
break;
case 1:
//background
*_dst.ptr((int)y, (int)x) = 0;
break;
case 2:
//shadow
*_dst.ptr((int)y, (int)x) = nShadowDetection;
break;
}
// i++;
}
}
}
};
I try to change TBB code
long i = 0;
tbb::parallel_for(tbb::blocked_range2d<int>(0,_src.rows,0,_src.cols)
, [&](tbb::blocked_range2d<int>r) {
for (int y = r.rows().begin(); y < r.rows().end();y++)
{
for (int x = r.cols().begin(); x != r.cols().end(); x++)
{
const uchar* data = _src.ptr((int)y, (int)x);
i = (y * _src.cols + x);
//update model+ background subtract
uchar include = 0;
int result = _cvCheckPixelBackgroundNP(i, data, nchannels,
m_nN, m_aModel, m_fTb, m_nkNN, m_fTau, m_bShadowDetection, include);
_cvUpdatePixelBackgroundNP(i, data, nchannels,
m_nN, m_aModel,
m_nNextLongUpdate,
m_nNextMidUpdate,
m_nNextShortUpdate,
m_aModelIndexLong,
m_aModelIndexMid,
m_aModelIndexShort,
m_nLongCounter,
m_nMidCounter,
m_nShortCounter,
m_nLongUpdate,
m_nMidUpdate,
m_nShortUpdate,
include
);
switch (result)
{
case 0:
//foreground
*_dst.ptr((int)y, (int)x) = 255;
break;
case 1:
//background
*_dst.ptr((int)y, (int)x) = 0;
break;
case 2:
//shadow
*_dst.ptr((int)y, (int)x) = nShadowDetection;
break;
}
// i++;
}
}
},tbb::auto_partitioner());
and I did change time. two result time is simillary but,
performance is different.
how can i fix it? sorry for my bad english.

Related

Updating sand simulation in grid doesn't work

I want to make a falling sand simulation using cellular automata, but when I update it, nothing happens, and when I want to do a line of diffrent material using lineDrawing() this material appear in random cells. This is update code:
void update()
{
for (int i = verticalNumberOfCells - 1; i > 0; i--)
{
for (int j = 0; j < horizontalNumberOfCells; j++)
{
world[j][i].update(false);
}
}
for (int y = verticalNumberOfCells - 1; y > 0; y--)
{
for (int x = 0; x < horizontalNumberOfCells; x++)
{
if (world[x][y].hasMoved) continue;
if (world[x][y].state == 0 && world[x][y].state == 1) continue;
if (canMove(world[x][y].state, x, y + 1))
{
move(x, y, x, y + 1);
}
}
}
}
The auxiliary functions that I use to check if the contents of a cell can change and to change the contents of a cell look like this:
boolean canMove(int state, int positionX, int positionY)
{
if (positionX < 0 || positionX >= horizontalNumberOfCells || positionY < 0 || positionY >= verticalNumberOfCells) return false;
int otherSubstance = world[positionX][positionY].state;
if (state == 5) return (otherSubstance == 4);
if (otherSubstance == 0) return true;
if (state == 2 && otherSubstance == 3 && random(1f) < 0.5f) return true;
return false;
}
void move(int fromX, int fromY, int toX, int toY)
{
Cells otherSubstance = world[toX][toY];
world[toX][toY] = world[fromX][fromY];
world[fromX][fromY] = otherSubstance;
world[fromX][fromY].hasMoved = true;
world[toX][toY].hasMoved = true;
world[fromX][fromY].velocityX = 0;
world[fromX][fromY].velocityY = 0;
if (toX > fromX)
{
world[toX][toY].velocityX = 1;
} else if (toX < fromX)
{
world[toX][toY].velocityX = -1;
} else
{
world[toX][toY].velocityX = 0;
}
if (toY > fromY)
{
world[toX][toY].velocityY = 1;
} else if (toY < fromY)
{
world[toX][toY].velocityY = -1;
} else
{
world[toX][toY].velocityY = 0;
}
}
I was able to fix this problem. The thing was, copying a cell in the move function didn't work. Here is the wrong version of the code:
Cells otherSubstance = world[toX][toY];
world[toX][toY] = world[fromX][fromY];
world[fromX][fromY] = otherSubstance;
and here is right version of the code:
int oldState = world[toX][toY].state;
world[toX][toY].state = world[fromX][fromY].state;
world[fromX][fromY].state = oldState;

How to refresh OnPaint() function; Window Update is not working

This is my code:
void CChildView::OnPaint()
{
CPaintDC dc(this);
CBitmap b; b.LoadBitmap(IDB_BITMAP1);
CDC memdc; memdc.CreateCompatibleDC(&dc);
auto prev = memdc.SelectObject(&b);
BITMAP bmp; b.GetBitmap(&bmp);
int bitmap_height = bmp.bmHeight;
int bitmap_width = bmp.bmWidth;
if (!painted) {
nrows = divide(bitmap_height, 8);
ncols = divide(bitmap_width, 8);
piece_height = bitmap_height / nrows;
piece_width = bitmap_width / ncols;
int number_of_tiles = nrows * ncols;
positions.resize(number_of_tiles);
std::iota(positions.begin(), positions.end(), 0);
empty = positions.size() - 1;//prazna pločica je zadnja
std::shuffle(positions.begin(), positions.end(), std::mt19937{ std::random_device{}() });
painted = true;
}
for (int i = 0; i < positions.size()-1; ++i) {
int row_dest = positions[i] / ncols;
int col_dest = positions[i] % ncols;
int row_src = i / ncols;
int col_src = i % ncols;
int x_src = col_src * piece_width;
int y_src = row_src * piece_height;
int x_dest = col_dest * piece_width;
int y_dest = row_dest * piece_height;
dc.BitBlt(x_dest, y_dest, piece_width, piece_height, &memdc, x_src, y_src, SRCCOPY);
dc.SelectObject(prev);
}
}
void CChildView::OnLButtonDown(UINT nFlags, CPoint point)
{
// TODO: Add your message handler code here and/or call def
CWnd::OnLButtonDown(nFlags, point);
//koordinate gdje je korisnik kliknuo
int row = point.y / piece_height;
int col = point.x / piece_width;
int empty_row = empty / ncols;
int empty_col = empty % ncols;
bool slide = false;
switch (abs(row - empty_row)) {
case 1:
if (abs(col==empty_col))
slide = true;
break;
case 0:
if (abs(col-empty_col==1))
slide = true;
break;
}
if (slide) {
int old_index = row * ncols + col;
positions[old_index] = positions[empty];
empty = old_index;
CWnd::InvalidateRect(NULL, FALSE);
CWnd::UpdateWindow();
}
}
I am trying to write a program that is called SlidingPuzzle. I managed to divide a bitmap into rectangles and shuffle them randomly. However, when I click on a rectangle there is no change. I suppose that UpdateWindow() is not working. It should refresh OnPaint() function. Can someone please help me, what am I doing wrong?

Is the cuda kernel limited by memory usage per thread/block

I have a kernel code that executes properly
runnable code
__global__ static void CalcSTLDistance_Kernel(Integer ComputeParticleNumber)
{
//const Integer TID = CudaGetTargetID();
const Integer ID =CudaGetTargetID();
/*if(ID >= ComputeParticleNumber)
{
return ;
}*/
CDistance NearestDistance;
Integer NearestID = -1;
NearestDistance.Magnitude = 1e8;
NearestDistance.Direction.x = 0;
NearestDistance.Direction.y = 0;
NearestDistance.Direction.z = 0;//make_Scalar3(0,0,0);
//if(c_daOutputParticleID[ID] < -1)
//{
// c_daSTLDistance[ID] = NearestDistance;
// c_daSTLID[ID] = NearestID;
// return;
//}
//Scalar3 TargetPosition = c_daParticlePosition[ID];
Integer TriangleID;
Integer CIDX, CIDY, CIDZ;
Integer CID = GetCellID(&CONSTANT_BOUNDINGBOX,&c_daParticlePosition[ID],CIDX, CIDY, CIDZ);
if(CID >=0 && CID < c_CellNum)
{
//Integer Range = 1;
for(Integer k = -1; k <= 1; ++k)
{
for(Integer j = -1; j <= 1; ++j)
{
for(Integer i = -1; i <= 1; ++i)
{
Integer MCID = GetCellID(&CONSTANT_BOUNDINGBOX,CIDX +i, CIDY + j,CIDZ + k);
if(MCID < 0 || MCID >= c_CellNum)
{
continue;
}
unsigned int TriangleNum = c_daCell[MCID].m_TriangleNum;
for(unsigned int l = 0; l < TriangleNum; ++l)
{
TriangleID = c_daCell[MCID].m_TriangleID[l];
/*if(c_daTrianglesParameters[c_daTriangles[TriangleID].ModelIDNumber].isDrag)
{
continue;
}*/
if( TriangleID >= 0 && TriangleID < c_TriangleNum && TriangleID != NearestID)// No need to calculate again for the same triangle
{
CDistance Distance ;
Distance.Magnitude = CalcDistance(&c_daTriangles[TriangleID], &c_daParticlePosition[ID], &Distance.Direction);
if(Distance.Magnitude < NearestDistance.Magnitude)
{
NearestDistance = Distance;
NearestID = TriangleID;
}
}
}
}
}
}
}
c_daSTLDistance[ID] = NearestDistance;
c_daSTLID[ID] = NearestID;
}
and when I add any basic variables or perform any checking operation, it gives unknown error and while checking wih cuda-memcheck, it suggests memory read error.
here in the changed code, i tried to check the previously calculated part and tried to skip the redundant calculation. for this I tried to perform basic check operation in array but it throws memory error.
error raising code
__global__ static void CalcSTLDistance_Kernel(Integer ComputeParticleNumber)
{
//const Integer TID = CudaGetTargetID();
const Integer ID =CudaGetTargetID();
/*if(ID >= ComputeParticleNumber)
{
return ;
}*/
CDistance NearestDistance;
Integer NearestID = -1;
NearestDistance.Magnitude = 1e8;
NearestDistance.Direction.x = 0;
NearestDistance.Direction.y = 0;
NearestDistance.Direction.z = 0;//make_Scalar3(0,0,0);
//if(c_daOutputParticleID[ID] < -1)
//{
// c_daSTLDistance[ID] = NearestDistance;
// c_daSTLID[ID] = NearestID;
// return;
//}
//Scalar3 TargetPosition = c_daParticlePosition[ID];
Integer TriangleID;
Integer CIDX, CIDY, CIDZ;
Integer CID = GetCellID(&CONSTANT_BOUNDINGBOX,&c_daParticlePosition[ID],CIDX, CIDY, CIDZ);
int len=0;
int td[100];
for(int m=0;m<100;m++)
{
td[m]=-1;
}
if(CID >=0 && CID < c_CellNum)
{
//Integer Range = 1;
for(Integer k = -1; k <= 1; ++k)
{
for(Integer j = -1; j <= 1; ++j)
{
for(Integer i = -1; i <= 1; ++i)
{
Integer MCID = GetCellID(&CONSTANT_BOUNDINGBOX,CIDX +i, CIDY + j,CIDZ + k);
if(MCID < 0 || MCID >= c_CellNum)
{
continue;
}
unsigned int TriangleNum = c_daCell[MCID].m_TriangleNum;
bool flag = false;
//len=len+TriangleNum ;
for(unsigned int l = 0; l < TriangleNum; ++l)
{
TriangleID = c_daCell[MCID].m_TriangleID[l];
//tem[l] = c_daCell[MCID].m_TriangleID[l];
for(int m=0;m<100;m++)
{
if(TriangleID ==td[m])
{
flag= true;
}
if(flag == true)
break;
}
if(flag == true)
continue;
else
{
td[len] = TriangleID;
len= len+1;
if( TriangleID >= 0 && TriangleID < c_TriangleNum && TriangleID != NearestID)// No need to calculate again for the same triangle
{
CDistance Distance ;
Distance.Magnitude = CalcDistance(&c_daTriangles[TriangleID], &c_daParticlePosition[ID], &Distance.Direction);
if(Distance.Magnitude < NearestDistance.Magnitude)
{
NearestDistance = Distance;
NearestID = TriangleID;
}
}
}
}
}
}
}
}
c_daSTLDistance[ID] = NearestDistance;
c_daSTLID[ID] = NearestID;
}
this problem arises whenever I tried to add any piece of code,thus I suspects that this block of kernel is not allowing me to add any further code due to memory over use.
is there any memory violation rule per block or thread??
how to find the total memory usuage per kernel ?? is there any way??

Given a position in matrix [i, j], find the block it belongs to

Well, I am dealing with sudoku solving algorithm and generation but stuck at rather simple task. I have made the check, whether a number is really fit in the position row-wise and column-wise. But what it is driving me mad is block check, ie, whether the number is really fit in the 3x3 block.
It must be simple enough but I can't really arrive at the solution. In short, I want to know the 3x3 block to which a position in matrix belongs. Here are some of the assert cases. The block no, row no and col no indexing starts from 0.
assert("x( 0, 8 ) === 2");
assert("x( 8, 8 ) === 8");
assert("x( 3, 3 ) === 4");
assert("x( 3, 7 ) === 5");
assert("x( 7, 1 ) === 6");
x( i , j ) returns the block number where i = row and j = col.
Isn't it just:
block = 3 * (i / 3) + (j / 3)
(assumes integer operations).
I would code a check, something like this (in pseudo C++)
// row = row to check
// col = column to check
// checkNum = number we are thinking of inserting
bool check(int row, int col, int checkNum)
{
int blockRow = 3 * (row/3);
int blockCol = 3 * (col/3);
for(int i = 0 ; i < 9 ; i++)
{
if(grid[row][i] == checkNum) return false; // number exists in the row.
if(grid[i][col] == checkNum) return false; // number exists in the col.
if(grid[blockRow + i/3][blockCol + i%3] == checkNum) return false; // number exists in the block.
}
return true;
}
Here is a sudoku solver in javascript. Taken from DSSudokuSolver, that I created.
The CleanElements function does something similar to what you are asking for.
CleanElements = function(comp_ary, Qsudoku){
for(i=0; i<9; i++){
for(j=0; j<9; j++){
/*if(Qsudoku[i][j] != ""){
comp_ary[i][j]=[];
}*/
for(k=0; k<9; k++){
i_index = comp_ary[i][k].indexOf(Qsudoku[i][j]);
if(i_index != -1){
comp_ary[i][k].splice(i_index, 1);
}
j_index = comp_ary[k][j].indexOf(Qsudoku[i][j]);
if(j_index != -1){
comp_ary[k][j].splice(j_index, 1);
}
}
if(i < 3){
i_min = 0;
i_max = 2;
}
else if(i < 6){
i_min = 3;
i_max = 5;
}
else{
i_min = 6;
i_max = 8;
}
if(j < 3){
j_min = 0;
j_max = 2;
}
else if(j < 6){
j_min = 3;
j_max = 5;
}
else{
j_min = 6;
j_max = 8;
}
for(i_box=i_min; i_box<=i_max; i_box++){
for(j_box=j_min; j_box<=j_max; j_box++){
index = comp_ary[i_box][j_box].indexOf(Qsudoku[i][j]);
if(index != -1){
comp_ary[i_box][j_box].splice(index, 1);
}
}
}
}
}
return comp_ary;
}
FindElements = function(comp_ary, Qsudoku){
for(i=0; i<9; i++){
for(j=0; j<9; j++){
if(comp_ary[i][j].length == 1){
if (Qsudoku[i][j] == ""){
Qsudoku[i][j] = comp_ary[i][j][0];
comp_ary[i][j] = [];
}
}
}
}
return Qsudoku;
}
IsThereNullElement = function(Qsudoku){
for(i=0; i<9; i++){
for(j=0; j<9; j++){
if(Qsudoku[i][j] == ""){
return false;
}
}
}
return true;
}
InitEmptyArray = function(){
empty_ary = Array();
for(i=0; i<9; i++){
empty_ary[i] = Array();
for(j=0; j<9; j++){
empty_ary[i][j] = Array();
for(k=0; k<9; k++){
empty_ary[i][j][k] = (k+1).toString();
}
}
}
return empty_ary;
}
DSSolve = function(Qsudoku){
comp_ary = InitEmptyArray(); //Complementary Array
window.comp_ary_old = comp_ary;
IterationMax = 5000;
while(true){
IterationMax -= 1;
comp_ary = CleanElements(comp_ary, Qsudoku);
console.log(comp_ary);
if(window.comp_ary_old == comp_ary){
//implement this.
}
else{
window.comp_ary_old = comp_ary;
}
Qsudoku = FindElements(comp_ary, Qsudoku);
//console.log(Qsudoku);
if(IsThereNullElement(Qsudoku)){
return Qsudoku;
}
if(IterationMax == 0){
return null;
}
}
}

Why do TrueType fonts use only "Quadratic Curves" on the Mac?

I extracted the path elements of a TrueType font using the code below.
The problem is that the curves described by element->type are only of the type kCGPathElementAddQuadCurveToPoint, which has 2 points (low quality). Why don't any of the elements use the higher-quality kCGPathElementAddCurveToPoint type (with 3 points)?
Does anybody have an idea?
int count = 0;
void applierFunction(void *info, const CGPathElement *element)
{
int point_count;
switch (element->type)
{
case kCGPathElementMoveToPoint:
point_count = 1;
break;
case kCGPathElementAddLineToPoint:
point_count = 1;
break;
case kCGPathElementAddQuadCurveToPoint:
point_count = 2;
break;
case kCGPathElementAddCurveToPoint:
point_count = 3;
break;
case kCGPathElementCloseSubpath:
point_count = 0;
break;
}
count+=point_count;
printf("%u.:\n", count);
for (int i = 0; i<point_count; i++) {
switch (element->type) {
case kCGPathElementMoveToPoint:
printf("kCGPathElementMoveToPoint\n");
break;
case kCGPathElementAddLineToPoint:
printf("kCGPathElementAddLineToPoint\n");
break;
case kCGPathElementAddQuadCurveToPoint:
printf("kCGPathElementAddQuadCurveToPoint\n");
break;
case kCGPathElementAddCurveToPoint:
printf("kCGPathElementAddCurveToPoint\n");
break;
case kCGPathElementCloseSubpath:
printf("kCGPathElementCloseSubpath\n");
break;
default:
printf("unknown path type\n");
}
}
}
void extractStringPathElements()
{
CFStringRef str = CFSTR("Q");
CFStringRef strFont = CFSTR("Arial");
CTFontRef font = CTFontCreateWithName(strFont, 500, NULL);
CFStringRef keys[] = {kCTFontAttributeName};
CFTypeRef values[] = {font};
CFDictionaryRef attr = CFDictionaryCreate(NULL, (const void **)&keys,
(const void **)&values, sizeof(keys) / sizeof(keys[0]),
&kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);
CFAttributedStringRef attrString =
CFAttributedStringCreate(NULL, str, attr);
CFRelease(attr);
CTLineRef line = CTLineCreateWithAttributedString(attrString);
CFArrayRef runs = CTLineGetGlyphRuns(line);
long runCount = CFArrayGetCount(runs);
for (int ri = 0; ri < runCount; ri++) {
::CTRunRef run = (::CTRunRef)CFArrayGetValueAtIndex(runs, ri);
long glyphCount = CTRunGetGlyphCount(run);
const unsigned short *glyphs = CTRunGetGlyphsPtr(run);
for (int gi = 0; gi < glyphCount; gi++) {
::CGPathRef path = CTFontCreatePathForGlyph(font, glyphs[gi], NULL);
CGPathApply(path, 0, applierFunction);
printf("glyph %u has %u elements.\n", glyphs[gi], count);
CGPathRelease(path);
}
}
CFRelease(line);
CFRelease(attrString);
CFRelease(font);
}
int main (int argc, const char * argv[])
{
extractStringPathElements();
return 0;
}
TrueType has no cubic bezier curves. It's limited to quadratic bezier curves. TrueType was specified that way.

Resources