Optimizing global memory load in CUDA

My task:
I have two matrices: A (18 x 4194304) and B (18 x 1024).
I have to take each 18-length vector from A, compute the distance to each 18-length vector from B, and find the minimum distance and its index.
My code:
__device__
void GetMin(float &dist, int &idx)
{
    // Warp-wide min-reduction: each shuffle step halves the number of
    // candidate lanes; after the last step lane 0 holds the minimum
    // distance and its index.
    for (int offset = 16; offset > 0; offset >>= 1)
    {
        float dist2 = __shfl_down_sync(0xFFFFFFFF, dist, offset, 32);
        int idx2 = __shfl_down_sync(0xFFFFFFFF, idx, offset, 32);
        if (dist > dist2)
        {
            dist = dist2;
            idx = idx2;
        }
    }
}
__global__
void CalcMinDist_kernel(const float *A, const float *B, float *output,
                        const int nNumPixels, int nNumImages)
{
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    int ty = threadIdx.y;
    int lane_id = tx % 32;
    float dist = 0;
    int idx = 0;
    float fMin = 99999999;
    int nMinIdx = -1;
    for (int i = lane_id; i < 1024; i += 32)
    {
        dist = 0;
        for (int j = 0; j < nNumImages; ++j)
        {
            int img_idx = blockIdx.x * ty + j * nNumPixels;
            dist += (A[img_idx] - B[i * nNumImages + j]) *
                    (A[img_idx] - B[i * nNumImages + j]);
        }
        idx = i;
        GetMin(dist, idx);
        if (threadIdx.x == 0)
        {
            if (fMin > dist)
            {
                fMin = dist;
                nMinIdx = idx;
            }
        }
    }
    if (threadIdx.x == 0)
    {
        output[blockIdx.x * ty] = nMinIdx;
    }
}
Looking at the profiler, I'm memory bound and have ~90% occupancy. Is there any way to speed up this operation?
Let me know if I need to provide any other information.

Actually, I would look at the algorithm first. This is a geometric problem - treat it as such.
You should represent the B data using a different data structure, e.g. by clustering or building a partition structure (e.g. a k-d tree). That will let you avoid actually computing the distance from most B elements. (You could also consider a projection onto fewer dimensions, but the benefit of this may be more elusive.)
With respect to the access pattern: you would probably benefit from having consecutive threads work on consecutive elements of the 18-element vectors, rather than having each thread process a complete 18-element vector on its own. That would better fit the memory layout - right now, a warp read touches many elements that sit 18 floats apart. If I understand the code correctly, anyway.
(I also think the GetMin() could avoid some of the index swaps, but that's not significant since you only perform very few of those.)
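To make that concrete, here is a minimal sketch of a coalesced variant - my own, not the asker's code - assuming A is stored as 18 rows of nNumPixels floats each (which the j * nNumPixels term in the original kernel suggests) and that each thread owns one pixel, so at every iteration j the 32 threads of a warp read 32 adjacent floats of A:
__global__ void CalcMinDist_coalesced(const float *A, const float *B,
                                      float *output,
                                      int nNumPixels, int nNumImages)
{
    int pixel = threadIdx.x + blockIdx.x * blockDim.x;
    if (pixel >= nNumPixels) return;

    float fMin = 3.4e38f; // ~FLT_MAX
    int nMinIdx = -1;
    for (int i = 0; i < 1024; ++i)
    {
        float dist = 0.f;
        for (int j = 0; j < nNumImages; ++j)
        {
            // Threads pixel, pixel+1, ... touch consecutive addresses of A.
            float diff = A[j * nNumPixels + pixel] - B[i * nNumImages + j];
            dist += diff * diff;
        }
        if (dist < fMin) { fMin = dist; nMinIdx = i; }
    }
    output[pixel] = nMinIdx;
}
Since every warp rereads the same B vectors, staging B (or tiles of it) through shared memory would be the natural next step.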

Related

MVC image blending algorithm implementation

I followed the algorithm described in the SIGGRAPH 2009 paper Coordinates for Instant Image Cloning.
The algorithm
This is my code:
#include<iostream>
#include<vector>
#include<map>
#include<fstream>
#include<queue>
#include<cmath> // needed for sqrt, acos, tan below
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include"../shared/stb_image.h"
#include"../shared/stb_image_write.h"
#define vector std::vector
#define queue std::queue
#define map std::map
#define cin std::cin
#define cout std::cout
#define endl std::endl
#define string std::string
#define PDD std::pair<double, double>
#define image Mat<unsigned char>
template<class T>
class Mat{
public:
    int row, col, channel;
    vector<T> data;
    Mat(){}
    Mat(int row, int col, int cha):row(row), col(col), channel(cha){
        data.resize(row * col * cha, 0);
    }
    Mat(const char *name){
        T *t = stbi_load(name, &col, &row, &channel, 0);
        data = vector<T>(t, t + col * row * channel);
        stbi_image_free(t);
    }
    T& at(int x, int y, int z){
        return data[(x * col + y) * channel + z];
    }
    void write(const char *name){
        stbi_write_bmp(name, col, row, channel, data.data());
    }
};
#define x first
#define y second
vector<PDD> P;
map<int, map<int, bool>> st; // register
vector<vector<double>> w;
int dx8[] = {1, 1, 1, 0, 0, -1, -1, -1}, dy8[] = {-1, 0, 1, -1, 1, -1, 0, 1};
int dx4[] = {0, 0, 1, -1}, dy4[] = {1, -1, 0, 0};
bool check(int i, int j, image &mask){
    if(mask.at(i, j, 0) == 0) return false;
    return mask.at(i, j - 1, 0) == 0||
           mask.at(i, j + 1, 0) == 0||
           mask.at(i - 1, j, 0) == 0||
           mask.at(i + 1, j, 0) == 0;
}
void dfs(int sx, int sy, int x, int y, int px, int py, image &mask){
    if(mask.at(x, y - 1, 0) == 0 && st[x][y - 1] == 0) P.push_back({x, y - 1}), st[x][y - 1] = 1;
    if(px != -1){
        if(mask.at(x + 1, y, 0) == 0 && st[x + 1][y] == 0) P.push_back({x + 1, y}), st[x + 1][y] = 1;
        if(mask.at(x, y + 1, 0) == 0 && st[x][y + 1] == 0) P.push_back({x, y + 1}), st[x][y + 1] = 1;
        if(mask.at(x - 1, y, 0) == 0 && st[x - 1][y] == 0) P.push_back({x - 1, y}), st[x - 1][y] = 1;
    }
    if(sx == x && sy == y && px != -1) return;
    for(int i = 0; i < 8; i ++){
        int a = x + dx8[i], b = y + dy8[i];
        if(a < 0 || b < 0 || a >= mask.row || b >= mask.col) continue;
        if(check(a, b, mask) && (a != px || b != py)) dfs(sx, sy, a, b, x, y, mask);
    }
}
double len(const PDD &a){
    return sqrt(a.x * a.x + a.y * a.y);
}
double dot(const PDD &a, const PDD &b){
    return a.x * b.x + a.y * b.y;
}
PDD minus(const PDD &a, const PDD &b){
    return {a.x - b.x, a.y - b.y};
}
PDD normalize(const PDD &a){
    return {a.x / len(a), a.y / len(a)};
}
double val(PDD &pre, PDD &cur, PDD &nxt, PDD &o){
    PDD V1 = normalize(minus(pre, o));
    PDD V2 = normalize(minus(nxt, o));
    PDD mid = normalize(minus(cur, o));
    double alpha1 = acos(dot(V1, mid));
    double alpha2 = acos(dot(V2, mid));
    return (tan(alpha1 / 2) + tan(alpha2 / 2)) / len(minus(cur, o)); // many NaN values occur here
}
int main(int argc, char *argv[]){
    image src("src.png");
    image mask("mask1.png");
    image tar("target.png");
    image res(tar.row, tar.col, tar.channel);
    for(int i = 0; i < mask.row; i ++){
        for(int j = 0; j < mask.col; j ++){
            if(mask.at(i, j, 0) == 255 && st[i][j] == 0){
                dfs(i, j, i, j, -1, -1, mask); // find counter-clockwise border
                queue<PDD> q;
                vector<PDD> X;
                q.push({i, j});
                st[i][j] = 1;
                while(q.size()){ // get all white (x, y)s in mask
                    auto h = q.front();
                    X.push_back(h);
                    q.pop();
                    vector<double> wx;
                    for(int k = 0; k < P.size(); k ++){ // calculate lambda value by search order
                        int pre = (k - 1 + P.size()) % P.size();
                        int cur = k;
                        int nxt = (k + 1) % P.size();
                        wx.push_back(
                            val(P[pre], P[cur], P[nxt], h)
                        );
                    }
                    w.push_back(wx);
                    for(int k = 0; k < 4; k ++){
                        int a = h.x + dx4[k], b = h.y + dy4[k];
                        if(st[a][b] == 1 || mask.at(a, b, 0) == 0) continue;
                        st[a][b] = 1;
                        q.push({a, b});
                    }
                }
                for(int c = 0; c < res.channel; c ++){ // every channel of res
                    for(int k = 0; k < X.size(); k ++){
                        double rx = 0, sum = 0;
                        for(int u = 0; u < w[k].size(); u ++){
                            double diff = tar.at(P[u].x, P[u].y, c) - src.at(P[u].x, P[u].y, c);
                            rx += w[k][u] * diff;
                            sum += w[k][u];
                        }
                        rx /= sum;
                        res.at(X[k].x, X[k].y, c) = rx + src.at(X[k].x, X[k].y, c);
                    }
                }
                res.write("./res.bmp");
                return 0;
            }
        }
    }
}
1. Get the border (counter-clockwise) of the white region in the mask.
2. Get the (x, y) coordinates of all pixels in the white area of the mask.
3. Calculate the lambda value of every (x, y) from step 2 - but I found that the lambda values of every (x, y) contain many NaNs (possibly caused by too-small values in the function val(...)).
The question is that I do not know how to deal with this condition in step 3, and the paper doesn't mention it.
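For what it's worth, a frequent source of NaN in a function like val(...) is acos() receiving a value just outside [-1, 1] due to floating-point rounding, plus the division by len(minus(cur, o)) when a boundary point (nearly) coincides with o. A minimal guard, reusing the helpers above (the 1e-12 threshold is an arbitrary choice of mine):
double safe_acos(double v){ // clamp before acos to avoid rounding-induced NaN
    if(v > 1.0) v = 1.0;
    if(v < -1.0) v = -1.0;
    return acos(v);
}
double val_guarded(PDD &pre, PDD &cur, PDD &nxt, PDD &o){
    double d = len(minus(cur, o));
    if(d < 1e-12) return 0; // o sits on the boundary: give this vertex no weight
    PDD V1 = normalize(minus(pre, o));
    PDD V2 = normalize(minus(nxt, o));
    PDD mid = normalize(minus(cur, o));
    double alpha1 = safe_acos(dot(V1, mid));
    double alpha2 = safe_acos(dot(V2, mid));
    return (tan(alpha1 / 2) + tan(alpha2 / 2)) / d;
}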

3d point closest to multiple lines in 3D space

I'm searching for a non-iterative, closed-form algorithm to find the least-squares solution for the point closest to a set of 3D lines. It is similar to 3D point triangulation (minimizing re-projections) but seems to be simpler and faster.
Lines can be described in any form: two points, a point and a unit direction, or similar.
Let the i-th line be given by point ai and unit direction vector di. We need to find the single point p that minimizes the sum of squared point-to-line distances. This is where the gradient is the zero vector:
grad_p Sum_i [ (p - ai).(p - ai) - ((p - ai).di)^2 ] = 0
Expanding the gradient,
Sum_i [ 2*(p - ai) - 2*((p - ai).di)*di ] = 0
Algebra yields a canonical 3x3 linear system,
M p = b
where the k'th row (a 3-element row vector) of matrix M is
Sum_i [ (di.ek) di^T - (di.di) ek^T ]
with vector ek the respective unit basis vector, and
b_k = Sum_i [ (di.ek)(di.ai) - (ai.ek)(di.di) ]
It's not hard to turn this into code. I borrowed (and fixed a small bug in) a Gaussian elimination function from Rosettacode to solve the system. Thanks to the author!
#include <stdio.h>
#include <math.h>
typedef double VEC[3];
typedef VEC MAT[3];
void solve(double *a, double *b, double *x, int n); // linear solver
double dot(VEC a, VEC b) { return a[0]*b[0] + a[1]*b[1] + a[2]*b[2]; }
void find_nearest_point(VEC p, VEC a[], VEC d[], int n) {
    MAT m = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
    VEC b = {0, 0, 0};
    for (int i = 0; i < n; ++i) {
        double d2 = dot(d[i], d[i]), da = dot(d[i], a[i]);
        for (int ii = 0; ii < 3; ++ii) {
            for (int jj = 0; jj < 3; ++jj) m[ii][jj] += d[i][ii] * d[i][jj];
            m[ii][ii] -= d2;
            b[ii] += d[i][ii] * da - a[i][ii] * d2;
        }
    }
    solve(&m[0][0], b, p, 3);
}
// Debug printing.
void pp(VEC v, char *l, char *r) {
    printf("%s%.3lf, %.3lf, %.3lf%s", l, v[0], v[1], v[2], r);
}
void pv(VEC v) { pp(v, "(", ")"); }
void pm(MAT m) { for (int i = 0; i < 3; ++i) pp(m[i], "\n[", "]"); }
// A simple verifier.
double dist2(VEC p, VEC a, VEC d) {
    VEC pa = { a[0]-p[0], a[1]-p[1], a[2]-p[2] };
    double dpa = dot(d, pa);
    return dot(d, d) * dot(pa, pa) - dpa * dpa;
}
double sum_dist2(VEC p, VEC a[], VEC d[], int n) {
    double sum = 0;
    for (int i = 0; i < n; ++i) sum += dist2(p, a[i], d[i]);
    return sum;
}
// Check 26 nearby points and verify the provided one is nearest.
int is_nearest(VEC p, VEC a[], VEC d[], int n) {
    double min_d2 = 1e100;
    int ii = 2, jj = 2, kk = 2;
#define D 0.01
    for (int i = -1; i <= 1; ++i)
        for (int j = -1; j <= 1; ++j)
            for (int k = -1; k <= 1; ++k) {
                VEC pp = { p[0] + D * i, p[1] + D * j, p[2] + D * k };
                double d2 = sum_dist2(pp, a, d, n);
                // Prefer provided point among equals.
                if (d2 < min_d2 || i == 0 && j == 0 && k == 0 && d2 == min_d2) {
                    min_d2 = d2;
                    ii = i; jj = j; kk = k;
                }
            }
    return ii == 0 && jj == 0 && kk == 0;
}
void normalize(VEC v) {
    double len = sqrt(dot(v, v));
    v[0] /= len;
    v[1] /= len;
    v[2] /= len;
}
int main(void) {
    VEC a[] = {{-14.2, 17, -1}, {1, 1, 1}, {2.3, 4.1, 9.8}, {1, 2, 3}};
    VEC d[] = {{1.3, 1.3, -10}, {12.1, -17.2, 1.1}, {19.2, 31.8, 3.5}, {4, 5, 6}};
    int n = 4;
    for (int i = 0; i < n; ++i) normalize(d[i]);
    VEC p;
    find_nearest_point(p, a, d, n);
    pv(p);
    printf("\n");
    if (!is_nearest(p, a, d, n)) printf("Woops. Not nearest.\n");
    return 0;
}
// A linear solver from rosettacode (with bug fix: added a missing fabs())
#define mat_elem(a, y, x, n) (a + ((y) * (n) + (x)))
void swap_row(double *a, double *b, int r1, int r2, int n)
{
    double tmp, *p1, *p2;
    int i;
    if (r1 == r2) return;
    for (i = 0; i < n; i++) {
        p1 = mat_elem(a, r1, i, n);
        p2 = mat_elem(a, r2, i, n);
        tmp = *p1, *p1 = *p2, *p2 = tmp;
    }
    tmp = b[r1], b[r1] = b[r2], b[r2] = tmp;
}
void solve(double *a, double *b, double *x, int n)
{
#define A(y, x) (*mat_elem(a, y, x, n))
    int i, j, col, row, max_row, dia;
    double max, tmp;
    for (dia = 0; dia < n; dia++) {
        max_row = dia, max = fabs(A(dia, dia));
        for (row = dia + 1; row < n; row++)
            if ((tmp = fabs(A(row, dia))) > max) max_row = row, max = tmp;
        swap_row(a, b, dia, max_row, n);
        for (row = dia + 1; row < n; row++) {
            tmp = A(row, dia) / A(dia, dia);
            for (col = dia+1; col < n; col++)
                A(row, col) -= tmp * A(dia, col);
            A(row, dia) = 0;
            b[row] -= tmp * b[dia];
        }
    }
    for (row = n - 1; row >= 0; row--) {
        tmp = b[row];
        for (j = n - 1; j > row; j--) tmp -= x[j] * A(row, j);
        x[row] = tmp / A(row, row);
    }
#undef A
}
This isn't extensively tested, but seems to be working fine.
Let the base point of the line be p and the unit direction vector be d.
Then the distance from point v to this line can be calculated using the cross product:
SquaredDist = ((v - p) x d)^2
Using Maple symbolic calculation, we can get:
d := <dx, dy, dz>;
v := <vx, vy, vz>;
p := <px, py, pz>;
w := v - p;
cp := CrossProduct(d, w);
nrm := BilinearForm(cp, cp, conjugate=false); //squared dist
nr := expand(nrm);
//now partial derivatives
nrx := diff(nr, vx);
//results:
nrx := -2*dz^2*px-2*dy^2*px+2*dz^2*vx+2*dy^2*vx
+2*dx*py*dy-2*dx*vy*dy+2*dz*dx*pz-2*dz*dx*vz
nry := -2*dx^2*py-2*dz^2*py-2*dy*vz*dz+2*dx^2*vy
+2*dz^2*vy+2*dy*pz*dz+2*dx*dy*px-2*dx*dy*vx
nrz := -2*dy^2*pz+2*dy^2*vz-2*dy*dz*vy+2*dx^2*vz
-2*dx^2*pz-2*dz*vx*dx+2*dy*dz*py+2*dz*px*dx
To minimize the sum of squared distances, we have to build a system of linear equations from the zero partial derivatives, like this:
vx*2*(Sum(dz^2)+Sum(dy^2)) + vy * (-2*Sum(dx*dy)) + vz *(-2*Sum(dz*dx)) =
2*Sum(dz^2*px)-2*Sum(dy^2*px) -2*Sum(dx*py*dy)-2*Sum(dz*dx*pz)
where
Sum(dz^2) = Sum{over all i in line indexes} {dz[i] * dz[i]}
and solve it for unknowns vx, vy, vz
Edit: Old erroneous answer for planes instead of lines, left for reference
If we use general equation of line
A * x + B * y + C * z + D = 0
then distance from point (x, y, z) to this line is
Dist = Abs(A * x + B * y + C * z + D) / Sqrt(A^2 + B^2 + C^2)
To simplify - just normalize all line equations dividing by Norm's
Norm = Sqrt(A^2 + B^2 + C^2)
a = A / Norm
b = B / Norm
c = C / Norm
d = D / Norm
now equation is
a * x + b * y + c * z + d = 0
and distance
Dist = Abs(a * x + b * y + c * z + d)
and we can use squared distances in least-squares fashion (ai, bi, ci, di are the coefficients for the i-th line):
F = Sum(ai*x + bi*y + ci*z + di)^2 =
Sum(ai^2*x^2 + bi^2*y^2 + ci^2*z^2 + di^2 +
2 * (ai*bi*x*y + ai*ci*x*z + bi*ci*y*z + ai*di*x + bi*di*y + ci*di*z))
partial derivatives
dF/dx = 2*Sum(ai^2*x + ai*bi*y + ai*ci*z + ai*di) = 0
dF/dy = 2*Sum(bi^2*y + ai*bi*x + bi*ci*z + bi*di) = 0
dF/dz = 2*Sum(ci^2*z + ai*ci*x + bi*ci*y + ci*di) = 0
so we have a system of linear equations
x * Sum(ai^2) + y * Sum(ai*bi) + z * Sum(ai*ci)= - Sum(ai*di)
y * Sum(bi^2) + x * Sum(ai*bi) + z * Sum(bi*ci)= - Sum(bi*di)
z * Sum(ci^2) + x * Sum(ai*ci) + y * Sum(bi*ci)= - Sum(ci*di)
x * Saa + y * Sab + z * Sac = - Sad
x * Sab + y * Sbb + z * Sbc = - Sbd
x * Sac + y * Sbc + z * Scc = - Scd
where S** are corresponding sums
and can solve it for unknowns x, y, z
I needed this for a sketch in Processing, so I ported Gene's answer. It works great, and I thought it might save someone else a little time. Unfortunately PVector/PMatrix don't have array accessors for vectors or matrices, so I had to add these as local functions.
float getv(PVector v, int i) {
    if (i == 0) return v.x;
    if (i == 1) return v.y;
    return v.z;
}
void setv(PVector v, int i, float value) {
    if (i == 0) v.x = value;
    else if (i == 1) v.y = value;
    else v.z = value;
}
void incv(PVector v, int i, float value) {
    setv(v, i, getv(v, i) + value);
}
float getm(float[] mm, int r, int c) { return mm[c + r*4]; }
void setm(float[] mm, int r, int c, float value) { mm[c + r*4] = value; }
void incm(float[] mm, int r, int c, float value) { mm[c + r*4] += value; }
PVector findNearestPoint(PVector a[], PVector d[]) {
    var mm = new float[16];
    var b = new PVector();
    var n = a.length;
    for (int i = 0; i < n; ++i) {
        var d2 = d[i].dot(d[i]);
        var da = d[i].dot(a[i]);
        for (int ii = 0; ii < 3; ++ii) {
            for (int jj = 0; jj < 3; ++jj) {
                incm(mm, ii, jj, getv(d[i], ii) * getv(d[i], jj));
            }
            incm(mm, ii, ii, -d2);
            incv(b, ii, getv(d[i], ii) * da - getv(a[i], ii) * d2);
        }
    }
    var p = solve(mm, new float[] {b.x, b.y, b.z});
    return new PVector(p[0], p[1], p[2]);
}
// Verifier
float dist2(PVector p, PVector a, PVector d) {
    PVector pa = new PVector(a.x-p.x, a.y-p.y, a.z-p.z);
    float dpa = d.dot(pa);
    return d.dot(d) * pa.dot(pa) - dpa * dpa;
}
//double sum_dist2(VEC p, VEC a[], VEC d[], int n) {
float sum_dist2(PVector p, PVector a[], PVector d[]) {
    int n = a.length;
    float sum = 0;
    for (int i = 0; i < n; ++i) {
        sum += dist2(p, a[i], d[i]);
    }
    return sum;
}
// Check 26 nearby points and verify the provided one is nearest.
boolean isNearest(PVector p, PVector a[], PVector d[]) {
    float min_d2 = 3.4028235E38;
    int ii = 2, jj = 2, kk = 2;
    final float D = 0.1f;
    for (int i = -1; i <= 1; ++i)
        for (int j = -1; j <= 1; ++j)
            for (int k = -1; k <= 1; ++k) {
                PVector pp = new PVector(p.x + D * i, p.y + D * j, p.z + D * k);
                float d2 = sum_dist2(pp, a, d);
                // Prefer provided point among equals.
                if (d2 < min_d2 || i == 0 && j == 0 && k == 0 && d2 == min_d2) {
                    min_d2 = d2;
                    ii = i; jj = j; kk = k;
                }
            }
    return ii == 0 && jj == 0 && kk == 0;
}
void setup() {
    PVector a[] = {
        new PVector(-14.2, 17, -1),
        new PVector(1, 1, 1),
        new PVector(2.3, 4.1, 9.8),
        new PVector(1, 2, 3)
    };
    PVector d[] = {
        new PVector(1.3, 1.3, -10),
        new PVector(12.1, -17.2, 1.1),
        new PVector(19.2, 31.8, 3.5),
        new PVector(4, 5, 6)
    };
    int n = 4;
    for (int i = 0; i < n; ++i)
        d[i].normalize();
    PVector p = findNearestPoint(a, d);
    println(p);
    if (!isNearest(p, a, d))
        println("Woops. Not nearest.\n");
}
// From rosettacode (with bug fix: added a missing fabs())
int mat_elem(int y, int x) { return y*4 + x; }
void swap_row(float[] a, float[] b, int r1, int r2, int n)
{
    float tmp;
    int p1, p2;
    int i;
    if (r1 == r2) return;
    for (i = 0; i < n; i++) {
        p1 = mat_elem(r1, i);
        p2 = mat_elem(r2, i);
        tmp = a[p1];
        a[p1] = a[p2];
        a[p2] = tmp;
    }
    tmp = b[r1];
    b[r1] = b[r2];
    b[r2] = tmp;
}
float[] solve(float[] a, float[] b)
{
    float[] x = new float[] {0, 0, 0};
    int n = x.length;
    int i, j, col, row, max_row, dia;
    float max, tmp;
    for (dia = 0; dia < n; dia++) {
        max_row = dia;
        max = abs(getm(a, dia, dia));
        for (row = dia + 1; row < n; row++) {
            if ((tmp = abs(getm(a, row, dia))) > max) {
                max_row = row;
                max = tmp;
            }
        }
        swap_row(a, b, dia, max_row, n);
        for (row = dia + 1; row < n; row++) {
            tmp = getm(a, row, dia) / getm(a, dia, dia);
            for (col = dia+1; col < n; col++) {
                incm(a, row, col, -tmp * getm(a, dia, col));
            }
            setm(a, row, dia, 0);
            b[row] -= tmp * b[dia];
        }
    }
    for (row = n - 1; row >= 0; row--) {
        tmp = b[row];
        for (j = n - 1; j > row; j--) {
            tmp -= x[j] * getm(a, row, j);
        }
        x[row] = tmp / getm(a, row, row);
    }
    return x;
}

Obtaining orientation map of fingerprint image using OpenCV

I'm trying to implement the method of improving fingerprint images by Anil Jain. As a starter, I'm strictly following the steps described in Section 2.4 of that paper, but I've encountered some difficulties while extracting the orientation image.
So, this is the input image:
And this is after normalization using exactly the same method as in that paper:
I'm expecting to see something like this (an example from the internet):
However, this is what I got when displaying the obtained orientation matrix:
Obviously this is wrong; it also gives non-zero values for points that are zero in the original input image.
This is the code I wrote:
cv::Mat orientation(cv::Mat inputImage)
{
    cv::Mat orientationMat = cv::Mat::zeros(inputImage.size(), CV_8UC1);

    // compute gradients at each pixel
    cv::Mat grad_x, grad_y;
    cv::Sobel(inputImage, grad_x, CV_16SC1, 1, 0, 3, 1, 0, cv::BORDER_DEFAULT);
    cv::Sobel(inputImage, grad_y, CV_16SC1, 0, 1, 3, 1, 0, cv::BORDER_DEFAULT);

    cv::Mat Vx, Vy, theta, lowPassX, lowPassY;
    cv::Mat lowPassX2, lowPassY2;
    Vx = cv::Mat::zeros(inputImage.size(), inputImage.type());
    Vx.copyTo(Vy);
    Vx.copyTo(theta);
    Vx.copyTo(lowPassX);
    Vx.copyTo(lowPassY);
    Vx.copyTo(lowPassX2);
    Vx.copyTo(lowPassY2);

    // estimate the local orientation of each block
    int blockSize = 16;
    for (int i = blockSize/2; i < inputImage.rows - blockSize/2; i += blockSize)
    {
        for (int j = blockSize/2; j < inputImage.cols - blockSize/2; j += blockSize)
        {
            float sum1 = 0.0;
            float sum2 = 0.0;
            for (int u = i - blockSize/2; u < i + blockSize/2; u++)
            {
                for (int v = j - blockSize/2; v < j + blockSize/2; v++)
                {
                    sum1 += grad_x.at<float>(u,v) * grad_y.at<float>(u,v);
                    sum2 += (grad_x.at<float>(u,v)*grad_x.at<float>(u,v)) * (grad_y.at<float>(u,v)*grad_y.at<float>(u,v));
                }
            }
            Vx.at<float>(i,j) = sum1;
            Vy.at<float>(i,j) = sum2;
            double calc = 0.0;
            if (sum1 != 0 && sum2 != 0)
            {
                calc = 0.5 * atan(Vy.at<float>(i,j) / Vx.at<float>(i,j));
            }
            theta.at<float>(i,j) = calc;

            // Perform low-pass filtering
            float angle = 2 * calc;
            lowPassX.at<float>(i,j) = cos(angle * pi / 180);
            lowPassY.at<float>(i,j) = sin(angle * pi / 180);
            float sum3 = 0.0;
            float sum4 = 0.0;
            for (int u = -lowPassSize / 2; u < lowPassSize / 2; u++)
            {
                for (int v = -lowPassSize / 2; v < lowPassSize / 2; v++)
                {
                    sum3 += inputImage.at<float>(u,v) * lowPassX.at<float>(i - u*lowPassSize, j - v * lowPassSize);
                    sum4 += inputImage.at<float>(u,v) * lowPassY.at<float>(i - u*lowPassSize, j - v * lowPassSize);
                }
            }
            lowPassX2.at<float>(i,j) = sum3;
            lowPassY2.at<float>(i,j) = sum4;
            float calc2 = 0.0;
            if (sum3 != 0 && sum4 != 0)
            {
                calc2 = 0.5 * atan(lowPassY2.at<float>(i,j) / lowPassX2.at<float>(i,j)) * 180 / pi;
            }
            orientationMat.at<float>(i,j) = calc2;
        }
    }
    return orientationMat;
}
I've already searched a lot on the web, but almost everything I found is in Matlab, and the very few examples using OpenCV didn't help me either. I sincerely hope someone could go through my code and point out any errors. Thank you in advance.
Update
Here are the steps that I followed according to the paper:
Obtain normalized image G.
Divide G into blocks of size wxw (16x16).
Compute the x and y gradients at each pixel (i,j).
Estimate the local orientation of each block centered at pixel (i,j) using equations:
Perform low-pass filtering to remove noise. For that, convert the orientation image into a continuous vector field defined as:
where W is a two-dimensional low-pass filter, and w(phi) x w(phi) is its size, which equals 5.
Finally, compute the local ridge orientation at (i,j) using:
Update2
This is the output of orientationMat after changing the mat type to CV_16SC1 in Sobel operation as Micka suggested:
Maybe it's too late for me to answer, but somebody could read this later and solve the same problem.
I've been working for a while on the same algorithm, the same method you posted... but there are some writing errors from when the paper was drafted (I guess). After fighting a lot with the equations, I found these errors by looking at other similar works.
Here is what worked for me...
Vy(i,j) = 2*dx(u,v)*dy(u,v)
Vx(i,j) = dx(u,v)^2 - dy(u,v)^2
O(i,j) = 0.5*arctan(Vy(i,j)/Vx(i,j))
(Excuse me, I wasn't able to post images, so I wrote the modified equations. Remember that "u" and "v" are the positions of the summation across the blockSize-by-blockSize window.)
The first and most important thing (obviously) is the equations. I saw that these expressions were really different across works, even though every one talked about the same algorithm of Hong et al.
The key is finding the least mean square (first 3 equations) of the gradients (Vx and Vy); I provided the corrected formulas above for this step. Then you can compute the angle theta for the non-overlapping window (a 16x16 size is recommended in the paper). After that, the algorithm says you must calculate the magnitude of the doubled angle in the "x" and "y" directions (Phi_x and Phi_y).
Phi_x(i,j) = V(i,j) * cos(2*O(i,j))
Phi_y(i,j) = V(i,j) * sin(2*O(i,j))
The magnitude is just:
V = sqrt(Vx(i,j)^2 + Vy(i,j)^2)
Note that the related work doesn't mention that you have to use the gradient magnitude, but it makes sense (to me) to do it. After all these corrections you can apply the low-pass filter to Phi_x and Phi_y; I used a simple 5x5 mask to average these magnitudes (something like medianBlur() of OpenCV).
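For reference, a plain 5x5 average like that can be done directly with OpenCV's box filter; a sketch, where Phi_x and Phi_y stand for the doubled-angle component images from the equations above:
cv::blur(Phi_x, Phi_x, cv::Size(5, 5)); // 5x5 box low-pass, i.e. a simple average
cv::blur(Phi_y, Phi_y, cv::Size(5, 5));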
The last thing is to calculate the new angle, that is, the average of the 25 neighbors in the O(i,j) image; for this you just have to:
O'(i,j) = 0.5*arctan(Phi_y/Phi_x)
We're nearly there... All this was just for calculating the angle of the NORMAL VECTOR TO THE RIDGE DIRECTIONS (O'(i,j)) in the blockSize-by-blockSize non-overlapping window. What does that mean? It means that the angle we just calculated is perpendicular to the ridges; in simple words, we just calculated the angle of the ridges plus 90 degrees. To get the angle we need, we just have to subtract 90° from the obtained angle.
To draw the lines we need an initial point (X0, Y0) and a final point (X1, Y1). For that, imagine a circle centered on (X0, Y0) with a radius of "r":
x0 = i + blockSize/2
y0 = j + blockSize/2
r = blockSize/2
Note that we add i and j to the first coordinates because the window is moving; the line must start from the center of the current non-overlapping window, not from the window's local coordinates.
Then, to calculate the end coordinates for drawing a line, we can just use a right triangle, so...
X1 = r*cos(O'(i,j)-90°)+X0
Y1 = r*sin(O'(i,j)-90°)+Y0
X2 = X0-r*cos(O'(i,j)-90°)
Y2 = Y0-r*sin(O'(i,j)-90°)
Then just use the OpenCV line function, where the initial point is (X0, Y0) and the final point is (X1, Y1). In addition, I drew the 16x16 windows and computed the opposite points of X1 and Y1 (X2 and Y2) to draw a line across the entire window.
Hope this helps somebody.
My results...
Main function:
Mat mat = imread("nwmPa.png",0);
mat.convertTo(mat, CV_32F, 1.0/255, 0);
Normalize(mat);
int blockSize = 6;
int height = mat.rows;
int width = mat.cols;
Mat orientationMap;
orientation(mat, orientationMap, blockSize);
Normalize:
void Normalize(Mat & image)
{
    Scalar mean, dev;
    meanStdDev(image, mean, dev);
    double M = mean.val[0];
    double D = dev.val[0];
    for (int i(0); i < image.rows; i++)
    {
        for (int j(0); j < image.cols; j++)
        {
            if (image.at<float>(i,j) > M)
                image.at<float>(i,j) = 100.0/255 + sqrt( 100.0/255*pow(image.at<float>(i,j)-M,2)/D );
            else
                image.at<float>(i,j) = 100.0/255 - sqrt( 100.0/255*pow(image.at<float>(i,j)-M,2)/D );
        }
    }
}
Orientation map:
void orientation(const Mat &inputImage, Mat &orientationMap, int blockSize)
{
    Mat fprintWithDirectionsSmoo = inputImage.clone();
    Mat tmp(inputImage.size(), inputImage.type());
    Mat coherence(inputImage.size(), inputImage.type());
    orientationMap = tmp.clone();

    // Gradients x and y
    Mat grad_x, grad_y;
    // Sobel(inputImage, grad_x, CV_32F, 1, 0, 3, 1, 0, BORDER_DEFAULT);
    // Sobel(inputImage, grad_y, CV_32F, 0, 1, 3, 1, 0, BORDER_DEFAULT);
    Scharr(inputImage, grad_x, CV_32F, 1, 0, 1, 0);
    Scharr(inputImage, grad_y, CV_32F, 0, 1, 1, 0);

    // Vector field
    Mat Fx(inputImage.size(), inputImage.type()),
        Fy(inputImage.size(), inputImage.type()),
        Fx_gauss,
        Fy_gauss;
    Mat smoothed(inputImage.size(), inputImage.type());

    // Local orientation for each block
    int width = inputImage.cols;
    int height = inputImage.rows;
    int blockH;
    int blockW;

    // select block
    for (int i = 0; i < height; i += blockSize)
    {
        for (int j = 0; j < width; j += blockSize)
        {
            float Gsx = 0.0;
            float Gsy = 0.0;
            float Gxx = 0.0;
            float Gyy = 0.0;

            // check bounds of img
            blockH = ((height-i) < blockSize) ? (height-i) : blockSize;
            blockW = ((width-j) < blockSize) ? (width-j) : blockSize;

            // average over the W x W block
            for (int u = i; u < i + blockH; u++)
            {
                for (int v = j; v < j + blockW; v++)
                {
                    Gsx += (grad_x.at<float>(u,v)*grad_x.at<float>(u,v)) - (grad_y.at<float>(u,v)*grad_y.at<float>(u,v));
                    Gsy += 2*grad_x.at<float>(u,v) * grad_y.at<float>(u,v);
                    Gxx += grad_x.at<float>(u,v)*grad_x.at<float>(u,v);
                    Gyy += grad_y.at<float>(u,v)*grad_y.at<float>(u,v);
                }
            }
            float coh = sqrt(pow(Gsx,2) + pow(Gsy,2)) / (Gxx + Gyy);
            // smoothed
            float fi = 0.5*fastAtan2(Gsy, Gsx)*CV_PI/180;
            Fx.at<float>(i,j) = cos(2*fi);
            Fy.at<float>(i,j) = sin(2*fi);

            // fill blocks
            for (int u = i; u < i + blockH; u++)
            {
                for (int v = j; v < j + blockW; v++)
                {
                    orientationMap.at<float>(u,v) = fi;
                    Fx.at<float>(u,v) = Fx.at<float>(i,j);
                    Fy.at<float>(u,v) = Fy.at<float>(i,j);
                    coherence.at<float>(u,v) = (coh < 0.85) ? 1 : 0;
                }
            }
        }
    } /// for
    GaussConvolveWithStep(Fx, Fx_gauss, 5, blockSize);
    GaussConvolveWithStep(Fy, Fy_gauss, 5, blockSize);
    for (int m = 0; m < height; m++)
    {
        for (int n = 0; n < width; n++)
        {
            smoothed.at<float>(m,n) = 0.5*fastAtan2(Fy_gauss.at<float>(m,n), Fx_gauss.at<float>(m,n))*CV_PI/180;
            if ((m % blockSize) == 0 && (n % blockSize) == 0) {
                int x = n;
                int y = m;
                int ln = sqrt(2*pow(blockSize,2))/2;
                float dx = ln*cos( smoothed.at<float>(m,n) - CV_PI/2);
                float dy = ln*sin( smoothed.at<float>(m,n) - CV_PI/2);
                arrowedLine(fprintWithDirectionsSmoo, Point(x, y+blockH), Point(x + dx, y + blockW + dy), Scalar::all(255), 1, CV_AA, 0, 0.06*blockSize);
                // qDebug() << Fx_gauss.at<float>(m,n) << Fy_gauss.at<float>(m,n) << smoothed.at<float>(m,n);
                // imshow("Orientation", fprintWithDirectionsSmoo);
                // waitKey(0);
            }
        }
    } /// for2
    normalize(orientationMap, orientationMap, 0, 1, NORM_MINMAX);
    imshow("Orientation field", orientationMap);
    orientationMap = smoothed.clone();
    normalize(smoothed, smoothed, 0, 1, NORM_MINMAX);
    imshow("Smoothed orientation field", smoothed);
    imshow("Coherence", coherence);
    imshow("Orientation", fprintWithDirectionsSmoo);
}
It seems nothing was forgotten )
I have read your code thoroughly and found that you have made a mistake while calculating sum3 and sum4:
sum3 += inputImage.at<float>(u,v) * lowPassX.at<float>(i - u*lowPassSize, j - v * lowPassSize);
sum4 += inputImage.at<float>(u, v) * lowPassY.at<float>(i - u*lowPassSize, j - v * lowPassSize);
instead of inputImage you should use the low-pass filter W.
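In other words, step 5 of the paper convolves the vector field itself with the low-pass kernel W. A hedged sketch of what the corrected inner loops might look like, using a uniform box kernel for W (my choice; bounds checks omitted, lowPassSize assumed odd):
float sum3 = 0.0f;
float sum4 = 0.0f;
float W = 1.0f / (lowPassSize * lowPassSize); // box-filter weight
for (int u = -lowPassSize / 2; u <= lowPassSize / 2; u++)
{
    for (int v = -lowPassSize / 2; v <= lowPassSize / 2; v++)
    {
        sum3 += W * lowPassX.at<float>(i - u, j - v);
        sum4 += W * lowPassY.at<float>(i - u, j - v);
    }
}
lowPassX2.at<float>(i, j) = sum3;
lowPassY2.at<float>(i, j) = sum4;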

Parallelizing the Gaussian Blur Algorithm with OpenMP

I was trying to parallelize the Gaussian blur function using OpenMP, but I am new to OpenMP, and when I tried to parallelize the two for loops (I don't think there are any variables that need to be private to each thread), it ended up running even slower than before, and the output was different. So did I do anything wrong? What should I do to make it run faster?
void gaussian_blur(float **src, float **dst, int w, int h, float sigma)
{
    int x, y, i;
    int ksize = (int)(sigma * 2.f * 4.f + 1) | 1;
    int halfk = ksize / 2;
    float scale = -0.5f/(sigma*sigma);
    float sum = 0.f;
    float *kernel, *ringbuf;
    int xmax = w - halfk;
    int ymax = h - halfk;

    // if sigma too small, just copy src to dst
    if (ksize <= 1)
    {
        for (y = 0; y < h; y++)
            for (x = 0; x < w; x++)
                dst[y][x] = src[y][x];
        return;
    }

    // create Gaussian kernel
    kernel = malloc(ksize * sizeof(float));
    ringbuf = malloc(ksize * sizeof(float));

    #pragma omp parallel for reduction(+ : sum)
    for (i = 0; i < ksize; i++)
    {
        float x = (float)(i - halfk);
        float t = expf(scale * x * x);
        kernel[i] = t;
        sum += t;
    }

    scale = 1.f / sum;
    #pragma omp parallel for
    for (i = 0; i < ksize; i++)
        kernel[i] *= scale;

    // blur each row
    #pragma omp parallel for // this is the for loop I parallelized but ended up with wrong output and running slower
    for (y = 0; y < h; y++)
    {
        int x1;
        int bufi0 = ksize-1;
        float tmp = src[y][0];
        for (x1 = 0; x1 < halfk ; x1++) ringbuf[x1] = tmp;
        for (; x1 < ksize-1; x1++) ringbuf[x1] = src[y][x1-halfk];
        for (x1 = 0; x1 < w; x1++)
        {
            if (x1 < xmax)
                ringbuf[bufi0++] = src[y][x1+halfk];
            else
                ringbuf[bufi0++] = src[y][w-1];
            if (bufi0 == ksize) bufi0 = 0;
            dst[y][x1] = convolve(kernel, ringbuf, ksize, bufi0);
        }
    }

    // blur each column
    #pragma omp parallel for // this is the for loop I parallelized but ended up with wrong output and running slower
    for (x = 0; x < w; x++)
    {
        int y1;
        int bufi0 = ksize-1;
        float tmp = dst[0][x];
        for (y1 = 0; y1 < halfk ; y1++) ringbuf[y1] = tmp;
        for ( ; y1 < ksize-1; y1++) ringbuf[y1] = dst[y1-halfk][x];
        for (y1 = 0; y1 < h; y1++)
        {
            if (y1 < ymax)
                ringbuf[bufi0++] = dst[y1+halfk][x];
            else
                ringbuf[bufi0++] = dst[h-1][x];
            if (bufi0 == ksize) bufi0 = 0;
            dst[y1][x] = convolve(kernel, ringbuf, ksize, bufi0);
        }
    }

    // clean up
    free(kernel);
    free(ringbuf);
}
Besides the need to properly identify private and shared data, there are several things that you could do in order to speed up your program.
As a first step you should remove any unnecessary concurrency. For example, how big does ksize happen to be on average? If it is less than several hundred elements, it makes absolutely no sense to employ OpenMP for such simple operations as computing the kernel and then normalising it:
#pragma omp parallel for reduction(+ : sum)
for (i = 0; i < ksize; i++)
{
    float x = (float)(i - halfk);
    float t = expf(scale * x * x);
    kernel[i] = t;
    sum += t;
}
scale = 1.f / sum;
#pragma omp parallel for
for (i = 0; i < ksize; i++)
    kernel[i] *= scale;
On a typical modern CPU it would take more cycles to bootstrap the parallel regions than to compute this on a single core. Also on modern CPUs these loops can be unrolled and vectorised and you can get up to 8x boost on a single core. If the kernel is too small, then besides OpenMP overhead you will also get slowdown from excessive false sharing. You have to make sure that each thread gets an exact multiple of 16 elements (64 bytes of cache line size / sizeof(float)) to work on in order to prevent false sharing.
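One simple way to hand out iterations in multiples of 16 is an explicit chunk size in the schedule clause; a sketch of the kernel loop (assuming, again, that ksize is large enough to be worth parallelising at all):
// Chunks of 16 iterations are dealt out round-robin, so each thread's
// writes to kernel[] stay within distinct 64-byte cache lines.
#pragma omp parallel for schedule(static, 16) reduction(+ : sum)
for (i = 0; i < ksize; i++)
{
    float x = (float)(i - halfk);
    float t = expf(scale * x * x);
    kernel[i] = t;
    sum += t;
}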
You also have to make sure that threads do not share cache lines in the column blur section.
// blur each column
#pragma omp parallel for
for (x = 0; x < w; x++)
{
    ...
    for (y1 = 0; y1 < h; y1++)
    {
        ...
        dst[y1][x] = convolve(kernel, ringbuf, ksize, bufi0);
    }
}
Because of the access pattern here, you have to make sure that each thread gets a chunk of columns that is a multiple of 16 or else there will be a border overlap area of 16*y1 pixels shared by every two consecutive threads where excessive false sharing will occur. If you cannot guarantee that w is divisible by 16, then you can give each thread a starting offset in the y direction, e.g. the innermost loop becomes:
int tid = omp_get_thread_num();
for (y1 = 2*tid; y1 < h; y1++)
{
    ...
}
for (y1 = 0; y1 < 2*tid; y1++)
{
    ...
}
The multiplier 2 is arbitrary. The idea is to give the next thread several rows of advance in comparison to the current one so that both threads will not be processing the same line at once at any moment in time. You could also use addition and modulo arithmetic to compute y1, i.e.
for (y2 = 0; y2 < h; y2++)
{
    y1 = (y2 + 2*tid) % h;
    ...
}
but this is generally slower than just separating the loop in two parts.
Also mind your data size. The last level cache (LLC) has very high but still limited bandwidth. If data cannot fit in the private cache of each core then compiler optimisations such as loop vectorisations can put very high pressure on the LLC. Things get more ugly if data doesn't fit in the LLC and therefore the main memory has to be accessed.
If you don't know what false sharing is, there is an article in Dr.Dobb's that kind of explains it here.
I may have fixed your code. You did not post your convolve function, so it's difficult to say for sure, but I'm not sure it matters. There are at least two bugs. There is a race condition on the ringbuf array. To fix this I extend the array by the number of threads.
ringbuf = (float*)malloc(nthreads*ksize * sizeof(float));
To access the array do something like this
int ithread = omp_get_thread_num();
ringbuf[ksize*ithread + x1]
Edit: I added some code which defines ringbuf inside the parallel block. That way you don't have to access ringbuf based on the thread number.
The second error is the bufi0 variable. I defined a new one like this:
const int ibufi0_fix = (x1+ksize-1)%ksize;
Below is the code I used to check it. Replace with your convolve function. Note, this may still be quite inefficient. There are probably cache issues such as cache misses and false sharing (particularly when you convolve vertically). Hopefully, though, the image will be correct now.
Edit: here is a paper by Intel that shows how to do this best with AVX. It's optimized to minimize the cache misses. I'm not sure it's optimized for threading though.
http://software.intel.com/en-us/articles/iir-gaussian-blur-filter-implementation-using-intel-advanced-vector-extensions
I'm writing my own function on this (it's actually the reason I started learning OpenMP) which uses SSE/AVX as well. There are a lot of similarities with matrix multiplication and image filtering so I learned how to optimized matrix multiplication first and will do Gaussian Blur shortly...
#include "math.h"
#include "omp.h"
#include "stdio.h"
#include <nmmintrin.h>
float convolve(const float *kernel, const float *ringbuf, const int ksize, const int bufi0) {
    float sum = 0.0f;
    for (int i = 0; i < ksize; i++) {
        sum += kernel[i]*ringbuf[i];
    }
    return sum;
}
void gaussian_blur(float *src, float *dst, int w, int h, float sigma, int nthreads)
{
    int x, y, i;
    int ksize = (int)(sigma * 2.f * 4.f + 1) | 1;
    int halfk = ksize / 2;
    printf("ksize %d\n", ksize);
    float scale = -0.5f/(sigma*sigma);
    float sum = 0.f;
    float *kernel, *ringbuf;
    int xmax = w - halfk;
    int ymax = h - halfk;

    // if sigma too small, just copy src to dst
    if (ksize <= 1)
    {
        for (y = 0; y < h; y++)
            for (x = 0; x < w; x++)
                dst[y*w + x] = src[y*w + x];
        return;
    }

    // create Gaussian kernel
    //kernel = malloc(ksize * sizeof(float));
    kernel = (float*)_mm_malloc(ksize * sizeof(float), 16);
    //ringbuf = malloc(ksize * sizeof(float));
    ringbuf = (float*)_mm_malloc(nthreads*ksize * sizeof(float), 16);

    #pragma omp parallel for reduction(+ : sum) if(nthreads>1)
    for (i = 0; i < ksize; i++)
    {
        float x = (float)(i - halfk);
        float t = expf(scale * x * x);
        kernel[i] = t;
        sum += t;
    }
    scale = 1.f / sum;
    #pragma omp parallel for if(nthreads>1)
    for (i = 0; i < ksize; i++)
        kernel[i] *= scale;

    // blur each row
    #pragma omp parallel for if(nthreads>1) // this is the for loop I parallelized but ended up with wrong output and running slower
    for (y = 0; y < h; y++)
    {
        int ithread = omp_get_thread_num();
        //printf("nthread %d\n", nthread);
        int x1;
        int bufi0 = ksize-1;
        float tmp = src[y*w + 0];
        for (x1 = 0; x1 < halfk ; x1++) ringbuf[ksize*ithread + x1] = tmp;
        for (; x1 < ksize-1; x1++) ringbuf[ksize*ithread + x1] = src[y*w + x1-halfk];
        for (x1 = 0; x1 < w; x1++)
        {
            const int ibufi0_fix = (x1+ksize-1)%ksize;
            if (x1 < xmax)
                ringbuf[ksize*ithread + ibufi0_fix] = src[y*w + x1+halfk];
            else
                ringbuf[ksize*ithread + ibufi0_fix] = src[y*w + w-1];
            if (bufi0 == ksize) bufi0 = 0;
            dst[y*w + x1] = convolve(kernel, &ringbuf[ksize*ithread], ksize, bufi0);
        }
    }

    // blur each column
    #pragma omp parallel for if(nthreads>1) // this is the for loop I parallelized but ended up with wrong output and running slower
    for (x = 0; x < w; x++)
    {
        int ithread = omp_get_thread_num();
        int y1;
        int bufi0 = ksize-1;
        float tmp = dst[0*w + x];
        for (y1 = 0; y1 < halfk ; y1++) ringbuf[ksize*ithread + y1] = tmp;
        for ( ; y1 < ksize-1; y1++) ringbuf[ksize*ithread + y1] = dst[(y1-halfk)*w + x];
        for (y1 = 0; y1 < h; y1++)
        {
            const int ibufi0_fix = (y1+ksize-1)%ksize;
            // per-thread offset needed here too, as in the row pass
            if (y1 < ymax)
                ringbuf[ksize*ithread + ibufi0_fix] = dst[(y1+halfk)*w + x];
            else
                ringbuf[ksize*ithread + ibufi0_fix] = dst[(h-1)*w + x];
            if (bufi0 == ksize) bufi0 = 0;
            dst[y1*w + x] = convolve(kernel, &ringbuf[ksize*ithread], ksize, bufi0);
        }
    }

    // clean up
    _mm_free(kernel);
    _mm_free(ringbuf);
}
int compare(float *dst1, float *dst2, const int n) {
    int error = 0;
    for (int i = 0; i < n; i++) {
        if (dst1[i] != dst2[i]) error++; // compare element i, not just the first
    }
    return error;
}
int main() {
    const int w = 20;
    const int h = 20;
    float *src = (float*)_mm_malloc(w*h*sizeof(float), 16);
    float *dst1 = (float*)_mm_malloc(w*h*sizeof(float), 16);
    float *dst2 = (float*)_mm_malloc(w*h*sizeof(float), 16);
    for (int i = 0; i < w*h; i++) {
        src[i] = i;
    }
    gaussian_blur(src, dst1, w, h, 1.0f, 1);
    gaussian_blur(src, dst2, w, h, 1.0f, 4);
    int error = compare(dst1, dst2, w*h);
    printf("error %d\n", error);
    _mm_free(src);
    _mm_free(dst1);
    _mm_free(dst2);
}
Edit: here is code which defines ringbuf inside the parallel block based on the comment by Hristo. It should be equivalent.
#include "math.h"
#include "omp.h"
#include "stdio.h"
#include <nmmintrin.h>
float convolve(const float *kernel, const float *ringbuf, const int ksize, const int bufi0) {
    float sum = 0.0f;
    for (int i = 0; i < ksize; i++) {
        sum += kernel[i]*ringbuf[i];
    }
    return sum;
}
void gaussian_blur(float *src, float *dst, int w, int h, float sigma, int nthreads)
{
    int x, y, i;
    int ksize = (int)(sigma * 2.f * 4.f + 1) | 1;
    int halfk = ksize / 2;
    printf("ksize %d\n", ksize);
    float scale = -0.5f/(sigma*sigma);
    float sum = 0.f;
    float *kernel;
    int xmax = w - halfk;
    int ymax = h - halfk;

    // if sigma too small, just copy src to dst
    if (ksize <= 1)
    {
        for (y = 0; y < h; y++)
            for (x = 0; x < w; x++)
                dst[y*w + x] = src[y*w + x];
        return;
    }

    // create Gaussian kernel
    //kernel = malloc(ksize * sizeof(float));
    kernel = (float*)_mm_malloc(ksize * sizeof(float), 16);
    #pragma omp parallel for reduction(+ : sum) if(nthreads>1)
    for (i = 0; i < ksize; i++)
    {
        float x = (float)(i - halfk);
        float t = expf(scale * x * x);
        kernel[i] = t;
        sum += t;
    }
    scale = 1.f / sum;
    #pragma omp parallel for if(nthreads>1)
    for (i = 0; i < ksize; i++)
        kernel[i] *= scale;

    // blur each row
    //#pragma omp parallel for if(nthreads>1) // this is the for loop I parallelized but ended up with wrong output and running slower
    #pragma omp parallel if(nthreads>1)
    {
        float *ringbuf = (float*)_mm_malloc(ksize * sizeof(float), 16);
        #pragma omp for // this is the for loop I parallelized but ended up with wrong output and running slower
        for (y = 0; y < h; y++)
        {
            int x1;
            int bufi0 = ksize-1;
            float tmp = src[y*w + 0];
            for (x1 = 0; x1 < halfk ; x1++) ringbuf[x1] = tmp;
            for (; x1 < ksize-1; x1++) ringbuf[x1] = src[y*w + x1-halfk];
            for (x1 = 0; x1 < w; x1++)
            {
                const int ibufi0_fix = (x1+ksize-1)%ksize;
                if (x1 < xmax)
                    ringbuf[ibufi0_fix] = src[y*w + x1+halfk];
                else
                    ringbuf[ibufi0_fix] = src[y*w + w-1];
                if (bufi0 == ksize) bufi0 = 0;
                dst[y*w + x1] = convolve(kernel, ringbuf, ksize, bufi0);
            }
        }
        _mm_free(ringbuf);
    }

    // blur each column
    #pragma omp parallel if(nthreads>1)
    {
        float *ringbuf = (float*)_mm_malloc(ksize * sizeof(float), 16);
        #pragma omp for // this is the for loop I parallelized but ended up with wrong output and running slower
        for (x = 0; x < w; x++)
        {
            int y1;
            int bufi0 = ksize-1;
            float tmp = dst[0*w + x];
            for (y1 = 0; y1 < halfk ; y1++) ringbuf[y1] = tmp;
            for ( ; y1 < ksize-1; y1++) ringbuf[y1] = dst[(y1-halfk)*w + x];
            for (y1 = 0; y1 < h; y1++)
            {
                const int ibufi0_fix = (y1+ksize-1)%ksize;
                if (y1 < ymax)
                    ringbuf[ibufi0_fix] = dst[(y1+halfk)*w + x];
                else
                    ringbuf[ibufi0_fix] = dst[(h-1)*w + x];
                if (bufi0 == ksize) bufi0 = 0;
                dst[y1*w + x] = convolve(kernel, ringbuf, ksize, bufi0);
            }
        }
        _mm_free(ringbuf);
    }

    // clean up
    _mm_free(kernel);
}
int compare(float *dst1, float *dst2, const int n) {
    int error = 0;
    for (int i = 0; i < n; i++) {
        if (dst1[i] != dst2[i]) error++; // compare element i, not just the first
    }
    return error;
}
int main() {
    const int w = 20;
    const int h = 20;
    float *src = (float*)_mm_malloc(w*h*sizeof(float), 16);
    float *dst1 = (float*)_mm_malloc(w*h*sizeof(float), 16);
    float *dst2 = (float*)_mm_malloc(w*h*sizeof(float), 16);
    for (int i = 0; i < w*h; i++) {
        src[i] = i;
    }
    gaussian_blur(src, dst1, w, h, 1.0f, 1);
    gaussian_blur(src, dst2, w, h, 1.0f, 4);
    int error = compare(dst1, dst2, w*h);
    printf("error %d\n", error);
    _mm_free(src);
    _mm_free(dst1);
    _mm_free(dst2);
}

Generic fast Transpose of non-square matrix CUDA

The SDK provides an example of, and strategies for, tackling a square-matrix transpose, but is there a good way of performing a transpose on a non-square matrix? I currently have quite a naive implementation, as follows, which is probably terrible:
template<class S>
__global__ void transpose(S *Source, S *Destination, int SizeX, int SizeY) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < SizeX*SizeY) {
        int X = tid % SizeX;
        int Y = tid / SizeX;
        //(x,y) => (y,x)
        int newId = (SizeY*X) + Y;
        Destination[newId] = Source[tid];
    }
}
My idea here was to transpose the square part of the matrix with only the necessary threads/blocks (each thread swaps two entries of the square sub-matrix), then traverse and transpose the remaining entries.
__global__ void kernelTranspuesta(float *a, float *c, int m, int n) {
    int i = threadIdx.x + blockIdx.x*blockDim.x;
    int j = threadIdx.y + blockIdx.y*blockDim.y;
    int smallest = m < n ? m : n;

    while (j < smallest) {
        i = threadIdx.x + blockIdx.x*blockDim.x;
        while (i < j) {
            c[i*m+j] = a[j*n+i];
            c[j*m+i] = a[i*n+j];
            i += blockDim.x*gridDim.x;
        }
        if (i == j)
            c[j*m+i] = a[i*n+j];
        j += blockDim.y*gridDim.y;
    }

    if (m > n) {
        i = threadIdx.x + blockIdx.x*blockDim.x + n;
        j = threadIdx.y + blockIdx.y*blockDim.y;
        while (i < m) {
            j = threadIdx.y + blockIdx.y*blockDim.y;
            while (j < n) {
                c[j*m+i] = a[i*n+j];
                j += blockDim.y*gridDim.y;
            }
            i += blockDim.x*gridDim.x;
        }
    } else {
        i = threadIdx.x + blockIdx.x*blockDim.x;
        j = threadIdx.y + blockIdx.y*blockDim.y + m;
        while (i < m) {
            j = threadIdx.y + blockIdx.y*blockDim.y + m;
            while (j < n) {
                c[j*m+i] = a[i*n+j];
                j += blockDim.y*gridDim.y;
            }
            i += blockDim.x*gridDim.x;
        }
    }
}
The kernel call is
dim3 hilos(16,16); // hilos(blockDim.x, blockDim.y)
dim3 bloques(8,8); // bloques(gridDim.x, gridDim.y)
kernelTranspuesta<<<bloques, hilos>>>(aD, cD, m, n);
I tested it on 512x256 and 256x512 matrices, let me know what you think.
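For comparison, the shared-memory tiling strategy from the SDK's square-transpose sample carries over to non-square matrices with nothing more than bounds checks. A sketch of my own (untested here): a 32x32 tile is read coalesced from the source, transposed through shared memory, and written coalesced to the destination, with the +1 padding avoiding shared-memory bank conflicts. Launch with blockDim (32, 8) and gridDim (ceil(w/32), ceil(h/32)):
#define TILE 32
#define BROWS 8
template<class S>
__global__ void transposeTiled(const S *in, S *out, int w, int h) { // in: h rows x w cols
    __shared__ S tile[TILE][TILE + 1]; // +1 column pads away bank conflicts

    int x = blockIdx.x * TILE + threadIdx.x;
    int y = blockIdx.y * TILE + threadIdx.y;
    for (int i = 0; i < TILE; i += BROWS) // coalesced reads
        if (x < w && (y + i) < h)
            tile[threadIdx.y + i][threadIdx.x] = in[(y + i) * w + x];
    __syncthreads();

    x = blockIdx.y * TILE + threadIdx.x; // swap block coordinates
    y = blockIdx.x * TILE + threadIdx.y;
    for (int i = 0; i < TILE; i += BROWS) // coalesced writes
        if (x < h && (y + i) < w)
            out[(y + i) * h + x] = tile[threadIdx.x][threadIdx.y + i];
}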
