Strassen algorithm not the fastest? - algorithm

I copied strassen's algorithm from somewhere and then executed it. Here is the output
n = 256
classical took 360ms
strassen 1 took 33609ms
strassen2 took 1172ms
classical took 437ms
strassen 1 took 32891ms
strassen2 took 1156ms
classical took 266ms
strassen 1 took 27234ms
strassen2 took 734ms
where strassen1 is a dynamic-allocation approach, strassen2 is a variant that reuses preallocated buffers to be cache-friendly, and classical is the ordinary triple-loop matrix multiplication. This suggests that the old, simple classical algorithm is the fastest. Is that true, or am I going wrong somewhere? Here's the code in Java.
import java.util.Random;
class TestIntMatrixMultiplication {
public static void main (String...args) throws Exception {
final int n = args.length > 0 ? Integer.parseInt(args[0]) : 256;
final int seed = args.length > 1 ? Integer.parseInt(args[1]) : 256;
final Random random = new Random(seed);
int[][] a, b, c;
a = new int[n][n];
b = new int[n][n];
c = new int[n][n];
for(int i=0; i<n; i++) {
for(int j=0; j<n; j++) {
a[i][j] = random.nextInt(100);
b[i][j] = random.nextInt(100);
System.out.println("n = " + n);
if (a.length < 64) {
Classical.mult(c, a, b);
strassen2.mult(c, a, b);
for (int i = 0; i <3; ++i) {
timeMultiplies1(a, b, c);
if (n <= 256)
timeMultiplies2( a, b, c);
timeMultiplies3( a, b, c);
static void timeMultiplies1 (int[][] a, int[][] b, int[][] c) {
final long start = System.currentTimeMillis();
Classical.mult(c, a, b);
final long finish = System.currentTimeMillis();
System.out.println("classical took " + (finish - start) + "ms");
static void timeMultiplies2(int[][] a, int[][] b, int[][] c) {
final long start = System.currentTimeMillis();
strassen1.mult(c, a, b);
final long finish = System.currentTimeMillis();
System.out.println("strassen 1 took " + (finish - start) + "ms");
static void timeMultiplies3 (int[][] a, int[][] b, int[][] c) {
final long start = System.currentTimeMillis();
strassen2.mult(c, a, b);
final long finish = System.currentTimeMillis();
System.out.println("strassen2 took " + (finish - start) + "ms");
static void dumpMatrix (int[][] m) {
for (int[] row : m) {
for (int val : row) {
class strassen1{
public String getName () {
return "Strassen(dynamic)";
public static int[][] mult (int[][] c, int[][] a, int[][] b) {
return strassenMatrixMultiplication(a, b);
public static int [][] strassenMatrixMultiplication(int [][] A, int [][] B) {
int n = A.length;
int [][] result = new int[n][n];
if(n == 1) {
result[0][0] = A[0][0] * B[0][0];
} else {
int [][] A11 = new int[n/2][n/2];
int [][] A12 = new int[n/2][n/2];
int [][] A21 = new int[n/2][n/2];
int [][] A22 = new int[n/2][n/2];
int [][] B11 = new int[n/2][n/2];
int [][] B12 = new int[n/2][n/2];
int [][] B21 = new int[n/2][n/2];
int [][] B22 = new int[n/2][n/2];
divideArray(A, A11, 0 , 0);
divideArray(A, A12, 0 , n/2);
divideArray(A, A21, n/2, 0);
divideArray(A, A22, n/2, n/2);
divideArray(B, B11, 0 , 0);
divideArray(B, B12, 0 , n/2);
divideArray(B, B21, n/2, 0);
divideArray(B, B22, n/2, n/2);
int [][] P1 = strassenMatrixMultiplication(addMatrices(A11, A22), addMatrices(B11, B22));
int [][] P2 = strassenMatrixMultiplication(addMatrices(A21, A22), B11);
int [][] P3 = strassenMatrixMultiplication(A11, subtractMatrices(B12, B22));
int [][] P4 = strassenMatrixMultiplication(A22, subtractMatrices(B21, B11));
int [][] P5 = strassenMatrixMultiplication(addMatrices(A11, A12), B22);
int [][] P6 = strassenMatrixMultiplication(subtractMatrices(A21, A11), addMatrices(B11, B12));
int [][] P7 = strassenMatrixMultiplication(subtractMatrices(A12, A22), addMatrices(B21, B22));
int [][] C11 = addMatrices(subtractMatrices(addMatrices(P1, P4), P5), P7);
int [][] C12 = addMatrices(P3, P5);
int [][] C21 = addMatrices(P2, P4);
int [][] C22 = addMatrices(subtractMatrices(addMatrices(P1, P3), P2), P6);
copySubArray(C11, result, 0 , 0);
copySubArray(C12, result, 0 , n/2);
copySubArray(C21, result, n/2, 0);
copySubArray(C22, result, n/2, n/2);
return result;
public static int [][] addMatrices(int [][] A, int [][] B) {
int n = A.length;
int [][] result = new int[n][n];
for(int i=0; i<n; i++)
for(int j=0; j<n; j++)
result[i][j] = A[i][j] + B[i][j];
return result;
public static int [][] subtractMatrices(int [][] A, int [][] B) {
int n = A.length;
int [][] result = new int[n][n];
for(int i=0; i<n; i++)
for(int j=0; j<n; j++)
result[i][j] = A[i][j] - B[i][j];
return result;
public static void divideArray(int[][] parent, int[][] child, int iB, int jB) {
for(int i1 = 0, i2=iB; i1<child.length; i1++, i2++)
for(int j1 = 0, j2=jB; j1<child.length; j1++, j2++)
child[i1][j1] = parent[i2][j2];
public static void copySubArray(int[][] child, int[][] parent, int iB, int jB) {
for(int i1 = 0, i2=iB; i1<child.length; i1++, i2++)
for(int j1 = 0, j2=jB; j1<child.length; j1++, j2++)
parent[i2][j2] = child[i1][j1];
class strassen2{
public String getName () {
return "Strassen(cached)";
static int [][] p1;
static int [][] p2;
static int [][] p3;
static int [][] p4;
static int [][] p5;
static int [][] p6;
static int [][] p7;
static int [][] t0;
static int [][] t1;
public static int[][] mult (int[][] c, int[][] a, int[][] b) {
final int n = c.length;
if (p1 == null || p1.length < n) {
p1 = new int[n/2][n-1];
p2 = new int[n/2][n-1];
p3 = new int[n/2][n-1];
p4 = new int[n/2][n-1];
p5 = new int[n/2][n-1];
p6 = new int[n/2][n-1];
p7 = new int[n/2][n-1];
t0 = new int[n/2][n-1];
t1 = new int[n/2][n-1];
mult(c, a, b, 0, 0, n, 0);
return c;
public static void mult (int[][] c, int[][] a, int[][] b, int i0, int j0, int n, int offs) {
if(n == 1) {
c[i0][j0] = a[i0][j0] * b[i0][j0];
} else {
final int nBy2 = n/2;
final int i1 = i0 + nBy2;
final int j1 = j0 + nBy2;
// offset applied to 'p' j index so recursive calls don't overwrite data
final int jp0 = offs;
final int jp1 = nBy2 + offs;
// P1 <- (A11 + A22)(B11 + B22)
// T0 <- (A11 + A22), T1 <- (B11 + B22), P1 <- T0*T1
for (int i = 0; i < nBy2; ++i) {
for (int j = 0; j < nBy2; ++j) {
t0[i + i0][j + jp0] = a[i + i0][j + j0] + a[i + i1][j + j1];
t1[i + i0][j + jp0] = b[i + i0][j + j0] + b[i + i1][j + j1];
mult(p1, t0, t1, i0, jp0, nBy2, offs + nBy2);
// P2 <- (A21 + A22)B11
// T0 <- (A21 + A22), T1 <- B11, P2 <- T0*T1
for (int i = 0; i < nBy2; ++i) {
for (int j = 0; j < nBy2; ++j) {
t0[i + i0][j + jp0] = a[i + i1][j + j0] + a[i + i1][j + j1];
t1[i + i0][j + jp0] = b[i + i0][j + j0];
mult(p2, t0, t1, i0, jp0, nBy2, offs + nBy2);
// P3 <- A11(B12 - B22)
// T0 <- A11, T1 <- (B12 - B22), P3 <- T0*T1
for (int i = 0; i < nBy2; ++i) {
for (int j = 0; j < nBy2; ++j) {
t0[i + i0][j + jp0] = a[i + i0][j + j0];
t1[i + i0][j + jp0] = b[i + i0][j + j1] - b[i + i1][j + j1];
mult(p3, t0, t1, i0, jp0, nBy2, offs + nBy2);
// P4 <- A22(B21 - B11)
// T0 <- A22, T1 <- (B21 - B11), P4 <- T0*T1
for (int i = 0; i < nBy2; ++i) {
for (int j = 0; j < nBy2; ++j) {
t0[i + i0][j + jp0] = a[i + i1][j + j1];
t1[i + i0][j + jp0] = b[i + i1][j + j0] - b[i + i0][j + j0];
mult(p4, t0, t1, i0, jp0, nBy2, offs + nBy2);
// P5 <- (A11 + A12) B22
// T0 <- (A11 + A12), T1 <- B22, P5 <- T0*T1
for (int i = 0; i < nBy2; ++i) {
for (int j = 0; j < nBy2; ++j) {
t0[i + i0][j + jp0] = a[i + i0][j + j0] + a[i + i0][j + j1];
t1[i + i0][j + jp0] = b[i + i1][j + j1];
mult(p5, t0, t1, i0, jp0, nBy2, offs + nBy2);
// P6 <- (A21 - A11)(B11 - B12)
// T0 <- (A21 - A11), T1 <- (B11 - B12), P6 <- T0 * T1
for (int i = 0; i < nBy2; ++i) {
for (int j = 0; j < nBy2; ++j) {
t0[i + i0][j + jp0] = a[i + i1][j + j0] - a[i + i0][j + j0];
t1[i + i0][j + jp0] = b[i + i0][j + j0] - b[i + i0][j + j1];
mult(p6, t0, t1, i0, jp0, nBy2, offs + nBy2);
// P7 <- (A12 - A22)(B21 + B22)
// T0 <- (A12 - A22), T1 <- (B21 + B22), P7 <- T0 * T1
for (int i = 0; i < nBy2; ++i) {
for (int j = 0; j < nBy2; ++j) {
t0[i + i0][j + jp0] = a[i + i0][j + j1] - a[i + i1][j + j1];
t1[i + i0][j + jp0] = b[i + i1][j + j0] + b[i + i1][j + j1];
mult(p7, t0, t1, i0, jp0, nBy2, offs + nBy2);
// combine
for (int i = 0; i < nBy2; ++i) {
for (int j = 0; j < nBy2; ++j) {
// C11 = P1 + P4 - P5 + P7;
c[i + i0][j + j0] = p1[i + i0][j + jp0] + p4[i + i0][j + jp0] - p5[i + i0][j + jp0] + p7[i + i0][j + jp0];
// C12 = P3 + P5;
c[i + i0][j + j1] = p3[i + i0][j + jp0] + p5[i + i0][j + jp0];
// C21 = P2 + P4;
c[i + i1][j + j0] = p2[i + i0][j + jp0] + p4[i + i0][j + jp0];
// C22 = P1 + P3 - P2 + P6;
c[i + i1][j + j1] = p1[i + i0][j + jp0] + p3[i + i0][j + jp0] - p2[i + i0][j + jp0] + p6[i + i0][j + jp0];
void dumpInternal () {
class Classical{
public String getName () {
return "classic";
public static int[][] mult (int[][] c, int[][] a, int[][] b) {
int n = a.length;
for(int i=0; i<n; i++) {
final int[] a_i = a[i];
final int[] c_i = c[i];
for(int j=0; j<n; j++) {
int sum = 0;
for(int k=0; k<n; k++) {
sum += a_i[k] * b[k][j];
c_i[j] = sum;
return c;

Issues I see:
1)Your Strassen multiply is dynamically allocating memory all the time. This is going to kill performance.
2)Your Strassen multiply should switch over to conventional multiply for small sizes rather than being recursive all the way down (though this optimization sort of invalidates your test).
3)Your matrix size may simply be too small to see the difference.
You should do comparisons with several different sizes. Perhaps 256, 512, 1024, 2048, 4096, 8192... Then plot the times and look at the trends. You will probably want matrix size on a log scale if it's all powers of 2.
Strassen is only faster for large N. How large will depend a lot on the implementation. What you have done for classical is only a basic implementation and is not optimal on a modern machine either.

Implementation questions aside, I think you're misunderstanding the algorithm's performance. Like phkahler said, your expectations are a little off for the performance of the algorithm. Divide-and-conquer algorithms work well for large inputs because they recursively break the problem into sub-problems which can be solved more quickly.
However, the overhead associated with this splitting can cause the algorithm to run (sometimes much) slower for small or even medium-sized inputs. Typically, the theoretical analysis of an algorithm like Strassen will include a so-called "breakpoint" calculation. This is the input size above which paying the overhead of splitting becomes preferable to the naive technique.
Your code needs to include a check on the size of the input that switches to the naive technique at the breakpoint.

Write down what the Strassen algorithm does for a 2 x 2 matrix. Count the operations. The number is absolutely ridiculous. It's stupid to use Strassen's method for a 2x2 matrix. Same for a 3 x 3, or 4 x 4, matrix and probably quite a way up.


MVC image blending algorithm implementation

I followed the algorithm mentioned in the sig09 paper Coordinates for Instant Image Cloning
The algorithm
This is my code:
#define vector std::vector
#define queue std::queue
#define map std::map
#define cin std::cin
#define cout std::cout
#define endl std::endl
#define string std::string
#define PDD std::pair<double, double>
#define image Mat<unsigned char>
template<class T>
class Mat{
int row, col, channel;
vector<T> data;
Mat(int row, int col, int cha):row(row), col(col), channel(cha){
data.resize(row * col * cha, 0);
Mat(const char *name){
T *t = stbi_load(name, &col, &row, &channel, 0);
data = vector<T>(t, t + col * row * channel);
T& at(int x, int y, int z){
return data[(x * col + y) * channel + z];
void write(const char *name){
stbi_write_bmp(name, col, row, channel,;
#define x first
#define y second
vector<PDD> P;
map<int, map<int, bool>> st; // register
vector<vector<double>> w;
int dx8[] = {1, 1, 1, 0, 0, -1, -1, -1}, dy8[] = {-1, 0, 1, -1, 1, -1, 0, 1};
int dx4[] = {0, 0, 1, -1}, dy4[] = {1, -1, 0, 0};
bool check(int i, int j, image &mask){
if(, j, 0) == 0) return false;
return, j - 1, 0) == 0||, j + 1, 0) == 0|| - 1, j, 0) == 0|| + 1, j, 0) == 0;
void dfs(int sx, int sy, int x, int y, int px, int py, image &mask){
if(, y - 1, 0) == 0 && st[x][y - 1] == 0) P.push_back({x, y - 1}), st[x][y - 1] = 1;
if(px != -1){
if( + 1, y, 0) == 0 && st[x + 1][y] == 0) P.push_back({x + 1, y}), st[x + 1][y] = 1;
if(, y + 1, 0) == 0 && st[x][y + 1] == 0) P.push_back({x, y + 1}), st[x][y + 1] = 1;
if( - 1, y, 0) == 0 && st[x - 1][y] == 0) P.push_back({x - 1, y}), st[x - 1][y] = 1;
if(sx == x && sy == y && px != -1) return;
for(int i = 0; i < 8; i ++){
int a = x + dx8[i], b = y + dy8[i];
if(a < 0 || b < 0 || a >= mask.row || b >= mask.col) continue;
if(check(a, b, mask) && (a != px || b != py)) dfs(sx, sy, a, b, x, y, mask);
// Euclidean length of the 2-D vector a (.x/.y are this file's macros for first/second).
double len(const PDD &a){
    return sqrt(a.x * a.x + a.y * a.y);
}
// Dot product of the 2-D vectors a and b.
double dot(const PDD &a, const PDD &b){
    return a.x * b.x + a.y * b.y;
}
// Component-wise difference a - b.
PDD minus(const PDD &a, const PDD &b){
    return {a.x - b.x, a.y - b.y};
}
// Returns a scaled to unit length.
// A zero-length input is returned unchanged: dividing by its length would
// produce NaN components that then propagate through every later computation.
PDD normalize(const PDD &a){
    const double l = len(a);   // computed once instead of once per component
    if(l == 0) return a;
    return {a.x / l, a.y / l};
}
// Weight of boundary vertex cur as seen from point o:
//   (tan(alpha1 / 2) + tan(alpha2 / 2)) / |cur - o|
// where alpha1 is the angle at o between pre and cur, and alpha2 the angle at o
// between nxt and cur.
//
// The cosines come from dot products of normalised vectors, so rounding can
// push them marginally outside [-1, 1]; acos() then returns NaN. They are
// clamped before use. If o coincides with cur the division by |cur - o| is
// still a division by zero, so callers must not pass a boundary vertex as o.
double val(PDD &pre, PDD &cur, PDD &nxt, PDD &o){
    const PDD V1 = normalize(minus(pre, o));
    const PDD V2 = normalize(minus(nxt, o));
    const PDD mid = normalize(minus(cur, o));
    double cos1 = dot(V1, mid);
    double cos2 = dot(V2, mid);
    if(cos1 > 1.0) cos1 = 1.0; else if(cos1 < -1.0) cos1 = -1.0;
    if(cos2 > 1.0) cos2 = 1.0; else if(cos2 < -1.0) cos2 = -1.0;
    const double alpha1 = acos(cos1);
    const double alpha2 = acos(cos2);
    return (tan(alpha1 / 2) + tan(alpha2 / 2)) / len(minus(cur, o));
}
int main(int argc, char *argv[]){
image src("src.png");
image mask("mask1.png");
image tar("target.png");
image res(tar.row, tar.col,;
for(int i = 0; i < mask.row; i ++){
for(int j = 0; j < mask.col; j ++){
if(, j, 0) == 255 && st[i][j] == 0){
dfs(i, j, i, j, -1, -1, mask); // find counter-clockwise border
queue<PDD> q;
vector<PDD> X;
q.push({i, j});
st[i][j] = 1;
while(q.size()){ // get all white (x, y)s in mask
auto h = q.front();
vector<double> wx;
for(int k = 0; k < P.size(); k ++){ // calculate lambda value by search order
int pre = (k - 1 + P.size()) % P.size();
int cur = k;
int nxt = (k + 1) % P.size();
val(P[pre], P[cur], P[nxt], h)
for(int k = 0; k < 4; k ++){
int a = h.x + dx4[k], b = h.y + dy4[k];
if(st[a][b] == 1 ||, b, 0) == 0) continue;
st[a][b] = 1;
q.push({a, b});
for(int c = 0; c <; c ++){ // every channel of res
for(int k = 0; k < X.size(); k ++){
double rx = 0, sum = 0;
for(int u = 0; u < w[k].size(); u ++){
double diff =[u].x, P[u].y, c) -[u].x, P[u].y, c);
rx += w[k][u] * diff;
sum += w[k][u];
rx /= sum;[k].x, X[k].y, c) = rx +[k].x, X[k].y, c);
return 0;
1. get the border (counter-clockwise) of the white region in the mask
2. get all (x, y)s of pixels in the white area of the mask
3. calculate lambda value of every (x, y) in 2, but I found that lambda values of every (x, y) contain many nans (possibly caused by too small value in function val(...))
The question is I do not know how to deal with this condition in 3, nor did the paper mention it.

Bit-reversal algorithm by Rutkowska

I found a very interesting paper about bit-reversal algorithm suitable for in-place FFT: "A simple algorithm for the bit-reversal permutation" by
Urszula Rutkowska from 1990 (
However, her algorithm G1 does not appear to work as the very first iteration results in out-of-bounds error for that N1 = L << 1 and swap(a + 1, a + N1);. I assume L means the length of input vector.
Please, does anyone know if there was any errata for the paper or how to fix the algorithm?
The paper's pseudocode:
{int i,j,L1
unsigned k;
j=0; L1=L-1;
{ a=i<<1;
{ a=i<<1;
while(k<=j){ j=j-k;
Screenshot of the paper:
It was pretty garbled, frankly. I had to read the paper for the idea (run Gold's algorithm (G) for L/4 and then derive the swaps for L) and then sort of massage the code into the right form. Here's my final result.
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
/* True when L is positive and has exactly one bit set. */
static bool is_power_of_two(int L) {
  if (L <= 0) return false;
  const int without_lowest_bit = L & (L - 1);
  return without_lowest_bit == 0;
}
/* Stand-in for an element swap: only reports the index pair on stdout. */
static void swap(int i, int j) {
  printf("swap %d,%d\n", i, j);
}
/* Gold's bit-reversal permutation for a length-L sequence (L a power of two).
 * j is maintained as the bit-reversed value of i; each pair is reported once,
 * when i < j. */
static void G(int L) {
  int j = 0;
  for (int i = 0; i < L - 1; i++) {
    if (i < j) {
      swap(i, j);
    }
    /* Advance j to the bit-reversal of i + 1: clear leading one bits from the
     * top down, then set the first zero bit found. */
    int k = L >> 1;
    while (j >= k) {
      j -= k;
      k >>= 1;
    }
    j += k;
  }
}
/* Rutkowska-style bit reversal for length L (a power of two): runs the Gold
 * counter over L/4 values and derives up to four swaps of the full-length
 * permutation from each (i, j) pair. */
static void G1(int L) {
  if (L < 4) {
    /* Lengths 1 and 2 are already in bit-reversed order; returning here also
     * keeps L2 - 1 below from going negative. */
    return;
  }
  int j = 0;
  int N1 = L >> 1;
  int N2 = N1 + 1;
  int L2 = L >> 2;
  for (int i = 0; i < L2 - 1; i++) {
    if (i < j) {
      int a = i << 1;
      int b = j << 1;
      swap(a, b);
      swap(a + N2, b + N2);
      swap(a + 1, b + N1);
      swap(b + 1, a + N1);
    } else if (i == j) {
      int a = i << 1;
      swap(a + 1, a + N1);
    }
    /* Advance j to the bit-reversal of i + 1 within L2 values. */
    int k = L2 >> 1;
    while (j >= k) {
      j -= k;
      k >>= 1;
    }
    j += k;
  }
  /* The last index, L2 - 1, is never visited by the loop; emit its swap here. */
  int a = (L2 - 1) << 1;
  swap(a + 1, a + N1);
}
int main(int argc, char *argv[]) {
assert(1 < argc);
int L = atoi(argv[1]);

3d point closest to multiple lines in 3D space

I am searching for a non-iterative, closed-form algorithm to find the least-squares solution for the point closest to a set of 3D lines. It is similar to 3D point triangulation (to minimize re-projections) but seems to be simpler and faster?
Lines can be described in any form, 2 points, point and unit direction or similar.
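Let the $i$-th line be given by point $a_i$ and unit direction vector $d_i$. We need to find the single point $p$ that minimizes the sum of squared point-to-line distances. This is where the gradient is the zero vector:
$$\nabla_p \sum_i \left[ (d_i^T d_i)\,(a_i - p)^T (a_i - p) - \left(d_i^T (a_i - p)\right)^2 \right] = 0$$
Expanding the gradient,
$$\sum_i \left[ -2\,(d_i^T d_i)\,(a_i - p) + 2\, d_i\, d_i^T (a_i - p) \right] = 0$$
Algebra yields a canonical 3x3 linear system,
$$M p = b$$
where the k'th row (a 3-element row vector) of matrix M is
$$M_k = \sum_i \left[ d_{ik}\, d_i - (d_i^T d_i)\, e_k \right]^T$$
with vector $e_k$ the respective unit basis vector, and
$$b = \sum_i \left[ d_i\,(d_i^T a_i) - a_i\,(d_i^T d_i) \right]$$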
It's not hard to turn this into code. I borrowed (and fixed a small bug in) a Gaussian elimination function from Rosettacode to solve the system. Thanks to the author!
#include <stdio.h>
#include <math.h>
typedef double VEC[3];  /* three-component vector */
typedef VEC MAT[3];     /* 3x3 matrix stored as three row vectors */
void solve(double *a, double *b, double *x, int n); // linear solver

/* Dot product of two three-component vectors. */
double dot(VEC a, VEC b) {
  const double xx = a[0] * b[0];
  const double yy = a[1] * b[1];
  const double zz = a[2] * b[2];
  return xx + yy + zz;
}
/* Writes into p the point minimising the sum of squared distances to n lines,
 * where line i passes through a[i] with direction d[i].
 *
 * Accumulates M = sum(d d^T - (d.d) I) and b = sum(d (d.a) - a (d.d)) over all
 * lines, then solves the 3x3 system M p = b. */
void find_nearest_point(VEC p, VEC a[], VEC d[], int n) {
  MAT m = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
  VEC b = {0, 0, 0};
  for (int i = 0; i < n; ++i) {
    double d2 = dot(d[i], d[i]), da = dot(d[i], a[i]);
    for (int ii = 0; ii < 3; ++ii) {
      for (int jj = 0; jj < 3; ++jj) m[ii][jj] += d[i][ii] * d[i][jj];
      m[ii][ii] -= d2;
      b[ii] += d[i][ii] * da - a[i][ii] * d2;
    }
  }
  solve(&m[0][0], b, p, 3);
}
// Debug printing.
/* Prints the three components of v with 3 decimals, preceded by l and followed
 * by r. The delimiters are only read, so they are taken as const pointers;
 * callers pass string literals. */
void pp(VEC v, const char *l, const char *r) {
  printf("%s%.3lf, %.3lf, %.3lf%s", l, v[0], v[1], v[2], r);
}
/* Prints v via pp, wrapped in parentheses. */
void pv(VEC v) {
  pp(v, "(", ")");
}
/* Prints the three rows of m via pp, each on a new line inside square brackets. */
void pm(MAT m) {
  int row = 0;
  while (row < 3) {
    pp(m[row], "\n[", "]");
    ++row;
  }
}
// A simple verifier.
/* Returns |d|^2 * |a - p|^2 - (d . (a - p))^2 for point p and the line through
 * a with direction d. For a unit-length d this is the squared distance from p
 * to the line. */
double dist2(VEC p, VEC a, VEC d) {
  VEC pa = { a[0]-p[0], a[1]-p[1], a[2]-p[2] };
  double dpa = dot(d, pa);
  return dot(d, d) * dot(pa, pa) - dpa * dpa;
}
/* Sum of dist2(p, a[i], d[i]) over the n lines. */
double sum_dist2(VEC p, VEC a[], VEC d[], int n) {
  double sum = 0;
  for (int i = 0; i < n; ++i) sum += dist2(p, a[i], d[i]);
  return sum;
}
// Check 26 nearby points and verify the provided one is nearest.
/* Evaluates sum_dist2 at p and at its 26 neighbours on a grid of spacing D,
 * and returns non-zero when p itself gives the smallest value (ties are
 * resolved in favour of p). */
int is_nearest(VEC p, VEC a[], VEC d[], int n) {
  double min_d2 = 1e100;
  int ii = 2, jj = 2, kk = 2;  /* offsets of the best point; 2 means "none yet" */
#define D 0.01
  for (int i = -1; i <= 1; ++i)
    for (int j = -1; j <= 1; ++j)
      for (int k = -1; k <= 1; ++k) {
        /* Named q, not pp, so the debug printer pp() is not shadowed. */
        VEC q = { p[0] + D * i, p[1] + D * j, p[2] + D * k };
        double d2 = sum_dist2(q, a, d, n);
        // Prefer provided point among equals.
        if (d2 < min_d2 || (i == 0 && j == 0 && k == 0 && d2 == min_d2)) {
          min_d2 = d2;
          ii = i; jj = j; kk = k;
        }
      }
  return ii == 0 && jj == 0 && kk == 0;
}
/* Scales v in place to unit length. A zero-length vector is left unchanged,
 * because dividing by its length would fill it with NaN. */
void normalize(VEC v) {
  double len = sqrt(dot(v, v));
  if (len == 0.0) return;
  v[0] /= len;
  v[1] /= len;
  v[2] /= len;
}
/* Demo: four hard-coded lines (point a[i], direction d[i]); normalises the
 * directions, solves for the nearest point and checks it with is_nearest. */
int main(void) {
  VEC a[] = {{-14.2, 17, -1}, {1, 1, 1}, {2.3, 4.1, 9.8}, {1,2,3}};
  VEC d[] = {{1.3, 1.3, -10}, {12.1, -17.2, 1.1}, {19.2, 31.8, 3.5}, {4,5,6}};
  int n = 4;
  for (int i = 0; i < n; ++i) normalize(d[i]);
  VEC p;
  find_nearest_point(p, a, d, n);
  if (!is_nearest(p, a, d, n)) printf("Woops. Not nearest.\n");
  return 0;
}
// A linear solver from rosettacode (with bug fix: added a missing fabs())
/* Address of element (y, x) of the row-major n-by-n matrix stored at a. */
#define mat_elem(a, y, x, n) (a + ((y) * (n) + (x)))

/* Exchanges rows r1 and r2 of the n-by-n matrix a together with the matching
 * entries of the right-hand side b. Does nothing when r1 == r2. */
void swap_row(double *a, double *b, int r1, int r2, int n)
{
  if (r1 == r2) return;
  for (int i = 0; i < n; i++) {
    double *p1 = mat_elem(a, r1, i, n);
    double *p2 = mat_elem(a, r2, i, n);
    double tmp = *p1;
    *p1 = *p2;
    *p2 = tmp;
  }
  double tmp = b[r1];
  b[r1] = b[r2];
  b[r2] = tmp;
}

/* Solves the n-by-n system a * x = b by Gaussian elimination with partial
 * pivoting followed by back substitution.
 *
 * a (row-major) and b are overwritten during elimination; the solution is
 * written to x. A singular matrix is not detected: a zero pivot leads to a
 * division by zero. */
void solve(double *a, double *b, double *x, int n)
{
#define A(y, x) (*mat_elem(a, y, x, n))
  for (int dia = 0; dia < n; dia++) {
    /* Pick the row with the largest absolute entry in this column as pivot. */
    int max_row = dia;
    double max = fabs(A(dia, dia));
    for (int row = dia + 1; row < n; row++) {
      double cand = fabs(A(row, dia));
      if (cand > max) {
        max_row = row;
        max = cand;
      }
    }
    swap_row(a, b, dia, max_row, n);
    /* Eliminate the column below the pivot. */
    for (int row = dia + 1; row < n; row++) {
      double tmp = A(row, dia) / A(dia, dia);
      for (int col = dia + 1; col < n; col++)
        A(row, col) -= tmp * A(dia, col);
      A(row, dia) = 0;
      b[row] -= tmp * b[dia];
    }
  }
  /* Back substitution on the upper-triangular system. */
  for (int row = n - 1; row >= 0; row--) {
    double tmp = b[row];
    for (int j = n - 1; j > row; j--) tmp -= x[j] * A(row, j);
    x[row] = tmp / A(row, row);
  }
#undef A
}
This isn't extensively tested, but seems to be working fine.
Let the base point of the line be p and its unit direction vector be d.
Then the distance from a point v to this line can be calculated using the cross product
SquaredDist = ((v - p) x d)^2
Using Maple packet symbolic calculation, we can get
d := <dx, dy, dz>;
v := <vx, vy, vz>;
p := <px, py, pz>;
w := v - p;
cp := CrossProduct(d, w);
nrm := BilinearForm(cp, cp, conjugate=false); //squared dist
nr := expand(nrm);
//now partial derivatives
nrx := diff(nr, vx);
nrx := -2*dz^2*px-2*dy^2*px+2*dz^2*vx+2*dy^2*vx
nry := -2*dx^2*py-2*dz^2*py-2*dy*vz*dz+2*dx^2*vy
nrz := -2*dy^2*pz+2*dy^2*vz-2*dy*dz*vy+2*dx^2*vz
To minimize sum of squared distances, we have to make system of linear equations for zero partial derivatives like this:
vx*2*(Sum(dz^2)+Sum(dy^2)) + vy * (-2*Sum(dx*dy)) + vz *(-2*Sum(dz*dx)) =
2*Sum(dz^2*px)+2*Sum(dy^2*px) -2*Sum(dx*py*dy)-2*Sum(dz*dx*pz)
Sum(dz^2) = Sum{over all i in line indexes} {dz[i] * dz[i]}
and solve it for unknowns vx, vy, vz
Edit: Old erroneous answer for planes instead of lines, left for reference
If we use general equation of line
A * x + B * y + C * z + D = 0
then distance from point (x, y, z) to this line is
Dist = Abs(A * x + B * y + C * z + D) / Sqrt(A^2 + B^2 + C^2)
To simplify - just normalize all line equations dividing by Norm's
Norm = Sqrt(A^2 + B^2 + C^2)
a = A / Norm
b = B / Norm
c = C / Norm
d = D / Norm
now equation is
a * x + b * y + c * z + d = 0
and distance
Dist = Abs(a * x + b * y + c * z + d)
and we can use squared distances like LS method (ai, bi, ci, di are coefficients for i-th line)
F = Sum(ai*x + bi*y + ci * z + d)^2 =
Sum(ai^2*x^2 + bi^2*y^2 + ci^2*z^2 + d^2 +
2 * (ai*bi*x*y + ai*ci*x*z + bi*y*ci*z + ai*x*di + bi*y*di + ci*z*di))
partial derivatives
dF/dx = 2*Sum(ai^2*x + ai*bi*y + ai*ci*z + ai*di) = 0
dF/dy = 2*Sum(bi^2*y + ai*bi*x + bi*ci*z + bi*di) = 0
dF/dz = 2*Sum(ci^2*z + ai*ci*x + bi*ci*y + ci*di) = 0
so we have system of linear equation
x * Sum(ai^2) + y * Sum(ai*bi) + z * Sum(ai*ci)= - Sum(ai*di)
y * Sum(bi^2) + x * Sum(ai*bi) + z * Sum(bi*ci)= - Sum(bi*di)
z * Sum(ci^2) + x * Sum(ai*ci) + y * Sum(bi*ci)= - Sum(ci*di)
x * Saa + y * Sab + z * Sac = - Sad
x * Sab + y * Sbb + z * Sbc = - Sbd
x * Sac + y * Sbc + z * Scc = - Scd
where S** are corresponding sums
and can solve it for unknowns x, y, z
I needed this for a sketch in Processing, so I ported Gene's answer. Works great and thought it might save someone else a little time. Unfortunately PVector/PMatrix don't have array accessors for vectors or matrices so I had to add these as local functions.
float getv(PVector v, int i) {
if(i == 0) return v.x;
if(i == 1) return v.y;
return v.z;
void setv(PVector v, int i, float value) {
if (i == 0) v.x = value;
else if (i == 1) v.y = value;
else v.z = value;
void incv(PVector v, int i, float value) {
setv(v,i,getv(v,i) + value);
float getm(float[] mm, int r, int c) { return mm[c + r*4]; }
void setm(float[] mm, int r, int c, float value) { mm[c + r*4] = value; }
void incm(float[] mm, int r, int c, float value) { mm[c + r*4] += value; }
PVector findNearestPoint(PVector a[], PVector d[]) {
var mm = new float[16];
var b = new PVector();
var n = a.length;
for (int i = 0; i < n; ++i) {
var d2 = d[i].dot(d[i]);
var da = d[i].dot(a[i]);
for (int ii = 0; ii < 3; ++ii) {
for (int jj = 0; jj < 3; ++jj) {
incm(mm,ii,jj, getv(d[i],ii) * getv(d[i],jj));
incm(mm, ii,ii, -d2);
incv(b, ii, getv(d[i], ii) * da - getv(a[i], ii) * d2);
var p = solve(mm, new float[] {b.x, b.y, b.z});
return new PVector(p[0],p[1],p[2]);
// Verifier
float dist2(PVector p, PVector a, PVector d) {
PVector pa = new PVector( a.x-p.x, a.y-p.y, a.z-p.z );
float dpa =;
return * - dpa * dpa;
//double sum_dist2(VEC p, VEC a[], VEC d[], int n) {
float sum_dist2(PVector p, PVector a[], PVector d[]) {
int n = a.length;
float sum = 0;
for (int i = 0; i < n; ++i) {
sum += dist2(p, a[i], d[i]);
return sum;
// Check 26 nearby points and verify the provided one is nearest.
boolean isNearest(PVector p, PVector a[], PVector d[]) {
float min_d2 = 3.4028235E38;
int ii = 2, jj = 2, kk = 2;
final float D = 0.1f;
for (int i = -1; i <= 1; ++i)
for (int j = -1; j <= 1; ++j)
for (int k = -1; k <= 1; ++k) {
PVector pp = new PVector( p.x + D * i, p.y + D * j, p.z + D * k );
float d2 = sum_dist2(pp, a, d);
// Prefer provided point among equals.
if (d2 < min_d2 || i == 0 && j == 0 && k == 0 && d2 == min_d2) {
min_d2 = d2;
ii = i; jj = j; kk = k;
return ii == 0 && jj == 0 && kk == 0;
void setup() {
PVector a[] = {
new PVector(-14.2, 17, -1),
new PVector(1, 1, 1),
new PVector(2.3, 4.1, 9.8),
new PVector(1,2,3)
PVector d[] = {
new PVector(1.3, 1.3, -10),
new PVector(12.1, -17.2, 1.1),
new PVector(19.2, 31.8, 3.5),
new PVector(4,5,6)
int n = 4;
for (int i = 0; i < n; ++i)
PVector p = findNearestPoint(a, d);
if (!isNearest(p, a, d))
println("Woops. Not nearest.\n");
// From rosettacode (with bug fix: added a missing fabs())
int mat_elem(int y, int x) { return y*4+x; }
void swap_row(float[] a, float[] b, int r1, int r2, int n)
float tmp;
int p1, p2;
int i;
if (r1 == r2) return;
for (i = 0; i < n; i++) {
p1 = mat_elem(r1, i);
p2 = mat_elem(r2, i);
tmp = a[p1];
a[p1] = a[p2];
a[p2] = tmp;
tmp = b[r1];
b[r1] = b[r2];
b[r2] = tmp;
float[] solve(float[] a, float[] b)
float[] x = new float[] {0,0,0};
int n = x.length;
int i, j, col, row, max_row, dia;
float max, tmp;
for (dia = 0; dia < n; dia++) {
max_row = dia;
max = abs(getm(a, dia, dia));
for (row = dia + 1; row < n; row++) {
if ((tmp = abs(getm(a, row, dia))) > max) {
max_row = row;
max = tmp;
swap_row(a, b, dia, max_row, n);
for (row = dia + 1; row < n; row++) {
tmp = getm(a, row, dia) / getm(a, dia, dia);
for (col = dia+1; col < n; col++) {
incm(a, row, col, -tmp * getm(a, dia, col));
setm(a,row,dia, 0);
b[row] -= tmp * b[dia];
for (row = n - 1; row >= 0; row--) {
tmp = b[row];
for (j = n - 1; j > row; j--) {
tmp -= x[j] * getm(a, row, j);
x[row] = tmp / getm(a, row, row);
return x;

Iterative/ Non-Recursive Merge Sort

I was trying to write an iterative merge sort, but I am stuck on the case where the input length is not 2^x,
like int[] A ={4,5,1,254,66,75,12,8,65,4,87,63,53,8,99,54,12,34};
public class MergeSort {
public static void sort(int[] A) {
System.out.println("Log(A.len):"+log(A.length, 2));
for (int i = 0; i < log(A.length, 2); i++) { //log A.len
int r = 2 << i; //2^i
int mid = r >>> 1;
for (int j = 0; j+r < A.length; j = j + r) {
System.out.print("offset:" + j + " mid:" + (j + mid) + " r:" + (j + r));
merge(A, j, (j + mid), (j + r));
public static void merge(int[] A, int offset, int mid, int n) {
mid = mid - offset;
n = n - offset;
int[] L = new int[mid];
int[] R = new int[n - mid];
for (int i = 0; i < mid; i++) {
L[i] = A[i + offset];
R[i] = A[mid + i + offset];
int l = 0;
int r = 0; //left right pointer
int k = offset;
while (l < mid && r < mid) {
if (L[l] < R[r]) {
// System.out.println("in left");
A[k] = L[l];
} else {
// System.out.println("in right");
A[k] = R[r];
while (l < mid) {
A[k] = L[l];
while (r < mid) {
A[k] = R[r];
public static void main(String[] args) {
int[] A ={4,5,1,254,66,75,12,8,65,4,87,63,53,8,99,54,12,34};
public static void print_array(int[] A) {
for (int i = 0; i < A.length; i++) {
System.out.print(A[i] + " ");
static int log(int x, int base) {
return (int) (Math.log(x) / Math.log(base));
It works fine when input length is 2^x.
Also, is there a better way to implement the iterative version? This looks quite messy.
C++ example of bottom up merge sort. a[] is array to sort, b[] is temp array. It includes a check for number of merge passes and swaps in place if the number of passes would be odd, in order to end up with the sorted data in a[].
void BottomUpMerge(int a[], int b[], size_t ll, size_t rr, size_t ee);
void BottomUpCopy(int a[], int b[], size_t ll, size_t rr);
size_t GetPassCount(size_t n);

// Sorts the n elements of a[] ascending, using b[] (also n elements) as scratch.
// Each pass merges pairs of runs from one array into the other and then swaps
// the roles of the two arrays. When the number of passes would be odd, the
// first pass is done in place by swapping adjacent pairs, so the remaining pass
// count is even and the sorted data ends up in the caller's a[].
void BottomUpMergeSort(int a[], int b[], size_t n)
{
    size_t s = 1;                               // run size
    if(GetPassCount(n) & 1){                    // if odd number of passes
        for(s = 1; s < n; s += 2)               // swap in place for 1st pass
            if(a[s] < a[s-1])
                std::swap(a[s], a[s-1]);
        s = 2;
    }
    while(s < n){                               // while not done
        size_t ee = 0;                          // reset end index
        while(ee < n){                          // merge pairs of runs
            size_t ll = ee;                     // ll = start of left run
            size_t rr = ll+s;                   // rr = start of right run
            if(rr >= n){                        // if only left run
                rr = n;
                BottomUpCopy(a, b, ll, rr);     // copy left run
                break;                          // end of pass
            }
            ee = rr+s;                          // ee = end of right run
            if(ee > n)
                ee = n;
            BottomUpMerge(a, b, ll, rr, ee);
        }
        std::swap(a, b);                        // swap a and b
        s <<= 1;                                // double the run size
    }
}
// Merges the sorted runs a[ll..rr) and a[rr..ee) into b[ll..ee).
// Ties take the element from the left run. Either run may be empty.
void BottomUpMerge(int a[], int b[], size_t ll, size_t rr, size_t ee)
{
    size_t o = ll;                          // b[] index
    size_t l = ll;                          // a[] left index
    size_t r = rr;                          // a[] right index
    while(l < rr && r < ee)                 // merge while both runs have data
        b[o++] = (a[l] <= a[r]) ? a[l++] : a[r++];
    while(l < rr)                           // copy rest of left run
        b[o++] = a[l++];
    while(r < ee)                           // copy rest of right run
        b[o++] = a[r++];
}
// Copies a[ll..rr) to the same positions of b[]. An empty range copies nothing.
void BottomUpCopy(int a[], int b[], size_t ll, size_t rr)
{
    while(ll < rr){                         // copy left run
        b[ll] = a[ll];
        ++ll;
    }
}
// Returns the number of merge passes needed for n elements, i.e. how many
// times a run size starting at 1 must be doubled before it reaches n.
size_t GetPassCount(size_t n)               // return # passes
{
    size_t i = 0;
    for(size_t s = 1; s < n; s <<= 1)
        i += 1;
    return i;
}

Generic fast Transpose of non-square matrix CUDA

The SDK provides an example and strategies for tackling a square matrix transpose but is there a good way of performing a transpose on a non square matrix? I have quite a naive implementation currently as follows which is probably terrible:
// Naive out-of-place transpose with one thread per element.
// Source is indexed as Y * SizeX + X; element (X, Y) is written to
// Destination at X * SizeY + Y. Threads whose flat index falls outside
// SizeX * SizeY do nothing.
template<class S>
__global__ void transpose(S *Source, S *Destination, int SizeX, int SizeY) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid<SizeX*SizeY) {
        int X = tid % SizeX;
        int Y = tid / SizeX;
        //(x,y) => (y,x)
        int newId = (SizeY*X) + Y;
        Destination[newId] = Source[tid];
    }
}
Here my idea was to transpose the square part of the matrix with only the necessary threads/blocks (each thread swaps two entries of the square sub matrix), then traverse and transpose the remaining entries.
// Out-of-place transpose of a into c, where a is read as a[i*n + j] and c is
// written as c[j*m + i].
//
// Step 1 handles the leading min(m, n) x min(m, n) square: each (i, j) pair
// with i < j writes both mirrored elements, and the diagonal is copied on its
// own. Step 2 copies the remaining rows (m > n) or columns (otherwise).
// All loops advance by blockDim * gridDim, so any launch configuration covers
// the whole matrix.
//
// NOTE(review): the posted body referred to M and N while the parameters are
// m and n; the parameters are used throughout here - confirm M/N were not
// meant to be separate compile-time constants.
__global__ void kernelTranspuesta(float *a, float *c, int m, int n) {
    const int tx = threadIdx.x + blockIdx.x*blockDim.x;
    const int ty = threadIdx.y + blockIdx.y*blockDim.y;
    const int strideX = blockDim.x*gridDim.x;
    const int strideY = blockDim.y*gridDim.y;
    int i = tx;
    int j = ty;
    int smallest = m < n ? m : n;
    // Step 1: square part.
    while( j < smallest ){
        i = tx;
        while( i < j ){
            c[i*m+j] = a[j*n+i];
            c[j*m+i] = a[i*n+j];
            i+= strideX;
        }
        if(i == j)
            c[j*m+i] = a[i*n+j];
        j+= strideY;
    }
    // Step 2: the part outside the square.
    if( m > n ) {
        i = tx + n;
        while( i < m ){
            j = ty;
            while( j < n){
                c[j*m+i] = a[i*n+j];
                j+= strideY;
            }
            i+= strideX;
        }
    } else {
        i = tx;
        while( i < m ){
            j = ty + m;
            while( j < n){
                c[j*m+i] = a[i*n+j];
                j+= strideY;
            }
            i+= strideX;
        }
    }
}
The kernel call is
dim3 hilos(16,16); // hilos(blockDim.x, blockDim.y)
dim3 bloques(8,8); // bloques(gridDim.x, gridDim.y)
kernelTranspuesta<<<bloques, hilos>>>(aD, cD, m, n);
I tested it on 512x256 and 256x512 matrices, let me know what you think.
