3d point closest to multiple lines in 3D space - algorithm

I search for non iterative, closed form, algorithm to find Least squares solution for point closest to the set of 3d lines. It is similar to 3d point triangulation (to minimize re-projections) but seems to be be simpler and faster?
Lines can be described in any form, 2 points, point and unit direction or similar.

Let the i th line be given by point ai and unit direction vector di. We need to find the single point that minimizes the sum of squared point to line distances. This is where the gradient is the zero vector:
Expanding the gradient,
Algebra yields a canonical 3x3 linear system,
where the k'th row (a 3-element row vector) of matrix M is
with vector ek the respective unit basis vector, and
It's not hard to turn this into code. I borrowed (and fixed a small bug in) a Gaussian elimination function from Rosettacode to solve the system. Thanks to the author!
#include <stdio.h>
#include <math.h>
typedef double VEC[3];
typedef VEC MAT[3];
void solve(double *a, double *b, double *x, int n); // linear solver
double dot(VEC a, VEC b) { return a[0]*b[0] + a[1]*b[1] + a[2]*b[2]; }
void find_nearest_point(VEC p, VEC a[], VEC d[], int n) {
MAT m = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
VEC b = {0, 0, 0};
for (int i = 0; i < n; ++i) {
double d2 = dot(d[i], d[i]), da = dot(d[i], a[i]);
for (int ii = 0; ii < 3; ++ii) {
for (int jj = 0; jj < 3; ++jj) m[ii][jj] += d[i][ii] * d[i][jj];
m[ii][ii] -= d2;
b[ii] += d[i][ii] * da - a[i][ii] * d2;
}
}
solve(&m[0][0], b, p, 3);
}
// Debug printing.
void pp(VEC v, char *l, char *r) {
printf("%s%.3lf, %.3lf, %.3lf%s", l, v[0], v[1], v[2], r);
}
void pv(VEC v) { pp(v, "(", ")"); }
void pm(MAT m) { for (int i = 0; i < 3; ++i) pp(m[i], "\n[", "]"); }
// A simple verifier.
double dist2(VEC p, VEC a, VEC d) {
VEC pa = { a[0]-p[0], a[1]-p[1], a[2]-p[2] };
double dpa = dot(d, pa);
return dot(d, d) * dot(pa, pa) - dpa * dpa;
}
double sum_dist2(VEC p, VEC a[], VEC d[], int n) {
double sum = 0;
for (int i = 0; i < n; ++i) sum += dist2(p, a[i], d[i]);
return sum;
}
// Check 26 nearby points and verify the provided one is nearest.
int is_nearest(VEC p, VEC a[], VEC d[], int n) {
double min_d2 = 1e100;
int ii = 2, jj = 2, kk = 2;
#define D 0.01
for (int i = -1; i <= 1; ++i)
for (int j = -1; j <= 1; ++j)
for (int k = -1; k <= 1; ++k) {
VEC pp = { p[0] + D * i, p[1] + D * j, p[2] + D * k };
double d2 = sum_dist2(pp, a, d, n);
// Prefer provided point among equals.
if (d2 < min_d2 || i == 0 && j == 0 && k == 0 && d2 == min_d2) {
min_d2 = d2;
ii = i; jj = j; kk = k;
}
}
return ii == 0 && jj == 0 && kk == 0;
}
void normalize(VEC v) {
double len = sqrt(dot(v, v));
v[0] /= len;
v[1] /= len;
v[2] /= len;
}
int main(void) {
VEC a[] = {{-14.2, 17, -1}, {1, 1, 1}, {2.3, 4.1, 9.8}, {1,2,3}};
VEC d[] = {{1.3, 1.3, -10}, {12.1, -17.2, 1.1}, {19.2, 31.8, 3.5}, {4,5,6}};
int n = 4;
for (int i = 0; i < n; ++i) normalize(d[i]);
VEC p;
find_nearest_point(p, a, d, n);
pv(p);
printf("\n");
if (!is_nearest(p, a, d, n)) printf("Woops. Not nearest.\n");
return 0;
}
// A linear solver from rosettacode (with bug fix: added a missing fabs())
#define mat_elem(a, y, x, n) (a + ((y) * (n) + (x)))
void swap_row(double *a, double *b, int r1, int r2, int n)
{
double tmp, *p1, *p2;
int i;
if (r1 == r2) return;
for (i = 0; i < n; i++) {
p1 = mat_elem(a, r1, i, n);
p2 = mat_elem(a, r2, i, n);
tmp = *p1, *p1 = *p2, *p2 = tmp;
}
tmp = b[r1], b[r1] = b[r2], b[r2] = tmp;
}
void solve(double *a, double *b, double *x, int n)
{
#define A(y, x) (*mat_elem(a, y, x, n))
int i, j, col, row, max_row, dia;
double max, tmp;
for (dia = 0; dia < n; dia++) {
max_row = dia, max = fabs(A(dia, dia));
for (row = dia + 1; row < n; row++)
if ((tmp = fabs(A(row, dia))) > max) max_row = row, max = tmp;
swap_row(a, b, dia, max_row, n);
for (row = dia + 1; row < n; row++) {
tmp = A(row, dia) / A(dia, dia);
for (col = dia+1; col < n; col++)
A(row, col) -= tmp * A(dia, col);
A(row, dia) = 0;
b[row] -= tmp * b[dia];
}
}
for (row = n - 1; row >= 0; row--) {
tmp = b[row];
for (j = n - 1; j > row; j--) tmp -= x[j] * A(row, j);
x[row] = tmp / A(row, row);
}
#undef A
}
This isn't extensively tested, but seems to be working fine.

Let base point of line is p and unit direction vector is d.
Then distance from point v to this line might be calculated using cross product
SquaredDist = ((v - p) x d)^2
Using Maple packet symbolic calculation, we can get
d := <dx, dy, dz>;
v := <vx, vy, vz>;
p := <px, py, pz>;
w := v - p;
cp := CrossProduct(d, w);
nrm := BilinearForm(cp, cp, conjugate=false); //squared dist
nr := expand(nrm);
//now partial derivatives
nrx := diff(nr, vx);
//results:
nrx := -2*dz^2*px-2*dy^2*px+2*dz^2*vx+2*dy^2*vx
+2*dx*py*dy-2*dx*vy*dy+2*dz*dx*pz-2*dz*dx*vz
nry := -2*dx^2*py-2*dz^2*py-2*dy*vz*dz+2*dx^2*vy
+2*dz^2*vy+2*dy*pz*dz+2*dx*dy*px-2*dx*dy*vx
nrz := -2*dy^2*pz+2*dy^2*vz-2*dy*dz*vy+2*dx^2*vz
-2*dx^2*pz-2*dz*vx*dx+2*dy*dz*py+2*dz*px*dx
To minimize sum of squared distances, we have to make system of linear equations for zero partial derivatives like this:
vx*2*(Sum(dz^2)+Sum(dy^2)) + vy * (-2*Sum(dx*dy)) + vz *(-2*Sum(dz*dx)) =
2*Sum(dz^2*px)-2*Sum(dy^2*px) -2*Sum(dx*py*dy)-2*Sum(dz*dx*pz)
where
Sum(dz^2) = Sum{over all i in line indexes} {dz[i] * dz[i]}
and solve it for unknowns vx, vy, vz
Edit: Old erroneous answer for planes instead of lines, left for reference
If we use general equation of line
A * x + B * y + C * z + D = 0
then distance from point (x, y, z) to this line is
Dist = Abs(A * x + B * y + C * z + D) / Sqrt(A^2 + B^2 + C^2)
To simplify - just normalize all line equations dividing by Norm's
Norm = Sqrt(A^2 + B^2 + C^2)
a = A / Norm
b = B / Norm
c = C / Norm
d = D / Norm
now equation is
a * x + b * y + c * z + d = 0
and distance
Dist = Abs(a * x + b * y + c * z + d)
and we can use squared distances like LS method (ai, bi, ci, di are coefficients for i-th line)
F = Sum(ai*x + bi*y + ci * z + d)^2 =
Sum(ai^2*x^2 + bi^2*y^2 + ci^2*z^2 + d^2 +
2 * (ai*bi*x*y + ai*ci*x*z + bi*y*ci*z + ai*x*di + bi*y*di + ci*z*di))
partial derivatives
dF/dx = 2*Sum(ai^2*x + ai*bi*y + ai*ci*z + ai*di) = 0
dF/dy = 2*Sum(bi^2*y + ai*bi*x + bi*ci*z + bi*di) = 0
dF/dz = 2*Sum(ci^2*z + ai*ci*x + bi*ci*y + ci*di) = 0
so we have system of linear equation
x * Sum(ai^2) + y * Sum(ai*bi) + z * Sum(ai*ci)= - Sum(ai*di)
y * Sum(bi^2) + x * Sum(ai*bi) + z * Sum(bi*ci)= - Sum(bi*di)
z * Sum(ci^2) + x * Sum(ai*ci) + y * Sum(bi*ci)= - Sum(ci*di)
x * Saa + y * Sab + z * Sac = - Sad
x * Sab + y * Sbb + z * Sbc = - Sbd
x * Sac + y * Sbc + z * Scc = - Scd
where S** are corresponding sums
and can solve it for unknowns x, y, z

I needed this for a sketch in Processing, so I ported Gene's answer. Works great and thought it might save someone else a little time. Unfortunately PVector/PMatrix don't have array accessors for vectors or matrices so I had to add these as local functions.
float getv(PVector v, int i) {
if(i == 0) return v.x;
if(i == 1) return v.y;
return v.z;
}
void setv(PVector v, int i, float value) {
if (i == 0) v.x = value;
else if (i == 1) v.y = value;
else v.z = value;
}
void incv(PVector v, int i, float value) {
setv(v,i,getv(v,i) + value);
}
float getm(float[] mm, int r, int c) { return mm[c + r*4]; }
void setm(float[] mm, int r, int c, float value) { mm[c + r*4] = value; }
void incm(float[] mm, int r, int c, float value) { mm[c + r*4] += value; }
PVector findNearestPoint(PVector a[], PVector d[]) {
var mm = new float[16];
var b = new PVector();
var n = a.length;
for (int i = 0; i < n; ++i) {
var d2 = d[i].dot(d[i]);
var da = d[i].dot(a[i]);
for (int ii = 0; ii < 3; ++ii) {
for (int jj = 0; jj < 3; ++jj) {
incm(mm,ii,jj, getv(d[i],ii) * getv(d[i],jj));
}
incm(mm, ii,ii, -d2);
incv(b, ii, getv(d[i], ii) * da - getv(a[i], ii) * d2);
}
}
var p = solve(mm, new float[] {b.x, b.y, b.z});
return new PVector(p[0],p[1],p[2]);
}
// Verifier
float dist2(PVector p, PVector a, PVector d) {
PVector pa = new PVector( a.x-p.x, a.y-p.y, a.z-p.z );
float dpa = d.dot(pa);
return d.dot(d) * pa.dot(pa) - dpa * dpa;
}
//double sum_dist2(VEC p, VEC a[], VEC d[], int n) {
float sum_dist2(PVector p, PVector a[], PVector d[]) {
int n = a.length;
float sum = 0;
for (int i = 0; i < n; ++i) {
sum += dist2(p, a[i], d[i]);
}
return sum;
}
// Check 26 nearby points and verify the provided one is nearest.
boolean isNearest(PVector p, PVector a[], PVector d[]) {
float min_d2 = 3.4028235E38;
int ii = 2, jj = 2, kk = 2;
final float D = 0.1f;
for (int i = -1; i <= 1; ++i)
for (int j = -1; j <= 1; ++j)
for (int k = -1; k <= 1; ++k) {
PVector pp = new PVector( p.x + D * i, p.y + D * j, p.z + D * k );
float d2 = sum_dist2(pp, a, d);
// Prefer provided point among equals.
if (d2 < min_d2 || i == 0 && j == 0 && k == 0 && d2 == min_d2) {
min_d2 = d2;
ii = i; jj = j; kk = k;
}
}
return ii == 0 && jj == 0 && kk == 0;
}
void setup() {
PVector a[] = {
new PVector(-14.2, 17, -1),
new PVector(1, 1, 1),
new PVector(2.3, 4.1, 9.8),
new PVector(1,2,3)
};
PVector d[] = {
new PVector(1.3, 1.3, -10),
new PVector(12.1, -17.2, 1.1),
new PVector(19.2, 31.8, 3.5),
new PVector(4,5,6)
};
int n = 4;
for (int i = 0; i < n; ++i)
d[i].normalize();
PVector p = findNearestPoint(a, d);
println(p);
if (!isNearest(p, a, d))
println("Woops. Not nearest.\n");
}
// From rosettacode (with bug fix: added a missing fabs())
int mat_elem(int y, int x) { return y*4+x; }
void swap_row(float[] a, float[] b, int r1, int r2, int n)
{
float tmp;
int p1, p2;
int i;
if (r1 == r2) return;
for (i = 0; i < n; i++) {
p1 = mat_elem(r1, i);
p2 = mat_elem(r2, i);
tmp = a[p1];
a[p1] = a[p2];
a[p2] = tmp;
}
tmp = b[r1];
b[r1] = b[r2];
b[r2] = tmp;
}
float[] solve(float[] a, float[] b)
{
float[] x = new float[] {0,0,0};
int n = x.length;
int i, j, col, row, max_row, dia;
float max, tmp;
for (dia = 0; dia < n; dia++) {
max_row = dia;
max = abs(getm(a, dia, dia));
for (row = dia + 1; row < n; row++) {
if ((tmp = abs(getm(a, row, dia))) > max) {
max_row = row;
max = tmp;
}
}
swap_row(a, b, dia, max_row, n);
for (row = dia + 1; row < n; row++) {
tmp = getm(a, row, dia) / getm(a, dia, dia);
for (col = dia+1; col < n; col++) {
incm(a, row, col, -tmp * getm(a, dia, col));
}
setm(a,row,dia, 0);
b[row] -= tmp * b[dia];
}
}
for (row = n - 1; row >= 0; row--) {
tmp = b[row];
for (j = n - 1; j > row; j--) {
tmp -= x[j] * getm(a, row, j);
}
x[row] = tmp / getm(a, row, row);
}
return x;
}

Related

MVC image blending algorithm implementation

I followed the algorithm mentioned in the sig09 paper Coordinates for Instant Image Cloning
The algorithm
This is my code:
#include<iostream>
#include<vector>
#include<map>
#include<fstream>
#include<queue>
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include"../shared/stb_image.h"
#include"../shared/stb_image_write.h"
#define vector std::vector
#define queue std::queue
#define map std::map
#define cin std::cin
#define cout std::cout
#define endl std::endl
#define string std::string
#define PDD std::pair<double, double>
#define image Mat<unsigned char>
template<class T>
class Mat{
public:
int row, col, channel;
vector<T> data;
Mat(){}
Mat(int row, int col, int cha):row(row), col(col), channel(cha){
data.resize(row * col * cha, 0);
}
Mat(const char *name){
T *t = stbi_load(name, &col, &row, &channel, 0);
data = vector<T>(t, t + col * row * channel);
stbi_image_free(t);
}
T& at(int x, int y, int z){
return data[(x * col + y) * channel + z];
}
void write(const char *name){
stbi_write_bmp(name, col, row, channel, data.data());
}
};
#define x first
#define y second
vector<PDD> P;
map<int, map<int, bool>> st; // register
vector<vector<double>> w;
int dx8[] = {1, 1, 1, 0, 0, -1, -1, -1}, dy8[] = {-1, 0, 1, -1, 1, -1, 0, 1};
int dx4[] = {0, 0, 1, -1}, dy4[] = {1, -1, 0, 0};
bool check(int i, int j, image &mask){
if(mask.at(i, j, 0) == 0) return false;
return mask.at(i, j - 1, 0) == 0||
mask.at(i, j + 1, 0) == 0||
mask.at(i - 1, j, 0) == 0||
mask.at(i + 1, j, 0) == 0;
}
void dfs(int sx, int sy, int x, int y, int px, int py, image &mask){
if(mask.at(x, y - 1, 0) == 0 && st[x][y - 1] == 0) P.push_back({x, y - 1}), st[x][y - 1] = 1;
if(px != -1){
if(mask.at(x + 1, y, 0) == 0 && st[x + 1][y] == 0) P.push_back({x + 1, y}), st[x + 1][y] = 1;
if(mask.at(x, y + 1, 0) == 0 && st[x][y + 1] == 0) P.push_back({x, y + 1}), st[x][y + 1] = 1;
if(mask.at(x - 1, y, 0) == 0 && st[x - 1][y] == 0) P.push_back({x - 1, y}), st[x - 1][y] = 1;
}
if(sx == x && sy == y && px != -1) return;
for(int i = 0; i < 8; i ++){
int a = x + dx8[i], b = y + dy8[i];
if(a < 0 || b < 0 || a >= mask.row || b >= mask.col) continue;
if(check(a, b, mask) && (a != px || b != py)) dfs(sx, sy, a, b, x, y, mask);
}
}
double len(const PDD &a){
return sqrt(a.x * a.x + a.y * a.y);
}
double dot(const PDD &a, const PDD &b){
return a.x * b.x + a.y * b.y;
}
PDD minus(const PDD &a, const PDD &b){
return {a.x - b.x, a.y - b.y};
}
PDD normalize(const PDD &a){
return {a.x / len(a), a.y / len(a)};
}
double val(PDD &pre, PDD &cur, PDD &nxt, PDD &o){
PDD V1 = normalize(minus(pre, o));
PDD V2 = normalize(minus(nxt, o));
PDD mid = normalize(minus(cur, o));
double alpha1 = acos(dot(V1, mid));
double alpha2 = acos(dot(V2, mid));
return (tan(alpha1 / 2) + tan(alpha2 / 2)) / len(minus(cur, o)); // many nan value occured here
}
int main(int argc, char *argv[]){
image src("src.png");
image mask("mask1.png");
image tar("target.png");
image res(tar.row, tar.col, tar.channel);
for(int i = 0; i < mask.row; i ++){
for(int j = 0; j < mask.col; j ++){
if(mask.at(i, j, 0) == 255 && st[i][j] == 0){
dfs(i, j, i, j, -1, -1, mask); // find counter-clockwise border
queue<PDD> q;
vector<PDD> X;
q.push({i, j});
st[i][j] = 1;
while(q.size()){ // get all white (x, y)s in mask
auto h = q.front();
X.push_back(h);
q.pop();
vector<double> wx;
for(int k = 0; k < P.size(); k ++){ // calculate lambda value by search order
int pre = (k - 1 + P.size()) % P.size();
int cur = k;
int nxt = (k + 1) % P.size();
wx.push_back(
val(P[pre], P[cur], P[nxt], h)
);
}
w.push_back(wx);
for(int k = 0; k < 4; k ++){
int a = h.x + dx4[k], b = h.y + dy4[k];
if(st[a][b] == 1 || mask.at(a, b, 0) == 0) continue;
st[a][b] = 1;
q.push({a, b});
}
}
for(int c = 0; c < res.channel; c ++){ // every channel of res
for(int k = 0; k < X.size(); k ++){
double rx = 0, sum = 0;
for(int u = 0; u < w[k].size(); u ++){
double diff = tar.at(P[u].x, P[u].y, c) - src.at(P[u].x, P[u].y, c);
rx += w[k][u] * diff;
sum += w[k][u];
}
rx /= sum;
res.at(X[k].x, X[k].y, c) = rx + src.at(X[k].x, X[k].y, c);
}
}
res.write("./res.bmp");
return 0;
}
}
}
}
1. get the border(counter-clockwise) of the white region in the
mask
2. get all (x, y)s of pixels in the white area of the mask
3. calculate lambda value of every (x, y) in 2, but I found that lambda values of every (x, y) contain many nans (possibly caused by too small value in function val(...))
The question is I do not know how to deal with this condition in 3, nor did the paper mention it.

Runtime error for large inputs for sorting ( quicksort)

This is a very simple program where the user inputs (x,y) coordinates and distance 'd' and the program has to find out the number of unrepeated coordinates from (x,y) to (x+d,y).
For eg: if input for one test case is: 4,9,2 then the unrepeated coordinates are (4,9),(5,9) and (6,9)(x=4,y=9,d=2). I have used a sorting algorithm as mentioned in the question (to keep track of multiple occurrences) however the program shows runtime error for test cases beyond 30. Is there any mistake in the code or is it an issue with my compiler?
For a detailed explanation of question: https://www.hackerearth.com/practice/algorithms/sorting/merge-sort/practice-problems/algorithm/missing-soldiers-december-easy-easy/
#include <stdio.h>
#include <stdlib.h>
int partition(int *arr, int p, int r) {
int x;
x = arr[r];
int tmp;
int i = p - 1;
for (int j = p; j <= r - 1; ++j) {
if (arr[j] <= x) {
i = i + 1;
tmp = arr[i];
arr[i] = arr[j];
arr[j] = tmp;
}
}
tmp = arr[i + 1];
arr[i + 1] = arr[r];
arr[r] = tmp;
return (i + 1);
}
void quicksort(int *arr, int p, int r) {
int q;
if (p < r) {
q = partition(arr, p, r);
quicksort(arr, p, q - 1);
quicksort(arr, q + 1, r);
}
}
int count(int A[],int ct) {
int cnt = 0;
for (int i = 0; i < ct; ++i) {
if (A[i] != A[i + 1]) {
cnt++;
}
}
return cnt;
}
int main() {
int t;
scanf("%d", &t);
long int tmp, y, d;
int ct = 0;
int i = 0;
int x[1000];
int j = 0;
for (int l = 0; l < t; ++l) {
scanf("%d%d%d", &tmp, &y, &d);
ct = ct + d + 1; //this counts the total no of coordinates for each (x,y,d)
for (int i = 0; i <= d; ++i) {
x[j] = tmp + i; //storing all possible the x and x+d coordinates
j++;
}
}
int cnt;
int p = ct - 1;
quicksort(x, 0, p); //quicksort sorting
for (int l = 0; l < ct; ++l) {
printf("%d ", x[l]); //prints sorted array not necessary to question
}
cnt = count(x, ct); //counts the number of non-repeated vertices
printf("%d\n", cnt);
}
The problem was the bounds of the array int x[1000] is not enough for the data given below.

Given numbers from 1 to k, select d many numbers that their sum equal to v

I am trying to find the number of distinct vectors in a set that has the following properties:
A set is k numbers starting from 1 to k+1
D is the number of elements that can be selected
V is the sum of the elements
Examples
k=3, d=3, v=6, the result is 7;
<1, 2, 3>, <1, 3, 2>, <2, 1, 3>, <2, 2, 2>, <2, 3, 1>, <3, 1, 2>, <3, 2, 1>
k=4, d=2, v=7, the result is 2;
<3, 4>, <4, 3>
In this case, <2, 5> is not valid because 5 exceeds the value of k.
I want to find out if there is a mathematical formula to calculate the result. If there isn't a formula, how efficiently can this algorithm be implemented? I have found a rather mysterious implementation but i wonder if it can be improved upon.
public static int NumberOfDistinctVectors(int k, int d ,int v) {
if((v > k * d) || (v < d)) return 0;
if(d == 1 || v == d) return 1;
if(v == d + 1) return d;
int alpha = 1, beta = 0;
if(1 < v + k - k * d)
alpha = v + k - k * d;
if(k < v - d + 1)
beta = k;
else
beta = v - d + 1;
int sum = 0;
for(int i = alpha; i <= beta; i++) {
sum += NumberOfDistinctVectors(k, d-1, v-i);
}
return sum;
}
The problem is very related to the following:
What is the number of combinations to distribute b identical objects in c groups
where no group contains more than n objects?
which is discussed here
Just think of your numbers being made of the object (+1). So in your case
c = d, because each group corresponds to one of your numbers
b = v-d, since you need to put at least one (+1) object into each of the d groups
n = k-1, since we assume a (+1) already in each group and don't want to get larger than k
Find the code below (using appache-commons for c(N,K))
public static int NumberOfDistinctVectors(int k, int d ,int v) {
return combinations(v-d, d, k-1);
}
//combinations to distribute b identical objects to c groups
//where no group has more than n objects
public static int combinations(int b, int c, int n)
{
int sum = 0;
for(int i = 0; i <= c; i++)
{
if(b+c-1-i*(n+1) >= c-1)
sum += Math.pow(-1, i) * CombinatoricsUtils.binomialCoefficient(c, i)
* CombinatoricsUtils.binomialCoefficient(b+c-1-i*(n+1), c-1);
}
return sum;
}
Let me also quote from the original answer:
"whether this is actually any more useful than the recurrence is
another question"
Here is another way of counting that may be more efficient. It is based on the formula for permutations with repetition. I have added comments in the code hoping it makes it a bit easier to follow.
public static int NumberOfDistinctVectors2(int k, int d, int v)
{
return NumberOfDistinctVectors2_rec(1, 0, k, d, v, 1, 1);
}
public static int NumberOfDistinctVectors2_rec(
int i, /* Current number being added */
int j, /* Amount of already picked numbers */
int k, /* Maximum number that can be picked */
int d, /* Total amount of numbers to pick */
int v, /* Remaining value */
long num, /* Numerator in "permutations with repetition" formula */
long den) /* Denominator in "permutations with repetition" formula */
{
// Amount of remaining numbers to pick
int rem = d - j;
// Remaining value is too big or too small
if (v < i * rem || v > k * rem) return 0;
// If no numbers to add then we are done
if (rem == 0) return Math.toIntExact(num / den);
// If only one number to add this can be used as a "shortcut"
if (rem == 1) return d * Math.toIntExact(num / den);
// Counted permutations
int count = 0;
// Maximum amount of repetitions for the current number
int maxRep = Math.min(v / i, rem);
// Factor to multiply the numerator
int numFactor = 1;
// Factor to multiply the denominator
int denFactor = 1;
// Consider adding repetitions of the current number
for (int r = 1; r <= maxRep; r++)
{
// The numerator is the factorial of the total amount of numbers
numFactor *= (j + r);
// The denominator is the product of the factorials of the number of repetitions of each number
denFactor *= r;
// We add "r" repetitions of the current number and count all possible permutations from there
count += NumberOfDistinctVectors2_rec(i + 1, j + r, k, d, v - i * r, num * numFactor, den * denFactor);
}
// Consider permutations that do not include the current number
count += NumberOfDistinctVectors2_rec(i + 1, j, k, d, v, num, den);
return count;
}
Here is a small class testing it where this method appears to be significantly faster (see it in Rextester).
class NumberOfDistinctVectorsTest
{
// Original method
public static int NumberOfDistinctVectors(int k, int d ,int v)
{
if((v > k * d) || (v < d)) return 0;
if(d == 1 || v == d) return 1;
if(v == d + 1) return d;
int alpha = 1, beta = 0;
if(1 < v + k - k * d)
alpha = v + k - k * d;
if(k < v - d + 1)
beta = k;
else
beta = v - d + 1;
int sum = 0;
for(int i = alpha; i <= beta; i++)
{
sum += NumberOfDistinctVectors(k, d-1, v-i);
}
return sum;
}
// New method
public static int NumberOfDistinctVectors2(int k, int d, int v)
{
return NumberOfDistinctVectors2_rec(1, 0, k, d, v, 1, 1);
}
public static int NumberOfDistinctVectors2_rec(int i, int j, int k, int d, int v, long num, long den)
{
int rem = d - j;
if (v < i * rem || v > k * rem) return 0;
if (rem == 0) return Math.toIntExact(num / den);
if (rem == 1) return d * Math.toIntExact(num / den);
int count = 0;
int maxRep = Math.min(v / i, rem);
int numFactor = 1;
int denFactor = 1;
for (int r = 1; r <= maxRep; r++)
{
numFactor *= (j + r);
denFactor *= r;
count += NumberOfDistinctVectors2_rec(i + 1, j + r, k, d, v - i * r, num * numFactor, den * denFactor);
}
count += NumberOfDistinctVectors2_rec(i + 1, j, k, d, v, num, den);
return count;
}
public static void main(final String[] args)
{
// Test 1
System.out.println(NumberOfDistinctVectors(3, 3, 6));
System.out.println(NumberOfDistinctVectors2(3, 3, 6));
// Test 2
System.out.println(NumberOfDistinctVectors(4, 2, 7));
System.out.println(NumberOfDistinctVectors2(4, 2, 7));
// Test 3
System.out.println(NumberOfDistinctVectors(12, 5, 20));
System.out.println(NumberOfDistinctVectors2(12, 5, 20));
// Test runtime
long startTime, endTime;
int reps = 100;
startTime = System.nanoTime();
for (int i = 0; i < reps; i++)
{
NumberOfDistinctVectors(12, 5, 20);
}
endTime = System.nanoTime();
double t1 = ((endTime - startTime) / (reps * 1000.));
startTime = System.nanoTime();
for (int i = 0; i < reps; i++)
{
NumberOfDistinctVectors2(12, 5, 20);
}
endTime = System.nanoTime();
double t2 = ((endTime - startTime) / (reps * 1000.));
System.out.println("Original method: " + t1 + "ms");
System.out.println("New method: " + t2 + "ms");
}
}
Output:
7
7
2
2
3701
3701
Original method: 45.64331ms
New method: 5.89364ms
EDIT: New test (run on JDoodle with Apache Commons 3.6.1) including SaiBot's answer:
import org.apache.commons.math3.util.CombinatoricsUtils;
public class NumberOfDistinctVectorsTest
{
// Original method
public static int NumberOfDistinctVectors(int k, int d ,int v)
{
if((v > k * d) || (v < d)) return 0;
if(d == 1 || v == d) return 1;
if(v == d + 1) return d;
int alpha = 1, beta = 0;
if(1 < v + k - k * d)
alpha = v + k - k * d;
if(k < v - d + 1)
beta = k;
else
beta = v - d + 1;
int sum = 0;
for(int i = alpha; i <= beta; i++)
{
sum += NumberOfDistinctVectors(k, d-1, v-i);
}
return sum;
}
// jdehesa method
public static int NumberOfDistinctVectors2(int k, int d, int v)
{
return NumberOfDistinctVectors2_rec(1, 0, k, d, v, 1, 1);
}
public static int NumberOfDistinctVectors2_rec(int i, int j, int k, int d, int v, long num, long den)
{
int rem = d - j;
if (v < i * rem || v > k * rem) return 0;
if (rem == 0) return Math.toIntExact(num / den);
if (rem == 1) return d * Math.toIntExact(num / den);
int count = 0;
int maxRep = Math.min(v / i, rem);
int numFactor = 1;
int denFactor = 1;
for (int r = 1; r <= maxRep; r++)
{
numFactor *= (j + r);
denFactor *= r;
count += NumberOfDistinctVectors2_rec(i + 1, j + r, k, d, v - i * r, num * numFactor, den * denFactor);
}
count += NumberOfDistinctVectors2_rec(i + 1, j, k, d, v, num, den);
return count;
}
// SaiBot method
public static int NumberOfDistinctVectors3(int k, int d ,int v)
{
return combinations(v-d, d, k-1);
}
//combinations to distribute b identical objects to c groups
//where no group has more than n objects
public static int combinations(int b, int c, int n)
{
int sum = 0;
for(int i = 0; i <= c; i++)
{
if(b+c-1-i*(n+1) >= c-1)
sum += Math.pow(-1, i) * CombinatoricsUtils.binomialCoefficient(c, i)
* CombinatoricsUtils.binomialCoefficient(b+c-1-i*(n+1), c-1);
}
return sum;
}
public static void main(final String[] args)
{
// Test 1
System.out.println(NumberOfDistinctVectors(3, 3, 6));
System.out.println(NumberOfDistinctVectors2(3, 3, 6));
System.out.println(NumberOfDistinctVectors3(3, 3, 6));
// Test 2
System.out.println(NumberOfDistinctVectors(4, 2, 7));
System.out.println(NumberOfDistinctVectors2(4, 2, 7));
System.out.println(NumberOfDistinctVectors3(4, 2, 7));
// Test 3
System.out.println(NumberOfDistinctVectors(12, 5, 20));
System.out.println(NumberOfDistinctVectors2(12, 5, 20));
System.out.println(NumberOfDistinctVectors3(12, 5, 20));
// Test runtime
long startTime, endTime;
int reps = 100;
startTime = System.nanoTime();
for (int i = 0; i < reps; i++)
{
NumberOfDistinctVectors(12, 5, 20);
}
endTime = System.nanoTime();
double t1 = ((endTime - startTime) / (reps * 1000.));
startTime = System.nanoTime();
for (int i = 0; i < reps; i++)
{
NumberOfDistinctVectors2(12, 5, 20);
}
endTime = System.nanoTime();
double t2 = ((endTime - startTime) / (reps * 1000.));
startTime = System.nanoTime();
for (int i = 0; i < reps; i++)
{
NumberOfDistinctVectors3(12, 5, 20);
}
endTime = System.nanoTime();
double t3 = ((endTime - startTime) / (reps * 1000.));
System.out.println("Original method: " + t1 + "ms");
System.out.println("jdehesa method: " + t2 + "ms");
System.out.println("SaiBot method: " + t3 + "ms");
}
}
Output:
7
7
7
2
2
2
3701
3701
3701
Original method: 97.81325ms
jdehesa method: 7.2753ms
SaiBot method: 2.70861ms
The timings are not very stable in JDoodle (I used it because it allows for Maven dependencies), but in general SaiBot's method is the fastest by far.

Probability of stay in matrix

You are given a matrix of order (M x N). You can move in 4 directions: left, top, right and bottom. You are given initial position (x, y) and number of steps which you can move from the given location. While moving if you go out of the matrix, you are disqualified from the game. What is the probability that you are not disqualified?
I solved the question in the following two ways:
Way 1. Find out total ways say T1 in which you will be inside the matrix and find out total ways T2 in which you will be out of the matrix. Then return T1 / (T1 + T2) as the result.
Way 2. Use the fact that probability of reaching your neighbor is: 1/4 as you can move only in 4 directions from the given position and calculate the result.
But the two approaches are giving different results in many scenarios.
Please find my code below and do let me know where I am mistaken or if there is fault in the approaches.
public class ProbabilityOfStay {
private int[] x = {0, 1, 0, -1};
private int[] y = {-1, 0, 1, 0};
private int ROW;
private int COL;
private int xPos;
private int yPos;
private int steps ;
int[][][] stayDP = null;
int[][][] nonStayDP = null;
float[][][] sp = null;
public ProbabilityOfStay(int R, int C, int x, int y, int steps)
{
this.ROW = R;
this.COL = C;
this.xPos = x;
this.yPos = y;
this.steps = steps;
stayDP = new int[ROW][COL][steps];
nonStayDP = new int[ROW][COL][steps];
sp = new float[ROW][COL][steps];
this.initializeInt(stayDP, -1);
this.initializeInt(nonStayDP, -1);
this.initializeF(sp, -1);
}
private void initializeInt(int[][][] M, int d)
{
for(int i = 0; i < ROW; i++)
{
for(int j = 0; j < COL; j++)
{
for(int k = 0; k < steps; k++)
M[i][j][k] = d;
}
}
}
private void initializeF(float[][][] M, int d)
{
for(int i = 0; i < ROW; i++)
{
for(int j = 0; j < COL; j++)
{
for(int k = 0; k < steps; k++)
M[i][j][k] = d;
}
}
}
private int getTotalStayPath()
{
int p = getStayPaths(xPos, yPos, steps);
return p;
}
private int getStayPaths(int xp, int yp, int s)
{
if(xp < 0 || xp >= ROW || yp < 0 || yp >= COL)
return 0;
if(s == 0)
return 1;
if(stayDP[xp][yp][s-1] != -1)
return stayDP[xp][yp][s-1];
int ans = 0;
for(int i = 0; i < x.length; i++)
{
ans += getStayPaths(xp + x[i], yp + y[i], s-1);
}
return (stayDP[xp][yp][s-1] = ans);
}
private int getTotalNonStayPath()
{
int p = getNonStayPaths(xPos, yPos, steps);
return p;
}
private int getNonStayPaths(int xp, int yp, int s)
{
if(xp < 0 || xp >= ROW || yp < 0 || yp >= COL)
return 1;
if(s == 0)
return 0;
if(nonStayDP[xp][yp][s-1] != -1)
return nonStayDP[xp][yp][s-1];
int ans = 0;
for(int i = 0; i < x.length; i++)
{
ans += getNonStayPaths(xp + x[i], yp + y[i], s - 1);
}
return (nonStayDP[xp][yp][s-1] = ans);
}
private float getStayProbabilityM()
{
float p = getProbability(xPos, yPos, steps);
return p;
}
private float getProbability(int xp, int yp, int s)
{
if(xp < 0 || xp >= ROW || yp < 0 || yp >= COL)
return 0;
if(s == 0)
return 1;
if(sp[xp][yp][s-1] != -1)
return sp[xp][yp][s-1];
float ans = 0.0f;
for(int i = 0; i < x.length; i++)
{
ans += (getProbability(xp + x[i], yp + y[i], s-1)) / 4.0;
}
return (sp[xp][yp][s-1] = ans);
}
public static void main(String[] args)
{
int ROW = 7, COL = 7, x = 3, y = 5, steps = 3; //(x, y) is your position in the matrix.
ProbabilityOfStay pos = new ProbabilityOfStay(ROW, COL, x, y, steps);
int totalStayPaths = pos.getTotalStayPath(); //number of ways in which you can stay in the matrix.
int totalNonStayPaths = pos.getTotalNonStayPath(); ////number of ways in which you can't stay in the matrix.
float stayingProbability = (totalStayPaths / (float)(totalStayPaths + totalNonStayPaths));
float sP_memorization = pos.getStayProbabilityM();
System.out.println("Total stay paths: " + totalStayPaths + ", total non-stay paths: " + totalNonStayPaths + ", Stay probability: " + stayingProbability);
System.out.println("Total probability memoriation: " + sP_memorization);
}
}
If I run the program it prints:
Total stay paths: 56, total non-stay paths: 5
However, this results in a total number of paths of 56+5=61.
There are 4 choices at each of 3 steps, so the total should be 4*4*4 = 64.
I think the issue is that you stop counting as soon as the path goes off the board. This means that the paths are not of equal probability so your calculation by dividing the number of paths is not valid.
If you change the computation to:
float stayingProbability = (totalStayPaths / (float)Math.pow(4,steps));
it prints matching answers.

Generic fast Transpose of non-square matrix CUDA

The SDK provides an example and strategies for tackling a square matrix transpose but is there a good way of performing a transpose on a non square matrix? I have quite a naive implementation currently as follows which is probably terrible:
template<class S>
__global__ void transpose(S *Source, S *Destination, int SizeX, int SizeY) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid<SizeX*SizeY) {
int X = tid % SizeX;
int Y = tid / SizeX;
//(x,y) => (y,x)
int newId = (SizeY*X) + Y;
Destination[newId] = Source[tid];
}
}
Here my idea was to transpose the square part of the matrix with only the necessary threads/blocks (each thread swaps two entries of the square sub matrix), then traverse and transpose the remaining entries.
__global__ void kernelTranspuesta(float *a, float *c, int m, int n) {
int i = threadIdx.x + blockIdx.x*blockDim.x;
int j = threadIdx.y + blockIdx.y*blockDim.y;
int smallest = M < N ? M : N;
while( j < smallest ){
i = threadIdx.x + blockIdx.x*blockDim.x;
while( i < j ){
c[i*m+j] = a[j*n+i];
c[j*m+i] = a[i*n+j];
i+= blockDim.x*gridDim.x;
}
if(i == j)
c[j*m+i] = a[i*n+j];
j+= blockDim.y*gridDim.y;
}
if( M > N ) {
i = threadIdx.x + blockIdx.x*blockDim.x + N;
j = threadIdx.y + blockIdx.y*blockDim.y;
while( i < M ){
j = threadIdx.y + blockIdx.y*blockDim.y;
while( j < N){
c[j*m+i] = a[i*n+j];
j+= blockDim.y*gridDim.y;
}
i+= blockDim.x*gridDim.x;
}
}else{
i = threadIdx.x + blockIdx.x*blockDim.x;
j = threadIdx.y + blockIdx.y*blockDim.y + M;
while( i < M ){
j = threadIdx.y + blockIdx.y*blockDim.y + M;
while( j < N){
c[j*m+i] = a[i*n+j];
j+= blockDim.y*gridDim.y;
}
i+= blockDim.x*gridDim.x;
}
}
}
The kernel call is
dim3 hilos(16,16); // hilos(blockDim.x, blockDim.y)
dim3 bloques(8,8); // bloques(gridDim.x, gridDim.y)
kernelTranspuesta<<<bloques, hilos>>>(aD, cD, m, n);
I tested it on 512x256 and 256x512 matrices, let me know what you think.

Resources