Generic fast Transpose of non-square matrix CUDA

Generic fast Transpose of non-square matrix CUDA - matrix

The SDK provides an example and strategies for tackling a square matrix transpose but is there a good way of performing a transpose on a non square matrix? I have quite a naive implementation currently as follows which is probably terrible:
// Naive out-of-place transpose, one element per thread.
// Source is read as a flat array of SizeX * SizeY elements with SizeX elements per row;
// the element at column X, row Y is stored in Destination at offset SizeY * X + Y.
// Only the x components of the launch configuration are used; threads whose flat
// index falls outside the matrix do nothing.
template<class S>
__global__ void transpose(S *Source, S *Destination, int SizeX, int SizeY) {
    const int flat = threadIdx.x + blockIdx.x * blockDim.x;
    if (flat >= SizeX * SizeY) {
        return;
    }
    const int col = flat % SizeX;
    const int row = flat / SizeX;
    // (col, row) => (row, col)
    Destination[(SizeY * col) + row] = Source[flat];
}

Here my idea was to transpose the square part of the matrix with only the necessary threads/blocks (each thread swaps two entries of the square sub matrix), then traverse and transpose the remaining entries.
// Out-of-place transpose of the row-major m x n matrix `a` into the row-major
// n x m matrix `c`, i.e. c[j*m + i] = a[i*n + j].
//
// Phase 1 handles the leading square of side min(m, n): a thread at (i, j) with
// i < j copies both mirrored entries, and the thread that lands on i == j copies
// the diagonal entry. Phase 2 copies the remaining rows (m > n) or columns
// (otherwise). Every loop advances by the total number of threads launched along
// that axis (grid-stride), so any 2-D launch configuration covers the matrix.
//
// Fix: the original body referred to undeclared identifiers M and N; the matrix
// dimensions are the parameters m and n.
__global__ void kernelTranspuesta(float *a, float *c, int m, int n) {
    const int i0 = threadIdx.x + blockIdx.x * blockDim.x;
    const int j0 = threadIdx.y + blockIdx.y * blockDim.y;
    const int strideX = blockDim.x * gridDim.x;
    const int strideY = blockDim.y * gridDim.y;
    const int smallest = m < n ? m : n;

    // Phase 1: leading smallest x smallest square.
    for (int j = j0; j < smallest; j += strideY) {
        int i = i0;
        for (; i < j; i += strideX) {
            c[i * m + j] = a[j * n + i];
            c[j * m + i] = a[i * n + j];
        }
        if (i == j) {
            // Diagonal entry of the square part.
            c[j * m + i] = a[i * n + j];
        }
    }

    // Phase 2: the part of the matrix outside the leading square.
    if (m > n) {
        // Extra rows n..m-1 of `a`.
        for (int i = i0 + n; i < m; i += strideX) {
            for (int j = j0; j < n; j += strideY) {
                c[j * m + i] = a[i * n + j];
            }
        }
    } else {
        // Extra columns m..n-1 of `a` (nothing to do when m == n).
        for (int i = i0; i < m; i += strideX) {
            for (int j = j0 + m; j < n; j += strideY) {
                c[j * m + i] = a[i * n + j];
            }
        }
    }
}
The kernel call is
// Launch configuration: 16 x 16 threads per block on an 8 x 8 grid of blocks.
// ("hilos" = threads, "bloques" = blocks.)
dim3 hilos(16,16); // hilos(blockDim.x, blockDim.y)
dim3 bloques(8,8); // bloques(gridDim.x, gridDim.y)
// aD / cD are presumably the device copies of the input and output matrices,
// and m / n the matrix dimensions — their definitions are not shown here.
kernelTranspuesta<<<bloques, hilos>>>(aD, cD, m, n);
I tested it on 512x256 and 256x512 matrices, let me know what you think.

Related

Runtime error for large inputs for sorting ( quicksort)

This is a very simple program where the user inputs (x,y) coordinates and distance 'd' and the program has to find out the number of unrepeated coordinates from (x,y) to (x+d,y).
For eg: if input for one test case is: 4,9,2 then the unrepeated coordinates are (4,9),(5,9) and (6,9)(x=4,y=9,d=2). I have used a sorting algorithm as mentioned in the question (to keep track of multiple occurrences) however the program shows runtime error for test cases beyond 30. Is there any mistake in the code or is it an issue with my compiler?
For a detailed explanation of question: https://www.hackerearth.com/practice/algorithms/sorting/merge-sort/practice-problems/algorithm/missing-soldiers-december-easy-easy/
#include <stdio.h>
#include <stdlib.h>
/*
 * Partitions arr[p..r] (inclusive) around the pivot arr[r].
 * On return every element left of the returned index is <= pivot, every
 * element right of it is > pivot, and the pivot sits at the returned index.
 */
int partition(int *arr, int p, int r) {
    const int pivot = arr[r];
    int boundary = p; /* first slot of the "greater than pivot" region */
    for (int scan = p; scan < r; ++scan) {
        if (arr[scan] > pivot) {
            continue;
        }
        const int held = arr[boundary];
        arr[boundary] = arr[scan];
        arr[scan] = held;
        ++boundary;
    }
    /* Drop the pivot between the two regions. */
    const int held = arr[boundary];
    arr[boundary] = arr[r];
    arr[r] = held;
    return boundary;
}
/*
 * Sorts arr[p..r] (inclusive bounds) in ascending order by recursively
 * partitioning around the pivot chosen by partition().
 * Ranges with fewer than two elements (p >= r) are left untouched.
 */
void quicksort(int *arr, int p, int r) {
    if (p >= r) {
        return;
    }
    const int pivot_at = partition(arr, p, r);
    quicksort(arr, p, pivot_at - 1);
    quicksort(arr, pivot_at + 1, r);
}
/*
 * Returns the number of distinct values in the sorted array A[0..ct-1].
 *
 * Fix: the original compared A[i] with A[i + 1] for every i < ct, which reads
 * one element past the end on the last iteration and makes the result depend
 * on whatever happens to be stored there. Each element is now compared with
 * its predecessor instead, and an empty range yields 0.
 */
int count(int A[],int ct) {
    if (ct <= 0) {
        return 0;
    }
    int cnt = 1; /* the first element always starts a new run */
    for (int i = 1; i < ct; ++i) {
        if (A[i] != A[i - 1]) {
            cnt++;
        }
    }
    return cnt;
}
int main() {
int t;
scanf("%d", &t);
long int tmp, y, d;
int ct = 0;
int i = 0;
int x[1000];
int j = 0;
for (int l = 0; l < t; ++l) {
scanf("%d%d%d", &tmp, &y, &d);
ct = ct + d + 1; //this counts the total no of coordinates for each (x,y,d)
for (int i = 0; i <= d; ++i) {
x[j] = tmp + i; //storing all possible the x and x+d coordinates
j++;
}
}
int cnt;
int p = ct - 1;
quicksort(x, 0, p); //quicksort sorting
for (int l = 0; l < ct; ++l) {
printf("%d ", x[l]); //prints sorted array not necessary to question
}
cnt = count(x, ct); //counts the number of non-repeated vertices
printf("%d\n", cnt);
}

The problem is that the array int x[1000] is too small for the given data: each test case stores d + 1 values, so the total can exceed 1000 and the program writes past the end of the array.

Find the number of intersections of n line segments with endpoints on two parallel lines

Finding the number of intersections of n line segments with endpoints on two parallel lines.
Let there be two sets of n points:
A={p1,p2,…,pn} on y=0
B={q1,q2,…,qn} on y=1
Each point pi is connected to its corresponding point qi to form a line segment.
I need to write a code using divide-and-conquer algorithm which returns the number of intersection points of all n line segments.
for example:
input:
3
1 101
-234 234
567 765
output:
1
I wrote the code below, but it gives wrong answers.
Can anyone help me fix this code, or suggest another solution to the problem?
#include<iostream>
#include <vector>
#include<algorithm>
using namespace std;
// Counting intersections = sort the segments by their endpoint on y = 0
// (pair.first) and count inversions among the endpoints on y = 1 (pair.second).
//
// Fixes relative to the original:
//  - every function took the vector BY VALUE, so no sorting ever reached the
//    caller (and each recursive call copied the whole vector); they now take it
//    by reference;
//  - merge2 added (m - i) per inversion, mixing an absolute index with an index
//    relative to the left half; the number of remaining left elements is n1 - i;
//  - mergeSort2 passed m + 1 as the midpoint although merge2 treats its `m`
//    argument as the LAST index of the left half.
//
// NOTE(review): the count is accumulated in an int, as in the original; it can
// overflow for very large n.

// Merges the two runs vect[l..m] and vect[m+1..r], each already sorted by
// .first, into one run sorted by .first (stable: ties keep left-before-right).
void merge1(std::vector< std::pair<int, int> >& vect, int l, int m, int r)
{
    const std::vector< std::pair<int, int> > left(vect.begin() + l, vect.begin() + m + 1);
    const std::vector< std::pair<int, int> > right(vect.begin() + m + 1, vect.begin() + r + 1);
    const int n1 = static_cast<int>(left.size());
    const int n2 = static_cast<int>(right.size());
    int i = 0;
    int j = 0;
    int k = l;
    while (i < n1 && j < n2) {
        if (left[i].first <= right[j].first) {
            vect[k++] = left[i++];
        } else {
            vect[k++] = right[j++];
        }
    }
    while (i < n1) {
        vect[k++] = left[i++];
    }
    while (j < n2) {
        vect[k++] = right[j++];
    }
}

// Merges vect[l..m] and vect[m+1..r], each already sorted by .second, and
// returns the number of pairs (a from the left run, b from the right run) with
// a.second >= b.second. Equal .second values are counted, as in the original
// comparison, i.e. segments touching on y = 1 count as intersecting.
int merge2(std::vector< std::pair<int, int> >& vect, int l, int m, int r)
{
    const std::vector< std::pair<int, int> > left(vect.begin() + l, vect.begin() + m + 1);
    const std::vector< std::pair<int, int> > right(vect.begin() + m + 1, vect.begin() + r + 1);
    const int n1 = static_cast<int>(left.size());
    const int n2 = static_cast<int>(right.size());
    int inv_count = 0;
    int i = 0;
    int j = 0;
    int k = l;
    while (i < n1 && j < n2) {
        if (left[i].second < right[j].second) {
            vect[k++] = left[i++];
        } else {
            vect[k++] = right[j++];
            // right[j] precedes every element still waiting in the left run.
            inv_count += n1 - i;
        }
    }
    while (i < n1) {
        vect[k++] = left[i++];
    }
    while (j < n2) {
        vect[k++] = right[j++];
    }
    return inv_count;
}

// Sorts vect[l..r] in place by .first (merge sort). Does nothing when l >= r.
void mergeSort1(std::vector< std::pair<int, int> >& vect, int l, int r) {
    if (l >= r) {
        return;
    }
    const int m = l + (r - l) / 2;
    mergeSort1(vect, l, m);
    mergeSort1(vect, m + 1, r);
    merge1(vect, l, m, r);
}

// Sorts vect[l..r] in place by .second and returns the number of inversions
// of .second that the range contained. Returns 0 when l >= r.
int mergeSort2(std::vector< std::pair<int, int> >& vect, int l, int r) {
    int inv_count = 0;
    if (r > l) {
        const int m = l + (r - l) / 2;
        inv_count += mergeSort2(vect, l, m);
        inv_count += mergeSort2(vect, m + 1, r);
        // Merge the two halves, counting cross-half inversions.
        inv_count += merge2(vect, l, m, r);
    }
    return inv_count;
}
// Reads n followed by n endpoint pairs, orders the segments by their first
// endpoint and prints the value returned by the inversion-counting pass.
int main() {
    int n;
    std::cin >> n;
    std::vector< std::pair<int, int> > vect;
    for (int read = 0; read < n; ++read) {
        int a, b;
        std::cin >> a >> b;
        vect.push_back(std::make_pair(a, b));
    }
    const int last = n - 1;
    mergeSort1(vect, 0, last);
    std::cout << mergeSort2(vect, 0, last);
}

I'd take advantage of the idea that computing whether the segments intersect is much simpler than computing where they intersect. Two segments intersect if their x values are on different sides of one another on y=1 and y=0. (Conversely, they do not intersect if both x values of one segment are smaller than the corresponding x values of the other, or both are larger.)
Objects make this easy to state. Build a segment object whose main job is to determine whether it intersects another instance.
/**
 * A segment described by a two-element array: its x coordinate on one of the
 * parallel lines (x[0]) and on the other (x[1]).
 */
class Segment {
  constructor(x) {
    const first = x[0];
    const second = x[1];
    this.x0 = first;
    this.x1 = second;
  }

  /**
   * Answers whether the receiver intersects the passed segment.
   * The problem statement is ambiguous about shared endpoints; they are
   * treated as intersections here.
   */
  intersects(segment) {
    const sharesEndpoint = this.x0 === segment.x0 || this.x1 === segment.x1;
    if (sharesEndpoint) {
      return true;
    }
    // The segments cross exactly when their order differs between the two lines.
    const beforeOnFirstLine = this.x0 < segment.x0;
    const beforeOnSecondLine = this.x1 < segment.x1;
    return beforeOnFirstLine !== beforeOnSecondLine;
  }
}
// Sample input: one [x on first line, x on second line] pair per segment.
let input = [[1, 101], [-234, 234], [567, 765]];
let segments = input.map(endpoints => new Segment(endpoints));
// Every unordered pair of segments, each listed once.
let pairs = segments.flatMap((first, idx) =>
  segments.slice(idx + 1).map(second => [first, second])
);
// Number of pairs that intersect.
let intersections = pairs.filter(([first, second]) => first.intersects(second)).length;
console.log(intersections)

You can also see the problem by abstracting from all the lines.
If there were no intersection that would mean that the order of indexes on both parallel lines are the same.
So the number of intersections is equal to the number of swaps of neighboring points you need to perform to get the same order of indexes on both sides.
In your example you have the two sequences of indexes
1,3,4,2 on the upper line
2,1,4,3 on the lower line
to convert the lower sequence by swapping neighbours, you need 4 swaps:
2,1,4,3 start
-> 1,2,4,3
-> 1,4,2,3
-> 1,4,3,2
-> 1,3,4,2 = upper sequence

Maximal Square with 0 inside

The question Maximal Square in https://leetcode.com/problems/maximal-square/description/ is easy to solve by DP. But how to solve the following up question:
Similar as Maximal Square question, but allows 0's inside a square, "inside" means the border of the square must be all 1.
For example, given the following matrix:
1 0 1 0 0
1 0 1 1 1
1 1 1 0 1
1 0 1 1 1
Return 9.
Update: Because the 3*3 matrix in the right bottom corner matches the requirement, the border must be all 1, and there can be 0 inside the square.
I thought up a O(n^3) algorithm: take maze[i][j] as the right bottom corner of the square if maze[i][j] == 1, enumerate the edge length of the square. If edge length is 3, consider whether maze[i - 2][j - 2], maze[i][j - 2], maze[i - 2][j], maze[i][j] forms a square with the numbers in each edge are all 1.
Is there any better algorithm?

Your problem can be solved in O (n * m) time and space complexity, where n is total rows and m is total columns in matrix. You may look at the code below where I have commented out to make it understandable.
Please, let me know if you have any doubt.
#include <bits/stdc++.h>
using namespace std;
// Fills rowSum[i][j] with the length of the run of consecutive 1s in row i
// that ends at column j (0 when grid[i][j] is not 1). Visits the first n rows
// and m columns of grid.
void precalRowSum(std::vector< std::vector<int> >& grid, std::vector< std::vector<int> >&rowSum, int n, int m) {
    for (int r = 0; r < n; ++r) {
        int run = 0;
        for (int c = 0; c < m; ++c) {
            run = (grid[r][c] == 1) ? run + 1 : 0;
            rowSum[r][c] = run;
        }
    }
}
// Fills colSum[i][j] with the length of the run of consecutive 1s in column j
// that ends at row i (0 when grid[i][j] is not 1). Visits the first n rows
// and m columns of grid.
void precalColSum(std::vector< std::vector<int> >& grid, std::vector< std::vector<int> >&colSum, int n, int m) {
    for (int c = 0; c < m; ++c) {
        int run = 0;
        for (int r = 0; r < n; ++r) {
            run = (grid[r][c] == 1) ? run + 1 : 0;
            colSum[r][c] = run;
        }
    }
}
// Returns the area of the largest axis-aligned square in the n x m grid whose
// border consists only of 1s (the interior may contain anything), or 0 if the
// grid contains no 1.
//
// Fix: the original only recognised squares whose interior was a solid block
// of zeros, so it returned 0 for e.g. an all-ones grid and never reported
// squares of side 1 or 2. This version treats every cell as a candidate
// bottom-right corner and uses the run-length tables to check all four edges.
//
// rowSum[i][j] / colSum[i][j] hold the length of the run of 1s ending at
// (i, j) going left / up. For a square of side s with bottom-right corner (i, j):
//   bottom edge: rowSum[i][j]         >= s
//   right  edge: colSum[i][j]         >= s
//   top    edge: rowSum[i - s + 1][j] >= s
//   left   edge: colSum[i][j - s + 1] >= s
//
// Time O(n * m * min(n, m)), extra space O(n * m).
int solve(std::vector< std::vector<int> >& grid, int n, int m) {
    if (n <= 0 || m <= 0) {
        return 0;
    }
    std::vector< std::vector<int> > rowSum(n, std::vector<int>(m, 0));
    std::vector< std::vector<int> > colSum(n, std::vector<int>(m, 0));
    precalRowSum(grid, rowSum, n, m);
    precalColSum(grid, colSum, n, m);

    int best = 0; // side length of the best square found so far
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < m; ++j) {
            // The bottom and right edges limit the side; side <= run length
            // also guarantees i - side + 1 >= 0 and j - side + 1 >= 0.
            int side = std::min(rowSum[i][j], colSum[i][j]);
            // Only sides that would improve on `best` are worth checking.
            for (; side > best; --side) {
                if (rowSum[i - side + 1][j] >= side &&
                    colSum[i][j - side + 1] >= side) {
                    best = side;
                    break;
                }
            }
        }
    }
    return best * best;
}
// Reads the grid dimensions n and m, then n rows of m integers, and prints an
// empty line followed by the result of solve() for that grid.
int main() {
    int n, m;
    std::cin >> n >> m;
    std::vector< std::vector<int> > grid;
    for (int r = 0; r < n; ++r) {
        std::vector<int> row;
        for (int c = 0; c < m; ++c) {
            int cell;
            std::cin >> cell;
            row.push_back(cell);
        }
        grid.push_back(row);
    }
    std::cout << std::endl;
    const int answer = solve(grid, n, m);
    std::cout << answer << std::endl;
    return 0;
}

3d point closest to multiple lines in 3D space

I am searching for a non-iterative, closed-form algorithm to find the least-squares solution for the point closest to a set of 3D lines. It is similar to 3D point triangulation (minimizing re-projection errors) but seems to be simpler and faster?
Lines can be described in any form, 2 points, point and unit direction or similar.

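Let the i-th line be given by point $a_i$ and unit direction vector $d_i$. We need to find the single point $p$ that minimizes the sum of squared point-to-line distances. This is where the gradient is the zero vector:
$$\nabla_p \sum_i \left[ (p - a_i)\cdot(p - a_i) - \big((p - a_i)\cdot d_i\big)^2 \right] = 0$$
Expanding the gradient,
$$\sum_i \left[ 2\,(p - a_i) - 2\,\big((p - a_i)\cdot d_i\big)\, d_i \right] = 0$$
Algebra yields a canonical 3x3 linear system,
$$M p = b$$
where the k'th row (a 3-element row vector) of matrix M is
$$M_k = \sum_i \left[ d_{ik}\, d_i^T - (d_i \cdot d_i)\, e_k^T \right]$$
with vector $e_k$ the respective unit basis vector, and
$$b = \sum_i \left[ d_i\,(a_i \cdot d_i) - a_i\,(d_i \cdot d_i) \right]$$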
It's not hard to turn this into code. I borrowed (and fixed a small bug in) a Gaussian elimination function from Rosettacode to solve the system. Thanks to the author!
#include <stdio.h>
#include <math.h>
typedef double VEC[3]; // 3-component vector
typedef VEC MAT[3];    // 3x3 matrix stored as three row vectors
void solve(double *a, double *b, double *x, int n); // linear solver
// Dot product of two 3-component vectors.
double dot(VEC a, VEC b) {
  double acc = a[0] * b[0];
  for (int k = 1; k < 3; ++k) acc += a[k] * b[k];
  return acc;
}
// Computes into p the point obtained by solving the 3x3 system lhs * p = rhs,
// where over the n lines (base point a[i], direction d[i]):
//   lhs = sum_i ( d_i d_i^T - (d_i . d_i) I )
//   rhs = sum_i ( d_i (d_i . a_i) - a_i (d_i . d_i) )
void find_nearest_point(VEC p, VEC a[], VEC d[], int n) {
  MAT lhs = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
  VEC rhs = {0, 0, 0};
  for (int line = 0; line < n; ++line) {
    double *dir = d[line];
    double *base = a[line];
    const double dir_sq = dot(dir, dir);
    const double dir_dot_base = dot(dir, base);
    for (int row = 0; row < 3; ++row) {
      for (int col = 0; col < 3; ++col) {
        lhs[row][col] += dir[row] * dir[col];
      }
      lhs[row][row] -= dir_sq;
      rhs[row] += dir[row] * dir_dot_base - base[row] * dir_sq;
    }
  }
  solve(&lhs[0][0], rhs, p, 3);
}
// Debug printing.
void pp(VEC v, char *l, char *r) {
printf("%s%.3lf, %.3lf, %.3lf%s", l, v[0], v[1], v[2], r);
}
void pv(VEC v) { pp(v, "(", ")"); }
void pm(MAT m) { for (int i = 0; i < 3; ++i) pp(m[i], "\n[", "]"); }
// A simple verifier.
// Returns (d.d) * |a - p|^2 - (d . (a - p))^2, which for a unit direction d is
// the squared distance from p to the line through a with direction d.
double dist2(VEC p, VEC a, VEC d) {
  VEC pa;
  pa[0] = a[0] - p[0];
  pa[1] = a[1] - p[1];
  pa[2] = a[2] - p[2];
  const double along = dot(d, pa);
  return dot(d, d) * dot(pa, pa) - along * along;
}
// Sum of dist2(p, a[i], d[i]) over the n lines.
double sum_dist2(VEC p, VEC a[], VEC d[], int n) {
  double total = 0;
  int line = 0;
  while (line < n) {
    total += dist2(p, a[line], d[line]);
    ++line;
  }
  return total;
}
// Evaluates sum_dist2 at p and at the 26 neighbours offset by +/-D along each
// axis, and returns non-zero when p itself has the smallest value (p wins ties).
int is_nearest(VEC p, VEC a[], VEC d[], int n) {
  double best = 1e100;
  int best_i = 2, best_j = 2, best_k = 2; // 2 = "no offset chosen yet"
#define D 0.01
  for (int i = -1; i <= 1; ++i) {
    for (int j = -1; j <= 1; ++j) {
      for (int k = -1; k <= 1; ++k) {
        VEC probe = { p[0] + D * i, p[1] + D * j, p[2] + D * k };
        const double cost = sum_dist2(probe, a, d, n);
        const int is_center = (i == 0 && j == 0 && k == 0);
        // Prefer the provided point among equals.
        if (cost < best || (is_center && cost == best)) {
          best = cost;
          best_i = i;
          best_j = j;
          best_k = k;
        }
      }
    }
  }
  return best_i == 0 && best_j == 0 && best_k == 0;
}
// Scales v in place by 1 / |v|. No special handling of a zero-length vector.
void normalize(VEC v) {
  const double len = sqrt(dot(v, v));
  for (int k = 0; k < 3; ++k) {
    v[k] /= len;
  }
}
// Demo: four sample lines, normalised directions, print the computed point and
// complain if a neighbouring point turns out to be closer.
int main(void) {
  VEC a[] = {{-14.2, 17, -1}, {1, 1, 1}, {2.3, 4.1, 9.8}, {1,2,3}};
  VEC d[] = {{1.3, 1.3, -10}, {12.1, -17.2, 1.1}, {19.2, 31.8, 3.5}, {4,5,6}};
  const int n = 4;
  int line = 0;
  while (line < n) {
    normalize(d[line]);
    ++line;
  }
  VEC p;
  find_nearest_point(p, a, d, n);
  pv(p);
  printf("\n");
  const int ok = is_nearest(p, a, d, n);
  if (!ok) {
    printf("Woops. Not nearest.\n");
  }
  return 0;
}
// A linear solver from rosettacode (with bug fix: added a missing fabs())
// mat_elem yields a pointer to element (y, x) of the row-major n-column matrix a.
#define mat_elem(a, y, x, n) (a + ((y) * (n) + (x)))
// Exchanges rows r1 and r2 of the n-column matrix a together with the matching
// entries of the right-hand side b. A no-op when r1 == r2.
void swap_row(double *a, double *b, int r1, int r2, int n)
{
  if (r1 == r2) return;
  double *first = mat_elem(a, r1, 0, n);
  double *second = mat_elem(a, r2, 0, n);
  for (int col = 0; col < n; ++col) {
    const double held = first[col];
    first[col] = second[col];
    second[col] = held;
  }
  const double held_rhs = b[r1];
  b[r1] = b[r2];
  b[r2] = held_rhs;
}
// Solves the n x n system a * x = b by Gaussian elimination with partial
// pivoting followed by back substitution. a (row-major) and b are overwritten.
void solve(double *a, double *b, double *x, int n)
{
#define A(y, x) (*mat_elem(a, y, x, n))
  // Forward elimination.
  for (int pivot = 0; pivot < n; ++pivot) {
    // Pick the row with the largest magnitude in the pivot column.
    int best_row = pivot;
    double best_mag = fabs(A(pivot, pivot));
    for (int row = pivot + 1; row < n; ++row) {
      const double mag = fabs(A(row, pivot));
      if (mag > best_mag) {
        best_row = row;
        best_mag = mag;
      }
    }
    swap_row(a, b, pivot, best_row, n);
    // Eliminate the pivot column from every row below.
    for (int row = pivot + 1; row < n; ++row) {
      const double factor = A(row, pivot) / A(pivot, pivot);
      for (int col = pivot + 1; col < n; ++col) {
        A(row, col) -= factor * A(pivot, col);
      }
      A(row, pivot) = 0;
      b[row] -= factor * b[pivot];
    }
  }
  // Back substitution, last row first.
  for (int row = n - 1; row >= 0; --row) {
    double acc = b[row];
    for (int col = n - 1; col > row; --col) {
      acc -= x[col] * A(row, col);
    }
    x[row] = acc / A(row, row);
  }
#undef A
}
This isn't extensively tested, but seems to be working fine.

Let base point of line is p and unit direction vector is d.
Then distance from point v to this line might be calculated using cross product
SquaredDist = ((v - p) x d)^2
Using Maple packet symbolic calculation, we can get
d := <dx, dy, dz>;
v := <vx, vy, vz>;
p := <px, py, pz>;
w := v - p;
cp := CrossProduct(d, w);
nrm := BilinearForm(cp, cp, conjugate=false); //squared dist
nr := expand(nrm);
//now partial derivatives
nrx := diff(nr, vx);
//results:
nrx := -2*dz^2*px-2*dy^2*px+2*dz^2*vx+2*dy^2*vx
+2*dx*py*dy-2*dx*vy*dy+2*dz*dx*pz-2*dz*dx*vz
nry := -2*dx^2*py-2*dz^2*py-2*dy*vz*dz+2*dx^2*vy
+2*dz^2*vy+2*dy*pz*dz+2*dx*dy*px-2*dx*dy*vx
nrz := -2*dy^2*pz+2*dy^2*vz-2*dy*dz*vy+2*dx^2*vz
-2*dx^2*pz-2*dz*vx*dx+2*dy*dz*py+2*dz*px*dx
To minimize sum of squared distances, we have to make system of linear equations for zero partial derivatives like this:
vx*2*(Sum(dz^2)+Sum(dy^2)) + vy * (-2*Sum(dx*dy)) + vz *(-2*Sum(dz*dx)) =
2*Sum(dz^2*px)+2*Sum(dy^2*px) -2*Sum(dx*py*dy)-2*Sum(dz*dx*pz)
where
Sum(dz^2) = Sum{over all i in line indexes} {dz[i] * dz[i]}
and solve it for unknowns vx, vy, vz
Edit: Old erroneous answer for planes instead of lines, left for reference
If we use general equation of line
A * x + B * y + C * z + D = 0
then distance from point (x, y, z) to this line is
Dist = Abs(A * x + B * y + C * z + D) / Sqrt(A^2 + B^2 + C^2)
To simplify - just normalize all line equations dividing by Norm's
Norm = Sqrt(A^2 + B^2 + C^2)
a = A / Norm
b = B / Norm
c = C / Norm
d = D / Norm
now equation is
a * x + b * y + c * z + d = 0
and distance
Dist = Abs(a * x + b * y + c * z + d)
and we can use squared distances like LS method (ai, bi, ci, di are coefficients for i-th line)
F = Sum(ai*x + bi*y + ci * z + d)^2 =
Sum(ai^2*x^2 + bi^2*y^2 + ci^2*z^2 + d^2 +
2 * (ai*bi*x*y + ai*ci*x*z + bi*y*ci*z + ai*x*di + bi*y*di + ci*z*di))
partial derivatives
dF/dx = 2*Sum(ai^2*x + ai*bi*y + ai*ci*z + ai*di) = 0
dF/dy = 2*Sum(bi^2*y + ai*bi*x + bi*ci*z + bi*di) = 0
dF/dz = 2*Sum(ci^2*z + ai*ci*x + bi*ci*y + ci*di) = 0
so we have system of linear equation
x * Sum(ai^2) + y * Sum(ai*bi) + z * Sum(ai*ci)= - Sum(ai*di)
y * Sum(bi^2) + x * Sum(ai*bi) + z * Sum(bi*ci)= - Sum(bi*di)
z * Sum(ci^2) + x * Sum(ai*ci) + y * Sum(bi*ci)= - Sum(ci*di)
x * Saa + y * Sab + z * Sac = - Sad
x * Sab + y * Sbb + z * Sbc = - Sbd
x * Sac + y * Sbc + z * Scc = - Scd
where S** are corresponding sums
and can solve it for unknowns x, y, z

I needed this for a sketch in Processing, so I ported Gene's answer. Works great and thought it might save someone else a little time. Unfortunately PVector/PMatrix don't have array accessors for vectors or matrices so I had to add these as local functions.
// Component accessors for PVector: index 0 -> x, 1 -> y, anything else -> z.

// Returns component i of v.
float getv(PVector v, int i) {
  switch (i) {
    case 0:
      return v.x;
    case 1:
      return v.y;
    default:
      return v.z;
  }
}
// Stores value into component i of v.
void setv(PVector v, int i, float value) {
  switch (i) {
    case 0:
      v.x = value;
      break;
    case 1:
      v.y = value;
      break;
    default:
      v.z = value;
      break;
  }
}
// Adds value to component i of v.
void incv(PVector v, int i, float value) {
  float updated = getv(v, i) + value;
  setv(v, i, updated);
}
// Element accessors for a matrix stored row-major in a flat array, 4 floats per row.
float getm(float[] mm, int r, int c) {
  return mm[r*4 + c];
}
void setm(float[] mm, int r, int c, float value) {
  mm[r*4 + c] = value;
}
void incm(float[] mm, int r, int c, float value) {
  int slot = r*4 + c;
  mm[slot] = mm[slot] + value;
}
// Builds the 3x3 system  sum_i (d_i d_i^T - (d_i.d_i) I) p = sum_i (d_i (d_i.a_i) - a_i (d_i.d_i))
// from the lines (base point a[i], direction d[i]) and returns its solution p.
PVector findNearestPoint(PVector a[], PVector d[]) {
  float[] lhs = new float[16];
  PVector rhs = new PVector();
  int lineCount = a.length;
  for (int line = 0; line < lineCount; ++line) {
    PVector dir = d[line];
    PVector base = a[line];
    float dirSq = dir.dot(dir);
    float dirDotBase = dir.dot(base);
    for (int row = 0; row < 3; ++row) {
      for (int col = 0; col < 3; ++col) {
        incm(lhs, row, col, getv(dir, row) * getv(dir, col));
      }
      incm(lhs, row, row, -dirSq);
      incv(rhs, row, getv(dir, row) * dirDotBase - getv(base, row) * dirSq);
    }
  }
  float[] solution = solve(lhs, new float[] {rhs.x, rhs.y, rhs.z});
  return new PVector(solution[0], solution[1], solution[2]);
}
// Verifier
// Returns (d.d) * |a - p|^2 - (d . (a - p))^2, i.e. for a unit direction d the
// squared distance from p to the line through a with direction d.
float dist2(PVector p, PVector a, PVector d) {
  PVector offset = new PVector( a.x-p.x, a.y-p.y, a.z-p.z );
  float along = d.dot(offset);
  float dirSq = d.dot(d);
  return dirSq * offset.dot(offset) - along * along;
}
// Sum of dist2(p, a[i], d[i]) over all lines (a.length of them).
float sum_dist2(PVector p, PVector a[], PVector d[]) {
  float total = 0;
  int line = 0;
  while (line < a.length) {
    total += dist2(p, a[line], d[line]);
    ++line;
  }
  return total;
}
// Evaluates sum_dist2 at p and at the 26 neighbours offset by +/-D along each
// axis, and returns true when p itself has the smallest value (p wins ties).
boolean isNearest(PVector p, PVector a[], PVector d[]) {
  float best = 3.4028235E38;
  int bestI = 2, bestJ = 2, bestK = 2;  // 2 = "no offset chosen yet"
  final float D = 0.1f;
  for (int i = -1; i <= 1; ++i) {
    for (int j = -1; j <= 1; ++j) {
      for (int k = -1; k <= 1; ++k) {
        PVector probe = new PVector( p.x + D * i, p.y + D * j, p.z + D * k );
        float cost = sum_dist2(probe, a, d);
        boolean isCenter = (i == 0 && j == 0 && k == 0);
        // Prefer provided point among equals.
        if (cost < best || (isCenter && cost == best)) {
          best = cost;
          bestI = i;
          bestJ = j;
          bestK = k;
        }
      }
    }
  }
  return bestI == 0 && bestJ == 0 && bestK == 0;
}
// Sketch entry point: runs the solver on four sample lines, prints the result
// and reports when a neighbouring point turns out to be closer.
void setup() {
  PVector a[] = {
    new PVector(-14.2, 17, -1),
    new PVector(1, 1, 1),
    new PVector(2.3, 4.1, 9.8),
    new PVector(1,2,3)
  };
  PVector d[] = {
    new PVector(1.3, 1.3, -10),
    new PVector(12.1, -17.2, 1.1),
    new PVector(19.2, 31.8, 3.5),
    new PVector(4,5,6)
  };
  // Directions must be unit length before solving.
  int line = 0;
  while (line < 4) {
    d[line].normalize();
    ++line;
  }
  PVector p = findNearestPoint(a, d);
  println(p);
  boolean ok = isNearest(p, a, d);
  if (!ok) {
    println("Woops. Not nearest.\n");
  }
}
// From rosettacode (with bug fix: added a missing fabs())
// Flat index of element (y, x) in a matrix stored with 4 floats per row.
int mat_elem(int y, int x) {
  return y*4+x;
}
// Exchanges the first n entries of rows r1 and r2 of a, and the matching
// entries of b. A no-op when r1 == r2.
void swap_row(float[] a, float[] b, int r1, int r2, int n)
{
  if (r1 == r2) return;
  for (int col = 0; col < n; col++) {
    int first = mat_elem(r1, col);
    int second = mat_elem(r2, col);
    float held = a[first];
    a[first] = a[second];
    a[second] = held;
  }
  float heldRhs = b[r1];
  b[r1] = b[r2];
  b[r2] = heldRhs;
}
// Solves the 3x3 system a * x = b (a stored with 4 floats per row) by Gaussian
// elimination with partial pivoting and back substitution; returns x.
// a and b are overwritten.
float[] solve(float[] a, float[] b)
{
  float[] x = new float[] {0,0,0};
  int n = x.length;
  // Forward elimination.
  for (int pivot = 0; pivot < n; pivot++) {
    // Pick the row with the largest magnitude in the pivot column.
    int bestRow = pivot;
    float bestMag = abs(getm(a, pivot, pivot));
    for (int row = pivot + 1; row < n; row++) {
      float mag = abs(getm(a, row, pivot));
      if (mag > bestMag) {
        bestRow = row;
        bestMag = mag;
      }
    }
    swap_row(a, b, pivot, bestRow, n);
    // Eliminate the pivot column from every row below.
    for (int row = pivot + 1; row < n; row++) {
      float factor = getm(a, row, pivot) / getm(a, pivot, pivot);
      for (int col = pivot+1; col < n; col++) {
        incm(a, row, col, -factor * getm(a, pivot, col));
      }
      setm(a, row, pivot, 0);
      b[row] -= factor * b[pivot];
    }
  }
  // Back substitution, last row first.
  for (int row = n - 1; row >= 0; row--) {
    float acc = b[row];
    for (int col = n - 1; col > row; col--) {
      acc -= x[col] * getm(a, row, col);
    }
    x[row] = acc / getm(a, row, row);
  }
  return x;
}

how to generate Chase's sequence

In the draft section 7.2.1.3 of The art of computer programming, generating all combinations, Knuth introduced Algorithm C for generating Chase's sequence.
He also mentioned a similar algorithm (based on the following equation) working with index-list without source code (exercise 45 of the draft).
I finally worked out a c++ version which I think is quite ugly. To generate all C_n^m combination, the memory complexity is about 3 (m+1) and the time complexity is bounded by O(m n^m)
// Step-by-step generator that the accompanying text presents as an index-list
// implementation of Chase's sequence for choosing m items out of n.
// Usage as far as the code shows: construct with n, call choose(m) to set up the
// initial state, then call get() repeatedly until it returns false; operator[]
// reads the current entry at a given position.
// NOTE(review): m is only assigned in choose(); get() and get_m() must not be
// called before choose().
class chase_generator_t{
public:
using size_type = ptrdiff_t;
// Selects what get() reports in x: the old value at the changed position
// (VALUE) or the changed position itself (INDEX).
enum class GET : char{ VALUE, INDEX };
chase_generator_t(size_type _n) : n(_n){}
// Sets the combination size to _m and resets the state:
// index[i] = (n - m) + i for i in 0..m, every tag = DECREASE,
// threshold[i] = max(i - 1, (index[i] - 3) | 1), and threshold[m + 1] = n.
void choose(size_type _m){
m = _m;
// From here on _m is m + 1 (index/tag get m + 1 entries, threshold m + 2).
++_m;
index.resize(_m);
threshold.resize(_m + 1);
tag.resize(_m);
for (size_type i = 0, j = n - m; i != _m; ++i){
index[i] = j + i;
tag[i] = tag_t::DECREASE;
using std::max;
threshold[i] = max(i - 1, (index[i] - 3) | 1);
}
threshold[_m] = n;
}
// Advances by one step. Returns false when no position can be changed any more.
// On success y receives the new value stored at the changed position and x
// receives either that position (GET::INDEX) or its previous value (GET::VALUE).
bool get(size_type &x, size_type &y, GET const which){
if (which == GET::VALUE) return __get<false>(x, y);
return __get<true>(x, y);
}
size_type get_n() const{
return n;
}
size_type get_m() const{
return m;
}
// Current entry at position i (no bounds check).
size_type operator[](size_t const i) const{
return index[i];
}
private:
// Direction in which a position is currently being moved.
enum class tag_t : char{ DECREASE, INCREASE };
size_type n, m;
std::vector<size_type> index, threshold;
std::vector<tag_t> tag;
// Implementation of get(); GetIndex selects what is written to x.
// NOTE(review): identifiers containing a double underscore are reserved for the
// implementation in C++; consider renaming this private helper.
template<bool GetIndex>
bool __get(size_type &x, size_type &y){
using std::max;
size_type p = 0, i, q;
// Find the lowest position p whose entry has not yet reached threshold[p + 1].
find:
q = p + 1;
if (index[p] == threshold[q]){
// Ran out of positions: the sequence is finished.
if (q >= m) return false;
p = q;
goto find;
}
x = GetIndex ? p : index[p];
if (tag[p] == tag_t::INCREASE){
using std::min;
// Move up by 2, but never past the neighbour's threshold.
increase:
index[p] = min(index[p] + 2, threshold[q]);
threshold[p] = index[p] - 1;
}
// Moving down: i is index[p] - 1 with its lowest bit cleared; taken only while
// it stays >= p.
else if (index[p] && (i = (index[p] - 1) & ~1) >= p){
index[p] = i;
threshold[p] = max(p - 1, (index[p] - 3) | 1);
}
else{
// Cannot move down any further: switch this position to INCREASE.
tag[p] = tag_t::INCREASE;
i = p | 1;
if (index[p] == i) goto increase;
index[p] = i;
threshold[p] = index[p] - 1;
}
y = index[p];
// Every position below p starts over in DECREASE mode with a fresh threshold.
for (q = 0; q != p; ++q){
tag[q] = tag_t::DECREASE;
threshold[q] = max(q - 1, (index[q] - 3) | 1);
}
return true;
}
};
Does anyone have a better implementation, i.e. one that runs faster with the same memory or uses less memory at the same speed?

I think that the C code below is closer to what Knuth had in mind. Undoubtedly there are ways to make it more elegant (in particular, I'm leaving some scaffolding in case it helps with experimentation), though I'm skeptical that the array w can be disposed of. If storage is really important for some reason, then steal the sign bit from the a array.
#include <stdbool.h>
#include <stdio.h>
// Compile-time problem size for the demo driver: main() prints T hex digits per
// line, sets a[j] = N - (T - j) initially, and stops once a[T] != N.
// Presumably N is the size of the ground set and T the combination size — confirm.
enum {
N = 10,
T = 5
};
// Advances the state (a, w, *r) by one step.
// Starting at position *r, positions whose w flag is clear are tried for an
// upward move; the first one that can move is updated and the function returns.
// Otherwise the first position whose w flag is set is moved downward.
// *r is updated to the position the next call should start from.
static void next(int a[], bool w[], int *r) {
  bool r_fixed = false;
  int pos = *r;
  while (!w[pos]) {
    const int candidate = a[pos] + 1;
    const int upper = a[pos + 1];
    // When the next position is flagged, stay 1 (upper odd) or 2 (upper even) below it.
    int limit = upper;
    if (w[pos + 1]) {
      limit -= 2 - (upper & 1);
    }
    if (candidate < limit) {
      // An even candidate is pushed to the next odd value when there is room.
      const bool bump = ((candidate & 1) == 0) && (candidate + 1 < upper);
      a[pos] = bump ? candidate + 1 : candidate;
      if (!r_fixed) {
        *r = (pos > 1) ? pos - 1 : 0;
      }
      return;
    }
    w[pos] = (a[pos] - 1 >= pos);
    if (w[pos] && !r_fixed) {
      *r = pos;
      r_fixed = true;
    }
    ++pos;
  }
  // Downward move at the first flagged position.
  int lowered = a[pos] - 1;
  if ((lowered & 1) != 0 && lowered - 1 >= pos) {
    --lowered;
  }
  a[pos] = lowered;
  w[pos] = (lowered - 1 >= pos);
  if (!r_fixed) {
    *r = pos;
  }
}
// Demo driver: prints each state a[T-1..0] as hex digits, one state per line,
// calling next() between lines until a[T] is no longer N.
int main(void) {
  // Compile-time check that T < N (negative array size otherwise).
  typedef char t_less_than_n[T < N ? 1 : -1];
  int a[T + 1];
  bool w[T + 1];
  for (int j = 0; j <= T; j++) {
    a[j] = N - T + j;
    w[j] = true;
  }
  int r = 0;
  do {
    for (int j = T - 1; j >= 0; j--) {
      printf("%x", a[j]);
    }
    putchar('\n');
    // Scaffolding: flip to true to dump the w flags as well.
    if (false) {
      for (int j = T - 1; j >= 0; j--) {
        printf("%d", w[j]);
      }
      putchar('\n');
    }
    next(a, w, &r);
  } while (a[T] == N);
}

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

Generic fast Transpose of non-square matrix CUDA - matrix

Related

Runtime error for large inputs for sorting ( quicksort)

Find the number of intersections of n line segments with endpoints on two parallel lines

Maximal Square with 0 inside

3d point closest to multiple lines in 3D space

how to generate Chase's sequence

Categories

Resources