OpenCL barrier of finding max in a block - parallel-processing

I've found a piece of OpenCL kernel sample code in Nvidia's developer site
The purpose function maxOneBlock is to find out the biggest value of array maxValue and store it to maxValue[0].
I was fully understand about the looping part, but confused about the unroll part: Why the unroll part do not need to sync thread after each step is done?
e.g: When one thread is done the comparison of localId and localId+32, how does it ensure other thread have stored its result to localId+16?
The kernel code:
void maxOneBlock(__local float maxValue[],
__local int maxInd[])
{
uint localId = get_local_id(0);
uint localSize = get_local_size(0);
int idx;
float m1, m2, m3;
for (uint s = localSize/2; s > 32; s >>= 1)
{
if (localId < s)
{
m1 = maxValue[localId];
m2 = maxValue[localId+s];
m3 = (m1 >= m2) ? m1 : m2;
idx = (m1 >= m2) ? localId : localId + s;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// unroll the final warp to reduce loop and sync overheads
if (localId < 32)
{
m1 = maxValue[localId];
m2 = maxValue[localId+32];
m3 = (m1 > m2) ? m1 : m2;
idx = (m1 > m2) ? localId : localId + 32;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
m1 = maxValue[localId];
m2 = maxValue[localId+16];
m3 = (m1 > m2) ? m1 : m2;
idx = (m1 > m2) ? localId : localId + 16;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
m1 = maxValue[localId];
m2 = maxValue[localId+8];
m3 = (m1 > m2) ? m1 : m2;
idx = (m1 > m2) ? localId : localId + 8;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
m1 = maxValue[localId];
m2 = maxValue[localId+4];
m3 = (m1 > m2) ? m1 : m2;
idx = (m1 > m2) ? localId : localId + 4;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
m1 = maxValue[localId];
m2 = maxValue[localId+2];
m3 = (m1 > m2) ? m1 : m2;
idx = (m1 > m2) ? localId : localId + 2;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
m1 = maxValue[localId];
m2 = maxValue[localId+1];
m3 = (m1 > m2) ? m1 : m2;
idx = (m1 > m2) ? localId : localId + 1;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
}
}

Why the unroll part do not need to sync thread after each step is done?
The sample is incorrect, a barrier is indeed required after each step.
It looks like the sample is written in warp-synchronous style, which is a way of exploiting the underlying execution mechanism of the warps on NVIDIA hardware, but incorrect synchronization will cause it to break if the underlying execution mechanism changes or in presence of compiler optimizations.

Related

C++ algorithm code for Magical sequence that will generate desired output

The Magical Sequence
A Magical Sequence is defined as shown.
Magical[1] = 0
Magical[2] = 1
Magical[n] = Magical[n-1] + 2*Magical[n-2] + 3*Magical[n-3] + ... (n-1)*Magical[1] + n*1., for n > 2
Given n (1 <= n <= 10^9 ), find Magical[n].
Example 1: input: 3
Output: 4
Explanation:
Magical[n] = 1*Magical[n-1] + 2*Magical[n-2] + 3*1
Magical[3] = 1*Magical[2] + 2*Magical[1] + 3*1
Magical[3] = 1*1 + 2*0 + 3*1
Magical[3] = 4
Example 2: input: 4
Output: 10
Magical[4] = 1*Magical[3]+2*Magical[2]+3*Magical[1]+4*1
= 1*4+2*1+3*0+4 = 10
Example 3: input: 5
Output: 26
Magical[5] = 1*Magical[4]+2*Magical[3]+3*Magical[2]+4*Magical[1]+5*1
= 1*10+2*4+3*1+4*0+5 = 26
I tried something like below :-
int CuckooNum(int n)
{
if (1 == n)
{
return 0;
}
else if (2 == n)
{
return 1;
}
std::vector<int> vec;
vec.resize(n);
vec[0] = 4;
vec[1] = 0;
vec[2] = 1;
int multiplyer = n;
int result = 0;
for (int index=3; index <= n; index++)
{
result += multiplyer * vec[index-1];
vec[index] = result;
multiplyer--;
}
return result;
}
long long func(int n)
{
if (n==1) return 0;
else if (n==2) return 1;
else return 1*func(n-1)+2*func(n-2)+n;
}
As the size n can be very large (10^9), a direct implementation O(n^2) is not possible.
A specific algorithm is needed. I will focus here on the algorithm, and propose a O(log n) solution.
To simplify explanation, I rename magical[] as x[]
Moreover, we can define x[0] = 1. Then,
x[n] = x[n-1] + 2*x[n-2] + 3*x[n-3] + ... (n-1)*x[1] + n*x[0]
As
x[n-1] = 1*x[n-2] + 2*x[n-3] + ... (n-2)*x[1] + (n-1)*x[0]
It follows
x[n] - x[n-1] = x[n-1] + x[n-2] + x[n-3] + ... x[1] + x[0] = S[n-1]
When S[n] represents the sum of the terms until n (x[0] included)
Moreover,
S[n] = S[n-1] + x[n] = 2*S[n-1] + x[n-1]
Therefore, the iterative formula can be represented in a simple matrix form:
(x[n]) = (1 1) (x[n-1])
(S[n]) (1 2) (S[n-1])
Or, defining the vector (x[n] S[n])^t as Z[n]:
Z[n] = A * Z[n-1] where A is the matrix (1 1)
(1 2)
Note: this formula is valid for n>= 4 only, as the first x[n] values do no respect the simple recurrence relation.
It follows that
Z[n] = A^(n-3) Z[3] with Z[3] = (4 6)^t
Classically, this calculation can be performed with O(log n) complexity, iteratively calculating A^2, A^4, A^8 etc.
Pay attention that the values increase rapidly.
Here is an example of C++ implementation. Note that this implementation is not optimized, as for example it doesn't use the fact that all matrices are symmetric.
#include <iostream>
#include <array>
using Matr22 = std::array<std::array<long long int, 2>, 2>;
using Vect2 = std::array<long long int, 2>;
Matr22 Matrsquare (const Matr22 &m) {
Matr22 m2;
m2[0][0] = m[0][0]*m[0][0] + m[0][1]*m[1][0];
m2[0][1] = m[0][0]*m[0][1] + m[0][1]*m[1][1];
m2[1][0] = m[1][0]*m[0][0] + m[1][1]*m[1][0];
m2[1][1] = m[1][0]*m[0][1] + m[1][1]*m[1][1];
return m2;
}
Matr22 Mult (const Matr22 &m1, const Matr22 &m2) {
Matr22 y;
y[0][0] = m1[0][0]*m2[0][0] + m1[0][1]*m2[1][0];
y[0][1] = m1[0][0]*m2[0][1] + m1[0][1]*m2[1][1];
y[1][0] = m1[1][0]*m2[0][0] + m1[1][1]*m2[1][0];
y[1][1] = m1[1][0]*m2[0][1] + m1[1][1]*m2[1][1];
return y;
}
Vect2 Mult (const Matr22 &m, const Vect2& x) {
Vect2 y;
y[0] = m[0][0] * x[0] + m[0][1] * x[1];
y[1] = m[1][0] * x[0] + m[1][1] * x[1];
return y;
}
// Matrix exponentiation
Matr22 Mult_exp (const Matr22 &m, int exp) {
Matr22 y = {1, 0, 0, 1};
if (exp == 0) return y;
Matr22 M2k = m;
while (exp) {
if (exp%2) y = Mult (y, M2k);
M2k = Matrsquare (M2k);
exp /= 2;
};
return y;
}
long long int Magical (int n) {
if (n == 1) return 0;
if (n == 2) return 1;
if (n == 3) return 4;
Matr22 A = {1, 1, 1, 2};
Vect2 z = {4, 6}; // corresponds to n=3
auto Ak = Mult_exp (A, n-3);
z = Mult (Ak, z);
return z[0];
}
int main() {
int n;
std::cout << "Input n: ";
std::cin >> n;
auto ans = Magical (n);
std::cout << "Magical[" << n << "] = " << ans << '\n';
}

The Two Water Jug Puzzle

I am trying to solve the two WATER JUG PUZZLE using euclidean algorithm and Diophantine equation.
let gcd(m,n) = g
using euclidean aldortihm we get X' and Y' such that mX' + nY' = g
for mX + nY = d
if d%g!= 0 no solution exists
else i made X' as X' / g * d and Y' as Y' / g * d
this is one solution for mX + nY = d
now multiple solutions by m ( X' + ( k * n / g ) ) + n ( Y' - ( m * k / g ) ) = d
i just needed to output the SUM OF NO. OPERATIONS
so, i think of the solution as X' + Y' + k * ( n - m ) / g and i want to minimise this
my code below for the same (its giving wrong answers...)
int X, Y;
int gcd(int a, int b)
{
if (b == 0)
{
X = 1;
Y = 0;
return a;
}
int g = gcd(b, a % b);
int X1 = Y;
int Y1 = X - (a / b) * Y;
X = X1;
Y = Y1;
return g;
}
cin >> m >> n >> d;
int g = gcd(n, m);
if (d % g)
cout << -1 << endl;
else
{
X = X / g * d;
Y = Y / g * d;
int ans = X + Y;
int mn = ans;
while (ans > 0)
{
ans += ((m - n) / g);
mn = min(ans, mn);
}
while (ans < 10000)
{
ans += ((n - m) / g);
mn = min(ans, mn);
}
cout << mn << endl;
}

3d point closest to multiple lines in 3D space

I search for non iterative, closed form, algorithm to find Least squares solution for point closest to the set of 3d lines. It is similar to 3d point triangulation (to minimize re-projections) but seems to be be simpler and faster?
Lines can be described in any form, 2 points, point and unit direction or similar.
Let the i th line be given by point ai and unit direction vector di. We need to find the single point that minimizes the sum of squared point to line distances. This is where the gradient is the zero vector:
Expanding the gradient,
Algebra yields a canonical 3x3 linear system,
where the k'th row (a 3-element row vector) of matrix M is
with vector ek the respective unit basis vector, and
It's not hard to turn this into code. I borrowed (and fixed a small bug in) a Gaussian elimination function from Rosettacode to solve the system. Thanks to the author!
#include <stdio.h>
#include <math.h>
typedef double VEC[3];
typedef VEC MAT[3];
void solve(double *a, double *b, double *x, int n); // linear solver
double dot(VEC a, VEC b) { return a[0]*b[0] + a[1]*b[1] + a[2]*b[2]; }
void find_nearest_point(VEC p, VEC a[], VEC d[], int n) {
MAT m = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
VEC b = {0, 0, 0};
for (int i = 0; i < n; ++i) {
double d2 = dot(d[i], d[i]), da = dot(d[i], a[i]);
for (int ii = 0; ii < 3; ++ii) {
for (int jj = 0; jj < 3; ++jj) m[ii][jj] += d[i][ii] * d[i][jj];
m[ii][ii] -= d2;
b[ii] += d[i][ii] * da - a[i][ii] * d2;
}
}
solve(&m[0][0], b, p, 3);
}
// Debug printing.
void pp(VEC v, char *l, char *r) {
printf("%s%.3lf, %.3lf, %.3lf%s", l, v[0], v[1], v[2], r);
}
void pv(VEC v) { pp(v, "(", ")"); }
void pm(MAT m) { for (int i = 0; i < 3; ++i) pp(m[i], "\n[", "]"); }
// A simple verifier.
double dist2(VEC p, VEC a, VEC d) {
VEC pa = { a[0]-p[0], a[1]-p[1], a[2]-p[2] };
double dpa = dot(d, pa);
return dot(d, d) * dot(pa, pa) - dpa * dpa;
}
double sum_dist2(VEC p, VEC a[], VEC d[], int n) {
double sum = 0;
for (int i = 0; i < n; ++i) sum += dist2(p, a[i], d[i]);
return sum;
}
// Check 26 nearby points and verify the provided one is nearest.
int is_nearest(VEC p, VEC a[], VEC d[], int n) {
double min_d2 = 1e100;
int ii = 2, jj = 2, kk = 2;
#define D 0.01
for (int i = -1; i <= 1; ++i)
for (int j = -1; j <= 1; ++j)
for (int k = -1; k <= 1; ++k) {
VEC pp = { p[0] + D * i, p[1] + D * j, p[2] + D * k };
double d2 = sum_dist2(pp, a, d, n);
// Prefer provided point among equals.
if (d2 < min_d2 || i == 0 && j == 0 && k == 0 && d2 == min_d2) {
min_d2 = d2;
ii = i; jj = j; kk = k;
}
}
return ii == 0 && jj == 0 && kk == 0;
}
void normalize(VEC v) {
double len = sqrt(dot(v, v));
v[0] /= len;
v[1] /= len;
v[2] /= len;
}
int main(void) {
VEC a[] = {{-14.2, 17, -1}, {1, 1, 1}, {2.3, 4.1, 9.8}, {1,2,3}};
VEC d[] = {{1.3, 1.3, -10}, {12.1, -17.2, 1.1}, {19.2, 31.8, 3.5}, {4,5,6}};
int n = 4;
for (int i = 0; i < n; ++i) normalize(d[i]);
VEC p;
find_nearest_point(p, a, d, n);
pv(p);
printf("\n");
if (!is_nearest(p, a, d, n)) printf("Woops. Not nearest.\n");
return 0;
}
// A linear solver from rosettacode (with bug fix: added a missing fabs())
#define mat_elem(a, y, x, n) (a + ((y) * (n) + (x)))
void swap_row(double *a, double *b, int r1, int r2, int n)
{
double tmp, *p1, *p2;
int i;
if (r1 == r2) return;
for (i = 0; i < n; i++) {
p1 = mat_elem(a, r1, i, n);
p2 = mat_elem(a, r2, i, n);
tmp = *p1, *p1 = *p2, *p2 = tmp;
}
tmp = b[r1], b[r1] = b[r2], b[r2] = tmp;
}
void solve(double *a, double *b, double *x, int n)
{
#define A(y, x) (*mat_elem(a, y, x, n))
int i, j, col, row, max_row, dia;
double max, tmp;
for (dia = 0; dia < n; dia++) {
max_row = dia, max = fabs(A(dia, dia));
for (row = dia + 1; row < n; row++)
if ((tmp = fabs(A(row, dia))) > max) max_row = row, max = tmp;
swap_row(a, b, dia, max_row, n);
for (row = dia + 1; row < n; row++) {
tmp = A(row, dia) / A(dia, dia);
for (col = dia+1; col < n; col++)
A(row, col) -= tmp * A(dia, col);
A(row, dia) = 0;
b[row] -= tmp * b[dia];
}
}
for (row = n - 1; row >= 0; row--) {
tmp = b[row];
for (j = n - 1; j > row; j--) tmp -= x[j] * A(row, j);
x[row] = tmp / A(row, row);
}
#undef A
}
This isn't extensively tested, but seems to be working fine.
Let base point of line is p and unit direction vector is d.
Then distance from point v to this line might be calculated using cross product
SquaredDist = ((v - p) x d)^2
Using Maple packet symbolic calculation, we can get
d := <dx, dy, dz>;
v := <vx, vy, vz>;
p := <px, py, pz>;
w := v - p;
cp := CrossProduct(d, w);
nrm := BilinearForm(cp, cp, conjugate=false); //squared dist
nr := expand(nrm);
//now partial derivatives
nrx := diff(nr, vx);
//results:
nrx := -2*dz^2*px-2*dy^2*px+2*dz^2*vx+2*dy^2*vx
+2*dx*py*dy-2*dx*vy*dy+2*dz*dx*pz-2*dz*dx*vz
nry := -2*dx^2*py-2*dz^2*py-2*dy*vz*dz+2*dx^2*vy
+2*dz^2*vy+2*dy*pz*dz+2*dx*dy*px-2*dx*dy*vx
nrz := -2*dy^2*pz+2*dy^2*vz-2*dy*dz*vy+2*dx^2*vz
-2*dx^2*pz-2*dz*vx*dx+2*dy*dz*py+2*dz*px*dx
To minimize sum of squared distances, we have to make system of linear equations for zero partial derivatives like this:
vx*2*(Sum(dz^2)+Sum(dy^2)) + vy * (-2*Sum(dx*dy)) + vz *(-2*Sum(dz*dx)) =
2*Sum(dz^2*px)-2*Sum(dy^2*px) -2*Sum(dx*py*dy)-2*Sum(dz*dx*pz)
where
Sum(dz^2) = Sum{over all i in line indexes} {dz[i] * dz[i]}
and solve it for unknowns vx, vy, vz
Edit: Old erroneous answer for planes instead of lines, left for reference
If we use general equation of line
A * x + B * y + C * z + D = 0
then distance from point (x, y, z) to this line is
Dist = Abs(A * x + B * y + C * z + D) / Sqrt(A^2 + B^2 + C^2)
To simplify - just normalize all line equations dividing by Norm's
Norm = Sqrt(A^2 + B^2 + C^2)
a = A / Norm
b = B / Norm
c = C / Norm
d = D / Norm
now equation is
a * x + b * y + c * z + d = 0
and distance
Dist = Abs(a * x + b * y + c * z + d)
and we can use squared distances like LS method (ai, bi, ci, di are coefficients for i-th line)
F = Sum(ai*x + bi*y + ci * z + d)^2 =
Sum(ai^2*x^2 + bi^2*y^2 + ci^2*z^2 + d^2 +
2 * (ai*bi*x*y + ai*ci*x*z + bi*y*ci*z + ai*x*di + bi*y*di + ci*z*di))
partial derivatives
dF/dx = 2*Sum(ai^2*x + ai*bi*y + ai*ci*z + ai*di) = 0
dF/dy = 2*Sum(bi^2*y + ai*bi*x + bi*ci*z + bi*di) = 0
dF/dz = 2*Sum(ci^2*z + ai*ci*x + bi*ci*y + ci*di) = 0
so we have system of linear equation
x * Sum(ai^2) + y * Sum(ai*bi) + z * Sum(ai*ci)= - Sum(ai*di)
y * Sum(bi^2) + x * Sum(ai*bi) + z * Sum(bi*ci)= - Sum(bi*di)
z * Sum(ci^2) + x * Sum(ai*ci) + y * Sum(bi*ci)= - Sum(ci*di)
x * Saa + y * Sab + z * Sac = - Sad
x * Sab + y * Sbb + z * Sbc = - Sbd
x * Sac + y * Sbc + z * Scc = - Scd
where S** are corresponding sums
and can solve it for unknowns x, y, z
I needed this for a sketch in Processing, so I ported Gene's answer. Works great and thought it might save someone else a little time. Unfortunately PVector/PMatrix don't have array accessors for vectors or matrices so I had to add these as local functions.
float getv(PVector v, int i) {
if(i == 0) return v.x;
if(i == 1) return v.y;
return v.z;
}
void setv(PVector v, int i, float value) {
if (i == 0) v.x = value;
else if (i == 1) v.y = value;
else v.z = value;
}
void incv(PVector v, int i, float value) {
setv(v,i,getv(v,i) + value);
}
float getm(float[] mm, int r, int c) { return mm[c + r*4]; }
void setm(float[] mm, int r, int c, float value) { mm[c + r*4] = value; }
void incm(float[] mm, int r, int c, float value) { mm[c + r*4] += value; }
PVector findNearestPoint(PVector a[], PVector d[]) {
var mm = new float[16];
var b = new PVector();
var n = a.length;
for (int i = 0; i < n; ++i) {
var d2 = d[i].dot(d[i]);
var da = d[i].dot(a[i]);
for (int ii = 0; ii < 3; ++ii) {
for (int jj = 0; jj < 3; ++jj) {
incm(mm,ii,jj, getv(d[i],ii) * getv(d[i],jj));
}
incm(mm, ii,ii, -d2);
incv(b, ii, getv(d[i], ii) * da - getv(a[i], ii) * d2);
}
}
var p = solve(mm, new float[] {b.x, b.y, b.z});
return new PVector(p[0],p[1],p[2]);
}
// Verifier
float dist2(PVector p, PVector a, PVector d) {
PVector pa = new PVector( a.x-p.x, a.y-p.y, a.z-p.z );
float dpa = d.dot(pa);
return d.dot(d) * pa.dot(pa) - dpa * dpa;
}
//double sum_dist2(VEC p, VEC a[], VEC d[], int n) {
float sum_dist2(PVector p, PVector a[], PVector d[]) {
int n = a.length;
float sum = 0;
for (int i = 0; i < n; ++i) {
sum += dist2(p, a[i], d[i]);
}
return sum;
}
// Check 26 nearby points and verify the provided one is nearest.
boolean isNearest(PVector p, PVector a[], PVector d[]) {
float min_d2 = 3.4028235E38;
int ii = 2, jj = 2, kk = 2;
final float D = 0.1f;
for (int i = -1; i <= 1; ++i)
for (int j = -1; j <= 1; ++j)
for (int k = -1; k <= 1; ++k) {
PVector pp = new PVector( p.x + D * i, p.y + D * j, p.z + D * k );
float d2 = sum_dist2(pp, a, d);
// Prefer provided point among equals.
if (d2 < min_d2 || i == 0 && j == 0 && k == 0 && d2 == min_d2) {
min_d2 = d2;
ii = i; jj = j; kk = k;
}
}
return ii == 0 && jj == 0 && kk == 0;
}
void setup() {
PVector a[] = {
new PVector(-14.2, 17, -1),
new PVector(1, 1, 1),
new PVector(2.3, 4.1, 9.8),
new PVector(1,2,3)
};
PVector d[] = {
new PVector(1.3, 1.3, -10),
new PVector(12.1, -17.2, 1.1),
new PVector(19.2, 31.8, 3.5),
new PVector(4,5,6)
};
int n = 4;
for (int i = 0; i < n; ++i)
d[i].normalize();
PVector p = findNearestPoint(a, d);
println(p);
if (!isNearest(p, a, d))
println("Woops. Not nearest.\n");
}
// From rosettacode (with bug fix: added a missing fabs())
int mat_elem(int y, int x) { return y*4+x; }
void swap_row(float[] a, float[] b, int r1, int r2, int n)
{
float tmp;
int p1, p2;
int i;
if (r1 == r2) return;
for (i = 0; i < n; i++) {
p1 = mat_elem(r1, i);
p2 = mat_elem(r2, i);
tmp = a[p1];
a[p1] = a[p2];
a[p2] = tmp;
}
tmp = b[r1];
b[r1] = b[r2];
b[r2] = tmp;
}
float[] solve(float[] a, float[] b)
{
float[] x = new float[] {0,0,0};
int n = x.length;
int i, j, col, row, max_row, dia;
float max, tmp;
for (dia = 0; dia < n; dia++) {
max_row = dia;
max = abs(getm(a, dia, dia));
for (row = dia + 1; row < n; row++) {
if ((tmp = abs(getm(a, row, dia))) > max) {
max_row = row;
max = tmp;
}
}
swap_row(a, b, dia, max_row, n);
for (row = dia + 1; row < n; row++) {
tmp = getm(a, row, dia) / getm(a, dia, dia);
for (col = dia+1; col < n; col++) {
incm(a, row, col, -tmp * getm(a, dia, col));
}
setm(a,row,dia, 0);
b[row] -= tmp * b[dia];
}
}
for (row = n - 1; row >= 0; row--) {
tmp = b[row];
for (j = n - 1; j > row; j--) {
tmp -= x[j] * getm(a, row, j);
}
x[row] = tmp / getm(a, row, row);
}
return x;
}

Problems converting HSL to RGB

I am trying to convert HSL values to RGB but I am getting strange results, if the hue value is about 180 then the resulting RGB is negative dependent on the lightness value.
My implementation:
public class HSLtoRGB {
private static float hueToRGB(float m1, float m2, float h) {
if(h < 0) {
h += 1.0f;
} else if(h > 1.0f) {
h -= 1.0f;
}
if((h * 6) < 1) {
return m1 + (m2 - m1) * 6 * h;
} else if((h * 2) < 1) {
return m2;
} else if((h * 3) < 2) {
return m1 + (m2 - m1) * ((2 / 3) - h) * 6;
} else {
return m1;
}
}
public static void main(String[] args) {
float h = 180.0f / 360.0f;
float s = 100.0f / 100.0f;
float l = 38.0f / 100.0f;
float r = 0;
float g = 0;
float b = 0;
if(s == 0.0) {
r = g = b = l;
} else {
float m2 = l < 0.5 ? l * (1 + s) : (l + s) - (l * s);
float m1 = (l * 2) - m2;
r = hueToRGB(m1, m2, h + (1.0f / 3.0f));
g = hueToRGB(m1, m2, h);
b = hueToRGB(m1, m2, h - (1.0f / 3.0f));
}
System.out.printf("%.2f %.2f %.2f -> %.2f %.2f %.2f",
h, s, l,
r, g, b);
}
}
and the output from the above:
0.50 1.00 0.38 -> 0.00 -2.28 0.76
I followed this algorithm and checked many others to get the above formula, including the one in the CSS3 docs:
HOW TO RETURN hsl.to.rgb(h, s, l):
SELECT:
l<=0.5: PUT l*(s+1) IN m2
ELSE: PUT l+s-l*s IN m2
PUT l*2-m2 IN m1
PUT hue.to.rgb(m1, m2, h+1/3) IN r
PUT hue.to.rgb(m1, m2, h ) IN g
PUT hue.to.rgb(m1, m2, h-1/3) IN b
RETURN (r, g, b)
HOW TO RETURN hue.to.rgb(m1, m2, h):
IF h<0: PUT h+1 IN h
IF h>1: PUT h-1 IN h
IF h*6<1: RETURN m1+(m2-m1)*h*6
IF h*2<1: RETURN m2
IF h*3<2: RETURN m1+(m2-m1)*(2/3-h)*6
RETURN m1
It looks like the culprit is this line:
return m1 + (m2 - m1) * ((2 / 3) - h) * 6;
You're doing integer division 2/3, which will equal 0. A simple change to:
return m1 + (m2 - m1) * ((2.0f / 3.0f) - h) * 6;
seems to fix it for me. It comes back with equal blue/green 0.76, which makes sense for hue 180(cyan).

Fast solution to Subset sum algorithm by Pisinger

This is a follow-up to my previous question. I still find it very interesting problem and as there is one algorithm which deserves more attention I'm posting it here.
From Wikipedia: For the case that each xi is positive and bounded by the same constant, Pisinger found a linear time algorithm.
There is a different paper which seems to describe the same algorithm but it is a bit difficult to read for me so please - does anyone know how to translate the pseudo-code from page 4 (balsub) into working implementation?
Here are couple of pointers I collected so far:
http://www.diku.dk/~pisinger/95-6.ps (the paper)
https://stackoverflow.com/a/9952759/1037407
http://www.diku.dk/hjemmesider/ansatte/pisinger/codes.html
PS: I don't really insist on precisely this algorithm so if you know of any other similarly performant algorithm please feel free to suggest it bellow.
Edit
This is a Python version of the code posted bellow by oldboy:
class view(object):
def __init__(self, sequence, start):
self.sequence, self.start = sequence, start
def __getitem__(self, index):
return self.sequence[index + self.start]
def __setitem__(self, index, value):
self.sequence[index + self.start] = value
def balsub(w, c):
'''A balanced algorithm for Subset-sum problem by David Pisinger
w = weights, c = capacity of the knapsack'''
n = len(w)
assert n > 0
sum_w = 0
r = 0
for wj in w:
assert wj > 0
sum_w += wj
assert wj <= c
r = max(r, wj)
assert sum_w > c
b = 0
w_bar = 0
while w_bar + w[b] <= c:
w_bar += w[b]
b += 1
s = [[0] * 2 * r for i in range(n - b + 1)]
s_b_1 = view(s[0], r - 1)
for mu in range(-r + 1, 1):
s_b_1[mu] = -1
for mu in range(1, r + 1):
s_b_1[mu] = 0
s_b_1[w_bar - c] = b
for t in range(b, n):
s_t_1 = view(s[t - b], r - 1)
s_t = view(s[t - b + 1], r - 1)
for mu in range(-r + 1, r + 1):
s_t[mu] = s_t_1[mu]
for mu in range(-r + 1, 1):
mu_prime = mu + w[t]
s_t[mu_prime] = max(s_t[mu_prime], s_t_1[mu])
for mu in range(w[t], 0, -1):
for j in range(s_t[mu] - 1, s_t_1[mu] - 1, -1):
mu_prime = mu - w[j]
s_t[mu_prime] = max(s_t[mu_prime], j)
solved = False
z = 0
s_n_1 = view(s[n - b], r - 1)
while z >= -r + 1:
if s_n_1[z] >= 0:
solved = True
break
z -= 1
if solved:
print c + z
print n
x = [False] * n
for j in range(0, b):
x[j] = True
for t in range(n - 1, b - 1, -1):
s_t = view(s[t - b + 1], r - 1)
s_t_1 = view(s[t - b], r - 1)
while True:
j = s_t[z]
assert j >= 0
z_unprime = z + w[j]
if z_unprime > r or j >= s_t[z_unprime]:
break
z = z_unprime
x[j] = False
z_unprime = z - w[t]
if z_unprime >= -r + 1 and s_t_1[z_unprime] >= s_t[z]:
z = z_unprime
x[t] = True
for j in range(n):
print x[j], w[j]
// Input:
// c (capacity of the knapsack)
// n (number of items)
// w_1 (weight of item 1)
// ...
// w_n (weight of item n)
//
// Output:
// z (optimal solution)
// n
// x_1 (indicator for item 1)
// ...
// x_n (indicator for item n)
#include <algorithm>
#include <cassert>
#include <iostream>
#include <vector>
using namespace std;
int main() {
int c = 0;
cin >> c;
int n = 0;
cin >> n;
assert(n > 0);
vector<int> w(n);
int sum_w = 0;
int r = 0;
for (int j = 0; j < n; ++j) {
cin >> w[j];
assert(w[j] > 0);
sum_w += w[j];
assert(w[j] <= c);
r = max(r, w[j]);
}
assert(sum_w > c);
int b;
int w_bar = 0;
for (b = 0; w_bar + w[b] <= c; ++b) {
w_bar += w[b];
}
vector<vector<int> > s(n - b + 1, vector<int>(2 * r));
vector<int>::iterator s_b_1 = s[0].begin() + (r - 1);
for (int mu = -r + 1; mu <= 0; ++mu) {
s_b_1[mu] = -1;
}
for (int mu = 1; mu <= r; ++mu) {
s_b_1[mu] = 0;
}
s_b_1[w_bar - c] = b;
for (int t = b; t < n; ++t) {
vector<int>::const_iterator s_t_1 = s[t - b].begin() + (r - 1);
vector<int>::iterator s_t = s[t - b + 1].begin() + (r - 1);
for (int mu = -r + 1; mu <= r; ++mu) {
s_t[mu] = s_t_1[mu];
}
for (int mu = -r + 1; mu <= 0; ++mu) {
int mu_prime = mu + w[t];
s_t[mu_prime] = max(s_t[mu_prime], s_t_1[mu]);
}
for (int mu = w[t]; mu >= 1; --mu) {
for (int j = s_t[mu] - 1; j >= s_t_1[mu]; --j) {
int mu_prime = mu - w[j];
s_t[mu_prime] = max(s_t[mu_prime], j);
}
}
}
bool solved = false;
int z;
vector<int>::const_iterator s_n_1 = s[n - b].begin() + (r - 1);
for (z = 0; z >= -r + 1; --z) {
if (s_n_1[z] >= 0) {
solved = true;
break;
}
}
if (solved) {
cout << c + z << '\n' << n << '\n';
vector<bool> x(n, false);
for (int j = 0; j < b; ++j) x[j] = true;
for (int t = n - 1; t >= b; --t) {
vector<int>::const_iterator s_t = s[t - b + 1].begin() + (r - 1);
vector<int>::const_iterator s_t_1 = s[t - b].begin() + (r - 1);
while (true) {
int j = s_t[z];
assert(j >= 0);
int z_unprime = z + w[j];
if (z_unprime > r || j >= s_t[z_unprime]) break;
z = z_unprime;
x[j] = false;
}
int z_unprime = z - w[t];
if (z_unprime >= -r + 1 && s_t_1[z_unprime] >= s_t[z]) {
z = z_unprime;
x[t] = true;
}
}
for (int j = 0; j < n; ++j) {
cout << x[j] << '\n';
}
}
}
great code man, but it sometimes crashed in this codeblock
for (mu = w[t]; mu >= 1; --mu)
{
for (int j = s_t[mu] - 1; j >= s_t_1[mu]; --j)
{
if (j >= w.size())
{ // !!! PROBLEM !!!
}
int mu_prime = mu - w[j];
s_t[mu_prime] = max(s_t[mu_prime], j);
}
}
...

Resources