Errors when updating a Halide function at every pixel - halide
I am trying to implement an optical flow algorithm in Halide. I have some problems with updating the u and v vectors. Here is my C++ version:
// Reference C++ version of the flow update. For every pixel (row i, column j)
// it reads the derivatives Ix/Iy/It at that pixel and writes u and v there.
// NOTE(review): the neighbour reads at rows i-1 / i+1 and columns j-1 / j+1 are
// not bounds-checked although both loops start at 0 and run to h-1 / bpl-1 —
// confirm that u and v are padded.
for(int i= 0; i<h; i++) {
for(int j= 0; j<bpl; j++) {
// Derivatives at the current pixel; they stay constant during the k loop.
float iix = Ix[i*bpl+j];
float iiy = Iy[i*bpl+j];
float iit = It[i*bpl+j];
// NOTE(review): the k loop sits inside the pixel loops. Uav and Vav read only
// the four neighbours and only the centre element is written, so each of the
// 40 passes recomputes the same utmp/vtmp for this pixel — confirm whether the
// iteration was meant to wrap the whole image sweep instead.
for(int k=0; k<40; k++) {
// Average of the four neighbours (previous row, next row, previous column,
// next column) of u and of v.
float Uav = (u[(i-1)*bpl+j] + u[(i+1)*bpl+j] + u[i*bpl+j-1] + u[i*bpl+j+1])/4;
float Vav = (v[(i-1)*bpl+j] + v[(i+1)*bpl+j] + v[i*bpl+j-1] + v[i*bpl+j+1])/4;
// Numerator P and denominator D of the correction term tmp = P/D.
float P = iix*Uav + iiy*Vav + iit;
float D = iix*iix + iiy*iiy + lambda*lambda;
float tmp = P/D;
// New flow values: neighbour average minus the derivative-weighted correction.
float utmp = Uav - iix*tmp;
float vtmp = Vav - iiy*tmp;
// Written in place, so pixels visited later read already-updated neighbours.
u[i*bpl+j] = utmp;
v[i*bpl+j] = vtmp;
}
}
}
And this is my halide implementation:
// Halide version of the same update. This is the attempt that produces the
// runtime error quoted below.
Func u("u"), v("v");
Func Uav("Uav"), Vav("Vav"), P("P"), D("D"), tmp("tmp"), utmp("utmp"), vtmp("vtmp");
// Two-dimensional reduction domain starting at 0 with extents
// input_1.width() and input_1.height().
RDom r_0(0, input_1.width(), 0, input_1.height());
// Pure definitions: u and v are 0 at every (x, y, c).
// NOTE(review): 0 is an integer literal, so u and v are presumably int-typed
// Funcs — confirm whether 0.0f was intended.
u(x, y, c) = 0;
v(x, y, c) = 0;
// This is an ordinary C++ loop: each pass issues the definitions below again on
// the same Func objects.
for(int k=0; k<40; k++) {
// Four-neighbour averages of u and v. These definitions make Uav and Vav
// depend on u and v.
Uav (x, y, c) = (u(x, y-1, c) + u(x, y+1, c) + u(x-1, y, c) + u(x+1, y, c))/4;
Vav (x, y, c) = (v(x, y-1, c) + v(x, y+1, c) + v(x-1, y, c) + v(x+1, y, c))/4;
// Numerator P and denominator D of the correction term tmp = P/D.
P (x, y, c) = Ix(x, y, c) * Uav(x, y, c) + Iy(x, y, c) * Vav(x, y, c) + It(x, y, c);
D (x, y, c) = Ix(x, y, c) * Ix(x, y, c) + Iy(x, y, c) * Iy(x, y, c) + lambda * lambda;
tmp (x, y, c) = P(x, y, c)/D(x, y, c);
utmp(x, y, c) = Uav(x, y, c) - Ix(x, y, c) * tmp(x, y, c);
vtmp(x, y, c) = Vav(x, y, c) - Iy(x, y, c) * tmp(x, y, c);
// Update definitions of u and v over r_0. At this point u and v have already
// been used in the definitions of Uav and Vav above.
// NOTE(review): the left-hand sides index with r_0.x, r_0.y while the right-hand
// sides index with the pure x, y — confirm this is intended.
// NOTE(review): presumably each pass needs its own fresh Func objects that read
// the previous pass's u and v — confirm against the Halide documentation.
u(r_0.x, r_0.y, c) = utmp(x, y, c);
v(r_0.x, r_0.y, c) = vtmp(x, y, c);
}
When I run my program, I get the following runtime error:
Error:
Func u cannot be given a new update definition, because it has already been realized or used in the definition of another Func.
Aborted (core dumped)
Thanks for your responses.
As AhiyaHiya said, the variables x, y, c are declared as:
Var x("x"), y("y"), c("c");
As you suggested, I used an extern C++ function to update my Halide functions. Here is my extern function:
// Extern "C" function meant to be called from the Halide pipeline to update the
// flow values.
//   Ix, Iy, It : derivative buffers (read)
//   u, v       : flow buffers (read at four neighbours, written at the centre)
//   bpl, h     : offsets added to each buffer's min[0] / min[1] to form the
//                single coordinate that is accessed
//   lambda     : its square is added to the denominator D
//   IsU        : non-zero selects uResult, zero selects vResult, for the extra
//                write and for the returned buffer_t
// NOTE(review): uResult and vResult are declared but never assigned before they
// are dereferenced and returned.
// NOTE(review): every access uses the single coordinate (min[0] + bpl,
// min[1] + h); there is no loop over pixels — confirm this is intended and that
// the coordinate and its four neighbours lie inside the buffers.
// NOTE(review): host is dereferenced directly with element strides; if host is a
// byte pointer and the data is wider than one byte, the offsets would need
// scaling and a cast — confirm against the buffer_t definition.
extern "C" DLLEXPORT buffer_t compute_flow(buffer_t *Ix, buffer_t *Iy, buffer_t *It, buffer_t *u, buffer_t *v,
const int32_t bpl, const int32_t h, const float lambda, const uint8_t IsU) {
// Ix: mins, strides and the accessed coordinate (x_ix, y_ix).
const auto min0_ix = Ix->min[0];
const auto internalX_ix = min0_ix;
const auto min1_ix = Ix->min[1];
const auto internalY_ix = min1_ix;
const auto stride0_ix = Ix->stride[0];
const auto stride1_ix = Ix->stride[1];
const auto x_ix = bpl + internalX_ix;
const auto y_ix = h + internalY_ix;
// Iy: same layout values as for Ix.
const auto min0_iy = Iy->min[0];
const auto internalX_iy = min0_iy;
const auto min1_iy = Iy->min[1];
const auto internalY_iy = min1_iy;
const auto stride0_iy = Iy->stride[0];
const auto stride1_iy = Iy->stride[1];
const auto x_iy = bpl + internalX_iy;
const auto y_iy = h + internalY_iy;
// It: same layout values as for Ix.
const auto min0_it = It->min[0];
const auto internalX_it = min0_it;
const auto min1_it = It->min[1];
const auto internalY_it = min1_it;
const auto stride0_it = It->stride[0];
const auto stride1_it = It->stride[1];
const auto x_it = bpl + internalX_it;
const auto y_it = h + internalY_it;
// Derivative values at the accessed coordinate. (x - min0) reduces to bpl and
// (y - min1) reduces to h, because x and y were built as min + offset.
const auto iix = *(Ix->host + (x_ix - min0_ix) * stride0_ix + (y_ix - min1_ix) * stride1_ix);
const auto iiy = *(Iy->host + (x_iy - min0_iy) * stride0_iy + (y_iy - min1_iy) * stride1_iy);
const auto iit = *(It->host + (x_it - min0_it) * stride0_it + (y_it - min1_it) * stride1_it);
// u: mins, strides and the accessed coordinate (x_u, y_u).
const auto min0_u = u->min[0];
const auto internalX_u = min0_u;
const auto min1_u = u->min[1];
const auto internalY_u = min1_u;
const auto stride0_u = u->stride[0];
const auto stride1_u = u->stride[1];
const auto x_u = bpl + internalX_u;
const auto y_u = h + internalY_u;
// v: mins, strides and the accessed coordinate (x_v, y_v).
const auto min0_v = v->min[0];
const auto internalX_v = min0_v;
const auto min1_v = v->min[1];
const auto internalY_v = min1_v;
const auto stride0_v = v->stride[0];
const auto stride1_v = v->stride[1];
const auto x_v = bpl + internalX_v;
const auto y_v = h + internalY_v;
// NOTE(review): these pointers are left uninitialised.
buffer_t *uResult, *vResult;
// NOTE(review): each pass reads the same four neighbours and rewrites the same
// centre element, so presumably all 40 passes compute the same values — confirm.
for(int k=0; k<40; k++) {
// Four neighbours of u and v around the accessed coordinate.
const auto u0 = *(u->host + (x_u - min0_u) * stride0_u + (y_u - 1 - min1_u) * stride1_u); //u[(i-1)*bpl+j]
const auto u1 = *(u->host + (x_u - min0_u) * stride0_u + (y_u + 1 - min1_u) * stride1_u); //u[(i+1)*bpl+j]
const auto u2 = *(u->host + (x_u - 1 - min0_u) * stride0_u + (y_u - min1_u) * stride1_u); //u[i*bpl+j-1]
const auto u3 = *(u->host + (x_u + 1 - min0_u) * stride0_u + (y_u - min1_u) * stride1_u); //u[i*bpl+j+1]
const auto v0 = *(v->host + (x_v - min0_v) * stride0_v + (y_v - 1 - min1_v) * stride1_v); //v[(i-1)*bpl+j]
const auto v1 = *(v->host + (x_v - min0_v) * stride0_v + (y_v + 1 - min1_v) * stride1_v); //v[(i+1)*bpl+j]
const auto v2 = *(v->host + (x_v - 1 - min0_v) * stride0_v + (y_v - min1_v) * stride1_v); //v[i*bpl+j-1]
const auto v3 = *(v->host + (x_v + 1 - min0_v) * stride0_v + (y_v - min1_v) * stride1_v); //v[i*bpl+j+1]
// Neighbour averages, then the correction term tmp = P/D.
const auto Uav = (u0 + u1 + u2 + u3)/4;
const auto Vav = (v0 + v1 + v2 + v3)/4;
const auto P = iix*Uav + iiy*Vav + iit;
const auto D = iix*iix + iiy*iiy + lambda*lambda;
const auto tmp = P/D;
const auto utmp = Uav - iix*tmp;
const auto vtmp = Vav - iiy*tmp;
// In-place write of the new values into the u and v input buffers.
*(u->host + (x_u - min0_u) * stride0_u + (y_u - min1_u) * stride1_u) = utmp; //u[i*bpl+j]
*(v->host + (x_v - min0_v) * stride0_v + (y_v - min1_v) * stride1_v) = vtmp; //v[i*bpl+j]
// Additional write through the result pointer selected by IsU.
if(IsU)
*(uResult->host + (x_u - min0_u) * stride0_u + (y_u - min1_u) * stride1_u) = utmp;
else
*(vResult->host + (x_v - min0_v) * stride0_v + (y_v - min1_v) * stride1_v) = vtmp;
}
// Returns a copy of the selected result descriptor.
// NOTE(review): Halide extern functions presumably receive the output buffer as
// a trailing buffer_t* argument and return an int status — confirm against the
// define_extern documentation.
if(IsU) return *uResult;
else return *vResult;
}
And in my main, I call it as follows:
// lambda is later passed to compute_flow as one of the extern arguments.
const float lambda = 0.05;
// NOTE(review): both images are loaded from argv[1], so input_1 and input_2 hold
// the same frame — presumably the second one should come from another argument;
// confirm.
Image<uint8_t> input_1 = load_image(argv[1]);
Image<uint8_t> input_2 = load_image(argv[1]);
Var x("x"); // image index in the x direction
Var y("y"); // image index in the y direction
Var c("c"); // image channel index
// Wrap both inputs with the repeat_edge boundary condition so that the stencils
// below can sample outside the image bounds.
Func clamped_1("clamped_1"), clamped_2("clamped_2");
clamped_1 = BoundaryConditions::repeat_edge(input_1);
clamped_2 = BoundaryConditions::repeat_edge(input_2);
// Convert RGB to grayscale: weighted sum of channels 0, 1, 2, capped at 255.
// c appears only on the left-hand side, so every channel index gets the same value.
Func f_1("f_1"), f_2("f_2");
f_1(x,y,c) = min(0.299f * clamped_1(x,y,0) + 0.587f * clamped_1(x,y,1) + 0.114f * clamped_1(x,y,2), 255.0f);
f_2(x,y,c) = min(0.299f * clamped_2(x,y,0) + 0.587f * clamped_2(x,y,1) + 0.114f * clamped_2(x,y,2), 255.0f);
// Gaussian blurring with a 5x5 kernel.
// All 25 weights must be assigned; the table is symmetric and sums to 1.0.
// The first entry is (0, 0), mirroring (0, 4), (4, 0) and (4, 4).
Image<float> kernel(5, 5);
kernel(0, 0) = 0.000067; kernel(0, 1) = 0.001663; kernel(0, 2) = 0.004706; kernel(0, 3) = 0.001663; kernel(0, 4) = 0.000067;
kernel(1, 0) = 0.001663; kernel(1, 1) = 0.041482; kernel(1, 2) = 0.117381; kernel(1, 3) = 0.041482; kernel(1, 4) = 0.001663;
kernel(2, 0) = 0.004706; kernel(2, 1) = 0.117381; kernel(2, 2) = 0.332152; kernel(2, 3) = 0.117381; kernel(2, 4) = 0.004706;
kernel(3, 0) = 0.001663; kernel(3, 1) = 0.041482; kernel(3, 2) = 0.117381; kernel(3, 3) = 0.041482; kernel(3, 4) = 0.001663;
kernel(4, 0) = 0.000067; kernel(4, 1) = 0.001663; kernel(4, 2) = 0.004706; kernel(4, 3) = 0.001663; kernel(4, 4) = 0.000067;
// Reduction domain covering the kernel's extent.
RDom r(kernel);
// Blurred grayscale frames: weighted sum of f_1 / f_2 over the kernel window.
// The window is sampled at offsets +r.x, +r.y from (x, y), i.e. it is anchored at
// its first entry rather than centred; both frames are treated identically.
Func I1("I1"), I2("I2");
I1(x, y, c) = sum(f_1(x+r.x, y+r.y, c) * kernel(r.x, r.y));
I2(x, y, c) = sum(f_2(x+r.x, y+r.y, c) * kernel(r.x, r.y));
// Derivatives of the inputs, each built from the 2x2 neighbourhood
// (x-1..x, y-1..y) of both blurred frames I1 and I2.
Func Ix("Ix"), Iy("Iy"), It("It");
// Ix: samples at x minus samples at x-1, summed over both rows and both frames.
Ix(x, y, c) = (-I1(x-1, y-1, c) + I1(x, y-1, c) - I1(x-1, y, c) + I1(x, y, c)) +
(-I2(x-1, y-1, c) + I2(x, y-1, c) - I2(x-1, y, c) + I2(x, y, c));
// Iy: samples at y minus samples at y-1, summed over both columns and both frames.
Iy(x, y, c) = (-I1(x-1, y-1, c) - I1(x, y-1, c) + I1(x-1, y, c) + I1(x, y, c)) +
(-I2(x-1, y-1, c) - I2(x, y-1, c) + I2(x-1, y, c) + I2(x, y, c));
// It: as written, the four I1 samples and the four I2 samples all enter with a
// negative sign.
// NOTE(review): a temporal difference would presumably be (I2 sum) - (I1 sum);
// confirm the sign in front of the I2 group.
It(x, y, c) = (-I1(x-1, y-1, c) - I1(x, y-1, c) - I1(x-1, y, c) - I1(x, y, c)) -
( I2(x-1, y-1, c) + I2(x, y-1, c) + I2(x-1, y, c) + I2(x, y, c));
// Initial flow fields: 0 at every (x, y, c).
Func u("u"), v("v");
u(x, y, c) = 0; v(x, y, c) = 0;
// callU / callV wrap the extern function compute_flow; they differ only in the
// last argument (1 selects the u result, 0 selects the v result).
Func callU("callU"), callV("callV");
// Arguments in compute_flow's parameter order:
// Ix, Iy, It, u, v, bpl, h, lambda, IsU.
vector<ExternFuncArgument> argsU(9);
argsU[0] = Ix; argsU[1] = Iy; argsU[2] = It;
argsU[3] = u; argsU[4] = v; argsU[5] = input_1.width();
argsU[6] = input_1.height(); argsU[7] = lambda; argsU[8] = 1;
vector<ExternFuncArgument> argsV(9);
argsV[0] = Ix; argsV[1] = Iy; argsV[2] = It;
argsV[3] = u; argsV[4] = v; argsV[5] = input_1.width();
argsV[6] = input_1.height(); argsV[7] = lambda; argsV[8] = 0;
// One Type per extern argument.
// NOTE(review): the third parameter of define_extern presumably lists the extern
// Func's OUTPUT types, not its argument types; nine entries would then declare
// nine outputs — confirm against the Func::define_extern documentation.
vector<Type> types(9);
types[0] = Ix.output_types()[0]; types[1] = Iy.output_types()[0]; types[2] = It.output_types()[0];
types[3] = u.output_types()[0]; types[4] = v.output_types()[0]; types[5] = Int(32);
types[6] = Int(32); types[7] = Float(32); types[8] = UInt(8);
// NOTE(review): the last argument (1) is presumably the dimensionality of the
// extern Func, yet callU / callV are indexed with three coordinates below —
// confirm.
callU.define_extern("compute_flow", argsU, types, 1);
callV.define_extern("compute_flow", argsV, types, 1);
// Pure wrappers so the extern results can be realized as ordinary Funcs.
Func outputU("outputU"), outputV("outputV");
outputU(x, y, c) = callU(x, y, c);
outputV(x, y, c) = callV(x, y, c);
// The derivative Funcs are scheduled compute_root; u and v are not scheduled here.
Ix.compute_root();
Iy.compute_root();
It.compute_root();
outputU.compile_jit();
outputV.compile_jit();
// Realize both outputs over the full extent of input_1.
// NOTE(review): the results are stored as Image<uint8_t> — confirm that this
// matches the output type of outputU / outputV.
Image<uint8_t> out_u = outputU.realize(input_1.width(), input_1.height(), input_1.channels());
Image<uint8_t> out_v = outputV.realize(input_1.width(), input_1.height(), input_1.channels());
When I don't schedule u and v, everything compiles fine, but I get this runtime error:
Error: Func u cannot be scheduled to be computed inline, because it is used in the externally-computed function callU Aborted (core dumped)
But when I schedule both u and v as:
u.compute_root();
v.compute_root();
I get the following runtime error:
Internal error at /home/rokiatou/Documents/Thèse/halide/Halide-master/src/BoundsInference.cpp:283 Condition failed: b.empty() || b.size() == func_args.size() Aborted (core dumped)
I'm not sure whether my extern function compute_flow is defined correctly, and I cannot solve the scheduling problem.
Any help is welcome. Thank you.
I am assuming that the variables x, y, c are declared as Halide::Var; if that is the case, then the error listed above is accurate.
You can use a C++ for-loop to add update definitions to your Halide::Func, but you would need to use regular C or C++ variables (for example, the loop counter) in at least one of the coordinates; the code you have above just references the same Halide variables on every pass.
Regarding access to the pixels you listed, "(x-1, y, c), (x+1, y, c), (x, y, c), (x, y-1, c)[...]" here is an example of accessing a value within a buffer_t* in an extern Halide func:
extern "C" void
auto get_something_done_in_c(buffer_t* my_buffer, const int32_t dx, const int32_t dy)
{
const auto min0 = my_buffer->min[0];
const auto internal_x = min0;
const auto min1 = my_buffer->min[1];
const auto internal_y = min1;
const auto stride0 = my_buffer->stride[0];
const auto stride1 = my_buffer->stride[1];
const auto x1 = dx + internal_x;
const auto y1 = dy + internal_y;
const auto value = *(my_buffer->host + (x1 - min0) * stride0 + (y1 - min1) * stride1);
return value;
}
HalideExtern_3(int32_t, get_something_done_in_c, buffer_t, int32_t, int32_t);
To explain a little about how to get the 'value': I use the my_buffer variable to get access to the data pointer, known as host. The host variable holds the address of the data that you care about. Since the data is stored as a flat, one-dimensional block of memory, you multiply the x and y coordinates (relative to the buffer's min) by stride0 and stride1 and add the results to get the offset from the host pointer of the value you care about.
Related
NURBS derivative using de Boor's algorithm
At the bottom of De Boor's Algorithm, it is said that De Boor's algorithm also works for NURBS curves. We just multiply every control point by its weight converting the NURBS curve to a 4D B-spline curve, perform de Boor's algorithm on this 4D B-spline curve, and then project the resulting curve back by dividing the first three components with the fourth and keeping the fourth component as its new weight. Then modifying the code from B-Spline derivative using de Boor's algorithm, I came up with the following. import numpy as np import math as m weights = [0.3, 1, 1, 2, 1, 1, 0.5, 1, 1, 3, 1] def deBoor(k, x, t, c_, p): c = [] for point, w in zip(c_, weights): c.append([point[0]*w, point[1]*w, point[2]*w, w]) c = np.array(c) d = [c[j + k - p] for j in range(0, p+1)] for r in range(1, p+1): for j in range(p, r-1, -1): alpha = (x - t[j+k-p]) / (t[j+1+k-r] - t[j+k-p]) d[j] = (1.0 - alpha) * d[j-1] + alpha * d[j] return np.array([ d[p][0] / d[p][3], d[p][1] / d[p][3], d[p][2] / d[p][3] ]) def deBoorDerivative(k, x, t, c_, p): c = [] for point, w in zip(c_, weights): c.append([point[0]*w, point[1]*w, point[2]*w, w]) c = np.array(c) q = [p * (c[j+k-p+1] - c[j+k-p]) / (t[j+k+1] - t[j+k-p+1]) for j in range(0, p)] for r in range(1, p): for j in range(p-1, r-1, -1): right = j+1+k-r left = j+k-(p-1) alpha = (x - t[left]) / (t[right] - t[left]) q[j] = (1.0 - alpha) * q[j-1] + alpha * q[j] return np.array([ q[p-1][0] / q[p-1][3], q[p-1][1] / q[p-1][3], q[p-1][2] / q[p-1][3] ]) def finiteDifferenceDerivative(k, x, t, c, p): f = lambda xx : deBoor(k, xx, t, c, p) dx = 1e-7 return (- f(x + 2 * dx) \ + 8 * f(x + dx) \ - 8 * f(x - dx) \ + f(x - 2 * dx)) / ( 12 * dx ) points = np.array([[i, m.sin(i / 3.0), m.cos(i / 2)] for i in range(0, 11)]) knots = np.array([0, 0, 0, 0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0, 1.0, 1.0, 1.0]) a = deBoorDerivative(7, 0.44, knots, points, 3) b = finiteDifferenceDerivative(7, 0.44, knots, points, 3) print(a) print(b) Although the derivative calculated 
from finite differences is not the same as the one obtained with de Boor's algorithm. De Boor: [ 9.125 1.02221755 -2.22839545] Finite differences: [16.85238398 0.14138772 -5.90135073]
Solved it. This computes the derivative (velocity) and the position (point) at t at once using deboors algorithm, (written in C). typedef struct vec3 { double x, y, z; } vec3_t; typedef struct vec4 { double x, y, z, w; } vec4_t; vec4_t vec4homo (vec3_t u, double w) { return (vec4_t){u.x * w, u.y * w, u.z * w, w }; } vec4_t vec4add (vec4_t u, vec4_t v) { return (vec4_t){u.x + v.x, u.y + v.y, u.z + v.z, u.w + v.w}; } vec4_t vec4sub (vec4_t u, vec4_t v) { return (vec4_t){u.x - v.x, u.y - v.y, u.z - v.z, u.w - v.w}; } vec4_t vec4mul (vec4_t u, double s) { return (vec4_t){u.x * s, u.y * s, u.z * s, u.w * s }; } vec4_t vec4div (vec4_t u, double s) { return (vec4_t){u.x / s, u.y / s, u.z / s, u.w / s }; } vec3_t vec4trunc (vec4_t u) { return (vec3_t){u.x, u.y, u.z }; } vec3_t vecadd (vec3_t u, vec3_t v) { return (vec3_t){u.x + v.x, u.y + v.y, u.z + v.z}; } vec3_t vecsub (vec3_t u, vec3_t v) { return (vec3_t){u.x - v.x, u.y - v.y, u.z - v.z}; } vec3_t vecmul (vec3_t u, double s) { return (vec3_t){u.x * s, u.y * s, u.z * s }; } vec3_t vecdiv (vec3_t u, double s) { return (vec3_t){u.x / s, u.y / s, u.z / s }; } typedef struct pv { vec3_t position; vec3_t velocity; } pv_t; typedef struct nurbs { vec3_t P[100]; double w[100]; double U[100]; int p; int m; int n; } nurbs_t; int findspan(double* U, double t, int n, int p) { if(t >= U[n]) { return n - 1; } if(t <= U[p]) { return p; } int low = p; int high = n; int mid = (low + high) / 2; while(t < U[mid] || t >= U[mid+1]) { if(t < U[mid]) { high = mid; } else { low = mid; } mid = (low + high) / 2; } return mid; } pv_t nurbs_deboor(double t, nurbs_t* func) { vec3_t* P = func->P; double* U = func->U; double* w = func->w; int p = func->p; int m = func->m; int n = func->n; int k = findspan(U, t, n, p); vec4_t d[30]; vec4_t q[30]; for(int i = 0; i < p + 1; i++) { d[i] = vec4homo(P[i+k-p], w[i+k-p]); if(!(i < p)) { continue; } q[i] = vec4mul(vec4sub(vec4homo(P[i+k-p+1], w[i+k-p+1]), vec4homo(P[i+k-p], w[i+k-p])), p); q[i] = 
vec4div(q[i], U[i+k+1] - U[i+k-p+1]); } for(int r = 1; r < p + 1; r++) { for(int j = p; j > r - 1; j--) { double alpha = (t - U[j+k-p]) / (U[j+1+k-r] - U[j+k-p]); d[j] = vec4add(vec4mul(d[j-1], 1.0-alpha), vec4mul(d[j], alpha)); if(!(r < p && j < p)) { continue; } alpha = (t - U[j+k-p+1]) / (U[j+1+k-r] - U[j+k-p+1]); q[j] = vec4add(vec4mul(q[j-1], 1.0-alpha), vec4mul(q[j], alpha)); } } pv_t pv; pv.position = vecdiv(vec4trunc(d[p]), d[p].w); pv.velocity = vecdiv(vecsub(vec4trunc(q[p-1]), vecmul(pv.position, q[p-1].w)), d[p].w); return pv; }
Dithering (Floyd-Steinberg) only updates part of graphics object in p5.js
I'm trying to implement Floyd-Steinberg dithering in a P5.js sketch by pre-dithering a bunch of circles in a graphics object (in setup) and then drawing them later. However, I keep running into the issue where only part of the circle is dithered, and the rest looks normal. Any suggestions are welcome as I'm really stumped as to what is going on. setup(): let circs; function setup() { //... createCanvas(1000,1000); let size = 200; circs = []; circs.push({ gfx: createGraphics(size, size), size: size, color: color(random(255)) }); for (let i = 0; i < circs.length; i++) dither(circs[i]); // ... } draw(): function draw() { if (!paused) { background(bg); drawShadow(4); // just a call to the drawingContext shadow for (let i = 0; i < circs.length; i++) { push(); translate(width / 2, height / 2); imageMode(CENTER); image(circs[i].gfx, 0, 0); pop(); } } } floyd-steinberg - based on https://openprocessing.org/sketch/1192123 function index(x, y, g) { return (x + y * g.width) * 4; } function dither(g) { g.loadPixels(); for (let y = 0; y < g.height - 1; y++) { for (let x = 1; x < g.width - 1; x++) { let oldr = g.pixels[index(x, y, g)]; let oldg = g.pixels[index(x, y, g) + 1]; let oldb = g.pixels[index(x, y, g) + 2]; let factor = 1.0; let newr = round((factor * oldr) / 255) * (255 / factor); let newg = round((factor * oldg) / 255) * (255 / factor); let newb = round((factor * oldb) / 255) * (255 / factor); g.pixels[index(x, y, g)] = newr; g.pixels[index(x, y, g) + 1] = newg; g.pixels[index(x, y, g) + 2] = newb; g.pixels[index(x + 1, y, g)] += ((oldr - newr) * 7) / 16.0; g.pixels[index(x + 1, y, g) + 1] += ((oldr - newr) * 7) / 16.0; g.pixels[index(x + 1, y, g) + 2] += ((oldr - newr) * 7) / 16.0; g.pixels[index(x - 1, y + 1, g)] += ((oldr - newr) * 3) / 16.0; g.pixels[index(x - 1, y + 1, g) + 1] += ((oldr - newr) * 3) / 16.0; g.pixels[index(x - 1, y + 1, g) + 2] += ((oldr - newr) * 3) / 16.0; g.pixels[index(x, y + 1, g)] += ((oldr - newr) * 5) / 16.0; g.pixels[index(x, y + 1, g) + 
1] += ((oldr - newr) * 5) / 16.0; g.pixels[index(x, y + 1, g) + 2] += ((oldr - newr) * 5) / 16.0; g.pixels[index(x + 1, y + 1, g)] += ((oldr - newr) * 1) / 16.0; g.pixels[index(x + 1, y + 1, g) + 1] += ((oldr - newr) * 1) / 16.0; g.pixels[index(x + 1, y + 1, g) + 2] += ((oldr - newr) * 1) / 16.0; } } g.updatePixels(); } I'm not sure what I'm missing as the dithering algorithm loops over the height and width and then should be updating, but I think I'm missing something.
p5.Graphics objects have a pixelDensity inherited from the sketch. When the pixel density is > 1 as it is for high DPI displays you need to account for this when you are computing your pixels indices: function index(x, y, g) { const d = g.pixelDensity(); return (x + y * g.width * d) * 4; } And when you are processing pixels you will need to double the maximum values for x and y. Here's a demonstration of the effects of pixelDensity (and whether or not you handle it): let g; function setup() { createCanvas(400, 400); g = createGraphics(width, height); redrawGraphics(); noLoop(); setInterval( () => { redrawGraphics(frameCount % 2); redraw(); }, 2000 ); } function index(x, y, g, d) { return (x + y * g.width * d) * 4; } function redrawGraphics(hdpi) { const d = hdpi ? pixelDensity() : 1; g.background(0); g.loadPixels(); for (let y = 0; y < height * 2; y++) { for (let x = 0; x < width * 2; x++) { let ix = index(x, y, g, d); let r = map(sin((x - y) / width * TWO_PI), -1, 1, 0, 255); g.pixels[ix] = r; g.pixels[ix + 1] = 0; g.pixels[ix + 2] = 0; g.pixels[ix + 3] = 255; } } g.updatePixels(); } function draw() { image(g, 0, 0); } <script src="https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.4.0/p5.js"></script>
3D Simplex Noise Sudden Height Change
I have a problem generating 3D Noise. I've written a framework that uses DirectX11 to render everything. I generate a Geo-sphere and modify the height values using a 3D Simplex Noise function. The problem is that when I see the result I see sudden changes in height that are not noise like at all... (the rectangle shape in the center of the picture) I've modified the Persistence to 0.1 so the error is easily seen here... I can't figure out the issue with the sudden height changes. This is an error, right? I calculate the height with the following... for( int i = 0; i < sphere.Vertices.size(); ++i ) { // seperate out our positions float x = sphere.Vertices[ i ].Position.x; float y = sphere.Vertices[ i ].Position.y; float z = sphere.Vertices[ i ].Position.z; // get our noise value ( -1 to 1 ) float ix = noise.octavenoise3D( 10, 0.1, 0.5, x, y, z, perm, &grad3[0][0] ); // pack our coordinates into a vector XMVECTOR curPos = { x, y, z }; // get the normalized vector of our position XMVECTOR normPos = XMVector3Normalize( curPos ); // seperate our normalzed x y and z float normX = XMVectorGetX( normPos ); float normY = XMVectorGetY( normPos ); float normZ = XMVectorGetZ( normPos ); // figure out the height of this specific vertice, maxHeight = sphereRadius / 3.0f; float height = ix * maxHeight; float change = height + sphereRadius; // calculate the offset x y and z by the noise float changeX = change * normX; float changeY = change * normY; float changeZ = change * normZ; // save our new x y and z vertices[ i ].Pos.x = x + changeX; vertices[ i ].Pos.y = y + changeY; vertices[ i ].Pos.z = z + changeZ; // calculate color based on noise value float colorChange = ( 0.5f * ix ); float color = 0.5f + colorChange; // save color value in r g b vertices[ i ].Color.x = color; vertices[ i ].Color.y = color; vertices[ i ].Color.z = color; } Also, changing the base coordinates of the function doesn't get rid of this weird output. 
(for anyone who thinks that starting at 0,0,0 was messing it up somehow) Noise Implementation float Noise::octavenoise3D( const float octaves, const float persistence, const float scale, const float x, const float y, const float z, int *perm, int *grad3 ) float total = 0; float frequency = scale; float amplitude = 1; float maxAmplitude = 0; for( int i = 0; i < octaves; i++ ) { total = total + rawnoise3D( x * frequency, y * frequency, z * frequency, perm, grad3 ) * amplitude; frequency = frequency * 2; maxAmplitude = maxAmplitude + amplitude; amplitude = amplitude * persistence; } return total / maxAmplitude; float Noise::rawnoise3D( const float x, const float y, const float z, int *perm, int *grad3 ) float n0, n1, n2, n3; float F3 = 1.0 / 3.0; float s = ( x + y + z ) * F3; int i = fastfloor( x + s ); int j = fastfloor( y + s ); int k = fastfloor( z + s ); float G3 = 1.0 / 6.0; float t = ( i + j + k ) * G3; float X0 = i - t; float Y0 = j - t; float Z0 = k - t; float x0 = x - X0; float y0 = y - Y0; float z0 = z - Z0; int i1, j1, k1; int i2, j2, k2; if( x0 >= y0 ) { if( y0 >= z0 ) { i1 = 0; j1 = 0; k1 = 1; i2 = 1; j2 = 1; k2 = 0; } else if( x0 >= z0 ) { i1 = 1; j1 = 0; k1 = 0; i2 = 1; j2 = 0; k2 = 1; } else { i1 = 0; j1 = 0; k1 = 1; i2 = 1; j2 = 0; k2 = 1; } } else { if( y0 < z0 ) { i1 = 0; j1 = 0; k1 = 1; i2 = 0; j2 = 1; k2 = 0; } else if( x0 < z0 ) { i1 = 0; j1 = 1; k1 = 0; i2 = 0; j2 = 1; k2 = 1; } else { i1 = 0; j1 = 1; k1 = 0; i2 = 1; j2 = 1; k2 = 0; } } float x1 = x0 - i1 + G3; float y1 = y0 - j1 + G3; float z1 = z0 - k1 + G3; float x2 = x0 - i2 + 2.0 * G3; float y2 = y0 - j2 + 2.0 *G3; float z2 = z0 - k2 + 2.0 *G3; float x3 = x0 - 1.0 + 3.0 * G3; float y3 = y0 - 1.0 + 3.0 * G3; float z3 = z0 - 1.0 + 3.0 * G3; int ii = i & 255; int jj = j & 255; int kk = k & 255; int gi0 = perm[ ii + perm[ jj + perm[ kk ] ] ] % 12; int gi1 = perm[ ii+ i1 + perm[ jj + j1 + perm[ kk + k1 ] ] ] % 12; int gi2 = perm[ ii + i2 + perm[ jj + j2 + perm[ kk + k2 ] ] ] % 12; int gi3 = 
perm[ ii + 1 + perm[ jj + 1 + perm[ kk + 1 ] ] ] % 12; float t0 = 0.6 - ( x0 * x0 ) - ( y0 * y0 ) - ( z0 * z0 ); if( t0 < 0 ) { n0 = 0.0; } else { t0 = t0 * t0; n0 = ( t0 * t0 ) * dot( &grad3[ gi0 ], x0, y0, z0); } float t1 = 0.6 - ( x1 * x1 ) - ( y1 * y1 ) - ( z1 * z1 ); if( t1 < 0 ) { n1 = 0.0; } else { t1 *= t1; n1 = ( t1 * t1 ) * dot( &grad3[ gi1 ], x1, y1, z1); } float t2 = 0.6 - ( x2 * x2 ) - ( y2 * y2 ) - ( z2 * z2 ); if( t2 < 0 ) { n2 = 0.0; } else { t2 *= t2; n2 = ( t2 * t2 ) * dot( &grad3[ gi2 ], x2, y2, z2); } float t3 = 0.6 - ( x3 * x3 ) - ( y3 * y3 ) - ( z3 * z3 ); if( t3 < 0 ) { n3 = 0.0; } else { t3 = t3 * t3; n3 = t3 * t3 * dot( &grad3[ gi3 ], x3, y3, z3); } float final = 32.0 * ( n0 + n1 + n2 + n3 ); return final; int Noise::fastfloor( const float x ) return x > 0 ? (int)x : (int)x - 1; float Noise::dot( const int* g, const float x, const float y, const float z ) return g[0]*x + g[1]*y + g[2]*z;
I found the solution... Solution: I was silly. I had a dumb error in my raw noise function but i fixed it now This is the fixed part of the code if( x0 >= y0 ) { if( y0 >= z0 ) { i1 = 1; j1 = 0; k1 = 0; i2 = 1; j2 = 1; k2 = 0; } else if( x0 >= z0 ) { i1 = 1; j1 = 0; k1 = 0; i2 = 1; j2 = 0; k2 = 1; } else { i1 = 0; j1 = 0; k1 = 1; i2 = 1; j2 = 0; k2 = 1; } } else { if( y0 < z0 ) { i1 = 0; j1 = 0; k1 = 1; i2 = 0; j2 = 1; k2 = 1; } else if( x0 < z0 ) { i1 = 0; j1 = 1; k1 = 0; i2 = 0; j2 = 1; k2 = 1; } else { i1 = 0; j1 = 1; k1 = 0; i2 = 1; j2 = 1; k2 = 0; } } Turned out I had several incorrect values in the old one.
CUDA Image Rotation
I am having trouble implementing image rotation in CUDA. I have a very simple Rotate function working as follows: __device__ float readPixVal( float* ImgSrc,int ImgWidth,int x,int y) { return (float)ImgSrc[y*ImgWidth+x]; } __device__ void putPixVal( float* ImgSrc,int ImgWidth,int x,int y, float floatVal) { ImgSrc[y*ImgWidth+x] = floatVal; } __global__ void Rotate(float* Source, float* Destination, int sizeX, int sizeY, float deg) { int i = blockIdx.x * blockDim.x + threadIdx.x;// Kernel definition int j = blockIdx.y * blockDim.y + threadIdx.y; if(i < sizeX && j < sizeY) { putPixVal(Destination, sizeX, ((float)i)*cos(deg) - ((float)j)*sin(deg), ((float)i)*sin(deg) + ((float)j)*cos(deg)), readPixVal(Source, sizeX, i, j)); } } The problem is, I do not know how to do any interpolation. With the above, many pixels are skipped due to integer roundoff. Anyone know how to fix this, or are there any free/opensource implementations of image rotate? I could not find any for CUDA.
Generally in this sort of image manipulation you loop over all destination pixel positions calculating the corresponding pixel (or interpolating groups of pixels) in the source image. This ensures that you evenly and uniformly fill the resulting image which is normally what you care about.
void rotateImage_Kernel(cufftComplex* trg, const cufftComplex* src, const unsigned int imageWidth,const unsigned int imageHeight, const float angle, const float scale) { // compute thread dimension const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; //// compute target address const unsigned int idx = x + y * imageWidth; const int xA = (x - imageWidth/2 ); const int yA = (y - imageHeight/2 ); const int xR = (int)floor(1.0f/scale * (xA * cos(angle) - yA * sin(angle))); const int yR = (int)floor(1.0f/scale * (xA * sin(angle) + yA * cos(angle))); float src_x = xR + imageWidth/2; float src_y = yR + imageHeight/2; if ( src_x >= 0.0f && src_x < imageWidth && src_y >= 0.0f && src_y < imageHeight) { // BI - LINEAR INTERPOLATION float src_x0 = (float)(int)(src_x); float src_x1 = (src_x0+1); float src_y0 = (float)(int)(src_y); float src_y1 = (src_y0+1); float sx = (src_x-src_x0); float sy = (src_y-src_y0); int idx_src00 = min(max(0.0f,src_x0 + src_y0 * imageWidth),imageWidth*imageHeight-1.0f); int idx_src10 = min(max(0.0f,src_x1 + src_y0 * imageWidth),imageWidth*imageHeight-1.0f); int idx_src01 = min(max(0.0f,src_x0 + src_y1 * imageWidth),imageWidth*imageHeight-1.0f); int idx_src11 = min(max(0.0f,src_x1 + src_y1 * imageWidth),imageWidth*imageHeight-1.0f); trg[idx].y = 0.0f; trg[idx].x = (1.0f-sx)*(1.0f-sy)*src[idx_src00].x; trg[idx].x += ( sx)*(1.0f-sy)*src[idx_src10].x; trg[idx].x += (1.0f-sx)*( sy)*src[idx_src01].x; trg[idx].x += ( sx)*( sy)*src[idx_src11].x; } else { trg[idx].x = 0.0f; trg[idx].y = 0.0f; } DEVICE_METHODE_LAST_COMMAND; } void translateImage_Kernel(cufftComplex* trg, const cufftComplex* src, const unsigned int imageWidth, const unsigned int imageHeight, const float tX, const float tY) { // compute thread dimension const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; //// compute target address const unsigned int idx 
= x + y * imageWidth; const int xB = ((int)x + (int)tX ); const int yB = ((int)y + (int)tY ); if ( xB >= 0 && xB < imageWidth && yB >= 0 && yB < imageHeight) { trg[idx] = src[xB + yB * imageWidth]; } else { trg[idx].x = 0.0f; trg[idx].y = 0.0f; } DEVICE_METHODE_LAST_COMMAND; }
This seems to do the trick __global__ void Rotate(float* Source, float* Destination, int sizeX, int sizeY, float deg) { int i = blockIdx.x * blockDim.x + threadIdx.x;// Kernel definition int j = blockIdx.y * blockDim.y + threadIdx.y; int xc = sizeX - sizeX/2; int yc = sizeY - sizeY/2; int newx = ((float)i-xc)*cos(deg) - ((float)j-yc)*sin(deg) + xc; int newy = ((float)i-xc)*sin(deg) + ((float)j-yc)*cos(deg) + yc; if (newx >= 0 && newx < sizeX && newy >= 0 && newy < sizeY) { putPixVal(Destination, sizeX, i , j, readPixVal(Source, sizeX, newx, newy)); } }
Is there a name for this sampling algorithm used in Minicraft?
For Ludum Dare 22, Notch programmed a game in 48 hours called Minicraft. It's like a 2D minecraft. Anyway the source is available (here: http://www.ludumdare.com/compo/ludum-dare-22/?action=preview&uid=398 ), and I was taking a look since I am interested in random generation of terrain and levels. In the code is a block of code which runs the core generation, and the algorithm to me seems familiar, but I can't put a name to it. I'd like to know exactly what it is so I can read more about it and learn how it works. Specifically, the code is from levelGen.java: do { int halfStep = stepSize / 2; for (int y = 0; y < w; y += stepSize) { for (int x = 0; x < w; x += stepSize) { double a = sample(x, y); double b = sample(x + stepSize, y); double c = sample(x, y + stepSize); double d = sample(x + stepSize, y + stepSize); double e = (a + b + c + d) / 4.0 + (random.nextFloat() * 2 - 1) * stepSize * scale; setSample(x + halfStep, y + halfStep, e); } } for (int y = 0; y < w; y += stepSize) { for (int x = 0; x < w; x += stepSize) { double a = sample(x, y); double b = sample(x + stepSize, y); double c = sample(x, y + stepSize); double d = sample(x + halfStep, y + halfStep); double e = sample(x + halfStep, y - halfStep); double f = sample(x - halfStep, y + halfStep); double H = (a + b + d + e) / 4.0 + (random.nextFloat() * 2 - 1) * stepSize * scale * 0.5; double g = (a + c + d + f) / 4.0 + (random.nextFloat() * 2 - 1) * stepSize * scale * 0.5; setSample(x + halfStep, y, H); setSample(x, y + halfStep, g); } } stepSize /= 2; scale *= (scaleMod + 0.8); scaleMod *= 0.3; } while (stepSize > 1); Those two for loops are running some kind of sampling algorithm, and I would just like to know if this is known named algorithm, or if notch just rolled his own.
This looks like the diamond-square algorithm.