Halide JIT vs Generator Differences - halide

While playing around with Halide, I noticed that completely different pseudocode is generated for the same pipeline depending on whether I use the JIT or the generator (ahead-of-time) approach. It looks like I'm missing something, so I'd very much appreciate any hint. Here is what I did:
A simple 'dilate' pipeline is defined as:
// Builds the 3x3 'dilate' pipeline on a 1280x1024 8-bit image filled with
// pseudo-random bytes, realizes it once for the JIT target taken from the
// environment, then prints the loop nest and writes the lowered statement to
// "dilate_1_.html".
// Returns 0 once all of the above has run.
int jit_main ()
{
    Target target = get_jit_target_from_environment ();
    const int width = 1280, height = 1024;

    // Input image: every pixel is the low byte of rand().
    Buffer <uint8_t> input (width, height);
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            input (x, y) = rand () & 0xff;
        }
    }

    Var x ("x_1"), y ("y_1");

    // Reads outside the image repeat the nearest edge pixel.
    Func clamped ("clamped_1");
    clamped = BoundaryConditions::repeat_edge (input);

    // Horizontal 3-tap maximum.
    Func max_x ("max_x_1");
    max_x (x, y) = max (clamped (x - 1, y), clamped (x, y), clamped (x + 1, y));

    // Vertical 3-tap maximum of the horizontal maxima, i.e. a 3x3 dilation.
    Func dilate ("dilate_1");
    dilate (x, y) = max (max_x (x, y - 1), max_x (x, y), max_x (x, y + 1));

    // tick() is defined elsewhere; presumably a timing helper bracketing the
    // realize call -- TODO confirm.
    tick (NULL);
    Buffer<uint8_t> out = dilate.realize (width, height, target);
    tick ("inline");

    dilate.print_loop_nest ();
    dilate.compile_to_lowered_stmt ("dilate_1_.html", {}, HTML);

    // Flowing off the end of a non-void function is undefined behaviour.
    return 0;
}
The resulting pseudocode looks as follows (fragment):
produce dilate_1 {
let t125 = ((dilate_1.min.1 * dilate_1.stride.1) + dilate_1.min.0)
for (dilate_1.s0.y_1, dilate_1.min.1, dilate_1.extent.1) {
let t128 = max(min(dilate_1.s0.y_1, 1024), 1)
let t126 = max(min(dilate_1.s0.y_1, 1023), 0)
let t127 = max(min(dilate_1.s0.y_1, 1022), -1)
let t129 = ((dilate_1.s0.y_1 * dilate_1.stride.1) - t125)
for (dilate_1.s0.x_1, dilate_1.min.0, dilate_1.extent.0) {
dilate_1[(dilate_1.s0.x_1 + t129)] = max(b0[((max(min(dilate_1.s0.x_1, 1278), -1) + (t126 * 1280)) + 1)], max(b0[(max(min(dilate_1.s0.x_1, 1279), 0) + (t126 * 1280))], max(b0[((max(min(dilate_1.s0.x_1, 1280), 1) + (t126 * 1280)) + -1)], max(b0[((max(min(dilate_1.s0.x_1, 1280), 1) + (t127 * 1280)) + 1279)], max(b0[((max(min(dilate_1.s0.x_1, 1279), 0) + (t127 * 1280)) + 1280)], max(b0[((max(min(dilate_1.s0.x_1, 1278), -1) + (t127 * 1280)) + 1281)], max(b0[((max(min(dilate_1.s0.x_1, 1280), 1) + (t128 * 1280)) + -1281)], max(b0[((max(min(dilate_1.s0.x_1, 1279), 0) + (t128 * 1280)) + -1280)], b0[((max(min(dilate_1.s0.x_1, 1278), -1) + (t128 * 1280)) + -1279)]))))))))
}
}
}
Then I defined a generator:
// Generator version of the 3x3 dilate pipeline: an 8-bit input buffer is read
// through a repeat-edge boundary condition, reduced with a horizontal 3-tap
// maximum and then a vertical 3-tap maximum into the 8-bit output buffer.
class Dilate0Generator : public Halide::Generator <Dilate0Generator>
{
public:
    Input<Buffer<uint8_t>> input_0 {"input_0", 2};
    Output<Buffer<uint8_t>> dilate_0 {"dilate_0", 2};
    Var x {"x_0"};
    Var y {"y_0"};

    // Defines the pipeline stages and prints the resulting loop nest.
    void generate ()
    {
        // Reads outside the input repeat the nearest edge pixel.
        Func edge_clamped {"clamped_0"};
        edge_clamped = BoundaryConditions::repeat_edge (input_0);

        // Maximum over the left, centre and right neighbours.
        Func row_max {"max_x_0"};
        row_max (x, y) = max (edge_clamped (x - 1, y),
                              edge_clamped (x, y),
                              edge_clamped (x + 1, y));

        // Maximum over the rows above, at and below the current row.
        dilate_0 (x, y) = max (row_max (x, y - 1),
                               row_max (x, y),
                               row_max (x, y + 1));

        dilate_0.print_loop_nest ();
    }
};
HALIDE_REGISTER_GENERATOR (Dilate0Generator, dilate_0)
And its pseudocode is completely different (fragment):
produce dilate_0 {
let dilate_0.s0.y_0.prologue = min(max((input_0.min.1 + 1), dilate_0.min.1), (dilate_0.extent.1 + dilate_0.min.1))
let dilate_0.s0.y_0.epilogue$3 = min(max(max((input_0.min.1 + 1), dilate_0.min.1), ((input_0.extent.1 + input_0.min.1) + -1)), (dilate_0.extent.1 + dilate_0.min.1))
let t166 = (dilate_0.s0.y_0.prologue - dilate_0.min.1)
let t168 = ((input_0.min.1 * input_0.stride.1) + input_0.min.0)
let t170 = ((dilate_0.min.1 * dilate_0.stride.1) + dilate_0.min.0)
let t167 = (input_0.extent.1 + input_0.min.1)
let t169 = (input_0.extent.0 + input_0.min.0)
for (dilate_0.s0.y_0, dilate_0.min.1, t166) {
let t171 = ((max(min((t167 + -1), dilate_0.s0.y_0), input_0.min.1) * input_0.stride.1) - t168)
let t173 = ((max((min((dilate_0.s0.y_0 + 2), t167) + -1), input_0.min.1) * input_0.stride.1) - t168)
let t174 = ((max((min(dilate_0.s0.y_0, t167) + -1), input_0.min.1) * input_0.stride.1) - t168)
let t175 = ((dilate_0.s0.y_0 * dilate_0.stride.1) - t170)
for (dilate_0.s0.x_0, dilate_0.min.0, dilate_0.extent.0) {
dilate_0[(dilate_0.s0.x_0 + t175)] = (let t132 = max((min((dilate_0.s0.x_0 + 2), t169) + -1), input_0.min.0) in (let t133 = max(min((t169 + -1), dilate_0.s0.x_0), input_0.min.0) in (let t134 = max((min(dilate_0.s0.x_0, t169) + -1), input_0.min.0) in max(input_0[(t132 + t171)], max(input_0[(t133 + t171)], max(input_0[(t134 + t171)], max(input_0[(t134 + t173)], max(input_0[(t133 + t173)], max(input_0[(t132 + t173)], max(input_0[(t134 + t174)], max(input_0[(t133 + t174)], input_0[(t132 + t174)])))))))))))
}
}
let t183 = (dilate_0.extent.0 + dilate_0.min.0)
let t184 = (input_0.extent.0 + input_0.min.0)
let t185 = max((input_0.min.0 + 1), dilate_0.min.0)
let t178 = min(max((t184 + -1), t185), t183)
let t177 = min(t183, t185)
let t176 = (dilate_0.s0.y_0.epilogue$3 - dilate_0.s0.y_0.prologue)
let t179 = ((input_0.min.1 * input_0.stride.1) + input_0.min.0)
let t181 = ((dilate_0.min.1 * dilate_0.stride.1) + dilate_0.min.0)
for (dilate_0.s0.y_0, dilate_0.s0.y_0.prologue, t176) {
let t189 = (((dilate_0.s0.y_0 + 1) * input_0.stride.1) - t179)
let t190 = (((dilate_0.s0.y_0 + -1) * input_0.stride.1) - t179)
let t187 = ((dilate_0.s0.y_0 * input_0.stride.1) - t179)
let t191 = ((dilate_0.s0.y_0 * dilate_0.stride.1) - t181)
let t186 = (t177 - dilate_0.min.0)
for (dilate_0.s0.x_0, dilate_0.min.0, t186) {
dilate_0[(dilate_0.s0.x_0 + t191)] = (let t140 = max((min((dilate_0.s0.x_0 + 2), t184) + -1), input_0.min.0) in (let t141 = max(min((t184 + -1), dilate_0.s0.x_0), input_0.min.0) in (let t142 = max((min(dilate_0.s0.x_0, t184) + -1), input_0.min.0) in max(input_0[(t140 + t187)], max(input_0[(t141 + t187)], max(input_0[(t142 + t187)], max(input_0[(t142 + t189)], max(input_0[(t141 + t189)], max(input_0[(t140 + t189)], max(input_0[(t142 + t190)], max(input_0[(t141 + t190)], input_0[(t140 + t190)])))))))))))
}
let t194 = (((dilate_0.s0.y_0 + 1) * input_0.stride.1) - t179)
let t195 = (((dilate_0.s0.y_0 + -1) * input_0.stride.1) - t179)
let t193 = ((dilate_0.s0.y_0 * input_0.stride.1) - t179)
let t196 = ((dilate_0.s0.y_0 * dilate_0.stride.1) - t181)
let t192 = (t178 - t177)
for (dilate_0.s0.x_0, t177, t192) {
dilate_0[(dilate_0.s0.x_0 + t196)] = max(input_0[((dilate_0.s0.x_0 + t193) + 1)], max(input_0[(dilate_0.s0.x_0 + t193)], max(input_0[((dilate_0.s0.x_0 + t193) + -1)], max(input_0[((dilate_0.s0.x_0 + t194) + -1)], max(input_0[(dilate_0.s0.x_0 + t194)], max(input_0[((dilate_0.s0.x_0 + t194) + 1)], max(input_0[((dilate_0.s0.x_0 + t195) + -1)], max(input_0[(dilate_0.s0.x_0 + t195)], input_0[((dilate_0.s0.x_0 + t195) + 1)]))))))))
}
let t200 = (((dilate_0.s0.y_0 + 1) * input_0.stride.1) - t179)
let t201 = (((dilate_0.s0.y_0 + -1) * input_0.stride.1) - t179)
let t198 = ((dilate_0.s0.y_0 * input_0.stride.1) - t179)
let t202 = ((dilate_0.s0.y_0 * dilate_0.stride.1) - t181)
let t197 = (t183 - t178)
for (dilate_0.s0.x_0, t178, t197) {
dilate_0[(dilate_0.s0.x_0 + t202)] = (let t152 = max((min((dilate_0.s0.x_0 + 2), t184) + -1), input_0.min.0) in (let t153 = max(min((t184 + -1), dilate_0.s0.x_0), input_0.min.0) in (let t154 = max((min(dilate_0.s0.x_0, t184) + -1), input_0.min.0) in max(input_0[(t152 + t198)], max(input_0[(t153 + t198)], max(input_0[(t154 + t198)], max(input_0[(t154 + t200)], max(input_0[(t153 + t200)], max(input_0[(t152 + t200)], max(input_0[(t154 + t201)], max(input_0[(t153 + t201)], input_0[(t152 + t201)])))))))))))
}
}
let t203 = ((dilate_0.extent.1 + dilate_0.min.1) - dilate_0.s0.y_0.epilogue$3)
let t205 = ((input_0.min.1 * input_0.stride.1) + input_0.min.0)
let t207 = ((dilate_0.min.1 * dilate_0.stride.1) + dilate_0.min.0)
let t204 = (input_0.extent.1 + input_0.min.1)
let t206 = (input_0.extent.0 + input_0.min.0)
for (dilate_0.s0.y_0, dilate_0.s0.y_0.epilogue$3, t203) {
let t208 = ((max(min((t204 + -1), dilate_0.s0.y_0), input_0.min.1) * input_0.stride.1) - t205)
let t210 = ((max((min((dilate_0.s0.y_0 + 2), t204) + -1), input_0.min.1) * input_0.stride.1) - t205)
let t211 = ((max((min(dilate_0.s0.y_0, t204) + -1), input_0.min.1) * input_0.stride.1) - t205)
let t212 = ((dilate_0.s0.y_0 * dilate_0.stride.1) - t207)
for (dilate_0.s0.x_0, dilate_0.min.0, dilate_0.extent.0) {
dilate_0[(dilate_0.s0.x_0 + t212)] = (let t161 = max((min((dilate_0.s0.x_0 + 2), t206) + -1), input_0.min.0) in (let t162 = max(min((t206 + -1), dilate_0.s0.x_0), input_0.min.0) in (let t163 = max((min(dilate_0.s0.x_0, t206) + -1), input_0.min.0) in max(input_0[(t161 + t208)], max(input_0[(t162 + t208)], max(input_0[(t163 + t208)], max(input_0[(t163 + t210)], max(input_0[(t162 + t210)], max(input_0[(t161 + t210)], max(input_0[(t163 + t211)], max(input_0[(t162 + t211)], input_0[(t161 + t211)])))))))))))
}
}
}
The generated version runs an order of magnitude faster, which is not surprising given that its pseudocode looks a lot more optimized.
It runs even faster than an existing example.
My noob question is: how come the JIT cannot create the same representation?
Thanks a lot for any answer/idea/help/hint...

The difference between the two is that in the JIT case, the size of the input (and thus the location of the boundary condition) is known at compile-time.
However the generated code should be similar. I think the fact that you don't get five separate cases in the JIT case is a bug in Halide. I have opened an issue on the Halide github repo.
https://github.com/halide/Halide/issues/5353
EDIT: Thanks for uncovering a bug! Fixed in https://github.com/halide/Halide/pull/5355

Related

PCL transformation error about Quaternion to matrix4f

I use these camera extrinsic parameters to transform a .ply file through PCL, but the result is not correct. I think it is because my formula for converting the quaternion to a Matrix4f is not correct.
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
master
Eigen::Matrix4f transform_1 = Eigen::Matrix4f::Identity();
qw = 0.980613;
qx = -0.0777902;
qy = -0.176786;
qz = -0.0330758,
tx = -0.798112;
ty = -0.774293;
tz = 3.76053;
transform_1 (0, 0) = 1 - 2 * pow(qy, 2) - 2 * pow(qz, 2);
transform_1 (0, 1) = 2 * qx*qy - 2 * qz*qw;
transform_1 (0, 2) = 2 * qx*qz + 2 * qy*qw;
transform_1 (0, 3) = tx;
transform_1 (1, 0) = 2 * qx*qy + 2 * qz*qw;
transform_1 (1, 1) = 1 - 2 * pow(qx,2) - 2 * pow(qz,2);
transform_1 (1, 2) = 2 * qy*qz - 2 * qx*qw;
transform_1 (1, 3) = ty;
transform_1 (2, 0) = 2 * qx*qz - 2 * qy*qw;
transform_1 (2, 1) = 2 * qy*qz + 2 * qx*qw;
transform_1 (2, 2) = 1 - 2 * pow(qx,2) - 2 * pow(qy,2);
transform_1 (2, 3) = tz;
transform_1(3, 0) = 0;
transform_1(3, 1) = 0;
transform_1(3, 2) = 0;
transform_1(3, 3) = 1;
sub01
Eigen::Matrix4f transform_2 = Eigen::Matrix4f::Identity();
qw = 0.861117;
qx = -0.0716478;
qy = 0.427619;
qz = 0.265493,
tx = -2.94326;
ty = -1.91445;
tz = 6.074;
transform_2(0, 0) = 1 - 2 * pow(qy, 2) - 2 * pow(qz, 2);
transform_2(0, 1) = 2 * qx*qy - 2 * qz*qw;
transform_2(0, 2) = 2 * qx*qz + 2 * qy*qw;
transform_2(0, 3) = tx;
transform_2(1, 0) = 2 * qx*qy + 2 * qz*qw;
transform_2(1, 1) = 1 - 2 * pow(qx, 2) - 2 * pow(qz, 2);
transform_2(1, 2) = 2 * qy*qz - 2 * qx*qw;
transform_2(1, 3) = ty;
transform_2(2, 0) = 2 * qx*qz - 2 * qy*qw;
transform_2(2, 1) = 2 * qy*qz + 2 * qx*qw;
transform_2(2, 2) = 1 - 2 * pow(qx, 2) - 2 * pow(qy, 2);
transform_2(2, 3) = tz;
transform_2(3, 0) = 0;
transform_2(3, 1) = 0;
transform_2(3, 2) = 0;
transform_2(3, 3) = 1;
I also tried CloudCompare and got the same result.

When creating an angle, how do I control the attributes of the automatically created points?

I'm working with a polygon and attempting to create angles with labels but when angles are created, so are the points used to define them. This would be fine but I can't control the labels on the automatically created points (and I don't know what they are called or how to find out).
// Polygon vertices as [x, y] pairs.
var points = [
    [0, 0],
    [0, 5],
    [3, 0]
];
var angle;
// `let` gives every iteration its own k. With a shared (implicit global) k the
// label callback below would read the final loop value whenever it is invoked
// after the loop has finished.
for (let k = 0; k < showAngle.length; k++) {
    // Three consecutive vertices, wrapping around the end of the polygon.
    const a = points[k];
    const b = points[(k + 1) % points.length];
    const c = points[(k + 2) % points.length];
    if (showAngle[k] == 1) {
        // Angle with its default name as label.
        angle = board.create('angle', [a, b, c], {fixed: true});
    } else if (showAngle[k] == 2) {
        // Angle labelled with its size in degrees, one decimal place.
        angle = board.create('angle', [a, b, c], {
            fixed: false,
            name: function() {
                return ((180/Math.PI)*JXG.Math.Geometry.rad(a, b, c)).toFixed(1) + '°';
            }
        });
    }
}
https://jsfiddle.net/jscottuq/acyrLxfh/12/ contains what I've got so far.
The arrays showLen and showAngle are setting what labels are shown for each side/angle (0 - no label, 1 - name , 2 - measurement).
These will be set when the jsxgraph is created.
At the time being, the possibility to control the style of the newly created points of an angle is missing. We will add this soon.
However, a solution would be to use the already existing points, which are hidden in this example. For this it would be helpful to keep a list of these points, e.g. jxg_points:
// Create one hidden, fixed JSXGraph point per polygon vertex and keep it in
// jxg_points so later code can build angles from these existing points.
var jxg_points = [];
for (i = 0; i < points.length; i++) {
// Coordinate differences from vertex i to the next vertex (wrapping around).
var rise = points[(i + 1) % points.length][1] - points[i][1];
var run = points[(i + 1) % points.length][0] - points[i][0];
var point = board.create('point', [points[i][0], points[i][1]], {
fixed: true,
visible:false
});
jxg_points.push(point); // Store the point
// NOTE(review): pop() removes the last entry of points[i]. If the entries are
// plain [x, y] pairs this drops y, and the wrap-around read of points[0][1] in
// the last iteration (and any later use of `points`) would see undefined --
// confirm the intended shape of `points`.
points[i].pop();
// Length of the side starting at vertex i, rounded to two decimals.
len[i] = Math.round((Math.sqrt(rise * rise + run * run) + Number.EPSILON) * 100) / 100;
}
Then the points can be reused for the angles without creating new points:
var angle;
// `let` gives every iteration its own k, so the label callback below keeps
// referring to the points of its own angle when it is invoked after the loop.
for (let k = 0; k < showAngle.length; k++) {
    // Three consecutive, already existing JSXGraph points (wrapping around).
    const p1 = jxg_points[k];
    const p2 = jxg_points[(k + 1) % jxg_points.length];
    const p3 = jxg_points[(k + 2) % jxg_points.length];
    if (showAngle[k] == 1) {
        // Angle with its default name as label.
        angle = board.create('angle', [p1, p2, p3], {fixed: true});
    } else if (showAngle[k] == 2) {
        // Angle labelled with its size in degrees, one decimal place. The
        // measurement uses the same JSXGraph points the angle is built from
        // instead of the raw coordinate arrays in `points`.
        angle = board.create('angle', [p1, p2, p3], {
            fixed: false,
            name: function() {
                return ((180/Math.PI)*JXG.Math.Geometry.rad(p1, p2, p3)).toFixed(1) + '°';
            }
        });
    }
}
See it live at https://jsfiddle.net/d8an0epy/.

How can I optimise my code using "For-Loop"?

I am trying to develop code that calculates the local density of states (LDOS) of electrons in a material, for which I am using multiple For loops and multiple tables. It takes 45 seconds to complete, and I need it to be faster. Any suggestions on how to optimize this code?
(* Accumulates ldos over a grid of (kx, ky) values and reports the run time of the whole computation via AbsoluteTiming. *)
AbsoluteTiming[Ns=2; \[Eta] = 0.001;
Nx=15;
Ny=15;
NN=Nx*Ny;
Nband=8;
kkmx = Ns*Nx;
kkmy = Ns*Ny;
wmax = 0.2; nw = 800; p = 0;
Print["starting ldos calc"];
nsite = 2;
ldos = 0;
(* Outer loop over kx, inner loop over ky, each in steps of 2 Pi/kkmx and 2 Pi/kkmy. *)
For[kx = 0, kx <= (Ns - 1.)*2*(Pi/kkmx), kx += 2*(Pi/kkmx),
For[ky = 0, ky <= (Ns - 1.)*2*(Pi/kkmy), ky += 2*(Pi/kkmy),
(* NOTE(review): H is not defined in this snippet and nothing shown here makes it depend on kx or ky; if it really does not, Eigensystem[H] could be evaluated once before the loops - confirm. *)
ES = Eigensystem[H];
(* Eigenvalues, and squared magnitudes of selected eigenvector components (two index ranges offset by Nband/2*NN*2). *)
elist = Table[ES[[1,l]], {l, 1, Nband/2*4*NN}];
ulist = Table[Abs[ES[[2,l,i]]]^2, {l, 1, Nband/2*4*NN}, {i, 388+1, 388+(nsite - 1)*Nband/2 + Nband/2}];
vlist = Table[Abs[ES[[2,l,i + Nband/2*NN*2]]]^2, {l, 1, Nband/2*4*NN}, {i, 388+1, 388+(nsite - 1)*Nband/2 + Nband/2}];
(* For each of the nw+1 frequency samples in [-wmax, wmax], sum the broadened pole contributions over all l and keep the imaginary part. *)
ldossc = Table[Im[Total[Table[ulist[[l,1 ;; All]]*(1/(-wmax + wmax*2*(w/nw) - elist[[l]] + I*\[Eta])) +
vlist[[l,1 ;; All]]*(1/(-wmax + 2*wmax*(w/nw) + elist[[l]] + I*\[Eta])), {l, 1, Nband/2*4*NN}]]], {w, 0, nw}]; ldos = ldos + ldossc;
(* NOTE(review): this Export sits inside both For loops, so the file is rewritten on every (kx, ky) step; moving it after the loops looks safe if only the final ldos is needed - confirm. *)
Export["ldosorb_up_P.dat", Table[{-wmax + wmax*2*(\[Omega]/nw), (-Pi^(-1))*(ldos[[\[Omega] + 1,i]]/Ns^2)}, {\[Omega], 0, nw}, {i, 1,8}]];
(* Export["ldostot.dat", Table[{-wmax + wmax*2*(\[Omega]/nw), (-Pi^(-1))*((ldos[[\[Omega] + 1,i]] + ldos[[\[Omega] + 1,i + 1]] + ldos[[\[Omega] + 1,i + 2]] + ldos[[\[Omega] + 1,i + 3]] + ldos[[\[Omega] + 1,i + 4]])/Ns^2)}, {\[Omega], 0, nw}, {i, 1, (nsite - 1)*Nband/2 + Nband/2 - 4}]]; *)
Print["kx=", kx, " ky=", ky, " nsx=", (kx/(2*Pi))*kkmx + 1.]; ]; ]; ]```

scheduling common loop in discrete pipeline funcs

I have a number of halide pipelines (lowercase p) which all read the same input image and produce unique outputs. Some share common output dimensions, some do not. Every pipeline reads each pixel in the source image once. The output images needed may vary at runtime based on user input.
I'm using a Pipeline to compute all of these outputs into a Realization. Is there any way to schedule these disparate Funcs to achieve a single outer loop in the Pipeline?
It appears I can create a wrapper function which packs these Funcs into a Tuple, but this requires that they all have the same output dimensions.
Am I missing any other options?
Edited to Add Sample code
// Two per-scanline histogram pipelines over the same input: hist1/clamp1 use
// channel 0 only, hist2/clamp2 keep a separate histogram per channel c. Both
// outputs are combined into one Pipeline at the end.
// `input` is expected to be defined as sketched in the commented lines below.
//Buffer<> input = Buffer<uint8_t>::make_interleaved(width, height, 4);
//fill buffer with image data
Var x("x"), y("y"), c("c");
// Saturating cast of the input to 8 bits; its values index the histograms.
Func rgb("rgb");
rgb(x,y,c) = ConciseCasts::u8_sat(input(x,y,c));
// Define a one-dimensional reduction domain over x
RDom r(0, input.width());
Func hist1("hist1");
Func hist2("hist2");
// Histogram buckets start as zero.
hist1(x,y) = 0;
hist2(x,y,c) = 0;
// Make a histogram for every scanline of input
hist1(rgb(r, y, 0), y ) += 1;
hist2(rgb(r, y, c), y, c) += 1;
// Output stages: histogram counts saturated to 8 bits.
Func clamp1("clamp1");
clamp1(x,y) = ConciseCasts::u8_sat(hist1(x,y));
Func clamp2("clamp2");
clamp2(x,y,c) = ConciseCasts::u8_sat(hist2(x,y,c));
//use clamp1 as a wrapper
hist1.compute_at(clamp1, y);
//schedule hist2 the same way (but unroll c)
hist2.compute_at(clamp2, y);
// c is bounded to [0, 3), made the innermost loop and unrolled, for the pure
// definitions and for hist2's update stage.
clamp2.bound(c,0,3).reorder(c, x, y).unroll(c);
hist2.bound(c,0,3).reorder(c, x, y).unroll(c);
hist2.update(0).reorder(c, r, y).unroll(c);
// Both outputs cover 256 values in x and one row per input scanline in y.
clamp1
.bound(x, 0, 256)
.bound(y, 0, input.height());
clamp2
.bound(x, 0, 256)
.bound(y, 0, input.height());
// Realize both outputs together.
Pipeline pipe = Pipeline({clamp1, clamp2});
Looking at the lowered statement I see:
produce clamp1 {
for (clamp1.s0.y, 0, 2160) {
allocate hist1[int32 * 256 * 1]
produce hist1 {
for (hist1.s0.x, 0, 256) {
hist1[hist1.s0.x] = 0
}
for (hist1.s1.r4$x, 0, 4096) {
hist1[int32(b0[((hist1.s1.r4$x*4) + (clamp1.s0.y*16384))])] = (hist1[int32(b0[((hist1.s1.r4$x*4) + (clamp1.s0.y*16384))])] + 1)
}
}
for (clamp1.s0.x, 0, 256) {
clamp1[((clamp1.s0.x + (clamp1.s0.y*clamp1.stride.1)) - (clamp1.min.0 + (clamp1.min.1*clamp1.stride.1)))] = uint8(max(min(hist1[clamp1.s0.x], 255), 0))
}
free hist1
}
}
produce clamp2 {
for (clamp2.s0.y, 0, 2160) {
allocate hist2[int32 * 256 * 1 * 3]
produce hist2 {
for (hist2.s0.x, 0, 256) {
hist2[hist2.s0.x] = 0
hist2[(hist2.s0.x + 256)] = 0
hist2[(hist2.s0.x + 512)] = 0
}
for (hist2.s1.r4$x, 0, 4096) {
hist2[int32(b0[((hist2.s1.r4$x*4) + (clamp2.s0.y*16384))])] = (hist2[int32(b0[((hist2.s1.r4$x*4) + (clamp2.s0.y*16384))])] + 1)
hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 1)]) + 256)] = (hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 1)]) + 256)] + 1)
hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 2)]) + 512)] = (hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 2)]) + 512)] + 1)
}
}
for (clamp2.s0.x, 0, 256) {
clamp2[((clamp2.s0.x + (clamp2.s0.y*clamp2.stride.1)) - ((clamp2.min.0 + (clamp2.min.1*clamp2.stride.1)) + (clamp2.min.2*clamp2.stride.2)))] = uint8(max(min(hist2[clamp2.s0.x], 255), 0))
clamp2[(((clamp2.s0.x + (clamp2.s0.y*clamp2.stride.1)) + clamp2.stride.2) - ((clamp2.min.0 + (clamp2.min.1*clamp2.stride.1)) + (clamp2.min.2*clamp2.stride.2)))] = uint8(max(min(hist2[(clamp2.s0.x + 256)], 255), 0))
clamp2[(((clamp2.s0.x + (clamp2.s0.y*clamp2.stride.1)) + (clamp2.stride.2*2)) - ((clamp2.min.0 + (clamp2.min.1*clamp2.stride.1)) + (clamp2.min.2*clamp2.stride.2)))] = uint8(max(min(hist2[(clamp2.s0.x + 512)], 255), 0))
}
free hist2
}
}
}
What I'm hoping to achieve is a lowered statement that looks closer to this (I just cut and pasted this together):
produce clamps {
for (clamp1.s0.y, 0, 2160) {
allocate hist1[int32 * 256 * 1]
allocate hist2[int32 * 256 * 1 * 3]
produce hists {
for (hist1.s0.x, 0, 256) {
hist1[hist1.s0.x] = 0
hist2[hist2.s0.x] = 0
hist2[(hist2.s0.x + 256)] = 0
hist2[(hist2.s0.x + 512)] = 0
}
for (hist1.s1.r4$x, 0, 4096) {
hist1[int32(b0[((hist1.s1.r4$x*4) + (clamp1.s0.y*16384))])] = (hist1[int32(b0[((hist1.s1.r4$x*4) + (clamp1.s0.y*16384))])] + 1)
hist2[int32(b0[((hist2.s1.r4$x*4) + (clamp2.s0.y*16384))])] = (hist2[int32(b0[((hist2.s1.r4$x*4) + (clamp2.s0.y*16384))])] + 1)
hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 1)]) + 256)] = (hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 1)]) + 256)] + 1)
hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 2)]) + 512)] = (hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 2)]) + 512)] + 1)
}
}
for (clamp1.s0.x, 0, 256) {
clamp1[((clamp1.s0.x + (clamp1.s0.y*clamp1.stride.1)) - (clamp1.min.0 + (clamp1.min.1*clamp1.stride.1)))] = uint8(max(min(hist1[clamp1.s0.x], 255), 0))
clamp2[((clamp2.s0.x + (clamp2.s0.y*clamp2.stride.1)) - ((clamp2.min.0 + (clamp2.min.1*clamp2.stride.1)) + (clamp2.min.2*clamp2.stride.2)))] = uint8(max(min(hist2[clamp2.s0.x], 255), 0))
clamp2[(((clamp2.s0.x + (clamp2.s0.y*clamp2.stride.1)) + clamp2.stride.2) - ((clamp2.min.0 + (clamp2.min.1*clamp2.stride.1)) + (clamp2.min.2*clamp2.stride.2)))] = uint8(max(min(hist2[(clamp2.s0.x + 256)], 255), 0))
clamp2[(((clamp2.s0.x + (clamp2.s0.y*clamp2.stride.1)) + (clamp2.stride.2*2)) - ((clamp2.min.0 + (clamp2.min.1*clamp2.stride.1)) + (clamp2.min.2*clamp2.stride.2)))] = uint8(max(min(hist2[(clamp2.s0.x + 512)], 255), 0))
}
free hist1
free hist2
}
}
However if I try to add
clamp2.compute_with(clamp1, y);
I get the following error when jitting
Internal error at /Halide/src/ScheduleFunctions.cpp:2228
Condition failed: injector.found_store_level && injector.found_compute_level
This might be another use case for compute_with, which is not merged yet. You can try out the compute_with_directive branch to see if it meets your needs. Hopefully this will be merged soon.

How to make a vertical wave with using canvas?

I changed the script by adding text instead of an image.
I want the wave to travel vertically downwards. I know it only needs a little editing, but I tried different options and none of them worked.
http://jsfiddle.net/7ynn4/3/
// Settings for the wave effect; read on every frame by the timer callback.
var options = {
    period:100,
    squeeze:0,
    wavelength:40,
    amplitude:30,
    shading:300,
    fps:30
};
var ca = document.getElementById('canvas');
var ctx = ca.getContext('2d');
ctx.canvas.width = 400;
ctx.canvas.height = 150;
ctx.font = 'bold 45pt Arial';
ctx.textAlign = 'center';
ctx.fillStyle = 'blue';
ctx.fillText('Hello World', 170, 60);
// Snapshot of the undistorted text; every frame is resampled from this copy.
// The size is read from `ca` rather than from the implicit `canvas` global
// that browsers create for the element id, and w/h/od are declared explicitly.
var w = ca.width,
    h = ca.height,
    od = ctx.getImageData( 0, 0, w, h ).data;
// Redraw the canvas options.fps times per second.
setInterval(function() {
    var id = ctx.getImageData( 0, 0, w, h ),
        d = id.data,
        now = ( new Date() )/options.period,
        y,
        x,
        lastO,
        shade,
        sq,   // assigned per row below; the old initialiser read y before it was set
        px,
        pct,
        o,
        y2,
        opx;
    for ( y = 0; y < h; y += 1 ) {
        lastO = 0;
        shade = 0;
        sq = ( y - h/2 ) * options.squeeze;
        for ( x = 0; x < w; x += 1 ) {
            px = ( y * w + x ) * 4;
            // Displacement grows from 0 at the left edge to full strength at the right.
            pct = x/w;
            o = Math.sin( x/options.wavelength - now ) * options.amplitude * pct;
            // Source row for this pixel, converted to an integer by the shift.
            y2 = y + ( o + sq * pct ) << 0;
            opx = ( y2 * w + x ) * 4;
            // Brightness offset from the change in displacement since the previous column.
            shade = (o-lastO) * options.shading;
            d[px ] = od[opx ]+shade;
            d[px+1] = od[opx+1]+shade;
            d[px+2] = od[opx+2]+shade;
            d[px+3] = od[opx+3];
            lastO = o;
        }
    }
    ctx.putImageData( id, 0, 0 );
},
1000/options.fps
);
You just flip the values around so that x is affected instead of y -
... vars cut, but replace y2 with x2 ...
/// reversed from here
// Column-by-column pass: each column x is resampled from a horizontally
// shifted source column x2, so the wave runs down the canvas.
for (x = 0; x < w; x++) {
    lastO = 0;
    shade = 0;
    sq = (x - w / 2) * options.squeeze;
    for (y = 0; y < h; y++) {
        px = (x + y * w) * 4;
        // Displacement grows from 0 at the top row to full strength at the bottom.
        pct = y / h;
        o = Math.sin(y / options.wavelength - now) * options.amplitude * pct;
        // Source column, converted to an integer; the -5 is a manual offset
        // compensation that may need tuning.
        x2 = (x - 5 + (o + sq * pct)) | 0;
        opx = (y * w + x2) * 4;
        // Brightness offset from the change in displacement since the previous row.
        shade = (o - lastO) * options.shading;
        // Shade the three colour channels; alpha is copied unchanged.
        for (let channel = 0; channel < 3; channel++) {
            d[px + channel] = od[opx + channel] + shade;
        }
        d[px + 3] = od[opx + 3];
        lastO = o;
    }
}
ctx.putImageData(id, 0, 0);
MODIFIED FIDDLE HERE

Resources