Why is my performance bad? (Noob scheduling) - halide

I'm mainly a very high level programmer so thinking about things like CPU locality is very new to me.
I'm working on a basic bilinear demosaic (for RGGB sensor data) and I've got the algorithm right (judging by the results) but it's not performing as well as I'd hoped (~210Mpix/s).
Here's my code (the input is a 4640x3472 image with a single channel of RGGB):
def get_bilinear_debayer(input_raw, print_nest=False):
    x, y, c = Var(), Var(), Var()

    # Clamp and move to 32 bit for lots of space for averaging.
    input = Func()
    input[x, y] = cast(
        UInt(32),
        input_raw[
            clamp(x, 0, input_raw.width() - 1),
            clamp(y, 0, input_raw.height() - 1)]
    )

    # Interpolate vertically
    vertical = Func()
    vertical[x, y] = (input[x, y-1] + input[x, y+1]) / 2

    # Interpolate horizontally
    horizontal = Func()
    horizontal[x, y] = (input[x-1, y] + input[x+1, y]) / 2

    # Interpolate on diagonals
    diagonal_average = Func()
    diagonal_average[x, y] = (
        input[x+1, y-1] +
        input[x+1, y+1] +
        input[x-1, y-1] +
        input[x-1, y+1]) / 4

    # Interpolate on adjacents
    adjacent_average = Func()
    adjacent_average[x, y] = (horizontal[x, y] + vertical[x, y]) / 2

    red, green, blue = Func(), Func(), Func()

    # Calculate the red channel
    red[x, y, c] = select(
        # Red photosite
        c == 0, input[x, y],
        # Green photosite
        c == 1, select(x % 2 == 0, vertical[x, y],
                                   horizontal[x, y]),
        # Blue photosite
        diagonal_average[x, y]
    )

    # Calculate the blue channel
    blue[x, y, c] = select(
        # Blue photosite
        c == 2, input[x, y],
        # Green photosite
        c == 1, select(x % 2 == 1, vertical[x, y],
                                   horizontal[x, y]),
        # Red photosite
        diagonal_average[x, y]
    )

    # Calculate the green channel
    green[x, y, c] = select(
        # Green photosite
        c == 1, input[x, y],
        # Red/Blue photosite
        adjacent_average[x, y]
    )

    # Switch color interpolator based on requested color.
    # Specify photosite as third argument, calculated as
    # [x, y, z] = (0, 0, 0), (0, 1, 1), (1, 0, 1), (1, 1, 2).
    # Happily works out to a sum of x mod 2 and y mod 2.
    debayer = Func()
    debayer[x, y, c] = select(c == 0, red[x, y, x % 2 + y % 2],
                              c == 1, green[x, y, x % 2 + y % 2],
                                      blue[x, y, x % 2 + y % 2])

    # Scheduling
    x_outer, y_outer, x_inner, y_inner, tile_index = Var(), Var(), Var(), Var(), Var()
    bits = input_raw.get().type().bits

    output = Func()
    # Cast back to the original colour space
    output[x, y, c] = cast(UInt(bits), debayer[x, y, c])

    # Reorder so that colours are calculated in order (red runs, then green, then blue)
    output.reorder_storage(c, x, y)
    # Tile in 128x128 squares
    output.tile(x, y, x_outer, y_outer, x_inner, y_inner, 128, 128)
    # Vectorize based on colour
    output.bound(c, 0, 3)
    output.vectorize(c)
    # Fuse and parallelize
    output.fuse(x_outer, y_outer, tile_index)
    output.parallel(tile_index)

    # Debugging
    if print_nest:
        output.print_loop_nest()
        debayer.print_loop_nest()
        red.print_loop_nest()
        green.print_loop_nest()
        blue.print_loop_nest()

    return output
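As a quick sanity check of the photosite comment above (not part of the original code), the x % 2 + y % 2 mapping can be verified with a couple of lines of plain Python:

for yy in (0, 1):
    for xx in (0, 1):
        print((xx, yy), xx % 2 + yy % 2)
# (0, 0) 0   red photosite
# (1, 0) 1   green photosite
# (0, 1) 1   green photosite
# (1, 1) 2   blue photosite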
Honestly I have no idea what I'm doing here and I'm too new to this to have any clue where or what to look at.
Any advice on how to improve the scheduling is helpful. I'm still learning but feedback is hard to find.
The schedule I have is the best I've been able to do but it's pretty much entirely trial and error.
EDIT: I added an extra 30Mpix/s by doing the whole adjacent average summation in the function directly and by vectorizing on x_inner instead of colour.
EDIT: New schedule:
# Set input bounds.
output.bound(x, 0, (input_raw.width()/2)*2)
output.bound(y, 0, (input_raw.height()/2)*2)
output.bound(c, 0, 3)
# Reorder so that colours are calculated in order (red runs, then green, then blue)
output.reorder_storage(c, x, y)
output.reorder(c, x, y)
# Tile in 128x128 squares
output.tile(x, y, x_outer, y_outer, x_inner, y_inner, 128, 128)
output.unroll(x_inner, 2).unroll(y_inner,2)
# Vectorize based on colour
output.unroll(c)
output.vectorize(c)
# Fuse and parallelize
output.fuse(x_outer, y_outer, tile_index)
output.parallel(tile_index)
EDIT: Final schedule, which now beats (at 640MP/s) the Intel Performance Primitives benchmark that was run on a CPU twice as powerful as mine:
output = Func()
# Cast back to the original colour space
output[x,y,c] = cast(UInt(bits), debayer[x,y,c])
# Set input bounds.
output.bound(x, 0, (input_raw.width()/2)*2)
output.bound(y, 0, (input_raw.height()/2)*2)
output.bound(c, 0, 3)
# Tile in 128x128 squares
output.tile(x, y, x_outer, y_outer, x_inner, y_inner, 128, 128)
output.unroll(x_inner, 2).unroll(y_inner, 2)
# Vectorize based on colour
output.vectorize(x_inner, 16)
# Fuse and parallelize
output.fuse(x_outer, y_outer, tile_index)
output.parallel(tile_index)
target = Target()
target.arch = X86
target.os = OSX
target.bits = 64
target.set_feature(AVX)
target.set_feature(AVX2)
target.set_feature(SSE41)
output.compile_jit(target)
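For reference, a sketch (not part of the original post) of how the JIT-compiled pipeline would then be invoked; output.realize is the standard Halide call, though the exact signature depends on the version of the Python bindings:

# Hypothetical invocation: run the compiled pipeline over the whole sensor
# and get back a width x height x 3 demosaiced buffer.
result = output.realize(input_raw.width(), input_raw.height(), 3)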

Make sure that you are using unroll(c) to make the per-channel select logic optimize away. Unrolling by 2 in x and y will also help:
output.unroll(x, 2).unroll(y,2)
The goal there is to optimize out the select logic between even/odd rows and columns. In order to take full advantage of that, you'll likely also need to tell Halide that the min and extent are a multiple of 2:
output.output_buffer().set_bounds(0,
    (output.output_buffer().min(0) / 2) * 2,
    (output.output_buffer().extent(0) / 2) * 2)
output.output_buffer().set_bounds(1,
    (output.output_buffer().min(1) / 2) * 2,
    (output.output_buffer().extent(1) / 2) * 2)
Though it may be worth stating even more stringent constraints, such as using 128 instead of 2 to assert multiples of the tile size or just hardwiring the min and extent to reflect the actual sensor parameters if you are only supporting a single camera.
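For instance, a minimal sketch of the hardwired option, assuming the same 4640x3472 sensor as in the question (the exact numbers are only illustrative):

# Hypothetical: hard-wire the output bounds to a single known sensor so Halide
# can assume the sizes at compile time.
output.output_buffer().set_bounds(0, 0, 4640)  # x: min 0, extent = sensor width
output.output_buffer().set_bounds(1, 0, 3472)  # y: min 0, extent = sensor height
output.bound(c, 0, 3)                          # three output channels, as before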

Related

cv2 pose estimation using homography matrix

I am trying to calculate the pose of image Y, given image X. Image Y is the same as image X rotated 90 degrees.
1. First, I find the matches between both images.
2. Then I store all the good matches.
3. The homography between the matches from both images is calculated using cv2.RANSAC.
4. For image X, I transform the 2D matching points into 3D by adding 0 as the Z axis.
5. The object points contain all points from the matches of the original image, while the image points contain the matches from the training image. Both arrays of points are filtered using the mask returned by the homography.
6. After that, I use cv2.calibrateCamera with these object points and image points.
7. Finally, I use cv2.projectPoints to get the projections of the axis.
I know that up to step 5 the results are correct, because I use cv2.drawMatches to see the matches. However, this may not be the way to get what I want to achieve.
matches = flann.knnMatch(query_image.descriptors, descriptors, k=2)
good = []
for m, n in matches:
    if m.distance < 0.70 * n.distance:
        good.append(m)
current_good = good
src_pts = np.float32([selected_image.keypoints[m.queryIdx].pt for m in current_good]).reshape(-1, 1, 2)
dst_pts = np.float32([keypoints[m.trainIdx].pt for m in current_good]).reshape(-1, 1, 2)
homography, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)

test = np.zeros(((mask.ravel() > 0).sum(), 3), np.float32)   # obj points
test1 = np.zeros(((mask.ravel() > 0).sum(), 2), np.float32)  # img points
i = 0
counter = 0
for m in current_good:
    if mask.ravel()[i] == 1:
        test[counter][0] = selected_image.keypoints[m.queryIdx].pt[0]
        test[counter][1] = selected_image.keypoints[m.queryIdx].pt[1]
        test1[counter][0] = selected_image.keypoints[m.trainIdx].pt[0]
        test1[counter][1] = selected_image.keypoints[m.trainIdx].pt[1]
        counter += 1
    i += 1

gray = cv2.cvtColor(self.train_image, cv2.COLOR_BGR2GRAY)
gray = np.float32(gray)
# here start my doubts about what I want to do and if it is possible to do it this way
ret, mtx, dist, rvecs, tvecs = cv2.calibrateCamera([test], [test1], gray.shape[::-1], None, None)
axis = np.float32([[3, 0, 0], [0, 3, 0], [0, 0, -3]]).reshape(-1, 3)
rvecs = np.array(rvecs, np.float32)
tvecs = np.array(tvecs, np.float32)
imgpts, jac = cv2.projectPoints(axis, rvecs, tvecs, mtx, dist)
However, after all this, the imgpts returned by cv2.projectPoints give results that don't make much sense to me, like:
[[[857.3185 109.317406]]
[[857.2196 108.360954]]
[[857.2846 107.579605]]]
I would like to draw a normal to my image, as shown here: https://docs.opencv.org/trunk/d7/d53/tutorial_py_pose.html. I successfully got this to work using the chessboard image, but trying to adapt it to a general image gives me strange results.

akima and/or rgl are shutting down Rstudio

The following procedure is shutting down my RStudio; I understand the cause is the akima package or the rgl package, or both. How do I solve this? Data here
s=read.csv("GRVMAX tadpoles.csv")
require(nlme)
t=s[s$SPP== levels(s$SPP)[1],]
head(t)
t=na.omit(t)
t$TEM=as.numeric(as.character(t$TEM))
library(akima)
x=t$TEM
y=t$value
z=t$time
spline <- with(t,interp(x,y,z,duplicate="median",linear=T))
# rotatable 3D plot of points and spline surface
library(rgl)
open3d(scale=c(1/diff(range(x)),1/diff(range(y)),1/diff(range(z))))
with(spline,surface3d(as.character(x),y,z, col))
points3d(x,y,z, add=T)
title3d(xlab="temperature",ylab="performance",zlab="time")
axes3d()
interp() causes the problem. I think the reason is that the scale of y is very different from that of x (the algorithm behind interp() is essentially designed for contouring spatial maps). So interp() runs if you rescale y before calling it. (Note: I multiplied y by 10 and divided the output by 10, but that is a rough scale change; it would be better to consider more principled ways of rescaling.)
library(nlme); library(akima); library(rgl)
s = read.csv("GRVMAX tadpoles.csv")
t = s[s$SPP == levels(s$SPP)[1],]
t = na.omit(t)
head(t)
t$TEM = as.numeric(as.character(t$TEM))
x = t$TEM
y = t$value * 10 # scale change
z = t$time
spline <- interp(x, y, z, duplicate = "median", linear = T) # with() is unnecessary
spline$y <- spline$y / 10 # rescale
y <- y / 10 # rescale
open3d() # Is scale needed ??
# persp3d() can directly take interp.obj as an argument
persp3d(spline, col = "blue", alpha = 0.5, axes = F, xlab="", ylab="", zlab="")
points3d(x, y, z, add=T)
title3d(xlab="temperature", ylab="performance", zlab="time")
axes3d()

Matlab - Scale down an image using an average of four pixels

I have just started learning image-processing and Matlab and I'm trying to scale down an image using an average of 4 pixels. That means that for every 4 original pixels I calculate the average and produce 1 output pixel.
So far I have the following code:
img = imread('bird.jpg');
row_size = size(img, 1);
col_size = size(img, 2);
res = zeros(floor(row_size/2), floor(col_size/2));
figure, imshow(img);
for i = 1:2:row_size
    for j = 1:2:col_size
        num = mean([img(i, j), img(i, j+1), img(i+1, j), img(i+1, j+1)]);
        res(round(i/2), round(j/2)) = num;
    end
end
figure, imshow(uint8(res));
This code manages to scale down the image but it converts it to grayscale.
I understand that I probably have to calculate the average of the RGB components for the output pixel but I don't know how to access them, calculate the average and insert them to the result matrix.
In Matlab, an RGB image is treated as a 3D array. You can check it with:
depth_size = size(img, 3)
depth_size =
3
The loop solution, as you have done, is explained in Sardar_Usama's answer. However, in Matlab it is recommended to avoid loops whenever you want to gain speed.
This is a vectorized solution to scale down an RGB image by a factor of n:
img = imread('bird.jpg');
n = 2; % n can only be integer
[row_size, col_size] = size(img(:, :, 1));
% getting rid of extra rows and columns that won't be counted in averaging:
I = img(1:n*floor(row_size / n), 1:n*floor(col_size / n), :);
[r, ~] = size(I(:, :, 1));
% separating and re-ordering the three colors of image in a way ...
% that averaging could be done with a single 'mean' command:
R = reshape(permute(reshape(I(:, :, 1), r, n, []), [2, 1, 3]), n*n, [], 1);
G = reshape(permute(reshape(I(:, :, 2), r, n, []), [2, 1, 3]), n*n, [], 1);
B = reshape(permute(reshape(I(:, :, 3), r, n, []), [2, 1, 3]), n*n, [], 1);
% averaging and reshaping the colors back to the image form:
R_avg = reshape(mean(R), r / n, []);
G_avg = reshape(mean(G), r / n, []);
B_avg = reshape(mean(B), r / n, []);
% concatenating the three colors together:
scaled_img = cat(3, R_avg, G_avg, B_avg);
% casting the result to the class of original image
scaled_img = cast(scaled_img, 'like', img);
Benchmarking:
If you want to know why vectorized solutions are more popular, take a look at how long it takes to process an RGB 768 x 1024 image with the two methods:
------------------- With vectorized solution:
Elapsed time is 0.024690 seconds.
------------------- With nested loop solution:
Elapsed time is 6.127933 seconds.
So there is more than two orders of magnitude difference in speed between the two solutions.
Another possible solution can be using the function blockproc as mentioned at this link. This will also avoid for loops.
You can take care of that using the modified code below:
img = imread('bird.jpg');
row_size = size(img, 1);
col_size = size(img, 2);
figure, imshow(img);
res = zeros(floor(row_size/2), floor(col_size/2), 3); %Pre-allocation
for p = 1:2:row_size
    for q = 1:2:col_size
        num = mean([img(p, q,:), img(p, q+1,:), img(p+1, q,:), img(p+1, q+1,:)]);
        res(round(p/2), round(q/2),:) = num;
    end
end
figure, imshow(uint8(res));
I took a sample image of 1200x1600x3 uint8, which the above code converts to 600x800x3 uint8. This is correct because (1200*1600)/4 = 480000 and 600*800 = 480000.
P.S.: I changed the variable names i and j to p and q respectively, since i and j are used for imaginary numbers in Matlab.

Hilbert-Peano curve to scan image of arbitrary size

I have written an implementation of the Hilbert-Peano space-filling curve in Python (ported from a Matlab one) to flatten my 2D image:
def hilbert_peano(n):
    if n <= 0:
        x = 0
        y = 0
    else:
        [x0, y0] = hilbert_peano(n-1)
        x = (1/2) * np.array([-0.5+y0, -0.5+x0, 0.5+x0, 0.5-y0])
        y = (1/2) * np.array([-0.5+x0, 0.5+y0, 0.5+y0, -0.5-y0])
    return x, y
However, the classical Hilbert-Peano curve only works for multi-dimensional arrays whose side lengths are powers of two (e.g. 256x256 or 512x512 in the case of a 2D array such as an image).
Does anybody know how to extend this to an array of arbitrary size?
I had the same problem and have written an algorithm that generates a Hilbert-like curve for rectangles of arbitrary size in 2D and 3D (for example, a 55x31 curve).
The idea is to recursively apply a Hilbert-like template but avoid odd sizes when halving the domain dimensions. If the dimensions happen to be powers of two, the classic Hilbert curve is generated.
import sys

def sgn(x):
    # sign helper assumed by the snippet below: returns -1, 0 or +1
    return (x > 0) - (x < 0)

def gilbert2d(x, y, ax, ay, bx, by):
    """
    Generalized Hilbert ('gilbert') space-filling curve for arbitrary-sized
    2D rectangular grids.
    """
    w = abs(ax + ay)
    h = abs(bx + by)

    (dax, day) = (sgn(ax), sgn(ay))  # unit major direction
    (dbx, dby) = (sgn(bx), sgn(by))  # unit orthogonal direction

    if h == 1:
        # trivial row fill
        for i in range(0, w):
            print(x, y)
            (x, y) = (x + dax, y + day)
        return

    if w == 1:
        # trivial column fill
        for i in range(0, h):
            print(x, y)
            (x, y) = (x + dbx, y + dby)
        return

    (ax2, ay2) = (ax // 2, ay // 2)
    (bx2, by2) = (bx // 2, by // 2)
    w2 = abs(ax2 + ay2)
    h2 = abs(bx2 + by2)

    if 2 * w > 3 * h:
        if (w2 % 2) and (w > 2):
            # prefer even steps
            (ax2, ay2) = (ax2 + dax, ay2 + day)

        # long case: split in two parts only
        gilbert2d(x, y, ax2, ay2, bx, by)
        gilbert2d(x + ax2, y + ay2, ax - ax2, ay - ay2, bx, by)
    else:
        if (h2 % 2) and (h > 2):
            # prefer even steps
            (bx2, by2) = (bx2 + dbx, by2 + dby)

        # standard case: one step up, one long horizontal, one step down
        gilbert2d(x, y, bx2, by2, ax2, ay2)
        gilbert2d(x + bx2, y + by2, ax, ay, bx - bx2, by - by2)
        gilbert2d(x + (ax - dax) + (bx2 - dbx), y + (ay - day) + (by2 - dby),
                  -bx2, -by2, -(ax - ax2), -(ay - ay2))

def main():
    width = int(sys.argv[1])
    height = int(sys.argv[2])

    if width >= height:
        gilbert2d(0, 0, width, 0, 0, height)
    else:
        gilbert2d(0, 0, 0, height, width, 0)
A 3D version and more documentation is available at https://github.com/jakubcerveny/gilbert
I found this page by Lutz Tautenhahn:
"Draw A Space-Filling Curve of Arbitrary Size" (http://lutanho.net/pic2html/draw_sfc.html)
The algorithm doesn't have a name; he doesn't reference anyone else, and the sketch suggests he came up with it himself.
I wonder if this is possible for a z order curve and how?
I finally chose, as suggested by Betterdev, and since adaptive curves are not that straightforward [1], to compute a bigger curve and then discard the coordinates that fall outside my image shape:
# compute the needed order
order = int(np.max(np.ceil([np.log2(M), np.log2(N)])))
# Hilbert curve to scan a 2^order * 2^order image
x, y = hilbert_peano(order)
mat = np.zeros((2**order, 2**order))
# curve as a 2D array
mat[x, y] = np.arange(0, x.size, dtype=np.uint)
# clip the curve to the image shape
mat = mat[:M, :N]
# compute new indices (from 0 to M*N)
I = np.argsort(mat.flat)
x_new, y_new = np.meshgrid(np.arange(0, N, dtype=np.uint), np.arange(0, M, dtype=np.uint))
# apply the new order to the grid
x_new = x_new.flat[I]
y_new = y_new.flat[I]
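A brief usage sketch (not part of the original snippet, and assuming image is the M x N array being flattened):

# Hypothetical usage: read the image along the clipped Hilbert-like curve,
# then invert the scan to recover the original layout.
flat = image[y_new, x_new]        # 1D signal, pixels in curve order
restored = np.empty_like(image)
restored[y_new, x_new] = flat     # undo the scan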
[1] Zhang J., Kamata S. and Ueshige Y., "A Pseudo-Hilbert Scan Algorithm for Arbitrarily-Sized Rectangle Region"

smooth coloring algorithm for the mandelbrot set

I know there are a lot of questions already answered about this; however, mine varies slightly. We implement the smooth coloring algorithm, as I understand it, as:
mu = 1 + n + math.log2(math.log2(z)) / math.log2(2)
where n is the escape iteration, 2 is the power z is raised to, and, if I'm not mistaken, z is the modulus of the complex number at the escape iteration. We then use this renormalized escape value in our linear interpolation between colors to produce a smoothly banded Mandelbrot set. I've seen answers to other questions about this where the value is run through an HSB-to-RGB conversion, but I still fail to understand how this provides a smooth gradient of colors, or how to implement it in Python.
However, whenever I attempted to implement this, it produced floating-point RGB values. There isn't an image format I know of, besides .tiff, that supports this, and if we round to integers we still get unsmooth banding. So how is this supposed to produce a smoothly banded image if we cannot directly use the RGB values it produces? Example code of what I tried is below; since I don't fully understand how to implement this, I made an attempt at a solution that somewhat produces smooth banding. It gives a somewhat smoothly banded image between two colors: blue for the full set, and a progressively whiter color the further we zoom in on the set, to the point where at a certain depth everything just appears blurred. Since I'm using tkinter, I had to convert the RGB values to hex to be able to draw them to the canvas.
I'm computing the set recursively, and in my other function (not posted below) I set the window width and height, then iterate over these for the pixels of the tkinter window, computing this recursion in the inner loop.
def linear_interp(self, color_1, color_2, i):
    r = (color_1[0] * (1 - i)) + (color_2[0] * i)
    g = (color_1[1] * (1 - i)) + (color_2[1] * i)
    b = (color_1[2] * (1 - i)) + (color_2[2] * i)
    rgb_list = [r, g, b]
    for value in rgb_list:
        if value > MAX_COLOR:
            rgb_list[rgb_list.index(value)] = MAX_COLOR
        if value < 0:
            rgb_list[rgb_list.index(value)] = abs(value)
    return (int(rgb_list[0]), int(rgb_list[1]),
            int(rgb_list[2]))

def rgb_to_hex(self, color):
    return "#%02x%02x%02x" % color

def mandel(self, x, y, z, iteration):
    bmin = 100
    bmax = 255
    power_z = 2
    mod_z = math.sqrt((z.real * z.real) + (z.imag * z.imag))
    # If it's not in the set or we have reached the maximum depth
    if abs(z) >= float(power_z) or iteration == DEPTH:
        z = z
        if iteration > 255:
            factor = (iteration / DEPTH) * 255
        else:
            factor = iteration
        logs = math.log2(math.log2(abs(z) + 1) / math.log2(power_z))
        r = g = math.floor(factor + 5 - logs)
        b = bmin + (bmax - bmin) * r / 255
        rgb = (abs(r), abs(g), abs(round(b)))
        self.canvas.create_line(x, y, x + 1, y + 1,
                                fill=self.rgb_to_hex(rgb))
    else:
        z = (z * z) + self.c
        self.mandel(x, y, z, iteration + 1)
    return z
The difference between colors #000000, #010000, ..., #FE0000, #FF0000 is so small that you obtain a smooth gradient from black to red. Hence, simply round your values: suppose the smoothed color values from your smoothing function range from 0 (inclusive) to 1 (exclusive); then you simply use
(int) (value * 256)
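As a minimal Python sketch of that idea (assuming max_iter is the iteration cap and using one common form of the renormalized count, which differs slightly from the formula in the question):

import math

def smooth_color(z, n, max_iter):
    """Map an escaped point to an RGB tuple on a black-to-red gradient."""
    if n >= max_iter:
        return (0, 0, 0)                     # treated as inside the set
    # renormalized iteration count; abs(z) > 2 at escape, so the logs are defined
    mu = n + 1 - math.log2(math.log2(abs(z)))
    t = min(max(mu / max_iter, 0.0), 0.999)  # scale into [0, 1)
    return (int(t * 256), 0, 0)              # smooth black -> red ramp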

Resources