I've been trying to write an OpenCL kernel that populates an OpenCL image with values. However, I've been having problems with some texels not being written to. I can't seem to get the write_image() function to write to texels with different x and y coordinates.
I've created a reduction program here. Hopefully this is straightforward enough to be readable:
#include <iostream>
#include <cassert>
#include <OpenCL/OpenCL.h>
// OpenCL C source of the `set` kernel, one string per source line; main()
// passes the array to clCreateProgramWithSource. Each work-item takes its
// 2-D global id as texel coordinates, prints the value it is about to store
// and writes opaque white to that texel.
// NOTE(review): the vector values are built as `float4(...)` / `int2(...)`;
// OpenCL C vector literals are spelled `(float4)(...)` / `(int2)(...)`.
const char* clSource[] = {
"kernel void set(write_only image2d_t image)\n",
"{\n",
" int x = get_global_id(0);\n",
" int y = get_global_id(1);\n",
" float4 result = float4(1.0, 1.0, 1.0, 1.0);\n",
" printf(\"Writing dimensions %d x %d: %d, %d, %d, %d\\n\", x, y,\n",
" (int)result.x*255, (int)result.y*255, (int)result.z*255, (int)result.w*255);\n",
" write_imagef(image, int2(x, y), result);\n",
"}\n",
};
// Reduction test for image writes: builds the `set` kernel, runs it with one
// work-item per texel of a WIDTH x HEIGHT RGBA8 image, reads the image back
// and prints every texel. Each OpenCL call is checked with assert, so a
// failing call aborts the program at the call site.
int main(int argc, const char * argv[])
{
    const unsigned int WIDTH = 3;
    const unsigned int HEIGHT = 3;

    cl_int clError;

    // --- platform / device / context / queue -------------------------------
    cl_platform_id platform;
    clError = clGetPlatformIDs(1, &platform, nullptr);
    assert(clError == CL_SUCCESS);

    cl_device_id device;
    clError = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, nullptr);
    assert(clError == CL_SUCCESS);

    cl_context_properties properties[] = {
        CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
        0
    };
    cl_context openCLContext = clCreateContext(properties, 1, &device, nullptr, nullptr, &clError);
    assert(clError == CL_SUCCESS);

    cl_command_queue commandQueue = clCreateCommandQueue(openCLContext, device, 0, &clError);
    assert(clError == CL_SUCCESS);

    // --- program / kernel ---------------------------------------------------
    cl_program program = clCreateProgramWithSource(openCLContext, sizeof(clSource) / sizeof(const char*), clSource, nullptr, &clError);
    assert(clError == CL_SUCCESS);

    clError = clBuildProgram(program, 1, &device, "", nullptr, nullptr);
    assert(clError == CL_SUCCESS);

    cl_kernel kernel = clCreateKernel(program, "set", &clError);
    assert(clError == CL_SUCCESS);

    // --- destination image: 8-bit normalised RGBA, device write-only --------
    cl_image_format imageFormat;
    imageFormat.image_channel_data_type = CL_UNORM_INT8;
    imageFormat.image_channel_order = CL_RGBA;

    cl_image_desc imageDesc;
    imageDesc.image_type = CL_MEM_OBJECT_IMAGE2D;
    imageDesc.image_width = WIDTH;
    imageDesc.image_height = HEIGHT;
    imageDesc.image_depth = 1;
    imageDesc.image_array_size = 1;
    imageDesc.image_row_pitch = 0;
    imageDesc.image_slice_pitch = 0;
    imageDesc.num_mip_levels = 0;
    imageDesc.num_samples = 0;
    imageDesc.buffer = nullptr;

    cl_mem clTexture = clCreateImage(openCLContext, CL_MEM_WRITE_ONLY, &imageFormat, &imageDesc, nullptr, &clError);
    assert(clError == CL_SUCCESS);

    clError = clSetKernelArg(kernel, 0, sizeof(cl_mem), &clTexture);
    assert(clError == CL_SUCCESS);

    // --- launch: a 2-D range, so only the first two entries are consumed ----
    size_t globalWorkOffset[] = {0, 0, 0};
    size_t globalWorkSize[] = {WIDTH, HEIGHT, 0};
    size_t localWorkSize[] = {1, 1, 0};
    cl_event event1;
    clError = clEnqueueNDRangeKernel(commandQueue, kernel, 2, globalWorkOffset, globalWorkSize, localWorkSize, 0, nullptr, &event1);
    assert(clError == CL_SUCCESS);

    // --- read back (blocking, ordered after the kernel through event1) ------
    unsigned char* bitmap = new unsigned char[WIDTH * HEIGHT * 4];
    size_t origin[] = {0, 0, 0};
    size_t region[] = {WIDTH, HEIGHT, 1};
    cl_event event2;
    clError = clEnqueueReadImage(commandQueue, clTexture, CL_TRUE, origin, region, 0, 0, bitmap, 1, &event1, &event2);
    // Previously this status was overwritten by clWaitForEvents without being
    // looked at, so a failed read went unnoticed and event2 could be invalid.
    assert(clError == CL_SUCCESS);

    std::cout << "============================================" << std::endl;

    clError = clWaitForEvents(1, &event2);
    assert(clError == CL_SUCCESS);

    // --- dump the texels row by row, four bytes (R, G, B, A) per texel ------
    for (size_t i = 0; i < HEIGHT; ++i) {
        for (size_t j = 0; j < WIDTH; ++j) {
            const unsigned char* texel = bitmap + 4 * (i * WIDTH + j);
            std::cout << "Reading dimensions " << j << " x " << i << ": ";
            std::cout << static_cast<int>(texel[0]) << ", ";
            std::cout << static_cast<int>(texel[1]) << ", ";
            std::cout << static_cast<int>(texel[2]) << ", ";
            std::cout << static_cast<int>(texel[3]) << std::endl;
        }
    }
    delete[] bitmap;

    // --- teardown, in reverse order of creation -----------------------------
    clError = clReleaseEvent(event1);
    assert(clError == CL_SUCCESS);
    clError = clReleaseEvent(event2);
    assert(clError == CL_SUCCESS);
    clError = clReleaseMemObject(clTexture);
    assert(clError == CL_SUCCESS);
    clError = clReleaseKernel(kernel);
    assert(clError == CL_SUCCESS);
    clError = clReleaseProgram(program);
    assert(clError == CL_SUCCESS);
    clError = clReleaseCommandQueue(commandQueue);
    assert(clError == CL_SUCCESS);
    clError = clReleaseDevice(device);
    assert(clError == CL_SUCCESS);
    clError = clReleaseContext(openCLContext);
    assert(clError == CL_SUCCESS);
    return 0;
}
After all that, here is the output on OS X (10.9):
Writing dimensions 0 x 0: 255, 255, 255, 255
Writing dimensions 1 x 0: 255, 255, 255, 255
Writing dimensions 2 x 0: 255, 255, 255, 255
Writing dimensions 0 x 1: 255, 255, 255, 255
Writing dimensions 1 x 1: 255, 255, 255, 255
Writing dimensions 2 x 1: 255, 255, 255, 255
Writing dimensions 0 x 2: 255, 255, 255, 255
Writing dimensions 1 x 2: 255, 255, 255, 255
Writing dimensions 2 x 2: 255, 255, 255, 255
============================================
Reading dimensions 0 x 0: 255, 255, 255, 255
Reading dimensions 1 x 0: 0, 0, 0, 0
Reading dimensions 2 x 0: 0, 0, 0, 0
Reading dimensions 0 x 1: 0, 0, 0, 0
Reading dimensions 1 x 1: 255, 255, 255, 255
Reading dimensions 2 x 1: 0, 0, 0, 0
Reading dimensions 0 x 2: 0, 0, 0, 0
Reading dimensions 1 x 2: 0, 0, 0, 0
Reading dimensions 2 x 2: 255, 255, 255, 255
Program ended with exit code: 0
I get the same result on an ATI Radeon HD 5750 as I do on an NVIDIA GeForce GT 650M.
OpenCL to OpenGL texture problems and opencl image2d_t doesn't write back values seem to have similar problems, but neither of those have anything that helps me.
Am I doing something wrong? Or are image writes simply not supported on Mavericks drivers?
The issue is in the way that you are constructing vector values. Instead of this:
typeN(a, b, ..., k)
You should be doing this:
(typeN)(a, b, ..., k)
The former actually causes a compilation error on non-Apple platforms, so I'm not actually sure how Apple's compiler is interpreting that code.
So, for your kernel, the two relevant lines that need to be changed are these:
float4 result = float4(1.0, 1.0, 1.0, 1.0);
...
write_imagef(image, int2(x, y), result);
Which should now become:
float4 result = (float4)(1.0, 1.0, 1.0, 1.0);
...
write_imagef(image, (int2)(x, y), result);
I was able to compile and run your program successfully with the following kernel change:
// Corrected OpenCL C source of the `set` kernel, one string per source line.
// Vector values are built with the cast-style literals `(float4)(...)` and
// `(int2)(...)`. Each work-item prints the value it is about to store and
// writes it to the texel addressed by its 2-D global id.
const char* clSource[] = {
"__kernel void set(write_only image2d_t image)\n",
"{\n",
" int x = get_global_id(0);\n",
" int y = get_global_id(1);\n",
" float4 result = (float4)(1.0, 1.0, 1.0, 1.0);\n",
" printf(\"Writing dimensions %d x %d: %d, %d, %d, %d\\n\", x, y,\n",
" (int)result.x*255, (int)result.y*255, (int)result.z*255, (int)result.w*255);\n",
" write_imagef(image, (int2)(x, y), result);\n",
"}\n",
};
For example, you cannot write float4(1.0, ...); you must write it as a C-style cast, (float4)(1.0, ...). I have no idea why it even compiled cleanly with your drivers.
Another really weird issue in the output is that your output seems to come from the case where WIDTH and HEIGHT on line 23 were 3. Is the output from a version where it indeed is 3?
Regardless it works fine after the changes.
Related
I followed the algorithm mentioned in the sig09 paper Coordinates for Instant Image Cloning
The algorithm
This is my code:
#include<iostream>
#include<vector>
#include<map>
#include<fstream>
#include<queue>
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include"../shared/stb_image.h"
#include"../shared/stb_image_write.h"
#define vector std::vector
#define queue std::queue
#define map std::map
#define cin std::cin
#define cout std::cout
#define endl std::endl
#define string std::string
#define PDD std::pair<double, double>
#define image Mat<unsigned char>
// Minimal image container: `row` x `col` pixels with `channel` interleaved
// samples per pixel, stored row-major in `data`.
template<class T>
class Mat{
public:
    // Zero by default so a default-constructed or failed-to-load image is a
    // well-defined empty image instead of holding indeterminate sizes.
    int row = 0, col = 0, channel = 0;
    vector<T> data;

    Mat(){}

    // Allocates a zero-filled image of the given dimensions.
    Mat(int row, int col, int cha):row(row), col(col), channel(cha){
        data.resize(row * col * cha, 0);
    }

    // Loads `name` through stb_image, keeping the file's own channel count.
    // If the file cannot be decoded the image is left empty (0 x 0 x 0);
    // callers can detect that through data.empty().
    Mat(const char *name){
        T *pixels = stbi_load(name, &col, &row, &channel, 0);
        if(pixels == nullptr){
            row = col = channel = 0;
            return;
        }
        data = vector<T>(pixels, pixels + col * row * channel);
        stbi_image_free(pixels);
    }

    // Sample `z` of the pixel in row `x`, column `y`. No bounds checking.
    T& at(int x, int y, int z){
        return data[(x * col + y) * channel + z];
    }

    // Writes the image to `name` as a BMP file through stb_image_write.
    void write(const char *name){
        stbi_write_bmp(name, col, row, channel, data.data());
    }
};
#define x first
#define y second
vector<PDD> P;
map<int, map<int, bool>> st; // register
vector<vector<double>> w;
int dx8[] = {1, 1, 1, 0, 0, -1, -1, -1}, dy8[] = {-1, 0, 1, -1, 1, -1, 0, 1};
int dx4[] = {0, 0, 1, -1}, dy4[] = {1, -1, 0, 0};
bool check(int i, int j, image &mask){
if(mask.at(i, j, 0) == 0) return false;
return mask.at(i, j - 1, 0) == 0||
mask.at(i, j + 1, 0) == 0||
mask.at(i - 1, j, 0) == 0||
mask.at(i + 1, j, 0) == 0;
}
// Walks the edge pixels of a mask region recursively, starting at (sx, sy),
// and appends the zero-valued 4-neighbours of each visited pixel to the
// global boundary list P. The global table `st` records which coordinates
// were already appended so each one is added once.
//   (sx, sy) - start pixel; reaching it again (with px != -1) ends that branch
//   (x, y)   - pixel being visited
//   (px, py) - pixel the walk came from; (-1, -1) on the initial call
// NOTE(review): none of the mask.at() reads below is bounds-checked, and
// edge pixels themselves are not marked visited - confirm the walk
// terminates for the masks in use.
void dfs(int sx, int sy, int x, int y, int px, int py, image &mask){
// The left neighbour is collected on every call, including the initial one.
if(mask.at(x, y - 1, 0) == 0 && st[x][y - 1] == 0) P.push_back({x, y - 1}), st[x][y - 1] = 1;
// The remaining three neighbours are collected only after the first step.
if(px != -1){
if(mask.at(x + 1, y, 0) == 0 && st[x + 1][y] == 0) P.push_back({x + 1, y}), st[x + 1][y] = 1;
if(mask.at(x, y + 1, 0) == 0 && st[x][y + 1] == 0) P.push_back({x, y + 1}), st[x][y + 1] = 1;
if(mask.at(x - 1, y, 0) == 0 && st[x - 1][y] == 0) P.push_back({x - 1, y}), st[x - 1][y] = 1;
}
// Back at the start pixel after at least one step: stop this branch.
if(sx == x && sy == y && px != -1) return;
// Continue into every 8-connected neighbour that check() accepts, except
// the pixel the walk just came from.
for(int i = 0; i < 8; i ++){
int a = x + dx8[i], b = y + dy8[i];
if(a < 0 || b < 0 || a >= mask.row || b >= mask.col) continue;
if(check(a, b, mask) && (a != px || b != py)) dfs(sx, sy, a, b, x, y, mask);
}
}
// Euclidean length of the 2-D vector `a`.
double len(const PDD &a){
    const double squared = a.x * a.x + a.y * a.y;
    return sqrt(squared);
}
// Dot product of the 2-D vectors `a` and `b`.
double dot(const PDD &a, const PDD &b){
    const double firstTerm = a.x * b.x;
    const double secondTerm = a.y * b.y;
    return firstTerm + secondTerm;
}
// Component-wise difference a - b.
PDD minus(const PDD &a, const PDD &b){
    PDD diff(a.x - b.x, a.y - b.y);
    return diff;
}
// Scales `a` to unit length. A zero-length input divides by zero, exactly
// as before, and therefore yields non-finite components.
PDD normalize(const PDD &a){
    const double magnitude = len(a);
    return PDD(a.x / magnitude, a.y / magnitude);
}
// Weight of boundary vertex `cur` as seen from point `o`:
//     (tan(a1 / 2) + tan(a2 / 2)) / |cur - o|
// where a1 is the angle at `o` between `pre` and `cur`, and a2 the angle at
// `o` between `cur` and `nxt`.
//
// The cosines come from dot products of normalised vectors. Rounding can
// push them marginally outside [-1, 1], where acos returns NaN; they are
// therefore clamped before acos is applied.
// If `o` coincides with `pre`, `cur` or `nxt` the direction is undefined
// and the result is still non-finite; callers must treat that case
// separately.
double val(PDD &pre, PDD &cur, PDD &nxt, PDD &o){
    const PDD toCur = minus(cur, o);
    const PDD V1 = normalize(minus(pre, o));
    const PDD V2 = normalize(minus(nxt, o));
    const PDD mid = normalize(toCur);

    double cos1 = dot(V1, mid);
    double cos2 = dot(V2, mid);
    // Manual clamp to [-1, 1]; a NaN input fails both comparisons and is
    // passed through unchanged.
    cos1 = cos1 > 1.0 ? 1.0 : (cos1 < -1.0 ? -1.0 : cos1);
    cos2 = cos2 > 1.0 ? 1.0 : (cos2 < -1.0 ? -1.0 : cos2);

    const double alpha1 = acos(cos1);
    const double alpha2 = acos(cos2);
    return (tan(alpha1 / 2) + tan(alpha2 / 2)) / len(toCur);
}
// Clones the masked region of src.png onto target.png and writes res.bmp.
// For the first white (255) mask pixel found in scan order it:
//   1. collects the region's surrounding zero-valued pixels into P (dfs),
//   2. flood-fills the region, storing its pixels in X and one row of
//      weights per pixel (one weight per entry of P) in w,
//   3. for every channel and region pixel, adds the weighted mean of
//      (target - source) over P to the source value and stores it in res.
// The function returns right after processing that first region.
// NOTE(review): `res` starts zero-filled and only region pixels are written;
// the 4-neighbour reads in the flood fill are not bounds-checked.
int main(int argc, char *argv[]){
image src("src.png");
image mask("mask1.png");
image tar("target.png");
image res(tar.row, tar.col, tar.channel);
for(int i = 0; i < mask.row; i ++){
for(int j = 0; j < mask.col; j ++){
if(mask.at(i, j, 0) == 255 && st[i][j] == 0){
dfs(i, j, i, j, -1, -1, mask); // collect the boundary pixels of this region into P
queue<PDD> q;
vector<PDD> X;
q.push({i, j});
st[i][j] = 1;
while(q.size()){ // breadth-first fill over the non-zero mask pixels of the region
auto h = q.front();
X.push_back(h);
q.pop();
vector<double> wx;
for(int k = 0; k < P.size(); k ++){ // one weight per boundary point, in the order dfs stored them
// previous / next boundary point, wrapping around at both ends of P
int pre = (k - 1 + P.size()) % P.size();
int cur = k;
int nxt = (k + 1) % P.size();
wx.push_back(
val(P[pre], P[cur], P[nxt], h)
);
}
// w gets one row per pixel pushed to X, so w[k] belongs to X[k]
w.push_back(wx);
for(int k = 0; k < 4; k ++){
int a = h.x + dx4[k], b = h.y + dy4[k];
if(st[a][b] == 1 || mask.at(a, b, 0) == 0) continue;
st[a][b] = 1;
q.push({a, b});
}
}
for(int c = 0; c < res.channel; c ++){ // every channel of res
for(int k = 0; k < X.size(); k ++){
// rx: weighted sum of boundary differences; sum: total weight
double rx = 0, sum = 0;
for(int u = 0; u < w[k].size(); u ++){
double diff = tar.at(P[u].x, P[u].y, c) - src.at(P[u].x, P[u].y, c);
rx += w[k][u] * diff;
sum += w[k][u];
}
rx /= sum;
// result = source value + interpolated boundary difference
res.at(X[k].x, X[k].y, c) = rx + src.at(X[k].x, X[k].y, c);
}
}
res.write("./res.bmp");
return 0;
}
}
}
}
1. get the border(counter-clockwise) of the white region in the
mask
2. get all (x, y)s of pixels in the white area of the mask
3. calculate lambda value of every (x, y) in 2, but I found that lambda values of every (x, y) contain many nans (possibly caused by too small value in function val(...))
The question is I do not know how to deal with this condition in 3, nor did the paper mention it.
I'm using the following code in order to convert my ImageMagick image to 32-bit HBITMAP:
// Builds a 32-bit HBITMAP from an ImageMagick image: fills a BITMAP
// descriptor, copies every pixel into a GlobalAlloc'd RGBQUAD buffer and
// passes the descriptor to CreateBitmapIndirect.
// NOTE(review): the results of GlobalAlloc/GlobalLock are not checked and no
// GlobalUnlock/GlobalFree is visible in this fragment.
BITMAP bitmap;
std::memset(&bitmap, 0, sizeof(bitmap));
bitmap.bmType = 0;
bitmap.bmWidth = image->image()->columns;
bitmap.bmHeight = image->image()->rows;
// 4 bytes per pixel, so each row is already a multiple of 4 bytes
bitmap.bmWidthBytes = 4 * bitmap.bmWidth;
bitmap.bmPlanes = 1;
bitmap.bmBitsPixel = 32;
bitmap.bmBits = NULL;
const size_t size = bitmap.bmWidthBytes * bitmap.bmHeight;
auto buffer = (HANDLE)GlobalAlloc(GMEM_MOVEABLE | GMEM_DDESHARE, size);
RGBQUAD *bitmap_bits = (RGBQUAD *) GlobalLock((HGLOBAL) buffer);
// q walks the destination buffer linearly, one RGBQUAD per source pixel
register RGBQUAD *q = bitmap_bits;
for (size_t y = 0; y < image->image()->rows; y++)
{
// p points at the first pixel of source row y
register auto p = GetVirtualPixels(image->image(), 0, y, image->image()->columns, 1, exception);
if (!p) break;
for (size_t x = 0; x < image->image()->columns; x++)
{
q->rgbRed = ScaleQuantumToChar(GetPixelRed(image->image(), p));
q->rgbGreen = ScaleQuantumToChar(GetPixelGreen(image->image(), p));
q->rgbBlue = ScaleQuantumToChar(GetPixelBlue(image->image(), p));
q->rgbReserved = 0;
// advance to the next source pixel (all of its channels) and the next RGBQUAD
p += GetPixelChannels(image->image());
q++;
}
}
bitmap.bmBits = bitmap_bits;
HBITMAP hbmp = CreateBitmapIndirect(&bitmap);
It works well, but I'd like to save some memory by using images with lower depth. Unfortunately I'm not even able to make it work with 24-bit images. I modified my code to look like this:
// 24-bit variant of the conversion: same flow, but the destination holds
// 3-byte RGBTRIPLEs and bmWidthBytes is the row size rounded up to a
// multiple of 4 bytes.
// NOTE(review): q is advanced by one RGBTRIPLE per pixel and is never
// re-based at the start of a row, so rows are written 3 * width bytes apart
// while the descriptor declares bmWidthBytes; the two differ whenever
// 3 * width is not a multiple of 4.
BITMAP bitmap;
std::memset(&bitmap, 0, sizeof(bitmap));
bitmap.bmType = 0;
bitmap.bmWidth = image->image()->columns;
bitmap.bmHeight = image->image()->rows;
// bytes per row, rounded up to the next multiple of 4
bitmap.bmWidthBytes = ((bitmap.bmWidth * 24 + 31) / 32) * 4;
bitmap.bmPlanes = 1;
bitmap.bmBitsPixel = 24;
bitmap.bmBits = NULL;
const size_t length = bitmap.bmWidthBytes * bitmap.bmHeight;
auto buffer = (HANDLE)GlobalAlloc(GMEM_MOVEABLE | GMEM_DDESHARE, length);
RGBTRIPLE *bitmap_bits = (RGBTRIPLE *) GlobalLock((HGLOBAL) buffer);
register RGBTRIPLE *q = bitmap_bits;
for (size_t y = 0; y < image->image()->rows; y++)
{
// p points at the first pixel of source row y
register auto p = GetVirtualPixels(image->image(), 0, y, image->image()->columns, 1, exception);
if (!p) break;
for (size_t x = 0; x < image->image()->columns; x++)
{
q->rgbtRed = ScaleQuantumToChar(GetPixelRed(image->image(), p));
q->rgbtGreen = ScaleQuantumToChar(GetPixelGreen(image->image(), p));
q->rgbtBlue = ScaleQuantumToChar(GetPixelBlue(image->image(), p));
p += GetPixelChannels(image->image());
q++;
}
}
bitmap.bmBits = bitmap_bits;
HBITMAP hbmp = CreateBitmapIndirect(&bitmap);
But it seems that this code cannot produce valid bitmap. What am I doing wrong?
You are not taking the stride/alignment into account. Each row needs to be DWORD aligned.
Calculating Surface Stride
In an uncompressed bitmap, the stride is the number of bytes needed to go from the start of one row of pixels to the start of the next row. The image format defines a minimum stride for an image. In addition, the graphics hardware might require a larger stride for the surface that contains the image.
For uncompressed RGB formats, the minimum stride is always the image width in bytes, rounded up to the nearest DWORD. You can use the following formula to calculate the stride:
stride = ((((biWidth * biBitCount) + 31) & ~31) >> 3)
You need to fix the way you access the RGBTRIPLEs in the buffer.
Before the "x loop" you should do something like q = (RGBTRIPLE*) (((char*)bitmap_bits) + (y * bitmap.bmWidthBytes));
CreateBitmapIndirect creates a DDB which is perhaps not the best choice, create a DIB instead:
// Bytes per row for a bitmap `w` pixels wide at `bpp` bits per pixel,
// rounded up to a multiple of 4 bytes.
#define CalcStride(w, bpp) ( ((((w) * (bpp)) + 31) & ~31) >> 3 )

// Stores colour `cr` in pixel (x, y) of the 24-bpp pixel buffer `bits`,
// which is `w` pixels wide. Row y starts y * CalcStride(w, 24) bytes into
// the buffer. No bounds checking is performed.
static void SetPixel24(UINT w, void*bits, UINT x, UINT y, COLORREF cr)
{
    char* const rowStart = ((char*)bits) + (y * CalcStride(w, 24));
    RGBTRIPLE& pixel = ((RGBTRIPLE*)rowStart)[x];
    pixel.rgbtRed = GetRValue(cr);
    pixel.rgbtGreen = GetGValue(cr);
    pixel.rgbtBlue = GetBValue(cr);
}
// Demo: creates a 4x4 24-bpp DIB section, paints it red with one blue pixel
// at (0, 0), shows it in a static control and pumps messages until the
// window is gone.
// NOTE(review): the results of CreateWindowEx and CreateDIBSection are not
// checked, and the bitmap is never deleted (see the trailing comment).
void Silly24BPPExample()
{
HWND hWnd = CreateWindowEx(WS_EX_APPWINDOW, WC_STATIC, 0, WS_VISIBLE|WS_CAPTION|WS_SYSMENU|WS_OVERLAPPEDWINDOW|SS_BITMAP|SS_REALSIZECONTROL, 0, 0, 99, 99, 0, 0, 0, 0);
const INT w = 4, h = 4, bpp = 24;
BITMAPINFO bi;
ZeroMemory(&bi, sizeof(bi));
BITMAPINFOHEADER&bih = bi.bmiHeader;
bih.biSize = sizeof(BITMAPINFOHEADER);
// negative height requests a top-down DIB, so row 0 is the top row
bih.biWidth = w, bih.biHeight = -h;
bih.biPlanes = 1, bih.biBitCount = bpp;
bih.biCompression = BI_RGB;
// receives the address of the pixel memory owned by the DIB section
void*bits;
HBITMAP hBmp = CreateDIBSection(NULL, &bi, DIB_RGB_COLORS, &bits, NULL, 0);
for (UINT x = 0; x < w; ++x)
for (UINT y = 0; y < h; ++y)
SetPixel24(w, bits, x, y, RGB(255, 0, 0)); // All red
SetPixel24(w, bits, 0, 0, RGB(0, 0, 255)); // except one blue
SendMessage(hWnd, STM_SETIMAGE, IMAGE_BITMAP, (LPARAM) hBmp);
// minimal message loop: runs while the window exists and GetMessage returns non-zero
for (MSG msg; IsWindow(hWnd) && GetMessage(&msg, 0, 0, 0); ) DispatchMessage(&msg);
// DeleteObject(...)
}
Trying to convert 32 bit image B8R8G8A8 to 24bit image R8G8B8 and 16bit R5G5B5.
But result is very strange, maybe I do not understand how to convert image properly. How to do it properly and fix colors?
Input image:
After Convert32to16():
After Convert32to24():
stdafx.h
// stdafx.h : include file for standard system include files,
// or project specific include files that are used frequently, but
// are changed infrequently
//
#pragma once
#include "targetver.h"
#include <stdio.h>
#include <tchar.h>
#include <iostream>
#include <fstream>
#include <windows.h>
// TODO: reference additional headers your program requires here
ImageConverter.cpp
#include "stdafx.h"
using std::cout;
using std::endl;
using std::ofstream;
using std::ifstream;
void Convert32to24(void* B8G8R8A8, BYTE* R8G8B8, int width, int height)
{
long B8G8R8A8Size = (width * height * 4);
long j = 0;
for (long i = 0; i < (B8G8R8A8Size - 3); i = i + 4)
{
BYTE Red = ((PBYTE)B8G8R8A8)[i + 2];
BYTE Green = ((PBYTE)B8G8R8A8)[i + 1];
BYTE Blue = ((PBYTE)B8G8R8A8)[i];
BYTE Alpha = ((PBYTE)B8G8R8A8)[i + 3];
R8G8B8[j] = Red;
R8G8B8[j + 1] = Green;
R8G8B8[j + 2] = Blue;
j = j + 3;
}
}
void Convert32to16(void* B8G8R8A8, BYTE* R5G5B5, int width, int height)
{
long B8G8R8A8Size = (width * height * 4);
long j = 0;
for (long i = 0; i < (B8G8R8A8Size - 3); i = i + 4)
{
BYTE Red = ((PBYTE)B8G8R8A8)[i + 2] >> 3;
BYTE Green = ((PBYTE)B8G8R8A8)[i + 1] >> 3;
BYTE Blue = ((PBYTE)B8G8R8A8)[i] >> 3;
BYTE Alpha = ((PBYTE)B8G8R8A8)[i + 3];
uint16_t RGB565 = ((Red >> 3) << 11) | ((Green >> 2) << 5) | (Blue >> 3);
R5G5B5[j] = RGB565 >> 8;
R5G5B5[j + 1] = RGB565 & 0xFF;
j = j + 2;
}
}
// Writes `imageData` to `filename` as an uncompressed (BI_RGB), top-down BMP
// with a BITMAPINFOHEADER and no colour table.
//   width, height  - image size in pixels
//   BitCount       - bits per pixel stored in the header
//   bytesPerPixel  - used only to compute the size of the pixel data
// The pixel data is written as-is: width * height * bytesPerPixel bytes, so
// the caller must supply rows that already satisfy BMP's 4-byte row
// alignment. Returns silently if the file cannot be created.
void WriteDataToBmp(const WCHAR *filename, void *imageData, int width, int height, int BitCount, int bytesPerPixel)
{
    // Zero-initialise both headers: fields that are not assigned below
    // (bfReserved1/2, biXPelsPerMeter, biYPelsPerMeter, biClrUsed,
    // biClrImportant) previously went to disk with indeterminate values.
    BITMAPFILEHEADER fileHeader = {};
    BITMAPINFOHEADER fileInfo = {};

    fileInfo.biSize = sizeof(BITMAPINFOHEADER);
    fileInfo.biBitCount = BitCount;
    fileInfo.biCompression = BI_RGB;
    fileInfo.biWidth = width;
    fileInfo.biHeight = 0 - height;  // negative height marks a top-down bitmap
    fileInfo.biPlanes = 1;
    fileInfo.biSizeImage = (width * height * bytesPerPixel);

    fileHeader.bfSize = sizeof(BITMAPFILEHEADER) + sizeof(BITMAPINFOHEADER) + fileInfo.biSizeImage;
    fileHeader.bfType = 'MB';  // multi-character constant, stored as the bytes "BM"
    fileHeader.bfOffBits = sizeof(BITMAPFILEHEADER) + sizeof(BITMAPINFOHEADER);

    HANDLE hdl = CreateFile(filename, GENERIC_WRITE, 0, NULL, CREATE_ALWAYS, 0, NULL);
    if (hdl == INVALID_HANDLE_VALUE)
    {
        return;
    }

    DWORD bytesWritten = 0;
    WriteFile(hdl, &fileHeader, sizeof(fileHeader), &bytesWritten, NULL);
    WriteFile(hdl, &fileInfo, sizeof(fileInfo), &bytesWritten, NULL);
    WriteFile(hdl, imageData, fileInfo.biSizeImage, &bytesWritten, NULL);
    CloseHandle(hdl);
}
// Reads a 32-bit little-endian value from four consecutive bytes.
static unsigned int ReadLe32(const unsigned char* p)
{
    return (unsigned int)p[0] | ((unsigned int)p[1] << 8) |
           ((unsigned int)p[2] << 16) | ((unsigned int)p[3] << 24);
}

// Loads the pixel data of a BMP file that is assumed to hold 4 bytes per
// pixel. Returns a new[]-allocated buffer of |4 * width * height| bytes that
// the caller must delete[], or NULL when the file cannot be opened or its
// 54-byte header cannot be read.
//
// Changes over the previous version:
//   * a failed fopen/fread no longer leads to a NULL FILE* being used;
//   * the pixel data is read from the offset stored in the file header
//     (bfOffBits) instead of always from byte 54, so files with a larger
//     info header or extra data before the pixels load correctly;
//   * header fields are assembled from bytes instead of through unaligned
//     int* casts;
//   * the buffer is zero-filled, so a truncated file yields zeros, not
//     indeterminate bytes.
unsigned char* ReadDataFromBmp(char* filename)
{
    FILE* f = fopen(filename, "rb");
    if (f == NULL)
    {
        return NULL;
    }

    unsigned char info[54];
    if (fread(info, sizeof(unsigned char), 54, f) != 54)
    {
        fclose(f);
        return NULL;
    }

    const unsigned int pixelOffset = ReadLe32(&info[10]);  // bfOffBits
    const int width = (int)ReadLe32(&info[18]);            // biWidth
    const int height = (int)ReadLe32(&info[22]);           // biHeight (negative = top-down)
    const int size = abs(4 * width * height);

    unsigned char* data = new unsigned char[size]();

    // After the header read the file position is 54; only move when the
    // header says the pixels start further in.
    if (pixelOffset > 54)
    {
        fseek(f, (long)pixelOffset, SEEK_SET);
    }
    fread(data, sizeof(unsigned char), size, f);
    fclose(f);
    return data;
}
int main(int args, char** cat) {
int width = 1440;
int height = 900;
int bytesOnPixel;
BYTE *OutputImage24Bit = new BYTE[width * height * 3];
BYTE *OutputImage16Bit = new BYTE[width * height * 2];
unsigned char* inputImage32Bit = ReadDataFromBmp((char*)"E:/TestImage.bmp");
bytesOnPixel = 2;
Convert32to16(inputImage32Bit, OutputImage16Bit, width, height);
WriteDataToBmp(L"E:/TestImage16bit.bmp", OutputImage16Bit, width, height, 8 * bytesOnPixel, bytesOnPixel);
bytesOnPixel = 3;
Convert32to24(inputImage32Bit, OutputImage24Bit, width, height);
WriteDataToBmp(L"E:/TestImage24bit.bmp", OutputImage24Bit, width, height, 8 * bytesOnPixel, bytesOnPixel);
return 1;
}
fileInfo.biCompression = BI_RGB;
16-bit bitmap uses BI_BITFIELDS compression. In addition 16-bit bitmap has to populate the color table to show if it is using 555 format, 565 format, or a different format.
24-bit and 16-bit bitmap need padding. Albeit, that's not an issue if the width in bytes is a multiple of 4. In general you cannot read/write pixel after pixel because the padding can throw everything off. Instead make 2 loops to go through the height and width. Pixel size would also depend on padding.
Note that you can do the same with GDI+ or WIC. You can change the bitmap to different formats: PixelFormat16bppRGB555, PixelFormat16bppRGB565, PixelFormat16bppARGB1555, PixelFormat24bppRGB...
GDI+ example:
// GDI+ example: loads test.bmp, clones it into the 16-bpp RGB565 pixel
// format and saves the clone as copy.bmp. Returns 0 on success and 1 if
// GDI+ cannot start or the image cannot be loaded, converted or saved.
// The previous version dereferenced the results of FromFile/Clone unchecked.
int main()
{
    Gdiplus::GdiplusStartupInput tmp;
    ULONG_PTR token;
    if (Gdiplus::GdiplusStartup(&token, &tmp, NULL) != Gdiplus::Ok)
        return 1;

    int result = 1;
    auto *source = Gdiplus::Bitmap::FromFile(L"test.bmp");
    if (source != NULL && source->GetLastStatus() == Gdiplus::Ok)
    {
        auto *destination = source->Clone(0, 0, source->GetWidth(), source->GetHeight(),
            PixelFormat16bppRGB565);
        if (destination != NULL)
        {
            // CLSID of the built-in BMP encoder.
            CLSID clsid_bmp;
            CLSIDFromString(L"{557cf400-1a04-11d3-9a73-0000f81ef32e}", &clsid_bmp);
            if (destination->Save(L"copy.bmp", &clsid_bmp) == Gdiplus::Ok)
                result = 0;
            delete destination;
        }
    }
    // GDI+ objects must be destroyed before GdiplusShutdown is called.
    delete source;

    Gdiplus::GdiplusShutdown(token);
    return result;
}
Home made version: (using std::vector for memory, instead of new/delete)
// Writes the 32-bpp pixel rows in `src` to `file` as a 24-bpp BI_RGB BMP.
// The first three bytes of every source pixel are copied unchanged and the
// fourth is dropped. Destination rows are padded to a multiple of 4 bytes;
// source rows are taken to be width * 4 bytes apart. Row order is kept and
// the header stores `height` as given.
void Convert32to24(const wchar_t* file, std::vector<BYTE> &src, int width, int height)
{
    const int srcStride = width * 4;
    const int dstStride = ((width * 24 + 31) / 32) * 4;
    DWORD size = dstStride * height;
    std::vector<BYTE> dst(size);

    for(int row = 0; row < height; row++)
    {
        const int srcRow = row * srcStride;
        const int dstRow = row * dstStride;
        for(int col = 0; col < width; col++)
        {
            const int from = srcRow + col * 4;
            const int to = dstRow + col * 3;
            for(int channel = 0; channel < 3; channel++)
                dst[to + channel] = src[from + channel];
        }
    }

    // 14-byte file header + 40-byte info header = 54 bytes before the pixels.
    BITMAPFILEHEADER bf = { 'MB', 54 + size, 0, 0, 54 };
    BITMAPINFOHEADER bi = { sizeof(bi), width, height, 1, 24, BI_RGB };
    std::ofstream fout(file, std::ios::binary);
    fout.write((char*)&bf, sizeof(bf));
    fout.write((char*)&bi, sizeof(bi));
    fout.write((char*)&dst[0], size);
}
void Convert32to16(const wchar_t* file, std::vector<BYTE> &src, int width, int height)
{
int width_in_bytes_32 = width * 4;
int width_in_bytes_16 = ((width * 16 + 31) / 32) * 4;
DWORD size = width_in_bytes_16 * height;
std::vector<BYTE> dst(size);
for(int h = 0; h < height; h++)
for(int w = 0; w < width; w++)
{
int i = h * width_in_bytes_32 + w * 4;
int j = h * width_in_bytes_16 + w * 2;
//555 format, each color is from 0 to 32, instead of 0 to 256
uint16_t blu = (uint16_t)(src[i + 0] * 31.f / 255.f);
uint16_t grn = (uint16_t)(src[i + 1] * 31.f / 255.f);
uint16_t red = (uint16_t)(src[i + 2] * 31.f / 255.f);
uint16_t sum = (red) | (grn << 5) | (blu << 10);
memcpy(&dst[j], &sum, 2);
}
BITMAPFILEHEADER bf = { 'MB', 54 + size, 0, 0, 54 };
BITMAPINFOHEADER bi = { sizeof(bi), width, height, 1, 16, BI_BITFIELDS };
std::ofstream fout(file, std::ios::binary);
fout.write((char*)&bf, sizeof(bf));
fout.write((char*)&bi, sizeof(bi));
//555 format
COLORREF color[]{
0b0000000000011111,//31
0b0000001111100000,//31 << 5
0b0111110000000000 //31 << 10
};
fout.write((char*)&color, sizeof(color));
fout.write((char*)&dst[0], size);
}
// Reads a 32-bpp BMP and writes 16-bit and 24-bit copies of it.
// Returns 0 in all cases, as before; on any problem with the input it just
// returns without writing output.
//
// Changes over the previous version:
//   * the pixel data is read from bfOffBits rather than from wherever the
//     stream happens to be after the 40-byte info header, so files with a
//     larger info header or extra data before the pixels convert correctly;
//   * exactly width * height * 4 bytes are read, and a short read aborts;
//   * non-positive dimensions (including top-down files with a negative
//     height, which the converters do not handle) are rejected.
int main()
{
    const wchar_t* file_32 = L"E:\\TestImage.bmp";
    const wchar_t* file_16 = L"E:\\OutputImage16Bit.bmp";
    const wchar_t* file_24 = L"E:\\OutputImage24Bit.bmp";

    BITMAPFILEHEADER bh;
    BITMAPINFOHEADER bi;

    std::ifstream fin(file_32, std::ios::binary);
    if(!fin)
        return 0;

    fin.read((char*)&bh, sizeof(bh));
    fin.read((char*)&bi, sizeof(bi));
    if(!fin || bi.biBitCount != 32)
        return 0;
    if(bi.biWidth <= 0 || bi.biHeight <= 0)
        return 0;

    // 32-bpp rows are always a multiple of 4 bytes, so there is no padding.
    const size_t pixelBytes = (size_t)bi.biWidth * 4 * (size_t)bi.biHeight;
    std::vector<BYTE> source(pixelBytes);
    fin.seekg(bh.bfOffBits, std::ios::beg);
    fin.read((char*)&source[0], pixelBytes);
    if(!fin)
        return 0;

    Convert32to16(file_16, source, bi.biWidth, bi.biHeight);
    Convert32to24(file_24, source, bi.biWidth, bi.biHeight);

    return 0;
}
So I want to use SIMD instructions in C++ to compare values from an uint32_t array and store the values back in a new one of this arrays.
It works more or less fine, but I am still using 4 if-clauses to check each of the values I get back from the SIMD comparison and decide whether to write the corresponding position to the result array.
is there a way to do this with SIMD instructions?
The function allocateAlignedBuffer does what the name says and is working correctly.
// Collects the indices of all elements of `arr` that are >= 10.
//   arr - input; must be 16-byte aligned (aligned loads are used) and
//         readable in whole groups of four elements up to `num`
//   num - number of elements; expected to be a multiple of 4
//   cnt - receives the number of indices written
// Returns a buffer from allocateAlignedBuffer holding the matching indices
// in ascending order; only the first *cnt entries are meaningful.
//
// Fixes over the previous version:
//   * `*cnt++` advanced the pointer instead of the counter, so partial
//     matches were never counted and `cnt` was left pointing elsewhere;
//   * the comparison used the float intrinsic _mm_cmpge_ps on integer
//     vectors; it is now the integer compare `value > 9`, i.e. `>= 10`.
// The compare is signed, so values above INT32_MAX do not match.
uint32_t* testFunc(uint32_t* arr, uint32_t num, uint32_t* cnt) {
    uint32_t * resArr = allocateAlignedBuffer<uint32_t>(num, true);
    uint32_t * resPos = resArr;
    *cnt = 0;

    const __m128i threshold = _mm_set1_epi32(10 - 1);
    // Debug output per lane, kept verbatim; printed only for partial matches.
    static const char* const laneMessages[4] = {
        "first byte not 0\n", "second byte not 0\n", "3rd byte not 0\n", "4th byte not 0\n"
    };

    for (uint32_t i = 0; i < num; i += 4) {
        const __m128i vec = _mm_load_si128(reinterpret_cast<const __m128i*>(&arr[i]));
        const __m128i hits = _mm_cmpgt_epi32(vec, threshold);
        // One bit per 32-bit lane: bit k is set when arr[i + k] matched.
        const int mask = _mm_movemask_ps(_mm_castsi128_ps(hits));
        if (mask == 0) {
            continue;
        }
        const bool allMatched = (mask == 0xF);
        for (uint32_t lane = 0; lane < 4; ++lane) {
            if (mask & (1 << lane)) {
                if (!allMatched) {
                    std::cout << laneMessages[lane];
                }
                *resPos++ = i + lane;
                ++(*cnt);
            }
        }
    }
    std::cout << "cnt "<<*cnt<<"\n";
    return resArr;
}
Also there is probably a lot to optimize I believe.
Another variant with using shuffles:
// Byte-shuffle controls for _mm_shuffle_epi8, indexed by a 4-bit lane mask
// (bit k set = 32-bit lane k selected). Entry m moves the selected lanes, in
// ascending lane order, to the front of the vector. The trailing byte
// indices are 0, so the unused tail of the result repeats source byte 0.
__m128i g_shuffles[16] =
{
_mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0),
_mm_setr_epi8(12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(4, 5, 6, 7, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 0, 0, 0),
_mm_setr_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0),
_mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0),
_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
};
// Number of set bits in each 4-bit lane mask, i.e. how many lanes entry m of
// g_shuffles moves to the front.
uint32_t g_steps[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
// Collects the indices of all elements of `arr` that are greater than 9
// (signed compare) into a buffer obtained from _mm_malloc; the caller
// releases it with _mm_free.
//   arr - input, processed four elements at a time with unaligned loads
//   num - number of elements; the loop always loads whole groups of four
//   cnt - receives the number of matches
// Every iteration stores a full 16-byte vector at the write position and
// then advances it by the number of matches only, so later stores overwrite
// the unused tail. The match count is accumulated per lane and summed at
// the end.
uint32_t * testFunc2(uint32_t* arr, uint32_t num, uint32_t * cnt)
{
    uint32_t * const out = (uint32_t*)_mm_malloc(num*sizeof(uint32_t), 16);
    uint32_t * writePtr = out;
    *cnt = 0;

    const __m128i limit = _mm_set1_epi32(10 - 1);
    const __m128i laneStride = _mm_set1_epi32(4);
    const __m128i one = _mm_set1_epi32(1);
    __m128i laneIndex = _mm_setr_epi32(0, 1, 2, 3);   // indices of the current group
    __m128i laneHits = _mm_setzero_si128();            // per-lane match counters

    for (int base = 0; base < num; base += 4, laneIndex = _mm_add_epi32(laneIndex, laneStride))
    {
        const __m128i values = _mm_loadu_si128((__m128i*)(arr + base));
        const __m128i passed = _mm_cmpgt_epi32(values, limit);
        laneHits = _mm_add_epi32(laneHits, _mm_and_si128(passed, one));

        // 4-bit lane mask selects both the shuffle and the advance.
        const int selector = _mm_movemask_ps(_mm_castsi128_ps(passed));
        const __m128i packed = _mm_shuffle_epi8(laneIndex, g_shuffles[selector]);
        _mm_storeu_si128((__m128i*)writePtr, packed);
        writePtr += g_steps[selector];
    }

    uint32_t perLane[4];
    _mm_storeu_si128((__m128i*)perLane, laneHits);
    *cnt = perLane[0] + perLane[1] + perLane[2] + perLane[3];
    std::cout << "cnt " << *cnt << "\n";
    return out;
}
Here's a version with a pshufb trick to do the compaction, not tested though and the shuffle masks shouldn't really be local.
// Collects the indices of all elements of `arr` that are >= 10 (signed
// compare) using a pshufb-based compaction.
//   arr - input, processed four elements at a time with unaligned loads
//   num - number of elements; the loop always loads whole groups of four
//   cnt - receives the number of matches
// Returns a buffer from _mm_malloc (release with _mm_free) holding the
// matching indices in ascending order.
//
// Changes over the previous version:
//   * the test was `value > 10`, which disagreed with the `>= 10` used by
//     the question and by the other shuffle-based variant; it is now
//     `value > 9`;
//   * the shuffle table is built once instead of on every call;
//   * unused locals were removed.
uint32_t* testFunc(uint32_t* arr, uint32_t num, uint32_t * cnt)
{
    uint32_t * resArr = (uint32_t*)_mm_malloc(num*sizeof(uint32_t), 16);
    uint32_t * resPos = resArr;
    *cnt = 0;

    const __m128i threshold = _mm_set1_epi32(10 - 1);
    const __m128i _4 = _mm_set1_epi32(4);
    __m128i positions = _mm_setr_epi32(0, 1, 2, 3);
    int count = 0;

    // pshufb control bytes with the top bit set produce a zero byte, so X
    // zero-fills the lanes that are not used by a given mask.
    const int X = 0x80808080;
    // Indexed by the 4-bit compare mask: entry m moves the selected 32-bit
    // lanes, in ascending order, to the front of the vector.
    static const __m128i compaction_masks[16] = {
        _mm_set1_epi8((char)0x80),
        _mm_set_epi32(X, X, X, 0x03020100),
        _mm_set_epi32(X, X, X, 0x07060504),
        _mm_set_epi32(X, X, 0x07060504, 0x03020100),
        _mm_set_epi32(X, X, X, 0x0B0A0908),
        _mm_set_epi32(X, X, 0x0B0A0908, 0x03020100),
        _mm_set_epi32(X, X, 0x0B0A0908, 0x07060504),
        _mm_set_epi32(X, 0x0B0A0908, 0x07060504, 0x03020100),
        _mm_set_epi32(X, X, X, 0x0F0E0D0C),
        _mm_set_epi32(X, X, 0x0F0E0D0C, 0x03020100),
        _mm_set_epi32(X, X, 0x0F0E0D0C, 0x07060504),
        _mm_set_epi32(X, 0x0F0E0D0C, 0x07060504, 0x03020100),
        _mm_set_epi32(X, X, 0x0F0E0D0C, 0x0B0A0908),
        _mm_set_epi32(X, 0x0F0E0D0C, 0x0B0A0908, 0x03020100),
        _mm_set_epi32(X, 0x0F0E0D0C, 0x0B0A0908, 0x07060504),
        _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
    };

    for (int i = 0; i < num; i += 4)
    {
        const __m128i vec = _mm_loadu_si128((__m128i*)(arr + i));
        const __m128i hits = _mm_cmpgt_epi32(vec, threshold);
        const int comp = _mm_movemask_ps(_mm_castsi128_ps(hits));

        // Always store a full vector; only the matched prefix is kept
        // because the write position advances by the match count.
        const __m128i packed = _mm_shuffle_epi8(positions, compaction_masks[comp]);
        _mm_storeu_si128((__m128i*)resPos, packed);

        const int matched = __builtin_popcount(comp);
        resPos += matched;
        count += matched;
        positions = _mm_add_epi32(positions, _4);
    }
    *cnt = count;
    return resArr;
}
The idea here is that every individual case could of course be shuffled into place, the 16 cases are distinguished by loading the shuffle mask corresponding to the case-index, which is given by movmskps. With AVX2 you can do a similar thing using vpermd.
I have made some changes that should improve performance:
#include <immintrin.h>
#include <memory.h>
// Collects the indices of all elements of `arr` that are >= 10 (signed
// compare).
//   arr - input, processed four elements at a time with unaligned loads
//   num - number of elements; the loop always loads whole groups of four
//   cnt - receives the number of matches
// Returns a buffer from _mm_malloc (release with _mm_free) holding the
// matching indices in ascending order.
// A group in which all four elements match is stored with one vector
// store; other groups are handled lane by lane and print a debug line per
// matching lane.
//
// Changes over the previous version:
//   * the code tested `value > 10` while its own comment said `>=`; the
//     test is now `value > 9`, i.e. `>= 10`;
//   * lanes are inspected through the movemask bits and a stored copy of
//     the positions instead of casting vector addresses to uint32_t*;
//   * an unused local was removed.
uint32_t* testFunc(uint32_t* arr, uint32_t num, uint32_t * cnt)
{
    uint32_t * resArr = (uint32_t*)_mm_malloc(num*sizeof(uint32_t), 16);
    uint32_t * resPos = resArr;
    *cnt = 0;

    const __m128i threshold = _mm_set1_epi32(10 - 1);
    const __m128i _4 = _mm_set1_epi32(4);
    const __m128i _1 = _mm_set1_epi32(1);
    __m128i positions = _mm_setr_epi32(0, 1, 2, 3);
    __m128i _cnt = _mm_setzero_si128();   // per-lane match counters

    static const char* const laneMessages[4] = {
        "first byte not 0\n", "second byte not 0\n", "3rd byte not 0\n", "4th byte not 0\n"
    };

    for (int i = 0; i < num; i += 4)
    {
        const __m128i vec = _mm_loadu_si128((__m128i*)(arr + i));
        const __m128i hits = _mm_cmpgt_epi32(vec, threshold);   // arr >= 10
        _cnt = _mm_add_epi32(_cnt, _mm_and_si128(hits, _1));

        // 16 bits, four identical bits per 32-bit lane.
        const int comp = _mm_movemask_epi8(hits);
        if (comp == 65535)
        {
            _mm_storeu_si128((__m128i*)resPos, positions);
            resPos += 4;
        }
        else if (comp != 0)
        {
            uint32_t lanePositions[4];
            _mm_storeu_si128((__m128i*)lanePositions, positions);
            for (int lane = 0; lane < 4; ++lane)
            {
                if (comp & (1 << (4 * lane)))
                {
                    std::cout << laneMessages[lane];
                    *resPos++ = lanePositions[lane];
                }
            }
        }
        positions = _mm_add_epi32(positions, _4);
    }

    uint32_t cnts[4];
    _mm_storeu_si128((__m128i*)cnts, _cnt);
    *cnt = cnts[0] + cnts[1] + cnts[2] + cnts[3];
    std::cout << "cnt " << *cnt << "\n";
    return resArr;
}
Of course, it would be even better if all the scalar instructions in the loop were replaced with vector instructions.
I'm going to find lines in an image, so I use hough transform to do it.
But now I'm trying to find the longest line in the image (my image is guaranteed to contain one). Is there any method to do it without sacrificing computation speed?
using namespace std;
using namespace cv;
int main()
{
VideoCapture cap("D:\\DataBox\\v0.avi");
if (!cap.isOpened())
cout << "fail to open!" << endl; //return -1;
else
cout << "Video Load Succeed" << endl;
while (true)
{
cout << "----------------------------------------------------" << endl;
Mat src;
cap >> src;
pyrDown(src, src, Size(src.cols / 2, src.rows / 2));
pyrDown(src, src, Size(src.cols / 2, src.rows / 2));
cvtColor(src, src, CV_BGR2GRAY);
Mat tsrc;
threshold(src, tsrc, 90, 255, THRESH_BINARY_INV);
Mat grad_x, grad_y;
Mat abs_grad_x, abs_grad_y;
Mat sobel;
int scale = 1;
int delta = 0;
int ddepth = CV_16S;
Sobel(tsrc, grad_x, ddepth, 1, 0, 3, scale, delta, BORDER_DEFAULT);
convertScaleAbs(grad_x, abs_grad_x);
Sobel(tsrc, grad_y, ddepth, 0, 1, 3, scale, delta, BORDER_DEFAULT);
convertScaleAbs(grad_y, abs_grad_y);
addWeighted(abs_grad_x, 0.5, abs_grad_y, 0.5, 0, sobel);
vector<Vec2f> lines;
int threshold = 250;
HoughLines(sobel , lines, 1, CV_PI / 180, threshold, 0, 0);
Mat cdst;
cvtColor(sobel, cdst, CV_GRAY2BGR);
for (size_t i = 0; i < lines.size(); i++)
{
float rho = lines[i][0], theta = lines[i][1];
Point pt1, pt2;
double a = cos(theta), b = sin(theta);
double x0 = a*rho, y0 = b*rho;
pt1.x = cvRound(x0 + 1000 * (-b));
pt1.y = cvRound(y0 + 1000 * (a));
pt2.x = cvRound(x0 - 1000 * (-b));
pt2.y = cvRound(y0 - 1000 * (a));
line(cdst, pt1, pt2, Scalar(0, 0, 255), 3, CV_AA);
}
imshow("Video", cdst);
waitKey(30);
}
}