Maximum float value in 10-bit image in WIC - winapi

I'm trying to convert a HDR image float array I load to a 10-bit DWORD with WIC.
The type of the loading file is GUID_WICPixelFormat128bppPRGBAFloat and I got an array of 4 floats per color.
When I try to convert these to 10 bit as follows:
struct RGBX
{
unsigned int b : 10;
unsigned int g : 10;
unsigned int r : 10;
int a : 2;
} rgbx;
(which is the format requested by the NVIDIA encoding library for 10-bit rgb),
then I assume I have to divide each of the floats by 1024.0f in order to get them inside the 10 bits of a DWORD.
However, I notice that some of the floats are > 1, which means that their range is not [0,1] as it happens when the image is 8 bit.
What would their range be? How to store a floating point color into a 10-bits integer?
I'm trying to use the NVidia's HDR encoder which requires an ARGB10 like the above structure.
How is the 10 bit information of a color stored as a floating point number?
Btw I tried to convert with WIC but conversion from GUID_WICPixelFormat128bppPRGBAFloat to GUID_WICPixelFormat32bppR10G10B10A2 fails.
HRESULT ConvertFloatTo10(const float* f, int wi, int he, std::vector<DWORD>& out)
{
CComPtr<IWICBitmap> b;
wbfact->CreateBitmapFromMemory(wi, he, GUID_WICPixelFormat128bppPRGBAFloat, wi * 16, wi * he * 16, (BYTE*)f, &b);
CComPtr<IWICFormatConverter> wf;
wbfact->CreateFormatConverter(&wf);
wf->Initialize(b, GUID_WICPixelFormat32bppR10G10B10A2, WICBitmapDitherTypeNone, 0, 0, WICBitmapPaletteTypeCustom);
// This last call fails with 0x88982f50 : The component cannot be found.
}
Edit: I found a paper (https://hal.archives-ouvertes.fr/hal-01704278/document), is this relevant to this question?

Floating-point color content that is greater than the 0..1 range is High Dynamic Range (HDR) content. If you trivially convert it to 10:10:10:2 UNORM then you are using 'clipping' for values over 1. This doesn't give good results.
SDR 10:10:10 or 8:8:8
You should instead use tone-mapping which converts the HDR signal to a SDR (Standard Dynamic Range a.k.a. 0..1) before or as part of doing the conversion to 10:10:10:2.
There a many different approaches to tone-mapping, but a common 'generic' solution is the Reinhard tone-mapping operator. Here's an implementation using DirectXTex.
std::unique_ptr<ScratchImage> timage(new (std::nothrow) ScratchImage);
if (!timage)
{
wprintf(L"\nERROR: Memory allocation failed\n");
return 1;
}
// Compute max luminosity across all images
XMVECTOR maxLum = XMVectorZero();
hr = EvaluateImage(image->GetImages(), image->GetImageCount(), image->GetMetadata(),
[&](const XMVECTOR* pixels, size_t w, size_t y)
{
UNREFERENCED_PARAMETER(y);
for (size_t j = 0; j < w; ++j)
{
static const XMVECTORF32 s_luminance = { { { 0.3f, 0.59f, 0.11f, 0.f } } };
XMVECTOR v = *pixels++;
v = XMVector3Dot(v, s_luminance);
maxLum = XMVectorMax(v, maxLum);
}
});
if (FAILED(hr))
{
wprintf(L" FAILED [tonemap maxlum] (%08X%ls)\n", static_cast<unsigned int>(hr), GetErrorDesc(hr));
return 1;
}
maxLum = XMVectorMultiply(maxLum, maxLum);
hr = TransformImage(image->GetImages(), image->GetImageCount(), image->GetMetadata(),
[&](XMVECTOR* outPixels, const XMVECTOR* inPixels, size_t w, size_t y)
{
UNREFERENCED_PARAMETER(y);
for (size_t j = 0; j < w; ++j)
{
XMVECTOR value = inPixels[j];
const XMVECTOR scale = XMVectorDivide(
XMVectorAdd(g_XMOne, XMVectorDivide(value, maxLum)),
XMVectorAdd(g_XMOne, value));
const XMVECTOR nvalue = XMVectorMultiply(value, scale);
value = XMVectorSelect(value, nvalue, g_XMSelect1110);
outPixels[j] = value;
}
}, *timage);
if (FAILED(hr))
{
wprintf(L" FAILED [tonemap apply] (%08X%ls)\n", static_cast<unsigned int>(hr), GetErrorDesc(hr));
return 1;
}
HDR10
UPDATE: If you are trying to convert HDR floating-point content to an "HDR10" signal, then you need to do:
Color-space rotate from Rec.709 or P3D65 to Rec.2020.
Normalize for 'paper white' / 10,000 nits.
Apply the ST.2084 gamma curve.
Quantize to 10-bit.
// HDTV to UHDTV (Rec.709 color primaries into Rec.2020)
const XMMATRIX c_from709to2020 =
{
0.6274040f, 0.0690970f, 0.0163916f, 0.f,
0.3292820f, 0.9195400f, 0.0880132f, 0.f,
0.0433136f, 0.0113612f, 0.8955950f, 0.f,
0.f, 0.f, 0.f, 1.f
};
// DCI-P3-D65 https://en.wikipedia.org/wiki/DCI-P3 to UHDTV (DCI-P3-D65 color primaries into Rec.2020)
const XMMATRIX c_fromP3D65to2020 =
{
0.753845f, 0.0457456f, -0.00121055f, 0.f,
0.198593f, 0.941777f, 0.0176041f, 0.f,
0.047562f, 0.0124772f, 0.983607f, 0.f,
0.f, 0.f, 0.f, 1.f
};
// Custom Rec.709 into Rec.2020
const XMMATRIX c_fromExpanded709to2020 =
{
0.6274040f, 0.0457456f, -0.00121055f, 0.f,
0.3292820f, 0.941777f, 0.0176041f, 0.f,
0.0433136f, 0.0124772f, 0.983607f, 0.f,
0.f, 0.f, 0.f, 1.f
};
inline float LinearToST2084(float normalizedLinearValue)
{
const float ST2084 = pow((0.8359375f + 18.8515625f * pow(abs(normalizedLinearValue), 0.1593017578f)) / (1.0f + 18.6875f * pow(abs(normalizedLinearValue), 0.1593017578f)), 78.84375f);
return ST2084; // Don't clamp between [0..1], so we can still perform operations on scene values higher than 10,000 nits
}
// You can adjust this up to 10000.f
float paperWhiteNits = 200.f;
hr = TransformImage(image->GetImages(), image->GetImageCount(), image->GetMetadata(),
[&](XMVECTOR* outPixels, const XMVECTOR* inPixels, size_t w, size_t y)
{
UNREFERENCED_PARAMETER(y);
const XMVECTOR paperWhite = XMVectorReplicate(paperWhiteNits);
for (size_t j = 0; j < w; ++j)
{
XMVECTOR value = inPixels[j];
XMVECTOR nvalue = XMVector3Transform(value, c_from709to2020);
// Some people prefer the look of using c_fromP3D65to2020
// or c_fromExpanded709to2020 instead.
// Convert to ST.2084
nvalue = XMVectorDivide(XMVectorMultiply(nvalue, paperWhite), c_MaxNitsFor2084);
XMFLOAT4A tmp;
XMStoreFloat4A(&tmp, nvalue);
tmp.x = LinearToST2084(tmp.x);
tmp.y = LinearToST2084(tmp.y);
tmp.z = LinearToST2084(tmp.z);
nvalue = XMLoadFloat4A(&tmp);
value = XMVectorSelect(value, nvalue, g_XMSelect1110);
outPixels[j] = value;
}
}, *timage);
You should really take a look at texconv.
Reference
Reinhard et al., "Photographic tone reproduction for digital images", ACM Transactions on Graphics, Volume 21, Issue 3 (July 2002). ACM DL.

#ChuckWalbourn answer is helpful, however I don't want to tonemap to [0,1] as there is no point in tonemapping to SDR then going to 10-bit HDR.
What I 'd think it's correct is to scale to [0,4] instead by first using g_XMFour.
const XMVECTOR scale = XMVectorDivide(
XMVectorAdd(g_XMFour, XMVectorDivide(v, maxLum)),
XMVectorAdd(g_XMFour, v));
then using a specialized 10-bit store which scales by 255 instead of 1023:
void XMStoreUDecN4a(DirectX::PackedVector::XMUDECN4* pDestination,DirectX::FXMVECTOR V)
{
using namespace DirectX;
XMVECTOR N;
static const XMVECTOR Scale = { 255.0f, 255.0f, 255.0f, 3.0f };
assert(pDestination);
N = XMVectorClamp(V, XMVectorZero(), g_XMFour);
N = XMVectorMultiply(N, Scale);
pDestination->v = ((uint32_t)DirectX::XMVectorGetW(N) << 30) |
(((uint32_t)DirectX::XMVectorGetZ(N) & 0x3FF) << 20) |
(((uint32_t)DirectX::XMVectorGetY(N) & 0x3FF) << 10) |
(((uint32_t)DirectX::XMVectorGetX(N) & 0x3FF));
}
And then a specialized 10-bit load which divides with 255 instead of 1023:
DirectX::XMVECTOR XMLoadUDecN4a(DirectX::PackedVector::XMUDECN4* pSource)
{
using namespace DirectX;
fourx vectorOut;
uint32_t Element;
Element = pSource->v & 0x3FF;
vectorOut.r = (float)Element / 255.f;
Element = (pSource->v >> 10) & 0x3FF;
vectorOut.g = (float)Element / 255.f;
Element = (pSource->v >> 20) & 0x3FF;
vectorOut.b = (float)Element / 255.f;
vectorOut.a = (float)(pSource->v >> 30) / 3.f;
const DirectX::XMVECTORF32 j = { vectorOut.r,vectorOut.g,vectorOut.b,vectorOut.a };
return j;
}

Related

Correct RGB values for AVFrame

I have to fill the ffmpeg AVFrame->data from a cairo surface pixel data. I have this code:
/* Image info and pixel data */
width = cairo_image_surface_get_width( surface );
height = cairo_image_surface_get_height( surface );
stride = cairo_image_surface_get_stride( surface );
pix = cairo_image_surface_get_data( surface );
for( row = 0; row < height; row++ )
{
data = pix + row * stride;
for( col = 0; col < width; col++ )
{
img->video_frame->data[0][row * img->video_frame->linesize[0] + col] = data[0];
img->video_frame->data[1][row * img->video_frame->linesize[1] + col] = data[1];
//img->video_frame->data[2][row * img->video_frame->linesize[2] + col] = data[2];
data += 4;
}
img->video_frame->pts++;
}
But the colors in the exported video are wrong. The original heart is red. Can someone point me in the right direction? The encode.c example is useless sadly and on the Internet there is a lot of confusion about Y, Cr and Cb which I really don't understand. Please feel free to ask for more details. Many thanks.
You need to use libswscale to convert the source image data from RGB24 to YUV420P.
Something like:
int width = cairo_image_surface_get_width( surface );
int height = cairo_image_surface_get_height( surface );
int stride = cairo_image_surface_get_stride( surface );
uint8_t *pix = cairo_image_surface_get_data( surface );
uint8_t *data[1] = { pix };
int linesize[1] = { stride };
struct SwsContext *sws_ctx = sws_getContext(width, height, AV_PIX_FMT_RGB24 ,
width, height, AV_PIX_FMT_YUV420P,
SWS_BILINEAR, NULL, NULL, NULL);
sws_scale(sws_ctx, data, linesize, 0, height,
img->video_frame->data, img->video_frame->linesize);
sws_freeContext(sws_ctx);
See the example here: scaling_video

Can't get BITMAPINFOHEADER data to display odd width bmp images correctly

I am trying to display a 24-bit uncompressed bitmap with an odd width using standard Win32 API calls, but it seems like I have a stride problem.
According to the msdn:
https://msdn.microsoft.com/en-us/library/windows/desktop/dd318229%28v=vs.85%29.aspx
"For uncompressed RGB formats, the minimum stride is always the image width in bytes, rounded up to the nearest DWORD. You can use the following formula to calculate the stride:
stride = ((((biWidth * biBitCount) + 31) & ~31) >> 3)"
but this simply does not work for me and below is is the code:
void Init()
{
pImage = ReadBMP("data\\bird.bmp");
size_t imgSize = pImage->width * pImage->height * 3;
BITMAPINFOHEADER bmih;
bmih.biSize = sizeof(BITMAPINFOHEADER);
bmih.biBitCount = 24;
// This is probably where the bug is
LONG stride = ((((pImage->width * bmih.biBitCount) + 31) & ~31) >> 3);
//bmih.biWidth = pImage->width;
bmih.biWidth = stride;
bmih.biHeight = -((LONG)pImage->height);
bmih.biPlanes = 1;
bmih.biCompression = BI_RGB;
bmih.biSizeImage = 0;
bmih.biXPelsPerMeter = 1;
bmih.biYPelsPerMeter = 1;
bmih.biClrUsed = 0;
bmih.biClrImportant = 0;
BITMAPINFO dbmi;
ZeroMemory(&dbmi, sizeof(dbmi));
dbmi.bmiHeader = bmih;
dbmi.bmiColors->rgbBlue = 0;
dbmi.bmiColors->rgbGreen = 0;
dbmi.bmiColors->rgbRed = 0;
dbmi.bmiColors->rgbReserved = 0;
HDC hdc = ::GetDC(NULL);
mTestBMP = CreateDIBitmap(hdc,
&bmih,
CBM_INIT,
pImage->pSrc,
&dbmi,
DIB_RGB_COLORS);
hdc = ::GetDC(NULL);
}
and here the drawing fuction
RawBMP *pImage;
HBITMAP mTestBMP;
void UpdateScreen(HDC srcHDC)
{
if (pImage != nullptr && mTestBMP != 0x00)
{
HDC hdc = CreateCompatibleDC(srcHDC);
SelectObject(hdc, mTestBMP);
BitBlt(srcHDC,
0, // x
0, // y
// I tried passing the stride here and it did not work either
pImage->width, // width of the image
pImage->height, // height
hdc,
0, // x and
0, // y of upper left corner
SRCCOPY);
DeleteDC(hdc);
}
}
If I pass the original image width (odd number) instead of the stride
LONG stride = ((((pImage->width * bmih.biBitCount) + 31) & ~31) >> 3);
//bmih.biWidth = stride;
bmih.biWidth = pImage->width;
the picture looks skewed, below shows the differences:
and if I pass the stride according to msdn, then nothing shows up because the stride is too large.
any clues? Thank you!
thanks Jonathan for the solution. I need to copy row by row with the proper padding for odd width images. More or less the code for 24-bit uncompressed images:
const uint32_t bitCount = 24;
LONG strideInBytes;
// if the width is odd, then we need to add padding
if (width & 0x1)
{
strideInBytes = ((((width * bitCount) + 31) & ~31) >> 3);
}
else
{
strideInBytes = width * 3;
}
// allocate the new buffer
unsigned char *pBuffer = new unsigned char[strideInBytes * height];
memset(pBuffer, 0xaa, strideInBytes * height);
// Copy row by row
for (uint32_t yy = 0; yy < height; yy++)
{
uint32_t rowSizeInBytes = width * 3;
unsigned char *pDest = &pBuffer[yy * strideInBytes];
unsigned char *pSrc = &pData[yy * rowSizeInBytes];
memcpy(pDest, pSrc, rowSizeInBytes);
}
rawBMP->pSrc = pBuffer;
rawBMP->width = width;
rawBMP->height = height;
rawBMP->stride = strideInBytes;

I am using Open cv for creating RGB to HSI then doing a histogram. Then Fourier transform and back to HSI to RGB

I can not debug this programme. I am going to convert RGB to HSI and then Put a histogram in anyone channel. before Fourier and after Fourier.
#include "stdafx.h"
#include <opencv2/opencv.hpp>
#include <opencv\highgui.h>
#include <iostream>
// ass.cpp : Converts the given RGB image to HSI colour space then
// performs Fourier filtering on a particular channel.
//
using namespace std;
using namespace cv;
// Declarations of 4 unfinished functions
Mat rgb2hsi(const Mat& rgb); // converts RGB image to HSI space
Mat hsi2rgb(const Mat& hsi); // converts HSI image to RGB space
Mat histogram(const Mat& im); // returns the histogram of the selected channel in HSI space
// void filter(Mat& im);// // performs frequency-domain filtering on a single-channel image
int main(int argc, char* argv[])
{
if (argc < 2) // check number of arguments
{
cerr << "feed me something!!" << endl; // no arguments passed
return -1;
}
string path = argv[1];
Mat im; // load an RGB image
Mat hsi = rgb2hsi(im); // convert it to HSI space
Mat slices[3]; // 3 channels of the converted HSI image
im = imread(path); //try to load path
if (im.empty()) // loaded Sucessfully
{
cerr << "I Cannot load the file : ";
return -1;
}
imshow("BEFORE", im);
split(hsi, slices); // split up the packed HSI image into an array of matrices
Mat& h = slices[0];
Mat& s = slices[1];
Mat& i = slices[2]; // references to H, S, and I layers
Mat hist1, hist2; // histogram of the selected channel before and after filtering
Going to apply histogram. May be I miss some header. draw is not taken.
Mat histogram(const Mat& im)
{
Mat hist;
const float range[] = { 0, 255 };
const int channels[] = { 0 };
const int bins = range[1] - range[0];
const int dims[] = { bins, 1 };
const Size binSize(2, 240);
const float* ranges[] = { range };
// calculate the histogram
calcHist(&im, 1, channels, Mat(), hist, 1, dims, ranges);
Mat draw = Mat::zeros(binSize.height, binSize.width * bins, CV_8UC3);
double maxVal;
minMaxLoc(hist, NULL, &maxVal, 0, 0);
for (int b = 0; b < bins; b++)
{
float val = hist.at<float>(b, 0);
int x0 = binSize.width * b;
int y0 = draw.rows - val / maxVal * binSize.height + 1;
int x1 = binSize.width * (b + 1) - 1;
int y1 = draw.rows - 1;
rectangle(draw,0, cv::(Point(x0, y0), cv::Point(x1, y1)), Scalar::all(255), CV_FILLED);
}
return draw;
}
imwrite("input-original.png", rgb); // write the input image
imwrite("hist-original.png", histogram(h)); // write the histogram of the selected channel
filter(h); // perform filtering
merge(slices, 3, hsi); // combine the separated H, S, and I layers to a big packed matrix
rgb = hsi2rgb(hsi); // convert HSI back to RGB colour space
imwrite("input-filtered.png", rgb); // write the filtered image
imwrite("hist-filtered.png", histogram(h)); // and the histogram of the filtered channel
return 0;
}
Mat rgb2hsi(const Mat& rgb)
{
Mat slicesRGB[3];
Mat slicesHSI[3];
Mat &r = slicesRGB[0], &g = slicesRGB[1], &b = slicesRGB[2];
Mat &h = slicesHSI[0], &s = slicesHSI[1], &i = slicesHSI[2];
split(rgb, slicesRGB);
//
// TODO: implement colour conversion RGB => HSI
//
// begin of conversion code
h = r * 1.0f;
s = g * 1.0f;
i = b * 1.0f;
// end of conversion code
Mat hsi;
merge(slicesHSI, 3, hsi);
return hsi;
}
Mat hsi2rgb(const Mat& hsi)
{
Mat slicesRGB[3];
Mat slicesHSI[3];
Mat &r = slicesRGB[0], &g = slicesRGB[1], &b = slicesRGB[2];
Mat &h = slicesHSI[0], &s = slicesHSI[1], &i = slicesHSI[2];
split(hsi, slicesHSI);
// begin of conversion code
r = h * 1.0f;
g = s * 1.0f;
b = i * 1.0f;
// end of conversion code
Mat rgb;
merge(slicesRGB, 3, rgb);
return rgb;
}
Mat histogram(const Mat& im)
{
Mat hist;
const float range[] = { 0, 255 };
const int channels[] = { 0 };
const int bins = range[1] - range[0];
const int dims[] = { bins, 1 };
const Size binSize(2, 240);
const float* ranges[] = { range };
// calculate the histogram
calcHist(&im, 1, channels, Mat(), hist, 1, dims, ranges);
Mat draw = Mat::zeros(binSize.height, binSize.width * bins, CV_8UC3);
double maxVal;
minMaxLoc(hist, NULL, &maxVal, 0, 0);
for (int b = 0; b < bins; b++)
{
float val = hist.at<float>(b, 0);
int x0 = binSize.width * b;
int y0 = draw.rows - val / maxVal * binSize.height + 1;
int x1 = binSize.width * (b + 1) - 1;
int y1 = draw.rows - 1;
rectangle(draw, Point(x0, y0), Point(x1, y1), Scalar::all(255), CV_FILLED);
}
return draw;
}
void filter(Mat& im)
{
int type = im.type();
// Convert pixel data from unsigned 8-bit integers (0~255)
// to 32-bit floating numbers, as required by cv::dft
if (type != CV_32F) im.convertTo(im, CV_32F);
// Perform 2-D Discrete Fourier Transform
Mat f;
dft(im, f, DFT_COMPLEX_OUTPUT + DFT_SCALE); // do DFT
// Separate the packed complex matrix to two matrices
Mat complex[2];
Mat& real = complex[0]; // the real part
Mat& imag = complex[1]; // the imaginary part
split(f, complex); // dft(im) => {real,imag}
// Frequency domain filtering
int xc = im.cols / 2; // find (xc,yc) the highest
int yc = im.rows / 2; // frequency component
for (int y = 0; y < im.rows; y++) // go through each row..
{
for (int x = 0; x < im.cols; x++) // then through each column..
{
//
// TODO: Design your formula here to decide if the component is
// discarded or kept.
//
if (false) // override this condition
{
real.at<float>(y, x) = 0;
imag.at<float>(y, x) = 0;
}
}
}
// Pack the real and imaginary parts
// back to the 2-channel matrix
merge(complex, 2, f); // {real,imag} => f
// Perform 2-D Inverse Discrete Fourier Transform
idft(f, im, DFT_REAL_OUTPUT); // do iDFT
// convert im back to it's original type
im.convertTo(im, type);
}
Error List
1 IntelliSense: expected a ';' d:\709
Tutorial\Dibya_project\Dibya_project\Dibya_project.cpp 48 2 Dibya_project
2 IntelliSense: identifier "draw" is undefined d:\709
Tutorial\Dibya_project\Dibya_project\Dibya_project.cpp 70 13 Dibya_project
3 IntelliSense: no instance of overloaded function "rectangle"
matches the argument list
argument types are: (, int, , cv::Scalar_, int) d:\709
Tutorial\Dibya_project\Dibya_project\Dibya_project.cpp 72 4 Dibya_project
4 IntelliSense: expected an identifier d:\709
Tutorial\Dibya_project\Dibya_project\Dibya_project.cpp 72 26 Dibya_project
5 IntelliSense: no instance of constructor "cv::Point_<Tp>::Point
[with _Tp=int]" matches the argument list
argument types are: (, double __cdecl (double _X)) d:\709 Tutorial\Dibya_project\Dibya_project\Dibya_project.cpp 72 27 Dibya_project
broken here (in Mat histogram(...)):
rectangle(draw,0, cv::(Point(x0, y0), cv::Point(x1, y1)), Scalar::all(255), CV_FILLED);
should be either:
rectangle(draw,0, cv::Rect(Point(x0, y0), cv::Point(x1, y1)), Scalar::all(255), CV_FILLED);
or:
rectangle(draw,0, Point(x0, y0), cv::Point(x1, y1), Scalar::all(255), CV_FILLED);
I think there is a typo in including the highgui header file.

DX11 convert pixel format BGRA to RGBA

I have currently the problem that a library creates a DX11 texture with BGRA pixel format.
But the displaying library can only display RGBA correctly. (This means the colors are swapped in the rendered image)
After looking around I found a simple for-loop to solve the problem, but the performance is not very good and scales bad with higher resolutions. I'm new to DirectX and maybe I just missed a simple function to do the converting.
// Get the image data
unsigned char* pDest = view->image->getPixels();
// Prepare source texture
ID3D11Texture2D* pTexture = static_cast<ID3D11Texture2D*>( tex );
// Get context
ID3D11DeviceContext* pContext = NULL;
dxDevice11->GetImmediateContext(&pContext);
// Copy data, fast operation
pContext->CopySubresourceRegion(texStaging, 0, 0, 0, 0, tex, 0, nullptr);
// Create mapping
D3D11_MAPPED_SUBRESOURCE mapped;
HRESULT hr = pContext->Map( texStaging, 0, D3D11_MAP_READ, 0, &mapped );
if ( FAILED( hr ) )
{
return;
}
// Calculate size
const size_t size = _width * _height * 4;
// Access pixel data
unsigned char* pSrc = static_cast<unsigned char*>( mapped.pData );
// Offsets
int offsetSrc = 0;
int offsetDst = 0;
int rowOffset = mapped.RowPitch % _width;
// Loop through it, BRGA to RGBA conversation
for (int row = 0; row < _height; ++row)
{
for (int col = 0; col < _width; ++col)
{
pDest[offsetDst] = pSrc[offsetSrc+2];
pDest[offsetDst+1] = pSrc[offsetSrc+1];
pDest[offsetDst+2] = pSrc[offsetSrc];
pDest[offsetDst+3] = pSrc[offsetSrc+3];
offsetSrc += 4;
offsetDst += 4;
}
// Adjuste offset
offsetSrc += rowOffset;
}
// Unmap texture
pContext->Unmap( texStaging, 0 );
Solution:
Texture2D txDiffuse : register(t0);
SamplerState texSampler : register(s0);
struct VSScreenQuadOutput
{
float4 Position : SV_POSITION;
float2 TexCoords0 : TEXCOORD0;
};
float4 PSMain(VSScreenQuadOutput input) : SV_Target
{
return txDiffuse.Sample(texSampler, input.TexCoords0).rgba;
}
Obviously iterating over a texture on you CPU is not the most effective way. If you know that colors in a texture are always swapped like that and you don't want to modify the texture itself in your C++ code, the most straightforward way would be to do it in the pixel shader. When you sample the texture, simply swap colors there. You won't even notice any performance drop.

Which is best simple Gaussian blur or FFT of Gaussian blur for sigma=20?

I'm making a program to blur a 16 bit grayscale image in CUDA.
In my program, if I use a Gaussian blur function with sigma = 20 or 30, it takes a lot of time, while it is fast with sigma = 2.0 or 3.0.
I've read in some web site that Guaussian blur with FFT is good for large kernel size or large sigma value:
Is It really true ?
Which algorithm should I use: simple Gaussian blur or Gaussian blur with FFT ?
My code for Guassian Blur is below. In my code , is there something wrong or not ?
enter code here
__global__
void gaussian_blur(
unsigned short* const blurredChannel, // return value: blurred channel (either red, green, or blue)
const unsigned short* const inputChannel, // red, green, or blue channel from the original image
int rows,
int cols,
const float* const filterWeight, // gaussian filter weights. The weights look like a bell shape.
int filterWidth // number of pixels in x and y directions for calculating average blurring
)
{
int r = blockIdx.y * blockDim.y + threadIdx.y; // current row
int c = blockIdx.x * blockDim.x + threadIdx.x; // current column
if ((r >= rows) || (c >= cols))
{
return;
}
int half = filterWidth / 2;
float blur = 0.f; // will contained blurred value
int width = cols - 1;
int height = rows - 1;
for (int i = -half; i <= half; ++i) // rows
{
for (int j = -half; j <= half; ++j) // columns
{
// Clamp filter to the image border
int h = min(max(r + i, 0), height);
int w = min(max(c + j, 0), width);
// Blur is a product of current pixel value and weight of that pixel.
// Remember that sum of all weights equals to 1, so we are averaging sum of all pixels by their weight.
int idx = w + cols * h; // current pixel index
float pixel = static_cast<float>(inputChannel[idx]);
idx = (i + half) * filterWidth + j + half;
float weight = filterWeight[idx];
blur += pixel * weight;
}
}
blurredChannel[c + r * cols] = static_cast<unsigned short>(blur);
}
void createFilter(float *gKernel,double sigma,int radius)
{
double r, s = 2.0 * sigma * sigma;
// sum is for normalization
double sum = 0.0;
// generate 9*9 kernel
int m=0;
for (int x = -radius; x <= radius; x++)
{
for(int y = -radius; y <= radius; y++)
{
r = std::sqrtf(x*x + y*y);
gKernel[m] = (exp(-(r*r)/s))/(3.14 * s);
sum += gKernel[m];
m++;
}
}
m=0;
// normalize the Kernel
for(int i = 0; i < (radius*2 +1); ++i)
for(int j = 0; j < (radius*2 +1); ++j)
gKernel[m++] /= sum;
}
int main()
{
cudaError_t cudaStatus;
const int size =81;
float gKernel[size];
float *dev_p=0;
cudaStatus = cudaMalloc((void**)&dev_p, size * sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
createFilter(gKernel,20.0,4);
cudaStatus = cudaMemcpy(dev_p, gKernel, size* sizeof(float), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
/* i read image Buffere in unsigned short that code is not added here ,becouse it is large , and copy image data of buffere from host to device*/
/* So, suppose i have unsigned short *d_img which contain image data */
cudaMalloc( (void**)&d_img, length* sizeof(unsigned short));
cudaMalloc( (void**)&d_blur_img, length* sizeof(unsigned short));
static const int BLOCK_WIDTH = 32;
int image_width=1580.0,image_height=1050.0;
int x = static_cast<int>(ceilf(static_cast<float>(image_width) / BLOCK_WIDTH));
int y = static_cast<int>(ceilf(static_cast<float>((image_height) ) / BLOCK_WIDTH));
const dim3 grid (x, y, 1); // number of blocks
const dim3 block(BLOCK_WIDTH, BLOCK_WIDTH, 1);
gaussian_blur<<<grid,block>>>(d_blur_img,d_img,1050.0,1580.0,dev_p,9.0);
cudaDeviceSynchronize();
/* after bluring image i will copied buffer from Device to Host and free gpu memory */
cudaFree(d_img);
cudaFree(d_blur_img);
cudaFree(dev_p);
return 0;
}
Short answer: both algorithms are good with respect to image blurring, so feel free to pick the best (fastest) one for your use case.
Kernel size and sigma value are directly correlated: the greater the sigma, the larger the kernel (and thus the more operations-per-pixel to get the final result).
If you implemented a naive convolution, then you should try a separable convolution implementation instead; it will reduce the computation time by an order of magnitude already.
Now some more insight: they implement almost the same Gaussian blurring operation. Why almost ? It's because taking the FFT of an image does implicitly periodize it. Hence, at the border of the image, the convolution kernel sees an image that has been wrapped around its edge. This is called circular convolution (because of the wrapping). On the other hand, Gaussian blur implements a simple linear convolution.

Resources