Why is chrominance lost when I copy a DXGI_FORMAT_NV12 ID3D11Texture2D from a D3D11 device to a D3D11On12 device? (directx-11)

D3D11_TEXTURE2D_DESC texture_desc = {0};
texture_desc.Width = 640;
texture_desc.Height = 480;
texture_desc.MipLevels = 1;
texture_desc.Format = DXGI_FORMAT_NV12;
texture_desc.SampleDesc.Count = 1;
texture_desc.ArraySize = 1;
texture_desc.Usage = D3D11_USAGE_DEFAULT;
texture_desc.MiscFlags = D3D11_RESOURCE_MISC_SHARED;

Microsoft::WRL::ComPtr<ID3D11Texture2D> temp_texture_for_my_device{nullptr};
my_device->CreateTexture2D(&texture_desc, NULL, &temp_texture_for_my_device);

Microsoft::WRL::ComPtr<IDXGIResource> dxgi_resource{nullptr};
temp_texture_for_my_device.As(&dxgi_resource);
HANDLE shared_handle = NULL;
dxgi_resource->GetSharedHandle(&shared_handle);
// note: no manual dxgi_resource->Release() here; the ComPtr owns the
// reference, and an explicit Release() would free it a second time

Microsoft::WRL::ComPtr<ID3D11Texture2D> temp_texture_for_ffmpeg_device{nullptr};
ffmpeg_device->OpenSharedResource(shared_handle, __uuidof(ID3D11Texture2D), (void**)temp_texture_for_ffmpeg_device.GetAddressOf());

// FFmpeg D3D11VA frames: data[0] is the ID3D11Texture2D, data[1] the array slice index
ffmpeg_device_context->CopySubresourceRegion(temp_texture_for_ffmpeg_device.Get(), 0, 0, 0, 0,
    (ID3D11Texture2D*)ffmpeg_avframe->data[0], (UINT)(intptr_t)ffmpeg_avframe->data[1], NULL);
ffmpeg_device_context->Flush();
When I copy temp_texture_for_ffmpeg_device to a D3D11_USAGE_STAGING texture, the result is normal, but when I copy temp_texture_for_my_device to a D3D11_USAGE_STAGING texture, I lose the chrominance data.
When I map the staging texture on the CPU:
temp_texture_for_ffmpeg_device: RowPitch is 768, DepthPitch is 768 * 720;
temp_texture_for_my_device: RowPitch is 1024, DepthPitch is 1024 * 480.
I think some parameter differs between the two devices (or device contexts?), but I don't know which parameter would cause such a difference.
my_device and my_device_context are created by D3D11On12CreateDevice.

The DirectX Video formats are planar, meaning that each component is contiguous in memory rather than being interleaved like most formats. For DirectX 12 this is explicitly exposed in the subresource layout, and you can query the number of planes via D3D12GetFormatPlaneCount.
Here's a template that works with both D3D12_SUBRESOURCE_DATA and D3D12_MEMCPY_DEST. Here SlicePitch is set to the size of an individual plane.
template<typename T, typename PT> void AdjustPlaneResource(
    _In_ DXGI_FORMAT fmt,
    _In_ size_t height,
    _In_ size_t slicePlane,
    _Inout_ T& res) noexcept
{
    switch (static_cast<int>(fmt))
    {
    case DXGI_FORMAT_NV12:
    case DXGI_FORMAT_P010:
    case DXGI_FORMAT_P016:
        if (!slicePlane)
        {
            // Plane 0
            res.SlicePitch = res.RowPitch * static_cast<PT>(height);
        }
        else
        {
            // Plane 1
            res.pData = const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(res.pData) + res.RowPitch * PT(height));
            res.SlicePitch = res.RowPitch * static_cast<PT>((height + 1) >> 1);
        }
        break;

    case DXGI_FORMAT_NV11:
        if (!slicePlane)
        {
            // Plane 0
            res.SlicePitch = res.RowPitch * static_cast<PT>(height);
        }
        else
        {
            // Plane 1
            res.pData = const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(res.pData) + res.RowPitch * PT(height));
            res.RowPitch = (res.RowPitch >> 1);
            res.SlicePitch = res.RowPitch * static_cast<PT>(height);
        }
        break;
    }
}
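A usage sketch (pixelData, rowPitch, and height are placeholders for values from your decoder): filling one D3D12_SUBRESOURCE_DATA per plane of an NV12 texture, e.g. before a call such as UpdateSubresources. Plane 0 is Y; plane 1 is the interleaved UV plane.

D3D12_SUBRESOURCE_DATA initData[2] = {};
for (size_t plane = 0; plane < 2; ++plane)
{
    initData[plane].pData    = pixelData; // both start at the buffer base;
    initData[plane].RowPitch = rowPitch;  // AdjustPlaneResource offsets plane 1
    AdjustPlaneResource<D3D12_SUBRESOURCE_DATA, LONG_PTR>(
        DXGI_FORMAT_NV12, height, plane, initData[plane]);
}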
For DirectX 11, the extra planar information has to be assumed, as the API doesn't expose it directly. You have to compute the extra space required for the additional plane(s). Here's a snippet from DirectXTex; in this case slice is the total size of all the planes in one 'slice' of the resource.
case DXGI_FORMAT_R8G8_B8G8_UNORM:
case DXGI_FORMAT_G8R8_G8B8_UNORM:
case DXGI_FORMAT_YUY2:
    assert(IsPacked(fmt));
    pitch = ((uint64_t(width) + 1u) >> 1) * 4u;
    slice = pitch * uint64_t(height);
    break;

case DXGI_FORMAT_Y210:
case DXGI_FORMAT_Y216:
    assert(IsPacked(fmt));
    pitch = ((uint64_t(width) + 1u) >> 1) * 8u;
    slice = pitch * uint64_t(height);
    break;

case DXGI_FORMAT_NV12:
case DXGI_FORMAT_420_OPAQUE:
    assert(IsPlanar(fmt));
    pitch = ((uint64_t(width) + 1u) >> 1) * 2u;
    slice = pitch * (uint64_t(height) + ((uint64_t(height) + 1u) >> 1));
    break;

case DXGI_FORMAT_P010:
case DXGI_FORMAT_P016:
    assert(IsPlanar(fmt));
    pitch = ((uint64_t(width) + 1u) >> 1) * 4u;
    slice = pitch * (uint64_t(height) + ((uint64_t(height) + 1u) >> 1));
    break;

case DXGI_FORMAT_NV11:
    assert(IsPlanar(fmt));
    pitch = ((uint64_t(width) + 3u) >> 2) * 4u;
    slice = pitch * uint64_t(height) * 2u;
    break;

case DXGI_FORMAT_P208:
    assert(IsPlanar(fmt));
    pitch = ((uint64_t(width) + 1u) >> 1) * 2u;
    slice = pitch * uint64_t(height) * 2u;
    break;

case DXGI_FORMAT_V208:
    assert(IsPlanar(fmt));
    pitch = uint64_t(width);
    slice = pitch * (uint64_t(height) + (((uint64_t(height) + 1u) >> 1) * 2u));
    break;

case DXGI_FORMAT_V408:
    assert(IsPlanar(fmt));
    pitch = uint64_t(width);
    slice = pitch * (uint64_t(height) + (uint64_t(height >> 1) * 4u));
    break;

Related

OCR algorithm (GOCR) to 32F429IDISCOVERY board

I'm trying to implement an OCR algorithm (the GOCR algorithm specifically) on a 32F429IDISCOVERY board and I'm still getting nothing back.
I'm recording an image from an OV7670 camera in RGB565 format to the SDRAM of the board; it is then converted to greyscale and passed to the algorithm itself.
From this and other forums I got the impression that GOCR is a very good algorithm, and it seemed to work very well on a PC, but I just can't get it to work on the board.
Does anyone have experience with implementing OCR or GOCR? I am not sure where the problem is because it behaves in a very weird way: the code stops in a different part of the algorithm almost every time.
Calling the OCR algorithm:
void ocr_algorithm(char *output_str)
{
    job_t job1, *job; /* fixme, don't want global variables for lib */
    job = OCR_JOB = &job1;
    int linecounter;
    const char *line;
    uint8_t r, g, b;
    uint32_t n, i, buffer;
    char *p_pic;
    uint32_t *image = (uint32_t *) SDRAM_START_ADR;

    setvbuf(stdout, (char *) NULL, _IONBF, 0); /* not buffered */
    job_init(job);       /* init cfg and db */
    job_init_image(job); /* single image */
    p_pic = malloc(IMG_ROWS * IMG_COLUMNS);

    // Converting RGB565 to greyscale
    i = 0;
    for (n = 0; n < IMG_ROWS * IMG_COLUMNS; n++) {
        if (n % 2 == 0) {
            buffer = image[i] & 0xFFFF;         // even pixel: low half-word
        }
        else {
            buffer = (image[i] >> 16) & 0xFFFF; // odd pixel: high half-word
            i++;
        }
        r = (uint8_t) ((buffer >> 11) & 0x1F);
        g = (uint8_t) ((buffer >> 5) & 0x3F);
        b = (uint8_t) (buffer & 0x1F);
        // RGB888
        r = ((r * 527) + 23) >> 6;
        g = ((g * 259) + 33) >> 6;
        b = ((b * 527) + 23) >> 6;
        // Greyscale
        p_pic[n] = 0.299*r + 0.587*g + 0.114*b;
    }

    // read_picture
    job->src.p.p = p_pic;
    job->src.p.x = IMG_ROWS;    // note: GOCR's x is the image width; verify IMG_ROWS/IMG_COLUMNS aren't swapped
    job->src.p.y = IMG_COLUMNS;
    job->src.p.bpp = 1;

    /* call main loop */
    pgm2asc(job);

    // print output
    strcpy(output_str, "");
    linecounter = 0;
    line = getTextLine(&(job->res.linelist), linecounter++);
    while (line) {
        strcat(output_str, line);
        strcat(output_str, "\n");
        line = getTextLine(&(job->res.linelist), linecounter++);
    }
    free_textlines(&(job->res.linelist));
    job_free_image(job);
    free(p_pic);
}
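As an aside on that conversion loop: if the F429's FPU isn't actually enabled, the per-pixel float expression pulls soft-float calls into the loop. An integer-only sketch of the same greyscale step, using the fixed-point weights 77/150/29 (over 256, approximating 0.299/0.587/0.114):

static inline uint8_t rgb565_to_grey(uint16_t px)
{
    uint32_t r = ((((px >> 11) & 0x1Fu) * 527u) + 23u) >> 6; /* 5 -> 8 bits */
    uint32_t g = ((((px >> 5)  & 0x3Fu) * 259u) + 33u) >> 6; /* 6 -> 8 bits */
    uint32_t b = (((px & 0x1Fu) * 527u) + 23u) >> 6;         /* 5 -> 8 bits */
    return (uint8_t)((77u * r + 150u * g + 29u * b) >> 8);   /* weights sum to 256 */
}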

Is this part of a real IFFT process really optimal?

When calculating an (I)FFT it is possible to compute 2*N real data points using an ordinary complex (I)FFT of N data points.
Not sure about my terminology here, but this is how I've read it described.
There are several posts about this on Stack Overflow already.
This can speed things up a bit when only dealing with such "real" data, which is often the case when dealing with, for example, sound (re-)synthesis.
This increase in speed is offset by the need for a pre-processing step that somehow... uhh... fidaddles? the data to achieve this. Look, I'm not even going to try to convince anyone I fully understand this, but thanks to the previously mentioned threads I came up with the following routine, which does the job nicely (thank you!).
However, on my microcontroller this costs a bit more than I'd like, even though the trigonometric functions are already optimized with LUTs.
But the routine itself just looks like it should be possible to optimize mathematically to minimize processing. To me it seems similar to a plain 2D rotation; I just can't quite wrap my head around it, but it feels like this could be done with fewer trigonometric calls and fewer arithmetic operations.
I was hoping someone else might easily see what I don't and provide some insight into how this math may be simplified.
This particular routine is for use with an IFFT, before the bit-reversal stage.
pseudo-version:
INPUT
MAG_A/B = 0 TO 1
PHA_A/B = 0 TO 2PI
INDEX = 0 TO PI/2
r = MAG_A * sin(PHA_A)
i = MAG_B * sin(PHA_B)
rsum = r + i
rdif = r - i
r = MAG_A * cos(PHA_A)
i = MAG_B * cos(PHA_B)
isum = r + i
idif = r - i
r = -cos(INDEX)
i = -sin(INDEX)
rtmp = r * isum + i * rdif
itmp = i * isum - r * rdif
OUTPUT rsum + rtmp
OUTPUT itmp + idif
OUTPUT rsum - rtmp
OUTPUT itmp - idif
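For reference, the pseudo-version collapses into a compact complex form. Write $A = M_A e^{j\phi_A}$ and $B = M_B e^{j\phi_B}$ for the two input bins and $W = -e^{j\theta}$ for the twiddle built from INDEX (this is my own derivation from the pseudo-version, so please double-check it):

$$t = W\,(\bar{A} + B), \qquad S = j\,(\bar{A} - B)$$
$$\mathrm{out}_n = S + t, \qquad \mathrm{out}_z = \overline{S - t}$$

So beyond the two polar-to-rectangular conversions (the four sin/cos calls, which seem unavoidable with magnitude/phase inputs), the whole butterfly is a single complex multiply plus a few additions, which is essentially what the routine already performs; there may be little fat left to trim.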
original working code, if that's your poison:
void fft_nz_set(fft_complex_t complex[], unsigned bits, unsigned index, int32_t mag_lo, int32_t pha_lo, int32_t mag_hi, int32_t pha_hi)
{
    unsigned size = 1 << bits;
    unsigned shift = SINE_TABLE_BITS - (bits - 1);
    unsigned n = index;        // index for mag_lo, pha_lo
    unsigned z = size - index; // index for mag_hi, pha_hi
    int32_t rsum, rdif, isum, idif, r, i;

    r = smmulr(mag_lo, sine(pha_lo));   // mag_lo * sin(pha_lo)
    i = smmulr(mag_hi, sine(pha_hi));   // mag_hi * sin(pha_hi)
    rsum = r + i; rdif = r - i;
    r = smmulr(mag_lo, cosine(pha_lo)); // mag_lo * cos(pha_lo)
    i = smmulr(mag_hi, cosine(pha_hi)); // mag_hi * cos(pha_hi)
    isum = r + i; idif = r - i;

    r = -sinetable[(1 << SINE_BITS) - (index << shift)]; // cos(pi_c * (index / size) / 2)
    i = -sinetable[index << shift];                      // sin(pi_c * (index / size) / 2)

    int32_t rtmp = smmlar(r, isum, smmulr(i, rdif)) << 1; // r * isum + i * rdif
    int32_t itmp = smmlsr(i, isum, smmulr(r, rdif)) << 1; // i * isum - r * rdif

    complex[n].r = rsum + rtmp;
    complex[n].i = itmp + idif;
    complex[z].r = rsum - rtmp;
    complex[z].i = itmp - idif;
}

// For reference, this would be used as follows to generate a sawtooth (after IFFT)
void synth_sawtooth(fft_complex_t *complex, unsigned fft_bits)
{
    unsigned fft_size = 1 << fft_bits;
    fft_sym_dc(complex, 0, 0); // sets dc bin [0]
    for (unsigned n = 1, z = fft_size - 1; n <= fft_size >> 1; n++, z--) {
        // calculation of amplitude/index (sawtooth) for both n and z
        fft_sym_magnitude(complex, fft_bits, n, 0x4000000 / n, 0x4000000 / z);
    }
}

running the publicly available PCL RANSAC code & saving the result not working

I am trying to use the publicly available RANSAC code for PCL from here: http://pointclouds.org/documentation/tutorials/random_sample_consensus.php
However, I am omitting the 3D viewer portion. The problem I am facing is that I can't save the result, and when I check the final point cloud's size it shows zero. Here is the code:
#include <iostream>

#include <pcl/console/parse.h>
#include <pcl/io/pcd_io.h>
#include <pcl/sample_consensus/ransac.h>
#include <pcl/sample_consensus/sac_model_plane.h>
#include <pcl/sample_consensus/sac_model_sphere.h>
#include <pcl/visualization/pcl_visualizer.h>
#include <boost/thread/thread.hpp>

int main(int argc, char** argv)
{
    // initialize PointClouds
    pcl::PointCloud<pcl::PointXYZ>::Ptr cloud (new pcl::PointCloud<pcl::PointXYZ>);
    pcl::PointCloud<pcl::PointXYZ>::Ptr final (new pcl::PointCloud<pcl::PointXYZ>);

    // populate our PointCloud with points
    cloud->width = 500;
    cloud->height = 1;
    cloud->is_dense = false;
    cloud->points.resize (cloud->width * cloud->height);
    for (size_t i = 0; i < cloud->points.size (); ++i)
    {
        if (pcl::console::find_argument (argc, argv, "-s") >= 0 || pcl::console::find_argument (argc, argv, "-sf") >= 0)
        {
            cloud->points[i].x = 1024 * rand () / (RAND_MAX + 1.0);
            cloud->points[i].y = 1024 * rand () / (RAND_MAX + 1.0);
            if (i % 5 == 0)
                cloud->points[i].z = 1024 * rand () / (RAND_MAX + 1.0);
            else if (i % 2 == 0)
                cloud->points[i].z = sqrt( 1 - (cloud->points[i].x * cloud->points[i].x)
                                             - (cloud->points[i].y * cloud->points[i].y));
            else
                cloud->points[i].z = - sqrt( 1 - (cloud->points[i].x * cloud->points[i].x)
                                               - (cloud->points[i].y * cloud->points[i].y));
        }
        else
        {
            cloud->points[i].x = 1024 * rand () / (RAND_MAX + 1.0);
            cloud->points[i].y = 1024 * rand () / (RAND_MAX + 1.0);
            if (i % 2 == 0)
                cloud->points[i].z = 1024 * rand () / (RAND_MAX + 1.0);
            else
                cloud->points[i].z = -1 * (cloud->points[i].x + cloud->points[i].y);
        }
    }

    std::vector<int> inliers;

    // created RandomSampleConsensus object and compute the appropriated model
    pcl::SampleConsensusModelSphere<pcl::PointXYZ>::Ptr
        model_s (new pcl::SampleConsensusModelSphere<pcl::PointXYZ> (cloud));
    pcl::SampleConsensusModelPlane<pcl::PointXYZ>::Ptr
        model_p (new pcl::SampleConsensusModelPlane<pcl::PointXYZ> (cloud));
    if (pcl::console::find_argument (argc, argv, "-f") >= 0)
    {
        pcl::RandomSampleConsensus<pcl::PointXYZ> ransac (model_p);
        ransac.setDistanceThreshold (.01);
        ransac.computeModel();
        ransac.getInliers(inliers);
    }
    else if (pcl::console::find_argument (argc, argv, "-sf") >= 0)
    {
        pcl::RandomSampleConsensus<pcl::PointXYZ> ransac (model_s);
        ransac.setDistanceThreshold (.01);
        ransac.computeModel();
        ransac.getInliers(inliers);
    }

    // copies all inliers of the model computed to another PointCloud
    pcl::copyPointCloud<pcl::PointXYZ>(*cloud, inliers, *final);
    std::cout << final->size() << std::endl; // show the size

    pcl::PointCloud<pcl::PointXYZ> final_result = *final;
    pcl::io::savePCDFile ("final_result.pcd", final_result); // save
    return 0;
}
Any idea why this is not working?

You have to pass -f (plane) or -sf (sphere) as an argument when running the compiled program. Without one of them, neither RANSAC branch executes, inliers stays empty, and copyPointCloud therefore produces an empty final cloud, which is why the printed size is zero. (-s by itself only changes how the input cloud is generated; it does not fit a model.)
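For example (the executable name follows the tutorial's build and is illustrative):

./random_sample_consensus -f    # fit a plane model and save its inliers
./random_sample_consensus -sf   # generate sphere-ish data and fit a sphere model
./random_sample_consensus       # no model fitted: prints 0, nothing useful to save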

How can I optimize this OpenCL Sobel Filter Kernel?

I wrote an OpenCL kernel for a 3x3 Sobel filter, and currently it runs in about 17 ms on a 2k x 2k image. This isn't as fast as I had hoped; does anyone have suggestions for how to improve the speed? I've followed most of the suggestions on the checklist for optimizing kernels. My processor is an Intel i5-3450. The workgroup size is 8x8, and the number of work-items is height x width / 16, which is 2048 x 128 on the images I'm running on.
__kernel void localCacheSobelFilter(
    const __global char16* src,
    __write_only __global float16* angle,
    __write_only __global float16* mag,
    const int width,
    const int height)
{
    // Cache the data we're looking at in __local space
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int cacheRow = get_local_id(0) + 1;
    const int cacheCol = get_local_id(1) + 1;
    __local char16 cache[BLOCK_SIZE + 2][BLOCK_SIZE + 2];
    cache[cacheRow][cacheCol] = src[ indexOf(row, col) ];

    // --- Deal with the boundary conditions
    // This adds in the rows above and below the local block,
    // ignoring the corners.
    const bool atTopRow = (cacheRow == 1);
    const bool atBottomRow = (cacheRow == BLOCK_SIZE);
    if (atTopRow) {
        cache[0][cacheCol] = src[ indexOf(row - 1, col) ];
    } else if (atBottomRow) {
        cache[BLOCK_SIZE + 1][cacheCol] = src[ indexOf(row + 1, col) ];
    }

    // This adds in the columns to the left and right of the local block,
    // ignoring the corners.
    const bool atLeftCol = (cacheCol == 1);
    const bool atRightCol = (cacheCol == BLOCK_SIZE);
    if (atLeftCol) {
        cache[cacheRow][0].sf = src[ indexOf(row, col - 1) ].sf;
    } else if (atRightCol) {
        cache[cacheRow][BLOCK_SIZE + 1].s0 = src[ indexOf(row, col + 1) ].s0;
    }

    // Now finally check the corners
    const bool atTLCorner = atTopRow && atLeftCol;
    const bool atTRCorner = atTopRow && atRightCol;
    const bool atBLCorner = atBottomRow && atLeftCol;
    const bool atBRCorner = atBottomRow && atRightCol;
    if (atTLCorner) {
        cache[0][0].sf = src[ indexOf(row - 1, col - 1) ].sf;
    } else if (atTRCorner) {
        cache[0][BLOCK_SIZE + 1].s0 = src[ indexOf(row - 1, col + 1) ].s0;
    } else if (atBLCorner) {
        cache[BLOCK_SIZE + 1][0].sf = src[ indexOf(row + 1, col - 1) ].sf;
    } else if (atBRCorner) {
        cache[BLOCK_SIZE + 1][BLOCK_SIZE + 1].s0 = src[ indexOf(row + 1, col + 1) ].s0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    //===========================================================================
    // Do the calculation
    //     [..., pix00] upperRow  [pix02, ...]
    //     [..., pix10] centerRow [pix12, ...]
    //     [..., pix20] lowerRow  [pix22, ...]
    const char pix00 = cache[cacheRow - 1][cacheCol - 1].sf;
    const char pix10 = cache[cacheRow    ][cacheCol - 1].sf;
    const char pix20 = cache[cacheRow + 1][cacheCol - 1].sf;
    const char16 upperRow  = cache[cacheRow - 1][cacheCol];
    const char16 centerRow = cache[cacheRow    ][cacheCol];
    const char16 lowerRow  = cache[cacheRow + 1][cacheCol];
    const char pix02 = cache[cacheRow - 1][cacheCol + 1].s0;
    const char pix12 = cache[cacheRow    ][cacheCol + 1].s0;
    const char pix22 = cache[cacheRow + 1][cacheCol + 1].s0;

    // Do the calculations for Gy
    const char16 upperRowShiftLeft  = (char16)(upperRow.s123456789abcdef, pix02);
    const char16 upperRowShiftRight = (char16)(pix00, upperRow.s0123456789abcde);
    const char16 lowerRowShiftLeft  = (char16)(lowerRow.s123456789abcdef, pix22);
    const char16 lowerRowShiftRight = (char16)(pix20, lowerRow.s0123456789abcde);
    const float16 Gy = convert_float16(
          (upperRowShiftLeft + 2 * upperRow + upperRowShiftRight)
        - (lowerRowShiftLeft + 2 * lowerRow + lowerRowShiftRight));

    // Do the calculations for Gx
    const char16 centerRowShiftLeft  = (char16)(centerRow.s123456789abcdef, pix12);
    const char16 centerRowShiftRight = (char16)(pix10, centerRow.s0123456789abcde);
    const float16 Gx = convert_float16(
          (upperRowShiftRight + 2 * centerRowShiftRight + lowerRowShiftRight)
        - (upperRowShiftLeft + 2 * centerRowShiftLeft + lowerRowShiftLeft));

    // Find the angle and magnitude
    angle[ indexOf(row, col) ] = (float16)(0.0f); // atan2(Gy, Gx);
    mag[ indexOf(row, col) ]   = ALPHA * max(Gx, Gy) + BETA * min(Gx, Gy);
}
Any help would be greatly appreciated. Thanks!
In your kernel you have a lot of "if" branches to avoid edge effects, but they are time-consuming and useless 99% of the time.
You can instead shrink the global work size in the NDRangeKernel call and use a global work offset so the edge work-items are never launched.
Ex:
Offset = {1, 1}
GlobalWorkSize = {width - 2, height - 2}
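A host-side sketch of that suggestion (queue, kernel, width, and height are assumed to already exist; note this processes only the interior, so the 1-pixel border needs separate handling):

size_t offset[2] = { 1, 1 };
size_t global[2] = { (size_t)width - 2, (size_t)height - 2 }; // must be divisible by the local size
size_t local[2]  = { 8, 8 };
cl_int err = clEnqueueNDRangeKernel(queue, kernel, 2, offset,
                                    global, local, 0, NULL, NULL);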

Monochrome Bitmap SetPixel/GetPixel problems... Win32 C Code

This is some of my bitmask code (monochrome bitmaps). There is no problem with the BitMask_Create() function; I have tested it by opening, loading and saving Windows monochrome bitmaps, and it works great. However, the GetPixel and SetPixel functions I've made don't seem to work right. In some instances they seem to work fine, depending on the bitmap dimensions.
If anyone could help, I would appreciate it. It's driving me insane.
Thanks.
typedef struct _GL_BITMASK GL_BITMASK;
struct _GL_BITMASK {
    int   nWidth;  // Width in pixels
    int   nHeight; // Height in pixels
    int   nPitch;  // Width of scanline in bytes (may have extra padding to align to DWORD)
    BYTE *pData;   // Pointer to the first byte of the first scanline (top down)
};

int BitMask_GetPixel(GL_BITMASK *pBitMask, int x, int y)
{
    INT nElement = ((y * pBitMask->nPitch) + (x / 8));
    PBYTE pElement = pBitMask->pData + nElement;
    BYTE bMask = 1 << (7 - (x % 8));
    return *pElement & bMask;
}

void BitMask_SetPixel(GL_BITMASK *pBitMask, int x, int y, int nPixelColor)
{
    INT nElement = x / 8;
    INT nScanLineOffset = y * pBitMask->nPitch;
    PBYTE pElement = pBitMask->pData + nScanLineOffset + nElement;
    BYTE bMask = 1 << (7 - (x % 8));
    if(*pElement & bMask)
    {
        if(!nPixelColor) return;
        else *pElement ^= bMask;
    }
    else
    {
        if(nPixelColor) return;
        else *pElement |= bMask;
    }
}

GL_BITMASK *BitMask_Create(INT nWidth, INT nHeight)
{
    GL_BITMASK *pBitMask;
    int nPitch;
    nPitch = ((nWidth / 8) + 3) & ~3;
    pBitMask = (GL_BITMASK *)GlobalAlloc(GMEM_FIXED, (nPitch * nHeight) + sizeof(GL_BITMASK));
    if(!pBitMask)
        return (GL_BITMASK *)NULL;
    pBitMask->nPitch = nPitch;
    pBitMask->nWidth = nWidth;
    pBitMask->nHeight = nHeight;
    pBitMask->pData = (PBYTE)pBitMask + sizeof(GL_BITMASK);
    return pBitMask;
}
I think your formula for calculating pitch is just a little bit off. It works when the width is a multiple of 8, but not always otherwise: for nWidth = 33, ((33 / 8) + 3) & ~3 yields 4 bytes, but 33 pixels need ceil(33 / 8) = 5 bytes, which should pad to 8. Try:
nPitch = ((nWidth + 31) / 8) & ~3;
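A quick standalone check of the two formulas (the old one rounds floor(nWidth / 8) up to a multiple of 4, so it under-allocates whenever truncating the division crosses a 4-byte boundary):

#include <stdio.h>

int main(void)
{
    for (int w = 1; w <= 128; ++w) {
        int oldPitch = ((w / 8) + 3) & ~3;       /* original formula */
        int newPitch = ((w + 31) / 8) & ~3;      /* fixed formula */
        int needed   = (((w + 7) / 8) + 3) & ~3; /* ceil(w/8) bytes, DWORD-aligned */
        if (oldPitch != needed)
            printf("w=%3d: old=%d new=%d needed=%d\n", w, oldPitch, newPitch, needed);
    }
    return 0;
}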
I figured it out: I had the two tests for nPixelColor backwards in SetPixel().

if(*pElement & bMask)
{
    if(nPixelColor) return; // this was !nPixelColor
    else *pElement ^= bMask;
}
else
{
    if(!nPixelColor) return; // this was nPixelColor
    else *pElement |= bMask;
}
