I'm using the Core Audio low-level API for audio capture. The app targets macOS (OS X), not iOS.
While testing, we occasionally get a very annoying noise modulated with the real audio. The phenomenon develops over time: it starts barely noticeable and becomes more and more dominant.
Analyzing the captured audio in Audacity indicates that the end part of each audio packet is wrong.
Here is a sample picture:
The intrusion repeats every 40 ms, which is the configured packetization time (in terms of buffer samples).
Update:
Over time the gap becomes larger. Here is another snapshot from the same captured file, 10 minutes later. The gap now contains 1460 samples, which is 33 ms out of the packet's total 40 ms!
CODE SNIPPETS:
Capture callback:
OSStatus MacOS_AudioDevice::captureCallback(void *inRefCon,
AudioUnitRenderActionFlags *ioActionFlags,
const AudioTimeStamp *inTimeStamp,
UInt32 inBusNumber,
UInt32 inNumberFrames,
AudioBufferList *ioData)
{
MacOS_AudioDevice* _this = static_cast<MacOS_AudioDevice*>(inRefCon);
// Get the new audio data
OSStatus err = AudioUnitRender(_this->m_AUHAL, ioActionFlags, inTimeStamp, inBusNumber, inNumberFrames, _this->m_InputBuffer);
if (err != noErr)
{
...
return err;
}
// ignore callback on unexpected buffer size
if (_this->m_params.bufferSizeSamples != inNumberFrames)
{
...
return noErr;
}
// Deliver audio data
DeviceIOMessage message;
message.bufferSizeBytes = _this->m_deviceBufferSizeBytes;
message.buffer = _this->m_InputBuffer->mBuffers[0].mData;
if (_this->m_callbackFunc)
{
_this->m_callbackFunc(_this, message);
}
return noErr; // the render callback is declared OSStatus, so the success path must return a value
}
Open and start capture device:
void MacOS_AudioDevice::openAUHALCapture()
{
UInt32 enableIO;
AudioStreamBasicDescription streamFormat;
UInt32 size;
SInt32 *channelArr;
std::stringstream ss;
AudioObjectPropertyAddress deviceBufSizeProperty =
{
kAudioDevicePropertyBufferFrameSize,
kAudioDevicePropertyScopeInput,
kAudioObjectPropertyElementMaster
};
// AUHAL
AudioComponentDescription cd = {kAudioUnitType_Output, kAudioUnitSubType_HALOutput, kAudioUnitManufacturer_Apple, 0, 0};
AudioComponent HALOutput = AudioComponentFindNext(NULL, &cd);
verify_macosapi(AudioComponentInstanceNew(HALOutput, &m_AUHAL));
verify_macosapi(AudioUnitInitialize(m_AUHAL));
// enable input IO
enableIO = 1;
verify_macosapi(AudioUnitSetProperty(m_AUHAL, kAudioOutputUnitProperty_EnableIO, kAudioUnitScope_Input, 1, &enableIO, sizeof(enableIO)));
// disable output IO
enableIO = 0;
verify_macosapi(AudioUnitSetProperty(m_AUHAL, kAudioOutputUnitProperty_EnableIO, kAudioUnitScope_Output, 0, &enableIO, sizeof(enableIO)));
// Setup current device
size = sizeof(AudioDeviceID);
verify_macosapi(AudioUnitSetProperty(m_AUHAL, kAudioOutputUnitProperty_CurrentDevice, kAudioUnitScope_Global, 0, &m_MacDeviceID, sizeof(AudioDeviceID)));
// Set device native buffer length before setting AUHAL stream
size = sizeof(m_originalDeviceBufferTimeFrames);
verify_macosapi(AudioObjectSetPropertyData(m_MacDeviceID, &deviceBufSizeProperty, 0, NULL, size, &m_originalDeviceBufferTimeFrames));
// Get device format
size = sizeof(AudioStreamBasicDescription);
verify_macosapi(AudioUnitGetProperty(m_AUHAL, kAudioUnitProperty_StreamFormat, kAudioUnitScope_Input, 1, &streamFormat, &size));
// Setup channel map
assert(m_params.numOfChannels <= streamFormat.mChannelsPerFrame);
channelArr = new SInt32[streamFormat.mChannelsPerFrame];
for (int i = 0; i < streamFormat.mChannelsPerFrame; i++)
channelArr[i] = -1;
for (int i = 0; i < m_params.numOfChannels; i++)
channelArr[i] = i;
verify_macosapi(AudioUnitSetProperty(m_AUHAL, kAudioOutputUnitProperty_ChannelMap, kAudioUnitScope_Input, 1, channelArr, sizeof(SInt32) * streamFormat.mChannelsPerFrame));
delete [] channelArr;
// Setup stream converters
streamFormat.mFormatID = kAudioFormatLinearPCM;
streamFormat.mFormatFlags = kAudioFormatFlagIsSignedInteger;
streamFormat.mFramesPerPacket = m_SamplesPerPacket;
streamFormat.mBitsPerChannel = m_params.sampleDepthBits;
streamFormat.mSampleRate = m_deviceSampleRate;
streamFormat.mChannelsPerFrame = 1;
streamFormat.mBytesPerFrame = 2;
streamFormat.mBytesPerPacket = streamFormat.mFramesPerPacket * streamFormat.mBytesPerFrame;
verify_macosapi(AudioUnitSetProperty(m_AUHAL, kAudioUnitProperty_StreamFormat, kAudioUnitScope_Output, 1, &streamFormat, size));
// Setup callbacks
AURenderCallbackStruct input;
input.inputProc = captureCallback;
input.inputProcRefCon = this;
verify_macosapi(AudioUnitSetProperty(m_AUHAL, kAudioOutputUnitProperty_SetInputCallback, kAudioUnitScope_Global, 0, &input, sizeof(input)));
// Calculate the size of the IO buffer (in samples)
if (m_params.bufferSizeMS != -1)
{
unsigned int desiredSignalsInBuffer = (m_params.bufferSizeMS / (double)1000) * m_deviceSampleRate;
// making sure the value stays in the device's supported range
desiredSignalsInBuffer = std::min<unsigned int>(desiredSignalsInBuffer, m_deviceBufferFramesRange.mMaximum);
desiredSignalsInBuffer = std::max<unsigned int>(m_deviceBufferFramesRange.mMinimum, desiredSignalsInBuffer);
m_deviceBufferFrames = desiredSignalsInBuffer;
}
// Set device buffer length
size = sizeof(m_deviceBufferFrames);
verify_macosapi(AudioObjectSetPropertyData(m_MacDeviceID, &deviceBufSizeProperty, 0, NULL, size, &m_deviceBufferFrames));
m_deviceBufferSizeBytes = m_deviceBufferFrames * streamFormat.mBytesPerFrame;
m_deviceBufferTimeMS = 1000 * m_deviceBufferFrames/m_deviceSampleRate;
// Calculate number of buffers from channels
size = offsetof(AudioBufferList, mBuffers[0]) + (sizeof(AudioBuffer) * m_params.numOfChannels);
// Allocate input buffer
m_InputBuffer = (AudioBufferList *)malloc(size);
m_InputBuffer->mNumberBuffers = m_params.numOfChannels;
// Pre-malloc buffers for AudioBufferLists
for(UInt32 i = 0; i< m_InputBuffer->mNumberBuffers ; i++)
{
m_InputBuffer->mBuffers[i].mNumberChannels = 1;
m_InputBuffer->mBuffers[i].mDataByteSize = m_deviceBufferSizeBytes;
m_InputBuffer->mBuffers[i].mData = malloc(m_deviceBufferSizeBytes);
}
// Update class properties
m_params.sampleRateHz = streamFormat.mSampleRate;
m_params.bufferSizeSamples = m_deviceBufferFrames;
m_params.bufferSizeBytes = m_params.bufferSizeSamples * streamFormat.mBytesPerFrame;
}
eADMReturnCode MacOS_AudioDevice::start()
{
eADMReturnCode ret = OK;
LOGAPI(ret);
if (!m_isStarted && m_isOpen)
{
OSStatus err = AudioOutputUnitStart(m_AUHAL);
if (err == noErr)
m_isStarted = true;
else
ret = ERROR;
}
return ret;
}
Any idea what causes this and how to solve it?
Thanks in advance!
Periodic glitches or dropouts can be caused by not paying attention to, or by not fully processing, the number of frames sent to each audio callback. Valid buffers don't always contain the expected or the same number of samples (inNumberFrames might not equal bufferSizeSamples or the previous inNumberFrames in a perfectly valid audio buffer).
It is possible that these types of glitches are caused by attempting to record at 44.1 kHz on some models of iOS devices that only support 48 kHz audio in hardware.
Some types of glitches might also be caused by any non-hard-real-time code inside your m_callbackFunc function (such as synchronous file reads/writes, OS calls, Objective-C message dispatch, GC, or memory allocation/deallocation).
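For example, a minimal sketch of a callback that honors the actual inNumberFrames and defers all non-real-time work (the m_ringBuffer and m_bytesPerFrame members are assumptions of this sketch, standing in for any lock-free FIFO and frame-size bookkeeping):
OSStatus MacOS_AudioDevice::captureCallback(void *inRefCon,
                                            AudioUnitRenderActionFlags *ioActionFlags,
                                            const AudioTimeStamp *inTimeStamp,
                                            UInt32 inBusNumber,
                                            UInt32 inNumberFrames,
                                            AudioBufferList *ioData)
{
    MacOS_AudioDevice *_this = static_cast<MacOS_AudioDevice *>(inRefCon);
    OSStatus err = AudioUnitRender(_this->m_AUHAL, ioActionFlags, inTimeStamp,
                                   inBusNumber, inNumberFrames, _this->m_InputBuffer);
    if (err != noErr)
        return err;
    // Consume however many frames the HAL actually delivered; never assume the configured size.
    UInt32 bytesThisCallback = inNumberFrames * _this->m_bytesPerFrame;
    // Push into a lock-free ring buffer: no locks, allocation, or file I/O on this thread.
    _this->m_ringBuffer.write(_this->m_InputBuffer->mBuffers[0].mData, bytesThisCallback);
    return noErr;
}
A separate, non-real-time thread would then drain m_ringBuffer into fixed 40 ms packets for m_callbackFunc.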
Related
I measured the true time from when I send the first bit to a serial port to when I receive the last bit it pings back at 6 ms, but ReadFile takes around 70-80 ms. I'm wondering whether this is expected: is this just Windows, or is my code at fault? Here's the function that sends to and reads from the serial port; in my main I have declared and initialized the HANDLE and called this function.
int sendBytes(char* command, char* COM, HANDLE hSerial, int read) {
BOOL Write_Status;
DCB dcbSerialParams = { 0 }; // Initializing DCB structure
dcbSerialParams.DCBlength = sizeof(dcbSerialParams);
Write_Status = GetCommState(hSerial, &dcbSerialParams); //retreives the current settings
if (Write_Status == FALSE) {
printf("\n Error! in GetCommState()");
CloseHandle(hSerial);
return 1;
}
dcbSerialParams.BaudRate = CBR_57600;
dcbSerialParams.ByteSize = 8;
dcbSerialParams.StopBits = ONESTOPBIT;
dcbSerialParams.Parity = NOPARITY;
Write_Status = SetCommState(hSerial, &dcbSerialParams); //Configuring the port according to settings in DCB
if (Write_Status == FALSE)
{
CloseHandle(hSerial);
return 1;
}
///*----------------------------- Writing a Character to Serial Port----------------------------------------*/
int length = strlen(command);
char send[20];
strcpy(send, command);
send[length] = 13; // CR: append right after the command (position 'length' held strcpy's '\0')
send[length + 1] = 10; // LF: dNoOFBytestoWrite = length + 2 below covers both bytes
DWORD dNoOFBytestoWrite; // No of bytes to write into the port
DWORD dNoOfBytesWritten = 0; // No of bytes written to the port
dNoOFBytestoWrite = length + 2; // Calculating the no of bytes to write into the port
if (!WriteFile(hSerial, send, dNoOFBytestoWrite, &dNoOfBytesWritten, NULL))
printf("Error writing text to %s\n", COM);
if (read) {
int maxChars = 100;
BOOL Read_Status; // Status of the various operations
DWORD dwEventMask; // Event mask to trigger
char SerialBuffer[100]; // Buffer Containing Rxed Data
DWORD NoBytesRead; // Bytes read by ReadFile()
///*------------------------------------ Setting Receive Mask ----------------------------------------------*/
Read_Status = SetCommMask(hSerial, EV_RXCHAR); //Configure Windows to Monitor the serial device for Character Reception
if (Read_Status == FALSE)
printf("\n\n Error! in Setting CommMask");
// else
// printf("\n\n Setting CommMask successfull");
///*------------------------------------ Setting WaitComm() Event ----------------------------------------*/
// printf("\n\n Waiting for Data Reception");
Read_Status = WaitCommEvent(hSerial, &dwEventMask, NULL); //Wait for the character to be received
// /*-------------------------- Program will Wait here till a Character is received ------------------------*/
if (Read_Status == FALSE)
{
printf("\n Error! in Setting WaitCommEvent()");
}
else //If WaitCommEvent()==True Read the RXed data using ReadFile();
{
// printf("\n\n Characters Received \t");
clock_t begin = clock();
if (!ReadFile(hSerial, SerialBuffer, 24, &NoBytesRead, NULL))
{
printf("wrong character");
return 1;
}
clock_t end = clock();
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("time : %f\n",time_spent);
}
}
return 0; // sendBytes is declared int, so the success path must return a value
}
This is not how you measure timing with sub-second precision:
clock_t begin = clock();
// stuff
clock_t end = clock();
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
This is how you measure timing:
LARGE_INTEGER before, after, frequency;
QueryPerformanceCounter(&before);
// stuff
QueryPerformanceCounter(&after);
QueryPerformanceFrequency(&frequency);
double time_spent = (after.QuadPart - before.QuadPart) / (double)frequency.QuadPart;
CLOCKS_PER_SEC is imprecise, and clock() can be even worse, often as coarse as the scheduler quantum, which is typically 10 ms or 15 ms.
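Note also that the question's clock() only brackets the ReadFile call; to compare against the 6 ms wire time you want the whole write-wait-read round trip. A sketch reusing the question's variables:
LARGE_INTEGER before, after, frequency;
QueryPerformanceFrequency(&frequency); // constant after boot, so one query is enough
QueryPerformanceCounter(&before);
WriteFile(hSerial, send, dNoOFBytestoWrite, &dNoOfBytesWritten, NULL);
// ... WaitCommEvent() / ReadFile() exactly as in the question ...
QueryPerformanceCounter(&after);
printf("round trip: %f ms\n",
       1000.0 * (after.QuadPart - before.QuadPart) / (double)frequency.QuadPart);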
I have an application that renders a cube and does some other stuff. Unfortunately, when I want to move some vertices with the mouse, I hit an error: when I map the vertex buffer, the vertices recorded in another struct array are empty. To test my application, I put a Map right after creating the vertex buffer to see whether the debugger would show me the real numbers. That fails too: everything is populated with a 0.00 value (positions, normals, tangents...). What is the problem?
Here you can find the code:
D3D11_BUFFER_DESC bd;
ZeroMemory( &bd, sizeof(bd) );
bd.Usage = D3D11_USAGE_DYNAMIC; //D3D11_USAGE_DEFAULT; //D3D11_USAGE_DYNAMIC;// D3D11_USAGE_DEFAULT;
bd.ByteWidth = CBSize( sizeof( VERTEX_PNTTB)* NumVertices);
bd.BindFlags = D3D11_BIND_VERTEX_BUFFER;
bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; // 0; // D3D11_CPU_ACCESS_READ; //D3D11_CPU_ACCESS_WRITE; // 0;
bd.StructureByteStride = 0;
bd.MiscFlags = 0;
//bd.StructureByteStride
//bd.MiscFlags = 0;
D3D11_SUBRESOURCE_DATA InitData;
ZeroMemory( &InitData, sizeof(InitData) );
InitData.pSysMem = (void*)vertices; //(void*)(mesh->GetVertexes().data()); //vertices; //(float*)vertices; // (UINT*)vertices;
InitData.SysMemPitch=0;
ID3D11Buffer* vbuff = NULL;
hr = device->CreateBuffer(&bd, &InitData, &vbuff); // &(m->vertexBuffer)); //&m_pVertexBuffer );
//if( FAILED( hr ) )
// return hr;
//}
m->vertexBuffer = vbuff;
D3D11_MAPPED_SUBRESOURCE mappedResource;
ID3D11Buffer* &buff = vbuff;
//g_pImmediateContext->CopyResource(buff, mElements[ 0 ]->getElement().model ->Meshes()[selectedMeshIndex]->VertexBuffer());
hr = g_pImmediateContext->Map( buff, 0, D3D11_MAP_WRITE_DISCARD ,0, &mappedResource); // D3D11_MAP_WRITE_DISCARD, 0, &mappedResource);
if (SUCCEEDED(hr))
{
VERTEX_PNTTB *vertices = (VERTEX_PNTTB *)mappedResource.pData;
// Fill vertex buffer
//vertices[0].position = vertices[0]; // XMFLOAT3(toX0,toY0,0); //-1;//toX0; //-1;//vf1.x; // x0;
/*for(UINT i=0; i<faces_indices.size(); i++)
{
vertices[ faces_indices[i] ].position.x = vertices[faces_indices[i] ].position.x + DirectX::XMFLOAT3(20*dx,20*dy,0).x;
vertices[ faces_indices[i] ].position.y = vertices[faces_indices[i] ].position.y + DirectX::XMFLOAT3(20*dx,20*dy,0).y;
vertices[ faces_indices[i] ].position.z = vertices[faces_indices[i] ].position.z + DirectX::XMFLOAT3(20*dx,20*dy,0).z;
}*/
//g_pImmediateContext->Unmap( mElements[ 0 ]->getElement().model ->Meshes()[selectedMeshIndex]->VertexBuffer(), 0);
g_pImmediateContext->Unmap( buff, 0);
}
Generally you don't want to create vertex or index buffers in CPU-readable memory, as that has a major negative performance impact. You'll find it's much better to have a static VB/IB plus another copy of the data in standard RAM for the CPU to modify.
That said, if you need the CPU to read the data back, note that D3D11_USAGE_DYNAMIC only permits D3D11_CPU_ACCESS_WRITE; for readback you create a second buffer with D3D11_USAGE_STAGING and D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE, copy into it with CopyResource, and Map it with D3D11_MAP_READ_WRITE.
You definitely should not use D3D11_MAP_WRITE_DISCARD if you expect to see the old contents: it hands you a fresh piece of memory with undefined data (hence the zeros you observe), and whatever you write before Unmap replaces the buffer's previous contents.
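A minimal sketch of the recommended pattern, reusing the question's names (the std::vector shadow copy is an assumption of this sketch, not part of the question's code):
#include <cstring>
#include <vector>

std::vector<VERTEX_PNTTB> cpuVertices(NumVertices); // authoritative CPU-side copy

// ... mutate cpuVertices with the mouse, then re-upload the whole buffer:
D3D11_MAPPED_SUBRESOURCE mapped;
HRESULT hr = g_pImmediateContext->Map(m->vertexBuffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &mapped);
if (SUCCEEDED(hr))
{
    // WRITE_DISCARD memory is uninitialized, so rewrite the entire buffer from the CPU copy.
    memcpy(mapped.pData, cpuVertices.data(), sizeof(VERTEX_PNTTB) * cpuVertices.size());
    g_pImmediateContext->Unmap(m->vertexBuffer, 0);
}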
I've been trying to profile OpenCL host code for FIR filtering on Mac, Ubuntu, and other platforms. My host code and kernel are below.
The issue is that irrespective of the number of samples I provide to the FIR filter, clEnqueueNDRangeKernel ends up taking the same amount of time. I've profiled clEnqueueReadBuffer and clEnqueueWriteBuffer as well, and somehow they also take the same amount of time. On Mac I'm profiling with mach_absolute_time as well as OpenCL events; on Ubuntu I'm profiling with PAPI. I'm unable to understand why this is happening: ideally, as the number of samples grows, clEnqueueReadBuffer, clEnqueueWriteBuffer, and the kernel execution should all take more time.
Kernel:
__kernel void fir4(
__global float* input,
__global float* output)
{
int i = get_global_id(0);
int j = 0;
int coeff[4] = {5,7,5,7};
/*for(j=0;j<4;j++)
{
output[i] += coeff[j]*(input[i+4-j-1]);
}*/
//unrolled
output[i] += coeff[0]*(input[i+4-0-1]);
output[i] += coeff[1]*(input[i+4-1-1]);
output[i] += coeff[2]*(input[i+4-2-1]);
output[i] += coeff[3]*(input[i+4-3-1]);
}
__kernel void fir8(
__global float* input,
__global float* output)
{
int i = get_global_id(0);
int j = 0;
int coeff[8] = {5,7,5,7,5,7,5,7};
for(j=0;j<8;j++)
{
output[i] += coeff[j]*(input[i+8-j-1]);
}
}
__kernel void fir12(
__global float* input,
__global float* output)
{
int i = get_global_id(0);
int j = 0;
int coeff[12] = {5,7,5,7,5,7,5,7,5,7,5,7};
for(j=0;j<12;j++)
{
output[i] += coeff[j]*(input[i+12-j-1]);
}
}
Host Code:
// Use a static data size for simplicity
//
#define DATA_SIZE (48000)
#define NUM_COEFF (4)
int main(int argc, char** argv)
{
uint64_t start;
uint64_t end;
uint64_t elapsed;
double elapsedmilli;
int err; // error code returned from api calls
float data[DATA_SIZE]; // original data set given to device
float coeff[NUM_COEFF];
float results_host[DATA_SIZE] = {};
float results[DATA_SIZE]; // results returned from device
unsigned int correct; // number of correct results returned
size_t global; // global domain size for our calculation
size_t local; // local domain size for our calculation
cl_event event; //Linking event to kernel for profiling
cl_platform_id platform_id = NULL; // compute device platform id
cl_device_id device_id; // compute device id
cl_context context; // compute context
cl_command_queue commands; // compute command queue
cl_program program; // compute program
cl_kernel kernel; // compute kernel
cl_mem input; // device memory used for the input array
cl_mem output; // device memory used for the output array
// Fill our data set with random float values
//
int i,j = 0;
unsigned int count = DATA_SIZE;
unsigned int taps = NUM_COEFF;
for(i = 0; i < count; i++)
data[i] = rand() / (float)RAND_MAX;
for(i=0; i < taps; i++)
{
if(!(i%2))
coeff[i] = 5;
else
coeff[i] = 7;
}
//Connect to a platform on device
err = clGetPlatformIDs(1, &platform_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to locate opencl platform!\n");
return EXIT_FAILURE;
}
// Connect to a compute device
//
int gpu = 0;
err = clGetDeviceIDs(platform_id, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to create a device group!\n");
return EXIT_FAILURE;
}
// Create a compute context
//
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!context)
{
printf("Error: Failed to create a compute context!\n");
return EXIT_FAILURE;
}
// Create a command commands
//
commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
if (!commands)
{
printf("Error: Failed to create a command commands!\n");
return EXIT_FAILURE;
}
//Use function and load the kernel source from .cl files in the same folder
//
char *KernelSource = load_program_source("fir.cl");
// Create the compute program from the source buffer
//
program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
if (!program)
{
printf("Error: Failed to create compute program!\n");
return EXIT_FAILURE;
}
// Build the program executable
//
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
// Create the compute kernel in the program we wish to run
//
switch(taps)
{
case(4):
{
kernel = clCreateKernel(program, "fir4", &err);
break;
}
case(8):
{
kernel = clCreateKernel(program, "fir8", &err);
break;
}
case(12):
{
kernel = clCreateKernel(program, "fir12", &err);
break;
}
default:
{
kernel = clCreateKernel(program, "fir4", &err);
break;
}
}
if (!kernel || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel! - %d\n",err);
exit(1);
}
// Create the input and output arrays in device memory for our calculation
//
input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
if (!input || !output)
{
printf("Error: Failed to allocate device memory!\n");
exit(1);
}
// Write our data set into the input array in device memory
//
err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!\n");
exit(1);
}
// Set the arguments to our compute kernel
//
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", err);
exit(1);
}
// Get the maximum work group size for executing the kernel on the device
//
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to retrieve kernel work group info! %d\n", err);
exit(1);
}
// Execute the kernel over the entire range of our 1d input data set
// using the maximum number of work group items for this device
//
global = count;
local = 48;
start = mach_absolute_time();
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, &event);
if (err)
{
printf("Error: Failed to execute kernel!-%d\n",err);
return EXIT_FAILURE;
}
// Wait for the command commands to get serviced before reading back results
//
clWaitForEvents(1, &event);
clFinish(commands);
end = mach_absolute_time();
cl_ulong time_start, time_end;
double total_time;
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
total_time = time_end - time_start;
printf("cl:main timing:opencl clEnqueueNDRangeKernel %0.3f us\n", total_time / 1000.0);
elapsed = end - start;
struct mach_timebase_info info;
mach_timebase_info(&info);
double t = 1e-9 * (elapsed) * info.numer / info.denom;
elapsedmilli = 1e-6 * (elapsed) * info.numer / info.denom;
printf("cl:main timing:MACH clEnqueueNDRangeKernel %f ms, %d elapsed\n",elapsedmilli,elapsed);
// Read back the results from the device to verify the output
//
err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
if (err != CL_SUCCESS)
{
printf("Error: Failed to read output array! %d\n", err);
exit(1);
}
// Validate our results
//
correct = 0;
for(i=0; i<DATA_SIZE; i++)
{
for(j=0;j<NUM_COEFF;j++)
{
results_host[i]+=coeff[j]*(data[i+NUM_COEFF-j-1]);
}
//printf("Host Output[%d]-%f\n",i,results_host[i]);
}
for(i = 0; i < count; i++)
{
if(results[i] == results_host[i])
correct++;
//printf("CL Output[%d]-%f\n",i,results[i]);
}
// Print a brief summary detailing the results
//
printf("Computed '%d/%d' correct values! Samples-%d,Taps-%d\n", correct, count, DATA_SIZE, NUM_COEFF);
// Shutdown and cleanup
//
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
}
Adding just 10-20 multiplications and additions per work-item is not comparable to the kernel launch overhead. Try a 100- or 1000-wide coefficient array.
Using more input elements per work-item that way just increases the cache hit count (and hit ratio), because more threads read from the same locations.
If DATA_SIZE were several million, all the data could not fit in cache, and execution would slow down linearly with length. 48000 samples is less than 200 kB. An HD 5850, for example, has 512 kB of L2 cache (3x the bandwidth of main memory) and 8 kB of L1 per compute unit (much faster still).
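For example, a sketch of a heavier kernel along these lines, with the tap count supplied at build time, e.g. clBuildProgram(program, 0, NULL, "-DNUM_TAPS=1024", NULL, NULL) (the constant coefficient is a placeholder; real taps would live in a __constant buffer):
__kernel void fir_n(__global const float* input, __global float* output)
{
    int i = get_global_id(0);
    float acc = 0.0f;
    // assumes 'input' is padded with NUM_TAPS-1 extra samples to avoid out-of-bounds reads
    for (int j = 0; j < NUM_TAPS; j++)
        acc += 7.0f * input[i + NUM_TAPS - j - 1]; // placeholder coefficient
    output[i] = acc;
}
With NUM_TAPS around 1024 the arithmetic should finally dominate the launch overhead, and the run time should start scaling with the work size.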
I am trying to implement fast IO under Windows, working my way up to overlapped IO. In my research, unbuffered IO requires page-aligned buffers. I've attempted to implement this in my code below. However, ReadFile occasionally reports a last error of ERROR_NOACCESS (error 998) before completing the read, after a few successful reads of the page-aligned buffer; sometimes 16, sometimes 4, etc.
I can't for the life of me figure out why I'm occasionally getting this error. Any insight would be helpful.
ci::BufferRef CinderSequenceRendererApp::CreateFileLoadWinNoBufferSequential(fs::path path) {
HANDLE file = CreateFile(path.c_str(), GENERIC_READ, 0, NULL, OPEN_EXISTING, FILE_FLAG_NO_BUFFERING | FILE_FLAG_SEQUENTIAL_SCAN, 0);
if (file == INVALID_HANDLE_VALUE)
{
console() << "Could not open file for reading" << std::endl;
}
ci::BufferRef latestAvailableBufferRef = nullptr;
LARGE_INTEGER nLargeInteger = { 0 };
GetFileSizeEx(file, &nLargeInteger);
// how many reads do we need to fill our buffer with a buffer size of x and a read size of y
// Our buffer needs to hold 'n' sector sizes that wil fit the size of the file
SYSTEM_INFO si;
GetSystemInfo(&si);
long readAmount = si.dwPageSize;
int numReads = 0;
ULONG bufferSize = 0;
// calculate sector aligned buffer size that holds our file size
while (bufferSize < nLargeInteger.QuadPart)
{
numReads++;
bufferSize = (numReads) * readAmount;
}
// need one page extra for null if we need it
latestAvailableBufferRef = ci::Buffer::create(bufferSize + readAmount);
if (latestAvailableBufferRef != nullptr)
{
DWORD outputBytes = 1;
// output bytes = 0 when OEF
void* address = latestAvailableBufferRef->getData();
DWORD bytesRead = 0;
while (outputBytes != 0)
{
bool result = ReadFile(file, address, readAmount, &outputBytes, 0);
if (!result )//&& (outputBytes == 0))
{
getLastReadError();
}
address = (void*)((long)address + readAmount);
bytesRead += outputBytes;
}
}
CloseHandle(file);
// resize our buffer to expected file size?
latestAvailableBufferRef->resize(nLargeInteger.QuadPart);
return latestAvailableBufferRef;
}
Cast to long long: I was truncating my pointer address. Duh. Thanks to @jonathan-potter
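For the record, on 64-bit Windows a long is still 32 bits (LLP64), so (long)address chops the pointer whenever the buffer lives above the first 4 GB of address space. Equivalent safe forms of the advance:
address = (void*)((long long)address + readAmount);  // the fix from the answer
address = (void*)((uintptr_t)address + readAmount);  // <cstdint>, always pointer-sized
address = static_cast<char*>(address) + readAmount;  // plain byte arithmetic, no integer casts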
I am working on capturing and streaming audio to an RTMP server. I work under macOS (in Xcode), so for capturing the audio sample buffer I use the AVFoundation framework. But for encoding and streaming I need to use the ffmpeg API and the libfaac encoder, so the output format must be AAC (to support stream playback on iOS devices).
And I faced this problem: the audio capture device (in my case a Logitech camera) gives me a sample buffer with 512 LPCM samples, and I can select the input sample rate from 16000, 24000, 36000 or 48000 Hz. When I give these 512 samples to the AAC encoder (configured for the appropriate sample rate), I hear slow and jerky audio (it seems like a piece of silence after each frame).
I figured out (maybe I am wrong) that the libfaac encoder accepts audio frames only with 1024 samples. When I set the input sample rate to 24000 and resample the input buffer to 48000 before encoding, I obtain 1024 resampled samples. After encoding these 1024 samples to AAC, I hear the proper sound on output. But my webcam produces 512 samples per buffer regardless of the input sample rate, while the output sample rate must be 48000 Hz. So I need to resample in any case, and I will not obtain exactly 1024 samples per buffer after resampling.
Is there a way to solve this problem within the ffmpeg API?
I would be grateful for any help.
PS:
I guess that I could accumulate resampled buffers until the sample count reaches 1024 and then encode them, but this is a stream, so there would be trouble with the resulting timestamps and with other input devices; such a solution is not suitable.
The current issue came out of the problem described in this question: How to fill audio AVFrame (ffmpeg) with the data obtained from CMSampleBufferRef (AVFoundation)?
Here is the code with the audio codec configuration (there was also a video stream, but video works fine):
/*global variables*/
static AVFrame *aframe;
static AVFrame *frame;
AVOutputFormat *fmt;
AVFormatContext *oc;
AVStream *audio_st, *video_st;
void Init()
{
AVCodec *audio_codec, *video_codec;
int ret;
avcodec_register_all();
av_register_all();
avformat_network_init();
avformat_alloc_output_context2(&oc, NULL, "flv", filename);
fmt = oc->oformat;
oc->oformat->video_codec = AV_CODEC_ID_H264;
oc->oformat->audio_codec = AV_CODEC_ID_AAC;
video_st = NULL;
audio_st = NULL;
if (fmt->video_codec != AV_CODEC_ID_NONE)
{ //… /*init video codec*/}
if (fmt->audio_codec != AV_CODEC_ID_NONE) {
audio_codec= avcodec_find_encoder(fmt->audio_codec);
if (!(audio_codec)) {
fprintf(stderr, "Could not find encoder for '%s'\n",
avcodec_get_name(fmt->audio_codec));
exit(1);
}
audio_st= avformat_new_stream(oc, audio_codec);
if (!audio_st) {
fprintf(stderr, "Could not allocate stream\n");
exit(1);
}
audio_st->id = oc->nb_streams-1;
//AAC:
audio_st->codec->sample_fmt = AV_SAMPLE_FMT_S16;
audio_st->codec->bit_rate = 32000;
audio_st->codec->sample_rate = 48000;
audio_st->codec->profile=FF_PROFILE_AAC_LOW;
audio_st->time_base = (AVRational){1, audio_st->codec->sample_rate };
audio_st->codec->channels = 1;
audio_st->codec->channel_layout = AV_CH_LAYOUT_MONO;
if (oc->oformat->flags & AVFMT_GLOBALHEADER)
audio_st->codec->flags |= CODEC_FLAG_GLOBAL_HEADER;
}
if (video_st)
{
// …
/*prepare video*/
}
if (audio_st)
{
aframe = avcodec_alloc_frame();
if (!aframe) {
fprintf(stderr, "Could not allocate audio frame\n");
exit(1);
}
AVCodecContext *c;
int ret;
c = audio_st->codec;
ret = avcodec_open2(c, audio_codec, 0);
if (ret < 0) {
fprintf(stderr, "Could not open audio codec: %s\n", av_err2str(ret));
exit(1);
}
//…
}
And resampling and encoding audio:
if (mType == kCMMediaType_Audio)
{
CMSampleTimingInfo timing_info;
CMSampleBufferGetSampleTimingInfo(sampleBuffer, 0, &timing_info);
double pts=0;
double dts=0;
AVCodecContext *c;
AVPacket pkt = { 0 }; // data and size must be 0;
int got_packet, ret;
av_init_packet(&pkt);
c = audio_st->codec;
CMItemCount numSamples = CMSampleBufferGetNumSamples(sampleBuffer);
NSUInteger channelIndex = 0;
CMBlockBufferRef audioBlockBuffer = CMSampleBufferGetDataBuffer(sampleBuffer);
size_t audioBlockBufferOffset = (channelIndex * numSamples * sizeof(SInt16));
size_t lengthAtOffset = 0;
size_t totalLength = 0;
SInt16 *samples = NULL;
CMBlockBufferGetDataPointer(audioBlockBuffer, audioBlockBufferOffset, &lengthAtOffset, &totalLength, (char **)(&samples));
const AudioStreamBasicDescription *audioDescription = CMAudioFormatDescriptionGetStreamBasicDescription(CMSampleBufferGetFormatDescription(sampleBuffer));
SwrContext *swr = swr_alloc();
int in_smprt = (int)audioDescription->mSampleRate;
av_opt_set_int(swr, "in_channel_layout", AV_CH_LAYOUT_MONO, 0);
av_opt_set_int(swr, "out_channel_layout", audio_st->codec->channel_layout, 0);
av_opt_set_int(swr, "in_channel_count", audioDescription->mChannelsPerFrame, 0);
av_opt_set_int(swr, "out_channel_count", audio_st->codec->channels, 0);
av_opt_set_int(swr, "out_channel_layout", audio_st->codec->channel_layout, 0);
av_opt_set_int(swr, "in_sample_rate", audioDescription->mSampleRate,0);
av_opt_set_int(swr, "out_sample_rate", audio_st->codec->sample_rate,0);
av_opt_set_sample_fmt(swr, "in_sample_fmt", AV_SAMPLE_FMT_S16, 0);
av_opt_set_sample_fmt(swr, "out_sample_fmt", audio_st->codec->sample_fmt, 0);
swr_init(swr);
uint8_t **input = NULL;
int src_linesize;
int in_samples = (int)numSamples;
ret = av_samples_alloc_array_and_samples(&input, &src_linesize, audioDescription->mChannelsPerFrame,
in_samples, AV_SAMPLE_FMT_S16P, 0);
*input=(uint8_t*)samples;
uint8_t *output=NULL;
int out_samples = av_rescale_rnd(swr_get_delay(swr, in_smprt) +in_samples, (int)audio_st->codec->sample_rate, in_smprt, AV_ROUND_UP);
av_samples_alloc(&output, NULL, audio_st->codec->channels, out_samples, audio_st->codec->sample_fmt, 0);
in_samples = (int)numSamples;
out_samples = swr_convert(swr, &output, out_samples, (const uint8_t **)input, in_samples);
aframe->nb_samples =(int) out_samples;
ret = avcodec_fill_audio_frame(aframe, audio_st->codec->channels, audio_st->codec->sample_fmt,
(uint8_t *)output,
(int) out_samples *
av_get_bytes_per_sample(audio_st->codec->sample_fmt) *
audio_st->codec->channels, 1);
aframe->channel_layout = audio_st->codec->channel_layout;
aframe->channels=audio_st->codec->channels;
aframe->sample_rate= audio_st->codec->sample_rate;
if (timing_info.presentationTimeStamp.timescale!=0)
pts=(double) timing_info.presentationTimeStamp.value/timing_info.presentationTimeStamp.timescale;
aframe->pts=pts*audio_st->time_base.den;
aframe->pts = av_rescale_q(aframe->pts, audio_st->time_base, audio_st->codec->time_base);
ret = avcodec_encode_audio2(c, &pkt, aframe, &got_packet);
if (ret < 0) {
fprintf(stderr, "Error encoding audio frame: %s\n", av_err2str(ret));
exit(1);
}
swr_free(&swr);
if (got_packet)
{
pkt.stream_index = audio_st->index;
pkt.pts = av_rescale_q(pkt.pts, audio_st->codec->time_base, audio_st->time_base);
pkt.dts = av_rescale_q(pkt.dts, audio_st->codec->time_base, audio_st->time_base);
// Write the compressed frame to the media file.
ret = av_interleaved_write_frame(oc, &pkt);
if (ret != 0) {
fprintf(stderr, "Error while writing audio frame: %s\n",
av_err2str(ret));
exit(1);
}
}
I also ended up here after having a similar problem. I'm reading audio and video from a Blackmagic DeckLink SDI card in 720p50, meaning I had 960 samples per video frame (48k/50fps) that I wanted to encode together with the video. I got really weird audio when sending only 960 samples to aacenc, and it didn't really complain about this fact either.
I started using AVAudioFifo (see ffmpeg/doc/examples/transcode_aac.c) and kept adding frames to it until I had enough samples to satisfy aacenc. This means some samples will play too late, I guess, since pts will be set on 1024 samples when the first 960 should really have another value. But it's not really noticeable, as far as I can hear/see.
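For reference, a minimal sketch of that AVAudioFifo accumulation in the style of ffmpeg/doc/examples/transcode_aac.c (enc_ctx, input_frame, output_frame and next_pts are this sketch's assumptions; error checks elided):
#include <libavutil/audio_fifo.h>

AVAudioFifo *fifo = av_audio_fifo_alloc(enc_ctx->sample_fmt, enc_ctx->channels, 1);

// For every captured/resampled buffer:
av_audio_fifo_write(fifo, (void **)input_frame->data, input_frame->nb_samples);
while (av_audio_fifo_size(fifo) >= enc_ctx->frame_size) {
    av_audio_fifo_read(fifo, (void **)output_frame->data, enc_ctx->frame_size);
    output_frame->pts = next_pts;      // track pts in samples yourself
    next_pts += enc_ctx->frame_size;
    // hand output_frame to the AAC encoder here
}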
I had a similar problem. I was encoding PCM packets to AAC, and the length of the PCM packets was sometimes smaller than 1024 samples.
If I encoded a packet smaller than 1024, the audio came out slow. On the other hand, if I threw it away, the audio got faster. From my observation, swr_convert doesn't do any automatic buffering.
I ended up with a buffering scheme where packets fill a fixed 1024-sample buffer, and the buffer is encoded and cleared every time it fills up.
The function to fill the buffer is below:
// put frame data into buffer of fixed size
bool ffmpegHelper::putAudioBuffer(const AVFrame *pAvFrameIn, AVFrame **pAvFrameBuffer, AVCodecContext *dec_ctx, int frame_size, int &k0) {
// prepare pFrameAudio
if (!(*pAvFrameBuffer)) {
if (!(*pAvFrameBuffer = av_frame_alloc())) {
av_log(NULL, AV_LOG_ERROR, "Alloc frame failed\n");
return false;
} else {
(*pAvFrameBuffer)->format = dec_ctx->sample_fmt;
(*pAvFrameBuffer)->channels = dec_ctx->channels;
(*pAvFrameBuffer)->sample_rate = dec_ctx->sample_rate;
(*pAvFrameBuffer)->nb_samples = frame_size;
int ret = av_frame_get_buffer(*pAvFrameBuffer, 0);
if (ret < 0) {
char err[500];
av_log(NULL, AV_LOG_ERROR, "get audio buffer failed: %s\n",
av_make_error_string(err, AV_ERROR_MAX_STRING_SIZE, ret));
return false;
}
(*pAvFrameBuffer)->nb_samples = 0;
(*pAvFrameBuffer)->pts = pAvFrameIn->pts;
}
}
// copy input data to buffer
int n_channels = pAvFrameIn->channels;
int new_samples = std::min(pAvFrameIn->nb_samples - k0, frame_size - (*pAvFrameBuffer)->nb_samples); // std::min from <algorithm>
int k1 = (*pAvFrameBuffer)->nb_samples;
if (pAvFrameIn->format == AV_SAMPLE_FMT_S16) {
int16_t *d_in = (int16_t *)pAvFrameIn->data[0];
d_in += n_channels * k0;
int16_t *d_out = (int16_t *)(*pAvFrameBuffer)->data[0];
d_out += n_channels * k1;
for (int i = 0; i < new_samples; ++i) {
for (int j = 0; j < pAvFrameIn->channels; ++j) {
*d_out++ = *d_in++;
}
}
} else {
printf("not handled format for audio buffer\n");
return false;
}
(*pAvFrameBuffer)->nb_samples += new_samples;
k0 += new_samples;
return true;
}
And the loop for fill buffer and encode is below:
// transcoding needed
int got_frame;
AVMediaType stream_type;
// decode the packet (do it your self)
decodePacket(packet, dec_ctx, &pAvFrame_, got_frame);
if (enc_ctx->codec_type == AVMEDIA_TYPE_AUDIO) {
ret = 0;
// break audio packet down to buffer
if (enc_ctx->frame_size > 0) {
int k = 0;
while (k < pAvFrame_->nb_samples) {
if (!putAudioBuffer(pAvFrame_, &pFrameAudio_, dec_ctx, enc_ctx->frame_size, k))
return false;
if (pFrameAudio_->nb_samples == enc_ctx->frame_size) {
// the buffer is full, encode it (do it yourself)
ret = encodeFrame(pFrameAudio_, stream_index, got_frame, false);
if (ret < 0)
return false;
pFrameAudio_->pts += enc_ctx->frame_size;
pFrameAudio_->nb_samples = 0;
}
}
} else {
ret = encodeFrame(pAvFrame_, stream_index, got_frame, false);
}
} else {
// encode packet directly
ret = encodeFrame(pAvFrame_, stream_index, got_frame, false);
}
You have to break the sample buffer into chunks of 1024 samples. I did this for recording MP3 on Android; for more info follow these links: link1, link2
If anyone ended up here: I had the same issue, and just as @Mohit pointed out, for AAC each audio frame has to be broken down into 1024-sample chunks.
Example:
uint8_t *buffer = (uint8_t*) malloc(1024 * sizeof(int16_t)); // 1024 samples (not bytes) of s16 mono
AVFrame *frame = av_frame_alloc();
frame->nb_samples = 1024;
while (fread(buffer, 1024 * sizeof(int16_t), 1, fp) == 1) {
    frame->data[0] = buffer;
    // hand the 1024-sample frame to the encoder here
}
A possible solution is to use the asetnsamples filter, which sets the number of samples for each output audio frame:
https://ffmpeg.org/ffmpeg-filters.html#asetnsamples
You can feed the filter your input frames, and the resulting output frames will each have the desired number of samples. The number of samples set on the filter should equal the frame_size of the encoder's AVCodecContext.
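For instance, on the command line (file names are placeholders; programmatically you would insert the same filter via libavfilter and set n to the encoder's frame_size):
ffmpeg -i input.wav -af asetnsamples=n=1024:p=0 -c:a aac output.m4a
Here p=0 leaves the final, possibly shorter, frame unpadded instead of zero-padding it.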