I am trying to create a video from images using the ffmpeg library. The images have a size of 1920x1080 and are supposed to be encoded with H.264 using a .mkv container.
I have come across various problems, thinking I am getting closer to a solution, but this one I am really stuck on. With the settings I use, the first X frames (around 40, depending on what and how many images I use for the video) of my video are not encoded. avcodec_encode_video2 does not return any error (return value is 0) with got_picture_ptr = 0.
The result is a video that actually looks as expected, but the first seconds are weirdly jumpy.
So this is how I create the video file:
// m_codecContext is an instance variable of type AVCodecContext *
// m_formatCtx is an instance variable of type AVFormatContext *
// outputFileName is a valid filename ending with .mkv
// --- Container / encoder setup ---
// Guess the muxer from the output filename; fall back to MPEG.
AVOutputFormat *oformat = av_guess_format(NULL, outputFileName, NULL);
if (oformat == NULL)
{
    oformat = av_guess_format("mpeg", NULL, NULL);
}
// oformat->video_codec is AV_CODEC_ID_H264
AVCodec *codec = avcodec_find_encoder(oformat->video_codec);
m_codecContext = avcodec_alloc_context3(codec);
m_codecContext->codec_id = oformat->video_codec;
m_codecContext->codec_type = AVMEDIA_TYPE_VIDEO;
m_codecContext->gop_size = 30;
m_codecContext->bit_rate = width * height * 4;  // fixed: missing ';' in the original
m_codecContext->width = width;
m_codecContext->height = height;
m_codecContext->time_base = (AVRational){1, frameRate};
m_codecContext->max_b_frames = 1;
m_codecContext->pix_fmt = AV_PIX_FMT_YUV420P;
m_formatCtx = avformat_alloc_context();
m_formatCtx->oformat = oformat;
m_formatCtx->video_codec_id = oformat->video_codec;
snprintf(m_formatCtx->filename, sizeof(m_formatCtx->filename), "%s", outputFileName);
AVStream *videoStream = avformat_new_stream(m_formatCtx, codec);
if (!videoStream)
{
    printf("Could not allocate stream\n");
}
videoStream->codec = m_codecContext;
// Some containers (mkv included) require codec extradata in the header.
if (m_formatCtx->oformat->flags & AVFMT_GLOBALHEADER)
{
    m_codecContext->flags |= CODEC_FLAG_GLOBAL_HEADER;
}
// fixed: the original line "avcodec_open2(...) < 0);" was a syntax error —
// the comparison belongs inside an if-check.
if (avcodec_open2(m_codecContext, codec, NULL) < 0)
{
    printf("Could not open codec\n");
}
avio_open(&m_formatCtx->pb, outputFileName.toStdString().c_str(), AVIO_FLAG_WRITE);
avformat_write_header(m_formatCtx, NULL);
this is how the frames are added:
void VideoCreator::writeImageToVideo(const QSharedPointer<QImage> &img, int frameIndex)
{
AVFrame *frame = avcodec_alloc_frame();
/* alloc image and output buffer */
int size = m_codecContext->width * m_codecContext->height;
int numBytes = avpicture_get_size(m_codecContext->pix_fmt, m_codecContext->width, m_codecContext->height);
uint8_t *outbuf = (uint8_t *)malloc(numBytes);
uint8_t *picture_buf = (uint8_t *)av_malloc(numBytes);
int ret = av_image_fill_arrays(frame->data, frame->linesize, picture_buf, m_codecContext->pix_fmt, m_codecContext->width, m_codecContext->height, 1);
frame->data[0] = picture_buf;
frame->data[1] = frame->data[0] + size;
frame->data[2] = frame->data[1] + size/4;
frame->linesize[0] = m_codecContext->width;
frame->linesize[1] = m_codecContext->width/2;
frame->linesize[2] = m_codecContext->width/2;
fflush(stdout);
for (int y = 0; y < m_codecContext->height; y++)
{
for (int x = 0; x < m_codecContext->width; x++)
{
unsigned char b = img->bits()[(y * m_codecContext->width + x) * 4 + 0];
unsigned char g = img->bits()[(y * m_codecContext->width + x) * 4 + 1];
unsigned char r = img->bits()[(y * m_codecContext->width + x) * 4 + 2];
unsigned char Y = (0.257 * r) + (0.504 * g) + (0.098 * b) + 16;
frame->data[0][y * frame->linesize[0] + x] = Y;
if (y % 2 == 0 && x % 2 == 0)
{
unsigned char V = (0.439 * r) - (0.368 * g) - (0.071 * b) + 128;
unsigned char U = -(0.148 * r) - (0.291 * g) + (0.439 * b) + 128;
frame->data[1][y/2 * frame->linesize[1] + x/2] = U;
frame->data[2][y/2 * frame->linesize[2] + x/2] = V;
}
}
}
int pts = frameIndex;//(1.0 / 30.0) * 90.0 * frameIndex;
frame->pts = pts;//av_rescale_q(m_codecContext->coded_frame->pts, m_codecContext->time_base, formatCtx->streams[0]->time_base); //(1.0 / 30.0) * 90.0 * frameIndex;
int got_packet_ptr;
AVPacket packet;
av_init_packet(&packet);
packet.data = outbuf;
packet.size = numBytes;
packet.stream_index = formatCtx->streams[0]->index;
packet.flags |= AV_PKT_FLAG_KEY;
packet.pts = packet.dts = pts;
m_codecContext->coded_frame->pts = pts;
ret = avcodec_encode_video2(m_codecContext, &packet, frame, &got_packet_ptr);
if (got_packet_ptr != 0)
{
m_codecContext->coded_frame->pts = pts; // Set the time stamp
if (m_codecContext->coded_frame->pts != (0x8000000000000000LL))
{
pts = av_rescale_q(m_codecContext->coded_frame->pts, m_codecContext->time_base, formatCtx->streams[0]->time_base);
}
packet.pts = pts;
if(m_codecContext->coded_frame->key_frame)
{
packet.flags |= AV_PKT_FLAG_KEY;
}
std::cout << "pts: " << packet.pts << ", dts: " << packet.dts << std::endl;
av_interleaved_write_frame(formatCtx, &packet);
av_free_packet(&packet);
}
free(picture_buf);
free(outbuf);
av_free(frame);
printf("\n");
}
and this is the cleanup:
// Drain delayed frames from the encoder: keep passing NULL as the frame until
// the encoder stops producing packets, then finalize and tear down.
int numBytes = avpicture_get_size(m_codecContext->pix_fmt, m_codecContext->width, m_codecContext->height);
int got_packet_ptr = 1;
int ret;
while (got_packet_ptr)
{
    uint8_t *outbuf = (uint8_t *)malloc(numBytes);
    AVPacket packet;
    av_init_packet(&packet);
    packet.data = outbuf;
    packet.size = numBytes;
    // NULL frame => flush mode: the encoder emits any buffered packets.
    ret = avcodec_encode_video2(m_codecContext, &packet, NULL, &got_packet_ptr);
    if (got_packet_ptr)
    {
        av_interleaved_write_frame(m_formatCtx, &packet);
    }
    av_free_packet(&packet);
    free(outbuf);
}
// fixed: the original wrote `formatCtx`, which is undeclared — the member
// used everywhere else in this file is m_formatCtx.
av_write_trailer(m_formatCtx);
avcodec_close(m_codecContext);
av_free(m_codecContext);
printf("\n");
I assume it might be tied to the PTS and DTS values, but I have tried EVERYTHING. The frame index seems to make the most sense. The images are correct, I can save them to files without any problems. I am running out of ideas.
I would be incredibly thankful if there was someone out there who knew better than me...
Cheers,
marikaner
UPDATE:
If this is of any help this is the output at the end of the video encoding:
[libx264 # 0x7fffc00028a0] frame I:19 Avg QP:14.24 size:312420
[libx264 # 0x7fffc00028a0] frame P:280 Avg QP:19.16 size:148867
[libx264 # 0x7fffc00028a0] frame B:181 Avg QP:21.31 size: 40540
[libx264 # 0x7fffc00028a0] consecutive B-frames: 24.6% 75.4%
[libx264 # 0x7fffc00028a0] mb I I16..4: 30.9% 45.5% 23.7%
[libx264 # 0x7fffc00028a0] mb P I16..4: 4.7% 9.1% 4.5% P16..4: 23.5% 16.6% 12.6% 0.0% 0.0% skip:28.9%
[libx264 # 0x7fffc00028a0] mb B I16..4: 0.6% 0.5% 0.3% B16..8: 26.7% 11.0% 5.5% direct: 3.9% skip:51.5% L0:39.4% L1:45.0% BI:15.6%
[libx264 # 0x7fffc00028a0] final ratefactor: 19.21
[libx264 # 0x7fffc00028a0] 8x8 transform intra:48.2% inter:47.3%
[libx264 # 0x7fffc00028a0] coded y,uvDC,uvAC intra: 54.9% 53.1% 30.4% inter: 25.4% 13.5% 4.2%
[libx264 # 0x7fffc00028a0] i16 v,h,dc,p: 41% 29% 11% 19%
[libx264 # 0x7fffc00028a0] i8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 16% 26% 31% 3% 4% 3% 7% 3% 6%
[libx264 # 0x7fffc00028a0] i4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 30% 26% 14% 4% 5% 4% 7% 4% 7%
[libx264 # 0x7fffc00028a0] i8c dc,h,v,p: 58% 26% 13% 3%
[libx264 # 0x7fffc00028a0] Weighted P-Frames: Y:17.1% UV:3.6%
[libx264 # 0x7fffc00028a0] ref P L0: 63.1% 21.4% 11.4% 4.1% 0.1%
[libx264 # 0x7fffc00028a0] ref B L0: 85.7% 14.3%
[libx264 # 0x7fffc00028a0] kb/s:27478.30
Libav is probably delaying the processing of the initial frames. A good practice is to check for any delayed frames after you have finished processing all frames. This is done as follows:
int i=NUMBER_OF_FRAMES_PREVIOUSLY_ENCODED
for(; got_packet_ptr; i++)
ret = avcodec_encode_video2(m_codecContext, &packet, NULL, &got_packet_ptr);
//Write the packets to a container after this.
The point is to pass a NULL pointer in place of the frame to be encoded and continue to do so until the packet you get is non-empty. See this link for the code example - the part under "get the delayed frames".
An easier way out would be to set the number of b frames to be 0.
m_codecContext->max_b_frames = 0;
Let me know if this works fine.
Also, you haven't used the libx264 API at all. You can make use of the libx264 APIs for encoding videos, they have a simpler and cleaner syntax. Plus it offers you more control over the settings and improved performance.
For writing the video stream to an mkv container, you will still have to use the libav libraries, though.
At least for me adding
frame->width = m_codecContext->width;
frame->height = m_codecContext->height;
frame->format = m_codecContext->pix_fmt;
made this example code work as expected.
Related
As macOS and iPad support Sidecar now, CGGetActiveDisplayList provides a list of displays that contains the iPad when Sidecar is active — but how can I check which CGDisplay is the iPad?
CGDirectDisplayID displays[10] = {0};
uint32_t count;
CGError err = CGGetActiveDisplayList(10, displays, &count);
printf("%d %d\n", displays[0], displays[1]);
for (int i = 0; i < count; i++) {
CGDisplayModeRef mode = CGDisplayCopyDisplayMode(displays[i]);
int32_t io = CGDisplayModeGetIODisplayModeID(mode);
uint32_t io_flag = CGDisplayModeGetIOFlags(mode);
bool usable = CGDisplayModeIsUsableForDesktopGUI(mode);
size_t width = CGDisplayModeGetPixelWidth(mode);
size_t height = CGDisplayModeGetPixelHeight(mode);
printf("i: %d io: %d usable: %s, io flag: 0x%x, pixel size: %dx%d\n",i, io, usable ? "T": "F", io_flag, width, height);
}
Output:
i: 0 io: 8 usable: T, io flag: 0x7, pixel size: 2880x1800
i: 1 io: 0 usable: T, io flag: 0x2000007, pixel size: 2576x1892
I'm doing some performance tunning on a shared memory based message queue. I found a strange phenomenon that I can't explain: I ran the same code for 3 epochs, the avg running time is getting better for each epoch.
Here's the minimal demo code:
// Returns the current wall-clock time (CLOCK_REALTIME) in nanoseconds since
// the Unix epoch.
// fixed: `tp` was declared `static`, which makes concurrent calls a data race
// (two threads writing the same timespec). A plain automatic local is just as
// fast and thread-safe.
inline uint64_t current_time_nanos() {
  timespec tp;
  clock_gettime(CLOCK_REALTIME, &tp);
  return tp.tv_nsec + tp.tv_sec * 1000000000LLU;
}
// Benchmarks mutex-guarded memcpy into an mlock'd anonymous mapping, running
// the identical timed loop for three epochs over the same buffer so cache /
// TLB warm-up effects between epochs become visible.
void test() {
  static constexpr size_t TOTAL_SIZE = 16 * 1024 * 1024;
  static constexpr size_t COUNT = TOTAL_SIZE / sizeof(market_data);
  static_assert(TOTAL_SIZE % sizeof(market_data) == 0);
  market_data md;
  market_data *ptr =
      (market_data *)mmap(nullptr, TOTAL_SIZE, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (MAP_FAILED == ptr) {
    printf("failed to mmap: %s\n", strerror(errno));
    return;  // fixed: the original fell through and used MAP_FAILED as a pointer
  }
  pthread_mutex_t mtx;
  assert(0 == pthread_mutex_init(&mtx, nullptr));
  std::atomic_uint64_t pos{0};
  mlock(ptr, TOTAL_SIZE);  // pre-fault and pin the pages to avoid page-fault noise
  // One timed epoch. The original duplicated this loop three times verbatim.
  auto run_epoch = [&]() {
    pos = 0;
    auto st = current_time_nanos();
    for (size_t i = 0; i < COUNT; i++) {  // fixed: int vs size_t comparison
      assert(0 == pthread_mutex_lock(&mtx));
      memcpy(&ptr[pos.fetch_add(1, std::memory_order_acq_rel) % COUNT], &md,
             sizeof(market_data));
      assert(0 == pthread_mutex_unlock(&mtx));
    }
    auto ed = current_time_nanos();
    printf("total used: %lu, avg = %f.\n", ed - st, double(ed - st) / COUNT);
  };
  run_epoch();  // Epoch 1
  run_epoch();  // Epoch 2
  run_epoch();  // Epoch 3
  // fixed: the original leaked the mutex and the mapping.
  pthread_mutex_destroy(&mtx);
  munlock(ptr, TOTAL_SIZE);
  munmap(ptr, TOTAL_SIZE);
}
I've run the code multiple times, and the average execution time consistently improves with each epoch — e.g. epoch 3 always has the best performance.
I wonder why this happens, and how I can do some warmup so that I get the epoch-3 level of performance from the start, without actually doing the memcpy?
sample result:
total used: 2479219, avg = 75.659760.
total used: 2092045, avg = 63.844147.
total used: 1718318, avg = 52.438904.
Here's the detailed info:
CPU: Intel Xeon 6348 2.6GHZ (Cascade-Lake)
Compiler: G++ 10.2.1 with O3 enabled
I've already use mlock to avoid page fault. it helps a lot. I also try to use _mm_prefetch but there's no performance gain actually.(Or may be I'm not using it correctly)
I have a set of JPEG frames which I am muxing into an avi, which gives me a mjpeg video. This is the command I run on the console:
ffmpeg -y -start_number 0 -i %06d.JPEG -codec copy vid.avi
When I try to demux the video using ffmpeg C api, I get frames which are slightly different in values. Demuxing code looks something like this:
AVFormatContext* fmt_ctx = NULL;
AVCodecContext* cdc_ctx = NULL;
AVCodec* vid_cdc = NULL;
int ret;
unsigned int height, width;
....
// read_nframes is the number of frames to read
output_arr = new unsigned char [height * width * 3 *
sizeof(unsigned char) * read_nframes];
avcodec_open2(cdc_ctx, vid_cdc, NULL);
int num_bytes;
uint8_t* buffer = NULL;
const AVPixelFormat out_format = AV_PIX_FMT_RGB24;
num_bytes = av_image_get_buffer_size(out_format, width, height, 1);
buffer = (uint8_t*)av_malloc(num_bytes * sizeof(uint8_t));
AVFrame* vid_frame = NULL;
vid_frame = av_frame_alloc();
AVFrame* conv_frame = NULL;
conv_frame = av_frame_alloc();
av_image_fill_arrays(conv_frame->data, conv_frame->linesize, buffer,
out_format, width, height, 1);
struct SwsContext *sws_ctx = NULL;
sws_ctx = sws_getContext(width, height, cdc_ctx->pix_fmt,
width, height, out_format,
SWS_BILINEAR, NULL,NULL,NULL);
int frame_num = 0;
AVPacket vid_pckt;
while (av_read_frame(fmt_ctx, &vid_pckt) >=0) {
ret = avcodec_send_packet(cdc_ctx, &vid_pckt);
if (ret < 0)
break;
ret = avcodec_receive_frame(cdc_ctx, vid_frame);
if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF)
break;
if (ret >= 0) {
// convert image from native format to planar GBR
sws_scale(sws_ctx, vid_frame->data,
vid_frame->linesize, 0, vid_frame->height,
conv_frame->data, conv_frame->linesize);
unsigned char* r_ptr = output_arr +
(height * width * sizeof(unsigned char) * 3 * frame_num);
unsigned char* g_ptr = r_ptr + (height * width * sizeof(unsigned char));
unsigned char* b_ptr = g_ptr + (height * width * sizeof(unsigned char));
unsigned int pxl_i = 0;
for (unsigned int r = 0; r < height; ++r) {
uint8_t* avframe_r = conv_frame->data[0] + r*conv_frame->linesize[0];
for (unsigned int c = 0; c < width; ++c) {
r_ptr[pxl_i] = avframe_r[0];
g_ptr[pxl_i] = avframe_r[1];
b_ptr[pxl_i] = avframe_r[2];
avframe_r += 3;
++pxl_i;
}
}
++frame_num;
if (frame_num >= read_nframes)
break;
}
}
...
In my experience around two-thirds of the pixel values are different, each by +-1 (in a range of [0,255]). I am wondering is it due to some decoding scheme FFmpeg uses for reading JPEG frames? I tried encoding and decoding png frames, and it works perfectly fine. I am sure this is something to do with the libav decoding process because the MD5 values are consistent between the images and the video:
ffmpeg -i %06d.JPEG -f framemd5 -
ffmpeg -i vid.avi -f framemd5 -
In short my goal is to get the same pixel by pixel values for each JPEG frame as I would I have gotten if I was reading the JPEG images directly. Here is the stand-alone bitbucket code I used. It includes cmake files to build code, and a couple of jpeg frames with the converted avi file to test this problem. (give '--filetype png' to test the png decoding).
Class prototype is as follows:
#ifndef _FULL_MOTION_VIDEO_STREAM_H_
#define _FULL_MOTION_VIDEO_STREAM_H_
#include <memory>
#include <string>
#ifndef INT64_C
# define INT64_C(c) (c ## LL)
# define UINT64_C(c) (c ## ULL)
#endif
extern "C"
{
#include "libavutil/opt.h"
#include "libavcodec/avcodec.h"
#include "libavutil/channel_layout.h"
#include "libavutil/common.h"
#include "libavutil/imgutils.h"
#include "libavutil/mathematics.h"
#include "libavutil/samplefmt.h"
#include "libavformat/avformat.h"
#include <libavutil/timestamp.h>
#include <libswscale/swscale.h>
#include <libswresample/swresample.h>
}
// Encodes raw RGB frames with H.264 (libx264) and muxes them into an
// MPEG-2 transport stream written to `streamFilename`.
class FMVStream
{
public:
// Per-stream encoder state: the AVStream, pts bookkeeping, scratch frames,
// and the RGB->YUV conversion context.
struct OutputStream
{
OutputStream() :
st(0),
next_pts(0),
samples_count(0),
frame(0),
tmpFrame(0),
sws_ctx(0)
{
}
AVStream *st;
/* pts of the next frame that will be generated */
int64_t next_pts;
int samples_count;
AVFrame *frame; // YUV420P frame handed to the encoder
AVFrame *tmpFrame; // RGB staging frame for incoming raw data
struct SwsContext *sws_ctx; // RGB -> YUV conversion context
};
///
/// Constructor
///
FMVStream();
///
/// Destructor
///
~FMVStream();
///
/// Frame encoder helper function
///
/// Encodes a raw RGB frame into the transport stream
///
int EncodeFrame(uint8_t* frame);
///
/// Frame width setter
///
void setFrameWidth(int width);
///
/// Frame width getter
///
int getFrameWidth() const;
///
/// Frame height setter
///
void setFrameHeight(int height);
///
/// Frame height getter
///
int getFrameHeight() const;
///
/// Stream address setter
///
void setStreamAddress(const std::string& address);
///
/// Stream address getter
///
std::string getStreamAddress() const;
private:
///
/// Video Stream creation
///
AVStream* initVideoStream(AVFormatContext* oc);
///
/// Raw frame transcoder
///
/// This will convert the raw RGB frame to a raw YUV frame necessary for h.264 encoding
///
void CopyFrameData(uint8_t* src_frame);
///
/// Video frame allocator
///
AVFrame* AllocPicture(PixelFormat pix_fmt, int width, int height);
///
/// Debug print helper function
///
void print_sdp(AVFormatContext **avc, int n);
///
/// Write the frame to the stream
///
int write_frame(AVFormatContext *fmt_ctx, const AVRational *time_base, AVStream *st, AVPacket *pkt);
///
/// initialize the frame data
///
void initFrame();
// formatting data needed for output streaming and the output container (MPEG 2 TS)
AVOutputFormat* format; // NOTE(review): returned by av_guess_format — owned by libavformat, should not be freed
AVFormatContext* format_ctx;
// structure container for our video stream
OutputStream stream;
AVIOContext* io_ctx;
std::string streamFilename; // output target; the constructor defaults it to "test.mpeg"
int frameWidth;
int frameHeight;
};
#endif
This block starts the class declaration.
#include "FullMotionVideoStream.h"
#include <stdexcept>
#include <iostream>
// Sets up the MPEG-TS muxer, the H.264 video stream, the scratch frames, and
// the output IO context, then writes the container header and dumps the SDP.
// Throws std::runtime_error on any allocation/open failure.
FMVStream::FMVStream()
    : format(0),
    format_ctx(0),
    stream(),
    io_ctx(0),
    streamFilename("test.mpeg"),
    frameWidth(640),
    frameHeight(480)
{
    // Register all formats and codecs
    av_register_all();
    avcodec_register_all();
    // Init networking
    avformat_network_init();
    // Find format
    this->format = av_guess_format("mpegts", NULL, NULL);
    if (this->format == nullptr)
    {
        // fixed: av_guess_format returns NULL when the muxer is unavailable;
        // the original dereferenced the result unchecked.
        throw std::runtime_error("mpegts muxer not available");
    }
    // allocate the AVFormatContext
    this->format_ctx = avformat_alloc_context();
    if (!this->format_ctx)
    {
        throw std::runtime_error("avformat_alloc_context failed");
    }
    this->format_ctx->oformat = this->format;
    this->stream.st = initVideoStream(this->format_ctx);
    this->initFrame();
    // Allocate AVIOContext
    int ret = avio_open(&this->io_ctx, this->streamFilename.c_str(), AVIO_FLAG_WRITE);
    if (ret != 0)
    {
        throw std::runtime_error("avio_open failed");
    }
    this->format_ctx->pb = this->io_ctx;
    // Print some debug info about the format
    av_dump_format(this->format_ctx, 0, NULL, 1);
    // Begin the output by writing the container header
    avformat_write_header(this->format_ctx, NULL);
    AVFormatContext* ac[] = { this->format_ctx };
    print_sdp(ac, 1);
}
// Finalizes the container and releases resources in reverse order of creation.
FMVStream::~FMVStream()
{
    av_write_trailer(this->format_ctx);
    avcodec_close(this->stream.st->codec);
    avio_close(io_ctx);
    avformat_free_context(this->format_ctx);
    av_frame_free(&this->stream.frame);
    av_frame_free(&this->stream.tmpFrame);  // fixed: tmpFrame was leaked
    // fixed: removed av_free(this->format). av_guess_format() returns a
    // pointer to a registered AVOutputFormat owned by libavformat; freeing it
    // corrupts the allocator.
}
// Allocates an AVFrame plus a backing image buffer of the given pixel
// format and dimensions. Throws std::runtime_error on failure.
AVFrame* FMVStream::AllocPicture(PixelFormat pix_fmt, int width, int height)
{
    // Allocate a frame
    AVFrame* frame = av_frame_alloc();
    if (frame == nullptr)
    {
        // fixed: the message named avcodec_alloc_frame, but av_frame_alloc is called
        throw std::runtime_error("av_frame_alloc failed");
    }
    if (av_image_alloc(frame->data, frame->linesize, width, height, pix_fmt, 1) < 0)
    {
        av_frame_free(&frame);  // fixed: don't leak the frame shell on failure
        throw std::runtime_error("av_image_alloc failed");
    }
    frame->width = width;
    frame->height = height;
    frame->format = pix_fmt;
    return frame;
}
// Builds the SDP description for the given format contexts and dumps it to
// stdout (flushed so it appears immediately in interleaved logs).
void FMVStream::print_sdp(AVFormatContext **avc, int n)
{
    char buffer[2048];
    av_sdp_create(avc, n, buffer, sizeof(buffer));
    printf("SDP:\n%s\n", buffer);
    fflush(stdout);
}
// Creates the video stream on the given container, configures an H.264
// encoder context (baseline profile, 30 fps, 400 kb/s) and opens it.
// Throws std::runtime_error on any failure.
AVStream* FMVStream::initVideoStream(AVFormatContext *oc)
{
    AVStream* st = avformat_new_stream(oc, NULL);
    if (st == nullptr)
    {
        // fixed: the original constructed the exception but never threw it,
        // so a NULL stream fell through to a crash below.
        throw std::runtime_error("Could not alloc stream");
    }
    AVCodec* codec = avcodec_find_encoder(AV_CODEC_ID_H264);
    if (codec == nullptr)
    {
        throw std::runtime_error("couldn't find mpeg2 encoder");
    }
    st->codec = avcodec_alloc_context3(codec);
    st->codec->codec_id = AV_CODEC_ID_H264;
    st->codec->codec_type = AVMEDIA_TYPE_VIDEO;
    st->codec->bit_rate = 400000;
    st->codec->width = this->frameWidth;
    st->codec->height = this->frameHeight;
    st->time_base.num = 1;
    st->time_base.den = 30;
    // fixed: the encoder context needs its own time_base; the original set
    // only `framerate` — and with num/den swapped (1/30 instead of 30/1) —
    // which broke the packet timestamp rescaling in write_frame().
    st->codec->time_base = st->time_base;
    st->codec->framerate.num = 30;
    st->codec->framerate.den = 1;
    st->codec->max_b_frames = 2;
    st->codec->gop_size = 12;
    st->codec->pix_fmt = PIX_FMT_YUV420P;
    st->id = oc->nb_streams - 1;
    if (oc->oformat->flags & AVFMT_GLOBALHEADER)
    {
        st->codec->flags |= CODEC_FLAG_GLOBAL_HEADER;
    }
    // option setup for the codec
    av_opt_set(st->codec->priv_data, "profile", "baseline", AV_OPT_SEARCH_CHILDREN);
    if (avcodec_open2(st->codec, codec, NULL) < 0)
    {
        throw std::runtime_error("avcodec_open failed");
    }
    return st;
}
void FMVStream::initFrame()
{
// Allocate a tmp frame for converting our raw RGB data to YUV for encoding
this->stream.tmpFrame = this->AllocPicture(PIX_FMT_RGB24, this->frameWidth, this->frameHeight);
// Allocate a main frame
this->stream.frame = this->AllocPicture(PIX_FMT_YUV420P, this->frameWidth, this->frameHeight);
}
This block is attempting to convert from the raw RGB to our needed YUV format for h.264 encoding.
// Copies the caller's packed RGB24 frame into tmpFrame and converts it into
// the YUV420P frame consumed by the encoder.
void FMVStream::CopyFrameData(uint8_t* data)
{
    const int w = this->stream.st->codec->width;
    const int h = this->stream.st->codec->height;
    // tmpFrame already owns an RGB24 buffer (allocated once in AllocPicture);
    // fixed: the original av_malloc'd and avpicture_fill'd a fresh buffer on
    // every call, leaking it each frame.
    for (int y = 0; y < h; y++)
    {
        for (int x = 0; x < w; x++)
        {
            int offset = 3 * (x + y * w);
            // fixed: the original read src bytes at data[x + y*w] (+1, +2),
            // i.e. one byte per pixel — the source is packed RGB with three
            // bytes per pixel, so it must be indexed by `offset` too.
            this->stream.tmpFrame->data[0][offset + 0] = data[offset + 0]; // R
            this->stream.tmpFrame->data[0][offset + 1] = data[offset + 1]; // G
            this->stream.tmpFrame->data[0][offset + 2] = data[offset + 2]; // B
        }
    }
    // fixed: create the conversion context once and cache it; the original
    // rebuilt (and leaked) it on every frame, and declared the source as
    // PIX_FMT_RGB32 even though tmpFrame is RGB24.
    if (this->stream.sws_ctx == nullptr)
    {
        this->stream.sws_ctx = sws_getContext(w, h, PIX_FMT_RGB24,
                                              w, h, PIX_FMT_YUV420P,
                                              SWS_FAST_BILINEAR, NULL, NULL, NULL);
    }
    // use the scale function to transcode this raw frame to the correct type
    sws_scale(this->stream.sws_ctx, this->stream.tmpFrame->data,
              this->stream.tmpFrame->linesize, 0, h,
              this->stream.frame->data, this->stream.frame->linesize);
}
This is the block that encodes the raw data to h.264, and then send it out the Mpeg2 ts. I believe the problem lies within this block. I can put a break point in my write frame block and see that frames are being written, however, opening the resulting file in VLC results in a blank video. The file is approx 2Mb.
// Encodes one raw RGB frame — or, once 10 seconds' worth of pts have been
// produced, switches to flush mode (NULL frame) — and muxes any packet the
// encoder emits. Returns 0 while there is still work in flight, 1 when both
// the input is exhausted and the encoder produced nothing.
int FMVStream::EncodeFrame(uint8_t* data)
{
    AVCodecContext* c = this->stream.st->codec;
    AVRational one;
    one.den = one.num = 1;
    // check to see if we want to keep writing frames we can probably change this to a toggle switch
    if (av_compare_ts(this->stream.next_pts, this->stream.st->codec->time_base, 10, one) >= 0)
    {
        this->stream.frame = nullptr;
    }
    else
    {
        // Convert and load the frame data into the AVFrame struct
        CopyFrameData(data);
    }
    AVPacket pkt = { 0 };
    av_init_packet(&pkt);
    // fixed: the original stamped frame->pts unconditionally, dereferencing
    // stream.frame immediately after setting it to nullptr above.
    if (this->stream.frame != nullptr)
    {
        this->stream.frame->pts = (int64_t)((1.0 / this->stream.st->codec->framerate.den) * 90000.0 * this->stream.next_pts++);
    }
    int gotPacket, out_size, ret;
    out_size = avcodec_encode_video2(c, &pkt, this->stream.frame, &gotPacket);
    if (gotPacket == 1)
    {
        ret = write_frame(this->format_ctx, &c->time_base, this->stream.st, &pkt);
    }
    else
    {
        ret = 0;
    }
    if (ret < 0)
    {
        std::cerr << "Error writing video frame" << std::endl;
    }
    av_free_packet(&pkt);
    return ((this->stream.frame != nullptr) || gotPacket) ? 0 : 1;
}
// Tags the packet with its stream, rescales its timestamps from the encoder
// time base into the stream time base, and hands it to the interleaving muxer.
// Returns the muxer's result (0 on success, negative AVERROR on failure).
int FMVStream::write_frame(AVFormatContext *fmt_ctx, const AVRational *time_base, AVStream *st, AVPacket *pkt)
{
    pkt->stream_index = st->index;
    av_packet_rescale_ts(pkt, *time_base, st->time_base);
    return av_interleaved_write_frame(fmt_ctx, pkt);
}
// Sets the frame width used when allocating frames / configuring the encoder.
void FMVStream::setFrameWidth(const int width)
{
this->frameWidth = width;
}
// Returns the configured frame width in pixels.
int FMVStream::getFrameWidth() const
{
return this->frameWidth;
}
// Sets the frame height used when allocating frames / configuring the encoder.
void FMVStream::setFrameHeight(const int height)
{
this->frameHeight = height;
}
// Returns the configured frame height in pixels.
int FMVStream::getFrameHeight() const
{
return this->frameHeight;
}
// Sets the output target (filename or stream address) for the muxer.
void FMVStream::setStreamAddress(const std::string& address)
{
this->streamFilename = address;
}
// Returns the configured output target.
std::string FMVStream::getStreamAddress() const
{
return this->streamFilename;
}
Here is the Main function.
#include "FullMotionVideoStream.h"

#include <chrono>
#include <cstdint>
#include <iostream>
#include <thread>
#include <vector>
// Drives the streamer with 100 all-white frames followed by 100 all-black
// frames, pacing submissions at ~10 ms apart.
int main(int argc, char** argv)
{
    FMVStream* fmv = new FMVStream;
    fmv->setFrameWidth(640);
    fmv->setFrameHeight(480);
    std::cout << "Streaming Address: " << fmv->getStreamAddress() << std::endl;
    // create our alternating frames of black and white to test the streaming
    // functionality. fixed: the original declared two ~921 KB arrays on the
    // stack, which risks stack overflow; std::vector puts them on the heap.
    const size_t frameBytes = 640 * 480 * 3;
    std::vector<uint8_t> white(frameBytes, 255);
    std::vector<uint8_t> black(frameBytes, 0);
    for (auto i = 0; i < 100; i++)
    {
        auto ret = fmv->EncodeFrame(white.data());
        if (ret != 0)
        {
            std::cerr << "There was a problem encoding the frame: " << i << std::endl;
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
    }
    for (auto i = 0; i < 100; i++)
    {
        auto ret = fmv->EncodeFrame(black.data());
        if (ret != 0)
        {
            std::cerr << "There was a problem encoding the frame: " << i << std::endl;
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
    }
    delete fmv;
}
Here is the resultant output via the console / my print SDP function.
[libx264 # 000000ac95f58440] using cpu capabilities: MMX2 SSE2Fast SSSE3 SSE4.2
AVX FMA3 AVX2 LZCNT BMI2
[libx264 # 000000ac95f58440] profile Constrained Baseline, level 3.0
Output #0, mpegts, to '(null)':
Stream #0:0: Video: h264 (libx264), yuv420p, 640x480, q=-1--1, 400 kb/s, 30
tbn
SDP:
v=0
o=- 0 0 IN IP4 127.0.0.1
s=No Name
t=0 0
a=tool:libavformat 56.23.104
m=video 0 RTP/AVP 96
b=AS:400
a=rtpmap:96 H264/90000
a=fmtp:96 packetization-mode=1
a=control:streamid=0
Streaming Address: test.mpeg
[libx264 # 000000ac95f58440] frame I:45 Avg QP: 0.51 size: 1315
[libx264 # 000000ac95f58440] frame P:136 Avg QP: 0.29 size: 182
[libx264 # 000000ac95f58440] mb I I16..4: 99.7% 0.0% 0.3%
[libx264 # 000000ac95f58440] mb P I16..4: 0.1% 0.0% 0.1% P16..4: 0.1% 0.0
% 0.0% 0.0% 0.0% skip:99.7%
[libx264 # 000000ac95f58440] final ratefactor: -68.99
[libx264 # 000000ac95f58440] coded y,uvDC,uvAC intra: 0.5% 0.5% 0.5% inter: 0.0%
0.1% 0.1%
[libx264 # 000000ac95f58440] i16 v,h,dc,p: 96% 0% 3% 0%
[libx264 # 000000ac95f58440] i4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 1% 10% 85% 0% 3%
0% 1% 0% 0%
[libx264 # 000000ac95f58440] i8c dc,h,v,p: 100% 0% 0% 0%
[libx264 # 000000ac95f58440] ref P L0: 46.8% 25.2% 28.0%
[libx264 # 000000ac95f58440] kb/s:0.03
I know there are probably many issues with this program; I am very new to FFmpeg and multimedia programming in general, and I've assembled this from many pieces of code found through Google and Stack Overflow. The file has a reasonable size, but its reported duration of 0.04 seconds tells me that my timestamping between frames/packets must be broken, and I am unsure how to fix this issue.
I tried inspecting the file with ffmpeg.exe using ffmpeg -i and outputting to a regular TS. It seems my code works more then I originally intended however, I am simply trying to output a bunch of all white frames.
ffmpeg -i test.mpeg test.ts
ffmpeg version N-70125-g6c9537b Copyright (c) 2000-2015 the FFmpeg developers
built with gcc 4.9.2 (GCC)
configuration: --disable-static --enable-shared --enable-gpl --enable-version3
--disable-w32threads --enable-avisynth --enable-bzlib --enable-fontconfig --ena
ble-frei0r --enable-gnutls --enable-iconv --enable-libass --enable-libbluray --e
nable-libbs2b --enable-libcaca --enable-libfreetype --enable-libgme --enable-lib
gsm --enable-libilbc --enable-libmodplug --enable-libmp3lame --enable-libopencor
e-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libopus --enabl
e-librtmp --enable-libschroedinger --enable-libsoxr --enable-libspeex --enable-l
ibtheora --enable-libtwolame --enable-libvidstab --enable-libvo-aacenc --enable-
libvo-amrwbenc --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-l
ibwebp --enable-libx264 --enable-libx265 --enable-libxavs --enable-libxvid --ena
ble-lzma --enable-decklink --enable-zlib
libavutil 54. 19.100 / 54. 19.100
libavcodec 56. 26.100 / 56. 26.100
libavformat 56. 23.104 / 56. 23.104
libavdevice 56. 4.100 / 56. 4.100
libavfilter 5. 11.101 / 5. 11.101
libswscale 3. 1.101 / 3. 1.101
libswresample 1. 1.100 / 1. 1.100
libpostproc 53. 3.100 / 53. 3.100
Input #0, mpegts, from 'test.mpeg':
Duration: 00:00:00.04, start: 0.000000, bitrate: 24026 kb/s
Program 1
Metadata:
service_name : Service01
service_provider: FFmpeg
Stream #0:0[0x100]: Video: h264 (Constrained Baseline) ([27][0][0][0] / 0x00
1B), yuv420p, 640x480, 25 fps, 25 tbr, 90k tbn, 50 tbc
File 'test.ts' already exists. Overwrite ? [y/N] y
Output #0, mpegts, to 'test.ts':
Metadata:
encoder : Lavf56.23.104
Stream #0:0: Video: mpeg2video, yuv420p, 640x480, q=2-31, 200 kb/s, 25 fps,
90k tbn, 25 tbc
Metadata:
encoder : Lavc56.26.100 mpeg2video
Stream mapping:
Stream #0:0 -> #0:0 (h264 (native) -> mpeg2video (native))
Press [q] to stop, [?] for help
frame= 3 fps=0.0 q=2.0 Lsize= 9kB time=00:00:00.08 bitrate= 883.6kbits/
s dup=0 drop=178
video:7kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing ove
rhead: 22.450111%
avpicture_fill does not do what you think it does. It does not fill the picture using data from ptr, as the source, It fills the picture using ptr as the destination. So basically, you are clearing the image before you encode it.
on your av_packet_rescale_ts(pkt, *time_base, st->time_base);
you are using AvCodecContext::time_base
and you set AvCodecContext::framerate instead.
st->time_base.num = 1;
st->time_base.den = 30;
st->codec->framerate.num = 1;
st->codec->framerate.den = 30;
change to:
st->time_base.num = 1;
st->time_base.den = 30;
st->codec->time_base = st->time_base;
I am doing Finite Difference computation (Stencil Computation) on GPU (Fermi) using CUDA. When I tested my code using CUDA profiler, I found the occupany was 0.333. After I ordered my computation and increased the occupany to 0.677, the execution time of the kernel didn't decrease but increased. In other words, there was a decrease in performance when the occupany got increased by 1/3.
My question is:
Does the performance of the kernel depend on the computation irrespective of the occupancy?
The answer is "it depends", both on the characteristics of your workload and on how you define performance. Generally speaking, if your bottleneck is math throughput you're often fine with a lower occupancy (12.5%-33%), but if your bottleneck is memory then you usually want a higher occupancy (66% or higher). This is just a rule of thumb, not an absolute rule. Most kernels fall somewhere in the middle but there are exceptions at both extremes.
Occupancy is the maximum number of threads of your kernel that can be active at once (limited by register count per thread or other resources) divided by the maximum number of threads the GPU can have active when not limited by other resources. Active means the thread has hardware resources assigned and is available for scheduling, not that it has any instructions executing on a given clock cycle.
After issuing instruction i for a thread, the instruction i+1 for that thread might not be able to run immediately, if it depends on the result of instruction i. If that instruction is a math instruction, the result will be available in a few clock cycles. If it's a memory load instruction, it might be 100s of cycles. Rather than waiting, the GPU will issue instructions from some other thread who's dependencies are satisfied.
So if you're mostly doing math, you only need a few (few in GPU terms; on a CPU it would be considered many) threads to hide the few cycles of latency from math instructions, so you can get away with low occupancy. But if you've got a lot of memory traffic, you need more threads to ensure that some of them are ready to execute on every cycle, since each one spends a lot of time "sleeping" waiting for memory operations to complete.
If the algorithmic changes you made to increase occupancy also increased the amount of work done on each thread, and if you already had enough threads to keep the GPU busy, then the change will just slow you down. Increasing occupancy only improves performance up to the point where you have enough threads to keep the GPU busy.
Jesse Hall has already answered your question, so I will limit myself to complement his answer.
Occupancy is not the only figure of merit to take care of in order to maximize the algorithm performance, which most often coincide with the execution time. I suggest to take a look at the instructive GTC2010 presentation by Vasily Volkov:
Better Performance at Lower Occupancy
Below, I'm providing a simple example, inspired by Part II of the above presentation.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define BLOCKSIZE 512
//#define DEBUG
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/***********************************************/
/* MEMCPY1 - EACH THREAD COPIES ONE FLOAT ONLY */
/***********************************************/
/* Baseline copy kernel: each thread moves exactly one float (no ILP). */
__global__ void memcpy1(float *src, float *dst, unsigned int N)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;  // global element index
    if (gid < N) {
        const float value = src[gid];
        dst[gid] = value;
    }
}
/*******************************************/
/* MEMCPY2 - EACH THREAD COPIES TWO FLOATS */
/*******************************************/
/* Copy kernel with 2-way ILP: each thread moves two floats, blockDim.x apart.
 * Launch with iDivUp(N/2, blockDim.x) blocks.
 * FIX: the original guarded only `tid < N`, yet also accessed
 * src/dst[tid + blockDim.x]; when N is not a multiple of the per-block
 * element count that second access ran out of bounds. The fast path below
 * keeps the grouped load-load/store-store pattern (the point of the ILP
 * demo); the tail path handles the final partial chunk safely. */
__global__ void memcpy2(float *src, float *dst, unsigned int N)
{
    const int tid = threadIdx.x + blockIdx.x * (2 * blockDim.x);
    if (tid + blockDim.x < N) {
        // Full chunk: both elements in range — issue both loads before stores.
        float a0 = src[tid];
        float a1 = src[tid + blockDim.x];
        dst[tid] = a0;
        dst[tid + blockDim.x] = a1;
    } else if (tid < N) {
        // Tail: only the first element is in range.
        dst[tid] = src[tid];
    }
}
/********************************************/
/* MEMCPY4 - EACH THREAD COPIES FOUR FLOATS */
/********************************************/
/* Copy kernel with 4-way ILP: each thread moves four floats, blockDim.x apart.
 * Launch with iDivUp(N/4, blockDim.x) blocks.
 * FIX: the original guarded only `tid < N` but accessed up to
 * tid + 3*blockDim.x, reading/writing out of bounds whenever N is not a
 * multiple of the per-block element count. The fast path preserves the
 * grouped loads-then-stores that provide the instruction-level parallelism;
 * the tail path checks each element individually. */
__global__ void memcpy4(float *src, float *dst, unsigned int N)
{
    const int tid = threadIdx.x + blockIdx.x * (4 * blockDim.x);
    if (tid + 3 * blockDim.x < N) {
        // Full chunk: all four elements in range.
        float a0 = src[tid];
        float a1 = src[tid + blockDim.x];
        float a2 = src[tid + 2 * blockDim.x];
        float a3 = src[tid + 3 * blockDim.x];
        dst[tid] = a0;
        dst[tid + blockDim.x] = a1;
        dst[tid + 2 * blockDim.x] = a2;
        dst[tid + 3 * blockDim.x] = a3;
    } else {
        // Tail: copy whichever of the four elements are still in range.
        for (int k = 0; k < 4; k++) {
            const unsigned int idx = tid + k * blockDim.x;
            if (idx < N) dst[idx] = src[idx];
        }
    }
}
/***********************************************/
/* MEMCPY4_2 - EACH THREAD COPIES FOUR FLOATS2 */
/***********************************************/
/* Copy kernel with 4-way ILP over float2: each thread moves four float2
 * values (eight floats). N is the total number of FLOATS, so there are N/2
 * float2 elements; launch with iDivUp(N/8, blockDim.x) blocks.
 * FIX: the original guarded only `tid < N/2` but accessed up to
 * tid + 3*blockDim.x, going out of bounds whenever N/2 is not a multiple of
 * the per-block element count. Fast path keeps the grouped loads/stores. */
__global__ void memcpy4_2(float2 *src, float2 *dst, unsigned int N)
{
    const int tid = threadIdx.x + blockIdx.x * (4 * blockDim.x);
    const unsigned int M = N / 2;   // number of float2 elements
    if (tid + 3 * blockDim.x < M) {
        // Full chunk: all four float2 elements in range.
        float2 a0 = src[tid];
        float2 a1 = src[tid + blockDim.x];
        float2 a2 = src[tid + 2 * blockDim.x];
        float2 a3 = src[tid + 3 * blockDim.x];
        dst[tid] = a0;
        dst[tid + blockDim.x] = a1;
        dst[tid + 2 * blockDim.x] = a2;
        dst[tid + 3 * blockDim.x] = a3;
    } else {
        // Tail: copy whichever elements are still in range.
        for (int k = 0; k < 4; k++) {
            const unsigned int idx = tid + k * blockDim.x;
            if (idx < M) dst[idx] = src[idx];
        }
    }
}
/********/
/* MAIN */
/********/
void main()
{
const int N = 131072;
const int N_iter = 20;
// --- Setting host data and memory space for result
float* h_vect = (float*)malloc(N*sizeof(float));
float* h_result = (float*)malloc(N*sizeof(float));
for (int i=0; i<N; i++) h_vect[i] = i;
// --- Setting device data and memory space for result
float* d_src; gpuErrchk(cudaMalloc((void**)&d_src, N*sizeof(float)));
float* d_dest1; gpuErrchk(cudaMalloc((void**)&d_dest1, N*sizeof(float)));
float* d_dest2; gpuErrchk(cudaMalloc((void**)&d_dest2, N*sizeof(float)));
float* d_dest4; gpuErrchk(cudaMalloc((void**)&d_dest4, N*sizeof(float)));
float* d_dest4_2; gpuErrchk(cudaMalloc((void**)&d_dest4_2, N*sizeof(float)));
gpuErrchk(cudaMemcpy(d_src, h_vect, N*sizeof(float), cudaMemcpyHostToDevice));
// --- Warmup
for (int i=0; i<N_iter; i++) memcpy1<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_src, d_dest1, N);
// --- Creating events for timing
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
/***********/
/* MEMCPY1 */
/***********/
cudaEventRecord(start, 0);
for (int i=0; i<N_iter; i++) {
memcpy1<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_src, d_dest1, N);
#ifdef DEGUB
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("GB/s = %f\n", (1.e-6)*(float)(N*N_iter*sizeof(float))/time);
gpuErrchk(cudaMemcpy(h_result, d_dest1, N*sizeof(int), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) if(h_result[i] != h_vect[i]) { printf("Error at i=%i! Host = %i; Device = %i\n", i, h_vect[i], h_result[i]); return; }
/***********/
/* MEMCPY2 */
/***********/
cudaEventRecord(start, 0);
for (int i=0; i<N_iter; i++) {
memcpy2<<<iDivUp(N/2,BLOCKSIZE), BLOCKSIZE>>>(d_src, d_dest2, N);
#ifdef DEGUB
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("GB/s = %f\n", (1.e-6)*(float)(N*N_iter*sizeof(float))/time);
gpuErrchk(cudaMemcpy(h_result, d_dest2, N*sizeof(int), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) if(h_result[i] != h_vect[i]) { printf("Error at i=%i! Host = %i; Device = %i\n", i, h_vect[i], h_result[i]); return; }
/***********/
/* MEMCPY4 */
/***********/
cudaEventRecord(start, 0);
for (int i=0; i<N_iter; i++) {
memcpy4<<<iDivUp(N/4,BLOCKSIZE), BLOCKSIZE>>>(d_src, d_dest4, N);
#ifdef DEGUB
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("GB/s = %f\n", (1.e-6)*(float)(N*N_iter*sizeof(float))/time);
gpuErrchk(cudaMemcpy(h_result, d_dest4, N*sizeof(int), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) if(h_result[i] != h_vect[i]) { printf("Error at i=%i! Host = %i; Device = %i\n", i, h_vect[i], h_result[i]); return; }
/*************/
/* MEMCPY4_2 */
/*************/
cudaEventRecord(start, 0);
for (int i=0; i<N_iter; i++) {
memcpy4_2<<<iDivUp(N/8,BLOCKSIZE), BLOCKSIZE>>>((float2*)d_src, (float2*)d_dest4_2, N);
#ifdef DEGUB
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("GB/s = %f\n", (1.e-6)*(float)(N*N_iter*sizeof(float))/time);
gpuErrchk(cudaMemcpy(h_result, d_dest4_2, N*sizeof(int), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) if(h_result[i] != h_vect[i]) { printf("Error at i=%i! Host = %i; Device = %i\n", i, h_vect[i], h_result[i]); return; }
cudaDeviceReset();
}
Below, the performance of the above code, when run on a GeForce GT540M, a Kepler K20c, and a Tesla C2050.
BLOCKSIZE 32
GT540M K20c Tesla C2050
memcpy1 2.3GB/s 13% 28.1GB/s 18% 14.9GB/s 12%
memcpy2 4.4GB/s 13% 41.1GB/s 18% 24.8GB/s 13%
memcpy4 7.5GB/s 13% 54.8GB/s 18% 34.6GB/s 13%
memcpy4_2 11.2GB/s 14% 68.8GB/s 18% 44.0GB/s 14%
BLOCKSIZE 64
GT540M K20c Tesla C2050
memcpy1 4.6GB/s 27% 44.1GB/s 36% 26.1GB/s 26%
memcpy2 8.1GB/s 27% 57.1GB/s 36% 35.7GB/s 26%
memcpy4 11.4GB/s 27% 63.2GB/s 36% 43.5GB/s 26%
memcpy4_2 12.6GB/s 27% 72.8GB/s 36% 49.7GB/s 27%
BLOCKSIZE 128
GT540M K20c Tesla C2050
memcpy1 8.0GB/s 52% 60.6GB/s 78% 36.1GB/s 52%
memcpy2 11.6GB/s 52% 61.6GB/s 78% 44.8GB/s 52%
memcpy4 12.4GB/s 52% 62.2GB/s 78% 48.3GB/s 52%
memcpy4_2 12.5GB/s 52% 61.9GB/s 78% 49.5GB/s 52%
BLOCKSIZE 256
GT540M K20c Tesla C2050
memcpy1 10.6GB/s 80% 61.2GB/s 74% 42.0GB/s 77%
memcpy2 12.3GB/s 80% 66.2GB/s 74% 48.2GB/s 77%
memcpy4 12.4GB/s 80% 66.4GB/s 74% 45.5GB/s 77%
memcpy4_2 12.6GB/s 70% 72.6GB/s 74% 50.8GB/s 77%
BLOCKSIZE 512
GT540M K20c Tesla C2050
memcpy1 10.3GB/s 80% 54.5GB/s 75% 41.6GB/s 75%
memcpy2 12.2GB/s 80% 67.1GB/s 75% 47.7GB/s 75%
memcpy4 12.4GB/s 80% 67.9GB/s 75% 46.9GB/s 75%
memcpy4_2 12.5GB/s 55% 70.1GB/s 75% 48.3GB/s 75%
The above results show that you can have better performance, i.e. 12GB/s for the GT540M case, with lower occupancy, i.e. 27%, if you properly exploit Instruction Level Parallelism (ILP) by giving each thread more work to do in order to hide latency.