I have implemented the strlen() function in several different ways, including SSE2 assembly, SSE4.2 assembly and SSE2 intrinsics. I also ran some experiments on them, comparing against strlen() from <string.h> and strlen() in glibc. However, their performance in terms of milliseconds (time) is not what I expected.
My experiment environment:
CentOS 7.0 + gcc 4.8.5 + Intel Xeon
Following are my implementations:
1. strlen using SSE2 assembly
long strlen_sse2_asm(const char* src){
    long result = 0;
    asm(
        "movl %1, %%edi\n\t"
        "movl $-0x10, %%eax\n\t"
        "pxor %%xmm0, %%xmm0\n\t"
        "lloop:\n\t"
        "addl $0x10, %%eax\n\t"
        "movdqu (%%edi,%%eax), %%xmm1\n\t"
        "pcmpeqb %%xmm0, %%xmm1\n\t"
        "pmovmskb %%xmm1, %%ecx\n\t"
        "test %%ecx, %%ecx\n\t"
        "jz lloop\n\t"
        "bsf %%ecx, %%ecx\n\t"
        "addl %%ecx, %%eax\n\t"
        "movl %%eax, %0"
        :"=r"(result)
        :"r"(src)
        :"%eax"
    );
    return result;
}
2. strlen using SSE4.2 assembly
long strlen_sse4_2_asm(const char* src){
    long result = 0;
    asm(
        "movl %1, %%edi\n\t"
        "movl $-0x10, %%eax\n\t"
        "pxor %%xmm0, %%xmm0\n\t"
        "lloop2:\n\t"
        "addl $0x10, %%eax\n\t"
        "pcmpistri $0x08,(%%edi, %%eax), %%xmm0\n\t"
        "jnz lloop2\n\t"
        "add %%ecx, %%eax\n\t"
        "movl %%eax, %0"
        :"=r"(result)
        :"r"(src)
        :"%eax"
    );
    return result;
}
3. strlen using SSE2 intrinsic
long strlen_sse2_intrin_align(const char* src){
    if (src == NULL || *src == '\0'){
        return 0;
    }
    const __m128i zero = _mm_setzero_si128();
    const __m128i* ptr = (const __m128i*)src;

    if(((size_t)ptr & 0xF) != 0){
        __m128i xmm = _mm_loadu_si128(ptr);
        unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(xmm, zero));
        if(mask != 0){
            return (const char*)ptr - src + (size_t)ffs(mask);
        }
        ptr = (__m128i*)(0x10 + (size_t)ptr & ~0xF);
    }
    for (;; ptr++){
        __m128i xmm = _mm_load_si128(ptr);
        unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(xmm, zero));
        if (mask != 0)
            return (const char*)ptr - src + (size_t)ffs(mask);
    }
}
I also looked up the one implemented in the Linux kernel; the following is its implementation:
size_t strlen_inline_asm(const char* str){
    int d0;
    size_t res;
    asm volatile("repne\n\t"
                 "scasb"
                 : "=c" (res), "=&D" (d0)
                 : "1" (str), "a" (0), "0" (0xffffffffu)
                 : "memory");
    return ~res - 1;
}
In my experiment, I also added the standard library version and compared their performance.
The following is my main function code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <xmmintrin.h>
#include <x86intrin.h>
#include <emmintrin.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>
int main()
{
    struct timeval tpstart, tpend;
    char test_str[1024];
    int i = 0;

    for(; i < 1023; i++){
        test_str[i] = 'a';
    }
    test_str[i] = '\0';

    gettimeofday(&tpstart, NULL);
    for(i = 0; i < 10000000; i++)
        strlen(test_str);
    gettimeofday(&tpend, NULL);
    printf("strlen from string.h--->%lf\n", (tpend.tv_sec-tpstart.tv_sec)*1000+(tpend.tv_usec-tpstart.tv_usec)/1000.0);

    gettimeofday(&tpstart, NULL);
    for(i = 0; i < 10000000; i++)
        strlen_inline_asm(test_str);
    gettimeofday(&tpend, NULL);
    printf("strlen_inline_asm--->%lf\n", (tpend.tv_sec-tpstart.tv_sec)*1000+(tpend.tv_usec-tpstart.tv_usec)/1000.0);

    gettimeofday(&tpstart, NULL);
    for(i = 0; i < 10000000; i++)
        strlen_sse2_asm(test_str);
    gettimeofday(&tpend, NULL);
    printf("strlen_sse2_asm--->%lf\n", (tpend.tv_sec-tpstart.tv_sec)*1000+(tpend.tv_usec-tpstart.tv_usec)/1000.0);

    gettimeofday(&tpstart, NULL);
    for(i = 0; i < 10000000; i++)
        strlen_sse4_2_asm(test_str);
    gettimeofday(&tpend, NULL);
    printf("strlen_sse4_2_asm--->%lf\n", (tpend.tv_sec-tpstart.tv_sec)*1000+(tpend.tv_usec-tpstart.tv_usec)/1000.0);

    gettimeofday(&tpstart, NULL);
    for(i = 0; i < 10000000; i++)
        strlen_sse2_intrin_align(test_str);
    gettimeofday(&tpend, NULL);
    printf("strlen_sse2_intrin_align--->%lf\n", (tpend.tv_sec-tpstart.tv_sec)*1000+(tpend.tv_usec-tpstart.tv_usec)/1000.0);

    return 0;
}
The result is : (ms)
strlen from string.h--->23.518000
strlen_inline_asm--->222.311000
strlen_sse2_asm--->782.907000
strlen_sse4_2_asm--->955.960000
strlen_sse2_intrin_align--->3499.586000
I have some questions about it:
1. Why is the strlen from string.h so fast? I think its code should be identical to strlen_inline_asm, because I copied the code from /linux-4.2.2/arch/x86/lib/string_32.c [http://lxr.oss.org.cn/source/arch/x86/lib/string_32.c#L164].
2. Why do the SSE2 intrinsics and SSE2 assembly versions differ so much in performance?
3. Could someone show me how to disassemble the code, so that I can see what the compiler has turned the static library's strlen into? I used gcc -S but didn't find the disassembly of strlen from <string.h>.
I think my code may not be very good; I would appreciate it if you could help me improve it, especially the assembly versions.
Thanks.
Like I said in comments, your biggest error is benchmarking with -O0. I discussed exactly why testing with -O0 is a terrible idea in the first part of another post.
Benchmarks should be done with at least -O2, preferably with the same optimizations as your full project will build with, if you're trying to test what source makes the fastest asm.
-O0 explains inline asm being way faster than C with intrinsics (or than regularly compiled C, for a C strlen implementation borrowed from glibc).
IDK if -O0 would still optimize away a loop that repeatedly discards the result of library strlen, or if it somehow just avoided some other huge performance pitfall. It's not interesting to guess about exactly what happened in such a flawed test.
I tightened up your SSE2 inline-asm version. Mostly just because I've been playing with gcc inline asm input/output constraints recently, and wanted to see what it would look like if I wrote it to let the compiler choose which registers to use for temporaries, and avoided unneeded instructions.
The same inline asm works for 32 and 64-bit x86 targets; see this compiled for both on the Godbolt compiler explorer. When compiled as a stand-alone function, it doesn't have to save/restore any registers, even in 32-bit mode:
WARNING: it can read past the end of the string by up to 15 bytes. This could segfault. See Is it safe to read past the end of a buffer within the same page on x86 and x64? for details on avoiding that: get to an alignment boundary, then use aligned loads because that's always safe if the vector contains at least 1 byte of string data. I left the code unchanged because it's interesting to discuss the effect of aligning pointers for SSE vs. AVX. Aligning pointers also avoids cache-line splits, and 4k page-splits (which are a performance pothole before Skylake).
#include <immintrin.h>

size_t strlen_sse2_asm(const char* src){
    // const char *orig_src = src; // for a pointer-increment with a "+r" (src) output operand
    size_t result = 0;
    unsigned int tmp1;
    __m128i zero = _mm_setzero_si128(), vectmp;

    // A pointer-increment may perform better than an indexed addressing mode
    asm(
        "\n.Lloop:\n\t"
        "movdqu   (%[src], %[res]), %[vectmp]\n\t"  // result reg is used as the loop counter
        "pcmpeqb  %[zerovec], %[vectmp]\n\t"
        "pmovmskb %[vectmp], %[itmp]\n\t"
        "add      $0x10, %[res]\n\t"
        "test     %[itmp], %[itmp]\n\t"
        "jz  .Lloop\n\t"

        "bsf %[itmp], %[itmp]\n\t"
        "add %q[itmp], %q[res]\n\t"   // q modifier to get quadword register.
        // (add %edx, %rax doesn't work).  But in 32bit mode, q gives a 32bit reg, so the same code works
        : [res] "+r"(result), [vectmp] "=&x" (vectmp), [itmp] "=&r" (tmp1)
        : [zerovec] "x" (zero)   // There might already be a zeroed vector reg when inlining
        , [src] "r"(src)
        , [dummy] "m" (*(const char (*)[])src)  // this reads the whole object, however long gcc thinks it is
        : //"memory"   // not needed because of the dummy input
    );
    return result;
    // return result + tmp1;  // doing the add outside the asm makes gcc sign or zero-extend tmp1.
    // No benefit anyway, since gcc doesn't know that tmp1 is the offset within a 16B chunk or anything.
}
Note the dummy input, used as an alternative to a "memory" clobber, which tells the compiler that the inline asm reads the memory pointed to by src as well as the value of src itself. (The compiler doesn't know what the asm does; for all it knows, the asm just aligns a pointer with an AND or something, so assuming that all input pointers are dereferenced would lead to missed optimizations from reordering / combining loads and stores across the asm. This also lets the compiler know we only read the memory, not modify it.) The GCC manual uses an example with this unspecified-length array syntax, "m" (*(const char (*)[])src).
It should keep register pressure to a minimum when inlining, and doesn't tie up any special-purpose registers (like ecx which is needed for variable-count shifts).
If you could shave another uop out of the inner loop, it would be down to 4 uops that could issue at one per cycle. As it is, 5 uops means each iteration may take 2 cycles to issue from the frontend, on Intel SnB CPUs. (Or 1.25 cycles on later CPUs like Haswell, and maybe on SnB if I was wrong about the whole-number behaviour.)
Using an aligned pointer would allow the load to fold into a memory operand for pcmpeqb. (As well as being necessary for correctness if the string start is unaligned and the end is near the end of a page). Interestingly, using the zero-vector as the destination for pcmpeqb is ok in theory: you don't need to re-zero the vector between iterations, because you exit the loop if it's ever non-zero. It has 1-cycle latency, so turning the zero vector into a loop-carried dependency is only a problem when cache-misses delay an old iteration. Removing this loop-carried dependency chain might help in practice, though, by letting the back end go faster when catching up after a cache miss that delayed an old iteration.
AVX solves the problem completely (except for correctness if the string ends near the end of a page). AVX allows the load to be folded even without doing an alignment check first. 3-operand non-destructive vpcmpeqb avoids turning the zero vector into a loop-carried dependency. AVX2 would allow checking 32B at once.
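To make the 32-bytes-at-a-time point concrete, here is a rough AVX2 intrinsics sketch (my own illustration, not part of the original answer; the function name is made up and it needs -mavx2). It reuses the aligned-load trick from the warning above, so it never reads across a page boundary:

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

static size_t strlen_avx2_sketch(const char *src)
{
    const __m256i zero = _mm256_setzero_si256();
    // Round src down to a 32B boundary: an aligned 32B load never crosses a page,
    // so reading a few bytes before the start of the string is safe.
    const char *p = (const char *)((uintptr_t)src & ~(uintptr_t)31);
    unsigned mask = (unsigned)_mm256_movemask_epi8(
        _mm256_cmpeq_epi8(_mm256_load_si256((const __m256i *)p), zero));
    mask >>= (src - p);                 // discard match bits for bytes before src
    if (mask)
        return (size_t)__builtin_ctz(mask);
    for (;;) {
        p += 32;
        mask = (unsigned)_mm256_movemask_epi8(
            _mm256_cmpeq_epi8(_mm256_load_si256((const __m256i *)p), zero));
        if (mask)
            return (size_t)(p - src) + __builtin_ctz(mask);
    }
}

With intrinsics the compiler folds the loads into vpcmpeqb itself and keeps the zero vector intact, so the non-destructive-destination point applies automatically.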
Unrolling will help either way, but helps more without AVX. Align to a 64B boundary or something, and then load the whole cache line into four 16B vectors. Doing a combined check on the result of PORing them all together may be good, since pmovmsk + compare-and-branch is 2 uops.
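Here is a hedged intrinsics sketch of that unrolled idea (again my own illustration, with made-up names, and relying on the fact that aligned loads never leave the 64B line):

#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

static size_t strlen_sse2_unrolled(const char *src)
{
    const __m128i zero = _mm_setzero_si128();
    const char *p = src;
    // Walk byte by byte until p reaches a 64B (cache-line) boundary.
    while (((uintptr_t)p & 63) != 0) {
        if (*p == '\0')
            return (size_t)(p - src);
        ++p;
    }
    for (;; p += 64) {
        __m128i v0 = _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)(p +  0)), zero);
        __m128i v1 = _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)(p + 16)), zero);
        __m128i v2 = _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)(p + 32)), zero);
        __m128i v3 = _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)(p + 48)), zero);
        // One pmovmskb + test + branch per cache line: POR the four compare results first.
        __m128i any = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
        if (_mm_movemask_epi8(any)) {
            // A zero byte is somewhere in this 64B line; build one 64-bit mask to locate it.
            unsigned long long mask =
                  (unsigned long long)(unsigned)_mm_movemask_epi8(v0)
                | ((unsigned long long)(unsigned)_mm_movemask_epi8(v1) << 16)
                | ((unsigned long long)(unsigned)_mm_movemask_epi8(v2) << 32)
                | ((unsigned long long)(unsigned)_mm_movemask_epi8(v3) << 48);
            return (size_t)(p - src) + (size_t)__builtin_ctzll(mask);
        }
    }
}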
Using SSE4.1 PTEST doesn't help (compared to pmovmsk / test / jnz) because it's 2 uops and can't macro-fuse the way test can.
PTEST can directly test for the whole 16B vector being all-zero or all-ones (using ANDNOT -> CF part), but not if one of the byte-elements is zero. (So we can't avoid pcmpeqb).
Have a look at Agner Fog's guides for optimizing asm, and the other links on the x86 wiki. Most optimization guides (Agner Fog's, and Intel's and AMD's) mention optimizing memcpy and strlen specifically, IIRC.
If you read the source of the strlen function in the glibc, you can see that the function is not testing the string char by char, but longword by longword with complex bitwise operations : http://www.stdlib.net/~colmmacc/strlen.c.html. I guess it explains its speed, but the fact that it's even faster than rep instructions in assembly is indeed quite surprising.
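For reference, a minimal sketch of that word-at-a-time idea (the classic "does this word contain a zero byte?" bit trick, not the actual glibc source; the function name is made up): a zero byte anywhere in v makes (v - 0x01010101) & ~v & 0x80808080 nonzero.

#include <stddef.h>
#include <stdint.h>

static size_t strlen_wordwise(const char *s)
{
    const char *p = s;
    // Advance byte by byte until p is aligned for 32-bit loads.
    while ((uintptr_t)p % sizeof(uint32_t) != 0) {
        if (*p == '\0')
            return (size_t)(p - s);
        ++p;
    }
    const uint32_t *w = (const uint32_t *)p;
    for (;;) {
        uint32_t v = *w;
        if ((v - 0x01010101u) & ~v & 0x80808080u) {
            // One of these four bytes is zero; find which one.
            const char *q = (const char *)w;
            while (*q != '\0')
                ++q;
            return (size_t)(q - s);
        }
        ++w;
    }
}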
Related
I looked into some C code from
http://www.mcs.anl.gov/~kazutomo/rdtsc.html
They use stuff like __inline__, __asm__ etc like the following:
code1:
static __inline__ tick gettick (void) {
    // "tick" is assumed to be a 64-bit integer type, e.g. typedef unsigned long long tick;
    unsigned a, d;
    __asm__ __volatile__("rdtsc": "=a" (a), "=d" (d) );
    return (((tick)a) | (((tick)d) << 32));
}
code2:
volatile int __attribute__((noinline)) foo2 (int a0, int a1) {
    __asm__ __volatile__ ("");
}
I was wondering, what do code1 and code2 do?
(Editor's note: for this specific RDTSC use case, intrinsics are preferred: How to get the CPU cycle count in x86_64 from C++? See also https://gcc.gnu.org/wiki/DontUseInlineAsm)
The __volatile__ modifier on an __asm__ block tells the compiler's optimizer that the code must be emitted where it appears. Without it, the optimizer may decide the block can be removed outright, or hoisted out of a loop and its result cached.
This is useful for the rdtsc instruction like so:
__asm__ __volatile__("rdtsc": "=a" (a), "=d" (d) )
This asm statement has no declared inputs, so the compiler might assume its outputs can be computed once and cached. Volatile is used to force it to read a fresh timestamp each time.
When used alone, like this:
__asm__ __volatile__ ("")
It will not actually execute anything. You can extend this, though, to get a compile-time memory barrier that won't allow reordering any memory access instructions:
__asm__ __volatile__ ("":::"memory")
The rdtsc instruction is a good example for volatile. rdtsc is usually used when you need to time how long some instructions take to execute. Imagine some code like this, where you want to time r1 and r2's execution:
__asm__ ("rdtsc": "=a" (a0), "=d" (d0) )
r1 = x1 + y1;
__asm__ ("rdtsc": "=a" (a1), "=d" (d1) )
r2 = x2 + y2;
__asm__ ("rdtsc": "=a" (a2), "=d" (d2) )
Here the compiler is actually allowed to cache the timestamp, and valid output might show that each line took exactly 0 clocks to execute. Obviously this isn't what you want, so you introduce __volatile__ to prevent caching:
__asm__ __volatile__("rdtsc": "=a" (a0), "=d" (d0))
r1 = x1 + y1;
__asm__ __volatile__("rdtsc": "=a" (a1), "=d" (d1))
r2 = x2 + y2;
__asm__ __volatile__("rdtsc": "=a" (a2), "=d" (d2))
Now you'll get a new timestamp each time, but it still has a problem that both the compiler and the CPU are allowed to reorder all of these statements. It could end up executing the asm blocks after r1 and r2 have already been calculated. To work around this, you'd add some barriers that force serialization:
__asm__ __volatile__("mfence;rdtsc": "=a" (a0), "=d" (d0) :: "memory")
r1 = x1 + y1;
__asm__ __volatile__("mfence;rdtsc": "=a" (a1), "=d" (d1) :: "memory")
r2 = x2 + y2;
__asm__ __volatile__("mfence;rdtsc": "=a" (a2), "=d" (d2) :: "memory")
Note the mfence instruction here, which enforces a CPU-side barrier, and the "memory" specifier in the volatile block which enforces a compile-time barrier. On modern CPUs you can replace mfence;rdtsc with rdtscp for something more efficient.
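As a hedged sketch of that rdtscp alternative: rdtscp waits until all earlier instructions have executed before reading the counter (it is not fully serializing, so a trailing lfence is sometimes added as well). It returns the TSC in EDX:EAX and the value of IA32_TSC_AUX (a core/socket ID) in ECX; the helper name below is made up.

static inline unsigned long long read_tscp(void)
{
    unsigned int lo, hi, aux;
    __asm__ __volatile__ ("rdtscp" : "=a" (lo), "=d" (hi), "=c" (aux) :: "memory");
    return ((unsigned long long)hi << 32) | lo;
}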
asm is for including native Assembly code into the C source code. E.g.
int a = 2;
asm("movl $3, %0" : "=r"(a));  // GCC extended asm: store the constant 3 into a
printf("%i", a);               // will print 3
Compilers have different variants of it. __asm__ should be synonymous, maybe with some compiler-specific differences.
volatile means the variable can be modified from outside the program's control (i.e. not by the C code itself). For instance, when programming a microcontroller, a fixed memory address (0x00001111 below) may be mapped to some device-specific interface (e.g. when coding for the GameBoy, buttons/screen/etc. are accessed this way):
volatile std::uint8_t* const button1 = reinterpret_cast<volatile std::uint8_t*>(0x00001111);
This disables compiler optimizations that rely on *button1 not changing unless it is changed by the code.
It is also used in multi-threaded programming (largely superseded by std::atomic nowadays) where a variable might be modified by another thread.
inline is a hint to the compiler to "inline" calls to a function.
inline int f(int a) {
    return a + 1;
}

int a;
int b = f(a);
This should not be compiled into a function call to f, but into int b = a + 1. As if f were a macro. Compilers mostly do this optimization automatically, depending on function usage/content. __inline__ in this example might have a more specific meaning.
Similarly, __attribute__((noinline)) (GCC-specific syntax) prevents a function from being inlined.
The __asm__ attribute specifies the name to be used in assembler code for the function or variable.
The __volatile__ qualifier, often seen in real-time computing on embedded systems, addresses a problem where optimization breaks code that polls a status register for the ERROR or READY bit. __volatile__ was introduced as a way of telling the compiler that the object is subject to rapid change, and to force every reference to the object to be a genuine memory reference.
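For example, the asm-label form of __asm__ that this answer describes looks like the following (the symbol names here are made up); the C identifier and the assembler-level symbol can differ:

extern int error_count __asm__("hw_error_count");            /* variable resolved as hw_error_count in the object file */
extern int read_status(void) __asm__("board_read_status");   /* function resolved as board_read_status */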
The following code fragment creates a function (fun) with just one RET instruction.
The loop repeatedly calls the function and overwrites the contents of the RET instruction after returning.
#include <sys/mman.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>

typedef void (*foo)();
#define RET (0xC3)

int main(){
    // Allocate an executable page
    char *ins = (char *) mmap(0, 4096, PROT_EXEC|PROT_READ|PROT_WRITE,
                              MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
    // Just write a RET instruction
    *ins = RET;
    // make fun point to the function with just RET instruction
    foo fun = (foo)(ins);
    // Repeat 0xfffffff times
    for(long i = 0; i < 0xfffffff; i++){
        fun();
        *ins = RET;
    }
    return 0;
}
Linux perf on an x86 Broadwell machine reports the following i-cache and iTLB statistics:
perf stat -e L1-icache-load-misses -e iTLB-load-misses ./a.out
Performance counter stats for './a.out':
805,516,067 L1-icache-load-misses
4,857 iTLB-load-misses
32.052301220 seconds time elapsed
Now, look at the same code without overwriting the RET instruction.
#include <sys/mman.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>

typedef void (*foo)();
#define RET (0xC3)

int main(){
    // Allocate an executable page
    char *ins = (char *) mmap(0, 4096, PROT_EXEC|PROT_READ|PROT_WRITE,
                              MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
    // Just write a RET instruction
    *ins = RET;
    // make fun point to the function with just RET instruction
    foo fun = (foo)(ins);
    // Repeat 0xfffffff times
    for(long i = 0; i < 0xfffffff; i++){
        fun();
        // Commented *ins = RET;
    }
    return 0;
}
And here are the perf statistics on the same machine.
perf stat -e L1-icache-load-misses -e iTLB-load-misses ./a.out
Performance counter stats for './a.out':
11,738 L1-icache-load-misses
425 iTLB-load-misses
0.773433500 seconds time elapsed
Notice that overwriting the instruction causes L1-icache-load-misses to grow from 11,738 to 805,516,067 -- roughly a 68,000x increase.
Also notice that iTLB-load-misses grows from 425 to 4,857 -- a large increase as well, though far smaller than the i-cache one.
The running time grows from 0.773433500 seconds to 32.052301220 seconds -- a 41x growth!
It is unclear why the CPU should cause i-cache misses if the instruction footprint is so small. The only difference in the two examples is that the instruction is modified. Granted the L1 iCache and dCache are separate, isn't there a way to install code into iCache so that the cache i-cache misses can be avoided?
Furthermore, why is there a 10x growth in the iTLB misses?
Granted the L1 iCache and dCache are separate, isn't there a way to install code into iCache so that the cache i-cache misses can be avoided?
No.
If you want to modify code, the only path it can take is the following:
Store Data Execution Engine
Store Buffer & Forwarding
L1 Data Cache
Unified L2 Cache
L1 Instruction Cache
Note that you are also missing out on the μOP Cache.
This is illustrated by this diagram [1], which I believe is sufficiently accurate.
I would suspect the iTLB misses could be due to regular TLB flushes. In case of no modification you are not affected by iTLB misses because your instructions actually come from the μOP Cache.
If they don't, I'm not quite sure. I would think the L1 Instruction Cache is virtually addressed, so no need to access the TLB if there is a hit.
[1] Unfortunately the image has a very restrictive copyright, so I refrain from highlighting the path / inlining the image.
Greetings, everyone.
Due to some special reasons, we have to re-implement the AVX2 intrinsics in the following way:
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_xmm256_and_si256(__m256i s1, __m256i s2){
    __m256i result;
    __asm__ ("vpand %2, %1, %0": "=r"(result): "rm" "s1", "rm" "s2" );
    // sorry, this statement does not work
    return result;
}
The corresponding function is _mm256_and_si256(__m256i s1, __m256i s2), which is an AVX2 intrinsic.
After some searching on Google, I found similar examples, but they only connect basic types like int, float and long to input registers.
However, I still haven't found a way to connect the input parameters s1 and s2 to the ymm input registers used by the vpand instruction.
Is anyone here willing to help me make the above example work?
Thank you so much in advance!!
The r constraint is for general purpose registers, and your asm block has wrong syntax anyway. The appropriate constraint for avx is x, and also mind that only one operand can be in memory (although that could be either one, which this template doesn't handle). Furthermore the nodebug attribute doesn't seem to exist.
As such, something like this will work better:
__attribute__((always_inline)) inline __m256i
_xmm256_and_si256(__m256i s1, __m256i s2)
{
    __m256i result;
    __asm__ ("vpand %2, %1, %0" : "=x"(result) : "x"(s1), "xm"(s2));
    return result;
}
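A quick usage check for the wrapper above (my own test scaffold; it assumes the fixed _xmm256_and_si256 is in scope and the file is compiled with -mavx2):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m256i a = _mm256_set1_epi32(0x0F0F0F0F);
    __m256i b = _mm256_set1_epi32(0x33333333);
    __m256i c = _xmm256_and_si256(a, b);      // every 32-bit lane should be 0x03030303
    int out[8];
    _mm256_storeu_si256((__m256i *)out, c);
    printf("0x%08X\n", out[0]);
    return 0;
}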
Until now I have used my own min() function (for float and int) based on an if, but I just learned that x86 has an instruction for this:
MINSS - Minimum of operands
I suspect an if-based min() routine is counterproductive, and I am very careful about optimization, so I would like to rewrite my routine as a minss version with some inline assembly. I would like to find out what the most efficient version of this would look like in gcc inline assembly. I need something like

int min(int a, int b)
{
    // minss a, b
    // return
}

for both int and float, using the minss opcode with a minimal prologue and epilogue. Or would just using the library version be faster? I would prefer not to use the library min/max, and to have this as fast as possible.
Here is the most efficient possible implementation of min for ints and floats:
int
min_int(int a, int b)
{
    return a < b ? a : b;
}

float
min_float(float a, float b)
{
    return a < b ? a : b;
}
"But," you exclaim, "those will have conditional jumps in them!" Nope. Here's the output of gcc -S -O2:
min_int:
cmpl %edi, %esi
movl %edi, %eax
cmovle %esi, %eax
ret
min_float:
minss %xmm1, %xmm0
ret
For ints you get a conditional move, and for floats you get minss, because the compiler is very smart. No inline ASM needed!
EDIT: If you're still curious about how to do it with inline assembly, here's an example (for gcc):
float
min_float_asm(float a, float b)
{
    float result = a;
    asm ("minss %1, %0" : "+x" (result) : "x" (b));
    return result;
}
The x constraint means "any SSE register", and "+x" means the value will be read and written, whereas "x" means read-only.
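For the integer case, a comparable inline-asm sketch uses cmov (this is my own addition, mirroring what gcc already generates for the plain C version above, so there is no real gain):

static inline int min_int_asm(int a, int b)
{
    int result = a;
    asm ("cmpl %1, %0\n\t"
         "cmovg %1, %0"          // if result > b, take b
         : "+r" (result)
         : "r" (b)
         : "cc");
    return result;
}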
Well, I would suggest against such micro-optimization. If you want to do it anyway, GCC has some __builtin_* functions. One is v4sf __builtin_ia32_minss (v4sf, v4sf). There are other min* built-ins as well, check the docs.
Update
To gain more portability, you might want to take a look at the Intel Intrinsics Guide. Those functions are usually supported by GCC and Clang as well.
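As a hedged sketch of that intrinsics route (the wrapper name is made up; SSE is baseline on x86-64, so no special flags are needed there):

#include <xmmintrin.h>

static inline float min_float_intrin(float a, float b)
{
    // Same semantics as the minss instruction: if either input is NaN, or the
    // inputs compare equal (e.g. -0.0 vs +0.0), the second operand is returned.
    return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(a), _mm_set_ss(b)));
}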
I am working on a C++ app on Intel Mac OS X 10.6.x. I have a variable which contains pixel data obtained with the OpenGL call glReadPixels. I want to do some operations directly on the pixel data using x86_64 assembly instructions. The assembly routine works fine in test programs, but when I try to use it on the pixel data, it only sees zeroes at the memory location pointed to by the pixel data variable. I am guessing this is because I am trying to access video memory directly from x86_64 assembly. Is there a way to access video memory directly from x86_64 assembly? Otherwise, how can I resolve this situation?
Appreciate any pointers. Thanks in advance.
See below for a code sample that flips the last n and first n bytes. The same code works fine in a test program.
void Flip(void *b, unsigned int w, unsigned int h)
{
    __asm {
        mov r8, rdi    // rdi = b
        mov r9, rsi    // w
        mov r10, rdx   // h
        mov r11, 0     // h <- 0
        mov r12, 0     // w <- 0
    outloop:
        ------------
        .............
        .............
    }
This isn't really an answer but the comments bit is too short to post this.
Your inline assembly is problematic, in multiple ways:
it assumes by the time the compiler gets to the inline block, the function arguments are still in the arg registers. This isn't guaranteed.
it uses MS-VC++ style inline assembly; I'm unsure about OSX Clang, but gcc proper refuses to even compile this.
It'd also be good to have a complete (compilable) source fragment. Something like (32bit code):
int mogrifyFramebufferContents(unsigned char *fb, int width, int height)
{
    int i, sum;

    glReadPixels(1, 1, width, height, GL_RGBA, GL_UNSIGNED_BYTE, fb);
    for (i = 0, sum = 0; i < 4 * width * height; i++)
        sum += fb[i];
    printf("sum over fb via C: %d\n", sum);

    asm("xorl %0, %0\n"
        "xorl %1, %1\n"
        "0:\n"
        "movzbl (%2, %1), %%ebx\n"   // zero-extend the byte so the sum matches the unsigned char loop above
        "addl %%ebx, %0\n"           // literal registers need %% in extended asm
        "incl %1\n"
        "cmpl %3, %1\n"              // loop while i < 4*width*height
        "jl 0b"
        : "=&r"(sum), "=&r"(i)       // both are written before the inputs are read, hence the early clobber
        : "r"(fb), "r"(4 * width * height)
        : "cc", "memory", "%ebx");
    printf("sum over fb via inline asm: %d\n", sum);
    return (sum);
}
If I haven't made an off-by-one error, this code should result in the same output for C and assembly. Try something similar, right at the place where you access the read data within C, and compare the results - or even single-step through the generated assembly with a debugger.
A stackoverflow search for "gcc inline assembly syntax" will give you a starting point for what the %0...%3 placeholders mean, and/or how register assignment constraints like "=&r"(sum) above work.