All
I found my riscv gcc will come times change my 64bit write order, and I want to avoid this
...
// A 64bit write to addr & addr+4, which map to IO
*((volatile uint32_t *)(addr)) = data;
*((volatile uint32_t *)(addr+4)) = data >> 32;
*(uint64_t*)buf = 0x0;
Then I get
sw a0, 0(a3)
sw zero, 0(a5) // lower 32bit of buf
sw a2, 4(a3) // high 32bit of addr
sw zero, 4(a5)
It confused some of IO device, is there any way to avoid gcc to do this opt?
Thanks
I'm just starting to learn the ARM cortex-M4, which have advanced functions like DSP instructions,......
uint32_t my_rearrange(uint32_t value){
uint32_t value_high = (value & 0xffff0000)>>16;
uint32_t value_low = (value & 0x0000ffff);
return (value_low<<16)|value_high;
}
This is a simple code for rearranging an unsigned int variable.
Is there anyway to tune this function for best performance or fastest execution in cortex-M4? Is there a way for me to use the dsp instructions in this function?
You don't need to use any of the dsp instructions for this. All you appear to be doing is a 16 bit rotate. The compiler should recognise this and generate a ROR. GCC 5.4.1 with flags -O3 -std=c++11 -march=armv7-m -mtune=cortex-m4 -mthumb does, and for your code generates
my_rearrange(unsigned long):
ror r0, r0, #16
bx lr
The only way you could speed it up would be if it was inlined.
Closed. This question does not meet Stack Overflow guidelines. It is not currently accepting answers.
This question does not appear to be about programming within the scope defined in the help center.
Closed 7 years ago.
Improve this question
I have implemented the strlen() function in different ways, including SSE2 assembly, SSE4.2 assembly and SSE2 intrinsic, I also exerted some experiments on them, with strlen() in <string.h> and strlen() in glibc. However, their performance in terms of milliseconds (time) are unexpected.
My experiment environment:
CentOS 7.0 + gcc 4.8.5 + Intel Xeon
Following are my implementations:
strlen using SSE2 assembly
long strlen_sse2_asm(const char* src){
long result = 0;
asm(
"movl %1, %%edi\n\t"
"movl $-0x10, %%eax\n\t"
"pxor %%xmm0, %%xmm0\n\t"
"lloop:\n\t"
"addl $0x10, %%eax\n\t"
"movdqu (%%edi,%%eax), %%xmm1\n\t"
"pcmpeqb %%xmm0, %%xmm1\n\t"
"pmovmskb %%xmm1, %%ecx\n\t"
"test %%ecx, %%ecx\n\t"
"jz lloop\n\t"
"bsf %%ecx, %%ecx\n\t"
"addl %%ecx, %%eax\n\t"
"movl %%eax, %0"
:"=r"(result)
:"r"(src)
:"%eax"
);
return result;
}
2.strlen using SSE4.2 assembly
long strlen_sse4_2_asm(const char* src){
long result = 0;
asm(
"movl %1, %%edi\n\t"
"movl $-0x10, %%eax\n\t"
"pxor %%xmm0, %%xmm0\n\t"
"lloop2:\n\t"
"addl $0x10, %%eax\n\t"
"pcmpistri $0x08,(%%edi, %%eax), %%xmm0\n\t"
"jnz lloop2\n\t"
"add %%ecx, %%eax\n\t"
"movl %%eax, %0"
:"=r"(result)
:"r"(src)
:"%eax"
);
return result;
}
3. strlen using SSE2 intrinsic
long strlen_sse2_intrin_align(const char* src){
if (src == NULL || *src == '\0'){
return 0;
}
const __m128i zero = _mm_setzero_si128();
const __m128i* ptr = (const __m128i*)src;
if(((size_t)ptr&0xF)!=0){
__m128i xmm = _mm_loadu_si128(ptr);
unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(xmm,zero));
if(mask!=0){
return (const char*)ptr-src+(size_t)ffs(mask);
}
ptr = (__m128i*)(0x10+(size_t)ptr & ~0xF);
}
for (;;ptr++){
__m128i xmm = _mm_load_si128(ptr);
unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(xmm,zero));
if (mask!=0)
return (const char*)ptr-src+(size_t)ffs(mask);
}
}
I also looked up the one implemented in linux kernel, following is its implementation
size_t strlen_inline_asm(const char* str){
int d0;
size_t res;
asm volatile("repne\n\t"
"scasb"
:"=c" (res), "=&D" (d0)
: "1" (str), "a" (0), "" (0xffffffffu)
: "memory");
return ~res-1;
}
In my experience, I also added the one of standard library and compared their performance.
Followings are my main function code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <xmmintrin.h>
#include <x86intrin.h>
#include <emmintrin.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>
int main()
{
struct timeval tpstart,tpend;
int i=0;
for(;i<1023;i++){
test_str[i] = 'a';
}
test_str[i]='\0';
gettimeofday(&tpstart,NULL);
for(i=0;i<10000000;i++)
strlen(test_str);
gettimeofday(&tpend,NULL);
printf("strlen from stirng.h--->%lf\n",(tpend.tv_sec-tpstart.tv_sec)*1000+(tpend.tv_usec-tpstart.tv_usec)/1000.0);
gettimeofday(&tpstart,NULL);
for(i=0;i<10000000;i++)
strlen_inline_asm(test_str);
gettimeofday(&tpend,NULL);
printf("strlen_inline_asm--->%lf\n",(tpend.tv_sec-tpstart.tv_sec)*1000+(tpend.tv_usec-tpstart.tv_usec)/1000.0);
gettimeofday(&tpstart,NULL);
for(i=0;i<10000000;i++)
strlen_sse2_asm(test_str);
gettimeofday(&tpend,NULL);
printf("strlen_sse2_asm--->%lf\n",(tpend.tv_sec-tpstart.tv_sec)*1000+(tpend.tv_usec-tpstart.tv_usec)/1000.0);
gettimeofday(&tpstart,NULL);
for(i=0;i<10000000;i++)
strlen_sse4_2_asm(test_str);
gettimeofday(&tpend,NULL);
printf("strlen_sse4_2_asm--->%lf\n",(tpend.tv_sec-tpstart.tv_sec)*1000+(tpend.tv_usec-tpstart.tv_usec)/1000.0);
gettimeofday(&tpstart,NULL);
for(i=0;i<10000000;i++)
strlen_sse2_intrin_align(test_str);
gettimeofday(&tpend,NULL);
printf("strlen_sse2_intrin_align--->%lf\n",(tpend.tv_sec-tpstart.tv_sec)*1000+(tpend.tv_usec-tpstart.tv_usec)/1000.0);
return 0;
}
The result is : (ms)
strlen from stirng.h--->23.518000
strlen_inline_asm--->222.311000
strlen_sse2_asm--->782.907000
strlen_sse4_2_asm--->955.960000
strlen_sse2_intrin_align--->3499.586000
I have some questions about it:
Why strlen of string.h is so fast? I think its code should be identify to strlen_inline_asm because I copied the code from /linux-4.2.2/arch/x86/lib/string_32.c[http://lxr.oss.org.cn/source/arch/x86/lib/string_32.c#L164]
Why sse2 intrinsic and sse2 assembly are so different in performance?
Could someone help me how to disassembly the code so that I can see what has the function strlen of static library been transformed by the compiler? I used gcc -s but didn't find the disassembly of strlen from the <string.h>
I think my code may be not very well, I would be appreciate if you could help me improve my code, especially assembly ones.
Thanks.
Like I said in comments, your biggest error is benchmarking with -O0. I discussed exactly why testing with -O0 is a terrible idea in the first part of another post.
Benchmarks should be done with at least -O2, preferably with the same optimizations as your full project will build with, if you're trying to test test what source makes the fastest asm.
-O0 explains inline asm being way faster than C with intrinsics (or regular compiled C, for C strlen implementation borrowed from glibc).
IDK -O0 would still optimize away loop that discards the result of library strlen repeatedly, or if it somehow just avoided some other huge performance pitfall. It's not interesting to guess about exactly what happened in such a flawed test.
I tightened up your SSE2 inline-asm version. Mostly just because I've been playing with gcc inline asm input/output constraints recently, and wanted to see what it would look like if I wrote it to let the compiler choose which registers to use for temporaries, and avoided unneeded instructions.
The same inline asm works for 32 and 64-bit x86 targets; see this compiled for both on the Godbolt compiler explorer. When compiling to a stand-along function, it doesn't have to save/restore any registers even in 32bit mode:
WARNING: it can read past the end of the string by up to 15 bytes. This could segfault. See Is it safe to read past the end of a buffer within the same page on x86 and x64? for details on avoiding that: get to an alignment boundary, then use aligned loads because that's always safe if the vector contains at least 1 byte of string data. I left the code unchanged because it's interesting to discuss the effect of aligning pointers for SSE vs. AVX. Aligning pointers also avoids cache-line splits, and 4k page-splits (which are a performance pothole before Skylake).
#include <immintrin.h>
size_t strlen_sse2_asm(const char* src){
// const char *orig_src = src; // for a pointer-increment with a "+r" (src) output operand
size_t result = 0;
unsigned int tmp1;
__m128i zero = _mm_setzero_si128(), vectmp;
// A pointer-increment may perform better than an indexed addressing mode
asm(
"\n.Lloop:\n\t"
"movdqu (%[src], %[res]), %[vectmp]\n\t" // result reg is used as the loop counter
"pcmpeqb %[zerovec], %[vectmp]\n\t"
"pmovmskb %[vectmp], %[itmp]\n\t"
"add $0x10, %[res]\n\t"
"test %[itmp], %[itmp]\n\t"
"jz .Lloop\n\t"
"bsf %[itmp], %[itmp]\n\t"
"add %q[itmp], %q[res]\n\t" // q modifier to get quadword register.
// (add %edx, %rax doesn't work). But in 32bit mode, q gives a 32bit reg, so the same code works
: [res] "+r"(result), [vectmp] "=&x" (vectmp), [itmp] "=&r" (tmp1)
: [zerovec] "x" (zero) // There might already be a zeroed vector reg when inlining
, [src] "r"(src)
, [dummy] "m" (*(const char (*)[])src) // this reads the whole object, however long gcc thinks it is
: //"memory" // not needed because of the dummy input
);
return result;
// return result + tmp1; // doing the add outside the asm makes gcc sign or zero-extend tmp1.
// No benefit anyway, since gcc doesn't know that tmp1 is the offset within a 16B chunk or anything.
}
Note the dummy input, as an alternative to a "memory" clobber, to tell the compiler that the inline asm reads the memory pointed to by src, as well as the value of src itself. (The compiler doesn't know what the asm does; for all it knows the asm just aligns a pointer with and or something, so assuming that all input pointers are dereferenced would lead to missed optimizations from reordering / combining loads and stores across the asm. Also, this lets the compiler know we only read the memory, not modify it.) The GCC manual uses an example with this unspecified-length array syntax "m" (*(const char (*)[])src)
It should keep register pressure to a minimum when inlining, and doesn't tie up any special-purpose registers (like ecx which is needed for variable-count shifts).
If you could shave another uop out of the inner loop, it would be down to 4 uops that could issue at one per cycle. As it is, 5 uops means each iteration may take 2 cycles to issue from the frontend, on Intel SnB CPUs. (Or 1.25 cycles on later CPUs like Haswell, and maybe on SnB if I was wrong about the whole-number behaviour.)
Using an aligned pointer would allow the load to fold into a memory operand for pcmpeqb. (As well as being necessary for correctness if the string start is unaligned and the end is near the end of a page). Interestingly, using the zero-vector as the destination for pcmpeqb is ok in theory: you don't need to re-zero the vector between iterations, because you exit the loop if it's ever non-zero. It has 1-cycle latency, so turning the zero vector into a loop-carried dependency is only a problem when cache-misses delay an old iteration. Removing this loop-carried dependency chain might help in practice, though, by letting the back end go faster when catching up after a cache miss that delayed an old iteration.
AVX solves the problem completely (except for correctness if the string ends near the end of a page). AVX allows the load to be folded even without doing an alignment check first. 3-operand non-destructive vpcmpeqb avoids turning the zero vector into a loop-carried dependency. AVX2 would allow checking 32B at once.
Unrolling will help either way, but helps more without AVX. Align to a 64B boundary or something, and then load the whole cache line into four 16B vectors. Doing a combined check on the result of PORing them all together may be good, since pmovmsk + compare-and-branch is 2 uops.
Using SSE4.1 PTEST doesn't help (compared to pmovmsk / test / jnz) because it's 2 uops and can't macro-fuse the way test can.
PTEST can directly test for the whole 16B vector being all-zero or all-ones (using ANDNOT -> CF part), but not if one of the byte-elements is zero. (So we can't avoid pcmpeqb).
Have a look at Agner Fog's guides for optimizing asm, and the other links on the x86 wiki. Most optimization (Agner Fog's, and Intel's and AMD's) will mention optimizing memcpy and strlen specifically, IIRC.
If you read the source of the strlen function in the glibc, you can see that the function is not testing the string char by char, but longword by longword with complex bitwise operations : http://www.stdlib.net/~colmmacc/strlen.c.html. I guess it explains its speed, but the fact that it's even faster than rep instructions in assembly is indeed quite surprising.
As for today i used my own min() function (for float and int)
that was based on if but today as i get know that x86 has some operand
for min - this is
MINSS - Minimum of operands
i think that if based min() routine is counter effective and
im very careful for optimization, so i would like rewrite my own
routine into minss version with some inline assembly,
I would like to find how the most effective version of this in
gcc inline assembly would look like
I need something like
int min(int a, int b)
{
// minss a, b
//return
}
for both int and float, to use minss opcode and has minimal prologue and
epilogue
or just using library version would be faster? though i would like
to not use library min/max and had it as much fast if possible
Here is the most efficient possible implementation of min for ints and floats:
int
min_int(int a, int b)
{
return a < b ? a : b;
}
float
min_float(float a, float b)
{
return a < b ? a : b;
}
"But," you exclaim, "those will have conditional jumps in them!" Nope. Here's the output of gcc -S -O2:
min_int:
cmpl %edi, %esi
movl %edi, %eax
cmovle %esi, %eax
ret
min_float:
minss %xmm1, %xmm0
ret
For ints you get a conditional move, and for floats you get minss, because the compiler is very smart. No inline ASM needed!
EDIT: If you're still curious about how to do it with inline assembly, here's an example (for gcc):
float
min_float_asm(float a, float b)
{
float result = a;
asm ("minss %1, %0" : "+x" (result) : "x" (b));
return result;
}
The x constraint means "any SSE register", and "+x" means the value will be read and written, whereas "x" means read-only.
Well, I would suggest against such micro-optimization. If you want to do it anyway, GCC has some __builtin_* functions. One is v4sf __builtin_ia32_minss (v4sf, v4sf). There are other min* built-ins as well, check the docs.
Update
To gain more portability, you might want to take a look at the Intel Intrinsics Guide. Those functions are usually supported by GCC and Clang as well.
I am working on a C++ app on Intel Mac OS X 10.6.x. I have a variable which contains pixel data which was obtained using OpenGL call glReadPixels. I want to do some operations directly on the pixel data using x86_64 assembly instructions. The assembly routine works fine in test programs but when I try to use it on the pixel data, it only gets zeroes in the memory location pointed by the pixel data variable. I am guessing this is since I am trying to access video memory directly from x86_64 assembly. Is there a way to access x86_64 video memory directly from assembly? Otherwise how can I resolve this situation?
Appreciate any pointers. Thanks in advance.
See below for code sample to flip last n and 1st n bytes. Same code works well in test program.
void Flip(void *b, unsigned int w, unsigned int h)
{
__asm {
mov r8, rdi //rdi b
mov r9, rsi //W
mov r10,rdx //H
mov r11, 0 // h <- 0
mov r12, 0 // w<- 0
outloop:
------------
.............
.............
}
This isn't really an answer but the comments bit is too short to post this.
Your inline assembly is problematic, in multiple ways:
it assumes by the time the compiler gets to the inline block, the function arguments are still in the arg registers. This isn't guaranteed.
it uses MS-VC++ style inline assembly; I'm unsure about OSX Clang, but gcc proper refuses to even compile this.
It'd also be good to have a complete (compilable) source fragment. Something like (32bit code):
int mogrifyFramebufferContents(unsigned char *fb, int width, int height)
{
int i, sum;
glReadPixels(1, 1, width, height, GL_RGBA, GL_UNSIGNED_BYTE, fb);
for (i = 0, sum = 0; i < 4 * width * height; i++)
sum += fb[i];
printf("sum over fb via C: %d\n", sum);
asm("xorl %0, %0\n"
"xorl %1, %1\n"
"0:\n"
"movsbl (%2, %1), %ebx\n"
"addl %ebx, %0\n"
"incl %1\n"
"cmpl %1, %3\n"
"jl 0b"
: "=r"(sum)
: "r"(i), "r"(fb), "r"(4 * width * height)
: "cc", "%ebx");
printf("sum over fb via inline asm: %d\n", sum);
return (sum);
}
If I haven't made a one-off error, this code should result in the same output for C and assembly. Try something similar, right at the place where you access the read data within C, and compare assembly results - or even singlestep through the generated assembly with a debugger.
A stackoverflow search for "gcc inline assembly syntax" will give you a starting point for what the %0...%2 placeholders mean, and/or how the register assignment constraints like "=r"(sum) above work.