Can a pointer be used as offset and base of a memory reference with inline assembly?
For example:
// Hard-coded variant: the destination address is written out by hand in the
// template instead of being derived from an operand.
int main(){
char a[16],b[16];
// Stores the quadword value 123 to the address 16(%rsp,%rbx,1),
// i.e. rsp + rbx*1 + 16 in AT&T base/index/scale notation.
// NOTE(review): the "=m"(*a) output operand is never referenced by the
// template, and rbx is listed as a clobber although the template only reads it.
asm volatile("\
movq $123,16(%%rsp,%%rbx,1)"
:"=m"(*a)::"rbx");
}
Could be something like:
// Wished-for variant: operand %0 takes the place of the hand-written base
// address, with rbx still acting as the index.
int main(){
char a[16],b[16];
// NOTE(review): %0 is bound with an "m" (memory) constraint, so it is
// substituted as a memory reference rather than as a bare register name;
// this snippet illustrates the desired syntax, it is not presented as working code.
asm volatile("\
movq $123,(%0,%%rbx,1)"
:"=m"(*a)::"rbx");
}
One choice is to use one additional register:
// Workaround variant: spend one extra register to hold the operand's address.
int main(){
char a[16],b[16];
// lea loads the address of memory operand %0 into rcx; the store then uses
// rcx as base and rbx as index. rcx is written by the template, so it is
// declared as a clobber alongside rbx.
asm volatile("\
lea %0,%%rcx\n\
movq $123,(%%rcx,%%rbx,1)"
:"=m"(*a)::"rbx","rcx");
}
Related
This question already has answers here:
Idiomatic way of performance evaluation?
(1 answer)
benchmarking, code reordering, volatile
(8 answers)
Why is volatile needed in C?
(18 answers)
Adding a redundant assignment speeds up code when compiled without optimization
(1 answer)
Closed 4 months ago.
I wrote a test program for an x86 system. In the loop there are four different store statements. If I uncomment statement 1, the result is 3.2 ns per iteration. The results for the other statements are 2.2 ns, 3.7 ns and 2.6 ns respectively. I can't understand these results. I thought statement 1 should be the fastest, because it stores an immediate value and doesn't need to load the value first like the other statements do.
Why do those four statements have different speeds? Could anyone explain this? Thanks.
$ ./a.out 0
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <stdlib.h>
#define BUF_SIZE 8192
#define ROUND 100000000UL
// Micro-benchmark: times ROUND iterations of a loop whose body is one of four
// (commented-out) 8-byte stores to buf_pageend + offset, then prints the
// average time per iteration in nanoseconds.
// Usage: ./a.out <offset>   (offset is parsed with atoi)
int main(int argc, char **argv)
{
char *buf, *buf_newaddr, *buf_pageend;
// i and offset are each given 64-byte alignment.
unsigned long i __attribute__((aligned(64)));
// NOTE(review): buf_realsize (and buf_newaddr above) are declared but never used.
int buf_realsize;
unsigned long offset __attribute__((aligned(64)));
struct timespec start={0,0}, end={0,0};
double start_ns, end_ns;
if (argc != 2) {
printf("missing args\n");
exit(-1);
}
offset = atoi(argv[1]);
again:
buf = (void *)malloc(BUF_SIZE);
// Round buf up to the next multiple of 4096 (clears the low 12 bits of buf + 4095).
buf_pageend = (void *)((unsigned long)(buf + 4095) & 0xfffffffffffff000UL);
if (buf_pageend - buf < 1024) { // make sure we have enough space in case the 'offset' is negative
// don't free, occupy it in order to alloc another different block
goto again;
}
memset(buf, 0, BUF_SIZE);
// NOTE(review): %lx is paired with pointer arguments here; %p is the
// conversion that matches a pointer.
printf("&i = %lx, &offset=%lx\n", &i, &offset);
clock_gettime(CLOCK_MONOTONIC, &start);
// Exactly one of the four statements below is meant to be uncommented per run;
// the trailing figure on each line is the per-iteration time reported for it.
for (i = 0; i < ROUND; i++) {
//*((unsigned long *)(buf_pageend + offset)) = 0; // 3.2ns
//*((unsigned long *)(buf_pageend + offset)) = (unsigned long)(buf_pageend + offset); // 2.2ns
//*((unsigned long *)(buf_pageend + offset)) = i; // 3.7ns
//*((unsigned long *)(buf_pageend + offset)) = offset; // 2.6ns
}
clock_gettime(CLOCK_MONOTONIC, &end);
// Convert both timestamps to nanoseconds, then average over ROUND iterations.
start_ns = start.tv_sec*1000000000 + start.tv_nsec;
end_ns = end.tv_sec*1000000000 + end.tv_nsec;
printf("ns: %lf\n", (end_ns - start_ns)/ROUND);
}
EDIT 2022-10-30 17:43 for discussion in comments:
The asm for the second assignment statement is:
movq -176(%rbp), %rdx
movq -64(%rbp), %rax
leaq (%rdx,%rax), %rcx
movq -176(%rbp), %rdx // delete this line
movq -64(%rbp), %rax // delete this line
addq %rdx, %rax
movq %rcx, (%rax)
movq -112(%rbp), %rax
addq $1, %rax
movq %rax, -112(%rbp)
If I delete the two lines marked with //, the result will change from 2.2ns to 3.6ns.
I need to write the C function that will return the value of a specific hardware register. For example R0.
I am unclear from the GHS documentation how this is done with the macros provided by the GHS Compiler.
/* Sketch of the desired helper: return the value of hardware register Rx.
 * The "??" in the asm string marks the operand syntax the question is asking
 * about. As written the asm string never references x, so the function
 * returns the 0U that x was initialised with. */
uint32_t readRegRx(void)
{
uint32_t x = 0U;
__asm("MOV Rx, ??");
return x;
}
What is the syntax in the GHS compiler for referencing a local variable as an argument to an inline assembly instruction?
I've seen this in the GHS documentation:
/* GHS asm macro quoted from the vendor documentation: multiplies a by b.
 * Each "%..." line introduces an alternative body; every body ends with the
 * product in r0, and "%error" closes the list.
 * NOTE(review): %con / %reg / %mem presumably select a body by whether the
 * argument is a constant, a register or a memory operand - confirm against
 * the GHS manual. */
asm int mulword(a,b)
{
%con a %con b
mov r0,a
mov r1,b
mul r0,r1,r0
%con a %reg b
mov r0,a
mul r0,b,r0
%reg a %con b
mov r0,b
mul r0,a,r0
%reg a %reg b
mul r0,a,b
%con a %mem b
mov r0,a
ldr r1,b
mul r0,r1,r0
%mem a %con b
ldr r0,a
mov r1,b
mul r0,r1,r0
%mem a %mem b
ldr r0,a
ldr r1,b
mul r0,r1,r0
%error
}
But this isn't exactly what I want, I think. The example from the documentation above describes a function taking arguments. The return value is implicitly in R0.
In my case, what I want is to use a plain C function, with embedded inline assembly to read a register (R-anything) and store the value in a local variable in the function.
I received information from the GHS support and it addresses the means to read hardware registers (Rn) within C functions analogous to the extended GNU ARM features. The following applies to GHS compiler usage, not GNU compiler usage:
"For asm macro purposes (use within an asm macro), it's probably best to use the enhanced GNU asm syntax, and you'll want to turn on "Accept GNU asm statements" (--gnu_asm)."
/* Returns the current value of core register r0, using GNU-style extended
 * asm (with the GHS compiler this requires "Accept GNU asm statements",
 * --gnu_asm). Replace r0 in the template to read a different register. */
int bar(void)
{
    int a;
    /* a is bound as an OUTPUT operand ("=r") and is the destination of the
     * mov, so the register's value is copied into a. Binding a as an input
     * would instead read the uninitialised variable and overwrite r0. */
    __asm__("mov %0, r0" : "=r"(a)); // move r0 to a. Replace r0 to suit.
    // Stuff
    return a;
}
Alternative method:
/* GHS asm macro: copies r0 into the macro argument r0Val (mov r0Val,r0).
 * NOTE(review): the single "%reg r0Val" pattern presumably means the macro
 * only matches when the argument lives in a register - confirm against the
 * GHS manual. */
asm void readR0(r0Val)
{
%reg r0Val
mov r0Val,r0
}
/* Example caller of the readR0 asm macro. */
void foo(void)
{
/* Declared with the register storage class before being handed to the macro. */
register int regValReg = 0;
/* After this call regValReg holds the value the macro copied out of r0. */
readR0(regValReg);
// Stuff
}
I wrote a simple program:
// Length of the NUL-terminated string s, counted recursively one character
// at a time. The body is a single return expression, which is what a C++11
// constexpr function requires.
constexpr int strlen_c(char const* s)
{
    return (*s == '\0') ? 0 : strlen_c(s + 1) + 1;
}
// The call's result is used only as an ordinary run-time return value;
// nothing in this function requires it to be a constant expression.
int main()
{
return strlen_c("hello world");
}
I expected that the compiler optimizes the function and evaluates its result in compile time. But actually the generated machine code evaluates the result in a loop:
mov edx, offset aHelloWorld ; "hello world"
loc_408D00:
add edx, 1
mov eax, edx
sub eax, offset aHelloWorld ; "hello world"
cmp byte ptr [edx], 0
jnz short loc_408D00
leave
retn
The program is being compiled with g++ version 5.3 with flags -std=c++11 -Ofast -O2. The same result I obtain in Visual Studio 2013, and g++ 4.9.
Question: what is the reason the compiler couldn't optimize the given code?
A constexpr function is not necessarily always evaluated at compile time. However, it must be evaluated at compile time if it is used in a constexpr context. So the following will work regardless of the compiler optimizations:
int main()
{
    // Initialising a constexpr variable requires a constant expression, so
    // the call below has to be evaluated by the compiler.
    constexpr int length = strlen_c("hello world");
    return length;
}
Following is the assembly generated for the above code:
main:
mov eax, 11
ret
Demo
I am new to 64-bit assembly coding, so I tried some simple programs:
C program:
#include <stdio.h>
extern double bla();
double x=0;
/* Calls the assembly routine, stores its result in the global x and prints it. */
int main() {
    const double result = bla();
    x = result;
    printf(" %f", x);
    return 0;
}
Assembly:
; bla: declared on the C side as `double bla()`; leaves its result in xmm0.
section .data
section .text
global bla
bla:
; rax = the integer 10
mov rax,10
; Copies the raw 64 bits of rax into xmm0. No integer-to-double conversion
; takes place, so xmm0 holds the bit pattern of the integer 10, not 10.0.
movq xmm0,rax
ret
The result was always 0.0 instead of 10.0.
But when I do it without an immediate, it works fine:
#include <stdio.h>
extern double bla(double y);
double x=0;
double a=10;
/* Passes the global a to the assembly routine, stores the result in the
 * global x and prints it. */
int main() {
    const double result = bla(a);
    x = result;
    printf("add returned %f", x);
    return 0;
}
; bla: declared on the C side as `double bla(double y)`. Saves the raw bits
; of the incoming xmm0 in rax, overwrites xmm0, then restores it from rax,
; so the value handed in is the value left in xmm0 at ret.
section .data
section .text
global bla
bla:
; rax = raw 64-bit pattern of the argument in xmm0
movq rax,xmm0
; NOTE(review): rbx is not set anywhere in this routine, so "xmm0=0" holds
; only if rbx happens to contain 0 on entry.
movq xmm0,rbx ;xmm0=0 now
movq xmm0,rax ;xmm0=10 now
ret
Do I need a different instruction to load an immediate into a 64-bit register?
The problem here was that the OP was trying to move 10 into a floating-point register with the following code:
mov rax,10
movq xmm0,rax
That cannot work, since movq into xmm0 assumes that the bit-pattern of the source is already in floating-point format - and of course it isn't: it's an integer.
@Michael Petch's suggestion was to use the (NASM) assembler's floating-point converter as follows:
mov rax,__float64__(10.0)
movq xmm0,rax
That then produces the expected output.
I am using a templated class in c++ and I am planning on ensuring compatibility with doubles and mpfr floats. The only division that occurs in the program is division by 2. The behavior for doubles and mpfr floats for division by 2 should be different because, in mpfr, I have direct access to the exponent.
Question: What do you suggest to result in the most efficient compiled code?
I expect that a run-time solution checking the type of the templated variable would be inefficient.
Boost's mpfr wrapper does not seem useful because it doesn't seem to use the mpfr_div_2ui command and would, instead, divide by the mpfr float with a value of 2. I expect this to be slower than directly changing the exponent.
I could use an overloaded command to deal with the two cases of mpfr floats and doubles.
I could use some user-set #define flag that the user would need to set to use mpfr data types.
Are there any other suggestions?
I'd check whether number<mpfr_floatXXX> doesn't already detect the optimization.
Boost's mpfr wrapper does not seem useful because it doesn't seem to use the mpfr_div_2ui command and would, instead, divide by the mpfr float with a value of 2. I expect this to be slower than directly changing the exponent.
That expectation is not well founded. Simply check:
#include <boost/multiprecision/mpfr.hpp>
int main() {
using namespace boost::multiprecision;
mpfr_float_50 n ("787878787878");
n /= 2;
}
Compiles into
mov rax, QWORD PTR fs:40
mov QWORD PTR [rsp+232], rax
xor eax, eax
lea rdi, [rsp+16]
call mpfr_init2
cmp QWORD PTR [rsp+40], 0
xor ecx, ecx
mov edx, 10
lea rdi, [rsp+16]
call mpfr_set_str
test eax, eax
cmp QWORD PTR [rsp+40], 0
lea rsi, [rsp+16]
xor ecx, ecx
mov edx, 2
mov rdi, rsi
call mpfr_div_ui
So, it isn't nearly as bad as you made it seem.
Implementation
Here is my non-generic implementation:
// Returns f divided by 2^i. The division is delegated to mpfr_div_2ui on the
// underlying MPFR objects, rounding to nearest (MPFR_RNDN).
mp::mpfr_float_50 div_2ui(mp::mpfr_float_50 const& f, unsigned i) {
    mp::mpfr_float_50 quotient;
    auto& destination = quotient.backend().data();
    auto const& source = f.backend().data();
    ::mpfr_div_2ui(destination, source, i, MPFR_RNDN);
    return quotient;
}
A generic implementation would look like:
template <typename T, typename Enable = void> struct is_mpfr : boost::mpl::false_ {};
template <unsigned digits10, mp::mpfr_allocation_type AllocationType, mp::expression_template_option ET>
struct is_mpfr<
mp::number<mp::mpfr_float_backend<digits10, AllocationType>, ET >
> : boost::mpl::true_
{};
// Generic fallback (selected by the false_ tag): divides f by 2, i times in a
// row, using the type's own /= operator, and returns the result.
template <typename T>
T div_2ui_impl(T f, unsigned i, boost::mpl::false_) {
    for (; i != 0; --i) {
        f /= 2;
    }
    return f;
}
// MPFR overload (selected by the true_ tag): performs the whole division by
// 2^i in a single mpfr_div_2ui call, rounding to nearest. Writes a marker to
// std::cout so the demo output shows which overload ran.
template <typename Mpfr>
Mpfr div_2ui_impl(Mpfr f, unsigned i, boost::mpl::true_) {
    std::cout << "-- optimized --";
    Mpfr quotient;
    auto& destination = quotient.backend().data();
    auto& source = f.backend().data();
    ::mpfr_div_2ui(destination, source, i, MPFR_RNDN);
    return quotient;
}
template <typename T>
T div_2ui(T const &f, unsigned i) {
return div_2ui_impl(f, i, is_mpfr<T> { });
}
Live Demo
Live On Coliru
// Builds a T from the decimal string 787878787878, halves it through
// arith::div_2ui and prints the function name followed by the result.
template <typename T>
void test() {
    T value("787878787878");
    value = arith::div_2ui(value, 1);
    std::cout << __FUNCTION__ << ": ";
    std::cout << value << "\n";
}
// Runs the halving demo once for each number type, printing in fixed notation.
int main() {
    std::cout << std::fixed;

    // MPFR-backed floating-point types.
    test<mp::mpfr_float_50>();
    test<mp::mpfr_float_100>();

    // Every other backend.
    test<mp::cpp_int>();
    test<mp::cpp_dec_float_100>();
    test<mp::number<mp::gmp_int> >();
    test<mp::mpf_float_1000>();
}
Prints
-- optimized --test: 393939393939.000000
-- optimized --test: 393939393939.000000
test: 393939393939
test: 393939393939.000000
test: 393939393939
test: 393939393939.000000