I am trying to write a simple application on Mac OS X using only syscalls, no standard library.
main.c
#define PROT_READ 0x1
#define PROT_WRITE 0x2
#define MAP_ANONYMOUS 0x20
#define MAP_PRIVATE 0x02
#define PAGE_SIZE 4096
#define NULL 0
#define STDOUT 1
#define SYSCALL_BASE 0x2000000
#define SYSCALL_GET(num) SYSCALL_BASE + num
long long syscall(long long arg1, long long arg2, long long arg3, long long arg4, long long arg5, long long arg6, long long cn);
void exit(long long status) {
syscall(status, 0, 0, 0, 0, 0, SYSCALL_GET(1));
}
long long write(long long fd, char *buf, long long len) {
return syscall(fd, buf, len, 0, 0, 0, SYSCALL_GET(4));
}
void *mmap(void *addr, long long length, long long prot, long long flags, long long fd, long long offset) {
return syscall(addr, length, prot, flags, fd, offset, SYSCALL_GET(197));
}
long long munmap(void *addr, long long length) {
return syscall(addr, length, 0, 0, 0, 0, SYSCALL_GET(73));
}
int strlen(char *s) {
int len = 0;
while (*(s++) != '\0') {
len++;
}
return len;
}
int putchar(char c) {
return write(STDOUT, &c, 1);
}
int main(int argc, char *argv[]) {
if (argc <= 1) {
return 0;
}
int *lengths = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
for (int i = 1; i < argc; i++) {
lengths[i] = strlen(argv[i]);
}
for (int i = 1; i < argc; i++) {
write(STDOUT, argv[i], lengths[i]);
putchar(' ');
}
putchar('\n');
munmap(lengths, PAGE_SIZE);
return 0;
}
start.s
.global start
.global _syscall
.text
start:
popq %rdi
movq %rsp, %rsi
andq $0xfffffffffffffff0, %rsp
call _main
movq %rax, %rdi
call _exit
_syscall:
movq %rcx, %r10
movq 8(%rsp), %rax
pushq %rbx # alignment
syscall
popq %rbx
retq
As you can see, the application basically reflects simple echo. When I run the program with no arguments, it successfully finishes, so I assume exit call works. But when I run it with any argument, it crashes with Segmentation fault: 11. As far as I understand it now, when mmap is called, kernel returns strange value: 9. I assume that 9 is not a proper address, but I cannot understand my mistake, because according to documentations, all the values passed to syscall are correct. Syscall numbers are taken from here.
I would like to know too.
I think apple uses MAP_ANON, which is a different number. Try
#define MAP_ANON 0x1000
https://github.com/nneonneo/osx-10.9-opensource/blob/master/xnu-2422.1.72/bsd/sys/mman.h#L150
Related
This question already has answers here:
Idiomatic way of performance evaluation?
(1 answer)
benchmarking, code reordering, volatile
(8 answers)
Why is volatile needed in C?
(18 answers)
Adding a redundant assignment speeds up code when compiled without optimization
(1 answer)
Closed 4 months ago.
I write a test program for x86 system. In the loop, there are four different store statements. If I uncomment statement1, the result is 3.2ns. The results for other statements are 2.2ns, 3.7ns, 2.6ns respectively. I can't understand these results. I think the first statement1 should be the fastest because it stores an immediate value and doesn't need to load the value at first like other statements.
Why those four statements have different speed. Could anyone explain them? Thanks.
$ ./a.out 0
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <stdlib.h>
#define BUF_SIZE 8192
#define ROUND 100000000UL
int main(int argc, char **argv)
{
char *buf, *buf_newaddr, *buf_pageend;
unsigned long i __attribute__((aligned(64)));
int buf_realsize;
unsigned long offset __attribute__((aligned(64)));
struct timespec start={0,0}, end={0,0};
double start_ns, end_ns;
if (argc != 2) {
printf("missing args\n");
exit(-1);
}
offset = atoi(argv[1]);
again:
buf = (void *)malloc(BUF_SIZE);
buf_pageend = (void *)((unsigned long)(buf + 4095) & 0xfffffffffffff000UL);
if (buf_pageend - buf < 1024) { // make sure we have enough space in case the 'offset' is negative
// don't free, occupy it in order to alloc another different block
goto again;
}
memset(buf, 0, BUF_SIZE);
printf("&i = %lx, &offset=%lx\n", &i, &offset);
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = 0; i < ROUND; i++) {
//*((unsigned long *)(buf_pageend + offset)) = 0; // 3.2ns
//*((unsigned long *)(buf_pageend + offset)) = (unsigned long)(buf_pageend + offset); // 2.2ns
//*((unsigned long *)(buf_pageend + offset)) = i; // 3.7ns
//*((unsigned long *)(buf_pageend + offset)) = offset; // 2.6ns
}
clock_gettime(CLOCK_MONOTONIC, &end);
start_ns = start.tv_sec*1000000000 + start.tv_nsec;
end_ns = end.tv_sec*1000000000 + end.tv_nsec;
printf("ns: %lf\n", (end_ns - start_ns)/ROUND);
}
EDIT 2022-10-30 17:43 for discussion in comments:
The asm for the second assignment statement is:
movq -176(%rbp), %rdx
movq -64(%rbp), %rax
leaq (%rdx,%rax), %rcx
movq -176(%rbp), %rdx // delete this line
movq -64(%rbp), %rax // delete this line
addq %rdx, %rax
movq %rcx, (%rax)
movq -112(%rbp), %rax
addq $1, %rax
movq %rax, -112(%rbp)
If I delete the two lines marked with //, the result will change from 2.2ns to 3.6ns.
I created a simple demo to show that unaligned memory stores/loads are generally not atomic on x86_64 and ARM64 architectures. This demo consists of a C++ program that creates two threads — the first one billion times calls a function called store, the second one does the same with a function called load. The source code of the program is here:
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <thread>
extern "C" void store(void*);
extern "C" uint16_t load(void*);
alignas(64) char buf[65];
char* ptr;
static long n = 1'000'000'000L;
void f1()
{
for (long i = 0; i < n; i++)
store(ptr);
}
void f2()
{
long v0x0000 = 0;
long v0x0101 = 0;
long v0x0100 = 0;
long v0x0001 = 0;
long other = 0;
for (long i = 0; i < n; i++)
{
uint16_t a = load(ptr);
if (a == 0x0000) v0x0000++;
else if (a == 0x0101) v0x0101++;
else if (a == 0x0100) v0x0100++;
else if (a == 0x0001) v0x0001++;
else other++;
}
std::cout << "0x0000: " << v0x0000 << std::endl;
std::cout << "0x0101: " << v0x0101 << std::endl;
std::cout << "0x0100: " << v0x0100 << std::endl;
std::cout << "0x0001: " << v0x0001 << std::endl;
std::cout << "other: " << other << std::endl;
}
int main(int arc, char* argv[])
{
int offset = std::atoi(argv[1]);
ptr = buf + offset;
std::thread t1(f1);
std::thread t2(f2);
t1.join();
t2.join();
}
The store and load functions are defined separately in the assembly source files. For x86_64 as follows:
.intel_syntax noprefix
.global store
.global load
.text
store:
mov eax, 0
mov WORD PTR [rdi], ax
mov eax, 0x0101
mov WORD PTR [rdi], ax
ret
load:
movzx eax, WORD PTR [rdi]
ret
And, for ARM64 as follows:
.global store
.global load
.text
store:
mov w1, 0x0000
strh w1, [x0]
mov w1, 0x0101
strh w1, [x0]
ret
load:
ldrh w0, [x0]
ret
When I run the program, everything works as expected. When I pass offset 0, the stores/loads are aligned and just the values 0x0000 and 0x0101 are observed in the reading thread. When I pass offset 63, the stores/loads are unaligned and cross the cache line boundary, and the values 0x0100 and 0x0001 are observed as well. This holds for both architectures.
However, I noticed that there is a big difference in the execution times of these test runs. Some typical times I observed:
x86_64 + offset 0 (aligned): 6.9 [s]
x86_64 + offset 63 (unaligned): 28.3 [s]
ARM64 + offset 0 (aligned): 6.8 [s]
ARM64 + offset 63 (unaligned): 9.2 [s]
On x86_64, when two cache lines are involved in unaligned cases, the runtime is several times slower. But on ARM64, the runtime is slower only slightly. I wonder what makes the difference in this behavior between both architectures. (I am not much familiar with cache coherency mechanisms.)
Particular processors for experiments were Intel Xeon E5-2680 v3 and Cortex-A72. The former was in a dual-socket server, but I restricted both threads to a single socket only (by taskset or numactl). The latter was in Raspberry Pi 4 device. Both systems run Linux plus I used GCC for builds.
I've developed a little program that tests the performance of 32 bit Windows structured exception handling. To keep the overhead minimal in contrast to the rest, I wrote the code generating an filtering the exception in assembly.
This is the C++-code:
#include <Windows.h>
#include <iostream>
using namespace std;
bool __fastcall getPointerFaultSafe( void *volatile *from, void **to );
int main()
{
auto getThreadTimes = []( LONGLONG &kt, LONGLONG &ut )
{
union
{
FILETIME ft;
LONGLONG ll;
} creationTime, exitTime, kernelTime, userTime;
GetThreadTimes( GetCurrentThread(), &creationTime.ft, &exitTime.ft, &kernelTime.ft, &userTime.ft );
kt = kernelTime.ll;
ut = userTime.ll;
};
LONGLONG ktStart, utStart;
getThreadTimes( ktStart, utStart );
size_t const COUNT = 100'000;
void *pv;
for( size_t c = COUNT; c; --c )
getPointerFaultSafe( nullptr, &pv );
LONGLONG ktEnd, utEnd;
getThreadTimes( ktEnd, utEnd );
double ktNsPerException = (ktEnd - ktStart) * 100.0 / COUNT,
utNsPerException = (utEnd - utStart) * 100.0 / COUNT;
cout << "kernel-time per exception: " << ktNsPerException << "ns" << endl;
cout << "user-time per exception: " << utNsPerException << "ns" << endl;
return 0;
}
This is the assembly-code:
.686P
PUBLIC ?getPointerFaultSafe##YI_NPCRAXPAPAX#Z
PUBLIC sehHandler
.SAFESEH sehHandler
sehHandler PROTO
_DATA SEGMENT
byebyeOffset dd 0
_DATA ENDS
exc_ctx_eax = 0b0h
exc_ctx_eip = 0b8h
_TEXT SEGMENT
?getPointerFaultSafe##YI_NPCRAXPAPAX#Z PROC
ASSUME ds:_DATA
push OFFSET sehHandler
push dword ptr fs:0
mov dword ptr fs:0, esp
mov byebyeOffset, OFFSET byebye - OFFSET mightfail
mov al, 1
mightfail:
mov ecx, dword ptr [ecx]
mov dword ptr [edx], ecx
byebye:
mov edx, dword ptr [esp]
mov dword ptr fs:0, edx
add esp, 8
ret 0
?getPointerFaultSafe##YI_NPCRAXPAPAX#Z ENDP
sehHandler PROC
mov eax, dword ptr [esp + 12]
mov dword ptr [eax + exc_ctx_eax], 0
mov edx, byebyeOffset
add [eax + exc_ctx_eip], edx
mov eax, 0
ret 0
sehHandler ENDP
_TEXT ENDS
END
How do I get the asm-module of my program /SAFESEH-compatible?
Why does this program consume so much userland CPU-time? The library-code being called by the operating-system after the exception has begun to be handled should have only to save all the registers in the CONTEXT-structure, fill the EXCEPTION_RECORD-structure, call the topmost exception-filter which - in this case - shifts the execution two instructions further, and when it returns it will in my case restore all the registers an continue execution according to what I returned in EAX. That's should all not be so much time that almost 1/3 of the CPU-time will be spent in userland. That's about 2,3ms, i.e. when my old Ryzen 1800X is boosting on one core with 4GHz, about 5.200 clock-cycles.
I'm using the byebyeOffset-variable in my code to carry the distance between the unsafe instruction that might generate an access-violation and the safe code afterwards. I'm initializing this variable before the unsafe instruction. But it would be nice to have this offset statically as an immediate at the point where I add it on EIP in the exception-filter function sehHandler; but the offsets are scoped to getPointerFaultSafe. Of course storing the offset and fetching it from the variable take a negligible part of the overall computation time, but it would be nicer to have a clean solution.
I'm trying to convert assembly written in AT&T syntax from a DevC++ project to inline assembly in Visual Studio.
This is the AT&T I'm trying to convert:
void Painter::drawRectangle(int surface, int x, int y, int width, int height, int red, int green, int blue) {
asm("mov %0, %%eax":: "r" (0x004EAA90));
asm("call *%eax");
asm("mov %eax, %ecx");
asm("mov (%ecx), %eax");
asm("push %0":: "m" (blue));
asm("push %0":: "m" (green));
asm("push %0":: "m" (red));
asm("push %0":: "m" (height));
asm("push %0":: "m" (width));
asm("push %0":: "m" (y));
asm("push %0":: "m" (x));
asm("push %0":: "m" (surface));
asm("call *0x14(%eax)");
}
what i've done so far:
void _drawrectangle(int surface, int x, int y, int width, int height, int red, int green, int blue)
{
__asm
{
mov eax, 0x004eaa90
call dword ptr [eax]
mov ecx, eax
mov eax, [ecx]
push blue
push green
push red
push height
push width
push y
push x
push surface
call dword ptr [eax + 0x14]
}
}
I'm writing this in my DLL, which I've already injected into the game. The game crashes on opening. And I've already hooked another drawing function in C++, which worked.
Hopefully you can help me/guide me in the right direction. Thank you.
Here's how you could write your function in C++ without the use of inline assembly:
#ifndef _MSC_VER
/* For GCC and clang */
#undef __thiscall
#define __thiscall __attribute__((thiscall))
#endif
struct some_interface {
virtual void _unknown_0() = 0;
virtual void _unknown_4() = 0;
virtual void _unknown_8() = 0;
virtual void _unknown_C() = 0;
virtual void _unknown_10() = 0;
virtual void __thiscall drawRectangle(int surface, int x, int y,
int width, int height,
int red, int green, int blue) = 0;
};
const auto get_interface = (some_interface *(*)()) 0x4EAA90;
void
drawRectangle(int surface, int x, int y, int width, int height,
int red, int green, int blue) {
get_interface()->drawRectangle(surface, x, y, width, height,
red, green, blue);
}
The code you're trying to translate first calls a function that returns a pointer to some class object with at least 6 virtual methods defined. It then calls the 6th virtual method of that object. The some_interface struct minimally recreates that class so the 6th virtual method can be called. The get_interface constant is a function pointer that points to the function located at 0x4EAA90 and in C++ function pointers can be used just like a function.
The above code generates the following assembly in GCC 8.2:
drawRectangle(int, int, int, int, int, int, int, int):
subl $12, %esp
movl $5155472, %eax
call *%eax
movl (%eax), %edx
movl %eax, %ecx
pushl 44(%esp)
pushl 44(%esp)
pushl 44(%esp)
pushl 44(%esp)
pushl 44(%esp)
pushl 44(%esp)
pushl 44(%esp)
pushl 44(%esp)
call *20(%edx)
addl $12, %esp
ret
and the following assembly with Visual C++ 2017:
void drawRectangle(int,int,int,int,int,int,int,int) PROC ; drawRectangle, COMDAT
mov eax, 5155472 ; 004eaa90H
call eax
push DWORD PTR _blue$[esp-4]
mov ecx, eax
push DWORD PTR _green$[esp]
mov edx, DWORD PTR [eax]
push DWORD PTR _red$[esp+4]
push DWORD PTR _height$[esp+8]
push DWORD PTR _width$[esp+12]
push DWORD PTR _y$[esp+16]
push DWORD PTR _x$[esp+20]
push DWORD PTR _surface$[esp+24]
call DWORD PTR [edx+20]
ret 0
void drawRectangle(int,int,int,int,int,int,int,int) ENDP ; drawRectangle
# x at %ebp+8, n at %ebp+12
movl 8(%ebp), %esi
movl 12(%ebp), %ebx
movl $-1, %edi
movl $1, %edx
.L2:
movl %edx, %eax
andl %esi, %eax
xorl %eax, %edi
movl %ebx, %ecx
sall %cl, %edx
testl %edx, %edx
jne .L2
movl %edi, %eax
I converted the above code to the below code, but i am not completely sure if it is correct.
int loop(int x, int n){
int result = -1;
for (mask = 1; mask >= result; mask = x&1) {
result ^= n;
}
return result;
}
x and n are two integers stored in %ebp memory and are moved to registry %esi and %ebx. Result and Mask have a value of -1 and 1 and that is from the first part of the code
I think after .L2: the loop starts and that is where i get confused.
At the end result is returned movl %edi, %eax
Your code is completely wrong. You should have done some test by yourself before posting quesetions.
First of all, mask in your code is not declared in your function.
Then, after declareing mask as int, the function loop will fall into an infinite loop when result won't become positive via result ^= n;. On the other hand, the assembly code won't fall into an infinite loop unless n is multiple of 32 (including zero).
To convert the code to assembly:
1. I did direct conversion from assembly to C.
Note that I used unsigned type uint32_t because
Use unsigned type because left shift operation to signed integer will cause undefined behavior when overflow occures or the value to be shifted is negative.
Use uint32_t because size of unsigned int is dependent to environments and it may be less than 32-bit long while registers used here (except for %cl) are 32-bit long.
Quote from N1570 6.5.7 Bitwise shift operators:
4 The result of E1 << E2 is E1 left-shifted E2 bit positions; vacated bits are filled with zeros. If E1 has an unsigned type, the value of the result is E1 × 2E2, reduced modulo one more than the maximum value representable in the result type. If E1 has a signed type and nonnegative value, and E1 × 2E2 is representable in the result type, then that is the resulting value; otherwise, the behavior is undefined.
Also note that stdint.h or inttypes.h has to be included to use uint32_t.
The width to shift is masked to 5-bit long in x86 CPUs that is 80286 or later.
uint32_t loop(uint32_t x, uint32_t n) {
uint32_t esi = x; /* movl 8(%ebp), %esi */
uint32_t ebx = n; /* movl 12(%ebp), %ebx */
uint32_t edi = -1; /* movl $-1, %edi */
uint32_t edx = 1; /* movl $1, %edx */
uint32_t eax, ecx;
do { /* .L2: */
eax = edx; /* movl %edx, %eax */
eax &= esi; /* andl %esi, %eax */
edi ^= eax; /* xorl %eax, %edi */
ecx = ebx; /* movl %ebx, %ecx */
edx <<= (ecx & 0xff) & 31; /* sall %cl, %edx */
} while (edx != 0); /* testl %edx, %edx ; jne .L2 */
eax = edi; /* movl %edi, %eax */
return eax;
}
2. I introduced variable names to make their roles clear.
uint32_t loop(uint32_t x, uint32_t n) {
uint32_t result = -1;
uint32_t mask = 1;
uint32_t eax, ecx;
do {
eax = mask;
eax &= x;
result ^= eax;
ecx = n;
mask <<= (ecx & 0xff) & 31;
} while (mask != 0);
return result;
}
3. I merged some expressions.
uint32_t loop(uint32_t x, uint32_t n) {
uint32_t result = -1;
uint32_t mask = 1;
do {
result ^= mask & x;
mask <<= n & 31;
} while (mask != 0);
return result;
}
4. I changed do loop to for loop because your attempt uses it.
uint32_t loop(uint32_t x, uint32_t n) {
uint32_t result = -1;
uint32_t mask;
for (mask = 1; mask != 0; mask <<= n & 31) {
result ^= mask & x;
}
return result;
}
Full code for testing and demo:
#include <stdio.h>
#include <inttypes.h>
#include <limits.h>
__asm__ (
/* support both environments that does and doesn't add underscore before function name */
"loop_asm:\n"
"_loop_asm:\n"
"push %ebp\n"
"mov %esp, %ebp\n"
"push %esi\n"
"push %edi\n"
"push %ebx\n"
"# x at %ebp+8, n at %ebp+12\n"
"movl 8(%ebp), %esi\n"
"movl 12(%ebp), %ebx\n"
"movl $-1, %edi\n"
"movl $1, %edx\n"
".L2_test:\n" /* rename .L2 to .L2_test to avoid collision */
"movl %edx, %eax\n"
"andl %esi, %eax\n"
"xorl %eax, %edi\n"
"movl %ebx, %ecx\n"
"sall %cl, %edx\n"
"testl %edx, %edx\n"
"jne .L2_test\n"
"movl %edi, %eax\n"
"pop %ebx\n"
"pop %edi\n"
"pop %esi\n"
"leave\n"
"ret\n"
);
uint32_t loop_asm(uint32_t, uint32_t);
uint32_t loop_convert(uint32_t x, uint32_t n) {
uint32_t result = -1;
uint32_t mask;
for (mask = 1; mask != 0; mask <<= n & 31) {
result ^= mask & x;
}
return result;
}
int mask;
int loop(int x, int n){
int result = -1;
for (mask = 1; mask >= result; mask = x&1) {
result ^= n;
}
return result;
}
int main(void) {
int x, n;
uint32_t raw, test, conv;
int miss_count = 0;
/* search for mismatch in some range */
for (n = 1; n < 32; n++) {
uint32_t x_test;
for (x_test = 0; x_test < UINT32_C(100000); x_test++) {
if (loop_asm(x, n) != loop_convert(x, n)) {
printf("mismatch at x=%"PRIu32", n=%d\n", x_test, n);
if (miss_count < INT_MAX) miss_count++;
}
}
}
printf("%d mismatch(es) found.\n", miss_count);
/* print some examples */
x = 100;
n = 5;
raw = loop_asm(x, n);
conv = loop_convert(x, n);
printf("loop_asm(%d, %d) = %"PRIu32"\n", x, n, raw);
printf("loop_convert(%d, %d) = %"PRIu32"\n", x, n, conv);
fflush(stdout);
test = loop(x, n);
printf("loop(%d, %d) = %"PRIu32"\n", x, n, test);
return 0;
}