I have a C program in Visual Studio where I would like to compare the disassembly between the standard debug build and a build where speed is optimized.
For the standard debug build things work perfectly: I set a breakpoint and then right-click "Go To Disassembly" to view the equivalent assembly code.
However, when I try to do that when the optimized for speed switch is enabled, I only see part of the assembly.
My procedure for creating an optimized speed build is (staying in the debug build configuration):
From the project properties -> C/C++ -> Optimization, change Optimization from 'Disabled (/Od)' to 'Maximize Speed (/O2)' and change Favor Size or Speed to 'Favor Fast Code (/Ot)'
Under Code Generation ->Basic Runtime checks change to 'Default'
Note: I decided to modify my debug configuration and not do a full optimization (which includes size) because I wanted to preserve my code symbols.
As part of my investigation, I created a trivial program for testing. Here is the source:
#include<stdio.h>
/* Test program: prints x, counts x up in a loop, copies it to y and prints y. */
int main()
{
int x = 0;
int y = 0;
printf("The value of x is %d\n", x);
/* Increment x until it first exceeds 1000000, so the loop exits with x == 1000001. */
while (1)
{
++x;
if (x > 1000000)
break;
}
y = x;
printf("The value of y is %d\n", y);
return 0;
}
In full debug mode here is my disassembly: Note: I can easily see what is happening in the WHILE LOOP
int x = 0;
00CF1779 mov dword ptr [x],0
int y = 0;
00CF1780 mov dword ptr [y],0
printf("The value of x is %d\n", x);
00CF1787 mov eax,dword ptr [x]
00CF178A push eax
00CF178B push offset string "The value of x is %d\n" (0CF6B30h)
00CF1790 call _printf (0CF1320h)
00CF1795 add esp,8
while (1)
00CF1798 mov eax,1
00CF179D test eax,eax
00CF179F je main+47h (0CF17B7h)
{
++x;
00CF17A1 mov eax,dword ptr [x]
00CF17A4 add eax,1
00CF17A7 mov dword ptr [x],eax
if (x > 1000000)
00CF17AA cmp dword ptr [x],0F4240h
00CF17B1 jle main+45h (0CF17B5h)
break;
00CF17B3 jmp main+47h (0CF17B7h)
}
00CF17B5 jmp main+28h (0CF1798h)
y = x;
00CF17B7 mov eax,dword ptr [x]
00CF17BA mov dword ptr [y],eax
printf("The value of y is %d\n", y);
00CF17BD mov eax,dword ptr [y]
00CF17C0 push eax
00CF17C1 push offset string "The value of y is %d\n" (0CF6BE8h)
00CF17C6 call _printf (0CF1320h)
00CF17CB add esp,8
return 0;
Here is the disassembly from the optimized speed build - note there is no code shown for the while loop routine
int x = 0;
int y = 0;
printf("The value of x is %d\n", x);
00EE16F0 push 0
00EE16F2 push offset string "The value of x is %d\n" (0EE6B30h)
00EE16F7 call _printf (0EE1320h)
while (1)
{
++x;
if (x > 1000000)
break;
}
y = x;
printf("The value of y is %d\n", y);
00EE16FC push 0F4241h
00EE1701 push offset string "The value of y is %d\n" (0EE6BE8h)
00EE1706 call _printf (0EE1320h)
00EE170B add esp,10h
return 0;
00EE170E xor eax,eax
}
00EE1710 ret
Both builds output the same values for x and y in the printf statements.
Does anyone know how to solve my problem?
Related
I've developed a little program that tests the performance of 32-bit Windows structured exception handling. To keep the overhead minimal compared to the rest, I wrote the code generating and filtering the exception in assembly.
This is the C++-code:
#include <Windows.h>
#include <iostream>
using namespace std;
bool __fastcall getPointerFaultSafe( void *volatile *from, void **to );
int main()
{
auto getThreadTimes = []( LONGLONG &kt, LONGLONG &ut )
{
union
{
FILETIME ft;
LONGLONG ll;
} creationTime, exitTime, kernelTime, userTime;
GetThreadTimes( GetCurrentThread(), &creationTime.ft, &exitTime.ft, &kernelTime.ft, &userTime.ft );
kt = kernelTime.ll;
ut = userTime.ll;
};
LONGLONG ktStart, utStart;
getThreadTimes( ktStart, utStart );
size_t const COUNT = 100'000;
void *pv;
for( size_t c = COUNT; c; --c )
getPointerFaultSafe( nullptr, &pv );
LONGLONG ktEnd, utEnd;
getThreadTimes( ktEnd, utEnd );
double ktNsPerException = (ktEnd - ktStart) * 100.0 / COUNT,
utNsPerException = (utEnd - utStart) * 100.0 / COUNT;
cout << "kernel-time per exception: " << ktNsPerException << "ns" << endl;
cout << "user-time per exception: " << utNsPerException << "ns" << endl;
return 0;
}
This is the assembly-code:
; 32-bit MASM module: getPointerFaultSafe copies *from to *to while an
; exception handler (sehHandler) it installs itself is active.
.686P
PUBLIC ?getPointerFaultSafe@@YI_NPCRAXPAPAX@Z
PUBLIC sehHandler
.SAFESEH sehHandler
sehHandler PROTO
_DATA SEGMENT
; Distance in bytes from label mightfail to label byebye. Written by
; getPointerFaultSafe before the risky load, added to the saved EIP by sehHandler.
byebyeOffset dd 0
_DATA ENDS
; Offsets sehHandler applies to the context pointer it loads from [esp + 12]:
; the saved EAX and saved EIP slots.
exc_ctx_eax = 0b0h
exc_ctx_eip = 0b8h
_TEXT SEGMENT
; bool __fastcall getPointerFaultSafe( void *volatile *from, void **to )
; In:  ecx = from, edx = to (__fastcall register arguments)
; Out: al = 1 when the copy ran through; sehHandler zeroes the saved EAX,
;      so al = 0 when the handler redirected execution to byebye.
; NOTE(review): sehHandler adds byebyeOffset to whatever EIP faulted. That lands
; on byebye only for a fault at mightfail (the load); a fault in the following
; store would resume past byebye.
?getPointerFaultSafe@@YI_NPCRAXPAPAX@Z PROC
ASSUME ds:_DATA
push OFFSET sehHandler          ; new registration record: handler address ...
push dword ptr fs:0             ; ... followed by the current value of fs:0
mov dword ptr fs:0, esp         ; fs:0 now points at the record on the stack
mov byebyeOffset, OFFSET byebye - OFFSET mightfail
mov al, 1                       ; default return value: true
mightfail:
mov ecx, dword ptr [ecx]        ; ecx = *from (the access that may fault)
mov dword ptr [edx], ecx        ; *to = ecx
byebye:
mov edx, dword ptr [esp]        ; value fs:0 had on entry
mov dword ptr fs:0, edx         ; restore it
add esp, 8                      ; drop the two-dword record
ret 0
?getPointerFaultSafe@@YI_NPCRAXPAPAX@Z ENDP
; Exception handler registered by getPointerFaultSafe.
; Patches the saved context so execution resumes at byebye with EAX = 0.
sehHandler PROC
mov eax, dword ptr [esp + 12]          ; eax = context pointer (third stack argument)
mov dword ptr [eax + exc_ctx_eax], 0   ; saved EAX = 0 -> interrupted function returns false
mov edx, byebyeOffset
add [eax + exc_ctx_eip], edx           ; move saved EIP forward by the mightfail -> byebye distance
mov eax, 0                             ; return value 0 (ExceptionContinueExecution)
ret 0
sehHandler ENDP
_TEXT ENDS
END
How do I get the asm-module of my program /SAFESEH-compatible?
Why does this program consume so much userland CPU time? The library code called by the operating system once the exception starts being handled should only have to save all the registers in the CONTEXT structure, fill the EXCEPTION_RECORD structure, and call the topmost exception filter, which - in this case - shifts execution two instructions further; when the filter returns, the registers are restored and execution continues according to what I returned in EAX. All of that should not take so much time that almost 1/3 of the CPU time is spent in userland. That's about 2,3ms, i.e. when my old Ryzen 1800X is boosting on one core with 4GHz, about 5.200 clock-cycles.
I'm using the byebyeOffset-variable in my code to carry the distance between the unsafe instruction that might generate an access-violation and the safe code afterwards. I'm initializing this variable before the unsafe instruction. But it would be nice to have this offset statically as an immediate at the point where I add it on EIP in the exception-filter function sehHandler; but the offsets are scoped to getPointerFaultSafe. Of course storing the offset and fetching it from the variable take a negligible part of the overall computation time, but it would be nicer to have a clean solution.
What is the difference between nested if (condition) statements and the logical && operator in terms of performance and logic?
if(a && b && c){
//do something
}
if(a){
if(b){
if(c){
//do something
}
}
}
Are the above codes same logic wise?
My main concern is performance of the code, performance wise which is the best to use ?
If you compile those two snippets to assembly language (which is very close to machine language), both are converted to exactly the same code (first code, second code):
C:
// Variant 1: one condition combining a, b and c with &&.
void Main(){
int a=1, b=2, c= 3, res = 0;
// res becomes 100 only when a, b and c are all non-zero.
if(a && b && c)
res = 100;
}
// or
// Variant 2: the same test written as three nested if statements.
void Main(){
int a=1, b=2, c= 3, res = 0;
// Each inner if is reached only when the outer condition was non-zero,
// so res becomes 100 only when a, b and c are all non-zero.
if(a)
if(b)
if(c)
res = 100;
}
Assembly Output:
Main():
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-4], 1
mov DWORD PTR [rbp-8], 2
mov DWORD PTR [rbp-12], 3
mov DWORD PTR [rbp-16], 0
cmp DWORD PTR [rbp-4], 0
je .L3 ; jump to the end if `a` is not true
cmp DWORD PTR [rbp-8], 0
je .L3 ; jump to the end if `b` is not true
cmp DWORD PTR [rbp-12], 0
je .L3 ; jump to the end if `c` is not true
mov DWORD PTR [rbp-16], 100 ; otherwise do something
.L3:
nop
pop rbp
ret
Using WinDBG for debugging the assembly code of an executable, it seems that the compiler inserts some other code between two sequential statements. The statements are pretty simple, e.g. they don't work with complex objects or function calls:
int a, b;
char c;
long l;
a = 0; // @@
b = a + 1; // %%
c = 1; // ##
l = 1000000;
l = l + 1;
And the disassembly is
@@ 008a1725 c745f800000000 mov dword ptr [ebp-8],0
008a172c 80bd0bffffff00 cmp byte ptr [ebp-0F5h],0 ss:002b:0135f71f=00
008a1733 750d jne test!main+0x42 (008a1742)
008a1735 687c178a00 push offset test!main+0x7c (008a177c)
008a173a e893f9ffff call test!ILT+205(__RTC_UninitUse) (008a10d2)
008a173f 83c404 add esp,4
008a1742 8b45ec mov eax,dword ptr [ebp-14h]
%% 008a1745 83c001 add eax,1
008a1748 c6850bffffff01 mov byte ptr [ebp-0F5h],1
008a174f 8945ec mov dword ptr [ebp-14h],eax
## 008a1752 c645e301 mov byte ptr [ebp-1Dh],1
Please note that @@, %% and ## in the disassembly list show the corresponding C++ lines.
So what are that call, cmp, jne and push?
It is the compiler's run-time error checking (RTC): the RTC switch checks for the use of uninitialized variables. I think you can manage it from Visual Studio (compiler options).
For more information, take a look to this. Section /RTCu switch
I am pretty new to assembly, which I have been learning for the last 7 hours (it's an early peek into the courses I will have next semester, starting next month). I read some online tutorials and the NASM manual, and started to port a C program to NASM, just for learning.
/*
 * Returns n! computed recursively.
 *
 * The recursion stops at n <= 0, where 1 is returned. With the previous
 * test (n < 0) the chain reached n == 0 and multiplied by 0, so every
 * non-negative input produced 0. Negative inputs still return 1.
 *
 * The result is an int, so it overflows for n > 12 with a 32-bit int.
 */
int fact(int n)
{
    return (n <= 0) ? 1 : n * fact(n - 1);
}
I then started to port it to assembly, and had this as my solution:
fact:
; int fact(int n)
; In:  ebx = n
; Out: eax = result; ebx holds its entry value again on return
cmp dword ebx, 0 ; base case reached (n == 0)?
je .yes
.no:
push ebx ; save n on the stack
sub ebx, dword 1 ; ebx = n - 1
call fact ; eax = fact(n - 1)
pop ebx ; restore n
imul eax, ebx ; eax = fact(n - 1) * n
ret
.yes:
mov eax, dword 1 ; base case: return 1
ret
I take in a DWORD (int I suppose) in the ebx register and return the value in the eax register. As you can see I am not at all using the variable i that I have declared in the .bss section. My variables are like this:
section .bss
; int i, f
; NOTE(review): i and f are accessed as dwords by the code that uses them
; (mov ebx, dword [i] / mov dword [f], eax), i.e. 4 bytes each, but resb 2
; reserves only 2 bytes per variable, so a dword access to i also covers f.
; Reserving 4 bytes each (resd 1) would keep the two variables separate.
i resb 2
f resb 2
It's 2 bytes for an int right? Okay then I'm prompting the user in the _main, getting the input with _scanf and then calling the function. Other than this and calling the function, I have no other code that changes the value of i variable.
mov ebx, dword [i] ; check for validity of the factorial value
cmp dword ebx, 0
jnl .no
.yes:
push em ; print error message and exit
call _printf
add esp, 4
ret
.no:
push dword 0 ; print the result and exit
push dword [i]
push rm
call _printf
add esp, 12
call fact ; call the fact function
mov dword [f], eax
push dword [f] ; print the result and exit
push dword [i]
push rm
call _printf
add esp, 12
ret
I don't see where I'm modifying the value of i variable, on first print before the call to fact it is indeed the same value entered by the user, but after calling the function, in the later print, it is printing some garbage value, as the following output:
E:\ASM> factorial
Enter a number: 5
The factorial of 5 is 0The factorial of 7864325 is 120
E:\ASM>
Any clues? My complete source code is in this gist: https://gist.github.com/sriharshachilakapati/70049a778e12d8edd9c7acf6c2d44c33
I've got a little problem with using MapViewOfFile. This function returns the starting address of the mapped view, so I think it's a sequence of bytes. And this is where I'm stuck:
INVOKE MapViewOfFile, hMapFile, FILE_MAP_READ, 0, 0, 0
mov pMemory, eax
mov edx, DWORD PTR [pMemory]
The pointer is correct, because saving the whole block of memory to a file works fine. So my question is: how do I refer to every single element (byte)?
Thanks in advance
Cast pMemory to the correct type and move it around from pMemory to pMemory + size of the mapped memory - size of the type to which you refer...
In other words, you have effectively allocated memory and associated the memory with a file that is changed as you change the memory.
In C assuming pMemory is the pointer returned by MapViewOfFile:
int x = (*(int *)pMemory); // Read the first int
char c = (*(char *)pMemory); // Read the first char
typedef struct oddball { int x, int y, int z, char str[256] } oddball; // assume the typedef syntax is right...
oddball *p = (oddball *)pMemory; // point to the base of the mapped memory
p += 14; // p now points to the 15th instance of oddball in the file.
// Or... p = &p[14];
p->x = 0;
p->y = 0;
p->z = 0;
strcpy( p->str( "This is the 0, 0, 0 position" ) );
// You've now changed the memory to which p[14] refers.
// To read every byte... (Again in C; use the compiler to generate the asm.)
// Copies up to n bytes, starting at offset pos of the mapped memory, into buffer.
// The copy is clipped at the end of the mapping; nothing is copied when pos is
// at or beyond fileSize.
// Assumes:
// fileSize is the size of the mapped memory in bytes
// pMemory is the pointer returned by MapViewOfFile
// buffer is a block of memory that will hold n bytes
// pos is the position from which you want to read
// n is the number of bytes to read from position pos and the smallest size in bytes to which buffer can point
void readBytes( unsigned int fileSize, char *pMemory, char *buffer, unsigned int n, unsigned int pos )
{
    const char *src;
    unsigned int available;
    unsigned int i;
    // Bounds are checked on the unsigned offsets before any pointer is formed,
    // so no pointer outside the mapping is ever computed.
    if( pos >= fileSize )
        return;
    available = fileSize - pos;
    if( n > available )
        n = available; // clip the request to what is left in the mapping
    src = pMemory + pos;
    for( i = 0; i < n; i++ )
    {
        buffer[i] = src[i];
    }
}
Here's the asm code generated from the code above by the compiler with -O2
; Compiler listing for readBytes. The five arguments are read from the stack
; at the offsets defined below.
.686P
.XMM
.model flat
PUBLIC _readBytes
_TEXT SEGMENT
_fileSize$ = 8 ; size = 4
_pMemory$ = 12 ; size = 4
_buffer$ = 16 ; size = 4
_n$ = 20 ; size = 4
_pos$ = 24 ; size = 4
_readBytes PROC ; COMDAT
mov eax, DWORD PTR _pMemory$[esp-4] ; eax = pMemory
mov edx, DWORD PTR _fileSize$[esp-4] ; edx = fileSize
mov ecx, DWORD PTR _n$[esp-4] ; ecx = n
add edx, eax ; edx = pMemory + fileSize (end of the mapping)
add eax, DWORD PTR _pos$[esp-4] ; eax = pMemory + pos (start)
add ecx, eax ; ecx = start + n (end)
cmp ecx, edx
jbe SHORT $LN5@readBytes ; end lies inside the mapping: keep it
mov ecx, edx ; otherwise clip end to the end of the mapping
$LN5@readBytes:
cmp eax, ecx
jae SHORT $LN1@readBytes ; nothing to copy when start >= end
push esi
mov esi, DWORD PTR _buffer$[esp] ; esi = buffer (esp moved by the push above)
sub esi, eax ; esi = buffer - start, so [esi+eax] is the destination byte
$LL3@readBytes:
mov dl, BYTE PTR [eax] ; copy one byte from source to destination
mov BYTE PTR [esi+eax], dl
inc eax
cmp eax, ecx
jb SHORT $LL3@readBytes ; repeat until start reaches end
pop esi
$LN1@readBytes:
ret 0
_readBytes ENDP
_TEXT ENDS
END