I've got a little problem using MapViewOfFile. This function returns the starting address of the mapped view, so as I understand it, it's a sequence of bytes. And this is where I've gotten stuck:
INVOKE MapViewOfFile, hMapFile, FILE_MAP_READ, 0, 0, 0
mov pMemory, eax ; save the view's base address
mov edx, DWORD PTR [pMemory] ; edx = base address of the mapped bytes
The pointer is correct, because when I save the whole block of memory to a file, everything works. So my question is: how do I refer to the individual elements (bytes)?
Thanks in advance
Cast pMemory to the correct type and walk it from pMemory up to pMemory + size of the mapped memory - size of the type you're reading...
In other words, you have effectively allocated memory and associated that memory with a file; the file changes as you change the memory.
In C, assuming pMemory is the pointer returned by MapViewOfFile:
int x = (*(int *)pMemory); // Read the first int
char c = (*(char *)pMemory); // Read the first char
typedef struct oddball { int x; int y; int z; char str[256]; } oddball;
oddball *p = (oddball *)pMemory; // point to the base of the mapped memory
p += 14; // p now points to the 15th instance of oddball in the file.
// Or... p = &p[14];
p->x = 0;
p->y = 0;
p->z = 0;
strcpy( p->str, "This is the 0, 0, 0 position" );
// You've now changed the memory to which p[14] refers.
// To read every byte... (again in C; use the compiler to generate the asm)
// Assumes:
// fileSize is the size of the mapped memory in bytes
// pMemory is the pointer returned by MapViewOfFile
// buffer is a block of memory that will hold n bytes
// pos is the position from which you want to read
// n is the number of bytes to read starting at pos; buffer must be at least n bytes
void readBytes( unsigned int fileSize, char *pMemory, char *buffer, unsigned int n, unsigned int pos )
{
char *endP = pMemory + fileSize;
char *start = pMemory + pos;
char *end = start + n;
int i = 0;
// Code to stay within your memory boundaries
if( end > endP )
{
n -= (end - endP); // clamp n to the bytes actually available (only affects this local copy)
end = endP;
}
if( end < start )
return;
// end boundary check
for( ; start < end; start++, i++ )
{
buffer[i] = *start;
}
}
Here's the asm code the compiler generated from the code above with -O2:
.686P
.XMM
.model flat
PUBLIC _readBytes
_TEXT SEGMENT
_fileSize$ = 8 ; size = 4
_pMemory$ = 12 ; size = 4
_buffer$ = 16 ; size = 4
_n$ = 20 ; size = 4
_pos$ = 24 ; size = 4
_readBytes PROC ; COMDAT
mov eax, DWORD PTR _pMemory$[esp-4]
mov edx, DWORD PTR _fileSize$[esp-4]
mov ecx, DWORD PTR _n$[esp-4]
add edx, eax
add eax, DWORD PTR _pos$[esp-4]
add ecx, eax
cmp ecx, edx
jbe SHORT $LN5@readBytes
mov ecx, edx
$LN5@readBytes:
cmp eax, ecx
jae SHORT $LN1@readBytes
push esi
mov esi, DWORD PTR _buffer$[esp]
sub esi, eax
$LL3@readBytes:
mov dl, BYTE PTR [eax]
mov BYTE PTR [esi+eax], dl
inc eax
cmp eax, ecx
jb SHORT $LL3@readBytes
pop esi
$LN1@readBytes:
ret 0
_readBytes ENDP
_TEXT ENDS
END
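And back in MASM terms, here's a minimal sketch of walking the view byte by byte, matching the style of the question's code. viewSize is an assumed DWORD variable holding the size of the mapped view in bytes; pMemory is the pointer saved from MapViewOfFile:
mov esi, pMemory ; base address of the mapped view
xor ecx, ecx ; byte index
nextByte:
movzx eax, BYTE PTR [esi+ecx] ; AL = the byte at index ECX
; ... do something with AL here ...
inc ecx
cmp ecx, viewSize ; viewSize = size of the view (assumption)
jb nextByte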
Related
I've developed a little program that tests the performance of 32-bit Windows structured exception handling. To keep the overhead minimal relative to the rest, I wrote the code that generates and filters the exception in assembly.
This is the C++ code:
#include <Windows.h>
#include <iostream>
using namespace std;
bool __fastcall getPointerFaultSafe( void *volatile *from, void **to );
int main()
{
auto getThreadTimes = []( LONGLONG &kt, LONGLONG &ut )
{
union
{
FILETIME ft;
LONGLONG ll;
} creationTime, exitTime, kernelTime, userTime;
GetThreadTimes( GetCurrentThread(), &creationTime.ft, &exitTime.ft, &kernelTime.ft, &userTime.ft );
kt = kernelTime.ll;
ut = userTime.ll;
};
LONGLONG ktStart, utStart;
getThreadTimes( ktStart, utStart );
size_t const COUNT = 100'000;
void *pv;
for( size_t c = COUNT; c; --c )
getPointerFaultSafe( nullptr, &pv );
LONGLONG ktEnd, utEnd;
getThreadTimes( ktEnd, utEnd );
double ktNsPerException = (ktEnd - ktStart) * 100.0 / COUNT, // FILETIME ticks are 100ns units
utNsPerException = (utEnd - utStart) * 100.0 / COUNT;
cout << "kernel-time per exception: " << ktNsPerException << "ns" << endl;
cout << "user-time per exception: " << utNsPerException << "ns" << endl;
return 0;
}
This is the assembly code:
.686P
PUBLIC ?getPointerFaultSafe@@YI_NPCRAXPAPAX@Z
PUBLIC sehHandler
.SAFESEH sehHandler
sehHandler PROTO
_DATA SEGMENT
byebyeOffset dd 0
_DATA ENDS
exc_ctx_eax = 0b0h
exc_ctx_eip = 0b8h
_TEXT SEGMENT
?getPointerFaultSafe@@YI_NPCRAXPAPAX@Z PROC
ASSUME ds:_DATA
push OFFSET sehHandler
push dword ptr fs:0
mov dword ptr fs:0, esp
mov byebyeOffset, OFFSET byebye - OFFSET mightfail
mov al, 1
mightfail:
mov ecx, dword ptr [ecx]
mov dword ptr [edx], ecx
byebye:
mov edx, dword ptr [esp]
mov dword ptr fs:0, edx
add esp, 8
ret 0
?getPointerFaultSafe@@YI_NPCRAXPAPAX@Z ENDP
sehHandler PROC
mov eax, dword ptr [esp + 12]
mov dword ptr [eax + exc_ctx_eax], 0
mov edx, byebyeOffset
add [eax + exc_ctx_eip], edx
mov eax, 0
ret 0
sehHandler ENDP
_TEXT ENDS
END
How do I make the asm module of my program /SAFESEH-compatible?
Why does this program consume so much userland CPU time? The library code the operating system calls once the exception is being handled should only have to save all the registers in the CONTEXT structure, fill in the EXCEPTION_RECORD structure, and call the topmost exception filter, which in this case shifts execution two instructions further; when it returns, the registers are restored and execution continues according to what I returned in EAX. That shouldn't take so long that almost a third of the CPU time ends up in userland. That's about 2.3 ms, i.e. when my old Ryzen 1800X is boosting on one core at 4 GHz, about 5,200 clock cycles.
I'm using the byebyeOffset variable in my code to carry the distance between the unsafe instruction that might generate an access violation and the safe code after it. I initialize this variable before the unsafe instruction. But it would be nice to have this offset statically, as an immediate, at the point where I add it to EIP in the exception filter sehHandler; unfortunately the offsets are scoped to getPointerFaultSafe. Of course, storing the offset and fetching it from the variable takes a negligible part of the overall computation time, but it would be nicer to have a clean solution.
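On that last point: MASM code labels written with a single colon are local to the enclosing PROC, but a double colon (mightfail::, byebye::) makes them visible module-wide, and the difference of two labels in the same segment is an assembly-time constant. So, if I'm not mistaken, declaring the two labels with double colons lets you drop byebyeOffset (and the mov that initializes it) entirely and write the filter with an immediate. A sketch under that assumption:
sehHandler PROC
mov eax, dword ptr [esp + 12] ; -> CONTEXT record
mov dword ptr [eax + exc_ctx_eax], 0 ; fault result: EAX = 0
add dword ptr [eax + exc_ctx_eip], byebye - mightfail ; constant, folded at assembly time
mov eax, 0 ; ExceptionContinueExecution
ret 0
sehHandler ENDP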
What is the difference between nested if statements and the logical && operator, in terms of performance and logic?
if (a && b && c) {
    //do something
}

if (a) {
    if (b) {
        if (c) {
            //do something
        }
    }
}
Are the two snippets above logically equivalent?
My main concern is the performance of the code: performance-wise, which is the best to use?
If you convert those two snippets into assembly language (which is very close to machine language), both will be converted to exactly the same code:
C:
void Main(){
int a=1, b=2, c= 3, res = 0;
if(a && b && c)
res = 100;
}
// or
void Main(){
int a=1, b=2, c= 3, res = 0;
if(a)
if(b)
if(c)
res = 100;
}
Assembly Output:
Main():
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-4], 1
mov DWORD PTR [rbp-8], 2
mov DWORD PTR [rbp-12], 3
mov DWORD PTR [rbp-16], 0
cmp DWORD PTR [rbp-4], 0
je .L3 ; jump to the end if `a` is not true
cmp DWORD PTR [rbp-8], 0
je .L3 ; jump to the end if `b` is not true
cmp DWORD PTR [rbp-12], 0
je .L3 ; jump to the end if `c` is not true
mov DWORD PTR [rbp-16], 100 ; otherwise do something
.L3:
nop
pop rbp
ret
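So yes, they are the same logic-wise, because && short-circuits: a later operand is never evaluated once an earlier one is false. That is also why both forms are equally safe for dependent conditions. A small illustrative snippet (the struct and function names are made up):
struct node { int value; };

int positive(const struct node *p)
{
    /* p->value is only read after the p check succeeds,
       in the && form and in the nested form alike */
    if (p && p->value > 0)
        return 1;
    return 0;
}
So the choice between the two forms is purely a matter of readability, not performance.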
Very specific optimisation task.
I have 3 arrays:
const char* inputTape
const int* inputOffset, organised in groups of four
char* outputTape
from which I must assemble the output tape, according to the following five operations:
int selectorOffset = inputOffset[4*i];
char selectorValue = inputTape[selectorOffset];
int outputOffset = inputOffset[4*i+1+selectorValue];
char outputValue = inputTape[outputOffset];
outputTape[i] = outputValue; // store byte
and then advance the counter.
All iterations are independent and could be done in parallel. The format of inputOffset could be changed, as long as the same input produces the same output.
OpenCL on the GPU fails on this algorithm (it runs at the same speed as the CPU, or even slower).
In assembly, the best I got is 5 mov, 1 lea, and 1 dec instructions per iteration. Upd: thanks to Peter Cordes' little hint:
loop_start:
mov eax,dword ptr [rdx-10h] ; selector offset
movzx r10d,byte ptr [rax+r8] ; selector value
mov eax,dword ptr [rdx+r10*4-0Ch] ; output offset
movzx r10d,byte ptr [r8+rax] ; output value
mov byte ptr [r9+rcx-1],r10b ; store to outputTape
lea rdx, [rdx-10h] ; advance (downward) to the inputOffset group for the next iteration
dec ecx ; loop counter, sets zero flag if (ecx == 0)
jne loop_start ; continue looping while non zero iterations left: ( ecx != 0 )
How could I optimise this with SSE/AVX? I'm stumped...
UPD: better to see it than to hear it...
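For what it's worth, here is one way such a kernel might look with AVX2 gathers. This is only a sketch under several assumptions: the helper name assemble_avx2 is made up, count is a multiple of 8, selectorValue is in 0..2 (non-negative), the loop runs forward rather than backward like the asm above, and the dword gathers may read up to 3 bytes past the last tape byte they need, so the tape needs a few bytes of slack at the end. Also note the serial dependency selector-gather -> offset-gather, so gathers may well not beat the scalar loop:
#include <stddef.h>
#include <immintrin.h>

static void assemble_avx2(const char *inputTape, const int *inputOffset,
                          char *outputTape, size_t count)
{
    const __m256i lane_base = _mm256_setr_epi32(0, 4, 8, 12, 16, 20, 24, 28);
    const __m256i byte_mask = _mm256_set1_epi32(0xFF);
    const __m256i ones      = _mm256_set1_epi32(1);
    const __m256i pick_b0   = _mm256_setr_epi8(       /* keep byte 0 of each dword */
        0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);

    for (size_t i = 0; i < count; i += 8)
    {
        /* indices of inputOffset[4*(i+k)] for k = 0..7 */
        __m256i idx = _mm256_add_epi32(_mm256_set1_epi32((int)(4 * i)), lane_base);

        /* selectorOffset = inputOffset[4*i] */
        __m256i selOff = _mm256_i32gather_epi32(inputOffset, idx, 4);

        /* selectorValue = inputTape[selectorOffset]: gather a dword, keep the low byte */
        __m256i selVal = _mm256_and_si256(
            _mm256_i32gather_epi32((const int *)inputTape, selOff, 1), byte_mask);

        /* outputOffset = inputOffset[4*i + 1 + selectorValue] */
        __m256i idx2   = _mm256_add_epi32(idx, _mm256_add_epi32(ones, selVal));
        __m256i outOff = _mm256_i32gather_epi32(inputOffset, idx2, 4);

        /* outputValue = inputTape[outputOffset] */
        __m256i outVal = _mm256_and_si256(
            _mm256_i32gather_epi32((const int *)inputTape, outOff, 1), byte_mask);

        /* compress the 8 dwords down to 8 bytes and store them */
        __m256i b  = _mm256_shuffle_epi8(outVal, pick_b0);
        __m128i lo = _mm256_castsi256_si128(b);
        __m128i hi = _mm256_extracti128_si256(b, 1);
        _mm_storel_epi64((__m128i *)(outputTape + i),
                         _mm_unpacklo_epi32(lo, hi));
    }
}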
I have a C program in Visual Studio where I would like to compare the disassembly between the standard debug build and a build where speed is optimized.
For the standard debug build things work perfectly: I set a breakpoint and then right-click -> "Go To Disassembly" to view the equivalent assembly code.
However, when I try to do that with the optimize-for-speed switch enabled, I only see part of the assembly.
My procedure for creating an optimized speed build is (staying in the debug build configuration):
From the project properties -> C/C++ -> Optimization, change Optimization from Disabled (/Od) to Maximize Speed (/O2), and change Favor Size Or Speed to Favor Fast Code (/Ot)
Under Code Generation -> Basic Runtime Checks, change to Default
Note: I decided to modify my debug configuration and not do a full optimization (which includes size) because I wanted to preserve my code symbols.
As part of my investigation, I created a trivial program for testing. Here is the source:
#include<stdio.h>
int main()
{
int x = 0;
int y = 0;
printf("The value of x is %d\n", x);
while (1)
{
++x;
if (x > 1000000)
break;
}
y = x;
printf("The value of y is %d\n", y);
return 0;
}
In full debug mode, here is my disassembly. Note: I can easily see what is happening in the while loop.
int x = 0;
00CF1779 mov dword ptr [x],0
int y = 0;
00CF1780 mov dword ptr [y],0
printf("The value of x is %d\n", x);
00CF1787 mov eax,dword ptr [x]
00CF178A push eax
00CF178B push offset string "The value of x is %d\n" (0CF6B30h)
00CF1790 call _printf (0CF1320h)
00CF1795 add esp,8
while (1)
00CF1798 mov eax,1
00CF179D test eax,eax
00CF179F je main+47h (0CF17B7h)
{
++x;
00CF17A1 mov eax,dword ptr [x]
00CF17A4 add eax,1
00CF17A7 mov dword ptr [x],eax
if (x > 1000000)
00CF17AA cmp dword ptr [x],0F4240h
00CF17B1 jle main+45h (0CF17B5h)
break;
00CF17B3 jmp main+47h (0CF17B7h)
}
00CF17B5 jmp main+28h (0CF1798h)
y = x;
00CF17B7 mov eax,dword ptr [x]
00CF17BA mov dword ptr [y],eax
printf("The value of y is %d\n", y);
00CF17BD mov eax,dword ptr [y]
00CF17C0 push eax
00CF17C1 push offset string "The value of y is %d\n" (0CF6BE8h)
00CF17C6 call _printf (0CF1320h)
00CF17CB add esp,8
return 0;
Here is the disassembly from the optimized speed build. Note there is no code shown for the while loop:
int x = 0;
int y = 0;
printf("The value of x is %d\n", x);
00EE16F0 push 0
00EE16F2 push offset string "The value of x is %d\n" (0EE6B30h)
00EE16F7 call _printf (0EE1320h)
while (1)
{
++x;
if (x > 1000000)
break;
}
y = x;
printf("The value of y is %d\n", y);
00EE16FC push 0F4241h
00EE1701 push offset string "The value of y is %d\n" (0EE6BE8h)
00EE1706 call _printf (0EE1320h)
00EE170B add esp,10h
return 0;
00EE170E xor eax,eax
}
00EE1710 ret
Both builds output the same values for x and y in the printf statements.
Does anyone know how to solve my problem?
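What has happened is that /O2 constant-folded the whole loop: the second printf is handed the immediate 0F4241h (1000001) directly, so there simply is no loop code left to display. If the goal is just to inspect the loop's optimized codegen, one workaround (a sketch, not the only option) is to declare x volatile, which forces the compiler to keep every load and store:
#include <stdio.h>

int main(void)
{
    volatile int x = 0;   /* volatile: the optimizer must keep every access */
    int y = 0;

    printf("The value of x is %d\n", x);
    while (1)
    {
        ++x;
        if (x > 1000000)
            break;
    }
    y = x;
    printf("The value of y is %d\n", y);
    return 0;
}
Of course, the code you then see is the codegen for the volatile version, not for the original, so this is only useful for studying the loop's shape, not its true optimized cost.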
I have a function in Visual Studio to which I pass a pointer. I then want to store that pointer in a register to manipulate it further. How do I do that?
I have tried
float __declspec(align(16)) x[16] =
{
0.125000, 0.125000, 0.125000, 0,
-0.125000, 0.125000, -0.125000, 0,
0.125000, -0.125000, -0.125000, 0,
-0.125000, -0.125000, 0.125000, 0
};
void e()
{
__asm mov eax, x // doesn't work
__asm mov ebx, [eax]
}
void f(float *p)
{
__asm mov eax, p // does work
__asm mov ebx, [eax]
}
int main()
{
f(x);
e();
}
Option 1 actually seems to work fine. Consider the following program:
#include <stdio.h>
void f(int *p) {
__asm mov eax, p
__asm mov ebx, [eax]
// break here
}
void main()
{
int i = 0x12345678;
f(&i);
}
With Visual Studio 2008 SP1, a single-file C++ program and debug build, I'm getting the following in the registers window when stepping into the end of f():
EAX = 004DF960
EBX = 12345678
ECX = 00000000
EDX = 00000001
ESI = 00000000
EDI = 004DF884
EIP = 003013C3
ESP = 004DF7B8
EBP = 004DF884
EFL = 00000202
Looking at the values in EAX, EBX and ESP, that looks like pretty good evidence that you actually have the pointer you wanted in EAX. The address in EAX is just a tad higher than the one in ESP, suggesting it's one frame higher up the stack. The dereferenced value loaded into EBX suggests we got the right address.
Loading the address of a global is subtly different. The following example uses the LEA instruction to accomplish the task.
#include <stdio.h>
int a[] = { 0x1234, 0x4567 };
void main()
{
// __asm mov eax, a ; interpreted as eax <- a[0]
__asm lea eax, a ; interpreted as eax <- &a[0]
__asm mov ebx, [eax]
__asm mov ecx, [eax+4]
// break here
}
Stepping to the end of main() gets you the following register values. EAX gets the address of the first element of the array, while EBX and ECX get the values of its elements.
EAX = 00157038
EBX = 00001234
ECX = 00004567
EDX = 00000001
ESI = 00000000
EDI = 0047F800
EIP = 001513C9
ESP = 0047F734
EBP = 0047F800
EFL = 00000202
The magic isn't in the LEA instruction itself. Rather, it appears that the __asm directive treats C/C++ identifiers differently depending on whether a MOV or an LEA instruction is used. Here is the ASM dump of the same program, when the MOV instruction is uncommented. Notice how the MOV instruction gets the content of a[] as its argument (DWORD PTR), while the LEA instruction gets its offset.
; ...
PUBLIC ?a@@3PAHA ; a
_DATA SEGMENT
?a@@3PAHA DD 01234H ; a
DD 04567H
_DATA ENDS
; ...
mov eax, DWORD PTR ?a@@3PAHA
lea eax, OFFSET ?a@@3PAHA
mov ebx, DWORD PTR [eax]
mov ecx, DWORD PTR [eax+4]
; ...
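As a side note, the inline assembler also accepts the OFFSET operator on a C identifier, which should load the address just like the LEA form (worth double-checking in your own build, since __asm's handling of OFFSET has historically been quirky):
__asm lea eax, a ; eax <- &a[0]
__asm mov eax, offset a ; same effect, using OFFSET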
I'm not sure if this is correct, but have you tried casting p to an int first and then loading that value?
void f(void *p)
{
int tmp = (int)p;
// asm stuff...
}
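For completeness, a minimal fleshed-out version of that idea (a sketch; it assumes a 32-bit build, since the pointer value is squeezed into an int):
void f(void *p)
{
    int tmp = (int)p;       /* pointer value as a 32-bit integer */
    __asm mov eax, tmp      /* eax now holds the pointer */
    __asm mov ebx, [eax]    /* ebx = the first dword it points to */
}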