rdpmc in user mode does not work even with PCE set - performance

Based on the Wikipedia entry as well as the Intel manual, rdpmc should be available to user-mode processes as long as bit 8 of CR4 is set. However, I am still running into general protection error when trying to run rdpmc from userspace even with that bit set.
I am running on an 8-core Intel X3470 on kernel 2.6.32-279.el6.x86_64.
Here is the user-mode program I am trying to execute:
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include <sched.h>
#include <assert.h>
uint64_t
read_pmc(int ecx)
{
unsigned int a, d;
__asm __volatile("rdpmc" : "=a"(a), "=d"(d) : "c"(ecx));
return ((uint64_t)a) | (((uint64_t)d) << 32);
}
int main(int ac, char **av)
{
uint64_t start, end;
cpu_set_t cpuset;
unsigned int c;
int i;
if (ac != 3) {
fprintf(stderr, "usage: %s cpu-id pmc-num\n", av[0]);
exit(EXIT_FAILURE);
}
i = atoi(av[1]);
c = atoi(av[2]);
CPU_ZERO(&cpuset);
CPU_SET(i, &cpuset);
assert(sched_setaffinity(0, sizeof(cpuset), &cpuset) == 0);
printf("%lu\n", read_pmc(c));
return 0;
}
Here is the kernel module which sets the bit and reads out CR4 so I can manually verify that the bit has been set.
/*
* Enable PMC in user mode.
*/
#include <linux/module.h>
#include <linux/kernel.h>
int init_module(void)
{
typedef long unsigned int uint64_t;
uint64_t output;
// Set CR4, Bit 8 to enable PMC
__asm__("push %rax\n\t"
"mov %cr4,%rax;\n\t"
"or $(1 << 7),%rax;\n\t"
"mov %rax,%cr4;\n\t"
"wbinvd\n\t"
"pop %rax"
);
// Read back CR4 to check the bit.
__asm__("\t mov %%cr4,%0" : "=r"(output));
printk(KERN_INFO "%lu", output);
return 0;
}
void cleanup_module(void)
{
__asm__("push %rax\n\t"
"push %rbx\n\t"
"mov %cr4,%rax;\n\t"
"mov $(1 << 7), %rbx\n\t"
"not %rbx\n\t"
"and %rbx, %rax;\n\t"
"mov %rax,%cr4;\n\t"
"wbinvd\n\t"
"pop %rbx\n\t"
"pop %rax\n\t"
);
}

Apparently, when Intel says Bit 8, they are referring to the 9th bit from the right, since their indexing begins at 0. Replacing $(1 << 7) with $(1 << 8) globally resolves the issue, and allows rdpmc to be called from user mode.
Here is the updated kernel module, also using on_each_cpu to make sure that it is set on every core.
/*
* Read PMC in kernel mode.
*/
#include <linux/module.h> /* Needed by all modules */
#include <linux/kernel.h> /* Needed for KERN_INFO */
static void printc4(void) {
typedef long unsigned int uint64_t;
uint64_t output;
// Read back CR4 to check the bit.
__asm__("\t mov %%cr4,%0" : "=r"(output));
printk(KERN_INFO "%lu", output);
}
static void setc4b8(void * info) {
// Set CR4, Bit 8 (9th bit from the right) to enable
__asm__("push %rax\n\t"
"mov %cr4,%rax;\n\t"
"or $(1 << 8),%rax;\n\t"
"mov %rax,%cr4;\n\t"
"wbinvd\n\t"
"pop %rax"
);
// Check which CPU we are on:
printk(KERN_INFO "Ran on Processor %d", smp_processor_id());
printc4();
}
static void clearc4b8(void * info) {
printc4();
__asm__("push %rax\n\t"
"push %rbx\n\t"
"mov %cr4,%rax;\n\t"
"mov $(1 << 8), %rbx\n\t"
"not %rbx\n\t"
"and %rbx, %rax;\n\t"
"mov %rax,%cr4;\n\t"
"wbinvd\n\t"
"pop %rbx\n\t"
"pop %rax\n\t"
);
printk(KERN_INFO "Ran on Processor %d", smp_processor_id());
}
int init_module(void)
{
on_each_cpu(setc4b8, NULL, 0);
return 0;
}
void cleanup_module(void)
{
on_each_cpu(clearc4b8, NULL, 0);
}

Echoing "2" to /sys/bus/event_source/devices/cpu/rdpmc allows user processes
to access performance counters via the rdpmc instruction.
Note that behaviour has changed. Prior to 4.0 "1" meant "enabled"
while meant "0" disable. Now "1" means allow only for processes that have active perf events. More details: http://man7.org/linux/man-pages/man2/perf_event_open.2.html

Related

__seg_fs on GCC. Is it possible to emulate it just in a program?

I've just read about support for %fs and %gs segment prefixes on the Intel platforms in GCC.
It was mentioned that "The way you obtain %gs-based pointers, or control the
value of %gs itself, is out of the scope of gcc;"
I'm looking for a way when I manually can set the value of %fs (I'm on IA32, RH Linux) and work with it. When I just set %fs=%ds the test below works fine and this is expected. But I cannot change the test in order to have another value of %fs and do not get a segmentation fault. I start thinking that changing the value of %fs is not the only thing to do. So I'm looking for an advice how to make a part of memory addressed by %fs that is not equal to DS.
#include <stddef.h>
typedef char __seg_fs fs_ptr;
fs_ptr p[] = {'h','e','l','l','o','\0'};
void fs_puts(fs_ptr *s)
{
char buf[100];
buf[0] = s[0];
buf[1] = s[1];
buf[2] = s[2];
buf[3] = '\0';
puts(buf);
}
void __attribute__((constructor)) set_fs()
{
__asm__("mov %ds, %bx\n\t"
"add $0, %bx\n\t" //<---- if fs=ds then the program executes as expected. If not $0 here, then segmentation fault happens.
"mov %bx, %fs\n\t");
}
int main()
{
fs_puts(p);
return 0;
}
I've talked with Armin who implemented __seg_gs/__seg_fs in GCC (Thanks Armin!).
So basically I cannot use these keywords for globals. The aim of introducing __seg_gs/fs was to have a possibility to dynamically allocate regions of memory that are thread-local.
We cannot use __thread for a pointer and to allocate a memory for it using malloc. But __seg_gs/fs introduce such possibility.
The test below somehow illustrates that.
Note that arch_prctl() was used. It exists as 64-bit version only.
Also note that %fs is used for __thread on 64-bit and %gs is free.
#include <stddef.h>
#include <string.h>
#include <stdio.h>
#include <asm/ldt.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <asm/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>
typedef __seg_gs char gs_str;
void gs_puts(gs_str *ptr)
{
int i;
char buf[100];
for(i = 0; i < 100; i++)
buf[i] = ptr[i];
puts(buf);
}
int main()
{
int i;
void *buffer = malloc(100 * sizeof(char));
arch_prctl(ARCH_SET_GS, buffer);
gs_str *gsobj = (gs_str *)0;
for (i = 0; i < 100; i++)
gsobj[i] = 'a'; /* in the %gs space */
gs_puts(gsobj);
return 0;
}

compiling error "impossible constraint in 'asm'"

I tried to compile Csmith on my "SunOS sun4v 5.10" system, but I got errors like these:
platform.cpp: In function 'long unsigned int platform_gen_seed()':
platform.cpp:78: error: impossible constraint in 'asm'
Could anyone tell where is the mistake?
#if (TARGET_CPU_powerpc == 1 || TARGET_CPU_powerpc64 == 1)
/*For PPC, got from:
http://lists.ozlabs.org/pipermail/linuxppc-dev/1999-October/003889.html
*/
static unsigned long long read_time(void) {
unsigned long long retval;
unsigned long junk;
__asm__ __volatile__ ("\n\
1: mftbu %1\n\
mftb %L0\n\
mftbu %0\n\
cmpw %0,%1\n\
bne 1b"
: "=r" (retval), "=r" (junk));
return retval;
}
#else
#ifdef WIN32
static unsigned __int64 read_time(void) {
unsigned l, h;
_asm {rdtsc
mov l, eax
mov h, edx
}
return (h << 32) + l ;
}
#else
static long long read_time(void) {
long long l;
asm volatile( "rdtsc\n\t"
: "=A" (l)
);
return l;
}
#endif
#endif
unsigned long platform_gen_seed()
{
return (long) read_time();
}
The problem is that the code you're trying to compile assumes that any target CPU that's not a PowerPC must be an x86 processor. The code simply doesn't doesn't support your SPARC CPU.
Fortunately the code doesn't seem to be critical, it's apparently only used to seed a random number generator, which is then used to create random C programs. The goal being to prevent multiple instances of the program that are started at the same time from generating the same random programs. I'd replace the code with something more portable that's not dependent on the CPU. Something like this:
#ifdef WIN32
unsigned long platform_gen_seed()
{
LARGE_INTEGER now;
QueryPerformanceCounter(&now);
return now.LowPart;
}
#else /* assume something Unix-like */
static unsigned long generic_gen_seed() {
pid_t pid = getpid();
time_t now;
time(&now);
return (unsigned long)(now ^ (pid << 16 | ((pid >> 16) & 0xFFFF)));
}
#ifdef CLOCK_REALTIME
unsigned long platform_gen_seed()
{
struct timespec now, resolution;
if (clock_gettime(CLOCK_REALTIME, &now) == -1
|| clock_getres(CLOCK_REALTIME, &resolution) == -1
|| resolution.tv_sec > 0 || resolution.tv_nsec > 1000000) {
return generic_gen_seed();
}
return (now.tv_nsec / resolution.tv_nsec
+ now.tv_sec * resolution.tv_nsec);
}
#else
unsigned long platform_gen_seed()
{
return generic_gen_seed();
}
#endif /* CLOCK_REALTIME */
#endif /* WIN32 */
The code has been test in isolation on Linux and Windows. It should also work in isolation on Solaris SPARC, but I don't know how well it work in context of the actual program.

MakeCodeWritable

good afternoon.
I got the code below on a book. I'm trying to execute it, but I don't know what is the "first" and "last" parameters on the MakeCodeWritable function, or where I can find them. Someone can help? This code is about C obfuscation method. I'm using Xcode program and LLVM GCC 4.2 compiler.
#include <stdio.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
typedef unsigned int uint32;
typedef char* caddr_t;
typedef uint32* waddr_t;
#define Tam_celula 64
#define ALIGN __attribute__((aligned(Tam_celula)))
void makeCodeWritable(char* first, char* last) {
char* firstpage = first - ((int)first % getpagesize());
char* lastpage = last - ((int)last % getpagesize());
int pages = (lastpage-firstpage)/getpagesize()+1;
if (mprotect(firstpage,pages*getpagesize(), PROT_READ|PROT_EXEC|PROT_WRITE)==-1) perror("mprotect");
}
void xor(caddr_t from, caddr_t to, int len){
int i;
for(i=0;i<len;i++){
*to ^= *from; from++; to++;
} }
void swap(caddr_t from, caddr_t to, int len){
int i;
for(i=0;i<len;i++){
char t = *from; *from = *to; *to = t; from++; to++;
} }
#define CELLSIZE 64
#define ALIGN asm volatile (".align 64\n");
void P() {
static int firsttime=1; if (firsttime) {
xor(&&cell5,&&cell2,CELLSIZE);
xor(&&cell0,&&cell3,CELLSIZE);
swap(&&cell1,&&cell4,CELLSIZE);
firsttime = 0; }
char* a[] = {&&align0,&&align1,&&align2,&&align3,&&align4,&&align5};
char*next[] ={&&cell0,&&cell1,&&cell2,&&cell3, &&cell4,&&cell5};
goto *next[0];
align0: ALIGN
cell0: printf("SPGM0\n");
xor(&&cell0,&&cell3,3*CELLSIZE);
goto *next[3];
align1: ALIGN
cell1: printf("SPGM2\n"); xor(&&cell0,&&cell3,3*CELLSIZE);
goto *next[4];
align2: ALIGN
cell2: printf("SPGM4\n"); xor(&&cell0,&&cell3,3*CELLSIZE);
goto *next[5];
align3: ALIGN
cell3: printf("SPGM1\n"); xor(&&cell3,&&cell0,3*CELLSIZE);
goto *next[1];
align4: ALIGN
cell4: printf("SPGM3\n"); xor(&&cell3,&&cell0,3*CELLSIZE);
goto *next[2];
align5: ALIGN
cell5: printf("SPGM5\n");
xor(&&cell3,&&cell0,3*CELLSIZE);
}
int main (int argc, char *argv[]) {
makeCodeWritable(...);
P(); P();
}
The first argument should be (char *)P, because it looks like you want to modify code inside function P. The second argument is the ending address of function P. You can first compile the code, and using objdump -d to see the address of beginning and end of P, then calculate the size of the function, SIZE, then manually specify in the makeCodeWritable( (char *)P, ((char *)P) + SIZE.
The second way is utilizing the as to get the size of function P, but it depends on the assembler language on your platform. This is code snipe I modified from your code, it should be able to compile and run in x86, x86_64 in GCC 4.x on Linux platform.
align5: ALIGN
cell5: printf("SPGM5\n");
xor(&&cell3,&&cell0,3*CELLSIZE);
// adding an label to the end of function P to assembly code
asm ("END_P: \n");
;
}
extern char __sizeof__myfunc[];
int main (int argc, char *argv[]) {
// calculate the code size, ending - starting address of P
asm (" __sizeof__myfunc = END_P-P \n");
// you can see the code size of P
printf("code size is %d\n", (unsigned)__sizeof__myfunc);
makeCodeWritable( (char*)P, ((char *)P) + (unsigned)__sizeof__myfunc);
P(); P();
}
With some modification to support LLVM GCC and as in Mac OS X
int main (int argc, char *argv[]) {
size_t sizeof__myfunc = 0;
asm volatile ("movq $(_END_P - _P),%0;"
: "=r" (sizeof__myfunc)
: );
printf("%d\n", sizeof__myfunc);

Inline assembly in C

Pretty self explanatory code. Why doesn't it work!
#include <stdio.h>
int main() {
__asm__("number dw 0"); // declare number?
printf("%d",number);
__asm__("mov %eax,number"
"inc %eax"
"mov number,%eax");
printf("%d",number);
return 0;
}
cc ex1.c -o ex1
ex1.c: In function ‘main’:
ex1.c:22:17: error: ‘number’ undeclared (first use in this function)
ex1.c:22:17: note: each undeclared identifier is reported only once for each function it appears in
make: *** [ex1] Error 1
Thanks.
I have a lot of knowledge gaps to fill... the gcc manual was confusing me with regards to inline assembly as was google results for tutorials...
working on an intel i7 processor
Use this syntax, you can access variables declared in C from the inline assembly
#include <stdio.h>
int main() {
int number = 0;
printf("%d\n",number);
asm(
"mov %[number],%%eax\n"
"inc %%eax\n"
"mov %%eax,%[number]\n"
: [number] "=m" (number) : "m" (number) : "eax", "cc" );
printf("%d\n",number);
return 0;
}
You can let the compiler load number into the eax register for you by specifying the "a" constraint on the input
#include <stdio.h>
int main() {
int number = 0;
printf("%d\n",number);
asm(
"inc %%eax\n"
"mov %%eax,%[number]\n"
: [number] "=m" (number) : "a" (number) : "cc" );
printf("%d\n",number);
return 0;
}
And since x86 inc instruction can operate on memory directly you could reduce it to this
#include <stdio.h>
int main() {
int number = 0;
printf("%d\n",number);
asm(
"incl %[number]\n" /* incl -> "long" (32-bits) */
: [number] "=m" (number) : "m" (number) : "cc" );
printf("%d\n",number);
return 0;
}
For more information see gcc documentation:
6.41 Assembler Instructions with C Expression Operands

Changing the Interrupt descriptor Table

I am using Linux 2.6.26 kernel version and I am trying to change the interrupt descriptor table using a kernel module. I am only trying to change the page fault table entry here. So I make a copy of the original IDT and make changes to the page fault table entry only. The objective of the ISR is to print out information of the page fault before calling the original Page fault handler. But the kernel just crashes once I load it with insmod i.e it specifically crashed with the "loadIDTR" function. With further debugging, I found out that by not changing any entry if I load the IDTR it works fine. I am out of ideas.
I have pasted the code below
#include <linux/module.h> // for init_module()
#include <linux/init.h>
#include <linux/mm.h> // for get_free_page()
#include <linux/sched.h>
#include <linux/spinlock.h>
#define SUCCESS 0
#define PGFAULT_INT 0x0E
static char modname[] = "pgfaults";
static unsigned short oldidtr[3], newidtr[3];
static unsigned long long *oldidt, *newidt;
static unsigned long isr_orig, kpage;
static char *why[]={ "sra", "srp", "swa", "swp", "ura", "urp", "uwa", "uwp" };
unsigned long long gate_desc_orig,gate_desc_orig1;
static void my_intrept( unsigned long *tos )
{
// stack-layout:
// es,ds,edi,esi,ebp,esp,ebx,edx,ecx,eax,err,eip,cs,efl
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13
volatile unsigned long vaddr;
struct task_struct *task = current;
unsigned long err = tos[ 10 ];
unsigned long eip = tos[ 11 ];
static int count = 0;
int exe, len = 0;
char msg[80]="";
// get the faulting virtual address from register CR2
asm(" mov %%cr2, %%eax ; movl %%eax, %0 " : "=m" (vaddr) );
// construct the diagnostic message
len += sprintf( msg+len, "#%-6d ", ++count );
len += sprintf( msg+len, "%16s ", task->comm );
len += sprintf( msg+len, "pid=%-5d ", task->pid );
len += sprintf( msg+len, "CR2=%08X ", (unsigned int) vaddr );
len += sprintf( msg+len, "EIP=%08X ", (unsigned int) eip );
len += sprintf( msg+len, "%s ", why[ err ] );
// note if an instruction-fetch caused the page-fault
if ( vaddr == eip ) exe = 'x'; else exe = ' ';
len += sprintf( msg+len, "%c ", exe );
// print this diagnostic message to the kernel log
printk( "<1> %s \n", msg );
}
//---------- NEW PAGE-FAULT EXCEPTION-HANDLER ---------//
asmlinkage void isr0x0E( void );
asm(" .text ");
asm(" .type isr0x0E, #function ");
asm("isr0x0E: ");
asm(" pushal ");
asm(" pushl %ds ");
asm(" pushl %es ");
//
asm(" movl %ss, %eax ");
asm(" movl %eax, %ds ");
asm(" movl %eax, %es ");
//
asm(" pushl %esp ");
asm(" call my_intrept ");
asm(" addl $4, %esp ");
//
asm(" popl %es ");
asm(" popl %ds ");
asm(" popal ");
asm(" jmp *isr_orig ");
//-------------------------------------------------------//
static void load_IDTR( void *regimage )
{
asm(" lidt %0 " : : "m" (*(unsigned short*)regimage) );
}
int pgfault_init( void )
{
int i;
unsigned long long gate_desc,gate_desc1,gate_desc2;
spinlock_t lock =SPIN_LOCK_UNLOCKED;
unsigned long flags;
unsigned short selector1;
// allocate a mapped kernel page for our new IDT
kpage =__get_free_page( GFP_KERNEL);
if ( !kpage ) return -ENOMEM;
// initialize our other global variables
asm(" sidt oldidtr ; sidt newidtr ");
memcpy( newidtr+1, &kpage, sizeof( kpage ) );
oldidt = (unsigned long long *)(*(unsigned long*)(oldidtr+1));
newidt = (unsigned long long *)(*(unsigned long*)(newidtr+1));
// extract and save entry-point to original page-pault handler
gate_desc_orig = oldidt[ PGFAULT_INT ];
gate_desc =gate_desc_orig & 0xFFFF00000000FFFF;
gate_desc |= ( gate_desc >> 32 );
isr_orig = (unsigned long)gate_desc;
// initialize our new Interrupt Descriptor Table
memcpy( newidt, oldidt, 256*sizeof( unsigned long long ) );
gate_desc_orig1 = (unsigned long)isr0x0E;
gate_desc = gate_desc_orig1 & 0x00000000FFFFFFFF;
gate_desc = gate_desc | ( gate_desc << 32 );
gate_desc1= 0xFFFF0000;
gate_desc1= gate_desc1 << 32;
gate_desc1= gate_desc1 | 0x0000FFFF;
gate_desc = gate_desc & gate_desc1;
gate_desc2= 0x0000EF00;
gate_desc2= gate_desc2 <<32;
gate_desc2= gate_desc2 | 0x00100000;
gate_desc = gate_desc | gate_desc2; // trap-gate
//Part which is most likely creating a fault when loading the idtr
newidt[ PGFAULT_INT ] = gate_desc;
//**********************************************
// activate the new IDT
spin_lock_irqsave(&lock,flags);
load_IDTR( newidtr );
spin_unlock_irqrestore(&lock,flags);
// smp_call_function( load_IDTR, oldidtr, 1, 1 );
return SUCCESS;
}
void pgfault_exit( void )
{
// reactivate the old IDT
unsigned long flags;
spinlock_t lock =SPIN_LOCK_UNLOCKED;
spin_lock_irqsave(&lock,flags);
load_IDTR( oldidtr );
spin_unlock_irqrestore(&lock,flags);
// smp_call_function( load_IDTR, oldidtr, 1, 1 );
// release allocated kernel page
if ( kpage ) free_page( kpage );
}
EXPORT_SYMBOL_GPL(my_intrept);
MODULE_LICENSE("GPL");
module_init( pgfault_init);
module_exit( pgfault_exit);
Why don't you use kernel function instead of fiddling with bits manually!
check it (it is initialization module func):
struct desc_ptr newidtr;
gate_desc *oldidt, *newidt;
store_idt(&__IDT_register);
oldidt = (gate_desc *)__IDT_register.address;
__IDT_page =__get_free_page(GFP_KERNEL);
if(!__IDT_page)
return -1;
newidtr.address = __IDT_page;
newidtr.size = __IDT_register.size;
newidt = (gate_desc *)newidtr.address;
memcpy(newidt, oldidt, __IDT_register.size);
pack_gate(&newidt[PGFAULT_NR], GATE_INTERRUPT, (unsigned long)isr0x0E, 0, 0, __KERNEL_CS);
__load_idt((void *)&newidtr);
smp_call_function(__load_idt, &newidtr, 0, 1);
return 0;
I have tested it it works!
Your segment selector in your trap gate descriptor appears to be hardcoded to 0x0010, when it should be __KERNEL_CS (which is 0x0060 in the 2.6.26 kernel sources I have).
By the way, this is pretty baroque:
gate_desc_orig1 = (unsigned long)isr0x0E;
gate_desc = gate_desc_orig1 & 0x00000000FFFFFFFF;
gate_desc = gate_desc | ( gate_desc << 32 );
gate_desc1= 0xFFFF0000;
gate_desc1= gate_desc1 << 32;
gate_desc1= gate_desc1 | 0x0000FFFF;
gate_desc = gate_desc & gate_desc1;
gate_desc2= 0x0000EF00;
gate_desc2= gate_desc2 <<32;
gate_desc2= gate_desc2 | 0x00100000;
gate_desc = gate_desc | gate_desc2; // trap-gate
You could simplify that down to (with the __KERNEL_CS fix):
gate_desc = (unsigned long long)isr0x0E * 0x100000001ULL;
gate_desc &= 0xFFFF00000000FFFFULL;
gate_desc |= 0x0000EF0000000000ULL; // trap-gate
gate_desc |= (unsigned long long)__KERNEL_CS << 16;
For reference, here is a working implementation of a custom page fault handler for the Linux x86_64 architecture. I just tested this module myself with kernel 3.2, it works perfectly.
https://github.com/RichardUSTC/intercept-page-fault-handler

Resources