I have two "empty" kernels, each with one if statement which will never be touched.
#include <cstdio>
#include <time.h>
#include <sys/time.h>
#include <cuda.h>
inline double wtime(){
double time[2];
struct timeval time1;
gettimeofday(&time1, NULL);
time[0]=time1.tv_sec;
time[1]=time1.tv_usec;
return time[0]+time[1]*1.0e-6;
}
__global__ void __empty1(bool flag){if(flag){ printf("hh\n");}}
__global__ void __empty2(bool flag){if(flag){ ; }}
int main(){
cudaDeviceSynchronize();
double s = wtime();
__empty1<<<256,256>>>(false);
cudaDeviceSynchronize();
printf("empty1: %.3f\n", 1000*(wtime()-s));
cudaDeviceSynchronize();
s = wtime();
__empty2<<<256,256>>>(false);
cudaDeviceSynchronize();
printf("empty2: %.3f\n", 1000*(wtime()-s));
return 0;
}
I compiled the code with cuda-7.5 and -O3, then run it on K40m.
nvcc -O3 -arch=sm_35 ./main.cu
The first empty kernel takes 1ms, while the second kernel takes 0.02ms.
empty1: 1.075
empty2: 0.019
That's weird since neither of these two kernels enters the branch. The runtime of these two kernels is expected to be the same.
The codes are not the same. As pointed out in the comments, this is discoverable by looking at the SASS.
Example:
$ cat t1353.cu
#include <cstdio>
#include <time.h>
#include <sys/time.h>
#include <cuda.h>
inline double wtime(){
double time[2];
struct timeval time1;
gettimeofday(&time1, NULL);
time[0]=time1.tv_sec;
time[1]=time1.tv_usec;
return time[0]+time[1]*1.0e-6;
}
__global__ void __empty1(bool flag){if(flag){ printf("hh\n");}}
__global__ void __empty2(bool flag){if(flag){ ; }}
int main(){
cudaDeviceSynchronize();
double s = wtime();
__empty1<<<256,256>>>(false);
cudaDeviceSynchronize();
printf("empty1: %.3f\n", 1000*(wtime()-s));
cudaDeviceSynchronize();
s = wtime();
__empty2<<<256,256>>>(false);
cudaDeviceSynchronize();
printf("empty2: %.3f\n", 1000*(wtime()-s));
return 0;
}
$ nvcc -arch=sm_35 -o t1353 t1353.cu
$ cuobjdump -sass t1353
Fatbin elf code:
================
arch = sm_35
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_35
Fatbin elf code:
================
arch = sm_35
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
code for sm_35
Function : _Z8__empty2b
.headerflags #"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x0800000000b81000 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/*0018*/ EXIT; /* 0x18000000001c003c */
/*0020*/ BRA 0x20; /* 0x12007ffffc1c003c */
/*0028*/ NOP; /* 0x85800000001c3c02 */
/*0030*/ NOP; /* 0x85800000001c3c02 */
/*0038*/ NOP; /* 0x85800000001c3c02 */
.............................
Function : _Z8__empty1b
.headerflags #"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x08b8b0a0a0a0a000 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ LDC.S8 R0, c[0x0][0x140]; /* 0x7c880000a01ffc02 */
/*0018*/ I2I.U16.S8 R0, R0; /* 0xe6000000001c8402 */
/*0020*/ LOP.AND R0, R0, 0xff; /* 0xc20000007f9c0001 */
/*0028*/ I2I.S32.S16 R0, R0; /* 0xe6000000001cd802 */
/*0030*/ ISETP.EQ.AND P0, PT, R0, RZ, PT; /* 0xdb281c007f9c001e */
/*0038*/ #P0 EXIT; /* 0x180000000000003c */
/* 0x08b810b800108010 */
/*0048*/ MOV32I R4, 0x0; /* 0x74000000001fc012 */
/*0050*/ MOV32I R5, 0x0; /* 0x74000000001fc016 */
/*0058*/ MOV R7, RZ; /* 0xe4c03c007f9c001e */
/*0060*/ MOV R6, RZ; /* 0xe4c03c007f9c001a */
/*0068*/ JCAL 0x0; /* 0x1100000000000100 */
/*0070*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/*0078*/ EXIT; /* 0x18000000001c003c */
/*0080*/ BRA 0x80; /* 0x12007ffffc1c003c */
/*0088*/ NOP; /* 0x85800000001c3c02 */
/*0090*/ NOP; /* 0x85800000001c3c02 */
/*0098*/ NOP; /* 0x85800000001c3c02 */
/*00a0*/ NOP; /* 0x85800000001c3c02 */
/*00a8*/ NOP; /* 0x85800000001c3c02 */
/*00b0*/ NOP; /* 0x85800000001c3c02 */
/*00b8*/ NOP; /* 0x85800000001c3c02 */
.............................
Fatbin ptx code:
================
arch = sm_35
code version = [5,0]
producer = cuda
host = linux
compile_size = 64bit
compressed
$
So in the empty2 case the machine code looks like this:
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/*0018*/ EXIT; /*
In the empty1 case it is longer:
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ LDC.S8 R0, c[0x0][0x140]; /* 0x7c880000a01ffc02 */
/*0018*/ I2I.U16.S8 R0, R0; /* 0xe6000000001c8402 */
/*0020*/ LOP.AND R0, R0, 0xff; /* 0xc20000007f9c0001 */
/*0028*/ I2I.S32.S16 R0, R0; /* 0xe6000000001cd802 */
/*0030*/ ISETP.EQ.AND P0, PT, R0, RZ, PT; /* 0xdb281c007f9c001e */
/*0038*/ #P0 EXIT;
...
/*0078*/ EXIT; /* 0x18000000001c003c */
The bigger issue here is possibly one of timing rigor/correctness. CUDA has lazy initialization. This means that the first set of calls in your CUDA code may incur more than the usual amount of timing overhead. According to my testing this is affecting the conclusion here. If I run a "warm-up" call to empty1 before actually timing it, the measured time between the two cases is nearly the same. It could be plausibly explained by the difference in code length.
Example:
$ cat t1353.cu
#include <cstdio>
#include <time.h>
#include <sys/time.h>
#include <cuda.h>
inline double wtime(){
double time[2];
struct timeval time1;
gettimeofday(&time1, NULL);
time[0]=time1.tv_sec;
time[1]=time1.tv_usec;
return time[0]+time[1]*1.0e-6;
}
__global__ void __empty1(bool flag){if(flag){ printf("hh\n");}}
__global__ void __empty2(bool flag){if(flag){ ; }}
int main(){
__empty1<<<256,256>>>(false);
cudaDeviceSynchronize();
double s = wtime();
__empty1<<<256,256>>>(false);
cudaDeviceSynchronize();
printf("empty1: %.3f\n", 1000*(wtime()-s));
__empty2<<<256,256>>>(false);
cudaDeviceSynchronize();
s = wtime();
__empty2<<<256,256>>>(false);
cudaDeviceSynchronize();
printf("empty2: %.3f\n", 1000*(wtime()-s));
return 0;
}
$ nvcc -arch=sm_35 -o t1353 t1353.cu
$ ./t1353
empty1: 0.023
empty2: 0.015
$
Related
I am trying to use the write protection feature of Linux's userfaultfd, but it does not appear to be enabled in my kernel even though I am using version 5.13 (write protection should be fully supported in 5.10+).
When I run
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#define errExit(msg) \
do { \
perror(msg); \
exit(EXIT_FAILURE); \
} while (0)
static int has_bit(uint64_t val, uint64_t bit) {
return (val & bit) == bit;
}
int main() {
long uffd; /* userfaultfd file descriptor */
struct uffdio_api uffdio_api;
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
if (uffd == -1)
errExit("userfaultfd");
uffdio_api.api = UFFD_API;
uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
errExit("ioctl-UFFDIO_API");
printf("UFFDIO_API: %d\n", has_bit(uffdio_api.ioctls, 1UL << _UFFDIO_API));
printf("UFFDIO_REGISTER: %d\n", has_bit(uffdio_api.ioctls, 1UL << _UFFDIO_REGISTER));
printf("UFFDIO_UNREGISTER: %d\n", has_bit(uffdio_api.ioctls, 1UL << _UFFDIO_UNREGISTER));
printf("UFFDIO_WRITEPROTECT: %d\n", has_bit(uffdio_api.ioctls, 1UL << _UFFDIO_WRITEPROTECT));
printf("UFFD_FEATURE_PAGEFAULT_FLAG_WP: %d\n", has_bit(uffdio_api.features, UFFD_FEATURE_PAGEFAULT_FLAG_WP));
}
The output is
UFFDIO_API: 1
UFFDIO_REGISTER: 1
UFFDIO_UNREGISTER: 1
UFFDIO_WRITEPROTECT: 0
UFFD_FEATURE_PAGEFAULT_FLAG_WP: 1
The UFFD_FEATURE_PAGEFAULT_FLAG_WP feature is enabled, but the UFFDIO_WRITEPROTECT ioctl is marked as not supported, which is necessary to enable write protection.
What might lead to this feature being disabled, and how can I enable it?
I am using Ubuntu MATE 21.10 with Linux kernel version 5.13.0-30-generic.
EDIT:
It seems like despite the man page section on the UFFD_API ioctl (https://man7.org/linux/man-pages/man2/ioctl_userfaultfd.2.html), this might be the intended behavior for a system where write protection is enabled. However, when I run a full program that spawns a poller thread and writes to the protected memory, the poller thread does not receive any notification.
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#define errExit(msg) \
do { \
perror(msg); \
exit(EXIT_FAILURE); \
} while (0)
static int page_size;
static void* fault_handler_thread(void* arg) {
long uffd; /* userfaultfd file descriptor */
uffd = (long) arg;
/* Loop, handling incoming events on the userfaultfd
file descriptor. */
for (;;) {
/* See what poll() tells us about the userfaultfd. */
struct pollfd pollfd;
int nready;
pollfd.fd = uffd;
pollfd.events = POLLIN;
nready = poll(&pollfd, 1, -1);
if (nready == -1)
errExit("poll");
printf("\nfault_handler_thread():\n");
printf(
" poll() returns: nready = %d; "
"POLLIN = %d; POLLERR = %d\n",
nready, (pollfd.revents & POLLIN) != 0,
(pollfd.revents & POLLERR) != 0);
// received fault, exit the program
exit(EXIT_FAILURE);
}
}
int main() {
long uffd; /* userfaultfd file descriptor */
char* addr; /* Start of region handled by userfaultfd */
uint64_t len; /* Length of region handled by userfaultfd */
pthread_t thr; /* ID of thread that handles page faults */
struct uffdio_api uffdio_api;
struct uffdio_register uffdio_register;
struct uffdio_writeprotect uffdio_wp;
int s;
page_size = sysconf(_SC_PAGE_SIZE);
len = page_size;
/* Create and enable userfaultfd object. */
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
if (uffd == -1)
errExit("userfaultfd");
uffdio_api.api = UFFD_API;
uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
errExit("ioctl-UFFDIO_API");
addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (addr == MAP_FAILED)
errExit("mmap");
printf("Address returned by mmap() = %p\n", addr);
/* Register the memory range of the mapping we just created for
handling by the userfaultfd object. */
uffdio_register.range.start = (unsigned long) addr;
uffdio_register.range.len = len;
uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
errExit("ioctl-UFFDIO_REGISTER");
printf("uffdio_register.ioctls = 0x%llx\n", uffdio_register.ioctls);
printf("Have _UFFDIO_WRITEPROTECT? %s\n", (uffdio_register.ioctls & _UFFDIO_WRITEPROTECT) ? "YES" : "NO");
uffdio_wp.range.start = (unsigned long) addr;
uffdio_wp.range.len = len;
uffdio_wp.mode = UFFDIO_WRITEPROTECT_MODE_WP;
if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffdio_wp) == -1)
errExit("ioctl-UFFDIO_WRITEPROTECT");
/* Create a thread that will process the userfaultfd events. */
s = pthread_create(&thr, NULL, fault_handler_thread, (void*) uffd);
if (s != 0) {
errno = s;
errExit("pthread_create");
}
/* Main thread now touches memory in the mapping, touching
locations 1024 bytes apart. This will trigger userfaultfd
events for all pages in the region. */
usleep(100000);
size_t l;
l = 0xf; /* Ensure that faulting address is not on a page
boundary, in order to test that we correctly
handle that case in fault_handling_thread(). */
char i = 0;
while (l < len) {
printf("Write address %p in main(): ", addr + l);
addr[l] = i++;
printf("%d\n", addr[l]);
l += 1024;
usleep(100000); /* Slow things down a little */
}
exit(EXIT_SUCCESS);
}
The UFFD_API ioctl does not seem to ever report _UFFD_WRITEPROTECT as can be seen here in the kernel source code (1, 2). I assume that this is because whether this operation is supported or not depends on the kind of underlying mapping.
The feature is in fact reporeted on a per-registered-range basis. You will have to set the API with ioctl(uffd, UFFDIO_API, ...) first, then register a range with ioctl(uffd, UFFDIO_REGISTER, ...) and then check the uffdio_register.ioctls field.
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <unistd.h>
#define errExit(msg) \
do { \
perror(msg); \
exit(EXIT_FAILURE); \
} while (0)
int main(void) {
long uffd;
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
if (uffd == -1)
errExit("userfaultfd");
struct uffdio_api uffdio_api = { .api = UFFD_API };
if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
errExit("ioctl(UFFDIO_API)");
const size_t region_sz = 0x4000;
void *region = mmap(NULL, region_sz, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
if (region == MAP_FAILED)
errExit("mmap");
if (posix_memalign((void **)region, sysconf(_SC_PAGESIZE), region_sz))
errExit("posix_memalign");
printf("Region mapped at %p - %p\n", region, region + region_sz);
struct uffdio_register uffdio_register = {
.range = { .start = (unsigned long)region, .len = region_sz },
.mode = UFFDIO_REGISTER_MODE_WP
};
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
errExit("ioctl(UFFDIO_REGISTER)");
printf("uffdio_register.ioctls = 0x%llx\n", uffdio_register.ioctls);
printf("Have _UFFDIO_WRITEPROTECT? %s\n", (uffdio_register.ioctls & _UFFDIO_WRITEPROTECT) ? "YES" : "NO");
if ((uffdio_register.ioctls & UFFD_API_RANGE_IOCTLS) != UFFD_API_RANGE_IOCTLS)
errExit("bad ioctl set");
struct uffdio_writeprotect wp = {
.range = { .start = (unsigned long)region, .len = region_sz },
.mode = UFFDIO_WRITEPROTECT_MODE_WP
};
if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp) == -1)
errExit("ioctl(UFFDIO_WRITEPROTECT)");
puts("ioctl(UFFDIO_WRITEPROTECT) successful.");
return EXIT_SUCCESS;
}
Output:
Region mapped at 0x7f45c48fe000 - 0x7f45c4902000
uffdio_register.ioctls = 0x5c
Have _UFFDIO_WRITEPROTECT? YES
ioctl(UFFDIO_WRITEPROTECT) successful.
I found the solution. The write-protected pages must be touched after registering but before marking them as write-protected. This is an undocumented requirement, from what I can tell.
In other words, add
for (size_t i = 0; i < len; i += page_size) {
addr[i] = 0;
}
between registering and write-protecting.
It works if I change the full example to
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#define errExit(msg) \
do { \
perror(msg); \
exit(EXIT_FAILURE); \
} while (0)
static int page_size;
static void* fault_handler_thread(void* arg) {
long uffd; /* userfaultfd file descriptor */
uffd = (long) arg;
/* Loop, handling incoming events on the userfaultfd
file descriptor. */
for (;;) {
/* See what poll() tells us about the userfaultfd. */
struct pollfd pollfd;
int nready;
pollfd.fd = uffd;
pollfd.events = POLLIN;
nready = poll(&pollfd, 1, -1);
if (nready == -1)
errExit("poll");
printf("\nfault_handler_thread():\n");
printf(
" poll() returns: nready = %d; "
"POLLIN = %d; POLLERR = %d\n",
nready, (pollfd.revents & POLLIN) != 0,
(pollfd.revents & POLLERR) != 0);
// received fault, exit the program
exit(EXIT_FAILURE);
}
}
int main() {
long uffd; /* userfaultfd file descriptor */
char* addr; /* Start of region handled by userfaultfd */
uint64_t len; /* Length of region handled by userfaultfd */
pthread_t thr; /* ID of thread that handles page faults */
struct uffdio_api uffdio_api;
struct uffdio_register uffdio_register;
struct uffdio_writeprotect uffdio_wp;
int s;
page_size = sysconf(_SC_PAGE_SIZE);
len = page_size;
/* Create and enable userfaultfd object. */
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
if (uffd == -1)
errExit("userfaultfd");
uffdio_api.api = UFFD_API;
uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
errExit("ioctl-UFFDIO_API");
addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (addr == MAP_FAILED)
errExit("mmap");
printf("Address returned by mmap() = %p\n", addr);
/* Register the memory range of the mapping we just created for
handling by the userfaultfd object. */
uffdio_register.range.start = (unsigned long) addr;
uffdio_register.range.len = len;
uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
errExit("ioctl-UFFDIO_REGISTER");
printf("uffdio_register.ioctls = 0x%llx\n", uffdio_register.ioctls);
printf("Have _UFFDIO_WRITEPROTECT? %s\n", (uffdio_register.ioctls & _UFFDIO_WRITEPROTECT) ? "YES" : "NO");
for (size_t i = 0; i < len; i += page_size) {
addr[i] = 0;
}
uffdio_wp.range.start = (unsigned long) addr;
uffdio_wp.range.len = len;
uffdio_wp.mode = UFFDIO_WRITEPROTECT_MODE_WP;
if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffdio_wp) == -1)
errExit("ioctl-UFFDIO_WRITEPROTECT");
/* Create a thread that will process the userfaultfd events. */
s = pthread_create(&thr, NULL, fault_handler_thread, (void*) uffd);
if (s != 0) {
errno = s;
errExit("pthread_create");
}
/* Main thread now touches memory in the mapping, touching
locations 1024 bytes apart. This will trigger userfaultfd
events for all pages in the region. */
usleep(100000);
size_t l;
l = 0xf; /* Ensure that faulting address is not on a page
boundary, in order to test that we correctly
handle that case in fault_handling_thread(). */
char i = 0;
while (l < len) {
printf("Write address %p in main(): ", addr + l);
addr[l] = i++;
printf("%d\n", addr[l]);
l += 1024;
usleep(100000); /* Slow things down a little */
}
exit(EXIT_SUCCESS);
}
I am trying to develop a kernel module that hooks system call. I'm testing on Raspberry Pi 3B, running raspbian buster Linux 4.19.97-v7+ armv7l.
So typically on x86 we can overwrite CR0 register but there is no similar register on ARM architecture. I tried to do it via set_memory_rw and then enabling it before exiting using set_memory_ro like one answer to a similar question at Cannot use set_memory_rw in Linux kernel on ARM64
but it doesn't work.
My code:
// SPDX-License-Identifier: GPL-3.0
#include <linux/init.h> // module_{init,exit}()
#include <linux/module.h> // THIS_MODULE, MODULE_VERSION, ...
#include <linux/kernel.h> // printk(), pr_*()
#include <linux/kallsyms.h> // kallsyms_lookup_name()
#include <asm/syscall.h> // syscall_fn_t, __NR_*
#include <asm/ptrace.h> // struct pt_regs
#include <asm/tlbflush.h> // flush_tlb_kernel_range()
#include <asm/pgtable.h> // {clear,set}_pte_bit(), set_pte()
#include <linux/vmalloc.h> // vm_unmap_aliases()
#include <linux/mm.h> // struct mm_struct, apply_to_page_range()
#include <linux/kconfig.h> // IS_ENABLED()
#ifdef pr_fmt
#undef pr_fmt
#endif
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
static struct mm_struct *init_mm_ptr;
static void* *syscall_table;
#define MAGIC "mamaliga"
#define SIZEOF_MAGIC 8
#define ROOTKIT_SYS_CALL_TABLE 0x801011c4
int pos;
// static void* original_read;
asmlinkage long (*original_read)(unsigned int fd, char __user *buf, size_t count);
/********** HELPERS **********/
// From arch/arm/mm/pageattr.c.
struct page_change_data {
pgprot_t set_mask;
pgprot_t clear_mask;
};
static int change_page_range(pte_t *ptep, pgtable_t token, unsigned long addr,
void *data)
{
struct page_change_data *cdata = data;
pte_t pte = *ptep;
pte = clear_pte_bit(pte, cdata->clear_mask);
pte = set_pte_bit(pte, cdata->set_mask);
set_pte_ext(ptep, pte, 0);
return 0;
}
void (*flush)(unsigned long start, unsigned long end);
// From arch/arm64/mm/pageattr.c.
static int __change_memory_common(unsigned long start, unsigned long size,
pgprot_t set_mask, pgprot_t clear_mask)
{
struct page_change_data data;
int ret;
data.set_mask = set_mask;
data.clear_mask = clear_mask;
ret = apply_to_page_range(init_mm_ptr, start, size, change_page_range, &data);
flush = (void*)kallsyms_lookup_name("flush_tlb_kernel_range");
flush(start, start + size);
return ret;
}
// Simplified set_memory_rw() from arch/arm/mm/pageattr.c.
static int set_page_rw(unsigned long addr)
{
vm_unmap_aliases();
return __change_memory_common(addr, PAGE_SIZE,
__pgprot(0),
__pgprot(L_PTE_RDONLY));
}
// Simplified set_memory_ro() from arch/arm/mm/pageattr.c.
static int set_page_ro(unsigned long addr)
{
vm_unmap_aliases();
return __change_memory_common(addr, PAGE_SIZE,
__pgprot(L_PTE_RDONLY),
__pgprot(0));
}
/********** ACTUAL MODULE **********/
asmlinkage long myread(unsigned int fd, char __user *buf, size_t count)
{
long ret;
/* Call original read_syscall */
ret = original_read(fd, buf, count);
pr_info("Hooked!\n");
return ret;
}
static int __init modinit(void)
{
int res;
pr_info("init\n");
// Shouldn't fail.
init_mm_ptr = (struct mm_struct *)kallsyms_lookup_name("init_mm");
syscall_table = (void* *)kallsyms_lookup_name("sys_call_table");
printk(KERN_INFO "syscall_table: 0xx%llx\n", syscall_table);
original_read = syscall_table[__NR_read];
res = set_page_rw(((unsigned long)syscall_table + __NR_read) & PAGE_MASK);
if (res != 0) {
pr_err("set_page_rw() failed: %d\n", res);
return res;
}
else {
pr_info("set_page_rw() OK");
}
syscall_table[__NR_read] = myread;
res = set_page_ro(((unsigned long)syscall_table + __NR_read) & PAGE_MASK);
if (res != 0) {
pr_err("set_page_ro() failed: %d\n", res);
return res;
}
else {
pr_info("set_page_ro() OK");
}
pr_info("init done\n");
return 0;
}
static void __exit modexit(void)
{
int res;
pr_info("exit\n");
res = set_page_rw(((unsigned long)syscall_table + __NR_read) & PAGE_MASK);
if (res != 0) {
pr_err("set_page_rw() failed: %d\n", res);
return;
}
syscall_table[__NR_read] = original_read;
res = set_page_ro(((unsigned long)syscall_table + __NR_read) & PAGE_MASK);
if (res != 0)
pr_err("set_page_ro() failed: %d\n", res);
pr_info("goodbye\n");
}
module_init(modinit);
module_exit(modexit);
MODULE_VERSION("0.1");
MODULE_DESCRIPTION("Syscall hijack on arm64.");
MODULE_AUTHOR("Marco Bonelli");
MODULE_LICENSE("GPL");
Dmesg trace errors as follow
[ 89.767769] interceptor: loading out-of-tree module taints kernel.
[ 89.771442] interceptor: init
[ 89.806435] syscall_table: 0xx75c5dda175c5dda1
[ 89.812886] interceptor: set_page_rw() OK
[ 89.812898] Unable to handle kernel paging request at virtual address 801011d0
[ 89.822616] pgd = c9de9ec3
[ 89.826529] [801011d0] *pgd=0001141e(bad)
[ 89.831760] Internal error: Oops: 80d [#1] SMP ARM
[ 89.837763] Modules linked in: interceptor(O+) cmac bnep hci_uart btbcm serdev bluetooth ecdh_generic binfmt_misc evdev brcmfmac brcmutil sha256_generic cfg80211 raspberrypi_hwmon rfkill hwmon bcm2835_codec(C) bcm2835_v4l2(C) snd_bcm2835(C) v4l2_mem2mem snd_pcm bcm2835_mmal_vchiq(C) v4l2_common videobuf2_dma_contig snd_timer videobuf2_vmalloc videobuf2_memops videobuf2_v4l2 videobuf2_common snd videodev media vc_sm_cma(C) fixed uio_pdrv_genirq uio ip_tables x_tables ipv6
[ 89.887532] CPU: 1 PID: 981 Comm: insmod Tainted: G C O 4.19.97-v7+ #1293
[ 89.898046] Hardware name: BCM2835
[ 89.902693] PC is at modinit+0xb0/0x1000 [interceptor]
[ 89.909084] LR is at (null)
[ 89.913214] pc : [<7f7430b0>] lr : [<00000000>] psr: 60000013
[ 89.920704] sp : b20e5d80 ip : 80d0517c fp : b20e5d9c
[ 89.927176] r10: b5ab3340 r9 : 00000002 r8 : b5ab3300
[ 89.933650] r7 : 00000000 r6 : fffff000 r5 : 00000000 r4 : 801011c4
[ 89.941469] r3 : 7f73e068 r2 : 75c5dda1 r1 : 00000000 r0 : 0000001d
[ 89.949256] Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment user
[ 89.957653] Control: 10c5383d Table: 320ec06a DAC: 00000055
[ 89.964666] Process insmod (pid: 981, stack limit = 0x322dc319)
[ 89.971873] Stack: (0xb20e5d80 to 0xb20e6000)
[ 89.977495] 5d80: 7f740000 7f743000 80d04d48 00000000 b20e5e14 b20e5da0 8010312c 7f74300c
[ 89.988192] 5da0: 802821d4 8085dffc 00000000 006000c0 b20e5dcc b20e5dc0 8085dffc 802ba274
[ 89.998889] 5dc0: b20e5e14 b20e5dd0 802ba274 802c7118 802bb6cc 802bab54 00000001 00003c76
[ 90.009578] 5de0: 00000000 a0000013 bccc8000 75c5dda1 7f740000 7f740000 7f740000 b7c04880
[ 90.020281] 5e00: 80d04d48 b5ab3300 b20e5e3c b20e5e18 801ba19c 801030e8 b20e5e3c b20e5e28
[ 90.031099] 5e20: 802a8250 b20e5f30 7f740000 00000002 b20e5f0c b20e5e40 801b9114 801ba134
[ 90.042029] 5e40: 7f74000c 00007fff 7f740000 801b6100 00000000 80ae4f00 7f7401fc 7f740114
[ 90.053063] 5e60: 7f740130 00000000 b5ab3308 7f740048 b20e5e94 80afd2d0 802d061c 802d0488
[ 90.064243] 5e80: b20e5ea0 b21029c0 00000000 00000000 00000000 00000000 00000000 00000000
[ 90.075471] 5ea0: 6e72656b 00006c65 00000000 00000000 00000000 00000000 00000000 00000000
[ 90.086778] 5ec0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 75c5dda1
[ 90.098119] 5ee0: 7fffffff 80d04d48 00000000 00000003 0002d064 7fffffff 00000000 0000017b
[ 90.109451] 5f00: b20e5fa4 b20e5f10 801b9974 801b7360 7fffffff 00000000 00000003 00000000
[ 90.120790] 5f20: 00000000 bccc8000 0000209c 00000000 bccc84e2 bccc89c0 bccc8000 0000209c
[ 90.132121] 5f40: bccc9a5c bccc98a8 bccc92a0 00003000 000032c0 00000000 00000000 00000000
[ 90.143457] 5f60: 000018d8 00000025 00000026 0000001d 0000001b 00000017 00000000 75c5dda1
[ 90.154784] 5f80: 010d6c00 7e9317d4 0003fce8 0000017b 801011c4 b20e4000 00000000 b20e5fa8
[ 90.166119] 5fa0: 80101000 801b98c4 010d6c00 7e9317d4 00000003 0002d064 00000000 00000004
[ 90.177457] 5fc0: 010d6c00 7e9317d4 0003fce8 0000017b 01355818 00000000 00000002 00000000
[ 90.188783] 5fe0: 7e931608 7e9315f8 00022cb8 76cadaf0 60000010 00000003 00000000 00000000
[ 90.200155] [<7f7430b0>] (modinit [interceptor]) from [<8010312c>] (do_one_initcall+0x50/0x218)
[ 90.212033] [<8010312c>] (do_one_initcall) from [<801ba19c>] (do_init_module+0x74/0x220)
[ 90.223280] [<801ba19c>] (do_init_module) from [<801b9114>] (load_module+0x1dc0/0x2404)
[ 90.234455] [<801b9114>] (load_module) from [<801b9974>] (sys_finit_module+0xbc/0xcc)
[ 90.245459] [<801b9974>] (sys_finit_module) from [<80101000>] (ret_fast_syscall+0x0/0x28)
[ 90.256813] Exception stack(0xb20e5fa8 to 0xb20e5ff0)
[ 90.263464] 5fa0: 010d6c00 7e9317d4 00000003 0002d064 00000000 00000004
[ 90.274748] 5fc0: 010d6c00 7e9317d4 0003fce8 0000017b 01355818 00000000 00000002 00000000
[ 90.286023] 5fe0: 7e931608 7e9315f8 00022cb8 76cadaf0
[ 90.292627] Code: eb28f040 e594400c e30e3068 e3473f73 (e584300c)
[ 90.300277] ---[ end trace 9daed852fe9a568f ]---
I also tried some other suggestions from ARM64 - Linux Memory Write protection won't disable which disable the Memory Write protection via the corresponding PTE to an virtual address by using the Linux Kernel Functions. it doesn't work either.
#include <linux/module.h> /* Needed by all modules */
#include <linux/unistd.h> /* Needed for __NR_read */
#include <linux/reboot.h> /* Needed for kernel_restart() */
#include <linux/slab.h> /* Needed for kmalloc() */
#include <linux/mm.h>
#include <asm/cacheflush.h> /* Needed for cache flush */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#define CR0_WRITE_PROTECT_MASK (1 << 16)
#define MAGIC "mamaliga"
#define SIZEOF_MAGIC 8
#define ROOTKIT_SYS_CALL_TABLE 0x801011c4
// ARM
#define HIJACK_SIZE 12
void **sys_call_table;
asmlinkage long (*read_syscall_ref)(unsigned int fd, char __user *buf, size_t count);
int pos; /* Size of MAGIC matched so far */
/* Function that replaces the original read_syscall.*/
asmlinkage long my_read_syscall_ref(unsigned int fd, char __user *buf, size_t count)
{
long ret;
int i;
/* Call original read_syscall */
ret = read_syscall_ref(fd, buf, count);
return ret;
}
void mem_text_write_kernel_word(unsigned long *addr, unsigned long word)
{
*addr = word;
flush_icache_range((unsigned long)addr,
((unsigned long)addr + sizeof(long)));
}
void cacheflush ( void *begin, unsigned long size )
{
flush_icache_range((unsigned long)begin, (unsigned long)begin + size);
}
static pgd_t *get_global_pgd (void)
{
pgd_t *pgd;
unsigned int ttb_reg;
asm volatile (
" mrc p15, 0, %0, c2, c0, 1"
: "=r" (ttb_reg));
printk(KERN_INFO "1st try: %08x", ttb_reg);
asm volatile (
"mrrc p15, 1, %Q0, %R0, c2"
: "=r"(ttb_reg));
printk(KERN_INFO "2nd try: %08x", ttb_reg);
if (PAGE_OFFSET == 0x80000000) ttb_reg -= (1 << 4); else if (PAGE_OFFSET == 0xc0000000) ttb_reg -= (16 << 10);
printk(KERN_INFO "3rd try: %08x", ttb_reg);
ttb_reg &= ~(PTRS_PER_PGD*sizeof(pgd_t)-1);
printk(KERN_INFO "4th try: %08x", ttb_reg);
pgd = __va(ttb_reg);
printk(KERN_INFO "Global virt pgd: %08x", pgd);
return pgd;
}
static pte_t *lookup_address (unsigned long addr, unsigned int *level)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
printk(KERN_INFO "lookup_address %08x", addr);
pgd = get_global_pgd() + pgd_index(addr);
printk(KERN_INFO "pgd 0x%0x= %p",pgd_val(*pgd), pgd);
pud = pud_offset (pgd, addr);
printk(KERN_INFO "pud 0x%0x= %p", pud_val(*pud), pud);
pmd = pmd_offset (pud, addr);
printk(KERN_INFO "pmd 0x%0x= %p", pmd_val(*pmd), pmd);
return pte_offset_kernel (pmd, addr);
}
int make_rw(unsigned long address)
{
unsigned int level;
pte_t *ptep, pte;
ptep = lookup_address(address, &level);
pte = *ptep;
printk(KERN_INFO "pte = %08x", pte);
printk(KERN_INFO "PTE before 0x%lx\n", pte);
*ptep = pte_mkwrite(*ptep);
*ptep = clear_pte_bit(*ptep, __pgprot((_AT(pteval_t, 1) << 7)));
__flush_tlb_all();
printk(KERN_INFO "PTE after 0x%lx\n", pte);
return 0;
}
static int __init interceptor_start(void)
{
unsigned long original_cr0;
/* Reading contents of control register cr0. The cr0 register has various
control flags that modify the basic operation of the processor. */
/* Disable `write-protect` mode. Do so by setting the WP (Write protect)
bit to 0. When set to 1, the CPU can't write to read-only pages */
// /* Store original read() syscall */
static void **sys_call_table;
void *swi_table_addr = (long *)0xffff0008; // Known address of Software Interrupt handler
unsigned long offset_from_swi_vector_adr = 0;
unsigned long *swi_vector_adr = 0;
offset_from_swi_vector_adr = ((*(long *)swi_table_addr) & 0xfff) + 8;
swi_vector_adr = *(unsigned long *)(swi_table_addr + offset_from_swi_vector_adr);
while (swi_vector_adr++)
{
if (((*(unsigned long *)swi_vector_adr) & 0xfffff000) == 0xe28f8000)
{ // Copy the entire sys_call_table from the offset_from_swi_vector_adr starting the hardware interrupt table
offset_from_swi_vector_adr = ((*(unsigned long *)swi_vector_adr) & 0xfff) + 8; // 0xe28f8000 is end of interrupt space. Hence we stop.
sys_call_table = (void *)swi_vector_adr + offset_from_swi_vector_adr;
break;
}
}
// sys_call_table = (void *) ROOTKIT_SYS_CALL_TABLE;
printk(KERN_INFO "ROOTKIT_SYS_CALL_TABLE: 0x%08x\n", sys_call_table);
printk(KERN_INFO "__NR_read: 0x%d\n", __NR_read);
read_syscall_ref = (void *)sys_call_table[__NR_read];
printk(KERN_INFO "read_syscall_ref: 0x%p\n", read_syscall_ref);
printk("func: %pF at address: %p\n", read_syscall_ref, read_syscall_ref);
void *func = &my_read_syscall_ref;
printk("Func: %pF at address: %p\n", func, func);
printk(KERN_INFO "my_read_syscall_ref: 0x%p\n", my_read_syscall_ref);
/* Replace in the system call table the original
read() syscall with our intercepting function */
make_rw(&read_syscall_ref);
// sys_call_table[__NR_read] = (unsigned long *) my_read_syscall_ref;
//hijack_start(read_syscall_ref, &my_read_syscall_ref);
printk(KERN_INFO "sys_call_table[__NR_read]: 0x%p\n", sys_call_table[__NR_read]);
printk(KERN_INFO "%s\n", "Hello");
/* A non 0 return value means init_module failed; module can't be loaded */
return 0;
}
/* Cleanup function which is called just before module
is rmmoded. It restores the original read() syscall. */
static void __exit interceptor_end(void)
{
/* Restore original read() syscall */
sys_call_table[__NR_read] = (unsigned long *) read_syscall_ref;
printk(KERN_INFO "%s\n", "Bye bye");
return;
}
module_init(interceptor_start);
module_exit(interceptor_end);
MODULE_LICENSE("GPL");
I'm really stuck at the moment and your suggestions would be very much appreciated!
you can use ftrace implement arm hook in kernel 4.19.
Problem
I am trying to find out whether mmx or xmm registers are faster for copying elements of an array to another array (I know about memcpy() but I need this function for a very specific purpose).
My souce code is below. The relevant function is copyarray(). I can use either mmx or xmm registers with movq or movsd respectively, and the result is correct. However, when I use mmx registers, any timer I use (either clock() or QueryPerformanceCounter) to time the operations returns NaN.
Compiled with: gcc -std=c99 -O2 -m32 -msse3 -mincoming-stack-boundary=2 -mfpmath=sse,387 -masm=intel copyasm.c -o copyasm.exe
This is a very strange bug and I cannot figure out why using mmx registers would cause a timer to return NaN seconds, while using xmm registers in exactly the same code returns a valid time value
EDIT
Results using xmm registers:
Elapsed time: 0.000000 seconds, Gigabytes copied per second: inf GB
Residual = 0.000000
0.937437 0.330424 0.883267 0.118717 0.962493 0.584826 0.344371 0.423719
0.937437 0.330424 0.883267 0.118717 0.962493 0.584826 0.344371 0.423719
Results using mmx register:
Elapsed time: nan seconds, Gigabytes copied per second: inf GB
Residual = 0.000000
0.000000 0.754173 0.615345 0.634724 0.611286 0.547655 0.729637 0.942381
0.935759 0.754173 0.615345 0.634724 0.611286 0.547655 0.729637 0.942381
Source Code
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <x86intrin.h>
#include <windows.h>
inline double
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) copyarray(
double* restrict dst,
const double* restrict src,
const int n)
{
// int i = n;
// do {
// *dst++ = *src++;
// i--;
// } while(i);
__asm__ __volatile__
(
"mov ecx, %[n] \n\t"
"mov edi, %[dst] \n\t"
"mov esi, %[src] \n\t"
"xor eax, eax \n\t"
"sub ecx,1 \n\t"
"L%=: \n\t"
"movq mm0, QWORD PTR [esi+ecx*8] \n\t"
"movq QWORD PTR [edi+ecx*8], mm0 \n\t"
"sub ecx, 1 \n\t"
"jge L%= \n\t"
: // no outputs
: // inputs
[dst] "m" (dst),
[src] "m" (src),
[n] "g" (n)
: // register clobber
"eax","ecx","edi","esi",
"mm0"
);
}
void printarray(double* restrict a, int n)
{
for(int i = 0; i < n; ++i) {
printf(" %f ", *(a++));
}
printf("\n");
}
double residual(const double* restrict dst,
const double* restrict src,
const int n)
{
double residual = 0.0;
for(int i = 0; i < n; ++i)
residual += *(dst++) - *(src++);
return(residual);
}
int main()
{
double *A = NULL;
double *B = NULL;
int n = 8;
double memops;
double time3;
clock_t time1;
// LARGE_INTEGER frequency, time1, time2;
// QueryPerformanceFrequency(&frequency);
int trials = 1 << 0;
A = _mm_malloc(n*sizeof(*A), 64);
B = _mm_malloc(n*sizeof(*B), 64);
srand(time(NULL));
for(int i = 0; i < n; ++i)
*(A+i) = (double) rand()/RAND_MAX;
// QueryPerformanceCounter(&time1);
time1 = clock();
for(int i = 0; i < trials; ++i)
copyarray(B,A,n);
// QueryPerformanceCounter(&time2);
// time3 = (double)(time2.QuadPart - time1.QuadPart) / frequency.QuadPart;
time3 = (double) (clock() - time1)/CLOCKS_PER_SEC;
memops = (double) trials*n/time3*sizeof(*A)/1.0e9;
printf("Elapsed time: %f seconds, Gigabytes copied per second: %f GB\n",time3, memops);
printf("Residual = %f\n",residual(B,A,n));
printarray(A,n);
printarray(B,n);
_mm_free(A);
_mm_free(B);
}
You have to be careful when mixing MMX with floating point - use SSE instead if possible. If you must use MMX then read the section titled "MMX - State Management" on this page - note the requirement for the emms instruction after any MMX instructions before you next perform any floating point operations.
Based on the Wikipedia entry as well as the Intel manual, rdpmc should be available to user-mode processes as long as bit 8 of CR4 is set. However, I am still running into general protection error when trying to run rdpmc from userspace even with that bit set.
I am running on an 8-core Intel X3470 on kernel 2.6.32-279.el6.x86_64.
Here is the user-mode program I am trying to execute:
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include <sched.h>
#include <assert.h>
uint64_t
read_pmc(int ecx)
{
unsigned int a, d;
__asm __volatile("rdpmc" : "=a"(a), "=d"(d) : "c"(ecx));
return ((uint64_t)a) | (((uint64_t)d) << 32);
}
int main(int ac, char **av)
{
uint64_t start, end;
cpu_set_t cpuset;
unsigned int c;
int i;
if (ac != 3) {
fprintf(stderr, "usage: %s cpu-id pmc-num\n", av[0]);
exit(EXIT_FAILURE);
}
i = atoi(av[1]);
c = atoi(av[2]);
CPU_ZERO(&cpuset);
CPU_SET(i, &cpuset);
assert(sched_setaffinity(0, sizeof(cpuset), &cpuset) == 0);
printf("%lu\n", read_pmc(c));
return 0;
}
Here is the kernel module which sets the bit and reads out CR4 so I can manually verify that the bit has been set.
/*
* Enable PMC in user mode.
*/
#include <linux/module.h>
#include <linux/kernel.h>
int init_module(void)
{
typedef long unsigned int uint64_t;
uint64_t output;
// Set CR4, Bit 8 to enable PMC
__asm__("push %rax\n\t"
"mov %cr4,%rax;\n\t"
"or $(1 << 7),%rax;\n\t"
"mov %rax,%cr4;\n\t"
"wbinvd\n\t"
"pop %rax"
);
// Read back CR4 to check the bit.
__asm__("\t mov %%cr4,%0" : "=r"(output));
printk(KERN_INFO "%lu", output);
return 0;
}
void cleanup_module(void)
{
__asm__("push %rax\n\t"
"push %rbx\n\t"
"mov %cr4,%rax;\n\t"
"mov $(1 << 7), %rbx\n\t"
"not %rbx\n\t"
"and %rbx, %rax;\n\t"
"mov %rax,%cr4;\n\t"
"wbinvd\n\t"
"pop %rbx\n\t"
"pop %rax\n\t"
);
}
Apparently, when Intel says Bit 8, they are referring to the 9th bit from the right, since their indexing begins at 0. Replacing $(1 << 7) with $(1 << 8) globally resolves the issue, and allows rdpmc to be called from user mode.
Here is the updated kernel module, also using on_each_cpu to make sure that it is set on every core.
/*
* Read PMC in kernel mode.
*/
#include <linux/module.h> /* Needed by all modules */
#include <linux/kernel.h> /* Needed for KERN_INFO */
static void printc4(void) {
typedef long unsigned int uint64_t;
uint64_t output;
// Read back CR4 to check the bit.
__asm__("\t mov %%cr4,%0" : "=r"(output));
printk(KERN_INFO "%lu", output);
}
static void setc4b8(void * info) {
// Set CR4, Bit 8 (9th bit from the right) to enable
__asm__("push %rax\n\t"
"mov %cr4,%rax;\n\t"
"or $(1 << 8),%rax;\n\t"
"mov %rax,%cr4;\n\t"
"wbinvd\n\t"
"pop %rax"
);
// Check which CPU we are on:
printk(KERN_INFO "Ran on Processor %d", smp_processor_id());
printc4();
}
static void clearc4b8(void * info) {
printc4();
__asm__("push %rax\n\t"
"push %rbx\n\t"
"mov %cr4,%rax;\n\t"
"mov $(1 << 8), %rbx\n\t"
"not %rbx\n\t"
"and %rbx, %rax;\n\t"
"mov %rax,%cr4;\n\t"
"wbinvd\n\t"
"pop %rbx\n\t"
"pop %rax\n\t"
);
printk(KERN_INFO "Ran on Processor %d", smp_processor_id());
}
int init_module(void)
{
on_each_cpu(setc4b8, NULL, 0);
return 0;
}
void cleanup_module(void)
{
on_each_cpu(clearc4b8, NULL, 0);
}
Echoing "2" to /sys/bus/event_source/devices/cpu/rdpmc allows user processes
to access performance counters via the rdpmc instruction.
Note that behaviour has changed. Prior to 4.0 "1" meant "enabled"
while meant "0" disable. Now "1" means allow only for processes that have active perf events. More details: http://man7.org/linux/man-pages/man2/perf_event_open.2.html
I am using Linux 2.6.26 kernel version and I am trying to change the interrupt descriptor table using a kernel module. I am only trying to change the page fault table entry here. So I make a copy of the original IDT and make changes to the page fault table entry only. The objective of the ISR is to print out information of the page fault before calling the original Page fault handler. But the kernel just crashes once I load it with insmod i.e it specifically crashed with the "loadIDTR" function. With further debugging, I found out that by not changing any entry if I load the IDTR it works fine. I am out of ideas.
I have pasted the code below
#include <linux/module.h> // for init_module()
#include <linux/init.h>
#include <linux/mm.h> // for get_free_page()
#include <linux/sched.h>
#include <linux/spinlock.h>
#define SUCCESS 0
#define PGFAULT_INT 0x0E
static char modname[] = "pgfaults";
static unsigned short oldidtr[3], newidtr[3];
static unsigned long long *oldidt, *newidt;
static unsigned long isr_orig, kpage;
static char *why[]={ "sra", "srp", "swa", "swp", "ura", "urp", "uwa", "uwp" };
unsigned long long gate_desc_orig,gate_desc_orig1;
static void my_intrept( unsigned long *tos )
{
// stack-layout:
// es,ds,edi,esi,ebp,esp,ebx,edx,ecx,eax,err,eip,cs,efl
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13
volatile unsigned long vaddr;
struct task_struct *task = current;
unsigned long err = tos[ 10 ];
unsigned long eip = tos[ 11 ];
static int count = 0;
int exe, len = 0;
char msg[80]="";
// get the faulting virtual address from register CR2
asm(" mov %%cr2, %%eax ; movl %%eax, %0 " : "=m" (vaddr) );
// construct the diagnostic message
len += sprintf( msg+len, "#%-6d ", ++count );
len += sprintf( msg+len, "%16s ", task->comm );
len += sprintf( msg+len, "pid=%-5d ", task->pid );
len += sprintf( msg+len, "CR2=%08X ", (unsigned int) vaddr );
len += sprintf( msg+len, "EIP=%08X ", (unsigned int) eip );
len += sprintf( msg+len, "%s ", why[ err ] );
// note if an instruction-fetch caused the page-fault
if ( vaddr == eip ) exe = 'x'; else exe = ' ';
len += sprintf( msg+len, "%c ", exe );
// print this diagnostic message to the kernel log
printk( "<1> %s \n", msg );
}
//---------- NEW PAGE-FAULT EXCEPTION-HANDLER ---------//
asmlinkage void isr0x0E( void );
asm(" .text ");
asm(" .type isr0x0E, #function ");
asm("isr0x0E: ");
asm(" pushal ");
asm(" pushl %ds ");
asm(" pushl %es ");
//
asm(" movl %ss, %eax ");
asm(" movl %eax, %ds ");
asm(" movl %eax, %es ");
//
asm(" pushl %esp ");
asm(" call my_intrept ");
asm(" addl $4, %esp ");
//
asm(" popl %es ");
asm(" popl %ds ");
asm(" popal ");
asm(" jmp *isr_orig ");
//-------------------------------------------------------//
static void load_IDTR( void *regimage )
{
asm(" lidt %0 " : : "m" (*(unsigned short*)regimage) );
}
int pgfault_init( void )
{
int i;
unsigned long long gate_desc,gate_desc1,gate_desc2;
spinlock_t lock =SPIN_LOCK_UNLOCKED;
unsigned long flags;
unsigned short selector1;
// allocate a mapped kernel page for our new IDT
kpage =__get_free_page( GFP_KERNEL);
if ( !kpage ) return -ENOMEM;
// initialize our other global variables
asm(" sidt oldidtr ; sidt newidtr ");
memcpy( newidtr+1, &kpage, sizeof( kpage ) );
oldidt = (unsigned long long *)(*(unsigned long*)(oldidtr+1));
newidt = (unsigned long long *)(*(unsigned long*)(newidtr+1));
// extract and save entry-point to original page-pault handler
gate_desc_orig = oldidt[ PGFAULT_INT ];
gate_desc =gate_desc_orig & 0xFFFF00000000FFFF;
gate_desc |= ( gate_desc >> 32 );
isr_orig = (unsigned long)gate_desc;
// initialize our new Interrupt Descriptor Table
memcpy( newidt, oldidt, 256*sizeof( unsigned long long ) );
gate_desc_orig1 = (unsigned long)isr0x0E;
gate_desc = gate_desc_orig1 & 0x00000000FFFFFFFF;
gate_desc = gate_desc | ( gate_desc << 32 );
gate_desc1= 0xFFFF0000;
gate_desc1= gate_desc1 << 32;
gate_desc1= gate_desc1 | 0x0000FFFF;
gate_desc = gate_desc & gate_desc1;
gate_desc2= 0x0000EF00;
gate_desc2= gate_desc2 <<32;
gate_desc2= gate_desc2 | 0x00100000;
gate_desc = gate_desc | gate_desc2; // trap-gate
//Part which is most likely creating a fault when loading the idtr
newidt[ PGFAULT_INT ] = gate_desc;
//**********************************************
// activate the new IDT
spin_lock_irqsave(&lock,flags);
load_IDTR( newidtr );
spin_unlock_irqrestore(&lock,flags);
// smp_call_function( load_IDTR, oldidtr, 1, 1 );
return SUCCESS;
}
void pgfault_exit( void )
{
// reactivate the old IDT
unsigned long flags;
spinlock_t lock =SPIN_LOCK_UNLOCKED;
spin_lock_irqsave(&lock,flags);
load_IDTR( oldidtr );
spin_unlock_irqrestore(&lock,flags);
// smp_call_function( load_IDTR, oldidtr, 1, 1 );
// release allocated kernel page
if ( kpage ) free_page( kpage );
}
EXPORT_SYMBOL_GPL(my_intrept);
MODULE_LICENSE("GPL");
module_init( pgfault_init);
module_exit( pgfault_exit);
Why don't you use kernel function instead of fiddling with bits manually!
check it (it is initialization module func):
struct desc_ptr newidtr;
gate_desc *oldidt, *newidt;
store_idt(&__IDT_register);
oldidt = (gate_desc *)__IDT_register.address;
__IDT_page =__get_free_page(GFP_KERNEL);
if(!__IDT_page)
return -1;
newidtr.address = __IDT_page;
newidtr.size = __IDT_register.size;
newidt = (gate_desc *)newidtr.address;
memcpy(newidt, oldidt, __IDT_register.size);
pack_gate(&newidt[PGFAULT_NR], GATE_INTERRUPT, (unsigned long)isr0x0E, 0, 0, __KERNEL_CS);
__load_idt((void *)&newidtr);
smp_call_function(__load_idt, &newidtr, 0, 1);
return 0;
I have tested it it works!
Your segment selector in your trap gate descriptor appears to be hardcoded to 0x0010, when it should be __KERNEL_CS (which is 0x0060 in the 2.6.26 kernel sources I have).
By the way, this is pretty baroque:
gate_desc_orig1 = (unsigned long)isr0x0E;
gate_desc = gate_desc_orig1 & 0x00000000FFFFFFFF;
gate_desc = gate_desc | ( gate_desc << 32 );
gate_desc1= 0xFFFF0000;
gate_desc1= gate_desc1 << 32;
gate_desc1= gate_desc1 | 0x0000FFFF;
gate_desc = gate_desc & gate_desc1;
gate_desc2= 0x0000EF00;
gate_desc2= gate_desc2 <<32;
gate_desc2= gate_desc2 | 0x00100000;
gate_desc = gate_desc | gate_desc2; // trap-gate
You could simplify that down to (with the __KERNEL_CS fix):
gate_desc = (unsigned long long)isr0x0E * 0x100000001ULL;
gate_desc &= 0xFFFF00000000FFFFULL;
gate_desc |= 0x0000EF0000000000ULL; // trap-gate
gate_desc |= (unsigned long long)__KERNEL_CS << 16;
For reference, here is a working implementation of a custom page fault handler for the Linux x86_64 architecture. I just tested this module myself with kernel 3.2, it works perfectly.
https://github.com/RichardUSTC/intercept-page-fault-handler