kernel change MMAP memory but user space not change - linux-kernel

Goal: I want to transfer data from kernel driver to user space app real time.
Method: I use mmap to share one buffer between the kernel (buffer_K) and user space (buffer_U), so that when the driver writes data to buffer_K, buffer_U should show the same data.
Problem: When I update buffer_K with memcpy(buffer_K, buffer_another, length), buffer_K changes but buffer_U does not. The buffer is updated every 4 ms.
This is my code
In kernel space, when the driver has finished its work it sends a signal to notify user space.
static uint8_t *mmap_buffer;
mmap_buffer = (uint8_t *)kmalloc(ads1299.samp_size * ads1299.buff_size, GFP_KERNEL);
/*
 * Copy the current acquisition buffer (ads1299.buff_a) into the buffer that
 * is exported through mmap, then run SEND_SIGNAL_TO_APP.
 * NOTE(review): SEND_SIGNAL_TO_APP is a placeholder in this snippet;
 * presumably it delivers the signal the user-space handler waits for.
 *
 * Returns 0 (the original fell off the end of a non-void function).
 */
int event()
{
    memcpy(mmap_buffer, ads1299.buff_a, ads1299.buff_size * ads1299.samp_size);
    SEND_SIGNAL_TO_APP;
    return 0;
}
int ads1299_mmap(struct file *flip, struct vm_area_struct *vma)
{
unsigned long page;
unsigned long start = (unsigned long)vma->vm_start;
unsigned long size = (unsigned long)(vma->vm_end - vma->vm_start);
vma->vm_flags |= VM_IO;
vma->vm_flags |= VM_SHARED;
page = virt_to_phys(mmap_buffer);
if(remap_pfn_range(vma,start,page>>PAGE_SHIFT, size, vma->vm_page_prot))
{
return -1;
}
return 0;
}
This is my user code
/*
 * Map one page of the driver's buffer read-only.  The mapping itself is the
 * buffer: no malloc() is needed (the original malloc() result was overwritten
 * by the mmap() return value and leaked).
 */
unsigned char *buffer = NULL;
buffer = (unsigned char *)mmap(NULL, getpagesize(), PROT_READ, MAP_SHARED, fd, 0);
if (buffer == MAP_FAILED)
{
    printf("mmap error\r\n");
    return -1;
}
/*
 * Called when the driver signals that new data is available; hands the
 * mapped buffer to the consumer.  Returns 0.
 */
int signal_handle()
{
    usebufer(buffer);
    return 0;
}
Could you give me some suggestions about my problem? Thanks.

Related

Why does L2 hardware prefetcher perform worse with only 1 KiB or 2 KiB access size?

I have a simple multi-threaded program where the thread performs random reads on a given file (in memory) divided evenly amongst the threads. The thread reads from the file to buffer and sets a value. This is really a program designed to test memory bandwidth. This is the following program,
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <getopt.h>
#include <errno.h>
#include <stdbool.h>
#include <ctype.h>
#include <inttypes.h>
#include <pthread.h>
#include <assert.h>
#include <time.h>
#define NS_IN_SECOND 1000000000
/**
 * Current calendar (wall-clock) time in nanoseconds since the Epoch.
 *
 * Uses ISO C11 timespec_get(TIME_UTC), which needs no feature-test macro.
 * Returns 0 if the clock cannot be read; the original fell off the end of
 * the function in that case, which is undefined behaviour.
 */
uint64_t nano_time(void) {
    struct timespec ts;

    if (timespec_get(&ts, TIME_UTC) != TIME_UTC)
        return 0;

    /* Widen before multiplying so a 32-bit time_t cannot overflow. */
    return (uint64_t)ts.tv_sec * NS_IN_SECOND + (uint64_t)ts.tv_nsec;
}
// avx512 test
#include <stdint.h>
void *__memmove_chk_avx512_no_vzeroupper(void *dest, void *src, size_t s);
/**
* To create 4 GB file: This will allocate space on disk
* $ dd < /dev/zero bs=1048576 count=4096 > testfile
*
* 100 GiB
* dd if=/dev/zero of=bigmmaptest bs=1M count=102400
* To clear cache:
* $ sync; echo 1 > /proc/sys/vm/drop_caches
*/
//#define SAMPLE_LATENCY 1
#define BYTES_IN_GB (1024*1024*1024)
// Block sized will be used for read and the same will be used for striding
// when iterating over a file in mmap.
#define DEFAULT_BLOCK_SIZE 4096 //8192
#define NANOSECONDS_IN_SECOND 1000000000
const char DEFAULT_NAME[] = "/mnt/tmp/mmaptest";
#define EXIT_MSG(...) \
do { \
printf(__VA_ARGS__); \
_exit(-1); \
} while (0)
uint64_t read_mmap_test(int fd, int tid, size_t block_size, size_t filesize, char* buf,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t write_mmap_test(int fd, int tid, size_t block_size, size_t filesize, char* buf,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t mmap_test(int fd, int tid, size_t block_size, size_t filesize, char *buf,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t read_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t write_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t syscall_test(int fd, int tid, size_t block_size, size_t filesize,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end);
size_t get_filesize(const char* filename);
void print_help_message(const char *progname);
char* map_buffer(int fd, size_t size);
void *run_tests(void *);
static int silent = 0;
/* Per-thread arguments and results; main() fills one entry per worker thread. */
typedef struct {
int tid; /* thread index, set to the loop counter in main() */
int fd; /* file descriptor shared by all threads */
char *mapped_buffer; /* result of map_buffer(), or NULL when no mmap test runs */
/* Test selectors: run_tests() runs the first one that is non-zero, in this order:
 * read_mmap, read_syscall, write_mmap, write_syscall. */
int read_mmap;
int read_syscall;
int write_mmap;
int write_syscall;
off_t *offsets; /* start of this thread's slice of the offset table */
size_t block_size; /* bytes transferred per operation */
size_t chunk_size; /* bytes this thread covers (filesize / numthreads) */
int retval; /* not written by any code visible in this file */
uint64_t start_time; /* nano_time() at the start of the timed loop */
uint64_t end_time; /* nano_time() at the end of the timed loop */
} threadargs_t;
size_t filesize;
int main(int argc, char **argv) {
char *fname = (char*) DEFAULT_NAME;
char *mapped_buffer = NULL;
int c, fd, i, flags = O_RDWR, numthreads = 1, ret, option_index;
static int randomaccess = 0,
read_mmap = 0, read_syscall = 0,
write_mmap = 0, write_syscall = 0,
mixed_mmap = 0, write_tr = 0;
off_t *offsets = 0;
size_t block_size = DEFAULT_BLOCK_SIZE, numblocks,
new_file_size = 0;
uint64_t min_start_time, max_end_time = 0, retval;
// permissions
uint64_t mode = S_IRWXU | S_IRWXG;
pthread_t *threads;
threadargs_t *threadargs;
static struct option long_options[] =
{
// Options set a flag
{"randomaccess", no_argument, &randomaccess, 1},
{"readmmap", no_argument, &read_mmap, 1},
{"readsyscall", no_argument, &read_syscall, 1},
{"silent", no_argument, &silent, 1},
{"writemmap", no_argument, &write_mmap, 1},
{"writesyscall", no_argument, &write_syscall, 1},
{"mixedmmap", no_argument, &mixed_mmap, 1},
// Options take an argument
{"block", required_argument, 0, 'b'},
{"file", required_argument, 0, 'f'},
{"help", no_argument, 0, 'h'},
{"size", no_argument, 0, 's'},
{"threads", required_argument, 0, 't'},
{"writethreads", no_argument, 0, 'w'},
{0, 0, 0, 0}
};
//read operations
while(1) {
c = getopt_long(argc, argv, "b:f:h:s:t:w:",
long_options, &option_index);
// is end of the option
if (c == -1)
break;
switch(c)
{
case 0:
break;
case 'b':
block_size = atoi(optarg);
break;
case 'f':
fname = optarg;
break;
case 'h':
print_help_message(argv[0]);
_exit(0);
case 's':
new_file_size = (size_t)(atoi(optarg)) * BYTES_IN_GB;
break;
case 't':
numthreads = (int) (atoi(optarg));
break;
case 'w':
write_tr = atoi(optarg);
break;
default:
break;
}
}
if(!silent){
printf("PID: %d\n", getpid());
printf("Using file %s \n", fname);
}
if ((filesize = get_filesize(fname)) == -1) {
if (read_mmap || read_syscall) {
printf("Cannot obtain file size for %s: %s"
"File must exist prior to running read tests.\n",
fname, strerror(errno));
_exit(-1);
}
else
filesize = new_file_size;
}
fd = open((const char*)fname, flags, mode);
if(fd <0) {
printf("Clould not open/create file %s: %s\n",
fname, strerror(errno));
_exit(-1);
}
if(block_size < 0 || block_size > filesize){
printf("Invalid block size: %zu for file of size "
"%zu. Block size must be greater than 0 and no"
"greater than the file size.\n",
block_size, filesize);
_exit(-1);
}
/*
* Generate random block number for random file access.
* Sequential for sequential access
*/
numblocks = filesize/block_size;
if(filesize % block_size > 0)
numblocks++;
offsets = (off_t *) malloc(numblocks * sizeof(off_t));
if(offsets == 0){
printf("Failed to allocate memory: %s\n", strerror(errno));
_exit(-1);
}
for (uint64_t i = 0; i < numblocks; i++)
if(randomaccess)
offsets[i] = ((int)random() % numblocks) * block_size;
else
offsets[i] = i*block_size;
if (numblocks % numthreads != 0)
EXIT_MSG("We have %" PRIu64 " blocks and %d threads. "
"Threads must evenly divide blocks. "
"Please fix the args.\n",
(uint_least64_t)numblocks, numthreads);
if( read_mmap || write_mmap || mixed_mmap)
assert((mapped_buffer = map_buffer(fd, filesize)) != NULL);
threads = (pthread_t*)malloc(numthreads * sizeof(pthread_t));
threadargs =
(threadargs_t*)malloc(numthreads * sizeof(threadargs_t));
if (threads == NULL || threadargs == NULL)
EXIT_MSG("Could not allocate thread array for %d threads.\n", numthreads);
for (i = 0; i < numthreads; i++) {
if(mixed_mmap){
if (i < write_tr) {
write_mmap = 1;
} else {
read_mmap = 1;
}
}
threadargs[i].fd = fd;
threadargs[i].tid = i;
threadargs[i].block_size = block_size;
threadargs[i].chunk_size = filesize/numthreads;
threadargs[i].mapped_buffer = mapped_buffer;
threadargs[i].offsets = &offsets[numblocks/numthreads * i];
threadargs[i].read_mmap = read_mmap;
threadargs[i].read_syscall = read_syscall;
threadargs[i].write_mmap = write_mmap;
threadargs[i].write_syscall = write_syscall;
int ret = pthread_create(&threads[i], NULL, run_tests, &threadargs[i]);
if (ret!=0)
EXIT_MSG("pthread_create for %dth thread failed: %s\n",
i, strerror(errno));
}
for (i = 0; i< numthreads; i++){
ret = pthread_join(threads[i], NULL);
if (ret !=0)
EXIT_MSG("Thread %d failed in join: %s\n",
i, strerror(errno));
}
// for mixed mode determine read and write aggregate b/w.
if(mixed_mmap) {
// Write b/w
min_start_time = threadargs[0].start_time;
max_end_time = 0;
// Since tid 0 to write_tr-1 did writes, find it's min and max.
for(i=0; i < write_tr; i++){
min_start_time = (threadargs[i].start_time < min_start_time)?
threadargs[i].start_time:min_start_time;
max_end_time = (threadargs[i].end_time > max_end_time)?
threadargs[i].end_time:max_end_time;
}
printf("Write: %.2f\n",
(double)write_tr*(filesize/numthreads)/(double)(max_end_time-min_start_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
// Read b/w
min_start_time = threadargs[write_tr].start_time;
max_end_time = 0;
for(i=write_tr; i < numthreads; i++){
min_start_time = (threadargs[i].start_time < min_start_time)?
threadargs[i].start_time:min_start_time;
max_end_time = (threadargs[i].end_time > max_end_time)?
threadargs[i].end_time:max_end_time;
}
printf("Read: %.2f\n",
(double)(numthreads-write_tr)*(filesize/numthreads)/(double)(max_end_time-min_start_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
}
/**
* For total run time. Find the smallest start time
* and largest end time across all threads.
*/
min_start_time = threadargs[0].start_time;
max_end_time = 0;
for (i=0; i< numthreads; i++){
min_start_time = (threadargs[i].start_time < min_start_time)?
threadargs[i].start_time:min_start_time;
max_end_time = (threadargs[i].end_time > max_end_time)?
threadargs[i].end_time:max_end_time;
}
printf("%.2f\n",
(double)filesize/(double)(max_end_time-min_start_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
munmap(mapped_buffer, filesize);
close(fd);
}
void * run_tests(void *args) {
uint64_t retval;
threadargs_t t = *(threadargs_t*)args;
if(t.read_mmap) {
if(!silent)
printf("Running read mmap test:\n");
retval = read_mmap_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.mapped_buffer, t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
else if(t.read_syscall) {
if(!silent)
printf("Running read syscall test:\n");
retval = read_syscall_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
else if(t.write_mmap) {
if(!silent)
printf("Running write mmap test:\n");
retval = write_mmap_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.mapped_buffer, t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
else if(t.write_syscall) {
if(!silent)
printf("Running write syscall test:\n");
retval = write_syscall_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
return (void*) 0;
}
#define READ 1
#define WRITE 2
/**
********* SYSCALL section
*/
/* pread()-based read test: forwards to syscall_test() with optype READ. */
uint64_t read_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
        off_t *offsets, uint64_t *begin, uint64_t *end) {
    const char op = READ;

    return syscall_test(fd, tid, block_size, filesize, op, offsets, begin, end);
}
/* pwrite()-based write test: forwards to syscall_test() with optype WRITE. */
uint64_t write_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
        off_t *offsets, uint64_t *begin, uint64_t *end) {
    const char op = WRITE;

    return syscall_test(fd, tid, block_size, filesize, op, offsets, begin, end);
}
/**
 * Timed pread()/pwrite() loop.
 *
 * Transfers `block_size` bytes per call at offsets[0], offsets[1], ... until
 * a call transfers nothing, a write test has moved `filesize` bytes, or the
 * number of calls times `block_size` reaches `filesize`.
 *
 * @param optype  READ or WRITE; any other value performs no I/O.
 * @param begin   receives nano_time() taken just before the loop.
 * @param end     receives nano_time() taken just after the loop.
 * @return a token derived from the transferred data (so the loop has an
 *         observable result), or (uint64_t)-1 on allocation or I/O failure,
 *         in which case *begin and *end are not written.
 */
uint64_t syscall_test(int fd, int tid, size_t block_size, size_t filesize,
        char optype, off_t *offsets, uint64_t *begin, uint64_t *end) {
    size_t next = 0;                       /* index of the next offset */
    size_t total_bytes_transferred = 0;
    uint64_t begin_time, end_time, ret_token = 0;
    char *buffer = malloc(block_size);

    if (buffer == NULL) {
        printf("Failed to allocate memory: %s\n", strerror(errno));
        return -1;
    }
    /* Touch the buffer before timing starts so its page faults are not measured. */
    memset(buffer, 0, block_size);

    begin_time = nano_time();
    for (;;) {
        /* pread/pwrite return ssize_t; -1 signals an error. */
        ssize_t bytes_transferred;

        if (optype == READ)
            bytes_transferred = pread(fd, buffer, block_size, offsets[next++]);
        else if (optype == WRITE)
            bytes_transferred = pwrite(fd, buffer, block_size, offsets[next++]);
        else
            break;

        if (bytes_transferred == 0)
            break;
        if (bytes_transferred < 0) {
            printf("Failed to IO: %s\n", strerror(errno));
            free(buffer);
            return -1;
        }

        total_bytes_transferred += (size_t)bytes_transferred;
        // Do random operation
        ret_token += buffer[0];

        if (optype == WRITE && total_bytes_transferred == filesize)
            break;
        if (next * block_size >= filesize)
            break;
    }
    end_time = nano_time();

    if (!silent) {
        printf("%s: %" PRIu64 " bytes transferred in %" PRIu64 ""
                " ns.\n", (optype == READ)?"read-syscall":"write-syscall",
                (uint_least64_t)total_bytes_transferred, (end_time-begin_time));
        // Throughput in GB/s
        printf("(tid %d) %.2f\n", tid,
                (double)filesize/(double)(end_time-begin_time)
                * NANOSECONDS_IN_SECOND / BYTES_IN_GB);
    }
    *begin = begin_time;
    *end = end_time;
    free(buffer);
    return ret_token;
}
/**
* MMAP tests
*/
/* mmap read test: forwards to mmap_test() with optype READ. */
uint64_t read_mmap_test(int fd, int tid, size_t block_size, size_t filesize,
        char *buf, off_t *offsets, uint64_t *begin, uint64_t *end) {
    const char op = READ;

    return mmap_test(fd, tid, block_size, filesize, buf, op, offsets, begin, end);
}
/* mmap write test: forwards to mmap_test() with optype WRITE. */
uint64_t write_mmap_test(int fd, int tid, size_t block_size, size_t filesize,
        char *buf, off_t *offsets, uint64_t *begin, uint64_t *end){
    const char op = WRITE;

    return mmap_test(fd, tid, block_size, filesize, buf, op, offsets, begin, end);
}
// Add memory addr
#if SAMPLE_LATENCY
#define BEGIN_LAT_SAMPLE \
if (num_samples < MAX_LAT_SAMPLES && i%LAT_SAMPL_INTERVAL == 0) \
lat_begin_time = nano_time();
#define END_LAT_SAMPLE \
if (num_samples < MAX_LAT_SAMPLES && i%LAT_SAMPL_INTERVAL == 0) { \
lat_end_time = nano_time(); \
latency_samples[i/LAT_SAMPL_INTERVAL % MAX_LAT_SAMPLES] = \
lat_end_time - lat_begin_time; \
num_samples++; \
}
#define MAX_LAT_SAMPLES 50
//#define LAT_SAMPL_INTERVAL (1000*1048576)
#define LAT_SAMPL_INTERVAL block_size
#else
#define BEGIN_LAT_SAMPLE ;
#define END_LAT_SAMPLE
#endif
/**
 * Timed memcpy loop over a mapped file.
 *
 * For each block index k in [0, filesize / block_size) copies `block_size`
 * bytes between a private scratch buffer and mapped_buffer + offsets[k]
 * (mapping -> scratch for READ, scratch -> mapping for WRITE).
 *
 * @param fd      unused here; kept for symmetry with syscall_test().
 * @param begin   receives nano_time() taken just before the loop.
 * @param end     receives nano_time() taken just after the loop.
 * @return a token derived from the copied data (so the copies have an
 *         observable result), or (uint64_t)-1 if the scratch buffer cannot
 *         be allocated.
 */
uint64_t mmap_test(int fd, int tid, size_t block_size, size_t filesize, char *mapped_buffer,
        char optype, off_t *offsets, uint64_t *begin, uint64_t *end) {
    char *buffer = NULL;
    uint64_t i;
    uint64_t begin_time, end_time, ret_token = 0;
#if SAMPLE_LATENCY
    uint64_t lat_begin_time, lat_end_time;
    size_t latency_samples[MAX_LAT_SAMPLES];
    int num_samples = 0;
    memset((void*)latency_samples, 0, sizeof(latency_samples));
#endif
    buffer = malloc(block_size);
    if (buffer == NULL) {
        printf("Failed to allocate memory: %s\n", strerror(errno));
        return -1;
    }
    /* Touch the scratch buffer before timing starts. */
    memset(buffer, 1, block_size);

    begin_time = nano_time();
    for (i = 0; i < filesize; i += block_size) {
        off_t offset = offsets[i / block_size];
        BEGIN_LAT_SAMPLE;
        if (optype == READ) {
            memcpy(buffer, &mapped_buffer[offset], block_size);
            ret_token += buffer[0];
        }
        else if (optype == WRITE) {
            memcpy(&mapped_buffer[offset], buffer, block_size);
            /* Read back from the block just written.  The original indexed
             * with the loop counter `i`, touching an unrelated location and
             * adding stray read traffic to the write measurement. */
            ret_token += mapped_buffer[offset];
        }
        END_LAT_SAMPLE;
    }
    end_time = nano_time();

    if (!silent) {
        printf("%s: %" PRIu64 " bytes read in %" PRIu64 " ns.\n",
                (optype==READ)?"readmap":"writemap",
                (uint_least64_t)filesize, (end_time-begin_time));
        // print GB/s
        printf("(tid %d) %.2f\n", tid,
                (double)filesize/(double)(end_time-begin_time)
                * NANOSECONDS_IN_SECOND / BYTES_IN_GB);
    }
    *begin = begin_time;
    *end = end_time;
#if SAMPLE_LATENCY
    /* %zu / PRIu64 match size_t / uint64_t; the original used %ld. */
    printf("\nSample latency for %zu byte block:\n", block_size);
    for (i = 0; i < MAX_LAT_SAMPLES; i++)
        printf("\t%" PRIu64 ": %zu\n", i, latency_samples[i]);
#endif
    free(buffer);
    return ret_token;
}
/**
 * Map the first `size` bytes of `fd` read/write as a private, pre-populated
 * mapping (MAP_PRIVATE | MAP_POPULATE).  Terminates the process through
 * EXIT_MSG if mmap() fails, so the returned pointer is always a valid mapping.
 *
 * Variants the author experimented with (not active): a MAP_SHARED mapping,
 * an anonymous private mapping with fd -1, and madvise(MADV_HUGEPAGE) on the
 * result (which might additionally need a page-aligned region).
 */
char* map_buffer(int fd, size_t size) {
    const int prot = PROT_READ | PROT_WRITE;
    const int how = MAP_PRIVATE | MAP_POPULATE;
    char *region = (char*)mmap(NULL, size, prot, how, fd, 0);

    if (region == MAP_FAILED)
        EXIT_MSG("Failed to mmap file of size %zu: %s\n",
                size, strerror(errno));
    return region;
}
/**
 * Size in bytes of the file at `filename`, taken from stat().
 * Returns (size_t)-1 when stat() fails.
 */
size_t get_filesize(const char* filename){
    struct stat info;

    if (stat(filename, &info) != 0)
        return (size_t)-1;
    return (size_t)info.st_size;
}
/**
 * Print the usage text to stdout.  Only the final path component of
 * `progname` is shown as the program name.
 */
void print_help_message(const char *progname) {
    /* take only the last portion of the path */
    const char *prog = progname;
    const char *slash = strrchr(progname, '/');
    if (slash != NULL)
        prog = slash + 1;

    printf("usage: %s [OPTION]\n", prog);
    fputs(" -h, --help\n"
          " Print this help and exit.\n", stdout);
    printf(" -b, --block[=BLOCKSIZE]\n"
           " Block size used for read system calls.\n"
           " For mmap tests, the size of the stride when iterating\n"
           " over the file.\n"
           " Defaults to %d.\n", DEFAULT_BLOCK_SIZE);
    printf(" -f, --file[=FILENAME]\n"
           " Perform all tests on this file (defaults to %s).\n",
           DEFAULT_NAME);
    /* The remaining entries take no format arguments. */
    fputs(" --readsyscall\n"
          " Perform a read test using system calls.\n"
          " --readmmap\n"
          " Perform a read test using mmap.\n"
          " --writesyscall\n"
          " Perform a write test using system calls.\n"
          " --writemmap\n"
          " Perform a write test using mmap.\n"
          " --randomaccess\n"
          " Perform random access.\n"
          " --threads\n"
          " Number of threads to use. Defaults to one.\n"
          " --mixedmmap\n"
          " Perfom read and write concurrently at different offsets\n"
          " -w, -writethreads[=0]\n"
          " Number of threads that should perform write\n", stdout);
}
To compile:
$ gcc testm.c -o testm -lpthread -static -O2 -fno-builtin-memcpy
Commands to run the program:
$ dd if=/dev/zero of=bigmmaptest bs=1M count=25600 # 25 GiB file
$ ./testm -b 1024 -f bigmmaptest --threads 16 --randomaccess --readmmap
I am on a 32 core Xeon 5218 2nd Gen. L1d KiB /L2 MiB /L3 MiB -- 512 / 16 / 22
When the memcpy size is 1 KiB I get 21.7 GB/s but when the size is 256B I get 26.68 GB/s and 34.8 GB/s when the size is 4 KiB. Why is there a drop in the middle?
I observe that 2 KiB also performs poorly when compared to 256B and 4 KiB.
What's more interesting is, when I disable the L2 hardware prefetcher and without any other changes my bandwidth automatically increases for 1 KiB and 2 KiB. Without prefetch 2 KiB memcpy gives 34.8 GB/s. All of these are aggregate bandwidth.
With perf, I did measure L2 load-store misses but they turned out to not change drastically. This effect is also not seen for 8 threads and below.
I am on linux 5.0.4. I am using the glibC memcpy (gcc 7.5.0) and even with -O2 I observe the above quirk. Where 1 KiB access size gives 18.76 GiB/s with L2 prefetch and without I get 30.32 GiB/s. For comparison, 256 B access size provides 24.7 GiB/s with prefetch and 24.8 GiB/s without. Clearly, the drop in performance is because of the L2 cache pollution caused by the prefetcher, as this is not observed with smaller thread counts. I was considering if SMT could be the reason for increased pollution but I observe the effect distinctly at 16 threads on 16 physical cores.
Skimming through glibc memcpy code, I can see that any access below the size of 4 KiB uses AVX 256 instructions, so there is nothing changing there.
The smaller 256B size not seeing a drop from the L2 streamer might be due to the sequence of cache misses being too short to activate the streamer and waste bandwidth (and slots in the LFBs and L2 <-> L3 superqueue) on requests that won't be useful.
For aligned 4k, there are no bytes within the same page that you're not fetching, so the L2 prefetcher is positively useful, or at least not harmful. (Demand loads come in pretty quickly for later lines when running memcpy so I'm guessing speeds were about the same with/without HW prefetch enabled, unless HW prefetch helps getting started on a new 4k chunk while still waiting for the end of the previous.)
The L2 only sees physical addresses, and AFAIK it doesn't try to prefetch across a 4k boundary. (Even if its within the same 2M hugepage, because it doesn't know that either.) The "next-page prefetcher" Intel mentions being new in Ivy Bridge is AFAIK just a TLB prefetch, not data.
So with aligned 4k memcpy, HW prefetch stops automatically at the end of the data you're actually going to read, not wasting any bandwidth. Since mmap gives you page-aligned memory, these 4k memcopies are from a single source page.
(The destination is irrelevant as it probably stays hot in L1d cache, with maybe an occasional eviction to L2, and the reload from it after memcpy can come from store-forwarding, not even having to wait for memcpy's store to commit to L1d.)
Prediction: If your smaller memcpy source starts part way into a 4k page, but still end at the end of a 4k page, you'd probably see similar behaviour to prefetch disabled. e.g. generate a random page number, and start at 3072 bytes into it, doing a 1 KiB copy. So all your 1 KiB copies come from the ends of pages, never middles.
(You'd still have more dTLB misses per byte memcpyed, because each TLB entry is only covering 1 K of the data you ever actually read. You did use MAP_POPULATE, so you shouldn't be seeing page faults in the timed region, assuming you have enough RAM.)
L1d KiB /L2 MiB /L3 MiB -- 512 / 16 / 22
Those are aggregate totals, but L1d and L2 are private per-core! You have 32kiB L1d and 1MiB L2 per core, because this is Cascade Lake, same layout as Skylake-X.
And BTW, I'd consider using a fast PRNG like xorshift+ or xorshift* inside the timing loop; that's easily random enough to defeat prefetching; even a simple LFSR or even LCG with a power-of-2 modulo would do that (and be very cheap, just an imul and add). It avoids having to read offsets from another array, if you really want to isolate just the memcpy memory accesses. Probably doesn't make a difference though. One advantage of a very simple PRNG with a period equal to the space you're trying to cover (like an LCG) is that you won't generate the same address twice, giving you a random permutation of the blocks. But with a big enough block of memory, random cache hits even from L3 are unlikely even without that hard-to-achieve property.
Your current array of offsets is fine. (I didn't look at the code super closely, so I'm just assuming there aren't bugs.)

Linux Kernel: manually modify page table entry flags

I am trying to manually mark a certain memory region of a userspace process as non-cacheable (for educational purposes, not intended to be used in production code) by setting a flag in the respective page table entries.
I have an Ubuntu 14.04 (ASLR disabled) with a 4.4 Linux kernel running on an x86_64 Intel Skylake processor.
In my kernel module I have the following function:
/*
 * Mark the 4K pages of process 'pid' whose address lies in [start, end),
 * except the page at 'addr', as uncacheable by rewriting their page table
 * entries.  Every VMA of the process is walked and its 4K PTEs are logged.
 *
 * Returns 0 after walking all VMAs, -1 if the task or its address space
 * cannot be found or the task has no VMAs.
 *
 * NOTE(review): the task lookup is not done under rcu_read_lock(), the VMA
 * list is walked without taking mmap_sem, and no TLB flush follows the PTE
 * update -- confirm whether the caller provides these.
 */
ssize_t set_uncachable(uint32_t pid, uint64_t start, uint64_t end, uint64_t addr)
{
    struct task_struct *ts;
    struct vm_area_struct *curr;
    pgd_t *pgd;
    pte_t *pte;
    uint64_t numpages, curr_addr, j;
    uint32_t level, i = 0;

    printk(KERN_INFO "set_unacheable called\n");

    ts = pid_task(find_vpid(pid), PIDTYPE_PID); //find task from PID
    if (ts == NULL || ts->mm == NULL)
        return -1;
    pgd = ts->mm->pgd; //page table root of the task
    if (ts->mm->mmap == NULL)
        return -1;

    /*
     * The VMA list ends with vm_next == NULL; it is not circular.  The
     * original treated that NULL as an error and so always returned -1.
     */
    for (curr = ts->mm->mmap; curr != NULL; curr = curr->vm_next, i++)
    {
        printk(KERN_INFO "Region %3u [0x%016lx - 0x%016lx]", i, curr->vm_start, curr->vm_end);
        if (curr->vm_start > curr->vm_end)
            continue;
        numpages = (curr->vm_end - curr->vm_start) / PAGE_SIZE; //PAGE_SIZE is 4K for now

        for (j = 0; j < numpages; j++)
        {
            curr_addr = curr->vm_start + (PAGE_SIZE * j);
            pte = lookup_address_in_pgd(pgd, curr_addr, &level);
            /* Only entries reported with level == 1 are handled. */
            if (pte == NULL || level != 1)
                continue;

            printk(KERN_INFO "PTE for 0x%016llx - 0x%016llx (level %u)\n",
                   (unsigned long long)curr_addr, (unsigned long long)pte->pte, level);
            if (curr_addr >= start && curr_addr < end && curr_addr != addr)
            {
                //setting page entry to PAT#3
                pte->pte |= PWT_BIT | PCD_BIT;
                pte->pte &= ~PAT_BIT;
                printk(KERN_INFO "PTE for 0x%016llx - 0x%016llx (level %u) -- UPDATED\n",
                       (unsigned long long)curr_addr, (unsigned long long)pte->pte, level);
            }
        }
    }
    return 0;
}
To test the above code I run an application that allocates a certain region in memory:
//#define BUF_ADDR_START 0x0000000008400000LL /* works */
#define BUF_ADDR_START 0x00007ffff0000000LL /* does not work */
[...]
/* Anonymous mappings have no backing file: pass fd -1, which portable code
 * is expected to do when MAP_ANONYMOUS is set. */
buffer = mmap((void *)BUF_ADDR, BUF_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED | MAP_POPULATE, -1, 0);
if ( buffer == MAP_FAILED )
{
    printf("Failed to map buffer\n");
    exit(-1);
}
memset(buffer, 0, BUF_SIZE);
printf("Buffer at %p\n", buffer);
I want to mark the buffer uncacheable using my kernel module. The code in my kernel module works for 0x8400000, but for 0x7ffff0000000 no page table entry is found (i.e. lookup_address_in_pgd returns NULL). The buffer is definitely allocated in the test program, though.
It seems like my kernel module works for low addresses (code, data, and heap sections), but not for memory mapped at higher addresses (stack, shared libraries, etc.).
Does anyone have an idea why it fails for larger addresses? Suggestions on how to implement set_uncachable more elegantly are welcome as well ;-)
Thanks!

get /dev/random in kernel module

I need to get both /dev/random and /dev/urandom within kernel module.
get_random_bytes API provided to get /dev/urandom.
But there is no API for /dev/random so I tried to ioctl and read file in kernel space.
Here is what I have done.
using RNDGETPOOL ioctl
in include/linux/random.h
RNDGETPOOL is declared
/* Get the contents of the entropy pool. (Superuser only.) */
#define RNDGETPOOL _IOR( 'R', 0x02, int [2] )
But it did not work, so I checked drivers/char/random.c and noticed that RNDGETPOOL is gone:
/*
 * ioctl handler for the random device.  `arg` is interpreted as a user-space
 * pointer to int(s).  Returns 0 on success, -EFAULT when a user-space access
 * fails, -EPERM when CAP_SYS_ADMIN is required but missing, and -EINVAL for
 * a negative entropy count or an unknown command (RNDGETPOOL is not handled).
 */
static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
int size, ent_count;
int __user *p = (int __user *)arg;
int retval;
switch (cmd) {
case RNDGETENTCNT:
/* Copy the input pool's entropy count out to *p. */
/* inherently racy, no point locking */
if (put_user(input_pool.entropy_count, p))
return -EFAULT;
return 0;
case RNDADDTOENTCNT:
/* Privileged: read a count from *p and credit it to the input pool. */
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (get_user(ent_count, p))
return -EFAULT;
credit_entropy_bits(&input_pool, ent_count);
return 0;
case RNDADDENTROPY:
/* Privileged: user layout is { int ent_count; int size; data... }.
 * Each get_user() advances p, so p ends up pointing at the data. */
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (get_user(ent_count, p++))
return -EFAULT;
if (ent_count < 0)
return -EINVAL;
if (get_user(size, p++))
return -EFAULT;
retval = write_pool(&input_pool, (const char __user *)p,
size);
if (retval < 0)
return retval;
credit_entropy_bits(&input_pool, ent_count);
return 0;
case RNDZAPENTCNT:
case RNDCLEARPOOL:
/* Clear the entropy pool counters. */
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
rand_initialize();
return 0;
default:
return -EINVAL;
}
}
I searched google and find out ioctl RNDGETPOOL is removed. done!
using random_read function from driver/char/random.c:997
static ssize_t random_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
here is my kernel module's function accesses to /dev/random.
static void read_file()
{
struct file *file;
loff_t pos = 0;
//ssize_t wc;
unsigned char buf_ent[21]={0,};
int ent_c;
int i;
ssize_t length = 0;
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
file = filp_open("/dev/random", O_WRONLY, 0);
file->f_op->unlocked_ioctl(file, RNDGETENTCNT, &ent_c);
if(ent_c < sizeof(char))
{
printk("not enough entropy\n");
}
printk("ent counter : %d\n", ent_c);
//file->f_op->unlocked_ioctl(file, RNDGETPOOL, &ent_st.buf);
length = file->f_op->read(file, buf_ent, ent_c/ 8, &pos);
if(length <0)
{
printk("failed to random_read\n");
}
printk("length : %d\n", length);
printk("ent: ");
for(i=0;i<length; i++)
{
printk("%02x", buf_ent[i]);
}
printk("\n");
filp_close(file,0);
set_fs(old_fs);
}
outputs seems to be random
first try
[1290902.992048] ent_c : 165
[1290902.992060] length : 20
[1290902.992060] ent: d89290f4a5eea8e087a63943ed0129041e80b568
second try
[1290911.493990] ent_c : 33
[1290911.493994] length : 4
[1290911.493994] ent: 7832640a
By the way, the buf argument of random_read is marked __user, but buf in my code is in kernel space.
Is appropriate using random_read function in kernel space??
The in-kernel interface to get random bytes is get_random_bytes():
/* Fill a 21-byte buffer with get_random_bytes() and hex-dump it to the
 * kernel log with the prefix "ent: ". */
static void read_file(void)
{
    unsigned char buf_ent[21];

    get_random_bytes(buf_ent, sizeof(buf_ent));
    print_hex_dump_bytes("ent: ", DUMP_PREFIX_NONE, buf_ent, sizeof(buf_ent));
}

I can't decrypt the ciphertext with AES-CBC

I encrypt a message and send it to another computer over a socket, but I can't decrypt the ciphertext there. I use the same key and IV on both computers. If I send the ciphertext to the other computer and have it sent back, I can decrypt it on the original computer, but I want to decrypt it on the other computer.
This is my code.
#include "cooloi_aes.h"
// Build a cipher object for the given key.
//
// The key bytes are copied into storage owned by this object.  The original
// stored aes_key.c_str(), a pointer into the by-value parameter, which
// dangles as soon as the constructor returns -- encrypt/decrypt then used
// whatever bytes happened to be at that address.
//
// EVP_aes_256_cbc() reads exactly 32 key bytes: longer keys are truncated,
// shorter keys are zero-padded.
//
// NOTE(review): the object now owns `key`; copying a CooloiAES would free it
// twice, so the copy operations should be disabled in the class declaration.
CooloiAES::CooloiAES(std::string aes_key)
{
    const size_t kKeyLen = 32;
    unsigned char *owned = new unsigned char[kKeyLen]();   // zero-filled
    const size_t n = aes_key.size() < kKeyLen ? aes_key.size() : kKeyLen;
    memcpy(owned, aes_key.data(), n);
    key = owned;

    iv = (unsigned char*)"aabbccddeeffgghh";
    ERR_load_crypto_strings();
    OpenSSL_add_all_algorithms();
    OPENSSL_config(NULL);
}
// Release the key copy made by the constructor.
CooloiAES::~CooloiAES()
{
    delete[] key;
}
// Factory: heap-allocate a CooloiAES for `aes_key`; the caller owns the result.
CooloiAES* CooloiAES::Create(std::string aes_key)
{
    return new CooloiAES(aes_key);
}
// Dump the pending OpenSSL error queue to stderr, then abort the process.
// This function does not return.
void CooloiAES::handleErrors()
{
    ERR_print_errors_fp(stderr);
    abort();
}
int CooloiAES::encrypt(unsigned char *plaintext, int plaintext_len, unsigned char *key,
unsigned char *iv, unsigned char *ciphertext)
{
EVP_CIPHER_CTX *ctx;
int len;
int ciphertext_len;
// Create and initialise the context
if(!(ctx = EVP_CIPHER_CTX_new()))
handleErrors();
// Initialise the encryption operation. IMPORTANT - ensure you use a key
// and IV size appropriate for your cipher
// In this example we are using 256 bit AES (i.e. a 256 bit key). The
// IV size for *most* modes is the same as the block size. For AES this
// is 128 bits
if(1 != EVP_EncryptInit_ex(ctx, EVP_aes_256_cbc(), NULL, key, iv))
handleErrors();
// Provide the message to be encrypted, and obtain the encrypted output.
// EVP_EncryptUpdate can be called multiple times if necessary
if(1 != EVP_EncryptUpdate(ctx, ciphertext, &len, plaintext, plaintext_len))
handleErrors();
ciphertext_len = len;
// Finalise the encryption. Further ciphertext bytes may be written at
// this stage.
//
if(1 != EVP_EncryptFinal_ex(ctx, ciphertext + len, &len))
handleErrors();
ciphertext_len += len;
// Clean up
EVP_CIPHER_CTX_free(ctx);
return ciphertext_len;
}
int CooloiAES::decrypt(unsigned char *ciphertext, int ciphertext_len, unsigned char *key,
unsigned char *iv, unsigned char *plaintext)
{
EVP_CIPHER_CTX *ctx;
int len;
int plaintext_len;
// Create and initialise the context */
if(!(ctx = EVP_CIPHER_CTX_new()))
handleErrors();
// Initialise the decryption operation. IMPORTANT - ensure you use a key
// and IV size appropriate for your cipher
// In this example we are using 256 bit AES (i.e. a 256 bit key). The
// IV size for *most* modes is the same as the block size. For AES this
// is 128 bits
if(1 != EVP_DecryptInit_ex(ctx, EVP_aes_256_cbc(), NULL, key, iv))
handleErrors();
// Provide the message to be decrypted, and obtain the plaintext output.
// EVP_DecryptUpdate can be called multiple times if necessary
if(1 != EVP_DecryptUpdate(ctx, plaintext, &len, ciphertext, ciphertext_len))
handleErrors();
plaintext_len = len;
// Finalise the decryption. Further plaintext bytes may be written at
// this stage.
if(1 != EVP_DecryptFinal_ex(ctx, plaintext + len, &len))
handleErrors();
plaintext_len += len;
// Clean up
EVP_CIPHER_CTX_free(ctx);
return plaintext_len;
}
// Encrypt `msg` with the object's key and IV and return the raw ciphertext.
//
// The result is binary and may contain NUL bytes; transmit it together with
// its length (size()), never as a C string.
//
// The whole message is encrypted, msg.size() bytes; the original used
// strlen(), which stops at the first embedded NUL.
std::string CooloiAES::aes_encrypt(std::string msg)
{
    // Padding always adds 1..16 bytes, so the output is at most the next
    // multiple of 16 strictly above msg.size().
    std::string out((msg.size() / 16 + 1) * 16, '\0');

    int out_len = encrypt((unsigned char*)msg.data(), (int)msg.size(), key, iv,
                          (unsigned char*)&out[0]);
    out.resize(out_len);
    return out;
}
std::string CooloiAES::aes_decrypt(std::string msg)
{
int MSG_LEN = msg.size();
//unsigned char* plaintext = (unsigned char*)msg.c_str();
unsigned char ciphertext[MSG_LEN];
unsigned char decryptedtext[MSG_LEN];
unsigned char cipher[MSG_LEN];
int decryptedtext_len,ciphertext_len;
memcpy(cipher,msg.data(),msg.size());
int len = sizeof(cipher);
decryptedtext_len = decrypt(cipher,len,key,iv,decryptedtext);
decryptedtext[decryptedtext_len] = '\0';
std::string dec((char*)decryptedtext);
return dec;
}

how to convert long int to char

#include <iostream>
#include <Windows.h>
#include <string>
using namespace std;
HANDLE hPort = CreateFile("COM2",
GENERIC_WRITE|GENERIC_READ,0,NULL,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,NULL);
DCB dcb;
bool writebyte(char*data)
{
DWORD byteswritten;
if (!GetCommState(hPort,&dcb))
{
printf("\nSerial port can't be open\n");
return false;
}
dcb.BaudRate = CBR_9600;
dcb.ByteSize = 8;
dcb.Parity = NOPARITY;
dcb.StopBits = ONESTOPBIT;
if (!SetCommState(hPort,&dcb))
return false;
bool retVal = WriteFile(hPort,data,1,&byteswritten,NULL);
return retVal;
}
// Wait for a receive event on the serial port and read one byte.
// Returns the byte value (0..255), or 0 if the port state cannot be read,
// the wait fails, no character-received event is reported, or the read does
// not deliver exactly one byte.  The original returned an uninitialised
// value in the last three cases.
int ReadByte()
{
    BYTE Byte = 0;
    DWORD dwBytesTransferred = 0;
    DWORD dwCommModemStatus = 0;

    if (!GetCommState(hPort,&dcb))
        return 0;
    if (!SetCommMask(hPort,EV_RXCHAR | EV_ERR))
        return 0;
    if (!WaitCommEvent(hPort,&dwCommModemStatus,0))
        return 0;
    if (!(dwCommModemStatus & EV_RXCHAR))
        return 0;
    if (!ReadFile(hPort,&Byte,1,&dwBytesTransferred,0) || dwBytesTransferred != 1)
        return 0;

    return Byte;
}
int main() {
POINT p;
int x;
int y;
int z;
while(0==0){
GetCursorPos(&p);
x = p.x;
y = p.y;
HDC hDC;
hDC = GetDC(NULL);
cin >> z;
cout << GetPixel(hDC, x, y) << endl;
Sleep(z);
ReleaseDC(NULL, hDC);
char data = GetPixel(hDC, x, y);
if (writebyte(&data))
cout <<" DATA SENT.. " << (int)data<< "\n";
}
}
In the part that sends data over the serial port, instead of sending the value returned by GetPixel(hDC, x, y), the program only sends the value "-1". I thought this was because char only holds small integers while GetPixel returns a very large number. I tried changing the type to long int, but I still get the same result: only "-1" is sent. I think the solution might be to convert between long int and char before sending the data, but I don't know how. Can someone help me?
Why do you use hDC after releasing it?
ReleaseDC(NULL, hDC);
char data = GetPixel(hDC, x, y);
GetPixel will return -1 (CLR_INVALID) in case of an error (see MSDN).
And, by the way, a COLORREF is not a char, so you lose Information when storing the return value of GetPixel in char data. You should store the complete COLORREF and send/receive all of it's bytes (so send/receive sizeof(COLORREF) bytes).
Also be aware of byte order. If you are transferring multi byte data between two machines then you must assure that both agree on the order of the bytes. If for example one machine is little endian and the other big endian, then they store COLORREF with different byte order in memory. One is storing the COLORREF 0x00BBGGRR in memory as { 0, 0xbb, 0xgg, 0xrr } whereas the other is storing it as { 0xrr, 0xgg, 0xbb, 0 }. So you need to define a transmit byte order which both sides use independant of their host byte order. If you don't want to invent the wheel new, you can take a look at network byte order and reuse that. Socket API gives you some functions like ntohl and htonl which help you in converting from host byte order to network byte order and vice versa.

Resources