Compile error: Kernel module

Compile error: Kernel module - makefile

I am a newbie at kernel programming & I wish to run this kernel module (posted below)... and i ran the makefile (posted below) for that, but I am getting the following errors: Can someone please help me understand how to overcome this:
The kernel program should run error free as it is taken from Intel's implementation:
obj-m += hello-1.o
all:
make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
clean:
make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
This is the error:
snehil#ubuntu:~/Desktop/measure$ make
make -C /lib/modules/3.0.0-12-generic/build M=/home/snehil/Desktop/measure modules
make[1]: Entering directory `/usr/src/linux-headers-3.0.0-12-generic'
CC [M] /home/snehil/Desktop/measure/measure1.o
/home/snehil/Desktop/measure/measure1.c: In function ‘hello_start’:
/home/snehil/Desktop/measure/measure1.c:108:2: error: implicit declaration of function
‘kmalloc’ [-Werror=implicit-function-declaration]
/home/snehil/Desktop/measure/measure1.c:108:8: warning: assignment makes pointer from
integer without a cast [enabled by default]
/home/snehil/Desktop/measure/measure1.c:115:11: warning: assignment makes pointer from
integer without a cast [enabled by default]
/home/snehil/Desktop/measure/measure1.c:124:12: warning: assignment makes pointer from
integer without a cast [enabled by default]
/home/snehil/Desktop/measure/measure1.c:130:13: warning: assignment makes pointer from
integer without a cast [enabled by default]
cc1: some warnings being treated as errors
make[2]: *** [/home/snehil/Desktop/measure/measure1.o] Error 1
make[1]: *** [_module_/home/snehil/Desktop/measure] Error 2
make[1]: Leaving directory `/usr/src/linux-headers-3.0.0-12-generic'
make: *** [all] Error 2
snehil#ubuntu:~/Desktop/measure$ gcc measure1
gcc: error: measure1: No such file or directory
gcc: fatal error: no input files
compilation terminated.
This is the kernel module code:
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/hardirq.h>
#include <linux/preempt.h>
#include <linux/sched.h>
#define SIZE_OF_STAT 100000
#define BOUND_OF_LOOP 1000
#define UINT64_MAX (18446744073709551615ULL)
void inline Filltimes(uint64_t **times) {
unsigned long flags;
int i, j;
uint64_t start, end;
unsigned cycles_low, cycles_high, cycles_low1, cycles_high1;
volatile int variable = 0;
asm volatile ("CPUID\n\t"
"RDTSC\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low)::"%rax", "%rbx", "%rcx",
"%rdx");
asm volatile ("CPUID\n\t"
"RDTSC\n\t"
"CPUID\n\t"
"RDTSC\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx",
"%rdx");
asm volatile ("CPUID\n\t"
"RDTSC\n\t"::: "%rax", "%rbx", "%rcx", "%rdx");
for (j=0; j<BOUND_OF_LOOP; j++) {
for (i =0; i<SIZE_OF_STAT; i++) {
variable = 0;
preempt_disable();
raw_local_irq_save(flags);
asm volatile (
"CPUID\n\t"
"RDTSC\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx",
"%rdx");
/*call the function to measure here*/
asm volatile(
"CPUID\n\t"
"RDTSC\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx",
"%rdx");
raw_local_irq_restore(flags);
preempt_enable();
start = ( ((uint64_t)cycles_high << 32) | cycles_low );
end = ( ((uint64_t)cycles_high1 << 32) | cycles_low1 );
if ( (end - start) < 0) {
printk(KERN_ERR "\n\n>>>>>>>>>>>>>> CRITICAL ERROR IN TAKING THE TIME!!!!!!\n loop(%d)
stat(%d) start = %llu, end = %llu, variable = %u\n", j, i, start, end, variable);
times[j][i] = 0;
}
else
{
times[j][i] = end - start;
}
}
}
return;
}
uint64_t var_calc(uint64_t *inputs, int size)
{
int i;
uint64_t acc = 0, previous = 0, temp_var = 0;
for (i=0; i< size; i++) {
if (acc < previous) goto overflow;
previous = acc;
acc += inputs[i];
}
acc = acc * acc;
if (acc < previous) goto overflow;
previous = 0;
for (i=0; i< size; i++){
if (temp_var < previous) goto overflow;
previous = temp_var;
temp_var+= (inputs[i]*inputs[i]);
}
temp_var = temp_var * size;
if (temp_var < previous) goto overflow;
temp_var =(temp_var - acc)/(((uint64_t)(size))*((uint64_t)(size)));
return (temp_var);
overflow:
printk(KERN_ERR "\n\n>>>>>>>>>>>>>> CRITICAL OVERFLOW ERROR IN var_calc!!!!!!\n\n");
return -EINVAL;
}
static int __init hello_start(void)
{
int i = 0, j = 0, spurious = 0, k =0;
uint64_t **times;
uint64_t *variances;
uint64_t *min_values;
uint64_t max_dev = 0, min_time = 0, max_time = 0, prev_min =0, tot_var=0,
max_dev_all=0, var_of_vars=0, var_of_mins=0;
printk(KERN_INFO "Loading hello module...\n");
times = kmalloc(BOUND_OF_LOOP*sizeof(uint64_t*), GFP_KERNEL);
if (!times) {
printk(KERN_ERR "unable to allocate memory for times\n");
return 0;
}
for (j=0; j<BOUND_OF_LOOP; j++) {
times[j] = kmalloc(SIZE_OF_STAT*sizeof(uint64_t), GFP_KERNEL);
if (!times[j]) {
printk(KERN_ERR "unable to allocate memory for times[%d]\n", j);
for (k=0; k<j; k++)
kfree(times[k]);
return 0;
}
}
variances = kmalloc(BOUND_OF_LOOP*sizeof(uint64_t), GFP_KERNEL);
if (!variances) {
printk(KERN_ERR "unable to allocate memory for variances\n");
return 0;
}
min_values = kmalloc(BOUND_OF_LOOP*sizeof(uint64_t), GFP_KERNEL);
if (!min_values) {
printk(KERN_ERR "unable to allocate memory for min_values\n");
return 0;
}
Filltimes(times);
for (j=0; j<BOUND_OF_LOOP; j++) {
max_dev = 0;
min_time = 0;
max_time = 0;
for (i =0; i<SIZE_OF_STAT; i++) {
if ((min_time == 0)||(min_time > times[j][i]))
min_time = times[j][i];
if (max_time < times[j][i])
max_time = times[j][i];
}
max_dev = max_time - min_time;
min_values[j] = min_time;
if ((prev_min != 0) && (prev_min > min_time))
spurious++;
if (max_dev > max_dev_all)
max_dev_all = max_dev;
variances[j] = var_calc(times[j], SIZE_OF_STAT);
tot_var += variances[j];
printk(KERN_ERR "loop_size:%d >>>> variance(cycles): %llu; max_deviation: %llu ;min
time: %llu", j, variances[j], max_dev, min_time);
prev_min = min_time;
}
var_of_vars = var_calc(variances, BOUND_OF_LOOP);
var_of_mins = var_calc(min_values, BOUND_OF_LOOP);
printk(KERN_ERR "\n total number of spurious min values = %d", spurious);
printk(KERN_ERR "\n total variance = %llu", (tot_var/BOUND_OF_LOOP));
printk(KERN_ERR "\n absolute max deviation = %llu", max_dev_all);
printk(KERN_ERR "\n variance of variances = %llu", var_of_vars);
printk(KERN_ERR "\n variance of minimum values = %llu", var_of_mins);
for (j=0; j<BOUND_OF_LOOP; j++) {
kfree(times[j]);
}
kfree(times);
kfree(variances);
kfree(min_values);
return 0;
}
static void __exit hello_end(void)
{
printk(KERN_INFO "Goodbye Mr.\n");
}
module_init(hello_start);
module_exit(hello_end);

If you are using kmalloc or kzalloc() for memory allocation
you have to include #include<linux/slab.h>.
They are called as slab allocators, these slab are chunks i.e. "cache" present
in RAM and are physically contiguous. These slab allocator use underlying
"Buddy System Algorithm", buddy allocator to provide more fine-grained allocation.
Fore more referrence go through the below link:
http://en.wikipedia.org/wiki/Slab_allocation
http://en.wikipedia.org/wiki/Buddy_algorithm
Hope this answers your question!!!!!.

You haven't included the header for kmalloc. Add #include <linux/slab.h> to your code.

Related

Compiling LAPACKe library using C in MacOS

I need to use LAPACKe functions in a code that should run on Linux and macOS, but the problem is under OsX. I have a MacBook Pro 2021, with an M1 Pro processor, running an OsX 12.6.2
I wrote an example code
#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__
#include <Accelerate/Accelerate.h>
#else
#include <lapacke.h>
#endif
const int N= 3, NRHS=2, LDA=N, LDB=N;
const int NN=5, NRHS2=3;
int main(int argc, char** argv) {
/******* Example of using the lapack library with
the C interface for solving linear systems
for a general full matrix **************/
int ipiv[N], info;
// a[LDA*N]
double a[] = {
6.80, -2.11, 5.66,
-6.05, -3.30, 5.36,
-0.45, 2.58, -2.70
};
// b[LDB*NRHS]
double b[] = {
4.02, 6.19, -8.22,
-1.56, 4.00, -8.67
};
printf("\nTest of using LAPACKe Library\n");
printf("Matrix A : %d by %d\n", N, N);
for(int i=0; i<N; i++){
int s = i;
for(int j=0; j<N; j++, s+=LDA) printf(" % 6.2lf", a[s]);
printf("\n");
}
printf("\nRight hand side %d vectors of %d\n", NRHS, N);
for(int i=0; i<N; i++){
int s = i;
for(int j=0; j<NRHS; j++, s+=LDA) printf(" % 10.6lf", b[s]);
printf("\n");
}
/** As long the LAPACK_COL_MAJOR is used, the matrix is
filled up by columns, pay attention in the way is printed **/
info = LAPACKE_dgesv( LAPACK_COL_MAJOR, N, NRHS, a, LDA, ipiv, b, LDB);
if (info == 0) {
printf("\nFactorization of LU : %d by %d\n", N, N);
for(int i=0; i<N; i++){
int s = i;
for(int j=0; j<N; j++, s+=LDA) printf(" % 10.5lf", a[s]);
printf("\n");
}
printf("\nSolution of %dright hand side vectors of %d\n", NRHS, N);
for(int i=0; i<N; i++){
int s = i;
for(int j=0; j<NRHS; j++, s+=LDA) printf(" % 10.6lf", b[s]);
printf("\n");
}
printf("\nFactorization pivot indices by %d\n", N);
for(int i=0; i<N; i++) printf(" % 5.0d", ipiv[i]);
printf("\n\n");
} // End of if (info == 0)
else
printf("An error ocurred in the LAPACK lib dgesv, with code %d\n", info);
/******* Example of using the lapack library with
the C interface for solving linear systems
for a trigiagonal matrix **************/
int ldb = NN;
// d1[NN-1] lower diagonal
double dl[] = {1, 4, 4, 1};
// d[NN] main diagonal
double d[] = {-2, -2, -2, -2, -2};
// du[NN-1] upper diagonal
double du[] = {1, 4, 4, 1};
// bb[NN*NRHS2] number of righ hand side vectors
double bb[] = {
3., 5., 5., 5., 3.,
-1.56, 4., -8.67, 1.75, 2.86,
9.81, -4.09, -4.57, -8.61, 8.99
};
info = LAPACKE_dgtsv(LAPACK_COL_MAJOR, NN, NRHS2, dl, d, du, bb, ldb);
if (info == 0) {
printf("\nTest of using LAPACKe Library for Tridiagonal systems\n");
printf("\nSolution of %dright hand side vectors of %d\n", NRHS2, NN);
for(int i=0; i<NN; i++){
int s = i;
for(int j=0; j<NRHS2; j++, s+=ldb) printf(" % 10.6lf", bb[s]);
printf("\n");
}
} // End of if (info == 0)
else
printf("An error ocurred in the LAPACK lib dgesv, with code %d\n", info);
return 0;
}
This runs in a WSL (Windows Subsystem Linux) and compiles successfully making the command
gcc example2.c -o lapack -llapacke
But, when I try to compile it in my Mac using the line
gcc example2.c -o lapacke -framework Accelerate
I receive the following error:
example2.c:47:9: error: implicit declaration of function 'LAPACKE_dgesv' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
info = LAPACKE_dgesv( LAPACK_COL_MAJOR, N, NRHS, a, LDA, ipiv, b, LDB);
^
example2.c:47:24: error: use of undeclared identifier 'LAPACK_COL_MAJOR'
info = LAPACKE_dgesv( LAPACK_COL_MAJOR, N, NRHS, a, LDA, ipiv, b, LDB);
^
example2.c:91:9: error: implicit declaration of function 'LAPACKE_dgtsv' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
info = LAPACKE_dgtsv(LAPACK_COL_MAJOR, NN, NRHS2, dl, d, du, bb, ldb);
^
example2.c:91:9: note: did you mean 'LAPACKE_dgesv'?
example2.c:47:9: note: 'LAPACKE_dgesv' declared here
info = LAPACKE_dgesv( LAPACK_COL_MAJOR, N, NRHS, a, LDA, ipiv, b, LDB);
^
example2.c:91:23: error: use of undeclared identifier 'LAPACK_COL_MAJOR'
info = LAPACKE_dgtsv(LAPACK_COL_MAJOR, NN, NRHS2, dl, d, du, bb, ldb);
^
4 errors generated.
It looks like inside the Accelerate.h the headers of LAPACKe functions are not defined. I was looking around but I don't see anything that can help me.
BTW, I have another example using cblas and it runs smoothly

Why does L2 hardware prefetcher perform worse with only 1 KiB or 2 KiB access size?

I have a simple multi-threaded program where the thread performs random reads on a given file (in memory) divided evenly amongst the threads. The thread reads from the file to buffer and sets a value. This is really a program designed to test memory bandwidth. This is the following program,
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <getopt.h>
#include <errno.h>
#include <stdbool.h>
#include <ctype.h>
#include <inttypes.h>
#include <pthread.h>
#include <assert.h>
#include <time.h>
#define NS_IN_SECOND 1000000000
uint64_t nano_time(void) {
struct timespec ts;
if( clock_gettime(CLOCK_REALTIME, &ts) == 0)
return ts.tv_sec * NS_IN_SECOND + ts.tv_nsec;
}
// avx512 test
#include <stdint.h>
void *__memmove_chk_avx512_no_vzeroupper(void *dest, void *src, size_t s);
/**
* To create 4 GB file: This will allocate space on disk
* $ dd < /dev/zero bs=1048576 count=4096 > testfile
*
* 100 GiB
* dd if=/dev/zero of=bigmmaptest bs=1M count=102400
* To clear cache:
* $ sync; echo 1 > /proc/sys/vm/drop_caches
*/
//#define SAMPLE_LATENCY 1
#define BYTES_IN_GB (1024*1024*1024)
// Block sized will be used for read and the same will be used for striding
// when iterating over a file in mmap.
#define DEFAULT_BLOCK_SIZE 4096 //8192
#define NANOSECONDS_IN_SECOND 1000000000
const char DEFAULT_NAME[] = "/mnt/tmp/mmaptest";
#define EXIT_MSG(...) \
do { \
printf(__VA_ARGS__); \
_exit(-1); \
} while (0)
uint64_t read_mmap_test(int fd, int tid, size_t block_size, size_t filesize, char* buf,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t write_mmap_test(int fd, int tid, size_t block_size, size_t filesize, char* buf,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t mmap_test(int fd, int tid, size_t block_size, size_t filesize, char *buf,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t read_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t write_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t syscall_test(int fd, int tid, size_t block_size, size_t filesize,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end);
size_t get_filesize(const char* filename);
void print_help_message(const char *progname);
char* map_buffer(int fd, size_t size);
void *run_tests(void *);
static int silent = 0;
typedef struct {
int tid;
int fd;
char *mapped_buffer;
int read_mmap;
int read_syscall;
int write_mmap;
int write_syscall;
off_t *offsets;
size_t block_size;
size_t chunk_size;
int retval;
uint64_t start_time;
uint64_t end_time;
} threadargs_t;
size_t filesize;
int main(int argc, char **argv) {
char *fname = (char*) DEFAULT_NAME;
char *mapped_buffer = NULL;
int c, fd, i, flags = O_RDWR, numthreads = 1, ret, option_index;
static int randomaccess = 0,
read_mmap = 0, read_syscall = 0,
write_mmap = 0, write_syscall = 0,
mixed_mmap = 0, write_tr = 0;
off_t *offsets = 0;
size_t block_size = DEFAULT_BLOCK_SIZE, numblocks,
new_file_size = 0;
uint64_t min_start_time, max_end_time = 0, retval;
// permissions
uint64_t mode = S_IRWXU | S_IRWXG;
pthread_t *threads;
threadargs_t *threadargs;
static struct option long_options[] =
{
// Options set a flag
{"randomaccess", no_argument, &randomaccess, 1},
{"readmmap", no_argument, &read_mmap, 1},
{"readsyscall", no_argument, &read_syscall, 1},
{"silent", no_argument, &silent, 1},
{"writemmap", no_argument, &write_mmap, 1},
{"writesyscall", no_argument, &write_syscall, 1},
{"mixedmmap", no_argument, &mixed_mmap, 1},
// Options take an argument
{"block", required_argument, 0, 'b'},
{"file", required_argument, 0, 'f'},
{"help", no_argument, 0, 'h'},
{"size", no_argument, 0, 's'},
{"threads", required_argument, 0, 't'},
{"writethreads", no_argument, 0, 'w'},
{0, 0, 0, 0}
};
//read operations
while(1) {
c = getopt_long(argc, argv, "b:f:h:s:t:w:",
long_options, &option_index);
// is end of the option
if (c == -1)
break;
switch(c)
{
case 0:
break;
case 'b':
block_size = atoi(optarg);
break;
case 'f':
fname = optarg;
break;
case 'h':
print_help_message(argv[0]);
_exit(0);
case 's':
new_file_size = (size_t)(atoi(optarg)) * BYTES_IN_GB;
break;
case 't':
numthreads = (int) (atoi(optarg));
break;
case 'w':
write_tr = atoi(optarg);
break;
default:
break;
}
}
if(!silent){
printf("PID: %d\n", getpid());
printf("Using file %s \n", fname);
}
if ((filesize = get_filesize(fname)) == -1) {
if (read_mmap || read_syscall) {
printf("Cannot obtain file size for %s: %s"
"File must exist prior to running read tests.\n",
fname, strerror(errno));
_exit(-1);
}
else
filesize = new_file_size;
}
fd = open((const char*)fname, flags, mode);
if(fd <0) {
printf("Clould not open/create file %s: %s\n",
fname, strerror(errno));
_exit(-1);
}
if(block_size < 0 || block_size > filesize){
printf("Invalid block size: %zu for file of size "
"%zu. Block size must be greater than 0 and no"
"greater than the file size.\n",
block_size, filesize);
_exit(-1);
}
/*
* Generate random block number for random file access.
* Sequential for sequential access
*/
numblocks = filesize/block_size;
if(filesize % block_size > 0)
numblocks++;
offsets = (off_t *) malloc(numblocks * sizeof(off_t));
if(offsets == 0){
printf("Failed to allocate memory: %s\n", strerror(errno));
_exit(-1);
}
for (uint64_t i = 0; i < numblocks; i++)
if(randomaccess)
offsets[i] = ((int)random() % numblocks) * block_size;
else
offsets[i] = i*block_size;
if (numblocks % numthreads != 0)
EXIT_MSG("We have %" PRIu64 " blocks and %d threads. "
"Threads must evenly divide blocks. "
"Please fix the args.\n",
(uint_least64_t)numblocks, numthreads);
if( read_mmap || write_mmap || mixed_mmap)
assert((mapped_buffer = map_buffer(fd, filesize)) != NULL);
threads = (pthread_t*)malloc(numthreads * sizeof(pthread_t));
threadargs =
(threadargs_t*)malloc(numthreads * sizeof(threadargs_t));
if (threads == NULL || threadargs == NULL)
EXIT_MSG("Could not allocate thread array for %d threads.\n", numthreads);
for (i = 0; i < numthreads; i++) {
if(mixed_mmap){
if (i < write_tr) {
write_mmap = 1;
} else {
read_mmap = 1;
}
}
threadargs[i].fd = fd;
threadargs[i].tid = i;
threadargs[i].block_size = block_size;
threadargs[i].chunk_size = filesize/numthreads;
threadargs[i].mapped_buffer = mapped_buffer;
threadargs[i].offsets = &offsets[numblocks/numthreads * i];
threadargs[i].read_mmap = read_mmap;
threadargs[i].read_syscall = read_syscall;
threadargs[i].write_mmap = write_mmap;
threadargs[i].write_syscall = write_syscall;
int ret = pthread_create(&threads[i], NULL, run_tests, &threadargs[i]);
if (ret!=0)
EXIT_MSG("pthread_create for %dth thread failed: %s\n",
i, strerror(errno));
}
for (i = 0; i< numthreads; i++){
ret = pthread_join(threads[i], NULL);
if (ret !=0)
EXIT_MSG("Thread %d failed in join: %s\n",
i, strerror(errno));
}
// for mixed mode determine read and write aggregate b/w.
if(mixed_mmap) {
// Write b/w
min_start_time = threadargs[0].start_time;
max_end_time = 0;
// Since tid 0 to write_tr-1 did writes, find it's min and max.
for(i=0; i < write_tr; i++){
min_start_time = (threadargs[i].start_time < min_start_time)?
threadargs[i].start_time:min_start_time;
max_end_time = (threadargs[i].end_time > max_end_time)?
threadargs[i].end_time:max_end_time;
}
printf("Write: %.2f\n",
(double)write_tr*(filesize/numthreads)/(double)(max_end_time-min_start_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
// Read b/w
min_start_time = threadargs[write_tr].start_time;
max_end_time = 0;
for(i=write_tr; i < numthreads; i++){
min_start_time = (threadargs[i].start_time < min_start_time)?
threadargs[i].start_time:min_start_time;
max_end_time = (threadargs[i].end_time > max_end_time)?
threadargs[i].end_time:max_end_time;
}
printf("Read: %.2f\n",
(double)(numthreads-write_tr)*(filesize/numthreads)/(double)(max_end_time-min_start_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
}
/**
* For total run time. Find the smallest start time
* and largest end time across all threads.
*/
min_start_time = threadargs[0].start_time;
max_end_time = 0;
for (i=0; i< numthreads; i++){
min_start_time = (threadargs[i].start_time < min_start_time)?
threadargs[i].start_time:min_start_time;
max_end_time = (threadargs[i].end_time > max_end_time)?
threadargs[i].end_time:max_end_time;
}
printf("%.2f\n",
(double)filesize/(double)(max_end_time-min_start_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
munmap(mapped_buffer, filesize);
close(fd);
}
void * run_tests(void *args) {
uint64_t retval;
threadargs_t t = *(threadargs_t*)args;
if(t.read_mmap) {
if(!silent)
printf("Running read mmap test:\n");
retval = read_mmap_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.mapped_buffer, t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
else if(t.read_syscall) {
if(!silent)
printf("Running read syscall test:\n");
retval = read_syscall_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
else if(t.write_mmap) {
if(!silent)
printf("Running write mmap test:\n");
retval = write_mmap_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.mapped_buffer, t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
else if(t.write_syscall) {
if(!silent)
printf("Running write syscall test:\n");
retval = write_syscall_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
return (void*) 0;
}
#define READ 1
#define WRITE 2
/**
********* SYSCALL section
*/
uint64_t read_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end) {
return syscall_test(fd, tid, block_size, filesize, READ, offsets,
begin, end);
}
uint64_t write_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end) {
return syscall_test(fd, tid, block_size, filesize, WRITE, offsets,
begin, end);
}
uint64_t syscall_test(int fd, int tid, size_t block_size, size_t filesize,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end) {
bool done = false;
char * buffer = NULL;
int i = 0;
size_t total_bytes_transferred = 0;
uint64_t begin_time, end_time, ret_token = 0;
buffer = (char*)malloc(block_size);
if(buffer == NULL) {
printf("Failed to allocate memory: %s\n", strerror(errno));
return -1;
}
memset((void*)buffer, 0, block_size);
begin_time= nano_time();
while(!done) {
size_t bytes_transferred = 0;
if(optype == READ)
bytes_transferred = pread(fd, buffer, block_size, offsets[i++]);
else if (optype == WRITE)
bytes_transferred = pwrite(fd, buffer, block_size, offsets[i++]);
if (bytes_transferred == 0)
done = true;
else if(bytes_transferred == -1){
printf("Failed to IO: %s\n", strerror(errno));
return -1;
}
else {
total_bytes_transferred += bytes_transferred;
if (optype == WRITE && total_bytes_transferred == filesize)
done = true;
// Do random operation
ret_token += buffer[0];
}
if (i*block_size >= filesize)
done = true;
}
end_time = nano_time();
if(!silent){
printf("%s: %" PRIu64 " bytes transferred in %" PRIu64 ""
" ns.\n", (optype == READ)?"read-syscall":"write-syscall",
(uint_least64_t)total_bytes_transferred, (end_time-begin_time));
// Throughput in GB/s
printf("(tid %d) %.2f\n", tid,
(double)filesize/(double)(end_time-begin_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
}
*begin = begin_time;
*end = end_time;
return ret_token;
}
/**
* MMAP tests
*/
uint64_t read_mmap_test(int fd, int tid, size_t block_size, size_t filesize,
char *buf, off_t *offsets, uint64_t *begin, uint64_t *end) {
return mmap_test(fd, tid, block_size, filesize, buf, READ, offsets, begin, end);
}
uint64_t write_mmap_test(int fd, int tid, size_t block_size, size_t filesize,
char *buf, off_t *offsets, uint64_t *begin, uint64_t *end){
return mmap_test(fd, tid, block_size, filesize, buf, WRITE, offsets, begin, end);
}
// Add memory addr
#if SAMPLE_LATENCY
#define BEGIN_LAT_SAMPLE \
if (num_samples < MAX_LAT_SAMPLES && i%LAT_SAMPL_INTERVAL == 0) \
lat_begin_time = nano_time();
#define END_LAT_SAMPLE \
if (num_samples < MAX_LAT_SAMPLES && i%LAT_SAMPL_INTERVAL == 0) { \
lat_end_time = nano_time(); \
latency_samples[i/LAT_SAMPL_INTERVAL % MAX_LAT_SAMPLES] = \
lat_end_time - lat_begin_time; \
num_samples++; \
}
#define MAX_LAT_SAMPLES 50
//#define LAT_SAMPL_INTERVAL (1000*1048576)
#define LAT_SAMPL_INTERVAL block_size
#else
#define BEGIN_LAT_SAMPLE ;
#define END_LAT_SAMPLE
#endif
uint64_t mmap_test(int fd, int tid, size_t block_size, size_t filesize, char *mapped_buffer,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end) {
bool done = false;
char *buffer = NULL;
uint64_t i, j, numblocks, ret;
uint64_t begin_time, end_time, ret_token = 0;
#if SAMPLE_LATENCY
uint64_t lat_begin_time, lat_end_time;
size_t latency_samples[MAX_LAT_SAMPLES];
int num_samples = 0;
memset((void*)latency_samples, 0, sizeof(latency_samples));
#endif
buffer = (char*)malloc(block_size);
if(buffer == NULL) {
printf("Failed to allocate memory: %s\n", strerror(errno));
return -1;
}
memset((void*)buffer, 1, block_size);
begin_time = nano_time();
for(i=0; i<filesize; i+=block_size){
off_t offset = offsets[i/block_size];
BEGIN_LAT_SAMPLE;
if(optype == READ) {
//__memmove_chk_avx512_no_vzeroupper(buffer, &mapped_buffer[offset], block_size);
memcpy(buffer, &mapped_buffer[offset], block_size);
ret_token += buffer[0];
}
else if (optype == WRITE) {
//__memmove_chk_avx512_no_vzeroupper(&mapped_buffer[offset], buffer, block_size);
memcpy(&mapped_buffer[offset], buffer, block_size);
ret_token += mapped_buffer[i];
}
END_LAT_SAMPLE;
}
end_time = nano_time();
if(!silent) {
printf("%s: %" PRIu64 " bytes read in %" PRIu64 " ns.\n",
(optype==READ)?"readmap":"writemap",
(uint_least64_t)filesize, (end_time-begin_time));
// print GB/s
printf("(tid %d) %.2f\n", tid,
(double)filesize/(double)(end_time-begin_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
}
*begin = begin_time;
*end = end_time;
#if SAMPLE_LATENCY
printf("\nSample latency for %ld byte block:\n", block_size);
for (i = 0; i < MAX_LAT_SAMPLES; i++)
printf("\t%ld: %ld\n", i, latency_samples[i]);
#endif
return ret_token;
}
char* map_buffer(int fd, size_t size) {
char *mapped_buffer = NULL;
// Populate
mapped_buffer = (char*)mmap(NULL, size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_POPULATE, fd, 0);
// Shared
// mapped_buffer = (char*)mmap(NULL, size, PROT_READ | PROT_WRITE,
// MAP_SHARED, fd, 0);
// Anon test
// mapped_buffer = (char*)mmap(NULL, size, PROT_READ | PROT_WRITE,
// MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if(mapped_buffer == MAP_FAILED)
EXIT_MSG("Failed to mmap file of size %zu: %s\n",
size, strerror(errno));
// Might also need to gurantee page aligned - posix_memalign()
// int mret = madvise(mapped_buffer, filesize, MADV_HUGEPAGE);
// if(mret!=0) {
// fprintf(stderr, "failed madvise: %s\n", strerror(errno));
// }
return mapped_buffer;
}
size_t get_filesize(const char* filename){
int retval;
struct stat st;
retval = stat(filename, &st);
if(retval)
return -1;
else
return st.st_size;
}
void print_help_message(const char *progname) {
/* take only the last portion of the path */
const char *basename = strrchr(progname, '/');
basename = basename ? basename + 1 : progname;
printf("usage: %s [OPTION]\n", basename);
printf(" -h, --help\n"
" Print this help and exit.\n");
printf(" -b, --block[=BLOCKSIZE]\n"
" Block size used for read system calls.\n"
" For mmap tests, the size of the stride when iterating\n"
" over the file.\n"
" Defaults to %d.\n", DEFAULT_BLOCK_SIZE);
printf(" -f, --file[=FILENAME]\n"
" Perform all tests on this file (defaults to %s).\n",
DEFAULT_NAME);
printf(" --readsyscall\n"
" Perform a read test using system calls.\n");
printf(" --readmmap\n"
" Perform a read test using mmap.\n");
printf(" --writesyscall\n"
" Perform a write test using system calls.\n");
printf(" --writemmap\n"
" Perform a write test using mmap.\n");
printf(" --randomaccess\n"
" Perform random access.\n");
printf(" --threads\n"
" Number of threads to use. Defaults to one.\n");
printf(" --mixedmmap\n"
" Perfom read and write concurrently at different offsets\n");
printf(" -w, -writethreads[=0]\n"
" Number of threads that should perform write\n");
}
To compile:
$ gcc testm.c -o testm -lpthread -static -O2 -fno-builtin-memcpy
Commands to run the program:
$ dd if=/dev/zero of=bigmmaptest bs=1M count=25600 # 25 GiB file
$ ./testm -b 1024 -f bigmmaptest --threads 16 --randomaccess --readmmap
I am on a 32 core Xeon 5218 2nd Gen. L1d KiB /L2 MiB /L3 MiB -- 512 / 16 / 22
When the memcpy size is 1 KiB I get 21.7 GB/s but when the size is 256B I get 26.68 GB/s and 34.8 GB/s when the size is 4 KiB. Why is there a drop in the middle?
I observe that 2 KiB also performs poorly when compared to 256B and 4 KiB.
What's more interesting is, when I disable the L2 hardware prefetcher and without any other changes my bandwidth automatically increases for 1 KiB and 2 KiB. Without prefetch 2 KiB memcpy gives 34.8 GB/s. All of these are aggregate bandwidth.
With perf, I did measure L2 load-store misses but they turned out to not change drastically. This effect is also not seen for 8 threads and below.
I am on linux 5.0.4. I am using the glibC memcpy (gcc 7.5.0) and even with -O2 I observe the above quirk. Where 1 KiB access size gives 18.76 GiB/s with L2 prefetch and without I get 30.32 GiB/s. For comparison, 256 B access size provides 24.7 GiB/s with prefetch and 24.8 GiB/s without. Clearly, the drop in performance is because of the L2 cache pollution caused by the prefetcher, as this is not observed with smaller thread counts. I was considering if SMT could be the reason for increased pollution but I observe the effect distinctly at 16 threads on 16 physical cores.
Skimming through glibc memcpy code, I can see that any access below the size of 4 KiB uses AVX 256 instructions, so there is nothing changing there.

The smaller 256B size not seeing a drop from the L2 streamer might be due to the sequence of cache misses being too short to activate the streamer and waste bandwidth (and slots in the LFBs and L2 <-> L3 superqueue) on requests that won't be useful.
For aligned 4k, there are no bytes within the same page that you're not fetching, so the L2 prefetcher is positively useful, or at least not harmful. (Demand loads come in pretty quickly for later lines when running memcpy so I'm guessing speeds were about the same with/without HW prefetch enabled, unless HW prefetch helps getting started on a new 4k chunk while still waiting for the end of the previous.)
The L2 only sees physical addresses, and AFAIK it doesn't try to prefetch across a 4k boundary. (Even if its within the same 2M hugepage, because it doesn't know that either.) The "next-page prefetcher" Intel mentions being new in Ivy Bridge is AFAIK just a TLB prefetch, not data.
So with aligned 4k memcpy, HW prefetch stops automatically at the end of the data you're actually going to read, not wasting any bandwidth. Since mmap gives you page-aligned memory, these 4k memcopies are from a single source page.
(The destination is irrelevant as it probably stays hot in L1d cache, with maybe an occasional eviction to L2, and the reload from it after memcpy can come from store-forwarding, not even having to wait for memcpy's store to commit to L1d.)
Prediction: If your smaller memcpy source starts part way into a 4k page, but still end at the end of a 4k page, you'd probably see similar behaviour to prefetch disabled. e.g. generate a random page number, and start at 3072 bytes into it, doing a 1 KiB copy. So all your 1 KiB copies come from the ends of pages, never middles.
(You'd still have more dTLB misses per byte memcpyed, because each TLB entry is only covering 1 K of the data you ever actually read. You did you use MAP_POPULATE so you shouldn't be seeing page faults in the timed region, assuming you have enough RAM.)
L1d KiB /L2 MiB /L3 MiB -- 512 / 16 / 22
Those are aggregate totals, but L1d and L2 are private per-core! You have 32kiB L1d and 1MiB L2 per core, because this is Cascade Lake, same layout as Skylake-X.
And BTW, I'd consider using a fast PRNG like xorshift+ or xorshift* inside the timing loop; that's easily random enough to defeat prefetching; even a simple LFSR or even LCG with a power-of-2 modulo would do that (and be very cheap, just an imul and add). It avoids having to read offsets from another array, if you really want to isolate just the memcpy memory accesses. Probably doesn't make a difference though. One advantage of a very simple PRNG with a period equal to the space you're trying to cover (like an LCG) is that you won't generate the same address twice, giving you a random permutation of the blocks. But with a big enough block of memory, random cache hits even from L3 are unlikely even without that hard-to-achieve property.
Your current array of offsets is fine. (I didn't look at the code super closely, so I'm just assuming there aren't bugs.)

How to know the values of CR registers from linux user and kernel modes

I would like to know the CR0-CR4 register values on x86. Can I write inline assembly to read it out? Are there any other methods? (e.g., does OS keep any file structures to record these values)

The Linux kernel has some function to read and write Control Registers, they are the read_crX and write_crX functions for the standard CR and xgetbv,xsetbv for the extended CR.
User mode applications need a LKM to indirectly use these functions.
In theory you just need to create a LKM with one or more devices and handle IO requests by reading or writing from CR. In practice you usually have more than one CPU, so you need to handle MP.
I used the kernel module for CPUID as a template and create this LKM.
CODE IS WITHOUT ANY WARRANTY, TESTED ON DEBIAN 8 ON 64 bit VM ONLY
#include <linux/module.h> /* Needed by all modules */
#include <linux/kernel.h> /* Needed for KERN_INFO */
#include <linux/fs.h> /* Needed for KERN_INFO */
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/smp.h>
#include <linux/major.h>
#include <linux/fs.h>
#include <linux/device.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/uaccess.h>
#include <linux/gfp.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/xcr.h>
#define MAKE_MINOR(cpu, reg) (cpu<<8 | reg)
#define GET_MINOR_REG(minor) (minor & 0xff)
#define GET_MINOR_CPU(minor) (minor >> 8)
#define XCR_MINOR_BASE 0x80
static int major_n = 0;
static struct class *ctrlreg_class;
struct ctrlreg_info
{
unsigned int reg;
unsigned long value;
unsigned int error;
};
static void ctrlreg_smp_do_read(void* p)
{
struct ctrlreg_info* info = p;
info->error = 0;
printk(KERN_INFO "ctrlreg: do read of reg%u\n", info->reg);
switch (info->reg)
{
case 0: info->value = read_cr0(); break;
case 2: info->value = read_cr2(); break;
case 3: info->value = read_cr3(); break;
case 4: info->value = read_cr4(); break;
#ifdef CONFIG_X86_64
case 8: info->value = read_cr8(); break;
#endif
case XCR_MINOR_BASE: info->value = xgetbv(0); break;
default:
info->error = -EINVAL;
}
}
static void ctrlreg_smp_do_write(void* p)
{
struct ctrlreg_info* info = p;
info->error = 0;
switch (info->reg)
{
case 0: write_cr0(info->value); break;
case 2: write_cr2(info->value); break;
case 3: write_cr3(info->value); break;
case 4: write_cr4(info->value); break;
#ifdef CONFIG_X86_64
case 8: read_cr8(); break;
#endif
case XCR_MINOR_BASE: xgetbv(0); break;
default:
info->error = -EINVAL;
}
}
static ssize_t ctrlreg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
unsigned int minor = iminor(file_inode(file));
unsigned int cpu = GET_MINOR_CPU(minor);
unsigned int reg = GET_MINOR_REG(minor);
struct ctrlreg_info info = {.reg = reg};
int err;
printk(KERN_INFO "ctrlreg: read for cpu%u reg%u\n", cpu, reg);
printk(KERN_INFO "ctrlreg: read of %zu bytes\n", count);
if (count < sizeof(unsigned long))
return -EINVAL;
printk(KERN_INFO "ctrlreg: scheduling read\n");
err = smp_call_function_single(cpu, ctrlreg_smp_do_read, &info, 1);
if (IS_ERR_VALUE(err))
return err;
printk(KERN_INFO "ctrlreg: read success: %x\n", info.error);
if (IS_ERR_VALUE(info.error))
return err;
err = copy_to_user(buf, &info.value, sizeof(unsigned long));
printk(KERN_INFO "ctrlreg: read copy result: %x ( %lu )\n", err, sizeof(unsigned long));
if (IS_ERR_VALUE(err))
return err;
printk(KERN_INFO "ctrlreg: read done\n");
return sizeof(unsigned long);
}
static ssize_t ctrlreg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
unsigned int minor = iminor(file_inode(file));
unsigned int cpu = GET_MINOR_CPU(minor);
unsigned int reg = GET_MINOR_REG(minor);
struct ctrlreg_info info = {.reg = reg};
int err;
printk(KERN_INFO "ctrlreg: write for cpu%u reg%u\n", cpu, reg);
printk(KERN_INFO "ctrlreg: write of %zu bytes\n", count);
if (count < sizeof(unsigned long))
return -EINVAL;
printk(KERN_INFO "ctrlreg: scheduling write\n");
err = copy_from_user((void*)buf, &info.value, sizeof(unsigned long));
printk(KERN_INFO "ctrlreg: write copy data: %x ( %lu )\n", err, sizeof(unsigned long));
if (IS_ERR_VALUE(err))
return err;
err = smp_call_function_single(cpu, ctrlreg_smp_do_write, &info, 1);
if (IS_ERR_VALUE(err))
return err;
printk(KERN_INFO "ctrlreg: write success: %x\n", info.error);
if (IS_ERR_VALUE(info.error))
return err;
printk(KERN_INFO "ctrlreg: write done\n");
return sizeof(unsigned long);
}
static void ctrlreg_can_open(void *p)
{
unsigned int* reg = p;
unsigned int reg_num = *reg;
unsigned int ebx, edx, eax, ecx;
unsigned int support_xgetbv, support_ia32e;
*reg = 0; //Success
printk(KERN_INFO "ctrlreg: can open reg %u\n", reg_num);
if (reg_num <= 4 && reg_num != 1)
return;
#ifdef CONFIG_X86_64
if (reg_num == 8)
return;
#endif
cpuid_count(0x0d, 1, &eax, &ebx, &ecx, &edx);
support_xgetbv = cpuid_ecx(1) & 0x04000000;
support_ia32e = cpuid_edx(0x80000001) & 0x20000000;
printk(KERN_INFO "ctrlreg: xgetbv = %d\n", support_xgetbv);
printk(KERN_INFO "ctrlreg: ia32e = %d\n", support_ia32e);
if (support_xgetbv && support_ia32e)
return;
printk(KERN_INFO "ctrlreg: open denied");
*reg = -EIO;
}
static int ctrlreg_open(struct inode *inode, struct file *file)
{
unsigned int cpu;
unsigned int reg;
unsigned int minor;
int err;
minor = iminor(file_inode(file));
cpu = GET_MINOR_CPU(minor);
reg = GET_MINOR_REG(minor);
printk(KERN_INFO "ctrlreg: open device for cpu%u reg%u\n", cpu, reg);
if (cpu >= nr_cpu_ids || !cpu_online(cpu))
return -ENXIO; /* No such CPU */
err = smp_call_function_single(cpu, ctrlreg_can_open, &reg, 1);
if (IS_ERR_VALUE(err))
return err;
return reg;
}
static const struct file_operations ctrlreg_fops =
{
.owner = THIS_MODULE,
.read = ctrlreg_read,
.write = ctrlreg_write,
.open = ctrlreg_open
};
static int ctrlreg_device_create(int cpu)
{
struct device *dev = NULL;
int i;
printk(KERN_INFO "ctrlreg: device create for cpu %d\n", cpu);
//CR0, 2-4, 8
for (i = 0; i <= 8; i++)
{
if ((i>4 && i<8) || i == 1)
continue; //Skip non existent regs
printk(KERN_INFO "ctrlreg: device cpu%dcr%d\n", cpu, i);
dev = device_create(ctrlreg_class, NULL, MKDEV(major_n, MAKE_MINOR(cpu, i)), NULL, "cpu%dcr%d", cpu, i);
if (IS_ERR(dev))
return PTR_ERR(dev);
}
//XCR0
for (i = 0; i <= 0; i++)
{
printk(KERN_INFO "ctrlreg: device cpu%dxcr%d\n", cpu, i);
dev = device_create(ctrlreg_class, NULL, MKDEV(major_n, MAKE_MINOR(cpu, (XCR_MINOR_BASE+i))), NULL, "cpu%dxcr%d", cpu, i);
if (IS_ERR(dev))
return PTR_ERR(dev);
}
return 0;
}
static void ctrlreg_device_destroy(int cpu)
{
int i;
//CR0, 2-4, 8
for (i = 0; i <= 8; i++)
{
if ((i>4 && i<8) || i == 1)
continue; //Skip non existent regs
device_destroy(ctrlreg_class, MKDEV(major_n, MAKE_MINOR(cpu, i)));
}
//XCR0
for (i = 0; i <= 0; i++)
device_destroy(ctrlreg_class, MKDEV(major_n, MAKE_MINOR(cpu, (XCR_MINOR_BASE+i))));
}
static int ctrlreg_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
int err = 0;
switch (action)
{
case CPU_UP_PREPARE:
err = ctrlreg_device_create(cpu);
break;
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
ctrlreg_device_destroy(cpu);
break;
}
return notifier_from_errno(err);
}
static struct notifier_block __refdata ctrlreg_class_cpu_notifier =
{
.notifier_call = ctrlreg_class_cpu_callback,
};
static char* ctrlreg_devnode(struct device *dev, umode_t *mode)
{
unsigned int minor = MINOR(dev->devt), cpu = GET_MINOR_CPU(minor), reg = GET_MINOR_REG(minor);
if (reg < XCR_MINOR_BASE)
return kasprintf(GFP_KERNEL, "crs/cpu%u/cr%u", cpu, reg);
else
return kasprintf(GFP_KERNEL, "crs/cpu%u/xcr%u", cpu, reg-XCR_MINOR_BASE);
}
int __init ctrlreg_init(void)
{
int err = 0, i = 0;
printk(KERN_INFO "ctrlreg: init\n");
if ((major_n = __register_chrdev(0, 0, NR_CPUS, "crs", &ctrlreg_fops)) < 0)
return major_n;
printk(KERN_INFO "ctrlreg: major number is %u\n", major_n);
ctrlreg_class = class_create(THIS_MODULE, "ctrlreg\n");
if (IS_ERR(ctrlreg_class))
{
err = PTR_ERR(ctrlreg_class);
goto out_chrdev;
}
printk(KERN_INFO "ctrlreg: class created\n");
ctrlreg_class->devnode = ctrlreg_devnode;
cpu_notifier_register_begin();
for_each_online_cpu(i)
{
err = ctrlreg_device_create(i);
if (IS_ERR_VALUE(err))
goto out_class;
}
__register_hotcpu_notifier(&ctrlreg_class_cpu_notifier);
cpu_notifier_register_done();
printk(KERN_INFO "ctrlreg: init success\n");
err = 0;
goto out;
out_class:
i = 0;
for_each_online_cpu(i)
{
ctrlreg_device_destroy(i);
}
cpu_notifier_register_done();
class_destroy(ctrlreg_class);
out_chrdev:
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "ctrlreg");
out:
return err;
}
static void __exit ctrlreg_exit(void)
{
int cpu = 0;
cpu_notifier_register_begin();
for_each_online_cpu(cpu)
ctrlreg_device_destroy(cpu);
class_destroy(ctrlreg_class);
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "ctrlreg");
__unregister_hotcpu_notifier(&ctrlreg_class_cpu_notifier);
cpu_notifier_register_done();
}
module_init(ctrlreg_init);
module_exit(ctrlreg_exit);
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Kee Nemesis 241");
MODULE_DESCRIPTION("Read and write Control Registers");
This module create the following dev nodes:
/dev/crs/cpu0/cr0
/dev/crs/cpu0/cr2
/dev/crs/cpu0/cr3
/dev/crs/cpu0/cr4
/dev/crs/cpu0/cr8
/dev/crs/cpu0/xcr0
/dev/crs/cpu1/cr0
/dev/crs/cpu1/cr2
/dev/crs/cpu1/cr3
/dev/crs/cpu1/cr4
/dev/crs/cpu1/cr8
/dev/crs/cpu1/xcr0
...
You can read/write these dev nodes. The minimum read/write length is 4 bytes on 32 bit system and 8 bytes on 64 bit ones (Linux do some buffering anyway).
To compile this LKM, save the code above as ctrlreg.c and create this Makefile
obj-m += ctrlreg.o
all:
make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
clean:
make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
then use make to get ctrlreg.ko.
To load the module use sudo insmod ctrlreg.ko, to remove it sudo rmmod ctrlreg.
I have also written a small user mode utility to read CR:
CODE IS WITHOUT ANY WARRANTY, TESTED ON DEBIAN 8 ON 64 bit VM ONLY
#include <stdio.h>
#include <stdlib.h>
#define MAX_PATH 256
int main(int argc, char* argv[])
{
unsigned long cpu, reg;
FILE* fin;
char device[MAX_PATH];
unsigned long data;
if (argc < 3 || argc > 4)
return fprintf(stderr, "Usage:\n\t\t cr cpu reg [value]\n"), 1;
if (sscanf(argv[1], "cpu%u", &cpu) != 1)
return fprintf(stderr, "Invalid value '%s' for cpu\n", argv[1]), 2;
if (sscanf(argv[2], "cr%u", &reg) != 1 && sscanf(argv[2], "xcr%u", &reg) != 1)
return fprintf(stderr, "Invalid value '%s' for reg\n", argv[2]), 3;
if (argc == 4 && sscanf(argv[3], "%lu", &data) != 1)
return fprintf(stderr, "Invalid numeric value '%s'\n", argv[3]), 6;
snprintf(device, MAX_PATH, "/dev/crs/cpu%u/%s", cpu, argv[2]);
fin = fopen(device, argc == 4 ? "wb" : "rb");
if (!fin)
return fprintf(stderr, "Cannot open device %s\n", device), 4;
if (argc == 4)
{
if (fwrite(&data, sizeof(data), 1, fin) != 1)
return fprintf(stderr, "Cannot write device %s (%d)\n", device, ferror(fin)), 5;
}
else
{
if (fread(&data, sizeof(data), 1, fin) != 1)
return fprintf(stderr, "Cannot read device %s (%d)\n", device, ferror(fin)), 7;
printf("%016x\n", data);
}
fclose(fin);
return 0;
}
Save the code as cr.c and compile it.
To read cr0 of the second CPU you can use:
cr cpu1 cr0
To write the value 0 (be careful) into it
cr cpu1 cr0 0

undefined cudaMalloc symbols

I'm trying to compile the cublas example from the CUDA documentation
//Example 2. Application Using C and CUBLAS: 0-based indexing
//-----------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#define M 6
#define N 5
#define IDX2C(i,j,ld) (((j)*(ld))+(i))
static __inline__ void modify (cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float beta){
cublasSscal (handle, n-p, &alpha, &m[IDX2C(p,q,ldm)], ldm);
cublasSscal (handle, ldm-p, &beta, &m[IDX2C(p,q,ldm)], 1);
}
int main (void){
cudaError_t cudaStat;
cublasStatus_t stat;
cublasHandle_t handle;
int i, j;
float* devPtrA;
float* a = 0;
a = (float *)malloc (M * N * sizeof (*a));
if (!a) {
printf ("host memory allocation failed");
return EXIT_FAILURE;
}
for (j = 0; j < N; j++) {
for (i = 0; i < M; i++) {
a[IDX2C(i,j,M)] = (float)(i * M + j + 1);
}
}
cudaStat = cudaMalloc ((void**)&devPtrA, M*N*sizeof(*a));
if (cudaStat != cudaSuccess) {
printf ("device memory allocation failed");
return EXIT_FAILURE;
}
stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("CUBLAS initialization failed\n");
return EXIT_FAILURE;
}
stat = cublasSetMatrix (M, N, sizeof(*a), a, M, devPtrA, M);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("data download failed");
cudaFree (devPtrA);
cublasDestroy(handle);
return EXIT_FAILURE;
}
modify (handle, devPtrA, M, N, 1, 2, 16.0f, 12.0f);
stat = cublasGetMatrix (M, N, sizeof(*a), devPtrA, M, a, M);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("data upload failed");
cudaFree (devPtrA);
cublasDestroy(handle);
return EXIT_FAILURE;
}
cudaFree (devPtrA);
cublasDestroy(handle);
for (j = 0; j < N; j++) {
for (i = 0; i < M; i++) {
printf ("%7.0f", a[IDX2C(i,j,M)]);
}
printf ("\n");
}
free(a);
return EXIT_SUCCESS;
}
I saved this file into "cudaexample.c" and am trying to compile with gcc cudaexample.c -I/usr/local/cuda/include -L/usr/local/cuda/lib -lcuda -lcublas
I get an undefined symbols error:
Undefined symbols for architecture x86_64:
"_cudaFree", referenced from:
_main in ccpPWjbO.o
"_cudaMalloc", referenced from:
_main in ccpPWjbO.o
ld: symbol(s) not found for architecture x86_64
collect2: error: ld returned 1 exit status
It seems like I've specified the commands properly as other symbols (e.g. cublasCreate) are found. Why are Free and Malloc not present?
Relevant details:
OSX: 10.10.2
gcc: 4.8.4 (target: x86_64-apple-darwin14)
Graphics: NVIDA GeForce GT 650M 1024 MB
I downloaded and installed the CUDA-6.5 toolkit

Those API functions (e.g. cudaMalloc) are contained in the CUDA runtime library. You are not linking against that library, so those symbols aren't found during the link phase.
Add -lcudart to your link flags:
-I/usr/local/cuda/include -L/usr/local/cuda/lib -lcuda -lcublas -lcudart
and it should fix that issue for you.
(-lcuda is only needed if you are using CUDA driver API functions. You can remove that if you wish.)

Warning: cast to/from pointer from/to integer of different size

I'm learning Pthreads. My code executes the way I want it to, I'm able to use it. But it gives me a warning on compilation.
I compile using:
gcc test.c -o test -pthread
with GCC 4.8.1. And I get the warning
test.c: In function ‘main’:
test.c:39:46: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
pthread_create(&(tid[i]), &attr, runner, (void *) i);
^
test.c: In function ‘runner’:
test.c:54:22: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
int threadnumber = (int) param;
^
This error comes for the following code:
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>
#define MAX_THREADS 10
int sum; /* this data is shared by the thread(s) */
void *runner(void * param);
int main(int argc, char *argv[])
{
int num_threads, i;
pthread_t tid[MAX_THREADS]; /* the thread identifiers */
pthread_attr_t attr; /* set of thread attributes */
if (argc != 2) {
fprintf(stderr, "usage: test <integer value>\n");
exit(EXIT_FAILURE);
}
if (atoi(argv[1]) <= 0) {
fprintf(stderr,"%d must be > 0\n", atoi(argv[1]));
exit(EXIT_FAILURE);
}
if (atoi(argv[1]) > MAX_THREADS) {
fprintf(stderr,"%d must be <= %d\n", atoi(argv[1]), MAX_THREADS);
exit(EXIT_FAILURE);
}
num_threads = atoi(argv[1]);
printf("The number of threads is %d\n", num_threads);
/* get the default attributes */
pthread_attr_init(&attr);
/* create the threads */
for (i=0; i<num_threads; i++) {
pthread_create(&(tid[i]), &attr, runner, (void *) i);
printf("Creating thread number %d, tid=%lu \n", i, tid[i]);
}
/* now wait for the threads to exit */
for (i=0; i<num_threads; i++) {
pthread_join(tid[i],NULL);
}
return 0;
}
/* The thread will begin control in this function */
void *runner(void * param)
{
int i;
int threadnumber = (int) param;
for (i=0; i<1000; i++) printf("Thread number=%d, i=%d\n", threadnumber, i);
pthread_exit(0);
}
How can I fix this warning?

A quick hacky fix might just to cast to long instead of int. On a lot of systems, sizeof(long) == sizeof(void *).
A better idea might be to use intptr_t.
int threadnumber = (intptr_t) param;
and
pthread_create(&(tid[i]), &attr, runner, (void *)(intptr_t)i);

pthread_create(&(tid[i]), &attr, runner, (void *) i);
You are passing the local variable i as an argument for runner, sizeof(void*) == 8 and sizeof(int) == 4 (64 bits).
If you want to pass i, you should wrap it as a pointer or something:
void *runner(void * param) {
int id = *((int*)param);
delete param;
}
int tid = new int; *tid = i;
pthread_create(&(tid[i]), &attr, runner, tid);
You may just want i, and in that case, the following should be safe (but far from recommended):
void *runner(void * param) {
int id = (int)param;
}
pthread_create(&(tid[i]), &attr, runner, (void*)(unsigned long long)(i));

I was also getting the same warning. So to resolve my warning I converted int to long and then this warning just vanished. And about the warning "cast to pointer from integer of different size" you can leave this warning because a pointer can hold the value of any variable because pointer in 64x is of 64 bit and in 32x is of 32 bit.

Try passing
pthread_create(&(tid[i]), &attr, runner, (void*)&i);

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

Compile error: Kernel module - makefile

You haven't included the header for kmalloc. Add #include <linux/slab.h> to your code.

Related

Compiling LAPACKe library using C in MacOS

Why does L2 hardware prefetcher perform worse with only 1 KiB or 2 KiB access size?

How to know the values of CR registers from linux user and kernel modes

undefined cudaMalloc symbols

Warning: cast to/from pointer from/to integer of different size

Categories

Resources