Using perf_event_open to monitor Docker containers - performance

I wrote a program in C to retrieve performance events such as cpu-cycles of Docker containers.
I mean, a user space program at host level (host level monitoring, not inside docker). I give pid of the docker container as pid entry of perf_event_open(), However, I have always 0 as returned value. I have tested the program for other non-docker pids for example firefox and it works very well.
I set PERF_FLAG_PID_CGROUP as flag, nothing changes!
Here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <asm/unistd.h>
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags)
{
int ret;
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
group_fd, flags);
return ret;
}
int
main(int argc, char **argv)
{
struct perf_event_attr pe;
long long count;
int fd;
fd = open("/sys/fs/cgroup/perf_event/docker/f42c13cd9dd700544fe670e30d0b3216bdceaf01ddc370405618fdecfd10b26d", O_RDONLY);
if (fd == -1)
return 0;
memset(&pe, 0, sizeof(struct perf_event_attr));
pe.type = PERF_TYPE_HARDWARE;
pe.size = sizeof(struct perf_event_attr);
pe.config = PERF_COUNT_HW_CPU_CYCLES;
pe.disabled = 1;
pe.exclude_kernel = 0;
pe.exclude_hv = 0;
fd = perf_event_open(&pe, fd, -1, -1, PERF_FLAG_PID_CGROUP);
if (fd == -1) {
fprintf(stderr, "Error opening leader %llx\n", pe.config);
exit(EXIT_FAILURE);
}
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
usleep(100);
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
read(fd, &count, sizeof(long long));
printf("Used %lld instructions\n", count);
close(fd);
}
According to the Man page of perf_event_open(), I also give the fd opened on the directory of docker container in groupfs. Doesn't work !
Would you please help me to solve the problem?
Thanks
Update:
I have checked with other events for example PERF_COUNT_HW_CACHE_REFERENCES,
I see 0 as returned value!
OS: Ubuntu 16.04
Kernel: 4.15.0-041500-generic
Architecture: X86_64

You have not specified which linux kernel version you are using. I will base my answer as per the latest linux kernel version.
The parameters that you passed to perf_event_open syscall look correct, except for one.
In your case, you pass cpu = -1 as a parameter to perf_event_open.
While this usually works in normal perf event filtering (i.e. per-cpu or per-thread), passing cpu = -1 will not work in cgroup based filtering for perf. In cgroup mode, the pid argument is used to pass the fd opened to the cgroup directory in cgroupfs (which you seemed to have passed correctly). The cpu argument designates the cpu on which to monitor threads from that cgroup. And when cpu=-1 it means the perf event measures the specified process/thread on any CPU (without regarding whether that CPU belongs to the cgroup you are measuring on).
This is how it is depicted in the latest Linux code.
if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
return -EINVAL;
You can clearly see that if either PID=-1 or CPU=-1, the syscall method will return error.
Working Example
From the perf_event_open documentation, it is quite clear that --
cgroup monitoring is available only for system-wide events and may therefore require extra permissions.
Since we are doing cgroup monitoring in this case, which is already understood as we are monitoring the perf-events of a container, we will have to monitor the event system-wide. This means the monitoring happens for ALL the available CPUs in the system.
Working code
I have 4 cores in my system and hence I use CPUs - 0,1,2,3
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <asm/unistd.h>
#include <errno.h>
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags)
{
int ret;
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
group_fd, flags);
return ret;
}
int
main(int argc, char **argv)
{
struct perf_event_attr pe;
long long count, count1, count2, count3;
int fd, fd1, fd2, fd3, fd4;
fd1 = open("/sys/fs/cgroup/perf_event/docker/001706b1a71617b0ce9d340f706d901e00ee398091dd62aded2a1863fc8c274a", O_RDONLY);
if (fd1 == -1)
return 0;
memset(&pe, 0, sizeof(struct perf_event_attr));
pe.type = PERF_TYPE_HARDWARE;
pe.size = sizeof(struct perf_event_attr);
pe.config = PERF_COUNT_HW_INSTRUCTIONS;
pe.disabled = 1;
pe.exclude_kernel = 0;
pe.exclude_hv = 0;
fd = perf_event_open(&pe, fd1, 0, -1, PERF_FLAG_PID_CGROUP|PERF_FLAG_FD_CLOEXEC);
if (fd == -1) {
fprintf(stderr, "Error opening leader: %s\n", strerror(errno));
exit(EXIT_FAILURE);
}
fd2 = perf_event_open(&pe, fd1, 1, -1, PERF_FLAG_PID_CGROUP|PERF_FLAG_FD_CLOEXEC);
if (fd2 == -1) {
fprintf(stderr, "Error: %s\n", strerror(errno));
exit(EXIT_FAILURE);
}
fd3 = perf_event_open(&pe, fd1, 2, -1, PERF_FLAG_PID_CGROUP|PERF_FLAG_FD_CLOEXEC);
if (fd3 == -1) {
fprintf(stderr, "Error: %s\n", strerror(errno));
exit(EXIT_FAILURE);
}
fd4 = perf_event_open(&pe, fd1, 3, -1, PERF_FLAG_PID_CGROUP|PERF_FLAG_FD_CLOEXEC);
if (fd4 == -1) {
fprintf(stderr, "Error: %s\n", strerror(errno));
exit(EXIT_FAILURE);
}
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
ioctl(fd2, PERF_EVENT_IOC_RESET, 0);
ioctl(fd2, PERF_EVENT_IOC_ENABLE, 0);
ioctl(fd3, PERF_EVENT_IOC_RESET, 0);
ioctl(fd3, PERF_EVENT_IOC_ENABLE, 0);
ioctl(fd4, PERF_EVENT_IOC_RESET, 0);
ioctl(fd4, PERF_EVENT_IOC_ENABLE, 0);
sleep(10); // using sleep(10) to actually observe instructions
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
ioctl(fd2, PERF_EVENT_IOC_DISABLE, 0);
ioctl(fd3, PERF_EVENT_IOC_DISABLE, 0);
ioctl(fd4, PERF_EVENT_IOC_DISABLE, 0);
read(fd, &count, sizeof(long long));
read(fd2, &count1, sizeof(long long));
read(fd3, &count2, sizeof(long long));
read(fd4, &count3, sizeof(long long));
printf("Used %lld instructions\n", count+count1+count2+count3);
close(fd);
close(fd2);
close(fd3);
close(fd4);
}
`
Output : Used 55174 instructions

Related

Using perf_event_open in AMD for Counting and Sampling Memory Load

I want to count and sample memory load operations that happen in a chunk of code while running it on an AMD machine. I know that the following code can be used to count memory loads happening in a code chunk on Intel machine.
int
main(int argc, char **argv)
{
struct perf_event_attr pe;
long long count;
int fd;
memset(&pe, 0, sizeof(struct perf_event_attr));
pe.type = PERF_TYPE_RAW;
pe.size = sizeof(struct perf_event_attr);
/* event number of MEM_UOPS_RETIRED.ALL_LOADS
event is 81d0 */
pe.config = 0x81d0;
pe.disabled = 1;
pe.exclude_kernel = 1;
pe.exclude_hv = 1;
fd = perf_event_open(&pe, 0, -1, -1, 0);
if (fd == -1) {
fprintf(stderr, "Error opening %llx\n", pe.config);
exit(EXIT_FAILURE);
}
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
// code chunk to be profiled is here
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
read(fd, &count, sizeof(long long));
printf("Number of load operations: %lld\n", count);
close(fd);
}
However, I don't know how to get the count of the same event from AMD machine. I have read AMD's document on performance counter and IBS in https://www.amd.com/system/files/TechDocs/24593.pdf sections 13.2 and 13.3. Yet, there is no mention of any hardware event number for memory load that can be passed to pe.config in the above code.
So, how can I count or sample memory load operations in an AMD machine by using perf_event_open?
PS: The AMD processor that I am using is AMD EPYC 7551 32-Core.

How to read i2c with out i2cget command?

I am currently using Kontron smarc-samx6i board and I am running this board with Yocto kernel. Here the kernel is not supporting the command i2cget to check i2c data to read from sensor. So, is there any other possible ways to read i2c data from the sensors?
You can add "i2c-tools" to your image:
IMAGE_INSTALL_append = " i2c-tools"
They include i2cget.c.
The recipe can be found in: /meta/recipes-devtools/i2c-tools/
You might also want to look into writing your own application using the i2c dev interface. See the kernel documentation on i2c.
read:
#include <stdio.h>
#include <fcntl.h>
#include <linux/i2c-dev.h>
#include <errno.h>
#define I2C_ADDR 0x20
int main (void) {
char buffer[1];
int fd;
fd = open("/dev/i2c-0", O_RDWR);
if (fd < 0) {
printf("Error opening file: %s\n", strerror(errno));
return 1;
}
if (ioctl(fd, I2C_SLAVE, I2C_ADDR) < 0) {
printf("ioctl error: %s\n", strerror(errno));
return 1;
}
buffer[0]=0xFF;
write(fd, buffer, 1);
read(fd, buffer, 1);
printf("0x%02X\n", buffer[0]);
return 0;
}
write:
#include <stdio.h>
#include <fcntl.h>
#include <linux/i2c-dev.h>
#include <errno.h>
#define I2C_ADDR 0x20
int main (void) {
int value;
int fd;
fd = open("/dev/i2c-0", O_RDWR);
if (fd < 0) {
printf("Error opening file: %s\n", strerror(errno));
return 1;
}
if (ioctl(fd, I2C_SLAVE, I2C_ADDR) < 0) {
printf("ioctl error: %s\n", strerror(errno));
return 1;
}
for (value=0; value<=255; value++) {
if (write(fd, &value, 1) != 1) {
printf("Error writing file: %s\n", strerror(errno));
}
usleep(100000);
}
return 0;
}

How to know the values of CR registers from linux user and kernel modes

I would like to know the CR0-CR4 register values on x86. Can I write inline assembly to read it out? Are there any other methods? (e.g., does OS keep any file structures to record these values)
The Linux kernel has some function to read and write Control Registers, they are the read_crX and write_crX functions for the standard CR and xgetbv,xsetbv for the extended CR.
User mode applications need a LKM to indirectly use these functions.
In theory you just need to create a LKM with one or more devices and handle IO requests by reading or writing from CR. In practice you usually have more than one CPU, so you need to handle MP.
I used the kernel module for CPUID as a template and create this LKM.
CODE IS WITHOUT ANY WARRANTY, TESTED ON DEBIAN 8 ON 64 bit VM ONLY
#include <linux/module.h> /* Needed by all modules */
#include <linux/kernel.h> /* Needed for KERN_INFO */
#include <linux/fs.h> /* Needed for KERN_INFO */
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/smp.h>
#include <linux/major.h>
#include <linux/fs.h>
#include <linux/device.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/uaccess.h>
#include <linux/gfp.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/xcr.h>
#define MAKE_MINOR(cpu, reg) (cpu<<8 | reg)
#define GET_MINOR_REG(minor) (minor & 0xff)
#define GET_MINOR_CPU(minor) (minor >> 8)
#define XCR_MINOR_BASE 0x80
static int major_n = 0;
static struct class *ctrlreg_class;
struct ctrlreg_info
{
unsigned int reg;
unsigned long value;
unsigned int error;
};
static void ctrlreg_smp_do_read(void* p)
{
struct ctrlreg_info* info = p;
info->error = 0;
printk(KERN_INFO "ctrlreg: do read of reg%u\n", info->reg);
switch (info->reg)
{
case 0: info->value = read_cr0(); break;
case 2: info->value = read_cr2(); break;
case 3: info->value = read_cr3(); break;
case 4: info->value = read_cr4(); break;
#ifdef CONFIG_X86_64
case 8: info->value = read_cr8(); break;
#endif
case XCR_MINOR_BASE: info->value = xgetbv(0); break;
default:
info->error = -EINVAL;
}
}
static void ctrlreg_smp_do_write(void* p)
{
struct ctrlreg_info* info = p;
info->error = 0;
switch (info->reg)
{
case 0: write_cr0(info->value); break;
case 2: write_cr2(info->value); break;
case 3: write_cr3(info->value); break;
case 4: write_cr4(info->value); break;
#ifdef CONFIG_X86_64
case 8: read_cr8(); break;
#endif
case XCR_MINOR_BASE: xgetbv(0); break;
default:
info->error = -EINVAL;
}
}
static ssize_t ctrlreg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
unsigned int minor = iminor(file_inode(file));
unsigned int cpu = GET_MINOR_CPU(minor);
unsigned int reg = GET_MINOR_REG(minor);
struct ctrlreg_info info = {.reg = reg};
int err;
printk(KERN_INFO "ctrlreg: read for cpu%u reg%u\n", cpu, reg);
printk(KERN_INFO "ctrlreg: read of %zu bytes\n", count);
if (count < sizeof(unsigned long))
return -EINVAL;
printk(KERN_INFO "ctrlreg: scheduling read\n");
err = smp_call_function_single(cpu, ctrlreg_smp_do_read, &info, 1);
if (IS_ERR_VALUE(err))
return err;
printk(KERN_INFO "ctrlreg: read success: %x\n", info.error);
if (IS_ERR_VALUE(info.error))
return err;
err = copy_to_user(buf, &info.value, sizeof(unsigned long));
printk(KERN_INFO "ctrlreg: read copy result: %x ( %lu )\n", err, sizeof(unsigned long));
if (IS_ERR_VALUE(err))
return err;
printk(KERN_INFO "ctrlreg: read done\n");
return sizeof(unsigned long);
}
static ssize_t ctrlreg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
unsigned int minor = iminor(file_inode(file));
unsigned int cpu = GET_MINOR_CPU(minor);
unsigned int reg = GET_MINOR_REG(minor);
struct ctrlreg_info info = {.reg = reg};
int err;
printk(KERN_INFO "ctrlreg: write for cpu%u reg%u\n", cpu, reg);
printk(KERN_INFO "ctrlreg: write of %zu bytes\n", count);
if (count < sizeof(unsigned long))
return -EINVAL;
printk(KERN_INFO "ctrlreg: scheduling write\n");
err = copy_from_user((void*)buf, &info.value, sizeof(unsigned long));
printk(KERN_INFO "ctrlreg: write copy data: %x ( %lu )\n", err, sizeof(unsigned long));
if (IS_ERR_VALUE(err))
return err;
err = smp_call_function_single(cpu, ctrlreg_smp_do_write, &info, 1);
if (IS_ERR_VALUE(err))
return err;
printk(KERN_INFO "ctrlreg: write success: %x\n", info.error);
if (IS_ERR_VALUE(info.error))
return err;
printk(KERN_INFO "ctrlreg: write done\n");
return sizeof(unsigned long);
}
static void ctrlreg_can_open(void *p)
{
unsigned int* reg = p;
unsigned int reg_num = *reg;
unsigned int ebx, edx, eax, ecx;
unsigned int support_xgetbv, support_ia32e;
*reg = 0; //Success
printk(KERN_INFO "ctrlreg: can open reg %u\n", reg_num);
if (reg_num <= 4 && reg_num != 1)
return;
#ifdef CONFIG_X86_64
if (reg_num == 8)
return;
#endif
cpuid_count(0x0d, 1, &eax, &ebx, &ecx, &edx);
support_xgetbv = cpuid_ecx(1) & 0x04000000;
support_ia32e = cpuid_edx(0x80000001) & 0x20000000;
printk(KERN_INFO "ctrlreg: xgetbv = %d\n", support_xgetbv);
printk(KERN_INFO "ctrlreg: ia32e = %d\n", support_ia32e);
if (support_xgetbv && support_ia32e)
return;
printk(KERN_INFO "ctrlreg: open denied");
*reg = -EIO;
}
static int ctrlreg_open(struct inode *inode, struct file *file)
{
unsigned int cpu;
unsigned int reg;
unsigned int minor;
int err;
minor = iminor(file_inode(file));
cpu = GET_MINOR_CPU(minor);
reg = GET_MINOR_REG(minor);
printk(KERN_INFO "ctrlreg: open device for cpu%u reg%u\n", cpu, reg);
if (cpu >= nr_cpu_ids || !cpu_online(cpu))
return -ENXIO; /* No such CPU */
err = smp_call_function_single(cpu, ctrlreg_can_open, &reg, 1);
if (IS_ERR_VALUE(err))
return err;
return reg;
}
static const struct file_operations ctrlreg_fops =
{
.owner = THIS_MODULE,
.read = ctrlreg_read,
.write = ctrlreg_write,
.open = ctrlreg_open
};
static int ctrlreg_device_create(int cpu)
{
struct device *dev = NULL;
int i;
printk(KERN_INFO "ctrlreg: device create for cpu %d\n", cpu);
//CR0, 2-4, 8
for (i = 0; i <= 8; i++)
{
if ((i>4 && i<8) || i == 1)
continue; //Skip non existent regs
printk(KERN_INFO "ctrlreg: device cpu%dcr%d\n", cpu, i);
dev = device_create(ctrlreg_class, NULL, MKDEV(major_n, MAKE_MINOR(cpu, i)), NULL, "cpu%dcr%d", cpu, i);
if (IS_ERR(dev))
return PTR_ERR(dev);
}
//XCR0
for (i = 0; i <= 0; i++)
{
printk(KERN_INFO "ctrlreg: device cpu%dxcr%d\n", cpu, i);
dev = device_create(ctrlreg_class, NULL, MKDEV(major_n, MAKE_MINOR(cpu, (XCR_MINOR_BASE+i))), NULL, "cpu%dxcr%d", cpu, i);
if (IS_ERR(dev))
return PTR_ERR(dev);
}
return 0;
}
static void ctrlreg_device_destroy(int cpu)
{
int i;
//CR0, 2-4, 8
for (i = 0; i <= 8; i++)
{
if ((i>4 && i<8) || i == 1)
continue; //Skip non existent regs
device_destroy(ctrlreg_class, MKDEV(major_n, MAKE_MINOR(cpu, i)));
}
//XCR0
for (i = 0; i <= 0; i++)
device_destroy(ctrlreg_class, MKDEV(major_n, MAKE_MINOR(cpu, (XCR_MINOR_BASE+i))));
}
static int ctrlreg_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
int err = 0;
switch (action)
{
case CPU_UP_PREPARE:
err = ctrlreg_device_create(cpu);
break;
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
ctrlreg_device_destroy(cpu);
break;
}
return notifier_from_errno(err);
}
static struct notifier_block __refdata ctrlreg_class_cpu_notifier =
{
.notifier_call = ctrlreg_class_cpu_callback,
};
static char* ctrlreg_devnode(struct device *dev, umode_t *mode)
{
unsigned int minor = MINOR(dev->devt), cpu = GET_MINOR_CPU(minor), reg = GET_MINOR_REG(minor);
if (reg < XCR_MINOR_BASE)
return kasprintf(GFP_KERNEL, "crs/cpu%u/cr%u", cpu, reg);
else
return kasprintf(GFP_KERNEL, "crs/cpu%u/xcr%u", cpu, reg-XCR_MINOR_BASE);
}
int __init ctrlreg_init(void)
{
int err = 0, i = 0;
printk(KERN_INFO "ctrlreg: init\n");
if ((major_n = __register_chrdev(0, 0, NR_CPUS, "crs", &ctrlreg_fops)) < 0)
return major_n;
printk(KERN_INFO "ctrlreg: major number is %u\n", major_n);
ctrlreg_class = class_create(THIS_MODULE, "ctrlreg\n");
if (IS_ERR(ctrlreg_class))
{
err = PTR_ERR(ctrlreg_class);
goto out_chrdev;
}
printk(KERN_INFO "ctrlreg: class created\n");
ctrlreg_class->devnode = ctrlreg_devnode;
cpu_notifier_register_begin();
for_each_online_cpu(i)
{
err = ctrlreg_device_create(i);
if (IS_ERR_VALUE(err))
goto out_class;
}
__register_hotcpu_notifier(&ctrlreg_class_cpu_notifier);
cpu_notifier_register_done();
printk(KERN_INFO "ctrlreg: init success\n");
err = 0;
goto out;
out_class:
i = 0;
for_each_online_cpu(i)
{
ctrlreg_device_destroy(i);
}
cpu_notifier_register_done();
class_destroy(ctrlreg_class);
out_chrdev:
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "ctrlreg");
out:
return err;
}
static void __exit ctrlreg_exit(void)
{
int cpu = 0;
cpu_notifier_register_begin();
for_each_online_cpu(cpu)
ctrlreg_device_destroy(cpu);
class_destroy(ctrlreg_class);
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "ctrlreg");
__unregister_hotcpu_notifier(&ctrlreg_class_cpu_notifier);
cpu_notifier_register_done();
}
module_init(ctrlreg_init);
module_exit(ctrlreg_exit);
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Kee Nemesis 241");
MODULE_DESCRIPTION("Read and write Control Registers");
This module create the following dev nodes:
/dev/crs/cpu0/cr0
/dev/crs/cpu0/cr2
/dev/crs/cpu0/cr3
/dev/crs/cpu0/cr4
/dev/crs/cpu0/cr8
/dev/crs/cpu0/xcr0
/dev/crs/cpu1/cr0
/dev/crs/cpu1/cr2
/dev/crs/cpu1/cr3
/dev/crs/cpu1/cr4
/dev/crs/cpu1/cr8
/dev/crs/cpu1/xcr0
...
You can read/write these dev nodes. The minimum read/write length is 4 bytes on 32 bit system and 8 bytes on 64 bit ones (Linux do some buffering anyway).
To compile this LKM, save the code above as ctrlreg.c and create this Makefile
obj-m += ctrlreg.o
all:
make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
clean:
make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
then use make to get ctrlreg.ko.
To load the module use sudo insmod ctrlreg.ko, to remove it sudo rmmod ctrlreg.
I have also written a small user mode utility to read CR:
CODE IS WITHOUT ANY WARRANTY, TESTED ON DEBIAN 8 ON 64 bit VM ONLY
#include <stdio.h>
#include <stdlib.h>
#define MAX_PATH 256
int main(int argc, char* argv[])
{
unsigned long cpu, reg;
FILE* fin;
char device[MAX_PATH];
unsigned long data;
if (argc < 3 || argc > 4)
return fprintf(stderr, "Usage:\n\t\t cr cpu reg [value]\n"), 1;
if (sscanf(argv[1], "cpu%u", &cpu) != 1)
return fprintf(stderr, "Invalid value '%s' for cpu\n", argv[1]), 2;
if (sscanf(argv[2], "cr%u", &reg) != 1 && sscanf(argv[2], "xcr%u", &reg) != 1)
return fprintf(stderr, "Invalid value '%s' for reg\n", argv[2]), 3;
if (argc == 4 && sscanf(argv[3], "%lu", &data) != 1)
return fprintf(stderr, "Invalid numeric value '%s'\n", argv[3]), 6;
snprintf(device, MAX_PATH, "/dev/crs/cpu%u/%s", cpu, argv[2]);
fin = fopen(device, argc == 4 ? "wb" : "rb");
if (!fin)
return fprintf(stderr, "Cannot open device %s\n", device), 4;
if (argc == 4)
{
if (fwrite(&data, sizeof(data), 1, fin) != 1)
return fprintf(stderr, "Cannot write device %s (%d)\n", device, ferror(fin)), 5;
}
else
{
if (fread(&data, sizeof(data), 1, fin) != 1)
return fprintf(stderr, "Cannot read device %s (%d)\n", device, ferror(fin)), 7;
printf("%016x\n", data);
}
fclose(fin);
return 0;
}
Save the code as cr.c and compile it.
To read cr0 of the second CPU you can use:
cr cpu1 cr0
To write the value 0 (be careful) into it
cr cpu1 cr0 0

Single-Sided communications with MPI-2

Consider the following fragment of OpenMP code which transfers private data between two threads using an intermediate shared variable
#pragma omp parallel shared(x) private(a,b)
{
...
a = somefunction(b);
if (omp_get_thread_num() == 0) {
x = a;
}
}
#pragma omp parallel shared(x) private(a,b)
{
if (omp_get_thread_num() == 1) {
a = x;
}
b = anotherfunction(a);
...
}
I would (in pseudocode ) need to transfer of private data from one process to another using a single-sided message-passing library.
Any ideas?
This is possible, but there's a lot more "scaffolding" involved -- after all, you are communicating data between potentially completely different computers.
The coordination for this sort of thing is done between windows of data which are accessible from other processors, and with lock/unlock operations which coordinate the access of this data. The locks aren't really locks in the sense of being mutexes, but they are more like synchronization points coordinating data access to the window.
I don't have time right now to explain this in the detail I'd like, but below is an example of using MPI2 to do something like shared memory flagging in a system that doesn't have shared memory:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"
int main(int argc, char** argv)
{
int rank, size, *a, geta;
int x;
int ierr;
MPI_Win win;
const int RCVR=0;
const int SENDER=1;
ierr = MPI_Init(&argc, &argv);
ierr |= MPI_Comm_rank(MPI_COMM_WORLD, &rank);
ierr |= MPI_Comm_size(MPI_COMM_WORLD, &size);
if (ierr) {
fprintf(stderr,"Error initializing MPI library; failing.\n");
exit(-1);
}
if (rank == RCVR) {
MPI_Alloc_mem(sizeof(int), MPI_INFO_NULL, &a);
*a = 0;
} else {
a = NULL;
}
MPI_Win_create(a, 1, sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD, &win);
if (rank == SENDER) {
/* Lock recievers window */
MPI_Win_lock(MPI_LOCK_EXCLUSIVE, RCVR, 0, win);
x = 5;
/* put 1 int (from &x) to 1 int rank RCVR, at address 0 in window "win"*/
MPI_Put(&x, 1, MPI_INT, RCVR, 0, 1, MPI_INT, win);
/* Unlock */
MPI_Win_unlock(0, win);
printf("%d: My job here is done.\n", rank);
}
if (rank == RCVR) {
for (;;) {
MPI_Win_lock(MPI_LOCK_EXCLUSIVE, RCVR, 0, win);
MPI_Get(&geta, 1, MPI_INT, RCVR, 0, 1, MPI_INT, win);
MPI_Win_unlock(0, win);
if (geta == 0) {
printf("%d: a still zero; sleeping.\n",rank);
sleep(2);
} else
break;
}
printf("%d: a now %d!\n",rank,geta);
printf("a = %d\n", *a);
MPI_Win_free(&win);
if (rank == RCVR) MPI_Free_mem(a);
MPI_Finalize();
return 0;
}

Page faults on OS X when reading with MMAP

I am trying to benchmark file system I/O on Mac OS X using mmap.
#include <unistd.h>
#include <fcntl.h>
#include <dirent.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <stdio.h>
#include <math.h>
char c;
int main(int argc, char ** argv)
{
if (argc != 2)
{
printf("no files\n");
exit(1);
}
int fd = open(argv[1], O_RDONLY);
fcntl(fd, F_NOCACHE, 1);
int offset=0;
int size=0x100000;
int pagesize = getpagesize();
struct stat stats;
fstat(fd, &stats);
int filesize = stats.st_size;
printf("%d byte pages\n", pagesize);
printf("file %s # %d bytes\n", argv[1], filesize);
while(offset < filesize)
{
if(offset + size > filesize)
{
int pages = ceil((filesize-offset)/(double)pagesize);
size = pages*pagesize;
}
printf("mapping offset %x with size %x\n", offset, size);
void * mem = mmap(0, size, PROT_READ, 0, fd, offset);
if(mem == -1)
return 0;
offset+=size;
int i=0;
for(; i<size; i+=pagesize)
{
c = *((char *)mem+i);
}
munmap(mem, size);
}
return 0;
}
The idea is that I'll map a file or portion of it and then cause a page fault by dereferencing it. I am slowly losing my sanity since this doesn't at all work and I've done similar things on Linux before.
Change this line
void * mem = mmap(0, size, PROT_READ, 0, fd, offset);
to
void * mem = mmap(0, size, PROT_READ, MAP_PRIVATE, fd, offset);
And, don't compare mem with -1. Use this instead:
if(mem == MAP_FAILED) { ... }
It's both more readable and more portable.
General advice: if you're on a different UNIX platform from what you're used to, it's a good idea to open the man page. For mmap on OS X, it can be found here. It says
Conforming applications must specify either MAP_PRIVATE or MAP_SHARED.
So, specifying 0 on the fourth
argument is not OK in OS X. I believe
this is true for BSD in general.

Resources