what is wrong with the bpf_csum_diff function call? - linux-kernel

I want to somehow redirect packets using ebpf. Took an example from the Cilium documentation: Implementation: proxy via bpf
here is an example of my macro in bpf_helpers:
...
static int (*bpf_csum_diff)(void *from, __u64 from_size, void *to, __u64 to_size, __u64 seed) = (void*) // NOLINT
BPF_FUNC_csum_diff;
...
here is the code of the proxy script itself:
#include <linux/bpf.h>
#include "../main/bpf_helpers.h"
#include "../main/bpf_endian.h"
#include <linux/in.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/types.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <stdlib.h>
#include "../main/utils_helpers.h"
#include <stddef.h>
#include <linux/pkt_cls.h>
SEC("socket_filter")
int proxy(struct __sk_buff *skb)
{
const __be32 cluster_ip = 0x846F070A; // 10.7.111.132
const __be32 pod_ip = 0x0529050A; // 10.5.41.5
const int l3_off = ETH_HLEN; // IP header offset
const int l4_off = l3_off + 20; // TCP header offset: l3_off + sizeof(struct iphdr)
__be32 sum; // IP checksum
void *data = (void *)(long)skb->data;
void *data_end = (void *)(long)skb->data_end;
if (data_end < data + l4_off) { // not our packet
return TC_ACT_OK;
}
struct iphdr *ip4 = (struct iphdr *)(data + l3_off);
if (ip4->daddr != cluster_ip || ip4->protocol != IPPROTO_TCP /* || tcp->dport == 80 */) {
return TC_ACT_OK;
}
// DNAT: cluster_ip -> pod_ip, then update L3 and L4 checksum
sum = bpf_csum_diff((void *)&ip4->daddr, 4, (void *)&pod_ip, 4, 0);
bpf_csum_diff((void *)&ip4->daddr, 4, (void *)&pod_ip, 4, 0);
bpf_skb_store_bytes(skb, l3_off + offsetof(struct iphdr, daddr), (void *)&pod_ip, 4, 0);
bpf_l3_csum_replace(skb, l3_off + offsetof(struct iphdr, check), 0, sum, 0);
bpf_l4_csum_replace(skb, l4_off + offsetof(struct tcphdr, check), 0, sum, BPF_F_PSEUDO_HDR);
return TC_ACT_OK;
}
char __license[] SEC("license") = "GPL";
Here is the code of the proxy script itself:
...
2020/06/02 21:58:17 sf.Load(): %vebpf_prog_load() failed: 0: (b7) r2 = 86574346
1: (63) *(u32 *)(r10 -4) = r2
2: (61) r2 = *(u32 *)(r1 +80)
invalid bpf_context access off=80 size=4
processed 3 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
...
Tell me how to correctly proxy packets to another port and another ip in docker?

Related

Userfaultfd write protection appears unsupported when checking through the UFFDIO_API ioctl

I am trying to use the write protection feature of Linux's userfaultfd, but it does not appear to be enabled in my kernel even though I am using version 5.13 (write protection should be fully supported in 5.10+).
When I run
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#define errExit(msg) \
do { \
perror(msg); \
exit(EXIT_FAILURE); \
} while (0)
static int has_bit(uint64_t val, uint64_t bit) {
return (val & bit) == bit;
}
int main() {
long uffd; /* userfaultfd file descriptor */
struct uffdio_api uffdio_api;
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
if (uffd == -1)
errExit("userfaultfd");
uffdio_api.api = UFFD_API;
uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
errExit("ioctl-UFFDIO_API");
printf("UFFDIO_API: %d\n", has_bit(uffdio_api.ioctls, 1UL << _UFFDIO_API));
printf("UFFDIO_REGISTER: %d\n", has_bit(uffdio_api.ioctls, 1UL << _UFFDIO_REGISTER));
printf("UFFDIO_UNREGISTER: %d\n", has_bit(uffdio_api.ioctls, 1UL << _UFFDIO_UNREGISTER));
printf("UFFDIO_WRITEPROTECT: %d\n", has_bit(uffdio_api.ioctls, 1UL << _UFFDIO_WRITEPROTECT));
printf("UFFD_FEATURE_PAGEFAULT_FLAG_WP: %d\n", has_bit(uffdio_api.features, UFFD_FEATURE_PAGEFAULT_FLAG_WP));
}
The output is
UFFDIO_API: 1
UFFDIO_REGISTER: 1
UFFDIO_UNREGISTER: 1
UFFDIO_WRITEPROTECT: 0
UFFD_FEATURE_PAGEFAULT_FLAG_WP: 1
The UFFD_FEATURE_PAGEFAULT_FLAG_WP feature is enabled, but the UFFDIO_WRITEPROTECT ioctl is marked as not supported, which is necessary to enable write protection.
What might lead to this feature being disabled, and how can I enable it?
I am using Ubuntu MATE 21.10 with Linux kernel version 5.13.0-30-generic.
EDIT:
It seems like despite the man page section on the UFFD_API ioctl (https://man7.org/linux/man-pages/man2/ioctl_userfaultfd.2.html), this might be the intended behavior for a system where write protection is enabled. However, when I run a full program that spawns a poller thread and writes to the protected memory, the poller thread does not receive any notification.
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#define errExit(msg) \
do { \
perror(msg); \
exit(EXIT_FAILURE); \
} while (0)
static int page_size;
static void* fault_handler_thread(void* arg) {
long uffd; /* userfaultfd file descriptor */
uffd = (long) arg;
/* Loop, handling incoming events on the userfaultfd
file descriptor. */
for (;;) {
/* See what poll() tells us about the userfaultfd. */
struct pollfd pollfd;
int nready;
pollfd.fd = uffd;
pollfd.events = POLLIN;
nready = poll(&pollfd, 1, -1);
if (nready == -1)
errExit("poll");
printf("\nfault_handler_thread():\n");
printf(
" poll() returns: nready = %d; "
"POLLIN = %d; POLLERR = %d\n",
nready, (pollfd.revents & POLLIN) != 0,
(pollfd.revents & POLLERR) != 0);
// received fault, exit the program
exit(EXIT_FAILURE);
}
}
int main() {
long uffd; /* userfaultfd file descriptor */
char* addr; /* Start of region handled by userfaultfd */
uint64_t len; /* Length of region handled by userfaultfd */
pthread_t thr; /* ID of thread that handles page faults */
struct uffdio_api uffdio_api;
struct uffdio_register uffdio_register;
struct uffdio_writeprotect uffdio_wp;
int s;
page_size = sysconf(_SC_PAGE_SIZE);
len = page_size;
/* Create and enable userfaultfd object. */
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
if (uffd == -1)
errExit("userfaultfd");
uffdio_api.api = UFFD_API;
uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
errExit("ioctl-UFFDIO_API");
addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (addr == MAP_FAILED)
errExit("mmap");
printf("Address returned by mmap() = %p\n", addr);
/* Register the memory range of the mapping we just created for
handling by the userfaultfd object. */
uffdio_register.range.start = (unsigned long) addr;
uffdio_register.range.len = len;
uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
errExit("ioctl-UFFDIO_REGISTER");
printf("uffdio_register.ioctls = 0x%llx\n", uffdio_register.ioctls);
printf("Have _UFFDIO_WRITEPROTECT? %s\n", (uffdio_register.ioctls & _UFFDIO_WRITEPROTECT) ? "YES" : "NO");
uffdio_wp.range.start = (unsigned long) addr;
uffdio_wp.range.len = len;
uffdio_wp.mode = UFFDIO_WRITEPROTECT_MODE_WP;
if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffdio_wp) == -1)
errExit("ioctl-UFFDIO_WRITEPROTECT");
/* Create a thread that will process the userfaultfd events. */
s = pthread_create(&thr, NULL, fault_handler_thread, (void*) uffd);
if (s != 0) {
errno = s;
errExit("pthread_create");
}
/* Main thread now touches memory in the mapping, touching
locations 1024 bytes apart. This will trigger userfaultfd
events for all pages in the region. */
usleep(100000);
size_t l;
l = 0xf; /* Ensure that faulting address is not on a page
boundary, in order to test that we correctly
handle that case in fault_handling_thread(). */
char i = 0;
while (l < len) {
printf("Write address %p in main(): ", addr + l);
addr[l] = i++;
printf("%d\n", addr[l]);
l += 1024;
usleep(100000); /* Slow things down a little */
}
exit(EXIT_SUCCESS);
}
The UFFD_API ioctl does not seem to ever report _UFFD_WRITEPROTECT as can be seen here in the kernel source code (1, 2). I assume that this is because whether this operation is supported or not depends on the kind of underlying mapping.
The feature is in fact reporeted on a per-registered-range basis. You will have to set the API with ioctl(uffd, UFFDIO_API, ...) first, then register a range with ioctl(uffd, UFFDIO_REGISTER, ...) and then check the uffdio_register.ioctls field.
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <unistd.h>
#define errExit(msg) \
do { \
perror(msg); \
exit(EXIT_FAILURE); \
} while (0)
int main(void) {
long uffd;
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
if (uffd == -1)
errExit("userfaultfd");
struct uffdio_api uffdio_api = { .api = UFFD_API };
if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
errExit("ioctl(UFFDIO_API)");
const size_t region_sz = 0x4000;
void *region = mmap(NULL, region_sz, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
if (region == MAP_FAILED)
errExit("mmap");
if (posix_memalign((void **)region, sysconf(_SC_PAGESIZE), region_sz))
errExit("posix_memalign");
printf("Region mapped at %p - %p\n", region, region + region_sz);
struct uffdio_register uffdio_register = {
.range = { .start = (unsigned long)region, .len = region_sz },
.mode = UFFDIO_REGISTER_MODE_WP
};
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
errExit("ioctl(UFFDIO_REGISTER)");
printf("uffdio_register.ioctls = 0x%llx\n", uffdio_register.ioctls);
printf("Have _UFFDIO_WRITEPROTECT? %s\n", (uffdio_register.ioctls & _UFFDIO_WRITEPROTECT) ? "YES" : "NO");
if ((uffdio_register.ioctls & UFFD_API_RANGE_IOCTLS) != UFFD_API_RANGE_IOCTLS)
errExit("bad ioctl set");
struct uffdio_writeprotect wp = {
.range = { .start = (unsigned long)region, .len = region_sz },
.mode = UFFDIO_WRITEPROTECT_MODE_WP
};
if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp) == -1)
errExit("ioctl(UFFDIO_WRITEPROTECT)");
puts("ioctl(UFFDIO_WRITEPROTECT) successful.");
return EXIT_SUCCESS;
}
Output:
Region mapped at 0x7f45c48fe000 - 0x7f45c4902000
uffdio_register.ioctls = 0x5c
Have _UFFDIO_WRITEPROTECT? YES
ioctl(UFFDIO_WRITEPROTECT) successful.
I found the solution. The write-protected pages must be touched after registering but before marking them as write-protected. This is an undocumented requirement, from what I can tell.
In other words, add
for (size_t i = 0; i < len; i += page_size) {
addr[i] = 0;
}
between registering and write-protecting.
It works if I change the full example to
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#define errExit(msg) \
do { \
perror(msg); \
exit(EXIT_FAILURE); \
} while (0)
static int page_size;
static void* fault_handler_thread(void* arg) {
long uffd; /* userfaultfd file descriptor */
uffd = (long) arg;
/* Loop, handling incoming events on the userfaultfd
file descriptor. */
for (;;) {
/* See what poll() tells us about the userfaultfd. */
struct pollfd pollfd;
int nready;
pollfd.fd = uffd;
pollfd.events = POLLIN;
nready = poll(&pollfd, 1, -1);
if (nready == -1)
errExit("poll");
printf("\nfault_handler_thread():\n");
printf(
" poll() returns: nready = %d; "
"POLLIN = %d; POLLERR = %d\n",
nready, (pollfd.revents & POLLIN) != 0,
(pollfd.revents & POLLERR) != 0);
// received fault, exit the program
exit(EXIT_FAILURE);
}
}
int main() {
long uffd; /* userfaultfd file descriptor */
char* addr; /* Start of region handled by userfaultfd */
uint64_t len; /* Length of region handled by userfaultfd */
pthread_t thr; /* ID of thread that handles page faults */
struct uffdio_api uffdio_api;
struct uffdio_register uffdio_register;
struct uffdio_writeprotect uffdio_wp;
int s;
page_size = sysconf(_SC_PAGE_SIZE);
len = page_size;
/* Create and enable userfaultfd object. */
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
if (uffd == -1)
errExit("userfaultfd");
uffdio_api.api = UFFD_API;
uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
errExit("ioctl-UFFDIO_API");
addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (addr == MAP_FAILED)
errExit("mmap");
printf("Address returned by mmap() = %p\n", addr);
/* Register the memory range of the mapping we just created for
handling by the userfaultfd object. */
uffdio_register.range.start = (unsigned long) addr;
uffdio_register.range.len = len;
uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
errExit("ioctl-UFFDIO_REGISTER");
printf("uffdio_register.ioctls = 0x%llx\n", uffdio_register.ioctls);
printf("Have _UFFDIO_WRITEPROTECT? %s\n", (uffdio_register.ioctls & _UFFDIO_WRITEPROTECT) ? "YES" : "NO");
for (size_t i = 0; i < len; i += page_size) {
addr[i] = 0;
}
uffdio_wp.range.start = (unsigned long) addr;
uffdio_wp.range.len = len;
uffdio_wp.mode = UFFDIO_WRITEPROTECT_MODE_WP;
if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffdio_wp) == -1)
errExit("ioctl-UFFDIO_WRITEPROTECT");
/* Create a thread that will process the userfaultfd events. */
s = pthread_create(&thr, NULL, fault_handler_thread, (void*) uffd);
if (s != 0) {
errno = s;
errExit("pthread_create");
}
/* Main thread now touches memory in the mapping, touching
locations 1024 bytes apart. This will trigger userfaultfd
events for all pages in the region. */
usleep(100000);
size_t l;
l = 0xf; /* Ensure that faulting address is not on a page
boundary, in order to test that we correctly
handle that case in fault_handling_thread(). */
char i = 0;
while (l < len) {
printf("Write address %p in main(): ", addr + l);
addr[l] = i++;
printf("%d\n", addr[l]);
l += 1024;
usleep(100000); /* Slow things down a little */
}
exit(EXIT_SUCCESS);
}

Effective implementation of conversion small string to uint64_t

#include <cstdint>
#include <cstring>
template<typename T>
T oph_(const char *s){
constexpr std::size_t MAX = sizeof(T);
const std::size_t size = strnlen(s, MAX);
T r = 0;
for(auto it = s; it - s < size; ++it)
r = r << 8 | *it;
return r;
}
inline uint64_t oph(const char *s){
return oph_<uint64_t>(s);
}
int main(){
uint64_t const a = oph("New York City");
uint64_t const b = oph("Boston International");
return a > b;
}
I want to convert first 8 characters from const char * to uint64_t so I can easily compare if two strings are greater / lesser.
I am aware that equals will semi-work.
However I am not sure if this is most efficient implementation.
I want the implementation to work on both little and big endian machines.
This is a C implementation, that should be faster that your implementation, but I still need to use strncpy which should be the bottleneck
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <byteswap.h>
union small_str {
uint64_t v;
char buf[8];
};
static uint64_t fill_small_str(const char *str)
{
union small_str ss = { 0 };
strncpy(ss.buf, str, 8);
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
return ss.v;
#else
return bswap_64(ss.v);
#endif
}
int main(void)
{
uint64_t const a = fill_small_str("Aew York City");
uint64_t const b = fill_small_str("Boston International");
printf("%lu ; %lu ; %d\n", a, b, (a < b));
return 0;
}

Missing linux/if.h

hello I am getting this error when using cygwin to try to make,
this is my makefile
flags=-g
all: icmptx
icmptx: it.o icmptx.o tun_dev.o
gcc $(flags) -o icmptx icmptx.o it.o tun_dev.o
it.o: it.c
gcc $(flags) -c it.c
icmptx.o: icmptx.c
gcc $(flags) -c icmptx.c
tun_dev.o: tun_dev.c
gcc $(flags) -c tun_dev.c
clean:
rm -f tun_dev.o it.o icmptx.o icmptx
the error is
$ make
gcc -g -c tun_dev.c
tun_dev.c:35:22: fatal error: linux/if.h: No such file or directory
compilation terminated.
Makefile:15: recipe for target `tun_dev.o' failed
make: *** [tun_dev.o] Error 1
here is my tun_dev.c file
/*
*/
/*
* tun_dev.c,v 1.1.2.4 2001/09/13 05:02:22 maxk Exp
*/
/* #include "config.h" */
#include <unistd.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <syslog.h>
#include <errno.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/if.h>
#include "vtun.h"
#include "lib.h"
/*
* Allocate TUN device, returns opened fd.
* Stores dev name in the first arg(must be large enough).
*/
int tun_open_old(char *dev)
{
char tunname[14];
int i, fd;
if( *dev ) {
sprintf(tunname, "/dev/%s", dev);
return open(tunname, O_RDWR);
}
for(i=0; i < 255; i++){
sprintf(tunname, "/dev/tun%d", i);
/* Open device */
if( (fd=open(tunname, O_RDWR)) > 0 ){
sprintf(dev, "tun%d", i);
return fd;
}
}
return -1;
}
#include <linux/if_tun.h>
/* pre 2.4.6 compatibility */
#define OTUNSETNOCSUM (('T'<< 8) | 200)
#define OTUNSETDEBUG (('T'<< 8) | 201)
#define OTUNSETIFF (('T'<< 8) | 202)
#define OTUNSETPERSIST (('T'<< 8) | 203)
#define OTUNSETOWNER (('T'<< 8) | 204)
int tun_open(char *dev)
{
struct ifreq ifr;
int fd;
if ((fd = open("/dev/net/tun", O_RDWR)) < 0)
return tun_open_old(dev);
memset(&ifr, 0, sizeof(ifr));
ifr.ifr_flags = IFF_TUN | IFF_NO_PI;
if (*dev)
strncpy(ifr.ifr_name, dev, IFNAMSIZ);
if (ioctl(fd, TUNSETIFF, (void *) &ifr) < 0) {
if (errno == EBADFD) {
/* Try old ioctl */
if (ioctl(fd, OTUNSETIFF, (void *) &ifr) < 0)
goto failed;
} else
goto failed;
}
strcpy(dev, ifr.ifr_name);
return fd;
failed:
close(fd);
return -1;
}
int tun_close(int fd, char *dev)
{
return close(fd);
}
/* Read/write frames from TUN device */
int tun_write(int fd, char *buf, int len)
{
return write(fd, buf, len);
}
int tun_read(int fd, char *buf, int len)
{
return read(fd, buf, len);
}
any idea, google shows nothing
Try:
#include <net/if.h>
instead.

SO_REUSEADDR and UDP behavior in Windows

I'm aware that using SO_REUSEADDR with UDP in a *NIX environment, behaves like a multicast, where multiple clients bound to the same port can listen and receive broadcast datagrams simultaneously. Is this the behavior on Windows as well?
Multiple UDP sockets on Windows bound to the same port will all receive broadcast packets together.
Here's a demo program you can build for windows and Linux with GCC and test with Netcat as mentioned. In both systems, only one socket (either A or B) receives each datagram when a unicast address is used as the target. If a broadcast address is used then both sockets will receive the message.
/* Tested on linux and windows 7.
* On windows use mingw-gcc:
* gcc -Wall -g -o udplisten udplisten.c -lws2_32
* Test with:
* echo hello | netcat -u machinename 9898 (unicast)
* echo hello | netcat -u 172.16.255.255 9898 (broadcast)
*/
#ifdef WIN32
#include <winsock2.h>
#include <ws2tcpip.h>
#else
#include <sys/select.h>
#include <sys/socket.h>
#include <netinet/in.h>
#define SOCKET int
#define INVALID_SOCKET -1
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#undef max
#define max(x,y) ((x) > (y) ? (x) : (y))
static void
die(const char *str)
{
perror(str);
exit(1);
}
static SOCKET
mksocket(struct sockaddr_in *addr)
{
SOCKET sock = INVALID_SOCKET;
int opt = 1;
if ((sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP)) < 0)
die("socket");
if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (const char *)&opt, sizeof(opt)) < 0)
die("setsockopt");
if (bind(sock, (struct sockaddr *)addr, sizeof(struct sockaddr_in)) < 0)
die("bind");
return sock;
}
static void
process(SOCKET sock, const char *label)
{
char buffer[8192];
struct sockaddr_in caddr;
socklen_t caddr_size = sizeof(caddr);
memset(&caddr, 0, caddr_size);
int count = recvfrom(sock, buffer, sizeof(buffer), 0,
(struct sockaddr *)&caddr, &caddr_size);
if (count < 0) die(label);
printf("%s %d '", label, count);
fwrite(buffer, 1, count, stdout);
printf("'\n");
}
int
main(int argc, char *argv[])
{
struct sockaddr_in addr;
SOCKET socka = INVALID_SOCKET, sockb = INVALID_SOCKET;
fd_set read_set;
#ifdef WIN32
WSADATA wsaData;
if (WSAStartup(MAKEWORD(2,2), &wsaData))
return -1;
#endif
addr.sin_family = AF_INET;
addr.sin_port = htons(9898);
addr.sin_addr.s_addr = INADDR_ANY;
socka = mksocket(&addr);
sockb = mksocket(&addr);
for (;;) {
FD_ZERO(&read_set);
FD_SET(socka, &read_set);
FD_SET(sockb, &read_set);
if (select(max(socka,sockb)+1, &read_set, NULL, NULL, NULL) < 0)
die("select");
if (FD_ISSET(socka, &read_set))
process(socka, "A");
if (FD_ISSET(sockb, &read_set))
process(sockb, "B");
}
return 0;
}

Page faults on OS X when reading with MMAP

I am trying to benchmark file system I/O on Mac OS X using mmap.
#include <unistd.h>
#include <fcntl.h>
#include <dirent.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <stdio.h>
#include <math.h>
char c;
int main(int argc, char ** argv)
{
if (argc != 2)
{
printf("no files\n");
exit(1);
}
int fd = open(argv[1], O_RDONLY);
fcntl(fd, F_NOCACHE, 1);
int offset=0;
int size=0x100000;
int pagesize = getpagesize();
struct stat stats;
fstat(fd, &stats);
int filesize = stats.st_size;
printf("%d byte pages\n", pagesize);
printf("file %s # %d bytes\n", argv[1], filesize);
while(offset < filesize)
{
if(offset + size > filesize)
{
int pages = ceil((filesize-offset)/(double)pagesize);
size = pages*pagesize;
}
printf("mapping offset %x with size %x\n", offset, size);
void * mem = mmap(0, size, PROT_READ, 0, fd, offset);
if(mem == -1)
return 0;
offset+=size;
int i=0;
for(; i<size; i+=pagesize)
{
c = *((char *)mem+i);
}
munmap(mem, size);
}
return 0;
}
The idea is that I'll map a file or portion of it and then cause a page fault by dereferencing it. I am slowly losing my sanity since this doesn't at all work and I've done similar things on Linux before.
Change this line
void * mem = mmap(0, size, PROT_READ, 0, fd, offset);
to
void * mem = mmap(0, size, PROT_READ, MAP_PRIVATE, fd, offset);
And, don't compare mem with -1. Use this instead:
if(mem == MAP_FAILED) { ... }
It's both more readable and more portable.
General advice: if you're on a different UNIX platform from what you're used to, it's a good idea to open the man page. For mmap on OS X, it can be found here. It says
Conforming applications must specify either MAP_PRIVATE or MAP_SHARED.
So, specifying 0 on the fourth
argument is not OK in OS X. I believe
this is true for BSD in general.

Resources