I have a simple multi-threaded program where the thread performs random reads on a given file (in memory) divided evenly amongst the threads. The thread reads from the file to buffer and sets a value. This is really a program designed to test memory bandwidth. This is the following program,
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <getopt.h>
#include <errno.h>
#include <stdbool.h>
#include <ctype.h>
#include <inttypes.h>
#include <pthread.h>
#include <assert.h>
#include <time.h>
#define NS_IN_SECOND 1000000000
uint64_t nano_time(void) {
struct timespec ts;
if( clock_gettime(CLOCK_REALTIME, &ts) == 0)
return ts.tv_sec * NS_IN_SECOND + ts.tv_nsec;
}
// avx512 test
#include <stdint.h>
void *__memmove_chk_avx512_no_vzeroupper(void *dest, void *src, size_t s);
/**
* To create 4 GB file: This will allocate space on disk
* $ dd < /dev/zero bs=1048576 count=4096 > testfile
*
* 100 GiB
* dd if=/dev/zero of=bigmmaptest bs=1M count=102400
* To clear cache:
* $ sync; echo 1 > /proc/sys/vm/drop_caches
*/
//#define SAMPLE_LATENCY 1
#define BYTES_IN_GB (1024*1024*1024)
// Block sized will be used for read and the same will be used for striding
// when iterating over a file in mmap.
#define DEFAULT_BLOCK_SIZE 4096 //8192
#define NANOSECONDS_IN_SECOND 1000000000
const char DEFAULT_NAME[] = "/mnt/tmp/mmaptest";
#define EXIT_MSG(...) \
do { \
printf(__VA_ARGS__); \
_exit(-1); \
} while (0)
uint64_t read_mmap_test(int fd, int tid, size_t block_size, size_t filesize, char* buf,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t write_mmap_test(int fd, int tid, size_t block_size, size_t filesize, char* buf,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t mmap_test(int fd, int tid, size_t block_size, size_t filesize, char *buf,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t read_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t write_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t syscall_test(int fd, int tid, size_t block_size, size_t filesize,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end);
size_t get_filesize(const char* filename);
void print_help_message(const char *progname);
char* map_buffer(int fd, size_t size);
void *run_tests(void *);
static int silent = 0;
typedef struct {
int tid;
int fd;
char *mapped_buffer;
int read_mmap;
int read_syscall;
int write_mmap;
int write_syscall;
off_t *offsets;
size_t block_size;
size_t chunk_size;
int retval;
uint64_t start_time;
uint64_t end_time;
} threadargs_t;
size_t filesize;
int main(int argc, char **argv) {
char *fname = (char*) DEFAULT_NAME;
char *mapped_buffer = NULL;
int c, fd, i, flags = O_RDWR, numthreads = 1, ret, option_index;
static int randomaccess = 0,
read_mmap = 0, read_syscall = 0,
write_mmap = 0, write_syscall = 0,
mixed_mmap = 0, write_tr = 0;
off_t *offsets = 0;
size_t block_size = DEFAULT_BLOCK_SIZE, numblocks,
new_file_size = 0;
uint64_t min_start_time, max_end_time = 0, retval;
// permissions
uint64_t mode = S_IRWXU | S_IRWXG;
pthread_t *threads;
threadargs_t *threadargs;
static struct option long_options[] =
{
// Options set a flag
{"randomaccess", no_argument, &randomaccess, 1},
{"readmmap", no_argument, &read_mmap, 1},
{"readsyscall", no_argument, &read_syscall, 1},
{"silent", no_argument, &silent, 1},
{"writemmap", no_argument, &write_mmap, 1},
{"writesyscall", no_argument, &write_syscall, 1},
{"mixedmmap", no_argument, &mixed_mmap, 1},
// Options take an argument
{"block", required_argument, 0, 'b'},
{"file", required_argument, 0, 'f'},
{"help", no_argument, 0, 'h'},
{"size", no_argument, 0, 's'},
{"threads", required_argument, 0, 't'},
{"writethreads", no_argument, 0, 'w'},
{0, 0, 0, 0}
};
//read operations
while(1) {
c = getopt_long(argc, argv, "b:f:h:s:t:w:",
long_options, &option_index);
// is end of the option
if (c == -1)
break;
switch(c)
{
case 0:
break;
case 'b':
block_size = atoi(optarg);
break;
case 'f':
fname = optarg;
break;
case 'h':
print_help_message(argv[0]);
_exit(0);
case 's':
new_file_size = (size_t)(atoi(optarg)) * BYTES_IN_GB;
break;
case 't':
numthreads = (int) (atoi(optarg));
break;
case 'w':
write_tr = atoi(optarg);
break;
default:
break;
}
}
if(!silent){
printf("PID: %d\n", getpid());
printf("Using file %s \n", fname);
}
if ((filesize = get_filesize(fname)) == -1) {
if (read_mmap || read_syscall) {
printf("Cannot obtain file size for %s: %s"
"File must exist prior to running read tests.\n",
fname, strerror(errno));
_exit(-1);
}
else
filesize = new_file_size;
}
fd = open((const char*)fname, flags, mode);
if(fd <0) {
printf("Clould not open/create file %s: %s\n",
fname, strerror(errno));
_exit(-1);
}
if(block_size < 0 || block_size > filesize){
printf("Invalid block size: %zu for file of size "
"%zu. Block size must be greater than 0 and no"
"greater than the file size.\n",
block_size, filesize);
_exit(-1);
}
/*
* Generate random block number for random file access.
* Sequential for sequential access
*/
numblocks = filesize/block_size;
if(filesize % block_size > 0)
numblocks++;
offsets = (off_t *) malloc(numblocks * sizeof(off_t));
if(offsets == 0){
printf("Failed to allocate memory: %s\n", strerror(errno));
_exit(-1);
}
for (uint64_t i = 0; i < numblocks; i++)
if(randomaccess)
offsets[i] = ((int)random() % numblocks) * block_size;
else
offsets[i] = i*block_size;
if (numblocks % numthreads != 0)
EXIT_MSG("We have %" PRIu64 " blocks and %d threads. "
"Threads must evenly divide blocks. "
"Please fix the args.\n",
(uint_least64_t)numblocks, numthreads);
if( read_mmap || write_mmap || mixed_mmap)
assert((mapped_buffer = map_buffer(fd, filesize)) != NULL);
threads = (pthread_t*)malloc(numthreads * sizeof(pthread_t));
threadargs =
(threadargs_t*)malloc(numthreads * sizeof(threadargs_t));
if (threads == NULL || threadargs == NULL)
EXIT_MSG("Could not allocate thread array for %d threads.\n", numthreads);
for (i = 0; i < numthreads; i++) {
if(mixed_mmap){
if (i < write_tr) {
write_mmap = 1;
} else {
read_mmap = 1;
}
}
threadargs[i].fd = fd;
threadargs[i].tid = i;
threadargs[i].block_size = block_size;
threadargs[i].chunk_size = filesize/numthreads;
threadargs[i].mapped_buffer = mapped_buffer;
threadargs[i].offsets = &offsets[numblocks/numthreads * i];
threadargs[i].read_mmap = read_mmap;
threadargs[i].read_syscall = read_syscall;
threadargs[i].write_mmap = write_mmap;
threadargs[i].write_syscall = write_syscall;
int ret = pthread_create(&threads[i], NULL, run_tests, &threadargs[i]);
if (ret!=0)
EXIT_MSG("pthread_create for %dth thread failed: %s\n",
i, strerror(errno));
}
for (i = 0; i< numthreads; i++){
ret = pthread_join(threads[i], NULL);
if (ret !=0)
EXIT_MSG("Thread %d failed in join: %s\n",
i, strerror(errno));
}
// for mixed mode determine read and write aggregate b/w.
if(mixed_mmap) {
// Write b/w
min_start_time = threadargs[0].start_time;
max_end_time = 0;
// Since tid 0 to write_tr-1 did writes, find it's min and max.
for(i=0; i < write_tr; i++){
min_start_time = (threadargs[i].start_time < min_start_time)?
threadargs[i].start_time:min_start_time;
max_end_time = (threadargs[i].end_time > max_end_time)?
threadargs[i].end_time:max_end_time;
}
printf("Write: %.2f\n",
(double)write_tr*(filesize/numthreads)/(double)(max_end_time-min_start_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
// Read b/w
min_start_time = threadargs[write_tr].start_time;
max_end_time = 0;
for(i=write_tr; i < numthreads; i++){
min_start_time = (threadargs[i].start_time < min_start_time)?
threadargs[i].start_time:min_start_time;
max_end_time = (threadargs[i].end_time > max_end_time)?
threadargs[i].end_time:max_end_time;
}
printf("Read: %.2f\n",
(double)(numthreads-write_tr)*(filesize/numthreads)/(double)(max_end_time-min_start_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
}
/**
* For total run time. Find the smallest start time
* and largest end time across all threads.
*/
min_start_time = threadargs[0].start_time;
max_end_time = 0;
for (i=0; i< numthreads; i++){
min_start_time = (threadargs[i].start_time < min_start_time)?
threadargs[i].start_time:min_start_time;
max_end_time = (threadargs[i].end_time > max_end_time)?
threadargs[i].end_time:max_end_time;
}
printf("%.2f\n",
(double)filesize/(double)(max_end_time-min_start_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
munmap(mapped_buffer, filesize);
close(fd);
}
void * run_tests(void *args) {
uint64_t retval;
threadargs_t t = *(threadargs_t*)args;
if(t.read_mmap) {
if(!silent)
printf("Running read mmap test:\n");
retval = read_mmap_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.mapped_buffer, t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
else if(t.read_syscall) {
if(!silent)
printf("Running read syscall test:\n");
retval = read_syscall_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
else if(t.write_mmap) {
if(!silent)
printf("Running write mmap test:\n");
retval = write_mmap_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.mapped_buffer, t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
else if(t.write_syscall) {
if(!silent)
printf("Running write syscall test:\n");
retval = write_syscall_test(t.fd, t.tid, t.block_size, t.chunk_size,
t.offsets,
&((threadargs_t*)args)->start_time,
&((threadargs_t*)args)->end_time);
}
return (void*) 0;
}
#define READ 1
#define WRITE 2
/**
********* SYSCALL section
*/
uint64_t read_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end) {
return syscall_test(fd, tid, block_size, filesize, READ, offsets,
begin, end);
}
uint64_t write_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
off_t *offsets, uint64_t *begin, uint64_t *end) {
return syscall_test(fd, tid, block_size, filesize, WRITE, offsets,
begin, end);
}
uint64_t syscall_test(int fd, int tid, size_t block_size, size_t filesize,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end) {
bool done = false;
char * buffer = NULL;
int i = 0;
size_t total_bytes_transferred = 0;
uint64_t begin_time, end_time, ret_token = 0;
buffer = (char*)malloc(block_size);
if(buffer == NULL) {
printf("Failed to allocate memory: %s\n", strerror(errno));
return -1;
}
memset((void*)buffer, 0, block_size);
begin_time= nano_time();
while(!done) {
size_t bytes_transferred = 0;
if(optype == READ)
bytes_transferred = pread(fd, buffer, block_size, offsets[i++]);
else if (optype == WRITE)
bytes_transferred = pwrite(fd, buffer, block_size, offsets[i++]);
if (bytes_transferred == 0)
done = true;
else if(bytes_transferred == -1){
printf("Failed to IO: %s\n", strerror(errno));
return -1;
}
else {
total_bytes_transferred += bytes_transferred;
if (optype == WRITE && total_bytes_transferred == filesize)
done = true;
// Do random operation
ret_token += buffer[0];
}
if (i*block_size >= filesize)
done = true;
}
end_time = nano_time();
if(!silent){
printf("%s: %" PRIu64 " bytes transferred in %" PRIu64 ""
" ns.\n", (optype == READ)?"read-syscall":"write-syscall",
(uint_least64_t)total_bytes_transferred, (end_time-begin_time));
// Throughput in GB/s
printf("(tid %d) %.2f\n", tid,
(double)filesize/(double)(end_time-begin_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
}
*begin = begin_time;
*end = end_time;
return ret_token;
}
/**
* MMAP tests
*/
uint64_t read_mmap_test(int fd, int tid, size_t block_size, size_t filesize,
char *buf, off_t *offsets, uint64_t *begin, uint64_t *end) {
return mmap_test(fd, tid, block_size, filesize, buf, READ, offsets, begin, end);
}
uint64_t write_mmap_test(int fd, int tid, size_t block_size, size_t filesize,
char *buf, off_t *offsets, uint64_t *begin, uint64_t *end){
return mmap_test(fd, tid, block_size, filesize, buf, WRITE, offsets, begin, end);
}
// Add memory addr
#if SAMPLE_LATENCY
#define BEGIN_LAT_SAMPLE \
if (num_samples < MAX_LAT_SAMPLES && i%LAT_SAMPL_INTERVAL == 0) \
lat_begin_time = nano_time();
#define END_LAT_SAMPLE \
if (num_samples < MAX_LAT_SAMPLES && i%LAT_SAMPL_INTERVAL == 0) { \
lat_end_time = nano_time(); \
latency_samples[i/LAT_SAMPL_INTERVAL % MAX_LAT_SAMPLES] = \
lat_end_time - lat_begin_time; \
num_samples++; \
}
#define MAX_LAT_SAMPLES 50
//#define LAT_SAMPL_INTERVAL (1000*1048576)
#define LAT_SAMPL_INTERVAL block_size
#else
#define BEGIN_LAT_SAMPLE ;
#define END_LAT_SAMPLE
#endif
uint64_t mmap_test(int fd, int tid, size_t block_size, size_t filesize, char *mapped_buffer,
char optype, off_t *offsets, uint64_t *begin, uint64_t *end) {
bool done = false;
char *buffer = NULL;
uint64_t i, j, numblocks, ret;
uint64_t begin_time, end_time, ret_token = 0;
#if SAMPLE_LATENCY
uint64_t lat_begin_time, lat_end_time;
size_t latency_samples[MAX_LAT_SAMPLES];
int num_samples = 0;
memset((void*)latency_samples, 0, sizeof(latency_samples));
#endif
buffer = (char*)malloc(block_size);
if(buffer == NULL) {
printf("Failed to allocate memory: %s\n", strerror(errno));
return -1;
}
memset((void*)buffer, 1, block_size);
begin_time = nano_time();
for(i=0; i<filesize; i+=block_size){
off_t offset = offsets[i/block_size];
BEGIN_LAT_SAMPLE;
if(optype == READ) {
//__memmove_chk_avx512_no_vzeroupper(buffer, &mapped_buffer[offset], block_size);
memcpy(buffer, &mapped_buffer[offset], block_size);
ret_token += buffer[0];
}
else if (optype == WRITE) {
//__memmove_chk_avx512_no_vzeroupper(&mapped_buffer[offset], buffer, block_size);
memcpy(&mapped_buffer[offset], buffer, block_size);
ret_token += mapped_buffer[i];
}
END_LAT_SAMPLE;
}
end_time = nano_time();
if(!silent) {
printf("%s: %" PRIu64 " bytes read in %" PRIu64 " ns.\n",
(optype==READ)?"readmap":"writemap",
(uint_least64_t)filesize, (end_time-begin_time));
// print GB/s
printf("(tid %d) %.2f\n", tid,
(double)filesize/(double)(end_time-begin_time)
* NANOSECONDS_IN_SECOND / BYTES_IN_GB);
}
*begin = begin_time;
*end = end_time;
#if SAMPLE_LATENCY
printf("\nSample latency for %ld byte block:\n", block_size);
for (i = 0; i < MAX_LAT_SAMPLES; i++)
printf("\t%ld: %ld\n", i, latency_samples[i]);
#endif
return ret_token;
}
char* map_buffer(int fd, size_t size) {
char *mapped_buffer = NULL;
// Populate
mapped_buffer = (char*)mmap(NULL, size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_POPULATE, fd, 0);
// Shared
// mapped_buffer = (char*)mmap(NULL, size, PROT_READ | PROT_WRITE,
// MAP_SHARED, fd, 0);
// Anon test
// mapped_buffer = (char*)mmap(NULL, size, PROT_READ | PROT_WRITE,
// MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if(mapped_buffer == MAP_FAILED)
EXIT_MSG("Failed to mmap file of size %zu: %s\n",
size, strerror(errno));
// Might also need to gurantee page aligned - posix_memalign()
// int mret = madvise(mapped_buffer, filesize, MADV_HUGEPAGE);
// if(mret!=0) {
// fprintf(stderr, "failed madvise: %s\n", strerror(errno));
// }
return mapped_buffer;
}
size_t get_filesize(const char* filename){
int retval;
struct stat st;
retval = stat(filename, &st);
if(retval)
return -1;
else
return st.st_size;
}
void print_help_message(const char *progname) {
/* take only the last portion of the path */
const char *basename = strrchr(progname, '/');
basename = basename ? basename + 1 : progname;
printf("usage: %s [OPTION]\n", basename);
printf(" -h, --help\n"
" Print this help and exit.\n");
printf(" -b, --block[=BLOCKSIZE]\n"
" Block size used for read system calls.\n"
" For mmap tests, the size of the stride when iterating\n"
" over the file.\n"
" Defaults to %d.\n", DEFAULT_BLOCK_SIZE);
printf(" -f, --file[=FILENAME]\n"
" Perform all tests on this file (defaults to %s).\n",
DEFAULT_NAME);
printf(" --readsyscall\n"
" Perform a read test using system calls.\n");
printf(" --readmmap\n"
" Perform a read test using mmap.\n");
printf(" --writesyscall\n"
" Perform a write test using system calls.\n");
printf(" --writemmap\n"
" Perform a write test using mmap.\n");
printf(" --randomaccess\n"
" Perform random access.\n");
printf(" --threads\n"
" Number of threads to use. Defaults to one.\n");
printf(" --mixedmmap\n"
" Perfom read and write concurrently at different offsets\n");
printf(" -w, -writethreads[=0]\n"
" Number of threads that should perform write\n");
}
To compile:
$ gcc testm.c -o testm -lpthread -static -O2 -fno-builtin-memcpy
Commands to run the program:
$ dd if=/dev/zero of=bigmmaptest bs=1M count=25600 # 25 GiB file
$ ./testm -b 1024 -f bigmmaptest --threads 16 --randomaccess --readmmap
I am on a 32 core Xeon 5218 2nd Gen. L1d KiB /L2 MiB /L3 MiB -- 512 / 16 / 22
When the memcpy size is 1 KiB I get 21.7 GB/s but when the size is 256B I get 26.68 GB/s and 34.8 GB/s when the size is 4 KiB. Why is there a drop in the middle?
I observe that 2 KiB also performs poorly when compared to 256B and 4 KiB.
What's more interesting is, when I disable the L2 hardware prefetcher and without any other changes my bandwidth automatically increases for 1 KiB and 2 KiB. Without prefetch 2 KiB memcpy gives 34.8 GB/s. All of these are aggregate bandwidth.
With perf, I did measure L2 load-store misses but they turned out to not change drastically. This effect is also not seen for 8 threads and below.
I am on linux 5.0.4. I am using the glibC memcpy (gcc 7.5.0) and even with -O2 I observe the above quirk. Where 1 KiB access size gives 18.76 GiB/s with L2 prefetch and without I get 30.32 GiB/s. For comparison, 256 B access size provides 24.7 GiB/s with prefetch and 24.8 GiB/s without. Clearly, the drop in performance is because of the L2 cache pollution caused by the prefetcher, as this is not observed with smaller thread counts. I was considering if SMT could be the reason for increased pollution but I observe the effect distinctly at 16 threads on 16 physical cores.
Skimming through glibc memcpy code, I can see that any access below the size of 4 KiB uses AVX 256 instructions, so there is nothing changing there.
The smaller 256B size not seeing a drop from the L2 streamer might be due to the sequence of cache misses being too short to activate the streamer and waste bandwidth (and slots in the LFBs and L2 <-> L3 superqueue) on requests that won't be useful.
For aligned 4k, there are no bytes within the same page that you're not fetching, so the L2 prefetcher is positively useful, or at least not harmful. (Demand loads come in pretty quickly for later lines when running memcpy so I'm guessing speeds were about the same with/without HW prefetch enabled, unless HW prefetch helps getting started on a new 4k chunk while still waiting for the end of the previous.)
The L2 only sees physical addresses, and AFAIK it doesn't try to prefetch across a 4k boundary. (Even if its within the same 2M hugepage, because it doesn't know that either.) The "next-page prefetcher" Intel mentions being new in Ivy Bridge is AFAIK just a TLB prefetch, not data.
So with aligned 4k memcpy, HW prefetch stops automatically at the end of the data you're actually going to read, not wasting any bandwidth. Since mmap gives you page-aligned memory, these 4k memcopies are from a single source page.
(The destination is irrelevant as it probably stays hot in L1d cache, with maybe an occasional eviction to L2, and the reload from it after memcpy can come from store-forwarding, not even having to wait for memcpy's store to commit to L1d.)
Prediction: If your smaller memcpy source starts part way into a 4k page, but still end at the end of a 4k page, you'd probably see similar behaviour to prefetch disabled. e.g. generate a random page number, and start at 3072 bytes into it, doing a 1 KiB copy. So all your 1 KiB copies come from the ends of pages, never middles.
(You'd still have more dTLB misses per byte memcpyed, because each TLB entry is only covering 1 K of the data you ever actually read. You did you use MAP_POPULATE so you shouldn't be seeing page faults in the timed region, assuming you have enough RAM.)
L1d KiB /L2 MiB /L3 MiB -- 512 / 16 / 22
Those are aggregate totals, but L1d and L2 are private per-core! You have 32kiB L1d and 1MiB L2 per core, because this is Cascade Lake, same layout as Skylake-X.
And BTW, I'd consider using a fast PRNG like xorshift+ or xorshift* inside the timing loop; that's easily random enough to defeat prefetching; even a simple LFSR or even LCG with a power-of-2 modulo would do that (and be very cheap, just an imul and add). It avoids having to read offsets from another array, if you really want to isolate just the memcpy memory accesses. Probably doesn't make a difference though. One advantage of a very simple PRNG with a period equal to the space you're trying to cover (like an LCG) is that you won't generate the same address twice, giving you a random permutation of the blocks. But with a big enough block of memory, random cache hits even from L3 are unlikely even without that hard-to-achieve property.
Your current array of offsets is fine. (I didn't look at the code super closely, so I'm just assuming there aren't bugs.)
I try to implement a web interface with a W5100 Ethernet Controller and an XMega, but my browser prints out this weird result:
Please take a look at my code:
SPIM_Config_t Config_SPIM = {
.Device = &SPIC,
.Mode = SPI_MODE_0,
.Prescaler = SPI_PRESCALER_64,
};
W5100_Config_t Config_Ethernet = {
.Submask = {255, 255, 0, 0},
.IP = {169, 254, 133, 121},
.Gateway = {169, 154, 133, 129},
.MAC = {0x00, 0x00, 0x00, 0x00, 0x00, 0xAA}
};
uint8_t Rx_Buffer[2048];
uint8_t Tx_Buffer[2048];
const char HTTP[] = "HTTP/1.0 200 OK\r\nContent-Type: text/html\r\nPragma: no-cache\r\n\r\n"
"<html>\r\n"
"<body>\r\n"
"<title>Title</title>\r\n"
"<p>Hello world</p>\r\n"
"</body>\r\n"
"</html>\r\n";
int main(void)
{
W5100_Init(&Config_SPIM, &Config_Ethernet);
while(1)
{
W5100_Status_t Status;
W5100_GetState(0, &Status);
switch(Status)
{
case W5100_SOCK_CLOSED:
{
if(W5100_Open(0, W5100_PROT_TCP, 80, W5100_MEM_2K, W5100_MEM_2K, 65535) == W5100_NO_ERROR)
{
W5100_Listen(0, ETHERNET_TIMEOUT);
}
break;
}
case W5100_SOCK_ESTABLISHED:
{
uint16_t Rx_Bytes;
if(W5100_GetBytes(0, &Rx_Bytes) == W5100_NO_ERROR)
{
if(Rx_Bytes)
{
W5100_Receive(0, Rx_Buffer, Rx_Bytes);
strcpy((char*)Tx_Buffer, HTTP);
W5100_Send(0, Tx_Buffer, strlen((char*)HTTP), ETHERNET_TIMEOUT);
}
else
{
}
}
W5100_Disconnect(0, ETHERNET_TIMEOUT);
break;
}
case W5100_SOCK_FIN_WAIT:
case W5100_SOCK_CLOSING:
case W5100_SOCK_TIME_WAIT:
case W5100_SOCK_CLOSE_WAIT:
case W5100_SOCK_LAST_ACK:
{
W5100_Close(0, ETHERNET_TIMEOUT);
break;
}
}
}
}
I think the error is somewhere in my W5100_Send function and it seems that the Controller is sending the content of different memory locations, but I canĀ“t figure out the error. The code based on the datasheet of the Ethernet Controller:
W5100_ErrorCode_t W5100_Send(uint8_t Socket, uint8_t* Buffer, uint16_t Length, uint32_t Timeout)
{
uint8_t Temp[2];
uint8_t Mask;
uint16_t SocketBase;
uint16_t Offset;
uint16_t Free;
uint16_t SocketMemory;
uint32_t Timeout_Temp = Timeout;
if(!_W5100_IsInitialized)
{
return W5100_NOT_INITIALIZED;
}
else if((Socket > 0x04) || (Buffer == NULL) || (Length == 0x00))
{
return W5100_INVALID_PARAM;
}
// Get the memory mask for address calculation
W5100_ReadRegister(W5100_REGISTER_TMSR, &Mask);
Mask &= (0x03 << (Socket << 0x01));
// Check for invalid memory by comparing the memory mask for the given socket and the socket index
if(((Socket > 0) && (Mask == 3)) || ((Socket > 1) && (Mask == 2)))
{
return W5100_INVALID_PARAM;
}
SocketBase = W5100_SOCKET_ADDR(Socket);
SocketMemory = W5100_SOCKET_MEM_OFFSET << Mask;
// Wait while the buffer is full
do
{
// Get the free bytes
W5100_ReadRegister(SocketBase + W5100_OFFSET_TX_FSR0, &Temp[0]);
W5100_ReadRegister(SocketBase + W5100_OFFSET_TX_FSR1, &Temp[1]);
Free = ((uint16_t)(Temp[0] << 0x08)) | Temp[1];
if(Timeout_Temp-- == 0x00)
{
W5100_Disconnect(Socket, Timeout);
return W5100_TIMEOUT;
}
_delay_ms(1);
}while(Free < Length);
// Get the write pointer address
W5100_ReadRegister(SocketBase + W5100_OFFSET_TX_WR0, &Temp[0]);
W5100_ReadRegister(SocketBase + W5100_OFFSET_TX_WR1, &Temp[1]);
Offset = (((uint16_t)(Temp[0] << 0x08)) | Temp[1]) & W5100_TX_MEM_MASK;
// Check for an overflow
if(Offset + Length > SocketMemory)
{
uint16_t Upper;
uint16_t Left;
Upper = SocketMemory - Offset;
Left = Length - Upper;
W5100_WriteMemory(W5100_TX_BUFFER_BASE + (SocketMemory * Socket) + Offset, Buffer, Upper);
W5100_WriteMemory(W5100_TX_BUFFER_BASE + (SocketMemory * Socket), Buffer, Left);
}
else
{
W5100_WriteMemory(W5100_TX_BUFFER_BASE + (SocketMemory * Socket) + Offset, Buffer, Length);
}
W5100_WriteRegister(SocketBase + W5100_OFFSET_TX_WR0, Offset >> 0x08);
W5100_WriteRegister(SocketBase + W5100_OFFSET_TX_WR1, Offset & 0xFF);
return W5100_ExecuteCommand(Socket, W5100_CMD_SEND, Timeout);
}
You should fully rewrite your W5100_Send, because it is full of issues.
For example, calculation of Mask value has no sense.
The cycle which is waiting for Free value always delays at least 1 ms, even when good value obtained from the beginning. Also, when timed out, it breaks, even if received Free value is good.
Offset value is damaged by & operation:
Offset = (((uint16_t)(Temp[0] << 0x08)) | Temp[1]) & W5100_TX_MEM_MASK;
This value is never increased by the written data size, and the damaged value is written back to W5100_OFFSET_TX_WR1:W5100_OFFSET_TX_WR0
The wrapping data writing has an error:
W5100_WriteMemory(W5100_TX_BUFFER_BASE + (SocketMemory * Socket) + Offset, Buffer, Upper);
W5100_WriteMemory(W5100_TX_BUFFER_BASE + (SocketMemory * Socket), Buffer, Left);
You're copying to both the parts from the start of Buffer. In the second line it should be &Buffer[Upper]
Etc etc...
First you need to determine size of sockets. I encourage you to set up the socket sizes from the beginning, thus avoiding offset and size calculation on the runtime.
But if you want to determine the socket size dynamically, then you can do it as follows:
uint16_t SocketBufAddr = W5100_TX_BUFFER_BASE; // Start of the socket memory block
SocketMemory = 0; // Size of the socket memory block
W5100_ReadRegister(W5100_REGISTER_TMSR, &Mask);
for (uint8_t i = 0 ; i <= Socket ; i++) {
SocketBufAddr += SocketMemory; // Increase the offset by the previous socket size
SocketMemory = 1024 << ((Mask >> (i * 2)) & 3);
}
now, the writing process should be something like this:
// Get the write pointer address
W5100_ReadRegister(SocketBase + W5100_OFFSET_TX_WR0, &Temp[0]);
W5100_ReadRegister(SocketBase + W5100_OFFSET_TX_WR1, &Temp[1]);
uint16_t WrPointer = (((uint16_t)(Temp[0] << 0x08)) | Temp[1]); // no & operation! It is the 16-bit pointer!!!
Offset = WrPointer & (SocketMemory - 1); // Offset inside the socket memory block. SocketMemory is always = 2^n
// Check for an overflow
if(Offset + Length > SocketMemory)
{
uint16_t Upper;
uint16_t Left;
Upper = SocketMemory - Offset ;
Left = Length - Upper;
W5100_WriteMemory(SocketBufAddr + Offset, Buffer, Upper);
W5100_WriteMemory(SocketBufAddr, &Buffer[Upper], Left);
}
else
{
W5100_WriteMemory(SocketBufAddr + Offset, Buffer, Length);
}
WrPointer += Length; // Increase full 16-bit pointer value
// Write the new pointer back
W5100_WriteRegister(SocketBase + W5100_OFFSET_TX_WR0, WrPointer >> 0x08);
W5100_WriteRegister(SocketBase + W5100_OFFSET_TX_WR1, WrPointer & 0xFF);
return W5100_ExecuteCommand(Socket, W5100_CMD_SEND, Timeout);
I have a list of N 64-bit integers whose bits represent small sets. Each integer has at most k bits set to 1. Given a bit mask, I would like to find the first element in the list that matches the mask, i.e. element & mask == element.
Example:
If my list is:
index abcdef
0 001100
1 001010
2 001000
3 000100
4 000010
5 000001
6 010000
7 100000
8 000000
and my mask is 111000, the first element matching the mask is at index 2.
Method 1:
Linear search through the entire list. This takes O(N) time and O(1) space.
Method 2:
Precompute a tree of all possible masks, and at each node keep the answer for that mask. This takes O(1) time for the query, but takes O(2^64) space.
Question:
How can I find the first element matching the mask faster than O(N), while still using a reasonable amount of space? I can afford to spend polynomial time in precomputation, because there will be a lot of queries. The key is that k is small. In my application, k <= 5 and N is in the thousands. The mask has many 1s; you can assume that it is drawn uniformly from the space of 64-bit integers.
Update:
Here is an example data set and a simple benchmark program that runs on Linux: http://up.thirld.com/binmask.tar.gz. For large.in, N=3779 and k=3. The first line is N, followed by N unsigned 64-bit ints representing the elements. Compile with make. Run with ./benchmark.e >large.out to create the true output, which you can then diff against. (Masks are generated randomly, but the random seed is fixed.) Then replace the find_first() function with your implementation.
The simple linear search is much faster than I expected. This is because k is small, and so for a random mask, a match is found very quickly on average.
A suffix tree (on bits) will do the trick, with the original priority at the leaf nodes:
000000 -> 8
1 -> 5
10 -> 4
100 -> 3
1000 -> 2
10 -> 1
100 -> 0
10000 -> 6
100000 -> 7
where if the bit is set in the mask, you search both arms, and if not, you search only the 0 arm; your answer is the minimum number you encounter at a leaf node.
You can improve this (marginally) by traversing the bits not in order but by maximum discriminability; in your example, note that 3 elements have bit 2 set, so you would create
2:0 0:0 1:0 3:0 4:0 5:0 -> 8
5:1 -> 5
4:1 5:0 -> 4
3:1 4:0 5:0 -> 3
1:1 3:0 4:0 5:0 -> 6
0:1 1:0 3:0 4:0 5:0 -> 7
2:1 0:0 1:0 3:0 4:0 5:0 -> 2
4:1 5:0 -> 1
3:1 4:0 5:0 -> 0
In your example mask this doesn't help (since you have to traverse both the bit2==0 and bit2==1 sides since your mask is set in bit 2), but on average it will improve the results (but at a cost of setup and more complex data structure). If some bits are much more likely to be set than others, this could be a huge win. If they're pretty close to random within the element list, then this doesn't help at all.
If you're stuck with essentially random bits set, you should get about (1-5/64)^32 benefit from the suffix tree approach on average (13x speedup), which might be better than the difference in efficiency due to using more complex operations (but don't count on it--bit masks are fast). If you have a nonrandom distribution of bits in your list, then you could do almost arbitrarily well.
This is the bitwise Kd-tree. It typically needs less than 64 visits per lookup operation. Currently, the selection of the bit (dimension) to pivot on is random.
#include <limits.h>
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
typedef unsigned long long Thing;
typedef unsigned long Number;
unsigned thing_ffs(Thing mask);
Thing rand_mask(unsigned bitcnt);
#define WANT_RANDOM 31
#define WANT_BITS 3
#define BITSPERTHING (CHAR_BIT*sizeof(Thing))
#define NONUMBER ((Number)-1)
struct node {
Thing value;
Number num;
Number nul;
Number one;
char pivot;
} *nodes = NULL;
unsigned nodecount=0;
unsigned itercount=0;
struct node * nodes_read( unsigned *sizp, char *filename);
Number *find_ptr_to_insert(Number *ptr, Thing value, Thing mask);
unsigned grab_matches(Number *result, Number num, Thing mask);
void initialise_stuff(void);
int main (int argc, char **argv)
{
Thing mask;
Number num;
unsigned idx;
srand (time(NULL));
nodes = nodes_read( &nodecount, argv[1]);
fprintf( stdout, "Nodecount=%u\n", nodecount );
initialise_stuff();
#if WANT_RANDOM
mask = nodes[nodecount/2].value | nodes[nodecount/3].value ;
#else
mask = 0x38;
#endif
fprintf( stdout, "\n#### Search mask=%llx\n", (unsigned long long) mask );
itercount = 0;
num = NONUMBER;
idx = grab_matches(&num,0, mask);
fprintf( stdout, "Itercount=%u\n", itercount );
fprintf(stdout, "KdTree search %16llx\n", (unsigned long long) mask );
fprintf(stdout, "Count=%u Result:\n", idx);
idx = num;
if (idx >= nodecount) idx = nodecount-1;
fprintf( stdout, "num=%4u Value=%16llx\n"
,(unsigned) nodes[idx].num
,(unsigned long long) nodes[idx].value
);
fprintf( stdout, "\nLinear search %16llx\n", (unsigned long long) mask );
for (idx = 0; idx < nodecount; idx++) {
if ((nodes[idx].value & mask) == nodes[idx].value) break;
}
fprintf(stdout, "Cnt=%u\n", idx);
if (idx >= nodecount) idx = nodecount-1;
fprintf(stdout, "Num=%4u Value=%16llx\n"
, (unsigned) nodes[idx].num
, (unsigned long long) nodes[idx].value );
return 0;
}
void initialise_stuff(void)
{
unsigned num;
Number root, *ptr;
root = 0;
for (num=0; num < nodecount; num++) {
nodes[num].num = num;
nodes[num].one = NONUMBER;
nodes[num].nul = NONUMBER;
nodes[num].pivot = -1;
}
nodes[num-1].value = 0; /* last node is guaranteed to match anything */
root = 0;
for (num=1; num < nodecount; num++) {
ptr = find_ptr_to_insert (&root, nodes[num].value, 0ull );
if (*ptr == NONUMBER) *ptr = num;
else fprintf(stderr, "Found %u for %u\n"
, (unsigned)*ptr, (unsigned) num );
}
}
Thing rand_mask(unsigned bitcnt)
{struct node * nodes_read( unsigned *sizp, char *filename)
{
struct node *ptr;
unsigned size,used;
FILE *fp;
if (!filename) {
size = (WANT_RANDOM+0) ? WANT_RANDOM : 9;
ptr = malloc (size * sizeof *ptr);
#if (!WANT_RANDOM)
ptr[0].value = 0x0c;
ptr[1].value = 0x0a;
ptr[2].value = 0x08;
ptr[3].value = 0x04;
ptr[4].value = 0x02;
ptr[5].value = 0x01;
ptr[6].value = 0x10;
ptr[7].value = 0x20;
ptr[8].value = 0x00;
#else
for (used=0; used < size; used++) {
ptr[used].value = rand_mask(WANT_BITS);
}
#endif /* WANT_RANDOM */
*sizp = size;
return ptr;
}
fp = fopen( filename, "r" );
if (!fp) return NULL;
fscanf(fp,"%u\n", &size );
fprintf(stderr, "Size=%u\n", size);
ptr = malloc (size * sizeof *ptr);
for (used = 0; used < size; used++) {
fscanf(fp,"%llu\n", &ptr[used].value );
}
fclose( fp );
*sizp = used;
return ptr;
}
Thing value = 0;
unsigned bit, cnt;
for (cnt=0; cnt < bitcnt; cnt++) {
bit = 54321*rand();
bit %= BITSPERTHING;
value |= 1ull << bit;
}
return value;
}
Number *find_ptr_to_insert(Number *ptr, Thing value, Thing done)
{
Number num=NONUMBER;
while ( *ptr != NONUMBER) {
Thing wrong;
num = *ptr;
wrong = (nodes[num].value ^ value) & ~done;
if (nodes[num].pivot < 0) { /* This node is terminal */
/* choose one of the wrong bits for a pivot .
** For this bit (nodevalue==1 && searchmask==0 )
*/
if (!wrong) wrong = ~done ;
nodes[num].pivot = thing_ffs( wrong );
}
ptr = (wrong & 1ull << nodes[num].pivot) ? &nodes[num].nul : &nodes[num].one;
/* Once this bit has been tested, it can be masked off. */
done |= 1ull << nodes[num].pivot ;
}
return ptr;
}
unsigned grab_matches(Number *result, Number num, Thing mask)
{
Thing wrong;
unsigned count;
for (count=0; num < *result; ) {
itercount++;
wrong = nodes[num].value & ~mask;
if (!wrong) { /* we have a match */
if (num < *result) { *result = num; count++; }
/* This is cheap pruning: the break will omit both subtrees from the results.
** But because we already have a result, and the subtrees have higher numbers
** than our current num, we can ignore them. */
break;
}
if (nodes[num].pivot < 0) { /* This node is terminal */
break;
}
if (mask & 1ull << nodes[num].pivot) {
/* avoid recursion if there is only one non-empty subtree */
if (nodes[num].nul >= *result) { num = nodes[num].one; continue; }
if (nodes[num].one >= *result) { num = nodes[num].nul; continue; }
count += grab_matches(result, nodes[num].nul, mask);
count += grab_matches(result, nodes[num].one, mask);
break;
}
mask |= 1ull << nodes[num].pivot;
num = (wrong & 1ull << nodes[num].pivot) ? nodes[num].nul : nodes[num].one;
}
return count;
}
unsigned thing_ffs(Thing mask)
{
unsigned bit;
#if 1
if (!mask) return (unsigned)-1;
for ( bit=random() % BITSPERTHING; 1 ; bit += 5, bit %= BITSPERTHING) {
if (mask & 1ull << bit ) return bit;
}
#elif 0
for (bit =0; bit < BITSPERTHING; bit++ ) {
if (mask & 1ull <<bit) return bit;
}
#else
mask &= (mask-1); // Kernighan-trick
for (bit =0; bit < BITSPERTHING; bit++ ) {
mask >>=1;
if (!mask) return bit;
}
#endif
return 0xffffffff;
}
struct node * nodes_read( unsigned *sizp, char *filename)
{
struct node *ptr;
unsigned size,used;
FILE *fp;
if (!filename) {
size = (WANT_RANDOM+0) ? WANT_RANDOM : 9;
ptr = malloc (size * sizeof *ptr);
#if (!WANT_RANDOM)
ptr[0].value = 0x0c;
ptr[1].value = 0x0a;
ptr[2].value = 0x08;
ptr[3].value = 0x04;
ptr[4].value = 0x02;
ptr[5].value = 0x01;
ptr[6].value = 0x10;
ptr[7].value = 0x20;
ptr[8].value = 0x00;
#else
for (used=0; used < size; used++) {
ptr[used].value = rand_mask(WANT_BITS);
}
#endif /* WANT_RANDOM */
*sizp = size;
return ptr;
}
fp = fopen( filename, "r" );
if (!fp) return NULL;
fscanf(fp,"%u\n", &size );
fprintf(stderr, "Size=%u\n", size);
ptr = malloc (size * sizeof *ptr);
for (used = 0; used < size; used++) {
fscanf(fp,"%llu\n", &ptr[used].value );
}
fclose( fp );
*sizp = used;
return ptr;
}
UPDATE:
I experimented a bit with the pivot-selection, favouring bits with the highest discriminatory value ("information content"). This involves:
making a histogram of the usage of bits (can be done while initialising)
while building the tree: choosing the one with frequency closest to 1/2 in the remaining subtrees.
The result: the random pivot selection performed better.
Construct a a binary tree as follows:
Every level corresponds to a bit
It corresponding bit is on go right, otherwise left
This way insert every number in the database.
Now, for searching: if the corresponding bit in the mask is 1, traverse both children. If it is 0, traverse only the left node. Essentially keep traversing the tree until you hit the leaf node (BTW, 0 is a hit for every mask!).
This tree will have O(N) space requirements.
Eg of tree for 1 (001), 2(010) and 5 (101)
root
/ \
0 1
/ \ |
0 1 0
| | |
1 0 1
(1) (2) (5)
With precomputed bitmasks. Formally is is still O(N), since the and-mask operations are O(N). The final pass is also O(N), because it needs to find the lowest bit set, but that could be sped up, too.
#include <limits.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/* For demonstration purposes.
** In reality, this should be an unsigned long long */
typedef unsigned char Thing;
#define BITSPERTHING (CHAR_BIT*sizeof (Thing))
#define COUNTOF(a) (sizeof a / sizeof a[0])
Thing data[] =
/****** index abcdef */
{ 0x0c /* 0 001100 */
, 0x0a /* 1 001010 */
, 0x08 /* 2 001000 */
, 0x04 /* 3 000100 */
, 0x02 /* 4 000010 */
, 0x01 /* 5 000001 */
, 0x10 /* 6 010000 */
, 0x20 /* 7 100000 */
, 0x00 /* 8 000000 */
};
/* Note: this is for demonstration purposes.
** Normally, one should choose a machine wide unsigned int
** for bitmask arrays.
*/
struct bitmap {
char data[ 1+COUNTOF (data)/ CHAR_BIT ];
} nulmaps [ BITSPERTHING ];
#define BITSET(a,i) (a)[(i) / CHAR_BIT ] |= (1u << ((i)%CHAR_BIT) )
#define BITTEST(a,i) ((a)[(i) / CHAR_BIT ] & (1u << ((i)%CHAR_BIT) ))
void init_tabs(void);
void map_empty(struct bitmap *dst);
void map_full(struct bitmap *dst);
void map_and2(struct bitmap *dst, struct bitmap *src);
int main (void)
{
Thing mask;
struct bitmap result;
unsigned ibit;
mask = 0x38;
init_tabs();
map_full(&result);
for (ibit = 0; ibit < BITSPERTHING; ibit++) {
/* bit in mask is 1, so bit at this position is in fact a don't care */
if (mask & (1u <<ibit)) continue;
/* bit in mask is 0, so we can only select items with a 0 at this bitpos */
map_and2(&result, &nulmaps[ibit] );
}
/* This is not the fastest way to find the lowest 1 bit */
for (ibit = 0; ibit < COUNTOF (data); ibit++) {
if (!BITTEST(result.data, ibit) ) continue;
fprintf(stdout, " %u", ibit);
}
fprintf( stdout, "\n" );
return 0;
}
void init_tabs(void)
{
unsigned ibit, ithing;
/* 1 bits in data that dont overlap with 1 bits in the searchmask are showstoppers.
** So, for each bitpos, we precompute a bitmask of all *entrynumbers* from data[], that contain 0 in bitpos.
*/
memset(nulmaps, 0 , sizeof nulmaps);
for (ithing=0; ithing < COUNTOF(data); ithing++) {
for (ibit=0; ibit < BITSPERTHING; ibit++) {
if ( data[ithing] & (1u << ibit) ) continue;
BITSET(nulmaps[ibit].data, ithing);
}
}
}
/* Logical And of two bitmask arrays; simular to dst &= src */
void map_and2(struct bitmap *dst, struct bitmap *src)
{
unsigned idx;
for (idx = 0; idx < COUNTOF(dst->data); idx++) {
dst->data[idx] &= src->data[idx] ;
}
}
void map_empty(struct bitmap *dst)
{
memset(dst->data, 0 , sizeof dst->data);
}
void map_full(struct bitmap *dst)
{
unsigned idx;
/* NOTE this loop sets too many bits to the left of COUNTOF(data) */
for (idx = 0; idx < COUNTOF(dst->data); idx++) {
dst->data[idx] = ~0;
}
}