CUDA string search in large file, wrong result - parallel-processing

I am working on a simple naive string search in CUDA.
I am new to CUDA. It works fine for smaller files ( approx. ~1MB ). After I make these files bigger ( Ctrl+A, Ctrl+C several times in Notepad++ ), my program's results are higher ( about +1% ) than a
grep -o text file_name | wc -l
It is a very simple function, so I don't know what could cause this. I need it to work with larger files ( ~500MB ).
Kernel code ( gpuCount is a __device__ int global variable ):
__global__ void stringSearchGpu(char *data, int dataLength, char *input, int inputLength){
int id = blockDim.x*blockIdx.x + threadIdx.x;
if (id < dataLength)
{
int fMatch = 1;
for (int j = 0; j < inputLength; j++)
{
if (data[id + j] != input[j]) fMatch = 0;
}
if (fMatch)
{
atomicAdd(&gpuCount, 1);
}
}
}
This is calling the kernel in main function:
int blocks = 1, threads = fileSize;
if (fileSize > 1024)
{
blocks = (fileSize / 1024) + 1;
threads = 1024;
}
clock_t cpu_start = clock();
// kernel call
stringSearchGpu<<<blocks, threads>>>(cudaBuffer, strlen(buffer), cudaInput, strlen(input));
cudaDeviceSynchronize();
After this I just copy the result to Host and print it.
Can anyone please help me with this?

First of all, you should always check the return values of CUDA functions for errors. The best way to do so is the following:
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
Wrap your CUDA calls, such as:
gpuErrchk(cudaDeviceSynchronize());
Second, your kernel accesses out-of-bounds memory. Suppose dataLength=100, inputLength=7 and id=98. In your kernel code:
if (id < dataLength) // 98 is less than 100, so condition true
{
int fMatch = 1;
for (int j = 0; j < inputLength; j++) // j runs from [0 - 6]
{
// if j>1 then id+j>=100, which is out of bounds, illegal operation
if (data[id + j] != input[j]) fMatch = 0;
}
Change the condition to something like:
if (id < dataLength - inputLength)
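For completeness, here is a minimal sketch of the kernel with that bounds guard applied. I write the guard as id <= dataLength - inputLength so that a match ending exactly at the last byte is still counted, and break out of the loop on the first mismatch; gpuCount is the same __device__ counter as in the question.
__device__ int gpuCount = 0;

__global__ void stringSearchGpu(char *data, int dataLength, char *input, int inputLength)
{
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id <= dataLength - inputLength)   // every data[id + j] read below stays in bounds
    {
        int fMatch = 1;
        for (int j = 0; j < inputLength; j++)
        {
            if (data[id + j] != input[j]) { fMatch = 0; break; }
        }
        if (fMatch)
        {
            atomicAdd(&gpuCount, 1);
        }
    }
}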

Related

What causes cases with high ZeroMQ latency and how to avoid them?

I am trying to use ZeroMQ for fast message passing. Messages need to be delivered in less than 1 [ms]. I did some testing (inproc, single process on Linux, no TCP) and saw that usually there is no problem with that. The latency is about 10 - 100 [us], depending on how often the messages are sent (why?). Sometimes, however, messages are received after 6 [ms], which is unacceptable.
What can cause some messages to be delayed?
Maybe the process is preempted?
Or is it because of the polling used (zmq_poll())?
Example results from my test :
avg lag = 28 [us]
max lag = 5221 [us]
std dev = 25.85 [us]
big lag = 180 x above 200 [us]
"big lag" means number of cases where latency was over 200 [us]. In my tests there are 500 000 messages sent so the value 180 means that latency over 200 [us] was recorded in 180 / 500000 = 0,036%. It's a quite low number but I'd like it to be zero. Even on the expense of average latency.
The test source code is below :
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <zmq.h>
#include <pthread.h>
#define SOCKETS_NUM 5
#define RUNS 100000
void *context;
int numbers[SOCKETS_NUM];
struct {
struct timespec send_time;
struct timespec receive_time;
} times[SOCKETS_NUM * RUNS], *ptimes;
static void * worker_thread(void * dummy) {
int * number = dummy;
char endpoint[] = "inproc://endpointX";
endpoint[17] = (char)('0' + *number);
void * socket = zmq_socket(context, ZMQ_PUSH);
zmq_connect(socket, endpoint);
struct timespec sleeptime, remtime;
int rnd = rand() / 3000;
sleeptime.tv_sec = 0;
sleeptime.tv_nsec = rnd;
nanosleep(&sleeptime, &remtime);
clock_gettime(CLOCK_REALTIME, &(ptimes[*number].send_time));
zmq_send(socket, "Hello", 5, 0);
zmq_close(socket);
return NULL;
}
static void run_test(zmq_pollitem_t items[]) {
pthread_t threads[SOCKETS_NUM];
for (int i = 0; i < SOCKETS_NUM; i++) {
pthread_create(&threads[i], NULL, worker_thread, &numbers[i]);
}
char buffer[10];
int to_receive = SOCKETS_NUM;
for (int i = 0; i < SOCKETS_NUM; i++) {
int rc = zmq_poll(items, SOCKETS_NUM, -1);
for (int j = 0; j < SOCKETS_NUM; j++) {
if (items[j].revents & ZMQ_POLLIN) {
clock_gettime(CLOCK_REALTIME, &(ptimes[j].receive_time));
zmq_recv(items[j].socket, buffer, 10, 0);
}
}
to_receive -= rc;
if (to_receive == 0) break;
}
for (int i = 0; i < SOCKETS_NUM; i++) {
pthread_join(threads[i], NULL);
}
}
int main(void)
{
context = zmq_ctx_new();
zmq_ctx_set(context, ZMQ_THREAD_SCHED_POLICY, SCHED_FIFO);
zmq_ctx_set(context, ZMQ_THREAD_PRIORITY, 99);
void * responders[SOCKETS_NUM];
char endpoint[] = "inproc://endpointX";
for (int i = 0; i < SOCKETS_NUM; i++) {
responders[i] = zmq_socket(context, ZMQ_PULL);
endpoint[17] = (char)('0' + i);
zmq_bind(responders[i], endpoint);
numbers[i] = i;
}
time_t tt;
time_t t = time(&tt);
srand((unsigned int)t);
zmq_pollitem_t poll_items[SOCKETS_NUM];
for (int i = 0; i < SOCKETS_NUM; i++) {
poll_items[i].socket = responders[i];
poll_items[i].events = ZMQ_POLLIN;
}
ptimes = times;
for (int i = 0; i < RUNS; i++) {
run_test(poll_items);
ptimes += SOCKETS_NUM;
}
long int lags[SOCKETS_NUM * RUNS];
long int total_lag = 0;
long int max_lag = 0;
long int big_lag = 0;
for (int i = 0; i < SOCKETS_NUM * RUNS; i++) {
lags[i] = (times[i].receive_time.tv_nsec - times[i].send_time.tv_nsec + (times[i].receive_time.tv_sec - times[i].send_time.tv_sec) * 1000000000) / 1000;
if (lags[i] > max_lag) max_lag = lags[i];
total_lag += lags[i];
if (lags[i] > 200) big_lag++;
}
long int avg_lag = total_lag / SOCKETS_NUM / RUNS;
double SD = 0.0;
for (int i = 0; i < SOCKETS_NUM * RUNS; ++i) {
SD += pow((double)(lags[i] - avg_lag), 2);
}
double std_lag = sqrt(SD / SOCKETS_NUM / RUNS);
printf("avg lag = %l5d [us]\n", avg_lag);
printf("max lag = %l5d [us]\n", max_lag);
printf("std dev = %8.2f [us]\n", std_lag);
printf("big lag = %l5d x above 200 [us]\n", big_lag);
for (int i = 0; i < SOCKETS_NUM; i++) {
zmq_close(responders[i]);
}
zmq_ctx_destroy(context);
return 0;
}
Q : "...I'd like it to be zero."
Cool to say, yet hard to make.
As you run an ultra-fast, memory-mapped inproc:// Transport Class, the main focus will be performance tweaking of the Context()-processing. Here you spend awfully many setup-overhead & straight termination-overhead operations to send, 1E5-times, just a 5 [B] payload, so I guess there will never be a queue-management related issue, as there will never be any "stack-growing" at all.
1 ) ( suppose we leave the code as-is ) it would be a natural step for performance tuning to at least pin the socket via the ZeroMQ ZMQ_AFFINITY mapping ( so its handling is not jumping or wandering from core to core ). It may be interesting to see whether that many ~ 5E5 socket setups/terminations on the PUSH-er side, each never sending more than a single shot of 5 [B] over the memory-mapped line, could get some help ( for those large overheads & maintenance ) from configuring the context-instance with SOCKETS_NUM I/O-threads, using the ZMQ_IO_THREADS setting ( when fighting for "RealTime"-ness with SCHED_FIFO, having only one I/O-thread does not help much, does it? ). A sketch of these calls appears further below.
2 ) next level of experimentation is to re-balance the ZMQ_THREAD_AFFINITY_CPU_ADD maps (the global context's I/O-threads onto CPU-cores) and the per-socket setup of the ZMQ_AFFINITY maps onto the context's I/O-thread(s). Having sufficient amount of CPU-cores, there may be some performance / ultra-low latency benefits from making several gangs-of-I/O-threads serving one socket-instance stay "together", on a same CPU-core, yet here we get into territory, where the actual hardware and the real-system's background workloads & still-"spare"-resources for this "RealTime"-ambition motivated experimenting start to become hard to predict without any in-vivo testing & validation.
3 ) tweaking per-socket zmq_setsockopt() parameters may help, yet unless a nano-scaled socket-lifetime ( rather an expensive one-time used "consumable-disposable" ), do not expect any breakthrough from here.
4 ) trying to measure with nanosecond resolution, the more so if used for "durations" of something, ought to use CLOCK_MONOTONIC_RAW, which avoids ntp-injected adjustments, astronomy-correcting leap-second injections et al ( a minimal timing sketch appears at the end of this answer ).
5 ) the zmq_poll()-strategy: I would not go this way. Using the timeout == -1 blocks the whole circus, a thing I strongly discourage in any distributed-computing system, the more so in one that has a "RealTime" ambition. Spinning the PULL-side up to max performance may go via having 1:1 PUSH/PULL threads on either side, or, if trying to challenge the grooming, keep the 5 PUSH-er threads as you have them and collect all ingress messages on just a single, Zero-Copy, well-oiled PULL-er ( easier polling; a payload-based index-helper may tell which send-side timestamp to pair the receive-side timestamp with ); anyway, the blocking poller is almost an anti-pattern for challenging any low-latency soft-realtime toys.
Anyway, do not hesitate to refactor the code and to use profiling tools to better see where you "acquire" the big_lag-s ( my guesses are above )
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <zmq.h>
#include <pthread.h>
#define SOCKETS_NUM 5
#define RUNS 100000
void *context;
int numbers[SOCKETS_NUM];
struct {
struct timespec send_time;
struct timespec recv_time;
} times[SOCKETS_NUM * RUNS],
*ptimes;
static void *worker_thread( void *dummy ) { //-------------------------- an overhead-expensive one-shot PUSH-based "Hello"-sender & .close()
int *number = dummy;
char endpoint[] = "inproc://endpointX";
endpoint[17] = (char)( '0' + *number );
int rnd = rand() / 3000;
void *socket = zmq_socket( context, ZMQ_PUSH );
struct timespec remtime,
sleeptime;
sleeptime.tv_sec = 0;
sleeptime.tv_nsec = rnd;
zmq_connect( socket, endpoint );
nanosleep( &sleeptime, &remtime ); // anything between < 0 : RAND_MAX/3000 > [ns] ... easily >> 32, as #define RAND_MAX 2147483647 ~ 715 827 [ns]
clock_gettime( CLOCK_REALTIME, &( ptimes[*number].send_time) ); //............................................................................ CLK_set_NEAR_SEND
// any CLOCK re-adjustments may and will skew any non-MONOTONIC_CLOCK
zmq_send( socket, "Hello", 5, 0 );
zmq_close( socket );
return NULL;
}
static void run_test( zmq_pollitem_t items[] ) { //--------------------- zmq_poll()-blocked zmq_recv()-orchestrator ( called ~ 1E5 x !!! resources' nano-use & setup + termination overheads matter )
char buffer[10];
int to_receive = SOCKETS_NUM;
pthread_t threads[SOCKETS_NUM];
for ( int i = 0; i < SOCKETS_NUM; i++ ) { //------------------------ thread-maker ( a per-socket PUSH-er[]-s )
pthread_create( &threads[i], NULL, worker_thread, &numbers[i] );
}
for ( int i = 0; i < SOCKETS_NUM; i++ ) { //------------------------ [SERIAL]-------- [i]-stepping
int rc = zmq_poll( items, SOCKETS_NUM, -1 ); //----------------- INFINITE ??? --- blocks /\/\/\/\/\/\/\/\/\/\/\ --- several may flag ZMQ_POLLIN
for ( int j = 0; j < SOCKETS_NUM; j++ ) { //-------------------- ALL-CHECKED in a loop for an items[j].revents
if ( items[j].revents & ZMQ_POLLIN ) { //------------------- FIND IF IT WAS THIS ONE
clock_gettime( CLOCK_REALTIME, &( ptimes[j].recv_time ) );//...................................................................... CLK_set_NEAR_poll()_POSACK'd R2recv
zmq_recv( items[j].socket, buffer, 10, 0 ); //---------- READ-IN from any POSACK'd by zmq_poll()-er flag(s)
}
}
to_receive -= rc; // ---------------------------------------------------------------------------------------------- SUB rc
if (to_receive == 0) break;
}
for ( int i = 0; i < SOCKETS_NUM; i++ ) { //------------------------ thread-killer
pthread_join( threads[i], NULL );
}
}
int main( void ) {
context = zmq_ctx_new();
zmq_ctx_set( context, ZMQ_THREAD_SCHED_POLICY, SCHED_FIFO );
zmq_ctx_set( context, ZMQ_THREAD_PRIORITY, 99 );
void *responders[SOCKETS_NUM];
char endpoint[] = "inproc://endpointX";
for ( int i = 0; i < SOCKETS_NUM; i++ ) {
responders[i] = zmq_socket( context, ZMQ_PULL ); // ------------ PULL instances into []
endpoint[17] = (char)( '0' + i );
zmq_bind( responders[i], endpoint ); //------------------------- .bind()
numbers[i] = i;
}
time_t tt;
time_t t = time(&tt);
srand( (unsigned int)t );
zmq_pollitem_t poll_items[SOCKETS_NUM];
for ( int i = 0; i < SOCKETS_NUM; i++ ) { //------------------------ zmq_politem_t array[] ---pre-fill---
poll_items[i].socket = responders[i];
poll_items[i].events = ZMQ_POLLIN;
}
ptimes = times;
for ( int i = 0; i < RUNS; i++ ) { //------------------------------- 1E5 RUNs
run_test( poll_items ); // ------------------------------------- RUN TEST
ptimes += SOCKETS_NUM;
}
long int lags[SOCKETS_NUM * RUNS];
long int total_lag = 0;
long int max_lag = 0;
long int big_lag = 0;
for ( int i = 0; i < SOCKETS_NUM * RUNS; i++ ) {
lags[i] = ( times[i].recv_time.tv_nsec
- times[i].send_time.tv_nsec
+ ( times[i].recv_time.tv_sec
- times[i].send_time.tv_sec
) * 1000000000
) / 1000; // --------------------------------------- [us]
if ( lags[i] > max_lag ) max_lag = lags[i];
total_lag += lags[i];
if ( lags[i] > 200 ) big_lag++;
}
long int avg_lag = total_lag / SOCKETS_NUM / RUNS;
double SD = 0.0;
for ( int i = 0; i < SOCKETS_NUM * RUNS; ++i ) {
SD += pow( (double)( lags[i] - avg_lag ), 2 );
}
double std_lag = sqrt( SD / SOCKETS_NUM / RUNS );
printf("avg lag = %l5d [us]\n", avg_lag);
printf("max lag = %l5d [us]\n", max_lag);
printf("std dev = %8.2f [us]\n", std_lag);
printf("big lag = %l5d x above 200 [us]\n", big_lag);
for ( int i = 0; i < SOCKETS_NUM; i++ ) {
zmq_close( responders[i] );
}
zmq_ctx_destroy( context );
return 0;
}
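As an illustration of points 1 ) and 2 ), a hedged sketch of the tuning calls follows. The option names ( zmq_ctx_set() / zmq_setsockopt() ) are from the libzmq API; the thread count, core numbers and the affinity bitmask are placeholders to be tuned on the actual hardware, and whether any of this helps a purely inproc:// setup has to be measured, since inproc largely bypasses the I/O-threads.
#include <stdint.h>
#include <zmq.h>

void tune_context_and_socket( void *context, void *socket )
{
    /* point 1 ): give the context more than one I/O-thread
       ( must be set before any socket is created on this context ) */
    zmq_ctx_set( context, ZMQ_IO_THREADS, 5 );              /* e.g. SOCKETS_NUM */

    /* point 2 ): pin the context's I/O-threads onto chosen CPU-cores
       ( ZMQ_THREAD_AFFINITY_CPU_ADD needs a recent libzmq, 4.3+ ) */
    zmq_ctx_set( context, ZMQ_THREAD_AFFINITY_CPU_ADD, 2 );
    zmq_ctx_set( context, ZMQ_THREAD_AFFINITY_CPU_ADD, 3 );

    /* per-socket ZMQ_AFFINITY: a bitmask of I/O-threads that shall serve
       this socket's connections ( value 1 -> I/O-thread 1 only ) */
    uint64_t affinity = 1;
    zmq_setsockopt( socket, ZMQ_AFFINITY, &affinity, sizeof( affinity ) );
}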
Using nanosleep for a random ( not cardinal, safely outside of any control-loop(s) activity ) sleep is rather a risky luxury, as in earlier kernels it caused problems:
In order to support applications requiring much more precise pauses (e.g., in order to control some time-critical hardware), nanosleep() would handle pauses of up to 2 ms by busy waiting with microsecond precision when called from a thread scheduled under a real-time policy like SCHED_FIFO or SCHED_RR. This special extension was removed in kernel 2.5.39, hence is still present in current 2.4 kernels, but not in 2.6 kernels.
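A minimal sketch of point 4 ), i.e. measuring a duration with CLOCK_MONOTONIC_RAW instead of CLOCK_REALTIME ( the measured_operation() call is a hypothetical placeholder, not part of the test code above ):
#define _GNU_SOURCE             /* exposes CLOCK_MONOTONIC_RAW on Linux/glibc */
#include <stdio.h>
#include <time.h>

static long elapsed_us( const struct timespec *t0, const struct timespec *t1 )
{
    return ( t1->tv_sec  - t0->tv_sec  ) * 1000000L
         + ( t1->tv_nsec - t0->tv_nsec ) / 1000L;
}

int main( void )
{
    struct timespec t0, t1;
    clock_gettime( CLOCK_MONOTONIC_RAW, &t0 );
    /* measured_operation();  e.g. the zmq_send() ... zmq_recv() path */
    clock_gettime( CLOCK_MONOTONIC_RAW, &t1 );
    printf( "lag = %ld [us]\n", elapsed_us( &t0, &t1 ) );
    return 0;
}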

bootloader avr atmega128RFA1

I am also working on the bootloader.
I ran into the following problem:
Once the cmd 'B' is received and, later, 'F' is received, I start to call the block load.
static void start_block_flash_load(uint16_t size, uint32_t *addr) {
uint16_t data_word;
uint8_t sreg = SREG;
uint16_t temp;
int i;
uint8_t my_size;
fprintf(lcdout, "B");
cli();
// Disable interrupts
(*addr) <<= 1;
if (size <= SPM_PAGESIZE) {
boot_page_erase(*addr);
boot_spm_busy_wait();
fprintf(lcdout, "%"PRIu16, size);
uint16_t i;
//store all values. PROBLEM here!!!
my_size = 208;
uint8_t buf[SPM_PAGESIZE] = { 0 };
for (i = 0; i < my_size; i++) {
//for (i=0; i<size; i++){
buf[i] = uart_getc();
// lcd_clear();
// lcd_setCursor(0, 2);
// fprintf(lcdout, "%3d", i);
// _delay_ms(500);
}
for (i = 0; i < my_size; i += 2) { //if size is odd, then use do-while
uint16_t w = buf[i];
w += buf[i + 1] << 8; //first one is low byte, second is high???
boot_page_fill((*addr)+i, w);
}
boot_page_write(*addr);
boot_spm_busy_wait();
(*addr) >>= 1;
uart_putc('\r');
} else
uart_putc('?');
boot_rww_enable ();
SREG = sreg;
}
I can see on the LCD that the size of the block is 256. However, when entering the loop to collect the data, it gets stuck.
I tested with my_size and found that the program only runs further if my_size=208.
The strange thing is that if I put some statements inside the loop, e.g.
lcd_clear();
lcd_setCursor(0, 2);
then 'i', which I print out on the LCD, will not go up to 140-something. If I put in different statements, 'i' gives a different value. That is very strange, since uart_getc() should not lose data.
What I expect is that the loop goes up to 256. I cannot figure out what happened there.
Please help if you have any idea.
Thanks

Parallel hashing using openmp

I have a piece of code for parallel hashing; the insert code is as follows:
int main(int argc, char** argv){
.....
Entry* table;//hash table
for(size_t i=0;i<N;i++){
keys[i]=i;
values[i] = rand();//random key-value pairs
}
int omp_p = omp_get_max_threads();
#pragma omp parallel for
for(int p=0;p<omp_p;p++){
size_t start = p*N/omp_p;
size_t end = (p+1)*N/omp_p;//each thread gets contiguous chunks of the arrays
for(size_t i=start;i<end;i++){
size_t key = keys[i];
size_t value = values[i];
if(insert(table,key,value) == 0){
printf("Failure!\n");
}
}
}
....
return 0;
}
int insert(Entry* table, size_t key, size_t value){
    Entry entry = (((Entry)key) << 32) + value; //Coalesce key and value into an entry
    /*Use cuckoo hashing*/
    size_t location = hash_function_1(key);
    for(size_t its = 0; its < MAX_ITERATIONS; its++){
        entry = __sync_lock_test_and_set(&table[location], entry);
        key = get_key(entry);
        if(key == KEY_EMPTY)
            return 1;
        /*We have replaced a valid key, try to hash it using the next available hash function*/
        size_t location_1 = hash_function_1(key);
        size_t location_2 = hash_function_2(key);
        size_t location_3 = hash_function_3(key);
        if(location == location_1) location = location_2;
        else if(location == location_2) location = location_3;
        else location = location_1;
    }
    return 0;
}
The insert code doesn't scale at all. If I use a single thread for, say, 10M keys, I complete in about 170ms, whereas using 16 threads I take > 500ms. My suspicion is that this is because the cache lines holding the table[] array are being moved around between the threads during the write operation (__sync_lock_test_and_set(...)), and the invalidations result in a slowdown.
For example if I modify the insert code to just:
int insert(Entry* table,size_t key, size_t value){
Entry entry = (((Entry)key) << 32)+value; //Coalesce key and value into an entry
size_t location = hash_function_1(key);
table[location] = entry;
return 1;
}
I still get the same bad performance. Since this is hashing, I cannot control where a particular element hashes to. So any suggestions? Also, if this isn't the right reason, any other pointers as to what might be going wrong? I have tried it from 1M to 100M keys, but the single-threaded performance is always better.
I have a few suggestions. Since the run time of your insert function is not constant, you should use schedule(dynamic). Second, you should let OpenMP divide the tasks and not do it yourself (one reason, but not the main reason, is that the way you have it now, N has to be a multiple of omp_p). If you want some control over how it divides the tasks, then try changing the chunk size like this: schedule(dynamic,n), where n is the chunk size.
#pragma omp parallel for schedule(dynamic)
for(size_t i=0;i<N;i++){
size_t key = keys[i];
size_t value = values[i];
if(insert(table,key,value) == 0){
printf("Failure!\n");
}
}
I would try experimenting with a strategy based on locks, like this simple snippet shows:
#include<omp.h>
#define NHASHES 4
#define NTABLE 1000000
typedef size_t (*hash_f)(size_t);
int main(int argc, char** argv) {
Entry table [NTABLE ];
hash_f hashes[NHASHES];
omp_lock_t locks [NTABLE ];
/* ... */
for(size_t ii = 0; ii < N; ii++) {
keys [ii] = ii;
values [ii] = rand();
}
for(size_t ii = 0; ii < NTABLE; ii++) {
omp_init_lock(&locks[ii]);
}
#pragma omp parallel
{
#pragma omp for schedule(static)
for(int ii = 0; ii < N; ii++) {
size_t key = keys [ii];
size_t value = values[ii];
Entry entry = (((Entry)key) << 32) + value;
for ( size_t jj = 0; jj < NHASHES; jj++ ) {
size_t location = hashes[jj](key); // I assume this is the computationally demanding part
omp_set_lock(&locks[location]); // Locks the hash table location before working on it
if ( get_key(table[location]) == KEY_EMPTY ) {
table[location] = entry;
omp_unset_lock(&locks[location]); // Release the lock before leaving the loop
break;
}
omp_unset_lock(&locks[location]); // Unlocks the hash table location
}
// Handle failures here
}
} /* pragma omp parallel */
for(size_t ii = 0; ii < NTABLE; ii++) {
omp_destroy_lock(&locks[ii]);
}
/* ... */
return 0;
}
With a little more machinery you can handle a variable number of locks ranging from 1 (equivalent to a critical section) to NTABLE (equivalent to an atomic construct) and see if the granularity in-between provides some benefit.
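To sketch that "variable number of locks" idea under the same assumptions (Entry, KEY_EMPTY, get_key(), NHASHES and the hashes[] array are taken from the snippet above; NLOCKS is a knob I made up), one lock can guard a whole stripe of table slots via location % NLOCKS:
#include <omp.h>
#include <stddef.h>

#define NLOCKS 1024                 /* 1 ~ one critical section, NTABLE ~ a lock per slot */

omp_lock_t stripe_locks[NLOCKS];

void init_stripes(void) {
    for (size_t ii = 0; ii < NLOCKS; ii++)
        omp_init_lock(&stripe_locks[ii]);
}

/* returns 1 on success, 0 if none of the hash choices found an empty slot */
int insert_striped(Entry *table, size_t key, size_t value) {
    Entry entry = (((Entry)key) << 32) + value;
    for (size_t jj = 0; jj < NHASHES; jj++) {
        size_t location = hashes[jj](key);
        omp_lock_t *lock = &stripe_locks[location % NLOCKS]; /* one lock guards many slots */
        omp_set_lock(lock);
        if (get_key(table[location]) == KEY_EMPTY) {
            table[location] = entry;
            omp_unset_lock(lock);
            return 1;
        }
        omp_unset_lock(lock);
    }
    return 0;
}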

Piecemeal processing of a matrix - CUDA

OK, so let's say I have an ( N x N ) matrix that I would like to process. This matrix is quite large for my computer, and if I try to send it to the device all at once I get an 'out of memory' error.
So is there a way to send sections of the matrix to the device? One way I can see to do it is to copy portions of the matrix on the host, send these manageable copied portions from the host to the device, and then put them back together at the end.
Here is something I have tried, but the cudaMemcpy in the for loop returns error code 11, 'invalid argument.'
int h_N = 10000;
size_t h_size_m = h_N*sizeof(float);
h_A = (float*)malloc(h_size_m*h_size_m);
int d_N = 2500;
size_t d_size_m = d_N*sizeof(float);
InitializeMatrices(h_N);
int i;
int iterations = (h_N*h_N)/(d_N*d_N);
for( i = 0; i < iterations; i++ )
{
float* h_array_ref = h_A+(i*d_N*d_N);
cudasafe( cudaMemcpy(d_A, h_array_ref, d_size_m*d_size_m, cudaMemcpyHostToDevice), "cudaMemcpy");
cudasafe( cudaFree(d_A), "cudaFree(d_A)" );
}
What I'm trying to accomplish with the above code is this: instead of sending the entire matrix to the device, I simply send a pointer to a place within that matrix, reserve just enough space on the device to do the work, and then with the next iteration of the loop move the pointer forward within the matrix, and so on.
Not only can you do this (assuming your problem is easily decomposed this way into sub-arrays), it can be a very useful thing to do for performance; once you get the basic approach you've described working, you can start using asynchronous memory copies and double-buffering to overlap some of the memory transfer time with the time spent computing what is already on-card (a rough double-buffered sketch follows).
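Here is a rough, hedged sketch of what that double-buffered variant might look like. It reuses cuda_saxpb, CHK_CUDA and the variable names from the full code further below; note that for the copies to truly overlap the kernels, the host arrays would also need to be page-locked (e.g. allocated with cudaMallocHost).
float *xd[2], *yd[2];
cudaStream_t stream[2];
for (int s = 0; s < 2; s++) {
    CHK_CUDA( cudaMalloc(&xd[s], batchsize*sizeof(float)) );
    CHK_CUDA( cudaMalloc(&yd[s], batchsize*sizeof(float)) );
    CHK_CUDA( cudaStreamCreate(&stream[s]) );
}
int buf = 0;
for (int nstart=0; nstart < n; nstart+=batchsize, buf ^= 1) {
    int size = batchsize;
    if ((nstart + batchsize) > n) size = n - nstart;
    /* copy this chunk in its own stream; may overlap the other stream's kernel */
    CHK_CUDA( cudaMemcpyAsync(xd[buf], &(x[nstart]), size*sizeof(float),
                              cudaMemcpyHostToDevice, stream[buf]) );
    int blocksize = (size+nblocks-1)/nblocks;
    cuda_saxpb<<<nblocks, blocksize, 0, stream[buf]>>>(xd[buf], a, b, yd[buf], size);
    CHK_CUDA( cudaMemcpyAsync(&(ycuda[nstart]), yd[buf], size*sizeof(float),
                              cudaMemcpyDeviceToHost, stream[buf]) );
}
CHK_CUDA( cudaDeviceSynchronize() );   /* wait for both streams to drain */
for (int s = 0; s < 2; s++) {
    CHK_CUDA( cudaFree(xd[s]) );
    CHK_CUDA( cudaFree(yd[s]) );
    CHK_CUDA( cudaStreamDestroy(stream[s]) );
}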
But first one gets the simple thing working. Below is a 1d example (multiplying a vector by a scalar and adding another scalar) but using a linearized 2d array would be the same; the key part is
CHK_CUDA( cudaMalloc(&xd, batchsize*sizeof(float)) );
CHK_CUDA( cudaMalloc(&yd, batchsize*sizeof(float)) );
tick(&gputimer);
int nbatches = 0;
for (int nstart=0; nstart < n; nstart+=batchsize) {
int size=batchsize;
if ((nstart + batchsize) > n) size = n - nstart;
CHK_CUDA( cudaMemcpy(xd, &(x[nstart]), size*sizeof(float), cudaMemcpyHostToDevice) );
blocksize = (size+nblocks-1)/nblocks;
cuda_saxpb<<<nblocks, blocksize>>>(xd, a, b, yd, size);
CHK_CUDA( cudaMemcpy(&(ycuda[nstart]), yd, size*sizeof(float), cudaMemcpyDeviceToHost) );
nbatches++;
}
gputime = tock(&gputimer);
CHK_CUDA( cudaFree(xd) );
CHK_CUDA( cudaFree(yd) );
You allocate the buffers at the start, and then loop through until you're done, each time doing the copy, starting the kernel, and then copying back. You free at the end.
The full code is
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <cuda.h>
#include <sys/time.h>
#include <math.h>
#define CHK_CUDA(e) {if (e != cudaSuccess) {fprintf(stderr,"Error: %s\n", cudaGetErrorString(e)); exit(-1);}}
__global__ void cuda_saxpb(const float *xd, const float a, const float b,
float *yd, const int n) {
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i<n) {
yd[i] = a*xd[i]+b;
}
return;
}
void cpu_saxpb(const float *x, float a, float b, float *y, int n) {
int i;
for (i=0;i<n;i++) {
y[i] = a*x[i]+b;
}
return;
}
int get_options(int argc, char **argv, int *n, int *s, int *nb, float *a, float *b);
void tick(struct timeval *timer);
double tock(struct timeval *timer);
int main(int argc, char **argv) {
int n=1000;
int nblocks=10;
int batchsize=100;
float a = 5.;
float b = -1.;
int err;
float *x, *y, *ycuda;
float *xd, *yd;
double abserr;
int blocksize;
int i;
struct timeval cputimer;
struct timeval gputimer;
double cputime, gputime;
err = get_options(argc, argv, &n, &batchsize, &nblocks, &a, &b);
if (batchsize > n) {
fprintf(stderr, "Resetting batchsize to size of vector, %d\n", n);
batchsize = n;
}
if (err) return 0;
x = (float *)malloc(n*sizeof(float));
if (!x) return 1;
y = (float *)malloc(n*sizeof(float));
if (!y) {free(x); return 1;}
ycuda = (float *)malloc(n*sizeof(float));
if (!ycuda) {free(y); free(x); return 1;}
/* run CPU code */
tick(&cputimer);
cpu_saxpb(x, a, b, y, n);
cputime = tock(&cputimer);
/* run GPU code */
/* only have to allocate once */
CHK_CUDA( cudaMalloc(&xd, batchsize*sizeof(float)) );
CHK_CUDA( cudaMalloc(&yd, batchsize*sizeof(float)) );
tick(&gputimer);
int nbatches = 0;
for (int nstart=0; nstart < n; nstart+=batchsize) {
int size=batchsize;
if ((nstart + batchsize) > n) size = n - nstart;
CHK_CUDA( cudaMemcpy(xd, &(x[nstart]), size*sizeof(float), cudaMemcpyHostToDevice) );
blocksize = (size+nblocks-1)/nblocks;
cuda_saxpb<<<nblocks, blocksize>>>(xd, a, b, yd, size);
CHK_CUDA( cudaMemcpy(&(ycuda[nstart]), yd, size*sizeof(float), cudaMemcpyDeviceToHost) );
nbatches++;
}
gputime = tock(&gputimer);
CHK_CUDA( cudaFree(xd) );
CHK_CUDA( cudaFree(yd) );
abserr = 0.;
for (i=0;i<n;i++) {
abserr += fabs(ycuda[i] - y[i]);
}
printf("Y = a*X + b, problemsize = %d\n", n);
printf("CPU time = %lg millisec.\n", cputime*1000.);
printf("GPU time = %lg millisec (done with %d batches of %d).\n",
gputime*1000., nbatches, batchsize);
printf("CUDA and CPU results differ by %lf\n", abserr);
free(x);
free(y);
free(ycuda);
return 0;
}
int get_options(int argc, char **argv, int *n, int *s, int *nb, float *a, float *b) {
const struct option long_options[] = {
{"nvals" , required_argument, 0, 'n'},
{"nblocks" , required_argument, 0, 'B'},
{"batchsize" , required_argument, 0, 's'},
{"a", required_argument, 0, 'a'},
{"b", required_argument, 0, 'b'},
{"help", no_argument, 0, 'h'},
{0, 0, 0, 0}};
int c;
int option_index;
int tempint;
while (1) {
c = getopt_long(argc, argv, "n:B:a:b:s:h", long_options, &option_index);
if (c == -1) break;
switch(c) {
case 'n': tempint = atoi(optarg);
if (tempint < 1 || tempint > 500000) {
fprintf(stderr,"%s: Cannot use number of points %s;\n Using %d\n", argv[0], optarg, *n);
} else {
*n = tempint;
}
break;
case 's': tempint = atoi(optarg);
if (tempint < 1 || tempint > 50000) {
fprintf(stderr,"%s: Cannot use number of points %s;\n Using %d\n", argv[0], optarg, *s);
} else {
*s = tempint;
}
break;
case 'B': tempint = atoi(optarg);
if (tempint < 1 || tempint > 1000 || tempint > *n) {
fprintf(stderr,"%s: Cannot use number of blocks %s;\n Using %d\n", argv[0], optarg, *nb);
} else {
*nb = tempint;
}
break;
case 'a': *a = atof(optarg);
break;
case 'b': *b = atof(optarg);
break;
case 'h':
puts("Calculates y[i] = a*x[i] + b on the GPU.");
puts("Options: ");
puts(" --nvals=N (-n N): Set the number of values in y,x.");
puts(" --batchsize=N (-s N): Set the number of values to transfer at a time.");
puts(" --nblocks=N (-B N): Set the number of blocks used.");
puts(" --a=X (-a X): Set the parameter a.");
puts(" --b=X (-b X): Set the parameter b.");
puts(" --niters=N (-I X): Set number of iterations to calculate.");
puts("");
return +1;
}
}
return 0;
}
void tick(struct timeval *timer) {
gettimeofday(timer, NULL);
}
double tock(struct timeval *timer) {
struct timeval now;
gettimeofday(&now, NULL);
return (now.tv_usec-timer->tv_usec)/1.0e6 + (now.tv_sec - timer->tv_sec);
}
Running this one gets:
$ ./batched-saxpb --nvals=10240 --batchsize=10240 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.072 millisec.
GPU time = 0.117 millisec (done with 1 batches of 10240).
CUDA and CPU results differ by 0.000000
$ ./batched-saxpb --nvals=10240 --batchsize=5120 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.066 millisec.
GPU time = 0.133 millisec (done with 2 batches of 5120).
CUDA and CPU results differ by 0.000000
$ ./batched-saxpb --nvals=10240 --batchsize=2560 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.067 millisec.
GPU time = 0.167 millisec (done with 4 batches of 2560).
CUDA and CPU results differ by 0.000000
The GPU time goes up in this case (we're doing more memory copies) but the answers stay the same.
Edited: The original version of this code had an option for running multiple iterations of the kernel for timing purposes, but that's unnecessarily confusing in this context so it's removed.

custom ITOA not working right?

I wanted to make a custom ITOA function to put large numbers into small strings; this is what I have coded:
main(){
printf("itoa(2000000000,36)= '%s'",itoa(2000000000,36));
printf("itoa(36,36)= '%s'",itoa(36,36));
printf("itoa(37,36)= '%s'",itoa(37,36));
return 1;
}
stock itoa(val, base)
{
new buf[1024] = {0,...};
new i = 1023;
new LETTERZ[37] = {'0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',0};
for(; val && i; --i, val /= base)
buf[i] = LETTERZ[val % base];
return buf[i+1];
}
It's based on 'C' code from this page: http://www.jb.man.ac.uk/~slowe/cpp/itoa.html
But somehow this is the output:
[20:34:35] itoa(2000000000,36)= 'X'
[20:34:35] itoa(36,36)= '1'
[20:34:35] itoa(37,36)= '1'
And this is totally wrong. I don't know exactly which output to expect, but 36 and 37 for sure can't give the same output, and 2 000 000 000 can't be just 'X', as X is supposed to be 35, not 2 000 000 000.
ZZ should be 1295 I think... I want to base this on the hexadecimal system, but with all the alphabet letters.
Could anyone tell me what's wrong here?
I'm working with a typeless language called PAWN (also known as SMALL), and later I want to use this code in VB.NET.
/* itoa example */
#include <stdio.h>
#include <stdlib.h>
int main ()
{
int i;
char buffer [33];
printf ("Enter a number: ");
scanf ("%d",&i);
itoa (i,buffer,10);
printf ("decimal: %s\n",buffer);
itoa (i,buffer,16);
printf ("hexadecimal: %s\n",buffer);
itoa (i,buffer,2);
printf ("binary: %s\n",buffer);
return 0;
}
You only give the number and the base, but parameter 2 needs a pointer to char already allocated. Use a buffer or try NULL, so the function will return the result.
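For reference, the same base-36 conversion can be sketched in plain C (the helper name itoa36, its buffer handling and the example values are mine, not from the question or any standard library); it may also serve as a starting point for the later VB.NET port:
#include <stdio.h>

/* Writes val in the given base (2..36) into buf, right-aligned, and returns
   a pointer to the first digit. buflen must be at least 2. */
char *itoa36(unsigned long val, unsigned base, char *buf, size_t buflen)
{
    static const char digits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    size_t i = buflen - 1;
    buf[i] = '\0';
    do {                                 /* emit at least one digit, even for 0 */
        buf[--i] = digits[val % base];
        val /= base;
    } while (val && i);
    return &buf[i];
}

int main(void)
{
    char buf[64];
    printf("%s\n", itoa36(2000000000UL, 36, buf, sizeof buf)); /* X2QXVK */
    printf("%s\n", itoa36(36UL,         36, buf, sizeof buf)); /* 10     */
    printf("%s\n", itoa36(37UL,         36, buf, sizeof buf)); /* 11     */
    return 0;
}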
The solution seemed to be simple: the return buf[i+1] just returned one character, so what I did is make it return an array:
new _s#T[4096];
#define sprintf(%1) (format(_s#T, SPRINTF_MAX_STRING, %1), _s#T)
main(){
new num = atoi("ABCDEFG",36);
printf("%d",num);
printf("%s",itoa(num,36));
return 1;
}
stock itoa(val, base)
{
new buf[1024] = {0,...};
new LETTERZ[37] = {'0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',0};
for(new pos = 0; val;++pos,val = floatround(val/base,floatround_floor))
strins(buf,sprintf("%c",LETTERZ[val % base]),0);
return buf;
}
stock atoi(val[], base)
{
new CURRNUM = 0;
new len = strlen(val);
new LETTERZ[37] = {'0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',0};
for(new i = 0; i < len; ++i)
{
for(new x = 0; x < base; ++x)
{
new y = (len-i)-1;
if(val[y] == LETTERZ[x])
{
CURRNUM += x*floatround(floatpower(base,i));
}
}
}
return CURRNUM;
}
