Why GPU memory updates are not read?

Why GPU memory updates are not read? - visual-studio

I have written a program like this and wanted to display in real-time progress of the kernel. I saw How can I check the progress of matrix multiplication?, but wanted not to use such specific CUDA things as cudaDeviceMapHost. The things do not work even if I allocate array bigger than GPU cache. How is this possible?
#include <chrono>
#include "cuda_runtime.h"
#include "device_atomic_functions.h"
#include "device_launch_parameters.h"
#define CUDA_CHECK(err) __cudaSafeCall(err, __FILE__, __LINE__)
inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
if (err != cudaSuccess)
{
fprintf(stderr, "%s(%i): CUDA error %d (%s)\n",
file, line, int(err), cudaGetErrorString(err));
throw "CUDA error";
}
}
static const int c_dataSize = 4 << 20;
__global__ void progressKernel(int *devP)
{
while (1)
{
for (int i = 9 + threadIdx.x; i < c_dataSize; i += blockDim.x)
devP[i] = i;
}
}
int main()
{
std::vector<int> data(c_dataSize, 1);
int *devP;
auto startTime = std::chrono::system_clock::now();
cudaStream_t stream2, stream3;
CUDA_CHECK(cudaMalloc((void**)&devP, sizeof(int) * c_dataSize));
CUDA_CHECK(cudaStreamCreate(&stream2));
CUDA_CHECK(cudaGetLastError());
printf("Starting...\n");
progressKernel<<<1, 128, 0, stream2>>>(devP);
printf("Started... %llX\n", (__int64)devP);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaStreamCreate(&stream3));
CUDA_CHECK(cudaMemcpyAsync(&(data[0]), devP, sizeof(int) * c_dataSize,
cudaMemcpyDeviceToHost, stream3));
CUDA_CHECK(cudaStreamSynchronize(stream3));
while (1)
{
auto currentTime = std::chrono::system_clock::now();
auto transformed = std::chrono::duration_cast<std::chrono::nanoseconds>(currentTime - startTime).count();
printf("%.7f ms: %d, %d, %d\n", (double)transformed / 1000000,
data[10], data[100], data[c_dataSize - 1]);
cudaMemcpyAsync(&(data[0]), devP, sizeof(int) * c_dataSize,
cudaMemcpyDeviceToHost, stream3);
cudaStreamSynchronize(stream3);
}
printf("Done\n");
}
I am getting
Starting...
Started... 505200000
1520.5665000 ms: 0, 0, 0
1526.6487000 ms: 0, 0, 0
1530.3077000 ms: 0, 0, 0
1534.4480000 ms: 0, 0, 0
1538.1516000 ms: 0, 0, 0
1541.7932000 ms: 0, 0, 0
1545.4041000 ms: 0, 0, 0
1549.6127000 ms: 0, 0, 0
1553.5760000 ms: 0, 0, 0
1557.2292000 ms: 0, 0, 0
1560.8776000 ms: 0, 0, 0
1564.6736000 ms: 0, 0, 0
1568.8331000 ms: 0, 0, 0
1572.5332000 ms: 0, 0, 0 ...
Windows 10 x64 Pro 19044.2251, Visual Studio 2019 16.0.2, CUDA Toolkit 10.2

Related

kevent & USB serial ports

I'm having trouble using kevent on mac with a USB serial console. I've narrowed it down to:
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/event.h>
#include <sys/ioctl.h>
#include <termios.h>
#define DEVICE "/dev/cu.usbserial-0011111D"
int main() {
int kqueue_fd = kqueue();
if (kqueue_fd < 0) {
printf("Failed to open kqueue\n");
return -1;
}
int device_fd = open(DEVICE, O_RDWR | O_NONBLOCK | O_NOCTTY);
if (device_fd < 0) {
printf("Failed to open device: %s\n", DEVICE);
return -1;
}
printf("Opened %d\n", device_fd);
enum { MAX_EVENTS = 1 };
struct kevent events[MAX_EVENTS];
EV_SET(&events[0], device_fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
int r = kevent(kqueue_fd, events, 1, NULL, 0, NULL);
if (r < 0) {
printf("kevent failed: %s\n", strerror(errno));
return -1;
}
struct timespec sleep_time;
sleep_time.tv_sec = 5;
sleep_time.tv_nsec = 0;
int ready = kevent(kqueue_fd, NULL, 0, (struct kevent*) &events,
MAX_EVENTS, &sleep_time);
if (ready == 0) {
printf("No event\n");
return 0;
}
for (int i = 0; i < ready; i++) {
printf(".ident %ld, .filter %d, .flags 0x%x, .fflags 0x%x, "
".data: %ld, .udata %p\n",
events[i].ident,
events[i].filter,
events[i].flags,
events[i].fflags,
events[i].data,
events[i].udata);
int unread = 0;
r = ioctl(events[i].ident, FIONREAD, &unread);
if (r < 0) {
printf("ioctl failed: %d: %s\n", errno, strerror(errno));
}
}
}
When I run this and unplug the USB device in the middle of the call to kevent(), I get:
Opened 4
.ident 4, .filter -1, .flags 0x1, .fflags 0x0, .data: 6, .udata 0x0
ioctl failed: 6: Device not configured
My understanding is that the contents of the event translates to:
FD 4, EVFILT_READ, EV_ADD, 6 bytes remaining on fd. But the ioctl() fails (since the device was removed), and errno is also 6, so it seems as if event.data is returning the errno, not the bytes remaining.
How can I differentiate between the normal read case and the case where the device has been removed? The filter, flags & fflags appear the same in both cases.
Additional Information
If I switch from opening the serial console to a pipe, and write a single byte followed by closing the write end, I get:
pipe() fd: 5 -> 4
.ident 4, .filter -1, .flags 0x1, .fflags 0x0, .data: 1, .udata 0x0
.ident 4, .filter -1, .flags 0x8001, .fflags 0x0, .data: 0, .udata 0x0
This is what I expect, since 0x8000 is EV_EOF.

Convert binary file to image

I need to find a fast way to convert a binary file to an image.
The binary file consist of a NNN matrix and I want to associate 0 to a color and 1 to a different color.
I need to perform this operation to more then 1000 binary files.
If possible I'd like to avoid using MatLab, is there any tool/software (for unix) that would help me?
EDIT:
This is exactly what I was looking for!
On the bottom of the page it says: "TIP: To process many files, use a shell script to pass this URL and your desired parameters to wget and then direct the output to file"
Yet I can't do this.
I tried with:
wget --post-data="blocksize=10&width=10&offset=0&markval=-1&autoscale=0" \
--post-file="userfile=/path.../filename" http://www.ryanwestafer.com/stuff/bin2img.php \
> output
but all I get is the original page downloaded in my local folder!

If you have python with the PIL (Image) library installed:
import Image
def colormap(s):
s_out = []
for ch in s: # assume always '\x00' or '\x01'
if s == '\x00':
s_out.append('\x00') # black
else:
s_out.append('\xFF') # white
return ''.join(s_out)
N= 50 # for instance
fin = open('myfile.bin','rb')
data = fin.read(N*N) # read NxN bytes
data = colormap(data)
# convert string to grayscale image
img = Image.fromstring('L', (N,N), data )
# save to file
img.save('thisfile.png')
data = fin.read(N*N) # next NxN bytes
data = colormap(data)
img = Image.fromstring('L', (N,N), data )
img.save('thisfile2.png')
This can be easily modified to loop and sequence filenames, etc as needed

For 3D matrixes, I would usually convert them to VRML3D and look at them using ParallelGraphics/Cortona3D.
Otherwise, you need some sort of projection or "slicing" of the matrix in order to see all of the matrix.
This is a C implementation to dump a 3D matrix to a PNG file. Compile with
gcc -W -Wall -o bin2png bin2png.c -lpng
Code:
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <png.h>
static png_structp png_ptr;
static png_infop info_ptr;
/**
|<--- W ---->|
+------------+ -
| 18 19 20| |
+-------------+ | |
| 9 10 11 | | |
+-------------+ |23| +--> H
| 0 1 2 | | | |
| |14 | | |
| | |26| |
| 3 4 5 | |--+ + -
| |17 | /
| |---+ +--> D
| 6 7 8 | /
+-------------+ +
#param matrix a 3D matrix. Element [i,j,k] is A[H*(D*k + j) + i]
#param W width
#param H height
#param D depth
#param WW width in W-sized chunks of target image
#param HH height in H-sized chunks of target image
#param filename output filename in PNG format
Output image:
|<----- WW = 2 --->|
+------------------+ -
| 0 1 2 9 10 11| |
| 3 4 5 12 13 14| |
| 6 7 8 15 16 17| HH = 2
| 18 19 20 | |
| 21 22 23 blank | |
| 24 25 26 | |
+------------------+ -
NOTE: W*WW and H*HH may not exceed 32760.
Return:
0 success
-1 cannot create PNG structure (write)
-2 cannot create PNG structure (info)
-3 out of memory
-4 cannot create output file
*/
int matrix3D_to_png(uint8_t *matrix, size_t W, size_t H, size_t D, size_t WW, size_t HH, char *filename)
{
FILE *fp;
png_color palette[16];
png_byte transparencies[16];
uint32_t y;
size_t x;
uint8_t *row;
if( !(png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL)) )
return -1;
if( !(info_ptr = png_create_info_struct(png_ptr)) || setjmp(png_jmpbuf(png_ptr)) ){
/* If we get here, libpng had a problem writing */
png_destroy_write_struct(&png_ptr, &info_ptr);
return -2;
}
if (NULL == (row = malloc(WW*W + 7)))
{
return -3;
}
/* Create 16-color palette for representation */
#define SETPAL(i,r,g,b,a) \
palette[i].red = r; palette[i].green = g; palette[i].blue = b; transparencies[i] = 255-a;
// We will draw the matrix in red if points are nonzero, black if zero; outside the matrix
// we use transparent white.
#define INDEX_IF_ZERO 0
#define INDEX_IF_NONZERO 3
#define INDEX_IF_BLANK 15
SETPAL(0, 0, 0, 0, 0); // Black
SETPAL(1, 255, 255, 255, 0); // Opaque white
SETPAL(2, 192, 192, 192, 0); // Light gray
SETPAL(3, 255, 0, 0, 0); // Red
SETPAL(4, 0, 255, 0, 0); // Green
SETPAL(5, 0, 0, 255, 0);// Blue
SETPAL(6, 255, 0, 0, 128); // Halftransparent red
SETPAL(7, 0, 255, 0, 128); // green
SETPAL(8, 0, 0, 255, 128); // blue
SETPAL(9, 255, 0, 0, 0); // red again :-)
SETPAL(10, 0, 255, 0, 0);
SETPAL(11, 0, 0, 255, 0);
SETPAL(12, 255, 0, 0, 0);
SETPAL(13, 0, 255, 0, 0);
SETPAL(14, 0, 0, 255, 0);
SETPAL(15, 255, 255, 255, 255); // Transparent white
/* End palette */
/* Create filename */
if (NULL == (fp = fopen(filename, "w")))
{
fprintf(stderr, "cannot open output '%s': %s\n", filename, strerror(errno));
return -4;
}
png_init_io(png_ptr, fp);
png_set_IHDR(png_ptr, info_ptr, W*WW, H*HH, 8, PNG_COLOR_TYPE_PALETTE,
PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
png_set_PLTE(png_ptr, info_ptr, palette, 16);
png_set_tRNS(png_ptr, info_ptr, transparencies, 16, NULL);
png_set_compression_level(png_ptr, Z_BEST_COMPRESSION);
png_write_info(png_ptr, info_ptr);
for (y = 0; y < H*HH; y++)
{
size_t mx = y/H;
mx = (mx*H*WW + (y%H))*W;
for (x = 0; x < WW; x++)
{
if (mx+x*H >= H*D)
memset(row+x*W, INDEX_IF_BLANK, W);
else
{
size_t ii;
for (ii = 0; ii < W; ii++)
row[x*W+ii] = (matrix[mx+x*W*H+ii]) ? INDEX_IF_NONZERO : INDEX_IF_ZERO;
}
}
png_write_row(png_ptr, row);
}
png_write_end(png_ptr, NULL /*info_ptr*/);
png_destroy_write_struct(&png_ptr, &info_ptr);
fclose(fp);
free(row);
return 0;
}
int main(int argc, char **argv)
{
FILE *fp;
uint8_t *matrix;
size_t W, H, D, WW, HH, i;
if (8 != argc)
{
fprintf(stderr, "Syntax: %s input output.png width height depth TileX TileY\n", *argv);
return EXIT_FAILURE;
}
W = atol(argv[3]);
H = atol(argv[4]);
D = atol(argv[5]);
WW = atol(argv[6]);
HH = atol(argv[7]);
if ((W * WW > 32767)||(H * HH) > 32767)
{
fprintf(stderr, "Output image would be too large\n");
return EXIT_FAILURE;
}
if (WW*HH < D)
{
fprintf(stderr, "WARNING: matrix does not fit into output image\n");
}
if (WW*HH > D*2)
{
fprintf(stderr, "WARNING: output image is far larger than input matrix\n");
}
if (NULL == (fp = fopen(argv[1], "r")))
{
fprintf(stderr, "Input file not found\n");
return EXIT_FAILURE;
}
if (NULL == (matrix = malloc(W*H*D)))
{
fprintf(stderr, "Out of memory: matrix too large\n");
return EXIT_FAILURE;
}
for (i = 0; i < D; i++)
{
int ret;
if ((int)H != (ret = fread(matrix + W*H*i, W, H, fp)))
{
fprintf(stderr, "Read error at plane %d (reading %d rows of %d elements, expecting %d, got %d)\n",
(int)i, (int)W, (int)H, (int)H, ret);
fclose(fp);
return EXIT_FAILURE;
}
}
if (matrix3D_to_png(matrix, W, H, D, WW, HH, argv[2]))
{
fprintf(stderr, "Error in creating output PNG '%s'\n", argv[2]);
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}

GNU Octave is a free Matlab-like program that a lot of people seem to like.
This site has a whole list of free alternatives: http://www.math.tu-berlin.de/~ehrhardt/matlab_alternatives.html

find the index of the highest bit set of a 32-bit number without loops obviously

Here's a tough one(atleast i had a hard time :P):
find the index of the highest bit set of a 32-bit number without using any loops.

With recursion:
int firstset(int bits) {
return (bits & 0x80000000) ? 31 : firstset((bits << 1) | 1) - 1;
}
Assumes [31,..,0] indexing
Returns -1 if no bits set
| 1 prevents stack overflow by capping the number of shifts until a 1 is reached (32)
Not tail recursive :)

Very interesting question, I will provide you an answer with benchmark
Solution using a loop
uint8_t highestBitIndex( uint32_t n )
{
uint8_t r = 0;
while ( n >>= 1 )
r++;
return r;
}
This help to better understand the question but is highly inefficient.
Solution using log
This approach can also be summarize by the log method
uint8_t highestSetBitIndex2(uint32_t n) {
return (uint8_t)(log(n) / log(2));
}
However it is also inefficient (even more than above one, see benchmark)
Solution using built-in instruction
uint8_t highestBitIndex3( uint32_t n )
{
return 31 - __builtin_clz(n);
}
This solution, while very efficient, suffer from the fact that it only work with specific compilers (gcc and clang will do) and on specific platforms.
NB: It is 31 and not 32 if we want the index
Solution with intrinsic
#include <x86intrin.h>
uint8_t highestSetBitIndex5(uint32_t n)
{
return _bit_scan_reverse(n); // undefined behavior if n == 0
}
This will call the bsr instruction at assembly level
Solution using inline assembly
LZCNT and BSR can be summarize in assembly with the below functions:
uint8_t highestSetBitIndex4(uint32_t n) // undefined behavior if n == 0
{
__asm__ __volatile__ (R"(
.intel_syntax noprefix
bsr eax, edi
.att_syntax noprefix
)"
);
}
uint8_t highestSetBitIndex7(uint32_t n) // undefined behavior if n == 0
{
__asm__ __volatile__ (R"(.intel_syntax noprefix
lzcnt ecx, edi
mov eax, 31
sub eax, ecx
.att_syntax noprefix
)");
}
NB: Do Not Use unless you know what you are doing
Solution using lookup table and magic number multiplication (probably the best AFAIK)
First you use the following function to clear all the bits except the highest one:
uint32_t keepHighestBit( uint32_t n )
{
n |= (n >> 1);
n |= (n >> 2);
n |= (n >> 4);
n |= (n >> 8);
n |= (n >> 16);
return n - (n >> 1);
}
Credit: The idea come from Henry S. Warren, Jr. in his book Hacker's Delight
Then we use an algorithm based on DeBruijn's Sequence to perform a kind of binary search:
uint8_t highestBitIndex8( uint32_t b )
{
static const uint32_t deBruijnMagic = 0x06EB14F9; // equivalent to 0b111(0xff ^ 3)
static const uint8_t deBruijnTable[64] = {
0, 0, 0, 1, 0, 16, 2, 0, 29, 0, 17, 0, 0, 3, 0, 22,
30, 0, 0, 20, 18, 0, 11, 0, 13, 0, 0, 4, 0, 7, 0, 23,
31, 0, 15, 0, 28, 0, 0, 21, 0, 19, 0, 10, 12, 0, 6, 0,
0, 14, 27, 0, 0, 9, 0, 5, 0, 26, 8, 0, 25, 0, 24, 0,
};
return deBruijnTable[(keepHighestBit(b) * deBruijnMagic) >> 26];
}
Another version:
void propagateBits(uint32_t *n) {
*n |= *n >> 1;
*n |= *n >> 2;
*n |= *n >> 4;
*n |= *n >> 8;
*n |= *n >> 16;
}
uint8_t highestSetBitIndex8(uint32_t b)
{
static const uint32_t Magic = (uint32_t) 0x07C4ACDD;
static const int BitTable[32] = {
0, 9, 1, 10, 13, 21, 2, 29,
11, 14, 16, 18, 22, 25, 3, 30,
8, 12, 20, 28, 15, 17, 24, 7,
19, 27, 23, 6, 26, 5, 4, 31,
};
propagateBits(&b);
return BitTable[(b * Magic) >> 27];
}
Benchmark with 100 million calls
compiling with g++ -std=c++17 highestSetBit.cpp -O3 && ./a.out
highestBitIndex1 136.8 ms (loop)
highestBitIndex2 183.8 ms (log(n) / log(2))
highestBitIndex3 10.6 ms (de Bruijn lookup Table with power of two, 64 entries)
highestBitIndex4 4.5 ms (inline assembly bsr)
highestBitIndex5 6.7 ms (intrinsic bsr)
highestBitIndex6 4.7 ms (gcc lzcnt)
highestBitIndex7 7.1 ms (inline assembly lzcnt)
highestBitIndex8 10.2 ms (de Bruijn lookup Table, 32 entries)
I would personally go for highestBitIndex8 if portability is your focus, else gcc built-in is nice.

Floor of logarithm-base-two should do the trick (though you have to special-case 0).
Floor of log base 2 of 0001 is 0 (bit with index 0 is set).
" " of 0010 is 1 (bit with index 1 is set).
" " of 0011 is 1 (bit with index 1 is set).
" " of 0100 is 2 (bit with index 2 is set).
and so on.
On an unrelated note, this is actually a pretty terrible interview question (I say this as someone who does technical interviews for potential candidates), because it really doesn't correspond to anything you do in practical programming.
Your boss isn't going to come up to you one day and say "hey, so we have a rush job for this latest feature, and it needs to be implemented without loops!"

You could do it like this (not optimised):
int index = 0;
uint32_t temp = number;
if ((temp >> 16) != 0) {
temp >>= 16;
index += 16;
}
if ((temp >> 8) != 0) {
temp >>= 8
index += 8;
}
...

sorry for bumping an old thread, but how about this
inline int ilog2(unsigned long long i) {
union { float f; int i; } = { i };
return (u.i>>23)-27;
}
...
int highest=ilog2(x); highest+=(x>>highest)-1;
// and in case you need it
int lowest = ilog2((x^x-1)+1)-1;

this can be done as a binary search, reducing complexity of O(N) (for an N-bit word) to O(log(N)). A possible implementation is:
int highest_bit_index(uint32_t value)
{
if(value == 0) return 0;
int depth = 0;
int exponent = 16;
while(exponent > 0)
{
int shifted = value >> (exponent);
if(shifted > 0)
{
depth += exponent;
if(shifted == 1) return depth + 1;
value >>= exponent;
}
exponent /= 2;
}
return depth + 1;
}
the input is a 32 bit unsigned integer.
it has a loop that can be converted into 5 levels of if-statements , therefore resulting in 32 or so if-statements. you could also use recursion to get rid of the loop, or the absolutely evil "goto" ;)

Let
n - Decimal number for which bit location to be identified
start - Indicates decimal value of ( 1 << 32 ) - 2147483648
bitLocation - Indicates bit location which is set to 1
public int highestBitSet(int n, long start, int bitLocation)
{
if (start == 0)
{
return 0;
}
if ((start & n) > 0)
{
return bitLocation;
}
else
{
return highestBitSet(n, (start >> 1), --bitLocation);
}
}
long i = 1;
long startIndex = (i << 31);
int bitLocation = 32;
int value = highestBitSet(64, startIndex, bitLocation);
System.out.println(value);

int high_bit_set(int n, int pos)
{
if(pos<0)
return -1;
else
return (0x80000000 & n)?pos:high_bit_set((n<<1),--pos);
}
main()
{
int n=0x23;
int high_pos = high_bit_set(n,31);
printf("highest index = %d",high_pos);
}
From your main call function high_bit_set(int n , int pos) with the input value n, and default 31 as the highest position. And the function is like above.

Paislee's solution is actually pretty easy to make tail-recursive, though, it's a much slower solution than the suggested floor(log2(n));
int firstset_tr(int bits, int final_dec) {
// pass in 0 for final_dec on first call, or use a helper function
if (bits & 0x80000000) {
return 31-final_dec;
} else {
return firstset_tr( ((bits << 1) | 1), final_dec+1 );
}
}
This function also works for other bit sizes, just change the check,
e.g.
if (bits & 0x80) { // for 8-bit
return 7-final_dec;
}

Note that what you are trying to do is calculate the integer log2 of an integer,
#include <stdio.h>
#include <stdlib.h>
unsigned int
Log2(unsigned long x)
{
unsigned long n = x;
int bits = sizeof(x)*8;
int step = 1; int k=0;
for( step = 1; step < bits; ) {
n |= (n >> step);
step *= 2; ++k;
}
//printf("%ld %ld\n",x, (x - (n >> 1)) );
return(x - (n >> 1));
}
Observe that you can attempt to search more than 1 bit at a time.
unsigned int
Log2_a(unsigned long x)
{
unsigned long n = x;
int bits = sizeof(x)*8;
int step = 1;
int step2 = 0;
//observe that you can move 8 bits at a time, and there is a pattern...
//if( x>1<<step2+8 ) { step2+=8;
//if( x>1<<step2+8 ) { step2+=8;
//if( x>1<<step2+8 ) { step2+=8;
//}
//}
//}
for( step2=0; x>1L<<step2+8; ) {
step2+=8;
}
//printf("step2 %d\n",step2);
for( step = 0; x>1L<<(step+step2); ) {
step+=1;
//printf("step %d\n",step+step2);
}
printf("log2(%ld) %d\n",x,step+step2);
return(step+step2);
}
This approach uses a binary search
unsigned int
Log2_b(unsigned long x)
{
unsigned long n = x;
unsigned int bits = sizeof(x)*8;
unsigned int hbit = bits-1;
unsigned int lbit = 0;
unsigned long guess = bits/2;
int found = 0;
while ( hbit-lbit>1 ) {
//printf("log2(%ld) %d<%d<%d\n",x,lbit,guess,hbit);
//when value between guess..lbit
if( (x<=(1L<<guess)) ) {
//printf("%ld < 1<<%d %ld\n",x,guess,1L<<guess);
hbit=guess;
guess=(hbit+lbit)/2;
//printf("log2(%ld) %d<%d<%d\n",x,lbit,guess,hbit);
}
//when value between hbit..guess
//else
if( (x>(1L<<guess)) ) {
//printf("%ld > 1<<%d %ld\n",x,guess,1L<<guess);
lbit=guess;
guess=(hbit+lbit)/2;
//printf("log2(%ld) %d<%d<%d\n",x,lbit,guess,hbit);
}
}
if( (x>(1L<<guess)) ) ++guess;
printf("log2(x%ld)=r%d\n",x,guess);
return(guess);
}
Another binary search method, perhaps more readable,
unsigned int
Log2_c(unsigned long x)
{
unsigned long v = x;
unsigned int bits = sizeof(x)*8;
unsigned int step = bits;
unsigned int res = 0;
for( step = bits/2; step>0; )
{
//printf("log2(%ld) v %d >> step %d = %ld\n",x,v,step,v>>step);
while ( v>>step ) {
v>>=step;
res+=step;
//printf("log2(%ld) step %d res %d v>>step %ld\n",x,step,res,v);
}
step /= 2;
}
if( (x>(1L<<res)) ) ++res;
printf("log2(x%ld)=r%ld\n",x,res);
return(res);
}
And because you will want to test these,
int main()
{
unsigned long int x = 3;
for( x=2; x<1000000000; x*=2 ) {
//printf("x %ld, x+1 %ld, log2(x+1) %d\n",x,x+1,Log2(x+1));
printf("x %ld, x+1 %ld, log2_a(x+1) %d\n",x,x+1,Log2_a(x+1));
printf("x %ld, x+1 %ld, log2_b(x+1) %d\n",x,x+1,Log2_b(x+1));
printf("x %ld, x+1 %ld, log2_c(x+1) %d\n",x,x+1,Log2_c(x+1));
}
return(0);
}

well from what I know the function Log is Implemented very efficiently in most programming languages, and even if it does contain loops , it is probably very few of them , internally
So I would say that in most cases using the log would be faster , and more direct.
you do have to check for 0 though and avoid taking the log of 0, as that would cause the program to crash.

Single-Sided communications with MPI-2

Consider the following fragment of OpenMP code which transfers private data between two threads using an intermediate shared variable
#pragma omp parallel shared(x) private(a,b)
{
...
a = somefunction(b);
if (omp_get_thread_num() == 0) {
x = a;
}
}
#pragma omp parallel shared(x) private(a,b)
{
if (omp_get_thread_num() == 1) {
a = x;
}
b = anotherfunction(a);
...
}
I would (in pseudocode ) need to transfer of private data from one process to another using a single-sided message-passing library.
Any ideas?

This is possible, but there's a lot more "scaffolding" involved -- after all, you are communicating data between potentially completely different computers.
The coordination for this sort of thing is done between windows of data which are accessible from other processors, and with lock/unlock operations which coordinate the access of this data. The locks aren't really locks in the sense of being mutexes, but they are more like synchronization points coordinating data access to the window.
I don't have time right now to explain this in the detail I'd like, but below is an example of using MPI2 to do something like shared memory flagging in a system that doesn't have shared memory:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"
int main(int argc, char** argv)
{
int rank, size, *a, geta;
int x;
int ierr;
MPI_Win win;
const int RCVR=0;
const int SENDER=1;
ierr = MPI_Init(&argc, &argv);
ierr |= MPI_Comm_rank(MPI_COMM_WORLD, &rank);
ierr |= MPI_Comm_size(MPI_COMM_WORLD, &size);
if (ierr) {
fprintf(stderr,"Error initializing MPI library; failing.\n");
exit(-1);
}
if (rank == RCVR) {
MPI_Alloc_mem(sizeof(int), MPI_INFO_NULL, &a);
*a = 0;
} else {
a = NULL;
}
MPI_Win_create(a, 1, sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD, &win);
if (rank == SENDER) {
/* Lock recievers window */
MPI_Win_lock(MPI_LOCK_EXCLUSIVE, RCVR, 0, win);
x = 5;
/* put 1 int (from &x) to 1 int rank RCVR, at address 0 in window "win"*/
MPI_Put(&x, 1, MPI_INT, RCVR, 0, 1, MPI_INT, win);
/* Unlock */
MPI_Win_unlock(0, win);
printf("%d: My job here is done.\n", rank);
}
if (rank == RCVR) {
for (;;) {
MPI_Win_lock(MPI_LOCK_EXCLUSIVE, RCVR, 0, win);
MPI_Get(&geta, 1, MPI_INT, RCVR, 0, 1, MPI_INT, win);
MPI_Win_unlock(0, win);
if (geta == 0) {
printf("%d: a still zero; sleeping.\n",rank);
sleep(2);
} else
break;
}
printf("%d: a now %d!\n",rank,geta);
printf("a = %d\n", *a);
MPI_Win_free(&win);
if (rank == RCVR) MPI_Free_mem(a);
MPI_Finalize();
return 0;
}

OpenCl cleanup causes segfault

I constructed my own little Opencl example using different sources on the net. The actual kernel works, and I get the output I want, but the cleanup functions, I found in one of the examples, cause segfaults. What did I do wrong?
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <CL/cl.h> //opencl
#define CL_CHECK(_expr) \
do { \
cl_int _err = _expr; \
if (_err == CL_SUCCESS) \
break; \
fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
abort(); \
} while (0)
#define CL_CHECK_ERR(_expr) \
({ \
cl_int _err = CL_INVALID_VALUE; \
typeof(_expr) _ret = _expr; \
if (_err != CL_SUCCESS) { \
fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
abort(); \
} \
_ret; \
})
const char* OpenCLSource[] = {
"__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)",
"{",
" // Index of the elements to add \n",
" unsigned int n = get_global_id(0);",
" // Sum the n’th element of vectors a and b and store in c \n",
" c[n] = a[n] + b[n];",
"}"
};
cl_device_id* init_opencl(cl_context *GPUContext,cl_command_queue *GPUCommandQueue, cl_kernel* cl_forward1,cl_program* OpenCLProgram){
// Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
cl_int _err;
*GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &_err) ;
printf("\n1-%i\n",_err);
// Get the list of GPU devices associated with this context
size_t ParmDataBytes;
CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes));
cl_device_id* GPUDevices;
GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL));
// Create a command-queue on the first GPU device
*GPUCommandQueue = clCreateCommandQueue(*GPUContext, GPUDevices[0], 0, &_err);
printf("\n2-%i\n",_err);
// Create OpenCL program with source code
*OpenCLProgram = clCreateProgramWithSource(*GPUContext, 7, OpenCLSource, NULL, &_err);
printf("\n3-%i\n",_err);
CL_CHECK(clBuildProgram(*OpenCLProgram, 0,
NULL, NULL, NULL, NULL));
cl_int errcode;
*cl_forward1 = clCreateKernel(*OpenCLProgram,
"VectorAdd", &errcode);
printf("\n7-%i\n",errcode);
return GPUDevices;
}
int main(int argc, char** argv)
{
cl_context GPUContext;
cl_command_queue GPUCommandQueue;
cl_program OpenCLProgram;
cl_kernel OpenCLVectorAdd;
cl_device_id* GPUDevices;
GPUDevices=init_opencl(&GPUContext,&GPUCommandQueue,&OpenCLVectorAdd,&OpenCLProgram);
// Two integer source vectors in Host memory
int n=5 ;
int x[5]={1,2,4,6,8};
int y[5]={1,2,4,6,8};
int output[n];
int size_x = n*sizeof(x);
int size_y = n*sizeof(y);
int size_output = n*sizeof(output); // this changes for the second forward1
cl_int _err;
// Allocate GPU memory for source vectors AND initialize from CPU memory
cl_mem x_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, size_x, x, &_err);
printf("\n4-%i\n",_err);
cl_mem y_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, size_y, y, &_err);
printf("\n5-%i\n",_err);
// Allocate output memory on GPU
cl_mem total_cl = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,
size_output, NULL, &_err);
printf("\n6-%i\n",_err);
// In the next step we associate the GPU memory with the Kernel arguments
clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&total_cl);
clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&x_cl);
clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&y_cl);
// 7. Launch OpenCL kernel
size_t localWorkSize[1], globalWorkSize[1];
//localWorkSize = ;
globalWorkSize[0] = n;
// Launch the Kernel on the GPU
CL_CHECK(clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL));
// Copy the output in GPU memory back to CPU memory
//float* h_C = (float*) malloc(size_output);
CL_CHECK(clEnqueueReadBuffer(GPUCommandQueue,
total_cl, CL_TRUE, 0, size_output,
output, 0, NULL, NULL));
for (int i=0; i<n;i++){
printf("\n%i",output[i]);
}
// Cleanup (each of the following lines causes a seg fault
// ******************************
CL_CHECK(free(GPUDevices));
CL_CHECK(clReleaseKernel(OpenCLVectorAdd));
CL_CHECK(clReleaseProgram(OpenCLProgram));
CL_CHECK(clReleaseCommandQueue(GPUCommandQueue));
CL_CHECK(clReleaseContext(GPUContext));
CL_CHECK(clReleaseMemObject(total_cl));
CL_CHECK(clReleaseMemObject(x_cl));
CL_CHECK(clReleaseMemObject(y_cl));
/* ****************
return 0;
}
Merci!

For people who arrives here in the future:
As Brafford suggested, this is resolved by adding clFinish(GPUCommandQueue) after clEnqueueNDRangeKernel as well as clEnqueueReadBuffer.
Apparently trying to clean up any object (e.g. release a queue) that is still under execution yields segmentation fault.

I corrected and changed several small things. So this code should work now.
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <CL/cl.h> //opencl
#define CL_CHECK(_expr) \
do { \
cl_int _err = _expr; \
if (_err == CL_SUCCESS) \
break; \
fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
abort(); \
} while (0)
#define CL_CHECK_ERR(_expr) \
({ \
cl_int _err = CL_INVALID_VALUE; \
typeof(_expr) _ret = _expr; \
if (_err != CL_SUCCESS) { \
fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
abort(); \
} \
_ret; \
})
const char* OpenCLSource[] = {
"__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)",
"{",
" // Index of the elements to add \n",
" unsigned int n = get_global_id(0);",
" // Sum the n’th element of vectors a and b and store in c \n",
" c[n] = a[n] + b[n];",
"}"
};
cl_device_id* init_opencl(cl_context *GPUContext,cl_command_queue *GPUCommandQueue, cl_kernel* cl_forward1,cl_program* OpenCLProgram){
// Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
cl_int _err;
*GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &_err) ;
printf("\nclCreateContextFromType:%i\n",_err);
// Get the list of GPU devices associated with this context
size_t ParmDataBytes;
CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes));
cl_device_id* GPUDevices;
GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL));
// Create a command-queue on the first GPU device
*GPUCommandQueue = clCreateCommandQueue(*GPUContext, GPUDevices[0], 0, &_err);
printf("\nclCreateCommandQueue:%i\n",_err);
// Create OpenCL program with source code
*OpenCLProgram = clCreateProgramWithSource(*GPUContext, 7, OpenCLSource, NULL, &_err);
printf("\nclCreateProgramWithSource:%i\n",_err);
CL_CHECK(clBuildProgram(*OpenCLProgram, 0,
NULL, NULL, NULL, NULL));
cl_int errcode;
*cl_forward1 = clCreateKernel(*OpenCLProgram,
"VectorAdd", &errcode);
printf("\nclCreateKernel:%i\n",errcode);
return GPUDevices;
}
int main(int argc, char** argv)
{
cl_context GPUContext;
cl_command_queue GPUCommandQueue;
cl_program OpenCLProgram;
cl_kernel OpenCLVectorAdd;
cl_device_id* GPUDevices;
GPUDevices=init_opencl(&GPUContext,&GPUCommandQueue,&OpenCLVectorAdd,&OpenCLProgram);
int n=5 ;
int x[5]={1,2,4,6,8};
int y[5]={1,2,4,6,8};
int output[n];
int size_x = n*sizeof(x);
int size_y = n*sizeof(y);
int size_output = n*sizeof(output);
cl_int _err;
// Allocate GPU memory for source vectors AND initialize from CPU memory
cl_mem x_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, size_x, x, &_err);
printf("\nclCreateBuffer:%i\n",_err);
cl_mem y_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, size_y, y, &_err);
printf("\nclCreateBuffer:%i\n",_err);
// Allocate output memory on GPU
cl_mem total_cl = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,
size_output, NULL, &_err);
printf("\nclCreateBuffer:%i\n",_err);
// In the next step we associate the GPU memory with the Kernel arguments
clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&total_cl);
clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&x_cl);
clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&y_cl);
size_t globalWorkSize[1];
globalWorkSize[0] = n;
// Launch the Kernel on the GPU
CL_CHECK(clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL));
clFinish(GPUCommandQueue);
// Copy the output in GPU memory back to CPU memory
int* h_c = (int*) malloc(size_output);
CL_CHECK(clEnqueueReadBuffer(GPUCommandQueue,
total_cl, CL_TRUE, 0, size_output,
h_c, 0, NULL, NULL));
clFinish(GPUCommandQueue);
for (int i=0; i<n;i++){
printf("\noutput[%i]=%i",i,h_c[i]);
}
// Cleanup
free(GPUDevices);
CL_CHECK(clReleaseKernel(OpenCLVectorAdd));
CL_CHECK(clReleaseProgram(OpenCLProgram));
CL_CHECK(clReleaseCommandQueue(GPUCommandQueue));
CL_CHECK(clReleaseContext(GPUContext));
CL_CHECK(clReleaseMemObject(x_cl));
CL_CHECK(clReleaseMemObject(total_cl));
CL_CHECK(clReleaseMemObject(y_cl));
return 0;
}

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

Why GPU memory updates are not read? - visual-studio

Related

kevent & USB serial ports

Convert binary file to image

find the index of the highest bit set of a 32-bit number without loops obviously

Single-Sided communications with MPI-2

OpenCl cleanup causes segfault

Categories

Resources