Unable to compile simple CUDA example - compilation

test.cu:
#include <iostream>
#include "book.h"
__global__ void add( int a, int b, int *c ) {
*c = a + b;
}
int main( void ) {
int c;
int *dev_c;
HANDLE_ERROR( cudaMalloc( (void**)&dev_c, sizeof(int) ) );
add<<<1,1>>>( 2, 7, dev_c );
HANDLE_ERROR( cudaMemcpy( &c,
dev_c,
sizeof(int),
cudaMemcpyDeviceToHost ) );
printf( "2 + 7 = %d\n", c );
cudaFree( dev_c );
return 0;
}
I am trying to compile above example test.cu. I tried with nvcc test.cu but compiler gives error
4.cu:2:18: fatal error: book.h: No such file or directory
compilation terminated.
How can I tell compiler where book.h is present? I have installed CUDA in /usr/local/cuda.
Do I need to make Makefile?
I am new to CUDA and Makefile so question might seem trivial.

Book.h is not CUDA. It is used by "Cuda by Example" for some easy stuff.
In this example it is needed to provide the HANDLE_ERROR, you should write your own code to handle errors.
Here you can find the book.h code: http://code.google.com/p/cuda-examples/source/browse/trunk/common/book.h?r=3

I believe that using quotes ("") tells the compiler to look in the same directory as the code file, so you may want to try <book.h> instead of "book.h.
Presuming that book.h is a file included with CUDA. I've never used it before.

Related

how to compile Cuda source with Go language's cgo?

I wrote a simple program in cuda-c and it works on eclipse nsight. This is source code:
#include <iostream>
#include <stdio.h>
__global__ void add( int a,int b, int *c){
*c = a + b;
}
int main(void){
int c;
int *dev_c;
cudaMalloc((void**)&dev_c, sizeof(int));
add <<<1,1>>>(2,7,dev_c);
cudaMemcpy(&c, dev_c, sizeof(int),cudaMemcpyDeviceToHost);
printf("\n2+7= %d\n",c);
cudaFree(dev_c);
return 0;
}
Now I'm trying to use this code with Go language with cgo!!!
So I wrote this new code:
package main
//#include "/usr/local/cuda-7.0/include/cuda.h"
//#include "/usr/local/cuda-7.0/include/cuda_runtime.h"
//#cgo LDFLAGS: -lcuda
//#cgo LDFLAGS: -lcurand
////default location:
//#cgo LDFLAGS: -L/usr/local/cuda-7.0/lib64 -L/usr/local/cuda-7.0/lib
//#cgo CFLAGS: -I/usr/local/cuda-7.0/include/
//
//
//
//
//
//
//
//
//
//
/*
#include <stdio.h>
__global__ void add( int a,int b, int *c){
*c = a + b;
}
int esegui_somma(void){
int c;
int *dev_c;
cudaMalloc((void**)&dev_c, sizeof(int));
add <<<1,1>>> (2,7,dev_c);
cudaMemcpy(&c, dev_c, sizeof(int),cudaMemcpyDeviceToHost);
cudaFree(dev_c);
return c;
}
*/
import "C"
import "fmt"
func main(){
fmt.Printf("il risultato è %d",C.esegui_somma)
}
But it doesn't work!!
I read this error message:
cgo_cudabyexample_1/main.go:34:8: error: expected expression before '<' token
add <<<1,1>>> (2,7,dev_c);
^
I think that I must to set nvcc cuda compiler for cgo instead of gcc.
How can I do it? Can I change CC environment variable?
best regards
I finally figured out how to do this. Thing biggest problem is that nvccdoes not follow gcc standard flags and unlike clang it won't silently ignore them. cgo triggers the problem by adding a bunch of flags not explicitly specified by the user.
To make it all work, you'll need to separate out your device code and the functions that directly call it into separate files and compile/package them directly using nvcc into a shared library (.so). Then you'll use cgo to link this shared library using whatever default linker you have on your system. The only thing you'll have to add is -lcudart to your LDFLAGS (linker flags) to link the CUDA runtime.

How to enable the _Generic keyword

I have this test source:
#include <stdio.h>
int main()
{
int x;
printf("x=%d\n", _Generic('x', int: 1, default: 0));
return 0;
}
Compiling with c++ (from GCC 4.9.2) fails:
t.cpp: In function ‘int main()’:
t.cpp:7:33: error: expected primary-expression before ‘int’
printf("x=%d\n", _Generic('x', int: 1, default: 0));
^
t.cpp:7:41: error: expected primary-expression before ‘default’
printf("x=%d\n", _Generic('x', int: 1, default: 0));
^
t.cpp:7:51: error: ‘_Generic’ was not declared in this scope
printf("x=%d\n", _Generic('x', int: 1, default: 0));
The compiler arguments are:
c++ --std=c++11 t.cpp -o t
What am I doing wrong?
_Generic is a C11 feature. It is not present in C++ (any version at least up to C++14 - I don't really expect it to be added either).
If you want to use it, you'll need to write C code, and use a compiler that supports that standard (reasonably recent versions of gcc and clang do for example, using -std=c11).
If you want to write C++, use overloading or templates instead, for example:
#include <iostream>
int foo(int) { return 1; }
int foo(char) { return 0; }
int main()
{
std::cout << "x=" << foo('x') << std::endl;
}
This prints x=0 in C++, the foo(char) overload is the best match.
Note that there's difference between C and C++ that might trick you here too: 'x' is a char in C++. It's an int in C. So if _Generic had been implemented (maybe as an extension) by your compiler, chances are you'd get different output when compiling your example as C versus compiling as C++.
Here's the C++ form (forgive me for using the using directive, I know its bad form):
#include <iostream>
using namespace std;
template< typename T> T do_something(T argument) {
// Put here what you need
}
int main()
{
int x;
cout << "x" << (x = do_something(x));
return 0;
}
_Generic is C11, you're probably using a C++ compiler when you meant to use a C compiler.

Separating out .cu and .cpp(using c++11 library)

I am trying to convert a c++ program I have which uses random library which is a C++11 feature. After having read through a couple of similar posts here, I tried by separating out the code into three files. At the outset I would like to say that I am not very conversant at C/C++ and mostly use R at work.
The main file looks as follows.
#ifndef _KERNEL_SUPPORT_
#define _KERNEL_SUPPORT_
#include <complex>
#include <random>
#include <iostream>
#include "my_code_header.h"
using namespace std;
std::default_random_engine generator;
std::normal_distribution<double> distribution(0.0,1.0);
const int rand_mat_length = 24561;
double rand_mat[rand_mat_length];// = {0};
void create_std_norm(){
for(int i = 0 ; i < rand_mat_length ; i++)
::rand_mat[i] = distribution(generator);
}
.
.
.
int main(void)
{
...
...
call_global();
return 0;
}
#endif
The header file looks as follows.
#ifndef mykernel_h
#define mykernel_h
void call_global();
void two_d_example(double *a, double *b, double *my_result, size_t length, size_t width);
#endif
And the .cu file looks like the following.
#ifndef _MY_KERNEL_
#define _MY_KERNEL_
#include <iostream>
#include "my_code_header.h"
#define TILE_WIDTH 8
using namespace std;
__global__ void two_d_example(double *a, double *b, double *my_result, size_t length, size_t width)
{
unsigned int row = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int col = blockIdx.x*blockDim.x + threadIdx.x;
if ((row>length) || (col>width)) {
return;
}
...
}
void call_global()
{
const size_t imageLength = 528;
const size_t imageWidth = 528;
const dim3 threadsPerBlock(TILE_WIDTH,TILE_WIDTH);
const dim3 numBlocks(((imageLength) / threadsPerBlock.x), ((imageWidth) / threadsPerBlock.y));
double *d_a, *d_b, *mys ;
...
cudaMalloc((void**)&d_a, sizeof(double) * imageLength);
cudaMalloc((void**)&d_b, sizeof(double) * imageWidth);
cudaMalloc((void**)&mys, sizeof(double) * imageLength * imageWidth);
two_d_example<<<numBlocks,threadsPerBlock>>>(d_a, d_b, mys, imageLength, imageWidth);
...
cudaFree(d_a);
cudaFree(d_b);
}
#endif
Please note that the __global__ has been removed from .h since I was getting the following error owing to it being compiled by g++.
In file included from my_code_main.cpp:12:0:
my_code_header.h:5:1: error: ‘__global__’ does not name a type
When I compile the .cu file with nvcc it is all fine and generates a my_code_kernel.o. But since I am using C++11 in my .cpp I am trying to compile it with g++ and I am getting the following error.
/tmp/ccR2rXzf.o: In function `main':
my_code_main.cpp:(.text+0x1c4): undefined reference to `call_global()'
collect2: ld returned 1 exit status
I understand that this might not have to do anything with CUDA as such and may just be the wrong use of including the header at both places. Also what is the right way to compile and most importantly link the my_code_kernel.o and my_code_main.o(hopefully)? Sorry if this question is too trivial!
It looks like you are not linking with my_code_kernel.o. You have used -c for your nvcc command (causes it to compile but not link, i.e. generate the .o file), I'm going to guess that you're not using -c with your g++ command, in which case you need to add my_code_kernel.o to the list of inputs as well as the .cpp file.
The separation you are trying to achieve is completely possible, it just looks like your not linking properly. If you still have problems, add the compilation commands to your question.
FYI: You don't need to declare two_d_example() in your header file, it is only used within your .cu file (from call_global()).

CUDA 5.0 "Generate Relocatable Device Code" leads to invalid device symbol error

I am trying to do separate compilation using CUDA 5. For this reason I set the "Generate Relocatable Device Code" to "Yes (-rdc=true)" in Visual Studio 2010. The program compiles without errors, however,
I get an invalid device symbol error when I try to initialize device constants using cudaMemcpyToSymbol.
i.e. I have the following constant
__constant__ float gdDomainOrigin[2];
and try to initialize it with
cudaMemcpyToSymbol(gdDomainOrigin, mDomainOrigin, 2*sizeof(float));
which leads to the error. The error does not occur, when I compile everything as a whole, without the aforementioned option set. Could anybody please help me with that?
I can't reproduce this. If build an application from two .cu files, one containing a __constant__ symbol and a simple kernel, and the other containing the runtime API incantations to populate that constant memory and call the kernel, it works only when relocatable device code is enabled, viz:
__constant__ float gdDomainOrigin[2];
__global__
void kernel(float *inout)
{
inout[0] = gdDomainOrigin[0];
inout[1] = gdDomainOrigin[1];
}
and
#include <cstdio>
extern __constant__ float gdDomainOrigin;
extern __global__ void kernel(float *);
inline
void gpuAssert(cudaError_t code, char * file, int line, bool Abort=true)
{
if (code != 0) {
fprintf(stderr, "GPUassert: %s %s %d\n",
cudaGetErrorString(code),file,line);
if (Abort) exit(code);
}
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
int main(void)
{
const float mDomainOrigin[2] = { 1.234f, 5.6789f };
const size_t sz = sizeof(float) * size_t(2);
float * dbuf, * hbuf;
gpuErrchk( cudaFree(0) );
gpuErrchk( cudaMemcpyToSymbol(gdDomainOrigin, mDomainOrigin, sz) );
gpuErrchk( cudaMalloc((void **)&dbuf, sz) );
kernel<<<1,1>>>(dbuf);
gpuErrchk( cudaPeekAtLastError() );
hbuf = new float[2];
gpuErrchk( cudaMemcpy(hbuf, dbuf, sz, cudaMemcpyDeviceToHost) );
fprintf(stdout, "%f %f\n", hbuf[0], hbuf[1]);
return 0;
}
Compiling and running these in CUDA 5 on a 64 bit linux system with a Kepler GPU produces the following:
$ nvcc -arch=sm_30 -o shared shared.cu shared_dev.cu
$ ./shared
GPUassert: invalid device symbol shared.cu 23
$ nvcc -arch=sm_30 -rdc=true -o shared shared.cu shared_dev.cu
$ ./shared
1.234000 5.678900
You can see that in the first compilation, without relocatable GPU code generation, the symbol isn't found. In the second case, with relocatable GPU code generation, it is found, and the elf header in the object file looks just as you would expect:
$ nvcc -arch=sm_30 -rdc=true -c shared_dev.cu
$ cuobjdump -symbols shared_dev.o
Fatbin elf code:
================
arch = sm_30
code version = [1,6]
producer = cuda
host = linux
compile_size = 64bit
identifier = shared_dev.cu
symbols:
STT_SECTION STB_LOCAL .text._Z6kernelPf
STT_SECTION STB_LOCAL .nv.constant3
STT_SECTION STB_LOCAL .nv.constant0._Z6kernelPf
STT_CUDA_OBJECT STB_LOCAL _param
STT_SECTION STB_LOCAL .nv.callgraph
STT_FUNC STB_GLOBAL _Z6kernelPf
STT_CUDA_OBJECT STB_GLOBAL gdDomainOrigin
Fatbin ptx code:
================
arch = sm_30
code version = [3,1]
producer = cuda
host = linux
compile_size = 64bit
compressed
identifier = shared_dev.cu
ptxasOptions = --compile-only
Perhaps you could try my code and compilation/diagnostic steps and see what happens with your Windows toolchain.

getline on MacOSX 10.6 crashing C compiler?

I'm having a really hard time getting an R library installed that requires some compilation in C. I'm using a Mac OSX Snow Leopard machine and trying to install this R package (here).
I've looked at the thread talking about getline on macs and have tried a few of these fixes, but nothing is working! I'm a newbie and don't know any C, so that may be why! Can anyone give me some tips on how I could modify files in this package to get it to install?? Anyhelp would be pathetically appreciated! Here's the error I'm getting:
** libs
** arch - i386
g++ -arch i386 -I/Library/Frameworks/R.framework/Resources/include -I/Library/Frameworks/R.framework/Resources/include/i386 -I/usr/local/include -D_FASTMAP -DMAQ_LONGREADS -fPIC -g -O2 -c bed2vector.C -o bed2vector.o
In file included from /usr/include/c++/4.2.1/backward/strstream:51,
from bed2vector.C:8:
/usr/include/c++/4.2.1/backward/backward_warning.h:32:2: warning: #warning This file includes at least one deprecated or antiquated header. Please consider using one of the 32 headers found in section 17.4.1.2 of the C++ standard. Examples include substituting the <X> header for the <X.h> header for C++ includes, or <iostream> instead of the deprecated header <iostream.h>. To disable this warning use -Wno-deprecated.
bed2vector.C: In function ‘int get_a_line(FILE*, BZFILE*, int, std::string&)’:
bed2vector.C:74: error: no matching function for call to ‘getline(char**, size_t*, FILE*&)’
make: *** [bed2vector.o] Error 1
chmod: /Library/Frameworks/R.framework/Resources/library/spp/libs/i386/*: No such file or directory
ERROR: compilation failed for package 'spp'
The easiest solution is probably to add a static definition for getline() to bed2vector.c. This might be good enough:
/* PASTE AT TOP OF FILE */
#include <stdio.h> /* flockfile, getc_unlocked, funlockfile */
#include <stdlib.h> /* malloc, realloc */
#include <errno.h> /* errno */
#include <unistd.h> /* ssize_t */
extern "C" ssize_t getline(char **lineptr, size_t *n, FILE *stream);
/* PASTE REMAINDER AT BOTTOM OF FILE */
ssize_t
getline(char **linep, size_t *np, FILE *stream)
{
char *p = NULL;
size_t i = 0;
if (!linep || !np) {
errno = EINVAL;
return -1;
}
if (!(*linep) || !(*np)) {
*np = 120;
*linep = (char *)malloc(*np);
if (!(*linep)) {
return -1;
}
}
flockfile(stream);
p = *linep;
for (int ch = 0; (ch = getc_unlocked(stream)) != EOF;) {
if (i > *np) {
/* Grow *linep. */
size_t m = *np * 2;
char *s = (char *)realloc(*linep, m);
if (!s) {
int error = errno;
funlockfile(stream);
errno = error;
return -1;
}
*linep = s;
*np = m;
}
p[i] = ch;
if ('\n' == ch) break;
i += 1;
}
funlockfile(stream);
/* Null-terminate the string. */
if (i > *np) {
/* Grow *linep. */
size_t m = *np * 2;
char *s = (char *)realloc(*linep, m);
if (!s) {
return -1;
}
*linep = s;
*np = m;
}
p[i + 1] = '\0';
return ((i > 0)? i : -1);
}
This doesn't handle the case where the line is longer than the maximum value that ssize_t can represent. If you run into that case, you've likely got other problems.
Zeroth question: Have you considered using a package manager like fink or MacPorts rather than compiling yourself? I know that fink has an R package.
First question: How is the R build managed? Is there a ./configure? If so have you looked at the options to it? Does it use make? Scons? Some other dependency manager?
Second question: Have you told the build system that you are working on a Mac? Can you specify that you don't have a libc with native getline?
If the build system doesn't support Mac OS---but I image that R's does---you are probably going to have to download the standalone version, and hack the build to include it. How exactly you do that depends on the build system. And you may need to hack the source some.

Resources