Shared library undefined reference for CUDA Kernel wrapper - go

So I am attempting to use the CUDA Runtime API with Go's cgo on Windows. I've been at this for a few days now and am stuck: I am getting an undefined reference to my kernel wrapper.
I have separated out my kernel and it's wrapper into the following
FILE: cGo.cuh
typedef unsigned long int ktype;
typedef unsigned char glob;
/*
function Prototypes
*/
extern "C" void kernel_kValid(int, int, ktype *, glob *);
__global__ void kValid(ktype *, glob *);
FILE: cGo.cu
#include "cGo.cuh"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_runtime.h"
//function Definitions
/*
kernel_kValid is a wrapper function for the CUDA Kernel to be called from Go
*/
extern "C" void kernel_kValid(int blocks, int threads, ktype *kInfo, glob *values) {
kValid<<<blocks, threads>>>(kInfo, values);//execute the kernel
}
/*
kValid is the CUDA Kernel which is to be executed
*/
__global__ void kValid(ktype *kInfo, glob *values) {
//lots of code
}
I compile my CUDA source code into a shared library as such:
nvcc -shared -o myLib.so cGo.cu
then I have created a header file to include in my cgo
FILE: cGo.h
typedef unsigned long int ktype;
typedef unsigned char glob;
/*
function Declarations
*/
void kernel_kValid(int , int , ktype *, glob *);
Then from the go package I utilize cgo to call my kernel wrapper I have
package cuda
/*
#cgo LDFLAGS: -LC:/Storage/Cuda/lib/x64 -lcudart //this is the Cuda library
#cgo LDFLAGS: -L${SRCDIR}/lib -lmyLib //this is my shared library
#cgo CPPFLAGS: -IC:/Storage/Cuda/include //this contains cuda headers
#cgo CPPFLAGS: -I${SRCDIR}/include //this contains cGo.h
#include <cuda_runtime.h>
#include <stdlib.h>
#include "cGo.h"
*/
import "C"
func useKernel(){
//other code
C.kernel_kValid(C.int(B), C.int(T), unsafe.Pointer(storageDevice), unsafe.Pointer(globDevice))
cudaErr, err = C.cudaDeviceSynchronize()
//rest of the code
}
So all of the calls to the CUDA runtime API don't throw errors, it's only my kernel wrapper. This is the output when I build the cuda package with go.
C:\Users\user\Documents\Repos\go\cuda_wrapper>go build cuda_wrapper\cuda
# cuda_wrapper/cuda
In file included from C:/Storage/Cuda/include/host_defines.h:50:0,
from C:/Storage/Cuda/include/device_types.h:53,
from C:/Storage/Cuda/include/builtin_types.h:56,
from C:/Storage/Cuda/include/cuda_runtime.h:86,
from C:\Go\workspace\src\cuda_wrapper\cuda\cuda.go:12:
C:/Storage/Cuda/include/crt/host_defines.h:84:0: warning: "__cdecl" redefined
#define __cdecl
<built-in>: note: this is the location of the previous definition
# cuda_wrapper/cuda
C:\Users\user\AppData\Local\Temp\go-build038297194\cuda_wrapper\cuda\_obj\cuda.cgo2.o: In function `_cgo_440ebb0a3e25_Cfunc_kernel_kValid':
/tmp/go-build\cuda_wrapper\cuda\_obj/cgo-gcc-prolog:306: undefined reference to `kernel_kValid'
collect2.exe: error: ld returned 1 exit status
It's here I'm not really sure what's wrong. I have been looking at questions asked about undefined references with cgo but nothing I have found has solved my issue. I have also been looking at the fact that the CUDA runtime API is written in C++ and if that would affect how cgo will compile this but again I haven't found anything conclusive. At this point I think I have confused myself more than anything else so I'm hoping someone more knowledgeable can point me in the right direction.

Good catch on the name manlging.
Here's a solution we used for gorgonia:
#include <math.h>
#ifdef __cplusplus
extern "C" {
#endif
__global__ void sigmoid32(float* A, int size)
{
int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;
int idx = blockId * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
if (idx >= size) {
return;
}
A[idx] = 1 / (1 + powf((float)(M_E), (-1 * A[idx])));
}
#ifdef __cplusplus
}
#endif
So... just wrap your kernel wrapper function in extern "C"

Related

Xcode warns me of implicit declaration of 'malloc' but stdlib.h is included in header

I am trying to create a stack data type in C in Xcode, therefore using a stack init function.
The following is found in DeckStack.c
#define DS DeckStack
#include "DeckStack.h"
struct _DeckStack {
void *top;
void *cards[40];
};
DeckStack* stack_init(void) {
DS *new_deckStack = NULL;
new_deckStack = (DS*) malloc(sizeof(DS));
return new_deckStack;
}
But Xcode is complaining about the implicit declaration of malloc, yet stdlib.h is included in DeckStack.h:
#ifndef DeckStack_h
#define DeckStack_h
#include <stdio.h>
#include <stdlib.h>
typedef struct _DeckStack DeckStack;
/* #brief
* Creates a new deck
*/
DeckStack* stack_init(void);
#endif /* DeckStack_h */
Solved, just press enter a few times and the .c file will update itself with the contents of the header.

undefined reference to error only for certain functions

I have a library and a C interface built for it.
My program compiles just fine with versionString() but not with loadConfig(). How is that possible?
walker.h :
#ifndef WFE_C_H
#define WFE_C_H
#ifdef __cplusplus
extern "C" {
#endif
const char* versionString();
void* loadConfig(const char *filePath, char* errorMessageBuffer, int bufferLen);
#ifdef __cplusplus
}
#endif
#endif
Working version:
package main
/*
#cgo CFLAGS: -Isrc/walker
#cgo LDFLAGS: -L${SRCDIR}/lib -lwalker
#include "walker.h"
*/
import (
"C"
)
func main() {
p := C.versionString()
version := C.GoString(p)
fmt.Println(version)
}
// output: v1.94
Not working version:
package main
/*
#cgo CFLAGS: -Isrc/walker
#cgo LDFLAGS: -L${SRCDIR}/lib -lwalker
#include "walker.h"
*/
import (
"C"
)
func main() {
errorMessageBuffer := C.CString("")
pathPtr := C.CString("/src/vali/config")
bufferLen := C.int(len(4000))
C.loadConfig(pathPtr, errorMessageBuffer, bufferLen)
}
Traceback:
/tmp/go-build056292810/github.com/testong/vali/_obj/main.cgo2.o: In
function `_cgo_b2bd1c9e3dda_Cfunc_loadConfig':
/tmp/go-build/github.com/testong/vali/_obj/cgo-gcc-prolog:43: undefined
reference to `loadConfig'
collect2: error: ld returned 1 exit status
In walker.h , the export function is
void* loadConfig(const char *filePath, char* errorMessageBuffer, int bufferLen);
And you used C.loadConfigDirectory(pathPtr, errorMessageBuffer, bufferLen).
The function loadConfigDirectory is not defined.
Maybe it is a typo?
Working version works because versionString is already defined.

How to Link Golang package to an existing C project (Using Go from C) on windows x86-64

How to use Golang Generated libgolang.a from C code to build C executable: test.exe:
these commands makes executable binary 'test' in Ubuntu x86-64 and works fine (but not in Windows x86-64):
go build -buildmode c-archive -o libgolang.a
gcc -o test _main.c libgolang.a -lpthread
with:
this is main.go file:
package main
import "C"
import "fmt"
//export Add
func Add(a, b uint64) uint64 {
return a + b
}
func main() {
fmt.Println("Hi")
}
and this is _main.c file:
#include <stdio.h>
#include <stdint.h>
#include "libgolang.h"
int main()
{
uint64_t a=10;
uint64_t b=20;
uint64_t c=Add(a,b);
printf("%ld\n",c);
return 0;
}
in Windows this command:
go build -buildmode c-archive -o libgolang.a
works fine and generates libgolang.a and libgolang.h files. libgolang.h:
/* Created by "go tool cgo" - DO NOT EDIT. */
/* package github.com/ARamazani/go/DLL */
/* Start of preamble from import "C" comments. */
/* End of preamble from import "C" comments. */
/* Start of boilerplate cgo prologue. */
#ifndef GO_CGO_PROLOGUE_H
#define GO_CGO_PROLOGUE_H
typedef signed char GoInt8;
typedef unsigned char GoUint8;
typedef short GoInt16;
typedef unsigned short GoUint16;
typedef int GoInt32;
typedef unsigned int GoUint32;
typedef long long GoInt64;
typedef unsigned long long GoUint64;
typedef GoInt64 GoInt;
typedef GoUint64 GoUint;
typedef __SIZE_TYPE__ GoUintptr;
typedef float GoFloat32;
typedef double GoFloat64;
typedef float _Complex GoComplex64;
typedef double _Complex GoComplex128;
/*
static assertion to make sure the file is being used on architecture
at least with matching size of GoInt.
*/
typedef char _check_for_64_bit_pointer_matching_GoInt[sizeof(void*)==64/8 ? 1:-1];
typedef struct { const char *p; GoInt n; } GoString;
typedef void *GoMap;
typedef void *GoChan;
typedef struct { void *t; void *v; } GoInterface;
typedef struct { void *data; GoInt len; GoInt cap; } GoSlice;
#endif
/* End of boilerplate cgo prologue. */
#ifdef __cplusplus
extern "C" {
#endif
extern GoUint64 Add(GoUint64 p0, GoUint64 p1);
#ifdef __cplusplus
}
#endif
but this command:
gcc -o test _main.c libgolang.a -lpthread
output:
libgolang.a(go.o):(.data+0x2150): undefined reference to `NtWaitForSingleObject'
libgolang.a(go.o):(.data+0x21b8): undefined reference to `WSAGetOverlappedResult'
libgolang.a(go.o):(.data+0x21d8): undefined reference to `timeBeginPeriod'
collect2.exe: error: ld returned 1 exit status
go version go1.7rc3 windows/amd64
I want to use this libgolang.a from C code to build C executable: test.exe
any workaround?
some useful links:
http://blog.ralch.com/tutorial/golang-sharing-libraries/
Using Go code in an existing C project
Using Go on existing C project
Finally found at least one way:
using this file main.go:
package main
import "C"
import "fmt"
//export Add
func Add(a, b uint64) uint64 {
return a + b
}
func main() {
fmt.Println("Hi")
}
run this command:
go build -buildmode c-archive -o libgolang.a
to generate libgolang.a and libgolang.h
then using _main.c:
#include <stdio.h>
#include <stdint.h>
#include "libgolang.h"
int main()
{
uint64_t a=10;
uint64_t b=20;
uint64_t c=Add(a,b);
printf("%ld\n",c);
return 0;
}
this is the answer (this works for me):
gcc -o test _main.c libgolang.a -lWinMM -lntdll -lWS2_32
that test.exe created.
run:
test.exe
output:
30
The Microsoft Windows always trick us!
But for production code my suggestion is use the optimization compile commands below:
go build -ldflags "-s -w" -buildmode c-archive -o libgolang.a
The parameter -ldflags "-s -w", will reduze the final file size.
The "libgolang.a" shrink from 4.295.562 to 1.994.554.
And the "test.exe" shrink from 3.430.485 to 1.715.561.
I guess this is a desired behavior!

how to compile Cuda source with Go language's cgo?

I wrote a simple program in cuda-c and it works on eclipse nsight. This is source code:
#include <iostream>
#include <stdio.h>
__global__ void add( int a,int b, int *c){
*c = a + b;
}
int main(void){
int c;
int *dev_c;
cudaMalloc((void**)&dev_c, sizeof(int));
add <<<1,1>>>(2,7,dev_c);
cudaMemcpy(&c, dev_c, sizeof(int),cudaMemcpyDeviceToHost);
printf("\n2+7= %d\n",c);
cudaFree(dev_c);
return 0;
}
Now I'm trying to use this code with Go language with cgo!!!
So I wrote this new code:
package main
//#include "/usr/local/cuda-7.0/include/cuda.h"
//#include "/usr/local/cuda-7.0/include/cuda_runtime.h"
//#cgo LDFLAGS: -lcuda
//#cgo LDFLAGS: -lcurand
////default location:
//#cgo LDFLAGS: -L/usr/local/cuda-7.0/lib64 -L/usr/local/cuda-7.0/lib
//#cgo CFLAGS: -I/usr/local/cuda-7.0/include/
//
//
//
//
//
//
//
//
//
//
/*
#include <stdio.h>
__global__ void add( int a,int b, int *c){
*c = a + b;
}
int esegui_somma(void){
int c;
int *dev_c;
cudaMalloc((void**)&dev_c, sizeof(int));
add <<<1,1>>> (2,7,dev_c);
cudaMemcpy(&c, dev_c, sizeof(int),cudaMemcpyDeviceToHost);
cudaFree(dev_c);
return c;
}
*/
import "C"
import "fmt"
func main(){
fmt.Printf("il risultato è %d",C.esegui_somma)
}
But it doesn't work!!
I read this error message:
cgo_cudabyexample_1/main.go:34:8: error: expected expression before '<' token
add <<<1,1>>> (2,7,dev_c);
^
I think that I must to set nvcc cuda compiler for cgo instead of gcc.
How can I do it? Can I change CC environment variable?
best regards
I finally figured out how to do this. Thing biggest problem is that nvccdoes not follow gcc standard flags and unlike clang it won't silently ignore them. cgo triggers the problem by adding a bunch of flags not explicitly specified by the user.
To make it all work, you'll need to separate out your device code and the functions that directly call it into separate files and compile/package them directly using nvcc into a shared library (.so). Then you'll use cgo to link this shared library using whatever default linker you have on your system. The only thing you'll have to add is -lcudart to your LDFLAGS (linker flags) to link the CUDA runtime.

Separating out .cu and .cpp(using c++11 library)

I am trying to convert a c++ program I have which uses random library which is a C++11 feature. After having read through a couple of similar posts here, I tried by separating out the code into three files. At the outset I would like to say that I am not very conversant at C/C++ and mostly use R at work.
The main file looks as follows.
#ifndef _KERNEL_SUPPORT_
#define _KERNEL_SUPPORT_
#include <complex>
#include <random>
#include <iostream>
#include "my_code_header.h"
using namespace std;
std::default_random_engine generator;
std::normal_distribution<double> distribution(0.0,1.0);
const int rand_mat_length = 24561;
double rand_mat[rand_mat_length];// = {0};
void create_std_norm(){
for(int i = 0 ; i < rand_mat_length ; i++)
::rand_mat[i] = distribution(generator);
}
.
.
.
int main(void)
{
...
...
call_global();
return 0;
}
#endif
The header file looks as follows.
#ifndef mykernel_h
#define mykernel_h
void call_global();
void two_d_example(double *a, double *b, double *my_result, size_t length, size_t width);
#endif
And the .cu file looks like the following.
#ifndef _MY_KERNEL_
#define _MY_KERNEL_
#include <iostream>
#include "my_code_header.h"
#define TILE_WIDTH 8
using namespace std;
__global__ void two_d_example(double *a, double *b, double *my_result, size_t length, size_t width)
{
unsigned int row = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int col = blockIdx.x*blockDim.x + threadIdx.x;
if ((row>length) || (col>width)) {
return;
}
...
}
void call_global()
{
const size_t imageLength = 528;
const size_t imageWidth = 528;
const dim3 threadsPerBlock(TILE_WIDTH,TILE_WIDTH);
const dim3 numBlocks(((imageLength) / threadsPerBlock.x), ((imageWidth) / threadsPerBlock.y));
double *d_a, *d_b, *mys ;
...
cudaMalloc((void**)&d_a, sizeof(double) * imageLength);
cudaMalloc((void**)&d_b, sizeof(double) * imageWidth);
cudaMalloc((void**)&mys, sizeof(double) * imageLength * imageWidth);
two_d_example<<<numBlocks,threadsPerBlock>>>(d_a, d_b, mys, imageLength, imageWidth);
...
cudaFree(d_a);
cudaFree(d_b);
}
#endif
Please note that the __global__ has been removed from .h since I was getting the following error owing to it being compiled by g++.
In file included from my_code_main.cpp:12:0:
my_code_header.h:5:1: error: ‘__global__’ does not name a type
When I compile the .cu file with nvcc it is all fine and generates a my_code_kernel.o. But since I am using C++11 in my .cpp I am trying to compile it with g++ and I am getting the following error.
/tmp/ccR2rXzf.o: In function `main':
my_code_main.cpp:(.text+0x1c4): undefined reference to `call_global()'
collect2: ld returned 1 exit status
I understand that this might not have to do anything with CUDA as such and may just be the wrong use of including the header at both places. Also what is the right way to compile and most importantly link the my_code_kernel.o and my_code_main.o(hopefully)? Sorry if this question is too trivial!
It looks like you are not linking with my_code_kernel.o. You have used -c for your nvcc command (causes it to compile but not link, i.e. generate the .o file), I'm going to guess that you're not using -c with your g++ command, in which case you need to add my_code_kernel.o to the list of inputs as well as the .cpp file.
The separation you are trying to achieve is completely possible, it just looks like your not linking properly. If you still have problems, add the compilation commands to your question.
FYI: You don't need to declare two_d_example() in your header file, it is only used within your .cu file (from call_global()).

Resources