how to compile Cuda source with Go language's cgo? - go

I wrote a simple program in cuda-c and it works on eclipse nsight. This is source code:
#include <iostream>
#include <stdio.h>
__global__ void add( int a,int b, int *c){
*c = a + b;
}
int main(void){
int c;
int *dev_c;
cudaMalloc((void**)&dev_c, sizeof(int));
add <<<1,1>>>(2,7,dev_c);
cudaMemcpy(&c, dev_c, sizeof(int),cudaMemcpyDeviceToHost);
printf("\n2+7= %d\n",c);
cudaFree(dev_c);
return 0;
}
Now I'm trying to use this code with Go language with cgo!!!
So I wrote this new code:
package main
//#include "/usr/local/cuda-7.0/include/cuda.h"
//#include "/usr/local/cuda-7.0/include/cuda_runtime.h"
//#cgo LDFLAGS: -lcuda
//#cgo LDFLAGS: -lcurand
////default location:
//#cgo LDFLAGS: -L/usr/local/cuda-7.0/lib64 -L/usr/local/cuda-7.0/lib
//#cgo CFLAGS: -I/usr/local/cuda-7.0/include/
//
//
//
//
//
//
//
//
//
//
/*
#include <stdio.h>
__global__ void add( int a,int b, int *c){
*c = a + b;
}
int esegui_somma(void){
int c;
int *dev_c;
cudaMalloc((void**)&dev_c, sizeof(int));
add <<<1,1>>> (2,7,dev_c);
cudaMemcpy(&c, dev_c, sizeof(int),cudaMemcpyDeviceToHost);
cudaFree(dev_c);
return c;
}
*/
import "C"
import "fmt"
func main(){
fmt.Printf("il risultato è %d",C.esegui_somma)
}
But it doesn't work!!
I read this error message:
cgo_cudabyexample_1/main.go:34:8: error: expected expression before '<' token
add <<<1,1>>> (2,7,dev_c);
^
I think that I must to set nvcc cuda compiler for cgo instead of gcc.
How can I do it? Can I change CC environment variable?
best regards

I finally figured out how to do this. Thing biggest problem is that nvccdoes not follow gcc standard flags and unlike clang it won't silently ignore them. cgo triggers the problem by adding a bunch of flags not explicitly specified by the user.
To make it all work, you'll need to separate out your device code and the functions that directly call it into separate files and compile/package them directly using nvcc into a shared library (.so). Then you'll use cgo to link this shared library using whatever default linker you have on your system. The only thing you'll have to add is -lcudart to your LDFLAGS (linker flags) to link the CUDA runtime.

Related

Haskell compile DLL without copy-past ghc/include folder

I successfully created a DLL in Haskell. My problem is that everytime I want to compile a test program which loads and uses a function of my DLL I need to copy/paste files from C:\tools\ghc-9.0.1\include in my working directory.
The following files are:
HsFFI.h
ghcconfig.h
ghcautoconf.h
ghcplatform.h
stg/Types.h
I followed the tutorial on their documentation https://downloads.haskell.org/~ghc/7.6.3/docs/html/users_guide/win32-dlls.html but used other compiler commands to make it work.
This is my Adder.hs file
{-# LANGUAGE ForeignFunctionInterface #-}
module Adder where
adder :: Int -> Int -> IO Int
adder x y = return (x+y)
foreign export ccall adder :: Int -> Int -> IO Int
This is my StartEnd.c file to start Haskell runtime
#include <Rts.h>
void HsStart()
{
int argc = 1;
char* argv[] = {"ghcDll", NULL}; // argv must end with NULL
// Initialize Haskell runtime
char** args = argv;
hs_init(&argc, &args);
}
void HsEnd()
{
hs_exit();
}
This is my MyDef.def file to add my functions
EXPORTS
adder
HsStart
HsEnd
I compiled Adder.hs by writing ghc -shared Adder.hs StartEnd.c -o Adder.dll Mydef.def
This is my test.cpp file on c++. I wrote #include "HsFFI.h" to copy/paste HsFFI.h into my working directory as it couldn't find this file by itself when writing #include <HsFFI.h>. I compiled test.cpp by writing g++ -o test test.cpp Adder.dll.a My guess is that i need to make an environment variable so g++ can find this file, but how should i name this variable so g++ can find this file?
#include "HsFFI.h"
#include "Adder_stub.h"
#include <stdio.h>
extern "C" {
void HsStart();
void HsEnd();
}
int main()
{
HsStart();
// can now safely call functions from the DLL
printf("12 + 5 = %i\n", adder(12,5)) ;
HsEnd();
return 0;
}
I used ghc-9.0.1 and windows10.

Shared library undefined reference for CUDA Kernel wrapper

So I am attempting to use the CUDA Runtime API with Go's cgo on Windows. I've been at this for a few days now and am stuck: I am getting an undefined reference to my kernel wrapper.
I have separated out my kernel and it's wrapper into the following
FILE: cGo.cuh
typedef unsigned long int ktype;
typedef unsigned char glob;
/*
function Prototypes
*/
extern "C" void kernel_kValid(int, int, ktype *, glob *);
__global__ void kValid(ktype *, glob *);
FILE: cGo.cu
#include "cGo.cuh"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_runtime.h"
//function Definitions
/*
kernel_kValid is a wrapper function for the CUDA Kernel to be called from Go
*/
extern "C" void kernel_kValid(int blocks, int threads, ktype *kInfo, glob *values) {
kValid<<<blocks, threads>>>(kInfo, values);//execute the kernel
}
/*
kValid is the CUDA Kernel which is to be executed
*/
__global__ void kValid(ktype *kInfo, glob *values) {
//lots of code
}
I compile my CUDA source code into a shared library as such:
nvcc -shared -o myLib.so cGo.cu
then I have created a header file to include in my cgo
FILE: cGo.h
typedef unsigned long int ktype;
typedef unsigned char glob;
/*
function Declarations
*/
void kernel_kValid(int , int , ktype *, glob *);
Then from the go package I utilize cgo to call my kernel wrapper I have
package cuda
/*
#cgo LDFLAGS: -LC:/Storage/Cuda/lib/x64 -lcudart //this is the Cuda library
#cgo LDFLAGS: -L${SRCDIR}/lib -lmyLib //this is my shared library
#cgo CPPFLAGS: -IC:/Storage/Cuda/include //this contains cuda headers
#cgo CPPFLAGS: -I${SRCDIR}/include //this contains cGo.h
#include <cuda_runtime.h>
#include <stdlib.h>
#include "cGo.h"
*/
import "C"
func useKernel(){
//other code
C.kernel_kValid(C.int(B), C.int(T), unsafe.Pointer(storageDevice), unsafe.Pointer(globDevice))
cudaErr, err = C.cudaDeviceSynchronize()
//rest of the code
}
So all of the calls to the CUDA runtime API don't throw errors, it's only my kernel wrapper. This is the output when I build the cuda package with go.
C:\Users\user\Documents\Repos\go\cuda_wrapper>go build cuda_wrapper\cuda
# cuda_wrapper/cuda
In file included from C:/Storage/Cuda/include/host_defines.h:50:0,
from C:/Storage/Cuda/include/device_types.h:53,
from C:/Storage/Cuda/include/builtin_types.h:56,
from C:/Storage/Cuda/include/cuda_runtime.h:86,
from C:\Go\workspace\src\cuda_wrapper\cuda\cuda.go:12:
C:/Storage/Cuda/include/crt/host_defines.h:84:0: warning: "__cdecl" redefined
#define __cdecl
<built-in>: note: this is the location of the previous definition
# cuda_wrapper/cuda
C:\Users\user\AppData\Local\Temp\go-build038297194\cuda_wrapper\cuda\_obj\cuda.cgo2.o: In function `_cgo_440ebb0a3e25_Cfunc_kernel_kValid':
/tmp/go-build\cuda_wrapper\cuda\_obj/cgo-gcc-prolog:306: undefined reference to `kernel_kValid'
collect2.exe: error: ld returned 1 exit status
It's here I'm not really sure what's wrong. I have been looking at questions asked about undefined references with cgo but nothing I have found has solved my issue. I have also been looking at the fact that the CUDA runtime API is written in C++ and if that would affect how cgo will compile this but again I haven't found anything conclusive. At this point I think I have confused myself more than anything else so I'm hoping someone more knowledgeable can point me in the right direction.
Good catch on the name manlging.
Here's a solution we used for gorgonia:
#include <math.h>
#ifdef __cplusplus
extern "C" {
#endif
__global__ void sigmoid32(float* A, int size)
{
int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;
int idx = blockId * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
if (idx >= size) {
return;
}
A[idx] = 1 / (1 + powf((float)(M_E), (-1 * A[idx])));
}
#ifdef __cplusplus
}
#endif
So... just wrap your kernel wrapper function in extern "C"

How to Link Golang package to an existing C project (Using Go from C) on windows x86-64

How to use Golang Generated libgolang.a from C code to build C executable: test.exe:
these commands makes executable binary 'test' in Ubuntu x86-64 and works fine (but not in Windows x86-64):
go build -buildmode c-archive -o libgolang.a
gcc -o test _main.c libgolang.a -lpthread
with:
this is main.go file:
package main
import "C"
import "fmt"
//export Add
func Add(a, b uint64) uint64 {
return a + b
}
func main() {
fmt.Println("Hi")
}
and this is _main.c file:
#include <stdio.h>
#include <stdint.h>
#include "libgolang.h"
int main()
{
uint64_t a=10;
uint64_t b=20;
uint64_t c=Add(a,b);
printf("%ld\n",c);
return 0;
}
in Windows this command:
go build -buildmode c-archive -o libgolang.a
works fine and generates libgolang.a and libgolang.h files. libgolang.h:
/* Created by "go tool cgo" - DO NOT EDIT. */
/* package github.com/ARamazani/go/DLL */
/* Start of preamble from import "C" comments. */
/* End of preamble from import "C" comments. */
/* Start of boilerplate cgo prologue. */
#ifndef GO_CGO_PROLOGUE_H
#define GO_CGO_PROLOGUE_H
typedef signed char GoInt8;
typedef unsigned char GoUint8;
typedef short GoInt16;
typedef unsigned short GoUint16;
typedef int GoInt32;
typedef unsigned int GoUint32;
typedef long long GoInt64;
typedef unsigned long long GoUint64;
typedef GoInt64 GoInt;
typedef GoUint64 GoUint;
typedef __SIZE_TYPE__ GoUintptr;
typedef float GoFloat32;
typedef double GoFloat64;
typedef float _Complex GoComplex64;
typedef double _Complex GoComplex128;
/*
static assertion to make sure the file is being used on architecture
at least with matching size of GoInt.
*/
typedef char _check_for_64_bit_pointer_matching_GoInt[sizeof(void*)==64/8 ? 1:-1];
typedef struct { const char *p; GoInt n; } GoString;
typedef void *GoMap;
typedef void *GoChan;
typedef struct { void *t; void *v; } GoInterface;
typedef struct { void *data; GoInt len; GoInt cap; } GoSlice;
#endif
/* End of boilerplate cgo prologue. */
#ifdef __cplusplus
extern "C" {
#endif
extern GoUint64 Add(GoUint64 p0, GoUint64 p1);
#ifdef __cplusplus
}
#endif
but this command:
gcc -o test _main.c libgolang.a -lpthread
output:
libgolang.a(go.o):(.data+0x2150): undefined reference to `NtWaitForSingleObject'
libgolang.a(go.o):(.data+0x21b8): undefined reference to `WSAGetOverlappedResult'
libgolang.a(go.o):(.data+0x21d8): undefined reference to `timeBeginPeriod'
collect2.exe: error: ld returned 1 exit status
go version go1.7rc3 windows/amd64
I want to use this libgolang.a from C code to build C executable: test.exe
any workaround?
some useful links:
http://blog.ralch.com/tutorial/golang-sharing-libraries/
Using Go code in an existing C project
Using Go on existing C project
Finally found at least one way:
using this file main.go:
package main
import "C"
import "fmt"
//export Add
func Add(a, b uint64) uint64 {
return a + b
}
func main() {
fmt.Println("Hi")
}
run this command:
go build -buildmode c-archive -o libgolang.a
to generate libgolang.a and libgolang.h
then using _main.c:
#include <stdio.h>
#include <stdint.h>
#include "libgolang.h"
int main()
{
uint64_t a=10;
uint64_t b=20;
uint64_t c=Add(a,b);
printf("%ld\n",c);
return 0;
}
this is the answer (this works for me):
gcc -o test _main.c libgolang.a -lWinMM -lntdll -lWS2_32
that test.exe created.
run:
test.exe
output:
30
The Microsoft Windows always trick us!
But for production code my suggestion is use the optimization compile commands below:
go build -ldflags "-s -w" -buildmode c-archive -o libgolang.a
The parameter -ldflags "-s -w", will reduze the final file size.
The "libgolang.a" shrink from 4.295.562 to 1.994.554.
And the "test.exe" shrink from 3.430.485 to 1.715.561.
I guess this is a desired behavior!

Unable to get the stack trace with a corefile from a cgo routine when using golang

I am using Golang and cgo. When my C code raises an assert(), I am unable to see the stack trace of the C code when using cgo.
Instead, I see the stack trace of the golang runtime that caught the assert.
Here is an example of my C code
#include <stdio.h>
#include <pthread.h>
#include <assert.h>
#include <string.h>
void fn2(char *arg)
{
int stackvar2 = 256;
printf("Argument %s\n", arg);
assert(1 == 2);
}
void fn1(int arg)
{
int stackvar3 = 512;
char var[256];
strcpy(var, "deadbeef");
fn2(var);
}
void *thread(void *arg)
{
printf("Hello from the thread... going in for an assert\n");
fn1(1092);
return NULL;
}
void hello_world(char *str)
{
pthread_t tid;
printf("Hello World from C and here the str is from Go: %s\n", str);
pthread_create(&tid, NULL, thread, NULL);
sleep(100000);
}
Here is my Go code
package main
/*
extern void hello_world(char *str);
#cgo LDFLAGS: -L. -lhello
#cgo CFLAGS: -g3
*/
import "C"
import (
_ "fmt"
)
func main() {
str := "From Golang"
cStr := C.CString(str)
C.hello_world(cStr)
select {}
}
And here is my Makefile
all:
gcc -g3 -O0 -c hello.c
ar cru libhello.a hello.o
go build hello.go
clean:
rm -f *.o hello
Besides the obvious check of ulimit -c, run the go program with GOTRACEBACK=crash. This will print out more information, and allow the program to exit with SIGABRT to trigger a core dump.
Actually I need to add this to my go code: signal.Ignore(syscall.SIGABRT). This allows me to see the stack trace of the C code that crashed.

Separating out .cu and .cpp(using c++11 library)

I am trying to convert a c++ program I have which uses random library which is a C++11 feature. After having read through a couple of similar posts here, I tried by separating out the code into three files. At the outset I would like to say that I am not very conversant at C/C++ and mostly use R at work.
The main file looks as follows.
#ifndef _KERNEL_SUPPORT_
#define _KERNEL_SUPPORT_
#include <complex>
#include <random>
#include <iostream>
#include "my_code_header.h"
using namespace std;
std::default_random_engine generator;
std::normal_distribution<double> distribution(0.0,1.0);
const int rand_mat_length = 24561;
double rand_mat[rand_mat_length];// = {0};
void create_std_norm(){
for(int i = 0 ; i < rand_mat_length ; i++)
::rand_mat[i] = distribution(generator);
}
.
.
.
int main(void)
{
...
...
call_global();
return 0;
}
#endif
The header file looks as follows.
#ifndef mykernel_h
#define mykernel_h
void call_global();
void two_d_example(double *a, double *b, double *my_result, size_t length, size_t width);
#endif
And the .cu file looks like the following.
#ifndef _MY_KERNEL_
#define _MY_KERNEL_
#include <iostream>
#include "my_code_header.h"
#define TILE_WIDTH 8
using namespace std;
__global__ void two_d_example(double *a, double *b, double *my_result, size_t length, size_t width)
{
unsigned int row = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int col = blockIdx.x*blockDim.x + threadIdx.x;
if ((row>length) || (col>width)) {
return;
}
...
}
void call_global()
{
const size_t imageLength = 528;
const size_t imageWidth = 528;
const dim3 threadsPerBlock(TILE_WIDTH,TILE_WIDTH);
const dim3 numBlocks(((imageLength) / threadsPerBlock.x), ((imageWidth) / threadsPerBlock.y));
double *d_a, *d_b, *mys ;
...
cudaMalloc((void**)&d_a, sizeof(double) * imageLength);
cudaMalloc((void**)&d_b, sizeof(double) * imageWidth);
cudaMalloc((void**)&mys, sizeof(double) * imageLength * imageWidth);
two_d_example<<<numBlocks,threadsPerBlock>>>(d_a, d_b, mys, imageLength, imageWidth);
...
cudaFree(d_a);
cudaFree(d_b);
}
#endif
Please note that the __global__ has been removed from .h since I was getting the following error owing to it being compiled by g++.
In file included from my_code_main.cpp:12:0:
my_code_header.h:5:1: error: ‘__global__’ does not name a type
When I compile the .cu file with nvcc it is all fine and generates a my_code_kernel.o. But since I am using C++11 in my .cpp I am trying to compile it with g++ and I am getting the following error.
/tmp/ccR2rXzf.o: In function `main':
my_code_main.cpp:(.text+0x1c4): undefined reference to `call_global()'
collect2: ld returned 1 exit status
I understand that this might not have to do anything with CUDA as such and may just be the wrong use of including the header at both places. Also what is the right way to compile and most importantly link the my_code_kernel.o and my_code_main.o(hopefully)? Sorry if this question is too trivial!
It looks like you are not linking with my_code_kernel.o. You have used -c for your nvcc command (causes it to compile but not link, i.e. generate the .o file), I'm going to guess that you're not using -c with your g++ command, in which case you need to add my_code_kernel.o to the list of inputs as well as the .cpp file.
The separation you are trying to achieve is completely possible, it just looks like your not linking properly. If you still have problems, add the compilation commands to your question.
FYI: You don't need to declare two_d_example() in your header file, it is only used within your .cu file (from call_global()).

Resources