undefined cudaMalloc symbols - macos

I'm trying to compile the cublas example from the CUDA documentation
//Example 2. Application Using C and CUBLAS: 0-based indexing
//-----------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#define M 6
#define N 5
#define IDX2C(i,j,ld) (((j)*(ld))+(i))
/*
 * Scales two segments of the column-major device matrix m (leading dimension
 * ldm, n columns, 0-based indexing), both starting at element (p, q):
 *   - along row p: n-q elements (columns q .. n-1), stepping by ldm, are
 *     multiplied by alpha;
 *   - along column q: ldm-p elements, stepping by 1, are multiplied by beta
 *     (this reaches the end of the column when ldm equals the row count).
 * The row count must be n-q: any larger count steps past column n-1 and
 * therefore past the end of the matrix storage.
 * The cublasSscal status codes are not propagated (the function returns void).
 */
static __inline__ void modify (cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float beta){
    cublasSscal (handle, n-q, &alpha, &m[IDX2C(p,q,ldm)], ldm);
    cublasSscal (handle, ldm-p, &beta, &m[IDX2C(p,q,ldm)], 1);
}
/*
 * Fills an M-by-N column-major host matrix, copies it to the device, scales
 * parts of it with modify(), copies it back and prints it.
 * Returns EXIT_SUCCESS on success and EXIT_FAILURE on any allocation,
 * initialization or transfer failure. Every exit path goes through the
 * cleanup label so the host buffer, the device buffer and the cuBLAS handle
 * are each released exactly once.
 */
int main (void){
    cudaError_t cudaStat;
    cublasStatus_t stat;
    cublasHandle_t handle;
    int haveHandle = 0;           /* becomes 1 once cublasCreate succeeded */
    int exitCode = EXIT_FAILURE;  /* switched to success only at the very end */
    int i, j;
    float* devPtrA = 0;
    float* a = 0;

    a = (float *)malloc (M * N * sizeof (*a));
    if (!a) {
        printf ("host memory allocation failed");
        goto cleanup;
    }
    for (j = 0; j < N; j++) {
        for (i = 0; i < M; i++) {
            a[IDX2C(i,j,M)] = (float)(i * M + j + 1);
        }
    }
    cudaStat = cudaMalloc ((void**)&devPtrA, M*N*sizeof(*a));
    if (cudaStat != cudaSuccess) {
        printf ("device memory allocation failed");
        devPtrA = 0;  /* do not hand a possibly unset pointer to cudaFree */
        goto cleanup;
    }
    stat = cublasCreate(&handle);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("CUBLAS initialization failed\n");
        goto cleanup;
    }
    haveHandle = 1;
    /* Host -> device copy of the whole matrix. */
    stat = cublasSetMatrix (M, N, sizeof(*a), a, M, devPtrA, M);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data download failed");
        goto cleanup;
    }
    modify (handle, devPtrA, M, N, 1, 2, 16.0f, 12.0f);
    /* Device -> host copy of the modified matrix. */
    stat = cublasGetMatrix (M, N, sizeof(*a), devPtrA, M, a, M);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data upload failed");
        goto cleanup;
    }
    for (j = 0; j < N; j++) {
        for (i = 0; i < M; i++) {
            printf ("%7.0f", a[IDX2C(i,j,M)]);
        }
        printf ("\n");
    }
    exitCode = EXIT_SUCCESS;

cleanup:
    /* Only touch the CUDA runtime when a device buffer actually exists. */
    if (devPtrA) {
        cudaFree (devPtrA);
    }
    if (haveHandle) {
        cublasDestroy(handle);
    }
    free(a);
    return exitCode;
}
I saved this file into "cudaexample.c" and am trying to compile with gcc cudaexample.c -I/usr/local/cuda/include -L/usr/local/cuda/lib -lcuda -lcublas
I get an undefined symbols error:
Undefined symbols for architecture x86_64:
"_cudaFree", referenced from:
_main in ccpPWjbO.o
"_cudaMalloc", referenced from:
_main in ccpPWjbO.o
ld: symbol(s) not found for architecture x86_64
collect2: error: ld returned 1 exit status
It seems like I've specified the commands properly as other symbols (e.g. cublasCreate) are found. Why are Free and Malloc not present?
Relevant details:
OSX: 10.10.2
gcc: 4.8.4 (target: x86_64-apple-darwin14)
Graphics: NVIDIA GeForce GT 650M 1024 MB
I downloaded and installed the CUDA-6.5 toolkit

Those API functions (e.g. cudaMalloc) are contained in the CUDA runtime library. You are not linking against that library, so those symbols aren't found during the link phase.
Add -lcudart to your link flags:
-I/usr/local/cuda/include -L/usr/local/cuda/lib -lcuda -lcublas -lcudart
and it should fix that issue for you.
(-lcuda is only needed if you are using CUDA driver API functions. You can remove that if you wish.)

Related

Undefined symbols for architecture arm64: clang: error: linker command failed with exit code 1

I have some problems when I try to run my main.cpp file, only with mac gcc/clang/g++ compiler.
Here is the code:
random.h
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <ctime>
void initialize();
float randomFloat(int, int, int);
random.cpp
#include "random.h"
// Seeds the C pseudo-random generator with the current time.
void initialize() {
    const unsigned int seed = static_cast<unsigned int>(time(NULL));
    srand(seed);
}
// Builds a pseudo-random float from two rand() draws: an integer part taken
// from [min, max] (a draw equal to max is lowered by one) plus a fractional
// part k / 10^p with 0 <= k < 10^p.
float randomFloat(int min, int max, int p) {
    // First draw: the integer part.
    const int span = max - (min - 1);
    const int drawn = rand() % span + min;
    const int whole = (drawn == max) ? drawn - 1 : drawn;

    // Second draw: the p-digit fractional part.
    const int steps = (int)pow(10, p);
    const float fraction = (float)(rand() % steps) / pow(10, p);

    return whole + fraction;
}
util.h
#include <iostream>
int askInteger(const char *, bool);
util.cpp
#include "util.h"
using namespace std;
// Prompts on stdout with "Type a correct <message>: " and reads an int from
// stdin, repeating until extraction succeeds and, when onlyPositive is set,
// the value is not negative. Each rejected attempt writes an error line to
// stderr; after a failed extraction the stream state is cleared and a single
// character is discarded before retrying.
int askInteger(const char *message, bool onlyPositive) {
    int number;
    while (true) {
        std::cout << "Type a correct " << message << ": ";
        if ((std::cin >> number) && !(onlyPositive && number < 0)) {
            return number;
        }
        std::cerr << "Input error, try again. \n";
        if (std::cin.fail()) {
            std::cin.clear();
            std::cin.ignore();
        }
    }
}
main.cpp
#include "random/random.h"
#include "util/util.h"
using namespace std;
// Asks for the amount of numbers, the number of decimal digits and the value
// range (re-asking for the maximum until it exceeds the minimum), seeds the
// generator and prints one random value per line.
int main() {
    const int count = askInteger("numbers quantity", true);
    const int digits = askInteger("precession", true);
    const int lower = askInteger("min value (included)", false);
    int upper = askInteger("max value (included)", false);

    while (!(upper > lower)) {
        std::cout << "max value should be greather than " << lower << "\n";
        upper = askInteger("max value (included)", false);
    }

    initialize();

    int printed = 0;
    while (printed < count) {
        std::cout << randomFloat(lower, upper, digits) << "\n";
        ++printed;
    }
}
And It gives me this result:
Undefined symbols for architecture arm64:
"askInteger(char const*, bool)", referenced from:
_main in main-2552a5.o
"initialize()", referenced from:
_main in main-2552a5.o
"randomFloat(int, int, int)", referenced from:
_main in main-2552a5.o
ld: symbol(s) not found for architecture arm64
clang: error: linker command failed with exit code 1 (use -v to see invocation)
vscode applies this command to run the code:
Random-Numbers % cd "/Users/user/Downloads/Random-Numbers/" && g++ main.cpp -o main && "/Users/user/Downloads/Random-Numbers/"main
Here is my project structure:
enter image description here
I have tried already
gcc *.h
gcc -c *.h
g++ *.cpp
g++ -o *.cpp
g++ -c *.cpp
AT LAST! I could run it
The default VS Code C++ run command does not work for this project with g++ on macOS, because it compiles only main.cpp and leaves out the other source files.
c++ vscode default command to run main:
cd "/Users/user/Downloads/Random-Numbers/" && g++ main.cpp -o main && "/Users/user/Downloads/Random-Numbers/"main
user@MacBook-Pro-de-user Random-Numbers % cd "/Users/user/Downloads/Random-Numbers/" && g++ main.cpp -o main && "/Users/user/Downloads/Random-Numbers/"main
you have to compile by terminal with the command:
user@MacBook-Pro-de-user Random-Numbers % g++ -o ./main.exe random/random.cpp util/util.cpp main.cpp
user@MacBook-Pro-de-user Random-Numbers % ./main.exe
commands g++

Compiling LAPACKe library using C in MacOS

I need to use LAPACKE functions in code that should run on both Linux and macOS, but the problem occurs on macOS. I have a 2021 MacBook Pro with an M1 Pro processor, running macOS 12.6.2.
I wrote an example code
#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__
#include <Accelerate/Accelerate.h>
#else
#include <lapacke.h>
#endif
const int N= 3, NRHS=2, LDA=N, LDB=N;
const int NN=5, NRHS2=3;
/*
 * Demonstrates two LAPACKE drivers on small hard-coded systems stored in
 * column-major order:
 *   1. LAPACKE_dgesv on a general N-by-N matrix with NRHS right-hand sides;
 *   2. LAPACKE_dgtsv on an NN-by-NN tridiagonal system with NRHS2 right-hand
 *      sides.
 * Inputs and results are printed to stdout. A nonzero info code from either
 * call is only reported; the function always returns 0.
 */
int main(int argc, char** argv) {
    /******* Example of using the lapack library with
    the C interface for solving linear systems
    for a general full matrix **************/
    int ipiv[N], info;
    // a[LDA*N]
    double a[] = {
        6.80, -2.11, 5.66,
        -6.05, -3.30, 5.36,
        -0.45, 2.58, -2.70
    };
    // b[LDB*NRHS]
    double b[] = {
        4.02, 6.19, -8.22,
        -1.56, 4.00, -8.67
    };
    printf("\nTest of using LAPACKe Library\n");
    printf("Matrix A : %d by %d\n", N, N);
    // Row i is printed by starting at element i and stepping one column
    // (one leading dimension) at a time.
    for(int i=0; i<N; i++){
        int s = i;
        for(int j=0; j<N; j++, s+=LDA) printf(" % 6.2lf", a[s]);
        printf("\n");
    }
    printf("\nRight hand side %d vectors of %d\n", NRHS, N);
    for(int i=0; i<N; i++){
        int s = i;
        // b is walked with its own leading dimension LDB (the value passed
        // to LAPACKE_dgesv for b), not with LDA.
        for(int j=0; j<NRHS; j++, s+=LDB) printf(" % 10.6lf", b[s]);
        printf("\n");
    }
    /** As long the LAPACK_COL_MAJOR is used, the matrix is
    filled up by columns, pay attention in the way is printed **/
    info = LAPACKE_dgesv( LAPACK_COL_MAJOR, N, NRHS, a, LDA, ipiv, b, LDB);
    if (info == 0) {
        // After a successful call a, b and ipiv are printed as the LU
        // factorization, the solution vectors and the pivot indices.
        printf("\nFactorization of LU : %d by %d\n", N, N);
        for(int i=0; i<N; i++){
            int s = i;
            for(int j=0; j<N; j++, s+=LDA) printf(" % 10.5lf", a[s]);
            printf("\n");
        }
        printf("\nSolution of %dright hand side vectors of %d\n", NRHS, N);
        for(int i=0; i<N; i++){
            int s = i;
            for(int j=0; j<NRHS; j++, s+=LDB) printf(" % 10.6lf", b[s]);
            printf("\n");
        }
        printf("\nFactorization pivot indices by %d\n", N);
        for(int i=0; i<N; i++) printf(" % 5.0d", ipiv[i]);
        printf("\n\n");
    } // End of if (info == 0)
    else
        printf("An error ocurred in the LAPACK lib dgesv, with code %d\n", info);
    /******* Example of using the lapack library with
    the C interface for solving linear systems
    for a trigiagonal matrix **************/
    int ldb = NN;
    // d1[NN-1] lower diagonal
    double dl[] = {1, 4, 4, 1};
    // d[NN] main diagonal
    double d[] = {-2, -2, -2, -2, -2};
    // du[NN-1] upper diagonal
    double du[] = {1, 4, 4, 1};
    // bb[NN*NRHS2] number of righ hand side vectors
    double bb[] = {
        3., 5., 5., 5., 3.,
        -1.56, 4., -8.67, 1.75, 2.86,
        9.81, -4.09, -4.57, -8.61, 8.99
    };
    info = LAPACKE_dgtsv(LAPACK_COL_MAJOR, NN, NRHS2, dl, d, du, bb, ldb);
    if (info == 0) {
        printf("\nTest of using LAPACKe Library for Tridiagonal systems\n");
        printf("\nSolution of %dright hand side vectors of %d\n", NRHS2, NN);
        for(int i=0; i<NN; i++){
            int s = i;
            for(int j=0; j<NRHS2; j++, s+=ldb) printf(" % 10.6lf", bb[s]);
            printf("\n");
        }
    } // End of if (info == 0)
    else
        // This branch reports the tridiagonal solver, so name dgtsv here.
        printf("An error ocurred in the LAPACK lib dgtsv, with code %d\n", info);
    return 0;
}
This runs in a WSL (Windows Subsystem Linux) and compiles successfully making the command
gcc example2.c -o lapack -llapacke
But, when I try to compile it in my Mac using the line
gcc example2.c -o lapacke -framework Accelerate
I receive the following error:
example2.c:47:9: error: implicit declaration of function 'LAPACKE_dgesv' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
info = LAPACKE_dgesv( LAPACK_COL_MAJOR, N, NRHS, a, LDA, ipiv, b, LDB);
^
example2.c:47:24: error: use of undeclared identifier 'LAPACK_COL_MAJOR'
info = LAPACKE_dgesv( LAPACK_COL_MAJOR, N, NRHS, a, LDA, ipiv, b, LDB);
^
example2.c:91:9: error: implicit declaration of function 'LAPACKE_dgtsv' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
info = LAPACKE_dgtsv(LAPACK_COL_MAJOR, NN, NRHS2, dl, d, du, bb, ldb);
^
example2.c:91:9: note: did you mean 'LAPACKE_dgesv'?
example2.c:47:9: note: 'LAPACKE_dgesv' declared here
info = LAPACKE_dgesv( LAPACK_COL_MAJOR, N, NRHS, a, LDA, ipiv, b, LDB);
^
example2.c:91:23: error: use of undeclared identifier 'LAPACK_COL_MAJOR'
info = LAPACKE_dgtsv(LAPACK_COL_MAJOR, NN, NRHS2, dl, d, du, bb, ldb);
^
4 errors generated.
It looks like inside the Accelerate.h the headers of LAPACKe functions are not defined. I was looking around but I don't see anything that can help me.
BTW, I have another example using cblas and it runs smoothly

Accessing dynamically allocated arrays on device (without passing them as kernel arguments)

How can an array of structs that has been dynamically allocated on the host be used by a kernel, without passing the array of structs as a kernel argument? This seems like a common procedure with a good amount of documentation online, yet it doesn't work on the following program.
Note: Please note that the following questions have been studied before posting this question:
1) copying host memory to cuda __device__ variable 2) Global variable in CUDA 3) Is there any way to dynamically allocate constant memory? CUDA
So far, unsuccessful attempts have been made to:
Dynamically allocate array of structs with cudaMalloc(), then
Use cudaMemcpyToSymbol() with the pointer returned from cudaMalloc() to copy to a __device__ variable which can be used by the kernel.
Code attempt:
NBody.cu (error checking using cudaStatus has mostly been omitted for better readability, and function to read data from file into dynamic array removed):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCK 256
struct nbody {
float x, y, vx, vy, m;
};
typedef struct nbody nbody;
// Global declarations
nbody* particle;
// Device variables
__device__ unsigned int d_N; // Kernel can successfully access this
__device__ nbody d_particle; // Update: part of problem was here with (*)
// Aim of kernel: to print contents of array of structs without using kernel argument
// Kernel without arguments: every thread whose global index is below d_N
// prints the x field of the __device__ variable d_particle.
__global__ void step_cuda_v1() {
    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= d_N) {
        return;  // thread lies beyond the d_N bound
    }
    printf("%.f\n", d_particle.x);
}
// Host driver: allocates N particles in host memory, copies them to a device
// buffer, publishes the buffer address and N through __device__ symbols, then
// launches step_cuda_v1 and reports any CUDA error after each launch.
// Return codes of the allocation and copy calls are not checked here, and
// neither particle nor particle_buf is released before returning.
int main() {
unsigned int N = 10;
unsigned int I = 1;
cudaMallocHost((void**)&particle, N * sizeof(nbody)); // Host allocation
cudaError_t cudaStatus;
// Only the x field of each particle is initialised.
for (int i = 0; i < N; i++) particle[i].x = i;
nbody* particle_buf; // device buffer
cudaSetDevice(0);
cudaMalloc((void**)&particle_buf, N * sizeof(nbody)); // Allocate device mem
cudaMemcpy(particle_buf, particle, N * sizeof(nbody), cudaMemcpyHostToDevice); // Copy data into device mem
// NOTE(review): this copies sizeof(nbody*) bytes holding the device pointer
// into d_particle, but d_particle is declared as `nbody` (a struct), not
// `nbody*`, in this listing -- the symbol type and the copied value disagree.
cudaMemcpyToSymbol(d_particle, &particle_buf, sizeof(nbody*)); // Copy pointer to data into __device__ var
cudaMemcpyToSymbol(d_N, &N, sizeof(unsigned int)); // This works fine
// Ceiling division: enough blocks of BLOCK threads to cover N.
int NThreadBlock = (N + BLOCK - 1) / BLOCK;
// The bound is inclusive, so the kernel is launched I + 1 times.
for (int iteration = 0; iteration <= I; iteration++) {
step_cuda_v1 << <NThreadBlock, BLOCK >> > ();
//step_cuda_v1 << <1, 5 >> > (particle_buf);
// The return value of cudaDeviceSynchronize is discarded; errors are read
// back through cudaGetLastError on the next line.
cudaDeviceSynchronize();
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "ERROR: %s\n", cudaGetErrorString(cudaStatus));
exit(-1);
}
}
return 0;
}
OUTPUT:
"ERROR: kernel launch failed."
Summary:
How can I print the contents of the array of structs from the kernel, without passing it as a kernel argument?
Coding in C using VS2019 with CUDA 10.2
With the help of @Robert Crovella and @talonmies, here is the solution, which outputs a sequence that cycles from 0 to 9 repeatedly.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCK 256
//#include "Nbody.h"
struct nbody {
float x, y, vx, vy, m;
};
typedef struct nbody nbody;
// Global declarations
nbody* particle;
// Device variables
__device__ unsigned int d_N; // Kernel can successfully access this
__device__ nbody* d_particle;
//__device__ nbody d_particle; // Update: part of problem was here with (*)
// Aim of kernel: to print contents of array of structs without using kernel argument
// Kernel without arguments: every thread whose global index is below d_N
// prints the x field of its own element of the array that the __device__
// pointer d_particle refers to.
__global__ void step_cuda_v1() {
    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= d_N) {
        return;  // thread lies beyond the d_N bound
    }
    printf("%.f\n", d_particle[tid].x);
}
// Prints a message naming the failed call and terminates the program when a
// CUDA runtime call did not return cudaSuccess.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "ERROR: %s: %s\n", what, cudaGetErrorString(status));
        exit(-1);
    }
}

// Host driver: allocates N particles in pinned host memory, copies them to a
// device buffer, publishes the buffer address and N through the __device__
// symbols d_particle and d_N, and launches step_cuda_v1 I + 1 times.
// Every runtime call is checked, and both buffers are released before a
// normal return. Returns 0 on success; exits with -1 on any CUDA error.
int main() {
    unsigned int N = 10;
    unsigned int I = 1;

    checkCuda(cudaMallocHost((void**)&particle, N * sizeof(nbody)), "cudaMallocHost"); // Host allocation
    // Only the x field is given a value, as in the original program.
    for (unsigned int i = 0; i < N; i++) particle[i].x = (float)i;

    nbody* particle_buf = NULL; // device buffer
    checkCuda(cudaSetDevice(0), "cudaSetDevice");
    checkCuda(cudaMalloc((void**)&particle_buf, N * sizeof(nbody)), "cudaMalloc"); // Allocate device mem
    checkCuda(cudaMemcpy(particle_buf, particle, N * sizeof(nbody), cudaMemcpyHostToDevice),
              "cudaMemcpy"); // Copy data into device mem
    // The symbol stores the device ADDRESS, hence the source is &particle_buf
    // and the size is that of a pointer.
    checkCuda(cudaMemcpyToSymbol(d_particle, &particle_buf, sizeof(nbody*)),
              "cudaMemcpyToSymbol(d_particle)");
    checkCuda(cudaMemcpyToSymbol(d_N, &N, sizeof(unsigned int)), "cudaMemcpyToSymbol(d_N)");

    // Ceiling division: enough blocks of BLOCK threads to cover N.
    int NThreadBlock = (N + BLOCK - 1) / BLOCK;
    for (unsigned int iteration = 0; iteration <= I; iteration++) {
        step_cuda_v1<<<NThreadBlock, BLOCK>>>();
        // Launch-configuration errors surface here ...
        checkCuda(cudaGetLastError(), "step_cuda_v1 launch");
        // ... while faults during execution surface at the synchronizing call.
        checkCuda(cudaDeviceSynchronize(), "step_cuda_v1 execution");
    }

    checkCuda(cudaFree(particle_buf), "cudaFree");
    checkCuda(cudaFreeHost(particle), "cudaFreeHost");
    particle = NULL;
    return 0;
}

OpenACC error: libgomp: while loading libgomp-plugin-host_nonshm.so.1: cannot open shared object file

I want to compile a simple OpenACC sample (included below). It compiles correctly, but when I run it I get an error:
compile with : gcc-5 -fopenacc accVetAdd.c -lm
run with : ./a.out
got error in runtime
error: libgomp: while loading libgomp-plugin-host_nonshm.so.1: libgomp-plugin-host_nonshm.so.1: cannot open shared object file: No such file or directory
I googled it and found only one page. How can I fix this problem?
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
/*
 * Adds two vectors of n doubles inside an OpenACC kernels region and prints
 * the mean of the result. The inputs are sin(i)^2 and cos(i)^2, so the mean
 * should equal 1 within rounding error.
 * Returns 0 on success, EXIT_FAILURE when a host allocation fails.
 */
int main(int argc, char* argv[])
{
    // Size of vectors
    int n = 10000;
    // Input vectors
    double *restrict a;
    double *restrict b;
    // Output vector
    double *restrict c;
    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(double);
    // Allocate memory for each vector
    a = (double*)malloc(bytes);
    b = (double*)malloc(bytes);
    c = (double*)malloc(bytes);
    // Stop before the buffers are dereferenced if any allocation failed;
    // free(NULL) is a no-op, so all three can be released unconditionally.
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "host memory allocation failed\n");
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }
    // Initialize content of input vectors, vector a[i] = sin(i)^2 vector b[i] = cos(i)^2
    int i;
    for (i = 0; i<n; i++) {
        a[i] = sin(i)*sin(i);
        b[i] = cos(i)*cos(i);
    }
    // sum component wise and save result into vector c
    #pragma acc kernels copyin(a[0:n],b[0:n]), copyout(c[0:n])
    for (i = 0; i<n; i++) {
        c[i] = a[i] + b[i];
    }
    // Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0.0;
    for (i = 0; i<n; i++) {
        sum += c[i];
    }
    sum = sum / n;
    printf("final result: %f\n", sum);
    // Release memory
    free(a);
    free(b);
    free(c);
    return 0;
}
libgomp dynamically loads shared object files for the plugins it supports, such as the one implementing the host_nonshm device. If they're installed in a non-standard directory (that is, not in the system's default search path), you need to tell the dynamic linker where to look for these shared object files: either compile with -Wl,-rpath,[...], or set the LD_LIBRARY_PATH environment variable.

Cuda error on compiling: identifier "cudamalloc" is undefined

I have a CUDA C code, when I try to compile it, nvcc gives me an error with an undefined identifier error: identifier "cudamalloc" is undefined, identifier "cudamemcpy" is undefined.
I'm running Windows 7 with Visual Studio 10 and CUDA Toolkit 4.0
I have installed CUDA on drive "C" and Visual Studio on drive "E", but I'm not sure whether that is the problem.
I use this command to compile:
nvcc -o ej1b ej1b.cu
and this is my program:
#include <cuda.h>
#include <cstdio>
#include <cuda_runtime_api.h>
#include <device_functions.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
const int N = 512;
const int C = 5;
// Fills vec[0 .. N-1] with the values 0 .. N-1 on the host.
// The index is a signed int like N: an unsigned index would turn a negative N
// into a huge bound through the implicit conversion and write far outside
// the array. With a signed index, N <= 0 leaves vec untouched.
// Note: the parameter N shadows the file-level constant of the same name.
void init_CPU_array(int vec[],const int N){
    for (int i = 0; i < N; i++) {
        vec[i] = i;
    }
}
// Multiplies vec[id] by C in place, where id is the thread's global index
// in a 1D launch; threads whose index is N or larger do nothing.
__global__ void kernel(int vec[],const int N, const int C){
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= N) {
        return;
    }
    vec[gid] *= C;
}
// Host driver: fills vec on the CPU, copies it to the device, multiplies each
// element by C in the kernel, copies the result into vecRES and prints the
// first 10 result/input pairs.
// NOTE(review): the listing ends without the closing brace of main.
int main(){
int vec[N];
int vecRES[N];
int *vecGPU;
// Size of the arrays in bytes.
unsigned int cantaloc=N*sizeof(int);
init_CPU_array(vec,N);
// NOTE(review): the CUDA runtime functions are spelled cudaMalloc and
// cudaMemcpy; the all-lowercase names used below are what the compiler
// reports as undefined identifiers.
cudamalloc((void**)&vecGPU,cantaloc);
cudamemcpy(vecGPU,vec,cantaloc,cudaMemcpyHostToDevice);
// 64 threads per block; ceiling division gives enough blocks to cover N.
dim3 dimBlock(64);
dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x);
printf("-> Variable dimBlock.x = %d\n",dimBlock.x);
kernel<<<dimGrid, dimBlock>>>(vecGPU, N, C);
// Return codes of the launch and of the runtime calls are not checked.
cudaThreadSynchronize();
cudamemcpy(vecRES,vecGPU,cantaloc,cudaMemcpyDeviceToHost);
cudaFree(vecGPU);
printf("%s \n","-> Resultados");
int i;
// Each line shows the value copied back from the device followed by the
// original host value.
for(i=0;i<10;i++){
printf("%d ",vecRES[i]);
printf("%d \n",vec[i]);
}
return 0;
I used all those #include because I don't know where the problem is.
If you read the documentation, you will find the API calls are cudaMalloc and cudaMemcpy. C and C++ are case sensitive languages and you have the names incorrect.

Resources