I'm trying to compile the cublas example from the CUDA documentation
//Example 2. Application Using C and CUBLAS: 0-based indexing
//-----------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#define M 6
#define N 5
#define IDX2C(i,j,ld) (((j)*(ld))+(i))
static __inline__ void modify (cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float beta){
cublasSscal (handle, n-p, &alpha, &m[IDX2C(p,q,ldm)], ldm);
cublasSscal (handle, ldm-p, &beta, &m[IDX2C(p,q,ldm)], 1);
}
int main (void){
cudaError_t cudaStat;
cublasStatus_t stat;
cublasHandle_t handle;
int i, j;
float* devPtrA;
float* a = 0;
a = (float *)malloc (M * N * sizeof (*a));
if (!a) {
printf ("host memory allocation failed");
return EXIT_FAILURE;
}
for (j = 0; j < N; j++) {
for (i = 0; i < M; i++) {
a[IDX2C(i,j,M)] = (float)(i * M + j + 1);
}
}
cudaStat = cudaMalloc ((void**)&devPtrA, M*N*sizeof(*a));
if (cudaStat != cudaSuccess) {
printf ("device memory allocation failed");
return EXIT_FAILURE;
}
stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("CUBLAS initialization failed\n");
return EXIT_FAILURE;
}
stat = cublasSetMatrix (M, N, sizeof(*a), a, M, devPtrA, M);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("data download failed");
cudaFree (devPtrA);
cublasDestroy(handle);
return EXIT_FAILURE;
}
modify (handle, devPtrA, M, N, 1, 2, 16.0f, 12.0f);
stat = cublasGetMatrix (M, N, sizeof(*a), devPtrA, M, a, M);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("data upload failed");
cudaFree (devPtrA);
cublasDestroy(handle);
return EXIT_FAILURE;
}
cudaFree (devPtrA);
cublasDestroy(handle);
for (j = 0; j < N; j++) {
for (i = 0; i < M; i++) {
printf ("%7.0f", a[IDX2C(i,j,M)]);
}
printf ("\n");
}
free(a);
return EXIT_SUCCESS;
}
I saved this file into "cudaexample.c" and am trying to compile with gcc cudaexample.c -I/usr/local/cuda/include -L/usr/local/cuda/lib -lcuda -lcublas
I get an undefined symbols error:
Undefined symbols for architecture x86_64:
"_cudaFree", referenced from:
_main in ccpPWjbO.o
"_cudaMalloc", referenced from:
_main in ccpPWjbO.o
ld: symbol(s) not found for architecture x86_64
collect2: error: ld returned 1 exit status
It seems like I've specified the commands properly as other symbols (e.g. cublasCreate) are found. Why are Free and Malloc not present?
Relevant details:
OSX: 10.10.2
gcc: 4.8.4 (target: x86_64-apple-darwin14)
Graphics: NVIDA GeForce GT 650M 1024 MB
I downloaded and installed the CUDA-6.5 toolkit
Those API functions (e.g. cudaMalloc) are contained in the CUDA runtime library. You are not linking against that library, so those symbols aren't found during the link phase.
Add -lcudart to your link flags:
-I/usr/local/cuda/include -L/usr/local/cuda/lib -lcuda -lcublas -lcudart
and it should fix that issue for you.
(-lcuda is only needed if you are using CUDA driver API functions. You can remove that if you wish.)
Related
I have some problems when I try to run my main.cpp file, only with mac gcc/clang/g++ compiler.
Here is the code:
random.h
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <ctime>
void initialize();
float randomFloat(int, int, int);
random.cpp
#include "random.h"
void initialize() { srand(time(NULL)); }
float randomFloat(int min, int max, int p) {
int intPart = rand() % (max - (min - 1)) + min;
if (intPart == max) {
intPart--;
}
float decimal = (float)(rand() % (int)pow(10, p)) / pow(10, p);
return intPart + decimal;
}
util.h
#include <iostream>
int askInteger(const char *, bool);
util.cpp
#include "util.h"
using namespace std;
int askInteger(const char *message, bool onlyPositive) {
int number;
if (onlyPositive) {
while (cout << "Type a correct " << message << ": ",
!(cin >> number) || number < 0) {
cerr << "Input error, try again. \n";
if (cin.fail()) {
cin.clear();
cin.ignore();
}
}
} else {
while (cout << "Type a correct " << message << ": ", !(cin >> number)) {
cerr << "Input error, try again. \n";
if (cin.fail()) {
cin.clear();
cin.ignore();
}
}
}
return number;
}
main.cpp
#include "random/random.h"
#include "util/util.h"
using namespace std;
int main() {
int n, min, max, p;
n = askInteger("numbers quantity", true);
p = askInteger("precession", true);
min = askInteger("min value (included)", false);
max = askInteger("max value (included)", false);
while (max <= min) {
cout << "max value should be greather than " << min << "\n";
max = askInteger("max value (included)", false);
}
initialize();
for (int i = 0; i < n; i++) {
cout << randomFloat(min, max, p) << "\n";
}
}
And It gives me this result:
Undefined symbols for architecture arm64:
"askInteger(char const*, bool)", referenced from:
_main in main-2552a5.o
"initialize()", referenced from:
_main in main-2552a5.o
"randomFloat(int, int, int)", referenced from:
_main in main-2552a5.o
ld: symbol(s) not found for architecture arm64
clang: error: linker command failed with exit code 1 (use -v to see invocation)
vscode applies this command to run the code:
Random-Numbers % cd "/Users/user/Downloads/Random-Numbers/" && g++ main.cpp -
o main && "/Users/user/Downloads/Random-Numbers/"main
Here is my project structure:
enter image description here
I have tried already
gcc *.h
gcc -c *.h
g++ *.cpp
g++ -o *.cpp
g++ -c *.cpp
AT LAST! I could run it
c++ vscode default commands doesn’t match with g++ mac compiler
c++ vscode default command to run main:
d "/Users/user/Downloads/Random-Numbers/" && g++ main.cpp -o main && "/Users/user/Downloads/Random-Numbers/"main
user#MacBook-Pro-de-user Random-Numbers % cd "/Users/user/Downloads/Random-Numbers/" && g++ main.cpp -o main && "/Users/
user/Downloads/Random-Numbers/"main
you have to compile by terminal with the command:
user#MacBook-Pro-de-user Random-Numbers % g++ -o ./main.exe random/random.cpp util/util.cpp main.cpp
user#MacBook-Pro-de-user Random-Numbers % ./main.exe
commands g++
I need to use LAPACKe functions in a code that should run on Linux and macOS, but the problem is under OsX. I have a MacBook Pro 2021, with an M1 Pro processor, running an OsX 12.6.2
I wrote an example code
#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__
#include <Accelerate/Accelerate.h>
#else
#include <lapacke.h>
#endif
const int N= 3, NRHS=2, LDA=N, LDB=N;
const int NN=5, NRHS2=3;
int main(int argc, char** argv) {
/******* Example of using the lapack library with
the C interface for solving linear systems
for a general full matrix **************/
int ipiv[N], info;
// a[LDA*N]
double a[] = {
6.80, -2.11, 5.66,
-6.05, -3.30, 5.36,
-0.45, 2.58, -2.70
};
// b[LDB*NRHS]
double b[] = {
4.02, 6.19, -8.22,
-1.56, 4.00, -8.67
};
printf("\nTest of using LAPACKe Library\n");
printf("Matrix A : %d by %d\n", N, N);
for(int i=0; i<N; i++){
int s = i;
for(int j=0; j<N; j++, s+=LDA) printf(" % 6.2lf", a[s]);
printf("\n");
}
printf("\nRight hand side %d vectors of %d\n", NRHS, N);
for(int i=0; i<N; i++){
int s = i;
for(int j=0; j<NRHS; j++, s+=LDA) printf(" % 10.6lf", b[s]);
printf("\n");
}
/** As long the LAPACK_COL_MAJOR is used, the matrix is
filled up by columns, pay attention in the way is printed **/
info = LAPACKE_dgesv( LAPACK_COL_MAJOR, N, NRHS, a, LDA, ipiv, b, LDB);
if (info == 0) {
printf("\nFactorization of LU : %d by %d\n", N, N);
for(int i=0; i<N; i++){
int s = i;
for(int j=0; j<N; j++, s+=LDA) printf(" % 10.5lf", a[s]);
printf("\n");
}
printf("\nSolution of %dright hand side vectors of %d\n", NRHS, N);
for(int i=0; i<N; i++){
int s = i;
for(int j=0; j<NRHS; j++, s+=LDA) printf(" % 10.6lf", b[s]);
printf("\n");
}
printf("\nFactorization pivot indices by %d\n", N);
for(int i=0; i<N; i++) printf(" % 5.0d", ipiv[i]);
printf("\n\n");
} // End of if (info == 0)
else
printf("An error ocurred in the LAPACK lib dgesv, with code %d\n", info);
/******* Example of using the lapack library with
the C interface for solving linear systems
for a trigiagonal matrix **************/
int ldb = NN;
// d1[NN-1] lower diagonal
double dl[] = {1, 4, 4, 1};
// d[NN] main diagonal
double d[] = {-2, -2, -2, -2, -2};
// du[NN-1] upper diagonal
double du[] = {1, 4, 4, 1};
// bb[NN*NRHS2] number of righ hand side vectors
double bb[] = {
3., 5., 5., 5., 3.,
-1.56, 4., -8.67, 1.75, 2.86,
9.81, -4.09, -4.57, -8.61, 8.99
};
info = LAPACKE_dgtsv(LAPACK_COL_MAJOR, NN, NRHS2, dl, d, du, bb, ldb);
if (info == 0) {
printf("\nTest of using LAPACKe Library for Tridiagonal systems\n");
printf("\nSolution of %dright hand side vectors of %d\n", NRHS2, NN);
for(int i=0; i<NN; i++){
int s = i;
for(int j=0; j<NRHS2; j++, s+=ldb) printf(" % 10.6lf", bb[s]);
printf("\n");
}
} // End of if (info == 0)
else
printf("An error ocurred in the LAPACK lib dgesv, with code %d\n", info);
return 0;
}
This runs in a WSL (Windows Subsystem Linux) and compiles successfully making the command
gcc example2.c -o lapack -llapacke
But, when I try to compile it in my Mac using the line
gcc example2.c -o lapacke -framework Accelerate
I receive the following error:
example2.c:47:9: error: implicit declaration of function 'LAPACKE_dgesv' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
info = LAPACKE_dgesv( LAPACK_COL_MAJOR, N, NRHS, a, LDA, ipiv, b, LDB);
^
example2.c:47:24: error: use of undeclared identifier 'LAPACK_COL_MAJOR'
info = LAPACKE_dgesv( LAPACK_COL_MAJOR, N, NRHS, a, LDA, ipiv, b, LDB);
^
example2.c:91:9: error: implicit declaration of function 'LAPACKE_dgtsv' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
info = LAPACKE_dgtsv(LAPACK_COL_MAJOR, NN, NRHS2, dl, d, du, bb, ldb);
^
example2.c:91:9: note: did you mean 'LAPACKE_dgesv'?
example2.c:47:9: note: 'LAPACKE_dgesv' declared here
info = LAPACKE_dgesv( LAPACK_COL_MAJOR, N, NRHS, a, LDA, ipiv, b, LDB);
^
example2.c:91:23: error: use of undeclared identifier 'LAPACK_COL_MAJOR'
info = LAPACKE_dgtsv(LAPACK_COL_MAJOR, NN, NRHS2, dl, d, du, bb, ldb);
^
4 errors generated.
It looks like inside the Accelerate.h the headers of LAPACKe functions are not defined. I was looking around but I don't see anything that can help me.
BTW, I have another example using cblas and it runs smoothly
How can an array of structs that has been dynamically allocated on the host be used by a kernel, without passing the array of structs as a kernel argument? This seems like a common procedure with a good amount of documentation online, yet it doesn't work on the following program.
Note: Please note that the following questions have been studied before posting this question:
1) copying host memory to cuda __device__ variable 2) Global variable in CUDA 3) Is there any way to dynamically allocate constant memory? CUDA
So far, unsuccessful attempts have been made to:
Dynamically allocate array of structs with cudaMalloc(), then
Use cudaMemcpyToSymbol() with the pointer returned from cudaMalloc() to copy to a __device__ variable which can be used by the kernel.
Code attempt:
NBody.cu (error checking using cudaStatus has mostly been omitted for better readability, and function to read data from file into dynamic array removed):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCK 256
struct nbody {
float x, y, vx, vy, m;
};
typedef struct nbody nbody;
// Global declarations
nbody* particle;
// Device variables
__device__ unsigned int d_N; // Kernel can successfully access this
__device__ nbody d_particle; // Update: part of problem was here with (*)
// Aim of kernel: to print contents of array of structs without using kernel argument
__global__ void step_cuda_v1() {
int i = threadIdx.x + blockDim.x * blockIdx.x;
if (i < d_N) {
printf("%.f\n", d_particle.x);
}
}
int main() {
unsigned int N = 10;
unsigned int I = 1;
cudaMallocHost((void**)&particle, N * sizeof(nbody)); // Host allocation
cudaError_t cudaStatus;
for (int i = 0; i < N; i++) particle[i].x = i;
nbody* particle_buf; // device buffer
cudaSetDevice(0);
cudaMalloc((void**)&particle_buf, N * sizeof(nbody)); // Allocate device mem
cudaMemcpy(particle_buf, particle, N * sizeof(nbody), cudaMemcpyHostToDevice); // Copy data into device mem
cudaMemcpyToSymbol(d_particle, &particle_buf, sizeof(nbody*)); // Copy pointer to data into __device__ var
cudaMemcpyToSymbol(d_N, &N, sizeof(unsigned int)); // This works fine
int NThreadBlock = (N + BLOCK - 1) / BLOCK;
for (int iteration = 0; iteration <= I; iteration++) {
step_cuda_v1 << <NThreadBlock, BLOCK >> > ();
//step_cuda_v1 << <1, 5 >> > (particle_buf);
cudaDeviceSynchronize();
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "ERROR: %s\n", cudaGetErrorString(cudaStatus));
exit(-1);
}
}
return 0;
}
OUTPUT:
"ERROR: kernel launch failed."
Summary:
How can I print the contents of the array of structs from the kernel, without passing it as a kernel argument?
Coding in C using VS2019 with CUDA 10.2
With the help of #Robert Crovella and #talonmies, here is the solution that outputs a sequence that cycles from 0 to 9 repeatedly.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCK 256
//#include "Nbody.h"
struct nbody {
float x, y, vx, vy, m;
};
typedef struct nbody nbody;
// Global declarations
nbody* particle;
// Device variables
__device__ unsigned int d_N; // Kernel can successfully access this
__device__ nbody* d_particle;
//__device__ nbody d_particle; // Update: part of problem was here with (*)
// Aim of kernel: to print contents of array of structs without using kernel argument
__global__ void step_cuda_v1() {
int i = threadIdx.x + blockDim.x * blockIdx.x;
if (i < d_N) {
printf("%.f\n", d_particle[i].x);
}
}
int main() {
unsigned int N = 10;
unsigned int I = 1;
cudaMallocHost((void**)&particle, N * sizeof(nbody)); // Host allocation
cudaError_t cudaStatus;
for (int i = 0; i < N; i++) particle[i].x = i;
nbody* particle_buf; // device buffer
cudaSetDevice(0);
cudaMalloc((void**)&particle_buf, N * sizeof(nbody)); // Allocate device mem
cudaMemcpy(particle_buf, particle, N * sizeof(nbody), cudaMemcpyHostToDevice); // Copy data into device mem
cudaMemcpyToSymbol(d_particle, &particle_buf, sizeof(nbody*)); // Copy pointer to data into __device__ var
cudaMemcpyToSymbol(d_N, &N, sizeof(unsigned int)); // This works fine
int NThreadBlock = (N + BLOCK - 1) / BLOCK;
for (int iteration = 0; iteration <= I; iteration++) {
step_cuda_v1 << <NThreadBlock, BLOCK >> > ();
//step_cuda_v1 << <1, 5 >> > (particle_buf);
cudaDeviceSynchronize();
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "ERROR: %s\n", cudaGetErrorString(cudaStatus));
exit(-1);
}
}
return 0;
}
I want to compile an easy openacc sample (it was attached) , it was correctly compiled but when i run it got an error :
compile with : gcc-5 -fopenacc accVetAdd.c -lm
run with : ./a.out
got error in runtime
error: libgomp: while loading libgomp-plugin-host_nonshm.so.1: libgomp-plugin-host_nonshm.so.1: cannot open shared object file: No such file or directory
I google it and find only one page! then i ask how to fix this problem?
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
int main(int argc, char* argv[])
{
// Size of vectors
int n = 10000;
// Input vectors
double *restrict a;
double *restrict b;
// Output vector
double *restrict c;
// Size, in bytes, of each vector
size_t bytes = n*sizeof(double);
// Allocate memory for each vector
a = (double*)malloc(bytes);
b = (double*)malloc(bytes);
c = (double*)malloc(bytes);
// Initialize content of input vectors, vector a[i] = sin(i)^2 vector b[i] = cos(i)^2
int i;
for (i = 0; i<n; i++) {
a[i] = sin(i)*sin(i);
b[i] = cos(i)*cos(i);
}
// sum component wise and save result into vector c
#pragma acc kernels copyin(a[0:n],b[0:n]), copyout(c[0:n])
for (i = 0; i<n; i++) {
c[i] = a[i] + b[i];
}
// Sum up vector c and print result divided by n, this should equal 1 within error
double sum = 0.0;
for (i = 0; i<n; i++) {
sum += c[i];
}
sum = sum / n;
printf("final result: %f\n", sum);
// Release memory
free(a);
free(b);
free(c);
return 0;
}
libgomp dynamically loads shared object files for the plugins it supports, such as the one implementing the host_nonshm device. If they're installed in a non-standard directory (that is, not in the system's default search path), you need to tell the dynamic linker where to look for these shared object files: either compile with -Wl,-rpath,[...], or set the LD_LIBRARY_PATH environment variable.
I have a CUDA C code, when I try to compile it, nvcc gives me an error with an undefined identifier error: identifier "cudamalloc" is undefined, identifier "cudamemcpy" is undefined.
I'm running Windows 7 with Visual Studio 10 and CUDA Toolkit 4.0
I have installed Cuda on drive "C" and Visual Studio on drive "E" but im not sure that it is the problem.
I use this command to compile:
nvcc -o ej1b ej1b.cu
and this is my program:
#include <cuda.h>
#include <cstdio>
#include <cuda_runtime_api.h>
#include <device_functions.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
const int N = 512;
const int C = 5;
void init_CPU_array(int vec[],const int N){
unsigned int i;
for(i = 0; i < N; i++) {
vec[i] = i;
}
}
__global__ void kernel(int vec[],const int N, const int C){
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id<N)
vec[id] = vec[id] * C;
}
int main(){
int vec[N];
int vecRES[N];
int *vecGPU;
unsigned int cantaloc=N*sizeof(int);
init_CPU_array(vec,N);
cudamalloc((void**)&vecGPU,cantaloc);
cudamemcpy(vecGPU,vec,cantaloc,cudaMemcpyHostToDevice);
dim3 dimBlock(64);
dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x);
printf("-> Variable dimBlock.x = %d\n",dimBlock.x);
kernel<<<dimGrid, dimBlock>>>(vecGPU, N, C);
cudaThreadSynchronize();
cudamemcpy(vecRES,vecGPU,cantaloc,cudaMemcpyDeviceToHost);
cudaFree(vecGPU);
printf("%s \n","-> Resultados");
int i;
for(i=0;i<10;i++){
printf("%d ",vecRES[i]);
printf("%d \n",vec[i]);
}
return 0;
I used all those #include because I don't know where the problem is.
If you read the documentation, you will find the API calls are cudaMalloc and cudaMemcpy. C and C++ are case sensitive languages and you have the names incorrect.