Compile error in cuda - visual-studio-2010

I get a compile error in CUDA and I want to know why this error happens.
I also want to know whether this code will work on 2D arrays for image processing in the future.
My code is
#include <stdio.h>
#include <stdlib.h>
#include <conio.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include "device_launch_parameters.h"
#include<iostream>
using namespace std ;
#define BLOCK_WIDTH 16
// Prints every element of a pitched 2D int array, one element per thread.
//
//   d_A        base device pointer of the pitched allocation
//   pitch      distance between the starts of consecutive rows, in BYTES
//   rows, cols logical array dimensions, in elements
//
// Launch layout: 2D grid of 2D blocks; x indexes columns, y indexes rows.
// Threads that fall outside rows x cols do nothing.
// Note: device-side printf needs compute capability 2.0 or higher.
__global__ void kernel(int *d_A, size_t pitch, int rows, int cols){
    //compute the row
    int r = blockIdx.y*blockDim.y+threadIdx.y;
    //compute the column
    int c = blockIdx.x*blockDim.x+threadIdx.x;
    if((r < rows) && (c < cols)){
        // Rows are `pitch` bytes apart (not cols*sizeof(int)), so advance in
        // bytes through a char* before casting back to int*.
        int *Row = (int*)((char*)d_A + (size_t)r*pitch);
        int elem = Row[c];
        printf("%d ", elem);
    }
}
// Copies the host matrix A into pitched device memory, runs `kernel` over it,
// and copies the device data back into A.
//
//   A          array of `rows` pointers, each to a separately allocated row of
//              `cols` ints (the rows are NOT contiguous in host memory)
//   rows, cols matrix dimensions, in elements
//
// Any CUDA runtime failure is reported on stderr; the device buffer is always
// released before returning.
void test(int **A, int rows, int cols){
    if(A == NULL || rows <= 0 || cols <= 0) return;

    int *d_A = NULL;
    size_t pitch = 0;
    const size_t rowBytes = sizeof(int)*cols;

    cudaError_t err = cudaMallocPitch((void**)&d_A, &pitch, rowBytes, rows);
    if(err != cudaSuccess){
        fprintf(stderr, "cudaMallocPitch failed: %s\n", cudaGetErrorString(err));
        return;
    }

    // A is a table of row pointers, so a single cudaMemcpy2D from A would copy
    // the pointer values instead of the data. Copy one row at a time.
    for(int r = 0; r < rows && err == cudaSuccess; ++r)
        err = cudaMemcpy((char*)d_A + (size_t)r*pitch, A[r], rowBytes, cudaMemcpyHostToDevice);

    if(err == cudaSuccess){
        //Define grid and block size (ceil-division so partial tiles are covered)
        int Yblocks = (rows + BLOCK_WIDTH - 1) / BLOCK_WIDTH;
        int Xblocks = (cols + BLOCK_WIDTH - 1) / BLOCK_WIDTH;
        // dim3 is (x, y, z): x must span the columns and y the rows, matching
        // how the kernel derives c from .x and r from .y.
        dim3 dimGrid(Xblocks, Yblocks, 1);
        dim3 dimBlock(BLOCK_WIDTH, BLOCK_WIDTH, 1);
        //Run kernel
        kernel<<<dimGrid, dimBlock>>>(d_A, pitch, rows, cols);
        // A launch does not return a status; bad configurations show up here.
        err = cudaGetLastError();
    }

    // The blocking copies below also wait for the kernel and surface any
    // error raised while it was executing.
    for(int r = 0; r < rows && err == cudaSuccess; ++r)
        err = cudaMemcpy(A[r], (char*)d_A + (size_t)r*pitch, rowBytes, cudaMemcpyDeviceToHost);

    if(err != cudaSuccess)
        fprintf(stderr, "CUDA error in test(): %s\n", cudaGetErrorString(err));

    // cudaFree takes the device pointer itself, not the address of the variable.
    cudaFree(d_A);
}
// Builds a 2x2 host matrix in which every element of row i holds i+2,
// passes it through test(), prints the matrix and releases it.
int main(){
    const int rows = 2;
    const int cols = 2;

    // One allocation per row, plus the table of row pointers.
    int **A = new int*[rows];
    for(int r = 0; r != rows; r++){
        int *line = new int[cols];
        for(int c = 0; c != cols; c++){
            line[c] = r + 2;
        }
        A[r] = line;
    }

    test(A, rows, cols);

    for(int r = 0; r != rows; r++){
        const int *line = A[r];
        for(int c = 0; c != cols; c++){
            cout << line[c] << " ";
        }
        cout << "\n";
    }

    // Free each row first, then the pointer table.
    for(int r = 0; r != rows; r++){
        delete[] A[r];
    }
    delete[] A;
    return 0;
}
I have in my laptop :
NVIDIA CUDA sample 7.5 ,
NVIDIA CUDA Toolkit 7.5 ,
NVIDIA CUDA Toolkit v5(64),
NVIDIA CUDA Tools SDK v4.0 ,
NVIDIA GPU computing SDK 4 ,
NVIDIA Graphic driver 306.94 ,
NVIDIA Nsigth visual studio edition 5.1.0.10602 ,
visual studio 2010 ,
NVIDIA GeForce 9300M GS ,
driver model :WDDM 1.1 ,
DDI version :10 ,
windows 7
I have this error
1>------ Build started: Project: 2Dexample, Configuration: Debug Win32 --
1>Build started 8/10/2016 6:29:45 AM.
1>InitializeBuildStatus:
1> Touching "Debug\2Dexample.unsuccessfulbuild".
1>AddCudaCompileDeps:
1>Skipping target "AddCudaCompileDeps" because all output files are up- to-date with respect to the input files.
1>AddCudaCompilePropsDeps:
1>Skipping target "AddCudaCompilePropsDeps" because all output files are up-to-date with respect to the input files.
1>CudaBuild:
1> Compiling CUDA source file kernel.cu...
1>
1> C:\Users\Amany\Documents\Visual Studio 2010\Projects\2Dexample\2Dexample>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.0\bin\nvcc.exe" -gencode=arch=compute_10,code=\"sm_10,compute_10\" --use-local-env --cl-version 2010 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin" - I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.0\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.0\include" -G -maxrregcount=0 --machine 32 --compile -1 -g -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -o "Debug\kernel.cu.obj" "C:\Users\Amany\Documents\Visual Studio 2010\Projects\2Dexample\2Dexample\kernel.cu"
1>nvcc : fatal error : Unknown option '1'
1>C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\BuildCustomizations\CUDA 5.0.targets(592,9): error MSB3721: The command ""C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.0\bin\nvcc.exe" -gencode=arch=compute_10,code=\"sm_10,compute_10\" --use-local-env --cl-version 2010 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.0\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.0\include" -G -maxrregcount=0 --machine 32 --compile -1 -g -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -o "Debug\kernel.cu.obj" "C:\Users\Amany\Documents\Visual Studio 2010\Projects\2Dexample\2Dexample\kernel.cu"" exited with code -1.
1>
1>Build FAILED.
1>
1>Time Elapsed 00:00:00.29
========== Build: 0 succeeded, 1 failed, 0 up-to-date, 0 skipped
I tried many things, but none of them worked:
How to Compile CUDA App is Visual Studio 2010?
https://devtalk.nvidia.com/default/topic/577900/error-msb3721-with-cuda-5-5-and-vs-2010-professional/
https://devtalk.nvidia.com/default/topic/577900/error-msb3721-with-cuda-5-5-and-vs-2010-professional/
but none of them worked.

I think it may be related to this weird value
nvcc.exe ....... --compile -1 .......
nvcc : fatal error : Unknown option '1'

Related

large-size page-locked memory copy get wrong result in CUDA

I found an issue with large page-locked memory in CUDA. Here are the source code and the makefile. The code allocates 10 GB of page-locked memory and copies data from device memory into it; the data in device memory is set to 1.0 before the copy.
#include <cuda.h>
#include <assert.h>
#include <cuda_runtime.h>
#include "helper_cuda.h"
// Sets x[0..n) to 1.0, one element per thread.
// Launch layout: 1D grid of 1D blocks providing at least n threads in total;
// surplus threads do nothing.
__global__
void test_k(double* x, size_t n)
{
    // Widen to size_t before multiplying: a 32-bit index wraps for large
    // launches, and the wrapped threads would then skip their elements.
    size_t gid = (size_t)blockIdx.x*blockDim.x + threadIdx.x;
    if(gid<n) x[gid] = 1.0 ;
}
// Fills a 10 GB device array with 1.0 via test_k, copies it into page-locked
// host memory that was zeroed beforehand, and counts the host elements that
// are still 0.0 after the copy.
//
// Usage: testLargePin [gpu_id]      (gpu_id defaults to 0)
int main(int argc, char* argv[])
{
    size_t n = size_t(10)*1024*1024*1024/sizeof(double);
    printf("\n n: %zu, page-locked memory size: %zu MB\n", n, n*sizeof(double)/1024/1024);
    double* x_h = NULL, *x_d = NULL;
    int gpuid = 0;
    if(argc>1 ) gpuid = atoi(argv[1]);
    printf("select gpu %d\n", gpuid);
    checkCudaErrors(cudaSetDevice(gpuid));
    checkCudaErrors(cudaMallocHost(&x_h, sizeof(double)*n));
    checkCudaErrors(cudaMalloc(&x_d, sizeof(double)*n));
    // size_t index: n is a size_t, so do not compare it against an int.
    for(size_t i = 0; i < n; ++i) x_h[i]=0.0;
    int nthd = 256;
    int nblk = (int)((n+nthd-1) / nthd);
    test_k<<<nblk, nthd, 0, 0>>>(x_d, n);
    // A launch returns no status. Check the launch configuration now and wait
    // for the kernel, so a kernel fault is reported here rather than being
    // attributed to the cudaMemcpy that follows.
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaMemcpy(x_h, x_d, sizeof(double)*n, cudaMemcpyDeviceToHost));
    int errCount = 0;
    for(size_t i = 0; i < n; ++i){
        if(x_h[i] == 0.0) errCount++;
    }
    printf("%s errCount: %d, which should be 0\n", errCount?"Error:":"Correct", errCount);
    checkCudaErrors(cudaFree(x_d));
    checkCudaErrors(cudaFreeHost(x_h));
    return 0;
}
# Builds testLargePin from testLargePin.cu with the CUDA 11.2 toolchain,
# generating code for sm_70 and sm_80.
CUDA_PATH = /depot/cuda/cuda-11.2/
CUDA_INC = -I$(CUDA_PATH)/include -I$(CUDA_PATH)/samples/common/inc
NVCC = $(CUDA_PATH)/bin/nvcc
NVCCXXFLAGS = -std=c++11 -O3 -w -m64 -Xptxas -dlcm=cg -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 $(CUDA_INC)

# all and clean are commands, not files.
.PHONY: all clean

all: testLargePin

# $^ expands to the prerequisites and $@ to the target name.
testLargePin: testLargePin.cu
	$(NVCC) $^ $(NVCCXXFLAGS) -o $@

clean:
	rm -f testLargePin
I run the binary on three different GPU servers(all with A100-SXM4-40GB). On machine 1, the result is correct. On machine 2, it reports
CUDA error at testLargePin.cu:31 code=719(cudaErrorLaunchFailure) "cudaMemcpy(x_h, x_d, sizeof(double)*n, cudaMemcpyDeviceToHost)"
On machine 3, its copy is wrong, there are lots of zeros in the page-locked array.
n: 1342177280, page-locked memory size: 10240 MB
select gpu 0
Error: errCount: 1024, which should be 0
Does anyone know the reason and how to fix the issue? For example, is there an API to check the maximum page-locked memory size on a given machine? Thanks in advance.
By NVIDIA, (https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc6c391505e117393cc2558fff6bfc2e9)
Error 719 is about dereferencing an invalid device pointer, accessing out of bounds shared memory, or system specific problem...
In my experience, explicit synchronization helps with memory errors and inconsistent results. Did you try adding cudaDeviceSynchronize(); after checkCudaErrors(cudaMemcpy(x_h, x_d, sizeof(double)*n, cudaMemcpyDeviceToHost));?
About page-locked memory, there's no limit in CUDA. I think you have to check this on your host side.

Why do I get 128 teams/thread blocks and 96 threads in each team/thread block using #pragma omp target teams distribute parallel for in OpenMP?

I am running this code on Ubuntu 18.04, clang/llvm compiler with Nvidia GTX 1070 GPU
// Element-wise matrix addition C = A + B, offloaded with OpenMP target directives.
// The data region maps A and B to the device and C back from it.
// NOTE(review): assumes A, B and C are arrays whose full extent the compiler
// can see (no array sections are given in the map clauses) — confirm.
#pragma omp target data map(to: A,B) map(from: C)
{
// Outer loop: the Row iterations are distributed across the teams.
#pragma omp target teams distribute
for(int n=0; n<Row; n++)
{
// Number of the team executing this outer iteration.
int team_id= omp_get_team_num();
// Inner loop: the Col iterations are divided among the threads of a parallel
// region started inside the current team.
#pragma omp parallel for default(shared) schedule(auto)
for(int j = 0; j <Col; j++)
{
// Thread number within the enclosing parallel region.
int thread_id = omp_get_thread_num();
// Debug trace of which team/thread handled element (n, j).
printf("Iteration= c[ %d ][ %d ], Team=%d, Thread=%d\n",n, j, team_id, thread_id);
C[n][j] = A[n][j] + B[n][j];
}
}
}
In the above code, the largest team number printed is 127 and the largest thread number is 95.
compile flags: clang++ -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_61 -Wall -O3 debug.cpp -o debug

undefined cudaMalloc symbols

I'm trying to compile the cublas example from the CUDA documentation
//Example 2. Application Using C and CUBLAS: 0-based indexing
//-----------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#define M 6
#define N 5
#define IDX2C(i,j,ld) (((j)*(ld))+(i))
/* Scales two strided runs of the device matrix m, both starting at element
 * (p,q) as addressed by IDX2C with leading dimension ldm:
 *   - n-p   elements taken with stride ldm are multiplied by alpha;
 *   - ldm-p elements taken with stride 1   are multiplied by beta.
 * The statuses returned by cublasSscal are not inspected. */
static __inline__ void modify (cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float beta){
    /* Both calls begin at the same element, so compute its address once. */
    float *start = m + IDX2C(p,q,ldm);
    const int strided_count = n - p;
    const int unit_count = ldm - p;

    cublasSscal (handle, strided_count, &alpha, start, ldm);
    cublasSscal (handle, unit_count, &beta, start, 1);
}
/* Fills an M x N host matrix (stored via IDX2C), uploads it to the device,
 * scales parts of it with modify(), downloads it again and prints it.
 *
 * Returns EXIT_SUCCESS on success and EXIT_FAILURE if any allocation or
 * cuBLAS call fails. All resources acquired so far are released on every
 * path through the single cleanup section at the end. */
int main (void){
    cudaError_t cudaStat;
    cublasStatus_t stat;
    cublasHandle_t handle;
    int haveHandle = 0;              /* set once cublasCreate has succeeded */
    int exitCode = EXIT_FAILURE;     /* flipped to success only at the very end */
    int i, j;
    float* devPtrA = 0;
    float* a = 0;

    a = (float *)malloc (M * N * sizeof (*a));
    if (!a) {
        printf ("host memory allocation failed");
        goto cleanup;
    }
    for (j = 0; j < N; j++) {
        for (i = 0; i < M; i++) {
            a[IDX2C(i,j,M)] = (float)(i * M + j + 1);
        }
    }
    cudaStat = cudaMalloc ((void**)&devPtrA, M*N*sizeof(*a));
    if (cudaStat != cudaSuccess) {
        devPtrA = 0;                 /* nothing to free on the device */
        printf ("device memory allocation failed");
        goto cleanup;
    }
    stat = cublasCreate(&handle);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("CUBLAS initialization failed\n");
        goto cleanup;
    }
    haveHandle = 1;
    stat = cublasSetMatrix (M, N, sizeof(*a), a, M, devPtrA, M);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data download failed");
        goto cleanup;
    }
    modify (handle, devPtrA, M, N, 1, 2, 16.0f, 12.0f);
    stat = cublasGetMatrix (M, N, sizeof(*a), devPtrA, M, a, M);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data upload failed");
        goto cleanup;
    }
    for (j = 0; j < N; j++) {
        for (i = 0; i < M; i++) {
            printf ("%7.0f", a[IDX2C(i,j,M)]);
        }
        printf ("\n");
    }
    exitCode = EXIT_SUCCESS;

cleanup:
    /* Release only what was actually acquired; free(0) is a no-op. */
    if (devPtrA) cudaFree (devPtrA);
    if (haveHandle) cublasDestroy(handle);
    free(a);
    return exitCode;
}
I saved this file into "cudaexample.c" and am trying to compile with gcc cudaexample.c -I/usr/local/cuda/include -L/usr/local/cuda/lib -lcuda -lcublas
I get an undefined symbols error:
Undefined symbols for architecture x86_64:
"_cudaFree", referenced from:
_main in ccpPWjbO.o
"_cudaMalloc", referenced from:
_main in ccpPWjbO.o
ld: symbol(s) not found for architecture x86_64
collect2: error: ld returned 1 exit status
It seems like I've specified the commands properly as other symbols (e.g. cublasCreate) are found. Why are Free and Malloc not present?
Relevant details:
OSX: 10.10.2
gcc: 4.8.4 (target: x86_64-apple-darwin14)
Graphics: NVIDA GeForce GT 650M 1024 MB
I downloaded and installed the CUDA-6.5 toolkit
Those API functions (e.g. cudaMalloc) are contained in the CUDA runtime library. You are not linking against that library, so those symbols aren't found during the link phase.
Add -lcudart to your link flags:
-I/usr/local/cuda/include -L/usr/local/cuda/lib -lcuda -lcublas -lcudart
and it should fix that issue for you.
(-lcuda is only needed if you are using CUDA driver API functions. You can remove that if you wish.)

how do i include sm_11_atomic_function.h? [duplicate]

I'm having an issue with my kernel.cu file.
Calling nvcc -v kernel.cu -o kernel.o I'm getting this error:
kernel.cu(17): error: identifier "atomicAdd" is undefined
My code:
#include "dot.h"
#include <cuda.h>
#include "device_functions.h" //might call atomicAdd
// Dot-product kernel: each thread multiplies one pair of elements into shared
// memory, then thread 0 of the block sums the block's products and adds that
// partial sum to *c with atomicAdd.
//
// NOTE(review): there is no length parameter, so this presumably expects
// blockDim.x == THREADS_PER_BLOCK and gridDim.x * blockDim.x equal to the
// vector length — confirm at the launch site.
__global__ void dot (int *a, int *b, int *c){
    __shared__ int temp[THREADS_PER_BLOCK];

    const int lane = threadIdx.x;
    const int gid = lane + blockIdx.x * blockDim.x;

    temp[lane] = a[gid] * b[gid];

    // All products must be in shared memory before any thread reads them.
    __syncthreads();

    // Only the first thread of the block performs the summation.
    if( lane != 0 ) return;

    int blockSum = 0;
    int k = 0;
    while( k < THREADS_PER_BLOCK ){
        blockSum += temp[k];
        ++k;
    }
    atomicAdd(c, blockSum);
}
Any suggestions?
You need to specify an architecture to nvcc which supports atomic memory operations (the default architecture is 1.0 which does not support atomics). Try:
nvcc -arch=sm_11 -v kernel.cu -o kernel.o
and see what happens.
EDIT in 2015 to note that the default architecture in CUDA 7.0 is now 2.0, which supports atomic memory operations, so this should not be a problem in newer toolkit versions.
Today with the latest cuda SDK and toolkit this solution will not work.
People also say that adding:
compute_11,sm_11; OR compute_12,sm_12; OR compute_13,sm_13;
compute_20,sm_20;
compute_30,sm_30;
to CUDA in the Project Properties in Visual Studio 2010 will work. It doesn't.
You have to specify this for the .cu file itself in its own properties (Under the C++/CUDA->Device->Code Generation) tab such as:
compute_13,sm_13;
compute_20,sm_20;
compute_30,sm_30;

Cuda doesn't calculate what it is expected to, just silently ignores my code

I'm encountering a very strange problem: my 9800GT doesn't seem to calculate at all.
I've tried all hello-worlds i've found in the internet, here's one of them:
this program creates 1..100 array on hosts, sends it to device, calculates a square of each value, returns it to host, prints the results.
#include "stdafx.h"
#include <stdio.h>
#include <cuda.h>
// Squares the first N elements of a in place, one element per thread.
// Launch layout: 1D grid of 1D blocks; threads whose index is >= N do nothing.
__global__ void square_array(float *a, int N)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= N) return;

    const float v = a[i];
    a[i] = v * v;
}
// main routine that executes on the host
// main routine that executes on the host
//
// Creates the array 0..N-1 on the host, squares it on the device with
// square_array and prints the results. Every CUDA call is checked: on failure
// the failing step and the runtime's error string go to stderr and the
// program returns 1 instead of printing stale host data.
int main(void)
{
    float *a_h = NULL, *a_d = NULL;  // Pointer to host & device arrays
    const int N = 100;               // Number of elements in arrays
    size_t size = N * sizeof(float);

    a_h = (float *)malloc(size);     // Allocate array on host
    if (a_h == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    // Initialize host array
    for (int i=0; i<N; i++) a_h[i] = (float)i;

    // `step` names the call being attempted so the error report is specific.
    const char *step = "cudaMalloc";
    cudaError_t err = cudaMalloc((void **) &a_d, size);  // Allocate array on device

    if (err == cudaSuccess) {
        step = "cudaMemcpy (host to device)";
        err = cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
    }

    if (err == cudaSuccess) {
        // Do calculation on device:
        int block_size = 4;
        int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
        step = "square_array launch";
        square_array <<< n_blocks, block_size >>> (a_d, N);
        // A launch returns no status; launch failures are reported here.
        err = cudaGetLastError();
    }

    if (err == cudaSuccess) {
        // Retrieve result from device and store it in host array.
        // This blocking copy also reports errors raised while the kernel ran.
        step = "cudaMemcpy (device to host)";
        err = cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess) {
        // Print results
        for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
    } else {
        fprintf(stderr, "%s failed: %s\n", step, cudaGetErrorString(err));
    }

    // Cleanup (cudaFree accepts a null pointer)
    free(a_h); cudaFree(a_d);
    return err == cudaSuccess ? 0 : 1;
}
so the output is expected to be:
1 1.000
2 4.000
3 9.000
4 16.000
..
I swear back in 2009 it worked perfectly (vista 32, deviceemu)
now i get output:
1 1.000
2 2.000
3 3.000
4 4.000
so my card doesnt do anything. What can be the problem?
Configuration is:
win7x64
visual studio 2010 32bit
cuda toolkit 3.2 64bit
compilation settings: cuda 3.2 toolkit, 32-bit target platform, deviceemu or not - doesnt matter, the results are the same.
i also tried it on my vmware xp(32bit) visual studio 2008. the result is the same.
Please help me; I barely got the program to compile, and now I need it to work.
You can also view my project with all it needs from my post at nvidia forums ( 2.7 kb)
Thanks, Ilya
Your code produces the intended results on my Linux system so I would suggest checking the error codes returned by cudaMalloc and cudaMemcpy to ensure there are no silent driver/runtime errors. For example
cudaError_t error = cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
printf("error status: %s\n", cudaGetErrorString(error));
should print
error status: no error
if the call is successful.
Also, I believe device emulation was deprecated in CUDA 3.0 and removed entirely in CUDA 3.1. I don't know if that's related to your problem though.
To compile several files you'd just do something like this
$nvcc -c foo.cu
$nvcc -c bar.cu
$nvcc -o foobar foo.o bar.o
alternatively, you can do the linking in the last step with g++ like so
$g++ -o foobar foo.o bar.o -L/usr/local/cuda/lib64 -lcudart

Resources