Breakpoints inside CUDA kernel __global__ not hitting - visual-studio-2010

Using Visual Studio 2010 on Windows 7 with Nsight 2.1.
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// incrementArray.cu
#include <stdio.h>
#include <assert.h>
// Host reference implementation: adds 1.0f, in place, to each of the first N
// elements of a. Does nothing when N <= 0.
void incrementArrayOnHost(float *a, int N)
{
    int k = 0;
    while (k < N) {
        a[k] += 1.f;
        ++k;
    }
}
// Kernel: each thread computes its flat global index and, when that index is
// below N, adds 1.0f to a[idx] in place.
// i and j are assigned but never read afterwards; in this snippet they only
// serve as targets for the breakpoints marked below.
__global__ void incrementArrayOnDevice(float *a, int N)
{
// Flat 1-D global thread index.
int idx = blockIdx.x*blockDim.x + threadIdx.x;
int j = idx;
int i = 2;
i = i+j; //->breakpoint here
// Threads whose index is >= N do nothing.
if (idx<N) a[idx] = a[idx]+1.f; //->breakpoint here
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Increments a 10-element array on the host and on the device and compares
// the two results element by element.
// Returns 0 when every step succeeded and the results match, 1 otherwise.
int main(void)
{
    const int N = 10;
    const int blockSize = 4;
    // Ceiling division so a partial last block still covers the tail.
    const int nBlocks = (N + blockSize - 1) / blockSize;
    const size_t size = N * sizeof(float);
    float *a_h = (float *)malloc(size);  // host input / host result
    float *b_h = (float *)malloc(size);  // device result copied back
    float *a_d = NULL;                   // device buffer
    int i;

    bool ok = (a_h != NULL && b_h != NULL);
    if (!ok) fprintf(stderr, "host allocation failed\n");

    if (ok) {
        // initialization of host data
        for (i = 0; i < N; i++) a_h[i] = (float)i;
        ok = cudaOk(cudaMalloc((void **)&a_d, size), "cudaMalloc") &&
             cudaOk(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice),
                    "cudaMemcpy (host to device)");
    }

    if (ok) {
        // do calculation on host
        incrementArrayOnHost(a_h, N);
        // do calculation on device
        incrementArrayOnDevice<<<nBlocks, blockSize>>>(a_d, N);
        // A launch returns nothing itself; configuration errors are read here,
        // and execution errors surface at the blocking copy that follows.
        ok = cudaOk(cudaGetLastError(), "incrementArrayOnDevice launch") &&
             cudaOk(cudaMemcpy(b_h, a_d, size, cudaMemcpyDeviceToHost),
                    "cudaMemcpy (device to host)");
    }

    if (ok) {
        // Explicit comparison instead of assert(), so the check also runs in
        // builds that define NDEBUG.
        for (i = 0; i < N; i++) {
            if (a_h[i] != b_h[i]) {
                fprintf(stderr, "mismatch at %d: host %f, device %f\n",
                        i, a_h[i], b_h[i]);
                ok = false;
            }
        }
    }

    // cleanup (free(NULL) and cudaFree(NULL) are both harmless)
    free(a_h);
    free(b_h);
    cudaFree(a_d);
    return ok ? 0 : 1;
}
I've tried inserting breakpoints at the lines marked above, inside my __global__ void incrementArrayOnDevice(float *a, int N) kernel, but they are never hit.
When I start debugging (F5) in Visual Studio and try to step into incrementArrayOnDevice <<< nBlocks, blockSize >>> (a_d, N);, the debugger skips the entire kernel.
I also tried to add a watch on the variables i and j, but got the error "CXX0017: Error: symbol "i" not found."
Is this normal? Could someone please try this on their PC and let me know whether they can hit the breakpoints? If you can, what might be wrong with my setup? Please help! :(

Nsight debugging is different from Visual Studio debugging. You need to start the Nsight debugger, not the regular Visual Studio debugger, to hit breakpoints inside a kernel. However, for this you need two GPUs. Do you have two cards in the first place? Please check.

You can debug on a single GPU, but only under the following conditions:
You have to be using the CUDA 5.0 toolkit.
You have to be programming on a GPU that supports the 303.xx ForceWare driver or higher.

Related

CUDA Initialize Array on Device

I am very new to CUDA and I am trying to initialize an array on the device and copy the result back to the host to print it, to show whether it was initialized correctly. I am doing this because the end goal is a dot product: I multiply two arrays element-wise, store the products in a third array, and then sum that array so that only a single value has to be returned to the host.
In the code I am working on, I am only trying to check that I am initializing the array correctly. I am trying to create an array of size N that follows the pattern 1,2,3,4,5,6,7,8,1,2,3,...
This is the code that I've written. It compiles without issue, but when I run it the terminal hangs and I have no clue why. Could someone help me out here? I'm so incredibly confused :\
#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#define ARRAY_SIZE 100
#define BLOCK_SIZE 32
// Kernel under discussion: writes the current value of a shared counter into
// a_d at the thread's global index, then advances the counter, resetting it
// to 1 once it has reached 8.
// NOTE(review): temp is a __shared__ variable that is read below without ever
// being initialized, and every thread reads and modifies it with no
// synchronization between threads.
// NOTE(review): b_d, c_d and size are accepted but not used here, and a_d[x]
// is written without any bounds check against size.
__global__ void cu_kernel (int *a_d,int *b_d,int *c_d, int size)
{
// Flat 1-D global thread index.
int x = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ int temp;
// Store the counter, then either increment it or wrap it back to 1.
if(temp != 8){
a_d[x] = temp;
temp++;
} else {
a_d[x] = temp;
temp = 1;
}
}
// Host driver: allocates the device buffers, launches cu_kernel, copies a_d
// back into a_h and prints all ARRAY_SIZE values.
int main (int argc, char *argv[])
{
//declare pointers for arrays
// NOTE(review): sum_h is declared but never used in this function.
int *a_d, *b_d, *c_d, *sum_h, *sum_d,a_h[ARRAY_SIZE];
//set space for device variables
// NOTE(review): the cudaMalloc return codes are not checked, and none of
// these buffers is freed before main ends.
cudaMalloc((void**) &a_d, sizeof(int) * ARRAY_SIZE);
cudaMalloc((void**) &b_d, sizeof(int) * ARRAY_SIZE);
cudaMalloc((void**) &c_d, sizeof(int) * ARRAY_SIZE);
cudaMalloc((void**) &sum_d, sizeof(int));
// set execution configuration
// NOTE(review): ARRAY_SIZE/BLOCK_SIZE is an integer division, so when
// ARRAY_SIZE is not a multiple of BLOCK_SIZE the tail elements get no thread.
dim3 dimblock (BLOCK_SIZE);
dim3 dimgrid (ARRAY_SIZE/BLOCK_SIZE);
// actual computation: call the kernel
cu_kernel <<<dimgrid, dimblock>>> (a_d,b_d,c_d,ARRAY_SIZE);
cudaError_t result;
// transfer results back to host
// Only this copy's return code is checked.
result = cudaMemcpy (a_h, a_d, sizeof(int) * ARRAY_SIZE, cudaMemcpyDeviceToHost);
if (result != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed.");
exit(1);
}
// print the array that was copied back from the device
printf ("Final state of the array:\n");
for (int i =0; i < ARRAY_SIZE; i++) {
printf ("%d ", a_h[i]);
}
printf ("\n");
}
There are at least 3 issues with your kernel code:
1. You are using the shared memory variable temp without initializing it.
2. You are not resolving the order in which threads access a shared variable, as discussed here.
3. You are imagining (perhaps) a particular order of thread execution, and CUDA provides no guarantees in that area.
The first item seems self-evident; however, naive methods to initialize it in a multi-threaded environment like CUDA are not going to work. Firstly, we have the multi-threaded access pattern again. Furthermore, in a multi-block scenario, shared memory in one block is logically distinct from shared memory in another block.
Rather than wrestle with mechanisms unsuited to creating the pattern you desire (informed by notions carried over from a serial processing environment), I would simply do something trivial like this:
// Fills a_d with the repeating pattern 1,2,...,8: the element at global
// thread index g receives (g & 7) + 1. Threads with g >= size write nothing.
// b_d and c_d are part of the signature but are not touched.
__global__ void cu_kernel (int *a_d,int *b_d,int *c_d, int size)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= size) return;
    a_d[gid] = (gid & 7) + 1;
}
Are there other ways to do it? certainly.
// Variant that goes through shared memory: thread 0 of each block publishes
// the block's starting global index, and after a block-wide barrier every
// in-range thread writes ((start + threadIdx.x) & 7) + 1 to its element.
// b_d and c_d are part of the signature but are not touched.
__global__ void cu_kernel (int *a_d,int *b_d,int *c_d, int size)
{
    __shared__ int blockStart;
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    if (threadIdx.x == 0) {
        blockStart = blockIdx.x * blockDim.x;
    }
    // The barrier sits outside any conditional so every thread of the block
    // reaches it; only afterwards do out-of-range threads drop out.
    __syncthreads();

    if (gid >= size) return;
    a_d[gid] = ((blockStart + threadIdx.x) & 7) + 1;
}
You can get as fancy as you like.
These changes will still leave a few values at zero at the end of the array, because ARRAY_SIZE/BLOCK_SIZE = 100/32 launches only 3 blocks (96 threads) for 100 elements. Fixing that requires changes to your grid sizing. There are many questions about this already, or you can study a sample code like vectorAdd.

Accessing dynamically allocated arrays on device (without passing them as kernel arguments)

How can an array of structs that has been dynamically allocated on the host be used by a kernel, without passing the array of structs as a kernel argument? This seems like a common procedure with a good amount of documentation online, yet it doesn't work on the following program.
Note: Please note that the following questions have been studied before posting this question:
1) copying host memory to cuda __device__ variable 2) Global variable in CUDA 3) Is there any way to dynamically allocate constant memory? CUDA
So far, unsuccessful attempts have been made to:
Dynamically allocate array of structs with cudaMalloc(), then
Use cudaMemcpyToSymbol() with the pointer returned from cudaMalloc() to copy to a __device__ variable which can be used by the kernel.
Code attempt:
NBody.cu (error checking using cudaStatus has mostly been omitted for better readability, and function to read data from file into dynamic array removed):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCK 256
// One record per body. The field meanings are not stated in this file;
// presumably position (x, y), velocity (vx, vy) and mass (m) — inferred from
// the names only, TODO confirm.
struct nbody {
float x, y, vx, vy, m;
};
typedef struct nbody nbody;
// Global declarations
// Host-side pointer to the array of bodies; main allocates it.
nbody* particle;
// Device variables
__device__ unsigned int d_N; // Kernel can successfully access this
// NOTE(review): this declares a single nbody object, not a pointer, yet main
// copies sizeof(nbody*) bytes of a device pointer into it.
__device__ nbody d_particle; // Update: part of problem was here with (*)
// Aim of kernel: to print contents of array of structs without using kernel argument
// Kernel taking no arguments: every thread whose global index is below d_N
// prints the x field of the __device__ variable d_particle.
__global__ void step_cuda_v1() {
// Flat 1-D global thread index.
int i = threadIdx.x + blockDim.x * blockIdx.x;
if (i < d_N) {
// NOTE(review): d_particle is a single object here and i is not used to
// index anything, so every printing thread prints the same field.
printf("%.f\n", d_particle.x);
}
}
// Host driver: fills a pinned host array of N bodies, copies it to a device
// buffer, stores the buffer's address and N in __device__ variables, then
// launches step_cuda_v1 and checks for errors after each launch.
int main() {
unsigned int N = 10;
unsigned int I = 1;
cudaMallocHost((void**)&particle, N * sizeof(nbody)); // Host allocation
cudaError_t cudaStatus;
// Only the x field is set; the other fields are left as allocated.
for (int i = 0; i < N; i++) particle[i].x = i;
nbody* particle_buf; // device buffer
cudaSetDevice(0);
cudaMalloc((void**)&particle_buf, N * sizeof(nbody)); // Allocate device mem
cudaMemcpy(particle_buf, particle, N * sizeof(nbody), cudaMemcpyHostToDevice); // Copy data into device mem
// NOTE(review): this writes the bytes of a pointer into d_particle; confirm
// that d_particle is declared with a matching pointer type.
cudaMemcpyToSymbol(d_particle, &particle_buf, sizeof(nbody*)); // Copy pointer to data into __device__ var
cudaMemcpyToSymbol(d_N, &N, sizeof(unsigned int)); // This works fine
// Ceiling division: enough blocks of BLOCK threads to cover N.
int NThreadBlock = (N + BLOCK - 1) / BLOCK;
// The condition is <=, so the kernel is launched I + 1 times.
for (int iteration = 0; iteration <= I; iteration++) {
step_cuda_v1 << <NThreadBlock, BLOCK >> > ();
//step_cuda_v1 << <1, 5 >> > (particle_buf);
cudaDeviceSynchronize();
// The error is read only after the synchronize call; that call's own return
// value is not inspected.
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "ERROR: %s\n", cudaGetErrorString(cudaStatus));
exit(-1);
}
}
// NOTE(review): particle and particle_buf are not released before returning.
return 0;
}
OUTPUT:
"ERROR: kernel launch failed."
Summary:
How can I print the contents of the array of structs from the kernel, without passing it as a kernel argument?
Coding in C using VS2019 with CUDA 10.2
With the help of @Robert Crovella and @talonmies, here is the solution, which outputs a sequence that cycles from 0 to 9 repeatedly.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCK 256
//#include "Nbody.h"
// One record per body. The field meanings are not stated in this file;
// presumably position (x, y), velocity (vx, vy) and mass (m) — inferred from
// the names only, TODO confirm.
typedef struct nbody {
    float x;
    float y;
    float vx;
    float vy;
    float m;
} nbody;

// Host-side pointer to the array of bodies; main allocates it.
nbody* particle;

// Device-side globals, set from the host with cudaMemcpyToSymbol.
__device__ unsigned int d_N;   // number of bodies the kernel may index
__device__ nbody* d_particle;  // pointer to the device array (a pointer, not
                               // a single nbody object as in the question)
// Aim of kernel: to print contents of array of structs without using kernel argument
// Kernel taking no arguments: each thread whose global index is below d_N
// prints the x field of its own element of the array d_particle points to.
__global__ void step_cuda_v1() {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= d_N) {
        return;
    }
    printf("%.f\n", d_particle[tid].x);
}
// Prints what and the CUDA error string, then exits with status -1, when
// status is not cudaSuccess. Does nothing on success.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "ERROR: %s: %s\n", what, cudaGetErrorString(status));
        exit(-1);
    }
}

// Host driver: fills a pinned host array of N bodies, copies it to a device
// buffer, publishes the buffer's address and N through the __device__
// variables d_particle and d_N, launches step_cuda_v1 I + 1 times, and
// releases both allocations. Every CUDA call is checked.
int main() {
    const unsigned int N = 10;  // number of bodies
    const unsigned int I = 1;   // last iteration index (loop is inclusive)
    nbody* particle_buf = NULL; // device buffer

    // Select the device before the first allocation.
    checkCuda(cudaSetDevice(0), "cudaSetDevice");
    checkCuda(cudaMallocHost((void**)&particle, N * sizeof(nbody)), "cudaMallocHost");

    // Zero every field so no indeterminate bytes are copied to the device,
    // then set x to the element's index.
    for (unsigned int i = 0; i < N; i++) {
        const nbody zero = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
        particle[i] = zero;
        particle[i].x = (float)i;
    }

    checkCuda(cudaMalloc((void**)&particle_buf, N * sizeof(nbody)), "cudaMalloc");
    checkCuda(cudaMemcpy(particle_buf, particle, N * sizeof(nbody), cudaMemcpyHostToDevice),
              "cudaMemcpy (host to device)");
    // Copy the pointer value itself (not the data) into the __device__ pointer.
    checkCuda(cudaMemcpyToSymbol(d_particle, &particle_buf, sizeof(nbody*)),
              "cudaMemcpyToSymbol(d_particle)");
    checkCuda(cudaMemcpyToSymbol(d_N, &N, sizeof(unsigned int)),
              "cudaMemcpyToSymbol(d_N)");

    // Ceiling division: enough blocks of BLOCK threads to cover N.
    const int NThreadBlock = (N + BLOCK - 1) / BLOCK;
    for (unsigned int iteration = 0; iteration <= I; iteration++) {
        step_cuda_v1<<<NThreadBlock, BLOCK>>>();
        // Launch-configuration errors are read right after the launch;
        // errors raised while the kernel runs come back from the synchronize.
        checkCuda(cudaGetLastError(), "step_cuda_v1 launch");
        checkCuda(cudaDeviceSynchronize(), "step_cuda_v1 execution");
    }

    checkCuda(cudaFree(particle_buf), "cudaFree");
    checkCuda(cudaFreeHost(particle), "cudaFreeHost");
    particle = NULL;
    return 0;
}

OpenACC error libgomp: while loading libgomp-plugin-host_nonshm.so.1: libgomp-plugin-host_nonshm.so.1: cannot open shared object file

I want to compile a simple OpenACC sample (attached below). It compiles correctly, but when I run it I get an error:
Compiled with: gcc-5 -fopenacc accVetAdd.c -lm
Run with: ./a.out
The following error appears at runtime:
error: libgomp: while loading libgomp-plugin-host_nonshm.so.1: libgomp-plugin-host_nonshm.so.1: cannot open shared object file: No such file or directory
I googled it and found only one page. How can I fix this problem?
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
/*
 * Adds two vectors of n doubles inside an OpenACC kernels region and prints
 * the mean of the result. a[i] = sin(i)^2 and b[i] = cos(i)^2, so every
 * c[i] should be 1 within rounding error.
 * Returns 0 on success, 1 when a host allocation fails.
 */
int main(int argc, char* argv[])
{
    // Size of vectors
    int n = 10000;
    // Input vectors
    double *restrict a;
    double *restrict b;
    // Output vector
    double *restrict c;
    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(double);

    // Allocate memory for each vector
    a = (double*)malloc(bytes);
    b = (double*)malloc(bytes);
    c = (double*)malloc(bytes);
    // Stop before any of the buffers is dereferenced if an allocation failed.
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "failed to allocate %zu bytes per vector\n", bytes);
        free(a);
        free(b);
        free(c);
        return 1;
    }

    // Initialize content of input vectors, vector a[i] = sin(i)^2 vector b[i] = cos(i)^2
    int i;
    for (i = 0; i<n; i++) {
        a[i] = sin(i)*sin(i);
        b[i] = cos(i)*cos(i);
    }

    // sum component wise and save result into vector c
    #pragma acc kernels copyin(a[0:n],b[0:n]), copyout(c[0:n])
    for (i = 0; i<n; i++) {
        c[i] = a[i] + b[i];
    }

    // Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0.0;
    for (i = 0; i<n; i++) {
        sum += c[i];
    }
    sum = sum / n;
    printf("final result: %f\n", sum);

    // Release memory
    free(a);
    free(b);
    free(c);
    return 0;
}
libgomp dynamically loads shared object files for the plugins it supports, such as the one implementing the host_nonshm device. If they're installed in a non-standard directory (that is, not in the system's default search path), you need to tell the dynamic linker where to look for these shared object files: either compile with -Wl,-rpath,[...], or set the LD_LIBRARY_PATH environment variable.

Error: "Value cannot be null. Parameter name: pSrcNativeVariant" in VS2010

When I try to debug my program with Nsight, this message is shown: "Value cannot be null. Parameter name: pSrcNativeVariant". When I rebuild the project the error does not appear, but I have to rebuild every time I want to debug. Previously this was not required.
This is my code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
using namespace std;
#define COLUMNS 3
#define ROWS 2
// Kernel: first overwrites a[0] with 345678, then each block adds one
// element: c[i] = a[i] + b[i], where i = COLUMNS * blockIdx.y + blockIdx.x.
// Only block indices are used for addressing; there is no bounds check.
__global__ void add(int *a, int *b, int *c)
{
    a[0] = 345678;
    const int col = blockIdx.x;
    const int row = blockIdx.y;
    const int idx = (COLUMNS * row) + col;
    c[idx] = a[idx] + b[idx];
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Fills two ROWS x COLUMNS matrices (a[y][x] = x, b[y][x] = y), runs the add
// kernel with one single-thread block per element, and prints the result.
// Returns 0 on success, 1 when any CUDA call fails.
int main()
{
    int a[ROWS][COLUMNS], b[ROWS][COLUMNS], c[ROWS][COLUMNS];
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    const size_t bytes = ROWS*COLUMNS*sizeof(int);
    const dim3 grid(COLUMNS,ROWS);

    for (int y = 0; y < ROWS; y++) // Fill Arrays
        for (int x = 0; x < COLUMNS; x++)
        {
            a[y][x] = x;
            b[y][x] = y;
        }

    // && short-circuits, so the first failing call stops the chain.
    bool ok = cudaOk(cudaMalloc((void **) &dev_a, bytes), "cudaMalloc dev_a") &&
              cudaOk(cudaMalloc((void **) &dev_b, bytes), "cudaMalloc dev_b") &&
              cudaOk(cudaMalloc((void **) &dev_c, bytes), "cudaMalloc dev_c") &&
              cudaOk(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy a") &&
              cudaOk(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy b");

    if (ok)
    {
        add<<<grid,1>>>(dev_a, dev_b, dev_c);
        // Launch errors are read here; execution errors surface at the
        // blocking copy that follows.
        ok = cudaOk(cudaGetLastError(), "add launch") &&
             cudaOk(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy c");
    }

    if (ok)
    {
        for (int y = 0; y < ROWS; y++) // Output Arrays
        {
            for (int x = 0; x < COLUMNS; x++)
            {
                printf("[%d][%d]=%d ",y,x,c[y][x]);
            }
            printf("\n");
        }
    }

    // Release device memory on every path (cudaFree(NULL) is harmless).
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return ok ? 0 : 1;
}
I encountered exactly the same problem. After trying many things, I found that it could be solved simply by running Visual Studio in administrator mode once. In administrator mode, run the Nsight debugger, and the problem is solved. Administrator mode is not required afterwards. At least this worked for me; good luck to you.
Added on May 12, 2014:
This problem happened again today. This time I solved it by switching the platform from Win32 to x64 and then switching back.
Added on May 22, 2014:
It happened again, and nothing I had tried before worked this time. Finally it was solved in this way:
I fixed this by deleting the Visual Studio solution user options file (.suo).
NuGet Package restore failed for project Miscellaneous Files: Value cannot be null or an empty string. Parameter name: root. 0 0
It is a problem with the NuGet package.
Solution:
Update it if it is out-of-date(restart).
Uninstall and reinstall(restart).
Open the "Package Manager Console", which is under "Tools/NuGet Package Manager/Package Manager Console", type any command to make sure it works.
The problem should be solved.

how to increase memory limit in Visual Studio C++

I need help. I'm stuck on a problem when running C++ code on Windows in Visual Studio.
When I run the code in a Linux environment, there is no restriction on the memory I am able to allocate dynamically (up to the size available in RAM).
But with the Visual Studio compiler, it does not let me create an array beyond a limited size.
I've tried the /F option and 20-25 Google results on increasing the memory size, but they don't seem to help much.
I am currently able to allocate only around 100 MB out of the 3 GB available.
If there is a solution for this in Windows rather than in the Visual Studio compiler, I will be glad to hear it, as I have a CUDA Tesla C2070 card which is proving to be pretty useless on Windows, where I wanted to run my CUDA/C++ code.
Here's my code. It fails when LENGTH > 128 (LENGTH is the number of images: 640x480 PNGs, less than 0.5 MB each). I've also calculated the approximate memory it needs by counting the data structures and types used by OpenCV and by me, and it is still much less than 2 GB. I get a stack overflow exception. The same happens with dynamic allocation. I've already maximized the heap and stack sizes.
#include "stdafx.h"
#include <cv.h>
#include <cxcore.h>
#include <highgui.h>
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define LENGTH 100
#define SIZE1 640
#define SIZE2 480
#include <iostream>
using namespace std;
// Kernel: replaces each of the first N elements of img1_d with 255.0 minus
// its value, in place (despite the name, nothing is squared).
// Expects a 1-D launch. The grid may contain more threads than N; threads
// past the end do nothing.
__global__ void square_array(double *img1_d, long N)
{
    // Compute the index in 64 bits so blockIdx.x * blockDim.x cannot wrap.
    const long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    // The caller rounds the block count up, so the last block can overshoot N.
    if (idx < N) {
        img1_d[idx] = 255.0 - img1_d[idx];
    }
}
// Program entry point: loads the same image LENGTH times, converts each copy
// to a CvMat, copies the pixel values into a 4-D host array (each column is
// stored twice, at i and at i+SIZE1), runs square_array over that array on
// the device, writes values back into the images and shows them in windows.
int _tmain(int argc, _TCHAR* argv[])
{
IplImage *img1[LENGTH];
// Open the file.
// NOTE(review): the result of cvLoadImage is not checked before use below.
for(int i=0;i<LENGTH;i++)
{ img1[i] = cvLoadImage("abstract3.jpg");}
CvMat *mat1[LENGTH];
for(int i=0;i<LENGTH;i++)
{
mat1[i] = cvCreateMat(img1[i]->height,img1[i]->width,CV_32FC3 );
cvConvert( img1[i], mat1[i] );
}
// Local (automatic) array of LENGTH*2*SIZE1*SIZE2*3 doubles. With this file's
// macro values that is 184,320,000 elements, far too large for a local
// variable; this is the allocation the stack overflow comes from.
double a[LENGTH][2*SIZE1][SIZE2][3];
for(int m=0;m<LENGTH;m++)
{
for(int i=0;i<SIZE1;i++)
{
for(int j=0;j<SIZE2;j++)
{
// presumably cvGet2D takes (row, column), hence j before i — TODO confirm
CvScalar scal = cvGet2D( mat1[m],j,i);
a[m][i][j][0] = scal.val[0];
a[m][i][j][1] = scal.val[1];
a[m][i][j][2] = scal.val[2];
// Same three values stored a second time, SIZE1 columns further on.
a[m][i+SIZE1][j][0] = scal.val[0];
a[m][i+SIZE1][j][1] = scal.val[1];
a[m][i+SIZE1][j][2] = scal.val[2];
}
} }
//cuda
double *a_d;
// Total element count of a.
int N=LENGTH*2*SIZE1*SIZE2*3;
// NOTE(review): none of the CUDA calls below has its return code checked,
// and a_d is never freed.
cudaMalloc((void **) &a_d, N*sizeof(double));
cudaMemcpy(a_d, a, N*sizeof(double), cudaMemcpyHostToDevice);
int block_size = 370;
// Ceiling division: one extra block when N is not a multiple of block_size.
// NOTE(review): with this file's macro values this gives 498,163 blocks;
// confirm that fits the target device's maximum grid dimension.
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
// Prints the two numbers back to back with no separator or newline.
cout<<n_blocks<<block_size;
square_array <<< n_blocks, block_size >>> (a_d, N);
cudaMemcpy(a, a_d, N*sizeof(double), cudaMemcpyDeviceToHost);
//cuda end
// Window title; the switch below increments the digits in name[8..11].
char name[]= "Image: 00000";
// The literal is 12 characters long, so name[12] is already the terminator.
name[12]='\0';
// NOTE(review): x and y are declared outside the per-image loop and are never
// reset between images; the x==SIZE1 test also runs only after a[m][x][y] has
// been read. Confirm the intended traversal.
int x=0,y=0;
for(int m=0;m<LENGTH;m++)
{
for (int i = 0; i < img1[m]->width*img1[m]->height*3; i+=3)
{
img1[m]->imageData[i]= a[m][x][y][0];
img1[m]->imageData[i+1]= a[m][x][y][1];
img1[m]->imageData[i+2]= a[m][x][y][2];
if(x==SIZE1)
{
x=0;
y++;
}
x++;
}
// Increment the decimal counter in name by one, carrying by hand from
// name[11] up to name[8].
switch(name[11])
{
case '9': switch(name[10])
{
case '9':
switch(name[9])
{
case '9': name[11]='0';name[10]='0';name[9]='0';name[8]++;
break;
default : name[11]='0';
name[10]='0';
name[9]++;
}break;
default : name[11]='0'; name[10]++;break;
}
break;
default : name[11]++;break;
}
// Display the image.
cvNamedWindow(name, CV_WINDOW_AUTOSIZE);
// NOTE(review): this passes the whole img1 array rather than img1[m];
// confirm which image is meant to be shown.
cvShowImage(name,img1);
//cvSaveImage(name ,img1);
}
// Wait for the user to press a key in the GUI window.
cvWaitKey(0);
// Free the resources.
// NOTE(review): every release call below is commented out, so nothing is
// freed here.
//cvDestroyWindow(x);
//cvReleaseImage(&img1);
//cvDestroyWindow("Image:");
//cvReleaseImage(&img2);
return 0;
}
The problem is that you are allocating a huge multidimensional array on the stack in your main function (double a[..][..][..]). Do not allocate this much memory on the stack. Use malloc/new to allocate on the heap.

Resources