I have been trying for days to get a Qt project file running on a 32-bit Windows 7 system, in which I want/need to include Cuda code. This combination of things is either so simple that no one ever bothered to put an example online, or so difficult that nobody ever succeeded, it seems. Whatever way, the only helpful forum threads I found were the same issue on Linux or Mac, or with Visual Studio on a Windows.
All of these give all sorts of different errors, however, whether due to linking or clashing libraries, or spaces in file names or non-existing folders in the Windows version of the Cuda SDK.
Is there someone who has a clear .pro file to offer that does the trick?
I am aiming to compile a simple programme with ordinary C++ code in Qt style, with Qt 4.8 libraries, which reference several Cuda modules in .cu files. Something of the form:
TestCUDA \
TestCUDA.pro
main.cpp
test.cu
So I finally managed to assemble a .pro file that works on my system and probably on all Windows systems. The following is a small project file plus test programme that works at least on my system.
The file system looks as follows:
TestCUDA \
TestCUDA.pro
main.cpp
vectorAddition.cu
The project file reads:
# qmake project that builds a Qt console program together with CUDA sources.
# The .cu files are compiled by nvcc through a qmake "extra compiler"; the
# resulting objects are linked into the target with the regular MSVC linker.
TARGET = TestCUDA

# Define output directories
DESTDIR = release
OBJECTS_DIR = release/obj
CUDA_OBJECTS_DIR = release/cuda

# Source files (paths are relative to this .pro file; the sources live next to it)
SOURCES += main.cpp

# This makes the .cu files appear in your project
OTHER_FILES += vectorAddition.cu

# CUDA settings <-- may change depending on your system
# Must name the same file as OTHER_FILES above.
CUDA_SOURCES += vectorAddition.cu
CUDA_SDK = "C:/ProgramData/NVIDIA Corporation/NVIDIA GPU Computing SDK 4.2/C" # Path to cuda SDK install
CUDA_DIR = "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v4.2" # Path to cuda toolkit install
SYSTEM_NAME = Win32 # Depending on your system either 'Win32', 'x64', or 'Win64'
SYSTEM_TYPE = 32 # '32' or '64', depending on your system
CUDA_ARCH = sm_11 # Type of CUDA architecture, for example 'compute_10', 'compute_11', 'sm_10'
NVCC_OPTIONS = --use_fast_math

# include paths
INCLUDEPATH += $$CUDA_DIR/include \
               $$CUDA_SDK/common/inc/ \
               $$CUDA_SDK/../shared/inc/

# library directories
QMAKE_LIBDIR += $$CUDA_DIR/lib/$$SYSTEM_NAME \
                $$CUDA_SDK/common/lib/$$SYSTEM_NAME \
                $$CUDA_SDK/../shared/lib/$$SYSTEM_NAME

# Add the necessary libraries
LIBS += -lcuda -lcudart

# The following library conflicts with something in Cuda
QMAKE_LFLAGS_RELEASE = /NODEFAULTLIB:msvcrt.lib
QMAKE_LFLAGS_DEBUG = /NODEFAULTLIB:msvcrtd.lib

# The following makes sure all path names (which often include spaces) are put between quotation marks.
# join(list, glue, before, after) yields: -I"dir1" -I"dir2" ...
CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')

# Configuration of the Cuda compiler.
# .input names the variable holding the files to process, .output is the object
# produced per input file, .commands is the command line run for each file.
CONFIG(debug, debug|release) {
    # Debug mode: same command as release plus -D_DEBUG
    cuda_d.input = CUDA_SOURCES
    cuda_d.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
    cuda_d.commands = $$CUDA_DIR/bin/nvcc.exe -D_DEBUG $$NVCC_OPTIONS $$CUDA_INC $$LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
    cuda_d.dependency_type = TYPE_C
    QMAKE_EXTRA_COMPILERS += cuda_d
} else {
    # Release mode
    cuda.input = CUDA_SOURCES
    cuda.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
    cuda.commands = $$CUDA_DIR/bin/nvcc.exe $$NVCC_OPTIONS $$CUDA_INC $$LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
    cuda.dependency_type = TYPE_C
    QMAKE_EXTRA_COMPILERS += cuda
}
Note the QMAKE_LFLAGS_RELEASE = /NODEFAULTLIB:msvcrt.lib: it took me a long time to figure out, but this library seems to clash with other things in Cuda, which produces strange linking warnings and errors. If someone has an explanation for this, and potentially a prettier way to get around this, I'd like to hear it.
Also, since Windows file paths often include spaces (and NVIDIA's SDK by default does so too), it is necessary to artificially add quotation marks around the include paths. Again, if someone knows a more elegant way of solving this problem, I'd be interested to know.
The main.cpp file looks like this:
#include <cuda.h>
#include <builtin_types.h>
#include <drvapi_error_string.h>
#include <QtCore/QCoreApplication>
#include <QDebug>
// Forward declare the function in the .cu file
void vectorAddition(const float* a, const float* b, float* c, int n);
/**
 * Write a float array to the Qt debug output as "(a0, a1, ..., aN-1)".
 *
 * @param a  Pointer to the first of n floats; only read when n > 0.
 * @param n  Number of elements to print. An empty array prints "()".
 */
void printArray(const float* a, const unsigned int n) {
    QString s = "(";
    // Iterate over all n elements and emit the separator *before* every
    // element but the first. The previous "n - 1" loop bound wrapped around
    // for n == 0 (unsigned arithmetic) and read far past the array.
    for (unsigned int ii = 0; ii < n; ++ii) {
        if (ii > 0)
            s.append(", ");
        s.append(QString::number(a[ii]));
    }
    s.append(")");
    qDebug() << s;
}
int main(int argc, char* argv [])
{
QCoreApplication(argc, argv);
int deviceCount = 0;
int cudaDevice = 0;
char cudaDeviceName [100];
unsigned int N = 50;
float *a, *b, *c;
cuInit(0);
cuDeviceGetCount(&deviceCount);
cuDeviceGet(&cudaDevice, 0);
cuDeviceGetName(cudaDeviceName, 100, cudaDevice);
qDebug() << "Number of devices: " << deviceCount;
qDebug() << "Device name:" << cudaDeviceName;
a = new float [N]; b = new float [N]; c = new float [N];
for (unsigned int ii = 0; ii < N; ++ii) {
a[ii] = qrand();
b[ii] = qrand();
}
// This is the function call in which the kernel is called
vectorAddition(a, b, c, N);
qDebug() << "input a:"; printArray(a, N);
qDebug() << "input b:"; printArray(b, N);
qDebug() << "output c:"; printArray(c, N);
if (a) delete a;
if (b) delete b;
if (c) delete c;
}
The Cuda file vectorAddition.cu, which describes a simple vector addition, looks like this:
#include <cstdio>
#include <cuda.h>
#include <builtin_types.h>
/**
 * Element-wise vector addition kernel: c[i] = a[i] + b[i].
 *
 * Each thread handles the single element whose index is its flat position in
 * a 1-D grid of 1-D blocks; threads whose index is >= n do nothing, so the
 * grid may contain more threads than elements.
 *
 * @param a  First input vector (n floats).
 * @param b  Second input vector (n floats).
 * @param c  Output vector (n floats).
 * @param n  Number of elements.
 */
extern "C"
__global__ void vectorAdditionCUDA(const float* a, const float* b, float* c, int n)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= n)
        return; // surplus thread in the last block
    c[idx] = a[idx] + b[idx];
}
/**
 * Host wrapper: computes c = a + b for n floats on the GPU.
 *
 * Allocates three device buffers, uploads a and b, launches
 * vectorAdditionCUDA with 256 threads per block and downloads the result
 * into c. All device buffers are released before returning.
 *
 * The signature returns void, so a CUDA failure is reported on stderr and the
 * contents of c are then unspecified.
 *
 * @param a  Host input vector (n floats).
 * @param b  Host input vector (n floats).
 * @param c  Host output vector (n floats).
 * @param n  Number of elements; nothing is done when n <= 0.
 */
void vectorAddition(const float* a, const float* b, float* c, int n) {
    if (n <= 0 || a == NULL || b == NULL || c == NULL)
        return;

    float *a_cuda = NULL, *b_cuda = NULL, *c_cuda = NULL;
    const size_t nBytes = sizeof(float) * static_cast<size_t>(n);
    const int threadsPerBlock = 256;
    // Ceiling division written so that it cannot overflow for large n.
    const int blocksPerGrid = (n - 1) / threadsPerBlock + 1;

    // allocate and copy memory into the device; stop at the first failure
    cudaError_t status = cudaMalloc((void **)& a_cuda, nBytes);
    if (status == cudaSuccess)
        status = cudaMalloc((void **)& b_cuda, nBytes);
    if (status == cudaSuccess)
        status = cudaMalloc((void **)& c_cuda, nBytes);
    if (status == cudaSuccess)
        status = cudaMemcpy(a_cuda, a, nBytes, cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMemcpy(b_cuda, b, nBytes, cudaMemcpyHostToDevice);

    if (status == cudaSuccess) {
        vectorAdditionCUDA<<<blocksPerGrid, threadsPerBlock>>>(a_cuda, b_cuda, c_cuda, n);
        // A kernel launch returns nothing; launch errors are fetched explicitly.
        status = cudaGetLastError();
    }

    // load the answer back into the host
    if (status == cudaSuccess)
        status = cudaMemcpy(c, c_cuda, nBytes, cudaMemcpyDeviceToHost);

    if (status != cudaSuccess)
        fprintf(stderr, "vectorAddition failed: %s\n", cudaGetErrorString(status));

    // Pointers that were never allocated are still NULL, which cudaFree accepts.
    cudaFree(a_cuda);
    cudaFree(b_cuda);
    cudaFree(c_cuda);
}
If you get this to work, then more complicated examples are self-evident, I think.
Edit (24-1-2013): I added the QMAKE_LFLAGS_DEBUG = /NODEFAULTLIB:msvcrtd.lib and the CONFIG(debug) with the extra D_DEBUG flag, such that it also compiles in debug mode.
Using msvc 2010 I found that the linker does not accept the -l parameter, however nvcc needs it. Therefore I made a simple change in the .pro file:
# Add the necessary libraries: bare library names, without any -l prefix or
# .lib suffix, so that both spellings can be derived from one list below.
CUDA_LIBS = cuda cudart
# The following makes sure all path names (which often include spaces) are put between quotation marks.
# join(list, glue, before, after) turns INCLUDEPATH into: -I"dir1" -I"dir2" ...
CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')
# LIBRARIES IN FORMAT NEEDED BY NVCC: -lcuda -lcudart
NVCC_LIBS = $$join(CUDA_LIBS,' -l','-l', '')
# LIBRARIES IN FORMAT NEEDED BY VISUAL C++ LINKER: cuda.lib cudart.lib
LIBS += $$join(CUDA_LIBS,'.lib ', '', '.lib')
And the nvcc command (release version):
cuda.commands = $$CUDA_DIR/bin/nvcc.exe $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
$$NVCC_LIBS was inserted instead of $$LIBS.
The whole .pro file, which works for me:
# qmake project for a Qt console application (QtCore only, no GUI) that also
# compiles CUDA sources with nvcc via a qmake extra compiler.
QT += core
QT -= gui
TARGET = TestCUDA
CONFIG += console
CONFIG -= app_bundle
TEMPLATE = app
# Define output directories
# NOTE(review): the same "release" directories are used for debug builds too.
DESTDIR = release
OBJECTS_DIR = release/obj
CUDA_OBJECTS_DIR = release/cuda
# Source files
SOURCES += main.cpp
# This makes the .cu files appear in your project
OTHER_FILES += vectorAddition.cu
# CUDA settings <-- may change depending on your system
# CUDA_SOURCES is the input list of the nvcc extra compiler defined at the end of this file.
CUDA_SOURCES += vectorAddition.cu
#CUDA_SDK = "C:/ProgramData/NVIDIA Corporation/NVIDIA GPU Computing SDK 4.2/C" # Path to cuda SDK install
CUDA_DIR = "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v5.0" # Path to cuda toolkit install
SYSTEM_NAME = win32 # Depending on your system either 'Win32', 'x64', or 'Win64'
SYSTEM_TYPE = 32 # '32' or '64', depending on your system
CUDA_ARCH = sm_11 # Type of CUDA architecture, for example 'compute_10', 'compute_11', 'sm_10'
NVCC_OPTIONS = --use_fast_math
# include paths
INCLUDEPATH += $$CUDA_DIR/include
#$$CUDA_SDK/common/inc/ \
#$$CUDA_SDK/../shared/inc/
# library directories
QMAKE_LIBDIR += $$CUDA_DIR/lib/$$SYSTEM_NAME
#$$CUDA_SDK/common/lib/$$SYSTEM_NAME \
#$$CUDA_SDK/../shared/lib/$$SYSTEM_NAME
# The following library conflicts with something in Cuda
# NOTE(review): "=" replaces whatever linker flags the mkspec set for this
# configuration rather than appending to them - confirm that is intended.
QMAKE_LFLAGS_RELEASE = /NODEFAULTLIB:msvcrt.lib
QMAKE_LFLAGS_DEBUG = /NODEFAULTLIB:msvcrtd.lib
# Add the necessary libraries (bare names; both spellings are derived below)
CUDA_LIBS = cuda cudart
# The following makes sure all path names (which often include spaces) are put between quotation marks.
# CUDA_INC becomes: -I"dir1" -I"dir2" ... It is computed here, so include
# paths added to INCLUDEPATH after this line are not passed to nvcc.
CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')
# nvcc spelling of the libraries: -lcuda -lcudart
NVCC_LIBS = $$join(CUDA_LIBS,' -l','-l', '')
# MSVC linker spelling of the libraries: cuda.lib cudart.lib
LIBS += $$join(CUDA_LIBS,'.lib ', '', '.lib')
# Configuration of the Cuda compiler
# For every file in CUDA_SOURCES, nvcc is run with -c to produce
# $$CUDA_OBJECTS_DIR/<basename>_cuda.o; the two branches differ only in the
# compiler name and the extra -D_DEBUG define.
CONFIG(debug, debug|release) {
# Debug mode
cuda_d.input = CUDA_SOURCES
cuda_d.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
cuda_d.commands = $$CUDA_DIR/bin/nvcc.exe -D_DEBUG $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
cuda_d.dependency_type = TYPE_C
QMAKE_EXTRA_COMPILERS += cuda_d
}
else {
# Release mode
cuda.input = CUDA_SOURCES
cuda.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
cuda.commands = $$CUDA_DIR/bin/nvcc.exe $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
cuda.dependency_type = TYPE_C
QMAKE_EXTRA_COMPILERS += cuda
}
I also added some essential declarations, i.e. QT += core for the app to work, and also removed the SDK part, which I did not find useful in this case.
I tried to get this combination to work, but could not, due to a number of dependencies in
my project.
My final solution was to break the application into two separate applications on Windows
1) A CUDA application developed in VC and running as a service/DLL in Windows
2) A GUI interface developed in Qt and using the DLL for CUDA-related tasks.
Hope it saves some time of others
Related
I'm trying to use CUDA with Qt Creator, Win7, and VS2012 compiler. I have experience with Qt on Windows, but have been unsuccessful setting up to integrate CUDA code into a Qt project. I've tried several posted solutions (such as Compiling Cuda code in Qt Creator on Windows), but have had no success. I finally decided to simplify and base my code on this blog post: https://cudaspace.wordpress.com/2012/07/05/qt-creator-cuda-linux-review/ but am still having issues.
Currently, I get the error "LNK1104: cannot open file 'obj\cuda_code.obj'"
My .pro file is:
# qmake project for a Qt console application plus one CUDA source compiled by
# nvcc through an extra compiler. This is the configuration that produces the
# reported "LNK1104: cannot open file 'obj\cuda_code.obj'" error; the review
# notes below mark the lines most likely involved.
QT += core
QT -= gui
TARGET = QtCuda
CONFIG += console
CONFIG -= app_bundle
TEMPLATE = app
# NOTE(review): cuda_code.cu is listed as a regular source here AND in
# CUDA_SOURCES below. Presumably this makes the link step expect
# obj\cuda_code.obj, while the extra compiler writes cuda_code_cuda.obj - confirm.
SOURCES += main.cpp \
cuda_code.cu
# project build directories
DESTDIR = $$PWD
OBJECTS_DIR = $$DESTDIR/obj
# C++ flags
# NOTE(review): "=" replaces the mkspec's release flags, and -O3 is a
# GCC-style option - confirm the MSVC compiler accepts it.
QMAKE_CXXFLAGS_RELEASE =-O3
# Cuda sources
CUDA_SOURCES += cuda_code.cu
# Path to cuda toolkit install
CUDA_DIR = "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v7.0"
# Path to header and libs files
INCLUDEPATH += $$CUDA_DIR/include
QMAKE_LIBDIR += $$CUDA_DIR/lib/x64
# libs used in your code
LIBS += -lcudart -lcuda
# GPU architecture
CUDA_ARCH = sm_50
# Here are some NVCC flags I've always used by default.
# NOTE(review): --compiler-options forwards the following option to the host
# compiler - confirm -use_fast_math was not meant for nvcc itself.
NVCCFLAGS = --compiler-options -use_fast_math
# Prepare the extra compiler configuration (taken from the nvidia forum - i'm not an expert in this part)
# CUDA_INC becomes "-Idir1 -Idir2 "; the paths are not quoted here, and
# CUDA_DIR contains spaces.
CUDA_INC = $$join(INCLUDEPATH,' -I','-I',' ')
# The command pipes nvcc's output through sed to rewrite "(line)" as ":line".
# NOTE(review): this requires a sed executable on PATH, which a stock Windows
# installation presumably lacks - confirm.
cuda.commands = $$CUDA_DIR/bin/nvcc -m64 -O3 -arch=$$CUDA_ARCH -c $$NVCCFLAGS \
$$CUDA_INC $$LIBS ${QMAKE_FILE_NAME} -o ${QMAKE_FILE_OUT} \
2>&1 | sed -r \"s/\\(([0-9]+)\\)/:\\1/g\" 1>&2
cuda.dependency_type = TYPE_C
cuda.depend_command = $$CUDA_DIR/bin/nvcc -O3 -M $$CUDA_INC $$NVCCFLAGS ${QMAKE_FILE_NAME}
# NOTE(review): .input is given the expanded value of CUDA_SOURCES; the other
# project files in this discussion pass the variable name (CUDA_SOURCES) - confirm.
cuda.input = $$CUDA_SOURCES
cuda.output = $$OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.obj
# Tell Qt that we want add more stuff to the Makefile
QMAKE_EXTRA_COMPILERS += cuda
My main.cpp
#include <QtCore/QCoreApplication>
#include <iostream>
using namespace std;
#include <cuda_runtime.h>
extern "C"
cudaError_t cuda_main();
int main(int argc, char *argv[])
{
QCoreApplication a(argc, argv);
cudaError_t cuerr = cuda_main();
if (cuerr != cudaSuccess) cout << "CUDA Error: " << cudaGetErrorString( cuerr ) << endl;
return a.exec();
}
My cuda file (cuda_code.cu):
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
/**
 * Sorts 2^24 random integers on the GPU using Thrust.
 *
 * Declared extern "C" so that the C++ translation unit built by the host
 * compiler can link against it with the matching extern "C" declaration.
 *
 * @return The value of cudaGetLastError() after the Thrust calls.
 */
extern "C"
cudaError_t cuda_main()
{
    // fill a host vector with 16M random numbers
    const size_t count = 1 << 24;
    thrust::host_vector<int> hostData(count);
    thrust::generate(hostData.begin(), hostData.end(), rand);

    // upload to the device
    thrust::device_vector<int> deviceData(hostData);

    // sort on the device
    thrust::sort(deviceData.begin(), deviceData.end());

    // download the sorted data
    thrust::copy(deviceData.begin(), deviceData.end(), hostData.begin());

    return cudaGetLastError();
}
The OP was able to get a successful compile link by making the following changes:
1) In the .pro file, added
MSVCRT_LINK_FLAG_DEBUG = "/MDd"
MSVCRT_LINK_FLAG_RELEASE = "/MD"
along with (to the cuda.command statement)
-Xcompiler $$MSVCRT_LINK_FLAG_DEBUG -or- -Xcompiler $$MSVCRT_LINK_FLAG_RELEASE
as described in:
Compile cuda file error: "runtime library" mismatch value 'MDd_DynamicDebug' doesn't match value 'MTd_StaticDebug' in vectorAddition_cuda.o
2) Also had a very strange detail in the makefile that I had to fix manually. I hope that there is a real fix for this, but I haven't been able to figure it out.
At the top of the makefile, there are several definitions, including one for LIBS. After close inspection of this definition, I found that there was an extra set of quotation marks in the specification of library locations. Like this:
LIBS = /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0\lib\x64" ""C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0\lib\x64"\cuda.lib" ""C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0\lib\x64"\cudart.lib" /LIBPATH:C:\Qt\5.2.1\msvc2012_64_opengl\lib C:\Qt\5.2.1\msvc2012_64_opengl\lib\Qt5Cored.lib
If you look closely, you can see the extra set of quotation marks in the locations for cuda.lib and cudart.lib. I couldn't figure out what might be causing this (probably something in my .pro file), but if I manually removed the extra quotations, the compile/link worked. Here's the corrected line in the makefile:
LIBS = /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0\lib\x64" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0\lib\x64\cuda.lib" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0\lib\x64\cudart.lib" /LIBPATH:C:\Qt\5.2.1\msvc2012_64_opengl\lib C:\Qt\5.2.1\msvc2012_64_opengl\lib\Qt5Cored.lib
I would sure like to be able to fix this in my .pro file so that these extra quotations didn't appear. Suggestions would be appreciated.
For reference, here's my latest .pro file.
# qmake project for a Qt console application plus one CUDA source. The .cu
# file is compiled by nvcc through a qmake extra compiler and the resulting
# object is linked into the target automatically.
QT += core
QT -= gui
TARGET = QtCuda
CONFIG += console
CONFIG -= app_bundle
TEMPLATE = app

# Only sources for the regular C++ compiler belong in SOURCES. The .cu file is
# handled by the extra compiler below; OTHER_FILES merely keeps it visible in
# the Qt Creator project tree.
SOURCES += main.cpp
OTHER_FILES += cuda_code.cu

# project build directories
DESTDIR = $$PWD
OBJECTS_DIR = $$DESTDIR/obj

# C++ flags
# NOTE(review): "=" replaces the mkspec's release flags and -O3 is a GCC-style
# option - confirm this is what the MSVC build should use.
QMAKE_CXXFLAGS_RELEASE =-O3

# Cuda sources
CUDA_SOURCES += cuda_code.cu

# Path to cuda toolkit install
CUDA_DIR = "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v7.0"

# Path to header and libs files
INCLUDEPATH += $$CUDA_DIR/include
QMAKE_LIBDIR += $$CUDA_DIR/lib/x64
SYSTEM_TYPE = 64 # '32' or '64', depending on your system

# libs used in your code
LIBS += -lcuda -lcudart

# GPU architecture
CUDA_ARCH = sm_50

# Here are some NVCC flags I've always used by default.
NVCCFLAGS = --use_fast_math

# Quote every include path (they contain spaces): -I"dir1" -I"dir2" ...
CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')

# MSVCRT link option (static or dynamic, it must be the same with your Qt SDK link option)
MSVCRT_LINK_FLAG_DEBUG = "/MDd"
MSVCRT_LINK_FLAG_RELEASE = "/MD"

# Configuration of the Cuda compiler.
# Fixes relative to the earlier revision of this file:
#  * the nvcc options variable is NVCCFLAGS (NVCC_OPTIONS was never defined,
#    so --use_fast_math was silently dropped);
#  * both configurations write to OBJECTS_DIR (CUDA_OBJECTS_DIR was never
#    defined, so the release object was written to "/<name>.obj");
#  * each extra compiler is registered exactly once, after it is defined.
CONFIG(debug, debug|release) {
    # Debug mode
    cuda_d.input = CUDA_SOURCES
    cuda_d.output = $$OBJECTS_DIR/${QMAKE_FILE_BASE}.obj
    cuda_d.commands = $$CUDA_DIR/bin/nvcc.exe -D_DEBUG $$NVCCFLAGS $$CUDA_INC $$LIBS --machine $$SYSTEM_TYPE \
                      -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME} -Xcompiler $$MSVCRT_LINK_FLAG_DEBUG
    cuda_d.dependency_type = TYPE_C
    QMAKE_EXTRA_COMPILERS += cuda_d
} else {
    # Release mode
    cuda.input = CUDA_SOURCES
    cuda.output = $$OBJECTS_DIR/${QMAKE_FILE_BASE}.obj
    cuda.commands = $$CUDA_DIR/bin/nvcc.exe $$NVCCFLAGS $$CUDA_INC $$LIBS --machine $$SYSTEM_TYPE \
                    -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME} -Xcompiler $$MSVCRT_LINK_FLAG_RELEASE
    cuda.dependency_type = TYPE_C
    QMAKE_EXTRA_COMPILERS += cuda
}
[Note: this answer has been created from an edit to the question which included the solution. It has been added as a community wiki entry to get the question off the unanswered list for the CUDA tag]
I'm having problems when trying to integrate Qt with CUDA. I am running on a 64-bit Mac with the 64-bit CUDA toolkit installed; however, when I try to build my code, the error "ld: file not found: @rpath/CUDA.framework/Versions/A/CUDA for architecture x86_64" is thrown.
I have verified all my paths but the same error is consistently thrown. My .pro configuration code is as follows:
# qmake project for a Qt widgets/multimedia application with one CUDA source
# compiled by nvcc through an extra compiler.
# NOTE: apart from the SOURCES fix below, this file is kept as posted; it is
# the configuration that produces the rpath link error described in the
# accompanying question.
QT += core gui
QT += multimedia
QT += multimediawidgets
QT += concurrent
greaterThan(QT_MAJOR_VERSION, 4): QT += widgets

TARGET = WebcamFilter
TEMPLATE = app

# The last entry must not end in a backslash: a trailing line continuation
# would fold the following HEADERS assignment into the SOURCES list.
SOURCES += main.cpp \
           mainwindow.cpp \
           camerafeed.cpp
HEADERS += mainwindow.h \
           camerafeed.h
FORMS += mainwindow.ui

# CUDA Resources
CUDA_SOURCES += gaussian.cu
CUDA_DIR = /usr/local/cuda

# Path to header and lib files
INCLUDEPATH += $$CUDA_DIR/include
QMAKE_LIBDIR += $$CUDA_DIR/lib

# Libs used for source code
LIBS += -lcudart -lcuda

# GPU Architecture
CUDA_ARCH = sm_20

# Custom flags for nvcc
NVCCFLAGS = --compiler-options -fno-strict-aliasing -use_fast_math --ptxas-options=-v

# Prepare extra compiler configuration
# CUDA_INC becomes: -Idir1 -Idir2 ...
CUDA_INC = $$join(INCLUDEPATH,' -I','-I',' ')
# nvcc output is piped through sed to rewrite "(line)" as ":line".
cuda.commands = $$CUDA_DIR/bin/nvcc -m64 -O3 -arch=$$CUDA_ARCH -c $$NVCCFLAGS \
                $$CUDA_INC $$LIBS ${QMAKE_FILE_NAME} -o ${QMAKE_FILE_OUT} \
                2>&1 | sed -r \"s/\\(([0-9]+)\\)/:\\1/g\" 1>&2
cuda.dependency_type = TYPE_C
cuda.depend_command = $$CUDA_DIR/bin/nvcc -O3 -M $$CUDA_INC $$NVCCFLAGS ${QMAKE_FILE_NAME}
cuda.input = CUDA_SOURCES
cuda.output = ${OBJECTS_DIR}${QMAKE_FILE_BASE}_cuda.o

# Tell Qt that we want add more stuff to the Makefile
QMAKE_EXTRA_COMPILERS += cuda
I came across this problem a few months ago (plus some other issues after this was fixed) so I figured I'd just post a fully working QT/CUDA example now that I have it mostly figured out. I pulled most of the .pro file from a larger project for both Linux and Mac (CUDA stuff is in the gpu folder) but this bit of code has only been tested on OS X.
I'm currently using:
CUDA 7.0 driver V7.0.27
OS X Yosemite 10.10.3
QT 5.3.1
If you haven't updated recently make sure the CUDA deviceQuery and bandwidthTest samples are still working before trying this code.
The .pro file below might be all you need to solve your problems but the C++ code is below as well. The code comments do most of the explaining.
qtcuda.pro
#-------------------------------------------------
#
# Project created by QtCreator 2015-05-02T02:37:39
#
# Qt widgets application with CUDA sources compiled by nvcc through a qmake
# extra compiler. The CUDA section is only active when CUDA_DIR exists.
#
#-------------------------------------------------
QT += core gui
greaterThan(QT_MAJOR_VERSION, 4): QT += widgets

TARGET = qtcuda
TEMPLATE = app

# project build directories (if not using shadow build)
DESTDIR = $$system(pwd)
BUILDDIR = $$DESTDIR/build
MOC_DIR = $$BUILDDIR # moc_... files
UI_DIR = $$BUILDDIR # ui_mainwindow.h
OBJECTS_DIR = $$BUILDDIR/bin # .o binary files

SOURCES += main.cpp \
           mainwindow.cpp
HEADERS += mainwindow.h
FORMS += mainwindow.ui

# NOTE: C++ flags are needed here for
# the CUDA Thrust library
############### UNIX FLAGS #####################
unix {
    QMAKE_CXXFLAGS += -std=c++11
}

############### MAC FLAGS #####################
macx {
    # libs that don't get passed to nvcc (we'll remove them from LIBS later)
    NON_CUDA_LIBS += -stdlib=libc++
    LIBS += $$NON_CUDA_LIBS
    QMAKE_CXXFLAGS += -stdlib=libc++ -mmacosx-version-min=10.7
    QMAKE_LFLAGS += -mmacosx-version-min=10.7
    QMAKE_MACOSX_DEPLOYMENT_TARGET = 10.7

    # specific to computers without older sdks
    # (the path must not contain a space: an unquoted space would split it
    # into two list entries and the exists() test below could never match)
    MAC_SDK = /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk/
    if( exists( $$MAC_SDK) ) {
        QMAKE_MAC_SDK = macosx10.9 # lowest sdk on my computer :/
    }

    # don't show warnings for c++11 extentions
    QMAKE_CXXFLAGS += -Wno-c++11-extensions
}

################### CUDA ###################### (similar to your setup)
# sed rewrites nvcc's "(line)" markers as ":line"; the extended-regex switch
# is -r in the Linux branch and -E in the macOS branch.
unix:!macx {
    SED_STUFF = 2>&1 | sed -r \"s/\\(([0-9]+)\\)/:\\1/g\" 1>&2
}
macx {
    SED_STUFF = 2>&1 | sed -E \"s/\\(([0-9]+)\\)/:\\1/g\" 1>&2
}

CUDA_DIR = /usr/local/cuda

# make sure cuda is available on the computer
if ( exists( $$CUDA_DIR/ ) ) {
    message( "Configuring for cuda...");
    DEFINES += CUDA_7 # same as putting this in code -> #define CUDA_7

    # Cuda sources
    CUDA_SOURCES += cuda/wrappers.cu

    # show files in working tree
    OTHER_FILES += cuda/wrappers.cu \
                   cuda/wrappers.cuh \
                   cuda/helper_cuda.h

    # Path to the cuda libraries
    CUDA_LIB = $$CUDA_DIR/lib

    # Path to header and lib files
    INCLUDEPATH += $$CUDA_DIR/include \
                   cuda # my cuda files
    QMAKE_LIBDIR += $$CUDA_LIB

    # prevents warnings from code we didn't write
    QMAKE_CXXFLAGS += -isystem $$CUDA_DIR/include

    LIBS += -lcudart # add other cuda libs here (-lcublas -lcurand, etc.)

    # SPECIFY THE R PATH FOR NVCC!!!!! (your problem...previously my problem)
    QMAKE_LFLAGS += -Wl,-rpath,$$CUDA_LIB
    NVCCFLAGS = -Xlinker -rpath,$$CUDA_LIB

    # libs used in the code
    CUDA_LIBS = $$LIBS
    CUDA_LIBS -= $$NON_CUDA_LIBS # remove libs nvcc won't recognize

    # GPU architecture (might be a way to detect this somehow instead of hardcoding)
    CUDA_ARCH = sm_20 # <- based on specs from your code. This was tested with sm_30

    # Some default NVCC flags
    NVCCFLAGS += --compiler-options -fno-strict-aliasing -use_fast_math --ptxas-options=-v --std=c++11

    # Prepare the extra compiler configuration (taken from the nvidia forum)
    # CUDA_INC becomes: -Idir1 -Idir2 ...
    CUDA_INC = $$join(INCLUDEPATH,' -I','-I',' ')
    cuda.commands = $$CUDA_DIR/bin/nvcc -m64 -O3 -arch=$$CUDA_ARCH -c $$NVCCFLAGS \
                    $$CUDA_INC $$CUDA_LIBS ${QMAKE_FILE_NAME} -o ${QMAKE_FILE_OUT} \
                    $$SED_STUFF
    # nvcc error printout format ever so slightly different from gcc
    # http://forums.nvidia.com/index.php?showtopic=171651
    cuda.dependency_type = TYPE_C
    cuda.depend_command = $$CUDA_DIR/bin/nvcc -O3 -M $$CUDA_INC $$NVCCFLAGS ${QMAKE_FILE_NAME}
    cuda.input = CUDA_SOURCES
    cuda.output = ${OBJECTS_DIR}${QMAKE_FILE_BASE}_cuda.o

    # Tell Qt that we want add more stuff to the Makefile
    QMAKE_EXTRA_COMPILERS += cuda
} # endif CUDA
The following two files are composed of extern functions used to execute CUDA code. The .cu file defines functions that contain CUDA code and gets compiled with NVCC (as specified in the .pro file). The .cuh file is used as a header file and simply declares the same functions so they can be referenced by C++ files. Only wrappers.cuh needs to be included in the C++ code.
Note: The referenced helper_cuda.h file can be found here
Note: This project assumes wrappers.cuh, wrappers.cu, and helper_cuda.h are kept in a folder labeled cuda within the project directory.
cuda/wrappers.cuh
#ifndef WRAPPERS_CUH
#define WRAPPERS_CUH

/*
 * C-linkage declarations of the host-side CUDA wrapper functions implemented
 * in wrappers.cu. Including this header is all a plain C++ source file needs
 * in order to call them; no CUDA header is required here.
 */

#include <stddef.h> /* size_t */

typedef unsigned int uint;

extern "C"
{
    /* Select a CUDA device; terminates the process if none is found. */
    void cudaInit();

    /* Allocate `size` bytes of device memory and store the pointer in *devPtr.
     * `size` is a size_t so that the declaration matches the definition in
     * wrappers.cu exactly; with C linkage a mismatch is not caught by the
     * linker. */
    void allocateArray(void **devPtr, size_t size);

    /* Release device memory obtained from allocateArray. */
    void freeArray(void *devPtr);

    /* Copy `size` bytes from host memory to `device` + `offset` bytes. */
    void copyArrayToDevice(void *device, const void *host, int offset, int size);

    /* Copy `size` bytes from device memory to host memory. */
    void copyArrayFromDevice(void *host, const void *device, int size);

    /* Sum the first n elements of the device array dNumbers. */
    uint sumNumbers(uint *dNumbers, uint n);

    // not used here but useful when calling kernel functions
    void computeGridSize(uint n, uint blockSize, uint &numBlocks, uint &numThreads);
}

#endif // WRAPPERS_CUH
cuda/wrappers.cu
#include <cuda_runtime.h>
#include <cuda_gl_interop.h>
#include <thrust/device_ptr.h>
#include <thrust/reduce.h>
#include "helper_cuda.h"
typedef unsigned int uint;
extern "C"
{
// Selects the CUDA device to use via findCudaDevice() from helper_cuda.h.
// If no device is reported (negative id), prints a message and terminates
// the process with a failure status so that callers and scripts can detect
// the problem.
void cudaInit()
{
    // use device with highest Gflops/s
    const int devID = findCudaDevice();

    if (devID < 0)
    {
        printf("No CUDA Capable devices found, exiting...\n");
        exit(EXIT_FAILURE); // a missing device is an error, not a success
    }
}
// Allocates `size` bytes with cudaMalloc and stores the resulting pointer in
// *devPtr. The cudaMalloc result is handed to checkCudaErrors from
// helper_cuda.h (presumably reports the failure and aborts - confirm).
void allocateArray(void **devPtr, size_t size)
{
checkCudaErrors(cudaMalloc(devPtr, size));
}
// Releases the memory at devPtr with cudaFree; the result is handed to
// checkCudaErrors from helper_cuda.h (presumably aborts on failure - confirm).
void freeArray(void *devPtr)
{
checkCudaErrors(cudaFree(devPtr));
}
// Copies `size` bytes from `host` to the device address `device` + `offset`.
// The offset is applied after casting to char*, i.e. it is counted in bytes.
// The cudaMemcpy result is handed to checkCudaErrors from helper_cuda.h.
void copyArrayToDevice(void *device, const void *host, int offset, int size)
{
checkCudaErrors(cudaMemcpy((char *) device + offset, host, size, cudaMemcpyHostToDevice));
}
// Copies `size` bytes from the device address `device` to `host`.
// The cudaMemcpy result is handed to checkCudaErrors from helper_cuda.h.
void copyArrayFromDevice(void *host, const void *device, int size)
{
checkCudaErrors(cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost));
}
// Returns the sum of the first n elements of dNumbers, computed with
// thrust::reduce. The raw pointer is wrapped in thrust::device_ptr so that
// Thrust treats the range as device memory.
uint sumNumbers(uint *dNumbers, uint n)
{
    const thrust::device_ptr<uint> first(dNumbers);
    const thrust::device_ptr<uint> last = first + n;
    return thrust::reduce(first, last);
}
// Integer division rounded up: the smallest q with q * b >= a.
// Returns 0 when b is 0 instead of dividing by zero.
uint iDivUp(uint a, uint b)
{
    if (b == 0)
    {
        return 0;
    }
    const uint quotient = a / b;
    return (a % b != 0) ? (quotient + 1) : quotient;
}
// compute grid and thread block size for a given number of elements
//
// numThreads receives min(blockSize, n) and numBlocks the number of blocks of
// that size needed to cover n elements. When n or blockSize is 0 both outputs
// are set to 0 rather than dividing by zero.
void computeGridSize(uint n, uint blockSize, uint &numBlocks, uint &numThreads)
{
    numThreads = (blockSize < n) ? blockSize : n;
    if (numThreads == 0)
    {
        numBlocks = 0; // nothing to launch
        return;
    }
    numBlocks = iDivUp(n, numThreads);
}
}
The next three files create a simple QT window and check for mouse events. Every time the mouse is moved the X and Y pixel positions are added together to create n. Then a CUDA function is used to find 1 + 2 + ... + n (yes this is weird and random; the point was to show CUDA running in a quick and easy way).
So if the mouse is at (23, 45) then:
n = (23 + 45) = 68 and
1 + 2 + ... + n = 2346
This is then displayed at the bottom of the window.
main.cpp
#include "mainwindow.h"
#include <QApplication>
int main(int argc, char *argv[])
{
QApplication a(argc, argv);
MainWindow w;
w.show();
return a.exec();
}
mainwindow.h
#ifndef MAINWINDOW_H
#define MAINWINDOW_H
#include <QMainWindow>
namespace Ui {
class MainWindow;
}
/**
 * Main window of the demo. It installs itself as an event filter and owns a
 * device-side array of numbers (m_dNumbers) that is summed on the GPU.
 */
class MainWindow : public QMainWindow
{
    Q_OBJECT

public:
    explicit MainWindow(QWidget *parent = 0);
    ~MainWindow();

    /// Receives the events delivered through the installed event filter.
    virtual bool eventFilter(QObject *watched, QEvent *event);

private:
    Ui::MainWindow *ui;   ///< generated UI, owned by this window
    uint *m_dNumbers;     ///< device array
};
#endif // MAINWINDOW_H
mainwindow.cpp
#include "mainwindow.h"
#include "ui_mainwindow.h"
#include <QEvent>
#include <QMouseEvent>
#include <assert.h>
#include "wrappers.cuh"
const uint MAX_NUMBERS = 5000;
/**
 * Builds the UI, installs this window as an application-wide event filter
 * and uploads the numbers 1..MAX_NUMBERS to a freshly allocated device array.
 */
MainWindow::MainWindow(QWidget *parent) :
    QMainWindow(parent),
    ui(new Ui::MainWindow)
{
    // UI first, then start listening for mouse movements
    ui->setupUi(this);
    qApp->installEventFilter(this);

    // host-side staging buffer holding {1, 2, 3, ..., MAX_NUMBERS}
    uint hostValues[MAX_NUMBERS];
    uint next = 1;
    for (uint *slot = hostValues; slot != hostValues + MAX_NUMBERS; ++slot)
    {
        *slot = next++;
    }

    // CUDA: pick a device, allocate the device array, upload the host data
    cudaInit();
    allocateArray((void**)&m_dNumbers, MAX_NUMBERS*sizeof(int));
    copyArrayToDevice(m_dNumbers, hostValues, 0, MAX_NUMBERS*sizeof(int));
}
// Destructor: releases the device array allocated in the constructor, then
// deletes the generated UI object.
MainWindow::~MainWindow()
{
// CUDA FUNCTION: free device memory
freeArray(m_dNumbers);
delete ui;
}
// used to detect mouse movement events
//
// For every mouse-move event, n = x + y of the event position is clamped to
// [0, MAX_NUMBERS], the GPU computes 1 + 2 + ... + n, and the result is shown
// in the status bar. Always returns false, so the event is never consumed.
bool MainWindow::eventFilter(QObject *, QEvent *event)
{
    if (event->type() == QEvent::MouseMove)
    {
        // find mouseX + mouseY
        QMouseEvent *mouseEvent = static_cast<QMouseEvent*>(event);
        const QPoint p = mouseEvent->pos();

        // Clamp while the value is still signed: casting a negative sum
        // straight to uint would wrap to a huge value and end up as
        // MAX_NUMBERS instead of 0.
        const int rawSum = p.x() + p.y();
        uint n = 0;
        if (rawSum > 0)
        {
            const uint positiveSum = static_cast<uint>(rawSum);
            n = (positiveSum < MAX_NUMBERS) ? positiveSum : MAX_NUMBERS;
        }

        // CUDA FUNCTION:
        // compute the sum of 1 + 2 + 3 + ... + n
        uint sum = sumNumbers(m_dNumbers, n);

        // check that the sum is correct
        assert(sum == ( (n * (n+1) ) / 2 ) );

        // show the sum at the bottom of the window
        statusBar()->showMessage(QString("Mouse pos: (%1, %2) Sum from 0 to %3 = %4").arg(p.x()).arg(p.y()).arg(n).arg(sum));
    }
    return false;
}
And last but not least the .ui file if you want to actually build and run the project:
mainwindow.ui
<?xml version="1.0" encoding="UTF-8"?>
<ui version="4.0">
<class>MainWindow</class>
<widget class="QMainWindow" name="MainWindow">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>400</width>
<height>300</height>
</rect>
</property>
<property name="windowTitle">
<string>MainWindow</string>
</property>
<widget class="QWidget" name="centralWidget"/>
<widget class="QMenuBar" name="menuBar">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>400</width>
<height>22</height>
</rect>
</property>
</widget>
<widget class="QToolBar" name="mainToolBar">
<attribute name="toolBarArea">
<enum>TopToolBarArea</enum>
</attribute>
<attribute name="toolBarBreak">
<bool>false</bool>
</attribute>
</widget>
<widget class="QStatusBar" name="statusBar"/>
</widget>
<layoutdefault spacing="6" margin="11"/>
<resources/>
<connections/>
</ui>
I know the QT/CUDA process can be annoying and it's been half a year of silence since you asked the question but hopefully this helps.
I'm a newcomer to OpenCL programming, and I want to run a simple program from the "OpenCL Parallel Programming Development Cookbook".
In fact, I want to query OpenCl platforms by this simple prog:
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
/*
 * Print one string-valued property of an OpenCL platform as "<name>: <value>".
 *
 * id             platform to query
 * param_name     property to fetch (e.g. CL_PLATFORM_NAME)
 * paramNameAsStr label printed in front of the value and used in diagnostics
 *
 * clGetPlatformInfo is called twice: first to learn the size of the value,
 * then to fetch it into a buffer of that size. On any failure a message is
 * written to stderr and nothing is printed to stdout.
 */
void displayPlatformInfo(cl_platform_id id,
                         cl_platform_info param_name,
                         const char* paramNameAsStr) {
    cl_int error = 0;
    size_t paramSize = 0;

    /* First call: only ask how many bytes the value needs. */
    error = clGetPlatformInfo( id, param_name, 0, NULL, &paramSize );
    if (error != CL_SUCCESS ) {
        fprintf(stderr, "Unable to find any OpenCL platform information for %s (error %d)\n",
                paramNameAsStr, (int)error);
        return;
    }

    /* One extra byte so the buffer can always be terminated below. */
    char* moreInfo = (char*)malloc( sizeof(char) * (paramSize + 1) );
    if (moreInfo == NULL) {
        perror("Unable to allocate memory for OpenCL platform information");
        return;
    }

    /* Second call: fetch the value itself. */
    error = clGetPlatformInfo( id, param_name, paramSize, moreInfo, NULL );
    if (error != CL_SUCCESS ) {
        fprintf(stderr, "Unable to find any OpenCL platform information for %s (error %d)\n",
                paramNameAsStr, (int)error);
        free(moreInfo);
        return;
    }
    moreInfo[paramSize] = '\0';

    printf("%s: %s\n", paramNameAsStr, moreInfo);
    free(moreInfo);
}
/*
 * Lists every installed OpenCL platform together with its profile, version,
 * name, vendor and extensions.
 *
 * Returns 0 on success; exits with status 1 when the platforms cannot be
 * queried or memory cannot be allocated.
 */
int main() {
    /* OpenCL 1.2 data structures */
    cl_platform_id* platforms = NULL;
    /* OpenCL 1.1 scalar data types */
    cl_uint numOfPlatforms = 0;
    cl_int error;

    /*
       Get the number of platforms.
       Each vendor SDK installed on the computer adds to the number of
       available platforms.
    */
    error = clGetPlatformIDs(0, NULL, &numOfPlatforms);
    if(error < 0) {
        fprintf(stderr, "Unable to find any OpenCL platforms (error %d)\n", (int)error);
        exit(1);
    }
    printf("Number of OpenCL platforms found: %u\n",
           (unsigned int)numOfPlatforms);
    if (numOfPlatforms == 0) {
        return 0;
    }

    // Allocate heap memory for the ids of the installed platforms;
    // it is released with free() before returning.
    platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id)
                                         * numOfPlatforms);
    if (platforms == NULL) {
        perror("Unable to allocate memory for the OpenCL platform list");
        exit(1);
    }

    // Second call: actually fill the array with the platform ids. Without it
    // the loop below would read uninitialised memory.
    error = clGetPlatformIDs(numOfPlatforms, platforms, NULL);
    if(error < 0) {
        fprintf(stderr, "Unable to retrieve the OpenCL platform ids (error %d)\n", (int)error);
        free(platforms);
        exit(1);
    }

    // displayPlatformInfo invokes 'clGetPlatformInfo' twice for each
    // parameter: once to learn the size of the value and once to fetch it.
    for(cl_uint i = 0; i < numOfPlatforms; ++i) {
        displayPlatformInfo( platforms[i],
                             CL_PLATFORM_PROFILE,
                             "CL_PLATFORM_PROFILE" );
        displayPlatformInfo( platforms[i],
                             CL_PLATFORM_VERSION,
                             "CL_PLATFORM_VERSION" );
        displayPlatformInfo( platforms[i],
                             CL_PLATFORM_NAME,
                             "CL_PLATFORM_NAME" );
        displayPlatformInfo( platforms[i],
                             CL_PLATFORM_VENDOR,
                             "CL_PLATFORM_VENDOR" );
        displayPlatformInfo( platforms[i],
                             CL_PLATFORM_EXTENSIONS,
                             "CL_PLATFORM_EXTENSIONS" );
    }

    free(platforms);
    return 0;
}
I'm on Qt Creator, and my pc's config concerning video is : NVIDIA GEFORCE GT 635M & Intel(R) HD Graphics 4000 under Windows 8.1
My .pro file is :
# qmake project for the OpenCL platform query program.
SOURCES += \
main.cpp
# NOTE(review): -std=c++0x is a GCC-style flag, so the kit in use is
# presumably MinGW/GCC rather than MSVC - confirm.
QMAKE_CXXFLAGS += -std=c++0x
# $$quote(...) keeps a path containing spaces together as a single value.
INCLUDEPATH += \
$$quote(C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v6.5/include)
# The import library is taken from the x64 directory, so it only suits a
# 64-bit build of the application.
LIBS += \
$$quote(C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v6.5/lib/x64/OpenCL.lib)
I use $$quote because of the spaces in the file paths. So, my question is: why, when I compile my project, does the error "undefined reference to `clGetPlatformInfo@20'" appear? There are two other errors (one is exactly the same, the other is "undefined reference to `clGetPlatformIDs@12'").
I have searched the web for many days and can't find the answer (this problem has answers, but only for Linux or Mac).
Thanks in advance !
Mathieu
It looks like you are trying to build 32-bit application, while linking with 64-bit version of OpenCL.lib:
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v6.5/lib/x64/OpenCL.lib
So, either build application in 64-bit mode, or fix the path to point to 32-bit version of OpenCL.lib.
I am trying to do separate compilation using CUDA 5. For this reason I set the "Generate Relocatable Device Code" to "Yes (-rdc=true)" in Visual Studio 2010. The program compiles without errors, however,
I get an invalid device symbol error when I try to initialize device constants using cudaMemcpyToSymbol.
i.e. I have the following constant
__constant__ float gdDomainOrigin[2];
and try to initialize it with
cudaMemcpyToSymbol(gdDomainOrigin, mDomainOrigin, 2*sizeof(float));
which leads to the error. The error does not occur, when I compile everything as a whole, without the aforementioned option set. Could anybody please help me with that?
I can't reproduce this. If I build an application from two .cu files, one containing a __constant__ symbol and a simple kernel, and the other containing the runtime API incantations to populate that constant memory and call the kernel, it works only when relocatable device code is enabled, viz:
__constant__ float gdDomainOrigin[2];
// Copies the two floats of the __constant__ array gdDomainOrigin into
// inout[0] and inout[1]. Every thread that runs performs the same two
// writes; there is no per-thread indexing.
__global__
void kernel(float *inout)
{
    const float first = gdDomainOrigin[0];
    const float second = gdDomainOrigin[1];
    inout[0] = first;
    inout[1] = second;
}
and
#include <cstdio>
// Constant-memory symbol defined in the other .cu file. The declaration must
// carry the same type as the definition (float[2]), not a scalar float.
extern __constant__ float gdDomainOrigin[2];
// Kernel defined in the other .cu file; it writes two floats through its argument.
extern __global__ void kernel(float *);
/**
 * Reports a failed CUDA runtime call on stderr and optionally terminates.
 *
 * @param code  Status returned by a CUDA runtime API call.
 * @param file  Source file of the call site (normally __FILE__, a string
 *              literal, hence const char *).
 * @param line  Source line of the call site (normally __LINE__).
 * @param Abort When true (the default), exit the process using the error
 *              code as the exit status after printing the message.
 */
inline
void gpuAssert(cudaError_t code, const char * file, int line, bool Abort=true)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n",
                cudaGetErrorString(code), file, line);
        if (Abort) exit(code);
    }
}
// Wraps a CUDA runtime call and checks its status at the call site.
// do { } while (0) makes the macro behave as a single statement, so it is
// safe inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
/**
 * Host driver: uploads two floats into the constant-memory symbol
 * gdDomainOrigin, launches kernel with one block of one thread to copy them
 * into a device buffer, copies the buffer back and prints both values.
 * Every runtime call is checked with gpuErrchk.
 */
int main(void)
{
    const float mDomainOrigin[2] = { 1.234f, 5.6789f };
    const size_t sz = sizeof(float) * size_t(2);
    float * dbuf = 0;
    float hbuf[2];   // host result buffer; automatic storage, so nothing to leak

    // NOTE(review): cudaFree(0) is presumably here to force runtime/context
    // initialisation before the first real call.
    gpuErrchk( cudaFree(0) );
    gpuErrchk( cudaMemcpyToSymbol(gdDomainOrigin, mDomainOrigin, sz) );
    gpuErrchk( cudaMalloc((void **)&dbuf, sz) );

    kernel<<<1,1>>>(dbuf);
    gpuErrchk( cudaPeekAtLastError() );   // reports launch errors

    // Blocking copy back to the host.
    gpuErrchk( cudaMemcpy(hbuf, dbuf, sz, cudaMemcpyDeviceToHost) );
    fprintf(stdout, "%f %f\n", hbuf[0], hbuf[1]);

    // Release the device allocation (the original never freed it).
    gpuErrchk( cudaFree(dbuf) );
    return 0;
}
Compiling and running these in CUDA 5 on a 64 bit linux system with a Kepler GPU produces the following:
$ nvcc -arch=sm_30 -o shared shared.cu shared_dev.cu
$ ./shared
GPUassert: invalid device symbol shared.cu 23
$ nvcc -arch=sm_30 -rdc=true -o shared shared.cu shared_dev.cu
$ ./shared
1.234000 5.678900
You can see that in the first compilation, without relocatable GPU code generation, the symbol isn't found. In the second case, with relocatable GPU code generation, it is found, and the elf header in the object file looks just as you would expect:
$ nvcc -arch=sm_30 -rdc=true -c shared_dev.cu
$ cuobjdump -symbols shared_dev.o
Fatbin elf code:
================
arch = sm_30
code version = [1,6]
producer = cuda
host = linux
compile_size = 64bit
identifier = shared_dev.cu
symbols:
STT_SECTION STB_LOCAL .text._Z6kernelPf
STT_SECTION STB_LOCAL .nv.constant3
STT_SECTION STB_LOCAL .nv.constant0._Z6kernelPf
STT_CUDA_OBJECT STB_LOCAL _param
STT_SECTION STB_LOCAL .nv.callgraph
STT_FUNC STB_GLOBAL _Z6kernelPf
STT_CUDA_OBJECT STB_GLOBAL gdDomainOrigin
Fatbin ptx code:
================
arch = sm_30
code version = [3,1]
producer = cuda
host = linux
compile_size = 64bit
compressed
identifier = shared_dev.cu
ptxasOptions = --compile-only
Perhaps you could try my code and compilation/diagnostic steps and see what happens with your Windows toolchain.
I'm trying to get started working with Intel's Array Building Blocks, and there seems to only be one tutorial on "Hello World", at http://www.hpc.lsu.edu/training/tutorials/sc10/tutorials/SC10Tutorials/docs/M07/M07.pdf . And I'm not really getting it.
I'm using Visual Studio 2010 and this is the code I got from the above link, kinda.
#include <C:/Program Files/intel/arbb/Beta6/include/arbb.hpp>
//What do I have to do to make just "#include <arbb.hpp>" work?
using namespace arbb;
// Prints "Hello, world!" and then assigns input + 1.0f to result.
// NOTE(review): std::cout is used, but this snippet includes only arbb.hpp;
// <iostream> may need to be included explicitly — confirm.
void my_function(f32& result, f32 input){
std::cout << "Hello, world!" << std::endl;
result = input + 1.0f; //"Error: no operator "+" matches these operands
}
// Builds three closure objects of type closure<void (f32&, f32)> from
// my_function: one through capture() and two through call().
// None of them is invoked afterwards in this snippet.
// NOTE(review): how capture() and call() differ is not visible here —
// confirm against the ArBB documentation.
int main(){
typedef closure<void (f32&, f32)> mfc;
mfc a = capture(my_function);
mfc b = call(my_function);
mfc c = call(my_function);
}
What else do I need to do to get "Hello World" working?
There are many samples available in the ArBB installation path. You can use the Visual Studio solution files to start with any of the samples. That is the easiest way.
In order to compile and run your own application from scratch, you have to have the include and dependencies set.
On Linux, you can add the path ~/(whatever)/intel/arbb/Beta6/include in the compile option using -I
On Windows, you can do:
rem INCLUDE must list directories (not the header file itself); keep any existing entries.
set INCLUDE=C:\Program Files\intel\arbb\Beta6\include;%INCLUDE%
Or have a batch script that will ensure all the environment variables are set by default.
--- contents of the batch file ---
rem Sets ARBB_OPT_LEVEL to O3.
rem NOTE(review): presumably selects the ArBB runtime optimisation level - confirm in the ArBB docs.
SET ARBB_OPT_LEVEL=O3
rem Prepends the ArBB ia32 binary directories and the listed library directories to PATH.
rem NOTE(review): ARBB_ROOT, MSVS_VERSION, OPENCV_ROOT, FFTW_ROOT, FREEGLUT_ROOT and PTHREADS_ROOT
rem are not set in this excerpt; they must already be defined.
SET PATH=%ARBB_ROOT%\bin\ia32;%ARBB_ROOT%\bin\ia32\vs%MSVS_VERSION%;%OPENCV_ROOT%\bin;%FFTW_ROOT%;%FREEGLUT_ROOT%;%PTHREADS_ROOT%\lib;%PATH%
---- here is hello world program in arbb ---
#include <arbb.hpp>
// Doubles val in place. It is passed to map() in arbb_hello.
// NOTE(review): presumably map() invokes this once per element of the
// container — confirm against the ArBB documentation.
void arbb_hello_map(arbb::i32& val)
{
val = val * 2;
}
// Applies arbb_hello_map to the dense container `data` through arbb::map;
// `data` is taken by reference and modified in place.
void arbb_hello(arbb::dense<arbb::i32>& data)
{
    arbb::map(arbb_hello_map)(data);
}
int main()
{
using namespace arbb;
int size = 5;
dense<i32> data = dense<i32>(size);
range<i32> write_data = data.write_only_range();
for (int i = 0; i < size; ++i)
write_data[i] = i;
arbb::call(arbb_hello)(data);
std::cout << "hello: " << std::endl;
const_range<i32> read_data = data.read_only_range();
for (int i = 0; i < size; ++i)
std::cout <<"data["<<i<<"] = " << read_data[i] <<"\n";
return 0;
}
And compile it using
g++ -m64 -I/home/YOUR_NAME/arbb/install//include -Wall -Werror -O3 -W -Wshadow temp.cpp -o temp -L/home/YOUR_NAME/arbb/install/lib/intel64 -larbb_dev -ltbb -littnotify
Run it using
./temp