I am writing a simple OpenCL application that is going to measure the maximum experimental FLOPS of a target GPU device. I have decided to keep my CL kernel as simple as possible. Here are my OpenCL kernel and my host code. The kernel code is:
__kernel void flops(__global float *data) {
int gid = get_global_id(0);
double s = data[gid];
data[gid] = s * 0.35;
}
And the host code is:
#include <iostream>
#include <fstream>
#include <sstream>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "support.h"
#include "Event.h"
#include "ResultDatabase.h"
#include "OptionParser.h"
#include "ProgressBar.h"
using namespace std;
std::string kernels_folder = "/home/users/saman/shoc/src/opencl/level3/FlopsFolder/";
std::string kernel_file = "flops.cl";
static const char *opts = "-cl-mad-enable -cl-no-signed-zeros "
"-cl-unsafe-math-optimizations -cl-finite-math-only";
cl_program createProgram (cl_context context,
cl_device_id device,
const char* fileName) {
cl_int errNum;
cl_program program;
std::ifstream kernelFile (fileName, std::ios::in);
if (!kernelFile.is_open()) {
std::cerr << "Failed to open file for reading: " << fileName << std::endl;
return NULL;
}
std::ostringstream oss;
oss << kernelFile.rdbuf();
std::string srcStdStr = oss.str();
const char *srcStr = srcStdStr.c_str();
program = clCreateProgramWithSource (context, 1, (const char **)&srcStr,
NULL, &errNum);
CL_CHECK_ERROR(errNum);
errNum = clBuildProgram (program, 0, NULL, NULL, NULL, NULL);
CL_CHECK_ERROR (errNum);
return program;
}
bool createMemObjects (cl_context context, cl_command_queue queue,
cl_mem* memObject,
const int memFloatsSize, float *a) {
cl_int err;
*memObject = clCreateBuffer (context, CL_MEM_READ_WRITE,
memFloatsSize * sizeof(float), NULL, &err);
CL_CHECK_ERROR(err);
if (*memObject == NULL) {
std::cerr << "Error creating memory objects. " << std::endl;
return false;
}
Event evWrite("write");
err = clEnqueueWriteBuffer (queue, *memObject, CL_FALSE, 0, memFloatsSize * sizeof(float),
a, 0, NULL, &evWrite.CLEvent());
CL_CHECK_ERROR(err);
err = clWaitForEvents (1, &evWrite.CLEvent());
CL_CHECK_ERROR(err);
return true;
}
void cleanup (cl_context context, cl_command_queue commandQueue,
cl_program program, cl_kernel kernel, cl_mem memObject) {
if (memObject != NULL)
clReleaseMemObject (memObject);
if (kernel != NULL)
clReleaseKernel (kernel);
if (program != NULL)
clReleaseProgram (program);
}
void addBenchmarkSpecOptions(OptionParser &op) {
}
void RunBenchmark(cl_device_id id,
cl_context ctx,
cl_command_queue queue,
ResultDatabase &resultDB,
OptionParser &op)
{
for (float i = 0.1; i <= 0.2; i+=0.1 ) {
std::cout << "Deploying " << 100*i << "%" << std::endl;
bool verbose = false;
cl_int errNum;
cl_program program = 0;
cl_kernel kernel;
cl_mem memObject = 0;
char maxFloatsStr[128];
char testStr[128];
program = createProgram (ctx, id, (kernels_folder + kernel_file).c_str());
if (program == NULL) {
exit (0);
}
if (verbose) std::cout << "Program created successfully!" << std::endl;
kernel = clCreateKernel (program, "flops", &errNum);
CL_CHECK_ERROR(errNum);
if (verbose) std::cout << "Kernel created successfully!" << std::endl;
// Identify maximum size of the global memory on the device side
cl_long maxAllocSizeBytes = 0;
cl_long maxComputeUnits = 0;
cl_long maxWorkGroupSize = 0;
clGetDeviceInfo (id, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
sizeof(cl_long), &maxAllocSizeBytes, NULL);
clGetDeviceInfo (id, CL_DEVICE_MAX_COMPUTE_UNITS,
sizeof(cl_long), &maxComputeUnits, NULL);
clGetDeviceInfo (id, CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(cl_long), &maxWorkGroupSize, NULL);
// Let's use 80% of this memory for transferring data
cl_long maxFloatsUsageSize = ((maxAllocSizeBytes / 4) * 0.8);
if (verbose) std::cout << "Max floats usage size is " << maxFloatsUsageSize << std::endl;
if (verbose) std::cout << "Max compute unit is " << maxComputeUnits << std::endl;
if (verbose) std::cout << "Max Work Group size is " << maxWorkGroupSize << std::endl;
// Prepare buffer on the host side
float *a = new float[maxFloatsUsageSize];
for (int j = 0; j < maxFloatsUsageSize; j++) {
a[j] = (float) (j % 77);
}
if (verbose) std::cout << "Host buffer been prepared!" << std::endl;
// Creating buffer on the device side
if (!createMemObjects(ctx, queue, &memObject, maxFloatsUsageSize, a)) {
exit (0);
}
errNum = clSetKernelArg (kernel, 0, sizeof(cl_mem), &memObject);
CL_CHECK_ERROR(errNum);
size_t wg_size, wg_multiple;
cl_ulong local_mem, private_usage, local_usage;
errNum = clGetKernelWorkGroupInfo (kernel, id,
CL_KERNEL_WORK_GROUP_SIZE,
sizeof (wg_size), &wg_size, NULL);
CL_CHECK_ERROR (errNum);
errNum = clGetKernelWorkGroupInfo (kernel, id,
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
sizeof (wg_multiple), &wg_multiple, NULL);
CL_CHECK_ERROR (errNum);
errNum = clGetKernelWorkGroupInfo (kernel, id,
CL_KERNEL_LOCAL_MEM_SIZE,
sizeof (local_usage), &local_usage, NULL);
CL_CHECK_ERROR (errNum);
errNum = clGetKernelWorkGroupInfo (kernel, id,
CL_KERNEL_PRIVATE_MEM_SIZE,
sizeof (private_usage), &private_usage, NULL);
CL_CHECK_ERROR (errNum);
if (verbose) std::cout << "Work Group size is " << wg_size << std::endl;
if (verbose) std::cout << "Preferred Work Group size is " << wg_multiple << std::endl;
if (verbose) std::cout << "Local memory size is " << local_usage << std::endl;
if (verbose) std::cout << "Private memory size is " << private_usage << std::endl;
size_t globalWorkSize[1] = {maxFloatsUsageSize};
size_t localWorkSize[1] = {1};
Event evKernel("flops");
errNum = clEnqueueNDRangeKernel (queue, kernel, 1, NULL,
globalWorkSize, localWorkSize,
0, NULL, &evKernel.CLEvent());
CL_CHECK_ERROR (errNum);
if (verbose) cout << "Waiting for execution to finish ";
errNum = clWaitForEvents(1, &evKernel.CLEvent());
CL_CHECK_ERROR(errNum);
evKernel.FillTimingInfo();
if (verbose) cout << "Kernel execution terminated successfully!" << std::endl;
delete[] a;
sprintf (maxFloatsStr, "Size: %ld", maxFloatsUsageSize);
sprintf (testStr, "Flops: %f%% Memory", 100*i);
double flopCount = maxFloatsUsageSize * 16000;
double gflop = flopCount / (double)(evKernel.SubmitEndRuntime());
resultDB.AddResult (testStr, maxFloatsStr, "GFLOPS", gflop);
// Now it's time to read back the data
a = new float[maxFloatsUsageSize];
errNum = clEnqueueReadBuffer(queue, memObject, CL_TRUE, 0, maxFloatsUsageSize*sizeof(float), a, 0, NULL, NULL);
CL_CHECK_ERROR(errNum);
if (verbose) {
for (int j = 0; j < 10; j++) {
std::cout << a[j] << " ";
}
}
delete[] a;
if (memObject != NULL)
clReleaseMemObject (memObject);
if (program != NULL)
clReleaseProgram (program);
if (kernel != NULL)
clReleaseKernel (kernel);
}
std::cout << "Program executed successfully!" << std::endl;
}
Explaining the code: in the kernel I actually do a single floating-point operation, which means every work-item performs one FLOP. In the host code, I first retrieve the maximum global memory size of the GPU, allocate a portion of it (the for loop defines how much), then push the data and the kernel execution onto the device. I measure the execution time of clEnqueueNDRangeKernel and then calculate the GFLOPS of the application. In my current implementation, no matter what the size of cl_mem is, I get around 0.28 GFLOPS of performance, which is much less than the advertised power. I assume I am doing specific things inefficiently here, or that my method for calculating the GPU performance is not right in general. Can anyone tell me what kind of changes I should make to the code?
With a local group size of 1, you are wasting 31/32 of the resources (so you can reach at most 1/32 of peak performance). You need a local group size of at least 32 (and a multiple of 32) to fully utilize the computation resources, and 64 to achieve 100% occupancy (though 100% occupancy is not necessary).
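For example, instead of hard-coding localWorkSize to 1, you could reuse the wg_multiple value your host code already queries. A minimal sketch based on the variables in the question (note that the padded work-items would need a bounds check in the kernel):
size_t localWorkSize[1] = { wg_multiple }; // e.g. 32 or 64 on NVIDIA hardware
// OpenCL 1.x requires the global size to be divisible by the local size,
// so round it up to the next multiple of wg_multiple.
size_t globalWorkSize[1] = { ((maxFloatsUsageSize + wg_multiple - 1) / wg_multiple) * wg_multiple };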
Memory access has high latency and low bandwidth. Your kernel will always be waiting on the memory controllers if everything else is right. You need to do more arithmetic operations to keep the ALUs busy.
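As a hedged sketch (the loop count and constants are arbitrary), a more compute-bound variant of your kernel could run several independent multiply-add chains per work-item, keeping the results live so the compiler cannot drop them:
__kernel void flops(__global float *data) {
int gid = get_global_id(0);
float a = data[gid]; // chain 1
float b = a * 0.5f + 1.0f; // chain 2, independent of chain 1
for (int i = 0; i < 1024; i++) {
a = a * 0.35f + 0.77f; // 2 FLOPs (a single mad with -cl-mad-enable)
b = b * 0.77f + 0.35f; // 2 FLOPs, independent of a
}
data[gid] = a + b; // keep both chains live in the output
}
With this, flopCount would be roughly 4 * 1024 FLOPs per work-item instead of 1.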
You need to read the documentation first and make use of the Visual Profiler. In the previous two points I just wanted to show that things are stranger than you thought. But even stranger things are waiting.
You can achieve peak performance easily on a CPU with assembly language, by doing only independent arithmetic operations (if you write such code in C it will simply be dropped by the compiler). NVIDIA only provides us an IL interface called PTX, and I'm not sure whether the compiler will optimize it. And you can only use PTX from CUDA, I think.
Edit: it seems that the compiler will optimize unused PTX code away, at least in inline assemblers.
I want to read a chunk of data which is just one frame out of the many frames stored in one dataset. The shape of the whole dataset is (10, 11214, 3): 10 frames, and each frame has 11214 rows and 3 columns. Here is the file. The chunk I want to read would have the shape (11214, 3). I can print a predefined array with the code below, but I'm not sure how to read data from an HDF5 file. Here is my code:
#include <h5xx/h5xx.hpp>
#include <boost/multi_array.hpp>
#include <iostream>
#include <vector>
#include <cstdio>
typedef boost::multi_array<int, 2> array_2d_t;
const int NI=10;
const int NJ=NI;
void print_array(array_2d_t const& array)
{
for (unsigned int j = 0; j < array.shape()[1]; j++)
{
for (unsigned int i = 0; i < array.shape()[0]; i++)
{
printf("%2d ", array[j][i]);
}
printf("\n");
}
}
void write_int_data(std::string const& filename, array_2d_t const& array)
{
h5xx::file file(filename, h5xx::file::trunc);
std::string name;
{
// --- create dataset and fill it with the default array data (positive values)
name = "integer array";
h5xx::create_dataset(file, name, array);
h5xx::write_dataset(file, name, array);
// --- create a slice object (aka hyperslab) to specify the location in the dataset to be overwritten
std::vector<int> offset; int offset_raw[2] = {4,4}; offset.assign(offset_raw, offset_raw + 2);
std::vector<int> count; int count_raw[2] = {2,2}; count.assign(count_raw, count_raw + 2);
h5xx::slice slice(offset, count);
}
}
void read_int_data(std::string const& filename)
{
h5xx::file file(filename, h5xx::file::in);
std::string name = "integer array";
// read and print the full dataset
{
array_2d_t array;
// --- read the complete dataset into array, the array is resized and overwritten internally
h5xx::read_dataset(file, name, array);
printf("original integer array read from file, negative number patch was written using a slice\n");
print_array(array);
printf("\n");
}
}
int main(int argc, char** argv)
{
std::string filename = argv[0];
filename.append(".h5");
// --- do a few demos/tests using integers
{
array_2d_t array(boost::extents[NJ][NI]);
{
const int nelem = NI*NJ;
int data[nelem];
for (int i = 0; i < nelem; i++)
data[i] = i;
array.assign(data, data + nelem);
}
write_int_data(filename, array);
read_int_data(filename);
}
return 0;
}
I'm using h5xx, a template-based C++ wrapper for the HDF5 library (link), and the Boost library.
The datasets are stored under the particles/lipids/box/positions path. The dataset named value holds the frames.
argv[0] is not what you want (arguments start at 1, 0 is the program name). Consider bounds checking as well:
std::vector<std::string> const args(argv, argv + argc);
std::string const filename = args.at(1) + ".h5";
the initialization can be done directly, without a temporary array (what is multi_array for, otherwise?)
for (size_t i = 0; i < array.num_elements(); i++)
array.data()[i] = i;
Or indeed, make it an algorithm:
std::iota(array.data(), array.data() + array.num_elements(), 0);
same with vectors:
std::vector<int> offset; int offset_raw[2] = {4,4}; offset.assign(offset_raw, offset_raw + 2);
std::vector<int> count; int count_raw[2] = {2,2}; count.assign(count_raw, count_raw + 2);
besides being a formatting mess, can simply be
std::vector offset{4,4}, count{2,2};
h5xx::slice slice(offset, count);
On To The Real Question
The code has no relevance to the file. At all. I created some debug/tracing code to dump the file contents:
void dump(h5xx::group const& g, std::string indent = "") {
auto dd = g.datasets();
auto gg = g.groups();
for (auto it = dd.begin(); it != dd.end(); ++it) {
std::cout << indent << " ds:" << it.get_name() << "\n";
}
for (auto it = gg.begin(); it != gg.end(); ++it) {
dump(*it, indent + "/" + it.get_name());
}
}
int main()
{
h5xx::file xaa("xaa.h5", h5xx::file::mode::in);
dump(xaa);
}
Prints
/particles/lipids/box/edges ds:box_size
/particles/lipids/box/edges ds:step
/particles/lipids/box/edges ds:time
/particles/lipids/box/edges ds:value
/particles/lipids/box/positions ds:step
/particles/lipids/box/positions ds:time
/particles/lipids/box/positions ds:value
Now we can drill down to the dataset. Let's see whether we can figure out the correct type. It certainly is NOT array_2d_t:
h5xx::dataset ds(xaa, "particles/lipids/box/positions/value");
array_2d_t a;
h5xx::datatype detect(a);
std::cout << "type: " << std::hex << ds.get_type() << std::dec << "\n";
std::cout << "detect: " << std::hex << detect.get_type_id() << std::dec << "\n";
Prints
type: 30000000000013b
detect: 30000000000000c
That's a type mismatch. I guess I'll have to learn to read that gibberish as well...
Let's add some diagnostics:
void diag_type(hid_t type)
{
std::cout << " Class " << ::H5Tget_class(type) << std::endl;
std::cout << " Size " << ::H5Tget_size(type) << std::endl;
std::cout << " Sign " << ::H5Tget_sign(type) << std::endl;
std::cout << " Order " << ::H5Tget_order(type) << std::endl;
std::cout << " Precision " << ::H5Tget_precision(type) << std::endl;
std::cout << " NDims " << ::H5Tget_array_ndims(type) << std::endl;
std::cout << " NMembers " << ::H5Tget_nmembers(type) << std::endl;
}
int main()
{
h5xx::file xaa("xaa.h5", h5xx::file::mode::in);
// dump(xaa);
{
h5xx::group g(xaa, "particles/lipids/box/positions");
h5xx::dataset ds(g, "value");
std::cout << "dataset: " << std::hex << ds.get_type() << std::dec << std::endl;
diag_type(ds.get_type());
}
{
array_2d_t a(boost::extents[NJ][NI]);
h5xx::datatype detect(a);
std::cout << "detect: " << std::hex << detect.get_type_id() << std::dec << std::endl;
diag_type(detect.get_type_id());
}
}
Prints
dataset: 30000000000013b
Class 1
Size 4
Sign -1
Order 0
Precision 32
NDims -1
NMembers -1
detect: 30000000000000c
Class 0
Size 4
Sign 1
Order 0
Precision 32
NDims -1
NMembers -1
At least we know that H5T_FLOAT (class 1) is required. Let's modify array_2d_t:
using array_2d_t = boost::multi_array<float, 2>;
array_2d_t a(boost::extents[11214][3]);
This at least makes the data appear similarly. Let's ... naively try to read:
h5xx::read_dataset(ds, a);
Oops, that predictably throws
terminate called after throwing an instance of 'h5xx::error'
what(): /home/sehe/Projects/stackoverflow/deps/h5xx/h5xx/dataset/boost_multi_array.hpp:176:read_dataset(): dataset "/particles/lipids/box/positions/value" and target array have mismatching dimensions
No worries, we can guess:
using array_3d_t = boost::multi_array<float, 3>;
array_3d_t a(boost::extents[10][11214][3]);
h5xx::read_dataset(ds, a);
At least this does work. Adapting the print function:
template <typename T> void print_array(T const& array) {
for (auto const& row : array) {
for (auto v : row) printf("%5f ", v);
printf("\n");
}
}
Now we can print the first frame:
h5xx::read_dataset(ds, a);
print_array(*a.begin()); // print the first frame
This prints:
80.480003 35.360001 4.250000
37.450001 3.920000 3.960000
18.530001 -9.690000 4.680000
55.389999 74.339996 4.600000
22.110001 68.709999 3.850000
-4.130000 24.040001 3.730000
40.160000 6.390000 4.730000
-5.400000 35.730000 4.850000
36.669998 22.450001 4.080000
-3.680000 -10.660000 4.180000
(...)
That checks out with h5ls -r -d xaa.h5/particles/lipids/box/positions/value:
particles/lipids/box/positions/value Dataset {75/Inf, 11214, 3}
Data:
(0,0,0) 80.48, 35.36, 4.25, 37.45, 3.92, 3.96, 18.53, -9.69, 4.68,
(0,3,0) 55.39, 74.34, 4.6, 22.11, 68.71, 3.85, -4.13, 24.04, 3.73,
(0,6,0) 40.16, 6.39, 4.73, -5.4, 35.73, 4.85, 36.67, 22.45, 4.08, -3.68,
(0,9,1) -10.66, 4.18, 35.95, 36.43, 5.15, 57.17, 3.88, 5.08, -23.64,
(0,12,1) 50.44, 4.32, 6.78, 8.24, 4.36, 21.34, 50.63, 5.21, 16.29,
(0,15,1) -1.34, 5.28, 22.26, 71.25, 5.4, 19.76, 10.38, 5.34, 78.62,
(0,18,1) 11.13, 5.69, 22.14, 59.7, 4.92, 15.65, 47.28, 5.22, 82.41,
(0,21,1) 2.09, 5.24, 16.87, -11.68, 5.35, 15.54, -0.63, 5.2, 81.25,
(...)
The Home Stretch: Adding The Slice
array_2d_t read_frame(int frame_no) {
h5xx::file xaa("xaa.h5", h5xx::file::mode::in);
h5xx::group g(xaa, "particles/lipids/box/positions");
h5xx::dataset ds(g, "value");
array_2d_t a(boost::extents[11214][3]);
std::vector offsets{frame_no, 0, 0}, counts{1, 11214, 3};
h5xx::slice slice(offsets, counts);
h5xx::read_dataset(ds, a, slice);
return a;
}
There you have it. Now we can print any frame:
print_array(read_frame(0));
Printing the same as before. Let's try the last frame:
print_array(read_frame(9));
Prints
79.040001 36.349998 3.990000
37.250000 3.470000 4.140000
18.600000 -9.270000 4.900000
55.669998 75.070000 5.370000
21.920000 67.709999 3.790000
-4.670000 24.770000 3.690000
40.000000 6.060000 5.240000
-5.340000 36.320000 5.410000
36.369999 22.490000 4.130000
-3.520000 -10.430000 4.280000
(...)
Checking again with h5ls -r -d xaa.h5/particles/lipids/box/positions/value |& grep '(9' | head confirms:
(9,0,0) 79.04, 36.35, 3.99, 37.25, 3.47, 4.14, 18.6, -9.27, 4.9, 55.67,
(9,3,1) 75.07, 5.37, 21.92, 67.71, 3.79, -4.67, 24.77, 3.69, 40, 6.06,
(9,6,2) 5.24, -5.34, 36.32, 5.41, 36.37, 22.49, 4.13, -3.52, -10.43,
(9,9,2) 4.28, 35.8, 36.43, 4.99, 56.6, 4.09, 5.04, -23.37, 49.42, 3.81,
(9,13,0) 6.31, 8.83, 4.56, 22.01, 50.38, 5.43, 16.3, -2.92, 5.4, 22.02,
(9,16,1) 70.09, 5.36, 20.23, 11.12, 5.66, 78.48, 11.34, 6.09, 20.26,
(9,19,1) 61.45, 5.35, 14.25, 48.32, 5.35, 79.95, 1.71, 5.38, 17.56,
(9,22,1) -11.61, 5.39, 15.64, -0.19, 5.06, 80.43, 71.77, 5.29, 75.54,
(9,25,1) 35.14, 5.26, 22.45, 56.86, 5.56, 16.47, 52.97, 6.16, 20.62,
(9,28,1) 65.12, 5.26, 19.68, 71.2, 5.52, 23.39, 49.84, 5.28, 22.7,
Full Listing
#include <boost/multi_array.hpp>
#include <h5xx/h5xx.hpp>
#include <iostream>
using array_2d_t = boost::multi_array<float, 2>;
template <typename T> void print_array(T const& array)
{
for (auto const& row : array) { for (auto v : row)
printf("%5f ", v);
printf("\n");
}
}
void dump(h5xx::group const& g, std::string indent = "") {
auto dd = g.datasets();
auto gg = g.groups();
for (auto it = dd.begin(); it != dd.end(); ++it) {
std::cout << indent << " ds:" << it.get_name() << std::endl;
}
for (auto it = gg.begin(); it != gg.end(); ++it) {
dump(*it, indent + "/" + it.get_name());
}
}
array_2d_t read_frame(int frame_no) {
h5xx::file xaa("xaa.h5", h5xx::file::mode::in);
h5xx::group g(xaa, "particles/lipids/box/positions");
h5xx::dataset ds(g, "value");
array_2d_t arr(boost::extents[11214][3]);
std::vector offsets{frame_no, 0, 0}, counts{1, 11214, 3};
h5xx::slice slice(offsets, counts);
h5xx::read_dataset(ds, arr, slice);
return arr;
}
int main()
{
print_array(read_frame(9));
}
Note: I have tagged this with both programming and windows networking tags, so please don't shout, I'm just trying to expose this to as many people as may be able to help!
I am trying to set the receive and send buffers for a small client and server I have written, so that when I perform a network capture, I see the window size I have set in the TCP handshake.
For the programmers, please consider the following very simple code for a client and server.
For the non-programmers, please skip past this section to my image.
Client:
#include <WinSock2.h>
#include <mstcpip.h>
#include <Ws2tcpip.h>
#include <thread>
#include <iostream>
using namespace std;
int OutputWindowSize(SOCKET s, unsigned int nType)
{
int buflen = 0;
int nSize = sizeof(buflen);
if (getsockopt(s, SOL_SOCKET, nType, (char *)&buflen, &nSize) == 0)
return buflen;
return -1;
}
bool SetWindowSizeVal(SOCKET s, unsigned int nSize)
{
if (setsockopt(s, SOL_SOCKET, SO_SNDBUF, (char *)&nSize, sizeof(nSize)) == 0)
if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, (char *)&nSize, sizeof(nSize)) == 0)
return true;
return false;
}
int main(int argc, char** argv)
{
if (argc != 3) { cout << "not enough args!\n"; return 0; }
const char* pszHost = argv[1];
const int nPort = atoi(argv[2]);
WSADATA wsaData;
DWORD Ret = 0;
if ((Ret = WSAStartup(MAKEWORD(2, 2), &wsaData)) != 0)
{
printf("WSAStartup() failed with error %d\n", Ret);
return 1;
}
struct sockaddr_in sockaddr_IPv4;
memset(&sockaddr_IPv4, 0, sizeof(struct sockaddr_in));
sockaddr_IPv4.sin_family = AF_INET;
sockaddr_IPv4.sin_port = htons(nPort);
if (!InetPtonA(AF_INET, pszHost, &sockaddr_IPv4.sin_addr)) { return 0; }
SOCKET clientSock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); // Create active socket: one which is passed to connect().
if (!SetWindowSizeVal(clientSock, 12345))
{
cout << "Failed to set window size " << endl;
return -1;
}
cout << "Set window size on client socket as: RECV" << OutputWindowSize(clientSock, SO_RCVBUF) <<
" SEND: " << OutputWindowSize(clientSock, SO_SNDBUF) << endl;
int nRet = connect(clientSock, (sockaddr*)&sockaddr_IPv4, sizeof(sockaddr_in));
if (nRet != 0) { return 0; }
char buf[100] = { 0 };
nRet = recv(clientSock, buf, 100, 0);
cout << "Received " << buf << " from the server!" << endl;
nRet = send(clientSock, "Hello from the client!\n", strlen("Hello from the client!\n"), 0);
closesocket(clientSock);
return 0;
}
Server:
#include <WinSock2.h>
#include <mstcpip.h>
#include <Ws2tcpip.h>
#include <iostream>
using namespace std;
int OutputWindowSize(SOCKET s, unsigned int nType)
{
int buflen = 0;
int nSize = sizeof(buflen);
if (getsockopt(s, SOL_SOCKET, nType, (char *)&buflen, &nSize) == 0)
return buflen;
return -1;
}
bool SetWindowSizeVal(SOCKET s, unsigned int nSize)
{
if (setsockopt(s, SOL_SOCKET, SO_SNDBUF, (char *)&nSize, sizeof(nSize)) == 0)
if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, (char *)&nSize, sizeof(nSize)) == 0)
return true;
return false;
}
int main()
{
WSADATA wsaData;
DWORD Ret = 0;
if ((Ret = WSAStartup(MAKEWORD(2, 2), &wsaData)) != 0)
{
printf("WSAStartup() failed with error %d\n", Ret);
return 1;
}
struct sockaddr_in sockaddr_IPv4;
memset(&sockaddr_IPv4, 0, sizeof(struct sockaddr_in));
sockaddr_IPv4.sin_family = AF_INET;
sockaddr_IPv4.sin_port = htons(19982);
int y = InetPton(AF_INET, L"127.0.0.1", &sockaddr_IPv4.sin_addr);
if (y != 1) return 0;
socklen_t addrlen = sizeof(sockaddr_IPv4);
SOCKET sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
if (!SetWindowSizeVal(sock, 12345))
{
cout << "Failed to set window size " << endl;
return -1;
}
cout << "Set window size on listen socket as: RECV" << OutputWindowSize(sock, SO_RCVBUF) <<
" SEND: " << OutputWindowSize(sock, SO_SNDBUF) << endl;
if (bind(sock, (sockaddr*)&sockaddr_IPv4, sizeof(sockaddr_IPv4)) != 0) { /* error */ }
if (listen(sock, SOMAXCONN) != 0) { return 0; }
while (1)
{
SOCKET sockAccept = accept(sock, (struct sockaddr *) &sockaddr_IPv4, &addrlen);
if (sockAccept == INVALID_SOCKET) return 0;
if (!SetWindowSizeVal(sockAccept, 12345))
{
cout << "Failed to set window size " << endl;
return -1;
}
cout << "Set window size as on accepted socket as: RECV" << OutputWindowSize(sockAccept, SO_RCVBUF) <<
" SEND: " << OutputWindowSize(sockAccept, SO_SNDBUF) << endl;
int nRet = send(sockAccept, "Hello from the server!\n", strlen("Hello from the server!\n"), 0);
if (!nRet) return 0;
char buf[100] = { 0 };
nRet = recv(sockAccept, buf, 100, 0);
cout << "Received " << buf << " from the client!" << endl;
if (nRet == 0) { cout << "client disonnected!" << endl; }
closesocket(sockAccept);
}
return 0;
}
The output from my program states that the window sizes have been set successfully:
Set window size on listen socket as: RECV12345 SEND: 12345
Set window size as on accepted socket as: RECV12345 SEND: 12345
for the server, and for the client:
Set window size on client socket as: RECV12345 SEND: 12345
However, when I capture the traffic using RawCap, I see that the client window size is set fine, but the server's window size is not what I set it to be; it is 8192:
Now, I have read this MS link and it says to add a registry value; I did this, adding the value 0x00001234, but it still made no difference.
The interesting thing is, the same code works fine on a Windows 10 machine, which makes me think it is Windows 7 specific. However, I'm not 100% sure on my code, there might be some errors in it.
Can anyone suggest how I can get Windows to honour my requested parameters please?
These are not 'window sizes'. They are send and receive buffer sizes.
There is no such thing as 'output window size'. There is a receive window and a congestion window, and the latter is not relevant to your question.
The send buffer size has exactly nothing to do with the receive window size, and the receive buffer size only determines the maximum receive window size.
The actual receive window size is adjusted dynamically by the protocol. It is the actual size that you are seeing in Wireshark.
The platform is entitled by the specification to adjust the supplied values for the send and receive buffers up or down, and the documentation advises you to get the corresponding values if you want to be sure what they really are.
There is no problem here to solve.
NB You don't have to set the receive window size on an accepted socket if you already set it on the listening socket. It is inherited.
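In other words, on the server it is enough to set the buffer sizes once on the listening socket before listen(). A quick check, reusing the OutputWindowSize() helper from the question:
SOCKET sockAccept = accept(sock, (sockaddr*)&sockaddr_IPv4, &addrlen);
// No SetWindowSizeVal() call is needed here: the accepted socket inherits
// the listening socket's buffer sizes. Read them back to see what the
// platform actually granted.
cout << "Accepted socket: RECV " << OutputWindowSize(sockAccept, SO_RCVBUF)
<< " SEND " << OutputWindowSize(sockAccept, SO_SNDBUF) << endl;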
I'm a beginner at OpenCL. I was trying to build a simple app which just adds 2 vectors to get the result. Here is my host code:
#define USE_PLATFORM 0
#define USE_DEVICE 2
#define DATA_SIZE 1024
#define USE_KERNEL_PATH "/Users/huangxin/Documents/August13Programming/FirstEGOpenCL/FirstEGOpenCL/kernel.cl"
using namespace std;
int main(int argc, const char * argv[]) {
int err;
cl_uint numPlatforms;
cl_uint numDevices;
cl_command_queue command;
size_t global;
//Query the number of platforms supported.
err = clGetPlatformIDs(0, NULL, &numPlatforms);
if (err != CL_SUCCESS || USE_PLATFORM >= numPlatforms)
{
printf("Error at: clGetPlatformIDs(querying platforms count failed):\n");
exit(-1);
}
//Get all platforms.
vector<cl_platform_id> platforms(numPlatforms);
err = clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms);
if (err != CL_SUCCESS)
{
printf("Error at: clGetPlatformIDs(getting all platforms failed):\n");
exit(-1);
}
//Query the number of devices supported by the platform spicified.
err = clGetDeviceIDs(platforms[USE_PLATFORM], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
if (err != CL_SUCCESS || USE_DEVICE >= numDevices)
{
printf("Error at: clGetDeviceIDs(querying devices count failed):\n");
exit(-1);
}
//Get all devices.
vector<cl_device_id> devices(numDevices);
err=clGetDeviceIDs(platforms[USE_PLATFORM], CL_DEVICE_TYPE_ALL, numDevices, &devices[0], &numDevices);
if (err != CL_SUCCESS)
{
printf("Error at: clGetDeviceIDs(getting all devices failed):\n");
exit(-1);
}
//Get device infomation.
char deviceInfo[1024];
//get device max work item dimensions.
size_t maxItemSize[3];
clGetDeviceInfo(devices[USE_DEVICE], CL_DEVICE_NAME, sizeof(deviceInfo), deviceInfo, NULL);
clGetDeviceInfo(devices[USE_DEVICE], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, maxItemSize, NULL);
cout << "Device selected: " << deviceInfo << endl;
cout << "Max item size: " << maxItemSize[0] << "," << maxItemSize[1] << ","<< maxItemSize[2] << endl;
//Set property with certain platform
cl_context_properties prop[] = {CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platforms[USE_PLATFORM]), 0};
//create context with certain property.
cl_context context = clCreateContextFromType(prop, CL_DEVICE_TYPE_ALL, NULL, NULL, &err);
if (err != CL_SUCCESS)
{
printf("Error at: clCreateContextFromType(get context failed):\n");
exit(-1);
}
//create command queue using selected device and context.
command = clCreateCommandQueue(context, devices[USE_DEVICE], 0, NULL);
//create program with specified kernel source.
const char *kernelSource = getKernelSource(USE_KERNEL_PATH);
cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, 0, &err);
if (err != CL_SUCCESS)
{
printf("Error at: clCreateProgramWithSource(get program failed):\n");
exit(-1);
}
//since OpenCL is a dynamic-compile architecture, we need to build the program.
err = clBuildProgram(program, 0, 0, 0, 0, 0);
if (err != CL_SUCCESS)
{
cout << err << endl;
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, devices[USE_DEVICE], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
//a kernel is OpenCL's abstraction of the code and arguments that run on a single compute item, the finest granularity of execution
//create the kernel function using the built program.
cl_kernel adder = clCreateKernel(program, "adder", &err);
if (err != CL_SUCCESS)
{
printf("Error at: clCreateKernel(get kernel function failed):\n");
exit(-1);
}
//create the vector of input random data.
vector<float> inA(DATA_SIZE), inB(DATA_SIZE);
for(int i = 0; i < DATA_SIZE; i++) {
inA[i] = (float)(random() % DATA_SIZE) / 1000;
inB[i] = (float)(random() % DATA_SIZE) / 1000;
}
//create the read-only device mem using specified context, that is to copy the host mem to the device mem.
cl_mem cl_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * DATA_SIZE, &inA[0], NULL);
cl_mem cl_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * DATA_SIZE, &inB[0], NULL);
//create the result mem.
cl_mem cl_res = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * DATA_SIZE, NULL, NULL);
//setting up the kernel memory arguments
clSetKernelArg(adder, 0, sizeof(cl_mem), &cl_a);
clSetKernelArg(adder, 1, sizeof(cl_mem), &cl_b);
clSetKernelArg(adder, 2, sizeof(cl_mem), &cl_res);
START_CHECK_RUNNING_TIME
//enqueue the kernel into the specified command queue (#TODO: come back later to check the remaining arguments)
global = DATA_SIZE;
err = clEnqueueNDRangeKernel(command, adder, 1, 0, &global, 0, 0, 0, 0);
if (err != CL_SUCCESS)
{
printf("Error at: clEnqueueNDRangeKernel(enqueue kernel failed):\n");
exit(-1);
}
printf("*****************FLAG***************");
//copy the results from the kernel into the host(CPU).
vector<float> res(DATA_SIZE);
err = clEnqueueReadBuffer(command, cl_res, CL_TRUE, 0, sizeof(float) * DATA_SIZE, &res[0], 0, 0, 0);
END_CHECK_RUNNING_TIME
//check the number of correct results.
int cnt = 0;
for (int i = 0; i < res.size(); i++) {
cnt += (res[i] == inA[i] + inB[i] ? 1 : 0);
}
cout << "Computed " << res.size() << " values\n";
cout << "Correct values:(" << cnt << "/" << res.size() << "),correct rate:" << (float)cnt / res.size() * 100 << "%" << endl;
gettimeofday(&sTime, NULL);
for (int i = 0; i < res.size(); i++) {
for (int j = 0; j < 10000; j++)
res[i] = inA[i] + inB[i];
}
gettimeofday(&eTime, NULL);timeuse = 1000000 * ( eTime.tv_sec - sTime.tv_sec ) + eTime.tv_usec -sTime.tv_usec; printf("Running time: %fs\n", (double)timeuse/(1000000));
//cleaning up the variables.
clReleaseKernel(adder);
clReleaseProgram(program);
clReleaseMemObject(cl_a);
clReleaseMemObject(cl_b);
clReleaseMemObject(cl_res);
clReleaseCommandQueue(command);
clReleaseContext(context);
return 0;
}
It's a bit long, but it's really doing simple stuff. This is my kernel code:
kernel void adder(global const float* a, global const float* b, global float* result)
{
size_t idx = get_global_id(0);
for (int i = 0; i < 10000; i++)
result[idx] = a[idx] +b[idx];
}
And I got the following result:
Device selected: GeForce GT 650M
-11
Error: Failed to build program executable!
No kernels or only kernel prototypes found.
I don't quite understand what "No kernels or only kernel prototypes found." means, and it's really strange that if I use the first device (CPU) or my second device (HD Graphics 4000), the same code runs perfectly.
I want to know what is wrong and why it happens.
I was running this code in Xcode on Mac OS X 10.10.
As the comments say, it is good practice to use:
__kernel void adder(__global const float* a, __global const float* b, __global float* result)
That way you clearly mark them as special CL qualifiers. Typically all CL kernels follow that rule, even though the spec allows both forms.
But your problem is probably due to running clBuildProgram() without any device in the devices list, and therefore not compiling anything at all!
In CL every device has a specific compiler (CPUs don't have the same compiler as GPUs, and sometimes not even the same instruction sets), so you should give the API the list of devices for which the kernels have to be compiled.
The proper way would be this:
err = clBuildProgram(program, 1, &devices[USE_DEVICE], "", 0, 0);
Note: I added "", because you will probably want to add some build parameters in the future; better to have it ready :)
Working on WinXP SP3.
Visual Studio 2005.
Trying to read memory of another process.
std::cout<<"Reading Process Memory\n";
const DWORD pid = 3476;
HANDLE handle = OpenProcess(PROCESS_VM_READ,FALSE,pid);
if(handle == NULL) {std::cout<<"Failed to open process\n";return 0;}
char* buffer1 = new char[256];
char* buffer2 = new char[256];
memset(buffer1,0,256*sizeof(char));
memset(buffer2,0,256*sizeof(char));
DWORD nbr = 0;
int address = 0x400000;
BOOL result = ReadProcessMemory(handle,&address,buffer1,32,&nbr);
if(result!=1) std::cout<<"Failed to read memory\n";
address = 0x400000+0x1000;
result = ReadProcessMemory(handle,&address,buffer2,32,&nbr);
if(result!=1) std::cout<<"Failed to read memory\n";
int i = 0;
while(i++<10)
{
if(buffer1[i]!=buffer2[i]) {std::cout<<"Buffers are different\n";break;}
}
delete[] buffer1;
delete[] buffer2;
CloseHandle(handle);
std::cin>>i;
return 0;
The problem is that both buffers are getting the same values. ReadProcessMemory returns 1 and the number of bytes read is the same as requested.
Your calls to ReadProcessMemory are incorrect. You should be using address directly, not &address. You may need to cast it to a const void *.
result = ReadProcessMemory(handle, reinterpret_cast<const void *>(address), buffer, 32, &nbr);
And you should probably declare address as a type large enough to hold a pointer, like INT_PTR or uintptr_t.
INT_PTR address = 0x400000;
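Putting both fixes together, the two reads from the question would look like this (a sketch, keeping the original buffer sizes):
INT_PTR address = 0x400000;
// Pass the remote address by value, cast to a pointer type; &address would
// read this process's own stack variable instead of the target process.
BOOL result = ReadProcessMemory(handle, reinterpret_cast<const void *>(address), buffer1, 32, &nbr);
if (!result) std::cout << "Failed to read memory\n";
address = 0x400000 + 0x1000;
result = ReadProcessMemory(handle, reinterpret_cast<const void *>(address), buffer2, 32, &nbr);
if (!result) std::cout << "Failed to read memory\n";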
The buffer can't be a char here; it has to be an int. Here's a working example:
#include <windows.h>
#include <iostream>
#include <string.h>
using namespace std;
int main()
{
int point1=0;
int i=0;
int d=0;
char* value[4];
SIZE_T stBytes = 0;
HWND hwnd;
HANDLE phandle;
DWORD pid;
hwnd = FindWindow(NULL, "calc"); // "calc" is the window title of the target process
if (hwnd != 0) {
GetWindowThreadProcessId(hwnd, &pid);
phandle = OpenProcess(PROCESS_ALL_ACCESS, 0, pid);
} else {
cout << "process is not executing";
cin.get();
return 0;
}
if (phandle != 0) {
for(i=0;i<4;i++) // 4 or wathever
{
cout << "The pointer is 0x1001000" << endl; //Print the pointer
ReadProcessMemory(phandle, (LPVOID)(0x1001000 + i), &point1, 4, &stBytes); //Get the content from 0x1001000+i and store it in point1
cout << "decimal content point1 " << point1 << " (DEC)" << endl; //Print the decimal content of point1
printf("%x \n",point1); // print hexadecimal content of point1
char *p=(char*)&point1; // treat point1 as a byte buffer
for(d=0;d<4;d++)
printf("%x",(unsigned int)(unsigned char) *(p+d)); // print the bytes in memory order (the value is stored little-endian) and compare with the debugger
}
ReadProcessMemory(phandle, (LPVOID)point1, &value, 6, &stBytes); //Get the value that is in the address pointed by the pointer
cout << "The value in the non-static address is " << (char*)value << endl << endl; //Print the value
cout << "Press ENTER to exit." << endl;
cin.get();
} else {
cout << "Couldn't get a handle";
cin.get();
// address 0x1001000 content hex 5278DA77
}
}
I wrote a CUDA function for Matlab to perform an LU factorization of a batch of matrices using cublasDgetrfBatched(). The toolkit documentation of this function is here.
It works fine for matrices up to size 32x32. But it fails with status code CUBLAS_STATUS_INVALID_VALUE for bigger matrices. Below is my source code (gpuBatchedLU.cu):
#include "mex.h"
#include "gpu/mxGPUArray.h"
/* Includes, cuda */
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <string>
#include <sstream>
static std::string cublasGetErrorString(cublasStatus_t error) {
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
}
return "<unknown>";
}
inline bool cublasAssert(cublasStatus_t code, const char* file, int line) {
if (code != CUBLAS_STATUS_SUCCESS) {
std::stringstream ss;
ss << "cublasAssert: " << cublasGetErrorString(code) << " in "
<< std::string(file) << ", line " << line << ".";
mexErrMsgTxt(ss.str().c_str());
}
return code == CUBLAS_STATUS_SUCCESS;
}
inline bool cudaAssert(cudaError_t code, const char* file, int line) {
if (code != cudaSuccess) {
std::stringstream ss;
ss << "cudaAssert: " << cudaGetErrorString(code) << " in "
<< std::string(file) << ", line " << line << ".";
mexErrMsgTxt(ss.str().c_str());
}
return code == cudaSuccess;
}
inline bool mexGPUAssert(int code, const char* file, int line) {
if (code != MX_GPU_SUCCESS) {
std::stringstream ss;
ss << "mexGPUAssert: could not initialize the Mathworks GPU API in "
<< std::string(file) << ", line " << line << ".";
mexErrMsgTxt(ss.str().c_str());
}
return code == MX_GPU_SUCCESS;
}
#define cublasErrchk(ans) { cublasAssert((ans), __FILE__, __LINE__); }
#define cudaErrchk(ans) { cudaAssert((ans), __FILE__, __LINE__); }
#define mxGPUErrchk(ans) { mexGPUAssert((ans), __FILE__, __LINE__); }
void mexFunction(int nlhs, mxArray *plhs[], /* Output variables */int nrhs,
const mxArray *prhs[]) /* Input variables */{
if (nrhs != 1) { /* end if not one function arguments */
mexErrMsgTxt("This function requires one input argument.");
return;
}
if (nlhs > 3) { /* take three outputs */
mexErrMsgTxt("This function takes a maximum of three output variables.");
return;
}
mxGPUErrchk(mxInitGPU());
const mxGPUArray* in1_gpu = mxGPUCreateFromMxArray(prhs[0]);
size_t ndims = mxGPUGetNumberOfDimensions(in1_gpu);
const size_t* dim = (const size_t*) mxGPUGetDimensions(in1_gpu);
if (ndims != 3) { /* end if input arguments are of different dimensions */
mexErrMsgTxt("The input argument must be a 3-dimensional array.");
return;
}
cublasHandle_t handle;
cublasErrchk(cublasCreate(&handle));
int no_matrices = dim[2];
int nrow = dim[0];
int ncol = dim[1];
int matrix_size = nrow * ncol;
size_t i;
std::stringstream ss;
ss << "dim[2] = " << dim[2] << "\nno_matrices = " << no_matrices << "\nnrow = " << nrow << "\nmatrix_size = " << nrow << " x " << ncol << " = " << matrix_size << std::endl;
mexPrintf(ss.str().c_str());
mxGPUArray* gpu_array_inout = mxGPUCopyFromMxArray(prhs[0]);
double* inout_storage = (double*) mxGPUGetData(gpu_array_inout);
size_t info_dimensions[1] = { no_matrices };
mxGPUArray* gpu_array_info = mxGPUCreateGPUArray(1, (mwSize*) info_dimensions, mxINT32_CLASS, mxREAL,
MX_GPU_INITIALIZE_VALUES);
int* out_info = (int*) mxGPUGetData(gpu_array_info);
mexPrintf("after defining gpu_array_info\n");
size_t pivot_dimensions[2] = { nrow, no_matrices };
mxGPUArray* gpu_array_pivot = mxGPUCreateGPUArray(2, (mwSize*) pivot_dimensions, mxINT32_CLASS, mxREAL,
MX_GPU_DO_NOT_INITIALIZE);
int* out_pivot = (int*) mxGPUGetData(gpu_array_pivot);
mexPrintf("after defining gpu_array_pivot\n");
double** inout_pointers_CPU = (double**) malloc(no_matrices * sizeof(double*));
for (i = 0; i < no_matrices; i++) {
inout_pointers_CPU[i] = (double*) ((char*) inout_storage + i * ((size_t) matrix_size) * sizeof(double));
}
double** inout_pointers_GPU;
cudaErrchk(cudaMalloc((void** )&inout_pointers_GPU, no_matrices * sizeof(double*)));
cudaErrchk(
cudaMemcpy(inout_pointers_GPU, inout_pointers_CPU, no_matrices * sizeof(double*), cudaMemcpyHostToDevice));
free(inout_pointers_CPU);
ss.str(""); // reset the buffer contents (clear() only resets the error flags)
ss << "check again before calling cublasDgetrfBatched:\nnrow = " << nrow << "\nno_matrices = " << no_matrices << std::endl;
mexPrintf(ss.str().c_str());
cublasErrchk(cublasDgetrfBatched(handle, nrow, inout_pointers_GPU, nrow, out_pivot, out_info, no_matrices));
cublasErrchk(cublasDestroy(handle));
cudaErrchk(cudaFree(inout_pointers_GPU));
if (mxIsGPUArray(prhs[0])) {
plhs[0] = mxGPUCreateMxArrayOnGPU(gpu_array_inout);
if (nlhs > 1) {
plhs[1] = mxGPUCreateMxArrayOnGPU(gpu_array_pivot);
if (nlhs > 2) {
plhs[2] = mxGPUCreateMxArrayOnGPU(gpu_array_info);
}
}
} else {
plhs[0] = mxGPUCreateMxArrayOnCPU(gpu_array_inout);
if (nlhs > 1) {
plhs[1] = mxGPUCreateMxArrayOnCPU(gpu_array_pivot);
if (nlhs > 2) {
plhs[2] = mxGPUCreateMxArrayOnCPU(gpu_array_info);
}
}
}
mxGPUDestroyGPUArray(gpu_array_inout);
mxGPUDestroyGPUArray(gpu_array_pivot);
mxGPUDestroyGPUArray(gpu_array_info);
mxFree((void*) dim);
return;
}
I compile as follows:
mex -L/usr/local/cuda/lib64 -lcudart -lcublas gpuBatchedLU.cu
And I call from MATLAB:
[a1,b1,c1]=gpuBatchedLU(randn(32,32,5)); %no problem
[a2,b2,c2]=gpuBatchedLU(randn(33,33,5)); %produces CUBLAS_STATUS_INVALID_VALUE
I use Matlab R2013b with the parallel toolbox, Cuda 5.5, and a NVS 5200M graphics chip.
Can anyone replicate this problem? I would appreciate any suggestions on how to solve this problem.
The problem seems to be that Matlab R2013b uses libcublas.so in version 5.0. The symlink is in /MATLAB/R2013b/bin/glnxa64/. Once I changed the link to point to the libcublas.so of my CUDA 5.5 installation, it worked fine.
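For reference, a hedged sketch of the relink; the exact library file names are assumptions, so check what is actually in that folder and keep a backup of the original:
cd /MATLAB/R2013b/bin/glnxa64
mv libcublas.so.5.0 libcublas.so.5.0.bak # keep the original
ln -s /usr/local/cuda-5.5/lib64/libcublas.so libcublas.so.5.0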