I am interested in the question why the algorithm written for OpenCL works correctly for small-dimensional matrices, but it does not work correctly with large-dimensional matrices.
Moreover, for the same input data, the inverse matrix is different each time on OpenCL.
The usual algorithm for the CPU calculates everything correctly, using the Gauss method.
The GPU algorithm for large matrices gives different incorrect answers each time.
I wondered if there might be a problem in synchronizing threads? However, I have added synchronization both at the host level and in the kernel itself, using barriers.
But all to no avail. I am attaching the source code for CPU and GPU (OpenCL), please help.
Code for CPU (the matrix has already been augmented to the expanded unit matrix on the right):
void doInverse() {
for (int i = 0; i < size; i++) getUpperTriangularMatrix(i);
if (isNullDeterminantMatrix()) printf("This matrix does not have an inverse matrix\n");
else {
for (int i = 0; i < size; i++) {
mainDiagonal(i);
processingTopRows(i);
}
}
}
void getUpperTriangularMatrix(int i) {
double elem = matrix[i][i];
int iConst = i;
i++;
for (i; i < size; i++) {
double elem2 = matrix[i][iConst];
for (int j = 0; j < size * 2; j++) matrix[i][j] -= matrix[iConst][j] * elem2 / elem;
}
}
void mainDiagonal(int i) {
double count = matrix[i][i];
for (int j = 0; j < size * 2; j++) matrix[i][j] /= count;
}
void processingTopRows(int i) {
int iConst = i;
i--;
for (i; i >= 0; i--) {
double elem = matrix[i][iConst];
for (int j = 0; j < size * 2; j++) matrix[i][j] -= elem * matrix[iConst][j];
}
}
bool isNullDeterminantMatrix() {
for (int i = 0; i < size; i++)
if (!matrix[i][i]) return true;
return false;
}
Code for GPU (the matrix has already been augmented to the expanded unit matrix on the right):
cl_event event;
size_t globalSize[2] = { size, size * 2 };
cl_event eventRead = NULL;
cl_kernel kernel = clCreateKernel(program, "getUpperTriangularMatrix", &clStatus);
//if (clStatus != CL_SUCCESS) error("Error: creating kernel\n", ERROR_CREATING_KERNEL);
clStatus = clSetKernelArg(kernel, 0, sizeof(matrix_clmem), (void*)&matrix_clmem);
//if (clStatus != CL_SUCCESS) error("Error: set arg 0", ERROR_SET_KERNEL_ARG);
clStatus = clSetKernelArg(kernel, 1, sizeof(int), (int*)&size);
//if (clStatus != CL_SUCCESS) error("Error: set arg 1", ERROR_SET_KERNEL_ARG);
clStatus = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalSize, NULL, 0, NULL, &event);
//if (clStatus != CL_SUCCESS) error("Error: enqueue\n", ERROR_ENQUEUE);
clWaitForEvents(1, &event);
clStatus = clEnqueueReadBuffer(commandQueue, matrix_clmem, CL_TRUE, 0, size * size * 2 * sizeof(cl_double), matrix, 0, 0, NULL);
//if (clStatus != CL_SUCCESS) error("Error: read buffer\n", ERROR_READ_BUFFER);
clWaitForEvents(1, &eventRead);
clStatus = clReleaseKernel(kernel);
clStatus = clFlush(commandQueue);
clStatus = clFinish(commandQueue);
if (isNullDeterminantMatrix()) printf("This matrix does not have an inverse matrix\n");
else {
cl_kernel kernel2 = clCreateKernel(program, "mainDiagonal_processingTopRows", &clStatus);
//if (clStatus != CL_SUCCESS) error("Error: create kernel\n", ERROR_CREATING_KERNEL);
clStatus = clSetKernelArg(kernel2, 0, sizeof(matrix_clmem), (void*)&matrix_clmem);
//if (clStatus != CL_SUCCESS) error("Error: set arg 0", ERROR_SET_KERNEL_ARG);
clStatus = clSetKernelArg(kernel2, 1, sizeof(int), (int*)&size);
//if (clStatus != CL_SUCCESS) error("Error: set arg 1", ERROR_SET_KERNEL_ARG);
clStatus = clEnqueueNDRangeKernel(commandQueue, kernel2, 2, NULL, globalSize, NULL, 0, NULL, &event);
//if (clStatus != CL_SUCCESS) error("Error: enqueue\n", ERROR_ENQUEUE);
clWaitForEvents(1, &event);
clStatus = clEnqueueReadBuffer(commandQueue, matrix_clmem, CL_TRUE, 0, size * size * 2 * sizeof(cl_double), matrix, 0, 0, NULL);
//if (clStatus != CL_SUCCESS) error("Error: read buffer\n", ERROR_READ_BUFFER);
clWaitForEvents(1, &eventRead);
clStatus = clReleaseKernel(kernel2);
}
clStatus = clFlush(commandQueue);
clStatus = clFinish(commandQueue);
Code for kernel (GPU):
#pragma OPENCL EXTENSION cl_khr_fp64: enable
__kernel void getUpperTriangularMatrix(__global double *matrix, int size) {
const int i = get_global_id(0);
const int j = get_global_id(1);
for (int ii = 0; ii < size; ii++) {
if (i + 1 < size && i >= ii) {
double elem = matrix[ii * size * 2 + ii];
double elem2 = matrix[(i + 1) * size * 2 + ii];
matrix[(i + 1) * size * 2 + j] -= matrix[ii * size * 2 + j] * elem2 / elem;
}
barrier(CLK_GLOBAL_MEM_FENCE);
}
}
__kernel void mainDiagonal_processingTopRows(__global double *matrix, int size) {
const int i = get_global_id(0);
const int j = get_global_id(1);
double count = matrix[i * size * 2 + i];
matrix[i * size * 2 + j] /= count;
for (int ii = 0; ii < size; ii++) {
if (i - 1 >= 0 && i <= ii) {
double elem = matrix[(i - 1) * size * 2 + ii];
matrix[(i - 1) * size * 2 + j] -= elem * matrix[ii * size * 2 + j];
}
barrier(CLK_GLOBAL_MEM_FENCE);
}
}
Related
I'm trying to create a IWICBitmap from an EXR file (error checks removed).
#pragma pack(push,1)
struct fl
{
float r, g, b, a;
};
#pragma pack(pop)
HRESULT Open(const char* f,IWICBitmap** d)
{
exr_context_initializer_t ctxtinit = EXR_DEFAULT_CONTEXT_INITIALIZER;
exr_context_t myfile = {};
exr_result_t rv = exr_start_read(&myfile, f, &ctxtinit);
int part_index = 0;
const exr_attr_chlist_t* chl = 0;
exr_get_channels(myfile, part_index, &chl);
int32_t ck = 0;
rv = exr_get_chunk_count(myfile, part_index, &ck);
int32_t sl = 0;
rv = exr_get_scanlines_per_chunk(myfile, part_index, &sl);
int y = 0;
int wi = 0;
int he = 0;
std::vector<fl> data; // put here the floats
exr_decode_pipeline_t dec = {};
for (int32_t cuk = 0; cuk < ck; cuk++)
{
exr_chunk_info_t ch = {};
exr_read_scanline_chunk_info(myfile, part_index, y, &ch);
wi = ch.width;
he += ch.height;
y += sl;
bool first = 0;
if (dec.decompress_fn == 0)
{
rv = exr_decoding_initialize(myfile, part_index, &ch, &dec);
rv = exr_decoding_choose_default_routines(myfile, part_index, &dec);
first = 1;
}
if (!first)
rv = exr_decoding_update(myfile, part_index,&ch,&dec);
rv = exr_decoding_run(myfile, part_index, &dec);
int NumPixels = (wi * ch.height);
auto BytesPerPixel = ch.unpacked_size / NumPixels;
if (true)
{
// RGB(A)
if (chl->entries[0].pixel_type == EXR_PIXEL_HALF)
{
if (BytesPerPixel == chl->num_channels * 2)
{
auto ds = data.size();
data.resize(ds + NumPixels);
auto p = data.data() + ds;
char* x = (char*)dec.unpacked_buffer;
for (int j = 0; j < NumPixels; j++)
{
uint16_t* u = (uint16_t*)x;
p->a = 1.0f;
for (int jH = 0; jH < chl->num_channels; jH++)
{
half ha(Imath_3_2::half::FromBits,*u);
ha.setBits(*u);
if (strcmp(chl->entries[jH].name.str, "R") == 0) p->r = ha.operator float();
if (strcmp(chl->entries[jH].name.str, "G") == 0) p->g = ha.operator float();
if (strcmp(chl->entries[jH].name.str, "B") == 0) p->b = ha.operator float();
if (strcmp(chl->entries[jH].name.str, "A") == 0) p->a = ha.operator float();
u++;
}
x += BytesPerPixel;
p++;
}
}
else
break;
}
if (chl->entries[0].pixel_type == EXR_PIXEL_FLOAT)
{
// code removed for simplicity, I guess the same issue happens here unless it's a problem of the half-float
}
}
}
rv = exr_decoding_destroy(myfile, &dec);
exr_finish(&myfile);
CComPtr<IWICImagingFactory2> wbfact = 0;
CoCreateInstance(CLSID_WICImagingFactory2, 0, CLSCTX_INPROC_SERVER,
__uuidof(IWICImagingFactory2), (void**)&wbfact);
return wbfact->CreateBitmapFromMemory(wi, he, GUID_WICPixelFormat128bppPRGBAFloat, wi * 16,(UINT)data.size()*16, (BYTE*)data.data(), d);
}
What am I doing wrong? The pixel number I'm reading is correct (in this image 800x800).
My result:
Photoshop:
Is there a problem with the half-float? I'm just using the OpenEXR's IMath implementation.
Here I attach my code that I use to Draw the Histogram of the Contrasted image and also to convert a gray image into Contrast Image. Here I used low pint as 122 and highest point as 244. In the output histogram it reduce the height of the histogram.
I cannot find the error in my code
#include "opencv2/opencv.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/core.hpp"
using namespace cv;
using namespace std;
int main(int argc, char* argv[]) {
Mat img = imread(argv[1], 1);
if (!img.data) {
cout << "Could not find the image!" << endl;
return -1;
}
int height = img.rows;
int width = img.cols;
int widthstep = img.step;
int ch = img.channels();
printf("Height : %d\n", height);
printf("Width : %d\n", width);
printf("Widthstep : %d\n", widthstep);
printf("No of channels : %d\n", ch);
Mat gray_image(height, width, CV_8UC1, Scalar(0));
cvtColor(img, gray_image, COLOR_BGR2GRAY);
Mat new_image = gray_image.clone();
int v;
int output{};
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
int v = (int)gray_image.at<uchar>(y, x);
if (v >= 0 && v <= 122) {
output = int((6 / 122) * v);
}
else if (v > 100 && v <= 244) {
output = int(((244) / (122)) * (v - 122) + 6);
}
else if (v > 244 && v <= 255) {
output = int(((5) / (11)) * (v - 244) + 250);
}
new_image.at<uchar>(y, x) = (uchar)output;
}
}
int histn[256];
for (int i = 0; i < 256; i++) {
histn[i] = 0;
}
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
histn[(int)new_image.at<uchar>(y, x)] = histn[(int)new_image.at<uchar>(y, x)] + 1;
}
}
for (int i = 0; i < 256; i++) {
cout << i << ":" << histn[i] << endl;
}
int hist_wn = 512;
int hist_hn = 400;
int bin_wn = cvRound((double)hist_wn / 256);
Mat new_histogramImage(hist_hn, hist_wn, CV_8UC1, Scalar(255));
int maxn = histn[0];
for (int i = 0; i < 256; i++) {
if (maxn < histn[i]) {
maxn = histn[i];
}
}
for (int i = 0; i < 256; i++) {
histn[i] = ((double)histn[i] / maxn) * new_histogramImage.rows;
}
for (int i = 0; i < 256; i++) {
line(new_histogramImage, Point(bin_wn * (i), hist_hn), Point(bin_wn * (i), hist_hn - histn[i]), Scalar(0), 1, 8, 0);
}
imwrite("Gray_Image.png", gray_image);
imwrite("newcontrast_Image.png", new_image);
imwrite("Histogram.png", new_histogramImage);
namedWindow("Image");
imshow("Image", img);
namedWindow("Gray_Image");
imshow("Gray_Image", gray_image);
namedWindow("newcontrast_Image");
imshow("newcontrast_Image", new_image);
namedWindow("New_Histogram");
imshow("New_Histogram", new_histogramImage);
namedWindow("Old_Histogram");
imshow("Old_Histogram", histImage);
waitKey(0);
return 0;
}
Here are the new and old histograms that I got as outputs
I found the solution for the question. Here I changed the lowest and highest point values as 100 and 240 and when using the values set those as decimals values.
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
int v = (int)gray_image.at<uchar>(y, x);
if (v >= 0 && v <= 100) {
output = int((5.0/ 100.0) * v);
}
else if (v > 100 && v <= 240) {
output = int(((245.0) / (140.0)) * (v - 100.0) + 5.0);
}
else if (v > 240 && v <= 255) {
output = int(((5.0) / (15.0)) * (v - 240.0) + 250.0);
}
new_image.at<uchar>(y, x) = (uchar)output;
}
}
I'm trying to use mnist dataset for neural networks but im getting a Access violation writing location 0x00000000
the code is
for (int i = 0; i < length; i++) {
innerarray = (int8_t*)malloc(width * height);
for (int j = 0; j < width * height; j++) {
int8_t value = 0;
innerarray[j] = value;
}
temparray[i] = innerarray;
}
for (int i = 0; i < length; i++) {
for (int j = 0; j < width * height; j++) {
int8_t grayscale;
rf.read((char*)&grayscale, 1);
temparray[i][j] = grayscale; //error happens here
}
}
variable values:
int length = 10000;
int width = 28;
int height = 28;
The weird thing is it only happen when i >= 2512. Also replacing grayscale with 0 doesn't work. I can hower set temparray[2512][0] to 0 before the last nested for loop.
Like this:
for (int i = 0; i < length; i++) {
innerarray = (int8_t*)malloc(width * height);
for (int j = 0; j < width * height; j++) {
int8_t value = 0;
innerarray[j] = value;
}
temparray[i] = innerarray;
}
temparray[2512][0] = 0; //works
for (int i = 0; i < length; i++) {
for (int j = 0; j < width * height; j++) {
int8_t grayscale;
rf.read((char*)&grayscale, 1);
temparray[i][j] = 0; //error still happens here
}
}
The full code is:
#include<iostream>
#include<fstream>
#include<cstdint>
#include<cstdlib>
#include<array>
using namespace std;
struct images {
int32_t height = 0;
int32_t width = 0;
int32_t magicnumber = 0;
int32_t numberofimages = 0;
int8_t** images[];
void setimages(int8_t** newimages) {
delete[] this->images;
int8_t** images = (int8_t**)malloc(numberofimages);
int8_t* innerarray;
for (int i = 0; i < numberofimages; i++) {
innerarray = (int8_t*)malloc(width * height);
images[i] = innerarray;
}
for (int i = 0; i < numberofimages; i++) {
for (int j = 0; j < width * height; j++) {
images[i][j] = newimages[i][j];
}
}
};
};
struct labels {
int32_t magicnumber = 0;
int32_t numberoflabels = 0;
int8_t labels[];
};
int32_t litleendiantobig(int32_t litle) {//reverse works as well
int32_t big = ((4278190080 & litle) >> 24) + ((255 & litle) << 24) + ((16711680 & litle) >> 8) + ((65280 & litle) << 8);
return big;
}
images loadimages(string filename, int32_t magicalnumber) {
ifstream rf(filename, ios::out | ios::binary);
if (!rf) {
cout << "Cannot open file! " << filename << endl;
exit(1);
}
int32_t magicnumberoffile;
rf.read((char*)&magicnumberoffile, 4);
magicnumberoffile = litleendiantobig(magicnumberoffile);
if (magicalnumber != magicnumberoffile) {
cout << "Wrong magic number!" << endl;
cout << "expected:" << magicalnumber << endl;
cout << "got:" << magicnumberoffile << endl;
exit(1);
}
images img;
int32_t length;
rf.read((char*)&length, 4);
length = litleendiantobig(length);
img.numberofimages = length;
int32_t width;
rf.read((char*)&width, 4);
width = litleendiantobig(width);
img.width = width;
int32_t height;
rf.read((char*)&height, 4);
height = litleendiantobig(height);
img.height = height;
int8_t** temparray = (int8_t**)malloc(length);
int8_t* innerarray;
for (int i = 0; i < length; i++) {
innerarray = (int8_t*)malloc(width * height);
for (int j = 0; j < width * height; j++) {
int8_t value = 0;
innerarray[j] = value;
}
temparray[i] = innerarray;
}
for (int i = 0; i < length; i++) {
for (int j = 0; j < width * height; j++) {
int8_t grayscale;
rf.read((char*)&grayscale, 1);
temparray[i][j] = grayscale; //error happens here
}
}
img.setimages(temparray);
rf.close();
return img;
}
int main() {
images testimages;
loadimages("t10k-images.bin", 2051);
cout << testimages.images;
return 0;
}
I don't now how to solve the problem and can't find it anywhere else. Thanks for helping me out.
Your using malloc has done you in.
int* array = (int*)malloc(width* height); // allocate width * height bytes.
array[i] = x; // Sets the [i] _integer_ of array to x.
// but you allocated space for BYTE size elemennts.
The correct way to allocate integers using malloc:
int* array = (int*)malloc(width* height * sizeof(int)); // allocate width * height ints
Either that or your original intent was to allocate 8 bit pixels. In that case, your pointers should be declared as unsigned char*.
In either case, when coding in C++, types are important, and using operator new to allocate your arrays would have saved you from these troubles.
I have a problem with 3 mpi_bcast and one mpi_scatter, my program don't work well ,mpi_scatter don't work and globalparcsr don't scatter between nodes. when i delete second and third mpi_bcast ,mpi_scatter work well. I want broadcast a and globalindividual and globalfitness and then scatter globalparcsr, part of my code as bellow:
int malloc2dint(int ***array, int n, int m) {
/* allocate the n*m contiguous items */
int *p = (int *)malloc(n*m * sizeof(int));
if (!p) return -1;
/* allocate the row pointers into the memory */
(*array) = (int **)malloc(n * sizeof(int*));
if (!(*array)) {
free(p);
return -1;
}
/* set up the pointers into the contiguous memory */
for (int i = 0; i<n; i++)
(*array)[i] = &(p[i*m]);
return 0;
}
int main(int argc, char *argv[]) {
int size, rank, divided_pop_size, sum = 0, root = 0, procgridsize, sum2 = 0,generation=0;
int **globalindividual, **localindividual;
int *globalfitness, *localfitness;
int *globalparcsr, *localparcsr;
int **recbuf;
int *sendcounts, *parsendcount; //specifying the number of elements to send to each processor
int *displs, *pardispls; //Entry i specifies the displacement
MPI_Status status;
int offset, rows;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
divided_pop_size = n_initial_pop / size;
if (rank == root)
{
malloc2dint(&globalindividual, n_initial_pop, num_vertices);
read_graph();
globalfitness = (int*)malloc(n_initial_pop * sizeof(int));
globalparcsr = (int*)malloc(n_initial_pop * sizeof(int));
globalindividual = initial_population(globalindividual, n_initial_pop);
for (int i = 0; i < n_initial_pop; i++) {
printf("\n");
for (int j = 0; j < num_vertices; j++)
printf("%d", globalindividual[i][j]);
}
}
for (int p = 0; p < size; p++) {
if (rank == p) {
malloc2dint(&localindividual,n_initial_pop + 2, num_vertices);
localindividual = initial_population(localindividual, divided_pop_size + 2);
}
}
MPI_Bcast(&a[0][0], 5000 * 5000, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
MPI_Bcast(&globalindividual[0][0], n_initial_pop*num_vertices, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&globalfitness, n_initial_pop, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Finalize();
return 0;
}
a is a 2d array and globalindividual is a 2d array with 12 rows and 8 columns and globalfitness is 1d array with size 12
please help me.
I'm trying hard all day to implement matrix multiply with help of MPI, all examples from the Internet didn't work for me (I don't know why, it compiles, run but not computing). Here is what I'm doing:
From bash:
mpirun -n 2 out/lb8
It reading matrix 2x4 (1 row per process) and starting to compute.
The problem is in SendRecv block (or generally in collecting data)
void Matrix_MPY(double **matrix_a, double **matrix_b, double ***matrix_c, int a_rows, int a_cols) {
int i, j;
int process_rank, process_count;
MPI_Comm_rank(MPI_COMM_WORLD, &process_rank);
MPI_Comm_size(MPI_COMM_WORLD, &process_count);
if (a_rows % process_count != 0) {
error_code = NOT_DEVIDED_BY_RANK_EXCEPTION;
return;
}
int rows_per_process = a_rows / process_count;
int current_row = rows_per_process * process_rank;
double **temp;
temp = (double **) malloc(sizeof(double *) * a_rows);
for (i = 0; i < a_rows; ++i){
temp[i] = (double *) malloc(sizeof(double) * a_rows);
}
for (i = current_row; i < current_row + rows_per_process; ++i) {
for (j = 0; j < a_rows; ++j)
{
int k;
for(k = 0; k < a_cols; ++k){
temp[i][j] += matrix_a[i][k] * matrix_b[k][j];
}
}
MPI_Sendrecv(temp[i], a_rows, MPI_DOUBLE, ROOT, TAG, temp[i], a_rows, MPI_DOUBLE, process_rank, TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
*matrix_c = temp;
}
This solution is worked for me
....
if (process_rank != ROOT)
MPI_Send(temp[i], a_rows, MPI_DOUBLE, ROOT, i, MPI_COMM_WORLD);
}
if (process_rank == ROOT) {
for (i = 1; i < process_count; ++i)
{
for (j = i * rows_per_process; j < i * rows_per_process + rows_per_process; ++j)
{
MPI_Recv(temp[j], a_rows, MPI_DOUBLE, i, j, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
}
}
}
*matrix_c = temp;