Grayscale Using MPICH2 MPI - image

Now, I'm learning about image processing using MPI (MPICH2)
I have a problem when connecting between the C + + code with MPI, particularly on the grayscale. here the code
int main(int argc, char* argv[])
{
int jumpros, idpros;
IplImage* image_input = cvLoadImage("test.jpg", CV_LOAD_IMAGE_UNCHANGED);
IplImage* image_output =
cvCreateImage(cvGetSize(image_input),IPL_DEPTH_8U,channels);
unsigned char *h_out = (unsigned char*)image_output->imageData;
unsigned char *h_in = (unsigned char*)image_input->imageData;
width = image_input->width;
height = image_input->height;
channels = image_input->nChannels;
widthStep = image_input->widthStep;
widthStepOutput = image_output->widthStep;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&jumpros);
MPI_Comm_rank(MPI_COMM_WORLD,&idpros);
for (int n = idpros; n <=n-1; n+=jumpros){
for(int i=0;i<height;i++){
for(int j=0;j<width;j++){
int index = h_in[i*widthStep + j*channels];
int gray = 0.3*(index)+0.6*(index+1)+0.1*(index+2);
h_out[i*widthStepOutput+j]=gray;
}
}
}
cvShowImage("Original", image_input);
cvShowImage("CPU", image_output);
cvReleaseImage(&image_input);
cvReleaseImage(&image_output);
waitKey(0);
MPI_Finalize;
return 0;
}
But when the program is run, the results of blank images(white), Unlike when using openCV before the resulting images are grayscale.
Please help me, whats wrong with my code??? :(
thanks

Related

Can I speed up type conversion using intrinsics?

I am working on an application which needs to convert data to float.
The data are unsigned char or unsigned short.
I am using both AVX2 and other SIMDs intrinsics in this code.
I wrote the conversion like this:
unsigned char -> float :
#ifdef __AVX2__
__m256i tmp_v =_mm256_lddqu_si256(reinterpret_cast<const __m256i*>(src+j));
v16_avx[0] = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(tmp_v,0x0));
v16_avx[1] = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(tmp_v,0x1));
v32_avx[0] = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(v16_avx[0],0x0));
v32_avx[1] = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(v16_avx[0],0x1));
v32_avx[2] = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(v16_avx[1],0x0));
v32_avx[3] = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(v16_avx[1],0x1));
for (int l=0; l<4; l++) {
__m256 vc1_ps = _mm256_cvtepi32_ps(_mm256_and_si256(v32_avx[l],m_lt_avx[l]));
__m256 vc2_ps = _mm256_cvtepi32_ps(_mm256_and_si256(v32_avx[l],m_ge_avx[l]));
/*
....
some processing there.
*/
}
#endif
#ifdef __SSE2__
#ifdef __SSE3__
__m128i tmp_v = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(src+j));
#else
__m128i tmp_v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src+j));
#endif
#ifdef __SSE4_1__
v16[0] = _mm_cvtepu8_epi16(tmp_v);
tmp_v = _mm_shuffle_epi8(tmp_v,mask8);
v16[1] = _mm_cvtepu8_epi16(tmp_v);
v32[0] = _mm_cvtepi16_epi32(v16[0]);
v16[0] = _mm_shuffle_epi32(v16[0],0x4E);
v32[1] = _mm_cvtepi16_epi32(v16[0]);
v32[2] = _mm_cvtepi16_epi32(v16[1]);
v16[1] = _mm_shuffle_epi32(v16[1],0x4E);
v32[3] = _mm_cvtepi16_epi32(v16[1]);
#else
__m128i tmp_v_l = _mm_slli_si128(tmp_v,8);
__m128i tmp_v_r = _mm_srli_si128(tmp_v,8);
v16[0] = _mm_unpacklo_epi8(tmp_v,tmp_v_l);
v16[1] = _mm_unpackhi_epi8(tmp_v,tmp_v_r);
tmp_v_l = _mm_srli_epi16(v16[0],8);
tmp_v_r = _mm_srai_epi16(v16[0],8);
v32[0] = _mm_unpacklo_epi16(v16[0],tmp_v_l);
v32[1] = _mm_unpackhi_epi16(v16[0],tmp_v_r);
v16[0] = _mm_unpacklo_epi8(tmp_v,tmp_v_l);
v16[1] = _mm_unpackhi_epi8(tmp_v,tmp_v_r);
tmp_v_l = _mm_srli_epi16(v16[1],8);
tmp_v_r = _mm_srai_epi16(v16[1],8);
v32[2] = _mm_unpacklo_epi16(v16[1],tmp_v_l);
v32[3] = _mm_unpackhi_epi16(v16[1],tmp_v_r);
#endif
for (int l=0; l<4; l++) {
__m128 vc1_ps = _mm_cvtepi32_ps(_mm_and_si128(v32[l],m_lt[l]));
__m128 vc2_ps = _mm_cvtepi32_ps(_mm_and_si128(v32[l],m_ge[l]));
/*
...
some processing there.
*/
}
#endif
unsigned short -> float
#ifdef __AVX2__
v32_avx[0] = _mm256_cvtepu16_epi32(_mm256_extractf128_si256(tmp_v,0x0));
v32_avx[1] = _mm256_cvtepu16_epi32(_mm256_extractf128_si256(tmp_v,0x1));
for(int l=0;l<2;l++) {
__m256 vc1_ps = _mm256_cvtepi32_ps(_mm256_and_si256(v32_avx[l],m_lt_avx[l]));
__m256 vc2_ps = _mm256_cvtepi32_ps(_mm256_and_si256(v32_avx[l],m_ge_avx[l]));
/*
...
some processing there.
*/
}
#endif
#ifdef __SSE2__
#ifdef __SSE3__
__m128i tmp_v = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(src+j));
#else
__m128i tmp_v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src+j));
#endif
#ifdef __SSE4_1__
v32[0] = _mm_cvtepu16_epi32(tmp_v);
tmp_v = _mm_shuffle_epi32(tmp_v,0x4E);
v32[1] = _mm_cvtepu16_epi32(tmp_v);
#else
__m128i tmp_v_l = _mm_slli_si128(tmp_v,8);
__m128i tmp_v_r = _mm_srli_si128(tmp_v,8);
v32[0] = _mm_unpacklo_epi16(tmp_v,tmp_v_l);
v32[1] = _mm_unpackhi_epi16(tmp_v,tmp_v_r);
#endif
for(int l=0;l<2;l++) {
__m128 vc1_ps = _mm_cvtepi32_ps(_mm_and_si128(v32[l],m_lt[l]));
__m128 vc2_ps = _mm_cvtepi32_ps(_mm_and_si128(v32[l],m_ge[l]));
/*
...
some processing there.
*/
}
#endif
The processing in the comments have nothing to do with the conversion step.
I would like to speed up those conversions.
I read in SSE: convert short integer to float and in Converting Int to Float/Float to Int using Bitwise that it's possible to do this using bitwise operations.
Are those approaches really any faster?
I experimented with the implementation in the first link; there was almost no change in processing time, it worked fine for signed short and also for unsigned short as long as the value is included between 0 and MAX_SHRT (32767 on my system):
#include <immintrin.h>
#include <iterator>
#include <iostream>
#include <chrono>
void convert_sse_intrinsic(const ushort *source,const int len, int *destination)
{
__m128i zero2 = _mm_setzero_si128();
for (int i = 0; i < len; i+=4)
{
__m128i value = _mm_unpacklo_epi16(_mm_set_epi64x(0,*((long long*)(source+i)) /**ps*/), zero2);
value = _mm_srai_epi32(_mm_slli_epi32(value, 16), 16);
_mm_storeu_si128(reinterpret_cast<__m128i*>(destination+i),value);
}
}
void convert_sse_intrinsic2(const ushort *source,const int len, int *destination)
{
for (int i = 0; i < len; i+=8)
{
__m128i value = _mm_loadu_si128(reinterpret_cast<const __m128i*>(source+i));
_mm_storeu_si128(reinterpret_cast<__m128i*>(destination+i),_mm_cvtepu16_epi32(value));
value = _mm_shuffle_epi32(value,0x4E);
_mm_storeu_si128(reinterpret_cast<__m128i*>(destination+i+4),_mm_cvtepu16_epi32(value));
}
}
int main(int argc, char *argv[])
{
ushort CV_DECL_ALIGNED(32) toto[16] =
{0,500,1000,5000,
10000,15000,20000,25000,
30000,35000,40000,45000,
50000,55000,60000,65000};
int CV_DECL_ALIGNED(32) tutu[16] = {0};
std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
convert_sse_intrinsic(toto,16,tutu);
std::chrono::steady_clock::time_point stop = std::chrono::steady_clock::now();
std::cout<<"processing time 1st method : "<<std::chrono::duration_cast<std::chrono::nanoseconds>(stop-start).count()<<" : ns"<<std::endl;
std::copy(tutu,tutu+16,std::ostream_iterator<int>(std::cout," "));
std::cout<<std::endl;
start = std::chrono::steady_clock::now();
convert_sse_intrinsic2(toto,16,tutu);
stop = std::chrono::steady_clock::now();
std::cout<<"processing time 2nd method : "<<std::chrono::duration_cast<std::chrono::nanoseconds>(stop-start).count()<<" : ns"<<std::endl;
std::copy(tutu,tutu+16,std::ostream_iterator<int>(std::cout," "));
std::cout<<std::endl;
return 0;
}
Thanks in advance for any help.
Well I think there is not really any faster way to convert an unsigned char or an unsigned short to float rather than the intrinsics already there.
I tried several other ways using bitwise operators, but none was significantly faster.
So I think that it's not interesting to let this topic linger any longer.

Inverting an image using MPI

I am trying to invert a PGM image using MPI. The grayscale (PGM) image should be loaded on the root processor and then be sent to each of the s^2 processors. Each processor will invert a block of the given image, and the inverted blocks will be gathered back on the root processor, which will assemble the blocks into the final image and write it to a PGM image. I ran the following code, but did not get any output. The image was read after running the code, but there was no indication of writing the resultant image. Could you please let me know what could be wrong with it?
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <time.h>
#include <string.h>
#include <math.h>
#include <memory.h>
#define max(x, y) ((x>y) ? (x):(y))
#define min(x, y) ((x<y) ? (x):(y))
int xdim;
int ydim;
int maxraw;
unsigned char *image;
void ReadPGM(FILE*);
void WritePGM(FILE*);
#define s 2
int main(int argc, char **argv) {
MPI_Init(&argc, &argv);
int p, rank;
MPI_Comm_size(MPI_COMM_WORLD, &p);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
const int NPROWS=s; /* number of rows in _decomposition_ */
const int NPCOLS=s; /* number of cols in _decomposition_ */
const int BLOCKROWS = xdim/NPROWS; /* number of rows in _block_ */
const int BLOCKCOLS = ydim/NPCOLS; /* number of cols in _block_ */
int i, j;
FILE *fp;
float BLimage[BLOCKROWS*BLOCKCOLS];
for (int ii=0; ii<BLOCKROWS*BLOCKCOLS; ii++)
BLimage[ii] = 0;
float BLfilteredMat[BLOCKROWS*BLOCKCOLS];
for (int ii=0; ii<BLOCKROWS*BLOCKCOLS; ii++)
BLfilteredMat[ii] = 0;
if (rank == 0) {
/* begin reading PGM.... */
ReadPGM(fp);
}
MPI_Datatype blocktype;
MPI_Datatype blocktype2;
MPI_Type_vector(BLOCKROWS, BLOCKCOLS, ydim, MPI_FLOAT, &blocktype2);
MPI_Type_create_resized( blocktype2, 0, sizeof(float), &blocktype);
MPI_Type_commit(&blocktype);
int disps[NPROWS*NPCOLS];
int counts[NPROWS*NPCOLS];
for (int ii=0; ii<NPROWS; ii++) {
for (int jj=0; jj<NPCOLS; jj++) {
disps[ii*NPCOLS+jj] = ii*ydim*BLOCKROWS+jj*BLOCKCOLS;
counts [ii*NPCOLS+jj] = 1;
}
}
MPI_Scatterv(image, counts, disps, blocktype, BLimage, BLOCKROWS*BLOCKCOLS, MPI_FLOAT, 0, MPI_COMM_WORLD);
//************** Invert the block **************//
for (int proc=0; proc<p; proc++) {
if (proc == rank) {
for (int j = 0; j < BLOCKCOLS; j++) {
for (int i = 0; i < BLOCKROWS; i++) {
BLfilteredMat[j*BLOCKROWS+i] = 255 - image[j*BLOCKROWS+i];
}
}
} // close if (proc == rank) {
MPI_Barrier(MPI_COMM_WORLD);
} // close for (int proc=0; proc<p; proc++) {
MPI_Gatherv(BLfilteredMat, BLOCKROWS*BLOCKCOLS,MPI_FLOAT, image, counts, disps,blocktype, 0, MPI_COMM_WORLD);
if (rank == 0) {
/* Begin writing PGM.... */
WritePGM(fp);
free(image);
}
MPI_Finalize();
return (1);
}
It is very likely MPI is not the right tool for the job. The reason for this is that your job is inherently bandwidth limited.
Think of it this way: You have a coloring book with images which you all want to color in.
Method 1: you take your time and color them in one by one.
Method 2: you copy each page to a new sheet of paper and mail it to a friend who then colors it in for you. He mails it back to you and in the end you glue all the pages you received from all of your friends together to make one colored-in book.
Note that method two involves copying the whole book, which is arguably the same amount of work needed to color in the whole book. So method two is less time-efficient without even considering the overhead of shoving the pages into an envelope, licking the stamp, going to the post office and waiting for the letter to be delivered.
If you look at your code, every transmitted byte is only touched once throughout the whole program in this line:
BLfilteredMat[j*BLOCKROWS+i] = 255 - image[j*BLOCKROWS+i];
The single processor is much faster at subtracting two integers than it is at sending an integer of the wire, therefore one must advise against using MPI for your particular problem.
My suggestion to solve your problem: Try to avoid unneccessary communication whenever possible. Do all processes have access to the file system on which the files are located? You could try reading them directly from the filesystem.

Sobel filter in cuda (cant show full image)

I have a classic problem about the output of sobel filter using CUDA.
this is a main class (main.cpp)
/*main class */
int main(int argc, char** argv)
{
IplImage* image_source = cvLoadImage("test.jpg",
CV_LOAD_IMAGE_GRAYSCALE);
IplImage* image_input = cvCreateImage(cvGetSize(image_source),
IPL_DEPTH_8U,image_source->nChannels);
IplImage* image_output = cvCreateImage(cvGetSize(image_source),
IPL_DEPTH_8U,image_source->nChannels);
/* Convert from IplImage tofloat */
cvConvert(image_source,image_input);
unsigned char *h_out = (unsigned char*)image_output->imageData;
unsigned char *h_in = (unsigned char*)image_input->imageData;
width = image_input->width;
height = image_input->height;
widthStep = image_input->widthStep;
sobel_parallel(h_in, h_out, width, height, widthStep);
cvShowImage( "CPU", image_output );
cvReleaseImage( &image_output );
waitKey(0);
}
And this is the CUDA file (kernel_gpu.cu)
__global__ void kernel ( unsigned char *d_in , unsigned char *d_out , int width ,
int height, int widthStep ) {
int col = blockIdx . x * blockDim . x + threadIdx . x ;
int row = blockIdx . y * blockDim . y + threadIdx . y ;
int dx [3][3] = { -1 , 0 , 1 ,
-2 , 0 , 2 ,
-1 , 0 , 1};
int dy [3][3] = {1 ,2 ,1 ,
0 ,0 ,0 ,
-1 , -2 , -1};
int s;
if( col < width && row < height)
{
int i = row;
int j = col;
// apply kernel in X direction
int sum_x=0;
for(int m=-1; m<=1; m++)
for(int n=-1; n<=1; n++)
{
s=d_in[(i+m)*widthStep+j+n]; // get the (i,j) pixel value
sum_x+=s*dx[m+1][n+1];
}
// apply kernel in Y direction
int sum_y=0;
for(int m=-1; m<=1; m++)
for(int n=-1; n<=1; n++)
{
s=d_in[(i+m)*widthStep+j+n]; // get the (i,j) pixel value
sum_y+=s*dy[m+1][n+1];
}
int sum=abs(sum_x)+abs(sum_y);
if (sum>255)
sum=255;
d_out[i*widthStep+j]=sum; // set the (i,j) pixel value
}
}
// Kernel Calling Function
extern "C" void sobel_parallel( unsigned char* h_in, unsigned char* h_out,
int rows, int cols, int widthStep){
unsigned char* d_in;
unsigned char* d_out;
cudaMalloc((void**) &d_in, rows*cols);
cudaMalloc((void**) &d_out, rows*cols);
cudaMemcpy(d_in, h_in, rows*cols*sizeof( unsigned char), cudaMemcpyHostToDevice);
dim3 block (16,16);
dim3 grid ((rows * cols) / 256.0);
kernel<<<grid,block>>>(d_in, d_out, rows, cols, widthStep);
cudaMemcpy(h_out, d_out, rows*cols*sizeof( unsigned char), cudaMemcpyDeviceToHost);
cudaFree(d_in);
cudaFree(d_out);
}
Error :
the result image does not appear in their entirety, only part of the image.
Why is the result(GPU) like this?? (I tried to make CPU computation using the same function and no problem).
You are creating 1 Dimensional grid, while using 2D indexing inside the kernel which will cover only the x direction and only the top 16 rows of the image will be filtered (because the height of the block is 16).
dim3 grid ((rows * cols) / 256.0); //This is incorrect in current case
Consider creating 2 dimensional grid, so that it spans all the rows of the image.
dim3 grid ((cols + 15)/16, (rows + 15)/16);
Check the width and widthStep variables to see if they are actually equal or not because in your sobel_parallel function you are implicitly assuming this (which might not be true since your data is aligned). If this is not true the code
cudaMalloc((void**) &d_in, rows*cols);
will actually allocate less memory than necessary and hence you will only process part of your image. It would be better to use
cudaMalloc((void**) &d_in, rows*widthStep);
And of course adjust the rest of your code as necessary.
You are also calling
void sobel_parallel( unsigned char* h_in, unsigned char* h_out,
int rows, int cols, int widthStep)
with
sobel_parallel(h_in, h_out, width, height, widthStep);
which exchanges rows with cols and this is again exchanged when you are calling your kernel. This will cause a problem when you use the above suggestion.

How to read a txt file in MPI by a single process? Why my approach does not work?

I new to MPI.
I am trying to read a text file by using standard c++ code as follows.
int main(int argc, char* argv[] ){
int np, pid, ierr;
ierr = MPI_Init(&argc, &argv);
ierr = MPI_Comm_size(MPI_COMM_WORLD, &np);
ierr = MPI_Comm_rank(MPI_COMM_WORLD, &pid);
const int imgWidth = 1000; // the width of the image (count in pixel)
const int imgHeight = 1000; // the height of the image
double* Y;
Y = (double *)malloc(imgHeight*imgWidth*sizeof(double));
if(pid == 0)
{
string input = "Im.txt";
readData(input.c_str(), Y);
}
MPI_Bcast(Y, imgHeight*imgWidth, MPI_DOUBLE, 0, MPI_COMM_WORLD);
free(Y);
MPI_Finalize();
return 1;
}
The readData function is defined as:
bool readData(const char *fileName, double* Y){
printf("Reading the data file!\n");
ifstream fin(fileName);
int i = 0;
while(fin>>Y[i])
{
i++;
};
cout<<"In total, "<<i<<" data are imported."<<endl;
//close the file
fin.close();
return 1;
}
The file "Im.txt" includes a bunch of numbers. However, when I run the program, there is no data imported. Can anyone give me a hint? I do not need to use multiply processes to read this file in parallel.
Finally, I find the problem. I am working under win7 with visual studio. Seems I have to indicate explicitly the path of my file. Even I put "Im.txt" to the same folder with the source code file, it does not work.

XCode 4.2 - Openssl build error

I have successfully created the OpenSSL library for iPhone Simulator. I have successfully imported all the headers and libs. However, I am having problems in building the project and XCode tells me that there is an incomplete definition of a struct X509_ALGOR. Here is the code:
- (NSData *)encodePBEWithMD5AndDESData:(NSData *)inData password:(NSString *)password direction:(int)direction
{
// Change salt and number of iterations for your project !!!
static const char gSalt[] =
{
(unsigned char)0xaa, (unsigned char)0xd1, (unsigned char)0x3c, (unsigned char)0x31,
(unsigned char)0x53, (unsigned char)0xa2, (unsigned char)0xee, (unsigned char)0x05
};
unsigned char *salt = (unsigned char *)gSalt;
int saltLen = strlen(gSalt);
int iterations = 15;
EVP_CIPHER_CTX cipherCtx;
unsigned char *mResults; // allocated storage of results
int mResultsLen = 0;
const char *cPassword = [password UTF8String];
unsigned char *mData = (unsigned char *)[inData bytes];
int mDataLen = [inData length];
SSLeay_add_all_algorithms();
X509_ALGOR *algorithm = PKCS5_pbe_set(NID_pbeWithMD5AndDES_CBC,
iterations, salt, saltLen);
memset(&cipherCtx, 0, sizeof(cipherCtx));
if (algorithm != NULL)
{
EVP_CIPHER_CTX_init(&(cipherCtx));
**if (EVP_PBE_CipherInit(algorithm->algorithm, cPassword, strlen(cPassword),
algorithm->parameter, &(cipherCtx), direction))**
{
EVP_CIPHER_CTX_set_padding(&cipherCtx, 1);
int blockSize = EVP_CIPHER_CTX_block_size(&cipherCtx);
int allocLen = mDataLen + blockSize + 1; // plus 1 for null terminator on decrypt
mResults = (unsigned char *)OPENSSL_malloc(allocLen);
unsigned char *in_bytes = mData;
int inLen = mDataLen;
unsigned char *out_bytes = mResults;
int outLen = 0;
The pointer to struct X509_ALGOR, which is 'algorithm' is found to be incompletely defined. I don't have any clue about this. Can anyone help me please?
if (EVP_PBE_CipherInit(algorithm->algorithm, cPassword, strlen(cPassword),
algorithm->parameter, &(cipherCtx), direction))
I am not sure if it is appropriate to post answer to my own question. However, I am doing this if someone has the same problem, he/she may get some help from this.
The problem with the above mentioned code was the linker flags. Once I set the "-ObjC -load_all" in "Other Linker Flags" under "Build Settings", the problem was gone.
Regards.

Resources