execution time work sharing vs spmd - openmp

I'm writing two similar programs to compare the execution time in spmd vs. worsharing.
To my surprise I'm getting more execution time in work sharing while time in spmd is considerably less.
What did I do wrong to get that?
Here are my codes:
SPMD code:
#define N 1000
float A[N][N], B[N][N], C[N][N]; // declaring matrices of NxN size
int main() {
/* DECLARING VARIABLES */
int i, j, k; // indices for matrix multiplication
/* FILLING MATRICES WITH RANDOM NUMBERS */
srand ( time(NULL) );
for(i=0;i<N;i++) {
for(j=0;j<N;j++) {
A[i][j]= (rand()%10);
B[i][j]= (rand()%10);
}
}
/* MATRIX MULTIPLICATION */
omp_set_num_threads(4);
int id;
int m;
int nthrds;
#pragma omp parallel // num_threads(4)
{
id=omp_get_thread_num();
nthrds=omp_get_num_threads();
printf("number of threads %d",nthrds);
double wtime = omp_get_wtime();
for (i=id;i<N;i=i+nthrds) {
//dummy=i+nthrds;
for(j=id;j<N;j=j+nthrds) {
C[i][j]=0;
for(k=id;k<N;k=k+nthrds) {
C[i][j]+=A[i][k]*B[k][j];
}
// printf("%d ", C[i][j]);
}
//printf("\n");
}
wtime = omp_get_wtime() - wtime;
printf( "Time taken is %f\n", wtime );
getchar();
}
}
the work sharing code is :
.......
int main() {
/* DECLARING VARIABLES */
int i, j, k; // indices for matrix multiplication
/* FILLING MATRICES WITH RANDOM NUMBERS */
srand ( time(NULL) );
for(i=0;i<N;i++) {
for(j=0;j<N;j++) {
A[i][j]= (rand()%10);
B[i][j]= (rand()%10);
}
}
/* MATRIX MULTIPLICATION */
//printf("Max number of threads: %i \n",omp_get_max_threads());
double wtime = omp_get_wtime();
#pragma omp parallel private(j,k) num_threads(8)
{
#pragma omp for
for (i=0;i<N;i++) {
for(j=0;j<N;j++) {
C[i][j]=0;
for(k=0;k<N;k++) {
C[i][j]+=A[i][k]*B[k][j];
}
// printf("%d ", C[i][j]);
}
//printf("\n");
}
double etime = omp_get_wtime() - wtime;
printf( "Time taken is %f\n", etime );
getchar();
}
}

Related

There is a increase in execution time every time I increase the number of threads. Shouldn't parallel execution lead to speed-up?

Rows represents the number of elements which were sorted and time is in milliseconds:
I have set thread using export OMP_NUM_THREADS=n
There is a constant increasing in execution time irrespective of the number of elements I am taking. Where am I going wrong?
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "omp.h"
/*
OpenMP implementation example
Details of implementation/tutorial can be found here:
http://madhugnadig.com/articles/parallel-
processing/2017/02/25/parallel-computing-in-c-using-openMP.html
*/
void mergeSort( int a[], int i, int j);
void merge( int a[], int i1, int j1, int i2, int j2);
int main()
{ clock_t t;
t = clock();
int *a, num, i;
scanf("%d",&num);
a = ( int *)malloc(sizeof(int) * num);
for(i=0;i<num;i++)
scanf("%d",&a[i]);
mergeSort(a, 0, num-1);
printf("\nsorted array :\n");
for(i=0;i<num;i++)
printf("%d ",a[i]);
t = clock() - t;
double time_taken = ((double)t)/CLOCKS_PER_SEC; // in seconds
printf("\n\n\nYour sorting took %f seconds to execute \n", time_taken);
return 0;
}
void mergeSort( int a[], int i, int j)
{
int mid;
if(i<j)
{
mid=(i+j)/2;
#pragma omp parallel sections
{
#pragma omp section
{
mergeSort(a,i,mid); //left recursion
}
#pragma omp section
{
mergeSort(a,mid+1,j); //right recursion
}
}
merge(a,i,mid,mid+1,j); //merging of two sorted sub-arrays
}
}
void merge( int a[], int i1, int j1, int i2, int j2)
{
int temp[1001000]; //array used for merging
int i,j,k;
i=i1; //beginning of the first list
j=i2; //beginning of the second list
k=0;
while(i<=j1 && j<=j2) //while elements in both lists
{
if(a[i]<a[j])
temp[k++]=a[i++];
else
temp[k++]=a[j++];
}
while(i<=j1) //copy remaining elements of the first list
temp[k++]=a[i++];
while(j<=j2) //copy remaining elements of the second list
temp[k++]=a[j++];
//Transfer elements from temp[] back to a[]
for(i=i1,j=0;i<=j2;i++,j++)
a[i]=temp[j];
}
This is how I have run the code on my macbook:

generating random variables with openmp in c++

How can I generate in parallel (is it efficient? possible?) random variables with my linear congruential generator below:
double* uniform(long N)
{
long i,j;
long a=16807;
long long m=(((long long)1)<<31)-1;
long I[N];
double *U;
#pragma omp parallel for firstprivate(i)
for (j = 0; j < N;j++)
{
if (i==0)
{
int y= omp_get_thread_num(); // undefined ref error here
I[y];
i++;
}
else
{
I[j] = (a*I[j-1])%m;
}
}
#pragma omp parallel for
for (i=0; i<N; i++)
U[i] = (double)I[i]/(m+1.0);
return U;
}
My goal is to generate 2 variables to use them in another function (box-muller method):
double* gauss(long int N)
{
double *X, *Y, *U;
X = generator(N/2);
Y = generator(N/2);
#pragma omp parallel for
for (i=0;i<N/2;i++)
{
U[2*i]=sqrt(-2 * log(X[i]))*sin(Y[i]*2*3.14);
U[2*i+1]=sqrt(-2 * log(X[i]))*cos(Y[i]*2*3.14);
}
return U;
}
How want to know how can I get different seeds when generating uniform variables with the function uniform?

Optimizing N-queen with openmp

I am learning OPENMP and wrote the following code to solve nqueens problem.
//Full Code: https://github.com/Shafaet/Codes/blob/master/OPENMP/Parallel%20N- Queen%20problem.cpp
int n;
int call(int col,int rowmask,int dia1,int dia2)
{
if(col==n)
{
return 1;
}
int row,ans=0;
for(row=0;row<n;row++)
{
if(!(rowmask & (1<<row)) & !(dia1 & (1<<(row+col))) & !(dia2 & (1<<((row+n-1)-col))))
{
ans+=call(col+1,rowmask|1<<row,dia1|(1<<(row+col)), dia2|(1<<((row+n-1)-col)));
}
}
return ans;
}
double parallel()
{
double st=omp_get_wtime();
int ans=0;
int i;
int rowmask=0,dia1=0,dia2=0;
#pragma omp parallel for reduction(+:ans) shared(i,rowmask)
for(i=0;i<n;i++)
{
rowmask=0;
dia1=0,dia2=0;
int col=0,row=i;
ans+=call(1,rowmask|1<<row,dia1|(1<<(row+col)), dia2|(1<<((row+n-1)-col)));
}
printf("Found %d configuration for n=%d\n",ans,n);
double en=omp_get_wtime();
printf("Time taken using openmp %lf\n",en-st);
return en-st;
}
double serial()
{
double st=omp_get_wtime();
int ans=0;
int i;
int rowmask=0,dia1=0,dia2=0;
for(i=0;i<n;i++)
{
rowmask=0;
dia1=0,dia2=0;
int col=0,row=i;
ans+=call(1,rowmask|1<<row,dia1|(1<<(row+col)), dia2|(1<<((row+n-1)-col)));
}
printf("Found %d configuration for n=%d\n",ans,n);
double en=omp_get_wtime();
printf("Time taken without openmp %lf\n",en-st);
return en-st;
}
int main()
{
double average=0;
int count=0;
for(int i=2;i<=13;i++)
{
count++;
n=i;
double stime=serial();
double ptime=parallel();
printf("OpenMP is %lf times faster for n=%d\n",stime/ptime,n);
average+=stime/ptime;
puts("===============");
}
printf("On average OpenMP is %lf times faster\n",average/count);
return 0;
}
Parallel code is already faster than normal one but i wonder how can i optimize it more using openmp pragmas. I want to know what i should do for better performance and what i should not do.
Thanks in advance.
(Please dont suggest any optimizations which are non-related to parallel programming)
Your code seems to use classic backtracking N-Queens recursive algorithm, which is not the fastest possible for N-Queens solving, but (due to simplicity) is the most vivid one in terms of practicing with parallelism basics.
That's being said: this is very simple, thus you don't expect it to naturally demonstrate lots of advanced OpenMP means except basic "parallel for" and reduction.
But, as far as you're looking for learning parallelism and probably for more clearness and better learning curve, there is one more (out of many possible) implementation available, which uses the same algorithm but tends to be more readable and vivid from educational perspective:
void setQueen(int queens[], int row, int col) {
//check all previously placed rows for attacks
for(int i=0; i<row; i++) {
// vertical attacks
if (queens[i]==col) {
return;
}
// diagonal attacks
if (abs(queens[i]-col) == (row-i) ) {
return;
}
}
// column is ok, set the queen
queens[row]=col;
if(row==size-1) {
#pragma omp atomic
nrOfSolutions++; //Placed final queen, found a solution
}
else {
// try to fill next row
for(int i=0; i<size; i++) {
setQueen(queens, row+1, i);
}
}
}
//Function to find all solutions for nQueens problem on size x size chessboard.
void solve() {
#pragma omp parallel for
for(int i=0; i<size; i++) {
// try all positions in first row
int * queens = new int[size]; //array representing queens placed on a chess board. Index is row position, value is column.
setQueen(queens, 0, i);
delete[](queens);
}
}
This given code is one of Intel Advisor XE samples (for both C++ and Fortran); the parallelization aspects for given sample are discussed in very detailed manner in Chapter 10 of given Parallel Programming Book (in fact, given chapter just uses N-Queens to demonstrate how to use tools in order to parallelize serial code in general).
Given Advisor n-queens sample uses essentially the same algorithm as yours, but it replaces explicit reduction with combination of simple parallel for + atomic. This code is expected to be less efficient, but more "procedural-style" and more "educational", since it demonstrates "hidden" data race. In case you upload given samplecode, you will actually find 4 equialent N-Queens parallel implementatons using TBB, Cilk Plus and OpenMP (OMP is for C++ and Fortran).
I know I am a little late for the party, but you can use task queueing for further optimization.(about 7-10% faster results).No idea why. Here's the code,that i am using :
#include <iostream> // std::cout, cin, cerr ...
#include <iomanip> // modify std::out
#include <omp.h>
using namespace std;
int nrOfSolutions=0;
int size=0;
void print(int queens[]) {
cerr << "Solution " << nrOfSolutions << endl;
for(int row=0; row<size; row++) {
for(int col=0; col<size; col++) {
if(queens[row]==col) {
cout << "Q";
}
else {
cout << "-";
}
}
cout << endl;
}
}
void setQueen(int queens[], int row, int col, int id) {
for(int i=0; i<row; i++) {
// vertical attacks
if (queens[i]==col) {
return;
}
// diagonal attacks
if (abs(queens[i]-col) == (row-i) ) {
return;
}
}
// column is ok, set the queen
queens[row]=col;
if(row==size-1) {
// only one thread should print allowed to print at a time
{
// increasing the solution counter is not atomic
#pragma omp critical
nrOfSolutions++;
#ifdef _DEBUG
#pragma omp critical
print(queens);
#endif
}
}
else {
// try to fill next row
for(int i=0; i<size; i++) {
setQueen(queens, row+1, i, id);
}
}
}
void solve() {
int myid=0 ;
#pragma omp parallel
#pragma omp single
{
for(int i=0; i<size; i++) {
/*
#ifdef _OMP //(???)
myid = omp_get_thread_num();
#endif
#ifdef _DEBUG
cout << "ThreadNum: " << myid << endl ;
#endif
*/
// try all positions in first row
// create separate array for each recursion
// started here
#pragma omp task
setQueen(new int[size], 0, i, myid);
}
}
}
int main(int argc, char*argv[]) {
if(argc !=2) {
cerr << "Usage: nq-openmp-taskq boardSize.\n";
return 0;
}
size = atoi(argv[1]);
cout << "Starting OpenMP Task Queue solver for size " << size << "...\n";
double st=omp_get_wtime();
solve();
double en=omp_get_wtime();
printf("Time taken using openmp %lf\n",en-st);
cout << "Number of solutions: " << nrOfSolutions << endl;
return 0;
}

Piecemeal processing of a matrix - CUDA

OK, so lets say I have an ( N x N ) matrix that I would like to process. This matrix is quite large for my computer, and if I try to send it to the device all at once I get a 'out of memory error.'
So is there a way to send sections of the matrix to the device? One way I can see to do it is copy portions of the matrix on the host, and then send these manageable copied portions from the host to the device, and then put them back together at the end.
Here is something I have tried, but the cudaMemcpy in the for loop returns error code 11, 'invalid argument.'
int h_N = 10000;
size_t h_size_m = h_N*sizeof(float);
h_A = (float*)malloc(h_size_m*h_size_m);
int d_N = 2500;
size_t d_size_m = d_N*sizeof(float);
InitializeMatrices(h_N);
int i;
int iterations = (h_N*h_N)/(d_N*d_N);
for( i = 0; i < iterations; i++ )
{
float* h_array_ref = h_A+(i*d_N*d_N);
cudasafe( cudaMemcpy(d_A, h_array_ref, d_size_m*d_size_m, cudaMemcpyHostToDevice), "cudaMemcpy");
cudasafe( cudaFree(d_A), "cudaFree(d_A)" );
}
What I'm trying to accomplish with the above code is this: instead of send the entire matrix to the device, I simply send a pointer to a place within that matrix and reserve enough space on the device to do the work, and then with the next iteration of the loop move the pointer forward within the matrix, etc. etc.
Not only can you do this (assuming your problem is easily decomposed this way into sub-arrays), it can be a very useful thing to do for performance; once you get the basic approach you've described working, you can start using asynchronous memory copies and double-buffering to overlap some of the memory transfer time with the time spent computing what is already on-card.
But first one gets the simple thing working. Below is a 1d example (multiplying a vector by a scalar and adding another scalar) but using a linearized 2d array would be the same; the key part is
CHK_CUDA( cudaMalloc(&xd, batchsize*sizeof(float)) );
CHK_CUDA( cudaMalloc(&yd, batchsize*sizeof(float)) );
tick(&gputimer);
int nbatches = 0;
for (int nstart=0; nstart < n; nstart+=batchsize) {
int size=batchsize;
if ((nstart + batchsize) > n) size = n - nstart;
CHK_CUDA( cudaMemcpy(xd, &(x[nstart]), size*sizeof(float), cudaMemcpyHostToDevice) );
blocksize = (size+nblocks-1)/nblocks;
cuda_saxpb<<<nblocks, blocksize>>>(xd, a, b, yd, size);
CHK_CUDA( cudaMemcpy(&(ycuda[nstart]), yd, size*sizeof(float), cudaMemcpyDeviceToHost) );
nbatches++;
}
gputime = tock(&gputimer);
CHK_CUDA( cudaFree(xd) );
CHK_CUDA( cudaFree(yd) );
You allocate the buffers at the start, and then loop through until you're done, each time doing the copy, starting the kernel, and then copying back. You free at the end.
The full code is
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <cuda.h>
#include <sys/time.h>
#include <math.h>
#define CHK_CUDA(e) {if (e != cudaSuccess) {fprintf(stderr,"Error: %s\n", cudaGetErrorString(e)); exit(-1);}}
__global__ void cuda_saxpb(const float *xd, const float a, const float b,
float *yd, const int n) {
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i<n) {
yd[i] = a*xd[i]+b;
}
return;
}
void cpu_saxpb(const float *x, float a, float b, float *y, int n) {
int i;
for (i=0;i<n;i++) {
y[i] = a*x[i]+b;
}
return;
}
int get_options(int argc, char **argv, int *n, int *s, int *nb, float *a, float *b);
void tick(struct timeval *timer);
double tock(struct timeval *timer);
int main(int argc, char **argv) {
int n=1000;
int nblocks=10;
int batchsize=100;
float a = 5.;
float b = -1.;
int err;
float *x, *y, *ycuda;
float *xd, *yd;
double abserr;
int blocksize;
int i;
struct timeval cputimer;
struct timeval gputimer;
double cputime, gputime;
err = get_options(argc, argv, &n, &batchsize, &nblocks, &a, &b);
if (batchsize > n) {
fprintf(stderr, "Resetting batchsize to size of vector, %d\n", n);
batchsize = n;
}
if (err) return 0;
x = (float *)malloc(n*sizeof(float));
if (!x) return 1;
y = (float *)malloc(n*sizeof(float));
if (!y) {free(x); return 1;}
ycuda = (float *)malloc(n*sizeof(float));
if (!ycuda) {free(y); free(x); return 1;}
/* run CPU code */
tick(&cputimer);
cpu_saxpb(x, a, b, y, n);
cputime = tock(&cputimer);
/* run GPU code */
/* only have to allocate once */
CHK_CUDA( cudaMalloc(&xd, batchsize*sizeof(float)) );
CHK_CUDA( cudaMalloc(&yd, batchsize*sizeof(float)) );
tick(&gputimer);
int nbatches = 0;
for (int nstart=0; nstart < n; nstart+=batchsize) {
int size=batchsize;
if ((nstart + batchsize) > n) size = n - nstart;
CHK_CUDA( cudaMemcpy(xd, &(x[nstart]), size*sizeof(float), cudaMemcpyHostToDevice) );
blocksize = (size+nblocks-1)/nblocks;
cuda_saxpb<<<nblocks, blocksize>>>(xd, a, b, yd, size);
CHK_CUDA( cudaMemcpy(&(ycuda[nstart]), yd, size*sizeof(float), cudaMemcpyDeviceToHost) );
nbatches++;
}
gputime = tock(&gputimer);
CHK_CUDA( cudaFree(xd) );
CHK_CUDA( cudaFree(yd) );
abserr = 0.;
for (i=0;i<n;i++) {
abserr += fabs(ycuda[i] - y[i]);
}
printf("Y = a*X + b, problemsize = %d\n", n);
printf("CPU time = %lg millisec.\n", cputime*1000.);
printf("GPU time = %lg millisec (done with %d batches of %d).\n",
gputime*1000., nbatches, batchsize);
printf("CUDA and CPU results differ by %lf\n", abserr);
free(x);
free(y);
free(ycuda);
return 0;
}
int get_options(int argc, char **argv, int *n, int *s, int *nb, float *a, float *b) {
const struct option long_options[] = {
{"nvals" , required_argument, 0, 'n'},
{"nblocks" , required_argument, 0, 'B'},
{"batchsize" , required_argument, 0, 's'},
{"a", required_argument, 0, 'a'},
{"b", required_argument, 0, 'b'},
{"help", no_argument, 0, 'h'},
{0, 0, 0, 0}};
char c;
int option_index;
int tempint;
while (1) {
c = getopt_long(argc, argv, "n:B:a:b:s:h", long_options, &option_index);
if (c == -1) break;
switch(c) {
case 'n': tempint = atoi(optarg);
if (tempint < 1 || tempint > 500000) {
fprintf(stderr,"%s: Cannot use number of points %s;\n Using %d\n", argv[0], optarg, *n);
} else {
*n = tempint;
}
break;
case 's': tempint = atoi(optarg);
if (tempint < 1 || tempint > 50000) {
fprintf(stderr,"%s: Cannot use number of points %s;\n Using %d\n", argv[0], optarg, *s);
} else {
*s = tempint;
}
break;
case 'B': tempint = atoi(optarg);
if (tempint < 1 || tempint > 1000 || tempint > *n) {
fprintf(stderr,"%s: Cannot use number of blocks %s;\n Using %d\n", argv[0], optarg, *nb);
} else {
*nb = tempint;
}
break;
case 'a': *a = atof(optarg);
break;
case 'b': *b = atof(optarg);
break;
case 'h':
puts("Calculates y[i] = a*x[i] + b on the GPU.");
puts("Options: ");
puts(" --nvals=N (-n N): Set the number of values in y,x.");
puts(" --batchsize=N (-s N): Set the number of values to transfer at a time.");
puts(" --nblocks=N (-B N): Set the number of blocks used.");
puts(" --a=X (-a X): Set the parameter a.");
puts(" --b=X (-b X): Set the parameter b.");
puts(" --niters=N (-I X): Set number of iterations to calculate.");
puts("");
return +1;
}
}
return 0;
}
void tick(struct timeval *timer) {
gettimeofday(timer, NULL);
}
double tock(struct timeval *timer) {
struct timeval now;
gettimeofday(&now, NULL);
return (now.tv_usec-timer->tv_usec)/1.0e6 + (now.tv_sec - timer->tv_sec);
}
Running this one gets:
$ ./batched-saxpb --nvals=10240 --batchsize=10240 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.072 millisec.
GPU time = 0.117 millisec (done with 1 batches of 10240).
CUDA and CPU results differ by 0.000000
$ ./batched-saxpb --nvals=10240 --batchsize=5120 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.066 millisec.
GPU time = 0.133 millisec (done with 2 batches of 5120).
CUDA and CPU results differ by 0.000000
$ ./batched-saxpb --nvals=10240 --batchsize=2560 --nblocks=20
Y = a*X + b, problemsize = 10240
CPU time = 0.067 millisec.
GPU time = 0.167 millisec (done with 4 batches of 2560).
CUDA and CPU results differ by 0.000000
The GPU time goes up in this case (we're doing more memory copies) but the answers stay the same.
Edited: The original version of this code had an option for running multiple iterations of the kernel for timing purposes, but that's unnecessarily confusing in this context so it's removed.

Run time comparison of Bubble sort and quick sort

I wish to do a run time comparison of two sorting algorithms- Bubble sot and Randomized Quick sort. My code works fine but I guess I am using some primitive techniques. The 'clock' calculations happen in int, so even if I try to get the time in micro seconds, I get something like 20000.000 micro seconds.
The code:
Bubblesort:
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <time.h>
int bubblesort(int a[], int n);
int main()
{
int arr[9999], size, i, comparisons;
clock_t start;
clock_t end;
float function_time;
printf("\nBuBBleSort\nEnter number of inputs:");
scanf("%d", &size);
//printf("\nEnter the integers to be sorted\n");
for(i=0;i<size;i++)
arr[i]= rand()%10000;
start = clock();
comparisons= bubblesort(arr, size);
end = clock();
/* Get time in milliseconds */
function_time = (float)(end - start) /(CLOCKS_PER_SEC/1000000.0);
printf("Here is the output:\n");
for(i=0;i<size ;i++)
printf("%d\t",arr[i]);
printf("\nNumber of comparisons are %d\n", comparisons);
printf("\nTime for BuBBle sort is: %f micros\n ", function_time);
return 0;
}
int bubblesort(int a[], int n)
{
bool swapped = false;
int temp=0, counter=0;
for (int j = n-1; j>0; j--)
{
swapped = false;
for (int k = 0; k<j; k++)
{
counter++;
if (a[k+1] < a[k])
{
//swap (a,k,k+1)
temp= a[k];
a[k]= a[k+1];
a[k+1]= temp;
swapped = true;
}
}
if (!swapped)
break;
}
return counter;
}
Sample Output:
BuBBleSort
Enter number of inputs:2000
Time for BuBBle sort is: 20000.000000 micros
Quicksort:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int n, counter=0;
void swap(int *a, int *b)
{
int x;
x = *a;
*a = *b;
*b = x;
}
void quicksort(int s[], int l, int h)
{
int p; /* index of partition */
if ((h- l) > 0)
{
p= partition(s, l, h);
quicksort(s, l, p- 1);
quicksort(s, p+ 1, h);
}
}
int partition(int s[], int l, int h)
{
int i;
int p; /* pivot element index */
int firsthigh; /* divider position for pivot element */
p= l+ (rand())% (h- l+ 1);
swap(&s[p], &s[h]);
firsthigh = l;
for (i = l; i < h; i++)
if(s[i] < s[h])
{
swap(&s[i], &s[firsthigh]);
firsthigh++;
}
swap(&s[h], &s[firsthigh]);
return(firsthigh);
}
int main()
{
int arr[9999],i;
clock_t start;
clock_t end;
float function_time;
printf("\nRandomized Quick Sort");
printf("\nEnter the no. of elements…");
scanf("%d", &n);
//printf("\nEnter the elements one by one…");
for(i=0;i<n;i++)
arr[i]= rand()%10000;
start = clock();
quicksort(arr,0,n-1);
end = clock();
/* Get time in milliseconds */
function_time = (float)(end - start) / (CLOCKS_PER_SEC/1000.0);
printf("\nCounter is %d\n\n", counter);
printf("\nAfter sorting…\n");
for(i=0;i<n;i++)
printf("%d\t",arr[i]);
printf("\nTime for Randomized Quick Sort is: %f ms\n", function_time);
return 0;
}
Sample Output:
Randomized Quick Sort
Enter the no. of elements…9999
Time for Randomized Quick Sort is: 0.000000 ms
As you can see, Quicksort doesn't show any run time with my technique even with a much larger input size than Bubblesort.
Could someone help in refining it with that part of run time comparison?
p.s.: The code is liberally borrowed from online sources
Try the follwoing code.
printf("Clock() %ld", clock());
sleep(1);
printf("\t%ld\n", clock());
my result is...
Clock() 6582 6637
gettimeofday(2) is better than clock(3). Because gettiemofday(2) store time in a struct
struct timeval {
time_t tv_sec; /* seconds */
suseconds_t tv_usec; /* microseconds */
};
Record start time and stop time, then you can get elapsed time in microseconds by the formula
(stop.tv_sec - start.tv_sec) * 1000000. + stop.tv_usec - start.tv_usec

Resources