Here is a first simple program to compute Pi, extracted from the book "Using MPI" by W. Gropp, E. Lusk, and A. Skjellum, p. 25. It makes use of the broadcast (Bcast) and reduce (Reduce) collective operations.
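The method is numerical integration by the midpoint rule. Since \pi = \int_0^1 \frac{4}{1+x^2}\,dx, the program evaluates h \sum_{i=1}^{n} \frac{4}{1+x_i^2} with h = 1/n and x_i = h(i - 1/2). The loop stride of size deals the indices i = rank+1, rank+1+size, ... out to the processes in round-robin fashion, and the final Reduce with MPI.SUM collects the partial sums on rank 0.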
import p2pmpi.mpi.*;
public class Pi {
public static void main(String[] args) {
int rank, size, i;
double PI25DT = 3.141592653589793238462643;
double h, sum, x;
MPI.Init(args);
double startTime = MPI.Wtime();
size = MPI.COMM_WORLD.Size();
rank = MPI.COMM_WORLD.Rank();
int[] n = new int[1];
double[] mypi = new double[1];
double[] pi = new double[1];
if(rank == 0) {
n[0] = 1000000; // number of intervals
}
MPI.COMM_WORLD.Bcast(n, 0, 1, MPI.INT, 0);
h = 1.0 / (double)n[0];
sum = 0.0;
for(i = rank + 1; i <= n[0]; i+= size) {
x = h * ((double)i - 0.5);
sum += (4.0/(1.0 + x*x));
}
mypi[0] = h * sum;
MPI.COMM_WORLD.Reduce(mypi, 0, pi, 0, 1, MPI.DOUBLE, MPI.SUM, 0);
if(rank == 0) {
System.out.println("Pi is approximately " + pi[0]);
System.out.println("Error is " + (pi[0] - PI25DT));
double stopTime = MPI.Wtime();
System.out.println("Time usage = " + (stopTime - startTime) + " ms");
}
MPI.Finalize();
}
}
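Note the one-element arrays n, mypi, and pi above: Java has no pointers or out-parameters, so P2P-MPI passes every communication buffer as an array plus an explicit offset, even for scalar values. The same idiom recurs throughout the examples below.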
This example, a master/worker matrix multiplication, demonstrates the use of the point-to-point operations Send and Recv.
import p2pmpi.mpi.*;
public class MatrixPar {
public static void main(String[] args) {
int N = 1500;
int MASTER = 0;
int FROM_MASTER = 1;
int FROM_WORKER = 2;
int numtasks, /* number of tasks in partition */
taskid, /* a task identifier */
numworkers, /* number of worker tasks */
source, /* task id of message source */
dest, /* task id of message destination */
nbytes, /* number of bytes in message */
mtype, /* message type */
intsize, /* size of an integer in bytes */
dbsize, /* size of a double float in bytes */
averow, extra, /* used to determine rows sent to each worker */
i, j, k, /* misc */
count;
int[] a = new int[N*N]; /* matrix A to be multiplied */
int[] b = new int[N*N]; /* matrix B to be multiplied */
int[] c = new int[N*N]; /* result matrix C */
int[] offset = new int[1];
int[] rows = new int[1]; /* rows of matrix A sent to each worker */
long[] computeTime = new long[1];
long[] maxComputeTime = new long[1];
MPI.Init(args);
taskid = MPI.COMM_WORLD.Rank();
numtasks = MPI.COMM_WORLD.Size();
numworkers = numtasks - 1;
/* *************** Master Task ****************** */
if(taskid == MASTER) {
//Init matrix A,B
for(i = 0; i < N; i++) {
for(j = 0; j < N; j++) {
a[(i*N)+j] = 1;
b[(i*N)+j] = 2;
}
}
// Send matrix data to worker tasks
long start = System.currentTimeMillis();
averow = N / numworkers;
extra = N % numworkers;
offset[0] = 0;
mtype = FROM_MASTER;
long startsend = System.currentTimeMillis();
for(dest = 1; dest <= numworkers; dest++) {
if(dest <= extra) {
rows[0] = averow + 1;
} else {
rows[0] = averow;
}
MPI.COMM_WORLD.Send(offset, 0, 1, MPI.INT, dest, mtype);
MPI.COMM_WORLD.Send(rows, 0, 1, MPI.INT, dest, mtype);
count = rows[0] * N;
MPI.COMM_WORLD.Send(a, (offset[0]*N), count, MPI.INT, dest, mtype);
count = N*N;
MPI.COMM_WORLD.Send(b, 0, count, MPI.INT, dest, mtype);
offset[0] = offset[0] + rows[0];
}
long stopsend = System.currentTimeMillis();
// Wait for results from all worker tasks
computeTime[0] = 0;
mtype = FROM_WORKER;
for(i = 1; i <= numworkers; i++) {
source = i;
MPI.COMM_WORLD.Recv(computeTime, 0, 1, MPI.LONG, source, mtype);
System.out.println("Rank " + i + " uses " + computeTime[0] + " for computing");
MPI.COMM_WORLD.Recv(offset, 0, 1, MPI.INT, source, mtype);
MPI.COMM_WORLD.Recv(rows, 0, 1, MPI.INT, source, mtype);
count = rows[0] * N;
MPI.COMM_WORLD.Recv(c, offset[0]*N, count, MPI.INT, source, mtype);
}
long stop = System.currentTimeMillis();
//System.out.println("Result of matrix c[0] = " + c[0] + ", c[1000*1000] = " + c[100*100]);
System.out.println("Time Usage = " + (stop - start));
System.out.println("Sending Time Usage = " + (stopsend - startsend));
}
/* *************************** worker task *********************************** */
if(taskid > MASTER) {
mtype = FROM_MASTER;
source = MASTER;
MPI.COMM_WORLD.Recv(offset, 0, 1, MPI.INT, source, mtype);
MPI.COMM_WORLD.Recv(rows, 0, 1, MPI.INT, source, mtype);
count = rows[0] * N;
MPI.COMM_WORLD.Recv(a, 0, count, MPI.INT, source, mtype);
count = N * N;
MPI.COMM_WORLD.Recv(b, 0, count, MPI.INT, source, mtype);
long startCompute = System.currentTimeMillis();
for(i = 0; i < rows[0]; i++) {
for(k = 0; k < N; k++) {
c[(i*N)+k] = 0;
for(j = 0; j < N; j++) {
c[(i*N)+k] = c[(i*N)+k] + a[(i*N)+j] * b[(j*N)+k];
}
}
}
long stopCompute = System.currentTimeMillis();
computeTime[0] = (stopCompute - startCompute);
mtype = FROM_WORKER;
MPI.COMM_WORLD.Send(computeTime, 0, 1, MPI.LONG, MASTER, mtype);
MPI.COMM_WORLD.Send(offset, 0, 1, MPI.INT, MASTER, mtype);
MPI.COMM_WORLD.Send(rows, 0, 1, MPI.INT, MASTER, mtype);
MPI.COMM_WORLD.Send(c, 0, rows[0]*N, MPI.INT, MASTER, mtype);
}
MPI.COMM_WORLD.Reduce(computeTime, 0, maxComputeTime, 0, 1, MPI.LONG, MPI.MAX, 0);
if(taskid == 0) {
System.out.println("Max compute time/machine = " + maxComputeTime[0]);
}
MPI.Finalize();
}
}
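For reference, the point-to-point signatures used above are Send(buffer, offset, count, datatype, dest, tag) and the matching Recv(buffer, offset, count, datatype, source, tag). The following minimal round-trip sketch isolates that pattern; it assumes the same p2pmpi API as the listing above (the class name PingPong and the tag value 7 are our own choices):
import p2pmpi.mpi.*;
public class PingPong {
public static void main(String[] args) {
MPI.Init(args);
int rank = MPI.COMM_WORLD.Rank();
int[] buf = new int[1];
if(rank == 0) {
buf[0] = 42;
MPI.COMM_WORLD.Send(buf, 0, 1, MPI.INT, 1, 7); // buffer, offset, count, datatype, dest, tag
MPI.COMM_WORLD.Recv(buf, 0, 1, MPI.INT, 1, 7); // blocks until the echo arrives
System.out.println("Rank 0 received back " + buf[0]);
} else if(rank == 1) {
MPI.COMM_WORLD.Recv(buf, 0, 1, MPI.INT, 0, 7);
MPI.COMM_WORLD.Send(buf, 0, 1, MPI.INT, 0, 7); // echo the value back to rank 0
}
MPI.Finalize();
}
}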
This program is an implementation of the algorithm called LCR, which solves the problem of distributed leader election on a ring. It uses Send, Irecv, and Wait, with equivalents of MPI_ANY_TAG and MPI_ANY_SOURCE.
/**
* Election algorithm on a ring, "LCR" after the names of Le Lann, Chang, and Roberts.
*
* Communication is unidirectional. The size of the ring is not known.
* Only the leader performs output as leader.
* The algorithm relies on comparing the UIDs of the nodes:
* the process with the largest UID is elected leader.
* Each process sends its identifier around the ring; when a process receives
* an incoming identifier, it compares that identifier to its own.
* If the incoming identifier is greater than its own, it keeps passing the identifier;
* if it is less than its own, it discards it;
* and if it is equal to its own, the process declares itself leader.
*
* The leader declares itself leader after n rounds (n = size of the ring)
**/
import p2pmpi.mpi.*;
import java.util.Random;
public class ElectionLCR {
public static void main(String[] args) {
int rank, size, i, nbround;
int TAGUID=1;
int TAGSTATE=2;
MPI.Init(args);
double startTime;
Request req = null;
Status status = null;
size = MPI.COMM_WORLD.Size();
rank = MPI.COMM_WORLD.Rank();
int mynum;
int[] r = new int[1];
int[] s = new int[1];
int e;
for (e=0;e<200;e++) {
System.out.println("\nElection number : "+e);
System.out.println("=======================================================");
// generates a uid (assumed to be unique)
mynum = MPI.Rand(1000);
s[0] = mynum;
System.out.println("[rank " + rank + "] generates uid="+mynum+". Let us start.");
nbround=1;
startTime = MPI.Wtime();
MPI.COMM_WORLD.Send(s, 0, 1, MPI.INT, (rank+1)%size,TAGUID);
// loop receiving messages from the left neighbour on the ring,
while (true) {
req = MPI.COMM_WORLD.Irecv(r,0,1, MPI.INT, (rank == 0 ? size-1 : rank-1), MPI.ANY_TAG);
status = req.Wait();
// ----- Election phase -------
if (status.tag == TAGUID) {
if ( r[0] > s[0] ) {
MPI.COMM_WORLD.Send(r, 0, 1, MPI.INT, (rank+1)%size,TAGUID);
}
else {
if (r[0]==s[0]) {
System.out.println("[rank " + rank + "] After "+nbround+" rounds, I know I am the (unique) leader with "+s[0]);
// I am the unique leader: now initiate another round to broadcast a halting state
MPI.COMM_WORLD.Send(r, 0, 1, MPI.INT, (rank+1)%size,TAGSTATE);
// OK, the message will eventually come back. Consume the message and stop after this.
MPI.COMM_WORLD.Recv(r, 0, 1, MPI.INT, (rank == 0 ? size-1 : rank-1),TAGSTATE);
break;
}
// else ( r < s ) do nothing
}
}
// ---- Halting phase -------
if (status.tag == TAGSTATE) {
System.out.println("[rank " + rank + "] i just get informed "+r[0]+" is elected.");
MPI.COMM_WORLD.Send(r, 0, 1, MPI.INT, (rank+1)%size,TAGSTATE);
break;
}
nbround++;
}
double stopTime = MPI.Wtime();
System.out.println("Time usage = " + (stopTime - startTime) + " ms");
System.out.println("Number of iterations: "+nbround);
}
MPI.Finalize();
}
}
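The election loop above shows the nonblocking pattern: Irecv returns a Request, and calling Wait() on that request blocks until the message arrives and returns a Status whose tag field identifies the matched message. Here is that pattern in isolation, a minimal sketch assuming the same p2pmpi API; the status.source field is an assumption by analogy with status.tag, and MPI.ANY_SOURCE is taken from the description above (the election code itself only uses MPI.ANY_TAG):
import p2pmpi.mpi.*;
public class WildcardRecv {
public static void main(String[] args) {
MPI.Init(args);
int rank = MPI.COMM_WORLD.Rank();
int[] buf = new int[1];
if(rank == 0) {
// post a nonblocking receive matching any source and any tag
Request req = MPI.COMM_WORLD.Irecv(buf, 0, 1, MPI.INT, MPI.ANY_SOURCE, MPI.ANY_TAG);
Status status = req.Wait(); // blocks until a matching message arrives
// status.source is assumed here by analogy with the status.tag field used above
System.out.println("Got " + buf[0] + " (tag " + status.tag + ", source " + status.source + ")");
} else if(rank == 1) {
buf[0] = 99;
MPI.COMM_WORLD.Send(buf, 0, 1, MPI.INT, 0, 5);
}
MPI.Finalize();
}
}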
This program is a Java translation of the C implementation of IS (Integer Sort) from the NAS Parallel Benchmarks (NPB 3.2). It uses the Alltoall, Alltoallv, Allreduce, and Reduce collective operations.
import p2pmpi.mpi.*;
public class Is {
double[] start;
double[] elapsed;
static final char CLASS = 'A';
static final int NUM_PROCS = 4;
static final int MAX_PROCS = 512;
static final int MAX_ITERATIONS = 10;
static final int TEST_ARRAY_SIZE = 5;
int TOTAL_KEYS_LOG_2;
int MAX_KEY_LOG_2;
int NUM_BUCKETS_LOG_2;
int TOTAL_KEYS;
int MAX_KEY;
int NUM_BUCKETS;
int NUM_KEYS;
int SIZE_OF_BUFFERS;
int my_rank, comm_size;
// Some Global Info
//int[] key_buff_ptr_global;
int total_local_keys, total_lesser_keys;
int passed_verification;
// These are the three main arrays
// See SIZE_OF_BUFFERS def above
int[] key_array;
int[] key_buff1;
int[] key_buff2;
int[] bucket_size;
int[] bucket_size_totals;
int[] bucket_ptrs;
int[] process_bucket_distrib_ptr1;
int[] process_bucket_distrib_ptr2;
int[] send_count;
int[] recv_count;
int[] send_displ;
int[] recv_displ;
int min_key_val;
// Partial verif info
int[] test_index_array;
int[] test_rank_array;
int[] S_test_index_array = {48427, 17148, 23627, 62548, 4431};
int[] S_test_rank_array = {0, 18, 346, 64917, 65463};
int[] W_test_index_array = {357773,934767,875723,898999,404505};
int[] W_test_rank_array = {1249,11698,1039987,1043896,1048018};
int[] A_test_index_array = {2112377,662041,5336171,3642833,4250760};
int[] A_test_rank_array = {104,17523,123928,8288932,8388264};
int[] B_test_index_array = {41869,812306,5102857,18232239,26860214};
int[] B_test_rank_array = {33422937,10244,59149,33135281,99};
int[] C_test_index_array = {44172927,72999161,74326391,129606274,21736814};
int[] C_test_rank_array = {61147,882988,266290,133997595,133525895};
// Used in randlc: Java does not support static local variables,
// so these are kept as fields of the class.
int KS = 0;
double R23, R46, T23, T46;
Is() {
start = new double[64];
elapsed = new double[64];
switch(CLASS) {
case 'S' :
TOTAL_KEYS_LOG_2 = 16;
MAX_KEY_LOG_2 = 11;
NUM_BUCKETS_LOG_2 = 9;
break;
case 'W' :
TOTAL_KEYS_LOG_2 = 20;
MAX_KEY_LOG_2 = 16;
NUM_BUCKETS_LOG_2 = 10;
break;
case 'A' :
TOTAL_KEYS_LOG_2 = 23;
MAX_KEY_LOG_2 = 19;
NUM_BUCKETS_LOG_2 = 10;
break;
case 'B' :
TOTAL_KEYS_LOG_2 = 25;
MAX_KEY_LOG_2 = 21;
NUM_BUCKETS_LOG_2 = 10;
break;
case 'C' :
TOTAL_KEYS_LOG_2 = 27;
MAX_KEY_LOG_2 = 23;
NUM_BUCKETS_LOG_2 = 10;
break;
}
TOTAL_KEYS = (1 << TOTAL_KEYS_LOG_2);
MAX_KEY = (1 << MAX_KEY_LOG_2);
NUM_BUCKETS = (1 << NUM_BUCKETS_LOG_2);
NUM_KEYS = (TOTAL_KEYS/NUM_PROCS);
if(NUM_PROCS < 256) {
SIZE_OF_BUFFERS = 3*NUM_KEYS/2;
} else {
SIZE_OF_BUFFERS = 6*NUM_KEYS;
}
System.out.println("Size of buffers = " + SIZE_OF_BUFFERS);
key_array = new int[SIZE_OF_BUFFERS];
key_buff1 = new int[SIZE_OF_BUFFERS];
key_buff2 = new int[SIZE_OF_BUFFERS];
bucket_size = new int[NUM_BUCKETS+TEST_ARRAY_SIZE];
bucket_size_totals = new int[NUM_BUCKETS+TEST_ARRAY_SIZE];
bucket_ptrs = new int[NUM_BUCKETS];
process_bucket_distrib_ptr1 = new int[NUM_BUCKETS+TEST_ARRAY_SIZE];
process_bucket_distrib_ptr2 = new int[NUM_BUCKETS+TEST_ARRAY_SIZE];
send_count = new int[MAX_PROCS];
recv_count = new int[MAX_PROCS];
send_displ = new int[MAX_PROCS];
recv_displ = new int[MAX_PROCS];
test_index_array = new int[TEST_ARRAY_SIZE];
test_rank_array = new int[TEST_ARRAY_SIZE];
}
void c_print_results( String name,
char tclass,
int n1,
int n2,
int n3,
int niter,
int nprocs_compiled,
int nprocs_total,
double t,
double mops,
String optype,
int passed_verification,
String npbversion,
String compiletime,
String mpicc,
String clink,
String cmpi_lib,
String cmpi_inc,
String cflags,
String clinkflags)
{
String evalue = "1000";
System.out.println("\n\n" + name + " Benchmark Completed");
System.out.println(" Class = " + tclass);
if(n2 == 0 && n3 == 0)
System.out.println("Size = " + n1);
else
System.out.println("Size = ");
}
void timer_clear(int n) {
elapsed[n] = 0.0;
}
void timer_start(int n) {
start[n] = MPI.Wtime();
}
void timer_stop(int n) {
double t, now;
now = MPI.Wtime();
t = now - start[n];
elapsed[n] += t;
}
double timer_read(int n) {
return(elapsed[n]);
}
/**********************************************************************
***************** R A N D L C ******************
***************** ******************
***************** portable random number generator ******************
*********************************************************************/
double randlc(Object X, double A)
{
double T1, T2, T3, T4;
double A1;
double A2;
double X1;
double X2;
double Z;
int i, j;
double[] lX = (double[])X;
if(KS == 0) {
R23 = 1.0;
R46 = 1.0;
T23 = 1.0;
T46 = 1.0;
for(i = 1; i <= 23; i++) {
R23 = 0.50 * R23;
T23 = 2.0 * T23;
}
for(i = 1; i <= 46; i++) {
R46 = 0.50 * R46;
T46 = 2.0 * T46;
}
KS = 1;
}
/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N */
T1 = R23 * A;
j = (int)T1;
A1 = j;
A2 = A - T23 * A1;
/* Break X into two parts such that X = 2^23 * X1 + X2, compute
* Z = A* X2 + A2 * X1 (mod 2^23) and then
* X = 2^23 * Z + A2 * X2 (mod 2^46) */
T1 = R23 * lX[0];
j = (int)T1;
X1 = j;
X2 = lX[0] - T23 * X1;
T1 = A1 * X2 + A2 * X1;
j = (int)(R23 * T1);
T2 = j;
Z = T1 - T23 * T2;
T3 = T23 * Z + A2 * X2;
j = (int)(R46 * T3);
T4 = j;
lX[0] = T3 - T46 * T4;
return (R46 * lX[0]);
}
double find_my_seed(long kn, long np, long nn, double s, double a)
{
long i;
double t3, an;
long mq, nq, kk, ik;
double[] t1 = new double[1];
double[] t2 = new double[1];
nq = nn / np;
for(mq = 0; nq > 1; mq++, nq/=2);
t1[0] = a;
for(i = 1; i <= mq; i++) {
t2[0] = randlc(t1, t1[0]);
}
an = t1[0];
kk = kn;
t1[0] = s;
t2[0] = an;
for(i = 1; i <= 100; i++) {
ik = kk/2;
if(2 * ik != kk)
t3 = randlc(t1, t2[0]);
if(ik == 0)
break;
t3 = randlc(t2, t2[0]);
kk = ik;
}
return t1[0];
}
void create_seq(double seed, double a) {
double x;
int i, k;
double seeda[] = new double[1];
seeda[0] = seed;
k = MAX_KEY / 4;
for(i = 0; i < NUM_KEYS; i++) {
x = randlc(seeda, a);
x += randlc(seeda, a);
x += randlc(seeda, a);
x += randlc(seeda, a);
key_array[i] = (int)(k*x);
}
}
void full_verify() {
int i, j, k;
int [] ka = new int[1];
Request req = null;
for(i = 0; i < total_local_keys; i++) {
//CHANGE: key_array[--key_buff_ptr_global[key_buff2[i]] -
key_array[--(key_buff1[key_buff2[i] - min_key_val]) -
total_lesser_keys] = key_buff2[i];
}
if(my_rank > 0) {
req = MPI.COMM_WORLD.Irecv(ka, 0, 1,
MPI.INT, my_rank -1,
1000);
}
if(my_rank < comm_size-1) {
MPI.COMM_WORLD.Send(key_array, total_local_keys-1,
1, MPI.INT, my_rank+1, 1000);
}
if(my_rank > 0) {
//MPI.COMM_WORLD.Recv(ka, 0, 1, MPI.INT, my_rank-1, 1000);
req.Wait();
}
k = ka[0];
// Confirm that the neighbor's greatest key value
// is not greater than my least key value
j = 0;
if(my_rank > 0)
if(k > key_array[0])
j++;
// Confirm keys correctly sorted : count incorrectly sorted keys, if any
for(i = 1; i < total_local_keys; i++)
if(key_array[i - 1] > key_array[i])
j++;
if(j != 0) {
//System.out.println("Processor " + my_rank + ": Full_verify: number of key out of sort:" + j);
} else {
passed_verification++;
}
}
//********************************************************************
//**************** RANK **********************
//********************************************************************
void rank(int iteration) {
int i, j, k, m;
int shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
int key;
int bucket_sum_accumulator;
int local_bucket_sum_accumulator;
int max_key_val;
//CHANGE: int key_buff_ptr;
//long rankTimer = System.currentTimeMillis();
// Iteration alteration of keys
if(my_rank == 0) {
key_array[iteration] = iteration;
key_array[iteration + MAX_ITERATIONS] = MAX_KEY - iteration;
}
// Initialize
for(i = 0; i < NUM_BUCKETS+TEST_ARRAY_SIZE; i++) {
bucket_size[i] = 0;
bucket_size_totals[i] = 0;
process_bucket_distrib_ptr1[i] = 0;
process_bucket_distrib_ptr2[i] = 0;
}
// Determine where the partial verify test keys are, load into
// top of array bucket_size
for(i = 0; i < TEST_ARRAY_SIZE; i++) {
if((test_index_array[i]/NUM_KEYS) == my_rank) {
bucket_size[NUM_BUCKETS+i] =
key_array[test_index_array[i] % NUM_KEYS];
}
}
// Determine the number of keys in each bucket
for(i = 0; i < NUM_KEYS; i++)
bucket_size[key_array[i] >> shift]++;
// Accumulative bucket sizes are the bucket pointers
bucket_ptrs[0] = 0;
for(i = 1; i < NUM_BUCKETS; i++)
bucket_ptrs[i] = bucket_ptrs[i - 1] + bucket_size[i - 1];
// Sort into appropriate bucket
for(i = 0; i < NUM_KEYS; i++) {
key = key_array[i];
key_buff1[bucket_ptrs[key >> shift]++] = key;
}
timer_stop(2);
timer_start(3);
//rankTimer = System.currentTimeMillis();
MPI.COMM_WORLD.Allreduce(bucket_size, 0,
bucket_size_totals, 0,
NUM_BUCKETS+TEST_ARRAY_SIZE,
MPI.INT,
MPI.SUM);
timer_stop(3);
timer_start(2);
bucket_sum_accumulator = 0;
local_bucket_sum_accumulator = 0;
send_displ[0] = 0;
process_bucket_distrib_ptr1[0] = 0;
for(i = 0, j = 0; i < NUM_BUCKETS; i++) {
bucket_sum_accumulator += bucket_size_totals[i];
local_bucket_sum_accumulator += bucket_size[i];
if(bucket_sum_accumulator >= (j+1)*NUM_KEYS) {
send_count[j] = local_bucket_sum_accumulator;
if(j != 0) {
send_displ[j] = send_displ[j-1] + send_count[j-1];
process_bucket_distrib_ptr1[j] =
process_bucket_distrib_ptr2[j - 1] + 1;
}
process_bucket_distrib_ptr2[j++] = i;
local_bucket_sum_accumulator = 0;
}
}
timer_stop(2);
timer_start(3);
// This is the redistribution section: first find out how many keys
// each processor will send to every other processor:
MPI.COMM_WORLD.Alltoall(send_count, 0, 1, MPI.INT,
recv_count, 0, 1, MPI.INT);
// Determine the receive array displacements for the buckets
recv_displ[0] = 0;
for(i = 1; i < comm_size; i++)
recv_displ[i] = recv_displ[i - 1] + recv_count[i - 1];
MPI.COMM_WORLD.Alltoallv(key_buff1, 0, send_count, send_displ, MPI.INT,
key_buff2, 0, recv_count, recv_displ, MPI.INT);
timer_stop(3);
timer_start(2);
// The starting and ending bucket numbers on each processor are
// multiplied by the interval size of the buckets to obtain the
// smallest possible min and greatest possible max value of any
// key on each processor
min_key_val = process_bucket_distrib_ptr1[my_rank] << shift;
max_key_val = ((process_bucket_distrib_ptr2[my_rank] + 1) << shift)-1;
// Clear the work array
for(i = 0; i < max_key_val - min_key_val + 1; i++)
key_buff1[i] = 0;
// Determine the total number of keys on all other
// processors holding keys of lesser value
m = 0;
for(k=0; k < my_rank; k++)
for(i = process_bucket_distrib_ptr1[k];
i <= process_bucket_distrib_ptr2[k];
i++)
m += bucket_size_totals[i]; // m has total # of lesser keys
// Determine total number of keys on this processor
j = 0;
for( i = process_bucket_distrib_ptr1[my_rank];
i <= process_bucket_distrib_ptr2[my_rank];
i++) {
j += bucket_size_totals[i];
}
// Ranking of all keys occurs in this section :
// shift it backwards so no subtractions are necessary in loop
//CHANGE :key_buff_ptr = key_buff1 - min_key_val;
// In this section, the keys themselves are used as their
// own indexes to determine how many of each there are: their
// individual population
for(i = 0; i < j; i++)
//CHANGE: key_buff_ptr[key_buff2[i]]++;
key_buff1[key_buff2[i] - min_key_val]++;
// To obtain ranks of each key, successively add the individual key
// population, not forgetting to add m, the total of lesser keys,
// to the first key population
//CHANGE: key_buff_ptr[min_key_val] += m;
//SO :key_buff1[min_key_val - min_key_val] += m;
key_buff1[0] += m;
for(i = min_key_val; i < max_key_val; i++) {
//CHANGE:key_buff_ptr[i + 1] += key_buff_ptr[i];
key_buff1[i + 1 - min_key_val] += key_buff1[i - min_key_val];
}
// This is the partial verify test section
// Observe that test_rank_array vals are
// shifted differently for different cases
for(i = 0; i < TEST_ARRAY_SIZE; i++) {
k = bucket_size_totals[i + NUM_BUCKETS]; // keys were hidden here
if(min_key_val <= k && k <= max_key_val)
switch( CLASS ) {
case 'S' :
if( i <= 2) {
//CHANGE: if(key_buff_ptr[k - 1] != test_rank_array[i]+iteration) {
if(key_buff1[k - 1 - min_key_val] != test_rank_array[i]+iteration) {
System.out.println("Failed partial verification:" +
"iteration " + iteration + ", processor " +
my_rank + ", test key " + i);
} else
passed_verification++;
} else {
// CHANGE: if(key_buff_ptr[k - 1] != test_rank_array[i]-iteration) {
if(key_buff1[k - 1 - min_key_val] != test_rank_array[i]-iteration) {
System.out.println("Failed partial verification:" +
"iteration " + iteration + ", processor " +
my_rank + ", test key " + i);
} else
passed_verification++;
}
break;
case 'W' :
if ( i < 2) {
//CHANGE: if(key_buff_ptr[k - 1] !=
if(key_buff1[k - 1 - min_key_val] !=
test_rank_array[i]+(iteration-2)) {
System.out.println("Failed partial verification:" +
"iteration " + iteration + ", processor " +
my_rank + ", test key " + i);
} else
passed_verification++;
} else {
//CHANGE: if(key_buff_ptr[k - 1] != test_rank_array[i]-iteration) {
if(key_buff1[k - 1 - min_key_val] != test_rank_array[i]-iteration) {
System.out.println("Failed partial verification:" +
"iteration " + iteration + ", processor " +
my_rank + ", test key " + i);
} else
passed_verification++;
}
break;
case 'A' :
if( i <= 2) {
//CHANGE: if(key_buff_ptr[k - 1] != test_rank_array[i]+(iteration-1)) {
if(key_buff1[k - 1 - min_key_val] != test_rank_array[i]+(iteration-1)) {
System.out.println("Failed partial verification:" +
"iteration " + iteration + ", processor " +
my_rank + ", test key " + i);
} else
passed_verification++;
} else {
//CHANGE: if(key_buff_ptr[k - 1] != test_rank_array[i]-(iteration-1)) {
if(key_buff1[k - 1 - min_key_val] != test_rank_array[i]-(iteration-1)) {
System.out.println("Failed partial verification:" +
"iteration " + iteration + ", processor " +
my_rank + ", test key " + i);
} else
passed_verification++;
}
break;
case 'B' :
if( i == 1 || i == 2 || i == 4) {
//CHANGE: if(key_buff_ptr[k - 1] != test_rank_array[i]+iteration) {
if(key_buff1[k - 1 - min_key_val] != test_rank_array[i]+iteration) {
System.out.println("Failed partial verification:" +
"iteration " + iteration + ", processor " +
my_rank + ", test key " + i);
} else
passed_verification++;
} else {
//CHANGE: if(key_buff_ptr[k - 1] != test_rank_array[i]-iteration) {
if(key_buff1[k - 1 - min_key_val] != test_rank_array[i]-iteration) {
System.out.println("Failed partial verification:" +
"iteration " + iteration + ", processor " +
my_rank + ", test key " + i);
} else
passed_verification++;
}
break;
case 'C' :
if( i <= 2) {
//CHANGE: if(key_buff_ptr[k - 1] != test_rank_array[i]+iteration) {
if(key_buff1[k - 1 - min_key_val] != test_rank_array[i]+iteration) {
System.out.println("Failed partial verification:" +
"iteration " + iteration + ", processor " +
my_rank + ", test key " + i);
} else
passed_verification++;
} else {
//CHANGE: if(key_buff_ptr[k - 1] != test_rank_array[i]-iteration) {
if(key_buff1[k - 1 - min_key_val] != test_rank_array[i]-iteration) {
System.out.println("Failed partial verification:" +
"iteration " + iteration + ", processor " +
my_rank + ", test key " + i);
} else
passed_verification++;
}
break;
}
}
// Make copies of rank info for use by full_verify: these variables
// in rank are local; making them global slows down the code, probably
// since they cannot be made register by compiler
if(iteration == MAX_ITERATIONS) {
//CHANGE: key_buff_ptr_global = key_buff1[-min_val_key];
total_local_keys = j;
total_lesser_keys = m;
}
}
// **********************************************************
// ************ MAIN *************
// **********************************************************
void doTest(String[] args) {
int i, iteration;
int[] itemp= new int[1];
double[] timecounter = new double[1];
double[] maxtime = new double[1];
int[] pass_v_array = new int[1];
//////////////////// Display variable values ////////////////////////
/*System.out.println("Class = " + CLASS + ", NumProcs = " + NUM_PROCS);
System.out.println("Total Keys Log = " + TOTAL_KEYS_LOG_2);
System.out.println("Max Key Log = " + MAX_KEY_LOG_2);
System.out.println("Num buckets Log = " + NUM_BUCKETS_LOG_2);
System.out.println("Total Keys = " + TOTAL_KEYS);
System.out.println("Max Key = " + MAX_KEY);
System.out.println("Num buckets = " + NUM_BUCKETS);
System.out.println("Num keys = " + NUM_KEYS);*/
/////////////////////////////////////////////////////////////////////
double a = 1220703125.00;
double[] ai = new double[1];
ai[0] = a;
double t2 = randlc(ai, a);
MPI.Init(args);
comm_size = MPI.COMM_WORLD.Size();
my_rank = MPI.COMM_WORLD.Rank();
for(i = 0; i < TEST_ARRAY_SIZE; i++) {
switch( CLASS ) {
case 'S' :
test_index_array[i] = S_test_index_array[i];
test_rank_array[i] = S_test_rank_array[i];
break;
case 'A' :
test_index_array[i] = A_test_index_array[i];
test_rank_array[i] = A_test_rank_array[i];
break;
case 'W' :
test_index_array[i] = W_test_index_array[i];
test_rank_array[i] = W_test_rank_array[i];
break;
case 'B' :
test_index_array[i] = B_test_index_array[i];
test_rank_array[i] = B_test_rank_array[i];
break;
case 'C' :
test_index_array[i] = C_test_index_array[i];
test_rank_array[i] = C_test_rank_array[i];
break;
}
}
/* Check that actual and compiled number of processors agree */
if(comm_size != NUM_PROCS) {
if(my_rank == 0)
System.out.println("\nERROR: not enough processes");
MPI.Finalize();
System.exit(1);
}
/* Check to see whether total number of processes is within bounds. */
if(comm_size > MAX_PROCS) {
if(my_rank == 0)
System.out.println("\nERROR: number of processes exceeds maximum\n");
MPI.Finalize();
System.exit(1);
}
if(my_rank == 0) {
System.out.println("NAS Parallel Benchmarks 3.1 -- IS Benchmark");
System.out.println("Size : " + TOTAL_KEYS + " (class " + CLASS + ")");
System.out.println("Iteration: " + MAX_ITERATIONS);
System.out.println("Number of processes : " + comm_size);
}
/* Generate random number sequence and subsequent keys on all procs */
create_seq(find_my_seed(my_rank,
comm_size,
4*TOTAL_KEYS,
314159265.00,
1220703125.00),
1220703125.00);
//System.out.println("Starting... rank 1");
rank(1);
//System.out.println("Finishing... rank 1");
// Start verification counter
passed_verification = 0;
if(my_rank == 0 && CLASS != 'S') System.out.println("\n iteration");
//Initialize timer
timer_clear(0);
//Initialize separate communication, computation timing
for(i = 1; i <= 3; i++) timer_clear(i);
timer_start(0);
timer_start(1);
timer_start(2);
// This is the main iteration
for(iteration=1; iteration<= MAX_ITERATIONS; iteration++) {
if(my_rank == 0 && CLASS != 'S') System.out.println("\t" + iteration);
rank(iteration);
}
timer_stop(2);
timer_stop(1);
timer_stop(0);
timecounter[0] = timer_read(0);
MPI.COMM_WORLD.Reduce( timecounter, 0, maxtime, 0,
1, MPI.DOUBLE, MPI.MAX, 0);
// TIMING ENABLED
double[] tmin;
double[] tsum;
double[] tmax;
tmin = new double[1];
tsum = new double[1];
tmax = new double[1];
if(my_rank == 0) {
System.out.println("\ntimer 1/2/3 = total/computation/communication time");
System.out.println("\t\tmin\t\tavg\t\tmax");
}
for(i = 1; i <= 3; i++) {
timecounter[0] = timer_read(i);
MPI.COMM_WORLD.Reduce(timecounter, 0, tmin, 0, 1, MPI.DOUBLE, MPI.MIN, 0);
MPI.COMM_WORLD.Reduce(timecounter, 0, tsum, 0, 1, MPI.DOUBLE, MPI.SUM, 0);
MPI.COMM_WORLD.Reduce(timecounter, 0, tmax, 0, 1, MPI.DOUBLE, MPI.MAX, 0);
if(my_rank == 0) {
System.out.println("timer" + i + "\t\t" + (tmin[0]/(double)1000) +
"\t\t" + (tsum[0]/(double)(comm_size *1000)) +
"\t\t" + (tmax[0]/(double)1000));
}
}
// END TIMING ENABLED
full_verify();
itemp[0] = passed_verification;
//pass_v_array[0] = passed_verification;
MPI.COMM_WORLD.Reduce( itemp, 0, pass_v_array, 0,
1, MPI.INT, MPI.SUM, 0);
passed_verification = pass_v_array[0];
// The Final printout
if(my_rank == 0) {
if(passed_verification != 5*MAX_ITERATIONS + comm_size)
passed_verification = 0;
System.out.println("\n");
System.out.println("IS Benchmark Completed\n");
System.out.println("Class = " + CLASS);
System.out.println("Size = " + TOTAL_KEYS);
System.out.println("Iteration = " + MAX_ITERATIONS);
System.out.println("Time in seconds = " + (maxtime[0]/1000));
System.out.println("Total processes = " + comm_size);
double mopTotal = ((double)(MAX_ITERATIONS*TOTAL_KEYS))/(maxtime[0]/1000)/1000000.0;
System.out.println("Mop/s total = " + mopTotal);
System.out.println("Mop/s/process = " + (mopTotal/comm_size));
if(passed_verification != 0) {
System.out.println("Verification = SUCCESSFUL");
} else {
System.out.println("Verification = UNSUCCESSFUL");
}
}
MPI.Finalize();
}
public static void main(String[] args) {
Is isNPB = new Is();
isNPB.doTest(args);
}
}
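The redistribution step in rank() is the heart of the benchmark: Alltoall exchanges the per-destination counts, and Alltoallv then moves variable-sized blocks described by (counts, displacements) pairs. The following minimal sketch isolates that two-call pattern, assuming the same p2pmpi signatures as used above; here every rank sends exactly one int to each peer, so all counts are 1 and the displacements are simple prefix sums (the class name AlltoallvDemo is our own choice):
import p2pmpi.mpi.*;
public class AlltoallvDemo {
public static void main(String[] args) {
MPI.Init(args);
int rank = MPI.COMM_WORLD.Rank();
int size = MPI.COMM_WORLD.Size();
int[] sendbuf = new int[size];
int[] recvbuf = new int[size];
int[] sendcount = new int[size];
int[] recvcount = new int[size];
int[] sdispl = new int[size];
int[] rdispl = new int[size];
for(int i = 0; i < size; i++) {
sendbuf[i] = rank * 100 + i; // one int destined for each rank
sendcount[i] = 1;
}
// exchange the counts, as in rank() above
MPI.COMM_WORLD.Alltoall(sendcount, 0, 1, MPI.INT, recvcount, 0, 1, MPI.INT);
// displacements are prefix sums of the counts
for(int i = 1; i < size; i++) {
sdispl[i] = sdispl[i-1] + sendcount[i-1];
rdispl[i] = rdispl[i-1] + recvcount[i-1];
}
// move the variable-sized blocks; rank r receives i*100 + r from each rank i
MPI.COMM_WORLD.Alltoallv(sendbuf, 0, sendcount, sdispl, MPI.INT, recvbuf, 0, recvcount, rdispl, MPI.INT);
System.out.println("Rank " + rank + " received " + recvbuf[0] + " ... " + recvbuf[size-1]);
MPI.Finalize();
}
}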
