/*                                                                      */
/* CS 4331: Parallel Programming                        Fall, 2012      */
/*                                                                      */
/* This MPI program computes a vector inner product in parallel.	*/
/*                                                                      */
/* The computation is  vip = sum(i=0; i<length){b0[i]*b1[i]}		*/
/* Each processor computes its local inner product and then MPI_Reduce	*/
/* is called to "MPI_SUM" all the "vip"s to one "result" in PROC_0.	*/
/*                                                                      */
/* The "new" stuff that's used is:					*/
/*                                                                      */
/* #define  PROC_0  0		   processor 0				*/
/* #define  B0_TYPE 176		   message "types"			*/
/* #define  B1_TYPE 177							*/
/* MPI_Status stat;		MPI structure for return codes		*/
/*                                                                      */
/* MPI_Comm_size(MPI_COMM_WORLD, &numprocs);	how many processors?	*/
/* MPI_Comm_rank(MPI_COMM_WORLD, &myid);	which one am I?		*/
/*                                                                      */
/* MPI_Send(b0, MAX_LEN, MPI_DOUBLE, p, B0_TYPE, MPI_COMM_WORLD);	*/
/* MPI_Send(b1, MAX_LEN, MPI_DOUBLE, p, B1_TYPE, MPI_COMM_WORLD);	*/
/*                                                                      */
/* MPI_Recv(b0, MAX_LEN, MPI_DOUBLE, PROC_0, B0_TYPE, MPI_COMM_WORLD, &stat); */
/* MPI_Recv(b1, MAX_LEN, MPI_DOUBLE, PROC_0, B1_TYPE, MPI_COMM_WORLD, &stat); */
/*                                                                      */
/* MPI_Barrier(MPI_COMM_WORLD);						*/
/*                                                                      */
/* MPI_Reduce(&vip, &result, 1, MPI_DOUBLE, MPI_SUM, PROC_0, MPI_COMM_WORLD); */
/*                                                                      */

#include <stdio.h>
#include <stdlib.h>	/* for the random number generator	*/
#include "mpi.h"

#define  MAX_LEN 1 << 18	/* maximum vector length	*/
#define  PROC_0  0		/* processor 0			*/
#define  B0_TYPE 176		/* message "types"		*/
#define  B1_TYPE 177

int main(int argc, char *argv[])	/* ANSI prototype; K&R-style	*/
					/* definitions are removed in C23 */
{
    int numprocs,		/* number of MPI processes		*/
	p,			/* process index in distribution loop	*/
	myid,			/* this process's "rank"		*/
	length,			/* current vector length under test	*/
	i;

    /* "static" keeps these two 2^18-double arrays (~4 MiB total) out	*/
    /* of the stack frame, where they could overflow typical 8 MiB	*/
    /* (or smaller MPI-launcher) stack limits.  Behavior is unchanged:	*/
    /* every rank fully writes both arrays before reading them.		*/
    static double b0[MAX_LEN], b1[MAX_LEN];	/* operand vectors	*/

    double vip,			/* local partial inner product		*/
	   result;		/* reduced global inner product;	*/
				/* valid in PROC_0 only			*/

    double start_time, end_time;	/* "wallclock" times		*/

    char  proc_name[MPI_MAX_PROCESSOR_NAME];
    int   proc_name_len;

    MPI_Status stat;		/* MPI structure containing return	*/
				/* codes for message passing operations */

    MPI_Init(&argc, &argv);			/* initialize MPI	*/

    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);	/* how many processes?	*/
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);	/* which one am I?	*/
    MPI_Get_processor_name(proc_name, &proc_name_len);	/* what's my name? */

    if (myid == PROC_0)
	{
	printf("There are %d processes.\n", numprocs);
	printf("Hello from processor 0: %s\n", proc_name);

	/* generate and send a distinct vector pair to each other rank;	*/
	/* rank 0 keeps the last pair drawn from the drand48 stream so	*/
	/* every rank works on different data				*/

	srand48(0xFEEDFACE);

	for (p = 1; p < numprocs; ++p)
	    {
	    for (i = 0; i < MAX_LEN; ++i)
		{
		b0[i] = drand48();	/* drand48() already returns	*/
		b1[i] = drand48();	/* double in [0.0, 1.0)		*/
		}
	    MPI_Send(b0, MAX_LEN, MPI_DOUBLE, p, B0_TYPE, MPI_COMM_WORLD);
	    MPI_Send(b1, MAX_LEN, MPI_DOUBLE, p, B1_TYPE, MPI_COMM_WORLD);
	    }

	/* generate processor 0's own vectors */

	for (i = 0; i < MAX_LEN; ++i)
	    {
	    b0[i] = drand48();
	    b1[i] = drand48();
	    }
	}
    else	/* all other ranks receive their vectors from rank 0 */
	{
	printf("Hello from processor %s.  My rank is %d.\n", proc_name, myid);

	MPI_Recv(b0, MAX_LEN, MPI_DOUBLE, PROC_0, B0_TYPE, MPI_COMM_WORLD, &stat);
	MPI_Recv(b1, MAX_LEN, MPI_DOUBLE, PROC_0, B1_TYPE, MPI_COMM_WORLD, &stat);
	}

    /* measure computational speed for vectors of various lengths,	*/
    /* halving the per-rank length each pass down to 1024		*/

    for (length = MAX_LEN; length >= 1024; length /= 2)
	{
	/* synchronize the processes (roughly) on each pass */

	MPI_Barrier(MPI_COMM_WORLD);

	/* start the clock in processor 0 */

	if (myid == PROC_0)
	    start_time = MPI_Wtime();

	/* compute the local inner product over the first "length"	*/
	/* elements							*/

	vip = 0.0;
	for (i = 0; i < length; ++i)
	    vip += b0[i] * b1[i];

	/* sum the local inner products; result is in processor 0 only */

	MPI_Reduce(&vip, &result, 1, MPI_DOUBLE, MPI_SUM,
	    PROC_0, MPI_COMM_WORLD);

	/* stop the clock and print the time */

	if (myid == PROC_0)
	    {
	    end_time = MPI_Wtime();
	    printf("Length = %d\tElapsed time=%lf\t %f\n",
		length*numprocs, end_time - start_time, result);
	    }
	}

    MPI_Finalize();
    return 0;
}