/*************************************************************************/
/*                                                                       */
/* Licensed Materials - Property of IBM                                  */
/*                                                                       */
/* (C) Copyright IBM Corp. 2010                                          */
/* All Rights Reserved                                                   */
/*                                                                       */
/* US Government Users Restricted Rights - Use, duplication or           */
/* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.     */
/*                                                                       */
/*************************************************************************/

/**************************************************************************
 This example application illustrates how to use OpenCL together with
 OpenMPI. It is assumed that each MPI rank in the cluster is the same and
 each rank works on part of the answer using the same OpenCL kernel.

 The problem to be solved is the standard 2D Laplace equation, an elliptic
 second-order partial differential equation in two variables. Dirichlet
 (fixed) boundary conditions are used on a unit square "plate". The boundary
 conditions used are:
    u(x,0) = sin(pi * x)
    u(x,1) = sin(pi * x) * pow(e, -pi)
    u(0,y) = 0
    u(1,y) = 0

 Although any boundary conditions could be specified the advantage of these
 conditions is that they have a known analytical solution as follows:
    u(x,y) = sin(pi * x) * exp(-pi * y)

 For the purposes of demonstrating OpenCL, this application uses a Jacobi
 iteration method which stops after the error for every node is less than
 a given tolerance. The Jacobi method uses a simple 5 point finite-difference
 stencil where the new node is the average of the 4 neighboring nodes i.e.
    unew(i,j) = 0.25 * ( u(i-1,j) + u(i+1,j) + u(i,j-1) + u(i,j+1) )

 For the purposes of demonstrating the use of MPI, the number of compute
 nodes can be specified in both the x and y dimensions which allows for
 horizontal strips, vertical strips, and rectangles.

 A bitmap representation of the final matrix is output in PPM format. You
 can use netpbm to convert the file to a different format or view the output.

 **************************************************************************/


/**************************************************************************
 Include Files
 **************************************************************************/

#include <unistd.h>
#include <libgen.h>
#include <getopt.h>
#include <limits.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/timeb.h>
#include <math.h>

/* OpenCL */
#include <CL/opencl.h>

#include "clu.h"

/* MPI */
#include <mpi.h>


/**************************************************************************
 Defines and Macros
 **************************************************************************/

/* pi as a double literal (9 significant digits) */
#define PI                   3.14159265

/* default tolerance for Jacobi iteration (float literal) */
#define DEFAULT_TOLERANCE    (0.000001f)

/* initial guess */
#define INITIAL_GUESS        0.5

/* minimum number of nodes; get_arguments() rejects smaller -x/-y values */
#define NODE_SIZE_MIN        8

/* default number of nodes (same for all dimensions) */
#define DEFAULT_NODES        8

/* default number of MPI ranks (same for all dimensions) */
#define DEFAULT_RANKS        1

/* default size of OpenCL workblock (same for all dimensions) */
#define DEFAULT_BLOCK        8

/* maximum number of iterations (reported by usage() as the default) */
#define MAX_ITERATIONS       1000000

/* maximum size of array that can be printed; print_array() compares this
   against the per-rank size including the ghost cells on both sides */
#define MAX_PRINT_SIZE       16

/* value of "device" used for reference or exact calculation */
/* these values should not overlap predefined OpenCL device types */
/* get_arguments() stores them in a cl_device_type and compares them after
   casting the device value back to int */
#define REFERENCE_CALC       -10
#define EXACT_CALC           -11

/* dimension indices */
#define X                    0
#define Y                    1
#define DIMENSIONS           2

/* 5 point stencil in 2 dimensions; also the length of the ranks[] array
   (this rank plus its four neighbors) */
#define STENCIL_SIZE         5

/* ghost cell width (cells added on each side of a dimension) */
#define GHOST_CELL_WIDTH     1

/* indices for old and new arrays */
#define OLD 0
#define NEW 1


/* used for MPI message send and receive */
/* NOT_PERIODIC fills the periods[] argument and CAN_REORDER_RANKING is the
   reorder argument of MPI_Cart_create() in init_mpi() */
#define NOT_PERIODIC           0  /* false */
#define CAN_REORDER_RANKING    1
#define SEND_TAG               0
#define RECEIVE_TAG            0

/* for OpenCL Workgroup size */
#define DEFAULT_WORKGROUP_SIZE 8

/* used for bitmap output as V value in HSV color scheme */
#define BRIGHTNESS_DEFAULT     192

/* macros to help convert HSV color to RGB color; h is the hue and s the
   index of the hue segment, so (h - s) is the position inside the segment.
   CALC_H2 ramps up with that position, CALC_H1 ramps down.
   Every argument is parenthesized so that expressions such as "a + b" can
   be passed without changing the meaning of the expansion. */
#define CALC_H1(h, s)        ((unsigned char)(BRIGHTNESS_DEFAULT * (1.0f - ((h) - (s)))))
#define CALC_H2(h, s)        ((unsigned char)(BRIGHTNESS_DEFAULT * ((h) - (s))))

/* MIN value macro; note that each argument may be evaluated twice, so do not
   pass expressions with side effects */
#define    MIN(a, b)         ( ((a) > (b)) ? (b) : (a) )

/* macros to swap array pointers; the arguments must be modifiable lvalues */
#define SWAP_PTR(a, b)       { value_type *tmp = (a); (a) = (b); (b) = tmp; }
#define SWAP_BUF(a, b)       { cl_mem tmp = (a); (a) = (b); (b) = tmp; }

/* error macro and message macros
   QUIT prints via ERR and then leaves through exit_app(EXIT_FAILURE).
   ERR (stderr) and MSG (stdout) prefix the text with "[rank]" once
   ranks[UP_RANK_INDEX] no longer holds INT_MAX, i.e. once MPI is marked as
   initialized. The "fmt ..." parameter form is the GNU named-variadic
   macro syntax. */
#define QUIT(fmt ...)   { ERR(fmt); exit_app(EXIT_FAILURE);}
#define ERR(fmt ...)    { if (INT_MAX != ranks[UP_RANK_INDEX]) fprintf(stderr,"[%d]", ranks[0]); \
                              fprintf(stderr,fmt); }
#define MSG(fmt ...)    { if (INT_MAX != ranks[UP_RANK_INDEX]) printf("[%d]", ranks[0]); \
                              printf(fmt); }

/* macros for asynchronous reading from or writing data to device memory
   offset and size are counted in value_type elements; the same offset is
   applied to the device buffer (in bytes) and to the host pointer. The
   transfers are enqueued non-blocking (CL_FALSE) with no event wait list.
   The result code is handed to CLU_CHECK_ERROR from clu.h. */
#define READ_DEVICE_MEMORY(queue, device_mem, offset, size, host_mem) {                      \
        CLU_CHECK_ERROR("clEnqueueReadBuffer failed",                                        \
            clEnqueueReadBuffer(queue, device_mem, CL_FALSE, sizeof(value_type) * (offset),  \
                                sizeof(value_type) * (size), (host_mem) + (offset),          \
                                0, NULL, NULL)); }

#define WRITE_DEVICE_MEMORY(queue, device_mem, offset, size, host_mem) {                     \
        CLU_CHECK_ERROR("clEnqueueWriteBuffer failed",                                       \
            clEnqueueWriteBuffer(queue, device_mem, CL_FALSE, sizeof(value_type) * (offset), \
                                sizeof(value_type) * (size), (host_mem) + (offset),          \
                                0, NULL, NULL)); }


/**************************************************************************
 Types
 **************************************************************************/

/*  Element type used for every grid value in this file. If you change the
    precision here you also need to look at the uses of MPI_FLOAT and
    CL_FLOAT and at the OpenCL kernel code. */
typedef float value_type;


/**************************************************************************
 Local Functions
 **************************************************************************/

/* Forward declarations. Every function below is static (file-local);
   declaring them here lets the definitions appear in any order. */

/* flushes stdout, finalizes MPI if it was initialized, frees the global
   matrices and exits with rc */
static void exit_app(int rc);
/* malloc wrapper that quits the application when the allocation fails */
static void * allocx(size_t alloc_size, const char *name);
/* maps a value to an RGB color for the bitmap output */
static void convert_RGB(value_type v, unsigned char *r,
                        unsigned char *g, unsigned char *b);
static void exact_compute(value_type *a,
                        size_t size[DIMENSIONS],
                        value_type origin[DIMENSIONS],
                        value_type d[DIMENSIONS]);
static void exchange_ghost_cells(value_type *a,
                        size_t size[DIMENSIONS],
                        int mpi_ranks[DIMENSIONS],
                        int rank_pos[DIMENSIONS],
                        MPI_Comm mpi_comm);
/* parses and validates the command line */
static void get_arguments(int argc, char *argv[],
                         unsigned int *max_iter,
                        size_t size[DIMENSIONS],
                        size_t block_size[DIMENSIONS],
                        int rank_size[DIMENSIONS],
                        cl_device_type *device,
                        unsigned int *full_copy,
                        unsigned int *verify);

/* initializes MPI, creates the cartesian communicator and fills ranks[] */
static MPI_Comm init_mpi(int size[DIMENSIONS], int rank_pos[DIMENSIONS]);
static void read_ghost_cells_from_device(cl_mem a_buf,
                                        value_type *a,
                                        size_t size[DIMENSIONS],
                                        int mpi_ranks[DIMENSIONS],
                                        int rank_pos[DIMENSIONS],
                                        cl_command_queue queue,
                                        unsigned int full_copy);
static void write_ghost_cells_to_device(cl_mem a_buf,
                                        value_type *a,
                                        size_t size[DIMENSIONS],
                                        int mpi_ranks[DIMENSIONS],
                                        int rank_pos[DIMENSIONS],
                                        cl_command_queue queue,
                                        unsigned int full_copy);
static cl_device_id ocl_get_device_id(clu_t clu,
                                    cl_device_type *device_type,
                                    int mpi_ranks[DIMENSIONS],
                                    MPI_Comm mpi_comm);
static value_type ocl_jacobi_reduce(value_type *delta,
                                    size_t delta_size[DIMENSIONS]);
static void ocl_jacobi(value_type *a[2],
                        unsigned int max_iter,
                        size_t size[DIMENSIONS],
                        value_type tolerance,
                        int mpi_ranks[DIMENSIONS],
                        int rank_pos[DIMENSIONS],
                        value_type origin[DIMENSIONS],
                        value_type d[DIMENSIONS],
                        MPI_Comm mpi_comm,
                        size_t local_workblock_size[DIMENSIONS],
                        cl_device_type device_type,
                        unsigned int full_copy);
static void reference_jacobi(value_type *a[2],
                            unsigned int max_iter,
                            size_t size[DIMENSIONS],
                            value_type tolerance,
                            int mpi_ranks[DIMENSIONS],
                            int rank_pos[DIMENSIONS],
                            value_type origin[DIMENSIONS],
                            value_type d[DIMENSIONS],
                            MPI_Comm mpi_comm);
static value_type reference_jacobi_kernel(value_type *a, value_type *anew,
                                          size_t size[DIMENSIONS]);
/* prints the array one MPI rank at a time; arrays larger than
   MAX_PRINT_SIZE (ghost cells included) are not printed */
static void print_array(const char *legend, 
                        value_type *a,
                        size_t size[DIMENSIONS],
                        int mpi_ranks[DIMENSIONS],
                        value_type origin[DIMENSIONS],
                        value_type d[DIMENSIONS],
                        MPI_Comm mpi_comm);
static void set_boundary_conditions(value_type *a,
                        size_t size[DIMENSIONS],
                        int mpi_ranks[DIMENSIONS],
                        int rank_pos[DIMENSIONS],
                        value_type origin[DIMENSIONS],
                        value_type d[DIMENSIONS]);
static void set_initial_solution(value_type *a,
                        size_t size[DIMENSIONS],
                        value_type guess);
/* writes this rank's part of the array as a PPM (P6) file */
static void save_bitmap(value_type *a,
                        size_t size[DIMENSIONS],
                        int mpi_size[DIMENSIONS],
                        int rank_pos[DIMENSIONS]);
/* prints the command line help text */
static void usage(char *me);


/**************************************************************************
 Globals
 **************************************************************************/

/* only globals are those needed for exit_app() which cleans up after an error
   or the end of the application */

/* 2D matrices for Jacobi iteration, each one is used alternately as input
 * and result. They are global as need to be freed on any application exit.
 * NOTE(review): the array length is written as DIMENSIONS (2) but the
 * elements are indexed with OLD and NEW (see exit_app); the two sizes
 * happen to coincide.
 */
static value_type *u[DIMENSIONS] = { NULL, NULL};

/* 2D matrices like u but used for verification purposes */
static value_type *v[DIMENSIONS] = { NULL, NULL};

/*  MPI rank - INT_MAX indicates that MPI is not initialized and a negative
    number indicates it doesn't have a neighbor. Index 0 is me, other indices
    are clockwise starting from 12 o'clock.
    The ERR and MSG macros and exit_app() test ranks[UP_RANK_INDEX] against
    INT_MAX to decide whether MPI is active; the neighbor entries are filled
    in by init_mpi().
 */
#define MY_RANK_INDEX       0
#define UP_RANK_INDEX       1
#define RIGHT_RANK_INDEX    2
#define DOWN_RANK_INDEX     3
#define LEFT_RANK_INDEX     4
static int ranks[STENCIL_SIZE] = { 0 , INT_MAX, INT_MAX, INT_MAX, INT_MAX };


/**************************************************************************
 Function: allocx

 Checked wrapper around malloc. A failed allocation is reported through
 QUIT (which names the variable and leaves the application), so callers
 never have to test the result themselves.

 params:
    alloc_size  number of bytes requested
    name        label of the variable being allocated, used in the message
 returns:
    pointer to the newly allocated memory
 **************************************************************************/
static void * allocx(size_t alloc_size, const char *name) {
    void * const block = malloc(alloc_size);

    /* common case first: hand the memory straight back */
    if (NULL != block) {
        return block;
    }

    QUIT("Error: malloc for %s failed.\n", name);

    /* only reached if QUIT were to return */
    return block;
}


/**************************************************************************
 Function: exit_app

 params:
    rc        return code for application
 **************************************************************************/
static void exit_app(int rc) {
    /* flush any output */
    fflush(stdout);

    /* finish up with MPI */
    if (INT_MAX != ranks[UP_RANK_INDEX]) {
        MPI_Finalize();
    }

    /* free matrices */
    if (u[OLD]) {
        free(u[OLD]);
        u[OLD] = NULL;
    }
    if (u[NEW]) {
        free(u[NEW]);
        u[NEW] = NULL;
    }
    if (v[OLD]) {
        free(v[OLD]);
        v[OLD] = NULL;
    }
    if (v[NEW]) {
        free(v[NEW]);
        v[NEW] = NULL;
    }

    /* exit with supplied return code */
    exit(rc);
}


/**************************************************************************
 Function: parse_int_option

 Helper for get_arguments. Converts the text of a numeric command line
 option to an int. Unlike atoi() it rejects text that is not entirely a
 decimal integer or that does not fit in an int; in both cases the
 application quits with a diagnostic.

 params:
    text            option argument text (optarg)
    option          option name shown in the diagnostic
 returns:
    converted value
 **************************************************************************/
static int parse_int_option(const char *text, const char *option) {
    char *end = NULL;
    long value;

    errno = 0;
    value = strtol(text, &end, 10);

    /* nothing converted, or trailing characters after the number */
    if ((end == text) || ('\0' != *end)) {
        QUIT("Invalid value '%s' for option %s. Use -h to see usage help.\n", text, option);
    }

    /* overflow of long, or of the int the callers store the value in */
    if ((ERANGE == errno) || (value < INT_MIN) || (value > INT_MAX)) {
        QUIT("Value '%s' for option %s is out of range.\n", text, option);
    }

    return (int)value;
}


/**************************************************************************
 Function: set_computation_type

 Helper for get_arguments. Records the requested computation type, or
 quits if a computation type was already chosen by an earlier option.

 params:
    selected        computation type chosen so far (0 means none)
    requested       computation type requested by the current option
 **************************************************************************/
static void set_computation_type(cl_device_type *selected, cl_device_type requested) {
    if (0 != *selected) {
        QUIT("Cannot specify more than one computation type\n");
    }
    *selected = requested;
}


/**************************************************************************
 Function: get_arguments

 Parses the command line arguments, displays usage instructions and returns
 updated parameters depending on supplied command line arguments. Parameters
 for which no option is given keep the value supplied by the caller. All
 values are validated after parsing; any invalid value or combination quits
 the application with a diagnostic.

 As a side effect the current working directory is changed (best effort)
 to the directory of the invocation path.

 params:
    argc            standard command line argument count
    argv            standard command line argument list
    max_iter        maximum number of iterations (-i)
    size            size array in 2 dimensions (-x, -y)
    block_size      OpenCL workblock size in 2 dimensions (-m, -n)
    rank_size       ranks array in 2 dimensions (-p, -q)
    device          OpenCL device type, or REFERENCE_CALC / EXACT_CALC;
                    only written when a computation type option is given
    full_copy       full copy flag (-f)
    verify          verify flag (-v)
 **************************************************************************/
static void get_arguments(int argc, char *argv[],
                          unsigned int *max_iter,
                          size_t size[DIMENSIONS],
                          size_t block_size[DIMENSIONS],
                          int rank_size[DIMENSIONS],
                          cl_device_type *device,
                          unsigned int *full_copy,
                          unsigned int *verify) {

    /* short flags for getopt() function */
    const char *short_args = "acefghHi:m:n:p:q:rvx:y:";

    /* long options for getopt() function */
    struct option long_options[] =
    {
        {"accel",       no_argument, NULL, 'a'},
        {"cpu",         no_argument, NULL, 'c'},
        {"gpu",         no_argument, NULL, 'g'},
        {"exact",       no_argument, NULL, 'e'},
        {"reference",   no_argument, NULL, 'r'},
        {"fullcopy",    no_argument, NULL, 'f'},
        {"iter",        required_argument, NULL, 'i'},
        {"mdim",        required_argument, NULL, 'm'},
        {"ndim",        required_argument, NULL, 'n'},
        {"pdim",        required_argument, NULL, 'p'},
        {"qdim",        required_argument, NULL, 'q'},
        {"verify",      no_argument, NULL, 'v'},
        {"xdim",        required_argument, NULL, 'x'},
        {"ydim",        required_argument, NULL, 'y'},
        {"help",        no_argument, NULL, 'h'},
        {0, 0, 0, 0}
    };

    /* computation type, 0 while none has been selected */
    cl_device_type type = 0;

    /* option from getopt_long */
    int c;

    /* program name shown in the usage text */
    char *name;

    /* Change current working directory to that of the invocation path so that
     * jacsolver can be run from any current working directory. dirname() is
     * allowed to modify its argument, so it is given a private copy; argv[0]
     * stays intact for the diagnostics printed by getopt_long(). The change
     * of directory is best effort: it is skipped if the copy cannot be made
     * and its result is deliberately ignored.
     */
    if (NULL != argv[0]) {
        size_t path_len = strlen(argv[0]) + 1;
        char *dir_copy = malloc(path_len);

        if (NULL != dir_copy) {
            memcpy(dir_copy, argv[0], path_len);
            (void)chdir(dirname(dir_copy));
            free(dir_copy);
        }
    }
    name = basename(argv[0]);

    /* keep parsing command line arguments */
    do {
        /* getopt_long stores the option index here */
        int option_index = 0;

        /* get next command line option to process */
        c = getopt_long(argc, argv, short_args, long_options, &option_index);

        switch (c) {
            case (-1):
                break;
            case 'a':
                set_computation_type(&type, CL_DEVICE_TYPE_ACCELERATOR);
                break;
            case 'c':
                set_computation_type(&type, CL_DEVICE_TYPE_CPU);
                break;
            case 'e':
                set_computation_type(&type, (cl_device_type)EXACT_CALC);
                break;
            case 'f':
                *full_copy = 1;  /* True */
                break;
            case 'g':
                set_computation_type(&type, CL_DEVICE_TYPE_GPU);
                break;
            case 'i': {
                int iterations = parse_int_option(optarg, "-i/--iter");

                /* a negative count would wrap to a huge unsigned value */
                if (iterations < 0) {
                    QUIT("Number of iterations must be between %d and %d.\n", 0, INT_MAX);
                }
                *max_iter = (unsigned int)iterations;
                break;
            }
            /* Negative values for the size_t parameters wrap to values above
               INT_MAX and are rejected by the range checks further down. */
            case 'm':
                block_size[X] = (size_t)parse_int_option(optarg, "-m/--mdim");
                break;
            case 'n':
                block_size[Y] = (size_t)parse_int_option(optarg, "-n/--ndim");
                break;
            case 'p':
                rank_size[X] = parse_int_option(optarg, "-p/--pdim");
                break;
            case 'q':
                rank_size[Y] = parse_int_option(optarg, "-q/--qdim");
                break;
            case 'r':
                set_computation_type(&type, (cl_device_type)REFERENCE_CALC);
                break;
            case 'v':
                *verify = 1;  /* True */
                break;
            case 'x':
                size[X] = (size_t)parse_int_option(optarg, "-x/--xdim");
                break;
            case 'y':
                size[Y] = (size_t)parse_int_option(optarg, "-y/--ydim");
                break;
            case 'h':
            case 'H':
                /* MPI is not initialized yet, so a plain exit is sufficient */
                usage(name);
                exit(0);
                break;
            default:
                QUIT("Use -h to see usage help.\n");
        }
    } while (-1 != c); /* end do parsing of command line arguments */

    /* Any remaining command line argument (not an option) is an error */
    if (optind < argc) {
        QUIT("Extra arguments '%s'. Use -h to see usage help.\n", argv[optind]);
    }

    /* return computation/device type, if specified by user */
    if (0 != type) {
        *device = type;
    }

    /* error checking on argument values */
    if (size[X] < NODE_SIZE_MIN || size[X] > INT_MAX) {
        QUIT("Number of nodes in x dimension must be between %d and %d.\n",
            NODE_SIZE_MIN, INT_MAX);
    }
    if (size[Y] < NODE_SIZE_MIN || size[Y] > INT_MAX) {
        QUIT("Number of nodes in y dimension must be between %d and %d.\n",
            NODE_SIZE_MIN, INT_MAX);
    }
    if (block_size[X] < 1 || block_size[X] > INT_MAX) {
        QUIT("Size of workblock in x dimension must be between %d and %d.\n", 1, INT_MAX);
    }
    if (block_size[Y] < 1 || block_size[Y] > INT_MAX) {
        QUIT("Size of workblock in y dimension must be between %d and %d.\n", 1, INT_MAX);
    }
    if (rank_size[X] < 1) {
        QUIT("Number of ranks in x dimension must be between %d and %d.\n", 1, INT_MAX);
    }
    if (rank_size[Y] < 1) {
        QUIT("Number of ranks in y dimension must be between %d and %d.\n", 1, INT_MAX);
    }

    /* From here on every value is known to lie in [1, INT_MAX], so the
       divisions are safe and the casts to int for printing are lossless. */
    if (0 != (size[X] % (size_t)rank_size[X])) {
        QUIT("Number of nodes in x dimension (%d) must be multiple of the MPI nodes (%d).\n",
            (int)size[X], rank_size[X]);
    }
    if (0 != (size[Y] % (size_t)rank_size[Y])) {
        QUIT("Number of nodes in y dimension (%d) must be multiple of the MPI nodes (%d).\n",
            (int)size[Y], rank_size[Y]);
    }

    /* the workblock size only matters when an OpenCL device does the work */
    if ((EXACT_CALC != (int)*device) && (REFERENCE_CALC != (int)*device)) {
        const size_t local_x = size[X] / (size_t)rank_size[X];
        const size_t local_y = size[Y] / (size_t)rank_size[Y];

        if (0 != (local_x % block_size[X])) {
            QUIT("MPI node size in x dimension (%d) must be multiple of the Workblock size (%d).\n",
                (int)local_x, (int)block_size[X]);
        }
        if (0 != (local_y % block_size[Y])) {
            QUIT("MPI node size in y dimension (%d) must be multiple of the Workblock size (%d).\n",
                (int)local_y, (int)block_size[Y]);
        }
    }
}


/**************************************************************************
 Function: usage

 Displays usage instructions for the application command line on stdout
 and flushes stdout. Does not exit; that is left to the caller.

 params:
    me        name of this application
 **************************************************************************/
static void usage(char *me) {
    /* print_array() prints an array when its size plus the ghost cells on
       both sides does not exceed MAX_PRINT_SIZE, so this is the largest
       size that is still printed */
    const int max_printed_size = MAX_PRINT_SIZE - 2*GHOST_CELL_WIDTH;

    printf("Usage: %s [-h|--help] [-a|-c|-g|-e|-r] [OPTIONS...] [OPENCL OPTIONS...]\n", me);
    printf("\nExamples:\n");
    printf("  %s --accel -x128 -y128\tCompute 128x128 array on accelerator\n", me);
    printf("  mpirun -np 4 %s -r -p2 -q2\tRun reference computation on 4 MPI ranks\n", me);
    printf("\nComputation type (choose one, default is first OpenCL device):\n");
    printf("  [ -a  | --accel ]     use OpenCL CBEA accelerator for compute\n");
    printf("  [ -c  | --cpu ]       use OpenCL host CPU for compute\n");
    printf("  [ -g  | --gpu ]       use OpenCL GPU for compute\n");
    printf("  [ -e  | --exact ]     compute using analytical implementation (i.e. no OpenCL)\n");
    printf("  [ -r  | --reference ] compute using reference implementation (i.e. no OpenCL)\n");
    printf("\nOptions:\n");
    printf("  [ -in | --iter=n ]    number of iterations, default is %d\n", MAX_ITERATIONS);
    printf("  [ -xn | --xdim=n ]    number of nodes in x dimension, default is %d\n", DEFAULT_NODES);
    printf("  [ -yn | --ydim=n ]    number of nodes in y dimension, default is %d\n", DEFAULT_NODES);
    printf("  [ -pn | --pdim=n ]    number of MPI ranks in x dimension, default is %d\n", DEFAULT_RANKS);
    printf("  [ -qn | --qdim=n ]    number of MPI ranks in y dimension, default is %d\n", DEFAULT_RANKS);
    printf("  [ -v  | --verify ]    verify computation against the reference implementation\n");
    printf("\nOpenCL specific options:\n");
    printf("  [ -f  | --fullcopy ]  full copy of device memory, default is ghost cells only\n");
    printf("  [ -mn | --mdim=n ]    size of OpenCL workblock in x dimension, default is %d\n", DEFAULT_BLOCK);
    printf("  [ -nn | --ndim=n ]    size of OpenCL workblock in y dimension, default is %d\n", DEFAULT_BLOCK);
    printf("\nNotes:\n");
    printf("1. Computation types/devices (-a|-c|-g|-e|-r) are mutually exclusive.\n");
    /* wording is "at most": an array of exactly this size is still shown */
    printf("2. Data values are only shown if the size of the array is at most %d.\n",
                max_printed_size);
    printf("3. Some MPI installations may require the following parameters to mpirun:\n");
    printf("\t--mca btl tcp,self\n");
    fflush(stdout);
}


/**************************************************************************
 Function: convert_RGB

 Converts a floating point number in the range [0,1] to a RGB color from
 blue (0) through to red (1) using an intermediate hue value. Values
 outside [0,1] are clamped to the nearest end of the range and NaN is
 treated as 0, so the result is always a valid color.

 params:
    v            value to convert (should be in the range [0,1])
 returns:
    r            byte value for red
    g            byte value for green
    b            byte value for blue
 **************************************************************************/
static void convert_RGB(value_type v, unsigned char *r,
                        unsigned char *g, unsigned char *b) {
    value_type hue;
    unsigned int segment;

    /*  Clamp the input. Without this a value above 1 gives a negative hue,
        the segment wraps to a huge unsigned number and the color macros
        convert an out-of-range float to unsigned char, which is undefined.
        The negated comparison also catches NaN. */
    if (!(v >= 0.0f)) {
        v = 0.0f;
    } else if (v > 1.0f) {
        v = 1.0f;
    }

    /*  Use HSB (aka HSV) cylindrical color model and then convert to RGB. The
        calculation assumes a saturation of 1.0 and uses BRIGHTNESS_DEFAULT for
        brightness. A value of 1.0 maps to hue 0 (red) and a value of 0.0 to
        hue 4 (blue); stopping at 4 avoids both ends being the same color */
    hue = 4.0f * (1 - v);

    /* Use 0 through 4 of the 6 segments in the hue circle */
    segment = (unsigned int)floor(hue);

    switch (segment) {
        /* color in 0 to 60 degrees range (first sixth) */
        case 0:
            *r = BRIGHTNESS_DEFAULT;
            *g = CALC_H2(hue, segment);
            *b = 0;
            break;

        /* color in 60 to 120 degrees (second sixth) */
        case 1:
            *r = CALC_H1(hue, segment);
            *g = BRIGHTNESS_DEFAULT;
            *b = 0;
            break;

        /* color in 120 to 180 degrees (third sixth) */
        case 2:
            *r = 0;
            *g = BRIGHTNESS_DEFAULT;
            *b = CALC_H2(hue, segment);
            break;

        /* color in 180 to 240 degrees (fourth sixth) */
        case 3:
            *r = 0;
            *g = CALC_H1(hue, segment);
            *b = BRIGHTNESS_DEFAULT;
            break;

        /* color in 240 to 300 degrees (fifth sixth); only hue == 4 lands
           here, which yields pure blue */
        case 4:
            *r = CALC_H2(hue, segment);
            *g = 0;
            *b = BRIGHTNESS_DEFAULT;
            break;

        /* not reachable after clamping as hue is from 0 to 4 */
        default:
            *r = BRIGHTNESS_DEFAULT;
            *g = 0;
            *b = CALC_H1(hue, segment);
            break;
    }
}



/**************************************************************************
 Function: save_bitmap

 Saves a bitmap representation of the array for this MPI rank. As a
 simplification, each MPI rank writes a separate file where the filename
 includes the cartesian coordinate of the rank so that the bitmap can be
 put together offline using a tool such as pnmcat. Another alternative
 would be to use the MPI2 I/O functions and write different parts of
 the file using MPI.

 The file is a binary PPM (type P6) of size[X] by size[Y] pixels. Ghost
 cells are not drawn. Any failure quits the application.

 params:
    a           array to draw; it carries GHOST_CELL_WIDTH extra cells on
                each side and is indexed as a[x * (size[Y] + ghosts) + y]
    size        size of array for this MPI rank (without ghost cells)
    mpi_size    number of MPI ranks in each dimension
    rank_pos    position in MPI cartesian space
 **************************************************************************/
static void save_bitmap(value_type *a,
                        size_t size[DIMENSIONS],
                        int mpi_size[DIMENSIONS],
                        int rank_pos[DIMENSIONS]) {

    /* picture size; ghost cells are left out */
    const size_t width = size[X];
    const size_t height = size[Y];

    /* distance between consecutive x positions in the array */
    const size_t ystride = height + 2*GHOST_CELL_WIDTH;

    size_t bitmap_size, written, x, y;
    char bitmap_filename[128];
    FILE *fh;
    int close_rc;
    int saved_errno;

    /* buffer to hold bitmap pixels before being written to file */
    unsigned char *pixels;

    /* 3 bytes per pixel; make sure the byte count fits in a size_t */
    if ((0 != height) && (width > ((size_t)-1) / 3 / height)) {
        QUIT("Bitmap of %lu x %lu pixels is too large.\n",
             (unsigned long)width, (unsigned long)height);
    }

    /* allocate space for display pixels */
    bitmap_size = 3 * width * height;
    pixels = allocx(bitmap_size, "bitmap");

    /* convert values to RGB pixels */
    for (x = 0; x < width; ++x) {
        for (y = 0; y < height; ++y) {
            /* change to bitmap origin at lower left corner: PPM stores the
               top row first, so array row y goes to picture row height-1-y */
            size_t p = 3 * ((height - 1 - y) * width + x);
            convert_RGB(a[(x + GHOST_CELL_WIDTH) * ystride + (y + GHOST_CELL_WIDTH)],
                        &pixels[p], &pixels[p+1], &pixels[p+2]);
        }
    }

    /* generate file name; the rank position is only added for multiple ranks */
    if ((1<mpi_size[X]) || (1<mpi_size[Y])) {
        snprintf(bitmap_filename, sizeof(bitmap_filename), "jac%lux%lu_%d_%d.ppm",
                 (unsigned long)width, (unsigned long)height,
                 rank_pos[X], rank_pos[Y]);
    } else {
        snprintf(bitmap_filename, sizeof(bitmap_filename), "jac%lux%lu.ppm",
                 (unsigned long)width, (unsigned long)height);
    }

    /* try to open the file */
    fh = fopen(bitmap_filename, "wb");
    if (fh == NULL) {
        /* keep errno before free() gets a chance to change it */
        saved_errno = errno;
        free(pixels);
        QUIT("Open of bitmap file %s failed errno=%d\n", bitmap_filename, saved_errno);
    }

    /* output PPM type P6 file header */
    fprintf(fh, "P6\n%lu %lu\n255\n", (unsigned long)width, (unsigned long)height);

    /* write pixels into the file */
    written = fwrite(pixels, 1, bitmap_size, fh);
    if (written != bitmap_size) {
        /* keep errno of the failed write, fclose() may overwrite it */
        saved_errno = errno;
        fclose(fh);
        free(pixels);
        QUIT("Writing of bitmap pixels sized %lu failed err=%lu errno=%d\n",
             (unsigned long)bitmap_size, (unsigned long)written, saved_errno);
    }

    /* close the file; buffered data is written here so the result matters */
    close_rc = fclose(fh);
    if (close_rc != 0) {
        saved_errno = errno;
        free(pixels);
        QUIT("Closing of bitmap file failed err=%d errno=%d\n", close_rc, saved_errno);
    }

    /* output completion message and cleanup */
    MSG("Bitmap of generated data saved in %s\n", bitmap_filename);
    free(pixels);
}


/**************************************************************************
 Function: init_mpi

 Displays usage instructions for the application command line

 params:
    mpi_size    number of MPI ranks in each dimension
    rank_pos    position in MPI cartesian space
 **************************************************************************/
static MPI_Comm init_mpi(int mpi_size[DIMENSIONS], int rank_pos[DIMENSIONS]) {
    /* size of MPI space and our rank in the MPI world */
    int total_ranks;
    int periods[DIMENSIONS] = { NOT_PERIODIC, NOT_PERIODIC };

    /* MPI communications area */
    MPI_Comm mpi_comm;

    /* initialize MPI */
    MPI_Init(NULL, NULL);

    /* find out our rank in the world and the size of MPI space */
    MPI_Comm_rank(MPI_COMM_WORLD, &ranks[MY_RANK_INDEX]);
    MPI_Comm_size(MPI_COMM_WORLD, &total_ranks);

    /* verify that the size of MPI space matches the expected size */
    if (total_ranks != mpi_size[X] * mpi_size[Y]) {
        QUIT("Mismatch in number of MPI ranks; parameters require %d not %d.\n",
             (unsigned int)(mpi_size[X]* mpi_size[Y]), total_ranks);
    }

    /* figure out our rank, and relative position in the grid */
    MPI_Cart_create(MPI_COMM_WORLD, DIMENSIONS, mpi_size,
                    periods, CAN_REORDER_RANKING, &mpi_comm);
    MPI_Comm_rank(mpi_comm, &ranks[MY_RANK_INDEX]);
    MPI_Cart_coords(mpi_comm, ranks[MY_RANK_INDEX], DIMENSIONS, rank_pos);

    MSG("MPI initialized at position (%d, %d).\n", rank_pos[X], rank_pos[Y]);

    /* figure out neighboring ranks in 4 directions, negative means no neighbor */
    MPI_Cart_shift(mpi_comm, Y, -1,
                   &ranks[UP_RANK_INDEX], &ranks[DOWN_RANK_INDEX]);
    MPI_Cart_shift(mpi_comm, X, -1,
                   &ranks[RIGHT_RANK_INDEX], &ranks[LEFT_RANK_INDEX]);
    if (1 != total_ranks) {
        MSG("Neighboring ranks are %d (up), %d (right), %d (down), and %d (left).\n",
                ranks[UP_RANK_INDEX], ranks[RIGHT_RANK_INDEX],
                ranks[DOWN_RANK_INDEX], ranks[LEFT_RANK_INDEX]);
    }

    /* MPI communications area needed later for send/receives */
    return mpi_comm;
}


/**************************************************************************
 Function: print_array

 Prints the contents of the given array using each MPI rank in turn.
 Note that ghost cells are repeated.

 params:
    legend      text data to describe the array
    a           array to print (including its ghost cells)
    size        size of array for this MPI rank
    mpi_ranks   number of MPI ranks in each dimension
    origin      origin for this rank
    d           grid spacing (discretization step) in each dimension
    mpi_comm    MPI communications structure
 **************************************************************************/
static void print_array(const char *legend,
                        value_type *a,
                        size_t size[DIMENSIONS],
                        int mpi_ranks[DIMENSIONS],
                        value_type origin[DIMENSIONS],
                        value_type d[DIMENSIONS],
                        MPI_Comm mpi_comm) {

    unsigned int i, j, ystride;
    int r;

    /* convenience for y stride in array */
    ystride = size[Y] + 2*GHOST_CELL_WIDTH;

    /* return if array is too large */
    if ((MAX_PRINT_SIZE < size[X]+2*GHOST_CELL_WIDTH) || (MAX_PRINT_SIZE < ystride)) {
        return;
    }

    /* print title for array contents using only one MPI rank */
    if (NULL != mpi_comm) {
        MPI_Barrier(mpi_comm);
    }

    if (0 == ranks[MY_RANK_INDEX]) {
        printf("%s -------------------------\n", legend);
        fflush(stdout);
    }

    /* synchronize across MPI ranks */
    if (NULL != mpi_comm) {
        MPI_Barrier(mpi_comm);
    }

    /* print array for each rank in turn */
    for (r=0; r<mpi_ranks[X] * mpi_ranks[Y]; r++) {

        /* print if our rank */
        if (r == ranks[MY_RANK_INDEX]) {
            /* print x val header */
            printf("[%4d] x=", ranks[MY_RANK_INDEX]);
            for (i=0; i<size[X]+2*GHOST_CELL_WIDTH; ++i) {
                printf("%0.3lf    ", origin[X] + i * d[X]);
            }
            printf("\n");

            /* print y prefix and x values */
            for (j=0; j<size[Y]+2*GHOST_CELL_WIDTH; ++j) {
                printf("y=%0.3lf ", origin[Y] + j * d[Y]);
                for (i=0; i<size[X]+2*GHOST_CELL_WIDTH; ++i) {
                    printf("%8.5lf ", a[i * ystride + j]);
                }
                printf("\n");
            }

            /* make sure output is complete */
            fflush(stdout);

            /* added sleep here to ensure output from MPI rank is printed
               correctly without overlapping other output */
            sleep(1);   
        }

        /* synchronize MPI ranks */
        if (NULL != mpi_comm) {
            MPI_Barrier(mpi_comm);
        }
    }
}


/**************************************************************************
 Function: exchange_ghost_cells

 Swaps the ghost cells with each neighbor in turn. Deadlocking of the
 synchronous send/receive is eliminated by doing all exchanges in the same
 direction i.e. the first step is to send to the left and receive a send
 from the right neighbor; and so on for the other three directions.

 Columns (fixed x) are contiguous in memory and are sent and received
 directly from/into the array. Rows (fixed y) are strided, so they are
 packed into and unpacked from temporary buffers.

 No special checking is needed for the send/receive itself when this node
 doesn't have a neighbor; it is all taken care of by MPI_Sendrecv. The
 temporary row receive buffer is only copied back into the array when the
 corresponding neighbor exists, because otherwise nothing was received.

 params:
    a           array whose ghost cells are exchanged
    size        size of array for this MPI rank, excluding ghost cells
    mpi_ranks   number of MPI ranks in each dimension
    rank_pos    position in MPI cartesian space
    mpi_comm    MPI communications structure
 **************************************************************************/
static void exchange_ghost_cells(value_type *a,
                        size_t size[DIMENSIONS],
                        int mpi_ranks[DIMENSIONS],
                        int rank_pos[DIMENSIONS],
                        MPI_Comm mpi_comm) {

    /* convenience for y stride in array */
    unsigned int ystride = size[Y]+2*GHOST_CELL_WIDTH;

    /* buffers for non-contiguous data in x direction */
    value_type *row_send, *row_recv;
    unsigned int i;
    int err;    /* MPI return codes are plain ints and are printed with %d */

    MPI_Status status;

    /* Contiguous data: send second column to left, receive last column from right */
    err = MPI_Sendrecv(&a[ystride], ystride, MPI_FLOAT,
                       ranks[LEFT_RANK_INDEX], SEND_TAG,
                       &a[(size[X]+GHOST_CELL_WIDTH) * ystride], ystride, MPI_FLOAT,
                       ranks[RIGHT_RANK_INDEX], RECEIVE_TAG,
                       mpi_comm, &status);
    if (MPI_SUCCESS != err) {
        QUIT("MPI Send to left returned %d from %d\n", err, status.MPI_SOURCE);
    }

    /* Contiguous data: send last-but-one column to right, receive first
       column from left. The first column starts at the very beginning of
       the array, so the array pointer itself is the receive address (an
       array-selector constant such as OLD has no meaning as an offset here). */
    err = MPI_Sendrecv(&a[size[X] * ystride], ystride, MPI_FLOAT,
                       ranks[RIGHT_RANK_INDEX], SEND_TAG,
                       a, ystride, MPI_FLOAT,
                       ranks[LEFT_RANK_INDEX],RECEIVE_TAG,
                       mpi_comm, &status);
    if (MPI_SUCCESS != err) {
        QUIT("MPI Send to right returned %d from %d\n", err, status.MPI_SOURCE);
    }

    /* allocate data buffers for non-contiguous data */
    row_send = (value_type *)allocx(sizeof(value_type) * (size[X]+2*GHOST_CELL_WIDTH), "MPI row send");
    row_recv = (value_type *)allocx(sizeof(value_type) * (size[X]+2*GHOST_CELL_WIDTH), "MPI row receive");

    /* copy in data to be sent to "down" neighbor */
    for (i=0; i<size[X]+2*GHOST_CELL_WIDTH; ++i) {
        row_send[i] = a[i * ystride + 1];
    }

    /* Non-contiguous data: send to down, recv from up. The whole packed
       buffer is sent, starting at its first element. */
    err = MPI_Sendrecv(row_send, size[X]+2*GHOST_CELL_WIDTH, MPI_FLOAT,
                       ranks[DOWN_RANK_INDEX], SEND_TAG,
                       row_recv, size[X]+2*GHOST_CELL_WIDTH, MPI_FLOAT,
                       ranks[UP_RANK_INDEX],RECEIVE_TAG,
                       mpi_comm, &status);
    if (MPI_SUCCESS != err) {
        QUIT("MPI Send to down returned %d from %d\n", err, status.MPI_SOURCE);
    }

    /* copy out data received from "up" neighbor, but only if there is one */
    if (rank_pos[Y] < mpi_ranks[Y]-1) {
        for (i=0; i<size[X]+2*GHOST_CELL_WIDTH; ++i) {
            a[i * ystride + size[Y]+GHOST_CELL_WIDTH] = row_recv[i];
        }
    }

    /* copy in data to be sent to "up" neighbor */
    for (i=0; i<size[X]+2*GHOST_CELL_WIDTH; ++i) {
        row_send[i] = a[i * ystride + size[Y]];
    }

    /* Non-contiguous data: send to up, recv from down */
    err = MPI_Sendrecv(row_send, size[X]+2*GHOST_CELL_WIDTH, MPI_FLOAT,
                       ranks[UP_RANK_INDEX],SEND_TAG,
                       row_recv, size[X]+2*GHOST_CELL_WIDTH, MPI_FLOAT,
                       ranks[DOWN_RANK_INDEX],RECEIVE_TAG,
                       mpi_comm, &status);
    if (MPI_SUCCESS != err) {
        QUIT("MPI Send to up returned %d from %d\n", err, status.MPI_SOURCE);
    }

    /* copy out data received from "down" neighbor, but only if there is one */
    if (rank_pos[Y] > 0) {
        for (i=0; i<size[X]+2*GHOST_CELL_WIDTH; ++i) {
            a[i * ystride] = row_recv[i];
        }
    }

    /* free data buffers */
    free(row_send);
    free(row_recv);
}


/**************************************************************************
 Function: exact_compute

 Fills the array, ghost cells and boundaries included, with the analytical
 solution:
    u(x,y) = sin(pi * x) * exp(-pi * y)
 where x = origin[X] + i * d[X] and y = origin[Y] + j * d[Y] for the node
 at index (i,j).

 params:
    a           array to compute solution into
    size        size of array for this MPI rank, excluding ghost cells
    origin      origin for this rank
    d           spacing between neighboring nodes in each dimension
 **************************************************************************/
static void exact_compute(value_type *a,
                        size_t size[DIMENSIONS],
                        value_type origin[DIMENSIONS],
                        value_type d[DIMENSIONS]) {

    /* number of elements between the starts of consecutive x columns */
    const unsigned int ystride = size[Y]+2*GHOST_CELL_WIDTH;
    unsigned int col, row;

    for (col=0; col<size[X]+2*GHOST_CELL_WIDTH; ++col) {
        /* the sine factor depends on x only, so evaluate it once per column */
        const double sin_part = sin(PI*(origin[X] + col*d[X]));
        const unsigned int base = col * ystride;

        for (row=0; row<size[Y]+2*GHOST_CELL_WIDTH; ++row) {
            const double exp_part = exp(-PI*(origin[Y] + row*d[Y]));

            a[base + row] = (value_type) (sin_part * exp_part);
        }
    }
}


/**************************************************************************
 Function: ocl_get_device_id

 Selects the OpenCL device this MPI rank will use. The devices of the
 requested type are enumerated on the clu platform, the device counts are
 summed over all MPI ranks, and the run is abandoned (QUIT) if the summed
 count is smaller than the number of ranks. Each rank then takes the device
 at index (rank modulo local device count) and reports its name and type,
 one rank per pass of a barrier-separated loop.

  params:
    clu            clu structure
    device_type    in: OpenCL device type to search for
                   out: device type reported by the selected device
    mpi_ranks      number of MPI ranks in each dimension
    mpi_comm       MPI communications structure (barriers are skipped if NULL)

  returns:
    device assigned to this rank
 **************************************************************************/
static cl_device_id ocl_get_device_id(clu_t clu,
                                      cl_device_type *device_type,
                                      int mpi_ranks[DIMENSIONS],
                                      MPI_Comm mpi_comm) {
    int r, rc, devices;
    int rank_count;
    cl_int err;
    cl_uint num_devices = 0;
    cl_device_id *device_ids, device_id = NULL;

    /* total number of MPI ranks taking part */
    rank_count = mpi_ranks[X] * mpi_ranks[Y];

    /* get the number of devices in the platform */
    err = clGetDeviceIDs(cluGetCLPlatformID(clu), *device_type, 0, NULL, &num_devices);

    /* check there is at least one device */
    if (0 == num_devices || CL_SUCCESS != err) {
        QUIT("No OpenCL devices available of type %s.\n",
                cluGetCLDeviceTypeString(*device_type));
    }
    devices = (int)num_devices;

    /* find the total number of devices across all of the MPI ranks */
    if (1 != rank_count) {
        rc = MPI_Allreduce(MPI_IN_PLACE, &devices, 1, MPI_INT, MPI_SUM, mpi_comm);
        if (MPI_SUCCESS != rc) {
            QUIT("MPI_Allreduce returned %d\n", rc);
        }
    }

    /* check if there are enough devices for all the ranks */
    if (devices < rank_count) {
        QUIT("Only found %d device(s) but need %d.\n", devices, rank_count);
    }

    /* allocate storage and get all of the devices */
    device_ids = (cl_device_id *)allocx(sizeof(cl_device_id) * num_devices, "device_ids");
    CLU_CHECK_ERROR("clGetDeviceIDs failed",
        clGetDeviceIDs(cluGetCLPlatformID(clu), *device_type, num_devices, device_ids, NULL));

    /* synchronize across MPI ranks */
    if (NULL != mpi_comm) {
        MPI_Barrier(mpi_comm);
    }

    /* get device for each rank in turn */
    for (r=0; r<rank_count; r++) {
        if (r == ranks[MY_RANK_INDEX]) {
            char *device_name;
            size_t len;

            /* assign device to this rank */
            device_id = device_ids[r % num_devices];

            /* get the device name: first its length, then the text */
            CLU_CHECK_ERROR("clGetDeviceInfo device name failed",
                clGetDeviceInfo(device_id, CL_DEVICE_NAME, 0, NULL, &len));
            device_name = (char *)allocx(len, "device name");
            CLU_CHECK_ERROR("clGetDeviceInfo device name failed",
                clGetDeviceInfo(device_id, CL_DEVICE_NAME, len, device_name, NULL));

            /* get device type and print device name */
            CLU_CHECK_ERROR("clGetDeviceInfo device type failed",
                clGetDeviceInfo(device_id, CL_DEVICE_TYPE,
                        sizeof(cl_device_type), device_type, NULL));
            MSG("Device is %s of type %s\n", device_name, cluGetCLDeviceTypeString(*device_type));
            free(device_name);
        }

        /* Every rank must reach this barrier on every pass; that is what
           makes the ranks report one after another. A barrier placed inside
           the branch above would be hit exactly once per rank and would not
           order the ranks at all. */
        if (NULL != mpi_comm) {
            MPI_Barrier(mpi_comm);
        }
    }

    /* free allocated memory */
    free(device_ids);

    /* return found device */
    return device_id;
}


/**************************************************************************
 Function: ocl_jacobi_reduce

 Host-side reduction: scans the array of differences and picks out the
 largest entry. The running maximum starts at zero, so zero is returned
 when no entry is larger than that.

 params:
    delta               delta differences between two iterations
    delta_size          number of delta entries in each dimension

 returns:
     maximum difference found in delta
 **************************************************************************/
static value_type ocl_jacobi_reduce(value_type *delta,
                                    size_t delta_size[DIMENSIONS]) {

    value_type largest = 0.0;
    unsigned int col, row;

    for (col=0; col<delta_size[X]; ++col) {
        /* entries with the same x index are stored next to each other */
        const value_type *line = &delta[col * delta_size[Y]];

        for (row=0; row<delta_size[Y]; ++row) {
            largest = fmax(largest, line[row]);
        }
    }

    return largest;
}


/**************************************************************************
  Function: read_ghost_cells_from_device

 Reads data back from the OpenCL device buffer into the host array: either
 the whole array (full_copy set) or only the outermost interior cells on
 the sides where this rank has an MPI neighbor, i.e. the cells that
 neighbor needs as its ghost cells. The routine returns only after the
 command queue has finished.

 params:
    a_buf        OpenCL device memory equivalent to a
    a            array on host
    size         size of array for this MPI rank, excluding ghost cells
    mpi_ranks    number of MPI ranks in each dimension
    rank_pos     cartesian position of this rank
    queue        OpenCL command queue
    full_copy    boolean if full buffer copy is to be done
 **************************************************************************/
static void read_ghost_cells_from_device(cl_mem a_buf,
                                        value_type *a,
                                        size_t size[DIMENSIONS],
                                        int mpi_ranks[DIMENSIONS],
                                        int rank_pos[DIMENSIONS],
                                        cl_command_queue queue,
                                        unsigned int full_copy) {
    /* number of elements between the starts of consecutive x columns */
    cl_uint ystride = size[Y]+2*GHOST_CELL_WIDTH;

    if (!full_copy) {
        /* which sides of this rank border another rank */
        const int has_left   = (rank_pos[X] > 0);
        const int has_right  = (rank_pos[X] < mpi_ranks[X] - 1);
        const int has_bottom = (rank_pos[Y] > 0);
        const int has_top    = (rank_pos[Y] < mpi_ranks[Y] - 1);
        size_t start;

        /* left neighbor: interior part of the second column (contiguous) */
        if (has_left) {
            start = ystride + GHOST_CELL_WIDTH;
            READ_DEVICE_MEMORY(queue, a_buf, start, ystride-2*GHOST_CELL_WIDTH, a);
        }

        /* right neighbor: interior part of the last but one column (contiguous) */
        if (has_right) {
            start = size[X] * ystride + GHOST_CELL_WIDTH;
            READ_DEVICE_MEMORY(queue, a_buf, start, ystride-2*GHOST_CELL_WIDTH, a);
        }

        /* bottom neighbor: bottom row plus one, which is strided in memory */
        if (has_bottom) {
#ifdef CL_VERSION_1_1
            /* device buffer and host array share one layout, so a single
               origin and a single pitch describe both sides of the copy */
            const size_t rect_origin[3] = { GHOST_CELL_WIDTH*sizeof(value_type), GHOST_CELL_WIDTH, 0 };
            const size_t rect_region[3] = { GHOST_CELL_WIDTH * sizeof(value_type), size[X], 1 };
            const size_t column_pitch = ystride*sizeof(value_type);

            CLU_CHECK_ERROR("clEnqueueReadBufferRect failed",
                            clEnqueueReadBufferRect(queue, a_buf, CL_FALSE,
                                                    rect_origin, rect_origin, rect_region,
                                                    column_pitch, 0, column_pitch, 0,
                                                    a, 0, NULL, NULL));
#else /* !CL_VERSION_1_1 */
            /* no rectangular copy available: one small read per column */
            unsigned int col;
            for (col=GHOST_CELL_WIDTH; col<size[X]+GHOST_CELL_WIDTH; col++) {
                start = col * ystride + GHOST_CELL_WIDTH;
                READ_DEVICE_MEMORY(queue, a_buf, start, GHOST_CELL_WIDTH, a);
            }
#endif /* CL_VERSION_1_1 */
        }

        /* top neighbor: top row minus one, which is strided in memory */
        if (has_top) {
#ifdef CL_VERSION_1_1
            const size_t rect_origin[3] = { size[Y]*sizeof(value_type), GHOST_CELL_WIDTH, 0 };
            const size_t rect_region[3] = { GHOST_CELL_WIDTH * sizeof(value_type), size[X], 1 };
            const size_t column_pitch = ystride*sizeof(value_type);

            CLU_CHECK_ERROR("clEnqueueReadBufferRect failed",
                            clEnqueueReadBufferRect(queue, a_buf, CL_FALSE,
                                                    rect_origin, rect_origin, rect_region,
                                                    column_pitch, 0, column_pitch, 0,
                                                    a, 0, NULL, NULL));
#else /* !CL_VERSION_1_1 */
            unsigned int col;
            for (col=GHOST_CELL_WIDTH; col<size[X]+GHOST_CELL_WIDTH; col++) {
                start = col * ystride + size[Y];
                READ_DEVICE_MEMORY(queue, a_buf, start, GHOST_CELL_WIDTH, a);
            }
#endif /* CL_VERSION_1_1 */
        }
    } else {
        /* whole array, ghost cells included, in one transfer */
        size_t array_size = (size[X]+2*GHOST_CELL_WIDTH) * ystride;
        READ_DEVICE_MEMORY(queue, a_buf, 0, array_size, a);
    }

    /* the rectangular reads are enqueued non-blocking, so wait for the queue
       to drain before the caller touches the host array */
    CLU_CHECK_ERROR("clFinish failed", clFinish(queue));
}


/**************************************************************************
 Function: write_ghost_cells_to_device

 This routine copies the ghost cells (or complete buffer) from the host
 array into the OpenCL device buffer. Only the ghost cells on the sides
 where this rank has an MPI neighbor are copied, unless full_copy is set.
 The routine returns only after the command queue has finished, for both
 the full copy and the ghost-cell copy, matching
 read_ghost_cells_from_device.

 params:
    a_buf        OpenCL device memory equivalent to a
    a            array on host
    size         size of array for this MPI rank, excluding ghost cells
    mpi_ranks    number of MPI ranks in each dimension
    rank_pos     cartesian position of this rank
    queue        OpenCL command queue
    full_copy    boolean if full buffer copy is to be done
 **************************************************************************/
static void write_ghost_cells_to_device(cl_mem a_buf,
                                        value_type *a,
                                        size_t size[DIMENSIONS],
                                        int mpi_ranks[DIMENSIONS],
                                        int rank_pos[DIMENSIONS],
                                        cl_command_queue queue,
                                        unsigned int full_copy) {
    /* convenience for y stride in array */
    cl_uint ystride = size[Y]+2*GHOST_CELL_WIDTH;

    if (full_copy) {
        size_t array_size = (size[X]+2*GHOST_CELL_WIDTH) * ystride;
        WRITE_DEVICE_MEMORY(queue, a_buf, 0, array_size, a);
    } else {
        size_t offset;

        /* copy in first column ghost cells if there is a left neighbor */
        if (rank_pos[X] > 0) {
            offset = 0;
            WRITE_DEVICE_MEMORY(queue, a_buf, offset, ystride, a);
        }

        /* copy in last column ghost cells if there is a right neighbor */
        if (rank_pos[X] < mpi_ranks[X] - 1) {
            offset = (size[X] + GHOST_CELL_WIDTH) * ystride;
            WRITE_DEVICE_MEMORY(queue, a_buf, offset, ystride, a);
        }

        /* copy in bottom row ghost cells if there is a bottom neighbor; the
           row is strided in memory, so it is either one rectangular copy or
           one small write per column */
        if (rank_pos[Y] > 0) {
#ifdef CL_VERSION_1_1
            const size_t buffer_origin[3] = { 0, GHOST_CELL_WIDTH, 0 };
            const size_t host_origin[3] = { 0, GHOST_CELL_WIDTH, 0 };
            const size_t region[3] = {GHOST_CELL_WIDTH * sizeof(value_type), size[X], 1 };

            CLU_CHECK_ERROR("clEnqueueWriteBufferRect failed",
                            clEnqueueWriteBufferRect(queue, a_buf, CL_FALSE, buffer_origin, host_origin, region,
                                                     ystride*sizeof(value_type), 0, ystride*sizeof(value_type), 0,
                                                     a, 0, NULL, NULL));
#else /* !CL_VERSION_1_1 */
            unsigned int i;
            for (i=GHOST_CELL_WIDTH; i<size[X]+GHOST_CELL_WIDTH; i++) {
                offset = i * ystride;
                WRITE_DEVICE_MEMORY(queue, a_buf, offset, GHOST_CELL_WIDTH, a);
            }
#endif /* CL_VERSION_1_1 */
        }

        /* copy in top row ghost cells if there is a top neighbor */
        if (rank_pos[Y] < mpi_ranks[Y] - 1) {
#ifdef CL_VERSION_1_1
            const size_t buffer_origin[3] = {(size[Y]+GHOST_CELL_WIDTH)*sizeof(value_type), GHOST_CELL_WIDTH, 0 };
            const size_t host_origin[3] = {(size[Y]+GHOST_CELL_WIDTH)*sizeof(value_type), GHOST_CELL_WIDTH, 0 };
            const size_t region[3] = {GHOST_CELL_WIDTH * sizeof(value_type), size[X], 1 };

            CLU_CHECK_ERROR("clEnqueueWriteBufferRect failed",
                            clEnqueueWriteBufferRect(queue, a_buf, CL_FALSE, buffer_origin, host_origin, region,
                                                     ystride*sizeof(value_type), 0, ystride*sizeof(value_type), 0,
                                                     a, 0, NULL, NULL));
#else /* !CL_VERSION_1_1 */
            unsigned int i;
            for (i=GHOST_CELL_WIDTH; i<size[X]+GHOST_CELL_WIDTH; i++) {
                offset = i * ystride + size[Y] + GHOST_CELL_WIDTH;
                WRITE_DEVICE_MEMORY(queue, a_buf, offset, GHOST_CELL_WIDTH, a);
            }
#endif /* CL_VERSION_1_1 */
        }
    }

    /* wait for copies to complete on both paths, as the read routine does,
       so the host array is free for reuse once this function returns */
    CLU_CHECK_ERROR("clFinish failed", clFinish(queue));
}


/**************************************************************************
 Function: ocl_jacobi

  This routine contains the main iteration loop for the Jacobi iteration
  using OpenCL kernel.

  The per-workgroup differences produced by the kernel are reduced on the
  host while the next kernel invocation is running, so the max difference
  tested at the end of each pass of the loop belongs to the previous kernel
  invocation. The delta array is pre-filled with 1.0 so the first pass does
  not terminate the loop early.

 params:
    a                       two arrays to compute solution into
    max_iter                maximum number of iterations
    size                    size of array for this MPI rank
    tolerance               all differences should be less than this tolerance value
    mpi_ranks               number of MPI ranks in each dimension
    rank_pos                cartesian position of this rank
    origin                  origin for this rank
    d                       spacing between neighboring nodes in each dimension
    mpi_comm                MPI communications structure
    local_workblock_size    size of local workblock for OpenCL kernel; reset
                            to 1 x 1 in place if the device cannot support
                            the requested size
    device_type             OpenCL device type
    full_copy               boolean if full buffer copy is to be done
 **************************************************************************/
static void ocl_jacobi(value_type *a[2],
                        unsigned int max_iter,
                        size_t size[DIMENSIONS],
                        value_type tolerance,
                        int mpi_ranks[DIMENSIONS],
                        int rank_pos[DIMENSIONS],
                        value_type origin[DIMENSIONS],
                        value_type d[DIMENSIONS],
                        MPI_Comm mpi_comm,
                        size_t local_workblock_size[DIMENSIONS],
                        cl_device_type device_type,
                        unsigned int full_copy) {

    size_t array_size;
    unsigned int i, j, iter = 0;
    int rc;    /* MPI return codes are plain ints and are printed with %d */
    size_t delta_buffer_size, delta_size[DIMENSIONS];
    size_t tile_delta_size, tile_cache_size;
    value_type max_diff, timer;
    clu_t clu;
    cl_device_id device_id;
    cl_kernel kernel;
    cl_context context;
    cl_command_queue queue;
    cl_int err;
    cl_mem a_buf[2], delta_buf;
    value_type *delta;
    struct timeb start_time, stop_time;

    /* convenience for y stride in array */
    cl_uint ystride = size[Y]+2*GHOST_CELL_WIDTH;

    /* initialize clu */
    clu = cluInit(NULL);

    /* find OpenCL device */
    device_id  = ocl_get_device_id(clu, &device_type, mpi_ranks, mpi_comm);

    /* create OpenCL queue and context for the device */
    queue = cluCreateCmdQueue(clu, device_id, device_type, 0);
    context = cluGetCLContext(clu);

    /* build the kernel and verify the kernel */
    if (CL_DEVICE_TYPE_ACCELERATOR == device_type) {
        kernel = cluCreateKernel(clu, queue, "jacsolver_kernel.cl", "ocl_jacobi_async_copy",
                                 "-Werror",  CLU_SOURCE);
    } else {
        kernel = cluCreateKernel(clu, queue, "jacsolver_kernel.cl", "ocl_jacobi_local_copy",
                                 "-Werror",  CLU_SOURCE);
    }

    /* calculate size of kernel local memory  - also used later for kernel params */
    tile_delta_size = sizeof(value_type) * local_workblock_size[X] * local_workblock_size[Y];
    tile_cache_size = sizeof(value_type) * (local_workblock_size[X]+2*GHOST_CELL_WIDTH)*
                                           (local_workblock_size[Y]+2*GHOST_CELL_WIDTH);

    /* verify the device has enough resources for this workblock size */
    if ((cluGetAvailableLocalMem(device_id, kernel) < tile_delta_size + tile_cache_size) ||
        (! cluCheckLocalWorkgroupSize(device_id, kernel, DIMENSIONS, local_workblock_size))) {
        local_workblock_size[X] = 1;
        local_workblock_size[Y] = 1;

        /* the local memory sizes are handed to the kernel as arguments below,
           so they have to describe the workblock that is actually launched
           and not the one that was just rejected */
        tile_delta_size = sizeof(value_type) * local_workblock_size[X] * local_workblock_size[Y];
        tile_cache_size = sizeof(value_type) * (local_workblock_size[X]+2*GHOST_CELL_WIDTH)*
                                               (local_workblock_size[Y]+2*GHOST_CELL_WIDTH);
    }

    MSG("Estimating solution using OpenCL Jacobi iteration with %d x %d workblock.\n",
        (int)local_workblock_size[X], (int)local_workblock_size[Y]);
    fflush(stdout);

    /* init arrays by setting the initial value and the boundary conditions */
    set_initial_solution(a[OLD], size, INITIAL_GUESS);
    set_initial_solution(a[NEW], size, INITIAL_GUESS);
    set_boundary_conditions(a[OLD], size, mpi_ranks, rank_pos, origin, d);
    set_boundary_conditions(a[NEW], size, mpi_ranks, rank_pos, origin, d);

    /* print the initial solution guess */
    print_array("Init ", a[NEW], size, mpi_ranks, origin, d, mpi_comm);

    /* get start time - includes start and finish overhead of OpenCL */
    ftime(&start_time);

    /* allocate memory for differences - one entry per workgroup */
    delta_size[X] = size[X] / local_workblock_size[X];
    delta_size[Y] = size[Y] / local_workblock_size[Y];
    delta_buffer_size = sizeof(value_type) * delta_size[X] * delta_size[Y];
    delta = (value_type *)allocx(delta_buffer_size, "diff matrix");

    /* initialize deltas so that first execution of kernel with overlapping
     * reduction on the host will work correctly and not prematurely exit
     */
    for (i=0; i<delta_size[X]; ++i) {
        for (j=0; j<delta_size[Y]; ++j) {
            delta[i * delta_size[Y] + j] = 1.0;
        }
    }

    /* create buffers for OpenCL device using host memory */
    array_size = sizeof(value_type) * (size[X]+2*GHOST_CELL_WIDTH) * ystride;
    a_buf[OLD] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                                array_size, NULL, &err);
    CLU_CHECK_ERROR("clCreateBuffer for a[OLD]", err);
    a_buf[NEW] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                                array_size, NULL, &err);
    CLU_CHECK_ERROR("clCreateBuffer for a[NEW]", err);
    delta_buf = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                                delta_buffer_size, NULL, &err);
    CLU_CHECK_ERROR("clCreateBuffer for delta", err);

    /* copy over buffers to device */
    CLU_CHECK_ERROR("clEnqueueWriteBuffer of a_buf[OLD] failed",
        clEnqueueWriteBuffer(queue, a_buf[OLD], CL_TRUE, 0, array_size, a[OLD], 0, NULL, NULL));
    CLU_CHECK_ERROR("clEnqueueWriteBuffer of a_buf[NEW] failed",
        clEnqueueWriteBuffer(queue, a_buf[NEW], CL_TRUE, 0, array_size, a[NEW], 0, NULL, NULL));

    /* set the kernel execution type  - data parallel */
    cluSetKernelNDRange(clu, kernel, DIMENSIONS, NULL, size, local_workblock_size);

    /*  iterate until maximum difference is less than the given tolerance
        or number of iterations is too high */
    do {
        /* swap array pointers for next iteration */
        SWAP_PTR(a[OLD], a[NEW]);
        SWAP_BUF(a_buf[OLD], a_buf[NEW]);

        cluRunKernel(clu, kernel, NULL, 6,
                    sizeof(cl_mem), (void *) &a_buf[OLD],
                    sizeof(cl_mem), (void *) &a_buf[NEW],
                    tile_delta_size, NULL,
                    tile_cache_size, NULL,
                    sizeof(cl_mem), (void *) &delta_buf,
                    sizeof(cl_uint), (void *) &ystride);

        /* while the kernel is running, calculate the reduction for the previous iteration */
        max_diff = ocl_jacobi_reduce(delta, delta_size);

        /* enqueue a synchronous copy of the delta. This will not occur until the kernel
         * has finished. The deltas for each workgroup is a much smaller array to process
         */
        CLU_CHECK_ERROR("clEnqueueReadBuffer of reduced deltas failed",
            clEnqueueReadBuffer(queue, delta_buf, CL_TRUE, 0,
                                delta_buffer_size, delta, 0, NULL, NULL));

        /*  If there is only one node then we can skip all of the MPI work, otherwise
            exchange ghost cells and avoid copying of data in and out of device memory
            which makes a big difference in the performance.
         */
        if (1 != mpi_ranks[X] * mpi_ranks[Y]) {
            /* read out ghost cells */
            read_ghost_cells_from_device(a_buf[NEW], a[NEW], size,
                                         mpi_ranks, rank_pos, queue, full_copy);

            /* swap the result ghost cells with the MPI neighbors */
            exchange_ghost_cells(a[NEW], size, mpi_ranks, rank_pos, mpi_comm);

            /* write out ghost cells */
            write_ghost_cells_to_device(a_buf[NEW], a[NEW], size,
                                        mpi_ranks, rank_pos, queue, full_copy);

            /* find the maximum value of max_diff across all of the MPI ranks */
            rc = MPI_Allreduce(MPI_IN_PLACE, &max_diff, 1, MPI_FLOAT, MPI_MAX, mpi_comm);
            if (MPI_SUCCESS != rc) {
                QUIT("MPI_Allreduce returned %d\n", rc);
            }
        }

        /* output status for user, overwrite the same line */
        if ((0 == iter % 100) && (0 == ranks[MY_RANK_INDEX])) {
            printf("Iteration=%5u, max difference=%0.7f, target=%0.7f\r",
                        iter, max_diff, tolerance);
            fflush(stdout);
        }


#ifdef DEBUG
        /* debug output, if needed (may need to copy out array from device) */
        {
            /* large enough for "Iter " plus any unsigned int; snprintf keeps
               the write inside the buffer in any case */
            char istring[32];
            CLU_CHECK_ERROR("clEnqueueReadBuffer of ghost cells failed",
                clEnqueueReadBuffer(queue, a_buf[NEW], CL_TRUE, 0, array_size, a[NEW], 0, NULL, NULL));
            snprintf(istring, sizeof(istring), "Iter %u", iter);
            print_array(istring, a[NEW], size, mpi_ranks, origin, d, mpi_comm);
        }
#endif
        /* increment the iteration counter */
        iter++;
    } while (max_diff > tolerance && max_iter >= iter); /* do loop */

    /* read back the final result */
    CLU_CHECK_ERROR("clEnqueueReadBuffer of result failed",
        clEnqueueReadBuffer(queue, a_buf[NEW], CL_TRUE,    0, array_size, a[NEW], 0, NULL, NULL));

    /* get end time and calculate difference */
    ftime(&stop_time);
    timer = (stop_time.time - start_time.time) +
            (stop_time.millitm - start_time.millitm)/1000.0f;

    /* output final iteration count and maximum difference value */
    if (0 == ranks[MY_RANK_INDEX]) {
        printf("Iteration=%5u, max difference=%0.7f, execution time=%.3f seconds\n",
                    iter-1, max_diff, timer);
        fflush(stdout);
    }

    /* finish usage of OpenCL device */
    CLU_CHECK_ERROR("clReleaseMemObject of a_buf[OLD] failed",
        clReleaseMemObject(a_buf[OLD]));
    CLU_CHECK_ERROR("clReleaseMemObject of a_buf[NEW] failed",
        clReleaseMemObject(a_buf[NEW]));
    CLU_CHECK_ERROR("clReleaseMemObject of deltas failed",
        clReleaseMemObject(delta_buf));
    free(delta);

    /* finish usage of clu */
    cluDestroy(clu);
}


/**************************************************************************
 Function: reference_jacobi_kernel

 Performs one Jacobi sweep on the host (no OpenCL). Every interior node of
 the output array is set to the average of its four direct neighbors in the
 input array (5 point stencil). Ghost and boundary cells of the output
 array are left untouched.

 params:
    a           input array
    size        size of array for this MPI rank, excluding ghost cells
    anew        output array
 returns:
    largest absolute change of any interior node during this sweep
 **************************************************************************/
static value_type reference_jacobi_kernel(value_type *a, value_type *anew,
                                          size_t size[DIMENSIONS]) {

    /* number of elements between the starts of consecutive x columns */
    const unsigned int ystride = size[Y]+2*GHOST_CELL_WIDTH;
    value_type worst = 0.0;
    unsigned int col, row;

    for (col=GHOST_CELL_WIDTH; col<size[X]+GHOST_CELL_WIDTH; ++col) {
        for (row=GHOST_CELL_WIDTH; row<size[Y]+GHOST_CELL_WIDTH; ++row) {
            /* flat index of the node itself; its neighbors sit one stride
               away in x and one element away in y */
            const unsigned int here = col * ystride + row;
            const value_type averaged = (value_type) (0.25 * (a[here - ystride] +
                                                              a[here + ystride] +
                                                              a[here - 1] +
                                                              a[here + 1]));

            worst = (value_type)fmax(worst, fabs(averaged - a[here]));
            anew[here] = averaged;
        }
    }

    return worst;
}


/**************************************************************************
 Function: reference_jacobi

 This routine contains the main iteration loop for the Jacobi iteration
 reference implementation (no OpenCL).

 params:
    a           two arrays to compute solution into
    max_iter    maximum number of iterations   
    size        size of array for this MPI rank
    tolerance   all differences should be less than this tolerance value
    mpi_ranks   number of MPI ranks in each dimension
    rank_pos    cartesian position of this rank    
    origin      origin for this rank
    d           discretion size
    mpi_comm    MPI communications structure
 **************************************************************************/
static void reference_jacobi(value_type *a[2],
                            unsigned int max_iter,
                            size_t size[DIMENSIONS],
                            value_type tolerance,
                            int mpi_ranks[DIMENSIONS],
                            int rank_pos[DIMENSIONS],
                            value_type origin[DIMENSIONS],
                            value_type d[DIMENSIONS],
                            MPI_Comm mpi_comm) {

    unsigned int rc, iter = 0;
    value_type max_diff, timer;
    struct timeb start_time, stop_time;

    /* init arrays by setting the initial value and the boundary conditions */
    set_initial_solution(a[OLD], size, INITIAL_GUESS);
    set_initial_solution(a[NEW], size, INITIAL_GUESS);
    set_boundary_conditions(a[OLD], size, mpi_ranks, rank_pos, origin, d);
    set_boundary_conditions(a[NEW], size, mpi_ranks, rank_pos, origin, d);

    /* print the initial solution guess */
    print_array("Init ", a[NEW], size, mpi_ranks, origin, d, mpi_comm);

    /* get start time */
    ftime(&start_time);

    /*  iterate until maximum difference is less than the given tolerance
        or number of iterations is too high
     */
    do {
        /* swap array pointers for next iteration */
        SWAP_PTR(a[OLD], a[NEW]);

        /* iterate using a[OLD] as the input and a[NEW] as the output */
        max_diff = reference_jacobi_kernel(a[OLD], a[NEW], size);

        /*  If there is only one node then we can skip all of the MPI work, otherwise
         *  exchange ghost cells and perform the reduction.
         */   
        if (1 != mpi_ranks[X] * mpi_ranks[Y]) {
            /* swap the result ghost cells with the MPI neighbors */
            exchange_ghost_cells(a[NEW], size, mpi_ranks, rank_pos, mpi_comm);

            /* find the maximum value of max_diff across all of the MPI ranks */
            rc = MPI_Allreduce(MPI_IN_PLACE, &max_diff, 1, MPI_FLOAT, MPI_MAX, mpi_comm);
            if (MPI_SUCCESS != rc) {
                QUIT("MPI_Allreduce returned %d\n", rc);
            }
        }

        /* output status for user, overwrite the same line */
        if ((0 == iter % 100) && (0 == ranks[MY_RANK_INDEX])) {
            printf("Iteration=%5d, max difference=%0.7f, target=%0.7f\r",
                iter, max_diff, tolerance);
            fflush(stdout);
        }

#ifdef DEBUG
        /* debug output, if needed */
        {
            char istring[10];
            sprintf(istring, "Iter %d", iter);
             print_array(istring, a[NEW], size, mpi_ranks, origin, d, mpi_comm);
        }
#endif

        /* increment counter */
        iter++;
    } while (max_diff > tolerance && max_iter > iter); /* do loop */

    /* get end time and calculate difference */
    ftime(&stop_time);
    timer = (stop_time.time - start_time.time) +
            (stop_time.millitm - start_time.millitm)/1000.0f;

    /* output final iteration count and maximum difference value */
    if (0 == ranks[MY_RANK_INDEX]) {
        printf("Iteration=%5d, max difference=%0.7f, execution time=%.3f seconds\n",
                    iter, max_diff, timer);
    }
}




/**************************************************************************
 Function: set_boundary_conditions

 Writes the Dirichlet boundary values into the outer cells of the given
 array, but only on the edges of the plate that this rank touches:
    u(x,0) = sin(pi * x)
    u(x,1) = sin(pi * x) * pow(e, -pi)
    u(0,y) = 0
    u(1,y) = 0

 params:
    a           array to set boundary conditions
    size        size of array for this MPI rank
    mpi_ranks   number of MPI ranks in each dimension
    rank_pos    cartesian position of this rank
    origin      origin for this rank
    d           discretization size
 **************************************************************************/
static void set_boundary_conditions(value_type *a,
                                    size_t size[DIMENSIONS],
                                    int mpi_ranks[DIMENSIONS],
                                    int rank_pos[DIMENSIONS],
                                    value_type origin[DIMENSIONS],
                                    value_type d[DIMENSIONS]) {

    /* number of elements in one row of the array, ghost cells included */
    const unsigned int stride = size[Y]+2*GHOST_CELL_WIDTH;

    /* which edges of the plate does this rank own? */
    const int on_bottom = (0 == rank_pos[Y]);
    const int on_top    = (mpi_ranks[Y]-1 == rank_pos[Y]);
    const int on_left   = (0 == rank_pos[X]);
    const int on_right  = (mpi_ranks[X]-1 == rank_pos[X]);

    unsigned int row, col;

    /* bottom edge (y = 0.0): first element of each interior row, corners excluded */
    if (on_bottom) {
        for (row=GHOST_CELL_WIDTH; row<size[X]+GHOST_CELL_WIDTH; ++row) {
            a[row * stride] =
                (value_type)sin(PI * (origin[X] + row * d[X]));
        }
    }

    /* top edge (y = 1.0): last element of each interior row, corners excluded */
    if (on_top) {
        const size_t top_col = size[Y]+GHOST_CELL_WIDTH;

        for (row=GHOST_CELL_WIDTH; row<size[X]+GHOST_CELL_WIDTH; ++row) {
            a[row * stride + top_col] =
                (value_type)(sin(PI * (origin[X] + row * d[X])) * exp(-PI));
        }
    }

    /* left edge (x = 0.0): the whole first row, corners included */
    if (on_left) {
        value_type *edge = a;

        for (col=0; col<stride; ++col) {
            edge[col] = 0;
        }
    }

    /* right edge (x = 1.0): the whole last row, corners included */
    if (on_right) {
        value_type *edge = a + (size[X]+GHOST_CELL_WIDTH) * stride;

        for (col=0; col<stride; ++col) {
            edge[col] = 0;
        }
    }
}


/**************************************************************************
 Function: set_initial_solution

 Fills every element of the given array, ghost cells included, with the
 initial guess value.

 params:
    a           array to fill
    size        size of array for this MPI rank
    guess       initial value
 **************************************************************************/
static void set_initial_solution(value_type *a,
                                 size_t size[DIMENSIONS],
                                 value_type guess) {

    /* number of elements in one row of the array, ghost cells included */
    const unsigned int stride = size[Y]+2*GHOST_CELL_WIDTH;
    unsigned int row, col, base;

    /* walk the array row by row and store the guess in each element */
    for (row=0; row<size[X]+2*GHOST_CELL_WIDTH; ++row) {
        /* flat index of the first element of this row */
        base = row * stride;

        for (col=0; col<stride; ++col) {
            a[base + col] = guess;
        }
    }
}


/**************************************************************************
 Function: main

 This is the main control flow for the example. After reading and verifying
 the command line arguments, MPI is initialized. Depending on which option
 is chosen one of 3 computations are called for analytical, Jacobi iteration
 using OpenCL, or Jacabo iteration using host and no OpenCL.

 After the computation is complete, a bitmap representation of the final
 array is stored in a PPM format file.

 **************************************************************************/
int main(int argc, char *argv[]) {

    /* size of matrices for iteration - input parameter */
    size_t mat_size[DIMENSIONS] = { DEFAULT_NODES, DEFAULT_NODES };

    /* number of MPI ranks in each dimension - input parameter */
    int mpi_ranks[DIMENSIONS] = { DEFAULT_RANKS, DEFAULT_RANKS };

    /* size of OpenCL workblock in each dimension - input parameter */
    size_t block_size[DIMENSIONS] = { DEFAULT_BLOCK, DEFAULT_BLOCK };

    /* my cartesian position in MPI space */
    int my_position[DIMENSIONS] = { 0, 0 };

    /* size of matrix for each rank */
    size_t rank_size[DIMENSIONS];

    /* size of the array needed */
    size_t array_size;

    /* discretion size in each dimension assuming a unit square */
    value_type d[DIMENSIONS];

    /* origin of this rank in the x,y plane, assuming a unit square */
    value_type my_origin[DIMENSIONS];

    /* maximum number of iterations */
    unsigned int max_iter = MAX_ITERATIONS;

    /* boolean for full copy of buffer or just ghost cells */
    unsigned int full_copy = 0;

    /* boolean for verificaton against reference implementation */
    unsigned int verify = 0;

    /* OpenCL device type - input parameter can override this */
    cl_device_type dev_type = CL_DEVICE_TYPE_DEFAULT;

    /* MPI communications area */
    MPI_Comm mpi_comm = NULL;

    /* print welcome message */
    MSG("A simple %dD iterative Jacobi solver", DIMENSIONS);
    if (1 == argc) {
        printf("; use -h or --help to get help");
    }
    printf(".\n");

    /* parse command line arguments and display them */
    get_arguments(argc, argv, &max_iter, mat_size, block_size, mpi_ranks,
                    &dev_type, &full_copy, &verify);
    MSG("Requested size is (%d,%d) with MPI space of [%d,%d].\n",
            (unsigned int)mat_size[X], (unsigned int)mat_size[Y],
            (unsigned int)mpi_ranks[X], (unsigned int)mpi_ranks[Y]);

    /* if required, initialize MPI with required size and determine my position */
    if (1 != mpi_ranks[X] * mpi_ranks[Y]) {
        mpi_comm = init_mpi(mpi_ranks, my_position);
    }

    /* calculate size of each node in the unit square plate */
    d[X] = (value_type)(1.0 / (mat_size[X]+1));
    d[Y] = (value_type)(1.0 / (mat_size[Y]+1));

    /* calculate size of array for a single MPI node */
    rank_size[X] = mat_size[X] / mpi_ranks[X];
    rank_size[Y] = mat_size[Y] / mpi_ranks[Y];

    /* calculate our origin in the overall space */
    my_origin[X] = my_position[X] * rank_size[X] * d[X];
    my_origin[Y] = my_position[Y] * rank_size[Y] * d[Y];

    /* flush output before starting the compute */
    fflush(stdout);

    /* calculate size of array and allocate the memory */
    /* size is 2 bigger to account for "ghost cells" and/or boundaries */
    /* exit_app takes care of cleaning up u */
    array_size = sizeof(value_type) * (rank_size[X]+2*GHOST_CELL_WIDTH) *
                                      (rank_size[Y]+2*GHOST_CELL_WIDTH);
    u[OLD] = (value_type *)allocx(array_size, "initial matrix");
    u[NEW] = (value_type *)allocx(array_size, "result matrix");

    /*  At this point the main computation can occur. For the purposes of this example
     *  we are using a simple Jacobi iteration technique. This converges very slowly. 
     */

    /* compute the exact solution, result in u[NEW] */
    if (EXACT_CALC == (signed int)dev_type) {
        MSG("Calculating solution using analytical formula.\n");
        exact_compute(u[NEW], rank_size, my_origin, d);
    } else if (REFERENCE_CALC == (signed int)dev_type) {
        /* compute solution using reference implementation, result in u[NEW] */
        MSG("Estimating solution using Jacobi reference implementation.\n");
        reference_jacobi(u, max_iter, rank_size, DEFAULT_TOLERANCE, mpi_ranks,
                      my_position, my_origin, d, mpi_comm);

    /* compute solution using OpenCL kernel, result in u[NEW] */
    } else {
        ocl_jacobi(u, max_iter, rank_size, DEFAULT_TOLERANCE, mpi_ranks,
                   my_position, my_origin, d, mpi_comm,
                   block_size, dev_type, full_copy);
    }

    /* print solution */
    print_array("Solve", u[NEW], rank_size, mpi_ranks, my_origin, d, mpi_comm);

    /* perform verification if required */
    if (verify) {
        unsigned int i, j, ystride;
        value_type max_diff = 0.0;

        /* allocate for verification matrices */
        v[OLD] = (value_type *)allocx(array_size, "verify initial matrix");
        v[NEW] = (value_type *)allocx(array_size, "verify result matrix");

        /* compute verfication solution, result in v[NEW] */
        MSG("Starting verification using Jacobi reference implementation.\n");
        reference_jacobi(v, max_iter, rank_size, DEFAULT_TOLERANCE, mpi_ranks,
                      my_position, my_origin, d, mpi_comm);

        /* compute array differences and store in v[OLD] */
        for (i=0; i<rank_size[X]+2*GHOST_CELL_WIDTH; ++i) {
            /* convenience for y stride in array */
            ystride = rank_size[Y]+2*GHOST_CELL_WIDTH;

            for (j=0; j<ystride; ++j) {
                /* calculate difference and update maximum difference */
                v[OLD][i * ystride + j] = v[NEW][i * ystride + j] - u[NEW][i * ystride + j];
                max_diff = (value_type)fmax(max_diff, fabs(v[OLD][i * ystride + j]));
            }
        }

        /* output differences */
        print_array("Verify Diff", v[OLD], rank_size, mpi_ranks, my_origin, d, mpi_comm);
        MSG("Verification complete, max difference with reference implemention is %0.5f.\n", max_diff);

        /* return error code if max difference is too large */
        if (max_diff > 0.0001) {
            QUIT("Error: max difference greater than required tolerance.\n");
        }
    }

    /* save bitmap of computation */
    save_bitmap(u[NEW], rank_size, mpi_ranks, my_position);

    /* deallocate arrays and finish use of MPI */
    exit_app(0);
    return 0;
}

