/*************************************************************************/
/*                                                                       */
/* Licensed Materials - Property of IBM                                  */
/*                                                                       */
/*                                                                       */
/* (C) Copyright IBM Corp. 2009, 2010                                    */
/* All Rights Reserved                                                   */
/*                                                                       */
/* US Government Users Restricted Rights - Use, duplication or           */
/* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.     */
/*                                                                       */
/*************************************************************************/

#include <math.h>
#include <stdio.h>
#include <getopt.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <libgen.h>
#include <unistd.h>

#include <CL/opencl.h>

#include "clock.h"
#include "clu.h"
#include "bsop.h"

/* Size of the device_name character buffer declared in bsop(). */
#define MAX_DEVICE_NAME_LEN 128
/* Size of the kernel_selector character buffer declared in bsop(). */
#define MAX_KERNEL_SELECTOR_LEN 128

/* Here is the file where the kernel code resides.
 * The array is sized exactly: the 7 characters of "bsop.cl" plus the terminating NUL. */
char kernel_source_file[8] = "bsop.cl";

/* Variables used to manage the OpenCL environment.
 * kernels, cmd_queues, devices and memobjs are all indexed by device number in the
 * driver routines below; memobjs[d][0] .. memobjs[d][6] are the seven memory objects
 * passed as the first seven kernel arguments for device d. */
cl_kernel* kernels;
cl_command_queue* cmd_queues;
cl_device_id* devices;
cl_mem **memobjs;
clu_t clu;

/* One-dimensional NDRange sizes; filled in by the bsop_range* drivers before launch. */
size_t global_work_size[1];
size_t local_work_size[1];

/* Set to 1 once a range driver has checked the local work group size, so that the
 * check is performed only once. */
cl_int wg_validity_checked = 0;


/* =================================================================================================== */
/*  Print command and command line argument usage to stdout. */
/*  The "(default: ...)" lines are generated from the DEFAULT_* constants, so the help text always */
/*  reports the defaults this binary was built with. */
/* =================================================================================================== */

void usage()
{
  printf("\n");
  printf("Usage: bsop [DEVICE] [KERNEL] [OPTIONS]\n");
  printf("\n");
  printf(" Device Type:\n");
  printf("\n");
  printf("  -a, --accel              use CBEA Accelerator for compute\n");
  printf("                           (default for Cell/B.E. machines)\n");
  printf("  -c, --cpu                use CPU for compute\n");
  printf("  -g, --gpu                use GPU for compute\n");
  printf("                           (default for GPU-equipped machines)\n");
  printf("\n");
  printf(" Kernel Type:\n");
  printf("\n");
  printf("  --rangeLS                Use the NDRange kernel which directly indexes\n");
  printf("                           into main memory.\n");
  printf("  --rangeAWGC              Use the NDRange kernel which accesses the main\n");
  printf("                           array using asynchronous workgroup copies.\n");
  printf("                           (Intended for local work group sizes greater\n");
  printf("                           than one.)\n");
  printf("                           (default for GPU-equipped machines)\n");
  printf("  --taskDB                 Use the Task kernel which performs double-buffered\n");
  printf("                           copies of data into local memory.\n");
  printf("                           (default for Cell/B.E. machines)\n");
  printf("  --taskSB                 Use the Task kernel which performs single-buffered\n");
  printf("                           copies of data into local memory.\n");
  printf("  --taskLS                 Use the Task kernel which directly indexes into\n");
  printf("                           main memory.\n");
  printf("\n");
  printf(" General Kernel Options:\n");
  printf("\n");
  printf("  -A N, --arraysize=N      Use N for the arraysize, where N is a power of 2\n");
  printf("                           between 1 and 16777216.\n");
  printf("                           (default: %d)\n", DEFAULT_ARRAY_SIZE);
  printf("  -w N, --vectorwidth=N\n");
  printf("                           Number of elements (1, 2, 4, 8, or 16) to process\n");
  printf("                           per kernel (or per loop within a Task).\n");
  /* The default width is either a fixed number or "ask the device". */
  if (DEFAULT_VECTOR_WIDTH != AUTO_VECTOR_WIDTH) {
    printf("                           (default: %d)\n", DEFAULT_VECTOR_WIDTH);
  } else {
    printf("                           (default: device preferred width)\n");
  }
  printf("\n");
  printf("  -u, --buffer [none|use|alloc|copy|alloc_copy] Selects the buffer scheme:\n");
  printf("                           none - the application allocates and initializes\n");
  printf("                              temporary buffers, the OpenCL runtime allocates OpenCL\n");
  printf("                              memory objects and initializes them by writing from the\n");
  printf("                              temporary buffers into them, and the application frees \n");
  printf("                              the temporary buffers.\n");
  printf("                           use - the application allocates and initializes buffers on\n");
  printf("                              the host space and the OpenCL runtime uses them directly.\n");
  printf("                           alloc - the OpenCL runtime allocates buffers, and the\n");
  printf("                              application maps, initializes, and unmaps them. (default)\n");
  printf("                           copy or alloc_copy - the application allocates and initializes\n");
  printf("                              temporary host buffers, the OpenCL runtime allocates buffers and\n");
  printf("                              initializes them by copying from the temp buffers into them,\n");
  printf("                              and the application frees the temp buffers.\n");
  printf("\n");
  printf(" --fastmath /--nofastmath  Enable fast native math option or not\n");
  printf("                           The fastmath version will enable the \n");
  printf("                           cl-fast-relaxed-math build option and the\n");
  printf("                           native versions of the math functions\n");
  printf("                           (default: %s)\n",
         DEFAULT_FASTMATH_FLAG == 1 ? "--fastmath" : "--nofastmath");
  printf("\n");


  printf(" NDRange Kernel Options: (valid only with --rangeLS or --rangeAWGC)\n");
  printf("\n");
  printf("  -l N, --lwgsize=N        Use a local workgroup size of N, where N is in:\n");
  printf("                           { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048}.\n");
  printf("                           (default: %d)\n",
         DEFAULT_LOCAL_WORK_GROUP_SIZE);
  printf("\n");

  printf(" General Options:\n");
  printf("\n");
  printf("  --numa / --nonuma        Enable NUMA or not\n");
  printf("                           (default: %s)\n",
         DEFAULT_NUMA_FLAG == 1 ? "--numa" : "--nonuma");

  printf("\n");
  printf("  --double / --single      Use double precision or single precision\n");
  printf("                           (default: %s)\n",
         DEFAULT_DOUBLE_FLAG == 1 ? "--double" : "--single");
  printf("  --verify / --noverify    Verify or skip verification of computed output\n");
  printf("                           (default: %s)\n",
         DEFAULT_VERIFY_FLAG == 1 ? "--verify" : "--noverify");
  printf("  --verbose / --noverbose  Produce verbose output messages.\n");
  printf("                           (default: %s)\n",
         DEFAULT_VERBOSE_FLAG == 1 ? "--verbose" : "--noverbose");
  printf("  -h, --help               This usage message\n");
}

/* =================================================================================================== */
/*  Map a kernel (code) selector value to the name of the command line option that selects it. */
/*  Values that match none of the known selectors yield "<unknown>". */
/* =================================================================================================== */

const char *code_selector_to_string(int code_selector)
{
  const char *label = "<unknown>";

  switch (code_selector) {
  case RANGE_LOAD_STORE:
    label = "rangeLS";
    break;
  case RANGE_ASYNC_WORKGROUP_COPY:
    label = "rangeAWGC";
    break;
  case TASK_DOUBLE_BUFFER:
    label = "taskDB";
    break;
  case TASK_SINGLE_BUFFER:
    label = "taskSB";
    break;
  case TASK_LOAD_STORE:
    label = "taskLS";
    break;
  default:
    /* keep the "<unknown>" placeholder */
    break;
  }

  return label;
}

/* =================================================================================================== */
/*  Map a buffer scheme selector value to the keyword accepted by the -u / --buffer option. */
/*  Values that match none of the known schemes yield "<unknown>". */
/* =================================================================================================== */

const char *buffer_selector_to_string(int buffer_selector)
{
  if (buffer_selector == NONE_BUFFER) {
    return "none";
  }
  if (buffer_selector == ALLOC_BUFFER) {
    return "alloc";
  }
  if (buffer_selector == COPY_BUFFER) {
    return "copy";
  }
  if (buffer_selector == ALLOC_COPY_BUFFER) {
    return "alloc_copy";
  }
  if (buffer_selector == USE_BUFFER) {
    return "use";
  }

  /* Not one of the schemes listed above. */
  return "<unknown>";
}

  /* Global variables used to hold user-specified overrides and intermediate control values derived from them.
   * Each one starts at its compile-time default and is overwritten by parse_cmdline() when the
   * corresponding command line option is present. */
  /* Device class to run on; set by -a/--accel, -c/--cpu or -g/--gpu. */
  cl_device_type device_type = CL_DEVICE_TYPE_DEFAULT;
  /* Kernel variant; set by --rangeLS, --rangeAWGC, --taskDB, --taskSB or --taskLS. */
  int code_selector = DEFAULT_CODE_SELECTOR;
  /* Elements processed per work item; set by -w/--vectorwidth (power of 2, 1..16). */
  int vector_width = DEFAULT_VECTOR_WIDTH;
  /* Local work group size; set by -l/--lwgsize (power of 2, 1..2048). */
  cl_ulong local_work_group_size = DEFAULT_LOCAL_WORK_GROUP_SIZE;
  /* Number of array elements; set by -A/--arraysize (power of 2, 1..16777216). */
  cl_ulong array_size = DEFAULT_ARRAY_SIZE;
  /* 1 to verify the computed output, 0 to skip; set by --verify / --noverify. */
  int verify_flag = DEFAULT_VERIFY_FLAG;
  /* 1 for double precision, 0 for single; set by --double / --single. */
  int double_flag = DEFAULT_DOUBLE_FLAG;
  /* 1 to request NUMA sub-devices, 0 otherwise; set by --numa / --nonuma. */
  int numa_flag = DEFAULT_NUMA_FLAG;
  /* 1 to enable the fast/native math variant; set by --fastmath / --nofastmath. */
  int use_fast_native_math = DEFAULT_FASTMATH_FLAG;
  /* 1 for verbose progress messages; set by --verbose / --noverbose. */
  int verbose_flag = DEFAULT_VERBOSE_FLAG;
  /* Buffer allocation scheme; set by -u/--buffer. */
  int buffer_selector = DEFAULT_BUFFER_SELECTOR;

/* =================================================================================================== */
/*  Parse the command line arguments and return selector variables */
/* =================================================================================================== */

void parse_cmdline(int argc, char **argv)
{
  int opt;
  int option_index;
  long int argval;
  char *endptr;
  char *name;

  struct option long_options[] = {
    {"help", no_argument, NULL, 'h'},
    {"accel", no_argument, NULL, 'a'},
    {"cpu", no_argument, NULL, 'c'},
    {"gpu", no_argument, NULL, 'g'},
    {"rangeLS", no_argument, &code_selector, RANGE_LOAD_STORE},
    {"rangeAWGC", no_argument, &code_selector, RANGE_ASYNC_WORKGROUP_COPY},
    {"taskDB", no_argument, &code_selector, TASK_DOUBLE_BUFFER},
    {"taskSB", no_argument, &code_selector, TASK_SINGLE_BUFFER},
    {"taskLS", no_argument, &code_selector, TASK_LOAD_STORE},
    {"vectorwidth", required_argument, NULL, 'w'},
    {"arraysize", required_argument, NULL, 'A'},
    {"lwgsize", required_argument, NULL, 'l'},
    {"verbose", no_argument, &verbose_flag, 1},
    {"noverbose", no_argument, &verbose_flag, 0},
    {"verify", no_argument, &verify_flag, 1},
    {"noverify", no_argument, &verify_flag, 0},
    {"double", no_argument, &double_flag, 1},
    {"single", no_argument, &double_flag, 0},
    {"numa", no_argument, &numa_flag, 1},
    {"nonuma", no_argument, &numa_flag, 0},
    {"fastmath", no_argument, &use_fast_native_math, 1},
    {"nofastmath", no_argument, &use_fast_native_math, 0},
    {"buffer", required_argument, NULL, 'u'},
    {NULL, 0, NULL, 0}
  };

  /* Change current working directory to that of the invocation path so that bsop can
   * be run from any current working directory.
   */
  name = basename(argv[0]);
  (void)chdir(dirname(argv[0]));

  while (1) {
    opt =
        getopt_long(argc, argv, "hacgw:A:n:l:mu:", long_options, &option_index);

    if (opt == -1) {
      break;
    }

    switch (opt) {

      /*  -h, --help */
    case 'h':
      usage();
      exit(EXIT_SUCCESS);

      /*  -a, --accel */
    case 'a':
      device_type = CL_DEVICE_TYPE_ACCELERATOR;
      break;

      /*  -c, --cpu */
    case 'c':
      device_type = CL_DEVICE_TYPE_CPU;
      break;

      /*  -g, --gpu */
    case 'g':
      device_type = CL_DEVICE_TYPE_GPU;
      break;

      /*  -w, --vectorwidth */
    case 'w':
      argval = strtol(optarg, &endptr, 0);
      if (endptr[0] != '\0') {
        printf("%s: Invalid characters found parsing '%s'\n", name, argv[optind - 1]);
        exit(EXIT_FAILURE);
      }
      vector_width = (int) argval;
      if ((vector_width < 1) || (vector_width > 16)
          || (vector_width & (vector_width - 1))) {
        printf
            ("%s: vectorwidth must be a power of 2 between 1 and 16, inclusive.  You entered %d\n",
             name, (int) vector_width);
        exit(EXIT_FAILURE);
      }
      break;

      /*  -A, --arraysize */
    case 'A':
      argval = strtol(optarg, &endptr, 0);
      if (endptr[0] != '\0') {
        printf("%s: Invalid characters found parsing '%s'\n", name, argv[optind - 1]);
        exit(EXIT_FAILURE);
      }
      array_size = argval;
      if ((array_size < 1) || (array_size > 16777216)
          || (array_size & (array_size - 1))) {
        printf
            ("%s: arraysize must be a power of 2 between 1 and 16777216, inclusive.  You entered %d\n",
             name, (int) array_size);
        exit(EXIT_FAILURE);
      }
      break;

      /*  -l, --lwgsize */
    case 'l':
      argval = strtol(optarg, &endptr, 0);
      if (endptr[0] != '\0') {
        printf("%s: Invalid characters found parsing '%s'\n", name, argv[optind - 1]);
        exit(EXIT_FAILURE);
      }
      local_work_group_size = argval;
      if ((local_work_group_size < 1) || (local_work_group_size > 2048)
          || (local_work_group_size & (local_work_group_size - 1))) {
        printf
            ("%s: lwgsize must be a power of 2 between 1 and 2048, inclusive.  You entered %d\n",
             name, (int) local_work_group_size);
        exit(EXIT_FAILURE);
      }
      break;

    case 'u':
      if (strcmp(optarg, "none") == 0)
        buffer_selector = NONE_BUFFER;
      else if (strcmp(optarg, "alloc") == 0)
        buffer_selector = ALLOC_BUFFER;
      else if (strcmp(optarg, "copy") == 0)
        buffer_selector = COPY_BUFFER;
      else if (strcmp(optarg, "alloc_copy") == 0)
        buffer_selector = ALLOC_COPY_BUFFER;
      else if (strcmp(optarg, "use") == 0)
        buffer_selector = USE_BUFFER;
      else {
        printf("%s: buffer option must be either none, alloc, copy, alloc_copy or use. You entered: %s\n",
	       name, optarg);
        exit(EXIT_FAILURE);
      }
      break;
    case '?':
      printf("Try '%s --help' for more information.\n", name);
      exit(EXIT_FAILURE);

    }
  }

  if (optind != argc) {
    printf("%s: unrecognized option '%s'.\n", name, argv[optind]);
    printf("Try '%s --help' for more information.\n", name);
    exit(EXIT_FAILURE);
  }
}

/* =================================================================================================== */
/*  N validation function: the first of the two reference routines used by validation */
/*  (bsop_reference calls it in two places). */
/*  Estimates the cumulative distribution function with a quintic polynomial in */
/*  k = 1 / (1 + 0.2316419 * |x|).  Negative arguments are handled by evaluating at |x| and */
/*  returning one minus that result. */
/* =================================================================================================== */

double N(double x)
{
  /* Polynomial coefficients, from the k^5 term down to the k^1 term. */
  static const double coeff[5] = {
    1.330274429, -1.821255978, 1.781477937, -0.356563782, 0.319381530
  };
  const double inv_root_2pi = 0.39894228;
  const int negative = (x < 0);
  double k, poly, density, upper;
  int idx;

  if (negative) {
    x = -x;
  }

  k = 1.0 / (1.0 + 0.2316419 * x);

  /* Horner evaluation; the final multiply by k accounts for the missing constant term. */
  poly = coeff[0];
  for (idx = 1; idx < 5; idx++) {
    poly = k * poly + coeff[idx];
  }
  poly = k * poly;

  density = exp(-0.5 * x * x) * inv_root_2pi;
  upper = 1.0 - density * poly;

  if (negative) {
    return 1.0 - upper;
  }
  return upper;
}

/* =================================================================================================== */
/*  BSOP Reference validation function.  Evaluates the closed-form option price on the host in */
/*  double precision, invoking N in two places (for d1 and for d2). */
/*    cpflag - non-zero selects the c (call) value, zero selects the p (put) value */
/*    S0, K, r, sigma, T - the inputs of the formula */
/*  No argument checking is done: sigma * sqrt(T) is used as a divisor as-is. */
/* =================================================================================================== */

double bsop_reference(int cpflag, double S0, double K, double r,
                      double sigma, double T)
{
  const double vol_root_t = sigma * sqrt(T);
  const double discount = exp(-r * T);
  const double d1 = (log(S0 / K) + (r + 0.5 * sigma * sigma) * T) / vol_root_t;
  const double d2 = d1 - vol_root_t;
  const double Nd1 = N(d1);
  const double Nd2 = N(d2);

  if (cpflag) {
    return S0 * Nd1 - K * discount * Nd2;
  }
  return K * discount * (1.0 - Nd2) - S0 * (1.0 - Nd1);
}

/* =================================================================================================== */
/*  Validation function. This function invokes bsop_reference.                                         */
/*  For every element it recomputes the answer on the host and takes the smaller of the absolute and   */
/*  the relative difference as that element's error.                                                   */
/*    S0_fptr .. T_fptr - input arrays, array_size elements each                                       */
/*    answer_fptr       - computed results to check                                                    */
/*    cpflag_fptr       - per-element flag; the leading sizeof(int) bytes of each element are read as  */
/*                        an int and passed to bsop_reference as cpflag                                */
/*    maxouterr         - out: largest error found (-1.0 when array_size is 0)                         */
/*    maxouterrindex    - out: index of that element (-1 when array_size is 0)                         */
/*  An element whose error is NaN (NaN answer or NaN reference) is counted as an error of HUGE_VAL,    */
/*  because a NaN never compares greater than the running maximum and would otherwise pass unnoticed.  */
/* =================================================================================================== */

template <typename FLOAT> void validate(FLOAT *S0_fptr, FLOAT *K_fptr, FLOAT *r_fptr,
                                        FLOAT *sigma_fptr, FLOAT *T_fptr, FLOAT *answer_fptr,
                                        FLOAT *cpflag_fptr, unsigned long array_size,
                                        double *maxouterr, int *maxouterrindex)
{
  *maxouterr = -1.0;
  *maxouterrindex = -1;
  unsigned long i;
  for (i = 0; i < array_size; i += 1) {
    cl_double a, b, absb, del, relerr, outerr;
    int cpflag;

    /* Read the flag bits with memcpy instead of dereferencing an int* that aliases a FLOAT. */
    memcpy(&cpflag, &cpflag_fptr[i], sizeof cpflag);

    a = (cl_double) answer_fptr[i];
    b = bsop_reference(cpflag, (cl_double) S0_fptr[i],
                       (cl_double) K_fptr[i], (cl_double) r_fptr[i],
                       (cl_double) sigma_fptr[i], (cl_double) T_fptr[i]);

    del = a - b;
    del = (del < 0.0f) ? -del : del;
    absb = (b < 0.0f) ? -b : b;
    relerr = del / absb;

    /* Smaller of absolute and relative error; a NaN relerr (0/0) falls back to del. */
    outerr = (del > relerr) ? relerr : del;

    /* x != x is true only for NaN. */
    if (outerr != outerr) {
      outerr = HUGE_VAL;
    }

    if (outerr > *maxouterr) {
      *maxouterr = outerr;
      *maxouterrindex = i;
    }
  }
}


/*===================================================================================================
 * Specialist driver for the NDRange kernel which performs simple load-store operations to effect
 * data movement in and out.
 *
 * Fills in the global and local work sizes, launches the kernel once per device with that device's
 * seven memory objects, then waits for every command queue to finish.  The local work group size
 * is checked against the device only until the first successful check (wg_validity_checked);
 * an unsupported size terminates the program.  Always returns 0.
 *===================================================================================================*/
int bsop_rangeLS(cl_ulong array_size,
                 cl_ulong local_work_group_size,
                 int num_devices,
                 int vector_width,
                 int verbose_flag)
{
  /* Number of work groups needed to handle the array (only used when kernel is not a Task) */
  const cl_ulong num_workgroups = array_size / (vector_width * local_work_group_size);

  /* The work items are split evenly across the devices. */
  global_work_size[0] = (size_t) ((num_workgroups * local_work_group_size)/num_devices);
  local_work_size[0] = (size_t) local_work_group_size;

  if (verbose_flag) {
    printf("BlackScholes workload: global_work_size = %d, local_work_size = %d\n",
           (int) global_work_size[0], (int) local_work_size[0]);
  }

  /* Launch on every device. */
  for (int dev = 0; dev < num_devices; dev++) {
    cl_mem *bufs = memobjs[dev];

    /* make sure the local work group size is okay */
    if (!wg_validity_checked) {
      if (cluCheckLocalWorkgroupSize (devices[dev], kernels[dev], 1 , local_work_size) == CL_FALSE) {
        fprintf (stderr, "BlackScholes workload error: local work group size = %zu is not supported\n",
                 local_work_size[0]);
        exit (EXIT_FAILURE);
      }
      wg_validity_checked = 1;
    }

    cluSetKernelNDRange (clu, kernels[dev], 1, NULL, global_work_size, local_work_size);
    cluRunKernel (clu, kernels[dev], NULL, 7,
                  sizeof (cl_mem), (void*) &bufs[0],
                  sizeof (cl_mem), (void*) &bufs[1],
                  sizeof (cl_mem), (void*) &bufs[2],
                  sizeof (cl_mem), (void*) &bufs[3],
                  sizeof (cl_mem), (void*) &bufs[4],
                  sizeof (cl_mem), (void*) &bufs[5],
                  sizeof (cl_mem), (void*) &bufs[6]);
  }

  /* Wait for all devices before returning. */
  for (int dev = 0; dev < num_devices; dev++) {
    cl_int err = clFinish (cmd_queues[dev]);
    CLU_CHECK_ERROR ("clFinish", err);
  }

  return 0;
}

/*===================================================================================================*/
/* This specialist driver routine prepares for a call to an NDRange kernel which performs            */
/* single-buffered async_work_group_copy calls to read and write data for an entire local work group.*/
/*===================================================================================================*/

template <typename FLOAT> int bsop_rangeAWGC(FLOAT primer, cl_ulong array_size,
                                             cl_ulong local_work_group_size,
                                             int num_devices, 
                                             int vector_width,
                                             int verbose_flag)
{
  int kernel_size;
  cl_int err;
  cl_ulong num_workgroups;
  int i;

  /* Compute the number of work groups needed to handle the array (only used when kernel is not a Task) */
  num_workgroups = array_size / (vector_width * local_work_group_size);

  global_work_size[0] = (size_t) ((num_workgroups * local_work_group_size)/num_devices);
  local_work_size[0] = (size_t) local_work_group_size;
  if (verbose_flag) {
    printf("BlackScholes workload: global_work_size = %d, local_work_size = %d\n",
           (int) global_work_size[0], (int) local_work_size[0]);
  }

  kernel_size = vector_width * 2 * local_work_group_size;


  for (i = 0; i < num_devices; i++)
  {
    if (wg_validity_checked == 0)
    {
      /* make sure the local work group size is okay */
      if (cluCheckLocalWorkgroupSize (devices[i], kernels[i], 1 , local_work_size) == CL_FALSE)
      {
        fprintf (stderr, "BlackScholes workload error: local work group size = %zu is not supported\n", 
            local_work_size[0]);
        exit (EXIT_FAILURE);
      }

      wg_validity_checked = 1;

    }

    cluSetKernelNDRange (clu, kernels[i], 1, NULL, global_work_size, local_work_size);
    cluRunKernel (clu, kernels[i], NULL, 14,
                                         sizeof (cl_mem), (void*) &memobjs[i][0],
                                         sizeof (cl_mem), (void*) &memobjs[i][1],
                                         sizeof (cl_mem), (void*) &memobjs[i][2],
                                         sizeof (cl_mem), (void*) &memobjs[i][3],
                                         sizeof (cl_mem), (void*) &memobjs[i][4],
                                         sizeof (cl_mem), (void*) &memobjs[i][5],
                                         sizeof (cl_mem), (void*) &memobjs[i][6],
                                         sizeof(FLOAT) * kernel_size, NULL,
                                         sizeof(FLOAT) * kernel_size, NULL,
                                         sizeof(FLOAT) * kernel_size, NULL,
                                         sizeof(FLOAT) * kernel_size, NULL,
                                         sizeof(FLOAT) * kernel_size, NULL,
                                         sizeof(FLOAT) * kernel_size, NULL,
                                         sizeof(FLOAT) * kernel_size, NULL);
  } // for i

  for (i = 0; i < num_devices; i++)
  {
    err = clFinish (cmd_queues[i]);
    CLU_CHECK_ERROR ("clFinish", err);
  }


  return 0;
}

/*===================================================================================================*/
/* This specialist driver routine prepares for calls to a "task" kernel which operates on large      */
/* subsections of the input and output array.  It serves all task kernels: TASK_LOAD_STORE gets the  */
/* 9-argument form, every other selector gets the 17-argument form with size-only buffer arguments.  */
/*                                                                                                   */
/*   array_size      - number of array elements                                                      */
/*   copy_size       - passed to the kernel as its last argument; twice this value is the byte size  */
/*                     of each of the seven size-only (NULL pointer) arguments                       */
/*   num_devices     - number of devices; each runs num_total_tasks / num_devices tasks              */
/*   code_selector   - which task kernel is in use                                                   */
/*   vector_width    - elements handled per work item                                                */
/*   verbose_flag    - non-zero prints the task count                                                */
/*   num_total_tasks - total number of tasks over all devices                                        */
/*                                                                                                   */
/* The task index passed to the kernel restarts at 0 for every device.  Always returns 0.            */
/*===================================================================================================*/
int bsop_task(cl_ulong array_size,
              cl_uint copy_size,
              unsigned int num_devices,
              int code_selector,
              int vector_width,
              int verbose_flag,
              unsigned int num_total_tasks)
{
  int num_work_items_per_task;
  cl_ulong wgsz;
  unsigned int task_index, dev;

  /* Every size argument in a cluRunKernel call in this file is a size_t (sizeof results).
   * The buffer size is therefore converted to size_t as well, so that all size slots of
   * the variadic call have the same width.  The value itself is still computed as
   * 2 * copy_size in cl_uint arithmetic, exactly as before.
   * NOTE(review): cluRunKernel's prototype is not visible here - confirm it reads size_t. */
  const size_t buffer_bytes = (size_t) (2 * copy_size);

  wgsz = array_size / vector_width;     /* Unroll by vector width, for example, 4 */

  if (verbose_flag) {
    printf("BlackScholes workload: number of tasks = %u\n", num_total_tasks);
  }
  num_work_items_per_task = wgsz / num_total_tasks;

  for (dev = 0; dev < num_devices; dev++)
  {
    /* Execute the kernel as tasks */
    for (task_index = 0; task_index < num_total_tasks/num_devices; ++task_index) {
      if (code_selector == TASK_LOAD_STORE) {
        cluRunKernel (clu, kernels[dev], NULL, 9,
                                             sizeof (cl_mem), (void*) &memobjs[dev][0],
                                             sizeof (cl_mem), (void*) &memobjs[dev][1],
                                             sizeof (cl_mem), (void*) &memobjs[dev][2],
                                             sizeof (cl_mem), (void*) &memobjs[dev][3],
                                             sizeof (cl_mem), (void*) &memobjs[dev][4],
                                             sizeof (cl_mem), (void*) &memobjs[dev][5],
                                             sizeof (cl_mem), (void*) &memobjs[dev][6],
                                             sizeof (cl_uint), (void*) &task_index,
                                             sizeof (cl_uint), (void*) &num_work_items_per_task);
      } else {
        cluRunKernel (clu, kernels[dev], NULL, 17,
                                             sizeof (cl_mem), (void*) &memobjs[dev][0],
                                             sizeof (cl_mem), (void*) &memobjs[dev][1],
                                             sizeof (cl_mem), (void*) &memobjs[dev][2],
                                             sizeof (cl_mem), (void*) &memobjs[dev][3],
                                             sizeof (cl_mem), (void*) &memobjs[dev][4],
                                             sizeof (cl_mem), (void*) &memobjs[dev][5],
                                             sizeof (cl_mem), (void*) &memobjs[dev][6],
                                             sizeof (cl_uint), (void*) &task_index,
                                             buffer_bytes, NULL,
                                             buffer_bytes, NULL,
                                             buffer_bytes, NULL,
                                             buffer_bytes, NULL,
                                             buffer_bytes, NULL,
                                             buffer_bytes, NULL,
                                             buffer_bytes, NULL,
                                             sizeof (cl_uint), (void*) &num_work_items_per_task,
                                             sizeof (cl_uint), (void*) &copy_size);
      }
    } // for task_index
  } // for dev

  /* Wait for all devices before returning. */
  for (dev = 0; dev < num_devices; dev++)
  {
    cl_int err = clFinish (cmd_queues[dev]);
    CLU_CHECK_ERROR ("clFinish", err);
  }


  return 0;
}

template <typename FLOAT> int bsop(FLOAT primer) {

  unsigned int i;
  cl_int rc, err;
  cl_uint max_compute_units;
  cl_ulong* num_tasks;
  cl_context context;
  cl_device_id device_id;
  
  cl_uint copy_size = DEFAULT_COPY_SIZE;
  unsigned char** rawbufs;
  unsigned int num_total_tasks = 0;

  /* Variables used for performance measurement */
  float timev1;
  float timev2;
  float timev3;
  float timev4;
  float timev5;
  float timev6;
  float timev7;

  /* Pointers used to allocate memory and split that memory into input and output arrays */
  /* These pointers point to the data buffers needed for Black Scholes computation */
  FLOAT *cpflag_fptr;
  FLOAT *S0_fptr, *K_fptr, *r_fptr, *sigma_fptr, *T_fptr, *answer_fptr;
  void *cpflag, *S0, *K, *r, *sigma, *T, *answer;

  char device_name[MAX_DEVICE_NAME_LEN] = { 0 };
  char kernel_selector[MAX_KERNEL_SELECTOR_LEN];

  unsigned int num_devices = 0;

  int numa_available = 0;
#if defined(cl_ext_migrate_memobject) && defined(cl_ext_device_fission)
  clCreateSubDevicesEXT_fn clCreateSubDevicesEXT = NULL;
  clReleaseDeviceEXT_fn clReleaseDeviceEXT = NULL;
  clEnqueueMigrateMemObjectEXT_fn clEnqueueMigrateMemObjectEXT = NULL;
#endif

  /* ================================================================ */
  /*  Setup */
  /* ================================================================ */

  /* Start timing when setup begins */
  startclock();
  timev1 = readclock();      

  clu = cluInit(NULL);
  /* get the cl_context from CLU */
  context = cluGetCLContext (clu);

  /* get the root device */
  err = clGetDeviceIDs (clu->platform, device_type, 1, &device_id, NULL);
  CLU_CHECK_ERROR ("clGetDeviceIDs", err);

  /* query the device properties and supported extensions, we want out-of-order queue and numa    */
  cl_command_queue_properties device_q_prop;
  err = clGetDeviceInfo (device_id, CL_DEVICE_QUEUE_PROPERTIES, sizeof (cl_command_queue_properties), &device_q_prop, NULL);

  /* see if extension required for double precision is supported on the device */

  if (double_flag) {
    if (cluCheckDeviceExtensions(device_id, "cl_khr_fp64") == CL_FALSE) {
    //if (cluCheckDeviceExtensions(device_id, "cl_amd_fp64") == CL_FALSE) {
      fprintf(stderr, "double precision was requested, but is not supported by the device.  Leaving...\n"); fflush(stderr);
      exit(EXIT_SUCCESS);
    }
  }

  /* see if extension required for numa are supported on the device */

  num_devices = 1;
  devices = (cl_device_id*)malloc (num_devices * sizeof (cl_device_id));
  if (!devices)
  {
    fprintf (stderr, "BlackScholes app Error - Cannot allocate memory for device. Exit!, file = %s, line = %d\n", __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }
  devices[0] = device_id;

  numa_available = cluCheckDeviceExtensions(device_id, "cl_ext_device_fission cl_ext_migrate_memobject");
#if defined(cl_ext_migrate_memobject) && defined(cl_ext_device_fission)
  if (numa_available) {
    cl_device_partition_property_ext dpp_list[5];
    size_t dpp_return_size;
    CLU_CHECK_ERROR("clGetDeviceInfo CL_DEVICE_AFFINITY_DOMAINS_EXT", clGetDeviceInfo
                  (device_id, CL_DEVICE_AFFINITY_DOMAINS_EXT,
                   6*sizeof(cl_device_partition_property_ext), (cl_device_partition_property_ext *) &dpp_list, (size_t *) &dpp_return_size));
    int found = 0;
    for (i=0; i<dpp_return_size/(sizeof(cl_device_partition_property_ext)); ++i) if (dpp_list[i] == CL_AFFINITY_DOMAIN_NUMA_EXT) found = 1;
    if (!found) numa_available = 0;
  }
#endif
  if (numa_flag) {
    if (numa_available) {
       unsigned int num_sub_devices = 0;
#if defined(cl_ext_migrate_memobject) && defined(cl_ext_device_fission)
       clEnqueueMigrateMemObjectEXT = 
       (cl_int (*)(_cl_command_queue*, cl_uint, _cl_mem* const*, cl_mem_migration_flags_ext, cl_uint, _cl_event* const*, _cl_event**))
       clGetExtensionFunctionAddress("clEnqueueMigrateMemObjectEXT");
  
      /* get the device fission extension functions */
      clCreateSubDevicesEXT = (cl_int (*) (_cl_device_id*, const cl_device_partition_property_ext*, cl_uint, _cl_device_id**, cl_uint*))
         clGetExtensionFunctionAddress("clCreateSubDevicesEXT");
      clReleaseDeviceEXT = (cl_int (*)(_cl_device_id*)) clGetExtensionFunctionAddress("clReleaseDeviceEXT");
  
      /* set the properties for partitioning the root device */
      cl_device_partition_property_ext properties[3] = {CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT, CL_AFFINITY_DOMAIN_NUMA_EXT, 0};
      
      rc = clCreateSubDevicesEXT(device_id, properties, 0, NULL, &num_sub_devices);
      CLU_CHECK_ERROR("clCreateSubDevicesEXT getting number of sub_devices", rc);
  
      free(devices); /* undo the previous allocation */
      devices = (cl_device_id*)malloc (num_sub_devices * sizeof (cl_device_id));
      if (!devices)
      {
        fprintf (stderr, "BlackScholes app Error - Cannot allocate memory for sub devices. Exit!, file = %s, line = %d\n", __FILE__, __LINE__);
        exit (EXIT_FAILURE);
      }

      rc = clCreateSubDevicesEXT(device_id, properties, num_sub_devices, devices, NULL);
      CLU_CHECK_ERROR("clCreateSubDevicesEXT getting number of sub_devices", rc);
      num_devices = num_sub_devices;
#endif

      if (verbose_flag)
        printf("BlackScholes workload: using numa - num_sub_devices=%d\n", num_sub_devices);
    }
    else {
      if (verbose_flag)
        printf("BlackScholes workload: numa was requested, but is unavailable.\n");
    }
  }

  cmd_queues = (cl_command_queue*)malloc (num_devices * sizeof (cl_command_queue));
  if (!cmd_queues)
  {
    fprintf (stderr, "BlackScholes app Error - Cannot allocate memory for command queue. Exit!, file = %s, line = %d\n", 
        __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }
  
  num_tasks = (cl_ulong*)malloc (num_devices * sizeof (cl_ulong));
  if (!num_tasks)
  {
    fprintf (stderr, "BlackScholes app Error - Cannot allocate memory for num_tasks. Exit!, file = %s, line = %d\n", 
        __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  for (i = 0; i < num_devices; i++)
  {
    if (device_q_prop & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
    {
      if ((i == 0) && (verbose_flag))
      {
        printf ("BlackScholes workload:  using an out-of-order queue\n");
      }
      cmd_queues[i] = cluCreateCmdQueue (clu, devices[i], 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
    }
    else
    {
      if (verbose_flag)
      {
        printf ("BlackScholes workload: using an in-order queue\n");
      }
      cmd_queues[i] = cluCreateCmdQueue (clu, devices[i], 0, 0);
    }

    CLU_CHECK_ERROR("clGetDeviceInfo CL_DEVICE_MAX_COMPUTE_UNITS", 
        clGetDeviceInfo
                (devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint),
                 &max_compute_units, NULL));

     /* use the max number of compute units for the device */
    num_tasks[i] = max_compute_units;

    num_total_tasks = num_total_tasks + num_tasks[i];

  } 

  /* Round the number of tasks up to the next power of 2 (a value that is
   * already a power of 2 is left unchanged). The goal here is to saturate
   * all the compute units with tasks. */
  max_compute_units = num_total_tasks;
  num_total_tasks = 1 << (ilogbf((float)(num_total_tasks)));
  if (num_total_tasks < max_compute_units) num_total_tasks = num_total_tasks << 1;
  printf ("num_total_tasks = %d\n", num_total_tasks);

  /* Determine device information for this device. 
   * Start by reading back the device type, in case it was left as DEFAULT */
  CLU_CHECK_ERROR("clGetDeviceInfo CL_DEVICE_TYPE", clGetDeviceInfo
              (device_id, CL_DEVICE_TYPE, (size_t) sizeof(cl_device_type),
               (void *) &device_type, NULL));

  /* If the kernel type / code selector was not set, set it to the default for the device */
  if (code_selector == AUTO_KERNEL_TYPE) {
    if (device_type == CL_DEVICE_TYPE_CPU) {
      code_selector = DEFAULT_CODE_SELECTOR_CPU;
    } else if (device_type == CL_DEVICE_TYPE_ACCELERATOR) {
      code_selector = DEFAULT_CODE_SELECTOR_ACCELERATOR;
    } else if (device_type == CL_DEVICE_TYPE_GPU) {
      code_selector = DEFAULT_CODE_SELECTOR_GPU;
    } else {
      fprintf(stderr, "ERROR:  Unknown device type\n");
      exit(EXIT_FAILURE);
    }
  }

  CLU_CHECK_ERROR("clGetDeviceInfo CL_DEVICE_NAME", clGetDeviceInfo
              (device_id, CL_DEVICE_NAME, MAX_DEVICE_NAME_LEN, device_name,
               NULL));

  if (verbose_flag) {
    printf("BlackScholes workload: using device_type: %s\n", cluGetCLDeviceTypeString(device_type));
    printf("BlackScholes workload: using kernel: %s\n", code_selector_to_string(code_selector));
    printf("BlackScholes workload: using buffering method: %s\n", buffer_selector_to_string(buffer_selector)); 
  }

  /* If the user has not specified a vector_width, set to OpenCL's preferred vector width. */
  if (vector_width == AUTO_VECTOR_WIDTH) {
    if (sizeof(FLOAT) == 4) {
      CLU_CHECK_ERROR("clGetDeviceInfo CL_DEVICE_PERFERRED_VECTOR_WIDTH_FLOAT", clGetDeviceInfo
                     (device_id, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
                      sizeof(cl_uint), (cl_uint *) & vector_width, NULL));
    }
    else {
      CLU_CHECK_ERROR("clGetDeviceInfo CL_DEVICE_PERFERRED_VECTOR_WIDTH_DOUBLE", clGetDeviceInfo
                     (device_id, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
                      sizeof(cl_uint), (cl_uint *) & vector_width, NULL));
    }
  }

  if (verbose_flag) {
    printf("BlackScholes workload: using vector_width: %u\n", vector_width);
  }



  /* Time when setup ends and buffer creation and initialization begins */
  timev2 = readclock();    

  /*================================================================
   *  Data Initialization:
   *    The steps needed to initialize the data depend on the
   *    buffering method selected.
   *    
   *    For NONE_BUFFER
   *       _ malloc data on host
   *       _ initialize data with some starting values
   *       _ create OpenCL Memory buffer objects with the appropriate flags
   *       _ write the host data to the memory buffers through clEnqueueWriteBuffer
   * 
   *    For COPY_BUFFER
   *       _ malloc data on host
   *       _ initialize data with some starting values
   *       _ create OpenCL Memory buffer objects with the host buffer
   *       
   * 
   *    For ALLOC_COPY_BUFFER
   *       _ malloc data on host
   *       _ initialize data with some starting values
   *       _ create OpenCL Memory buffer objects with the host buffer and the
   *          appropriate flags (CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR)
   *       _ free the data malloced on host
   *  
   *    For ALLOC_BUFFER
   *       _ create OpenCL mem buffer objects
   *       _ map them
   *       _ initialize data with some starting values
   *       _ unmap them
   *
   *    For USE_BUFFER (CL_MEM_USE_HOST_PTR)
   *       _ malloc data on host
   *       _ initialize data with some starting values
   *       _ create OpenCL mem buffer objects with the host malloced data
   *       
   *                                                       
   *================================================================ */

  memobjs = (cl_mem**)malloc (num_devices * sizeof (cl_mem*));
  if (!memobjs)
  {
    fprintf (stderr, "BlackScholes workload error: cannot allocate memory for memobjs, FILE = %s, line = %d\n", 
        __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  rawbufs = (unsigned char**)malloc (num_devices * sizeof (void*));
  if (!rawbufs)
  {
    fprintf (stderr, "BlackScholes workload error: cannot allocate memory for data, FILE = %s, line = %d\n", 
        __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* declare some variables for initializing data */
  int idx;
  int S0Kdex, rdex, sigdex, Tdex;
  FLOAT S0_array[4] = { 42.0, 30.0, 54.0, 66.0 };
  FLOAT K_array[16] = { 40.0, 36.0, 44.0, 48.0,
    24.0, 28.0, 32.0, 36.0,
    48.0, 52.0, 56.0, 60.0,
    60.0, 64.0, 68.0, 72.0
  };
  FLOAT r_array[4] = { 0.1, 0.09, 0.11, 0.12 };
  FLOAT sigma_array[4] = { 0.2, 0.15, 0.25, 0.30 };
  FLOAT T_array[4] = { 0.5, 0.25, 0.75, 1.0 };
  idx = 0;

  cpflag = NULL;
  S0 = NULL;
  K = NULL;
  r = NULL;
  sigma = NULL;
  T = NULL;
  answer = NULL;

  /*  find the size of memory needed for each array */
  int memsize = (array_size * sizeof(FLOAT))/num_devices;


  cl_uint preferred_alignment;
  err = clGetDeviceInfo(device_id, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &preferred_alignment, NULL);
  preferred_alignment /= 8;  /* convert from units of bits to units of bytes */

  for (i = 0; i < num_devices; i++)
  {
    unsigned int k;

    /* regardless of the buffering method we use, we need the OpenCL mem objects so
     * let's allocate them */
    memobjs[i] = (cl_mem*) malloc (7 * sizeof (cl_mem));
    if (!memobjs[i])
    {
      fprintf (stderr, "BlackScholes workload error: cannot allocate memory for memobjs[%d], FILE = %s, line = %d\n", 
          i, __FILE__, __LINE__);
      exit (EXIT_FAILURE);
    }

    /* Allocate the host buffers when needed */
    switch (buffer_selector)
    {
      case NONE_BUFFER:
      case COPY_BUFFER:
      case ALLOC_COPY_BUFFER:
      case USE_BUFFER:
        {
          void* tmp;
          posix_memalign (&tmp, preferred_alignment, memsize * 7);
          rawbufs[i] = (unsigned char*)tmp;
          if (rawbufs[i] == NULL)
          {
            fprintf (stderr, "BlackScholes workload error: cannot allocate memory for data buffers, FILE = %s, line = %d\n",
                       __FILE__, __LINE__);
            exit (EXIT_FAILURE);
          }

          cpflag = (void*)(rawbufs[i]);
          S0     = (void*)&(rawbufs[i][1 * memsize]);
          K      = (void*)&(rawbufs[i][2 * memsize]);
          r      = (void*)&(rawbufs[i][3 * memsize]);
          sigma  = (void*)&(rawbufs[i][4 * memsize]);
          T      = (void*)&(rawbufs[i][5 * memsize]);
          answer = (void*)&(rawbufs[i][6 * memsize]); 
        }
        break;
      case ALLOC_BUFFER:
        /*  no-op */
        break;
    }

    /* Create the OpenCL memory objects for NONE_BUFFER and ALLOC_BUFFER method since
     * these two methods require the memory objects to exist before we can
     * initialize the buffers with starting values */
    switch (buffer_selector)
    {
      case NONE_BUFFER:
      case ALLOC_BUFFER:
        {
          /*  create 7 OpenCL memory buffers for the seven data buffers */
          for (k = 0; k < 7; k++)
          {
            memobjs[i][k] =
                clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                               memsize, NULL, &rc);
            CLU_CHECK_ERROR ("clCreateBuffer memobjs[i][0]", rc);
#if defined(cl_ext_migrate_memobject) && defined(cl_ext_device_fission)
            if (numa_flag)
            {
              err = clEnqueueMigrateMemObjectEXT (cmd_queues[i], 1, &memobjs[i][k], 0, 0, NULL, NULL);
              CLU_CHECK_ERROR ("clEnqueueMigrateMemObjectEXT", err);
            }
#endif 
          }
#if defined(cl_ext_migrate_memobject) && defined(cl_ext_device_fission)
          if (numa_flag)
          {
            err = clFinish (cmd_queues[i]);
            CLU_CHECK_ERROR ("clFinish", err);
          }
#endif 
          if (buffer_selector == ALLOC_BUFFER)
          {
            /*  Map the 7 memory buffers */
            cpflag =
                clEnqueueMapBuffer(cmd_queues[i], memobjs[i][0], CL_TRUE, CL_MAP_WRITE, 0,
                                   memsize, 0, NULL, NULL, &rc);
            CLU_CHECK_ERROR ("clEnqueueMapBuffer cpflag buffer", rc);

            S0 = clEnqueueMapBuffer(cmd_queues[i], memobjs[i][1], CL_TRUE, CL_MAP_WRITE, 0,
                                    memsize, 0, NULL, NULL, &rc);
            CLU_CHECK_ERROR ("clEnqueueMapBuffer S0", rc);

            K = clEnqueueMapBuffer(cmd_queues[i], memobjs[i][2], CL_TRUE, CL_MAP_WRITE, 0,
                                   memsize, 0, NULL, NULL, &rc);
            CLU_CHECK_ERROR ("clEnqueueMapBuffer K", rc);
            r = clEnqueueMapBuffer(cmd_queues[i], memobjs[i][3], CL_TRUE, CL_MAP_WRITE, 0,
                                   memsize, 0, NULL, NULL, &rc);
            CLU_CHECK_ERROR ("clEnqueueMapBuffer r buffer", rc);

            sigma =
                clEnqueueMapBuffer(cmd_queues[i], memobjs[i][4], CL_TRUE, CL_MAP_WRITE, 0,
                                   memsize, 0, NULL, NULL, &rc);
            CLU_CHECK_ERROR ("clEnqueueMapBuffer sigma buffer", rc);

            T = clEnqueueMapBuffer(cmd_queues[i], memobjs[i][5], CL_TRUE, CL_MAP_WRITE, 0,
                                   memsize, 0, NULL, NULL, &rc);
            CLU_CHECK_ERROR ("clEnqueueMapBuffer T", rc);
            answer =
                clEnqueueMapBuffer(cmd_queues[i], memobjs[i][6], CL_TRUE, CL_MAP_WRITE, 0,
                                   memsize, 0, NULL, NULL, &rc);
            CLU_CHECK_ERROR ("clEnqueueMapBuffer answer buffer", rc);
          }

        }
        break;
      case COPY_BUFFER:
      case ALLOC_COPY_BUFFER:
      case USE_BUFFER:
        /*  no-op */
        break;
    }

    /* Load the 7 input arrays with reasonable test numbers */
    cpflag_fptr = (FLOAT *) cpflag;
    S0_fptr = (FLOAT *) S0;
    K_fptr = (FLOAT *) K;
    r_fptr = (FLOAT *) r;
    sigma_fptr = (FLOAT *) sigma;
    T_fptr = (FLOAT *) T;
    answer_fptr = (FLOAT *) answer;

    /* Here we load some values to simulate real-world options parameters. 
     * Users who wish to provide live data would replace this clause 
     * with their own initialization of the arrays. */
    for (k = 0; k < array_size/num_devices; ++k) {
      int *temp_int;
      Tdex = (idx >> 1) & 0x3;
      sigdex = (idx >> 3) & 0x3;
      rdex = (idx >> 5) & 0x3;
      S0Kdex = (idx >> 7) & 0xf;

      temp_int = (int *) &cpflag_fptr[k];
      temp_int[0] = (idx & 1) ? 0xffffffff : 0;
      if (sizeof(FLOAT) == 8) temp_int[1] = (idx & 1) ? 0xffffffff : 0;

      S0_fptr[k] = S0_array[S0Kdex >> 2];
      K_fptr[k] = K_array[S0Kdex];
      r_fptr[k] = r_array[rdex];
      sigma_fptr[k] = sigma_array[sigdex];
      T_fptr[k] = T_array[Tdex];
      answer_fptr[k] = 0.0f;
      idx++;
    }

    switch (buffer_selector)
    {
      case NONE_BUFFER:
        {
          /* Write the malloced host buffers to the OpenCL memory objects 
           */
          rc = clEnqueueWriteBuffer(cmd_queues[i], memobjs[i][0], CL_TRUE, 0, memsize,
                                    cpflag, 0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueWriteBuffer cpflag buffer", rc);
          rc = clEnqueueWriteBuffer(cmd_queues[i], memobjs[i][1], CL_TRUE, 0, memsize, S0,
                                    0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueWriteBuffer S0 buffer", rc);

          rc = clEnqueueWriteBuffer(cmd_queues[i], memobjs[i][2], CL_TRUE, 0, memsize, K,
                                    0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueWriteBuffer K buffer", rc);

          rc = clEnqueueWriteBuffer(cmd_queues[i], memobjs[i][3], CL_TRUE, 0, memsize, r,
                                    0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueWriteBuffer r buffer", rc);

          rc = clEnqueueWriteBuffer(cmd_queues[i], memobjs[i][4], CL_TRUE, 0, memsize,
                                    sigma, 0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueWriteBuffer sigma buffer", rc);

          rc = clEnqueueWriteBuffer(cmd_queues[i], memobjs[i][5], CL_TRUE, 0, memsize, T,
                                    0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueWriteBuffer T buffer", rc);

          rc = clEnqueueWriteBuffer(cmd_queues[i], memobjs[i][6], CL_TRUE, 0, memsize, answer,
                                    0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueWriteBuffer answer buffer", rc);

        }
        break;
      case COPY_BUFFER:
      case ALLOC_COPY_BUFFER:
      case USE_BUFFER:
        {
        
          cl_mem_flags flags = 0;
          switch (buffer_selector) 
          {
          case COPY_BUFFER:
            flags |= CL_MEM_COPY_HOST_PTR;
            break;
          case ALLOC_COPY_BUFFER:
            flags |= CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR;
            break;
          case USE_BUFFER:
            flags |= CL_MEM_USE_HOST_PTR;
            break;
          }

          /*  Create OpenCL memory objects based on the cl_mem_flags above with previously 
           *  initialized host data buffers */
          memobjs[i][0] = clCreateBuffer(context, CL_MEM_READ_ONLY | flags, memsize, cpflag, &rc);
          CLU_CHECK_ERROR ("clCreateBuffer for cpflag buffer", rc);

          memobjs[i][1] = clCreateBuffer(context, CL_MEM_READ_ONLY | flags, memsize, S0, &rc);
          CLU_CHECK_ERROR ("clCreateBuffer for S0 buffer", rc);

          memobjs[i][2] = clCreateBuffer(context, CL_MEM_READ_ONLY | flags, memsize, K, &rc);
          CLU_CHECK_ERROR ("clCreateBuffer for K buffer", rc);

          memobjs[i][3] = clCreateBuffer(context, CL_MEM_READ_ONLY | flags, memsize, r, &rc);
          CLU_CHECK_ERROR ("clCreateBuffer for r buffer", rc);

          memobjs[i][4] = clCreateBuffer(context, CL_MEM_READ_ONLY | flags, memsize, sigma, &rc);
          CLU_CHECK_ERROR ("clCreateBuffer for sigma buffer", rc);

          memobjs[i][5] = clCreateBuffer(context, CL_MEM_READ_ONLY | flags, memsize, T, &rc);
          CLU_CHECK_ERROR ("clCreateBuffer for T buffer", rc);

          memobjs[i][6] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | flags, memsize, answer, &rc);
          CLU_CHECK_ERROR ("clCreateBuffer for answer buffer", rc);

          /* Free up transient host buffers. They're no longer needed. */
          if (buffer_selector == ALLOC_COPY_BUFFER) {
            free(rawbufs[i]);
          }
        }
        break;
      case ALLOC_BUFFER:
        {
          /* Unmap OpenCL buffer memory objects */
          rc = clEnqueueUnmapMemObject(cmd_queues[i], memobjs[i][0], cpflag, 0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueUnmapMemObject cpflag buffer", rc);

          rc = clEnqueueUnmapMemObject(cmd_queues[i], memobjs[i][1], S0, 0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueUnmapMemObject S0 buffer", rc);

          rc = clEnqueueUnmapMemObject(cmd_queues[i], memobjs[i][2], K, 0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueUnmapMemObject K buffer", rc);
         
          rc = clEnqueueUnmapMemObject(cmd_queues[i], memobjs[i][3], r, 0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueUnmapMemObject r buffer", rc);

          rc = clEnqueueUnmapMemObject(cmd_queues[i], memobjs[i][4], sigma, 0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueUnmapMemObject sigma buffer", rc);

          rc = clEnqueueUnmapMemObject(cmd_queues[i], memobjs[i][5], T, 0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueUnmapMemObject T buffer", rc);

          rc = clEnqueueUnmapMemObject(cmd_queues[i], memobjs[i][6], answer, 0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueUnmapMemObject answer buffer", rc);
        }
        break;
    }
  
    rc = clFinish (cmd_queues[i]);
    CLU_CHECK_ERROR ("clFinish", rc);
  }
 
  /* Time when data initialization finished, kernels build begin */
  timev3 = readclock();    

  /* ================================================================ */
  /*  Build */
  /* ================================================================ */

  /* The next batch of code computes the string containing the various 
   * defines needed for correct kernel operation */
  switch (code_selector) {
  case RANGE_LOAD_STORE:
    sprintf(kernel_selector, "-DRANGE_LOAD_STORE");
    break;
  case RANGE_ASYNC_WORKGROUP_COPY:
    sprintf(kernel_selector, "-DRANGE_ASYNC_WORKGROUP_COPY");
    break;
  case TASK_DOUBLE_BUFFER:
    sprintf(kernel_selector, "-DTASK_DOUBLE_BUFFER");
    break;
  case TASK_SINGLE_BUFFER:
    sprintf(kernel_selector, "-DTASK_SINGLE_BUFFER");
    break;
  case TASK_LOAD_STORE:
    sprintf(kernel_selector, "-DTASK_LOAD_STORE");
    break;
  }

  if (double_flag) {
    sprintf(kernel_selector, "%s -D_DOUBLE_ -DSIZE=%d ", kernel_selector, vector_width);
  }
  else {
    if (use_fast_native_math) {
      sprintf(kernel_selector, "%s -D_%s_ -DSIZE=%d -cl-fast-relaxed-math -DUSE_FAST_MATH", kernel_selector, "SINGLE", vector_width);
    }
    else {
      sprintf(kernel_selector, "%s -D_SINGLE_ -DSIZE=%d ", kernel_selector, vector_width);
    }
  }

  printf ("BlackScholes workload: loading and compiling kernel with build options %s...\n", kernel_selector);
  /* Create the compute kernel in the program we wish to run */
  kernels = (cl_kernel*)malloc (num_devices * sizeof (cl_kernel));
  for (i = 0; i < num_devices; i++)
  {
    kernels[i] = cluCreateKernel (clu, cmd_queues[i], kernel_source_file, "bsop_kernel", kernel_selector, CLU_SOURCE);
  }

  /* Time when kernel builds are finished, and execution begins */
  timev4 = readclock();     
  /* ================================================================ */
  /*  Verification of input parameters to ensure there's enough resources  */
  /* ================================================================ */
  /*
   * Verifying if there's enough  local memory on the device to execute the kernel
   */
  for (i = 0; i < num_devices; i++)
  {
    cl_ulong avail_local_mem;
    int kernel_size = vector_width * 2 * local_work_group_size;

    avail_local_mem = cluGetAvailableLocalMem (devices[i], kernels[i]);
    if (code_selector == RANGE_ASYNC_WORKGROUP_COPY)
    {
      if ((sizeof (FLOAT) * 7 * kernel_size) > avail_local_mem)
      {
        fprintf (stderr, "BlackScholes workload error: need %llu bytes of memory in local memory, avail local mem = %llu\n", 
            (unsigned long long)(sizeof (FLOAT) * 7 * kernel_size), (unsigned long long) avail_local_mem);
        exit (EXIT_FAILURE);
      }
    }
    else if ((code_selector == TASK_DOUBLE_BUFFER) || (code_selector == TASK_SINGLE_BUFFER))
    {
      /*  There are 7 buffers, we're double buffering so we need 14 * copysize buffers in local memory  */
      if ((14 * copy_size) > avail_local_mem)
      {
        /* readjust copysize to fit  */
        copy_size = avail_local_mem/16; /* use 16 instead of 14 to leave more room */
        /* make sure it's a power of 2 (rounding down) */
        copy_size  = 1 << (ilogbf((FLOAT)(copy_size)));
      }
      if (verbose_flag)
      {
        printf ("BlackScholes workload: copy_size for task kernel = %d\n",  copy_size);
      }
    }
  }

  cl_ulong max_alloc_size;

  /*
   * Verifying that all the requested data fit in available memory 
   */
  CLU_CHECK_ERROR("clGetDeviceInfo CL_DEVICE_MAX_MEM_ALLOC_SIZE", clGetDeviceInfo
              (device_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong),
               &max_alloc_size, NULL));

  /* Check the selector variables to ensure they contain reasonable values, otherwise exit */
  if (array_size * 7 * sizeof(FLOAT) > max_alloc_size) { 
    fprintf(stderr, "BlackScholes workload error: array_size = %llu exceeds available memory = %llu\n",
        (unsigned long long)(array_size * 7 * sizeof(FLOAT)), (unsigned long long)max_alloc_size);
    exit(EXIT_FAILURE);
  }

  if (code_selector >= TASK_DOUBLE_BUFFER) {
    if (array_size < vector_width * num_total_tasks * copy_size/sizeof (FLOAT)) {
      printf
          ("array_size must be at least  %d\n", 
          (int)((vector_width * num_total_tasks * copy_size) / (sizeof (FLOAT))));
      exit(EXIT_FAILURE);
    }
  } else {                      /*  NDRange kernel */
    if (array_size < vector_width * local_work_group_size) {
      printf
          ("array_size must be at least %llu\n", (unsigned long long) (vector_width * local_work_group_size));
      exit(EXIT_FAILURE);
    }
  }


  /* ================================================================ */
  /*  Execution */
  /* ================================================================ */

  /* Call the appropriate specialist to finalize preparations for executing the kernel */

  /* Execute range or task kernel */
  printf ("BlackScholes workload: executing the kernel\n");
  if (code_selector == RANGE_LOAD_STORE)
  {
    rc = bsop_rangeLS(array_size, local_work_group_size, num_devices, 
        vector_width, verbose_flag);
  }
  else if (code_selector == RANGE_ASYNC_WORKGROUP_COPY)
  {
    rc = bsop_rangeAWGC(primer, array_size, local_work_group_size, num_devices, 
        vector_width, verbose_flag);
  }
  else
  {
    rc = bsop_task(array_size, copy_size, num_devices, code_selector, 
        vector_width, verbose_flag, num_total_tasks);
  }


  /*----------------*/
  /* Post-execution */
  /*----------------*/
  /* Time when post-execution finished, and verification begins */
  timev5 = readclock();    

  /* ================================================================ */
  /*  Data Verification */
  /* ================================================================ */

  if (verify_flag) {

    /*-----------------------*/
    /* Pre-data verification */
    /*-----------------------*/

    double maxouterr;
    int maxouterrindex;
    if (verbose_flag) {
      printf("array_size = %llu\n",  (unsigned long long)array_size);
    }

    printf ("BlackScholes workload: verifying results\n");

    int memsize = (array_size * sizeof(FLOAT))/num_devices;
    for (i = 0; i < num_devices; i++)
    {
      switch (buffer_selector)
      {
        case ALLOC_BUFFER:
        case ALLOC_COPY_BUFFER:
        {
          /* Map OpenCL buffer memory objects */
          cpflag =
              clEnqueueMapBuffer(cmd_queues[i], memobjs[i][0], CL_TRUE, CL_MAP_READ, 0,
                                 memsize, 0, NULL, NULL, &rc);
          CLU_CHECK_ERROR ("clEnqueueMapBuffer cpflag buffer", rc);

          S0 = clEnqueueMapBuffer(cmd_queues[i], memobjs[i][1], CL_TRUE, CL_MAP_READ, 0,
                                  memsize, 0, NULL, NULL, &rc);
          CLU_CHECK_ERROR ("clEnqueueMapBuffer S0", rc);

          K = clEnqueueMapBuffer(cmd_queues[i], memobjs[i][2], CL_TRUE, CL_MAP_READ, 0,
                                 memsize, 0, NULL, NULL, &rc);
          CLU_CHECK_ERROR ("clEnqueueMapBuffer K", rc);
          r = clEnqueueMapBuffer(cmd_queues[i], memobjs[i][3], CL_TRUE, CL_MAP_READ, 0,
                                 memsize, 0, NULL, NULL, &rc);
          CLU_CHECK_ERROR ("clEnqueueMapBuffer r buffer", rc);

          sigma =
              clEnqueueMapBuffer(cmd_queues[i], memobjs[i][4], CL_TRUE, CL_MAP_READ, 0,
                                 memsize, 0, NULL, NULL, &rc);
          CLU_CHECK_ERROR ("clEnqueueMapBuffer sigma buffer", rc);

          T = clEnqueueMapBuffer(cmd_queues[i], memobjs[i][5], CL_TRUE, CL_MAP_READ, 0,
                                 memsize, 0, NULL, NULL, &rc);
          CLU_CHECK_ERROR ("clEnqueueMapBuffer T", rc);
          answer =
              clEnqueueMapBuffer(cmd_queues[i], memobjs[i][6], CL_TRUE, CL_MAP_READ, 0,
                                 memsize, 0, NULL, NULL, &rc);
          CLU_CHECK_ERROR ("clEnqueueMapBuffer answer buffer", rc);

        }
        break;
        case NONE_BUFFER:
        case COPY_BUFFER:
        {
          /* Read OpenCL answer memory object into the corresponding host buffer */
          rc = clEnqueueReadBuffer(cmd_queues[i], memobjs[i][6], CL_TRUE, 0, memsize,
                                   answer, 0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueReadBuffer answer buffer", rc);

        }
        break;

        case USE_BUFFER:
        {
          answer =
              clEnqueueMapBuffer(cmd_queues[i], memobjs[i][6], CL_TRUE, CL_MAP_READ, 0,
                                 memsize, 0, NULL, NULL, &rc);
          CLU_CHECK_ERROR ("clEnqueueMapBuffer answer buffer", rc);
        }
        break;
      }

        
      /*-------------------*/
      /* Data verification */
      /*-------------------*/

      /* Get addresses of the floating point (FLOAT) input arrays for validation */
      cpflag_fptr = (FLOAT *) cpflag;
      S0_fptr = (FLOAT *) S0;
      K_fptr = (FLOAT *) K;
      r_fptr = (FLOAT *) r;
      sigma_fptr = (FLOAT *) sigma;
      T_fptr = (FLOAT *) T;
      answer_fptr = (FLOAT *) answer;

      /* Verify answers using the validation function */
      validate(S0_fptr, K_fptr, r_fptr, sigma_fptr, T_fptr, answer_fptr,
               cpflag_fptr, array_size/num_devices, &maxouterr, &maxouterrindex);

      /* Is maximum error outside the acceptable range, if so, flag it */
      if (maxouterr > 0.00002) {
        fprintf(stderr, "BlackScholes workload error: Verification failure at index %d, max error is %e\n", maxouterrindex, maxouterr);
        exit (EXIT_FAILURE);
      } else {
        if (verbose_flag) {
          printf("BlackScholes workload: Verification passes - max error is %e at index %d\n", maxouterr, maxouterrindex);
        } else {
          printf("BlackScholes workload: Maximum detected error is within acceptable tolerances.\n");
        }
      }

      /*------------------------*/
      /* Post-data verification */
      /*------------------------*/

      switch (buffer_selector)
      {
        case ALLOC_BUFFER:
        case ALLOC_COPY_BUFFER:
        {
          /* Unmap OpenCL buffer memory objects */
          rc = clEnqueueUnmapMemObject(cmd_queues[i], memobjs[i][0], cpflag, 0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueUnmapMemObject cpflag buffer", rc);

          rc = clEnqueueUnmapMemObject(cmd_queues[i], memobjs[i][1], S0, 0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueUnmapMemObject S0 buffer", rc);

          rc = clEnqueueUnmapMemObject(cmd_queues[i], memobjs[i][2], K, 0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueUnmapMemObject K buffer", rc);
         
          rc = clEnqueueUnmapMemObject(cmd_queues[i], memobjs[i][3], r, 0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueUnmapMemObject r buffer", rc);

          rc = clEnqueueUnmapMemObject(cmd_queues[i], memobjs[i][4], sigma, 0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueUnmapMemObject sigma buffer", rc);

          rc = clEnqueueUnmapMemObject(cmd_queues[i], memobjs[i][5], T, 0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueUnmapMemObject T buffer", rc);

          rc = clEnqueueUnmapMemObject(cmd_queues[i], memobjs[i][6], answer, 0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueUnmapMemObject answer buffer", rc);
          
          rc = clFinish (cmd_queues[i]);
          CLU_CHECK_ERROR ("clFinish", rc);

        }
        break;
        
        case USE_BUFFER:
        {
          rc = clEnqueueUnmapMemObject(cmd_queues[i], memobjs[i][6], answer, 0, NULL, NULL);
          CLU_CHECK_ERROR ("clEnqueueUnmapMemObject answer buffer", rc);
          rc = clFinish (cmd_queues[i]);
        }


        break;
        case NONE_BUFFER:
        case COPY_BUFFER:
          /*  no-op */

        break;

      }
    } /*  end for loop */
  } /*  end if (verify_flag) */


  /* Time when verification ends and shutdown begins */
  timev6 = readclock();   

  /* ================================================================ */
  /*  Shutdown */
  /* ================================================================ */

  unsigned int j;
  /* Release memory object resources */
    
  for (j = 0; j < num_devices; j++)
  {
    for (i = 0; i < 7; i++)
      CLU_CHECK_ERROR("clReleaseMemObject", clReleaseMemObject(memobjs[j][i]));
  }
  

  /* Release any NUMA sub-devices and free the per-device host-side allocations */
  for (i = 0; i < num_devices; i++)
  {
#if defined(cl_ext_migrate_memobject) && defined(cl_ext_device_fission)
    if (numa_flag)
    {
      /* release the numa sub-devices */
      err = clReleaseDeviceEXT (devices[i]);
      CLU_CHECK_ERROR ("clReleaseDeviceEXT", err);
    }
#endif 
    free (memobjs[i]);

    /* Free transient host buffer storage */
    switch (buffer_selector) {
    case NONE_BUFFER:
    case COPY_BUFFER:
    case USE_BUFFER:
      free(rawbufs[i]);
      break;
    case ALLOC_BUFFER:
    case ALLOC_COPY_BUFFER:
      /* No-op */
      break;
    }
  }

  cluDestroy (clu);

  free (memobjs);
  free (cmd_queues);
  free (kernels);
  free (num_tasks);
  free (devices);
 
  /* Time when shutdown ends */
  timev7 = readclock();    

  /* ================================================================ */
  /*  Compute and print out the timing results for the various sections */
  /* ================================================================ */

  printf("Timing...\n");

  printf("setup time: %f\n", timev2 - timev1);
  printf("data initialization time: %f\n", timev3 - timev2);
  printf("kernel build time: %f\n", timev4 - timev3);
  printf("kernel execution time: %f\n", timev5 - timev4);
  printf("number of black scholes computes/sec: %e\n", (double) array_size/(timev5 - timev4));
  printf("data verification time: %f\n", timev6 - timev5);
  printf("shutdown time: %f\n", timev7 - timev6);

  return rc;
}

/* =================================================================================================== */
/*  main */
/* =================================================================================================== */

int main(int argc, char *argv[]) {
  /* Let the command line override the default settings before anything else runs */
  parse_cmdline(argc, argv);

  /* The type of the argument handed to bsop() picks which precision is run;
   * whatever bsop() returns becomes the exit status of the program. */
  if (!double_flag) {
    return bsop(0.0f);  /* float argument: selects the single precision (SP) version */
  }
  return bsop(0.0);     /* double argument: selects the double precision (DP) version */
}
