/*************************************************************************/
/*                                                                       */
/* Licensed Materials - Property of IBM                                  */
/*                                                                       */
/*                                                                       */
/* (C) Copyright IBM Corp. 2010                                          */
/* All Rights Reserved                                                   */
/*                                                                       */
/* US Government Users Restricted Rights - Use, duplication or           */
/* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.     */
/*                                                                       */
/*************************************************************************/

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <getopt.h>
#include <libgen.h>
#include <unistd.h>

#include <CL/opencl.h>

#include <dlfcn.h>
#include "clock.h"
#include "clu.h"

/* Initial values of the command line flags.  Every flag starts at 0 ("not selected") */
/* and is set to 1 by parse_cmdline() when the matching option is given.              */
#define DEFAULT_VERBOSE_FLAG 0 /* By default, less information will print out when you run SpMV. */
#define DEFAULT_NUMA_FLAG    0 /* By default, we won't request OpenCL "fission". */
#define DEFAULT_VERIFY_FLAG  0 /* By default, the computed output is not verified; -v/--verify requests verification. */
#define DEFAULT_TIMING_FLAG  0 /* By default, the timing option is off; -t/--timing requests performance runs and results. */
#define DEFAULT_DOUBLE_FLAG  0 /* By default, computations will be done in single precision. */

#define MAX_DEVICES 32               /* The most devices we expect to encounter in a NUMA environment. */
#define MAX_KERNEL_SELECTOR_LEN 256  /* Length of the string used to pass 'define' data to the OpenCL kernel code. */

/* Kernel selectors.  KERNEL_DEFAULT means no kernel was chosen on the command line; */
/* it is later resolved to KERNEL_AWGC on ACCELERATOR devices and KERNEL_LS otherwise. */
#define KERNEL_DEFAULT 0
#define KERNEL_LS      1 /* The "load/store" kernel. */
#define KERNEL_AWGC    2 /* The "async work group copy" kernel. */

#define MAX_WGSZ 1024 /* Initial value of the GPU work group size.  This constant should be a multiple of 512 */
#define CPU_WGSZ 1  /* Work group size when running on a CPU (or an ACCELERATOR). */

/* Macro to allocate "_len" bytes of aligned storage into the pointer "_addr" and to  */
/* check the success of each memory allocation.  On failure a message naming          */
/* "_addrstr" is printed and the program exits with EXIT_FAILURE.                     */
/*                                                                                    */
/* Notes:                                                                             */
/*  - A variable named "preferred_alignment" (alignment in bytes) must be in scope    */
/*    wherever the macro is expanded.                                                 */
/*  - The macro expands to a brace block and call sites do not add a semicolon, so    */
/*    it must not be converted to the do { } while (0) form.                          */
/*  - posix_memalign reports failure through its return value and leaves the pointer  */
/*    unspecified, so the return value is tested (the pointer is also pre-set to      */
/*    NULL so that it never holds garbage).                                           */
/*  - posix_memalign rejects alignments smaller than sizeof(void *), so the requested */
/*    alignment is raised to that minimum when necessary.                             */
/*  - "_addr" and "_len" are evaluated more than once; pass side-effect-free          */
/*    expressions.                                                                    */
#define MEMORY_ALLOC_CHECK(_addr, _len, _addrstr) {                                                   \
   size_t _mac_align = (size_t) preferred_alignment;                                                  \
   if (_mac_align < sizeof(void *)) _mac_align = sizeof(void *);                                      \
   (_addr) = NULL;                                                                                    \
   if (posix_memalign((void **) &(_addr), _mac_align, (_len)) != 0 || (_addr) == NULL) {              \
      printf("Failed allocation of %llu bytes for %s\n", (unsigned long long) (_len), _addrstr);      \
      exit (EXIT_FAILURE);                                                                            \
   }                                                                                                  \
}

/* ===================================================================== */
/* Print the command synopsis and the description of every command line  */
/* option to standard output.  Takes no arguments and returns nothing.   */
/* ===================================================================== */

void usage()
{
  /* The whole help screen is kept as one string so that it is emitted in a single call. */
  static const char help_text[] =
    "\n"
    "Usage: spmv -f <matrixfile> [device_type] [kernel_type] [options]\n"
    "\n"
    "Note: <matrixfile> should include the relative path from this executable.\n"
    "\n"
    " Device Type:\n"
    "\n"
    "  -c, --cpu          Use CPU device for kernel computations.\n"
    "  -g, --gpu          Use GPU device for kernel computations.\n"
    "  -a, --accel        Use ACCELERATOR device for kernel computations.\n"
    "\n"
    " Kernel Type (default is -A for ACCELERATOR device, -L otherwise):\n"
    "\n"
    "  -L, --ls           Use 'load-store' kernel to solve problem.\n"
    "  -A, --awgc         Use 'async-work-group-copy' kernel to solve problem.\n"
    "\n"
    " Options (all options default to 'not selected'):\n"
    "\n"
    "  -l, --lwgsize [n]  Specify local work group size for GPU use (coerced to power of 2).\n"
    "  -d, --double       Use double precision values for input, matrix and output.\n"
    "  -n, --numa         Use 'numa' to split the problem across devices.\n"
    "  -t, --timing       Execute performance runs, and print performance results.\n"
    "  -V, --verbose      Produce verbose output messages.\n"
    "  -v, --verify       Verify computed output.\n"
    "\n"
    "  -h, --help         Print this usage message.\n"
    "\n";

  fputs(help_text, stdout);
}

/* Global variables (visible to all routines).                                      */
/* Each one holds a command line selection: it starts at the default shown here and */
/* is overwritten by parse_cmdline() when the matching option is given.             */
static cl_device_type device_type = CL_DEVICE_TYPE_DEFAULT; /* -a/--accel, -c/--cpu, -g/--gpu */
static cl_uint kernel_type = KERNEL_DEFAULT;                /* -L/--ls, -A/--awgc */
static int double_flag = DEFAULT_DOUBLE_FLAG;               /* -d/--double */
static int verify_flag = DEFAULT_VERIFY_FLAG;               /* -v/--verify */
static int verbose_flag = DEFAULT_VERBOSE_FLAG;             /* -V/--verbose */
static int timing_flag = DEFAULT_TIMING_FLAG;               /* -t/--timing */
static int numa_flag = DEFAULT_NUMA_FLAG;                   /* -n/--numa */
static int gpu_wgsz = MAX_WGSZ;                             /* -l/--lwgsize <n> */
static char *file_name; /* Heap copy of the -f/--filename argument; parse_cmdline() leaves it untouched when -f is absent. */

/* ============================================================================== */
/* Procedure to parse the command line arguments and set the selector variables.  */
/*                                                                                */
/* Parameters:                                                                    */
/*    argc, argv - the argument count and vector exactly as passed to main().     */
/*                                                                                */
/* Effects:                                                                       */
/*    - Changes the current working directory to the directory part of argv[0]    */
/*      (best effort; a failing chdir is ignored).                                */
/*    - Stores the selections in the file-level option variables (device_type,    */
/*      kernel_type, the *_flag variables, gpu_wgsz and file_name).               */
/*    - Prints a message and exits the program for -h/--help (EXIT_SUCCESS) and   */
/*      for unknown options, stray arguments, a malformed -l value or an          */
/*      allocation failure (EXIT_FAILURE).                                        */
/* ============================================================================== */

/* Return a heap copy of "text" (release it with free); exits the program if the  */
/* allocation fails.  The malloc result is cast because this file is C++.         */
static char *spmv_copy_string(const char *text)
{
   size_t nbytes = strlen(text) + 1;
   char *copy = (char *) malloc(nbytes);

   if (copy == NULL) {
      printf("Failed allocation of %llu bytes for a command line string\n", (unsigned long long) nbytes);
      exit(EXIT_FAILURE);
   }
   memcpy(copy, text, nbytes);
   return copy;
}

void parse_cmdline(int argc, char **argv)
{
   int opt;
   int option_index;

   struct option long_options[] = {
      {"help", no_argument, NULL, 'h'},
      {"accel", no_argument, NULL, 'a'},
      {"cpu", no_argument, NULL, 'c'},
      {"gpu", no_argument, NULL, 'g'},
      {"numa", no_argument, NULL, 'n'},
      {"ls", no_argument, NULL, 'L'},
      {"awgc", no_argument, NULL, 'A'},
      {"verbose", no_argument, NULL, 'V'},
      {"timing", no_argument, NULL, 't'},
      {"verify", no_argument, NULL, 'v'},
      {"double", no_argument, NULL, 'd'},
      {"lwgsize", required_argument, NULL, 'l'},
      {"filename", required_argument, NULL, 'f'},
      {NULL, 0, NULL, 0}
   };
   char *name;
   char *name_source;
   char *dir_source;
   char *end;
   long value;

   /* Change current working directory to that of the invocation path so that spmv can
    * be run from any current working directory.
    *
    * basename() and dirname() are allowed to modify the string they are given, so they
    * operate on private copies.  This keeps argv[0] intact for getopt_long, which uses
    * it as the program name in its own diagnostics.
    */
   name_source = spmv_copy_string(argv[0]);
   dir_source = spmv_copy_string(argv[0]);
   name = basename(name_source);
   (void)chdir(dirname(dir_source));
   free(dir_source);

   while ((opt = getopt_long(argc, argv, "hacgnLAVtvdl:f:", long_options, &option_index)) != -1) {

      switch (opt) {

      /* -h, --help */
      case 'h': usage(); exit(EXIT_SUCCESS);

      /* -a, --accel */
      case 'a': device_type = CL_DEVICE_TYPE_ACCELERATOR; break;

      /* -c, --cpu */
      case 'c': device_type = CL_DEVICE_TYPE_CPU; break;

      /* -g, --gpu */
      case 'g': device_type = CL_DEVICE_TYPE_GPU; break;

      /* -L, --ls */
      case 'L': kernel_type = KERNEL_LS; break;

      /* -A, --awgc */
      case 'A': kernel_type = KERNEL_AWGC; break;

      /* -d, --double */
      case 'd': double_flag = 1; break;

      /* -v, --verify */
      case 'v': verify_flag = 1; break;

      /* -V, --verbose */
      case 'V': verbose_flag = 1; break;

      /* -t, --timing */
      case 't': timing_flag = 1; break;

      /* -n, --numa */
      case 'n': numa_flag = 1; break;

      /* -l, --lwgsize: the value must be a complete decimal integer that fits in an int. */
      case 'l':
         errno = 0;
         value = strtol(optarg, &end, 10);
         if (end == optarg || *end != '\0' || errno == ERANGE || value < INT_MIN || value > INT_MAX) {
            printf("%s: invalid local work group size '%s'.\n", name, optarg);
            printf("Try '%s --help' for more information.\n", name);
            exit(EXIT_FAILURE);
         }
         gpu_wgsz = (int) value;
         break;

      /* -f, --filename: keep a private copy; a repeated -f replaces (and frees) the earlier one. */
      case 'f':
         free(file_name);
         file_name = spmv_copy_string(optarg);
         break;

      /* Unknown option or missing option argument (getopt_long has already printed a diagnostic). */
      case '?':
         printf("Try '%s --help' for more information.\n", name);
         exit(EXIT_FAILURE);

      default:
         break;
      }
   }

   if (optind != argc) {
      printf("%s: unrecognized option '%s'.\n", name, argv[optind]);
      printf("Try '%s --help' for more information.\n", name);
      exit(EXIT_FAILURE);
   }

   /* "name" points into name_source (or into basename's own storage); it is not used past here. */
   free(name_source);
}

/* ================================================================================= */
/* Here is the routine which does 99% of the work in the host-based code.            */
/* It is templated to allow for single or double precision operations.               */
/* Note the use of the "primer" parameter, whose entire purpose is to distinguish    */
/* between a single-precision implementation, and a double-precision implementation. */
/* ================================================================================= */

template <typename FLOAT> int tiled_SpMV(FLOAT primer) {

   /* These two structures are defined both in spmv.cpp and spmv.cl (using different variable types). */
   /* If you change something here, change it in the other file as well.                              */

   typedef struct _slab_header {
      cl_uint offset; /* in units of packets */
      cl_uint outindex;
      cl_uint outspan;
   } slab_header;

   typedef struct _packet {
      cl_uint seg_input_offset;
      cl_uint future_seg_input_offset;
      cl_uint npackets_remaining;
      cl_uint seg_output_offset;
      cl_uint pad1;
      cl_uint pad2;
      cl_uint pad3;
      cl_uint pad4;
      cl_ushort input_offset_short[16];
      FLOAT matdata[16];
   } packet;

   /* Variables used to manage the OpenCL environment. */
   cl_int rc;
   cl_device_id base_device_id;
   cl_context context;
   cl_command_queue base_cmd_queue;
   cl_mem output_buffer = NULL;
   size_t return_size[1];
   unsigned int output_buffer_size = 0;
   cl_ulong resolved_local_mem_size;
   cl_uint resolved_max_compute_units;
   cl_device_id sub_device_id[MAX_DEVICES];
   clu_t clu;
   float totalbytes;
   unsigned int column_span = 0;
   
   /* These variables deal with the source file for the kernel, and the names of the kernels contained therein. */
   char kernel_source_file[8] = "spmv.cl";
   char kernel_name_LS[21]   = "tiled_spmv_kernel_LS";
   char kernel_name_AWGC[23] = "tiled_spmv_kernel_AWGC";
   char kernel_name[32];
   
   /* Variables to manage the measurement of performance. */
   float time_setup = 0.0f;
   float time_kernel_creation = 0.0f;
   float time_matrix_creation = 0.0f;
   float time_buffer_handling = 0.0f;
   float time_matrix_copy = 0.0f;
   float time_single_kernel_run = 0.0f;
   float time_trivial_compute = 0.0f;
   float time_verify = 0.0f;
   float time_performance_run = 0.0f;
   
   /* Basic "size of problem" variables. */
   unsigned int nx; /* Number of elements in the X direction (length of the "input" vector. */
   unsigned int ny; /* Number of elements in the Y direction (length of the "answer" vector. */
   unsigned int non_zero; /* Number of non_zero elements in the matrix. */
   unsigned int nx_pad, nyround; /* Rounded versions of nx and ny. */
   
   /* Variables used to hold user-specified overrides and intermediate control values derived from them. */
   unsigned int nslabs;
   unsigned int matrixbytes, n_inputpackets;
   int numa_available = 0;
   unsigned int *slab_startrow;
   
   unsigned int segcachesize;
   unsigned int max_slabheight; /* Maximum matrix chunksize in the matrix dimension of the couput vector. */
   unsigned int reps; /* How many times to run the kernel when doing performance checking. */
   
   double density; /* Fraction of the sparse matrix that's non_zero. */
   unsigned int i, j, n;
   unsigned int num_devices;
   unsigned int k;

   /* Tracking allocated storage, to compare with available space. */
   unsigned long long bytes_allocated = 0LL;

   /* The text file containing the matrix data in "Matrix Market" format. */
   FILE *inputMTX;
   int err;
   cl_uint preferred_alignment = 16, preferred_alignment_by_elements = 4;

#if defined(cl_ext_migrate_memobject) && defined(cl_ext_device_fission)
   clCreateSubDevicesEXT_fn clCreateSubDevicesEXT = NULL;
   clReleaseDeviceEXT_fn clReleaseDeviceEXT = NULL;
   clEnqueueMigrateMemObjectEXT_fn clEnqueueMigrateMemObjectEXT = NULL;
#endif

   /* Variables used in characterizing the input matrix read from disk. */
   int data_present, symmetric;

   /* Start timing when setup begins. */
   startclock();

   /* ================================================================================== */
   /* Start up OpenCL, using CLU.                                                        */
   /* ================================================================================== */

   clu = cluInit(NULL);

   /* ================================================================================== */
   /* Create a dummy command queue to resolve a possibly NULL device into a device of a  */
   /* known type, and then set the device type.                                          */
   /* ================================================================================== */

   base_cmd_queue = cluCreateCmdQueue(clu, NULL, device_type, 0);
   CLU_CHECK_ERROR("clGetCommandQueueInfo CL_QUEUE_TYPE", 
                    clGetCommandQueueInfo(base_cmd_queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), (void *) &base_device_id, (size_t *) NULL));
   CLU_CHECK_ERROR("clGetDeviceInfo CL_DEVICE_TYPE", 
                    clGetDeviceInfo(base_device_id, CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL));

   if (verbose_flag) {
      if (device_type == CL_DEVICE_TYPE_CPU) {
         printf("using device_type CL_DEVICE_TYPE_CPU\n");
      }
      if (device_type == CL_DEVICE_TYPE_GPU) {
         printf("using device_type CL_DEVICE_TYPE_GPU\n");
      }
      if (device_type == CL_DEVICE_TYPE_ACCELERATOR) {
         printf("using device_type CL_DEVICE_TYPE_ACCELERATOR\n");
      }
   }

   /* ================================================================================== */
   /* SubBuffer functionality will be used, if it is available.  It is definitely not    */
   /* available in OpenCL version 1.0, but even in OpenCL 1.1, it may not be available.  */
   /* Thus, "dlopen" and "dlsym" calls from the dynamic linking loader will be used to   */
   /* properly determine its availability if the OpenCL 1.1 header files are present.    */
   /* Furthermore, in the OpenCL 1.0 case, to simplify the code and not require "#if"    */
   /* conditions every time the buffers are processed, scaffold data types are defined   */
   /* to ensure proper compilation of the code.                                          */
   /* ================================================================================== */

   size_t version_string_size;
   char *version_string;
   CLU_CHECK_ERROR("clGetDeviceInfo CL_DEVICE_VERSION", 
                    clGetDeviceInfo(base_device_id, CL_DEVICE_VERSION, 0, NULL, &version_string_size));
   MEMORY_ALLOC_CHECK(version_string, version_string_size, "version_string") 
   CLU_CHECK_ERROR("clGetDeviceInfo CL_DEVICE_VERSION", 
                    clGetDeviceInfo(base_device_id, CL_DEVICE_VERSION, version_string_size, version_string, NULL));

   /* ================================================================================== */
   /* Grab the version number out of the CL_DEVICE_VERSION variable, and treat it as a   */
   /* floating point number, to determine as a flag whether this version of OpenCL is    */
   /* at 1.1 or higher.                                                                  */
   /* ================================================================================== */

   int versionflag_1_1;
   float versionfloat; 
   char temp[8];
   sscanf(version_string, "%s %f\n", temp, &versionfloat);
   versionflag_1_1 = (versionfloat > 1.0f);

#ifndef CL_VERSION_1_1
   typedef struct {
      unsigned int origin;
      unsigned int size;
   } cl_buffer_region;
   typedef int cl_buffer_create_type;
#define CL_BUFFER_CREATE_TYPE_REGION 1
#endif

   cl_mem (*clCreateSubBuffer_ptr)(cl_mem, cl_mem_flags, cl_buffer_create_type, const void *, cl_int *);
   if (versionflag_1_1) {
      void *handle;
      handle = dlopen(NULL, RTLD_NOW);
      if (handle == NULL) {
         fprintf(stderr, "dlopen call failed (dynamic linking loader failure)\n");
         exit(EXIT_FAILURE);
      }
      clCreateSubBuffer_ptr = (cl_mem (*)(cl_mem, cl_mem_flags, cl_buffer_create_type, const void *, cl_int *)) dlsym(handle, "clCreateSubBuffer");
   }
   else {
      /* force the variable to be null, so that no attempts to call this function will occur */
      clCreateSubBuffer_ptr = (cl_mem (*)(cl_mem, cl_mem_flags, cl_buffer_create_type, const void *, cl_int *)) 0;
   }

   /* ================================================================================== */
   /* If double precision is requested, check to see if it is supported.                 */
   /* ================================================================================== */

   if (double_flag) {
      if (cluCheckDeviceExtensions(base_device_id, "cl_khr_fp64") == CL_FALSE) {
         printf("double precision was requested, but is not supported by the device.  Leaving...\n");
         exit(EXIT_SUCCESS);
      }
   }

   /* ================================================================================== */
   /* Create the context, set the preferred alignment, and query some data.              */
   /* ================================================================================== */

   context = cluGetCLContext(clu);

   err = clGetDeviceInfo(base_device_id, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &preferred_alignment, NULL);
   if (preferred_alignment > 1024) preferred_alignment = 1024;
   preferred_alignment /= 8;  /* Convert from units of bits to units of bytes. */

   /* compute alignment to accommodate sub-buffer requirements and packet requirements */
   preferred_alignment_by_elements = preferred_alignment / sizeof(FLOAT);
   if (preferred_alignment_by_elements < 16) preferred_alignment_by_elements = 16;

   /* ================================================================================== */
   /* Determine if NUMA is wanted and/or available.                                      */
   /* If NUMA is requested, and it's available, then make the opencl calls to enable     */
   /* the splitting or replication of the memory objects across the multiple devices.    */
   /* ================================================================================== */

   sub_device_id[0] = base_device_id; 
   num_devices = 1;

   numa_available = (cluCheckDeviceExtensions(base_device_id, "cl_ext_device_fission cl_ext_migrate_memobject"));
   if (numa_available) {
#if defined(cl_ext_migrate_memobject) && defined(cl_ext_device_fission)
      cl_device_partition_property_ext dpp_list[5];
      size_t dpp_return_size;
      CLU_CHECK_ERROR("clGetDeviceInfo CL_DEVICE_AFFINITY_DOMAINS_EXT", clGetDeviceInfo
                    (base_device_id, CL_DEVICE_AFFINITY_DOMAINS_EXT,
                     6*sizeof(cl_device_partition_property_ext), (cl_device_partition_property_ext *) &dpp_list, (size_t *) &dpp_return_size));
      int found = 0;
      for (i=0; i<dpp_return_size/(sizeof(cl_device_partition_property_ext)); ++i) if (dpp_list[i] == CL_AFFINITY_DOMAIN_NUMA_EXT) found = 1;
      if (!found) numa_available = 0;
#endif
   }

   if (numa_flag) {
      if (numa_available) {
#if defined(cl_ext_migrate_memobject) && defined(cl_ext_device_fission)
         clEnqueueMigrateMemObjectEXT = (cl_int (*)(_cl_command_queue*, cl_uint, _cl_mem* const*, cl_mem_migration_flags_ext, cl_uint, _cl_event* const*, _cl_event**)) clGetExtensionFunctionAddress("clEnqueueMigrateMemObjectEXT");
         clCreateSubDevicesEXT = (cl_int (*)(_cl_device_id*, const cl_device_partition_property_ext*, cl_uint, _cl_device_id**, cl_uint*)) clGetExtensionFunctionAddress("clCreateSubDevicesEXT");
         clReleaseDeviceEXT = (cl_int (*)(_cl_device_id*)) clGetExtensionFunctionAddress("clReleaseDeviceEXT");
         cl_device_partition_property_ext properties[3] = {CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT, CL_AFFINITY_DOMAIN_NUMA_EXT, 0};
         rc = clCreateSubDevicesEXT(base_device_id, properties, 0, NULL, &num_devices);
         CLU_CHECK_ERROR("clCreateSubDevicesEXT getting sub_devices", rc);
         if (num_devices > MAX_DEVICES) num_devices = MAX_DEVICES;
         rc = clCreateSubDevicesEXT(base_device_id, properties, num_devices, sub_device_id, NULL);
         CLU_CHECK_ERROR("clCreateSubDevicesEXT getting sub_devices", rc);
#endif
      }
      else {
         printf("NUMA requested, but not available on this device.\n");
      }
   }

   cl_command_queue cmd_queue[num_devices];
   cl_kernel kernel[num_devices];
   unsigned int output_subbuffer_size[num_devices];
   cl_mem input_buffer[num_devices];
   cl_mem matrix_buffer[num_devices];
   cl_mem output_subbuffer[num_devices];
   unsigned int input_buffer_size[num_devices];
   unsigned int matrix_buffer_size[num_devices];
   size_t kernel_wg_size[num_devices];
   size_t global_work_size[num_devices][3];
   size_t local_work_size[num_devices][3];
   cl_ulong local_mem_size[num_devices];
   cl_uint max_compute_units[num_devices];
   unsigned int slab_start[num_devices], slab_stop[num_devices], nslabs_round[num_devices], memsize[num_devices];

   for (n=0; n<num_devices; ++n) {
      cl_command_queue_properties command_queue_properties;
      clGetDeviceInfo (sub_device_id[n], CL_DEVICE_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &command_queue_properties, NULL); 
      command_queue_properties &= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
      cmd_queue[n] = cluCreateCmdQueue(clu, sub_device_id[n], device_type, command_queue_properties);
   }

   /* Time when setup ends and kernel creation begins. */
   time_setup = intervalclock();

   /* =============================================================== */
   /* Create the OpenCL Kernel.                                       */
   /* =============================================================== */

   if (kernel_type == KERNEL_DEFAULT) {
      kernel_type = (device_type == CL_DEVICE_TYPE_ACCELERATOR) ? KERNEL_AWGC : KERNEL_LS;
   }

   switch (kernel_type) {
      case KERNEL_LS:
      strcpy(kernel_name, kernel_name_LS);
      break;
      case KERNEL_AWGC: 
      strcpy(kernel_name, kernel_name_AWGC);
      break;
   }

   if (verbose_flag) {
      printf("Creating kernel...\n");
   }

   for (n=0; n<num_devices; ++n) {
      kernel[n] = cluCreateKernel(clu, cmd_queue[n], kernel_source_file, kernel_name, (double_flag ? "-DDOUBLE" : NULL), CLU_SOURCE);
   }

   /* Get more information about these devices. */
   resolved_max_compute_units = 0;
   for (n=0; n<num_devices; ++n) {
      clGetKernelWorkGroupInfo (kernel[n], sub_device_id[n], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &kernel_wg_size[n], return_size);
      local_mem_size[n] = cluGetAvailableLocalMem(sub_device_id[n], kernel[n]);
      clGetDeviceInfo (sub_device_id[n], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units[n], NULL); 
      resolved_max_compute_units += max_compute_units[n];
   }
   for (n=1; n<num_devices; ++n) if (kernel_wg_size[0] < kernel_wg_size[n]) kernel_wg_size[0] = kernel_wg_size[n];
   resolved_local_mem_size = local_mem_size[0];
   for (n=1; n<num_devices; ++n) {
      if (resolved_local_mem_size < local_mem_size[n]) resolved_local_mem_size = local_mem_size[n];
   }
   if (verbose_flag) {
      printf("local memory size = %d\n", (int) resolved_local_mem_size);
   }

   /* Time when kernel creation is finished */
   time_kernel_creation = intervalclock();

   /* =============================================================== */
   /* Open Matrix File and read first lines of data.                  */
   /* =============================================================== */

   inputMTX = fopen(file_name, "r");
   if (inputMTX == NULL) {
      printf("Error opening maxtrix file %s\n", file_name);
      exit(EXIT_FAILURE);
   }
   else {
      char *ptr;
      char tmp[20], pattern_flag[20], symmetric_flag[20];
      char line[1025];
      if (5 != fscanf(inputMTX, "%19s %19s %19s %19s %19s\n", tmp, tmp, tmp, pattern_flag, symmetric_flag)) {
         fprintf(stderr, "error reading matrix market format header line\n");
         exit(EXIT_FAILURE);
      }
      data_present = strcmp(pattern_flag, "pattern");
      symmetric = strcmp(symmetric_flag, "general");

      /* Process comment lines */
      do {
	 ptr = fgets(line, (int)sizeof(line), inputMTX);
      } while (ptr && (*ptr == '%'));
      sscanf(line, "%d %d %d\n", &nx, &ny, &non_zero);
   }

   /* =============================================================== */
   /* Create working storage for initial processing of matrix.        */
   /* =============================================================== */

   slab_header *matrix_header[num_devices];

   unsigned int *count_array;
   FLOAT **line_data_array;
   unsigned int **line_x_index_array;

   FLOAT *raw_data;
   unsigned int *raw_ix;
   unsigned int *raw_iy;

   MEMORY_ALLOC_CHECK(raw_ix, (non_zero * sizeof (int)), "raw_ix") 
   MEMORY_ALLOC_CHECK(raw_iy, (non_zero * sizeof (int)), "raw_iy") 
   MEMORY_ALLOC_CHECK(raw_data, (non_zero * sizeof (FLOAT)), "raw_data") 
   MEMORY_ALLOC_CHECK(line_data_array, (ny * sizeof (FLOAT *)), "line_data_array") 
   MEMORY_ALLOC_CHECK(line_x_index_array, (ny * sizeof (int *)), "line_x_index_array") 
   MEMORY_ALLOC_CHECK(count_array, (ny * sizeof (int)), "count_array") 
   for (i=0; i<ny; ++i) {
      count_array[i] = 0;
   }

   if (verbose_flag) {
      printf("storage allocation done... computing binary data.\n");
   }

   /* =============================================================== */
   /* Read in the raw data from the matrix file.                      */
   /* Check for anomalous data, and handle symmetric matrices.        */
   /* =============================================================== */

   unsigned int curry, actual_non_zero;
   curry = actual_non_zero = 0;
   unsigned int explicit_zero_count = 0;
   for (i=0; i<non_zero; ++i) {
      unsigned int ix, iy;
      FLOAT data; 
      fscanf(inputMTX, "%d %d\n", &ix, &iy);
      if (i == 0) {
         curry = iy-1;
      }
      if (data_present) {
         double double_data;
         fscanf(inputMTX, "%lf\n", &double_data);
         data = (FLOAT) double_data;
      }
      else data = ((FLOAT) (rand() & 0x7fff)) * 0.001f - 15.0f;
      if (data_present && data == 0.0) {
         ++explicit_zero_count;
      }
      else {
         --ix;
         --iy;
         raw_ix[actual_non_zero] = ix;
         raw_iy[actual_non_zero] = iy;
         raw_data[actual_non_zero] = data;
         ++actual_non_zero;
         ++count_array[iy];
         if (symmetric && (ix != iy)) {
            ++count_array[ix];
         }
         if (iy != curry) {
            if (iy != curry+1) {
               printf("gap in the input (non-invertible matrix): i = %d, iy = %d, curry = %d\n", actual_non_zero, iy, curry);
            }
            curry = iy;
         }
      }
   }
   if (explicit_zero_count) {
      printf("explicit_zero_count = %d\n", explicit_zero_count);
   }
   non_zero = actual_non_zero;

   /* =============================================================== */
   /* Create working storage for each row's data.                     */
   /* =============================================================== */

   for (i=0; i<ny; ++i) {
      if (verbose_flag) {
         if (i && ((i % 10000) == 0)) {
            printf("%4d of %4d\n", i, ny);
         }
      }
      MEMORY_ALLOC_CHECK(line_data_array[i], (count_array[i] * sizeof (FLOAT)), "line_data_array[i]") 
      MEMORY_ALLOC_CHECK(line_x_index_array[i], (count_array[i] * sizeof (int)), "line_x_index_array[i]") 
      count_array[i] = 0;
   }

   /* Fill in each row (special handling for symmetric matrices). */
   for (i=0; i<non_zero; ++i) {
      line_data_array[raw_iy[i]][count_array[raw_iy[i]]] = raw_data[i];
      line_x_index_array[raw_iy[i]][count_array[raw_iy[i]]] = raw_ix[i];
      ++count_array[raw_iy[i]];
      if (symmetric && (raw_ix[i] != raw_iy[i])) {
         line_data_array[raw_ix[i]][count_array[raw_ix[i]]] = raw_data[i];
         line_x_index_array[raw_ix[i]][count_array[raw_ix[i]]] = raw_iy[i];
         ++count_array[raw_ix[i]];
      }
   }

   /* The non_zero is now recalculated, as it will be larger if the matrix was symmetric. */
   non_zero = 0;
   for (i=0; i<ny; ++i) non_zero += count_array[i];
   density = ((double) non_zero) / ((double) nx * (double) ny);
   printf("nx = %d, ny = %d, non_zero = %d, density = %f\n", nx, ny, non_zero, density);

   nyround = (ny + (preferred_alignment_by_elements - 1)) & (~(preferred_alignment_by_elements - 1));

   if (nyround < num_devices * preferred_alignment_by_elements) nyround = num_devices * preferred_alignment_by_elements;

   /* now that we know the size, we can prevent excessive segmentation of small matrices */
   unsigned int min_compute_units = (nyround + preferred_alignment_by_elements - 1) / preferred_alignment_by_elements;
   if (resolved_max_compute_units > min_compute_units) resolved_max_compute_units = min_compute_units;

   /* Release no-longer-needed arrays. */
   free(raw_ix);
   free(raw_iy);
   free(raw_data);

   /* Arrays to hold the "compressed storage row" (CSR) matrix data. */
   FLOAT *data_array;
   unsigned int *x_index_array;
   unsigned int *row_index_array;

   /* =============================================================== */
   /* Create and load the actual CSR arrays.                          */
   /* =============================================================== */

   MEMORY_ALLOC_CHECK(data_array, (non_zero * sizeof (FLOAT)), "data_array") 
   bytes_allocated += (unsigned long long) (non_zero * sizeof(FLOAT));

   MEMORY_ALLOC_CHECK(x_index_array, ((non_zero+1) * sizeof (int)), "x_index_array") 
   bytes_allocated += (unsigned long long) (non_zero * sizeof(int));

   MEMORY_ALLOC_CHECK(row_index_array, ((nyround+1) * sizeof (int)), "row_index_array") 
   bytes_allocated += (unsigned long long) ((nyround+1) * sizeof(int));

   unsigned int index = 0;

   for (i=0; i<ny; ++i) {
      row_index_array[i] = index;
      for (j=0; j<count_array[i]; ++j) {
         data_array[index] = line_data_array[i][j];
         x_index_array[index] = line_x_index_array[i][j];
         ++index;
      }
   }
   for (i=ny; i<=nyround; ++i) {
      row_index_array[i] = non_zero;
   }

   for (i=0; i<ny; ++i) {
      if (count_array[i]) {
         free(line_data_array[i]);
         free(line_x_index_array[i]);
      }
   }
   free(line_data_array);
   free(line_x_index_array);
   free(count_array);

   /* ============================================================================= */
   /* Now that we have the CSR format of the matrix (in "row_index_array",          */
   /* "x_index_array", and "data_array", we begin to compute the best size and      */
   /* shape for the tiles of the final Tiled format of the matrix.                  */
   /* ============================================================================= */

   unsigned int nslabs_base, target_workpacket, candidate_row, target_value, slabsize;
   unsigned int slab_threshhold;

   /* Decide how big the tiles should be, in the X direction.   */
   /* This decision is driven by three factors:                 */
   /* (1) the tile width should not exceed the matrix width     */
   /* (2) the tile width should not exceed 65536 (16 bit index) */
   /* (3) the tile width should not overwhelm local memory.     */
   /* The variable "column_span" holds this tile width.         */

   if (kernel_type == KERNEL_AWGC) {
      column_span = resolved_local_mem_size / 64;
   }
   if (kernel_type == KERNEL_LS) {
      column_span = 65536;
   }
   if (column_span > nx) {
      column_span = nx;
   }
   if (column_span > 65536) {
      column_span = 65536;
   }
   while (column_span & (column_span-1)) {
      ++column_span; /* Raise up to a power of 2. */
   }
   if (verbose_flag) {
      printf("column_span...%d\n", column_span);
   }
   nx_pad  = (nx + (column_span*num_devices-1)) & (~(column_span*num_devices-1));

   /* Decide how big the tiles should be, in the Y direction, based on local memory considerations. */
   /* While "column_span" is fixed for all tiles, the tile size in the Y direction can vary.        */
   /* Each "slab" of data should be thought as a horizontal row of tiles. The variable              */
   /* "slab_threshhold" holds the largest height that will be permitted for any such slab.          */

   nslabs_base = resolved_max_compute_units;

   if (kernel_type == KERNEL_AWGC) {
      /* The slab height limit comes from local memory: 7/16 of its capacity counted in FLOAT */
      /* elements, less one, then masked with the preferred alignment (the mask assumes that  */
      /* "preferred_alignment_by_elements" is a power of 2).                                  */
      slab_threshhold = ((7 * (resolved_local_mem_size/sizeof(FLOAT))) / 16) - 1;
      slab_threshhold &= ~(preferred_alignment_by_elements - 1);
      if (slab_threshhold <= preferred_alignment_by_elements) {
         /* A limit of zero would divide by zero just below, and a limit of at most one       */
         /* alignment step would keep the counting scan from ever advancing.                  */
         fprintf(stderr, "local memory is too small to hold a slab of the tiled matrix.\n"); fflush(stderr);
         exit(EXIT_FAILURE);
      }
      unsigned int expected_nslabs = nyround / slab_threshhold;
      if (expected_nslabs < nslabs_base) {
         expected_nslabs = nslabs_base;
      }
      /* "target_workpacket" is the number of non-zero elements we aim to place in each slab. */
      /* It must be at least 1: with a target of 0 the scans below can never advance past the */
      /* first row, and the slab-counting loop would never terminate (tiny matrices hit this). */
      target_workpacket = non_zero / expected_nslabs;
      if (target_workpacket == 0) {
         target_workpacket = 1;
      }
      if (verbose_flag) {
         printf("non_zero = %d, slab_threshhold = %d, nslabs_base = %d, nyround = %d, target_workpacket = %d\n", 
                 non_zero,      slab_threshhold,      nslabs_base,      nyround,      target_workpacket);
      }
      /* Decide how big the local cache for packet data should be, based on local memory considerations. */
      /* (Typically we will read in 16 or 32 packets at a time.)                                          */
      segcachesize = resolved_local_mem_size / 8192;
      while (segcachesize & (segcachesize-1)) {
         ++segcachesize; /* raise up to a power of 2 */
      }
      if (verbose_flag) {
         printf("segcachesize...%d\n", segcachesize);
      }

      /* Scan matrix data to find best split of data for each contiguous group of rows ("slabs"). */
      /* This first scan only counts slabs.  Its height test is stricter than the one used by     */
      /* the second scan, so the count found here is an upper bound on the slabs created there.   */
      /* The row bound is tested first so that "row_index_array" is never read past the scan end. */
      candidate_row = 0;
      target_value = target_workpacket;
      slabsize = 0;
      nslabs = 0;
      while (candidate_row < nyround) {
         while (candidate_row < nyround && row_index_array[candidate_row] < target_value && (slabsize+preferred_alignment_by_elements) < slab_threshhold) {
            candidate_row += preferred_alignment_by_elements;
            slabsize += preferred_alignment_by_elements;
         }
         ++nslabs;
         slabsize = 0;
         target_value = row_index_array[candidate_row] + target_workpacket;
      }

      /* Allocate an array to hold row index of beginning of each of these "slabs". */
      MEMORY_ALLOC_CHECK(slab_startrow, ((nslabs + 1) * sizeof (unsigned int)), "slab_startrow") 
      slab_startrow[0] = 0;
      slab_startrow[nslabs] = nyround;
      candidate_row = 0;
      target_value = target_workpacket;
      slabsize = 0;
      nslabs = 0;

      /* Scan matrix data to implement previously computed split of data for each contiguous group of rows. */
      while (candidate_row < nyround) {
         while (candidate_row < nyround && row_index_array[candidate_row] < target_value && slabsize < slab_threshhold) {
            candidate_row += preferred_alignment_by_elements;
            slabsize += preferred_alignment_by_elements;
         }
         ++nslabs;
         slabsize = 0;
         slab_startrow[nslabs] = candidate_row;
         target_value = row_index_array[candidate_row] + target_workpacket;
      }

      /* Record the height of the tallest slab; working storage is sized from it later. */
      max_slabheight = 0;
      for (i=0; i<nslabs; ++i) {
         if (slab_startrow[i+1] - slab_startrow[i] > max_slabheight) {
            max_slabheight = slab_startrow[i+1] - slab_startrow[i];
         }
      }
   }
   else {
      if (device_type == CL_DEVICE_TYPE_GPU) {
         /* GPU, non-AWGC kernel: every slab is "gpu_wgsz" rows high.  First coerce the requested */
         /* work group size into the range [16, MAX_WGSZ], to a power of 2, and to the kernel's   */
         /* reported work group limit.                                                            */
         if (gpu_wgsz > MAX_WGSZ) {
            printf("coercing gpu work group size to MAX WORK GROUP SIZE, which is %d\n", MAX_WGSZ);
            gpu_wgsz = MAX_WGSZ;
         }
         if (gpu_wgsz < 16) {
            printf("coercing gpu work group size to MIN WORK GROUP SIZE, which is 16\n");
            gpu_wgsz = 16;
         }
         if (gpu_wgsz & (gpu_wgsz-1)) {
            while (gpu_wgsz & (gpu_wgsz-1)) --gpu_wgsz;
            printf("coersing gpu work group size to next lower power of 2, which is %d\n", gpu_wgsz);
         }
         if (gpu_wgsz > (int) kernel_wg_size[0]) {
            while (gpu_wgsz > (int) kernel_wg_size[0]) {
               gpu_wgsz /= 2;
            }
            printf("coercing gpu work group size to fit within hardware limits.  New size is %d\n", gpu_wgsz);
         }

         /* Shrink the work group size until there is at least one slab per compute unit, but    */
         /* never below the 16-row minimum enforced above.  Without that floor, a matrix with    */
         /* fewer rows than compute units halves "gpu_wgsz" all the way to 0 and the slab count  */
         /* computation divides by zero.                                                         */
         nslabs = (nyround + gpu_wgsz - 1) / gpu_wgsz;
         while (nslabs < resolved_max_compute_units && gpu_wgsz > 16) {
            gpu_wgsz /= 2;
            nslabs = (nyround + gpu_wgsz - 1) / gpu_wgsz;
         }
         MEMORY_ALLOC_CHECK(slab_startrow, ((nslabs + 1) * sizeof (unsigned int)), "slab_startrow") 
         for (i=0; i<nslabs; ++i) slab_startrow[i] = gpu_wgsz * i;
         slab_startrow[nslabs] = nyround;
         max_slabheight = gpu_wgsz;
      }
      else {
         /* Non-GPU, non-AWGC kernel: start with one slab per compute unit and keep doubling the */
         /* slab count until the rows per slab drop below local memory's capacity in FLOATs.     */
         nslabs = resolved_max_compute_units;
         while (nyround / nslabs >= (resolved_local_mem_size/sizeof(FLOAT))) nslabs *= 2;
         MEMORY_ALLOC_CHECK(slab_startrow, ((nslabs + 1) * sizeof (unsigned int)), "slab_startrow") 
         /* Spread the rows evenly over the slabs, in whole multiples of the preferred alignment. */
         for (i=0; i<=nslabs; ++i) slab_startrow[i] = (((nyround/preferred_alignment_by_elements) * i) / nslabs) * preferred_alignment_by_elements;
         max_slabheight = 0;
         for (i=0; i<nslabs; ++i) {
            unsigned int temp = slab_startrow[i+1] - slab_startrow[i];
            if (max_slabheight < temp) max_slabheight = temp;
         }
      }
   }

   /* ============================================================================= */
   /* Now that we have computed the size and shape for our tiles, we can allocate   */
   /* space for working storage to hold the data in an intermediate format, as we   */
   /* move towards the final Tiled format.                                          */
   /* ============================================================================= */

   if (verbose_flag) {
      printf("converting matrix into tiled format...\n");
   }

   unsigned int biggest_slab = 0;
   unsigned int smallest_slab = 0x7fffffff;
   unsigned int totpackets = 0;
   unsigned int totslabs = 0;
   n_inputpackets = 0;

   unsigned int *row_start, *row_curr;
   unsigned int tot_memsize = 0;
   MEMORY_ALLOC_CHECK(row_start, (4*(max_slabheight+1)), "row_start") 
   MEMORY_ALLOC_CHECK(row_curr, (4*(max_slabheight)), "row_curr") 
   bytes_allocated += (unsigned long long) (2 * non_zero + max_slabheight + sizeof(packet));
   unsigned int realdata = 0;
   unsigned int totaldata = 0;
   unsigned int current_slab;
   current_slab = 0;

   packet *slab_ptr;

   /* =============================================================== */
   /* Now we create the Tiled Format of the matrix.                   */
   /* The "seg_workspace" array holds the starting point for each     */
   /* device's header data, and linear list of packets.              */
   /* =============================================================== */

   packet *seg_workspace[num_devices];

   unsigned int num_header_packets;
   /* each header packet holds information for 512 threads */
   num_header_packets = ((kernel_type == KERNEL_AWGC) || (device_type != CL_DEVICE_TYPE_GPU)) ? 0 : (MAX_WGSZ+511)/512;
   
   float interpacket_bloat = 1.0f;
   /* This large loop does the bulk of the hard work to load the data into the packets. */
   int seg_index;
   for (n = 0; n < num_devices; ++n) {
      /* The size of the array is admittedly derived using heuristics, but has been found satisfactory   */
      /* for all matrices that have been run through this program during development.                    */
      int temp_count;
      temp_count = (non_zero > 16) ? non_zero : 16;
      MEMORY_ALLOC_CHECK(seg_workspace[n], ((temp_count/2) / num_devices) * sizeof(packet), "seg_workspace[n]") 
      bytes_allocated += (unsigned long long) (64 * non_zero);
      for (i = 0; i < (temp_count/num_devices)>>1; ++i) {
         for (j=0; j<16; ++j) { /* Pre-load input and output indices with flag saying "no data here". */
            seg_workspace[n][i].input_offset_short[j] = (cl_ushort) 0;
            seg_workspace[n][i].matdata[j] = 0.0f;
         }
      }
      /* The entire matrix is split across the multiple devices, and as such, */
      /* We need to know, for each device, where do the slabs start and stop. */
      slab_start[n] = current_slab;
      slab_stop[n] = current_slab + ((n+1)*nslabs) / num_devices - (n*nslabs) / num_devices;
      nslabs_round[n] = slab_stop[n] - slab_start[n];
      /* The variable "memsize" is a count of how much of the "seg_workspace" array has been loaded with data. */
      memsize[n] = 3 * 4 * (nslabs_round[n]+1);
      memsize[n] += sizeof(packet);
      memsize[n] /= sizeof(packet);
      memsize[n] *= sizeof(packet);

      /* The majority of the tiled matrix format is composed of packets, but the first bytes are header information. */
      /* Use this temporary "matrix_header" variable to load that data. */
      matrix_header[n] = (slab_header *) seg_workspace[n];
      slab_ptr = &seg_workspace[n][memsize[n]/sizeof(packet)];
      bytes_allocated += (unsigned long long) (nslabs_round[n]+1) * sizeof(unsigned int);
      seg_index = 0;
      int acctg_maxcount = 0;
      float acctg_avgcount = 0.0f;
      for (i=slab_start[n]; i<slab_stop[n]; ++i) {
         uint nteams = gpu_wgsz/16;
         /* Load the header data into "seg_workspace" via the "matrix_header" proxy variable. */
         matrix_header[n][current_slab - slab_start[n]].offset = memsize[n] / sizeof(packet);
         matrix_header[n][current_slab - slab_start[n]].outindex = slab_startrow[i]-slab_startrow[slab_start[n]];
         matrix_header[n][current_slab - slab_start[n]].outspan = slab_startrow[i+1]-slab_startrow[i];
         if (row_index_array[slab_startrow[i]] == row_index_array[slab_startrow[i+1]]) {
            /* set up structure of two packets to record "no work to do in this slab" */
            unsigned int jloop;
            /* if we're using header packets, then use them.  Otherwise just zero out one packet */
            jloop = (num_header_packets > 0) ? num_header_packets : 1;
            for (j=0; j<jloop; ++j) {
               int *foo;
               foo = (int *) &slab_ptr[seg_index];
               for (k=0; k<sizeof(packet)/sizeof(int); ++k) foo[k] = 0;
               ++seg_index;
               memsize[n] += sizeof(packet);
            }
         }
         else {
            /* Here we start actually loading packet data. */
            for (j=0; j<=slab_startrow[i+1]-slab_startrow[i]; ++j) {
               row_start[j] = row_index_array[slab_startrow[i]+j];
            }
            if ((device_type != CL_DEVICE_TYPE_GPU) || (kernel_type == KERNEL_AWGC)) {
               for (j=0; j<nx_pad; j+= column_span) {
                  unsigned int kk;
                  for (k=0; k<slab_startrow[i+1] - slab_startrow[i]; k+= 16) {
                     unsigned int count[16];
                     for (kk = 0; kk<16; ++kk) {
                        count[kk] = 0;
                        row_curr[k+kk] = row_start[k+kk];
                        while ((x_index_array[row_curr[k+kk]] < (j+column_span)) && row_curr[k+kk] < row_index_array[slab_startrow[i] + k + kk +1]) {
                           ++row_curr[k+kk];
                           ++count[kk];
                        }
                     }
                     unsigned int maxcount = 0;
                     for (kk=0; kk<16; ++kk) {
                        if (count[kk] > maxcount) {
                           maxcount = count[kk];
                        }
                     }
                     unsigned int sum = 0;
                     for (kk=0; kk<16; ++kk) {
                        sum += count[kk];
                     }
                     realdata += sum;
                     totaldata += 16 * maxcount;
                     unsigned int countdex;
                     for (countdex = 0; countdex < maxcount; ++countdex) {
                        slab_ptr[seg_index].seg_input_offset = j;
                        slab_ptr[seg_index].seg_output_offset = k;
                        for (kk=0; kk<16; ++kk) {
                           if (countdex < count[kk]) {
                              slab_ptr[seg_index].input_offset_short[kk] = 
                                  (unsigned short) (x_index_array[row_start[k+kk]+countdex] & (column_span-1));
                              slab_ptr[seg_index].matdata[kk] = data_array[row_start[k+kk]+countdex];
                           }
                        }
                        ++seg_index;
                        memsize[n] += sizeof(packet);
                     }
                     for (kk = 0; kk<16; ++kk) {
                        row_start[k+kk] = row_curr[k+kk];
                     }
                  }
               }
            }
            else {
               int last_seg_index[nteams];
               int last_seg_index_initialized[nteams];
               int *first_team_offset;
               for (j=0; j<nteams; ++j) last_seg_index[j] = seg_index;
               for (j=0; j<nteams; ++j) last_seg_index_initialized[j] = 0;
               first_team_offset = (int *) &slab_ptr[seg_index];
               for (j=0; j<num_header_packets; ++ j) {
                  ++seg_index;
                  memsize[n] += sizeof(packet);
               }
               int packet_offset = 0;
               for (k=0; k<slab_startrow[i+1] - slab_startrow[i]; k+= 16) {
                  int packet_count = 0;
                  for (j=0; j<nx_pad; j+= column_span) {
                     unsigned int kk;
                     unsigned int count[16];
                     for (kk = 0; kk<16; ++kk) {
                        count[kk] = 0;
                        row_curr[k+kk] = row_start[k+kk];
                        while ((x_index_array[row_curr[k+kk]] < (j+column_span)) && row_curr[k+kk] < row_index_array[slab_startrow[i] + k + kk +1]) {
                           ++row_curr[k+kk];
                           ++count[kk];
                        }
                     }
                     unsigned int maxcount = 0;
                     for (kk=0; kk<16; ++kk) {
                        if (count[kk] > maxcount) {
                           maxcount = count[kk];
                        }
                     }
                     unsigned int sum = 0;
                     for (kk=0; kk<16; ++kk) {
                        sum += count[kk];
                     }
                     realdata += sum;
                     totaldata += 16 * maxcount;
                     unsigned int countdex;
                     for (countdex = 0; countdex < maxcount; ++countdex) {
                        slab_ptr[seg_index].seg_input_offset = j;
                        slab_ptr[seg_index].seg_output_offset = k;
                        for (kk=0; kk<16; ++kk) {
                           if (countdex < count[kk]) {
                              slab_ptr[seg_index].input_offset_short[kk] = 
                                  (unsigned short) (x_index_array[row_start[k+kk]+countdex] & (column_span-1));
                              slab_ptr[seg_index].matdata[kk] = data_array[row_start[k+kk]+countdex];
                           }
                        }
                        ++seg_index;
                        ++packet_count;
                        memsize[n] += sizeof(packet);
                     }
                     for (kk = 0; kk<16; ++kk) {
                        row_start[k+kk] = row_curr[k+kk];
                     }
                  }
                  if ((packet_offset > 65535) || (packet_count > 65535)) {
                     printf("eek!\n");
                     return(-1);
                  }
                  first_team_offset[k>>4] = packet_offset * 65536 + packet_count;
                  packet_offset += packet_count;
               }
               for (k = slab_startrow[i+1] - slab_startrow[i]; k < 16*nteams; k+= 16) {
                  first_team_offset[k>>4] = 0;
               }
               int tempmaxcount = 0;
               int tempavgcount = 0;
               for (k=0; k<nteams; ++k) {
                  int tempcount;
                  tempcount = first_team_offset[k] % 65536;
                  tempavgcount += tempcount;
                  if (tempcount > tempmaxcount) tempmaxcount = tempcount;
               }
               acctg_avgcount += ((float) tempavgcount) / nteams;
               acctg_maxcount += tempmaxcount;
            }
            /* With one exception, all actual packet data is now loaded into the "seg_workspace" array, for this device. */
         }
         if (verbose_flag && (i && ((i%1000) == 0))) {
            printf("%d of %d slabs done\n", i, slab_stop[n]);
         }
         ++current_slab;
      }
      /* Ratio of the accumulated per-slab maximum team packet count to the accumulated per-slab  */
      /* average.  Both accumulators are still zero when no slab on this device went through the  */
      /* GPU team path above, so fall back to 1.0 in that case instead of computing 0/0.          */
      interpacket_bloat = ((device_type == CL_DEVICE_TYPE_GPU) && (acctg_avgcount > 0.0f)) ? (acctg_maxcount / acctg_avgcount) : 1.0f;

      /* To ensure closure, we fill the "trailing slab" header data with information to tell the kernel "no work to be done here." */
      /* NOTE(review): "i" starts at slab_stop[n] (an absolute slab index) but is compared with     */
      /* nslabs_round[n] (set above to slab_stop[n] - slab_start[n]), so as written this loop body  */
      /* does not execute -- confirm whether nslabs_round[n] was meant to be rounded up.            */
      for (i=slab_stop[n]; i<nslabs_round[n]; ++i) {
         matrix_header[n][current_slab - slab_start[n]].offset = memsize[n]/sizeof(packet);
         matrix_header[n][current_slab - slab_start[n]].outindex = slab_startrow[slab_stop[n]]-slab_startrow[slab_start[n]];
         matrix_header[n][current_slab - slab_start[n]].outspan = 0;
         ++current_slab;
      }
      /* One header entry past the last slab: its "offset" marks where this device's packets end. */
      matrix_header[n][current_slab - slab_start[n]].offset = memsize[n]/sizeof(packet);
      matrix_header[n][current_slab - slab_start[n]].outindex = slab_startrow[slab_stop[n]]-slab_startrow[slab_start[n]];
      matrix_header[n][current_slab - slab_start[n]].outspan = 0;
      if (verbose_flag) {
         /* non_zero/num_devices is zero when there are fewer non-zero elements than devices; */
         /* report 0 bytes per element then, rather than dividing by zero.                    */
         printf("non_zero = 0x%x, memsize[n] = 0x%08x, bytes per elements = %d\n", non_zero, memsize[n], (non_zero/num_devices) ? memsize[n]/(non_zero/num_devices) : 0);
      }
   }

   /* This loop records some statistics, and sets one final value into the packets. */
   for (n = 0; n < num_devices; ++n) {
      for (i=slab_start[n]; i<slab_stop[n]; ++i) {
         unsigned int npackets = matrix_header[n][i+1-slab_start[n]].offset - matrix_header[n][i-slab_start[n]].offset;
         if (npackets < smallest_slab) {
            smallest_slab = npackets;
         }
         if (npackets > biggest_slab) {
            biggest_slab = npackets;
         }
         totpackets += npackets;
         ++totslabs;
         /* load the first "real" packet of data with the count of how many such packets there are in this slab */
         seg_workspace[n][matrix_header[n][i-slab_start[n]].offset+num_header_packets].npackets_remaining = npackets-num_header_packets;
      }
      memsize[n] += 32*sizeof(packet); /* Add room for reading past end of data, so we don't abnormally terminate. */
      tot_memsize += memsize[n];

      for (i=slab_start[n]; i<slab_stop[n]; ++i) {
         /* For each row of tiles, we now start at the end, and work backward, to load one last datum into each packet. */
         slab_ptr = &seg_workspace[n][matrix_header[n][i-slab_start[n]].offset];
         seg_index = matrix_header[n][(i+1)-slab_start[n]].offset - matrix_header[n][i-slab_start[n]].offset;
         --seg_index; // back up into set of packets
         unsigned int curr_input_offset, next_input_offset;
         curr_input_offset = slab_ptr[seg_index].seg_input_offset;
         next_input_offset = 0;
         ++n_inputpackets;
         while (seg_index >= (int) num_header_packets) {
            if (slab_ptr[seg_index].seg_input_offset < curr_input_offset) {
               next_input_offset = curr_input_offset;
               curr_input_offset = slab_ptr[seg_index].seg_input_offset;
               ++n_inputpackets;
            }
            /* Here is the "exception".  We now load the "input offset for a future tile" into the data, for the benefit of the double-buffered AWGC kernel. */
            slab_ptr[seg_index].future_seg_input_offset = next_input_offset; 
            --seg_index;
         }
      }
   }

   time_matrix_creation = intervalclock();

   if (verbose_flag) {
      printf("realdata = %8d, totaldata = %8d, valid percentage = %5.3f\n", 
      realdata, totaldata, ((float) realdata) / ((float) totaldata));

      printf("range of number of packets in a slab: %5d to %5d, average = %8.2f, number of slabs = %4d\n", 
      smallest_slab, biggest_slab, (float) totpackets / (float) totslabs, nslabs);
   }

   /* =============================================================================================== */
   /* Compute the local and global work group sizes.                                                  */
   /* =============================================================================================== */

   unsigned int ndims;
   unsigned int team_size;

   if (kernel_type == KERNEL_AWGC) {
      ndims = 1;
      for (n=0; n<num_devices; ++n) {
         global_work_size[n][0] = nslabs_round[n];
         local_work_size[n][0] = 1;
      }
   }
   else {
      ndims = 2;
      team_size = (device_type == CL_DEVICE_TYPE_GPU) ? 16 : 1;
      for (n=0; n<num_devices; ++n) {
         global_work_size[n][1] = nslabs_round[n];
         local_work_size[n][1] = 1;
         global_work_size[n][0] = local_work_size[n][0] = (device_type == CL_DEVICE_TYPE_GPU) ? gpu_wgsz : CPU_WGSZ;
      }
      int max_aggregate_local_work_group_size = 0;
      for (n=0; n<num_devices; ++n) {
         int aggregate_local_work_group_size = 1;
         for (i=0; i<ndims; ++i) {
            aggregate_local_work_group_size *= local_work_size[n][i];
         }
         if (max_aggregate_local_work_group_size < aggregate_local_work_group_size) max_aggregate_local_work_group_size = aggregate_local_work_group_size;
      }
      if (max_aggregate_local_work_group_size > (int) kernel_wg_size[0]) {
         while (max_aggregate_local_work_group_size > (int) kernel_wg_size[0]) {
            for (n=0; n<num_devices; ++n) local_work_size[n][0] /= 2;
            gpu_wgsz /= 2;
            max_aggregate_local_work_group_size /= 2;
         }
         printf("coercing work group size to fit within hardware limits.  New size is %d\n", gpu_wgsz);
      }
   }

   /* =============================================================================================== */
   /* Our Tiled format is now complete, but still in "working storage".  We cannot allocate its       */
   /* buffer in OpenCL until we know how big it is, and now, we finally know how big it is.  So, we   */
   /* create the Input and Output arrays, and the final array to hold the Tiled Format of the Matrix. */
   /* =============================================================================================== */

   /* Arrays to hold input and output data, and the finished tiled matrix data. */
   FLOAT *input_array[num_devices], *output_array[num_devices], *output_array1;
   unsigned int *tilebuffer[num_devices];
   
   /* Account for, then allocate, the host-side result array of "nyround" FLOAT elements.    */
   /* The accounting uses sizeof(FLOAT), matching the allocation, so that the total is also  */
   /* right when FLOAT is not a 4-byte float.                                                */
   bytes_allocated += (unsigned long long) (nyround * sizeof(FLOAT));
   MEMORY_ALLOC_CHECK(output_array1, (nyround * sizeof(FLOAT)), "output_array1") 
   if (output_array1 == NULL) {
      fprintf(stderr, "insufficient memory to perform this workload.\n"); fflush(stderr);
      exit(EXIT_FAILURE);
   }

   /* Add the input and output vectors and the tiled matrix data of all devices to the total. */
   bytes_allocated += (unsigned long long) ((nx_pad + nyround) * sizeof(FLOAT) + tot_memsize);
   if (verbose_flag) {
      printf("bytes_allocated = %llu\n", (unsigned long long) bytes_allocated);
   }

   /* =============================================================================================== */
   /* To demonstrate the SubBuffer and NUMA extensions, we will create input and matrix buffers for   */
   /* use by NUMA, and also a single output buffer to be subdivided using clCreateSubBuffer.          */
   /* =============================================================================================== */

   /* Create the input and matrix buffer memory objects. */
   for (n=0; n<num_devices; ++n) {
      input_buffer_size[n] = (nx_pad * sizeof(FLOAT));
      input_buffer[n] = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, input_buffer_size[n], NULL, &rc);
      if (rc != CL_SUCCESS) {
         fprintf(stderr, "Couldn't create input buffer...rc=%d\n", rc);
      }
      matrix_buffer_size[n] = memsize[n];
      matrix_buffer[n] = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, matrix_buffer_size[n], NULL, &rc);
      if (rc != CL_SUCCESS) {
         fprintf(stderr, "Couldn't create tiled matrix buffer...rc=%d\n", rc);
      }
   }

   /* =============================================================================================== */
   /* If NUMA is available and requested, perform migration on the input and matrix buffers.          */
   /* =============================================================================================== */

   cl_event events[3*num_devices];

#if defined(cl_ext_migrate_memobject) && defined(cl_ext_device_fission)
   if (numa_available && numa_flag) {
      for (n=0; n<num_devices; ++n) {
         err = clEnqueueMigrateMemObjectEXT (cmd_queue[n], 1, &input_buffer[n], 0, 0, NULL, &events[3*n+0]);
         CLU_CHECK_ERROR ("clEnqueueMigrateMemObjectEXT", err);
         err = clEnqueueMigrateMemObjectEXT (cmd_queue[n], 1, &matrix_buffer[n], 0, 0, NULL, &events[3*n+1]);
         CLU_CHECK_ERROR ("clEnqueueMigrateMemObjectEXT", err);
      }
      for (n=0; n<num_devices; ++n) clWaitForEvents(2, &events[3*n+0]);
   }
#endif

   for (n=0; n<num_devices; ++n) {
      output_subbuffer_size[n] = (slab_startrow[slab_stop[n]] - slab_startrow[slab_start[n]]) * sizeof(FLOAT);
   }

   if (clCreateSubBuffer_ptr) { /* if sub_buffers are supported */
      /* Create the output buffer memory object. */
      /* Each device gets one contiguous region; the regions are laid end to end, so the   */
      /* running total of their sizes is both the next origin and the final buffer size.   */
      /* NOTE(review): sub-buffer origins are not checked against the device's base        */
      /* address alignment here -- confirm that the slab sizes guarantee it.               */
      cl_buffer_region output_buffer_region[num_devices];
      output_buffer_size = 0;
      for (n=0; n<num_devices; ++n) {
         output_buffer_region[n].origin = output_buffer_size;
         output_buffer_region[n].size = output_subbuffer_size[n];
         output_buffer_size += output_subbuffer_size[n];
      }
      output_buffer = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, output_buffer_size, NULL, &rc);
      if (rc != CL_SUCCESS) {
         fprintf(stderr, "Couldn't create output buffer...rc=%d\n", rc);
      }

      /* Create the output sub-buffers for each device. */
      for (n=0; n<num_devices; ++n) {
         output_subbuffer[n] = 
            clCreateSubBuffer_ptr(output_buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &output_buffer_region[n], &rc);
         if (rc != CL_SUCCESS) {
            fprintf(stderr, "Couldn't create output sub-buffer...rc=%d\n", rc);
         }
      }
   }
   else {
      /* Create multiple output buffer memory objects, directly. */
      for (n=0; n<num_devices; ++n) {
         output_subbuffer[n] = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, output_subbuffer_size[n], NULL, &rc);
         if (rc != CL_SUCCESS) {
            /* These are output buffers; report them as such so the failure can be located. */
            fprintf(stderr, "Couldn't create output buffer...rc=%d\n", rc);
         }
      }

      /* =============================================================================================== */
      /* If NUMA is available and requested, perform migration on the output buffers.                    */
      /* =============================================================================================== */

#if defined(cl_ext_migrate_memobject) && defined(cl_ext_device_fission)
      if (numa_available && numa_flag) {
         for (n=0; n<num_devices; ++n) {
            err = clEnqueueMigrateMemObjectEXT (cmd_queue[n], 1, &output_subbuffer[n], 0, 0, NULL, &events[3*n+2]);
            CLU_CHECK_ERROR ("clEnqueueMigrateMemObjectEXT", err);
         }
         for (n=0; n<num_devices; ++n) clWaitForEvents(1, &events[3*n+2]);
      }
#endif /* migrate and fission */
   }

   /* =============================================================================================== */
   /* Map these buffers to allocate pointers into these buffers that we can use to load them.         */
   /* =============================================================================================== */

   for (n=0; n<num_devices; ++n) {
      input_array[n] =       (FLOAT *) clEnqueueMapBuffer(cmd_queue[n], 
                                                          input_buffer[n], 
                                                          CL_TRUE, 
                                                          CL_MAP_WRITE, 
                                                          0, 
                                                          (size_t) input_buffer_size[n], 
                                                          0, 
                                                          NULL, 
                                                          NULL, 
                                                          &rc);
      if (rc != CL_SUCCESS) {
         fprintf(stderr, "Couldn't map input buffer...rc=%d\n", rc);
      }

      tilebuffer[n] = (unsigned int *) clEnqueueMapBuffer(cmd_queue[n], 
                                                          matrix_buffer[n], 
                                                          CL_TRUE, 
                                                          CL_MAP_WRITE, 
                                                          0, 
                                                          (size_t) matrix_buffer_size[n], 
                                                          0, 
                                                          NULL, 
                                                          NULL, 
                                                          &rc);
      if (rc != CL_SUCCESS) {
         fprintf(stderr, "Couldn't map tiled matrix buffer...rc=%d\n", rc);
      }
   }

   if (clCreateSubBuffer_ptr) { /* if sub_buffers are supported */
      output_array[0] =     (FLOAT *) clEnqueueMapBuffer(cmd_queue[0], 
                                                         output_buffer, 
                                                         CL_TRUE, 
                                                         CL_MAP_WRITE, 
                                                         0, 
                                                         (size_t) output_buffer_size, 
                                                         0, 
                                                         NULL, 
                                                         NULL, 
                                                         &rc);
      if (rc != CL_SUCCESS) {
         fprintf(stderr, "Couldn't map output buffer A...rc=%d\n", rc);
      }
   }
   else {
      for (n=0; n<num_devices; ++n) {
         output_array[n] =     (FLOAT *) clEnqueueMapBuffer(cmd_queue[n], 
                                                            output_subbuffer[n], 
                                                            CL_TRUE, 
                                                            CL_MAP_WRITE, 
                                                            0, 
                                                            (size_t) output_subbuffer_size[n], 
                                                            0, 
                                                            NULL, 
                                                            NULL, 
                                                            &rc);
         if (rc != CL_SUCCESS) {
            fprintf(stderr, "Couldn't map output buffer A...rc=%d\n", rc);
         }
      }
   }

   time_buffer_handling = intervalclock();

   if (verbose_flag) {
      matrixbytes = 0;
   }

   /* =============================================================================================== */
   /* Copy the tiled matrix into the memory buffer, and then unmap it.                                */
   /* =============================================================================================== */
   for (n=0; n<num_devices; ++n) {
      memcpy(tilebuffer[n], seg_workspace[n], sizeof(packet) * (matrix_header[n][slab_stop[n] - slab_start[n]].offset));
      matrixbytes += sizeof(packet) * (matrix_header[n][slab_stop[n] - slab_start[n]].offset);
      rc = clEnqueueUnmapMemObject(cmd_queue[n], matrix_buffer[n], tilebuffer[n], 0, NULL, &events[n]);
      if (rc != CL_SUCCESS) {
         fprintf(stderr, "Couldn't unmap tilebuffer buffer...rc=%d\n", rc);
      }
   }
   clWaitForEvents(num_devices, events);

   /* Time when tiled matrix creation is complete. */
   time_matrix_copy = intervalclock();

   /* Compute "flops per byte". */
   rc = 0;
   unsigned long long int in_bytes, mat_bytes, out_bytes;
   in_bytes = sizeof(packet) * ((unsigned long long) n_inputpackets);
   mat_bytes = (unsigned long long) matrixbytes;
   out_bytes = (unsigned long long) (ny * 4);

   unsigned long long int flopcount;
   float flops_per_byte;
   flopcount = 2ULL * (unsigned long long) non_zero;
   flops_per_byte = ((float) (flopcount)) / ((float) (in_bytes+mat_bytes+out_bytes));
   double packet_density = ((float) non_zero) / (matrixbytes / (4.0 + sizeof(FLOAT)));

   if (verbose_flag) {
      printf("nx = %7d, ny = %7d, non_zero = %8d, n_inputpackets = %6d, n_packets = %6d\n", 
              nx, ny, non_zero, n_inputpackets, (matrixbytes >> 7));
      printf("sparsity = %9.4f, packet density = %6.4f, tiled bytes-per-element = %f\n", 
              ((float) nx * (float) ny) / ((float) non_zero), 
              packet_density,
              ((float) matrixbytes) / ((float) non_zero));
      printf("bytesmoved = %10lld %10lld %10lld %10lld, flops_per_byte = %6.2f\n", 
              in_bytes, mat_bytes, out_bytes, (in_bytes+mat_bytes+out_bytes), flops_per_byte);
   }

   /* Load random data into the input array.                                                      */
   /* The user is encouraged to substitute initialization of real data at this point in the code. */
   for (i=0; i<nx; ++i) {
      FLOAT rval;
      rval = ((FLOAT) (rand() & 0x7fff)) * 0.001f - 15.0f;
      for (n=0; n<num_devices; ++n) {
         input_array[n][i] = rval;
      }
   }

   /* Zero out the output array.                                                             */
   /* Note that this is only needed because some matrices are singular and have whole rows   */
   /* that are all zero, which is detected, and no work is done on those rows, so that they  */
   /* will never get written by the kernel, so to be safe, we zero it all out here, as well. */

   if (clCreateSubBuffer_ptr) { /* if sub_buffers are supported */
      memset((void *) output_array[0], 0, (size_t) ny * sizeof(FLOAT));
   }
   else {
      for (n=0; n<num_devices; ++n) {
         memset((void *) output_array[n], 0, output_subbuffer_size[n]);
      }
   }

   /* =============================================================================================== */
   /* Unmap the input and output memory buffers, to prepare for kernel execution.                     */
   /* =============================================================================================== */

   for (n=0; n<num_devices; ++n) {
      rc = clEnqueueUnmapMemObject(cmd_queue[n], input_buffer[n], input_array[n],   0, NULL, &events[n]);
      if (rc != CL_SUCCESS) {
         fprintf(stderr, "Couldn't unmap input buffer...rc=%d\n", rc);
      }
   }
   if (clCreateSubBuffer_ptr) { /* if sub_buffers are supported */
      rc = clEnqueueUnmapMemObject(cmd_queue[0], output_buffer, output_array[0], 0, NULL, &events[num_devices]);
      if (rc != CL_SUCCESS) {
         fprintf(stderr, "Couldn't unmap output buffer...rc=%d\n", rc);
      }
      clWaitForEvents(num_devices+1, events);
   }
   else {
      for (n=0; n<num_devices; ++n) {
         rc = clEnqueueUnmapMemObject(cmd_queue[n], output_subbuffer[n], output_array[n], 0, NULL, &events[num_devices+n]);
         if (rc != CL_SUCCESS) {
            fprintf(stderr, "Couldn't unmap output buffer...rc=%d\n", rc);
         }
      }
      clWaitForEvents(2*num_devices, events);
   }

   if (verbose_flag) {
      for (n=0; n<num_devices; ++n) {
         printf("max_slabheight = %d\n", max_slabheight);
         printf("ndims = %d\n", ndims);
         for (i=0; i<ndims; ++i) {
            printf("global[%d] = %d, local[%d] = %d\n", i, (int) global_work_size[n][i], i, (int) local_work_size[n][i]);
         }
      }
   }


   /* =============================================================================================== */
   /* Execution: Multiplication of the input array times the Tiled Format of the Matrix.              */
   /* =============================================================================================== */

   /* Run once to verify the answer and to compute a baseline number of repetitions for later performance measurements. */
   startclock();

   for (n=0; n<num_devices; ++n) {
      cluSetKernelNDRange(clu, kernel[n], ndims, 0, global_work_size[n], local_work_size[n]);
      if (kernel_type == KERNEL_LS) {
         cluRunKernel(clu, kernel[n], &events[n], 8, sizeof(cl_mem), (void *) &input_buffer[n],     /* input_array */
                                                     sizeof(cl_mem), (void *) &output_subbuffer[n], /* output_array */
                                                     sizeof(cl_mem), (void *) &matrix_buffer[n],    /* tilebuffer */
                                                     sizeof(cl_uint), &column_span,
                                                     sizeof(cl_uint), &max_slabheight,
                                                     sizeof(cl_uint), &team_size,
                                                     sizeof(cl_uint), &num_header_packets,
                                                     (size_t) (max_slabheight * sizeof(FLOAT)), (void *) NULL);
      }
      else {
         cluRunKernel(clu, kernel[n], &events[n], 10, sizeof(cl_mem), (void *) &input_buffer[n],     /* input_array */
                                                      sizeof(cl_mem), (void *) &output_subbuffer[n], /* output_array */
                                                      sizeof(cl_mem), (void *) &matrix_buffer[n],    /* tilebuffer */
                                                      sizeof(cl_uint), &column_span,
                                                      sizeof(cl_uint), &max_slabheight,
                                                      sizeof(cl_uint), &segcachesize,
                                                      sizeof(cl_uint), &num_header_packets,
                                                      (size_t) (2 * column_span * sizeof(FLOAT)), (void *) NULL,
                                                      (size_t) (max_slabheight * sizeof(FLOAT)), (void *) NULL,
                                                      (size_t) (segcachesize * sizeof(packet)), (void *) NULL);
      }
   }
   clWaitForEvents(num_devices, events);

   /* Map the input and output arrays, to enable verification. */
   time_single_kernel_run = intervalclock();

   /* We want the performance run to last for a longer time */
   reps = 1 + (int) (6.0f / time_single_kernel_run);

   if (verbose_flag && timing_flag) {
      printf("reps = %d\n", reps); 
   }

   if (clCreateSubBuffer_ptr) { /* if sub_buffers are supported */
      output_array[0] = (FLOAT *) clEnqueueMapBuffer(cmd_queue[0], 
                                                     output_buffer, 
                                                     CL_TRUE, 
                                                     (CL_MAP_READ|CL_MAP_WRITE), 
                                                     0, 
                                                     (size_t) output_buffer_size, 
                                                     0, 
                                                     NULL, 
                                                     NULL, 
                                                     &rc);
      if (rc != CL_SUCCESS) {
         fprintf(stderr, "Couldn't map output buffer ...rc=%d\n", rc);
      }
   }
   else {
      for (n=0; n<num_devices; ++n) {
         output_array[n] = (FLOAT *) clEnqueueMapBuffer(cmd_queue[n], 
                                                        output_subbuffer[n], 
                                                        CL_TRUE, 
                                                        (CL_MAP_READ|CL_MAP_WRITE), 
                                                        0, 
                                                        (size_t) output_subbuffer_size[n], 
                                                        0, 
                                                        NULL, 
                                                        NULL, 
                                                        &rc);
         if (rc != CL_SUCCESS) {
            fprintf(stderr, "Couldn't map output buffer ...rc=%d\n", rc);
         }
      }
   }

   for (n=0; n<num_devices; ++n) {
      input_array[n]   = (FLOAT *) clEnqueueMapBuffer(cmd_queue[n], 
                                                      input_buffer[n], 
                                                      CL_TRUE, 
                                                      (CL_MAP_READ|CL_MAP_WRITE), 
                                                      0, 
                                                      (size_t) input_buffer_size[n], 
                                                      0, 
                                                      NULL, 
                                                      NULL, 
                                                      &rc);
      if (rc != CL_SUCCESS) {
         fprintf(stderr, "Couldn't map input buffer...rc=%d\n", rc);
      }
   }

   /* =============================================================== */
   /* Data Verification.                                              */
   /* =============================================================== */

   rc = 0;
   if (verbose_flag) {
      if (kernel_type == KERNEL_LS && device_type == CL_DEVICE_TYPE_GPU) printf("local wgsz = %4d ", gpu_wgsz);
   }
   if (verify_flag) {
      if (verbose_flag) {
         printf("verify...");
      }
      startclock();

      /* Run the trivial (reference) spmv calculation, using the data previously loaded into CSR format. */
      unsigned int i;
      for (i=0; i<ny; ++i) {
         FLOAT t = 0;
         unsigned int lb = row_index_array[i];
         unsigned int ub = row_index_array[i+1];
         for (j=lb; j<ub; ++j) {
            t += data_array[j] * input_array[0][x_index_array[j]];
         }
         output_array1[i] = t;
      }

      /* Compute and print out performance of trivial SpMV calculation. */
      time_trivial_compute = stopclock();
      if (timing_flag) {
         unsigned int flops;
         flops = 2 * non_zero;
         double gflops = 0.000000001 * ((double) flops) / ((double) time_trivial_compute);
         printf("Host Trivial Code exec time: %f (%10.6lf gflops) ", time_trivial_compute, gflops);
      }

      /* If we're not using sub-buffers, some extra work is needed here to move the output data to a unified buffer. */
      FLOAT *aggregate_output_array;
      if (clCreateSubBuffer_ptr) { /* if sub_buffers are supported */
         aggregate_output_array = output_array[0];
      }
      else {
         MEMORY_ALLOC_CHECK(aggregate_output_array, (nyround * sizeof (FLOAT)), "aggregate_output_array") 
         int idx = 0;
         for (n=0; n<num_devices; ++n) {
            for (i=0; i < (output_subbuffer_size[n] / sizeof(FLOAT)); ++i) {
               aggregate_output_array[idx++] = output_array[n][i];
            }
         }
      }

      /* Compare results of kernel computations against trivial calculation results. */
      startclock();
      double sum;
      double diffsum;
      sum = 0.0;
      diffsum = 0.0;
      for (i=0; i<ny; ++i) {
         FLOAT a, b;
         double abs_a, delta;
         a = output_array1[i];
         b = aggregate_output_array[i];
if (a != a) printf("n = %d, i = %d, a = %lf\n", n, i, a);
if (b != b) printf("n = %d, i = %d, a = 0x%16llx, b = 0x%016llx\n", n, i, *(long long int *) &a, *(long long int *) &b);
         abs_a = ((double) a);
         delta = (((double) a) - ((double) b));
         abs_a = (abs_a < 0.0) ? -abs_a : abs_a;
         delta = (delta < 0.0) ? -delta : delta;
         sum += abs_a;
         diffsum += delta;
      }
      time_verify = stopclock();
      printf("avg error = %le, ", diffsum / sum);
      if (diffsum / sum > 0.0001) {
         rc = -1;
      }
   }

   if (timing_flag) {

      /* Unmap input and output buffer memory objects, in preparation for performance runs. */
      for (n=0; n<num_devices; ++n) {
         rc = clEnqueueUnmapMemObject(cmd_queue[n], input_buffer[n], input_array[n],   0, NULL, &events[n]);
         if (rc != CL_SUCCESS) {
            fprintf(stderr, "Couldn't unmap input buffer...rc=%d\n", rc);
         }
      }
      if (clCreateSubBuffer_ptr) { /* if sub_buffers are supported */
         rc = clEnqueueUnmapMemObject(cmd_queue[0], output_buffer, output_array[0], 0, NULL, &events[num_devices]);
         if (rc != CL_SUCCESS) {
            fprintf(stderr, "Couldn't unmap output buffer...rc=%d\n", rc);
         }
         clWaitForEvents(num_devices+1, events);
      }
      else {
         for (n=0; n<num_devices; ++n) {
            rc = clEnqueueUnmapMemObject(cmd_queue[n], output_subbuffer[n], output_array[n], 0, NULL, &events[num_devices+n]);
            if (rc != CL_SUCCESS) {
               fprintf(stderr, "Couldn't unmap output buffer...rc=%d\n", rc);
            }
         }
         clWaitForEvents(2*num_devices, events);
      }

      /* Run kernel multiple times for performance measurement. */
      startclock();

      for (i=0; i<reps; ++i) {
         for (n=0; n<num_devices; ++n) {
            cluSetKernelNDRange(clu, kernel[n], ndims, 0, global_work_size[n], local_work_size[n]);
            if (kernel_type == KERNEL_LS) {
               cluRunKernel(clu, kernel[n], NULL, 8, 
                            sizeof(cl_mem), (void *) &input_buffer[n],     /* input_array */
                            sizeof(cl_mem), (void *) &output_subbuffer[n], /* output_array */
                            sizeof(cl_mem), (void *) &matrix_buffer[n],    /* tilebuffer */
                            sizeof(cl_uint), &column_span,
                            sizeof(cl_uint), &max_slabheight,
                            sizeof(cl_uint), &team_size,
                            sizeof(cl_uint), &num_header_packets,
                            (size_t) (max_slabheight * sizeof(FLOAT)), (void *) NULL);
            }
            else {
               cluRunKernel(clu, kernel[n], NULL, 10, 
                            sizeof(cl_mem), (void *) &input_buffer[n],     /* input_array */
                            sizeof(cl_mem), (void *) &output_subbuffer[n], /* output_array */
                            sizeof(cl_mem), (void *) &matrix_buffer[n],    /* tilebuffer */
                            sizeof(cl_uint), &column_span,
                            sizeof(cl_uint), &max_slabheight,
                            sizeof(cl_uint), &segcachesize,
                            sizeof(cl_uint), &num_header_packets,
                            (size_t) (2 * column_span * sizeof(FLOAT)), (void *) NULL,
                            (size_t) (max_slabheight * sizeof(FLOAT)), (void *) NULL,
                            (size_t) (segcachesize * sizeof(packet)), (void *) NULL);
            }
         }
         //if ((i&1023)==1023) for (n=0; n<num_devices; ++n) clFinish(cmd_queue[n]);
      }

      /* Wait until kernel execution completes. */
      for (n=0; n<num_devices; ++n) clFinish(cmd_queue[n]);

      /* Time when execution is completed. */
      time_performance_run = stopclock();
    
      /* Compute and print out performance results. */
      int flops;
      totalbytes = (float) reps * ((float) in_bytes + (float) mat_bytes + (float) out_bytes);
      flops = 2 * non_zero;
      double gflops = 0.000000001 * ((double) flops * ((double) reps)) / ((double) time_performance_run);
      double gflops_projected = gflops / (packet_density);
      gflops_projected *= interpacket_bloat;
      printf("Kernel exec time: %f (%10.6lf gflops [bloat-compensated: %10.6lf], %10.6lf GB/sec) ", 
              time_performance_run, gflops, gflops_projected, totalbytes * 0.000000001f / time_performance_run);

   } /* if (timing_flag) */

   printf("(matrix %s)\n", file_name);

   /* ================= */
   /* Shut Down OpenCL. */
   /* ================= */

   cluDestroy(clu);

   /* ================================================================== */
   /* Compute and print out the timing results for the various sections. */
   /* ================================================================== */

   if (timing_flag) {
      printf("Timing...\n");
      printf("OpenCL setup time: %f\n", time_setup);
      printf("kernel creation time: %f\n", time_kernel_creation);
      printf("tiled matrix creation time: %f\n", time_matrix_creation);
      printf("buffer allocation and mapping time: %f\n", time_buffer_handling);
      printf("tiled matrix copying time: %f\n", time_matrix_copy);
      printf("results comparison time: %f\n\n", time_verify);
   }

   return rc;
}

/* ================================================================================================== */
/* Main.                                                                                              */
/* ================================================================================================== */

/*
 * Program entry point.
 *
 * Hands the command line to parse_cmdline(), then runs tiled_SpMV() once.
 * The type of the literal passed to tiled_SpMV() picks the precision variant:
 * a float literal for single precision, a double literal for double precision.
 * (How that selection is implemented is defined elsewhere in this file.)
 *
 * Returns whatever tiled_SpMV() returns, which becomes the process exit status.
 */
int main(int argc, char *argv[]) {
   parse_cmdline(argc, argv);

   if (!double_flag) {
      /* Single precision was requested (or left as the default). */
      return tiled_SpMV(0.0f);
   }

   /* Double precision was requested. */
   return tiled_SpMV(0.0);
}
