/*************************************************************************/
/*                                                                       */
/* Licensed Materials - Property of IBM                                  */
/*                                                                       */
/*                                                                       */
/* (C) Copyright IBM Corp. 2010                                          */
/* All Rights Reserved                                                   */
/*                                                                       */
/* US Government Users Restricted Rights - Use, duplication or           */
/* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.     */
/*                                                                       */
/*************************************************************************/

#define _GNU_SOURCE 1
#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#ifndef _WIN32
#include <unistd.h>
#endif
#include <dirent.h>
#include <limits.h>
#include <ctype.h>
#include "clu.h"

#define BINARY_SUFFIX	".ocl_bin"


#ifdef _WIN32
/*
 * Case-insensitive substring search, provided for Windows builds.
 *
 * Returns a pointer to the first position in haystack where needle occurs
 * when letter case is ignored, or 0 when there is no such position. An empty
 * needle matches at the start of haystack (same convention as strstr).
 */
char *strcasestr (char *haystack, char *needle)
{
  char *start;

  if (!*needle)
    return haystack;

  /* Try every start position so that a failed partial match is rescanned */
  for (start = haystack; *start; start++) {
    const char *h = start;
    const char *n = needle;

    /* ctype functions need values in the unsigned char range */
    while (*h && *n &&
	   toupper ((unsigned char) *h) == toupper ((unsigned char) *n)) {
      h++;
      n++;
    }
    if (!*n)
      return start;
  }
  return 0;
}
#endif

/* 
 * =====================================================================================
 *                                CLU INTERNAL FUNCTIONS, MACROS and STRUCTURES
 * =====================================================================================
 */

/*
 * clu_i_alloc_buffer: malloc _size bytes, cast the result to _type and store
 * it in _ret_ptr. When the allocation fails, a message naming the calling
 * function, file and line is written to stderr; execution then continues with
 * _ret_ptr == NULL, so the caller still has to check _ret_ptr.
 * NOTE(review): the expansion is a bare { } block, so using it as the body of
 * an un-braced "if (...) clu_i_alloc_buffer(...); else ..." will not parse.
 */
#define clu_i_alloc_buffer(_type,_size,_ret_ptr)                    \
{                                                                   \
  _ret_ptr = (_type)malloc ((_size));                               \
  if (_ret_ptr == NULL) {					    \
    fprintf (stderr, "CLU ERROR in func %s, file %s, line %d - cannot allocate memory\n", __PRETTY_FUNCTION__, __FILE__, __LINE__); \
  }                                                                 \
}


#define INITIAL_VECTOR_ARRAY_SIZE 5             /* initial size of vector container */

/* 
 * ===  MACROS  ========================================================================
 * The following macros are used to emulate C++ templates. These are basically 
 * vector data structures. The following template functions are defined:
 *   vector_create
 *   vector_add
 *   vector_get_element        
 *   vector_del_element
 *   vector_get_size
 *   vector_free
 * =====================================================================================
 */

/*
 * Generates "_func_name (init_capacity)": allocates a _vector_type holding no
 * elements, with room for init_capacity elements of type _cl_type.
 * Returns the new vector, or NULL when either allocation fails (nothing is
 * leaked in that case).
 */
#define generic_vector_create(_vector_type,_cl_type,_func_name)		\
static _vector_type * _func_name(int init_capacity)			\
{									\
  _vector_type *vec = (_vector_type *) malloc (sizeof *vec);		\
  if (vec != NULL) {							\
    vec->curr_capacity = init_capacity;					\
    vec->curr_index = 0;						\
    vec->data_ptr =							\
      (_cl_type *) malloc ((sizeof (_cl_type)) * init_capacity);	\
    if (vec->data_ptr == NULL) {					\
      /* element storage failed: give the header back too */		\
      free (vec);							\
      vec = NULL;							\
    }									\
  }									\
  return vec;								\
}

/*
 * Generates "_func_name (v, element)": appends element to the end of vector
 * v, growing the storage by 5 slots when it is full.
 *
 * Returns the number of elements stored after the append (i.e. the index of
 * the new element plus one), or -1 when the storage cannot be grown. On
 * failure the vector is left exactly as it was: the old array stays valid
 * and the capacity is unchanged.
 */
#define generic_vector_add(_vector_type,_cl_type,_func_name)		\
static int _func_name (_vector_type * v, _cl_type element)		\
{									\
  if (!(v->curr_index < v->curr_capacity)) {				\
    /* grow through a temporary so a failed realloc loses nothing */	\
    _cl_type *grown = (_cl_type*)realloc (v->data_ptr, (sizeof (_cl_type)) * (v->curr_capacity + 5)); \
    if (!grown) {							\
      return -1;							\
    }									\
    v->data_ptr = grown;						\
    v->curr_capacity = v->curr_capacity + 5;				\
  }									\
  v->data_ptr[v->curr_index] = element;					\
  v->curr_index++;							\
  return v->curr_index;							\
}

/*
 * Generates "_func_name (v, index)": returns the element stored at index in
 * vector v. Returns NULL when index does not refer to a stored element
 * (negative, or not below the number of elements added so far); slots that
 * are allocated but not yet filled are never read.
 */
#define generic_vector_get_element(_vector_type,_cl_type,_func_name)    \
static _cl_type _func_name (_vector_type * v, int index)		\
{									\
  _cl_type nothing=NULL;						\
  if ((index >= 0) && (index < v->curr_index)) {			\
    return v->data_ptr[index];						\
  }									\
  return nothing;							\
}

/*
 * Generates "_func_name (v, index)": removes the element at index from
 * vector v, shifting every later element one slot to the left, and returns
 * the removed element. When index does not refer to a stored element the
 * vector is left untouched and NULL is returned.
 */
#define generic_vector_del_element(_vector_type,_cl_type,_func_name)	\
static _cl_type _func_name (_vector_type* v, int index)			\
{									\
  int i;								\
  _cl_type val = NULL;							\
  /* only slots below curr_index hold elements */			\
  if ((index < 0) || !(index < v->curr_index)) {			\
    return val;								\
  }									\
  val = v->data_ptr[index];						\
  for (i = index; i < (v->curr_index - 1); i++) {			\
    v->data_ptr[i] = v->data_ptr[i + 1];				\
  }									\
  v->curr_index--;							\
  return val;								\
}

/*
 * Generates "_func_name (v)": returns the number of elements currently
 * stored in vector v (its curr_index), not its allocated capacity.
 */
#define generic_vector_get_size(_vector_type,_func_name)	\
static int _func_name (_vector_type* v)				\
{								\
  int stored = v->curr_index;					\
  return stored;						\
}

/*
 * Generates "_func_name (v)": frees the element array and the vector header.
 * The elements themselves are not released. A NULL v is accepted and
 * ignored, like free(NULL). Always returns 1.
 */
#define generic_vector_free(_vector_type,_cl_type,_func_name)	\
static int _func_name (_vector_type * v)			\
{								\
  if (v != NULL) {						\
    free (v->data_ptr);						\
    free (v);							\
  }								\
  return 1;							\
}

/*
 * Use the above Macros to define vector functions for program vector, cmdq vector
 * and event vector in the clu_t structure.
 *
 * Each line below expands to one static function. Only the operations listed
 * here are generated for each container: the cmdq vector has no get_element
 * or get_size, and no del_element function is generated for any of them.
 * NOTE(review): the ';' after each invocation leaves an empty declaration at
 * file scope, which pedantic compilers may warn about.
 */
/* program vector: elements are clu_i_program_t */
generic_vector_create (clu_i_program_vector_t, clu_i_program_t, program_vector_create);
generic_vector_add (clu_i_program_vector_t, clu_i_program_t, program_vector_add);
generic_vector_get_element (clu_i_program_vector_t, clu_i_program_t, program_vector_get_element);
generic_vector_free (clu_i_program_vector_t, clu_i_program_t, program_vector_free);
generic_vector_get_size (clu_i_program_vector_t, program_vector_get_size);

/* command queue vector: elements are cl_command_queue */
generic_vector_create (clu_i_cmdq_vector_t, cl_command_queue, cmdq_vector_create);
generic_vector_add (clu_i_cmdq_vector_t, cl_command_queue, cmdq_vector_add);
generic_vector_free (clu_i_cmdq_vector_t, cl_command_queue, cmdq_vector_free);

/* event vector: elements are cl_event */
generic_vector_create (clu_i_event_vector_t, cl_event, event_vector_create);
generic_vector_add (clu_i_event_vector_t, cl_event, event_vector_add);
generic_vector_get_element (clu_i_event_vector_t, cl_event, event_vector_get_element);
generic_vector_free (clu_i_event_vector_t, cl_event, event_vector_free);
generic_vector_get_size (clu_i_event_vector_t, event_vector_get_size);

/*
 * The vector release functions can not be encapsulated in the macros so 
 * we define them separately here
 */

/*
 * Releases every program stored in v: the OpenCL program object is released
 * with clReleaseProgram and the element structure itself is freed. The vector
 * is then marked empty so the released pointers cannot be visited again; its
 * storage stays allocated.
 */
static void
program_vector_release_elements (clu_i_program_vector_t * v)
{
  int i;
  for (i = 0; i < (v->curr_index); i++) {
    clReleaseProgram (v->data_ptr[i]->program);
    free (v->data_ptr[i]);
  }
  /* same convention as event_vector_release_elements */
  v->curr_index = 0;
}

/*
 * Releases every command queue stored in v with clReleaseCommandQueue and
 * marks the vector empty so the released handles cannot be visited again.
 * The vector's storage stays allocated.
 */
static void
cmdq_vector_release_elements (clu_i_cmdq_vector_t * v)
{
  int i;
  for (i = 0; i < (v->curr_index); i++) {
    clReleaseCommandQueue (v->data_ptr[i]);
  }
  /* same convention as event_vector_release_elements */
  v->curr_index = 0;
}
/*
 * Releases every event stored in v with clReleaseEvent, in index order, and
 * then marks the vector empty. The vector's storage stays allocated.
 */
static void
event_vector_release_elements (clu_i_event_vector_t * v)
{
  int slot = 0;

  while (slot < v->curr_index) {
    clReleaseEvent (v->data_ptr[slot]);
    slot++;
  }
  v->curr_index = 0;
}


/* 
 * === hash table MACROS     ===========================================================
 *         
 *  The following macros are used to emulate C++ template for a hash table. In this
 *  hash table, we have a key = generic_key_t and value = generic_value_t 
 *
 *     generic_hash
 *     generic_hash_table_create
 *     generic_hash_table_add
 *     generic_hash_table_search
 *     generic_hash_table_destroy
 *
 *  We use the multiplication method to hash the key (a cl_kernel or
 *  cl_device_id handle) into a bucket index. The table has
 *  GENERIC_HASH_TABLE_SIZE buckets. Collisions are handled by chaining the
 *  colliding nodes in a linked list.
 * =====================================================================================
 */

/* 
 * ===  MACRO  =========================================================================
 *         Name:  generic_hash
 *  Description:  generates "_func_name (key)", which returns the bucket index for
 *                a key of type generic_key_t. The key is converted to an integer,
 *                narrowed to unsigned int, multiplied by GENERIC_HASH_CONSTANT and
 *                shifted right by (32 - GENERIC_HASH_TABLE_SIZE_LOG).
 *                NOTE(review): the shift amount presumably assumes a 32-bit
 *                product - confirm against the definition of the constant.
 * =====================================================================================
 */
#define generic_hash(generic_key_t,_func_name)                                          \
static unsigned int _func_name (generic_key_t key)                                      \
{                                                                                       \
  uintptr_t key_bits = (uintptr_t) key;                                                 \
  unsigned int low_bits = (unsigned int) key_bits;                                      \
  return ((low_bits * GENERIC_HASH_CONSTANT) >> (32 - GENERIC_HASH_TABLE_SIZE_LOG));    \
}


/* 
 * ===  MACRO  =========================================================================
 *         Name:  generic_hash_table_create
 *  Description:  generates "_func_name ()", which allocates a _hash_table_t and sets
 *                each of its GENERIC_HASH_TABLE_SIZE buckets to NULL. The process
 *                exits with an error message when the allocation fails, so the
 *                generated function never returns NULL.
 * =====================================================================================
 */
#define generic_hash_table_create(_hash_table_t,_func_name)		\
static _hash_table_t * _func_name (void)				\
{									\
  int bucket = 0;							\
  _hash_table_t *fresh = malloc (sizeof (_hash_table_t));		\
  if (fresh == NULL) {							\
    fprintf (stderr, "ERROR in func %s, failed to alloc mem for hash table, file=%s, line=%d\n", __PRETTY_FUNCTION__, __FILE__, __LINE__); \
    exit (EXIT_FAILURE);						\
  }									\
  while (bucket < GENERIC_HASH_TABLE_SIZE) {				\
    fresh->nodes[bucket] = NULL;					\
    bucket++;								\
  }									\
  return fresh;								\
}

/* 
 * ===  MACRO  =========================================================================
 *         Name:  generic_hash_table_add
 *  Description:  generates "_func_name (hash_table, value, key_val)", which stores a
 *                key/value pair in the hash table. A new node (sized as one
 *                _hash_node_t) is appended to the end of the chain of the bucket
 *                selected by _hash_func (key_val). Existing entries with the same
 *                key are not replaced. The process exits with an error message
 *                when the node cannot be allocated.
 * =====================================================================================
 */
#define generic_hash_table_add(_hash_table_t,_hash_node_t,generic_value_t,generic_key_t,_hash_func,_func_name) \
static void _func_name (_hash_table_t * hash_table,generic_value_t value,generic_key_t key_val)	\
{									\
  unsigned int key = _hash_func (key_val);				\
  _hash_node_t *fresh = malloc (sizeof (_hash_node_t));			\
  if (!fresh) {								\
    fprintf (stderr, "ERROR in func %s, failed to alloc mem for hash table node, file=%s, line=%d\n", \
	       __PRETTY_FUNCTION__, __FILE__, __LINE__);		\
    exit (EXIT_FAILURE);						\
  }									\
  fresh->value = value;							\
  fresh->next = NULL;							\
  fresh->key = key_val;							\
  if (hash_table->nodes[key] == NULL) {					\
    /* there's nothing at that slot */					\
    hash_table->nodes[key] = fresh;					\
  } else {								\
    /* there's a collision: append at the tail of the chain */		\
    _hash_node_t *tail = hash_table->nodes[key];			\
    while (tail->next != NULL) {					\
      tail = tail->next;						\
    }									\
    tail->next = fresh;							\
  }									\
}

/* 
 * ===  MACRO ==========================================================================
 *         Name:  generic_hash_table_search
 *  Description:  generates "_func_name (hash_table, key_val)", which walks the chain
 *                of the bucket selected by _hash_func (key_val) and returns the
 *                value of the first node whose key equals key_val, or NULL when
 *                no node matches.
 * =====================================================================================
 */
#define generic_hash_table_search(_hash_table_t,_hash_node_t,generic_value_t,generic_key_t,_hash_func,_func_name) \
static generic_value_t _func_name (_hash_table_t * hash_table, generic_key_t key_val) \
{									\
  _hash_node_t *cursor;							\
  unsigned int bucket = _hash_func (key_val);				\
  for (cursor = hash_table->nodes[bucket]; cursor != NULL; cursor = cursor->next) { \
    if (cursor->key == key_val) {					\
      return cursor->value;						\
    }									\
  }									\
  return NULL;								\
}

/* 
 * ===  MACRO  =========================================================================
 *         Name:  generic_hash_table_destroy
 *  Description:  generates "_func_name (hash_table)", which releases a hash table:
 *                for every node in every bucket, _free_value_func is called on the
 *                node's value and the node is freed; finally the table itself is
 *                freed.
 * =====================================================================================
 */
#define generic_hash_table_destroy(_hash_table_t,_hash_node_t,generic_value_t,_free_value_func,_func_name) \
static void _func_name (_hash_table_t * hash_table)			\
{									\
  int bucket;								\
  for (bucket = 0; bucket < GENERIC_HASH_TABLE_SIZE; bucket++) {	\
    _hash_node_t *cursor = hash_table->nodes[bucket];			\
    while (cursor != NULL) {						\
      /* step forward first: the current node is about to be freed */	\
      _hash_node_t *doomed = cursor;					\
      generic_value_t payload = doomed->value;				\
      cursor = doomed->next;						\
      _free_value_func (payload);					\
      free (doomed);							\
    }									\
  }									\
  free (hash_table);							\
}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  clu_i_free_kernel
 *  Description:  release everything owned by a clu_i_kernel_t and free the structure.
 *                The OpenCL kernel is released only when it was created by CLU; the
 *                event vector is released only when profiling is on. A NULL kernel
 *                is a fatal error (message on stderr, then exit).
 * =====================================================================================
 */
static void clu_i_free_kernel (clu_i_kernel_t kernel)
{
  if (!kernel) {
    fprintf (stderr, "CLU ERROR in func %s, file %s, line %d - the input kernel is NULL\n", __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  if (kernel->created_by_clu) {
    clReleaseKernel (kernel->kernel);
  }

  if (kernel->profiling_on) {
    event_vector_release_elements (kernel->events);
    event_vector_free (kernel->events);
  }

  /* free (NULL) does nothing, so unset work-size arrays need no guard */
  free ((void *) kernel->global_work_offset);
  free ((void *) kernel->global_work_size);
  free ((void *) kernel->local_work_size);

  free (kernel);
}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  clu_i_free_device
 *  Description:  free the device name held by a clu_i_device_t. A NULL device is
 *                reported through CLU_EXIT_ERROR.
 *                NOTE(review): the device structure itself is not freed here,
 *                whereas clu_i_free_kernel frees its structure - confirm whether
 *                the structure is released elsewhere or leaks.
 * =====================================================================================
 */
static void clu_i_free_device (clu_i_device_t device)
{
  if (!device) {
    CLU_EXIT_ERROR ("The input device is NULL.\n");
  }

  /* free (NULL) does nothing, so an unset name needs no guard */
  free (device->device_name);
}



/*-----------------------------------------------------------------------------
 *  Definitions for the kernel hash functions based on the generic hash macros.
 *  Keys are cl_kernel handles, values are clu_i_kernel_t; destroying the table
 *  calls clu_i_free_kernel on every stored value.
 *-----------------------------------------------------------------------------*/
generic_hash(cl_kernel,kernel_hash); 
generic_hash_table_create(clu_i_kernel_hash_table_t,kernel_hash_table_create);
generic_hash_table_add(clu_i_kernel_hash_table_t,clu_i_hash_kernel_node_t,clu_i_kernel_t,cl_kernel,kernel_hash,kernel_hash_table_add);
generic_hash_table_search(clu_i_kernel_hash_table_t,clu_i_hash_kernel_node_t,clu_i_kernel_t,cl_kernel,kernel_hash,kernel_hash_table_search);
generic_hash_table_destroy(clu_i_kernel_hash_table_t,clu_i_hash_kernel_node_t,clu_i_kernel_t,clu_i_free_kernel,kernel_hash_table_destroy);

/*-----------------------------------------------------------------------------
 *  Definitions for the device hash functions based on the generic hash macros.
 *  Keys are cl_device_id handles, values are clu_i_device_t; destroying the
 *  table calls clu_i_free_device on every stored value.
 *-----------------------------------------------------------------------------*/
generic_hash(cl_device_id,device_hash); 
generic_hash_table_create(clu_i_device_hash_table_t,device_hash_table_create);
generic_hash_table_add(clu_i_device_hash_table_t,clu_i_hash_device_node_t,clu_i_device_t,cl_device_id,device_hash,device_hash_table_add);
generic_hash_table_search(clu_i_device_hash_table_t,clu_i_hash_device_node_t,clu_i_device_t,cl_device_id,device_hash,device_hash_table_search);
generic_hash_table_destroy(clu_i_device_hash_table_t,clu_i_hash_device_node_t,clu_i_device_t,clu_i_free_device,device_hash_table_destroy);

#ifdef __powerpc__
/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluInternalGetVendorName
 *  Description:  Internal CLU function. Queries CL_PLATFORM_VENDOR for the platform
 *                held in clu and returns it in a freshly malloc'ed buffer that the
 *                caller must free. Query failures go through CLU_CHECK_ERROR; an
 *                allocation failure prints a message and exits.
 * =====================================================================================
 */
static char* cluInternalGetVendorName (clu_t clu)
{
  size_t bytes_needed;
  char *vendor;
  cl_int status;

  /* first call only asks how large the string is */
  status = clGetPlatformInfo (clu->platform, CL_PLATFORM_VENDOR, 0, NULL, &bytes_needed);
  CLU_CHECK_ERROR ("clGetPlatformInfo CL_PLATFORM_VENDOR size", status);

  vendor = (char *) malloc (bytes_needed);
  if (vendor == NULL) {
    fprintf (stderr, "In func %s, cannot malloc buffer of size %d in file %s, at line %d\n", __PRETTY_FUNCTION__, (int)bytes_needed, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* second call fetches the string itself */
  status = clGetPlatformInfo (clu->platform, CL_PLATFORM_VENDOR, bytes_needed, vendor, NULL);
  CLU_CHECK_ERROR ("clGetPlatformInfo CL_PLATFORM_VENDOR string", status);

  return vendor;
}
#endif

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluInternalGetDeviceVendorName
 *  Description:  Internal CLU function. Queries CL_DEVICE_VENDOR for the given
 *                device and returns it in a freshly malloc'ed buffer that the
 *                caller must free. Any failure (NULL device, failed query, failed
 *                allocation) prints a message to stderr and exits.
 * =====================================================================================
 */
static char* cluInternalGetDeviceVendorName (cl_device_id device)
{
  char *vendor;
  size_t vendor_bytes;
  cl_int status;

  if (!device) {
    fprintf (stderr, "ERROR: given device is invalid in file %s at line %d\n", __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* first call only asks how large the string is */
  status = clGetDeviceInfo (device, CL_DEVICE_VENDOR, 0, NULL, &vendor_bytes);
  if (CL_SUCCESS != status) {
    fprintf(stderr, "clGetDeviceInfo, CL_DEVICE_VENDOR failed: file %s line %d (%s)\n",
	    __FILE__, __LINE__, cluGetErrorString (status));
    exit (EXIT_FAILURE);
  }

  vendor = (char *) malloc (vendor_bytes);
  if (vendor == NULL) {
    fprintf (stderr, "ERROR in function %s, failed to allocate mem for vendor_name, file=%s, line=%d\n", 
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* second call fetches the string itself */
  status = clGetDeviceInfo (device, CL_DEVICE_VENDOR, vendor_bytes, vendor, NULL);
  if (CL_SUCCESS != status) {
    fprintf(stderr, "clGetDeviceInfo, CL_DEVICE_VENDOR failed: file %s line %d (%s)\n",
	    __FILE__, __LINE__, cluGetErrorString (status));
    exit (EXIT_FAILURE);
  }

  return vendor;
}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluInternalFindBinary
 *  Description:  internal function to CLU. Scans the current directory for a file
 *                named exactly base_name followed by BINARY_SUFFIX.
 *                  - If the file was modified after time_to_cmp (usually the
 *                    timestamp of the source file), its name is returned in a
 *                    malloc'ed string that the caller should free.
 *                  - Otherwise the file is obsolete and is removed.
 *                Returns NULL when no usable binary exists, when base_name is NULL,
 *                or when the directory cannot be opened. A failing stat or a
 *                failing allocation is fatal.
 * =====================================================================================
 */
static char *
cluInternalFindBinary (const char *base_name, long long time_to_cmp)
{
  struct dirent *dp;
  const char *dir_path = ".";
  char *found = NULL;
  size_t size_base_name;
  DIR *dir;

  if (base_name == NULL) {
    return NULL;
  }

  dir = opendir (dir_path);
  if (dir == NULL) {
    fprintf (stderr, "ERROR in function %s, failed to open directory %s, err=%s, file = %s, line=%d\n",
	     __PRETTY_FUNCTION__, dir_path, strerror (errno), __FILE__, __LINE__);
    return NULL;
  }

  size_base_name = strlen (base_name);

  /* Loop through the directory to look for a file with the same base_name as base_name */
  while ((dp = readdir (dir)) != NULL) {
    const char *name = dp->d_name;
    struct stat source_last_modified;
    long long timestamp;

    /* the suffix is only inspected once the prefix is known to be present */
    if ((strncmp (base_name, name, size_base_name) != 0) ||
	(strcmp (name + size_base_name, BINARY_SUFFIX) != 0)) {
      continue;
    }

    /* find the time last modified */
    if (stat (name, &source_last_modified) == -1) {
      fprintf (stderr, "ERROR: error looking at the source file %s in function %s, in file %s, at line %d, errno=%d, %s\n",
	       name, __PRETTY_FUNCTION__, __FILE__, __LINE__, errno, strerror (errno));
      exit (EXIT_FAILURE);
    }
    timestamp = (long long) source_last_modified.st_mtime;

    if (time_to_cmp < timestamp) {
      /* The binary is newer than time_to_cmp. d_name belongs to the directory
       * stream and dies with closedir, so hand back a private copy. */
      size_t name_bytes = strlen (name) + 1;
      found = (char *) malloc (name_bytes);
      if (!found) {
	fprintf (stderr, "ERROR in function %s, failed to allocate mem for binary file name, file=%s, line=%d\n",
		 __PRETTY_FUNCTION__, __FILE__, __LINE__);
	exit (EXIT_FAILURE);
      }
      memcpy (found, name, name_bytes);
      break;
    }

    /* The binary is not newer than time_to_cmp: it is obsolete, delete it */
    if (remove (name) != 0) {
      fprintf (stderr, "ERROR in function %s, failed to remove obsolete binary file, err=%s, file = %s, line=%d\n",
	       __PRETTY_FUNCTION__, strerror(errno), __FILE__, __LINE__);
    }
  }

  closedir (dir);
  return found;
}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluInternalGetDeviceName
 *  Description:  internal function for CLU. Queries CL_DEVICE_NAME for the given
 *                device and returns it in a freshly malloc'ed buffer that the
 *                caller must free. Any failure (NULL device, failed query, failed
 *                allocation) prints a message to stderr and exits.
 * =====================================================================================
 */
static char *
cluInternalGetDeviceName (cl_device_id device)
{
  char *name_buf;
  size_t name_bytes;
  cl_int status;

  if (!device) {
    fprintf (stderr, "ERROR: given device is invalid in file %s at line %d\n", __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* first call only asks how large the string is */
  status = clGetDeviceInfo (device, CL_DEVICE_NAME, 0, NULL, &name_bytes);
  if (CL_SUCCESS != status) {
    fprintf(stderr, "clGetDeviceInfo, CL_DEVICE_NAME failed: file %s line %d (%s)\n",
	    __FILE__, __LINE__, cluGetErrorString (status));
    exit (EXIT_FAILURE);
  }

  name_buf = (char *) malloc (name_bytes);
  if (name_buf == NULL) {
    fprintf (stderr, "ERROR in function %s, failed to allocate mem for device_name, file=%s, line=%d\n", 
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* second call fetches the string itself */
  status = clGetDeviceInfo (device, CL_DEVICE_NAME, name_bytes, name_buf, NULL);
  if (CL_SUCCESS != status) {
    fprintf(stderr, "clGetDeviceInfo, CL_DEVICE_NAME failed: file %s line %d (%s)\n",
	    __FILE__, __LINE__, cluGetErrorString (status));
    exit (EXIT_FAILURE);
  }

  return name_buf;
}

/*
 * Replace every non-alphanumeric character in the first len bytes of text
 * with an underscore. The cast keeps isalnum's argument inside the unsigned
 * char range, as <ctype.h> requires.
 */
static void
cluInternalSanitizeNamePart (char *text, size_t len)
{
  size_t i;

  for (i = 0; i < len; i++) {
    if (!isalnum ((unsigned char) text[i])) {
      text[i] = '_';
    }
  }
}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluInternalGenerateBinaryFilename
 *  Description:  
 *  Construct the file name for a binary based on the device name,
 *  build options, and a base_file_name. The result has the form
 *      base_file_name.device_name.build_options
 *  where every non-alphanumeric character of the device name and of the
 *  build options is replaced by '_'; base_file_name is copied unchanged.
 *  When build_options is NULL the name ends with the second dot.
 *  The returned string is malloc'ed and must be freed by the caller.
 *  A NULL device, a NULL base_file_name or a failed allocation is fatal.
 * =====================================================================================
 */
static char *
cluInternalGenerateBinaryFilename (cl_device_id device, const char *build_options, const char *base_file_name)
{
  char *device_name;
  char *filename;
  const char *options;
  size_t base_len, device_len, options_len, name_size;

  if (device == NULL) {
    fprintf (stderr, "ERROR: given cl_device_id is invalid in file %s at line %d\n", __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }
  if (base_file_name == NULL) {
    fprintf (stderr, "ERROR: given base_file_name is NULL in file %s at line %d\n", __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  options = (build_options != NULL) ? build_options : "";

  /* get the device name (heap allocated, released below) */
  device_name = cluInternalGetDeviceName (device);

  base_len = strlen (base_file_name);
  device_len = strlen (device_name);
  options_len = strlen (options);

  /* 2 is for the 2 dots, 1 for the null-terminating byte: */
  /*   filename.device_name.build_options */
  name_size = base_len + device_len + options_len + 2 + 1;

  filename = (char *) malloc (name_size);
  if (!filename) {
    fprintf (stderr, "ERROR, cannot allocate %d bytes of memory for filename in file %s at line %d\n",
	     (int)name_size, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* Concatenate the base name, device name and build options, then sanitize
   * the device-name and build-option parts in place */
  snprintf (filename, name_size, "%s.%s.%s", base_file_name, device_name, options);
  cluInternalSanitizeNamePart (filename + base_len + 1, device_len);
  cluInternalSanitizeNamePart (filename + base_len + 1 + device_len + 1, options_len);

  free (device_name);
  return filename;
}


/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluInternalLoadProgramBinary
 *  Description:  internal CLU function.  Load a binary file with filename into a 
 *                malloc'ed buffer that the caller must free.
 *                On success the buffer is returned and, when size is not NULL,
 *                *size receives the number of bytes loaded.
 *                Returns NULL (leaving *size untouched) when the file cannot be
 *                opened, examined or read completely, or when it is empty.
 *                A failed allocation is fatal.
 * =====================================================================================
 */
static unsigned char *
cluInternalLoadProgramBinary (const char *filename, size_t * size)
{
  struct stat statbuf;
  unsigned char *binary;
  size_t length;
  FILE *fh = fopen (filename, "rb");
  if (fh == NULL) {
    fprintf (stderr, "Failed to open %s (%s)\n", filename, strerror (errno));
    return NULL;
  }

  /* without a successful stat the size below would be garbage */
  if (stat (filename, &statbuf) != 0) {
    fprintf (stderr, "Failed to stat %s (%s)\n", filename, strerror (errno));
    fclose (fh);
    return NULL;
  }

  /* an empty file holds no binary to load */
  if (statbuf.st_size <= 0) {
    fprintf (stderr, "Error reading %s\n", filename);
    fclose (fh);
    return NULL;
  }
  length = (size_t) statbuf.st_size;

  binary = (unsigned char *) malloc (length);
  if (!binary) {
    fprintf (stderr, "ERROR in function %s, failed to allocate mem for storing the binary, file=%s, line=%d\n", 
    __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* count bytes so that a short read is detected */
  if (fread (binary, 1, length, fh) != length) {
    fprintf (stderr, "Error reading %s\n", filename);
    free (binary);
    binary = NULL;
  } else if (size != NULL) {
    *size = length;
  }

  fclose (fh);

  return binary;
}


/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluInternalLoadProgramSource
 *  Description:  internal CLU function. Load a source file with filename into a
 *                malloc'ed, NUL-terminated buffer that the caller must free.
 *                An empty file yields an empty string.
 *                Returns NULL when the file cannot be opened, examined or read
 *                completely. A failed allocation is fatal.
 * =====================================================================================
 */
static char *
cluInternalLoadProgramSource (const char *filename)
{
  struct stat statbuf;
  char* source;
  size_t length;

  FILE *fh = fopen (filename, "rb");
  if (fh == 0) {
    fprintf (stderr, "Failed to open %s (%s)\n", filename, strerror (errno));
    return NULL;
  }

  /* without a successful stat the size below would be garbage */
  if (stat (filename, &statbuf) != 0) {
    fprintf (stderr, "Failed to stat %s (%s)\n", filename, strerror (errno));
    fclose (fh);
    return NULL;
  }
  length = (statbuf.st_size > 0) ? (size_t) statbuf.st_size : 0;

  /* one extra byte for the terminating NUL */
  source = (char *) malloc (length + 1);
  if (!source) {
    fprintf (stderr, "ERROR in function %s, failed to allocate mem for storing the source file, file=%s, line=%d\n", 
    __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* count bytes so that a short read is detected and an empty file is fine */
  if (fread (source, 1, length, fh) != length) {
    fprintf (stderr, "Error reading %s (%s)\n", filename, strerror (errno));
    free (source);
    source = NULL;
  } else {
    source[length] = '\0';
  }

  fclose (fh);

  return source;
}


/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluInternalSaveProgramBinary
 *  Description:  Internal CLU function. Save a cl_program binary for the cl_device_id.
 *                The saved file has the name binary_filename and is written in
 *                binary mode. The binaries of all devices associated with the
 *                program are fetched, but only the one belonging to dev_id is
 *                written. Every failure (NULL argument, device not associated
 *                with the program, allocation or file error) prints a message
 *                and exits; OpenCL errors go through CLU_CHECK_ERROR.
 * =====================================================================================
 */
static void
cluInternalSaveProgramBinary (cl_program program, cl_device_id dev_id, const char *binary_filename)
{
  size_t *binary_size;
  size_t size_ret;
  unsigned char **binaries;
  cl_device_id *device_ids;
  cl_uint num_devices;
  int match;
  unsigned int i;
  FILE *fh;
  size_t written;
  int close_rc;
  /* check for input errors */
  if (program == NULL) {
    fprintf (stderr, "ERROR, cluInternalSaveProgramBinary has a NULL program parameter in %s at %d\n", __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }
  if (dev_id == NULL) {
    fprintf (stderr, "ERROR, cluInternalSaveProgramBinary has a NULL device parameter in %s at %d\n", __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /*  Get the number of devices associated with the program */
  CLU_CHECK_ERROR ("clGetProgramInfo", clGetProgramInfo (program, CL_PROGRAM_NUM_DEVICES, sizeof (cl_uint), &num_devices, NULL));

  /*  malloc an array large enough to contain num_devices of cl_device */
  device_ids = (cl_device_id *) malloc (sizeof (cl_device_id) * num_devices);
  if (!device_ids) {
    fprintf (stderr, "ERROR in function %s, failed to allocate mem for device_ids, file=%s, line=%d\n", __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* malloc an array large enough to contain num_devices of pointers to binaries */
  binaries = (unsigned char **) malloc (sizeof (unsigned char *) * num_devices);
  if (!binaries) {
    fprintf (stderr, "ERROR in function %s, failed to allocate mem for binaries array, file=%s, line=%d\n", __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* get the device_ids associated with the program */
  CLU_CHECK_ERROR ("clGetProgramInfo", clGetProgramInfo (program, CL_PROGRAM_DEVICES, sizeof (cl_device_id) * num_devices, device_ids, NULL));

  /* find index of matching devid */
  match = -1;

  for (i = 0; i < num_devices; i++) {
    if (device_ids[i] == dev_id) {
      match = i;
      break;
    }
  }
  if (match == -1) {
    fprintf (stderr, "ERROR: the given device is not associated with the device in %s, line = %d\n", __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /*  Get the size of the binaries associated with the program */
  CLU_CHECK_ERROR ("clGetProgramInfo", clGetProgramInfo (program, CL_PROGRAM_BINARY_SIZES, 0, NULL, &size_ret));

  /*  allocate an array to store the sizes of the binaries */
  binary_size = (size_t *) malloc (size_ret);
  if (!binary_size) {
    fprintf (stderr, "ERROR in function %s, failed to allocate mem for binary_size array, file=%s, line=%d\n", 
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* query the sizes of the binaries */
  CLU_CHECK_ERROR ("clGetProgramInfo", clGetProgramInfo (program, CL_PROGRAM_BINARY_SIZES, size_ret, binary_size, NULL));

  /* allocate space for each binary associated with the program from the sizes queried above;
   * a device without a binary (size 0) gets a NULL slot */
  for (i = 0; i < num_devices; i++) {
    if (binary_size[i] > 0) {
      binaries[i] = (unsigned char *) malloc (sizeof (unsigned char) * binary_size[i]);
      if (!binaries[i]) {
	fprintf (stderr, "ERROR in function %s, failed to allocate memory for storing binaries[i], file=%s, line=%d\n", 
		 __PRETTY_FUNCTION__, __FILE__, __LINE__);
	exit (EXIT_FAILURE);
      }
    } else {
      binaries[i] = NULL;
    }
  }

  /* Get the binaries */
  CLU_CHECK_ERROR ("clGetProgramInfo", clGetProgramInfo (program, CL_PROGRAM_BINARIES, num_devices * (sizeof (unsigned char *)), binaries, NULL));

  /* "wb": the payload is binary and must not go through text-mode translation */
  fh = fopen (binary_filename, "wb");
  if (fh == NULL) {
    fprintf (stderr, "fopen(%s,'w') failed errno=%d\n", binary_filename, errno);
    exit (EXIT_FAILURE);
  }
  /*  write the binary into the file; an empty binary just leaves an empty file */
  written = 0;
  if (binary_size[match] > 0) {
    written = fwrite (binaries[match], 1, binary_size[match], fh);
  }
  if (written != (size_t) binary_size[match]) {
    fprintf (stderr, "ERROR in func %s, cannot write binary to filename %s, in file %s, at line %d, errno msg=%s",
	     __PRETTY_FUNCTION__, binary_filename, __FILE__, __LINE__, strerror(errno));
    exit (EXIT_FAILURE);
  }

  /* fclose flushes, so its result decides whether the data really got out */
  close_rc = fclose (fh);
  if (close_rc != 0) {
    fprintf (stderr, "close failed err=%d errno=%d\n", close_rc, errno);
    exit (EXIT_FAILURE);
  }
  for (i = 0; i < num_devices; i++) {
    free (binaries[i]);
  }
  free (binaries);
  free (binary_size);
  free (device_ids);
}


/* 
 * === STRUCTURES  =====================================================================
 * The following structures are used to parameterize the display of device info data.
 * =====================================================================================
 */
/* One row of a lookup table that maps a cl_bitfield value to printable text.
 * cluInternalPrintDeviceBitflags and cluInternalPrintDeviceEnum walk such a
 * table until they reach a row whose name is 0, so a table must end with a
 * zero-valued row. cluInternalPrintDeviceEnum additionally inspects the
 * name_string of the row it stopped on and treats NULL as "unknown value".
 */
struct list_bitfield {
  /* flag / enumerant value; 0 terminates the table */
  cl_bitfield name;
  /* text printed when the queried value matches name */
  char *name_string;
};

/* One row of a lookup table that maps a cl_uint enumerant to printable text.
 * cluInternalPrintDeviceEnumUint walks such a table until it reaches a row
 * whose name is 0, then prints the name_string of the row it stopped on
 * (or "<UNKNOWN>" when that name_string is NULL).
 */
struct list_uint {
  /* enumerant value; 0 terminates the table */
  cl_uint name;
  /* text printed when the queried value matches name */
  char *name_string;
};


/* Describes one device-info query and how its result is printed. The
 * cluInternalPrintDevice* helpers in this file all take a pointer to one of
 * these records.
 */
struct device_param {
  /* query identifier handed to clGetDeviceInfo by the print helpers */
  cl_device_info name;
  /* presumably the label displayed for this query -- not read by the print
   * helpers themselves; confirm where the table is consumed */
  char *name_string;
  /* presumably the minimum OpenCL version needed for this query -- not read
   * by the print helpers; TODO confirm against the table's user */
  int reqd_major_version;
  int reqd_minor_version;
  /* presumably an extension name required for this query (NULL if none) --
   * TODO confirm against the table's user */
  char *reqd_extension;
  /* helper that performs the query and prints the value; takes the device id
   * and a device_param record (presumably this same record) */
  void (*print_func)(cl_device_id id, struct device_param *);
  /* lookup table for the bitflags/enum helpers, which cast it to
   * struct list_bitfield * or struct list_uint * */
  void *list;
  /* used by cluInternalPrintDeviceSizet: when non-zero, the query that
   * returns how many size_t elements the name query yields */
  cl_device_info dim;
};


/* ===  FUNCTION  ======================================================================
 *         Name:  cluInternalPrintDeviceUint
 *  Description:  Queries the cl_uint device attribute selected by param->name
 *                and writes it to stdout in decimal followed by a newline. The
 *                query status is passed to CLU_CHECK_ERROR.
 * =====================================================================================
 */

static void cluInternalPrintDeviceUint(cl_device_id id, struct device_param *param)
{
  cl_uint info;
  cl_int status;

  status = clGetDeviceInfo (id, param->name, sizeof(info), &info, NULL);
  CLU_CHECK_ERROR ("clGetDeviceInfo", status);

  printf ("%u\n", info);
}

/* ===  FUNCTION  ======================================================================
 *         Name:  cluInternalPrintDeviceUlong
 *  Description:  Internal CLU function used to print unsigned long device
 *                information. Queries the cl_ulong attribute selected by
 *                param->name and prints it to stdout in decimal followed by a
 *                newline.
 * =====================================================================================
 */
static void cluInternalPrintDeviceUlong(cl_device_id id, struct device_param *param)
{
  cl_ulong value;

  CLU_CHECK_ERROR ("clGetDeviceInfo", clGetDeviceInfo (id, param->name, sizeof(value), &value, NULL));
  /* %llu consumes an unsigned long long; cast to exactly that type so the
   * argument matches the conversion for the full unsigned range. */
  printf ("%llu\n", (unsigned long long)value);
}

/* ===  FUNCTION  ======================================================================
 *         Name:  cluInternalPrintDeviceSizet
 *  Description:  Internal CLU function used to print size_t device information. This
 *                function supports both a single value size_t and multi-value size_t
 *                info. When param->dim is non-zero, the element count is obtained by
 *                querying the info for param->dim and the values are printed as
 *                "{v0, v1, ...}"; otherwise a single value is printed. Exits the
 *                process if the value buffer cannot be allocated.
 * =====================================================================================
 */
static void cluInternalPrintDeviceSizet(cl_device_id id, struct device_param *param)
{

  if (param->dim) {
    /* Multi-dimension size_t */
    cl_uint i;			/* same type as dim: no signed/unsigned comparison */
    cl_uint dim;
    size_t *values;

    CLU_CHECK_ERROR ("clGetDeviceInfo", clGetDeviceInfo (id, param->dim, sizeof(dim), &dim, NULL));
    values = (size_t *)malloc(sizeof(size_t) * dim);

    if (values == NULL) {
      /* size_t is unsigned, so it is printed with %zu */
      fprintf (stderr, "ERROR: cannot allocate %zu byte in file %s, at line %d\n", sizeof(size_t) * dim,
	       __FILE__, __LINE__);
      exit (EXIT_FAILURE);
    }

    CLU_CHECK_ERROR ("clGetDeviceInfo", clGetDeviceInfo (id, param->name, dim * sizeof(size_t), values, NULL));
    printf("{");
    for (i = 0; i < dim; i++) {
      /* every element after the first is preceded by ", " */
      printf ("%s%zu", (i) ? ", " : "", values[i]);
    }
    printf ("}\n");
    free (values);
  } else {
    /* Single dimension size_t */
    size_t value;

    CLU_CHECK_ERROR ("clGetDeviceInfo", clGetDeviceInfo (id, param->name, sizeof(value), &value, NULL));
    printf ("%zu\n", value);
  }
}

/* ===  FUNCTION  ======================================================================
 *         Name:  cluInternalPrintDeviceBool
 *  Description:  Queries the cl_bool device attribute selected by param->name
 *                and writes one line to stdout: "CL_FALSE" when the value is
 *                zero, "CL_TRUE" for any other value.
 * =====================================================================================
 */
static void cluInternalPrintDeviceBool(cl_device_id id, struct device_param *param)
{
  cl_bool flag;
  cl_int status = clGetDeviceInfo (id, param->name, sizeof(flag), &flag, NULL);

  CLU_CHECK_ERROR ("clGetDeviceInfo", status);

  if (flag) {
    printf ("%s\n", "CL_TRUE");
  } else {
    printf ("%s\n", "CL_FALSE");
  }
}


/* ===  FUNCTION  ======================================================================
 *         Name:  cluInternalPrintDevicePtr
 *  Description:  Queries the pointer-sized device attribute selected by
 *                param->name and writes it to stdout with the %p conversion,
 *                followed by a newline.
 * =====================================================================================
 */
static void cluInternalPrintDevicePtr(cl_device_id id, struct device_param *param)
{
  void *handle;
  cl_int status;

  status = clGetDeviceInfo (id, param->name, sizeof(handle), &handle, NULL);
  CLU_CHECK_ERROR ("clGetDeviceInfo", status);

  printf ("%p\n", handle);
}


/* ===  FUNCTION  ======================================================================
 *         Name:  cluInternalPrintDeviceString
 *  Description:  Internal CLU function used to print string device information.
 *                The required buffer size is queried first, then the string is
 *                fetched and printed to stdout enclosed in double quotes. Exits
 *                the process if the buffer cannot be allocated.
 * =====================================================================================
 */
static void cluInternalPrintDeviceString(cl_device_id id, struct device_param *param)
{
  size_t param_size;
  char *value;

  /* First call only asks for the size of the string */
  CLU_CHECK_ERROR ("clGetDeviceInfo", clGetDeviceInfo (id, param->name, 0, NULL, &param_size));

  value = (char *)malloc(param_size);
  if (value == NULL) {
    /* size_t is unsigned, so it is printed with %zu */
    fprintf (stderr, "ERROR: cannot allocate %zu bytes for string buffer in file %s, at line %d\n", 
	     param_size, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  CLU_CHECK_ERROR ("clGetDeviceInfo", clGetDeviceInfo (id, param->name, param_size, value, NULL));
  printf ("\"%s\"\n", value);
  free (value);
}


/* ===  FUNCTION  ======================================================================
 *         Name:  cluInternalPrintDeviceBitflags
 *  Description:  Internal CLU function used to print bitflags device information.
 *                The bitflags are printed as an OR ("|") of known bitflags enumerants
 *                taken from param->list (a struct list_bitfield table terminated by
 *                an entry whose name is 0). Bits that match no table entry are
 *                printed together as one hexadecimal number, e.g.
 *                "FLAG_A | FLAG_B | 0x100", or just "0x100" when no known flag is
 *                set.
 * =====================================================================================
 */
static void cluInternalPrintDeviceBitflags(cl_device_id id, struct device_param *param)
{
  int first = 1;
  cl_bitfield value;
  struct list_bitfield *flags;

  CLU_CHECK_ERROR ("clGetDeviceInfo", clGetDeviceInfo (id, param->name, sizeof(value), &value, NULL));

  flags = (struct list_bitfield *)param->list;

  while (flags->name) {
    if (value & flags->name) {
      if (first) {
	first = 0;
      } else {
	printf(" | ");
      }
      printf("%s", flags->name_string);
      /* clear the bits just reported so only unknown bits remain at the end */
      value &= ~flags->name;
    }
    flags++;
  }
  if (value) {
    /* The separator is needed only when a known flag was already printed.
     * (The test used to be inverted, producing a leading " | " or gluing the
     * hex value onto the previous flag name.) */
    if (!first) printf(" | ");
    printf("0x%llx", (unsigned long long)value);
  }

  printf("\n");
}

/* ===  FUNCTION  ======================================================================
 *         Name:  cluInternalPrintDeviceEnum
 *  Description:  Internal CLU function used to print cl_bitfield-sized enumerated
 *                device information. The queried value is looked up in param->list
 *                (a struct list_bitfield table terminated by an entry whose name is
 *                0). The matching entry's name_string is printed; when the search
 *                stops on an entry with a NULL name_string, "<UNKNOWN>" and the
 *                value in hexadecimal are printed instead. Because 0 terminates
 *                the table, a value of 0 can never match a named entry.
 * =====================================================================================
 */
static void cluInternalPrintDeviceEnum(cl_device_id id, struct device_param *param)
{
  cl_bitfield value;
  struct list_bitfield *enums;

  CLU_CHECK_ERROR ("clGetDeviceInfo", clGetDeviceInfo (id, param->name, sizeof(value), &value, NULL));

  enums = (struct list_bitfield *)param->list;

  while (enums->name) {
    if (value == enums->name) break;
    enums++;
  }
  if (enums->name_string) {
    printf("%s\n", enums->name_string);
  } else {
    /* %llx consumes an unsigned long long; cast to exactly that type */
    printf("<UNKNOWN> (0x%llx)\n", (unsigned long long)value);
  }
}

/* ===  FUNCTION  ======================================================================
 *         Name:  cluInternalPrintDeviceEnumUint
 *  Description:  Queries the cl_uint enumerated device attribute selected by
 *                param->name and prints its symbolic name. param->list is
 *                scanned as a struct list_uint table that ends at the first
 *                entry whose name is 0. The name_string of the entry where the
 *                scan stops is printed; if that name_string is NULL,
 *                "<UNKNOWN>" and the value in hexadecimal are printed.
 * =====================================================================================
 */
static void cluInternalPrintDeviceEnumUint(cl_device_id id, struct device_param *param)
{
  cl_uint queried;
  cl_int status;
  const struct list_uint *entry;

  status = clGetDeviceInfo (id, param->name, sizeof(queried), &queried, NULL);
  CLU_CHECK_ERROR ("clGetDeviceInfo", status);

  /* Stop on the first matching entry, or on the zero-valued terminator */
  for (entry = (const struct list_uint *)param->list; entry->name; entry++) {
    if (entry->name == queried) {
      break;
    }
  }

  if (entry->name_string == NULL) {
    printf("<UNKNOWN> (0x%x)\n", queried);
  } else {
    printf("%s\n", entry->name_string);
  }
}

/* 
 * =====================================================================================
 *                                CLU EXTERNAL FUNCTIONS
 * =====================================================================================
 */

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluInit
 *  Description:  cluInit selects a platform, creates a context covering all
 *                devices (CL_DEVICE_TYPE_ALL) of that platform, and creates the
 *                internal containers for programs, kernels, devices and command
 *                queues.
 *
 *  Parameters:
 *      platform: platform to use; when NULL the first platform reported by
 *                clGetPlatformIDs is used.
 *
 *  Returns    :  a clu structure. Allocation failures and a failed platform
 *                lookup terminate the process with EXIT_FAILURE.
 * =====================================================================================
 */
clu_t
cluInit (cl_platform_id platform)
{
  _clu_t *clu;
  /* Initialized because clGetPlatformIDs may leave it untouched on failure */
  cl_uint num_platforms = 0;
  cl_int err;
  cl_context_properties ctx_prop[3];

  clu = (_clu_t *) malloc (sizeof (_clu_t));
  if (!clu) {
    fprintf (stderr, "%s: cannot allocate memory for the clu_t structure, file=%s, line=%d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* If there is no provided platform, get the first platform available */
  if (platform == NULL) {
    err = clGetPlatformIDs (1, &clu->platform, &num_platforms);
    /* Test the status first: the count is only meaningful on success */
    if ((err != CL_SUCCESS) || (num_platforms == 0)) {
      fprintf (stderr, "clGetPLatformIDs failed: cannot find a valid platform. file %s, line %d, err=(%s)\n",
	       __FILE__, __LINE__, cluGetErrorString (err));
      exit (EXIT_FAILURE);
    }
  } else {
    clu->platform = platform;
  }

  /* Zero-terminated property list binding the context to the platform */
  ctx_prop[0] = CL_CONTEXT_PLATFORM;
  ctx_prop[1] = (cl_context_properties) (clu->platform);
  ctx_prop[2] = 0;

  clu->context = clCreateContextFromType (ctx_prop, CL_DEVICE_TYPE_ALL,
                      NULL, NULL, &err);
  CLU_CHECK_ERROR ("clCreateContextFromType", err);

  /*
   * Create all the containers
   */
  clu->programs = program_vector_create (INITIAL_VECTOR_ARRAY_SIZE);
  if (!clu->programs) {
    CLU_EXIT_ERROR ("Cannot create vector container for program objects.\n");
  }

  clu->kernels = kernel_hash_table_create ();
  if (!clu->kernels) {
    CLU_EXIT_ERROR ("Cannot create hash table to store kernel objects.\n");
  }

  clu->devices = device_hash_table_create ();
  if (!clu->devices) {
    CLU_EXIT_ERROR ("Cannot create hash table to store device objects.\n");
  }

  clu->cmd_queues = cmdq_vector_create (INITIAL_VECTOR_ARRAY_SIZE);
  if (!clu->cmd_queues) {
    fprintf (stderr, "ERROR: in func %s cannot create array to store command queue objects in file %s, line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  return (clu_t) clu;
}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluGetCLDeviceTypeString
 *  Description:  Maps a single device type constant to a static string:
 *                "DEFAULT", "ACCELERATOR", "CPU" or "GPU". Any other value
 *                (including combinations of types) yields "<UNKNOWN>".
 * =====================================================================================
 */
const char *cluGetCLDeviceTypeString (cl_device_type device_type)
{
  const char *label;

  switch (device_type) {
  case CL_DEVICE_TYPE_DEFAULT:
    label = "DEFAULT";
    break;
  case CL_DEVICE_TYPE_ACCELERATOR:
    label = "ACCELERATOR";
    break;
  case CL_DEVICE_TYPE_CPU:
    label = "CPU";
    break;
  case CL_DEVICE_TYPE_GPU:
    label = "GPU";
    break;
  default:
    label = "<UNKNOWN>";
    break;
  }
  return label;
}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluGetDeviceID
 *  Description:  returns the cl_device_id that matches the given device_type, 
 *                dev_vendor, and dev_name.
 *
 *                If dev_name or dev_vendor is NULL, then it is not used in the 
 *                search criteria.
 *
 *                The first device of type device_type whose name contains dev_name
 *                and whose vendor name contains dev_vendor is returned.
 *
 *                For ease of use, dev_name is matched case-insensitively. The
 *                dev_vendor match is case-sensitive.
 *
 *                The returned device is registered in the clu device table if it
 *                is not already there.
 *  Parameters:
 *      clu:         valid clu_t object
 *      device_type: OpenCL device type to enumerate
 *      dev_vendor:  substring of the vendor name to match, or NULL
 *      dev_name:    substring of the device name to match, or NULL
 *
 *  Returns:
 *                a valid device_id if found one, NULL otherwise
 * =====================================================================================
 */
cl_device_id cluGetDeviceID (clu_t clu, cl_device_type device_type, char *dev_vendor, char* dev_name)
{
  cl_uint num_devices;
  cl_device_id* devices;
  unsigned int i;
  cl_device_id ret_dev = NULL;
  clu_i_device_t device;

  if (clu == NULL) {
    CLU_EXIT_ERROR ("The input parameter clu is NULL.\n");
  }

  /* get the number of devices on the box */
  CLU_CHECK_ERROR ("clGetDeviceIDs", clGetDeviceIDs (clu->platform, device_type, 0, NULL, &num_devices));

  /*  If there is no device, then return NULL */
  if (num_devices < 1) {
    return ret_dev;
  }

  /*  malloc space for the devices */
  devices = (cl_device_id*)malloc (num_devices * sizeof (cl_device_id));
  if (devices == NULL) {
    CLU_EXIT_ERROR ("Unable to allocate memory for devices array.\n");
  }

  /* get the devices of type device_type */
  CLU_CHECK_ERROR ("clGetDeviceIDs", clGetDeviceIDs (clu->platform, device_type, num_devices, devices, NULL));

  for (i = 0; i < num_devices; i++) {
    int match = 1;
    
    if (dev_name) {
      char *name;
      name = cluInternalGetDeviceName (devices[i]);
      if (strcasestr (name, dev_name) == NULL) match = 0;
      free(name);
    }

    if (dev_vendor) {
      char *name;
      name = cluInternalGetDeviceVendorName (devices[i]);
      if (strstr (name, dev_vendor) == NULL) match = 0;
      free(name);
    }

    if (match) {
      ret_dev = devices[i];
      break;
    }
  }

  /* The id has been copied into ret_dev; the enumeration buffer is no longer
   * needed and was previously leaked on every call. */
  free (devices);

  /* Register the device only once, the same way cluGetDeviceName does, so
   * repeated lookups do not pile up duplicate table entries. */
  if (ret_dev && (device_hash_table_search (clu->devices, ret_dev) == NULL)) {
    device = (clu_i_device_t) malloc (sizeof (_clu_i_device_t));
    if (!device) {
      CLU_EXIT_ERROR ("Cannot allocate space for clu_i_device_t structure.");
    }

    device->device_id = ret_dev;
    device->device_name = cluInternalGetDeviceName (ret_dev); 

    device_hash_table_add (clu->devices, device, ret_dev);
  }
  return ret_dev;
}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluGetDeviceName
 *  Description:  Returns the name string stored for device_id in the clu device
 *                table. If the device has no entry yet, one is created: the
 *                name is obtained from cluInternalGetDeviceName and the entry
 *                is added to the table before the name is returned. A NULL clu
 *                terminates the process; a NULL device_id is reported through
 *                CLU_EXIT_ERROR.
 * =====================================================================================
 */
const char * cluGetDeviceName (clu_t clu, cl_device_id device_id)
{
  clu_i_device_t entry;

  if (clu == NULL) {
    fprintf (stderr, "ERROR: %s, the first argument clu_t is NULL in file %s at line %d\n",
             __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  if (device_id == NULL) {
    CLU_EXIT_ERROR ("The input cl_device_id is NULL.");
  }

  /* Fast path: the device is already known */
  entry = device_hash_table_search (clu->devices, device_id);
  if (entry) {
    return ((const char *)entry->device_name);
  }

  /* Unknown device: build a table entry for it */
  entry = (clu_i_device_t) malloc (sizeof (_clu_i_device_t));
  if (!entry) {
    CLU_EXIT_ERROR ("Cannot allocate space for clu_i_device_t structure.");
  }
  entry->device_id = device_id;
  entry->device_name = cluInternalGetDeviceName (device_id);
  device_hash_table_add (clu->devices, entry, device_id);

  return ((const char *)entry->device_name);
}


/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluCreateCmdQueue
 *  Description:  Creates a command queue on the clu context, records it in the
 *                clu command-queue vector and returns it. The queue is created
 *                either for the given device or, when no device is given, for
 *                the first device of the requested type on the clu platform.
 *
 *  Parameters:
 *      clu:         valid clu_t object; NULL terminates the process
 *      dev_id:      if not 0, this device is used and device_type is ignored
 *      device_type: used only when dev_id is 0; must be one of
 *                   CL_DEVICE_TYPE_DEFAULT, CL_DEVICE_TYPE_GPU,
 *                   CL_DEVICE_TYPE_CPU or CL_DEVICE_TYPE_ACCELERATOR, otherwise
 *                   the process is terminated
 *      properties:  specifies a list of properties for the command queue. see
 *                   OpenCL spec
 *
 *  Returns    :  a cl_command_queue structure
 * =====================================================================================
 */
cl_command_queue
cluCreateCmdQueue (clu_t clu, cl_device_id dev_id, cl_device_type device_type, cl_command_queue_properties properties)
{
  cl_int status;
  cl_command_queue queue;
  cl_device_id target = dev_id;

  if (clu == NULL) {
    fprintf (stderr, "ERROR: %s, the first argument clu_t is NULL in file %s at line %d\n",
             __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  if (target == 0) {
    /* No explicit device: pick the first one of the requested type */
    int type_is_valid = (device_type == CL_DEVICE_TYPE_DEFAULT) ||
                        (device_type == CL_DEVICE_TYPE_GPU) ||
                        (device_type == CL_DEVICE_TYPE_CPU) ||
                        (device_type == CL_DEVICE_TYPE_ACCELERATOR);

    if (!type_is_valid) {
      fprintf (stderr, "ERROR in clu function  %s, input device_type %s is invalid  %s at line %d\n",
               __PRETTY_FUNCTION__, cluGetCLDeviceTypeString (device_type), __FILE__, __LINE__);
      exit (EXIT_FAILURE);
    }

    status = clGetDeviceIDs (clu->platform, device_type, 1, &target, NULL);
    CLU_CHECK_ERROR ("clGetDeviceIDs", status);
  }

  /* Create the queue and remember it in the clu object */
  queue = clCreateCommandQueue (clu->context, target, properties, &status);
  CLU_CHECK_ERROR ("clCreateCommandQueue", status);

  cmdq_vector_add (clu->cmd_queues, queue);

  return queue;
}


/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluCreateKernel
 *  Description:  creates an OpenCL program from either source or binary that's associated
 *                with the device of the given command queue, builds the program with
 *                build_options, then creates the kernel named kernel_name, records it
 *                in the clu kernel table and returns the cl_kernel handle.
 *
 *                A program already recorded in the clu object for the same file,
 *                device and build options is reused instead of being rebuilt.
 *                Every failure path visible in this function terminates the
 *                process with EXIT_FAILURE.
 *
 *                The filename, kernel_name and build_options pointers are stored
 *                in the internal program/kernel records as-is (not copied).
 *  Parameters: 
 *      clu:        valid clu_t object
 *      cmd_queue:      the cl_command_queue object we want to run the kernel on.  
 *      filename:   filename for either the source or binary file
 *      kernel_name:    kernel_name
 *      build_options:  build_options (may be NULL)
 *      flag:       bitfield with the following options
 *              CLU_SOURCE: build from the source file. If there's a cached version of
 *              the program available, it will be used to create the kernel, otherwise,
 *              CLU will rebuild the program from the source. 
 *                          
 *              CLU_BINARY: build the program from the binary. The name of the binary
 *              file is given  by filename. 
 *
 *              CLU_NO_CACHE: build the program from the source file regardless
 *              if there's a cached version of the binary available. The resulting
 *              binary will not be cached either.   
 *
 *  Returns: the cl_kernel handle of the newly created kernel
 * =====================================================================================
 */
cl_kernel
cluCreateKernel (clu_t clu, cl_command_queue cmd_queue, const char *filename,
		 const char *kernel_name, const char *build_options, clu_create_program_flag_t flag)
{
  cl_program program = NULL;
  struct _clu_t *clu_platform;
  cl_int err;
  int i;
  /* build_from_source: set when the program must be compiled from filename */
  cl_bool build_from_source = CL_FALSE;
  /* save_binary: cleared when caching is disabled or an existing program is reused */
  cl_bool save_binary = CL_TRUE;
  char *program_source;
  clu_i_program_t clu_program = NULL;
  clu_i_kernel_t clu_kernel;
  cl_device_id device_id;

  /* All the error checking for input parameters */
  if (clu == NULL) {
    fprintf (stderr, "ERROR: function %s has been invoked with clu object = NULL in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  clu_platform = (struct _clu_t *) clu;

  if (cmd_queue == NULL) {
    fprintf (stderr, "ERROR: function %s has been invoked with cl_command_queue object = NULL in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  if (kernel_name == NULL) {
    fprintf (stderr, "ERROR: function %s has been called with kernel_name = NULL in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  if (filename == NULL) {
    fprintf (stderr, "ERROR: function %s has been called with filename = NULL in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }
  /* query the device id of the command queue */
  err = clGetCommandQueueInfo (cmd_queue, CL_QUEUE_DEVICE, sizeof (cl_device_id), &device_id, NULL);
  CLU_CHECK_ERROR ("clGetCommandQueueInfo", err);

  /*-----------------------------------------------------------------------------
   *  before creating another program, search for an existing program with the 
   *  the same source file and build options 
   *-----------------------------------------------------------------------------
   */
  for (i = 0; i < program_vector_get_size (clu_platform->programs); i++) {
    clu_i_program_t p = program_vector_get_element (clu_platform->programs, i);

    /* If there's a source file */
    /* NOTE(review): records created in the CLU_BINARY branch below have
     * source_filename == NULL, so this test skips them and they appear never
     * to be reused -- confirm whether that is intended. */
    if (p->source_filename != NULL) {
      /* This condition is logically equivalent to (flag & CLU_SOURCE) */
      if (((flag & CLU_NO_CACHE) && (flag & CLU_SOURCE)) || (flag & CLU_SOURCE)) {
        if ((strcmp (p->source_filename, filename) == 0) && (p->device == device_id)) {
          /* build options match when both are NULL or both are equal strings */
          if ((((build_options != NULL) && (p->build_options != NULL)) && (strcmp (p->build_options, build_options) == 0)) || 
	      ((build_options == NULL) && (p->build_options == NULL))) {
            clu_program = p;
            program = clu_program->program;

            save_binary = CL_FALSE;
            break;
          }
        }
      } else {
        if (p->binary_filename != NULL) {
          if ((strcmp (p->binary_filename, filename) == 0) && (p->device == device_id)) {
            if ((((build_options != NULL) && (p->build_options != NULL)) && (strcmp (p->build_options, build_options) == 0)) || 
		((build_options == NULL) && (p->build_options == NULL))) {
              clu_program = p;
              program = clu_program->program;
              save_binary = CL_FALSE;
              break;
            }
          }
        }/* if the stored binary_filename is not NULL */
      }
    } /* end if there's a source file */
  }

  /*-----------------------------------------------------------------------------
   *  If there's no existing clu_program, let's create one
   *-----------------------------------------------------------------------------*/
  if (clu_program == NULL) {
    /* Detect whether we need to cache the binary once we finish building the program */
    /* NOTE(review): no_cache already starts as CL_TRUE, so the environment
     * variable check below cannot change it; and on __powerpc__ with an "IBM"
     * vendor it is reset to CL_FALSE afterwards regardless of the variable.
     * Confirm the intended precedence. */
    cl_bool no_cache = CL_TRUE;
    char *no_cache_env = getenv ("IBM_OPENCL_CLU_NO_CACHE");
    if (no_cache_env != NULL) {
      no_cache = CL_TRUE;
    }

#ifdef __powerpc__
    {
      /* get the vendor name, we only cache for IBM Power */
      char *vendor_name = cluInternalGetVendorName (clu);
    
      if (strcmp (vendor_name, "IBM") == 0) {
        no_cache = CL_FALSE;
      }
    
      free (vendor_name);
    }
#endif

    /* If user specifies no caching, or if we're not on IBM platform, then
     * don't cache the binary */
    if (((flag & CLU_NO_CACHE) || no_cache) && (flag & CLU_SOURCE)) {
      build_from_source = CL_TRUE;
      save_binary = CL_FALSE;
    } else if (flag & CLU_SOURCE)
    /*  If we are building from source */
     {
      struct stat source_last_modified;
      char *gen_name;
      char *built_binary_name;

      /* check to see if there's already a built binary for this. First
       * find the generated name for this  binary 
       */
      /* NOTE(review): gen_name is not freed on this path, while the save path
       * further below frees the name it generates -- possible leak; confirm
       * whether cluInternalFindBinary keeps the pointer. */
      gen_name = cluInternalGenerateBinaryFilename (device_id, build_options, filename);
      if (stat (filename, &source_last_modified) == -1) {
	fprintf (stderr, "ERROR: error looking at the source file %s in function %s, in file %s, at line %d, errno=%d, %s\n",
		 filename, __PRETTY_FUNCTION__, __FILE__, __LINE__, errno, strerror (errno));
	exit (EXIT_FAILURE);
      }

      built_binary_name = cluInternalFindBinary (gen_name, (long long) source_last_modified.st_mtime);

      /*  If we can find an existing binary for this source file with this build options that's newer than
       *  the source, just load it and create an OpenCL program with it  
       */
      if (built_binary_name != NULL) {
	size_t binary_size;
	unsigned char *program_binary;

	program_binary = cluInternalLoadProgramBinary (built_binary_name, &binary_size);
    
	/*  if we cannot load the binary for some reason then just build from source */
	if (program_binary == NULL) {
	  build_from_source = CL_TRUE;
	} else {
	  /*  otherwise we create the program from the binary */
	  program = clCreateProgramWithBinary (clu->context, 1, &device_id,
					       (size_t *) & binary_size,
					       (const unsigned char **)
					       &program_binary, NULL, &err);
	  if (err != CL_SUCCESS) {
	    build_from_source = CL_TRUE;
	  }
	  /* NOTE(review): the record is allocated even when the call above
	   * failed; in that case the build-from-source block below assigns a
	   * new record to clu_program and this one is not freed. */
	  clu_program = (clu_i_program_t) malloc (sizeof (_clu_i_program_t));
	  if (!clu_program) {
	    fprintf (stderr, "ERROR: in CLU function %s cannot allocate memory for clu_program in file %s, at line %d\n",
		     __PRETTY_FUNCTION__, __FILE__, __LINE__);
	    exit (EXIT_FAILURE);
	  }
	  clu_program->built_from_source = CL_FALSE;
	  clu_program->binary_filename = built_binary_name;
	  clu_program->source_filename = filename;
	  clu_program->program = program;
	  free (program_binary);
	}
      } else {
	/* cannot find a built binary */
	build_from_source = CL_TRUE;
      }
    } else if (flag & CLU_BINARY) {
      /*  If we're building from Binary */
      size_t binary_size;
      unsigned char *program_binary = cluInternalLoadProgramBinary (filename, &binary_size);

      if (program_binary == NULL) {
	fprintf (stderr, "Error: failed to load binary file %s, file %s, line %d\n",
		 filename, __FILE__, __LINE__);
	exit (EXIT_FAILURE);
      }

      /* create program from the input binary file */
      program = clCreateProgramWithBinary (clu->context, 1, &device_id,
					   (size_t *) & binary_size,
					   (const unsigned char **) &program_binary,
					   NULL, &err);
      
      if (err != CL_SUCCESS) {
	fprintf (stderr, "Error: failed clCreateProgramWithBinary, file %s, line %d, error=%s\n",
		 __FILE__, __LINE__, cluGetErrorString (err));
	exit (EXIT_FAILURE);
      }
      clu_program = (clu_i_program_t) malloc (sizeof (_clu_i_program_t));
      if (!clu_program) {
	fprintf (stderr, "ERROR: in CLU function %s cannot allocate memory for clu_program in file %s, at line %d\n",
		 __PRETTY_FUNCTION__, __FILE__, __LINE__);
	exit (EXIT_FAILURE);
      }

      clu_program->built_from_source = CL_FALSE;
      clu_program->binary_filename = filename;
      clu_program->source_filename = NULL;
      clu_program->program = program;

      free (program_binary);
    } else {
      /* If the flag is something else then it's not a valid flag, print error and exit  */
      fprintf (stderr, "Error: function %s has invalid input for cl_create_program_flag_t in file %s, at line %d\n",
	       __PRETTY_FUNCTION__, __FILE__, __LINE__);
      exit (EXIT_FAILURE);
    }

    /* If we're really building from source */
    if (build_from_source) {
      /*  Load the program from the source file */
      program_source = cluInternalLoadProgramSource (filename);

      if (program_source == NULL) {
	fprintf (stderr, "Error: Failed to load program source from file %s, file %s, line %d\n",
		 filename, __FILE__, __LINE__);
	exit (EXIT_FAILURE);
      }

      /* create program */
      program = clCreateProgramWithSource (clu->context, 1, (const char **) &program_source, NULL, &err);

      if (err != CL_SUCCESS) {
	fprintf(stderr, "clCreateProgramWithSource failed: prg %s file %s line %d (%s)\n",
		filename, __FILE__, __LINE__, cluGetErrorString (err));
	exit (EXIT_FAILURE);
      }
      clu_program = (clu_i_program_t) malloc (sizeof (_clu_i_program_t));
      if (!clu_program) {
	fprintf (stderr, "ERROR: in CLU function %s cannot allocate memory for clu_program in file %s, at line %d\n",
		 __PRETTY_FUNCTION__, __FILE__, __LINE__);
	exit (EXIT_FAILURE);
      }

      clu_program->built_from_source = CL_TRUE;
      clu_program->binary_filename = NULL;
      clu_program->source_filename = filename;
      clu_program->program = program;

      free (program_source);
    }

    /*-----------------------------------------------------------------------------
     *  Compile the newly created program for the given device with the input 
     *  build options
     *-----------------------------------------------------------------------------*/

    err = clBuildProgram (program, 1, &device_id, build_options, NULL, NULL);

    /* If the build process returns an error then print the build log and exit */
    if (err != CL_SUCCESS) {
      size_t len;
      char *buffer;
      printf ("clBuildProgram failed: (%s)\n", cluGetErrorString (err));

      /* first ask for the length of the build log, then fetch it */
      err = clGetProgramBuildInfo (program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
      if (err != CL_SUCCESS) {
        fprintf(stderr, "clGetProgramBuildInfo get log length failed: prg %s file %s line %d (%s)\n",
		filename, __FILE__, __LINE__, cluGetErrorString (err));
        exit (EXIT_FAILURE);
      }

      buffer = (char *) malloc (len);
      if (!buffer) {
        fprintf (stderr, "cannot allocate memory for build log. file %s, line %d\n", __FILE__, __LINE__);
        exit (EXIT_FAILURE);
      }

      /* the status of this second query is not checked; the log goes to stdout */
      clGetProgramBuildInfo (program, device_id, CL_PROGRAM_BUILD_LOG, len, buffer, NULL);
      printf ("%s\n", buffer);
      exit (EXIT_FAILURE);
    }

    /* the build_options pointer itself is stored, not a copy of the string */
    clu_program->build_options = build_options;
    clu_program->device = device_id;

    /* If we are caching the binary for later use */
    if ((build_from_source) && (save_binary)) {
      size_t suffix_len = strlen(BINARY_SUFFIX);
      char *gen_name = cluInternalGenerateBinaryFilename (device_id, build_options, filename);
      /* room for the generated name, the suffix and the terminating NUL */
      char *bin_name = (char *) malloc (strlen (gen_name) + suffix_len + 1);

      if (!bin_name) {
	fprintf (stderr, "ERROR in function %s, failed to allocate memory for storing binary name bin_name, file=%s, line=%d\n", 
		 __PRETTY_FUNCTION__, __FILE__, __LINE__);
	exit (EXIT_FAILURE);
      }

      sprintf (bin_name, "%s%s", gen_name, BINARY_SUFFIX);

      cluInternalSaveProgramBinary (program, device_id, bin_name);

      free (gen_name);
      free (bin_name);
    }
    /* remember the program so later calls with the same inputs can reuse it */
    program_vector_add (clu_platform->programs, clu_program);
  }

  clu_kernel = (_clu_i_kernel_t *) malloc (sizeof (_clu_i_kernel_t));
  if (clu_kernel == NULL){
    fprintf (stderr, "ERROR in func %s, cannot allocate memory for clu_i_kernel_t object in file %s, line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /*  We are now creating the kernel */
  /* program is either the reused one found by the search loop or the one built above */
  clu_kernel->kernel = clCreateKernel (program, kernel_name, &err);
  if (err != CL_SUCCESS) {
    fprintf (stderr, "Error: in func %s, clCreateKernel failed on kernel %s, file %s, line %d, errString %s\n",
	     __PRETTY_FUNCTION__, kernel_name, __FILE__, __LINE__,
	     cluGetErrorString (err));
    exit (EXIT_FAILURE);
  }

  /*  Initialize the clu_kernel structure */
  /* a new kernel starts as a task (CLU_TASK) with no NDRange parameters set */
  clu_kernel->clu = (clu_t) clu_platform;
  clu_kernel->device_id = device_id;
  clu_kernel->name = kernel_name;
  clu_kernel->cmd_queue = cmd_queue;
  clu_kernel->work_dim = 0;
  clu_kernel->global_work_offset = NULL;
  clu_kernel->global_work_size = NULL;
  clu_kernel->local_work_size = NULL;
  clu_kernel->num_events_in_wait_list = 0;
  clu_kernel->event_wait_list = NULL;
  clu_kernel->type = CLU_TASK;
  clu_kernel->created_by_clu = CL_TRUE;
  clu_kernel->profiling_on = CL_FALSE;
  err = clGetKernelInfo(clu_kernel->kernel, CL_KERNEL_NUM_ARGS, sizeof(cl_uint), &clu_kernel->num_args, NULL);
  CLU_CHECK_ERROR ("clGetKernelInfo CL_KERNEL_NUM_ARGS", err);


  /*  Add the value/key pair clu_kernel/cl_kernel structure into the hash table */
  kernel_hash_table_add (clu_platform->kernels, clu_kernel, clu_kernel->kernel);

  return clu_kernel->kernel;
}


/*
 * ===  FUNCTION  ======================================================================
 *         Name:  clu_i_copy_work_array
 *  Description:
 *            Returns a freshly malloc'ed copy of the array_size bytes at src, or
 *            NULL when src is NULL.  Prints an error naming array_name and
 *            func_name and exits if the allocation fails.
 * =====================================================================================
 */
static size_t *
clu_i_copy_work_array (const size_t * src, size_t array_size,
		       const char *array_name, const char *func_name)
{
  size_t *copy;

  if (src == NULL) {
    return NULL;
  }

  copy = (size_t *) malloc (array_size);
  if (copy == NULL) {
    fprintf (stderr, "ERROR in func %s, unable to allocate %s array of size %zu in file %s, at line %d\n",
	     func_name, array_name, array_size, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }
  (void) memcpy (copy, src, array_size);
  return copy;
}

/*
 * ===  FUNCTION  ======================================================================
 *         Name:  cluSetKernelNDRange
 *  Description:
 *            Marks the kernel as an NDRange kernel (CLU_NDRANGE) and caches private
 *            copies of the NDRange parameters in the CLU kernel record.  Copies
 *            cached by a previous call are freed and replaced.
 *
 *  Parameters:
 *      clu: the clu object; must not be NULL
 *      kernel: a cl_kernel already registered with clu; must not be NULL
 *      work_dim: number of entries in each array; must not be 0
 *      global_work_offset: work_dim entries, or NULL (NULL is cached as NULL)
 *      global_work_size: work_dim entries; must not be NULL
 *      local_work_size: work_dim entries, or NULL (NULL is cached as NULL)
 *
 *  Return:
 *      void.  Any invalid argument or allocation failure prints a message to
 *      stderr and terminates the process with EXIT_FAILURE.
 * =====================================================================================
 */
void
cluSetKernelNDRange (clu_t clu, cl_kernel kernel, cl_uint work_dim,
		     const size_t * global_work_offset,
		     const size_t * global_work_size,
		     const size_t * local_work_size)
{
  size_t array_size;
  clu_i_kernel_t clu_kernel;

  if (clu == NULL) {
    fprintf (stderr, "ERROR: function %s has been invoked with clu object = NULL in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  if (kernel == NULL) {
    fprintf (stderr, "ERROR in func %s, parameter kernel is not initialized in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }
  clu_kernel = kernel_hash_table_search (clu->kernels, kernel);

  if (!clu_kernel) {
    fprintf (stderr, "ERROR in func %s, the given kernel was not created by cluCreateKernel or does not have a command queue associated with it in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  if (work_dim == 0) {
    fprintf (stderr, "ERROR in func %s, 0 work_dim encounterd in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* global_work_size is copied unconditionally below, so a NULL pointer must be
   * rejected here instead of being handed to memcpy. */
  if (global_work_size == NULL) {
    fprintf (stderr, "ERROR in func %s, parameter global_work_size is NULL in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* All inputs are valid: only now touch the cached state. */
  clu_kernel->work_dim = work_dim;
  clu_kernel->type = CLU_NDRANGE;

  /* Drop the copies cached by an earlier call; free(NULL) is a no-op. */
  free ((void *) clu_kernel->global_work_offset);
  free ((void *) clu_kernel->global_work_size);
  free ((void *) clu_kernel->local_work_size);

  array_size = work_dim * sizeof (size_t);

  clu_kernel->global_work_offset =
    clu_i_copy_work_array (global_work_offset, array_size, "global_work_offset", __PRETTY_FUNCTION__);
  clu_kernel->global_work_size =
    clu_i_copy_work_array (global_work_size, array_size, "global_work_size", __PRETTY_FUNCTION__);
  clu_kernel->local_work_size =
    clu_i_copy_work_array (local_work_size, array_size, "local_work_size", __PRETTY_FUNCTION__);
}


/*
 * ===  FUNCTION  ======================================================================
 *         Name:  cluSetKernelDependency
 *  Description:  records the event wait list for this kernel in its CLU kernel record
 *
 *  Parameters:
 *      clu: the clu object; must not be NULL
 *      kernel: a cl_kernel already registered with clu; must not be NULL
 *      num_events_in_wait_list: number of events in event_wait_list
 *      event_wait_list: the events; must be NULL exactly when
 *                       num_events_in_wait_list is 0.  The pointer itself is
 *                       stored, the array is not copied.
 *
 *  Return:
 *      void.  Any invalid argument prints a message to stderr and terminates
 *      the process with EXIT_FAILURE.
 * =====================================================================================
 */
void
cluSetKernelDependency (clu_t clu, cl_kernel kernel, cl_uint num_events_in_wait_list, const cl_event * event_wait_list)
{
  clu_i_kernel_t record;
  int count_is_zero;
  int list_is_null;

  if (!clu) {
    fprintf (stderr, "ERROR: function %s has been invoked with clu object = NULL in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* A zero count must come with a NULL list and a non-zero count with a real one. */
  count_is_zero = (num_events_in_wait_list == 0);
  list_is_null = (event_wait_list == NULL);
  if (count_is_zero != list_is_null) {
    fprintf (stderr, "ERROR in func %s, event_wait_list is incorrectly specified in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  if (!kernel) {
    fprintf (stderr, "ERROR in func %s, parameter kernel is not initialized in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  record = kernel_hash_table_search (clu->kernels, kernel);
  if (!record) {
    fprintf (stderr, "ERROR in func %s, the given cl_kernel was not created by cluCreateKernel or does not have a command queue associated with it  in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  record->event_wait_list = event_wait_list;
  record->num_events_in_wait_list = num_events_in_wait_list;
}

/*
 * ===  FUNCTION  ======================================================================
 *         Name:  cluEnableKernelProfiling
 *  Description:  enables profiling for the specified kernel. This function must be called
 *            prior to running cluRunKernel.
 *
 *            There must also be a command queue associated with the kernel, either
 *            through creating the kernel via the clu interface cluCreateKernel or
 *            through cluSetKernelCmdQueue. The command queue must have been created
 *            with CL_QUEUE_PROFILING_ENABLE.
 *
 *            Calling this function while profiling is already enabled only
 *            re-validates the command queue; the events recorded so far are kept.
 *
 *  Parameters:
 *
 *      clu: specifies a valid clu_t object
 *      kernel: specifies a valid cl_kernel object. This kernel must have a
 *              cl_command_queue associated with it.
 *
 *  Return:
 *      void.  Any invalid argument, a queue without profiling enabled, or an
 *      allocation failure prints a message to stderr and terminates the process
 *      with EXIT_FAILURE.
 * =====================================================================================
 */
void
cluEnableKernelProfiling (clu_t clu, cl_kernel kernel)
{
  clu_i_kernel_t clu_kernel;
  cl_int err;
  cl_command_queue_properties properties;

  if (clu == NULL) {
    fprintf (stderr, "ERROR: function %s has been invoked with clu object = NULL in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  if (kernel == NULL) {
    fprintf (stderr, "ERROR in func %s, parameter kernel is not initialized in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }
  clu_kernel = kernel_hash_table_search (clu->kernels, kernel);

  if (clu_kernel == NULL) {
    fprintf (stderr, "ERROR in func %s, the given cl_kernel was not created by cluCreateKernel or does not have a command queue associated with it in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* Make sure the command queue has profiling enabled before allocating
   * anything; otherwise exit with an error message.
   */
  err = clGetCommandQueueInfo (clu_kernel->cmd_queue, CL_QUEUE_PROPERTIES,
			       sizeof (cl_command_queue_properties), &properties, NULL);
  CLU_CHECK_ERROR ("clGetCommandQueueInfo CL_QUEUE_PROPERTIES", err);

  if (!(properties & CL_QUEUE_PROFILING_ENABLE)) {
    fprintf (stderr, "CLU Error in func %s, the provided command queue does not have profiling enabled in file %s, line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* Already enabled: keep the existing event vector.  Creating a second one
   * here would overwrite the pointer to the first and lose the events that
   * cluRunKernel has stored in it.
   */
  if (clu_kernel->profiling_on == CL_TRUE) {
    return;
  }

  /* Vector that cluRunKernel fills with one event per launch of this kernel.
   * NOTE(review): a vector left behind by an earlier enable/disable cycle is
   * not freed here, because nothing visible in this function distinguishes it
   * from a never-assigned pointer -- confirm where such vectors are freed.
   */
  clu_kernel->events = event_vector_create (20);
  if (clu_kernel->events == NULL) {
    fprintf (stderr, "Error: in func %s, cannot create event vector for clu kernel in file %s, line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  clu_kernel->profiling_on = CL_TRUE;
}


/*
 * ===  FUNCTION  ======================================================================
 *         Name:  cluGetKernelExecTime
 *  Description: If the input clu_profiling_flag_t flag is CLU_PROFILING_ELAPSED_TIME
 *               then this function returns the total time elapsed between the time when
 *               the first executing instance of the kernel starts to run and the time
 *               when the last executing instance of the kernel finishes running.
 *
 *               If the input clu_profiling_flag_t flag is CLU_PROFILING_ACCUM_TIME
 *               then this function returns the accumulated execution time over all
 *               executing instances of the kernel, where the execution time of each
 *               instance is the difference between the time when that instance
 *               starts and the time when it ends.  Note that executing instances
 *               may run in parallel, so the accumulated time may differ from the
 *               total elapsed time.
 *
 *               All outstanding executing instances of the kernel must be complete for
 *               this function to work properly; an incomplete instance is reported
 *               as an error and the process exits.
 *
 *               There must be a command queue associated with the kernel either through
 *               the cluCreateKernel API or through the cluSetKernelCmdQueue API.
 *
 *               Function cluEnableKernelProfiling must have been invoked previously and
 *               the command queue associated with the kernel must have the
 *               CL_QUEUE_PROFILING_ENABLE property.  If profiling is not currently
 *               enabled for the kernel, 0 is returned.
 *
 *               This function releases CLU's hold on the events for this kernel on exit.
 *               The next invocation of this function will return the profiling
 *               information for events created since the last invocation only.
 *
 *  Parameters:
 *      clu: specifies a valid clu_t object
 *      kernel: specifies a valid cl_kernel object. This kernel must be one that was
 *              executed via the cluRunKernel call.
 *      flag:   CLU_PROFILING_ELAPSED_TIME
 *              CLU_PROFILING_ACCUM_TIME
 *              (the flag is only examined while iterating over recorded events)
 *
 *  Return:
 *      number of seconds the kernel spent executing as described above;
 *      0 when no events have been recorded since the last call
 * =====================================================================================
 */
float
cluGetKernelExecTime (clu_t clu, cl_kernel kernel, clu_profiling_flag_t flag)
{
  clu_i_kernel_t clu_kernel;
  clu_i_event_vector_t *events;
  cl_command_queue_properties properties;
  cl_event event;
  cl_int err;
  cl_ulong time_start;
  cl_ulong time_done;
  cl_ulong total_time = 0;
  cl_ulong min_time_start = (cl_ulong)(-1);	/* largest cl_ulong value */
  cl_ulong max_time_done = 0;
  float num_seconds;
  int i;

  if (clu == NULL) {
    fprintf (stderr, "ERROR: function %s has been invoked with clu object = NULL in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  if (kernel == NULL) {
    fprintf (stderr, "ERROR in func %s, parameter kernel is not initialized in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }
  clu_kernel = kernel_hash_table_search (clu->kernels, kernel);

  if (clu_kernel == NULL) {
    fprintf (stderr, "ERROR in func %s, the given cl_kernel was not created by cluCreateKernel or does not have a command queue associated with it in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* make sure the command queue has profiling enabled, otherwise exit with an
   * error message
   */
  err = clGetCommandQueueInfo (clu_kernel->cmd_queue, CL_QUEUE_PROPERTIES,
			       sizeof (cl_command_queue_properties), &properties, NULL);
  CLU_CHECK_ERROR ("clGetCommandQueueInfo CL_QUEUE_PROPERTIES", err);

  if (!(properties & CL_QUEUE_PROFILING_ENABLE)) {
    fprintf (stderr, "CLU Error in func %s, the provided command queue does not have profiling enabled in file %s, line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  if (clu_kernel->profiling_on != CL_TRUE) {
    return 0.0f;
  }

  /*-----------------------------------------------------------------------------
   *  Loop through the list of events for this particular kernel to determine
   *  the profiling information.
   *-----------------------------------------------------------------------------
   */
  events = clu_kernel->events;
  for (i = 0; i < event_vector_get_size (events); i++) {
    cl_int exec_status;
    event = event_vector_get_element (events, i);

    /* make sure the event is complete, if not, print error and exit  */
    err = clGetEventInfo (event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (cl_int), &exec_status, NULL);
    CLU_CHECK_ERROR ("clGetEventInfo CL_EVENT_COMMAND_EXECUTION_STATUS", err);
    if (exec_status != CL_COMPLETE) {
      fprintf (stderr, "ERROR in func %s, file %s, line %d, one of the executing instance of the given kernel has not been completed\n",
	       __PRETTY_FUNCTION__, __FILE__, __LINE__);
      exit (EXIT_FAILURE);
    }

    err = clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &time_start, NULL);
    CLU_CHECK_ERROR ("clGetEventProfilingInfo CL_PROFILING_COMMAND_START", err);

    err = clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &time_done, NULL);
    CLU_CHECK_ERROR ("clGetEventProfilingInfo CL_PROFILING_COMMAND_END", err);

    if (flag == CLU_PROFILING_ELAPSED_TIME) {
      /* Keep the full 64-bit nanosecond counters; narrowing them to int
       * would truncate the timestamps. */
      if (time_start < min_time_start) {
        min_time_start = time_start;
      }
      if (time_done > max_time_done) {
        max_time_done = time_done;
      }
    } else if (flag == CLU_PROFILING_ACCUM_TIME) {
      total_time = total_time + (time_done - time_start);
    } else {
      fprintf (stderr, "ERROR in func %s, file %s, line %d, the input parameter flag is invalid\n",
	       __PRETTY_FUNCTION__, __FILE__, __LINE__);
      exit (EXIT_FAILURE);
    }
  }

  if (flag == CLU_PROFILING_ELAPSED_TIME) {
    /* With no recorded events min_time_start is still the maximum value and
     * max_time_done is still 0; report 0 instead of letting the unsigned
     * subtraction wrap around. */
    total_time = (max_time_done > min_time_start) ? (max_time_done - min_time_start) : 0;
  }

  /* convert nanoseconds to seconds; scale in double so that precision is only
   * lost once, in the final conversion to float */
  num_seconds = (float) ((double) total_time * 1e-9);

  /* release all events */
  event_vector_release_elements (events);
  return (num_seconds);
}

/*
 * ===  FUNCTION  ======================================================================
 *         Name:  cluDisableKernelProfiling
 *  Description:  disables profiling for the specified kernel. Once this function is
 *                invoked, further invocations of cluGetKernelExecTime will return 0.
 *                Events recorded while profiling was enabled are released.
 *                Calling this function for a kernel whose profiling is not
 *                enabled has no effect.
 *
 *  Parameters:
 *
 *   clu: specifies a valid clu_t object
 *   kernel: specifies a valid cl_kernel object.
 *
 *  Return:
 *      void.  Any invalid argument prints a message to stderr and terminates
 *      the process with EXIT_FAILURE.
 * =====================================================================================
 */
void cluDisableKernelProfiling (clu_t clu, cl_kernel kernel)
{
  clu_i_kernel_t clu_kernel;

  if (clu == NULL) {
    fprintf (stderr, "ERROR: function %s has been invoked with clu object = NULL in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  if (kernel == NULL) {
    fprintf (stderr, "ERROR in func %s, parameter kernel is not initialized in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }
  clu_kernel = kernel_hash_table_search (clu->kernels, kernel);

  if (clu_kernel == NULL) {
    fprintf (stderr, "ERROR in func %s, the given cl_kernel was not created by cluCreateKernel or does not have a command queue associated with it in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* The events vector is only assigned by cluEnableKernelProfiling, so it is
   * known to be valid only while profiling_on is CL_TRUE.  Touching it for a
   * kernel that never had profiling enabled would read an unset pointer.
   */
  if (clu_kernel->profiling_on == CL_TRUE) {
    clu_kernel->profiling_on = CL_FALSE;

    /* release all events recorded while profiling was on */
    event_vector_release_elements (clu_kernel->events);
  }
}

/*
 * ===  FUNCTION  ======================================================================
 *         Name:  cluSetKernelCmdQueue
 *  Description:  This function associates the given kernel with a command queue for
 *                execution.  This function is needed if the kernel
 *                was created outside of CLU.  It can also be used to change the
 *                command queue of a kernel that CLU already knows about.
 *
 *                If the kernel's program has not been built successfully for the
 *                command queue's device, CLU prints an error message and exits.
 *
 *                A kernel that is not yet known to CLU is registered with
 *                created_by_clu = CL_FALSE, so the user remains responsible for
 *                releasing it.
 *
 *  Parameters:
 *      clu: specifies a valid clu_t object
 *      kernel: specifies a valid cl_kernel object
 *      cmd_queue: specifies a valid cl_command_queue object
 *
 *  return:
 *     void.  Any invalid argument, failed OpenCL query or allocation failure
 *     prints a message to stderr and terminates the process with EXIT_FAILURE.
 *
 * =====================================================================================
 */
void
cluSetKernelCmdQueue (clu_t clu, cl_kernel kernel, cl_command_queue cmd_queue)
{
  clu_i_kernel_t clu_kernel;
  cl_program program;
  cl_device_id device_id;
  cl_build_status build_status;
  cl_int err;

  if (clu == NULL) {
    fprintf (stderr, "ERROR: function %s has been invoked with clu object = NULL in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  if (cmd_queue == NULL) {
    fprintf (stderr, "ERROR in func %s, parameter cmd_queue is not initialized in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  if (kernel == NULL) {
    fprintf (stderr, "ERROR in func %s, parameter kernel is not initialized in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* find the program associated with the kernel */
  err = clGetKernelInfo (kernel, CL_KERNEL_PROGRAM, sizeof (cl_program), &program, NULL);
  CLU_CHECK_ERROR ("clGetKernelInfo CL_KERNEL_PROGRAM", err);

  /* find the device_id for the command queue */
  err = clGetCommandQueueInfo (cmd_queue, CL_QUEUE_DEVICE, sizeof (cl_device_id), &device_id, NULL);
  CLU_CHECK_ERROR ("clGetCommandQueueInfo CL_QUEUE_DEVICE", err);

  /* Enforce the documented contract: the kernel's program must have been
   * built for the queue's device.  A device that does not belong to the
   * program makes the query itself fail, which CLU_CHECK_ERROR reports.
   */
  err = clGetProgramBuildInfo (program, device_id, CL_PROGRAM_BUILD_STATUS,
			       sizeof (cl_build_status), &build_status, NULL);
  CLU_CHECK_ERROR ("clGetProgramBuildInfo CL_PROGRAM_BUILD_STATUS", err);

  if (build_status != CL_BUILD_SUCCESS) {
    fprintf (stderr, "ERROR in func %s, the program of the given kernel has not been built for the device of the given command queue in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  clu_kernel = kernel_hash_table_search (clu->kernels, kernel);
  if (clu_kernel == NULL) {
    /* Kernel created outside of CLU: register a new record for it. */
    clu_kernel = (_clu_i_kernel_t *) malloc (sizeof (_clu_i_kernel_t));
    if (clu_kernel == NULL) {
      fprintf (stderr, "ERROR in func %s, cannot allocate memory for clu_i_kernel_t object in file %s, line %d\n",
	       __PRETTY_FUNCTION__, __FILE__, __LINE__);
      exit (EXIT_FAILURE);
    }
    clu_kernel->kernel = kernel;
    clu_kernel->clu = clu;
    clu_kernel->device_id = device_id;
    clu_kernel->name = NULL;
    clu_kernel->cmd_queue = cmd_queue;
    clu_kernel->work_dim = 0;
    clu_kernel->global_work_offset = NULL;
    clu_kernel->global_work_size = NULL;
    clu_kernel->local_work_size = NULL;
    clu_kernel->num_events_in_wait_list = 0;
    clu_kernel->event_wait_list = NULL;
    clu_kernel->type = CLU_TASK;
    clu_kernel->created_by_clu = CL_FALSE;
    clu_kernel->profiling_on = CL_FALSE;
    /* Give the event vector a defined value until cluEnableKernelProfiling
     * creates it. */
    clu_kernel->events = NULL;

    /* cluRunKernel compares its num_args parameter against this field, so it
     * has to hold the kernel's real argument count; a fixed 0 would reject
     * every external kernel that takes arguments. */
    err = clGetKernelInfo (kernel, CL_KERNEL_NUM_ARGS, sizeof (cl_uint), &clu_kernel->num_args, NULL);
    CLU_CHECK_ERROR ("clGetKernelInfo CL_KERNEL_NUM_ARGS", err);

    kernel_hash_table_add (clu->kernels, clu_kernel, clu_kernel->kernel);
  } else {
    /* Kernel already known to CLU: just retarget it. */
    clu_kernel->device_id = device_id;
    clu_kernel->cmd_queue = cmd_queue;
  }
}

/*
 * ===  FUNCTION  ======================================================================
 *         Name:  cluRunKernel
 *  Description:  enqueue this kernel onto the kernel's command queue for execution.
 *            The kernel can be an NDRange Kernel or a Task Kernel.  If it's an
 *            NDRange kernel, cluSetKernelNDRange should be called earlier.
 *
 *            When profiling is enabled for the kernel, the event of this launch
 *            is also stored in the kernel's event vector for
 *            cluGetKernelExecTime.
 *
 *  Parameters:
 *      clu: specifies the clu object
 *      kernel: specifies the cl_kernel object
 *      event: returns an event object that identifies this particular kernel execution
 *             instance. If event is NULL, no event is returned to the caller and
 *             therefore it will not be possible for the application to
 *             query or queue a wait for this particular kernel execution instance
 *      num_args: the number of input arguments for this kernel; must equal the
 *             argument count recorded for the kernel
 *      ...:   num_args pairs of (size_t arg_size, void *arg_value), in argument
 *             order; each pair is passed to clSetKernelArg.  The size must be
 *             passed as a size_t.
 *
 *  Return:
 *      void.  Any invalid argument or failed OpenCL call prints a message to
 *      stderr and terminates the process with EXIT_FAILURE.
 *
 * =====================================================================================
 */
void
cluRunKernel (clu_t clu, cl_kernel kernel, cl_event * event, cl_uint num_args, ...)
{
  cl_int err;
  cl_uint arg_index;
  size_t arg_size;
  const void *arg_val;
  const char *kernel_name;
  clu_i_kernel_t clu_kernel;

  va_list arg_list;
  cl_event ret_event = NULL;
  cl_event *event_ptr;

  if (clu == NULL) {
    fprintf (stderr, "ERROR: function %s has been invoked with clu object = NULL in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  if (kernel == NULL) {
    fprintf (stderr, "ERROR in func %s, parameter kernel is not initialized in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  clu_kernel = kernel_hash_table_search (clu->kernels, kernel);

  if (clu_kernel == NULL) {
    fprintf (stderr, "ERROR in func %s, the given cl_kernel was not created by cluCreateKernel or there is no command queue associated with it, in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* Kernels registered through cluSetKernelCmdQueue have a NULL name; never
   * hand NULL to a %s conversion. */
  kernel_name = (clu_kernel->name != NULL) ? clu_kernel->name : "(unknown)";

  if (clu_kernel->num_args != num_args) {
    fprintf (stderr, "ERROR in func %s, Number of arguments (%u) does not match number of arguments in kernel signature %u for kernel %s at file %s line %d\n",
	     __PRETTY_FUNCTION__, num_args, clu_kernel->num_args, kernel_name, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /*-----------------------------------------------------------------------------
   *  loop through the arguments and call clSetKernelArg for each argument
   *-----------------------------------------------------------------------------*/
  va_start (arg_list, num_args);
  for (arg_index = 0; arg_index < num_args; arg_index++) {
    arg_size = va_arg (arg_list, size_t);
    arg_val = va_arg (arg_list, void *);

    err = clSetKernelArg (kernel, arg_index, arg_size, arg_val);
    if (err != CL_SUCCESS) {
      fprintf (stderr, "clSetKernelArg [%u] for kernel %s failed: file %s line %d (%s)\n",
	       arg_index, kernel_name, __FILE__, __LINE__, cluGetErrorString (err));
      exit (EXIT_FAILURE);
    }
  }
  va_end (arg_list);

  /* With profiling on, CLU needs the event even if the caller does not ask
   * for one, so always capture it locally in that case. */
  if (clu_kernel->profiling_on == CL_TRUE) {
    event_ptr = &ret_event;
  } else {
    event_ptr = event;
  }

  if (clu_kernel->type == CLU_NDRANGE) {
    err = clEnqueueNDRangeKernel (clu_kernel->cmd_queue, kernel,
				  clu_kernel->work_dim,
				  clu_kernel->global_work_offset,
				  clu_kernel->global_work_size,
				  clu_kernel->local_work_size,
				  clu_kernel->num_events_in_wait_list,
				  clu_kernel->event_wait_list, event_ptr);
    CLU_CHECK_ERROR ("clEnqueueNDRangeKernel", err);
  } else if (clu_kernel->type == CLU_TASK) {
    err = clEnqueueTask (clu_kernel->cmd_queue, kernel,
			 clu_kernel->num_events_in_wait_list,
			 clu_kernel->event_wait_list, event_ptr);
    CLU_CHECK_ERROR ("clEnqueueTask", err);
  } else {
    fprintf (stderr, "ERROR in func %s, clu_kernel->type = %d is invalid, %s, at line %d\n",
	     __PRETTY_FUNCTION__, clu_kernel->type, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  if (clu_kernel->profiling_on == CL_TRUE) {
    if (event != NULL) {
      /* Both the caller and CLU keep the event: take a second reference so
       * that each side can release its own. */
      *event = ret_event;
      err = clRetainEvent (ret_event);
      CLU_CHECK_ERROR ("clRetainEvent", err);
    }
    /* When the caller passed event == NULL, the reference returned by the
     * enqueue call is the one CLU keeps.  Retaining again would leave a
     * reference that nobody releases.
     * NOTE(review): assumes event_vector_release_elements drops exactly one
     * reference per stored event -- confirm against its implementation. */
    event_vector_add (clu_kernel->events, ret_event);
  }
}

/*
 * ===  FUNCTION  ======================================================================
 *         Name:  cluGetCLContext
 *  Description:  returns the OpenCL context stored in the clu object
 *
 *  Parameters:
 *      clu: the clu object; a NULL value prints an error to stderr and
 *           terminates the process with EXIT_FAILURE
 *
 *  Returns:
 *      cl_context
 * =====================================================================================
 */
cl_context
cluGetCLContext (clu_t clu)
{
  _clu_t *handle;

  if (!clu) {
    fprintf (stderr, "ERROR: function %s is called with an invalid clu_t object in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  handle = (_clu_t *) clu;
  return handle->context;
}

/*
 * ===  FUNCTION  ======================================================================
 *         Name:  cluGetCLPlatformID
 *  Description:  returns the cl_platform_id stored in the clu object
 *
 *  Parameters:
 *      clu: the clu object; a NULL value prints an error to stderr and
 *           terminates the process with EXIT_FAILURE
 *
 *  Returns:
 *      cl_platform_id
 * =====================================================================================
 */
cl_platform_id
cluGetCLPlatformID (clu_t clu)
{
  _clu_t *handle;

  if (!clu) {
    fprintf (stderr, "ERROR: function %s is called with an invalid clu_t object in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  handle = (_clu_t *) clu;
  return handle->platform;
}

/*
 * ===  FUNCTION  ======================================================================
 *         Name:  cluDestroy
 *  Description:  cleans up clu and releases the resources associated with it.  Once
 *            this function returns, any further reference to the clu object
 *            results in undefined behavior.
 *
 *  Parameters:
 *      clu_platform: the clu object to destroy; a NULL value prints an error to
 *            stderr and terminates the process with EXIT_FAILURE
 *
 *  Returns:
 *          void
 * =====================================================================================
 */
void
cluDestroy (clu_t clu_platform)
{
  _clu_t *self;

  self = (_clu_t *) clu_platform;
  if (!self) {
    fprintf (stderr, "ERROR: function %s is called with an invalid clu_t object in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* programs: release the elements, then the vector itself */
  program_vector_release_elements (self->programs);
  program_vector_free (self->programs);

  /* kernel hash table */
  kernel_hash_table_destroy (self->kernels);

  /* device hash table */
  device_hash_table_destroy (self->devices);

  /* command queues: release the elements, then the vector itself */
  cmdq_vector_release_elements (self->cmd_queues);
  cmdq_vector_free (self->cmd_queues);

  /* the context goes last, followed by the clu object's own storage */
  clReleaseContext (self->context);
  free (self);
}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluPrintDeviceInfo
 *  Description:  prints all information that can be queried about the device
 * =====================================================================================
 */


/* Value/name pairs for the cl_device_type bits CLU can print, ended by a
 * { 0, NULL } entry.
 * NOTE(review): CL_DEVICE_TYPE_DEFAULT is not listed -- confirm that this is
 * intentional. */
static struct list_bitfield device_type_list[] = {
  { CL_DEVICE_TYPE_ACCELERATOR, "CL_DEVICE_TYPE_ACCELERATOR" },
  { CL_DEVICE_TYPE_CPU, "CL_DEVICE_TYPE_CPU" },
  { CL_DEVICE_TYPE_GPU, "CL_DEVICE_TYPE_GPU" },
  { 0, NULL },					/* end of list */
};

/* Value/name pairs for the floating-point configuration bits, ended by a
 * { 0, NULL } entry.  Referenced by the CL_DEVICE_DOUBLE_FP_CONFIG entry of
 * device_params.  CL_FP_SOFT_FLOAT is only listed when the OpenCL headers
 * define CL_VERSION_1_1. */
static struct list_bitfield device_fp_config_list[] = {
  { CL_FP_DENORM, "CL_FP_DENORM" },
  { CL_FP_INF_NAN, "CL_FP_INF_NAN" },
  { CL_FP_ROUND_TO_NEAREST, "CL_FP_ROUND_TO_NEAREST" },
  { CL_FP_ROUND_TO_ZERO, "CL_FP_ROUND_TO_ZERO" },
  { CL_FP_ROUND_TO_INF, "CL_FP_ROUND_TO_INF" },
  { CL_FP_FMA, "CL_FP_FMA" },
#ifdef CL_VERSION_1_1
  { CL_FP_SOFT_FLOAT, "CL_FP_SOFT_FLOAT" },
#endif
  { 0, NULL },					/* end of list */
};

/* Value/name pairs for the execution capability bits, ended by a { 0, NULL }
 * entry.  Referenced by the CL_DEVICE_EXECUTION_CAPABILITIES entry of
 * device_params. */
static struct list_bitfield device_exec_capabilities_list[] = {
  { CL_EXEC_KERNEL, "CL_EXEC_KERNEL" },
  { CL_EXEC_NATIVE_KERNEL, "CL_EXEC_NATIVE_KERNEL" },
  { 0, NULL },					/* end of list */
};

/* Value/name pairs for the command-queue property bits, ended by a
 * { 0, NULL } entry. */
static struct list_bitfield device_queue_properties_list[] = {
  { CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE" },
  { CL_QUEUE_PROFILING_ENABLE, "CL_QUEUE_PROFILING_ENABLE" },
  { 0, NULL },					/* end of list */
};


/* Value/name pairs for the global memory cache type, ended by a { 0, NULL }
 * entry.  Referenced by the CL_DEVICE_GLOBAL_MEM_CACHE_TYPE entry of
 * device_params.
 * NOTE(review): CL_NONE is defined as 0 in the OpenCL headers, the same value
 * the terminator carries; the lookup presumably has to stop on the NULL name
 * rather than on the value for CL_NONE to be printable -- confirm. */
static struct list_uint device_global_mem_cache_type_list[] = {
  { CL_NONE, "CL_NONE" },
  { CL_READ_ONLY_CACHE, "CL_READ_ONLY_CACHE" },
  { CL_READ_WRITE_CACHE, "CL_READ_WRITE_CACHE" },
  { 0, NULL },					/* end of list */
};


/* Value/name pairs for the local memory type, ended by a { 0, NULL } entry.
 * Referenced by the CL_DEVICE_LOCAL_MEM_TYPE entry of device_params. */
static struct list_uint device_local_mem_type_list[] = {
  { CL_LOCAL, "CL_LOCAL" },
  { CL_GLOBAL, "CL_GLOBAL" },
  { 0, NULL },					/* end of list */
};



struct device_param device_params[] = {
  { CL_DEVICE_ADDRESS_BITS, "CL_DEVICE_ADDRESS_BITS", 1, 0, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_AVAILABLE, "CL_DEVICE_AVAILABLE", 1, 0, NULL, &cluInternalPrintDeviceBool, NULL, 0 },
  { CL_DEVICE_COMPILER_AVAILABLE, "CL_DEVICE_COMPILER_AVAILABLE", 1, 0, NULL, &cluInternalPrintDeviceBool, NULL, 0 },
#ifdef CL_VERSION_1_1
  { CL_DEVICE_DOUBLE_FP_CONFIG, "CL_DEVICE_DOUBLE_FP_CONFIG", 1, 1, "cl_khr_fp64", &cluInternalPrintDeviceBitflags, device_fp_config_list, 0 },
#endif
  { CL_DEVICE_ENDIAN_LITTLE, "CL_DEVICE_ENDIAN_LITTLE", 1, 0, NULL, &cluInternalPrintDeviceBool, NULL, 0 },
  { CL_DEVICE_ERROR_CORRECTION_SUPPORT, "CL_DEVICE_ERROR_CORRECTION_SUPPORT", 1, 0, NULL, &cluInternalPrintDeviceBool, NULL, 0 },
  { CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES", 1, 0, NULL, &cluInternalPrintDeviceBitflags, device_exec_capabilities_list, 0 },
  { CL_DEVICE_EXTENSIONS, "CL_DEVICE_EXTENSIONS", 1, 0, NULL, &cluInternalPrintDeviceString, NULL, 0 },
  { CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, "CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE", 1, 0, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, "CL_DEVICE_GLOBAL_MEM_CACHE_SIZE", 1, 0, NULL, &cluInternalPrintDeviceUlong, NULL, 0 },
  { CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, "CL_DEVICE_GLOBAL_MEM_CACHE_TYPE", 1, 0, NULL, &cluInternalPrintDeviceEnumUint, device_global_mem_cache_type_list, 0 },
  { CL_DEVICE_GLOBAL_MEM_SIZE, "CL_DEVICE_GLOBAL_MEM_SIZE", 1, 0, NULL, &cluInternalPrintDeviceUlong, NULL, 0 },
#ifdef CL_VERSION_1_1
  { CL_DEVICE_HOST_UNIFIED_MEMORY, "CL_DEVICE_HOST_UNIFIED_MEMORY", 1, 1, NULL, &cluInternalPrintDeviceBool, NULL, 0 },
#endif
  { CL_DEVICE_IMAGE2D_MAX_HEIGHT, "CL_DEVICE_IMAGE2D_MAX_HEIGHT", 1, 0, NULL, &cluInternalPrintDeviceSizet, NULL, 0 },
  { CL_DEVICE_IMAGE2D_MAX_WIDTH, "CL_DEVICE_IMAGE2D_MAX_WIDTH", 1, 0, NULL, &cluInternalPrintDeviceSizet, NULL, 0 },
  { CL_DEVICE_IMAGE3D_MAX_DEPTH, "CL_DEVICE_IMAGE3D_MAX_DEPTH", 1, 0, NULL, &cluInternalPrintDeviceSizet, NULL, 0 },
  { CL_DEVICE_IMAGE3D_MAX_HEIGHT, "CL_DEVICE_IMAGE3D_MAX_HEIGHT", 1, 0, NULL, &cluInternalPrintDeviceSizet, NULL, 0 },
  { CL_DEVICE_IMAGE3D_MAX_WIDTH, "CL_DEVICE_IMAGE3D_MAX_WIDTH", 1, 0, NULL, &cluInternalPrintDeviceSizet, NULL, 0 },
  { CL_DEVICE_IMAGE_SUPPORT, "CL_DEVICE_IMAGE_SUPPORT", 1, 0, NULL, &cluInternalPrintDeviceBool, NULL, 0 },
  { CL_DEVICE_LOCAL_MEM_SIZE, "CL_DEVICE_LOCAL_MEM_SIZE", 1, 0, NULL, &cluInternalPrintDeviceUlong, NULL, 0 },
  { CL_DEVICE_LOCAL_MEM_TYPE, "CL_DEVICE_LOCAL_MEM_TYPE", 1, 0, NULL, &cluInternalPrintDeviceEnumUint, device_local_mem_type_list, 0 },
  { CL_DEVICE_MAX_CLOCK_FREQUENCY, "CL_DEVICE_MAX_CLOCK_FREQUENCY", 1, 0, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_MAX_COMPUTE_UNITS, "CL_DEVICE_MAX_COMPUTE_UNITS", 1, 0, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_MAX_CONSTANT_ARGS, "CL_DEVICE_MAX_CONSTANT_ARG", 1, 0, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, "CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE", 1, 0, NULL, &cluInternalPrintDeviceUlong, NULL, 0 },
  { CL_DEVICE_MAX_MEM_ALLOC_SIZE, "CL_DEVICE_MAX_MEM_ALLOC_SIZE", 1, 0, NULL, &cluInternalPrintDeviceUlong, NULL, 0 },
  { CL_DEVICE_MAX_PARAMETER_SIZE, "CL_DEVICE_MAX_PARAMETER_SIZE", 1, 0, NULL, &cluInternalPrintDeviceSizet, NULL, 0 },
  { CL_DEVICE_MAX_READ_IMAGE_ARGS, "CL_DEVICE_MAX_READ_IMAGE_ARGS", 1, 0, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_MAX_SAMPLERS, "CL_DEVICE_MAX_SAMPLERS", 1, 0, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_MAX_WORK_GROUP_SIZE, "CL_DEVICE_MAX_WORK_GROUP_SIZE", 1, 0, NULL, &cluInternalPrintDeviceSizet, NULL, 0 },
  { CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS", 1, 0, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_MAX_WORK_ITEM_SIZES, "CL_DEVICE_MAX_WORK_ITEM_SIZES", 1, 0, NULL, &cluInternalPrintDeviceSizet, NULL, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS },
  { CL_DEVICE_MAX_WRITE_IMAGE_ARGS, "CL_DEVICE_MAX_WRITE_IMAGE_ARGS", 1, 0, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_MEM_BASE_ADDR_ALIGN, "CL_DEVICE_MEM_BASE_ADDR_ALIGN", 1, 0, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, "CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE", 1, 0, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_NAME, "CL_DEVICE_NAME", 1, 0, NULL, &cluInternalPrintDeviceString, NULL, 0 },
#ifdef CL_VERSION_1_1
  { CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, "CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR", 1, 1, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, "CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE", 1, 1, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, "CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT", 1, 1, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, "CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF", 1, 1, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, "CL_DEVICE_NATIVE_VECTOR_WIDTH_INT", 1, 1, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, "CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG", 1, 1, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, "CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT", 1, 1, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_OPENCL_C_VERSION, "CL_DEVICE_OPENCL_C_VERSION", 1, 1, NULL, &cluInternalPrintDeviceString, NULL, 0 },
#endif
  { CL_DEVICE_PLATFORM, "CL_DEVICE_PLATFORM", 1, 0, NULL, &cluInternalPrintDevicePtr, NULL, 0 },
  { CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR", 1, 0, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE", 1, 0, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT", 1, 0, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
#ifdef CL_VERSION_1_1
  { CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF", 1, 1, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
#endif 
  { CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT", 1, 0, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG", 1, 0, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT", 1, 0, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_PROFILE, "CL_DEVICE_PROFILE", 1, 0, NULL, &cluInternalPrintDeviceString, NULL, 0 },
#ifdef CL_VERSION_1_1
  { CL_DEVICE_PROFILING_TIMER_RESOLUTION, "CL_DEVICE_PROFILING_TIMER_RESOLUTION", 1, 1, NULL, &cluInternalPrintDeviceSizet, NULL, 0 },
#endif
  { CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES", 1, 0, NULL, &cluInternalPrintDeviceBitflags, device_queue_properties_list , 0}, 
  { CL_DEVICE_SINGLE_FP_CONFIG, "CL_DEVICE_SINGLE_FP_CONFIG", 1, 0, NULL, &cluInternalPrintDeviceBitflags, device_fp_config_list, 0 },
  { CL_DEVICE_TYPE, "CL_DEVICE_TYPE", 1, 0, NULL, &cluInternalPrintDeviceEnum, device_type_list, 0 },
  { CL_DEVICE_VENDOR, "CL_DEVICE_VENDOR", 1, 0, NULL, &cluInternalPrintDeviceString, NULL, 0 },
  { CL_DEVICE_VENDOR_ID, "CL_DEVICE_VENDOR_ID", 1, 0, NULL, &cluInternalPrintDeviceUint, NULL, 0 },
  { CL_DEVICE_VERSION, "CL_DEVICE_VERSION", 1, 0, NULL, &cluInternalPrintDeviceString, NULL, 0 },
  { CL_DRIVER_VERSION, "CL_DRIVER_VERSION", 1, 0, NULL, &cluInternalPrintDeviceString, NULL, 0 },
};



/*
 * Prints the device_params entries that apply to the given device.
 *
 * The device's CL_DEVICE_VERSION string is fetched and parsed as
 * "OpenCL <major>.<minor>".  An entry is printed when the device version is
 * at least the entry's reqd_major_version/reqd_minor_version and
 * cluCheckDeviceExtensions() accepts the entry's reqd_extension.  Each
 * printed entry is its name, left-justified in 39 columns, then " = ",
 * followed by whatever the entry's print_func emits.
 *
 * Exits the process when id is NULL, when the version buffer cannot be
 * allocated, or when the version string does not have the expected form.
 */
void
cluPrintDeviceInfo (cl_device_id id)
{
  size_t i;
  int major_version = 0;
  int minor_version = 0;

  if (id == NULL) {
    fprintf (stderr, "ERROR: function %s is called with an invalid cl_device_id object in file %s, at line %d\n",
             __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  /* Fetch the device version, so that queries may be filtered according to the device version. */
  {
    size_t param_size;
    char *version;

    /* First call only asks how many bytes the version string needs. */
    CLU_CHECK_ERROR ("clGetDeviceInfo", clGetDeviceInfo (id, CL_DEVICE_VERSION, 0, NULL, &param_size));

    version = (char *)malloc(param_size);
    if (!version) {
      fprintf (stderr, "ERROR: cannot allocate %zu bytes for version string in file %s, at line %d\n",
               param_size, __FILE__, __LINE__);
      exit (EXIT_FAILURE);
    }

    CLU_CHECK_ERROR ("clGetDeviceInfo", clGetDeviceInfo (id, CL_DEVICE_VERSION, param_size, version, NULL));

    if (sscanf(version, "OpenCL %d.%d", &major_version, &minor_version) != 2) {
      fprintf (stderr, "ERROR: unexpected CL_DEVICE_VERSION text (%s) in file %s, at line %d\n",
               version, __FILE__, __LINE__);
      exit (EXIT_FAILURE);
    }
    free (version);
  }

  for (i = 0; i < sizeof(device_params)/sizeof(device_params[0]); i++) {
    /* Skip queries that need a newer OpenCL version than the device reports. */
    if ((major_version < device_params[i].reqd_major_version) ||
        ((major_version == device_params[i].reqd_major_version) &&
         (minor_version < device_params[i].reqd_minor_version)))
      continue;

    /* Skip queries whose required extension(s) the device does not list. */
    if (!cluCheckDeviceExtensions(id, device_params[i].reqd_extension))
      continue;

    printf("%-39s = ", device_params[i].name_string);
    (*device_params[i].print_func)(id, &device_params[i]);
  }
  return;
}


/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluPrintPlatformInfo
 *  Description:  prints all information that can be queried about the platform
 * =====================================================================================
 */
/* Number of entries in platform_params; also used as the iteration bound
 * over platform_params_desc in cluPrintPlatformInfo. */
#define NUM_PLATFORM_PARAMS	(sizeof(platform_params)/sizeof(cl_platform_info))

/* Platform queries issued by cluPrintPlatformInfo, in the order they are printed. */
static cl_platform_info platform_params[] = {
  CL_PLATFORM_NAME,
  CL_PLATFORM_VERSION,
  CL_PLATFORM_VENDOR,
  CL_PLATFORM_PROFILE,
  CL_PLATFORM_EXTENSIONS
};

/* Printable label for each query above.  Entry i labels platform_params[i],
 * so both tables must keep the same length and order. */
static char *platform_params_desc[] = {
  "CL_PLATFORM_NAME",
  "CL_PLATFORM_VERSION",
  "CL_PLATFORM_VENDOR",
  "CL_PLATFORM_PROFILE",
  "CL_PLATFORM_EXTENSIONS"
};

void
cluPrintPlatformInfo (clu_t clu)
{
  cl_platform_id platform;
  int idx = 0;

  /* A NULL handle is a caller error: report it and terminate. */
  if (NULL == clu) {
    fprintf (stderr, "ERROR: function %s is called with an invalid clu_t object in file %s, at line %d\n",
             __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  platform = cluGetCLPlatformID (clu);

  /* Every entry of platform_params is printed as a quoted string. */
  while (idx < NUM_PLATFORM_PARAMS) {
    size_t nbytes;
    char *text;

    /* Ask for the size of the value first, then fetch it into a buffer of that size. */
    CLU_CHECK_ERROR ("clGetPlatformInfo", clGetPlatformInfo (platform, platform_params[idx], 0, NULL, &nbytes));

    text = malloc (nbytes);
    if (text == NULL) {
      fprintf (stderr, "ERROR: cannot allocate memory for string buffer in file %s, at line %d\n", __FILE__, __LINE__);
      exit (EXIT_FAILURE);
    }

    CLU_CHECK_ERROR ("clGetPlatformInfo", clGetPlatformInfo (platform, platform_params[idx], nbytes, text, NULL));

    printf ("%-39s = \"%s\"\n", platform_params_desc[idx], text);

    free (text);
    idx++;
  }
}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluGetAvailableLocalMem
 *  Description:  returns the amount (in bytes) of available local memory on the input device to run
 *                the input kernel
 *
 *  Params: 
 *  device_id: specifies the cl_device_id
 *  kernel: specifies the cl_kernel
 *
 *  Returns:
 *      The amount (in bytes) of available local memory 
 * =====================================================================================
 */
cl_ulong
cluGetAvailableLocalMem (cl_device_id device_id, cl_kernel kernel)
{
  /*
   * Result is CL_DEVICE_LOCAL_MEM_SIZE of the device minus
   * CL_KERNEL_LOCAL_MEM_SIZE of the kernel on that device, clamped at 0.
   * The process exits when either argument is NULL.
   */
  cl_ulong total_local_mem = 0;
  cl_ulong used_local_mem = 0;
  cl_int rc;

  if (device_id == NULL) {
    fprintf (stderr, "ERROR: the input device_id to function %s, in file %s, line %d is not initialized\n",
             __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  if (kernel == NULL) {
    fprintf (stderr, "ERROR: the input kernel to function %s, in file %s, line %d is not initialized\n",
             __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  rc = clGetDeviceInfo (device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof (cl_ulong), (void *) &total_local_mem, NULL);
  CLU_CHECK_ERROR ("clGetDeviceInfo CL_DEVICE_LOCAL_MEM_SIZE", rc);

  rc = clGetKernelWorkGroupInfo (kernel, device_id, CL_KERNEL_LOCAL_MEM_SIZE, sizeof (cl_ulong), &used_local_mem, NULL);
  CLU_CHECK_ERROR ("clGetKernelWorkGroupInfo CL_KERNEL_LOCAL_MEM_SIZE", rc);

  /* Both values are unsigned: subtracting a larger "used" amount would wrap
   * around to a huge value, so report "nothing available" instead. */
  if (used_local_mem >= total_local_mem)
    return 0;

  return (total_local_mem - used_local_mem);
}


/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluCheckLocalWorkgroupSize
 *  Description:  checks whether the user's required number of work item dimensions and
 *                local_work_sizes can be supported by the given device and kernel
 *
 *  Parameters:
 *      device_id: specifies the cl_device_id
 *      kernel: specifies the cl_kernel 
 *      work_dim: specifies the number of dimensions in the work group.
 *      local_work_sizes: points to an array of work_dim unsigned values that describe the
 *           number of work-items that make up a work-group. 
 *
 *  Returns
 *      CL_TRUE if the work item dimensions work_dim and the local_work_sizes are supported
 *      CL_FALSE if not
 * =====================================================================================
 */
cl_bool
cluCheckLocalWorkgroupSize (cl_device_id device_id, cl_kernel kernel,
                cl_uint work_dim, const size_t * local_work_sizes)
{
  /*
   * Checks performed, in order (a diagnostic is written to stderr and
   * CL_FALSE returned on the first failure):
   *   1. work_dim <= CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
   *   2. local_work_sizes[i] <= CL_DEVICE_MAX_WORK_ITEM_SIZES[i] for i < work_dim
   *   3. product of local_work_sizes <= CL_DEVICE_MAX_WORK_GROUP_SIZE
   *   4. product of local_work_sizes <= CL_KERNEL_WORK_GROUP_SIZE
   * When local_work_sizes is NULL step 2 is skipped and the product is 1.
   * A NULL device_id or kernel, or an allocation failure, exits the process.
   */
  unsigned int i;
  cl_uint max_dimensions;
  size_t *max_work_item_sizes;
  size_t max_work_group_size;
  size_t kernel_work_group_size;
  size_t total_local_wg_size = 1;
  cl_int rc;

  if (device_id == NULL) {
    fprintf (stderr, "ERROR: the input device_id to function %s, in file %s, line %d is not initialized\n",
             __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  if (kernel == NULL) {
    fprintf (stderr, "ERROR: the input kernel to function %s, in file %s, line %d is not initialized\n",
             __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  rc = clGetDeviceInfo (device_id, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof (cl_uint), (void *) &max_dimensions, NULL);
  CLU_CHECK_ERROR ("clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS", rc);

  if (work_dim > max_dimensions) {
    fprintf (stderr, "The input number of work dimensions work_dim = %u is greater than the device max dimensions = %u in func %s\n",
             (unsigned int) work_dim, (unsigned int) max_dimensions, __PRETTY_FUNCTION__);
    return CL_FALSE;
  }

  /* One limit per device dimension; work_dim <= max_dimensions is already
   * established, so indexing with i < work_dim below stays in bounds. */
  max_work_item_sizes = (size_t *) malloc (max_dimensions * sizeof (size_t));
  if (!max_work_item_sizes) {
    fprintf (stderr, "ERROR: cannot allocate memory for max_work_item_sizes in func %s, file %s, line %d\n",
             __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }
  rc = clGetDeviceInfo (device_id, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof (size_t) * max_dimensions, (void *) max_work_item_sizes, NULL);
  CLU_CHECK_ERROR ("clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_SIZES", rc);

  rc = clGetDeviceInfo (device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof (size_t), (void *) &max_work_group_size, NULL);
  CLU_CHECK_ERROR ("clGetDeviceInfo CL_DEVICE_MAX_WORK_GROUP_SIZE", rc);

  rc = clGetKernelWorkGroupInfo (kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof (size_t), &kernel_work_group_size, NULL);
  CLU_CHECK_ERROR ("clGetKernelWorkGroupInfo CL_KERNEL_WORK_GROUP_SIZE", rc);

  if (local_work_sizes != NULL) {
    for (i = 0; i < work_dim; i++) {
      if (local_work_sizes[i] > max_work_item_sizes[i]) {
        fprintf (stderr, "local_work_sizes[%u]=%zu is greater than max_work_item_sizes[%u] = %zu\n",
                 i, local_work_sizes[i], i, max_work_item_sizes[i]);
        free (max_work_item_sizes);
        return CL_FALSE;
      }
      total_local_wg_size *= local_work_sizes[i];
    }
  }
  free (max_work_item_sizes);

  if (total_local_wg_size > max_work_group_size) {
    fprintf (stderr, "total local work size %zu is greater than device max work group size %zu\n",
             total_local_wg_size, max_work_group_size);
    return CL_FALSE;
  }
  if (total_local_wg_size > kernel_work_group_size) {
    fprintf (stderr, "total local work size %zu is greater than kernel work group size %zu\n",
             total_local_wg_size, kernel_work_group_size);
    return CL_FALSE;
  }
  return CL_TRUE;
}



/* 
* ===  FUNCTION  ======================================================================
*         Name:  cluGetErrorString
*  Description:  returns a constant character string representing the CL error
* =====================================================================================
*/
const char *cluGetErrorString (cl_int errcode)
{
  /*
   * Maps an OpenCL status code to the name of its constant.  The returned
   * pointer refers to a string literal and must not be freed or modified.
   * Codes that were added after OpenCL 1.0 are compiled in only when the
   * OpenCL headers in use define them.  Any other value yields "Unknown".
   */
  switch (errcode) {
  case CL_SUCCESS:
    return "CL_SUCCESS";
  case CL_DEVICE_NOT_FOUND:
    return "CL_DEVICE_NOT_FOUND";
  case CL_DEVICE_NOT_AVAILABLE:
    return "CL_DEVICE_NOT_AVAILABLE";
  case CL_COMPILER_NOT_AVAILABLE:
    return "CL_COMPILER_NOT_AVAILABLE";
  case CL_MEM_OBJECT_ALLOCATION_FAILURE:
    return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
  case CL_OUT_OF_RESOURCES:
    return "CL_OUT_OF_RESOURCES";
  case CL_OUT_OF_HOST_MEMORY:
    return "CL_OUT_OF_HOST_MEMORY";
  case CL_PROFILING_INFO_NOT_AVAILABLE:
    return "CL_PROFILING_INFO_NOT_AVAILABLE";
  case CL_MEM_COPY_OVERLAP:
    return "CL_MEM_COPY_OVERLAP";
  case CL_IMAGE_FORMAT_MISMATCH:
    return "CL_IMAGE_FORMAT_MISMATCH";
  case CL_IMAGE_FORMAT_NOT_SUPPORTED:
    return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
  case CL_BUILD_PROGRAM_FAILURE:
    return "CL_BUILD_PROGRAM_FAILURE";
  case CL_MAP_FAILURE:
    return "CL_MAP_FAILURE";
#ifdef CL_MISALIGNED_SUB_BUFFER_OFFSET
  case CL_MISALIGNED_SUB_BUFFER_OFFSET:
    return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
#endif
#ifdef CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST
  case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
    return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
#endif
#ifdef CL_COMPILE_PROGRAM_FAILURE
  case CL_COMPILE_PROGRAM_FAILURE:
    return "CL_COMPILE_PROGRAM_FAILURE";
#endif
#ifdef CL_LINKER_NOT_AVAILABLE
  case CL_LINKER_NOT_AVAILABLE:
    return "CL_LINKER_NOT_AVAILABLE";
#endif
#ifdef CL_LINK_PROGRAM_FAILURE
  case CL_LINK_PROGRAM_FAILURE:
    return "CL_LINK_PROGRAM_FAILURE";
#endif
#ifdef CL_DEVICE_PARTITION_FAILED
  case CL_DEVICE_PARTITION_FAILED:
    return "CL_DEVICE_PARTITION_FAILED";
#endif
#ifdef CL_KERNEL_ARG_INFO_NOT_AVAILABLE
  case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
    return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
#endif
  case CL_INVALID_VALUE:
    return "CL_INVALID_VALUE";
  case CL_INVALID_DEVICE_TYPE:
    return "CL_INVALID_DEVICE_TYPE";
  case CL_INVALID_PLATFORM:
    return "CL_INVALID_PLATFORM";
  case CL_INVALID_DEVICE:
    return "CL_INVALID_DEVICE";
  case CL_INVALID_CONTEXT:
    return "CL_INVALID_CONTEXT";
  case CL_INVALID_QUEUE_PROPERTIES:
    return "CL_INVALID_QUEUE_PROPERTIES";
  case CL_INVALID_COMMAND_QUEUE:
    return "CL_INVALID_COMMAND_QUEUE";
  case CL_INVALID_HOST_PTR:
    return "CL_INVALID_HOST_PTR";
  case CL_INVALID_MEM_OBJECT:
    return "CL_INVALID_MEM_OBJECT";
  case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
    return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
  case CL_INVALID_IMAGE_SIZE:
    return "CL_INVALID_IMAGE_SIZE";
  case CL_INVALID_SAMPLER:
    return "CL_INVALID_SAMPLER";
  case CL_INVALID_BINARY:
    return "CL_INVALID_BINARY";
  case CL_INVALID_BUILD_OPTIONS:
    return "CL_INVALID_BUILD_OPTIONS";
  case CL_INVALID_PROGRAM:
    return "CL_INVALID_PROGRAM";
  case CL_INVALID_PROGRAM_EXECUTABLE:
    return "CL_INVALID_PROGRAM_EXECUTABLE";
  case CL_INVALID_KERNEL_NAME:
    return "CL_INVALID_KERNEL_NAME";
  case CL_INVALID_KERNEL_DEFINITION:
    return "CL_INVALID_KERNEL_DEFINITION";
  case CL_INVALID_KERNEL:
    return "CL_INVALID_KERNEL";
  case CL_INVALID_ARG_INDEX:
    return "CL_INVALID_ARG_INDEX";
  case CL_INVALID_ARG_VALUE:
    return "CL_INVALID_ARG_VALUE";
  case CL_INVALID_ARG_SIZE:
    return "CL_INVALID_ARG_SIZE";
  case CL_INVALID_KERNEL_ARGS:
    return "CL_INVALID_KERNEL_ARGS";
  case CL_INVALID_WORK_DIMENSION:
    return "CL_INVALID_WORK_DIMENSION";
  case CL_INVALID_WORK_GROUP_SIZE:
    return "CL_INVALID_WORK_GROUP_SIZE";
  case CL_INVALID_WORK_ITEM_SIZE:
    return "CL_INVALID_WORK_ITEM_SIZE";
  case CL_INVALID_GLOBAL_OFFSET:
    return "CL_INVALID_GLOBAL_OFFSET";
  case CL_INVALID_EVENT_WAIT_LIST:
    return "CL_INVALID_EVENT_WAIT_LIST";
  case CL_INVALID_EVENT:
    return "CL_INVALID_EVENT";
  case CL_INVALID_OPERATION:
    return "CL_INVALID_OPERATION";
  case CL_INVALID_GL_OBJECT:
    return "CL_INVALID_GL_OBJECT";
  case CL_INVALID_BUFFER_SIZE:
    return "CL_INVALID_BUFFER_SIZE";
  case CL_INVALID_MIP_LEVEL:
    return "CL_INVALID_MIP_LEVEL";
#ifdef CL_INVALID_GLOBAL_WORK_SIZE
  case CL_INVALID_GLOBAL_WORK_SIZE:
    return "CL_INVALID_GLOBAL_WORK_SIZE";
#endif
#ifdef CL_INVALID_PROPERTY
  case CL_INVALID_PROPERTY:
    return "CL_INVALID_PROPERTY";
#endif
#ifdef CL_INVALID_IMAGE_DESCRIPTOR
  case CL_INVALID_IMAGE_DESCRIPTOR:
    return "CL_INVALID_IMAGE_DESCRIPTOR";
#endif
#ifdef CL_INVALID_COMPILER_OPTIONS
  case CL_INVALID_COMPILER_OPTIONS:
    return "CL_INVALID_COMPILER_OPTIONS";
#endif
#ifdef CL_INVALID_LINKER_OPTIONS
  case CL_INVALID_LINKER_OPTIONS:
    return "CL_INVALID_LINKER_OPTIONS";
#endif
#ifdef CL_INVALID_DEVICE_PARTITION_COUNT
  case CL_INVALID_DEVICE_PARTITION_COUNT:
    return "CL_INVALID_DEVICE_PARTITION_COUNT";
#endif
#ifdef CL_INVALID_PIPE_SIZE
  case CL_INVALID_PIPE_SIZE:
    return "CL_INVALID_PIPE_SIZE";
#endif
#ifdef CL_INVALID_DEVICE_QUEUE
  case CL_INVALID_DEVICE_QUEUE:
    return "CL_INVALID_DEVICE_QUEUE";
#endif
#ifdef CL_INVALID_SPEC_ID
  case CL_INVALID_SPEC_ID:
    return "CL_INVALID_SPEC_ID";
#endif
#ifdef CL_MAX_SIZE_RESTRICTION_EXCEEDED
  case CL_MAX_SIZE_RESTRICTION_EXCEEDED:
    return "CL_MAX_SIZE_RESTRICTION_EXCEEDED";
#endif
#ifdef CL_PLATFORM_NOT_FOUND_KHR
  case CL_PLATFORM_NOT_FOUND_KHR:
    return "CL_PLATFORM_NOT_FOUND_KHR";
#endif
  default:
    return "Unknown";
  }
}


/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluCheckDeviceExtensions
 *  Description:  Checks availability of an OpenCL extension or set of extensions.
 *
 *  Parameters:
 *     dev_id:    specifies the device to be checked for extension support
 *     ext_names: a space separated list of OpenCL extension names
 *
 * =====================================================================================
 */
cl_bool cluCheckDeviceExtensions (cl_device_id dev_id, const char *ext_names)
{
  cl_bool rc = CL_TRUE;

  if (dev_id == NULL) {
    fprintf (stderr, "ERROR: function %s is called with an invalid clu_devive_id in file %s, at line %d\n",
	     __PRETTY_FUNCTION__, __FILE__, __LINE__);
    exit (EXIT_FAILURE);
  }

  if (ext_names) {
    size_t len, n, m;
    char *ext_str, *ptr;
    cl_bool match;

    /* Allocate memory for the extension string */
    CLU_CHECK_ERROR ("clGetDeviceInfo extensions", clGetDeviceInfo (dev_id, CL_DEVICE_EXTENSIONS, 0, NULL, &len));

    if ((ext_str = (char *)malloc(len)) == NULL) {
      fprintf (stderr, "ERROR in func %s, failed to alloc %zd mem for extension string, file=%s, line=%d\n",
	       __PRETTY_FUNCTION__, len, __FILE__, __LINE__);
      exit (EXIT_FAILURE);
    }
    /* Get the extension string */
    CLU_CHECK_ERROR ("clGetDeviceInfo extensions", clGetDeviceInfo (dev_id, CL_DEVICE_EXTENSIONS, len, ext_str, NULL));
    
    /* For each extension in ext_names */
    while (*ext_names != '\0') {
      n = strcspn(ext_names, " ");
      if (n) {
	/* For each of the extensions in the device's extension string */
	ptr = ext_str;
	
	match = CL_FALSE;

	while (*ptr != '\0') {
	  m = strcspn(ptr, " ");
	  if (m) {
	    if ((n == m) && (strncmp(ptr, ext_names, n) == 0)) {
	      /* Extension string found */
	      match = CL_TRUE;
	      break;
	    }
	    ptr += m;
	  } else {
	    ptr++;	/* Skip spaces */
	  }
	}
	ext_names += n;

	rc &= match;
      } else {
	ext_names++;	/* Skip spaces */
      }
    }
    free(ext_str);
  }
  return rc;
}
