/*************************************************************************/
/*                                                                       */
/* Licensed Materials - Property of IBM                                  */
/*                                                                       */
/*                                                                       */
/* (C) Copyright IBM Corp. 2010                                          */
/* All Rights Reserved                                                   */
/*                                                                       */
/* US Government Users Restricted Rights - Use, duplication or           */
/* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.     */
/*                                                                       */
/*************************************************************************/


#ifndef _CLU_H_
#define _CLU_H_ 1

#ifdef __cplusplus
extern "C"
{
#endif
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <CL/opencl.h>

#ifdef _WIN32
#define __PRETTY_FUNCTION__ __FUNCSIG__
#else
#include <execinfo.h>
#endif


/* #####   TYPE DEFINITIONS  -  EXTERNAL for CLU users #################### 
 *   NOTE: All external CLU structures and defs are prefixed with clu or CLU
 * ######################################################################## */


/* forward declaration of the main CLU state structure (its full definition,
 * and the clu_t pointer typedef, appear later in this header) */
struct _clu_t;

/*
 * clu_create_program_flag_t: flag word passed to cluCreateKernel; built from
 *                            CLU_SOURCE, CLU_BINARY and CLU_NO_CACHE
 * clu_profiling_flag_t:      selector passed to cluGetKernelExecTime; takes
 *                            one of the CLU_PROFILING_* values
 */
typedef cl_bitfield clu_create_program_flag_t;
typedef cl_uint clu_profiling_flag_t;

/*
 * clu_create_program_flag_t bitfield.
 * Each flag is a distinct single bit of a cl_ulong (bits 0, 1 and 2).
 * See the cluCreateKernel documentation for the meaning of each flag.
 */
#define CLU_SOURCE 			((cl_ulong)1 << (cl_ulong)0)
#define CLU_BINARY 			((cl_ulong)1 << (cl_ulong)1)
#define CLU_NO_CACHE 			((cl_ulong)1 << (cl_ulong)2)

/*
 * clu_profiling_flag_t values, accepted by cluGetKernelExecTime.
 * NOTE(review): although historically described as a bitfield, these are not
 * independent bits (0x101 contains the 0x100 bit), so they look like mutually
 * exclusive selectors that should be compared with ==, not tested with & -
 * confirm against the implementation of cluGetKernelExecTime.
 */
#define CLU_PROFILING_ELAPSED_TIME  0x100 
#define CLU_PROFILING_ACCUM_TIME     0x101


/* #####   TYPE DEFINITIONS  -  INTERNAL TO CLU   ######################### 
 *   NOTE: All internal CLU structures are prefixed with clu_i 
 * ######################################################################## */

/*
 * clu_i_program_t:
 *   Structure to keep information about a cl_program.
 *
 *   clu_i_program_t is a POINTER typedef to _clu_i_program_t, so values of
 *   that type refer to a record rather than containing one.
 *
 *   NOTE(review): the per-field notes below are inferred from the field
 *   names and the cluCreateKernel documentation - confirm against the
 *   implementation.
 */
typedef struct _program_t
{
  const char *source_filename;  /* presumably the source file the program was created from */
  const char *binary_filename;  /* presumably the binary file the program was created from / cached to */
  const char *build_options;    /* presumably the options string the program was built with */

  cl_bool built_from_source;    /* presumably CL_TRUE when built from source rather than from a binary */
  cl_program program;           /* the OpenCL program handle this record describes */

  cl_device_id device;          /* presumably the device the program was built for */
} _clu_i_program_t;
typedef _clu_i_program_t *clu_i_program_t;

/*
 * clu_i_program_vector_t
 *   Container structure that contains all the clu_i_program_t's that
 *   are created in clu.
 *
 *   NOTE(review): curr_index / curr_capacity semantics are not visible in
 *   this header; presumably curr_index is the next free slot and
 *   curr_capacity the number of slots allocated in data_ptr - confirm.
 */
typedef struct _program_vector
{
  int curr_index;
  int curr_capacity;
  clu_i_program_t *data_ptr;    /* array of clu_i_program_t (i.e. of pointers to program records) */
} clu_i_program_vector_t;

/*
 * clu_i_cmdq_vector_t
 *   Container structure that contains all the cl_command_queues that
 *   are created in CLU.
 *
 *   NOTE(review): curr_index / curr_capacity semantics are not visible in
 *   this header; presumably curr_index is the next free slot and
 *   curr_capacity the number of slots allocated in data_ptr - confirm.
 */
typedef struct _cmdq_vector
{
  int curr_index;
  int curr_capacity;
  cl_command_queue *data_ptr;   /* array of command queue handles */
} clu_i_cmdq_vector_t;

/*
 * clu_i_event_vector_t
 *   Container structure that contains all the cl_events that need to
 *   be kept in CLU.
 *
 *   NOTE(review): curr_index / curr_capacity semantics are not visible in
 *   this header; presumably curr_index is the next free slot and
 *   curr_capacity the number of slots allocated in data_ptr - confirm.
 */
typedef struct _event_vector
{
  int curr_index;
  int curr_capacity;
  cl_event *data_ptr;           /* array of event handles */
} clu_i_event_vector_t;

/*
 * clu_i_kernel_type_t
 *   Enumerates the two kinds of kernel CLU knows about: NDRange and Task.
 */
typedef enum
{
  CLU_NDRANGE = 0,
  CLU_TASK = 1
} clu_i_kernel_type_t;

/*
 * clu_i_kernel_t
 *   internal structure used to keep all information about a cl_kernel
 *   that CLU uses.
 *
 *   clu_i_kernel_t is a POINTER typedef to _clu_i_kernel_t.
 *
 *   NOTE(review): the per-field notes below are inferred from the field
 *   names and from the parameters of the public cluSetKernel* / cluRunKernel
 *   functions - confirm against the implementation.
 */
typedef struct _kernel_t
{
  cl_kernel kernel;                     /* the OpenCL kernel handle this record describes */
  cl_command_queue cmd_queue;           /* presumably the queue the kernel is enqueued on (see cluSetKernelCmdQueue) */
  const char *name;                     /* presumably the kernel function name */
  cl_device_id device_id;               /* presumably the device owning cmd_queue */
  struct _clu_t *clu;                   /* back-pointer to a CLU state structure */
  cl_uint work_dim;                     /* the fields from work_dim to local_work_size mirror */
  size_t *global_work_offset;           /*   the parameters of cluSetKernelNDRange            */
  size_t *global_work_size;
  size_t *local_work_size;
  cl_uint num_events_in_wait_list;      /* these two mirror the parameters of cluSetKernelDependency */
  const cl_event *event_wait_list;
  clu_i_kernel_type_t type;             /* CLU_NDRANGE or CLU_TASK */
  clu_i_event_vector_t *events;         /* presumably events kept for profiling - confirm */

  unsigned int num_args;                /* presumably the kernel's argument count */
  cl_bool created_by_clu;               /* presumably CL_TRUE when CLU created (and must release) the kernel */
  cl_bool profiling_on;                 /* presumably toggled by cluEnable/DisableKernelProfiling */

} _clu_i_kernel_t;
typedef _clu_i_kernel_t *clu_i_kernel_t;

/*
 * clu_i_device_t
 *   internal structure used to keep all information about a cl_device_id
 *   that CLU uses.
 *
 *   clu_i_device_t is a POINTER typedef to _clu_i_device_t.
 *   Note that cmd_queues is embedded by value here, unlike the vector
 *   pointers held in clu_t.
 */
typedef struct _device_t
{
  cl_device_id device_id;                       /* device_id */
  char* device_name;                            /* name of the device. will need to free */
  clu_i_cmdq_vector_t cmd_queues;               /* stores all the cmd_queues for this device */
  cl_command_queue active_queue;                /* active command queue */
} _clu_i_device_t;
typedef _clu_i_device_t *clu_i_device_t;


/*
 * Sizing and hashing constants for the generic hash tables below.
 * The bucket count is a power of two, expressed through its base-2 log
 * so the two values cannot drift apart.
 */
#define GENERIC_HASH_TABLE_SIZE_LOG 5
#define GENERIC_HASH_TABLE_SIZE (1 << GENERIC_HASH_TABLE_SIZE_LOG)
/* 2^32 * ((sqrt(5)-1)/2) == 2654435769 - used by multiplicative hash function */
#define GENERIC_HASH_CONSTANT (0x9E3779B9U)

/*
 * clu_i_hash_kernel_node_t
 *   One node of the kernel hash table: maps a cl_kernel handle (key) to
 *   its clu_i_kernel_t record (value).  Nodes link to one another through
 *   next, forming a singly linked list.
 */
typedef struct _hash_kernel_node
{
  clu_i_kernel_t value;
  cl_kernel key;
  struct _hash_kernel_node *next;
} clu_i_hash_kernel_node_t;

/*
 * clu_i_kernel_hash_table_t
 *   a hash table for clu_i_kernel_t.
 *   Holds GENERIC_HASH_TABLE_SIZE node pointers; presumably each entry is
 *   the head of a list of nodes whose keys hash to that slot - confirm.
 */
typedef struct _kernel_hash_table
{
  clu_i_hash_kernel_node_t *nodes[GENERIC_HASH_TABLE_SIZE];
} clu_i_kernel_hash_table_t;

/*
 * clu_i_hash_device_node_t
 *   One node of the device hash table: maps a cl_device_id handle (key) to
 *   its clu_i_device_t record (value).  Nodes link to one another through
 *   next, forming a singly linked list.
 */
typedef struct _hash_device_node
{
  clu_i_device_t value;
  cl_device_id key;
  struct _hash_device_node *next;
} clu_i_hash_device_node_t;

/*
 * clu_i_device_hash_table_t
 *   a hash table for clu_i_device_t.
 *   Holds GENERIC_HASH_TABLE_SIZE node pointers; presumably each entry is
 *   the head of a list of nodes whose keys hash to that slot - confirm.
 */
typedef struct _device_hash_table
{
  clu_i_hash_device_node_t *nodes[GENERIC_HASH_TABLE_SIZE];
} clu_i_device_hash_table_t;


/* #####   TYPE DEFINITIONS  -  EXTERNAL for CLU users #################### */

/*
 * clu_t
 *   main structure that clu uses to maintain state.
 *
 *   clu_t is a POINTER typedef to _clu_t; it is the handle type taken by
 *   the public clu* functions declared below.
 */
typedef struct _clu_t
{
  cl_platform_id platform;                      /* platform id. currently only one platform */
  cl_context context;                           /* OpenCL context  */
  clu_i_program_vector_t *programs;             /* stores all the programs  */
  clu_i_kernel_hash_table_t *kernels;           /* stores all the clu kernels */
  clu_i_device_hash_table_t *devices;           /* stores all the clu_devices */
  clu_i_cmdq_vector_t *cmd_queues;              /* stores all the command queues */
  cl_command_queue default_cmd_queue;           /* presumably the default queue created by cluInit - confirm */
  cl_command_queue active_cmd_queue;            /* NOTE(review): how this differs from default_cmd_queue is not visible here */
} _clu_t;
typedef _clu_t *clu_t;


#ifdef _WIN32
/*
 * CLU_CHECK_ERROR(prefix, errcode)  -  Windows variant
 *   If errcode is non-zero, prints the calling function, file and line,
 *   the caller-supplied prefix string, the numeric code and its
 *   cluGetErrorString() text to stderr, then terminates the process with
 *   EXIT_FAILURE.  Does nothing when errcode is zero.
 *
 *   errcode is evaluated exactly once, so an expression with side effects
 *   (e.g. an OpenCL call) is not re-executed while the message is built.
 *   The expansion is a braced block, so the macro can be written with or
 *   without a trailing semicolon.
 */
#define CLU_CHECK_ERROR(prefix,errcode)					\
{									\
  const cl_int clu_i_errcode_ = (errcode);				\
  if (clu_i_errcode_) {							\
    fprintf(stderr, "CLU ERROR in function %s, file %s, line: %d:  %s = %d (%s)\n", __PRETTY_FUNCTION__, __FILE__, __LINE__, (prefix), clu_i_errcode_, cluGetErrorString(clu_i_errcode_)); \
    exit(EXIT_FAILURE);							\
  }									\
}

/*
 * CLU_EXIT_ERROR(fmt, ...)  -  Windows variant
 *   Prints the calling function, file and line to stderr, followed on the
 *   next line by the printf-style message (fmt plus optional arguments),
 *   then terminates the process with EXIT_FAILURE.
 *
 *   The expansion is a single braced block (matching the non-Windows
 *   variant), so the whole macro - including the exit - is governed by an
 *   enclosing unbraced if/for/while.  The format string is taken as part
 *   of __VA_ARGS__ so that a message without extra arguments does not
 *   leave a dangling comma in the fprintf call.
 */
#define CLU_EXIT_ERROR(...)						\
{									\
  fprintf(stderr, "CLU ERROR in function %s, file %s, line %d\n", __PRETTY_FUNCTION__, __FILE__, __LINE__); \
  fprintf(stderr, __VA_ARGS__);						\
  exit (EXIT_FAILURE);							\
}
#else	/* !_WIN32 */

/*
 * CLU_CHECK_ERROR(prefix, errcode)  -  non-Windows variant
 *   If errcode is non-zero, prints the calling function, file and line,
 *   the caller-supplied prefix string, the numeric code and its
 *   cluGetErrorString() text to stderr, followed by a backtrace of up to
 *   10 stack frames, then terminates the process with EXIT_FAILURE.
 *   Does nothing when errcode is zero.
 *
 *   errcode is evaluated exactly once, so an expression with side effects
 *   (e.g. an OpenCL call) is not re-executed while the message is built.
 *   Macro-local variables carry a clu_i_ prefix so they cannot capture
 *   identifiers used in the caller's prefix/errcode expressions.
 *   The expansion is a braced block, so the macro can be written with or
 *   without a trailing semicolon.
 */
#define CLU_CHECK_ERROR(prefix,errcode)					\
{									\
  const cl_int clu_i_errcode_ = (errcode);				\
  if (clu_i_errcode_) {							\
    void *clu_i_frames_[10];						\
    char **clu_i_symbols_;						\
    int clu_i_nframes_;							\
    int clu_i_frame_;							\
									\
    /* Report where the error was detected and what it was */		\
    fprintf(stderr, "CLU ERROR in function %s, file %s, line: %d:  %s = %d (%s)\n", __PRETTY_FUNCTION__, __FILE__, __LINE__, (prefix), clu_i_errcode_, cluGetErrorString(clu_i_errcode_)); \
									\
    /* Fetch and print backtrace (backtrace() counts frames as int) */	\
    clu_i_nframes_ = backtrace (clu_i_frames_, 10);			\
    clu_i_symbols_ = backtrace_symbols (clu_i_frames_, clu_i_nframes_);	\
									\
    fprintf (stderr, "Obtained %d stack frames.\n", clu_i_nframes_);	\
									\
    /* backtrace_symbols() returns NULL when it cannot allocate */	\
    if (clu_i_symbols_ != NULL) {					\
      for (clu_i_frame_ = 0; clu_i_frame_ < clu_i_nframes_; clu_i_frame_++) { \
        fprintf (stderr, "%s\n", clu_i_symbols_[clu_i_frame_]);		\
      }									\
    }									\
									\
    free (clu_i_symbols_);						\
    exit(EXIT_FAILURE);							\
  }									\
}


/*
 * CLU_EXIT_ERROR(fmt, ...)  -  non-Windows variant
 *   Prints the calling function, file and line to stderr, followed on the
 *   same line by " - " and the printf-style message (fmt plus optional
 *   arguments), then terminates the process with EXIT_FAILURE.
 *
 *   Written with the standard __VA_ARGS__ form (the format string is the
 *   first variadic argument) instead of GNU named variadic parameters and
 *   ## comma pasting, so it also builds with strictly conforming compilers.
 *   The expansion is a braced block, so the macro can be written with or
 *   without a trailing semicolon.
 */
#define CLU_EXIT_ERROR(...)						\
{									\
  fprintf(stderr, "CLU ERROR in function %s, file %s, line %d - ", __PRETTY_FUNCTION__, __FILE__, __LINE__); \
  fprintf(stderr, __VA_ARGS__);						\
  exit (EXIT_FAILURE);							\
}
#endif	/* _WIN32 */

/* 
* ===  FUNCTION  ======================================================================
*         Name:  cluInit
*  Description:  Initializes the platform, queries all the devices available on
*                the platform, and creates a context based on the queried devices.
*                Also creates a clu_device from the default device, queries the
*                necessary information, and creates a default command queue.
*
*  Parameters:
*      platform: the OpenCL platform to initialize CLU on.
*                NOTE(review): whether a NULL platform selects a default one is
*                not stated in this header - confirm against the implementation.
*
*  Returns    :  a clu structure
* =====================================================================================
*/
clu_t cluInit (cl_platform_id platform);


/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluGetDeviceID
 *  Description:  Returns the cl_device_id that matches the given device_type,
 *                dev_vendor, and dev_name.
 *
 *                If dev_name or dev_vendor is NULL, then it is not used in the
 *                search criteria.
 *
 *                If there are devices that match the device_type, and the dev_name
 *                and/or dev_vendor matches one of those devices, then that device
 *                is returned.
 *
 *                For ease of use, the input dev_name and the device's name are both
 *                converted to lower case before comparing.
 *  Parameters:
 *      clu:         specifies a valid clu_t object
 *      device_type: the OpenCL device type to search for
 *      dev_vendor:  vendor string to match, or NULL to ignore the vendor
 *      dev_name:    device name to match, or NULL to ignore the name
 *
 *  Returns:
 *                a valid device_id if one is found, NULL otherwise.
 * =====================================================================================
 */
cl_device_id cluGetDeviceID (clu_t clu, cl_device_type device_type, char *dev_vendor, char* dev_name);



/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluGetDeviceName
 *  Description:  Gets the device name string for the specified device
 *
 *  Parameters:
 *      clu:       specifies a valid clu_t object
 *      device_id: specifies the device whose name is requested
 *
 *  Returns:
 *      the device name as a constant character string.
 *      NOTE(review): presumably the string is owned by CLU and must not be
 *      freed by the caller - confirm against the implementation.
 * =====================================================================================
 */
const char * cluGetDeviceName (clu_t clu, cl_device_id device_id);

/* 
* ===  FUNCTION  ======================================================================
*         Name:  cluCreateCmdQueue
*  Description:  Initializes the first device that matches the given device_type. It
*                also creates a command queue for the device and returns the command
*                queue.
*
*  Parameters:
*      clu:        specifies a valid clu_t object
*      dev_id:     if not 0, then this device_id is used to create the clu device
*                  and dtype is ignored
*      dtype:      device_type. If dev_id is 0, then this type is used to find
*                  a matching device in the platform
*      properties: specifies a list of properties for the command queue. See the
*                  OpenCL spec
*
*  Returns    :  the command queue created for the device
* =====================================================================================
*/
cl_command_queue cluCreateCmdQueue (clu_t clu, cl_device_id dev_id,
                                    cl_device_type dtype,
                                    cl_command_queue_properties properties);

/* 
* ===  FUNCTION  ======================================================================
*         Name:  cluSetKernelNDRange
*  Description:
*                Sets this kernel to be an NDRange kernel and sets all the NDRange
*                parameters for the kernel.
*
*  Parameters:
*      clu:    specifies a valid clu_t object
*      kernel: specifies a valid cl_kernel object
*      work_dim, global_work_offset, global_work_size, local_work_size:
*              the NDRange parameters for the kernel.
*              NOTE(review): presumably these have the same meaning as the
*              corresponding clEnqueueNDRangeKernel arguments - confirm.
* =====================================================================================
*/
void cluSetKernelNDRange (clu_t clu, cl_kernel kernel, cl_uint work_dim,
                          const size_t * global_work_offset,
                          const size_t * global_work_size,
                          const size_t * local_work_size);


/* 
* ===  FUNCTION  ======================================================================
*         Name:  cluSetKernelDependency
*  Description:  sets the event dependency list for this kernel
*
*  Parameters:
*      clu:                     specifies a valid clu_t object
*      kernel:                  specifies a valid cl_kernel object
*      num_events_in_wait_list: number of events in event_wait_list
*      event_wait_list:         the events this kernel depends on.
*                               NOTE(review): whether CLU copies this array or
*                               keeps the caller's pointer is not visible here -
*                               confirm before freeing it.
* =====================================================================================
*/
void cluSetKernelDependency (clu_t clu, cl_kernel kernel,
                             cl_uint num_events_in_wait_list,
                             const cl_event * event_wait_list);

/* 
* ===  FUNCTION  ======================================================================
*         Name:  cluRunKernel
*  Description:  Enqueues this kernel onto the kernel's device command queue for
*                execution.
*                The kernel can be an NDRange kernel or a Task kernel.  If it is an
*                NDRange kernel, cluSetKernelNDRange should be called earlier.
*
*  Parameters:
*      clu:    specifies the clu object
*      kernel: specifies the cl_kernel object
*      event:  returns an event object that identifies this particular kernel
*              execution instance. If event is NULL, no event will be created for
*              this kernel execution instance and therefore it will not be possible
*              for the application to query or queue a wait for this particular
*              kernel execution instance.
*
*              NOTE(review): open question carried over from the original
*              comment - clu will not be responsible for cleaning up events?
*
*      num_args: the number of input arguments for this kernel
*      ...:    the kernel arguments.
*              NOTE(review): how each argument is encoded in the variadic list is
*              not described in this header - confirm against the implementation.
*
*  Return:
*      void
*
* =====================================================================================
*/
void cluRunKernel (clu_t clu, cl_kernel kernel, cl_event * event,
                   cl_uint num_args, ...);

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluCreateKernel
 *  Description:  Creates an OpenCL program from either source or binary that is
 *                associated with the given device, builds the program with
 *                build_options, then creates the kernel based on the given
 *                kernel_name and returns it.
 *  Parameters:
 *      clu:           valid clu_t object
 *      cmd_queue:     the cl_command_queue object we want to run the kernel on
 *      filename:      filename for either the source or binary file
 *      kernel_name:   kernel_name
 *      build_options: build_options
 *      flag:          bitfield with the following options
 *                     CLU_SOURCE: build from the source file. If there is a cached
 *                     version of the program available, it will be used to create
 *                     the kernel, otherwise CLU will rebuild the program from the
 *                     source.
 *
 *                     CLU_BINARY: build the program from the binary. The name of
 *                     the binary file is given by filename.
 *
 *                     CLU_NO_CACHE: build the program from the source file
 *                     regardless of whether a cached version of the binary is
 *                     available. The resulting binary will not be cached either.
 *
 *  Returns: the cl_kernel object that was created
 * =====================================================================================
 */

cl_kernel cluCreateKernel (clu_t clu, cl_command_queue cmd_queue,
                           const char *filename, const char *kernel_name,
                           const char *build_options,
                           clu_create_program_flag_t flag);
/* 
* ===  FUNCTION  ======================================================================
*         Name:  cluGetCLContext
*  Description:  get the context associated with clu
*
*  Parameters:
*      clu: specifies a valid clu_t object
*
*  Returns:
*  	cl_context
* =====================================================================================
*/
cl_context cluGetCLContext (clu_t clu);


/* 
* ===  FUNCTION  ======================================================================
*         Name:  cluGetCLPlatformID
*  Description:  get the platform ID associated with clu
*                NOTE(review): presumably this is the platform field of clu_t -
*                confirm against the implementation.
*
*  Parameters:
*      clu: specifies a valid clu_t object
*
*  Returns:
*      cl_platform_id
* =====================================================================================
*/
cl_platform_id cluGetCLPlatformID (clu_t clu);


/* 
* ===  FUNCTION  ======================================================================
*         Name:  cluDestroy
*  Description:  Cleans up clu and releases the resources associated with clu.  Once
*                this function returns, any further reference to clu results in
*                undefined behavior.
*
*  Parameters:
*      clu: the clu_t object to destroy
*
*  Returns:
*  		void 
* =====================================================================================
*/
void cluDestroy (clu_t clu);


/* 
* ===  FUNCTION  ======================================================================
*         Name:  cluGetCLDeviceTypeString
*  Description:  returns a constant character string representing the type of the
*                device
*
*  Parameters:
*      device_type: the OpenCL device type to describe
* =====================================================================================
*/
const char *cluGetCLDeviceTypeString (cl_device_type device_type);



/* 
* ===  FUNCTION  ======================================================================
*         Name:  cluPrintDeviceInfo
*  Description:  prints all information that can be queried about the device
*
*  Parameters:
*      dev: the device to report on
* =====================================================================================
*/
void cluPrintDeviceInfo (cl_device_id dev);


/* 
* ===  FUNCTION  ======================================================================
*         Name:  cluPrintPlatformInfo
*  Description:  prints all information that can be queried about the platform
*
*  Parameters:
*      clu: specifies a valid clu_t object; presumably its platform is the one
*           reported - confirm against the implementation
* =====================================================================================
*/
void cluPrintPlatformInfo (clu_t clu);

/* 
* ===  FUNCTION  ======================================================================
*         Name:  cluSetKernelCmdQueue
*  Description:  This function associates the given kernel with a command queue for
*                execution.  This function is needed if the kernel was created
*                outside of CLU.  It can also be used when the user wants to change
*                the command queue of a kernel.
*
*                If the kernel's program has not been built for the command queue's
*                device, CLU will print out an error message and exit.
*
*                The user is responsible for releasing a kernel that was not
*                created by CLU.
*
*  Parameters:
*  	clu: specifies a valid clu_t object
*  	kernel: specifies a valid cl_kernel object
*  	cmd_queue: specifies a valid cl_command_queue object
*
*  Return:
*     void
*
* =====================================================================================
*/
void cluSetKernelCmdQueue (clu_t clu, cl_kernel kernel,
                           cl_command_queue cmd_queue);


/* 
* ===  FUNCTION  ======================================================================
*         Name:  cluEnableKernelProfiling
*  Description:  Enables profiling for the specified kernel. This function must be
*                called prior to running cluRunKernel.
*
*                There must also be a command queue associated with the kernel,
*                either through creating the kernel via the clu interface
*                cluCreateKernel or through cluSetKernelCmdQueue.
*
*  Parameters:
*
*  	clu: specifies a valid clu_t object
*  	kernel: specifies a valid cl_kernel object. This kernel must have a
*  	        cl_command_queue associated with it.
*
*  Return:
*      void
*
* =====================================================================================
*/
void cluEnableKernelProfiling (clu_t clu, cl_kernel kernel);

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluGetKernelExecTime
 *  Description: If the input clu_profiling_flag_t flag is CLU_PROFILING_ELAPSED_TIME
 *               then this function returns the total time elapsed between the time
 *               when the first executing instance of the kernel starts to run and
 *               the time when the last executing instance of the kernel has
 *               finished running.
 *
 *               If the input clu_profiling_flag_t flag is CLU_PROFILING_ACCUM_TIME
 *               then this function returns the accumulated execution time over all
 *               executing instances of the kernel, where the execution time of each
 *               instance is the difference between the time when that instance
 *               starts and the time when it ends.  Note that executing instances
 *               may run in parallel, so the accumulated time might not be similar
 *               to the total elapsed time.
 *
 *               All outstanding executing instances of the kernel must be complete
 *               for this function to work properly.
 *
 *               There must be a command queue associated with the kernel either
 *               through the cluCreateKernel API or through the cluSetKernelCmdQueue
 *               API.
 *
 *               Function cluEnableKernelProfiling must have been invoked
 *               previously.
 *
 *  Parameters:
 *  	clu: specifies a valid clu_t object
 *  	kernel: specifies a valid cl_kernel object. This kernel must be one that was
 *  	        executed via the cluRunKernel call.
 *  	flag:   CLU_PROFILING_ELAPSED_TIME
 *  	        CLU_PROFILING_ACCUM_TIME
 *
 *  Return:
 *      number of seconds the kernel spent executing, as described above
 * =====================================================================================
 */
float cluGetKernelExecTime (clu_t clu, cl_kernel kernel, clu_profiling_flag_t flag);

/* 
* ===  FUNCTION  ======================================================================
*         Name:  cluDisableKernelProfiling
*  Description:  Disables profiling for the specified kernel. Once this function has
*                been invoked, further calls to cluGetKernelExecTime will return 0.
*
*  Parameters:
*
*  	clu: specifies a valid clu_t object
*  	kernel: specifies a valid cl_kernel object. 
*
*  Return:
*      void
* =====================================================================================
*/
void cluDisableKernelProfiling (clu_t clu, cl_kernel kernel);

/* 
* ===  FUNCTION  ======================================================================
*         Name:  cluGetAvailableLocalMem
*  Description:  returns the amount (in bytes) of local memory available on the input
*                device to run the input kernel
*
*  Params: 
* 	device_id: specifies the cl_device_id
* 	kernel: specifies the cl_kernel
*
*  Returns:
*      The amount (in bytes) of available local memory
* =====================================================================================
*/
cl_ulong cluGetAvailableLocalMem (cl_device_id device_id, cl_kernel kernel);


/* 
* ===  FUNCTION  ======================================================================
*         Name:  cluCheckLocalWorkgroupSize
*  Description:  checks whether the user's required number of work-item dimensions
*                and local_work_sizes can be supported by the given device and kernel
*
*  Parameters:
*  	device_id: specifies the cl_device_id
*  	kernel: specifies the cl_kernel	
*  	work_dim: specifies the number of dimensions in the work group.
*  	local_work_sizes: points to an array of work_dim unsigned values that
*  	     describe the number of work-items that make up a work-group.
*
*  Returns:
*  	CL_TRUE if the work-item dimensions work_dim and the local_work_sizes are
*  	supported, CL_FALSE if not
* =====================================================================================
*/
cl_bool cluCheckLocalWorkgroupSize (cl_device_id device_id,
                                    cl_kernel kernel, cl_uint work_dim,
                                    const size_t * local_work_sizes);



/* 
* ===  FUNCTION  ======================================================================
*         Name:  cluGetErrorString
*  Description:  returns a constant character string representing the CL error.
*                Used by the CLU_CHECK_ERROR macro to make error codes readable.
*
*  Parameters:
*      errcode: the OpenCL error code to describe
* =====================================================================================
*/
const char *cluGetErrorString (cl_int errcode);


/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cluCheckDeviceExtensions
 *  Description:  Checks availability of an OpenCL device extension or set of extensions.
 *
 *  Parameters:
 *     dev_id:    specifies the device to be checked for extension support
 *     ext_names: a space separated list of OpenCL extension names
 *
 *  Returns:
 *     NOTE(review): not documented originally; presumably CL_TRUE when every
 *     listed extension is supported by the device and CL_FALSE otherwise -
 *     confirm against the implementation.
 *
 * =====================================================================================
 */
cl_bool cluCheckDeviceExtensions (cl_device_id dev_id, const char *ext_names);



#ifdef __cplusplus
}
#endif

#endif
