/*************************************************************************/
/*                                                                       */
/* Licensed Materials - Property of IBM                                  */
/*                                                                       */
/*                                                                       */
/*                                                                       */
/* (C) Copyright IBM Corp. 2009                                          */
/* All Rights Reserved                                                   */
/*                                                                       */
/* US Government Users Restricted Rights - Use, duplication or           */
/* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.     */
/*                                                                       */
/*************************************************************************/

/*************************************************************************/
/*                                                                       */
/* This sample is an OpenCL Perlin Noise generator, which is based on    */
/* Ken Perlin's Improved Noise implementation found at                   */
/* http://mrl.nyu.edu/~perlin/noise/. See readme.perlin_noise.txt for    */
/* more information.                                                     */
/*                                                                       */
/*************************************************************************/

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <getopt.h>
#include <libgen.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#include <CL/opencl.h>

#ifdef GL_VIEWER
#include "viewer.h"
#endif /* GL_VIEWER */

#ifdef REMOTE_VIEWER
#include "viewer.h"
#endif /* REMOTE_VIEWER */

#include "clock.h"
#include "perlin_host.h"
#include "clu.h"


/*-----------------------------------------------------------------------------
 *  GLOBAL VARIABLES and DEFINES
 *
 *  File-scope configuration, OpenCL handles and pipeline state shared by the
 *  command line parser, the frame generators and main().
 *-----------------------------------------------------------------------------*/
#define DEFAULT_ITERATIONS	1000
#define DEFAULT_IMG_WIDTH	1024
#define DEFAULT_IMG_HEIGHT	1024
#define VECTOR_WIDTH		4

/* User control parameters. These hold the defaults until overridden by
 * process_command_line().
 */
static int img_width  = DEFAULT_IMG_WIDTH;
static int img_height = DEFAULT_IMG_HEIGHT;
/* Size of one image buffer: width * height * 4. The value is used as the byte
 * size for clCreateBuffer/clEnqueueMapBuffer, so 4 is the bytes per pixel. */
static int img_size   = DEFAULT_IMG_WIDTH * DEFAULT_IMG_HEIGHT * 4;
static int iterations = DEFAULT_ITERATIONS;
static int verify     = 0;
static int verbose    = 0;
/* 0 means "not specified": main() then passes a NULL local size to the NDRange. */
static size_t local_work_group_size = 0;
static cl_device_type dev_type = CL_DEVICE_TYPE_DEFAULT;

/* Allocated resources 
 */
static clu_t clu;                       /* clu_t object */
static cl_context context;		/* OpenCL context */
static cl_command_queue commands;	/* OpenCL command queue */
static cl_kernel kernel;		/* OpenCL perlin_noise kernel */
static cl_mem cm_output_buffers[2];	/* OpenCL output memory buffers (double buffered) */
static unsigned int *mapped_addr[2];	/* mapped addresses returned from clEnqueueMapBuffer; NULL when not mapped */
static cl_event compute_event;		/* OpenCL event of the most recently enqueued kernel run */


/* State variables 
 */
static int current_frame = 0;                   /* index of the buffer that is mapped next */
static int first_frame   = 1;                   /* non-zero until the first kernel run has been enqueued */
static int curr_iter     = 0;                   /* current noise iteration */

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  print_usage
 *  Description:  Print the command line usage summary to stdout.
 *
 *                name is the program name substituted into the usage and example
 *                lines. The accelerator example and the -i/--iter option are only
 *                listed for builds without a viewer (neither GL_VIEWER nor
 *                REMOTE_VIEWER defined).
 * =====================================================================================
 */
void print_usage (char *name)
{
  printf("Usage: %s [options...]\n", name);
  printf("\n");
  printf("Examples:\n");
#if !defined(GL_VIEWER) && !defined(REMOTE_VIEWER)
  printf("  %s --accel -i 10      # Run 10 iterations on accelerator device.\n", name);
#endif
  printf("  %s --cpu --size 800   # Run with image size of 800x800 on a CPU device.\n", name);
  printf("\n");
  printf("Device Type Options:\n");
  printf("  -a, --accel           Use Accelerator device for compute.\n");
  printf("  -c, --cpu             Use CPU device for compute.\n");
  printf("  -g, --gpu             Use GPU device for compute.\n");
  printf("\n");
  printf("  If a device type is unspecified, then the platform's CL_DEVICE_TYPE_DEFAULT is used.\n");
  printf("\n");
  printf("Run Options:\n");
  /* -h/--help is accepted by the option parser, so list it with the others. */
  printf(" -h, --help            Print this usage message and exit.\n");
#if !defined(GL_VIEWER) && !defined(REMOTE_VIEWER)
  printf(" -i, --iter N          Number of iterations (frames). Default is %d\n", DEFAULT_ITERATIONS);
#endif
  printf(" -l, --lwgsize N       Local work group size. Must be an integer factor of the image size divided by %d.\n", VECTOR_WIDTH);
  printf("                       0 or unspecified then OpenCL will choose the best size.\n");
  printf(" -s, --size N          Image size. Must be a positive multiple of 16. Default is %d\n", DEFAULT_IMG_WIDTH);
  printf(" -v, --verify          Verify OpenCL device output compares equal to host computed output. Default is non-verify.\n");
  printf(" -V, --verbose         Emit verbose informational output. Default is non-verbose.\n");

}



/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  parse_int_option
 *  Description:  Convert a complete option argument to an int.
 *
 *                Returns 0 and stores the result in *value when text is a base-10
 *                integer with no trailing characters that fits in an int.
 *                Returns -1 (leaving *value untouched) otherwise.
 * =====================================================================================
 */
static int parse_int_option(const char *text, int *value)
{
  char *end;
  long parsed;

  errno = 0;
  parsed = strtol(text, &end, 10);
  if ((end == text) || (*end != '\0') || (errno == ERANGE) ||
      (parsed < INT_MIN) || (parsed > INT_MAX)) {
    return -1;
  }
  *value = (int)parsed;
  return 0;
}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  process_command_line
 *  Description:  Parse the command line options and store the option values in the
 *                file-scope control parameters (dev_type, img_width, img_height,
 *                img_size, iterations, local_work_group_size, verify, verbose).
 *
 *                argc/argv are the arguments received by main(); name is the
 *                program name handed to print_usage().
 *
 *                The function does not return on bad input: an unrecognized option
 *                prints the usage and exits with EXIT_FAILURE, and a malformed or
 *                out-of-range numeric argument prints an error to stderr and exits
 *                with EXIT_FAILURE. -h/--help prints the usage and exits with
 *                EXIT_SUCCESS.
 * =====================================================================================
 */
void process_command_line(int argc, char **argv, char *name)
{
  int opt;
  int value;
  static const struct option long_options[] = {
    {"help",     no_argument,       NULL, 'h'},
    {"gpu",      no_argument,       NULL, 'g'},
    {"cpu",      no_argument,       NULL, 'c'},
    {"accel",    no_argument,       NULL, 'a'},
    {"size",     required_argument, NULL, 's'},
    {"iter",     required_argument, NULL, 'i'},
    {"lwgsize",  required_argument, NULL, 'l'},
    {"verify",   no_argument,       NULL, 'v'},
    {"verbose",  no_argument,       NULL, 'V'},
    {NULL, 0, NULL, 0}
  };
  /* 'o' has no handler below; it stays in the option string so that existing
   * invocations passing -o keep being accepted (as a no-op). */
  static const char optstring[] = "hgcas:i:l:voV";

  while ((opt = getopt_long(argc, argv, optstring, long_options, NULL)) != -1) {
    switch (opt) {
    case 'h':
      /* Help was explicitly requested, so this is not an error exit. */
      print_usage(name);
      exit(EXIT_SUCCESS);
    case '?':
      print_usage(name);
      exit(EXIT_FAILURE);
    case 'g':
      dev_type = CL_DEVICE_TYPE_GPU;
      break;
    case 'c':
      dev_type = CL_DEVICE_TYPE_CPU;
      break;
    case 'a':
      dev_type = CL_DEVICE_TYPE_ACCELERATOR;
      break;
    case 's':
      if (parse_int_option(optarg, &value) != 0) {
        fprintf(stderr, "ERROR: Image size '%s' is not a valid integer.\n", optarg);
        exit(EXIT_FAILURE);
      }
      if ((value % 16) || (value < 1)) {
        fprintf(stderr, "ERROR: Image size %d is not a positive multiple of 16.\n", value);
        exit(EXIT_FAILURE);
      }
      /* img_size is an int holding width * height * 4; reject sizes whose
       * buffer size would overflow it. */
      if ((long long)value * value > (long long)(INT_MAX / 4)) {
        fprintf(stderr, "ERROR: Image size %d is too large.\n", value);
        exit(EXIT_FAILURE);
      }
      img_width = img_height = value;
      img_size = img_width * img_height * 4;
      break;
    case 'i':
      if ((parse_int_option(optarg, &value) != 0) || (value < 1)) {
        fprintf(stderr, "ERROR: Iteration count '%s' must be a positive integer.\n", optarg);
        exit(EXIT_FAILURE);
      }
      iterations = value;
      break;
    case 'l':
      /* A negative value would wrap to a huge size_t, so reject it here. */
      if ((parse_int_option(optarg, &value) != 0) || (value < 0)) {
        fprintf(stderr, "ERROR: Local work group size '%s' must be a non-negative integer.\n", optarg);
        exit(EXIT_FAILURE);
      }
      local_work_group_size = (size_t)value;
      break;
    case 'v':
      verify = 1;
      break;
    case 'V':
      verbose = 1;
      break;
    default:
      /* Accepted but unhandled options (currently only -o) are ignored. */
      break;
    }
  }
}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  next_frame
 *  Description:  Invokes the perlin noise compute kernel and returns a pointer to the
 *                generated next frame buffer.
 *
 *                size is an optional output parameter. When it is non-NULL, *size is
 *                always set to NULL by this function.
 *                NOTE(review): the earlier description said a size of 0 implies
 *                img_width * img_height * img_pixel_size; confirm the convention
 *                against the viewer that consumes this value.
 *
 *                The returned pointer is the host mapping of the current output
 *                buffer. It stays mapped until a later call to next_frame unmaps it.
 *
 *                This function utilizes a double buffering rendering pipeline in which
 *                the subsequent frame is precomputed while the returned frame is processed.
 * =====================================================================================
 */
void *next_frame(int **size)
{
  cl_int err;
  int    next;				/* next frame index */
  void * frame_buffer;              	/* image frame buffer to display */
  cl_event map_event;
  cl_event unmap_event = NULL;
  cl_float time;

  /* ----------------------------------------------------------------------------------
   * The rendering pipeline when double buffering. This ordering of requests is
   * designed to support both in-order and out-of-order queues. 
   *
   *          Compute buffer 0
   *                 |
   *      .--------->|
   *      |          v
   *      |     Map buffer 0
   *      |          |
   *      |          v
   *      |    Unmap buffer 1 (if mapped)
   *      |          |
   *      |          v
   *      |   Compute buffer 1
   *      |          |------------------> return buffer 0
   *      |          v
   *      |     Map buffer 1
   *      |          |
   *      |          v
   *      |    Unmap buffer 0
   *      |          |
   *      |          v
   *      |   Compute buffer 0
   *      |          |------------------> return buffer 1
   *      `----------'
   *
   * ----------------------------------------------------------------------------------
   */
  /* If this is the first frame, prime the pipeline by computing the first buffer.
   * The kernel's time argument advances by 0.05 per generated frame.
   */
  if (first_frame) {
    first_frame = 0;
    time = 0.05f * curr_iter++;
    cluRunKernel (clu, kernel, &compute_event, 3,  
		  sizeof(cl_mem), &cm_output_buffers[current_frame],
		  sizeof(cl_float), &time, 
		  sizeof(unsigned int), &img_width);
  }

  /* Enqueue a non-blocking read mapping of the current buffer. The map waits on
   * compute_event, i.e. on the kernel run that produced this buffer. Once the map
   * is enqueued our reference to compute_event is no longer needed.
   */
  next = 1 - current_frame;
  mapped_addr[current_frame] = clEnqueueMapBuffer (commands, cm_output_buffers[current_frame], 
						   CL_FALSE, CL_MAP_READ, 0, img_size,
						   1, &compute_event, &map_event, &err);
  CLU_CHECK_ERROR ("clEnqueueMapBuffer", err);
  CLU_CHECK_ERROR ("clReleaseEvent", clReleaseEvent(compute_event));

  /* Unmap the next frame if it was previously mapped.
   * NOTE(review): cluSetKernelDependency presumably makes the following kernel run
   * wait for the unmap to complete - confirm against clu.h.
   */
  if (mapped_addr[next]) {
    CLU_CHECK_ERROR ("clEnqueueUnmapMemObject", 
		     clEnqueueUnmapMemObject (commands, cm_output_buffers[next], mapped_addr[next], 0, NULL, &unmap_event));
    mapped_addr[next] = NULL;
    cluSetKernelDependency(clu, kernel, 1, &unmap_event);
  }

  /* Wait for the map to complete */
  CLU_CHECK_ERROR ("clWaitForEvents", clWaitForEvents(1, &map_event));
  CLU_CHECK_ERROR ("clReleaseEvent", clReleaseEvent(map_event));


  /* Generate the next frame 
   * into the other buffer; compute_event now refers to this new kernel run.
   */
  time = 0.05f * curr_iter++;
  cluRunKernel (clu, kernel, &compute_event, 3,  
		sizeof(cl_mem), &cm_output_buffers[next],
		sizeof(cl_float), &time, 
		sizeof(unsigned int), &img_width);

  /* unmap_event is only non-NULL when an unmap was enqueued above. */
  if (unmap_event) {
    CLU_CHECK_ERROR ("clReleaseEvent", clReleaseEvent(unmap_event));
  }

  frame_buffer = (void *)mapped_addr[current_frame];
  
  /*------------------------------------------------------------------------------------
   * Return the resulting frame buffer pointer and size information
   *------------------------------------------------------------------------------------
   */
  current_frame = next;
  if (size) *size = NULL;
  return (frame_buffer);
}

#if !defined(GL_VIEWER) && !defined(REMOTE_VIEWER)
/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  last_frame
 *  Description:  Fetches the last frame in the double buffered rendering pipeline.
 * 
 *                NOTE: This function has not been enabled to support the encoding
 *                      of images since it is not used by the remote viewer.
 * =====================================================================================
 */
static void *last_frame(int *size)
{
  cl_int err;
  int    frame_size = 0;		/* default frame size */
  char * frame_buffer;              	/* image frame buffer to display */
  
  /* Compute the frame if no frame is already in the pipeline
   */
  if (first_frame) {
    float  time = 0.0f;

    first_frame = 0;
    cluRunKernel (clu, kernel, &compute_event, 3,  
		  sizeof(cl_mem), &cm_output_buffers[current_frame],
		  sizeof(cl_float), &time, 
		  sizeof(unsigned int), &img_width);
  }
  /* Map the computed frame into the host address space 
   */
  frame_buffer = clEnqueueMapBuffer (commands, cm_output_buffers[current_frame], 
				     CL_TRUE, CL_MAP_READ, 0, img_size,
				     1, &compute_event, 
				     NULL, &err);
  CLU_CHECK_ERROR ("clEnqueueMapBuffer", err);
  CLU_CHECK_ERROR ("clReleaseEvent", clReleaseEvent(compute_event));
  CLU_CHECK_ERROR ("clFinish", clFinish(commands));

  compute_event = NULL;

  if (size) *size = frame_size;
  return (frame_buffer);
}  

#else /* GL_VIEWER || REMOTE_VIEWER */

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  mouse_action
 *  Description:  Mouse button callback. Externally referenced by the viewer.
 *                This sample has no mouse interaction, so every argument is
 *                deliberately discarded.
 * =====================================================================================
 */
void mouse_action(int button, int state, int x, int y)
{
  (void)button;
  (void)state;
  (void)x;
  (void)y;
}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  mouse_motion
 *  Description:  Mouse motion callback. Externally referenced by the viewer.
 *                Pointer movement has no effect in this sample, so the
 *                coordinates are deliberately discarded.
 * =====================================================================================
 */
void mouse_motion(int x, int y)
{
  (void)x;
  (void)y;
}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  keyboard_action
 *  Description:  Keyboard callback. Externally referenced by the viewer.
 *                No key is bound to any action in this sample, so the key and
 *                the pointer coordinates are deliberately discarded.
 * =====================================================================================
 */
void keyboard_action (unsigned char key, int x, int y)
{
  (void)key;
  (void)x;
  (void)y;
}
#endif /* !defined(GL_VIEWER) && !defined(REMOTE_VIEWER) */
 



/****************************************************************************************
 * main
 * 
 * Compute perlin noise using OpenCL.
 *
 * Parses the command line, creates the OpenCL resources (device, command queue,
 * kernel and the two output buffers), configures the kernel NDRange and then either
 * hands control to the viewer (GL_VIEWER / REMOTE_VIEWER builds) or computes
 * "iterations" frames, reports the achieved pixel and frame rates, and passes the
 * final frame to compute_host_and_verify().
 *
 * Returns the value of compute_host_and_verify() for non-viewer builds and 0 for
 * viewer builds. Invalid local work group sizes terminate the program with
 * EXIT_FAILURE.
 ****************************************************************************************
 */
int main(int argc, char **argv)
{
  int retval = 0;
  size_t  global_size[2];
  size_t  local_size_storage[2];	/* backing store for a user specified local size */
  size_t* local_size = NULL;		/* NULL lets the implementation pick the size */

  char *name;
  char *filename = "perlin_kernel.cl";
  char *kernel_name = "compute_perlin_noise";

  cl_int err;
  cl_device_id device_id;
  cl_command_queue_properties device_q_prop;

#if !defined(GL_VIEWER) && !defined(REMOTE_VIEWER)
  int i;
  float  delta;
  double device_mpix, frame_rate;
  void * output_device;
#endif

  /* Change current working directory to that of the invocation path so that 
   * perlin noise can be run from any current working directory. 
   *
   * NOTE(review): POSIX permits basename() and dirname() to modify their argument.
   * name is taken first and is expected to keep pointing at the final path
   * component after dirname() runs - confirm on non-glibc platforms.
   */

  name = basename(argv[0]);
  (void)chdir(dirname(argv[0]));

  /* Process command line arguments */ 
  process_command_line(argc, argv, name);

  /*--------------------------------------------------------------------
   * Initialize OpenCL and create OpenCL resources.
   *--------------------------------------------------------------------
   */
  clu = cluInit (NULL);
  context = cluGetCLContext (clu);
  if ((device_id = cluGetDeviceID (clu, dev_type, NULL, NULL)) == NULL) {
    CLU_EXIT_ERROR("Unable to locate a device of type %s.\n", cluGetCLDeviceTypeString(dev_type));
  }
  if (verbose) printf("Compute Device = %s\n", cluGetDeviceName(clu, device_id));

  /* Check to see if the device supports an out-of-order queue */
  CLU_CHECK_ERROR("clGetDeviceInfo(CL_DEVICE_QUEUE_PROPERTIES)", 
		  clGetDeviceInfo (device_id, CL_DEVICE_QUEUE_PROPERTIES,
				   sizeof (cl_command_queue_properties), 
				   &device_q_prop, NULL));

  /* Use the CLU API to create a command queue, out-of-order when available */
  if (device_q_prop & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) {
    commands = cluCreateCmdQueue (clu, device_id, dev_type, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
    if (verbose) printf ("%s is using an out-of-order queue\n", name);
  } else {
    commands = cluCreateCmdQueue (clu, device_id, dev_type, 0);
    if (verbose) printf ("%s is using an in-order queue\n", name);
  }

  /* Compile the kernel program from filename and create a kernel from kernel_name. The
   * kernel is specified to run on the given command queue 
   */
  if (verbose) printf ("Compiling and Creating compute kernel...\n");
  kernel = cluCreateKernel (clu, commands, filename, kernel_name, NULL, CLU_SOURCE);
  if (verbose) printf("Compute kernel created\n");

  /* The first NDRange dimension is the image width divided by VECTOR_WIDTH. */
  global_size[0] = img_width / VECTOR_WIDTH;
  global_size[1] = img_height;

  /*  If user specified local_work_group_size to be zero then let the implementation
   *  decide the best local work-group size 
   */
  if (local_work_group_size == 0) {
    local_size = NULL;
    if (verbose) printf("Local Work Group Size = NULL\n");
  } else {
    /* Ensure the local work group size does not exceed the global work group size 
     */
    if (local_work_group_size > global_size[0]) {
      local_work_group_size = global_size[0];
      if (verbose) printf("The local work group size exceeds the global work group size. Coercing the local size to the global size.\n");
    }

    /* The local size lives in main's frame, which outlives every kernel launch,
     * so no heap allocation (and no allocation failure path) is needed. */
    local_size = local_size_storage;
    local_size[0] = local_work_group_size;
    local_size[1] = 1;

    if (verbose) printf("Local Work Group Size is (%d, %d)\n", (int)local_size[0], (int)local_size[1]);

    if (global_size[0] % local_work_group_size) {
      fprintf (stderr, "ERROR: Local workgroup size of (%d,%d) must be an integer factor of the global size (%d %d)\n", 
	       (int)local_size[0], (int)local_size[1], (int)global_size[0], (int)global_size[1]);
      exit (EXIT_FAILURE);
    }

    /* Check to make sure the local work group size requested is 
     * supported by the device and kernel.
     */
    if (cluCheckLocalWorkgroupSize (device_id, kernel , 2, local_size) == CL_FALSE) {
      fprintf (stderr, "ERROR: Local workgroup size of (%d,%d) can not be supported by the device.\n", (int)local_size[0], (int)local_size[1]);
      exit (EXIT_FAILURE);
    }
  }
  if (verbose) printf("Global Work Group Size is (%d, %d)\n", (int)global_size[0], (int)global_size[1]);


  /*-----------------------------------------------------------------------------
   *  Allocating buffers
   *  Create the output arrays with OpenCL allocating the memory.
   *  We create two buffers for double buffering purpose
   *-----------------------------------------------------------------------------
   */
  cm_output_buffers[0] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, img_size, NULL, &err);
  CLU_CHECK_ERROR ("clCreateBuffer", err);

  cm_output_buffers[1] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, img_size, NULL, &err);
  CLU_CHECK_ERROR ("clCreateBuffer", err);

  /*-----------------------------------------------------------------------------
   * Execution 
   *-----------------------------------------------------------------------------
   */
  /*  Set the NDRange for the kernel */
  cluSetKernelNDRange (clu, kernel, 2, NULL, global_size, local_size);

#if defined(GL_VIEWER) || defined(REMOTE_VIEWER)
  /* Running with a viewer, either the GL viewer or remote viewer */
  initViewer(name, img_width, img_height, 4, GL_RGBA, GL_UNSIGNED_BYTE, 0);
  viewerMainLoop();
#else 	/*  no viewer */
  /* Running without a viewer, compute "iteration" frames: all but one through the
   * double buffered pipeline, the final one by draining it with last_frame(). */
  startclock();
  for (i = 0; i < iterations-1; i++) {
    (void)next_frame(NULL);
  }
  output_device = last_frame(NULL);
  delta = stopclock();

  /* calculate the mega pixels per second calculated */
  frame_rate = (double)(iterations)/(double)(delta);
  device_mpix = (double)(img_width * img_height) * frame_rate / 1000000.0;

  printf("OpenCL took %f seconds to compute %d frames. Pixel Rate = %f Mpixels/sec, Frame Rate = %f frames/sec\n",
         delta, iterations, device_mpix, frame_rate);


  /* Compute data on host and verify device results 
   */
  retval = compute_host_and_verify(iterations, (uint *)output_device, img_width,
				   img_height, img_width, verify, device_mpix, verbose);
  CLU_CHECK_ERROR("clEnqueueUnmapMemObject", 
                  clEnqueueUnmapMemObject (commands, cm_output_buffers[current_frame], output_device, 0, NULL, NULL));

  /* The final next_frame() call leaves the buffer it returned mapped (recorded in
   * mapped_addr). Unmap it as well so no mapping is outstanding when the buffers
   * are released. last_frame() does not record its mapping in mapped_addr, so the
   * frame unmapped above is not unmapped twice. */
  for (i = 0; i < 2; i++) {
    if (mapped_addr[i]) {
      CLU_CHECK_ERROR("clEnqueueUnmapMemObject", 
                      clEnqueueUnmapMemObject (commands, cm_output_buffers[i], mapped_addr[i], 0, NULL, NULL));
      mapped_addr[i] = NULL;
    }
  }
  CLU_CHECK_ERROR("clFinish" , clFinish (commands));

  clReleaseMemObject (cm_output_buffers[0]);
  clReleaseMemObject (cm_output_buffers[1]);
  cluDestroy (clu);

#endif /* GL_VIEWER || REMOTE_VIEWER */
  return retval;
}
