/*************************************************************************/
/*                                                                       */
/* Licensed Materials - Property of IBM                                  */
/*                                                                       */
/* (C) Copyright IBM Corp. 2009, 2010                                    */
/* All Rights Reserved                                                   */
/*                                                                       */
/* US Government Users Restricted Rights - Use, duplication or           */
/* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.     */
/*                                                                       */
/* Inspired by Caltech's Java Applet Fluid Solver at                     */
/* www.multires.caltech.edu/teaching/demos/java/FluidSolver.java         */
/*                                                                       */
/* References:  Visual Simulation of Smoke                               */
/*              R. Fedkiw, J. Stam, H. W. Jensen                         */
/*              SIGGRAPH 2001 Annual Proceedings                         */
/*                                                                       */
/*************************************************************************/

#include <errno.h>
#include <fenv.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <getopt.h>
#include <libgen.h>
#include <unistd.h>

#include <CL/opencl.h>

#ifdef GL_VIEWER
#include "viewer.h"
#endif

#ifdef REMOTE_VIEWER
#include "viewer.h"
#endif

#include "clu.h"
#include "clock.h"
#include "solver.h"

/*-----------------------------------------------------------------------------
 *  GLOBAL VARIABLES and DEFINITIONS
 *-----------------------------------------------------------------------------*/


/* Initial value of num_frames in main(); also the default printed by the
 * usage text for -n/--numframes.
 */
#define DEFAULT_NUM_FRAMES 100

/* main() rounds the image size up to a multiple of this value: 16 when built
 * for the remote viewer, _VEC_SIZE otherwise.
 */
#ifdef REMOTE_VIEWER
#define IMG_SIZE_CONSTRAINT	16
#else
#define IMG_SIZE_CONSTRAINT	_VEC_SIZE
#endif 

/* Set to 1 by the -m/--awgc option. Not read anywhere in this file. */
int use_async_wg_copy = 0;

/* OpenCL source file handed to cluCreateKernel() for every kernel. The
 * -m/--awgc option replaces it with "fluid_solver_lm_vec4.cl".
 */
const char *filename = "fluid_solver_vec4.cl";

/* Compute command queue, created in main() with cluCreateCmdQueue() */
cl_command_queue commands;

/* All OpenCL compute kernels, created in main() with cluCreateKernel() */
cl_kernel set_boundary_kernel;
cl_kernel advect_kernel;
cl_kernel add_source_kernel;
cl_kernel add_source_dual_kernel;
cl_kernel linear_solver_kernel;
cl_kernel project_part1_kernel;
cl_kernel project_part2_kernel;
cl_kernel pack_image_kernel;

/* Local work-group size for the add_source and add_source_dual kernels.
 * Filled in by set_local_wg_size(); element [1] is always 1.
 */
size_t local_wg_size_add_source[2];

/* Local work-group size for all other kernels. Filled in by
 * set_local_wg_size(); element [1] is always 1.
 */
size_t local_wg_size[2];

/* Set to 1 by the -p/--profiling option. When non-zero, main() creates the
 * command queue with CL_QUEUE_PROFILING_ENABLE.
 */
int profiling = 0;

/* CLU handle returned by cluInit() in main() and released with cluDestroy() */
clu_t clu;

/* Solver state passed to setup(), run_sim() and cleanup_buffers() in main() */
struct fluid_solver fs;

/* Simulation window size. Set by the -s/--size option and then rounded up in
 * main() to a multiple of IMG_SIZE_CONSTRAINT.
 */
int img_size = DEFAULT_IMG_SIZE;

/* Set to 1 by the -V/--verbose option; enables the progress messages and the
 * per-error output of verify_results().
 */
int verbose = 0;


/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  verify_results
 *  Description:  Compare the image produced by the OpenCL device against the image
 *                produced by the plain C host computation, one byte at a time.
 *
 *                imageHost   result of the normal C host calculation
 *                imageAccel  result of the OpenCL calculation
 *                n           image edge length; n*n*4 bytes of each image are compared
 *
 *                A byte is an error when the two images differ by more than one
 *                shade. Every error is counted regardless of the verbose setting;
 *                when verbose is set the first 10 errors are also printed. A summary
 *                line is always printed.
 *
 *                Returns 1 if at least one error was found, 0 otherwise.
 * =====================================================================================
 */
int
verify_results (unsigned int *imageHost, unsigned int *imageAccel, int n)
{
  int error_count = 0;
  int i;
  int num_values = n * n * 4;
  unsigned char *hptr, *aptr;
  float one_shade = 1.0f;

  /* The images are compared byte by byte rather than word by word */
  hptr = (unsigned char *) imageHost;
  aptr = (unsigned char *) imageAccel;

  for (i = 0; i < num_values; i++) {
    float hc = (float) hptr[i];
    float ac = (float) aptr[i];
    float diff = fabsf (hc - ac);

    /* If there's a difference of more than one shade, then declare an error */
    if (diff > one_shade) {
      /* Only the reporting is optional. The count itself must not depend on
       * verbose, otherwise a non-verbose run would always claim success.
       */
      if (verbose && error_count < 10)
        printf ("Verification error at pixel %d. host=%f device=%f\n", i, hc, ac);
      error_count++;
    }
  }
  printf ("Verification results: %d errors out of %d values\n", error_count, num_values);

  return error_count != 0;
}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  print_usage
 *  Description:  Write the command line help text to stdout.
 *
 *                name  program name substituted into the "Usage:" line
 *
 *                The -n, -p and -v options are only listed in builds without a
 *                viewer (neither GL_VIEWER nor REMOTE_VIEWER defined).
 * =====================================================================================
 */
void
print_usage (char * name)
{
  /* printf format: %s is the program name, the first %d the default image size
   * and, in non-viewer builds only, the second %d the default frame count.
   */
  static const char usage_fmt[] =
    "Usage: %s [options...]\n"
    "\n"
    "Examples:\n"
    "   fluid --accel -V	 # Run simulation on accelerator device with verbose output\n"
    "\n"
    " Device Types Options:\n"
    "   -a, --accel              Use SPU Accelerator device for compute\n"
    "   -c, --cpu                Use CPU device for compute\n"
    "   -g, --gpu                Use GPU device for compute\n"
    "\n"
    "  If a device type is unspecified, then the platform's CL_DEVICE_TYPE_DEFAULT is used.\n"
    "\n"
    " Run Options:\n"
    "   -m, --awgc               Run the version of the kernel that uses async work-group copy (default: no)\n"
    "   -s, --size N             Specifies the window size of the simulation (default: %d)\n"
#if !defined(GL_VIEWER) && !defined(REMOTE_VIEWER)
    "   -n, --numframes N        Number of frames to simulate (default: %d)\n"
    "   -p, --profiling          Enable CLU kernel profiling (default: off)\n"
    "   -v, --verify             Verify the results with computations on the host (default: off). Forces the\n"
    "                            number of frames to 2.\n"
#endif
    "   -V, --verbose            Enable verbose output.\n"
    "\n";

#if defined(GL_VIEWER) || defined(REMOTE_VIEWER)
  /* Viewer builds: the format has no frame-count conversion */
  printf (usage_fmt, name, DEFAULT_IMG_SIZE);
#else
  printf (usage_fmt, name, DEFAULT_IMG_SIZE, DEFAULT_NUM_FRAMES);
#endif
}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  set_local_wg_size
 *  Description:  Choose the local work-group sizes used when launching the kernels
 *                and store them in local_wg_size and local_wg_size_add_source.
 *
 *                device_id  device the kernels will run on
 *
 *                The starting point is the smallest CL_KERNEL_WORK_GROUP_SIZE that
 *                the device reports across all eight kernels. When verbose is set
 *                the chosen sizes are printed.
 * =====================================================================================
 */
void
set_local_wg_size (cl_device_id device_id)
{
  /* Every kernel whose limit is queried, in query order */
  const cl_kernel all_kernels[] = {
    set_boundary_kernel,
    advect_kernel,
    linear_solver_kernel,
    project_part1_kernel,
    project_part2_kernel,
    pack_image_kernel,
    add_source_kernel,
    add_source_dual_kernel
  };
  const size_t num_kernels = sizeof (all_kernels) / sizeof (all_kernels[0]);
  size_t smallest_limit = 0;
  size_t kernel_limit;
  size_t padded_elems;
  size_t candidate;
  size_t k;
  cl_int rc;

  /* Find the smallest per-kernel work-group limit reported by the device */
  for (k = 0; k < num_kernels; k++) {
    rc = clGetKernelWorkGroupInfo (all_kernels[k], device_id,
                                   CL_KERNEL_WORK_GROUP_SIZE, sizeof (size_t),
                                   &kernel_limit, NULL);
    CLU_CHECK_ERROR ("clGetKernelWorkGroupInfo CL_KERNEL_WORK_GROUP_SIZE", rc);

    if (k == 0 || kernel_limit < smallest_limit)
      smallest_limit = kernel_limit;
  }

  /* The kernels compute on vector types, so there are only img_size/_VEC_SIZE
   * elements to work on; cap the work-group size at that count. This size is
   * used for every kernel except add_source and add_source_dual.
   */
  if (smallest_limit > ((img_size) / (_VEC_SIZE)))
    smallest_limit = ((img_size) / (_VEC_SIZE));
  local_wg_size[0] = smallest_limit;
  local_wg_size[1] = 1;

  /* For add_source and add_source_dual the local work-group size must divide
   * the element count computed here from the padded image,
   *          ((img_size + (2 * _PAD)) * (img_size + (2 * _PAD))) / _VEC_SIZE
   * Walk downward from the capped size until an exact divisor is found; that is
   * the largest divisor not exceeding the capped size.
   */
  padded_elems = ((img_size + (2 * _PAD)) * (img_size + (2 * _PAD))) / _VEC_SIZE;
  for (candidate = smallest_limit; (padded_elems % candidate) != 0; candidate--)
    ;
  local_wg_size_add_source[0] = candidate;
  local_wg_size_add_source[1] = 1;

  if (!verbose)
    return;

  printf ("2D Work Group Size for add_source and add_source_dual kernels is %d, %d\n",
          (int) local_wg_size_add_source[0], (int) local_wg_size_add_source[1]);
  printf ("2D Work Group Size for all other kernels is %d, %d\n",
          (int) local_wg_size[0], (int) local_wg_size[1]);
}



/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  parse_int_option
 *  Description:  Convert the argument of a numeric command line option to an int.
 *
 *                text       option argument to convert (base 10)
 *                min_value  smallest accepted value
 *                max_value  largest accepted value
 *                value      receives the result; left untouched on failure
 *
 *                Unlike atoi(), the conversion fails when text is empty, is not a
 *                number, has trailing characters, or lies outside
 *                [min_value, max_value].
 *
 *                Returns 0 on success, -1 on failure.
 * =====================================================================================
 */
static int
parse_int_option (const char *text, int min_value, int max_value, int *value)
{
  char *end;
  long parsed;

  errno = 0;
  parsed = strtol (text, &end, 10);

  /* No digits consumed, trailing characters, or out of range for long */
  if (end == text || *end != '\0' || errno == ERANGE)
    return -1;

  if (parsed < min_value || parsed > max_value)
    return -1;

  *value = (int) parsed;
  return 0;
}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  main
 *  Description:  Parse the command line, create the OpenCL command queue and kernels
 *                through CLU, choose the work-group sizes and set up the solver.
 *                Viewer builds (GL_VIEWER or REMOTE_VIEWER) then hand control to the
 *                viewer. Other builds run the simulation for the requested number of
 *                frames and, with -v, compare the result with the host computation.
 *
 *                Returns 0, or the value of verify_results() when verification ran.
 *                Invalid options or option arguments terminate with EXIT_FAILURE.
 * =====================================================================================
 */
int
main (int argc, char **argv)
{
  int rc = 0;
  char *name;
  cl_device_id device_id;	/*  compute device id */
  cl_device_type device_type = CL_DEVICE_TYPE_DEFAULT;
  cl_context context;		/*  compute context */
  int num_frames = DEFAULT_NUM_FRAMES;
  int verify = 0;
  int opt, option_index = 0;
#if defined(GL_VIEWER)
  char *optstring = "hacgmVs:";
#elif defined(REMOTE_VIEWER) 
  char *optstring = "hacgmVs:";
#else
  char *optstring = "hacgn:mvpVs:";
  struct fluid_solver_host fsh;
#endif

  static struct option long_options[] = {
    {"help", 0, NULL, 'h'},
    /* Devtype */
    {"accel", 0, NULL, 'a'},
    {"cpu", 0, NULL, 'c'},
    {"gpu", 0, NULL, 'g'},
    /* Control */
    {"awgc", 0, NULL, 'm'},
    {"size", 1, NULL, 's'},
#if !defined(GL_VIEWER) && !defined(REMOTE_VIEWER)
    {"numframes", 1, NULL, 'n'},
    {"verify", 0, NULL, 'v'},
    {"profiling", 0, NULL, 'p'},
#endif
    {"verbose", 0, NULL, 'V'},
    {NULL, 0, NULL, 0}
  };

  /* Change current working directory to that of the invocation path so that fluid can
   * be run from any current working directory. A failing chdir is deliberately ignored.
   */
  name = basename(argv[0]);
  (void)chdir(dirname(argv[0]));

  /*------------------------------------------------------------------------------------
   * Parse command line arguments 
   *------------------------------------------------------------------------------------
   */
  while ((opt = getopt_long (argc, argv, optstring, long_options, &option_index)) != -1) {
    switch (opt) {
    case 'a':
      device_type = CL_DEVICE_TYPE_ACCELERATOR;
      break;
    case 'c':
      device_type = CL_DEVICE_TYPE_CPU;
      break;
    case 'g':
      device_type = CL_DEVICE_TYPE_GPU;
      break;
    case 'n':
      /* Reject non-numeric and negative frame counts instead of silently
       * turning them into 0 or passing a negative count to the solver.
       */
      if (parse_int_option (optarg, 0, INT_MAX, &num_frames) != 0) {
        fprintf(stderr, "Error, Number of frames '%s' is not a valid non-negative value.\n", optarg);
        exit (EXIT_FAILURE);
      }
      break;
    case 'v':
      verify = 1;
      break;
    case 'p':
      profiling = 1;
      break;
    case 'm':
      use_async_wg_copy = 1;
      filename = "fluid_solver_lm_vec4.cl";
      break;
    case 's':
      /* The upper bound leaves room for the round-up to a multiple of
       * IMG_SIZE_CONSTRAINT below, so that addition cannot overflow.
       */
      if (parse_int_option (optarg, 1, INT_MAX - (IMG_SIZE_CONSTRAINT - 1), &img_size) != 0) {
        fprintf(stderr, "Error, Image size '%s' is not a valid positive non-zero value.\n", optarg);
        exit (EXIT_FAILURE);
      }
      break;
    case 'V':
      verbose = 1;
      break;
    /* Unrecognized options and an explicit -h both print the usage and exit */
    default:
    case 'h':
      print_usage (name);
      exit (EXIT_FAILURE);
    }
  }

  /* Make sure the image size is a multiple of _VEC_SIZE or 16 for remote viewer encoding. Round
   * upward if it is not.
   */
  img_size = IMG_SIZE_CONSTRAINT * ((img_size + (IMG_SIZE_CONSTRAINT-1)) / IMG_SIZE_CONSTRAINT);

  /* Force the number of frames to 2 if verification is requested. Otherwise, differences
   * in accuracy will make it difficult to verify with a reasonable tolerance. 
   */
  if (verify) num_frames = 2;

  /*-----------------------------------------------------------------------------
   * Initialize the computing subsystem. Init clu, get devices, create command
   * queue(s), and create kernels.
   *-----------------------------------------------------------------------------
   */

  /* Initialize CLU and get the context from CLU */
  clu = cluInit (NULL);
  context = cluGetCLContext (clu);

  /* Get device id */
  if ((device_id = cluGetDeviceID (clu, device_type, NULL, NULL)) == NULL) {
    CLU_EXIT_ERROR("Unable to locate a device of type %s.\n", cluGetCLDeviceTypeString(device_type));
  }
  if (verbose) {
    printf("Fluid Compute Device Name = %s\n", cluGetDeviceName(clu, device_id));
  }

  /* Create a command queue, enabling profiling if requested */
  commands = cluCreateCmdQueue (clu, device_id, 0, ((profiling) ? CL_QUEUE_PROFILING_ENABLE : 0));

  /* Create all the kernels from the selected OpenCL source file */
  if (verbose) printf ("Creating Fluid compute kernels...\n");
  char *build_options = "-Werror";
  advect_kernel = cluCreateKernel (clu, commands, filename, "advect", build_options, CLU_SOURCE);
  set_boundary_kernel = cluCreateKernel (clu, commands, filename, "set_boundary", build_options, CLU_SOURCE);
  add_source_kernel = cluCreateKernel (clu, commands, filename, "add_source", build_options, CLU_SOURCE);
  add_source_dual_kernel = cluCreateKernel (clu, commands, filename, "add_source_dual", build_options, CLU_SOURCE);
  linear_solver_kernel = cluCreateKernel (clu, commands, filename, "linear_solver", build_options, CLU_SOURCE);
  project_part1_kernel = cluCreateKernel (clu, commands, filename, "project_part1", build_options, CLU_SOURCE);
  project_part2_kernel = cluCreateKernel (clu, commands, filename, "project_part2", build_options, CLU_SOURCE);
  pack_image_kernel = cluCreateKernel (clu, commands, filename, "pack_image", build_options, CLU_SOURCE);
  if (verbose) printf ("Fluid compute kernels created.\n");


  /* Find the appropriate local_work_group_size */
  set_local_wg_size (device_id);

  
  setup (&fs, context, img_size, _DT);

#if defined(GL_VIEWER) || defined(REMOTE_VIEWER)
  /* Initialize the viewer and pass control permanently to the viewer.
   */
  initViewer(name, img_size, img_size, 4, GL_RGBA, GL_UNSIGNED_BYTE, 0);
  viewerMainLoop();

#else	/* !(GL_VIEWER || REMOTE_VIEWER) */
  run_sim (clu, &fs, num_frames);

  if (verify) {
    cl_int err;
    cl_device_fp_config config;

    if (verbose) printf ("Running non-OpenCL host fluid simulation...\n");

    /* Coerce the host to match the device's rounding mode in case the device
     * is an embedded device and doesn't support CL_FP_ROUND_TO_NEAREST
     */
    err = clGetDeviceInfo(device_id, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(config), &config, NULL);
    CLU_CHECK_ERROR("clGetDeviceInfo CL_DEVICE_SINGLE_FP_CONFIG", err);
    if ((config & CL_FP_ROUND_TO_NEAREST) == 0) (void)fesetround(FE_TOWARDZERO);

    setup_host (&fsh, img_size, _DT);
    run_sim_host (&fsh, num_frames);

    if (verbose) printf("Host simulation complete.\n");

    /* The verification result becomes the process exit status */
    rc = verify_results (fsh.imgDatah, fs.imgAccel, fs.n);
    cleanup_buffers_host (&fsh);
  }

  /* Shutdown and cleanup */
  cleanup_buffers (&fs);
  cluDestroy (clu);
#endif /* GL_VIEWER || REMOTE_VIEWER */

  return rc;
}
