/*************************************************************************/
/*                                                                       */
/* Licensed Materials - Property of IBM                                  */
/*                                                                       */
/* (C) Copyright IBM Corp. 2006, 2009                                    */
/* All Rights Reserved                                                   */
/*                                                                       */
/* US Government Users Restricted Rights - Use, duplication or           */
/* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.     */
/*                                                                       */
/*************************************************************************/
/* --------------------------------------------------------------  */
/* Copyright (c) 1984-2005, Keenan Crane                           */
/* All rights reserved.                                            */
/*                                                                 */
/* Redistribution and use in source and binary forms, with or      */
/* without modification, are permitted provided that the following */
/* conditions are met:                                             */
/*                                                                 */
/* Redistributions of source code must retain the above copyright  */
/* notice, this list of conditions and the following disclaimer.   */
/* Redistributions in binary form must reproduce the above         */
/* copyright notice, this list of conditions and the following     */
/* disclaimer in the documentation and/or other materials provided */
/* with the distribution.                                          */
/*                                                                 */
/* The name of Keenan Crane may not be used to endorse or promote  */
/* products derived from this software without specific prior      */
/* written permission.                                             */
/*                                                                 */
/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          */
/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        */
/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            */
/* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    */
/* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT    */
/* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;    */
/* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)        */
/* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN       */
/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR    */
/* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  */
/* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              */
/* --------------------------------------------------------------  */

#include <stdio.h>
#include <math.h>
#include <getopt.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <libgen.h>
#include <CL/opencl.h>

#ifdef GL_VIEWER
#include "viewer.h"
#endif

#ifdef REMOTE_VIEWER
#include "viewer.h"
#endif 

#include "clu.h"
#include "clock.h"
#include "julia.h"

/*-----------------------------------------------------------------------------
 *  GLOBAL variables and macros
 *-----------------------------------------------------------------------------
 */
#define IMG_WIDTH       	512		/* default image width in pixels (overridden by -w) */
#define IMG_HEIGHT      	512		/* default image height in pixels (overridden by -h) */

/* Dimensions of the rendered image. main() replaces these with the command
 * line values and rounds them up to the IMG_*_CONSTRAINT multiples.
 */
static int img_width  = IMG_WIDTH;
static int img_height = IMG_HEIGHT;
/* Size in bytes of one frame buffer; main() sets it to img_width * img_height * 4. */
static int img_size; 

static int paused     = 0;	/* non-zero freezes the morph; toggled by keyboard_action() */
static int verbose    = 0;	/* non-zero enables informational output; set by -V */

/* Interpolated constant for the frame being computed. nextMu() refreshes it
 * and it is passed as the first kernel argument.
 */
static float currMu[4];
/* Interpolation parameter handed to getCurMu(). nextMu() advances it by 0.05
 * per frame and subtracts 1.0 once it reaches 1.0.
 */
static float morphTimer = 0.0f;

static clu_t clu;				/* CLU handle returned by cluInit() */
static cl_device_id device_id;			/* compute device id */
static cl_context context;              	/* compute context */
static cl_command_queue commands;       	/* compute command queue */
static cl_event compute_event;			/* event of the most recently enqueued kernel run */
static cl_kernel kernel;                	/* compute kernel */
static unsigned char *mapped_addr[2];		/* host address of each mapped frame buffer, NULL while unmapped */
static cl_mem outBuffer[2];           		/* device memory used for the two frame buffers */

static int first_frame   = 1;	/* non-zero until the first kernel run has been enqueued */
static int current_frame = 0;	/* index into outBuffer/mapped_addr of the frame to hand out next */


/*
 * Rendering parameters that are set up before the run by initWorkload() and
 * calculateView(). The structure is passed by value as the third kernel
 * argument on every cluRunKernel() call in this file.
 *
 * NOTE(review): the field order and the trailing padding presumably mirror a
 * matching declaration on the kernel side -- confirm before changing the layout.
 */
static struct julia_context {
  cl_float4 dir_top_start;	/* ray direction for the upper left corner of the rendering region */
  cl_float4 dir_bottom_start;	/* ray direction for the lower left corner of the rendering region */
  cl_float4 dir_bottom_stop;	/* ray direction for the lower right corner of the rendering region */
  cl_float4 eyeP;		/* eye position computed by calculateView() */
  cl_float4 lightP;		/* point light position computed by calculateView() */
  cl_int2 window_size;		/* s0 = image width, s1 = image height, in pixels */
  cl_float epsilon;		/* set to 0.003f by initWorkload(); interpreted by the kernel */
  cl_int maxIterations;		/* set to 4 by initWorkload(); interpreted by the kernel */
  cl_int stride;		/* set to img_width by initWorkload() */
  cl_int pad[3];		/* padding; never written by the host code in this file */
} jc;


/* Kernel source file name. main() changes the working directory to the
 * directory of the executable before the kernel is created from it.
 */
static const char *filename = "julia_kernel.cl";

/* Morphing matrix: four rows of four floats that getCurMu() blends into
 * currMu. updateMu() shifts the rows up by one and puts fresh random values
 * in the last row.
 */
static float mu[4][4];

/* Camera parameters, written by calculateView() */
static float N[3];                     /* Normal to the image plane */
static float T[3];                     /* Tangent of the image plane */
static float B[3];                     /* Binormal of the image plane */
static program_context_t rc;           /* Misc screen parameters (translate, zoom, rotation, fov, aspect) */

/* Help text, used as a printf-style format string. It consumes five
 * arguments in this order: the program name three times (%s) followed by
 * the default image width and the default image height (%d).
 * The pointer is const because it refers to a string literal, which must
 * never be written through.
 */
static const char *usage =
  "Usage: %s [options]\n"
  "\n"
  "Examples:\n"
  "  %s --accel -V         # Run on accelerator device with verbose output.\n"
  "  %s --cpu              # Run on CPU device.\n"
  "\n"
  "Device Types Options:\n"
  "  -a, --accel              Use Accelerator for compute.\n"
  "  -c, --cpu                Use CPU for compute.\n"
  "  -g, --gpu                Use GPU for compute.\n"
  "\n"
  "  If a device type is unspecified, then the platform's CL_DEVICE_TYPE_DEFAULT is used.\n"
  "\n"
  "Run Options:\n"
#if !defined(GL_VIEWER) && !defined(REMOTE_VIEWER)
  /* The iteration count only applies to the non-interactive build. */
  "  -i, --iterations N       Number of iterations (frames).\n"
#endif
  "  -l, --lwgsize N          Local work group size. Must be an integer factor of width/4.\n"
  "                           0 or unspecified then OpenCL will choose the best size.\n"
  "  -w, --width N            The width, in pixels, of the ray traced image (default: %d)\n"
  "  -h, --height N           The height, in pixels, of the ray traced image (default: %d).\n"
  "\n"
  "Output Options:\n"
  "  -H, --help               Display usage help.\n"
  "  -V, --verbose            Emit verbose informational output.\n"
  "\n"
  ;


/* Granularity constraints on the image size. main() rounds the requested
 * width and height up to the next multiple of these values. The width
 * constraint of 4 matches the global work size of img_width / 4 that main()
 * hands to the kernel; a height constraint of 1 leaves the height unchanged.
 */
#define IMG_WIDTH_CONSTRAINT	4
#define IMG_HEIGHT_CONSTRAINT	1

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  calculateView
 *  Description:  Derives the camera from the view parameters in rc. The initial
 *                eye, look-at and up vectors are translated and zoomed, then
 *                multiplied by rc->curRotation (16 floats indexed as
 *                [row * 4 + column]).
 *
 *                Results are written to file-scope state: the basis vectors N, T
 *                and B, plus jc.eyeP and jc.lightP. rc itself is only read.
 * =====================================================================================
 */
static void calculateView (program_context_t * rc)
{
  int i, j;
  float mag;
  float tmpEye[4];
  float tmpLightP[4];

  /* rotated look-at point and up vector; fully written before being read */
  float lookAt[4], up[4];

  /* 
   * First apply the view transformations to the initial eye, look at,
   * and up.  These will be used later to determine the basis.
   */
  /* eye starts on the unit sphere */
  float eyeStart[4] = { 0.0f, 0.0f, 1.0f, 1.0f };

  /* initially look at the origin */
  float lookatStart[4] = { 0.0f, 0.0f, 0.0f, 1.0f };

  /* up is initially along the y-axis */
  float upStart[4] = { 0.0f, 1.0f, 0.0f, 0.0f };


  /* translate the eye and look at points */
  eyeStart[0]    += rc->translate[0];
  eyeStart[1]    += rc->translate[1];
  eyeStart[2]    += rc->zoom;
  lookatStart[0] += rc->translate[0];
  lookatStart[1] += rc->translate[1];
  lookatStart[2] += rc->zoom;

  /* rotate eye, lookat, and up by multiplying them with the current rotation matrix */
  for (i = 0; i < 4; i++) {
    tmpEye[i] = 0.0f;
    lookAt[i] = 0.0f;
    up[i] = 0.0f;

    for (j = 0; j < 4; j++) {
      tmpEye[i] += rc->curRotation[i * 4 + j] * eyeStart[j];
      lookAt[i] += rc->curRotation[i * 4 + j] * lookatStart[j];
      up[i] += rc->curRotation[i * 4 + j] * upStart[j];
    }
  }


  /* Now we construct the basis: */
  /*   N = (look at) - (eye)     */
  /*   T = up                    */
  /*   B = N x T                 */

  /* find and normalize N = (lookat - eye) */
  for (i = 0; i < 3; i++) N[i] = lookAt[i] - tmpEye[i];
  mag = 1.0f / sqrt (N[0] * N[0] + N[1] * N[1] + N[2] * N[2]);
  for (i = 0; i < 3; i++) N[i] *= mag;

  /* find and normalize T = up */
  for (i = 0; i < 3; i++) T[i] = up[i];
  mag = 1.0f / sqrt (T[0] * T[0] + T[1] * T[1] + T[2] * T[2]);
  for (i = 0; i < 3; i++) T[i] *= mag;

  /* find B = N x T. It is unit length as long as N and T are perpendicular,
   * which holds when curRotation is a pure rotation.
   */
  B[0] = N[1] * T[2] - N[2] * T[1];
  B[1] = N[2] * T[0] - N[0] * T[2];
  B[2] = N[0] * T[1] - N[1] * T[0];

  /* move the light a little bit up and to the right of the eye. */
  for (i = 0; i < 3; i++) {
    tmpLightP[i] = tmpEye[i] - B[i] * 0.5f;
    tmpLightP[i] += T[i] * 0.5f;
  }
  /* The offset above is built from direction vectors only, so the light keeps
   * the homogeneous coordinate of the eye. Without this assignment the fourth
   * component copied into jc.lightP below would be an indeterminate value.
   */
  tmpLightP[3] = tmpEye[3];

  jc.eyeP.s0 = tmpEye[0];
  jc.eyeP.s1 = tmpEye[1];
  jc.eyeP.s2 = tmpEye[2];
  jc.eyeP.s3 = tmpEye[3];

  jc.lightP.s0 = tmpLightP[0];
  jc.lightP.s1 = tmpLightP[1];
  jc.lightP.s2 = tmpLightP[2];
  jc.lightP.s3 = tmpLightP[3];
}


/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  getCurMu
 *  Description:  Blend the four rows of the morphing matrix mu into one constant
 *                used for morphing between two Julia sets. The blend weights are
 *                the uniform cubic B-spline basis functions evaluated at t.
 *
 *                cur - receives the four interpolated components
 *                t   - interpolation parameter (nextMu() passes morphTimer)
 * =====================================================================================
 */
static void getCurMu (float *cur, float t)
{
  const float inv6 = 1.0f / 6.0f;
  const float mt = 1.0f - t;
  const float tsq = t * t;
  const float tcb = t * t * t;
  float w[4];                   /* one blend weight per row of mu */
  int c;

  w[0] = mt * mt * mt * inv6;
  w[1] = (4.0f + 3.0f * tcb - 6.0f * tsq) * inv6;
  w[2] = (1.0f + 3.0f * t + 3.0f * tsq - 3.0f * tcb) * inv6;
  w[3] = tcb * inv6;

  for (c = 0; c < 4; c++) {
    cur[c] = w[0] * mu[0][c] + w[1] * mu[1][c] + w[2] * mu[2][c] + w[3] * mu[3][c];
  }
}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  initWorkload
 *  Description:  Initialize the morphing and viewing matrices.
 * =====================================================================================
 */
static void initWorkload()
{
  float alpha;                  		/* height for aspect ratio */
  float beta;                   		/* width for aspect ratio */
  float fRandMax = 1.0f/((float)RAND_MAX);      /* used to normalize random values */

  /* Setup the initial morphing matrix */
  mu[0][0] = 0.0f;
  mu[0][1] = 0.0f;
  mu[0][2] = 0.0f;
  mu[0][3] = 0.0f;
  mu[1][0] = 0.0f;
  mu[1][1] = 0.0f;
  mu[1][2] = 0.0f;
  mu[1][3] = 0.0f;
  mu[2][0] = 0.0f;
  mu[2][1] = 0.0f;
  mu[2][2] = 0.0f;
  mu[2][3] = 0.0f;
  mu[3][0] = 2.0f * (rand () * fRandMax) - 1.0f;
  mu[3][1] = 2.0f * (rand () * fRandMax) - 1.0f;
  mu[3][2] = 2.0f * (rand () * fRandMax) - 1.0f;
  mu[3][3] = 2.0f * (rand () * fRandMax) - 1.0f;

  /* Set initial view parametrs */
  rc.translate[0]    = 0.0F;
  rc.translate[1]    = 0.0F;
  rc.zoom            = 2.0F;
  rc.curRotation[0]  = 0.0F;
  rc.curRotation[1]  = 1.0F;
  rc.curRotation[2]  = 0.0F;
  rc.curRotation[3]  = 0.0F;
  rc.curRotation[4]  = -1.0F;
  rc.curRotation[5]  = 0.0F;
  rc.curRotation[6]  = 0.0F;
  rc.curRotation[7]  = 0.0F;
  rc.curRotation[8]  = 0.0F;
  rc.curRotation[9]  = 0.0F;
  rc.curRotation[10] = 1.0F;
  rc.curRotation[11] = 0.0F;
  rc.curRotation[12] = 0.0F;
  rc.curRotation[13] = 0.0F;
  rc.curRotation[14] = 0.0F;
  rc.curRotation[15] = 1.0F;
  rc.shadows = 0;

  rc.fov = 60.0f;
  rc.aspect = ((float) img_width) / ((float) img_height);
  jc.window_size.s0 = (float) img_width;
  jc.window_size.s1 = (float) img_height;
  jc.maxIterations = 4;
  jc.epsilon = 0.003f;
  jc.stride = img_width;

  calculateView (&rc);

  beta = tan ((rc.fov * M_PI / 180.0f) / 2.0f); /*find height */
  alpha = beta * rc.aspect;     /*find width */

  /* rendering region: upper left corner */
  jc.dir_top_start.s0 = -alpha * T[0] - beta * B[0] + N[0];
  jc.dir_top_start.s1 = -alpha * T[1] - beta * B[1] + N[1];
  jc.dir_top_start.s2 = -alpha * T[2] - beta * B[2] + N[2];

  /* rendering region: lower left corner */
  jc.dir_bottom_start.s0 = -alpha * T[0] + beta * B[0] + N[0];
  jc.dir_bottom_start.s1 = -alpha * T[1] + beta * B[1] + N[1];
  jc.dir_bottom_start.s2 = -alpha * T[2] + beta * B[2] + N[2];

  /* rendering region: lower right corner */
  jc.dir_bottom_stop.s0 = alpha * T[0] + beta * B[0] + N[0];
  jc.dir_bottom_stop.s1 = alpha * T[1] + beta * B[1] + N[1];
  jc.dir_bottom_stop.s2 = alpha * T[2] + beta * B[2] + N[2];

}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  updateMu
 *  Description:  Advance the morphing matrix by one step: every row takes the
 *                values of the row after it, and the last row receives four new
 *                random values in [-1, 1].
 * =====================================================================================
 */
static void updateMu()
{
  const float fRandMax = 1.0f/((float)RAND_MAX);  /* scales rand() into [0, 1] */
  int row, col;

  /* Shift rows 1..3 into rows 0..2, lowest row first so nothing is overwritten early. */
  for (row = 0; row < 3; row++) {
    for (col = 0; col < 4; col++) {
      mu[row][col] = mu[row + 1][col];
    }
  }

  /* Draw the new last row, one rand() call per component in ascending order. */
  for (col = 0; col < 4; col++) {
    mu[3][col] = 2.0f * (rand() * fRandMax) - 1.0f;
  }
}


/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  nextMu
 *  Description:  Compute currMu for the frame about to be rendered, then move the
 *                morph forward by one step unless the animation is paused.
 * =====================================================================================
 */
static void nextMu()
{
  getCurMu (currMu, morphTimer);

  /* A paused animation keeps rendering the same constant. */
  if (paused) {
    return;
  }

  /* Once the timer reaches 1.0 it is reduced by 1.0 and the morphing matrix advances. */
  morphTimer += 0.05f;
  if (morphTimer >= 1.0f) {
    morphTimer -= 1.0f;
    updateMu();
  }
}


/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  next_frame
 *  Description:  Invokes the julia compute kernel and returns a pointer to the
 *                mapped frame buffer holding the next frame to display.
 *
 *                size - only used when REMOTE_VIEWER is defined: if non-NULL,
 *                       *size is set to NULL. In other builds it is ignored and
 *                       may be NULL.
 *
 *                The returned buffer stays mapped until the following call to
 *                next_frame, which unmaps it before rendering into it again.
 *
 *                This function utilizes a double buffering rendering pipeline in which
 *                the subsequent frame is precomputed while the returned frame is processed.
 * =====================================================================================
 */
void *next_frame(int **size)
{
  cl_int err;
  int    next;				/* next frame index */
  void * frame_buffer;              	/* image frame buffer to display */
  cl_event map_event;			/* completion of the map of the current frame */
  cl_event unmap_event = NULL;		/* stays NULL when no unmap is enqueued in this call */

  /*-----------------------------------------------------------------------------------
   * Advance the parameterization to the next frame
   *-----------------------------------------------------------------------------------
   */
  nextMu();

  /* ----------------------------------------------------------------------------------
   * The rendering pipeline when double buffering. This ordering of requests is
   * designed to support both in-order and out-of-order queues. 
   *
   *          Compute buffer 0
   *                 |
   *      .--------->|
   *      |          v
   *      |     Map buffer 0
   *      |          |
   *      |          v
   *      |    Unmap buffer 1 (if mapped)
   *      |          |
   *      |          v
   *      |   Compute buffer 1
   *      |          |------------------> return buffer 0
   *      |          v
   *      |     Map buffer 1
   *      |          |
   *      |          v
   *      |    Unmap buffer 0
   *      |          |
   *      |          v
   *      |   Compute buffer 0
   *      |          |------------------> return buffer 1
   *      `----------'
   *
   * ----------------------------------------------------------------------------------
   */
  /* If this is the first frame, prime the pipeline by computing the first buffer.
   * The second nextMu() call then prepares currMu for the frame computed below.
   */
  if (first_frame) {
    first_frame = 0;
    cluRunKernel (clu, kernel, &compute_event, 3,
		  sizeof(currMu), currMu, 
		  sizeof(cl_mem), &outBuffer[current_frame], 
		  sizeof(struct julia_context), &jc);      
    nextMu();
  }

  next = 1 - current_frame;
  /* Non-blocking map of the frame whose kernel run is in flight; the map waits
   * on compute_event, so the event can be released once the map is enqueued.
   */
  mapped_addr[current_frame] = clEnqueueMapBuffer (commands, outBuffer[current_frame], 
						   CL_FALSE, CL_MAP_READ, 0, img_size,
						   1, &compute_event, &map_event, &err);
  CLU_CHECK_ERROR ("clEnqueueMapBuffer", err);
  CLU_CHECK_ERROR ("clReleaseEvent", clReleaseEvent(compute_event));

  /* Unmap the next frame if it was previously mapped.
   * NOTE(review): cluSetKernelDependency presumably makes the following
   * cluRunKernel wait for the unmap -- confirm against the CLU library.
   */
  if (mapped_addr[next]) {
    CLU_CHECK_ERROR ("clEnqueueUnmapMemObject", 
		     clEnqueueUnmapMemObject (commands, outBuffer[next], mapped_addr[next], 0, NULL, &unmap_event));
    mapped_addr[next] = NULL;
    cluSetKernelDependency(clu, kernel, 1, &unmap_event);
  }

  /* Wait for the map to complete */
  CLU_CHECK_ERROR ("clWaitForEvents", clWaitForEvents(1, &map_event));
  CLU_CHECK_ERROR ("clReleaseEvent", clReleaseEvent(map_event));


  /* Generate the next frame into the other buffer. Its event is kept in
   * compute_event for the map performed by the following call.
   */
  cluRunKernel (clu, kernel, &compute_event, 3,
		sizeof(currMu), currMu, 
		sizeof(cl_mem), &outBuffer[next], 
		sizeof(struct julia_context), &jc);      

  /* The unmap event is released only after the kernel run has been enqueued. */
  if (unmap_event) {
    CLU_CHECK_ERROR ("clReleaseEvent", clReleaseEvent(unmap_event));
  }

  frame_buffer = (void *)mapped_addr[current_frame];

  /*------------------------------------------------------------------------------------
   * Flip to the other buffer and return the frame buffer pointer and size information.
   *------------------------------------------------------------------------------------
   */
  current_frame = next;
#ifdef REMOTE_VIEWER
  if (size) *size = NULL;
#endif

  return (frame_buffer);
}



#if !defined(GL_VIEWER) && !defined(REMOTE_VIEWER)
/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  last_frame
 *  Description:  Fetches the last frame in the double buffered rendering pipeline.
 * =====================================================================================
 */
static void *last_frame(int *size)
{
  cl_int err;
  int    frame_size = 0;		/* default frame size */
  char * frame_buffer;              	/* image frame buffer to display */

  /* Compute the frame if no frame is already in the pipeline
   */
  if (first_frame) {
    first_frame = 0;
    cluRunKernel (clu, kernel, &compute_event, 3,
		  sizeof(currMu), currMu, 
		  sizeof(cl_mem), &outBuffer[current_frame], 
		  sizeof(struct julia_context), &jc);      
  }
  /* Map the computed frame into the host address space 
   */
  frame_buffer = clEnqueueMapBuffer (commands, outBuffer[current_frame], 
				     CL_TRUE, CL_MAP_READ, 0, img_size,
				     1, &compute_event, 
				     NULL, &err);
  CLU_CHECK_ERROR ("clEnqueueMapBuffer", err);
  CLU_CHECK_ERROR ("clReleaseEvent", clReleaseEvent(compute_event));
  CLU_CHECK_ERROR ("clFinish", clFinish(commands));

  compute_event = NULL;

  if (size) *size = frame_size;
  return (frame_buffer);
}  
#endif /* GL_VIEWER || REMOTE_VIEWER */  

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  mouse_action
 *  Description:  Mouse button callback, externally referenced by the viewer.
 *                This sample does not react to mouse buttons, so every argument
 *                is deliberately unused.
 * =====================================================================================
 */
void mouse_action(int button, int state, int x, int y)
{
  (void) button;
  (void) state;
  (void) x;
  (void) y;
}

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  mouse_motion
 *  Description:  Mouse motion callback, externally referenced by the viewer.
 *                This sample does not react to mouse motion, so both coordinates
 *                are deliberately unused.
 * =====================================================================================
 */
void mouse_motion(int x, int y)
{
  (void) x;
  (void) y;
}


/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  keyboard_action
 *  Description:  Keyboard callback, externally referenced by the viewer, for key
 *                presses the viewer does not handle itself. The only key acted
 *                upon is:
 *
 *                space key - toggle on/off morph animation.
 *
 *                All other keys, and the x/y arguments, are ignored.
 * =====================================================================================
 */
void keyboard_action (unsigned char key, int x, int y)
{
  (void) x;
  (void) y;

  if (key == ' ') {
    /* Toggle animate */
    paused = paused ? 0 : 1;
  }
}



/*
 * Program entry point.
 *
 * Parses the command line, initializes CLU, selects the compute device,
 * creates the command queue, the kernel and both frame buffers, and sets the
 * kernel ND range. Viewer builds (GL_VIEWER or REMOTE_VIEWER) then hand
 * control to the viewer; all other builds render `iterations` frames and
 * print the measured throughput.
 *
 * Exits with EXIT_FAILURE on invalid options (and after printing the usage
 * text), otherwise with EXIT_SUCCESS.
 */
int
main (int argc, char *argv[])
{
  int iterations = 50;
  char *name;                   /* program name, passed to initViewer() */
#if !defined(GL_VIEWER) && !defined(REMOTE_VIEWER)
  int i;
  float delta = 0.0f;
#endif
  cl_int err;
  size_t global_size[2];
  size_t local_sizes[2] = { 1, 1 };
  size_t *local_size = NULL;    /* NULL lets OpenCL choose the work group size */

  cl_device_type dev_type = CL_DEVICE_TYPE_DEFAULT;
  cl_command_queue_properties device_q_prop;

  static struct option long_options[] = {
    /* Device types */
    {"accel", no_argument, NULL, 'a'},
    {"cpu", no_argument, NULL, 'c'},
    {"gpu", no_argument, NULL, 'g'},
    /* Run parameters */
    {"iterations", required_argument, NULL, 'i'},
    {"lwgsize", required_argument, NULL, 'l'},
    {"width", required_argument, NULL, 'w'},
    {"height", required_argument, NULL, 'h'},
    /* Output options */
    {"verbose", no_argument, NULL, 'V'},
    /* --help takes no argument, matching the short option 'H' in optstring. */
    {"help", no_argument, NULL, 'H'},
    {NULL, 0, NULL, 0}
  };

  const char *optstring = "acgi:l:w:h:VH";
  int opt;
  int option_index = 0;

  /*-----------------------------------------------------------------------------
   * Process command line options 
   *-----------------------------------------------------------------------------
   */
  while ((opt = getopt_long (argc, argv, optstring, long_options, &option_index)) != -1) {
    switch (opt) {
    case 'a':
      dev_type = CL_DEVICE_TYPE_ACCELERATOR;
      break;
    case 'c':
      dev_type = CL_DEVICE_TYPE_CPU;
      break;
    case 'g':
      dev_type = CL_DEVICE_TYPE_GPU;
      break;
    case 'i':
      if ((iterations = atoi (optarg)) < 1) {
        fprintf(stderr, "Error, Iterations must be at least 1.\n");
        exit (EXIT_FAILURE);
      }
      break;
    case 'l': {
      /* Parse into a signed int first: storing a negative value directly in
       * the size_t would wrap around to an enormous work group size.
       */
      int lwgsize = atoi (optarg);
      if (lwgsize < 0) {
        fprintf(stderr, "Error, Local work group size %d must not be negative.\n", lwgsize);
        exit (EXIT_FAILURE);
      }
      local_sizes[0] = (size_t) lwgsize;
      /* A size of 0 means "let OpenCL choose". */
      local_size = lwgsize ? local_sizes : NULL;
      break;
    }
    case 'V':
      verbose = 1;
      break;
    case 'w':
      img_width = atoi (optarg);
      if (img_width < 1) {
        fprintf(stderr, "Error, Image width %d is not a positive non-zero value.\n", img_width);
        exit (EXIT_FAILURE);
      }
      break;
    case 'h':
      img_height = atoi (optarg);
      if (img_height < 1) {
        fprintf(stderr, "Error, Image height %d is not a positive non-zero value.\n", img_height);
        exit (EXIT_FAILURE);
      }
      break;
    case 'H':
    case '?':
      fprintf (stderr, usage, argv[0], argv[0], argv[0], IMG_WIDTH, IMG_HEIGHT);
      exit (EXIT_FAILURE);
      break;
    default:
      break;
    }
  }

  /* Make sure the width and height is a multiple of image constraint. Round upwards if it isn't.
   */
  img_width  = IMG_WIDTH_CONSTRAINT  * ((img_width  + (IMG_WIDTH_CONSTRAINT-1))  / IMG_WIDTH_CONSTRAINT);
  img_height = IMG_HEIGHT_CONSTRAINT * ((img_height + (IMG_HEIGHT_CONSTRAINT-1)) / IMG_HEIGHT_CONSTRAINT);

  /* Four bytes per pixel. */
  img_size = img_width * img_height * 4;

  /* Change current working directory to that of the invocation path so that Julia can
   * be run from any current working directory. A failure is reported but not fatal;
   * the kernel source is then looked up relative to the current directory.
   */
  name = basename(argv[0]);
  if (chdir(dirname(argv[0])) != 0) {
    perror ("Warning, unable to change to the program directory");
  }


  /*-----------------------------------------------------------------------------
   * Initialize the computing subsystem. Init clu, get devices, create command
   * queue(s), and create kernels.
   *-----------------------------------------------------------------------------
   */
  /* Initialize CLU and get the context from CLU */
  clu = cluInit (NULL);
  context = cluGetCLContext (clu);

  /* Get device_id and create a command queue. */
  if ((device_id = cluGetDeviceID (clu, dev_type, NULL, NULL)) == NULL) {
    CLU_EXIT_ERROR("Unable to locate a device of type %s.\n", cluGetCLDeviceTypeString(dev_type));
  }
  if (verbose) {
    printf("Compute Device Name = %s\n", cluGetDeviceName(clu, device_id));
  }
  CLU_CHECK_ERROR("clGetDeviceInfo(CL_DEVICE_QUEUE_PROPERTIES)", 
		  clGetDeviceInfo (device_id, CL_DEVICE_QUEUE_PROPERTIES, sizeof (cl_command_queue_properties), &device_q_prop, NULL));
  /* Use an out-of-order queue whenever the device supports one. */
  if (device_q_prop & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) {
    commands = cluCreateCmdQueue (clu, device_id, dev_type, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
    if (verbose) printf ("This sample is using an out-of-order queue for computing\n");
  } else {
    commands = cluCreateCmdQueue (clu, device_id, dev_type, 0);
    if (verbose) printf ("This sample is using an in-order queue for computing\n");
  }



  /* Create the compute kernel in the program we wish to run */
  if (verbose) printf ("Creating compute kernel...\n");
  kernel = cluCreateKernel (clu, commands, filename, "compute_julia", NULL, CLU_SOURCE);
  if (verbose) printf("Compute kernel created\n");


  initWorkload();

  /* Check to make sure the local work group size requested is supported by the device and kernel.
   * local_size is NULL unless -l was given, so it is only dereferenced when set, and the
   * coercion never shrinks the size below 1.
   */
  if (cluCheckLocalWorkgroupSize (device_id, kernel, 2, local_size) == CL_FALSE && local_size != NULL) {
    fprintf (stderr, "Local workgroup size (%zu %zu) exceeds device and kernel max_work_group_size\n", local_size[0], local_size[1]);
    while (local_size[0] > 1 &&
           cluCheckLocalWorkgroupSize (device_id, kernel, 2, local_size) == CL_FALSE) {
      --local_size[0];
    }
    fprintf (stderr, "Coercing local work group size to %zu\n", local_size[0]);
  }

  /*-----------------------------------------------------------------------------
   *  Allocating memory for buffers and creating OpenCL buffer objects
   *-----------------------------------------------------------------------------
   */
  /* Create both framebuffers */
  outBuffer[0] = clCreateBuffer (context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
				 img_size, NULL, &err);
  CLU_CHECK_ERROR ("clCreateBuffer", err);
  outBuffer[1] = clCreateBuffer (context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
				 img_size, NULL, &err);
  CLU_CHECK_ERROR ("clCreateBuffer", err);

  /* One work item per four pixels of width; img_width is a multiple of 4 here. */
  global_size[0] = img_width / 4;
  global_size[1] = img_height;

  /* Ensure the local work group size does not exceed the global work group size 
   */
  if (local_size && (local_size[0] > global_size[0])) {
    local_size[0] = global_size[0];
    if (verbose) printf("The local work group size exceeds the global work group size. Coercing the local size to the global size.\n");
  }

  cluSetKernelNDRange (clu, kernel, 2, NULL, global_size, local_size);

#if defined(GL_VIEWER) || defined(REMOTE_VIEWER)
  /* Initialize the viewer and pass control permanently to the viewer.
   */
  initViewer(name, img_width, img_height, 4, GL_RGBA, GL_UNSIGNED_BYTE, 0);
  viewerMainLoop();

#else /*  !(GL_VIEWER || REMOTE_VIEWER) */
  /* The program name is only needed by the viewer. */
  (void) name;

  /* Run the sample workload for the specified number of iterations. The final
   * frame is fetched with last_frame() so the pipeline is drained.
   */
  startclock ();

  for (i = 0; i < iterations-1; i++) {
    (void)next_frame(NULL);
  }                             
  (void)last_frame(NULL);

  delta = stopclock ();
  printf ("%d Frames took %f seconds. Rate = %f Mpixels/sec, %f frames/second\n",
          iterations, delta,
          (double) (img_width * img_height) * (double) (iterations) /
          (1000000.0 * (double) (delta)), iterations/delta);

#endif /*  GL_VIEWER || REMOTE_VIEWER */

  /* Shutdown and cleanup */
  CLU_CHECK_ERROR ("clReleaseMemObject", clReleaseMemObject (outBuffer[0]));
  CLU_CHECK_ERROR ("clReleaseMemObject", clReleaseMemObject (outBuffer[1]));

  cluDestroy (clu);
  exit (EXIT_SUCCESS);
}
