/*************************************************************************/
/*                                                                       */
/* Licensed Materials - Property of IBM                                  */
/*                                                                       */
/* (C) Copyright IBM Corp. 2009,2010                                     */
/* All Rights Reserved                                                   */
/*                                                                       */
/* US Government Users Restricted Rights - Use, duplication or           */
/* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.     */
/*                                                                       */
/* Inspired by Caltech's Java Applet Fluid Solver at                     */
/* www.multires.caltech.edu/teaching/demos/java/FluidSolver.java         */
/*                                                                       */
/* References:  Visual Simulation of Smoke                               */
/*              R. Fedkiw, J. Stam, H. W. Jensen                         */
/*              SIGGRAPH 2001 Annual Proceedings                         */
/*                                                                       */
/*************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <CL/opencl.h>

#ifdef GL_VIEWER
#include "viewer.h"
#endif

#ifdef REMOTE_VIEWER
#include "viewer.h"
#endif

#include "clu.h"
#include "solver.h"
#include "clock.h"


/* Shared state declared here; the definitions are not part of this file. */
extern clu_t clu;                    /* CLU handle passed to update()/pack_img() by next_frame() */
extern struct fluid_solver fs;       /* solver state used by the viewer callbacks below */
extern int curr_iter;                /* not referenced by the code in this file */
extern unsigned int* frame_buffer;   /* NOTE: next_frame() has a local with this name that shadows it */

/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  clear_buffers
 *  Description:  Set the contents of the different buffers to zero. Since OpenCL does 
 *                not readily provide a way to clear a buffer in device memory, we use
 *                an OpenCL memory object that contains zeros and copy that buffer to
 *                the buffer we want to clear.
 * =====================================================================================
 */
void
clear_buffers (struct fluid_solver *s)
{
  size_t sz = s->size * sizeof(float);

  clEnqueueCopyBuffer (commands, s->zero, s->u,    0, 0, sz, 0, NULL, NULL);
  clEnqueueCopyBuffer (commands, s->zero, s->v,    0, 0, sz, 0, NULL, NULL);
  clEnqueueCopyBuffer (commands, s->zero, s->d,    0, 0, sz, 0, NULL, NULL);
  clEnqueueCopyBuffer (commands, s->zero, s->uOld, 0, 0, sz, 0, NULL, NULL);
  clEnqueueCopyBuffer (commands, s->zero, s->vOld, 0, 0, sz, 0, NULL, NULL);
  clEnqueueCopyBuffer (commands, s->zero, s->dOld, 0, 0, sz, 0, NULL, NULL);
  clEnqueueCopyBuffer (commands, s->zero, s->curl, 0, 0, sz, 0, NULL, NULL);
  clEnqueueCopyBuffer (commands, s->zero, s->temp, 0, 0, sz, 0, NULL, NULL);
}


/*
 * ===  FUNCTION  ======================================================================
 *         Name:  setup
 *  Description:  Initialize the fluid_solver structure and allocate its storage.
 *        _ stores n and dt, derives size = (n + 2 * _PAD)^2 and resets decay, diff
 *          and visc to zero
 *        _ allocates the host image buffer s->imgAccel (n * n unsigned ints)
 *        _ creates the OpenCL buffers for the simulation fields (size floats each),
 *          the all-zero copy source s->zero and the packed image s->imgData
 *        _ terminates the program with EXIT_FAILURE if host memory cannot be
 *          allocated, because this function has no way to report failure
 *   Parameters:  s       - solver state to fill in
 *                context - OpenCL context the buffers are created in
 *                n       - grid dimension, not counting the _PAD border
 *                dt      - value stored in s->dt
 * =====================================================================================
 */
void
setup (struct fluid_solver *s, cl_context context, int n, float dt)
{
  cl_int err;
  float *tmp_zero_buffer;	/* temp buffer used to initialize OpenCL device memory objects to 0 */

  s->n = n;
  s->dt = dt;
  s->size = (n + (2 * _PAD)) * (n + (2 * _PAD));
  s->decay = 0.0f;
  s->diff = 0.0f;
  s->visc = 0.0f;

  size_t sz = s->size * sizeof(float);
  /* Widen before multiplying so n * n is not computed in int. */
  size_t si = (size_t)s->n * (size_t)s->n * sizeof(unsigned int);
  /* The zero buffer seeds both s->zero (sz bytes) and s->imgData (si bytes),
   * so it must be at least as large as the bigger of the two.
   */
  size_t zero_bytes = (sz > si) ? sz : si;

  s->imgAccel = malloc (si);
  tmp_zero_buffer = calloc (1, zero_bytes);

  /* A zero-byte request may legitimately return NULL; only treat NULL as a
   * failure when memory was actually requested.
   */
  if ((si != 0 && s->imgAccel == NULL) ||
      (zero_bytes != 0 && tmp_zero_buffer == NULL)) {
    fprintf (stderr, "setup: host memory allocation failed\n");
    free (tmp_zero_buffer);
    free (s->imgAccel);
    s->imgAccel = NULL;
    exit (EXIT_FAILURE);
  }

  /* Create the simulation field buffers in device memory. */
  s->d = clCreateBuffer (context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sz, NULL, &err);
  CLU_CHECK_ERROR ("clCreatebuffer s->d", err);

  s->u = clCreateBuffer (context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sz, NULL, &err);
  CLU_CHECK_ERROR ("clCreateBuffer s->u", err);

  s->v = clCreateBuffer (context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sz, NULL, &err);
  CLU_CHECK_ERROR ("clCreateBuffer s->v", err);

  s->dOld = clCreateBuffer (context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sz, NULL, &err);
  CLU_CHECK_ERROR ("clCreateBuffer s->dOld", err);

  s->uOld = clCreateBuffer (context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sz, NULL, &err);
  CLU_CHECK_ERROR ("clCreateBuffer s->uOld", err);

  s->vOld = clCreateBuffer (context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sz, NULL, &err);
  CLU_CHECK_ERROR ("clCreateBuffer s->vOld", err);

  s->curl = clCreateBuffer (context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sz, NULL, &err);
  CLU_CHECK_ERROR ("clCreateBuffer s->curl", err);

  s->temp = clCreateBuffer (context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sz, NULL, &err);
  CLU_CHECK_ERROR ("clCreateBuffer s->temp", err);

  /* The following two buffers use CL_MEM_COPY_HOST_PTR so that their initial
   * contents are copied from the zero-filled host buffer.
   */
  s->zero = clCreateBuffer (context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sz, tmp_zero_buffer, &err);
  CLU_CHECK_ERROR ("clCreateBuffer s->zero", err);

  s->imgData = clCreateBuffer (context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, si, tmp_zero_buffer, &err);
  CLU_CHECK_ERROR ("clCreateBuffer s->imgData", err);

  /* The host copy is only needed while the buffers above are being created. */
  free (tmp_zero_buffer);
}


/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  cleanup_buffers
 *  Description:  release OpenCL memory buffers and free allocated memory buffers for 
 *                the fluid_solver  structure
 * =====================================================================================
 */
void
cleanup_buffers (struct fluid_solver *s)
{
  clReleaseMemObject (s->d);
  clReleaseMemObject (s->u);
  clReleaseMemObject (s->v);
  clReleaseMemObject (s->dOld);
  clReleaseMemObject (s->uOld);
  clReleaseMemObject (s->vOld);
  clReleaseMemObject (s->curl);
  clReleaseMemObject (s->temp);
  clReleaseMemObject (s->zero);
  clReleaseMemObject (s->imgData);

  free (s->imgAccel);
}


/*
 * ===  FUNCTION  ======================================================================
 *         Name:  insert_force
 *  Description:  Seed the input buffers with fixed values in place of interactive
 *                mouse input. INIT_SIZE consecutive floats, starting at element
 *                Ih (img_size / 2, img_size / 2), are written to dOld (2000.0),
 *                uOld (1000.0) and vOld (750.0).
 * =====================================================================================
 */
static void
insert_force (struct fluid_solver *s)
{
  int n = s->n;			/* kept in scope: the Ih macro presumably refers to n */
  int i0 = img_size / 2;
  int j0 = img_size / 2;
  float density_src[INIT_SIZE];
  float vel_u_src[INIT_SIZE];
  float vel_v_src[INIT_SIZE];
  const size_t elem_bytes = sizeof(float);
  int k;

  for (k = 0; k < INIT_SIZE; k++) {
    density_src[k] = 2000.0f;
    vel_u_src[k] = 1000.0f;
    vel_v_src[k] = 750.0f;
  }

  const size_t start_bytes = (Ih (i0, j0)) * elem_bytes;
  const size_t span_bytes = elem_bytes * INIT_SIZE;

  /* The first two writes are non-blocking and read from the stack arrays above.
   * Only the last write blocks; this presumably relies on an in-order queue so
   * that all three have completed before the arrays go out of scope.
   */
  clEnqueueWriteBuffer (commands, s->dOld, CL_FALSE, start_bytes, span_bytes,
                        density_src, 0, NULL, NULL);
  clEnqueueWriteBuffer (commands, s->uOld, CL_FALSE, start_bytes, span_bytes,
                        vel_u_src, 0, NULL, NULL);
  clEnqueueWriteBuffer (commands, s->vOld, CL_TRUE, start_bytes, span_bytes,
                        vel_v_src, 0, NULL, NULL);
}


/*
 * Run add_source_kernel on one field pair.
 *
 * The 1-D global size handed to cluSetKernelNDRange is size / _VEC_SIZE. The
 * kernel receives dt, x and x0; when use_async_wg_copy is set it receives two
 * further arguments that carry only a byte size and a NULL pointer
 * (presumably local-memory scratch areas).
 */
static void
add_source (clu_t clu, cl_mem * x, cl_mem * x0, float dt, size_t size)
{
  size /= _VEC_SIZE;

  cluSetKernelNDRange (clu, add_source_kernel, 1, NULL, &size,
                       local_wg_size_add_source);

  if (!use_async_wg_copy) {
    cluRunKernel (clu, add_source_kernel, NULL, 3,
                  sizeof(dt), &dt,
                  sizeof(cl_mem), x,
                  sizeof(cl_mem), x0);
    return;
  }

  const size_t scratch_bytes = sizeof(float) * 4 * local_wg_size_add_source[0];

  cluRunKernel (clu, add_source_kernel, NULL, 5,
                sizeof(dt), &dt,
                sizeof(cl_mem), x,
                sizeof(cl_mem), x0,
                scratch_bytes, NULL,
                scratch_bytes, NULL);
}

/*
 * Run add_source_dual_kernel on two field pairs (x, x0) and (y, y0) at once.
 *
 * The 1-D global size is size / _VEC_SIZE. The kernel receives dt followed by
 * the four buffers; when use_async_wg_copy is set, four size-only arguments
 * with NULL pointers follow (presumably local-memory scratch areas).
 */
static void
add_source_dual (clu_t clu, cl_mem * x, cl_mem * x0, cl_mem * y, cl_mem * y0,
                 float dt, size_t size)
{
  size /= _VEC_SIZE;

  cluSetKernelNDRange (clu, add_source_dual_kernel, 1, NULL, &size,
                       local_wg_size_add_source);

  if (!use_async_wg_copy) {
    cluRunKernel (clu, add_source_dual_kernel, NULL, 5,
                  sizeof(dt), &dt,
                  sizeof(cl_mem), x,
                  sizeof(cl_mem), x0,
                  sizeof(cl_mem), y,
                  sizeof(cl_mem), y0);
    return;
  }

  const size_t scratch_bytes = sizeof(float) * 4 * local_wg_size_add_source[0];

  cluRunKernel (clu, add_source_dual_kernel, NULL, 9,
                sizeof(dt), &dt,
                sizeof(cl_mem), x,
                sizeof(cl_mem), x0,
                sizeof(cl_mem), y,
                sizeof(cl_mem), y0,
                scratch_bytes, NULL,
                scratch_bytes, NULL,
                scratch_bytes, NULL,
                scratch_bytes, NULL);
}


/*
 * Run set_boundary_kernel on buffer x with a 1-D global size of n.
 * The kernel receives the boundary selector b, the buffer and n.
 */
static void
set_boundary (clu_t clu, int b, cl_mem * x, int n)
{
  size_t work_items = n;

  cluSetKernelNDRange (clu, set_boundary_kernel, 1, NULL, &work_items,
                       local_wg_size);
  cluRunKernel (clu, set_boundary_kernel, NULL, 3,
                sizeof(b), &b,
                sizeof(cl_mem), x,
                sizeof(n), &n);
}


/*
 * Run linear_solver_kernel once. arg0 and arg1 become the first and second
 * buffer arguments of the kernel, followed by x0, a, c and n. When
 * use_async_wg_copy is set, five size-only arguments with NULL pointers are
 * appended (presumably local-memory scratch areas); the second of them is two
 * vector elements larger than the others.
 */
static void
linear_solver_pass (clu_t clu, cl_mem * arg0, cl_mem * arg1, cl_mem * x0,
                    float a, float c, int n)
{
  if (use_async_wg_copy) {
    const size_t tile_bytes = sizeof(float) * 4 * local_wg_size[0];
    const size_t tile_halo_bytes = sizeof(float) * 4 * (local_wg_size[0] + 2);

    cluRunKernel (clu, linear_solver_kernel, NULL, 11,
                  sizeof(cl_mem), arg0,
                  sizeof(cl_mem), arg1,
                  sizeof(cl_mem), x0,
                  sizeof(a), &a,
                  sizeof(c), &c,
                  sizeof(n), &n,
                  tile_bytes, NULL,
                  tile_halo_bytes, NULL,
                  tile_bytes, NULL,
                  tile_bytes, NULL,
                  tile_bytes, NULL);
  } else {
    cluRunKernel (clu, linear_solver_kernel, NULL, 6,
                  sizeof(cl_mem), arg0,
                  sizeof(cl_mem), arg1,
                  sizeof(cl_mem), x0,
                  sizeof(a), &a,
                  sizeof(c), &c,
                  sizeof(n), &n);
  }
}

/*
 * Iterate linear_solver_kernel over an (n / _VEC_SIZE) x n global range.
 *
 * Five rounds are performed. Each round runs the kernel with (xlast, x) as the
 * leading buffers and applies set_boundary to xlast, then runs it again with
 * (x, xlast) and applies set_boundary to x, so the two buffers alternate roles
 * and the final boundary update lands on x.
 */
static void
linear_solver (clu_t clu, int b, cl_mem * x, cl_mem * xlast, cl_mem * x0,
               float a, float c, int n)
{
  size_t global[2];
  int round;

  global[0] = n / _VEC_SIZE;
  global[1] = n;

  cluSetKernelNDRange (clu, linear_solver_kernel, 2, NULL, global,
                       local_wg_size);

  for (round = 0; round < 5; round++) {
    linear_solver_pass (clu, xlast, x, x0, a, c, n);
    set_boundary (clu, b, xlast, n);

    linear_solver_pass (clu, x, xlast, x0, a, c, n);
    set_boundary (clu, b, x, n);
  }
}


/*
 * Diffusion step: calls linear_solver on c, with temp as the alternate buffer
 * and c0 as the x0 input, using coefficients a = dt * diff * n * n and
 * 1 + 4 * a.
 */
static void
diffuse (clu_t clu, int b, cl_mem * c, cl_mem * c0, cl_mem * temp, float diff,
         float dt, int n)
{
  const float a = dt * diff * n * n;
  const float divisor = 1 + 4 * a;

  linear_solver (clu, b, c, temp, c0, a, divisor, n);
}


/*
 * Projection step over an (n / _VEC_SIZE) x n global range:
 *   1. run project_part1_kernel with x, y, p, div and n
 *   2. apply set_boundary (b = 0) to div and to p
 *   3. call linear_solver on p with div as x0 and coefficients 1 and 4
 *   4. run project_part2_kernel with x, y, p and n
 *   5. apply set_boundary to x (b = 1) and y (b = 2)
 * When use_async_wg_copy is set, both kernels receive five extra size-only
 * arguments with NULL pointers (presumably local-memory scratch areas). The
 * enlarged one comes first for part 1 and second for part 2.
 */
static void
project (clu_t clu, cl_mem * x, cl_mem * y, cl_mem * p, cl_mem * div,
         cl_mem * temp, int n)
{
  size_t global[2];

  global[0] = n / _VEC_SIZE;
  global[1] = n;

  /* Part 1 */
  cluSetKernelNDRange (clu, project_part1_kernel, 2, NULL, global,
                       local_wg_size);

  if (!use_async_wg_copy) {
    cluRunKernel (clu, project_part1_kernel, NULL, 5,
                  sizeof(cl_mem), x,
                  sizeof(cl_mem), y,
                  sizeof(cl_mem), p,
                  sizeof(cl_mem), div,
                  sizeof(n), &n);
  } else {
    const size_t tile_bytes = sizeof(float) * 4 * local_wg_size[0];
    const size_t tile_halo_bytes = sizeof(float) * 4 * (local_wg_size[0] + 2);

    cluRunKernel (clu, project_part1_kernel, NULL, 10,
                  sizeof(cl_mem), x,
                  sizeof(cl_mem), y,
                  sizeof(cl_mem), p,
                  sizeof(cl_mem), div,
                  sizeof(n), &n,
                  tile_halo_bytes, NULL,
                  tile_bytes, NULL,
                  tile_bytes, NULL,
                  tile_bytes, NULL,
                  tile_bytes, NULL);
  }

  set_boundary (clu, 0, div, n);
  set_boundary (clu, 0, p, n);

  linear_solver (clu, 0, p, temp, div, 1, 4, n);

  /* Part 2 */
  cluSetKernelNDRange (clu, project_part2_kernel, 2, NULL, global,
                       local_wg_size);

  if (!use_async_wg_copy) {
    cluRunKernel (clu, project_part2_kernel, NULL, 4,
                  sizeof(cl_mem), x,
                  sizeof(cl_mem), y,
                  sizeof(cl_mem), p,
                  sizeof(n), &n);
  } else {
    const size_t tile_bytes = sizeof(float) * 4 * local_wg_size[0];
    const size_t tile_halo_bytes = sizeof(float) * 4 * (local_wg_size[0] + 2);

    cluRunKernel (clu, project_part2_kernel, NULL, 9,
                  sizeof(cl_mem), x,
                  sizeof(cl_mem), y,
                  sizeof(cl_mem), p,
                  sizeof(n), &n,
                  tile_bytes, NULL,
                  tile_halo_bytes, NULL,
                  tile_bytes, NULL,
                  tile_bytes, NULL,
                  tile_bytes, NULL);
  }

  set_boundary (clu, 1, x, n);
  set_boundary (clu, 2, y, n);
}


/*
 * Advection step: run advect_kernel over an (n / _VEC_SIZE) x n global range
 * with d, d0, du, dv, dt and n, then apply set_boundary (selector b) to d.
 * When use_async_wg_copy is set, three extra size-only arguments with NULL
 * pointers are appended (presumably local-memory scratch areas).
 */
static void
advect (clu_t clu, int b, cl_mem * d, cl_mem * d0, cl_mem * du, cl_mem * dv,
        float dt, int n)
{
  size_t global[2];

  global[0] = n / _VEC_SIZE;
  global[1] = n;

  cluSetKernelNDRange (clu, advect_kernel, 2, NULL, global, local_wg_size);

  if (!use_async_wg_copy) {
    cluRunKernel (clu, advect_kernel, NULL, 6,
                  sizeof(cl_mem), d,
                  sizeof(cl_mem), d0,
                  sizeof(cl_mem), du,
                  sizeof(cl_mem), dv,
                  sizeof(dt), &dt,
                  sizeof(n), &n);
  } else {
    const size_t tile_bytes = sizeof(float) * 4 * local_wg_size[0];

    cluRunKernel (clu, advect_kernel, NULL, 9,
                  sizeof(cl_mem), d,
                  sizeof(cl_mem), d0,
                  sizeof(cl_mem), du,
                  sizeof(cl_mem), dv,
                  sizeof(dt), &dt,
                  sizeof(n), &n,
                  tile_bytes, NULL,
                  tile_bytes, NULL,
                  tile_bytes, NULL);
  }

  set_boundary (clu, b, d, n);
}


/*
 * Run pack_image_kernel over an (n / _VEC_SIZE) x n global range, passing the
 * d, u and v fields, the s->imgData buffer and n. When use_async_wg_copy is
 * set, four extra size-only arguments with NULL pointers are appended
 * (presumably local-memory scratch areas).
 */
void
pack_img (clu_t clu, struct fluid_solver *s)
{
  int n = s->n;
  size_t global[2];

  global[0] = n / _VEC_SIZE;
  global[1] = n;

  cluSetKernelNDRange (clu, pack_image_kernel, 2, NULL, global,
                       local_wg_size);

  if (!use_async_wg_copy) {
    cluRunKernel (clu, pack_image_kernel, NULL, 5,
                  sizeof(cl_mem), &s->d,
                  sizeof(cl_mem), &s->u,
                  sizeof(cl_mem), &s->v,
                  sizeof(cl_mem), &s->imgData,
                  sizeof(n), &n);
  } else {
    const size_t tile_bytes = sizeof(float) * 4 * local_wg_size[0];

    cluRunKernel (clu, pack_image_kernel, NULL, 9,
                  sizeof(cl_mem), &s->d,
                  sizeof(cl_mem), &s->u,
                  sizeof(cl_mem), &s->v,
                  sizeof(cl_mem), &s->imgData,
                  sizeof(n), &n,
                  tile_bytes, NULL,
                  tile_bytes, NULL,
                  tile_bytes, NULL,
                  tile_bytes, NULL);
  }
}


/*
 * ===  FUNCTION  ======================================================================
 *         Name:  update
 *  Description:  Advance the fluid simulation by one step. The velocity fields
 *                (u, v) are processed first, then the density field (d). The
 *                input buffers uOld, vOld and dOld are overwritten with zeros
 *                from s->zero once they have been consumed.
 * =====================================================================================
 */
void
update (clu_t clu, struct fluid_solver *s)
{
  const size_t field_bytes = s->size * sizeof(float);

  /* Short names for the buffers that are passed around repeatedly below. */
  cl_mem *const vel_u = &s->u;
  cl_mem *const vel_v = &s->v;
  cl_mem *const vel_u_in = &s->uOld;
  cl_mem *const vel_v_in = &s->vOld;
  cl_mem *const dens = &s->d;
  cl_mem *const dens_in = &s->dOld;
  cl_mem *const scratch = &s->temp;

  /*-----------------------------------------------------------------------------------
   * Velocity Solver
   *-----------------------------------------------------------------------------------
   */
  /* Add the velocity input held in uOld / vOld to u / v. */
  add_source_dual (clu, vel_u, vel_u_in, vel_v, vel_v_in, s->dt, s->size);

  /* Diffuse the velocities; the results are written into the *Old buffers. */
  diffuse (clu, 0, vel_u_in, vel_u, scratch, s->visc, s->dt, s->n);
  diffuse (clu, 0, vel_v_in, vel_v, scratch, s->visc, s->dt, s->n);

  /* Project the diffused field; u and v serve as the p and div work buffers. */
  project (clu, vel_u_in, vel_v_in, vel_u, vel_v, scratch, s->n);

  /* Self-advect the velocities back into u and v. */
  advect (clu, 1, vel_u, vel_u_in, vel_u_in, vel_v_in, s->dt, s->n);
  advect (clu, 2, vel_v, vel_v_in, vel_u_in, vel_v_in, s->dt, s->n);

  /* Project again; this time uOld and vOld are the work buffers. */
  project (clu, vel_u, vel_v, vel_u_in, vel_v_in, scratch, s->n);

  /* Clear the velocity input buffers for the next frame. */
  clEnqueueCopyBuffer (commands, s->zero, *vel_u_in, 0, 0, field_bytes,
                       0, NULL, NULL);
  clEnqueueCopyBuffer (commands, s->zero, *vel_v_in, 0, 0, field_bytes,
                       0, NULL, NULL);

  /*-----------------------------------------------------------------------------------
   * Density Solver
   *-----------------------------------------------------------------------------------
   */
  /* Add the density input held in dOld to d. */
  add_source (clu, dens, dens_in, s->dt, s->size);

  /* Diffuse into dOld, then advect back into d along the (u, v) field. */
  diffuse (clu, 0, dens_in, dens, scratch, s->diff, s->dt, s->n);
  advect (clu, 0, dens, dens_in, vel_u, vel_v, s->dt, s->n);

  /* Clear the density input buffer for the next frame. */
  clEnqueueCopyBuffer (commands, s->zero, *dens_in, 0, 0, field_bytes,
                       0, NULL, NULL);
}




#if defined(GL_VIEWER) || defined(REMOTE_VIEWER)

/* Non-zero until next_frame() has run once; next_frame() uses it to compute
 * an initial frame before its first read-back, then clears it.
 */
static int first_frame = 1;


/*
 * ===  FUNCTION  ======================================================================
 *         Name:  next_frame
 *  Description:  Return the next fluid simulation frame and start work on the one
 *                after it. On the very first call a frame is computed before the
 *                read-back; on every call the packed image is read (blocking) from
 *                fs.imgData into fs.imgAccel, after which update() and pack_img()
 *                are called again for the following frame.
 *   Parameters:  size - optional; if non-NULL, *size is set to NULL
 *      Returns:  fs.imgAccel, holding fs.n * fs.n unsigned ints
 * =====================================================================================
 */
void *next_frame(int **size)
{
  void *ready_frame;

  if (first_frame) {
    /* Nothing has been rendered yet: produce the frame that is read below. */
    first_frame = 0;
    update (clu, &fs);
    pack_img (clu, &fs);
  }

  const size_t image_bytes = fs.n * fs.n * sizeof(unsigned int);

  clEnqueueReadBuffer (commands, fs.imgData, CL_TRUE, 0, image_bytes,
                       fs.imgAccel, 0, NULL, NULL);
  ready_frame = fs.imgAccel;

  /* Start generating the following frame. */
  update (clu, &fs);
  pack_img (clu, &fs);

  /* Return the resulting frame buffer pointer and size information. */
  if (size != NULL) {
    *size = NULL;
  }

  return ready_frame;
}


/* Mouse input state shared by mouse_action() and mouse_motion().
 */
static int button_active = 0;		/* set to 1 on GLUT_DOWN and back to 0 on GLUT_UP */
static int mouse_button = 0;		/* button recorded at the most recent GLUT_DOWN */
static int last_x, last_y;		/* last tracked mouse position; updated on press and on every active motion */



/*
 * ===  FUNCTION  ======================================================================
 *         Name:  mouse_action
 *  Description:  Handles mouse button presses and releases. A press records the
 *                button and position, marks the button active and immediately
 *                applies input at that position through mouse_motion(). A release
 *                marks the button inactive. Any other state is ignored.
 * =====================================================================================
 */
void mouse_action (int button, int state, int x, int y)
{
  if (state == GLUT_DOWN) {
    mouse_button = button;
    button_active = 1;
    last_x = x;
    last_y = y;
    mouse_motion(x, y);
  } else if (state == GLUT_UP) {
    button_active = 0;
  }
}

/*                                                                                                       
 * ===  FUNCTION  ======================================================================                 
 *         Name:  mouse_motion
 *  Description:  Handles user mouse motion.
 * =====================================================================================                 
 */
void mouse_motion (int next_x, int next_y)
{
  if (button_active) {
    /* A button is active, apply input to the simulation depending on last button press
     */
    int x, y;
    int n = fs.n;
    cl_float d, u, v;
    
    x = last_x;
    y = last_y;
    last_x = next_x;
    last_y = next_y;

    switch (mouse_button) {
    case GLUT_LEFT_BUTTON:
      /* Mouse positions with left button press inserts density */
      d = 500.0f;
      clEnqueueWriteBuffer(commands, fs.dOld, CL_TRUE, (Ih(x+_PAD, n-y+_PAD)) * sizeof(cl_float),
			   sizeof(cl_float), &d, 0, NULL, NULL);
      break;
    case GLUT_RIGHT_BUTTON:
      /* Mouse positions with left button press inserts velocity forces */
      u = (float)(next_x - x) * 10.0f;
      v = (float)(y - next_y) * 10.0f;
      clEnqueueWriteBuffer(commands, fs.uOld, CL_FALSE, (Ih(x+_PAD, n-y+_PAD)) * sizeof(cl_float),
			   sizeof(cl_float), &u, 0, NULL, NULL);
      
      clEnqueueWriteBuffer(commands, fs.vOld, CL_TRUE, (Ih(x+_PAD, n-y+_PAD)) * sizeof(cl_float),
			   sizeof(cl_float), &v, 0, NULL, NULL);
      break;
    }
  }
}


/*
 * ===  FUNCTION  ======================================================================
 *         Name:  keyboard_action
 *  Description:  Externally referenced by the viewer. Handles key presses that the
 *                viewer does not handle itself. Supported keys:
 *                  r key - reset the simulation by clearing the solver buffers
 *                All other keys are ignored; x and y are unused.
 * =====================================================================================
 */
void keyboard_action (unsigned char key, int x, int y)
{
  if (key == 'r') {
    /* Reset simulation */
    clear_buffers(&fs);
  }
}
#endif /* defined(GL_VIEWER) || defined(REMOTE_VIEWER) */


/*
 * ===  FUNCTION  ======================================================================
 *         Name:  run_sim
 *  Description:  Run the fluid simulation for num_frames frames and report timing.
 *                The buffers are cleared and seeded by insert_force() first. Each
 *                frame calls update() followed by pack_img(); the clock is stopped
 *                after clFinish(). When "profiling" is set, per-kernel accumulated
 *                execution times are collected (and printed if "verbose" is set)
 *                together with their total. The last frame's image is always read
 *                back into s->imgAccel before returning.
 * =====================================================================================
 */

/* Fetch the accumulated execution time of one kernel, switch its profiling
 * off, add the time to total_time and optionally print it. A macro is used
 * because the kernel handles are passed straight through to the clu calls.
 */
#define RUN_SIM_REPORT_KERNEL(kernel, line_fmt)                               \
  do {                                                                        \
    float kernel_secs =                                                       \
      cluGetKernelExecTime (clu, kernel, CLU_PROFILING_ACCUM_TIME);           \
    cluDisableKernelProfiling (clu, kernel);                                  \
    total_time += kernel_secs;                                                \
    if (verbose) printf (line_fmt, kernel_secs);                              \
  } while (0)

void
run_sim (clu_t clu, struct fluid_solver *s, int num_frames)
{
  const size_t image_bytes = s->n * s->n * sizeof(unsigned int);
  float delta_accel;
  int frame;

  clear_buffers (s);
  insert_force (s);

  if (profiling) {
    cluEnableKernelProfiling (clu, advect_kernel);
    cluEnableKernelProfiling (clu, set_boundary_kernel);
    cluEnableKernelProfiling (clu, add_source_kernel);
    cluEnableKernelProfiling (clu, add_source_dual_kernel);
    cluEnableKernelProfiling (clu, linear_solver_kernel);
    cluEnableKernelProfiling (clu, project_part1_kernel);
    cluEnableKernelProfiling (clu, project_part2_kernel);
    cluEnableKernelProfiling (clu, pack_image_kernel);
  }

  startclock ();

  for (frame = 0; frame < num_frames; frame++) {
    update (clu, s);	/* advance the simulation by one time step */
    pack_img (clu, s);	/* pack density / velocity into the image buffer */
  }
  clFinish (commands);

  delta_accel = stopclock ();

  if (profiling) {
    float total_time = 0.0f;

    /* Fetch and sum the execution time for each of the eight kernels. */
    RUN_SIM_REPORT_KERNEL (advect_kernel,
                           "advect_kernel time           = %f seconds\n");
    RUN_SIM_REPORT_KERNEL (set_boundary_kernel,
                           "set_boundary_kernel time     = %f seconds\n");
    RUN_SIM_REPORT_KERNEL (add_source_kernel,
                           "add_source_kernel time       = %f seconds\n");
    RUN_SIM_REPORT_KERNEL (add_source_dual_kernel,
                           "add_source_dual_kernel time  = %f seconds\n");
    RUN_SIM_REPORT_KERNEL (linear_solver_kernel,
                           "linear_solver_kernel time    = %f seconds\n");
    RUN_SIM_REPORT_KERNEL (project_part1_kernel,
                           "project_part1_kernel time    = %f seconds\n");
    RUN_SIM_REPORT_KERNEL (project_part2_kernel,
                           "project_part2_kernel time    = %f seconds\n");
    RUN_SIM_REPORT_KERNEL (pack_image_kernel,
                           "pack_image time              = %f seconds\n");

    printf ("Total Kernel Execution time = %f seconds\n", total_time);
  }

  printf ("OpenCL device rendered %d frames in %f seconds. Rate = %f frames/sec\n",
          num_frames, delta_accel, (double)num_frames/(double)(delta_accel));

  /* Blocking read of the final image into host memory. */
  clEnqueueReadBuffer (commands, s->imgData, CL_TRUE, 0, image_bytes,
                       s->imgAccel, 0, NULL, NULL);
}

#undef RUN_SIM_REPORT_KERNEL
