/*************************************************************************/
/*                                                                       */
/* Licensed Materials - Property of IBM                                  */
/*                                                                       */
/* (C) Copyright IBM Corp. 2009, 2010                                    */
/* All Rights Reserved                                                   */
/*                                                                       */
/* US Government Users Restricted Rights - Use, duplication or           */
/* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.     */
/*                                                                       */
/* Inspired by Caltech's Java Applet Fluid Solver at                     */
/* www.multires.caltech.edu/teaching/demos/java/FluidSolver.java         */
/*                                                                       */
/* References:  Visual Simulation of Smoke                               */
/*              R. Fedkiw, J. Stam, H. W. Jensen                         */
/*              SIGGRAPH 2001 Annual Proceedings                         */
/*                                                                       */
/*************************************************************************/



/* Number of extra float elements that precede and follow the n data elements
 * of every grid row (a row is n + 2 * _PAD floats wide, see I() below).  The
 * kernels in this file also skip _PAD rows before the first row they process. */
#define _PAD        4
/* Number of floats in a float4; used to express row strides and offsets in
 * float4 units. */
#define _VEC_SIZE   4

/* Linear float index of column _i, row _j in a row-major grid whose rows are
 * n + 2 * _PAD floats wide.  NOTE: expands to an expression that reads a
 * variable named `n`, which must be in scope wherever the macro is used. */
#define I(_i, _j)   ((_i) + ((n + (_PAD * 2)) * (_j)))



/*
 * Write the four lines of cells that sit directly outside the index range
 * [_PAD, n - 1 + _PAD] of grid x.
 *
 * Each work-item handles one index k = get_global_id (0) + _PAD and writes
 * four cells:
 *   - column _PAD - 1 and column n + _PAD of row k
 *   - row _PAD - 1 and row n + _PAD of column k
 * Every written cell receives the value of its neighbour one step inside the
 * range.  That value is negated for the two column writes when b == 1 and for
 * the two row writes when b == 2; for any other b it is copied unchanged.
 *
 * b - selects which pair of edges (if any) is negated, as described above.
 * x - grid of floats, rows of n + 2 * _PAD elements (indexed through I()).
 * n - size of the inner index range; also read by the I() macro.
 */
__kernel void
set_boundary (int b, __global float * restrict x, int n)
{
  const size_t k = get_global_id (0) + _PAD;
  const int first = _PAD;               /* lowest index inside the range  */
  const int last = (n - 1) + _PAD;      /* highest index inside the range */
  const int negate_cols = (b == 1);
  const int negate_rows = (b == 2);

  /* Cells left and right of the range on row k */
  x[I (first - 1, k)] = negate_cols ? -x[I (first, k)] : x[I (first, k)];
  x[I (last + 1, k)] = negate_cols ? -x[I (last, k)] : x[I (last, k)];

  /* Cells before and after the range on column k */
  x[I (k, first - 1)] = negate_rows ? -x[I (k, first)] : x[I (k, first)];
  x[I (k, last + 1)] = negate_rows ? -x[I (k, last)] : x[I (k, last)];
}



/*
 * Advect field d0 along the velocity field (du, dv) over time step dt and
 * store the result in d.
 *
 * Each work-item produces one float4 (four horizontally adjacent cells).  For
 * every cell it steps backwards from the cell position by dt * n times the
 * local velocity, clamps the resulting position, and bilinearly interpolates
 * d0 at that position from the four surrounding cells.
 *
 * The velocities for the whole work-group are staged into local memory with
 * async_work_group_copy, and the results are staged in d_lm and written back
 * with a single async_work_group_copy.
 *
 * d      - output field, addressed as float4 (rows of (n + 2 * _PAD) / _VEC_SIZE float4s).
 * d0     - input field, addressed as scalar floats through I().
 * du, dv - horizontal / vertical velocity, same layout as d.
 * dt     - time step.
 * n      - grid size without padding; also read by the I() macro.
 * d_lm, du_lm, dv_lm - local scratch buffers; each is accessed for
 *          get_local_size (0) float4 elements.
 *
 * NOTE(review): the row used for the bulk copies comes from get_group_id (1)
 * while the row used for the coordinates comes from get_global_id (1); these
 * agree only when the work-group size in dimension 1 is 1 - confirm against
 * the host-side launch.
 */
__kernel void
advect (__global float4 * d,
    __global float * d0,
    __global float4 * du,
    __global float4 * dv,
    float dt,
    int n,
    __local float4 * d_lm,
    __local float4 * du_lm,
    __local float4 * dv_lm)
{
  /*  Query all the work group and work item info */
  const size_t wg_id_x = get_group_id (0);
  const size_t wg_id_y = get_group_id (1);
  const size_t lid = get_local_id (0);
  const size_t lsize = get_local_size (0);
  const unsigned int rowstride = (n + (_PAD * 2)) / _VEC_SIZE;

  /*  Padded grid position of this work-item: float4 column gi, row gj */
  const size_t gi = get_global_id (0) + _PAD / _VEC_SIZE;
  const size_t gj = get_global_id (1) + _PAD;

  /*  First float4 handled by this work-group, in float4 units */
  const size_t row = wg_id_y + _PAD;
  const unsigned int offset = (row * rowstride) + (wg_id_x * lsize) + _PAD / _VEC_SIZE;
  __global float4 *d_dm = &d[offset];
  __global float4 *du_dm = &du[offset];
  __global float4 *dv_dm = &dv[offset];
  event_t in_tag, out_tag;

  /*  Read the velocities needed by the entire work group into local memory */
  in_tag = async_work_group_copy (du_lm, (const __global float4 *) du_dm, lsize, (event_t) 0);
  async_work_group_copy (dv_lm, (const __global float4 *) dv_dm, lsize, in_tag);

  /*  Wait for input data */
  wait_group_events (1, &in_tag);

  /*  Padded float coordinates of the four cells held by this work-item */
  const int4 iv = (int4) ((int) (gi * _VEC_SIZE)) + (int4) (0, 1, 2, 3);
  const float4 ivf = convert_float4 (iv);
  const float4 jvf = (float4) ((float) gj);
  const float4 nf = n;
  const float4 dt0 = dt * n;

  /*  Step backwards along the velocity */
  float4 x = ivf - dt0 * du_lm[lid];
  float4 y = jvf - dt0 * dv_lm[lid];

  /* Clamp the sample position so that the four cells used for interpolation
   * lie between the edge lines at index _PAD - 1 and n + _PAD (the cells
   * written by set_boundary).  The coordinates above are in padded index
   * space, so the bounds must be shifted by _PAD as well: clamping to
   * [0.5, n + 0.5] (the bounds for a one-cell border) would prevent the last
   * _PAD - 1 rows/columns from ever sampling their own position and would let
   * the first ones sample padding outside the edge lines.
   */
  const float4 pos_min = (float4) (_PAD - 0.5f);
  const float4 pos_max = nf + (_PAD - 0.5f);

  x = fmin (x, pos_max);
  x = fmax (x, pos_min);
  y = fmin (y, pos_max);
  y = fmax (y, pos_min);

  /*  Integer cell below/left of the sample position and its neighbour */
  const int4 i0 = convert_int4 (x);
  const int4 i1 = i0 + 1;
  const int4 j0 = convert_int4 (y);
  const int4 j1 = j0 + 1;

  /*  Interpolation weights */
  const float4 s1 = x - convert_float4 (i0);
  const float4 s0 = 1.0f - s1;
  const float4 t1 = y - convert_float4 (j0);
  const float4 t0 = 1.0f - t1;

  /*  Gather the four surrounding cells of d0 for each of the four lanes */
  const float4 d0i0j0 = (float4) (d0[I (i0.s0, j0.s0)], d0[I (i0.s1, j0.s1)],
                                  d0[I (i0.s2, j0.s2)], d0[I (i0.s3, j0.s3)]);
  const float4 d0i0j1 = (float4) (d0[I (i0.s0, j1.s0)], d0[I (i0.s1, j1.s1)],
                                  d0[I (i0.s2, j1.s2)], d0[I (i0.s3, j1.s3)]);
  const float4 d0i1j0 = (float4) (d0[I (i1.s0, j0.s0)], d0[I (i1.s1, j0.s1)],
                                  d0[I (i1.s2, j0.s2)], d0[I (i1.s3, j0.s3)]);
  const float4 d0i1j1 = (float4) (d0[I (i1.s0, j1.s0)], d0[I (i1.s1, j1.s1)],
                                  d0[I (i1.s2, j1.s2)], d0[I (i1.s3, j1.s3)]);

  d_lm[lid] = s0 * (t0 * d0i0j0 + t1 * d0i0j1) + s1 * (t0 * d0i1j0 + t1 * d0i1j1);

  /* All work-items must have stored their result in d_lm before the
   * work-group-wide copy to global memory starts. */
  barrier (CLK_LOCAL_MEM_FENCE);
  out_tag =
    async_work_group_copy (d_dm, (const __local float4 *) d_lm, lsize, (event_t) 0);
  /*  Wait for output data */
  wait_group_events (1, &out_tag);
}



/*
 * Add a scaled source term to a field:  x += dt * x0.
 *
 * Each work-group stages its get_local_size (0) consecutive float4 elements
 * of x and x0 in local memory, every work-item updates the element at its
 * local id, and the updated block of x is copied back to global memory.
 *
 * dt    - scale applied to x0.
 * x     - field that is read and updated in place.
 * x0    - source field (only read).
 * x_lm, x0_lm - local scratch buffers; each is accessed for
 *         get_local_size (0) float4 elements.
 */
__kernel void
add_source (float dt,
        __global float4 * x,
        __global float4 * x0, 
        __local float4 * x_lm, 
        __local float4 * x0_lm)
{
  const size_t lid = get_local_id (0);
  const size_t count = get_local_size (0);

  /*  First float4 element owned by this work-group */
  const unsigned int base = get_group_id (0) * count;
  __global float4 *x_blk = x + base;
  __global float4 *x0_blk = x0 + base;
  event_t load_ev, store_ev;

  /*  Stage both inputs in local memory; the two copies share one event */
  load_ev = async_work_group_copy (x_lm, (const __global float4 *) x_blk, count, (event_t) 0);
  async_work_group_copy (x0_lm, (const __global float4 *) x0_blk, count, load_ev);
  wait_group_events (1, &load_ev);

  x_lm[lid] += dt * x0_lm[lid];

  /* The copy below is issued once for the whole work-group, so every
   * work-item has to finish its update of x_lm before the copy begins. */
  barrier (CLK_LOCAL_MEM_FENCE);
  store_ev = async_work_group_copy (x_blk, (const __local float4 *) x_lm, count, (event_t) 0);
  wait_group_events (1, &store_ev);
}

/*
 * Add scaled source terms to two fields in one launch:
 *     x += dt * x0   and   y += dt * y0.
 *
 * Each work-group stages its get_local_size (0) consecutive float4 elements
 * of all four buffers in local memory, every work-item updates the elements
 * at its local id, and the updated blocks of x and y are copied back.
 *
 * dt     - scale applied to both source fields.
 * x, y   - fields that are read and updated in place.
 * x0, y0 - source fields (only read).
 * x_lm, x0_lm, y_lm, y0_lm - local scratch buffers; each is accessed for
 *          get_local_size (0) float4 elements.
 */
__kernel void
add_source_dual (float dt,
         __global float4 * x,
         __global float4 * x0,
         __global float4 * y,
         __global float4 * y0,
         __local float4 * x_lm,
         __local float4 * x0_lm, 
         __local float4 * y_lm, 
         __local float4 * y0_lm)
{
  const size_t lid = get_local_id (0);
  const size_t count = get_local_size (0);

  /*  First float4 element owned by this work-group */
  const unsigned int base = get_group_id (0) * count;
  __global float4 *x_blk = x + base;
  __global float4 *x0_blk = x0 + base;
  __global float4 *y_blk = y + base;
  __global float4 *y0_blk = y0 + base;
  event_t load_ev, store_ev;

  /*  Stage all four inputs in local memory; the copies share one event */
  load_ev = async_work_group_copy (x_lm, (const __global float4 *) x_blk, count, (event_t) 0);
  async_work_group_copy (x0_lm, (const __global float4 *) x0_blk, count, load_ev);
  async_work_group_copy (y_lm, (const __global float4 *) y_blk, count, load_ev);
  async_work_group_copy (y0_lm, (const __global float4 *) y0_blk, count, load_ev);
  wait_group_events (1, &load_ev);

  x_lm[lid] += dt * x0_lm[lid];
  y_lm[lid] += dt * y0_lm[lid];

  /* Every work-item has to finish its updates before the work-group-wide
   * copies back to global memory begin. */
  barrier (CLK_LOCAL_MEM_FENCE);
  store_ev = async_work_group_copy (x_blk, (const __local float4 *) x_lm, count, (event_t) 0);
  async_work_group_copy (y_blk, (const __local float4 *) y_lm, count, store_ev);
  wait_group_events (1, &store_ev);
}



/*
 * One relaxation sweep of a linear solve.  For every cell:
 *
 *     x = (a * (right + left + up + down) + x0) / c
 *
 * where the four neighbour values are taken from xlast (the values of the
 * previous sweep) and the result is written to x.
 *
 * A work-group handles get_local_size (0) consecutive float4s of the row
 * get_group_id (1) + _PAD.  It stages in local memory the same span of the
 * previous and the next row of xlast, the span of the current row widened by
 * one float4 on each side, and the matching span of x0.
 *
 * x     - output field.
 * xlast - field values from the previous sweep (only read).
 * x0    - right-hand side field (only read).
 * a, c  - coefficients of the update formula above.
 * n     - grid size without padding (row stride is (n + 2 * _PAD) / _VEC_SIZE float4s).
 * xlast_1_lm, xlast_3_lm, x0_lm, x_lm - local scratch buffers accessed for
 *         get_local_size (0) float4 elements.
 * xlast_2_lm - local scratch buffer accessed for get_local_size (0) + 2
 *         float4 elements.
 */
__kernel void
linear_solver (__global float4 * x,
           __global float4 * xlast,
           __global float4 * x0,
           float a, float c, int n,
           __local float4 * xlast_1_lm,
           __local float4 * xlast_2_lm,
           __local float4 * xlast_3_lm, 
           __local float4 * x0_lm, 
           __local float4 * x_lm)
{
  const size_t lid = get_local_id (0);
  const size_t count = get_local_size (0);
  const size_t grp_x = get_group_id (0);
  const size_t grp_y = get_group_id (1);
  const unsigned int rowstride = (n + (_PAD * 2)) / _VEC_SIZE;

  /*  First float4 of this work-group's span, in float4 units */
  const size_t row = grp_y + _PAD;
  const unsigned int base = (row * rowstride) + (grp_x * count) + _PAD / _VEC_SIZE;

  __global float4 *cur_row = xlast + base;
  __global float4 *prev_row = cur_row - rowstride;
  __global float4 *next_row = cur_row + rowstride;
  __global float4 *rhs = x0 + base;
  __global float4 *out = x + base;
  event_t load_ev, store_ev;

  /* Stage the inputs; the current row is fetched with one extra float4 on
   * each side so the left/right neighbours of the outer lanes are available. */
  load_ev = async_work_group_copy (xlast_1_lm, (const __global float4 *) prev_row, count, (event_t) 0);
  async_work_group_copy (xlast_2_lm, (const __global float4 *) (cur_row - 1), count + 2, load_ev);
  async_work_group_copy (xlast_3_lm, (const __global float4 *) next_row, count, load_ev);
  async_work_group_copy (x0_lm, (const __global float4 *) rhs, count, load_ev);
  wait_group_events (1, &load_ev);

  /* xlast_2_lm is shifted by one float4, so this work-item's own cells are
   * at lid + 1, with the neighbouring float4s at lid and lid + 2. */
  const float4 west = xlast_2_lm[lid];
  const float4 here = xlast_2_lm[lid + 1];
  const float4 east = xlast_2_lm[lid + 2];

  const float4 up = xlast_1_lm[lid];
  const float4 down = xlast_3_lm[lid];
  /*  Each lane's left / right neighbour: the row shifted by one float */
  const float4 left = (float4) (west.s3, here.s012);
  const float4 right = (float4) (here.s123, east.s0);

  x_lm[lid] = (a * (right + left + up + down) + x0_lm[lid]) / c;

  /* Every work-item has to store its result before the work-group-wide
   * copy back to global memory begins. */
  barrier (CLK_LOCAL_MEM_FENCE);
  store_ev = async_work_group_copy (out, (const __local float4 *) x_lm, count, (event_t) 0);
  wait_group_events (1, &store_ev);
}


/*
 * First step of the projection: compute
 *
 *     div = (x[right] - x[left] + y[next row] - y[previous row]) * -0.5 / n
 *
 * for every cell and reset p to zero.
 *
 * A work-group handles get_local_size (0) consecutive float4s of the row
 * get_group_id (1) + _PAD.  It stages the span of x widened by one float4 on
 * each side, plus the same span of y from the next and the previous row.
 *
 * x, y   - input fields (only read).
 * p      - output, set to 0 for the cells handled.
 * div    - output, receives the value of the formula above.
 * n      - grid size without padding (row stride is (n + 2 * _PAD) / _VEC_SIZE float4s).
 * x_lm   - local scratch buffer accessed for get_local_size (0) + 2 float4 elements.
 * y_1_lm, y_2_lm, div_lm, p_lm - local scratch buffers accessed for
 *          get_local_size (0) float4 elements.
 */
__kernel void
project_part1 (__global float4 * x,
           __global float4 * y,
           __global float4 * p,
           __global float4 * div,
           int n,
           __local float4 * x_lm,
           __local float4 * y_1_lm,
           __local float4 * y_2_lm, 
           __local float4 * div_lm, 
           __local float4 * p_lm)
{
  const float4 nfv = n;
  const size_t lid = get_local_id (0);
  const size_t count = get_local_size (0);
  const size_t grp_x = get_group_id (0);
  const size_t grp_y = get_group_id (1);
  const unsigned int rowstride = (n + (_PAD * 2)) / _VEC_SIZE;

  /*  First float4 of this work-group's span, in float4 units */
  const size_t row = grp_y + _PAD;
  const unsigned int base = (row * rowstride) + (grp_x * count) + _PAD / _VEC_SIZE;

  __global float4 *x_blk = x + base;
  __global float4 *y_blk = y + base;
  __global float4 *div_blk = div + base;
  __global float4 *p_blk = p + base;
  event_t load_ev, store_ev;

  /* Stage the inputs; x is fetched with one extra float4 on each side so the
   * left/right neighbours of the outer lanes are available. */
  load_ev = async_work_group_copy (x_lm, (const __global float4 *) (x_blk - 1), count + 2, (event_t) 0);
  async_work_group_copy (y_1_lm, (const __global float4 *) (y_blk + rowstride), count, load_ev);
  async_work_group_copy (y_2_lm, (const __global float4 *) (y_blk - rowstride), count, load_ev);
  wait_group_events (1, &load_ev);

  /* x_lm is shifted by one float4, so this work-item's own cells are at
   * lid + 1, with the neighbouring float4s at lid and lid + 2. */
  const float4 west = x_lm[lid];
  const float4 here = x_lm[lid + 1];
  const float4 east = x_lm[lid + 2];

  const float4 up = y_1_lm[lid];
  const float4 down = y_2_lm[lid];
  /*  Each lane's left / right neighbour: the row shifted by one float */
  const float4 left = (float4) (west.s3, here.s012);
  const float4 right = (float4) (here.s123, east.s0);

  div_lm[lid] = (right - left + up - down) * -0.5f / nfv;
  p_lm[lid] = 0.0f;

  /* Every work-item has to store its results before the work-group-wide
   * copies back to global memory begin. */
  barrier (CLK_LOCAL_MEM_FENCE);
  store_ev = async_work_group_copy (div_blk, (const __local float4 *) div_lm, count, (event_t) 0);
  async_work_group_copy (p_blk, (const __local float4 *) p_lm, count, store_ev);
  wait_group_events (1, &store_ev);
}


/*
 * Second step of the projection: subtract the scaled differences of p from
 * the two fields x and y:
 *
 *     x -= 0.5 * n * (p[right]    - p[left])
 *     y -= 0.5 * n * (p[next row] - p[previous row])
 *
 * A work-group handles get_local_size (0) consecutive float4s of the row
 * get_group_id (1) + _PAD.  It stages the span of p from the next and the
 * previous row, the span of the current row of p widened by one float4 on
 * each side, and the matching spans of x and y.
 *
 * x, y   - fields that are read and updated in place.
 * p      - input field (only read).
 * n      - grid size without padding (row stride is (n + 2 * _PAD) / _VEC_SIZE float4s).
 * p_2_lm - local scratch buffer accessed for get_local_size (0) + 2 float4 elements.
 * p_1_lm, p_3_lm, x_lm, y_lm - local scratch buffers accessed for
 *          get_local_size (0) float4 elements.
 */
__kernel void
project_part2 (__global float4 * x,
           __global float4 * y,
           __global float4 * p,
           int n,
           __local float4 * p_1_lm,
           __local float4 * p_2_lm,
           __local float4 * p_3_lm, 
           __local float4 * x_lm, 
           __local float4 * y_lm)
{
  const float4 nfv = n;
  const size_t lid = get_local_id (0);
  const size_t count = get_local_size (0);
  const size_t grp_x = get_group_id (0);
  const size_t grp_y = get_group_id (1);
  const unsigned int rowstride = (n + (_PAD * 2)) / _VEC_SIZE;

  /*  First float4 of this work-group's span, in float4 units */
  const size_t row = grp_y + _PAD;
  const unsigned int base = (row * rowstride) + (grp_x * count) + _PAD / _VEC_SIZE;

  __global float4 *p_cur = p + base;
  __global float4 *p_next = p_cur + rowstride;
  __global float4 *p_prev = p_cur - rowstride;
  __global float4 *x_blk = x + base;
  __global float4 *y_blk = y + base;
  event_t load_ev, store_ev;

  /* Stage the inputs; the current row of p is fetched with one extra float4
   * on each side so the left/right neighbours of the outer lanes are
   * available. */
  load_ev = async_work_group_copy (p_1_lm, (const __global float4 *) p_next, count, (event_t) 0);
  async_work_group_copy (p_2_lm, (const __global float4 *) (p_cur - 1), count + 2, load_ev);
  async_work_group_copy (p_3_lm, (const __global float4 *) p_prev, count, load_ev);
  async_work_group_copy (x_lm, (const __global float4 *) x_blk, count, load_ev);
  async_work_group_copy (y_lm, (const __global float4 *) y_blk, count, load_ev);
  wait_group_events (1, &load_ev);

  /* p_2_lm is shifted by one float4, so this work-item's own cells are at
   * lid + 1, with the neighbouring float4s at lid and lid + 2. */
  const float4 west = p_2_lm[lid];
  const float4 here = p_2_lm[lid + 1];
  const float4 east = p_2_lm[lid + 2];

  const float4 up = p_1_lm[lid];
  const float4 down = p_3_lm[lid];
  /*  Each lane's left / right neighbour: the row shifted by one float */
  const float4 left = (float4) (west.s3, here.s012);
  const float4 right = (float4) (here.s123, east.s0);

  x_lm[lid] -= 0.5f * nfv * (right - left);
  y_lm[lid] -= 0.5f * nfv * (up - down);

  /* Every work-item has to store its results before the work-group-wide
   * copies back to global memory begin. */
  barrier (CLK_LOCAL_MEM_FENCE);
  store_ev = async_work_group_copy (x_blk, (const __local float4 *) x_lm, count, (event_t) 0);
  async_work_group_copy (y_blk, (const __local float4 *) y_lm, count, store_ev);
  wait_group_events (1, &store_ev);
}



/*
 * Convert the simulation state into packed 8-bit-per-channel pixels.
 *
 * For every cell a colour is built in HSV space and converted to RGB:
 *   - value      = d_in clamped to [0, 1]
 *   - saturation = 0.9
 *   - hue        = 0.667 - clamp (|10 * u_in| + |10 * v_in|, 0, 0.8375),
 *                  with 1 added when the result is negative
 * Each work-item converts four cells (one float4 per input) and produces one
 * uint4 holding four packed pixels; alpha is always 0xFF.
 *
 * A work-group handles get_local_size (0) consecutive float4s of the padded
 * input row get_group_id (1) + _PAD and writes the same number of uint4s to
 * row get_group_id (1) of the unpadded image.
 *
 * d_in, u_in, v_in - input fields, rows of (n + 2 * _PAD) / _VEC_SIZE float4s.
 * img    - output image, rows of n / _VEC_SIZE uint4s.
 * n      - grid size without padding.
 * d_lm, u_lm, v_lm, img_lm - local scratch buffers; each is accessed for
 *          get_local_size (0) elements.
 */
__kernel void
pack_image (__global float4 * d_in,
        __global float4 * u_in,
        __global float4 * v_in,
        __global uint4 * img,
        int n,
        __local float4 * d_lm,
        __local float4 * u_lm, 
        __local float4 * v_lm, 
        __local uint4 * img_lm)
{
  const size_t grp_x = get_group_id (0);
  const size_t grp_y = get_group_id (1);
  const size_t count = get_local_size (0);
  const int lid = get_local_id (0);
  const int row = grp_y + _PAD;
  const unsigned int rowstride = (n + (_PAD * 2)) / _VEC_SIZE;
  const unsigned int img_rowstride = n / _VEC_SIZE;

  /*  First element of this work-group's span in the padded inputs and in the image */
  const unsigned int base = (row * rowstride) + (grp_x * count) + _PAD / _VEC_SIZE;
  const unsigned int img_base = (grp_y * img_rowstride) + (grp_x * count);

  __global float4 *d_blk = d_in + base;
  __global float4 *u_blk = u_in + base;
  __global float4 *v_blk = v_in + base;
  __global uint4 *img_blk = img + img_base;
  event_t load_ev, store_ev;

  /*  Stage the three inputs in local memory; the copies share one event */
  load_ev = async_work_group_copy (d_lm, (const __global float4 *) d_blk, count, (event_t) 0);
  async_work_group_copy (u_lm, (const __global float4 *) u_blk, count, load_ev);
  async_work_group_copy (v_lm, (const __global float4 *) v_blk, count, load_ev);
  wait_group_events (1, &load_ev);

  float4 df = d_lm[lid];
  float4 uf = u_lm[lid];
  float4 vf = v_lm[lid];

  df = clamp (df, 0.0f, 1.0f);
  uf *= 10.0f;
  vf *= 10.0f;
  uf = fabs (uf);
  vf = fabs (vf);
  const float4 vel = clamp (uf + vf, 0.0f, 0.8375f);

  /* Compute HSV and convert to RGB. The hue is derived from the sum of the
   * horizontal and vertical velocity magnitudes, so that low velocities are
   * colored blue and high velocities magenta.
   */
  float4 h = 0.667f - vel;
  h = select (h, 1.0f + h, as_int4 (h));    /*  sign bit set: wrap hue by adding 1 */
  const float4 s = 0.9f;
  const float4 v = df;

  h *= 6.0f;                                /*  sector 0 to 5 */
  const float4 sector = floor (h);
  const float4 f = h - sector;              /*  fractional part of h */

  const float4 p = v * (1.0f - s);
  const float4 q = v * (1.0f - s * f);
  const float4 t = v * (1.0f - s * (1.0f - f));

  /* Pick (r, g, b) per lane according to the hue sector.  Vector comparisons
   * yield all-ones for true lanes and zero for false lanes, so select() takes
   * the second operand exactly in the lanes that belong to the sector. Lanes
   * that match no sector keep the initial 0.
   */
  float4 r = 0.0f;
  float4 g = 0.0f;
  float4 b = 0.0f;

  const int4 in_s0 = (sector == (float4) 0.0f);
  r = select (r, v, in_s0);
  g = select (g, t, in_s0);
  b = select (b, p, in_s0);

  const int4 in_s1 = (sector == (float4) 1.0f);
  r = select (r, q, in_s1);
  g = select (g, v, in_s1);
  b = select (b, p, in_s1);

  const int4 in_s2 = (sector == (float4) 2.0f);
  r = select (r, p, in_s2);
  g = select (g, v, in_s2);
  b = select (b, t, in_s2);

  const int4 in_s3 = (sector == (float4) 3.0f);
  r = select (r, p, in_s3);
  g = select (g, q, in_s3);
  b = select (b, v, in_s3);

  const int4 in_s4 = (sector == (float4) 4.0f);
  r = select (r, t, in_s4);
  g = select (g, p, in_s4);
  b = select (b, v, in_s4);

  const int4 in_s5 = (sector >= (float4) 5.0f);
  r = select (r, v, in_s5);
  g = select (g, p, in_s5);
  b = select (b, q, in_s5);

  /* Convert and clamp the color components and pack them into unsigned byte
   * rgba colors. The 0.0 to 1.0 ranged components are scaled to 0.0 to 2^32
   * and converted to unsigned int with saturation, which leaves the 8-bit
   * color component in the most significant 8 bits of the result. Those bits
   * are then moved to the byte position of the channel, which depends on the
   * endianness of the device.
   */
  const uint4 r_bits = convert_uint4_sat (r * 0x1.0p32f);
  const uint4 g_bits = convert_uint4_sat (g * 0x1.0p32f);
  const uint4 b_bits = convert_uint4_sat (b * 0x1.0p32f);
  uint4 r_int, g_int, b_int, a_int;

#ifdef __ENDIAN_LITTLE__
  r_int = (r_bits             ) >> 24;
  g_int = (g_bits & 0xFF000000) >> 16;
  b_int = (b_bits & 0xFF000000) >>  8;
  a_int = (uint4)(0xFF000000);
#else	/* __ENDIAN_BIG__ */
  r_int = (r_bits & 0xFF000000);
  g_int = (g_bits & 0xFF000000) >>  8;
  b_int = (b_bits & 0xFF000000) >> 16;
  a_int = (uint4)(0x000000FF);
#endif	/* __ENDIAN_LITTLE__ */

  img_lm[lid] = r_int | g_int | b_int | a_int;

  /* Every work-item has to store its pixels before the work-group-wide copy
   * back to global memory begins. */
  barrier (CLK_LOCAL_MEM_FENCE);
  store_ev = async_work_group_copy (img_blk, (const __local uint4 *) img_lm,
                                    count, (event_t) 0);
  wait_group_events (1, &store_ev);
}
