/*************************************************************************/
/*                                                                       */
/* Licensed Materials - Property of IBM                                  */
/*                                                                       */
/* (C) Copyright IBM Corp. 2009, 2010                                    */
/* All Rights Reserved                                                   */
/*                                                                       */
/* US Government Users Restricted Rights - Use, duplication or           */
/* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.     */
/*                                                                       */
/* Inspired by Caltech's Java Applet Fluid Solver at                     */
/* www.multires.caltech.edu/teaching/demos/java/FluidSolver.java         */
/*                                                                       */
/* References:  Visual Simulation of Smoke                               */
/*              R. Fedkiw, J. Stam, H. W. Jensen                         */
/*              SIGGRAPH 2001 Annual Proceedings                         */
/*                                                                       */
/*************************************************************************/


/*  Padding, in scalar elements, on each side of a grid row; the index
 *  macros below also use it as the row offset of the first interior row. */
#define _PAD	4

/*  NOTE: Is, Iv and Im all expand to expressions that reference a variable
 *  named `n`, which must be in scope wherever the macros are used. */

/*  Index for scalar code: flat offset of column _i, row _j in a buffer
 *  whose rows are (n + 2 * _PAD) scalars wide. */
#define Is(_i, _j) 	((_i) + ((n + (_PAD * 2)) * (_j)))

/*  Index for vector code: same layout as Is, but _i and the row stride
 *  are counted in groups of four scalars (one float4). */
#define Iv(_i, _j) 	((_i) + (((n + (_PAD * 2))/4) * (_j)))

/*  Index for vector image code: rows are n / 4 four-element groups wide with
 *  no padding, so the _PAD row offset and the one-group column offset of the
 *  padded grid coordinates are subtracted. */
#define Im(_i, _j)  (((_j)-_PAD) * (n / 4) + ((_i)-1))

/*
 * Fill the four edge lines that surround the n x n interior of x.
 *
 * Work-item g handles position k = g + _PAD along every edge: it copies the
 * adjacent interior cell into the edge cell, negating it on the left/right
 * edges when b == 1 and on the top/bottom edges when b == 2.
 *
 *   b  boundary mode (1: negate across left/right, 2: negate across
 *      top/bottom, anything else: plain copy)
 *   x  padded scalar grid, addressed through Is()
 *   n  interior width/height in cells
 *
 * The four corner cells are not written by this kernel.
 */
__kernel void
set_boundary (int b, __global float * restrict x, int n)
{
  const size_t k = get_global_id (0) + _PAD;

  /* First and last interior column/row. */
  const int first = _PAD;
  const int last = (n - 1) + _PAD;

  /* Interior cells adjacent to each edge at position k. */
  const float near_left = x[Is (first, k)];
  const float near_right = x[Is (last, k)];
  const float near_top = x[Is (k, first)];
  const float near_bottom = x[Is (k, last)];

  if (b == 1)
    {
      x[Is (first - 1, k)] = -near_left;
      x[Is (last + 1, k)] = -near_right;
    }
  else
    {
      x[Is (first - 1, k)] = near_left;
      x[Is (last + 1, k)] = near_right;
    }

  if (b == 2)
    {
      x[Is (k, first - 1)] = -near_top;
      x[Is (k, last + 1)] = -near_bottom;
    }
  else
    {
      x[Is (k, first - 1)] = near_top;
      x[Is (k, last + 1)] = near_bottom;
    }
}


/*
 * Semi-Lagrangian advection: for every cell, trace backwards along the
 * velocity field (du, dv) for one time step and write the bilinearly
 * interpolated value of d0 at that position into d.
 *
 * Each work-item processes one float4, i.e. four horizontally adjacent
 * cells of one row.
 *
 *   d    output grid (float4 view, addressed through Iv())
 *   d0   input grid  (scalar view, addressed through Is())
 *   du   horizontal velocity (float4 view)
 *   dv   vertical velocity   (float4 view)
 *   dt   time step
 *   n    interior width/height in cells
 *
 * Interior cells occupy scalar columns/rows _PAD .. n + _PAD - 1, with the
 * edge cells at _PAD - 1 and n + _PAD (the cells set_boundary writes).  The
 * back-traced position is therefore clamped to
 * [_PAD - 0.5, n + _PAD - 0.5], so that every sample is taken from interior
 * or edge cells and never from the outer padding.
 */
__kernel void
advect (__global float4 * restrict d, __global float * restrict d0, __global float4 * restrict du,
	__global float4 * restrict dv, float dt, int n)
{
  /* i counts float4 groups; the +1 skips the first group of each row, which
   * holds the _PAD (= 4) padding scalars.  j is a scalar row index. */
  const size_t i = get_global_id (0) + 1;
  const size_t j = get_global_id (1) + _PAD;
  const size_t cell = Iv (i, j);

  /* Valid range for back-traced coordinates, in padded grid coordinates. */
  const float4 lo = (float4) (_PAD - 0.5f);
  const float4 hi = (float4) ((n + _PAD) - 0.5f);

  /* Velocities are scaled by n to convert them to cells per time step. */
  const float4 dt0 = (float4) (dt * n);

  /* Scalar column index of each of the four lanes. */
  const int4 iv = (int4) ((int) (i * 4)) + (int4) (0, 1, 2, 3);

  /* Back-trace the cell centres. */
  float4 x = convert_float4 (iv) - dt0 * du[cell];
  float4 y = (float4) ((float) j) - dt0 * dv[cell];

  /* Comparison results are all-ones per lane where true, which is what
   * select() keys on. */
  x = select (x, hi, x > hi);
  x = select (x, lo, x < lo);
  y = select (y, hi, y > hi);
  y = select (y, lo, y < lo);

  /* Surrounding cell indices (coordinates are positive, so truncation is
   * a floor) and interpolation weights. */
  const int4 i0 = convert_int4 (x);
  const int4 i1 = i0 + 1;
  const int4 j0 = convert_int4 (y);
  const int4 j1 = j0 + 1;

  const float4 s1 = x - convert_float4 (i0);
  const float4 s0 = 1.0f - s1;
  const float4 t1 = y - convert_float4 (j0);
  const float4 t0 = 1.0f - t1;

  /* Gather the four corner samples for each lane; the source positions are
   * data dependent, so they are loaded one scalar at a time. */
  const float4 d0i0j0 = (float4) (d0[Is (i0.s0, j0.s0)],
				  d0[Is (i0.s1, j0.s1)],
				  d0[Is (i0.s2, j0.s2)],
				  d0[Is (i0.s3, j0.s3)]);

  const float4 d0i0j1 = (float4) (d0[Is (i0.s0, j1.s0)],
				  d0[Is (i0.s1, j1.s1)],
				  d0[Is (i0.s2, j1.s2)],
				  d0[Is (i0.s3, j1.s3)]);

  const float4 d0i1j0 = (float4) (d0[Is (i1.s0, j0.s0)],
				  d0[Is (i1.s1, j0.s1)],
				  d0[Is (i1.s2, j0.s2)],
				  d0[Is (i1.s3, j0.s3)]);

  const float4 d0i1j1 = (float4) (d0[Is (i1.s0, j1.s0)],
				  d0[Is (i1.s1, j1.s1)],
				  d0[Is (i1.s2, j1.s2)],
				  d0[Is (i1.s3, j1.s3)]);

  /* Bilinear interpolation. */
  d[cell] = s0 * (t0 * d0i0j0 + t1 * d0i0j1) + s1 * (t0 * d0i1j0 + t1 * d0i1j1);
}


/*
 * Accumulate a source term into a field: x[g] = x[g] + dt * x0[g], where g
 * is the work-item's global id in dimension 0.
 *
 *   dt  scale applied to the source
 *   x   field updated in place (float4 view)
 *   x0  source field (float4 view)
 */
__kernel void
add_source (float dt, __global float4 * restrict x, __global float4 * restrict x0)
{
  const size_t gid = get_global_id (0);
  const float4 contribution = dt * x0[gid];

  x[gid] = x[gid] + contribution;
}

/*
 * Accumulate source terms into two fields in a single launch:
 *   x[g] = x[g] + dt * x0[g]
 *   y[g] = y[g] + dt * y0[g]
 * where g is the work-item's global id in dimension 0.
 *
 *   dt      scale applied to both sources
 *   x, y    fields updated in place (float4 views)
 *   x0, y0  corresponding source fields (float4 views)
 */
__kernel void
add_source_dual (float dt,
		 __global float4 * restrict x,
		 __global float4 * restrict x0, __global float4 * restrict y, __global float4 * restrict y0)
{
  const size_t gid = get_global_id (0);

  const float4 x_contribution = dt * x0[gid];
  x[gid] = x[gid] + x_contribution;

  const float4 y_contribution = dt * y0[gid];
  y[gid] = y[gid] + y_contribution;
}


/*
 * One relaxation sweep of the linear solve: every interior cell of x is
 * computed from its four neighbours in xlast (the previous iterate) and the
 * right-hand side x0:
 *
 *   x = (a * (right + left + up + down) + x0) / c
 *
 * Each work-item processes one float4 (four horizontally adjacent cells).
 *
 *   x      output iterate (float4 view, addressed through Iv())
 *   xlast  previous iterate, read only
 *   x0     right-hand side
 *   a, c   solver coefficients
 *   n      interior width/height in cells
 */
__kernel void
linear_solver (__global float4 * restrict x, __global float4 * restrict xlast,
	       __global float4 * restrict x0, float a, float c, int n)
{
  const size_t col = get_global_id (0) + 1;
  const size_t row = get_global_id (1) + _PAD;
  const size_t here = Iv (col, row);

  /* Neighbours in the adjacent rows are aligned float4 loads. */
  const float4 up = xlast[Iv (col, row + 1)];
  const float4 down = xlast[Iv (col, row - 1)];

  /* The horizontal neighbours straddle two float4s, so they are fetched as
   * unaligned loads through a scalar view: starting from the previous
   * float4, an offset of 3 scalars gives the cells shifted one to the left
   * and an offset of 5 scalars gives the cells shifted one to the right. */
  const __global float *scalars = (const __global float *) xlast;
  const size_t prev_group = Iv (col - 1, row);
  const float4 left = vload4 (prev_group, scalars + 3);
  const float4 right = vload4 (prev_group, scalars + 5);

  const float4 neighbour_sum = right + left + up + down;

  x[here] = (a * neighbour_sum + x0[here]) / c;
}


/*
 * First half of the projection step: compute the (scaled) divergence of the
 * velocity field and reset the pressure field.
 *
 *   div = (x_right - x_left + y_up - y_down) * -0.5 / n
 *   p   = 0
 *
 * Each work-item processes one float4 (four horizontally adjacent cells).
 *
 *   x    horizontal velocity (float4 view, addressed through Iv())
 *   y    vertical velocity
 *   p    pressure field, zeroed here
 *   div  divergence output
 *   n    interior width/height in cells
 */
__kernel void
project_part1 (__global float4 * restrict x, __global float4 * restrict y, __global float4 * restrict p,
	       __global float4 * restrict div, int n)
{
  const size_t col = get_global_id (0) + 1;
  const size_t row = get_global_id (1) + _PAD;
  const size_t here = Iv (col, row);
  const float4 nfv = (float4) (n);

  /* Vertical differences use the y component in the adjacent rows. */
  const float4 up = y[Iv (col, row + 1)];
  const float4 down = y[Iv (col, row - 1)];

  /* Horizontal differences use the x component one cell to either side,
   * fetched as unaligned loads relative to the previous float4 (offset 3
   * scalars: one cell left; offset 5 scalars: one cell right). */
  const __global float *scalars = (const __global float *) x;
  const size_t prev_group = Iv (col - 1, row);
  const float4 left = vload4 (prev_group, scalars + 3);
  const float4 right = vload4 (prev_group, scalars + 5);

  const float4 difference = right - left + up - down;

  div[here] = difference * -0.5f / nfv;
  p[here] = (float4) (0.0f);
}


/*
 * Second half of the projection step: subtract the pressure gradient from
 * the velocity field.
 *
 *   x -= 0.5 * n * (p_right - p_left)
 *   y -= 0.5 * n * (p_up    - p_down)
 *
 * Each work-item processes one float4 (four horizontally adjacent cells).
 *
 *   x  horizontal velocity, updated in place (float4 view, via Iv())
 *   y  vertical velocity, updated in place
 *   p  pressure field, read only
 *   n  interior width/height in cells
 */
__kernel void
project_part2 (__global float4 * restrict x, __global float4 * restrict y, __global float4 * restrict p, int n)
{
  const size_t col = get_global_id (0) + 1;
  const size_t row = get_global_id (1) + _PAD;
  const size_t here = Iv (col, row);
  const float4 nfv = (float4) (n);

  /* Pressure in the adjacent rows. */
  const float4 up = p[Iv (col, row + 1)];
  const float4 down = p[Iv (col, row - 1)];

  /* Pressure one cell to either side, fetched as unaligned loads relative
   * to the previous float4 (offset 3 scalars: one cell left; offset 5
   * scalars: one cell right). */
  const __global float *scalars = (const __global float *) p;
  const size_t prev_group = Iv (col - 1, row);
  const float4 left = vload4 (prev_group, scalars + 3);
  const float4 right = vload4 (prev_group, scalars + 5);

  const float4 half_n = 0.5f * nfv;

  x[here] = x[here] - half_n * (right - left);
  y[here] = y[here] - half_n * (up - down);
}


/*
 * Convert the simulation state into packed 8-bit-per-channel pixels.
 *
 * Each work-item converts one float4 (four horizontally adjacent cells) into
 * one uint4 (four pixels).  The colour is built in HSV and converted to RGB:
 *   - hue comes from the clamped sum |10 u| + |10 v|, so slow cells are blue
 *     and fast cells tend towards magenta,
 *   - saturation is fixed at 0.9,
 *   - value is the density clamped to [0, 1].
 *
 *   d_in  density        (float4 view, addressed through Iv())
 *   u_in  horizontal velocity
 *   v_in  vertical velocity
 *   img   output pixels, unpadded, addressed through Im()
 *   n     interior width/height in cells
 */
__kernel void
pack_image (__global float4 * restrict d_in, __global float4 * restrict u_in,
	    __global float4 * restrict v_in, __global uint4 * restrict img, int n)
{
  const int i = (int) get_global_id (0) + 1;
  const int j = (int) get_global_id (1) + _PAD;

  const float4 density = clamp (d_in[Iv (i, j)], 0.0f, 1.0f);
  const float4 abs_u = fabs (u_in[Iv (i, j)] * 10.0f);
  const float4 abs_v = fabs (v_in[Iv (i, j)] * 10.0f);
  const float4 speed = clamp (abs_u + abs_v, 0.0f, 0.8375f);

  /* Hue starts at 0.667 and decreases with speed.  A negative hue is
   * wrapped by adding 1; the float's own sign bit serves as the select
   * mask. */
  float4 hue = 0.667f - speed;
  hue = select (hue, 1.0f + hue, as_int4 (hue));

  const float4 saturation = (float4) (0.9f);
  const float4 value = density;

  /* Scale the hue to the six sectors of the colour wheel and split it into
   * the sector number and the fractional position within that sector. */
  hue *= 6.0f;
  const float4 sector = floor (hue);
  const float4 fraction = hue - sector;

  const float4 p = value * (1.0f - saturation);
  const float4 q = value * (1.0f - saturation * fraction);
  const float4 t = value * (1.0f - saturation * (1.0f - fraction));

  /* Per-lane sector masks: all ones where the comparison holds. */
  const int4 in_sector0 = (sector == (float4) 0.0f);
  const int4 in_sector1 = (sector == (float4) 1.0f);
  const int4 in_sector2 = (sector == (float4) 2.0f);
  const int4 in_sector3 = (sector == (float4) 3.0f);
  const int4 in_sector4 = (sector == (float4) 4.0f);
  const int4 in_sector5 = (sector >= (float4) 5.0f);

  /* Choose the (r, g, b) permutation of (value, p, q, t) for each sector. */
  float4 red = 0.0f;
  red = select (red, value, in_sector0);
  red = select (red, q, in_sector1);
  red = select (red, p, in_sector2);
  red = select (red, p, in_sector3);
  red = select (red, t, in_sector4);
  red = select (red, value, in_sector5);

  float4 green = 0.0f;
  green = select (green, t, in_sector0);
  green = select (green, value, in_sector1);
  green = select (green, value, in_sector2);
  green = select (green, q, in_sector3);
  green = select (green, p, in_sector4);
  green = select (green, p, in_sector5);

  float4 blue = 0.0f;
  blue = select (blue, p, in_sector0);
  blue = select (blue, p, in_sector1);
  blue = select (blue, t, in_sector2);
  blue = select (blue, value, in_sector3);
  blue = select (blue, value, in_sector4);
  blue = select (blue, q, in_sector5);

  /* Scale each channel to the full 32-bit range with saturation; the top
   * byte of the result is the 8-bit channel value. */
  const uint4 red_bits = convert_uint4_sat (red * 0x1.0p32f);
  const uint4 green_bits = convert_uint4_sat (green * 0x1.0p32f);
  const uint4 blue_bits = convert_uint4_sat (blue * 0x1.0p32f);

  /* Pack the channels into one 32-bit pixel, swapping red and blue.  The
   * byte positions depend on the device endianness; the fourth byte is set
   * to 0xFF (presumably opaque alpha). */
#ifdef __ENDIAN_LITTLE__
  const uint4 red_byte = red_bits >> 24;
  const uint4 green_byte = (green_bits & 0xFF000000) >> 16;
  const uint4 blue_byte = (blue_bits & 0xFF000000) >> 8;
  const uint4 fourth_byte = (uint4) (0xFF000000);
#else	/* __ENDIAN_BIG__ */
  const uint4 red_byte = (red_bits & 0xFF000000);
  const uint4 green_byte = (green_bits & 0xFF000000) >> 8;
  const uint4 blue_byte = (blue_bits & 0xFF000000) >> 16;
  const uint4 fourth_byte = (uint4) (0x000000FF);
#endif	/* __ENDIAN_LITTLE__ */

  img[Im (i, j)] = red_byte | green_byte | blue_byte | fourth_byte;
}
