/*************************************************************************/
/*                                                                       */
/* Licensed Materials - Property of IBM                                  */
/*                                                                       */
/*                                                                       */
/* (C) Copyright IBM Corp. 2009, 2010                                    */
/* All Rights Reserved                                                   */
/*                                                                       */
/* US Government Users Restricted Rights - Use, duplication or           */
/* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.     */
/*                                                                       */
/*************************************************************************/

// The kernel to be selected is a function of the defines specified when
// the program is built (see clBuildProgram in the bsop.c code for more info).

#ifdef _SINGLE_

// Single-precision constants used by the core BlackScholes computation.
// A1..A5 and NCDF are the coefficients of the polynomial approximation of the
// cumulative normal distribution evaluated in COMPUTATIONAL_CORE;
// INV_ROOT2PI is 1/sqrt(2*pi).

#define ZERO        0.0f
#define ONE         1.0f
#define HALF        0.5f
#define A1          0.319381530f
#define A2         -0.356563782f
#define A3          1.781477937f
#define A4         -1.821255978f
#define A5          1.330274429f
#define INV_ROOT2PI 0.39894228f
#define NCDF        0.2316419f

// SIZE selects the vector width: 1 (scalar), 2, 4, 8, or 16.
// For each width the following names are defined:
//   FLOAT       floating-point type the kernels operate on
//   FIXED       unsigned integer type with the same lane count and lane size
//   SFIXED      signed integer type with the same lane count and lane size
//   STRIDESHIFT log2(sizeof(FLOAT)); the task kernels shift their stride
//               argument right by this amount
//   SELECT(_a, _b, _c)
//               yields _b where _c is set and _a otherwise.  The vector forms
//               use bitselect(), which picks bit-by-bit, so each lane of _c
//               must be all ones or all zeros.

#if SIZE==1
#define FLOAT float
#define FIXED uint
#define SFIXED int
#define STRIDESHIFT 2
// Arguments and result are fully parenthesized so that SELECT can be used
// safely with arbitrary expressions.
#define SELECT(_a, _b, _c) ((_c) ? (_b) : (_a))
#elif SIZE==2
#define FLOAT float2
#define FIXED uint2
#define SFIXED int2
#define STRIDESHIFT 3
#define SELECT(_a, _b, _c) bitselect(_a, _b, as_float2(_c))
#elif SIZE==4
#define FLOAT float4
#define FIXED uint4
#define SFIXED int4
#define STRIDESHIFT 4
#define SELECT(_a, _b, _c) bitselect(_a, _b, as_float4(_c))
#elif SIZE==8
#define FLOAT float8
#define FIXED uint8
#define SFIXED int8
#define STRIDESHIFT 5
#define SELECT(_a, _b, _c) bitselect(_a, _b, as_float8(_c))
#elif SIZE==16
#define FLOAT float16
#define FIXED uint16
#define SFIXED int16
#define STRIDESHIFT 6
#define SELECT(_a, _b, _c) bitselect(_a, _b, as_float16(_c))
#else
// Fail here with a clear message instead of later with "unknown type FLOAT".
#error Unsupported SIZE for _SINGLE_ build: expected 1, 2, 4, 8, or 16
#endif

#endif // _SINGLE_

#ifdef _DOUBLE_
#pragma OPENCL EXTENSION cl_khr_fp64: enable
// NOTE(review): presumably the vendor-specific alternative for platforms that
// only expose cl_amd_fp64 -- confirm before enabling.
//#pragma OPENCL EXTENSION cl_amd_fp64: enable

// Double-precision constants used by the core BlackScholes computation.
// A1..A5 and NCDF are the coefficients of the polynomial approximation of the
// cumulative normal distribution evaluated in COMPUTATIONAL_CORE;
// INV_ROOT2PI is 1/sqrt(2*pi).

#define ZERO        0.0
#define ONE         1.0
#define HALF        0.5
#define A1          0.319381530
#define A2         -0.356563782
#define A3          1.781477937
#define A4         -1.821255978
#define A5          1.330274429
#define INV_ROOT2PI 0.39894228
#define NCDF        0.2316419

// SIZE selects the vector width: 1 (scalar), 2, 4, 8, or 16.
// For each width the following names are defined:
//   FLOAT       floating-point type the kernels operate on
//   FIXED       unsigned integer type with the same lane count and lane size
//   SFIXED      signed integer type with the same lane count and lane size
//   STRIDESHIFT log2(sizeof(FLOAT)); the task kernels shift their stride
//               argument right by this amount
//   SELECT(_a, _b, _c)
//               yields _b where _c is set and _a otherwise.  The vector forms
//               use bitselect(), which picks bit-by-bit, so each lane of _c
//               must be all ones or all zeros.

#if SIZE==1
#define FLOAT double
#define FIXED ulong
#define SFIXED long
#define STRIDESHIFT 3
// The whole expansion and every argument are parenthesized; a bare
// "_c ? _b : _a" would bind incorrectly inside a larger expression.
#define SELECT(_a, _b, _c) ((_c) ? (_b) : (_a))
#elif SIZE==2
#define FLOAT double2
#define FIXED ulong2
#define SFIXED long2
#define STRIDESHIFT 4
#define SELECT(_a, _b, _c) bitselect(_a, _b, as_double2(_c))
#elif SIZE==4
#define FLOAT double4
#define FIXED ulong4
#define SFIXED long4
#define STRIDESHIFT 5
#define SELECT(_a, _b, _c) bitselect(_a, _b, as_double4(_c))
#elif SIZE==8
#define FLOAT double8
#define FIXED ulong8
#define SFIXED long8
#define STRIDESHIFT 6
#define SELECT(_a, _b, _c) bitselect(_a, _b, as_double8(_c))
#elif SIZE==16
#define FLOAT double16
#define FIXED ulong16
#define SFIXED long16
#define STRIDESHIFT 7
#define SELECT(_a, _b, _c) bitselect(_a, _b, as_double16(_c))
#else
// Fail here with a clear message instead of later with "unknown type FLOAT".
#error Unsupported SIZE for _DOUBLE_ build: expected 1, 2, 4, 8, or 16
#endif

#endif // _DOUBLE_

/* Default math primitives used by the BlackScholes core.
 *
 * These map onto the standard built-in functions.  If fast_math is enabled
 * (USE_FAST_MATH), some of them are replaced further down by native or
 * estimate-based versions for each of the supported platforms.  That "fast"
 * optimization is okay for this code example's error tolerance and input set.
 *
 * Every macro argument is parenthesized so that an arbitrary expression can
 * be passed, e.g. DIVIDE(a, b + c) divides by (b + c).
 */
#define SQRT(_x) sqrt(_x)
#define LOG(_x) log(_x)
#define EXP(_x) exp(_x)
#define RECIP(_x) (1.0f/(_x))
#define DIVIDE(_x,_y) ((_x)/(_y))

/* XN:  native reciprocal estimate of _x.
 * XN1: that estimate refined by one Newton-Raphson step, e * (2 - x * e).
 */
#define XN(_x) (native_recip (_x))
#define XN1(_x) ((XN(_x)) * (2.0f - ((_x) * (XN(_x)))))

/* Platform-specific fast-math overrides.  They apply only to single-precision
 * builds with USE_FAST_MATH defined; otherwise the default definitions of
 * SQRT/LOG/EXP/RECIP/DIVIDE stay in effect.
 */
#if defined(USE_FAST_MATH) && defined(_SINGLE_)
 #if defined(__SPU__)
  /* On SPU device and single precision: 
   * 1) use native functions which support the extended precision float
   * 2) reciprocal is a native reciprocal estimate and 2 Newton-Raphson iterations
   *    (XN1 supplies the first iteration, RECIP applies the second).
   * 3) divide is x * 1/y
   */
  #undef SQRT
  #undef LOG
  #undef EXP
  #undef RECIP
  #undef DIVIDE
  #define SQRT(_x) native_sqrt(_x)
  #define LOG(_x) native_log(_x)
  #define EXP(_x) native_exp(_x)
  #define RECIP(_x) ((XN1(_x)) * (2.0f - ((_x) * (XN1(_x)))))
  /* _x is parenthesized so that a sum or difference passed as the numerator
   * is scaled as a whole. */
  #define DIVIDE(_x,_y) ((_x) * RECIP(_y))
 #elif defined(_ARCH_PWR7)
  /* On Power7 device and single precision: use default functions.
  */
 #elif (defined(_ARCH_PWR6) || defined(__PPU__))
  /* On Power6 or PPU devices and single precision: 
   * 1) reciprocal is a native reciprocal estimate and 2 Newton-Raphson iterations
   *    (XN1 supplies the first iteration, RECIP applies the second).
   * 2) divide is x * 1/y
   */
  #undef RECIP
  #undef DIVIDE
  #define RECIP(_x) ((XN1(_x)) * (2.0f - ((_x) * (XN1(_x)))))
  /* _x is parenthesized so that a sum or difference passed as the numerator
   * is scaled as a whole. */
  #define DIVIDE(_x,_y) ((_x) * RECIP(_y))
 #endif
#endif


//===================================================================================================
// Here is the core of the BlackScholes computation.
// This helper function is called from each of the five kernels that follow.
//===================================================================================================
// Prices one option (or one vector of options, depending on FLOAT).
//
//   cpflag  selects the result: where it is set the call price is returned,
//           otherwise the put price (see SELECT)
//   S0, K   the two operands of the log(S0/K) term; K is also the factor that
//           gets discounted by exp(-r*T)
//   r       rate used in the drift term and in the discount factor
//   sigma   volatility term
//   T       time term
//
// Returns the call or put value chosen by cpflag.
FLOAT COMPUTATIONAL_CORE(FIXED cpflag, FLOAT S0, FLOAT K, FLOAT r,
                                  FLOAT sigma, FLOAT T)
{
  FLOAT x1, x2, discount;
  FLOAT t1, t2, poly1, poly2, pdf1, pdf2;
  FLOAT upper1, upper2, cdf1, cdf2;
  FLOAT call_price, put_price;
  SFIXED neg1, neg2;

  // x1 = (ln(S0/K) + (r + sigma^2/2) * T) / (sigma * sqrt(T))
  // x2 = x1 - sigma * sqrt(T)
  x1 = LOG(DIVIDE(S0,K)) + (r + HALF * sigma * sigma) * T;
  x1 = DIVIDE (x1, (sigma * SQRT(T)));
  x2 = x1 - sigma * SQRT(T);

  // discount factor exp(-r * T)
  discount = EXP(ZERO - r * T);

  // Remember which arguments were negative, then continue with magnitudes;
  // the sign is folded back in through SELECT below.
  neg1 = (x1 < ZERO);
  neg2 = (x2 < ZERO);
  x1 = fabs(x1);
  x2 = fabs(x2);

  // Cumulative normal of |x1|: polynomial in t1 = 1 / (1 + NCDF * |x1|),
  // evaluated by Horner's rule, times the normal density.
  t1 = RECIP(ONE + NCDF * x1);
  poly1 = A4 + A5 * t1;
  poly1 = t1 * poly1 + A3;
  poly1 = t1 * poly1 + A2;
  poly1 = t1 * poly1 + A1;
  poly1 = t1 * poly1;
  pdf1 = EXP(ZERO - HALF * x1 * x1);
  pdf1 *= INV_ROOT2PI;
  upper1 = ONE - pdf1 * poly1;
  // for a negative argument use 1 - N(|x|)
  cdf1 = SELECT(upper1, (ONE - upper1), neg1);

  // Same evaluation for |x2|.
  t2 = RECIP(ONE + NCDF * x2);
  poly2 = A4 + A5 * t2;
  poly2 = t2 * poly2 + A3;
  poly2 = t2 * poly2 + A2;
  poly2 = t2 * poly2 + A1;
  poly2 = t2 * poly2;
  pdf2 = EXP(ZERO - HALF * x2 * x2);
  pdf2 *= INV_ROOT2PI;
  upper2 = ONE - pdf2 * poly2;
  cdf2 = SELECT(upper2, (ONE - upper2), neg2);

  // Both prices are always computed; cpflag picks the one to return.
  call_price = S0 * cdf1 - K * discount * cdf2;
  put_price = K * discount * (ONE - cdf2) - S0 * (ONE - cdf1);
  return SELECT(put_price, call_price, cpflag);
}

//===================================================================================================
// First kernel: this "NDRange" kernel is designed to be instantiated many times in parallel.
// It uses the simplest form of memory movement: basic load/store.  ("dm" stands for "device memory")
//===================================================================================================

#ifdef RANGE_LOAD_STORE
/*
 * NDRange kernel using plain loads and stores: each work-item reads element
 * [global id] of every input array, evaluates COMPUTATIONAL_CORE on it, and
 * stores the result to the same index of dm_answer.
 *
 *   dm_cpflag                          call/put selector per element
 *   dm_S0, dm_K, dm_r, dm_sigma, dm_T  inputs to COMPUTATIONAL_CORE
 *   dm_answer                          output, one FLOAT per work-item
 */
__kernel void bsop_kernel(__global FIXED * restrict dm_cpflag,
                 __global FLOAT * restrict dm_S0,
                 __global FLOAT * restrict dm_K,
                 __global FLOAT * restrict dm_r,
                 __global FLOAT * restrict dm_sigma,
                 __global FLOAT * restrict dm_T, 
                 __global FLOAT * restrict dm_answer
    )
{
    // get_global_id() returns size_t; keep the full width so the index is not
    // truncated when the global work size does not fit in 32 bits.
    const size_t tid = get_global_id(0);

    dm_answer[tid] =
        COMPUTATIONAL_CORE(dm_cpflag[tid], dm_S0[tid], dm_K[tid], dm_r[tid],
                           dm_sigma[tid], dm_T[tid]);
}
#endif // RANGE_LOAD_STORE

//===================================================================================================
// Second kernel: another "NDRange" kernel, expected to be instantiated lots of times.
// It uses the "async_work_group_copy" function to move data between device memory and local memory.
//===================================================================================================

#ifdef RANGE_ASYNC_WORKGROUP_COPY
/*
 * NDRange kernel that stages data through local memory.  Each work-group
 * copies its slice of the six input arrays (local_size elements starting at
 * local_size * group id) into the lm_* buffers with async_work_group_copy,
 * every work-item computes one element, and the group then copies the
 * lm_answer buffer back to answer_dm.
 *
 *   *_dm        arrays in device (global) memory
 *   lm_*        local-memory staging buffers; each is accessed at indices
 *               0 .. local_size-1
 */
__kernel void bsop_kernel(__global const FIXED * cpflag_dm,
                 __global const FLOAT * S0_dm,
                 __global const FLOAT * K_dm,
                 __global const FLOAT * r_dm,
                 __global const FLOAT * sigma_dm,
                 __global const FLOAT * T_dm,
                 __global FLOAT * answer_dm,
                 __local FIXED * lm_cpflag,
                 __local FLOAT * lm_S0,
                 __local FLOAT * lm_K,
                 __local FLOAT * lm_r,
                 __local FLOAT * lm_sigma,
                 __local FLOAT * lm_T, __local FLOAT * lm_answer
    )
{
  const size_t wg_id = get_group_id(0);
  const size_t local_id = get_local_id(0);
  const size_t local_size = get_local_size(0);
  // Element offset of this work-group's slice.  Kept as size_t: the product
  // of two size_t values must not be narrowed to int.
  const size_t offset = local_size * wg_id;
  event_t event;

    // Start all six input copies; passing the previous event back in lets
    // them share one event, so a single wait covers all of them.
    event =
        async_work_group_copy(lm_cpflag, (cpflag_dm + offset), local_size,
                              (event_t) 0);
    event = async_work_group_copy(lm_S0, (S0_dm + offset), local_size, event);
    event = async_work_group_copy(lm_K, (K_dm + offset), local_size, event);
    event = async_work_group_copy(lm_r, (r_dm + offset), local_size, event);
    event =
        async_work_group_copy(lm_sigma, (sigma_dm + offset), local_size, event);
    event = async_work_group_copy(lm_T, (T_dm + offset), local_size, event);
    wait_group_events(1, &event);

    lm_answer[local_id] =
        COMPUTATIONAL_CORE(lm_cpflag[local_id], lm_S0[local_id], lm_K[local_id],
                           lm_r[local_id], lm_sigma[local_id], lm_T[local_id]);

    // before proceeding to write back data to device memory, we wait until everyone is here...
    barrier(CLK_LOCAL_MEM_FENCE);

    event =
        async_work_group_copy((answer_dm + offset),
                              (const __local FLOAT *) lm_answer,
                              local_size, (event_t) 0);
    wait_group_events(1, &event);
}
#endif // RANGE_ASYNC_WORKGROUP_COPY

//===================================================================================================
// Third kernel: This task kernel is the most complicated of the five, as it handles the whole array in 
// very few instantiations.  Each instance handles a large chunk of the array by moving data around in 
// a double-buffered fashion, using "async_work_group_copy" functions.  ("lm" stands for "local memory").
//===================================================================================================

#ifdef TASK_DOUBLE_BUFFER
/*
 * We are using __attribute ((reqd_work_group_size(1,1,1))) to give hints to the
 * compiler that this kernel will not run with workgroup size bigger than
 * (1, 1, 1)  
 *
 * Task kernel with double buffering.  The task handles elements
 * [n * task_id, n * (task_id + 1)) of the global arrays in blocks of
 * "stride" elements.  While the block in local buffer half [tag] is being
 * computed, the next block is already being copied into half [1 - tag].
 *
 *   *_dm      arrays in device (global) memory, shared by all tasks
 *   task_id   index of this task; used with n to find the task's chunk
 *   lm_*      local input buffers; accessed at indices 0 .. 2*stride-1
 *             (two halves of stride elements each, stride after the shift below)
 *   answer    local output buffer, accessed the same way as the lm_* buffers
 *   n         number of elements handled by each task
 *   stride    block size; it is shifted right by STRIDESHIFT before use.
 *             NOTE(review): presumably passed in bytes and converted to a
 *             FLOAT element count here -- confirm against the host code.
 *
 * NOTE(review): every copy moves a full block of stride elements, so n is
 * presumably expected to be a multiple of stride -- confirm against the host.
 */
__kernel __attribute__ ((reqd_work_group_size(1, 1, 1)))
void bsop_kernel(__global const FIXED * cpflag_dm,
                 __global const FLOAT * S0_dm,
                 __global const FLOAT * K_dm,
                 __global const FLOAT * r_dm,
                 __global const FLOAT * sigma_dm,
                 __global const FLOAT * T_dm,
                 __global FLOAT * answer_dm,
                 int task_id,
                 __local FIXED * lm_cpflag,
                 __local FLOAT * lm_S0,
                 __local FLOAT * lm_K,
                 __local FLOAT * lm_r,
                 __local FLOAT * lm_sigma,
                 __local FLOAT * lm_T,
                 __local FLOAT * answer, int n, unsigned int stride
    )
{
  int i, j, tag;
  // event[b] tracks the outstanding copies that involve buffer half b
  event_t event[2];
  stride >>= STRIDESHIFT;

    // tag is the buffer half (0 or 1) that is processed next
    tag = 0;
    // some comments about double-buffering:
    // the idea is to have an even/odd buffer pair, so that we can read in the odd buffer's data at the same time
    // as we process the even buffer's data, and vice versa.

    // some comments about "task_id"
    // All active tasks are given the same addresses for the seven arrays.
    // Each task must compute its own offset into these arrays.
    // we use "task_id" and "n" to do this.

    // here we read in the even buffers' data:
    // (each call passes the previous event back in, so all six copies share event[0])
    event[0] =
        async_work_group_copy(lm_cpflag, cpflag_dm + n * task_id, stride,
                              (event_t) 0);
    event[0] =
        async_work_group_copy(lm_S0, S0_dm + n * task_id, stride, event[0]);
    event[0] =
        async_work_group_copy(lm_K, K_dm + n * task_id, stride, event[0]);
    event[0] =
        async_work_group_copy(lm_r, r_dm + n * task_id, stride, event[0]);
    event[0] =
        async_work_group_copy(lm_sigma, sigma_dm + n * task_id, stride,
                              event[0]);
    event[0] =
        async_work_group_copy(lm_T, T_dm + n * task_id, stride, event[0]);
    // nothing is outstanding for the odd half yet
    event[1] = 0;

    // j is the global element index of the block currently being processed
    for (j = n * task_id; j < n * (task_id + 1); j += stride) {

      // Note that 'tag' toggles between 0 and 1, so that as we prepare to process buffer[tag] we can start the read on [1-tag]:
      // (skipped on the last block, where there is nothing left to prefetch)
      if (j < n * (task_id + 1) - stride) {
        // event[1 - tag] is passed in, so these reads share the event of the
        // last write-back issued from half [1 - tag] (0 on the first pass)
        event[1 - tag] =
            async_work_group_copy(lm_S0 + ((1 - tag) * stride),
                                  (S0_dm + j + stride), stride, event[1 - tag]);
        event[1 - tag] =
            async_work_group_copy(lm_cpflag + ((1 - tag) * stride),
                                  (cpflag_dm + j + stride), stride,
                                  event[1 - tag]);
        event[1 - tag] =
            async_work_group_copy(lm_K + ((1 - tag) * stride),
                                  (K_dm + j + stride), stride, event[1 - tag]);
        event[1 - tag] =
            async_work_group_copy(lm_r + ((1 - tag) * stride),
                                  (r_dm + j + stride), stride, event[1 - tag]);
        event[1 - tag] =
            async_work_group_copy(lm_sigma + ((1 - tag) * stride),
                                  (sigma_dm + j + stride), stride,
                                  event[1 - tag]);
        event[1 - tag] =
            async_work_group_copy(lm_T + ((1 - tag) * stride),
                                  (T_dm + j + stride), stride, event[1 - tag]);
      }
      // Before processing [tag] we need to be sure it's all present, *and* we need to be sure that the corresponding write operation
      // coded as [tag] has been completed:
      wait_group_events(1, &event[tag]);

      // compute the block held in half [tag]; ii indexes into that half
      for (i = 0; i < stride; i++) {
        int ii = tag * stride + i;
        answer[ii] =
            COMPUTATIONAL_CORE(lm_cpflag[ii], lm_S0[ii], lm_K[ii], lm_r[ii],
                               lm_sigma[ii], lm_T[ii]);
      }

      // Move calculated data out to global
      // (a fresh event: event[tag] now tracks this write-back)
      event[tag] =
          async_work_group_copy((answer_dm + j),
                                (__local const FLOAT *) (answer +
                                                         (tag * stride)),
                                stride, (event_t) 0);

      // here is where we toggle the value of 'tag'
      tag = 1 - tag;
    }

    // before returning, wait for the last write operations to finish:
    // event[1 - tag] is the write-back of the final block.  event[tag] holds
    // the write-back of the block before it only if the loop ran at least
    // twice, which is what the test on j checks.
    if (j > (n * task_id + stride))
      wait_group_events(1, &event[tag]);
    wait_group_events(1, &event[1 - tag]);
}
#endif // TASK_DOUBLE_BUFFER

//===================================================================================================
// Fourth kernel: this task differs from the previous one only in that it does *not* use double-buffering.
// We include it to demonstrate performance differences to the user.
//===================================================================================================

#ifdef TASK_SINGLE_BUFFER
/*
 * reqd_work_group_size(1, 1, 1) tells the compiler that this kernel never
 * runs with a work-group larger than (1, 1, 1).
 *
 * Task kernel without double buffering.  The task walks over elements
 * [n * task_id, n * (task_id + 1)) in blocks of "stride" elements; for each
 * block it copies the inputs into local memory, waits, computes, and starts
 * the copy of the results back to device memory.
 *
 *   *_dm      arrays in device (global) memory, shared by all tasks
 *   task_id   index of this task; used with n to find the task's chunk
 *   lm_*      local input buffers, accessed at indices 0 .. stride-1
 *   answer    local output buffer, accessed at indices 0 .. stride-1
 *   n         number of elements handled by each task
 *   stride    block size; shifted right by STRIDESHIFT before use
 */
__kernel __attribute__ ((reqd_work_group_size(1, 1, 1)))
void bsop_kernel(__global const FIXED * cpflag_dm,
                 __global const FLOAT * S0_dm,
                 __global const FLOAT * K_dm,
                 __global const FLOAT * r_dm,
                 __global const FLOAT * sigma_dm,
                 __global const FLOAT * T_dm,
                 __global FLOAT * answer_dm,
                 int task_id,
                 __local FIXED * lm_cpflag,
                 __local FLOAT * lm_S0,
                 __local FLOAT * lm_K,
                 __local FLOAT * lm_r,
                 __local FLOAT * lm_sigma,
                 __local FLOAT * lm_T,
                 __local FLOAT * answer, int n, unsigned int stride
    )
{
  // All tasks receive the same array addresses; task_id and n locate the
  // chunk that belongs to this task.
  const int chunk_end = n * (task_id + 1);
  int base, k;
  event_t xfer;

  stride >>= STRIDESHIFT;
  xfer = 0;

    base = n * task_id;
    while (base < chunk_end) {

      // Global -> local.  xfer is passed back in on every call, so the six
      // reads share one event with the write-back of the previous block.
      xfer = async_work_group_copy(lm_cpflag, (cpflag_dm + base), stride, xfer);
      xfer = async_work_group_copy(lm_S0, (S0_dm + base), stride, xfer);
      xfer = async_work_group_copy(lm_K, (K_dm + base), stride, xfer);
      xfer = async_work_group_copy(lm_r, (r_dm + base), stride, xfer);
      xfer = async_work_group_copy(lm_sigma, (sigma_dm + base), stride, xfer);
      xfer = async_work_group_copy(lm_T, (T_dm + base), stride, xfer);

      wait_group_events(1, &xfer);

      // Compute the whole block in local memory.
      for (k = 0; k < stride; k++) {
        answer[k] =
            COMPUTATIONAL_CORE(lm_cpflag[k], lm_S0[k], lm_K[k], lm_r[k],
                               lm_sigma[k], lm_T[k]);
      }

      // Local -> global; completion is awaited at the top of the next pass
      // or after the loop.
      xfer =
          async_work_group_copy((answer_dm + base),
                                (const __local FLOAT *) answer, stride,
                                (event_t) 0);

      base += stride;
    }

    // The last write-back must finish before the kernel returns.
    wait_group_events(1, &xfer);
}
#endif // TASK_SINGLE_BUFFER

//===================================================================================================
// Fifth kernel: again, a task kernel, which handles a large section of the array, but we
// return to the simplest form of data movement, the load/store operations.
//===================================================================================================

#ifdef TASK_LOAD_STORE
/*
 * reqd_work_group_size(1, 1, 1) tells the compiler that this kernel never
 * runs with a work-group larger than (1, 1, 1).
 *
 * Task kernel using plain loads and stores: the task evaluates
 * COMPUTATIONAL_CORE for every element in [n * task_id, n * (task_id + 1))
 * directly on the global arrays.
 *
 *   *_dm      arrays in device (global) memory, shared by all tasks
 *   task_id   index of this task
 *   n         number of elements handled by each task
 */
__kernel __attribute__ ((reqd_work_group_size(1, 1, 1)))
void bsop_kernel(__global const FIXED * restrict cpflag_dm,
                 __global const FLOAT * restrict S0_dm,
                 __global const FLOAT * restrict K_dm,
                 __global const FLOAT * restrict r_dm,
                 __global const FLOAT * restrict sigma_dm,
                 __global const FLOAT * restrict T_dm,
                 __global FLOAT * restrict answer_dm, int task_id, int n
    )
{
    const int last = n * (task_id + 1);   // one past this task's final element
    int idx = n * task_id;                // first element of this task's chunk

    while (idx < last) {
      answer_dm[idx] =
          COMPUTATIONAL_CORE(cpflag_dm[idx], S0_dm[idx], K_dm[idx], r_dm[idx],
                             sigma_dm[idx], T_dm[idx]);
      idx++;
    }
}
#endif // TASK_LOAD_STORE
