/* -*- mode: c++ -*- */

// Linear Congruential Generator
uint stepLCG(uint *z, uint A, uint C){
  return (*z) = (A * (*z) + C);
}

__kernel
void randomLCG(const int randomNumbers, __global float* randomsSeed, __global float* randomGPU){
  int id = get_global_id(0);
  int maxID = get_global_size(0);

  uint rng = randomsSeed[id];
  for(int i=0; i < randomNumbers; ++i){
    randomGPU[id + i * maxID] =  (float)stepLCG(&rng, 1664525, 1013904223UL) / 0xffffffff;
  }
}

uint stepLFG(uint *z, __global uint *znmk, uint A, uint C){
  return (*znmk) = (*z) = (A * (*z) + C) + (*znmk);
}

// Lagged Fibonacci Generator
__kernel
void randomLFG(const int randomNumbers, __global float* randomsSeed, const int randomStateSize, __global uint* randomState, __global float* randomGPU){
  int id = get_global_id(0);
  int maxID = get_global_size(0);

  // bootstrap
  uint rng = randomsSeed[id];
  for(int i=0; i < randomStateSize; ++i){
    randomState[id + i * maxID] =  stepLCG(&rng, 1664525, 1013904223UL);
  }

  // Lagged Fibonacci Generator
  int nmkIndex = 0;
  for(int i=0; i < randomNumbers; ++i){
    randomGPU[id + i * maxID] = (float)stepLFG(&rng, &randomState[nmkIndex], 1664525, 1013904223UL) / 0xffffffff;
    nmkIndex = (nmkIndex + 1) % randomStateSize;
  }

}

// Combined Tausworthe Generator
uint stepCTG(uint *z, uint S1, uint S2, uint S3, uint M){
  uint b=((((*z)<<S1)^(*z))>>S2);
  return (*z) = ((((*z)&M)<<S3)^b);
}

__kernel
void randomCTG(const int randomNumbers, __global float2* randomsSeed, __global float* randomGPU){
  int id = get_global_id(0);
  int maxID = get_global_size(0);

  uint rng1 = randomsSeed[id].x;
  uint rng2 = randomsSeed[id].y;
  for(int i=0; i < randomNumbers; ++i){
    randomGPU[id + i * maxID] =  (float)(stepCTG(&rng1, 13, 19, 12, 4294967294UL) ^
					 stepCTG(&rng2, 2, 25, 4, 4294967288UL)) / 0xffffffff;
  }
}

// Hybrid RNG
float stepHybrid(uint* rng1, uint* rng2, uint* rng3, uint* rng4){
  return 2.3283064365387e-10 * (
				stepCTG(rng1, 13, 19, 12, 4294967294UL) ^
				stepCTG(rng2, 2, 25, 4, 4294967288UL) ^
				stepCTG(rng3, 3, 11, 17, 4294967280UL) ^
				stepLCG(rng4,1664525,1013904223UL)
				);
}

__kernel
void hybridRNG(const int randomNumbers, __global float* randomsSeed, __global float* randomGPU){
  int id = get_global_id(0);
  int maxID = get_global_size(0);

  uint rng1 = randomsSeed[id * 4 + 0];
  uint rng2 = randomsSeed[id * 4 + 1];
  uint rng3 = randomsSeed[id * 4 + 2];
  uint rng4 = randomsSeed[id * 4 + 3];

  for(int i = 0; i < randomNumbers; ++i){
    randomGPU[id + i * maxID] =  (float)stepHybrid(&rng1, &rng2, &rng3, &rng4);
  }
}

// Halton sequence
float stepHalton(float *value, float inv_base){
  float r = 1.0 - (*value) - 0.0000000001;
  if(inv_base < r) {
    (*value) += inv_base;
  } else {
    float h = inv_base, hh;
    do{
      hh = h;
      h *= inv_base;
    } while (h >= r);
    (*value) += hh + h - 1.0;
  }
  return (*value);
}

void seedHalton(ulong i, int base, float* inv_base, float* value){
  float f = (*inv_base) = 1.0/base;
  (*value) = 0.0;
  while( i > 0){
    (*value) += f * (float)(i % base);
    i /= base;
    f *= (*inv_base);
  }
}

__kernel
void haltonSequence(const int randomNumbers, const int base, __global float* randomGPU){
  int id = get_global_id(0);
  int maxID = get_global_size(0);

  float inv_base = 0.0;
  float rng = 0.0;
  seedHalton(id * randomNumbers, base, &inv_base, &rng);

  for(int i=0; i < randomNumbers; ++i){
    randomGPU[id + i * maxID] =  stepHalton(&rng, inv_base);
  }
}

// 1D uniformity test
// TODO
// Generate a quantized histogram
// randomNums = number of randoms per thread
// randoms    = array of random numbers
// bucketNum  = number of histogram buckets
// buckets    = array of histogram buckets
__kernel
void testUniform1D(const int randomNums, __global float* randoms, const int bucketNum, __global int* buckets){
    int id = get_global_id(0);      // ID
    int threadCount = get_global_size(0);  // threadCount

    float bucketWidth = 1.0 / bucketNum;

    int target = 0;

    for (size_t i = id; i < randomNums; i += threadCount) {
        if (i < randomNums) {
            // random numbers are between 0 and 1
            target = (int)(randoms[i] / bucketWidth);
            atomic_add(&buckets[target], 1);
        }
    }
}

// 1D Monte-Carlo integral
// TODO
// Implement a Monte Carlo integrator: sin(x) ; x := [0:PI/2]
// sampleNumber = number of samples per thread
// seed         = float4 seed array for the random number generator
// integral     = partial integral
#define M_PIP2 1.57796327f
__kernel
void mcInt1D(const int sampleNumber, __global float4* seed, __global float* integral){
  
}