// map operator with f(x) = x*x
// Element-wise square (map): each work-item reads one element of inputData
// and stores its square at the same index of outputData.
//
// inputData:  source values, indexed by the global work-item id (dimension 0)
// outputData: destination buffer, written at the same index
__kernel void square(__global float* inputData,
						 __global float* outputData)
{
  // get_global_id indexes the whole launch; get_local_id would only index
  // within the current work-group.
  const int gid = get_global_id(0);
  const float x = inputData[gid];
  outputData[gid] = x * x;
}

// TODO
//
// histogram[data[id]] := histogram[data[id]] + 1
//
// SYNCHRONIZATION!
// Histogram accumulated directly in global memory.
//
// data:      one bin index per work-item (read at the global id, dimension 0)
// histogram: bin counters; this kernel only increments them and never clears
//            them.
//
// NOTE(review): data[id] is used as an index without a range check; the
// caller presumably guarantees it is a valid bin - confirm against the host.
__kernel
void histogram_global(__global int* data, __global int* histogram)
{
	int id = get_global_id(0);
	// Several work-items may hit the same bin, so the increment must be
	// atomic. The counters are integers: use the integer increment rather
	// than passing a float literal to atomic_add.
	atomic_inc(&histogram[data[id]]);
}

// TODO
//
// ID  := get_global_id(0)
// LID := get_local_id(0)
//
// IF LID < histogramSize DO:
//  lhistogram[LID] := 0
// BARRIER
//
// Add data to local histogram
//
// BARRIER
// 
// IF LID < histogramSize DO:
//  histogram[LID] = lhistogram[LID]
// Histogram using a per-work-group partial histogram in local memory.
//
// data:          one bin index per work-item (read at the global id)
// histogram:     global bin counters; partial results are added atomically,
//                the counters are never cleared here
// lhistogram:    local scratch buffer with room for histogramSize ints
// histogramSize: number of bins
//
// NOTE(review): data[id] is used as a bin index without a range check; the
// caller presumably guarantees 0 <= data[id] < histogramSize - confirm.
__kernel
void histogram_local(__global int* data, __global int* histogram, __local int* lhistogram, const int histogramSize)
{
	int id = get_global_id(0);
	int lid = get_local_id(0);
	int lsize = get_local_size(0);

	// Clear every local bin. The stride loop also covers the case where
	// there are more bins than work-items in the group.
	for (int bin = lid; bin < histogramSize; bin += lsize) {
		lhistogram[bin] = 0;
	}

	// All bins must be zero before anyone starts counting.
	barrier(CLK_LOCAL_MEM_FENCE);

	// Integer atomic increment on the local counter.
	atomic_inc(&lhistogram[data[id]]);

	// The counters live in local memory, so a local fence is what makes the
	// finished counts visible to the work-items that flush them.
	barrier(CLK_LOCAL_MEM_FENCE);

	// Merge this group's partial histogram into the global one; other
	// work-groups do the same concurrently, hence the atomic add.
	for (int bin = lid; bin < histogramSize; bin += lsize) {
		atomic_add(&histogram[bin], lhistogram[bin]);
	}

}

// TODO
//
// ID := get_global_id(0)
//
// FOR s = get_global_size(0) / 2 ; s > 0 ; s >>= 1 DO:
//  IF (ID < s)
//    data[ID] = max(data[ID], data[ID + s])
//  BARRIER
//
// In-place tree reduction (sum) over data[0 .. get_global_size(0)).
// After the kernel finishes, data[0] holds the sum of all elements; the other
// elements hold intermediate partial sums.
//
// Works for any global size, not only powers of two: each round folds the
// upper part of the active range onto the lower part, and an odd middle
// element is simply carried into the next round.
//
// barrier() only synchronizes the work-items of one work-group, so the result
// is only well defined when the whole range runs in a single work-group.
//
// NOTE(review): the task pseudo-code preceding this kernel uses max(); the
// implementation adds - confirm which one the host expects.
__kernel
void reduce_global(__global float* data)
{
	// size_t keeps the comparisons below unsigned-vs-unsigned.
	const size_t id = get_global_id(0);

	size_t active = get_global_size(0);
	while (active > 1) {
		// Ceil-halving: for even sizes this equals active / 2.
		const size_t stride = (active + 1) / 2;
		// Readers touch [stride, active), writers touch [0, active / 2):
		// the two ranges never overlap within a round.
		if (id < active / 2) {
			data[id] = data[id] + data[id + stride];
		}
		barrier(CLK_GLOBAL_MEM_FENCE);
		active = stride;
	}
}

// TODO
//
// ID := get_global_id(0)
// IF ID > 0 THEN data[ID] = data[ID - 1]
//           ELSE data[ID] = 0
// BARRIER
//
// FOR s = 1; s < get_global_size(0); s *= 2 DO:
//   tmp := data[ID]
//   IF (ID + s < get_global_size(0)) THEN
//     data[ID + s] += tmp;
//   BARRIER
//
// IF(ID = 0) THEN data[ID] = 0;
// In-place exclusive prefix sum over data[0 .. get_global_size(0)).
// On return data[i] holds the sum of the original data[0 .. i-1], data[0] = 0.
//
// The input is first shifted right by one element, then an inclusive
// doubling-stride scan is run on the shifted values.
//
// barrier() only synchronizes the work-items of one work-group, so the result
// is only well defined when the whole range runs in a single work-group.
__kernel
void exscan_global(__global int* data)
{
	int id = get_global_id(0);
	int size = get_global_size(0);

	// Shift right by one: read the left neighbour before anyone overwrites it.
	int val = (id > 0) ? data[id - 1] : 0;
	// data is a __global buffer, so the global fence is the one that orders
	// these accesses.
	barrier(CLK_GLOBAL_MEM_FENCE);
	data[id] = val;

	for (int s = 1; s < size; s *= 2) {
		// Snapshot our own element; it is the value of the previous round.
		int tmp = data[id];
		// Everyone must have taken its snapshot before any element changes.
		barrier(CLK_GLOBAL_MEM_FENCE);
		if (id + s < size) {
			// Use the snapshot, not data[id]: work-item (id - s) may be
			// updating data[id] right now.
			data[id + s] += tmp;
		}
		// Finish all updates of this round before the next snapshot.
		barrier(CLK_GLOBAL_MEM_FENCE);
	}
	// data[0] is never a write target in the loop; kept to mirror the
	// reference pseudo-code.
	if (id == 0) {
		data[id] = 0;
	}
}

// TODO
// ID := get_global_id(0)
// IF data[id] < 50 THEN
//   predicate = 1
// ELSE
//   predicate = 0
// Predicate pass of stream compaction: marks which elements are kept.
//
// data: input values, one per work-item (read at the global id)
// pred: output flags; pred[i] is 1 when data[i] < 50 and 0 otherwise
__kernel
void compact_predicate(__global int* data, __global int* pred)
{
	const int gid = get_global_id(0);
	pred[gid] = (data[gid] < 50) ? 1 : 0;
}

// TODO
//
// exclusive scan pred to prefSum
// Scan pass of stream compaction: exclusive prefix sum of pred into prefSum.
//
// pred:    0/1 flags, one per work-item; only read here
// prefSum: output; for i > 0, prefSum[i] is the sum of pred[0 .. i-1], i.e.
//          the destination index of element i in the compacted array.
//          prefSum[0] (whose scan value is always 0) is overwritten with the
//          total number of flagged elements, i.e. the compacted length.
//
// barrier() only synchronizes the work-items of one work-group, so the result
// is only well defined when the whole range runs in a single work-group.
//
// NOTE(review): assumes pred and prefSum are distinct buffers - confirm
// against the host code.
__kernel
void compact_exscan(__global int* pred, __global int* prefSum)
{
	int id = get_global_id(0);
	int size = get_global_size(0);

	// Shift right by one to turn the inclusive scan below into an exclusive one.
	int val = (id > 0) ? pred[id - 1] : 0;
	barrier(CLK_GLOBAL_MEM_FENCE);
	prefSum[id] = val;

	for (int s = 1; s < size; s *= 2) {
		// Snapshot our own element; it is the value of the previous round.
		int tmp = prefSum[id];
		// Everyone must have taken its snapshot before any element changes.
		barrier(CLK_GLOBAL_MEM_FENCE);
		if (id + s < size) {
			// Use the snapshot: work-item (id - s) may be updating
			// prefSum[id] concurrently.
			prefSum[id + s] += tmp;
		}
		// Finish all updates of this round before the next snapshot.
		barrier(CLK_GLOBAL_MEM_FENCE);
	}
	// First element of the array stores the compacted length. The exclusive
	// scan's last entry does not count the last flag, so add it explicitly.
	if (id == 0) {
		prefSum[0] = prefSum[size - 1] + pred[size - 1];
	}
}

// TODO
// 
// ID := get_global_id(0)
// VALUE := data[ID]
// BARRIER
// IF pred[ID] == 1 THEN
//  data[prefSum[ID]] = VALUE
__kernel
void compact_compact(__global int* data, __global int* pred, __global int* prefSum)
{
	int id = get_global_id(0);
	int tmp = data[id];
	barrier(CLK_LOCAL_MEM_FENCE);
	if (pred[id] == 1) {
		// IF id == 0, fill data[0], see line #168
		data[!!id * prefSum[id]] = tmp;
	}
}