Browse Source

Histogram + Reduce

master
trinitas 3 years ago
parent
commit
5b1820c7be
  1. 7
      Primitives/OpenCLHandler.cpp
  2. 2
      Primitives/OpenCLHandler.h
  3. 89
      Primitives/Primitives.cpp
  4. 2
      Primitives/Primitives.vcxproj
  5. 6
      Primitives/Primitives.vcxproj.filters
  6. 90
      Primitives/primitives/Histogram.cpp
  7. 62
      Primitives/primitives/ReduceAdd.cpp
  8. 20
      Primitives/primitives/Square.cpp
  9. 41
      Primitives/primitives/Tests.h
  10. 42
      kernels/programs.cl

7
Primitives/OpenCLHandler.cpp

@ -26,6 +26,8 @@ OpenCLHandler::OpenCLHandler()
std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
std::cout << devices[0].getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() << std::endl;
max_workgroup_size = devices[0].getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
// Create the command queue
cl::Event event;
queue = cl::CommandQueue(context, devices[0], 0, &err);
@ -48,4 +50,9 @@ bool OpenCLHandler::run_test(TestCase* test)
test->collect_results(&queue);
return test->validate_results();
}
size_t OpenCLHandler::get_max_size()
{
	// Accessor for the device's maximum work-group size, which the
	// constructor queried via CL_DEVICE_MAX_WORK_GROUP_SIZE.
	return max_workgroup_size;
}

2
Primitives/OpenCLHandler.h

@ -7,7 +7,9 @@ private:
cl::Program program;
cl::Context context;
cl::CommandQueue queue;
size_t max_workgroup_size;
public:
OpenCLHandler();
bool run_test(TestCase* test);
size_t get_max_size();
};

89
Primitives/Primitives.cpp

@ -15,59 +15,62 @@
//const size_t dataSize = 4096;
//
//void capi()
//{
// // Get a platform ID
// cl_platform_id platformID;
// clGetPlatformIDs(1, &platformID, NULL);
//
// // Get a device ID
// cl_device_id deviceID;
// clGetDeviceIDs(platformID, CL_DEVICE_TYPE_GPU, 1, &deviceID, NULL);
//
// // Create a context
// cl_context context;
// cl_context_properties contextProperties[] =
// { CL_CONTEXT_PLATFORM, (cl_context_properties)platformID, 0 };
// context = clCreateContext(contextProperties, 1, &deviceID, NULL, NULL, NULL);
//
// // Create a command queue
// cl_command_queue queue;
// queue = clCreateCommandQueue(context, deviceID, CL_QUEUE_PROFILING_ENABLE, NULL);
//
// // Create an OpenCL program
// std::string source = FileToString("../kernels/programs.cl");
// const char* csource = source.c_str();
// cl_program program = clCreateProgramWithSource(context, 1, &csource, NULL, NULL);
// cl_int err = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL);
// if (err != CL_SUCCESS)
// {
// cl_uint logLength;
// clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLength);
// char* log = new char[logLength];
// clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, logLength, log, 0);
// std::cout << log << std::endl;
// delete[] log;
// exit(-1);
// }
//}
void capi()
{
// Get a platform ID
cl_platform_id platformID;
clGetPlatformIDs(1, &platformID, NULL);
// Get a device ID
cl_device_id deviceID;
clGetDeviceIDs(platformID, CL_DEVICE_TYPE_GPU, 1, &deviceID, NULL);
// Create a context
cl_context context;
cl_context_properties contextProperties[] =
{ CL_CONTEXT_PLATFORM, (cl_context_properties)platformID, 0 };
context = clCreateContext(contextProperties, 1, &deviceID, NULL, NULL, NULL);
// Create a command queue
cl_command_queue queue;
queue = clCreateCommandQueue(context, deviceID, CL_QUEUE_PROFILING_ENABLE, NULL);
// Create an OpenCL program
std::string source = FileToString("../kernels/programs.cl");
const char* csource = source.c_str();
cl_program program = clCreateProgramWithSource(context, 1, &csource, NULL, NULL);
cl_int err = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
cl_uint logLength;
clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLength);
char* log = new char[logLength];
clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, logLength, log, 0);
std::cout << log << std::endl;
delete[] log;
exit(-1);
}
}
int main()
{
capi();
OpenCLHandler handler;
std::vector<TestCase*> tests;
Square s;
TestCase* t;
t = &s;
tests.push_back(t);
Histogram h(false, 32, 4096);
ReduceAdd r1(handler.get_max_size());
for (size_t i = 0; i < tests.size(); i++) {
handler.run_test(tests[i]);
//handler.run_test(&s);
//handler.run_test(&h);
if (handler.run_test(&r1)) {
std::cout << "Success" << std::endl;
}
else {
std::cout << "Failure" << std::endl;
}
return 0;
}

2
Primitives/Primitives.vcxproj

@ -80,6 +80,8 @@
<ItemGroup>
<ClCompile Include="OpenCLHandler.cpp" />
<ClCompile Include="Primitives.cpp" />
<ClCompile Include="primitives\Histogram.cpp" />
<ClCompile Include="primitives\ReduceAdd.cpp" />
<ClCompile Include="primitives\Square.cpp" />
</ItemGroup>
<ItemGroup>

6
Primitives/Primitives.vcxproj.filters

@ -44,6 +44,12 @@
<ClCompile Include="Common.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="primitives\Histogram.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="primitives\ReduceAdd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<None Include="..\kernels\programs.cl">

90
Primitives/primitives/Histogram.cpp

@ -0,0 +1,90 @@
#include <iostream>
#include "../Common.h"
#include "Tests.h"
#include <random>
Histogram::Histogram(bool _global, int _valueSet, int _data_size)
{
	// Configure the test: kernel variant (global vs. local atomics), number
	// of histogram bins, and input length.
	global = _global;
	valueSet = _valueSet;
	data_size = _data_size;

	// Fill the input with uniformly distributed bin indices in [0, valueSet).
	std::random_device seed;
	std::mt19937 engine(seed());
	std::uniform_int_distribution<int> bin(0, valueSet - 1);
	for (size_t i = 0; i < data_size; ++i) {
		sourceData.push_back(bin(engine));
	}

	// One counter per possible value, all starting at zero.
	cpuResult.resize(valueSet, 0);
	gpuResult.resize(valueSet, 0);
}
void Histogram::collect_results(cl::CommandQueue* queue)
{
	// Blocking read: copy the per-bin counts computed on the device back
	// into gpuResult for validation.
	const size_t resultBytes = sizeof(int) * valueSet;
	queue->enqueueReadBuffer(clResultBuffer, true, 0, resultBytes, gpuResult.data());
}
void Histogram::gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event)
{
	cl_int err = CL_SUCCESS;

	// Select the kernel variant: "global" uses global-memory atomics only,
	// "local" accumulates into a per-workgroup local histogram first.
	cl::Kernel kernel;
	if (global) {
		kernel = cl::Kernel(*program, "histogram_global", &err);
	}
	else {
		kernel = cl::Kernel(*program, "histogram_local", &err);
	}
	CheckCLError(err);

	// Upload the input values. Check the buffer-creation error before the
	// write, and the write's own return code after it (the original checked
	// a stale err here).
	clInputBuffer = cl::Buffer(*context, CL_MEM_READ_ONLY, sizeof(int) * data_size, NULL, &err);
	CheckCLError(err);
	err = queue->enqueueWriteBuffer(clInputBuffer,
		true, // Blocking!
		0, sizeof(int) * data_size, sourceData.data());
	CheckCLError(err);

	// Allocate the output histogram and zero it on the device: the kernels
	// only ever atomic_add into it, so it must start from all zeros.
	// (gpuResult is zero-filled by the constructor at this point.)
	clResultBuffer = cl::Buffer(*context, CL_MEM_WRITE_ONLY, sizeof(int) * valueSet, NULL, &err);
	CheckCLError(err);
	err = queue->enqueueWriteBuffer(clResultBuffer, true, 0, sizeof(int) * valueSet, gpuResult.data());
	CheckCLError(err);

	// Set the kernel parameters, in declaration order.
	CheckCLError(kernel.setArg(0, clInputBuffer));
	CheckCLError(kernel.setArg(1, clResultBuffer));
	if (!global) {
		// Local variant additionally needs local scratch memory plus the bin
		// count. The kernel declares `const int histogramSize`, so pass an
		// int — passing the size_t member makes the argument size mismatch.
		CheckCLError(kernel.setArg(2, sizeof(int) * valueSet, NULL));
		CheckCLError(kernel.setArg(3, static_cast<int>(valueSet)));
	}

	// Enqueue the kernel: one work-item per input element, work-group size
	// chosen by the runtime.
	// NOTE(review): the local variant assumes the chosen work-group size is
	// >= valueSet so every local bin gets initialized and flushed — confirm.
	err = queue->enqueueNDRangeKernel(kernel,
		cl::NullRange,                // no index offset
		cl::NDRange(data_size, 1),   // one work-item per element
		cl::NullRange,               // let the runtime pick the group size
		NULL,
		Event);
	CheckCLError(err);
}
void Histogram::cpu_compute()
{
	// Reference histogram: count every occurrence of each value on the host.
	for (int value : sourceData) {
		++cpuResult[value];
	}
}
bool Histogram::validate_results()
{
	// Compare the GPU histogram against the CPU reference bin by bin,
	// reporting the first mismatch.
	for (size_t bin = 0; bin < valueSet; ++bin) {
		if (cpuResult[bin] != gpuResult[bin]) {
			std::cout << "Wrong result at [" << bin << "]: " << gpuResult[bin] << "!=" << cpuResult[bin] << std::endl;
			return false;
		}
	}
	const char* variant = global ? "global" : "local";
	std::cout << "Test \"Histogram - " << variant << "\" completed, set: " << valueSet << ", size: " << data_size << std::endl;
	return true;
}

62
Primitives/primitives/ReduceAdd.cpp

@ -0,0 +1,62 @@
#include <iostream>
#include <iomanip>
#include "../Common.h"
#include "Tests.h"
#include <random>
#include <stdlib.h>
ReduceAdd::ReduceAdd(size_t max_size)
{
	// Problem size: one element per work-item of a single maximal work-group.
	data_size = max_size;
	cpuResult = 0.0f;

	// Random input in [0, 1000), kept twice: sourceData stays pristine for
	// the CPU reference sum, gpuResult is the working copy the device
	// reduces in place.
	const float upper = 1000.0f;
	std::random_device seed;
	std::mt19937 engine(seed());
	std::uniform_real_distribution<float> sample(0.0f, upper);
	for (size_t i = 0; i < data_size; ++i)
	{
		const float value = sample(engine);
		gpuResult.push_back(value);
		sourceData.push_back(value);
	}
}
void ReduceAdd::gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event)
{
	cl_int err = CL_SUCCESS;
	cl::Kernel kernel = cl::Kernel(*program, "reduce_global", &err);
	CheckCLError(err);

	// The buffer holds floats (the original sized it with sizeof(int), which
	// only happens to work because the sizes coincide), and the kernel
	// reduces in place, so it must be READ_WRITE, not READ_ONLY.
	clInputBuffer = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(float) * data_size, NULL, &err);
	CheckCLError(err);
	err = queue->enqueueWriteBuffer(clInputBuffer, true, 0, sizeof(float) * data_size, gpuResult.data());
	CheckCLError(err);

	CheckCLError(kernel.setArg(0, clInputBuffer));

	// One work-item per element, all in one work-group: reduce_global
	// synchronizes with barriers, which only act within a work-group. Using
	// data_size (== the device's max work-group size, per the constructor)
	// instead of a hard-coded 1024 keeps the launch valid on any device.
	err = queue->enqueueNDRangeKernel(kernel,
		cl::NullRange,                // no index offset
		cl::NDRange(data_size, 1),   // global size
		cl::NDRange(data_size, 1),   // local size: the single work-group
		NULL,
		Event);
	CheckCLError(err);
}
void ReduceAdd::cpu_compute()
{
	// Reference result: straightforward serial summation on the host.
	for (float value : sourceData) {
		cpuResult += value;
	}
	std::cout << "CPU result is " << std::setprecision(12) << cpuResult << std::endl;
}
void ReduceAdd::collect_results(cl::CommandQueue* queue)
{
	// Blocking read of the reduced buffer; after the kernel the sum sits in
	// element 0. The elements are floats, so size with sizeof(float) — the
	// original's sizeof(int) only worked by coincidence of equal sizes.
	cl_int err = queue->enqueueReadBuffer(clInputBuffer, true, 0, sizeof(float) * data_size, gpuResult.data());
	CheckCLError(err);
	std::cout << "Results grabbed" << std::endl;
}
bool ReduceAdd::validate_results()
{
	// Compare the GPU sum (gpuResult[0]) against the CPU reference within a
	// tolerance: the GPU's tree-shaped reduction rounds differently from the
	// serial CPU sum. The original called unqualified abs() with only
	// <stdlib.h> included, which can resolve to int abs(int) and truncate
	// the float difference; compare explicitly instead.
	const float diff = cpuResult - gpuResult[0];
	return diff < 0.3f && diff > -0.3f;
}

20
Primitives/primitives/Square.cpp

@ -1,13 +1,21 @@
#include <iostream>
#include "../Common.h"
#include "Tests.h"
#include <random>
Square::Square() {
	// 4096 random floats in [0, 1000): gpuResult is the device working copy,
	// sourceData the untouched input for the CPU reference pass.
	// (The diff view interleaved the removed gpuHostBuffer/index-based lines
	// with the added ones; this is the post-commit body.)
	const float vmax = 1000.0f;
	data_size = 4096;
	std::random_device rd;
	std::mt19937 gen(rd());
	std::uniform_real_distribution<float> distr(0.0f, vmax);
	for (size_t index = 0; index < data_size; ++index)
	{
		float val = distr(gen);
		gpuResult.push_back(val);
		sourceData.push_back(val);
	}
}
void Square::gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event) {
@ -20,7 +28,7 @@ void Square::gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Prog
clInputBuffer = cl::Buffer(*context, CL_MEM_READ_ONLY, sizeof(float) * data_size, NULL, &err);
queue->enqueueWriteBuffer(clInputBuffer,
true, // Blocking!
0, sizeof(float) * data_size, gpuHostBuffer.data());
0, sizeof(float) * data_size, gpuResult.data());
// Allocate the output data
clResultBuffer = cl::Buffer(*context, CL_MEM_WRITE_ONLY, sizeof(float) * data_size, NULL, &err);
@ -46,13 +54,13 @@ void Square::cpu_compute()
}
void Square::collect_results(cl::CommandQueue* queue) {
	// Blocking read of the squared values back into the host-side buffer.
	// (The diff view also showed the removed gpuHostBuffer variant of this
	// line; gpuResult is the post-commit destination.)
	queue->enqueueReadBuffer(clResultBuffer, true, 0, sizeof(float) * data_size, gpuResult.data());
}
bool Square::validate_results() {
for (size_t index = 0; index < data_size; index++) {
if (cpuResult[index] != gpuHostBuffer[index]) {
std::cout << "Wrong result at [" << index << "]: " << gpuHostBuffer[index] << "!=" << cpuResult[index] << std::endl;
if (cpuResult[index] != gpuResult[index]) {
std::cout << "Wrong result at [" << index << "]: " << gpuResult[index] << "!=" << cpuResult[index] << std::endl;
return false;
}
}

41
Primitives/primitives/Tests.h

@ -2,11 +2,11 @@
#include <string>
#include "../cl.hpp"
class TestCase {
private:
std::string kernelCode;
protected:
cl::Buffer clInputBuffer;
cl::Buffer clResultBuffer;
size_t data_size;
public:
virtual void collect_results(cl::CommandQueue* queue) = 0;
virtual void gpu_compute(
@ -21,16 +21,41 @@ public:
// Test case: element-wise square via the "square" kernel, validated against
// a CPU loop.
class Square : public TestCase {
private:
	// NOTE(review): the base TestCase also declares data_size; this member
	// shadows it — likely a leftover of the refactor, confirm which is used.
	size_t data_size = 4096;
	// NOTE(review): gpuHostBuffer appears superseded by gpuResult in this
	// commit (the .cpp only uses gpuResult) — likely a stale diff line.
	std::vector<float> gpuHostBuffer;
	std::vector<float> gpuResult;   // device working copy / readback target
	std::vector<float> sourceData;  // pristine input for the CPU reference
	std::vector<float> cpuResult;   // CPU reference output
	// NOTE(review): the base class also declares these buffers (protected);
	// these duplicates shadow them — confirm against the full diff.
	cl::Buffer clInputBuffer;
	cl::Buffer clResultBuffer;
public:
	Square();
	void collect_results(cl::CommandQueue* queue);
	void gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event);
	void cpu_compute();
	bool validate_results();
};
// Test case: integer histogram computed on the device (global- or
// local-atomic kernel variant) and validated against a CPU count.
class Histogram : public TestCase {
private:
	bool global;                 // true: "histogram_global" kernel, false: "histogram_local"
	size_t valueSet;             // number of histogram bins
	std::vector<int> gpuResult;  // per-bin counts read back from the device
	std::vector<int> sourceData; // random input values in [0, valueSet)
	std::vector<int> cpuResult;  // CPU reference per-bin counts
public:
	Histogram(bool _global, int _valueSet, int _data_size);
	void collect_results(cl::CommandQueue* queue);
	void gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event);
	void cpu_compute();
	bool validate_results();
};
// Test case: in-place parallel sum-reduction on the device, compared within
// a tolerance against a serial CPU sum.
class ReduceAdd : public TestCase {
private:
	std::vector<float> gpuResult;  // working copy; element 0 holds the GPU sum after readback
	std::vector<float> sourceData; // pristine input for the CPU reference
	float cpuResult;               // CPU reference sum
public:
	// max_size: element count; the .cpp launches it as one work-group, so it
	// is fed the device's maximum work-group size.
	ReduceAdd(size_t max_size);
	void collect_results(cl::CommandQueue* queue);
	void gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event);
	void cpu_compute();
	bool validate_results();
};

42
kernels/programs.cl

@ -1,6 +1,6 @@
// map operator with f(x) = x*x
__kernel void square(__global float* inputData,
__global float* outputData)
__global float* outputData)
{
int id = get_global_id(0); //get_local_id a workgroup-on belul
outputData[id] = inputData[id] * inputData[id];
@ -36,7 +36,23 @@ void histogram_global(__global int* data, __global int* histogram)
__kernel
void histogram_local(__global int* data, __global int* histogram, __local int* lhistogram, const int histogramSize)
{
	// Builds a per-workgroup histogram in local memory, then flushes it into
	// the global histogram. Assumes the work-group size is >= histogramSize
	// so the first histogramSize work-items can zero and flush every bin.
	int id = get_global_id(0);
	int lid = get_local_id(0);

	// Zero the local bins before anyone accumulates into them.
	if (lid < histogramSize) {
		lhistogram[lid] = 0;
	}
	barrier(CLK_LOCAL_MEM_FENCE);

	// atomic_add on an int bin takes an int increment: 1, not 1.0f (the
	// original passed a float literal to the integer atomic).
	atomic_add(&lhistogram[data[id]], 1);

	// Local-memory fence: wait until every work-item in this group has
	// finished accumulating (the original used the global fence here).
	barrier(CLK_LOCAL_MEM_FENCE);

	// Flush this group's partial histogram into the global one.
	if (lid < histogramSize) {
		atomic_add(&histogram[lid], lhistogram[lid]);
	}
}
// TODO
@ -51,7 +67,15 @@ void histogram_local(__global int* data, __global int* histogram, __local int* l
__kernel
void reduce_global(__global float* data)
{
	// In-place tree reduction: after the loop, data[0] holds the sum of the
	// whole buffer. Each pass adds the upper half of the active range onto
	// the lower half, then halves the range.
	// NOTE(review): barrier() only synchronizes work-items within a single
	// work-group, yet the loop bound is get_global_size(0) — this is only
	// correct when the kernel is launched as one work-group covering the
	// whole input. Confirm against the host launch configuration.
	int id = get_global_id(0);
	// one right shift halves the active range each pass
	for (size_t s = get_global_size(0) / 2; s > 0; s >>= 1) {
		if (id < s) {
			data[id] = data[id] + data[id + s];
		}
		barrier(CLK_GLOBAL_MEM_FENCE);
	}
}
// TODO
@ -71,7 +95,15 @@ void reduce_global(__global float* data)
__kernel
void exscan_global(__global float* data)
{
	// Shifts every element one slot to the right: data[id] takes the old
	// value of data[id - 1], and data[0] becomes 0 (the identity element of
	// an exclusive scan).
	// Fixes two defects in the original: `ID` was an undefined identifier
	// (the variable is `id`), and writing data[id] before any barrier raced
	// with the neighbouring work-item still reading it — so read first,
	// barrier, then write.
	// NOTE(review): barrier() only synchronizes within one work-group, so
	// like reduce_global this assumes a single-work-group launch — confirm.
	int id = get_global_id(0);
	float shifted = (id > 0) ? data[id - 1] : 0;
	barrier(CLK_GLOBAL_MEM_FENCE);
	data[id] = shifted;
}
// TODO
@ -83,7 +115,7 @@ void exscan_global(__global int* data)
__kernel
void compact_predicate(__global int* data, __global int* pred)
{
	// TODO: not implemented. Presumably the first stage of stream
	// compaction — write pred[id] = 1 if data[id] should be kept and 0
	// otherwise — but the intended predicate is not visible here; confirm
	// before implementing.
}
// TODO
@ -105,5 +137,5 @@ void compact_exscan(__global int* pred, __global int* prefSum)
__kernel
void compact_compact(__global int* data, __global int* pred, __global int* prefSum)
{
	// TODO: not implemented. Presumably the scatter stage of stream
	// compaction — for pred[id] == 1, write data[id] to position
	// prefSum[id] — but no implementation exists yet; confirm the intended
	// output buffer before implementing.
}
Loading…
Cancel
Save