diff --git a/Primitives/OpenCLHandler.cpp b/Primitives/OpenCLHandler.cpp index fe9312c..dbeb35e 100644 --- a/Primitives/OpenCLHandler.cpp +++ b/Primitives/OpenCLHandler.cpp @@ -26,6 +26,8 @@ OpenCLHandler::OpenCLHandler() std::vector devices = context.getInfo(); std::cout << devices[0].getInfo() << std::endl; + max_workgroup_size = devices[0].getInfo(); + // Create the command queue cl::Event event; queue = cl::CommandQueue(context, devices[0], 0, &err); @@ -48,4 +50,9 @@ bool OpenCLHandler::run_test(TestCase* test) test->collect_results(&queue); return test->validate_results(); +} + +size_t OpenCLHandler::get_max_size() +{ + return max_workgroup_size; } \ No newline at end of file diff --git a/Primitives/OpenCLHandler.h b/Primitives/OpenCLHandler.h index ae17222..4094b25 100644 --- a/Primitives/OpenCLHandler.h +++ b/Primitives/OpenCLHandler.h @@ -7,7 +7,9 @@ private: cl::Program program; cl::Context context; cl::CommandQueue queue; + size_t max_workgroup_size; public: OpenCLHandler(); bool run_test(TestCase* test); + size_t get_max_size(); }; \ No newline at end of file diff --git a/Primitives/Primitives.cpp b/Primitives/Primitives.cpp index 8fb99d2..6879d95 100644 --- a/Primitives/Primitives.cpp +++ b/Primitives/Primitives.cpp @@ -15,59 +15,62 @@ //const size_t dataSize = 4096; // -//void capi() -//{ -// // Get a platform ID -// cl_platform_id platformID; -// clGetPlatformIDs(1, &platformID, NULL); -// -// // Get a device ID -// cl_device_id deviceID; -// clGetDeviceIDs(platformID, CL_DEVICE_TYPE_GPU, 1, &deviceID, NULL); -// -// // Create a context -// cl_context context; -// cl_context_properties contextProperties[] = -// { CL_CONTEXT_PLATFORM, (cl_context_properties)platformID, 0 }; -// context = clCreateContext(contextProperties, 1, &deviceID, NULL, NULL, NULL); -// -// // Create a command queue -// cl_command_queue queue; -// queue = clCreateCommandQueue(context, deviceID, CL_QUEUE_PROFILING_ENABLE, NULL); -// -// // Create an OpenCL program -// std::string 
source = FileToString("../kernels/programs.cl"); -// const char* csource = source.c_str(); -// cl_program program = clCreateProgramWithSource(context, 1, &csource, NULL, NULL); -// cl_int err = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL); -// if (err != CL_SUCCESS) -// { -// cl_uint logLength; -// clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLength); -// char* log = new char[logLength]; -// clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, logLength, log, 0); -// std::cout << log << std::endl; -// delete[] log; -// exit(-1); -// } -//} +void capi() +{ + // Get a platform ID + cl_platform_id platformID; + clGetPlatformIDs(1, &platformID, NULL); + + // Get a device ID + cl_device_id deviceID; + clGetDeviceIDs(platformID, CL_DEVICE_TYPE_GPU, 1, &deviceID, NULL); + + // Create a context + cl_context context; + cl_context_properties contextProperties[] = + { CL_CONTEXT_PLATFORM, (cl_context_properties)platformID, 0 }; + context = clCreateContext(contextProperties, 1, &deviceID, NULL, NULL, NULL); + + // Create a command queue + cl_command_queue queue; + queue = clCreateCommandQueue(context, deviceID, CL_QUEUE_PROFILING_ENABLE, NULL); + + // Create an OpenCL program + std::string source = FileToString("../kernels/programs.cl"); + const char* csource = source.c_str(); + cl_program program = clCreateProgramWithSource(context, 1, &csource, NULL, NULL); + cl_int err = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL); + if (err != CL_SUCCESS) + { + cl_uint logLength; + clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLength); + char* log = new char[logLength]; + clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, logLength, log, 0); + std::cout << log << std::endl; + delete[] log; + exit(-1); + } +} int main() { + capi(); OpenCLHandler handler; std::vector tests; Square s; - TestCase* t; - t = &s; - - tests.push_back(t); + Histogram h(false, 32, 4096); + ReduceAdd 
r1(handler.get_max_size()); - for (size_t i = 0; i < tests.size(); i++) { - handler.run_test(tests[i]); + //handler.run_test(&s); + //handler.run_test(&h); + if (handler.run_test(&r1)) { + std::cout << "Success" << std::endl; + } + else { + std::cout << "Failure" << std::endl; } - return 0; } diff --git a/Primitives/Primitives.vcxproj b/Primitives/Primitives.vcxproj index 2d3e8bd..5bf5df6 100644 --- a/Primitives/Primitives.vcxproj +++ b/Primitives/Primitives.vcxproj @@ -80,6 +80,8 @@ + + diff --git a/Primitives/Primitives.vcxproj.filters b/Primitives/Primitives.vcxproj.filters index 7ff3e82..27fc81f 100644 --- a/Primitives/Primitives.vcxproj.filters +++ b/Primitives/Primitives.vcxproj.filters @@ -44,6 +44,12 @@ Source Files + + Source Files + + + Source Files + diff --git a/Primitives/primitives/Histogram.cpp b/Primitives/primitives/Histogram.cpp new file mode 100644 index 0000000..e79fa7a --- /dev/null +++ b/Primitives/primitives/Histogram.cpp @@ -0,0 +1,90 @@ +#include +#include "../Common.h" +#include "Tests.h" +#include + +Histogram::Histogram(bool _global, int _valueSet, int _data_size) +{ + global = _global; + valueSet = _valueSet; + data_size = _data_size; + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distr(0, valueSet-1); + + for (size_t index = 0; index < data_size; ++index) { + sourceData.push_back(distr(gen)); + } + cpuResult.resize(valueSet, 0); + gpuResult.resize(valueSet, 0); +} + +void Histogram::collect_results(cl::CommandQueue* queue) +{ + queue->enqueueReadBuffer(clResultBuffer, true, 0, sizeof(int) * valueSet, gpuResult.data()); +} + +void Histogram::gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event) +{ + cl_int err = CL_SUCCESS; + + // Get the kernel handle + cl::Kernel kernel; + if (global) { + kernel = cl::Kernel(*program, "histogram_global", &err); + } + else { + kernel = cl::Kernel(*program, "histogram_local", &err); + } + CheckCLError(err); + + 
clInputBuffer = cl::Buffer(*context, CL_MEM_READ_ONLY, sizeof(int) * data_size, NULL, &err);
+	queue->enqueueWriteBuffer(clInputBuffer,
+		true, // Blocking!
+		0, sizeof(int) * data_size, sourceData.data());
+	CheckCLError(err);
+	// Allocate the output data
+	clResultBuffer = cl::Buffer(*context, CL_MEM_WRITE_ONLY, sizeof(int) * valueSet, NULL, &err);
+	CheckCLError(err);
+	// Set the kernel parameters
+	kernel.setArg(0, clInputBuffer); // kernel FV paraméterei sorrendben
+	kernel.setArg(1, clResultBuffer);
+	if (!global) {
+		kernel.setArg(2, sizeof(int) * valueSet, NULL);
+		kernel.setArg(3, (int)valueSet); // kernel expects a 4-byte int; passing the size_t member would fail with CL_INVALID_ARG_SIZE
+	}
+
+	// Enqueue the kernel
+	queue->enqueueNDRangeKernel(kernel,
+		cl::NullRange, // Indexek nem eloffszetelve
+		cl::NDRange(data_size, 1), // Minden elemet egy szál
+		cl::NullRange, // Workgroup méret? - ez az auto, ha nem indul, 1024-re, onnan csökkent, amig elindul
+		NULL, //
+		Event);
+}
+
+void Histogram::cpu_compute()
+{
+
+	for (size_t index = 0; index < data_size; ++index) {
+		cpuResult[sourceData[index]] = cpuResult[sourceData[index]] + 1;
+	}
+}
+
+bool Histogram::validate_results()
+{
+	for (size_t index = 0; index < valueSet; index++) {
+		if (cpuResult[index] != gpuResult[index]) {
+			std::cout << "Wrong result at [" << index << "]: " << gpuResult[index] << "!=" << cpuResult[index] << std::endl;
+			return false;
+		}
+	}
+	if (global) {
+		std::cout << "Test \"Histogram - global\" completed, set: " << valueSet << ", size: " << data_size << std::endl;
+	}
+	else {
+		std::cout << "Test \"Histogram - local\" completed, set: " << valueSet << ", size: " << data_size << std::endl;
+	}
+	return true;
+}
\ No newline at end of file
diff --git a/Primitives/primitives/ReduceAdd.cpp b/Primitives/primitives/ReduceAdd.cpp
new file mode 100644
index 0000000..b5805cf
--- /dev/null
+++ b/Primitives/primitives/ReduceAdd.cpp
@@ -0,0 +1,62 @@
+#include <iostream>
+#include <iomanip>
+#include "../Common.h"
+#include "Tests.h"
+#include <random>
+#include <cmath>
+
+ReduceAdd::ReduceAdd(size_t max_size)
+{
+	data_size = max_size;
+	cpuResult = 0.0f;
+	const float vmax = 1000.0f;
+	std::random_device rd;
+	std::mt19937 gen(rd());
+	std::uniform_real_distribution<float> distr(0.0f, vmax);
+
+	for (size_t index = 0; index < data_size; ++index)
+	{
+		float val = distr(gen);
+		gpuResult.push_back(val);
+		sourceData.push_back(val);
+	}
+}
+
+void ReduceAdd::gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event)
+{
+	cl_int err = CL_SUCCESS;
+	cl::Kernel kernel = cl::Kernel(*program, "reduce_global", &err);
+	CheckCLError(err);
+
+	clInputBuffer = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(float) * data_size, NULL, &err); // kernel reads AND writes this buffer; elements are floats
+	CheckCLError(err);
+	queue->enqueueWriteBuffer(clInputBuffer, true, 0, sizeof(float) * data_size, gpuResult.data());
+
+	kernel.setArg(0, clInputBuffer);
+
+	queue->enqueueNDRangeKernel(kernel,
+		cl::NullRange, // Indexek nem eloffszetelve
+		cl::NDRange(data_size, 1), // Minden elemet egy szál
+		cl::NDRange(data_size, 1), // one workgroup covering the whole input (data_size == device max workgroup size); a hard-coded 1024 can exceed the device limit
+		NULL, //
+		Event);
+}
+
+void ReduceAdd::cpu_compute()
+{
+	for (size_t index = 0; index < data_size; ++index) {
+		cpuResult += sourceData[index];
+	}
+	std::cout << "CPU result is " << std::setprecision(12) << cpuResult << std::endl;
+}
+
+void ReduceAdd::collect_results(cl::CommandQueue* queue)
+{
+	queue->enqueueReadBuffer(clInputBuffer, true, 0, sizeof(float) * data_size, gpuResult.data());
+	std::cout << "Results grabbed" << std::endl;
+}
+
+bool ReduceAdd::validate_results()
+{
+	return std::fabs(cpuResult - gpuResult[0]) < 0.3f; // unqualified abs() may bind to the int overload and truncate
+}
\ No newline at end of file
diff --git a/Primitives/primitives/Square.cpp b/Primitives/primitives/Square.cpp
index 9361e42..f58a1ce 100644
--- a/Primitives/primitives/Square.cpp
+++ b/Primitives/primitives/Square.cpp
@@ -1,13 +1,21 @@
 #include
 #include "../Common.h"
 #include "Tests.h"
+#include <random>
 
 Square::Square()
 {
+	const float vmax = 1000.0f;
+	data_size = 4096;
+	std::random_device rd;
+	
std::mt19937 gen(rd()); + std::uniform_real_distribution distr(0.0f, vmax); + for (size_t index = 0; index < data_size; ++index) { - gpuHostBuffer.push_back(static_cast(index)); - sourceData.push_back(static_cast(index)); + float val = distr(gen); + gpuResult.push_back(val); + sourceData.push_back(val); } } void Square::gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event) { @@ -20,7 +28,7 @@ void Square::gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Prog clInputBuffer = cl::Buffer(*context, CL_MEM_READ_ONLY, sizeof(float) * data_size, NULL, &err); queue->enqueueWriteBuffer(clInputBuffer, true, // Blocking! - 0, sizeof(float) * data_size, gpuHostBuffer.data()); + 0, sizeof(float) * data_size, gpuResult.data()); // Allocate the output data clResultBuffer = cl::Buffer(*context, CL_MEM_WRITE_ONLY, sizeof(float) * data_size, NULL, &err); @@ -46,13 +54,13 @@ void Square::cpu_compute() } void Square::collect_results(cl::CommandQueue* queue) { - queue->enqueueReadBuffer(clResultBuffer, true, 0, sizeof(float) * data_size, gpuHostBuffer.data()); + queue->enqueueReadBuffer(clResultBuffer, true, 0, sizeof(float) * data_size, gpuResult.data()); } bool Square::validate_results() { for (size_t index = 0; index < data_size; index++) { - if (cpuResult[index] != gpuHostBuffer[index]) { - std::cout << "Wrong result at [" << index << "]: " << gpuHostBuffer[index] << "!=" << cpuResult[index] << std::endl; + if (cpuResult[index] != gpuResult[index]) { + std::cout << "Wrong result at [" << index << "]: " << gpuResult[index] << "!=" << cpuResult[index] << std::endl; return false; } } diff --git a/Primitives/primitives/Tests.h b/Primitives/primitives/Tests.h index 2676a14..d3898ad 100644 --- a/Primitives/primitives/Tests.h +++ b/Primitives/primitives/Tests.h @@ -2,11 +2,11 @@ #include #include "../cl.hpp" - - class TestCase { -private: - std::string kernelCode; +protected: + cl::Buffer clInputBuffer; + cl::Buffer 
clResultBuffer; + size_t data_size; public: virtual void collect_results(cl::CommandQueue* queue) = 0; virtual void gpu_compute( @@ -21,16 +21,41 @@ public: class Square : public TestCase { private: - size_t data_size = 4096; - std::vector gpuHostBuffer; + std::vector gpuResult; std::vector sourceData; std::vector cpuResult; - cl::Buffer clInputBuffer; - cl::Buffer clResultBuffer; public: Square(); void collect_results(cl::CommandQueue* queue); void gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event); void cpu_compute(); bool validate_results(); +}; + +class Histogram : public TestCase { +private: + bool global; + size_t valueSet; + std::vector gpuResult; + std::vector sourceData; + std::vector cpuResult; +public: + Histogram(bool _global, int _valueSet, int _data_size); + void collect_results(cl::CommandQueue* queue); + void gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event); + void cpu_compute(); + bool validate_results(); +}; + +class ReduceAdd : public TestCase { +private: + std::vector gpuResult; + std::vector sourceData; + float cpuResult; +public: + ReduceAdd(size_t max_size); + void collect_results(cl::CommandQueue* queue); + void gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event); + void cpu_compute(); + bool validate_results(); }; \ No newline at end of file diff --git a/kernels/programs.cl b/kernels/programs.cl index a5ef2c2..16a8267 100644 --- a/kernels/programs.cl +++ b/kernels/programs.cl @@ -1,6 +1,6 @@ // map operator with f(x) = x*x __kernel void square(__global float* inputData, - __global float* outputData) + __global float* outputData) { int id = get_global_id(0); //get_local_id a workgroup-on belul outputData[id] = inputData[id] * inputData[id]; @@ -36,7 +36,23 @@ void histogram_global(__global int* data, __global int* histogram) __kernel void histogram_local(__global int* data, __global int* histogram, 
__local int* lhistogram, const int histogramSize)
 {
-
+	int id = get_global_id(0);
+	int lid = get_local_id(0);
+
+	if (lid < histogramSize) {
+		lhistogram[lid] = 0;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	atomic_add(&lhistogram[data[id]], 1);
+
+	barrier(CLK_LOCAL_MEM_FENCE); // local fence: lhistogram is __local, per-workgroup memory
+
+	if (lid < histogramSize) {
+		atomic_add(&histogram[lid], lhistogram[lid]);
+	}
+
 }
 
 // TODO
@@ -51,7 +67,15 @@ void histogram_local(__global int* data, __global int* histogram, __local int* l
 __kernel
 void reduce_global(__global float* data)
 {
+	int id = get_global_id(0);
 
+	// one right shift is = divide number by two
+	for (size_t s = get_global_size(0) / 2; s > 0; s >>= 1) {
+		if (id < s) {
+			data[id] = data[id] + data[id + s];
+		}
+		barrier(CLK_GLOBAL_MEM_FENCE);
+	}
 }
 
 // TODO
@@ -71,7 +95,15 @@ void reduce_global(__global float* data)
 __kernel
 void exscan_global(__global int* data)
 {
-
+	int id = get_global_id(0);
+	// stage the neighbour's value in a register before the barrier,
+	// otherwise reading data[id - 1] races with the thread writing it
+	int value = 0;
+	if (id > 0) {
+		value = data[id - 1];
+	}
+	barrier(CLK_GLOBAL_MEM_FENCE);
+	data[id] = value;
 }
 
 // TODO
@@ -83,7 +115,7 @@ void exscan_global(__global int* data)
 __kernel
 void compact_predicate(__global int* data, __global int* pred)
 {
-
+
 }
 
 // TODO
@@ -105,5 +137,5 @@ void compact_exscan(__global int* pred, __global int* prefSum)
 __kernel
 void compact_compact(__global int* data, __global int* pred, __global int* prefSum)
 {
-
+
 }
\ No newline at end of file