Browse Source

Histogram + Reduce

master
trinitas 3 years ago
parent
commit
5b1820c7be
  1. 7
      Primitives/OpenCLHandler.cpp
  2. 2
      Primitives/OpenCLHandler.h
  3. 89
      Primitives/Primitives.cpp
  4. 2
      Primitives/Primitives.vcxproj
  5. 6
      Primitives/Primitives.vcxproj.filters
  6. 90
      Primitives/primitives/Histogram.cpp
  7. 62
      Primitives/primitives/ReduceAdd.cpp
  8. 20
      Primitives/primitives/Square.cpp
  9. 41
      Primitives/primitives/Tests.h
  10. 42
      kernels/programs.cl

7
Primitives/OpenCLHandler.cpp

@ -26,6 +26,8 @@ OpenCLHandler::OpenCLHandler()
std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
std::cout << devices[0].getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() << std::endl;
max_workgroup_size = devices[0].getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
// Create the command queue
cl::Event event;
queue = cl::CommandQueue(context, devices[0], 0, &err);
@ -48,4 +50,9 @@ bool OpenCLHandler::run_test(TestCase* test)
test->collect_results(&queue);
return test->validate_results();
}
size_t OpenCLHandler::get_max_size()
{
	// Accessor for the device's maximum work-group size, which the
	// constructor queried via CL_DEVICE_MAX_WORK_GROUP_SIZE.
	return max_workgroup_size;
}

2
Primitives/OpenCLHandler.h

@ -7,7 +7,9 @@ private:
cl::Program program;
cl::Context context;
cl::CommandQueue queue;
size_t max_workgroup_size;
public:
OpenCLHandler();
bool run_test(TestCase* test);
size_t get_max_size();
};

89
Primitives/Primitives.cpp

@ -15,59 +15,62 @@
//const size_t dataSize = 4096;
//
//void capi()
//{
// // Get a platform ID
// cl_platform_id platformID;
// clGetPlatformIDs(1, &platformID, NULL);
//
// // Get a device ID
// cl_device_id deviceID;
// clGetDeviceIDs(platformID, CL_DEVICE_TYPE_GPU, 1, &deviceID, NULL);
//
// // Create a context
// cl_context context;
// cl_context_properties contextProperties[] =
// { CL_CONTEXT_PLATFORM, (cl_context_properties)platformID, 0 };
// context = clCreateContext(contextProperties, 1, &deviceID, NULL, NULL, NULL);
//
// // Create a command queue
// cl_command_queue queue;
// queue = clCreateCommandQueue(context, deviceID, CL_QUEUE_PROFILING_ENABLE, NULL);
//
// // Create an OpenCL program
// std::string source = FileToString("../kernels/programs.cl");
// const char* csource = source.c_str();
// cl_program program = clCreateProgramWithSource(context, 1, &csource, NULL, NULL);
// cl_int err = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL);
// if (err != CL_SUCCESS)
// {
// cl_uint logLength;
// clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLength);
// char* log = new char[logLength];
// clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, logLength, log, 0);
// std::cout << log << std::endl;
// delete[] log;
// exit(-1);
// }
//}
void capi()
{
// Get a platform ID
cl_platform_id platformID;
clGetPlatformIDs(1, &platformID, NULL);
// Get a device ID
cl_device_id deviceID;
clGetDeviceIDs(platformID, CL_DEVICE_TYPE_GPU, 1, &deviceID, NULL);
// Create a context
cl_context context;
cl_context_properties contextProperties[] =
{ CL_CONTEXT_PLATFORM, (cl_context_properties)platformID, 0 };
context = clCreateContext(contextProperties, 1, &deviceID, NULL, NULL, NULL);
// Create a command queue
cl_command_queue queue;
queue = clCreateCommandQueue(context, deviceID, CL_QUEUE_PROFILING_ENABLE, NULL);
// Create an OpenCL program
std::string source = FileToString("../kernels/programs.cl");
const char* csource = source.c_str();
cl_program program = clCreateProgramWithSource(context, 1, &csource, NULL, NULL);
cl_int err = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
cl_uint logLength;
clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLength);
char* log = new char[logLength];
clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, logLength, log, 0);
std::cout << log << std::endl;
delete[] log;
exit(-1);
}
}
int main()
{
capi();
OpenCLHandler handler;
std::vector<TestCase*> tests;
Square s;
TestCase* t;
t = &s;
tests.push_back(t);
Histogram h(false, 32, 4096);
ReduceAdd r1(handler.get_max_size());
for (size_t i = 0; i < tests.size(); i++) {
handler.run_test(tests[i]);
//handler.run_test(&s);
//handler.run_test(&h);
if (handler.run_test(&r1)) {
std::cout << "Success" << std::endl;
}
else {
std::cout << "Failure" << std::endl;
}
return 0;
}

2
Primitives/Primitives.vcxproj

@ -80,6 +80,8 @@
<ItemGroup>
<ClCompile Include="OpenCLHandler.cpp" />
<ClCompile Include="Primitives.cpp" />
<ClCompile Include="primitives\Histogram.cpp" />
<ClCompile Include="primitives\ReduceAdd.cpp" />
<ClCompile Include="primitives\Square.cpp" />
</ItemGroup>
<ItemGroup>

6
Primitives/Primitives.vcxproj.filters

@ -44,6 +44,12 @@
<ClCompile Include="Common.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="primitives\Histogram.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="primitives\ReduceAdd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<None Include="..\kernels\programs.cl">

90
Primitives/primitives/Histogram.cpp

@ -0,0 +1,90 @@
#include <iostream>
#include "../Common.h"
#include "Tests.h"
#include <random>
Histogram::Histogram(bool _global, int _valueSet, int _data_size)
{
	// Configure the test: kernel variant (global vs. local atomics), number
	// of histogram bins, and input length.
	global = _global;
	valueSet = _valueSet;
	data_size = _data_size;

	// Fill the input with uniformly distributed bin indices in [0, valueSet).
	std::random_device seed;
	std::mt19937 engine(seed());
	std::uniform_int_distribution<int> bin(0, valueSet - 1);
	for (size_t i = 0; i < data_size; ++i) {
		sourceData.push_back(bin(engine));
	}

	// One counter per possible value, all starting at zero.
	cpuResult.resize(valueSet, 0);
	gpuResult.resize(valueSet, 0);
}
void Histogram::collect_results(cl::CommandQueue* queue)
{
	// Blocking read: copy the per-bin counts computed on the device back
	// into gpuResult for validation.
	const size_t resultBytes = sizeof(int) * valueSet;
	queue->enqueueReadBuffer(clResultBuffer, true, 0, resultBytes, gpuResult.data());
}
void Histogram::gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event)
{
	cl_int err = CL_SUCCESS;

	// Select the kernel variant: "global" uses global-memory atomics only,
	// "local" accumulates into a per-workgroup local histogram first.
	cl::Kernel kernel;
	if (global) {
		kernel = cl::Kernel(*program, "histogram_global", &err);
	}
	else {
		kernel = cl::Kernel(*program, "histogram_local", &err);
	}
	CheckCLError(err);

	// Upload the input values. Check the buffer-creation error before the
	// write, and the write's own return code after it (the original checked
	// a stale err here).
	clInputBuffer = cl::Buffer(*context, CL_MEM_READ_ONLY, sizeof(int) * data_size, NULL, &err);
	CheckCLError(err);
	err = queue->enqueueWriteBuffer(clInputBuffer,
		true, // Blocking!
		0, sizeof(int) * data_size, sourceData.data());
	CheckCLError(err);

	// Allocate the output histogram and zero it on the device: the kernels
	// only ever atomic_add into it, so it must start from all zeros.
	// (gpuResult is zero-filled by the constructor at this point.)
	clResultBuffer = cl::Buffer(*context, CL_MEM_WRITE_ONLY, sizeof(int) * valueSet, NULL, &err);
	CheckCLError(err);
	err = queue->enqueueWriteBuffer(clResultBuffer, true, 0, sizeof(int) * valueSet, gpuResult.data());
	CheckCLError(err);

	// Set the kernel parameters, in declaration order.
	CheckCLError(kernel.setArg(0, clInputBuffer));
	CheckCLError(kernel.setArg(1, clResultBuffer));
	if (!global) {
		// Local variant additionally needs local scratch memory plus the bin
		// count. The kernel declares `const int histogramSize`, so pass an
		// int — passing the size_t member makes the argument size mismatch.
		CheckCLError(kernel.setArg(2, sizeof(int) * valueSet, NULL));
		CheckCLError(kernel.setArg(3, static_cast<int>(valueSet)));
	}

	// Enqueue the kernel: one work-item per input element, work-group size
	// chosen by the runtime.
	// NOTE(review): the local variant assumes the chosen work-group size is
	// >= valueSet so every local bin gets initialized and flushed — confirm.
	err = queue->enqueueNDRangeKernel(kernel,
		cl::NullRange,                // no index offset
		cl::NDRange(data_size, 1),   // one work-item per element
		cl::NullRange,               // let the runtime pick the group size
		NULL,
		Event);
	CheckCLError(err);
}
void Histogram::cpu_compute()
{
	// Reference histogram: count every occurrence of each value on the host.
	for (int value : sourceData) {
		++cpuResult[value];
	}
}
bool Histogram::validate_results()
{
	// Compare the GPU histogram against the CPU reference bin by bin,
	// reporting the first mismatch.
	for (size_t bin = 0; bin < valueSet; ++bin) {
		if (cpuResult[bin] != gpuResult[bin]) {
			std::cout << "Wrong result at [" << bin << "]: " << gpuResult[bin] << "!=" << cpuResult[bin] << std::endl;
			return false;
		}
	}
	const char* variant = global ? "global" : "local";
	std::cout << "Test \"Histogram - " << variant << "\" completed, set: " << valueSet << ", size: " << data_size << std::endl;
	return true;
}

62
Primitives/primitives/ReduceAdd.cpp

@ -0,0 +1,62 @@
#include <iostream>
#include <iomanip>
#include "../Common.h"
#include "Tests.h"
#include <random>
#include <stdlib.h>
ReduceAdd::ReduceAdd(size_t max_size)
{
	// Problem size: one element per work-item of a single maximal work-group.
	data_size = max_size;
	cpuResult = 0.0f;

	// Random input in [0, 1000), kept twice: sourceData stays pristine for
	// the CPU reference sum, gpuResult is the working copy the device
	// reduces in place.
	const float upper = 1000.0f;
	std::random_device seed;
	std::mt19937 engine(seed());
	std::uniform_real_distribution<float> sample(0.0f, upper);
	for (size_t i = 0; i < data_size; ++i)
	{
		const float value = sample(engine);
		gpuResult.push_back(value);
		sourceData.push_back(value);
	}
}
void ReduceAdd::gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event)
{
	cl_int err = CL_SUCCESS;
	cl::Kernel kernel = cl::Kernel(*program, "reduce_global", &err);
	CheckCLError(err);

	// The buffer holds floats (the original sized it with sizeof(int), which
	// only happens to work because the sizes coincide), and the kernel
	// reduces in place, so it must be READ_WRITE, not READ_ONLY.
	clInputBuffer = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(float) * data_size, NULL, &err);
	CheckCLError(err);
	err = queue->enqueueWriteBuffer(clInputBuffer, true, 0, sizeof(float) * data_size, gpuResult.data());
	CheckCLError(err);

	CheckCLError(kernel.setArg(0, clInputBuffer));

	// One work-item per element, all in one work-group: reduce_global
	// synchronizes with barriers, which only act within a work-group. Using
	// data_size (== the device's max work-group size, per the constructor)
	// instead of a hard-coded 1024 keeps the launch valid on any device.
	err = queue->enqueueNDRangeKernel(kernel,
		cl::NullRange,                // no index offset
		cl::NDRange(data_size, 1),   // global size
		cl::NDRange(data_size, 1),   // local size: the single work-group
		NULL,
		Event);
	CheckCLError(err);
}
void ReduceAdd::cpu_compute()
{
	// Reference result: straightforward serial summation on the host.
	for (float value : sourceData) {
		cpuResult += value;
	}
	std::cout << "CPU result is " << std::setprecision(12) << cpuResult << std::endl;
}
void ReduceAdd::collect_results(cl::CommandQueue* queue)
{
	// Blocking read of the reduced buffer; after the kernel the sum sits in
	// element 0. The elements are floats, so size with sizeof(float) — the
	// original's sizeof(int) only worked by coincidence of equal sizes.
	cl_int err = queue->enqueueReadBuffer(clInputBuffer, true, 0, sizeof(float) * data_size, gpuResult.data());
	CheckCLError(err);
	std::cout << "Results grabbed" << std::endl;
}
bool ReduceAdd::validate_results()
{
	// Compare the GPU sum (gpuResult[0]) against the CPU reference within a
	// tolerance: the GPU's tree-shaped reduction rounds differently from the
	// serial CPU sum. The original called unqualified abs() with only
	// <stdlib.h> included, which can resolve to int abs(int) and truncate
	// the float difference; compare explicitly instead.
	const float diff = cpuResult - gpuResult[0];
	return diff < 0.3f && diff > -0.3f;
}

20
Primitives/primitives/Square.cpp

@ -1,13 +1,21 @@
#include <iostream>
#include "../Common.h"
#include "Tests.h"
#include <random>
Square::Square() {
	// 4096 random floats in [0, 1000): gpuResult is the device working copy,
	// sourceData the untouched input for the CPU reference pass.
	// (The diff view interleaved the removed gpuHostBuffer/index-based lines
	// with the added ones; this is the post-commit body.)
	const float vmax = 1000.0f;
	data_size = 4096;
	std::random_device rd;
	std::mt19937 gen(rd());
	std::uniform_real_distribution<float> distr(0.0f, vmax);
	for (size_t index = 0; index < data_size; ++index)
	{
		float val = distr(gen);
		gpuResult.push_back(val);
		sourceData.push_back(val);
	}
}
void Square::gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event) {
@ -20,7 +28,7 @@ void Square::gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Prog
clInputBuffer = cl::Buffer(*context, CL_MEM_READ_ONLY, sizeof(float) * data_size, NULL, &err);
queue->enqueueWriteBuffer(clInputBuffer,
true, // Blocking!
0, sizeof(float) * data_size, gpuHostBuffer.data());
0, sizeof(float) * data_size, gpuResult.data());
// Allocate the output data
clResultBuffer = cl::Buffer(*context, CL_MEM_WRITE_ONLY, sizeof(float) * data_size, NULL, &err);
@ -46,13 +54,13 @@ void Square::cpu_compute()
}
void Square::collect_results(cl::CommandQueue* queue) {
	// Blocking read of the squared values back into the host-side buffer.
	// (The diff view also showed the removed gpuHostBuffer variant of this
	// line; gpuResult is the post-commit destination.)
	queue->enqueueReadBuffer(clResultBuffer, true, 0, sizeof(float) * data_size, gpuResult.data());
}
bool Square::validate_results() {
for (size_t index = 0; index < data_size; index++) {
if (cpuResult[index] != gpuHostBuffer[index]) {
std::cout << "Wrong result at [" << index << "]: " << gpuHostBuffer[index] << "!=" << cpuResult[index] << std::endl;
if (cpuResult[index] != gpuResult[index]) {
std::cout << "Wrong result at [" << index << "]: " << gpuResult[index] << "!=" << cpuResult[index] << std::endl;
return false;
}
}

41
Primitives/primitives/Tests.h

@ -2,11 +2,11 @@
#include <string>
#include "../cl.hpp"
class TestCase {
private:
std::string kernelCode;
protected:
cl::Buffer clInputBuffer;
cl::Buffer clResultBuffer;
size_t data_size;
public:
virtual void collect_results(cl::CommandQueue* queue) = 0;
virtual void gpu_compute(
@ -21,16 +21,41 @@ public:
// Test case: element-wise square via the "square" kernel, validated against
// a CPU loop.
class Square : public TestCase {
private:
	// NOTE(review): the base TestCase also declares data_size; this member
	// shadows it — likely a leftover of the refactor, confirm which is used.
	size_t data_size = 4096;
	// NOTE(review): gpuHostBuffer appears superseded by gpuResult in this
	// commit (the .cpp only uses gpuResult) — likely a stale diff line.
	std::vector<float> gpuHostBuffer;
	std::vector<float> gpuResult;   // device working copy / readback target
	std::vector<float> sourceData;  // pristine input for the CPU reference
	std::vector<float> cpuResult;   // CPU reference output
	// NOTE(review): the base class also declares these buffers (protected);
	// these duplicates shadow them — confirm against the full diff.
	cl::Buffer clInputBuffer;
	cl::Buffer clResultBuffer;
public:
	Square();
	void collect_results(cl::CommandQueue* queue);
	void gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event);
	void cpu_compute();
	bool validate_results();
};
// Test case: integer histogram computed on the device (global- or
// local-atomic kernel variant) and validated against a CPU count.
class Histogram : public TestCase {
private:
	bool global;                 // true: "histogram_global" kernel, false: "histogram_local"
	size_t valueSet;             // number of histogram bins
	std::vector<int> gpuResult;  // per-bin counts read back from the device
	std::vector<int> sourceData; // random input values in [0, valueSet)
	std::vector<int> cpuResult;  // CPU reference per-bin counts
public:
	Histogram(bool _global, int _valueSet, int _data_size);
	void collect_results(cl::CommandQueue* queue);
	void gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event);
	void cpu_compute();
	bool validate_results();
};
// Test case: in-place parallel sum-reduction on the device, compared within
// a tolerance against a serial CPU sum.
class ReduceAdd : public TestCase {
private:
	std::vector<float> gpuResult;  // working copy; element 0 holds the GPU sum after readback
	std::vector<float> sourceData; // pristine input for the CPU reference
	float cpuResult;               // CPU reference sum
public:
	// max_size: element count; the .cpp launches it as one work-group, so it
	// is fed the device's maximum work-group size.
	ReduceAdd(size_t max_size);
	void collect_results(cl::CommandQueue* queue);
	void gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event);
	void cpu_compute();
	bool validate_results();
};

42
kernels/programs.cl

@ -1,6 +1,6 @@
// map operator with f(x) = x*x
__kernel void square(__global float* inputData,
__global float* outputData)
__global float* outputData)
{
int id = get_global_id(0); //get_local_id a workgroup-on belul
outputData[id] = inputData[id] * inputData[id];
@ -36,7 +36,23 @@ void histogram_global(__global int* data, __global int* histogram)
__kernel
void histogram_local(__global int* data, __global int* histogram, __local int* lhistogram, const int histogramSize)
{
	// Builds a per-workgroup histogram in local memory, then flushes it into
	// the global histogram. Assumes the work-group size is >= histogramSize
	// so the first histogramSize work-items can zero and flush every bin.
	int id = get_global_id(0);
	int lid = get_local_id(0);

	// Zero the local bins before anyone accumulates into them.
	if (lid < histogramSize) {
		lhistogram[lid] = 0;
	}
	barrier(CLK_LOCAL_MEM_FENCE);

	// atomic_add on an int bin takes an int increment: 1, not 1.0f (the
	// original passed a float literal to the integer atomic).
	atomic_add(&lhistogram[data[id]], 1);

	// Local-memory fence: wait until every work-item in this group has
	// finished accumulating (the original used the global fence here).
	barrier(CLK_LOCAL_MEM_FENCE);

	// Flush this group's partial histogram into the global one.
	if (lid < histogramSize) {
		atomic_add(&histogram[lid], lhistogram[lid]);
	}
}
// TODO
@ -51,7 +67,15 @@ void histogram_local(__global int* data, __global int* histogram, __local int* l
__kernel
void reduce_global(__global float* data)
{
	// In-place tree reduction: after the loop, data[0] holds the sum of the
	// whole buffer. Each pass adds the upper half of the active range onto
	// the lower half, then halves the range.
	// NOTE(review): barrier() only synchronizes work-items within a single
	// work-group, yet the loop bound is get_global_size(0) — this is only
	// correct when the kernel is launched as one work-group covering the
	// whole input. Confirm against the host launch configuration.
	int id = get_global_id(0);
	// one right shift halves the active range each pass
	for (size_t s = get_global_size(0) / 2; s > 0; s >>= 1) {
		if (id < s) {
			data[id] = data[id] + data[id + s];
		}
		barrier(CLK_GLOBAL_MEM_FENCE);
	}
}
// TODO
@ -71,7 +95,15 @@ void reduce_global(__global float* data)
__kernel
void exscan_global(__global float* data)
{
	// Shifts every element one slot to the right: data[id] takes the old
	// value of data[id - 1], and data[0] becomes 0 (the identity element of
	// an exclusive scan).
	// Fixes two defects in the original: `ID` was an undefined identifier
	// (the variable is `id`), and writing data[id] before any barrier raced
	// with the neighbouring work-item still reading it — so read first,
	// barrier, then write.
	// NOTE(review): barrier() only synchronizes within one work-group, so
	// like reduce_global this assumes a single-work-group launch — confirm.
	int id = get_global_id(0);
	float shifted = (id > 0) ? data[id - 1] : 0;
	barrier(CLK_GLOBAL_MEM_FENCE);
	data[id] = shifted;
}
// TODO
@ -83,7 +115,7 @@ void exscan_global(__global int* data)
__kernel
void compact_predicate(__global int* data, __global int* pred)
{
	// TODO: not implemented. Presumably the first stage of stream
	// compaction — write pred[id] = 1 if data[id] should be kept and 0
	// otherwise — but the intended predicate is not visible here; confirm
	// before implementing.
}
// TODO
@ -105,5 +137,5 @@ void compact_exscan(__global int* pred, __global int* prefSum)
__kernel
void compact_compact(__global int* data, __global int* pred, __global int* prefSum)
{
	// TODO: not implemented. Presumably the scatter stage of stream
	// compaction — for pred[id] == 1, write data[id] to position
	// prefSum[id] — but no implementation exists yet; confirm the intended
	// output buffer before implementing.
}
Loading…
Cancel
Save