Browse Source

Implemented Simple MVM

tg-25 #in-progress
master
Daniel Gyulai 3 years ago
parent
commit
ae5ae5e612
  1. 203
      Linear/Common.h
  2. 145
      Linear/Jacobi.cpp
  3. 11
      Linear/Large.cpp
  4. 179
      Linear/Linear.cpp
  5. 14
      Linear/Linear.vcxproj
  6. 25
      Linear/Linear.vcxproj.filters
  7. 62
      Linear/LinearTests.h
  8. 11
      Linear/Reduce.cpp
  9. 51
      Linear/Simple.cpp
  10. 12936
      Linear/cl.hpp
  11. 37
      kernels/linear.cl

203
Linear/Common.h

@ -1,203 +0,0 @@
#pragma once
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <sstream>
#include "cl.hpp"
#pragma warning( disable : 4996 )
/// Waits for an OpenCL event to complete and prints its profiling timestamps.
///
/// Prints the raw start and end counters returned by clGetEventProfilingInfo
/// and their difference divided by 1e6, labelled "ms", to stdout.
///
/// @param event  Event to wait on and query. When NULL a diagnostic is written
///               to stderr and nothing else is done.
///
/// If either profiling query fails, the error is reported on stderr and no
/// timing line is printed (the timestamps would be meaningless).
void printTimeStats(cl_event event)
{
    if (event == NULL)
    {
        // There is nothing to wait on or query; bail out instead of passing a
        // null handle to the profiling API.
        std::cerr << "No event object returned!" << std::endl;
        return;
    }
    clWaitForEvents(1, &event);

    cl_ulong execStart = 0;
    cl_ulong execEnd = 0;

    cl_int err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
                                         sizeof(cl_ulong), &execStart, NULL);
    if (err != CL_SUCCESS)
    {
        std::cerr << "Error during profile query: CL_PROFILING_COMMAND_START [" << err << "]." << std::endl;
        return;
    }

    err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
                                  sizeof(cl_ulong), &execEnd, NULL);
    if (err != CL_SUCCESS)
    {
        std::cerr << "Error during profile query: CL_PROFILING_COMMAND_END [" << err << "]." << std::endl;
        return;
    }

    std::cout << "[start] " << execStart << " [end] " << execEnd << " [time] " << (execEnd - execStart) / 1e+06 << "ms." << std::endl;
}
/// Writes a 24-bit uncompressed true-colour TGA image.
///
/// @param filename  Path of the file to create (opened with "wb").
/// @param data      Pixel data, 3 bytes per pixel in R,G,B order, stored row by
///                  row with `width` pixels per row.
/// @param width     Image width in pixels; only the low 16 bits are stored in
///                  the header.
/// @param height    Image height in pixels; only the low 16 bits are stored in
///                  the header.
///
/// Rows are emitted starting from the last row of `data` and ending with the
/// first; each pixel is written as B,G,R. If the file cannot be created, a
/// message is printed to stderr and the process exits with EXIT_FAILURE.
void WriteTGA_RGB(const char* filename, unsigned char* data, unsigned int width, unsigned int height)
{
    FILE* f = fopen(filename, "wb");
    if (!f) {
        fprintf(stderr, "Unable to create output TGA image `%s'\n", filename);
        exit(EXIT_FAILURE);
    }

    const unsigned char header[18] = {
        0x00,                                             /* ID Length, 0 => No ID */
        0x00,                                             /* Color Map Type, 0 => No color map included */
        0x02,                                             /* Image Type, 2 => Uncompressed, True-color Image */
        0x00, 0x00, 0x00, 0x00, 0x00,                     /* Color map: 2 bytes index, 2 bytes length, 1 byte size */
        0x00, 0x00,                                       /* X-origin of Image */
        0x00, 0x00,                                       /* Y-origin of Image */
        static_cast<unsigned char>(width & 0xff),         /* Image Width (low byte) */
        static_cast<unsigned char>((width >> 8) & 0xff),  /* Image Width (high byte) */
        static_cast<unsigned char>(height & 0xff),        /* Image Height (low byte) */
        static_cast<unsigned char>((height >> 8) & 0xff), /* Image Height (high byte) */
        0x18,                                             /* Pixel Depth, 0x18 => 24 Bits */
        0x20                                              /* Image Descriptor */
    };
    fwrite(header, 1, sizeof(header), f);

    // Walk the rows from last to first; size_t arithmetic avoids overflow of
    // the byte offset for large images.
    for (unsigned int row = height; row-- > 0;) {
        const unsigned char* px = data + static_cast<size_t>(row) * width * 3;
        for (unsigned int x = 0; x < width; ++x, px += 3) {
            fputc(px[2], f); /* write blue */
            fputc(px[1], f); /* write green */
            fputc(px[0], f); /* write red */
        }
    }

    // Close the stream so buffered data reaches the file and the handle is
    // released (the original leaked it).
    fclose(f);
}
/// Reads an entire file into a string.
///
/// @param path  File to open (binary mode).
/// @return The file's bytes, or an empty string when the file cannot be opened.
std::string FileToString(const std::string& path) {
    std::ifstream input(path, std::ios::in | std::ios::binary);
    if (!input)
    {
        return std::string();
    }

    // Stream the whole file buffer into memory in one go.
    std::ostringstream buffer;
    buffer << input.rdbuf();
    input.close();
    return buffer.str();
}
/// Maps an OpenCL status code to the name of the corresponding constant.
///
/// @param error  Status code as returned by an OpenCL call.
/// @return A string literal with the constant's name, or
///         "Unknown OpenCL error" when the code is not in the table.
const char *getErrorString(cl_int error)
{
    struct ErrorName {
        int code;
        const char *name;
    };

    static const ErrorName kErrorNames[] = {
        // run-time and JIT compiler errors
        { 0, "CL_SUCCESS" },
        { -1, "CL_DEVICE_NOT_FOUND" },
        { -2, "CL_DEVICE_NOT_AVAILABLE" },
        { -3, "CL_COMPILER_NOT_AVAILABLE" },
        { -4, "CL_MEM_OBJECT_ALLOCATION_FAILURE" },
        { -5, "CL_OUT_OF_RESOURCES" },
        { -6, "CL_OUT_OF_HOST_MEMORY" },
        { -7, "CL_PROFILING_INFO_NOT_AVAILABLE" },
        { -8, "CL_MEM_COPY_OVERLAP" },
        { -9, "CL_IMAGE_FORMAT_MISMATCH" },
        { -10, "CL_IMAGE_FORMAT_NOT_SUPPORTED" },
        { -11, "CL_BUILD_PROGRAM_FAILURE" },
        { -12, "CL_MAP_FAILURE" },
        { -13, "CL_MISALIGNED_SUB_BUFFER_OFFSET" },
        { -14, "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST" },
        { -15, "CL_COMPILE_PROGRAM_FAILURE" },
        { -16, "CL_LINKER_NOT_AVAILABLE" },
        { -17, "CL_LINK_PROGRAM_FAILURE" },
        { -18, "CL_DEVICE_PARTITION_FAILED" },
        { -19, "CL_KERNEL_ARG_INFO_NOT_AVAILABLE" },
        // compile-time errors
        { -30, "CL_INVALID_VALUE" },
        { -31, "CL_INVALID_DEVICE_TYPE" },
        { -32, "CL_INVALID_PLATFORM" },
        { -33, "CL_INVALID_DEVICE" },
        { -34, "CL_INVALID_CONTEXT" },
        { -35, "CL_INVALID_QUEUE_PROPERTIES" },
        { -36, "CL_INVALID_COMMAND_QUEUE" },
        { -37, "CL_INVALID_HOST_PTR" },
        { -38, "CL_INVALID_MEM_OBJECT" },
        { -39, "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR" },
        { -40, "CL_INVALID_IMAGE_SIZE" },
        { -41, "CL_INVALID_SAMPLER" },
        { -42, "CL_INVALID_BINARY" },
        { -43, "CL_INVALID_BUILD_OPTIONS" },
        { -44, "CL_INVALID_PROGRAM" },
        { -45, "CL_INVALID_PROGRAM_EXECUTABLE" },
        { -46, "CL_INVALID_KERNEL_NAME" },
        { -47, "CL_INVALID_KERNEL_DEFINITION" },
        { -48, "CL_INVALID_KERNEL" },
        { -49, "CL_INVALID_ARG_INDEX" },
        { -50, "CL_INVALID_ARG_VALUE" },
        { -51, "CL_INVALID_ARG_SIZE" },
        { -52, "CL_INVALID_KERNEL_ARGS" },
        { -53, "CL_INVALID_WORK_DIMENSION" },
        { -54, "CL_INVALID_WORK_GROUP_SIZE" },
        { -55, "CL_INVALID_WORK_ITEM_SIZE" },
        { -56, "CL_INVALID_GLOBAL_OFFSET" },
        { -57, "CL_INVALID_EVENT_WAIT_LIST" },
        { -58, "CL_INVALID_EVENT" },
        { -59, "CL_INVALID_OPERATION" },
        { -60, "CL_INVALID_GL_OBJECT" },
        { -61, "CL_INVALID_BUFFER_SIZE" },
        { -62, "CL_INVALID_MIP_LEVEL" },
        { -63, "CL_INVALID_GLOBAL_WORK_SIZE" },
        { -64, "CL_INVALID_PROPERTY" },
        { -65, "CL_INVALID_IMAGE_DESCRIPTOR" },
        { -66, "CL_INVALID_COMPILER_OPTIONS" },
        { -67, "CL_INVALID_LINKER_OPTIONS" },
        { -68, "CL_INVALID_DEVICE_PARTITION_COUNT" },
        // extension errors
        { -1000, "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR" },
        { -1001, "CL_PLATFORM_NOT_FOUND_KHR" },
        { -1002, "CL_INVALID_D3D10_DEVICE_KHR" },
        { -1003, "CL_INVALID_D3D10_RESOURCE_KHR" },
        { -1004, "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR" },
        { -1005, "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR" }
    };

    const unsigned int count = sizeof(kErrorNames) / sizeof(kErrorNames[0]);
    for (unsigned int idx = 0; idx < count; ++idx) {
        if (kErrorNames[idx].code == error) {
            return kErrorNames[idx].name;
        }
    }
    return "Unknown OpenCL error";
}
/// Reports an OpenCL status code on stdout when it signals failure.
///
/// @param err  Status code to inspect.
/// @return true when `err` equals CL_SUCCESS, false otherwise (after printing
///         the symbolic name obtained from getErrorString).
bool CheckCLError(cl_int err)
{
    const bool succeeded = (err == CL_SUCCESS);
    if (!succeeded)
    {
        std::cout << "OpenCL error: " << getErrorString(err) << std::endl;
    }
    return succeeded;
}

145
Linear/Jacobi.cpp

@ -0,0 +1,145 @@
#include "LinearTests.h"
#include <iostream>
// Allocates and fills the CPU-side (_c) and GPU-side (_g) copies of the test
// system: two zeroed solution vectors of length Jn, a Jn x Jn matrix with 0.5
// on the diagonal and 0 elsewhere, and a right-hand side of all ones.
void Jacobi::generateLinEq()
{
    // Value-initialised arrays start out as all zeros.
    Jx_c[0] = new float[Jn]();
    Jx_c[1] = new float[Jn]();
    Jx_g[0] = new float[Jn]();
    Jx_g[1] = new float[Jn]();

    // Zero matrices; only the diagonal needs to be written afterwards.
    JA_c = new float[Jn * Jn]();
    JA_g = new float[Jn * Jn]();
    for (int d = 0; d < Jn; ++d) {
        const int diag = d + d * Jn;
        JA_c[diag] = 0.5f;
        JA_g[diag] = 0.5f;
    }

    Jb_c = new float[Jn];
    Jb_g = new float[Jn];
    for (int k = 0; k < Jn; ++k) {
        Jb_c[k] = 1.0f;
        Jb_g[k] = 1.0f;
    }
}
// CPU reference: computes y = A * x + b, where A is an n x m matrix stored
// row by row, x has m elements and y and b have n elements.
void Jacobi::cpuScalarMV(int n, int m, float* y, const float* A, const float* x, const float* b)
{
    for (int row = 0; row < n; ++row) {
        const float* rowA = A + row * m;
        float acc = b[row];
        for (int col = 0; col < m; ++col) {
            acc += rowA[col] * x[col];
        }
        y[row] = acc;
    }
}
// Prints an n x m matrix (n rows, m columns, stored row by row) to stdout,
// one row per line with the elements separated by ", ".
void Jacobi::printMatrix(int n, int m, float* A)
{
    for (int i = 0; i < n; ++i) {
        // Each row holds m elements, so the row stride is m. The original
        // used n, which is only correct for square matrices or a single row.
        const float* row = A + i * m;
        for (int j = 0; j < m; ++j) {
            std::cout << row[j];
            if (j < m - 1) std::cout << ", ";
        }
        std::cout << std::endl;
    }
}
// Creates the matrix-vector multiplier implementation selected by `type`.
// Returns a heap-allocated object, or NULL when `type` is not one of the
// known MVType values.
MatrixVectorMultiplier* Jacobi::MethodFactory(MVType type, cl::Context* context, cl::CommandQueue* queue, cl::Program* program)
{
    switch (type) {
    case MVType::SimpleMV:
        return new Simple(context, queue, program);
    case MVType::ReduceMV:
        return new Reduce(context, queue, program);
    case MVType::LargeMV:
        return new Large(context, queue, program);
    default:
        return NULL;
    }
}
// Remembers which multiplier variant to use and builds the test system.
Jacobi::Jacobi(MVType _type)
    : type(_type)
{
    generateLinEq();
}
// Intentionally empty: this test does nothing in this step. The GPU results
// are already copied back to host memory inside the multiplier's dewIt()
// call (see Simple::dewIt), so `queue` is unused here.
void Jacobi::collect_results(cl::CommandQueue* queue)
{
}
// Runs 20 iterations of x_next = A * x + b on the GPU-side data using the
// multiplier selected at construction, ping-ponging between the two Jx_g
// buffers. Before switching buffers, the vector that served as input of the
// iteration is printed. Exits the process with -1 when the factory cannot
// create a multiplier.
// NOTE(review): the multiplier returned by MethodFactory is never deleted.
void Jacobi::gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event)
{
    MatrixVectorMultiplier* multiplier = MethodFactory(type, context, queue, program);
    if (multiplier == NULL) {
        std::cout << "Invalid factory parameter" << std::endl;
        exit(-1);
    }

    const int iterations = 20;
    int source = 0;
    for (int step = 0; step < iterations; ++step) {
        const int target = (source + 1) % 2;
        multiplier->dewIt(Jn, Jn, Jx_g[target], JA_g, Jx_g[source], Jb_g);
        printMatrix(1, Jn, Jx_g[source]);
        source = target;
    }
}
// CPU reference run: 20 iterations of x_next = A * x + b on the CPU-side
// data, alternating between the two Jx_c buffers as input and output.
void Jacobi::cpu_compute()
{
    const int iterations = 20;
    int source = 0;
    for (int step = 0; step < iterations; ++step) {
        const int target = (source + 1) % 2;
        cpuScalarMV(Jn, Jn, Jx_c[target], JA_c, Jx_c[source], Jb_c);
        source = target;
    }
}
// Reports whether the test passed and releases the buffers allocated by
// generateLinEq().
//
// No comparison between the CPU and GPU results is implemented yet, so the
// function always returns true.
bool Jacobi::validate_results()
{
    bool result = true;
    // Actual validation
    //printMatrix(Jn, Jn, JA_c);
    //printMatrix(1, Jn, Jx_c[0]);
    //printMatrix(1, Jn, Jx_c[1]);

    // Cleanup. The original guarded each delete with `ptr == 0`, which only
    // ever "freed" null pointers and leaked every buffer. delete[] on a null
    // pointer is a no-op, so no guard is needed. Pointers are reset so that a
    // second call cannot double-free.
    for (int buf = 0; buf < 2; ++buf) {
        delete[] Jx_c[buf];
        Jx_c[buf] = NULL;
        delete[] Jx_g[buf];
        Jx_g[buf] = NULL;
    }
    delete[] JA_c;
    JA_c = NULL;
    delete[] Jb_c;
    Jb_c = NULL;
    delete[] JA_g;
    JA_g = NULL;
    delete[] Jb_g;
    Jb_g = NULL;
    return result;
}
// Returns the textual description of this test; currently an empty string.
std::string Jacobi::description()
{
    return {};
}

11
Linear/Large.cpp

@ -0,0 +1,11 @@
#include "LinearTests.h"
// Constructor stub for the "large" matrix-vector multiplier. The arguments
// are currently ignored: the inherited context/queue/program pointers are
// not assigned here.
Large::Large(cl::Context* context, cl::CommandQueue* queue, cl::Program* program)
{
//TODO: Implement
}
// Stub: performs no computation and leaves `y` untouched.
void Large::dewIt(int n, int m, float* y, const float* A, const float* x, const float* b)
{
//TODO: Implement
}

179
Linear/Linear.cpp

@ -9,72 +9,14 @@
// OpenCL C++ API
#include "cl.hpp"
#include <iostream>
#include <OpenCLHandler.h>
#include "LinearTests.h"
// Gaussian elimination
// Test data for the "gaussian" kernel. GA is stored row by row: the host
// code indexes it as GA[j + i * GAn] with i < GAm and j < GAn, i.e. GAm rows
// of GAn values each.
const int GAn = 4;
const int GAm = 3;
float GA[] = { 2, 1, -1, 8,
-3, -1, 2, -11,
-2, 1, 2, -3 };
// Second data set; presumably the same layout (GBm rows of GBn values) --
// it is not referenced by the code visible here.
int GBn = 6;
int GBm = 3;
float GB[] = { 2, -1, 0, 1, 0, 0,
-1, 2, -1, 0, 1, 0,
0, -1, 2, 0, 0, 1 };
// Computes y = A * x + b on the CPU. A is an n x m matrix stored row by row,
// x has m elements, and y and b have n elements.
void scalarMV(int n, int m, float* y, const float* A, const float* x, const float* b) {
    for (int row = 0; row < n; ++row) {
        const float* rowA = A + row * m;
        float acc = b[row];
        for (int col = 0; col < m; ++col) {
            acc += rowA[col] * x[col];
        }
        y[row] = acc;
    }
}
// Jacobi iteration
// Problem size and the heap buffers filled by generateLinEq():
// Jx holds two vectors of Jn floats, JA a Jn x Jn matrix, Jb a vector of Jn
// floats. All pointers are NULL until generateLinEq() runs.
const int Jn = 8;
float* Jx[2] = { NULL, NULL };
float* JA = NULL;
float* Jb = NULL;
// Allocates and fills the global Jacobi test system: two zeroed vectors of
// length Jn, a Jn x Jn matrix with 0.5 on the diagonal and 0 elsewhere, and a
// right-hand side of all ones.
void generateLinEq()
{
    // Value-initialised arrays start out as all zeros.
    Jx[0] = new float[Jn]();
    Jx[1] = new float[Jn]();

    // Zero matrix; only the diagonal needs to be written afterwards.
    JA = new float[Jn * Jn]();
    for (int d = 0; d < Jn; ++d) {
        JA[d + d * Jn] = 0.5f;
    }

    Jb = new float[Jn];
    for (int k = 0; k < Jn; ++k) {
        Jb[k] = 1.0f;
    }
}
// Frees the buffers allocated by generateLinEq() and resets the pointers.
//
// The original guarded each delete with `ptr == 0`, so only null pointers
// were ever "freed" and every buffer leaked. delete[] on a null pointer is a
// no-op, so no guard is needed; resetting to NULL makes repeated calls safe.
void releaseLinEq()
{
    delete[] Jx[0];
    Jx[0] = NULL;
    delete[] Jx[1];
    Jx[1] = NULL;
    delete[] JA;
    JA = NULL;
    delete[] Jb;
    Jb = NULL;
}
void capi()
{
// Get a platform ID
@ -96,7 +38,7 @@ void capi()
queue = clCreateCommandQueue(context, deviceID, CL_QUEUE_PROFILING_ENABLE, NULL);
// Create an OpenCL program
std::string source = FileToString("../kernels/programs.cl");
std::string source = FileToString("../kernels/linear.cl");
const char* csource = source.c_str();
cl_program program = clCreateProgramWithSource(context, 1, &csource, NULL, NULL);
cl_int err = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL);
@ -110,121 +52,16 @@ void capi()
delete[] log;
exit(-1);
}
// Get the kernel handle
cl_kernel kernel = clCreateKernel(program, "gaussian", &err);
if(!CheckCLError(err)) exit(-1);
// Allocate and upload the input data
// ...
cl_mem inputBuffer;
inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * GAn * GAm, NULL, &err);
if (!CheckCLError(err)) exit(-1);
clEnqueueWriteBuffer(queue, inputBuffer, CL_TRUE, 0, sizeof(float) * GAn * GAm, GA, 0, NULL, NULL);
// Set the kernel paramateres
clSetKernelArg(kernel, 0, sizeof(int), &GAn);
clSetKernelArg(kernel, 1, sizeof(int), &GAm);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputBuffer);
// Enqueue the kernel
size_t workSize = GAm;
size_t workGroupSize = GAm;
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &workSize, &workGroupSize, 0, NULL, NULL);
// Copy the result back to the host
clEnqueueReadBuffer(queue, inputBuffer, CL_TRUE, 0, sizeof(float) * GAm * GAn, GA, 0, NULL, NULL);
for (int i = 0; i < GAm; ++i) {
for (int j = 0; j < GAn; ++j) {
std::cout << GA[j + i * GAn];
if (j < GAn - 1) std::cout << ", ";
}
std::cout << std::endl;
}
clReleaseMemObject(inputBuffer);
clReleaseKernel(kernel);
std::cout << "Finished" << std::endl;
}
void cppapi()
{
cl_int err = CL_SUCCESS;
// Get a platform ID
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if (platforms.size() == 0)
{
std::cout << "Unable to find suitable platform." << std::endl;
exit(-1);
}
// Create a context
cl_context_properties properties[] =
{ CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0 };
cl::Context context(CL_DEVICE_TYPE_GPU, properties);
// Enumerate the devices
std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
// Create the command queue
cl::Event event;
cl::CommandQueue queue(context, devices[0], 0, &err);
// Create the OpenCL program
std::string programSource = FileToString("../kernels/programs.cl");
cl::Program program = cl::Program(context, programSource);
program.build(devices);
// Get the kernel handle
cl::Kernel kernel(program, "gaussian", &err);
CheckCLError(err);
// Allocate and upload the input data
// ...
cl::Buffer clInputBuffer = cl::Buffer(context, CL_MEM_READ_ONLY, sizeof(float) * GAn * GAm, NULL, &err);
queue.enqueueWriteBuffer(clInputBuffer, true, 0, sizeof(float) * GAn * GAm, GA);
// Set the kernel parameters
kernel.setArg(0, GAn);
kernel.setArg(1, GAm);
kernel.setArg(2, clInputBuffer);
// Enqueue the kernel
queue.enqueueNDRangeKernel(kernel,
cl::NullRange,
cl::NDRange(GAm, 1),
cl::NDRange(GAm, 1),
NULL,
&event);
event.wait();
// Copy result back to host
queue.enqueueReadBuffer(clInputBuffer, true, 0, sizeof(float) * GAn * GAm, GA);
// Validate the result
for (int i = 0; i < GAm; ++i) {
for (int j = 0; j < GAn; ++j) {
std::cout << GA[j + i * GAn];
if (j < GAn - 1) std::cout << ", ";
}
std::cout << std::endl;
}
std::cout << "Finished" << std::endl;
}
// Program entry point: runs the C-API and C++-API demos, then builds an
// OpenCLHandler for ../kernels/linear.cl and runs the Jacobi test with the
// SimpleMV multiplier through it. Always returns 0.
int main()
{
capi();
cppapi();
//cppapi();
// OpenCLHandler is declared in OpenCLHandler.h (not shown here).
OpenCLHandler handler("../kernels/linear.cl");
Jacobi j(MVType::SimpleMV);
handler.run_test(&j);
return 0;
}

14
Linear/Linear.vcxproj

@ -45,12 +45,13 @@
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<AdditionalIncludeDirectories>$(CUDA_PATH)\include</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>..\Common;$(CUDA_PATH)\include</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32\</AdditionalLibraryDirectories>
<AdditionalDependencies>OpenCL.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<SubSystem>Console</SubSystem>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@ -70,11 +71,11 @@
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="cl.hpp" />
<ClInclude Include="Common.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="Jacobi.cpp" />
<ClCompile Include="Large.cpp" />
<ClCompile Include="Linear.cpp" />
<ClCompile Include="Reduce.cpp" />
<ClCompile Include="Simple.cpp" />
</ItemGroup>
<ItemGroup>
<None Include="..\kernels\linear.cl" />
@ -84,6 +85,9 @@
<Project>{f66311cb-c60d-43de-890c-7e6d8179ca44}</Project>
</ProjectReference>
</ItemGroup>
<ItemGroup>
<ClInclude Include="LinearTests.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>

25
Linear/Linear.vcxproj.filters

@ -17,22 +17,31 @@
<UniqueIdentifier>{fd13ccc5-a98b-4e30-9eba-12f62c7dd566}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClInclude Include="cl.hpp">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="Common.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="Linear.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="Jacobi.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="Simple.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="Reduce.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="Large.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<None Include="..\kernels\linear.cl">
<Filter>Kernels</Filter>
</None>
</ItemGroup>
<ItemGroup>
<ClInclude Include="LinearTests.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
</Project>

62
Linear/LinearTests.h

@ -0,0 +1,62 @@
#pragma once
#include "Tests.h"
// Selects which MatrixVectorMultiplier implementation Jacobi::MethodFactory
// creates: Simple, Reduce or Large respectively.
enum class MVType {
SimpleMV, ReduceMV, LargeMV
};
// Abstract interface for the GPU matrix-vector multiplication variants.
//
// Implementations are created on the heap and handled through base-class
// pointers, so the destructor is virtual. The OpenCL handles are borrowed
// (not owned) and start out null so that a derived constructor that does not
// set them cannot leave them indeterminate.
class MatrixVectorMultiplier {
protected:
    cl::Context* context = nullptr;     // OpenCL context used to create buffers
    cl::CommandQueue* queue = nullptr;  // queue used for transfers and kernel launches
    cl::Program* program = nullptr;     // built program that contains the kernels

public:
    virtual ~MatrixVectorMultiplier() = default;

    // Multiplies the n x m matrix A (stored row by row) with the m-element
    // vector x, adds the n-element vector b, and stores the n-element result
    // in y -- this is what the Simple implementation does; the other
    // variants are expected to follow the same contract.
    virtual void dewIt(int n, int m, float* y, const float* A, const float* x, const float* b) = 0;
};
// Test case that repeatedly evaluates x_next = A * x + b on both the CPU
// and the GPU, using a selectable matrix-vector multiplier for the GPU side.
class Jacobi : public TestCase {
private:
// Dimension of the square test system.
const int Jn = 8;
// CPU
// Two solution vectors used alternately as input and output, the matrix
// and the right-hand side; allocated by generateLinEq().
float* Jx_c[2] = { NULL, NULL };
float* JA_c = NULL;
float* Jb_c = NULL;
// GPU
// Host-side copies of the same data that are handed to the multiplier.
float* Jx_g[2] = { NULL, NULL };
float* JA_g = NULL;
float* Jb_g = NULL;
// Multiplier variant chosen at construction.
MVType type;
// Allocates and fills all of the buffers above.
void generateLinEq();
// CPU reference implementation of y = A * x + b.
void cpuScalarMV(int n, int m, float* y, const float* A, const float* x, const float* b);
// Prints an n-row, m-column matrix to stdout.
void printMatrix(int n, int m, float* A);
// Creates the multiplier for `type`; returns NULL for an unknown type.
MatrixVectorMultiplier* MethodFactory(MVType type, cl::Context* context, cl::CommandQueue* queue, cl::Program* program);
public:
Jacobi(MVType type);
// Currently a no-op.
void collect_results(cl::CommandQueue* queue);
// Runs the iterations through the selected GPU multiplier.
void gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Program* program, cl::Event* Event);
// Runs the same iterations with cpuScalarMV.
void cpu_compute();
// Performs no comparison yet and always returns true; also contains the
// buffer cleanup code.
bool validate_results();
// Currently returns an empty string.
std::string description();
};
class Simple : public MatrixVectorMultiplier {
public:
Simple(cl::Context* context, cl::CommandQueue* queue, cl::Program* program);
void dewIt(int n, int m, float* y, const float* A, const float* x, const float* b);
};
class Reduce : public MatrixVectorMultiplier {
public:
Reduce(cl::Context* context, cl::CommandQueue* queue, cl::Program* program);
void dewIt(int n, int m, float* y, const float* A, const float* x, const float* b);
};
class Large : public MatrixVectorMultiplier {
public:
Large(cl::Context* context, cl::CommandQueue* queue, cl::Program* program);
void dewIt(int n, int m, float* y, const float* A, const float* x, const float* b);
};

11
Linear/Reduce.cpp

@ -0,0 +1,11 @@
#include "LinearTests.h"
// Constructor stub for the "reduce" matrix-vector multiplier. The arguments
// are currently ignored: the inherited context/queue/program pointers are
// not assigned here.
Reduce::Reduce(cl::Context* context, cl::CommandQueue* queue, cl::Program* program)
{
//TODO: Implement
}
// Stub: performs no computation and leaves `y` untouched.
void Reduce::dewIt(int n, int m, float* y, const float* A, const float* x, const float* b)
{
//TODO: Implement
}

51
Linear/Simple.cpp

@ -0,0 +1,51 @@
#include "LinearTests.h"
#include <Common.h>
// Keeps the borrowed OpenCL handles in the inherited base-class members so
// that dewIt() can use them later.
Simple::Simple(cl::Context* _context, cl::CommandQueue* _queue, cl::Program* _program)
{
    this->context = _context;
    this->queue = _queue;
    this->program = _program;
}
// Computes y = A * x + b on the device with the "simpleMV" kernel.
//
// n, m : number of rows and columns of A
// y    : output vector, n floats (its current contents are uploaded first)
// A    : input matrix, n * m floats
// x    : input vector, m floats
// b    : input vector, n floats
//
// All transfers are blocking. On any OpenCL error the error is printed by
// CheckCLError and the function returns early, leaving y unchanged.
void Simple::dewIt(int n, int m, float* y, const float* A, const float* x, const float* b)
{
    cl_int err = CL_SUCCESS;
    cl::Event _event;

    cl::Kernel kernel = cl::Kernel(*program, "simpleMV", &err);
    if (!CheckCLError(err)) return;

    const size_t matrixBytes = sizeof(float) * n * m;
    const size_t inputBytes = sizeof(float) * m;
    const size_t outputBytes = sizeof(float) * n;

    cl::Buffer ABuffer(*context, CL_MEM_READ_ONLY, matrixBytes, NULL, &err);
    if (!CheckCLError(err)) return;
    if (!CheckCLError(queue->enqueueWriteBuffer(ABuffer, true, 0, matrixBytes, A))) return;

    cl::Buffer XBuffer(*context, CL_MEM_READ_ONLY, inputBytes, NULL, &err); // this is the vector to print
    if (!CheckCLError(err)) return;
    if (!CheckCLError(queue->enqueueWriteBuffer(XBuffer, true, 0, inputBytes, x))) return;

    cl::Buffer YBuffer(*context, CL_MEM_WRITE_ONLY, outputBytes, NULL, &err);
    if (!CheckCLError(err)) return;
    if (!CheckCLError(queue->enqueueWriteBuffer(YBuffer, true, 0, outputBytes, y))) return;

    cl::Buffer BBuffer(*context, CL_MEM_READ_ONLY, outputBytes, NULL, &err);
    if (!CheckCLError(err)) return;
    if (!CheckCLError(queue->enqueueWriteBuffer(BBuffer, true, 0, outputBytes, b))) return;

    kernel.setArg(0, n);
    kernel.setArg(1, m);
    kernel.setArg(2, YBuffer);
    kernel.setArg(3, ABuffer);
    kernel.setArg(4, XBuffer);
    kernel.setArg(5, BBuffer);

    err = queue->enqueueNDRangeKernel(kernel,
                                      cl::NullRange,     // no global index offset
                                      cl::NDRange(n, 1), // one work-item per element
                                      cl::NullRange,     // work-group size chosen automatically; if the launch
                                                         // fails, try 1024 and decrease until it starts
                                      NULL,
                                      &_event);
    if (!CheckCLError(err)) return;
    _event.wait();

    // y holds floats: size the read-back with sizeof(float) (the original
    // used sizeof(int)).
    CheckCLError(queue->enqueueReadBuffer(YBuffer, true, 0, outputBytes, y));
}

12936
Linear/cl.hpp

File diff suppressed because it is too large

37
kernels/linear.cl

@ -11,7 +11,15 @@
// END IF
// Matrix-vector multiplication, one work-item per result row:
// y[i] = b[i] + sum_j A[i * m + j] * x[j]
//
// n, m : number of rows and columns of A (stored row by row)
// y    : output vector, n elements
// A    : input matrix, n * m elements
// x    : input vector, m elements
// b    : input vector, n elements
__kernel
void simpleMV(const int n, const int m, __global float* y, __global float* A, __global float* x, __global float* b){
    // Use the global index: the host launches n work-items and lets the
    // runtime pick the work-group size, so the local index restarts at 0 in
    // every work-group and would leave some rows uncomputed.
    int i = get_global_id(0);
    if (i < n) {
        float yi = b[i];
        for (int j = 0; j < m; j++) {
            yi += A[j + i * m] * x[j];
        }
        y[i] = yi;
    }
}
// TODO: Matrix-vector multiplication with parallelization of the dot product
@ -30,7 +38,8 @@ void simpleMV(const int n, const int m, __global float* y, __global float* A, __
//
// Unfinished kernel: it only reads the work-group index (i) and the
// work-item's index within the group (j) and writes nothing yet.
__kernel
void reduceMV(const int n, const int M, __global float* y, __global float* A, __global float* x, __global float* b, __local float* Q){
int i = get_group_id(0);
int j = get_local_id(0);
}
// TODO: General solution for matrix-vector multiplication, every thread processes a chunk of the dot product and visits multiple rows of the result
@ -51,8 +60,34 @@ void largeMV(const int n, const int m, __global float* y, __global float* A, __g
}
// TODO: Gaussian elimination as shown in the lecture
// for k := 1 .. n-1 do
//   for i := k + 1 .. n do
//     l := aik / akk
//     bi := bi - l * bk
//     for j := k .. n do
//       aij := aij - l * akj
//     end for
//   end for
// end for
// (execute the 2nd loop of the sequential implementation in parallel)
// Work-in-progress elimination kernel operating in place on A, which is
// indexed with a row stride of n. Each work-item uses its global index i as
// the row it updates.
// NOTE(review): the parameter m is not used, i is not range-checked, and
// there is no barrier between the k steps -- confirm whether that is intended.
__kernel void gaussian(const int n, const int m, __global float* A){
int i = get_global_id(0);
// lid is read but never used below.
int lid = get_local_id(0);
// Earlier attempt, kept for reference (disabled).
/*if (i < n) {
for (size_t k = 1; k < n - 1; k++) {
int l = A[k * i] / A[k * k];
for (size_t j = k; k < n; j++) {
A[i * j] = A[i * j] - l * A[k * j];
}
}
}*/
// For every pivot index k in [1, n-2]: compute the factor l and subtract
// l times row k from row i, for columns k .. n-1.
for (size_t k = 1; k < n - 1; k++) {
// NOTE(review): the factor is read from A[k][i]; the pseudo-code above uses
// a_ik (row i, column k) -- verify which is meant.
float l = A[k * n + i] / A[k * n + k];
for (size_t j = k; j < n; j++) {
A[i * n + j] = A[i * n + j] - l * A[k * n + j];
}
}
}

Loading…
Cancel
Save