Implement the Large/Reduce constructors, the reduceMV kernel and its host driver;
fix simpleMV indexing and the Simple read-back size.

Review notes on this revision of the patch:
  * reduceMV: work-item 0 of each work-group now writes y[row]; previously every
    work-item of group 0 wrote row 0's sum into y[column].
  * reduceMV: the reduction halves with rounding up, so it is correct for any
    work-group size, not only powers of two.
  * Reduce::dewIt: launches n work-groups of m work-items (one group per row,
    one work-item per column), which is the layout the kernel indexes by, and
    requests m floats of local memory instead of n * m.
  * Reduce::dewIt: the enqueueNDRangeKernel result is checked; y is no longer
    uploaded because the kernel overwrites every element.
  * NOTE(review): the header name on the added `#include` line in Reduce.cpp is
    missing in this copy of the patch - restore it (or drop the line) before
    applying.
  * NOTE(review): line breaks and indentation were reconstructed from the hunk
    counts; the post-image hashes on the `index` lines of Reduce.cpp and
    linear.cl are stale. Apply with whitespace tolerance.

diff --git a/Linear/Large.cpp b/Linear/Large.cpp
index 56b8fa2..fe5ce48 100644
--- a/Linear/Large.cpp
+++ b/Linear/Large.cpp
@@ -1,8 +1,10 @@
 #include "LinearTests.h"
 
-Large::Large(cl::Context* context, cl::CommandQueue* queue, cl::Program* program)
+Large::Large(cl::Context* _context, cl::CommandQueue* _queue, cl::Program* _program)
 {
-    //TODO: Implement
+    context = _context;
+    queue = _queue;
+    program = _program;
 }
 
 void Large::dewIt(int n, int m, float* y, const float* A, const float* x, const float* b)
diff --git a/Linear/Linear.cpp b/Linear/Linear.cpp
index 3e16a0d..b63e4e0 100644
--- a/Linear/Linear.cpp
+++ b/Linear/Linear.cpp
@@ -60,7 +60,8 @@ int main()
     capi();
     //cppapi();
     OpenCLHandler handler("../kernels/linear.cl");
-    Jacobi j(MVType::SimpleMV);
+    Jacobi j(MVType::ReduceMV);
+    //Jacobi j(MVType::SimpleMV);
     handler.run_test(&j);
     return 0;
 }
diff --git a/Linear/Reduce.cpp b/Linear/Reduce.cpp
index 248428f..3d9a014 100644
--- a/Linear/Reduce.cpp
+++ b/Linear/Reduce.cpp
@@ -1,11 +1,56 @@
 #include "LinearTests.h"
+#include
 
-Reduce::Reduce(cl::Context* context, cl::CommandQueue* queue, cl::Program* program)
+Reduce::Reduce(cl::Context* _context, cl::CommandQueue* _queue, cl::Program* _program)
 {
-    //TODO: Implement
+    context = _context;
+    queue = _queue;
+    program = _program;
 }
 
 void Reduce::dewIt(int n, int m, float* y, const float* A, const float* x, const float* b)
 {
-    //TODO: Implement
-}
+    cl_int err = CL_SUCCESS;
+    cl::Event _event;
+
+    cl::Kernel kernel = cl::Kernel(*program, "reduceMV", &err);
+    CheckCLError(err);
+
+    cl::Buffer ABuffer(*context, CL_MEM_READ_ONLY, sizeof(float) * n * m, NULL, &err);
+    CheckCLError(err);
+    queue->enqueueWriteBuffer(ABuffer, true, 0, sizeof(float) * n * m, A);
+
+    cl::Buffer XBuffer(*context, CL_MEM_READ_ONLY, sizeof(float) * m, NULL, &err);
+    CheckCLError(err);
+    queue->enqueueWriteBuffer(XBuffer, true, 0, sizeof(float) * m, x);
+
+    // y is output-only: the kernel overwrites all n elements, so nothing is uploaded
+    cl::Buffer YBuffer(*context, CL_MEM_WRITE_ONLY, sizeof(float) * n, NULL, &err);
+    CheckCLError(err);
+
+    cl::Buffer BBuffer(*context, CL_MEM_READ_ONLY, sizeof(float) * n, NULL, &err);
+    CheckCLError(err);
+    queue->enqueueWriteBuffer(BBuffer, true, 0, sizeof(float) * n, b);
+
+    // Arguments in the order of the reduceMV kernel signature
+    kernel.setArg(0, n);                           // const int n
+    kernel.setArg(1, m);                           // const int M
+    kernel.setArg(2, YBuffer);                     // __global float* y
+    kernel.setArg(3, ABuffer);                     // __global float* A
+    kernel.setArg(4, XBuffer);                     // __global float* x
+    kernel.setArg(5, BBuffer);                     // __global float* b
+    kernel.setArg(6, sizeof(float) * m, NULL);     // __local float* Q: one float per work-item of a group
+
+    // The kernel uses the group id as the matrix row and the local id as the column,
+    // so launch n work-groups of m work-items each (m must fit the device's work-group limit)
+    err = queue->enqueueNDRangeKernel(kernel,
+        cl::NullRange,                              // no global offset
+        cl::NDRange(static_cast<size_t>(n) * m),    // one work-item per matrix element
+        cl::NDRange(m),                             // one work-group per matrix row
+        NULL,                                       // no events to wait for
+        &_event);
+    CheckCLError(err);
+    _event.wait();
+
+    queue->enqueueReadBuffer(YBuffer, true, 0, sizeof(float) * n, y);
+}
\ No newline at end of file
diff --git a/Linear/Simple.cpp b/Linear/Simple.cpp
index 7c3eeaf..899d139 100644
--- a/Linear/Simple.cpp
+++ b/Linear/Simple.cpp
@@ -20,7 +20,7 @@ void Simple::dewIt(int n, int m, float* y, const float* A, const float* x, const
     CheckCLError(err);
     queue->enqueueWriteBuffer(ABuffer, true, 0, sizeof(float) * n *m, A);
 
-    cl::Buffer XBuffer(*context, CL_MEM_READ_ONLY, sizeof(float) * m, NULL, &err); // Ot kell kiirogatni
+    cl::Buffer XBuffer(*context, CL_MEM_READ_ONLY, sizeof(float) * m, NULL, &err);
     CheckCLError(err);
     queue->enqueueWriteBuffer(XBuffer, true, 0, sizeof(float) * m, x);
 
@@ -47,5 +47,5 @@ void Simple::dewIt(int n, int m, float* y, const float* A, const float* x, const
         &_event);
     _event.wait();
 
-    queue->enqueueReadBuffer(YBuffer, true, 0, sizeof(int) * n, y);
+    queue->enqueueReadBuffer(YBuffer, true, 0, sizeof(float) * n, y);
 }
diff --git a/kernels/linear.cl b/kernels/linear.cl
index c3cf73b..203786b 100644
--- a/kernels/linear.cl
+++ b/kernels/linear.cl
@@ -11,7 +11,7 @@
 // END IF
 __kernel
 void simpleMV(const int n, const int m, __global float* y, __global float* A, __global float* x, __global float* b){
-    int i = get_local_id(0);
+    int i = get_global_id(0);
     if (i < n)
     {
         float yi = b[i];
@@ -38,8 +38,29 @@ void simpleMV(const int n, const int m, __global float* y, __global float* A, __
 //
 __kernel
 void reduceMV(const int n, const int M, __global float* y, __global float* A, __global float* x, __global float* b, __local float* Q){
-    int i = get_group_id(0);
-    int j = get_local_id(0);
+    int i = get_group_id(0); // matrix row handled by this work-group
+    int j = get_local_id(0); // matrix column handled by this work-item
+
+    // MAP
+    // Q[j] = element-wise product of row i of A and the vector x
+    Q[j] = A[i * M + j] * x[j];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // REDUCE
+    // Sum Q[0..len) into Q[0]; the half is rounded up so an odd length keeps its middle element
+    for (size_t len = get_local_size(0); len > 1; ) {
+        size_t half = (len + 1) / 2;
+        if (j + half < len) {
+            Q[j] = Q[j] + Q[j + half];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        len = half;
+    }
+
+    // Exactly one work-item per group publishes the finished row: y[i] = (A x)[i] + b[i]
+    if (j == 0) {
+        y[i] = Q[0] + b[i];
+    }
 }
 
 // TODO: General solution for matrix-vector multiplication, every thread processes a chunk of the dot product and visits multiple rows of the result