From 9dd98fb4b05de5db91528079ef7b6ec1038f6468 Mon Sep 17 00:00:00 2001 From: gyulaid Date: Tue, 15 Mar 2022 23:24:21 +0100 Subject: [PATCH] Large almost working --- Linear/Jacobi.cpp | 2 ++ Linear/Large.cpp | 43 ++++++++++++++++++++++++++++++++++++++++++- Linear/Linear.cpp | 3 ++- kernels/linear.cl | 36 ++++++++++++++++++++++++++++++++++++ 4 files changed, 82 insertions(+), 2 deletions(-) diff --git a/Linear/Jacobi.cpp b/Linear/Jacobi.cpp index 8fe0533..7864d2e 100644 --- a/Linear/Jacobi.cpp +++ b/Linear/Jacobi.cpp @@ -88,6 +88,8 @@ void Jacobi::gpu_compute(cl::Context* context, cl::CommandQueue* queue, cl::Prog { MatrixVectorMultiplier* MVMultiplier = MethodFactory(type, context, queue, program); + + if (MVMultiplier != NULL) { int inputBuffer = 0; const int iterations = 20; diff --git a/Linear/Large.cpp b/Linear/Large.cpp index fe5ce48..b9ad746 100644 --- a/Linear/Large.cpp +++ b/Linear/Large.cpp @@ -1,4 +1,5 @@ #include "LinearTests.h" +#include Large::Large(cl::Context* _context, cl::CommandQueue* _queue, cl::Program* _program) { @@ -9,5 +10,45 @@ Large::Large(cl::Context* _context, cl::CommandQueue* _queue, cl::Program* _prog void Large::dewIt(int n, int m, float* y, const float* A, const float* x, const float* b) { - //TODO: Implement + cl_int err = CL_SUCCESS; + cl::Event _event; + + cl::Kernel kernel = cl::Kernel(*program, "largeMV", &err); + CheckCLError(err); + + cl::Buffer ABuffer(*context, CL_MEM_READ_ONLY, sizeof(float) * n * m, NULL, &err); + CheckCLError(err); + queue->enqueueWriteBuffer(ABuffer, true, 0, sizeof(float) * n * m, A); + + cl::Buffer XBuffer(*context, CL_MEM_READ_ONLY, sizeof(float) * m, NULL, &err); + CheckCLError(err); + queue->enqueueWriteBuffer(XBuffer, true, 0, sizeof(float) * m, x); + + cl::Buffer YBuffer(*context, CL_MEM_READ_WRITE, sizeof(float) * n, NULL, &err); /* read-write: largeMV accumulates into y with += */ + CheckCLError(err); + queue->enqueueWriteBuffer(YBuffer, true, 0, sizeof(float) * n, y); + + cl::Buffer BBuffer(*context, CL_MEM_READ_ONLY, sizeof(float) * n, NULL, 
&err); + CheckCLError(err); + queue->enqueueWriteBuffer(BBuffer, true, 0, sizeof(float) * n, b); + + //void largeMV(const int n, const int m, __global float* y, __global float* A, + // __global float* x, __global float* b, const int T, const int Z, __local float* Q) + int T = 2; // the output is processed in chunks of length T + int Z = 2; // the input is processed in chunks of length Z + + kernel.setArg(0, n); + kernel.setArg(1, m); + kernel.setArg(2, YBuffer); + kernel.setArg(3, ABuffer); + kernel.setArg(4, XBuffer); + kernel.setArg(5, BBuffer); + kernel.setArg(6, T); + kernel.setArg(7, Z); + kernel.setArg(8, sizeof(float) * T * Z, NULL); + + queue->enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(Z*T, 1), cl::NDRange(T * Z, 1), NULL, &_event); + _event.wait(); + + queue->enqueueReadBuffer(YBuffer, true, 0, sizeof(float) * n, y); } diff --git a/Linear/Linear.cpp b/Linear/Linear.cpp index b63e4e0..71515a5 100644 --- a/Linear/Linear.cpp +++ b/Linear/Linear.cpp @@ -60,8 +60,9 @@ int main() capi(); //cppapi(); OpenCLHandler handler("../kernels/linear.cl"); - Jacobi j(MVType::ReduceMV); + //Jacobi j(MVType::ReduceMV); //Jacobi j(MVType::SimpleMV); + Jacobi j(MVType::LargeMV); handler.run_test(&j); return 0; } diff --git a/kernels/linear.cl b/kernels/linear.cl index 203786b..410b610 100644 --- a/kernels/linear.cl +++ b/kernels/linear.cl @@ -74,7 +74,43 @@ void reduceMV(const int n, const int M, __global float* y, __global float* A, __ // END FOR __kernel void largeMV(const int n, const int m, __global float* y, __global float* A, __global float* x, __global float* b, const int T, const int Z, __local float* Q){ + int id = get_local_id(0); // NOTE: no bounds checks below - assumes n % T == 0, m % Z == 0 and Z a power of two + // Example: WG size T*Z = 4 + // Example: local ID = 2 + int t = get_local_id(0) / Z; // row within the block (1 for the example ID) + int z = get_local_id(0) % Z; // column within the block (0 for the example ID) + for (size_t i = 0; i < n; i += T) { // first row of the current block + if (z == 0) { + y[i + t] = 0; + } + Q[id] = 0; + for (size_t j = 0; j < m; j += Z) { // first column of the current block + + //Q[t * Z + z] += A[(j + z) + (i + t) * m] * x[i + t]; + 
Q[t * Z + z] = A[(j + z) + (i + t) * m] * x[j + z]; /* A[row][col] * x[col] */ + barrier(CLK_LOCAL_MEM_FENCE); + + for (size_t s = Z / 2; s > 0; s >>= 1) { + if (z < s) { /* reduce along the column index inside every row */ + Q[t * Z + z] = Q[t * Z + z] + Q[t * Z + z + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + //if (z == 0) { + // for (size_t zz = 1; zz < Z; zz++) { + // Q[t * Z] += Q[t * Z + zz]; + // } + //} + if (z == 0) { + y[i + t] += Q[t * Z + 0]; + } + } + if (z == 0) { + y[i + t] += b[i + t]; + } + + } } // TODO: Gaussian elimination as shown in the lecture