You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
139 lines
3.5 KiB
139 lines
3.5 KiB
// TODO: Simple matrix-vector multiplication, every thread computes a complete dot product
|
|
//
|
|
// i := get_global_id(0)
|
|
//
|
|
// IF ID < n THEN:
|
|
// yi := b[i]
|
|
// LOOP j := 0 .. m DO:
|
|
// yi += A[j + i * m] * x[j]
|
|
// END LOOP
|
|
// y[i] := yi
|
|
// END IF
|
|
__kernel
|
|
void simpleMV(const int n, const int m, __global float* y, __global float* A, __global float* x, __global float* b){
|
|
int i = get_global_id(0);
|
|
|
|
if (i < n) {
|
|
float yi = b[i];
|
|
for (int j = 0; j < m; j++) {
|
|
yi += A[j + i * m] * x[j];
|
|
}
|
|
y[i] = yi;
|
|
}
|
|
}
|
|
|
|
// TODO: Matrix-vector multiplication with parallelization of the dot product
|
|
// Assumptions: M = 2^k, M <= maximum workgroup size
|
|
//
|
|
// i = get_group_id(0)
|
|
// j = get_local_id(0)
|
|
//
|
|
// Q[j] := A[i * M + j] * x[j]
|
|
// BARRIER
|
|
//
|
|
// Sum scan on Q (reduction)
|
|
//
|
|
// IF j = 0 THEN:
|
|
// y[i] = Q[0] + b[i]
|
|
//
|
|
__kernel
|
|
void reduceMV(const int n, const int M, __global float* y, __global float* A, __global float* x, __global float* b, __local float* Q){
|
|
int i = get_group_id(0); // Matrix sora, workgroup ID
|
|
int j = get_local_id(0); // Oszlop a matrixban, munkacsoporton beluli ID
|
|
|
|
// MAP
|
|
// Q - matrix i soranak és a vektornak elemenkenti szorzata
|
|
Q[j] = A[i * M + j] * x[j];
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
// REDUCE
|
|
// Lokalis memoria Q vektort Q[0]-ba redukalja összeadva
|
|
for (size_t s = get_local_size(0) / 2; s > 0; s >>= 1) {
|
|
if (j < s) {
|
|
Q[j] = Q[j] + Q[j + s];
|
|
}
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
}
|
|
|
|
if (i == 0) {
|
|
y[j] = Q[0] + b[j];
|
|
}
|
|
}
|
|
|
|
// TODO: General solution for matrix-vector multiplication, every thread processes a chunk of the dot product and visits multiple rows of the result
|
|
//
|
|
// t := get_local_id(0) / Z
|
|
// z := get_local_id(0) % Z
|
|
//
|
|
// FOR i := t ; i < n ; i := i + T :
|
|
// Compute Q[t * Z + z] as shown in the lecture
|
|
// Sum scan on Q (reduction)
|
|
// IF z = 0 THEN:
|
|
// y[i] = Q[t * Z + 0] + b[i]
|
|
//
|
|
// END FOR
|
|
__kernel
|
|
void largeMV(const int n, const int m, __global float* y, __global float* A, __global float* x, __global float* b, const int T, const int Z, __local float* Q){
|
|
int id = get_local_id(0);
|
|
// WG size: T*Z 4
|
|
// ID 2
|
|
int t = get_local_id(0) / T; // sor 1
|
|
int z = get_local_id(0) % Z; // oszlop 0
|
|
|
|
for (size_t i = 0; i < m; i += Z) { // sor kezdete ahol a modul van
|
|
if (z == 0) {
|
|
y[i + t] = 0;
|
|
}
|
|
for (size_t j = 0; j < n; j += T) { // oszlop kezdete ahol a modul van
|
|
Q[t * Z + z] = A[(j + z) + (i + t) * m] * x[i + t];
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
for (size_t s = Z / 2; s > 0; s >>= 1) {
|
|
if (z < s) {
|
|
Q[t * Z + z] = Q[t * Z + z] + Q[t * Z + z + s];
|
|
}
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
}
|
|
if (z == 0) {
|
|
y[i + t] += Q[t * Z + 0];
|
|
}
|
|
}
|
|
if (z == 0) {
|
|
y[i + t] += b[i + t];
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
// TODO: Gaussian elimination as shown in the lecture
|
|
// for k := 1 .. n-1 do
|
|
// for i : = k + 1 ..n do
|
|
// l : = aik / akk
|
|
// bi : = bi – l * bk
|
|
// for j : = k ..n do
|
|
// aij : = aij – l * akj
|
|
// end for
|
|
// end for
|
|
// end for
|
|
// (execute the 2nd loop of the sequential implemential in parallel)
|
|
__kernel void gaussian(const int n, const int m, __global float* A){
|
|
int i = get_global_id(0);
|
|
int lid = get_local_id(0);
|
|
|
|
/*if (i < n) {
|
|
for (size_t k = 1; k < n - 1; k++) {
|
|
int l = A[k * i] / A[k * k];
|
|
for (size_t j = k; k < n; j++) {
|
|
A[i * j] = A[i * j] - l * A[k * j];
|
|
}
|
|
}
|
|
}*/
|
|
for (size_t k = 1; k < n - 1; k++) {
|
|
float l = A[k * n + i] / A[k * n + k];
|
|
for (size_t j = k; j < n; j++) {
|
|
A[i * n + j] = A[i * n + j] - l * A[k * n + j];
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
|