// TODO: Simple matrix-vector multiplication, every thread computes a complete dot product // // i := get_global_id(0) // // IF ID < n THEN: // yi := b[i] // LOOP j := 0 .. m DO: // yi += A[j + i * m] * x[j] // END LOOP // y[i] := yi // END IF __kernel void simpleMV(const int n, const int m, __global float* y, __global float* A, __global float* x, __global float* b){ int i = get_global_id(0); if (i < n) { float yi = b[i]; for (int j = 0; j < m; j++) { yi += A[j + i * m] * x[j]; } y[i] = yi; } } // TODO: Matrix-vector multiplication with parallelization of the dot product // Assumptions: M = 2^k, M <= maximum workgroup size // // i = get_group_id(0) // j = get_local_id(0) // // Q[j] := A[i * M + j] * x[j] // BARRIER // // Sum scan on Q (reduction) // // IF j = 0 THEN: // y[i] = Q[0] + b[i] // __kernel void reduceMV(const int n, const int M, __global float* y, __global float* A, __global float* x, __global float* b, __local float* Q){ int i = get_group_id(0); // Matrix sora, workgroup ID int j = get_local_id(0); // Oszlop a matrixban, munkacsoporton beluli ID // MAP // Q - matrix i soranak és a vektornak elemenkenti szorzata Q[j] = A[i * M + j] * x[j]; barrier(CLK_LOCAL_MEM_FENCE); // REDUCE // Lokalis memoria Q vektort Q[0]-ba redukalja összeadva for (size_t s = get_local_size(0) / 2; s > 0; s >>= 1) { if (j < s) { Q[j] = Q[j] + Q[j + s]; } barrier(CLK_LOCAL_MEM_FENCE); } if (i == 0) { y[j] = Q[0] + b[j]; } } // TODO: General solution for matrix-vector multiplication, every thread processes a chunk of the dot product and visits multiple rows of the result // // t := get_local_id(0) / Z // z := get_local_id(0) % Z // // FOR i := t ; i < n ; i := i + T : // Compute Q[t * Z + z] as shown in the lecture // Sum scan on Q (reduction) // IF z = 0 THEN: // y[i] = Q[t * Z + 0] + b[i] // // END FOR __kernel void largeMV(const int n, const int m, __global float* y, __global float* A, __global float* x, __global float* b, const int T, const int Z, __local float* Q){ int t = get_local_id(0) / T; // sor 1 int z = get_local_id(0) % Z; // oszlop 0 for (size_t i = 0; i < m; i += Z) { // sor kezdete ahol a modul van if (z == 0) { y[i + t] = 0; } for (size_t j = 0; j < n; j += T) { // oszlop kezdete ahol a modul van Q[t * Z + z] = A[(j + z) + (i + t) * m] * x[i + t]; barrier(CLK_LOCAL_MEM_FENCE); for (size_t s = Z / 2; s > 0; s >>= 1) { if (z < s) { Q[t * Z + z] = Q[t * Z + z] + Q[t * Z + z + s]; } barrier(CLK_LOCAL_MEM_FENCE); } if (z == 0) { y[i + t] += Q[t * Z + 0]; } } if (z == 0) { y[i + t] += b[i + t]; } } } // TODO: Gaussian elimination as shown in the lecture // for k := 1 .. n-1 do // for i : = k + 1 ..n do // l : = aik / akk // bi : = bi – l * bk // for j : = k ..n do // aij : = aij – l * akj // end for // end for // end for // (execute the 2nd loop of the sequential implemential in parallel) __kernel void gaussian(const int n, const int m, __global float* A){ int j = get_global_id(0); for (size_t k = 0; k < m; k++) { for (size_t i = 0; i < m; i++) { if (k != i) { float ratio = A[i * n + k] / A[k * n + k]; barrier(CLK_LOCAL_MEM_FENCE); A[i * n + j] = A[i * n + j] - ratio * A[k * n + j]; } } } barrier(CLK_LOCAL_MEM_FENCE); if (j < m) { float ref = A[j * n + j]; A[j] = ref; for (size_t i = 0; i < n; i++) { A[j * n + i] = A[j * n + i] / ref; } } }