// gpgpu/kernels/linear.cl


								// Simple matrix-vector multiplication: y = A * x + b.
								// Every work-item computes one complete dot product (one row of A).
								//
								//   n - number of rows of A; also the number of elements written to y and read from b
								//   m - number of columns of A; also the number of elements read from x
								//   y - output vector
								//   A - input matrix, addressed row-major as A[j + i * m]
								//   x - input vector
								//   b - vector added to the product
								//
								// Work-items whose global id is not below n do nothing, so the global
								// work size may be rounded up past n.
								__kernel
								void simpleMV(const int n, const int m, __global float* y, __global float* A, __global float* x, __global float* b){
									const int i = (int)get_global_id(0);
									if(i >= n){
										return;
									}

									// Offset of the first element of row i.
									const int rowStart = i * m;

									// Accumulate in a private variable so y is written only once.
									float yi = b[i];
									for(int j = 0; j < m; ++j){
										yi += A[rowStart + j] * x[j];
									}
									y[i] = yi;
								}


								// Matrix-vector multiplication y = A * x + b with a parallelized dot product.
								// One work-group computes one row: work-item j of group i produces the
								// single product A[i * M + j] * x[j], then the group sums the products
								// in local memory.
								//
								//   n - number of rows of A (groups with an id >= n do nothing)
								//   M - number of columns of A
								//   y - output vector
								//   A - input matrix, addressed row-major as A[i * M + j]
								//   x - input vector
								//   b - vector added to the product
								//   Q - local scratch buffer; indices 0 .. M-1 are used
								//
								// Requirement: the work-group size must be at least M, otherwise the
								// columns beyond the group size are never read. Work-items with a local
								// id >= M contribute nothing. M does not have to be a power of two.
								__kernel
								void reduceMV(const int n, const int M, __global float* y, __global float* A, __global float* x, __global float* b, __local float* Q){
									const int i = (int)get_group_id(0);
									const int j = (int)get_local_id(0);

									// This condition is the same for every work-item of a group, so the
									// whole group leaves together and no barrier is skipped by only a
									// part of the group.
									if(i >= n){
										return;
									}

									if(j < M){
										Q[j] = A[i * M + j] * x[j];
									}

									// Tree reduction over Q[0 .. len-1]. Each step folds the upper part
									// onto the lower part; "half" is rounded up so that an odd middle
									// element is simply carried over to the next step.
									for(int len = M; len > 1; len = (len + 1) / 2){
										const int half = (len + 1) / 2;

										// Make the writes of the previous step (or of the initial
										// products) visible before anyone reads them.
										barrier(CLK_LOCAL_MEM_FENCE);

										if(j < len - half){
											Q[j] += Q[j + half];
										}
									}

									// Q[0] was last written by work-item 0 itself, so no further
									// barrier is required before reading it here.
									if(j == 0 && M > 0){
										y[i] = Q[0] + b[i];
									}
								}


								// General matrix-vector multiplication y = A * x + b.
								// The work-group is treated as a T x Z grid of work-items:
								//
								//   t := get_local_id(0) / Z   selects which row of the current batch is handled
								//   z := get_local_id(0) % Z   selects which chunk of the dot product is handled
								//
								// Rows are processed in batches of T. For its row, work-item (t, z) sums
								// the products of columns z, z + Z, z + 2Z, ... into Q[t * Z + z]; the Z
								// partial sums of a row are then reduced into Q[t * Z + 0].
								//
								//   n - number of rows of A
								//   m - number of columns of A
								//   y - output vector
								//   A - input matrix, addressed row-major as A[i * m + j]
								//   x - input vector
								//   b - vector added to the product
								//   T - number of rows processed at the same time
								//   Z - number of work-items cooperating on one row
								//   Q - local scratch buffer; indices 0 .. T * Z - 1 are used
								//
								// Requirements: the kernel indexes rows by local id only, so it must be
								// launched as a single work-group, and that group must contain at least
								// T * Z work-items. Extra work-items (t >= T) only take part in the
								// barriers. Z does not have to be a power of two.
								__kernel
								void largeMV(const int n, const int m, __global float* y, __global float* A, __global float* x, __global float* b, const int T, const int Z, __local float* Q){
									// Uniform early-out; also protects the division and modulo below.
									if(T <= 0 || Z <= 0){
										return;
									}

									const int lid = (int)get_local_id(0);
									const int t = lid / Z;
									const int z = lid % Z;
									const int slot = t * Z + z;

									// The loop bound depends only on kernel arguments, so every work-item
									// executes the same number of iterations and reaches every barrier,
									// even when n is not a multiple of T.
									for(int base = 0; base < n; base += T){
										const int i = base + t;
										const bool active = (t < T) && (i < n);

										if(active){
											// Strided partial dot product for this work-item's chunk.
											float partial = 0.0f;
											for(int j = z; j < m; j += Z){
												partial += A[i * m + j] * x[j];
											}
											Q[slot] = partial;
										}

										// Tree reduction over the Z partial sums of each row. "half" is
										// rounded up so that an odd middle element is carried over.
										for(int len = Z; len > 1; len = (len + 1) / 2){
											const int half = (len + 1) / 2;

											barrier(CLK_LOCAL_MEM_FENCE);

											if(active && z < len - half){
												Q[slot] += Q[slot + half];
											}
										}

										// Q[t * Z + 0] was last written by this same work-item.
										if(active && z == 0){
											y[i] = Q[slot] + b[i];
										}

										// Do not let anyone overwrite its slot for the next batch while
										// a neighbour may still be reading it in the last reduction step.
										barrier(CLK_LOCAL_MEM_FENCE);
									}
								}


								// Gauss-Jordan elimination performed in place on A.
								// The second loop of the sequential implementation (the loop over the
								// rows that are updated for a given pivot) is executed in parallel:
								// every work-item owns one row.
								//
								//   n - number of rows of A
								//   m - number of columns of A (row stride); must be at least n
								//   A - matrix, addressed row-major as A[row * m + col]; overwritten
								//
								// For each pivot p the pivot row is subtracted, suitably scaled, from
								// every other row so that column p becomes zero outside the diagonal.
								// Finally each row is divided by its diagonal element.
								//
								// Requirements and limitations:
								//  - Rows are synchronized with a work-group barrier, so the kernel must
								//    be launched as a single work-group with at least n work-items.
								//  - No pivoting is done; a zero diagonal element leads to a division
								//    by zero.
								__kernel void gaussian(const int n, const int m, __global float* A){
									const int row = (int)get_local_id(0);
									const bool active = row < n;

									for(int p = 0; p < n; ++p){
										// The pivot row itself is left untouched in this step, so all
										// other rows may read it while they are being updated.
										if(active && row != p){
											// Read the factor before the row is modified.
											const float factor = A[row * m + p] / A[p * m + p];
											for(int col = 0; col < m; ++col){
												A[row * m + col] -= factor * A[p * m + col];
											}
										}

										// Executed by every work-item, including inactive ones: the row
										// that becomes the next pivot must be completely updated first.
										barrier(CLK_GLOBAL_MEM_FENCE);
									}

									// Normalize the own row; no other work-item touches it any more.
									if(active){
										const float diag = A[row * m + row];
										for(int col = 0; col < m; ++col){
											A[row * m + col] /= diag;
										}
									}
								}