// Matrix-vector kernels computing y = A * x + b.
// A is addressed as A[j + i * m] (row i, column j), i.e. row-major storage
// with m entries per row.

// Simple matrix-vector multiplication: every work-item computes one complete
// dot product.
//
//   n  number of rows of A (entries of y and b)
//   m  number of columns of A (entries of x)
//   y  output vector, n entries
//   A  input matrix, n * m entries
//   x  input vector, m entries
//   b  input vector added to the product, n entries
//
// Work-items whose global id is >= n do nothing, so the global size may be
// rounded up past n.
__kernel void simpleMV(const int n, const int m, __global float* y, __global float* A, __global float* x, __global float* b)
{
    const int i = (int)get_global_id(0);

    if (i < n) {
        float yi = b[i];
        for (int j = 0; j < m; ++j) {
            yi += A[j + i * m] * x[j];
        }
        y[i] = yi;
    }
}

// Matrix-vector multiplication with a parallel dot product: work-group i
// computes row i, work-item j of that group contributes column j.
//
//   n  number of rows of A
//   M  number of columns of A
//   Q  local scratch buffer; indexed 0 .. M-1 here
//
// Assumptions carried over from the original task description: M = 2^k and
// M <= maximum work-group size. The reduction below also copes with an M that
// is not a power of two.
// NOTE(review): assumes the host launches one work-group per row with a local
// size of M -- confirm against the host code.
__kernel void reduceMV(const int n, const int M, __global float* y, __global float* A, __global float* x, __global float* b, __local float* Q)
{
    const int i = (int)get_group_id(0);
    const int j = (int)get_local_id(0);

    // i is the group id, so this exit is taken by the whole work-group or by
    // none of it; no work-item is left waiting at a barrier below.
    if (i >= n) {
        return;
    }

    if (j < M) {
        Q[j] = A[i * M + j] * x[j];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction: each step folds the upper part of the active range
    // [0, len) onto the lower part. The barrier stays outside the `if` so
    // every work-item of the group reaches it.
    for (int len = M; len > 1;) {
        const int half = (len + 1) / 2;
        if (j + half < len) {
            Q[j] += Q[j + half];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        len = half;
    }

    if (j == 0) {
        y[i] = Q[0] + b[i];
    }
}

// General matrix-vector multiplication: the work-group is viewed as T teams of
// Z lanes. Team t handles rows t, t + T, t + 2T, ...; within a team, lane z
// accumulates a partial dot product which is then summed in local memory.
//
//   T  number of rows processed at the same time
//   Z  number of work-items cooperating on one row
//   Q  local scratch buffer; indexed 0 .. T*Z-1 here
//
// NOTE(review): the chunking "as shown in the lecture" is not visible in this
// file; lane z is assumed to take columns z, z + Z, z + 2Z, ... -- confirm.
// NOTE(review): assumes a single work-group with local size T * Z and Z >= 1
// -- confirm against the host code.
__kernel void largeMV(const int n, const int m, __global float* y, __global float* A, __global float* x, __global float* b, const int T, const int Z, __local float* Q)
{
    const int lid = (int)get_local_id(0);
    const int t = lid / Z;
    const int z = lid % Z;
    const int slot = t * Z + z;

    // The loop runs over the first row of each batch rather than over
    // i = t, t + T, ... directly: this way every work-item executes the same
    // number of iterations and therefore the same number of barriers, even
    // when n is not a multiple of T.
    for (int row0 = 0; row0 < n; row0 += T) {
        const int i = row0 + t;
        const int active = i < n;

        float partial = 0.0f;
        if (active) {
            for (int j = z; j < m; j += Z) {
                partial += A[j + i * m] * x[j];
            }
        }
        Q[slot] = partial;
        barrier(CLK_LOCAL_MEM_FENCE);

        // Sum the Z partial results of this team; works for any Z >= 1.
        for (int len = Z; len > 1;) {
            const int half = (len + 1) / 2;
            if (z + half < len) {
                Q[slot] += Q[slot + half];
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            len = half;
        }

        if (active && z == 0) {
            y[i] = Q[t * Z] + b[i];
        }
    }
}

// TODO: Gaussian elimination as shown in the lecture
// (execute the 2nd loop of the sequential implementation in parallel)
// NOTE(review): left unimplemented -- the storage layout of A and the roles of
// n and m cannot be determined from this file.
__kernel void gaussian(const int n, const int m, __global float* A)
{
}