| #include "stdlib.h" |
| |
| #include "util.h" |
| |
| #include "dataset.h" |
| void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] ) |
| { |
| |
| // ***************************** // |
| // **** ADD YOUR CODE HERE ***** // |
| // ***************************** // |
| // |
| // feel free to make a separate function for MI and MSI versions. |
| int i, j, k, ii, jj, bsize; |
| bsize = 16; |
| for ( jj = bsize*coreid; jj < lda; jj += bsize*ncores) { |
| for ( ii = 0; ii < lda; ii += bsize) { |
| for ( j = jj; j < lda && j < jj + bsize; j++) { |
| for ( i = ii; i < lda && i < ii + bsize; i += 8) { |
| data_t c1 = C[i + j*lda]; |
| data_t c2 = C[i + j*lda + 1]; |
| data_t c3 = C[i + j*lda + 2]; |
| data_t c4 = C[i + j*lda + 3]; |
| data_t c5 = C[i + j*lda + 4]; |
| data_t c6 = C[i + j*lda + 5]; |
| data_t c7 = C[i + j*lda + 6]; |
| data_t c8 = C[i + j*lda + 7]; |
| for ( k = 0; k < lda; k+=4 ) { |
| for (int x = 0; x < 4; x++) { |
| data_t a = A[j*lda + k+x]; |
| data_t b1 = B[(k+x)*lda + i]; |
| data_t b2 = B[(k+x)*lda + i + 1]; |
| data_t b3 = B[(k+x)*lda + i + 2]; |
| data_t b4 = B[(k+x)*lda + i + 3]; |
| data_t b5 = B[(k+x)*lda + i + 4]; |
| data_t b6 = B[(k+x)*lda + i + 5]; |
| data_t b7 = B[(k+x)*lda + i + 6]; |
| data_t b8 = B[(k+x)*lda + i + 7]; |
| c1 += a * b1; |
| c2 += a * b2; |
| c3 += a * b3; |
| c4 += a * b4; |
| c5 += a * b5; |
| c6 += a * b6; |
| c7 += a * b7; |
| c8 += a * b8; |
| } |
| } |
| C[i + j*lda] = c1; |
| C[i + j*lda + 1] = c2; |
| C[i + j*lda + 2] = c3; |
| C[i + j*lda + 3] = c4; |
| C[i + j*lda + 4] = c5; |
| C[i + j*lda + 5] = c6; |
| C[i + j*lda + 6] = c7; |
| C[i + j*lda + 7] = c8; |
| } |
| } |
| } |
| } |
| |
| } |