| #include "stdlib.h" |
| |
| #include "util.h" |
| |
| #include "dataset.h" |
| void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] ) |
| { |
| int i, k; |
| int j = coreid*(lda/ncores); |
| int jend = (coreid+1)*(lda/ncores); |
| for ( ; j < jend; j++ ) |
| { |
| int j32 = j << 5; |
| data_t* Cj32 = C + j32; |
| for ( k = 0; k < 32; k+=2 ) |
| { |
| data_t Aj32k = A[k + j32]; |
| data_t Aj32k2 = A[k + 1 + j32]; |
| data_t* Bk32 = B + (k << 5); |
| data_t* Bk322 = Bk32 + 32; |
| for ( i = 0; i < 32; i+=4 ) |
| { |
| Cj32[i] += Aj32k * Bk32 [i]; |
| Cj32[i] += Aj32k2 * Bk322 [i]; |
| Cj32[i+1] += Aj32k * Bk32 [i+1]; |
| Cj32[i+1] += Aj32k2 * Bk322[i+1]; |
| Cj32[i+2] += Aj32k * Bk32 [i+2]; |
| Cj32[i+2] += Aj32k2 * Bk322[i+2]; |
| Cj32[i+3] += Aj32k * Bk32 [i+3]; |
| Cj32[i+3] += Aj32k2 * Bk322[i+3]; |
| } |
| barrier(ncores); |
| } |
| } |
| |
| |
| } |