| #include "stdlib.h" |
| |
| #include "util.h" |
| |
| #include "dataset.h" |
| void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] ) |
| { |
| int i, j, k; |
| |
| for ( i = 0; i < lda; i+=2 ) |
| { |
| for (k = 0; k < lda; k+=4) |
| { |
| int d0 = B[k*lda + i]; |
| int c0 = B[k*lda + i + 1]; |
| int d1 = B[(k+1)*lda + i]; |
| int c1 = B[(k+1)*lda + i + 1]; |
| int d2 = B[(k+2)*lda + i]; |
| int c2 = B[(k+2)*lda + i + 1]; |
| int d3 = B[(k+3)*lda + i]; |
| int c3 = B[(k+3)*lda + i + 1]; |
| |
| for ( j = coreid*(lda/ncores); j < (coreid+1)*(lda/ncores); j+=4) |
| { |
| |
| int sum = A[j*lda + k] * d0; |
| sum += A[j*lda + k + 1] * d1; |
| sum += A[j*lda + k + 2] * d2; |
| sum += A[j*lda + k + 3] * d3; |
| C[j*lda +i] += sum; |
| |
| sum = A[j*lda + k] * c0; |
| sum += A[j*lda + k + 1] * c1; |
| sum += A[j*lda + k + 2] * c2; |
| sum += A[j*lda + k + 3] * c3; |
| C[j*lda + i + 1] += sum; |
| |
| sum = A[(j+1)*lda + k] * d0; |
| sum += A[(j+1)*lda + k + 1] * d1; |
| sum += A[(j+1)*lda + k + 2] * d2; |
| sum += A[(j+1)*lda + k + 3] * d3; |
| C[(j+1)*lda +i] += sum; |
| |
| sum = A[(j+1)*lda + k] * c0; |
| sum += A[(j+1)*lda + k + 1] * c1; |
| sum += A[(j+1)*lda + k + 2] * c2; |
| sum += A[(j+1)*lda + k + 3] * c3; |
| C[(j+1)*lda + i + 1] += sum; |
| |
| sum = A[(j+2)*lda + k] * d0; |
| sum += A[(j+2)*lda + k + 1] * d1; |
| sum += A[(j+2)*lda + k + 2] * d2; |
| sum += A[(j+2)*lda + k + 3] * d3; |
| C[(j+2)*lda +i] += sum; |
| |
| sum = A[(j+2)*lda + k] * c0; |
| sum += A[(j+2)*lda + k + 1] * c1; |
| sum += A[(j+2)*lda + k + 2] * c2; |
| sum += A[(j+2)*lda + k + 3] * c3; |
| C[(j+2)*lda + i + 1] += sum; |
| |
| sum = A[(j+3)*lda + k] * d0; |
| sum += A[(j+3)*lda + k + 1] * d1; |
| sum += A[(j+3)*lda + k + 2] * d2; |
| sum += A[(j+3)*lda + k + 3] * d3; |
| C[(j+3)*lda +i] += sum; |
| |
| sum = A[(j+3)*lda + k] * c0; |
| sum += A[(j+3)*lda + k + 1] * c1; |
| sum += A[(j+3)*lda + k + 2] * c2; |
| sum += A[(j+3)*lda + k + 3] * c3; |
| C[(j+3)*lda + i + 1] += sum; |
| |
| } |
| barrier(ncores); |
| } |
| } |
| } |