| /* |
| Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. |
| |
| Permission is hereby granted, free of charge, to any person obtaining a copy |
| of this software and associated documentation files (the "Software"), to deal |
| in the Software without restriction, including without limitation the rights |
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| copies of the Software, and to permit persons to whom the Software is |
| furnished to do so, subject to the following conditions: |
| |
| The above copyright notice and this permission notice shall be included in |
| all copies or substantial portions of the Software. |
| |
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| THE SOFTWARE. |
| */ |
| #include<iostream> |
| |
| // hip header file |
| #include "hip/hip_runtime.h" |
| |
| #define WIDTH 16 |
| |
| #define NUM (WIDTH*WIDTH) |
| |
| #define THREADS_PER_BLOCK_X 4 |
| #define THREADS_PER_BLOCK_Y 4 |
| #define THREADS_PER_BLOCK_Z 1 |
| |
| // Device (Kernel) function, it must be void |
| // hipLaunchParm provides the execution configuration |
| __global__ void matrixTranspose(float *out, |
| float *in, |
| const int width) |
| { |
| // declare dynamic shared memory |
| HIP_DYNAMIC_SHARED(float, sharedMem); |
| |
| int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; |
| int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; |
| |
| sharedMem[y * width + x] = in[x * width + y]; |
| |
| __syncthreads(); |
| |
| out[y * width + x] = sharedMem[y * width + x]; |
| } |
| |
| // CPU implementation of matrix transpose |
| void matrixTransposeCPUReference( |
| float * output, |
| float * input, |
| const unsigned int width) |
| { |
| for(unsigned int j=0; j < width; j++) |
| { |
| for(unsigned int i=0; i < width; i++) |
| { |
| output[i*width + j] = input[j*width + i]; |
| } |
| } |
| } |
| |
| int main() { |
| |
| float* Matrix; |
| float* cpuTransposeMatrix; |
| |
| float* gpuTransposeMatrix; |
| |
| hipDeviceProp_t devProp; |
| hipGetDeviceProperties(&devProp, 0); |
| |
| std::cout << "Device name " << devProp.name << std::endl; |
| |
| int i; |
| int errors; |
| |
| hipHostMalloc(&Matrix, NUM * sizeof(float)); |
| cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); |
| |
| // initialize the input data |
| for (i = 0; i < NUM; i++) { |
| Matrix[i] = (float)i*10.0f; |
| } |
| |
| // allocate the memory on the device side |
| hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float)); |
| |
| // Lauching kernel from host |
| hipLaunchKernelGGL(matrixTranspose, |
| dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y), |
| dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), |
| sizeof(float)*WIDTH*WIDTH, 0, |
| gpuTransposeMatrix , Matrix, WIDTH); |
| hipDeviceSynchronize(); |
| |
| // CPU MatrixTranspose computation |
| matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); |
| |
| // verify the results |
| errors = 0; |
| double eps = 1.0E-6; |
| for (i = 0; i < NUM; i++) { |
| if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) { |
| printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]); |
| errors++; |
| } |
| } |
| if (errors!=0) { |
| printf("FAILED: %d errors\n",errors); |
| } else { |
| printf ("dynamic_shared PASSED!\n"); |
| } |
| |
| //free the resources on device side |
| hipFree(gpuTransposeMatrix); |
| |
| //free the resources on host side |
| hipFree(Matrix); |
| free(cpuTransposeMatrix); |
| |
| return errors; |
| } |