resources: Add HIP sample apps for GPU model tests
This patch adds in a subset of the sample applications found in
HIP/samples/2_Cookbook that should all work in gem5
hipEvent isn't included due to the timers in ROCm 1.6.4 not working in
gem5
profiler isn't included as it requires a kernel module, which doesn't
work well with Docker
peer2peer isn't included due to lack of multi-GPU support currently
Change-Id: I466af7597a4f85572f36c90081d1185805278e9e
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5-resources/+/36055
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: Jason Lowe-Power <power.jg@gmail.com>
diff --git a/README.md b/README.md
index e03e811..8c8491e 100644
--- a/README.md
+++ b/README.md
@@ -420,6 +420,49 @@
The compiled binary can be found in `src/hsa-agent-pkt/bin`
+# Resource: HIP Sample Applications
+
+The [HIP sample apps](
+https://github.com/ROCm-Developer-Tools/HIP/tree/roc-1.6.0/samples) contain
+applications that introduce various GPU programming concepts that are usable
+in HIP.
+
+The samples cover topics such as using and accessing different parts of GPU
+memory, running multiple GPU streams, and optimization techniques for GPU code.
+
+Certain apps aren't included due to complexities with either ROCm or Docker
+(hipEvent, profiler), or due to lack of feature support in gem5 (peer2peer)
+
+## Compilation
+
+```
+cd src/hip-samples
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make
+```
+
+Individual programs can be made by specifying the name of the program
+
+By default, this code builds for gfx801, a GCN3-based APU. This can be
+overridden by specifying `-e HCC_AMDGPU_TARGET=<target>` in the build command.
+
+## Pre-built binary
+
+<http://dist.gem5.org/dist/v20-1/test-progs/hip-samples/2dshfl>
+
+<http://dist.gem5.org/dist/v20-1/test-progs/hip-samples/dynamic_shared>
+
+<http://dist.gem5.org/dist/v20-1/test-progs/hip-samples/inline_asm>
+
+<http://dist.gem5.org/dist/v20-1/test-progs/hip-samples/MatrixTranspose>
+
+<http://dist.gem5.org/dist/v20-1/test-progs/hip-samples/sharedMemory>
+
+<http://dist.gem5.org/dist/v20-1/test-progs/hip-samples/shfl>
+
+<http://dist.gem5.org/dist/v20-1/test-progs/hip-samples/stream>
+
+<http://dist.gem5.org/dist/v20-1/test-progs/hip-samples/unroll>
+
# Resource: SPEC 2006
The [Standard Performance Evaluation Corporation](
@@ -552,6 +595,8 @@
same licence as 'src/square/square.cpp'.
`src/hsa-agent-pkt/HSA_Interface.[h|.cpp]` are licensed under a BSD Lisense
(A University of Maryland copyright).
+* **hip-samples**: Consult individual copyright notices of the source file in
+'src/hip-samples/src'
* **spec 2006**: SPEC CPU 2006 requires purchase of benchmark suite from
[SPEC](https://www.spec.org/cpu2006/) thus, it cannot be freely distributed.
Consult individual copyright notices of source files in `src/spec-2006`.
diff --git a/src/hip-samples/.gitignore b/src/hip-samples/.gitignore
new file mode 100644
index 0000000..ba077a4
--- /dev/null
+++ b/src/hip-samples/.gitignore
@@ -0,0 +1 @@
+bin
diff --git a/src/hip-samples/Makefile b/src/hip-samples/Makefile
new file mode 100644
index 0000000..da12ce5
--- /dev/null
+++ b/src/hip-samples/Makefile
@@ -0,0 +1,21 @@
+HIP_PATH?= /opt/rocm/hip
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+SRC_DIR := src
+BIN_DIR := bin
+
+SOURCES := $(wildcard $(SRC_DIR)/*.cpp)
+EXECUTABLES := $(basename $(notdir $(SOURCES)))
+
+.PHONY: all clean
+
+all: $(EXECUTABLES)
+
+$(EXECUTABLES): %: $(SRC_DIR)/%.cpp | $(BIN_DIR)
+ $(HIPCC) $< -o $(BIN_DIR)/$@
+
+$(BIN_DIR):
+ mkdir -p $@
+
+clean:
+ rm -rf $(BIN_DIR)
diff --git a/src/hip-samples/src/2dshfl.cpp b/src/hip-samples/src/2dshfl.cpp
new file mode 100644
index 0000000..1b22a0c
--- /dev/null
+++ b/src/hip-samples/src/2dshfl.cpp
@@ -0,0 +1,137 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+
+#define WIDTH 4
+
+#define NUM (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+ float *out,
+ float *in,
+ const int width)
+{
+ int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+ int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+ float val = in[y*width + x];
+
+ out[x*width + y] = __shfl(val,y*width + x);
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+ float * output,
+ float * input,
+ const unsigned int width)
+{
+ for(unsigned int j=0; j < width; j++)
+ {
+ for(unsigned int i=0; i < width; i++)
+ {
+ output[i*width + j] = input[j*width + i];
+ }
+ }
+}
+
+int main() {
+
+ float* Matrix;
+ float* TransposeMatrix;
+ float* cpuTransposeMatrix;
+
+ float* gpuMatrix;
+ float* gpuTransposeMatrix;
+
+ hipDeviceProp_t devProp;
+ hipGetDeviceProperties(&devProp, 0);
+
+ std::cout << "Device name " << devProp.name << std::endl;
+
+ int i;
+ int errors;
+
+ Matrix = (float*)malloc(NUM * sizeof(float));
+ TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+ cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+ // initialize the input data
+ for (i = 0; i < NUM; i++) {
+ Matrix[i] = (float)i*10.0f;
+ }
+
+ // allocate the memory on the device side
+ hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+ hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+ // Memory transfer from host to device
+ hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+ // Lauching kernel from host
+ hipLaunchKernel(matrixTranspose,
+ dim3(1),
+ dim3(THREADS_PER_BLOCK_X , THREADS_PER_BLOCK_Y),
+ 0, 0,
+ gpuTransposeMatrix , gpuMatrix, WIDTH);
+
+ // Memory transfer from device to host
+ hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+ // CPU MatrixTranspose computation
+ matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+ // verify the results
+ errors = 0;
+ double eps = 1.0E-6;
+ for (i = 0; i < NUM; i++) {
+ if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
+ printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+ errors++;
+ }
+ }
+ if (errors!=0) {
+ printf("FAILED: %d errors\n",errors);
+ } else {
+ printf ("PASSED!\n");
+ }
+
+ //free the resources on device side
+ hipFree(gpuMatrix);
+ hipFree(gpuTransposeMatrix);
+
+ //free the resources on host side
+ free(Matrix);
+ free(TransposeMatrix);
+ free(cpuTransposeMatrix);
+
+ return errors;
+}
diff --git a/src/hip-samples/src/MatrixTranspose.cpp b/src/hip-samples/src/MatrixTranspose.cpp
new file mode 100644
index 0000000..264fcbe
--- /dev/null
+++ b/src/hip-samples/src/MatrixTranspose.cpp
@@ -0,0 +1,136 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+
+#define WIDTH 1024
+
+
+#define NUM (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+ float *out,
+ float *in,
+ const int width)
+{
+ int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+ int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+
+ out[y * width + x] = in[x * width + y];
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+ float * output,
+ float * input,
+ const unsigned int width)
+{
+ for(unsigned int j=0; j < width; j++)
+ {
+ for(unsigned int i=0; i < width; i++)
+ {
+ output[i*width + j] = input[j*width + i];
+ }
+ }
+}
+
+int main() {
+
+ float* Matrix;
+ float* TransposeMatrix;
+ float* cpuTransposeMatrix;
+
+ float* gpuMatrix;
+ float* gpuTransposeMatrix;
+
+ hipDeviceProp_t devProp;
+ hipGetDeviceProperties(&devProp, 0);
+
+ std::cout << "Device name " << devProp.name << std::endl;
+
+ int i;
+ int errors;
+
+ Matrix = (float*)malloc(NUM * sizeof(float));
+ TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+ cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+ // initialize the input data
+ for (i = 0; i < NUM; i++) {
+ Matrix[i] = (float)i*10.0f;
+ }
+
+ // allocate the memory on the device side
+ hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+ hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+ // Memory transfer from host to device
+ hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+ // Lauching kernel from host
+ hipLaunchKernel(matrixTranspose,
+ dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+ dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+ 0, 0,
+ gpuTransposeMatrix , gpuMatrix, WIDTH);
+
+ // Memory transfer from device to host
+ hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+ // CPU MatrixTranspose computation
+ matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+ // verify the results
+ errors = 0;
+ double eps = 1.0E-6;
+ for (i = 0; i < NUM; i++) {
+ if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+ errors++;
+ }
+ }
+ if (errors!=0) {
+ printf("FAILED: %d errors\n",errors);
+ } else {
+ printf ("PASSED!\n");
+ }
+
+ //free the resources on device side
+ hipFree(gpuMatrix);
+ hipFree(gpuTransposeMatrix);
+
+ //free the resources on host side
+ free(Matrix);
+ free(TransposeMatrix);
+ free(cpuTransposeMatrix);
+
+ return errors;
+}
diff --git a/src/hip-samples/src/dynamic_shared.cpp b/src/hip-samples/src/dynamic_shared.cpp
new file mode 100644
index 0000000..22d7eb9
--- /dev/null
+++ b/src/hip-samples/src/dynamic_shared.cpp
@@ -0,0 +1,141 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+#define WIDTH 16
+
+#define NUM (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+ float *out,
+ float *in,
+ const int width)
+{
+ // declare dynamic shared memory
+ HIP_DYNAMIC_SHARED(float, sharedMem);
+
+ int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+ int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+
+ sharedMem[y * width + x] = in[x * width + y];
+
+ __syncthreads();
+
+ out[y * width + x] = sharedMem[y * width + x];
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+ float * output,
+ float * input,
+ const unsigned int width)
+{
+ for(unsigned int j=0; j < width; j++)
+ {
+ for(unsigned int i=0; i < width; i++)
+ {
+ output[i*width + j] = input[j*width + i];
+ }
+ }
+}
+
+int main() {
+
+ float* Matrix;
+ float* TransposeMatrix;
+ float* cpuTransposeMatrix;
+
+ float* gpuMatrix;
+ float* gpuTransposeMatrix;
+
+ hipDeviceProp_t devProp;
+ hipGetDeviceProperties(&devProp, 0);
+
+ std::cout << "Device name " << devProp.name << std::endl;
+
+ int i;
+ int errors;
+
+ Matrix = (float*)malloc(NUM * sizeof(float));
+ TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+ cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+ // initialize the input data
+ for (i = 0; i < NUM; i++) {
+ Matrix[i] = (float)i*10.0f;
+ }
+
+ // allocate the memory on the device side
+ hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+ hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+ // Memory transfer from host to device
+ hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+ // Lauching kernel from host
+ hipLaunchKernel(matrixTranspose,
+ dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+ dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+ sizeof(float)*WIDTH*WIDTH, 0,
+ gpuTransposeMatrix , gpuMatrix, WIDTH);
+
+ // Memory transfer from device to host
+ hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+ // CPU MatrixTranspose computation
+ matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+ // verify the results
+ errors = 0;
+ double eps = 1.0E-6;
+ for (i = 0; i < NUM; i++) {
+ if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+ printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+ errors++;
+ }
+ }
+ if (errors!=0) {
+ printf("FAILED: %d errors\n",errors);
+ } else {
+ printf ("dynamic_shared PASSED!\n");
+ }
+
+ //free the resources on device side
+ hipFree(gpuMatrix);
+ hipFree(gpuTransposeMatrix);
+
+ //free the resources on host side
+ free(Matrix);
+ free(TransposeMatrix);
+ free(cpuTransposeMatrix);
+
+ return errors;
+}
diff --git a/src/hip-samples/src/inline_asm.cpp b/src/hip-samples/src/inline_asm.cpp
new file mode 100644
index 0000000..f2345e5
--- /dev/null
+++ b/src/hip-samples/src/inline_asm.cpp
@@ -0,0 +1,136 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+#define WIDTH 1024
+
+#define NUM (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+ float *out,
+ float *in,
+ const int width)
+{
+
+ int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+ int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+
+ asm volatile ("v_mov_b32_e32 %0, %1" : "=v" (out[x*width + y]) : "v" (in[y*width + x]));
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+ float * output,
+ float * input,
+ const unsigned int width)
+{
+ for(unsigned int j=0; j < width; j++)
+ {
+ for(unsigned int i=0; i < width; i++)
+ {
+ output[i*width + j] = input[j*width + i];
+ }
+ }
+}
+
+int main() {
+
+ float* Matrix;
+ float* TransposeMatrix;
+ float* cpuTransposeMatrix;
+
+ float* gpuMatrix;
+ float* gpuTransposeMatrix;
+
+ hipDeviceProp_t devProp;
+ hipGetDeviceProperties(&devProp, 0);
+
+ std::cout << "Device name " << devProp.name << std::endl;
+
+ int i;
+ int errors;
+
+ Matrix = (float*)malloc(NUM * sizeof(float));
+ TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+ cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+ // initialize the input data
+ for (i = 0; i < NUM; i++) {
+ Matrix[i] = (float)i*10.0f;
+ }
+
+ // allocate the memory on the device side
+ hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+ hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+ // Memory transfer from host to device
+ hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+ // Lauching kernel from host
+ hipLaunchKernel(matrixTranspose,
+ dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+ dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+ 0, 0,
+ gpuTransposeMatrix , gpuMatrix, WIDTH);
+
+ // Memory transfer from device to host
+ hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+ // CPU MatrixTranspose computation
+ matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+ // verify the results
+ errors = 0;
+ double eps = 1.0E-6;
+ for (i = 0; i < NUM; i++) {
+ if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+ printf("gpu%f cpu %f \n",TransposeMatrix[i],cpuTransposeMatrix[i]);
+ errors++;
+ }
+ }
+ if (errors!=0) {
+ printf("FAILED: %d errors\n",errors);
+ } else {
+ printf ("PASSED!\n");
+ }
+
+ //free the resources on device side
+ hipFree(gpuMatrix);
+ hipFree(gpuTransposeMatrix);
+
+ //free the resources on host side
+ free(Matrix);
+ free(TransposeMatrix);
+ free(cpuTransposeMatrix);
+
+ return errors;
+}
diff --git a/src/hip-samples/src/sharedMemory.cpp b/src/hip-samples/src/sharedMemory.cpp
new file mode 100644
index 0000000..9b51aba
--- /dev/null
+++ b/src/hip-samples/src/sharedMemory.cpp
@@ -0,0 +1,142 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+
+#define WIDTH 64
+
+#define NUM (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+ float *out,
+ float *in,
+ const int width)
+{
+ __shared__ float sharedMem[WIDTH*WIDTH];
+
+ int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+ int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+
+ sharedMem[y * width + x] = in[x * width + y];
+
+ __syncthreads();
+
+ out[y * width + x] = sharedMem[y * width + x];
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+ float * output,
+ float * input,
+ const unsigned int width)
+{
+ for(unsigned int j=0; j < width; j++)
+ {
+ for(unsigned int i=0; i < width; i++)
+ {
+ output[i*width + j] = input[j*width + i];
+ }
+ }
+}
+
+int main() {
+
+ float* Matrix;
+ float* TransposeMatrix;
+ float* cpuTransposeMatrix;
+
+ float* gpuMatrix;
+ float* gpuTransposeMatrix;
+
+ hipDeviceProp_t devProp;
+ hipGetDeviceProperties(&devProp, 0);
+
+ std::cout << "Device name " << devProp.name << std::endl;
+
+ int i;
+ int errors;
+
+ Matrix = (float*)malloc(NUM * sizeof(float));
+ TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+ cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+ // initialize the input data
+ for (i = 0; i < NUM; i++) {
+ Matrix[i] = (float)i*10.0f;
+ }
+
+ // allocate the memory on the device side
+ hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+ hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+ // Memory transfer from host to device
+ hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+ // Lauching kernel from host
+ hipLaunchKernel(matrixTranspose,
+ dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+ dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+ 0, 0,
+ gpuTransposeMatrix , gpuMatrix, WIDTH);
+
+ // Memory transfer from device to host
+ hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+ // CPU MatrixTranspose computation
+ matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+ // verify the results
+ errors = 0;
+ double eps = 1.0E-6;
+ for (i = 0; i < NUM; i++) {
+ if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+ printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+ errors++;
+ }
+ }
+ if (errors!=0) {
+ printf("FAILED: %d errors\n",errors);
+ } else {
+ printf ("PASSED!\n");
+ }
+
+ //free the resources on device side
+ hipFree(gpuMatrix);
+ hipFree(gpuTransposeMatrix);
+
+ //free the resources on host side
+ free(Matrix);
+ free(TransposeMatrix);
+ free(cpuTransposeMatrix);
+
+ return errors;
+}
diff --git a/src/hip-samples/src/shfl.cpp b/src/hip-samples/src/shfl.cpp
new file mode 100644
index 0000000..e0f4c21
--- /dev/null
+++ b/src/hip-samples/src/shfl.cpp
@@ -0,0 +1,141 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+
+#define WIDTH 4
+
+#define NUM (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+ float *out,
+ float *in,
+ const int width)
+{
+ int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+
+ float val = in[x];
+
+ for(int i=0;i<width;i++)
+ {
+ for(int j=0;j<width;j++)
+ out[i*width + j] = __shfl(val,j*width + i);
+ }
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+ float * output,
+ float * input,
+ const unsigned int width)
+{
+ for(unsigned int j=0; j < width; j++)
+ {
+ for(unsigned int i=0; i < width; i++)
+ {
+ output[i*width + j] = input[j*width + i];
+ }
+ }
+}
+
+int main() {
+
+ float* Matrix;
+ float* TransposeMatrix;
+ float* cpuTransposeMatrix;
+
+ float* gpuMatrix;
+ float* gpuTransposeMatrix;
+
+ hipDeviceProp_t devProp;
+ hipGetDeviceProperties(&devProp, 0);
+
+ std::cout << "Device name " << devProp.name << std::endl;
+
+ int i;
+ int errors;
+
+ Matrix = (float*)malloc(NUM * sizeof(float));
+ TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+ cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+ // initialize the input data
+ for (i = 0; i < NUM; i++) {
+ Matrix[i] = (float)i*10.0f;
+ }
+
+ // allocate the memory on the device side
+ hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+ hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+ // Memory transfer from host to device
+ hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+ // Lauching kernel from host
+ hipLaunchKernel(matrixTranspose,
+ dim3(1),
+ dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
+ 0, 0,
+ gpuTransposeMatrix , gpuMatrix, WIDTH);
+
+ // Memory transfer from device to host
+ hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+ // CPU MatrixTranspose computation
+ matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+ // verify the results
+ errors = 0;
+ double eps = 1.0E-6;
+ for (i = 0; i < NUM; i++) {
+ if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+ printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+ errors++;
+ }
+ }
+ if (errors!=0) {
+ printf("FAILED: %d errors\n",errors);
+ } else {
+ printf ("PASSED!\n");
+ }
+
+ //free the resources on device side
+ hipFree(gpuMatrix);
+ hipFree(gpuTransposeMatrix);
+
+ //free the resources on host side
+ free(Matrix);
+ free(TransposeMatrix);
+ free(cpuTransposeMatrix);
+
+ return errors;
+}
diff --git a/src/hip-samples/src/stream.cpp b/src/hip-samples/src/stream.cpp
new file mode 100644
index 0000000..2dc7544
--- /dev/null
+++ b/src/hip-samples/src/stream.cpp
@@ -0,0 +1,147 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANUMTY OF ANY KIND, EXPRESS OR
+IMPLIED, INUMCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNUMESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANUMY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INUM AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INUM CONUMECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <iostream>
+#include <hip/hip_runtime.h>
+
+#define WIDTH 32
+
+#define NUM (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1
+
+using namespace std;
+
+__global__ void matrixTranspose_static_shared(hipLaunchParm lp,
+ float *out,
+ float *in,
+ const int width)
+{
+ __shared__ float sharedMem[WIDTH*WIDTH];
+
+ int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+ int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+
+ sharedMem[y * width + x] = in[x * width + y];
+
+ __syncthreads();
+
+ out[y * width + x] = sharedMem[y * width + x];
+}
+
+__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp,
+ float *out,
+ float *in,
+ const int width)
+{
+ // declare dynamic shared memory
+ HIP_DYNAMIC_SHARED(float, sharedMem)
+
+ int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+ int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+
+ sharedMem[y * width + x] = in[x * width + y];
+
+ __syncthreads();
+
+ out[y * width + x] = sharedMem[y * width + x];
+}
+
+void MultipleStream (float **data, float *randArray, float **gpuTransposeMatrix, float **TransposeMatrix, int width)
+{
+ const int num_streams = 2;
+ hipStream_t streams[num_streams];
+
+ for(int i=0;i<num_streams;i++)
+ hipStreamCreate(&streams[i]);
+
+ for(int i=0;i<num_streams;i++)
+ {
+ hipMalloc((void**)&data[i], NUM * sizeof(float));
+ hipMemcpyAsync(data[i], randArray, NUM * sizeof(float), hipMemcpyHostToDevice,streams[i]);
+ }
+
+ hipLaunchKernel(matrixTranspose_static_shared,
+ dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+ dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+ 0, streams[0],
+ gpuTransposeMatrix[0], data[0], width);
+
+ hipLaunchKernel(matrixTranspose_dynamic_shared,
+ dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+ dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+ sizeof(float)*WIDTH*WIDTH, streams[1],
+ gpuTransposeMatrix[1], data[1], width);
+
+ for(int i=0;i<num_streams;i++)
+ hipMemcpyAsync(TransposeMatrix[i], gpuTransposeMatrix[i], NUM*sizeof(float), hipMemcpyDeviceToHost, streams[i]);
+}
+
+int main(){
+
+ hipSetDevice(0);
+
+ float *data[2], *TransposeMatrix[2], *gpuTransposeMatrix[2], *randArray;
+
+ int width = WIDTH;
+
+ randArray = (float*)malloc(NUM * sizeof(float));
+
+ TransposeMatrix[0] = (float*)malloc(NUM * sizeof(float));
+ TransposeMatrix[1] = (float*)malloc(NUM * sizeof(float));
+
+ hipMalloc((void**)&gpuTransposeMatrix[0], NUM * sizeof(float));
+ hipMalloc((void**)&gpuTransposeMatrix[1], NUM * sizeof(float));
+
+ for(int i = 0; i < NUM; i++)
+ {
+ randArray[i] = (float)i*1.0f;
+ }
+
+ MultipleStream(data, randArray, gpuTransposeMatrix, TransposeMatrix, width);
+
+ hipDeviceSynchronize();
+
+ // verify the results
+ int errors = 0;
+ double eps = 1.0E-6;
+ for (int i = 0; i < NUM; i++) {
+ if (std::abs(TransposeMatrix[0][i] - TransposeMatrix[1][i]) > eps ) {
+ printf("%d stream0: %f stream1 %f\n",i,TransposeMatrix[0][i],TransposeMatrix[1][i]);
+ errors++;
+ }
+ }
+ if (errors!=0) {
+ printf("FAILED: %d errors\n",errors);
+ } else {
+ printf ("stream PASSED!\n");
+ }
+
+ free(randArray);
+ for(int i=0;i<2;i++){
+ hipFree(data[i]);
+ hipFree(gpuTransposeMatrix[i]);
+ free(TransposeMatrix[i]);
+ }
+
+ hipDeviceReset();
+ return 0;
+}
diff --git a/src/hip-samples/src/unroll.cpp b/src/hip-samples/src/unroll.cpp
new file mode 100644
index 0000000..22f1c75
--- /dev/null
+++ b/src/hip-samples/src/unroll.cpp
@@ -0,0 +1,141 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+
+#define WIDTH 4
+
+#define NUM (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+ float *out,
+ float *in,
+ const int width)
+{
+ int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+ float val = in[x];
+
+#pragma unroll
+ for(int i=0;i<width;i++)
+ {
+ for(int j=0;j<width;j++)
+ out[i*width + j] = __shfl(val,j*width + i);
+ }
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+ float * output,
+ float * input,
+ const unsigned int width)
+{
+ for(unsigned int j=0; j < width; j++)
+ {
+ for(unsigned int i=0; i < width; i++)
+ {
+ output[i*width + j] = input[j*width + i];
+ }
+ }
+}
+
+int main() {
+
+ float* Matrix;
+ float* TransposeMatrix;
+ float* cpuTransposeMatrix;
+
+ float* gpuMatrix;
+ float* gpuTransposeMatrix;
+
+ hipDeviceProp_t devProp;
+ hipGetDeviceProperties(&devProp, 0);
+
+ std::cout << "Device name " << devProp.name << std::endl;
+
+ int i;
+ int errors;
+
+ Matrix = (float*)malloc(NUM * sizeof(float));
+ TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+ cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+ // initialize the input data
+ for (i = 0; i < NUM; i++) {
+ Matrix[i] = (float)i*10.0f;
+ }
+
+ // allocate the memory on the device side
+ hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+ hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+ // Memory transfer from host to device
+ hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+ // Lauching kernel from host
+ hipLaunchKernel(matrixTranspose,
+ dim3(1),
+ dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
+ 0, 0,
+ gpuTransposeMatrix , gpuMatrix, WIDTH);
+
+ // Memory transfer from device to host
+ hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+ // CPU MatrixTranspose computation
+ matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+ // verify the results
+ errors = 0;
+ double eps = 1.0E-6;
+ for (i = 0; i < NUM; i++) {
+ if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+ printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+ errors++;
+ }
+ }
+ if (errors!=0) {
+ printf("FAILED: %d errors\n",errors);
+ } else {
+ printf ("PASSED!\n");
+ }
+
+ //free the resources on device side
+ hipFree(gpuMatrix);
+ hipFree(gpuTransposeMatrix);
+
+ //free the resources on host side
+ free(Matrix);
+ free(TransposeMatrix);
+ free(cpuTransposeMatrix);
+
+ return errors;
+}