resources: Add HIP sample apps for GPU model tests

This patch adds in a subset of the sample applications found in
HIP/samples/2_Cookbook that should all work in gem5

hipEvent isn't included due to the timers in ROCm 1.6.4 not working in
gem5

profiler isn't included as it requires a kernel module, which doesn't
work well with Docker

peer2peer isn't included due to lack of multi-GPU support currently

Change-Id: I466af7597a4f85572f36c90081d1185805278e9e
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5-resources/+/36055
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: Jason Lowe-Power <power.jg@gmail.com>
diff --git a/README.md b/README.md
index e03e811..8c8491e 100644
--- a/README.md
+++ b/README.md
@@ -420,6 +420,49 @@
 
 The compiled binary can be found in `src/hsa-agent-pkt/bin`
 
+# Resource: HIP Sample Applications
+
+The [HIP sample apps](
+https://github.com/ROCm-Developer-Tools/HIP/tree/roc-1.6.0/samples) contain
+applications that introduce various GPU programming concepts that are usable
+in HIP.
+
+The samples cover topics such as using and accessing different parts of GPU
+memory, running multiple GPU streams, and optimization techniques for GPU code.
+
+Certain apps aren't included due to complexities with either ROCm or Docker
+(hipEvent, profiler), or due to lack of feature support in gem5 (peer2peer)
+
+## Compilation
+
+```
+cd src/hip-samples
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make
+```
+
+Individual programs can be made by specifying the name of the program
+
+By default, this code builds for gfx801, a GCN3-based APU. This can be
+overridden by specifying `-e HCC_AMDGPU_TARGET=<target>` in the build command.
+
+## Pre-built binary
+
+<http://dist.gem5.org/dist/v20-1/test-progs/hip-samples/2dshfl>
+
+<http://dist.gem5.org/dist/v20-1/test-progs/hip-samples/dynamic_shared>
+
+<http://dist.gem5.org/dist/v20-1/test-progs/hip-samples/inline_asm>
+
+<http://dist.gem5.org/dist/v20-1/test-progs/hip-samples/MatrixTranspose>
+
+<http://dist.gem5.org/dist/v20-1/test-progs/hip-samples/sharedMemory>
+
+<http://dist.gem5.org/dist/v20-1/test-progs/hip-samples/shfl>
+
+<http://dist.gem5.org/dist/v20-1/test-progs/hip-samples/stream>
+
+<http://dist.gem5.org/dist/v20-1/test-progs/hip-samples/unroll>
+
 # Resource: SPEC 2006
 
 The [Standard Performance Evaluation Corporation](
@@ -552,6 +595,8 @@
 same licence as 'src/square/square.cpp'.
 `src/hsa-agent-pkt/HSA_Interface.[h|.cpp]` are licensed under a BSD Lisense
 (A University of Maryland copyright).
+* **hip-samples**: Consult individual copyright notices of the source file in
+'src/hip-samples/src'
 * **spec 2006**: SPEC CPU 2006 requires purchase of benchmark suite from
 [SPEC](https://www.spec.org/cpu2006/) thus, it cannot be freely distributed.
 Consult individual copyright notices of source files in `src/spec-2006`.
diff --git a/src/hip-samples/.gitignore b/src/hip-samples/.gitignore
new file mode 100644
index 0000000..ba077a4
--- /dev/null
+++ b/src/hip-samples/.gitignore
@@ -0,0 +1 @@
+bin
diff --git a/src/hip-samples/Makefile b/src/hip-samples/Makefile
new file mode 100644
index 0000000..da12ce5
--- /dev/null
+++ b/src/hip-samples/Makefile
@@ -0,0 +1,21 @@
+HIP_PATH?= /opt/rocm/hip
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+SRC_DIR := src
+BIN_DIR := bin
+
+SOURCES := $(wildcard $(SRC_DIR)/*.cpp)
+EXECUTABLES := $(basename $(notdir $(SOURCES)))
+
+.PHONY: all clean
+
+all: $(EXECUTABLES)
+
+$(EXECUTABLES): %: $(SRC_DIR)/%.cpp | $(BIN_DIR)
+	$(HIPCC) $< -o $(BIN_DIR)/$@
+
+$(BIN_DIR):
+	mkdir -p $@
+
+clean:
+	rm -rf $(BIN_DIR)
diff --git a/src/hip-samples/src/2dshfl.cpp b/src/hip-samples/src/2dshfl.cpp
new file mode 100644
index 0000000..1b22a0c
--- /dev/null
+++ b/src/hip-samples/src/2dshfl.cpp
@@ -0,0 +1,137 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+
+#define WIDTH     4
+
+#define NUM       (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X  4
+#define THREADS_PER_BLOCK_Y  4
+#define THREADS_PER_BLOCK_Z  1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+                                float *out,
+                                float *in,
+                                const int width)
+{
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+    float val = in[y*width + x];
+
+    out[x*width + y] = __shfl(val,y*width + x);
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+    float * output,
+    float * input,
+    const unsigned int width)
+{
+    for(unsigned int j=0; j < width; j++)
+    {
+        for(unsigned int i=0; i < width; i++)
+        {
+            output[i*width + j] = input[j*width + i];
+        }
+    }
+}
+
+int main() {
+
+  float* Matrix;
+  float* TransposeMatrix;
+  float* cpuTransposeMatrix;
+
+  float* gpuMatrix;
+  float* gpuTransposeMatrix;
+
+  hipDeviceProp_t devProp;
+  hipGetDeviceProperties(&devProp, 0);
+
+  std::cout << "Device name " << devProp.name << std::endl;
+
+  int i;
+  int errors;
+
+  Matrix = (float*)malloc(NUM * sizeof(float));
+  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    Matrix[i] = (float)i*10.0f;
+  }
+
+  // allocate the memory on the device side
+  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+  // Memory transfer from host to device
+  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+  // Lauching kernel from host
+  hipLaunchKernel(matrixTranspose,
+                  dim3(1),
+                  dim3(THREADS_PER_BLOCK_X , THREADS_PER_BLOCK_Y),
+                  0, 0,
+                  gpuTransposeMatrix , gpuMatrix, WIDTH);
+
+  // Memory transfer from device to host
+  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+  // CPU MatrixTranspose computation
+  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+  // verify the results
+  errors = 0;
+  double eps = 1.0E-6;
+  for (i = 0; i < NUM; i++) {
+    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+      errors++;
+    }
+  }
+  if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+  } else {
+    printf ("PASSED!\n");
+  }
+
+  //free the resources on device side
+  hipFree(gpuMatrix);
+  hipFree(gpuTransposeMatrix);
+
+  //free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);
+
+  return errors;
+}
diff --git a/src/hip-samples/src/MatrixTranspose.cpp b/src/hip-samples/src/MatrixTranspose.cpp
new file mode 100644
index 0000000..264fcbe
--- /dev/null
+++ b/src/hip-samples/src/MatrixTranspose.cpp
@@ -0,0 +1,136 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+
+#define WIDTH     1024
+
+
+#define NUM       (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X  4
+#define THREADS_PER_BLOCK_Y  4
+#define THREADS_PER_BLOCK_Z  1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+                                float *out,
+                                float *in,
+                                const int width)
+{
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+
+    out[y * width + x] = in[x * width + y];
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+    float * output,
+    float * input,
+    const unsigned int width)
+{
+    for(unsigned int j=0; j < width; j++)
+    {
+        for(unsigned int i=0; i < width; i++)
+        {
+            output[i*width + j] = input[j*width + i];
+        }
+    }
+}
+
+int main() {
+
+  float* Matrix;
+  float* TransposeMatrix;
+  float* cpuTransposeMatrix;
+
+  float* gpuMatrix;
+  float* gpuTransposeMatrix;
+
+  hipDeviceProp_t devProp;
+  hipGetDeviceProperties(&devProp, 0);
+
+  std::cout << "Device name " << devProp.name << std::endl;
+
+  int i;
+  int errors;
+
+  Matrix = (float*)malloc(NUM * sizeof(float));
+  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    Matrix[i] = (float)i*10.0f;
+  }
+
+  // allocate the memory on the device side
+  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+  // Memory transfer from host to device
+  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+  // Lauching kernel from host
+  hipLaunchKernel(matrixTranspose,
+                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                  0, 0,
+                  gpuTransposeMatrix , gpuMatrix, WIDTH);
+
+  // Memory transfer from device to host
+  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+  // CPU MatrixTranspose computation
+  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+  // verify the results
+  errors = 0;
+  double eps = 1.0E-6;
+  for (i = 0; i < NUM; i++) {
+    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+      errors++;
+    }
+  }
+  if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+  } else {
+    printf ("PASSED!\n");
+  }
+
+  //free the resources on device side
+  hipFree(gpuMatrix);
+  hipFree(gpuTransposeMatrix);
+
+  //free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);
+
+  return errors;
+}
diff --git a/src/hip-samples/src/dynamic_shared.cpp b/src/hip-samples/src/dynamic_shared.cpp
new file mode 100644
index 0000000..22d7eb9
--- /dev/null
+++ b/src/hip-samples/src/dynamic_shared.cpp
@@ -0,0 +1,141 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+#define WIDTH     16
+
+#define NUM       (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X  4
+#define THREADS_PER_BLOCK_Y  4
+#define THREADS_PER_BLOCK_Z  1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+                                float *out,
+                                float *in,
+                                const int width)
+{
+    // declare dynamic shared memory
+    HIP_DYNAMIC_SHARED(float, sharedMem);
+
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+
+    sharedMem[y * width + x] = in[x * width + y];
+
+    __syncthreads();
+
+    out[y * width + x] = sharedMem[y * width + x];
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+    float * output,
+    float * input,
+    const unsigned int width)
+{
+    for(unsigned int j=0; j < width; j++)
+    {
+        for(unsigned int i=0; i < width; i++)
+        {
+            output[i*width + j] = input[j*width + i];
+        }
+    }
+}
+
+int main() {
+
+  float* Matrix;
+  float* TransposeMatrix;
+  float* cpuTransposeMatrix;
+
+  float* gpuMatrix;
+  float* gpuTransposeMatrix;
+
+  hipDeviceProp_t devProp;
+  hipGetDeviceProperties(&devProp, 0);
+
+  std::cout << "Device name " << devProp.name << std::endl;
+
+  int i;
+  int errors;
+
+  Matrix = (float*)malloc(NUM * sizeof(float));
+  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    Matrix[i] = (float)i*10.0f;
+  }
+
+  // allocate the memory on the device side
+  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+  // Memory transfer from host to device
+  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+  // Lauching kernel from host
+  hipLaunchKernel(matrixTranspose,
+                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                  sizeof(float)*WIDTH*WIDTH, 0,
+                  gpuTransposeMatrix , gpuMatrix, WIDTH);
+
+  // Memory transfer from device to host
+  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+  // CPU MatrixTranspose computation
+  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+  // verify the results
+  errors = 0;
+  double eps = 1.0E-6;
+  for (i = 0; i < NUM; i++) {
+    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+      errors++;
+    }
+  }
+  if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+  } else {
+    printf ("dynamic_shared PASSED!\n");
+  }
+
+  //free the resources on device side
+  hipFree(gpuMatrix);
+  hipFree(gpuTransposeMatrix);
+
+  //free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);
+
+  return errors;
+}
diff --git a/src/hip-samples/src/inline_asm.cpp b/src/hip-samples/src/inline_asm.cpp
new file mode 100644
index 0000000..f2345e5
--- /dev/null
+++ b/src/hip-samples/src/inline_asm.cpp
@@ -0,0 +1,136 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+#define WIDTH     1024
+
+#define NUM       (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X  4
+#define THREADS_PER_BLOCK_Y  4
+#define THREADS_PER_BLOCK_Z  1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+                                float *out,
+                                float *in,
+                                const int width)
+{
+
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+
+    asm volatile ("v_mov_b32_e32 %0, %1" : "=v" (out[x*width + y]) : "v" (in[y*width + x]));
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+    float * output,
+    float * input,
+    const unsigned int width)
+{
+    for(unsigned int j=0; j < width; j++)
+    {
+        for(unsigned int i=0; i < width; i++)
+        {
+            output[i*width + j] = input[j*width + i];
+        }
+    }
+}
+
+int main() {
+
+  float* Matrix;
+  float* TransposeMatrix;
+  float* cpuTransposeMatrix;
+
+  float* gpuMatrix;
+  float* gpuTransposeMatrix;
+
+  hipDeviceProp_t devProp;
+  hipGetDeviceProperties(&devProp, 0);
+
+  std::cout << "Device name " << devProp.name << std::endl;
+
+  int i;
+  int errors;
+
+  Matrix = (float*)malloc(NUM * sizeof(float));
+  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    Matrix[i] = (float)i*10.0f;
+  }
+
+  // allocate the memory on the device side
+  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+  // Memory transfer from host to device
+  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+  // Lauching kernel from host
+  hipLaunchKernel(matrixTranspose,
+                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                  0, 0,
+                  gpuTransposeMatrix , gpuMatrix, WIDTH);
+
+  // Memory transfer from device to host
+  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+  // CPU MatrixTranspose computation
+  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+  // verify the results
+  errors = 0;
+  double eps = 1.0E-6;
+  for (i = 0; i < NUM; i++) {
+    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("gpu%f cpu %f \n",TransposeMatrix[i],cpuTransposeMatrix[i]);
+      errors++;
+    }
+  }
+  if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+  } else {
+    printf ("PASSED!\n");
+  }
+
+  //free the resources on device side
+  hipFree(gpuMatrix);
+  hipFree(gpuTransposeMatrix);
+
+  //free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);
+
+  return errors;
+}
diff --git a/src/hip-samples/src/sharedMemory.cpp b/src/hip-samples/src/sharedMemory.cpp
new file mode 100644
index 0000000..9b51aba
--- /dev/null
+++ b/src/hip-samples/src/sharedMemory.cpp
@@ -0,0 +1,142 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+
+#define WIDTH     64
+
+#define NUM       (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X  4
+#define THREADS_PER_BLOCK_Y  4
+#define THREADS_PER_BLOCK_Z  1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+                                float *out,
+                                float *in,
+                                const int width)
+{
+    __shared__ float sharedMem[WIDTH*WIDTH];
+
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+
+    sharedMem[y * width + x] = in[x * width + y];
+
+    __syncthreads();
+
+    out[y * width + x] = sharedMem[y * width + x];
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+    float * output,
+    float * input,
+    const unsigned int width)
+{
+    for(unsigned int j=0; j < width; j++)
+    {
+        for(unsigned int i=0; i < width; i++)
+        {
+            output[i*width + j] = input[j*width + i];
+        }
+    }
+}
+
+int main() {
+
+  float* Matrix;
+  float* TransposeMatrix;
+  float* cpuTransposeMatrix;
+
+  float* gpuMatrix;
+  float* gpuTransposeMatrix;
+
+  hipDeviceProp_t devProp;
+  hipGetDeviceProperties(&devProp, 0);
+
+  std::cout << "Device name " << devProp.name << std::endl;
+
+  int i;
+  int errors;
+
+  Matrix = (float*)malloc(NUM * sizeof(float));
+  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    Matrix[i] = (float)i*10.0f;
+  }
+
+  // allocate the memory on the device side
+  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+  // Memory transfer from host to device
+  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+  // Lauching kernel from host
+  hipLaunchKernel(matrixTranspose,
+                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                  0, 0,
+                  gpuTransposeMatrix , gpuMatrix, WIDTH);
+
+  // Memory transfer from device to host
+  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+  // CPU MatrixTranspose computation
+  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+  // verify the results
+  errors = 0;
+  double eps = 1.0E-6;
+  for (i = 0; i < NUM; i++) {
+    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+      errors++;
+    }
+  }
+  if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+  } else {
+    printf ("PASSED!\n");
+  }
+
+  //free the resources on device side
+  hipFree(gpuMatrix);
+  hipFree(gpuTransposeMatrix);
+
+  //free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);
+
+  return errors;
+}
diff --git a/src/hip-samples/src/shfl.cpp b/src/hip-samples/src/shfl.cpp
new file mode 100644
index 0000000..e0f4c21
--- /dev/null
+++ b/src/hip-samples/src/shfl.cpp
@@ -0,0 +1,141 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+
+#define WIDTH     4
+
+#define NUM       (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X  4
+#define THREADS_PER_BLOCK_Y  4
+#define THREADS_PER_BLOCK_Z  1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+                                float *out,
+                                float *in,
+                                const int width)
+{
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+
+    float val = in[x];
+
+    for(int i=0;i<width;i++)
+    {
+        for(int j=0;j<width;j++)
+            out[i*width + j] = __shfl(val,j*width + i);
+    }
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+    float * output,
+    float * input,
+    const unsigned int width)
+{
+    for(unsigned int j=0; j < width; j++)
+    {
+        for(unsigned int i=0; i < width; i++)
+        {
+            output[i*width + j] = input[j*width + i];
+        }
+    }
+}
+
+int main() {
+
+  float* Matrix;
+  float* TransposeMatrix;
+  float* cpuTransposeMatrix;
+
+  float* gpuMatrix;
+  float* gpuTransposeMatrix;
+
+  hipDeviceProp_t devProp;
+  hipGetDeviceProperties(&devProp, 0);
+
+  std::cout << "Device name " << devProp.name << std::endl;
+
+  int i;
+  int errors;
+
+  Matrix = (float*)malloc(NUM * sizeof(float));
+  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    Matrix[i] = (float)i*10.0f;
+  }
+
+  // allocate the memory on the device side
+  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+  // Memory transfer from host to device
+  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+  // Lauching kernel from host
+  hipLaunchKernel(matrixTranspose,
+                  dim3(1),
+                  dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
+                  0, 0,
+                  gpuTransposeMatrix , gpuMatrix, WIDTH);
+
+  // Memory transfer from device to host
+  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+  // CPU MatrixTranspose computation
+  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+  // verify the results
+  errors = 0;
+  double eps = 1.0E-6;
+  for (i = 0; i < NUM; i++) {
+    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+      errors++;
+    }
+  }
+  if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+  } else {
+    printf ("PASSED!\n");
+  }
+
+  //free the resources on device side
+  hipFree(gpuMatrix);
+  hipFree(gpuTransposeMatrix);
+
+  //free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);
+
+  return errors;
+}
diff --git a/src/hip-samples/src/stream.cpp b/src/hip-samples/src/stream.cpp
new file mode 100644
index 0000000..2dc7544
--- /dev/null
+++ b/src/hip-samples/src/stream.cpp
@@ -0,0 +1,147 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANUMTY OF ANY KIND, EXPRESS OR
+IMPLIED, INUMCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNUMESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANUMY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INUM AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INUM CONUMECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <iostream>
+#include <hip/hip_runtime.h>
+
+#define WIDTH     32
+
+#define NUM       (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X  4
+#define THREADS_PER_BLOCK_Y  4
+#define THREADS_PER_BLOCK_Z  1
+
+using namespace std;
+
+__global__ void matrixTranspose_static_shared(hipLaunchParm lp,
+                                float *out,
+                                float *in,
+                                const int width)
+{
+    __shared__ float sharedMem[WIDTH*WIDTH];
+
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+
+    sharedMem[y * width + x] = in[x * width + y];
+
+    __syncthreads();
+
+    out[y * width + x] = sharedMem[y * width + x];
+}
+
+__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp,
+                                float *out,
+                                float *in,
+                                const int width)
+{
+    // declare dynamic shared memory
+    HIP_DYNAMIC_SHARED(float, sharedMem)
+
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+
+    sharedMem[y * width + x] = in[x * width + y];
+
+    __syncthreads();
+
+    out[y * width + x] = sharedMem[y * width + x];
+}
+
+void MultipleStream (float **data, float *randArray, float **gpuTransposeMatrix, float **TransposeMatrix, int width)
+{
+    const int num_streams = 2;
+    hipStream_t streams[num_streams];
+
+    for(int i=0;i<num_streams;i++)
+        hipStreamCreate(&streams[i]);
+
+    for(int i=0;i<num_streams;i++)
+    {
+        hipMalloc((void**)&data[i], NUM * sizeof(float));
+        hipMemcpyAsync(data[i], randArray, NUM * sizeof(float), hipMemcpyHostToDevice,streams[i]);
+    }
+
+    hipLaunchKernel(matrixTranspose_static_shared,
+                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                    0, streams[0],
+                    gpuTransposeMatrix[0], data[0], width);
+
+    hipLaunchKernel(matrixTranspose_dynamic_shared,
+                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                    sizeof(float)*WIDTH*WIDTH, streams[1],
+                    gpuTransposeMatrix[1], data[1], width);
+
+    for(int i=0;i<num_streams;i++)
+        hipMemcpyAsync(TransposeMatrix[i], gpuTransposeMatrix[i], NUM*sizeof(float), hipMemcpyDeviceToHost, streams[i]);
+}
+
+int main(){
+
+    hipSetDevice(0);
+
+    float *data[2], *TransposeMatrix[2], *gpuTransposeMatrix[2], *randArray;
+
+    int width = WIDTH;
+
+    randArray = (float*)malloc(NUM * sizeof(float));
+
+    TransposeMatrix[0] = (float*)malloc(NUM * sizeof(float));
+    TransposeMatrix[1] = (float*)malloc(NUM * sizeof(float));
+
+    hipMalloc((void**)&gpuTransposeMatrix[0], NUM * sizeof(float));
+    hipMalloc((void**)&gpuTransposeMatrix[1], NUM * sizeof(float));
+
+    for(int i = 0; i < NUM; i++)
+    {
+        randArray[i] = (float)i*1.0f;
+    }
+
+    MultipleStream(data, randArray, gpuTransposeMatrix, TransposeMatrix, width);
+
+    hipDeviceSynchronize();
+
+    // verify the results
+    int errors = 0;
+    double eps = 1.0E-6;
+    for (int i = 0; i < NUM; i++) {
+        if (std::abs(TransposeMatrix[0][i] - TransposeMatrix[1][i]) > eps ) {
+        printf("%d stream0: %f stream1  %f\n",i,TransposeMatrix[0][i],TransposeMatrix[1][i]);
+        errors++;
+        }
+    }
+    if (errors!=0) {
+        printf("FAILED: %d errors\n",errors);
+    } else {
+        printf ("stream PASSED!\n");
+    }
+
+    free(randArray);
+    for(int i=0;i<2;i++){
+       hipFree(data[i]);
+       hipFree(gpuTransposeMatrix[i]);
+       free(TransposeMatrix[i]);
+    }
+
+    hipDeviceReset();
+    return 0;
+}
diff --git a/src/hip-samples/src/unroll.cpp b/src/hip-samples/src/unroll.cpp
new file mode 100644
index 0000000..22f1c75
--- /dev/null
+++ b/src/hip-samples/src/unroll.cpp
@@ -0,0 +1,141 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+
+#define WIDTH     4
+
+#define NUM       (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X  4
+#define THREADS_PER_BLOCK_Y  4
+#define THREADS_PER_BLOCK_Z  1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+                                float *out,
+                                float *in,
+                                const int width)
+{
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    float val = in[x];
+
+#pragma unroll
+    for(int i=0;i<width;i++)
+    {
+        for(int j=0;j<width;j++)
+            out[i*width + j] = __shfl(val,j*width + i);
+    }
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+    float * output,
+    float * input,
+    const unsigned int width)
+{
+    for(unsigned int j=0; j < width; j++)
+    {
+        for(unsigned int i=0; i < width; i++)
+        {
+            output[i*width + j] = input[j*width + i];
+        }
+    }
+}
+
+int main() {
+
+  float* Matrix;
+  float* TransposeMatrix;
+  float* cpuTransposeMatrix;
+
+  float* gpuMatrix;
+  float* gpuTransposeMatrix;
+
+  hipDeviceProp_t devProp;
+  hipGetDeviceProperties(&devProp, 0);
+
+  std::cout << "Device name " << devProp.name << std::endl;
+
+  int i;
+  int errors;
+
+  Matrix = (float*)malloc(NUM * sizeof(float));
+  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    Matrix[i] = (float)i*10.0f;
+  }
+
+  // allocate the memory on the device side
+  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+  // Memory transfer from host to device
+  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+  // Lauching kernel from host
+  hipLaunchKernel(matrixTranspose,
+                  dim3(1),
+                  dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
+                  0, 0,
+                  gpuTransposeMatrix , gpuMatrix, WIDTH);
+
+  // Memory transfer from device to host
+  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+  // CPU MatrixTranspose computation
+  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+  // verify the results
+  errors = 0;
+  double eps = 1.0E-6;
+  for (i = 0; i < NUM; i++) {
+    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+      errors++;
+    }
+  }
+  if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+  } else {
+    printf ("PASSED!\n");
+  }
+
+  //free the resources on device side
+  hipFree(gpuMatrix);
+  hipFree(gpuTransposeMatrix);
+
+  //free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);
+
+  return errors;
+}