resources: Update hip-samples for ROCm 4 and dGPU support

This patch implements fixes needed for ROCm 4 and adds workarounds for
memory management bugs that currently exist in gem5.

The patch replaces calls to hipLaunchKernel with hipLaunchKernelGGL and
removes the corresponding hipLaunchParm argument from the kernels.
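
For example, MatrixTranspose.cpp now launches its kernel as:

    hipLaunchKernelGGL(matrixTranspose,
                       dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
                       dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                       0, 0,
                       gpuTransposeMatrix, Matrix, WIDTH);
    hipDeviceSynchronize();

with an explicit hipDeviceSynchronize() added, since the removed
hipMemcpy calls no longer provide implicit synchronization.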

The patch also uses hipHostMalloc instead of hipMalloc to allocate memory
that both the host and the device can access, removing the need for
hipMemcpy, which currently does not work for hipMalloc'd memory in gem5.
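
For example, in MatrixTranspose.cpp the separate host and device buffers
and the copy between them,

    Matrix = (float*)malloc(NUM * sizeof(float));
    hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
    hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);

are replaced by a single allocation that both the host code and the
kernel access directly:

    hipHostMalloc(&Matrix, NUM * sizeof(float));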

Finally, this patch updates the Makefile to build for both gfx801 (the
APU target) and gfx803 (the dGPU target) in the same binary.
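
The resulting compile rule is:

    $(HIPCC) --amdgpu-target=gfx801,gfx803 $< -o $(BIN_DIR)/$@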

Change-Id: I89f091575cd8d1dcd5ac12278211eef4e01cbf21
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5-resources/+/48039
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Alex Dutu <alexandru.dutu@amd.com>
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Tested-by: Bobby R. Bruce <bbruce@ucdavis.edu>
diff --git a/src/gpu/hip-samples/Makefile b/src/gpu/hip-samples/Makefile
index da12ce5..ee863b8 100644
--- a/src/gpu/hip-samples/Makefile
+++ b/src/gpu/hip-samples/Makefile
@@ -12,7 +12,7 @@
 all: $(EXECUTABLES)
 
 $(EXECUTABLES): %: $(SRC_DIR)/%.cpp | $(BIN_DIR)
-	$(HIPCC) $< -o $(BIN_DIR)/$@
+	$(HIPCC) --amdgpu-target=gfx801,gfx803 $< -o $(BIN_DIR)/$@
 
 $(BIN_DIR):
 	mkdir -p $@
diff --git a/src/gpu/hip-samples/README.md b/src/gpu/hip-samples/README.md
index 3b17193..7095ae7 100644
--- a/src/gpu/hip-samples/README.md
+++ b/src/gpu/hip-samples/README.md
@@ -33,8 +33,8 @@
 
 Individual programs can be made by specifying the name of the program
 
-By default, this code builds for gfx801, a GCN3-based APU. This can be
-overridden by specifying `-e HCC_AMDGPU_TARGET=<target>` in the build command.
+By default, the apps are built for all supported GPU types (gfx801, gfx803).
+This can be changed by editing the --amdgpu-target argument in the Makefile.
 
 ## Pre-built binary
 
diff --git a/src/gpu/hip-samples/src/2dshfl.cpp b/src/gpu/hip-samples/src/2dshfl.cpp
index 1b22a0c..4e58cfb 100644
--- a/src/gpu/hip-samples/src/2dshfl.cpp
+++ b/src/gpu/hip-samples/src/2dshfl.cpp
@@ -36,8 +36,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -66,10 +65,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -80,8 +77,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -90,21 +86,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(1),
-                  dim3(THREADS_PER_BLOCK_X , THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(1),
+                     dim3(THREADS_PER_BLOCK_X , THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -113,8 +103,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -125,12 +115,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/gpu/hip-samples/src/MatrixTranspose.cpp b/src/gpu/hip-samples/src/MatrixTranspose.cpp
index 264fcbe..68741e2 100644
--- a/src/gpu/hip-samples/src/MatrixTranspose.cpp
+++ b/src/gpu/hip-samples/src/MatrixTranspose.cpp
@@ -37,8 +37,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -66,10 +65,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -80,8 +77,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -90,21 +86,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -113,7 +103,7 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
       errors++;
     }
   }
@@ -124,12 +114,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/gpu/hip-samples/src/dynamic_shared.cpp b/src/gpu/hip-samples/src/dynamic_shared.cpp
index 22d7eb9..9627d3b 100644
--- a/src/gpu/hip-samples/src/dynamic_shared.cpp
+++ b/src/gpu/hip-samples/src/dynamic_shared.cpp
@@ -34,8 +34,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -70,10 +69,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -84,8 +81,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -94,21 +90,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  sizeof(float)*WIDTH*WIDTH, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                     sizeof(float)*WIDTH*WIDTH, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -117,8 +107,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -129,12 +119,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/gpu/hip-samples/src/inline_asm.cpp b/src/gpu/hip-samples/src/inline_asm.cpp
index f2345e5..5a8b628 100644
--- a/src/gpu/hip-samples/src/inline_asm.cpp
+++ b/src/gpu/hip-samples/src/inline_asm.cpp
@@ -35,8 +35,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -65,10 +64,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -79,8 +76,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -89,21 +85,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -112,8 +102,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("gpu%f cpu %f \n",TransposeMatrix[i],cpuTransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("gpu%f cpu %f \n",gpuTransposeMatrix[i],cpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -124,12 +114,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/gpu/hip-samples/src/sharedMemory.cpp b/src/gpu/hip-samples/src/sharedMemory.cpp
index 9b51aba..d88d18e 100644
--- a/src/gpu/hip-samples/src/sharedMemory.cpp
+++ b/src/gpu/hip-samples/src/sharedMemory.cpp
@@ -36,8 +36,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -71,10 +70,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -85,8 +82,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -95,21 +91,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -118,8 +108,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -130,12 +120,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/gpu/hip-samples/src/shfl.cpp b/src/gpu/hip-samples/src/shfl.cpp
index e0f4c21..d523ffb 100644
--- a/src/gpu/hip-samples/src/shfl.cpp
+++ b/src/gpu/hip-samples/src/shfl.cpp
@@ -36,8 +36,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -70,10 +69,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -84,8 +81,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -94,21 +90,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(1),
-                  dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(1),
+                     dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -117,8 +107,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -129,12 +119,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/gpu/hip-samples/src/stream.cpp b/src/gpu/hip-samples/src/stream.cpp
index 2dc7544..c14759a 100644
--- a/src/gpu/hip-samples/src/stream.cpp
+++ b/src/gpu/hip-samples/src/stream.cpp
@@ -30,10 +30,9 @@
 
 using namespace std;
 
-__global__ void matrixTranspose_static_shared(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
+__global__ void matrixTranspose_static_shared(float *out,
+                                              float *in,
+                                              const int width)
 {
     __shared__ float sharedMem[WIDTH*WIDTH];
 
@@ -47,10 +46,9 @@
     out[y * width + x] = sharedMem[y * width + x];
 }
 
-__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
+__global__ void matrixTranspose_dynamic_shared(float *out,
+                                               float *in,
+                                               const int width)
 {
     // declare dynamic shared memory
     HIP_DYNAMIC_SHARED(float, sharedMem)
@@ -65,7 +63,7 @@
     out[y * width + x] = sharedMem[y * width + x];
 }
 
-void MultipleStream (float **data, float *randArray, float **gpuTransposeMatrix, float **TransposeMatrix, int width)
+void MultipleStream (float **data, float *randArray, float **gpuTransposeMatrix, int width)
 {
     const int num_streams = 2;
     hipStream_t streams[num_streams];
@@ -75,48 +73,42 @@
 
     for(int i=0;i<num_streams;i++)
     {
-        hipMalloc((void**)&data[i], NUM * sizeof(float));
+        hipHostMalloc(&data[i], NUM * sizeof(float));
         hipMemcpyAsync(data[i], randArray, NUM * sizeof(float), hipMemcpyHostToDevice,streams[i]);
     }
 
-    hipLaunchKernel(matrixTranspose_static_shared,
-                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                    0, streams[0],
-                    gpuTransposeMatrix[0], data[0], width);
+    hipLaunchKernelGGL(matrixTranspose_static_shared,
+                       dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                       dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                       0, streams[0],
+                       gpuTransposeMatrix[0], data[0], width);
 
-    hipLaunchKernel(matrixTranspose_dynamic_shared,
-                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                    sizeof(float)*WIDTH*WIDTH, streams[1],
-                    gpuTransposeMatrix[1], data[1], width);
-
-    for(int i=0;i<num_streams;i++)
-        hipMemcpyAsync(TransposeMatrix[i], gpuTransposeMatrix[i], NUM*sizeof(float), hipMemcpyDeviceToHost, streams[i]);
+    hipLaunchKernelGGL(matrixTranspose_dynamic_shared,
+                       dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                       dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                       sizeof(float)*WIDTH*WIDTH, streams[1],
+                       gpuTransposeMatrix[1], data[1], width);
 }
 
 int main(){
 
     hipSetDevice(0);
 
-    float *data[2], *TransposeMatrix[2], *gpuTransposeMatrix[2], *randArray;
+    float *data[2], *gpuTransposeMatrix[2], *randArray;
 
     int width = WIDTH;
 
     randArray = (float*)malloc(NUM * sizeof(float));
 
-    TransposeMatrix[0] = (float*)malloc(NUM * sizeof(float));
-    TransposeMatrix[1] = (float*)malloc(NUM * sizeof(float));
-
-    hipMalloc((void**)&gpuTransposeMatrix[0], NUM * sizeof(float));
-    hipMalloc((void**)&gpuTransposeMatrix[1], NUM * sizeof(float));
+    hipHostMalloc(&gpuTransposeMatrix[0], NUM * sizeof(float));
+    hipHostMalloc(&gpuTransposeMatrix[1], NUM * sizeof(float));
 
     for(int i = 0; i < NUM; i++)
     {
         randArray[i] = (float)i*1.0f;
     }
 
-    MultipleStream(data, randArray, gpuTransposeMatrix, TransposeMatrix, width);
+    MultipleStream(data, randArray, gpuTransposeMatrix, width);
 
     hipDeviceSynchronize();
 
@@ -124,9 +116,9 @@
     int errors = 0;
     double eps = 1.0E-6;
     for (int i = 0; i < NUM; i++) {
-        if (std::abs(TransposeMatrix[0][i] - TransposeMatrix[1][i]) > eps ) {
-        printf("%d stream0: %f stream1  %f\n",i,TransposeMatrix[0][i],TransposeMatrix[1][i]);
-        errors++;
+        if (std::abs(gpuTransposeMatrix[0][i] - gpuTransposeMatrix[1][i]) > eps ) {
+            printf("%d stream0: %f stream1  %f\n",i,gpuTransposeMatrix[0][i],gpuTransposeMatrix[1][i]);
+            errors++;
         }
     }
     if (errors!=0) {
@@ -139,7 +131,6 @@
     for(int i=0;i<2;i++){
        hipFree(data[i]);
        hipFree(gpuTransposeMatrix[i]);
-       free(TransposeMatrix[i]);
     }
 
     hipDeviceReset();
diff --git a/src/gpu/hip-samples/src/unroll.cpp b/src/gpu/hip-samples/src/unroll.cpp
index 22f1c75..6935c03 100644
--- a/src/gpu/hip-samples/src/unroll.cpp
+++ b/src/gpu/hip-samples/src/unroll.cpp
@@ -36,8 +36,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -70,10 +69,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -84,8 +81,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -94,21 +90,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(1),
-                  dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(1),
+                     dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -117,8 +107,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -129,12 +119,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;