diff --git a/src/gpu/hip-samples/Makefile b/src/gpu/hip-samples/Makefile
index da12ce5..ee863b8 100644
--- a/src/gpu/hip-samples/Makefile
+++ b/src/gpu/hip-samples/Makefile
@@ -12,7 +12,7 @@
 all: $(EXECUTABLES)
 
 $(EXECUTABLES): %: $(SRC_DIR)/%.cpp | $(BIN_DIR)
-	$(HIPCC) $< -o $(BIN_DIR)/$@
+	$(HIPCC) --amdgpu-target=gfx801,gfx803 $< -o $(BIN_DIR)/$@
 
 $(BIN_DIR):
 	mkdir -p $@
diff --git a/src/gpu/hip-samples/README.md b/src/gpu/hip-samples/README.md
index 3b17193..7095ae7 100644
--- a/src/gpu/hip-samples/README.md
+++ b/src/gpu/hip-samples/README.md
@@ -33,8 +33,8 @@
 
 Individual programs can be made by specifying the name of the program
 
-By default, this code builds for gfx801, a GCN3-based APU. This can be
-overridden by specifying `-e HCC_AMDGPU_TARGET=<target>` in the build command.
+By default, the apps are built for all supported GPU types (gfx801, gfx803).
+This can be changed by editing the --amdgpu-target argument in the Makefile.
 
 ## Pre-built binary
 
diff --git a/src/gpu/hip-samples/src/2dshfl.cpp b/src/gpu/hip-samples/src/2dshfl.cpp
index 1b22a0c..4e58cfb 100644
--- a/src/gpu/hip-samples/src/2dshfl.cpp
+++ b/src/gpu/hip-samples/src/2dshfl.cpp
@@ -36,8 +36,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -66,10 +65,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -80,8 +77,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -90,21 +86,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(1),
-                  dim3(THREADS_PER_BLOCK_X , THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(1),
+                     dim3(THREADS_PER_BLOCK_X , THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -113,8 +103,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -125,12 +115,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/gpu/hip-samples/src/MatrixTranspose.cpp b/src/gpu/hip-samples/src/MatrixTranspose.cpp
index 264fcbe..68741e2 100644
--- a/src/gpu/hip-samples/src/MatrixTranspose.cpp
+++ b/src/gpu/hip-samples/src/MatrixTranspose.cpp
@@ -37,8 +37,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -66,10 +65,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -80,8 +77,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -90,21 +86,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -113,7 +103,7 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
       errors++;
     }
   }
@@ -124,12 +114,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/gpu/hip-samples/src/dynamic_shared.cpp b/src/gpu/hip-samples/src/dynamic_shared.cpp
index 22d7eb9..9627d3b 100644
--- a/src/gpu/hip-samples/src/dynamic_shared.cpp
+++ b/src/gpu/hip-samples/src/dynamic_shared.cpp
@@ -34,8 +34,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -70,10 +69,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -84,8 +81,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -94,21 +90,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  sizeof(float)*WIDTH*WIDTH, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                     sizeof(float)*WIDTH*WIDTH, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -117,8 +107,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -129,12 +119,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/gpu/hip-samples/src/inline_asm.cpp b/src/gpu/hip-samples/src/inline_asm.cpp
index f2345e5..5a8b628 100644
--- a/src/gpu/hip-samples/src/inline_asm.cpp
+++ b/src/gpu/hip-samples/src/inline_asm.cpp
@@ -35,8 +35,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -65,10 +64,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -79,8 +76,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -89,21 +85,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -112,8 +102,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("gpu%f cpu %f \n",TransposeMatrix[i],cpuTransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("gpu%f cpu %f \n",gpuTransposeMatrix[i],cpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -124,12 +114,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/gpu/hip-samples/src/sharedMemory.cpp b/src/gpu/hip-samples/src/sharedMemory.cpp
index 9b51aba..d88d18e 100644
--- a/src/gpu/hip-samples/src/sharedMemory.cpp
+++ b/src/gpu/hip-samples/src/sharedMemory.cpp
@@ -36,8 +36,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -71,10 +70,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -85,8 +82,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -95,21 +91,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -118,8 +108,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -130,12 +120,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/gpu/hip-samples/src/shfl.cpp b/src/gpu/hip-samples/src/shfl.cpp
index e0f4c21..d523ffb 100644
--- a/src/gpu/hip-samples/src/shfl.cpp
+++ b/src/gpu/hip-samples/src/shfl.cpp
@@ -36,8 +36,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -70,10 +69,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -84,8 +81,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -94,21 +90,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(1),
-                  dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(1),
+                     dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -117,8 +107,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -129,12 +119,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
diff --git a/src/gpu/hip-samples/src/stream.cpp b/src/gpu/hip-samples/src/stream.cpp
index 2dc7544..c14759a 100644
--- a/src/gpu/hip-samples/src/stream.cpp
+++ b/src/gpu/hip-samples/src/stream.cpp
@@ -30,10 +30,9 @@
 
 using namespace std;
 
-__global__ void matrixTranspose_static_shared(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
+__global__ void matrixTranspose_static_shared(float *out,
+                                              float *in,
+                                              const int width)
 {
     __shared__ float sharedMem[WIDTH*WIDTH];
 
@@ -47,10 +46,9 @@
     out[y * width + x] = sharedMem[y * width + x];
 }
 
-__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp,
-                                float *out,
-                                float *in,
-                                const int width)
+__global__ void matrixTranspose_dynamic_shared(float *out,
+                                               float *in,
+                                               const int width)
 {
     // declare dynamic shared memory
     HIP_DYNAMIC_SHARED(float, sharedMem)
@@ -65,7 +63,7 @@
     out[y * width + x] = sharedMem[y * width + x];
 }
 
-void MultipleStream (float **data, float *randArray, float **gpuTransposeMatrix, float **TransposeMatrix, int width)
+void MultipleStream (float **data, float *randArray, float **gpuTransposeMatrix, int width)
 {
     const int num_streams = 2;
     hipStream_t streams[num_streams];
@@ -75,48 +73,42 @@
 
     for(int i=0;i<num_streams;i++)
     {
-        hipMalloc((void**)&data[i], NUM * sizeof(float));
+        hipHostMalloc(&data[i], NUM * sizeof(float));
         hipMemcpyAsync(data[i], randArray, NUM * sizeof(float), hipMemcpyHostToDevice,streams[i]);
     }
 
-    hipLaunchKernel(matrixTranspose_static_shared,
-                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                    0, streams[0],
-                    gpuTransposeMatrix[0], data[0], width);
+    hipLaunchKernelGGL(matrixTranspose_static_shared,
+                       dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                       dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                       0, streams[0],
+                       gpuTransposeMatrix[0], data[0], width);
 
-    hipLaunchKernel(matrixTranspose_dynamic_shared,
-                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
-                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
-                    sizeof(float)*WIDTH*WIDTH, streams[1],
-                    gpuTransposeMatrix[1], data[1], width);
-
-    for(int i=0;i<num_streams;i++)
-        hipMemcpyAsync(TransposeMatrix[i], gpuTransposeMatrix[i], NUM*sizeof(float), hipMemcpyDeviceToHost, streams[i]);
+    hipLaunchKernelGGL(matrixTranspose_dynamic_shared,
+                       dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                       dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                       sizeof(float)*WIDTH*WIDTH, streams[1],
+                       gpuTransposeMatrix[1], data[1], width);
 }
 
 int main(){
 
     hipSetDevice(0);
 
-    float *data[2], *TransposeMatrix[2], *gpuTransposeMatrix[2], *randArray;
+    float *data[2], *gpuTransposeMatrix[2], *randArray;
 
     int width = WIDTH;
 
     randArray = (float*)malloc(NUM * sizeof(float));
 
-    TransposeMatrix[0] = (float*)malloc(NUM * sizeof(float));
-    TransposeMatrix[1] = (float*)malloc(NUM * sizeof(float));
-
-    hipMalloc((void**)&gpuTransposeMatrix[0], NUM * sizeof(float));
-    hipMalloc((void**)&gpuTransposeMatrix[1], NUM * sizeof(float));
+    hipHostMalloc(&gpuTransposeMatrix[0], NUM * sizeof(float));
+    hipHostMalloc(&gpuTransposeMatrix[1], NUM * sizeof(float));
 
     for(int i = 0; i < NUM; i++)
     {
         randArray[i] = (float)i*1.0f;
     }
 
-    MultipleStream(data, randArray, gpuTransposeMatrix, TransposeMatrix, width);
+    MultipleStream(data, randArray, gpuTransposeMatrix, width);
 
     hipDeviceSynchronize();
 
@@ -124,9 +116,9 @@
     int errors = 0;
     double eps = 1.0E-6;
     for (int i = 0; i < NUM; i++) {
-        if (std::abs(TransposeMatrix[0][i] - TransposeMatrix[1][i]) > eps ) {
-        printf("%d stream0: %f stream1  %f\n",i,TransposeMatrix[0][i],TransposeMatrix[1][i]);
-        errors++;
+        if (std::abs(gpuTransposeMatrix[0][i] - gpuTransposeMatrix[1][i]) > eps ) {
+            printf("%d stream0: %f stream1  %f\n",i,gpuTransposeMatrix[0][i],gpuTransposeMatrix[1][i]);
+            errors++;
         }
     }
     if (errors!=0) {
@@ -139,7 +131,6 @@
     for(int i=0;i<2;i++){
        hipFree(data[i]);
        hipFree(gpuTransposeMatrix[i]);
-       free(TransposeMatrix[i]);
     }
 
     hipDeviceReset();
diff --git a/src/gpu/hip-samples/src/unroll.cpp b/src/gpu/hip-samples/src/unroll.cpp
index 22f1c75..6935c03 100644
--- a/src/gpu/hip-samples/src/unroll.cpp
+++ b/src/gpu/hip-samples/src/unroll.cpp
@@ -36,8 +36,7 @@
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp,
-                                float *out,
+__global__ void matrixTranspose(float *out,
                                 float *in,
                                 const int width)
 {
@@ -70,10 +69,8 @@
 int main() {
 
   float* Matrix;
-  float* TransposeMatrix;
   float* cpuTransposeMatrix;
 
-  float* gpuMatrix;
   float* gpuTransposeMatrix;
 
   hipDeviceProp_t devProp;
@@ -84,8 +81,7 @@
   int i;
   int errors;
 
-  Matrix = (float*)malloc(NUM * sizeof(float));
-  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  hipHostMalloc(&Matrix, NUM * sizeof(float));
   cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
 
   // initialize the input data
@@ -94,21 +90,15 @@
   }
 
   // allocate the memory on the device side
-  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
-  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
-
-  // Memory transfer from host to device
-  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+  hipHostMalloc(&gpuTransposeMatrix, NUM * sizeof(float));
 
   // Lauching kernel from host
-  hipLaunchKernel(matrixTranspose,
-                  dim3(1),
-                  dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
-                  0, 0,
-                  gpuTransposeMatrix , gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+  hipLaunchKernelGGL(matrixTranspose,
+                     dim3(1),
+                     dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
+                     0, 0,
+                     gpuTransposeMatrix , Matrix, WIDTH);
+  hipDeviceSynchronize();
 
   // CPU MatrixTranspose computation
   matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
@@ -117,8 +107,8 @@
   errors = 0;
   double eps = 1.0E-6;
   for (i = 0; i < NUM; i++) {
-    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
-    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+    if (std::abs(gpuTransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],gpuTransposeMatrix[i]);
       errors++;
     }
   }
@@ -129,12 +119,10 @@
   }
 
   //free the resources on device side
-  hipFree(gpuMatrix);
   hipFree(gpuTransposeMatrix);
 
   //free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
+  hipFree(Matrix);
   free(cpuTransposeMatrix);
 
   return errors;
