resources: Adding Agent Dispatch Pkt Example

This commit adds the hsa-agent-pkt example program for the GCN3_X86
build of gem5. It serves as an example for sending commands to the
gpu command processor. Details added in readme.

Change-Id: I54c79abf6f8f7d9ebfcd8eb6c21147a5d6f30b53
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5-resources/+/37678
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: Jason Lowe-Power <power.jg@gmail.com>
diff --git a/README.md b/README.md
index 909dcda..71fa9ac 100755
--- a/README.md
+++ b/README.md
@@ -372,6 +372,31 @@
 
 <http://dist.gem5.org/dist/v20-1/test-progs/square/square.o>
 
+# Resource: HSA Agent Packet Example
+
+Based off of the Square resource in this repository, this resource serves as
+an example for using an HSA Agent Packet to send commands to the GPU command
+processor included in the GCN_X86 build of gem5.
+
+The example command extracts the kernel's completion signal from the domain
+of the command processor and the GPU's dispatcher. Initially this was a 
+workaround for the hipDeviceSynchronize bug, now fixed. The method of
+waiting on a signal can be applied to other agent packet commands though.
+
+Custom commands can be added to the command processor in gem5 to control
+the GPU in novel ways.
+
+## Compilation
+
+To compile:
+
+```
+cd src/hsa-agent-pkt
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make gfx8-apu
+```
+
+The compiled binary can be found in `src/hsa-agent-pkt/bin`
+
 # Resource: SPEC 2006
 
 The [Standard Performance Evaluation Corporation](
@@ -500,6 +525,10 @@
 `src/pthreads`.
 * **square**: Consult individual copyright notices of source files in
 `src/square`.
+* **hsa-agent-pkt**: `src/hsa-agent-pkt/square.cpp` is licensed under the
+same licence as 'src/square/square.cpp'.
+`src/hsa-agent-pkt/HSA_Interface.[h|.cpp]` are licensed under a BSD Lisense
+(A University of Maryland copyright).
 * **spec 2006**: SPEC CPU 2006 requires purchase of benchmark suite from
 [SPEC](https://www.spec.org/cpu2006/) thus, it cannot be freely distributed.
 Consult individual copyright notices of source files in `src/spec-2006`.
diff --git a/src/hsa-agent-pkt/HSA_Interface.cpp b/src/hsa-agent-pkt/HSA_Interface.cpp
new file mode 100644
index 0000000..e89cca4
--- /dev/null
+++ b/src/hsa-agent-pkt/HSA_Interface.cpp
@@ -0,0 +1,176 @@
+/*
+Copyright (c) 2020 University of Maryland
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met: redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer;
+redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution;
+neither the name of the copyright holders nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "HSA_Interface.h"
+
+void print_agent_dispatch_packet(hsa_agent_dispatch_packet_t* pkt)
+{
+
+    printf("Packet \t\t%p\n",
+        (void *)pkt);
+    printf("Packet 16t\t\t%p\n",
+        (uint16_t *)pkt);
+    printf("Packet 32t\t\t%p\n",
+        (uint32_t *)pkt);
+    printf("Packet void**\t\t%p\n",
+        (void **)pkt);
+    printf("%p header: \t\t%hu\n",
+        (void *)(&(pkt->header )),pkt->header );
+    printf("%p type: \t\t%hu\n",
+        (void *)(&(pkt->type )),pkt->type );
+    printf("%p reserved0: \t\t%u\n",
+        (void *)(&(pkt->reserved0 )),pkt->reserved0 );
+    printf("%p return_address: \t\t%p\n",
+        (void *)(&(pkt->return_address )),pkt->return_address );
+    printf("%p arg[0]: \t\t%lu\n",
+        (void *)(&(pkt->arg[0] )),pkt->arg[0] );
+    printf("%p arg[1]: \t\t%lu\n",
+        (void *)(&(pkt->arg[1] )),pkt->arg[1] );
+    printf("%p arg[2]: \t\t%lu\n",
+        (void *)(&(pkt->arg[2] )),pkt->arg[0] );
+    printf("%p arg[3]: \t\t%lu\n",
+        (void *)(&(pkt->arg[3] )),pkt->arg[1] );
+    printf("%p reserved2: \t\t%lu\n",
+        (void *)(&(pkt->reserved2 )),pkt->reserved2 );
+    printf("%p completion_signal: \t\t%lu\n",
+        (void *)(&(pkt->completion_signal )),pkt->completion_signal.handle );
+
+
+    fflush(stdout);
+}
+
+void agent_disp_packet_store_release(uint16_t* packet, uint16_t header) {
+    __atomic_store_n(packet, header, __ATOMIC_RELEASE);
+}
+
+uint16_t header(hsa_packet_type_t type) {
+    uint16_t header = type << HSA_PACKET_HEADER_TYPE;
+    header |=
+        HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE;
+    header |=
+        HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE;
+    return header;
+}
+
+hsa_status_t get_kernel_agent(hsa_agent_t agent, void* data) {
+    uint32_t features = 0;
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_FEATURE, &features);
+    if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH) {
+        // Store kernel agent in the application-provided buffer and return
+        hsa_agent_t* ret = (hsa_agent_t*) data;
+        *ret = agent;
+        return HSA_STATUS_INFO_BREAK;
+    }
+    // Keep iterating
+    return HSA_STATUS_SUCCESS;
+}
+
+void signal_wait(hsa_signal_t signal)
+{
+    while (hsa_signal_wait_relaxed(signal, HSA_SIGNAL_CONDITION_EQ, 0,
+        UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0);
+    // while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_EQ, 0,
+    //  UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0);
+}
+
+void initialize_agent_dispatch_packet(
+    hsa_agent_dispatch_packet_t* packet,
+    size_t header_size
+    )
+{
+    // Reserved fields, private and group memory,
+    // and completion signal are all set to 0.
+    memset(((uint8_t*) packet) + header_size, 0,
+        sizeof(hsa_agent_dispatch_packet_t) - header_size);
+}
+
+HSA_Interface::HSA_Interface(){
+
+    printf("INFO:: Setting up HSA Interface:\n");
+
+    CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/));
+    printf ("info: running on device %s\n", props.name); fflush(stdout);
+    #ifdef __HIP_PLATFORM_HCC__
+      printf ("info: architecture on AMD GPU device is: %d\n",
+        props.gcnArch); fflush(stdout);
+    #endif
+
+    printf ("INFO:: hsa_iterate_agents\n"); fflush(stdout);
+    hsa_agent_t kernel_agent;
+    hsa_iterate_agents(get_kernel_agent, &kernel_agent);
+    printf ("INFO:: hsa_queue_create\n"); fflush(stdout);
+    hsa_queue_create(kernel_agent, 4, HSA_QUEUE_TYPE_SINGLE,
+        NULL, NULL, 0, 0, &queue);
+    printf ("INFO:: hsa_queue_add_write_index_relaxed\n"); fflush(stdout);
+    hsa_queue_add_write_index_relaxed(queue, 1);
+
+    packet_id = 0;
+
+    printf("INFO:: Creating Stream\n");fflush(stdout);
+    stream = 0;
+    hipStreamCreate(&stream);
+}
+
+void HSA_Interface::steal_kernel_signal(uint32_t kid)
+{
+    hsa_agent_dispatch_packet_t * packet =
+        (hsa_agent_dispatch_packet_t*) queue->base_address + packet_id;
+    // Populate fields in kernel dispatch packet, except for the header,
+    // the setup, and the completion signal fields
+    initialize_agent_dispatch_packet(packet,sizeof(uint16_t));
+
+    uint64_t kernel_completion_signal_addr;
+    packet->type = AGENT_DISPATCH_PACKET_STEAL_KERNEL_SIGNAL;
+    packet->return_address = &kernel_completion_signal_addr;
+    packet->arg[0] = kid; //This field is for the kernel id.
+
+    //Create thief packet wait signal
+    hsa_signal_create(1, 0, NULL, &packet->completion_signal);
+
+    agent_disp_packet_store_release((uint16_t*) packet,
+        header(HSA_PACKET_TYPE_AGENT_DISPATCH));
+
+    print_agent_dispatch_packet(packet);
+
+    //Send thief packet
+    hsa_signal_store_screlease(queue->doorbell_signal, packet_id);
+
+    signal_wait(packet->completion_signal);
+    printf("INFO:: Done Waiting on Thief Signal\n");
+
+    hsa_signal_t * new_signal = new hsa_signal_t;
+    new_signal->handle = kernel_completion_signal_addr;
+    m_kernel_signals.push_back(new_signal);
+
+    packet_id++;
+}
+
+void HSA_Interface::wait_kernel(uint32_t kid)
+{
+    signal_wait(*(m_kernel_signals[kid]));
+}
\ No newline at end of file
diff --git a/src/hsa-agent-pkt/HSA_Interface.h b/src/hsa-agent-pkt/HSA_Interface.h
new file mode 100644
index 0000000..b0aab72
--- /dev/null
+++ b/src/hsa-agent-pkt/HSA_Interface.h
@@ -0,0 +1,83 @@
+/*
+Copyright (c) 2020 University of Maryland
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met: redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer;
+redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution;
+neither the name of the copyright holders nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "hip/hip_runtime.h"
+
+#include <include/hsa/hsa.h>
+
+#include <vector>
+
+#define AGENT_DISPATCH_PACKET_NOP 0
+#define AGENT_DISPATCH_PACKET_STEAL_KERNEL_SIGNAL 1
+
+#define CHECK(cmd) \
+{\
+    hipError_t error  = cmd;\
+    if (error != hipSuccess) {\
+      fprintf(stderr, "error: '%s'(%d) at %s:%d\n",\
+      hipGetErrorString(error), error,__FILE__, __LINE__);\
+    exit(EXIT_FAILURE);\
+    }\
+}
+
+void print_agent_dispatch_packet(hsa_agent_dispatch_packet_t* pkt);
+void agent_disp_packet_store_release(uint16_t* packet, uint16_t header);
+uint16_t header(hsa_packet_type_t type);
+hsa_status_t get_kernel_agent(hsa_agent_t agent, void* data);
+void signal_wait( hsa_signal_t signal);
+void initialize_agent_dispatch_packet(
+    hsa_agent_dispatch_packet_t* packet,
+    size_t header_size
+    );
+
+//Class for interacting with kernel agent and creating pipes
+class HSA_Interface {
+
+public:
+    HSA_Interface();
+    ~HSA_Interface(){};
+
+    void steal_kernel_signal(uint32_t kid);
+    void wait_kernel(uint32_t kid);
+
+    hipStream_t getStream() {return stream;}
+
+private:
+    hsa_queue_t * queue;
+    hipStream_t stream;
+    hipDeviceProp_t props;
+
+    //Store Kernel Signals for multuple launches
+    std::vector<hsa_signal_t *> m_kernel_signals;
+
+    //Each packet created will have an ID associated with it.
+    //It is used to index into the hsa queue.
+    uint64_t packet_id;
+
+};
+
diff --git a/src/hsa-agent-pkt/Makefile b/src/hsa-agent-pkt/Makefile
new file mode 100644
index 0000000..a5fec13
--- /dev/null
+++ b/src/hsa-agent-pkt/Makefile
@@ -0,0 +1,18 @@
+HIP_PATH?= /opt/rocm/hip
+HSA_PATH?= /opt/rocm/hsa
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+BIN_DIR?= ./bin
+
+gfx8-apu: $(BIN_DIR)/square.o
+
+$(BIN_DIR)/square.o: square.cpp HSA_Interface.cpp $(BIN_DIR)
+	$(HIPCC) --amdgpu-target=gfx801 $(CXXFLAGS) -I$(HSA_PATH) square.cpp HSA_Interface.cpp -o $(BIN_DIR)/square.o
+
+$(BIN_DIR):
+	mkdir -p $(BIN_DIR)
+
+clean:
+	rm -rf $(BIN_DIR)
+
+.PHONY: gfx8-apu clean
diff --git a/src/hsa-agent-pkt/square.cpp b/src/hsa-agent-pkt/square.cpp
new file mode 100644
index 0000000..86653b1
--- /dev/null
+++ b/src/hsa-agent-pkt/square.cpp
@@ -0,0 +1,100 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <stdio.h>
+#include "hip/hip_runtime.h"
+#include "HSA_Interface.h"
+
+/*
+ * Square each element in the array A and write to array C.
+ */
+template <typename T>
+__global__ void
+vector_square(hipLaunchParm lp, T *C_d, const T *A_d, size_t N)
+{
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
+
+    for (size_t i=offset; i<N; i+=stride) {
+        C_d[i] = A_d[i] * A_d[i];
+    }
+}
+
+
+int main(int argc, char *argv[])
+{
+    float *A_h, *C_h;
+    size_t N = 1000000;
+
+    if (argc == 2)
+        N = atoi(argv[1]);
+
+    const unsigned threadsPerBlock = 256;
+    unsigned blocks = (N + threadsPerBlock - 1) / threadsPerBlock; 
+
+    size_t Nbytes = N * sizeof(float);
+
+    HSA_Interface * hsa= new HSA_Interface();
+
+    printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+    A_h = (float*)malloc(Nbytes);
+    CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
+    C_h = (float*)malloc(Nbytes);
+    CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
+
+    // Fill with Phi + i
+    for (size_t i=0; i<N; i++)
+    {
+        A_h[i] = 1.618f + i;
+    }
+
+    int kernel_id = 0;
+    printf ("info: launch 'vector_square' kernel: "
+            "N = %lu | Blocks = %u | kernel_id %d\n",
+            N, blocks, kernel_id);
+    hipLaunchKernel(vector_square, dim3(blocks),
+                    dim3(threadsPerBlock), 0, hsa->getStream(),
+                    C_h, A_h, N);
+
+
+    //Kernel_id must match that of the launched kernel (ie launch order)
+    printf("info: Stealing kernel completion signal (kid: %d)\n",
+            kernel_id);
+    hsa->steal_kernel_signal(kernel_id);
+
+    //Theoretically Equivalent to hipDeviceSynchronize();
+    printf("info: Waiting on kernel completion signal (kid: %d)\n",
+            kernel_id);
+    hsa->wait_kernel(kernel_id);
+
+    //Increment the Kernel_id every time any kernel is launched.
+    kernel_id++;
+
+    printf ("info: check result\n");
+    for (size_t i=0; i<N; i++)  {
+        if (C_h[i] != A_h[i] * A_h[i]) {
+            CHECK(hipErrorUnknown);
+        }
+    }
+    printf ("PASSED!\n");
+	return 0;
+}