hsa: enable interruptible hsa signal support

Interruptible signals in HSA require event creation and management support
from the emulated driver, and this support was not available. This changeset
adds event creation and management support to the emulated driver. With this
patch, each interruptible signal created by the HSA runtime is associated with
a signal event. The HSA runtime can then put a thread that is waiting on a
signal condition to sleep and ask the driver to monitor the event associated
with that signal. If the signal is modified by the GPU, the dispatcher
notifies the driver about the signal value change. If the modifier is a CPU
thread, the thread has to make HSA API calls to modify the signal, and these
API calls notify the driver about the signal value change. Once the driver is
notified about a change in the signal value, it checks whether any thread is
sleeping on that signal and wakes up the sleeping thread associated with that
event. The driver also implements a timeout wakeup that can wake up the thread
after a certain time period has expired. This is also true for barrier
packets. This changeset also fixes a bug in the mmap syscall.

Each signal has an event address in a kernel-managed and kernel-allocated
event page that can be used as a mailbox pointer to notify an event. However,
this feature, which non-CPU agents use to communicate with the driver, is not
implemented by this changeset because the non-CPU HSA agents in our model can
communicate with the driver directly. Having said that, adding the feature
should be trivial because the event address and event pages are correctly set
up by this changeset: adding the event page's virtual address to our PIO
doorbell interface in the page tables and registering that PIO address with
the driver should be sufficient. The mailbox pointer for an event is managed
based on the event ID; by using the event ID as an index into the event page,
this changeset already provides a unique mailbox pointer for each event.

Change-Id: Ic62794076ddd47526b1f952fdb4c1bad632bdd2e
diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
index 2527d97..3b085cd 100644
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -459,7 +459,7 @@
                "/usr/lib/x86_64-linux-gnu"
            ]),
            'HOME=%s' % os.getenv('HOME','/'),
-           "HSA_ENABLE_INTERRUPT=0"]
+           "HSA_ENABLE_INTERRUPT=1"]
 
 process = Process(executable = executable, cmd = [options.cmd]
                   + options.options.split(), drivers = [gpu_driver], env = env)
diff --git a/src/dev/hsa/hsa_device.hh b/src/dev/hsa/hsa_device.hh
index c396e43..7562e85 100644
--- a/src/dev/hsa/hsa_device.hh
+++ b/src/dev/hsa/hsa_device.hh
@@ -43,10 +43,13 @@
 #include "dev/hsa/hsa_packet_processor.hh"
 #include "params/HSADevice.hh"
 
+class HSADriver;
+
 class HSADevice : public DmaDevice
 {
   public:
     typedef HSADeviceParams Params;
+    typedef std::function<void(const uint64_t &)> HsaSignalCallbackFunction;
 
     HSADevice(const Params *p) : DmaDevice(p), hsaPP(p->hsapp)
     {
@@ -80,7 +83,22 @@
     {
         fatal("%s does not accept vendor specific packets\n", name());
     }
-
+    virtual void
+    attachDriver(HSADriver *driver)
+    {
+        fatal("%s does not need HSA driver\n", name());
+    }
+    virtual void
+    updateHsaSignal(Addr signal_handle, uint64_t signal_value,
+        HsaSignalCallbackFunction function = [ = ] (const uint64_t &) { })
+    {
+        fatal("%s does not have HSA signal update functionality.\n", name());
+    }
+    virtual uint64_t
+    functionalReadHsaSignal(Addr signal_handle)
+    {
+        fatal("%s does not have HSA signal read functionality.\n", name());
+    }
     void dmaReadVirt(Addr host_addr, unsigned size, DmaCallback *cb,
                      void *data, Tick delay = 0);
     void dmaWriteVirt(Addr host_addr, unsigned size, DmaCallback *cb,
diff --git a/src/dev/hsa/hsa_driver.cc b/src/dev/hsa/hsa_driver.cc
index 5f30159..ce86055 100644
--- a/src/dev/hsa/hsa_driver.cc
+++ b/src/dev/hsa/hsa_driver.cc
@@ -39,11 +39,19 @@
 #include "cpu/thread_context.hh"
 #include "debug/HSADriver.hh"
 #include "dev/hsa/hsa_device.hh"
+#include "dev/hsa/hsa_packet_processor.hh"
+#include "dev/hsa/kfd_event_defines.h"
 #include "dev/hsa/kfd_ioctl.h"
 #include "params/HSADriver.hh"
 #include "sim/process.hh"
 #include "sim/syscall_emul_buf.hh"
 
+const char*
+HSADriver::DriverWakeupEvent::description() const
+{
+    return "DriverWakeupEvent";
+}
+
 HSADriver::HSADriver(HSADriverParams *p)
     : EmulatedDriver(p), device(p->device), queueId(0)
 {
@@ -64,31 +72,55 @@
 
 /**
  * Currently, mmap() will simply setup a mapping for the associated
- * device's packet processor's doorbells.
+ * device's packet processor's doorbells and creates the event page.
  */
 Addr
 HSADriver::mmap(ThreadContext *tc, Addr start, uint64_t length, int prot,
-                int tgt_flags, int tgt_fd, int offset)
+                int tgt_flags, int tgt_fd, off_t offset)
 {
-    DPRINTF(HSADriver, "amdkfd doorbell mmap (start: %p, length: 0x%x,"
-            "offset: 0x%x)\n", start, length, offset);
-
-    auto process = tc->getProcessPtr();
-    auto mem_state = process->getMemState();
+     // Is this a signal event mmap
+     bool is_event_mmap = false;
+     // If addr == 0, then we may need to do mmap.
+     bool should_mmap = (start == 0);
+     auto process = tc->getProcessPtr();
+     auto mem_state = process->getMemState();
+     // Check if mmap is for signal events first
+     if (((offset >> PAGE_SHIFT) & KFD_MMAP_TYPE_MASK) ==
+         KFD_MMAP_TYPE_EVENTS) {
+         is_event_mmap = true;
+         DPRINTF(HSADriver, "amdkfd mmap for events(start: %p, length: 0x%x,"
+                 "offset: 0x%x,  )\n", start, length, offset);
+         panic_if(start != 0,
+                  "Start address should be provided by KFD\n");
+         panic_if(length != 8 * KFD_SIGNAL_EVENT_LIMIT,
+                  "Requested length %d, expected length %d; length mismatch\n",
+                   length, 8 * KFD_SIGNAL_EVENT_LIMIT);
+         // For signal event, do mmap only if eventPage is uninitialized
+         should_mmap = (!eventPage);
+    } else {
+        DPRINTF(HSADriver, "amdkfd doorbell mmap (start: %p, length: 0x%x,"
+                "offset: 0x%x)\n", start, length, offset);
+    }
 
     // Extend global mmap region if necessary.
-    if (start == 0) {
-        // Assume mmap grows down, as in x86 Linux.
+    if (should_mmap) {
+        // Assume mmap grows down, as in x86 Linux
         start = mem_state->getMmapEnd() - length;
         mem_state->setMmapEnd(start);
     }
 
-    /**
-     * Now map this virtual address to our PIO doorbell interface
-     * in the page tables (non-cacheable).
-     */
-    mem_state->map(start, device->hsaPacketProc().pioAddr, length, false);
-    DPRINTF(HSADriver, "amdkfd doorbell mapped to %xp\n", start);
+    if (is_event_mmap) {
+         if (should_mmap) {
+             eventPage = (void*) start;
+         }
+    } else {
+        // Now map this virtual address to our PIO doorbell interface
+        // in the page tables (non-cacheable)
+        mem_state->map(start, device->hsaPacketProc().pioAddr, length, false);
+
+        DPRINTF(HSADriver, "amdkfd doorbell mapped to %xp\n", start);
+    }
+
     return start;
 }
 
@@ -116,3 +148,48 @@
                               args->ring_size);
     args.copyOut(mem_proxy);
 }
+
+void
+HSADriver::DriverWakeupEvent::scheduleWakeup(Tick wakeup_delay)
+{
+    assert(driver);
+    driver->schedule(this, curTick() + wakeup_delay);
+}
+
+void
+HSADriver::signalWakeupEvent(uint32_t event_id)
+{
+    panic_if(event_id >= eventSlotIndex,
+        "Trying wakeup on an event that is not yet created\n");
+    if (ETable[event_id].threadWaiting) {
+        panic_if(!ETable[event_id].tc,
+                 "No thread context to wake up\n");
+        ThreadContext *tc = ETable[event_id].tc;
+        DPRINTF(HSADriver,
+                "Signal event: Waking up CPU %d\n", tc->cpuId());
+        // Remove events that can wakeup this thread
+        TCEvents[tc].clearEvents();
+        // Now wakeup this thread
+        tc->activate();
+    } else {
+       // This may be a race between an ioctl call asking to wait on this
+       // event and this signalWakeupEvent call. We handle the race by
+       // marking the event as set here; the ioctl call should then take the
+       // necessary action when it waits on an already-set event. However,
+       // this may also be a genuine case in which the runtime has decided
+       // not to wait on this event. Since we cannot distinguish that case
+       // from the race condition, we set the event anyway.
+       ETable[event_id].setEvent = true;
+    }
+}
+
+void
+HSADriver::DriverWakeupEvent::process()
+{
+    DPRINTF(HSADriver,
+            "Timer event: Waking up CPU %d\n", tc->cpuId());
+    // Remove events that can wakeup this thread
+    driver->TCEvents[tc].clearEvents();
+    // Now wakeup this thread
+    tc->activate();
+}
diff --git a/src/dev/hsa/hsa_driver.hh b/src/dev/hsa/hsa_driver.hh
index b3c7ee2..f9ad7d5 100644
--- a/src/dev/hsa/hsa_driver.hh
+++ b/src/dev/hsa/hsa_driver.hh
@@ -51,6 +51,8 @@
 #ifndef __DEV_HSA_HSA_DRIVER_HH__
 #define __DEV_HSA_HSA_DRIVER_HH__
 
+#include <unordered_map>
+
 #include "base/types.hh"
 #include "sim/emul_driver.hh"
 
@@ -66,8 +68,84 @@
 
     int open(ThreadContext *tc, int mode, int flags);
     Addr mmap(ThreadContext *tc, Addr start, uint64_t length,
-              int prot, int tgtFlags, int tgtFd, int offset);
+              int prot, int tgt_flags, int tgt_fd, off_t offset);
+    virtual void signalWakeupEvent(uint32_t event_id);
+    class DriverWakeupEvent : public Event
+    {
+      public:
+        DriverWakeupEvent(HSADriver *hsa_driver, ThreadContext *thrd_cntxt)
+            : driver(hsa_driver), tc(thrd_cntxt)  {}
+        void process() override;
+        const char *description() const override;
+        void scheduleWakeup(Tick wakeup_delay);
+      private:
+        HSADriver *driver;
+        ThreadContext *tc;
+    };
+    class EventTableEntry {
+      public:
+        EventTableEntry() :
+            mailBoxPtr(0), tc(nullptr), threadWaiting(false), setEvent(false)
+        {}
+        // Mail box pointer for this address. Current implementation does not
+        // use this mailBoxPtr to notify events but directly calls
+        // signalWakeupEvent from dispatcher (GPU) to notify event. So,
+        // currently this mailBoxPtr is not used. But a future implementation
+        // may communicate to the driver using mailBoxPtr.
+        Addr mailBoxPtr;
+        // Thread context waiting on this event. We do not support multiple
+        // threads waiting on an event currently.
+        ThreadContext *tc;
+        // threadWaiting = true, if some thread context is waiting on this
+        // event.  A thread context waiting on this event is put to sleep.
+        bool threadWaiting;
+        // setEvent = true, if this event is triggered but when this event
+        // triggered, no thread context was waiting on it. In the future, some
+        // thread context will try to wait on this event but since event has
+        // already happened, we will not allow that thread context to go to
+        // sleep. The above mentioned scenario can happen when the waiting
+        // thread and wakeup thread race on this event and the wakeup thread
+        // beat the waiting thread at the driver.
+        bool setEvent;
+    };
+    typedef class EventTableEntry ETEntry;
+
   protected:
+    void *eventPage;
+    uint32_t eventSlotIndex;
+    // Event table that keeps track of events. It is indexed with event ID.
+    std::unordered_map<uint32_t, ETEntry> ETable;
+
+    // The TCEvents map keeps track of the events that can wake up a thread.
+    // When multiple events can wake up the same thread, this data structure
+    // helps to reset all of them when one of those events wakes the thread
+    // up. The signal events that can wake up the thread are stored in
+    // signalEvents, whereas the timer wakeup event is stored in timerEvent.
+    class EventList {
+      public:
+        EventList() : driver(nullptr), timerEvent(nullptr, nullptr) {}
+        EventList(HSADriver *hsa_driver, ThreadContext *thrd_cntxt)
+            : driver(hsa_driver), timerEvent(hsa_driver, thrd_cntxt)
+        { }
+        void clearEvents() {
+            assert(driver);
+            for (auto event : signalEvents) {
+               assert(event < driver->eventSlotIndex);
+               driver->ETable[event].tc = nullptr;
+               driver->ETable[event].threadWaiting = false;
+            }
+            signalEvents.clear();
+            if (timerEvent.scheduled()) {
+                driver->deschedule(timerEvent);
+            }
+        }
+        HSADriver *driver;
+        DriverWakeupEvent timerEvent;
+        // The set of events that can wake up the same thread.
+        std::set<uint32_t> signalEvents;
+    };
+    std::unordered_map<ThreadContext *, EventList> TCEvents;
+
     /**
      * HSA agent (device) that is controled by this driver.
      */
diff --git a/src/dev/hsa/hsa_packet_processor.cc b/src/dev/hsa/hsa_packet_processor.cc
index 1311fe0..590dc3c 100644
--- a/src/dev/hsa/hsa_packet_processor.cc
+++ b/src/dev/hsa/hsa_packet_processor.cc
@@ -393,34 +393,20 @@
             dep_sgnl_rd_st->resetSigVals();
             // The completion signal is connected
             if (bar_and_pkt->completion_signal != 0) {
-                // The signal value is aligned 8 bytes
-                // from the actual handle in the runtime
-                uint64_t signal_addr =
-                    (uint64_t) (((uint64_t *)
-                    bar_and_pkt->completion_signal) + 1);
+                // HACK: The semantics of the HSA signal is to
+                // decrement the current signal value
+                // I'm going to cheat here and read out
+                // the value from main memory using functional
+                // access, and then just DMA the decremented value.
+                uint64_t signal_value = hsa_device->functionalReadHsaSignal(\
+                                            bar_and_pkt->completion_signal);
+
                 DPRINTF(HSAPacketProcessor, "Triggering barrier packet" \
-                       " completion signal: %x!\n", signal_addr);
-                /**
-                 * HACK: The semantics of the HSA signal is to
-                 * decrement the current signal value.
-                 * I'm going to cheat here and read out
-                 * the value from main memory using functional
-                 * access, and then just DMA the decremented value.
-                 * The reason for this is that the DMASequencer does
-                 * not support atomic operations.
-                 */
-                auto tc = sys->getThreadContext(0);
-                auto process = tc->getProcessPtr();
-                auto mem_state = process->getMemState();
-                auto &virt_proxy = mem_state->getVirtProxy();
-                TypedBufferArg<uint64_t> prev_signal(signal_addr);
-                prev_signal.copyIn(virt_proxy);
+                       " completion signal! Addr: %x\n",
+                       bar_and_pkt->completion_signal);
 
-                hsa_signal_value_t *new_signal = new hsa_signal_value_t;
-                *new_signal = (hsa_signal_value_t) *prev_signal - 1;
-
-                dmaWriteVirt(signal_addr,
-                             sizeof(hsa_signal_value_t), NULL, new_signal, 0);
+                hsa_device->updateHsaSignal(bar_and_pkt->completion_signal,
+                                            signal_value - 1);
             }
         }
         if (dep_sgnl_rd_st->pendingReads > 0) {
diff --git a/src/dev/hsa/hsa_packet_processor.hh b/src/dev/hsa/hsa_packet_processor.hh
index b3057bc..a5df812 100644
--- a/src/dev/hsa/hsa_packet_processor.hh
+++ b/src/dev/hsa/hsa_packet_processor.hh
@@ -270,6 +270,13 @@
         return regdQList.at(queId);
     }
 
+    uint64_t
+    inFlightPkts(uint32_t queId)
+    {
+        auto aqlBuf = regdQList.at(queId)->qCntxt.aqlBuf;
+        return aqlBuf->dispIdx() - aqlBuf->rdIdx();
+    }
+
     int numHWQueues;
     Addr pioAddr;
     Addr pioSize;
diff --git a/src/dev/hsa/hsa_signal.hh b/src/dev/hsa/hsa_signal.hh
new file mode 100644
index 0000000..943ea33
--- /dev/null
+++ b/src/dev/hsa/hsa_signal.hh
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016-2019 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Michael LeBeane
+ */
+#ifndef DEV_HSA_HSA_SIGNAL_H
+#define DEV_HSA_HSA_SIGNAL_H
+
+// AMD Signal Kind Enumeration Values.
+typedef int64_t amd_signal_kind64_t;
+enum amd_signal_kind_t {
+  AMD_SIGNAL_KIND_INVALID = 0,
+  AMD_SIGNAL_KIND_USER = 1,
+  AMD_SIGNAL_KIND_DOORBELL = -1,
+  AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2
+};
+
+// AMD Signal.
+typedef struct amd_signal_s {
+  amd_signal_kind64_t kind;
+  union {
+    volatile int64_t value;
+    volatile uint32_t* legacy_hardware_doorbell_ptr;
+    volatile uint64_t* hardware_doorbell_ptr;
+  };
+  uint64_t event_mailbox_ptr;
+  uint32_t event_id;
+  uint32_t reserved1;
+  uint64_t start_ts;
+  uint64_t end_ts;
+  union {
+    uint64_t queue_ptr;
+    uint64_t reserved2;
+  };
+  uint32_t reserved3[2];
+} amd_signal_t;
+
+#endif // DEV_HSA_HSA_SIGNAL_H
diff --git a/src/dev/hsa/hw_scheduler.cc b/src/dev/hsa/hw_scheduler.cc
index 6b37155..365c202 100644
--- a/src/dev/hsa/hw_scheduler.cc
+++ b/src/dev/hsa/hw_scheduler.cc
@@ -295,7 +295,6 @@
     DPRINTF(HSAPacketProcessor,
             "@ %s, analyzing hw queue %d\n", __FUNCTION__, rl_idx);
     HSAQueueDescriptor* qDesc = hsaPP->getRegdListEntry(rl_idx)->qCntxt.qDesc;
-    AQLRingBuffer* aql_buf = hsaPP->getRegdListEntry(rl_idx)->qCntxt.aqlBuf;
 
     // If there a pending DMA to this registered queue
     // then the queue is not idle
@@ -306,7 +305,7 @@
     // Since packet completion stage happens only after kernel completion
     // we need to keep the queue mapped till all the outstanding kernels
     // from that queue are finished
-    if (aql_buf->rdIdx() != aql_buf->dispIdx()) {
+    if (hsaPP->inFlightPkts(rl_idx)) {
         return false;
     }
 
diff --git a/src/dev/hsa/kfd_event_defines.h b/src/dev/hsa/kfd_event_defines.h
new file mode 100644
index 0000000..cd107fb
--- /dev/null
+++ b/src/dev/hsa/kfd_event_defines.h
@@ -0,0 +1,13 @@
+#ifndef KFD_EVENT_DEFINES_H_INCLUDED
+#define KFD_EVENT_DEFINES_H_INCLUDED
+
+#include "dev/hsa/kfd_ioctl.h"
+
+#define PAGE_SHIFT 12
+#define KFD_MMAP_TYPE_SHIFT     (62 - PAGE_SHIFT)
+#define KFD_MMAP_TYPE_MASK      (0x3ULL << KFD_MMAP_TYPE_SHIFT)
+#define KFD_MMAP_TYPE_DOORBELL  (0x3ULL << KFD_MMAP_TYPE_SHIFT)
+#define KFD_MMAP_TYPE_EVENTS    (0x2ULL << KFD_MMAP_TYPE_SHIFT)
+#define SLOTS_PER_PAGE KFD_SIGNAL_EVENT_LIMIT
+
+#endif
diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc
index 0e1035c..ef19562 100644
--- a/src/gpu-compute/dispatcher.cc
+++ b/src/gpu-compute/dispatcher.cc
@@ -318,32 +318,20 @@
         gpuCmdProc->hsaPacketProc()
             .finishPkt(task->dispPktPtr(), task->queueId());
         if (task->completionSignal()) {
-            // The signal value is aligned 8 bytes from
-            // the actual handle in the runtime
-            Addr signal_addr = task->completionSignal() + sizeof(Addr);
-            DPRINTF(GPUDisp, "HSA AQL Kernel Complete! Triggering "
-                    "completion signal: %x!\n", signal_addr);
-
             /**
-             * HACK: The semantics of the HSA signal is to decrement
-             * the current signal value. We cheat here and read out
-             * he value from main memory using functional access and
-             * then just DMA the decremented value. This is because
-             * the DMA controller does not currently support GPU
-             * atomics.
-             */
-            auto tc = gpuCmdProc->system()->getThreadContext(0);
-            auto process = tc->getProcessPtr();
-            auto mem_state = process->getMemState();
-            auto &virt_proxy = mem_state->getVirtProxy();
-            TypedBufferArg<Addr> prev_signal(signal_addr);
-            prev_signal.copyIn(virt_proxy);
+            * HACK: The semantics of the HSA signal is to decrement
+            * the current signal value. We cheat here and read out
+            * the value from main memory using functional access and
+            * then just DMA the decremented value.
+            */
+            uint64_t signal_value =
+                gpuCmdProc->functionalReadHsaSignal(task->completionSignal());
 
-            Addr *new_signal = new Addr;
-            *new_signal = (Addr)*prev_signal - 1;
+            DPRINTF(GPUDisp, "HSA AQL Kernel Complete with completion "
+                    "signal! Addr: %d\n", task->completionSignal());
 
-            gpuCmdProc->dmaWriteVirt(signal_addr, sizeof(Addr), nullptr,
-                new_signal, 0);
+            gpuCmdProc->updateHsaSignal(task->completionSignal(),
+                                        signal_value - 1);
         } else {
             DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
                 "signal\n");
diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc
index 1d283d5..59e2f5d 100644
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -40,6 +40,7 @@
 #include "gpu-compute/dispatcher.hh"
 #include "params/GPUCommandProcessor.hh"
 #include "sim/process.hh"
+#include "sim/syscall_emul_buf.hh"
 
 GPUCommandProcessor::GPUCommandProcessor(const Params *p)
     : HSADevice(p), dispatcher(*p->dispatcher)
@@ -145,6 +146,66 @@
     ++dynamic_task_id;
 }
 
+uint64_t
+GPUCommandProcessor::functionalReadHsaSignal(Addr signal_handle)
+{
+    Addr value_addr = getHsaSignalValueAddr(signal_handle);
+    auto tc = system()->getThreadContext(0);
+    auto process = tc->getProcessPtr();
+    auto mem_state = process->getMemState();
+    auto &virt_proxy = mem_state->getVirtProxy();
+    TypedBufferArg<Addr> prev_value(value_addr);
+    prev_value.copyIn(virt_proxy);
+    return *prev_value;
+}
+
+void
+GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value,
+                                     HsaSignalCallbackFunction function)
+{
+    // The signal value is aligned 8 bytes from
+    // the actual handle in the runtime
+    Addr value_addr = getHsaSignalValueAddr(signal_handle);
+    Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
+    Addr event_addr = getHsaSignalEventAddr(signal_handle);
+    DPRINTF(GPUCommandProc, "Triggering completion signal: %x!\n", value_addr);
+
+    Addr *new_signal = new Addr;
+    *new_signal = signal_value;
+
+    dmaWriteVirt(value_addr, sizeof(Addr), nullptr, new_signal, 0);
+
+    auto tc = system()->getThreadContext(0);
+    auto process = tc->getProcessPtr();
+    auto mem_state = process->getMemState();
+    auto &virt_proxy = mem_state->getVirtProxy();
+    TypedBufferArg<uint64_t> mailbox_ptr(mailbox_addr);
+    mailbox_ptr.copyIn(virt_proxy);
+
+    // Notifying an event with its mailbox pointer is
+    // not supported in the current implementation. Just use
+    // mailbox pointer to distinguish between interruptible
+    // and default signal. Interruptible signal will have
+    // a valid mailbox pointer.
+    if (*mailbox_ptr != 0) {
+        // This is an interruptible signal. Now, read the
+        // event ID and directly communicate with the driver
+        // about that event notification.
+        TypedBufferArg<uint32_t> event_val(event_addr);
+        event_val.copyIn(virt_proxy);
+
+        DPRINTF(GPUCommandProc, "Calling signal wakeup event on "
+                "signal event value %d\n", *event_val);
+        signalWakeupEvent(*event_val);
+    }
+}
+
+void
+GPUCommandProcessor::attachDriver(HSADriver *hsa_driver)
+{
+    driver = hsa_driver;
+}
+
 /**
  * submitVendorPkt() is for accepting vendor-specific packets from
  * the HSAPP. Vendor-specific packets may be used by the runtime to
@@ -181,6 +242,12 @@
     dispatcher.dispatch(task);
 }
 
+void
+GPUCommandProcessor::signalWakeupEvent(uint32_t event_id)
+{
+    driver->signalWakeupEvent(event_id);
+}
+
 /**
  * The CP is responsible for traversing all HSA-ABI-related data
  * structures from memory and initializing the ABI state.
diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh
index 7253dd4..8ad47c7 100644
--- a/src/gpu-compute/gpu_command_processor.hh
+++ b/src/gpu-compute/gpu_command_processor.hh
@@ -48,6 +48,8 @@
 #define __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__
 
 #include "dev/hsa/hsa_device.hh"
+#include "dev/hsa/hsa_signal.hh"
+#include "gpu-compute/gpu_compute_driver.hh"
 #include "gpu-compute/hsa_queue_entry.hh"
 
 struct GPUCommandProcessorParams;
@@ -69,16 +71,40 @@
                            Addr host_pkt_addr) override;
     void submitVendorPkt(void *raw_pkt, uint32_t queue_id,
                          Addr host_pkt_addr) override;
+    void attachDriver(HSADriver *driver) override;
     void dispatchPkt(HSAQueueEntry *task);
+    void signalWakeupEvent(uint32_t event_id);
 
     Tick write(PacketPtr pkt) override { return 0; }
     Tick read(PacketPtr pkt) override { return 0; }
     AddrRangeList getAddrRanges() const override;
     System *system();
 
+    void updateHsaSignal(Addr signal_handle, uint64_t signal_value,
+                         HsaSignalCallbackFunction function =
+                            [ = ] (const uint64_t &) { });
+
+    uint64_t functionalReadHsaSignal(Addr signal_handle);
+
+    Addr getHsaSignalValueAddr(Addr signal_handle)
+    {
+        return signal_handle + offsetof(amd_signal_t, value);
+    }
+
+    Addr getHsaSignalMailboxAddr(Addr signal_handle)
+    {
+        return signal_handle + offsetof(amd_signal_t, event_mailbox_ptr);
+    }
+
+    Addr getHsaSignalEventAddr(Addr signal_handle)
+    {
+        return signal_handle + offsetof(amd_signal_t, event_id);
+    }
+
   private:
     Shader *_shader;
     GPUDispatcher &dispatcher;
+    HSADriver *driver;
 
     void initABI(HSAQueueEntry *task);
 
diff --git a/src/gpu-compute/gpu_compute_driver.cc b/src/gpu-compute/gpu_compute_driver.cc
index 40420ba..334bddd 100644
--- a/src/gpu-compute/gpu_compute_driver.cc
+++ b/src/gpu-compute/gpu_compute_driver.cc
@@ -40,6 +40,7 @@
 #include "debug/GPUDriver.hh"
 #include "dev/hsa/hsa_device.hh"
 #include "dev/hsa/hsa_packet_processor.hh"
+#include "dev/hsa/kfd_event_defines.h"
 #include "dev/hsa/kfd_ioctl.h"
 #include "params/GPUComputeDriver.hh"
 #include "sim/process.hh"
@@ -48,6 +49,7 @@
 GPUComputeDriver::GPUComputeDriver(Params *p)
     : HSADriver(p)
 {
+    device->attachDriver(this);
     DPRINTF(GPUDriver, "Constructing KFD: device\n");
 }
 
@@ -66,8 +68,8 @@
             DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_VERSION\n");
 
             TypedBufferArg<kfd_ioctl_get_version_args> args(ioc_buf);
-            args->major_version = 1;
-            args->minor_version = 0;
+            args->major_version = KFD_IOCTL_MAJOR_VERSION;
+            args->minor_version = KFD_IOCTL_MINOR_VERSION;
 
             args.copyOut(virt_proxy);
           }
@@ -210,17 +212,59 @@
           break;
         case AMDKFD_IOC_CREATE_EVENT:
           {
-            warn("unimplemented ioctl: AMDKFD_IOC_CREATE_EVENT\n");
+            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_EVENT\n");
+
+            TypedBufferArg<kfd_ioctl_create_event_args> args(ioc_buf);
+            args.copyIn(virt_proxy);
+            if (args->event_type != KFD_IOC_EVENT_SIGNAL) {
+                fatal("Signal events are only supported currently\n");
+            } else if (eventSlotIndex == SLOTS_PER_PAGE) {
+                fatal("Signal event wasn't created; signal limit reached\n");
+            }
+            // Currently, we allocate only one signal_page for events.
+            // Note that this signal page is of size 8 * KFD_SIGNAL_EVENT_LIMIT
+            uint64_t page_index = 0;
+            args->event_page_offset = (page_index | KFD_MMAP_TYPE_EVENTS);
+            args->event_page_offset <<= PAGE_SHIFT;
+            // TODO: Currently we support only signal events, hence using
+            // the same ID for both signal slot and event slot
+            args->event_slot_index = eventSlotIndex;
+            args->event_id = eventSlotIndex++;
+            args->event_trigger_data = args->event_id;
+            DPRINTF(GPUDriver, "amdkfd create events"
+                    "(event_id: 0x%x, offset: 0x%x)\n",
+                    args->event_id, args->event_page_offset);
+            // Since eventSlotIndex is incremented every time a new event is
+            // created, ETable at eventSlotIndex (event_id) is guaranteed to
+            // be empty. A future implementation that reuses deleted
+            // event_ids should check that the event table entry at this
+            // eventSlotIndex (event_id) is empty before inserting a new
+            // event table entry.
+            ETable.emplace(std::pair<uint32_t, ETEntry>(args->event_id, {}));
+            args.copyOut(virt_proxy);
           }
           break;
         case AMDKFD_IOC_DESTROY_EVENT:
           {
-            warn("unimplemented ioctl: AMDKFD_IOC_DESTROY_EVENT\n");
+            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_DESTROY_EVENT\n");
+            TypedBufferArg<kfd_ioctl_destroy_event_args> args(ioc_buf);
+            args.copyIn(virt_proxy);
+            DPRINTF(GPUDriver, "amdkfd destroying event %d\n", args->event_id);
+            fatal_if(ETable.count(args->event_id) == 0,
+                     "Event ID invalid, cannot destroy this event\n");
+            ETable.erase(args->event_id);
           }
           break;
         case AMDKFD_IOC_SET_EVENT:
           {
-            warn("unimplemented ioctl: AMDKFD_IOC_SET_EVENT\n");
+            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_SET_EVENTS\n");
+            TypedBufferArg<kfd_ioctl_set_event_args> args(ioc_buf);
+            args.copyIn(virt_proxy);
+            DPRINTF(GPUDriver, "amdkfd set event %d\n", args->event_id);
+            fatal_if(ETable.count(args->event_id) == 0,
+                     "Event ID invlaid, cannot set this event\n");
+            ETable[args->event_id].setEvent = true;
+            signalWakeupEvent(args->event_id);
           }
           break;
         case AMDKFD_IOC_RESET_EVENT:
@@ -230,7 +274,69 @@
           break;
         case AMDKFD_IOC_WAIT_EVENTS:
           {
-            warn("unimplemented ioctl: AMDKFD_IOC_WAIT_EVENTS\n");
+            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_WAIT_EVENTS\n");
+            TypedBufferArg<kfd_ioctl_wait_events_args> args(ioc_buf);
+            args.copyIn(virt_proxy);
+            kfd_event_data *events =
+                (kfd_event_data *)args->events_ptr;
+            DPRINTF(GPUDriver, "amdkfd wait for events"
+                    "(wait on all: %d, timeout : %d, num_events: %s)\n",
+                    args->wait_for_all, args->timeout, args->num_events);
+            panic_if(args->wait_for_all != 0 && args->num_events > 1,
+                    "Wait for all events not supported\n");
+            bool should_sleep = true;
+            if (TCEvents.count(tc) == 0) {
+                // This thread context trying to wait on an event for the first
+                // time, initialize it.
+                TCEvents.emplace(std::piecewise_construct, std::make_tuple(tc),
+                                 std::make_tuple(this, tc));
+                DPRINTF(GPUDriver, "\tamdkfd creating event list"
+                        " for thread  %d\n", tc->cpuId());
+            }
+            panic_if(TCEvents[tc].signalEvents.size() != 0,
+                     "There are %d events that put this thread to sleep,"
+                     " this thread should not be running\n",
+                     TCEvents[tc].signalEvents.size());
+            for (int i = 0; i < args->num_events; i++) {
+                panic_if(!events,
+                         "Event pointer invalid\n");
+                Addr eventDataAddr = (Addr)(events + i);
+                TypedBufferArg<kfd_event_data> EventData(
+                    eventDataAddr, sizeof(kfd_event_data));
+                EventData.copyIn(virt_proxy);
+                DPRINTF(GPUDriver,
+                        "\tamdkfd wait for event %d\n", EventData->event_id);
+                panic_if(ETable.count(EventData->event_id) == 0,
+                         "Event ID invalid, cannot set this event\n");
+                panic_if(ETable[EventData->event_id].threadWaiting,
+                         "Multiple threads waiting on the same event\n");
+                if (ETable[EventData->event_id].setEvent) {
+                    // If event is already set, the event has already happened.
+                    // Just unset the event and dont put this thread to sleep.
+                    ETable[EventData->event_id].setEvent = false;
+                    should_sleep = false;
+                }
+                if (should_sleep) {
+                    // Put this thread to sleep
+                    ETable[EventData->event_id].threadWaiting = true;
+                    ETable[EventData->event_id].tc = tc;
+                    TCEvents[tc].signalEvents.insert(EventData->event_id);
+                }
+            }
+
+            // TODO: Return the correct wait_result back. Currently, returning
+            // success for both KFD_WAIT_TIMEOUT and KFD_WAIT_COMPLETE.
+            // Ideally, this needs to be done after the event is triggered and
+            // after the thread is woken up.
+            args->wait_result = 0;
+            args.copyOut(virt_proxy);
+            if (should_sleep) {
+                // Put this thread to sleep
+                sleepCPU(tc, args->timeout);
+            } else {
+                // Remove events that tried to put this thread to sleep
+                TCEvents[tc].clearEvents();
+            }
           }
           break;
         case AMDKFD_IOC_DBG_REGISTER:
@@ -379,6 +485,18 @@
     return 0;
 }
 
+void
+GPUComputeDriver::sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout)
+{
+    // tc must already have exactly one TCEvents entry before it can sleep.
+    assert(TCEvents.count(tc) == 1);
+    // Timeout in ticks: 1e9 ticks per ms (presumably 1 ps ticks; confirm).
+    const Tick wakeup_delay = (uint64_t)milliSecTimeout * 1000000000;
+    TCEvents[tc].timerEvent.scheduleWakeup(wakeup_delay);
+    tc->suspend();
+    DPRINTF(GPUDriver, "CPU %d is put to sleep\n", tc->cpuId());
+}
+
 Addr
 GPUComputeDriver::gpuVmApeBase(int gpuNum) const
 {
diff --git a/src/gpu-compute/gpu_compute_driver.hh b/src/gpu-compute/gpu_compute_driver.hh
index 5e56d42..9cca0e8 100644
--- a/src/gpu-compute/gpu_compute_driver.hh
+++ b/src/gpu-compute/gpu_compute_driver.hh
@@ -55,6 +55,7 @@
     typedef GPUComputeDriverParams Params;
     GPUComputeDriver(Params *p);
     int ioctl(ThreadContext *tc, unsigned req) override;
+    void sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout);
 
   private:
     /**
diff --git a/src/sim/emul_driver.hh b/src/sim/emul_driver.hh
index fe13d90..f184007 100644
--- a/src/sim/emul_driver.hh
+++ b/src/sim/emul_driver.hh
@@ -93,7 +93,7 @@
      * (see the SyscallReturn class).
      */
     virtual Addr mmap(ThreadContext *tc, Addr start, uint64_t length,
-                      int prot, int tgtFlags, int tgtFd, int offset)
+                      int prot, int tgtFlags, int tgtFd, off_t offset)
                       { return -EBADF; }
 };
 
diff --git a/src/sim/syscall_emul.hh b/src/sim/syscall_emul.hh
index 19469d1..c34e3a4 100644
--- a/src/sim/syscall_emul.hh
+++ b/src/sim/syscall_emul.hh
@@ -1685,7 +1685,7 @@
     int prot = p->getSyscallArg(tc, index);
     int tgt_flags = p->getSyscallArg(tc, index);
     int tgt_fd = p->getSyscallArg(tc, index);
-    int offset = p->getSyscallArg(tc, index);
+    off_t offset = p->getSyscallArg(tc, index);
 
     if (is_mmap2)
         offset *= TheISA::PageBytes;