src/gpu-compute/gpu_compute_driver.cc - public/gem5 - Git at Google

 /*
  * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
  * For use for simulation and test purposes only
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * 1. Redistributions of source code must retain the above copyright notice,
  * this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright notice,
  * this list of conditions and the following disclaimer in the documentation
  * and/or other materials provided with the distribution.
  *
  * 3. Neither the name of the copyright holder nor the names of its
  * contributors may be used to endorse or promote products derived from this
  * software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */

 #include "gpu-compute/gpu_compute_driver.hh"

 #include <memory>

 #include "base/compiler.hh"
 #include "base/logging.hh"
 #include "base/trace.hh"
 #include "cpu/thread_context.hh"
 #include "debug/GPUDriver.hh"
 #include "debug/GPUShader.hh"
 #include "dev/hsa/hsa_packet_processor.hh"
 #include "dev/hsa/kfd_event_defines.h"
 #include "dev/hsa/kfd_ioctl.h"
 #include "gpu-compute/gpu_command_processor.hh"
 #include "gpu-compute/shader.hh"
 #include "mem/port_proxy.hh"
 #include "params/GPUComputeDriver.hh"
 #include "sim/process.hh"
 #include "sim/syscall_emul_buf.hh"

 GPUComputeDriver::GPUComputeDriver(const Params &p)
     : EmulatedDriver(p), device(p.device), queueId(0),
       isdGPU(p.isdGPU), gfxVersion(p.gfxVersion), dGPUPoolID(p.dGPUPoolID)
 {
     device->attachDriver(this);
     DPRINTF(GPUDriver, "Constructing KFD: device\n");

     // Convert the 3 bit mtype specified in Shader.py to the proper type
     // used for requests.
     if (MtypeFlags::SHARED & p.m_type)
         defaultMtype.set(Request::SHARED);

     if (MtypeFlags::READ_WRITE & p.m_type)
         defaultMtype.set(Request::READ_WRITE);

     if (MtypeFlags::CACHED & p.m_type)
         defaultMtype.set(Request::CACHED);
 }

 const char*
 GPUComputeDriver::DriverWakeupEvent::description() const
 {
     return "DriverWakeupEvent";
 }

 /**
  * Create an FD entry for the KFD inside of the owning process.
  */
 int
 GPUComputeDriver::open(ThreadContext *tc, int mode, int flags)
 {
     DPRINTF(GPUDriver, "Opened %s\n", filename);
     auto process = tc->getProcessPtr();
     auto device_fd_entry = std::make_shared<DeviceFDEntry>(this, filename);
     int tgt_fd = process->fds->allocFD(device_fd_entry);
     return tgt_fd;
 }

 /**
  * Currently, mmap() will simply setup a mapping for the associated
  * device's packet processor's doorbells and creates the event page.
  */
 Addr
 GPUComputeDriver::mmap(ThreadContext *tc, Addr start, uint64_t length,
                        int prot, int tgt_flags, int tgt_fd, off_t offset)
 {
     auto process = tc->getProcessPtr();
     auto mem_state = process->memState;

     Addr pg_off = offset >> PAGE_SHIFT;
     Addr mmap_type = pg_off & KFD_MMAP_TYPE_MASK;
     DPRINTF(GPUDriver, "amdkfd mmap (start: %p, length: 0x%x,"
             "offset: 0x%x)\n", start, length, offset);

     switch(mmap_type) {
         case KFD_MMAP_TYPE_DOORBELL:
             DPRINTF(GPUDriver, "amdkfd mmap type DOORBELL offset\n");
             start = mem_state->extendMmap(length);
             process->pTable->map(start, device->hsaPacketProc().pioAddr,
                     length, false);
             break;
         case KFD_MMAP_TYPE_EVENTS:
             DPRINTF(GPUDriver, "amdkfd mmap type EVENTS offset\n");
             panic_if(start != 0,
                      "Start address should be provided by KFD\n");
             panic_if(length != 8 * KFD_SIGNAL_EVENT_LIMIT,
                      "Requested length %d, expected length %d; length "
                      "mismatch\n", length, 8* KFD_SIGNAL_EVENT_LIMIT);
             /**
              * We don't actually access these pages.  We just need to reserve
              * some VA space.  See commit id 5ce8abce for details on how
              * events are currently implemented.
              */
             if (!eventPage) {
                 eventPage = mem_state->extendMmap(length);
                 start = eventPage;
             }
             break;
         default:
             warn_once("Unrecognized kfd mmap type %llx\n", mmap_type);
             break;
     }

     return start;
 }

 /**
  * Forward relevant parameters to packet processor; queueId
  * is used to link doorbell. The queueIDs are not re-used
  * in current implementation, and we allocate only one page
  * (4096 bytes) for doorbells, so check if this queueID can
  * be mapped into that page.
  */
 void
 GPUComputeDriver::allocateQueue(PortProxy &mem_proxy, Addr ioc_buf)
 {
     TypedBufferArg<kfd_ioctl_create_queue_args> args(ioc_buf);
     args.copyIn(mem_proxy);

     if ((doorbellSize() * queueId) > 4096) {
         fatal("%s: Exceeded maximum number of HSA queues allowed\n", name());
     }

     args->doorbell_offset = (KFD_MMAP_TYPE_DOORBELL |
         KFD_MMAP_GPU_ID(args->gpu_id)) << PAGE_SHIFT;

     // for vega offset needs to include exact value of doorbell
     if (doorbellSize())
         args->doorbell_offset += queueId * doorbellSize();

     args->queue_id = queueId++;
     auto &hsa_pp = device->hsaPacketProc();
     hsa_pp.setDeviceQueueDesc(args->read_pointer_address,
                               args->ring_base_address, args->queue_id,
                               args->ring_size, doorbellSize());
     args.copyOut(mem_proxy);
 }

 void
 GPUComputeDriver::DriverWakeupEvent::scheduleWakeup(Tick wakeup_delay)
 {
     assert(driver);
     driver->schedule(this, curTick() + wakeup_delay);
 }

 void
 GPUComputeDriver::signalWakeupEvent(uint32_t event_id)
 {
     panic_if(event_id >= eventSlotIndex,
         "Trying wakeup on an event that is not yet created\n");
     if (ETable[event_id].threadWaiting) {
         panic_if(!ETable[event_id].tc,
                  "No thread context to wake up\n");
         ThreadContext *tc = ETable[event_id].tc;
         DPRINTF(GPUDriver,
                 "Signal event: Waking up CPU %d\n", tc->cpuId());
         // Remove events that can wakeup this thread
         TCEvents[tc].clearEvents();
         // Now wakeup this thread
         tc->activate();
     } else {
        // This may be a race condition between an ioctl call asking to wait on
        // this event and this signalWakeupEvent. Taking care of this race
        // condition here by setting the event here. The ioctl call should take
        // the necessary action when waiting on an already set event.  However,
        // this may be a genuine instance in which the runtime has decided not
        // to wait on this event. But since we cannot distinguish this case with
        // the race condition, we are any way setting the event.
        ETable[event_id].setEvent = true;
     }
 }

 void
 GPUComputeDriver::DriverWakeupEvent::process()
 {
     DPRINTF(GPUDriver,
             "Timer event: Waking up CPU %d\n", tc->cpuId());
     // Remove events that can wakeup this thread
     driver->TCEvents[tc].clearEvents();
     // Now wakeup this thread
     tc->activate();
 }

 int
 GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
 {
     auto &virt_proxy = tc->getVirtProxy();
     auto process = tc->getProcessPtr();
     auto mem_state = process->memState;

     switch (req) {
         case AMDKFD_IOC_GET_VERSION:
           {
             DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_VERSION\n");

             TypedBufferArg<kfd_ioctl_get_version_args> args(ioc_buf);
             args->major_version = KFD_IOCTL_MAJOR_VERSION;
             args->minor_version = KFD_IOCTL_MINOR_VERSION;

             args.copyOut(virt_proxy);
           }
           break;
         case AMDKFD_IOC_CREATE_QUEUE:
           {
             DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_QUEUE\n");

             allocateQueue(virt_proxy, ioc_buf);

             DPRINTF(GPUDriver, "Creating queue %d\n", queueId);
           }
           break;
         case AMDKFD_IOC_DESTROY_QUEUE:
           {
             TypedBufferArg<kfd_ioctl_destroy_queue_args> args(ioc_buf);
             args.copyIn(virt_proxy);
             DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_DESTROY_QUEUE;" \
                     "queue offset %d\n", args->queue_id);
             device->hsaPacketProc().unsetDeviceQueueDesc(args->queue_id,
                                                          doorbellSize());
           }
           break;
         case AMDKFD_IOC_SET_MEMORY_POLICY:
           {
             /**
              * This is where the runtime requests MTYPE from an aperture.
              * Basically, the globally memory aperture is divided up into
              * a default aperture and an alternate aperture each of which have
              * their own MTYPE policies.  This is done to mark a small piece
              * of the global memory as uncacheable.  Host memory mappings will
              * be carved out of this uncacheable aperture, which is how they
              * implement 'coherent' host/device memory on dGPUs.
              *
              * TODO: Need to reflect per-aperture MTYPE policies based on this
              * call.
              *
              */
             warn("unimplemented ioctl: AMDKFD_IOC_SET_MEMORY_POLICY\n");
           }
           break;
         case AMDKFD_IOC_GET_CLOCK_COUNTERS:
           {
             DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_CLOCK_COUNTERS\n");

             TypedBufferArg<kfd_ioctl_get_clock_counters_args> args(ioc_buf);
             args.copyIn(virt_proxy);

             // Set nanosecond resolution
             args->system_clock_freq = 1000000000;

             /**
              * Derive all clock counters based on the tick. All
              * device clocks are identical and perfectly in sync.
              */
             uint64_t elapsed_nsec = curTick() / sim_clock::as_int::ns;
             args->gpu_clock_counter = elapsed_nsec;
             args->cpu_clock_counter = elapsed_nsec;
             args->system_clock_counter = elapsed_nsec;

             args.copyOut(virt_proxy);
           }
           break;
         case AMDKFD_IOC_GET_PROCESS_APERTURES:
           {
             DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES\n");

             TypedBufferArg<kfd_ioctl_get_process_apertures_args> args(ioc_buf);
             args->num_of_nodes = 1;

             /**
              * Set the GPUVM/LDS/Scratch APEs exactly as they
              * are in the real driver, see the KFD driver
              * in the ROCm Linux kernel source:
              * drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
              */
             for (int i = 0; i < args->num_of_nodes; ++i) {
                 /**
                  * While the GPU node numbers start at 0, we add 1
                  * to force the count to start at 1. This is to
                  * ensure that the base/limit addresses are
                  * calculated correctly.
                  */
                 args->process_apertures[i].scratch_base
                     = scratchApeBase(i + 1);
                 args->process_apertures[i].scratch_limit =
                     scratchApeLimit(args->process_apertures[i].scratch_base);

                 args->process_apertures[i].lds_base = ldsApeBase(i + 1);
                 args->process_apertures[i].lds_limit =
                     ldsApeLimit(args->process_apertures[i].lds_base);

                 args->process_apertures[i].gpuvm_base = gpuVmApeBase(i + 1);
                 args->process_apertures[i].gpuvm_limit =
                     gpuVmApeLimit(args->process_apertures[i].gpuvm_base);

                 // NOTE: Must match ID populated by hsaTopology.py
                 //
                 // https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/
                 // blob/6a986c0943e9acd8c4c0cf2a9d510ff42167b43f/include/uapi/
                 // linux/kfd_ioctl.h#L564
                 //
                 // The gpu_id is a device identifier used by the driver for
                 // ioctls that allocate arguments. Each device has an unique
                 // id composed out of a non-zero base and an offset.
                 if (isdGPU) {
                     switch (gfxVersion) {
                       case GfxVersion::gfx803:
                         args->process_apertures[i].gpu_id = 50156;
                         break;
                       case GfxVersion::gfx900:
                         args->process_apertures[i].gpu_id = 22124;
                         break;
                       default:
                         fatal("Invalid gfx version for dGPU\n");
                     }
                 } else {
                     switch (gfxVersion) {
                       case GfxVersion::gfx801:
                         args->process_apertures[i].gpu_id = 2765;
                         break;
                       default:
                         fatal("Invalid gfx version for APU\n");
                     }
                 }

                 DPRINTF(GPUDriver, "GPUVM base for node[%i] = %#x\n", i,
                         args->process_apertures[i].gpuvm_base);
                 DPRINTF(GPUDriver, "GPUVM limit for node[%i] = %#x\n", i,
                         args->process_apertures[i].gpuvm_limit);

                 DPRINTF(GPUDriver, "LDS base for node[%i] = %#x\n", i,
                         args->process_apertures[i].lds_base);
                 DPRINTF(GPUDriver, "LDS limit for node[%i] = %#x\n", i,
                         args->process_apertures[i].lds_limit);

                 DPRINTF(GPUDriver, "Scratch base for node[%i] = %#x\n", i,
                         args->process_apertures[i].scratch_base);
                 DPRINTF(GPUDriver, "Scratch limit for node[%i] = %#x\n", i,
                         args->process_apertures[i].scratch_limit);

                 /**
                  * The CPU's 64b address space can only use the
                  * areas with VA[63:47] == 0x1ffff or VA[63:47] == 0,
                  * therefore we must ensure that the apertures do not
                  * fall in the CPU's address space.
                  */
                 assert(bits<Addr>(args->process_apertures[i].scratch_base, 63,
                        47) != 0x1ffff);
                 assert(bits<Addr>(args->process_apertures[i].scratch_base, 63,
                        47) != 0);
                 assert(bits<Addr>(args->process_apertures[i].scratch_limit, 63,
                        47) != 0x1ffff);
                 assert(bits<Addr>(args->process_apertures[i].scratch_limit, 63,
                        47) != 0);
                 assert(bits<Addr>(args->process_apertures[i].lds_base, 63,
                        47) != 0x1ffff);
                 assert(bits<Addr>(args->process_apertures[i].lds_base, 63,
                        47) != 0);
                 assert(bits<Addr>(args->process_apertures[i].lds_limit, 63,
                        47) != 0x1ffff);
                 assert(bits<Addr>(args->process_apertures[i].lds_limit, 63,
                        47) != 0);
                 assert(bits<Addr>(args->process_apertures[i].gpuvm_base, 63,
                        47) != 0x1ffff);
                 assert(bits<Addr>(args->process_apertures[i].gpuvm_base, 63,
                        47) != 0);
                 assert(bits<Addr>(args->process_apertures[i].gpuvm_limit, 63,
                        47) != 0x1ffff);
                 assert(bits<Addr>(args->process_apertures[i].gpuvm_limit, 63,
                        47) != 0);
             }

             args.copyOut(virt_proxy);
           }
           break;
         case AMDKFD_IOC_UPDATE_QUEUE:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_UPDATE_QUEUE\n");
           }
           break;
         case AMDKFD_IOC_CREATE_EVENT:
           {
             DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_EVENT\n");

             TypedBufferArg<kfd_ioctl_create_event_args> args(ioc_buf);
             args.copyIn(virt_proxy);
             if (args->event_type != KFD_IOC_EVENT_SIGNAL) {
                 fatal("Signal events are only supported currently\n");
             } else if (eventSlotIndex == SLOTS_PER_PAGE) {
                 fatal("Signal event wasn't created; signal limit reached\n");
             }
             // Currently, we allocate only one signal_page for events.
             // Note that this signal page is of size 8 * KFD_SIGNAL_EVENT_LIMIT
             uint64_t page_index = 0;
             args->event_page_offset = (page_index | KFD_MMAP_TYPE_EVENTS);
             args->event_page_offset <<= PAGE_SHIFT;
             // TODO: Currently we support only signal events, hence using
             // the same ID for both signal slot and event slot
             args->event_slot_index = eventSlotIndex;
             args->event_id = eventSlotIndex++;
             args->event_trigger_data = args->event_id;
             DPRINTF(GPUDriver, "amdkfd create events"
                     "(event_id: 0x%x, offset: 0x%x)\n",
                     args->event_id, args->event_page_offset);
             // Since eventSlotIndex is increased everytime a new event is
             // created ETable at eventSlotIndex(event_id) is guaranteed to be
             // empty. In a future implementation that reuses deleted event_ids,
             // we should check if event table at this
             // eventSlotIndex(event_id) is empty before inserting a new event
             // table entry
             ETable.emplace(std::pair<uint32_t, ETEntry>(args->event_id, {}));
             args.copyOut(virt_proxy);
           }
           break;
         case AMDKFD_IOC_DESTROY_EVENT:
           {
             DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_DESTROY_EVENT\n");
             TypedBufferArg<kfd_ioctl_destroy_event_args> args(ioc_buf);
             args.copyIn(virt_proxy);
             DPRINTF(GPUDriver, "amdkfd destroying event %d\n", args->event_id);
             fatal_if(ETable.count(args->event_id) == 0,
                      "Event ID invalid, cannot destroy this event\n");
             ETable.erase(args->event_id);
           }
           break;
         case AMDKFD_IOC_SET_EVENT:
           {
             DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_SET_EVENTS\n");
             TypedBufferArg<kfd_ioctl_set_event_args> args(ioc_buf);
             args.copyIn(virt_proxy);
             DPRINTF(GPUDriver, "amdkfd set event %d\n", args->event_id);
             fatal_if(ETable.count(args->event_id) == 0,
                      "Event ID invlaid, cannot set this event\n");
             ETable[args->event_id].setEvent = true;
             signalWakeupEvent(args->event_id);
           }
           break;
         case AMDKFD_IOC_RESET_EVENT:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_RESET_EVENT\n");
           }
           break;
         case AMDKFD_IOC_WAIT_EVENTS:
           {
             DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_WAIT_EVENTS\n");
             TypedBufferArg<kfd_ioctl_wait_events_args> args(ioc_buf);
             args.copyIn(virt_proxy);
             kfd_event_data *events =
                 (kfd_event_data *)args->events_ptr;
             DPRINTF(GPUDriver, "amdkfd wait for events"
                     "(wait on all: %d, timeout : %d, num_events: %s)\n",
                     args->wait_for_all, args->timeout, args->num_events);
             panic_if(args->wait_for_all != 0 && args->num_events > 1,
                     "Wait for all events not supported\n");
             bool should_sleep = true;
             if (TCEvents.count(tc) == 0) {
                 // This thread context trying to wait on an event for the first
                 // time, initialize it.
                 TCEvents.emplace(std::piecewise_construct, std::make_tuple(tc),
                                  std::make_tuple(this, tc));
                 DPRINTF(GPUDriver, "\tamdkfd creating event list"
                         " for thread  %d\n", tc->cpuId());
             }
             panic_if(TCEvents[tc].signalEvents.size() != 0,
                      "There are %d events that put this thread to sleep,"
                      " this thread should not be running\n",
                      TCEvents[tc].signalEvents.size());
             for (int i = 0; i < args->num_events; i++) {
                 panic_if(!events,
                          "Event pointer invalid\n");
                 Addr eventDataAddr = (Addr)(events + i);
                 TypedBufferArg<kfd_event_data> EventData(
                     eventDataAddr, sizeof(kfd_event_data));
                 EventData.copyIn(virt_proxy);
                 DPRINTF(GPUDriver,
                         "\tamdkfd wait for event %d\n", EventData->event_id);
                 panic_if(ETable.count(EventData->event_id) == 0,
                          "Event ID invalid, cannot set this event\n");
                 panic_if(ETable[EventData->event_id].threadWaiting,
                          "Multiple threads waiting on the same event\n");
                 if (ETable[EventData->event_id].setEvent) {
                     // If event is already set, the event has already happened.
                     // Just unset the event and dont put this thread to sleep.
                     ETable[EventData->event_id].setEvent = false;
                     should_sleep = false;
                 }
                 if (should_sleep) {
                     // Put this thread to sleep
                     ETable[EventData->event_id].threadWaiting = true;
                     ETable[EventData->event_id].tc = tc;
                     TCEvents[tc].signalEvents.insert(EventData->event_id);
                 }
             }

             // TODO: Return the correct wait_result back. Currently, returning
             // success for both KFD_WAIT_TIMEOUT and KFD_WAIT_COMPLETE.
             // Ideally, this needs to be done after the event is triggered and
             // after the thread is woken up.
             args->wait_result = 0;
             args.copyOut(virt_proxy);
             if (should_sleep) {
                 // Put this thread to sleep
                 sleepCPU(tc, args->timeout);
             } else {
                 // Remove events that tried to put this thread to sleep
                 TCEvents[tc].clearEvents();
             }
           }
           break;
         case AMDKFD_IOC_DBG_REGISTER:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_DBG_REGISTER\n");
           }
           break;
         case AMDKFD_IOC_DBG_UNREGISTER:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_DBG_UNREGISTER\n");
           }
           break;
         case AMDKFD_IOC_DBG_ADDRESS_WATCH:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_DBG_ADDRESS_WATCH\n");
           }
           break;
         case AMDKFD_IOC_DBG_WAVE_CONTROL:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_DBG_WAVE_CONTROL\n");
           }
           break;
         /**
          * In real hardware, this IOCTL maps host memory, dGPU memory, or dGPU
          * doorbells into GPUVM space. Essentially, ROCm implements SVM by
          * carving out a region of free VA space that both the host and GPUVM
          * can agree upon.  The entire GPU VA space is reserved on the host
          * using a fixed mmap at a low VA range that is also directly
          * accessable by the GPU's limited number of VA bits.  When we actually
          * call memory allocation later in the program, this IOCTL is invoked
          * to create BOs/VMAs in the driver and bind them to physical
          * memory/doorbells.
          *
          * For gem5, we don't need to carve out any GPUVM space here (we don't
          * support GPUVM and use host page tables on the GPU directly). We can
          * can just use the existing host SVM region. We comment on each memory
          * type seperately.
          */
         case AMDKFD_IOC_ALLOC_MEMORY_OF_GPU:
           {
             DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n");
             TypedBufferArg<kfd_ioctl_alloc_memory_of_gpu_args> args(ioc_buf);
             args.copyIn(virt_proxy);

             assert(isdGPU);
             assert((args->va_addr % TheISA::PageBytes) == 0);
             GEM5_VAR_USED Addr mmap_offset = 0;

             Request::CacheCoherenceFlags mtype = defaultMtype;
             Addr pa_addr = 0;

             int npages = divCeil(args->size, (int64_t)TheISA::PageBytes);
             bool cacheable = true;

             if (KFD_IOC_ALLOC_MEM_FLAGS_VRAM & args->flags) {
                 DPRINTF(GPUDriver, "amdkfd allocation type: VRAM\n");
                 args->mmap_offset = args->va_addr;
                 // VRAM allocations are device memory mapped into GPUVM
                 // space.
                 //
                 // We can't rely on the lazy host allocator (fixupFault) to
                 // handle this mapping since it needs to be placed in dGPU
                 // framebuffer memory.  The lazy allocator will try to place
                 // this in host memory.
                 //
                 // TODO: We don't have the appropriate bifurcation of the
                 // physical address space with different memory controllers
                 // yet.  This is where we will explicitly add the PT maps to
                 // dGPU memory in the future.
                 //
                 // Bind the VA space to the dGPU physical memory pool.  Mark
                 // this region as Uncacheable.  The Uncacheable flag is only
                 // really used by the CPU and is ignored by the GPU. We mark
                 // this as uncacheable from the CPU so that we can implement
                 // direct CPU framebuffer access similar to what we currently
                 // offer in real HW through the so-called Large BAR feature.
                 pa_addr = process->system->allocPhysPages(npages, dGPUPoolID);
                 //
                 // TODO: Uncacheable accesses need to be supported by the
                 // CPU-side protocol for this to work correctly.  I believe
                 // it only works right now if the physical memory is MMIO
                 cacheable = false;

                 DPRINTF(GPUDriver, "Mapping VA %p to framebuffer PA %p size "
                         "%d\n", args->va_addr, pa_addr, args->size);

             } else if (KFD_IOC_ALLOC_MEM_FLAGS_USERPTR & args->flags) {
                 DPRINTF(GPUDriver, "amdkfd allocation type: USERPTR\n");
                 mmap_offset = args->mmap_offset;
                 // USERPTR allocations are system memory mapped into GPUVM
                 // space.  The user provides the driver with the pointer.
                 pa_addr = process->system->allocPhysPages(npages);

                 DPRINTF(GPUDriver, "Mapping VA %p to framebuffer PA %p size "
                         "%d\n", args->va_addr, pa_addr, args->size);

                 // If the HSA runtime requests system coherent memory, than we
                 // need to explicity mark this region as uncacheable from the
                 // perspective of the GPU.
                 if (args->flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT)
                     mtype.clear();

             } else if (KFD_IOC_ALLOC_MEM_FLAGS_GTT & args->flags) {
                 DPRINTF(GPUDriver, "amdkfd allocation type: GTT\n");
                 args->mmap_offset = args->va_addr;
                 // GTT allocations are system memory mapped into GPUVM space.
                 // It's different than a USERPTR allocation since the driver
                 // itself allocates the physical memory on the host.
                 //
                 // We will lazily map it into host memory on first touch.  The
                 // fixupFault will find the original SVM aperture mapped to the
                 // host.
                 pa_addr = process->system->allocPhysPages(npages);

                 DPRINTF(GPUDriver, "Mapping VA %p to framebuffer PA %p size "
                         "%d\n", args->va_addr, pa_addr, args->size);

                 // If the HSA runtime requests system coherent memory, than we
                 // need to explicity mark this region as uncacheable from the
                 // perspective of the GPU.
                 if (args->flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT)
                     mtype.clear();

                 // Note that for GTT the thunk layer needs to call mmap on the
                 // driver FD later if it wants the host to have access to this
                 // memory (which it probably does).  This will be ignored.
             } else if (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL & args->flags) {
                 DPRINTF(GPUDriver, "amdkfd allocation type: DOORBELL\n");
                 // DOORBELL allocations are the queue doorbells that are
                 // memory mapped into GPUVM space.
                 //
                 // Explicitly map this virtual address to our PIO doorbell
                 // interface in the page tables (non-cacheable)
                 pa_addr = device->hsaPacketProc().pioAddr;
                 cacheable = false;
             }

             DPRINTF(GPUDriver, "amdkfd allocation arguments: va_addr %p "
                     "size %lu, mmap_offset %p, gpu_id %d\n",
                     args->va_addr, args->size, mmap_offset, args->gpu_id);

             // Bind selected physical memory to provided virtual address range
             // in X86 page tables.
             process->pTable->map(args->va_addr, pa_addr, args->size,
                 cacheable);

             // We keep track of allocated regions of GPU mapped memory,
             // just like the driver would.  This allows us to provide the
             // user with a unique handle for a given allocation.  The user
             // will only provide us with a handle after allocation and expect
             // us to be able to use said handle to extract all the properties
             // of the region.
             //
             // This is a simplified version of regular system VMAs, but for
             // GPUVM space (non of the clobber/remap nonsense we find in real
             // OS managed memory).
             allocateGpuVma(mtype, args->va_addr, args->size);

             // Used by the runtime to uniquely identify this allocation.
             // We can just use the starting address of the VMA region.
             args->handle= args->va_addr;
             args.copyOut(virt_proxy);
           }
           break;
         case AMDKFD_IOC_FREE_MEMORY_OF_GPU:
           {
             DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_FREE_MEMORY_OF_GPU\n");
             TypedBufferArg<kfd_ioctl_free_memory_of_gpu_args> args(ioc_buf);
             args.copyIn(virt_proxy);

             assert(isdGPU);
             DPRINTF(GPUDriver, "amdkfd free arguments: handle %p ",
                     args->handle);

             // We don't recycle physical pages in SE mode
             Addr size = deallocateGpuVma(args->handle);
             process->pTable->unmap(args->handle, size);

             // TODO: IOMMU and GPUTLBs do not seem to correctly support
             // shootdown.  This is also a potential issue for APU systems
             // that perform unmap or remap with system memory.
             tc->getMMUPtr()->flushAll();

             args.copyOut(virt_proxy);
           }
           break;
         /**
          * Called to map an already allocated region of memory to this GPU's
          * GPUVM VA space.  We don't need to implement this in the simulator
          * since we only have a single VM system.  If the region has already
          * been allocated somewhere like the CPU, then it's already visible
          * to the device.
          */
         case AMDKFD_IOC_MAP_MEMORY_TO_GPU:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_MAP_MEMORY_TO_GPU\n");
           }
           break;
         case AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU\n");
           }
           break;
         case AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH\n");
           }
           break;
         case AMDKFD_IOC_SET_CU_MASK:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_SET_CU_MASK\n");
           }
           break;
         case AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE"
                  "\n");
           }
           break;
         case AMDKFD_IOC_SET_TRAP_HANDLER:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_SET_TRAP_HANDLER\n");
           }
           break;
         case AMDKFD_IOC_GET_PROCESS_APERTURES_NEW:
           {
             DPRINTF(GPUDriver,
                     "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES_NEW\n");

             TypedBufferArg<kfd_ioctl_get_process_apertures_new_args>
                 ioc_args(ioc_buf);

             ioc_args.copyIn(virt_proxy);
             ioc_args->num_of_nodes = 1;

             for (int i = 0; i < ioc_args->num_of_nodes; ++i) {
                 TypedBufferArg<kfd_process_device_apertures> ape_args
                     (ioc_args->kfd_process_device_apertures_ptr);

                 ape_args->scratch_base = scratchApeBase(i + 1);
                 ape_args->scratch_limit =
                     scratchApeLimit(ape_args->scratch_base);
                 ape_args->lds_base = ldsApeBase(i + 1);
                 ape_args->lds_limit = ldsApeLimit(ape_args->lds_base);
                 ape_args->gpuvm_base = gpuVmApeBase(i + 1);
                 ape_args->gpuvm_limit = gpuVmApeLimit(ape_args->gpuvm_base);

                 // NOTE: Must match ID populated by hsaTopology.py
                 if (isdGPU) {
                     switch (gfxVersion) {
                       case GfxVersion::gfx803:
                         ape_args->gpu_id = 50156;
                         break;
                       case GfxVersion::gfx900:
                         ape_args->gpu_id = 22124;
                         break;
                       default:
                         fatal("Invalid gfx version for dGPU\n");
                     }
                 } else {
                     switch (gfxVersion) {
                       case GfxVersion::gfx801:
                         ape_args->gpu_id = 2765;
                         break;
                       default:
                         fatal("Invalid gfx version for APU\n");
                     }
                 }

                 assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0x1ffff);
                 assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0);
                 assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0x1ffff);
                 assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0);
                 assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0x1ffff);
                 assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0);
                 assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0x1ffff);
                 assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0);
                 assert(bits<Addr>(ape_args->gpuvm_base, 63, 47) != 0x1ffff);
                 assert(bits<Addr>(ape_args->gpuvm_base, 63, 47) != 0);
                 assert(bits<Addr>(ape_args->gpuvm_limit, 63, 47) != 0x1ffff);
                 assert(bits<Addr>(ape_args->gpuvm_limit, 63, 47) != 0);

                 ape_args.copyOut(virt_proxy);
             }

             ioc_args.copyOut(virt_proxy);
           }
           break;
         case AMDKFD_IOC_GET_DMABUF_INFO:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_GET_DMABUF_INFO\n");
           }
           break;
         case AMDKFD_IOC_IMPORT_DMABUF:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_IMPORT_DMABUF\n");
           }
           break;
         case AMDKFD_IOC_GET_TILE_CONFIG:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_GET_TILE_CONFIG\n");
           }
           break;
         case AMDKFD_IOC_IPC_IMPORT_HANDLE:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_IPC_IMPORT_HANDLE\n");
           }
           break;
         case AMDKFD_IOC_IPC_EXPORT_HANDLE:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_IPC_EXPORT_HANDLE\n");
           }
           break;
         case AMDKFD_IOC_CROSS_MEMORY_COPY:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_CROSS_MEMORY_COPY\n");
           }
           break;
         case AMDKFD_IOC_OPEN_GRAPHIC_HANDLE:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_OPEN_GRAPHIC_HANDLE\n");
           }
           break;
         default:
           fatal("%s: bad ioctl %d\n", req);
           break;
     }
     return 0;
 }

 void
 GPUComputeDriver::sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout)
 {
     // Convert millisecs to ticks
     Tick wakeup_delay((uint64_t)milliSecTimeout * 1000000000);
     assert(TCEvents.count(tc) == 1);
     TCEvents[tc].timerEvent.scheduleWakeup(wakeup_delay);
     tc->suspend();
     DPRINTF(GPUDriver,
             "CPU %d is put to sleep\n", tc->cpuId());
 }

 Addr
 GPUComputeDriver::gpuVmApeBase(int gpuNum) const
 {
     return ((Addr)gpuNum << 61) + 0x1000000000000L;
 }

 Addr
 GPUComputeDriver::gpuVmApeLimit(Addr apeBase) const
 {
     return (apeBase & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;
 }

 Addr
 GPUComputeDriver::scratchApeBase(int gpuNum) const
 {
     return ((Addr)gpuNum << 61) + 0x100000000L;
 }

 Addr
 GPUComputeDriver::scratchApeLimit(Addr apeBase) const
 {
     return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
 }

 Addr
 GPUComputeDriver::ldsApeBase(int gpuNum) const
 {
     return ((Addr)gpuNum << 61) + 0x0;
 }

 Addr
 GPUComputeDriver::ldsApeLimit(Addr apeBase) const
 {
     return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
 }

 void
 GPUComputeDriver::allocateGpuVma(Request::CacheCoherenceFlags mtype,
                                  Addr start, Addr length)
 {
     AddrRange range = AddrRange(start, start + length - 1);
     DPRINTF(GPUDriver, "Registering [%p - %p] with MTYPE %d\n",
             range.start(), range.end(), mtype);
     fatal_if(gpuVmas.insert(range, mtype) == gpuVmas.end(),
              "Attempted to double register Mtypes for [%p - %p]\n",
              range.start(), range.end());
 }

 Addr
 GPUComputeDriver::deallocateGpuVma(Addr start)
 {
     auto vma = gpuVmas.contains(start);
     assert(vma != gpuVmas.end());
     assert((vma->first.start() == start));
     Addr size = vma->first.size();
     DPRINTF(GPUDriver, "Unregistering [%p - %p]\n", vma->first.start(),
             vma->first.end());
     gpuVmas.erase(vma);
     return size;
 }

 void
 GPUComputeDriver::setMtype(RequestPtr req)
 {
     // If we are a dGPU then set the MTYPE from our VMAs.
     if (isdGPU) {
         AddrRange range = RangeSize(req->getVaddr(), req->getSize());
         auto vma = gpuVmas.contains(range);
         assert(vma != gpuVmas.end());
         DPRINTF(GPUShader, "Setting req from [%p - %p] MTYPE %d\n"
                 "%d\n", range.start(), range.end(), vma->second);
         req->setCacheCoherenceFlags(vma->second);
     // APUs always get the default MTYPE
     } else {
         req->setCacheCoherenceFlags(defaultMtype);
     }
 }