// src/gpu-compute/gpu_compute_driver.hh - public/gem5 - Git at Google

 /*
  * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * 1. Redistributions of source code must retain the above copyright notice,
  * this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright notice,
  * this list of conditions and the following disclaimer in the documentation
  * and/or other materials provided with the distribution.
  *
  * 3. Neither the name of the copyright holder nor the names of its
  * contributors may be used to endorse or promote products derived from this
  * software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */

 /**
  * @file
  * The GPUComputeDriver implements an HSADriver for an HSA AMD GPU
  * agent. Other GPU devices, or other HSA agents, should not derive
  * from this class. Instead device-specific implementations of an
  * HSADriver should be provided for each unique device.
  */

 #ifndef __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
 #define __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__

 #include <cassert>
 #include <cstdint>
 #include <set>
 #include <unordered_map>

 #include "base/addr_range_map.hh"
 #include "base/types.hh"
 #include "enums/GfxVersion.hh"
 #include "mem/request.hh"
 #include "sim/emul_driver.hh"

 namespace gem5
 {

 struct GPUComputeDriverParams;
 class GPUCommandProcessor;
 class PortProxy;
 class ThreadContext;

 class GPUComputeDriver final : public EmulatedDriver
 {
   public:
     typedef GPUComputeDriverParams Params;
     GPUComputeDriver(const Params &p);
     int ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) override;

     int open(ThreadContext *tc, int mode, int flags) override;
     Addr mmap(ThreadContext *tc, Addr start, uint64_t length,
               int prot, int tgt_flags, int tgt_fd, off_t offset) override;
     virtual void signalWakeupEvent(uint32_t event_id);
     void sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout);
     /**
      * Called by the compute units right before a request is issued to ruby.
      * This uses our VMAs to correctly set the MTYPE on a per-request basis.
      * In real hardware, this is actually done through PTE bits in GPUVM.
      * Since we are running a single VM (x86 PT) system, the MTYPE bits aren't
      * available.  Adding GPUVM specific bits to x86 page tables probably
      * isn't the best way to proceed.  For now we just have the driver set
      * these until we implement a proper dual PT system.
      */
     void setMtype(RequestPtr req);

     int
     doorbellSize()
     {
         switch (gfxVersion) {
           case GfxVersion::gfx801:
           case GfxVersion::gfx803:
           case GfxVersion::gfx902:
             return 4;
           case GfxVersion::gfx900:
             // gfx900 supports large BAR, so it has a larger doorbell
             return 8;
           default:
             fatal("Invalid GPU type\n");
         }
         return 4;
     }

     class DriverWakeupEvent : public Event
     {
       public:
         DriverWakeupEvent(GPUComputeDriver *gpu_driver,
                           ThreadContext *thrd_cntxt)
           : driver(gpu_driver), tc(thrd_cntxt) {}
         void process() override;
         const char *description() const override;
         void scheduleWakeup(Tick wakeup_delay);
       private:
         GPUComputeDriver *driver;
         ThreadContext *tc;
     };

     class EventTableEntry
     {
       public:
         EventTableEntry() :
             mailBoxPtr(0), tc(nullptr), threadWaiting(false), setEvent(false)
         {}
         // Mail box pointer for this address. Current implementation does not
         // use this mailBoxPtr to notify events but directly calls
         // signalWakeupEvent from dispatcher (GPU) to notifiy events. So,
         // currently this mailBoxPtr is not used. But a future implementation
         // may communicate to the driver using mailBoxPtr.
         Addr mailBoxPtr;
         // Thread context waiting on this even. We do not support multiple
         // threads waiting on an event currently.
         ThreadContext *tc;
         // threadWaiting = true, if some thread context is waiting on this
         // event. A thread context waiting on this event is put to sleep.
         bool threadWaiting;
         // setEvent = true, if this event is triggered but when this event
         // triggered, no thread context was waiting on it. In the future, some
         // thread context will try to wait on this event but since event has
         // already happened, we will not allow that thread context to go to
         // sleep. The above mentioned scneario can happen when the waiting
         // thread and wakeup thread race on this event and the wakeup thread
         // beat the waiting thread at the driver.
         bool setEvent;
     };
     typedef class EventTableEntry ETEntry;

   private:
     /**
      * GPU that is controlled by this driver.
      */
     GPUCommandProcessor *device;
     uint32_t queueId;
     bool isdGPU;
     GfxVersion gfxVersion;
     int dGPUPoolID;
     Addr eventPage;
     uint32_t eventSlotIndex;
     //Event table that keeps track of events. It is indexed with event ID.
     std::unordered_map<uint32_t, ETEntry> ETable;

     /**
      * VMA structures for GPUVM memory.
      */
     AddrRangeMap<Request::CacheCoherenceFlags, 1> gpuVmas;

     /**
      * Mtype bits {Cached, Read Write, Shared} for caches
      */
     enum MtypeFlags
     {
         SHARED                  = 0,
         READ_WRITE              = 1,
         CACHED                  = 2,
         NUM_MTYPE_BITS
     };

     Request::CacheCoherenceFlags defaultMtype;

     // TCEvents map keeps trak of the events that can wakeup this thread. When
     // multiple events can wake up this thread, this data structure helps to
     // reset all events when one of those events wake up this thread. the
     // signal events that can wake up this thread are stored in signalEvents
     // whereas the timer wakeup event is stored in timerEvent.
     class EventList
     {
       public:
         EventList() : driver(nullptr), timerEvent(nullptr, nullptr) {}
         EventList(GPUComputeDriver *gpu_driver, ThreadContext *thrd_cntxt)
             : driver(gpu_driver), timerEvent(gpu_driver, thrd_cntxt)
         { }
         void clearEvents() {
             assert(driver);
             for (auto event : signalEvents) {
                 assert(event < driver->eventSlotIndex);
                 driver->ETable[event].tc = nullptr;
                 driver->ETable[event].threadWaiting = false;
             }
             signalEvents.clear();
             if (timerEvent.scheduled()) {
                 driver->deschedule(timerEvent);
             }
         }
         GPUComputeDriver *driver;
         DriverWakeupEvent timerEvent;
         // The set of events that can wake up the same thread.
         std::set<uint32_t> signalEvents;
     };
     std::unordered_map<ThreadContext *, EventList> TCEvents;

     /**
      * Register a region of host memory as uncacheable from the perspective
      * of the dGPU.
      */
     void registerUncacheableMemory(Addr start, Addr length);

     /**
      * The aperture (APE) base/limit pairs are set
      * statically at startup by the real KFD. AMD
      * x86_64 CPUs only use the areas in the 64b
      * address space where VA[63:47] == 0x1ffff or
      * VA[63:47] = 0. These methods generate the APE
      * base/limit pairs in exactly the same way as
      * the real KFD does, which ensures these APEs do
      * not fall into the CPU's address space
      *
      * see the macros in the KFD driver in the ROCm
      * Linux kernel source:
      *
      * drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
      */
     Addr gpuVmApeBase(int gpuNum) const;
     Addr gpuVmApeLimit(Addr apeBase) const;
     Addr scratchApeBase(int gpuNum) const;
     Addr scratchApeBaseV9() const;
     Addr scratchApeLimit(Addr apeBase) const;
     Addr ldsApeBase(int gpuNum) const;
     Addr ldsApeBaseV9() const;
     Addr ldsApeLimit(Addr apeBase) const;

     /**
      * Allocate/deallocate GPUVM VMAs for tracking virtual address allocations
      * and properties on DGPUs.  For now, we use these to track MTYPE and to
      * be able to select which pages to unmap when the user provides us with
      * a handle during the free ioctl.
      */
     void allocateGpuVma(Request::CacheCoherenceFlags mtype, Addr start,
                         Addr length);
     Addr deallocateGpuVma(Addr start);

     void allocateQueue(PortProxy &mem_proxy, Addr ioc_buf_addr);

 };

 } // namespace gem5

 #endif // __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
	/*
	* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	*
	* 1. Redistributions of source code must retain the above copyright notice,
	* this list of conditions and the following disclaimer.
	*
	* 2. Redistributions in binary form must reproduce the above copyright notice,
	* this list of conditions and the following disclaimer in the documentation
	* and/or other materials provided with the distribution.
	*
	* 3. Neither the name of the copyright holder nor the names of its
	* contributors may be used to endorse or promote products derived from this
	* software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	* POSSIBILITY OF SUCH DAMAGE.
	*/

	/**
	* @file
	* The GPUComputeDriver implements an HSADriver for an HSA AMD GPU
	* agent. Other GPU devices, or other HSA agents, should not derive
	* from this class. Instead device-specific implementations of an
	* HSADriver should be provided for each unique device.
	*/

	#ifndef __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
	#define __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__

	#include <cassert>
	#include <cstdint>
	#include <set>
	#include <unordered_map>

	#include "base/addr_range_map.hh"
	#include "base/types.hh"
	#include "enums/GfxVersion.hh"
	#include "mem/request.hh"
	#include "sim/emul_driver.hh"

	namespace gem5
	{

	struct GPUComputeDriverParams;
	class GPUCommandProcessor;
	class PortProxy;
	class ThreadContext;

	class GPUComputeDriver final : public EmulatedDriver
	{
	public:
	typedef GPUComputeDriverParams Params;
	GPUComputeDriver(const Params &p);
	int ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) override;

	int open(ThreadContext *tc, int mode, int flags) override;
	Addr mmap(ThreadContext *tc, Addr start, uint64_t length,
	int prot, int tgt_flags, int tgt_fd, off_t offset) override;
	virtual void signalWakeupEvent(uint32_t event_id);
	void sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout);
	/**
	* Called by the compute units right before a request is issued to ruby.
	* This uses our VMAs to correctly set the MTYPE on a per-request basis.
	* In real hardware, this is actually done through PTE bits in GPUVM.
	* Since we are running a single VM (x86 PT) system, the MTYPE bits aren't
	* available. Adding GPUVM specific bits to x86 page tables probably
	* isn't the best way to proceed. For now we just have the driver set
	* these until we implement a proper dual PT system.
	*/
	void setMtype(RequestPtr req);

	int
	doorbellSize()
	{
	switch (gfxVersion) {
	case GfxVersion::gfx801:
	case GfxVersion::gfx803:
	case GfxVersion::gfx902:
	return 4;
	case GfxVersion::gfx900:
	// gfx900 supports large BAR, so it has a larger doorbell
	return 8;
	default:
	fatal("Invalid GPU type\n");
	}
	return 4;
	}

	class DriverWakeupEvent : public Event
	{
	public:
	DriverWakeupEvent(GPUComputeDriver *gpu_driver,
	ThreadContext *thrd_cntxt)
	: driver(gpu_driver), tc(thrd_cntxt) {}
	void process() override;
	const char *description() const override;
	void scheduleWakeup(Tick wakeup_delay);
	private:
	GPUComputeDriver *driver;
	ThreadContext *tc;
	};

	class EventTableEntry
	{
	public:
	EventTableEntry() :
	mailBoxPtr(0), tc(nullptr), threadWaiting(false), setEvent(false)
	{}
	// Mail box pointer for this address. Current implementation does not
	// use this mailBoxPtr to notify events but directly calls
	// signalWakeupEvent from dispatcher (GPU) to notifiy events. So,
	// currently this mailBoxPtr is not used. But a future implementation
	// may communicate to the driver using mailBoxPtr.
	Addr mailBoxPtr;
	// Thread context waiting on this even. We do not support multiple
	// threads waiting on an event currently.
	ThreadContext *tc;
	// threadWaiting = true, if some thread context is waiting on this
	// event. A thread context waiting on this event is put to sleep.
	bool threadWaiting;
	// setEvent = true, if this event is triggered but when this event
	// triggered, no thread context was waiting on it. In the future, some
	// thread context will try to wait on this event but since event has
	// already happened, we will not allow that thread context to go to
	// sleep. The above mentioned scneario can happen when the waiting
	// thread and wakeup thread race on this event and the wakeup thread
	// beat the waiting thread at the driver.
	bool setEvent;
	};
	typedef class EventTableEntry ETEntry;

	private:
	/**
	* GPU that is controlled by this driver.
	*/
	GPUCommandProcessor *device;
	uint32_t queueId;
	bool isdGPU;
	GfxVersion gfxVersion;
	int dGPUPoolID;
	Addr eventPage;
	uint32_t eventSlotIndex;
	//Event table that keeps track of events. It is indexed with event ID.
	std::unordered_map<uint32_t, ETEntry> ETable;

	/**
	* VMA structures for GPUVM memory.
	*/
	AddrRangeMap<Request::CacheCoherenceFlags, 1> gpuVmas;

	/**
	* Mtype bits {Cached, Read Write, Shared} for caches
	*/
	enum MtypeFlags
	{
	SHARED = 0,
	READ_WRITE = 1,
	CACHED = 2,
	NUM_MTYPE_BITS
	};

	Request::CacheCoherenceFlags defaultMtype;

	// TCEvents map keeps trak of the events that can wakeup this thread. When
	// multiple events can wake up this thread, this data structure helps to
	// reset all events when one of those events wake up this thread. the
	// signal events that can wake up this thread are stored in signalEvents
	// whereas the timer wakeup event is stored in timerEvent.
	class EventList
	{
	public:
	EventList() : driver(nullptr), timerEvent(nullptr, nullptr) {}
	EventList(GPUComputeDriver gpu_driver, ThreadContext thrd_cntxt)
	: driver(gpu_driver), timerEvent(gpu_driver, thrd_cntxt)
	{ }
	void clearEvents() {
	assert(driver);
	for (auto event : signalEvents) {
	assert(event < driver->eventSlotIndex);
	driver->ETable[event].tc = nullptr;
	driver->ETable[event].threadWaiting = false;
	}
	signalEvents.clear();
	if (timerEvent.scheduled()) {
	driver->deschedule(timerEvent);
	}
	}
	GPUComputeDriver *driver;
	DriverWakeupEvent timerEvent;
	// The set of events that can wake up the same thread.
	std::set<uint32_t> signalEvents;
	};
	std::unordered_map<ThreadContext *, EventList> TCEvents;

	/**
	* Register a region of host memory as uncacheable from the perspective
	* of the dGPU.
	*/
	void registerUncacheableMemory(Addr start, Addr length);

	/**
	* The aperture (APE) base/limit pairs are set
	* statically at startup by the real KFD. AMD
	* x86_64 CPUs only use the areas in the 64b
	* address space where VA[63:47] == 0x1ffff or
	* VA[63:47] = 0. These methods generate the APE
	* base/limit pairs in exactly the same way as
	* the real KFD does, which ensures these APEs do
	* not fall into the CPU's address space
	*
	* see the macros in the KFD driver in the ROCm
	* Linux kernel source:
	*
	* drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
	*/
	Addr gpuVmApeBase(int gpuNum) const;
	Addr gpuVmApeLimit(Addr apeBase) const;
	Addr scratchApeBase(int gpuNum) const;
	Addr scratchApeBaseV9() const;
	Addr scratchApeLimit(Addr apeBase) const;
	Addr ldsApeBase(int gpuNum) const;
	Addr ldsApeBaseV9() const;
	Addr ldsApeLimit(Addr apeBase) const;

	/**
	* Allocate/deallocate GPUVM VMAs for tracking virtual address allocations
	* and properties on DGPUs. For now, we use these to track MTYPE and to
	* be able to select which pages to unmap when the user provides us with
	* a handle during the free ioctl.
	*/
	void allocateGpuVma(Request::CacheCoherenceFlags mtype, Addr start,
	Addr length);
	Addr deallocateGpuVma(Addr start);

	void allocateQueue(PortProxy &mem_proxy, Addr ioc_buf_addr);

	};

	} // namespace gem5

	#endif // __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__