blob: 3549c598ba18d1c9322c36120628874cf9c1fa24 [file] [log] [blame]
/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Lisa Hsu
*/
#ifndef __GPU_TLB_HH__
#define __GPU_TLB_HH__
#include <fstream>
#include <list>
#include <queue>
#include <string>
#include <vector>
#include "arch/generic/tlb.hh"
#include "arch/x86/pagetable.hh"
#include "arch/x86/pagetable_walker.hh"
#include "arch/x86/regs/segment.hh"
#include "base/callback.hh"
#include "base/misc.hh"
#include "base/statistics.hh"
#include "gpu-compute/compute_unit.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/X86GPUTLB.hh"
#include "sim/sim_object.hh"
class BaseTLB;
class Packet;
class ThreadContext;
namespace X86ISA
{
class GpuTlbEntry : public TlbEntry
{
public:
GpuTlbEntry(Addr asn, Addr _vaddr, Addr _paddr, bool _valid)
: TlbEntry(asn, _vaddr, _paddr, false, false), valid(_valid) { }
GpuTlbEntry() : TlbEntry() { }
bool valid;
};
class GpuTLB : public MemObject
{
protected:
friend class Walker;
typedef std::list<GpuTlbEntry*> EntryList;
uint32_t configAddress;
// TLB clock: will inherit clock from shader's clock period in terms
// of nuber of ticks of curTime (aka global simulation clock)
// The assignment of TLB clock from shader clock is done in the python
// config files.
int clock;
public:
// clock related functions ; maps to-and-from Simulation ticks and
// object clocks.
Tick frequency() const { return SimClock::Frequency / clock; }
Tick
ticks(int numCycles) const
{
return (Tick)clock * numCycles;
}
Tick curCycle() const { return curTick() / clock; }
Tick tickToCycles(Tick val) const { return val / clock;}
typedef X86GPUTLBParams Params;
GpuTLB(const Params *p);
~GpuTLB();
typedef enum BaseTLB::Mode Mode;
class Translation
{
public:
virtual ~Translation() { }
/**
* Signal that the translation has been delayed due to a hw page
* table walk.
*/
virtual void markDelayed() = 0;
/**
* The memory for this object may be dynamically allocated, and it
* may be responsible for cleaning itslef up which will happen in
* this function. Once it's called the object is no longer valid.
*/
virtual void finish(Fault fault, RequestPtr req, ThreadContext *tc,
Mode mode) = 0;
};
void dumpAll();
GpuTlbEntry *lookup(Addr va, bool update_lru=true);
void setConfigAddress(uint32_t addr);
protected:
EntryList::iterator lookupIt(Addr va, bool update_lru=true);
Walker *walker;
public:
Walker *getWalker();
void invalidateAll();
void invalidateNonGlobal();
void demapPage(Addr va, uint64_t asn);
protected:
int size;
int assoc;
int numSets;
/**
* true if this is a fully-associative TLB
*/
bool FA;
Addr setMask;
/**
* Allocation Policy: true if we always allocate on a hit, false
* otherwise. Default is true.
*/
bool allocationPolicy;
/**
* if true, then this is not the last level TLB
*/
bool hasMemSidePort;
/**
* Print out accessDistance stats. One stat file
* per TLB.
*/
bool accessDistance;
GpuTlbEntry *tlb;
/*
* It's a per-set list. As long as we have not reached
* the full capacity of the given set, grab an entry from
* the freeList.
*/
std::vector<EntryList> freeList;
/**
* An entryList per set is the equivalent of an LRU stack;
* it's used to guide replacement decisions. The head of the list
* contains the MRU TLB entry of the given set. If the freeList
* for this set is empty, the last element of the list
* is evicted (i.e., dropped on the floor).
*/
std::vector<EntryList> entryList;
Fault translateInt(RequestPtr req, ThreadContext *tc);
Fault translate(RequestPtr req, ThreadContext *tc,
Translation *translation, Mode mode, bool &delayedResponse,
bool timing, int &latency);
public:
// latencies for a TLB hit, miss and page fault
int hitLatency;
int missLatency1;
int missLatency2;
// local_stats are as seen from the TLB
// without taking into account coalescing
Stats::Scalar localNumTLBAccesses;
Stats::Scalar localNumTLBHits;
Stats::Scalar localNumTLBMisses;
Stats::Formula localTLBMissRate;
// global_stats are as seen from the
// CU's perspective taking into account
// all coalesced requests.
Stats::Scalar globalNumTLBAccesses;
Stats::Scalar globalNumTLBHits;
Stats::Scalar globalNumTLBMisses;
Stats::Formula globalTLBMissRate;
// from the CU perspective (global)
Stats::Scalar accessCycles;
// from the CU perspective (global)
Stats::Scalar pageTableCycles;
Stats::Scalar numUniquePages;
// from the perspective of this TLB
Stats::Scalar localCycles;
// from the perspective of this TLB
Stats::Formula localLatency;
// I take the avg. per page and then
// the avg. over all pages.
Stats::Scalar avgReuseDistance;
void regStats();
void updatePageFootprint(Addr virt_page_addr);
void printAccessPattern();
Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
int &latency);
void translateTiming(RequestPtr req, ThreadContext *tc,
Translation *translation, Mode mode,
int &latency);
Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);
GpuTlbEntry *insert(Addr vpn, GpuTlbEntry &entry);
// Checkpointing
virtual void serialize(CheckpointOut& cp) const;
virtual void unserialize(CheckpointIn& cp);
void issueTranslation();
enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats);
void handleTranslationReturn(Addr addr, tlbOutcome outcome,
PacketPtr pkt);
void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);
void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
GpuTlbEntry *tlb_entry, Mode mode);
void updatePhysAddresses(Addr virt_page_addr, GpuTlbEntry *tlb_entry,
Addr phys_page_addr);
void issueTLBLookup(PacketPtr pkt);
// CpuSidePort is the TLB Port closer to the CPU/CU side
class CpuSidePort : public SlavePort
{
public:
CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB,
PortID _index)
: SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
protected:
GpuTLB *tlb;
int index;
virtual bool recvTimingReq(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt);
virtual void recvRangeChange() { }
virtual void recvReqRetry();
virtual void recvRespRetry() { assert(false); }
virtual AddrRangeList getAddrRanges() const;
};
/**
* MemSidePort is the TLB Port closer to the memory side
* If this is a last level TLB then this port will not be connected.
*
* Future action item: if we ever do real page walks, then this port
* should be connected to a RubyPort.
*/
class MemSidePort : public MasterPort
{
public:
MemSidePort(const std::string &_name, GpuTLB * gpu_TLB,
PortID _index)
: MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
std::deque<PacketPtr> retries;
protected:
GpuTLB *tlb;
int index;
virtual bool recvTimingResp(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry();
};
// TLB ports on the cpu Side
std::vector<CpuSidePort*> cpuSidePort;
// TLB ports on the memory side
std::vector<MemSidePort*> memSidePort;
BaseMasterPort &getMasterPort(const std::string &if_name,
PortID idx=InvalidPortID);
BaseSlavePort &getSlavePort(const std::string &if_name,
PortID idx=InvalidPortID);
/**
* TLB TranslationState: this currently is a somewhat bastardization of
* the usage of SenderState, whereby the receiver of a packet is not
* usually supposed to need to look at the contents of the senderState,
* you're really only supposed to look at what you pushed on, pop it
* off, and send it back.
*
* However, since there is state that we want to pass to the TLBs using
* the send/recv Timing/Functional/etc. APIs, which don't allow for new
* arguments, we need a common TLB senderState to pass between TLBs,
* both "forwards" and "backwards."
*
* So, basically, the rule is that any packet received by a TLB port
* (cpuside OR memside) must be safely castable to a TranslationState.
*/
struct TranslationState : public Packet::SenderState
{
// TLB mode, read or write
Mode tlbMode;
// Thread context associated with this req
ThreadContext *tc;
/*
* TLB entry to be populated and passed back and filled in
* previous TLBs. Equivalent to the data cache concept of
* "data return."
*/
GpuTlbEntry *tlbEntry;
// Is this a TLB prefetch request?
bool prefetch;
// When was the req for this translation issued
uint64_t issueTime;
// Remember where this came from
std::vector<SlavePort*>ports;
// keep track of #uncoalesced reqs per packet per TLB level;
// reqCnt per level >= reqCnt higher level
std::vector<int> reqCnt;
// TLB level this packet hit in; 0 if it hit in the page table
int hitLevel;
Packet::SenderState *saved;
TranslationState(Mode tlb_mode, ThreadContext *_tc,
bool _prefetch=false,
Packet::SenderState *_saved=nullptr)
: tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
prefetch(_prefetch), issueTime(0),
hitLevel(0),saved(_saved) { }
};
// maximum number of permitted coalesced requests per cycle
int maxCoalescedReqs;
// Current number of outstandings coalesced requests.
// Should be <= maxCoalescedReqs
int outstandingReqs;
/**
* A TLBEvent is scheduled after the TLB lookup and helps us take the
* appropriate actions:
* (e.g., update TLB on a hit,
* send request to lower level TLB on a miss,
* or start a page walk if this was the last-level TLB).
*/
void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
PacketPtr pkt);
class TLBEvent : public Event
{
private:
GpuTLB *tlb;
Addr virtPageAddr;
/**
* outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
*/
tlbOutcome outcome;
PacketPtr pkt;
public:
TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
PacketPtr _pkt);
void process();
const char *description() const;
// updateOutcome updates the tlbOutcome of a TLBEvent
void updateOutcome(tlbOutcome _outcome);
Addr getTLBEventVaddr();
};
std::unordered_map<Addr, TLBEvent*> translationReturnEvent;
// this FIFO queue keeps track of the virt. page addresses
// that are pending cleanup
std::queue<Addr> cleanupQueue;
// the cleanupEvent is scheduled after a TLBEvent triggers in order to
// free memory and do the required clean-up
void cleanup();
EventWrapper<GpuTLB, &GpuTLB::cleanup> cleanupEvent;
/**
* This hash map will use the virtual page address as a key
* and will keep track of total number of accesses per page
*/
struct AccessInfo
{
unsigned int lastTimeAccessed; // last access to this page
unsigned int accessesPerPage;
// need to divide it by accessesPerPage at the end
unsigned int totalReuseDistance;
/**
* The field below will help us compute the access distance,
* that is the number of (coalesced) TLB accesses that
* happened in between each access to this page
*
* localTLBAccesses[x] is the value of localTLBNumAccesses
* when the page <Addr> was accessed for the <x>th time
*/
std::vector<unsigned int> localTLBAccesses;
unsigned int sumDistance;
unsigned int meanDistance;
};
typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
AccessPatternTable TLBFootprint;
// Called at the end of simulation to dump page access stats.
void exitCallback();
EventWrapper<GpuTLB, &GpuTLB::exitCallback> exitEvent;
};
}
#endif // __GPU_TLB_HH__