// src/gpu-compute/gpu_tlb.cc - arm/gem5 - Git at Google

 /*
  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
  * For use for simulation and test purposes only
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * 1. Redistributions of source code must retain the above copyright notice,
  * this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright notice,
  * this list of conditions and the following disclaimer in the documentation
  * and/or other materials provided with the distribution.
  *
  * 3. Neither the name of the copyright holder nor the names of its contributors
  * may be used to endorse or promote products derived from this software
  * without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * Author: Lisa Hsu
  */

 #include "gpu-compute/gpu_tlb.hh"

 #include <cmath>
 #include <cstring>

 #include "arch/x86/faults.hh"
 #include "arch/x86/insts/microldstop.hh"
 #include "arch/x86/pagetable.hh"
 #include "arch/x86/pagetable_walker.hh"
 #include "arch/x86/regs/misc.hh"
 #include "arch/x86/x86_traits.hh"
 #include "base/bitfield.hh"
 #include "base/output.hh"
 #include "base/trace.hh"
 #include "cpu/base.hh"
 #include "cpu/thread_context.hh"
 #include "debug/GPUPrefetch.hh"
 #include "debug/GPUTLB.hh"
 #include "mem/packet_access.hh"
 #include "mem/page_table.hh"
 #include "mem/request.hh"
 #include "sim/process.hh"

 namespace X86ISA
 {

     /**
      * Construct the GPU TLB from its parameter object.
      *
      * Allocates `size` entries, splits them into size/assoc sets, places
      * every entry on the free list of its set, copies the latency and
      * coalescing parameters and creates one CpuSidePort / MemSidePort per
      * connected slave / master port.
      *
      * @param p parameter object supplying size, assoc, latencies, the
      *          allocation policy and the port connection counts
      */
     GpuTLB::GpuTLB(const Params *p)
         : MemObject(p), configAddress(0), size(p->size),
           cleanupEvent([this]{ cleanup(); }, name(), false,
                        Event::Maximum_Pri),
           exitEvent([this]{ exitCallback(); }, name())
     {
         assoc = p->assoc;
         // numSets is computed as size/assoc, so a zero associativity has to
         // be rejected before the division.
         assert(assoc > 0);
         assert(assoc <= size);
         numSets = size/assoc;
         allocationPolicy = p->allocationPolicy;
         hasMemSidePort = false;
         accessDistance = p->accessDistance;
         clock = p->clk_domain->clockPeriod();

         tlb.assign(size, GpuTlbEntry());

         freeList.resize(numSets);
         entryList.resize(numSets);

         // Initially every way of every set is free.
         for (int set = 0; set < numSets; ++set) {
             for (int way = 0; way < assoc; ++way) {
                 int x = set * assoc + way;
                 freeList[set].push_back(&tlb.at(x));
             }
         }

         // A single set means the TLB is fully associative.
         FA = (size == assoc);

         /**
          * @warning: the set-associative version assumes you have a
          * fixed page size of 4KB.
          * If the page size is greater than 4KB (as defined in the
          * TheISA::PageBytes), then there are various issues w/ the current
          * implementation (you'd have the same 8KB page being replicated in
          * different sets etc)
          *
          * The mask is ANDed with the page number to pick a set, so it only
          * reaches every set when numSets is a power of two.
          */
         setMask = numSets - 1;

         // No page-table walker is attached (see the disabled code below);
         // give the pointer a defined value so getWalker() never returns an
         // indeterminate pointer.
         walker = nullptr;

     #if 0
         // GpuTLB doesn't yet support full system
         walker = p->walker;
         walker->setTLB(this);
     #endif

         maxCoalescedReqs = p->maxOutstandingReqs;

         // Do not allow maxCoalescedReqs to be more than the TLB associativity
         if (maxCoalescedReqs > assoc) {
             maxCoalescedReqs = assoc;
             cprintf("Forcing maxCoalescedReqs to %d (TLB assoc.) \n", assoc);
         }

         outstandingReqs = 0;
         hitLatency = p->hitLatency;
         missLatency1 = p->missLatency1;
         missLatency2 = p->missLatency2;

         // create the slave ports based on the number of connected ports
         for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
             cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d",
                                   name(), i), this, i));
         }

         // create the master ports based on the number of connected ports
         for (size_t i = 0; i < p->port_master_connection_count; ++i) {
             memSidePort.push_back(new MemSidePort(csprintf("%s-port%d",
                                   name(), i), this, i));
         }
     }

     /**
      * Destructor.
      *
      * Only checks, in builds where assert() is active, that
      * translationReturnEvent is empty; nothing is released here.
      */
     // fixme: this is never called?
     GpuTLB::~GpuTLB()
     {
         // make sure all the hash-maps are empty
         assert(translationReturnEvent.empty());
     }

     /**
      * Return the CPU-side port with the given index.
      *
      * @param if_name interface name; only "slave" is accepted
      * @param idx index into cpuSidePort
      * @return reference to the selected CPU-side port
      *
      * Panics when the interface name is not "slave" or when idx does not
      * address an existing port.
      */
     BaseSlavePort&
     GpuTLB::getSlavePort(const std::string &if_name, PortID idx)
     {
         if (if_name != "slave") {
             panic("GpuTLB::getSlavePort: unknown port %s\n", if_name);
         }

         // A negative index would pass a plain ">= size" comparison and then
         // index the vector out of bounds, so it is rejected explicitly.
         if (idx < 0 || static_cast<size_t>(idx) >= cpuSidePort.size()) {
             panic("GpuTLB::getSlavePort: unknown index %d\n", idx);
         }

         return *cpuSidePort[idx];
     }

     /**
      * Return the memory-side port with the given index and record that a
      * memory-side port has been requested (hasMemSidePort).
      *
      * @param if_name interface name; only "master" is accepted
      * @param idx index into memSidePort
      * @return reference to the selected memory-side port
      *
      * Panics when the interface name is not "master" or when idx does not
      * address an existing port.
      */
     BaseMasterPort&
     GpuTLB::getMasterPort(const std::string &if_name, PortID idx)
     {
         if (if_name != "master") {
             panic("GpuTLB::getMasterPort: unknown port %s\n", if_name);
         }

         // A negative index would pass a plain ">= size" comparison and then
         // index the vector out of bounds, so it is rejected explicitly.
         if (idx < 0 || static_cast<size_t>(idx) >= memSidePort.size()) {
             panic("GpuTLB::getMasterPort: unknown index %d\n", idx);
         }

         hasMemSidePort = true;

         return *memSidePort[idx];
     }

     /**
      * Install a translation in the set selected by vpn.
      *
      * A free way is used when the set has one; otherwise the entry at the
      * back of the set's entry list is recycled. The new entry is placed at
      * the front of the entry list.
      *
      * @param vpn virtual page address; stored as the entry's vaddr
      * @param entry contents copied into the chosen slot
      * @return pointer to the slot that now holds the translation
      */
     GpuTlbEntry*
     GpuTLB::insert(Addr vpn, GpuTlbEntry &entry)
     {
         // The set index is taken from the page-number bits; the low
         // (page-offset) bits are shifted away.
         const int set_idx = (vpn >> TheISA::PageShift) & setMask;
         auto &free_slots = freeList[set_idx];
         auto &used_slots = entryList[set_idx];

         GpuTlbEntry *slot = nullptr;

         if (free_slots.empty()) {
             // No spare way in this set: evict from the back of the list.
             slot = used_slots.back();
             used_slots.pop_back();
         } else {
             slot = free_slots.front();
             free_slots.pop_front();
         }

         *slot = entry;
         slot->vaddr = vpn;
         used_slots.push_front(slot);

         return slot;
     }

     /**
      * Search the set selected by va for an entry whose page covers va.
      *
      * @param va virtual address to look up
      * @param update_lru when true, a matching entry is moved to the front
      *        of its set's entry list
      * @return iterator to the matching entry, or the end() iterator of the
      *         set's entry list when nothing matches
      */
     GpuTLB::EntryList::iterator
     GpuTLB::lookupIt(Addr va, bool update_lru)
     {
         const int set_idx = (va >> TheISA::PageShift) & setMask;

         // A fully-associative TLB must always resolve to set 0.
         assert(!FA || !set_idx);

         auto &used_slots = entryList[set_idx];
         auto it = used_slots.begin();

         for (; it != used_slots.end(); ++it) {
             GpuTlbEntry *candidate = *it;
             int page_size = candidate->size();

             const bool covers = candidate->vaddr <= va &&
                                 candidate->vaddr + page_size > va;
             if (!covers)
                 continue;

             DPRINTF(GPUTLB, "Matched vaddr %#x to entry starting at %#x "
                     "with size %#x.\n", va, candidate->vaddr, page_size);

             if (update_lru) {
                 // Re-insert the hit at the front and drop its old node.
                 used_slots.push_front(candidate);
                 used_slots.erase(it);
                 it = used_slots.begin();
             }

             break;
         }

         return it;
     }

     /**
      * Pointer-returning wrapper around lookupIt().
      *
      * @param va virtual address to look up
      * @param update_lru forwarded to lookupIt()
      * @return the matching entry, or nullptr when va is not in the TLB
      */
     GpuTlbEntry*
     GpuTLB::lookup(Addr va, bool update_lru)
     {
         auto match = lookupIt(va, update_lru);

         // lookupIt() reports a miss with the end() iterator of va's set.
         const int set_idx = (va >> TheISA::PageShift) & setMask;

         return match == entryList[set_idx].end() ? nullptr : *match;
     }

     void
     GpuTLB::invalidateAll()
     {
         DPRINTF(GPUTLB, "Invalidating all entries.\n");

         for (int i = 0; i < numSets; ++i) {
             while (!entryList[i].empty()) {
                 GpuTlbEntry *entry = entryList[i].front();
                 entryList[i].pop_front();
                 freeList[i].push_back(entry);
             }
         }
     }

     /**
      * Store addr in the configAddress member.
      *
      * @param addr new value for configAddress
      */
     void
     GpuTLB::setConfigAddress(uint32_t addr)
     {
         this->configAddress = addr;
     }

     void
     GpuTLB::invalidateNonGlobal()
     {
         DPRINTF(GPUTLB, "Invalidating all non global entries.\n");

         for (int i = 0; i < numSets; ++i) {
             for (auto entryIt = entryList[i].begin();
                  entryIt != entryList[i].end();) {
                 if (!(*entryIt)->global) {
                     freeList[i].push_back(*entryIt);
                     entryList[i].erase(entryIt++);
                 } else {
                     ++entryIt;
                 }
             }
         }
     }

     /**
      * Remove the translation covering va, if the TLB holds one, and return
      * its slot to the free list of the set. The LRU order is not touched
      * by the lookup.
      *
      * @param va virtual address whose mapping should be dropped
      * @param asn not used by this implementation
      */
     void
     GpuTLB::demapPage(Addr va, uint64_t asn)
     {
         const int set_idx = (va >> TheISA::PageShift) & setMask;
         auto victim = lookupIt(va, false);

         // Nothing to do when va is not cached.
         if (victim == entryList[set_idx].end())
             return;

         freeList[set_idx].push_back(*victim);
         entryList[set_idx].erase(victim);
     }

     /**
      * Translate an access to the internal (non-memory) address space.
      *
      * The address prefix selects the space:
      *  - CPUID: not implemented, panics.
      *  - MSR: the MSR number is mapped to a MISCREG_* index, the request
      *    is flagged MMAPPED_IPR and its paddr is set to
      *    index * sizeof(MiscReg). Unknown MSR numbers yield a
      *    GeneralProtection fault.
      *  - IO: port 0xCF8 with a 4-byte access is mapped to the PCI config
      *    address register; ports 0xCFC-0xCFF go to PCI config space when
      *    bit 31 of that register is set, otherwise to IO space; every other
      *    port goes to IO space. These accesses are flagged UNCACHEABLE,
      *    except the 0xCF8 case which is flagged MMAPPED_IPR.
      *  - anything else panics.
      *
      * @param req request to translate; flags and paddr are updated
      * @param tc thread context, used to read MISCREG_PCI_CONFIG_ADDRESS
      * @return NoFault, or a GeneralProtection fault for an unknown MSR
      */
     Fault
     GpuTLB::translateInt(RequestPtr req, ThreadContext *tc)
     {
         DPRINTF(GPUTLB, "Addresses references internal memory.\n");
         Addr vaddr = req->getVaddr();
         Addr prefix = (vaddr >> 3) & IntAddrPrefixMask;

         if (prefix == IntAddrPrefixCPUID) {
             panic("CPUID memory space not yet implemented!\n");
         } else if (prefix == IntAddrPrefixMSR) {
             // The MSR number is the shifted address with the prefix removed.
             const Addr msr_index = (vaddr >> 3) & ~IntAddrPrefixMask;
             req->setFlags(Request::MMAPPED_IPR);
             Addr misc_reg = 0;

             switch (msr_index) {
               case 0x10: misc_reg = MISCREG_TSC; break;
               case 0x1B: misc_reg = MISCREG_APIC_BASE; break;
               case 0xFE: misc_reg = MISCREG_MTRRCAP; break;
               case 0x174: misc_reg = MISCREG_SYSENTER_CS; break;
               case 0x175: misc_reg = MISCREG_SYSENTER_ESP; break;
               case 0x176: misc_reg = MISCREG_SYSENTER_EIP; break;
               case 0x179: misc_reg = MISCREG_MCG_CAP; break;
               case 0x17A: misc_reg = MISCREG_MCG_STATUS; break;
               case 0x17B: misc_reg = MISCREG_MCG_CTL; break;
               case 0x1D9: misc_reg = MISCREG_DEBUG_CTL_MSR; break;
               case 0x1DB: misc_reg = MISCREG_LAST_BRANCH_FROM_IP; break;
               case 0x1DC: misc_reg = MISCREG_LAST_BRANCH_TO_IP; break;
               case 0x1DD: misc_reg = MISCREG_LAST_EXCEPTION_FROM_IP; break;
               case 0x1DE: misc_reg = MISCREG_LAST_EXCEPTION_TO_IP; break;
               case 0x200: misc_reg = MISCREG_MTRR_PHYS_BASE_0; break;
               case 0x201: misc_reg = MISCREG_MTRR_PHYS_MASK_0; break;
               case 0x202: misc_reg = MISCREG_MTRR_PHYS_BASE_1; break;
               case 0x203: misc_reg = MISCREG_MTRR_PHYS_MASK_1; break;
               case 0x204: misc_reg = MISCREG_MTRR_PHYS_BASE_2; break;
               case 0x205: misc_reg = MISCREG_MTRR_PHYS_MASK_2; break;
               case 0x206: misc_reg = MISCREG_MTRR_PHYS_BASE_3; break;
               case 0x207: misc_reg = MISCREG_MTRR_PHYS_MASK_3; break;
               case 0x208: misc_reg = MISCREG_MTRR_PHYS_BASE_4; break;
               case 0x209: misc_reg = MISCREG_MTRR_PHYS_MASK_4; break;
               case 0x20A: misc_reg = MISCREG_MTRR_PHYS_BASE_5; break;
               case 0x20B: misc_reg = MISCREG_MTRR_PHYS_MASK_5; break;
               case 0x20C: misc_reg = MISCREG_MTRR_PHYS_BASE_6; break;
               case 0x20D: misc_reg = MISCREG_MTRR_PHYS_MASK_6; break;
               case 0x20E: misc_reg = MISCREG_MTRR_PHYS_BASE_7; break;
               case 0x20F: misc_reg = MISCREG_MTRR_PHYS_MASK_7; break;
               case 0x250: misc_reg = MISCREG_MTRR_FIX_64K_00000; break;
               case 0x258: misc_reg = MISCREG_MTRR_FIX_16K_80000; break;
               case 0x259: misc_reg = MISCREG_MTRR_FIX_16K_A0000; break;
               case 0x268: misc_reg = MISCREG_MTRR_FIX_4K_C0000; break;
               case 0x269: misc_reg = MISCREG_MTRR_FIX_4K_C8000; break;
               case 0x26A: misc_reg = MISCREG_MTRR_FIX_4K_D0000; break;
               case 0x26B: misc_reg = MISCREG_MTRR_FIX_4K_D8000; break;
               case 0x26C: misc_reg = MISCREG_MTRR_FIX_4K_E0000; break;
               case 0x26D: misc_reg = MISCREG_MTRR_FIX_4K_E8000; break;
               case 0x26E: misc_reg = MISCREG_MTRR_FIX_4K_F0000; break;
               case 0x26F: misc_reg = MISCREG_MTRR_FIX_4K_F8000; break;
               case 0x277: misc_reg = MISCREG_PAT; break;
               case 0x2FF: misc_reg = MISCREG_DEF_TYPE; break;
               case 0x400: misc_reg = MISCREG_MC0_CTL; break;
               case 0x404: misc_reg = MISCREG_MC1_CTL; break;
               case 0x408: misc_reg = MISCREG_MC2_CTL; break;
               case 0x40C: misc_reg = MISCREG_MC3_CTL; break;
               case 0x410: misc_reg = MISCREG_MC4_CTL; break;
               case 0x414: misc_reg = MISCREG_MC5_CTL; break;
               case 0x418: misc_reg = MISCREG_MC6_CTL; break;
               case 0x41C: misc_reg = MISCREG_MC7_CTL; break;
               case 0x401: misc_reg = MISCREG_MC0_STATUS; break;
               case 0x405: misc_reg = MISCREG_MC1_STATUS; break;
               case 0x409: misc_reg = MISCREG_MC2_STATUS; break;
               case 0x40D: misc_reg = MISCREG_MC3_STATUS; break;
               case 0x411: misc_reg = MISCREG_MC4_STATUS; break;
               case 0x415: misc_reg = MISCREG_MC5_STATUS; break;
               case 0x419: misc_reg = MISCREG_MC6_STATUS; break;
               case 0x41D: misc_reg = MISCREG_MC7_STATUS; break;
               case 0x402: misc_reg = MISCREG_MC0_ADDR; break;
               case 0x406: misc_reg = MISCREG_MC1_ADDR; break;
               case 0x40A: misc_reg = MISCREG_MC2_ADDR; break;
               case 0x40E: misc_reg = MISCREG_MC3_ADDR; break;
               case 0x412: misc_reg = MISCREG_MC4_ADDR; break;
               case 0x416: misc_reg = MISCREG_MC5_ADDR; break;
               case 0x41A: misc_reg = MISCREG_MC6_ADDR; break;
               case 0x41E: misc_reg = MISCREG_MC7_ADDR; break;
               case 0x403: misc_reg = MISCREG_MC0_MISC; break;
               case 0x407: misc_reg = MISCREG_MC1_MISC; break;
               case 0x40B: misc_reg = MISCREG_MC2_MISC; break;
               case 0x40F: misc_reg = MISCREG_MC3_MISC; break;
               case 0x413: misc_reg = MISCREG_MC4_MISC; break;
               case 0x417: misc_reg = MISCREG_MC5_MISC; break;
               case 0x41B: misc_reg = MISCREG_MC6_MISC; break;
               case 0x41F: misc_reg = MISCREG_MC7_MISC; break;
               case 0xC0000080: misc_reg = MISCREG_EFER; break;
               case 0xC0000081: misc_reg = MISCREG_STAR; break;
               case 0xC0000082: misc_reg = MISCREG_LSTAR; break;
               case 0xC0000083: misc_reg = MISCREG_CSTAR; break;
               case 0xC0000084: misc_reg = MISCREG_SF_MASK; break;
               case 0xC0000100: misc_reg = MISCREG_FS_BASE; break;
               case 0xC0000101: misc_reg = MISCREG_GS_BASE; break;
               case 0xC0000102: misc_reg = MISCREG_KERNEL_GS_BASE; break;
               case 0xC0000103: misc_reg = MISCREG_TSC_AUX; break;
               case 0xC0010000: misc_reg = MISCREG_PERF_EVT_SEL0; break;
               case 0xC0010001: misc_reg = MISCREG_PERF_EVT_SEL1; break;
               case 0xC0010002: misc_reg = MISCREG_PERF_EVT_SEL2; break;
               case 0xC0010003: misc_reg = MISCREG_PERF_EVT_SEL3; break;
               case 0xC0010004: misc_reg = MISCREG_PERF_EVT_CTR0; break;
               case 0xC0010005: misc_reg = MISCREG_PERF_EVT_CTR1; break;
               case 0xC0010006: misc_reg = MISCREG_PERF_EVT_CTR2; break;
               case 0xC0010007: misc_reg = MISCREG_PERF_EVT_CTR3; break;
               case 0xC0010010: misc_reg = MISCREG_SYSCFG; break;
               case 0xC0010016: misc_reg = MISCREG_IORR_BASE0; break;
               case 0xC0010017: misc_reg = MISCREG_IORR_BASE1; break;
               case 0xC0010018: misc_reg = MISCREG_IORR_MASK0; break;
               case 0xC0010019: misc_reg = MISCREG_IORR_MASK1; break;
               case 0xC001001A: misc_reg = MISCREG_TOP_MEM; break;
               case 0xC001001D: misc_reg = MISCREG_TOP_MEM2; break;
               case 0xC0010114: misc_reg = MISCREG_VM_CR; break;
               case 0xC0010115: misc_reg = MISCREG_IGNNE; break;
               case 0xC0010116: misc_reg = MISCREG_SMM_CTL; break;
               case 0xC0010117: misc_reg = MISCREG_VM_HSAVE_PA; break;
               default:
                 return std::make_shared<GeneralProtection>(0);
             }

             // Scale the index by sizeof(MiscReg) so that memory dependence
             // calculations never see two registers as overlapping.
             req->setPaddr(misc_reg * sizeof(MiscReg));
             return NoFault;
         } else if (prefix == IntAddrPrefixIO) {
             // TODO If CPL > IOPL or in virtual mode, check the I/O permission
             // bitmap in the TSS.

             Addr io_port = vaddr & ~IntAddrPrefixMask;
             // The port number has to fit in the 16 bit IO address space.
             assert(!(io_port & ~0xFFFF));

             if (io_port == 0xCF8 && req->getSize() == 4) {
                 // 4-byte access to the PCI configuration address register.
                 req->setFlags(Request::MMAPPED_IPR);
                 req->setPaddr(MISCREG_PCI_CONFIG_ADDRESS * sizeof(MiscReg));
             } else if ((io_port & ~mask(2)) == 0xCFC) {
                 // PCI configuration data window (ports 0xCFC-0xCFF).
                 req->setFlags(Request::UNCACHEABLE);

                 Addr pci_cfg_addr =
                     tc->readMiscRegNoEffect(MISCREG_PCI_CONFIG_ADDRESS);

                 if (bits(pci_cfg_addr, 31, 31)) {
                     req->setPaddr(PhysAddrPrefixPciConfig |
                                   mbits(pci_cfg_addr, 30, 2) |
                                   (io_port & mask(2)));
                 } else {
                     req->setPaddr(PhysAddrPrefixIO | io_port);
                 }
             } else {
                 req->setFlags(Request::UNCACHEABLE);
                 req->setPaddr(PhysAddrPrefixIO | io_port);
             }

             return NoFault;
         } else {
             panic("Access to unrecognized internal address space %#x.\n",
                   prefix);
         }
     }

     /**
      * tlbLookup only performs a TLB lookup, returning true on a TLB hit
      * and false on a TLB miss; it never fills the TLB.
      * Many of the checks about different modes have been converted to
      * assertions, since these parts of the code are not really used.
      * On a hit the LRU stack is updated, whatever update_stats is.
      *
      * @param req request whose virtual address is looked up
      * @param tc thread context used to read MISCREG_M5_REG
      * @param update_stats when false (functional accesses such as memory
      *        or instruction seeding) the access/hit/miss counters are left
      *        untouched
      * @return true on a hit; false on a miss or when protected mode or
      *         paging is disabled
      */
     bool
     GpuTLB::tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats)
     {
     #ifndef NDEBUG
         uint32_t flags = req->getFlags();
         int seg = flags & SegmentFlagMask;
     #endif

         assert(seg != SEGMENT_REG_MS);
         Addr vaddr = req->getVaddr();
         DPRINTF(GPUTLB, "TLB Lookup for vaddr %#x.\n", vaddr);
         HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);

         // Without protected mode there is nothing to look up.
         if (!m5Reg.prot)
             return false;

         DPRINTF(GPUTLB, "In protected mode.\n");
         // make sure we are in 64-bit mode
         assert(m5Reg.mode == LongMode);

         // The TLB is only consulted when paging is enabled.
         if (!m5Reg.paging)
             return false;

         DPRINTF(GPUTLB, "Paging enabled.\n");

         // update LRU stack on a hit
         const bool tlb_hit = lookup(vaddr, true) != nullptr;

         if (update_stats) {
             localNumTLBAccesses++;

             if (tlb_hit) {
                 localNumTLBHits++;
             } else {
                 localNumTLBMisses++;
             }
         }

         return tlb_hit;
     }

     /**
      * Translate the virtual address of a request into a physical address.
      *
      * Requests flagged with SEGMENT_REG_MS are handed to translateInt().
      * Otherwise, in protected mode, segment checks are performed when not
      * in long mode and, with paging enabled, the TLB is consulted; a miss
      * is serviced from the process page table (full-system mode is fatal).
      * With paging disabled, or in real mode, the virtual address is used as
      * the physical address.
      *
      * @param req request to translate; its paddr and flags are updated
      * @param tc thread context used to read the misc registers
      * @param translation not used by this function
      * @param mode access type (read, write or execute)
      * @param delayedResponse always set to false by this function
      * @param timing when true, latency is updated on the paging path
      * @param latency hitLatency on a hit; missLatency1 on a miss, plus
      *        missLatency2 when a non-execute access also misses in the
      *        page table. Only written when timing is true.
      * @return NoFault on success, otherwise a GeneralProtection or
      *         PageFault fault
      */
     Fault
     GpuTLB::translate(RequestPtr req, ThreadContext *tc,
                       Translation *translation, Mode mode,
                       bool &delayedResponse, bool timing, int &latency)
     {
         uint32_t flags = req->getFlags();
         int seg = flags & SegmentFlagMask;
         bool storeCheck = flags & (StoreCheck << FlagShift);

         // Set the out-parameter before any return path: callers such as
         // translateTiming() read it even when the request is routed to
         // translateInt() below.
         delayedResponse = false;

         // If this is true, we're dealing with a request
         // to a non-memory address space.
         if (seg == SEGMENT_REG_MS) {
             return translateInt(req, tc);
         }

         Addr vaddr = req->getVaddr();
         DPRINTF(GPUTLB, "Translating vaddr %#x.\n", vaddr);

         HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);

         // If protected mode has been enabled...
         if (m5Reg.prot) {
             DPRINTF(GPUTLB, "In protected mode.\n");
             // If we're not in 64-bit mode, do protection/limit checks
             if (m5Reg.mode != LongMode) {
                 DPRINTF(GPUTLB, "Not in long mode. Checking segment "
                         "protection.\n");

                 // Check for a null segment selector.
                 if (!(seg == SEGMENT_REG_TSG || seg == SYS_SEGMENT_REG_IDTR ||
                     seg == SEGMENT_REG_HS || seg == SEGMENT_REG_LS)
                     && !tc->readMiscRegNoEffect(MISCREG_SEG_SEL(seg))) {
                     return std::make_shared<GeneralProtection>(0);
                 }

                 bool expandDown = false;
                 SegAttr attr = tc->readMiscRegNoEffect(MISCREG_SEG_ATTR(seg));

                 // Read/write permission checks for the ES..HS segments.
                 if (seg >= SEGMENT_REG_ES && seg <= SEGMENT_REG_HS) {
                     if (!attr.writable && (mode == BaseTLB::Write ||
                         storeCheck))
                         return std::make_shared<GeneralProtection>(0);

                     if (!attr.readable && mode == BaseTLB::Read)
                         return std::make_shared<GeneralProtection>(0);

                     expandDown = attr.expandDown;

                 }

                 Addr base = tc->readMiscRegNoEffect(MISCREG_SEG_BASE(seg));
                 Addr limit = tc->readMiscRegNoEffect(MISCREG_SEG_LIMIT(seg));
                 // This assumes we're not in 64 bit mode. If we were, the
                 // default address size is 64 bits, overridable to 32.
                 int size = 32;
                 bool sizeOverride = (flags & (AddrSizeFlagBit << FlagShift));
                 SegAttr csAttr = tc->readMiscRegNoEffect(MISCREG_CS_ATTR);

                 if ((csAttr.defaultSize && sizeOverride) ||
                     (!csAttr.defaultSize && !sizeOverride)) {
                     size = 16;
                 }

                 // Offsets of the first and last byte within the segment.
                 Addr offset = bits(vaddr - base, size - 1, 0);
                 Addr endOffset = offset + req->getSize() - 1;

                 if (expandDown) {
                     DPRINTF(GPUTLB, "Checking an expand down segment.\n");
                     warn_once("Expand down segments are untested.\n");

                     if (offset <= limit || endOffset <= limit)
                         return std::make_shared<GeneralProtection>(0);
                 } else {
                     if (offset > limit || endOffset > limit)
                         return std::make_shared<GeneralProtection>(0);
                 }
             }

             // If paging is enabled, do the translation.
             if (m5Reg.paging) {
                 DPRINTF(GPUTLB, "Paging enabled.\n");
                 // The vaddr already has the segment base applied.
                 GpuTlbEntry *entry = lookup(vaddr);
                 localNumTLBAccesses++;

                 if (!entry) {
                     localNumTLBMisses++;
                     if (timing) {
                         latency = missLatency1;
                     }

                     if (FullSystem) {
                         fatal("GpuTLB doesn't support full-system mode\n");
                     } else {
                         DPRINTF(GPUTLB, "Handling a TLB miss for address %#x "
                                 "at pc %#x.\n", vaddr, tc->instAddr());

                         // Service the miss from the process page table.
                         Process *p = tc->getProcessPtr();
                         const EmulationPageTable::Entry *pte =
                             p->pTable->lookup(vaddr);

                         if (!pte && mode != BaseTLB::Execute) {
                             // penalize a "page fault" more
                             if (timing)
                                 latency += missLatency2;

                             // Retry the lookup if the stack fix-up
                             // reports success.
                             if (p->fixupStackFault(vaddr))
                                 pte = p->pTable->lookup(vaddr);
                         }

                         if (!pte) {
                             return std::make_shared<PageFault>(vaddr, true,
                                                                mode, true,
                                                                false);
                         } else {
                             Addr alignedVaddr = p->pTable->pageAlign(vaddr);

                             DPRINTF(GPUTLB, "Mapping %#x to %#x\n",
                                     alignedVaddr, pte->paddr);

                             GpuTlbEntry gpuEntry(
                                 p->pTable->pid(), alignedVaddr,
                                 pte->paddr, true);
                             entry = insert(alignedVaddr, gpuEntry);
                         }

                         DPRINTF(GPUTLB, "Miss was serviced.\n");
                     }
                 } else {
                     localNumTLBHits++;

                     if (timing) {
                         latency = hitLatency;
                     }
                 }

                 // Do paging protection checks.
                 bool inUser = (m5Reg.cpl == 3 &&
                                !(flags & (CPL0FlagBit << FlagShift)));

                 CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
                 bool badWrite = (!entry->writable && (inUser || cr0.wp));

                 if ((inUser && !entry->user) || (mode == BaseTLB::Write &&
                      badWrite)) {
                     // The page must have been present to get into the TLB in
                     // the first place. We'll assume the reserved bits are
                     // fine even though we're not checking them.
                     return std::make_shared<PageFault>(vaddr, true, mode,
                                                        inUser, false);
                 }

                 if (storeCheck && badWrite) {
                     // This would fault if this were a write, so return a page
                     // fault that reflects that happening.
                     return std::make_shared<PageFault>(vaddr, true,
                                                        BaseTLB::Write,
                                                        inUser, false);
                 }


                 DPRINTF(GPUTLB, "Entry found with paddr %#x, doing protection "
                         "checks.\n", entry->paddr);

                 // Combine the page frame with the offset inside the page.
                 int page_size = entry->size();
                 Addr paddr = entry->paddr | (vaddr & (page_size - 1));
                 DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
                 req->setPaddr(paddr);

                 if (entry->uncacheable)
                     req->setFlags(Request::UNCACHEABLE);
             } else {
                 //Use the address which already has segmentation applied.
                 DPRINTF(GPUTLB, "Paging disabled.\n");
                 DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
                 req->setPaddr(vaddr);
             }
         } else {
             // Real mode
             DPRINTF(GPUTLB, "In real mode.\n");
             DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
             req->setPaddr(vaddr);
         }

         // Check for an access to the local APIC
         if (FullSystem) {
             LocalApicBase localApicBase =
                 tc->readMiscRegNoEffect(MISCREG_APIC_BASE);

             Addr baseAddr = localApicBase.base * PageBytes;
             Addr paddr = req->getPaddr();

             if (baseAddr <= paddr && baseAddr + PageBytes > paddr) {
                 // Force the access to be uncacheable.
                 req->setFlags(Request::UNCACHEABLE);
                 req->setPaddr(x86LocalAPICAddress(tc->contextId(),
                                                   paddr - baseAddr));
             }
         }

         return NoFault;
     }

     /**
      * Translate a request without a Translation callback object.
      *
      * Forwards to GpuTLB::translate() with a null translation pointer and
      * `false` in the slot where translateTiming() passes `true`, and hands
      * back whatever Fault translate() returns.
      */
     Fault
     GpuTLB::translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
                             int &latency)
     {
         // translate() needs somewhere to report a delayed response; the
         // value is not consulted afterwards in this path.
         bool response_delayed;

         Fault result = GpuTLB::translate(req, tc, nullptr, mode,
                                          response_delayed, false, latency);

         return result;
     }

     /**
      * Translate a request on behalf of a Translation callback object.
      *
      * Runs GpuTLB::translate() and, unless translate() reports that the
      * response is delayed, immediately completes the translation by calling
      * translation->finish() with the resulting Fault.
      */
     void
     GpuTLB::translateTiming(RequestPtr req, ThreadContext *tc,
             Translation *translation, Mode mode, int &latency)
     {
         bool response_delayed;
         assert(translation);

         Fault result = GpuTLB::translate(req, tc, translation, mode,
                                          response_delayed, true, latency);

         // A delayed response is finished elsewhere; nothing more to do here.
         if (response_delayed)
             return;

         translation->finish(result, req, tc, mode);
     }

     /**
      * Accessor for the walker pointer held by this TLB.
      *
      * @return the `walker` member, unchanged.
      */
     Walker*
     GpuTLB::getWalker()
     {
         return walker;
     }


     /**
      * Checkpoint-out hook. Currently a no-op: nothing is written to `cp`.
      */
     void
     GpuTLB::serialize(CheckpointOut &cp) const
     {
     }

     /**
      * Checkpoint-in hook. Currently a no-op: nothing is read from `cp`.
      */
     void
     GpuTLB::unserialize(CheckpointIn &cp)
     {
     }

     /**
      * Register this TLB's statistics.
      *
      * Calls the MemObject base implementation first, then gives every stat
      * a name (prefixed with this object's name()) and a description, and
      * defines the derived stats (miss rates and average latency) in terms of
      * the raw counters.
      */
     void
     GpuTLB::regStats()
     {
         MemObject::regStats();

         localNumTLBAccesses
             .name(name() + ".local_TLB_accesses")
             .desc("Number of TLB accesses")
             ;

         localNumTLBHits
             .name(name() + ".local_TLB_hits")
             .desc("Number of TLB hits")
             ;

         localNumTLBMisses
             .name(name() + ".local_TLB_misses")
             .desc("Number of TLB misses")
             ;

         localTLBMissRate
             .name(name() + ".local_TLB_miss_rate")
             .desc("TLB miss rate")
             ;

         accessCycles
             .name(name() + ".access_cycles")
             .desc("Cycles spent accessing this TLB level")
             ;

         pageTableCycles
             .name(name() + ".page_table_cycles")
             .desc("Cycles spent accessing the page table")
             ;

         // Local miss rate, expressed as a percentage of local accesses.
         localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses;

         numUniquePages
             .name(name() + ".unique_pages")
             .desc("Number of unique pages touched")
             ;

         localCycles
             .name(name() + ".local_cycles")
             .desc("Number of cycles spent in queue for all incoming reqs")
             ;

         localLatency
             .name(name() + ".local_latency")
             .desc("Avg. latency over incoming coalesced reqs")
             ;

         // Average latency: accumulated local cycles per local access.
         localLatency = localCycles / localNumTLBAccesses;

         globalNumTLBAccesses
             .name(name() + ".global_TLB_accesses")
             .desc("Number of TLB accesses")
             ;

         globalNumTLBHits
             .name(name() + ".global_TLB_hits")
             .desc("Number of TLB hits")
             ;

         globalNumTLBMisses
             .name(name() + ".global_TLB_misses")
             .desc("Number of TLB misses")
             ;

         globalTLBMissRate
             .name(name() + ".global_TLB_miss_rate")
             .desc("TLB miss rate")
             ;

         // Global miss rate, expressed as a percentage of global accesses.
         globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses;

         avgReuseDistance
             .name(name() + ".avg_reuse_distance")
             .desc("avg. reuse distance over all pages (in ticks)")
             ;

     }

     /**
      * Perform the TLB lookup for a coalesced translation request and
      * schedule a TLBEvent hitLatency cycles later to act on the result.
      *
      * On a hit, a copy of the matching entry is attached to the packet's
      * TranslationState. Statistics are only touched for non-prefetch
      * requests. Panics if the page already has a pending return event.
      */
     void
     GpuTLB::issueTLBLookup(PacketPtr pkt)
     {
         assert(pkt);
         assert(pkt->senderState);

         const Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
                                               TheISA::PageBytes);

         TranslationState *state =
             safe_cast<TranslationState*>(pkt->senderState);

         // Prefetches do not contribute to the statistics.
         const bool count_stats = !state->prefetch;
         ThreadContext *thread = state->tc;

         DPRINTF(GPUTLB, "Translation req. for virt. page addr %#x\n",
                 virt_page_addr);

         const int num_reqs = state->reqCnt.back();

         if (count_stats) {
             // The start tick is subtracted here; translationReturn() adds
             // the tick at which the lookup resolves.
             accessCycles -= (curTick() * num_reqs);
             localCycles -= curTick();
             updatePageFootprint(virt_page_addr);
             globalNumTLBAccesses += num_reqs;
         }

         RequestPtr request = pkt->req;

         // Access the TLB and figure out if it's a hit or a miss.
         const bool hit = tlbLookup(request, thread, count_stats);
         const tlbOutcome lookup_outcome = hit ? TLB_HIT : TLB_MISS;

         if (hit) {
             // Hand a private copy of the entry to the requester through
             // the sender state.
             GpuTlbEntry *found = lookup(request->getVaddr(), false);
             assert(found);

             state->tlbEntry =
                 new GpuTlbEntry(0, found->vaddr, found->paddr, found->valid);
         }

         if (count_stats) {
             if (hit) {
                 // the reqCnt has an entry per level, so its size tells us
                 // which level we are in
                 state->hitLevel = state->reqCnt.size();
                 globalNumTLBHits += num_reqs;
             } else {
                 globalNumTLBMisses += num_reqs;
             }
         }

         /*
          * The lookup outcome and the access latency are now known. Create
          * the TLBEvent that will later take the appropriate action (finish
          * a hit, forward a miss to the lower-level TLB, or start a page
          * walk if this is the last-level TLB).
          */
         TLBEvent *tlb_event =
             new TLBEvent(this, virt_page_addr, lookup_outcome, pkt);

         if (translationReturnEvent.count(virt_page_addr)) {
             panic("Virtual Page Address %#x already has a return event\n",
                   virt_page_addr);
         }

         translationReturnEvent[virt_page_addr] = tlb_event;
         assert(tlb_event);

         const auto when = curTick() + this->ticks(hitLatency);

         DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n",
                 when);

         schedule(tlb_event, when);
     }

     /**
      * Build an event that remembers the owning TLB, the virtual page
      * address, the lookup outcome and the packet being translated.
      * The event is created with CPU_Tick_Pri priority.
      */
     GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, tlbOutcome tlb_outcome,
                                PacketPtr _pkt)
         : Event(CPU_Tick_Pri),
           tlb(_tlb),
           virtPageAddr(_addr),
           outcome(tlb_outcome),
           pkt(_pkt)
     {
     }

     /**
      * Do paging protection checks for a translated access.
      *
      * This path cannot deliver a page fault, so any protection violation
      * is treated as fatal. The violation is reported with panic() rather
      * than assert(false) so that it is also caught in builds compiled with
      * NDEBUG, where an assert would be compiled out and the access would
      * silently proceed.
      *
      * @param tc        thread context used to read the M5 and CR0 registers
      * @param pkt       packet whose request flags are inspected
      * @param tlb_entry entry providing the `user` and `writable` bits
      * @param mode      access mode; only BaseTLB::Write is special-cased
      */
     void
     GpuTLB::pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
             GpuTlbEntry * tlb_entry, Mode mode)
     {
         HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
         uint32_t flags = pkt->req->getFlags();
         bool storeCheck = flags & (StoreCheck << FlagShift);

         // User-level access: CPL 3 and the request is not forced to CPL0.
         bool inUser = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift)));
         CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);

         // A write to a read-only page is illegal for user code, and for
         // any code when CR0.WP is set.
         bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp));

         if ((inUser && !tlb_entry->user) ||
             (mode == BaseTLB::Write && badWrite)) {
             // The page must have been present to get into the TLB in
             // the first place. We'll assume the reserved bits are
             // fine even though we're not checking them.
             panic("Page fault detected for vaddr %#x\n",
                   pkt->req->getVaddr());
         }

         if (storeCheck && badWrite) {
             // This would fault if this were a write; there is no way to
             // report a fault from here, so stop the simulation.
             panic("Page fault detected (store check) for vaddr %#x\n",
                   pkt->req->getVaddr());
         }
     }

     /**
      * Finish a timing translation and send the response to the coalescer.
      *
      * Called on a TLB hit, when a miss returns from a lower level, or after
      * a page walk; the latter two cases arrive with TLB_MISS as the
      * outcome. On a miss the entry carried in the sender state is inserted
      * into this TLB when allocationPolicy is set.
      */
     void
     GpuTLB::handleTranslationReturn(Addr virt_page_addr, tlbOutcome tlb_outcome,
             PacketPtr pkt)
     {
         assert(pkt);
         const Addr vaddr = pkt->req->getVaddr();

         TranslationState *state =
             safe_cast<TranslationState*>(pkt->senderState);

         ThreadContext *tc = state->tc;
         Mode mode = state->tlbMode;

         // In every case the sender state carries the entry to use.
         GpuTlbEntry *entry = state->tlbEntry;

         if (tlb_outcome == TLB_HIT) {
             DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", vaddr);
         } else {
             DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n",
                     vaddr);

             // We are returning either from a page walk or from a hit at a
             // lower TLB level, so the carried entry must exist.
             assert(entry);

             if (allocationPolicy) {
                 DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
                         virt_page_addr);

                 // From here on use the copy that lives in this TLB.
                 entry = insert(virt_page_addr, *entry);
             }

             assert(entry);
         }

         /**
          * The packet now has an up-to-date entry; run the paging
          * protection checks before computing the physical address.
          */
         DPRINTF(GPUTLB, "Entry found with vaddr %#x,  doing protection checks "
                 "while paddr was %#x.\n", entry->vaddr,
                 entry->paddr);

         pagingProtectionChecks(tc, pkt, entry, mode);

         // Combine the page frame with the offset inside the page.
         const int page_size = entry->size();
         const Addr paddr = entry->paddr | (vaddr & (page_size - 1));
         DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);

         // Since this packet will be sent through the cpu side slave port,
         // it must be converted to a response pkt if it is not one already
         if (pkt->isRequest())
             pkt->makeTimingResponse();

         pkt->req->setPaddr(paddr);

         if (entry->uncacheable)
             pkt->req->setFlags(Request::UNCACHEABLE);

         // send packet back to coalescer
         cpuSidePort[0]->sendTimingResp(pkt);

         // queue this page for the cleanup event
         cleanupQueue.push(virt_page_addr);

         // Schedule the cleanup only once per cycle; several translations
         // may return in the same cycle.
         // This is a maximum priority event and must be on the same cycle
         // as the cleanup event in TLBCoalescer to avoid a race with
         // IssueProbeEvent caused by TLBCoalescer::MemSidePort::recvReqRetry
         if (!cleanupEvent.scheduled())
             schedule(cleanupEvent, curTick());
     }

     /**
      * Take the appropriate action for a fired TLBEvent, based on the
      * outcome it carries:
      *  - TLB_HIT:     finish the translation.
      *  - TLB_MISS:    forward the packet to the lower-level TLB if there is
      *                 one, otherwise re-schedule the event as a PAGE_WALK
      *                 missLatency2 cycles later.
      *  - PAGE_WALK:   look the address up in the process page table, attach
      *                 the resulting (possibly invalid) entry to the sender
      *                 state and finish the translation.
      *  - MISS_RETURN: a lower level answered; finish the translation.
      *
      * Statistics are only updated for non-prefetch requests.
      */
     void
     GpuTLB::translationReturn(Addr virtPageAddr, tlbOutcome outcome,
                               PacketPtr pkt)
     {
         DPRINTF(GPUTLB, "Triggered TLBEvent for addr %#x\n", virtPageAddr);

         assert(translationReturnEvent[virtPageAddr]);
         assert(pkt);

         TranslationState *sender_state =
             safe_cast<TranslationState*>(pkt->senderState);

         int req_cnt = sender_state->reqCnt.back();
         bool update_stats = !sender_state->prefetch;

         if (outcome == TLB_HIT) {
             handleTranslationReturn(virtPageAddr, TLB_HIT, pkt);

             if (update_stats) {
                 // Completes the interval opened in issueTLBLookup().
                 accessCycles += (req_cnt * curTick());
                 localCycles += curTick();
             }

         } else if (outcome == TLB_MISS) {

             DPRINTF(GPUTLB, "This is a TLB miss\n");
             if (update_stats) {
                 accessCycles += (req_cnt*curTick());
                 localCycles += curTick();
             }

             if (hasMemSidePort) {
                 // the one cycle added here represents the delay from when
                 // we get the reply back till when we propagate it to the
                 // coalescer above.
                 if (update_stats) {
                     accessCycles += (req_cnt * 1);
                     localCycles += 1;
                 }

                 /**
                  * There is a TLB below. Send the coalesced request.
                  * We actually send the very first packet of all the
                  * pending packets for this virtual page address.
                  */
                 if (!memSidePort[0]->sendTimingReq(pkt)) {
                     DPRINTF(GPUTLB, "Failed sending translation request to "
                             "lower level TLB for addr %#x\n", virtPageAddr);

                     memSidePort[0]->retries.push_back(pkt);
                 } else {
                     DPRINTF(GPUTLB, "Sent translation request to lower level "
                             "TLB for addr %#x\n", virtPageAddr);
                 }
             } else {
                 //this is the last level TLB. Start a page walk
                 DPRINTF(GPUTLB, "Last level TLB - start a page walk for "
                         "addr %#x\n", virtPageAddr);

                 if (update_stats)
                     pageTableCycles -= (req_cnt*curTick());

                 // Reuse the pending event: it fires again as a PAGE_WALK.
                 TLBEvent *tlb_event = translationReturnEvent[virtPageAddr];
                 assert(tlb_event);
                 tlb_event->updateOutcome(PAGE_WALK);
                 schedule(tlb_event, curTick() + ticks(missLatency2));
             }
         } else if (outcome == PAGE_WALK) {
             if (update_stats)
                 pageTableCycles += (req_cnt*curTick());

             // Need to access the page table and update the TLB
             DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
                     virtPageAddr);

             Process *p = sender_state->tc->getProcessPtr();
             Addr vaddr = pkt->req->getVaddr();

             // The event's page address must be the page-aligned vaddr.
             assert(p->pTable->pageAlign(vaddr) == virtPageAddr);

             const EmulationPageTable::Entry *pte = p->pTable->lookup(vaddr);
             if (!pte && sender_state->tlbMode != BaseTLB::Execute &&
                     p->fixupStackFault(vaddr)) {
                 // The stack was grown to cover vaddr; look it up again.
                 pte = p->pTable->lookup(vaddr);
             }

             if (pte) {
                 // Print virtPageAddr (asserted equal to the aligned vaddr
                 // above) so this line does not depend on a variable that
                 // only exists when NDEBUG is undefined.
                 DPRINTF(GPUTLB, "Mapping %#x to %#x\n", virtPageAddr,
                         pte->paddr);

                 sender_state->tlbEntry =
                     new GpuTlbEntry(0, virtPageAddr, pte->paddr, true);
             } else {
                 // No mapping: hand back an entry marked invalid.
                 sender_state->tlbEntry =
                     new GpuTlbEntry(0, 0, 0, false);
             }

             handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
         } else if (outcome == MISS_RETURN) {
             /** we add an extra cycle in the return path of the translation
              * requests in between the various TLB levels.
              */
             handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
         } else {
             assert(false);
         }
     }

     /**
      * Event callback: forwards the stored page address, outcome and packet
      * to the owning TLB's translationReturn().
      */
     void
     GpuTLB::TLBEvent::process()
     {
         tlb->translationReturn(virtPageAddr, outcome, pkt);
     }

     /**
      * @return a fixed, human-readable description string for this event.
      */
     const char*
     GpuTLB::TLBEvent::description() const
     {
         return "trigger translationDoneEvent";
     }

     /**
      * Replace the outcome this event will report when it is processed.
      * Used when the same event is re-scheduled for a later stage
      * (e.g. PAGE_WALK or MISS_RETURN).
      */
     void
     GpuTLB::TLBEvent::updateOutcome(tlbOutcome _outcome)
     {
         outcome = _outcome;
     }

     /**
      * @return the virtual page address this event was created for.
      */
     Addr
     GpuTLB::TLBEvent::getTLBEventVaddr()
     {
         return virtPageAddr;
     }

     /*
      * recvTimingReq receives a coalesced timing request from a
      * TLBCoalescer and hands it to issueTLBLookup().
      * The packet is only rejected when the TLB already has
      * maxCoalescedReqs translation requests outstanding.
      */
     bool
     GpuTLB::CpuSidePort::recvTimingReq(PacketPtr pkt)
     {
         // Refuse the request while the TLB is saturated.
         if (tlb->outstandingReqs >= tlb->maxCoalescedReqs) {
             DPRINTF(GPUTLB, "Reached maxCoalescedReqs number %d\n",
                     tlb->outstandingReqs);
             return false;
         }

         tlb->issueTLBLookup(pkt);

         // update number of outstanding translation requests
         tlb->outstandingReqs++;

         return true;
     }

     /**
      * Finish a functional translation.
      *
      * Called on a TLB hit, when a miss returns from a lower level, or after
      * a page walk. On a miss the carried entry is inserted into this TLB
      * when allocationPolicy is set. Protection checks are run for demand
      * accesses with a valid entry, and the physical address is written
      * into the request. No new packet is created here.
      */
     void
     GpuTLB::handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome tlb_outcome)
     {
         TranslationState *state =
             safe_cast<TranslationState*>(pkt->senderState);

         ThreadContext *tc = state->tc;
         Mode mode = state->tlbMode;
         const Addr vaddr = pkt->req->getVaddr();

         // In every case the sender state carries the entry to use.
         GpuTlbEntry *entry = state->tlbEntry;

         if (tlb_outcome == TLB_HIT) {
             DPRINTF(GPUTLB, "Functional Translation Done - TLB hit for addr "
                     "%#x\n", vaddr);
         } else {
             DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr "
                     "%#x\n", vaddr);

             // We are returning either from a page walk or from a hit at a
             // lower TLB level, so the carried entry must exist.
             assert(entry);

             if (allocationPolicy) {
                 const Addr virt_page_addr =
                     roundDown(vaddr, TheISA::PageBytes);

                 DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
                         virt_page_addr);

                 // From here on use the copy that lives in this TLB.
                 entry = insert(virt_page_addr, *entry);
             }

             assert(entry);
         }

         DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks "
                 "while paddr was %#x.\n", entry->vaddr,
                 entry->paddr);

         // Only demand accesses are checked. A prefetch may legitimately
         // touch something that would not pass protection; it is simply
         // placed in the TLB and a later demand access will be checked.
         // Entries marked invalid are skipped as well.
         const bool demand_access = !state->prefetch;
         if (demand_access && state->tlbEntry->valid)
             pagingProtectionChecks(tc, pkt, entry, mode);

         // Combine the page frame with the offset inside the page.
         const int page_size = entry->size();
         const Addr paddr = entry->paddr | (vaddr & (page_size - 1));
         DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);

         pkt->req->setPaddr(paddr);

         if (entry->uncacheable)
             pkt->req->setFlags(Request::UNCACHEABLE);
     }

     /**
      * Functional (same-cycle) translation, used for atomic translations.
      *
      * Looks the address up in this TLB. On a miss the request is either
      * propagated functionally to the lower-level TLB or, at the last
      * level, resolved through the process page table. The resulting entry
      * is attached to the sender state and handleFuncTranslationReturn()
      * fills in the physical address. A prefetch that finds no translation
      * returns early without setting a physical address.
      */
     void
     GpuTLB::CpuSidePort::recvFunctional(PacketPtr pkt)
     {
         TranslationState *sender_state =
             safe_cast<TranslationState*>(pkt->senderState);

         ThreadContext *tc = sender_state->tc;
         // Prefetches do not contribute to the statistics.
         bool update_stats = !sender_state->prefetch;

         Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
                                         TheISA::PageBytes);

         if (update_stats)
             tlb->updatePageFootprint(virt_page_addr);

         // do the TLB lookup; stats are only updated for demand accesses
         bool success = tlb->tlbLookup(pkt->req, tc, update_stats);
         tlbOutcome tlb_outcome = success ? TLB_HIT : TLB_MISS;

         // functional mode means no coalescing
         // global metrics are the same as the local metrics
         if (update_stats) {
             tlb->globalNumTLBAccesses++;

             if (success) {
                 sender_state->hitLevel = sender_state->reqCnt.size();
                 tlb->globalNumTLBHits++;
             }
         }

         if (!success) {
             if (update_stats)
                 tlb->globalNumTLBMisses++;
             if (tlb->hasMemSidePort) {
                 // there is a TLB below -> propagate down the TLB hierarchy
                 tlb->memSidePort[0]->sendFunctional(pkt);
                 // If no valid translation from a prefetch, then just return
                 if (sender_state->prefetch && !pkt->req->hasPaddr())
                     return;
             } else {
                 // Need to access the page table and update the TLB
                 DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
                         virt_page_addr);

                 Process *p = tc->getProcessPtr();

                 Addr vaddr = pkt->req->getVaddr();

                 // The page address must be the page-aligned vaddr.
                 assert(p->pTable->pageAlign(vaddr) == virt_page_addr);

                 const EmulationPageTable::Entry *pte =
                         p->pTable->lookup(vaddr);
                 if (!pte && sender_state->tlbMode != BaseTLB::Execute &&
                         p->fixupStackFault(vaddr)) {
                     // The stack was grown to cover vaddr; look it up again.
                     pte = p->pTable->lookup(vaddr);
                 }

                 if (!pte) {
                     if (sender_state->prefetch) {
                         // A failed prefetch is not an error: send an empty
                         // TLB entry back so that it can be recognised as
                         // empty and handled accordingly.
                         DPRINTF(GPUPrefetch, "Prefetch failed %#x\n",
                                 virt_page_addr);

                         sender_state->tlbEntry = new GpuTlbEntry();

                         return;
                     }

                     // no PageFaults are permitted after
                     // the second page table lookup
                     panic("Functional page walk found no mapping for "
                           "vaddr %#x\n", vaddr);
                 }

                 // `success` is always false in this branch, so it must not
                 // be used as the assertion or as the entry's valid bit: a
                 // page table entry was found, hence the entry is valid.
                 DPRINTF(GPUTLB, "Mapping %#x to %#x\n", virt_page_addr,
                         pte->paddr);

                 sender_state->tlbEntry =
                     new GpuTlbEntry(0, virt_page_addr, pte->paddr, true);
             }
         } else {
             // Print the address itself; an extra lookup() here would
             // print a pointer and disturb replacement state when tracing.
             DPRINTF(GPUPrefetch, "Functional Hit for vaddr %#x\n",
                     pkt->req->getVaddr());

             GpuTlbEntry *entry = tlb->lookup(pkt->req->getVaddr(),
                                              update_stats);

             assert(entry);

             // Hand a private copy of the entry to the requester.
             sender_state->tlbEntry =
                 new GpuTlbEntry(0, entry->vaddr, entry->paddr, entry->valid);
         }
         // This is the function that populates pkt->req with the paddr of
         // the translation. If no translation happened (i.e. a prefetch
         // failed) the early returns above keep it from executing.
         tlb->handleFuncTranslationReturn(pkt, tlb_outcome);
     }

     /**
      * Retry notification on the CPU-side port. Not expected to be called;
      * it trips an assertion if it ever is.
      */
     void
     GpuTLB::CpuSidePort::recvReqRetry()
     {
         // The CPUSidePort never sends anything but replies. No retries
         // expected.
         assert(false);
     }

     /**
      * @return an empty address range list; the result is currently not
      *         checked by the master.
      */
     AddrRangeList
     GpuTLB::CpuSidePort::getAddrRanges() const
     {
         return AddrRangeList();
     }

     /**
      * The MemSidePort receives the translated packet back from the lower
      * level. The pending TLBEvent for the page is turned into a
      * MISS_RETURN and re-scheduled one cycle later, which in turn runs
      * handleTranslationReturn and propagates the reply up the hierarchy.
      *
      * @return always true (the response is never refused).
      */
     bool
     GpuTLB::MemSidePort::recvTimingResp(PacketPtr pkt)
     {
         const Addr page_vaddr = roundDown(pkt->req->getVaddr(),
                                           TheISA::PageBytes);

         DPRINTF(GPUTLB, "MemSidePort recvTiming for virt_page_addr %#x\n",
                 page_vaddr);

         // The event created when the lookup was issued is still pending.
         TLBEvent *pending = tlb->translationReturnEvent[page_vaddr];
         assert(pending);
         assert(page_vaddr == pending->getTLBEventVaddr());

         pending->updateOutcome(MISS_RETURN);

         const auto when = curTick() + tlb->ticks(1);
         tlb->schedule(pending, when);

         return true;
     }

     /**
      * Retry notification on the memory-side port. Not expected to be
      * called; it trips an assertion if it ever is.
      */
     void
     GpuTLB::MemSidePort::recvReqRetry()
     {
         // No retries should reach the TLB. The retries
         // should only reach the TLBCoalescer.
         assert(false);
     }

     void
     GpuTLB::cleanup()
     {
         while (!cleanupQueue.empty()) {
             Addr cleanup_addr = cleanupQueue.front();
             cleanupQueue.pop();

             // delete TLBEvent
             TLBEvent * old_tlb_event = translationReturnEvent[cleanup_addr];
             delete old_tlb_event;
             translationReturnEvent.erase(cleanup_addr);

             // update number of outstanding requests
             outstandingReqs--;
         }

         /** the higher level coalescer should retry if it has
          * any pending requests.
          */
         for (int i = 0; i < cpuSidePort.size(); ++i) {
             cpuSidePort[i]->sendRetryReq();
         }
     }

     /**
      * Record an access to a virtual page in the TLBFootprint table.
      *
      * A page seen for the first time bumps numUniquePages; a repeat access
      * adds the ticks elapsed since the previous access to the page's
      * total reuse distance. When accessDistance is set, the current value
      * of localNumTLBAccesses is also logged for the page.
      */
     void
     GpuTLB::updatePageFootprint(Addr virt_page_addr)
     {
         // Zeroed record; only used if the page is not in the table yet.
         AccessInfo fresh_info;
         fresh_info.lastTimeAccessed = 0;
         fresh_info.accessesPerPage = 0;
         fresh_info.totalReuseDistance = 0;
         fresh_info.sumDistance = 0;
         fresh_info.meanDistance = 0;

         // insert() leaves an existing record untouched and tells us
         // whether a new one was created.
         std::pair<AccessPatternTable::iterator, bool> slot =
             TLBFootprint.insert(
                 AccessPatternTable::value_type(virt_page_addr, fresh_info));

         auto &page_info = slot.first->second;
         const bool first_page_access = slot.second;

         if (first_page_access) {
             numUniquePages++;
         } else {
             const int ticks_since_last =
                 curTick() - page_info.lastTimeAccessed;
             page_info.totalReuseDistance += ticks_since_last;
         }

         page_info.accessesPerPage++;
         page_info.lastTimeAccessed = curTick();

         if (accessDistance)
             page_info.localTLBAccesses.push_back(localNumTLBAccesses.value());
     }

     /**
      * End-of-simulation hook that finalises the page footprint statistics.
      *
      * Computes avgReuseDistance as the mean, over all pages seen, of each
      * page's average reuse distance. When accessDistance is set it also
      * writes one CSV row per page (page, max, mean and standard deviation
      * of the access distance) to a file named after this object in the
      * simulation output directory. The footprint table is cleared at the
      * end.
      */
     void
     GpuTLB::exitCallback()
     {
         std::ostream *page_stat_file = nullptr;

         if (accessDistance) {

             // print per page statistics to a separate file (.csv format)
             // simout is the gem5 output directory (default is m5out or the
             // one specified with -d)
             page_stat_file = simout.create(name().c_str())->stream();

             // print header
             *page_stat_file << "page,max_access_distance,mean_access_distance, "
                             << "stddev_distance" << std::endl;
         }

         // update avg. reuse distance footprint
         unsigned int sum_avg_reuse_distance_per_page = 0;

         // iterate through all pages seen by this TLB
         for (auto iter = TLBFootprint.begin(); iter != TLBFootprint.end();
              ++iter) {
             auto &info = iter->second;

             sum_avg_reuse_distance_per_page +=
                 info.totalReuseDistance / info.accessesPerPage;

             // Skip pages without logged samples so that element 0 is never
             // read from an empty vector.
             if (accessDistance && !info.localTLBAccesses.empty()) {
                 const size_t num_samples = info.localTLBAccesses.size();

                 unsigned int tmp = info.localTLBAccesses[0];
                 unsigned int prev = tmp;

                 // Turn the logged access counts into distances: the number
                 // of other TLB accesses between consecutive accesses to
                 // this page. The first sample has distance 0.
                 for (size_t i = 0; i < num_samples; ++i) {
                     if (i) {
                         tmp = prev + 1;
                     }

                     prev = info.localTLBAccesses[i];
                     // update the localTLBAccesses value
                     // with the actual difference
                     info.localTLBAccesses[i] -= tmp;
                     // compute the sum of AccessDistance per page
                     // used later for mean
                     info.sumDistance += info.localTLBAccesses[i];
                 }

                 info.meanDistance = info.sumDistance / info.accessesPerPage;

                 // compute std_dev and max (we need a second round because
                 // we need to know the mean value)
                 unsigned int max_distance = 0;
                 // Accumulate in floating point: a distance below the mean
                 // gives a negative difference, which would wrap around in
                 // unsigned arithmetic and corrupt the result.
                 double sum_sq_diff = 0.0;

                 for (size_t i = 0; i < num_samples; ++i) {
                     const unsigned int tmp_access_distance =
                         info.localTLBAccesses[i];

                     if (tmp_access_distance > max_distance) {
                         max_distance = tmp_access_distance;
                     }

                     const double diff =
                         static_cast<double>(tmp_access_distance) -
                         static_cast<double>(info.meanDistance);
                     sum_sq_diff += diff * diff;
                 }

                 // Reported as an integer, as before.
                 const unsigned int stddev_distance =
                     static_cast<unsigned int>(
                         std::sqrt(sum_sq_diff / info.accessesPerPage));

                 if (page_stat_file) {
                     *page_stat_file << std::hex << iter->first << ",";
                     *page_stat_file << std::dec << max_distance << ",";
                     *page_stat_file << std::dec << info.meanDistance
                                     << ",";
                     *page_stat_file << std::dec << stddev_distance;
                     *page_stat_file << std::endl;
                 }

                 // erase the localTLBAccesses array
                 info.localTLBAccesses.clear();
             }
         }

         if (!TLBFootprint.empty()) {
             avgReuseDistance =
                 sum_avg_reuse_distance_per_page / TLBFootprint.size();
         }

         //clear the TLBFootprint map
         TLBFootprint.clear();
     }
 } // namespace X86ISA

 /**
  * Factory hook on the parameter object: builds a GpuTLB from these
  * parameters. The returned object is heap-allocated with `new`.
  */
 X86ISA::GpuTLB*
 X86GPUTLBParams::create()
 {
     return new X86ISA::GpuTLB(this);
 }