| /* |
| * Copyright (c) 2021 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its |
| * contributors may be used to endorse or promote products derived from this |
| * software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "arch/amdgpu/vega/tlb.hh" |
| |
| #include <cmath> |
| #include <cstring> |
| |
| #include "arch/amdgpu/common/gpu_translation_state.hh" |
| #include "arch/amdgpu/vega/faults.hh" |
| #include "arch/amdgpu/vega/pagetable_walker.hh" |
| #include "debug/GPUPrefetch.hh" |
| #include "debug/GPUTLB.hh" |
| #include "dev/amdgpu/amdgpu_device.hh" |
| |
| namespace gem5 |
| { |
| namespace VegaISA |
| { |
| |
// We impose no limit on the number of translations we send downstream;
// we rely on the limit enforced by the coalescer above us.
| GpuTLB::GpuTLB(const VegaGPUTLBParams &p) |
| : ClockedObject(p), walker(p.walker), |
| gpuDevice(p.gpu_device), size(p.size), stats(this), |
| cleanupEvent([this]{ cleanup(); }, name(), false, |
| Event::Maximum_Pri) |
| { |
| assoc = p.assoc; |
| assert(assoc <= size); |
| numSets = size/assoc; |
| allocationPolicy = p.allocationPolicy; |
| hasMemSidePort = false; |
| |
| tlb.assign(size, VegaTlbEntry()); |
| |
| freeList.resize(numSets); |
| entryList.resize(numSets); |
| |
| for (int set = 0; set < numSets; ++set) { |
| for (int way = 0; way < assoc; ++way) { |
| int x = set * assoc + way; |
| freeList[set].push_back(&tlb.at(x)); |
| } |
| } |
| |
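    // A single set means the TLB is fully associative. The set-index
    // computation assumes numSets is a power of two; e.g., size 512
    // with assoc 8 gives 64 sets and setMask 0x3f.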
| FA = (size == assoc); |
| setMask = numSets - 1; |
| |
| maxCoalescedReqs = p.maxOutstandingReqs; |
| |
| outstandingReqs = 0; |
| hitLatency = p.hitLatency; |
| missLatency1 = p.missLatency1; |
| missLatency2 = p.missLatency2; |
| |
| // create the response ports based on the number of connected ports |
| for (size_t i = 0; i < p.port_cpu_side_ports_connection_count; ++i) { |
| cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", |
| name(), i), this, i)); |
| } |
| |
| // create the requestor ports based on the number of connected ports |
| for (size_t i = 0; i < p.port_mem_side_ports_connection_count; ++i) { |
| memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", |
| name(), i), this, i)); |
| } |
| |
| // assuming one walker per TLB, set our walker's TLB to this TLB. |
| walker->setTLB(this); |
| |
| // gpuDevice should be non-null in full system only and is set by GpuTLB |
| // params from the config file. |
| if (gpuDevice) { |
| gpuDevice->getVM().registerTLB(this); |
| } |
| } |
| |
| GpuTLB::~GpuTLB() |
| { |
| } |
| |
| Port & |
| GpuTLB::getPort(const std::string &if_name, PortID idx) |
| { |
| if (if_name == "cpu_side_ports") { |
| if (idx >= static_cast<PortID>(cpuSidePort.size())) { |
| panic("TLBCoalescer::getPort: unknown index %d\n", idx); |
| } |
| |
| return *cpuSidePort[idx]; |
| } else if (if_name == "mem_side_ports") { |
| if (idx >= static_cast<PortID>(memSidePort.size())) { |
| panic("TLBCoalescer::getPort: unknown index %d\n", idx); |
| } |
| |
| hasMemSidePort = true; |
| |
| return *memSidePort[idx]; |
| } else { |
| panic("TLBCoalescer::getPort: unknown port %s\n", if_name); |
| } |
| } |
| |
| Fault |
| GpuTLB::createPagefault(Addr vaddr, Mode mode) |
| { |
| DPRINTF(GPUTLB, "GPUTLB: Raising page fault.\n"); |
| ExceptionCode code; |
| if (mode == BaseMMU::Read) |
| code = ExceptionCode::LOAD_PAGE; |
| else if (mode == BaseMMU::Write) |
| code = ExceptionCode::STORE_PAGE; |
| else |
| code = ExceptionCode::INST_PAGE; |
| return std::make_shared<PageFault>(vaddr, code, true, mode, true); |
| } |
| |
| Addr |
| GpuTLB::pageAlign(Addr vaddr) |
| { |
| Addr pageMask = mask(VegaISA::PageShift); |
| return (vaddr & ~pageMask); |
| } |
| |
| VegaTlbEntry* |
| GpuTLB::insert(Addr vpn, VegaTlbEntry &entry) |
| { |
| VegaTlbEntry *newEntry = nullptr; |
| |
| int set = (entry.vaddr >> VegaISA::PageShift) & setMask; |
| |
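    // Prefer a free entry for this set; otherwise evict the
    // least-recently-used entry at the back of the set's list
    // (the front of entryList is the MRU position).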
| if (!freeList[set].empty()) { |
| newEntry = freeList[set].front(); |
| freeList[set].pop_front(); |
| } else { |
| newEntry = entryList[set].back(); |
| entryList[set].pop_back(); |
| } |
| |
| *newEntry = entry; |
| entryList[set].push_front(newEntry); |
| |
| DPRINTF(GPUTLB, "Inserted %#lx -> %#lx of size %#lx into set %d\n", |
| newEntry->vaddr, newEntry->paddr, entry.size(), set); |
| |
| return newEntry; |
| } |
| |
| GpuTLB::EntryList::iterator |
| GpuTLB::lookupIt(Addr va, bool update_lru) |
| { |
| int set = (va >> VegaISA::PageShift) & setMask; |
| |
| if (FA) { |
| assert(!set); |
| } |
| |
| auto entry = entryList[set].begin(); |
| for (; entry != entryList[set].end(); ++entry) { |
| int page_size = (*entry)->size(); |
| |
| if ((*entry)->vaddr <= va && (*entry)->vaddr + page_size > va) { |
| DPRINTF(GPUTLB, "Matched vaddr %#x to entry starting at %#x " |
| "with size %#x.\n", va, (*entry)->vaddr, page_size); |
| |
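            // Move the matched entry to the front (MRU) position of
            // the set's list.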
| if (update_lru) { |
| entryList[set].push_front(*entry); |
| entryList[set].erase(entry); |
| entry = entryList[set].begin(); |
| } |
| |
| break; |
| } |
| } |
| |
| return entry; |
| } |
| |
| VegaTlbEntry* |
| GpuTLB::lookup(Addr va, bool update_lru) |
| { |
| int set = (va >> VegaISA::PageShift) & setMask; |
| |
| auto entry = lookupIt(va, update_lru); |
| |
| if (entry == entryList[set].end()) |
| return nullptr; |
| else |
| return *entry; |
| } |
| |
| void |
| GpuTLB::invalidateAll() |
| { |
| DPRINTF(GPUTLB, "Invalidating all entries.\n"); |
| |
| for (int i = 0; i < numSets; ++i) { |
| while (!entryList[i].empty()) { |
| VegaTlbEntry *entry = entryList[i].front(); |
| entryList[i].pop_front(); |
| freeList[i].push_back(entry); |
| } |
| } |
| } |
| |
| void |
| GpuTLB::demapPage(Addr va, uint64_t asn) |
| { |
| int set = (va >> VegaISA::PageShift) & setMask; |
| auto entry = lookupIt(va, false); |
| |
| if (entry != entryList[set].end()) { |
| freeList[set].push_back(*entry); |
| entryList[set].erase(entry); |
| } |
| } |
| |
| /** |
| * TLB_lookup will only perform a TLB lookup returning the TLB entry on a TLB |
| * hit and nullptr on a TLB miss. |
| * Many of the checks about different modes have been converted to |
| * assertions, since these parts of the code are not really used. |
| * On a hit it will update the LRU stack. |
| */ |
| VegaTlbEntry * |
| GpuTLB::tlbLookup(const RequestPtr &req, bool update_stats) |
| { |
| Addr vaddr = req->getVaddr(); |
| Addr alignedVaddr = pageAlign(vaddr); |
| DPRINTF(GPUTLB, "TLB Lookup for vaddr %#x.\n", vaddr); |
| |
    // update LRU stack on a hit
    VegaTlbEntry *entry = lookup(alignedVaddr, true);
| |
    if (!update_stats) {
        // Functional TLB access for memory initialization (i.e., memory
        // or instruction seeding): don't update the TLB or the stats.
        return entry;
    }
| |
| stats.localNumTLBAccesses++; |
| |
| if (!entry) { |
| stats.localNumTLBMisses++; |
| } else { |
| stats.localNumTLBHits++; |
| } |
| |
| return entry; |
| } |
| |
| Walker* |
| GpuTLB::getWalker() |
| { |
| return walker; |
| } |
| |
| |
| void |
| GpuTLB::serialize(CheckpointOut &cp) const |
| { |
| } |
| |
| void |
| GpuTLB::unserialize(CheckpointIn &cp) |
| { |
| } |
| |
| /** |
| * Do the TLB lookup for this coalesced request and schedule |
| * another event <TLB access latency> cycles later. |
| */ |
| |
| void |
| GpuTLB::issueTLBLookup(PacketPtr pkt) |
| { |
| assert(pkt); |
| assert(pkt->senderState); |
| |
| /** |
| * The page size is not fixed in Vega and tracking events by VPN could |
| * potentially lead to redundant page walks by using the smallest page |
| * size. The actual VPN can be determined after the first walk is done |
| * and fixed up later. |
| */ |
| Addr virt_page_addr = roundDown(pkt->req->getVaddr(), |
| VegaISA::PageBytes); |
| |
| GpuTranslationState *sender_state = |
| safe_cast<GpuTranslationState*>(pkt->senderState); |
| |
| bool update_stats = !sender_state->isPrefetch; |
| |
| DPRINTF(GPUTLB, "Translation req. for virt. page addr %#x\n", |
| virt_page_addr); |
| |
| int req_cnt = sender_state->reqCnt.back(); |
| |
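    // Cycle accounting: subtract the current cycle count now and add
    // it back when the translation completes (handleTranslationReturn),
    // so the stats accumulate the cycles each request spent in flight.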
| if (update_stats) { |
| stats.accessCycles -= (curCycle() * req_cnt); |
| stats.localCycles -= curCycle(); |
| stats.globalNumTLBAccesses += req_cnt; |
| } |
| |
| tlbOutcome lookup_outcome = TLB_MISS; |
| const RequestPtr &tmp_req = pkt->req; |
| |
| // Access the TLB and figure out if it's a hit or a miss. |
| auto entry = tlbLookup(tmp_req, update_stats); |
| |
    if (entry) {
        lookup_outcome = TLB_HIT;
        // Re-fetch the entry for the aligned page address without
        // disturbing LRU, and hand the SenderState its own copy so it
        // does not dangle if the cached entry is later evicted.
        VegaTlbEntry *hit_entry = lookup(virt_page_addr, false);
        assert(hit_entry);

        // Set if this is a system request
        pkt->req->setSystemReq(hit_entry->pte.s);

        Addr alignedPaddr = pageAlign(hit_entry->paddr);
        sender_state->tlbEntry =
            new VegaTlbEntry(1 /* VMID */, virt_page_addr, alignedPaddr,
                             hit_entry->logBytes, hit_entry->pte);
| |
| if (update_stats) { |
| // the reqCnt has an entry per level, so its size tells us |
| // which level we are in |
| sender_state->hitLevel = sender_state->reqCnt.size(); |
| stats.globalNumTLBHits += req_cnt; |
| } |
| } else { |
| if (update_stats) |
| stats.globalNumTLBMisses += req_cnt; |
| } |
| |
| /* |
| * We now know the TLB lookup outcome (if it's a hit or a miss), as |
| * well as the TLB access latency. |
| * |
| * We create and schedule a new TLBEvent which will help us take the |
| * appropriate actions (e.g., update TLB on a hit, send request to |
| * lower level TLB on a miss, or start a page walk if this was the |
| * last-level TLB) |
| */ |
| TLBEvent *tlb_event = |
| new TLBEvent(this, virt_page_addr, lookup_outcome, pkt); |
| |
| if (translationReturnEvent.count(virt_page_addr)) { |
| panic("Virtual Page Address %#x already has a return event\n", |
| virt_page_addr); |
| } |
| |
| translationReturnEvent[virt_page_addr] = tlb_event; |
| assert(tlb_event); |
| |
| DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n", |
| curTick() + cyclesToTicks(Cycles(hitLatency))); |
| |
| schedule(tlb_event, curTick() + cyclesToTicks(Cycles(hitLatency))); |
| } |
| |
| GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, |
| tlbOutcome tlb_outcome, PacketPtr _pkt) |
| : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr), |
| outcome(tlb_outcome), pkt(_pkt) |
| { |
| } |
| |
| /** |
| * Do Paging protection checks. If we encounter a page fault, then |
| * an assertion is fired. |
| */ |
| void |
| GpuTLB::pagingProtectionChecks(PacketPtr pkt, VegaTlbEntry * tlb_entry, |
| Mode mode) |
| { |
| // Do paging protection checks. |
| bool badWrite = (!tlb_entry->writable()); |
| |
| if (mode == BaseMMU::Write && badWrite) { |
| // The page must have been present to get into the TLB in |
| // the first place. We'll assume the reserved bits are |
| // fine even though we're not checking them. |
| fatal("Page fault on addr %lx PTE=%#lx", pkt->req->getVaddr(), |
| (uint64_t)tlb_entry->pte); |
| } |
| } |
| |
| void |
| GpuTLB::walkerResponse(VegaTlbEntry& entry, PacketPtr pkt) |
| { |
| DPRINTF(GPUTLB, "WalkerResponse for %#lx. Entry: (%#lx, %#lx, %#lx)\n", |
| pkt->req->getVaddr(), entry.vaddr, entry.paddr, entry.size()); |
| |
| Addr virt_page_addr = roundDown(pkt->req->getVaddr(), |
| VegaISA::PageBytes); |
| |
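    // The PPN is already shifted by the fragment, so shift only by the
    // native page size; logBytes selects the page-offset bits of the
    // vaddr (more than 12 bits for large pages/fragments).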
| Addr page_addr = entry.pte.ppn << VegaISA::PageShift; |
| Addr paddr = page_addr + (entry.vaddr & mask(entry.logBytes)); |
| pkt->req->setPaddr(paddr); |
| pkt->req->setSystemReq(entry.pte.s); |
| |
| GpuTranslationState *sender_state = |
| safe_cast<GpuTranslationState*>(pkt->senderState); |
| sender_state->tlbEntry = new VegaTlbEntry(entry); |
| |
| handleTranslationReturn(virt_page_addr, TLB_MISS, pkt); |
| } |
| |
| /** |
| * handleTranslationReturn is called on a TLB hit, |
| * when a TLB miss returns or when a page fault returns. |
| * The latter calls handelHit with TLB miss as tlbOutcome. |
| */ |
| void |
| GpuTLB::handleTranslationReturn(Addr virt_page_addr, |
| tlbOutcome tlb_outcome, PacketPtr pkt) |
| { |
| assert(pkt); |
| Addr vaddr = pkt->req->getVaddr(); |
| |
| GpuTranslationState *sender_state = |
| safe_cast<GpuTranslationState*>(pkt->senderState); |
| |
| Mode mode = sender_state->tlbMode; |
| |
| VegaTlbEntry *local_entry, *new_entry; |
| |
| int req_cnt = sender_state->reqCnt.back(); |
| bool update_stats = !sender_state->isPrefetch; |
| |
| if (update_stats) { |
| stats.accessCycles += (req_cnt * curCycle()); |
| stats.localCycles += curCycle(); |
| } |
| |
| if (tlb_outcome == TLB_HIT) { |
| DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", |
| vaddr); |
| local_entry = safe_cast<VegaTlbEntry *>(sender_state->tlbEntry); |
| } else { |
| DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n", |
| vaddr); |
| |
| /** |
| * We are returning either from a page walk or from a hit at a |
| * lower TLB level. The senderState should be "carrying" a pointer |
| * to the correct TLBEntry. |
| */ |
| new_entry = safe_cast<VegaTlbEntry *>(sender_state->tlbEntry); |
| assert(new_entry); |
| local_entry = new_entry; |
| |
| if (allocationPolicy) { |
| assert(new_entry->pte); |
| DPRINTF(GPUTLB, "allocating entry w/ addr %#lx of size %#lx\n", |
| virt_page_addr, new_entry->size()); |
| |
| local_entry = insert(virt_page_addr, *new_entry); |
| } |
| |
| assert(local_entry); |
| } |
| |
| /** |
| * At this point the packet carries an up-to-date tlbEntry pointer |
| * in its senderState. |
| * Next step is to do the paging protection checks. |
| */ |
| DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks " |
| "while paddr was %#x.\n", local_entry->vaddr, |
| local_entry->paddr); |
| |
| pagingProtectionChecks(pkt, local_entry, mode); |
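
    // page_size is a power of two, so (page_size - 1) masks off the
    // offset of vaddr within the page.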
| int page_size = local_entry->size(); |
| Addr paddr = local_entry->paddr + (vaddr & (page_size - 1)); |
| DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr); |
| |
| // Since this packet will be sent through the cpu side port, it must be |
| // converted to a response pkt if it is not one already |
| if (pkt->isRequest()) { |
| pkt->makeTimingResponse(); |
| } |
| |
| pkt->req->setPaddr(paddr); |
| |
| if (local_entry->uncacheable()) { |
| pkt->req->setFlags(Request::UNCACHEABLE); |
| } |
| |
    // send the packet back to the coalescer
    cpuSidePort[0]->sendTimingResp(pkt);
    // schedule the cleanup event
    cleanupQueue.push(virt_page_addr);
| |
| DPRINTF(GPUTLB, "Scheduled %#lx for cleanup\n", virt_page_addr); |
| |
    // Schedule this only once per cycle: multiple translations may
    // return in the same cycle. This is a maximum-priority event and
    // must run in the same cycle as the cleanup event in TLBCoalescer
    // to avoid a race with IssueProbeEvent caused by
    // TLBCoalescer::MemSidePort::recvReqRetry.
| if (!cleanupEvent.scheduled()) |
| schedule(cleanupEvent, curTick()); |
| } |
| |
| /** |
| * Here we take the appropriate actions based on the result of the |
| * TLB lookup. |
| */ |
| void |
| GpuTLB::translationReturn(Addr virtPageAddr, tlbOutcome outcome, |
| PacketPtr pkt) |
| { |
| DPRINTF(GPUTLB, "Triggered TLBEvent for addr %#x\n", virtPageAddr); |
| |
| assert(translationReturnEvent[virtPageAddr]); |
| assert(pkt); |
| |
| GpuTranslationState *tmp_sender_state = |
| safe_cast<GpuTranslationState*>(pkt->senderState); |
| |
| int req_cnt = tmp_sender_state->reqCnt.back(); |
| bool update_stats = !tmp_sender_state->isPrefetch; |
| |
| if (outcome == TLB_HIT) { |
| handleTranslationReturn(virtPageAddr, TLB_HIT, pkt); |
| |
| } else if (outcome == TLB_MISS) { |
| |
| DPRINTF(GPUTLB, "This is a TLB miss\n"); |
| if (hasMemSidePort) { |
            // the one cycle added here represents the delay from when
            // we get the reply back until we propagate it to the
            // coalescer above.
| |
| /** |
| * There is a TLB below. Send the coalesced request. |
| * We actually send the very first packet of all the |
| * pending packets for this virtual page address. |
| */ |
| tmp_sender_state->deviceId = 1; |
| tmp_sender_state->pasId = 0; |
| |
| if (!memSidePort[0]->sendTimingReq(pkt)) { |
| DPRINTF(GPUTLB, "Failed sending translation request to " |
| "lower level TLB for addr %#x\n", virtPageAddr); |
| |
| memSidePort[0]->retries.push_back(pkt); |
| } else { |
| DPRINTF(GPUTLB, "Sent translation request to lower level " |
| "TLB for addr %#x\n", virtPageAddr); |
| } |
| } else { |
            // this is the last-level TLB; start a page walk
| DPRINTF(GPUTLB, "Last level TLB - start a page walk for " |
| "addr %#x\n", virtPageAddr); |
| |
| if (update_stats) |
| stats.pageTableCycles -= (req_cnt*curCycle()); |
| |
| TLBEvent *tlb_event = translationReturnEvent[virtPageAddr]; |
| assert(tlb_event); |
| tlb_event->updateOutcome(PAGE_WALK); |
| schedule(tlb_event, |
| curTick() + cyclesToTicks(Cycles(missLatency2))); |
| } |
| } else if (outcome == PAGE_WALK) { |
| if (update_stats) |
| stats.pageTableCycles += (req_cnt*curCycle()); |
| |
| // Need to access the page table and update the TLB |
| DPRINTF(GPUTLB, "Doing a page walk for address %#x\n", |
| virtPageAddr); |
| |
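        // This model uses VMID 1 for all GPU translations: fetch that
        // VMID's page table base and walk with the device's VRAM
        // requestor ID.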
| Addr base = gpuDevice->getVM().getPageTableBase(1); |
| Addr vaddr = pkt->req->getVaddr(); |
| walker->setDevRequestor(gpuDevice->vramRequestorId()); |
| |
| // Do page table walk |
| walker->startTiming(pkt, base, vaddr, BaseMMU::Mode::Read); |
| } else if (outcome == MISS_RETURN) { |
        /** We add an extra cycle in the return path of translation
         * requests between the various TLB levels.
         */
| handleTranslationReturn(virtPageAddr, TLB_MISS, pkt); |
| } else { |
| panic("Unexpected TLB outcome %d", outcome); |
| } |
| } |
| |
| void |
| GpuTLB::TLBEvent::process() |
| { |
| tlb->translationReturn(virtPageAddr, outcome, pkt); |
| } |
| |
| const char* |
| GpuTLB::TLBEvent::description() const |
| { |
| return "trigger translationDoneEvent"; |
| } |
| |
| void |
| GpuTLB::TLBEvent::updateOutcome(tlbOutcome _outcome) |
| { |
| outcome = _outcome; |
| } |
| |
| Addr |
| GpuTLB::TLBEvent::getTLBEventVaddr() |
| { |
| return virtPageAddr; |
| } |
| |
| /** |
| * recvTiming receives a coalesced timing request from a TLBCoalescer |
| * and it calls issueTLBLookup() |
| * It only rejects the packet if we have exceeded the max |
| * outstanding number of requests for the TLB |
| */ |
| bool |
| GpuTLB::CpuSidePort::recvTimingReq(PacketPtr pkt) |
| { |
| bool ret = false; |
| [[maybe_unused]] Addr virt_page_addr = roundDown(pkt->req->getVaddr(), |
| VegaISA::PageBytes); |
| |
| if (tlb->outstandingReqs < tlb->maxCoalescedReqs) { |
| assert(!tlb->translationReturnEvent.count(virt_page_addr)); |
| tlb->issueTLBLookup(pkt); |
| // update number of outstanding translation requests |
| tlb->outstandingReqs++; |
| ret = true; |
| } else { |
| DPRINTF(GPUTLB, "Reached maxCoalescedReqs number %d\n", |
| tlb->outstandingReqs); |
| tlb->stats.maxDownstreamReached++; |
        ret = false;
    }
| |
| if (tlb->outstandingReqs > tlb->stats.outstandingReqsMax.value()) |
| tlb->stats.outstandingReqsMax = tlb->outstandingReqs; |
| |
| return ret; |
| } |
| |
| /** |
| * handleFuncTranslationReturn is called on a TLB hit, |
| * when a TLB miss returns or when a page fault returns. |
| * It updates LRU, inserts the TLB entry on a miss |
| * depending on the allocation policy and does the required |
| * protection checks. It does NOT create a new packet to |
| * update the packet's addr; this is done in hsail-gpu code. |
| */ |
| void |
| GpuTLB::handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome tlb_outcome) |
| { |
| GpuTranslationState *sender_state = |
| safe_cast<GpuTranslationState*>(pkt->senderState); |
| |
| Mode mode = sender_state->tlbMode; |
| Addr vaddr = pkt->req->getVaddr(); |
| |
| VegaTlbEntry *local_entry, *new_entry; |
| |
| if (tlb_outcome == TLB_HIT) { |
| DPRINTF(GPUTLB, "Functional Translation Done - TLB hit for addr " |
| "%#x\n", vaddr); |
| |
| local_entry = safe_cast<VegaTlbEntry *>(sender_state->tlbEntry); |
| } else { |
| DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr " |
| "%#x\n", vaddr); |
| |
| /** |
| * We are returning either from a page walk or from a hit at a |
| * lower TLB level. The senderState should be "carrying" a pointer |
| * to the correct TLBEntry. |
| */ |
| new_entry = safe_cast<VegaTlbEntry *>(sender_state->tlbEntry); |
| assert(new_entry); |
| local_entry = new_entry; |
| |
| if (allocationPolicy) { |
| Addr virt_page_addr = roundDown(vaddr, VegaISA::PageBytes); |
| |
| DPRINTF(GPUTLB, "allocating entry w/ addr %#lx\n", |
| virt_page_addr); |
| |
| local_entry = insert(virt_page_addr, *new_entry); |
| } |
| |
| assert(local_entry); |
| } |
| |
| DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks " |
| "while paddr was %#x.\n", local_entry->vaddr, |
| local_entry->paddr); |
| |
| /** |
| * Do paging checks if it's a normal functional access. If it's for a |
| * prefetch, then sometimes you can try to prefetch something that |
| * won't pass protection. We don't actually want to fault becuase there |
| * is no demand access to deem this a violation. Just put it in the |
| * TLB and it will fault if indeed a future demand access touches it in |
| * violation. |
| * |
| * This feature could be used to explore security issues around |
| * speculative memory accesses. |
| */ |
| if (!sender_state->isPrefetch && sender_state->tlbEntry) |
| pagingProtectionChecks(pkt, local_entry, mode); |
| |
| int page_size = local_entry->size(); |
| Addr paddr = local_entry->paddr + (vaddr & (page_size - 1)); |
| DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr); |
| |
| pkt->req->setPaddr(paddr); |
| |
| if (local_entry->uncacheable()) |
| pkt->req->setFlags(Request::UNCACHEABLE); |
| } |
| |
| // This is used for atomic translations. Need to |
| // make it all happen during the same cycle. |
| void |
| GpuTLB::CpuSidePort::recvFunctional(PacketPtr pkt) |
| { |
| GpuTranslationState *sender_state = |
| safe_cast<GpuTranslationState*>(pkt->senderState); |
| |
| bool update_stats = !sender_state->isPrefetch; |
| |
| Addr virt_page_addr = roundDown(pkt->req->getVaddr(), |
| VegaISA::PageBytes); |
| |
    // do the TLB lookup; stats are updated only for non-prefetch
    // (demand) accesses
    bool success = tlb->tlbLookup(pkt->req, update_stats);
| tlbOutcome tlb_outcome = success ? TLB_HIT : TLB_MISS; |
| |
    // functional mode means no coalescing, so the global metrics
    // match the local metrics
| if (update_stats) { |
| tlb->stats.globalNumTLBAccesses++; |
| |
| if (success) { |
| sender_state->hitLevel = sender_state->reqCnt.size(); |
| tlb->stats.globalNumTLBHits++; |
| } else { |
| tlb->stats.globalNumTLBMisses++; |
| } |
| } |
| |
| if (!success) { |
| if (tlb->hasMemSidePort) { |
| // there is a TLB below -> propagate down the TLB hierarchy |
| tlb->memSidePort[0]->sendFunctional(pkt); |
| // If no valid translation from a prefetch, then just return |
| if (sender_state->isPrefetch && !pkt->req->hasPaddr()) |
| return; |
| } else { |
| // Need to access the page table and update the TLB |
| DPRINTF(GPUTLB, "Doing a page walk for address %#x\n", |
| virt_page_addr); |
| |
| Addr vaddr = pkt->req->getVaddr(); |
| [[maybe_unused]] Addr alignedVaddr = |
| tlb->pageAlign(virt_page_addr); |
| assert(alignedVaddr == virt_page_addr); |
| |
| unsigned logBytes; |
| PageTableEntry pte; |
| |
| // Initialize walker state for VMID |
| Addr base = tlb->gpuDevice->getVM().getPageTableBase(1); |
| tlb->walker->setDevRequestor(tlb->gpuDevice->vramRequestorId()); |
| |
| // Do page table walk |
| Fault fault = tlb->walker->startFunctional(base, vaddr, pte, |
| logBytes, |
| BaseMMU::Mode::Read); |
| if (fault != NoFault) { |
| fatal("Translation fault in TLB at %d!", __LINE__); |
| } |
| |
| // PPN is already shifted by fragment so we only shift by native |
| // page size. Fragment is still used via logBytes to select lower |
| // bits from vaddr. |
| Addr page_addr = pte.ppn << PageShift; |
| Addr paddr = page_addr + (vaddr & mask(logBytes)); |
| Addr alignedPaddr = tlb->pageAlign(paddr); |
| pkt->req->setPaddr(paddr); |
| pkt->req->setSystemReq(pte.s); |
| |
| if (!sender_state->isPrefetch) { |
| assert(paddr); |
| |
| DPRINTF(GPUTLB, "Mapping %#x to %#x\n", vaddr, paddr); |
| |
| sender_state->tlbEntry = |
| new VegaTlbEntry(1 /* VMID */, virt_page_addr, |
| alignedPaddr, logBytes, pte); |
            } else {
                // If this was a prefetch and the translation succeeded,
                // do the normal thing. Otherwise, send back an empty
                // TLB entry so the failure can be detected and handled
                // accordingly.
| if (paddr) { |
| DPRINTF(GPUTLB, "Mapping %#x to %#x\n", vaddr, paddr); |
| |
| sender_state->tlbEntry = |
| new VegaTlbEntry(1 /* VMID */, virt_page_addr, |
| alignedPaddr, logBytes, pte); |
| } else { |
| DPRINTF(GPUPrefetch, "Prefetch failed %#x\n", vaddr); |
| |
| sender_state->tlbEntry = nullptr; |
| |
| return; |
| } |
| } |
| } |
| } else { |
| VegaTlbEntry *entry = tlb->lookup(virt_page_addr, update_stats); |
| assert(entry); |
| |
| if (sender_state->isPrefetch) { |
| DPRINTF(GPUPrefetch, "Functional Hit for vaddr %#x\n", |
| entry->vaddr); |
| } |
| |
| sender_state->tlbEntry = new VegaTlbEntry(1 /* VMID */, entry->vaddr, |
| entry->paddr, entry->logBytes, |
| entry->pte); |
| } |
| |
    // This is the function that populates pkt->req with the paddr of
    // the translation. If no translation happened (i.e., the prefetch
    // failed), the early returns in the code above will keep it from
    // executing.
| tlb->handleFuncTranslationReturn(pkt, tlb_outcome); |
| } |
| |
| void |
| GpuTLB::CpuSidePort::recvReqRetry() |
| { |
    // The CpuSidePort never sends anything but replies. No retries
    // expected.
| panic("recvReqRetry called"); |
| } |
| |
| AddrRangeList |
| GpuTLB::CpuSidePort::getAddrRanges() const |
| { |
| // currently not checked by the requestor |
| AddrRangeList ranges; |
| |
| return ranges; |
| } |
| |
| /** |
| * MemSidePort receives the packet back. |
| * We need to call the handleTranslationReturn |
| * and propagate up the hierarchy. |
| */ |
| bool |
| GpuTLB::MemSidePort::recvTimingResp(PacketPtr pkt) |
| { |
| Addr virt_page_addr = roundDown(pkt->req->getVaddr(), |
| VegaISA::PageBytes); |
| |
| DPRINTF(GPUTLB, "MemSidePort recvTiming for virt_page_addr %#x\n", |
| virt_page_addr); |
| |
| TLBEvent *tlb_event = tlb->translationReturnEvent[virt_page_addr]; |
| assert(tlb_event); |
| assert(virt_page_addr == tlb_event->getTLBEventVaddr()); |
| |
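    // Flag the event as a miss return and run it one cycle from now,
    // modeling the extra cycle in the return path between TLB levels.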
| tlb_event->updateOutcome(MISS_RETURN); |
| tlb->schedule(tlb_event, curTick()+tlb->clockPeriod()); |
| |
| return true; |
| } |
| |
| void |
| GpuTLB::MemSidePort::recvReqRetry() |
| { |
| // No retries should reach the TLB. The retries |
| // should only reach the TLBCoalescer. |
| panic("recvReqRetry called"); |
| } |
| |
| void |
| GpuTLB::cleanup() |
| { |
| while (!cleanupQueue.empty()) { |
| Addr cleanup_addr = cleanupQueue.front(); |
| cleanupQueue.pop(); |
| |
| DPRINTF(GPUTLB, "Deleting return event for %#lx\n", cleanup_addr); |
| |
| // delete TLBEvent |
| TLBEvent * old_tlb_event = translationReturnEvent[cleanup_addr]; |
| delete old_tlb_event; |
| translationReturnEvent.erase(cleanup_addr); |
| |
| // update number of outstanding requests |
| outstandingReqs--; |
| } |
| |
    /** The higher-level coalescer should retry if it has
     * any pending requests.
     */
    for (size_t i = 0; i < cpuSidePort.size(); ++i) {
| cpuSidePort[i]->sendRetryReq(); |
| } |
| } |
| |
| GpuTLB::VegaTLBStats::VegaTLBStats(statistics::Group *parent) |
| : statistics::Group(parent), |
| ADD_STAT(maxDownstreamReached, "Number of refused translation requests"), |
| ADD_STAT(outstandingReqsMax, "Maximum count in coalesced request queue"), |
| ADD_STAT(localNumTLBAccesses, "Number of TLB accesses"), |
| ADD_STAT(localNumTLBHits, "Number of TLB hits"), |
| ADD_STAT(localNumTLBMisses, "Number of TLB misses"), |
| ADD_STAT(localTLBMissRate, "TLB miss rate"), |
| ADD_STAT(globalNumTLBAccesses, "Number of TLB accesses"), |
| ADD_STAT(globalNumTLBHits, "Number of TLB hits"), |
| ADD_STAT(globalNumTLBMisses, "Number of TLB misses"), |
| ADD_STAT(globalTLBMissRate, "TLB miss rate"), |
| ADD_STAT(accessCycles, "Cycles spent accessing this TLB level"), |
| ADD_STAT(pageTableCycles, "Cycles spent accessing the page table"), |
| ADD_STAT(localCycles, "Number of cycles spent in queue for all " |
| "incoming reqs"), |
| ADD_STAT(localLatency, "Avg. latency over incoming coalesced reqs") |
| { |
| localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses; |
| globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses; |
| |
| localLatency = localCycles / localNumTLBAccesses; |
| } |
| |
| } // namespace VegaISA |
| } // namespace gem5 |