| /* |
| * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * For use for simulation and test purposes only |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its contributors |
| * may be used to endorse or promote products derived from this software |
| * without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| * |
| * Author: Lisa Hsu |
| */ |
| |
| #include "gpu-compute/tlb_coalescer.hh" |
| |
| #include <cstring> |
| |
| #include "debug/GPUTLB.hh" |
| |
| TLBCoalescer::TLBCoalescer(const Params *p) |
| : MemObject(p), |
| clock(p->clk_domain->clockPeriod()), |
| TLBProbesPerCycle(p->probesPerCycle), |
| coalescingWindow(p->coalescingWindow), |
| disableCoalescing(p->disableCoalescing), |
| probeTLBEvent([this]{ processProbeTLBEvent(); }, |
| "Probe the TLB below", |
| false, Event::CPU_Tick_Pri), |
| cleanupEvent([this]{ processCleanupEvent(); }, |
| "Cleanup issuedTranslationsTable hashmap", |
| false, Event::Maximum_Pri) |
| { |
| // create the slave ports based on the number of connected ports |
| for (size_t i = 0; i < p->port_slave_connection_count; ++i) { |
| cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i), |
| this, i)); |
| } |
| |
| // create the master ports based on the number of connected ports |
| for (size_t i = 0; i < p->port_master_connection_count; ++i) { |
| memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i), |
| this, i)); |
| } |
| } |
| |
| BaseSlavePort& |
| TLBCoalescer::getSlavePort(const std::string &if_name, PortID idx) |
| { |
| if (if_name == "slave") { |
| if (idx >= static_cast<PortID>(cpuSidePort.size())) { |
| panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx); |
| } |
| |
| return *cpuSidePort[idx]; |
| } else { |
| panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name); |
| } |
| } |
| |
| BaseMasterPort& |
| TLBCoalescer::getMasterPort(const std::string &if_name, PortID idx) |
| { |
| if (if_name == "master") { |
| if (idx >= static_cast<PortID>(memSidePort.size())) { |
| panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx); |
| } |
| |
| return *memSidePort[idx]; |
| } else { |
| panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name); |
| } |
| } |
| |
| /* |
| * This method returns true if the <incoming_pkt> |
| * can be coalesced with <coalesced_pkt> and false otherwise. |
| * A given set of rules is checked. |
| * The rules can potentially be modified based on the TLB level. |
| */ |
| bool |
| TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt) |
| { |
| if (disableCoalescing) |
| return false; |
| |
| TheISA::GpuTLB::TranslationState *incoming_state = |
| safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState); |
| |
| TheISA::GpuTLB::TranslationState *coalesced_state = |
| safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState); |
| |
| // Rule 1: Coalesce requests only if they |
| // fall within the same virtual page |
| Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(), |
| TheISA::PageBytes); |
| |
| Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(), |
| TheISA::PageBytes); |
| |
| if (incoming_virt_page_addr != coalesced_virt_page_addr) |
| return false; |
| |
| //* Rule 2: Coalesce requests only if they |
| // share a TLB Mode, i.e. they are both read |
| // or write requests. |
| BaseTLB::Mode incoming_mode = incoming_state->tlbMode; |
| BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode; |
| |
| if (incoming_mode != coalesced_mode) |
| return false; |
| |
| // when we can coalesce a packet update the reqCnt |
| // that is the number of packets represented by |
| // this coalesced packet |
| if (!incoming_state->prefetch) |
| coalesced_state->reqCnt.back() += incoming_state->reqCnt.back(); |
| |
| return true; |
| } |
| |
| /* |
| * We need to update the physical addresses of all the translation requests |
| * that were coalesced into the one that just returned. |
| */ |
| void |
| TLBCoalescer::updatePhysAddresses(PacketPtr pkt) |
| { |
| Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes); |
| |
| DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n", |
| issuedTranslationsTable[virt_page_addr].size(), virt_page_addr); |
| |
| TheISA::GpuTLB::TranslationState *sender_state = |
| safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); |
| |
| TheISA::GpuTlbEntry *tlb_entry = sender_state->tlbEntry; |
| assert(tlb_entry); |
| Addr first_entry_vaddr = tlb_entry->vaddr; |
| Addr first_entry_paddr = tlb_entry->paddr; |
| int page_size = tlb_entry->size(); |
| bool uncacheable = tlb_entry->uncacheable; |
| int first_hit_level = sender_state->hitLevel; |
| bool valid = tlb_entry->valid; |
| |
| // Get the physical page address of the translated request |
| // Using the page_size specified in the TLBEntry allows us |
| // to support different page sizes. |
| Addr phys_page_paddr = pkt->req->getPaddr(); |
| phys_page_paddr &= ~(page_size - 1); |
| |
| for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) { |
| PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i]; |
| TheISA::GpuTLB::TranslationState *sender_state = |
| safe_cast<TheISA::GpuTLB::TranslationState*>( |
| local_pkt->senderState); |
| |
| // we are sending the packet back, so pop the reqCnt associated |
| // with this level in the TLB hiearchy |
| if (!sender_state->prefetch) |
| sender_state->reqCnt.pop_back(); |
| |
| /* |
| * Only the first packet from this coalesced request has been |
| * translated. Grab the translated phys. page addr and update the |
| * physical addresses of the remaining packets with the appropriate |
| * page offsets. |
| */ |
| if (i) { |
| Addr paddr = phys_page_paddr; |
| paddr |= (local_pkt->req->getVaddr() & (page_size - 1)); |
| local_pkt->req->setPaddr(paddr); |
| |
| if (uncacheable) |
| local_pkt->req->setFlags(Request::UNCACHEABLE); |
| |
| // update senderState->tlbEntry, so we can insert |
| // the correct TLBEentry in the TLBs above. |
| sender_state->tlbEntry = |
| new TheISA::GpuTlbEntry(0, first_entry_vaddr, first_entry_paddr, |
| valid); |
| |
| // update the hitLevel for all uncoalesced reqs |
| // so that each packet knows where it hit |
| // (used for statistics in the CUs) |
| sender_state->hitLevel = first_hit_level; |
| } |
| |
| SlavePort *return_port = sender_state->ports.back(); |
| sender_state->ports.pop_back(); |
| |
| // Translation is done - Convert to a response pkt if necessary and |
| // send the translation back |
| if (local_pkt->isRequest()) { |
| local_pkt->makeTimingResponse(); |
| } |
| |
| return_port->sendTimingResp(local_pkt); |
| } |
| |
| // schedule clean up for end of this cycle |
| // This is a maximum priority event and must be on |
| // the same cycle as GPUTLB cleanup event to prevent |
| // race conditions with an IssueProbeEvent caused by |
| // MemSidePort::recvReqRetry |
| cleanupQueue.push(virt_page_addr); |
| |
| if (!cleanupEvent.scheduled()) |
| schedule(cleanupEvent, curTick()); |
| } |
| |
| // Receive translation requests, create a coalesced request, |
| // and send them to the TLB (TLBProbesPerCycle) |
| bool |
| TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt) |
| { |
| // first packet of a coalesced request |
| PacketPtr first_packet = nullptr; |
| // true if we are able to do coalescing |
| bool didCoalesce = false; |
| // number of coalesced reqs for a given window |
| int coalescedReq_cnt = 0; |
| |
| TheISA::GpuTLB::TranslationState *sender_state = |
| safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); |
| |
| // push back the port to remember the path back |
| sender_state->ports.push_back(this); |
| |
| bool update_stats = !sender_state->prefetch; |
| |
| if (update_stats) { |
| // if reqCnt is empty then this packet does not represent |
| // multiple uncoalesced reqs(pkts) but just a single pkt. |
| // If it does though then the reqCnt for each level in the |
| // hierarchy accumulates the total number of reqs this packet |
| // represents |
| int req_cnt = 1; |
| |
| if (!sender_state->reqCnt.empty()) |
| req_cnt = sender_state->reqCnt.back(); |
| |
| sender_state->reqCnt.push_back(req_cnt); |
| |
| // update statistics |
| coalescer->uncoalescedAccesses++; |
| req_cnt = sender_state->reqCnt.back(); |
| DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt); |
| coalescer->queuingCycles -= (curTick() * req_cnt); |
| coalescer->localqueuingCycles -= curTick(); |
| } |
| |
| // FIXME if you want to coalesce not based on the issueTime |
| // of the packets (i.e., from the compute unit's perspective) |
| // but based on when they reached this coalescer then |
| // remove the following if statement and use curTick() or |
| // coalescingWindow for the tick_index. |
| if (!sender_state->issueTime) |
| sender_state->issueTime = curTick(); |
| |
| // The tick index is used as a key to the coalescerFIFO hashmap. |
| // It is shared by all candidates that fall within the |
| // given coalescingWindow. |
| int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow; |
| |
| if (coalescer->coalescerFIFO.count(tick_index)) { |
| coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size(); |
| } |
| |
| // see if we can coalesce the incoming pkt with another |
| // coalesced request with the same tick_index |
| for (int i = 0; i < coalescedReq_cnt; ++i) { |
| first_packet = coalescer->coalescerFIFO[tick_index][i][0]; |
| |
| if (coalescer->canCoalesce(pkt, first_packet)) { |
| coalescer->coalescerFIFO[tick_index][i].push_back(pkt); |
| |
| DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n", |
| i, tick_index, |
| coalescer->coalescerFIFO[tick_index][i].size()); |
| |
| didCoalesce = true; |
| break; |
| } |
| } |
| |
| // if this is the first request for this tick_index |
| // or we did not manage to coalesce, update stats |
| // and make necessary allocations. |
| if (!coalescedReq_cnt || !didCoalesce) { |
| if (update_stats) |
| coalescer->coalescedAccesses++; |
| |
| std::vector<PacketPtr> new_array; |
| new_array.push_back(pkt); |
| coalescer->coalescerFIFO[tick_index].push_back(new_array); |
| |
| DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after " |
| "push\n", tick_index, |
| coalescer->coalescerFIFO[tick_index].size()); |
| } |
| |
| //schedule probeTLBEvent next cycle to send the |
| //coalesced requests to the TLB |
| if (!coalescer->probeTLBEvent.scheduled()) { |
| coalescer->schedule(coalescer->probeTLBEvent, |
| curTick() + coalescer->ticks(1)); |
| } |
| |
| return true; |
| } |
| |
| void |
| TLBCoalescer::CpuSidePort::recvReqRetry() |
| { |
| assert(false); |
| } |
| |
| void |
| TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt) |
| { |
| |
| TheISA::GpuTLB::TranslationState *sender_state = |
| safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); |
| |
| bool update_stats = !sender_state->prefetch; |
| |
| if (update_stats) |
| coalescer->uncoalescedAccesses++; |
| |
| // If there is a pending timing request for this virtual address |
| // print a warning message. This is a temporary caveat of |
| // the current simulator where atomic and timing requests can |
| // coexist. FIXME remove this check/warning in the future. |
| Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes); |
| int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr); |
| |
| if (map_count) { |
| DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing " |
| "req. pending\n", virt_page_addr); |
| } |
| |
| coalescer->memSidePort[0]->sendFunctional(pkt); |
| } |
| |
| AddrRangeList |
| TLBCoalescer::CpuSidePort::getAddrRanges() const |
| { |
| // currently not checked by the master |
| AddrRangeList ranges; |
| |
| return ranges; |
| } |
| |
| bool |
| TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt) |
| { |
| // a translation completed and returned |
| coalescer->updatePhysAddresses(pkt); |
| |
| return true; |
| } |
| |
| void |
| TLBCoalescer::MemSidePort::recvReqRetry() |
| { |
| //we've receeived a retry. Schedule a probeTLBEvent |
| if (!coalescer->probeTLBEvent.scheduled()) |
| coalescer->schedule(coalescer->probeTLBEvent, |
| curTick() + coalescer->ticks(1)); |
| } |
| |
| void |
| TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt) |
| { |
| fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n"); |
| } |
| |
| /* |
| * Here we scan the coalescer FIFO and issue the max |
| * number of permitted probes to the TLB below. We |
| * permit bypassing of coalesced requests for the same |
| * tick_index. |
| * |
| * We do not access the next tick_index unless we've |
| * drained the previous one. The coalesced requests |
| * that are successfully sent are moved to the |
| * issuedTranslationsTable table (the table which keeps |
| * track of the outstanding reqs) |
| */ |
| void |
| TLBCoalescer::processProbeTLBEvent() |
| { |
| // number of TLB probes sent so far |
| int sent_probes = 0; |
| // rejected denotes a blocking event |
| bool rejected = false; |
| |
| // It is set to true either when the recvTiming of the TLB below |
| // returns false or when there is another outstanding request for the |
| // same virt. page. |
| |
| DPRINTF(GPUTLB, "triggered TLBCoalescer %s\n", __func__); |
| |
| for (auto iter = coalescerFIFO.begin(); |
| iter != coalescerFIFO.end() && !rejected; ) { |
| int coalescedReq_cnt = iter->second.size(); |
| int i = 0; |
| int vector_index = 0; |
| |
| DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n", |
| coalescedReq_cnt, iter->first); |
| |
| while (i < coalescedReq_cnt) { |
| ++i; |
| PacketPtr first_packet = iter->second[vector_index][0]; |
| |
| // compute virtual page address for this request |
| Addr virt_page_addr = roundDown(first_packet->req->getVaddr(), |
| TheISA::PageBytes); |
| |
| // is there another outstanding request for the same page addr? |
| int pending_reqs = |
| issuedTranslationsTable.count(virt_page_addr); |
| |
| if (pending_reqs) { |
| DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for " |
| "page %#x\n", virt_page_addr); |
| |
| ++vector_index; |
| rejected = true; |
| |
| continue; |
| } |
| |
| // send the coalesced request for virt_page_addr |
| if (!memSidePort[0]->sendTimingReq(first_packet)) { |
| DPRINTF(GPUTLB, "Failed to send TLB request for page %#x", |
| virt_page_addr); |
| |
| // No need for a retries queue since we are already buffering |
| // the coalesced request in coalescerFIFO. |
| rejected = true; |
| ++vector_index; |
| } else { |
| TheISA::GpuTLB::TranslationState *tmp_sender_state = |
| safe_cast<TheISA::GpuTLB::TranslationState*> |
| (first_packet->senderState); |
| |
| bool update_stats = !tmp_sender_state->prefetch; |
| |
| if (update_stats) { |
| // req_cnt is total number of packets represented |
| // by the one we just sent counting all the way from |
| // the top of TLB hiearchy (i.e., from the CU) |
| int req_cnt = tmp_sender_state->reqCnt.back(); |
| queuingCycles += (curTick() * req_cnt); |
| |
| DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n", |
| name(), req_cnt); |
| |
| // pkt_cnt is number of packets we coalesced into the one |
| // we just sent but only at this coalescer level |
| int pkt_cnt = iter->second[vector_index].size(); |
| localqueuingCycles += (curTick() * pkt_cnt); |
| } |
| |
| DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x", |
| virt_page_addr); |
| |
| //copy coalescedReq to issuedTranslationsTable |
| issuedTranslationsTable[virt_page_addr] |
| = iter->second[vector_index]; |
| |
| //erase the entry of this coalesced req |
| iter->second.erase(iter->second.begin() + vector_index); |
| |
| if (iter->second.empty()) |
| assert(i == coalescedReq_cnt); |
| |
| sent_probes++; |
| if (sent_probes == TLBProbesPerCycle) |
| return; |
| } |
| } |
| |
| //if there are no more coalesced reqs for this tick_index |
| //erase the hash_map with the first iterator |
| if (iter->second.empty()) { |
| coalescerFIFO.erase(iter++); |
| } else { |
| ++iter; |
| } |
| } |
| } |
| |
| void |
| TLBCoalescer::processCleanupEvent() |
| { |
| while (!cleanupQueue.empty()) { |
| Addr cleanup_addr = cleanupQueue.front(); |
| cleanupQueue.pop(); |
| issuedTranslationsTable.erase(cleanup_addr); |
| |
| DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n", |
| cleanup_addr); |
| } |
| } |
| |
| void |
| TLBCoalescer::regStats() |
| { |
| MemObject::regStats(); |
| |
| uncoalescedAccesses |
| .name(name() + ".uncoalesced_accesses") |
| .desc("Number of uncoalesced TLB accesses") |
| ; |
| |
| coalescedAccesses |
| .name(name() + ".coalesced_accesses") |
| .desc("Number of coalesced TLB accesses") |
| ; |
| |
| queuingCycles |
| .name(name() + ".queuing_cycles") |
| .desc("Number of cycles spent in queue") |
| ; |
| |
| localqueuingCycles |
| .name(name() + ".local_queuing_cycles") |
| .desc("Number of cycles spent in queue for all incoming reqs") |
| ; |
| |
| localLatency |
| .name(name() + ".local_latency") |
| .desc("Avg. latency over all incoming pkts") |
| ; |
| |
| localLatency = localqueuingCycles / uncoalescedAccesses; |
| } |
| |
| |
| TLBCoalescer* |
| TLBCoalescerParams::create() |
| { |
| return new TLBCoalescer(this); |
| } |
| |