| /* |
| * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * For use for simulation and test purposes only |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its |
| * contributors may be used to endorse or promote products derived from this |
| * software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "mem/ruby/system/GPUCoalescer.hh" |
| |
| #include "base/logging.hh" |
| #include "base/str.hh" |
| #include "config/the_isa.hh" |
| #include "cpu/testers/rubytest/RubyTester.hh" |
| #include "debug/GPUCoalescer.hh" |
| #include "debug/MemoryAccess.hh" |
| #include "debug/ProtocolTrace.hh" |
| #include "debug/RubyPort.hh" |
| #include "debug/RubyStats.hh" |
| #include "gpu-compute/shader.hh" |
| #include "mem/packet.hh" |
| #include "mem/ruby/common/DataBlock.hh" |
| #include "mem/ruby/common/SubBlock.hh" |
| #include "mem/ruby/network/MessageBuffer.hh" |
| #include "mem/ruby/profiler/Profiler.hh" |
| #include "mem/ruby/slicc_interface/AbstractController.hh" |
| #include "mem/ruby/slicc_interface/RubyRequest.hh" |
| #include "mem/ruby/structures/CacheMemory.hh" |
| #include "mem/ruby/system/RubySystem.hh" |
| #include "params/RubyGPUCoalescer.hh" |
| |
| UncoalescedTable::UncoalescedTable(GPUCoalescer *gc) |
| : coalescer(gc) |
| { |
| } |
| |
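| // Buffer an incoming uncoalesced packet, grouping it with other packets |
| // from the same instruction (the map is keyed by the request's |
| // instruction sequence number). |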
| void |
| UncoalescedTable::insertPacket(PacketPtr pkt) |
| { |
| uint64_t seqNum = pkt->req->getReqInstSeqNum(); |
| |
| instMap[seqNum].push_back(pkt); |
| DPRINTF(GPUCoalescer, "Adding 0x%X seqNum %d to map. (map %d vec %d)\n", |
| pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size()); |
| } |
| |
| bool |
| UncoalescedTable::packetAvailable() |
| { |
| return !instMap.empty(); |
| } |
| |
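| // Record how many packets are expected for this instruction, but only |
| // the first time its sequence number is seen. |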
| void |
| UncoalescedTable::initPacketsRemaining(InstSeqNum seqNum, int count) |
| { |
| if (!instPktsRemaining.count(seqNum)) { |
| instPktsRemaining[seqNum] = count; |
| } |
| } |
| |
| int |
| UncoalescedTable::getPacketsRemaining(InstSeqNum seqNum) |
| { |
| return instPktsRemaining[seqNum]; |
| } |
| |
| void |
| UncoalescedTable::setPacketsRemaining(InstSeqNum seqNum, int count) |
| { |
| instPktsRemaining[seqNum] = count; |
| } |
| |
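| // Return the packet list for the instruction at the given offset into |
| // the table, or nullptr if there are not that many instructions pending. |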
| PerInstPackets* |
| UncoalescedTable::getInstPackets(int offset) |
| { |
| if (offset >= instMap.size()) { |
| return nullptr; |
| } |
| |
| auto instMapIter = instMap.begin(); |
| std::advance(instMapIter, offset); |
| |
| return &(instMapIter->second); |
| } |
| |
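| // Remove instructions whose packets have all been coalesced from both |
| // tracking maps and return one GM token for each completed instruction. |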
| void |
| UncoalescedTable::updateResources() |
| { |
| for (auto iter = instMap.begin(); iter != instMap.end(); ) { |
| InstSeqNum seq_num = iter->first; |
| DPRINTF(GPUCoalescer, "%s checking remaining pkts for %d\n", |
| coalescer->name().c_str(), seq_num); |
| assert(instPktsRemaining.count(seq_num)); |
| |
| if (instPktsRemaining[seq_num] == 0) { |
| assert(iter->second.empty()); |
| |
| // Remove from both maps |
| instMap.erase(iter++); |
| instPktsRemaining.erase(seq_num); |
| |
| // Release the token |
| DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", seq_num); |
| coalescer->getGMTokenPort().sendTokens(1); |
| } else { |
| ++iter; |
| } |
| } |
| } |
| |
| bool |
| UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) |
| { |
| // Iterate over the instructions held in the UncoalescedTable to see |
| // whether this instruction still has packets left to issue. If it does, |
| // it is not done yet; otherwise, it is done. |
| for (auto& inst : instMap) { |
| DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n", |
| inst.first, inst.second.size()); |
| if (inst.first == instSeqNum) { return false; } |
| } |
| |
| return true; |
| } |
| |
| void |
| UncoalescedTable::printRequestTable(std::stringstream& ss) |
| { |
| ss << "Listing pending packets from " << instMap.size() << " instructions"; |
| |
| for (auto& inst : instMap) { |
| ss << "\tAddr: " << printAddress(inst.first) << " with " |
| << inst.second.size() << " pending packets" << std::endl; |
| } |
| } |
| |
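| // Panic if any pending packet has been outstanding for longer than the |
| // given threshold, dumping the request tables for debugging. |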
| void |
| UncoalescedTable::checkDeadlock(Tick threshold) |
| { |
| Tick current_time = curTick(); |
| |
| for (auto &it : instMap) { |
| for (auto &pkt : it.second) { |
| if (current_time - pkt->req->time() > threshold) { |
| std::stringstream ss; |
| printRequestTable(ss); |
| |
| panic("Possible Deadlock detected. Aborting!\n" |
| "version: %d request.paddr: 0x%x uncoalescedTable: %d " |
| "current time: %u issue_time: %d difference: %d\n" |
| "Request Tables:\n\n%s", coalescer->getId(), |
| pkt->getAddr(), instMap.size(), current_time, |
| pkt->req->time(), current_time - pkt->req->time(), |
| ss.str()); |
| } |
| } |
| } |
| } |
| |
| GPUCoalescer::GPUCoalescer(const Params &p) |
| : RubyPort(p), |
| issueEvent([this]{ completeIssue(); }, "Issue coalesced request", |
| false, Event::Progress_Event_Pri), |
| uncoalescedTable(this), |
| deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check"), |
| gmTokenPort(name() + ".gmTokenPort", this) |
| { |
| m_store_waiting_on_load_cycles = 0; |
| m_store_waiting_on_store_cycles = 0; |
| m_load_waiting_on_store_cycles = 0; |
| m_load_waiting_on_load_cycles = 0; |
| |
| m_outstanding_count = 0; |
| |
| coalescingWindow = p.max_coalesces_per_cycle; |
| |
| m_max_outstanding_requests = 0; |
| m_instCache_ptr = nullptr; |
| m_dataCache_ptr = nullptr; |
| |
| m_instCache_ptr = p.icache; |
| m_dataCache_ptr = p.dcache; |
| m_max_outstanding_requests = p.max_outstanding_requests; |
| m_deadlock_threshold = p.deadlock_threshold; |
| |
| assert(m_max_outstanding_requests > 0); |
| assert(m_deadlock_threshold > 0); |
| assert(m_instCache_ptr); |
| assert(m_dataCache_ptr); |
| |
| m_runningGarnetStandalone = p.garnet_standalone; |
| |
| |
| // These statistical variables are not for display. |
| // The profiler will collate these across different |
| // coalescers and display those collated statistics. |
| m_outstandReqHist.init(10); |
| m_latencyHist.init(10); |
| m_missLatencyHist.init(10); |
| |
| for (int i = 0; i < RubyRequestType_NUM; i++) { |
| m_typeLatencyHist.push_back(new Stats::Histogram()); |
| m_typeLatencyHist[i]->init(10); |
| |
| m_missTypeLatencyHist.push_back(new Stats::Histogram()); |
| m_missTypeLatencyHist[i]->init(10); |
| } |
| |
| for (int i = 0; i < MachineType_NUM; i++) { |
| m_missMachLatencyHist.push_back(new Stats::Histogram()); |
| m_missMachLatencyHist[i]->init(10); |
| |
| m_IssueToInitialDelayHist.push_back(new Stats::Histogram()); |
| m_IssueToInitialDelayHist[i]->init(10); |
| |
| m_InitialToForwardDelayHist.push_back(new Stats::Histogram()); |
| m_InitialToForwardDelayHist[i]->init(10); |
| |
| m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram()); |
| m_ForwardToFirstResponseDelayHist[i]->init(10); |
| |
| m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram()); |
| m_FirstResponseToCompletionDelayHist[i]->init(10); |
| } |
| |
| for (int i = 0; i < RubyRequestType_NUM; i++) { |
| m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>()); |
| |
| for (int j = 0; j < MachineType_NUM; j++) { |
| m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram()); |
| m_missTypeMachLatencyHist[i][j]->init(10); |
| } |
| } |
| |
| } |
| |
| GPUCoalescer::~GPUCoalescer() |
| { |
| } |
| |
| Port & |
| GPUCoalescer::getPort(const std::string &if_name, PortID idx) |
| { |
| if (if_name == "gmTokenPort") { |
| return gmTokenPort; |
| } |
| |
| // delegate to RubyPort otherwise |
| return RubyPort::getPort(if_name, idx); |
| } |
| |
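| // Periodic deadlock check: scan the coalesced and uncoalesced tables for |
| // requests older than the deadlock threshold and reschedule the check |
| // while requests remain outstanding. |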
| void |
| GPUCoalescer::wakeup() |
| { |
| Cycles current_time = curCycle(); |
| for (auto& requestList : coalescedTable) { |
| for (auto& req : requestList.second) { |
| if (current_time - req->getIssueTime() > m_deadlock_threshold) { |
| std::stringstream ss; |
| printRequestTable(ss); |
| warn("GPUCoalescer %d Possible deadlock detected!\n%s\n", |
| m_version, ss.str()); |
| panic("Aborting due to deadlock!\n"); |
| } |
| } |
| } |
| |
| Tick tick_threshold = cyclesToTicks(m_deadlock_threshold); |
| uncoalescedTable.checkDeadlock(tick_threshold); |
| |
| if (m_outstanding_count > 0) { |
| schedule(deadlockCheckEvent, |
| m_deadlock_threshold * clockPeriod() + |
| curTick()); |
| } |
| } |
| |
| void |
| GPUCoalescer::printRequestTable(std::stringstream& ss) |
| { |
| ss << "Printing out " << coalescedTable.size() |
| << " outstanding requests in the coalesced table\n"; |
| |
| for (auto& requestList : coalescedTable) { |
| for (auto& request : requestList.second) { |
| ss << "\tAddr: " << printAddress(requestList.first) << "\n" |
| << "\tInstruction sequence number: " |
| << request->getSeqNum() << "\n" |
| << "\t\tType: " |
| << RubyRequestType_to_string(request->getRubyType()) << "\n" |
| << "\t\tNumber of associated packets: " |
| << request->getPackets().size() << "\n" |
| << "\t\tIssue time: " |
| << request->getIssueTime() * clockPeriod() << "\n" |
| << "\t\tDifference from current tick: " |
| << (curCycle() - request->getIssueTime()) * clockPeriod(); |
| } |
| } |
| |
| // print out packets waiting to be issued in the uncoalesced table |
| uncoalescedTable.printRequestTable(ss); |
| } |
| |
| void |
| GPUCoalescer::resetStats() |
| { |
| m_latencyHist.reset(); |
| m_missLatencyHist.reset(); |
| for (int i = 0; i < RubyRequestType_NUM; i++) { |
| m_typeLatencyHist[i]->reset(); |
| m_missTypeLatencyHist[i]->reset(); |
| for (int j = 0; j < MachineType_NUM; j++) { |
| m_missTypeMachLatencyHist[i][j]->reset(); |
| } |
| } |
| |
| for (int i = 0; i < MachineType_NUM; i++) { |
| m_missMachLatencyHist[i]->reset(); |
| |
| m_IssueToInitialDelayHist[i]->reset(); |
| m_InitialToForwardDelayHist[i]->reset(); |
| m_ForwardToFirstResponseDelayHist[i]->reset(); |
| m_FirstResponseToCompletionDelayHist[i]->reset(); |
| } |
| } |
| |
| void |
| GPUCoalescer::printProgress(std::ostream& out) const |
| { |
| } |
| |
| // sets the kernelEndList |
| void |
| GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt) |
| { |
| // It is not clear whether this can ever happen, but be careful here so |
| // that it does not turn into a simulator hang in the future. |
| DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id); |
| assert(kernelEndList.count(wavefront_id) == 0); |
| |
| kernelEndList[wavefront_id] = pkt; |
| DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n", |
| kernelEndList.size()); |
| } |
| |
| void |
| GPUCoalescer::writeCallback(Addr address, DataBlock& data) |
| { |
| writeCallback(address, MachineType_NULL, data); |
| } |
| |
| void |
| GPUCoalescer::writeCallback(Addr address, |
| MachineType mach, |
| DataBlock& data) |
| { |
| writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0)); |
| } |
| |
| void |
| GPUCoalescer::writeCallback(Addr address, |
| MachineType mach, |
| DataBlock& data, |
| Cycles initialRequestTime, |
| Cycles forwardRequestTime, |
| Cycles firstResponseTime) |
| { |
| writeCallback(address, mach, data, |
| initialRequestTime, forwardRequestTime, firstResponseTime, |
| false); |
| } |
| |
| void |
| GPUCoalescer::writeCallback(Addr address, |
| MachineType mach, |
| DataBlock& data, |
| Cycles initialRequestTime, |
| Cycles forwardRequestTime, |
| Cycles firstResponseTime, |
| bool isRegion) |
| { |
| assert(address == makeLineAddress(address)); |
| assert(coalescedTable.count(address)); |
| |
| auto crequest = coalescedTable.at(address).front(); |
| |
| hitCallback(crequest, mach, data, true, crequest->getIssueTime(), |
| forwardRequestTime, firstResponseTime, isRegion); |
| |
| // remove this crequest in coalescedTable |
| delete crequest; |
| coalescedTable.at(address).pop_front(); |
| |
| if (coalescedTable.at(address).empty()) { |
| coalescedTable.erase(address); |
| } else { |
| auto nextRequest = coalescedTable.at(address).front(); |
| issueRequest(nextRequest); |
| } |
| } |
| |
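| // Called when a write-complete ack arrives for one Ruby request of a |
| // store instruction. Once all acks have arrived and all of the |
| // instruction's requests have been issued, acknowledge completion of the |
| // whole instruction to the requesting CU. |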
| void |
| GPUCoalescer::writeCompleteCallback(Addr address, |
| uint64_t instSeqNum, |
| MachineType mach) |
| { |
| DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x" |
| " instSeqNum = %d\n", address, instSeqNum); |
| |
| assert(pendingWriteInsts.count(instSeqNum) == 1); |
| PendingWriteInst& inst = pendingWriteInsts[instSeqNum]; |
| |
| // check the uncoalescedTable to see whether all requests for the inst |
| // have been issued or not |
| bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum); |
| DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, " |
| "reqsAllIssued=%d\n", reqsAllIssued, |
| inst.getNumPendingStores()-1, reqsAllIssued); |
| |
| if (inst.receiveWriteCompleteAck() && reqsAllIssued) { |
| // if the pending write instruction has received all write completion |
| // callbacks for its issued Ruby requests, we can now respond to the |
| // requesting CU with a single response packet. |
| inst.ackWriteCompletion(m_usingRubyTester); |
| |
| DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n", |
| instSeqNum); |
| pendingWriteInsts.erase(instSeqNum); |
| } |
| } |
| |
| void |
| GPUCoalescer::readCallback(Addr address, DataBlock& data) |
| { |
| readCallback(address, MachineType_NULL, data); |
| } |
| |
| void |
| GPUCoalescer::readCallback(Addr address, |
| MachineType mach, |
| DataBlock& data) |
| { |
| readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0)); |
| } |
| |
| void |
| GPUCoalescer::readCallback(Addr address, |
| MachineType mach, |
| DataBlock& data, |
| Cycles initialRequestTime, |
| Cycles forwardRequestTime, |
| Cycles firstResponseTime) |
| { |
| |
| readCallback(address, mach, data, |
| initialRequestTime, forwardRequestTime, firstResponseTime, |
| false); |
| } |
| |
| void |
| GPUCoalescer::readCallback(Addr address, |
| MachineType mach, |
| DataBlock& data, |
| Cycles initialRequestTime, |
| Cycles forwardRequestTime, |
| Cycles firstResponseTime, |
| bool isRegion) |
| { |
| assert(address == makeLineAddress(address)); |
| assert(coalescedTable.count(address)); |
| |
| auto crequest = coalescedTable.at(address).front(); |
| fatal_if(crequest->getRubyType() != RubyRequestType_LD, |
| "readCallback received non-read type response\n"); |
| |
| // Iterate over the coalesced requests to respond to as many loads as |
| // possible until another request type is seen. Models MSHR for TCP. |
| while (crequest->getRubyType() == RubyRequestType_LD) { |
| hitCallback(crequest, mach, data, true, crequest->getIssueTime(), |
| forwardRequestTime, firstResponseTime, isRegion); |
| |
| delete crequest; |
| coalescedTable.at(address).pop_front(); |
| if (coalescedTable.at(address).empty()) { |
| break; |
| } |
| |
| crequest = coalescedTable.at(address).front(); |
| } |
| |
| if (coalescedTable.at(address).empty()) { |
| coalescedTable.erase(address); |
| } else { |
| auto nextRequest = coalescedTable.at(address).front(); |
| issueRequest(nextRequest); |
| } |
| } |
| |
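| // Complete a coalesced request: record latency statistics, copy data |
| // between Ruby and every coalesced packet, and hand the packets back via |
| // completeHitCallback. |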
| void |
| GPUCoalescer::hitCallback(CoalescedRequest* crequest, |
| MachineType mach, |
| DataBlock& data, |
| bool success, |
| Cycles initialRequestTime, |
| Cycles forwardRequestTime, |
| Cycles firstResponseTime, |
| bool isRegion) |
| { |
| PacketPtr pkt = crequest->getFirstPkt(); |
| Addr request_address = pkt->getAddr(); |
| M5_VAR_USED Addr request_line_address = makeLineAddress(request_address); |
| |
| RubyRequestType type = crequest->getRubyType(); |
| |
| DPRINTF(GPUCoalescer, "Got hitCallback for 0x%X\n", request_line_address); |
| |
| recordMissLatency(crequest, mach, |
| initialRequestTime, |
| forwardRequestTime, |
| firstResponseTime, |
| success, isRegion); |
| // Update the data. This must be done for every packet coalesced into |
| // this request, not just the first one. |
| std::vector<PacketPtr> pktList = crequest->getPackets(); |
| DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n", |
| pktList.size(), request_line_address); |
| for (auto& pkt : pktList) { |
| request_address = pkt->getAddr(); |
| if (pkt->getPtr<uint8_t>()) { |
| if ((type == RubyRequestType_LD) || |
| (type == RubyRequestType_ATOMIC) || |
| (type == RubyRequestType_ATOMIC_RETURN) || |
| (type == RubyRequestType_IFETCH) || |
| (type == RubyRequestType_RMW_Read) || |
| (type == RubyRequestType_Locked_RMW_Read) || |
| (type == RubyRequestType_Load_Linked)) { |
| pkt->setData( |
| data.getData(getOffset(request_address), pkt->getSize())); |
| } else { |
| data.setData(pkt->getPtr<uint8_t>(), |
| getOffset(request_address), pkt->getSize()); |
| } |
| } else { |
| DPRINTF(MemoryAccess, |
| "WARNING. Data not transfered from Ruby to M5 for type " \ |
| "%s\n", |
| RubyRequestType_to_string(type)); |
| } |
| } |
| |
| m_outstanding_count--; |
| assert(m_outstanding_count >= 0); |
| |
| completeHitCallback(pktList); |
| } |
| |
| bool |
| GPUCoalescer::empty() const |
| { |
| return coalescedTable.empty(); |
| } |
| |
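| // Map a packet to its Ruby request type. LLSC, locked RMW, instruction |
| // fetch, and flush requests are not expected here and are rejected by |
| // assertion. |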
| RubyRequestType |
| GPUCoalescer::getRequestType(PacketPtr pkt) |
| { |
| RubyRequestType req_type = RubyRequestType_NULL; |
| |
| // These types are either not supported or not used by GPU caches. |
| assert(!pkt->req->isLLSC()); |
| assert(!pkt->req->isLockedRMW()); |
| assert(!pkt->req->isInstFetch()); |
| assert(!pkt->isFlush()); |
| |
| if (pkt->req->isAtomicReturn()) { |
| req_type = RubyRequestType_ATOMIC_RETURN; |
| } else if (pkt->req->isAtomicNoReturn()) { |
| req_type = RubyRequestType_ATOMIC_NO_RETURN; |
| } else if (pkt->isRead()) { |
| req_type = RubyRequestType_LD; |
| } else if (pkt->isWrite()) { |
| req_type = RubyRequestType_ST; |
| } else { |
| panic("Unsupported ruby packet type\n"); |
| } |
| |
| return req_type; |
| } |
| |
| // Places an uncoalesced packet in uncoalescedTable. If the packet is a |
| // special type (MemFence, scoping, etc), it is issued immediately. |
| RequestStatus |
| GPUCoalescer::makeRequest(PacketPtr pkt) |
| { |
| // all packets must have valid instruction sequence numbers |
| assert(pkt->req->hasInstSeqNum()); |
| |
| if (pkt->cmd == MemCmd::MemSyncReq) { |
| // issue mem_sync requests immediately to the cache system without |
| // going through uncoalescedTable like normal LD/ST/Atomic requests |
| issueMemSyncRequest(pkt); |
| } else { |
| // otherwise, this must be either read or write command |
| assert(pkt->isRead() || pkt->isWrite()); |
| |
| InstSeqNum seq_num = pkt->req->getReqInstSeqNum(); |
| |
| // in the case of the protocol tester, there is one packet per sequence |
| // number. During full simulation, the number of packets depends on the |
| // number of lanes active for that vmem request (i.e., the popcount of |
| // the exec_mask). |
| int num_packets = 1; |
| if (!m_usingRubyTester) { |
| num_packets = getDynInst(pkt)->exec_mask.count(); |
| } |
| |
| // the pkt is temporarily stored in the uncoalesced table until it's |
| // picked up by the coalescing process later in this cycle or in a |
| // future cycle. Packets remaining is set to the number of expected |
| // requests from the instruction based on its exec_mask. |
| uncoalescedTable.insertPacket(pkt); |
| uncoalescedTable.initPacketsRemaining(seq_num, num_packets); |
| DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n", |
| pkt->getAddr()); |
| |
| // we schedule an issue event here to process the uncoalesced table |
| // and try to issue Ruby request to cache system |
| if (!issueEvent.scheduled()) { |
| DPRINTF(GPUCoalescer, "Scheduled issueEvent for seqNum %d\n", |
| seq_num); |
| schedule(issueEvent, curTick()); |
| } |
| } |
| |
| // we always return RequestStatus_Issued in this coalescer because the |
| // coalescer's resources were checked earlier and the coalescer queues |
| // up aliased requests in its coalesced table |
| return RequestStatus_Issued; |
| } |
| |
| template <class KEY, class VALUE> |
| std::ostream & |
| operator<<(std::ostream &out, const std::unordered_map<KEY, VALUE> &map) |
| { |
| out << "["; |
| for (auto i = map.begin(); i != map.end(); ++i) |
| out << " " << i->first << "=" << i->second; |
| out << " ]"; |
| |
| return out; |
| } |
| |
| void |
| GPUCoalescer::print(std::ostream& out) const |
| { |
| out << "[GPUCoalescer: " << m_version |
| << ", outstanding requests: " << m_outstanding_count |
| << "]"; |
| } |
| |
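| // Recover the GPU dynamic instruction from the packet's sender-state |
| // chain (RubyPort::SenderState -> ComputeUnit::DataPort::SenderState). |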
| GPUDynInstPtr |
| GPUCoalescer::getDynInst(PacketPtr pkt) const |
| { |
| RubyPort::SenderState* ss = |
| safe_cast<RubyPort::SenderState*>(pkt->senderState); |
| |
| ComputeUnit::DataPort::SenderState* cu_state = |
| safe_cast<ComputeUnit::DataPort::SenderState*> |
| (ss->predecessor); |
| |
| return cu_state->_gpuDynInst; |
| } |
| |
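| // Try to coalesce a packet into the coalesced table. Returns true if the |
| // packet joins an existing request for the same line and instruction, or |
| // if a new coalesced request could be created; returns false when the |
| // maximum number of outstanding requests has been reached. |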
| bool |
| GPUCoalescer::coalescePacket(PacketPtr pkt) |
| { |
| uint64_t seqNum = pkt->req->getReqInstSeqNum(); |
| Addr line_addr = makeLineAddress(pkt->getAddr()); |
| |
| // If the packet has the same line address as a request already in the |
| // coalescedTable and has the same sequence number, it can be coalesced. |
| if (coalescedTable.count(line_addr)) { |
| // Search for a previous coalesced request with the same seqNum. |
| auto& creqQueue = coalescedTable.at(line_addr); |
| auto citer = std::find_if(creqQueue.begin(), creqQueue.end(), |
| [&](CoalescedRequest* c) { return c->getSeqNum() == seqNum; } |
| ); |
| if (citer != creqQueue.end()) { |
| (*citer)->insertPacket(pkt); |
| return true; |
| } |
| } |
| |
| if (m_outstanding_count < m_max_outstanding_requests) { |
| // This is an "aliased" or new request. Create a RubyRequest and |
| // append it to the list of "targets" in the coalescing table. |
| DPRINTF(GPUCoalescer, "Creating new or aliased request for 0x%X\n", |
| line_addr); |
| |
| CoalescedRequest *creq = new CoalescedRequest(seqNum); |
| creq->insertPacket(pkt); |
| creq->setRubyType(getRequestType(pkt)); |
| creq->setIssueTime(curCycle()); |
| |
| if (!coalescedTable.count(line_addr)) { |
| // If there is no outstanding request for this line address, |
| // create a new coalesced request and issue it immediately. |
| auto reqList = std::deque<CoalescedRequest*> { creq }; |
| coalescedTable.insert(std::make_pair(line_addr, reqList)); |
| if (!coalescedReqs.count(seqNum)) { |
| coalescedReqs.insert(std::make_pair(seqNum, reqList)); |
| } else { |
| coalescedReqs.at(seqNum).push_back(creq); |
| } |
| } else { |
| // The request is for a line address that is already outstanding |
| // but for a different instruction. Add it as a new request to be |
| // issued when the current outstanding request is completed. |
| coalescedTable.at(line_addr).push_back(creq); |
| DPRINTF(GPUCoalescer, "found address 0x%X with new seqNum %d\n", |
| line_addr, seqNum); |
| } |
| |
| // In both cases, requests are added to the coalescing table and will |
| // be counted as outstanding requests. |
| m_outstanding_count++; |
| |
| // We track all issued or to-be-issued Ruby requests associated with |
| // write instructions. An instruction may have multiple Ruby |
| // requests. |
| if (pkt->cmd == MemCmd::WriteReq) { |
| DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to" |
| " the pending write instruction list\n", seqNum, |
| line_addr); |
| |
| RubyPort::SenderState* ss = |
| safe_cast<RubyPort::SenderState*>(pkt->senderState); |
| |
| // we need to save this port because it will be used to call |
| // back the requesting CU when we receive write |
| // complete callbacks for all issued Ruby requests of this |
| // instruction. |
| RubyPort::MemResponsePort* mem_response_port = ss->port; |
| |
| GPUDynInstPtr gpuDynInst = nullptr; |
| |
| if (!m_usingRubyTester) { |
| // If this coalescer is connected to a real CU, we need |
| // to save the corresponding gpu dynamic instruction. |
| // CU will use that instruction to decrement wait counters |
| // in the issuing wavefront. |
| // For Ruby tester, gpuDynInst == nullptr |
| gpuDynInst = getDynInst(pkt); |
| } |
| |
| PendingWriteInst& inst = pendingWriteInsts[seqNum]; |
| inst.addPendingReq(mem_response_port, gpuDynInst, |
| m_usingRubyTester); |
| } |
| |
| return true; |
| } |
| |
| // The maximum number of outstanding requests have been issued. |
| return false; |
| } |
| |
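| // Process the uncoalesced table, coalescing packets from up to |
| // coalescingWindow instructions this cycle, issuing the resulting Ruby |
| // requests, and then handling any pending kernel-end releases. |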
| void |
| GPUCoalescer::completeIssue() |
| { |
| // Iterate over the maximum number of instructions we can coalesce |
| // per cycle (coalescingWindow). |
| for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) { |
| PerInstPackets *pkt_list = |
| uncoalescedTable.getInstPackets(instIdx); |
| |
| // getInstPackets will return nullptr if no instruction |
| // exists at the current offset. |
| if (!pkt_list) { |
| break; |
| } else if (pkt_list->empty()) { |
| // Found an instruction, but its packets have not been cleaned up by |
| // updateResources yet. See if there is anything else to coalesce. |
| // If the coalescing window is 1, nothing more can be checked. |
| continue; |
| } else { |
| // All packets in the list have the same seqNum, use first. |
| InstSeqNum seq_num = pkt_list->front()->req->getReqInstSeqNum(); |
| |
| // The difference in list size before and after tells us the |
| // number of packets which were coalesced. |
| size_t pkt_list_size = pkt_list->size(); |
| |
| // Since we have a pointer to the list of packets in the inst, |
| // erase them from the list if coalescing is successful and |
| // leave them in the list otherwise. This aggressively attempts |
| // to coalesce as many packets as possible from the current inst. |
| pkt_list->remove_if( |
| [&](PacketPtr pkt) { return coalescePacket(pkt); } |
| ); |
| |
| if (coalescedReqs.count(seq_num)) { |
| auto& creqs = coalescedReqs.at(seq_num); |
| for (auto creq : creqs) { |
| DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n", |
| RubyRequestType_to_string(creq->getRubyType()), |
| seq_num); |
| issueRequest(creq); |
| } |
| coalescedReqs.erase(seq_num); |
| } |
| |
| assert(pkt_list_size >= pkt_list->size()); |
| size_t pkt_list_diff = pkt_list_size - pkt_list->size(); |
| |
| int num_remaining = uncoalescedTable.getPacketsRemaining(seq_num); |
| num_remaining -= pkt_list_diff; |
| assert(num_remaining >= 0); |
| |
| uncoalescedTable.setPacketsRemaining(seq_num, num_remaining); |
| DPRINTF(GPUCoalescer, |
| "Coalesced %d pkts for seqNum %d, %d remaining\n", |
| pkt_list_diff, seq_num, num_remaining); |
| } |
| } |
| |
| // Clean up any instructions in the uncoalesced table that have had all |
| // of their packets coalesced and return a GM token for each of them. |
| uncoalescedTable.updateResources(); |
| |
| // issue callbacks for any kernel-end releases received this cycle |
| int len = newKernelEnds.size(); |
| for (int i = 0; i < len; i++) { |
| kernelCallback(newKernelEnds[i]); |
| } |
| newKernelEnds.clear(); |
| } |
| |
| void |
| GPUCoalescer::evictionCallback(Addr address) |
| { |
| ruby_eviction_callback(address); |
| } |
| |
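| // Respond to the kernel-end packet stored for this wavefront and remove |
| // it from kernelEndList. |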
| void |
| GPUCoalescer::kernelCallback(int wavefront_id) |
| { |
| assert(kernelEndList.count(wavefront_id)); |
| |
| ruby_hit_callback(kernelEndList[wavefront_id]); |
| |
| kernelEndList.erase(wavefront_id); |
| } |
| |
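| // Complete the oldest coalesced atomic request for this line; if other |
| // requests are queued for the line, issue the next one. |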
| void |
| GPUCoalescer::atomicCallback(Addr address, |
| MachineType mach, |
| const DataBlock& data) |
| { |
| assert(address == makeLineAddress(address)); |
| assert(coalescedTable.count(address)); |
| |
| auto crequest = coalescedTable.at(address).front(); |
| |
| fatal_if((crequest->getRubyType() != RubyRequestType_ATOMIC && |
| crequest->getRubyType() != RubyRequestType_ATOMIC_RETURN && |
| crequest->getRubyType() != RubyRequestType_ATOMIC_NO_RETURN), |
| "atomicCallback saw non-atomic type response\n"); |
| |
| hitCallback(crequest, mach, (DataBlock&)data, true, |
| crequest->getIssueTime(), Cycles(0), Cycles(0), false); |
| |
| delete crequest; |
| coalescedTable.at(address).pop_front(); |
| |
| if (coalescedTable.at(address).empty()) { |
| coalescedTable.erase(address); |
| } else { |
| auto nextRequest = coalescedTable.at(address).front(); |
| issueRequest(nextRequest); |
| } |
| } |
| |
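| // Return each completed packet to its response port, retry any blocked |
| // requests, and schedule another coalescing pass if uncoalesced packets |
| // remain. |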
| void |
| GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist) |
| { |
| for (auto& pkt : mylist) { |
| RubyPort::SenderState *ss = |
| safe_cast<RubyPort::SenderState *>(pkt->senderState); |
| MemResponsePort *port = ss->port; |
| assert(port != NULL); |
| |
| pkt->senderState = ss->predecessor; |
| |
| if (pkt->cmd != MemCmd::WriteReq) { |
| // for WriteReq, we keep the original senderState until |
| // writeCompleteCallback |
| delete ss; |
| } |
| |
| port->hitCallback(pkt); |
| trySendRetries(); |
| } |
| |
| // We schedule an event in the same tick as hitCallback (similar to |
| // makeRequest) rather than calling completeIssue directly to reduce |
| // function calls to complete issue. This can only happen if the max |
| // outstanding requests is less than the number of slots in the |
| // uncoalesced table and makeRequest is not called again. |
| if (uncoalescedTable.packetAvailable() && !issueEvent.scheduled()) { |
| schedule(issueEvent, curTick()); |
| } |
| |
| testDrainComplete(); |
| } |
| |
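| // No miss-latency statistics are recorded in the base GPUCoalescer. |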
| void |
| GPUCoalescer::recordMissLatency(CoalescedRequest* crequest, |
| MachineType mach, |
| Cycles initialRequestTime, |
| Cycles forwardRequestTime, |
| Cycles firstResponseTime, |
| bool success, bool isRegion) |
| { |
| } |
| |