| /* |
| * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * For use for simulation and test purposes only |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its |
| * contributors may be used to endorse or promote products derived from this |
| * software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| * |
| * Authors: Sooraj Puthoor |
| */ |
| |
| #include "base/logging.hh" |
| #include "base/str.hh" |
| #include "config/the_isa.hh" |
| |
#if THE_ISA == X86_ISA
#include "arch/x86/insts/microldstop.hh"
#endif // X86_ISA
| #include "mem/ruby/system/GPUCoalescer.hh" |
| |
| #include "cpu/testers/rubytest/RubyTester.hh" |
| #include "debug/GPUCoalescer.hh" |
| #include "debug/MemoryAccess.hh" |
| #include "debug/ProtocolTrace.hh" |
| #include "debug/RubyPort.hh" |
| #include "debug/RubyStats.hh" |
| #include "gpu-compute/shader.hh" |
| #include "mem/packet.hh" |
| #include "mem/ruby/common/DataBlock.hh" |
| #include "mem/ruby/common/SubBlock.hh" |
| #include "mem/ruby/network/MessageBuffer.hh" |
| #include "mem/ruby/profiler/Profiler.hh" |
| #include "mem/ruby/slicc_interface/AbstractController.hh" |
| #include "mem/ruby/slicc_interface/RubyRequest.hh" |
| #include "mem/ruby/structures/CacheMemory.hh" |
| #include "mem/ruby/system/RubySystem.hh" |
| #include "params/RubyGPUCoalescer.hh" |
| |
| using namespace std; |
| |
| GPUCoalescer * |
| RubyGPUCoalescerParams::create() |
| { |
| return new GPUCoalescer(this); |
| } |
| |
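// Map the memory scope flags on a request to the corresponding HSAScope
// value. Requests with no scope flag set map to HSAScope_UNSPECIFIED; a
// scoped request with an unrecognized scope is a fatal error.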
| HSAScope |
| reqScopeToHSAScope(const RequestPtr &req) |
| { |
| HSAScope accessScope = HSAScope_UNSPECIFIED; |
| if (req->isScoped()) { |
| if (req->isWavefrontScope()) { |
| accessScope = HSAScope_WAVEFRONT; |
| } else if (req->isWorkgroupScope()) { |
| accessScope = HSAScope_WORKGROUP; |
| } else if (req->isDeviceScope()) { |
| accessScope = HSAScope_DEVICE; |
| } else if (req->isSystemScope()) { |
| accessScope = HSAScope_SYSTEM; |
| } else { |
| fatal("Bad scope type"); |
| } |
| } |
| return accessScope; |
| } |
| |
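// Map the memory segment flags on a request to the corresponding
// HSASegment value; an unrecognized segment type is a fatal error.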
| HSASegment |
| reqSegmentToHSASegment(const RequestPtr &req) |
| { |
| HSASegment accessSegment = HSASegment_GLOBAL; |
| |
| if (req->isGlobalSegment()) { |
| accessSegment = HSASegment_GLOBAL; |
| } else if (req->isGroupSegment()) { |
| accessSegment = HSASegment_GROUP; |
| } else if (req->isPrivateSegment()) { |
| accessSegment = HSASegment_PRIVATE; |
| } else if (req->isKernargSegment()) { |
| accessSegment = HSASegment_KERNARG; |
| } else if (req->isReadonlySegment()) { |
| accessSegment = HSASegment_READONLY; |
| } else if (req->isSpillSegment()) { |
| accessSegment = HSASegment_SPILL; |
| } else if (req->isArgSegment()) { |
| accessSegment = HSASegment_ARG; |
| } else { |
| fatal("Bad segment type"); |
| } |
| |
| return accessSegment; |
| } |
| |
| GPUCoalescer::GPUCoalescer(const Params *p) |
| : RubyPort(p), |
| issueEvent([this]{ completeIssue(); }, "Issue coalesced request", |
| false, Event::Progress_Event_Pri), |
| deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check") |
| { |
| m_store_waiting_on_load_cycles = 0; |
| m_store_waiting_on_store_cycles = 0; |
| m_load_waiting_on_store_cycles = 0; |
| m_load_waiting_on_load_cycles = 0; |
| |
| m_outstanding_count = 0; |
| |
| m_max_outstanding_requests = 0; |
| m_deadlock_threshold = 0; |
| m_instCache_ptr = nullptr; |
| m_dataCache_ptr = nullptr; |
| |
| m_instCache_ptr = p->icache; |
| m_dataCache_ptr = p->dcache; |
| m_max_outstanding_requests = p->max_outstanding_requests; |
| m_deadlock_threshold = p->deadlock_threshold; |
| |
| assert(m_max_outstanding_requests > 0); |
| assert(m_deadlock_threshold > 0); |
| assert(m_instCache_ptr); |
| assert(m_dataCache_ptr); |
| |
| m_data_cache_hit_latency = p->dcache_hit_latency; |
| |
| m_runningGarnetStandalone = p->garnet_standalone; |
| assumingRfOCoherence = p->assume_rfo; |
| } |
| |
| GPUCoalescer::~GPUCoalescer() |
| { |
| } |
| |
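// Periodic deadlock check: panic if any outstanding read or write
// request has been pending for more than m_deadlock_threshold cycles,
// and reschedule the check while requests remain outstanding.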
| void |
| GPUCoalescer::wakeup() |
| { |
| // Check for deadlock of any of the requests |
| Cycles current_time = curCycle(); |
| |
| // Check across all outstanding requests |
| int total_outstanding = 0; |
| |
| RequestTable::iterator read = m_readRequestTable.begin(); |
| RequestTable::iterator read_end = m_readRequestTable.end(); |
| for (; read != read_end; ++read) { |
| GPUCoalescerRequest* request = read->second; |
| if (current_time - request->issue_time < m_deadlock_threshold) |
| continue; |
| |
| panic("Possible Deadlock detected. Aborting!\n" |
| "version: %d request.paddr: 0x%x m_readRequestTable: %d " |
| "current time: %u issue_time: %d difference: %d\n", m_version, |
| request->pkt->getAddr(), m_readRequestTable.size(), |
| current_time * clockPeriod(), request->issue_time * clockPeriod(), |
| (current_time - request->issue_time)*clockPeriod()); |
| } |
| |
| RequestTable::iterator write = m_writeRequestTable.begin(); |
| RequestTable::iterator write_end = m_writeRequestTable.end(); |
| for (; write != write_end; ++write) { |
| GPUCoalescerRequest* request = write->second; |
| if (current_time - request->issue_time < m_deadlock_threshold) |
| continue; |
| |
| panic("Possible Deadlock detected. Aborting!\n" |
| "version: %d request.paddr: 0x%x m_writeRequestTable: %d " |
| "current time: %u issue_time: %d difference: %d\n", m_version, |
| request->pkt->getAddr(), m_writeRequestTable.size(), |
| current_time * clockPeriod(), request->issue_time * clockPeriod(), |
| (current_time - request->issue_time) * clockPeriod()); |
| } |
| |
| total_outstanding += m_writeRequestTable.size(); |
| total_outstanding += m_readRequestTable.size(); |
| |
| assert(m_outstanding_count == total_outstanding); |
| |
| if (m_outstanding_count > 0) { |
| // If there are still outstanding requests, keep checking |
| schedule(deadlockCheckEvent, |
| m_deadlock_threshold * clockPeriod() + |
| curTick()); |
| } |
| } |
| |
| void |
| GPUCoalescer::resetStats() |
| { |
| m_latencyHist.reset(); |
| m_missLatencyHist.reset(); |
| for (int i = 0; i < RubyRequestType_NUM; i++) { |
| m_typeLatencyHist[i]->reset(); |
| m_missTypeLatencyHist[i]->reset(); |
| for (int j = 0; j < MachineType_NUM; j++) { |
| m_missTypeMachLatencyHist[i][j]->reset(); |
| } |
| } |
| |
| for (int i = 0; i < MachineType_NUM; i++) { |
| m_missMachLatencyHist[i]->reset(); |
| |
| m_IssueToInitialDelayHist[i]->reset(); |
| m_InitialToForwardDelayHist[i]->reset(); |
| m_ForwardToFirstResponseDelayHist[i]->reset(); |
| m_FirstResponseToCompletionDelayHist[i]->reset(); |
| } |
| } |
| |
| void |
| GPUCoalescer::printProgress(ostream& out) const |
| { |
| } |
| |
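// Determine whether a request of the given type can currently be issued
// to the cache line containing pkt's address. Returns BufferFull if the
// mandatory queue has no space, Aliased if the controller is blocked on
// the line (except for locked RMW writes) or an outstanding request to
// the same line exists, and Ready otherwise.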
| RequestStatus |
| GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type) |
| { |
| Addr line_addr = makeLineAddress(pkt->getAddr()); |
| |
| if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) { |
| return RequestStatus_BufferFull; |
| } |
| |
| if (m_controller->isBlocked(line_addr) && |
| request_type != RubyRequestType_Locked_RMW_Write) { |
| return RequestStatus_Aliased; |
| } |
| |
| if ((request_type == RubyRequestType_ST) || |
| (request_type == RubyRequestType_ATOMIC) || |
| (request_type == RubyRequestType_ATOMIC_RETURN) || |
| (request_type == RubyRequestType_ATOMIC_NO_RETURN) || |
| (request_type == RubyRequestType_RMW_Read) || |
| (request_type == RubyRequestType_RMW_Write) || |
| (request_type == RubyRequestType_Load_Linked) || |
| (request_type == RubyRequestType_Store_Conditional) || |
| (request_type == RubyRequestType_Locked_RMW_Read) || |
| (request_type == RubyRequestType_Locked_RMW_Write) || |
| (request_type == RubyRequestType_FLUSH)) { |
| |
| // Check if there is any outstanding read request for the same |
| // cache line. |
| if (m_readRequestTable.count(line_addr) > 0) { |
| m_store_waiting_on_load_cycles++; |
| return RequestStatus_Aliased; |
| } |
| |
| if (m_writeRequestTable.count(line_addr) > 0) { |
| // There is an outstanding write request for the cache line |
| m_store_waiting_on_store_cycles++; |
| return RequestStatus_Aliased; |
| } |
| } else { |
| // Check if there is any outstanding write request for the same |
| // cache line. |
| if (m_writeRequestTable.count(line_addr) > 0) { |
| m_load_waiting_on_store_cycles++; |
| return RequestStatus_Aliased; |
| } |
| |
| if (m_readRequestTable.count(line_addr) > 0) { |
| // There is an outstanding read request for the cache line |
| m_load_waiting_on_load_cycles++; |
| return RequestStatus_Aliased; |
| } |
| } |
| |
    return RequestStatus_Ready;
}
| |
| // sets the kernelEndList |
| void |
| GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt) |
| { |
    // It is unclear whether this can actually happen, but be careful
    // so that a duplicate insertion does not turn into a simulator
    // hang in the future
    DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndList\n", wavefront_id);
| assert(kernelEndList.count(wavefront_id) == 0); |
| |
| kernelEndList[wavefront_id] = pkt; |
| DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n", |
| kernelEndList.size()); |
| } |
| |
| |
| // Insert the request on the correct request table. Return true if |
| // the entry was already present. |
| bool |
| GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type) |
| { |
| assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready || |
| pkt->req->isLockedRMW() || |
| !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())); |
| |
| int total_outstanding M5_VAR_USED = |
| m_writeRequestTable.size() + m_readRequestTable.size(); |
| |
| assert(m_outstanding_count == total_outstanding); |
| |
    // See if we should schedule a deadlock check
    if (!deadlockCheckEvent.scheduled()) {
        // the deadlock threshold is in cycles and must be converted to
        // ticks, as in wakeup()
        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() + curTick());
    }
| |
| Addr line_addr = makeLineAddress(pkt->getAddr()); |
| if ((request_type == RubyRequestType_ST) || |
| (request_type == RubyRequestType_ATOMIC) || |
| (request_type == RubyRequestType_ATOMIC_RETURN) || |
| (request_type == RubyRequestType_ATOMIC_NO_RETURN) || |
| (request_type == RubyRequestType_RMW_Read) || |
| (request_type == RubyRequestType_RMW_Write) || |
| (request_type == RubyRequestType_Load_Linked) || |
| (request_type == RubyRequestType_Store_Conditional) || |
| (request_type == RubyRequestType_Locked_RMW_Read) || |
| (request_type == RubyRequestType_Locked_RMW_Write) || |
| (request_type == RubyRequestType_FLUSH)) { |
| |
| pair<RequestTable::iterator, bool> r = |
| m_writeRequestTable.insert(RequestTable::value_type(line_addr, |
| (GPUCoalescerRequest*) NULL)); |
| if (r.second) { |
| RequestTable::iterator i = r.first; |
| i->second = new GPUCoalescerRequest(pkt, request_type, |
| curCycle()); |
| DPRINTF(GPUCoalescer, |
| "Inserting write request for paddr %#x for type %d\n", |
| pkt->req->getPaddr(), i->second->m_type); |
| m_outstanding_count++; |
| } else { |
| return true; |
| } |
| } else { |
| pair<RequestTable::iterator, bool> r = |
| m_readRequestTable.insert(RequestTable::value_type(line_addr, |
| (GPUCoalescerRequest*) NULL)); |
| |
| if (r.second) { |
| RequestTable::iterator i = r.first; |
| i->second = new GPUCoalescerRequest(pkt, request_type, |
| curCycle()); |
| DPRINTF(GPUCoalescer, |
| "Inserting read request for paddr %#x for type %d\n", |
| pkt->req->getPaddr(), i->second->m_type); |
| m_outstanding_count++; |
| } else { |
| return true; |
| } |
| } |
| |
| m_outstandReqHist.sample(m_outstanding_count); |
| |
| total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size(); |
| assert(m_outstanding_count == total_outstanding); |
| |
| return false; |
| } |
| |
| void |
| GPUCoalescer::markRemoved() |
| { |
| m_outstanding_count--; |
| assert(m_outstanding_count == |
| m_writeRequestTable.size() + m_readRequestTable.size()); |
| } |
| |
| void |
| GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest) |
| { |
| assert(m_outstanding_count == |
| m_writeRequestTable.size() + m_readRequestTable.size()); |
| |
| Addr line_addr = makeLineAddress(srequest->pkt->getAddr()); |
| if ((srequest->m_type == RubyRequestType_ST) || |
| (srequest->m_type == RubyRequestType_RMW_Read) || |
| (srequest->m_type == RubyRequestType_RMW_Write) || |
| (srequest->m_type == RubyRequestType_Load_Linked) || |
| (srequest->m_type == RubyRequestType_Store_Conditional) || |
| (srequest->m_type == RubyRequestType_Locked_RMW_Read) || |
| (srequest->m_type == RubyRequestType_Locked_RMW_Write)) { |
| m_writeRequestTable.erase(line_addr); |
| } else { |
| m_readRequestTable.erase(line_addr); |
| } |
| |
| markRemoved(); |
| } |
| |
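// Perform load-linked/store-conditional bookkeeping against the data
// cache's per-line locks. Returns false only when a store conditional
// fails because the line is no longer locked by this coalescer.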
| bool |
| GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request) |
| { |
| // |
| // The success flag indicates whether the LLSC operation was successful. |
| // LL ops will always succeed, but SC may fail if the cache line is no |
| // longer locked. |
| // |
| bool success = true; |
| if (request->m_type == RubyRequestType_Store_Conditional) { |
| if (!m_dataCache_ptr->isLocked(address, m_version)) { |
| // |
| // For failed SC requests, indicate the failure to the cpu by |
| // setting the extra data to zero. |
| // |
| request->pkt->req->setExtraData(0); |
| success = false; |
| } else { |
| // |
| // For successful SC requests, indicate the success to the cpu by |
| // setting the extra data to one. |
| // |
| request->pkt->req->setExtraData(1); |
| } |
| // |
| // Independent of success, all SC operations must clear the lock |
| // |
| m_dataCache_ptr->clearLocked(address); |
| } else if (request->m_type == RubyRequestType_Load_Linked) { |
| // |
| // Note: To fully follow Alpha LLSC semantics, should the LL clear any |
| // previously locked cache lines? |
| // |
| m_dataCache_ptr->setLocked(address, m_version); |
| } else if ((m_dataCache_ptr->isTagPresent(address)) && |
| (m_dataCache_ptr->isLocked(address, m_version))) { |
| // |
| // Normal writes should clear the locked address |
| // |
| m_dataCache_ptr->clearLocked(address); |
| } |
| return success; |
| } |
| |
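// The short writeCallback overloads below forward to the full version,
// filling in MachineType_NULL and zero timing values as defaults.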
| void |
| GPUCoalescer::writeCallback(Addr address, DataBlock& data) |
| { |
| writeCallback(address, MachineType_NULL, data); |
| } |
| |
| void |
| GPUCoalescer::writeCallback(Addr address, |
| MachineType mach, |
| DataBlock& data) |
| { |
| writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0)); |
| } |
| |
| void |
| GPUCoalescer::writeCallback(Addr address, |
| MachineType mach, |
| DataBlock& data, |
| Cycles initialRequestTime, |
| Cycles forwardRequestTime, |
| Cycles firstResponseTime) |
| { |
| writeCallback(address, mach, data, |
| initialRequestTime, forwardRequestTime, firstResponseTime, |
| false); |
| } |
| |
| void |
| GPUCoalescer::writeCallback(Addr address, |
| MachineType mach, |
| DataBlock& data, |
| Cycles initialRequestTime, |
| Cycles forwardRequestTime, |
| Cycles firstResponseTime, |
| bool isRegion) |
| { |
| assert(address == makeLineAddress(address)); |
| |
| DPRINTF(GPUCoalescer, "write callback for address %#x\n", address); |
| assert(m_writeRequestTable.count(makeLineAddress(address))); |
| |
| RequestTable::iterator i = m_writeRequestTable.find(address); |
| assert(i != m_writeRequestTable.end()); |
| GPUCoalescerRequest* request = i->second; |
| |
| m_writeRequestTable.erase(i); |
| markRemoved(); |
| |
| assert((request->m_type == RubyRequestType_ST) || |
| (request->m_type == RubyRequestType_ATOMIC) || |
| (request->m_type == RubyRequestType_ATOMIC_RETURN) || |
| (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) || |
| (request->m_type == RubyRequestType_RMW_Read) || |
| (request->m_type == RubyRequestType_RMW_Write) || |
| (request->m_type == RubyRequestType_Load_Linked) || |
| (request->m_type == RubyRequestType_Store_Conditional) || |
| (request->m_type == RubyRequestType_Locked_RMW_Read) || |
| (request->m_type == RubyRequestType_Locked_RMW_Write) || |
| (request->m_type == RubyRequestType_FLUSH)); |
| |
| // |
| // For Alpha, properly handle LL, SC, and write requests with respect to |
| // locked cache blocks. |
| // |
    // Not valid for the Garnet_standalone protocol
| // |
| bool success = true; |
| if (!m_runningGarnetStandalone) |
| success = handleLlsc(address, request); |
| |
| if (request->m_type == RubyRequestType_Locked_RMW_Read) { |
| m_controller->blockOnQueue(address, m_mandatory_q_ptr); |
| } else if (request->m_type == RubyRequestType_Locked_RMW_Write) { |
| m_controller->unblock(address); |
| } |
| |
| hitCallback(request, mach, data, success, |
| request->issue_time, forwardRequestTime, firstResponseTime, |
| isRegion); |
| } |
| |
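// As with writeCallback, the short readCallback overloads forward to
// the full version with default machine type and timing values.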
| void |
| GPUCoalescer::readCallback(Addr address, DataBlock& data) |
| { |
| readCallback(address, MachineType_NULL, data); |
| } |
| |
| void |
| GPUCoalescer::readCallback(Addr address, |
| MachineType mach, |
| DataBlock& data) |
| { |
| readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0)); |
| } |
| |
| void |
| GPUCoalescer::readCallback(Addr address, |
| MachineType mach, |
| DataBlock& data, |
| Cycles initialRequestTime, |
| Cycles forwardRequestTime, |
| Cycles firstResponseTime) |
| { |
| |
| readCallback(address, mach, data, |
| initialRequestTime, forwardRequestTime, firstResponseTime, |
| false); |
| } |
| |
| void |
| GPUCoalescer::readCallback(Addr address, |
| MachineType mach, |
| DataBlock& data, |
| Cycles initialRequestTime, |
| Cycles forwardRequestTime, |
| Cycles firstResponseTime, |
| bool isRegion) |
| { |
| assert(address == makeLineAddress(address)); |
| assert(m_readRequestTable.count(makeLineAddress(address))); |
| |
| DPRINTF(GPUCoalescer, "read callback for address %#x\n", address); |
| RequestTable::iterator i = m_readRequestTable.find(address); |
| assert(i != m_readRequestTable.end()); |
| GPUCoalescerRequest* request = i->second; |
| |
| m_readRequestTable.erase(i); |
| markRemoved(); |
| |
| assert((request->m_type == RubyRequestType_LD) || |
| (request->m_type == RubyRequestType_IFETCH)); |
| |
| hitCallback(request, mach, data, true, |
| request->issue_time, forwardRequestTime, firstResponseTime, |
| isRegion); |
| } |
| |
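// Service every packet that was coalesced on the completed line: update
// cache replacement state and latency statistics, copy data between the
// Ruby DataBlock and each packet, and hand the packets back to their
// ports.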
| void |
| GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest, |
| MachineType mach, |
| DataBlock& data, |
| bool success, |
| Cycles initialRequestTime, |
| Cycles forwardRequestTime, |
| Cycles firstResponseTime, |
| bool isRegion) |
| { |
| PacketPtr pkt = srequest->pkt; |
| Addr request_address = pkt->getAddr(); |
| Addr request_line_address = makeLineAddress(request_address); |
| |
| RubyRequestType type = srequest->m_type; |
| |
| // Set this cache entry to the most recently used |
| if (type == RubyRequestType_IFETCH) { |
| if (m_instCache_ptr->isTagPresent(request_line_address)) |
| m_instCache_ptr->setMRU(request_line_address); |
| } else { |
| if (m_dataCache_ptr->isTagPresent(request_line_address)) |
| m_dataCache_ptr->setMRU(request_line_address); |
| } |
| |
| recordMissLatency(srequest, mach, |
| initialRequestTime, |
| forwardRequestTime, |
| firstResponseTime, |
| success, isRegion); |
    // Update the data: this must be done for each request that was
    // coalesced on this cache line
| int len = reqCoalescer[request_line_address].size(); |
| std::vector<PacketPtr> mylist; |
| for (int i = 0; i < len; ++i) { |
| PacketPtr pkt = reqCoalescer[request_line_address][i].pkt; |
| assert(type == reqCoalescer[request_line_address][i].primaryType); |
| request_address = pkt->getAddr(); |
| request_line_address = makeLineAddress(pkt->getAddr()); |
| if (pkt->getPtr<uint8_t>()) { |
| if ((type == RubyRequestType_LD) || |
| (type == RubyRequestType_ATOMIC) || |
| (type == RubyRequestType_ATOMIC_RETURN) || |
| (type == RubyRequestType_IFETCH) || |
| (type == RubyRequestType_RMW_Read) || |
| (type == RubyRequestType_Locked_RMW_Read) || |
| (type == RubyRequestType_Load_Linked)) { |
| memcpy(pkt->getPtr<uint8_t>(), |
| data.getData(getOffset(request_address), |
| pkt->getSize()), |
| pkt->getSize()); |
| } else { |
| data.setData(pkt->getPtr<uint8_t>(), |
| getOffset(request_address), pkt->getSize()); |
| } |
| } else { |
| DPRINTF(MemoryAccess, |
| "WARNING. Data not transfered from Ruby to M5 for type " \ |
| "%s\n", |
| RubyRequestType_to_string(type)); |
| } |
| |
| // If using the RubyTester, update the RubyTester sender state's |
        // subBlock with the received data. The tester will later access
        // this state.
        // Note: RubyPort will access its sender state before the
        // RubyTester.
| if (m_usingRubyTester) { |
| RubyPort::SenderState *requestSenderState = |
| safe_cast<RubyPort::SenderState*>(pkt->senderState); |
| RubyTester::SenderState* testerSenderState = |
| safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor); |
| testerSenderState->subBlock.mergeFrom(data); |
| } |
| |
| mylist.push_back(pkt); |
| } |
| delete srequest; |
| reqCoalescer.erase(request_line_address); |
| assert(!reqCoalescer.count(request_line_address)); |

    completeHitCallback(mylist, len);
| } |
| |
| bool |
| GPUCoalescer::empty() const |
| { |
| return m_writeRequestTable.empty() && m_readRequestTable.empty(); |
| } |
| |
| // Analyzes the packet to see if this request can be coalesced. |
| // If request can be coalesced, this request is added to the reqCoalescer table |
| // and makeRequest returns RequestStatus_Issued; |
| // If this is the first request to a cacheline, request is added to both |
| // newRequests queue and to the reqCoalescer table; makeRequest |
| // returns RequestStatus_Issued. |
| // If there is a pending request to this cacheline and this request |
| // can't be coalesced, RequestStatus_Aliased is returned and |
| // the packet needs to be reissued. |
| RequestStatus |
| GPUCoalescer::makeRequest(PacketPtr pkt) |
| { |
| // Check for GPU Barrier Kernel End or Kernel Begin |
| // Leave these to be handled by the child class |
| // Kernel End/Barrier = isFlush + isRelease |
| // Kernel Begin = isFlush + isAcquire |
| if (pkt->req->isKernel()) { |
        if (pkt->req->isAcquire()) {
| // This is a Kernel Begin leave handling to |
| // virtual xCoalescer::makeRequest |
| return RequestStatus_Issued; |
        } else if (pkt->req->isRelease()) {
| // This is a Kernel End leave handling to |
| // virtual xCoalescer::makeRequest |
| // If we are here then we didn't call |
| // a virtual version of this function |
| // so we will also schedule the callback |
| int wf_id = 0; |
| if (pkt->req->hasContextId()) { |
| wf_id = pkt->req->contextId(); |
| } |
| insertKernel(wf_id, pkt); |
| newKernelEnds.push_back(wf_id); |
| if (!issueEvent.scheduled()) { |
| schedule(issueEvent, curTick()); |
| } |
| return RequestStatus_Issued; |
| } |
| } |
| |
| // If number of outstanding requests greater than the max allowed, |
| // return RequestStatus_BufferFull. This logic can be extended to |
| // support proper backpressure. |
| if (m_outstanding_count >= m_max_outstanding_requests) { |
| return RequestStatus_BufferFull; |
| } |
| |
| RubyRequestType primary_type = RubyRequestType_NULL; |
| RubyRequestType secondary_type = RubyRequestType_NULL; |
| |
| if (pkt->isLLSC()) { |
| // |
| // Alpha LL/SC instructions need to be handled carefully by the cache |
| // coherence protocol to ensure they follow the proper semantics. In |
| // particular, by identifying the operations as atomic, the protocol |
| // should understand that migratory sharing optimizations should not |
| // be performed (i.e. a load between the LL and SC should not steal |
| // away exclusive permission). |
| // |
| if (pkt->isWrite()) { |
| primary_type = RubyRequestType_Store_Conditional; |
| } else { |
| assert(pkt->isRead()); |
| primary_type = RubyRequestType_Load_Linked; |
| } |
| secondary_type = RubyRequestType_ATOMIC; |
| } else if (pkt->req->isLockedRMW()) { |
| // |
| // x86 locked instructions are translated to store cache coherence |
| // requests because these requests should always be treated as read |
| // exclusive operations and should leverage any migratory sharing |
| // optimization built into the protocol. |
| // |
| if (pkt->isWrite()) { |
| primary_type = RubyRequestType_Locked_RMW_Write; |
| } else { |
| assert(pkt->isRead()); |
| primary_type = RubyRequestType_Locked_RMW_Read; |
| } |
| secondary_type = RubyRequestType_ST; |
| } else if (pkt->isAtomicOp()) { |
| // |
| // GPU Atomic Operation |
| // |
| primary_type = RubyRequestType_ATOMIC; |
| secondary_type = RubyRequestType_ATOMIC; |
| } else { |
| if (pkt->isRead()) { |
| if (pkt->req->isInstFetch()) { |
| primary_type = secondary_type = RubyRequestType_IFETCH; |
| } else { |
| #if THE_ISA == X86_ISA |
| uint32_t flags = pkt->req->getFlags(); |
| bool storeCheck = flags & |
| (TheISA::StoreCheck << TheISA::FlagShift); |
| #else |
| bool storeCheck = false; |
| #endif // X86_ISA |
| if (storeCheck) { |
| primary_type = RubyRequestType_RMW_Read; |
| secondary_type = RubyRequestType_ST; |
| } else { |
| primary_type = secondary_type = RubyRequestType_LD; |
| } |
| } |
| } else if (pkt->isWrite()) { |
| // |
| // Note: M5 packets do not differentiate ST from RMW_Write |
| // |
| primary_type = secondary_type = RubyRequestType_ST; |
| } else if (pkt->isFlush()) { |
| primary_type = secondary_type = RubyRequestType_FLUSH; |
| } else if (pkt->req->isRelease() || pkt->req->isAcquire()) { |
| if (assumingRfOCoherence) { |
            // If we reach here, this request must be a memFence.
            // Since the protocol implements RfO, the coalescer can
            // assume sequential consistency and schedule the callback
            // immediately.
| // Currently the code implements fence callbacks |
| // by reusing the mechanism for kernel completions. |
| // This should be fixed. |
| int wf_id = 0; |
| if (pkt->req->hasContextId()) { |
| wf_id = pkt->req->contextId(); |
| } |
| insertKernel(wf_id, pkt); |
| newKernelEnds.push_back(wf_id); |
| if (!issueEvent.scheduled()) { |
| schedule(issueEvent, curTick()); |
| } |
| return RequestStatus_Issued; |
| } else { |
| // If not RfO, return issued here and let the child coalescer |
| // take care of it. |
| return RequestStatus_Issued; |
| } |
| } else { |
| panic("Unsupported ruby packet type\n"); |
| } |
| } |
| |
| // Check if there is any pending request to this cache line from |
| // previous cycles. |
| // If there is a pending request, return aliased. Since coalescing |
| // across time is not permitted, aliased requests are not coalesced. |
| // If a request for this address has already been issued, we must block |
| RequestStatus status = getRequestStatus(pkt, primary_type); |
| if (status != RequestStatus_Ready) |
| return status; |
| |
| Addr line_addr = makeLineAddress(pkt->getAddr()); |
| |
| // Check if this request can be coalesced with previous |
| // requests from this cycle. |
| if (!reqCoalescer.count(line_addr)) { |
| // This is the first access to this cache line. |
| // A new request to the memory subsystem has to be |
| // made in the next cycle for this cache line, so |
| // add this line addr to the "newRequests" queue |
| newRequests.push_back(line_addr); |
| |
| // There was a request to this cache line in this cycle, |
| // let us see if we can coalesce this request with the previous |
| // requests from this cycle |
| } else if (primary_type != |
| reqCoalescer[line_addr][0].primaryType) { |
| // can't coalesce loads, stores and atomics! |
| return RequestStatus_Aliased; |
| } else if (pkt->req->isLockedRMW() || |
| reqCoalescer[line_addr][0].pkt->req->isLockedRMW()) { |
| // can't coalesce locked accesses, but can coalesce atomics! |
| return RequestStatus_Aliased; |
| } else if (pkt->req->hasContextId() && pkt->req->isRelease() && |
| pkt->req->contextId() != |
| reqCoalescer[line_addr][0].pkt->req->contextId()) { |
| // can't coalesce releases from different wavefronts |
| return RequestStatus_Aliased; |
| } |
| |
| // in addition to the packet, we need to save both request types |
| reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type); |
| if (!issueEvent.scheduled()) |
| schedule(issueEvent, curTick()); |
| // TODO: issue hardware prefetches here |
| return RequestStatus_Issued; |
| } |
| |
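// Issue a single RubyRequest covering all packets coalesced on this
// cache line. A per-byte access mask (plus, for atomics, a list of
// atomic ops) records which bytes each coalesced packet touches; the
// request is enqueued on the mandatory queue with the data cache hit
// latency.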
| void |
| GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) |
| { |
| |
| int proc_id = -1; |
| if (pkt != NULL && pkt->req->hasContextId()) { |
| proc_id = pkt->req->contextId(); |
| } |
| |
| // If valid, copy the pc to the ruby request |
| Addr pc = 0; |
| if (pkt->req->hasPC()) { |
| pc = pkt->req->getPC(); |
| } |
| |
    // At the moment, setting scopes only matters for GPU spill space
    // accesses, i.e., pkt->req->isStack(). That scope is REPLACE since
    // spill data does not need to be flushed at the end of a kernel,
    // whereas private and local data may need to be visible at the end
    // of the kernel.
| HSASegment accessSegment = reqSegmentToHSASegment(pkt->req); |
| HSAScope accessScope = reqScopeToHSAScope(pkt->req); |
| |
| Addr line_addr = makeLineAddress(pkt->getAddr()); |
| |
| // Creating WriteMask that records written bytes |
| // and atomic operations. This enables partial writes |
| // and partial reads of those writes |
| DataBlock dataBlock; |
| dataBlock.clear(); |
| uint32_t blockSize = RubySystem::getBlockSizeBytes(); |
| std::vector<bool> accessMask(blockSize,false); |
| std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps; |
| uint32_t tableSize = reqCoalescer[line_addr].size(); |
| for (int i = 0; i < tableSize; i++) { |
| PacketPtr tmpPkt = reqCoalescer[line_addr][i].pkt; |
| uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr; |
| uint32_t tmpSize = tmpPkt->getSize(); |
| if (tmpPkt->isAtomicOp()) { |
| std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset, |
| tmpPkt->getAtomicOp()); |
| atomicOps.push_back(tmpAtomicOp); |
| } else if (tmpPkt->isWrite()) { |
| dataBlock.setData(tmpPkt->getPtr<uint8_t>(), |
| tmpOffset, tmpSize); |
| } |
| for (int j = 0; j < tmpSize; j++) { |
| accessMask[tmpOffset + j] = true; |
| } |
| } |
| std::shared_ptr<RubyRequest> msg; |
| if (pkt->isAtomicOp()) { |
| msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(), |
| pkt->getPtr<uint8_t>(), |
| pkt->getSize(), pc, secondary_type, |
| RubyAccessMode_Supervisor, pkt, |
| PrefetchBit_No, proc_id, 100, |
| blockSize, accessMask, |
| dataBlock, atomicOps, |
| accessScope, accessSegment); |
| } else { |
| msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(), |
| pkt->getPtr<uint8_t>(), |
| pkt->getSize(), pc, secondary_type, |
| RubyAccessMode_Supervisor, pkt, |
| PrefetchBit_No, proc_id, 100, |
| blockSize, accessMask, |
| dataBlock, |
| accessScope, accessSegment); |
| } |
| DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n", |
| curTick(), m_version, "Coal", "Begin", "", "", |
| printAddress(msg->getPhysicalAddress()), |
| RubyRequestType_to_string(secondary_type)); |
| |
| fatal_if(secondary_type == RubyRequestType_IFETCH, |
| "there should not be any I-Fetch requests in the GPU Coalescer"); |
| |
| // Send the message to the cache controller |
| fatal_if(m_data_cache_hit_latency == 0, |
| "should not have a latency of zero"); |
| |
| assert(m_mandatory_q_ptr); |
| m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); |
| } |
| |
| template <class KEY, class VALUE> |
| std::ostream & |
| operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map) |
| { |
| out << "["; |
| for (auto i = map.begin(); i != map.end(); ++i) |
| out << " " << i->first << "=" << i->second; |
| out << " ]"; |
| |
| return out; |
| } |
| |
| void |
| GPUCoalescer::print(ostream& out) const |
| { |
| out << "[GPUCoalescer: " << m_version |
| << ", outstanding requests: " << m_outstanding_count |
| << ", read request table: " << m_readRequestTable |
| << ", write request table: " << m_writeRequestTable |
| << "]"; |
| } |
| |
// This can be called from setState whenever coherence permissions are
// upgraded; when invoked, coherence violations will be checked for the
// given block.
| void |
| GPUCoalescer::checkCoherence(Addr addr) |
| { |
| #ifdef CHECK_COHERENCE |
| m_ruby_system->checkGlobalCoherenceInvariant(addr); |
| #endif |
| } |
| |
| void |
GPUCoalescer::recordRequestType(SequencerRequestType requestType)
{
| DPRINTF(RubyStats, "Recorded statistic: %s\n", |
| SequencerRequestType_to_string(requestType)); |
| } |
| |
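// Triggered by issueEvent: issue one Ruby request per cache line touched
// this cycle (the remaining packets on each line were coalesced into
// it), then fire any kernel-end callbacks collected this cycle.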
| void |
| GPUCoalescer::completeIssue() |
| { |
| // newRequests has the cacheline addresses of all the |
| // requests which need to be issued to the memory subsystem |
| // in this cycle |
| int len = newRequests.size(); |
| DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len); |
| for (int i = 0; i < len; ++i) { |
| // Get the requests from reqCoalescer table. Get only the |
| // first request for each cacheline, the remaining requests |
| // can be coalesced with the first request. So, only |
| // one request is issued per cacheline. |
| RequestDesc info = reqCoalescer[newRequests[i]][0]; |
| PacketPtr pkt = info.pkt; |
| DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n", |
| i, pkt->req->getPaddr()); |
| // Insert this request to the read/writeRequestTables. These tables |
| // are used to track aliased requests in makeRequest subroutine |
| bool found = insertRequest(pkt, info.primaryType); |
| |
| if (found) { |
| panic("GPUCoalescer::makeRequest should never be called if the " |
| "request is already outstanding\n"); |
| } |
| |
| // Issue request to ruby subsystem |
| issueRequest(pkt, info.secondaryType); |
| } |
| newRequests.clear(); |
| |
| // have Kernel End releases been issued this cycle |
| len = newKernelEnds.size(); |
| for (int i = 0; i < len; i++) { |
| kernelCallback(newKernelEnds[i]); |
| } |
| newKernelEnds.clear(); |
| } |
| |
| void |
| GPUCoalescer::evictionCallback(Addr address) |
| { |
| ruby_eviction_callback(address); |
| } |
| |
| void |
| GPUCoalescer::kernelCallback(int wavefront_id) |
| { |
| assert(kernelEndList.count(wavefront_id)); |
| |
| ruby_hit_callback(kernelEndList[wavefront_id]); |
| |
| kernelEndList.erase(wavefront_id); |
| } |
| |
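// Completion path for atomic operations. Mirrors hitCallback, except
// that data is copied back only for atomics that return a value, and no
// cache replacement state is updated since atomics are performed in
// memory.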
| void |
| GPUCoalescer::atomicCallback(Addr address, |
| MachineType mach, |
| const DataBlock& data) |
| { |
| assert(address == makeLineAddress(address)); |
| |
| DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address); |
| assert(m_writeRequestTable.count(makeLineAddress(address))); |
| |
| RequestTable::iterator i = m_writeRequestTable.find(address); |
| assert(i != m_writeRequestTable.end()); |
| GPUCoalescerRequest* srequest = i->second; |
| |
| m_writeRequestTable.erase(i); |
| markRemoved(); |
| |
| assert((srequest->m_type == RubyRequestType_ATOMIC) || |
| (srequest->m_type == RubyRequestType_ATOMIC_RETURN) || |
| (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN)); |
| |
| // Atomics don't write to cache, so there is no MRU update... |
| |
| recordMissLatency(srequest, mach, |
| srequest->issue_time, Cycles(0), Cycles(0), true, false); |
| |
| PacketPtr pkt = srequest->pkt; |
| Addr request_address = pkt->getAddr(); |
| Addr request_line_address = makeLineAddress(pkt->getAddr()); |
| |
| int len = reqCoalescer[request_line_address].size(); |
| std::vector<PacketPtr> mylist; |
| for (int i = 0; i < len; ++i) { |
| PacketPtr pkt = reqCoalescer[request_line_address][i].pkt; |
| assert(srequest->m_type == |
| reqCoalescer[request_line_address][i].primaryType); |
| request_address = (pkt->getAddr()); |
| request_line_address = makeLineAddress(request_address); |
| if (pkt->getPtr<uint8_t>() && |
| srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) { |
            // atomics are done in memory and return the data *before*
            // the atomic op
| memcpy(pkt->getPtr<uint8_t>(), |
| data.getData(getOffset(request_address), |
| pkt->getSize()), |
| pkt->getSize()); |
| } else { |
| DPRINTF(MemoryAccess, |
| "WARNING. Data not transfered from Ruby to M5 for type " \ |
| "%s\n", |
| RubyRequestType_to_string(srequest->m_type)); |
| } |
| |
| // If using the RubyTester, update the RubyTester sender state's |
        // subBlock with the received data. The tester will later access
        // this state.
        // Note: RubyPort will access its sender state before the
        // RubyTester.
| if (m_usingRubyTester) { |
| RubyPort::SenderState *requestSenderState = |
| safe_cast<RubyPort::SenderState*>(pkt->senderState); |
| RubyTester::SenderState* testerSenderState = |
| safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor); |
| testerSenderState->subBlock.mergeFrom(data); |
| } |
| |
| mylist.push_back(pkt); |
| } |
| delete srequest; |
| reqCoalescer.erase(request_line_address); |
| assert(!reqCoalescer.count(request_line_address)); |
| |
| completeHitCallback(mylist, len); |
| } |
| |
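// The two recordCP*CallBack methods below classify command-processor
// accesses by the machine that supplied the data: this TCP, a peer TCP,
// the TCC, or a miss.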
| void |
| GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID) |
| { |
| if (myMachID == senderMachID) { |
| CP_TCPLdHits++; |
| } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) { |
| CP_TCPLdTransfers++; |
| } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) { |
| CP_TCCLdHits++; |
| } else { |
| CP_LdMiss++; |
| } |
| } |
| |
| void |
| GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID) |
| { |
| if (myMachID == senderMachID) { |
| CP_TCPStHits++; |
| } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) { |
| CP_TCPStTransfers++; |
| } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) { |
| CP_TCCStHits++; |
| } else { |
| CP_StMiss++; |
| } |
| } |
| |
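// Pop this coalescer's sender state off each serviced packet and
// complete the packet on the port it originally arrived from.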
| void |
| GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len) |
| { |
| for (int i = 0; i < len; ++i) { |
| RubyPort::SenderState *ss = |
| safe_cast<RubyPort::SenderState *>(mylist[i]->senderState); |
| MemSlavePort *port = ss->port; |
| assert(port != NULL); |
| |
| mylist[i]->senderState = ss->predecessor; |
| delete ss; |
| port->hitCallback(mylist[i]); |
| trySendRetries(); |
| } |
| |
| testDrainComplete(); |
| } |
| |
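// Return the packet associated with the outstanding read request for
// the given line address; the request must exist.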
| PacketPtr |
| GPUCoalescer::mapAddrToPkt(Addr address) |
| { |
| RequestTable::iterator i = m_readRequestTable.find(address); |
| assert(i != m_readRequestTable.end()); |
| GPUCoalescerRequest* request = i->second; |
| return request->pkt; |
| } |
| |
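// Record per-type and per-machine latency statistics for a completed
// request, and attribute hits, transfers, and misses to the TCP or TCC
// (the cache classification is valid for the RfO protocol only).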
| void |
| GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest, |
| MachineType mach, |
| Cycles initialRequestTime, |
| Cycles forwardRequestTime, |
| Cycles firstResponseTime, |
| bool success, bool isRegion) |
| { |
| RubyRequestType type = srequest->m_type; |
| Cycles issued_time = srequest->issue_time; |
| Cycles completion_time = curCycle(); |
| assert(completion_time >= issued_time); |
| Cycles total_lat = completion_time - issued_time; |
| |
| // cache stats (valid for RfO protocol only) |
| if (mach == MachineType_TCP) { |
| if (type == RubyRequestType_LD) { |
| GPU_TCPLdHits++; |
| } else { |
| GPU_TCPStHits++; |
| } |
| } else if (mach == MachineType_L1Cache_wCC) { |
| if (type == RubyRequestType_LD) { |
| GPU_TCPLdTransfers++; |
| } else { |
| GPU_TCPStTransfers++; |
| } |
| } else if (mach == MachineType_TCC) { |
| if (type == RubyRequestType_LD) { |
| GPU_TCCLdHits++; |
| } else { |
| GPU_TCCStHits++; |
| } |
| } else { |
| if (type == RubyRequestType_LD) { |
| GPU_LdMiss++; |
| } else { |
| GPU_StMiss++; |
| } |
| } |
| |
| // Profile all access latency, even zero latency accesses |
| m_latencyHist.sample(total_lat); |
| m_typeLatencyHist[type]->sample(total_lat); |
| |
| // Profile the miss latency for all non-zero demand misses |
| if (total_lat != Cycles(0)) { |
| m_missLatencyHist.sample(total_lat); |
| m_missTypeLatencyHist[type]->sample(total_lat); |
| |
| if (mach != MachineType_NUM) { |
| m_missMachLatencyHist[mach]->sample(total_lat); |
| m_missTypeMachLatencyHist[type][mach]->sample(total_lat); |
| |
| if ((issued_time <= initialRequestTime) && |
| (initialRequestTime <= forwardRequestTime) && |
| (forwardRequestTime <= firstResponseTime) && |
| (firstResponseTime <= completion_time)) { |
| |
| m_IssueToInitialDelayHist[mach]->sample( |
| initialRequestTime - issued_time); |
| m_InitialToForwardDelayHist[mach]->sample( |
| forwardRequestTime - initialRequestTime); |
| m_ForwardToFirstResponseDelayHist[mach]->sample( |
| firstResponseTime - forwardRequestTime); |
| m_FirstResponseToCompletionDelayHist[mach]->sample( |
| completion_time - firstResponseTime); |
| } |
| } |
| } |
| |
| DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n", |
| curTick(), m_version, "Coal", |
| success ? "Done" : "SC_Failed", "", "", |
| printAddress(srequest->pkt->getAddr()), total_lat); |
| } |
| |
| void |
| GPUCoalescer::regStats() |
| { |
| RubyPort::regStats(); |
| |
| // These statistical variables are not for display. |
| // The profiler will collate these across different |
| // coalescers and display those collated statistics. |
| m_outstandReqHist.init(10); |
| m_latencyHist.init(10); |
| m_missLatencyHist.init(10); |
| |
| for (int i = 0; i < RubyRequestType_NUM; i++) { |
| m_typeLatencyHist.push_back(new Stats::Histogram()); |
| m_typeLatencyHist[i]->init(10); |
| |
| m_missTypeLatencyHist.push_back(new Stats::Histogram()); |
| m_missTypeLatencyHist[i]->init(10); |
| } |
| |
| for (int i = 0; i < MachineType_NUM; i++) { |
| m_missMachLatencyHist.push_back(new Stats::Histogram()); |
| m_missMachLatencyHist[i]->init(10); |
| |
| m_IssueToInitialDelayHist.push_back(new Stats::Histogram()); |
| m_IssueToInitialDelayHist[i]->init(10); |
| |
| m_InitialToForwardDelayHist.push_back(new Stats::Histogram()); |
| m_InitialToForwardDelayHist[i]->init(10); |
| |
| m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram()); |
| m_ForwardToFirstResponseDelayHist[i]->init(10); |
| |
| m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram()); |
| m_FirstResponseToCompletionDelayHist[i]->init(10); |
| } |
| |
| for (int i = 0; i < RubyRequestType_NUM; i++) { |
| m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>()); |
| |
| for (int j = 0; j < MachineType_NUM; j++) { |
| m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram()); |
| m_missTypeMachLatencyHist[i][j]->init(10); |
| } |
| } |
| |
| // GPU cache stats |
| GPU_TCPLdHits |
| .name(name() + ".gpu_tcp_ld_hits") |
| .desc("loads that hit in the TCP") |
| ; |
| GPU_TCPLdTransfers |
| .name(name() + ".gpu_tcp_ld_transfers") |
| .desc("TCP to TCP load transfers") |
| ; |
| GPU_TCCLdHits |
| .name(name() + ".gpu_tcc_ld_hits") |
| .desc("loads that hit in the TCC") |
| ; |
| GPU_LdMiss |
| .name(name() + ".gpu_ld_misses") |
| .desc("loads that miss in the GPU") |
| ; |
| |
| GPU_TCPStHits |
| .name(name() + ".gpu_tcp_st_hits") |
| .desc("stores that hit in the TCP") |
| ; |
| GPU_TCPStTransfers |
| .name(name() + ".gpu_tcp_st_transfers") |
| .desc("TCP to TCP store transfers") |
| ; |
| GPU_TCCStHits |
| .name(name() + ".gpu_tcc_st_hits") |
| .desc("stores that hit in the TCC") |
| ; |
| GPU_StMiss |
| .name(name() + ".gpu_st_misses") |
| .desc("stores that miss in the GPU") |
| ; |
| |
| // CP cache stats |
| CP_TCPLdHits |
| .name(name() + ".cp_tcp_ld_hits") |
| .desc("loads that hit in the TCP") |
| ; |
| CP_TCPLdTransfers |
| .name(name() + ".cp_tcp_ld_transfers") |
| .desc("TCP to TCP load transfers") |
| ; |
| CP_TCCLdHits |
| .name(name() + ".cp_tcc_ld_hits") |
| .desc("loads that hit in the TCC") |
| ; |
| CP_LdMiss |
| .name(name() + ".cp_ld_misses") |
| .desc("loads that miss in the GPU") |
| ; |
| |
| CP_TCPStHits |
| .name(name() + ".cp_tcp_st_hits") |
| .desc("stores that hit in the TCP") |
| ; |
| CP_TCPStTransfers |
| .name(name() + ".cp_tcp_st_transfers") |
| .desc("TCP to TCP store transfers") |
| ; |
| CP_TCCStHits |
| .name(name() + ".cp_tcc_st_hits") |
| .desc("stores that hit in the TCC") |
| ; |
| CP_StMiss |
| .name(name() + ".cp_st_misses") |
| .desc("stores that miss in the GPU") |
| ; |
| } |