/*
* Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Sooraj Puthoor
*/
#include "base/logging.hh"
#include "base/str.hh"
#include "config/the_isa.hh"
#if THE_ISA == X86_ISA
#include "arch/x86/insts/microldstop.hh"
#endif // X86_ISA
#include "mem/ruby/system/GPUCoalescer.hh"
#include "cpu/testers/rubytest/RubyTester.hh"
#include "debug/GPUCoalescer.hh"
#include "debug/MemoryAccess.hh"
#include "debug/ProtocolTrace.hh"
#include "debug/RubyPort.hh"
#include "debug/RubyStats.hh"
#include "gpu-compute/shader.hh"
#include "mem/packet.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/SubBlock.hh"
#include "mem/ruby/network/MessageBuffer.hh"
#include "mem/ruby/profiler/Profiler.hh"
#include "mem/ruby/slicc_interface/AbstractController.hh"
#include "mem/ruby/slicc_interface/RubyRequest.hh"
#include "mem/ruby/structures/CacheMemory.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "params/RubyGPUCoalescer.hh"
using namespace std;
GPUCoalescer *
RubyGPUCoalescerParams::create()
{
return new GPUCoalescer(this);
}
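// Map the memory scope flags carried by a Request onto the HSAScope enum
// used by the Ruby protocol. Unscoped requests default to
// HSAScope_UNSPECIFIED; a scoped request with no recognized scope is a
// fatal error.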
HSAScope
reqScopeToHSAScope(const RequestPtr &req)
{
HSAScope accessScope = HSAScope_UNSPECIFIED;
if (req->isScoped()) {
if (req->isWavefrontScope()) {
accessScope = HSAScope_WAVEFRONT;
} else if (req->isWorkgroupScope()) {
accessScope = HSAScope_WORKGROUP;
} else if (req->isDeviceScope()) {
accessScope = HSAScope_DEVICE;
} else if (req->isSystemScope()) {
accessScope = HSAScope_SYSTEM;
} else {
fatal("Bad scope type");
}
}
return accessScope;
}
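// Map the HSA memory segment flags carried by a Request onto the
// HSASegment enum. A request with no recognized segment flag is a fatal
// error.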
HSASegment
reqSegmentToHSASegment(const RequestPtr &req)
{
HSASegment accessSegment = HSASegment_GLOBAL;
if (req->isGlobalSegment()) {
accessSegment = HSASegment_GLOBAL;
} else if (req->isGroupSegment()) {
accessSegment = HSASegment_GROUP;
} else if (req->isPrivateSegment()) {
accessSegment = HSASegment_PRIVATE;
} else if (req->isKernargSegment()) {
accessSegment = HSASegment_KERNARG;
} else if (req->isReadonlySegment()) {
accessSegment = HSASegment_READONLY;
} else if (req->isSpillSegment()) {
accessSegment = HSASegment_SPILL;
} else if (req->isArgSegment()) {
accessSegment = HSASegment_ARG;
} else {
fatal("Bad segment type");
}
return accessSegment;
}
GPUCoalescer::GPUCoalescer(const Params *p)
: RubyPort(p),
issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
false, Event::Progress_Event_Pri),
deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check")
{
m_store_waiting_on_load_cycles = 0;
m_store_waiting_on_store_cycles = 0;
m_load_waiting_on_store_cycles = 0;
m_load_waiting_on_load_cycles = 0;
m_outstanding_count = 0;
m_max_outstanding_requests = 0;
m_deadlock_threshold = 0;
m_instCache_ptr = nullptr;
m_dataCache_ptr = nullptr;
m_instCache_ptr = p->icache;
m_dataCache_ptr = p->dcache;
m_max_outstanding_requests = p->max_outstanding_requests;
m_deadlock_threshold = p->deadlock_threshold;
assert(m_max_outstanding_requests > 0);
assert(m_deadlock_threshold > 0);
assert(m_instCache_ptr);
assert(m_dataCache_ptr);
m_data_cache_hit_latency = p->dcache_hit_latency;
m_runningGarnetStandalone = p->garnet_standalone;
assumingRfOCoherence = p->assume_rfo;
}
GPUCoalescer::~GPUCoalescer()
{
}
void
GPUCoalescer::wakeup()
{
// Check for deadlock of any of the requests
Cycles current_time = curCycle();
// Check across all outstanding requests
int total_outstanding = 0;
RequestTable::iterator read = m_readRequestTable.begin();
RequestTable::iterator read_end = m_readRequestTable.end();
for (; read != read_end; ++read) {
GPUCoalescerRequest* request = read->second;
if (current_time - request->issue_time < m_deadlock_threshold)
continue;
panic("Possible Deadlock detected. Aborting!\n"
"version: %d request.paddr: 0x%x m_readRequestTable: %d "
"current time: %u issue_time: %d difference: %d\n", m_version,
request->pkt->getAddr(), m_readRequestTable.size(),
current_time * clockPeriod(), request->issue_time * clockPeriod(),
(current_time - request->issue_time)*clockPeriod());
}
RequestTable::iterator write = m_writeRequestTable.begin();
RequestTable::iterator write_end = m_writeRequestTable.end();
for (; write != write_end; ++write) {
GPUCoalescerRequest* request = write->second;
if (current_time - request->issue_time < m_deadlock_threshold)
continue;
panic("Possible Deadlock detected. Aborting!\n"
"version: %d request.paddr: 0x%x m_writeRequestTable: %d "
"current time: %u issue_time: %d difference: %d\n", m_version,
request->pkt->getAddr(), m_writeRequestTable.size(),
current_time * clockPeriod(), request->issue_time * clockPeriod(),
(current_time - request->issue_time) * clockPeriod());
}
total_outstanding += m_writeRequestTable.size();
total_outstanding += m_readRequestTable.size();
assert(m_outstanding_count == total_outstanding);
if (m_outstanding_count > 0) {
// If there are still outstanding requests, keep checking
schedule(deadlockCheckEvent,
m_deadlock_threshold * clockPeriod() +
curTick());
}
}
void
GPUCoalescer::resetStats()
{
m_latencyHist.reset();
m_missLatencyHist.reset();
for (int i = 0; i < RubyRequestType_NUM; i++) {
m_typeLatencyHist[i]->reset();
m_missTypeLatencyHist[i]->reset();
for (int j = 0; j < MachineType_NUM; j++) {
m_missTypeMachLatencyHist[i][j]->reset();
}
}
for (int i = 0; i < MachineType_NUM; i++) {
m_missMachLatencyHist[i]->reset();
m_IssueToInitialDelayHist[i]->reset();
m_InitialToForwardDelayHist[i]->reset();
m_ForwardToFirstResponseDelayHist[i]->reset();
m_FirstResponseToCompletionDelayHist[i]->reset();
}
}
void
GPUCoalescer::printProgress(ostream& out) const
{
}
RequestStatus
GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
{
Addr line_addr = makeLineAddress(pkt->getAddr());
if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
return RequestStatus_BufferFull;
}
if (m_controller->isBlocked(line_addr) &&
request_type != RubyRequestType_Locked_RMW_Write) {
return RequestStatus_Aliased;
}
if ((request_type == RubyRequestType_ST) ||
(request_type == RubyRequestType_ATOMIC) ||
(request_type == RubyRequestType_ATOMIC_RETURN) ||
(request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
(request_type == RubyRequestType_RMW_Read) ||
(request_type == RubyRequestType_RMW_Write) ||
(request_type == RubyRequestType_Load_Linked) ||
(request_type == RubyRequestType_Store_Conditional) ||
(request_type == RubyRequestType_Locked_RMW_Read) ||
(request_type == RubyRequestType_Locked_RMW_Write) ||
(request_type == RubyRequestType_FLUSH)) {
// Check if there is any outstanding read request for the same
// cache line.
if (m_readRequestTable.count(line_addr) > 0) {
m_store_waiting_on_load_cycles++;
return RequestStatus_Aliased;
}
if (m_writeRequestTable.count(line_addr) > 0) {
// There is an outstanding write request for the cache line
m_store_waiting_on_store_cycles++;
return RequestStatus_Aliased;
}
} else {
// Check if there is any outstanding write request for the same
// cache line.
if (m_writeRequestTable.count(line_addr) > 0) {
m_load_waiting_on_store_cycles++;
return RequestStatus_Aliased;
}
if (m_readRequestTable.count(line_addr) > 0) {
// There is an outstanding read request for the cache line
m_load_waiting_on_load_cycles++;
return RequestStatus_Aliased;
}
}
return RequestStatus_Ready;
}
// sets the kernelEndList
void
GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
{
// It is unclear whether a duplicate wavefront id can occur here, but be
// careful so that a duplicate entry does not become a simulator hang in
// the future.
DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
assert(kernelEndList.count(wavefront_id) == 0);
kernelEndList[wavefront_id] = pkt;
DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
kernelEndList.size());
}
// Insert the request into the correct request table. Return true if
// an entry for the line was already present.
bool
GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
{
assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
pkt->req->isLockedRMW() ||
!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));
int total_outstanding M5_VAR_USED =
m_writeRequestTable.size() + m_readRequestTable.size();
assert(m_outstanding_count == total_outstanding);
// See if we should schedule a deadlock check
if (!deadlockCheckEvent.scheduled()) {
schedule(deadlockCheckEvent, m_deadlock_threshold + curTick());
}
Addr line_addr = makeLineAddress(pkt->getAddr());
if ((request_type == RubyRequestType_ST) ||
(request_type == RubyRequestType_ATOMIC) ||
(request_type == RubyRequestType_ATOMIC_RETURN) ||
(request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
(request_type == RubyRequestType_RMW_Read) ||
(request_type == RubyRequestType_RMW_Write) ||
(request_type == RubyRequestType_Load_Linked) ||
(request_type == RubyRequestType_Store_Conditional) ||
(request_type == RubyRequestType_Locked_RMW_Read) ||
(request_type == RubyRequestType_Locked_RMW_Write) ||
(request_type == RubyRequestType_FLUSH)) {
pair<RequestTable::iterator, bool> r =
m_writeRequestTable.insert(RequestTable::value_type(line_addr,
(GPUCoalescerRequest*) NULL));
if (r.second) {
RequestTable::iterator i = r.first;
i->second = new GPUCoalescerRequest(pkt, request_type,
curCycle());
DPRINTF(GPUCoalescer,
"Inserting write request for paddr %#x for type %d\n",
pkt->req->getPaddr(), i->second->m_type);
m_outstanding_count++;
} else {
return true;
}
} else {
pair<RequestTable::iterator, bool> r =
m_readRequestTable.insert(RequestTable::value_type(line_addr,
(GPUCoalescerRequest*) NULL));
if (r.second) {
RequestTable::iterator i = r.first;
i->second = new GPUCoalescerRequest(pkt, request_type,
curCycle());
DPRINTF(GPUCoalescer,
"Inserting read request for paddr %#x for type %d\n",
pkt->req->getPaddr(), i->second->m_type);
m_outstanding_count++;
} else {
return true;
}
}
m_outstandReqHist.sample(m_outstanding_count);
total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
assert(m_outstanding_count == total_outstanding);
return false;
}
void
GPUCoalescer::markRemoved()
{
m_outstanding_count--;
assert(m_outstanding_count ==
m_writeRequestTable.size() + m_readRequestTable.size());
}
void
GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
{
assert(m_outstanding_count ==
m_writeRequestTable.size() + m_readRequestTable.size());
Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
if ((srequest->m_type == RubyRequestType_ST) ||
(srequest->m_type == RubyRequestType_RMW_Read) ||
(srequest->m_type == RubyRequestType_RMW_Write) ||
(srequest->m_type == RubyRequestType_Load_Linked) ||
(srequest->m_type == RubyRequestType_Store_Conditional) ||
(srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
(srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
m_writeRequestTable.erase(line_addr);
} else {
m_readRequestTable.erase(line_addr);
}
markRemoved();
}
bool
GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
{
//
// The success flag indicates whether the LLSC operation was successful.
// LL ops will always succeed, but SC may fail if the cache line is no
// longer locked.
//
bool success = true;
if (request->m_type == RubyRequestType_Store_Conditional) {
if (!m_dataCache_ptr->isLocked(address, m_version)) {
//
// For failed SC requests, indicate the failure to the cpu by
// setting the extra data to zero.
//
request->pkt->req->setExtraData(0);
success = false;
} else {
//
// For successful SC requests, indicate the success to the cpu by
// setting the extra data to one.
//
request->pkt->req->setExtraData(1);
}
//
// Independent of success, all SC operations must clear the lock
//
m_dataCache_ptr->clearLocked(address);
} else if (request->m_type == RubyRequestType_Load_Linked) {
//
// Note: To fully follow Alpha LLSC semantics, should the LL clear any
// previously locked cache lines?
//
m_dataCache_ptr->setLocked(address, m_version);
} else if ((m_dataCache_ptr->isTagPresent(address)) &&
(m_dataCache_ptr->isLocked(address, m_version))) {
//
// Normal writes should clear the locked address
//
m_dataCache_ptr->clearLocked(address);
}
return success;
}
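// The writeCallback overloads below successively fill in default values
// (MachineType_NULL, zero timing, isRegion == false) before reaching the
// full version, which removes the request from the write table and
// invokes hitCallback.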
void
GPUCoalescer::writeCallback(Addr address, DataBlock& data)
{
writeCallback(address, MachineType_NULL, data);
}
void
GPUCoalescer::writeCallback(Addr address,
MachineType mach,
DataBlock& data)
{
writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}
void
GPUCoalescer::writeCallback(Addr address,
MachineType mach,
DataBlock& data,
Cycles initialRequestTime,
Cycles forwardRequestTime,
Cycles firstResponseTime)
{
writeCallback(address, mach, data,
initialRequestTime, forwardRequestTime, firstResponseTime,
false);
}
void
GPUCoalescer::writeCallback(Addr address,
MachineType mach,
DataBlock& data,
Cycles initialRequestTime,
Cycles forwardRequestTime,
Cycles firstResponseTime,
bool isRegion)
{
assert(address == makeLineAddress(address));
DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
assert(m_writeRequestTable.count(makeLineAddress(address)));
RequestTable::iterator i = m_writeRequestTable.find(address);
assert(i != m_writeRequestTable.end());
GPUCoalescerRequest* request = i->second;
m_writeRequestTable.erase(i);
markRemoved();
assert((request->m_type == RubyRequestType_ST) ||
(request->m_type == RubyRequestType_ATOMIC) ||
(request->m_type == RubyRequestType_ATOMIC_RETURN) ||
(request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
(request->m_type == RubyRequestType_RMW_Read) ||
(request->m_type == RubyRequestType_RMW_Write) ||
(request->m_type == RubyRequestType_Load_Linked) ||
(request->m_type == RubyRequestType_Store_Conditional) ||
(request->m_type == RubyRequestType_Locked_RMW_Read) ||
(request->m_type == RubyRequestType_Locked_RMW_Write) ||
(request->m_type == RubyRequestType_FLUSH));
//
// For Alpha, properly handle LL, SC, and write requests with respect to
// locked cache blocks.
//
// Not valid for Garnet_standalone protocol
//
bool success = true;
if (!m_runningGarnetStandalone)
success = handleLlsc(address, request);
if (request->m_type == RubyRequestType_Locked_RMW_Read) {
m_controller->blockOnQueue(address, m_mandatory_q_ptr);
} else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
m_controller->unblock(address);
}
hitCallback(request, mach, data, success,
request->issue_time, forwardRequestTime, firstResponseTime,
isRegion);
}
void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
readCallback(address, MachineType_NULL, data);
}
void
GPUCoalescer::readCallback(Addr address,
MachineType mach,
DataBlock& data)
{
readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}
void
GPUCoalescer::readCallback(Addr address,
MachineType mach,
DataBlock& data,
Cycles initialRequestTime,
Cycles forwardRequestTime,
Cycles firstResponseTime)
{
readCallback(address, mach, data,
initialRequestTime, forwardRequestTime, firstResponseTime,
false);
}
void
GPUCoalescer::readCallback(Addr address,
MachineType mach,
DataBlock& data,
Cycles initialRequestTime,
Cycles forwardRequestTime,
Cycles firstResponseTime,
bool isRegion)
{
assert(address == makeLineAddress(address));
assert(m_readRequestTable.count(makeLineAddress(address)));
DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
RequestTable::iterator i = m_readRequestTable.find(address);
assert(i != m_readRequestTable.end());
GPUCoalescerRequest* request = i->second;
m_readRequestTable.erase(i);
markRemoved();
assert((request->m_type == RubyRequestType_LD) ||
(request->m_type == RubyRequestType_IFETCH));
hitCallback(request, mach, data, true,
request->issue_time, forwardRequestTime, firstResponseTime,
isRegion);
}
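// Common completion path for reads and writes: update cache MRU state,
// record latency statistics, copy data between Ruby and every coalesced
// M5 packet for the line, and finally hand the packets back to their
// ports via completeHitCallback.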
void
GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
MachineType mach,
DataBlock& data,
bool success,
Cycles initialRequestTime,
Cycles forwardRequestTime,
Cycles firstResponseTime,
bool isRegion)
{
PacketPtr pkt = srequest->pkt;
Addr request_address = pkt->getAddr();
Addr request_line_address = makeLineAddress(request_address);
RubyRequestType type = srequest->m_type;
// Set this cache entry to the most recently used
if (type == RubyRequestType_IFETCH) {
if (m_instCache_ptr->isTagPresent(request_line_address))
m_instCache_ptr->setMRU(request_line_address);
} else {
if (m_dataCache_ptr->isTagPresent(request_line_address))
m_dataCache_ptr->setMRU(request_line_address);
}
recordMissLatency(srequest, mach,
initialRequestTime,
forwardRequestTime,
firstResponseTime,
success, isRegion);
// Update the data; this must be done for every request coalesced into
// this cache line.
int len = reqCoalescer[request_line_address].size();
std::vector<PacketPtr> mylist;
for (int i = 0; i < len; ++i) {
PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
assert(type == reqCoalescer[request_line_address][i].primaryType);
request_address = pkt->getAddr();
request_line_address = makeLineAddress(pkt->getAddr());
if (pkt->getPtr<uint8_t>()) {
if ((type == RubyRequestType_LD) ||
(type == RubyRequestType_ATOMIC) ||
(type == RubyRequestType_ATOMIC_RETURN) ||
(type == RubyRequestType_IFETCH) ||
(type == RubyRequestType_RMW_Read) ||
(type == RubyRequestType_Locked_RMW_Read) ||
(type == RubyRequestType_Load_Linked)) {
memcpy(pkt->getPtr<uint8_t>(),
data.getData(getOffset(request_address),
pkt->getSize()),
pkt->getSize());
} else {
data.setData(pkt->getPtr<uint8_t>(),
getOffset(request_address), pkt->getSize());
}
} else {
DPRINTF(MemoryAccess,
"WARNING. Data not transfered from Ruby to M5 for type " \
"%s\n",
RubyRequestType_to_string(type));
}
// If using the RubyTester, update the RubyTester sender state's
// subBlock with the received data. The tester will later access
// this state.
// Note: RubyPort will access its sender state before the
// RubyTester.
if (m_usingRubyTester) {
RubyPort::SenderState *requestSenderState =
safe_cast<RubyPort::SenderState*>(pkt->senderState);
RubyTester::SenderState* testerSenderState =
safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
testerSenderState->subBlock.mergeFrom(data);
}
mylist.push_back(pkt);
}
delete srequest;
reqCoalescer.erase(request_line_address);
assert(!reqCoalescer.count(request_line_address));
completeHitCallback(mylist, len);
}
bool
GPUCoalescer::empty() const
{
return m_writeRequestTable.empty() && m_readRequestTable.empty();
}
// Analyzes the packet to see if this request can be coalesced.
// If the request can be coalesced, it is added to the reqCoalescer table
// and makeRequest returns RequestStatus_Issued.
// If this is the first request to a cache line, the request is added to
// both the newRequests queue and the reqCoalescer table, and makeRequest
// returns RequestStatus_Issued.
// If there is a pending request to this cache line and this request
// cannot be coalesced, RequestStatus_Aliased is returned and
// the packet needs to be reissued.
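// For example, two loads to different words of the same cache line issued
// in the same cycle coalesce into a single Ruby request, while a load and
// a store to that line in the same cycle return RequestStatus_Aliased.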
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
// Check for GPU Barrier Kernel End or Kernel Begin
// Leave these to be handled by the child class
// Kernel End/Barrier = isFlush + isRelease
// Kernel Begin = isFlush + isAcquire
if (pkt->req->isKernel()) {
if (pkt->req->isAcquire()) {
// This is a Kernel Begin; leave handling to the
// virtual xCoalescer::makeRequest.
return RequestStatus_Issued;
} else if (pkt->req->isRelease()) {
// This is a Kernel End; leave handling to the
// virtual xCoalescer::makeRequest.
// If we are here then we didn't call
// a virtual version of this function,
// so we will also schedule the callback.
int wf_id = 0;
if (pkt->req->hasContextId()) {
wf_id = pkt->req->contextId();
}
insertKernel(wf_id, pkt);
newKernelEnds.push_back(wf_id);
if (!issueEvent.scheduled()) {
schedule(issueEvent, curTick());
}
return RequestStatus_Issued;
}
}
// If the number of outstanding requests is greater than the max
// allowed, return RequestStatus_BufferFull. This logic can be extended
// to support proper backpressure.
if (m_outstanding_count >= m_max_outstanding_requests) {
return RequestStatus_BufferFull;
}
RubyRequestType primary_type = RubyRequestType_NULL;
RubyRequestType secondary_type = RubyRequestType_NULL;
if (pkt->isLLSC()) {
//
// Alpha LL/SC instructions need to be handled carefully by the cache
// coherence protocol to ensure they follow the proper semantics. In
// particular, by identifying the operations as atomic, the protocol
// should understand that migratory sharing optimizations should not
// be performed (i.e. a load between the LL and SC should not steal
// away exclusive permission).
//
if (pkt->isWrite()) {
primary_type = RubyRequestType_Store_Conditional;
} else {
assert(pkt->isRead());
primary_type = RubyRequestType_Load_Linked;
}
secondary_type = RubyRequestType_ATOMIC;
} else if (pkt->req->isLockedRMW()) {
//
// x86 locked instructions are translated to store cache coherence
// requests because these requests should always be treated as read
// exclusive operations and should leverage any migratory sharing
// optimization built into the protocol.
//
if (pkt->isWrite()) {
primary_type = RubyRequestType_Locked_RMW_Write;
} else {
assert(pkt->isRead());
primary_type = RubyRequestType_Locked_RMW_Read;
}
secondary_type = RubyRequestType_ST;
} else if (pkt->isAtomicOp()) {
//
// GPU Atomic Operation
//
primary_type = RubyRequestType_ATOMIC;
secondary_type = RubyRequestType_ATOMIC;
} else {
if (pkt->isRead()) {
if (pkt->req->isInstFetch()) {
primary_type = secondary_type = RubyRequestType_IFETCH;
} else {
#if THE_ISA == X86_ISA
uint32_t flags = pkt->req->getFlags();
bool storeCheck = flags &
(TheISA::StoreCheck << TheISA::FlagShift);
#else
bool storeCheck = false;
#endif // X86_ISA
if (storeCheck) {
primary_type = RubyRequestType_RMW_Read;
secondary_type = RubyRequestType_ST;
} else {
primary_type = secondary_type = RubyRequestType_LD;
}
}
} else if (pkt->isWrite()) {
//
// Note: M5 packets do not differentiate ST from RMW_Write
//
primary_type = secondary_type = RubyRequestType_ST;
} else if (pkt->isFlush()) {
primary_type = secondary_type = RubyRequestType_FLUSH;
} else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
if (assumingRfOCoherence) {
// If we reached here, this request must be a memFence
// and the protocol implements RfO; the coalescer can
// assume sequential consistency and schedule the callback
// immediately.
// Currently the code implements fence callbacks
// by reusing the mechanism for kernel completions.
// This should be fixed.
int wf_id = 0;
if (pkt->req->hasContextId()) {
wf_id = pkt->req->contextId();
}
insertKernel(wf_id, pkt);
newKernelEnds.push_back(wf_id);
if (!issueEvent.scheduled()) {
schedule(issueEvent, curTick());
}
return RequestStatus_Issued;
} else {
// If not RfO, return issued here and let the child coalescer
// take care of it.
return RequestStatus_Issued;
}
} else {
panic("Unsupported ruby packet type\n");
}
}
// Check if there is any pending request to this cache line from
// previous cycles.
// If there is a pending request, return aliased. Since coalescing
// across time is not permitted, aliased requests are not coalesced.
// If a request for this address has already been issued, we must block
RequestStatus status = getRequestStatus(pkt, primary_type);
if (status != RequestStatus_Ready)
return status;
Addr line_addr = makeLineAddress(pkt->getAddr());
// Check if this request can be coalesced with previous
// requests from this cycle.
if (!reqCoalescer.count(line_addr)) {
// This is the first access to this cache line.
// A new request to the memory subsystem has to be
// made in the next cycle for this cache line, so
// add this line addr to the "newRequests" queue
newRequests.push_back(line_addr);
// There was a request to this cache line in this cycle,
// let us see if we can coalesce this request with the previous
// requests from this cycle
} else if (primary_type !=
reqCoalescer[line_addr][0].primaryType) {
// can't coalesce loads, stores and atomics!
return RequestStatus_Aliased;
} else if (pkt->req->isLockedRMW() ||
reqCoalescer[line_addr][0].pkt->req->isLockedRMW()) {
// can't coalesce locked accesses, but can coalesce atomics!
return RequestStatus_Aliased;
} else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
pkt->req->contextId() !=
reqCoalescer[line_addr][0].pkt->req->contextId()) {
// can't coalesce releases from different wavefronts
return RequestStatus_Aliased;
}
// in addition to the packet, we need to save both request types
reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type);
if (!issueEvent.scheduled())
schedule(issueEvent, curTick());
// TODO: issue hardware prefetches here
return RequestStatus_Issued;
}
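// Build a single Ruby request for a coalesced cache line: walk every
// packet recorded for the line in reqCoalescer, accumulate a byte access
// mask, any store data, and any atomic operations, then enqueue the
// resulting RubyRequest on the mandatory queue with the data cache hit
// latency.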
void
GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
{
int proc_id = -1;
if (pkt != NULL && pkt->req->hasContextId()) {
proc_id = pkt->req->contextId();
}
// If valid, copy the pc to the ruby request
Addr pc = 0;
if (pkt->req->hasPC()) {
pc = pkt->req->getPC();
}
// At the moment, setting scopes only matters for GPU spill space
// accesses (i.e. pkt->req->isStack()). That scope is REPLACE, since
// spill data does not need to be flushed at the end of a kernel;
// private and local data may need to be visible at the end of the
// kernel.
HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
HSAScope accessScope = reqScopeToHSAScope(pkt->req);
Addr line_addr = makeLineAddress(pkt->getAddr());
// Create a write mask that records written bytes and atomic
// operations; this enables partial writes and partial reads of
// those writes.
DataBlock dataBlock;
dataBlock.clear();
uint32_t blockSize = RubySystem::getBlockSizeBytes();
std::vector<bool> accessMask(blockSize,false);
std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
uint32_t tableSize = reqCoalescer[line_addr].size();
for (int i = 0; i < tableSize; i++) {
PacketPtr tmpPkt = reqCoalescer[line_addr][i].pkt;
uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
uint32_t tmpSize = tmpPkt->getSize();
if (tmpPkt->isAtomicOp()) {
std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
tmpPkt->getAtomicOp());
atomicOps.push_back(tmpAtomicOp);
} else if (tmpPkt->isWrite()) {
dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
tmpOffset, tmpSize);
}
for (int j = 0; j < tmpSize; j++) {
accessMask[tmpOffset + j] = true;
}
}
std::shared_ptr<RubyRequest> msg;
if (pkt->isAtomicOp()) {
msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
pkt->getPtr<uint8_t>(),
pkt->getSize(), pc, secondary_type,
RubyAccessMode_Supervisor, pkt,
PrefetchBit_No, proc_id, 100,
blockSize, accessMask,
dataBlock, atomicOps,
accessScope, accessSegment);
} else {
msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
pkt->getPtr<uint8_t>(),
pkt->getSize(), pc, secondary_type,
RubyAccessMode_Supervisor, pkt,
PrefetchBit_No, proc_id, 100,
blockSize, accessMask,
dataBlock,
accessScope, accessSegment);
}
DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
curTick(), m_version, "Coal", "Begin", "", "",
printAddress(msg->getPhysicalAddress()),
RubyRequestType_to_string(secondary_type));
fatal_if(secondary_type == RubyRequestType_IFETCH,
"there should not be any I-Fetch requests in the GPU Coalescer");
// Send the message to the cache controller
fatal_if(m_data_cache_hit_latency == 0,
"should not have a latency of zero");
assert(m_mandatory_q_ptr);
m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
}
template <class KEY, class VALUE>
std::ostream &
operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
{
out << "[";
for (auto i = map.begin(); i != map.end(); ++i)
out << " " << i->first << "=" << i->second;
out << " ]";
return out;
}
void
GPUCoalescer::print(ostream& out) const
{
out << "[GPUCoalescer: " << m_version
<< ", outstanding requests: " << m_outstanding_count
<< ", read request table: " << m_readRequestTable
<< ", write request table: " << m_writeRequestTable
<< "]";
}
// This can be called from setState whenever coherence permissions are
// upgraded; when invoked, coherence violations will be checked for the
// given block.
void
GPUCoalescer::checkCoherence(Addr addr)
{
#ifdef CHECK_COHERENCE
m_ruby_system->checkGlobalCoherenceInvariant(addr);
#endif
}
void
GPUCoalescer::recordRequestType(SequencerRequestType requestType)
{
DPRINTF(RubyStats, "Recorded statistic: %s\n",
SequencerRequestType_to_string(requestType));
}
void
GPUCoalescer::completeIssue()
{
// newRequests has the cacheline addresses of all the
// requests which need to be issued to the memory subsystem
// in this cycle
int len = newRequests.size();
DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
for (int i = 0; i < len; ++i) {
// Get the requests from reqCoalescer table. Get only the
// first request for each cacheline, the remaining requests
// can be coalesced with the first request. So, only
// one request is issued per cacheline.
RequestDesc info = reqCoalescer[newRequests[i]][0];
PacketPtr pkt = info.pkt;
DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
i, pkt->req->getPaddr());
// Insert this request to the read/writeRequestTables. These tables
// are used to track aliased requests in makeRequest subroutine
bool found = insertRequest(pkt, info.primaryType);
if (found) {
panic("GPUCoalescer::makeRequest should never be called if the "
"request is already outstanding\n");
}
// Issue request to ruby subsystem
issueRequest(pkt, info.secondaryType);
}
newRequests.clear();
// Check whether any Kernel End releases were issued this cycle.
len = newKernelEnds.size();
for (int i = 0; i < len; i++) {
kernelCallback(newKernelEnds[i]);
}
newKernelEnds.clear();
}
void
GPUCoalescer::evictionCallback(Addr address)
{
ruby_eviction_callback(address);
}
void
GPUCoalescer::kernelCallback(int wavefront_id)
{
assert(kernelEndList.count(wavefront_id));
ruby_hit_callback(kernelEndList[wavefront_id]);
kernelEndList.erase(wavefront_id);
}
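// Completion path for GPU atomics. Atomics are performed in memory, so
// unlike writeCallback there is no LL/SC handling or MRU update; the
// pre-atomic data is copied back to every coalesced packet that expects
// a return value.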
void
GPUCoalescer::atomicCallback(Addr address,
MachineType mach,
const DataBlock& data)
{
assert(address == makeLineAddress(address));
DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
assert(m_writeRequestTable.count(makeLineAddress(address)));
RequestTable::iterator i = m_writeRequestTable.find(address);
assert(i != m_writeRequestTable.end());
GPUCoalescerRequest* srequest = i->second;
m_writeRequestTable.erase(i);
markRemoved();
assert((srequest->m_type == RubyRequestType_ATOMIC) ||
(srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
(srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));
// Atomics don't write to cache, so there is no MRU update...
recordMissLatency(srequest, mach,
srequest->issue_time, Cycles(0), Cycles(0), true, false);
PacketPtr pkt = srequest->pkt;
Addr request_address = pkt->getAddr();
Addr request_line_address = makeLineAddress(pkt->getAddr());
int len = reqCoalescer[request_line_address].size();
std::vector<PacketPtr> mylist;
for (int i = 0; i < len; ++i) {
PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
assert(srequest->m_type ==
reqCoalescer[request_line_address][i].primaryType);
request_address = (pkt->getAddr());
request_line_address = makeLineAddress(request_address);
if (pkt->getPtr<uint8_t>() &&
srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
// Atomics are performed in memory and return the data *before*
// the atomic op.
memcpy(pkt->getPtr<uint8_t>(),
data.getData(getOffset(request_address),
pkt->getSize()),
pkt->getSize());
} else {
DPRINTF(MemoryAccess,
"WARNING. Data not transfered from Ruby to M5 for type " \
"%s\n",
RubyRequestType_to_string(srequest->m_type));
}
// If using the RubyTester, update the RubyTester sender state's
// subBlock with the received data. The tester will later access
// this state.
// Note: RubyPort will access its sender state before the
// RubyTester.
if (m_usingRubyTester) {
RubyPort::SenderState *requestSenderState =
safe_cast<RubyPort::SenderState*>(pkt->senderState);
RubyTester::SenderState* testerSenderState =
safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
testerSenderState->subBlock.mergeFrom(data);
}
mylist.push_back(pkt);
}
delete srequest;
reqCoalescer.erase(request_line_address);
assert(!reqCoalescer.count(request_line_address));
completeHitCallback(mylist, len);
}
void
GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
{
if (myMachID == senderMachID) {
CP_TCPLdHits++;
} else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
CP_TCPLdTransfers++;
} else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
CP_TCCLdHits++;
} else {
CP_LdMiss++;
}
}
void
GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
{
if (myMachID == senderMachID) {
CP_TCPStHits++;
} else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
CP_TCPStTransfers++;
} else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
CP_TCCStHits++;
} else {
CP_StMiss++;
}
}
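// Hand each coalesced packet back to the port it arrived on, restoring
// the sender state that was pushed when the request entered the
// RubyPort.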
void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
{
for (int i = 0; i < len; ++i) {
RubyPort::SenderState *ss =
safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
MemSlavePort *port = ss->port;
assert(port != NULL);
mylist[i]->senderState = ss->predecessor;
delete ss;
port->hitCallback(mylist[i]);
trySendRetries();
}
testDrainComplete();
}
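// Return the packet associated with an outstanding read to the given
// line address; the request must be present in the read request table.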
PacketPtr
GPUCoalescer::mapAddrToPkt(Addr address)
{
RequestTable::iterator i = m_readRequestTable.find(address);
assert(i != m_readRequestTable.end());
GPUCoalescerRequest* request = i->second;
return request->pkt;
}
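// Record per-request latency statistics and, for the RfO protocol,
// classify the access as a TCP/TCC hit, transfer, or miss based on the
// responding machine type.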
void
GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
MachineType mach,
Cycles initialRequestTime,
Cycles forwardRequestTime,
Cycles firstResponseTime,
bool success, bool isRegion)
{
RubyRequestType type = srequest->m_type;
Cycles issued_time = srequest->issue_time;
Cycles completion_time = curCycle();
assert(completion_time >= issued_time);
Cycles total_lat = completion_time - issued_time;
// cache stats (valid for RfO protocol only)
if (mach == MachineType_TCP) {
if (type == RubyRequestType_LD) {
GPU_TCPLdHits++;
} else {
GPU_TCPStHits++;
}
} else if (mach == MachineType_L1Cache_wCC) {
if (type == RubyRequestType_LD) {
GPU_TCPLdTransfers++;
} else {
GPU_TCPStTransfers++;
}
} else if (mach == MachineType_TCC) {
if (type == RubyRequestType_LD) {
GPU_TCCLdHits++;
} else {
GPU_TCCStHits++;
}
} else {
if (type == RubyRequestType_LD) {
GPU_LdMiss++;
} else {
GPU_StMiss++;
}
}
// Profile all access latency, even zero latency accesses
m_latencyHist.sample(total_lat);
m_typeLatencyHist[type]->sample(total_lat);
// Profile the miss latency for all non-zero demand misses
if (total_lat != Cycles(0)) {
m_missLatencyHist.sample(total_lat);
m_missTypeLatencyHist[type]->sample(total_lat);
if (mach != MachineType_NUM) {
m_missMachLatencyHist[mach]->sample(total_lat);
m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
if ((issued_time <= initialRequestTime) &&
(initialRequestTime <= forwardRequestTime) &&
(forwardRequestTime <= firstResponseTime) &&
(firstResponseTime <= completion_time)) {
m_IssueToInitialDelayHist[mach]->sample(
initialRequestTime - issued_time);
m_InitialToForwardDelayHist[mach]->sample(
forwardRequestTime - initialRequestTime);
m_ForwardToFirstResponseDelayHist[mach]->sample(
firstResponseTime - forwardRequestTime);
m_FirstResponseToCompletionDelayHist[mach]->sample(
completion_time - firstResponseTime);
}
}
}
DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
curTick(), m_version, "Coal",
success ? "Done" : "SC_Failed", "", "",
printAddress(srequest->pkt->getAddr()), total_lat);
}
void
GPUCoalescer::regStats()
{
RubyPort::regStats();
// These statistical variables are not for display.
// The profiler will collate these across different
// coalescers and display those collated statistics.
m_outstandReqHist.init(10);
m_latencyHist.init(10);
m_missLatencyHist.init(10);
for (int i = 0; i < RubyRequestType_NUM; i++) {
m_typeLatencyHist.push_back(new Stats::Histogram());
m_typeLatencyHist[i]->init(10);
m_missTypeLatencyHist.push_back(new Stats::Histogram());
m_missTypeLatencyHist[i]->init(10);
}
for (int i = 0; i < MachineType_NUM; i++) {
m_missMachLatencyHist.push_back(new Stats::Histogram());
m_missMachLatencyHist[i]->init(10);
m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
m_IssueToInitialDelayHist[i]->init(10);
m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
m_InitialToForwardDelayHist[i]->init(10);
m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
m_ForwardToFirstResponseDelayHist[i]->init(10);
m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
m_FirstResponseToCompletionDelayHist[i]->init(10);
}
for (int i = 0; i < RubyRequestType_NUM; i++) {
m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
for (int j = 0; j < MachineType_NUM; j++) {
m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
m_missTypeMachLatencyHist[i][j]->init(10);
}
}
// GPU cache stats
GPU_TCPLdHits
.name(name() + ".gpu_tcp_ld_hits")
.desc("loads that hit in the TCP")
;
GPU_TCPLdTransfers
.name(name() + ".gpu_tcp_ld_transfers")
.desc("TCP to TCP load transfers")
;
GPU_TCCLdHits
.name(name() + ".gpu_tcc_ld_hits")
.desc("loads that hit in the TCC")
;
GPU_LdMiss
.name(name() + ".gpu_ld_misses")
.desc("loads that miss in the GPU")
;
GPU_TCPStHits
.name(name() + ".gpu_tcp_st_hits")
.desc("stores that hit in the TCP")
;
GPU_TCPStTransfers
.name(name() + ".gpu_tcp_st_transfers")
.desc("TCP to TCP store transfers")
;
GPU_TCCStHits
.name(name() + ".gpu_tcc_st_hits")
.desc("stores that hit in the TCC")
;
GPU_StMiss
.name(name() + ".gpu_st_misses")
.desc("stores that miss in the GPU")
;
// CP cache stats
CP_TCPLdHits
.name(name() + ".cp_tcp_ld_hits")
.desc("loads that hit in the TCP")
;
CP_TCPLdTransfers
.name(name() + ".cp_tcp_ld_transfers")
.desc("TCP to TCP load transfers")
;
CP_TCCLdHits
.name(name() + ".cp_tcc_ld_hits")
.desc("loads that hit in the TCC")
;
CP_LdMiss
.name(name() + ".cp_ld_misses")
.desc("loads that miss in the GPU")
;
CP_TCPStHits
.name(name() + ".cp_tcp_st_hits")
.desc("stores that hit in the TCP")
;
CP_TCPStTransfers
.name(name() + ".cp_tcp_st_transfers")
.desc("TCP to TCP store transfers")
;
CP_TCCStHits
.name(name() + ".cp_tcc_st_hits")
.desc("stores that hit in the TCC")
;
CP_StMiss
.name(name() + ".cp_st_misses")
.desc("stores that miss in the GPU")
;
}