/*
* Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Sooraj Puthoor
*/
#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
#include <iostream>
#include <unordered_map>
#include "base/statistics.hh"
#include "cpu/testers/gpu_ruby_test/ProtocolTester.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/misc.hh"
#include "mem/protocol/PrefetchBit.hh"
#include "mem/protocol/RubyAccessMode.hh"
#include "mem/protocol/RubyRequestType.hh"
#include "mem/protocol/SequencerRequestType.hh"
#include "mem/request.hh"
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/Consumer.hh"
#include "mem/ruby/system/Sequencer.hh"
class DataBlock;
class CacheMsg;
class MachineID;
class CacheMemory;
class RubyGPUCoalescerParams;
// List of packets that belong to a specific instruction.
typedef std::list<PacketPtr> PerInstPackets;
class UncoalescedTable
{
public:
UncoalescedTable(GPUCoalescer *gc);
~UncoalescedTable() {}
void insertPacket(PacketPtr pkt);
bool packetAvailable();
void printRequestTable(std::stringstream& ss);
// Returns a pointer to the list of packets corresponding to the
// instruction at the given offset in the instruction map, or nullptr
// if there is no instruction at that offset.
PerInstPackets* getInstPackets(int offset);
void updateResources();
bool areRequestsDone(const uint64_t instSeqNum);
private:
GPUCoalescer *coalescer;
// Maps an instruction's unique sequence number to a queue of packets
// which need responses. This data structure assumes the sequence number
// is monotonically increasing (which is true for the CU class) so that
// packets can be issued in age order. An illustrative sketch follows
// this class.
std::map<uint64_t, PerInstPackets> instMap;
};
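// Illustrative usage sketch (comments only, not part of the interface):
// because instMap is an ordered std::map keyed by a monotonically
// increasing sequence number, walking it front to back visits
// instructions in age order. Assuming the offset argument counts from the
// oldest instruction, a caller (here `table` is a hypothetical
// UncoalescedTable instance) could drain the oldest instruction's packets
// roughly like this:
//
//   table.insertPacket(pkt);               // keyed by pkt's inst seq num
//   if (PerInstPackets *list = table.getInstPackets(0)) {
//       for (PacketPtr p : *list) {
//           // try to coalesce p into a CoalescedRequest
//       }
//   }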
class CoalescedRequest
{
public:
CoalescedRequest(uint64_t _seqNum)
: seqNum(_seqNum), issueTime(Cycles(0)),
rubyType(RubyRequestType_NULL)
{}
~CoalescedRequest() {}
void insertPacket(PacketPtr pkt) { pkts.push_back(pkt); }
void setSeqNum(uint64_t _seqNum) { seqNum = _seqNum; }
void setIssueTime(Cycles _issueTime) { issueTime = _issueTime; }
void setRubyType(RubyRequestType type) { rubyType = type; }
uint64_t getSeqNum() const { return seqNum; }
PacketPtr getFirstPkt() const { return pkts[0]; }
Cycles getIssueTime() const { return issueTime; }
RubyRequestType getRubyType() const { return rubyType; }
std::vector<PacketPtr>& getPackets() { return pkts; }
private:
uint64_t seqNum;
Cycles issueTime;
RubyRequestType rubyType;
std::vector<PacketPtr> pkts;
};
// PendingWriteInst tracks the number of outstanding Ruby requests
// per write instruction. Once all requests associated with an instruction
// are completely done in Ruby, we call back the requester to mark
// that this instruction is complete. An illustrative lifecycle sketch
// follows the class definition.
class PendingWriteInst
{
public:
PendingWriteInst()
: numPendingStores(0),
numRequiredWriteCompleteAcks(0),
originalPort(nullptr),
gpuDynInstPtr(nullptr)
{}
~PendingWriteInst()
{}
void
addPendingReq(RubyPort::MemSlavePort* port, PacketPtr pkt,
bool usingRubyTester)
{
assert(port);
originalPort = port;
RubyPort::SenderState* ss =
safe_cast<RubyPort::SenderState*>(pkt->senderState);
if (usingRubyTester) {
// If this coalescer is connected to a tester thread, we need
// to save the corresponding requesting thread.
// get the requesting thread from the original sender state
ProtocolTester::SenderState* senderState =
safe_cast<ProtocolTester::SenderState*>
(ss->predecessor);
testerThreadPtr = senderState->th;
numRequiredWriteCompleteAcks++;
} else {
// If this coalescer is connected to a real CU, we need
// to save the corresponding gpu dynamic instruction.
// CU will use that instruction to decrement wait counters
// in the issuing wavefront.
// For Ruby tester, gpuDynInst == nullptr
ComputeUnit::DataPort::SenderState* cu_state =
safe_cast<ComputeUnit::DataPort::SenderState*>
(ss->predecessor);
gpuDynInstPtr = cu_state->_gpuDynInst;
numRequiredWriteCompleteAcks = 1;
}
numPendingStores++;
}
// return true if no more acks are expected
bool
receiveWriteCompleteAck()
{
assert(numPendingStores > 0);
numPendingStores--;
return numPendingStores == 0;
}
// ack the original requester that this write instruction is complete
void
ackWriteCompletion(bool usingRubyTester)
{
assert(numPendingStores == 0);
assert(numRequiredWriteCompleteAcks > 0);
for (int i = 0; i < numRequiredWriteCompleteAcks; ++i) {
// make a response packet
PacketPtr pkt = new Packet(new Request(), MemCmd::MessageResp);
if (usingRubyTester) {
assert(testerThreadPtr);
ProtocolTester::SenderState* ss =
new ProtocolTester::SenderState(testerThreadPtr);
pkt->senderState = ss;
} else {
assert(gpuDynInstPtr);
ComputeUnit::DataPort::SenderState* ss =
new ComputeUnit::DataPort::SenderState
(gpuDynInstPtr, 0, nullptr);
pkt->senderState = ss;
}
// send the ack response to the requester
originalPort->sendTimingResp(pkt);
}
}
int
getNumPendingStores()
{
    return numPendingStores;
}
private:
// the number of stores waiting for writeCompleteCallback
int numPendingStores;
// number of write-complete acks required by the requester:
// if the requester is a CU, this number is always 1;
// if the requester is a tester thread, it equals the number of store
// requests sent to the coalescer
int numRequiredWriteCompleteAcks;
// The original port that sent one of the packets associated with this
// write instruction. We may have more than one packet per instruction,
// which implies multiple ports per instruction. However, we need
// only one of those ports to call back the CU. Therefore, here we keep
// track of the port that sent the first packet of this instruction.
RubyPort::MemSlavePort* originalPort;
// similar to the originalPort, this gpuDynInstPtr is set only for
// the first packet of this instruction.
GPUDynInstPtr gpuDynInstPtr;
// if protocol tester is used, this points to the requesting tester thread
Thread* testerThreadPtr;
};
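// Illustrative lifecycle sketch (comments only): for a write instruction
// split into several packets, the coalescer is expected to drive a
// PendingWriteInst roughly as follows. `seqNum`, `port`, `pkt` and
// `usingRubyTester` are placeholders; the real bookkeeping lives in
// GPUCoalescer::pendingWriteInsts declared further below.
//
//   PendingWriteInst &inst = pendingWriteInsts[seqNum];
//   inst.addPendingReq(port, pkt, usingRubyTester); // once per issued packet
//   ...
//   // on each writeCompleteCallback coming back from Ruby:
//   if (inst.receiveWriteCompleteAck()) {
//       // every store of this instruction is now complete
//       inst.ackWriteCompletion(usingRubyTester);
//       pendingWriteInsts.erase(seqNum);
//   }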
class GPUCoalescer : public RubyPort
{
public:
typedef RubyGPUCoalescerParams Params;
GPUCoalescer(const Params *);
~GPUCoalescer();
// Public Methods
void wakeup(); // Used only for deadlock detection
void printRequestTable(std::stringstream& ss);
void printProgress(std::ostream& out) const;
void resetStats();
void collateStats();
void regStats();
// Each store request needs two callbacks:
// (1) writeCallback is called when the store is received and processed
// by the TCP. This writeCallback does not guarantee that the store is
// actually completed at its destination cache or memory. writeCallback
// helps release hardware resources (e.g., its entry in coalescedTable)
// allocated for the store so that subsequent requests are not
// blocked unnecessarily due to hardware resource constraints.
// (2) writeCompleteCallback is called when the store is fully completed
// at its destination cache or memory. writeCompleteCallback
// guarantees that the store is fully completed. This callback
// will decrement hardware counters in the CU. An illustrative call
// ordering is sketched after the write callback declarations below.
void writeCallback(Addr address, DataBlock& data);
void writeCallback(Addr address,
MachineType mach,
DataBlock& data);
void writeCallback(Addr address,
MachineType mach,
DataBlock& data,
Cycles initialRequestTime,
Cycles forwardRequestTime,
Cycles firstResponseTime,
bool isRegion);
void writeCallback(Addr address,
MachineType mach,
DataBlock& data,
Cycles initialRequestTime,
Cycles forwardRequestTime,
Cycles firstResponseTime);
void writeCompleteCallback(Addr address,
uint64_t instSeqNum,
MachineType mach);
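// Illustrative ordering for one store (sketch only; the actual call sites
// are protocol/SLICC specific): writeCallback fires first to release the
// coalescedTable entry, and writeCompleteCallback fires later once the
// data has reached its destination. `coalescer`, `lineAddr`, `machType`,
// `dataBlk` and `instSeqNum` are placeholder names.
//
//   coalescer->writeCallback(lineAddr, machType, dataBlk);
//   ...
//   coalescer->writeCompleteCallback(lineAddr, instSeqNum, machType);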
void readCallback(Addr address, DataBlock& data);
void readCallback(Addr address,
MachineType mach,
DataBlock& data);
void readCallback(Addr address,
MachineType mach,
DataBlock& data,
Cycles initialRequestTime,
Cycles forwardRequestTime,
Cycles firstResponseTime);
void readCallback(Addr address,
MachineType mach,
DataBlock& data,
Cycles initialRequestTime,
Cycles forwardRequestTime,
Cycles firstResponseTime,
bool isRegion);
/* atomics need their own callback because the data
might be const coming from SLICC */
virtual void atomicCallback(Addr address,
MachineType mach,
const DataBlock& data);
RequestStatus makeRequest(PacketPtr pkt);
int outstandingCount() const { return m_outstanding_count; }
bool
isDeadlockEventScheduled() const
{
return deadlockCheckEvent.scheduled();
}
void
descheduleDeadlockEvent()
{
deschedule(deadlockCheckEvent);
}
bool empty() const;
void print(std::ostream& out) const;
void checkCoherence(Addr address);
void evictionCallback(Addr address);
void completeIssue();
void insertKernel(int wavefront_id, PacketPtr pkt);
Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
Stats::Histogram& getLatencyHist() { return m_latencyHist; }
Stats::Histogram& getTypeLatencyHist(uint32_t t)
{ return *m_typeLatencyHist[t]; }
Stats::Histogram& getMissLatencyHist()
{ return m_missLatencyHist; }
Stats::Histogram& getMissTypeLatencyHist(uint32_t t)
{ return *m_missTypeLatencyHist[t]; }
Stats::Histogram& getMissMachLatencyHist(uint32_t t) const
{ return *m_missMachLatencyHist[t]; }
Stats::Histogram&
getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
{ return *m_missTypeMachLatencyHist[r][t]; }
Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const
{ return *m_IssueToInitialDelayHist[t]; }
Stats::Histogram&
getInitialToForwardDelayHist(const MachineType t) const
{ return *m_InitialToForwardDelayHist[t]; }
Stats::Histogram&
getForwardRequestToFirstResponseHist(const MachineType t) const
{ return *m_ForwardToFirstResponseDelayHist[t]; }
Stats::Histogram&
getFirstResponseToCompletionDelayHist(const MachineType t) const
{ return *m_FirstResponseToCompletionDelayHist[t]; }
protected:
bool tryCacheAccess(Addr addr, RubyRequestType type,
Addr pc, RubyAccessMode access_mode,
int size, DataBlock*& data_ptr);
// Since the two following issue functions are protocol-specific,
// they must be implemented in a derived coalescer (a sketch of such a
// derived class follows these declarations).
virtual void issueRequest(CoalescedRequest* crequest) = 0;
virtual void issueMemSyncRequest(PacketPtr pkt) = 0;
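// Sketch of a derived, protocol-specific coalescer providing the two
// issue hooks (hypothetical class name, illustration only):
//
//   class MyProtocolCoalescer : public GPUCoalescer
//   {
//     protected:
//       void issueRequest(CoalescedRequest* crequest) override
//       { /* build a RubyRequest from crequest and enqueue it to the
//            attached cache controller */ }
//       void issueMemSyncRequest(PacketPtr pkt) override
//       { /* issue a memory-fence/sync request for pkt */ }
//   };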
void kernelCallback(int wavefront_id);
void hitCallback(CoalescedRequest* crequest,
MachineType mach,
DataBlock& data,
bool success,
Cycles initialRequestTime,
Cycles forwardRequestTime,
Cycles firstResponseTime,
bool isRegion);
void recordMissLatency(CoalescedRequest* crequest,
MachineType mach,
Cycles initialRequestTime,
Cycles forwardRequestTime,
Cycles firstResponseTime,
bool success, bool isRegion);
void completeHitCallback(std::vector<PacketPtr> & mylist);
virtual RubyRequestType getRequestType(PacketPtr pkt);
// Attempt to remove a packet from the uncoalescedTable and coalesce it
// with a previous request from the same instruction. If there is no
// previous request to coalesce with and the max number of outstanding
// requests has not been reached, a new coalesced request is created and
// added to the "target" list of the coalescedTable (a pseudocode sketch
// follows the declaration).
bool coalescePacket(PacketPtr pkt);
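// Pseudocode sketch of the decision described above (illustration only,
// not the actual implementation):
//
//   Addr line = makeLineAddress(pkt->getAddr());
//   if (an existing CoalescedRequest for `line` is from the same inst)
//       append pkt to that request;                      // coalesce
//   else if (m_outstanding_count < m_max_outstanding_requests)
//       create a new CoalescedRequest, push it onto coalescedTable[line],
//       and schedule it for issue;
//   else
//       return false;     // pkt stays in the uncoalescedTable for now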
EventFunctionWrapper issueEvent;
protected:
int m_max_outstanding_requests;
int m_deadlock_threshold;
CacheMemory* m_dataCache_ptr;
CacheMemory* m_instCache_ptr;
// The cache access latency for this GPU data cache. This is assessed at
// the beginning of each access. This should be very similar to the
// implementation in Sequencer() as this is very much like a Sequencer
Cycles m_data_cache_hit_latency;
// coalescingWindow is the maximum number of instructions that are
// allowed to be coalesced in a single cycle.
int coalescingWindow;
// The uncoalescedTable contains several "columns" which hold memory
// request packets for an instruction. The maximum size is the number of
// columns * the wavefront size.
UncoalescedTable uncoalescedTable;
// An MSHR-like struct for holding coalesced requests. The requests in
// this table may or may not be outstanding in the memory hierarchy. The
// maximum size is equal to the maximum outstanding requests for a CU
// (typically the number of blocks in the TCP). If there are duplicates
// of an address, they are serviced in age order (see the illustrative
// note after this declaration).
std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
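// Illustrative note (sketch only): two coalesced requests to the same
// line address coexist as separate entries queued in the deque, e.g.
//
//   coalescedTable[lineAddr].push_back(older);   // issued/serviced first
//   coalescedTable[lineAddr].push_back(newer);   // waits behind `older`
//
// so callbacks are expected to service and pop the front entry first,
// preserving age order.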
// A map between an instruction's sequence number and its PendingWriteInst.
// This is used to do a final callback for each write when it is
// completely done in the memory system.
std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts;
// Global outstanding request count, across all request tables
int m_outstanding_count;
bool m_deadlock_check_scheduled;
std::unordered_map<int, PacketPtr> kernelEndList;
std::vector<int> newKernelEnds;
int m_store_waiting_on_load_cycles;
int m_store_waiting_on_store_cycles;
int m_load_waiting_on_store_cycles;
int m_load_waiting_on_load_cycles;
bool m_runningGarnetStandalone;
EventFunctionWrapper deadlockCheckEvent;
bool assumingRfOCoherence;
// TODO - Need to update the following stats once the VIPER protocol
// is re-integrated.
// // m5 style stats for TCP hit/miss counts
// Stats::Scalar GPU_TCPLdHits;
// Stats::Scalar GPU_TCPLdTransfers;
// Stats::Scalar GPU_TCCLdHits;
// Stats::Scalar GPU_LdMiss;
//
// Stats::Scalar GPU_TCPStHits;
// Stats::Scalar GPU_TCPStTransfers;
// Stats::Scalar GPU_TCCStHits;
// Stats::Scalar GPU_StMiss;
//
// Stats::Scalar CP_TCPLdHits;
// Stats::Scalar CP_TCPLdTransfers;
// Stats::Scalar CP_TCCLdHits;
// Stats::Scalar CP_LdMiss;
//
// Stats::Scalar CP_TCPStHits;
// Stats::Scalar CP_TCPStTransfers;
// Stats::Scalar CP_TCCStHits;
// Stats::Scalar CP_StMiss;
//! Histogram for number of outstanding requests per cycle.
Stats::Histogram m_outstandReqHist;
//! Histogram for holding latency profile of all requests.
Stats::Histogram m_latencyHist;
std::vector<Stats::Histogram *> m_typeLatencyHist;
//! Histogram for holding latency profile of all requests that
//! miss in the controller connected to this sequencer.
Stats::Histogram m_missLatencyHist;
std::vector<Stats::Histogram *> m_missTypeLatencyHist;
//! Histograms for profiling the latencies for requests that
//! required external messages.
std::vector<Stats::Histogram *> m_missMachLatencyHist;
std::vector< std::vector<Stats::Histogram *> > m_missTypeMachLatencyHist;
//! Histograms for recording the breakdown of miss latency
std::vector<Stats::Histogram *> m_IssueToInitialDelayHist;
std::vector<Stats::Histogram *> m_InitialToForwardDelayHist;
std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
// TODO - Need to update the following stats once the VIPER protocol
// is re-integrated.
// Stats::Distribution numHopDelays;
// Stats::Distribution tcpToTccDelay;
// Stats::Distribution tccToSdDelay;
// Stats::Distribution sdToSdDelay;
// Stats::Distribution sdToTccDelay;
// Stats::Distribution tccToTcpDelay;
//
// Stats::Average avgTcpToTcc;
// Stats::Average avgTccToSd;
// Stats::Average avgSdToSd;
// Stats::Average avgSdToTcc;
// Stats::Average avgTccToTcp;
private:
// Private copy constructor and assignment operator
GPUCoalescer(const GPUCoalescer& obj);
GPUCoalescer& operator=(const GPUCoalescer& obj);
};
inline std::ostream&
operator<<(std::ostream& out, const GPUCoalescer& obj)
{
obj.print(out);
out << std::flush;
return out;
}
#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__