| /* |
| * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * For use for simulation and test purposes only |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its |
| * contributors may be used to endorse or promote products derived from this |
| * software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "gpu-compute/shader.hh" |
| |
| #include <limits> |
| |
| #include "arch/x86/linux/linux.hh" |
| #include "arch/x86/page_size.hh" |
| #include "base/chunk_generator.hh" |
| #include "debug/GPUAgentDisp.hh" |
| #include "debug/GPUDisp.hh" |
| #include "debug/GPUMem.hh" |
| #include "debug/GPUShader.hh" |
| #include "debug/GPUWgLatency.hh" |
| #include "gpu-compute/dispatcher.hh" |
| #include "gpu-compute/gpu_command_processor.hh" |
| #include "gpu-compute/gpu_static_inst.hh" |
| #include "gpu-compute/hsa_queue_entry.hh" |
| #include "gpu-compute/wavefront.hh" |
| #include "mem/packet.hh" |
| #include "mem/ruby/system/RubySystem.hh" |
| #include "sim/sim_exit.hh" |
| |
| namespace gem5 |
| { |
| |
| Shader::Shader(const Params &p) : ClockedObject(p), |
| _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr), |
| gpuTc(nullptr), cpuPointer(p.cpu_pointer), |
| tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event", |
| false, Event::CPU_Tick_Pri), |
| timingSim(p.timing), hsail_mode(SIMT), |
| impl_kern_launch_acq(p.impl_kern_launch_acq), |
| impl_kern_end_rel(p.impl_kern_end_rel), |
| coissue_return(1), |
| trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf), |
| globalMemSize(p.globalmem), |
| nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc), |
| _dispatcher(*p.dispatcher), |
| max_valu_insts(p.max_valu_insts), total_valu_insts(0), |
| stats(this, p.CUs[0]->wfSize()) |
| { |
| gpuCmdProc.setShader(this); |
| _dispatcher.setShader(this); |
| |
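| // Set up the fixed base/limit apertures for GPUVM, LDS, and scratch |
| // memory. Each aperture is a window in the GPU's 64-bit virtual address |
| // space used to distinguish the corresponding address space. |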
| _gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L; |
| _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL; |
| |
| _ldsApe.base = ((Addr)1 << 61) + 0x0; |
| _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF; |
| |
| _scratchApe.base = ((Addr)1 << 61) + 0x100000000L; |
| _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF; |
| |
| shHiddenPrivateBaseVmid = 0; |
| |
| cuList.resize(n_cu); |
| |
| panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD"); |
| |
| for (int i = 0; i < n_cu; ++i) { |
| cuList[i] = p.CUs[i]; |
| assert(i == cuList[i]->cu_id); |
| cuList[i]->shader = this; |
| cuList[i]->idleCUTimeout = p.idlecu_timeout; |
| } |
| } |
| |
| GPUDispatcher& |
| Shader::dispatcher() |
| { |
| return _dispatcher; |
| } |
| |
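| /* |
| * Allocate 'length' bytes (rounded up to a whole page) in the mmap region |
| * of the CPU process that owns gpuTc, and return the starting virtual |
| * address of the new mapping. |
| */ |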
| Addr |
| Shader::mmap(int length) |
| { |
| |
| Addr start; |
| |
| // round up length to the next page |
| length = roundUp(length, X86ISA::PageBytes); |
| |
| Process *proc = gpuTc->getProcessPtr(); |
| auto mem_state = proc->memState; |
| |
| if (proc->mmapGrowsDown()) { |
| DPRINTF(GPUShader, "GROWS DOWN"); |
| start = mem_state->getMmapEnd() - length; |
| mem_state->setMmapEnd(start); |
| } else { |
| DPRINTF(GPUShader, "GROWS UP"); |
| start = mem_state->getMmapEnd(); |
| mem_state->setMmapEnd(start + length); |
| |
| // assertion to make sure we don't overwrite the stack (it grows down) |
| assert(mem_state->getStackBase() - mem_state->getMaxStackSize() > |
| mem_state->getMmapEnd()); |
| } |
| |
| DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length); |
| |
| proc->allocateMem(start, length); |
| |
| return start; |
| } |
| |
| void |
| Shader::init() |
| { |
| // grab the threadContext of the thread running on the CPU |
| assert(cpuPointer); |
| gpuTc = cpuPointer->getContext(0); |
| assert(gpuTc); |
| } |
| |
| Shader::~Shader() |
| { |
| for (int j = 0; j < n_cu; ++j) |
| delete cuList[j]; |
| } |
| |
| void |
| Shader::updateContext(int cid) |
| { |
| // context of the thread which dispatched work |
| assert(cpuPointer); |
| gpuTc = cpuPointer->getContext(cid); |
| assert(gpuTc); |
| } |
| |
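| /* |
| * Apply any scheduled adds whose time has arrived, then re-arm the tick |
| * event if adds are still outstanding. |
| */ |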
| void |
| Shader::execScheduledAdds() |
| { |
| assert(!sa_when.empty()); |
| |
| // apply any scheduled adds |
| for (int i = 0; i < sa_n; ++i) { |
| if (sa_when[i] <= curTick()) { |
| *sa_val[i] += sa_x[i]; |
| panic_if(*sa_val[i] < 0, "Negative counter value\n"); |
| sa_val.erase(sa_val.begin() + i); |
| sa_x.erase(sa_x.begin() + i); |
| sa_when.erase(sa_when.begin() + i); |
| --sa_n; |
| --i; |
| } |
| } |
| if (!sa_when.empty()) { |
| Tick shader_wakeup = *std::max_element(sa_when.begin(), |
| sa_when.end()); |
| DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup); |
| schedule(tickEvent, shader_wakeup); |
| } else { |
| DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n"); |
| } |
| } |
| |
| /* |
| * The dispatcher/shader arranges for cache invalidate requests to be |
| * sent to the CUs at kernel launch. |
| */ |
| void |
| Shader::prepareInvalidate(HSAQueueEntry *task) |
| { |
| // if invalidate has already started/finished, then do nothing |
| if (task->isInvStarted()) return; |
| |
| // The invalidate has not started yet; it is performed only once, at |
| // kernel launch. |
| assert(task->outstandingInvs() == -1); |
| int kernId = task->dispatchId(); |
| // Bump the counter from -1 to 0 to indicate the invalidate is about |
| // to start. |
| _dispatcher.updateInvCounter(kernId, +1); |
| |
| // Iterate over all CUs managed by this shader and issue the invalidate. |
| for (int i_cu = 0; i_cu < n_cu; ++i_cu) { |
| // Create a request to carry the invalidate; its fields are filled in |
| // by the CU before use. |
| auto req = std::make_shared<Request>(0, 0, 0, |
| cuList[i_cu]->requestorId(), |
| 0, -1); |
| |
| _dispatcher.updateInvCounter(kernId, +1); |
| // All necessary invalidate state is set; have the CU execute it. |
| cuList[i_cu]->doInvalidate(req, task->dispatchId()); |
| |
| // TODO: resetting the register pool here is intrusive; ideally the CU |
| // would manage its own register pool. |
| cuList[i_cu]->resetRegisterPool(); |
| } |
| } |
| |
| /** |
| * The dispatcher/shader arranges for cache flush requests to be sent to |
| * the CUs at kernel end. |
| */ |
| void |
| Shader::prepareFlush(GPUDynInstPtr gpuDynInst) |
| { |
| int kernId = gpuDynInst->kern_id; |
| // The flush has not started yet; it is performed only once, at kernel end. |
| assert(_dispatcher.getOutstandingWbs(kernId) == 0); |
| |
| // The first CU managed by this shader performs the flush, on the |
| // assumption that the L2 cache is shared by all CUs in the shader. |
| int i_cu = 0; |
| _dispatcher.updateWbCounter(kernId, +1); |
| cuList[i_cu]->doFlush(gpuDynInst); |
| } |
| |
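| /* |
| * Try to dispatch the task's remaining workgroups, visiting each CU once |
| * in round-robin order starting at nextSchedCu. Returns true if at least |
| * one workgroup was dispatched. |
| */ |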
| bool |
| Shader::dispatchWorkgroups(HSAQueueEntry *task) |
| { |
| bool scheduledSomething = false; |
| int cuCount = 0; |
| int curCu = nextSchedCu; |
| int disp_count(0); |
| |
| while (cuCount < n_cu) { |
| // Every time we try a CU, update nextSchedCu. |
| nextSchedCu = (nextSchedCu + 1) % n_cu; |
| |
| // Dispatch a workgroup to this CU iff both conditions are met: |
| // (a) the task still has undispatched workgroups (!task->dispComplete()) |
| // (b) cuList[curCu] has enough free resources for this workgroup |
| int num_wfs_in_wg = 0; |
| bool can_disp = cuList[curCu]->hasDispResources(task, num_wfs_in_wg); |
| if (!task->dispComplete() && can_disp) { |
| scheduledSomething = true; |
| DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n", |
| curCu, task->globalWgId()); |
| DPRINTF(GPUAgentDisp, "Dispatching a workgroup to CU %d: WG %d\n", |
| curCu, task->globalWgId()); |
| DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n", |
| curTick(), task->globalWgId(), curCu); |
| |
| if (!cuList[curCu]->tickEvent.scheduled()) { |
| if (!_activeCus) |
| _lastInactiveTick = curTick(); |
| _activeCus++; |
| } |
| |
| panic_if(_activeCus <= 0 || _activeCus > cuList.size(), |
| "Invalid activeCu size\n"); |
| cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg); |
| |
| task->markWgDispatch(); |
| ++disp_count; |
| } |
| |
| ++cuCount; |
| curCu = nextSchedCu; |
| } |
| |
| DPRINTF(GPUWgLatency, "Shader Dispatched %d Wgs\n", disp_count); |
| |
| return scheduledSomething; |
| } |
| |
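| /* |
| * Perform a functional read or write for the given request. The address |
| * is translated through the CU's functional TLB port, and accesses that |
| * cross a cache line boundary are split into two packets. |
| */ |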
| void |
| Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data, |
| bool suppress_func_errors, int cu_id) |
| { |
| int block_size = cuList.at(cu_id)->cacheLineSize(); |
| unsigned size = req->getSize(); |
| |
| Addr tmp_addr; |
| BaseMMU::Mode trans_mode; |
| |
| if (cmd == MemCmd::ReadReq) { |
| trans_mode = BaseMMU::Read; |
| } else if (cmd == MemCmd::WriteReq) { |
| trans_mode = BaseMMU::Write; |
| } else { |
| fatal("unexcepted MemCmd\n"); |
| } |
| |
| tmp_addr = req->getVaddr(); |
| Addr split_addr = roundDown(tmp_addr + size - 1, block_size); |
| |
| assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size); |
| |
| // The access crosses a cache line boundary; split it into two requests. |
| if (split_addr > tmp_addr) { |
| RequestPtr req1, req2; |
| req->splitOnVaddr(split_addr, req1, req2); |
| |
| PacketPtr pkt1 = new Packet(req1, cmd); |
| PacketPtr pkt2 = new Packet(req2, cmd); |
| |
| functionalTLBAccess(pkt1, cu_id, trans_mode); |
| functionalTLBAccess(pkt2, cu_id, trans_mode); |
| |
| PacketPtr new_pkt1 = new Packet(pkt1->req, cmd); |
| PacketPtr new_pkt2 = new Packet(pkt2->req, cmd); |
| |
| new_pkt1->dataStatic(data); |
| new_pkt2->dataStatic((uint8_t*)data + req1->getSize()); |
| |
| if (suppress_func_errors) { |
| new_pkt1->setSuppressFuncError(); |
| new_pkt2->setSuppressFuncError(); |
| } |
| |
| // fixme: this should be cuList[cu_id] if cu_id != n_cu |
| // The latter requires a memPort in the dispatcher |
| cuList[0]->memPort[0].sendFunctional(new_pkt1); |
| cuList[0]->memPort[0].sendFunctional(new_pkt2); |
| |
| delete new_pkt1; |
| delete new_pkt2; |
| delete pkt1; |
| delete pkt2; |
| } else { |
| PacketPtr pkt = new Packet(req, cmd); |
| functionalTLBAccess(pkt, cu_id, trans_mode); |
| PacketPtr new_pkt = new Packet(pkt->req, cmd); |
| new_pkt->dataStatic(data); |
| |
| if (suppress_func_errors) { |
| new_pkt->setSuppressFuncError(); |
| } |
| |
| // fixme: this should be cuList[cu_id] if cu_id != n_cu |
| // The latter requires a memPort in the dispatcher |
| cuList[0]->memPort[0].sendFunctional(new_pkt); |
| |
| delete new_pkt; |
| delete pkt; |
| } |
| } |
| |
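| /* |
| * Register a deferred add: 'when' ticks from now, add 'x' to the counter |
| * pointed to by 'val'. The shader's tick event is rescheduled if this add |
| * is due before the currently scheduled wakeup. |
| */ |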
| void |
| Shader::ScheduleAdd(int *val, Tick when, int x) |
| { |
| sa_val.push_back(val); |
| when += curTick(); |
| sa_when.push_back(when); |
| sa_x.push_back(x); |
| ++sa_n; |
| if (!tickEvent.scheduled() || (when < tickEvent.when())) { |
| DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at " |
| "%lu\n", when); |
| reschedule(tickEvent, when, true); |
| } else { |
| assert(tickEvent.scheduled()); |
| DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at " |
| "%lu\n", when); |
| } |
| } |
| |
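| /* |
| * Functionally access 'size' bytes at 'address' on behalf of compute unit |
| * 'cu_id', splitting the access into cache-line-sized chunks. |
| */ |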
| void |
| Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, |
| MemCmd cmd, bool suppress_func_errors) |
| { |
| uint8_t *data_buf = (uint8_t*)ptr; |
| |
| for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize()); |
| !gen.done(); gen.next()) { |
| |
| RequestPtr req = std::make_shared<Request>( |
| gen.addr(), gen.size(), 0, |
| cuList[0]->requestorId(), 0, 0, nullptr); |
| |
| doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id); |
| data_buf += gen.size(); |
| } |
| } |
| |
| void |
| Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id) |
| { |
| AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false); |
| } |
| |
| void |
| Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id, |
| bool suppress_func_errors) |
| { |
| AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, |
| suppress_func_errors); |
| } |
| |
| void |
| Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id) |
| { |
| AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false); |
| } |
| |
| void |
| Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id, |
| bool suppress_func_errors) |
| { |
| AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, |
| suppress_func_errors); |
| } |
| |
| /* |
| * Send a packet through the appropriate TLB functional port. |
| * If cu_id=n_cu, then this is the dispatcher's TLB. |
| * Otherwise it's the TLB of the cu_id compute unit. |
| */ |
| void |
| Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode) |
| { |
| // update senderState. Need to know the gpuTc and the TLB mode |
| pkt->senderState = |
| new TheISA::GpuTLB::TranslationState(mode, gpuTc, false); |
| |
| // Even when the perLaneTLB flag is turned on, it is fine to send all |
| // accesses through lane 0, since the lane number is not known here. |
| // This does not matter because these are functional accesses. |
| cuList[cu_id]->tlbPort[0].sendFunctional(pkt); |
| |
| /* safe_cast the senderState */ |
| TheISA::GpuTLB::TranslationState *sender_state = |
| safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); |
| |
| delete sender_state->tlbEntry; |
| delete pkt->senderState; |
| } |
| |
| /* |
| * Allow the shader to sample store latency stats from constituent devices. |
| */ |
| void |
| Shader::sampleStore(const Tick accessTime) |
| { |
| stats.storeLatencyDist.sample(accessTime); |
| stats.allLatencyDist.sample(accessTime); |
| } |
| |
| /* |
| * Allow the shader to sample load latency stats from constituent devices. |
| */ |
| void |
| Shader::sampleLoad(const Tick accessTime) |
| { |
| stats.loadLatencyDist.sample(accessTime); |
| stats.allLatencyDist.sample(accessTime); |
| } |
| |
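| /* |
| * Sample the per-hop latencies of a vector memory instruction. The vector |
| * holds one timestamp per InstMemoryHop: initiateAcc, coalescer issue, |
| * coalescer hit callback, GM pipe enqueue, and completion. |
| */ |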
| void |
| Shader::sampleInstRoundTrip(std::vector<Tick> roundTripTime) |
| { |
| // Only sample instructions that go all the way to main memory |
| if (roundTripTime.size() != InstMemoryHop::InstMemoryHopMax) { |
| return; |
| } |
| |
| Tick t1 = roundTripTime[0]; |
| Tick t2 = roundTripTime[1]; |
| Tick t3 = roundTripTime[2]; |
| Tick t4 = roundTripTime[3]; |
| Tick t5 = roundTripTime[4]; |
| |
| stats.initToCoalesceLatency.sample(t2-t1); |
| stats.rubyNetworkLatency.sample(t3-t2); |
| stats.gmEnqueueLatency.sample(t4-t3); |
| stats.gmToCompleteLatency.sample(t5-t4); |
| } |
| |
| void |
| Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>>& lineMap) |
| { |
| stats.coalsrLineAddresses.sample(lineMap.size()); |
| std::vector<Tick> netTimes; |
| |
| // For each cache block address generated by a vmem inst, calculate |
| // the round-trip time for that cache block. |
| for (auto& it : lineMap) { |
| const std::vector<Tick>& timeVec = it.second; |
| if (timeVec.size() == 2) { |
| netTimes.push_back(timeVec[1] - timeVec[0]); |
| } |
| } |
| |
| // Sort the cache block round-trip times so that the first |
| // distribution always measures the fastest and the last |
| // distribution always measures the slowest cache block. |
| std::sort(netTimes.begin(), netTimes.end()); |
| |
| // Sample the round-trip time of the Nth-fastest cache block into the |
| // Nth distribution. |
| int idx = 0; |
| for (auto& time : netTimes) { |
| stats.cacheBlockRoundTrip[idx].sample(time); |
| ++idx; |
| } |
| } |
| |
| void |
| Shader::notifyCuSleep() |
| { |
| // If all CUs attached to this shader are asleep, update |
| // shaderActiveTicks. |
| panic_if(_activeCus <= 0 || _activeCus > cuList.size(), |
| "Invalid activeCu size\n"); |
| _activeCus--; |
| if (!_activeCus) |
| stats.shaderActiveTicks += curTick() - _lastInactiveTick; |
| } |
| |
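| /* |
| * Latency distributions are binned in 10000-tick buckets from 0 to 1.6M |
| * ticks. One cacheBlockRoundTrip distribution is created per possible |
| * cache block touched by an instruction (up to the wavefront size). |
| */ |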
| Shader::ShaderStats::ShaderStats(statistics::Group *parent, int wf_size) |
| : statistics::Group(parent), |
| ADD_STAT(allLatencyDist, "delay distribution for all"), |
| ADD_STAT(loadLatencyDist, "delay distribution for loads"), |
| ADD_STAT(storeLatencyDist, "delay distribution for stores"), |
| ADD_STAT(initToCoalesceLatency, |
| "Ticks from vmem inst initiateAcc to coalescer issue"), |
| ADD_STAT(rubyNetworkLatency, |
| "Ticks from coalescer issue to coalescer hit callback"), |
| ADD_STAT(gmEnqueueLatency, |
| "Ticks from coalescer hit callback to GM pipe enqueue"), |
| ADD_STAT(gmToCompleteLatency, |
| "Ticks queued in GM pipes ordered response buffer"), |
| ADD_STAT(coalsrLineAddresses, |
| "Number of cache lines for coalesced request"), |
| ADD_STAT(shaderActiveTicks, |
| "Total ticks that any CU attached to this shader is active"), |
| ADD_STAT(vectorInstSrcOperand, |
| "vector instruction source operand distribution"), |
| ADD_STAT(vectorInstDstOperand, |
| "vector instruction destination operand distribution") |
| { |
| allLatencyDist |
| .init(0, 1600000, 10000) |
| .flags(statistics::pdf | statistics::oneline); |
| |
| loadLatencyDist |
| .init(0, 1600000, 10000) |
| .flags(statistics::pdf | statistics::oneline); |
| |
| storeLatencyDist |
| .init(0, 1600000, 10000) |
| .flags(statistics::pdf | statistics::oneline); |
| |
| initToCoalesceLatency |
| .init(0, 1600000, 10000) |
| .flags(statistics::pdf | statistics::oneline); |
| |
| rubyNetworkLatency |
| .init(0, 1600000, 10000) |
| .flags(statistics::pdf | statistics::oneline); |
| |
| gmEnqueueLatency |
| .init(0, 1600000, 10000) |
| .flags(statistics::pdf | statistics::oneline); |
| |
| gmToCompleteLatency |
| .init(0, 1600000, 10000) |
| .flags(statistics::pdf | statistics::oneline); |
| |
| coalsrLineAddresses |
| .init(0, 20, 1) |
| .flags(statistics::pdf | statistics::oneline); |
| |
| vectorInstSrcOperand.init(4); |
| vectorInstDstOperand.init(4); |
| |
| cacheBlockRoundTrip = new statistics::Distribution[wf_size]; |
| for (int idx = 0; idx < wf_size; ++idx) { |
| std::stringstream namestr; |
| ccprintf(namestr, "%s.cacheBlockRoundTrip%d", |
| static_cast<Shader*>(parent)->name(), idx); |
| cacheBlockRoundTrip[idx] |
| .init(0, 1600000, 10000) |
| .name(namestr.str()) |
| .desc("Coalsr-to-coalsr time for the Nth cache block in an inst") |
| .flags(statistics::pdf | statistics::oneline); |
| } |
| } |
| |
| } // namespace gem5 |