/*
* Copyright (c) 2017-2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @file
* HSAQueueEntry is the simulator's internal representation of an
* AQL queue entry (task). It encapsulates all of the relevant info
* about a task, which is gathered from various runtime data
* structures, including the AQL MQD, the AQL packet, and the code
* object.
*/
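/*
 * A minimal construction sketch. The values below are hypothetical
 * (host_pkt_addr, code_addr, and the sizes are made up, and
 * GfxVersion::gfx900 is assumed to be a valid enum value); in the
 * simulator the dispatcher builds these from a live AQL queue and a
 * real code object:
 *
 *     _hsa_dispatch_packet_t pkt = {};
 *     pkt.workgroup_size_x = 256;
 *     pkt.workgroup_size_y = pkt.workgroup_size_z = 1;
 *     pkt.grid_size_x = 1024;
 *     pkt.grid_size_y = pkt.grid_size_z = 1;
 *     AMDKernelCode akc = {};  // zeroed; register counts are derived
 *     HSAQueueEntry task("vec_add", 0, 0, &pkt, &akc,
 *                        host_pkt_addr, code_addr, GfxVersion::gfx900);
 */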
#ifndef __GPU_COMPUTE_HSA_QUEUE_ENTRY__
#define __GPU_COMPUTE_HSA_QUEUE_ENTRY__
#include <array>
#include <bitset>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <limits>
#include <string>
#include <vector>
#include "base/intmath.hh"
#include "base/types.hh"
#include "dev/hsa/hsa_packet.hh"
#include "dev/hsa/hsa_queue.hh"
#include "enums/GfxVersion.hh"
#include "gpu-compute/kernel_code.hh"
namespace gem5
{
class HSAQueueEntry
{
public:
HSAQueueEntry(std::string kernel_name, uint32_t queue_id,
int dispatch_id, void *disp_pkt, AMDKernelCode *akc,
Addr host_pkt_addr, Addr code_addr, GfxVersion gfx_version)
: kernName(kernel_name),
_wgSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_x,
(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_y,
(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_z}},
_gridSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_x,
(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_y,
(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_z}},
numVgprs(akc->workitem_vgpr_count),
numSgprs(akc->wavefront_sgpr_count),
_queueId(queue_id), _dispatchId(dispatch_id), dispPkt(disp_pkt),
_hostDispPktAddr(host_pkt_addr),
_completionSignal(((_hsa_dispatch_packet_t*)disp_pkt)
->completion_signal),
codeAddress(code_addr),
kernargAddress(((_hsa_dispatch_packet_t*)disp_pkt)->kernarg_address),
_outstandingInvs(-1), _outstandingWbs(0),
_ldsSize((int)((_hsa_dispatch_packet_t*)disp_pkt)->
group_segment_size),
_privMemPerItem((int)((_hsa_dispatch_packet_t*)disp_pkt)->
private_segment_size),
_contextId(0), _wgId{{ 0, 0, 0 }},
_numWgTotal(1), numWgArrivedAtBarrier(0), _numWgCompleted(0),
_globalWgId(0), dispatchComplete(false)
{
// Precompiled BLIT kernels actually violate the spec a bit
// and don't set many of the required akc fields. For these kernels,
// we need to extract register usage from the resource registers.
//
// We can't get an exact register count from the resource registers,
// because they encode a count rounded up to the allocation
// granularity, but we can get an upper bound on it.
// We determine the number of registers by solving for "vgprs_used"
// in the LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html
// #code-object-v3-kernel-descriptor
// Currently, the only supported gfx version in gem5 that computes
// this differently is gfx90a.
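//
// A worked example (illustrative value, not from a real kernel):
// granulated_workitem_vgpr_count = 3 gives an upper bound of
// (3 + 1) * 4 = 16 VGPRs on most gfx versions, but
// (3 + 1) * 8 = 32 VGPRs on gfx90a.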
if (!numVgprs) {
if (gfx_version == GfxVersion::gfx90a) {
numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8;
} else {
numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
}
}
if (!numSgprs || numSgprs ==
std::numeric_limits<decltype(akc->wavefront_sgpr_count)>::max()) {
// Supported major generation numbers: 0 (BLIT kernels), 8, and 9
uint16_t version = akc->amd_machine_version_major;
assert((version == 0) || (version == 8) || (version == 9));
// SGPR allocation granularities:
// - GFX8: 8
// - GFX9: 16
// Source: https://llvm.org/docs/AMDGPUUsage.html
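// A worked example (illustrative value, not from a real kernel):
// granulated_wavefront_sgpr_count = 11 gives (11 + 1) * 8 = 96
// SGPRs on GFX8 and ((11 + 1) * 16) / 2 = 96 SGPRs on GFX9.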
if ((version == 0) || (version == 8)) {
// We assume that BLIT kernels use the same granularity as GFX8
numSgprs = (akc->granulated_wavefront_sgpr_count + 1) * 8;
} else if (version == 9) {
numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16) / 2;
}
}
initialVgprState.reset();
initialSgprState.reset();
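// Compute the per-dimension and total workgroup counts, rounding
// up for partial workgroups. For example (illustrative sizes): a
// 1000-item grid dimension with 256-item workgroups yields
// divCeil(1000, 256) = 4 workgroups in that dimension.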
for (int i = 0; i < MAX_DIM; ++i) {
_numWg[i] = divCeil(_gridSize[i], _wgSize[i]);
_numWgTotal *= _numWg[i];
}
parseKernelCode(akc);
}
const std::string&
kernelName() const
{
return kernName;
}
int
wgSize(int dim) const
{
assert(dim < MAX_DIM);
return _wgSize[dim];
}
int
gridSize(int dim) const
{
assert(dim < MAX_DIM);
return _gridSize[dim];
}
int
numVectorRegs() const
{
return numVgprs;
}
int
numScalarRegs() const
{
return numSgprs;
}
uint32_t
queueId() const
{
return _queueId;
}
int
dispatchId() const
{
return _dispatchId;
}
void*
dispPktPtr()
{
return dispPkt;
}
Addr
hostDispPktAddr() const
{
return _hostDispPktAddr;
}
Addr
completionSignal() const
{
return _completionSignal;
}
Addr
codeAddr() const
{
return codeAddress;
}
Addr
kernargAddr() const
{
return kernargAddress;
}
int
ldsSize() const
{
return _ldsSize;
}
int
privMemPerItem() const
{
return _privMemPerItem;
}
int
contextId() const
{
return _contextId;
}
bool
dispComplete() const
{
return dispatchComplete;
}
int
wgId(int dim) const
{
assert(dim < MAX_DIM);
return _wgId[dim];
}
void
wgId(int dim, int val)
{
assert(dim < MAX_DIM);
_wgId[dim] = val;
}
int
globalWgId() const
{
return _globalWgId;
}
void
globalWgId(int val)
{
_globalWgId = val;
}
int
numWg(int dim) const
{
assert(dim < MAX_DIM);
return _numWg[dim];
}
void
notifyWgCompleted()
{
++_numWgCompleted;
}
int
numWgCompleted() const
{
return _numWgCompleted;
}
int
numWgTotal() const
{
return _numWgTotal;
}
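/**
 * Advance the workgroup iterator to the next workgroup in the
 * dispatch, odometer style: the X dimension increments first and,
 * on wrapping, carries into Y and then Z. Once the final workgroup
 * in the grid has been dispatched, dispatchComplete is set.
 */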
void
markWgDispatch()
{
++_wgId[0];
++_globalWgId;
if (wgId(0) * wgSize(0) >= gridSize(0)) {
_wgId[0] = 0;
++_wgId[1];
if (wgId(1) * wgSize(1) >= gridSize(1)) {
_wgId[1] = 0;
++_wgId[2];
if (wgId(2) * wgSize(2) >= gridSize(2)) {
dispatchComplete = true;
}
}
}
}
int
numWgAtBarrier() const
{
return numWgArrivedAtBarrier;
}
bool
vgprBitEnabled(int bit) const
{
return initialVgprState.test(bit);
}
bool
sgprBitEnabled(int bit) const
{
return initialSgprState.test(bit);
}
/**
* Host-side addr of the amd_queue_t on which
* this task was queued.
*/
Addr hostAMDQueueAddr;
/**
* Keep a copy of the AMD HSA queue because we
* need info from some of its fields to initialize
* register state.
*/
_amd_queue_t amdQueue;
// the maximum number of dimensions for a grid or workgroup
const static int MAX_DIM = 3;
/** Getter for the number of outstanding invalidates. */
int
outstandingInvs()
{
return _outstandingInvs;
}
/**
 * Whether the invalidate has started (or already finished). -1 is
 * the initial value of _outstandingInvs, indicating that the
 * invalidate has not started for the kernel.
 */
bool
isInvStarted()
{
return (_outstandingInvs != -1);
}
/**
 * Update the number of pending invalidate requests.
 *
 * val: negative to decrement, positive to increment
 */
void
updateOutstandingInvs(int val)
{
_outstandingInvs += val;
assert(_outstandingInvs >= 0);
}
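// Overall lifecycle of _outstandingInvs, per the state encoding
// documented at its declaration below: -1 (not started) -> 0 (about
// to start, transient) -> N > 0 (N requests pending) -> 0 (done).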
/**
* Forcefully change the state to be inv done.
*/
void
markInvDone()
{
_outstandingInvs = 0;
}
/**
* Is invalidate done?
*/
bool
isInvDone() const
{
assert(_outstandingInvs >= 0);
return (_outstandingInvs == 0);
}
int
outstandingWbs() const
{
return _outstandingWbs;
}
/**
* Update the number of pending writeback requests.
*
* val: negative to decrement, positive to increment
*/
void
updateOutstandingWbs(int val)
{
_outstandingWbs += val;
assert(_outstandingWbs >= 0);
}
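// Overall lifecycle of _outstandingWbs, per the state encoding
// documented at its declaration below: 0 (not started) -> N > 0
// (N writeback requests pending) -> 0 (flush done).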
private:
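/**
 * Fill initialSgprState/initialVgprState from the kernel code
 * object's enable bits. These bits are intended to be queried via
 * sgprBitEnabled()/vgprBitEnabled() when initializing wavefront
 * register state (e.g., to preload the kernarg segment pointer or
 * the work-item IDs).
 */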
void
parseKernelCode(AMDKernelCode *akc)
{
/** Set the enable bits for the initial SGPR state. */
initialSgprState.set(PrivateSegBuf,
akc->enable_sgpr_private_segment_buffer);
initialSgprState.set(DispatchPtr,
akc->enable_sgpr_dispatch_ptr);
initialSgprState.set(QueuePtr,
akc->enable_sgpr_queue_ptr);
initialSgprState.set(KernargSegPtr,
akc->enable_sgpr_kernarg_segment_ptr);
initialSgprState.set(DispatchId,
akc->enable_sgpr_dispatch_id);
initialSgprState.set(FlatScratchInit,
akc->enable_sgpr_flat_scratch_init);
initialSgprState.set(PrivateSegSize,
akc->enable_sgpr_private_segment_size);
initialSgprState.set(GridWorkgroupCountX,
akc->enable_sgpr_grid_workgroup_count_x);
initialSgprState.set(GridWorkgroupCountY,
akc->enable_sgpr_grid_workgroup_count_y);
initialSgprState.set(GridWorkgroupCountZ,
akc->enable_sgpr_grid_workgroup_count_z);
initialSgprState.set(WorkgroupIdX,
akc->enable_sgpr_workgroup_id_x);
initialSgprState.set(WorkgroupIdY,
akc->enable_sgpr_workgroup_id_y);
initialSgprState.set(WorkgroupIdZ,
akc->enable_sgpr_workgroup_id_z);
initialSgprState.set(WorkgroupInfo,
akc->enable_sgpr_workgroup_info);
initialSgprState.set(PrivSegWaveByteOffset,
akc->enable_sgpr_private_segment_wave_byte_offset);
/**
 * Set the enable bits for the initial VGPR state. The work-item
 * ID in the X dimension is always initialized.
 */
initialVgprState.set(WorkitemIdX, true);
initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id > 0);
initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id > 1);
}
// name of the kernel associated with the AQL entry
std::string kernName;
// workgroup size (3 dimensions)
std::array<int, MAX_DIM> _wgSize;
// grid size (3 dimensions)
std::array<int, MAX_DIM> _gridSize;
// total number of VGPRs per work-item
int numVgprs;
// total number of SGPRs per wavefront
int numSgprs;
// id of AQL queue in which this entry is placed
uint32_t _queueId;
int _dispatchId;
// raw AQL packet pointer
void *dispPkt;
// host-side addr of the dispatch packet
Addr _hostDispPktAddr;
// address of this kernel's completion signal
Addr _completionSignal;
// base address of the raw machine code
Addr codeAddress;
// base address of the kernel args
Addr kernargAddress;
/**
 * Number of outstanding invalidates for the kernel.
 * Values:
 * -1: initial value; the invalidate has not started for the kernel
 * 0: reached either from -1 (invalidate about to start; a
 * transient state set in the same cycle) or from a positive
 * value (all invalidate requests finished, i.e., invalidate done)
 * >0: the number of pending invalidate requests
 */
int _outstandingInvs;
/**
 * Number of outstanding writebacks for the kernel.
 * Values:
 * 0: either the initial value (the flush has not started for the
 * kernel) or reached from a positive value (all writeback
 * requests finished, i.e., flush done)
 * >0: the number of pending writeback requests
 */
int _outstandingWbs;
int _ldsSize;
int _privMemPerItem;
int _contextId;
std::array<int, MAX_DIM> _wgId;
std::array<int, MAX_DIM> _numWg;
int _numWgTotal;
int numWgArrivedAtBarrier;
// The number of completed work groups
int _numWgCompleted;
int _globalWgId;
bool dispatchComplete;
std::bitset<NumVectorInitFields> initialVgprState;
std::bitset<NumScalarInitFields> initialSgprState;
};
} // namespace gem5
#endif // __GPU_COMPUTE_HSA_QUEUE_ENTRY__