/*
* Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt
*/
#ifndef __ARCH_HSAIL_INSTS_MEM_HH__
#define __ARCH_HSAIL_INSTS_MEM_HH__
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <memory>
#include <type_traits>
#include <vector>
#include "arch/hsail/insts/decl.hh"
#include "arch/hsail/insts/gpu_static_inst.hh"
#include "arch/hsail/operand.hh"
#include "gpu-compute/compute_unit.hh"
namespace HsailISA
{
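// MemInst is a small mixin shared by the ld/st/atomic instruction classes
// below: it records the size, in bytes, of the memory operand (derived
// from the BRIG memory type) and holds a pointer to the instruction's
// address operand.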
class MemInst
{
public:
MemInst() : size(0), addr_operand(nullptr) { }
MemInst(Enums::MemType m_type)
{
if (m_type == Enums::M_U64 ||
m_type == Enums::M_S64 ||
m_type == Enums::M_F64) {
size = 8;
} else if (m_type == Enums::M_U32 ||
m_type == Enums::M_S32 ||
m_type == Enums::M_F32) {
size = 4;
} else if (m_type == Enums::M_U16 ||
m_type == Enums::M_S16 ||
m_type == Enums::M_F16) {
size = 2;
} else {
size = 1;
}
addr_operand = nullptr;
}
void
init_addr(AddrOperandBase *_addr_operand)
{
addr_operand = _addr_operand;
}
private:
int size;
AddrOperandBase *addr_operand;
public:
int getMemOperandSize() { return size; }
AddrOperandBase *getAddressOperand() { return addr_operand; }
};
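// Base class for lda (load address) instructions. Operand 0 is the
// destination register and operand 1 is the address operand; lda computes
// an address rather than accessing memory, hence the ALU flag.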
template<typename DestOperandType, typename AddrOperandType>
class LdaInstBase : public HsailGPUStaticInst
{
public:
typename DestOperandType::DestOperand dest;
AddrOperandType addr;
LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
const char *_opcode)
: HsailGPUStaticInst(obj, _opcode)
{
using namespace Brig;
setFlag(ALU);
unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
dest.init(op_offs, obj);
op_offs = obj->getOperandPtr(ib->operands, 1);
addr.init(op_offs, obj);
}
int numSrcRegOperands() override
{ return(this->addr.isVectorRegister()); }
int numDstRegOperands() override
{ return dest.isVectorRegister(); }
bool isVectorRegister(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
return((operandIndex == 0) ? dest.isVectorRegister() :
this->addr.isVectorRegister());
}
bool isCondRegister(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
return((operandIndex == 0) ? dest.isCondRegister() :
this->addr.isCondRegister());
}
bool isScalarRegister(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
return((operandIndex == 0) ? dest.isScalarRegister() :
this->addr.isScalarRegister());
}
bool isSrcOperand(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if (operandIndex > 0)
return(this->addr.isVectorRegister());
return false;
}
bool isDstOperand(int operandIndex) override {
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
return(operandIndex == 0);
}
int getOperandSize(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
return((operandIndex == 0) ? dest.opSize() :
this->addr.opSize());
}
int
getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
return((operandIndex == 0) ? dest.regIndex() :
this->addr.regIndex());
}
int getNumOperands() override
{
if (this->addr.isVectorRegister())
return 2;
return 1;
}
};
template<typename DestDataType, typename AddrOperandType>
class LdaInst :
public LdaInstBase<typename DestDataType::OperandType, AddrOperandType>,
public MemInst
{
public:
void generateDisassembly() override;
LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
const char *_opcode)
: LdaInstBase<typename DestDataType::OperandType,
AddrOperandType>(ib, obj, _opcode)
{
init_addr(&this->addr);
}
void execute(GPUDynInstPtr gpuDynInst) override;
};
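// Decode helper for lda: the kind of the address operand (operand 1)
// selects the AddrOperandType -- no register, a single (32-bit) register,
// or a double (64-bit) register.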
template<typename DataType>
GPUStaticInst*
decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj)
{
unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj);
if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas");
} else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
// V2/V4 not allowed
switch (regDataType.regKind) {
case Brig::BRIG_REGISTER_KIND_SINGLE:
return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas");
case Brig::BRIG_REGISTER_KIND_DOUBLE:
return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas");
default:
fatal("Bad ldas register operand type %d\n", regDataType.type);
}
} else {
fatal("Bad ldas register operand kind %d\n", regDataType.kind);
}
}
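// Base class for ld instructions. The constructor decodes both the plain
// BRIG_OPCODE_LD encoding (BrigInstMem) and the atomic-load encoding
// (BrigInstAtomic), then maps the BRIG memory order, memory scope, and
// segment fields onto the corresponding instruction flags.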
template<typename MemOperandType, typename DestOperandType,
typename AddrOperandType>
class LdInstBase : public HsailGPUStaticInst
{
public:
Brig::BrigWidth8_t width;
typename DestOperandType::DestOperand dest;
AddrOperandType addr;
Brig::BrigSegment segment;
Brig::BrigMemoryOrder memoryOrder;
Brig::BrigMemoryScope memoryScope;
unsigned int equivClass;
LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
const char *_opcode)
: HsailGPUStaticInst(obj, _opcode)
{
using namespace Brig;
setFlag(MemoryRef);
setFlag(Load);
if (ib->opcode == BRIG_OPCODE_LD) {
const BrigInstMem *ldst = (const BrigInstMem*)ib;
segment = (BrigSegment)ldst->segment;
memoryOrder = BRIG_MEMORY_ORDER_NONE;
memoryScope = BRIG_MEMORY_SCOPE_NONE;
equivClass = ldst->equivClass;
width = ldst->width;
unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
dest.init(op_offs, obj);
op_offs = obj->getOperandPtr(ib->operands, 1);
addr.init(op_offs, obj);
} else {
const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
segment = (BrigSegment)at->segment;
memoryOrder = (BrigMemoryOrder)at->memoryOrder;
memoryScope = (BrigMemoryScope)at->memoryScope;
equivClass = 0;
width = BRIG_WIDTH_1;
unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
dest.init(op_offs, obj);
op_offs = obj->getOperandPtr(ib->operands,1);
addr.init(op_offs, obj);
}
switch (memoryOrder) {
case BRIG_MEMORY_ORDER_NONE:
setFlag(NoOrder);
break;
case BRIG_MEMORY_ORDER_RELAXED:
setFlag(RelaxedOrder);
break;
case BRIG_MEMORY_ORDER_SC_ACQUIRE:
setFlag(Acquire);
break;
case BRIG_MEMORY_ORDER_SC_RELEASE:
setFlag(Release);
break;
case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
setFlag(AcquireRelease);
break;
default:
fatal("LdInst has bad memory order type\n");
}
switch (memoryScope) {
case BRIG_MEMORY_SCOPE_NONE:
setFlag(NoScope);
break;
case BRIG_MEMORY_SCOPE_WORKITEM:
setFlag(WorkitemScope);
break;
case BRIG_MEMORY_SCOPE_WORKGROUP:
setFlag(WorkgroupScope);
break;
case BRIG_MEMORY_SCOPE_AGENT:
setFlag(DeviceScope);
break;
case BRIG_MEMORY_SCOPE_SYSTEM:
setFlag(SystemScope);
break;
default:
fatal("LdInst has bad memory scope type\n");
}
switch (segment) {
case BRIG_SEGMENT_GLOBAL:
setFlag(GlobalSegment);
break;
case BRIG_SEGMENT_GROUP:
setFlag(GroupSegment);
break;
case BRIG_SEGMENT_PRIVATE:
setFlag(PrivateSegment);
break;
case BRIG_SEGMENT_READONLY:
setFlag(ReadOnlySegment);
break;
case BRIG_SEGMENT_SPILL:
setFlag(SpillSegment);
break;
case BRIG_SEGMENT_FLAT:
setFlag(Flat);
break;
case BRIG_SEGMENT_KERNARG:
setFlag(KernArgSegment);
break;
case BRIG_SEGMENT_ARG:
setFlag(ArgSegment);
break;
default:
panic("Ld: segment %d not supported\n", segment);
}
}
int numSrcRegOperands() override
{ return(this->addr.isVectorRegister()); }
int numDstRegOperands() override { return dest.isVectorRegister(); }
int getNumOperands() override
{
if (this->addr.isVectorRegister())
return 2;
else
return 1;
}
bool isVectorRegister(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
return((operandIndex == 0) ? dest.isVectorRegister() :
this->addr.isVectorRegister());
}
bool isCondRegister(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
return((operandIndex == 0) ? dest.isCondRegister() :
this->addr.isCondRegister());
}
bool isScalarRegister(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
return((operandIndex == 0) ? dest.isScalarRegister() :
this->addr.isScalarRegister());
}
bool isSrcOperand(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if (operandIndex > 0)
return(this->addr.isVectorRegister());
return false;
}
bool isDstOperand(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
return(operandIndex == 0);
}
int getOperandSize(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
return((operandIndex == 0) ? dest.opSize() :
this->addr.opSize());
}
int
getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
return((operandIndex == 0) ? dest.regIndex() :
this->addr.regIndex());
}
};
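// Full ld instruction. On top of the base-class state it supports vector
// destinations (v2/v4 operand lists) via dest_vect and implements the
// timing-model hooks initiateAcc()/completeAcc().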
template<typename MemDataType, typename DestDataType,
typename AddrOperandType>
class LdInst :
public LdInstBase<typename MemDataType::CType,
typename DestDataType::OperandType, AddrOperandType>,
public MemInst
{
typename DestDataType::OperandType::DestOperand dest_vect[4];
uint16_t num_dest_operands;
void generateDisassembly() override;
public:
LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
const char *_opcode)
: LdInstBase<typename MemDataType::CType,
typename DestDataType::OperandType,
AddrOperandType>(ib, obj, _opcode),
MemInst(MemDataType::memType)
{
init_addr(&this->addr);
unsigned op_offs = obj->getOperandPtr(ib->operands,0);
const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
const Brig::BrigOperandOperandList *brigRegVecOp =
(const Brig::BrigOperandOperandList*)brigOp;
num_dest_operands =
*((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;
assert(num_dest_operands <= 4);
} else {
num_dest_operands = 1;
}
if (num_dest_operands > 1) {
assert(brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
for (int i = 0; i < num_dest_operands; ++i) {
dest_vect[i].init_from_vect(op_offs, obj, i);
}
}
}
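// Issue the memory requests for this load: one read per active lane per
// destination operand. Local (LDS) accesses complete in place; global
// accesses are sent out as read packets via sendRequest().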
void
initiateAcc(GPUDynInstPtr gpuDynInst) override
{
typedef typename MemDataType::CType c0;
gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
if (num_dest_operands > 1) {
for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
if (gpuDynInst->exec_mask[i])
gpuDynInst->statusVector.push_back(num_dest_operands);
else
gpuDynInst->statusVector.push_back(0);
}
for (int k = 0; k < num_dest_operands; ++k) {
c0 *d = &((c0*)gpuDynInst->d_data)
[k * gpuDynInst->computeUnit()->wfSize()];
for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
if (gpuDynInst->exec_mask[i]) {
Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
if (this->isLocalMem()) {
// load from shared memory
*d = gpuDynInst->wavefront()->ldsChunk->
read<c0>(vaddr);
} else {
RequestPtr req = std::make_shared<Request>(0,
vaddr, sizeof(c0), 0,
gpuDynInst->computeUnit()->masterId(),
0, gpuDynInst->wfDynId);
gpuDynInst->setRequestFlags(req);
PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
pkt->dataStatic(d);
if (gpuDynInst->computeUnit()->shader->
separate_acquire_release &&
gpuDynInst->isAcquire()) {
// if this load has acquire semantics,
// set the response continuation function
// to perform an Acquire request
gpuDynInst->execContinuation =
&GPUStaticInst::execLdAcq;
gpuDynInst->useContinuation = true;
} else {
// the request will be finished when
// the load completes
gpuDynInst->useContinuation = false;
}
// translation is performed in sendRequest()
gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
i, pkt);
}
}
++d;
}
}
gpuDynInst->updateStats();
}
void
completeAcc(GPUDynInstPtr gpuDynInst) override
{
typedef typename MemDataType::CType c1;
constexpr bool is_vt_32 = DestDataType::vgprType == VT_32;
/**
* This code essentially replaces the long if-else chain that was
* used in GlobalMemPipeline::exec() to infer the size
* (single/double) and type (floating point/integer) of the
* destination register. This is needed for load instructions
* because the loaded value and the destination type can be of
* different sizes, and we also need to know whether the value we
* are writing back is floating point and signed/unsigned, so we
* can properly cast the writeback value.
*/
typedef typename std::conditional<is_vt_32,
typename std::conditional<std::is_floating_point<c1>::value,
float, typename std::conditional<std::is_signed<c1>::value,
int32_t, uint32_t>::type>::type,
typename std::conditional<std::is_floating_point<c1>::value,
double, typename std::conditional<std::is_signed<c1>::value,
int64_t, uint64_t>::type>::type>::type c0;
Wavefront *w = gpuDynInst->wavefront();
std::vector<uint32_t> regVec;
// iterate over number of destination register operands since
// this is a load
for (int k = 0; k < num_dest_operands; ++k) {
assert((sizeof(c1) * num_dest_operands)
<= MAX_WIDTH_FOR_MEM_INST);
int dst = this->dest.regIndex() + k;
if (num_dest_operands > MAX_REGS_FOR_NON_VEC_MEM_INST)
dst = dest_vect[k].regIndex();
// virtual->physical VGPR mapping
int physVgpr = w->remap(dst, sizeof(c0), 1);
// save the physical VGPR index
regVec.push_back(physVgpr);
c1 *p1 =
&((c1*)gpuDynInst->d_data)[k * w->computeUnit->wfSize()];
for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
if (gpuDynInst->exec_mask[i]) {
DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
"$%s%d <- %d global ld done (src = wavefront "
"ld inst)\n", w->computeUnit->cu_id, w->simdId,
w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
dst, *p1);
// write the value into the physical VGPR. This is a
// purely functional operation. No timing is modeled.
w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
*p1, i);
}
++p1;
}
}
// Schedule the write operation of the load data on the VRF.
// This simply models the timing aspect of the VRF write operation.
// It does not modify the physical VGPR.
int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
sizeof(c0), gpuDynInst->time);
if (this->isGlobalMem()) {
gpuDynInst->computeUnit()->globalMemoryPipe
.incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
} else {
assert(this->isLocalMem());
gpuDynInst->computeUnit()->localMemoryPipe
.incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
}
}
private:
void
execLdAcq(GPUDynInstPtr gpuDynInst) override
{
// after the load has completed, if the load has acquire
// semantics, issue an acquire request.
if (!this->isLocalMem()) {
if (gpuDynInst->computeUnit()->shader->separate_acquire_release
&& gpuDynInst->isAcquire()) {
gpuDynInst->statusBitVector = VectorMask(1);
gpuDynInst->useContinuation = false;
// create request
RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
gpuDynInst->computeUnit()->masterId(),
0, gpuDynInst->wfDynId);
req->setFlags(Request::ACQUIRE);
gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
}
}
}
public:
bool isVectorRegister(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if ((num_dest_operands != getNumOperands()) &&
(operandIndex == (getNumOperands()-1)))
return(this->addr.isVectorRegister());
if (num_dest_operands > 1) {
return dest_vect[operandIndex].isVectorRegister();
}
else if (num_dest_operands == 1) {
return LdInstBase<typename MemDataType::CType,
typename DestDataType::OperandType,
AddrOperandType>::dest.isVectorRegister();
}
return false;
}
bool isCondRegister(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if ((num_dest_operands != getNumOperands()) &&
(operandIndex == (getNumOperands()-1)))
return(this->addr.isCondRegister());
if (num_dest_operands > 1)
return dest_vect[operandIndex].isCondRegister();
else if (num_dest_operands == 1)
return LdInstBase<typename MemDataType::CType,
typename DestDataType::OperandType,
AddrOperandType>::dest.isCondRegister();
return false;
}
bool isScalarRegister(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if ((num_dest_operands != getNumOperands()) &&
(operandIndex == (getNumOperands()-1)))
return(this->addr.isScalarRegister());
if (num_dest_operands > 1)
return dest_vect[operandIndex].isScalarRegister();
else if (num_dest_operands == 1)
return LdInstBase<typename MemDataType::CType,
typename DestDataType::OperandType,
AddrOperandType>::dest.isScalarRegister();
return false;
}
bool isSrcOperand(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if ((num_dest_operands != getNumOperands()) &&
(operandIndex == (getNumOperands()-1)))
return(this->addr.isVectorRegister());
return false;
}
bool isDstOperand(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if ((num_dest_operands != getNumOperands()) &&
(operandIndex == (getNumOperands()-1)))
return false;
return true;
}
int getOperandSize(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if ((num_dest_operands != getNumOperands()) &&
(operandIndex == (getNumOperands()-1)))
return(this->addr.opSize());
if (num_dest_operands > 1)
return(dest_vect[operandIndex].opSize());
else if (num_dest_operands == 1)
return(LdInstBase<typename MemDataType::CType,
typename DestDataType::OperandType,
AddrOperandType>::dest.opSize());
return 0;
}
int
getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if ((num_dest_operands != getNumOperands()) &&
(operandIndex == (getNumOperands()-1)))
return(this->addr.regIndex());
if (num_dest_operands > 1)
return(dest_vect[operandIndex].regIndex());
else if (num_dest_operands == 1)
return(LdInstBase<typename MemDataType::CType,
typename DestDataType::OperandType,
AddrOperandType>::dest.regIndex());
return -1;
}
int getNumOperands() override
{
if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
return(num_dest_operands+1);
else
return(num_dest_operands);
}
void execute(GPUDynInstPtr gpuDynInst) override;
};
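// Two-level decode for ld: decodeLd() selects the destination register
// data type from the BRIG destination operand, then decodeLd2() selects
// the address operand type from the address operand's register kind.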
template<typename MemDT, typename DestDT>
GPUStaticInst*
decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj)
{
unsigned op_offs = obj->getOperandPtr(ib->operands,1);
BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);
if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
return new LdInst<MemDT, DestDT, NoRegAddrOperand>(ib, obj, "ld");
} else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
tmp.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
switch (tmp.regKind) {
case Brig::BRIG_REGISTER_KIND_SINGLE:
return new LdInst<MemDT, DestDT,
SRegAddrOperand>(ib, obj, "ld");
case Brig::BRIG_REGISTER_KIND_DOUBLE:
return new LdInst<MemDT, DestDT,
DRegAddrOperand>(ib, obj, "ld");
default:
fatal("Bad ld register operand type %d\n", tmp.regKind);
}
} else {
fatal("Bad ld register operand kind %d\n", tmp.kind);
}
}
template<typename MemDT>
GPUStaticInst*
decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj)
{
unsigned op_offs = obj->getOperandPtr(ib->operands,0);
BrigRegOperandInfo dest = findRegDataType(op_offs, obj);
assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
switch(dest.regKind) {
case Brig::BRIG_REGISTER_KIND_SINGLE:
switch (ib->type) {
case Brig::BRIG_TYPE_B8:
case Brig::BRIG_TYPE_B16:
case Brig::BRIG_TYPE_B32:
return decodeLd2<MemDT, B32>(ib, obj);
case Brig::BRIG_TYPE_U8:
case Brig::BRIG_TYPE_U16:
case Brig::BRIG_TYPE_U32:
return decodeLd2<MemDT, U32>(ib, obj);
case Brig::BRIG_TYPE_S8:
case Brig::BRIG_TYPE_S16:
case Brig::BRIG_TYPE_S32:
return decodeLd2<MemDT, S32>(ib, obj);
case Brig::BRIG_TYPE_F16:
case Brig::BRIG_TYPE_F32:
return decodeLd2<MemDT, U32>(ib, obj);
default:
fatal("Bad ld register operand type %d, %d\n",
dest.regKind, ib->type);
};
case Brig::BRIG_REGISTER_KIND_DOUBLE:
switch (ib->type) {
case Brig::BRIG_TYPE_B64:
return decodeLd2<MemDT, B64>(ib, obj);
case Brig::BRIG_TYPE_U64:
return decodeLd2<MemDT, U64>(ib, obj);
case Brig::BRIG_TYPE_S64:
return decodeLd2<MemDT, S64>(ib, obj);
case Brig::BRIG_TYPE_F64:
return decodeLd2<MemDT, U64>(ib, obj);
default:
fatal("Bad ld register operand type %d, %d\n",
dest.regKind, ib->type);
};
default:
fatal("Bad ld register operand type %d, %d\n", dest.regKind,
ib->type);
}
}
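// Base class for st instructions. As with loads, the constructor handles
// both the plain BRIG_OPCODE_ST encoding and the atomic-store encoding
// (where the address operand comes before the source operand), and maps
// memory order/scope/segment onto instruction flags.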
template<typename MemDataType, typename SrcOperandType,
typename AddrOperandType>
class StInstBase : public HsailGPUStaticInst
{
public:
typename SrcOperandType::SrcOperand src;
AddrOperandType addr;
Brig::BrigSegment segment;
Brig::BrigMemoryScope memoryScope;
Brig::BrigMemoryOrder memoryOrder;
unsigned int equivClass;
StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
const char *_opcode)
: HsailGPUStaticInst(obj, _opcode)
{
using namespace Brig;
setFlag(MemoryRef);
setFlag(Store);
if (ib->opcode == BRIG_OPCODE_ST) {
const BrigInstMem *ldst = (const BrigInstMem*)ib;
segment = (BrigSegment)ldst->segment;
memoryOrder = BRIG_MEMORY_ORDER_NONE;
memoryScope = BRIG_MEMORY_SCOPE_NONE;
equivClass = ldst->equivClass;
unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
const BrigOperand *baseOp = obj->getOperand(op_offs);
if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) ||
(baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) {
src.init(op_offs, obj);
}
op_offs = obj->getOperandPtr(ib->operands, 1);
addr.init(op_offs, obj);
} else {
const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
segment = (BrigSegment)at->segment;
memoryScope = (BrigMemoryScope)at->memoryScope;
memoryOrder = (BrigMemoryOrder)at->memoryOrder;
equivClass = 0;
unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
addr.init(op_offs, obj);
op_offs = obj->getOperandPtr(ib->operands, 1);
src.init(op_offs, obj);
}
switch (memoryOrder) {
case BRIG_MEMORY_ORDER_NONE:
setFlag(NoOrder);
break;
case BRIG_MEMORY_ORDER_RELAXED:
setFlag(RelaxedOrder);
break;
case BRIG_MEMORY_ORDER_SC_ACQUIRE:
setFlag(Acquire);
break;
case BRIG_MEMORY_ORDER_SC_RELEASE:
setFlag(Release);
break;
case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
setFlag(AcquireRelease);
break;
default:
fatal("StInst has bad memory order type\n");
}
switch (memoryScope) {
case BRIG_MEMORY_SCOPE_NONE:
setFlag(NoScope);
break;
case BRIG_MEMORY_SCOPE_WORKITEM:
setFlag(WorkitemScope);
break;
case BRIG_MEMORY_SCOPE_WORKGROUP:
setFlag(WorkgroupScope);
break;
case BRIG_MEMORY_SCOPE_AGENT:
setFlag(DeviceScope);
break;
case BRIG_MEMORY_SCOPE_SYSTEM:
setFlag(SystemScope);
break;
default:
fatal("StInst has bad memory scope type\n");
}
switch (segment) {
case BRIG_SEGMENT_GLOBAL:
setFlag(GlobalSegment);
break;
case BRIG_SEGMENT_GROUP:
setFlag(GroupSegment);
break;
case BRIG_SEGMENT_PRIVATE:
setFlag(PrivateSegment);
break;
case BRIG_SEGMENT_READONLY:
setFlag(ReadOnlySegment);
break;
case BRIG_SEGMENT_SPILL:
setFlag(SpillSegment);
break;
case BRIG_SEGMENT_FLAT:
setFlag(Flat);
break;
case BRIG_SEGMENT_ARG:
setFlag(ArgSegment);
break;
default:
panic("St: segment %d not supported\n", segment);
}
}
int numDstRegOperands() override { return 0; }
int numSrcRegOperands() override
{
return src.isVectorRegister() + this->addr.isVectorRegister();
}
int getNumOperands() override
{
if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
return 2;
else
return 1;
}
bool isVectorRegister(int operandIndex) override
{
assert(operandIndex >= 0 && operandIndex < getNumOperands());
return !operandIndex ? src.isVectorRegister() :
this->addr.isVectorRegister();
}
bool isCondRegister(int operandIndex) override
{
assert(operandIndex >= 0 && operandIndex < getNumOperands());
return !operandIndex ? src.isCondRegister() :
this->addr.isCondRegister();
}
bool isScalarRegister(int operandIndex) override
{
assert(operandIndex >= 0 && operandIndex < getNumOperands());
return !operandIndex ? src.isScalarRegister() :
this->addr.isScalarRegister();
}
bool isSrcOperand(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
return true;
}
bool isDstOperand(int operandIndex) override { return false; }
int getOperandSize(int operandIndex) override
{
assert(operandIndex >= 0 && operandIndex < getNumOperands());
return !operandIndex ? src.opSize() : this->addr.opSize();
}
int
getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
{
assert(operandIndex >= 0 && operandIndex < getNumOperands());
return !operandIndex ? src.regIndex() : this->addr.regIndex();
}
};
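// Full st instruction, adding support for vector sources (v2/v4 operand
// lists) via src_vect and for immediate (constant-bytes) source operands.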
template<typename MemDataType, typename SrcDataType,
typename AddrOperandType>
class StInst :
public StInstBase<MemDataType, typename SrcDataType::OperandType,
AddrOperandType>,
public MemInst
{
public:
typename SrcDataType::OperandType::SrcOperand src_vect[4];
uint16_t num_src_operands;
void generateDisassembly() override;
StInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
const char *_opcode, int srcIdx)
: StInstBase<MemDataType, typename SrcDataType::OperandType,
AddrOperandType>(ib, obj, _opcode),
MemInst(SrcDataType::memType)
{
init_addr(&this->addr);
BrigRegOperandInfo rinfo;
unsigned op_offs = obj->getOperandPtr(ib->operands,srcIdx);
const Brig::BrigOperand *baseOp = obj->getOperand(op_offs);
if (baseOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
const Brig::BrigOperandConstantBytes *op =
(Brig::BrigOperandConstantBytes*)baseOp;
rinfo = BrigRegOperandInfo((Brig::BrigKind16_t)op->base.kind,
Brig::BRIG_TYPE_NONE);
} else {
rinfo = findRegDataType(op_offs, obj);
}
if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
const Brig::BrigOperandOperandList *brigRegVecOp =
(const Brig::BrigOperandOperandList*)baseOp;
num_src_operands =
*((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;
assert(num_src_operands <= 4);
} else {
num_src_operands = 1;
}
if (num_src_operands > 1) {
assert(baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
for (int i = 0; i < num_src_operands; ++i) {
src_vect[i].init_from_vect(op_offs, obj, i);
}
}
}
void
initiateAcc(GPUDynInstPtr gpuDynInst) override
{
// before performing a store, check if this store has
// release semantics, and if so issue a release first
if (!this->isLocalMem()) {
if (gpuDynInst->computeUnit()->shader->separate_acquire_release
&& gpuDynInst->isRelease()) {
gpuDynInst->statusBitVector = VectorMask(1);
gpuDynInst->execContinuation = &GPUStaticInst::execSt;
gpuDynInst->useContinuation = true;
// create request
RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
gpuDynInst->computeUnit()->masterId(),
0, gpuDynInst->wfDynId);
req->setFlags(Request::RELEASE);
gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
return;
}
}
// if there is no release semantic, perform stores immediately
execSt(gpuDynInst);
}
// stores don't write anything back, so there is nothing
// to do here. we only override this method to avoid the
// fatal in the base class implementation
void completeAcc(GPUDynInstPtr gpuDynInst) override { }
private:
// execSt may be called through a continuation
// if the store had release semantics. see comment for
// execContinuation in gpu_dyn_inst.hh
void
execSt(GPUDynInstPtr gpuDynInst) override
{
typedef typename MemDataType::CType c0;
gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
if (num_src_operands > 1) {
for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
if (gpuDynInst->exec_mask[i])
gpuDynInst->statusVector.push_back(num_src_operands);
else
gpuDynInst->statusVector.push_back(0);
}
for (int k = 0; k < num_src_operands; ++k) {
c0 *d = &((c0*)gpuDynInst->d_data)
[k * gpuDynInst->computeUnit()->wfSize()];
for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
if (gpuDynInst->exec_mask[i]) {
Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
if (this->isLocalMem()) {
// store to shared memory
gpuDynInst->wavefront()->ldsChunk->write<c0>(vaddr,
*d);
} else {
RequestPtr req = std::make_shared<Request>(
0, vaddr, sizeof(c0), 0,
gpuDynInst->computeUnit()->masterId(),
0, gpuDynInst->wfDynId);
gpuDynInst->setRequestFlags(req);
PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
pkt->dataStatic<c0>(d);
// translation is performed in sendRequest()
// the request will be finished when the store completes
gpuDynInst->useContinuation = false;
gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
i, pkt);
}
}
++d;
}
}
gpuDynInst->updateStats();
}
public:
bool isVectorRegister(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if (operandIndex == num_src_operands)
return this->addr.isVectorRegister();
if (num_src_operands > 1)
return src_vect[operandIndex].isVectorRegister();
else if (num_src_operands == 1)
return StInstBase<MemDataType,
typename SrcDataType::OperandType,
AddrOperandType>::src.isVectorRegister();
return false;
}
bool isCondRegister(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if (operandIndex == num_src_operands)
return this->addr.isCondRegister();
if (num_src_operands > 1)
return src_vect[operandIndex].isCondRegister();
else if (num_src_operands == 1)
return StInstBase<MemDataType,
typename SrcDataType::OperandType,
AddrOperandType>::src.isCondRegister();
return false;
}
bool isScalarRegister(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if (operandIndex == num_src_operands)
return this->addr.isScalarRegister();
if (num_src_operands > 1)
return src_vect[operandIndex].isScalarRegister();
else if (num_src_operands == 1)
return StInstBase<MemDataType,
typename SrcDataType::OperandType,
AddrOperandType>::src.isScalarRegister();
return false;
}
bool isSrcOperand(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
return true;
}
bool isDstOperand(int operandIndex) override { return false; }
int getOperandSize(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if (operandIndex == num_src_operands)
return this->addr.opSize();
if (num_src_operands > 1)
return src_vect[operandIndex].opSize();
else if (num_src_operands == 1)
return StInstBase<MemDataType,
typename SrcDataType::OperandType,
AddrOperandType>::src.opSize();
return 0;
}
int
getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if (operandIndex == num_src_operands)
return this->addr.regIndex();
if (num_src_operands > 1)
return src_vect[operandIndex].regIndex();
else if (num_src_operands == 1)
return StInstBase<MemDataType,
typename SrcDataType::OperandType,
AddrOperandType>::src.regIndex();
return -1;
}
int getNumOperands() override
{
if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
return num_src_operands + 1;
else
return num_src_operands;
}
void execute(GPUDynInstPtr gpuDynInst) override;
};
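// Decode helper for st. For the atomic encodings the address operand
// precedes the source operand, so srcIdx/destIdx are swapped relative to
// a plain st.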
template<typename DataType, typename SrcDataType>
GPUStaticInst*
decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj)
{
int srcIdx = 0;
int destIdx = 1;
if (ib->opcode == Brig::BRIG_OPCODE_ATOMIC ||
ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) {
srcIdx = 1;
destIdx = 0;
}
unsigned op_offs = obj->getOperandPtr(ib->operands,destIdx);
BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);
if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
return new StInst<DataType, SrcDataType,
NoRegAddrOperand>(ib, obj, "st", srcIdx);
} else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
// V2/V4 not allowed
switch (tmp.regKind) {
case Brig::BRIG_REGISTER_KIND_SINGLE:
return new StInst<DataType, SrcDataType,
SRegAddrOperand>(ib, obj, "st", srcIdx);
case Brig::BRIG_REGISTER_KIND_DOUBLE:
return new StInst<DataType, SrcDataType,
DRegAddrOperand>(ib, obj, "st", srcIdx);
default:
fatal("Bad st register operand type %d\n", tmp.type);
}
} else {
fatal("Bad st register operand kind %d\n", tmp.kind);
}
}
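// Base class for atomic instructions, parameterized on the number of
// source operands (two for CAS, one otherwise) and on whether the
// operation returns the old value (atomic vs. atomicnoret). The returning
// form carries an extra leading destination operand.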
template<typename OperandType, typename AddrOperandType, int NumSrcOperands,
bool HasDst>
class AtomicInstBase : public HsailGPUStaticInst
{
public:
typename OperandType::DestOperand dest;
typename OperandType::SrcOperand src[NumSrcOperands];
AddrOperandType addr;
Brig::BrigSegment segment;
Brig::BrigMemoryOrder memoryOrder;
Brig::BrigAtomicOperation atomicOperation;
Brig::BrigMemoryScope memoryScope;
Brig::BrigOpcode opcode;
AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
const char *_opcode)
: HsailGPUStaticInst(obj, _opcode)
{
using namespace Brig;
const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
segment = (BrigSegment)at->segment;
memoryScope = (BrigMemoryScope)at->memoryScope;
memoryOrder = (BrigMemoryOrder)at->memoryOrder;
atomicOperation = (BrigAtomicOperation)at->atomicOperation;
opcode = (BrigOpcode)ib->opcode;
assert(opcode == Brig::BRIG_OPCODE_ATOMICNORET ||
opcode == Brig::BRIG_OPCODE_ATOMIC);
setFlag(MemoryRef);
if (opcode == Brig::BRIG_OPCODE_ATOMIC) {
setFlag(AtomicReturn);
} else {
setFlag(AtomicNoReturn);
}
switch (memoryOrder) {
case BRIG_MEMORY_ORDER_NONE:
setFlag(NoOrder);
break;
case BRIG_MEMORY_ORDER_RELAXED:
setFlag(RelaxedOrder);
break;
case BRIG_MEMORY_ORDER_SC_ACQUIRE:
setFlag(Acquire);
break;
case BRIG_MEMORY_ORDER_SC_RELEASE:
setFlag(Release);
break;
case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
setFlag(AcquireRelease);
break;
default:
fatal("AtomicInst has bad memory order type\n");
}
switch (memoryScope) {
case BRIG_MEMORY_SCOPE_NONE:
setFlag(NoScope);
break;
case BRIG_MEMORY_SCOPE_WORKITEM:
setFlag(WorkitemScope);
break;
case BRIG_MEMORY_SCOPE_WORKGROUP:
setFlag(WorkgroupScope);
break;
case BRIG_MEMORY_SCOPE_AGENT:
setFlag(DeviceScope);
break;
case BRIG_MEMORY_SCOPE_SYSTEM:
setFlag(SystemScope);
break;
default:
fatal("AtomicInst has bad memory scope type\n");
}
switch (atomicOperation) {
case Brig::BRIG_ATOMIC_AND:
setFlag(AtomicAnd);
break;
case Brig::BRIG_ATOMIC_OR:
setFlag(AtomicOr);
break;
case Brig::BRIG_ATOMIC_XOR:
setFlag(AtomicXor);
break;
case Brig::BRIG_ATOMIC_CAS:
setFlag(AtomicCAS);
break;
case Brig::BRIG_ATOMIC_EXCH:
setFlag(AtomicExch);
break;
case Brig::BRIG_ATOMIC_ADD:
setFlag(AtomicAdd);
break;
case Brig::BRIG_ATOMIC_WRAPINC:
setFlag(AtomicInc);
break;
case Brig::BRIG_ATOMIC_WRAPDEC:
setFlag(AtomicDec);
break;
case Brig::BRIG_ATOMIC_MIN:
setFlag(AtomicMin);
break;
case Brig::BRIG_ATOMIC_MAX:
setFlag(AtomicMax);
break;
case Brig::BRIG_ATOMIC_SUB:
setFlag(AtomicSub);
break;
default:
fatal("Bad BrigAtomicOperation code %d\n", atomicOperation);
}
switch (segment) {
case BRIG_SEGMENT_GLOBAL:
setFlag(GlobalSegment);
break;
case BRIG_SEGMENT_GROUP:
setFlag(GroupSegment);
break;
case BRIG_SEGMENT_FLAT:
setFlag(Flat);
break;
default:
panic("Atomic: segment %d not supported\n", segment);
}
if (HasDst) {
unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
dest.init(op_offs, obj);
op_offs = obj->getOperandPtr(ib->operands, 1);
addr.init(op_offs, obj);
for (int i = 0; i < NumSrcOperands; ++i) {
op_offs = obj->getOperandPtr(ib->operands, i + 2);
src[i].init(op_offs, obj);
}
} else {
unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
addr.init(op_offs, obj);
for (int i = 0; i < NumSrcOperands; ++i) {
op_offs = obj->getOperandPtr(ib->operands, i + 1);
src[i].init(op_offs, obj);
}
}
}
int numSrcRegOperands() override
{
int operands = 0;
for (int i = 0; i < NumSrcOperands; i++) {
if (src[i].isVectorRegister()) {
operands++;
}
}
if (addr.isVectorRegister())
operands++;
return operands;
}
int numDstRegOperands() override { return dest.isVectorRegister(); }
int getNumOperands() override
{
if (addr.isVectorRegister())
return(NumSrcOperands + 2);
return(NumSrcOperands + 1);
}
bool isVectorRegister(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if (operandIndex < NumSrcOperands)
return src[operandIndex].isVectorRegister();
else if (operandIndex == NumSrcOperands)
return(addr.isVectorRegister());
else
return dest.isVectorRegister();
}
bool isCondRegister(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if (operandIndex < NumSrcOperands)
return src[operandIndex].isCondRegister();
else if (operandIndex == NumSrcOperands)
return(addr.isCondRegister());
else
return dest.isCondRegister();
}
bool isScalarRegister(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if (operandIndex < NumSrcOperands)
return src[operandIndex].isScalarRegister();
else if (operandIndex == NumSrcOperands)
return(addr.isScalarRegister());
else
return dest.isScalarRegister();
}
bool isSrcOperand(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if (operandIndex < NumSrcOperands)
return true;
else if (operandIndex == NumSrcOperands)
return(addr.isVectorRegister());
else
return false;
}
bool isDstOperand(int operandIndex) override
{
if (operandIndex <= NumSrcOperands)
return false;
else
return true;
}
int getOperandSize(int operandIndex) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if (operandIndex < NumSrcOperands)
return(src[operandIndex].opSize());
else if (operandIndex == NumSrcOperands)
return(addr.opSize());
else
return(dest.opSize());
}
int
getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
{
assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
if (operandIndex < NumSrcOperands)
return(src[operandIndex].regIndex());
else if (operandIndex == NumSrcOperands)
return(addr.regIndex());
else
return(dest.regIndex());
}
};
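// Full atomic instruction: initiateAcc() issues an optional release fence
// before the RMW, execAtomic() performs the RMW itself, and completeAcc()
// writes the old value back to the VRF for the returning form.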
template<typename MemDataType, typename AddrOperandType, int NumSrcOperands,
bool HasDst>
class AtomicInst :
public AtomicInstBase<typename MemDataType::OperandType,
AddrOperandType, NumSrcOperands, HasDst>,
public MemInst
{
public:
void generateDisassembly() override;
AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
const char *_opcode)
: AtomicInstBase<typename MemDataType::OperandType, AddrOperandType,
NumSrcOperands, HasDst>
(ib, obj, _opcode),
MemInst(MemDataType::memType)
{
init_addr(&this->addr);
}
void
initiateAcc(GPUDynInstPtr gpuDynInst) override
{
// before doing the RMW, check if this atomic has
// release semantics, and if so issue a release first
if (!this->isLocalMem()) {
if (gpuDynInst->computeUnit()->shader->separate_acquire_release
&& (gpuDynInst->isRelease()
|| gpuDynInst->isAcquireRelease())) {
gpuDynInst->statusBitVector = VectorMask(1);
gpuDynInst->execContinuation = &GPUStaticInst::execAtomic;
gpuDynInst->useContinuation = true;
// create request
RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
gpuDynInst->computeUnit()->masterId(),
0, gpuDynInst->wfDynId);
req->setFlags(Request::RELEASE);
gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
return;
}
}
// if there is no release semantic, execute the RMW immediately
execAtomic(gpuDynInst);
}
void
completeAcc(GPUDynInstPtr gpuDynInst) override
{
// if this is not an atomic return op, then we
// have nothing more to do.
if (this->isAtomicRet()) {
// the size of the src operands and the
// memory being operated on must match
// for HSAIL atomics - this assumption may
// not apply to all ISAs
typedef typename MemDataType::CType CType;
Wavefront *w = gpuDynInst->wavefront();
int dst = this->dest.regIndex();
std::vector<uint32_t> regVec;
// virtual->physical VGPR mapping
int physVgpr = w->remap(dst, sizeof(CType), 1);
regVec.push_back(physVgpr);
CType *p1 = &((CType*)gpuDynInst->d_data)[0];
for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
if (gpuDynInst->exec_mask[i]) {
DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
"$%s%d <- %d global ld done (src = wavefront "
"ld inst)\n", w->computeUnit->cu_id, w->simdId,
w->wfSlotId, i, sizeof(CType) == 4 ? "s" : "d",
dst, *p1);
// write the value into the physical VGPR. This is a
// purely functional operation. No timing is modeled.
w->computeUnit->vrf[w->simdId]->write<CType>(physVgpr, *p1, i);
}
++p1;
}
// Schedule the write operation of the load data on the VRF.
// This simply models the timing aspect of the VRF write operation.
// It does not modify the physical VGPR.
int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
sizeof(CType), gpuDynInst->time);
if (this->isGlobalMem()) {
gpuDynInst->computeUnit()->globalMemoryPipe
.incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
} else {
assert(this->isLocalMem());
gpuDynInst->computeUnit()->localMemoryPipe
.incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
}
}
}
void execute(GPUDynInstPtr gpuDynInst) override;
private:
// execAtomic may be called through a continuation
// if the RMW had release semantics. see comment for
// execContinuation in gpu_dyn_inst.hh
void
execAtomic(GPUDynInstPtr gpuDynInst) override
{
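// d receives the old memory value; e is the first source operand (the
// compare value for CAS) and f is the second source operand (the swap
// value for CAS).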
gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
typedef typename MemDataType::CType c0;
c0 *d = &((c0*) gpuDynInst->d_data)[0];
c0 *e = &((c0*) gpuDynInst->a_data)[0];
c0 *f = &((c0*) gpuDynInst->x_data)[0];
for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
if (gpuDynInst->exec_mask[i]) {
Addr vaddr = gpuDynInst->addr[i];
if (this->isLocalMem()) {
Wavefront *wavefront = gpuDynInst->wavefront();
*d = wavefront->ldsChunk->read<c0>(vaddr);
if (this->isAtomicAdd()) {
wavefront->ldsChunk->write<c0>(vaddr,
wavefront->ldsChunk->read<c0>(vaddr) + (*e));
} else if (this->isAtomicSub()) {
wavefront->ldsChunk->write<c0>(vaddr,
wavefront->ldsChunk->read<c0>(vaddr) - (*e));
} else if (this->isAtomicMax()) {
wavefront->ldsChunk->write<c0>(vaddr,
std::max(wavefront->ldsChunk->read<c0>(vaddr),
(*e)));
} else if (this->isAtomicMin()) {
wavefront->ldsChunk->write<c0>(vaddr,
std::min(wavefront->ldsChunk->read<c0>(vaddr),
(*e)));
} else if (this->isAtomicAnd()) {
wavefront->ldsChunk->write<c0>(vaddr,
wavefront->ldsChunk->read<c0>(vaddr) & (*e));
} else if (this->isAtomicOr()) {
wavefront->ldsChunk->write<c0>(vaddr,
wavefront->ldsChunk->read<c0>(vaddr) | (*e));
} else if (this->isAtomicXor()) {
wavefront->ldsChunk->write<c0>(vaddr,
wavefront->ldsChunk->read<c0>(vaddr) ^ (*e));
} else if (this->isAtomicInc()) {
wavefront->ldsChunk->write<c0>(vaddr,
wavefront->ldsChunk->read<c0>(vaddr) + 1);
} else if (this->isAtomicDec()) {
wavefront->ldsChunk->write<c0>(vaddr,
wavefront->ldsChunk->read<c0>(vaddr) - 1);
} else if (this->isAtomicExch()) {
wavefront->ldsChunk->write<c0>(vaddr, (*e));
} else if (this->isAtomicCAS()) {
wavefront->ldsChunk->write<c0>(vaddr,
(wavefront->ldsChunk->read<c0>(vaddr) == (*e)) ?
(*f) : wavefront->ldsChunk->read<c0>(vaddr));
} else {
fatal("Unrecognized or invalid HSAIL atomic op "
"type.\n");
}
} else {
RequestPtr req =
std::make_shared<Request>(0, vaddr, sizeof(c0), 0,
gpuDynInst->computeUnit()->masterId(),
0, gpuDynInst->wfDynId,
gpuDynInst->makeAtomicOpFunctor<c0>(e,
f));
gpuDynInst->setRequestFlags(req);
PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
pkt->dataStatic(d);
if (gpuDynInst->computeUnit()->shader->
separate_acquire_release &&
(gpuDynInst->isAcquire())) {
// if this atomic has acquire semantics,
// schedule the continuation to perform an
// acquire after the RMW completes
gpuDynInst->execContinuation =
&GPUStaticInst::execAtomicAcq;
gpuDynInst->useContinuation = true;
} else {
// the request will be finished when the RMW completes
gpuDynInst->useContinuation = false;
}
// translation is performed in sendRequest()
gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i,
pkt);
}
}
++d;
++e;
++f;
}
gpuDynInst->updateStats();
}
// execAtomicAcq will always be called through a continuation.
// see comment for execContinuation in gpu_dyn_inst.hh
void
execAtomicAcq(GPUDynInstPtr gpuDynInst) override
{
// after performing the RMW, check to see if this instruction
// has acquire semantics, and if so, issue an acquire
if (!this->isLocalMem()) {
if (gpuDynInst->computeUnit()->shader->separate_acquire_release
&& gpuDynInst->isAcquire()) {
gpuDynInst->statusBitVector = VectorMask(1);
// the request will be finished when
// the acquire completes
gpuDynInst->useContinuation = false;
// create request
RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
gpuDynInst->computeUnit()->masterId(),
0, gpuDynInst->wfDynId);
req->setFlags(Request::ACQUIRE);
gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
}
}
}
};
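// constructAtomic routes the atomic_ld/atomic_st encodings to the regular
// ld/st decoders and builds an AtomicInst (with or without a destination)
// for every other atomic operation.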
template<typename DataType, typename AddrOperandType, int NumSrcOperands>
GPUStaticInst*
constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
{
const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) {
return decodeLd<DataType>(ib, obj);
} else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) {
switch (ib->type) {
case Brig::BRIG_TYPE_B8:
return decodeSt<S8,S8>(ib, obj);
case Brig::BRIG_TYPE_B16:
return decodeSt<S16,S16>(ib, obj);
case Brig::BRIG_TYPE_B32:
return decodeSt<S32,S32>(ib, obj);
case Brig::BRIG_TYPE_B64:
return decodeSt<S64,S64>(ib, obj);
default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type);
}
} else {
if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET)
return new AtomicInst<DataType, AddrOperandType,
NumSrcOperands, false>(ib, obj, "atomicnoret");
else
return new AtomicInst<DataType, AddrOperandType,
NumSrcOperands, true>(ib, obj, "atomic");
}
}
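// decodeAtomicHelper selects the address operand type. For atomicnoret
// the address is operand 0; for the returning form it is operand 1,
// after the destination.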
template<typename DataType, int NumSrcOperands>
GPUStaticInst*
decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj)
{
unsigned addrIndex = (Brig::BrigOpcode)ib->opcode ==
Brig::BRIG_OPCODE_ATOMICNORET ? 0 : 1;
unsigned op_offs = obj->getOperandPtr(ib->operands,addrIndex);
BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);
if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
return constructAtomic<DataType, NoRegAddrOperand,
NumSrcOperands>(ib, obj);
} else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
// V2/V4 not allowed
switch (tmp.regKind) {
case Brig::BRIG_REGISTER_KIND_SINGLE:
return constructAtomic<DataType, SRegAddrOperand,
NumSrcOperands>(ib, obj);
case Brig::BRIG_REGISTER_KIND_DOUBLE:
return constructAtomic<DataType, DRegAddrOperand,
NumSrcOperands>(ib, obj);
default:
fatal("Bad atomic register operand type %d\n", tmp.type);
}
} else {
fatal("Bad atomic register operand kind %d\n", tmp.kind);
}
}
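// CAS takes two source operands (the compare and swap values); all other
// atomic operations take one.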
template<typename DataType>
GPUStaticInst*
decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
{
const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
return decodeAtomicHelper<DataType, 2>(ib, obj);
} else {
return decodeAtomicHelper<DataType, 1>(ib, obj);
}
}
template<typename DataType>
GPUStaticInst*
decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj)
{
const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
return decodeAtomicHelper<DataType, 2>(ib, obj);
} else {
return decodeAtomicHelper<DataType, 1>(ib, obj);
}
}
} // namespace HsailISA
#endif // __ARCH_HSAIL_INSTS_MEM_HH__