src/arch/amdgpu/gcn3/operand.hh - public/gem5 - Git at Google

 /*
  * Copyright (c) 2017-2021 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * 1. Redistributions of source code must retain the above copyright notice,
  * this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright notice,
  * this list of conditions and the following disclaimer in the documentation
  * and/or other materials provided with the distribution.
  *
  * 3. Neither the name of the copyright holder nor the names of its
  * contributors may be used to endorse or promote products derived from this
  * software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */

 #ifndef __ARCH_GCN3_OPERAND_HH__
 #define __ARCH_GCN3_OPERAND_HH__

 #include <array>

 #include "arch/amdgpu/gcn3/gpu_registers.hh"
 #include "arch/generic/vec_reg.hh"
 #include "gpu-compute/scalar_register_file.hh"
 #include "gpu-compute/vector_register_file.hh"
 #include "gpu-compute/wavefront.hh"

 namespace gem5
 {

 /**
  * classes that represent vector/scalar operands in GCN3 ISA. these classes
  * wrap the generic vector register type (i.e., src/arch/generic/vec_reg.hh)
  * and allow them to be manipulated in ways that are unique to GCN3 insts.
  */

 namespace Gcn3ISA
 {
     /**
      * trait that maps an operand data type to the floating-point type
      * used when materializing FP constants for it, so callers do not
      * have to inspect the dword count. the primary template selects
      * float; the specializations for ScalarRegF64 and ScalarRegU64
      * select double.
      */
     template<typename T>
     struct OpTraits
     {
         using FloatT = float;
     };

     template<>
     struct OpTraits<ScalarRegF64>
     {
         using FloatT = double;
     };

     template<>
     struct OpTraits<ScalarRegU64>
     {
         using FloatT = double;
     };

     /**
      * abstract base of the GCN3 operand classes. it stores the dynamic
      * instruction an operand belongs to together with the operand's op
      * selector, and declares the read()/write() interface that the
      * concrete operand types implement.
      */
     class Operand
     {
       public:
         Operand() = delete;

         /**
          * @param gpuDynInst instruction object that owns this operand;
          *        asserted to be non-null.
          * @param opIdx op selector value of this operand; asserted to be
          *        non-negative.
          */
         Operand(GPUDynInstPtr gpuDynInst, int opIdx)
             : _gpuDynInst(gpuDynInst), _opIdx(opIdx)
         {
             assert(_gpuDynInst);
             assert(_opIdx >= 0);
         }

         /**
          * this is a polymorphic base (it declares virtual functions), so
          * give it a virtual destructor to make destruction through an
          * Operand pointer/reference well defined.
          */
         virtual ~Operand() = default;

         /**
          * declaring the destructor must not change how operands are
          * copied, so the copy/move operations are explicitly defaulted.
          */
         Operand(const Operand&) = default;
         Operand(Operand&&) = default;
         Operand& operator=(const Operand&) = default;
         Operand& operator=(Operand&&) = default;

         /**
          * read from and write to the underlying register(s) that
          * this operand is referring to.
          */
         virtual void read() = 0;
         virtual void write() = 0;

       protected:
         /**
          * instruction object that owns this operand
          */
         GPUDynInstPtr _gpuDynInst;
         /**
          * op selector value for this operand. note that this is not
          * the same as the register file index, be it scalar or vector.
          * this could refer to inline constants, system regs, or even
          * special values.
          */
         int _opIdx;
     };

     /**
      * forward declaration. VecOperand (defined next) holds a
      * ScalarOperand data member, but the ScalarOperand definition only
      * appears further down in this file.
      */
     template<typename DataType, bool Const, size_t NumDwords>
     class ScalarOperand;

     /**
      * vector operand of a GCN3 instruction.
      *
      * DataType is the element type seen by execute() methods through
      * operator[], Const selects between the read-only getter and the
      * writeable setter flavour of operator[], and NumDwords is the number
      * of consecutive 32b vector registers the operand spans. the operand
      * keeps a private staging register (vecReg); read() fills it from the
      * vrf and write() copies it back.
      */
     template<typename DataType, bool Const,
         size_t NumDwords = sizeof(DataType) / sizeof(VecElemU32)>
     class VecOperand final : public Operand
     {
       static_assert(NumDwords >= 1 && NumDwords <= MaxOperandDwords,
             "Incorrect number of DWORDS for GCN3 operand.");

       public:
         VecOperand() = delete;

         /**
          * @param gpuDynInst instruction object that owns this operand.
          * @param opIdx op selector of this operand. it is also handed to
          *        the embedded scalar operand, which is used when the
          *        selector turns out not to name a vector register.
          */
         VecOperand(GPUDynInstPtr gpuDynInst, int opIdx)
             : Operand(gpuDynInst, opIdx), scalar(false), absMod(false),
               negMod(false), scRegData(gpuDynInst, _opIdx),
               vrfData{{ nullptr }}
         {
             vecReg.zero();
         }

         ~VecOperand() = default;

         /**
          * certain vector operands can read from the vrf/srf or constants.
          * we use this method to first determine the type of the operand,
          * then we read from the appropriate source. if vector, _opIdx is
          * overwritten with the result of opSelectorToRegIdx() and we read
          * directly from the vrf. if scalar, we read in the data through
          * the scalar operand component. this should only be used for VSRC
          * operands.
          */
         void
         readSrc()
         {
             if (isVectorReg(_opIdx)) {
                 _opIdx = opSelectorToRegIdx(_opIdx, _gpuDynInst->wavefront()
                     ->reservedScalarRegs);
                 read();
             } else {
                 readScalar();
             }
         }

         /**
          * read from the vrf. this should only be used by vector inst
          * source operands that are explicitly vector (i.e., VSRC).
          *
          * pointers to all NumDwords underlying registers are recorded in
          * vrfData. the staging register (vecReg) is only filled for 1 and
          * 2 dword operands; wider operands just get their vrfData
          * pointers set.
          */
         void
         read() override
         {
             assert(_gpuDynInst);
             assert(_gpuDynInst->wavefront());
             assert(_gpuDynInst->computeUnit());
             Wavefront *wf = _gpuDynInst->wavefront();
             ComputeUnit *cu = _gpuDynInst->computeUnit();

             for (int i = 0; i < static_cast<int>(NumDwords); ++i) {
                 int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx + i);
                 vrfData[i] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx);

                 DPRINTF(GPUVRF, "Read v[%d]\n", vgprIdx);
                 cu->vrf[wf->simdId]->printReg(wf, vgprIdx);
             }

             if constexpr (NumDwords == 1) {
                 assert(vrfData[0]);
                 auto vgpr = vecReg.template as<DataType>();
                 auto reg_file_vgpr = vrfData[0]->template as<VecElemU32>();
                 // only sizeof(DataType) bytes of each 32b lane are taken,
                 // which handles sub-dword element types.
                 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                     std::memcpy(&vgpr[lane], &reg_file_vgpr[lane],
                         sizeof(DataType));
                 }
             } else if constexpr (NumDwords == 2) {
                 static_assert(sizeof(VecElemU64) == 2 * sizeof(VecElemU32),
                     "a 64b lane must be exactly two dwords");
                 assert(vrfData[0]);
                 assert(vrfData[1]);
                 auto vgpr = vecReg.template as<VecElemU64>();
                 auto reg_file_vgpr0 = vrfData[0]->template as<VecElemU32>();
                 auto reg_file_vgpr1 = vrfData[1]->template as<VecElemU32>();

                 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                     // the first register supplies the dword at the lower
                     // address of the 64b lane value, the second register
                     // the dword at the higher address. memcpy is used
                     // instead of a pointer cast to avoid aliasing UB.
                     const VecElemU32 dwords[2] = {
                         reg_file_vgpr0[lane], reg_file_vgpr1[lane]
                     };
                     VecElemU64 tmp_val(0);
                     std::memcpy(&tmp_val, dwords, sizeof(tmp_val));
                     vgpr[lane] = tmp_val;
                 }
             }
         }

         /**
          * write to the vrf. we maintain a copy of the underlying vector
          * reg(s) for this operand (i.e., vrfData/scRegData), as well as a
          * temporary vector register representation (i.e., vecReg) of the
          * vector register, which allows the execute() methods of instructions
          * to easily write their operand data using operator[] regardless of
          * their size. after the result is calculated we use write() to write
          * the data to the actual register file storage. this allows us to do
          * type conversion, etc., in a single call as opposed to doing it
          * in each execute() method.
          *
          * a lane is only written if its bit in the selected exec mask is
          * set or the instruction reports ignoreExec(). for loads the mask
          * stored in the instruction (exec_mask) is used, otherwise the
          * wavefront's exec mask. only 1 and 2 dword operands are written;
          * for wider operands this method does nothing.
          */
         void
         write() override
         {
             assert(_gpuDynInst);
             assert(_gpuDynInst->wavefront());
             assert(_gpuDynInst->computeUnit());
             Wavefront *wf = _gpuDynInst->wavefront();
             ComputeUnit *cu = _gpuDynInst->computeUnit();
             VectorMask &exec_mask = _gpuDynInst->isLoad()
                 ? _gpuDynInst->exec_mask : wf->execMask();

             if constexpr (NumDwords == 1) {
                 int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx);
                 vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx);
                 assert(vrfData[0]);
                 auto reg_file_vgpr = vrfData[0]->template as<VecElemU32>();
                 auto vgpr = vecReg.template as<DataType>();

                 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                     if (exec_mask[lane] || _gpuDynInst->ignoreExec()) {
                         std::memcpy(&reg_file_vgpr[lane], &vgpr[lane],
                             sizeof(DataType));
                     }
                 }

                 DPRINTF(GPUVRF, "Write v[%d]\n", vgprIdx);
                 cu->vrf[wf->simdId]->printReg(wf, vgprIdx);
             } else if constexpr (NumDwords == 2) {
                 static_assert(sizeof(VecElemU64) == 2 * sizeof(VecElemU32),
                     "a 64b lane must be exactly two dwords");
                 int vgprIdx0 = cu->registerManager->mapVgpr(wf, _opIdx);
                 int vgprIdx1 = cu->registerManager->mapVgpr(wf, _opIdx + 1);
                 vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx0);
                 vrfData[1] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx1);
                 assert(vrfData[0]);
                 assert(vrfData[1]);
                 auto reg_file_vgpr0 = vrfData[0]->template as<VecElemU32>();
                 auto reg_file_vgpr1 = vrfData[1]->template as<VecElemU32>();
                 auto vgpr = vecReg.template as<VecElemU64>();

                 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                     if (exec_mask[lane] || _gpuDynInst->ignoreExec()) {
                         // split the 64b lane value into its two dwords
                         // without aliasing it through a VecElemU32*.
                         VecElemU32 dwords[2];
                         std::memcpy(dwords, &vgpr[lane], sizeof(dwords));
                         reg_file_vgpr0[lane] = dwords[0];
                         reg_file_vgpr1[lane] = dwords[1];
                     }
                 }

                 DPRINTF(GPUVRF, "Write v[%d:%d]\n", vgprIdx0, vgprIdx1);
                 cu->vrf[wf->simdId]->printReg(wf, vgprIdx0);
                 cu->vrf[wf->simdId]->printReg(wf, vgprIdx1);
             }
         }

         /**
          * request that values returned by the getter operator[] be
          * negated.
          */
         void
         negModifier()
         {
             negMod = true;
         }

         /**
          * request that the absolute value be taken of values returned by
          * the getter operator[]. if both modifiers are set, the absolute
          * value is taken before negating.
          */
         void
         absModifier()
         {
             absMod = true;
         }

         /**
          * getter [] operator. only enable if this operand is constant
          * (i.e, a source operand) and if it can be represented using
          * primitive types (i.e., 8b to 64b primitives).
          *
          * if the operand was read as a scalar, every idx yields the same
          * scalar value; otherwise the element idx of the staging register
          * is returned. abs/neg modifiers are applied to the result.
          */
         template<bool Condition = (NumDwords == 1 || NumDwords == 2) && Const>
         typename std::enable_if_t<Condition, const DataType>
         operator[](size_t idx) const
         {
             assert(idx < NumVecElemPerVecReg);

             DataType ret_val;

             if (scalar) {
                 ret_val = scRegData.rawData();
             } else {
                 auto vgpr = vecReg.template as<DataType>();
                 ret_val = vgpr[idx];
             }

             return applyModifiers(ret_val);
         }

         /**
          * setter [] operator. only enable if this operand is non-constant
          * (i.e, a destination operand) and if it can be represented using
          * primitive types (i.e., 8b to 64b primitives).
          */
         template<bool Condition = (NumDwords == 1 || NumDwords == 2) && !Const>
         typename std::enable_if_t<Condition, DataType&>
         operator[](size_t idx)
         {
             assert(!scalar);
             assert(idx < NumVecElemPerVecReg);

             return vecReg.template as<DataType>()[idx];
         }

       private:
         /**
          * if we determine that this operand is a scalar (reg or constant)
          * then we read the scalar data into the scalar operand data member.
          */
         void
         readScalar()
         {
             scalar = true;
             scRegData.read();
         }

         /**
          * apply the pending abs and neg modifiers (in that order) to a
          * value that was just read. both modifiers assert that DataType
          * is a floating point type.
          */
         DataType
         applyModifiers(DataType val) const
         {
             if (absMod) {
                 assert(std::is_floating_point_v<DataType>);
                 val = std::fabs(val);
             }

             if (negMod) {
                 assert(std::is_floating_point_v<DataType>);
                 val = -val;
             }

             return val;
         }

         using VecRegCont =
             VecRegContainer<sizeof(DataType) * NumVecElemPerVecReg>;

         /**
          * whether this operand is a scalar or not.
          */
         bool scalar;
         /**
          * absolute value and negative modifiers. VOP3 instructions
          * may indicate that their input/output operands must be
          * modified, either by taking the absolute value or negating
          * them. these bools indicate which modifier, if any, to use.
          */
         bool absMod;
         bool negMod;
         /**
          * this holds all the operand data in a single vector register
          * object (i.e., if an operand is 64b, this will hold the data
          * from both registers the operand is using).
          */
         VecRegCont vecReg;
         /**
          * for src operands that read scalars (i.e., scalar regs or
          * a scalar constant).
          */
         ScalarOperand<DataType, Const, NumDwords> scRegData;
         /**
          * pointers to the underlying registers (i.e., the actual
          * registers in the register file).
          */
         std::array<VecRegContainerU32*, NumDwords> vrfData;
     };

     template<typename DataType, bool Const,
         size_t NumDwords = sizeof(DataType) / sizeof(ScalarRegU32)>
     class ScalarOperand final : public Operand
     {
       static_assert(NumDwords >= 1 && NumDwords <= MaxOperandDwords,
             "Incorrect number of DWORDS for GCN3 operand.");
       public:
         ScalarOperand() = delete;

         ScalarOperand(GPUDynInstPtr gpuDynInst, int opIdx)
             : Operand(gpuDynInst, opIdx)
         {
             std::memset(srfData.data(), 0, NumDwords * sizeof(ScalarRegU32));
         }

         ~ScalarOperand()
         {
         }

         /**
          * we store scalar data in a std::array, however if we need the
          * full operand data we use this method to copy all elements of
          * the scalar operand data to a single primitive container. only
          * useful for 8b to 64b primitive types, as they are the only types
          * that we need to perform computation on.
          */
         template<bool Condition = NumDwords == 1 || NumDwords == 2>
         typename std::enable_if_t<Condition, DataType>
         rawData() const
         {
             assert(sizeof(DataType) <= sizeof(srfData));
             DataType raw_data((DataType)0);
             std::memcpy((void*)&raw_data, (void*)srfData.data(),
                 sizeof(DataType));

             return raw_data;
         }

         void*
         rawDataPtr()
         {
             return (void*)srfData.data();
         }

         void
         read() override
         {
             Wavefront *wf = _gpuDynInst->wavefront();
             ComputeUnit *cu = _gpuDynInst->computeUnit();

             if (!isScalarReg(_opIdx)) {
                 readSpecialVal();
             } else {
                 for (auto i = 0; i < NumDwords; ++i) {
                     int sgprIdx = regIdx(i);
                     srfData[i] = cu->srf[wf->simdId]->read(sgprIdx);
                     DPRINTF(GPUSRF, "Read s[%d]\n", sgprIdx);
                     cu->srf[wf->simdId]->printReg(wf, sgprIdx);
                 }
             }
         }

         void
         write() override
         {
             Wavefront *wf = _gpuDynInst->wavefront();
             ComputeUnit *cu = _gpuDynInst->computeUnit();

             if (!isScalarReg(_opIdx)) {
                 if (_opIdx == REG_EXEC_LO) {
                     ScalarRegU64 new_exec_mask_val
                         = wf->execMask().to_ullong();
                     if (NumDwords == 1) {
                         std::memcpy((void*)&new_exec_mask_val,
                             (void*)srfData.data(), sizeof(VecElemU32));
                     } else if (NumDwords == 2) {
                         std::memcpy((void*)&new_exec_mask_val,
                             (void*)srfData.data(), sizeof(VecElemU64));
                     } else {
                         panic("Trying to write more than 2 DWORDS to EXEC\n");
                     }
                     VectorMask new_exec_mask(new_exec_mask_val);
                     wf->execMask() = new_exec_mask;
                     DPRINTF(GPUSRF, "Write EXEC\n");
                     DPRINTF(GPUSRF, "EXEC = %#x\n", new_exec_mask_val);
                 } else if (_opIdx == REG_EXEC_HI) {
                     /**
                      * If we're writing only the upper half of the EXEC mask
                      * this ought to be a single dword operand.
                      */
                     assert(NumDwords == 1);
                     ScalarRegU32 new_exec_mask_hi_val(0);
                     ScalarRegU64 new_exec_mask_val
                         = wf->execMask().to_ullong();
                     std::memcpy((void*)&new_exec_mask_hi_val,
                         (void*)srfData.data(), sizeof(new_exec_mask_hi_val));
                     replaceBits(new_exec_mask_val, 63, 32,
                                 new_exec_mask_hi_val);
                     VectorMask new_exec_mask(new_exec_mask_val);
                     wf->execMask() = new_exec_mask;
                     DPRINTF(GPUSRF, "Write EXEC\n");
                     DPRINTF(GPUSRF, "EXEC = %#x\n", new_exec_mask_val);
                 } else {
                     _gpuDynInst->writeMiscReg(_opIdx, srfData[0]);
                 }
             } else {
                 for (auto i = 0; i < NumDwords; ++i) {
                     int sgprIdx = regIdx(i);
                     auto &sgpr = cu->srf[wf->simdId]->readWriteable(sgprIdx);
                     if (_gpuDynInst->isLoad()) {
                         assert(sizeof(DataType) <= sizeof(ScalarRegU64));
                         sgpr = reinterpret_cast<ScalarRegU32*>(
                             _gpuDynInst->scalar_data)[i];
                     } else {
                         sgpr = srfData[i];
                     }
                     DPRINTF(GPUSRF, "Write s[%d]\n", sgprIdx);
                     cu->srf[wf->simdId]->printReg(wf, sgprIdx);
                 }
             }
         }

         /**
          * bit access to scalar data. primarily used for setting vcc bits.
          */
         template<bool Condition = NumDwords == 1 || NumDwords == 2>
         typename std::enable_if_t<Condition, void>
         setBit(int bit, int bit_val)
         {
             DataType &sgpr = *((DataType*)srfData.data());
             replaceBits(sgpr, bit, bit_val);
         }

         template<bool Condition = (NumDwords == 1 || NumDwords == 2) && !Const>
         typename std::enable_if_t<Condition, ScalarOperand&>
         operator=(DataType rhs)
         {
             std::memcpy((void*)srfData.data(), (void*)&rhs, sizeof(DataType));
             return *this;
         }

       private:
         /**
          * we have determined that we are not reading our scalar operand data
          * from the register file, so here we figure out which special value
          * we are reading (i.e., float constant, int constant, inline
          * constant, or various other system registers (e.g., exec mask).
          */
         void
         readSpecialVal()
         {
             assert(NumDwords == 1 || NumDwords == 2);

             switch(_opIdx) {
               case REG_EXEC_LO:
                 {
                     if (NumDwords == 1) {
                         ScalarRegU32 exec_mask = _gpuDynInst->wavefront()->
                             execMask().to_ulong();
                         std::memcpy((void*)srfData.data(), (void*)&exec_mask,
                             sizeof(exec_mask));
                         DPRINTF(GPUSRF, "Read EXEC\n");
                         DPRINTF(GPUSRF, "EXEC = %#x\n", exec_mask);
                     } else {
                         assert(NumDwords == 2);
                         ScalarRegU64 exec_mask = _gpuDynInst->wavefront()->
                             execMask().to_ullong();
                         std::memcpy((void*)srfData.data(), (void*)&exec_mask,
                             sizeof(exec_mask));
                         DPRINTF(GPUSRF, "Read EXEC\n");
                         DPRINTF(GPUSRF, "EXEC = %#x\n", exec_mask);
                     }
                 }
                 break;
               case REG_EXEC_HI:
                 {
                     /**
                      * If we're reading only the upper half of the EXEC mask
                      * this ought to be a single dword operand.
                      */
                     assert(NumDwords == 1);
                     ScalarRegU64 exec_mask = _gpuDynInst->wavefront()
                         ->execMask().to_ullong();

                     ScalarRegU32 exec_mask_hi = bits(exec_mask, 63, 32);
                     std::memcpy((void*)srfData.data(), (void*)&exec_mask_hi,
                                 sizeof(exec_mask_hi));
                     DPRINTF(GPUSRF, "Read EXEC_HI\n");
                     DPRINTF(GPUSRF, "EXEC_HI = %#x\n", exec_mask_hi);
                 }
                 break;
               case REG_SRC_SWDA:
               case REG_SRC_DPP:
               case REG_SRC_LITERAL:
                 assert(NumDwords == 1);
                 srfData[0] = _gpuDynInst->srcLiteral();
                 break;
               case REG_POS_HALF:
                 {
                     typename OpTraits<DataType>::FloatT pos_half = 0.5;
                     std::memcpy((void*)srfData.data(), (void*)&pos_half,
                         sizeof(pos_half));

                 }
                 break;
               case REG_NEG_HALF:
                 {
                     typename OpTraits<DataType>::FloatT neg_half = -0.5;
                     std::memcpy((void*)srfData.data(), (void*)&neg_half,
                         sizeof(neg_half));
                 }
                 break;
               case REG_POS_ONE:
                 {
                     typename OpTraits<DataType>::FloatT pos_one = 1.0;
                     std::memcpy(srfData.data(), &pos_one, sizeof(pos_one));
                 }
                 break;
               case REG_NEG_ONE:
                 {
                     typename OpTraits<DataType>::FloatT neg_one = -1.0;
                     std::memcpy(srfData.data(), &neg_one, sizeof(neg_one));
                 }
                 break;
               case REG_POS_TWO:
                 {
                     typename OpTraits<DataType>::FloatT pos_two = 2.0;
                     std::memcpy(srfData.data(), &pos_two, sizeof(pos_two));
                 }
                 break;
               case REG_NEG_TWO:
                 {
                     typename OpTraits<DataType>::FloatT neg_two = -2.0;
                     std::memcpy(srfData.data(), &neg_two, sizeof(neg_two));
                 }
                 break;
               case REG_POS_FOUR:
                 {
                     typename OpTraits<DataType>::FloatT pos_four = 4.0;
                     std::memcpy(srfData.data(), &pos_four, sizeof(pos_four));
                 }
                 break;
               case REG_NEG_FOUR:
                 {
                     typename OpTraits<DataType>::FloatT neg_four = -4.0;
                     std::memcpy((void*)srfData.data(), (void*)&neg_four ,
                         sizeof(neg_four));
                 }
                 break;
                 case REG_PI:
                 {
                     assert(sizeof(DataType) == sizeof(ScalarRegF64)
                         || sizeof(DataType) == sizeof(ScalarRegF32));

                     const ScalarRegU32 pi_u32(0x3e22f983UL);
                     const ScalarRegU64 pi_u64(0x3fc45f306dc9c882ULL);

                     if (sizeof(DataType) == sizeof(ScalarRegF64)) {
                         std::memcpy((void*)srfData.data(),
                             (void*)&pi_u64, sizeof(pi_u64));
                     } else {
                         std::memcpy((void*)srfData.data(),
                             (void*)&pi_u32, sizeof(pi_u32));
                     }
                 }
                 break;
               default:
                 {
                     assert(sizeof(DataType) <= sizeof(srfData));
                     DataType misc_val(0);
                     if (isConstVal(_opIdx)) {
                         misc_val = (DataType)_gpuDynInst
                             ->readConstVal<DataType>(_opIdx);
                     } else {
                         misc_val = (DataType)_gpuDynInst->readMiscReg(_opIdx);
                     }
                     std::memcpy((void*)srfData.data(), (void*)&misc_val,
                                 sizeof(DataType));
                 }
             }
         }

         /**
          * for scalars we need to do some extra work to figure out how to
          * map the op selector to the sgpr idx because some op selectors
          * do not map directly to the srf (i.e., vcc/flat_scratch).
          */
         int
         regIdx(int dword) const
         {
             Wavefront *wf = _gpuDynInst->wavefront();
             ComputeUnit *cu = _gpuDynInst->computeUnit();
             int sgprIdx(-1);

             if (_opIdx == REG_VCC_HI) {
                 sgprIdx = cu->registerManager
                     ->mapSgpr(wf, wf->reservedScalarRegs - 1 + dword);
             } else if (_opIdx == REG_VCC_LO) {
                 sgprIdx = cu->registerManager
                     ->mapSgpr(wf, wf->reservedScalarRegs - 2 + dword);
             } else if (_opIdx == REG_FLAT_SCRATCH_HI) {
                 sgprIdx = cu->registerManager
                     ->mapSgpr(wf, wf->reservedScalarRegs - 3 + dword);
             } else if (_opIdx == REG_FLAT_SCRATCH_LO) {
                 assert(NumDwords == 1);
                 sgprIdx = cu->registerManager
                     ->mapSgpr(wf, wf->reservedScalarRegs - 4 + dword);
             } else {
                 sgprIdx = cu->registerManager->mapSgpr(wf, _opIdx + dword);
             }

             assert(sgprIdx > -1);

             return sgprIdx;
         }

         /**
          * in GCN3 each register is represented as a 32b unsigned value,
          * however operands may require up to 16 registers, so we store
          * all the individual 32b components here. for sub-dword operand
          * we still consider them to be 1 dword because the minimum size
          * of a register is 1 dword. this class will take care to do the
          * proper packing/unpacking of sub-dword operands.
          */
         std::array<ScalarRegU32, NumDwords> srfData;
     };

     /**
      * convenience aliases for the operand types used by instructions.
      * sub-dword (8b/16b) types are explicitly given one dword, 32b/64b
      * types rely on the default NumDwords, and the multi-dword types use
      * 32b elements with an explicit dword count.
      */

     // writeable scalar operands: sub-dword
     using ScalarOperandU8 = ScalarOperand<ScalarRegU8, false, 1>;
     using ScalarOperandI8 = ScalarOperand<ScalarRegI8, false, 1>;
     using ScalarOperandU16 = ScalarOperand<ScalarRegU16, false, 1>;
     using ScalarOperandI16 = ScalarOperand<ScalarRegI16, false, 1>;
     // writeable scalar operands: 32b
     using ScalarOperandU32 = ScalarOperand<ScalarRegU32, false>;
     using ScalarOperandI32 = ScalarOperand<ScalarRegI32, false>;
     using ScalarOperandF32 = ScalarOperand<ScalarRegF32, false>;
     // writeable scalar operands: 64b
     using ScalarOperandU64 = ScalarOperand<ScalarRegU64, false>;
     using ScalarOperandI64 = ScalarOperand<ScalarRegI64, false>;
     using ScalarOperandF64 = ScalarOperand<ScalarRegF64, false>;
     // writeable scalar operands: multi-dword
     using ScalarOperandU128 = ScalarOperand<ScalarRegU32, false, 4>;
     using ScalarOperandU256 = ScalarOperand<ScalarRegU32, false, 8>;
     using ScalarOperandU512 = ScalarOperand<ScalarRegU32, false, 16>;

     // non-writeable scalar operands: sub-dword
     using ConstScalarOperandU8 = ScalarOperand<ScalarRegU8, true, 1>;
     using ConstScalarOperandI8 = ScalarOperand<ScalarRegI8, true, 1>;
     using ConstScalarOperandU16 = ScalarOperand<ScalarRegU16, true, 1>;
     using ConstScalarOperandI16 = ScalarOperand<ScalarRegI16, true, 1>;
     // non-writeable scalar operands: 32b
     using ConstScalarOperandU32 = ScalarOperand<ScalarRegU32, true>;
     using ConstScalarOperandI32 = ScalarOperand<ScalarRegI32, true>;
     using ConstScalarOperandF32 = ScalarOperand<ScalarRegF32, true>;
     // non-writeable scalar operands: 64b
     using ConstScalarOperandU64 = ScalarOperand<ScalarRegU64, true>;
     using ConstScalarOperandI64 = ScalarOperand<ScalarRegI64, true>;
     using ConstScalarOperandF64 = ScalarOperand<ScalarRegF64, true>;
     // non-writeable scalar operands: multi-dword
     using ConstScalarOperandU128 = ScalarOperand<ScalarRegU32, true, 4>;
     using ConstScalarOperandU256 = ScalarOperand<ScalarRegU32, true, 8>;
     using ConstScalarOperandU512 = ScalarOperand<ScalarRegU32, true, 16>;

     // writeable vector operands: sub-dword
     using VecOperandU8 = VecOperand<VecElemU8, false, 1>;
     using VecOperandI8 = VecOperand<VecElemI8, false, 1>;
     using VecOperandU16 = VecOperand<VecElemU16, false, 1>;
     using VecOperandI16 = VecOperand<VecElemI16, false, 1>;
     // writeable vector operands: 32b
     using VecOperandU32 = VecOperand<VecElemU32, false>;
     using VecOperandI32 = VecOperand<VecElemI32, false>;
     using VecOperandF32 = VecOperand<VecElemF32, false>;
     // writeable vector operands: 64b
     using VecOperandU64 = VecOperand<VecElemU64, false>;
     using VecOperandI64 = VecOperand<VecElemI64, false>;
     using VecOperandF64 = VecOperand<VecElemF64, false>;
     // writeable vector operands: multi-dword
     using VecOperandU96 = VecOperand<VecElemU32, false, 3>;
     using VecOperandU128 = VecOperand<VecElemU32, false, 4>;
     using VecOperandU256 = VecOperand<VecElemU32, false, 8>;
     using VecOperandU512 = VecOperand<VecElemU32, false, 16>;

     // non-writeable vector operands: sub-dword
     using ConstVecOperandU8 = VecOperand<VecElemU8, true, 1>;
     using ConstVecOperandI8 = VecOperand<VecElemI8, true, 1>;
     using ConstVecOperandU16 = VecOperand<VecElemU16, true, 1>;
     using ConstVecOperandI16 = VecOperand<VecElemI16, true, 1>;
     // non-writeable vector operands: 32b
     using ConstVecOperandU32 = VecOperand<VecElemU32, true>;
     using ConstVecOperandI32 = VecOperand<VecElemI32, true>;
     using ConstVecOperandF32 = VecOperand<VecElemF32, true>;
     // non-writeable vector operands: 64b
     using ConstVecOperandU64 = VecOperand<VecElemU64, true>;
     using ConstVecOperandI64 = VecOperand<VecElemI64, true>;
     using ConstVecOperandF64 = VecOperand<VecElemF64, true>;
     // non-writeable vector operands: multi-dword
     using ConstVecOperandU96 = VecOperand<VecElemU32, true, 3>;
     using ConstVecOperandU128 = VecOperand<VecElemU32, true, 4>;
     using ConstVecOperandU256 = VecOperand<VecElemU32, true, 8>;
     using ConstVecOperandU512 = VecOperand<VecElemU32, true, 16>;
 }

 } // namespace gem5

 #endif // __ARCH_GCN3_OPERAND_HH__