| /* |
| * Copyright (c) 2015-2021 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its |
| * contributors may be used to endorse or promote products derived from this |
| * software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #ifndef __ARCH_GCN3_INSTS_INST_UTIL_HH__ |
| #define __ARCH_GCN3_INSTS_INST_UTIL_HH__ |
| |
| #include <cmath> |
| |
| #include "arch/amdgpu/gcn3/gpu_registers.hh" |
| |
| namespace gem5 |
| { |
| |
| // values for SDWA select operations |
| enum SDWASelVals : int |
| { |
| SDWA_BYTE_0 = 0, /* select data[7:0] */ |
| SDWA_BYTE_1 = 1, /* select data[15:8] */ |
| SDWA_BYTE_2 = 2, /* select data[23:16] */ |
| SDWA_BYTE_3 = 3, /* select data[31:24] */ |
| SDWA_WORD_0 = 4, /* select data[15:0] */ |
| SDWA_WORD_1 = 5, /* select data[31:16] */ |
| SDWA_DWORD = 6 /* select data[31:0] */ |
| }; |
| |
| // values for format of destination bits for SDWA operations |
| enum SDWADstVals : int |
| { |
| SDWA_UNUSED_PAD = 0, /* Pad all unused bits with 0 */ |
| SDWA_UNUSED_SEXT = 1, /* Sign-extend upper bits; pad lower bits w/ 0 */ |
| SDWA_UNUSED_PRESERVE = 2 /* select data[31:0] */ |
| }; |
| |
| // values for DPP operations |
| enum SqDPPVals : int |
| { |
| SQ_DPP_QUAD_PERM_MAX = 0xFF, |
| SQ_DPP_RESERVED = 0x100, |
| SQ_DPP_ROW_SL1 = 0x101, |
| SQ_DPP_ROW_SL15 = 0x10F, |
| SQ_DPP_ROW_SR1 = 0x111, |
| SQ_DPP_ROW_SR15 = 0x11F, |
| SQ_DPP_ROW_RR1 = 0x121, |
| SQ_DPP_ROW_RR15 = 0x12F, |
| SQ_DPP_WF_SL1 = 0x130, |
| SQ_DPP_WF_RL1 = 0x134, |
| SQ_DPP_WF_SR1 = 0x138, |
| SQ_DPP_WF_RR1 = 0x13C, |
| SQ_DPP_ROW_MIRROR = 0x140, |
| SQ_DPP_ROW_HALF_MIRROR = 0x141, |
| SQ_DPP_ROW_BCAST15 = 0x142, |
| SQ_DPP_ROW_BCAST31 = 0x143 |
| }; |
| static const int ROW_SIZE = 16; /* 16 registers per row */ |
| static const int NUM_BANKS = 4; /* 64 registers, 16/bank */ |
| |
| namespace Gcn3ISA |
| { |
| template<typename T> |
| inline T |
| wholeQuadMode(T val) |
| { |
| T wqm = 0; |
| T mask = 0xF; |
| |
| for (T bits = val; mask != 0; mask <<= 4) |
| if ((bits & mask) != 0) |
| wqm |= mask; |
| |
| return wqm; |
| } |
| |
| template<typename T> |
| inline T |
| quadMask(T val) |
| { |
| T qmsk = 0; |
| T mask = 0xF; |
| T qbit = 0x1; |
| |
| for (T bits = val; mask != 0; mask <<= 4, qbit <<= 1) { |
| if (bits & mask) { |
| qmsk |= qbit; |
| } |
| } |
| |
| return qmsk; |
| } |
| |
| template<typename T> |
| inline ScalarRegI32 |
| countZeroBits(T val) |
| { |
| ScalarRegI32 num_zeros |
| = std::numeric_limits<T>::digits - popCount(val); |
| |
| return num_zeros; |
| } |
| |
| template<typename T> |
| inline ScalarRegI32 |
| findFirstZero(T val) |
| { |
| if (val == ~T(0)) { |
| return -1; |
| } |
| |
| return findLsbSet(~val); |
| } |
| |
| template<typename T> |
| inline ScalarRegI32 |
| findFirstOne(T val) |
| { |
| if (!val) { |
| return -1; |
| } |
| |
| return findLsbSet(val); |
| } |
| |
| template<typename T> |
| inline ScalarRegI32 |
| findFirstOneMsb(T val) |
| { |
| if (!val) { |
| return -1; |
| } |
| |
| return findMsbSet(val); |
| } |
| |
| template<typename T> |
| inline ScalarRegI32 |
| countZeroBitsMsb(T val) |
| { |
| if (!val) { |
| return -1; |
| } |
| |
| return std::numeric_limits<T>::digits - 1 - findMsbSet(val); |
| } |
| |
| inline ScalarRegI32 |
| firstOppositeSignBit(ScalarRegI32 val) |
| { |
| bool found(false); |
| bool sign_bit = (val & 0x80000000) != 0; |
| ScalarRegU32 tmp_val(0); |
| int count(0); |
| |
| if (!val || val == -1) { |
| return -1; |
| } |
| |
| for (int i = 0; i < std::numeric_limits<ScalarRegU32>::digits; ++i) { |
| tmp_val = val & (0x80000000 >> i); |
| |
| if (!sign_bit) { |
| if (tmp_val) { |
| found = true; |
| break; |
| } |
| } else { |
| if (!tmp_val) { |
| found = true; |
| break; |
| } |
| } |
| ++count; |
| } |
| |
| if (found) { |
| return count; |
| } else { |
| return -1; |
| } |
| } |
| |
| inline ScalarRegI32 |
| firstOppositeSignBit(ScalarRegI64 val) |
| { |
| bool found(false); |
| bool sign_bit = (val & 0x8000000000000000ULL) != 0; |
| ScalarRegU64 tmp_val(0); |
| int count(0); |
| |
| if (!val || val == -1) { |
| return -1; |
| } |
| |
| for (int i = 0; i < std::numeric_limits<ScalarRegU64>::digits; ++i) { |
| tmp_val = val & (0x8000000000000000ULL >> i); |
| |
| if (!sign_bit) { |
| if (tmp_val) { |
| found = true; |
| break; |
| } |
| } else { |
| if (!tmp_val) { |
| found = true; |
| break; |
| } |
| } |
| ++count; |
| } |
| |
| if (found) { |
| return count; |
| } else { |
| return -1; |
| } |
| } |
| |
| template<typename T> |
| inline T |
| median(T val_0, T val_1, T val_2) |
| { |
| if (std::is_floating_point_v<T>) { |
| return std::fmax(std::fmin(val_0, val_1), |
| std::fmin(std::fmax(val_0, val_1), val_2)); |
| } else { |
| return std::max(std::min(val_0, val_1), |
| std::min(std::max(val_0, val_1), val_2)); |
| } |
| } |
| |
| template <typename T> |
| inline T roundNearestEven(T val) |
| { |
| T int_part = 0; |
| T nearest_round = std::floor(val + 0.5); |
| if ((int)std::floor(val) % 2 == 0 |
| && std::modf(std::abs(val), &int_part) == 0.5) { |
| nearest_round = nearest_round - 1; |
| } |
| |
| return nearest_round; |
| } |
| |
| inline VecElemU32 |
| muladd(VecElemU64 &dst, VecElemU32 val_0, VecElemU32 val_1, |
| VecElemU64 val_2) |
| { |
| __uint128_t u0 = (__uint128_t)val_0; |
| __uint128_t u1 = (__uint128_t)val_1; |
| __uint128_t u2 = (__uint128_t)val_2; |
| __uint128_t result = u0 * u1 + u2; |
| |
| dst = (VecElemU64)result; |
| |
| return (VecElemU32)(result >> 64) ? 1 : 0; |
| } |
| |
| inline VecElemU32 |
| muladd(VecElemI64 &dst, VecElemI32 val_0, VecElemI32 val_1, |
| VecElemI64 val_2) |
| { |
| __int128_t u0 = (__int128_t)val_0; |
| __int128_t u1 = (__int128_t)val_1; |
| __int128_t u2 = (__int128_t)val_2; |
| __int128_t result = u0 * u1 + u2; |
| |
| dst = (VecElemI64)result; |
| |
| return (VecElemU32)(result >> 64) ? 1 : 0; |
| } |
| |
| /** |
| * dppInstImpl is a helper function that performs the inputted operation |
| * on the inputted vector register lane. The returned output lane |
| * represents the input lane given the destination lane and DPP_CTRL word. |
| * |
| * Currently the values are: |
| * 0x0 - 0xFF: full permute of four threads |
| * 0x100: reserved |
| * 0x101 - 0x10F: row shift right by 1-15 threads |
| * 0x111 - 0x11F: row shift right by 1-15 threads |
| * 0x121 - 0x12F: row shift right by 1-15 threads |
| * 0x130: wavefront left shift by 1 thread |
| * 0x134: wavefront left rotate by 1 thread |
| * 0x138: wavefront right shift by 1 thread |
| * 0x13C: wavefront right rotate by 1 thread |
| * 0x140: mirror threads within row |
| * 0x141: mirror threads within 1/2 row (8 threads) |
| * 0x142: broadcast 15th thread of each row to next row |
| * 0x143: broadcast thread 31 to rows 2 and 3 |
| */ |
| int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum, |
| int rowOffset, bool & outOfBounds) |
| { |
| // local variables |
| // newLane will be the same as the input lane unless swizzling happens |
| int newLane = currLane; |
| // for shift/rotate permutations; positive values are LEFT rotates |
| int count = 1; |
| int localRowOffset = rowOffset; |
| int localRowNum = rowNum; |
| |
| if (dppCtrl <= SQ_DPP_QUAD_PERM_MAX) { // DPP_QUAD_PERM{00:FF} |
| int quadBase = (currLane & ~(3)); |
| int quadPix = (currLane & 3); |
| quadPix = ((dppCtrl >> (2 * quadPix)) & 3); |
| newLane = (quadBase | quadPix); |
| } else if (dppCtrl == SQ_DPP_RESERVED) { |
| panic("ERROR: instruction using reserved DPP_CTRL value\n"); |
| } else if ((dppCtrl >= SQ_DPP_ROW_SL1) && |
| (dppCtrl <= SQ_DPP_ROW_SL15)) { // DPP_ROW_SL{1:15} |
| count -= (dppCtrl - SQ_DPP_ROW_SL1 + 1); |
| if ((localRowOffset + count >= 0) && |
| (localRowOffset + count < ROW_SIZE)) { |
| localRowOffset += count; |
| newLane = (rowNum | localRowOffset); |
| } else { |
| outOfBounds = true; |
| } |
| } else if ((dppCtrl >= SQ_DPP_ROW_SR1) && |
| (dppCtrl <= SQ_DPP_ROW_SR15)) { // DPP_ROW_SR{1:15} |
| count -= (dppCtrl - SQ_DPP_ROW_SR1 + 1); |
| if ((localRowOffset + count >= 0) && |
| (localRowOffset + count < ROW_SIZE)) { |
| localRowOffset += count; |
| newLane = (rowNum | localRowOffset); |
| } else { |
| outOfBounds = true; |
| } |
| } else if ((dppCtrl >= SQ_DPP_ROW_RR1) && |
| (dppCtrl <= SQ_DPP_ROW_RR15)) { // DPP_ROW_RR{1:15} |
| count -= (dppCtrl - SQ_DPP_ROW_RR1 + 1); |
| localRowOffset = (localRowOffset + count + ROW_SIZE) % ROW_SIZE; |
| newLane = (rowNum | localRowOffset); |
| } else if (dppCtrl == SQ_DPP_WF_SL1) { // DPP_WF_SL1 |
| count = 1; |
| if ((currLane >= 0) && (currLane < NumVecElemPerVecReg)) { |
| newLane += count; |
| } else { |
| outOfBounds = true; |
| } |
| } else if (dppCtrl == SQ_DPP_WF_RL1) { // DPP_WF_RL1 |
| count = 1; |
| newLane = (currLane + count + NumVecElemPerVecReg) % |
| NumVecElemPerVecReg; |
| } else if (dppCtrl == SQ_DPP_WF_SR1) { // DPP_WF_SR1 |
| count = -1; |
| int currVal = (currLane + count); |
| if ((currVal >= 0) && (currVal < NumVecElemPerVecReg)) { |
| newLane += count; |
| } else { |
| outOfBounds = true; |
| } |
| } else if (dppCtrl == SQ_DPP_WF_RR1) { // DPP_WF_RR1 |
| count = -1; |
| newLane = (currLane + count + NumVecElemPerVecReg) % |
| NumVecElemPerVecReg; |
| } else if (dppCtrl == SQ_DPP_ROW_MIRROR) { // DPP_ROW_MIRROR |
| localRowOffset = (15 - localRowOffset); |
| newLane = (rowNum | localRowOffset); |
| } else if (dppCtrl == SQ_DPP_ROW_HALF_MIRROR) { // DPP_ROW_HALF_MIRROR |
| localRowNum = (currLane & -0x7); |
| localRowOffset = (currLane & 0x7); |
| localRowOffset = (7 - localRowNum); |
| newLane = (localRowNum | localRowOffset); |
| } else if (dppCtrl == SQ_DPP_ROW_BCAST15) { // DPP_ROW_BCAST15 |
| count = 15; |
| if (currLane > count) { |
| newLane = (currLane & ~count) - 1; |
| } |
| } else if (dppCtrl == SQ_DPP_ROW_BCAST31) { // DPP_ROW_BCAST31 |
| count = 31; |
| if (currLane > count) { |
| newLane = (currLane & ~count) - 1; |
| } |
| } else { |
| panic("Unimplemented DPP control operation: %d\n", dppCtrl); |
| } |
| |
| return newLane; |
| } |
| |
| /** |
| * processDPP is a helper function for implementing Data Parallel Primitive |
| * instructions. This function may be called by many different VOP1 |
| * instructions to do operations within a register. |
| */ |
| template<typename T> |
| void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst, |
| T & src0) |
| { |
| // local variables |
| SqDPPVals dppCtrl = (SqDPPVals)dppInst.DPP_CTRL; |
| int boundCtrl = dppInst.BOUND_CTRL; |
| int bankMask = dppInst.BANK_MASK; |
| int rowMask = dppInst.ROW_MASK; |
| // row, bank info to be calculated per lane |
| int rowNum = 0, bankNum = 0, rowOffset = 0; |
| // outLane will be the same as the input lane unless swizzling happens |
| int outLane = 0; |
| bool laneDisabled = false; |
| // flags used for determining if a lane should be written to/reset/etc. |
| bool outOfBounds = false, zeroSrc = false; |
| long long threadValid = 0; |
| |
| /** |
| * STEP 1a: check if the absolute value (ABS) or negation (NEG) tags |
| * are set. If so, do the appropriate action(s) on src0 and/or src1. |
| * |
| * NOTE: ABS takes priority over NEG. |
| */ |
| if (dppInst.SRC0_NEG) { |
| src0.negModifier(); |
| } |
| |
| if (dppInst.SRC0_ABS) { |
| src0.absModifier(); |
| } |
| |
| // iterate over all register lanes, performing steps 2-4 |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| threadValid = (0x1LL << lane); |
| /** |
| * STEP 2: check the row and bank mask values. These determine |
| * which threads are enabled for the subsequent DPP_CTRL |
| * operations. |
| */ |
| rowNum = (lane / ROW_SIZE); |
| rowOffset = (lane % ROW_SIZE); |
| bankNum = (rowOffset / NUM_BANKS); |
| |
| if (((rowMask & (0x1 << rowNum)) == 0) /* row mask */ || |
| ((bankMask & (0x1 << bankNum)) == 0) /* bank mask */) { |
| laneDisabled = true; |
| continue; |
| } |
| |
| /** |
| * STEP 4: Handle the potential values of DPP_CTRL: |
| * 0x0 - 0xFF: full permute of four threads |
| * 0x100: reserved |
| * 0x101 - 0x10F: row shift right by 1-15 threads |
| * 0x111 - 0x11F: row shift right by 1-15 threads |
| * 0x121 - 0x12F: row shift right by 1-15 threads |
| * 0x130: wavefront left shift by 1 thread |
| * 0x134: wavefront left rotate by 1 thread |
| * 0x138: wavefront right shift by 1 thread |
| * 0x13C: wavefront right rotate by 1 thread |
| * 0x140: mirror threads within row |
| * 0x141: mirror threads within 1/2 row (8 threads) |
| * 0x142: broadcast 15th thread of each row to next row |
| * 0x143: broadcast thread 31 to rows 2 and 3 |
| */ |
| if (!laneDisabled) { |
| outLane = dppInstImpl(dppCtrl, lane, rowNum, rowOffset, |
| outOfBounds); |
| } |
| |
| /** |
| * STEP 4: Implement bound control for disabled threads. If thread |
| * is disabled but boundCtrl is set, then we need to set the source |
| * data to 0 (i.e., set this lane to 0). |
| */ |
| if (laneDisabled) { |
| threadValid = 0; |
| } else if (outOfBounds) { |
| if (boundCtrl == 1) { |
| zeroSrc = true; |
| } else { |
| threadValid = 0; |
| } |
| } else if (!gpuDynInst->exec_mask[lane]) { |
| if (boundCtrl == 1) { |
| zeroSrc = true; |
| } else { |
| threadValid = 0; |
| } |
| } |
| |
| if (threadValid != 0 && !outOfBounds && !zeroSrc) { |
| assert(!laneDisabled); |
| src0[outLane] = src0[lane]; |
| } else if (zeroSrc) { |
| src0[lane] = 0; |
| } |
| |
| // reset for next iteration |
| laneDisabled = false; |
| } |
| } |
| |
| /** |
| * processDPP is a helper function for implementing Data Parallel Primitive |
| * instructions. This function may be called by many different |
| * VOP2/VOPC instructions to do operations within a register. |
| */ |
| template<typename T> |
| void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst, |
| T & src0, T & src1) |
| { |
| /** |
| * STEP 1b: check if the absolute value (ABS) or negation (NEG) tags |
| * are set. If so, do the appropriate action(s) on src0 and/or src1. |
| * |
| * NOTE: ABS takes priority over NEG. |
| */ |
| if (dppInst.SRC1_NEG) { |
| src1.negModifier(); |
| } |
| |
| if (dppInst.SRC1_ABS) { |
| src1.absModifier(); |
| } |
| |
| // Since only difference for VOP1 and VOP2/VOPC instructions is SRC1, |
| // which is only used for negation/absolute value, call other version |
| // to do everything else. |
| processDPP(gpuDynInst, dppInst, src0); |
| } |
| |
| /** |
| * sdwaInstSrcImpl_helper contains the per-lane code for selecting the |
| * appropriate bytes/words of the lane and doing the appropriate |
| * masking/padding/sign extending. It returns the value after these |
| * operations are done on it. |
| */ |
| template<typename T> |
| T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal, |
| const SDWASelVals sel, const bool signExt) |
| { |
| // local variables |
| int low_bit = 0, high_bit = 0; |
| bool signExt_local = signExt; |
| T retVal = 0; |
| |
| // if we're preserving all of the bits, then we can immediately return |
| if (sel == SDWA_DWORD) { |
| return currOperVal; |
| } |
| |
| if (sel < SDWA_WORD_0) { // we are selecting 1 byte |
| /* |
| Process byte 0 first. This code eiter selects the original bits |
| of byte 0, or makes the bits of the selected byte be byte 0 (and |
| next either sign extends or zero's out upper bits). |
| */ |
| low_bit = (sel * Gcn3ISA::BITS_PER_BYTE); |
| high_bit = low_bit + Gcn3ISA::MSB_PER_BYTE; |
| retVal = bits(currOperVal, high_bit, low_bit); |
| |
| // make sure update propagated, since used next |
| fatal_if(bits(retVal, Gcn3ISA::MSB_PER_BYTE) != |
| bits(origOperVal, high_bit), |
| "ERROR: SDWA byte update not propagated: retVal: %d, " |
| "orig: %d\n", bits(retVal, Gcn3ISA::MSB_PER_BYTE), |
| bits(origOperVal, high_bit)); |
| // sign extended value depends on upper-most bit of the new byte 0 |
| signExt_local = (signExt && |
| (bits(retVal, Gcn3ISA::MSB_PER_BYTE, 0) & 0x80)); |
| |
| // process all other bytes -- if sign extending, make them 1, else |
| // all 0's so leave as is |
| if (signExt_local) { |
| retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_BYTE>(retVal); |
| } |
| } else if (sel < SDWA_DWORD) { // we are selecting 1 word |
| /* |
| Process word 0 first. This code eiter selects the original bits |
| of word 0, or makes the bits of the selected word be word 0 (and |
| next either sign extends or zero's out upper bits). |
| */ |
| low_bit = (sel & 1) * Gcn3ISA::BITS_PER_WORD; |
| high_bit = low_bit + Gcn3ISA::MSB_PER_WORD; |
| retVal = bits(currOperVal, high_bit, low_bit); |
| |
| // make sure update propagated, since used next |
| fatal_if(bits(retVal, Gcn3ISA::MSB_PER_WORD) != |
| bits(origOperVal, high_bit), |
| "ERROR: SDWA word update not propagated: retVal: %d, " |
| "orig: %d\n", |
| bits(retVal, Gcn3ISA::MSB_PER_WORD), |
| bits(origOperVal, high_bit)); |
| // sign extended value depends on upper-most bit of the new word 0 |
| signExt_local = (signExt && |
| (bits(retVal, Gcn3ISA::MSB_PER_WORD, 0) & |
| 0x8000)); |
| |
| // process other word -- if sign extending, make them 1, else all |
| // 0's so leave as is |
| if (signExt_local) { |
| retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_WORD>(retVal); |
| } |
| } else { |
| assert(sel != SDWA_DWORD); // should have returned earlier |
| panic("Unimplemented SDWA select operation: %d\n", sel); |
| } |
| |
| return retVal; |
| } |
| |
| |
| /** |
| * sdwaInstSrcImpl is a helper function that selects the appropriate |
| * bits/bytes for each lane of the inputted source operand of an SDWA |
| * instruction, does the appropriate masking/padding/sign extending for the |
| * non-selected bits/bytes, and updates the operands values with the |
| * resultant value. |
| * |
| * The desired behavior is: |
| * 1. Select the appropriate bits/bytes based on sel: |
| * 0 (SDWA_BYTE_0): select data[7:0] |
| * 1 (SDWA_BYTE_1): select data[15:8] |
| * 2 (SDWA_BYTE_2): select data[23:16] |
| * 3 (SDWA_BYTE_3): select data[31:24] |
| * 4 (SDWA_WORD_0): select data[15:0] |
| * 5 (SDWA_WORD_1): select data[31:16] |
| * 6 (SDWA_DWORD): select data[31:0] |
| * 2. if sign extend is set, then sign extend the value |
| */ |
| template<typename T> |
| void sdwaInstSrcImpl(T & currOper, T & origCurrOper, |
| const SDWASelVals sel, const bool signExt) |
| { |
| // iterate over all lanes, setting appropriate, selected value |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| currOper[lane] = sdwaInstSrcImpl_helper(currOper[lane], |
| origCurrOper[lane], sel, |
| signExt); |
| } |
| } |
| |
| |
| /** |
| * sdwaInstDstImpl_helper contains the per-lane code for selecting the |
| * appropriate bytes/words of the lane and doing the appropriate |
| * masking/padding/sign extending. It returns the value after these |
| * operations are done on it. |
| */ |
| template<typename T> |
| T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal, |
| const bool clamp, const SDWASelVals sel, |
| const SDWADstVals unusedBits_format) |
| { |
| // local variables |
| int low_bit = 0, high_bit = 0; |
| bool signExt = (unusedBits_format == SDWA_UNUSED_SEXT); |
| //bool pad = (unusedBits_format == SDWA_UNUSED_PAD); |
| bool preserve = (unusedBits_format == SDWA_UNUSED_PRESERVE); |
| T retVal = 0, origBits_thisByte = 0, currBits_thisByte = 0, |
| origBits_thisWord = 0, currBits_thisWord = 0, newBits = 0; |
| |
| // if we're preserving all of the bits, then we can immediately return |
| if (unusedBits_format == SDWA_UNUSED_PRESERVE) { |
| assert(sel == SDWA_DWORD); |
| return currDstVal; |
| } else if (sel == SDWA_DWORD) { |
| // NOTE: users may set the unused bits variable to anything in this |
| // scenario, because it will be ignored |
| return currDstVal; |
| } |
| |
| if (sel < SDWA_WORD_0) { // we are selecting 1 byte |
| // if we sign extended depends on upper-most bit of byte 0 |
| signExt = (signExt && |
| (bits(currDstVal, Gcn3ISA::MSB_PER_WORD, 0) & 0x80)); |
| |
| for (int byte = 0; byte < 4; ++byte) { |
| low_bit = byte * Gcn3ISA::BITS_PER_BYTE; |
| high_bit = low_bit + Gcn3ISA::MSB_PER_BYTE; |
| /* |
| Options: |
| 1. byte == sel: we are keeping all bits in this byte |
| 2. preserve is set: keep this byte as is because the |
| output preserve flag is set |
| 3. byte > sel && signExt: we're sign extending and |
| this byte is one of the bytes we need to sign extend |
| */ |
| origBits_thisByte = bits(origDstVal, high_bit, low_bit); |
| currBits_thisByte = bits(currDstVal, high_bit, low_bit); |
| newBits = ((byte == sel) ? origBits_thisByte : |
| ((preserve) ? currBits_thisByte : |
| (((byte > sel) && signExt) ? 0xff : 0))); |
| retVal = insertBits(retVal, high_bit, low_bit, newBits); |
| } |
| } else if (sel < SDWA_DWORD) { // we are selecting 1 word |
| low_bit = 0; |
| high_bit = low_bit + Gcn3ISA::MSB_PER_WORD; |
| // if we sign extended depends on upper-most bit of word 0 |
| signExt = (signExt && |
| (bits(currDstVal, high_bit, low_bit) & 0x8000)); |
| |
| for (int word = 0; word < 2; ++word) { |
| low_bit = word * Gcn3ISA::BITS_PER_WORD; |
| high_bit = low_bit + Gcn3ISA::MSB_PER_WORD; |
| /* |
| Options: |
| 1. word == sel & 1: we are keeping all bits in this word |
| 2. preserve is set: keep this word as is because the |
| output preserve flag is set |
| 3. word > (sel & 1) && signExt: we're sign extending and |
| this word is one of the words we need to sign extend |
| */ |
| origBits_thisWord = bits(origDstVal, high_bit, low_bit); |
| currBits_thisWord = bits(currDstVal, high_bit, low_bit); |
| newBits = ((word == (sel & 0x1)) ? origBits_thisWord : |
| ((preserve) ? currBits_thisWord : |
| (((word > (sel & 0x1)) && signExt) ? 0xffff : 0))); |
| retVal = insertBits(retVal, high_bit, low_bit, newBits); |
| } |
| } else { |
| assert(sel != SDWA_DWORD); // should have returned earlier |
| panic("Unimplemented SDWA select operation: %d\n", sel); |
| } |
| |
| return retVal; |
| } |
| |
| |
| /** |
| * sdwaInstDestImpl is a helper function that selects the appropriate |
| * bits/bytes for the inputted dest operand of an SDWA instruction, does |
| * the appropriate masking/padding/sign extending for the non-selected |
| * bits/bytes, and updates the operands values with the resultant value. |
| * |
| * The desired behavior is: |
| * 1. Select the appropriate bits/bytes based on sel: |
| * 0 (SDWA_BYTE_0): select data[7:0] |
| * 1 (SDWA_BYTE_1): select data[15:8] |
| * 2 (SDWA_BYTE_2): select data[23:16] |
| * 3 (SDWA_BYTE_3): select data[31:24] |
| * 4 (SDWA_WORD_0): select data[15:0] |
| * 5 (SDWA_WORD_1): select data[31:16] |
| * 6 (SDWA_DWORD): select data[31:0] |
| * 2. either pad, sign extend, or select all bits based on the value of |
| * unusedBits_format: |
| * 0 (SDWA_UNUSED_PAD): pad all unused bits with 0 |
| * 1 (SDWA_UNUSED_SEXT): sign-extend upper bits; pad lower bits w/ 0 |
| * 2 (SDWA_UNUSED_PRESERVE): select data[31:0] |
| */ |
| template<typename T> |
| void sdwaInstDstImpl(T & dstOper, T & origDstOper, const bool clamp, |
| const SDWASelVals sel, |
| const SDWADstVals unusedBits_format) |
| { |
| // iterate over all lanes, setting appropriate, selected value |
| for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { |
| dstOper[lane] = sdwaInstDstImpl_helper(dstOper[lane], |
| origDstOper[lane], clamp, |
| sel, unusedBits_format); |
| } |
| } |
| |
| |
| /** |
| * processSDWA_srcHelper is a helper function for implementing sub d-word |
| * addressing instructions for the src operands. This function may be |
| * called by many different VOP1/VOP2/VOPC instructions to do operations |
| * within a register. This function is also agnostic of which operand it |
| * is operating on, so that it can be called for any src operand. |
| */ |
| template<typename T> |
| void processSDWA_src_helper(T & currSrc, T & origCurrSrc, |
| const SDWASelVals src_sel, |
| const bool src_signExt, const bool src_abs, |
| const bool src_neg) |
| { |
| /** |
| * STEP 1: check if the absolute value (ABS) or negation (NEG) tags |
| * are set. If so, do the appropriate action(s) on the src operand. |
| * |
| * NOTE: According to the CSim implementation, ABS takes priority over |
| * NEG. |
| */ |
| if (src_neg) { |
| currSrc.negModifier(); |
| } |
| |
| if (src_abs) { |
| currSrc.absModifier(); |
| } |
| |
| /** |
| * STEP 2: select the appropriate bits for each lane of source operand. |
| */ |
| sdwaInstSrcImpl(currSrc, origCurrSrc, src_sel, src_signExt); |
| } |
| |
| |
| /** |
| * processSDWA_src is a helper function for implementing sub d-word |
| * addressing instructions for the src operands. This function may be |
| * called by many different VOP1 instructions to do operations within a |
| * register. processSDWA_dst is called after the math, while |
| * processSDWA_src is called before the math. |
| */ |
| template<typename T> |
| void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0) |
| { |
| // local variables |
| const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL; |
| const bool src0_signExt = sdwaInst.SRC0_SEXT; |
| const bool src0_neg = sdwaInst.SRC0_NEG; |
| const bool src0_abs = sdwaInst.SRC0_ABS; |
| |
| // NOTE: difference between VOP1 and VOP2/VOPC is that there is no src1 |
| // operand. So ensure that SRC1 fields are not set, then call helper |
| // function only on src0. |
| assert(!sdwaInst.SRC1_SEXT); |
| assert(!sdwaInst.SRC1_NEG); |
| assert(!sdwaInst.SRC1_ABS); |
| |
| processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt, |
| src0_abs, src0_neg); |
| } |
| |
| |
| /** |
| * processSDWA_src is a helper function for implementing sub d-word |
| * addressing instructions. This function may be called by many different |
| * VOP2/VOPC instructions to do operations within a register. |
| * processSDWA_dst is called after the math, while processSDWA_src is |
| * called before the math. |
| */ |
| template<typename T> |
| void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0, |
| T & src1, T & origSrc1) |
| { |
| // local variables |
| const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL; |
| const bool src0_signExt = sdwaInst.SRC0_SEXT; |
| const bool src0_neg = sdwaInst.SRC0_NEG; |
| const bool src0_abs = sdwaInst.SRC0_ABS; |
| const SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL; |
| const bool src1_signExt = sdwaInst.SRC1_SEXT; |
| const bool src1_neg = sdwaInst.SRC1_NEG; |
| const bool src1_abs = sdwaInst.SRC1_ABS; |
| |
| processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt, |
| src0_abs, src0_neg); |
| processSDWA_src_helper(src1, origSrc1, src1_sel, src1_signExt, |
| src1_abs, src1_neg); |
| } |
| |
| |
| /** |
| * processSDWA_dst is a helper function for implementing sub d-word |
| * addressing instructions for the dst operand. This function may be |
| * called by many different VOP1/VOP2/VOPC instructions to do operations |
| * within a register. processSDWA_dst is called after the math, while |
| * processSDWA_src is called before the math. |
| */ |
| template<typename T> |
| void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T & dst, T & origDst) |
| { |
| // local variables |
| const SDWADstVals dst_unusedBits_format = |
| (SDWADstVals)sdwaInst.DST_UNUSED; |
| const SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL; |
| const bool clamp = sdwaInst.CLAMP; |
| |
| /** |
| * STEP 1: select the appropriate bits for dst and pad/sign-extend as |
| * appropriate. |
| */ |
| sdwaInstDstImpl(dst, origDst, clamp, dst_sel, dst_unusedBits_format); |
| } |
| } // namespace Gcn3ISA |
| } // namespace gem5 |
| |
| #endif // __ARCH_GCN3_INSTS_INST_UTIL_HH__ |