/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "arch/gcn3/insts/instructions.hh"

#include <cmath>

#include "arch/gcn3/insts/inst_util.hh"
#include "debug/GCN3.hh"
#include "debug/GPUSync.hh"
#include "gpu-compute/shader.hh"

namespace Gcn3ISA
{

    // Passes the InFmt_SOP2 pointer and the string "s_add_u32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_ADD_U32::Inst_SOP2__S_ADD_U32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_add_u32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_ADD_U32::~Inst_SOP2__S_ADD_U32() = default;

    // D.u = S0.u + S1.u;
    // SCC = (S0.u + S1.u >= 0x100000000ULL ? 1 : 0) is an unsigned
    // overflow/carry-out.
    void
    Inst_SOP2__S_ADD_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // The widened sum is only used to detect a result >= 2^32.
        const ScalarRegU64 wideSum = static_cast<ScalarRegU64>(a)
            + static_cast<ScalarRegU64>(b);

        dst = a + b;
        sccReg = (wideSum >= 0x100000000ULL) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_sub_u32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_SUB_U32::Inst_SOP2__S_SUB_U32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_sub_u32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_SUB_U32::~Inst_SOP2__S_SUB_U32() = default;

    // D.u = S0.u - S1.u;
    // SCC = (S1.u > S0.u ? 1 : 0) is an unsigned overflow or carry-out.
    void
    Inst_SOP2__S_SUB_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        dst = a - b;
        // SCC is set when the subtrahend is larger than the minuend.
        sccReg = (a < b) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_add_i32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_ADD_I32::Inst_SOP2__S_ADD_I32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_add_i32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_ADD_I32::~Inst_SOP2__S_ADD_I32() = default;

    // D.i = S0.i + S1.i;
    // SCC = (S0.u[31] == S1.u[31] && S0.u[31] != D.u[31]) is a signed
    // overflow.
    void
    Inst_SOP2__S_ADD_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        const auto a = src0.rawData();
        const auto b = src1.rawData();

        // Add in a wider unsigned type: overflowing a signed addition is
        // undefined behavior in C++, whereas unsigned arithmetic wraps. The
        // cast back to the operand type keeps the low bits of the sum
        // (modular narrowing is guaranteed from C++20 and is the documented
        // behavior of GCC and Clang before that).
        sdst = static_cast<decltype(a)>(static_cast<unsigned long long>(a)
            + static_cast<unsigned long long>(b));

        // Signed overflow: both sources have the same bit 31, and it differs
        // from bit 31 of the stored result.
        scc = (bits(a, 31) == bits(b, 31)
            && bits(a, 31) != bits(sdst.rawData(), 31))
            ? 1 : 0;

        sdst.write();
        scc.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_sub_i32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_SUB_I32::Inst_SOP2__S_SUB_I32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_sub_i32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_SUB_I32::~Inst_SOP2__S_SUB_I32() = default;

    // D.i = S0.i - S1.i;
    // SCC = (S0.u[31] != S1.u[31] && S0.u[31] != D.u[31]) is a signed
    // overflow.
    void
    Inst_SOP2__S_SUB_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        const auto a = src0.rawData();
        const auto b = src1.rawData();

        // Subtract in a wider unsigned type: overflowing a signed
        // subtraction is undefined behavior in C++, whereas unsigned
        // arithmetic wraps. The cast back to the operand type keeps the low
        // bits of the difference (modular narrowing is guaranteed from C++20
        // and is the documented behavior of GCC and Clang before that).
        sdst = static_cast<decltype(a)>(static_cast<unsigned long long>(a)
            - static_cast<unsigned long long>(b));

        // Signed overflow: the sources differ in bit 31, and bit 31 of the
        // stored result differs from that of S0.
        scc = (bits(a, 31) != bits(b, 31)
            && bits(a, 31) != bits(sdst.rawData(), 31)) ? 1 : 0;

        sdst.write();
        scc.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_addc_u32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_ADDC_U32::Inst_SOP2__S_ADDC_U32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_addc_u32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_ADDC_U32::~Inst_SOP2__S_ADDC_U32() = default;

    // D.u = S0.u + S1.u + SCC;
    // SCC = (S0.u + S1.u + SCC >= 0x100000000ULL ? 1 : 0) is an unsigned
    // overflow.
    void
    Inst_SOP2__S_ADDC_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        // SCC is an input here as well as an output, so it is read too.
        lhs.read();
        rhs.read();
        sccReg.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();
        const auto carryIn = sccReg.rawData();

        // The widened sum is only used to detect a result >= 2^32.
        const ScalarRegU64 wideSum = static_cast<ScalarRegU64>(a)
            + static_cast<ScalarRegU64>(b)
            + static_cast<ScalarRegU64>(carryIn);

        dst = a + b + carryIn;
        sccReg = (wideSum >= 0x100000000ULL) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_subb_u32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_SUBB_U32::Inst_SOP2__S_SUBB_U32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_subb_u32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_SUBB_U32::~Inst_SOP2__S_SUBB_U32() = default;

    // D.u = S0.u - S1.u - SCC;
    // SCC = (S1.u + SCC > S0.u ? 1 : 0) is an unsigned overflow.
    void
    Inst_SOP2__S_SUBB_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        // SCC is the borrow-in as well as the borrow-out, so it is read too.
        src0.read();
        src1.read();
        scc.read();

        const auto a = src0.rawData();
        const auto b = src1.rawData();
        const auto borrowIn = scc.rawData();

        sdst = a - b - borrowIn;

        // Widen before adding: S1 + SCC can exceed the operand width (e.g.
        // S1 = 0xffffffff with SCC = 1). A sum computed at operand width
        // would wrap to 0 and the borrow-out would be missed.
        scc = (static_cast<unsigned long long>(b) + borrowIn) > a ? 1 : 0;

        sdst.write();
        scc.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_min_i32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_MIN_I32::Inst_SOP2__S_MIN_I32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_min_i32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_MIN_I32::~Inst_SOP2__S_MIN_I32() = default;

    // D.i = (S0.i < S1.i) ? S0.i : S1.i;
    // SCC = 1 if S0 is chosen as the minimum value.
    void
    Inst_SOP2__S_MIN_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandI32 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // Take S1 only when it is strictly smaller; ties keep S0's value.
        dst = (b < a) ? b : a;
        // SCC is set only when S0 is strictly smaller than S1.
        sccReg = (a < b) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_min_u32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_MIN_U32::Inst_SOP2__S_MIN_U32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_min_u32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_MIN_U32::~Inst_SOP2__S_MIN_U32() = default;

    // D.u = (S0.u < S1.u) ? S0.u : S1.u;
    // SCC = 1 if S0 is chosen as the minimum value.
    void
    Inst_SOP2__S_MIN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // Take S1 only when it is strictly smaller; ties keep S0's value.
        dst = (b < a) ? b : a;
        // SCC is set only when S0 is strictly smaller than S1.
        sccReg = (a < b) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_max_i32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_MAX_I32::Inst_SOP2__S_MAX_I32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_max_i32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_MAX_I32::~Inst_SOP2__S_MAX_I32() = default;

    // D.i = (S0.i > S1.i) ? S0.i : S1.i;
    // SCC = 1 if S0 is chosen as the maximum value.
    void
    Inst_SOP2__S_MAX_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandI32 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // Take S1 only when it is strictly larger; ties keep S0's value.
        dst = (a < b) ? b : a;
        // SCC is set only when S0 is strictly larger than S1.
        sccReg = (b < a) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_max_u32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_MAX_U32::Inst_SOP2__S_MAX_U32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_max_u32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_MAX_U32::~Inst_SOP2__S_MAX_U32() = default;

    // D.u = (S0.u > S1.u) ? S0.u : S1.u;
    // SCC = 1 if S0 is chosen as the maximum value.
    void
    Inst_SOP2__S_MAX_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // Take S1 only when it is strictly larger; ties keep S0's value.
        dst = (a < b) ? b : a;
        // SCC is set only when S0 is strictly larger than S1.
        sccReg = (b < a) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_cselect_b32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_CSELECT_B32::Inst_SOP2__S_CSELECT_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_cselect_b32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_CSELECT_B32::~Inst_SOP2__S_CSELECT_B32() = default;

    // D.u = SCC ? S0.u : S1.u (conditional select).
    void
    Inst_SOP2__S_CSELECT_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 onTrue(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 onFalse(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 dst(gpuDynInst, instData.SDST);
        ConstScalarOperandU32 selector(gpuDynInst, REG_SCC);

        onTrue.read();
        onFalse.read();
        selector.read();

        // A non-zero SCC picks S0; otherwise S1 is used. SCC is not written.
        if (selector.rawData()) {
            dst = onTrue.rawData();
        } else {
            dst = onFalse.rawData();
        }

        dst.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_cselect_b64" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_CSELECT_B64::Inst_SOP2__S_CSELECT_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_cselect_b64")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_CSELECT_B64::~Inst_SOP2__S_CSELECT_B64() = default;

    // D.u64 = SCC ? S0.u64 : S1.u64 (conditional select).
    void
    Inst_SOP2__S_CSELECT_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 onTrue(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU64 onFalse(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 dst(gpuDynInst, instData.SDST);
        ConstScalarOperandU32 selector(gpuDynInst, REG_SCC);

        onTrue.read();
        onFalse.read();
        selector.read();

        // A non-zero SCC picks S0; otherwise S1 is used. SCC is not written.
        if (selector.rawData()) {
            dst = onTrue.rawData();
        } else {
            dst = onFalse.rawData();
        }

        dst.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_and_b32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_AND_B32::Inst_SOP2__S_AND_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_and_b32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_AND_B32::~Inst_SOP2__S_AND_B32() = default;

    // D.u = S0.u & S1.u;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_AND_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // Bitwise AND; SCC reflects whether the stored result is non-zero.
        dst = a & b;
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_and_b64" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_AND_B64::Inst_SOP2__S_AND_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_and_b64")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_AND_B64::~Inst_SOP2__S_AND_B64() = default;

    // D.u64 = S0.u64 & S1.u64;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_AND_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU64 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // Bitwise AND; SCC reflects whether the stored result is non-zero.
        dst = a & b;
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_or_b32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_OR_B32::Inst_SOP2__S_OR_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_or_b32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_OR_B32::~Inst_SOP2__S_OR_B32() = default;

    // D.u = S0.u | S1.u;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_OR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // Bitwise OR; SCC reflects whether the stored result is non-zero.
        dst = a | b;
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_or_b64" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_OR_B64::Inst_SOP2__S_OR_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_or_b64")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_OR_B64::~Inst_SOP2__S_OR_B64() = default;

    // D.u64 = S0.u64 | S1.u64;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_OR_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU64 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // Bitwise OR; SCC reflects whether the stored result is non-zero.
        dst = a | b;
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_xor_b32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_XOR_B32::Inst_SOP2__S_XOR_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_xor_b32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_XOR_B32::~Inst_SOP2__S_XOR_B32() = default;

    // D.u = S0.u ^ S1.u;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_XOR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // Bitwise XOR; SCC reflects whether the stored result is non-zero.
        dst = a ^ b;
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_xor_b64" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_XOR_B64::Inst_SOP2__S_XOR_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_xor_b64")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_XOR_B64::~Inst_SOP2__S_XOR_B64() = default;

    // D.u64 = S0.u64 ^ S1.u64;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_XOR_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU64 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // Bitwise XOR; SCC reflects whether the stored result is non-zero.
        dst = a ^ b;
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_andn2_b32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_ANDN2_B32::Inst_SOP2__S_ANDN2_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_andn2_b32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_ANDN2_B32::~Inst_SOP2__S_ANDN2_B32() = default;

    // D.u = S0.u & ~S1.u;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_ANDN2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // AND S0 with the complement of S1; SCC reflects whether the stored
        // result is non-zero.
        dst = a & ~b;
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_andn2_b64" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_ANDN2_B64::Inst_SOP2__S_ANDN2_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_andn2_b64")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_ANDN2_B64::~Inst_SOP2__S_ANDN2_B64() = default;

    // D.u64 = S0.u64 & ~S1.u64;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_ANDN2_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU64 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // AND S0 with the complement of S1; SCC reflects whether the stored
        // result is non-zero.
        dst = a & ~b;
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_orn2_b32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_ORN2_B32::Inst_SOP2__S_ORN2_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_orn2_b32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_ORN2_B32::~Inst_SOP2__S_ORN2_B32() = default;

    // D.u = S0.u | ~S1.u;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_ORN2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // OR S0 with the complement of S1; SCC reflects whether the stored
        // result is non-zero.
        dst = a | ~b;
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_orn2_b64" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_ORN2_B64::Inst_SOP2__S_ORN2_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_orn2_b64")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_ORN2_B64::~Inst_SOP2__S_ORN2_B64() = default;

    // D.u64 = S0.u64 | ~S1.u64;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_ORN2_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU64 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // OR S0 with the complement of S1; SCC reflects whether the stored
        // result is non-zero.
        dst = a | ~b;
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_nand_b32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_NAND_B32::Inst_SOP2__S_NAND_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_nand_b32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_NAND_B32::~Inst_SOP2__S_NAND_B32() = default;

    // D.u = ~(S0.u & S1.u);
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_NAND_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // Complement of the bitwise AND; SCC reflects whether the stored
        // result is non-zero.
        dst = ~(a & b);
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_nand_b64" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_NAND_B64::Inst_SOP2__S_NAND_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_nand_b64")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_NAND_B64::~Inst_SOP2__S_NAND_B64() = default;

    // D.u64 = ~(S0.u64 & S1.u64);
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_NAND_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU64 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // Complement of the bitwise AND; SCC reflects whether the stored
        // result is non-zero.
        dst = ~(a & b);
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_nor_b32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_NOR_B32::Inst_SOP2__S_NOR_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_nor_b32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_NOR_B32::~Inst_SOP2__S_NOR_B32() = default;

    // D.u = ~(S0.u | S1.u);
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_NOR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // Complement of the bitwise OR; SCC reflects whether the stored
        // result is non-zero.
        dst = ~(a | b);
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_nor_b64" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_NOR_B64::Inst_SOP2__S_NOR_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_nor_b64")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_NOR_B64::~Inst_SOP2__S_NOR_B64() = default;

    // D.u64 = ~(S0.u64 | S1.u64);
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_NOR_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU64 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // Complement of the bitwise OR; SCC reflects whether the stored
        // result is non-zero.
        dst = ~(a | b);
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_xnor_b32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_XNOR_B32::Inst_SOP2__S_XNOR_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_xnor_b32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_XNOR_B32::~Inst_SOP2__S_XNOR_B32() = default;

    // D.u = ~(S0.u ^ S1.u);
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_XNOR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // Complement of the bitwise XOR; SCC reflects whether the stored
        // result is non-zero.
        dst = ~(a ^ b);
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_xnor_b64" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_XNOR_B64::Inst_SOP2__S_XNOR_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_xnor_b64")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_XNOR_B64::~Inst_SOP2__S_XNOR_B64() = default;

    // D.u64 = ~(S0.u64 ^ S1.u64);
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_XNOR_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU64 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const auto a = lhs.rawData();
        const auto b = rhs.rawData();

        // Complement of the bitwise XOR; SCC reflects whether the stored
        // result is non-zero.
        dst = ~(a ^ b);
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_lshl_b32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_LSHL_B32::Inst_SOP2__S_LSHL_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_lshl_b32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_LSHL_B32::~Inst_SOP2__S_LSHL_B32() = default;

    // D.u = S0.u << S1.u[4:0];
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_LSHL_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 value(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 shiftSrc(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        value.read();
        shiftSrc.read();

        // Only bits [4:0] of S1 are used as the shift amount.
        const auto shiftAmount = bits(shiftSrc.rawData(), 4, 0);

        dst = value.rawData() << shiftAmount;
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_lshl_b64" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_LSHL_B64::Inst_SOP2__S_LSHL_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_lshl_b64")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_LSHL_B64::~Inst_SOP2__S_LSHL_B64() = default;

    // D.u64 = S0.u64 << S1.u[5:0];
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_LSHL_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 value(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 shiftSrc(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        value.read();
        shiftSrc.read();

        // Only bits [5:0] of S1 are used as the shift amount.
        const auto shiftAmount = bits(shiftSrc.rawData(), 5, 0);

        dst = value.rawData() << shiftAmount;
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_lshr_b32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_LSHR_B32::Inst_SOP2__S_LSHR_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_lshr_b32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_LSHR_B32::~Inst_SOP2__S_LSHR_B32() = default;

    // D.u = S0.u >> S1.u[4:0];
    // SCC = 1 if result is non-zero.
    // The vacated bits are set to zero.
    void
    Inst_SOP2__S_LSHR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 value(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 shiftSrc(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        value.read();
        shiftSrc.read();

        // Only bits [4:0] of S1 are used as the shift amount.
        const auto shiftAmount = bits(shiftSrc.rawData(), 4, 0);

        dst = value.rawData() >> shiftAmount;
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_lshr_b64" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_LSHR_B64::Inst_SOP2__S_LSHR_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_lshr_b64")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_LSHR_B64::~Inst_SOP2__S_LSHR_B64() = default;

    // D.u64 = S0.u64 >> S1.u[5:0];
    // SCC = 1 if result is non-zero.
    // The vacated bits are set to zero.
    void
    Inst_SOP2__S_LSHR_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 value(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 shiftSrc(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        value.read();
        shiftSrc.read();

        // Only bits [5:0] of S1 are used as the shift amount.
        const auto shiftAmount = bits(shiftSrc.rawData(), 5, 0);

        dst = value.rawData() >> shiftAmount;
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_ashr_i32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_ASHR_I32::Inst_SOP2__S_ASHR_I32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_ashr_i32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_ASHR_I32::~Inst_SOP2__S_ASHR_I32() = default;

    // D.i = signext(S0.i) >> S1.u[4:0];
    // SCC = 1 if result is non-zero.
    // The vacated bits are set to the sign bit of the input value.
    void
    Inst_SOP2__S_ASHR_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 value(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 shiftSrc(gpuDynInst, instData.SSRC1);
        ScalarOperandI32 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        value.read();
        shiftSrc.read();

        // Only bits [4:0] of S1 are used as the shift amount.
        const auto shiftAmount = bits(shiftSrc.rawData(), 4, 0);

        // The source is read through a signed operand, so this is a signed
        // right shift.
        dst = value.rawData() >> shiftAmount;
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_ashr_i64" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_ASHR_I64::Inst_SOP2__S_ASHR_I64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_ashr_i64")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_ASHR_I64::~Inst_SOP2__S_ASHR_I64() = default;

    // D.i64 = signext(S0.i64) >> S1.u[5:0];
    // SCC = 1 if result is non-zero.
    // The vacated bits are set to the sign bit of the input value.
    void
    Inst_SOP2__S_ASHR_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI64 value(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 shiftSrc(gpuDynInst, instData.SSRC1);
        ScalarOperandI64 dst(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccReg(gpuDynInst, REG_SCC);

        value.read();
        shiftSrc.read();

        // Only bits [5:0] of S1 are used as the shift amount.
        const auto shiftAmount = bits(shiftSrc.rawData(), 5, 0);

        // The source is read through a signed operand, so this is a signed
        // right shift.
        dst = value.rawData() >> shiftAmount;
        sccReg = (dst.rawData() != 0) ? 1 : 0;

        dst.write();
        sccReg.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_bfm_b32" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_BFM_B32::Inst_SOP2__S_BFM_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_bfm_b32")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_BFM_B32::~Inst_SOP2__S_BFM_B32() = default;

    // D.u = ((1 << S0.u[4:0]) - 1) << S1.u[4:0] (bitfield mask).
    void
    Inst_SOP2__S_BFM_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);

        src0.read();
        src1.read();

        // S0[4:0] is the mask width and S1[4:0] is the bit offset.
        const auto width = bits(src0.rawData(), 4, 0);
        const auto offset = bits(src1.rawData(), 4, 0);

        // Build the mask from unsigned literals. With a signed int `1`, a
        // width of 31 makes `(1 << 31) - 1` overflow, and shifting set bits
        // into or past the sign bit is undefined before C++20. Unsigned
        // shifts simply discard the bits that fall off the top.
        sdst = ((1U << width) - 1U) << offset;

        sdst.write();
    }

    // Passes the InFmt_SOP2 pointer and the string "s_bfm_b64" to the
    // Inst_SOP2 base, then sets the ALU flag.
    Inst_SOP2__S_BFM_B64::Inst_SOP2__S_BFM_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_bfm_b64")
    {
        this->setFlag(ALU);
    }

    // The destructor performs no work of its own.
    Inst_SOP2__S_BFM_B64::~Inst_SOP2__S_BFM_B64() = default;

    // 64-bit bitfield mask:
    //   D.u64 = ((1ULL << S0.u[5:0]) - 1) << S1.u[5:0]
    // S0[5:0] gives the number of one-bits, S1[5:0] the position they are
    // shifted up to. SCC is not written.
    void
    Inst_SOP2__S_BFM_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 widthSrc(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 offsetSrc(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 dest(gpuDynInst, instData.SDST);

        widthSrc.read();
        offsetSrc.read();

        const auto width = bits(widthSrc.rawData(), 5, 0);
        const auto offset = bits(offsetSrc.rawData(), 5, 0);
        const auto ones = (1ULL << width) - 1;

        dest = ones << offset;

        dest.write();
    }

    // Builds the s_mul_i32 instruction: passes the encoding and mnemonic to
    // the Inst_SOP2 base and sets the ALU flag.
    Inst_SOP2__S_MUL_I32::Inst_SOP2__S_MUL_I32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_mul_i32")
    {
        this->setFlag(ALU);
    } // Inst_SOP2__S_MUL_I32

    // Destructor for s_mul_i32; the body is empty, so no work is done here.
    Inst_SOP2__S_MUL_I32::~Inst_SOP2__S_MUL_I32()
    {
    } // ~Inst_SOP2__S_MUL_I32

    // D.i = S0.i * S1.i.
    // Only the low 32 bits of the product are kept; SCC is not written.
    void
    Inst_SOP2__S_MUL_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);

        src0.read();
        src1.read();

        // Multiply modulo 2^32 in unsigned arithmetic. The low 32 bits are
        // the same as for the signed product, but an out-of-range result is
        // well defined instead of being signed-overflow undefined behaviour.
        const ScalarRegU32 lhs = static_cast<ScalarRegU32>(src0.rawData());
        const ScalarRegU32 rhs = static_cast<ScalarRegU32>(src1.rawData());
        sdst = static_cast<ScalarRegI32>(lhs * rhs);

        sdst.write();
    }

    // Builds the s_bfe_u32 instruction: passes the encoding and mnemonic to
    // the Inst_SOP2 base and sets the ALU flag.
    Inst_SOP2__S_BFE_U32::Inst_SOP2__S_BFE_U32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_bfe_u32")
    {
        this->setFlag(ALU);
    } // Inst_SOP2__S_BFE_U32

    // Destructor for s_bfe_u32; the body is empty, so no work is done here.
    Inst_SOP2__S_BFE_U32::~Inst_SOP2__S_BFE_U32()
    {
    } // ~Inst_SOP2__S_BFE_U32

    // Bit field extract. S0 is Data, S1[4:0] is field offset, S1[22:16] is
    // field width.
    // D.u = (S0.u >> S1.u[4:0]) & ((1 << S1.u[22:16]) - 1);
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_BFE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        const ScalarRegU32 offset = bits(src1.rawData(), 4, 0);
        const ScalarRegU32 width = bits(src1.rawData(), 22, 16);

        // The width field is seven bits wide, so it can name 32 or more
        // bits. Shifting a 32-bit value by that much is undefined, so the
        // mask saturates to all ones and the rest of the word is kept.
        const ScalarRegU32 mask =
            (width >= 32) ? ~0U : ((1U << width) - 1U);

        sdst = (src0.rawData() >> offset) & mask;
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    // Builds the s_bfe_i32 instruction: passes the encoding and mnemonic to
    // the Inst_SOP2 base and sets the ALU flag.
    Inst_SOP2__S_BFE_I32::Inst_SOP2__S_BFE_I32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_bfe_i32")
    {
        this->setFlag(ALU);
    } // Inst_SOP2__S_BFE_I32

    // Destructor for s_bfe_i32; the body is empty, so no work is done here.
    Inst_SOP2__S_BFE_I32::~Inst_SOP2__S_BFE_I32()
    {
    } // ~Inst_SOP2__S_BFE_I32

    // Bit field extract. S0 is Data, S1[4:0] is field offset, S1[22:16] is
    // field width.
    // D.i = (S0.i >> S1.u[4:0]) & ((1 << S1.u[22:16]) - 1);
    // Sign-extend the result;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_BFE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        const ScalarRegU32 offset = bits(src1.rawData(), 4, 0);
        const ScalarRegU32 width = bits(src1.rawData(), 22, 16);

        // As in the original expression, this relies on >> of a negative
        // value replicating the sign bit.
        const ScalarRegI32 shifted = src0.rawData() >> offset;

        if (width == 0) {
            // An empty field extracts nothing.
            sdst = 0;
        } else if (width >= 32) {
            // The field covers the rest of the word; the shift above has
            // already filled the upper bits with the sign. This also avoids
            // an undefined shift by 32 or more when building the mask.
            sdst = shifted;
        } else {
            const ScalarRegU32 mask = (1U << width) - 1U;
            ScalarRegU32 field = static_cast<ScalarRegU32>(shifted) & mask;

            // Sign-extend: replicate the field's top bit into every bit
            // above the field.
            if (field & (1U << (width - 1))) {
                field |= ~mask;
            }

            sdst = static_cast<ScalarRegI32>(field);
        }

        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    // Builds the s_bfe_u64 instruction: passes the encoding and mnemonic to
    // the Inst_SOP2 base and sets the ALU flag.
    Inst_SOP2__S_BFE_U64::Inst_SOP2__S_BFE_U64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_bfe_u64")
    {
        this->setFlag(ALU);
    } // Inst_SOP2__S_BFE_U64

    // Destructor for s_bfe_u64; the body is empty, so no work is done here.
    Inst_SOP2__S_BFE_U64::~Inst_SOP2__S_BFE_U64()
    {
    } // ~Inst_SOP2__S_BFE_U64

    // Bit field extract. S0 is Data, S1[5:0] is field offset, S1[22:16] is
    // field width.
    // D.u64 = (S0.u64 >> S1.u[5:0]) & ((1 << S1.u[22:16]) - 1);
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_BFE_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        const ScalarRegU32 offset = bits(src1.rawData(), 5, 0);
        const ScalarRegU32 width = bits(src1.rawData(), 22, 16);

        // The mask must be built in 64-bit arithmetic: a plain int literal
        // cannot represent fields of 32 bits or more. Widths of 64 or more
        // saturate to all ones, because a shift by the full width is
        // undefined.
        const uint64_t mask =
            (width >= 64) ? ~0ULL : ((1ULL << width) - 1ULL);

        sdst = (src0.rawData() >> offset) & mask;
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    // Builds the s_bfe_i64 instruction: passes the encoding and mnemonic to
    // the Inst_SOP2 base and sets the ALU flag.
    Inst_SOP2__S_BFE_I64::Inst_SOP2__S_BFE_I64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_bfe_i64")
    {
        this->setFlag(ALU);
    } // Inst_SOP2__S_BFE_I64

    // Destructor for s_bfe_i64; the body is empty, so no work is done here.
    Inst_SOP2__S_BFE_I64::~Inst_SOP2__S_BFE_I64()
    {
    } // ~Inst_SOP2__S_BFE_I64

    // Bit field extract. S0 is Data, S1[5:0] is field offset, S1[22:16] is
    // field width.
    // D.i64 = (S0.i64 >> S1.u[5:0]) & ((1 << S1.u[22:16]) - 1);
    // Sign-extend result;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_BFE_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandI64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        const ScalarRegU32 offset = bits(src1.rawData(), 5, 0);
        const ScalarRegU32 width = bits(src1.rawData(), 22, 16);

        // As in the original expression, this relies on >> of a negative
        // value replicating the sign bit.
        const int64_t shifted = src0.rawData() >> offset;

        if (width == 0) {
            // An empty field extracts nothing.
            sdst = 0;
        } else if (width >= 64) {
            // The field covers the rest of the value; the shift above has
            // already filled the upper bits with the sign. This also avoids
            // an undefined shift by 64 or more when building the mask.
            sdst = shifted;
        } else {
            // 64-bit mask: an int literal cannot describe fields of 32 bits
            // or more.
            const uint64_t mask = (1ULL << width) - 1ULL;
            uint64_t field = static_cast<uint64_t>(shifted) & mask;

            // Sign-extend: replicate the field's top bit into every bit
            // above the field.
            if (field & (1ULL << (width - 1))) {
                field |= ~mask;
            }

            sdst = static_cast<int64_t>(field);
        }

        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    // Builds the s_cbranch_g_fork instruction: passes the encoding and
    // mnemonic to the Inst_SOP2 base and sets the Branch flag.
    Inst_SOP2__S_CBRANCH_G_FORK::Inst_SOP2__S_CBRANCH_G_FORK(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_cbranch_g_fork")
    {
        this->setFlag(Branch);
    } // Inst_SOP2__S_CBRANCH_G_FORK

    // Destructor for s_cbranch_g_fork; the body is empty, so no work is done
    // here.
    Inst_SOP2__S_CBRANCH_G_FORK::~Inst_SOP2__S_CBRANCH_G_FORK()
    {
    } // ~Inst_SOP2__S_CBRANCH_G_FORK

    // Conditional branch using the branch-stack, where
    //   S0 = compare mask (vcc or any sgpr), and
    //   S1 = 64-bit byte address of the target instruction.
    // No functional model is provided: the body only calls
    // panicUnimplemented().
    void
    Inst_SOP2__S_CBRANCH_G_FORK::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    }

    // Builds the s_absdiff_i32 instruction: passes the encoding and mnemonic
    // to the Inst_SOP2 base and sets the ALU flag.
    Inst_SOP2__S_ABSDIFF_I32::Inst_SOP2__S_ABSDIFF_I32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_absdiff_i32")
    {
        this->setFlag(ALU);
    } // Inst_SOP2__S_ABSDIFF_I32

    // Destructor for s_absdiff_i32; the body is empty, so no work is done
    // here.
    Inst_SOP2__S_ABSDIFF_I32::~Inst_SOP2__S_ABSDIFF_I32()
    {
    } // ~Inst_SOP2__S_ABSDIFF_I32

    // D.i = S0.i - S1.i;
    // if (D.i < 0) then D.i = -D.i;
    // SCC = 1 if result is non-zero.
    // Compute the absolute value of difference between two values.
    void
    Inst_SOP2__S_ABSDIFF_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        // Fetch both sources before using them. Every other execute() in
        // this file calls read() ahead of rawData(); without it the
        // difference is computed from operands that were never loaded.
        src0.read();
        src1.read();

        sdst = std::abs(src0.rawData() - src1.rawData());
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    // Builds the s_rfe_restore_b64 instruction by passing the encoding and
    // mnemonic to the Inst_SOP2 base. No instruction flag is set here.
    Inst_SOP2__S_RFE_RESTORE_B64::Inst_SOP2__S_RFE_RESTORE_B64(
          InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_rfe_restore_b64")
    {
    } // Inst_SOP2__S_RFE_RESTORE_B64

    // Destructor for s_rfe_restore_b64; the body is empty, so no work is
    // done here.
    Inst_SOP2__S_RFE_RESTORE_B64::~Inst_SOP2__S_RFE_RESTORE_B64()
    {
    } // ~Inst_SOP2__S_RFE_RESTORE_B64

    // Return from exception handler and continue.
    // No functional model is provided: the body only calls
    // panicUnimplemented().
    void
    Inst_SOP2__S_RFE_RESTORE_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    }

    // Builds the s_movk_i32 instruction: passes the encoding and mnemonic to
    // the Inst_SOPK base and sets the ALU flag.
    Inst_SOPK__S_MOVK_I32::Inst_SOPK__S_MOVK_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_movk_i32")
    {
        this->setFlag(ALU);
    } // Inst_SOPK__S_MOVK_I32

    // Destructor for s_movk_i32; the body is empty, so no work is done here.
    Inst_SOPK__S_MOVK_I32::~Inst_SOPK__S_MOVK_I32()
    {
    } // ~Inst_SOPK__S_MOVK_I32

    // Move a sign-extended 16-bit immediate into the destination:
    //   D.i = signext(SIMM16).
    void
    Inst_SOPK__S_MOVK_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarOperandI32 dest(gpuDynInst, instData.SDST);
        const ScalarRegI32 immediate =
            static_cast<ScalarRegI32>(sext<16>(instData.SIMM16));

        dest = immediate;

        dest.write();
    }

    // Builds the s_cmovk_i32 instruction: passes the encoding and mnemonic to
    // the Inst_SOPK base and sets the ALU flag.
    Inst_SOPK__S_CMOVK_I32::Inst_SOPK__S_CMOVK_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmovk_i32")
    {
        this->setFlag(ALU);
    } // Inst_SOPK__S_CMOVK_I32

    // Destructor for s_cmovk_i32; the body is empty, so no work is done here.
    Inst_SOPK__S_CMOVK_I32::~Inst_SOPK__S_CMOVK_I32()
    {
    } // ~Inst_SOPK__S_CMOVK_I32

    // Conditional move with sign extension:
    //   if (SCC) then D.i = signext(SIMM16); else NOP.
    void
    Inst_SOPK__S_CMOVK_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        const ScalarRegI32 immediate =
            static_cast<ScalarRegI32>(sext<16>(instData.SIMM16));
        ScalarOperandI32 dest(gpuDynInst, instData.SDST);
        ConstScalarOperandU32 sccIn(gpuDynInst, REG_SCC);

        sccIn.read();

        if (!sccIn.rawData()) {
            // SCC is clear: the destination is left unwritten.
            return;
        }

        dest = immediate;
        dest.write();
    }

    // Builds the s_cmpk_eq_i32 instruction: passes the encoding and mnemonic
    // to the Inst_SOPK base and sets the ALU flag.
    Inst_SOPK__S_CMPK_EQ_I32::Inst_SOPK__S_CMPK_EQ_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_eq_i32")
    {
        this->setFlag(ALU);
    } // Inst_SOPK__S_CMPK_EQ_I32

    // Destructor for s_cmpk_eq_i32; the body is empty, so no work is done
    // here.
    Inst_SOPK__S_CMPK_EQ_I32::~Inst_SOPK__S_CMPK_EQ_I32()
    {
    } // ~Inst_SOPK__S_CMPK_EQ_I32

    // Signed compare against the immediate:
    //   SCC = (S0.i == signext(SIMM16)).
    // S0 is the register named by the SDST field.
    void
    Inst_SOPK__S_CMPK_EQ_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 lhs(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);
        const ScalarRegI32 imm =
            static_cast<ScalarRegI32>(sext<16>(instData.SIMM16));

        lhs.read();

        const bool isEqual = (lhs.rawData() == imm);
        sccOut = isEqual ? 1 : 0;

        sccOut.write();
    }

    // Builds the s_cmpk_lg_i32 instruction: passes the encoding and mnemonic
    // to the Inst_SOPK base and sets the ALU flag.
    Inst_SOPK__S_CMPK_LG_I32::Inst_SOPK__S_CMPK_LG_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_lg_i32")
    {
        this->setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LG_I32

    // Destructor for s_cmpk_lg_i32; the body is empty, so no work is done
    // here.
    Inst_SOPK__S_CMPK_LG_I32::~Inst_SOPK__S_CMPK_LG_I32()
    {
    } // ~Inst_SOPK__S_CMPK_LG_I32

    // Signed compare against the immediate:
    //   SCC = (S0.i != signext(SIMM16)).
    // S0 is the register named by the SDST field.
    void
    Inst_SOPK__S_CMPK_LG_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 lhs(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);
        const ScalarRegI32 imm =
            static_cast<ScalarRegI32>(sext<16>(instData.SIMM16));

        lhs.read();

        const bool differs = (lhs.rawData() != imm);
        sccOut = differs ? 1 : 0;

        sccOut.write();
    }

    // Builds the s_cmpk_gt_i32 instruction: passes the encoding and mnemonic
    // to the Inst_SOPK base and sets the ALU flag.
    Inst_SOPK__S_CMPK_GT_I32::Inst_SOPK__S_CMPK_GT_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_gt_i32")
    {
        this->setFlag(ALU);
    } // Inst_SOPK__S_CMPK_GT_I32

    // Destructor for s_cmpk_gt_i32; the body is empty, so no work is done
    // here.
    Inst_SOPK__S_CMPK_GT_I32::~Inst_SOPK__S_CMPK_GT_I32()
    {
    } // ~Inst_SOPK__S_CMPK_GT_I32

    // Signed compare against the immediate:
    //   SCC = (S0.i > signext(SIMM16)).
    // S0 is the register named by the SDST field.
    void
    Inst_SOPK__S_CMPK_GT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 lhs(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);
        const ScalarRegI32 imm =
            static_cast<ScalarRegI32>(sext<16>(instData.SIMM16));

        lhs.read();

        const bool isGreater = (lhs.rawData() > imm);
        sccOut = isGreater ? 1 : 0;

        sccOut.write();
    }

    // Builds the s_cmpk_ge_i32 instruction: passes the encoding and mnemonic
    // to the Inst_SOPK base and sets the ALU flag.
    Inst_SOPK__S_CMPK_GE_I32::Inst_SOPK__S_CMPK_GE_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_ge_i32")
    {
        this->setFlag(ALU);
    } // Inst_SOPK__S_CMPK_GE_I32

    // Destructor for s_cmpk_ge_i32; the body is empty, so no work is done
    // here.
    Inst_SOPK__S_CMPK_GE_I32::~Inst_SOPK__S_CMPK_GE_I32()
    {
    } // ~Inst_SOPK__S_CMPK_GE_I32

    // Signed compare against the immediate:
    //   SCC = (S0.i >= signext(SIMM16)).
    // S0 is the register named by the SDST field.
    void
    Inst_SOPK__S_CMPK_GE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 lhs(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);
        const ScalarRegI32 imm =
            static_cast<ScalarRegI32>(sext<16>(instData.SIMM16));

        lhs.read();

        const bool isGreaterOrEqual = (lhs.rawData() >= imm);
        sccOut = isGreaterOrEqual ? 1 : 0;

        sccOut.write();
    }

    // Builds the s_cmpk_lt_i32 instruction: passes the encoding and mnemonic
    // to the Inst_SOPK base and sets the ALU flag.
    Inst_SOPK__S_CMPK_LT_I32::Inst_SOPK__S_CMPK_LT_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_lt_i32")
    {
        this->setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LT_I32

    // Destructor for s_cmpk_lt_i32; the body is empty, so no work is done
    // here.
    Inst_SOPK__S_CMPK_LT_I32::~Inst_SOPK__S_CMPK_LT_I32()
    {
    } // ~Inst_SOPK__S_CMPK_LT_I32

    // Signed compare against the immediate:
    //   SCC = (S0.i < signext(SIMM16)).
    // S0 is the register named by the SDST field.
    void
    Inst_SOPK__S_CMPK_LT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 lhs(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);
        const ScalarRegI32 imm =
            static_cast<ScalarRegI32>(sext<16>(instData.SIMM16));

        lhs.read();

        const bool isLess = (lhs.rawData() < imm);
        sccOut = isLess ? 1 : 0;

        sccOut.write();
    }

    // Builds the s_cmpk_le_i32 instruction: passes the encoding and mnemonic
    // to the Inst_SOPK base and sets the ALU flag.
    Inst_SOPK__S_CMPK_LE_I32::Inst_SOPK__S_CMPK_LE_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_le_i32")
    {
        this->setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LE_I32

    // Destructor for s_cmpk_le_i32; the body is empty, so no work is done
    // here.
    Inst_SOPK__S_CMPK_LE_I32::~Inst_SOPK__S_CMPK_LE_I32()
    {
    } // ~Inst_SOPK__S_CMPK_LE_I32

    // Signed compare against the immediate:
    //   SCC = (S0.i <= signext(SIMM16)).
    // S0 is the register named by the SDST field.
    void
    Inst_SOPK__S_CMPK_LE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 lhs(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);
        const ScalarRegI32 imm =
            static_cast<ScalarRegI32>(sext<16>(instData.SIMM16));

        lhs.read();

        const bool isLessOrEqual = (lhs.rawData() <= imm);
        sccOut = isLessOrEqual ? 1 : 0;

        sccOut.write();
    }

    // Builds the s_cmpk_eq_u32 instruction: passes the encoding and mnemonic
    // to the Inst_SOPK base and sets the ALU flag.
    Inst_SOPK__S_CMPK_EQ_U32::Inst_SOPK__S_CMPK_EQ_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_eq_u32")
    {
        this->setFlag(ALU);
    } // Inst_SOPK__S_CMPK_EQ_U32

    // Destructor for s_cmpk_eq_u32; the body is empty, so no work is done
    // here.
    Inst_SOPK__S_CMPK_EQ_U32::~Inst_SOPK__S_CMPK_EQ_U32()
    {
    } // ~Inst_SOPK__S_CMPK_EQ_U32

    // Unsigned compare against the immediate:
    //   SCC = (S0.u == SIMM16).
    // S0 is the register named by the SDST field; SIMM16 is converted to an
    // unsigned 32-bit value without sign extension.
    void
    Inst_SOPK__S_CMPK_EQ_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);
        const ScalarRegU32 imm = static_cast<ScalarRegU32>(instData.SIMM16);

        lhs.read();

        const bool isEqual = (lhs.rawData() == imm);
        sccOut = isEqual ? 1 : 0;

        sccOut.write();
    }

    // Builds the s_cmpk_lg_u32 instruction: passes the encoding and mnemonic
    // to the Inst_SOPK base and sets the ALU flag.
    Inst_SOPK__S_CMPK_LG_U32::Inst_SOPK__S_CMPK_LG_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_lg_u32")
    {
        this->setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LG_U32

    // Destructor for s_cmpk_lg_u32; the body is empty, so no work is done
    // here.
    Inst_SOPK__S_CMPK_LG_U32::~Inst_SOPK__S_CMPK_LG_U32()
    {
    } // ~Inst_SOPK__S_CMPK_LG_U32

    // Unsigned compare against the immediate:
    //   SCC = (S0.u != SIMM16).
    // S0 is the register named by the SDST field; SIMM16 is converted to an
    // unsigned 32-bit value without sign extension.
    void
    Inst_SOPK__S_CMPK_LG_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);
        const ScalarRegU32 imm = static_cast<ScalarRegU32>(instData.SIMM16);

        lhs.read();

        const bool differs = (lhs.rawData() != imm);
        sccOut = differs ? 1 : 0;

        sccOut.write();
    }

    // Builds the s_cmpk_gt_u32 instruction: passes the encoding and mnemonic
    // to the Inst_SOPK base and sets the ALU flag.
    Inst_SOPK__S_CMPK_GT_U32::Inst_SOPK__S_CMPK_GT_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_gt_u32")
    {
        this->setFlag(ALU);
    } // Inst_SOPK__S_CMPK_GT_U32

    // Destructor for s_cmpk_gt_u32; the body is empty, so no work is done
    // here.
    Inst_SOPK__S_CMPK_GT_U32::~Inst_SOPK__S_CMPK_GT_U32()
    {
    } // ~Inst_SOPK__S_CMPK_GT_U32

    // Unsigned compare against the immediate:
    //   SCC = (S0.u > SIMM16).
    // S0 is the register named by the SDST field; SIMM16 is converted to an
    // unsigned 32-bit value without sign extension.
    void
    Inst_SOPK__S_CMPK_GT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);
        const ScalarRegU32 imm = static_cast<ScalarRegU32>(instData.SIMM16);

        lhs.read();

        const bool isGreater = (lhs.rawData() > imm);
        sccOut = isGreater ? 1 : 0;

        sccOut.write();
    }

    // Builds the s_cmpk_ge_u32 instruction: passes the encoding and mnemonic
    // to the Inst_SOPK base and sets the ALU flag.
    Inst_SOPK__S_CMPK_GE_U32::Inst_SOPK__S_CMPK_GE_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_ge_u32")
    {
        this->setFlag(ALU);
    } // Inst_SOPK__S_CMPK_GE_U32

    // Destructor for s_cmpk_ge_u32; the body is empty, so no work is done
    // here.
    Inst_SOPK__S_CMPK_GE_U32::~Inst_SOPK__S_CMPK_GE_U32()
    {
    } // ~Inst_SOPK__S_CMPK_GE_U32

    // Unsigned compare against the immediate:
    //   SCC = (S0.u >= SIMM16).
    // S0 is the register named by the SDST field; SIMM16 is converted to an
    // unsigned 32-bit value without sign extension.
    void
    Inst_SOPK__S_CMPK_GE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);
        const ScalarRegU32 imm = static_cast<ScalarRegU32>(instData.SIMM16);

        lhs.read();

        const bool isGreaterOrEqual = (lhs.rawData() >= imm);
        sccOut = isGreaterOrEqual ? 1 : 0;

        sccOut.write();
    }

    // Builds the s_cmpk_lt_u32 instruction: passes the encoding and mnemonic
    // to the Inst_SOPK base and sets the ALU flag.
    Inst_SOPK__S_CMPK_LT_U32::Inst_SOPK__S_CMPK_LT_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_lt_u32")
    {
        this->setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LT_U32

    // Destructor for s_cmpk_lt_u32; the body is empty, so no work is done
    // here.
    Inst_SOPK__S_CMPK_LT_U32::~Inst_SOPK__S_CMPK_LT_U32()
    {
    } // ~Inst_SOPK__S_CMPK_LT_U32

    // Unsigned compare against the immediate:
    //   SCC = (S0.u < SIMM16).
    // S0 is the register named by the SDST field; SIMM16 is converted to an
    // unsigned 32-bit value without sign extension.
    void
    Inst_SOPK__S_CMPK_LT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);
        const ScalarRegU32 imm = static_cast<ScalarRegU32>(instData.SIMM16);

        lhs.read();

        const bool isLess = (lhs.rawData() < imm);
        sccOut = isLess ? 1 : 0;

        sccOut.write();
    }

    // Builds the s_cmpk_le_u32 instruction: passes the encoding and mnemonic
    // to the Inst_SOPK base and sets the ALU flag.
    Inst_SOPK__S_CMPK_LE_U32::Inst_SOPK__S_CMPK_LE_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_le_u32")
    {
        this->setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LE_U32

    // Destructor for s_cmpk_le_u32; the body is empty, so no work is done
    // here.
    Inst_SOPK__S_CMPK_LE_U32::~Inst_SOPK__S_CMPK_LE_U32()
    {
    } // ~Inst_SOPK__S_CMPK_LE_U32

    // Unsigned compare against the immediate:
    //   SCC = (S0.u <= SIMM16).
    // S0 is the register named by the SDST field; SIMM16 is converted to an
    // unsigned 32-bit value without sign extension.
    void
    Inst_SOPK__S_CMPK_LE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);
        const ScalarRegU32 imm = static_cast<ScalarRegU32>(instData.SIMM16);

        lhs.read();

        const bool isLessOrEqual = (lhs.rawData() <= imm);
        sccOut = isLessOrEqual ? 1 : 0;

        sccOut.write();
    }

    // Builds the s_addk_i32 instruction: passes the encoding and mnemonic to
    // the Inst_SOPK base and sets the ALU flag.
    Inst_SOPK__S_ADDK_I32::Inst_SOPK__S_ADDK_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_addk_i32")
    {
        this->setFlag(ALU);
    } // Inst_SOPK__S_ADDK_I32

    // Destructor for s_addk_i32; the body is empty, so no work is done here.
    Inst_SOPK__S_ADDK_I32::~Inst_SOPK__S_ADDK_I32()
    {
    } // ~Inst_SOPK__S_ADDK_I32

    // D.i = D.i + signext(SIMM16);
    // SCC = overflow.
    // Overflow is reported when both addends have the same sign and the sum
    // has the opposite sign.
    void
    Inst_SOPK__S_ADDK_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI16 simm16 = instData.SIMM16;
        ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        const ScalarRegI32 addend =
            static_cast<ScalarRegI32>(sext<16>(simm16));

        // Add modulo 2^32 in unsigned arithmetic. The overflowing case is
        // exactly what SCC has to report, and a signed add that overflows is
        // undefined behaviour, so the wrapped sum is produced explicitly.
        sdst = static_cast<ScalarRegI32>(
            static_cast<ScalarRegU32>(src.rawData())
            + static_cast<ScalarRegU32>(addend));

        scc = (bits(src.rawData(), 31) == bits(simm16, 15)
            && bits(src.rawData(), 31) != bits(sdst.rawData(), 31)) ? 1 : 0;

        sdst.write();
        scc.write();
    }

    // Builds the s_mulk_i32 instruction: passes the encoding and mnemonic to
    // the Inst_SOPK base and sets the ALU flag.
    Inst_SOPK__S_MULK_I32::Inst_SOPK__S_MULK_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_mulk_i32")
    {
        this->setFlag(ALU);
    } // Inst_SOPK__S_MULK_I32

    // Destructor for s_mulk_i32; the body is empty, so no work is done here.
    Inst_SOPK__S_MULK_I32::~Inst_SOPK__S_MULK_I32()
    {
    } // ~Inst_SOPK__S_MULK_I32

    // D.i = D.i * signext(SIMM16).
    // Only the low 32 bits of the product are kept; SCC is not written.
    void
    Inst_SOPK__S_MULK_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI16 simm16 = instData.SIMM16;
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);

        sdst.read();

        const ScalarRegI32 factor =
            static_cast<ScalarRegI32>(sext<16>(simm16));

        // Multiply modulo 2^32 in unsigned arithmetic. The low 32 bits are
        // the same as for the signed product, but an out-of-range result is
        // well defined instead of being signed-overflow undefined behaviour.
        sdst = static_cast<ScalarRegI32>(
            static_cast<ScalarRegU32>(sdst.rawData())
            * static_cast<ScalarRegU32>(factor));

        sdst.write();
    }

    // Builds the s_cbranch_i_fork instruction: passes the encoding and
    // mnemonic to the Inst_SOPK base and sets the Branch flag.
    Inst_SOPK__S_CBRANCH_I_FORK::Inst_SOPK__S_CBRANCH_I_FORK(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cbranch_i_fork")
    {
        this->setFlag(Branch);
    } // Inst_SOPK__S_CBRANCH_I_FORK

    // Destructor for s_cbranch_i_fork; the body is empty, so no work is done
    // here.
    Inst_SOPK__S_CBRANCH_I_FORK::~Inst_SOPK__S_CBRANCH_I_FORK()
    {
    } // ~Inst_SOPK__S_CBRANCH_I_FORK

    // Conditional branch using the branch-stack, where
    //   S0 = compare mask (vcc or any sgpr), and
    //   SIMM16 = signed DWORD branch offset relative to next instruction.
    // No functional model is provided: the body only calls
    // panicUnimplemented().
    void
    Inst_SOPK__S_CBRANCH_I_FORK::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    }

    // Builds the s_getreg_b32 instruction by passing the encoding and
    // mnemonic to the Inst_SOPK base. No instruction flag is set here.
    Inst_SOPK__S_GETREG_B32::Inst_SOPK__S_GETREG_B32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_getreg_b32")
    {
    } // Inst_SOPK__S_GETREG_B32

    // Destructor for s_getreg_b32; the body is empty, so no work is done
    // here.
    Inst_SOPK__S_GETREG_B32::~Inst_SOPK__S_GETREG_B32()
    {
    } // ~Inst_SOPK__S_GETREG_B32

    // D.u = hardware-reg. Reads some or all of a hardware register into the
    // LSBs of D.
    // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size
    // is 1..32.
    // No functional model is provided: the body only calls
    // panicUnimplemented().
    void
    Inst_SOPK__S_GETREG_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    }

    // Builds the s_setreg_b32 instruction: passes the encoding and mnemonic
    // to the Inst_SOPK base and sets the ALU flag.
    Inst_SOPK__S_SETREG_B32::Inst_SOPK__S_SETREG_B32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_setreg_b32")
    {
        this->setFlag(ALU);
    } // Inst_SOPK__S_SETREG_B32

    // Destructor for s_setreg_b32; the body is empty, so no work is done
    // here.
    Inst_SOPK__S_SETREG_B32::~Inst_SOPK__S_SETREG_B32()
    {
    } // ~Inst_SOPK__S_SETREG_B32

    // hardware-reg = S0.u. Write some or all of the LSBs of D into a hardware
    // register.
    // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size
    // is 1..32 (the encoded field holds size - 1).
    //
    // The selected bit-field of the register addressed by hwRegId is replaced
    // with the low bits of SDST and written back. Only a 2-bit field at
    // offset 0 or 4 of hwRegId 1 returns normally, after a one-time warning;
    // every other target ends in panicUnimplemented().
    void
    Inst_SOPK__S_SETREG_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI16 simm16 = instData.SIMM16;
        ScalarRegU32 hwregId = simm16 & 0x3f;
        ScalarRegU32 offset = (simm16 >> 6) & 31;
        ScalarRegU32 size = ((simm16 >> 11) & 31) + 1;

        ScalarOperandU32 hwreg(gpuDynInst, hwregId);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
        hwreg.read();
        sdst.read();

        // Store value from SDST to part of the hardware register.
        // size can be 32, and shifting a 32-bit value by 32 is undefined, so
        // the full-width field mask is produced explicitly.
        const ScalarRegU32 fieldMask =
            (size >= 32) ? ~0U : ((1U << size) - 1U);
        ScalarRegU32 mask = fieldMask << offset;
        hwreg = ((hwreg.rawData() & ~mask)
                        | ((sdst.rawData() << offset) & mask));
        hwreg.write();

        // set MODE register to control the behavior of single precision
        // floating-point numbers: denormal mode or round mode
        if (hwregId==1 && size==2
                        && (offset==4 || offset==0)) {
            warn_once("Be cautious that s_setreg_b32 has no real effect "
                            "on FP modes: %s\n", gpuDynInst->disassemble());
            return;
        }

        // panic if not changing MODE of floating-point numbers
        panicUnimplemented();
    }

    // Builds the s_setreg_imm32_b32 instruction: passes the encoding and
    // mnemonic to the Inst_SOPK base and sets the ALU flag.
    Inst_SOPK__S_SETREG_IMM32_B32::Inst_SOPK__S_SETREG_IMM32_B32(
          InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_setreg_imm32_b32")
    {
        this->setFlag(ALU);
    } // Inst_SOPK__S_SETREG_IMM32_B32

    // Destructor for s_setreg_imm32_b32; the body is empty, so no work is
    // done here.
    Inst_SOPK__S_SETREG_IMM32_B32::~Inst_SOPK__S_SETREG_IMM32_B32()
    {
    } // ~Inst_SOPK__S_SETREG_IMM32_B32

    // Write some or all of the LSBs of IMM32 into a hardware register; this
    // instruction requires a 32-bit literal constant.
    // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size
    // is 1..32 (the encoded field holds size - 1).
    //
    // The selected bit-field of the register addressed by hwRegId is replaced
    // with the low bits of the literal and written back. Only a 2-bit field
    // at offset 0 or 4 of hwRegId 1 returns normally, after a one-time
    // warning; every other target ends in panicUnimplemented().
    void
    Inst_SOPK__S_SETREG_IMM32_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI16 simm16 = instData.SIMM16;
        ScalarRegU32 hwregId = simm16 & 0x3f;
        ScalarRegU32 offset = (simm16 >> 6) & 31;
        ScalarRegU32 size = ((simm16 >> 11) & 31) + 1;

        ScalarOperandU32 hwreg(gpuDynInst, hwregId);
        ScalarRegU32 simm32 = extData.imm_u32;
        hwreg.read();

        // size can be 32, and shifting a 32-bit value by 32 is undefined, so
        // the full-width field mask is produced explicitly.
        const ScalarRegU32 fieldMask =
            (size >= 32) ? ~0U : ((1U << size) - 1U);
        ScalarRegU32 mask = fieldMask << offset;
        hwreg = ((hwreg.rawData() & ~mask)
                    | ((simm32 << offset) & mask));
        hwreg.write();

        if (hwregId==1 && size==2
                        && (offset==4 || offset==0)) {
            warn_once("Be cautious that s_setreg_imm32_b32 has no real effect "
                            "on FP modes: %s\n", gpuDynInst->disassemble());
            return;
        }

        // panic if not changing MODE of floating-point numbers
        panicUnimplemented();
    }

    // Builds the s_mov_b32 instruction: passes the encoding and mnemonic to
    // the Inst_SOP1 base and sets the ALU flag.
    Inst_SOP1__S_MOV_B32::Inst_SOP1__S_MOV_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_mov_b32")
    {
        this->setFlag(ALU);
    } // Inst_SOP1__S_MOV_B32

    // Destructor for s_mov_b32; the body is empty, so no work is done here.
    Inst_SOP1__S_MOV_B32::~Inst_SOP1__S_MOV_B32()
    {
    } // ~Inst_SOP1__S_MOV_B32

    // 32-bit scalar move: D.u = S0.u. SCC is not written.
    void
    Inst_SOP1__S_MOV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 source(gpuDynInst, instData.SSRC0);
        ScalarOperandU32 dest(gpuDynInst, instData.SDST);

        source.read();

        const auto value = source.rawData();
        dest = value;

        dest.write();
    }

    // Builds the s_mov_b64 instruction: passes the encoding and mnemonic to
    // the Inst_SOP1 base and sets the ALU flag.
    Inst_SOP1__S_MOV_B64::Inst_SOP1__S_MOV_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_mov_b64")
    {
        this->setFlag(ALU);
    } // Inst_SOP1__S_MOV_B64

    // Destructor for s_mov_b64; the body is empty, so no work is done here.
    Inst_SOP1__S_MOV_B64::~Inst_SOP1__S_MOV_B64()
    {
    } // ~Inst_SOP1__S_MOV_B64

    // 64-bit scalar move: D.u64 = S0.u64. SCC is not written.
    void
    Inst_SOP1__S_MOV_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 source(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 dest(gpuDynInst, instData.SDST);

        source.read();

        const auto value = source.rawData();
        dest = value;

        dest.write();
    }

    // Builds the s_cmov_b32 instruction: passes the encoding and mnemonic to
    // the Inst_SOP1 base and sets the ALU flag.
    Inst_SOP1__S_CMOV_B32::Inst_SOP1__S_CMOV_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_cmov_b32")
    {
        this->setFlag(ALU);
    } // Inst_SOP1__S_CMOV_B32

    // Destructor for s_cmov_b32; the body is empty, so no work is done here.
    Inst_SOP1__S_CMOV_B32::~Inst_SOP1__S_CMOV_B32()
    {
    } // ~Inst_SOP1__S_CMOV_B32

    // 32-bit conditional move:
    //   if (SCC) then D.u = S0.u; else NOP.
    void
    Inst_SOP1__S_CMOV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 source(gpuDynInst, instData.SSRC0);
        ScalarOperandU32 dest(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccIn(gpuDynInst, REG_SCC);

        source.read();
        sccIn.read();

        if (!sccIn.rawData()) {
            // SCC is clear: the destination is left unwritten.
            return;
        }

        dest = source.rawData();
        dest.write();
    }

    // Builds the s_cmov_b64 instruction: passes the encoding and mnemonic to
    // the Inst_SOP1 base and sets the ALU flag.
    Inst_SOP1__S_CMOV_B64::Inst_SOP1__S_CMOV_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_cmov_b64")
    {
        this->setFlag(ALU);
    } // Inst_SOP1__S_CMOV_B64

    // Destructor for s_cmov_b64; the body is empty, so no work is done here.
    Inst_SOP1__S_CMOV_B64::~Inst_SOP1__S_CMOV_B64()
    {
    } // ~Inst_SOP1__S_CMOV_B64

    // 64-bit conditional move:
    //   if (SCC) then D.u64 = S0.u64; else NOP.
    void
    Inst_SOP1__S_CMOV_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 source(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 dest(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccIn(gpuDynInst, REG_SCC);

        source.read();
        sccIn.read();

        if (!sccIn.rawData()) {
            // SCC is clear: the destination is left unwritten.
            return;
        }

        dest = source.rawData();
        dest.write();
    }

    // Builds the s_not_b32 instruction: passes the encoding and mnemonic to
    // the Inst_SOP1 base and sets the ALU flag.
    Inst_SOP1__S_NOT_B32::Inst_SOP1__S_NOT_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_not_b32")
    {
        this->setFlag(ALU);
    } // Inst_SOP1__S_NOT_B32

    // Destructor for s_not_b32; the body is empty, so no work is done here.
    Inst_SOP1__S_NOT_B32::~Inst_SOP1__S_NOT_B32()
    {
    } // ~Inst_SOP1__S_NOT_B32

    // 32-bit bitwise negation:
    //   D.u = ~S0.u;
    //   SCC = 1 if result is non-zero.
    void
    Inst_SOP1__S_NOT_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 source(gpuDynInst, instData.SSRC0);
        ScalarOperandU32 dest(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);

        source.read();

        const auto inverted = ~source.rawData();
        dest = inverted;

        const bool nonZero = (dest.rawData() != 0);
        sccOut = nonZero ? 1 : 0;

        dest.write();
        sccOut.write();
    }

    // Builds the s_not_b64 instruction: passes the encoding and mnemonic to
    // the Inst_SOP1 base and sets the ALU flag.
    Inst_SOP1__S_NOT_B64::Inst_SOP1__S_NOT_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_not_b64")
    {
        this->setFlag(ALU);
    } // Inst_SOP1__S_NOT_B64

    // Destructor for s_not_b64; the body is empty, so no work is done here.
    Inst_SOP1__S_NOT_B64::~Inst_SOP1__S_NOT_B64()
    {
    } // ~Inst_SOP1__S_NOT_B64

    // 64-bit bitwise negation:
    //   D.u64 = ~S0.u64;
    //   SCC = 1 if result is non-zero.
    void
    Inst_SOP1__S_NOT_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 source(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 dest(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);

        source.read();

        const auto inverted = ~source.rawData();
        dest = inverted;

        const bool nonZero = (dest.rawData() != 0);
        sccOut = nonZero ? 1 : 0;

        dest.write();
        sccOut.write();
    }

    // Builds the s_wqm_b32 instruction: passes the encoding and mnemonic to
    // the Inst_SOP1 base and sets the ALU flag.
    Inst_SOP1__S_WQM_B32::Inst_SOP1__S_WQM_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_wqm_b32")
    {
        this->setFlag(ALU);
    } // Inst_SOP1__S_WQM_B32

    // Destructor for s_wqm_b32; the body is empty, so no work is done here.
    Inst_SOP1__S_WQM_B32::~Inst_SOP1__S_WQM_B32()
    {
    } // ~Inst_SOP1__S_WQM_B32

    // Computes whole quad mode for a 32-bit active/valid mask by passing S0
    // to wholeQuadMode().
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP1__S_WQM_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 laneMask(gpuDynInst, instData.SSRC0);
        ScalarOperandU32 dest(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);

        laneMask.read();

        dest = wholeQuadMode(laneMask.rawData());

        const bool nonZero = (dest.rawData() != 0);
        sccOut = nonZero ? 1 : 0;

        dest.write();
        sccOut.write();
    }

    // Builds the s_wqm_b64 instruction: passes the encoding and mnemonic to
    // the Inst_SOP1 base and sets the ALU flag.
    Inst_SOP1__S_WQM_B64::Inst_SOP1__S_WQM_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_wqm_b64")
    {
        this->setFlag(ALU);
    } // Inst_SOP1__S_WQM_B64

    // Destructor for s_wqm_b64; the body is empty, so no work is done here.
    Inst_SOP1__S_WQM_B64::~Inst_SOP1__S_WQM_B64()
    {
    } // ~Inst_SOP1__S_WQM_B64

    // Computes whole quad mode for a 64-bit active/valid mask by passing S0
    // to wholeQuadMode().
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP1__S_WQM_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 laneMask(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 dest(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);

        laneMask.read();

        dest = wholeQuadMode(laneMask.rawData());

        const bool nonZero = (dest.rawData() != 0);
        sccOut = nonZero ? 1 : 0;

        dest.write();
        sccOut.write();
    }

    // Builds the s_brev_b32 instruction: passes the encoding and mnemonic to
    // the Inst_SOP1 base and sets the ALU flag.
    Inst_SOP1__S_BREV_B32::Inst_SOP1__S_BREV_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_brev_b32")
    {
        this->setFlag(ALU);
    } // Inst_SOP1__S_BREV_B32

    // Destructor for s_brev_b32; the body is empty, so no work is done here.
    Inst_SOP1__S_BREV_B32::~Inst_SOP1__S_BREV_B32()
    {
    } // ~Inst_SOP1__S_BREV_B32

    // 32-bit bit reversal: D.u[31:0] = S0.u[0:31], computed by
    // reverseBits(). SCC is not written.
    void
    Inst_SOP1__S_BREV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 source(gpuDynInst, instData.SSRC0);
        ScalarOperandU32 dest(gpuDynInst, instData.SDST);

        source.read();

        const auto input = source.rawData();
        dest = reverseBits(input);

        dest.write();
    }

    // Builds the s_brev_b64 instruction: passes the encoding and mnemonic to
    // the Inst_SOP1 base and sets the ALU flag.
    Inst_SOP1__S_BREV_B64::Inst_SOP1__S_BREV_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_brev_b64")
    {
        this->setFlag(ALU);
    } // Inst_SOP1__S_BREV_B64

    // Destructor for s_brev_b64; the body is empty, so no work is done here.
    Inst_SOP1__S_BREV_B64::~Inst_SOP1__S_BREV_B64()
    {
    } // ~Inst_SOP1__S_BREV_B64

    // 64-bit bit reversal: D.u64[63:0] = S0.u64[0:63], computed by
    // reverseBits(). SCC is not written.
    void
    Inst_SOP1__S_BREV_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 source(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 dest(gpuDynInst, instData.SDST);

        source.read();

        const auto input = source.rawData();
        dest = reverseBits(input);

        dest.write();
    }

    // Builds the s_bcnt0_i32_b32 instruction: passes the encoding and
    // mnemonic to the Inst_SOP1 base and sets the ALU flag.
    Inst_SOP1__S_BCNT0_I32_B32::Inst_SOP1__S_BCNT0_I32_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_bcnt0_i32_b32")
    {
        this->setFlag(ALU);
    } // Inst_SOP1__S_BCNT0_I32_B32

    // Destructor for s_bcnt0_i32_b32; the body is empty, so no work is done
    // here.
    Inst_SOP1__S_BCNT0_I32_B32::~Inst_SOP1__S_BCNT0_I32_B32()
    {
    } // ~Inst_SOP1__S_BCNT0_I32_B32

    // Count the zero bits of a 32-bit source:
    //   D.i = CountZeroBits(S0.u), computed by countZeroBits();
    //   SCC = 1 if result is non-zero.
    void
    Inst_SOP1__S_BCNT0_I32_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 source(gpuDynInst, instData.SSRC0);
        ScalarOperandI32 dest(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);

        source.read();

        dest = countZeroBits(source.rawData());

        const bool nonZero = (dest.rawData() != 0);
        sccOut = nonZero ? 1 : 0;

        dest.write();
        sccOut.write();
    }

    // Builds the s_bcnt0_i32_b64 instruction: passes the encoding and
    // mnemonic to the Inst_SOP1 base and sets the ALU flag.
    Inst_SOP1__S_BCNT0_I32_B64::Inst_SOP1__S_BCNT0_I32_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_bcnt0_i32_b64")
    {
        this->setFlag(ALU);
    } // Inst_SOP1__S_BCNT0_I32_B64

    // Destructor for s_bcnt0_i32_b64; the body is empty, so no work is done
    // here.
    Inst_SOP1__S_BCNT0_I32_B64::~Inst_SOP1__S_BCNT0_I32_B64()
    {
    } // ~Inst_SOP1__S_BCNT0_I32_B64

    // Count the zero bits of a 64-bit source:
    //   D.i = CountZeroBits(S0.u64), computed by countZeroBits();
    //   SCC = 1 if result is non-zero.
    void
    Inst_SOP1__S_BCNT0_I32_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 source(gpuDynInst, instData.SSRC0);
        ScalarOperandI32 dest(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);

        source.read();

        dest = countZeroBits(source.rawData());

        const bool nonZero = (dest.rawData() != 0);
        sccOut = nonZero ? 1 : 0;

        dest.write();
        sccOut.write();
    }

    // Builds the s_bcnt1_i32_b32 instruction: passes the encoding and
    // mnemonic to the Inst_SOP1 base and sets the ALU flag.
    Inst_SOP1__S_BCNT1_I32_B32::Inst_SOP1__S_BCNT1_I32_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_bcnt1_i32_b32")
    {
        this->setFlag(ALU);
    } // Inst_SOP1__S_BCNT1_I32_B32

    // Destructor for s_bcnt1_i32_b32; the body is empty, so no work is done
    // here.
    Inst_SOP1__S_BCNT1_I32_B32::~Inst_SOP1__S_BCNT1_I32_B32()
    {
    } // ~Inst_SOP1__S_BCNT1_I32_B32

    // Count the one bits of a 32-bit source:
    //   D.i = CountOneBits(S0.u), computed by popCount();
    //   SCC = 1 if result is non-zero.
    void
    Inst_SOP1__S_BCNT1_I32_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 source(gpuDynInst, instData.SSRC0);
        ScalarOperandI32 dest(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);

        source.read();

        dest = popCount(source.rawData());

        const bool nonZero = (dest.rawData() != 0);
        sccOut = nonZero ? 1 : 0;

        dest.write();
        sccOut.write();
    }

    // Builds the s_bcnt1_i32_b64 instruction: passes the encoding and
    // mnemonic to the Inst_SOP1 base and sets the ALU flag.
    Inst_SOP1__S_BCNT1_I32_B64::Inst_SOP1__S_BCNT1_I32_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_bcnt1_i32_b64")
    {
        this->setFlag(ALU);
    } // Inst_SOP1__S_BCNT1_I32_B64

    // Destructor for s_bcnt1_i32_b64; the body is empty, so no work is done
    // here.
    Inst_SOP1__S_BCNT1_I32_B64::~Inst_SOP1__S_BCNT1_I32_B64()
    {
    } // ~Inst_SOP1__S_BCNT1_I32_B64

    // Count the one bits of a 64-bit source:
    //   D.i = CountOneBits(S0.u64), computed by popCount();
    //   SCC = 1 if result is non-zero.
    void
    Inst_SOP1__S_BCNT1_I32_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 source(gpuDynInst, instData.SSRC0);
        ScalarOperandI32 dest(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccOut(gpuDynInst, REG_SCC);

        source.read();

        dest = popCount(source.rawData());

        const bool nonZero = (dest.rawData() != 0);
        sccOut = nonZero ? 1 : 0;

        dest.write();
        sccOut.write();
    }

    Inst_SOP1__S_FF0_I32_B32::Inst_SOP1__S_FF0_I32_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_ff0_i32_b32")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_FF0_I32_B32

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_FF0_I32_B32::~Inst_SOP1__S_FF0_I32_B32() = default;

    // D.i = FindFirstZero(S0.u);
    // If no zeros are found, return -1.
    // Returns the bit position of the first zero from the LSB.
    void
    Inst_SOP1__S_FF0_I32_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 operand(gpuDynInst, instData.SSRC0);
        ScalarOperandI32 result(gpuDynInst, instData.SDST);

        operand.read();
        const auto inputBits = operand.rawData();

        // Store whatever findFirstZero() reports for the 32-bit source.
        result = findFirstZero(inputBits);
        result.write();
    }

    Inst_SOP1__S_FF0_I32_B64::Inst_SOP1__S_FF0_I32_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_ff0_i32_b64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_FF0_I32_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_FF0_I32_B64::~Inst_SOP1__S_FF0_I32_B64() = default;

    // D.i = FindFirstZero(S0.u64);
    // If no zeros are found, return -1.
    // Returns the bit position of the first zero from the LSB.
    void
    Inst_SOP1__S_FF0_I32_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 operand(gpuDynInst, instData.SSRC0);
        ScalarOperandI32 result(gpuDynInst, instData.SDST);

        operand.read();
        const auto inputBits = operand.rawData();

        // Store whatever findFirstZero() reports for the 64-bit source.
        result = findFirstZero(inputBits);
        result.write();
    }

    Inst_SOP1__S_FF1_I32_B32::Inst_SOP1__S_FF1_I32_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_ff1_i32_b32")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_FF1_I32_B32

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_FF1_I32_B32::~Inst_SOP1__S_FF1_I32_B32() = default;

    // D.i = FindFirstOne(S0.u);
    // If no ones are found, return -1.
    // Returns the bit position of the first one from the LSB.
    void
    Inst_SOP1__S_FF1_I32_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 operand(gpuDynInst, instData.SSRC0);
        ScalarOperandI32 result(gpuDynInst, instData.SDST);

        operand.read();
        const auto inputBits = operand.rawData();

        // Store whatever findFirstOne() reports for the 32-bit source.
        result = findFirstOne(inputBits);
        result.write();
    }

    Inst_SOP1__S_FF1_I32_B64::Inst_SOP1__S_FF1_I32_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_ff1_i32_b64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_FF1_I32_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_FF1_I32_B64::~Inst_SOP1__S_FF1_I32_B64() = default;

    // D.i = FindFirstOne(S0.u64);
    // If no ones are found, return -1.
    // Returns the bit position of the first one from the LSB.
    void
    Inst_SOP1__S_FF1_I32_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 operand(gpuDynInst, instData.SSRC0);
        ScalarOperandI32 result(gpuDynInst, instData.SDST);

        operand.read();
        const auto inputBits = operand.rawData();

        // Store whatever findFirstOne() reports for the 64-bit source.
        result = findFirstOne(inputBits);
        result.write();
    }

    Inst_SOP1__S_FLBIT_I32_B32::Inst_SOP1__S_FLBIT_I32_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_flbit_i32_b32")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_FLBIT_I32_B32

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_FLBIT_I32_B32::~Inst_SOP1__S_FLBIT_I32_B32() = default;

    // D.i = FindFirstOne(S0.u);
    // If no ones are found, return -1.
    // Counts how many zeros before the first one starting from the MSB.
    void
    Inst_SOP1__S_FLBIT_I32_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 operand(gpuDynInst, instData.SSRC0);
        ScalarOperandI32 result(gpuDynInst, instData.SDST);

        operand.read();
        const auto inputBits = operand.rawData();

        // Store the countZeroBitsMsb() result for the 32-bit source.
        result = countZeroBitsMsb(inputBits);
        result.write();
    }

    Inst_SOP1__S_FLBIT_I32_B64::Inst_SOP1__S_FLBIT_I32_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_flbit_i32_b64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_FLBIT_I32_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_FLBIT_I32_B64::~Inst_SOP1__S_FLBIT_I32_B64() = default;

    // D.i = FindFirstOne(S0.u64);
    // If no ones are found, return -1.
    // Counts how many zeros before the first one starting from the MSB.
    void
    Inst_SOP1__S_FLBIT_I32_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 operand(gpuDynInst, instData.SSRC0);
        ScalarOperandI32 result(gpuDynInst, instData.SDST);

        operand.read();
        const auto inputBits = operand.rawData();

        // Store the countZeroBitsMsb() result for the 64-bit source.
        result = countZeroBitsMsb(inputBits);
        result.write();
    }

    Inst_SOP1__S_FLBIT_I32::Inst_SOP1__S_FLBIT_I32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_flbit_i32")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_FLBIT_I32

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_FLBIT_I32::~Inst_SOP1__S_FLBIT_I32() = default;

    // D.i = FirstOppositeSignBit(S0.i);
    // If S0.i == 0 or S0.i == -1 (all bits are the same), return -1.
    // Counts how many bits in a row (from MSB to LSB) are the same as the
    // sign bit.
    void
    Inst_SOP1__S_FLBIT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 operand(gpuDynInst, instData.SSRC0);
        ScalarOperandI32 result(gpuDynInst, instData.SDST);

        operand.read();
        const auto signedValue = operand.rawData();

        // Store the firstOppositeSignBit() result for the 32-bit source.
        result = firstOppositeSignBit(signedValue);
        result.write();
    }

    Inst_SOP1__S_FLBIT_I32_I64::Inst_SOP1__S_FLBIT_I32_I64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_flbit_i32_i64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_FLBIT_I32_I64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_FLBIT_I32_I64::~Inst_SOP1__S_FLBIT_I32_I64() = default;

    // D.i = FirstOppositeSignBit(S0.i64);
    // If S0.i == 0 or S0.i == -1 (all bits are the same), return -1.
    // Counts how many bits in a row (from MSB to LSB) are the same as the
    // sign bit.
    void
    Inst_SOP1__S_FLBIT_I32_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI64 operand(gpuDynInst, instData.SSRC0);
        ScalarOperandI32 result(gpuDynInst, instData.SDST);

        operand.read();
        const auto signedValue = operand.rawData();

        // Store the firstOppositeSignBit() result for the 64-bit source.
        result = firstOppositeSignBit(signedValue);
        result.write();
    }

    Inst_SOP1__S_SEXT_I32_I8::Inst_SOP1__S_SEXT_I32_I8(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_sext_i32_i8")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_SEXT_I32_I8

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_SEXT_I32_I8::~Inst_SOP1__S_SEXT_I32_I8() = default;

    // D.i = signext(S0.i[7:0]) (sign extension).
    void
    Inst_SOP1__S_SEXT_I32_I8::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);

        src.read();

        // The extracted field S0[7:0] is 8 bits wide, so its sign bit is
        // bit 7. std::numeric_limits<>::digits of a signed 8-bit type is 7
        // (the sign bit is not counted), which would extend from bit 6.
        // NOTE(review): assumes sext<N>() takes the field width in bits --
        // confirm against its definition.
        sdst = sext<8>(bits(src.rawData(), 7, 0));

        sdst.write();
    }

    Inst_SOP1__S_SEXT_I32_I16::Inst_SOP1__S_SEXT_I32_I16(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_sext_i32_i16")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_SEXT_I32_I16

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_SEXT_I32_I16::~Inst_SOP1__S_SEXT_I32_I16() = default;

    // D.i = signext(S0.i[15:0]) (sign extension).
    void
    Inst_SOP1__S_SEXT_I32_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);

        src.read();

        // The extracted field S0[15:0] is 16 bits wide, so its sign bit is
        // bit 15. std::numeric_limits<>::digits of a signed 16-bit type is
        // 15 (the sign bit is not counted), which would extend from bit 14.
        // NOTE(review): assumes sext<N>() takes the field width in bits --
        // confirm against its definition.
        sdst = sext<16>(bits(src.rawData(), 15, 0));

        sdst.write();
    }

    Inst_SOP1__S_BITSET0_B32::Inst_SOP1__S_BITSET0_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_bitset0_b32")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_BITSET0_B32

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_BITSET0_B32::~Inst_SOP1__S_BITSET0_B32() = default;

    // D.u[S0.u[4:0]] = 0.
    void
    Inst_SOP1__S_BITSET0_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);

        src.read();
        // Only one bit of the destination changes, so start from its current
        // contents; otherwise the other bits written back would not be the
        // register's previous value.
        // NOTE(review): assumes read() on the destination operand loads the
        // current register value, as it does for sources -- confirm.
        sdst.read();

        // Clear the bit selected by S0[4:0].
        sdst.setBit(bits(src.rawData(), 4, 0), 0);

        sdst.write();
    }

    Inst_SOP1__S_BITSET0_B64::Inst_SOP1__S_BITSET0_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_bitset0_b64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_BITSET0_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_BITSET0_B64::~Inst_SOP1__S_BITSET0_B64() = default;

    // D.u64[S0.u[5:0]] = 0.
    void
    Inst_SOP1__S_BITSET0_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);

        src.read();
        // Only one bit of the destination changes, so start from its current
        // contents; otherwise the other bits written back would not be the
        // register's previous value.
        // NOTE(review): assumes read() on the destination operand loads the
        // current register value, as it does for sources -- confirm.
        sdst.read();

        // Clear the bit selected by S0[5:0].
        sdst.setBit(bits(src.rawData(), 5, 0), 0);

        sdst.write();
    }

    Inst_SOP1__S_BITSET1_B32::Inst_SOP1__S_BITSET1_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_bitset1_b32")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_BITSET1_B32

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_BITSET1_B32::~Inst_SOP1__S_BITSET1_B32() = default;

    // D.u[S0.u[4:0]] = 1.
    void
    Inst_SOP1__S_BITSET1_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);

        src.read();
        // Only one bit of the destination changes, so start from its current
        // contents; otherwise the other bits written back would not be the
        // register's previous value.
        // NOTE(review): assumes read() on the destination operand loads the
        // current register value, as it does for sources -- confirm.
        sdst.read();

        // Set the bit selected by S0[4:0].
        sdst.setBit(bits(src.rawData(), 4, 0), 1);

        sdst.write();
    }

    Inst_SOP1__S_BITSET1_B64::Inst_SOP1__S_BITSET1_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_bitset1_b64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_BITSET1_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_BITSET1_B64::~Inst_SOP1__S_BITSET1_B64() = default;

    // D.u64[S0.u[5:0]] = 1.
    void
    Inst_SOP1__S_BITSET1_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);

        src.read();
        // Only one bit of the destination changes, so start from its current
        // contents; otherwise the other bits written back would not be the
        // register's previous value.
        // NOTE(review): assumes read() on the destination operand loads the
        // current register value, as it does for sources -- confirm.
        sdst.read();

        // Set the bit selected by S0[5:0].
        sdst.setBit(bits(src.rawData(), 5, 0), 1);

        sdst.write();
    }

    Inst_SOP1__S_GETPC_B64::Inst_SOP1__S_GETPC_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_getpc_b64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_GETPC_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_GETPC_B64::~Inst_SOP1__S_GETPC_B64() = default;

    // D.u64 = PC + 4.
    // Destination receives the byte address of the next instruction.
    void
    Inst_SOP1__S_GETPC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sample the wavefront's PC before building the destination operand.
        const Addr currentPc = gpuDynInst->wavefront()->pc();
        ScalarOperandU64 result(gpuDynInst, instData.SDST);

        // The destination receives the current PC advanced by four bytes.
        result = currentPc + 4;
        result.write();
    }

    Inst_SOP1__S_SETPC_B64::Inst_SOP1__S_SETPC_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_setpc_b64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_SETPC_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_SETPC_B64::~Inst_SOP1__S_SETPC_B64() = default;

    // PC = S0.u64.
    // S0.u64 is a byte address of the instruction to jump to.
    void
    Inst_SOP1__S_SETPC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstScalarOperandU64 target(gpuDynInst, instData.SSRC0);

        target.read();
        const auto destination = target.rawData();

        // Redirect the wavefront to the address held in the source operand.
        wave->pc(destination);
    }

    Inst_SOP1__S_SWAPPC_B64::Inst_SOP1__S_SWAPPC_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_swappc_b64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_SWAPPC_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_SWAPPC_B64::~Inst_SOP1__S_SWAPPC_B64() = default;

    // D.u64 = PC + 4; PC = S0.u64.
    // S0.u64 is a byte address of the instruction to jump to.
    void
    Inst_SOP1__S_SWAPPC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        // Capture PC + 4 before the PC is overwritten below.
        const Addr returnPc = wave->pc() + 4;
        ConstScalarOperandU64 target(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 linkReg(gpuDynInst, instData.SDST);

        target.read();

        linkReg = returnPc;

        // Jump first, then commit the saved address to the destination.
        wave->pc(target.rawData());
        linkReg.write();
    }

    Inst_SOP1__S_RFE_B64::Inst_SOP1__S_RFE_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_rfe_b64")
    {
        // No instruction flags are set for this opcode.
    } // Inst_SOP1__S_RFE_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_RFE_B64::~Inst_SOP1__S_RFE_B64() = default;

    // Return from exception handler and continue.
    void
    Inst_SOP1__S_RFE_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modeled: the only action is the panicUnimplemented() call;
        // gpuDynInst is not used.
        panicUnimplemented();
    }

    Inst_SOP1__S_AND_SAVEEXEC_B64::Inst_SOP1__S_AND_SAVEEXEC_B64(
          InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_and_saveexec_b64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_AND_SAVEEXEC_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_AND_SAVEEXEC_B64::~Inst_SOP1__S_AND_SAVEEXEC_B64()
        = default;

    // D.u64 = EXEC;
    // EXEC = S0.u64 & EXEC;
    // SCC = 1 if the new value of EXEC is non-zero.
    void
    Inst_SOP1__S_AND_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstScalarOperandU64 maskSrc(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 savedExec(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccFlag(gpuDynInst, REG_SCC);

        maskSrc.read();

        // Snapshot EXEC once: it is both saved to the destination and
        // combined with the source to form the new mask.
        const auto oldExec = wave->execMask().to_ullong();
        savedExec = oldExec;
        wave->execMask() = maskSrc.rawData() & oldExec;
        sccFlag = wave->execMask().any() ? 1 : 0;

        savedExec.write();
        sccFlag.write();
    }

    Inst_SOP1__S_OR_SAVEEXEC_B64::Inst_SOP1__S_OR_SAVEEXEC_B64(
          InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_or_saveexec_b64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_OR_SAVEEXEC_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_OR_SAVEEXEC_B64::~Inst_SOP1__S_OR_SAVEEXEC_B64() = default;

    // D.u64 = EXEC;
    // EXEC = S0.u64 | EXEC;
    // SCC = 1 if the new value of EXEC is non-zero.
    void
    Inst_SOP1__S_OR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstScalarOperandU64 maskSrc(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 savedExec(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccFlag(gpuDynInst, REG_SCC);

        maskSrc.read();

        // Snapshot EXEC once: it is both saved to the destination and
        // combined with the source to form the new mask.
        const auto oldExec = wave->execMask().to_ullong();
        savedExec = oldExec;
        wave->execMask() = maskSrc.rawData() | oldExec;
        sccFlag = wave->execMask().any() ? 1 : 0;

        savedExec.write();
        sccFlag.write();
    }

    Inst_SOP1__S_XOR_SAVEEXEC_B64::Inst_SOP1__S_XOR_SAVEEXEC_B64(
          InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_xor_saveexec_b64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_XOR_SAVEEXEC_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_XOR_SAVEEXEC_B64::~Inst_SOP1__S_XOR_SAVEEXEC_B64()
        = default;

    // D.u64 = EXEC;
    // EXEC = S0.u64 ^ EXEC;
    // SCC = 1 if the new value of EXEC is non-zero.
    void
    Inst_SOP1__S_XOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstScalarOperandU64 maskSrc(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 savedExec(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccFlag(gpuDynInst, REG_SCC);

        maskSrc.read();

        // Snapshot EXEC once: it is both saved to the destination and
        // combined with the source to form the new mask.
        const auto oldExec = wave->execMask().to_ullong();
        savedExec = oldExec;
        wave->execMask() = maskSrc.rawData() ^ oldExec;
        sccFlag = wave->execMask().any() ? 1 : 0;

        savedExec.write();
        sccFlag.write();
    }

    Inst_SOP1__S_ANDN2_SAVEEXEC_B64::Inst_SOP1__S_ANDN2_SAVEEXEC_B64(
          InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_andn2_saveexec_b64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_ANDN2_SAVEEXEC_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_ANDN2_SAVEEXEC_B64::~Inst_SOP1__S_ANDN2_SAVEEXEC_B64()
        = default;

    // D.u64 = EXEC;
    // EXEC = S0.u64 & ~EXEC;
    // SCC = 1 if the new value of EXEC is non-zero.
    void
    Inst_SOP1__S_ANDN2_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstScalarOperandU64 maskSrc(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 savedExec(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccFlag(gpuDynInst, REG_SCC);

        maskSrc.read();

        // Snapshot EXEC once: it is saved to the destination, and its
        // complement is ANDed with the source to form the new mask.
        const auto oldExec = wave->execMask().to_ullong();
        savedExec = oldExec;
        wave->execMask() = maskSrc.rawData() & ~oldExec;
        sccFlag = wave->execMask().any() ? 1 : 0;

        savedExec.write();
        sccFlag.write();
    }

    Inst_SOP1__S_ORN2_SAVEEXEC_B64::Inst_SOP1__S_ORN2_SAVEEXEC_B64(
          InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_orn2_saveexec_b64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_ORN2_SAVEEXEC_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_ORN2_SAVEEXEC_B64::~Inst_SOP1__S_ORN2_SAVEEXEC_B64()
        = default;

    // D.u64 = EXEC;
    // EXEC = S0.u64 | ~EXEC;
    // SCC = 1 if the new value of EXEC is non-zero.
    void
    Inst_SOP1__S_ORN2_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstScalarOperandU64 maskSrc(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 savedExec(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccFlag(gpuDynInst, REG_SCC);

        maskSrc.read();

        // Snapshot EXEC once: it is saved to the destination, and its
        // complement is ORed with the source to form the new mask.
        const auto oldExec = wave->execMask().to_ullong();
        savedExec = oldExec;
        wave->execMask() = maskSrc.rawData() | ~oldExec;
        sccFlag = wave->execMask().any() ? 1 : 0;

        savedExec.write();
        sccFlag.write();
    }

    Inst_SOP1__S_NAND_SAVEEXEC_B64::Inst_SOP1__S_NAND_SAVEEXEC_B64(
          InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_nand_saveexec_b64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_NAND_SAVEEXEC_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_NAND_SAVEEXEC_B64::~Inst_SOP1__S_NAND_SAVEEXEC_B64()
        = default;

    // D.u64 = EXEC;
    // EXEC = ~(S0.u64 & EXEC);
    // SCC = 1 if the new value of EXEC is non-zero.
    void
    Inst_SOP1__S_NAND_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstScalarOperandU64 maskSrc(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 savedExec(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccFlag(gpuDynInst, REG_SCC);

        maskSrc.read();

        // Snapshot EXEC once: it is saved to the destination, and the
        // complement of (source AND EXEC) becomes the new mask.
        const auto oldExec = wave->execMask().to_ullong();
        savedExec = oldExec;
        wave->execMask() = ~(maskSrc.rawData() & oldExec);
        sccFlag = wave->execMask().any() ? 1 : 0;

        savedExec.write();
        sccFlag.write();
    }

    Inst_SOP1__S_NOR_SAVEEXEC_B64::Inst_SOP1__S_NOR_SAVEEXEC_B64(
          InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_nor_saveexec_b64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_NOR_SAVEEXEC_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_NOR_SAVEEXEC_B64::~Inst_SOP1__S_NOR_SAVEEXEC_B64()
        = default;

    // D.u64 = EXEC;
    // EXEC = ~(S0.u64 | EXEC);
    // SCC = 1 if the new value of EXEC is non-zero.
    void
    Inst_SOP1__S_NOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstScalarOperandU64 maskSrc(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 savedExec(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccFlag(gpuDynInst, REG_SCC);

        maskSrc.read();

        // Snapshot EXEC once: it is saved to the destination, and the
        // complement of (source OR EXEC) becomes the new mask.
        const auto oldExec = wave->execMask().to_ullong();
        savedExec = oldExec;
        wave->execMask() = ~(maskSrc.rawData() | oldExec);
        sccFlag = wave->execMask().any() ? 1 : 0;

        savedExec.write();
        sccFlag.write();
    }

    Inst_SOP1__S_XNOR_SAVEEXEC_B64::Inst_SOP1__S_XNOR_SAVEEXEC_B64(
          InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_xnor_saveexec_b64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_XNOR_SAVEEXEC_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_XNOR_SAVEEXEC_B64::~Inst_SOP1__S_XNOR_SAVEEXEC_B64()
        = default;

    // D.u64 = EXEC;
    // EXEC = ~(S0.u64 ^ EXEC);
    // SCC = 1 if the new value of EXEC is non-zero.
    void
    Inst_SOP1__S_XNOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstScalarOperandU64 maskSrc(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 savedExec(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccFlag(gpuDynInst, REG_SCC);

        maskSrc.read();

        // Snapshot EXEC once: it is saved to the destination, and the
        // complement of (source XOR EXEC) becomes the new mask.
        const auto oldExec = wave->execMask().to_ullong();
        savedExec = oldExec;
        wave->execMask() = ~(maskSrc.rawData() ^ oldExec);
        sccFlag = wave->execMask().any() ? 1 : 0;

        savedExec.write();
        sccFlag.write();
    }

    Inst_SOP1__S_QUADMASK_B32::Inst_SOP1__S_QUADMASK_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_quadmask_b32")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_QUADMASK_B32

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_QUADMASK_B32::~Inst_SOP1__S_QUADMASK_B32() = default;

    // D.u = QuadMask(S0.u):
    // D[0] = OR(S0[3:0]), D[1] = OR(S0[7:4]) ... D[31:8] = 0;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP1__S_QUADMASK_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 operand(gpuDynInst, instData.SSRC0);
        ScalarOperandU32 result(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccFlag(gpuDynInst, REG_SCC);

        operand.read();
        const auto inputBits = operand.rawData();

        // Store quadMask() of the 32-bit source; SCC reflects whether the
        // stored result is non-zero.
        result = quadMask(inputBits);
        sccFlag = (result.rawData() != 0) ? 1 : 0;

        result.write();
        sccFlag.write();
    }

    Inst_SOP1__S_QUADMASK_B64::Inst_SOP1__S_QUADMASK_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_quadmask_b64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_QUADMASK_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_QUADMASK_B64::~Inst_SOP1__S_QUADMASK_B64() = default;

    // D.u64 = QuadMask(S0.u64):
    // D[0] = OR(S0[3:0]), D[1] = OR(S0[7:4]) ... D[63:16] = 0;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP1__S_QUADMASK_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 operand(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 result(gpuDynInst, instData.SDST);
        ScalarOperandU32 sccFlag(gpuDynInst, REG_SCC);

        operand.read();
        const auto inputBits = operand.rawData();

        // Store quadMask() of the 64-bit source; SCC reflects whether the
        // stored result is non-zero.
        result = quadMask(inputBits);
        sccFlag = (result.rawData() != 0) ? 1 : 0;

        result.write();
        sccFlag.write();
    }

    Inst_SOP1__S_MOVRELS_B32::Inst_SOP1__S_MOVRELS_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_movrels_b32")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_MOVRELS_B32

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_MOVRELS_B32::~Inst_SOP1__S_MOVRELS_B32() = default;

    // D.u = SGPR[S0.u + M0.u].u (move from relative source).
    void
    Inst_SOP1__S_MOVRELS_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // M0 must be read first: its value offsets the source operand index.
        ConstScalarOperandU32 indexReg(gpuDynInst, REG_M0);
        indexReg.read();
        const auto relativeSrc = instData.SSRC0 + indexReg.rawData();

        ConstScalarOperandU32 source(gpuDynInst, relativeSrc);
        ScalarOperandU32 result(gpuDynInst, instData.SDST);

        source.read();

        // Plain 32-bit move from the relative source to the destination.
        result = source.rawData();
        result.write();
    }

    Inst_SOP1__S_MOVRELS_B64::Inst_SOP1__S_MOVRELS_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_movrels_b64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_MOVRELS_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_MOVRELS_B64::~Inst_SOP1__S_MOVRELS_B64() = default;

    // D.u64 = SGPR[S0.u + M0.u].u64 (move from relative source).
    // The index in M0.u must be even for this operation.
    void
    Inst_SOP1__S_MOVRELS_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        // M0 must be read first: its value offsets the source operand index.
        // Note that the evenness of M0 is not checked here.
        ConstScalarOperandU32 indexReg(gpuDynInst, REG_M0);
        indexReg.read();
        const auto relativeSrc = instData.SSRC0 + indexReg.rawData();

        ConstScalarOperandU64 source(gpuDynInst, relativeSrc);
        ScalarOperandU64 result(gpuDynInst, instData.SDST);

        source.read();

        // Plain 64-bit move from the relative source to the destination.
        result = source.rawData();
        result.write();
    }

    Inst_SOP1__S_MOVRELD_B32::Inst_SOP1__S_MOVRELD_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_movreld_b32")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_MOVRELD_B32

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_MOVRELD_B32::~Inst_SOP1__S_MOVRELD_B32() = default;

    // SGPR[D.u + M0.u].u = S0.u (move to relative destination).
    void
    Inst_SOP1__S_MOVRELD_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // M0 must be read first: its value offsets the destination index.
        ConstScalarOperandU32 indexReg(gpuDynInst, REG_M0);
        indexReg.read();

        ConstScalarOperandU32 source(gpuDynInst, instData.SSRC0);
        const auto relativeDst = instData.SDST + indexReg.rawData();
        ScalarOperandU32 result(gpuDynInst, relativeDst);

        source.read();

        // Plain 32-bit move from the source to the relative destination.
        result = source.rawData();
        result.write();
    }

    Inst_SOP1__S_MOVRELD_B64::Inst_SOP1__S_MOVRELD_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_movreld_b64")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_MOVRELD_B64

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_MOVRELD_B64::~Inst_SOP1__S_MOVRELD_B64() = default;

    // SGPR[D.u + M0.u].u64 = S0.u64 (move to relative destination).
    // The index in M0.u must be even for this operation.
    void
    Inst_SOP1__S_MOVRELD_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        // M0 must be read first: its value offsets the destination index.
        // Note that the evenness of M0 is not checked here.
        ConstScalarOperandU32 indexReg(gpuDynInst, REG_M0);
        indexReg.read();

        ConstScalarOperandU64 source(gpuDynInst, instData.SSRC0);
        const auto relativeDst = instData.SDST + indexReg.rawData();
        ScalarOperandU64 result(gpuDynInst, relativeDst);

        source.read();

        // Plain 64-bit move from the source to the relative destination.
        result = source.rawData();
        result.write();
    }

    Inst_SOP1__S_CBRANCH_JOIN::Inst_SOP1__S_CBRANCH_JOIN(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_cbranch_join")
    {
        // Tag this instruction with the Branch flag.
        this->setFlag(Branch);
    } // Inst_SOP1__S_CBRANCH_JOIN

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_CBRANCH_JOIN::~Inst_SOP1__S_CBRANCH_JOIN() = default;

    // Conditional branch join point (end of conditional branch block).
    void
    Inst_SOP1__S_CBRANCH_JOIN::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modeled: the only action is the panicUnimplemented() call;
        // gpuDynInst is not used.
        panicUnimplemented();
    }

    Inst_SOP1__S_ABS_I32::Inst_SOP1__S_ABS_I32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_abs_i32")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_ABS_I32

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_ABS_I32::~Inst_SOP1__S_ABS_I32() = default;

    // if (S.i < 0) then D.i = -S.i;
    // else D.i = S.i;
    // SCC = 1 if result is non-zero.
    // Integer absolute value.
    void
    Inst_SOP1__S_ABS_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        auto value = src.rawData();

        // std::abs() of the most negative value overflows, which is
        // undefined behavior for signed integers. Two's-complement negation
        // of that value wraps back to the same bit pattern, so pass it
        // through unchanged and only call std::abs() on every other input.
        if (value != std::numeric_limits<decltype(value)>::min()) {
            value = std::abs(value);
        }

        sdst = value;

        // SCC is derived from the value actually stored in the destination.
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP1__S_MOV_FED_B32::Inst_SOP1__S_MOV_FED_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_mov_fed_b32")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOP1__S_MOV_FED_B32

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_MOV_FED_B32::~Inst_SOP1__S_MOV_FED_B32() = default;

    // D.u = S0.u.
    void
    Inst_SOP1__S_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modeled: the only action is the panicUnimplemented() call;
        // gpuDynInst is not used.
        panicUnimplemented();
    }

    Inst_SOP1__S_SET_GPR_IDX_IDX::Inst_SOP1__S_SET_GPR_IDX_IDX(
          InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_set_gpr_idx_idx")
    {
        // No instruction flags are set for this opcode.
    } // Inst_SOP1__S_SET_GPR_IDX_IDX

    // No instruction-specific cleanup is performed.
    Inst_SOP1__S_SET_GPR_IDX_IDX::~Inst_SOP1__S_SET_GPR_IDX_IDX() = default;

    // M0[7:0] = S0.u[7:0].
    // Modify the index used in vector GPR indexing.
    void
    Inst_SOP1__S_SET_GPR_IDX_IDX::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modeled: the only action is the panicUnimplemented() call;
        // gpuDynInst is not used.
        panicUnimplemented();
    }

    Inst_SOPC__S_CMP_EQ_I32::Inst_SOPC__S_CMP_EQ_I32(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_eq_i32")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOPC__S_CMP_EQ_I32

    // No instruction-specific cleanup is performed.
    Inst_SOPC__S_CMP_EQ_I32::~Inst_SOPC__S_CMP_EQ_I32() = default;

    // SCC = (S0.i == S1.i).
    void
    Inst_SOPC__S_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sccFlag(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        // Signed equality test; the outcome is stored in SCC as 1 or 0.
        const bool isEqual = lhs.rawData() == rhs.rawData();
        sccFlag = isEqual ? 1 : 0;

        sccFlag.write();
    }

    Inst_SOPC__S_CMP_LG_I32::Inst_SOPC__S_CMP_LG_I32(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_lg_i32")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOPC__S_CMP_LG_I32

    // No instruction-specific cleanup is performed.
    Inst_SOPC__S_CMP_LG_I32::~Inst_SOPC__S_CMP_LG_I32() = default;

    // SCC = (S0.i != S1.i).
    void
    Inst_SOPC__S_CMP_LG_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sccFlag(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        // Signed inequality test; the outcome is stored in SCC as 1 or 0.
        const bool differs = lhs.rawData() != rhs.rawData();
        sccFlag = differs ? 1 : 0;

        sccFlag.write();
    }

    Inst_SOPC__S_CMP_GT_I32::Inst_SOPC__S_CMP_GT_I32(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_gt_i32")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOPC__S_CMP_GT_I32

    // No instruction-specific cleanup is performed.
    Inst_SOPC__S_CMP_GT_I32::~Inst_SOPC__S_CMP_GT_I32() = default;

    // SCC = (S0.i > S1.i).
    void
    Inst_SOPC__S_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sccFlag(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        // Signed greater-than test; the outcome is stored in SCC as 1 or 0.
        const bool isGreater = lhs.rawData() > rhs.rawData();
        sccFlag = isGreater ? 1 : 0;

        sccFlag.write();
    }

    Inst_SOPC__S_CMP_GE_I32::Inst_SOPC__S_CMP_GE_I32(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_ge_i32")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOPC__S_CMP_GE_I32

    // No instruction-specific cleanup is performed.
    Inst_SOPC__S_CMP_GE_I32::~Inst_SOPC__S_CMP_GE_I32() = default;

    // SCC = (S0.i >= S1.i).
    void
    Inst_SOPC__S_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sccFlag(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        // Signed greater-or-equal test; the outcome is stored in SCC as
        // 1 or 0.
        const bool isGreaterOrEqual = lhs.rawData() >= rhs.rawData();
        sccFlag = isGreaterOrEqual ? 1 : 0;

        sccFlag.write();
    }

    Inst_SOPC__S_CMP_LT_I32::Inst_SOPC__S_CMP_LT_I32(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_lt_i32")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOPC__S_CMP_LT_I32

    // No instruction-specific cleanup is performed.
    Inst_SOPC__S_CMP_LT_I32::~Inst_SOPC__S_CMP_LT_I32() = default;

    // SCC = (S0.i < S1.i).
    void
    Inst_SOPC__S_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sccFlag(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        // Signed less-than test; the outcome is stored in SCC as 1 or 0.
        const bool isLess = lhs.rawData() < rhs.rawData();
        sccFlag = isLess ? 1 : 0;

        sccFlag.write();
    }

    Inst_SOPC__S_CMP_LE_I32::Inst_SOPC__S_CMP_LE_I32(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_le_i32")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOPC__S_CMP_LE_I32

    // No instruction-specific cleanup is performed.
    Inst_SOPC__S_CMP_LE_I32::~Inst_SOPC__S_CMP_LE_I32() = default;

    // SCC = (S0.i <= S1.i).
    void
    Inst_SOPC__S_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sccFlag(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        // Signed less-or-equal test; the outcome is stored in SCC as 1 or 0.
        const bool isLessOrEqual = lhs.rawData() <= rhs.rawData();
        sccFlag = isLessOrEqual ? 1 : 0;

        sccFlag.write();
    }

    Inst_SOPC__S_CMP_EQ_U32::Inst_SOPC__S_CMP_EQ_U32(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_eq_u32")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOPC__S_CMP_EQ_U32

    // No instruction-specific cleanup is performed.
    Inst_SOPC__S_CMP_EQ_U32::~Inst_SOPC__S_CMP_EQ_U32() = default;

    // SCC = (S0.u == S1.u).
    void
    Inst_SOPC__S_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sccFlag(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        // Unsigned equality test; the outcome is stored in SCC as 1 or 0.
        const bool isEqual = lhs.rawData() == rhs.rawData();
        sccFlag = isEqual ? 1 : 0;

        sccFlag.write();
    }

    Inst_SOPC__S_CMP_LG_U32::Inst_SOPC__S_CMP_LG_U32(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_lg_u32")
    {
        // Tag this instruction with the ALU flag.
        this->setFlag(ALU);
    } // Inst_SOPC__S_CMP_LG_U32

    // No instruction-specific cleanup is performed.
    Inst_SOPC__S_CMP_LG_U32::~Inst_SOPC__S_CMP_LG_U32() = default;

    // SCC = (S0.u != S1.u).
    void
    Inst_SOPC__S_CMP_LG_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sccFlag(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        // Unsigned inequality test; the outcome is stored in SCC as 1 or 0.
        const bool differs = lhs.rawData() != rhs.rawData();
        sccFlag = differs ? 1 : 0;

        sccFlag.write();
    }

    /**
     * S_CMP_GT_U32: scalar "greater than" compare on unsigned 32-bit
     * operands. Flagged as an ALU instruction.
     */
    Inst_SOPC__S_CMP_GT_U32::Inst_SOPC__S_CMP_GT_U32(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_gt_u32")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_CMP_GT_U32

    Inst_SOPC__S_CMP_GT_U32::~Inst_SOPC__S_CMP_GT_U32()
    {
    } // ~Inst_SOPC__S_CMP_GT_U32

    /**
     * SCC = (S0.u > S1.u).
     *
     * SCC receives 1 when the first unsigned 32-bit source is strictly
     * greater than the second and 0 otherwise.
     */
    void
    Inst_SOPC__S_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const bool greater = lhs.rawData() > rhs.rawData();
        scc = greater ? 1 : 0;

        scc.write();
    }

    /**
     * S_CMP_GE_U32: scalar "greater than or equal" compare on unsigned
     * 32-bit operands. Flagged as an ALU instruction.
     */
    Inst_SOPC__S_CMP_GE_U32::Inst_SOPC__S_CMP_GE_U32(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_ge_u32")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_CMP_GE_U32

    Inst_SOPC__S_CMP_GE_U32::~Inst_SOPC__S_CMP_GE_U32()
    {
    } // ~Inst_SOPC__S_CMP_GE_U32

    /**
     * SCC = (S0.u >= S1.u).
     *
     * SCC receives 1 when the first unsigned 32-bit source is greater
     * than or equal to the second and 0 otherwise.
     */
    void
    Inst_SOPC__S_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const bool greaterOrEqual = lhs.rawData() >= rhs.rawData();
        scc = greaterOrEqual ? 1 : 0;

        scc.write();
    }

    /**
     * S_CMP_LT_U32: scalar "less than" compare on unsigned 32-bit
     * operands. Flagged as an ALU instruction.
     */
    Inst_SOPC__S_CMP_LT_U32::Inst_SOPC__S_CMP_LT_U32(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_lt_u32")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_CMP_LT_U32

    Inst_SOPC__S_CMP_LT_U32::~Inst_SOPC__S_CMP_LT_U32()
    {
    } // ~Inst_SOPC__S_CMP_LT_U32

    /**
     * SCC = (S0.u < S1.u).
     *
     * SCC receives 1 when the first unsigned 32-bit source is strictly
     * less than the second and 0 otherwise (including when they are
     * equal).
     */
    void
    Inst_SOPC__S_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        // Strict comparison: equality must yield SCC = 0, otherwise this
        // instruction would behave like s_cmp_le_u32.
        scc = (src0.rawData() < src1.rawData()) ? 1 : 0;

        scc.write();
    }

    /**
     * S_CMP_LE_U32: scalar "less than or equal" compare on unsigned
     * 32-bit operands. Flagged as an ALU instruction.
     */
    Inst_SOPC__S_CMP_LE_U32::Inst_SOPC__S_CMP_LE_U32(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_le_u32")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_CMP_LE_U32

    Inst_SOPC__S_CMP_LE_U32::~Inst_SOPC__S_CMP_LE_U32()
    {
    } // ~Inst_SOPC__S_CMP_LE_U32

    /**
     * SCC = (S0.u <= S1.u).
     *
     * SCC receives 1 when the first unsigned 32-bit source is less than
     * or equal to the second and 0 otherwise.
     */
    void
    Inst_SOPC__S_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const bool lessOrEqual = lhs.rawData() <= rhs.rawData();
        scc = lessOrEqual ? 1 : 0;

        scc.write();
    }

    /**
     * S_BITCMP0_B32: test whether a selected bit of a 32-bit scalar is
     * clear. Flagged as an ALU instruction.
     */
    Inst_SOPC__S_BITCMP0_B32::Inst_SOPC__S_BITCMP0_B32(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_bitcmp0_b32")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_BITCMP0_B32

    Inst_SOPC__S_BITCMP0_B32::~Inst_SOPC__S_BITCMP0_B32()
    {
    } // ~Inst_SOPC__S_BITCMP0_B32

    /**
     * SCC = (S0.u[S1.u[4:0]] == 0).
     *
     * The low five bits of S1 select the bit of S0 to test.
     */
    void
    Inst_SOPC__S_BITCMP0_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 value(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 selector(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        value.read();
        selector.read();

        const auto bitIdx = bits(selector.rawData(), 4, 0);
        const bool bitIsClear = !bits(value.rawData(), bitIdx);
        scc = bitIsClear ? 1 : 0;

        scc.write();
    }

    /**
     * S_BITCMP1_B32: test whether a selected bit of a 32-bit scalar is
     * set. Flagged as an ALU instruction.
     */
    Inst_SOPC__S_BITCMP1_B32::Inst_SOPC__S_BITCMP1_B32(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_bitcmp1_b32")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_BITCMP1_B32

    Inst_SOPC__S_BITCMP1_B32::~Inst_SOPC__S_BITCMP1_B32()
    {
    } // ~Inst_SOPC__S_BITCMP1_B32

    /**
     * SCC = (S0.u[S1.u[4:0]] == 1).
     *
     * The low five bits of S1 select the bit of S0 to test.
     */
    void
    Inst_SOPC__S_BITCMP1_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 value(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 selector(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        value.read();
        selector.read();

        const auto bitIdx = bits(selector.rawData(), 4, 0);
        const bool bitIsSet = bits(value.rawData(), bitIdx) != 0;
        scc = bitIsSet ? 1 : 0;

        scc.write();
    }

    /**
     * S_BITCMP0_B64: test whether a selected bit of a 64-bit scalar is
     * clear. Flagged as an ALU instruction.
     */
    Inst_SOPC__S_BITCMP0_B64::Inst_SOPC__S_BITCMP0_B64(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_bitcmp0_b64")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_BITCMP0_B64

    Inst_SOPC__S_BITCMP0_B64::~Inst_SOPC__S_BITCMP0_B64()
    {
    } // ~Inst_SOPC__S_BITCMP0_B64

    /**
     * SCC = (S0.u64[S1.u[5:0]] == 0).
     *
     * The low six bits of S1 select the bit of the 64-bit S0 to test.
     */
    void
    Inst_SOPC__S_BITCMP0_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 value(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 selector(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        value.read();
        selector.read();

        const auto bitIdx = bits(selector.rawData(), 5, 0);
        const bool bitIsClear = !bits(value.rawData(), bitIdx);
        scc = bitIsClear ? 1 : 0;

        scc.write();
    }

    /**
     * S_BITCMP1_B64: test whether a selected bit of a 64-bit scalar is
     * set. Flagged as an ALU instruction.
     */
    Inst_SOPC__S_BITCMP1_B64::Inst_SOPC__S_BITCMP1_B64(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_bitcmp1_b64")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_BITCMP1_B64

    Inst_SOPC__S_BITCMP1_B64::~Inst_SOPC__S_BITCMP1_B64()
    {
    } // ~Inst_SOPC__S_BITCMP1_B64

    /**
     * SCC = (S0.u64[S1.u[5:0]] == 1).
     *
     * The low six bits of S1 select the bit of the 64-bit S0 to test.
     */
    void
    Inst_SOPC__S_BITCMP1_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 value(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 selector(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        value.read();
        selector.read();

        const auto bitIdx = bits(selector.rawData(), 5, 0);
        const bool bitIsSet = bits(value.rawData(), bitIdx) != 0;
        scc = bitIsSet ? 1 : 0;

        scc.write();
    }

    // S_SETVSKIP: constructed with the mnemonic "s_setvskip".
    // NOTE(review): the instruction is flagged UnconditionalJump although
    // the description below is a mode change, not a branch -- confirm that
    // this flag is intended.
    Inst_SOPC__S_SETVSKIP::Inst_SOPC__S_SETVSKIP(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_setvskip")
    {
        setFlag(UnconditionalJump);
    } // Inst_SOPC__S_SETVSKIP

    Inst_SOPC__S_SETVSKIP::~Inst_SOPC__S_SETVSKIP()
    {
    } // ~Inst_SOPC__S_SETVSKIP

    // VSKIP = S0.u[S1.u[4:0]].
    // Enables and disables VSKIP mode.
    // When VSKIP is enabled, no VOP*/M*BUF/MIMG/DS/FLAT/EXP instructions are
    // issued.
    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPC__S_SETVSKIP::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // S_SET_GPR_IDX_ON: constructed with the mnemonic "s_set_gpr_idx_on";
    // no instruction flags are set.
    Inst_SOPC__S_SET_GPR_IDX_ON::Inst_SOPC__S_SET_GPR_IDX_ON(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_set_gpr_idx_on")
    {
    } // Inst_SOPC__S_SET_GPR_IDX_ON

    Inst_SOPC__S_SET_GPR_IDX_ON::~Inst_SOPC__S_SET_GPR_IDX_ON()
    {
    } // ~Inst_SOPC__S_SET_GPR_IDX_ON

    // MODE.gpr_idx_en = 1;
    // M0[7:0] = S0.u[7:0];
    // M0[15:12] = SIMM4 (direct contents of S1 field);
    // Remaining bits of M0 are unmodified.
    // Enable GPR indexing mode. Vector operations after this will perform
    // relative GPR addressing based on the contents of M0.
    // The raw contents of the S1 field are read and used to set the enable
    // bits. S1[0] = VSRC0_REL, S1[1] = VSRC1_REL, S1[2] = VSRC2_REL and
    // S1[3] = VDST_REL.
    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPC__S_SET_GPR_IDX_ON::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * S_CMP_EQ_U64: scalar equality compare on 64-bit operands. Flagged
     * as an ALU instruction.
     */
    Inst_SOPC__S_CMP_EQ_U64::Inst_SOPC__S_CMP_EQ_U64(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_eq_u64")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_CMP_EQ_U64

    Inst_SOPC__S_CMP_EQ_U64::~Inst_SOPC__S_CMP_EQ_U64()
    {
    } // ~Inst_SOPC__S_CMP_EQ_U64

    /**
     * SCC = (S0.i64 == S1.i64).
     *
     * The sources are read through the signed 64-bit operand type; SCC
     * receives 1 when they are equal and 0 otherwise.
     */
    void
    Inst_SOPC__S_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI64 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI64 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const bool equal = lhs.rawData() == rhs.rawData();
        scc = equal ? 1 : 0;

        scc.write();
    }

    /**
     * S_CMP_LG_U64: scalar inequality compare on 64-bit operands. Flagged
     * as an ALU instruction.
     */
    Inst_SOPC__S_CMP_LG_U64::Inst_SOPC__S_CMP_LG_U64(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_lg_u64")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_CMP_LG_U64

    Inst_SOPC__S_CMP_LG_U64::~Inst_SOPC__S_CMP_LG_U64()
    {
    } // ~Inst_SOPC__S_CMP_LG_U64

    /**
     * SCC = (S0.i64 != S1.i64).
     *
     * The sources are read through the signed 64-bit operand type; SCC
     * receives 1 when they differ and 0 otherwise.
     */
    void
    Inst_SOPC__S_CMP_LG_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI64 lhs(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI64 rhs(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        lhs.read();
        rhs.read();

        const bool differ = lhs.rawData() != rhs.rawData();
        scc = differ ? 1 : 0;

        scc.write();
    }

    // S_NOP: constructed with the mnemonic "s_nop" and flagged as Nop.
    Inst_SOPP__S_NOP::Inst_SOPP__S_NOP(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_nop")
    {
        setFlag(Nop);
    } // Inst_SOPP__S_NOP

    Inst_SOPP__S_NOP::~Inst_SOPP__S_NOP()
    {
    } // ~Inst_SOPP__S_NOP

    // Do nothing. The body is intentionally empty: no state is read or
    // written when this instruction executes.
    void
    Inst_SOPP__S_NOP::execute(GPUDynInstPtr gpuDynInst)
    {
    }

    // S_ENDPGM: constructed with the mnemonic "s_endpgm" and flagged as
    // EndOfKernel.
    Inst_SOPP__S_ENDPGM::Inst_SOPP__S_ENDPGM(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_endpgm")
    {
        setFlag(EndOfKernel);
    } // Inst_SOPP__S_ENDPGM

    Inst_SOPP__S_ENDPGM::~Inst_SOPP__S_ENDPGM()
    {
    } // ~Inst_SOPP__S_ENDPGM

    // End of program; terminate wavefront.
    //
    // In order, this: trims the wavefront's instruction buffer and flushes
    // its fetch buffer, marks the wavefront stopped, drops its LDS
    // reference, withdraws it from any barrier, frees its registers,
    // updates CU statistics, and -- when the LDS reference count reached
    // zero -- either retires the workgroup directly or prepares a flush
    // before retirement.
    void
    Inst_SOPP__S_ENDPGM::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ComputeUnit *cu = gpuDynInst->computeUnit();

        // delete extra instructions fetched for completed work-items
        // (everything after the first buffer entry is erased)
        wf->instructionBuffer.erase(wf->instructionBuffer.begin() + 1,
            wf->instructionBuffer.end());

        // a fetch that is still pending is marked to be dropped
        if (wf->pendingFetch) {
            wf->dropFetch = true;
        }

        wf->computeUnit->fetchStage.fetchUnit(wf->simdId)
            .flushBuf(wf->wfSlotId);
        wf->setStatus(Wavefront::S_STOPPED);

        // refCount is tested against zero below to decide whether the
        // whole workgroup has finished
        int refCount = wf->computeUnit->getLds()
            .decreaseRefCounter(wf->dispatchId, wf->wgId);

        /**
         * The parent WF of this instruction is exiting, therefore
         * it should not participate in this barrier any longer. This
         * prevents possible deadlock issues if WFs exit early.
         */
        int bar_id = WFBarrier::InvalidID;
        if (wf->hasBarrier()) {
            assert(wf->getStatus() != Wavefront::S_BARRIER);
            // remember the barrier ID before releasing it; it is needed
            // again below if the whole workgroup has finished
            bar_id = wf->barrierId();
            assert(bar_id != WFBarrier::InvalidID);
            wf->releaseBarrier();
            cu->decMaxBarrierCnt(bar_id);
            DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Exiting the "
                    "program and decrementing max barrier count for "
                    "barrier Id%d. New max count: %d.\n", cu->cu_id,
                    wf->simdId, wf->wfSlotId, wf->wfDynId, bar_id,
                    cu->maxBarrierCnt(bar_id));
        }

        DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
            wf->computeUnit->cu_id, wf->wgId, refCount);

        wf->computeUnit->registerManager->freeRegisters(wf);
        wf->computeUnit->stats.completedWfs++;
        wf->computeUnit->activeWaves--;

        panic_if(wf->computeUnit->activeWaves < 0, "CU[%d] Active waves less "
            "than zero\n", wf->computeUnit->cu_id);

        DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n",
            wf->computeUnit->cu_id, wf->simdId, wf->wfSlotId, wf->wfDynId);

        // sample the read count of every index that also has a rawDist
        // entry, then reset the per-wavefront tracking state
        for (int i = 0; i < wf->vecReads.size(); i++) {
            if (wf->rawDist.find(i) != wf->rawDist.end()) {
                wf->stats.readsPerWrite.sample(wf->vecReads.at(i));
            }
        }
        wf->vecReads.clear();
        wf->rawDist.clear();
        wf->lastInstExec = 0;

        if (!refCount) {
            /**
             * If all WFs have finished, and hence the WG has finished,
             * then we can free up the barrier belonging to the parent
             * WG, but only if we actually used a barrier (i.e., more
             * than one WF in the WG).
             */
            if (bar_id != WFBarrier::InvalidID) {
                // NOTE(review): this message logs wf->barrierId() although
                // wf->releaseBarrier() was already called above; bar_id
                // holds the ID captured before the release -- confirm the
                // logged value is the intended one.
                DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves are "
                        "now complete. Releasing barrier Id%d.\n", cu->cu_id,
                        wf->simdId, wf->wfSlotId, wf->wfDynId,
                        wf->barrierId());
                cu->releaseBarrier(bar_id);
            }

           /**
             * Last wavefront of the workgroup has executed return. If the
             * workgroup is not the final one in the kernel, then simply
             * retire it; however, if it is the final one (i.e., indicating
             * the kernel end) then release operation is needed.
             */

            // check whether the workgroup is indicating the kernel end (i.e.,
            // the last workgroup in the kernel).
            bool kernelEnd =
                wf->computeUnit->shader->dispatcher().isReachingKernelEnd(wf);
            // further check whether 'release @ kernel end' is needed
            bool relNeeded =
                wf->computeUnit->shader->impl_kern_end_rel;

            // if not a kernel end or no release needed, retire the workgroup
            // directly
            if (!kernelEnd || !relNeeded) {
                wf->computeUnit->shader->dispatcher().notifyWgCompl(wf);
                wf->setStatus(Wavefront::S_STOPPED);
                wf->computeUnit->stats.completedWGs++;

                return;
            }

            /**
             * If a kernel end and release needed, inject a memory sync and
             * retire the workgroup after receiving all acks.
             */
            setFlag(MemSync);
            setFlag(GlobalSegment);
            // Notify Memory System of Kernel Completion
            wf->setStatus(Wavefront::S_RETURNING);
            gpuDynInst->simdId = wf->simdId;
            gpuDynInst->wfSlotId = wf->wfSlotId;
            gpuDynInst->wfDynId = wf->wfDynId;

            DPRINTF(GPUExec, "inject global memory fence for CU%d: "
                            "WF[%d][%d][%d]\n", wf->computeUnit->cu_id,
                            wf->simdId, wf->wfSlotId, wf->wfDynId);

            // call shader to prepare the flush operations
            wf->computeUnit->shader->prepareFlush(gpuDynInst);

            wf->computeUnit->stats.completedWGs++;
        } else {
            // other wavefronts of this workgroup still hold LDS
            // references; ask the dispatcher to schedule a dispatch
            wf->computeUnit->shader->dispatcher().scheduleDispatch();
        }
    }


    /**
     * S_BRANCH: unconditional short jump. Flagged as a Branch
     * instruction.
     */
    Inst_SOPP__S_BRANCH::Inst_SOPP__S_BRANCH(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_branch")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_BRANCH

    Inst_SOPP__S_BRANCH::~Inst_SOPP__S_BRANCH()
    {
    } // ~Inst_SOPP__S_BRANCH

    /**
     * PC = PC + signext(SIMM16 * 4) + 4 (short jump).
     *
     * The 16-bit immediate is a dword offset; it is scaled to bytes,
     * sign-extended from 18 bits and applied to the wavefront's PC.
     */
    void
    Inst_SOPP__S_BRANCH::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        const Addr curPc = wave->pc();
        const ScalarRegI16 imm = instData.SIMM16;

        const Addr targetPc =
            curPc + ((ScalarRegI64)sext<18>(imm * 4LL)) + 4LL;

        wave->pc(targetPc);
    }

    // S_WAKEUP: constructed with the mnemonic "s_wakeup"; no instruction
    // flags are set.
    Inst_SOPP__S_WAKEUP::Inst_SOPP__S_WAKEUP(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_wakeup")
    {
    } // Inst_SOPP__S_WAKEUP

    Inst_SOPP__S_WAKEUP::~Inst_SOPP__S_WAKEUP()
    {
    } // ~Inst_SOPP__S_WAKEUP

    // Allow a wave to wakeup all the other waves in its workgroup to force
    // them to wake up immediately from an S_SLEEP instruction. The wakeup is
    // ignored if the waves are not sleeping.
    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPP__S_WAKEUP::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * S_CBRANCH_SCC0: conditional short jump taken when SCC is zero.
     * Flagged as a Branch instruction.
     */
    Inst_SOPP__S_CBRANCH_SCC0::Inst_SOPP__S_CBRANCH_SCC0(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_scc0")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_CBRANCH_SCC0

    Inst_SOPP__S_CBRANCH_SCC0::~Inst_SOPP__S_CBRANCH_SCC0()
    {
    } // ~Inst_SOPP__S_CBRANCH_SCC0

    /**
     * if (SCC == 0) then PC = PC + signext(SIMM16 * 4) + 4;
     * else NOP.
     *
     * The PC is written back in both cases; when the branch is not taken
     * the value written is the one that was read.
     */
    void
    Inst_SOPP__S_CBRANCH_SCC0::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        const Addr curPc = wave->pc();
        const ScalarRegI16 imm = instData.SIMM16;
        ConstScalarOperandU32 scc(gpuDynInst, REG_SCC);

        scc.read();

        const bool taken = (scc.rawData() == 0);
        const Addr nextPc = taken
            ? curPc + ((ScalarRegI64)sext<18>(imm * 4LL)) + 4LL
            : curPc;

        wave->pc(nextPc);
    }

    /**
     * S_CBRANCH_SCC1: conditional short jump taken when SCC is non-zero.
     * Flagged as a Branch instruction.
     */
    Inst_SOPP__S_CBRANCH_SCC1::Inst_SOPP__S_CBRANCH_SCC1(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_scc1")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_CBRANCH_SCC1

    Inst_SOPP__S_CBRANCH_SCC1::~Inst_SOPP__S_CBRANCH_SCC1()
    {
    } // ~Inst_SOPP__S_CBRANCH_SCC1

    /**
     * if (SCC == 1) then PC = PC + signext(SIMM16 * 4) + 4;
     * else NOP.
     *
     * The PC is written back in both cases; when the branch is not taken
     * the value written is the one that was read.
     */
    void
    Inst_SOPP__S_CBRANCH_SCC1::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        const Addr curPc = wave->pc();
        const ScalarRegI16 imm = instData.SIMM16;
        ConstScalarOperandU32 scc(gpuDynInst, REG_SCC);

        scc.read();

        const bool taken = (scc.rawData() != 0);
        const Addr nextPc = taken
            ? curPc + ((ScalarRegI64)sext<18>(imm * 4LL)) + 4LL
            : curPc;

        wave->pc(nextPc);
    }

    /**
     * S_CBRANCH_VCCZ: conditional short jump taken when VCC is zero.
     * Flagged as a Branch instruction that reads VCC.
     */
    Inst_SOPP__S_CBRANCH_VCCZ::Inst_SOPP__S_CBRANCH_VCCZ(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_vccz")
    {
        setFlag(Branch);
        setFlag(ReadsVCC);
    } // Inst_SOPP__S_CBRANCH_VCCZ

    Inst_SOPP__S_CBRANCH_VCCZ::~Inst_SOPP__S_CBRANCH_VCCZ()
    {
    } // ~Inst_SOPP__S_CBRANCH_VCCZ

    /**
     * if (VCC == 0) then PC = PC + signext(SIMM16 * 4) + 4;
     * else NOP.
     *
     * VCC is read as a 64-bit scalar. The PC is written back in both
     * cases; when the branch is not taken the value written is the one
     * that was read.
     */
    void
    Inst_SOPP__S_CBRANCH_VCCZ::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
        const Addr curPc = wave->pc();
        const ScalarRegI16 imm = instData.SIMM16;

        vcc.read();

        const bool taken = (vcc.rawData() == 0);
        const Addr nextPc = taken
            ? curPc + ((ScalarRegI64)sext<18>(imm * 4LL)) + 4LL
            : curPc;

        wave->pc(nextPc);
    }

    /**
     * S_CBRANCH_VCCNZ: conditional short jump taken when VCC is
     * non-zero. Flagged as a Branch instruction that reads VCC.
     */
    Inst_SOPP__S_CBRANCH_VCCNZ::Inst_SOPP__S_CBRANCH_VCCNZ(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_vccnz")
    {
        setFlag(Branch);
        setFlag(ReadsVCC);
    } // Inst_SOPP__S_CBRANCH_VCCNZ

    Inst_SOPP__S_CBRANCH_VCCNZ::~Inst_SOPP__S_CBRANCH_VCCNZ()
    {
    } // ~Inst_SOPP__S_CBRANCH_VCCNZ

    /**
     * if (VCC != 0) then PC = PC + signext(SIMM16 * 4) + 4;
     * else NOP.
     *
     * VCC is read as a 64-bit scalar. The PC is only touched when the
     * branch is taken.
     */
    void
    Inst_SOPP__S_CBRANCH_VCCNZ::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        vcc.read();

        // not taken: leave the PC alone
        if (!vcc.rawData()) {
            return;
        }

        const Addr curPc = wave->pc();
        const ScalarRegI16 imm = instData.SIMM16;
        wave->pc(curPc + ((ScalarRegI64)sext<18>(imm * 4LL)) + 4LL);
    }

    /**
     * S_CBRANCH_EXECZ: conditional short jump taken when the execution
     * mask is all zero. Flagged as a Branch instruction.
     */
    Inst_SOPP__S_CBRANCH_EXECZ::Inst_SOPP__S_CBRANCH_EXECZ(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_execz")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_CBRANCH_EXECZ

    Inst_SOPP__S_CBRANCH_EXECZ::~Inst_SOPP__S_CBRANCH_EXECZ()
    {
    } // ~Inst_SOPP__S_CBRANCH_EXECZ

    /**
     * if (EXEC == 0) then PC = PC + signext(SIMM16 * 4) + 4;
     * else NOP.
     *
     * The PC is only touched when the branch is taken.
     */
    void
    Inst_SOPP__S_CBRANCH_EXECZ::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // not taken: at least one lane is still enabled
        if (!wave->execMask().none()) {
            return;
        }

        const Addr curPc = wave->pc();
        const ScalarRegI16 imm = instData.SIMM16;
        wave->pc(curPc + ((ScalarRegI64)sext<18>(imm * 4LL)) + 4LL);
    }

    /**
     * S_CBRANCH_EXECNZ: conditional short jump taken when the execution
     * mask has any bit set. Flagged as a Branch instruction.
     */
    Inst_SOPP__S_CBRANCH_EXECNZ::Inst_SOPP__S_CBRANCH_EXECNZ(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_execnz")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_CBRANCH_EXECNZ

    Inst_SOPP__S_CBRANCH_EXECNZ::~Inst_SOPP__S_CBRANCH_EXECNZ()
    {
    } // ~Inst_SOPP__S_CBRANCH_EXECNZ

    /**
     * if (EXEC != 0) then PC = PC + signext(SIMM16 * 4) + 4;
     * else NOP.
     *
     * The PC is only touched when the branch is taken.
     */
    void
    Inst_SOPP__S_CBRANCH_EXECNZ::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // not taken: no lane is enabled
        if (!wave->execMask().any()) {
            return;
        }

        const Addr curPc = wave->pc();
        const ScalarRegI16 imm = instData.SIMM16;
        wave->pc(curPc + ((ScalarRegI64)sext<18>(imm * 4LL)) + 4LL);
    }

    /**
     * S_BARRIER: workgroup barrier. Flagged as a MemBarrier instruction.
     */
    Inst_SOPP__S_BARRIER::Inst_SOPP__S_BARRIER(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_barrier")
    {
        setFlag(MemBarrier);
    } // Inst_SOPP__S_BARRIER

    Inst_SOPP__S_BARRIER::~Inst_SOPP__S_BARRIER()
    {
    } // ~Inst_SOPP__S_BARRIER

    /**
     * Synchronize waves within a workgroup. If not all waves of the workgroup
     * have been created yet, wait for entire group before proceeding. If some
     * waves in the workgroup have already terminated, this waits on only the
     * surviving waves.
     *
     * A wavefront that holds no barrier passes through without any state
     * change. Otherwise it is put into S_BARRIER status and counted as
     * having arrived at its barrier.
     */
    void
    Inst_SOPP__S_BARRIER::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ComputeUnit *cu = gpuDynInst->computeUnit();

        // nothing to synchronize on
        if (!wave->hasBarrier()) {
            return;
        }

        const int barrierId = wave->barrierId();

        // a wave must not execute a barrier while already stalled at one
        assert(wave->getStatus() != Wavefront::S_BARRIER);
        wave->setStatus(Wavefront::S_BARRIER);
        cu->incNumAtBarrier(barrierId);

        DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalling at "
                "barrier Id%d. %d waves now at barrier, %d waves "
                "remain.\n", cu->cu_id, wave->simdId, wave->wfSlotId,
                wave->wfDynId, barrierId, cu->numAtBarrier(barrierId),
                cu->numYetToReachBarrier(barrierId));
    } // execute
    // --- Inst_SOPP__S_SETKILL class methods ---

    // S_SETKILL: constructed with the mnemonic "s_setkill"; no instruction
    // flags are set.
    Inst_SOPP__S_SETKILL::Inst_SOPP__S_SETKILL(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_setkill")
    {
    } // Inst_SOPP__S_SETKILL

    Inst_SOPP__S_SETKILL::~Inst_SOPP__S_SETKILL()
    {
    } // ~Inst_SOPP__S_SETKILL

    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPP__S_SETKILL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * S_WAITCNT: wait on outstanding-operation counters. Flagged as an
     * ALU and Waitcnt instruction.
     */
    Inst_SOPP__S_WAITCNT::Inst_SOPP__S_WAITCNT(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_waitcnt")
    {
        setFlag(ALU);
        setFlag(Waitcnt);
    } // Inst_SOPP__S_WAITCNT

    Inst_SOPP__S_WAITCNT::~Inst_SOPP__S_WAITCNT()
    {
    } // ~Inst_SOPP__S_WAITCNT

    /**
     * Wait for the counts of outstanding lds, vector-memory and
     * export/vmem-write-data to be at or below the specified levels.
     * SIMM16[3:0] = vmcount (vector memory operations),
     * SIMM16[6:4] = export/mem-write-data count,
     * SIMM16[12:8] = LGKM_cnt (scalar-mem/GDS/LDS count).
     *
     * The three fields are extracted from the immediate and handed to the
     * wavefront via setWaitCnts().
     */
    void
    Inst_SOPP__S_WAITCNT::execute(GPUDynInstPtr gpuDynInst)
    {
        const ScalarRegI32 vmCount =
            bits<ScalarRegI16>(instData.SIMM16, 3, 0);
        const ScalarRegI32 expCount =
            bits<ScalarRegI16>(instData.SIMM16, 6, 4);
        const ScalarRegI32 lgkmCount =
            bits<ScalarRegI16>(instData.SIMM16, 12, 8);

        gpuDynInst->wavefront()->setWaitCnts(vmCount, expCount, lgkmCount);
    }

    // S_SETHALT: constructed with the mnemonic "s_sethalt"; no instruction
    // flags are set.
    Inst_SOPP__S_SETHALT::Inst_SOPP__S_SETHALT(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_sethalt")
    {
    } // Inst_SOPP__S_SETHALT

    Inst_SOPP__S_SETHALT::~Inst_SOPP__S_SETHALT()
    {
    } // ~Inst_SOPP__S_SETHALT

    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPP__S_SETHALT::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * S_SLEEP: put the wavefront to sleep. Flagged as an ALU and Sleep
     * instruction.
     */
    Inst_SOPP__S_SLEEP::Inst_SOPP__S_SLEEP(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_sleep")
    {
        setFlag(ALU);
        setFlag(Sleep);
    } // Inst_SOPP__S_SLEEP

    Inst_SOPP__S_SLEEP::~Inst_SOPP__S_SLEEP()
    {
    } // ~Inst_SOPP__S_SLEEP

    /**
     * Cause a wave to sleep for (64 * SIMM16[2:0] + 1..64) clocks.
     *
     * The wavefront is moved to S_STALLED_SLEEP and its sleep time is set
     * to 64 times the immediate value.
     */
    void
    Inst_SOPP__S_SLEEP::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        const ScalarRegI32 sleepUnits = (ScalarRegI32)instData.SIMM16;

        wave->setStatus(Wavefront::S_STALLED_SLEEP);
        // the immediate counts in units of 64 cycles
        wave->setSleepTime(64 * sleepUnits);
    } // execute
    // --- Inst_SOPP__S_SETPRIO class methods ---

    // S_SETPRIO: constructed with the mnemonic "s_setprio"; no instruction
    // flags are set.
    Inst_SOPP__S_SETPRIO::Inst_SOPP__S_SETPRIO(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_setprio")
    {
    } // Inst_SOPP__S_SETPRIO

    Inst_SOPP__S_SETPRIO::~Inst_SOPP__S_SETPRIO()
    {
    } // ~Inst_SOPP__S_SETPRIO

    // User settable wave priority is set to SIMM16[1:0]. 0 = lowest,
    // 3 = highest.
    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPP__S_SETPRIO::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // S_SENDMSG: constructed with the mnemonic "s_sendmsg"; no instruction
    // flags are set.
    Inst_SOPP__S_SENDMSG::Inst_SOPP__S_SENDMSG(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_sendmsg")
    {
    } // Inst_SOPP__S_SENDMSG

    Inst_SOPP__S_SENDMSG::~Inst_SOPP__S_SENDMSG()
    {
    } // ~Inst_SOPP__S_SENDMSG

    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPP__S_SENDMSG::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // S_SENDMSGHALT: constructed with the mnemonic "s_sendmsghalt"; no
    // instruction flags are set.
    Inst_SOPP__S_SENDMSGHALT::Inst_SOPP__S_SENDMSGHALT(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_sendmsghalt")
    {
    } // Inst_SOPP__S_SENDMSGHALT

    Inst_SOPP__S_SENDMSGHALT::~Inst_SOPP__S_SENDMSGHALT()
    {
    } // ~Inst_SOPP__S_SENDMSGHALT

    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPP__S_SENDMSGHALT::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // S_TRAP: constructed with the mnemonic "s_trap"; no instruction flags
    // are set.
    Inst_SOPP__S_TRAP::Inst_SOPP__S_TRAP(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_trap")
    {
    } // Inst_SOPP__S_TRAP

    Inst_SOPP__S_TRAP::~Inst_SOPP__S_TRAP()
    {
    } // ~Inst_SOPP__S_TRAP

    // Enter the trap handler.
    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPP__S_TRAP::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // S_ICACHE_INV: constructed with the mnemonic "s_icache_inv"; no
    // instruction flags are set.
    Inst_SOPP__S_ICACHE_INV::Inst_SOPP__S_ICACHE_INV(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_icache_inv")
    {
    } // Inst_SOPP__S_ICACHE_INV

    Inst_SOPP__S_ICACHE_INV::~Inst_SOPP__S_ICACHE_INV()
    {
    } // ~Inst_SOPP__S_ICACHE_INV

    // Invalidate entire L1 instruction cache.
    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPP__S_ICACHE_INV::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // S_INCPERFLEVEL: constructed with the mnemonic "s_incperflevel"; no
    // instruction flags are set.
    Inst_SOPP__S_INCPERFLEVEL::Inst_SOPP__S_INCPERFLEVEL(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_incperflevel")
    {
    } // Inst_SOPP__S_INCPERFLEVEL

    Inst_SOPP__S_INCPERFLEVEL::~Inst_SOPP__S_INCPERFLEVEL()
    {
    } // ~Inst_SOPP__S_INCPERFLEVEL

    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPP__S_INCPERFLEVEL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // S_DECPERFLEVEL: constructed with the mnemonic "s_decperflevel"; no
    // instruction flags are set.
    Inst_SOPP__S_DECPERFLEVEL::Inst_SOPP__S_DECPERFLEVEL(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_decperflevel")
    {
    } // Inst_SOPP__S_DECPERFLEVEL

    Inst_SOPP__S_DECPERFLEVEL::~Inst_SOPP__S_DECPERFLEVEL()
    {
    } // ~Inst_SOPP__S_DECPERFLEVEL

    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPP__S_DECPERFLEVEL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // S_TTRACEDATA: constructed with the mnemonic "s_ttracedata"; no
    // instruction flags are set.
    Inst_SOPP__S_TTRACEDATA::Inst_SOPP__S_TTRACEDATA(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_ttracedata")
    {
    } // Inst_SOPP__S_TTRACEDATA

    Inst_SOPP__S_TTRACEDATA::~Inst_SOPP__S_TTRACEDATA()
    {
    } // ~Inst_SOPP__S_TTRACEDATA

    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPP__S_TTRACEDATA::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // S_CBRANCH_CDBGSYS: constructed with the mnemonic "s_cbranch_cdbgsys"
    // and flagged as a Branch instruction.
    Inst_SOPP__S_CBRANCH_CDBGSYS::Inst_SOPP__S_CBRANCH_CDBGSYS(
          InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_cdbgsys")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_CBRANCH_CDBGSYS

    Inst_SOPP__S_CBRANCH_CDBGSYS::~Inst_SOPP__S_CBRANCH_CDBGSYS()
    {
    } // ~Inst_SOPP__S_CBRANCH_CDBGSYS

    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPP__S_CBRANCH_CDBGSYS::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // S_CBRANCH_CDBGUSER: constructed with the mnemonic
    // "s_cbranch_cdbguser" and flagged as a Branch instruction.
    Inst_SOPP__S_CBRANCH_CDBGUSER::Inst_SOPP__S_CBRANCH_CDBGUSER(
          InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_cdbguser")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_CBRANCH_CDBGUSER

    Inst_SOPP__S_CBRANCH_CDBGUSER::~Inst_SOPP__S_CBRANCH_CDBGUSER()
    {
    } // ~Inst_SOPP__S_CBRANCH_CDBGUSER

    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPP__S_CBRANCH_CDBGUSER::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // S_CBRANCH_CDBGSYS_OR_USER: constructed with the mnemonic
    // "s_cbranch_cdbgsys_or_user" and flagged as a Branch instruction.
    Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER(
          InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_cdbgsys_or_user")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER

    Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::
        ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER()
    {
    } // ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER

    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // S_CBRANCH_CDBGSYS_AND_USER: constructed with the mnemonic
    // "s_cbranch_cdbgsys_and_user" and flagged as a Branch instruction.
    Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::
        Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER(InFmt_SOPP *iFmt)
            : Inst_SOPP(iFmt, "s_cbranch_cdbgsys_and_user")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER

    Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::
        ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER()
    {
    } // ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER

    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // S_ENDPGM_SAVED: constructed with the mnemonic "s_endpgm_saved"; no
    // instruction flags are set.
    Inst_SOPP__S_ENDPGM_SAVED::Inst_SOPP__S_ENDPGM_SAVED(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_endpgm_saved")
    {
    } // Inst_SOPP__S_ENDPGM_SAVED

    Inst_SOPP__S_ENDPGM_SAVED::~Inst_SOPP__S_ENDPGM_SAVED()
    {
    } // ~Inst_SOPP__S_ENDPGM_SAVED

    // End of program.
    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPP__S_ENDPGM_SAVED::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // S_SET_GPR_IDX_OFF: constructed with the mnemonic "s_set_gpr_idx_off";
    // no instruction flags are set.
    Inst_SOPP__S_SET_GPR_IDX_OFF::Inst_SOPP__S_SET_GPR_IDX_OFF(
          InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_set_gpr_idx_off")
    {
    } // Inst_SOPP__S_SET_GPR_IDX_OFF

    Inst_SOPP__S_SET_GPR_IDX_OFF::~Inst_SOPP__S_SET_GPR_IDX_OFF()
    {
    } // ~Inst_SOPP__S_SET_GPR_IDX_OFF

    // MODE.gpr_idx_en = 0.
    // Clear GPR indexing mode. Vector operations after this will not perform
    // relative GPR addressing regardless of the contents of M0.
    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPP__S_SET_GPR_IDX_OFF::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // S_SET_GPR_IDX_MODE: constructed with the mnemonic
    // "s_set_gpr_idx_mode"; no instruction flags are set.
    Inst_SOPP__S_SET_GPR_IDX_MODE::Inst_SOPP__S_SET_GPR_IDX_MODE(
          InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_set_gpr_idx_mode")
    {
    } // Inst_SOPP__S_SET_GPR_IDX_MODE

    Inst_SOPP__S_SET_GPR_IDX_MODE::~Inst_SOPP__S_SET_GPR_IDX_MODE()
    {
    } // ~Inst_SOPP__S_SET_GPR_IDX_MODE

    // M0[15:12] = SIMM4.
    // Modify the mode used for vector GPR indexing.
    // The raw contents of the source field are read and used to set the enable
    // bits. SIMM4[0] = VSRC0_REL, SIMM4[1] = VSRC1_REL, SIMM4[2] = VSRC2_REL
    // and SIMM4[3] = VDST_REL.
    // Execution is not implemented; this only calls panicUnimplemented().
    void
    Inst_SOPP__S_SET_GPR_IDX_MODE::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_SMEM__S_LOAD_DWORD::Inst_SMEM__S_LOAD_DWORD(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_load_dword")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_LOAD_DWORD

    Inst_SMEM__S_LOAD_DWORD::~Inst_SMEM__S_LOAD_DWORD()
    {
    } // ~Inst_SMEM__S_LOAD_DWORD

    /**
     * Read 1 dword from scalar data cache. If the offset is specified as an
     * sgpr, the sgpr contains an unsigned byte offset (the 2 LSBs are
     * ignored). If the offset is specified as an immediate 20-bit constant,
     * the constant is an unsigned byte offset.
     *
     * This stage reads the base address and offset, calls calcAddr(), queues
     * the instruction on the scalar memory pipeline and updates the
     * wavefront's scalar read request counters.
     */
    void
    Inst_SMEM__S_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        auto cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        // SBASE is shifted left by one to form the index of the 64-bit
        // base-address operand.
        ConstScalarOperandU64 baseAddr(gpuDynInst, instData.SBASE << 1);
        baseAddr.read();

        // The offset is the contents of the SGPR named by OFFSET when IMM is
        // clear, and the OFFSET field itself when IMM is set.
        ScalarRegU32 byteOffset(0);
        if (!instData.IMM) {
            ConstScalarOperandU32 offsetReg(gpuDynInst, extData.OFFSET);
            offsetReg.read();
            byteOffset = offsetReg.rawData();
        } else {
            byteOffset = extData.OFFSET;
        }

        calcAddr(gpuDynInst, baseAddr, byteOffset);

        // Hand the request to the scalar memory pipeline.
        cu->scalarMemoryPipe.getGMReqFIFO().push(gpuDynInst);

        // One fewer read in the pipe, one more outstanding.
        wave->scalarRdGmReqsInPipe--;
        wave->scalarOutstandingReqsRdGm++;
        wave->outstandingReqs++;
        wave->validateRequestCounters();
    }

    /**
     * Start the memory access by delegating to initMemRead<1>; the template
     * argument mirrors the one dword this instruction reads.
     */
    void
    Inst_SMEM__S_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<1>(gpuDynInst);
    } // initiateAcc

    /**
     * Finish the load by writing the 32-bit destination scalar operand
     * selected by SDATA.
     */
    void
    Inst_SMEM__S_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        ScalarOperandU32 dest(gpuDynInst, instData.SDATA);

        dest.write();
    } // completeAcc

    /**
     * Construct the s_load_dwordx2 instruction from its SMEM encoding and
     * flag it as a memory reference that performs a load.
     */
    Inst_SMEM__S_LOAD_DWORDX2::Inst_SMEM__S_LOAD_DWORDX2(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_load_dwordx2")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_LOAD_DWORDX2

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_LOAD_DWORDX2::~Inst_SMEM__S_LOAD_DWORDX2() = default;

    /**
     * Read 2 dwords from scalar data cache. See s_load_dword for details on
     * the offset input.
     *
     * This stage reads the base address and offset, calls calcAddr(), queues
     * the instruction on the scalar memory pipeline and updates the
     * wavefront's scalar read request counters.
     */
    void
    Inst_SMEM__S_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        auto cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        // SBASE is shifted left by one to form the index of the 64-bit
        // base-address operand.
        ConstScalarOperandU64 baseAddr(gpuDynInst, instData.SBASE << 1);
        baseAddr.read();

        // The offset is the contents of the SGPR named by OFFSET when IMM is
        // clear, and the OFFSET field itself when IMM is set.
        ScalarRegU32 byteOffset(0);
        if (!instData.IMM) {
            ConstScalarOperandU32 offsetReg(gpuDynInst, extData.OFFSET);
            offsetReg.read();
            byteOffset = offsetReg.rawData();
        } else {
            byteOffset = extData.OFFSET;
        }

        calcAddr(gpuDynInst, baseAddr, byteOffset);

        // Hand the request to the scalar memory pipeline.
        cu->scalarMemoryPipe.getGMReqFIFO().push(gpuDynInst);

        // One fewer read in the pipe, one more outstanding.
        wave->scalarRdGmReqsInPipe--;
        wave->scalarOutstandingReqsRdGm++;
        wave->outstandingReqs++;
        wave->validateRequestCounters();
    }

    /**
     * Start the memory access by delegating to initMemRead<2>; the template
     * argument mirrors the two dwords this instruction reads.
     */
    void
    Inst_SMEM__S_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<2>(gpuDynInst);
    } // initiateAcc

    /**
     * Finish the load by writing the 64-bit destination scalar operand
     * selected by SDATA.
     */
    void
    Inst_SMEM__S_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        ScalarOperandU64 dest(gpuDynInst, instData.SDATA);

        dest.write();
    } // completeAcc

    /**
     * Construct the s_load_dwordx4 instruction from its SMEM encoding and
     * flag it as a memory reference that performs a load.
     */
    Inst_SMEM__S_LOAD_DWORDX4::Inst_SMEM__S_LOAD_DWORDX4(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_load_dwordx4")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_LOAD_DWORDX4

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_LOAD_DWORDX4::~Inst_SMEM__S_LOAD_DWORDX4() = default;

    /**
     * Read 4 dwords from scalar data cache. See S_LOAD_DWORD for details on
     * the offset input.
     *
     * This stage reads the base address and offset, calls calcAddr(), queues
     * the instruction on the scalar memory pipeline and updates the
     * wavefront's scalar read request counters.
     */
    void
    Inst_SMEM__S_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        auto cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        // SBASE is shifted left by one to form the index of the 64-bit
        // base-address operand.
        ConstScalarOperandU64 baseAddr(gpuDynInst, instData.SBASE << 1);
        baseAddr.read();

        // The offset is the contents of the SGPR named by OFFSET when IMM is
        // clear, and the OFFSET field itself when IMM is set.
        ScalarRegU32 byteOffset(0);
        if (!instData.IMM) {
            ConstScalarOperandU32 offsetReg(gpuDynInst, extData.OFFSET);
            offsetReg.read();
            byteOffset = offsetReg.rawData();
        } else {
            byteOffset = extData.OFFSET;
        }

        calcAddr(gpuDynInst, baseAddr, byteOffset);

        // Hand the request to the scalar memory pipeline.
        cu->scalarMemoryPipe.getGMReqFIFO().push(gpuDynInst);

        // One fewer read in the pipe, one more outstanding.
        wave->scalarRdGmReqsInPipe--;
        wave->scalarOutstandingReqsRdGm++;
        wave->outstandingReqs++;
        wave->validateRequestCounters();
    }

    /**
     * Start the memory access by delegating to initMemRead<4>; the template
     * argument mirrors the four dwords this instruction reads.
     */
    void
    Inst_SMEM__S_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<4>(gpuDynInst);
    } // initiateAcc

    /**
     * Finish the load by writing the 128-bit destination scalar operand
     * selected by SDATA.
     */
    void
    Inst_SMEM__S_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        ScalarOperandU128 dest(gpuDynInst, instData.SDATA);

        dest.write();
    } // completeAcc

    /**
     * Construct the s_load_dwordx8 instruction from its SMEM encoding and
     * flag it as a memory reference that performs a load.
     */
    Inst_SMEM__S_LOAD_DWORDX8::Inst_SMEM__S_LOAD_DWORDX8(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_load_dwordx8")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_LOAD_DWORDX8

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_LOAD_DWORDX8::~Inst_SMEM__S_LOAD_DWORDX8() = default;

    /**
     * Read 8 dwords from scalar data cache. See S_LOAD_DWORD for details on
     * the offset input.
     *
     * This stage reads the base address and offset, calls calcAddr(), queues
     * the instruction on the scalar memory pipeline and updates the
     * wavefront's scalar read request counters.
     */
    void
    Inst_SMEM__S_LOAD_DWORDX8::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        auto cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        // SBASE is shifted left by one to form the index of the 64-bit
        // base-address operand.
        ConstScalarOperandU64 baseAddr(gpuDynInst, instData.SBASE << 1);
        baseAddr.read();

        // The offset is the contents of the SGPR named by OFFSET when IMM is
        // clear, and the OFFSET field itself when IMM is set.
        ScalarRegU32 byteOffset(0);
        if (!instData.IMM) {
            ConstScalarOperandU32 offsetReg(gpuDynInst, extData.OFFSET);
            offsetReg.read();
            byteOffset = offsetReg.rawData();
        } else {
            byteOffset = extData.OFFSET;
        }

        calcAddr(gpuDynInst, baseAddr, byteOffset);

        // Hand the request to the scalar memory pipeline.
        cu->scalarMemoryPipe.getGMReqFIFO().push(gpuDynInst);

        // One fewer read in the pipe, one more outstanding.
        wave->scalarRdGmReqsInPipe--;
        wave->scalarOutstandingReqsRdGm++;
        wave->outstandingReqs++;
        wave->validateRequestCounters();
    }

    /**
     * Start the memory access by delegating to initMemRead<8>; the template
     * argument mirrors the eight dwords this instruction reads.
     */
    void
    Inst_SMEM__S_LOAD_DWORDX8::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<8>(gpuDynInst);
    } // initiateAcc

    /**
     * Finish the load by writing the 256-bit destination scalar operand
     * selected by SDATA.
     */
    void
    Inst_SMEM__S_LOAD_DWORDX8::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        ScalarOperandU256 dest(gpuDynInst, instData.SDATA);

        dest.write();
    } // completeAcc

    /**
     * Construct the s_load_dwordx16 instruction from its SMEM encoding and
     * flag it as a memory reference that performs a load.
     */
    Inst_SMEM__S_LOAD_DWORDX16::Inst_SMEM__S_LOAD_DWORDX16(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_load_dwordx16")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_LOAD_DWORDX16

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_LOAD_DWORDX16::~Inst_SMEM__S_LOAD_DWORDX16() = default;

    /**
     * Read 16 dwords from scalar data cache. See S_LOAD_DWORD for details on
     * the offset input.
     *
     * This stage reads the base address and offset, calls calcAddr(), queues
     * the instruction on the scalar memory pipeline and updates the
     * wavefront's scalar read request counters.
     */
    void
    Inst_SMEM__S_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        auto cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        // SBASE is shifted left by one to form the index of the 64-bit
        // base-address operand.
        ConstScalarOperandU64 baseAddr(gpuDynInst, instData.SBASE << 1);
        baseAddr.read();

        // The offset is the contents of the SGPR named by OFFSET when IMM is
        // clear, and the OFFSET field itself when IMM is set.
        ScalarRegU32 byteOffset(0);
        if (!instData.IMM) {
            ConstScalarOperandU32 offsetReg(gpuDynInst, extData.OFFSET);
            offsetReg.read();
            byteOffset = offsetReg.rawData();
        } else {
            byteOffset = extData.OFFSET;
        }

        calcAddr(gpuDynInst, baseAddr, byteOffset);

        // Hand the request to the scalar memory pipeline.
        cu->scalarMemoryPipe.getGMReqFIFO().push(gpuDynInst);

        // One fewer read in the pipe, one more outstanding.
        wave->scalarRdGmReqsInPipe--;
        wave->scalarOutstandingReqsRdGm++;
        wave->outstandingReqs++;
        wave->validateRequestCounters();
    }

    /**
     * Start the memory access by delegating to initMemRead<16>; the template
     * argument mirrors the sixteen dwords this instruction reads.
     */
    void
    Inst_SMEM__S_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<16>(gpuDynInst);
    } // initiateAcc

    /**
     * Finish the load by writing the 512-bit destination scalar operand
     * selected by SDATA.
     */
    void
    Inst_SMEM__S_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        ScalarOperandU512 dest(gpuDynInst, instData.SDATA);

        dest.write();
    } // completeAcc

    /**
     * Construct the s_buffer_load_dword instruction from its SMEM encoding
     * and flag it as a memory reference that performs a load.
     */
    Inst_SMEM__S_BUFFER_LOAD_DWORD::Inst_SMEM__S_BUFFER_LOAD_DWORD(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_load_dword")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_BUFFER_LOAD_DWORD

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_BUFFER_LOAD_DWORD::~Inst_SMEM__S_BUFFER_LOAD_DWORD()
        = default;

    /**
     * Read 1 dword from scalar data cache. See S_LOAD_DWORD for details on
     * the offset input.
     *
     * This stage reads the 128-bit resource descriptor and the offset, calls
     * calcAddr(), queues the instruction on the scalar memory pipeline and
     * updates the wavefront's scalar read request counters.
     */
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        auto cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        // NOTE(review): SBASE is used unshifted here, whereas the s_load_*
        // variants use SBASE << 1 -- confirm this is intended.
        ConstScalarOperandU128 bufDesc(gpuDynInst, instData.SBASE);
        bufDesc.read();

        // The offset is the contents of the SGPR named by OFFSET when IMM is
        // clear, and the OFFSET field itself when IMM is set.
        ScalarRegU32 byteOffset(0);
        if (!instData.IMM) {
            ConstScalarOperandU32 offsetReg(gpuDynInst, extData.OFFSET);
            offsetReg.read();
            byteOffset = offsetReg.rawData();
        } else {
            byteOffset = extData.OFFSET;
        }

        calcAddr(gpuDynInst, bufDesc, byteOffset);

        // Hand the request to the scalar memory pipeline.
        cu->scalarMemoryPipe.getGMReqFIFO().push(gpuDynInst);

        // One fewer read in the pipe, one more outstanding.
        wave->scalarRdGmReqsInPipe--;
        wave->scalarOutstandingReqsRdGm++;
        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    /**
     * Start the memory access by delegating to initMemRead<1>; the template
     * argument mirrors the one dword this instruction reads.
     */
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<1>(gpuDynInst);
    } // initiateAcc

    /**
     * Finish the load by writing the destination scalar operand selected by
     * SDATA.
     */
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // A single 32-bit request, so a U32 operand holds the result.
        ScalarOperandU32 dest(gpuDynInst, instData.SDATA);

        dest.write();
    } // completeAcc

    /**
     * Construct the s_buffer_load_dwordx2 instruction from its SMEM encoding
     * and flag it as a memory reference that performs a load.
     */
    Inst_SMEM__S_BUFFER_LOAD_DWORDX2::Inst_SMEM__S_BUFFER_LOAD_DWORDX2(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_load_dwordx2")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_BUFFER_LOAD_DWORDX2

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_BUFFER_LOAD_DWORDX2::~Inst_SMEM__S_BUFFER_LOAD_DWORDX2()
        = default;

    /**
     * Read 2 dwords from scalar data cache. See S_LOAD_DWORD for details on
     * the offset input.
     *
     * This stage reads the 128-bit resource descriptor and the offset, calls
     * calcAddr(), queues the instruction on the scalar memory pipeline and
     * updates the wavefront's scalar read request counters.
     */
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        auto cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        // NOTE(review): SBASE is used unshifted here, whereas the s_load_*
        // variants use SBASE << 1 -- confirm this is intended.
        ConstScalarOperandU128 bufDesc(gpuDynInst, instData.SBASE);
        bufDesc.read();

        // The offset is the contents of the SGPR named by OFFSET when IMM is
        // clear, and the OFFSET field itself when IMM is set.
        ScalarRegU32 byteOffset(0);
        if (!instData.IMM) {
            ConstScalarOperandU32 offsetReg(gpuDynInst, extData.OFFSET);
            offsetReg.read();
            byteOffset = offsetReg.rawData();
        } else {
            byteOffset = extData.OFFSET;
        }

        calcAddr(gpuDynInst, bufDesc, byteOffset);

        // Hand the request to the scalar memory pipeline.
        cu->scalarMemoryPipe.getGMReqFIFO().push(gpuDynInst);

        // One fewer read in the pipe, one more outstanding.
        wave->scalarRdGmReqsInPipe--;
        wave->scalarOutstandingReqsRdGm++;
        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    /**
     * Start the memory access by delegating to initMemRead<2>; the template
     * argument mirrors the two dwords this instruction reads.
     */
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<2>(gpuDynInst);
    } // initiateAcc

    /**
     * Finish the load by writing the destination scalar operand selected by
     * SDATA.
     */
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Two 32-bit requests, so a U64 operand holds the result.
        ScalarOperandU64 dest(gpuDynInst, instData.SDATA);

        dest.write();
    } // completeAcc

    /**
     * Construct the s_buffer_load_dwordx4 instruction from its SMEM encoding
     * and flag it as a memory reference that performs a load.
     */
    Inst_SMEM__S_BUFFER_LOAD_DWORDX4::Inst_SMEM__S_BUFFER_LOAD_DWORDX4(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_load_dwordx4")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_BUFFER_LOAD_DWORDX4

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_BUFFER_LOAD_DWORDX4::~Inst_SMEM__S_BUFFER_LOAD_DWORDX4()
        = default;

    /**
     * Read 4 dwords from scalar data cache. See S_LOAD_DWORD for details on
     * the offset input.
     *
     * This stage reads the 128-bit resource descriptor and the offset, calls
     * calcAddr(), queues the instruction on the scalar memory pipeline and
     * updates the wavefront's scalar read request counters.
     */
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        auto cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        // NOTE(review): SBASE is used unshifted here, whereas the s_load_*
        // variants use SBASE << 1 -- confirm this is intended.
        ConstScalarOperandU128 bufDesc(gpuDynInst, instData.SBASE);
        bufDesc.read();

        // The offset is the contents of the SGPR named by OFFSET when IMM is
        // clear, and the OFFSET field itself when IMM is set.
        ScalarRegU32 byteOffset(0);
        if (!instData.IMM) {
            ConstScalarOperandU32 offsetReg(gpuDynInst, extData.OFFSET);
            offsetReg.read();
            byteOffset = offsetReg.rawData();
        } else {
            byteOffset = extData.OFFSET;
        }

        calcAddr(gpuDynInst, bufDesc, byteOffset);

        // Hand the request to the scalar memory pipeline.
        cu->scalarMemoryPipe.getGMReqFIFO().push(gpuDynInst);

        // One fewer read in the pipe, one more outstanding.
        wave->scalarRdGmReqsInPipe--;
        wave->scalarOutstandingReqsRdGm++;
        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    /**
     * Start the memory access by delegating to initMemRead<4>; the template
     * argument mirrors the four dwords this instruction reads.
     */
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<4>(gpuDynInst);
    } // initiateAcc

    /**
     * Finish the load by writing the destination scalar operand selected by
     * SDATA.
     */
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Four 32-bit requests, so a U128 operand holds the result.
        ScalarOperandU128 dest(gpuDynInst, instData.SDATA);

        dest.write();
    } // completeAcc

    /**
     * Construct the s_buffer_load_dwordx8 instruction from its SMEM encoding
     * and flag it as a memory reference that performs a load.
     */
    Inst_SMEM__S_BUFFER_LOAD_DWORDX8::Inst_SMEM__S_BUFFER_LOAD_DWORDX8(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_load_dwordx8")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_BUFFER_LOAD_DWORDX8

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_BUFFER_LOAD_DWORDX8::~Inst_SMEM__S_BUFFER_LOAD_DWORDX8()
        = default;

    /**
     * Read 8 dwords from scalar data cache. See S_LOAD_DWORD for details on
     * the offset input.
     *
     * This stage reads the 128-bit resource descriptor and the offset, calls
     * calcAddr(), queues the instruction on the scalar memory pipeline and
     * updates the wavefront's scalar read request counters.
     */
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX8::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        auto cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        // NOTE(review): SBASE is used unshifted here, whereas the s_load_*
        // variants use SBASE << 1 -- confirm this is intended.
        ConstScalarOperandU128 bufDesc(gpuDynInst, instData.SBASE);
        bufDesc.read();

        // The offset is the contents of the SGPR named by OFFSET when IMM is
        // clear, and the OFFSET field itself when IMM is set.
        ScalarRegU32 byteOffset(0);
        if (!instData.IMM) {
            ConstScalarOperandU32 offsetReg(gpuDynInst, extData.OFFSET);
            offsetReg.read();
            byteOffset = offsetReg.rawData();
        } else {
            byteOffset = extData.OFFSET;
        }

        calcAddr(gpuDynInst, bufDesc, byteOffset);

        // Hand the request to the scalar memory pipeline.
        cu->scalarMemoryPipe.getGMReqFIFO().push(gpuDynInst);

        // One fewer read in the pipe, one more outstanding.
        wave->scalarRdGmReqsInPipe--;
        wave->scalarOutstandingReqsRdGm++;
        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    /**
     * Start the memory access by delegating to initMemRead<8>; the template
     * argument mirrors the eight dwords this instruction reads.
     */
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX8::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<8>(gpuDynInst);
    } // initiateAcc

    /**
     * Finish the load by writing the destination scalar operand selected by
     * SDATA.
     */
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX8::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Eight 32-bit requests, so a U256 operand holds the result.
        ScalarOperandU256 dest(gpuDynInst, instData.SDATA);

        dest.write();
    } // completeAcc

    /**
     * Construct the s_buffer_load_dwordx16 instruction from its SMEM encoding
     * and flag it as a memory reference that performs a load.
     */
    Inst_SMEM__S_BUFFER_LOAD_DWORDX16::Inst_SMEM__S_BUFFER_LOAD_DWORDX16(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_load_dwordx16")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_BUFFER_LOAD_DWORDX16

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_BUFFER_LOAD_DWORDX16::~Inst_SMEM__S_BUFFER_LOAD_DWORDX16()
        = default;

    /**
     * Read 16 dwords from scalar data cache. See S_LOAD_DWORD for details on
     * the offset input.
     *
     * This stage reads the 128-bit resource descriptor and the offset, calls
     * calcAddr(), queues the instruction on the scalar memory pipeline and
     * updates the wavefront's scalar read request counters.
     */
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        auto cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        // NOTE(review): SBASE is used unshifted here, whereas the s_load_*
        // variants use SBASE << 1 -- confirm this is intended.
        ConstScalarOperandU128 bufDesc(gpuDynInst, instData.SBASE);
        bufDesc.read();

        // The offset is the contents of the SGPR named by OFFSET when IMM is
        // clear, and the OFFSET field itself when IMM is set.
        ScalarRegU32 byteOffset(0);
        if (!instData.IMM) {
            ConstScalarOperandU32 offsetReg(gpuDynInst, extData.OFFSET);
            offsetReg.read();
            byteOffset = offsetReg.rawData();
        } else {
            byteOffset = extData.OFFSET;
        }

        calcAddr(gpuDynInst, bufDesc, byteOffset);

        // Hand the request to the scalar memory pipeline.
        cu->scalarMemoryPipe.getGMReqFIFO().push(gpuDynInst);

        // One fewer read in the pipe, one more outstanding.
        wave->scalarRdGmReqsInPipe--;
        wave->scalarOutstandingReqsRdGm++;
        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    /**
     * Start the memory access by delegating to initMemRead<16>; the template
     * argument mirrors the sixteen dwords this instruction reads.
     */
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<16>(gpuDynInst);
    } // initiateAcc

    /**
     * Finish the load by writing the destination scalar operand selected by
     * SDATA.
     */
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Sixteen 32-bit requests, so a U512 operand holds the result.
        ScalarOperandU512 dest(gpuDynInst, instData.SDATA);

        dest.write();
    } // completeAcc

    /**
     * Construct the s_store_dword instruction from its SMEM encoding and flag
     * it as a memory reference that performs a store.
     */
    Inst_SMEM__S_STORE_DWORD::Inst_SMEM__S_STORE_DWORD(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_store_dword")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_SMEM__S_STORE_DWORD

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_STORE_DWORD::~Inst_SMEM__S_STORE_DWORD() = default;

    /**
     * Write 1 dword to scalar data cache.
     * If the offset is specified as an SGPR, the SGPR contains an unsigned
     * BYTE offset (the 2 LSBs are ignored).
     * If the offset is specified as an immediate 20-bit constant, the
     * constant is an unsigned BYTE offset.
     *
     * This stage reads the base address and offset, calls calcAddr(), queues
     * the instruction on the scalar memory pipeline and updates the
     * wavefront's scalar write request counters.
     */
    void
    Inst_SMEM__S_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        auto cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        // SBASE is shifted left by one to form the index of the 64-bit
        // base-address operand.
        ConstScalarOperandU64 baseAddr(gpuDynInst, instData.SBASE << 1);
        baseAddr.read();

        // The offset is the contents of the SGPR named by OFFSET when IMM is
        // clear, and the OFFSET field itself when IMM is set.
        ScalarRegU32 byteOffset(0);
        if (!instData.IMM) {
            ConstScalarOperandU32 offsetReg(gpuDynInst, extData.OFFSET);
            offsetReg.read();
            byteOffset = offsetReg.rawData();
        } else {
            byteOffset = extData.OFFSET;
        }

        calcAddr(gpuDynInst, baseAddr, byteOffset);

        // Hand the request to the scalar memory pipeline.
        cu->scalarMemoryPipe.getGMReqFIFO().push(gpuDynInst);

        // One fewer write in the pipe, one more outstanding.
        wave->scalarWrGmReqsInPipe--;
        wave->scalarOutstandingReqsWrGm++;
        wave->outstandingReqs++;
        wave->validateRequestCounters();
    }

    /**
     * Stage the store data and start the write: the scalar operand named by
     * SDATA is read and its raw contents are copied into
     * gpuDynInst->scalar_data before initMemWrite<1> is invoked.
     */
    void
    Inst_SMEM__S_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 storeData(gpuDynInst, instData.SDATA);
        storeData.read();

        // Copy one 32-bit scalar register's worth of bytes.
        const auto numBytes = sizeof(ScalarRegU32);
        std::memcpy((void*)gpuDynInst->scalar_data, storeData.rawDataPtr(),
            numBytes);

        initMemWrite<1>(gpuDynInst);
    } // initiateAcc

    /** No work is performed here when the store completes. */
    void
    Inst_SMEM__S_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    /**
     * Construct the s_store_dwordx2 instruction from its SMEM encoding and
     * flag it as a memory reference that performs a store.
     */
    Inst_SMEM__S_STORE_DWORDX2::Inst_SMEM__S_STORE_DWORDX2(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_store_dwordx2")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_SMEM__S_STORE_DWORDX2

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_STORE_DWORDX2::~Inst_SMEM__S_STORE_DWORDX2() = default;

    /**
     * Write 2 dwords to scalar data cache. See S_STORE_DWORD for details on
     * the offset input.
     *
     * This stage reads the base address and offset, calls calcAddr(), queues
     * the instruction on the scalar memory pipeline and updates the
     * wavefront's scalar write request counters.
     */
    void
    Inst_SMEM__S_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        auto cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        // SBASE is shifted left by one to form the index of the 64-bit
        // base-address operand.
        ConstScalarOperandU64 baseAddr(gpuDynInst, instData.SBASE << 1);
        baseAddr.read();

        // The offset is the contents of the SGPR named by OFFSET when IMM is
        // clear, and the OFFSET field itself when IMM is set.
        ScalarRegU32 byteOffset(0);
        if (!instData.IMM) {
            ConstScalarOperandU32 offsetReg(gpuDynInst, extData.OFFSET);
            offsetReg.read();
            byteOffset = offsetReg.rawData();
        } else {
            byteOffset = extData.OFFSET;
        }

        calcAddr(gpuDynInst, baseAddr, byteOffset);

        // Hand the request to the scalar memory pipeline.
        cu->scalarMemoryPipe.getGMReqFIFO().push(gpuDynInst);

        // One fewer write in the pipe, one more outstanding.
        wave->scalarWrGmReqsInPipe--;
        wave->scalarOutstandingReqsWrGm++;
        wave->outstandingReqs++;
        wave->validateRequestCounters();
    }

    /**
     * Stage the store data and start the write: the scalar operand named by
     * SDATA is read and its raw contents are copied into
     * gpuDynInst->scalar_data before initMemWrite<2> is invoked.
     */
    void
    Inst_SMEM__S_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 storeData(gpuDynInst, instData.SDATA);
        storeData.read();

        // Copy one 64-bit scalar register's worth of bytes.
        const auto numBytes = sizeof(ScalarRegU64);
        std::memcpy((void*)gpuDynInst->scalar_data, storeData.rawDataPtr(),
            numBytes);

        initMemWrite<2>(gpuDynInst);
    } // initiateAcc

    /** No work is performed here when the store completes. */
    void
    Inst_SMEM__S_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    /**
     * Construct the s_store_dwordx4 instruction from its SMEM encoding and
     * flag it as a memory reference that performs a store.
     */
    Inst_SMEM__S_STORE_DWORDX4::Inst_SMEM__S_STORE_DWORDX4(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_store_dwordx4")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_SMEM__S_STORE_DWORDX4

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_STORE_DWORDX4::~Inst_SMEM__S_STORE_DWORDX4() = default;

    /**
     * Write 4 dwords to scalar data cache. See S_STORE_DWORD for details on
     * the offset input.
     *
     * This stage reads the base address and offset, calls calcAddr(), queues
     * the instruction on the scalar memory pipeline and updates the
     * wavefront's scalar write request counters.
     */
    void
    Inst_SMEM__S_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        auto cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        // SBASE is shifted left by one to form the index of the 64-bit
        // base-address operand.
        ConstScalarOperandU64 baseAddr(gpuDynInst, instData.SBASE << 1);
        baseAddr.read();

        // The offset is the contents of the SGPR named by OFFSET when IMM is
        // clear, and the OFFSET field itself when IMM is set.
        ScalarRegU32 byteOffset(0);
        if (!instData.IMM) {
            ConstScalarOperandU32 offsetReg(gpuDynInst, extData.OFFSET);
            offsetReg.read();
            byteOffset = offsetReg.rawData();
        } else {
            byteOffset = extData.OFFSET;
        }

        calcAddr(gpuDynInst, baseAddr, byteOffset);

        // Hand the request to the scalar memory pipeline.
        cu->scalarMemoryPipe.getGMReqFIFO().push(gpuDynInst);

        // One fewer write in the pipe, one more outstanding.
        wave->scalarWrGmReqsInPipe--;
        wave->scalarOutstandingReqsWrGm++;
        wave->outstandingReqs++;
        wave->validateRequestCounters();
    }

    /**
     * Stage the store data and start the write: the scalar operand named by
     * SDATA is read and its raw contents are copied into
     * gpuDynInst->scalar_data before initMemWrite<4> is invoked.
     */
    void
    Inst_SMEM__S_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU128 storeData(gpuDynInst, instData.SDATA);
        storeData.read();

        // Copy four 32-bit scalar registers' worth of bytes.
        const auto numBytes = 4 * sizeof(ScalarRegU32);
        std::memcpy((void*)gpuDynInst->scalar_data, storeData.rawDataPtr(),
            numBytes);

        initMemWrite<4>(gpuDynInst);
    } // initiateAcc

    /** No work is performed here when the store completes. */
    void
    Inst_SMEM__S_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    /**
     * Construct the s_buffer_store_dword instruction from its SMEM encoding
     * and flag it as a memory reference that performs a store.
     */
    Inst_SMEM__S_BUFFER_STORE_DWORD::Inst_SMEM__S_BUFFER_STORE_DWORD(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_store_dword")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_SMEM__S_BUFFER_STORE_DWORD

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_BUFFER_STORE_DWORD::~Inst_SMEM__S_BUFFER_STORE_DWORD()
        = default;

    // Write 1 dword to scalar data cache. See S_STORE_DWORD for details on the
    // offset input.
    //
    // Not modeled: this function only calls panicUnimplemented().
    void
    Inst_SMEM__S_BUFFER_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /** Empty: no memory access is initiated for this instruction. */
    void
    Inst_SMEM__S_BUFFER_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    /** Empty: no completion work is performed for this instruction. */
    void
    Inst_SMEM__S_BUFFER_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    /**
     * Construct the s_buffer_store_dwordx2 instruction from its SMEM encoding
     * and flag it as a memory reference that performs a store.
     */
    Inst_SMEM__S_BUFFER_STORE_DWORDX2::Inst_SMEM__S_BUFFER_STORE_DWORDX2(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_store_dwordx2")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_SMEM__S_BUFFER_STORE_DWORDX2

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_BUFFER_STORE_DWORDX2::~Inst_SMEM__S_BUFFER_STORE_DWORDX2()
        = default;

    // Write 2 dwords to scalar data cache. See S_STORE_DWORD for details on
    // the offset input.
    //
    // Not modeled: this function only calls panicUnimplemented().
    void
    Inst_SMEM__S_BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /** Empty: no memory access is initiated for this instruction. */
    void
    Inst_SMEM__S_BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    /** Empty: no completion work is performed for this instruction. */
    void
    Inst_SMEM__S_BUFFER_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    /**
     * Construct the s_buffer_store_dwordx4 instruction from its SMEM encoding
     * and flag it as a memory reference that performs a store.
     */
    Inst_SMEM__S_BUFFER_STORE_DWORDX4::Inst_SMEM__S_BUFFER_STORE_DWORDX4(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_store_dwordx4")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_SMEM__S_BUFFER_STORE_DWORDX4

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_BUFFER_STORE_DWORDX4::~Inst_SMEM__S_BUFFER_STORE_DWORDX4()
        = default;

    // Write 4 dwords to scalar data cache. See S_STORE_DWORD for details on
    // the offset input.
    //
    // Not modeled: this function only calls panicUnimplemented().
    void
    Inst_SMEM__S_BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /** Empty: no memory access is initiated for this instruction. */
    void
    Inst_SMEM__S_BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    /** Empty: no completion work is performed for this instruction. */
    void
    Inst_SMEM__S_BUFFER_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    /**
     * Construct the s_dcache_inv instruction from its SMEM encoding. No
     * instruction flags are set here.
     */
    Inst_SMEM__S_DCACHE_INV::Inst_SMEM__S_DCACHE_INV(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_dcache_inv")
    {
    } // Inst_SMEM__S_DCACHE_INV

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_DCACHE_INV::~Inst_SMEM__S_DCACHE_INV() = default;

    // Invalidate the scalar data cache.
    //
    // Not modeled: this function only calls panicUnimplemented().
    void
    Inst_SMEM__S_DCACHE_INV::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Construct the s_dcache_wb instruction from its SMEM encoding. No
     * instruction flags are set here.
     */
    Inst_SMEM__S_DCACHE_WB::Inst_SMEM__S_DCACHE_WB(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_dcache_wb")
    {
    } // Inst_SMEM__S_DCACHE_WB

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_DCACHE_WB::~Inst_SMEM__S_DCACHE_WB() = default;

    // Write back dirty data in the scalar data cache.
    //
    // Not modeled: this function only calls panicUnimplemented().
    void
    Inst_SMEM__S_DCACHE_WB::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Construct the s_dcache_inv_vol instruction from its SMEM encoding. No
     * instruction flags are set here.
     */
    Inst_SMEM__S_DCACHE_INV_VOL::Inst_SMEM__S_DCACHE_INV_VOL(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_dcache_inv_vol")
    {
    } // Inst_SMEM__S_DCACHE_INV_VOL

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_DCACHE_INV_VOL::~Inst_SMEM__S_DCACHE_INV_VOL() = default;

    // Invalidate the scalar data cache volatile lines.
    //
    // Not modeled: this function only calls panicUnimplemented().
    void
    Inst_SMEM__S_DCACHE_INV_VOL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Construct the s_dcache_wb_vol instruction from its SMEM encoding. No
     * instruction flags are set here.
     */
    Inst_SMEM__S_DCACHE_WB_VOL::Inst_SMEM__S_DCACHE_WB_VOL(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_dcache_wb_vol")
    {
    } // Inst_SMEM__S_DCACHE_WB_VOL

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_DCACHE_WB_VOL::~Inst_SMEM__S_DCACHE_WB_VOL() = default;

    // Write back dirty data in the scalar data cache volatile lines.
    //
    // Not modeled: this function only calls panicUnimplemented().
    void
    Inst_SMEM__S_DCACHE_WB_VOL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Construct the s_memtime instruction from its SMEM encoding. No
     * instruction flags are set here.
     */
    Inst_SMEM__S_MEMTIME::Inst_SMEM__S_MEMTIME(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_memtime")
    {
    } // Inst_SMEM__S_MEMTIME

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_MEMTIME::~Inst_SMEM__S_MEMTIME() = default;

    // Return current 64-bit timestamp.
    //
    // Not modeled: this function only calls panicUnimplemented().
    void
    Inst_SMEM__S_MEMTIME::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Construct the s_memrealtime instruction from its SMEM encoding. No
     * instruction flags are set here.
     */
    Inst_SMEM__S_MEMREALTIME::Inst_SMEM__S_MEMREALTIME(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_memrealtime")
    {
    } // Inst_SMEM__S_MEMREALTIME

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_MEMREALTIME::~Inst_SMEM__S_MEMREALTIME() = default;

    // Return current 64-bit RTC.
    //
    // Not modeled: this function only calls panicUnimplemented().
    void
    Inst_SMEM__S_MEMREALTIME::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Construct the s_atc_probe instruction from its SMEM encoding. No
     * instruction flags are set here.
     */
    Inst_SMEM__S_ATC_PROBE::Inst_SMEM__S_ATC_PROBE(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_atc_probe")
    {
    } // Inst_SMEM__S_ATC_PROBE

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_ATC_PROBE::~Inst_SMEM__S_ATC_PROBE() = default;

    // Not modeled: executing s_atc_probe only calls panicUnimplemented().
    void
    Inst_SMEM__S_ATC_PROBE::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Construct the s_atc_probe_buffer instruction from its SMEM encoding. No
     * instruction flags are set here.
     */
    Inst_SMEM__S_ATC_PROBE_BUFFER::Inst_SMEM__S_ATC_PROBE_BUFFER(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_atc_probe_buffer")
    {
    } // Inst_SMEM__S_ATC_PROBE_BUFFER

    /** Destructor; performs no explicit cleanup. */
    Inst_SMEM__S_ATC_PROBE_BUFFER::~Inst_SMEM__S_ATC_PROBE_BUFFER()
        = default;

    // Not modeled: executing s_atc_probe_buffer only calls
    // panicUnimplemented().
    void
    Inst_SMEM__S_ATC_PROBE_BUFFER::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Construct the v_cndmask_b32 instruction from its VOP2 encoding and flag
     * it as an ALU operation that reads VCC.
     */
    Inst_VOP2__V_CNDMASK_B32::Inst_VOP2__V_CNDMASK_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_cndmask_b32")
    {
        setFlag(ALU);
        setFlag(ReadsVCC);
    } // Inst_VOP2__V_CNDMASK_B32

    /** Destructor; performs no explicit cleanup. */
    Inst_VOP2__V_CNDMASK_B32::~Inst_VOP2__V_CNDMASK_B32() = default;

    /**
     * D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC
     * as a scalar GPR in S2.
     *
     * For every lane enabled in the wavefront's exec mask, the lane's VCC bit
     * selects src1 (bit set) or src0 (bit clear) as the destination value.
     */
    void
    Inst_VOP2__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 onClear(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 onSet(gpuDynInst, instData.VSRC1);
        VecOperandU32 result(gpuDynInst, instData.VDST);
        ConstScalarOperandU64 condMask(gpuDynInst, REG_VCC_LO);

        onClear.readSrc();
        onSet.read();
        condMask.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            // Lanes disabled by the exec mask are left untouched.
            if (!wave->execMask(lane)) {
                continue;
            }

            if (bits(condMask.rawData(), lane)) {
                result[lane] = onSet[lane];
            } else {
                result[lane] = onClear[lane];
            }
        }

        result.write();
    }

    /**
     * Construct the v_add_f32 instruction from its VOP2 encoding and flag it
     * as an ALU operation on F32 data.
     */
    Inst_VOP2__V_ADD_F32::Inst_VOP2__V_ADD_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_add_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_ADD_F32

    /** Destructor; performs no explicit cleanup. */
    Inst_VOP2__V_ADD_F32::~Inst_VOP2__V_ADD_F32() = default;

    /**
     * D.f = S0.f + S1.f.
     *
     * For every lane enabled in the wavefront's exec mask, the sum of the two
     * source operands is written to the destination. When the instruction is
     * a DPP instruction, the first source is instead read from the register
     * named in the DPP extension word and both sources are passed through
     * processDPP() before the addition.
     */
    void
    Inst_VOP2__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        // src1 is non-const because it is handed to processDPP() below.
        VecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        if (isDPPInst()) {
            VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();

            // "%#x" already emits the "0x" prefix, so no literal "0x" is
            // written in front of it (that would print "0x0x...").
            DPRINTF(GCN3, "Handling V_ADD_F32 SRC DPP. SRC0: register v[%d], "
                    "DPP_CTRL: %#x, SRC0_ABS: %d, SRC0_NEG: %d, "
                    "SRC1_ABS: %d, SRC1_NEG: %d, BOUND_CTRL: %d, "
                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
                    extData.iFmt_VOP_DPP.DPP_CTRL,
                    extData.iFmt_VOP_DPP.SRC0_ABS,
                    extData.iFmt_VOP_DPP.SRC0_NEG,
                    extData.iFmt_VOP_DPP.SRC1_ABS,
                    extData.iFmt_VOP_DPP.SRC1_NEG,
                    extData.iFmt_VOP_DPP.BOUND_CTRL,
                    extData.iFmt_VOP_DPP.BANK_MASK,
                    extData.iFmt_VOP_DPP.ROW_MASK);

            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_dpp[lane] + src1[lane];
                }
            }
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] + src1[lane];
                }
            }
        }

        vdst.write();
    }

    /**
     * Construct the v_sub_f32 instruction from its VOP2 encoding and flag it
     * as an ALU operation on F32 data.
     */
    Inst_VOP2__V_SUB_F32::Inst_VOP2__V_SUB_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_sub_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_SUB_F32

    /** Destructor; performs no explicit cleanup. */
    Inst_VOP2__V_SUB_F32::~Inst_VOP2__V_SUB_F32() = default;

    // D.f = S0.f - S1.f.
    void
    Inst_VOP2__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // s0 is the minuend (SRC0), s1 the subtrahend (VSRC1).
        ConstVecOperandF32 s0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 s1(gpuDynInst, instData.VSRC1);
        VecOperandF32 dst(gpuDynInst, instData.VDST);

        s0.readSrc();
        s1.read();

        // Lanes disabled in the exec mask are left unassigned.
        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            dst[ln] = s0[ln] - s1[ln];
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_subrev_f32" from its encoded
    // format and sets the ALU and F32 flags on it.
    Inst_VOP2__V_SUBREV_F32::Inst_VOP2__V_SUBREV_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subrev_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_SUBREV_F32

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_SUBREV_F32::~Inst_VOP2__V_SUBREV_F32()
    {
    } // ~Inst_VOP2__V_SUBREV_F32

    // D.f = S1.f - S0.f.
    void
    Inst_VOP2__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // Reversed subtract: s1 (VSRC1) is the minuend and s0 (SRC0) the
        // subtrahend.
        ConstVecOperandF32 s0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 s1(gpuDynInst, instData.VSRC1);
        VecOperandF32 dst(gpuDynInst, instData.VDST);

        s0.readSrc();
        s1.read();

        // Lanes disabled in the exec mask are left unassigned.
        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            dst[ln] = s1[ln] - s0[ln];
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_mul_legacy_f32" from its
    // encoded format and sets the ALU and F32 flags on it.
    Inst_VOP2__V_MUL_LEGACY_F32::Inst_VOP2__V_MUL_LEGACY_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_legacy_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_MUL_LEGACY_F32

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_MUL_LEGACY_F32::~Inst_VOP2__V_MUL_LEGACY_F32()
    {
    } // ~Inst_VOP2__V_MUL_LEGACY_F32

    // D.f = S0.f * S1.f
    void
    Inst_VOP2__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 s0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 s1(gpuDynInst, instData.VSRC1);
        VecOperandF32 dst(gpuDynInst, instData.VDST);

        s0.readSrc();
        s1.read();

        // Plain host floating-point multiply per enabled lane; no special
        // casing of zero, infinity or NaN operands is done here.
        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            dst[ln] = s0[ln] * s1[ln];
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_mul_f32" from its encoded
    // format and sets the ALU and F32 flags on it.
    Inst_VOP2__V_MUL_F32::Inst_VOP2__V_MUL_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_MUL_F32

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_MUL_F32::~Inst_VOP2__V_MUL_F32()
    {
    } // ~Inst_VOP2__V_MUL_F32

    // D.f = S0.f * S1.f.
    void
    Inst_VOP2__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 s0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 s1(gpuDynInst, instData.VSRC1);
        VecOperandF32 dst(gpuDynInst, instData.VDST);

        s0.readSrc();
        s1.read();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            const auto a = s0[ln];
            const auto b = s1[ln];

            // "Tiny" means zero or subnormal; both are handled alike below.
            const int a_class = std::fpclassify(a);
            const int b_class = std::fpclassify(b);
            const bool a_tiny = a_class == FP_SUBNORMAL || a_class == FP_ZERO;
            const bool b_tiny = b_class == FP_SUBNORMAL || b_class == FP_ZERO;

            // The sign of a zero/infinite result is the XOR of the operand
            // signs.
            const bool negative = std::signbit(a) != std::signbit(b);

            if (std::isnan(a) || std::isnan(b)) {
                // Any NaN operand yields NaN.
                dst[ln] = NAN;
            } else if (a_tiny) {
                // tiny * inf is NaN, otherwise a signed zero.
                if (std::isinf(b)) {
                    dst[ln] = NAN;
                } else if (negative) {
                    dst[ln] = -0.0;
                } else {
                    dst[ln] = +0.0;
                }
            } else if (std::isinf(a)) {
                // inf * tiny is NaN, otherwise a signed infinity.
                if (b_tiny) {
                    dst[ln] = NAN;
                } else if (negative) {
                    dst[ln] = -INFINITY;
                } else {
                    dst[ln] = +INFINITY;
                }
            } else {
                // First operand is a normal number: ordinary multiply.
                dst[ln] = a * b;
            }
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_mul_i32_i24" from its encoded
    // format and sets the ALU flag on it.
    Inst_VOP2__V_MUL_I32_I24::Inst_VOP2__V_MUL_I32_I24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_i32_i24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_I32_I24

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_MUL_I32_I24::~Inst_VOP2__V_MUL_I32_I24()
    {
    } // ~Inst_VOP2__V_MUL_I32_I24

    // D.i = S0.i[23:0] * S1.i[23:0].
    void
    Inst_VOP2__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 s0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 s1(gpuDynInst, instData.VSRC1);
        VecOperandI32 dst(gpuDynInst, instData.VDST);

        s0.readSrc();
        s1.read();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            // Take bits [23:0] of each operand and sign-extend from 24 bits
            // before multiplying.
            const auto lhs = sext<24>(bits(s0[ln], 23, 0));
            const auto rhs = sext<24>(bits(s1[ln], 23, 0));

            dst[ln] = lhs * rhs;
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_mul_hi_i32_i24" from its
    // encoded format and sets the ALU flag on it.
    Inst_VOP2__V_MUL_HI_I32_I24::Inst_VOP2__V_MUL_HI_I32_I24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_hi_i32_i24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_HI_I32_I24

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_MUL_HI_I32_I24::~Inst_VOP2__V_MUL_HI_I32_I24()
    {
    } // ~Inst_VOP2__V_MUL_HI_I32_I24

    // D.i = (S0.i[23:0] * S1.i[23:0]) >> 32.
    void
    Inst_VOP2__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 s0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 s1(gpuDynInst, instData.VSRC1);
        VecOperandI32 dst(gpuDynInst, instData.VDST);

        s0.readSrc();
        s1.read();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            // Sign-extend bits [23:0] of each operand and widen to 64 bits
            // so the full product is available.
            const VecElemI64 lhs
                = static_cast<VecElemI64>(sext<24>(bits(s0[ln], 23, 0)));
            const VecElemI64 rhs
                = static_cast<VecElemI64>(sext<24>(bits(s1[ln], 23, 0)));
            const VecElemI64 product = lhs * rhs;

            // Keep the part of the product above bit 31.
            dst[ln] = static_cast<VecElemI32>(product >> 32);
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_mul_u32_u24" from its encoded
    // format and sets the ALU flag on it.
    Inst_VOP2__V_MUL_U32_U24::Inst_VOP2__V_MUL_U32_U24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_u32_u24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_U32_U24

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_MUL_U32_U24::~Inst_VOP2__V_MUL_U32_U24()
    {
    } // ~Inst_VOP2__V_MUL_U32_U24

    // D.u = S0.u[23:0] * S1.u[23:0].
    void
    Inst_VOP2__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 s0(gpuDynInst, instData.SRC0);
        VecOperandU32 s1(gpuDynInst, instData.VSRC1);
        VecOperandU32 dst(gpuDynInst, instData.VDST);

        s0.readSrc();
        s1.read();

        if (!isSDWAInst()) {
            // Plain encoding: multiply bits [23:0] of both operands.
            for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
                if (!wave->execMask(ln)) {
                    continue;
                }

                dst[ln] = bits(s0[ln], 23, 0) * bits(s1[ln], 23, 0);
            }
        } else {
            // SDWA encoding: the first source is the register named in the
            // SDWA extension word.
            VecOperandU32 sdwaSrc0(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);

            // Untouched copies of both sources and the destination, handed
            // to the SDWA helpers alongside the working operands.
            VecOperandU32 sdwaSrc0Orig(gpuDynInst,
                                       extData.iFmt_VOP_SDWA.SRC0);
            VecOperandU32 s1Orig(gpuDynInst, instData.VSRC1);
            VecOperandU32 dstOrig(gpuDynInst, instData.VDST);

            sdwaSrc0.read();
            sdwaSrc0Orig.read();
            s1Orig.read();

            DPRINTF(GCN3, "Handling V_MUL_U32_U24 SRC SDWA. SRC0: register "
                    "v[%d], DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: "
                    "%d, SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: "
                    "%d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
                    extData.iFmt_VOP_SDWA.SRC0,
                    extData.iFmt_VOP_SDWA.DST_SEL,
                    extData.iFmt_VOP_SDWA.DST_UNUSED,
                    extData.iFmt_VOP_SDWA.CLAMP,
                    extData.iFmt_VOP_SDWA.SRC0_SEL,
                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
                    extData.iFmt_VOP_SDWA.SRC0_NEG,
                    extData.iFmt_VOP_SDWA.SRC0_ABS,
                    extData.iFmt_VOP_SDWA.SRC1_SEL,
                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
                    extData.iFmt_VOP_SDWA.SRC1_NEG,
                    extData.iFmt_VOP_SDWA.SRC1_ABS);

            processSDWA_src(extData.iFmt_VOP_SDWA, sdwaSrc0, sdwaSrc0Orig,
                            s1, s1Orig);

            for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
                if (!wave->execMask(ln)) {
                    continue;
                }

                dst[ln] = bits(sdwaSrc0[ln], 23, 0) * bits(s1[ln], 23, 0);
                // Mirror the result into the copy given to the dst helper.
                dstOrig[ln] = dst[ln];
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, dst, dstOrig);
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_mul_hi_u32_u24" from its
    // encoded format and sets the ALU flag on it.
    Inst_VOP2__V_MUL_HI_U32_U24::Inst_VOP2__V_MUL_HI_U32_U24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_hi_u32_u24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_HI_U32_U24

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_MUL_HI_U32_U24::~Inst_VOP2__V_MUL_HI_U32_U24()
    {
    } // ~Inst_VOP2__V_MUL_HI_U32_U24

    // D.i = (S0.u[23:0] * S1.u[23:0]) >> 32.
    void
    Inst_VOP2__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 s0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 s1(gpuDynInst, instData.VSRC1);
        VecOperandU32 dst(gpuDynInst, instData.VDST);

        s0.readSrc();
        s1.read();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            // Widen bits [23:0] of each operand to 64 bits so the full
            // product is available.
            const VecElemU64 lhs
                = static_cast<VecElemU64>(bits(s0[ln], 23, 0));
            const VecElemU64 rhs
                = static_cast<VecElemU64>(bits(s1[ln], 23, 0));
            const VecElemU64 product = lhs * rhs;

            // Keep the part of the product above bit 31.
            dst[ln] = static_cast<VecElemU32>(product >> 32);
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_min_f32" from its encoded
    // format and sets the ALU and F32 flags on it.
    Inst_VOP2__V_MIN_F32::Inst_VOP2__V_MIN_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_min_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_MIN_F32

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_MIN_F32::~Inst_VOP2__V_MIN_F32()
    {
    } // ~Inst_VOP2__V_MIN_F32

    // D.f = (S0.f < S1.f ? S0.f : S1.f).
    void
    Inst_VOP2__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 s0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 s1(gpuDynInst, instData.VSRC1);
        VecOperandF32 dst(gpuDynInst, instData.VDST);

        s0.readSrc();
        s1.read();

        // The minimum is computed with std::fmin, so the result follows
        // that function's handling of NaN and signed zeros.
        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            dst[ln] = std::fmin(s0[ln], s1[ln]);
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_max_f32" from its encoded
    // format and sets the ALU and F32 flags on it.
    Inst_VOP2__V_MAX_F32::Inst_VOP2__V_MAX_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_max_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_MAX_F32

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_MAX_F32::~Inst_VOP2__V_MAX_F32()
    {
    } // ~Inst_VOP2__V_MAX_F32

    // D.f = (S0.f >= S1.f ? S0.f : S1.f).
    void
    Inst_VOP2__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 s0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 s1(gpuDynInst, instData.VSRC1);
        VecOperandF32 dst(gpuDynInst, instData.VDST);

        s0.readSrc();
        s1.read();

        // The maximum is computed with std::fmax, so the result follows
        // that function's handling of NaN and signed zeros.
        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            dst[ln] = std::fmax(s0[ln], s1[ln]);
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_min_i32" from its encoded
    // format and sets the ALU flag on it.
    Inst_VOP2__V_MIN_I32::Inst_VOP2__V_MIN_I32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_min_i32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MIN_I32

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_MIN_I32::~Inst_VOP2__V_MIN_I32()
    {
    } // ~Inst_VOP2__V_MIN_I32

    // D.i = min(S0.i, S1.i).
    void
    Inst_VOP2__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 s0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 s1(gpuDynInst, instData.VSRC1);
        VecOperandI32 dst(gpuDynInst, instData.VDST);

        s0.readSrc();
        s1.read();

        // Signed comparison: both operands are read as I32 elements.
        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            dst[ln] = std::min(s0[ln], s1[ln]);
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_max_i32" from its encoded
    // format and sets the ALU flag on it.
    Inst_VOP2__V_MAX_I32::Inst_VOP2__V_MAX_I32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_max_i32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MAX_I32

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_MAX_I32::~Inst_VOP2__V_MAX_I32()
    {
    } // ~Inst_VOP2__V_MAX_I32

    // D.i = max(S0.i, S1.i).
    void
    Inst_VOP2__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 s0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 s1(gpuDynInst, instData.VSRC1);
        VecOperandI32 dst(gpuDynInst, instData.VDST);

        s0.readSrc();
        s1.read();

        // Signed comparison: both operands are read as I32 elements.
        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            dst[ln] = std::max(s0[ln], s1[ln]);
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_min_u32" from its encoded
    // format and sets the ALU flag on it.
    Inst_VOP2__V_MIN_U32::Inst_VOP2__V_MIN_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_min_u32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MIN_U32

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_MIN_U32::~Inst_VOP2__V_MIN_U32()
    {
    } // ~Inst_VOP2__V_MIN_U32

    // D.u = min(S0.u, S1.u).
    void
    Inst_VOP2__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 s0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 s1(gpuDynInst, instData.VSRC1);
        VecOperandU32 dst(gpuDynInst, instData.VDST);

        s0.readSrc();
        s1.read();

        // Unsigned comparison: both operands are read as U32 elements.
        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            dst[ln] = std::min(s0[ln], s1[ln]);
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_max_u32" from its encoded
    // format and sets the ALU flag on it.
    Inst_VOP2__V_MAX_U32::Inst_VOP2__V_MAX_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_max_u32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MAX_U32

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_MAX_U32::~Inst_VOP2__V_MAX_U32()
    {
    } // ~Inst_VOP2__V_MAX_U32

    // D.u = max(S0.u, S1.u).
    void
    Inst_VOP2__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 s0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 s1(gpuDynInst, instData.VSRC1);
        VecOperandU32 dst(gpuDynInst, instData.VDST);

        s0.readSrc();
        s1.read();

        // Unsigned comparison: both operands are read as U32 elements.
        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            dst[ln] = std::max(s0[ln], s1[ln]);
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_lshrrev_b32" from its encoded
    // format and sets the ALU flag on it.
    Inst_VOP2__V_LSHRREV_B32::Inst_VOP2__V_LSHRREV_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_lshrrev_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_LSHRREV_B32

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_LSHRREV_B32::~Inst_VOP2__V_LSHRREV_B32()
    {
    } // ~Inst_VOP2__V_LSHRREV_B32

    // D.u = S1.u >> S0.u[4:0].
    // The vacated bits are set to zero.
    void
    Inst_VOP2__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // "Reversed" operand roles: SRC0 supplies the shift amount and
        // VSRC1 the value being shifted.
        ConstVecOperandU32 shiftSrc(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 valueSrc(gpuDynInst, instData.VSRC1);
        VecOperandU32 dst(gpuDynInst, instData.VDST);

        shiftSrc.readSrc();
        valueSrc.read();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            // Only bits [4:0] of the shift operand are used.
            const auto amount = bits(shiftSrc[ln], 4, 0);

            // Unsigned element type, so this is a logical right shift.
            dst[ln] = valueSrc[ln] >> amount;
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_ashrrev_i32" from its encoded
    // format and sets the ALU flag on it.
    Inst_VOP2__V_ASHRREV_I32::Inst_VOP2__V_ASHRREV_I32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_ashrrev_i32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_ASHRREV_I32

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_ASHRREV_I32::~Inst_VOP2__V_ASHRREV_I32()
    {
    } // ~Inst_VOP2__V_ASHRREV_I32

    // D.i = signext(S1.i) >> S0.i[4:0].
    // The vacated bits are set to the sign bit of the input value.
    void
    Inst_VOP2__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // "Reversed" operand roles: SRC0 (read as unsigned) supplies the
        // shift amount and VSRC1 (read as signed) the value being shifted.
        ConstVecOperandU32 shiftSrc(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 valueSrc(gpuDynInst, instData.VSRC1);
        VecOperandI32 dst(gpuDynInst, instData.VDST);

        shiftSrc.readSrc();
        valueSrc.read();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            // Only bits [4:0] of the shift operand are used.
            const auto amount = bits(shiftSrc[ln], 4, 0);

            // Right shift of a signed element.
            dst[ln] = valueSrc[ln] >> amount;
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_lshlrev_b32" from its encoded
    // format and sets the ALU flag on it.
    Inst_VOP2__V_LSHLREV_B32::Inst_VOP2__V_LSHLREV_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_lshlrev_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_LSHLREV_B32

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_LSHLREV_B32::~Inst_VOP2__V_LSHLREV_B32()
    {
    } // ~Inst_VOP2__V_LSHLREV_B32

    // D.u = S1.u << S0.u[4:0].
    void
    Inst_VOP2__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // "Reversed" operand roles: SRC0 supplies the shift amount and
        // VSRC1 the value being shifted.
        ConstVecOperandU32 shiftSrc(gpuDynInst, instData.SRC0);
        VecOperandU32 valueSrc(gpuDynInst, instData.VSRC1);
        VecOperandU32 dst(gpuDynInst, instData.VDST);

        shiftSrc.readSrc();
        valueSrc.read();

        if (!isSDWAInst()) {
            // Plain encoding: shift by bits [4:0] of SRC0.
            for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
                if (!wave->execMask(ln)) {
                    continue;
                }

                dst[ln] = valueSrc[ln] << bits(shiftSrc[ln], 4, 0);
            }
        } else {
            // SDWA encoding: the shift amount comes from the register named
            // in the SDWA extension word.
            VecOperandU32 sdwaShift(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);

            // Untouched copies of both sources and the destination, handed
            // to the SDWA helpers alongside the working operands.
            VecOperandU32 sdwaShiftOrig(gpuDynInst,
                                        extData.iFmt_VOP_SDWA.SRC0);
            VecOperandU32 valueOrig(gpuDynInst, instData.VSRC1);
            VecOperandU32 dstOrig(gpuDynInst, instData.VDST);

            sdwaShift.read();
            sdwaShiftOrig.read();
            valueOrig.read();

            DPRINTF(GCN3, "Handling V_LSHLREV_B32 SRC SDWA. SRC0: register "
                    "v[%d], DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: "
                    "%d, SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: "
                    "%d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
                    extData.iFmt_VOP_SDWA.SRC0,
                    extData.iFmt_VOP_SDWA.DST_SEL,
                    extData.iFmt_VOP_SDWA.DST_UNUSED,
                    extData.iFmt_VOP_SDWA.CLAMP,
                    extData.iFmt_VOP_SDWA.SRC0_SEL,
                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
                    extData.iFmt_VOP_SDWA.SRC0_NEG,
                    extData.iFmt_VOP_SDWA.SRC0_ABS,
                    extData.iFmt_VOP_SDWA.SRC1_SEL,
                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
                    extData.iFmt_VOP_SDWA.SRC1_NEG,
                    extData.iFmt_VOP_SDWA.SRC1_ABS);

            processSDWA_src(extData.iFmt_VOP_SDWA, sdwaShift, sdwaShiftOrig,
                            valueSrc, valueOrig);

            for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
                if (!wave->execMask(ln)) {
                    continue;
                }

                dst[ln] = valueSrc[ln] << bits(sdwaShift[ln], 4, 0);
                // Mirror the result into the copy given to the dst helper.
                dstOrig[ln] = dst[ln];
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, dst, dstOrig);
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_and_b32" from its encoded
    // format and sets the ALU flag on it.
    Inst_VOP2__V_AND_B32::Inst_VOP2__V_AND_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_and_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_AND_B32

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_AND_B32::~Inst_VOP2__V_AND_B32()
    {
    } // ~Inst_VOP2__V_AND_B32

    // D.u = S0.u & S1.u.
    // Input and output modifiers not supported.
    void
    Inst_VOP2__V_AND_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 s0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 s1(gpuDynInst, instData.VSRC1);
        VecOperandU32 dst(gpuDynInst, instData.VDST);

        s0.readSrc();
        s1.read();

        // Bitwise AND per enabled lane.
        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            dst[ln] = s0[ln] & s1[ln];
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_or_b32" from its encoded
    // format and sets the ALU flag on it.
    Inst_VOP2__V_OR_B32::Inst_VOP2__V_OR_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_or_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_OR_B32

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_OR_B32::~Inst_VOP2__V_OR_B32()
    {
    } // ~Inst_VOP2__V_OR_B32

    // D.u = S0.u | S1.u.
    // Input and output modifiers not supported.
    void
    Inst_VOP2__V_OR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 s0(gpuDynInst, instData.SRC0);
        VecOperandU32 s1(gpuDynInst, instData.VSRC1);
        VecOperandU32 dst(gpuDynInst, instData.VDST);

        s0.readSrc();
        s1.read();

        if (!isSDWAInst()) {
            // Plain encoding: bitwise OR per enabled lane.
            for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
                if (!wave->execMask(ln)) {
                    continue;
                }

                dst[ln] = s0[ln] | s1[ln];
            }
        } else {
            // SDWA encoding: the first source is the register named in the
            // SDWA extension word.
            VecOperandU32 sdwaSrc0(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);

            // Untouched copies of both sources and the destination, handed
            // to the SDWA helpers alongside the working operands.
            VecOperandU32 sdwaSrc0Orig(gpuDynInst,
                                       extData.iFmt_VOP_SDWA.SRC0);
            VecOperandU32 s1Orig(gpuDynInst, instData.VSRC1);
            VecOperandU32 dstOrig(gpuDynInst, instData.VDST);

            sdwaSrc0.read();
            sdwaSrc0Orig.read();
            s1Orig.read();

            DPRINTF(GCN3, "Handling V_OR_B32 SRC SDWA. SRC0: register v[%d], "
                    "DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: %d, "
                    "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, "
                    "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
                    extData.iFmt_VOP_SDWA.SRC0,
                    extData.iFmt_VOP_SDWA.DST_SEL,
                    extData.iFmt_VOP_SDWA.DST_UNUSED,
                    extData.iFmt_VOP_SDWA.CLAMP,
                    extData.iFmt_VOP_SDWA.SRC0_SEL,
                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
                    extData.iFmt_VOP_SDWA.SRC0_NEG,
                    extData.iFmt_VOP_SDWA.SRC0_ABS,
                    extData.iFmt_VOP_SDWA.SRC1_SEL,
                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
                    extData.iFmt_VOP_SDWA.SRC1_NEG,
                    extData.iFmt_VOP_SDWA.SRC1_ABS);

            processSDWA_src(extData.iFmt_VOP_SDWA, sdwaSrc0, sdwaSrc0Orig,
                            s1, s1Orig);

            for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
                if (!wave->execMask(ln)) {
                    continue;
                }

                dst[ln] = sdwaSrc0[ln] | s1[ln];
                // Mirror the result into the copy given to the dst helper.
                dstOrig[ln] = dst[ln];
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, dst, dstOrig);
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_xor_b32" from its encoded
    // format and sets the ALU flag on it.
    Inst_VOP2__V_XOR_B32::Inst_VOP2__V_XOR_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_xor_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_XOR_B32

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_XOR_B32::~Inst_VOP2__V_XOR_B32()
    {
    } // ~Inst_VOP2__V_XOR_B32

    // D.u = S0.u ^ S1.u.
    // Input and output modifiers not supported.
    void
    Inst_VOP2__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 s0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 s1(gpuDynInst, instData.VSRC1);
        VecOperandU32 dst(gpuDynInst, instData.VDST);

        s0.readSrc();
        s1.read();

        // Bitwise XOR per enabled lane.
        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            dst[ln] = s0[ln] ^ s1[ln];
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_mac_f32" from its encoded
    // format and sets the ALU, F32 and MAC flags on it.
    Inst_VOP2__V_MAC_F32::Inst_VOP2__V_MAC_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mac_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(MAC);
    } // Inst_VOP2__V_MAC_F32

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_MAC_F32::~Inst_VOP2__V_MAC_F32()
    {
    } // ~Inst_VOP2__V_MAC_F32

    // D.f = S0.f * S1.f + D.f.
    void
    Inst_VOP2__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per-lane fused multiply-accumulate into VDST. Only lanes whose
        // exec-mask bit is set are assigned.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        VecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();
        // The destination doubles as the accumulator input, so it has to be
        // read before it is overwritten.
        vdst.read();

        if (isDPPInst()) {
            // DPP encoding: the first source is the register named in the
            // DPP extension word; it is passed through processDPP()
            // together with src1 before the multiply-accumulate.
            VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();

            // "%#x" already emits the "0x" prefix, so the format string must
            // not add a literal "0x" in front of it.
            DPRINTF(GCN3, "Handling V_MAC_F32 SRC DPP. SRC0: register v[%d], "
                    "DPP_CTRL: %#x, SRC0_ABS: %d, SRC0_NEG: %d, "
                    "SRC1_ABS: %d, SRC1_NEG: %d, BOUND_CTRL: %d, "
                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
                    extData.iFmt_VOP_DPP.DPP_CTRL,
                    extData.iFmt_VOP_DPP.SRC0_ABS,
                    extData.iFmt_VOP_DPP.SRC0_NEG,
                    extData.iFmt_VOP_DPP.SRC1_ABS,
                    extData.iFmt_VOP_DPP.SRC1_NEG,
                    extData.iFmt_VOP_DPP.BOUND_CTRL,
                    extData.iFmt_VOP_DPP.BANK_MASK,
                    extData.iFmt_VOP_DPP.ROW_MASK);

            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = std::fma(src0_dpp[lane], src1[lane],
                                          vdst[lane]);
                }
            }
        } else {
            // Plain VOP2 encoding: accumulate using the operands as read.
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
                }
            }
        }

        vdst.write();
    }

    // Constructs the VOP2 instruction named "v_madmk_f32" from its encoded
    // format and sets the ALU, F32 and MAD flags on it.
    Inst_VOP2__V_MADMK_F32::Inst_VOP2__V_MADMK_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_madmk_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(MAD);
    } // Inst_VOP2__V_MADMK_F32

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_MADMK_F32::~Inst_VOP2__V_MADMK_F32()
    {
    } // ~Inst_VOP2__V_MADMK_F32

    // D.f = S0.f * K + S1.f; K is a 32-bit inline constant.
    // This opcode cannot use the input/output modifiers.
    void
    Inst_VOP2__V_MADMK_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 s0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 s1(gpuDynInst, instData.VSRC1);
        VecOperandF32 dst(gpuDynInst, instData.VDST);

        // K comes from the instruction's extension data and is the same for
        // every lane; here it is the multiplier.
        const VecElemF32 multiplier = extData.imm_f32;

        s0.readSrc();
        s1.read();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            // s0 * K + s1 as a single fused operation.
            dst[ln] = std::fma(s0[ln], multiplier, s1[ln]);
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_madak_f32" from its encoded
    // format and sets the ALU, F32 and MAD flags on it.
    Inst_VOP2__V_MADAK_F32::Inst_VOP2__V_MADAK_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_madak_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(MAD);
    } // Inst_VOP2__V_MADAK_F32

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_MADAK_F32::~Inst_VOP2__V_MADAK_F32()
    {
    } // ~Inst_VOP2__V_MADAK_F32

    // D.f = S0.f * S1.f + K; K is a 32-bit inline constant.
    // This opcode cannot use input/output modifiers.
    void
    Inst_VOP2__V_MADAK_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 s0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 s1(gpuDynInst, instData.VSRC1);
        VecOperandF32 dst(gpuDynInst, instData.VDST);

        // K comes from the instruction's extension data and is the same for
        // every lane; here it is the addend.
        const VecElemF32 addend = extData.imm_f32;

        s0.readSrc();
        s1.read();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            // s0 * s1 + K as a single fused operation.
            dst[ln] = std::fma(s0[ln], s1[ln], addend);
        }

        dst.write();
    }

    // Constructs the VOP2 instruction named "v_add_u32" from its encoded
    // format and sets the ALU and WritesVCC flags on it.
    Inst_VOP2__V_ADD_U32::Inst_VOP2__V_ADD_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_add_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    } // Inst_VOP2__V_ADD_U32

    // Destructor with an empty body; no cleanup is performed here.
    Inst_VOP2__V_ADD_U32::~Inst_VOP2__V_ADD_U32()
    {
    } // ~Inst_VOP2__V_ADD_U32

    // D.u = S0.u + S1.u;
    // VCC[threadId] = (S0.u + S1.u >= 0x100000000ULL ? 1 : 0) is an UNSIGNED
    // overflow or carry-out.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    void
    Inst_VOP2__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 s0(gpuDynInst, instData.SRC0);
        VecOperandU32 s1(gpuDynInst, instData.VSRC1);
        VecOperandU32 dst(gpuDynInst, instData.VDST);
        // Carry-out bits are collected here, one bit per lane.
        ScalarOperandU64 carry(gpuDynInst, REG_VCC_LO);

        s0.readSrc();
        s1.read();

        if (!isSDWAInst()) {
            // Plain encoding.
            for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
                if (!wave->execMask(ln)) {
                    continue;
                }

                dst[ln] = s0[ln] + s1[ln];

                // Redo the add in 64 bits to see whether it left 32 bits.
                const VecElemU64 wide = static_cast<VecElemU64>(s0[ln])
                    + static_cast<VecElemU64>(s1[ln]);
                carry.setBit(ln, (wide >= 0x100000000ULL) ? 1 : 0);
            }
        } else {
            // SDWA encoding: the first source is the register named in the
            // SDWA extension word.
            VecOperandU32 sdwaSrc0(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);

            // Untouched copies of both sources and the destination, handed
            // to the SDWA helpers alongside the working operands.
            VecOperandU32 sdwaSrc0Orig(gpuDynInst,
                                       extData.iFmt_VOP_SDWA.SRC0);
            VecOperandU32 s1Orig(gpuDynInst, instData.VSRC1);
            VecOperandU32 dstOrig(gpuDynInst, instData.VDST);

            sdwaSrc0.read();
            sdwaSrc0Orig.read();
            s1Orig.read();

            DPRINTF(GCN3, "Handling V_ADD_U32 SRC SDWA. SRC0: register v[%d], "
                    "DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: %d, "
                    "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, "
                    "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
                    extData.iFmt_VOP_SDWA.SRC0,
                    extData.iFmt_VOP_SDWA.DST_SEL,
                    extData.iFmt_VOP_SDWA.DST_UNUSED,
                    extData.iFmt_VOP_SDWA.CLAMP,
                    extData.iFmt_VOP_SDWA.SRC0_SEL,
                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
                    extData.iFmt_VOP_SDWA.SRC0_NEG,
                    extData.iFmt_VOP_SDWA.SRC0_ABS,
                    extData.iFmt_VOP_SDWA.SRC1_SEL,
                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
                    extData.iFmt_VOP_SDWA.SRC1_NEG,
                    extData.iFmt_VOP_SDWA.SRC1_ABS);

            processSDWA_src(extData.iFmt_VOP_SDWA, sdwaSrc0, sdwaSrc0Orig,
                            s1, s1Orig);

            for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
                if (!wave->execMask(ln)) {
                    continue;
                }

                dst[ln] = sdwaSrc0[ln] + s1[ln];
                // Mirror the result into the copy given to the dst helper.
                dstOrig[ln] = dst[ln];

                // Redo the add in 64 bits to see whether it left 32 bits.
                const VecElemU64 wide = static_cast<VecElemU64>(sdwaSrc0[ln])
                    + static_cast<VecElemU64>(s1[ln]);
                carry.setBit(ln, (wide >= 0x100000000ULL) ? 1 : 0);
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, dst, dstOrig);
        }

        carry.write();
        dst.write();
    }

    // Constructor: passes iFmt and "v_sub_u32" to the Inst_VOP2 base and
    // sets the ALU and WritesVCC flags.
    Inst_VOP2__V_SUB_U32::Inst_VOP2__V_SUB_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_sub_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    } // Inst_VOP2__V_SUB_U32

    // Destructor; its body is empty.
    Inst_VOP2__V_SUB_U32::~Inst_VOP2__V_SUB_U32()
    {
    } // ~Inst_VOP2__V_SUB_U32

    // D.u = S0.u - S1.u;
    // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or
    // carry-out.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    void
    Inst_VOP2__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: vdst = src0 - src1, and the lane's VCC bit is
        // set when src1 is larger than src0 (a borrow).
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wf->execMask(i)) {
                continue;
            }

            const auto minuend = src0[i];
            const auto subtrahend = src1[i];
            vdst[i] = minuend - subtrahend;
            vcc.setBit(i, subtrahend > minuend ? 1 : 0);
        }

        vdst.write();
        vcc.write();
    }

    // Constructor: passes iFmt and "v_subrev_u32" to the Inst_VOP2 base and
    // sets the ALU and WritesVCC flags.
    Inst_VOP2__V_SUBREV_U32::Inst_VOP2__V_SUBREV_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subrev_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    } // Inst_VOP2__V_SUBREV_U32

    // Destructor; its body is empty.
    Inst_VOP2__V_SUBREV_U32::~Inst_VOP2__V_SUBREV_U32()
    {
    } // ~Inst_VOP2__V_SUBREV_U32

    // D.u = S1.u - S0.u;
    // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or
    // carry-out.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    void
    Inst_VOP2__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: vdst = src1 - src0 (operands reversed), and the
        // lane's VCC bit is set when src0 is larger than src1 (a borrow).
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wf->execMask(i)) {
                continue;
            }

            const auto subtrahend = src0[i];
            const auto minuend = src1[i];
            vdst[i] = minuend - subtrahend;
            vcc.setBit(i, subtrahend > minuend ? 1 : 0);
        }

        vdst.write();
        vcc.write();
    }

    // Constructor: passes iFmt and "v_addc_u32" to the Inst_VOP2 base and
    // sets the ALU, WritesVCC and ReadsVCC flags.
    Inst_VOP2__V_ADDC_U32::Inst_VOP2__V_ADDC_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_addc_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(ReadsVCC);
    } // Inst_VOP2__V_ADDC_U32

    // Destructor; its body is empty.
    Inst_VOP2__V_ADDC_U32::~Inst_VOP2__V_ADDC_U32()
    {
    } // ~Inst_VOP2__V_ADDC_U32

    // D.u = S0.u + S1.u + VCC[threadId];
    // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x100000000ULL ? 1 : 0)
    // is an UNSIGNED overflow.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
    // source comes from the SGPR-pair at S2.u.
    void
    Inst_VOP2__V_ADDC_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: vdst = src0 + src1 + the lane's current VCC
        // bit; the lane's VCC bit is then replaced by the carry-out of that
        // sum, evaluated in 64 bits.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();
        vcc.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wf->execMask(i)) {
                continue;
            }

            vdst[i] = src0[i] + src1[i] + bits(vcc.rawData(), i);

            // The carry-in is sampled before setBit() overwrites it.
            const bool carryOut = ((VecElemU64)src0[i]
                + (VecElemU64)src1[i]
                + (VecElemU64)bits(vcc.rawData(), i, i)) >= 0x100000000;
            vcc.setBit(i, carryOut ? 1 : 0);
        }

        vdst.write();
        vcc.write();
    }

    // Constructor: passes iFmt and "v_subb_u32" to the Inst_VOP2 base and
    // sets the ALU, WritesVCC and ReadsVCC flags.
    Inst_VOP2__V_SUBB_U32::Inst_VOP2__V_SUBB_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subb_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(ReadsVCC);
    } // Inst_VOP2__V_SUBB_U32

    // Destructor; its body is empty.
    Inst_VOP2__V_SUBB_U32::~Inst_VOP2__V_SUBB_U32()
    {
    } // ~Inst_VOP2__V_SUBB_U32

    // D.u = S0.u - S1.u - VCC[threadId];
    // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED
    // overflow.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
    // source comes from the SGPR-pair at S2.u.
    void
    Inst_VOP2__V_SUBB_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: vdst = src0 - src1 - the lane's current VCC
        // bit; the lane's VCC bit is then set when src1 plus that bit
        // exceeds src0.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();
        vcc.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wf->execMask(i)) {
                continue;
            }

            vdst[i] = src0[i] - src1[i] - bits(vcc.rawData(), i);

            // The borrow-in is sampled before setBit() overwrites it.
            const bool borrowOut
                = (src1[i] + bits(vcc.rawData(), i)) > src0[i];
            vcc.setBit(i, borrowOut ? 1 : 0);
        }

        vdst.write();
        vcc.write();
    }

    // Constructor: passes iFmt and "v_subbrev_u32" to the Inst_VOP2 base and
    // sets the ALU, WritesVCC and ReadsVCC flags.
    Inst_VOP2__V_SUBBREV_U32::Inst_VOP2__V_SUBBREV_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subbrev_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(ReadsVCC);
    } // Inst_VOP2__V_SUBBREV_U32

    // Destructor; its body is empty.
    Inst_VOP2__V_SUBBREV_U32::~Inst_VOP2__V_SUBBREV_U32()
    {
    } // ~Inst_VOP2__V_SUBBREV_U32

    // D.u = S1.u - S0.u - VCC[threadId];
    // VCC[threadId] = (S0.u + VCC[threadId] > S1.u ? 1 : 0) is an UNSIGNED
    // overflow.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
    // source comes from the SGPR-pair at S2.u.
    void
    Inst_VOP2__V_SUBBREV_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: vdst = src1 - src0 - the lane's current VCC
        // bit (operands reversed); the lane's VCC bit is then set when src0
        // plus that bit exceeds src1.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();
        vcc.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wf->execMask(i)) {
                continue;
            }

            vdst[i] = src1[i] - src0[i] - bits(vcc.rawData(), i);

            // The borrow-in is sampled before setBit() overwrites it.
            const bool borrowOut
                = (src0[i] + bits(vcc.rawData(), i)) > src1[i];
            vcc.setBit(i, borrowOut ? 1 : 0);
        }

        vdst.write();
        vcc.write();
    }

    // Constructor: passes iFmt and "v_add_f16" to the Inst_VOP2 base and
    // sets the ALU and F16 flags.
    Inst_VOP2__V_ADD_F16::Inst_VOP2__V_ADD_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_add_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_ADD_F16

    // Destructor; its body is empty.
    Inst_VOP2__V_ADD_F16::~Inst_VOP2__V_ADD_F16()
    {
    } // ~Inst_VOP2__V_ADD_F16

    // D.f16 = S0.f16 + S1.f16.
    void
    Inst_VOP2__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // No emulation here: the only action is the call below, and
        // gpuDynInst is not used.
        panicUnimplemented();
    }

    // Constructor: passes iFmt and "v_sub_f16" to the Inst_VOP2 base and
    // sets the ALU and F16 flags.
    Inst_VOP2__V_SUB_F16::Inst_VOP2__V_SUB_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_sub_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_SUB_F16

    // Destructor; its body is empty.
    Inst_VOP2__V_SUB_F16::~Inst_VOP2__V_SUB_F16()
    {
    } // ~Inst_VOP2__V_SUB_F16

    // D.f16 = S0.f16 - S1.f16.
    void
    Inst_VOP2__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // No emulation here: the only action is the call below, and
        // gpuDynInst is not used.
        panicUnimplemented();
    }

    // Constructor: passes iFmt and "v_subrev_f16" to the Inst_VOP2 base and
    // sets the ALU and F16 flags.
    Inst_VOP2__V_SUBREV_F16::Inst_VOP2__V_SUBREV_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subrev_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_SUBREV_F16

    // Destructor; its body is empty.
    Inst_VOP2__V_SUBREV_F16::~Inst_VOP2__V_SUBREV_F16()
    {
    } // ~Inst_VOP2__V_SUBREV_F16

    // D.f16 = S1.f16 - S0.f16.
    void
    Inst_VOP2__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // No emulation here: the only action is the call below, and
        // gpuDynInst is not used.
        panicUnimplemented();
    }

    // Constructor: passes iFmt and "v_mul_f16" to the Inst_VOP2 base and
    // sets the ALU and F16 flags.
    Inst_VOP2__V_MUL_F16::Inst_VOP2__V_MUL_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_MUL_F16

    // Destructor; its body is empty.
    Inst_VOP2__V_MUL_F16::~Inst_VOP2__V_MUL_F16()
    {
    } // ~Inst_VOP2__V_MUL_F16

    // D.f16 = S0.f16 * S1.f16.
    void
    Inst_VOP2__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // No emulation here: the only action is the call below, and
        // gpuDynInst is not used.
        panicUnimplemented();
    }

    // Constructor: passes iFmt and "v_mac_f16" to the Inst_VOP2 base and
    // sets the ALU, F16 and MAC flags.
    Inst_VOP2__V_MAC_F16::Inst_VOP2__V_MAC_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mac_f16")
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(MAC);
    } // Inst_VOP2__V_MAC_F16

    // Destructor; its body is empty.
    Inst_VOP2__V_MAC_F16::~Inst_VOP2__V_MAC_F16()
    {
    } // ~Inst_VOP2__V_MAC_F16

    // D.f16 = S0.f16 * S1.f16 + D.f16.
    void
    Inst_VOP2__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // No emulation here: the only action is the call below, and
        // gpuDynInst is not used.
        panicUnimplemented();
    }

    // Constructor: passes iFmt and "v_madmk_f16" to the Inst_VOP2 base and
    // sets the ALU, F16 and MAD flags.
    Inst_VOP2__V_MADMK_F16::Inst_VOP2__V_MADMK_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_madmk_f16")
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(MAD);
    } // Inst_VOP2__V_MADMK_F16

    // Destructor; its body is empty.
    Inst_VOP2__V_MADMK_F16::~Inst_VOP2__V_MADMK_F16()
    {
    } // ~Inst_VOP2__V_MADMK_F16

    // D.f16 = S0.f16 * K.f16 + S1.f16; K is a 16-bit inline constant stored
    // in the following literal DWORD.
    // This opcode cannot use the VOP3 encoding and cannot use input/output
    // modifiers.
    void
    Inst_VOP2__V_MADMK_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // No emulation here: the only action is the call below, and
        // gpuDynInst is not used.
        panicUnimplemented();
    }

    // Constructor: passes iFmt and "v_madak_f16" to the Inst_VOP2 base and
    // sets the ALU, F16 and MAD flags.
    Inst_VOP2__V_MADAK_F16::Inst_VOP2__V_MADAK_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_madak_f16")
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(MAD);
    } // Inst_VOP2__V_MADAK_F16

    // Destructor; its body is empty.
    Inst_VOP2__V_MADAK_F16::~Inst_VOP2__V_MADAK_F16()
    {
    } // ~Inst_VOP2__V_MADAK_F16

    // D.f16 = S0.f16 * S1.f16 + K.f16; K is a 16-bit inline constant stored
    // in the following literal DWORD.
    // This opcode cannot use the VOP3 encoding and cannot use input/output
    // modifiers.
    void
    Inst_VOP2__V_MADAK_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // No emulation here: the only action is the call below, and
        // gpuDynInst is not used.
        panicUnimplemented();
    }

    // Constructor: passes iFmt and "v_add_u16" to the Inst_VOP2 base and
    // sets the ALU flag.
    Inst_VOP2__V_ADD_U16::Inst_VOP2__V_ADD_U16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_add_u16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_ADD_U16

    // Destructor; its body is empty.
    Inst_VOP2__V_ADD_U16::~Inst_VOP2__V_ADD_U16()
    {
    } // ~Inst_VOP2__V_ADD_U16

    // D.u16 = S0.u16 + S1.u16.
    void
    Inst_VOP2__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: vdst = src0 + src1 on U16 operands.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wf->execMask(i)) {
                continue;
            }

            const auto lhs = src0[i];
            const auto rhs = src1[i];
            vdst[i] = lhs + rhs;
        }

        vdst.write();
    }

    // Constructor: passes iFmt and "v_sub_u16" to the Inst_VOP2 base and
    // sets the ALU flag.
    Inst_VOP2__V_SUB_U16::Inst_VOP2__V_SUB_U16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_sub_u16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_SUB_U16

    // Destructor; its body is empty.
    Inst_VOP2__V_SUB_U16::~Inst_VOP2__V_SUB_U16()
    {
    } // ~Inst_VOP2__V_SUB_U16

    // D.u16 = S0.u16 - S1.u16.
    void
    Inst_VOP2__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: vdst = src0 - src1 on U16 operands.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wf->execMask(i)) {
                continue;
            }

            const auto minuend = src0[i];
            const auto subtrahend = src1[i];
            vdst[i] = minuend - subtrahend;
        }

        vdst.write();
    }

    // Constructor: passes iFmt and "v_subrev_u16" to the Inst_VOP2 base and
    // sets the ALU flag.
    Inst_VOP2__V_SUBREV_U16::Inst_VOP2__V_SUBREV_U16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subrev_u16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_SUBREV_U16

    // Destructor; its body is empty.
    Inst_VOP2__V_SUBREV_U16::~Inst_VOP2__V_SUBREV_U16()
    {
    } // ~Inst_VOP2__V_SUBREV_U16

    // D.u16 = S1.u16 - S0.u16.
    void
    Inst_VOP2__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: vdst = src1 - src0 (operands reversed) on U16
        // operands.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wf->execMask(i)) {
                continue;
            }

            const auto subtrahend = src0[i];
            const auto minuend = src1[i];
            vdst[i] = minuend - subtrahend;
        }

        vdst.write();
    }

    // Constructor: passes iFmt and "v_mul_lo_u16" to the Inst_VOP2 base and
    // sets the ALU flag.
    Inst_VOP2__V_MUL_LO_U16::Inst_VOP2__V_MUL_LO_U16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_lo_u16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_LO_U16

    // Destructor; its body is empty.
    Inst_VOP2__V_MUL_LO_U16::~Inst_VOP2__V_MUL_LO_U16()
    {
    } // ~Inst_VOP2__V_MUL_LO_U16

    // D.u16 = S0.u16 * S1.u16.
    void
    Inst_VOP2__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: vdst = src0 * src1 on U16 operands; the store
        // into the U16 destination keeps the low half of the product.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Multiply as unsigned: 16-bit operands would otherwise be
                // promoted to signed int, and a product above INT_MAX
                // (e.g. 0xffff * 0xffff) is signed-overflow undefined
                // behaviour.
                vdst[lane] = static_cast<unsigned>(src0[lane]) * src1[lane];
            }
        }

        vdst.write();
    }

    // Constructor: passes iFmt and "v_lshlrev_b16" to the Inst_VOP2 base and
    // sets the ALU flag.
    Inst_VOP2__V_LSHLREV_B16::Inst_VOP2__V_LSHLREV_B16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_lshlrev_b16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_LSHLREV_B16

    // Destructor; its body is empty.
    Inst_VOP2__V_LSHLREV_B16::~Inst_VOP2__V_LSHLREV_B16()
    {
    } // ~Inst_VOP2__V_LSHLREV_B16

    // D.u[15:0] = S1.u[15:0] << S0.u[3:0].
    void
    Inst_VOP2__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: vdst = src1 shifted left by bits [3:0] of src0.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wf->execMask(i)) {
                continue;
            }

            const auto shiftAmount = bits(src0[i], 3, 0);
            vdst[i] = src1[i] << shiftAmount;
        }

        vdst.write();
    }

    // Constructor: passes iFmt and "v_lshrrev_b16" to the Inst_VOP2 base and
    // sets the ALU flag.
    Inst_VOP2__V_LSHRREV_B16::Inst_VOP2__V_LSHRREV_B16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_lshrrev_b16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_LSHRREV_B16

    // Destructor; its body is empty.
    Inst_VOP2__V_LSHRREV_B16::~Inst_VOP2__V_LSHRREV_B16()
    {
    } // ~Inst_VOP2__V_LSHRREV_B16

    // D.u[15:0] = S1.u[15:0] >> S0.u[3:0].
    // The vacated bits are set to zero.
    void
    Inst_VOP2__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: vdst = src1 shifted right (zero fill, since the
        // operand is unsigned) by bits [3:0] of src0.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Only S0[3:0] is the shift count. Without the mask, counts
                // of 16..31 produce 0 and counts of 32 or more are
                // undefined behaviour on the promoted int.
                vdst[lane] = src1[lane] >> (src0[lane] & 0xf);
            }
        }

        vdst.write();
    }

    // Constructor: passes iFmt and "v_ashrrev_i16" to the Inst_VOP2 base and
    // sets the ALU flag.
    Inst_VOP2__V_ASHRREV_I16::Inst_VOP2__V_ASHRREV_I16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_ashrrev_i16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_ASHRREV_I16

    // Destructor; its body is empty.
    Inst_VOP2__V_ASHRREV_I16::~Inst_VOP2__V_ASHRREV_I16()
    {
    } // ~Inst_VOP2__V_ASHRREV_I16

    // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0].
    // The vacated bits are set to the sign bit of the input value.
    void
    Inst_VOP2__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: vdst = signed src1 shifted right by bits [3:0]
        // of src0.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1);
        VecOperandI16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Only S0[3:0] is the shift count. Without the mask, counts
                // of 16..31 shift everything but the sign out and counts of
                // 32 or more are undefined behaviour on the promoted int.
                vdst[lane] = src1[lane] >> (src0[lane] & 0xf);
            }
        }

        vdst.write();
    }

    // Constructor: passes iFmt and "v_max_f16" to the Inst_VOP2 base and
    // sets the ALU and F16 flags.
    Inst_VOP2__V_MAX_F16::Inst_VOP2__V_MAX_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_max_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_MAX_F16

    // Destructor; its body is empty.
    Inst_VOP2__V_MAX_F16::~Inst_VOP2__V_MAX_F16()
    {
    } // ~Inst_VOP2__V_MAX_F16

    // D.f16 = max(S0.f16, S1.f16).
    void
    Inst_VOP2__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // No emulation here: the only action is the call below, and
        // gpuDynInst is not used.
        panicUnimplemented();
    }

    // Constructor: passes iFmt and "v_min_f16" to the Inst_VOP2 base and
    // sets the ALU and F16 flags.
    Inst_VOP2__V_MIN_F16::Inst_VOP2__V_MIN_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_min_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_MIN_F16

    // Destructor; its body is empty.
    Inst_VOP2__V_MIN_F16::~Inst_VOP2__V_MIN_F16()
    {
    } // ~Inst_VOP2__V_MIN_F16

    // D.f16 = min(S0.f16, S1.f16).
    void
    Inst_VOP2__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // No emulation here: the only action is the call below, and
        // gpuDynInst is not used.
        panicUnimplemented();
    }

    // Constructor: passes iFmt and "v_max_u16" to the Inst_VOP2 base and
    // sets the ALU flag.
    Inst_VOP2__V_MAX_U16::Inst_VOP2__V_MAX_U16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_max_u16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MAX_U16

    // Destructor; its body is empty.
    Inst_VOP2__V_MAX_U16::~Inst_VOP2__V_MAX_U16()
    {
    } // ~Inst_VOP2__V_MAX_U16

    // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]).
    void
    Inst_VOP2__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: vdst = the larger of src0 and src1, compared as
        // U16 values.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wf->execMask(i)) {
                continue;
            }

            const auto a = src0[i];
            const auto b = src1[i];
            vdst[i] = (a < b) ? b : a;
        }

        vdst.write();
    }

    // Constructor: passes iFmt and "v_max_i16" to the Inst_VOP2 base and
    // sets the ALU flag.
    Inst_VOP2__V_MAX_I16::Inst_VOP2__V_MAX_I16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_max_i16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MAX_I16

    // Destructor; its body is empty.
    Inst_VOP2__V_MAX_I16::~Inst_VOP2__V_MAX_I16()
    {
    } // ~Inst_VOP2__V_MAX_I16

    // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]).
    void
    Inst_VOP2__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: vdst = the larger of src0 and src1, compared as
        // I16 values.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1);
        VecOperandI16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wf->execMask(i)) {
                continue;
            }

            const auto a = src0[i];
            const auto b = src1[i];
            vdst[i] = (a < b) ? b : a;
        }

        vdst.write();
    }

    // Constructor: passes iFmt and "v_min_u16" to the Inst_VOP2 base and
    // sets the ALU flag.
    Inst_VOP2__V_MIN_U16::Inst_VOP2__V_MIN_U16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_min_u16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MIN_U16

    // Destructor; its body is empty.
    Inst_VOP2__V_MIN_U16::~Inst_VOP2__V_MIN_U16()
    {
    } // ~Inst_VOP2__V_MIN_U16

    // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]).
    void
    Inst_VOP2__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: vdst = the smaller of src0 and src1, compared
        // as U16 values.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wf->execMask(i)) {
                continue;
            }

            const auto a = src0[i];
            const auto b = src1[i];
            vdst[i] = (b < a) ? b : a;
        }

        vdst.write();
    }

    // Constructor: passes iFmt and "v_min_i16" to the Inst_VOP2 base and
    // sets the ALU flag.
    Inst_VOP2__V_MIN_I16::Inst_VOP2__V_MIN_I16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_min_i16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MIN_I16

    // Destructor; its body is empty.
    Inst_VOP2__V_MIN_I16::~Inst_VOP2__V_MIN_I16()
    {
    } // ~Inst_VOP2__V_MIN_I16

    // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]).
    void
    Inst_VOP2__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: vdst = the smaller of src0 and src1, compared
        // as I16 values.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1);
        VecOperandI16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wf->execMask(i)) {
                continue;
            }

            const auto a = src0[i];
            const auto b = src1[i];
            vdst[i] = (b < a) ? b : a;
        }

        vdst.write();
    }

    // Constructor: passes iFmt and "v_ldexp_f16" to the Inst_VOP2 base and
    // sets the ALU and F16 flags.
    Inst_VOP2__V_LDEXP_F16::Inst_VOP2__V_LDEXP_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_ldexp_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_LDEXP_F16

    // Destructor; its body is empty.
    Inst_VOP2__V_LDEXP_F16::~Inst_VOP2__V_LDEXP_F16()
    {
    } // ~Inst_VOP2__V_LDEXP_F16

    // D.f16 = S0.f16 * (2 ** S1.i16).
    void
    Inst_VOP2__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // No emulation here: the only action is the call below, and
        // gpuDynInst is not used.
        panicUnimplemented();
    }

    // Constructor: passes iFmt and "v_nop" to the Inst_VOP1 base and sets
    // the Nop and ALU flags.
    Inst_VOP1__V_NOP::Inst_VOP1__V_NOP(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_nop")
    {
        setFlag(Nop);
        setFlag(ALU);
    } // Inst_VOP1__V_NOP

    // Destructor; its body is empty.
    Inst_VOP1__V_NOP::~Inst_VOP1__V_NOP()
    {
    } // ~Inst_VOP1__V_NOP

    // Do nothing.
    void
    Inst_VOP1__V_NOP::execute(GPUDynInstPtr gpuDynInst)
    {
        // Empty body: no operand is read or written.
    }

    // Constructor: passes iFmt and "v_mov_b32" to the Inst_VOP1 base and
    // sets the ALU flag.
    Inst_VOP1__V_MOV_B32::Inst_VOP1__V_MOV_B32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_mov_b32")
    {
        setFlag(ALU);
    } // Inst_VOP1__V_MOV_B32

    // Destructor; its body is empty.
    Inst_VOP1__V_MOV_B32::~Inst_VOP1__V_MOV_B32()
    {
    } // ~Inst_VOP1__V_MOV_B32

    // D.u = S0.u.
    // Input and output modifiers not supported; this is an untyped operation.
    void
    Inst_VOP1__V_MOV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Copies the source into vdst for every lane whose exec-mask bit is
        // set. For DPP encodings the source is read from the DPP SRC0
        // field and passed through processDPP() before the copy.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (isDPPInst()) {
            VecOperandU32 src_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src_dpp.read();

            // "%#x" already prints the 0x prefix, so the format must not
            // add a literal "0x" in front of it.
            DPRINTF(GCN3, "Handling V_MOV_B32 SRC DPP. SRC0: register v[%d], "
                    "DPP_CTRL: %#x, SRC0_ABS: %d, SRC0_NEG: %d, "
                    "SRC1_ABS: %d, SRC1_NEG: %d, BOUND_CTRL: %d, "
                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
                    extData.iFmt_VOP_DPP.DPP_CTRL,
                    extData.iFmt_VOP_DPP.SRC0_ABS,
                    extData.iFmt_VOP_DPP.SRC0_NEG,
                    extData.iFmt_VOP_DPP.SRC1_ABS,
                    extData.iFmt_VOP_DPP.SRC1_NEG,
                    extData.iFmt_VOP_DPP.BOUND_CTRL,
                    extData.iFmt_VOP_DPP.BANK_MASK,
                    extData.iFmt_VOP_DPP.ROW_MASK);

            // NOTE: For VOP1, there is no SRC1, so make sure we're not trying
            // to negate it or take the absolute value of it
            assert(!extData.iFmt_VOP_DPP.SRC1_ABS);
            assert(!extData.iFmt_VOP_DPP.SRC1_NEG);
            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src_dpp);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src_dpp[lane];
                }
            }
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src[lane];
                }
            }
        }

        vdst.write();
    }

    // Constructor: passes iFmt and "v_readfirstlane_b32" to the Inst_VOP1
    // base and sets the ALU flag.
    Inst_VOP1__V_READFIRSTLANE_B32::Inst_VOP1__V_READFIRSTLANE_B32(
          InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_readfirstlane_b32")
    {
        setFlag(ALU);
    } // Inst_VOP1__V_READFIRSTLANE_B32

    // Destructor; its body is empty.
    Inst_VOP1__V_READFIRSTLANE_B32::~Inst_VOP1__V_READFIRSTLANE_B32()
    {
    } // ~Inst_VOP1__V_READFIRSTLANE_B32

    // Copy one VGPR value to one SGPR. D = SGPR destination, S0 = source data
    // (VGPR# or M0 for lds direct access), Lane# = FindFirst1fromLSB(exec)
    // (Lane# = 0 if exec is zero). Ignores exec mask for the access.
    // Input and output modifiers not supported; this is an untyped operation.
    void
    Inst_VOP1__V_READFIRSTLANE_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Writes one source element to the scalar destination: the element
        // of the lowest lane set in the exec mask, or lane 0 when the mask
        // is zero.
        Wavefront *wf = gpuDynInst->wavefront();
        const ScalarRegU64 exec_mask = wf->execMask().to_ullong();
        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
        ScalarOperandU32 sdst(gpuDynInst, instData.VDST);

        src.readSrc();

        const ScalarRegI32 src_lane
            = exec_mask ? findLsbSet(exec_mask) : 0;

        sdst = src[src_lane];
        sdst.write();
    }

    // Constructor: passes iFmt and "v_cvt_i32_f64" to the Inst_VOP1 base and
    // sets the ALU and F64 flags.
    Inst_VOP1__V_CVT_I32_F64::Inst_VOP1__V_CVT_I32_F64(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_i32_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_CVT_I32_F64

    // Destructor; its body is empty.
    Inst_VOP1__V_CVT_I32_F64::~Inst_VOP1__V_CVT_I32_F64()
    {
    } // ~Inst_VOP1__V_CVT_I32_F64

    // D.i = (int)S0.d.
    // Out-of-range floating point values (including infinity) saturate. NaN
    // is converted to 0.
    void
    Inst_VOP1__V_CVT_I32_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: converts the F64 source to I32 by truncation.
        // NaN gives 0; values at or beyond the I32 range (including
        // infinities) saturate to INT_MAX / INT_MIN.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Compare against the exact range bounds. A frexp exponent
                // test of "exp > 30" also catches [2^30, 2^31), which is
                // representable, because frexp reports 2^30 as 0.5 * 2^31.
                if (std::isnan(src[lane])) {
                    vdst[lane] = 0;
                } else if (src[lane] >= 2147483648.0) {
                    // 2^31 and above, including +inf.
                    vdst[lane] = INT_MAX;
                } else if (src[lane] <= -2147483648.0) {
                    // -2^31 and below, including -inf.
                    vdst[lane] = INT_MIN;
                } else {
                    vdst[lane] = (VecElemI32)src[lane];
                }
            }
        }

        vdst.write();
    }

    // Constructor: passes iFmt and "v_cvt_f64_i32" to the Inst_VOP1 base and
    // sets the ALU and F64 flags.
    Inst_VOP1__V_CVT_F64_I32::Inst_VOP1__V_CVT_F64_I32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f64_i32")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_CVT_F64_I32

    // Destructor; its body is empty.
    Inst_VOP1__V_CVT_F64_I32::~Inst_VOP1__V_CVT_F64_I32()
    {
    } // ~Inst_VOP1__V_CVT_F64_I32

    // D.d = (double)S0.i.
    void
    Inst_VOP1__V_CVT_F64_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: converts the I32 source element to F64.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src(gpuDynInst, instData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wf->execMask(i)) {
                continue;
            }

            vdst[i] = static_cast<VecElemF64>(src[i]);
        }

        vdst.write();
    }

    // Constructor: passes iFmt and "v_cvt_f32_i32" to the Inst_VOP1 base and
    // sets the ALU and F32 flags.
    Inst_VOP1__V_CVT_F32_I32::Inst_VOP1__V_CVT_F32_I32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f32_i32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_F32_I32

    // Destructor; its body is empty.
    Inst_VOP1__V_CVT_F32_I32::~Inst_VOP1__V_CVT_F32_I32()
    {
    } // ~Inst_VOP1__V_CVT_F32_I32

    // D.f = (float)S0.i.
    void
    Inst_VOP1__V_CVT_F32_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: converts the I32 source element to F32.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src(gpuDynInst, instData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wf->execMask(i)) {
                continue;
            }

            vdst[i] = static_cast<VecElemF32>(src[i]);
        }

        vdst.write();
    }

    // Constructor: passes iFmt and "v_cvt_f32_u32" to the Inst_VOP1 base and
    // sets the ALU and F32 flags.
    Inst_VOP1__V_CVT_F32_U32::Inst_VOP1__V_CVT_F32_U32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f32_u32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_F32_U32

    // Destructor; its body is empty.
    Inst_VOP1__V_CVT_F32_U32::~Inst_VOP1__V_CVT_F32_U32()
    {
    } // ~Inst_VOP1__V_CVT_F32_U32

    // D.f = (float)S0.u.
    void
    Inst_VOP1__V_CVT_F32_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: converts the U32 source element to F32.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wf->execMask(i)) {
                continue;
            }

            vdst[i] = static_cast<VecElemF32>(src[i]);
        }

        vdst.write();
    }

    // Constructor: passes iFmt and "v_cvt_u32_f32" to the Inst_VOP1 base and
    // sets the ALU and F32 flags.
    Inst_VOP1__V_CVT_U32_F32::Inst_VOP1__V_CVT_U32_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_u32_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_U32_F32

    // Destructor; its body is empty.
    Inst_VOP1__V_CVT_U32_F32::~Inst_VOP1__V_CVT_U32_F32()
    {
    } // ~Inst_VOP1__V_CVT_U32_F32

    // D.u = (unsigned)S0.f.
    // Out-of-range floating point values (including infinity) saturate. NaN
    // is converted to 0.
    void
    Inst_VOP1__V_CVT_U32_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: converts the F32 source to U32 by truncation.
        // NaN gives 0; values below the U32 range saturate to 0 and values
        // at or above 2^32 (including +inf) saturate to UINT_MAX.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Compare against the exact range bounds. A frexp exponent
                // test of "exp > 31" also catches [2^31, 2^32), which is
                // representable, and sends large negative values to
                // UINT_MAX. Casting a negative float to unsigned is
                // undefined behaviour, so negatives are handled explicitly.
                if (std::isnan(src[lane])) {
                    vdst[lane] = 0;
                } else if (src[lane] <= 0.0f) {
                    // Zero, negative values and -inf.
                    vdst[lane] = 0;
                } else if (src[lane] >= 4294967296.0f) {
                    // 2^32 and above, including +inf.
                    vdst[lane] = UINT_MAX;
                } else {
                    vdst[lane] = (VecElemU32)src[lane];
                }
            }
        }

        vdst.write();
    }

    // Constructor: passes iFmt and "v_cvt_i32_f32" to the Inst_VOP1 base and
    // sets the ALU and F32 flags.
    Inst_VOP1__V_CVT_I32_F32::Inst_VOP1__V_CVT_I32_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_i32_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_I32_F32

    // Destructor; its body is empty.
    Inst_VOP1__V_CVT_I32_F32::~Inst_VOP1__V_CVT_I32_F32()
    {
    } // ~Inst_VOP1__V_CVT_I32_F32

    // D.i = (int)S0.f.
    // Out-of-range floating point values (including infinity) saturate. NaN
    // is converted to 0.
    void
    Inst_VOP1__V_CVT_I32_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per enabled lane: converts the F32 source to I32 by truncation.
        // NaN gives 0; values at or beyond the I32 range (including
        // infinities) saturate to INT_MAX / INT_MIN.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Compare against the exact range bounds. A frexp exponent
                // test of "exp > 30" also catches [2^30, 2^31), which is
                // representable, because frexp reports 2^30 as 0.5 * 2^31.
                if (std::isnan(src[lane])) {
                    vdst[lane] = 0;
                } else if (src[lane] >= 2147483648.0f) {
                    // 2^31 and above, including +inf.
                    vdst[lane] = INT_MAX;
                } else if (src[lane] <= -2147483648.0f) {
                    // -2^31 and below, including -inf.
                    vdst[lane] = INT_MIN;
                } else {
                    vdst[lane] = (VecElemI32)src[lane];
                }
            }
        }

        vdst.write();
    }

    // Hands the VOP1 encoding and the "v_mov_fed_b32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU flag.
    Inst_VOP1__V_MOV_FED_B32::Inst_VOP1__V_MOV_FED_B32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_mov_fed_b32")
    {
        setFlag(ALU);
    } // Inst_VOP1__V_MOV_FED_B32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_MOV_FED_B32::~Inst_VOP1__V_MOV_FED_B32()
    {
    } // ~Inst_VOP1__V_MOV_FED_B32

    // D.u = S0.u;
    // Input and output modifiers not supported; this is an untyped operation.
    //
    // Not modelled: executing this instruction only calls
    // panicUnimplemented().
    void
    Inst_VOP1__V_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands the VOP1 encoding and the "v_cvt_f16_f32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_CVT_F16_F32::Inst_VOP1__V_CVT_F16_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f16_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_F16_F32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_CVT_F16_F32::~Inst_VOP1__V_CVT_F16_F32()
    {
    } // ~Inst_VOP1__V_CVT_F16_F32

    // D.f16 = flt32_to_flt16(S0.f).
    //
    // Not modelled: executing this instruction only calls
    // panicUnimplemented().
    void
    Inst_VOP1__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands the VOP1 encoding and the "v_cvt_f32_f16" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_CVT_F32_F16::Inst_VOP1__V_CVT_F32_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f32_f16")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_F32_F16

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_CVT_F32_F16::~Inst_VOP1__V_CVT_F32_F16()
    {
    } // ~Inst_VOP1__V_CVT_F32_F16

    // D.f = flt16_to_flt32(S0.f16).
    //
    // Not modelled: executing this instruction only calls
    // panicUnimplemented().
    void
    Inst_VOP1__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands the VOP1 encoding and the "v_cvt_rpi_i32_f32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_CVT_RPI_I32_F32::Inst_VOP1__V_CVT_RPI_I32_F32(
          InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_rpi_i32_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_RPI_I32_F32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_CVT_RPI_I32_F32::~Inst_VOP1__V_CVT_RPI_I32_F32()
    {
    } // ~Inst_VOP1__V_CVT_RPI_I32_F32

    // D.i = (int)floor(S0.f + 0.5).
    // Processes only the lanes enabled in the wavefront's exec mask.
    // NOTE(review): NaN and out-of-range inputs reach the integer cast
    // unclamped -- confirm whether saturation is expected here.
    void
    Inst_VOP1__V_CVT_RPI_I32_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 in(gpuDynInst, instData.SRC0);
        VecOperandI32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = static_cast<VecElemI32>(std::floor(in[ln] + 0.5));
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_cvt_flr_i32_f32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_CVT_FLR_I32_F32::Inst_VOP1__V_CVT_FLR_I32_F32(
          InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_flr_i32_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_FLR_I32_F32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_CVT_FLR_I32_F32::~Inst_VOP1__V_CVT_FLR_I32_F32()
    {
    } // ~Inst_VOP1__V_CVT_FLR_I32_F32

    // D.i = (int)floor(S0.f).
    // Processes only the lanes enabled in the wavefront's exec mask.
    // NOTE(review): NaN and out-of-range inputs reach the integer cast
    // unclamped -- confirm whether saturation is expected here.
    void
    Inst_VOP1__V_CVT_FLR_I32_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 in(gpuDynInst, instData.SRC0);
        VecOperandI32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = static_cast<VecElemI32>(std::floor(in[ln]));
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_cvt_off_f32_i4" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_CVT_OFF_F32_I4::Inst_VOP1__V_CVT_OFF_F32_I4(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_off_f32_i4")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_OFF_F32_I4

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_CVT_OFF_F32_I4::~Inst_VOP1__V_CVT_OFF_F32_I4()
    {
    } // ~Inst_VOP1__V_CVT_OFF_F32_I4

    // 4-bit signed int to 32-bit float.
    //
    // Not modelled: executing this instruction only calls
    // panicUnimplemented().
    void
    Inst_VOP1__V_CVT_OFF_F32_I4::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands the VOP1 encoding and the "v_cvt_f32_f64" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F64 flags.
    Inst_VOP1__V_CVT_F32_F64::Inst_VOP1__V_CVT_F32_F64(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f32_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_CVT_F32_F64

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_CVT_F32_F64::~Inst_VOP1__V_CVT_F32_F64()
    {
    } // ~Inst_VOP1__V_CVT_F32_F64

    // D.f = (float)S0.d.
    // Casts the F64 source to F32 for each lane enabled in the wavefront's
    // exec mask.
    void
    Inst_VOP1__V_CVT_F32_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 in(gpuDynInst, instData.SRC0);
        VecOperandF32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = static_cast<VecElemF32>(in[ln]);
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_cvt_f64_f32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F64 flags.
    Inst_VOP1__V_CVT_F64_F32::Inst_VOP1__V_CVT_F64_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f64_f32")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_CVT_F64_F32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_CVT_F64_F32::~Inst_VOP1__V_CVT_F64_F32()
    {
    } // ~Inst_VOP1__V_CVT_F64_F32

    // D.d = (double)S0.f.
    // Casts the F32 source to F64 for each lane enabled in the wavefront's
    // exec mask.
    void
    Inst_VOP1__V_CVT_F64_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 in(gpuDynInst, instData.SRC0);
        VecOperandF64 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = static_cast<VecElemF64>(in[ln]);
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_cvt_f32_ubyte0" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_CVT_F32_UBYTE0::Inst_VOP1__V_CVT_F32_UBYTE0(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f32_ubyte0")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_F32_UBYTE0

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_CVT_F32_UBYTE0::~Inst_VOP1__V_CVT_F32_UBYTE0()
    {
    } // ~Inst_VOP1__V_CVT_F32_UBYTE0

    // D.f = (float)(S0.u[7:0]).
    // For each lane enabled in the exec mask, extracts bits 7..0 of the U32
    // source with bits() and casts the field to F32.
    void
    Inst_VOP1__V_CVT_F32_UBYTE0::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 in(gpuDynInst, instData.SRC0);
        VecOperandF32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = static_cast<VecElemF32>(bits(in[ln], 7, 0));
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_cvt_f32_ubyte1" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_CVT_F32_UBYTE1::Inst_VOP1__V_CVT_F32_UBYTE1(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f32_ubyte1")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_F32_UBYTE1

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_CVT_F32_UBYTE1::~Inst_VOP1__V_CVT_F32_UBYTE1()
    {
    } // ~Inst_VOP1__V_CVT_F32_UBYTE1

    // D.f = (float)(S0.u[15:8]).
    // For each lane enabled in the exec mask, extracts bits 15..8 of the U32
    // source with bits() and casts the field to F32.
    void
    Inst_VOP1__V_CVT_F32_UBYTE1::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 in(gpuDynInst, instData.SRC0);
        VecOperandF32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = static_cast<VecElemF32>(bits(in[ln], 15, 8));
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_cvt_f32_ubyte2" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_CVT_F32_UBYTE2::Inst_VOP1__V_CVT_F32_UBYTE2(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f32_ubyte2")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_F32_UBYTE2

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_CVT_F32_UBYTE2::~Inst_VOP1__V_CVT_F32_UBYTE2()
    {
    } // ~Inst_VOP1__V_CVT_F32_UBYTE2

    // D.f = (float)(S0.u[23:16]).
    // For each lane enabled in the exec mask, extracts bits 23..16 of the
    // U32 source with bits() and casts the field to F32.
    void
    Inst_VOP1__V_CVT_F32_UBYTE2::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 in(gpuDynInst, instData.SRC0);
        VecOperandF32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = static_cast<VecElemF32>(bits(in[ln], 23, 16));
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_cvt_f32_ubyte3" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_CVT_F32_UBYTE3::Inst_VOP1__V_CVT_F32_UBYTE3(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f32_ubyte3")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_F32_UBYTE3

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_CVT_F32_UBYTE3::~Inst_VOP1__V_CVT_F32_UBYTE3()
    {
    } // ~Inst_VOP1__V_CVT_F32_UBYTE3

    // D.f = (float)(S0.u[31:24]).
    // For each lane enabled in the exec mask, extracts bits 31..24 of the
    // U32 source with bits() and casts the field to F32.
    void
    Inst_VOP1__V_CVT_F32_UBYTE3::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 in(gpuDynInst, instData.SRC0);
        VecOperandF32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = static_cast<VecElemF32>(bits(in[ln], 31, 24));
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_cvt_u32_f64" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F64 flags.
    Inst_VOP1__V_CVT_U32_F64::Inst_VOP1__V_CVT_U32_F64(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_u32_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_CVT_U32_F64

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_CVT_U32_F64::~Inst_VOP1__V_CVT_U32_F64()
    {
    } // ~Inst_VOP1__V_CVT_U32_F64

    // D.u = (unsigned)S0.d.
    // Out-of-range floating point values (including infinity) saturate. NaN
    // is converted to 0.
    //
    // For every lane enabled in the exec mask the F64 source is truncated
    // toward zero and clamped to [0, UINT_MAX]. The range checks compare the
    // value itself instead of its frexp() exponent: an "exp > 31" test also
    // fires for [2^31, 2^32), which is representable, and it ignores the
    // sign, so large negative inputs would saturate high. Clamping first
    // also keeps out-of-range values away from the C++ cast, whose result
    // is undefined for them.
    void
    Inst_VOP1__V_CVT_U32_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                const VecElemF64 val = src[lane];
                if (std::isnan(val) || val <= 0.0) {
                    // NaN, zero and every negative input (including
                    // -infinity) map to the bottom of the unsigned range.
                    vdst[lane] = 0;
                } else if (val >= 4294967296.0) {
                    // 2^32 and above (including +infinity) saturate high.
                    vdst[lane] = UINT_MAX;
                } else {
                    vdst[lane] = (VecElemU32)val;
                }
            }
        }

        vdst.write();
    }

    // Hands the VOP1 encoding and the "v_cvt_f64_u32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F64 flags.
    Inst_VOP1__V_CVT_F64_U32::Inst_VOP1__V_CVT_F64_U32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f64_u32")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_CVT_F64_U32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_CVT_F64_U32::~Inst_VOP1__V_CVT_F64_U32()
    {
    } // ~Inst_VOP1__V_CVT_F64_U32

    // D.d = (double)S0.u.
    // Casts the U32 source to F64 for each lane enabled in the wavefront's
    // exec mask.
    void
    Inst_VOP1__V_CVT_F64_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 in(gpuDynInst, instData.SRC0);
        VecOperandF64 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = static_cast<VecElemF64>(in[ln]);
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_trunc_f64" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F64 flags.
    Inst_VOP1__V_TRUNC_F64::Inst_VOP1__V_TRUNC_F64(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_trunc_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_TRUNC_F64

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_TRUNC_F64::~Inst_VOP1__V_TRUNC_F64()
    {
    } // ~Inst_VOP1__V_TRUNC_F64

    // D.d = trunc(S0.d), return integer part of S0.d.
    // Applies std::trunc to each lane enabled in the wavefront's exec mask.
    void
    Inst_VOP1__V_TRUNC_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 in(gpuDynInst, instData.SRC0);
        VecOperandF64 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = std::trunc(in[ln]);
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_ceil_f64" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F64 flags.
    Inst_VOP1__V_CEIL_F64::Inst_VOP1__V_CEIL_F64(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_ceil_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_CEIL_F64

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_CEIL_F64::~Inst_VOP1__V_CEIL_F64()
    {
    } // ~Inst_VOP1__V_CEIL_F64

    // D.d = ceil(S0.d);
    // Applies std::ceil to each lane enabled in the wavefront's exec mask.
    void
    Inst_VOP1__V_CEIL_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 in(gpuDynInst, instData.SRC0);
        VecOperandF64 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = std::ceil(in[ln]);
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_rndne_f64" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F64 flags.
    Inst_VOP1__V_RNDNE_F64::Inst_VOP1__V_RNDNE_F64(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_rndne_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_RNDNE_F64

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_RNDNE_F64::~Inst_VOP1__V_RNDNE_F64()
    {
    } // ~Inst_VOP1__V_RNDNE_F64

    // D.d = round_nearest_even(S0.d).
    // Passes each lane enabled in the exec mask through roundNearestEven().
    void
    Inst_VOP1__V_RNDNE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 in(gpuDynInst, instData.SRC0);
        VecOperandF64 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = roundNearestEven(in[ln]);
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_floor_f64" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F64 flags.
    Inst_VOP1__V_FLOOR_F64::Inst_VOP1__V_FLOOR_F64(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_floor_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_FLOOR_F64

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_FLOOR_F64::~Inst_VOP1__V_FLOOR_F64()
    {
    } // ~Inst_VOP1__V_FLOOR_F64

    // D.d = floor(S0.d);
    // Applies std::floor to each lane enabled in the wavefront's exec mask.
    void
    Inst_VOP1__V_FLOOR_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 in(gpuDynInst, instData.SRC0);
        VecOperandF64 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = std::floor(in[ln]);
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_fract_f32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_FRACT_F32::Inst_VOP1__V_FRACT_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_fract_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_FRACT_F32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_FRACT_F32::~Inst_VOP1__V_FRACT_F32()
    {
    } // ~Inst_VOP1__V_FRACT_F32

    // D.f = modf(S0.f).
    // Stores the fractional part returned by std::modf for each lane enabled
    // in the exec mask; the integral part is discarded.
    // NOTE(review): std::modf keeps the sign of the input, so negative
    // sources yield negative fractions -- confirm against the ISA
    // definition of this opcode.
    void
    Inst_VOP1__V_FRACT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 in(gpuDynInst, instData.SRC0);
        VecOperandF32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            // Required by std::modf's signature; its value is not used.
            VecElemF32 whole(0.0);
            out[ln] = std::modf(in[ln], &whole);
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_trunc_f32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_TRUNC_F32::Inst_VOP1__V_TRUNC_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_trunc_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_TRUNC_F32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_TRUNC_F32::~Inst_VOP1__V_TRUNC_F32()
    {
    } // ~Inst_VOP1__V_TRUNC_F32

    // D.f = trunc(S0.f), return integer part of S0.f.
    // Applies std::trunc to each lane enabled in the wavefront's exec mask.
    void
    Inst_VOP1__V_TRUNC_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 in(gpuDynInst, instData.SRC0);
        VecOperandF32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = std::trunc(in[ln]);
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_ceil_f32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_CEIL_F32::Inst_VOP1__V_CEIL_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_ceil_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CEIL_F32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_CEIL_F32::~Inst_VOP1__V_CEIL_F32()
    {
    } // ~Inst_VOP1__V_CEIL_F32

    // D.f = ceil(S0.f);
    // Applies std::ceil to each lane enabled in the wavefront's exec mask.
    void
    Inst_VOP1__V_CEIL_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 in(gpuDynInst, instData.SRC0);
        VecOperandF32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = std::ceil(in[ln]);
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_rndne_f32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_RNDNE_F32::Inst_VOP1__V_RNDNE_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_rndne_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_RNDNE_F32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_RNDNE_F32::~Inst_VOP1__V_RNDNE_F32()
    {
    } // ~Inst_VOP1__V_RNDNE_F32

    // D.f = round_nearest_even(S0.f).
    // Passes each lane enabled in the exec mask through roundNearestEven().
    void
    Inst_VOP1__V_RNDNE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 in(gpuDynInst, instData.SRC0);
        VecOperandF32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = roundNearestEven(in[ln]);
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_floor_f32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_FLOOR_F32::Inst_VOP1__V_FLOOR_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_floor_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_FLOOR_F32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_FLOOR_F32::~Inst_VOP1__V_FLOOR_F32()
    {
    } // ~Inst_VOP1__V_FLOOR_F32

    // D.f = floor(S0.f);
    // Applies std::floor to each lane enabled in the wavefront's exec mask.
    void
    Inst_VOP1__V_FLOOR_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 in(gpuDynInst, instData.SRC0);
        VecOperandF32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = std::floor(in[ln]);
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_exp_f32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_EXP_F32::Inst_VOP1__V_EXP_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_exp_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_EXP_F32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_EXP_F32::~Inst_VOP1__V_EXP_F32()
    {
    } // ~Inst_VOP1__V_EXP_F32

    // D.f = pow(2.0, S0.f).
    // Raises 2.0 to the source value with std::pow for each lane enabled in
    // the wavefront's exec mask.
    void
    Inst_VOP1__V_EXP_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 in(gpuDynInst, instData.SRC0);
        VecOperandF32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = std::pow(2.0, in[ln]);
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_log_f32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_LOG_F32::Inst_VOP1__V_LOG_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_log_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_LOG_F32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_LOG_F32::~Inst_VOP1__V_LOG_F32()
    {
    } // ~Inst_VOP1__V_LOG_F32

    // D.f = log2(S0.f).
    // Applies std::log2 to each lane enabled in the wavefront's exec mask.
    void
    Inst_VOP1__V_LOG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 in(gpuDynInst, instData.SRC0);
        VecOperandF32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = std::log2(in[ln]);
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_rcp_f32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_RCP_F32::Inst_VOP1__V_RCP_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_rcp_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_RCP_F32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_RCP_F32::~Inst_VOP1__V_RCP_F32()
    {
    } // ~Inst_VOP1__V_RCP_F32

    // D.f = 1.0 / S0.f.
    // Computes the reciprocal for each lane enabled in the wavefront's exec
    // mask; no special-case handling is applied before the division.
    void
    Inst_VOP1__V_RCP_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 in(gpuDynInst, instData.SRC0);
        VecOperandF32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = 1.0 / in[ln];
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_rcp_iflag_f32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_RCP_IFLAG_F32::Inst_VOP1__V_RCP_IFLAG_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_rcp_iflag_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_RCP_IFLAG_F32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_RCP_IFLAG_F32::~Inst_VOP1__V_RCP_IFLAG_F32()
    {
    } // ~Inst_VOP1__V_RCP_IFLAG_F32

    // D.f = 1.0 / S0.f.
    // Computes the reciprocal for each lane enabled in the wavefront's exec
    // mask; this body performs the plain division only.
    void
    Inst_VOP1__V_RCP_IFLAG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 in(gpuDynInst, instData.SRC0);
        VecOperandF32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = 1.0 / in[ln];
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_rsq_f32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_RSQ_F32::Inst_VOP1__V_RSQ_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_rsq_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_RSQ_F32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_RSQ_F32::~Inst_VOP1__V_RSQ_F32()
    {
    } // ~Inst_VOP1__V_RSQ_F32

    // D.f = 1.0 / sqrt(S0.f).
    // Computes the reciprocal square root for each lane enabled in the
    // wavefront's exec mask; no special cases are filtered beforehand.
    void
    Inst_VOP1__V_RSQ_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 in(gpuDynInst, instData.SRC0);
        VecOperandF32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = 1.0 / std::sqrt(in[ln]);
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_rcp_f64" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F64 flags.
    Inst_VOP1__V_RCP_F64::Inst_VOP1__V_RCP_F64(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_rcp_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_RCP_F64

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_RCP_F64::~Inst_VOP1__V_RCP_F64()
    {
    } // ~Inst_VOP1__V_RCP_F64

    // D.d = 1.0 / S0.d.
    // Per enabled lane: zero gives +INFINITY, NaN gives NAN, an infinity
    // gives a zero carrying the input's sign, and every other value is
    // divided into 1.0.
    // NOTE(review): both +0.0 and -0.0 produce +INFINITY here -- confirm
    // the sign of a zero input is meant to be ignored.
    void
    Inst_VOP1__V_RCP_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 in(gpuDynInst, instData.SRC0);
        VecOperandF64 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            switch (std::fpclassify(in[ln])) {
              case FP_ZERO:
                out[ln] = +INFINITY;
                break;
              case FP_NAN:
                out[ln] = NAN;
                break;
              case FP_INFINITE:
                if (std::signbit(in[ln])) {
                    out[ln] = -0.0;
                } else {
                    out[ln] = 0.0;
                }
                break;
              default:
                out[ln] = 1.0 / in[ln];
                break;
            }
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_rsq_f64" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F64 flags.
    Inst_VOP1__V_RSQ_F64::Inst_VOP1__V_RSQ_F64(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_rsq_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_RSQ_F64

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_RSQ_F64::~Inst_VOP1__V_RSQ_F64()
    {
    } // ~Inst_VOP1__V_RSQ_F64

    // D.d = 1.0 / sqrt(S0.d).
    // Per enabled lane: zero gives +INFINITY, NaN gives NAN, any other
    // input with the sign bit set (including -infinity) gives NAN,
    // +infinity gives 0.0, and the rest go through 1.0 / std::sqrt.
    // NOTE(review): -0.0 is classified as zero first and so yields
    // +INFINITY -- confirm that is the intended result.
    void
    Inst_VOP1__V_RSQ_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 in(gpuDynInst, instData.SRC0);
        VecOperandF64 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            switch (std::fpclassify(in[ln])) {
              case FP_ZERO:
                out[ln] = +INFINITY;
                break;
              case FP_NAN:
                out[ln] = NAN;
                break;
              default:
                if (std::signbit(in[ln])) {
                    // Negative finite values and -infinity.
                    out[ln] = NAN;
                } else if (std::isinf(in[ln])) {
                    out[ln] = 0.0;
                } else {
                    out[ln] = 1.0 / std::sqrt(in[ln]);
                }
                break;
            }
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_sqrt_f32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_SQRT_F32::Inst_VOP1__V_SQRT_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_sqrt_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_SQRT_F32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_SQRT_F32::~Inst_VOP1__V_SQRT_F32()
    {
    } // ~Inst_VOP1__V_SQRT_F32

    // D.f = sqrt(S0.f).
    // Applies std::sqrt to each lane enabled in the wavefront's exec mask.
    void
    Inst_VOP1__V_SQRT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 in(gpuDynInst, instData.SRC0);
        VecOperandF32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = std::sqrt(in[ln]);
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_sqrt_f64" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F64 flags.
    Inst_VOP1__V_SQRT_F64::Inst_VOP1__V_SQRT_F64(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_sqrt_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_SQRT_F64

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_SQRT_F64::~Inst_VOP1__V_SQRT_F64()
    {
    } // ~Inst_VOP1__V_SQRT_F64

    // D.d = sqrt(S0.d).
    // Applies std::sqrt to each lane enabled in the wavefront's exec mask.
    void
    Inst_VOP1__V_SQRT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 in(gpuDynInst, instData.SRC0);
        VecOperandF64 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = std::sqrt(in[ln]);
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_sin_f32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_SIN_F32::Inst_VOP1__V_SIN_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_sin_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_SIN_F32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_SIN_F32::~Inst_VOP1__V_SIN_F32()
    {
    } // ~Inst_VOP1__V_SIN_F32

    // D.f = sin(S0.f * 2 * PI).
    // Per enabled lane: sources below -256.0 or above 256.0 produce 0.0;
    // all others are scaled by 2.0 and the REG_PI operand value, then
    // passed to std::sin.
    // NOTE(review): the formula relies on REG_PI reading back as PI --
    // confirm against the operand's special-value definition.
    void
    Inst_VOP1__V_SIN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 in(gpuDynInst, instData.SRC0);
        ConstScalarOperandF32 piConst(gpuDynInst, REG_PI);
        VecOperandF32 out(gpuDynInst, instData.VDST);

        in.readSrc();
        piConst.read();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            // Kept as two strict comparisons so that a NaN source is not
            // treated as out of range and still reaches std::sin.
            const bool outOfRange = in[ln] < -256.0 || in[ln] > 256.0;
            if (!outOfRange) {
                out[ln] = std::sin(in[ln] * 2.0 * piConst.rawData());
            } else {
                out[ln] = 0.0;
            }
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_cos_f32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU and F32 flags.
    Inst_VOP1__V_COS_F32::Inst_VOP1__V_COS_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cos_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_COS_F32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_COS_F32::~Inst_VOP1__V_COS_F32()
    {
    } // ~Inst_VOP1__V_COS_F32

    // D.f = cos(S0.f * 2 * PI).
    // Per enabled lane: sources below -256.0 or above 256.0 produce 0.0;
    // all others are scaled by 2.0 and the REG_PI operand value, then
    // passed to std::cos.
    // NOTE(review): the formula relies on REG_PI reading back as PI --
    // confirm against the operand's special-value definition.
    void
    Inst_VOP1__V_COS_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 in(gpuDynInst, instData.SRC0);
        ConstScalarOperandF32 piConst(gpuDynInst, REG_PI);
        VecOperandF32 out(gpuDynInst, instData.VDST);

        in.readSrc();
        piConst.read();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }

            // Kept as two strict comparisons so that a NaN source is not
            // treated as out of range and still reaches std::cos.
            const bool outOfRange = in[ln] < -256.0 || in[ln] > 256.0;
            if (!outOfRange) {
                out[ln] = std::cos(in[ln] * 2.0 * piConst.rawData());
            } else {
                out[ln] = 0.0;
            }
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_not_b32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU flag.
    Inst_VOP1__V_NOT_B32::Inst_VOP1__V_NOT_B32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_not_b32")
    {
        setFlag(ALU);
    } // Inst_VOP1__V_NOT_B32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_NOT_B32::~Inst_VOP1__V_NOT_B32()
    {
    } // ~Inst_VOP1__V_NOT_B32

    // D.u = ~S0.u.
    // Input and output modifiers not supported.
    // Writes the bitwise complement of the source for each lane enabled in
    // the wavefront's exec mask.
    void
    Inst_VOP1__V_NOT_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 in(gpuDynInst, instData.SRC0);
        VecOperandU32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = ~in[ln];
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_bfrev_b32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU flag.
    Inst_VOP1__V_BFREV_B32::Inst_VOP1__V_BFREV_B32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_bfrev_b32")
    {
        setFlag(ALU);
    } // Inst_VOP1__V_BFREV_B32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_BFREV_B32::~Inst_VOP1__V_BFREV_B32()
    {
    } // ~Inst_VOP1__V_BFREV_B32

    // D.u[31:0] = S0.u[0:31], bitfield reverse.
    // Input and output modifiers not supported.
    // Passes each lane enabled in the exec mask through reverseBits().
    void
    Inst_VOP1__V_BFREV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 in(gpuDynInst, instData.SRC0);
        VecOperandU32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = reverseBits(in[ln]);
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_ffbh_u32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU flag.
    Inst_VOP1__V_FFBH_U32::Inst_VOP1__V_FFBH_U32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_ffbh_u32")
    {
        setFlag(ALU);
    } // Inst_VOP1__V_FFBH_U32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_FFBH_U32::~Inst_VOP1__V_FFBH_U32()
    {
    } // ~Inst_VOP1__V_FFBH_U32

    // D.u = position of first 1 in S0.u from MSB;
    // D.u = 0xffffffff if S0.u == 0.
    // Passes each lane enabled in the exec mask through findFirstOneMsb().
    void
    Inst_VOP1__V_FFBH_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 in(gpuDynInst, instData.SRC0);
        VecOperandU32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = findFirstOneMsb(in[ln]);
        }

        out.write();
    }

    // Hands the VOP1 encoding and the "v_ffbl_b32" mnemonic to the
    // Inst_VOP1 base, then sets the ALU flag.
    Inst_VOP1__V_FFBL_B32::Inst_VOP1__V_FFBL_B32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_ffbl_b32")
    {
        setFlag(ALU);
    } // Inst_VOP1__V_FFBL_B32

    // Empty destructor body; no cleanup is performed here.
    Inst_VOP1__V_FFBL_B32::~Inst_VOP1__V_FFBL_B32()
    {
    } // ~Inst_VOP1__V_FFBL_B32

    // D.u = position of first 1 in S0.u from LSB;
    // D.u = 0xffffffff if S0.u == 0.
    // Passes each lane enabled in the exec mask through findFirstOne().
    void
    Inst_VOP1__V_FFBL_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 in(gpuDynInst, instData.SRC0);
        VecOperandU32 out(gpuDynInst, instData.VDST);

        in.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            if (!wave->execMask(ln)) {
                continue;
            }
            out[ln] = findFirstOne(in[ln]);
        }

        out.write();
    }

    // Builds the v_ffbh_i32 instruction: passes the VOP1 format pointer and
    // the mnemonic string to the Inst_VOP1 base, then sets the ALU flag.
    Inst_VOP1__V_FFBH_I32::Inst_VOP1__V_FFBH_I32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_ffbh_i32")
    {
        setFlag(ALU);
    } // Inst_VOP1__V_FFBH_I32

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_FFBH_I32::~Inst_VOP1__V_FFBH_I32()
    {
    } // ~Inst_VOP1__V_FFBH_I32

    // v_ffbh_i32: for each active lane, D.u = firstOppositeSignBit(S0.i).
    // ISA definition: position, counting from the MSB, of the first bit that
    // differs from the sign bit; 0xffffffff if S0.i == 0 or S0.i == 0xffffffff.
    void
    Inst_VOP1__V_FFBH_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 input(gpuDynInst, instData.SRC0);
        VecOperandU32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Lanes that execMask() reports as disabled are skipped.
            if (!wave->execMask(idx)) {
                continue;
            }

            result[idx] = firstOppositeSignBit(input[idx]);
        }

        result.write();
    }

    // Builds the v_frexp_exp_i32_f64 instruction: passes the VOP1 format
    // pointer and the mnemonic string to the Inst_VOP1 base, then sets the
    // ALU and F64 flags.
    Inst_VOP1__V_FREXP_EXP_I32_F64::Inst_VOP1__V_FREXP_EXP_I32_F64(
          InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_frexp_exp_i32_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_FREXP_EXP_I32_F64

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_FREXP_EXP_I32_F64::~Inst_VOP1__V_FREXP_EXP_I32_F64()
    {
    } // ~Inst_VOP1__V_FREXP_EXP_I32_F64

    // v_frexp_exp_i32_f64: for each active lane, D.i is the binary exponent
    // reported by std::frexp(S0.d); infinities and NaNs produce 0.
    void
    Inst_VOP1__V_FREXP_EXP_I32_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 input(gpuDynInst, instData.SRC0);
        VecOperandI32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            // Stays 0 for inf/NaN inputs, which never reach std::frexp().
            VecElemI32 exponent = 0;
            if (std::isfinite(input[idx])) {
                std::frexp(input[idx], &exponent);
            }
            result[idx] = exponent;
        }

        result.write();
    }

    // Builds the v_frexp_mant_f64 instruction: passes the VOP1 format pointer
    // and the mnemonic string to the Inst_VOP1 base, then sets the ALU and
    // F64 flags.
    Inst_VOP1__V_FREXP_MANT_F64::Inst_VOP1__V_FREXP_MANT_F64(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_frexp_mant_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_FREXP_MANT_F64

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_FREXP_MANT_F64::~Inst_VOP1__V_FREXP_MANT_F64()
    {
    } // ~Inst_VOP1__V_FREXP_MANT_F64

    // v_frexp_mant_f64: for each active lane, D.d is the mantissa returned by
    // std::frexp(S0.d); infinities and NaNs are passed through unchanged.
    void
    Inst_VOP1__V_FREXP_MANT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 input(gpuDynInst, instData.SRC0);
        VecOperandF64 result(gpuDynInst, instData.VDST);

        input.readSrc();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            if (std::isfinite(input[idx])) {
                // std::frexp() requires an exponent out-parameter; only the
                // returned mantissa is kept.
                VecElemI32 discardedExp(0);
                result[idx] = std::frexp(input[idx], &discardedExp);
            } else {
                result[idx] = input[idx];
            }
        }

        result.write();
    }

    // Builds the v_fract_f64 instruction: passes the VOP1 format pointer and
    // the mnemonic string to the Inst_VOP1 base, then sets the ALU and F64
    // flags.
    Inst_VOP1__V_FRACT_F64::Inst_VOP1__V_FRACT_F64(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_fract_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_FRACT_F64

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_FRACT_F64::~Inst_VOP1__V_FRACT_F64()
    {
    } // ~Inst_VOP1__V_FRACT_F64

    // D.d = S0.d + -floor(S0.d).
    // For each active lane, writes the floor-based fractional part of the
    // source, so finite negative inputs yield a non-negative fraction
    // (e.g. -1.25 -> 0.75).
    void
    Inst_VOP1__V_FRACT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // std::modf() is deliberately not used here: it splits
                // around truncation toward zero, so its fraction carries the
                // sign of the input and disagrees with the floor-based
                // definition for every negative non-integer.
                vdst[lane] = src[lane] - std::floor(src[lane]);
            }
        }

        vdst.write();
    }

    // Builds the v_frexp_exp_i32_f32 instruction: passes the VOP1 format
    // pointer and the mnemonic string to the Inst_VOP1 base, then sets the
    // ALU and F32 flags.
    Inst_VOP1__V_FREXP_EXP_I32_F32::Inst_VOP1__V_FREXP_EXP_I32_F32(
          InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_frexp_exp_i32_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_FREXP_EXP_I32_F32

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_FREXP_EXP_I32_F32::~Inst_VOP1__V_FREXP_EXP_I32_F32()
    {
    } // ~Inst_VOP1__V_FREXP_EXP_I32_F32

    // v_frexp_exp_i32_f32:
    //   if (S0.f == INF || S0.f == NAN) then D.i = 0;
    //   else D.i = Exponent(S0.f), taken from std::frexp(S0.f).
    void
    Inst_VOP1__V_FREXP_EXP_I32_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 input(gpuDynInst, instData.SRC0);
        VecOperandI32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            // Stays 0 for inf/NaN inputs, which never reach std::frexp().
            VecElemI32 exponent(0);
            if (std::isfinite(input[idx])) {
                std::frexp(input[idx], &exponent);
            }
            result[idx] = exponent;
        }

        result.write();
    }

    // Builds the v_frexp_mant_f32 instruction: passes the VOP1 format pointer
    // and the mnemonic string to the Inst_VOP1 base, then sets the ALU and
    // F32 flags.
    Inst_VOP1__V_FREXP_MANT_F32::Inst_VOP1__V_FREXP_MANT_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_frexp_mant_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_FREXP_MANT_F32

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_FREXP_MANT_F32::~Inst_VOP1__V_FREXP_MANT_F32()
    {
    } // ~Inst_VOP1__V_FREXP_MANT_F32

    // v_frexp_mant_f32:
    //   if (S0.f == INF || S0.f == NAN) then D.f = S0.f;
    //   else D.f = mantissa returned by std::frexp(S0.f).
    void
    Inst_VOP1__V_FREXP_MANT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 input(gpuDynInst, instData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            if (std::isfinite(input[idx])) {
                // std::frexp() requires an exponent out-parameter; only the
                // returned mantissa is kept.
                VecElemI32 discardedExp(0);
                result[idx] = std::frexp(input[idx], &discardedExp);
            } else {
                result[idx] = input[idx];
            }
        }

        result.write();
    }

    // Builds the v_clrexcp instruction: passes the VOP1 format pointer and
    // the mnemonic string to the Inst_VOP1 base, then sets the ALU flag.
    Inst_VOP1__V_CLREXCP::Inst_VOP1__V_CLREXCP(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_clrexcp")
    {
        setFlag(ALU);
    } // Inst_VOP1__V_CLREXCP

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_CLREXCP::~Inst_VOP1__V_CLREXCP()
    {
    } // ~Inst_VOP1__V_CLREXCP

    // v_clrexcp is not implemented: execution only calls
    // panicUnimplemented(); gpuDynInst is not used.
    void
    Inst_VOP1__V_CLREXCP::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cvt_f16_u16 instruction: passes the VOP1 format pointer
    // and the mnemonic string to the Inst_VOP1 base, then sets the ALU and
    // F16 flags.
    Inst_VOP1__V_CVT_F16_U16::Inst_VOP1__V_CVT_F16_U16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f16_u16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_CVT_F16_U16

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_CVT_F16_U16::~Inst_VOP1__V_CVT_F16_U16()
    {
    } // ~Inst_VOP1__V_CVT_F16_U16

    // D.f16 = uint16_to_flt16(S.u16).
    // Not implemented: execution only calls panicUnimplemented(); no operand
    // is read or written here.
    void
    Inst_VOP1__V_CVT_F16_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cvt_f16_i16 instruction: passes the VOP1 format pointer
    // and the mnemonic string to the Inst_VOP1 base, then sets the ALU and
    // F16 flags.
    Inst_VOP1__V_CVT_F16_I16::Inst_VOP1__V_CVT_F16_I16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f16_i16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_CVT_F16_I16

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_CVT_F16_I16::~Inst_VOP1__V_CVT_F16_I16()
    {
    } // ~Inst_VOP1__V_CVT_F16_I16

    // D.f16 = int16_to_flt16(S.i16).
    // Not implemented: execution only calls panicUnimplemented(); no operand
    // is read or written here.
    void
    Inst_VOP1__V_CVT_F16_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cvt_u16_f16 instruction: passes the VOP1 format pointer
    // and the mnemonic string to the Inst_VOP1 base, then sets the ALU and
    // F16 flags.
    Inst_VOP1__V_CVT_U16_F16::Inst_VOP1__V_CVT_U16_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_u16_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_CVT_U16_F16

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_CVT_U16_F16::~Inst_VOP1__V_CVT_U16_F16()
    {
    } // ~Inst_VOP1__V_CVT_U16_F16

    // D.u16 = flt16_to_uint16(S.f16).
    // Not implemented: execution only calls panicUnimplemented(); no operand
    // is read or written here.
    void
    Inst_VOP1__V_CVT_U16_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cvt_i16_f16 instruction: passes the VOP1 format pointer
    // and the mnemonic string to the Inst_VOP1 base, then sets the ALU and
    // F16 flags.
    Inst_VOP1__V_CVT_I16_F16::Inst_VOP1__V_CVT_I16_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_i16_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_CVT_I16_F16

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_CVT_I16_F16::~Inst_VOP1__V_CVT_I16_F16()
    {
    } // ~Inst_VOP1__V_CVT_I16_F16

    // D.i16 = flt16_to_int16(S.f16).
    // Not implemented: execution only calls panicUnimplemented(); no operand
    // is read or written here.
    void
    Inst_VOP1__V_CVT_I16_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_rcp_f16 instruction: passes the VOP1 format pointer and
    // the mnemonic string to the Inst_VOP1 base, then sets the ALU and F16
    // flags.
    Inst_VOP1__V_RCP_F16::Inst_VOP1__V_RCP_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_rcp_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_RCP_F16

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_RCP_F16::~Inst_VOP1__V_RCP_F16()
    {
    } // ~Inst_VOP1__V_RCP_F16

    // if (S0.f16 == 1.0f)
    //     D.f16 = 1.0f;
    // else
    //     D.f16 = 1 / S0.f16;
    // Not implemented: execution only calls panicUnimplemented(); no operand
    // is read or written here.
    void
    Inst_VOP1__V_RCP_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_sqrt_f16 instruction: passes the VOP1 format pointer and
    // the mnemonic string to the Inst_VOP1 base, then sets the ALU and F16
    // flags.
    Inst_VOP1__V_SQRT_F16::Inst_VOP1__V_SQRT_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_sqrt_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_SQRT_F16

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_SQRT_F16::~Inst_VOP1__V_SQRT_F16()
    {
    } // ~Inst_VOP1__V_SQRT_F16

    // if (S0.f16 == 1.0f)
    //     D.f16 = 1.0f;
    // else
    //     D.f16 = sqrt(S0.f16);
    // Not implemented: execution only calls panicUnimplemented(); no operand
    // is read or written here.
    void
    Inst_VOP1__V_SQRT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_rsq_f16 instruction: passes the VOP1 format pointer and
    // the mnemonic string to the Inst_VOP1 base, then sets the ALU and F16
    // flags.
    Inst_VOP1__V_RSQ_F16::Inst_VOP1__V_RSQ_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_rsq_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_RSQ_F16

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_RSQ_F16::~Inst_VOP1__V_RSQ_F16()
    {
    } // ~Inst_VOP1__V_RSQ_F16

    // if (S0.f16 == 1.0f)
    //     D.f16 = 1.0f;
    // else
    //     D.f16 = 1 / sqrt(S0.f16);
    // Not implemented: execution only calls panicUnimplemented(); no operand
    // is read or written here.
    void
    Inst_VOP1__V_RSQ_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_log_f16 instruction: passes the VOP1 format pointer and
    // the mnemonic string to the Inst_VOP1 base, then sets the ALU and F16
    // flags.
    Inst_VOP1__V_LOG_F16::Inst_VOP1__V_LOG_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_log_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_LOG_F16

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_LOG_F16::~Inst_VOP1__V_LOG_F16()
    {
    } // ~Inst_VOP1__V_LOG_F16

    // if (S0.f16 == 1.0f)
    //     D.f16 = 0.0f;
    // else
    //     D.f16 = log2(S0.f16);
    // Not implemented: execution only calls panicUnimplemented(); no operand
    // is read or written here.
    void
    Inst_VOP1__V_LOG_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_exp_f16 instruction: passes the VOP1 format pointer and
    // the mnemonic string to the Inst_VOP1 base, then sets the ALU and F16
    // flags.
    Inst_VOP1__V_EXP_F16::Inst_VOP1__V_EXP_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_exp_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_EXP_F16

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_EXP_F16::~Inst_VOP1__V_EXP_F16()
    {
    } // ~Inst_VOP1__V_EXP_F16

    // if (S0.f16 == 0.0f)
    //     D.f16 = 1.0f;
    // else
    //     D.f16 = pow(2.0, S0.f16).
    // Not implemented: execution only calls panicUnimplemented(); no operand
    // is read or written here.
    void
    Inst_VOP1__V_EXP_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_frexp_mant_f16 instruction: passes the VOP1 format pointer
    // and the mnemonic string to the Inst_VOP1 base, then sets the ALU and
    // F16 flags.
    Inst_VOP1__V_FREXP_MANT_F16::Inst_VOP1__V_FREXP_MANT_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_frexp_mant_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_FREXP_MANT_F16

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_FREXP_MANT_F16::~Inst_VOP1__V_FREXP_MANT_F16()
    {
    } // ~Inst_VOP1__V_FREXP_MANT_F16

    // if (S0.f16 == +-INF || S0.f16 == NAN)
    //     D.f16 = S0.f16;
    // else
    //     D.f16 = mantissa(S0.f16).
    // Not implemented: execution only calls panicUnimplemented(); no operand
    // is read or written here.
    void
    Inst_VOP1__V_FREXP_MANT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_frexp_exp_i16_f16 instruction: passes the VOP1 format
    // pointer and the mnemonic string to the Inst_VOP1 base, then sets the
    // ALU and F16 flags.
    Inst_VOP1__V_FREXP_EXP_I16_F16::Inst_VOP1__V_FREXP_EXP_I16_F16(
          InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_frexp_exp_i16_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_FREXP_EXP_I16_F16

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_FREXP_EXP_I16_F16::~Inst_VOP1__V_FREXP_EXP_I16_F16()
    {
    } // ~Inst_VOP1__V_FREXP_EXP_I16_F16

    // frexp(S0.f16, Exponent(S0.f16))
    // if (S0.f16 == +-INF || S0.f16 == NAN)
    //     D.i16 = 0;
    // else
    //     D.i16 = Exponent(S0.f16);
    // Not implemented: execution only calls panicUnimplemented(); no operand
    // is read or written here.
    void
    Inst_VOP1__V_FREXP_EXP_I16_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_floor_f16 instruction: passes the VOP1 format pointer and
    // the mnemonic string to the Inst_VOP1 base, then sets the ALU and F16
    // flags.
    Inst_VOP1__V_FLOOR_F16::Inst_VOP1__V_FLOOR_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_floor_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_FLOOR_F16

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_FLOOR_F16::~Inst_VOP1__V_FLOOR_F16()
    {
    } // ~Inst_VOP1__V_FLOOR_F16

    // D.f16 = floor(S0.f16);
    // Not implemented: execution only calls panicUnimplemented(); no operand
    // is read or written here.
    void
    Inst_VOP1__V_FLOOR_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_ceil_f16 instruction: passes the VOP1 format pointer and
    // the mnemonic string to the Inst_VOP1 base, then sets the ALU and F16
    // flags.
    Inst_VOP1__V_CEIL_F16::Inst_VOP1__V_CEIL_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_ceil_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_CEIL_F16

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_CEIL_F16::~Inst_VOP1__V_CEIL_F16()
    {
    } // ~Inst_VOP1__V_CEIL_F16

    // D.f16 = ceil(S0.f16);
    // Not implemented: execution only calls panicUnimplemented(); no operand
    // is read or written here.
    void
    Inst_VOP1__V_CEIL_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_trunc_f16 instruction: passes the VOP1 format pointer and
    // the mnemonic string to the Inst_VOP1 base, then sets the ALU and F16
    // flags.
    Inst_VOP1__V_TRUNC_F16::Inst_VOP1__V_TRUNC_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_trunc_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_TRUNC_F16

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_TRUNC_F16::~Inst_VOP1__V_TRUNC_F16()
    {
    } // ~Inst_VOP1__V_TRUNC_F16

    // D.f16 = trunc(S0.f16).
    // Not implemented: execution only calls panicUnimplemented(); no operand
    // is read or written here.
    void
    Inst_VOP1__V_TRUNC_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_rndne_f16 instruction: passes the VOP1 format pointer and
    // the mnemonic string to the Inst_VOP1 base, then sets the ALU and F16
    // flags.
    Inst_VOP1__V_RNDNE_F16::Inst_VOP1__V_RNDNE_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_rndne_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_RNDNE_F16

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_RNDNE_F16::~Inst_VOP1__V_RNDNE_F16()
    {
    } // ~Inst_VOP1__V_RNDNE_F16

    // D.f16 = roundNearestEven(S0.f16);
    // Not implemented: execution only calls panicUnimplemented(); no operand
    // is read or written here.
    void
    Inst_VOP1__V_RNDNE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_fract_f16 instruction: passes the VOP1 format pointer and
    // the mnemonic string to the Inst_VOP1 base, then sets the ALU and F16
    // flags.
    Inst_VOP1__V_FRACT_F16::Inst_VOP1__V_FRACT_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_fract_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_FRACT_F16

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_FRACT_F16::~Inst_VOP1__V_FRACT_F16()
    {
    } // ~Inst_VOP1__V_FRACT_F16

    // D.f16 = S0.f16 + -floor(S0.f16).
    // Not implemented: execution only calls panicUnimplemented(); no operand
    // is read or written here.
    void
    Inst_VOP1__V_FRACT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_sin_f16 instruction: passes the VOP1 format pointer and
    // the mnemonic string to the Inst_VOP1 base, then sets the ALU and F16
    // flags.
    Inst_VOP1__V_SIN_F16::Inst_VOP1__V_SIN_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_sin_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_SIN_F16

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_SIN_F16::~Inst_VOP1__V_SIN_F16()
    {
    } // ~Inst_VOP1__V_SIN_F16

    // D.f16 = sin(S0.f16 * 2 * PI).
    // Not implemented: execution only calls panicUnimplemented(); no operand
    // is read or written here.
    void
    Inst_VOP1__V_SIN_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cos_f16 instruction: passes the VOP1 format pointer and
    // the mnemonic string to the Inst_VOP1 base, then sets the ALU and F16
    // flags.
    Inst_VOP1__V_COS_F16::Inst_VOP1__V_COS_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cos_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP1__V_COS_F16

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_COS_F16::~Inst_VOP1__V_COS_F16()
    {
    } // ~Inst_VOP1__V_COS_F16

    // D.f16 = cos(S0.f16 * 2 * PI).
    // Not implemented: execution only calls panicUnimplemented(); no operand
    // is read or written here.
    void
    Inst_VOP1__V_COS_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_exp_legacy_f32 instruction: passes the VOP1 format pointer
    // and the mnemonic string to the Inst_VOP1 base, then sets the ALU and
    // F32 flags.
    Inst_VOP1__V_EXP_LEGACY_F32::Inst_VOP1__V_EXP_LEGACY_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_exp_legacy_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_EXP_LEGACY_F32

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_EXP_LEGACY_F32::~Inst_VOP1__V_EXP_LEGACY_F32()
    {
    } // ~Inst_VOP1__V_EXP_LEGACY_F32

    // v_exp_legacy_f32: for each active lane, D.f = pow(2.0, S0.f).
    void
    Inst_VOP1__V_EXP_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 input(gpuDynInst, instData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Lanes that execMask() reports as disabled are skipped.
            if (!wave->execMask(idx)) {
                continue;
            }

            result[idx] = std::pow(2.0, input[idx]);
        }

        result.write();
    }

    // Builds the v_log_legacy_f32 instruction: passes the VOP1 format pointer
    // and the mnemonic string to the Inst_VOP1 base, then sets the ALU and
    // F32 flags.
    Inst_VOP1__V_LOG_LEGACY_F32::Inst_VOP1__V_LOG_LEGACY_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_log_legacy_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_LOG_LEGACY_F32

    // Empty destructor body: nothing is released here.
    Inst_VOP1__V_LOG_LEGACY_F32::~Inst_VOP1__V_LOG_LEGACY_F32()
    {
    } // ~Inst_VOP1__V_LOG_LEGACY_F32

    // v_log_legacy_f32: for each active lane, D.f = log2(S0.f).
    void
    Inst_VOP1__V_LOG_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 input(gpuDynInst, instData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Lanes that execMask() reports as disabled are skipped.
            if (!wave->execMask(idx)) {
                continue;
            }

            result[idx] = std::log2(input[idx]);
        }

        result.write();
    }

    // Builds the v_cmp_class_f32 instruction: passes the VOPC format pointer
    // and the mnemonic string to the Inst_VOPC base, then sets the ALU and
    // F32 flags.
    Inst_VOPC__V_CMP_CLASS_F32::Inst_VOPC__V_CMP_CLASS_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_class_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_CLASS_F32

    // Empty destructor body: nothing is released here.
    Inst_VOPC__V_CMP_CLASS_F32::~Inst_VOPC__V_CMP_CLASS_F32()
    {
    } // ~Inst_VOPC__V_CMP_CLASS_F32

    // v_cmp_class_f32: VCC = IEEE numeric class test of S0.f against the
    // class selector in S1.u. The VCC bit of an active lane is set when the
    // value belongs to at least one class whose selector bit is set:
    //   S1.u[0] -- signaling NaN          S1.u[1] -- quiet NaN
    //   S1.u[2] -- negative infinity      S1.u[9] -- positive infinity
    //   S1.u[3] -- negative normal        S1.u[8] -- positive normal
    //   S1.u[4] -- negative denormal      S1.u[7] -- positive denormal
    //   S1.u[5] -- negative zero          S1.u[6] -- positive zero
    // Signaling and quiet NaNs are not told apart here: any NaN matches when
    // either S1.u[0] or S1.u[1] is set. Bits are only ever set, never
    // cleared, by this function.
    void
    Inst_VOPC__V_CMP_CLASS_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 value(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 selector(gpuDynInst, instData.VSRC1);
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        value.readSrc();
        selector.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool negative = std::signbit(value[idx]);
            bool matched = false;

            // A value falls in exactly one fpclassify() category, so only
            // the selector bit(s) of that category need to be consulted.
            switch (std::fpclassify(value[idx])) {
              case FP_NAN:
                matched = bits(selector[idx], 0) || bits(selector[idx], 1);
                break;
              case FP_INFINITE:
                matched = bits(selector[idx], negative ? 2 : 9);
                break;
              case FP_NORMAL:
                matched = bits(selector[idx], negative ? 3 : 8);
                break;
              case FP_SUBNORMAL:
                matched = bits(selector[idx], negative ? 4 : 7);
                break;
              case FP_ZERO:
                matched = bits(selector[idx], negative ? 5 : 6);
                break;
              default:
                break;
            }

            if (matched) {
                vcc.setBit(idx, 1);
            }
        }

        vcc.write();
    }

    // Builds the v_cmpx_class_f32 instruction: passes the VOPC format pointer
    // and the mnemonic string to the Inst_VOPC base, then sets the ALU and
    // F32 flags.
    Inst_VOPC__V_CMPX_CLASS_F32::Inst_VOPC__V_CMPX_CLASS_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_class_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMPX_CLASS_F32

    // Empty destructor body: nothing is released here.
    Inst_VOPC__V_CMPX_CLASS_F32::~Inst_VOPC__V_CMPX_CLASS_F32()
    {
    } // ~Inst_VOPC__V_CMPX_CLASS_F32

    // v_cmpx_class_f32: EXEC, VCC = IEEE numeric class test of S0.f against
    // the class selector in S1.u. The VCC bit of an active lane is set when
    // the value belongs to at least one class whose selector bit is set:
    //   S1.u[0] -- signaling NaN          S1.u[1] -- quiet NaN
    //   S1.u[2] -- negative infinity      S1.u[9] -- positive infinity
    //   S1.u[3] -- negative normal        S1.u[8] -- positive normal
    //   S1.u[4] -- negative denormal      S1.u[7] -- positive denormal
    //   S1.u[5] -- negative zero          S1.u[6] -- positive zero
    // Signaling and quiet NaNs are not told apart here: any NaN matches when
    // either S1.u[0] or S1.u[1] is set. After VCC is written, its raw value
    // also replaces the wavefront's exec mask.
    void
    Inst_VOPC__V_CMPX_CLASS_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 value(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 selector(gpuDynInst, instData.VSRC1);
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        value.readSrc();
        selector.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool negative = std::signbit(value[idx]);
            bool matched = false;

            // A value falls in exactly one fpclassify() category, so only
            // the selector bit(s) of that category need to be consulted.
            switch (std::fpclassify(value[idx])) {
              case FP_NAN:
                matched = bits(selector[idx], 0) || bits(selector[idx], 1);
                break;
              case FP_INFINITE:
                matched = bits(selector[idx], negative ? 2 : 9);
                break;
              case FP_NORMAL:
                matched = bits(selector[idx], negative ? 3 : 8);
                break;
              case FP_SUBNORMAL:
                matched = bits(selector[idx], negative ? 4 : 7);
                break;
              case FP_ZERO:
                matched = bits(selector[idx], negative ? 5 : 6);
                break;
              default:
                break;
            }

            if (matched) {
                vcc.setBit(idx, 1);
            }
        }

        vcc.write();
        wave->execMask() = vcc.rawData();
    }

    // Builds the v_cmp_class_f64 instruction: passes the VOPC format pointer
    // and the mnemonic string to the Inst_VOPC base, then sets the ALU and
    // F64 flags.
    Inst_VOPC__V_CMP_CLASS_F64::Inst_VOPC__V_CMP_CLASS_F64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_class_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOPC__V_CMP_CLASS_F64

    // Empty destructor body: nothing is released here.
    Inst_VOPC__V_CMP_CLASS_F64::~Inst_VOPC__V_CMP_CLASS_F64()
    {
    } // ~Inst_VOPC__V_CMP_CLASS_F64

    // v_cmp_class_f64: VCC = IEEE numeric class test of S0.d against the
    // class selector in S1.u. The VCC bit of an active lane is set when the
    // value belongs to at least one class whose selector bit is set:
    //   S1.u[0] -- signaling NaN          S1.u[1] -- quiet NaN
    //   S1.u[2] -- negative infinity      S1.u[9] -- positive infinity
    //   S1.u[3] -- negative normal        S1.u[8] -- positive normal
    //   S1.u[4] -- negative denormal      S1.u[7] -- positive denormal
    //   S1.u[5] -- negative zero          S1.u[6] -- positive zero
    // Signaling and quiet NaNs are not told apart here: any NaN matches when
    // either S1.u[0] or S1.u[1] is set. Bits are only ever set, never
    // cleared, by this function.
    void
    Inst_VOPC__V_CMP_CLASS_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 value(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 selector(gpuDynInst, instData.VSRC1);
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        value.readSrc();
        selector.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool negative = std::signbit(value[idx]);
            bool matched = false;

            // A value falls in exactly one fpclassify() category, so only
            // the selector bit(s) of that category need to be consulted.
            switch (std::fpclassify(value[idx])) {
              case FP_NAN:
                matched = bits(selector[idx], 0) || bits(selector[idx], 1);
                break;
              case FP_INFINITE:
                matched = bits(selector[idx], negative ? 2 : 9);
                break;
              case FP_NORMAL:
                matched = bits(selector[idx], negative ? 3 : 8);
                break;
              case FP_SUBNORMAL:
                matched = bits(selector[idx], negative ? 4 : 7);
                break;
              case FP_ZERO:
                matched = bits(selector[idx], negative ? 5 : 6);
                break;
              default:
                break;
            }

            if (matched) {
                vcc.setBit(idx, 1);
            }
        }

        vcc.write();
    }

    // Builds the v_cmpx_class_f64 instruction: passes the VOPC format pointer
    // and the mnemonic string to the Inst_VOPC base, then sets the ALU and
    // F64 flags.
    Inst_VOPC__V_CMPX_CLASS_F64::Inst_VOPC__V_CMPX_CLASS_F64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_class_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOPC__V_CMPX_CLASS_F64

    // Empty destructor body: nothing is released here.
    Inst_VOPC__V_CMPX_CLASS_F64::~Inst_VOPC__V_CMPX_CLASS_F64()
    {
    } // ~Inst_VOPC__V_CMPX_CLASS_F64

    // v_cmpx_class_f64: EXEC, VCC = IEEE numeric class test of S0.d against
    // the class selector in S1.u. The VCC bit of an active lane is set when
    // the value belongs to at least one class whose selector bit is set:
    //   S1.u[0] -- signaling NaN          S1.u[1] -- quiet NaN
    //   S1.u[2] -- negative infinity      S1.u[9] -- positive infinity
    //   S1.u[3] -- negative normal        S1.u[8] -- positive normal
    //   S1.u[4] -- negative denormal      S1.u[7] -- positive denormal
    //   S1.u[5] -- negative zero          S1.u[6] -- positive zero
    // Signaling and quiet NaNs are not told apart here: any NaN matches when
    // either S1.u[0] or S1.u[1] is set. After VCC is written, its raw value
    // also replaces the wavefront's exec mask.
    void
    Inst_VOPC__V_CMPX_CLASS_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 value(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 selector(gpuDynInst, instData.VSRC1);
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        value.readSrc();
        selector.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool negative = std::signbit(value[idx]);
            bool matched = false;

            // A value falls in exactly one fpclassify() category, so only
            // the selector bit(s) of that category need to be consulted.
            switch (std::fpclassify(value[idx])) {
              case FP_NAN:
                matched = bits(selector[idx], 0) || bits(selector[idx], 1);
                break;
              case FP_INFINITE:
                matched = bits(selector[idx], negative ? 2 : 9);
                break;
              case FP_NORMAL:
                matched = bits(selector[idx], negative ? 3 : 8);
                break;
              case FP_SUBNORMAL:
                matched = bits(selector[idx], negative ? 4 : 7);
                break;
              case FP_ZERO:
                matched = bits(selector[idx], negative ? 5 : 6);
                break;
              default:
                break;
            }

            if (matched) {
                vcc.setBit(idx, 1);
            }
        }

        vcc.write();
        wave->execMask() = vcc.rawData();
    }

    // Constructor for v_cmp_class_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMP_CLASS_F16::Inst_VOPC__V_CMP_CLASS_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_class_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMP_CLASS_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_CLASS_F16::~Inst_VOPC__V_CMP_CLASS_F16()
    {
    } // ~Inst_VOPC__V_CMP_CLASS_F16

    // VCC = IEEE numeric class function specified in S1.u, performed on S0.f16
    // The function reports true if the floating point value is any of the
    // numeric types selected in S1.u according to the following list:
    // S1.u[0] -- value is a signaling NaN.
    // S1.u[1] -- value is a quiet NaN.
    // S1.u[2] -- value is negative infinity.
    // S1.u[3] -- value is a negative normal value.
    // S1.u[4] -- value is a negative denormal value.
    // S1.u[5] -- value is negative zero.
    // S1.u[6] -- value is positive zero.
    // S1.u[7] -- value is a positive denormal value.
    // S1.u[8] -- value is a positive normal value.
    // S1.u[9] -- value is positive infinity.
    //
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMP_CLASS_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmpx_class_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMPX_CLASS_F16::Inst_VOPC__V_CMPX_CLASS_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_class_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMPX_CLASS_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMPX_CLASS_F16::~Inst_VOPC__V_CMPX_CLASS_F16()
    {
    } // ~Inst_VOPC__V_CMPX_CLASS_F16

    // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on
    // S0.f16
    // The function reports true if the floating point value is any of the
    // numeric types selected in S1.u according to the following list:
    // S1.u[0] -- value is a signaling NaN.
    // S1.u[1] -- value is a quiet NaN.
    // S1.u[2] -- value is negative infinity.
    // S1.u[3] -- value is a negative normal value.
    // S1.u[4] -- value is a negative denormal value.
    // S1.u[5] -- value is negative zero.
    // S1.u[6] -- value is positive zero.
    // S1.u[7] -- value is a positive denormal value.
    // S1.u[8] -- value is a positive normal value.
    // S1.u[9] -- value is positive infinity.
    //
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMPX_CLASS_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmp_f_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMP_F_F16::Inst_VOPC__V_CMP_F_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_f_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMP_F_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_F_F16::~Inst_VOPC__V_CMP_F_F16()
    {
    } // ~Inst_VOPC__V_CMP_F_F16

    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMP_F_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmp_lt_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMP_LT_F16::Inst_VOPC__V_CMP_LT_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_lt_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMP_LT_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_LT_F16::~Inst_VOPC__V_CMP_LT_F16()
    {
    } // ~Inst_VOPC__V_CMP_LT_F16

    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMP_LT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmp_eq_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMP_EQ_F16::Inst_VOPC__V_CMP_EQ_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_eq_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMP_EQ_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_EQ_F16::~Inst_VOPC__V_CMP_EQ_F16()
    {
    } // ~Inst_VOPC__V_CMP_EQ_F16

    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMP_EQ_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmp_le_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMP_LE_F16::Inst_VOPC__V_CMP_LE_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_le_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMP_LE_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_LE_F16::~Inst_VOPC__V_CMP_LE_F16()
    {
    } // ~Inst_VOPC__V_CMP_LE_F16

    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMP_LE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmp_gt_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMP_GT_F16::Inst_VOPC__V_CMP_GT_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_gt_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMP_GT_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_GT_F16::~Inst_VOPC__V_CMP_GT_F16()
    {
    } // ~Inst_VOPC__V_CMP_GT_F16

    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMP_GT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmp_lg_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMP_LG_F16::Inst_VOPC__V_CMP_LG_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_lg_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMP_LG_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_LG_F16::~Inst_VOPC__V_CMP_LG_F16()
    {
    } // ~Inst_VOPC__V_CMP_LG_F16

    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMP_LG_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmp_ge_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMP_GE_F16::Inst_VOPC__V_CMP_GE_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ge_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMP_GE_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_GE_F16::~Inst_VOPC__V_CMP_GE_F16()
    {
    } // ~Inst_VOPC__V_CMP_GE_F16

    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMP_GE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmp_o_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMP_O_F16::Inst_VOPC__V_CMP_O_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_o_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMP_O_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_O_F16::~Inst_VOPC__V_CMP_O_F16()
    {
    } // ~Inst_VOPC__V_CMP_O_F16

    // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMP_O_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmp_u_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMP_U_F16::Inst_VOPC__V_CMP_U_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_u_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMP_U_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_U_F16::~Inst_VOPC__V_CMP_U_F16()
    {
    } // ~Inst_VOPC__V_CMP_U_F16

    // D.u64[threadID] = (isNan(S0)  ||  isNan(S1)); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMP_U_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmp_nge_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMP_NGE_F16::Inst_VOPC__V_CMP_NGE_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_nge_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMP_NGE_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_NGE_F16::~Inst_VOPC__V_CMP_NGE_F16()
    {
    } // ~Inst_VOPC__V_CMP_NGE_F16

    // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMP_NGE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmp_nlg_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMP_NLG_F16::Inst_VOPC__V_CMP_NLG_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_nlg_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMP_NLG_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_NLG_F16::~Inst_VOPC__V_CMP_NLG_F16()
    {
    } // ~Inst_VOPC__V_CMP_NLG_F16

    // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMP_NLG_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmp_ngt_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMP_NGT_F16::Inst_VOPC__V_CMP_NGT_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ngt_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMP_NGT_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_NGT_F16::~Inst_VOPC__V_CMP_NGT_F16()
    {
    } // ~Inst_VOPC__V_CMP_NGT_F16

    // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMP_NGT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmp_nle_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMP_NLE_F16::Inst_VOPC__V_CMP_NLE_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_nle_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMP_NLE_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_NLE_F16::~Inst_VOPC__V_CMP_NLE_F16()
    {
    } // ~Inst_VOPC__V_CMP_NLE_F16

    // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMP_NLE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmp_neq_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMP_NEQ_F16::Inst_VOPC__V_CMP_NEQ_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_neq_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMP_NEQ_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_NEQ_F16::~Inst_VOPC__V_CMP_NEQ_F16()
    {
    } // ~Inst_VOPC__V_CMP_NEQ_F16

    // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMP_NEQ_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmp_nlt_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMP_NLT_F16::Inst_VOPC__V_CMP_NLT_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_nlt_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMP_NLT_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_NLT_F16::~Inst_VOPC__V_CMP_NLT_F16()
    {
    } // ~Inst_VOPC__V_CMP_NLT_F16

    // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMP_NLT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmp_tru_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMP_TRU_F16::Inst_VOPC__V_CMP_TRU_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_tru_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMP_TRU_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_TRU_F16::~Inst_VOPC__V_CMP_TRU_F16()
    {
    } // ~Inst_VOPC__V_CMP_TRU_F16

    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMP_TRU_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmpx_f_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMPX_F_F16::Inst_VOPC__V_CMPX_F_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_f_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMPX_F_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMPX_F_F16::~Inst_VOPC__V_CMPX_F_F16()
    {
    } // ~Inst_VOPC__V_CMPX_F_F16

    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMPX_F_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmpx_lt_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMPX_LT_F16::Inst_VOPC__V_CMPX_LT_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_lt_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMPX_LT_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMPX_LT_F16::~Inst_VOPC__V_CMPX_LT_F16()
    {
    } // ~Inst_VOPC__V_CMPX_LT_F16

    // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMPX_LT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmpx_eq_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMPX_EQ_F16::Inst_VOPC__V_CMPX_EQ_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_eq_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMPX_EQ_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMPX_EQ_F16::~Inst_VOPC__V_CMPX_EQ_F16()
    {
    } // ~Inst_VOPC__V_CMPX_EQ_F16

    // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMPX_EQ_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmpx_le_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMPX_LE_F16::Inst_VOPC__V_CMPX_LE_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_le_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMPX_LE_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMPX_LE_F16::~Inst_VOPC__V_CMPX_LE_F16()
    {
    } // ~Inst_VOPC__V_CMPX_LE_F16

    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMPX_LE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmpx_gt_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMPX_GT_F16::Inst_VOPC__V_CMPX_GT_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_gt_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMPX_GT_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMPX_GT_F16::~Inst_VOPC__V_CMPX_GT_F16()
    {
    } // ~Inst_VOPC__V_CMPX_GT_F16

    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMPX_GT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmpx_lg_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMPX_LG_F16::Inst_VOPC__V_CMPX_LG_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_lg_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMPX_LG_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMPX_LG_F16::~Inst_VOPC__V_CMPX_LG_F16()
    {
    } // ~Inst_VOPC__V_CMPX_LG_F16

    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMPX_LG_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmpx_ge_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMPX_GE_F16::Inst_VOPC__V_CMPX_GE_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ge_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMPX_GE_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMPX_GE_F16::~Inst_VOPC__V_CMPX_GE_F16()
    {
    } // ~Inst_VOPC__V_CMPX_GE_F16

    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMPX_GE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmpx_o_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMPX_O_F16::Inst_VOPC__V_CMPX_O_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_o_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMPX_O_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMPX_O_F16::~Inst_VOPC__V_CMPX_O_F16()
    {
    } // ~Inst_VOPC__V_CMPX_O_F16

    // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC
    // encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMPX_O_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmpx_u_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMPX_U_F16::Inst_VOPC__V_CMPX_U_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_u_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMPX_U_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMPX_U_F16::~Inst_VOPC__V_CMPX_U_F16()
    {
    } // ~Inst_VOPC__V_CMPX_U_F16

    // EXEC,D.u64[threadID] = (isNan(S0)  ||  isNan(S1)); D = VCC in VOPC
    // encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMPX_U_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmpx_nge_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMPX_NGE_F16::Inst_VOPC__V_CMPX_NGE_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_nge_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMPX_NGE_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMPX_NGE_F16::~Inst_VOPC__V_CMPX_NGE_F16()
    {
    } // ~Inst_VOPC__V_CMPX_NGE_F16

    // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMPX_NGE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmpx_nlg_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMPX_NLG_F16::Inst_VOPC__V_CMPX_NLG_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_nlg_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMPX_NLG_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMPX_NLG_F16::~Inst_VOPC__V_CMPX_NLG_F16()
    {
    } // ~Inst_VOPC__V_CMPX_NLG_F16

    // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMPX_NLG_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmpx_ngt_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMPX_NGT_F16::Inst_VOPC__V_CMPX_NGT_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ngt_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMPX_NGT_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMPX_NGT_F16::~Inst_VOPC__V_CMPX_NGT_F16()
    {
    } // ~Inst_VOPC__V_CMPX_NGT_F16

    // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMPX_NGT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmpx_nle_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMPX_NLE_F16::Inst_VOPC__V_CMPX_NLE_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_nle_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMPX_NLE_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMPX_NLE_F16::~Inst_VOPC__V_CMPX_NLE_F16()
    {
    } // ~Inst_VOPC__V_CMPX_NLE_F16

    // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMPX_NLE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmpx_neq_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMPX_NEQ_F16::Inst_VOPC__V_CMPX_NEQ_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_neq_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMPX_NEQ_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMPX_NEQ_F16::~Inst_VOPC__V_CMPX_NEQ_F16()
    {
    } // ~Inst_VOPC__V_CMPX_NEQ_F16

    // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMPX_NEQ_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmpx_nlt_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMPX_NLT_F16::Inst_VOPC__V_CMPX_NLT_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_nlt_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMPX_NLT_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMPX_NLT_F16::~Inst_VOPC__V_CMPX_NLT_F16()
    {
    } // ~Inst_VOPC__V_CMPX_NLT_F16

    // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMPX_NLT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmpx_tru_f16: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F16 flags.
    Inst_VOPC__V_CMPX_TRU_F16::Inst_VOPC__V_CMPX_TRU_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_tru_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOPC__V_CMPX_TRU_F16

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMPX_TRU_F16::~Inst_VOPC__V_CMPX_TRU_F16()
    {
    } // ~Inst_VOPC__V_CMPX_TRU_F16

    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
    // Not modelled: the body only calls panicUnimplemented() and never reads
    // gpuDynInst.
    void
    Inst_VOPC__V_CMPX_TRU_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor for v_cmp_f_f32: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F32 flags.
    Inst_VOPC__V_CMP_F_F32::Inst_VOPC__V_CMP_F_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_f_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_F_F32

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_F_F32::~Inst_VOPC__V_CMP_F_F32()
    {
    } // ~Inst_VOPC__V_CMP_F_F32

    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
    // No source operand is read: the VCC bit of every lane enabled in the
    // exec mask is cleared, then the VCC operand is written back.
    void
    Inst_VOPC__V_CMP_F_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                // disabled lane: its bit is not modified here
                continue;
            }

            vcc.setBit(idx, 0);
        }

        vcc.write();
    }

    // Constructor for v_cmp_lt_f32: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F32 flags.
    Inst_VOPC__V_CMP_LT_F32::Inst_VOPC__V_CMP_LT_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_lt_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_LT_F32

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_LT_F32::~Inst_VOPC__V_CMP_LT_F32()
    {
    } // ~Inst_VOPC__V_CMP_LT_F32

    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    // Only lanes enabled in the exec mask get their VCC bit updated.
    void
    Inst_VOPC__V_CMP_LT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLess = lhs[idx] < rhs[idx];
            vcc.setBit(idx, isLess ? 1 : 0);
        }

        vcc.write();
    }

    // Constructor for v_cmp_eq_f32: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F32 flags.
    Inst_VOPC__V_CMP_EQ_F32::Inst_VOPC__V_CMP_EQ_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_eq_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_EQ_F32

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_EQ_F32::~Inst_VOPC__V_CMP_EQ_F32()
    {
    } // ~Inst_VOPC__V_CMP_EQ_F32

    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    // Only lanes enabled in the exec mask get their VCC bit updated.
    void
    Inst_VOPC__V_CMP_EQ_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isEqual = lhs[idx] == rhs[idx];
            vcc.setBit(idx, isEqual ? 1 : 0);
        }

        vcc.write();
    }

    // Constructor for v_cmp_le_f32: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F32 flags.
    Inst_VOPC__V_CMP_LE_F32::Inst_VOPC__V_CMP_LE_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_le_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_LE_F32

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_LE_F32::~Inst_VOPC__V_CMP_LE_F32()
    {
    } // ~Inst_VOPC__V_CMP_LE_F32

    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    // Only lanes enabled in the exec mask get their VCC bit updated.
    void
    Inst_VOPC__V_CMP_LE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLessEqual = lhs[idx] <= rhs[idx];
            vcc.setBit(idx, isLessEqual ? 1 : 0);
        }

        vcc.write();
    }

    // Constructor for v_cmp_gt_f32: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F32 flags.
    Inst_VOPC__V_CMP_GT_F32::Inst_VOPC__V_CMP_GT_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_gt_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_GT_F32

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_GT_F32::~Inst_VOPC__V_CMP_GT_F32()
    {
    } // ~Inst_VOPC__V_CMP_GT_F32

    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    // Only lanes enabled in the exec mask get their VCC bit updated.
    void
    Inst_VOPC__V_CMP_GT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isGreater = lhs[idx] > rhs[idx];
            vcc.setBit(idx, isGreater ? 1 : 0);
        }

        vcc.write();
    }

    // Constructor for v_cmp_lg_f32: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F32 flags.
    Inst_VOPC__V_CMP_LG_F32::Inst_VOPC__V_CMP_LG_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_lg_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_LG_F32

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_LG_F32::~Inst_VOPC__V_CMP_LG_F32()
    {
    } // ~Inst_VOPC__V_CMP_LG_F32

    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    // "<>" is evaluated as (S0 < S1) || (S0 > S1). Only lanes enabled in the
    // exec mask get their VCC bit updated.
    void
    Inst_VOPC__V_CMP_LG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool lessOrGreater = lhs[idx] < rhs[idx]
                || lhs[idx] > rhs[idx];
            vcc.setBit(idx, lessOrGreater ? 1 : 0);
        }

        vcc.write();
    }

    // Constructor for v_cmp_ge_f32: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F32 flags.
    Inst_VOPC__V_CMP_GE_F32::Inst_VOPC__V_CMP_GE_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ge_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_GE_F32

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_GE_F32::~Inst_VOPC__V_CMP_GE_F32()
    {
    } // ~Inst_VOPC__V_CMP_GE_F32

    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    // Only lanes enabled in the exec mask get their VCC bit updated.
    void
    Inst_VOPC__V_CMP_GE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isGreaterEqual = lhs[idx] >= rhs[idx];
            vcc.setBit(idx, isGreaterEqual ? 1 : 0);
        }

        vcc.write();
    }

    // Constructor for v_cmp_o_f32: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F32 flags.
    Inst_VOPC__V_CMP_O_F32::Inst_VOPC__V_CMP_O_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_o_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_O_F32

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_O_F32::~Inst_VOPC__V_CMP_O_F32()
    {
    } // ~Inst_VOPC__V_CMP_O_F32

    // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding.
    // Only lanes enabled in the exec mask get their VCC bit updated.
    void
    Inst_VOPC__V_CMP_O_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            // the lane is "ordered" exactly when it is not unordered
            const bool anyNan = std::isnan(lhs[idx])
                || std::isnan(rhs[idx]);
            vcc.setBit(idx, anyNan ? 0 : 1);
        }

        vcc.write();
    }

    // Constructor for v_cmp_u_f32: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F32 flags.
    Inst_VOPC__V_CMP_U_F32::Inst_VOPC__V_CMP_U_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_u_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_U_F32

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_U_F32::~Inst_VOPC__V_CMP_U_F32()
    {
    } // ~Inst_VOPC__V_CMP_U_F32

    // D.u64[threadID] = (isNan(S0)  ||  isNan(S1)); D = VCC in VOPC encoding.
    // Only lanes enabled in the exec mask get their VCC bit updated.
    void
    Inst_VOPC__V_CMP_U_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool anyNan = std::isnan(lhs[idx])
                || std::isnan(rhs[idx]);
            vcc.setBit(idx, anyNan ? 1 : 0);
        }

        vcc.write();
    }

    // Constructor for v_cmp_nge_f32: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F32 flags.
    Inst_VOPC__V_CMP_NGE_F32::Inst_VOPC__V_CMP_NGE_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_nge_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_NGE_F32

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_NGE_F32::~Inst_VOPC__V_CMP_NGE_F32()
    {
    } // ~Inst_VOPC__V_CMP_NGE_F32

    // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
    // Evaluated as the negation of ">=", not as "<". Only lanes enabled in
    // the exec mask get their VCC bit updated.
    void
    Inst_VOPC__V_CMP_NGE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isGreaterEqual = lhs[idx] >= rhs[idx];
            vcc.setBit(idx, isGreaterEqual ? 0 : 1);
        }

        vcc.write();
    }

    // Constructor for v_cmp_nlg_f32: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F32 flags.
    Inst_VOPC__V_CMP_NLG_F32::Inst_VOPC__V_CMP_NLG_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_nlg_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_NLG_F32

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_NLG_F32::~Inst_VOPC__V_CMP_NLG_F32()
    {
    } // ~Inst_VOPC__V_CMP_NLG_F32

    // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
    // "<>" is evaluated as (S0 < S1) || (S0 > S1) and then negated. Only
    // lanes enabled in the exec mask get their VCC bit updated.
    void
    Inst_VOPC__V_CMP_NLG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool lessOrGreater = lhs[idx] < rhs[idx]
                || lhs[idx] > rhs[idx];
            vcc.setBit(idx, lessOrGreater ? 0 : 1);
        }

        vcc.write();
    }

    // Constructor for v_cmp_ngt_f32: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F32 flags.
    Inst_VOPC__V_CMP_NGT_F32::Inst_VOPC__V_CMP_NGT_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ngt_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_NGT_F32

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_NGT_F32::~Inst_VOPC__V_CMP_NGT_F32()
    {
    } // ~Inst_VOPC__V_CMP_NGT_F32

    // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding.
    // Evaluated as the negation of ">", not as "<=". Only lanes enabled in
    // the exec mask get their VCC bit updated.
    void
    Inst_VOPC__V_CMP_NGT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isGreater = lhs[idx] > rhs[idx];
            vcc.setBit(idx, isGreater ? 0 : 1);
        }

        vcc.write();
    }

    // Constructor for v_cmp_nle_f32: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F32 flags.
    Inst_VOPC__V_CMP_NLE_F32::Inst_VOPC__V_CMP_NLE_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_nle_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_NLE_F32

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_NLE_F32::~Inst_VOPC__V_CMP_NLE_F32()
    {
    } // ~Inst_VOPC__V_CMP_NLE_F32

    // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
    // Evaluated as the negation of "<=", not as ">". Only lanes enabled in
    // the exec mask get their VCC bit updated.
    void
    Inst_VOPC__V_CMP_NLE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLessEqual = lhs[idx] <= rhs[idx];
            vcc.setBit(idx, isLessEqual ? 0 : 1);
        }

        vcc.write();
    }

    // Constructor for v_cmp_neq_f32: forwards iFmt and the mnemonic to the
    // Inst_VOPC base, then sets the ALU and F32 flags.
    Inst_VOPC__V_CMP_NEQ_F32::Inst_VOPC__V_CMP_NEQ_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_neq_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_NEQ_F32

    // Destructor: the body is empty, no instruction-specific cleanup runs.
    Inst_VOPC__V_CMP_NEQ_F32::~Inst_VOPC__V_CMP_NEQ_F32()
    {
    } // ~Inst_VOPC__V_CMP_NEQ_F32

    // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
    // Implemented with the "!=" operator. Only lanes enabled in the exec
    // mask get their VCC bit updated.
    void
    Inst_VOPC__V_CMP_NEQ_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isNotEqual = lhs[idx] != rhs[idx];
            vcc.setBit(idx, isNotEqual ? 1 : 0);
        }

        vcc.write();
    }

    // Forwards iFmt and the mnemonic "v_cmp_nlt_f32" to the Inst_VOPC base,
    // then sets the ALU and F32 flags on this instruction.
    Inst_VOPC__V_CMP_NLT_F32::Inst_VOPC__V_CMP_NLT_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_nlt_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_NLT_F32

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMP_NLT_F32::~Inst_VOPC__V_CMP_NLT_F32()
    {
    } // ~Inst_VOPC__V_CMP_NLT_F32

    // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_NLT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to !(SRC0 < VSRC1).
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLess = lhs[idx] < rhs[idx];
            cmpMask.setBit(idx, isLess ? 0 : 1);
        }

        cmpMask.write();
    }

    // Forwards iFmt and the mnemonic "v_cmp_tru_f32" to the Inst_VOPC base,
    // then sets the ALU and F32 flags on this instruction.
    Inst_VOPC__V_CMP_TRU_F32::Inst_VOPC__V_CMP_TRU_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_tru_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMP_TRU_F32

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMP_TRU_F32::~Inst_VOPC__V_CMP_TRU_F32()
    {
    } // ~Inst_VOPC__V_CMP_TRU_F32

    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_TRU_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets the VCC bit of every active lane to 1; no sources are read.
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            cmpMask.setBit(idx, 1);
        }

        cmpMask.write();
    }

    // Forwards iFmt and the mnemonic "v_cmpx_f_f32" to the Inst_VOPC base,
    // then sets the ALU and F32 flags on this instruction.
    Inst_VOPC__V_CMPX_F_F32::Inst_VOPC__V_CMPX_F_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_f_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMPX_F_F32

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMPX_F_F32::~Inst_VOPC__V_CMPX_F_F32()
    {
    } // ~Inst_VOPC__V_CMPX_F_F32

    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_F_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets the VCC bit of every active lane to 0, then copies the
        // resulting mask into the wavefront's execution mask.
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            cmpMask.setBit(idx, 0);
        }

        cmpMask.write();
        wave->execMask() = cmpMask.rawData();
    }

    // Forwards iFmt and the mnemonic "v_cmpx_lt_f32" to the Inst_VOPC base,
    // then sets the ALU and F32 flags on this instruction.
    Inst_VOPC__V_CMPX_LT_F32::Inst_VOPC__V_CMPX_LT_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_lt_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMPX_LT_F32

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMPX_LT_F32::~Inst_VOPC__V_CMPX_LT_F32()
    {
    } // ~Inst_VOPC__V_CMPX_LT_F32

    // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_LT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to (SRC0 < VSRC1), then copies the
        // resulting mask into the wavefront's execution mask.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLess = lhs[idx] < rhs[idx];
            cmpMask.setBit(idx, isLess ? 1 : 0);
        }

        cmpMask.write();
        wave->execMask() = cmpMask.rawData();
    }

    // Forwards iFmt and the mnemonic "v_cmpx_eq_f32" to the Inst_VOPC base,
    // then sets the ALU and F32 flags on this instruction.
    Inst_VOPC__V_CMPX_EQ_F32::Inst_VOPC__V_CMPX_EQ_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_eq_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMPX_EQ_F32

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMPX_EQ_F32::~Inst_VOPC__V_CMPX_EQ_F32()
    {
    } // ~Inst_VOPC__V_CMPX_EQ_F32

    // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_EQ_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to (SRC0 == VSRC1), then copies
        // the resulting mask into the wavefront's execution mask.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isEqual = lhs[idx] == rhs[idx];
            cmpMask.setBit(idx, isEqual ? 1 : 0);
        }

        cmpMask.write();
        wave->execMask() = cmpMask.rawData();
    }

    // Forwards iFmt and the mnemonic "v_cmpx_le_f32" to the Inst_VOPC base,
    // then sets the ALU and F32 flags on this instruction.
    Inst_VOPC__V_CMPX_LE_F32::Inst_VOPC__V_CMPX_LE_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_le_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMPX_LE_F32

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMPX_LE_F32::~Inst_VOPC__V_CMPX_LE_F32()
    {
    } // ~Inst_VOPC__V_CMPX_LE_F32

    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_LE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to (SRC0 <= VSRC1), then copies
        // the resulting mask into the wavefront's execution mask.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLessEq = lhs[idx] <= rhs[idx];
            cmpMask.setBit(idx, isLessEq ? 1 : 0);
        }

        cmpMask.write();
        wave->execMask() = cmpMask.rawData();
    }

    // Forwards iFmt and the mnemonic "v_cmpx_gt_f32" to the Inst_VOPC base,
    // then sets the ALU and F32 flags on this instruction.
    Inst_VOPC__V_CMPX_GT_F32::Inst_VOPC__V_CMPX_GT_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_gt_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMPX_GT_F32

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMPX_GT_F32::~Inst_VOPC__V_CMPX_GT_F32()
    {
    } // ~Inst_VOPC__V_CMPX_GT_F32

    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_GT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to (SRC0 > VSRC1), then copies the
        // resulting mask into the wavefront's execution mask.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isGreater = lhs[idx] > rhs[idx];
            cmpMask.setBit(idx, isGreater ? 1 : 0);
        }

        cmpMask.write();
        wave->execMask() = cmpMask.rawData();
    }

    // Forwards iFmt and the mnemonic "v_cmpx_lg_f32" to the Inst_VOPC base,
    // then sets the ALU and F32 flags on this instruction.
    Inst_VOPC__V_CMPX_LG_F32::Inst_VOPC__V_CMPX_LG_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_lg_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMPX_LG_F32

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMPX_LG_F32::~Inst_VOPC__V_CMPX_LG_F32()
    {
    } // ~Inst_VOPC__V_CMPX_LG_F32

    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_LG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to (SRC0 < VSRC1 || SRC0 > VSRC1),
        // then copies the resulting mask into the wavefront's execution
        // mask.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLess = lhs[idx] < rhs[idx];
            const bool isGreater = lhs[idx] > rhs[idx];
            cmpMask.setBit(idx, (isLess || isGreater) ? 1 : 0);
        }

        cmpMask.write();
        wave->execMask() = cmpMask.rawData();
    }

    // Forwards iFmt and the mnemonic "v_cmpx_ge_f32" to the Inst_VOPC base,
    // then sets the ALU and F32 flags on this instruction.
    Inst_VOPC__V_CMPX_GE_F32::Inst_VOPC__V_CMPX_GE_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ge_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMPX_GE_F32

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMPX_GE_F32::~Inst_VOPC__V_CMPX_GE_F32()
    {
    } // ~Inst_VOPC__V_CMPX_GE_F32

    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_GE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to (SRC0 >= VSRC1), then copies
        // the resulting mask into the wavefront's execution mask.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isGreaterEq = lhs[idx] >= rhs[idx];
            cmpMask.setBit(idx, isGreaterEq ? 1 : 0);
        }

        cmpMask.write();
        wave->execMask() = cmpMask.rawData();
    }

    // Forwards iFmt and the mnemonic "v_cmpx_o_f32" to the Inst_VOPC base,
    // then sets the ALU and F32 flags on this instruction.
    Inst_VOPC__V_CMPX_O_F32::Inst_VOPC__V_CMPX_O_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_o_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMPX_O_F32

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMPX_O_F32::~Inst_VOPC__V_CMPX_O_F32()
    {
    } // ~Inst_VOPC__V_CMPX_O_F32

    // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC
    // encoding.
    void
    Inst_VOPC__V_CMPX_O_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to 1 when neither SRC0 nor VSRC1
        // is NaN, then copies the resulting mask into the wavefront's
        // execution mask.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool anyNan = std::isnan(lhs[idx])
                || std::isnan(rhs[idx]);
            cmpMask.setBit(idx, anyNan ? 0 : 1);
        }

        cmpMask.write();
        wave->execMask() = cmpMask.rawData();
    }

    // Forwards iFmt and the mnemonic "v_cmpx_u_f32" to the Inst_VOPC base,
    // then sets the ALU and F32 flags on this instruction.
    Inst_VOPC__V_CMPX_U_F32::Inst_VOPC__V_CMPX_U_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_u_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMPX_U_F32

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMPX_U_F32::~Inst_VOPC__V_CMPX_U_F32()
    {
    } // ~Inst_VOPC__V_CMPX_U_F32

    // EXEC,D.u64[threadID] = (isNan(S0)  ||  isNan(S1)); D = VCC in VOPC
    // encoding.
    void
    Inst_VOPC__V_CMPX_U_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to 1 when SRC0 or VSRC1 is NaN,
        // then copies the resulting mask into the wavefront's execution
        // mask.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool anyNan = std::isnan(lhs[idx])
                || std::isnan(rhs[idx]);
            cmpMask.setBit(idx, anyNan ? 1 : 0);
        }

        cmpMask.write();
        wave->execMask() = cmpMask.rawData();
    }

    // Forwards iFmt and the mnemonic "v_cmpx_nge_f32" to the Inst_VOPC base,
    // then sets the ALU and F32 flags on this instruction.
    Inst_VOPC__V_CMPX_NGE_F32::Inst_VOPC__V_CMPX_NGE_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_nge_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMPX_NGE_F32

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMPX_NGE_F32::~Inst_VOPC__V_CMPX_NGE_F32()
    {
    } // ~Inst_VOPC__V_CMPX_NGE_F32

    // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NGE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to !(SRC0 >= VSRC1), then copies
        // the resulting mask into the wavefront's execution mask.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isGreaterEq = lhs[idx] >= rhs[idx];
            cmpMask.setBit(idx, isGreaterEq ? 0 : 1);
        }

        cmpMask.write();
        wave->execMask() = cmpMask.rawData();
    }

    // Forwards iFmt and the mnemonic "v_cmpx_nlg_f32" to the Inst_VOPC base,
    // then sets the ALU and F32 flags on this instruction.
    Inst_VOPC__V_CMPX_NLG_F32::Inst_VOPC__V_CMPX_NLG_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_nlg_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMPX_NLG_F32

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMPX_NLG_F32::~Inst_VOPC__V_CMPX_NLG_F32()
    {
    } // ~Inst_VOPC__V_CMPX_NLG_F32

    // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NLG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to !(SRC0 < VSRC1 || SRC0 > VSRC1),
        // then copies the resulting mask into the wavefront's execution
        // mask.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLess = lhs[idx] < rhs[idx];
            const bool isGreater = lhs[idx] > rhs[idx];
            cmpMask.setBit(idx, (isLess || isGreater) ? 0 : 1);
        }

        cmpMask.write();
        wave->execMask() = cmpMask.rawData();
    }

    // Forwards iFmt and the mnemonic "v_cmpx_ngt_f32" to the Inst_VOPC base,
    // then sets the ALU and F32 flags on this instruction.
    Inst_VOPC__V_CMPX_NGT_F32::Inst_VOPC__V_CMPX_NGT_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ngt_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMPX_NGT_F32

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMPX_NGT_F32::~Inst_VOPC__V_CMPX_NGT_F32()
    {
    } // ~Inst_VOPC__V_CMPX_NGT_F32

    // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NGT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to !(SRC0 > VSRC1), then copies
        // the resulting mask into the wavefront's execution mask.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isGreater = lhs[idx] > rhs[idx];
            cmpMask.setBit(idx, isGreater ? 0 : 1);
        }

        cmpMask.write();
        wave->execMask() = cmpMask.rawData();
    }

    // Forwards iFmt and the mnemonic "v_cmpx_nle_f32" to the Inst_VOPC base,
    // then sets the ALU and F32 flags on this instruction.
    Inst_VOPC__V_CMPX_NLE_F32::Inst_VOPC__V_CMPX_NLE_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_nle_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMPX_NLE_F32

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMPX_NLE_F32::~Inst_VOPC__V_CMPX_NLE_F32()
    {
    } // ~Inst_VOPC__V_CMPX_NLE_F32

    // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NLE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to !(SRC0 <= VSRC1), then copies
        // the resulting mask into the wavefront's execution mask.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLessEq = lhs[idx] <= rhs[idx];
            cmpMask.setBit(idx, isLessEq ? 0 : 1);
        }

        cmpMask.write();
        wave->execMask() = cmpMask.rawData();
    }

    // Forwards iFmt and the mnemonic "v_cmpx_neq_f32" to the Inst_VOPC base,
    // then sets the ALU and F32 flags on this instruction.
    Inst_VOPC__V_CMPX_NEQ_F32::Inst_VOPC__V_CMPX_NEQ_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_neq_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMPX_NEQ_F32

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMPX_NEQ_F32::~Inst_VOPC__V_CMPX_NEQ_F32()
    {
    } // ~Inst_VOPC__V_CMPX_NEQ_F32

    // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NEQ_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to !(SRC0 == VSRC1), then copies
        // the resulting mask into the wavefront's execution mask.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            // Only lanes enabled in the execution mask receive a result bit.
            if (wf->execMask(lane)) {
                vcc.setBit(lane, !(src0[lane] == src1[lane]) ? 1 : 0);
            }
        }

        vcc.write();
        // As a CMPX instruction the result must also be written to EXEC,
        // exactly as the other V_CMPX_* handlers in this file do.
        wf->execMask() = vcc.rawData();
    }

    // Forwards iFmt and the mnemonic "v_cmpx_nlt_f32" to the Inst_VOPC base,
    // then sets the ALU and F32 flags on this instruction.
    Inst_VOPC__V_CMPX_NLT_F32::Inst_VOPC__V_CMPX_NLT_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_nlt_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMPX_NLT_F32

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMPX_NLT_F32::~Inst_VOPC__V_CMPX_NLT_F32()
    {
    } // ~Inst_VOPC__V_CMPX_NLT_F32

    // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NLT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to !(SRC0 < VSRC1), then copies
        // the resulting mask into the wavefront's execution mask.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLess = lhs[idx] < rhs[idx];
            cmpMask.setBit(idx, isLess ? 0 : 1);
        }

        cmpMask.write();
        wave->execMask() = cmpMask.rawData();
    }

    // Forwards iFmt and the mnemonic "v_cmpx_tru_f32" to the Inst_VOPC base,
    // then sets the ALU and F32 flags on this instruction.
    Inst_VOPC__V_CMPX_TRU_F32::Inst_VOPC__V_CMPX_TRU_F32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_tru_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOPC__V_CMPX_TRU_F32

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMPX_TRU_F32::~Inst_VOPC__V_CMPX_TRU_F32()
    {
    } // ~Inst_VOPC__V_CMPX_TRU_F32

    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_TRU_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets the VCC bit of every active lane to 1, then copies the
        // resulting mask into the wavefront's execution mask.
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            cmpMask.setBit(idx, 1);
        }

        cmpMask.write();
        wave->execMask() = cmpMask.rawData();
    }

    // Forwards iFmt and the mnemonic "v_cmp_f_f64" to the Inst_VOPC base,
    // then sets the ALU and F64 flags on this instruction.
    Inst_VOPC__V_CMP_F_F64::Inst_VOPC__V_CMP_F_F64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_f_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOPC__V_CMP_F_F64

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMP_F_F64::~Inst_VOPC__V_CMP_F_F64()
    {
    } // ~Inst_VOPC__V_CMP_F_F64

    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_F_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets the VCC bit of every active lane to 0; no sources are read.
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            cmpMask.setBit(idx, 0);
        }

        cmpMask.write();
    }

    // Forwards iFmt and the mnemonic "v_cmp_lt_f64" to the Inst_VOPC base,
    // then sets the ALU and F64 flags on this instruction.
    Inst_VOPC__V_CMP_LT_F64::Inst_VOPC__V_CMP_LT_F64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_lt_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOPC__V_CMP_LT_F64

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMP_LT_F64::~Inst_VOPC__V_CMP_LT_F64()
    {
    } // ~Inst_VOPC__V_CMP_LT_F64

    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_LT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to (SRC0 < VSRC1).
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLess = lhs[idx] < rhs[idx];
            cmpMask.setBit(idx, isLess ? 1 : 0);
        }

        cmpMask.write();
    }

    // Forwards iFmt and the mnemonic "v_cmp_eq_f64" to the Inst_VOPC base,
    // then sets the ALU and F64 flags on this instruction.
    Inst_VOPC__V_CMP_EQ_F64::Inst_VOPC__V_CMP_EQ_F64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_eq_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOPC__V_CMP_EQ_F64

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMP_EQ_F64::~Inst_VOPC__V_CMP_EQ_F64()
    {
    } // ~Inst_VOPC__V_CMP_EQ_F64

    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_EQ_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to (SRC0 == VSRC1).
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isEqual = lhs[idx] == rhs[idx];
            cmpMask.setBit(idx, isEqual ? 1 : 0);
        }

        cmpMask.write();
    }

    // Forwards iFmt and the mnemonic "v_cmp_le_f64" to the Inst_VOPC base,
    // then sets the ALU and F64 flags on this instruction.
    Inst_VOPC__V_CMP_LE_F64::Inst_VOPC__V_CMP_LE_F64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_le_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOPC__V_CMP_LE_F64

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMP_LE_F64::~Inst_VOPC__V_CMP_LE_F64()
    {
    } // ~Inst_VOPC__V_CMP_LE_F64

    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_LE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to (SRC0 <= VSRC1).
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLessEq = lhs[idx] <= rhs[idx];
            cmpMask.setBit(idx, isLessEq ? 1 : 0);
        }

        cmpMask.write();
    }

    // Forwards iFmt and the mnemonic "v_cmp_gt_f64" to the Inst_VOPC base,
    // then sets the ALU and F64 flags on this instruction.
    Inst_VOPC__V_CMP_GT_F64::Inst_VOPC__V_CMP_GT_F64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_gt_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOPC__V_CMP_GT_F64

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMP_GT_F64::~Inst_VOPC__V_CMP_GT_F64()
    {
    } // ~Inst_VOPC__V_CMP_GT_F64

    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_GT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to (SRC0 > VSRC1).
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isGreater = lhs[idx] > rhs[idx];
            cmpMask.setBit(idx, isGreater ? 1 : 0);
        }

        cmpMask.write();
    }

    // Forwards iFmt and the mnemonic "v_cmp_lg_f64" to the Inst_VOPC base,
    // then sets the ALU and F64 flags on this instruction.
    Inst_VOPC__V_CMP_LG_F64::Inst_VOPC__V_CMP_LG_F64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_lg_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOPC__V_CMP_LG_F64

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMP_LG_F64::~Inst_VOPC__V_CMP_LG_F64()
    {
    } // ~Inst_VOPC__V_CMP_LG_F64

    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_LG_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to (SRC0 < VSRC1 || SRC0 > VSRC1).
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLess = lhs[idx] < rhs[idx];
            const bool isGreater = lhs[idx] > rhs[idx];
            cmpMask.setBit(idx, (isLess || isGreater) ? 1 : 0);
        }

        cmpMask.write();
    }

    // Forwards iFmt and the mnemonic "v_cmp_ge_f64" to the Inst_VOPC base,
    // then sets the ALU and F64 flags on this instruction.
    Inst_VOPC__V_CMP_GE_F64::Inst_VOPC__V_CMP_GE_F64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ge_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOPC__V_CMP_GE_F64

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMP_GE_F64::~Inst_VOPC__V_CMP_GE_F64()
    {
    } // ~Inst_VOPC__V_CMP_GE_F64

    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_GE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to (SRC0 >= VSRC1).
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isGreaterEq = lhs[idx] >= rhs[idx];
            cmpMask.setBit(idx, isGreaterEq ? 1 : 0);
        }

        cmpMask.write();
    }

    // Forwards iFmt and the mnemonic "v_cmp_o_f64" to the Inst_VOPC base,
    // then sets the ALU and F64 flags on this instruction.
    Inst_VOPC__V_CMP_O_F64::Inst_VOPC__V_CMP_O_F64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_o_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOPC__V_CMP_O_F64

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMP_O_F64::~Inst_VOPC__V_CMP_O_F64()
    {
    } // ~Inst_VOPC__V_CMP_O_F64

    // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_O_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to 1 when neither SRC0 nor VSRC1
        // is NaN, and to 0 otherwise.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool anyNan = std::isnan(lhs[idx])
                || std::isnan(rhs[idx]);
            cmpMask.setBit(idx, anyNan ? 0 : 1);
        }

        cmpMask.write();
    }

    // Forwards iFmt and the mnemonic "v_cmp_u_f64" to the Inst_VOPC base,
    // then sets the ALU and F64 flags on this instruction.
    Inst_VOPC__V_CMP_U_F64::Inst_VOPC__V_CMP_U_F64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_u_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOPC__V_CMP_U_F64

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMP_U_F64::~Inst_VOPC__V_CMP_U_F64()
    {
    } // ~Inst_VOPC__V_CMP_U_F64

    // D.u64[threadID] = (isNan(S0)  ||  isNan(S1)); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_U_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to 1 when SRC0 or VSRC1 is NaN,
        // and to 0 otherwise.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool anyNan = std::isnan(lhs[idx])
                || std::isnan(rhs[idx]);
            cmpMask.setBit(idx, anyNan ? 1 : 0);
        }

        cmpMask.write();
    }

    // Forwards iFmt and the mnemonic "v_cmp_nge_f64" to the Inst_VOPC base,
    // then sets the ALU and F64 flags on this instruction.
    Inst_VOPC__V_CMP_NGE_F64::Inst_VOPC__V_CMP_NGE_F64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_nge_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOPC__V_CMP_NGE_F64

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMP_NGE_F64::~Inst_VOPC__V_CMP_NGE_F64()
    {
    } // ~Inst_VOPC__V_CMP_NGE_F64

    // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_NGE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to !(SRC0 >= VSRC1).
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isGreaterEq = lhs[idx] >= rhs[idx];
            cmpMask.setBit(idx, isGreaterEq ? 0 : 1);
        }

        cmpMask.write();
    }

    // Forwards iFmt and the mnemonic "v_cmp_nlg_f64" to the Inst_VOPC base,
    // then sets the ALU and F64 flags on this instruction.
    Inst_VOPC__V_CMP_NLG_F64::Inst_VOPC__V_CMP_NLG_F64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_nlg_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOPC__V_CMP_NLG_F64

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMP_NLG_F64::~Inst_VOPC__V_CMP_NLG_F64()
    {
    } // ~Inst_VOPC__V_CMP_NLG_F64

    // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_NLG_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to
        // !(SRC0 < VSRC1 || SRC0 > VSRC1).
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLess = lhs[idx] < rhs[idx];
            const bool isGreater = lhs[idx] > rhs[idx];
            cmpMask.setBit(idx, (isLess || isGreater) ? 0 : 1);
        }

        cmpMask.write();
    }

    // Forwards iFmt and the mnemonic "v_cmp_ngt_f64" to the Inst_VOPC base,
    // then sets the ALU and F64 flags on this instruction.
    Inst_VOPC__V_CMP_NGT_F64::Inst_VOPC__V_CMP_NGT_F64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ngt_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOPC__V_CMP_NGT_F64

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMP_NGT_F64::~Inst_VOPC__V_CMP_NGT_F64()
    {
    } // ~Inst_VOPC__V_CMP_NGT_F64

    // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_NGT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to !(SRC0 > VSRC1).
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isGreater = lhs[idx] > rhs[idx];
            cmpMask.setBit(idx, isGreater ? 0 : 1);
        }

        cmpMask.write();
    }

    // Forwards iFmt and the mnemonic "v_cmp_nle_f64" to the Inst_VOPC base,
    // then sets the ALU and F64 flags on this instruction.
    Inst_VOPC__V_CMP_NLE_F64::Inst_VOPC__V_CMP_NLE_F64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_nle_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOPC__V_CMP_NLE_F64

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMP_NLE_F64::~Inst_VOPC__V_CMP_NLE_F64()
    {
    } // ~Inst_VOPC__V_CMP_NLE_F64

    // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_NLE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to !(SRC0 <= VSRC1).
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLessEq = lhs[idx] <= rhs[idx];
            cmpMask.setBit(idx, isLessEq ? 0 : 1);
        }

        cmpMask.write();
    }

    // Forwards iFmt and the mnemonic "v_cmp_neq_f64" to the Inst_VOPC base,
    // then sets the ALU and F64 flags on this instruction.
    Inst_VOPC__V_CMP_NEQ_F64::Inst_VOPC__V_CMP_NEQ_F64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_neq_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOPC__V_CMP_NEQ_F64

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMP_NEQ_F64::~Inst_VOPC__V_CMP_NEQ_F64()
    {
    } // ~Inst_VOPC__V_CMP_NEQ_F64

    // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_NEQ_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to !(SRC0 == VSRC1).
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isEqual = lhs[idx] == rhs[idx];
            cmpMask.setBit(idx, isEqual ? 0 : 1);
        }

        cmpMask.write();
    }

    // Forwards iFmt and the mnemonic "v_cmp_nlt_f64" to the Inst_VOPC base,
    // then sets the ALU and F64 flags on this instruction.
    Inst_VOPC__V_CMP_NLT_F64::Inst_VOPC__V_CMP_NLT_F64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_nlt_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOPC__V_CMP_NLT_F64

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMP_NLT_F64::~Inst_VOPC__V_CMP_NLT_F64()
    {
    } // ~Inst_VOPC__V_CMP_NLT_F64

    // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_NLT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets each active lane's VCC bit to !(SRC0 < VSRC1).
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLess = lhs[idx] < rhs[idx];
            cmpMask.setBit(idx, isLess ? 0 : 1);
        }

        cmpMask.write();
    }

    // Forwards iFmt and the mnemonic "v_cmp_tru_f64" to the Inst_VOPC base,
    // then sets the ALU and F64 flags on this instruction.
    Inst_VOPC__V_CMP_TRU_F64::Inst_VOPC__V_CMP_TRU_F64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_tru_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOPC__V_CMP_TRU_F64

    // Empty destructor body: no cleanup is performed here.
    Inst_VOPC__V_CMP_TRU_F64::~Inst_VOPC__V_CMP_TRU_F64()
    {
    } // ~Inst_VOPC__V_CMP_TRU_F64

    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_TRU_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets the VCC bit of every active lane to 1; no sources are read.
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 cmpMask(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes disabled in the execution mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            cmpMask.setBit(idx, 1);
        }

        cmpMask.write();
    }

    // Builds the "v_cmpx_f_f64" VOPC instruction from its decoded format
    // and sets the ALU and F64 flags on it.
    Inst_VOPC__V_CMPX_F_F64::Inst_VOPC__V_CMPX_F_F64(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmpx_f_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // Destructor for v_cmpx_f_f64; the body is intentionally empty.
    Inst_VOPC__V_CMPX_F_F64::~Inst_VOPC__V_CMPX_F_F64()
    {
    }

    // v_cmpx_f_f64: EXEC,D.u64[threadID] = 0, where D is VCC in the VOPC
    // encoding. Each lane enabled in the EXEC mask gets its VCC bit set
    // to 0; VCC is written back and its raw value then replaces the
    // wavefront's EXEC mask.
    void
    Inst_VOPC__V_CMPX_F_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            vcc.setBit(idx, 0);
        }

        vcc.write();
        wave->execMask() = vcc.rawData();
    }

    // Builds the "v_cmpx_lt_f64" VOPC instruction from its decoded format
    // and sets the ALU and F64 flags on it.
    Inst_VOPC__V_CMPX_LT_F64::Inst_VOPC__V_CMPX_LT_F64(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmpx_lt_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // Destructor for v_cmpx_lt_f64; the body is intentionally empty.
    Inst_VOPC__V_CMPX_LT_F64::~Inst_VOPC__V_CMPX_LT_F64()
    {
    }

    // v_cmpx_lt_f64: EXEC,D.u64[threadID] = (S0 < S1), where D is VCC in
    // the VOPC encoding. Lanes enabled in the EXEC mask get their VCC bit
    // set from the comparison; VCC is written back and its raw value then
    // replaces the wavefront's EXEC mask.
    void
    Inst_VOPC__V_CMPX_LT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLess = lhs[idx] < rhs[idx];
            vcc.setBit(idx, isLess ? 1 : 0);
        }

        vcc.write();
        wave->execMask() = vcc.rawData();
    }

    // Builds the "v_cmpx_eq_f64" VOPC instruction from its decoded format
    // and sets the ALU and F64 flags on it.
    Inst_VOPC__V_CMPX_EQ_F64::Inst_VOPC__V_CMPX_EQ_F64(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmpx_eq_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // Destructor for v_cmpx_eq_f64; the body is intentionally empty.
    Inst_VOPC__V_CMPX_EQ_F64::~Inst_VOPC__V_CMPX_EQ_F64()
    {
    }

    // v_cmpx_eq_f64: EXEC,D.u64[threadID] = (S0 == S1), where D is VCC in
    // the VOPC encoding. Lanes enabled in the EXEC mask get their VCC bit
    // set from the comparison; VCC is written back and its raw value then
    // replaces the wavefront's EXEC mask.
    void
    Inst_VOPC__V_CMPX_EQ_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isEqual = lhs[idx] == rhs[idx];
            vcc.setBit(idx, isEqual ? 1 : 0);
        }

        vcc.write();
        wave->execMask() = vcc.rawData();
    }

    // Builds the "v_cmpx_le_f64" VOPC instruction from its decoded format
    // and sets the ALU and F64 flags on it.
    Inst_VOPC__V_CMPX_LE_F64::Inst_VOPC__V_CMPX_LE_F64(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmpx_le_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // Destructor for v_cmpx_le_f64; the body is intentionally empty.
    Inst_VOPC__V_CMPX_LE_F64::~Inst_VOPC__V_CMPX_LE_F64()
    {
    }

    // v_cmpx_le_f64: EXEC,D.u64[threadID] = (S0 <= S1), where D is VCC in
    // the VOPC encoding. Lanes enabled in the EXEC mask get their VCC bit
    // set from the comparison; the raw VCC value then replaces the
    // wavefront's EXEC mask and VCC is written back.
    void
    Inst_VOPC__V_CMPX_LE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLessEq = lhs[idx] <= rhs[idx];
            vcc.setBit(idx, isLessEq ? 1 : 0);
        }

        wave->execMask() = vcc.rawData();
        vcc.write();
    }

    // Builds the "v_cmpx_gt_f64" VOPC instruction from its decoded format
    // and sets the ALU and F64 flags on it.
    Inst_VOPC__V_CMPX_GT_F64::Inst_VOPC__V_CMPX_GT_F64(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmpx_gt_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // Destructor for v_cmpx_gt_f64; the body is intentionally empty.
    Inst_VOPC__V_CMPX_GT_F64::~Inst_VOPC__V_CMPX_GT_F64()
    {
    }

    // v_cmpx_gt_f64: EXEC,D.u64[threadID] = (S0 > S1), where D is VCC in
    // the VOPC encoding. Lanes enabled in the EXEC mask get their VCC bit
    // set from the comparison; the raw VCC value then replaces the
    // wavefront's EXEC mask and VCC is written back.
    void
    Inst_VOPC__V_CMPX_GT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isGreater = lhs[idx] > rhs[idx];
            vcc.setBit(idx, isGreater ? 1 : 0);
        }

        wave->execMask() = vcc.rawData();
        vcc.write();
    }

    // Builds the "v_cmpx_lg_f64" VOPC instruction from its decoded format
    // and sets the ALU and F64 flags on it.
    Inst_VOPC__V_CMPX_LG_F64::Inst_VOPC__V_CMPX_LG_F64(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmpx_lg_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // Destructor for v_cmpx_lg_f64; the body is intentionally empty.
    Inst_VOPC__V_CMPX_LG_F64::~Inst_VOPC__V_CMPX_LG_F64()
    {
    }

    // v_cmpx_lg_f64: EXEC,D.u64[threadID] = (S0 <> S1), where D is VCC in
    // the VOPC encoding. "<>" is evaluated as (S0 < S1 || S0 > S1). Lanes
    // enabled in the EXEC mask get their VCC bit set from that test; the
    // raw VCC value then replaces the wavefront's EXEC mask and VCC is
    // written back.
    void
    Inst_VOPC__V_CMPX_LG_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool lessOrGreater = lhs[idx] < rhs[idx]
                || lhs[idx] > rhs[idx];
            vcc.setBit(idx, lessOrGreater ? 1 : 0);
        }

        wave->execMask() = vcc.rawData();
        vcc.write();
    }

    // Builds the "v_cmpx_ge_f64" VOPC instruction from its decoded format
    // and sets the ALU and F64 flags on it.
    Inst_VOPC__V_CMPX_GE_F64::Inst_VOPC__V_CMPX_GE_F64(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmpx_ge_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // Destructor for v_cmpx_ge_f64; the body is intentionally empty.
    Inst_VOPC__V_CMPX_GE_F64::~Inst_VOPC__V_CMPX_GE_F64()
    {
    }

    // v_cmpx_ge_f64: EXEC,D.u64[threadID] = (S0 >= S1), where D is VCC in
    // the VOPC encoding. Lanes enabled in the EXEC mask get their VCC bit
    // set from the comparison; the raw VCC value then replaces the
    // wavefront's EXEC mask and VCC is written back.
    void
    Inst_VOPC__V_CMPX_GE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isGreaterEq = lhs[idx] >= rhs[idx];
            vcc.setBit(idx, isGreaterEq ? 1 : 0);
        }

        wave->execMask() = vcc.rawData();
        vcc.write();
    }

    // Builds the "v_cmpx_o_f64" VOPC instruction from its decoded format
    // and sets the ALU and F64 flags on it.
    Inst_VOPC__V_CMPX_O_F64::Inst_VOPC__V_CMPX_O_F64(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmpx_o_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // Destructor for v_cmpx_o_f64; the body is intentionally empty.
    Inst_VOPC__V_CMPX_O_F64::~Inst_VOPC__V_CMPX_O_F64()
    {
    }

    // v_cmpx_o_f64: EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)),
    // where D is VCC in the VOPC encoding. Lanes enabled in the EXEC mask
    // get their VCC bit set to 1 when neither source is NaN; the raw VCC
    // value then replaces the wavefront's EXEC mask and VCC is written
    // back.
    void
    Inst_VOPC__V_CMPX_O_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool anyNan = std::isnan(lhs[idx])
                || std::isnan(rhs[idx]);
            vcc.setBit(idx, anyNan ? 0 : 1);
        }

        wave->execMask() = vcc.rawData();
        vcc.write();
    }

    // Builds the "v_cmpx_u_f64" VOPC instruction from its decoded format
    // and sets the ALU and F64 flags on it.
    Inst_VOPC__V_CMPX_U_F64::Inst_VOPC__V_CMPX_U_F64(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmpx_u_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // Destructor for v_cmpx_u_f64; the body is intentionally empty.
    Inst_VOPC__V_CMPX_U_F64::~Inst_VOPC__V_CMPX_U_F64()
    {
    }

    // v_cmpx_u_f64: EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)),
    // where D is VCC in the VOPC encoding. Lanes enabled in the EXEC mask
    // get their VCC bit set to 1 when either source is NaN; the raw VCC
    // value then replaces the wavefront's EXEC mask and VCC is written
    // back.
    void
    Inst_VOPC__V_CMPX_U_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool anyNan = std::isnan(lhs[idx])
                || std::isnan(rhs[idx]);
            vcc.setBit(idx, anyNan ? 1 : 0);
        }

        wave->execMask() = vcc.rawData();
        vcc.write();
    }

    // Builds the "v_cmpx_nge_f64" VOPC instruction from its decoded format
    // and sets the ALU and F64 flags on it.
    Inst_VOPC__V_CMPX_NGE_F64::Inst_VOPC__V_CMPX_NGE_F64(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmpx_nge_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // Destructor for v_cmpx_nge_f64; the body is intentionally empty.
    Inst_VOPC__V_CMPX_NGE_F64::~Inst_VOPC__V_CMPX_NGE_F64()
    {
    }

    // v_cmpx_nge_f64: EXEC,D.u64[threadID] = !(S0 >= S1), where D is VCC
    // in the VOPC encoding. Lanes enabled in the EXEC mask get their VCC
    // bit set from the negated comparison; the raw VCC value then replaces
    // the wavefront's EXEC mask and VCC is written back.
    void
    Inst_VOPC__V_CMPX_NGE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            // Kept as a negated ">=" rather than rewritten as "<".
            const bool notGreaterEq = !(lhs[idx] >= rhs[idx]);
            vcc.setBit(idx, notGreaterEq ? 1 : 0);
        }

        wave->execMask() = vcc.rawData();
        vcc.write();
    }

    // Builds the "v_cmpx_nlg_f64" VOPC instruction from its decoded format
    // and sets the ALU and F64 flags on it.
    Inst_VOPC__V_CMPX_NLG_F64::Inst_VOPC__V_CMPX_NLG_F64(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmpx_nlg_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // Destructor for v_cmpx_nlg_f64; the body is intentionally empty.
    Inst_VOPC__V_CMPX_NLG_F64::~Inst_VOPC__V_CMPX_NLG_F64()
    {
    }

    // v_cmpx_nlg_f64: EXEC,D.u64[threadID] = !(S0 <> S1), where D is VCC
    // in the VOPC encoding. "<>" is evaluated as (S0 < S1 || S0 > S1).
    // Lanes enabled in the EXEC mask get their VCC bit set from the
    // negated test; the raw VCC value then replaces the wavefront's EXEC
    // mask and VCC is written back.
    void
    Inst_VOPC__V_CMPX_NLG_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool lessOrGreater = lhs[idx] < rhs[idx]
                || lhs[idx] > rhs[idx];
            vcc.setBit(idx, !lessOrGreater ? 1 : 0);
        }

        wave->execMask() = vcc.rawData();
        vcc.write();
    }

    // Builds the "v_cmpx_ngt_f64" VOPC instruction from its decoded format
    // and sets the ALU and F64 flags on it.
    Inst_VOPC__V_CMPX_NGT_F64::Inst_VOPC__V_CMPX_NGT_F64(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmpx_ngt_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // Destructor for v_cmpx_ngt_f64; the body is intentionally empty.
    Inst_VOPC__V_CMPX_NGT_F64::~Inst_VOPC__V_CMPX_NGT_F64()
    {
    }

    // v_cmpx_ngt_f64: EXEC,D.u64[threadID] = !(S0 > S1), where D is VCC
    // in the VOPC encoding. Lanes enabled in the EXEC mask get their VCC
    // bit set from the negated comparison; the raw VCC value then replaces
    // the wavefront's EXEC mask and VCC is written back.
    void
    Inst_VOPC__V_CMPX_NGT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            // Kept as a negated ">" rather than rewritten as "<=".
            const bool notGreater = !(lhs[idx] > rhs[idx]);
            vcc.setBit(idx, notGreater ? 1 : 0);
        }

        wave->execMask() = vcc.rawData();
        vcc.write();
    }

    // Builds the "v_cmpx_nle_f64" VOPC instruction from its decoded format
    // and sets the ALU and F64 flags on it.
    Inst_VOPC__V_CMPX_NLE_F64::Inst_VOPC__V_CMPX_NLE_F64(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmpx_nle_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // Destructor for v_cmpx_nle_f64; the body is intentionally empty.
    Inst_VOPC__V_CMPX_NLE_F64::~Inst_VOPC__V_CMPX_NLE_F64()
    {
    }

    // v_cmpx_nle_f64: EXEC,D.u64[threadID] = !(S0 <= S1), where D is VCC
    // in the VOPC encoding. Lanes enabled in the EXEC mask get their VCC
    // bit set from the negated comparison; the raw VCC value then replaces
    // the wavefront's EXEC mask and VCC is written back.
    void
    Inst_VOPC__V_CMPX_NLE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            // Kept as a negated "<=" rather than rewritten as ">".
            const bool notLessEq = !(lhs[idx] <= rhs[idx]);
            vcc.setBit(idx, notLessEq ? 1 : 0);
        }

        wave->execMask() = vcc.rawData();
        vcc.write();
    }

    // Builds the "v_cmpx_neq_f64" VOPC instruction from its decoded format
    // and sets the ALU and F64 flags on it.
    Inst_VOPC__V_CMPX_NEQ_F64::Inst_VOPC__V_CMPX_NEQ_F64(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmpx_neq_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // Destructor for v_cmpx_neq_f64; the body is intentionally empty.
    Inst_VOPC__V_CMPX_NEQ_F64::~Inst_VOPC__V_CMPX_NEQ_F64()
    {
    }

    // v_cmpx_neq_f64: EXEC,D.u64[threadID] = !(S0 == S1), where D is VCC
    // in the VOPC encoding. Lanes enabled in the EXEC mask get their VCC
    // bit set from the inequality test; the raw VCC value then replaces
    // the wavefront's EXEC mask and VCC is written back.
    void
    Inst_VOPC__V_CMPX_NEQ_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool differs = lhs[idx] != rhs[idx];
            vcc.setBit(idx, differs ? 1 : 0);
        }

        wave->execMask() = vcc.rawData();
        vcc.write();
    }

    // Builds the "v_cmpx_nlt_f64" VOPC instruction from its decoded format
    // and sets the ALU and F64 flags on it.
    Inst_VOPC__V_CMPX_NLT_F64::Inst_VOPC__V_CMPX_NLT_F64(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmpx_nlt_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // Destructor for v_cmpx_nlt_f64; the body is intentionally empty.
    Inst_VOPC__V_CMPX_NLT_F64::~Inst_VOPC__V_CMPX_NLT_F64()
    {
    }

    // v_cmpx_nlt_f64: EXEC,D.u64[threadID] = !(S0 < S1), where D is VCC
    // in the VOPC encoding. Lanes enabled in the EXEC mask get their VCC
    // bit set from the negated comparison; the raw VCC value then replaces
    // the wavefront's EXEC mask and VCC is written back.
    void
    Inst_VOPC__V_CMPX_NLT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            // Kept as a negated "<" rather than rewritten as ">=".
            const bool notLess = !(lhs[idx] < rhs[idx]);
            vcc.setBit(idx, notLess ? 1 : 0);
        }

        wave->execMask() = vcc.rawData();
        vcc.write();
    }

    // Builds the "v_cmpx_tru_f64" VOPC instruction from its decoded format
    // and sets the ALU and F64 flags on it.
    Inst_VOPC__V_CMPX_TRU_F64::Inst_VOPC__V_CMPX_TRU_F64(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmpx_tru_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // Destructor for v_cmpx_tru_f64; the body is intentionally empty.
    Inst_VOPC__V_CMPX_TRU_F64::~Inst_VOPC__V_CMPX_TRU_F64()
    {
    }

    // v_cmpx_tru_f64: EXEC,D.u64[threadID] = 1, where D is VCC in the
    // VOPC encoding. Each lane enabled in the EXEC mask gets its VCC bit
    // set to 1; the raw VCC value then replaces the wavefront's EXEC mask
    // and VCC is written back.
    void
    Inst_VOPC__V_CMPX_TRU_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            vcc.setBit(idx, 1);
        }

        wave->execMask() = vcc.rawData();
        vcc.write();
    }

    // Builds the "v_cmp_f_i16" VOPC instruction from its decoded format
    // and sets the ALU flag on it.
    Inst_VOPC__V_CMP_F_I16::Inst_VOPC__V_CMP_F_I16(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmp_f_i16")
    {
        setFlag(ALU);
    }

    // Destructor for v_cmp_f_i16; the body is intentionally empty.
    Inst_VOPC__V_CMP_F_I16::~Inst_VOPC__V_CMP_F_I16()
    {
    }

    // v_cmp_f_i16: D.u64[threadID] = 0, where D is VCC in the VOPC
    // encoding. No source operands are read; each lane enabled in the
    // wavefront's EXEC mask gets its VCC bit set to 0.
    void
    Inst_VOPC__V_CMP_F_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            vcc.setBit(idx, 0);
        }

        vcc.write();
    }

    // Builds the "v_cmp_lt_i16" VOPC instruction from its decoded format
    // and sets the ALU flag on it.
    Inst_VOPC__V_CMP_LT_I16::Inst_VOPC__V_CMP_LT_I16(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmp_lt_i16")
    {
        setFlag(ALU);
    }

    // Destructor for v_cmp_lt_i16; the body is intentionally empty.
    Inst_VOPC__V_CMP_LT_I16::~Inst_VOPC__V_CMP_LT_I16()
    {
    }

    // v_cmp_lt_i16: D.u64[threadID] = (S0 < S1), where D is VCC in the
    // VOPC encoding. Only lanes enabled in the wavefront's EXEC mask have
    // their VCC bit set by this routine.
    void
    Inst_VOPC__V_CMP_LT_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLess = lhs[idx] < rhs[idx];
            vcc.setBit(idx, isLess ? 1 : 0);
        }

        vcc.write();
    }

    // Builds the "v_cmp_eq_i16" VOPC instruction from its decoded format
    // and sets the ALU flag on it.
    Inst_VOPC__V_CMP_EQ_I16::Inst_VOPC__V_CMP_EQ_I16(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmp_eq_i16")
    {
        setFlag(ALU);
    }

    // Destructor for v_cmp_eq_i16; the body is intentionally empty.
    Inst_VOPC__V_CMP_EQ_I16::~Inst_VOPC__V_CMP_EQ_I16()
    {
    }

    // v_cmp_eq_i16: D.u64[threadID] = (S0 == S1), where D is VCC in the
    // VOPC encoding. Only lanes enabled in the wavefront's EXEC mask have
    // their VCC bit set by this routine.
    void
    Inst_VOPC__V_CMP_EQ_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isEqual = lhs[idx] == rhs[idx];
            vcc.setBit(idx, isEqual ? 1 : 0);
        }

        vcc.write();
    }

    // Builds the "v_cmp_le_i16" VOPC instruction from its decoded format
    // and sets the ALU flag on it.
    Inst_VOPC__V_CMP_LE_I16::Inst_VOPC__V_CMP_LE_I16(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmp_le_i16")
    {
        setFlag(ALU);
    }

    // Destructor for v_cmp_le_i16; the body is intentionally empty.
    Inst_VOPC__V_CMP_LE_I16::~Inst_VOPC__V_CMP_LE_I16()
    {
    }

    // v_cmp_le_i16: D.u64[threadID] = (S0 <= S1), where D is VCC in the
    // VOPC encoding. Only lanes enabled in the wavefront's EXEC mask have
    // their VCC bit set by this routine.
    void
    Inst_VOPC__V_CMP_LE_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLessEq = lhs[idx] <= rhs[idx];
            vcc.setBit(idx, isLessEq ? 1 : 0);
        }

        vcc.write();
    }

    // Builds the "v_cmp_gt_i16" VOPC instruction from its decoded format
    // and sets the ALU flag on it.
    Inst_VOPC__V_CMP_GT_I16::Inst_VOPC__V_CMP_GT_I16(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmp_gt_i16")
    {
        setFlag(ALU);
    }

    // Destructor for v_cmp_gt_i16; the body is intentionally empty.
    Inst_VOPC__V_CMP_GT_I16::~Inst_VOPC__V_CMP_GT_I16()
    {
    }

    // v_cmp_gt_i16: D.u64[threadID] = (S0 > S1), where D is VCC in the
    // VOPC encoding. Only lanes enabled in the wavefront's EXEC mask have
    // their VCC bit set by this routine.
    void
    Inst_VOPC__V_CMP_GT_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isGreater = lhs[idx] > rhs[idx];
            vcc.setBit(idx, isGreater ? 1 : 0);
        }

        vcc.write();
    }

    // Builds the "v_cmp_ne_i16" VOPC instruction from its decoded format
    // and sets the ALU flag on it.
    Inst_VOPC__V_CMP_NE_I16::Inst_VOPC__V_CMP_NE_I16(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmp_ne_i16")
    {
        setFlag(ALU);
    }

    // Destructor for v_cmp_ne_i16; the body is intentionally empty.
    Inst_VOPC__V_CMP_NE_I16::~Inst_VOPC__V_CMP_NE_I16()
    {
    }

    // v_cmp_ne_i16: D.u64[threadID] = (S0 <> S1), where D is VCC in the
    // VOPC encoding. "<>" is evaluated here with "!=". Only lanes enabled
    // in the wavefront's EXEC mask have their VCC bit set by this routine.
    void
    Inst_VOPC__V_CMP_NE_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool differs = lhs[idx] != rhs[idx];
            vcc.setBit(idx, differs ? 1 : 0);
        }

        vcc.write();
    }

    // Builds the "v_cmp_ge_i16" VOPC instruction from its decoded format
    // and sets the ALU flag on it.
    Inst_VOPC__V_CMP_GE_I16::Inst_VOPC__V_CMP_GE_I16(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmp_ge_i16")
    {
        setFlag(ALU);
    }

    // Destructor for v_cmp_ge_i16; the body is intentionally empty.
    Inst_VOPC__V_CMP_GE_I16::~Inst_VOPC__V_CMP_GE_I16()
    {
    }

    // v_cmp_ge_i16: D.u64[threadID] = (S0 >= S1), where D is VCC in the
    // VOPC encoding. Only lanes enabled in the wavefront's EXEC mask have
    // their VCC bit set by this routine.
    void
    Inst_VOPC__V_CMP_GE_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isGreaterEq = lhs[idx] >= rhs[idx];
            vcc.setBit(idx, isGreaterEq ? 1 : 0);
        }

        vcc.write();
    }

    // Builds the "v_cmp_t_i16" VOPC instruction from its decoded format
    // and sets the ALU flag on it.
    Inst_VOPC__V_CMP_T_I16::Inst_VOPC__V_CMP_T_I16(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmp_t_i16")
    {
        setFlag(ALU);
    }

    // Destructor for v_cmp_t_i16; the body is intentionally empty.
    Inst_VOPC__V_CMP_T_I16::~Inst_VOPC__V_CMP_T_I16()
    {
    }

    // v_cmp_t_i16: D.u64[threadID] = 1, where D is VCC in the VOPC
    // encoding. No source operands are read; each lane enabled in the
    // wavefront's EXEC mask gets its VCC bit set to 1.
    void
    Inst_VOPC__V_CMP_T_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            vcc.setBit(idx, 1);
        }

        vcc.write();
    }

    // Builds the "v_cmp_f_u16" VOPC instruction from its decoded format
    // and sets the ALU flag on it.
    Inst_VOPC__V_CMP_F_U16::Inst_VOPC__V_CMP_F_U16(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmp_f_u16")
    {
        setFlag(ALU);
    }

    // Destructor for v_cmp_f_u16; the body is intentionally empty.
    Inst_VOPC__V_CMP_F_U16::~Inst_VOPC__V_CMP_F_U16()
    {
    }

    // v_cmp_f_u16: D.u64[threadID] = 0, where D is VCC in the VOPC
    // encoding. No source operands are read; each lane enabled in the
    // wavefront's EXEC mask gets its VCC bit set to 0.
    void
    Inst_VOPC__V_CMP_F_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            vcc.setBit(idx, 0);
        }

        vcc.write();
    }

    // Builds the "v_cmp_lt_u16" VOPC instruction from its decoded format
    // and sets the ALU flag on it.
    Inst_VOPC__V_CMP_LT_U16::Inst_VOPC__V_CMP_LT_U16(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmp_lt_u16")
    {
        setFlag(ALU);
    }

    // Destructor for v_cmp_lt_u16; the body is intentionally empty.
    Inst_VOPC__V_CMP_LT_U16::~Inst_VOPC__V_CMP_LT_U16()
    {
    }

    // v_cmp_lt_u16: D.u64[threadID] = (S0 < S1), where D is VCC in the
    // VOPC encoding. Only lanes enabled in the wavefront's EXEC mask have
    // their VCC bit set by this routine.
    void
    Inst_VOPC__V_CMP_LT_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLess = lhs[idx] < rhs[idx];
            vcc.setBit(idx, isLess ? 1 : 0);
        }

        vcc.write();
    }

    // Builds the "v_cmp_eq_u16" VOPC instruction from its decoded format
    // and sets the ALU flag on it.
    Inst_VOPC__V_CMP_EQ_U16::Inst_VOPC__V_CMP_EQ_U16(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmp_eq_u16")
    {
        setFlag(ALU);
    }

    // Destructor for v_cmp_eq_u16; the body is intentionally empty.
    Inst_VOPC__V_CMP_EQ_U16::~Inst_VOPC__V_CMP_EQ_U16()
    {
    }

    // v_cmp_eq_u16: D.u64[threadID] = (S0 == S1), where D is VCC in the
    // VOPC encoding. Only lanes enabled in the wavefront's EXEC mask have
    // their VCC bit set by this routine.
    void
    Inst_VOPC__V_CMP_EQ_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isEqual = lhs[idx] == rhs[idx];
            vcc.setBit(idx, isEqual ? 1 : 0);
        }

        vcc.write();
    }

    // Builds the "v_cmp_le_u16" VOPC instruction from its decoded format
    // and sets the ALU flag on it.
    Inst_VOPC__V_CMP_LE_U16::Inst_VOPC__V_CMP_LE_U16(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmp_le_u16")
    {
        setFlag(ALU);
    }

    // Destructor for v_cmp_le_u16; the body is intentionally empty.
    Inst_VOPC__V_CMP_LE_U16::~Inst_VOPC__V_CMP_LE_U16()
    {
    }

    // v_cmp_le_u16: D.u64[threadID] = (S0 <= S1), where D is VCC in the
    // VOPC encoding. Only lanes enabled in the wavefront's EXEC mask have
    // their VCC bit set by this routine.
    void
    Inst_VOPC__V_CMP_LE_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLessEq = lhs[idx] <= rhs[idx];
            vcc.setBit(idx, isLessEq ? 1 : 0);
        }

        vcc.write();
    }

    // Builds the "v_cmp_gt_u16" VOPC instruction from its decoded format
    // and sets the ALU flag on it.
    Inst_VOPC__V_CMP_GT_U16::Inst_VOPC__V_CMP_GT_U16(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmp_gt_u16")
    {
        setFlag(ALU);
    }

    // Destructor for v_cmp_gt_u16; the body is intentionally empty.
    Inst_VOPC__V_CMP_GT_U16::~Inst_VOPC__V_CMP_GT_U16()
    {
    }

    // v_cmp_gt_u16: D.u64[threadID] = (S0 > S1), where D is VCC in the
    // VOPC encoding. Only lanes enabled in the wavefront's EXEC mask have
    // their VCC bit set by this routine.
    void
    Inst_VOPC__V_CMP_GT_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isGreater = lhs[idx] > rhs[idx];
            vcc.setBit(idx, isGreater ? 1 : 0);
        }

        vcc.write();
    }

    // Builds the "v_cmp_ne_u16" VOPC instruction from its decoded format
    // and sets the ALU flag on it.
    Inst_VOPC__V_CMP_NE_U16::Inst_VOPC__V_CMP_NE_U16(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmp_ne_u16")
    {
        setFlag(ALU);
    }

    // Destructor for v_cmp_ne_u16; the body is intentionally empty.
    Inst_VOPC__V_CMP_NE_U16::~Inst_VOPC__V_CMP_NE_U16()
    {
    }

    // v_cmp_ne_u16: D.u64[threadID] = (S0 <> S1), where D is VCC in the
    // VOPC encoding. "<>" is evaluated here with "!=". Only lanes enabled
    // in the wavefront's EXEC mask have their VCC bit set by this routine.
    void
    Inst_VOPC__V_CMP_NE_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool differs = lhs[idx] != rhs[idx];
            vcc.setBit(idx, differs ? 1 : 0);
        }

        vcc.write();
    }

    // Builds the "v_cmp_ge_u16" VOPC instruction from its decoded format
    // and sets the ALU flag on it.
    Inst_VOPC__V_CMP_GE_U16::Inst_VOPC__V_CMP_GE_U16(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmp_ge_u16")
    {
        setFlag(ALU);
    }

    // Destructor for v_cmp_ge_u16; the body is intentionally empty.
    Inst_VOPC__V_CMP_GE_U16::~Inst_VOPC__V_CMP_GE_U16()
    {
    }

    // v_cmp_ge_u16: D.u64[threadID] = (S0 >= S1), where D is VCC in the
    // VOPC encoding. Only lanes enabled in the wavefront's EXEC mask have
    // their VCC bit set by this routine.
    void
    Inst_VOPC__V_CMP_GE_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isGreaterEq = lhs[idx] >= rhs[idx];
            vcc.setBit(idx, isGreaterEq ? 1 : 0);
        }

        vcc.write();
    }

    // Builds the "v_cmp_t_u16" VOPC instruction from its decoded format
    // and sets the ALU flag on it.
    Inst_VOPC__V_CMP_T_U16::Inst_VOPC__V_CMP_T_U16(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmp_t_u16")
    {
        setFlag(ALU);
    }

    // Destructor for v_cmp_t_u16; the body is intentionally empty.
    Inst_VOPC__V_CMP_T_U16::~Inst_VOPC__V_CMP_T_U16()
    {
    }

    // v_cmp_t_u16: D.u64[threadID] = 1, where D is VCC in the VOPC
    // encoding. No source operands are read; each lane enabled in the
    // wavefront's EXEC mask gets its VCC bit set to 1.
    void
    Inst_VOPC__V_CMP_T_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            vcc.setBit(idx, 1);
        }

        vcc.write();
    }

    // Builds the "v_cmpx_f_i16" VOPC instruction from its decoded format
    // and sets the ALU flag on it.
    Inst_VOPC__V_CMPX_F_I16::Inst_VOPC__V_CMPX_F_I16(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmpx_f_i16")
    {
        setFlag(ALU);
    }

    // Destructor for v_cmpx_f_i16; the body is intentionally empty.
    Inst_VOPC__V_CMPX_F_I16::~Inst_VOPC__V_CMPX_F_I16()
    {
    }

    // v_cmpx_f_i16: EXEC,D.u64[threadID] = 0, where D is VCC in the VOPC
    // encoding. Each lane enabled in the EXEC mask gets its VCC bit set
    // to 0; the raw VCC value then replaces the wavefront's EXEC mask and
    // VCC is written back.
    void
    Inst_VOPC__V_CMPX_F_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            vcc.setBit(idx, 0);
        }

        wave->execMask() = vcc.rawData();
        vcc.write();
    }

    // Builds the "v_cmpx_lt_i16" VOPC instruction from its decoded format
    // and sets the ALU flag on it.
    Inst_VOPC__V_CMPX_LT_I16::Inst_VOPC__V_CMPX_LT_I16(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmpx_lt_i16")
    {
        setFlag(ALU);
    }

    // Destructor for v_cmpx_lt_i16; the body is intentionally empty.
    Inst_VOPC__V_CMPX_LT_I16::~Inst_VOPC__V_CMPX_LT_I16()
    {
    }

    // v_cmpx_lt_i16: EXEC,D.u64[threadID] = (S0 < S1), where D is VCC in
    // the VOPC encoding. Lanes enabled in the EXEC mask get their VCC bit
    // set from the comparison; the raw VCC value then replaces the
    // wavefront's EXEC mask and VCC is written back.
    void
    Inst_VOPC__V_CMPX_LT_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isLess = lhs[idx] < rhs[idx];
            vcc.setBit(idx, isLess ? 1 : 0);
        }

        wave->execMask() = vcc.rawData();
        vcc.write();
    }

    // Builds the "v_cmpx_eq_i16" VOPC instruction from its decoded format
    // and sets the ALU flag on it.
    Inst_VOPC__V_CMPX_EQ_I16::Inst_VOPC__V_CMPX_EQ_I16(InFmt_VOPC *iFmt) :
        Inst_VOPC(iFmt, "v_cmpx_eq_i16")
    {
        setFlag(ALU);
    }

    // Destructor for v_cmpx_eq_i16; the body is intentionally empty.
    Inst_VOPC__V_CMPX_EQ_I16::~Inst_VOPC__V_CMPX_EQ_I16()
    {
    }

    // v_cmpx_eq_i16: EXEC,D.u64[threadID] = (S0 == S1), where D is VCC in
    // the VOPC encoding. Lanes enabled in the EXEC mask get their VCC bit
    // set from the comparison; the raw VCC value then replaces the
    // wavefront's EXEC mask and VCC is written back.
    void
    Inst_VOPC__V_CMPX_EQ_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Skip lanes that are disabled in the EXEC mask.
            if (!wave->execMask(idx)) {
                continue;
            }

            const bool isEqual = lhs[idx] == rhs[idx];
            vcc.setBit(idx, isEqual ? 1 : 0);
        }

        wave->execMask() = vcc.rawData();
        vcc.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_LE_I16::Inst_VOPC__V_CMPX_LE_I16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_le_i16")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_LE_I16

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_LE_I16::~Inst_VOPC__V_CMPX_LE_I16() = default;

    /**
     * v_cmpx_le_i16 -- EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the I16 compare; the result then replaces the exec mask and is
     * written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_LE_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] <= rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_GT_I16::Inst_VOPC__V_CMPX_GT_I16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_gt_i16")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_GT_I16

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_GT_I16::~Inst_VOPC__V_CMPX_GT_I16() = default;

    /**
     * v_cmpx_gt_i16 -- EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the I16 compare; the result then replaces the exec mask and is
     * written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_GT_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] > rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_NE_I16::Inst_VOPC__V_CMPX_NE_I16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ne_i16")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_NE_I16

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_NE_I16::~Inst_VOPC__V_CMPX_NE_I16() = default;

    /**
     * v_cmpx_ne_i16 -- EXEC,D.u64[threadID] = (S0 <> S1), i.e. not equal;
     * D = VCC in VOPC encoding. Lanes enabled in the exec mask get their
     * result bit set from the I16 compare; the result then replaces the
     * exec mask and is written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_NE_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] != rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_GE_I16::Inst_VOPC__V_CMPX_GE_I16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ge_i16")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_GE_I16

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_GE_I16::~Inst_VOPC__V_CMPX_GE_I16() = default;

    /**
     * v_cmpx_ge_i16 -- EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the I16 compare; the result then replaces the exec mask and is
     * written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_GE_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] >= rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_T_I16::Inst_VOPC__V_CMPX_T_I16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_t_i16")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_T_I16

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_T_I16::~Inst_VOPC__V_CMPX_T_I16() = default;

    /**
     * v_cmpx_t_i16 -- EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
     * No sources are read: every lane enabled in the exec mask gets its
     * result bit set to 1; the result then replaces the exec mask and is
     * written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_T_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            mask.setBit(idx, 1);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_F_U16::Inst_VOPC__V_CMPX_F_U16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_f_u16")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_F_U16

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_F_U16::~Inst_VOPC__V_CMPX_F_U16() = default;

    /**
     * v_cmpx_f_u16 -- EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
     * No sources are read: every lane enabled in the exec mask gets its
     * result bit set to 0; the result then replaces the exec mask and is
     * written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_F_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            mask.setBit(idx, 0);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_LT_U16::Inst_VOPC__V_CMPX_LT_U16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_lt_u16")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_LT_U16

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_LT_U16::~Inst_VOPC__V_CMPX_LT_U16() = default;

    /**
     * v_cmpx_lt_u16 -- EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the U16 compare; the result then replaces the exec mask and is
     * written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_LT_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] < rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_EQ_U16::Inst_VOPC__V_CMPX_EQ_U16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_eq_u16")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_EQ_U16

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_EQ_U16::~Inst_VOPC__V_CMPX_EQ_U16() = default;

    /**
     * v_cmpx_eq_u16 -- EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the U16 compare; the result then replaces the exec mask and is
     * written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_EQ_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] == rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_LE_U16::Inst_VOPC__V_CMPX_LE_U16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_le_u16")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_LE_U16

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_LE_U16::~Inst_VOPC__V_CMPX_LE_U16() = default;

    /**
     * v_cmpx_le_u16 -- EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the U16 compare; the result then replaces the exec mask and is
     * written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_LE_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] <= rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_GT_U16::Inst_VOPC__V_CMPX_GT_U16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_gt_u16")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_GT_U16

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_GT_U16::~Inst_VOPC__V_CMPX_GT_U16() = default;

    /**
     * v_cmpx_gt_u16 -- EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the U16 compare; the result then replaces the exec mask and is
     * written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_GT_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] > rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_NE_U16::Inst_VOPC__V_CMPX_NE_U16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ne_u16")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_NE_U16

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_NE_U16::~Inst_VOPC__V_CMPX_NE_U16() = default;

    /**
     * v_cmpx_ne_u16 -- EXEC,D.u64[threadID] = (S0 <> S1), i.e. not equal;
     * D = VCC in VOPC encoding. Lanes enabled in the exec mask get their
     * result bit set from the U16 compare; the result then replaces the
     * exec mask and is written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_NE_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] != rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_GE_U16::Inst_VOPC__V_CMPX_GE_U16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ge_u16")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_GE_U16

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_GE_U16::~Inst_VOPC__V_CMPX_GE_U16() = default;

    /**
     * v_cmpx_ge_u16 -- EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the U16 compare; the result then replaces the exec mask and is
     * written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_GE_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] >= rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_T_U16::Inst_VOPC__V_CMPX_T_U16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_t_u16")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_T_U16

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_T_U16::~Inst_VOPC__V_CMPX_T_U16() = default;

    /**
     * v_cmpx_t_u16 -- EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
     * No sources are read: every lane enabled in the exec mask gets its
     * result bit set to 1; the result then replaces the exec mask and is
     * written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_T_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            mask.setBit(idx, 1);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMP_F_I32::Inst_VOPC__V_CMP_F_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_f_i32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_F_I32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMP_F_I32::~Inst_VOPC__V_CMP_F_I32() = default;

    /**
     * v_cmp_f_i32 -- D.u64[threadID] = 0; D = VCC in VOPC encoding.
     * No sources are read: every lane enabled in the exec mask gets its
     * result bit set to 0; the result is written out through the
     * REG_VCC_LO operand and the exec mask is left unchanged.
     */
    void
    Inst_VOPC__V_CMP_F_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            mask.setBit(idx, 0);
        }

        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMP_LT_I32::Inst_VOPC__V_CMP_LT_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_lt_i32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_LT_I32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMP_LT_I32::~Inst_VOPC__V_CMP_LT_I32() = default;

    /**
     * v_cmp_lt_i32 -- D.u64[threadID] = (S0 < S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the I32 compare; the result is written out through the
     * REG_VCC_LO operand and the exec mask is left unchanged.
     */
    void
    Inst_VOPC__V_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] < rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMP_EQ_I32::Inst_VOPC__V_CMP_EQ_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_eq_i32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_EQ_I32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMP_EQ_I32::~Inst_VOPC__V_CMP_EQ_I32() = default;

    /**
     * v_cmp_eq_i32 -- D.u64[threadID] = (S0 == S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the I32 compare; the result is written out through the
     * REG_VCC_LO operand and the exec mask is left unchanged.
     */
    void
    Inst_VOPC__V_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] == rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMP_LE_I32::Inst_VOPC__V_CMP_LE_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_le_i32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_LE_I32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMP_LE_I32::~Inst_VOPC__V_CMP_LE_I32() = default;

    /**
     * v_cmp_le_i32 -- D.u64[threadID] = (S0 <= S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the I32 compare; the result is written out through the
     * REG_VCC_LO operand and the exec mask is left unchanged.
     */
    void
    Inst_VOPC__V_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] <= rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMP_GT_I32::Inst_VOPC__V_CMP_GT_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_gt_i32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_GT_I32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMP_GT_I32::~Inst_VOPC__V_CMP_GT_I32() = default;

    /**
     * v_cmp_gt_i32 -- D.u64[threadID] = (S0 > S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the I32 compare; the result is written out through the
     * REG_VCC_LO operand and the exec mask is left unchanged.
     */
    void
    Inst_VOPC__V_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] > rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMP_NE_I32::Inst_VOPC__V_CMP_NE_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ne_i32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_NE_I32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMP_NE_I32::~Inst_VOPC__V_CMP_NE_I32() = default;

    /**
     * v_cmp_ne_i32 -- D.u64[threadID] = (S0 <> S1), i.e. not equal;
     * D = VCC in VOPC encoding. Lanes enabled in the exec mask get their
     * result bit set from the I32 compare; the result is written out
     * through the REG_VCC_LO operand and the exec mask is left unchanged.
     */
    void
    Inst_VOPC__V_CMP_NE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] != rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMP_GE_I32::Inst_VOPC__V_CMP_GE_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ge_i32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_GE_I32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMP_GE_I32::~Inst_VOPC__V_CMP_GE_I32() = default;

    /**
     * v_cmp_ge_i32 -- D.u64[threadID] = (S0 >= S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the I32 compare; the result is written out through the
     * REG_VCC_LO operand and the exec mask is left unchanged.
     */
    void
    Inst_VOPC__V_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] >= rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMP_T_I32::Inst_VOPC__V_CMP_T_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_t_i32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_T_I32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMP_T_I32::~Inst_VOPC__V_CMP_T_I32() = default;

    /**
     * v_cmp_t_i32 -- D.u64[threadID] = 1; D = VCC in VOPC encoding.
     * No sources are read: every lane enabled in the exec mask gets its
     * result bit set to 1; the result is written out through the
     * REG_VCC_LO operand and the exec mask is left unchanged.
     */
    void
    Inst_VOPC__V_CMP_T_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            mask.setBit(idx, 1);
        }

        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMP_F_U32::Inst_VOPC__V_CMP_F_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_f_u32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_F_U32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMP_F_U32::~Inst_VOPC__V_CMP_F_U32() = default;

    /**
     * v_cmp_f_u32 -- D.u64[threadID] = 0; D = VCC in VOPC encoding.
     * No sources are read: every lane enabled in the exec mask gets its
     * result bit set to 0; the result is written out through the
     * REG_VCC_LO operand and the exec mask is left unchanged.
     */
    void
    Inst_VOPC__V_CMP_F_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            mask.setBit(idx, 0);
        }

        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMP_LT_U32::Inst_VOPC__V_CMP_LT_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_lt_u32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_LT_U32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMP_LT_U32::~Inst_VOPC__V_CMP_LT_U32() = default;

    /**
     * v_cmp_lt_u32 -- D.u64[threadID] = (S0 < S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the U32 compare; the result is written out through the
     * REG_VCC_LO operand and the exec mask is left unchanged.
     */
    void
    Inst_VOPC__V_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] < rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMP_EQ_U32::Inst_VOPC__V_CMP_EQ_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_eq_u32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_EQ_U32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMP_EQ_U32::~Inst_VOPC__V_CMP_EQ_U32() = default;

    /**
     * v_cmp_eq_u32 -- D.u64[threadID] = (S0 == S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the U32 compare; the result is written out through the
     * REG_VCC_LO operand and the exec mask is left unchanged.
     */
    void
    Inst_VOPC__V_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] == rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMP_LE_U32::Inst_VOPC__V_CMP_LE_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_le_u32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_LE_U32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMP_LE_U32::~Inst_VOPC__V_CMP_LE_U32() = default;

    /**
     * v_cmp_le_u32 -- D.u64[threadID] = (S0 <= S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the U32 compare; the result is written out through the
     * REG_VCC_LO operand and the exec mask is left unchanged.
     */
    void
    Inst_VOPC__V_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] <= rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMP_GT_U32::Inst_VOPC__V_CMP_GT_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_gt_u32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_GT_U32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMP_GT_U32::~Inst_VOPC__V_CMP_GT_U32() = default;

    /**
     * v_cmp_gt_u32 -- D.u64[threadID] = (S0 > S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the U32 compare; the result is written out through the
     * REG_VCC_LO operand and the exec mask is left unchanged.
     */
    void
    Inst_VOPC__V_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] > rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMP_NE_U32::Inst_VOPC__V_CMP_NE_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ne_u32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_NE_U32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMP_NE_U32::~Inst_VOPC__V_CMP_NE_U32() = default;

    /**
     * v_cmp_ne_u32 -- D.u64[threadID] = (S0 <> S1), i.e. not equal;
     * D = VCC in VOPC encoding. Lanes enabled in the exec mask get their
     * result bit set from the U32 compare; the result is written out
     * through the REG_VCC_LO operand and the exec mask is left unchanged.
     */
    void
    Inst_VOPC__V_CMP_NE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] != rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMP_GE_U32::Inst_VOPC__V_CMP_GE_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ge_u32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_GE_U32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMP_GE_U32::~Inst_VOPC__V_CMP_GE_U32() = default;

    /**
     * v_cmp_ge_u32 -- D.u64[threadID] = (S0 >= S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the U32 compare; the result is written out through the
     * REG_VCC_LO operand and the exec mask is left unchanged.
     */
    void
    Inst_VOPC__V_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] >= rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMP_T_U32::Inst_VOPC__V_CMP_T_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_t_u32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_T_U32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMP_T_U32::~Inst_VOPC__V_CMP_T_U32() = default;

    /**
     * v_cmp_t_u32 -- D.u64[threadID] = 1; D = VCC in VOPC encoding.
     * No sources are read: every lane enabled in the exec mask gets its
     * result bit set to 1; the result is written out through the
     * REG_VCC_LO operand and the exec mask is left unchanged.
     */
    void
    Inst_VOPC__V_CMP_T_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            mask.setBit(idx, 1);
        }

        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_F_I32::Inst_VOPC__V_CMPX_F_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_f_i32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_F_I32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_F_I32::~Inst_VOPC__V_CMPX_F_I32() = default;

    /**
     * v_cmpx_f_i32 -- EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
     * No sources are read: every lane enabled in the exec mask gets its
     * result bit set to 0; the result then replaces the exec mask and is
     * written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_F_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            mask.setBit(idx, 0);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_LT_I32::Inst_VOPC__V_CMPX_LT_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_lt_i32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_LT_I32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_LT_I32::~Inst_VOPC__V_CMPX_LT_I32() = default;

    /**
     * v_cmpx_lt_i32 -- EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the I32 compare; the result then replaces the exec mask and is
     * written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_LT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] < rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_EQ_I32::Inst_VOPC__V_CMPX_EQ_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_eq_i32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_EQ_I32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_EQ_I32::~Inst_VOPC__V_CMPX_EQ_I32() = default;

    /**
     * v_cmpx_eq_i32 -- EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the I32 compare; the result then replaces the exec mask and is
     * written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_EQ_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] == rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_LE_I32::Inst_VOPC__V_CMPX_LE_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_le_i32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_LE_I32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_LE_I32::~Inst_VOPC__V_CMPX_LE_I32() = default;

    /**
     * v_cmpx_le_i32 -- EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the I32 compare; the result then replaces the exec mask and is
     * written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_LE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] <= rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_GT_I32::Inst_VOPC__V_CMPX_GT_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_gt_i32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_GT_I32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_GT_I32::~Inst_VOPC__V_CMPX_GT_I32() = default;

    /**
     * v_cmpx_gt_i32 -- EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the I32 compare; the result then replaces the exec mask and is
     * written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_GT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] > rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_NE_I32::Inst_VOPC__V_CMPX_NE_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ne_i32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_NE_I32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_NE_I32::~Inst_VOPC__V_CMPX_NE_I32() = default;

    /**
     * v_cmpx_ne_i32 -- EXEC,D.u64[threadID] = (S0 <> S1), i.e. not equal;
     * D = VCC in VOPC encoding. Lanes enabled in the exec mask get their
     * result bit set from the I32 compare; the result then replaces the
     * exec mask and is written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_NE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] != rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Forwards iFmt and the mnemonic to Inst_VOPC, then sets the ALU flag.
    Inst_VOPC__V_CMPX_GE_I32::Inst_VOPC__V_CMPX_GE_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ge_i32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_GE_I32

    // Destructor with no body of its own.
    Inst_VOPC__V_CMPX_GE_I32::~Inst_VOPC__V_CMPX_GE_I32() = default;

    /**
     * v_cmpx_ge_i32 -- EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC
     * encoding. Lanes enabled in the exec mask get their result bit set
     * from the I32 compare; the result then replaces the exec mask and is
     * written out through the REG_VCC_LO operand.
     */
    void
    Inst_VOPC__V_CMPX_GE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 mask(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int idx = 0; idx < NumVecElemPerVecReg; ++idx) {
            // Disabled lanes never touch their bit of the result.
            if (!wave->execMask(idx)) {
                continue;
            }

            const int bitVal = (lhs[idx] >= rhs[idx]) ? 1 : 0;
            mask.setBit(idx, bitVal);
        }

        wave->execMask() = mask.rawData();
        mask.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_t_i32" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_T_I32::Inst_VOPC__V_CMPX_T_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_t_i32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_T_I32

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_T_I32::~Inst_VOPC__V_CMPX_T_I32() = default;

    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1; bits
    // of disabled lanes are not touched here (the VCC operand is never
    // read). The result is assigned to the exec mask and then written out
    // via the VCC operand.
    void
    Inst_VOPC__V_CMPX_T_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            result.setBit(i, 1);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_f_u32" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_F_U32::Inst_VOPC__V_CMPX_F_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_f_u32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_F_U32

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_F_U32::~Inst_VOPC__V_CMPX_F_U32() = default;

    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 0; bits
    // of disabled lanes are not touched here (the VCC operand is never
    // read). The result is assigned to the exec mask and then written out
    // via the VCC operand.
    void
    Inst_VOPC__V_CMPX_F_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            result.setBit(i, 0);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_lt_u32" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_LT_U32::Inst_VOPC__V_CMPX_LT_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_lt_u32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_LT_U32

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_LT_U32::~Inst_VOPC__V_CMPX_LT_U32() = default;

    // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 < VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is assigned
    // to the exec mask and then written out via the VCC operand.
    void
    Inst_VOPC__V_CMPX_LT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isLess = lhs[i] < rhs[i];
            result.setBit(i, isLess ? 1 : 0);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_eq_u32" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_EQ_U32::Inst_VOPC__V_CMPX_EQ_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_eq_u32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_EQ_U32

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_EQ_U32::~Inst_VOPC__V_CMPX_EQ_U32() = default;

    // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 == VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is assigned
    // to the exec mask and then written out via the VCC operand.
    void
    Inst_VOPC__V_CMPX_EQ_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isEqual = lhs[i] == rhs[i];
            result.setBit(i, isEqual ? 1 : 0);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_le_u32" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_LE_U32::Inst_VOPC__V_CMPX_LE_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_le_u32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_LE_U32

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_LE_U32::~Inst_VOPC__V_CMPX_LE_U32() = default;

    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 <= VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is assigned
    // to the exec mask and then written out via the VCC operand.
    void
    Inst_VOPC__V_CMPX_LE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isLessEq = lhs[i] <= rhs[i];
            result.setBit(i, isLessEq ? 1 : 0);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_gt_u32" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_GT_U32::Inst_VOPC__V_CMPX_GT_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_gt_u32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_GT_U32

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_GT_U32::~Inst_VOPC__V_CMPX_GT_U32() = default;

    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 > VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is assigned
    // to the exec mask and then written out via the VCC operand.
    void
    Inst_VOPC__V_CMPX_GT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isGreater = lhs[i] > rhs[i];
            result.setBit(i, isGreater ? 1 : 0);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_ne_u32" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_NE_U32::Inst_VOPC__V_CMPX_NE_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ne_u32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_NE_U32

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_NE_U32::~Inst_VOPC__V_CMPX_NE_U32() = default;

    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 != VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is assigned
    // to the exec mask and then written out via the VCC operand.
    void
    Inst_VOPC__V_CMPX_NE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isDiff = lhs[i] != rhs[i];
            result.setBit(i, isDiff ? 1 : 0);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_ge_u32" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_GE_U32::Inst_VOPC__V_CMPX_GE_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ge_u32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_GE_U32

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_GE_U32::~Inst_VOPC__V_CMPX_GE_U32() = default;

    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 >= VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is assigned
    // to the exec mask and then written out via the VCC operand.
    void
    Inst_VOPC__V_CMPX_GE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isGreaterEq = lhs[i] >= rhs[i];
            result.setBit(i, isGreaterEq ? 1 : 0);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_t_u32" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_T_U32::Inst_VOPC__V_CMPX_T_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_t_u32")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_T_U32

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_T_U32::~Inst_VOPC__V_CMPX_T_U32() = default;

    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1; bits
    // of disabled lanes are not touched here (the VCC operand is never
    // read). The result is assigned to the exec mask and then written out
    // via the VCC operand.
    void
    Inst_VOPC__V_CMPX_T_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            result.setBit(i, 1);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmp_f_i64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMP_F_I64::Inst_VOPC__V_CMP_F_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_f_i64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_F_I64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMP_F_I64::~Inst_VOPC__V_CMP_F_I64() = default;

    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 0; bits
    // of disabled lanes are not touched here (the VCC operand is never
    // read). The result is then written out via the VCC operand.
    void
    Inst_VOPC__V_CMP_F_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            result.setBit(i, 0);
        }

        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmp_lt_i64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMP_LT_I64::Inst_VOPC__V_CMP_LT_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_lt_i64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_LT_I64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMP_LT_I64::~Inst_VOPC__V_CMP_LT_I64() = default;

    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 < VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is then
    // written out via the VCC operand.
    void
    Inst_VOPC__V_CMP_LT_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isLess = lhs[i] < rhs[i];
            result.setBit(i, isLess ? 1 : 0);
        }

        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmp_eq_i64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMP_EQ_I64::Inst_VOPC__V_CMP_EQ_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_eq_i64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_EQ_I64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMP_EQ_I64::~Inst_VOPC__V_CMP_EQ_I64() = default;

    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 == VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is then
    // written out via the VCC operand.
    void
    Inst_VOPC__V_CMP_EQ_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isEqual = lhs[i] == rhs[i];
            result.setBit(i, isEqual ? 1 : 0);
        }

        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmp_le_i64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMP_LE_I64::Inst_VOPC__V_CMP_LE_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_le_i64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_LE_I64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMP_LE_I64::~Inst_VOPC__V_CMP_LE_I64() = default;

    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 <= VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is then
    // written out via the VCC operand.
    void
    Inst_VOPC__V_CMP_LE_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isLessEq = lhs[i] <= rhs[i];
            result.setBit(i, isLessEq ? 1 : 0);
        }

        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmp_gt_i64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMP_GT_I64::Inst_VOPC__V_CMP_GT_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_gt_i64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_GT_I64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMP_GT_I64::~Inst_VOPC__V_CMP_GT_I64() = default;

    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 > VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is then
    // written out via the VCC operand.
    void
    Inst_VOPC__V_CMP_GT_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isGreater = lhs[i] > rhs[i];
            result.setBit(i, isGreater ? 1 : 0);
        }

        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmp_ne_i64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMP_NE_I64::Inst_VOPC__V_CMP_NE_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ne_i64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_NE_I64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMP_NE_I64::~Inst_VOPC__V_CMP_NE_I64() = default;

    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 != VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is then
    // written out via the VCC operand.
    void
    Inst_VOPC__V_CMP_NE_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isDiff = lhs[i] != rhs[i];
            result.setBit(i, isDiff ? 1 : 0);
        }

        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmp_ge_i64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMP_GE_I64::Inst_VOPC__V_CMP_GE_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ge_i64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_GE_I64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMP_GE_I64::~Inst_VOPC__V_CMP_GE_I64() = default;

    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 >= VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is then
    // written out via the VCC operand.
    void
    Inst_VOPC__V_CMP_GE_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isGreaterEq = lhs[i] >= rhs[i];
            result.setBit(i, isGreaterEq ? 1 : 0);
        }

        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmp_t_i64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMP_T_I64::Inst_VOPC__V_CMP_T_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_t_i64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_T_I64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMP_T_I64::~Inst_VOPC__V_CMP_T_I64() = default;

    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1; bits
    // of disabled lanes are not touched here (the VCC operand is never
    // read). The result is then written out via the VCC operand.
    void
    Inst_VOPC__V_CMP_T_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            result.setBit(i, 1);
        }

        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmp_f_u64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMP_F_U64::Inst_VOPC__V_CMP_F_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_f_u64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_F_U64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMP_F_U64::~Inst_VOPC__V_CMP_F_U64() = default;

    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 0; bits
    // of disabled lanes are not touched here (the VCC operand is never
    // read). The result is then written out via the VCC operand.
    void
    Inst_VOPC__V_CMP_F_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            result.setBit(i, 0);
        }

        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmp_lt_u64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMP_LT_U64::Inst_VOPC__V_CMP_LT_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_lt_u64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_LT_U64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMP_LT_U64::~Inst_VOPC__V_CMP_LT_U64() = default;

    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 < VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is then
    // written out via the VCC operand.
    void
    Inst_VOPC__V_CMP_LT_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isLess = lhs[i] < rhs[i];
            result.setBit(i, isLess ? 1 : 0);
        }

        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmp_eq_u64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMP_EQ_U64::Inst_VOPC__V_CMP_EQ_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_eq_u64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_EQ_U64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMP_EQ_U64::~Inst_VOPC__V_CMP_EQ_U64() = default;

    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 == VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is then
    // written out via the VCC operand.
    void
    Inst_VOPC__V_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isEqual = lhs[i] == rhs[i];
            result.setBit(i, isEqual ? 1 : 0);
        }

        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmp_le_u64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMP_LE_U64::Inst_VOPC__V_CMP_LE_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_le_u64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_LE_U64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMP_LE_U64::~Inst_VOPC__V_CMP_LE_U64() = default;

    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 <= VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is then
    // written out via the VCC operand.
    void
    Inst_VOPC__V_CMP_LE_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isLessEq = lhs[i] <= rhs[i];
            result.setBit(i, isLessEq ? 1 : 0);
        }

        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmp_gt_u64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMP_GT_U64::Inst_VOPC__V_CMP_GT_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_gt_u64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_GT_U64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMP_GT_U64::~Inst_VOPC__V_CMP_GT_U64() = default;

    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 > VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is then
    // written out via the VCC operand.
    void
    Inst_VOPC__V_CMP_GT_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isGreater = lhs[i] > rhs[i];
            result.setBit(i, isGreater ? 1 : 0);
        }

        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmp_ne_u64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMP_NE_U64::Inst_VOPC__V_CMP_NE_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ne_u64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_NE_U64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMP_NE_U64::~Inst_VOPC__V_CMP_NE_U64() = default;

    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 != VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is then
    // written out via the VCC operand.
    void
    Inst_VOPC__V_CMP_NE_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isDiff = lhs[i] != rhs[i];
            result.setBit(i, isDiff ? 1 : 0);
        }

        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmp_ge_u64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMP_GE_U64::Inst_VOPC__V_CMP_GE_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ge_u64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_GE_U64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMP_GE_U64::~Inst_VOPC__V_CMP_GE_U64() = default;

    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 >= VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is then
    // written out via the VCC operand.
    void
    Inst_VOPC__V_CMP_GE_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isGreaterEq = lhs[i] >= rhs[i];
            result.setBit(i, isGreaterEq ? 1 : 0);
        }

        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmp_t_u64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMP_T_U64::Inst_VOPC__V_CMP_T_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_t_u64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMP_T_U64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMP_T_U64::~Inst_VOPC__V_CMP_T_U64() = default;

    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1; bits
    // of disabled lanes are not touched here (the VCC operand is never
    // read). The result is then written out via the VCC operand.
    void
    Inst_VOPC__V_CMP_T_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            result.setBit(i, 1);
        }

        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_f_i64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_F_I64::Inst_VOPC__V_CMPX_F_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_f_i64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_F_I64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_F_I64::~Inst_VOPC__V_CMPX_F_I64() = default;

    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 0; bits
    // of disabled lanes are not touched here (the VCC operand is never
    // read). The result is assigned to the exec mask and then written out
    // via the VCC operand.
    void
    Inst_VOPC__V_CMPX_F_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            result.setBit(i, 0);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_lt_i64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_LT_I64::Inst_VOPC__V_CMPX_LT_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_lt_i64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_LT_I64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_LT_I64::~Inst_VOPC__V_CMPX_LT_I64() = default;

    // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 < VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is assigned
    // to the exec mask and then written out via the VCC operand.
    void
    Inst_VOPC__V_CMPX_LT_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isLess = lhs[i] < rhs[i];
            result.setBit(i, isLess ? 1 : 0);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_eq_i64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_EQ_I64::Inst_VOPC__V_CMPX_EQ_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_eq_i64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_EQ_I64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_EQ_I64::~Inst_VOPC__V_CMPX_EQ_I64() = default;

    // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 == VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is assigned
    // to the exec mask and then written out via the VCC operand.
    void
    Inst_VOPC__V_CMPX_EQ_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isEqual = lhs[i] == rhs[i];
            result.setBit(i, isEqual ? 1 : 0);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_le_i64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_LE_I64::Inst_VOPC__V_CMPX_LE_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_le_i64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_LE_I64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_LE_I64::~Inst_VOPC__V_CMPX_LE_I64() = default;

    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 <= VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is assigned
    // to the exec mask and then written out via the VCC operand.
    void
    Inst_VOPC__V_CMPX_LE_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isLessEq = lhs[i] <= rhs[i];
            result.setBit(i, isLessEq ? 1 : 0);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_gt_i64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_GT_I64::Inst_VOPC__V_CMPX_GT_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_gt_i64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_GT_I64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_GT_I64::~Inst_VOPC__V_CMPX_GT_I64() = default;

    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 > VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is assigned
    // to the exec mask and then written out via the VCC operand.
    void
    Inst_VOPC__V_CMPX_GT_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isGreater = lhs[i] > rhs[i];
            result.setBit(i, isGreater ? 1 : 0);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_ne_i64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_NE_I64::Inst_VOPC__V_CMPX_NE_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ne_i64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_NE_I64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_NE_I64::~Inst_VOPC__V_CMPX_NE_I64() = default;

    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 != VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is assigned
    // to the exec mask and then written out via the VCC operand.
    void
    Inst_VOPC__V_CMPX_NE_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isDiff = lhs[i] != rhs[i];
            result.setBit(i, isDiff ? 1 : 0);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_ge_i64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_GE_I64::Inst_VOPC__V_CMPX_GE_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ge_i64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_GE_I64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_GE_I64::~Inst_VOPC__V_CMPX_GE_I64() = default;

    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 >= VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is assigned
    // to the exec mask and then written out via the VCC operand.
    void
    Inst_VOPC__V_CMPX_GE_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isGreaterEq = lhs[i] >= rhs[i];
            result.setBit(i, isGreaterEq ? 1 : 0);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_t_i64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_T_I64::Inst_VOPC__V_CMPX_T_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_t_i64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_T_I64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_T_I64::~Inst_VOPC__V_CMPX_T_I64() = default;

    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1; bits
    // of disabled lanes are not touched here (the VCC operand is never
    // read). The result is assigned to the exec mask and then written out
    // via the VCC operand.
    void
    Inst_VOPC__V_CMPX_T_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            result.setBit(i, 1);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_f_u64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_F_U64::Inst_VOPC__V_CMPX_F_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_f_u64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_F_U64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_F_U64::~Inst_VOPC__V_CMPX_F_U64() = default;

    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 0; bits
    // of disabled lanes are not touched here (the VCC operand is never
    // read). The result is assigned to the exec mask and then written out
    // via the VCC operand.
    void
    Inst_VOPC__V_CMPX_F_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            result.setBit(i, 0);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_lt_u64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_LT_U64::Inst_VOPC__V_CMPX_LT_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_lt_u64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_LT_U64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_LT_U64::~Inst_VOPC__V_CMPX_LT_U64() = default;

    // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 < VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is assigned
    // to the exec mask and then written out via the VCC operand.
    void
    Inst_VOPC__V_CMPX_LT_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isLess = lhs[i] < rhs[i];
            result.setBit(i, isLess ? 1 : 0);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_eq_u64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_EQ_U64::Inst_VOPC__V_CMPX_EQ_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_eq_u64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_EQ_U64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_EQ_U64::~Inst_VOPC__V_CMPX_EQ_U64() = default;

    // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    //
    // Each lane enabled in the exec mask gets its result bit set to 1 when
    // SRC0 == VSRC1 and to 0 otherwise; bits of disabled lanes are not
    // touched here (the VCC operand is never read). The result is assigned
    // to the exec mask and then written out via the VCC operand.
    void
    Inst_VOPC__V_CMPX_EQ_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool isEqual = lhs[i] == rhs[i];
            result.setBit(i, isEqual ? 1 : 0);
        }

        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt and the mnemonic "v_cmpx_le_u64" to the Inst_VOPC base
    // constructor, then sets the ALU flag on the new instruction.
    Inst_VOPC__V_CMPX_LE_U64::Inst_VOPC__V_CMPX_LE_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_le_u64")
    {
        this->setFlag(ALU);
    } // Inst_VOPC__V_CMPX_LE_U64

    // Destructor: no cleanup is performed for this instruction.
    Inst_VOPC__V_CMPX_LE_U64::~Inst_VOPC__V_CMPX_LE_U64() = default;

    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    // Each lane enabled in the wavefront's EXEC mask compares its U64
    // sources and stores the outcome in its VCC bit; the resulting mask then
    // becomes the new EXEC mask and is written back to VCC.
    void
    Inst_VOPC__V_CMPX_LE_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            // VCC is not read beforehand, so bits of disabled lanes are
            // left at whatever value the operand was constructed with.
            if (!wave->execMask(i)) {
                continue;
            }

            const int outcome = (lhs[i] <= rhs[i]) ? 1 : 0;
            result.setBit(i, outcome);
        }

        // The freshly computed mask replaces EXEC before VCC is committed.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Builds the v_cmpx_gt_u64 instruction (VOPC encoding) from iFmt and
    // flags it as an ALU operation.
    Inst_VOPC__V_CMPX_GT_U64::Inst_VOPC__V_CMPX_GT_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_gt_u64")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_GT_U64

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOPC__V_CMPX_GT_U64::~Inst_VOPC__V_CMPX_GT_U64()
    {
    } // ~Inst_VOPC__V_CMPX_GT_U64

    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    // Each lane enabled in the wavefront's EXEC mask compares its U64
    // sources and stores the outcome in its VCC bit; the resulting mask then
    // becomes the new EXEC mask and is written back to VCC.
    void
    Inst_VOPC__V_CMPX_GT_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            // VCC is not read beforehand, so bits of disabled lanes are
            // left at whatever value the operand was constructed with.
            if (!wave->execMask(i)) {
                continue;
            }

            const int outcome = (lhs[i] > rhs[i]) ? 1 : 0;
            result.setBit(i, outcome);
        }

        // The freshly computed mask replaces EXEC before VCC is committed.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Builds the v_cmpx_ne_u64 instruction (VOPC encoding) from iFmt and
    // flags it as an ALU operation.
    Inst_VOPC__V_CMPX_NE_U64::Inst_VOPC__V_CMPX_NE_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ne_u64")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_NE_U64

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOPC__V_CMPX_NE_U64::~Inst_VOPC__V_CMPX_NE_U64()
    {
    } // ~Inst_VOPC__V_CMPX_NE_U64

    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    // Each lane enabled in the wavefront's EXEC mask compares its U64
    // sources and stores the outcome in its VCC bit; the resulting mask then
    // becomes the new EXEC mask and is written back to VCC.
    void
    Inst_VOPC__V_CMPX_NE_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            // VCC is not read beforehand, so bits of disabled lanes are
            // left at whatever value the operand was constructed with.
            if (!wave->execMask(i)) {
                continue;
            }

            const int outcome = (lhs[i] != rhs[i]) ? 1 : 0;
            result.setBit(i, outcome);
        }

        // The freshly computed mask replaces EXEC before VCC is committed.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Builds the v_cmpx_ge_u64 instruction (VOPC encoding) from iFmt and
    // flags it as an ALU operation.
    Inst_VOPC__V_CMPX_GE_U64::Inst_VOPC__V_CMPX_GE_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ge_u64")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_GE_U64

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOPC__V_CMPX_GE_U64::~Inst_VOPC__V_CMPX_GE_U64()
    {
    } // ~Inst_VOPC__V_CMPX_GE_U64

    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    // Each lane enabled in the wavefront's EXEC mask compares its U64
    // sources and stores the outcome in its VCC bit; the resulting mask then
    // becomes the new EXEC mask and is written back to VCC.
    void
    Inst_VOPC__V_CMPX_GE_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, instData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        lhs.readSrc();
        rhs.read();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            // VCC is not read beforehand, so bits of disabled lanes are
            // left at whatever value the operand was constructed with.
            if (!wave->execMask(i)) {
                continue;
            }

            const int outcome = (lhs[i] >= rhs[i]) ? 1 : 0;
            result.setBit(i, outcome);
        }

        // The freshly computed mask replaces EXEC before VCC is committed.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Builds the v_cmpx_t_u64 instruction (VOPC encoding) from iFmt and
    // flags it as an ALU operation.
    Inst_VOPC__V_CMPX_T_U64::Inst_VOPC__V_CMPX_T_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_t_u64")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_T_U64

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOPC__V_CMPX_T_U64::~Inst_VOPC__V_CMPX_T_U64()
    {
    } // ~Inst_VOPC__V_CMPX_T_U64

    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
    // No source operands are read: every lane enabled in the wavefront's
    // EXEC mask gets its VCC bit set, and the resulting mask then becomes the
    // new EXEC mask and is written back to VCC.
    void
    Inst_VOPC__V_CMPX_T_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, REG_VCC_LO);

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            // VCC is not read beforehand, so bits of disabled lanes are
            // left at whatever value the operand was constructed with.
            if (!wave->execMask(i)) {
                continue;
            }

            result.setBit(i, 1);
        }

        // The freshly computed mask replaces EXEC before VCC is committed.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Builds the v_interp_p1_f32 instruction (VINTRP encoding) from iFmt and
    // flags it as an F32 ALU operation.
    Inst_VINTRP__V_INTERP_P1_F32::Inst_VINTRP__V_INTERP_P1_F32(
          InFmt_VINTRP *iFmt)
        : Inst_VINTRP(iFmt, "v_interp_p1_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VINTRP__V_INTERP_P1_F32

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VINTRP__V_INTERP_P1_F32::~Inst_VINTRP__V_INTERP_P1_F32()
    {
    } // ~Inst_VINTRP__V_INTERP_P1_F32

    // D.f = P10 * S.f + P0; parameter interpolation
    // Not modeled: execute() only calls panicUnimplemented(); no operands
    // are read and no result is written.
    void
    Inst_VINTRP__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_interp_p2_f32 instruction (VINTRP encoding) from iFmt and
    // flags it as an F32 ALU operation.
    Inst_VINTRP__V_INTERP_P2_F32::Inst_VINTRP__V_INTERP_P2_F32(
          InFmt_VINTRP *iFmt)
        : Inst_VINTRP(iFmt, "v_interp_p2_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VINTRP__V_INTERP_P2_F32

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VINTRP__V_INTERP_P2_F32::~Inst_VINTRP__V_INTERP_P2_F32()
    {
    } // ~Inst_VINTRP__V_INTERP_P2_F32

    // D.f = P20 * S.f + D.f; parameter interpolation
    // Not modeled: execute() only calls panicUnimplemented(); no operands
    // are read and no result is written.
    void
    Inst_VINTRP__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_interp_mov_f32 instruction (VINTRP encoding) from iFmt
    // and flags it as an F32 ALU operation.
    Inst_VINTRP__V_INTERP_MOV_F32::Inst_VINTRP__V_INTERP_MOV_F32(
          InFmt_VINTRP *iFmt)
        : Inst_VINTRP(iFmt, "v_interp_mov_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VINTRP__V_INTERP_MOV_F32

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VINTRP__V_INTERP_MOV_F32::~Inst_VINTRP__V_INTERP_MOV_F32()
    {
    } // ~Inst_VINTRP__V_INTERP_MOV_F32

    // Not modeled: execute() only calls panicUnimplemented(); no operands
    // are read and no result is written.
    void
    Inst_VINTRP__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cmp_class_f32 instruction (VOP3 encoding) from iFmt and
    // flags it as an F32 ALU operation.
    Inst_VOP3__V_CMP_CLASS_F32::Inst_VOP3__V_CMP_CLASS_F32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_class_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMP_CLASS_F32

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMP_CLASS_F32::~Inst_VOP3__V_CMP_CLASS_F32()
    {
    } // ~Inst_VOP3__V_CMP_CLASS_F32

    // VCC = IEEE numeric class function specified in S1.u, performed on S0.f
    // A lane's result is true when the class of its S0 value is enabled in
    // S1.u, using this bit assignment:
    // S1.u[0] -- value is a signaling NaN.
    // S1.u[1] -- value is a quiet NaN.
    // S1.u[2] -- value is negative infinity.
    // S1.u[3] -- value is a negative normal value.
    // S1.u[4] -- value is a negative denormal value.
    // S1.u[5] -- value is negative zero.
    // S1.u[6] -- value is positive zero.
    // S1.u[7] -- value is a positive denormal value.
    // S1.u[8] -- value is a positive normal value.
    // S1.u[9] -- value is positive infinity.
    // Note: bits 0 and 1 are handled alike here; any NaN matches when either
    // bit is set. Lanes that are disabled, or whose class is not selected,
    // are never explicitly written and keep the destination operand's
    // initial bit value. The result goes to the scalar operand named by VDST.
    void
    Inst_VOP3__V_CMP_CLASS_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 value(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 classMask(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        value.readSrc();
        classMask.readSrc();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            // The sign picks between the negative and positive flavour of
            // each class; it plays no role for NaNs.
            const bool negative = std::signbit(value[i]);
            bool selected = false;

            switch (std::fpclassify(value[i])) {
              case FP_NAN:
                selected = bits(classMask[i], 0) || bits(classMask[i], 1);
                break;
              case FP_INFINITE:
                selected = bits(classMask[i], negative ? 2 : 9);
                break;
              case FP_NORMAL:
                selected = bits(classMask[i], negative ? 3 : 8);
                break;
              case FP_SUBNORMAL:
                selected = bits(classMask[i], negative ? 4 : 7);
                break;
              case FP_ZERO:
                selected = bits(classMask[i], negative ? 5 : 6);
                break;
              default:
                break;
            }

            if (selected) {
                result.setBit(i, 1);
            }
        }

        result.write();
    }

    // Builds the v_cmpx_class_f32 instruction (VOP3 encoding) from iFmt and
    // flags it as an F32 ALU operation.
    Inst_VOP3__V_CMPX_CLASS_F32::Inst_VOP3__V_CMPX_CLASS_F32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_class_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMPX_CLASS_F32

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMPX_CLASS_F32::~Inst_VOP3__V_CMPX_CLASS_F32()
    {
    } // ~Inst_VOP3__V_CMPX_CLASS_F32

    // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on
    // S0.f
    // A lane's result is true when the class of its S0 value is enabled in
    // S1.u, using this bit assignment:
    // S1.u[0] -- value is a signaling NaN.
    // S1.u[1] -- value is a quiet NaN.
    // S1.u[2] -- value is negative infinity.
    // S1.u[3] -- value is a negative normal value.
    // S1.u[4] -- value is a negative denormal value.
    // S1.u[5] -- value is negative zero.
    // S1.u[6] -- value is positive zero.
    // S1.u[7] -- value is a positive denormal value.
    // S1.u[8] -- value is a positive normal value.
    // S1.u[9] -- value is positive infinity.
    // Note: bits 0 and 1 are handled alike here; any NaN matches when either
    // bit is set. Lanes that are disabled, or whose class is not selected,
    // are never explicitly written and keep the destination operand's
    // initial bit value. The resulting mask replaces EXEC and is written to
    // the scalar operand named by VDST.
    void
    Inst_VOP3__V_CMPX_CLASS_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 value(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 classMask(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        value.readSrc();
        classMask.readSrc();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            // The sign picks between the negative and positive flavour of
            // each class; it plays no role for NaNs.
            const bool negative = std::signbit(value[i]);
            bool selected = false;

            switch (std::fpclassify(value[i])) {
              case FP_NAN:
                selected = bits(classMask[i], 0) || bits(classMask[i], 1);
                break;
              case FP_INFINITE:
                selected = bits(classMask[i], negative ? 2 : 9);
                break;
              case FP_NORMAL:
                selected = bits(classMask[i], negative ? 3 : 8);
                break;
              case FP_SUBNORMAL:
                selected = bits(classMask[i], negative ? 4 : 7);
                break;
              case FP_ZERO:
                selected = bits(classMask[i], negative ? 5 : 6);
                break;
              default:
                break;
            }

            if (selected) {
                result.setBit(i, 1);
            }
        }

        // The freshly computed mask replaces EXEC before it is committed.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Builds the v_cmp_class_f64 instruction (VOP3 encoding) from iFmt and
    // flags it as an F64 ALU operation.
    Inst_VOP3__V_CMP_CLASS_F64::Inst_VOP3__V_CMP_CLASS_F64(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_class_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMP_CLASS_F64

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMP_CLASS_F64::~Inst_VOP3__V_CMP_CLASS_F64()
    {
    } // ~Inst_VOP3__V_CMP_CLASS_F64

    // VCC = IEEE numeric class function specified in S1.u, performed on S0.d
    // A lane's result is true when the class of its S0 value is enabled in
    // S1.u, using this bit assignment:
    // S1.u[0] -- value is a signaling NaN.
    // S1.u[1] -- value is a quiet NaN.
    // S1.u[2] -- value is negative infinity.
    // S1.u[3] -- value is a negative normal value.
    // S1.u[4] -- value is a negative denormal value.
    // S1.u[5] -- value is negative zero.
    // S1.u[6] -- value is positive zero.
    // S1.u[7] -- value is a positive denormal value.
    // S1.u[8] -- value is a positive normal value.
    // S1.u[9] -- value is positive infinity.
    // Note: bits 0 and 1 are handled alike here; any NaN matches when either
    // bit is set. Lanes that are disabled, or whose class is not selected,
    // are never explicitly written and keep the destination operand's
    // initial bit value. The result goes to the scalar operand named by VDST.
    void
    Inst_VOP3__V_CMP_CLASS_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 value(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 classMask(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        value.readSrc();
        classMask.readSrc();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            // The sign picks between the negative and positive flavour of
            // each class; it plays no role for NaNs.
            const bool negative = std::signbit(value[i]);
            bool selected = false;

            switch (std::fpclassify(value[i])) {
              case FP_NAN:
                selected = bits(classMask[i], 0) || bits(classMask[i], 1);
                break;
              case FP_INFINITE:
                selected = bits(classMask[i], negative ? 2 : 9);
                break;
              case FP_NORMAL:
                selected = bits(classMask[i], negative ? 3 : 8);
                break;
              case FP_SUBNORMAL:
                selected = bits(classMask[i], negative ? 4 : 7);
                break;
              case FP_ZERO:
                selected = bits(classMask[i], negative ? 5 : 6);
                break;
              default:
                break;
            }

            if (selected) {
                result.setBit(i, 1);
            }
        }

        result.write();
    }

    // Builds the v_cmpx_class_f64 instruction (VOP3 encoding) from iFmt and
    // flags it as an F64 ALU operation.
    Inst_VOP3__V_CMPX_CLASS_F64::Inst_VOP3__V_CMPX_CLASS_F64(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_class_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMPX_CLASS_F64

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMPX_CLASS_F64::~Inst_VOP3__V_CMPX_CLASS_F64()
    {
    } // ~Inst_VOP3__V_CMPX_CLASS_F64

    // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on
    // S0.d
    // A lane's result is true when the class of its S0 value is enabled in
    // S1.u, using this bit assignment:
    // S1.u[0] -- value is a signaling NaN.
    // S1.u[1] -- value is a quiet NaN.
    // S1.u[2] -- value is negative infinity.
    // S1.u[3] -- value is a negative normal value.
    // S1.u[4] -- value is a negative denormal value.
    // S1.u[5] -- value is negative zero.
    // S1.u[6] -- value is positive zero.
    // S1.u[7] -- value is a positive denormal value.
    // S1.u[8] -- value is a positive normal value.
    // S1.u[9] -- value is positive infinity.
    // Note: bits 0 and 1 are handled alike here; any NaN matches when either
    // bit is set. Lanes that are disabled, or whose class is not selected,
    // are never explicitly written and keep the destination operand's
    // initial bit value. The resulting mask replaces EXEC and is written to
    // the scalar operand named by VDST.
    void
    Inst_VOP3__V_CMPX_CLASS_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 value(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 classMask(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        value.readSrc();
        classMask.readSrc();

        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            // The sign picks between the negative and positive flavour of
            // each class; it plays no role for NaNs.
            const bool negative = std::signbit(value[i]);
            bool selected = false;

            switch (std::fpclassify(value[i])) {
              case FP_NAN:
                selected = bits(classMask[i], 0) || bits(classMask[i], 1);
                break;
              case FP_INFINITE:
                selected = bits(classMask[i], negative ? 2 : 9);
                break;
              case FP_NORMAL:
                selected = bits(classMask[i], negative ? 3 : 8);
                break;
              case FP_SUBNORMAL:
                selected = bits(classMask[i], negative ? 4 : 7);
                break;
              case FP_ZERO:
                selected = bits(classMask[i], negative ? 5 : 6);
                break;
              default:
                break;
            }

            if (selected) {
                result.setBit(i, 1);
            }
        }

        // The freshly computed mask replaces EXEC before it is committed.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Builds the v_cmp_class_f16 instruction (VOP3 encoding) from iFmt and
    // flags it as an F16 ALU operation.
    Inst_VOP3__V_CMP_CLASS_F16::Inst_VOP3__V_CMP_CLASS_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_class_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_CLASS_F16

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMP_CLASS_F16::~Inst_VOP3__V_CMP_CLASS_F16()
    {
    } // ~Inst_VOP3__V_CMP_CLASS_F16

    // VCC = IEEE numeric class function specified in S1.u, performed on S0.f16
    // The function reports true if the floating point value is any of the
    // numeric types selected in S1.u according to the following list:
    // S1.u[0] -- value is a signaling NaN.
    // S1.u[1] -- value is a quiet NaN.
    // S1.u[2] -- value is negative infinity.
    // S1.u[3] -- value is a negative normal value.
    // S1.u[4] -- value is a negative denormal value.
    // S1.u[5] -- value is negative zero.
    // S1.u[6] -- value is positive zero.
    // S1.u[7] -- value is a positive denormal value.
    // S1.u[8] -- value is a positive normal value.
    // S1.u[9] -- value is positive infinity.
    // Not modeled: execute() only calls panicUnimplemented(); no operands
    // are read and no result is written.
    void
    Inst_VOP3__V_CMP_CLASS_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cmpx_class_f16 instruction (VOP3 encoding) from iFmt and
    // flags it as an F16 ALU operation.
    Inst_VOP3__V_CMPX_CLASS_F16::Inst_VOP3__V_CMPX_CLASS_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_class_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_CLASS_F16

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMPX_CLASS_F16::~Inst_VOP3__V_CMPX_CLASS_F16()
    {
    } // ~Inst_VOP3__V_CMPX_CLASS_F16

    // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on
    // S0.f16
    // The function reports true if the floating point value is any of the
    // numeric types selected in S1.u according to the following list:
    // S1.u[0] -- value is a signaling NaN.
    // S1.u[1] -- value is a quiet NaN.
    // S1.u[2] -- value is negative infinity.
    // S1.u[3] -- value is a negative normal value.
    // S1.u[4] -- value is a negative denormal value.
    // S1.u[5] -- value is negative zero.
    // S1.u[6] -- value is positive zero.
    // S1.u[7] -- value is a positive denormal value.
    // S1.u[8] -- value is a positive normal value.
    // S1.u[9] -- value is positive infinity.
    // Not modeled: execute() only calls panicUnimplemented(); no operands
    // are read and no result is written.
    void
    Inst_VOP3__V_CMPX_CLASS_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cmp_f_f16 instruction (VOP3 encoding) from iFmt and flags
    // it as an F16 ALU operation.
    Inst_VOP3__V_CMP_F_F16::Inst_VOP3__V_CMP_F_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_f_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_F_F16

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMP_F_F16::~Inst_VOP3__V_CMP_F_F16()
    {
    } // ~Inst_VOP3__V_CMP_F_F16

    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
    // Not modeled: execute() only calls panicUnimplemented(); no operands
    // are read and no result is written.
    void
    Inst_VOP3__V_CMP_F_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cmp_lt_f16 instruction (VOP3 encoding) from iFmt and
    // flags it as an F16 ALU operation.
    Inst_VOP3__V_CMP_LT_F16::Inst_VOP3__V_CMP_LT_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_lt_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_LT_F16

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMP_LT_F16::~Inst_VOP3__V_CMP_LT_F16()
    {
    } // ~Inst_VOP3__V_CMP_LT_F16

    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    // Not modeled: execute() only calls panicUnimplemented(); no operands
    // are read and no result is written.
    void
    Inst_VOP3__V_CMP_LT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cmp_eq_f16 instruction (VOP3 encoding) from iFmt and
    // flags it as an F16 ALU operation.
    Inst_VOP3__V_CMP_EQ_F16::Inst_VOP3__V_CMP_EQ_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_eq_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_EQ_F16

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMP_EQ_F16::~Inst_VOP3__V_CMP_EQ_F16()
    {
    } // ~Inst_VOP3__V_CMP_EQ_F16

    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    // Not modeled: execute() only calls panicUnimplemented(); no operands
    // are read and no result is written.
    void
    Inst_VOP3__V_CMP_EQ_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cmp_le_f16 instruction (VOP3 encoding) from iFmt and
    // flags it as an F16 ALU operation.
    Inst_VOP3__V_CMP_LE_F16::Inst_VOP3__V_CMP_LE_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_le_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_LE_F16

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMP_LE_F16::~Inst_VOP3__V_CMP_LE_F16()
    {
    } // ~Inst_VOP3__V_CMP_LE_F16

    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    // Not modeled: execute() only calls panicUnimplemented(); no operands
    // are read and no result is written.
    void
    Inst_VOP3__V_CMP_LE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cmp_gt_f16 instruction (VOP3 encoding) from iFmt and
    // flags it as an F16 ALU operation.
    Inst_VOP3__V_CMP_GT_F16::Inst_VOP3__V_CMP_GT_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_gt_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_GT_F16

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMP_GT_F16::~Inst_VOP3__V_CMP_GT_F16()
    {
    } // ~Inst_VOP3__V_CMP_GT_F16

    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    // Not modeled: execute() only calls panicUnimplemented(); no operands
    // are read and no result is written.
    void
    Inst_VOP3__V_CMP_GT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cmp_lg_f16 instruction (VOP3 encoding) from iFmt and
    // flags it as an F16 ALU operation.
    Inst_VOP3__V_CMP_LG_F16::Inst_VOP3__V_CMP_LG_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_lg_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_LG_F16

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMP_LG_F16::~Inst_VOP3__V_CMP_LG_F16()
    {
    } // ~Inst_VOP3__V_CMP_LG_F16

    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    // Not modeled: execute() only calls panicUnimplemented(); no operands
    // are read and no result is written.
    void
    Inst_VOP3__V_CMP_LG_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cmp_ge_f16 instruction (VOP3 encoding) from iFmt and
    // flags it as an F16 ALU operation.
    Inst_VOP3__V_CMP_GE_F16::Inst_VOP3__V_CMP_GE_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_ge_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_GE_F16

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMP_GE_F16::~Inst_VOP3__V_CMP_GE_F16()
    {
    } // ~Inst_VOP3__V_CMP_GE_F16

    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    // Not modeled: execute() only calls panicUnimplemented(); no operands
    // are read and no result is written.
    void
    Inst_VOP3__V_CMP_GE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cmp_o_f16 instruction (VOP3 encoding) from iFmt and flags
    // it as an F16 ALU operation.
    Inst_VOP3__V_CMP_O_F16::Inst_VOP3__V_CMP_O_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_o_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_O_F16

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMP_O_F16::~Inst_VOP3__V_CMP_O_F16()
    {
    } // ~Inst_VOP3__V_CMP_O_F16

    // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding.
    // Not modeled: execute() only calls panicUnimplemented(); no operands
    // are read and no result is written.
    void
    Inst_VOP3__V_CMP_O_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cmp_u_f16 instruction (VOP3 encoding) from iFmt and flags
    // it as an F16 ALU operation.
    Inst_VOP3__V_CMP_U_F16::Inst_VOP3__V_CMP_U_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_u_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_U_F16

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMP_U_F16::~Inst_VOP3__V_CMP_U_F16()
    {
    } // ~Inst_VOP3__V_CMP_U_F16

    // D.u64[threadID] = (isNan(S0)  ||  isNan(S1)); D = VCC in VOPC encoding.
    // Not modeled: execute() only calls panicUnimplemented(); no operands
    // are read and no result is written.
    void
    Inst_VOP3__V_CMP_U_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cmp_nge_f16 instruction (VOP3 encoding) from iFmt and
    // flags it as an F16 ALU operation.
    Inst_VOP3__V_CMP_NGE_F16::Inst_VOP3__V_CMP_NGE_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_nge_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_NGE_F16

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMP_NGE_F16::~Inst_VOP3__V_CMP_NGE_F16()
    {
    } // ~Inst_VOP3__V_CMP_NGE_F16

    // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
    // Not modeled: execute() only calls panicUnimplemented(); no operands
    // are read and no result is written.
    void
    Inst_VOP3__V_CMP_NGE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cmp_nlg_f16 instruction (VOP3 encoding) from iFmt and
    // flags it as an F16 ALU operation.
    Inst_VOP3__V_CMP_NLG_F16::Inst_VOP3__V_CMP_NLG_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_nlg_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_NLG_F16

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMP_NLG_F16::~Inst_VOP3__V_CMP_NLG_F16()
    {
    } // ~Inst_VOP3__V_CMP_NLG_F16

    // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
    // Not modeled: execute() only calls panicUnimplemented(); no operands
    // are read and no result is written.
    void
    Inst_VOP3__V_CMP_NLG_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cmp_ngt_f16 instruction (VOP3 encoding) from iFmt and
    // flags it as an F16 ALU operation.
    Inst_VOP3__V_CMP_NGT_F16::Inst_VOP3__V_CMP_NGT_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_ngt_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_NGT_F16

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMP_NGT_F16::~Inst_VOP3__V_CMP_NGT_F16()
    {
    } // ~Inst_VOP3__V_CMP_NGT_F16

    // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding.
    // Not modeled: execute() only calls panicUnimplemented(); no operands
    // are read and no result is written.
    void
    Inst_VOP3__V_CMP_NGT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cmp_nle_f16 instruction (VOP3 encoding) from iFmt and
    // flags it as an F16 ALU operation.
    Inst_VOP3__V_CMP_NLE_F16::Inst_VOP3__V_CMP_NLE_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_nle_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_NLE_F16

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMP_NLE_F16::~Inst_VOP3__V_CMP_NLE_F16()
    {
    } // ~Inst_VOP3__V_CMP_NLE_F16

    // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
    // Not modeled: execute() only calls panicUnimplemented(); no operands
    // are read and no result is written.
    void
    Inst_VOP3__V_CMP_NLE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the v_cmp_neq_f16 instruction (VOP3 encoding) from iFmt and
    // flags it as an F16 ALU operation.
    Inst_VOP3__V_CMP_NEQ_F16::Inst_VOP3__V_CMP_NEQ_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_neq_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_NEQ_F16

    // Destructor: the body is empty, no cleanup is performed here.
    Inst_VOP3__V_CMP_NEQ_F16::~Inst_VOP3__V_CMP_NEQ_F16()
    {
    } // ~Inst_VOP3__V_CMP_NEQ_F16

    // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_NEQ_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented: no comparison is performed and gpuDynInst is
        // never read; control passes straight to panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 encoding of v_cmp_nlt_f16: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F16 flags.
    Inst_VOP3__V_CMP_NLT_F16::Inst_VOP3__V_CMP_NLT_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_nlt_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_NLT_F16

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMP_NLT_F16::~Inst_VOP3__V_CMP_NLT_F16() = default;

    // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_NLT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented: no comparison is performed and gpuDynInst is
        // never read; control passes straight to panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 encoding of v_cmp_tru_f16: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F16 flags.
    Inst_VOP3__V_CMP_TRU_F16::Inst_VOP3__V_CMP_TRU_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_tru_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_TRU_F16

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMP_TRU_F16::~Inst_VOP3__V_CMP_TRU_F16() = default;

    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_TRU_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets the destination bit to 1 for every lane enabled in the
        // wavefront's exec mask, then writes the ScalarOperandU64 named by
        // VDST. No source operands are read.
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            cmpMask.setBit(ln, 1);
        }

        cmpMask.write();
    }

    // VOP3 encoding of v_cmpx_f_f16: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F16 flags.
    Inst_VOP3__V_CMPX_F_F16::Inst_VOP3__V_CMPX_F_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_f_f16", true)
    {
        setFlag(ALU);
        // Every other F16 compare (e.g. v_cmpx_tru_f16) sets its data-type
        // flag, and v_cmpx_f_f32 sets F32; this one previously omitted it.
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_F_F16

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_F_F16::~Inst_VOP3__V_CMPX_F_F16() = default;

    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_F_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets the destination bit to 0 for every lane enabled in the
        // wavefront's exec mask. The exec mask is then overwritten with the
        // destination's raw value before the destination is written.
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            cmpMask.setBit(ln, 0);
        }

        wave->execMask() = cmpMask.rawData();
        cmpMask.write();
    }

    // VOP3 encoding of v_cmpx_lt_f16: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F16 flags.
    Inst_VOP3__V_CMPX_LT_F16::Inst_VOP3__V_CMPX_LT_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_lt_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_LT_F16

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_LT_F16::~Inst_VOP3__V_CMPX_LT_F16() = default;

    // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_LT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented: no comparison or exec-mask update is performed
        // and gpuDynInst is never read; control passes straight to
        // panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 encoding of v_cmpx_eq_f16: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F16 flags.
    Inst_VOP3__V_CMPX_EQ_F16::Inst_VOP3__V_CMPX_EQ_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_eq_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_EQ_F16

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_EQ_F16::~Inst_VOP3__V_CMPX_EQ_F16() = default;

    // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_EQ_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented: no comparison or exec-mask update is performed
        // and gpuDynInst is never read; control passes straight to
        // panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 encoding of v_cmpx_le_f16: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F16 flags.
    Inst_VOP3__V_CMPX_LE_F16::Inst_VOP3__V_CMPX_LE_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_le_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_LE_F16

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_LE_F16::~Inst_VOP3__V_CMPX_LE_F16() = default;

    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_LE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented: no comparison or exec-mask update is performed
        // and gpuDynInst is never read; control passes straight to
        // panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 encoding of v_cmpx_gt_f16: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F16 flags.
    Inst_VOP3__V_CMPX_GT_F16::Inst_VOP3__V_CMPX_GT_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_gt_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_GT_F16

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_GT_F16::~Inst_VOP3__V_CMPX_GT_F16() = default;

    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_GT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented: no comparison or exec-mask update is performed
        // and gpuDynInst is never read; control passes straight to
        // panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 encoding of v_cmpx_lg_f16: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F16 flags.
    Inst_VOP3__V_CMPX_LG_F16::Inst_VOP3__V_CMPX_LG_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_lg_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_LG_F16

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_LG_F16::~Inst_VOP3__V_CMPX_LG_F16() = default;

    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_LG_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented: no comparison or exec-mask update is performed
        // and gpuDynInst is never read; control passes straight to
        // panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 encoding of v_cmpx_ge_f16: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F16 flags.
    Inst_VOP3__V_CMPX_GE_F16::Inst_VOP3__V_CMPX_GE_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ge_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_GE_F16

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_GE_F16::~Inst_VOP3__V_CMPX_GE_F16() = default;

    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_GE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented: no comparison or exec-mask update is performed
        // and gpuDynInst is never read; control passes straight to
        // panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 encoding of v_cmpx_o_f16: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F16 flags.
    Inst_VOP3__V_CMPX_O_F16::Inst_VOP3__V_CMPX_O_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_o_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_O_F16

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_O_F16::~Inst_VOP3__V_CMPX_O_F16() = default;

    // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC
    // encoding.
    void
    Inst_VOP3__V_CMPX_O_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented: no comparison or exec-mask update is performed
        // and gpuDynInst is never read; control passes straight to
        // panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 encoding of v_cmpx_u_f16: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F16 flags.
    Inst_VOP3__V_CMPX_U_F16::Inst_VOP3__V_CMPX_U_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_u_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_U_F16

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_U_F16::~Inst_VOP3__V_CMPX_U_F16() = default;

    // EXEC,D.u64[threadID] = (isNan(S0)  ||  isNan(S1)); D = VCC in VOPC
    // encoding.
    void
    Inst_VOP3__V_CMPX_U_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented: no comparison or exec-mask update is performed
        // and gpuDynInst is never read; control passes straight to
        // panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 encoding of v_cmpx_nge_f16: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F16 flags.
    Inst_VOP3__V_CMPX_NGE_F16::Inst_VOP3__V_CMPX_NGE_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_nge_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_NGE_F16

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_NGE_F16::~Inst_VOP3__V_CMPX_NGE_F16() = default;

    // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_NGE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented: no comparison or exec-mask update is performed
        // and gpuDynInst is never read; control passes straight to
        // panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 encoding of v_cmpx_nlg_f16: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F16 flags.
    Inst_VOP3__V_CMPX_NLG_F16::Inst_VOP3__V_CMPX_NLG_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_nlg_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_NLG_F16

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_NLG_F16::~Inst_VOP3__V_CMPX_NLG_F16() = default;

    // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_NLG_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented: no comparison or exec-mask update is performed
        // and gpuDynInst is never read; control passes straight to
        // panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 encoding of v_cmpx_ngt_f16: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F16 flags.
    Inst_VOP3__V_CMPX_NGT_F16::Inst_VOP3__V_CMPX_NGT_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ngt_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_NGT_F16

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_NGT_F16::~Inst_VOP3__V_CMPX_NGT_F16() = default;

    // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_NGT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented: no comparison or exec-mask update is performed
        // and gpuDynInst is never read; control passes straight to
        // panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 encoding of v_cmpx_nle_f16: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F16 flags.
    Inst_VOP3__V_CMPX_NLE_F16::Inst_VOP3__V_CMPX_NLE_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_nle_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_NLE_F16

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_NLE_F16::~Inst_VOP3__V_CMPX_NLE_F16() = default;

    // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_NLE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented: no comparison or exec-mask update is performed
        // and gpuDynInst is never read; control passes straight to
        // panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 encoding of v_cmpx_neq_f16: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F16 flags.
    Inst_VOP3__V_CMPX_NEQ_F16::Inst_VOP3__V_CMPX_NEQ_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_neq_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_NEQ_F16

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_NEQ_F16::~Inst_VOP3__V_CMPX_NEQ_F16() = default;

    // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_NEQ_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented: no comparison or exec-mask update is performed
        // and gpuDynInst is never read; control passes straight to
        // panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 encoding of v_cmpx_nlt_f16: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F16 flags.
    Inst_VOP3__V_CMPX_NLT_F16::Inst_VOP3__V_CMPX_NLT_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_nlt_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_NLT_F16

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_NLT_F16::~Inst_VOP3__V_CMPX_NLT_F16() = default;

    // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_NLT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented: no comparison or exec-mask update is performed
        // and gpuDynInst is never read; control passes straight to
        // panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 encoding of v_cmpx_tru_f16: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F16 flags.
    Inst_VOP3__V_CMPX_TRU_F16::Inst_VOP3__V_CMPX_TRU_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_tru_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_TRU_F16

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_TRU_F16::~Inst_VOP3__V_CMPX_TRU_F16() = default;

    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_TRU_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets the destination bit to 1 for every lane enabled in the
        // wavefront's exec mask. The exec mask is then overwritten with the
        // destination's raw value before the destination is written.
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            cmpMask.setBit(ln, 1);
        }

        wave->execMask() = cmpMask.rawData();
        cmpMask.write();
    }

    // VOP3 encoding of v_cmp_f_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMP_F_F32::Inst_VOP3__V_CMP_F_F32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_f_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMP_F_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMP_F_F32::~Inst_VOP3__V_CMP_F_F32() = default;

    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_F_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets the destination bit to 0 for every lane enabled in the
        // wavefront's exec mask, then writes the ScalarOperandU64 named by
        // VDST. No source operands are read.
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            cmpMask.setBit(ln, 0);
        }

        cmpMask.write();
    }

    // VOP3 encoding of v_cmp_lt_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMP_LT_F32::Inst_VOP3__V_CMP_LT_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_lt_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMP_LT_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMP_LT_F32::~Inst_VOP3__V_CMP_LT_F32() = default;

    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_LT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, records (SRC0 < SRC1) as
        // that lane's bit of the ScalarOperandU64 named by VDST.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            const bool isLess = lhs[ln] < rhs[ln];
            cmpMask.setBit(ln, isLess ? 1 : 0);
        }

        cmpMask.write();
    }

    // VOP3 encoding of v_cmp_eq_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMP_EQ_F32::Inst_VOP3__V_CMP_EQ_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_eq_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMP_EQ_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMP_EQ_F32::~Inst_VOP3__V_CMP_EQ_F32() = default;

    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_EQ_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, records (SRC0 == SRC1) as
        // that lane's bit of the ScalarOperandU64 named by VDST.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            const bool isEqual = lhs[ln] == rhs[ln];
            cmpMask.setBit(ln, isEqual ? 1 : 0);
        }

        cmpMask.write();
    }

    // VOP3 encoding of v_cmp_le_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMP_LE_F32::Inst_VOP3__V_CMP_LE_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_le_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMP_LE_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMP_LE_F32::~Inst_VOP3__V_CMP_LE_F32() = default;

    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_LE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, records (SRC0 <= SRC1) as
        // that lane's bit of the ScalarOperandU64 named by VDST.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            const bool isLessEq = lhs[ln] <= rhs[ln];
            cmpMask.setBit(ln, isLessEq ? 1 : 0);
        }

        cmpMask.write();
    }

    // VOP3 encoding of v_cmp_gt_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMP_GT_F32::Inst_VOP3__V_CMP_GT_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_gt_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMP_GT_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMP_GT_F32::~Inst_VOP3__V_CMP_GT_F32() = default;

    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_GT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, records (SRC0 > SRC1) as
        // that lane's bit of the ScalarOperandU64 named by VDST.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            const bool isGreater = lhs[ln] > rhs[ln];
            cmpMask.setBit(ln, isGreater ? 1 : 0);
        }

        cmpMask.write();
    }

    // VOP3 encoding of v_cmp_lg_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMP_LG_F32::Inst_VOP3__V_CMP_LG_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_lg_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMP_LG_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMP_LG_F32::~Inst_VOP3__V_CMP_LG_F32() = default;

    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_LG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, records (SRC0 <> SRC1) as
        // that lane's bit of the ScalarOperandU64 named by VDST.
        //
        // LG is "less than or greater than", so it must be false when either
        // operand is NaN. A plain != is true for NaN operands (that is the
        // NEQ result), so the two ordered relations are tested explicitly,
        // matching V_CMPX_LG_F32 and the negated form in V_CMP_NLG_F32.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                sdst.setBit(lane, (src0[lane] < src1[lane]
                    || src0[lane] > src1[lane]) ? 1 : 0);
            }
        }

        sdst.write();
    }

    // VOP3 encoding of v_cmp_ge_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMP_GE_F32::Inst_VOP3__V_CMP_GE_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_ge_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMP_GE_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMP_GE_F32::~Inst_VOP3__V_CMP_GE_F32() = default;

    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_GE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, records (SRC0 >= SRC1) as
        // that lane's bit of the ScalarOperandU64 named by VDST.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            const bool isGreaterEq = lhs[ln] >= rhs[ln];
            cmpMask.setBit(ln, isGreaterEq ? 1 : 0);
        }

        cmpMask.write();
    }

    // VOP3 encoding of v_cmp_o_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMP_O_F32::Inst_VOP3__V_CMP_O_F32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_o_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMP_O_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMP_O_F32::~Inst_VOP3__V_CMP_O_F32() = default;

    // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_O_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, sets that lane's bit of
        // the ScalarOperandU64 named by VDST to 1 when neither SRC0 nor SRC1
        // is NaN, and to 0 otherwise.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            const bool anyNan = std::isnan(lhs[ln]) || std::isnan(rhs[ln]);
            cmpMask.setBit(ln, anyNan ? 0 : 1);
        }

        cmpMask.write();
    }

    // VOP3 encoding of v_cmp_u_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMP_U_F32::Inst_VOP3__V_CMP_U_F32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_u_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMP_U_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMP_U_F32::~Inst_VOP3__V_CMP_U_F32() = default;

    // D.u64[threadID] = (isNan(S0)  ||  isNan(S1)); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_U_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, sets that lane's bit of
        // the ScalarOperandU64 named by VDST to 1 when SRC0 or SRC1 is NaN,
        // and to 0 otherwise.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            const bool anyNan = std::isnan(lhs[ln]) || std::isnan(rhs[ln]);
            cmpMask.setBit(ln, anyNan ? 1 : 0);
        }

        cmpMask.write();
    }

    // VOP3 encoding of v_cmp_nge_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMP_NGE_F32::Inst_VOP3__V_CMP_NGE_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_nge_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMP_NGE_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMP_NGE_F32::~Inst_VOP3__V_CMP_NGE_F32() = default;

    // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_NGE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, records !(SRC0 >= SRC1) as
        // that lane's bit of the ScalarOperandU64 named by VDST.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            // The >= result is inverted, not replaced by <.
            const bool isGreaterEq = lhs[ln] >= rhs[ln];
            cmpMask.setBit(ln, isGreaterEq ? 0 : 1);
        }

        cmpMask.write();
    }

    // VOP3 encoding of v_cmp_nlg_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMP_NLG_F32::Inst_VOP3__V_CMP_NLG_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_nlg_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMP_NLG_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMP_NLG_F32::~Inst_VOP3__V_CMP_NLG_F32() = default;

    // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_NLG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, records !(SRC0 <> SRC1),
        // i.e. neither less-than nor greater-than, as that lane's bit of the
        // ScalarOperandU64 named by VDST.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            const bool isLessOrGreater =
                lhs[ln] < rhs[ln] || lhs[ln] > rhs[ln];
            cmpMask.setBit(ln, isLessOrGreater ? 0 : 1);
        }

        cmpMask.write();
    }

    // VOP3 encoding of v_cmp_ngt_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMP_NGT_F32::Inst_VOP3__V_CMP_NGT_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_ngt_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMP_NGT_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMP_NGT_F32::~Inst_VOP3__V_CMP_NGT_F32() = default;

    // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_NGT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, records !(SRC0 > SRC1) as
        // that lane's bit of the ScalarOperandU64 named by VDST.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            // The > result is inverted, not replaced by <=.
            const bool isGreater = lhs[ln] > rhs[ln];
            cmpMask.setBit(ln, isGreater ? 0 : 1);
        }

        cmpMask.write();
    }

    // VOP3 encoding of v_cmp_nle_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMP_NLE_F32::Inst_VOP3__V_CMP_NLE_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_nle_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMP_NLE_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMP_NLE_F32::~Inst_VOP3__V_CMP_NLE_F32() = default;

    // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_NLE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, records !(SRC0 <= SRC1) as
        // that lane's bit of the ScalarOperandU64 named by VDST.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            // The <= result is inverted, not replaced by >.
            const bool isLessEq = lhs[ln] <= rhs[ln];
            cmpMask.setBit(ln, isLessEq ? 0 : 1);
        }

        cmpMask.write();
    }

    // VOP3 encoding of v_cmp_neq_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMP_NEQ_F32::Inst_VOP3__V_CMP_NEQ_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_neq_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMP_NEQ_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMP_NEQ_F32::~Inst_VOP3__V_CMP_NEQ_F32() = default;

    // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_NEQ_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, records (SRC0 != SRC1) as
        // that lane's bit of the ScalarOperandU64 named by VDST.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            const bool differs = lhs[ln] != rhs[ln];
            cmpMask.setBit(ln, differs ? 1 : 0);
        }

        cmpMask.write();
    }

    // VOP3 encoding of v_cmp_nlt_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMP_NLT_F32::Inst_VOP3__V_CMP_NLT_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_nlt_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMP_NLT_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMP_NLT_F32::~Inst_VOP3__V_CMP_NLT_F32() = default;

    // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_NLT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, records !(SRC0 < SRC1) as
        // that lane's bit of the ScalarOperandU64 named by VDST.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            // The < result is inverted, not replaced by >=.
            const bool isLess = lhs[ln] < rhs[ln];
            cmpMask.setBit(ln, isLess ? 0 : 1);
        }

        cmpMask.write();
    }

    // VOP3 encoding of v_cmp_tru_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMP_TRU_F32::Inst_VOP3__V_CMP_TRU_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_tru_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMP_TRU_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMP_TRU_F32::~Inst_VOP3__V_CMP_TRU_F32() = default;

    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_TRU_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets the destination bit to 1 for every lane enabled in the
        // wavefront's exec mask, then writes the ScalarOperandU64 named by
        // VDST. No source operands are read.
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            cmpMask.setBit(ln, 1);
        }

        cmpMask.write();
    }

    // VOP3 encoding of v_cmpx_f_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMPX_F_F32::Inst_VOP3__V_CMPX_F_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_f_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMPX_F_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_F_F32::~Inst_VOP3__V_CMPX_F_F32() = default;

    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_F_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sets the destination bit to 0 for every lane enabled in the
        // wavefront's exec mask. The exec mask is then overwritten with the
        // destination's raw value before the destination is written.
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            cmpMask.setBit(ln, 0);
        }

        wave->execMask() = cmpMask.rawData();
        cmpMask.write();
    }

    // VOP3 encoding of v_cmpx_lt_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMPX_LT_F32::Inst_VOP3__V_CMPX_LT_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_lt_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMPX_LT_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_LT_F32::~Inst_VOP3__V_CMPX_LT_F32() = default;

    // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_LT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, records (SRC0 < SRC1) as
        // that lane's destination bit. The exec mask is then overwritten
        // with the destination's raw value before the destination is written.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            const bool isLess = lhs[ln] < rhs[ln];
            cmpMask.setBit(ln, isLess ? 1 : 0);
        }

        wave->execMask() = cmpMask.rawData();
        cmpMask.write();
    }

    // VOP3 encoding of v_cmpx_eq_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMPX_EQ_F32::Inst_VOP3__V_CMPX_EQ_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_eq_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMPX_EQ_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_EQ_F32::~Inst_VOP3__V_CMPX_EQ_F32() = default;

    // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_EQ_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, records (SRC0 == SRC1) as
        // that lane's destination bit. The exec mask is then overwritten
        // with the destination's raw value before the destination is written.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            const bool isEqual = lhs[ln] == rhs[ln];
            cmpMask.setBit(ln, isEqual ? 1 : 0);
        }

        wave->execMask() = cmpMask.rawData();
        cmpMask.write();
    }

    // VOP3 encoding of v_cmpx_le_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMPX_LE_F32::Inst_VOP3__V_CMPX_LE_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_le_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMPX_LE_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_LE_F32::~Inst_VOP3__V_CMPX_LE_F32() = default;

    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_LE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, records (SRC0 <= SRC1) as
        // that lane's destination bit. The exec mask is then overwritten
        // with the destination's raw value before the destination is written.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            const bool isLessEq = lhs[ln] <= rhs[ln];
            cmpMask.setBit(ln, isLessEq ? 1 : 0);
        }

        wave->execMask() = cmpMask.rawData();
        cmpMask.write();
    }

    // VOP3 encoding of v_cmpx_gt_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMPX_GT_F32::Inst_VOP3__V_CMPX_GT_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_gt_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMPX_GT_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_GT_F32::~Inst_VOP3__V_CMPX_GT_F32() = default;

    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_GT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, records (SRC0 > SRC1) as
        // that lane's destination bit. The exec mask is then overwritten
        // with the destination's raw value before the destination is written.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            const bool isGreater = lhs[ln] > rhs[ln];
            cmpMask.setBit(ln, isGreater ? 1 : 0);
        }

        wave->execMask() = cmpMask.rawData();
        cmpMask.write();
    }

    // VOP3 encoding of v_cmpx_lg_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMPX_LG_F32::Inst_VOP3__V_CMPX_LG_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_lg_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMPX_LG_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_LG_F32::~Inst_VOP3__V_CMPX_LG_F32() = default;

    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_LG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, records (SRC0 <> SRC1),
        // i.e. less-than or greater-than, as that lane's destination bit.
        // The exec mask is then overwritten with the destination's raw value
        // before the destination is written.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            const bool isLessOrGreater =
                lhs[ln] < rhs[ln] || lhs[ln] > rhs[ln];
            cmpMask.setBit(ln, isLessOrGreater ? 1 : 0);
        }

        wave->execMask() = cmpMask.rawData();
        cmpMask.write();
    }

    // VOP3 encoding of v_cmpx_ge_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMPX_GE_F32::Inst_VOP3__V_CMPX_GE_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ge_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMPX_GE_F32

    // No custom teardown is needed; use the compiler-generated destructor.
    Inst_VOP3__V_CMPX_GE_F32::~Inst_VOP3__V_CMPX_GE_F32() = default;

    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_GE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // For each lane enabled in the exec mask, records (SRC0 >= SRC1) as
        // that lane's destination bit. The exec mask is then overwritten
        // with the destination's raw value before the destination is written.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 cmpMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
            // Lanes disabled in the exec mask get no bit written.
            if (!wave->execMask(ln)) {
                continue;
            }

            const bool isGreaterEq = lhs[ln] >= rhs[ln];
            cmpMask.setBit(ln, isGreaterEq ? 1 : 0);
        }

        wave->execMask() = cmpMask.rawData();
        cmpMask.write();
    }

    // VOP3 encoding of v_cmpx_o_f32: hands iFmt and the mnemonic to the
    // Inst_VOP3 base, then raises the ALU and F32 flags.
    Inst_VOP3__V_CMPX_O_F32::Inst_VOP3__V_CMPX_O_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_o_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMPX_O_F32

    // No explicit cleanup is performed for v_cmpx_o_f32.
    Inst_VOP3__V_CMPX_O_F32::~Inst_VOP3__V_CMPX_O_F32() = default;

    /**
     * v_cmpx_o_f32 (VOP3 encoding): "ordered" test.
     * EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)), evaluated only for
     * lanes enabled in the exec mask.
     */
    void
    Inst_VOP3__V_CMPX_O_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const bool ordered =
                !std::isnan(lhs[lane]) && !std::isnan(rhs[lane]);
            result.setBit(lane, ordered ? 1 : 0);
        }

        // The computed mask replaces the exec mask and is also written out.
        wave->execMask() = result.rawData();
        result.write();
    }

    // VOP3 form of v_cmpx_u_f32; sets the ALU and F32 flags.
    Inst_VOP3__V_CMPX_U_F32::Inst_VOP3__V_CMPX_U_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_u_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMPX_U_F32

    // No explicit cleanup is performed for v_cmpx_u_f32.
    Inst_VOP3__V_CMPX_U_F32::~Inst_VOP3__V_CMPX_U_F32() = default;

    /**
     * v_cmpx_u_f32 (VOP3 encoding): "unordered" test.
     * EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)), evaluated only for
     * lanes enabled in the exec mask.
     */
    void
    Inst_VOP3__V_CMPX_U_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const bool unordered =
                std::isnan(lhs[lane]) || std::isnan(rhs[lane]);
            result.setBit(lane, unordered ? 1 : 0);
        }

        // The computed mask replaces the exec mask and is also written out.
        wave->execMask() = result.rawData();
        result.write();
    }

    // VOP3 form of v_cmpx_nge_f32; sets the ALU and F32 flags.
    Inst_VOP3__V_CMPX_NGE_F32::Inst_VOP3__V_CMPX_NGE_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_nge_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMPX_NGE_F32

    // No explicit cleanup is performed for v_cmpx_nge_f32.
    Inst_VOP3__V_CMPX_NGE_F32::~Inst_VOP3__V_CMPX_NGE_F32() = default;

    // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
    //
    // VOP3 ABS/NEG input modifiers are applied to S0 and S1 before the
    // compare, matching the F64 compares in this file. Result bits are set
    // only for lanes enabled in the exec mask; the resulting mask is then
    // assigned to the wavefront's exec mask and written to the destination.
    void
    Inst_VOP3__V_CMPX_NGE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Honor the per-source input modifiers (bit 0 = S0, bit 1 = S1).
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Negated compare: also true when either input is NaN.
                sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0);
            }
        }

        wf->execMask() = sdst.rawData();
        sdst.write();
    }

    // VOP3 form of v_cmpx_nlg_f32; sets the ALU and F32 flags.
    Inst_VOP3__V_CMPX_NLG_F32::Inst_VOP3__V_CMPX_NLG_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_nlg_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMPX_NLG_F32

    // No explicit cleanup is performed for v_cmpx_nlg_f32.
    Inst_VOP3__V_CMPX_NLG_F32::~Inst_VOP3__V_CMPX_NLG_F32() = default;

    // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
    //
    // VOP3 ABS/NEG input modifiers are applied to S0 and S1 before the
    // compare, matching the F64 compares in this file. Result bits are set
    // only for lanes enabled in the exec mask; the resulting mask is then
    // assigned to the wavefront's exec mask and written to the destination.
    void
    Inst_VOP3__V_CMPX_NLG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Honor the per-source input modifiers (bit 0 = S0, bit 1 = S1).
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Negated less-or-greater: true for equal or NaN inputs.
                sdst.setBit(lane, !(src0[lane] < src1[lane]
                    || src0[lane] > src1[lane]) ? 1 : 0);
            }
        }

        wf->execMask() = sdst.rawData();
        sdst.write();
    }

    // VOP3 form of v_cmpx_ngt_f32; sets the ALU and F32 flags.
    Inst_VOP3__V_CMPX_NGT_F32::Inst_VOP3__V_CMPX_NGT_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ngt_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMPX_NGT_F32

    // No explicit cleanup is performed for v_cmpx_ngt_f32.
    Inst_VOP3__V_CMPX_NGT_F32::~Inst_VOP3__V_CMPX_NGT_F32() = default;

    // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding.
    //
    // VOP3 ABS/NEG input modifiers are applied to S0 and S1 before the
    // compare, matching the F64 compares in this file. Result bits are set
    // only for lanes enabled in the exec mask; the resulting mask is then
    // assigned to the wavefront's exec mask and written to the destination.
    void
    Inst_VOP3__V_CMPX_NGT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Honor the per-source input modifiers (bit 0 = S0, bit 1 = S1).
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Negated compare: also true when either input is NaN.
                sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0);
            }
        }

        wf->execMask() = sdst.rawData();
        sdst.write();
    }

    // VOP3 form of v_cmpx_nle_f32; sets the ALU and F32 flags.
    Inst_VOP3__V_CMPX_NLE_F32::Inst_VOP3__V_CMPX_NLE_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_nle_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMPX_NLE_F32

    // No explicit cleanup is performed for v_cmpx_nle_f32.
    Inst_VOP3__V_CMPX_NLE_F32::~Inst_VOP3__V_CMPX_NLE_F32() = default;

    // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
    //
    // VOP3 ABS/NEG input modifiers are applied to S0 and S1 before the
    // compare, matching the F64 compares in this file. Result bits are set
    // only for lanes enabled in the exec mask; the resulting mask is then
    // assigned to the wavefront's exec mask and written to the destination.
    void
    Inst_VOP3__V_CMPX_NLE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Honor the per-source input modifiers (bit 0 = S0, bit 1 = S1).
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Negated compare: also true when either input is NaN.
                sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0);
            }
        }

        wf->execMask() = sdst.rawData();
        sdst.write();
    }

    // VOP3 form of v_cmpx_neq_f32; sets the ALU and F32 flags.
    Inst_VOP3__V_CMPX_NEQ_F32::Inst_VOP3__V_CMPX_NEQ_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_neq_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMPX_NEQ_F32

    // No explicit cleanup is performed for v_cmpx_neq_f32.
    Inst_VOP3__V_CMPX_NEQ_F32::~Inst_VOP3__V_CMPX_NEQ_F32() = default;

    // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
    //
    // VOP3 ABS/NEG input modifiers are applied to S0 and S1 before the
    // compare, matching the F64 compares in this file. Result bits are set
    // only for lanes enabled in the exec mask; the resulting mask is then
    // assigned to the wavefront's exec mask and written to the destination.
    void
    Inst_VOP3__V_CMPX_NEQ_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Honor the per-source input modifiers (bit 0 = S0, bit 1 = S1).
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // != is the exact negation of ==, so NaN inputs yield 1.
                sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
            }
        }

        wf->execMask() = sdst.rawData();
        sdst.write();
    }

    // VOP3 form of v_cmpx_nlt_f32; sets the ALU and F32 flags.
    Inst_VOP3__V_CMPX_NLT_F32::Inst_VOP3__V_CMPX_NLT_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_nlt_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMPX_NLT_F32

    // No explicit cleanup is performed for v_cmpx_nlt_f32.
    Inst_VOP3__V_CMPX_NLT_F32::~Inst_VOP3__V_CMPX_NLT_F32() = default;

    // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding.
    //
    // VOP3 ABS/NEG input modifiers are applied to S0 and S1 before the
    // compare, matching the F64 compares in this file. Result bits are set
    // only for lanes enabled in the exec mask; the resulting mask is then
    // assigned to the wavefront's exec mask and written to the destination.
    void
    Inst_VOP3__V_CMPX_NLT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Honor the per-source input modifiers (bit 0 = S0, bit 1 = S1).
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Negated compare: also true when either input is NaN.
                sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0);
            }
        }

        wf->execMask() = sdst.rawData();
        sdst.write();
    }

    // VOP3 form of v_cmpx_tru_f32; sets the ALU and F32 flags.
    Inst_VOP3__V_CMPX_TRU_F32::Inst_VOP3__V_CMPX_TRU_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_tru_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMPX_TRU_F32

    // No explicit cleanup is performed for v_cmpx_tru_f32.
    Inst_VOP3__V_CMPX_TRU_F32::~Inst_VOP3__V_CMPX_TRU_F32() = default;

    /**
     * v_cmpx_tru_f32 (VOP3 encoding): always-true compare.
     * EXEC,D.u64[threadID] = 1 for every lane enabled in the exec mask; no
     * source operands are read.
     */
    void
    Inst_VOP3__V_CMPX_TRU_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            result.setBit(lane, 1);
        }

        // The computed mask replaces the exec mask and is also written out.
        wave->execMask() = result.rawData();
        result.write();
    }

    // VOP3 form of v_cmp_f_f64; sets the ALU and F64 flags.
    Inst_VOP3__V_CMP_F_F64::Inst_VOP3__V_CMP_F_F64(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_f_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMP_F_F64

    // No explicit cleanup is performed for v_cmp_f_f64.
    Inst_VOP3__V_CMP_F_F64::~Inst_VOP3__V_CMP_F_F64() = default;

    /**
     * v_cmp_f_f64 (VOP3 encoding): always-false compare.
     * D.u64[threadID] = 0 for every lane enabled in the exec mask; no source
     * operands are read.
     */
    void
    Inst_VOP3__V_CMP_F_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            result.setBit(lane, 0);
        }

        result.write();
    }

    // VOP3 form of v_cmp_lt_f64; sets the ALU and F64 flags.
    Inst_VOP3__V_CMP_LT_F64::Inst_VOP3__V_CMP_LT_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_lt_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMP_LT_F64

    // No explicit cleanup is performed for v_cmp_lt_f64.
    Inst_VOP3__V_CMP_LT_F64::~Inst_VOP3__V_CMP_LT_F64() = default;

    /**
     * v_cmp_lt_f64 (VOP3 encoding).
     * D.u64[threadID] = (S0 < S1) for every lane enabled in the exec mask,
     * after the ABS/NEG input modifiers have been applied to the sources.
     */
    void
    Inst_VOP3__V_CMP_LT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Input modifiers for S0 (bit 0 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }

        // Input modifiers for S1 (bit 1 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Bit 2 of the modifier fields must be clear for this instruction.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const bool isLess = lhs[lane] < rhs[lane];
            result.setBit(lane, isLess ? 1 : 0);
        }

        result.write();
    }

    // VOP3 form of v_cmp_eq_f64; sets the ALU and F64 flags.
    Inst_VOP3__V_CMP_EQ_F64::Inst_VOP3__V_CMP_EQ_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_eq_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMP_EQ_F64

    // No explicit cleanup is performed for v_cmp_eq_f64.
    Inst_VOP3__V_CMP_EQ_F64::~Inst_VOP3__V_CMP_EQ_F64() = default;

    /**
     * v_cmp_eq_f64 (VOP3 encoding).
     * D.u64[threadID] = (S0 == S1) for every lane enabled in the exec mask,
     * after the ABS/NEG input modifiers have been applied to the sources.
     */
    void
    Inst_VOP3__V_CMP_EQ_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Input modifiers for S0 (bit 0 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }

        // Input modifiers for S1 (bit 1 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Bit 2 of the modifier fields must be clear for this instruction.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const bool isEqual = lhs[lane] == rhs[lane];
            result.setBit(lane, isEqual ? 1 : 0);
        }

        result.write();
    }

    // VOP3 form of v_cmp_le_f64; sets the ALU and F64 flags.
    Inst_VOP3__V_CMP_LE_F64::Inst_VOP3__V_CMP_LE_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_le_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMP_LE_F64

    // No explicit cleanup is performed for v_cmp_le_f64.
    Inst_VOP3__V_CMP_LE_F64::~Inst_VOP3__V_CMP_LE_F64() = default;

    /**
     * v_cmp_le_f64 (VOP3 encoding).
     * D.u64[threadID] = (S0 <= S1) for every lane enabled in the exec mask,
     * after the ABS/NEG input modifiers have been applied to the sources.
     */
    void
    Inst_VOP3__V_CMP_LE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Input modifiers for S0 (bit 0 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }

        // Input modifiers for S1 (bit 1 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Bit 2 of the modifier fields must be clear for this instruction.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const bool isLessEq = lhs[lane] <= rhs[lane];
            result.setBit(lane, isLessEq ? 1 : 0);
        }

        result.write();
    }

    // VOP3 form of v_cmp_gt_f64; sets the ALU and F64 flags.
    Inst_VOP3__V_CMP_GT_F64::Inst_VOP3__V_CMP_GT_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_gt_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMP_GT_F64

    // No explicit cleanup is performed for v_cmp_gt_f64.
    Inst_VOP3__V_CMP_GT_F64::~Inst_VOP3__V_CMP_GT_F64() = default;

    /**
     * v_cmp_gt_f64 (VOP3 encoding).
     * D.u64[threadID] = (S0 > S1) for every lane enabled in the exec mask,
     * after the ABS/NEG input modifiers have been applied to the sources.
     */
    void
    Inst_VOP3__V_CMP_GT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Input modifiers for S0 (bit 0 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }

        // Input modifiers for S1 (bit 1 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Bit 2 of the modifier fields must be clear for this instruction.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const bool isGreater = lhs[lane] > rhs[lane];
            result.setBit(lane, isGreater ? 1 : 0);
        }

        result.write();
    }

    // VOP3 form of v_cmp_lg_f64; sets the ALU and F64 flags.
    Inst_VOP3__V_CMP_LG_F64::Inst_VOP3__V_CMP_LG_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_lg_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMP_LG_F64

    // No explicit cleanup is performed for v_cmp_lg_f64.
    Inst_VOP3__V_CMP_LG_F64::~Inst_VOP3__V_CMP_LG_F64() = default;

    /**
     * v_cmp_lg_f64 (VOP3 encoding).
     * D.u64[threadID] = (S0 <> S1) for every lane enabled in the exec mask,
     * after the ABS/NEG input modifiers have been applied to the sources.
     */
    void
    Inst_VOP3__V_CMP_LG_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Input modifiers for S0 (bit 0 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }

        // Input modifiers for S1 (bit 1 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Bit 2 of the modifier fields must be clear for this instruction.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            // Less-or-greater: false when either input is NaN.
            const bool isLessOrGreater =
                lhs[lane] < rhs[lane] || lhs[lane] > rhs[lane];
            result.setBit(lane, isLessOrGreater ? 1 : 0);
        }

        result.write();
    }

    // VOP3 form of v_cmp_ge_f64; sets the ALU and F64 flags.
    Inst_VOP3__V_CMP_GE_F64::Inst_VOP3__V_CMP_GE_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_ge_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMP_GE_F64

    // No explicit cleanup is performed for v_cmp_ge_f64.
    Inst_VOP3__V_CMP_GE_F64::~Inst_VOP3__V_CMP_GE_F64() = default;

    /**
     * v_cmp_ge_f64 (VOP3 encoding).
     * D.u64[threadID] = (S0 >= S1) for every lane enabled in the exec mask,
     * after the ABS/NEG input modifiers have been applied to the sources.
     */
    void
    Inst_VOP3__V_CMP_GE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Input modifiers for S0 (bit 0 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }

        // Input modifiers for S1 (bit 1 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Bit 2 of the modifier fields must be clear for this instruction.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const bool isGreaterEq = lhs[lane] >= rhs[lane];
            result.setBit(lane, isGreaterEq ? 1 : 0);
        }

        result.write();
    }

    // VOP3 form of v_cmp_o_f64; sets the ALU and F64 flags.
    Inst_VOP3__V_CMP_O_F64::Inst_VOP3__V_CMP_O_F64(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_o_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMP_O_F64

    // No explicit cleanup is performed for v_cmp_o_f64.
    Inst_VOP3__V_CMP_O_F64::~Inst_VOP3__V_CMP_O_F64() = default;

    /**
     * v_cmp_o_f64 (VOP3 encoding): "ordered" test.
     * D.u64[threadID] = (!isNan(S0) && !isNan(S1)) for every lane enabled in
     * the exec mask, after the ABS/NEG input modifiers have been applied.
     */
    void
    Inst_VOP3__V_CMP_O_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Input modifiers for S0 (bit 0 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }

        // Input modifiers for S1 (bit 1 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Bit 2 of the modifier fields must be clear for this instruction.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const bool ordered =
                !std::isnan(lhs[lane]) && !std::isnan(rhs[lane]);
            result.setBit(lane, ordered ? 1 : 0);
        }

        result.write();
    }

    // VOP3 form of v_cmp_u_f64; sets the ALU and F64 flags.
    Inst_VOP3__V_CMP_U_F64::Inst_VOP3__V_CMP_U_F64(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_u_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMP_U_F64

    // No explicit cleanup is performed for v_cmp_u_f64.
    Inst_VOP3__V_CMP_U_F64::~Inst_VOP3__V_CMP_U_F64() = default;

    /**
     * v_cmp_u_f64 (VOP3 encoding): "unordered" test.
     * D.u64[threadID] = (isNan(S0) || isNan(S1)) for every lane enabled in
     * the exec mask, after the ABS/NEG input modifiers have been applied.
     */
    void
    Inst_VOP3__V_CMP_U_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Input modifiers for S0 (bit 0 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }

        // Input modifiers for S1 (bit 1 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Bit 2 of the modifier fields must be clear for this instruction.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const bool unordered =
                std::isnan(lhs[lane]) || std::isnan(rhs[lane]);
            result.setBit(lane, unordered ? 1 : 0);
        }

        result.write();
    }

    // VOP3 form of v_cmp_nge_f64; sets the ALU and F64 flags.
    Inst_VOP3__V_CMP_NGE_F64::Inst_VOP3__V_CMP_NGE_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_nge_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMP_NGE_F64

    // No explicit cleanup is performed for v_cmp_nge_f64.
    Inst_VOP3__V_CMP_NGE_F64::~Inst_VOP3__V_CMP_NGE_F64() = default;

    /**
     * v_cmp_nge_f64 (VOP3 encoding).
     * D.u64[threadID] = !(S0 >= S1) for every lane enabled in the exec mask,
     * after the ABS/NEG input modifiers have been applied to the sources.
     */
    void
    Inst_VOP3__V_CMP_NGE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Input modifiers for S0 (bit 0 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }

        // Input modifiers for S1 (bit 1 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Bit 2 of the modifier fields must be clear for this instruction.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            // Negated compare: also true when either input is NaN.
            const bool notGreaterEq = !(lhs[lane] >= rhs[lane]);
            result.setBit(lane, notGreaterEq ? 1 : 0);
        }

        result.write();
    }

    // VOP3 form of v_cmp_nlg_f64; sets the ALU and F64 flags.
    Inst_VOP3__V_CMP_NLG_F64::Inst_VOP3__V_CMP_NLG_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_nlg_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMP_NLG_F64

    // No explicit cleanup is performed for v_cmp_nlg_f64.
    Inst_VOP3__V_CMP_NLG_F64::~Inst_VOP3__V_CMP_NLG_F64() = default;

    /**
     * v_cmp_nlg_f64 (VOP3 encoding).
     * D.u64[threadID] = !(S0 <> S1) for every lane enabled in the exec mask,
     * after the ABS/NEG input modifiers have been applied to the sources.
     */
    void
    Inst_VOP3__V_CMP_NLG_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Input modifiers for S0 (bit 0 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }

        // Input modifiers for S1 (bit 1 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Bit 2 of the modifier fields must be clear for this instruction.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            // Negated less-or-greater: true for equal or NaN inputs.
            const bool isLessOrGreater =
                lhs[lane] < rhs[lane] || lhs[lane] > rhs[lane];
            result.setBit(lane, !isLessOrGreater ? 1 : 0);
        }

        result.write();
    }

    // VOP3 form of v_cmp_ngt_f64; sets the ALU and F64 flags.
    Inst_VOP3__V_CMP_NGT_F64::Inst_VOP3__V_CMP_NGT_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_ngt_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMP_NGT_F64

    // No explicit cleanup is performed for v_cmp_ngt_f64.
    Inst_VOP3__V_CMP_NGT_F64::~Inst_VOP3__V_CMP_NGT_F64() = default;

    /**
     * v_cmp_ngt_f64 (VOP3 encoding).
     * D.u64[threadID] = !(S0 > S1) for every lane enabled in the exec mask,
     * after the ABS/NEG input modifiers have been applied to the sources.
     */
    void
    Inst_VOP3__V_CMP_NGT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Input modifiers for S0 (bit 0 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }

        // Input modifiers for S1 (bit 1 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Bit 2 of the modifier fields must be clear for this instruction.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            // Negated compare: also true when either input is NaN.
            const bool notGreater = !(lhs[lane] > rhs[lane]);
            result.setBit(lane, notGreater ? 1 : 0);
        }

        result.write();
    }

    // VOP3 form of v_cmp_nle_f64; sets the ALU and F64 flags.
    Inst_VOP3__V_CMP_NLE_F64::Inst_VOP3__V_CMP_NLE_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_nle_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMP_NLE_F64

    // No explicit cleanup is performed for v_cmp_nle_f64.
    Inst_VOP3__V_CMP_NLE_F64::~Inst_VOP3__V_CMP_NLE_F64() = default;

    /**
     * v_cmp_nle_f64 (VOP3 encoding).
     * D.u64[threadID] = !(S0 <= S1) for every lane enabled in the exec mask,
     * after the ABS/NEG input modifiers have been applied to the sources.
     */
    void
    Inst_VOP3__V_CMP_NLE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Input modifiers for S0 (bit 0 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }

        // Input modifiers for S1 (bit 1 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Bit 2 of the modifier fields must be clear for this instruction.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            // Negated compare: also true when either input is NaN.
            const bool notLessEq = !(lhs[lane] <= rhs[lane]);
            result.setBit(lane, notLessEq ? 1 : 0);
        }

        result.write();
    }

    // VOP3 form of v_cmp_neq_f64; sets the ALU and F64 flags.
    Inst_VOP3__V_CMP_NEQ_F64::Inst_VOP3__V_CMP_NEQ_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_neq_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMP_NEQ_F64

    // No explicit cleanup is performed for v_cmp_neq_f64.
    Inst_VOP3__V_CMP_NEQ_F64::~Inst_VOP3__V_CMP_NEQ_F64() = default;

    /**
     * v_cmp_neq_f64 (VOP3 encoding).
     * D.u64[threadID] = !(S0 == S1) for every lane enabled in the exec mask,
     * after the ABS/NEG input modifiers have been applied to the sources.
     */
    void
    Inst_VOP3__V_CMP_NEQ_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Input modifiers for S0 (bit 0 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }

        // Input modifiers for S1 (bit 1 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Bit 2 of the modifier fields must be clear for this instruction.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            // != is the exact negation of ==, so NaN inputs yield 1.
            const bool notEqual = lhs[lane] != rhs[lane];
            result.setBit(lane, notEqual ? 1 : 0);
        }

        result.write();
    }

    // VOP3 form of v_cmp_nlt_f64; sets the ALU and F64 flags.
    Inst_VOP3__V_CMP_NLT_F64::Inst_VOP3__V_CMP_NLT_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_nlt_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMP_NLT_F64

    // No explicit cleanup is performed for v_cmp_nlt_f64.
    Inst_VOP3__V_CMP_NLT_F64::~Inst_VOP3__V_CMP_NLT_F64() = default;

    /**
     * v_cmp_nlt_f64 (VOP3 encoding).
     * D.u64[threadID] = !(S0 < S1) for every lane enabled in the exec mask,
     * after the ABS/NEG input modifiers have been applied to the sources.
     */
    void
    Inst_VOP3__V_CMP_NLT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Input modifiers for S0 (bit 0 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }

        // Input modifiers for S1 (bit 1 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Bit 2 of the modifier fields must be clear for this instruction.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            // Negated compare: also true when either input is NaN.
            const bool notLess = !(lhs[lane] < rhs[lane]);
            result.setBit(lane, notLess ? 1 : 0);
        }

        result.write();
    }

    // VOP3 form of v_cmp_tru_f64; sets the ALU and F64 flags.
    Inst_VOP3__V_CMP_TRU_F64::Inst_VOP3__V_CMP_TRU_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_tru_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMP_TRU_F64

    // No explicit cleanup is performed for v_cmp_tru_f64.
    Inst_VOP3__V_CMP_TRU_F64::~Inst_VOP3__V_CMP_TRU_F64() = default;

    /**
     * v_cmp_tru_f64 (VOP3 encoding): always-true compare.
     * D.u64[threadID] = 1 for every lane enabled in the exec mask; no source
     * operands are read.
     */
    void
    Inst_VOP3__V_CMP_TRU_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            result.setBit(lane, 1);
        }

        result.write();
    }

    // VOP3 form of v_cmpx_f_f64; sets the ALU and F64 flags.
    Inst_VOP3__V_CMPX_F_F64::Inst_VOP3__V_CMPX_F_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_f_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMPX_F_F64

    // No explicit cleanup is performed for v_cmpx_f_f64.
    Inst_VOP3__V_CMPX_F_F64::~Inst_VOP3__V_CMPX_F_F64() = default;

    /**
     * v_cmpx_f_f64 (VOP3 encoding): always-false compare.
     * EXEC,D.u64[threadID] = 0 for every lane enabled in the exec mask; no
     * source operands are read.
     */
    void
    Inst_VOP3__V_CMPX_F_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            result.setBit(lane, 0);
        }

        // The computed mask replaces the exec mask and is also written out.
        wave->execMask() = result.rawData();
        result.write();
    }

    // VOP3 form of v_cmpx_lt_f64; sets the ALU and F64 flags.
    Inst_VOP3__V_CMPX_LT_F64::Inst_VOP3__V_CMPX_LT_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_lt_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMPX_LT_F64

    // No explicit cleanup is performed for v_cmpx_lt_f64.
    Inst_VOP3__V_CMPX_LT_F64::~Inst_VOP3__V_CMPX_LT_F64() = default;

    /**
     * v_cmpx_lt_f64 (VOP3 encoding).
     * EXEC,D.u64[threadID] = (S0 < S1) for every lane enabled in the exec
     * mask, after the ABS/NEG input modifiers have been applied.
     */
    void
    Inst_VOP3__V_CMPX_LT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Input modifiers for S0 (bit 0 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }

        // Input modifiers for S1 (bit 1 of ABS/NEG): abs first, then neg.
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Bit 2 of the modifier fields must be clear for this instruction.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const bool isLess = lhs[lane] < rhs[lane];
            result.setBit(lane, isLess ? 1 : 0);
        }

        // The computed mask replaces the exec mask and is also written out.
        wave->execMask() = result.rawData();
        result.write();
    }

    /**
     * Builds the VOP3 form of v_cmpx_eq_f64: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU and F64.
     */
    Inst_VOP3__V_CMPX_EQ_F64::Inst_VOP3__V_CMPX_EQ_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_eq_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMPX_EQ_F64::~Inst_VOP3__V_CMPX_EQ_F64() = default;

    /**
     * v_cmpx_eq_f64, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes
     * (S0 == S1). The raw destination value then replaces the wavefront's
     * exec mask and the destination is written.
     */
    void
    Inst_VOP3__V_CMPX_EQ_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Source modifiers: bit 0x1 of ABS/NEG targets S0, bit 0x2 targets
        // S1. On each source ABS is applied before NEG.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Only two sources are consumed here, so modifier bit 0x4 must
        // not be set.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isTrue = lhs[laneId] == rhs[laneId];
            laneMask.setBit(laneId, isTrue ? 1 : 0);
        }

        // the CMPX form also updates EXEC with the computed mask
        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmpx_le_f64: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU and F64.
     */
    Inst_VOP3__V_CMPX_LE_F64::Inst_VOP3__V_CMPX_LE_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_le_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMPX_LE_F64::~Inst_VOP3__V_CMPX_LE_F64() = default;

    /**
     * v_cmpx_le_f64, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes
     * (S0 <= S1). The raw destination value then replaces the wavefront's
     * exec mask and the destination is written.
     */
    void
    Inst_VOP3__V_CMPX_LE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Source modifiers: bit 0x1 of ABS/NEG targets S0, bit 0x2 targets
        // S1. On each source ABS is applied before NEG.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Only two sources are consumed here, so modifier bit 0x4 must
        // not be set.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isTrue = lhs[laneId] <= rhs[laneId];
            laneMask.setBit(laneId, isTrue ? 1 : 0);
        }

        // the CMPX form also updates EXEC with the computed mask
        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmpx_gt_f64: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU and F64.
     */
    Inst_VOP3__V_CMPX_GT_F64::Inst_VOP3__V_CMPX_GT_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_gt_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMPX_GT_F64::~Inst_VOP3__V_CMPX_GT_F64() = default;

    /**
     * v_cmpx_gt_f64, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes
     * (S0 > S1). The raw destination value then replaces the wavefront's
     * exec mask and the destination is written.
     */
    void
    Inst_VOP3__V_CMPX_GT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Source modifiers: bit 0x1 of ABS/NEG targets S0, bit 0x2 targets
        // S1. On each source ABS is applied before NEG.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Only two sources are consumed here, so modifier bit 0x4 must
        // not be set.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isTrue = lhs[laneId] > rhs[laneId];
            laneMask.setBit(laneId, isTrue ? 1 : 0);
        }

        // the CMPX form also updates EXEC with the computed mask
        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmpx_lg_f64: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU and F64.
     */
    Inst_VOP3__V_CMPX_LG_F64::Inst_VOP3__V_CMPX_LG_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_lg_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMPX_LG_F64::~Inst_VOP3__V_CMPX_LG_F64() = default;

    /**
     * v_cmpx_lg_f64, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes
     * (S0 < S1 || S0 > S1). The raw destination value then replaces the
     * wavefront's exec mask and the destination is written.
     */
    void
    Inst_VOP3__V_CMPX_LG_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Source modifiers: bit 0x1 of ABS/NEG targets S0, bit 0x2 targets
        // S1. On each source ABS is applied before NEG.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Only two sources are consumed here, so modifier bit 0x4 must
        // not be set.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const auto a = lhs[laneId];
            const auto b = rhs[laneId];
            const bool lessOrGreater = a < b || a > b;
            laneMask.setBit(laneId, lessOrGreater ? 1 : 0);
        }

        // the CMPX form also updates EXEC with the computed mask
        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmpx_ge_f64: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU and F64.
     */
    Inst_VOP3__V_CMPX_GE_F64::Inst_VOP3__V_CMPX_GE_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ge_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMPX_GE_F64::~Inst_VOP3__V_CMPX_GE_F64() = default;

    /**
     * v_cmpx_ge_f64, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes
     * (S0 >= S1). The raw destination value then replaces the wavefront's
     * exec mask and the destination is written.
     */
    void
    Inst_VOP3__V_CMPX_GE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Source modifiers: bit 0x1 of ABS/NEG targets S0, bit 0x2 targets
        // S1. On each source ABS is applied before NEG.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Only two sources are consumed here, so modifier bit 0x4 must
        // not be set.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isTrue = lhs[laneId] >= rhs[laneId];
            laneMask.setBit(laneId, isTrue ? 1 : 0);
        }

        // the CMPX form also updates EXEC with the computed mask
        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmpx_o_f64: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU and F64.
     */
    Inst_VOP3__V_CMPX_O_F64::Inst_VOP3__V_CMPX_O_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_o_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMPX_O_F64::~Inst_VOP3__V_CMPX_O_F64() = default;

    /**
     * v_cmpx_o_f64, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes 1 when
     * neither S0 nor S1 is NaN and 0 otherwise. The raw destination value
     * then replaces the wavefront's exec mask and the destination is
     * written.
     */
    void
    Inst_VOP3__V_CMPX_O_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Source modifiers: bit 0x1 of ABS/NEG targets S0, bit 0x2 targets
        // S1. On each source ABS is applied before NEG.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Only two sources are consumed here, so modifier bit 0x4 must
        // not be set.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const auto a = lhs[laneId];
            const auto b = rhs[laneId];
            const bool ordered = !(std::isnan(a) || std::isnan(b));
            laneMask.setBit(laneId, ordered ? 1 : 0);
        }

        // the CMPX form also updates EXEC with the computed mask
        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmpx_u_f64: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU and F64.
     */
    Inst_VOP3__V_CMPX_U_F64::Inst_VOP3__V_CMPX_U_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_u_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMPX_U_F64::~Inst_VOP3__V_CMPX_U_F64() = default;

    /**
     * v_cmpx_u_f64, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes 1 when
     * S0 or S1 is NaN and 0 otherwise. The raw destination value then
     * replaces the wavefront's exec mask and the destination is written.
     */
    void
    Inst_VOP3__V_CMPX_U_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Source modifiers: bit 0x1 of ABS/NEG targets S0, bit 0x2 targets
        // S1. On each source ABS is applied before NEG.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Only two sources are consumed here, so modifier bit 0x4 must
        // not be set.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const auto a = lhs[laneId];
            const auto b = rhs[laneId];
            const bool unordered = std::isnan(a) || std::isnan(b);
            laneMask.setBit(laneId, unordered ? 1 : 0);
        }

        // the CMPX form also updates EXEC with the computed mask
        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmpx_nge_f64: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU and F64.
     */
    Inst_VOP3__V_CMPX_NGE_F64::Inst_VOP3__V_CMPX_NGE_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_nge_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMPX_NGE_F64::~Inst_VOP3__V_CMPX_NGE_F64() = default;

    /**
     * v_cmpx_nge_f64, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes
     * !(S0 >= S1). The raw destination value then replaces the
     * wavefront's exec mask and the destination is written.
     */
    void
    Inst_VOP3__V_CMPX_NGE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Source modifiers: bit 0x1 of ABS/NEG targets S0, bit 0x2 targets
        // S1. On each source ABS is applied before NEG.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Only two sources are consumed here, so modifier bit 0x4 must
        // not be set.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            // kept as a negated >= so the written form is preserved
            const bool isTrue = !(lhs[laneId] >= rhs[laneId]);
            laneMask.setBit(laneId, isTrue ? 1 : 0);
        }

        // the CMPX form also updates EXEC with the computed mask
        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmpx_nlg_f64: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU and F64.
     */
    Inst_VOP3__V_CMPX_NLG_F64::Inst_VOP3__V_CMPX_NLG_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_nlg_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMPX_NLG_F64::~Inst_VOP3__V_CMPX_NLG_F64() = default;

    /**
     * v_cmpx_nlg_f64, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes
     * !(S0 < S1 || S0 > S1). The raw destination value then replaces the
     * wavefront's exec mask and the destination is written.
     */
    void
    Inst_VOP3__V_CMPX_NLG_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Source modifiers: bit 0x1 of ABS/NEG targets S0, bit 0x2 targets
        // S1. On each source ABS is applied before NEG.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Only two sources are consumed here, so modifier bit 0x4 must
        // not be set.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const auto a = lhs[laneId];
            const auto b = rhs[laneId];
            const bool notLessOrGreater = !(a < b || a > b);
            laneMask.setBit(laneId, notLessOrGreater ? 1 : 0);
        }

        // the CMPX form also updates EXEC with the computed mask
        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmpx_ngt_f64: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU and F64.
     */
    Inst_VOP3__V_CMPX_NGT_F64::Inst_VOP3__V_CMPX_NGT_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ngt_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMPX_NGT_F64::~Inst_VOP3__V_CMPX_NGT_F64() = default;

    /**
     * v_cmpx_ngt_f64, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes
     * !(S0 > S1). The raw destination value then replaces the wavefront's
     * exec mask and the destination is written.
     */
    void
    Inst_VOP3__V_CMPX_NGT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Source modifiers: bit 0x1 of ABS/NEG targets S0, bit 0x2 targets
        // S1. On each source ABS is applied before NEG.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Only two sources are consumed here, so modifier bit 0x4 must
        // not be set.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            // kept as a negated > so the written form is preserved
            const bool isTrue = !(lhs[laneId] > rhs[laneId]);
            laneMask.setBit(laneId, isTrue ? 1 : 0);
        }

        // the CMPX form also updates EXEC with the computed mask
        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmpx_nle_f64: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU and F64.
     */
    Inst_VOP3__V_CMPX_NLE_F64::Inst_VOP3__V_CMPX_NLE_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_nle_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMPX_NLE_F64::~Inst_VOP3__V_CMPX_NLE_F64() = default;

    /**
     * v_cmpx_nle_f64, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes
     * !(S0 <= S1). The raw destination value then replaces the
     * wavefront's exec mask and the destination is written.
     */
    void
    Inst_VOP3__V_CMPX_NLE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Source modifiers: bit 0x1 of ABS/NEG targets S0, bit 0x2 targets
        // S1. On each source ABS is applied before NEG.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Only two sources are consumed here, so modifier bit 0x4 must
        // not be set.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            // kept as a negated <= so the written form is preserved
            const bool isTrue = !(lhs[laneId] <= rhs[laneId]);
            laneMask.setBit(laneId, isTrue ? 1 : 0);
        }

        // the CMPX form also updates EXEC with the computed mask
        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmpx_neq_f64: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU and F64.
     */
    Inst_VOP3__V_CMPX_NEQ_F64::Inst_VOP3__V_CMPX_NEQ_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_neq_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMPX_NEQ_F64::~Inst_VOP3__V_CMPX_NEQ_F64() = default;

    /**
     * v_cmpx_neq_f64, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes
     * (S0 != S1). The raw destination value then replaces the wavefront's
     * exec mask and the destination is written.
     */
    void
    Inst_VOP3__V_CMPX_NEQ_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Source modifiers: bit 0x1 of ABS/NEG targets S0, bit 0x2 targets
        // S1. On each source ABS is applied before NEG.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Only two sources are consumed here, so modifier bit 0x4 must
        // not be set.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isTrue = lhs[laneId] != rhs[laneId];
            laneMask.setBit(laneId, isTrue ? 1 : 0);
        }

        // the CMPX form also updates EXEC with the computed mask
        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmpx_nlt_f64: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU and F64.
     */
    Inst_VOP3__V_CMPX_NLT_F64::Inst_VOP3__V_CMPX_NLT_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_nlt_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMPX_NLT_F64::~Inst_VOP3__V_CMPX_NLT_F64() = default;

    /**
     * v_cmpx_nlt_f64, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes
     * !(S0 < S1). The raw destination value then replaces the wavefront's
     * exec mask and the destination is written.
     */
    void
    Inst_VOP3__V_CMPX_NLT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Source modifiers: bit 0x1 of ABS/NEG targets S0, bit 0x2 targets
        // S1. On each source ABS is applied before NEG.
        if (instData.ABS & 0x1) {
            lhs.absModifier();
        }
        if (extData.NEG & 0x1) {
            lhs.negModifier();
        }
        if (instData.ABS & 0x2) {
            rhs.absModifier();
        }
        if (extData.NEG & 0x2) {
            rhs.negModifier();
        }

        // Only two sources are consumed here, so modifier bit 0x4 must
        // not be set.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            // kept as a negated < so the written form is preserved
            const bool isTrue = !(lhs[laneId] < rhs[laneId]);
            laneMask.setBit(laneId, isTrue ? 1 : 0);
        }

        // the CMPX form also updates EXEC with the computed mask
        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmpx_tru_f64: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU and F64.
     */
    Inst_VOP3__V_CMPX_TRU_F64::Inst_VOP3__V_CMPX_TRU_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_tru_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMPX_TRU_F64::~Inst_VOP3__V_CMPX_TRU_F64() = default;

    /**
     * v_cmpx_tru_f64, VOP3 encoding. Sets bit [lane] of the scalar
     * destination (VDST) to 1 for every lane enabled in the exec mask.
     * The raw destination value then replaces the wavefront's exec mask
     * and the destination is written. No source operands are read.
     */
    void
    Inst_VOP3__V_CMPX_TRU_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // lanes disabled in the exec mask are skipped
            if (!wave->execMask(laneId)) {
                continue;
            }

            laneMask.setBit(laneId, 1);
        }

        // the CMPX form also updates EXEC with the computed mask
        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmp_f_i16: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU.
     */
    Inst_VOP3__V_CMP_F_I16::Inst_VOP3__V_CMP_F_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_f_i16", true)
    {
        setFlag(ALU);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMP_F_I16::~Inst_VOP3__V_CMP_F_I16() = default;

    /**
     * v_cmp_f_i16, VOP3 encoding. Sets bit [lane] of the scalar
     * destination (VDST) to 0 for every lane enabled in the wavefront's
     * exec mask, then writes the destination. No source operands are read.
     */
    void
    Inst_VOP3__V_CMP_F_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // lanes disabled in the exec mask are skipped
            if (!wave->execMask(laneId)) {
                continue;
            }

            laneMask.setBit(laneId, 0);
        }

        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmp_lt_i16: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU.
     */
    Inst_VOP3__V_CMP_LT_I16::Inst_VOP3__V_CMP_LT_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_lt_i16", true)
    {
        setFlag(ALU);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMP_LT_I16::~Inst_VOP3__V_CMP_LT_I16() = default;

    /**
     * v_cmp_lt_i16, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes
     * (S0 < S1), with both sources read as I16 operands. The destination
     * is then written.
     */
    void
    Inst_VOP3__V_CMP_LT_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so every
        // modifier bit must be clear for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isTrue = lhs[laneId] < rhs[laneId];
            laneMask.setBit(laneId, isTrue ? 1 : 0);
        }

        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmp_eq_i16: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU.
     */
    Inst_VOP3__V_CMP_EQ_I16::Inst_VOP3__V_CMP_EQ_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_eq_i16", true)
    {
        setFlag(ALU);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMP_EQ_I16::~Inst_VOP3__V_CMP_EQ_I16() = default;

    /**
     * v_cmp_eq_i16, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes
     * (S0 == S1), with both sources read as I16 operands. The destination
     * is then written.
     */
    void
    Inst_VOP3__V_CMP_EQ_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so every
        // modifier bit must be clear for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isTrue = lhs[laneId] == rhs[laneId];
            laneMask.setBit(laneId, isTrue ? 1 : 0);
        }

        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmp_le_i16: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU.
     */
    Inst_VOP3__V_CMP_LE_I16::Inst_VOP3__V_CMP_LE_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_le_i16", true)
    {
        setFlag(ALU);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMP_LE_I16::~Inst_VOP3__V_CMP_LE_I16() = default;

    /**
     * v_cmp_le_i16, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes
     * (S0 <= S1), with both sources read as I16 operands. The destination
     * is then written.
     */
    void
    Inst_VOP3__V_CMP_LE_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so every
        // modifier bit must be clear for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isTrue = lhs[laneId] <= rhs[laneId];
            laneMask.setBit(laneId, isTrue ? 1 : 0);
        }

        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmp_gt_i16: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU.
     */
    Inst_VOP3__V_CMP_GT_I16::Inst_VOP3__V_CMP_GT_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_gt_i16", true)
    {
        setFlag(ALU);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMP_GT_I16::~Inst_VOP3__V_CMP_GT_I16() = default;

    /**
     * v_cmp_gt_i16, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes
     * (S0 > S1), with both sources read as I16 operands. The destination
     * is then written.
     */
    void
    Inst_VOP3__V_CMP_GT_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so every
        // modifier bit must be clear for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isTrue = lhs[laneId] > rhs[laneId];
            laneMask.setBit(laneId, isTrue ? 1 : 0);
        }

        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmp_ne_i16: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU.
     */
    Inst_VOP3__V_CMP_NE_I16::Inst_VOP3__V_CMP_NE_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_ne_i16", true)
    {
        setFlag(ALU);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMP_NE_I16::~Inst_VOP3__V_CMP_NE_I16() = default;

    /**
     * v_cmp_ne_i16, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes
     * (S0 != S1), with both sources read as I16 operands. The destination
     * is then written.
     */
    void
    Inst_VOP3__V_CMP_NE_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so every
        // modifier bit must be clear for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isTrue = lhs[laneId] != rhs[laneId];
            laneMask.setBit(laneId, isTrue ? 1 : 0);
        }

        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmp_ge_i16: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU.
     */
    Inst_VOP3__V_CMP_GE_I16::Inst_VOP3__V_CMP_GE_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_ge_i16", true)
    {
        setFlag(ALU);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMP_GE_I16::~Inst_VOP3__V_CMP_GE_I16() = default;

    /**
     * v_cmp_ge_i16, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes
     * (S0 >= S1), with both sources read as I16 operands. The destination
     * is then written.
     */
    void
    Inst_VOP3__V_CMP_GE_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so every
        // modifier bit must be clear for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isTrue = lhs[laneId] >= rhs[laneId];
            laneMask.setBit(laneId, isTrue ? 1 : 0);
        }

        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmp_t_i16: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU.
     */
    Inst_VOP3__V_CMP_T_I16::Inst_VOP3__V_CMP_T_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_t_i16", true)
    {
        setFlag(ALU);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMP_T_I16::~Inst_VOP3__V_CMP_T_I16() = default;

    /**
     * v_cmp_t_i16, VOP3 encoding. Sets bit [lane] of the scalar
     * destination (VDST) to 1 for every lane enabled in the wavefront's
     * exec mask, then writes the destination. No source operands are read.
     */
    void
    Inst_VOP3__V_CMP_T_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // lanes disabled in the exec mask are skipped
            if (!wave->execMask(laneId)) {
                continue;
            }

            laneMask.setBit(laneId, 1);
        }

        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmp_f_u16: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU.
     */
    Inst_VOP3__V_CMP_F_U16::Inst_VOP3__V_CMP_F_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_f_u16", true)
    {
        setFlag(ALU);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMP_F_U16::~Inst_VOP3__V_CMP_F_U16() = default;

    /**
     * v_cmp_f_u16, VOP3 encoding. Sets bit [lane] of the scalar
     * destination (VDST) to 0 for every lane enabled in the wavefront's
     * exec mask, then writes the destination. No source operands are read.
     */
    void
    Inst_VOP3__V_CMP_F_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // lanes disabled in the exec mask are skipped
            if (!wave->execMask(laneId)) {
                continue;
            }

            laneMask.setBit(laneId, 0);
        }

        laneMask.write();
    }

    /**
     * Builds the VOP3 form of v_cmp_lt_u16: hands the encoding and the
     * mnemonic to Inst_VOP3 and marks the instruction ALU.
     */
    Inst_VOP3__V_CMP_LT_U16::Inst_VOP3__V_CMP_LT_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_lt_u16", true)
    {
        setFlag(ALU);
    }

    // No instruction-specific cleanup is performed.
    Inst_VOP3__V_CMP_LT_U16::~Inst_VOP3__V_CMP_LT_U16() = default;

    /**
     * v_cmp_lt_u16, VOP3 encoding. For every lane enabled in the exec
     * mask, bit [lane] of the scalar destination (VDST) becomes
     * (S0 < S1), with both sources read as U16 operands. The destination
     * is then written.
     */
    void
    Inst_VOP3__V_CMP_LT_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so every
        // modifier bit must be clear for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isTrue = lhs[laneId] < rhs[laneId];
            laneMask.setBit(laneId, isTrue ? 1 : 0);
        }

        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmp_eq_u16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMP_EQ_U16::Inst_VOP3__V_CMP_EQ_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_eq_u16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMP_EQ_U16::~Inst_VOP3__V_CMP_EQ_U16() = default;

    /**
     * v_cmp_eq_u16 (VOP3): D.u64[threadID] = (S0 == S1); D = VCC in VOPC
     * encoding.
     *
     * For each lane enabled in the exec mask, the lane's bit in the scalar
     * destination is set to the result of comparing the ConstVecOperandU16
     * sources; the destination is then written back.
     */
    void
    Inst_VOP3__V_CMP_EQ_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none of
        // their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isEqual = lhs[laneId] == rhs[laneId];
            laneMask.setBit(laneId, isEqual ? 1 : 0);
        }

        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmp_le_u16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMP_LE_U16::Inst_VOP3__V_CMP_LE_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_le_u16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMP_LE_U16::~Inst_VOP3__V_CMP_LE_U16() = default;

    /**
     * v_cmp_le_u16 (VOP3): D.u64[threadID] = (S0 <= S1); D = VCC in VOPC
     * encoding.
     *
     * For each lane enabled in the exec mask, the lane's bit in the scalar
     * destination is set to the result of comparing the ConstVecOperandU16
     * sources; the destination is then written back.
     */
    void
    Inst_VOP3__V_CMP_LE_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none of
        // their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isLessEq = lhs[laneId] <= rhs[laneId];
            laneMask.setBit(laneId, isLessEq ? 1 : 0);
        }

        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmp_gt_u16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMP_GT_U16::Inst_VOP3__V_CMP_GT_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_gt_u16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMP_GT_U16::~Inst_VOP3__V_CMP_GT_U16() = default;

    /**
     * v_cmp_gt_u16 (VOP3): D.u64[threadID] = (S0 > S1); D = VCC in VOPC
     * encoding.
     *
     * For each lane enabled in the exec mask, the lane's bit in the scalar
     * destination is set to the result of comparing the ConstVecOperandU16
     * sources; the destination is then written back.
     */
    void
    Inst_VOP3__V_CMP_GT_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none of
        // their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isGreater = lhs[laneId] > rhs[laneId];
            laneMask.setBit(laneId, isGreater ? 1 : 0);
        }

        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmp_ne_u16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMP_NE_U16::Inst_VOP3__V_CMP_NE_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_ne_u16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMP_NE_U16::~Inst_VOP3__V_CMP_NE_U16() = default;

    /**
     * v_cmp_ne_u16 (VOP3): D.u64[threadID] = (S0 != S1); D = VCC in VOPC
     * encoding.
     *
     * For each lane enabled in the exec mask, the lane's bit in the scalar
     * destination is set to the result of comparing the ConstVecOperandU16
     * sources; the destination is then written back.
     */
    void
    Inst_VOP3__V_CMP_NE_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none of
        // their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool differs = lhs[laneId] != rhs[laneId];
            laneMask.setBit(laneId, differs ? 1 : 0);
        }

        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmp_ge_u16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMP_GE_U16::Inst_VOP3__V_CMP_GE_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_ge_u16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMP_GE_U16::~Inst_VOP3__V_CMP_GE_U16() = default;

    /**
     * v_cmp_ge_u16 (VOP3): D.u64[threadID] = (S0 >= S1); D = VCC in VOPC
     * encoding.
     *
     * For each lane enabled in the exec mask, the lane's bit in the scalar
     * destination is set to the result of comparing the ConstVecOperandU16
     * sources; the destination is then written back.
     */
    void
    Inst_VOP3__V_CMP_GE_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none of
        // their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isGreaterEq = lhs[laneId] >= rhs[laneId];
            laneMask.setBit(laneId, isGreaterEq ? 1 : 0);
        }

        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmp_t_u16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMP_T_U16::Inst_VOP3__V_CMP_T_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_t_u16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMP_T_U16::~Inst_VOP3__V_CMP_T_U16() = default;

    /**
     * v_cmp_t_u16 (VOP3): D.u64[threadID] = 1; D = VCC in VOPC encoding.
     *
     * Sets the destination bit of every lane enabled in the exec mask,
     * then writes the scalar destination back.
     */
    void
    Inst_VOP3__V_CMP_T_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            laneMask.setBit(laneId, 1);
        }

        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmpx_f_i16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMPX_F_I16::Inst_VOP3__V_CMPX_F_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_f_i16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMPX_F_I16::~Inst_VOP3__V_CMPX_F_I16() = default;

    /**
     * v_cmpx_f_i16 (VOP3): EXEC,D.u64[threadID] = 0; D = VCC in VOPC
     * encoding.
     *
     * Clears the destination bit of every exec-enabled lane; the
     * destination's raw value then replaces the wavefront's exec mask
     * before the destination is written back.
     */
    void
    Inst_VOP3__V_CMPX_F_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            laneMask.setBit(laneId, 0);
        }

        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmpx_lt_i16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMPX_LT_I16::Inst_VOP3__V_CMPX_LT_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_lt_i16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMPX_LT_I16::~Inst_VOP3__V_CMPX_LT_I16() = default;

    /**
     * v_cmpx_lt_i16 (VOP3): EXEC,D.u64[threadID] = (S0 < S1); D = VCC in
     * VOPC encoding.
     *
     * Each exec-enabled lane's bit in the scalar destination receives the
     * result of comparing the ConstVecOperandI16 sources; the destination's
     * raw value then replaces the wavefront's exec mask before the
     * destination is written back.
     */
    void
    Inst_VOP3__V_CMPX_LT_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none of
        // their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isLess = lhs[laneId] < rhs[laneId];
            laneMask.setBit(laneId, isLess ? 1 : 0);
        }

        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmpx_eq_i16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMPX_EQ_I16::Inst_VOP3__V_CMPX_EQ_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_eq_i16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMPX_EQ_I16::~Inst_VOP3__V_CMPX_EQ_I16() = default;

    /**
     * v_cmpx_eq_i16 (VOP3): EXEC,D.u64[threadID] = (S0 == S1); D = VCC in
     * VOPC encoding.
     *
     * Each exec-enabled lane's bit in the scalar destination receives the
     * result of comparing the ConstVecOperandI16 sources; the destination's
     * raw value then replaces the wavefront's exec mask before the
     * destination is written back.
     */
    void
    Inst_VOP3__V_CMPX_EQ_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none of
        // their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isEqual = lhs[laneId] == rhs[laneId];
            laneMask.setBit(laneId, isEqual ? 1 : 0);
        }

        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmpx_le_i16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMPX_LE_I16::Inst_VOP3__V_CMPX_LE_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_le_i16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMPX_LE_I16::~Inst_VOP3__V_CMPX_LE_I16() = default;

    /**
     * v_cmpx_le_i16 (VOP3): EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in
     * VOPC encoding.
     *
     * Each exec-enabled lane's bit in the scalar destination receives the
     * result of comparing the ConstVecOperandI16 sources; the destination's
     * raw value then replaces the wavefront's exec mask before the
     * destination is written back.
     */
    void
    Inst_VOP3__V_CMPX_LE_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none of
        // their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isLessEq = lhs[laneId] <= rhs[laneId];
            laneMask.setBit(laneId, isLessEq ? 1 : 0);
        }

        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmpx_gt_i16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMPX_GT_I16::Inst_VOP3__V_CMPX_GT_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_gt_i16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMPX_GT_I16::~Inst_VOP3__V_CMPX_GT_I16() = default;

    /**
     * v_cmpx_gt_i16 (VOP3): EXEC,D.u64[threadID] = (S0 > S1); D = VCC in
     * VOPC encoding.
     *
     * Each exec-enabled lane's bit in the scalar destination receives the
     * result of comparing the ConstVecOperandI16 sources; the destination's
     * raw value then replaces the wavefront's exec mask before the
     * destination is written back.
     */
    void
    Inst_VOP3__V_CMPX_GT_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none of
        // their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isGreater = lhs[laneId] > rhs[laneId];
            laneMask.setBit(laneId, isGreater ? 1 : 0);
        }

        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmpx_ne_i16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMPX_NE_I16::Inst_VOP3__V_CMPX_NE_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ne_i16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMPX_NE_I16::~Inst_VOP3__V_CMPX_NE_I16() = default;

    /**
     * v_cmpx_ne_i16 (VOP3): EXEC,D.u64[threadID] = (S0 != S1); D = VCC in
     * VOPC encoding.
     *
     * Each exec-enabled lane's bit in the scalar destination receives the
     * result of comparing the ConstVecOperandI16 sources; the destination's
     * raw value then replaces the wavefront's exec mask before the
     * destination is written back.
     */
    void
    Inst_VOP3__V_CMPX_NE_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none of
        // their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool differs = lhs[laneId] != rhs[laneId];
            laneMask.setBit(laneId, differs ? 1 : 0);
        }

        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmpx_ge_i16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMPX_GE_I16::Inst_VOP3__V_CMPX_GE_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ge_i16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMPX_GE_I16::~Inst_VOP3__V_CMPX_GE_I16() = default;

    /**
     * v_cmpx_ge_i16 (VOP3): EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in
     * VOPC encoding.
     *
     * Each exec-enabled lane's bit in the scalar destination receives the
     * result of comparing the ConstVecOperandI16 sources; the destination's
     * raw value then replaces the wavefront's exec mask before the
     * destination is written back.
     */
    void
    Inst_VOP3__V_CMPX_GE_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none of
        // their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isGreaterEq = lhs[laneId] >= rhs[laneId];
            laneMask.setBit(laneId, isGreaterEq ? 1 : 0);
        }

        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmpx_t_i16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMPX_T_I16::Inst_VOP3__V_CMPX_T_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_t_i16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMPX_T_I16::~Inst_VOP3__V_CMPX_T_I16() = default;

    /**
     * v_cmpx_t_i16 (VOP3): EXEC,D.u64[threadID] = 1; D = VCC in VOPC
     * encoding.
     *
     * Sets the destination bit of every exec-enabled lane; the
     * destination's raw value then replaces the wavefront's exec mask
     * before the destination is written back.
     */
    void
    Inst_VOP3__V_CMPX_T_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            laneMask.setBit(laneId, 1);
        }

        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmpx_f_u16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMPX_F_U16::Inst_VOP3__V_CMPX_F_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_f_u16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMPX_F_U16::~Inst_VOP3__V_CMPX_F_U16() = default;

    /**
     * v_cmpx_f_u16 (VOP3): EXEC,D.u64[threadID] = 0; D = VCC in VOPC
     * encoding.
     *
     * Clears the destination bit of every exec-enabled lane; the
     * destination's raw value then replaces the wavefront's exec mask
     * before the destination is written back.
     */
    void
    Inst_VOP3__V_CMPX_F_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            laneMask.setBit(laneId, 0);
        }

        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmpx_lt_u16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMPX_LT_U16::Inst_VOP3__V_CMPX_LT_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_lt_u16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMPX_LT_U16::~Inst_VOP3__V_CMPX_LT_U16() = default;

    /**
     * v_cmpx_lt_u16 (VOP3): EXEC,D.u64[threadID] = (S0 < S1); D = VCC in
     * VOPC encoding.
     *
     * Each exec-enabled lane's bit in the scalar destination receives the
     * result of comparing the ConstVecOperandU16 sources; the destination's
     * raw value then replaces the wavefront's exec mask before the
     * destination is written back.
     */
    void
    Inst_VOP3__V_CMPX_LT_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none of
        // their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isLess = lhs[laneId] < rhs[laneId];
            laneMask.setBit(laneId, isLess ? 1 : 0);
        }

        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmpx_eq_u16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMPX_EQ_U16::Inst_VOP3__V_CMPX_EQ_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_eq_u16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMPX_EQ_U16::~Inst_VOP3__V_CMPX_EQ_U16() = default;

    /**
     * v_cmpx_eq_u16 (VOP3): EXEC,D.u64[threadID] = (S0 == S1); D = VCC in
     * VOPC encoding.
     *
     * Each exec-enabled lane's bit in the scalar destination receives the
     * result of comparing the two sources; the destination's raw value
     * then replaces the wavefront's exec mask before the destination is
     * written back.
     *
     * The sources are read as ConstVecOperandU16 so that this unsigned
     * instruction uses unsigned operands, matching v_cmpx_lt_u16 (they
     * were previously declared with the signed I16 operand type).
     */
    void
    Inst_VOP3__V_CMPX_EQ_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
            }
        }

        // EXEC takes the freshly computed mask before it is written to D.
        wf->execMask() = sdst.rawData();
        sdst.write();
    }

    /**
     * Build the VOP3 form of v_cmpx_le_u16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMPX_LE_U16::Inst_VOP3__V_CMPX_LE_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_le_u16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMPX_LE_U16::~Inst_VOP3__V_CMPX_LE_U16() = default;

    /**
     * v_cmpx_le_u16 (VOP3): EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in
     * VOPC encoding.
     *
     * Each exec-enabled lane's bit in the scalar destination receives the
     * result of comparing the two sources; the destination's raw value
     * then replaces the wavefront's exec mask before the destination is
     * written back.
     *
     * The sources are read as ConstVecOperandU16. They were previously
     * declared with the signed I16 operand type, which made the ordering
     * comparison signed and therefore wrong whenever an operand has its
     * top bit set.
     */
    void
    Inst_VOP3__V_CMPX_LE_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
            }
        }

        // EXEC takes the freshly computed mask before it is written to D.
        wf->execMask() = sdst.rawData();
        sdst.write();
    }

    /**
     * Build the VOP3 form of v_cmpx_gt_u16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMPX_GT_U16::Inst_VOP3__V_CMPX_GT_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_gt_u16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMPX_GT_U16::~Inst_VOP3__V_CMPX_GT_U16() = default;

    /**
     * v_cmpx_gt_u16 (VOP3): EXEC,D.u64[threadID] = (S0 > S1); D = VCC in
     * VOPC encoding.
     *
     * Each exec-enabled lane's bit in the scalar destination receives the
     * result of comparing the two sources; the destination's raw value
     * then replaces the wavefront's exec mask before the destination is
     * written back.
     *
     * The sources are read as ConstVecOperandU16. They were previously
     * declared with the signed I16 operand type, which made the ordering
     * comparison signed and therefore wrong whenever an operand has its
     * top bit set.
     */
    void
    Inst_VOP3__V_CMPX_GT_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
            }
        }

        // EXEC takes the freshly computed mask before it is written to D.
        wf->execMask() = sdst.rawData();
        sdst.write();
    }

    /**
     * Build the VOP3 form of v_cmpx_ne_u16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMPX_NE_U16::Inst_VOP3__V_CMPX_NE_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ne_u16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMPX_NE_U16::~Inst_VOP3__V_CMPX_NE_U16() = default;

    /**
     * v_cmpx_ne_u16 (VOP3): EXEC,D.u64[threadID] = (S0 != S1); D = VCC in
     * VOPC encoding.
     *
     * Each exec-enabled lane's bit in the scalar destination receives the
     * result of comparing the two sources; the destination's raw value
     * then replaces the wavefront's exec mask before the destination is
     * written back.
     *
     * The sources are read as ConstVecOperandU16 so that this unsigned
     * instruction uses unsigned operands, matching v_cmpx_lt_u16 (they
     * were previously declared with the signed I16 operand type).
     */
    void
    Inst_VOP3__V_CMPX_NE_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
            }
        }

        // EXEC takes the freshly computed mask before it is written to D.
        wf->execMask() = sdst.rawData();
        sdst.write();
    }

    /**
     * Build the VOP3 form of v_cmpx_ge_u16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMPX_GE_U16::Inst_VOP3__V_CMPX_GE_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ge_u16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMPX_GE_U16::~Inst_VOP3__V_CMPX_GE_U16() = default;

    /**
     * v_cmpx_ge_u16 (VOP3): EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in
     * VOPC encoding.
     *
     * Each exec-enabled lane's bit in the scalar destination receives the
     * result of comparing the two sources; the destination's raw value
     * then replaces the wavefront's exec mask before the destination is
     * written back.
     *
     * The sources are read as ConstVecOperandU16. They were previously
     * declared with the signed I16 operand type, which made the ordering
     * comparison signed and therefore wrong whenever an operand has its
     * top bit set.
     */
    void
    Inst_VOP3__V_CMPX_GE_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
            }
        }

        // EXEC takes the freshly computed mask before it is written to D.
        wf->execMask() = sdst.rawData();
        sdst.write();
    }

    /**
     * Build the VOP3 form of v_cmpx_t_u16: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMPX_T_U16::Inst_VOP3__V_CMPX_T_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_t_u16", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMPX_T_U16::~Inst_VOP3__V_CMPX_T_U16() = default;

    /**
     * v_cmpx_t_u16 (VOP3): EXEC,D.u64[threadID] = 1; D = VCC in VOPC
     * encoding.
     *
     * Sets the destination bit of every exec-enabled lane; the
     * destination's raw value then replaces the wavefront's exec mask
     * before the destination is written back.
     */
    void
    Inst_VOP3__V_CMPX_T_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            laneMask.setBit(laneId, 1);
        }

        wave->execMask() = laneMask.rawData();
        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmp_f_i32: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMP_F_I32::Inst_VOP3__V_CMP_F_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_f_i32", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMP_F_I32::~Inst_VOP3__V_CMP_F_I32() = default;

    /**
     * v_cmp_f_i32 (VOP3): D.u64[threadID] = 0; D = VCC in VOPC encoding.
     *
     * Clears the destination bit of every lane enabled in the exec mask,
     * then writes the scalar destination back.
     *
     * This is the plain (non-X) compare, so only D is produced. The exec
     * mask is deliberately left untouched: the earlier assignment of the
     * destination's raw value to the wavefront's exec mask belonged to the
     * v_cmpx_* forms and is not part of this instruction's stated
     * semantics.
     */
    void
    Inst_VOP3__V_CMP_F_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                sdst.setBit(lane, 0);
            }
        }

        sdst.write();
    }

    /**
     * Build the VOP3 form of v_cmp_lt_i32: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMP_LT_I32::Inst_VOP3__V_CMP_LT_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_lt_i32", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMP_LT_I32::~Inst_VOP3__V_CMP_LT_I32() = default;

    /**
     * v_cmp_lt_i32 (VOP3): D.u64[threadID] = (S0 < S1); D = VCC in VOPC
     * encoding.
     *
     * For each lane enabled in the exec mask, the lane's bit in the scalar
     * destination is set to the result of comparing the ConstVecOperandI32
     * sources; the destination is then written back.
     */
    void
    Inst_VOP3__V_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none of
        // their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isLess = lhs[laneId] < rhs[laneId];
            laneMask.setBit(laneId, isLess ? 1 : 0);
        }

        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmp_eq_i32: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMP_EQ_I32::Inst_VOP3__V_CMP_EQ_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_eq_i32", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMP_EQ_I32::~Inst_VOP3__V_CMP_EQ_I32() = default;

    /**
     * v_cmp_eq_i32 (VOP3): D.u64[threadID] = (S0 == S1); D = VCC in VOPC
     * encoding.
     *
     * For each lane enabled in the exec mask, the lane's bit in the scalar
     * destination is set to the result of comparing the ConstVecOperandI32
     * sources; the destination is then written back.
     */
    void
    Inst_VOP3__V_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none of
        // their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isEqual = lhs[laneId] == rhs[laneId];
            laneMask.setBit(laneId, isEqual ? 1 : 0);
        }

        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmp_le_i32: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMP_LE_I32::Inst_VOP3__V_CMP_LE_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_le_i32", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMP_LE_I32::~Inst_VOP3__V_CMP_LE_I32() = default;

    /**
     * v_cmp_le_i32 (VOP3): D.u64[threadID] = (S0 <= S1); D = VCC in VOPC
     * encoding.
     *
     * For each lane enabled in the exec mask, the lane's bit in the scalar
     * destination is set to the result of comparing the ConstVecOperandI32
     * sources; the destination is then written back.
     */
    void
    Inst_VOP3__V_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none of
        // their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isLessEq = lhs[laneId] <= rhs[laneId];
            laneMask.setBit(laneId, isLessEq ? 1 : 0);
        }

        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmp_gt_i32: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMP_GT_I32::Inst_VOP3__V_CMP_GT_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_gt_i32", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMP_GT_I32::~Inst_VOP3__V_CMP_GT_I32() = default;

    /**
     * v_cmp_gt_i32 (VOP3): D.u64[threadID] = (S0 > S1); D = VCC in VOPC
     * encoding.
     *
     * For each lane enabled in the exec mask, the lane's bit in the scalar
     * destination is set to the result of comparing the ConstVecOperandI32
     * sources; the destination is then written back.
     */
    void
    Inst_VOP3__V_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none of
        // their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isGreater = lhs[laneId] > rhs[laneId];
            laneMask.setBit(laneId, isGreater ? 1 : 0);
        }

        laneMask.write();
    }

    /**
     * Build the VOP3 form of v_cmp_ne_i32: hands the encoding and mnemonic
     * to Inst_VOP3 and tags the instruction with the ALU flag.
     */
    Inst_VOP3__V_CMP_NE_I32::Inst_VOP3__V_CMP_NE_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_ne_i32", true)
    {
        setFlag(ALU);
    }

    /** Destructor performs no explicit cleanup. */
    Inst_VOP3__V_CMP_NE_I32::~Inst_VOP3__V_CMP_NE_I32() = default;

    /**
     * v_cmp_ne_i32 (VOP3): D.u64[threadID] = (S0 != S1); D = VCC in VOPC
     * encoding.
     *
     * For each lane enabled in the exec mask, the lane's bit in the scalar
     * destination is set to the result of comparing the ConstVecOperandI32
     * sources; the destination is then written back.
     */
    void
    Inst_VOP3__V_CMP_NE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 laneMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none of
        // their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool differs = lhs[laneId] != rhs[laneId];
            laneMask.setBit(laneId, differs ? 1 : 0);
        }

        laneMask.write();
    }

    // Passes iFmt, the name "v_cmp_ge_i32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMP_GE_I32::Inst_VOP3__V_CMP_GE_I32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_ge_i32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMP_GE_I32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMP_GE_I32::~Inst_VOP3__V_CMP_GE_I32() = default;

    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs >= rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] >= rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        result.write();
    }

    // Passes iFmt, the name "v_cmp_t_i32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMP_T_I32::Inst_VOP3__V_CMP_T_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_t_i32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMP_T_I32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMP_T_I32::~Inst_VOP3__V_CMP_T_I32() = default;

    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_T_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        // No sources are read: every lane enabled in the exec mask gets
        // its bit set to 1; disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            result.setBit(i, 1);
        }

        result.write();
    }

    // Passes iFmt, the name "v_cmp_f_u32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMP_F_U32::Inst_VOP3__V_CMP_F_U32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_f_u32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMP_F_U32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMP_F_U32::~Inst_VOP3__V_CMP_F_U32() = default;

    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_F_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        // No sources are read: every lane enabled in the exec mask gets
        // its bit set to 0; disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            result.setBit(i, 0);
        }

        result.write();
    }

    // Passes iFmt, the name "v_cmp_lt_u32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMP_LT_U32::Inst_VOP3__V_CMP_LT_U32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_lt_u32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMP_LT_U32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMP_LT_U32::~Inst_VOP3__V_CMP_LT_U32() = default;

    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs < rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] < rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        result.write();
    }

    // Passes iFmt, the name "v_cmp_eq_u32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMP_EQ_U32::Inst_VOP3__V_CMP_EQ_U32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_eq_u32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMP_EQ_U32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMP_EQ_U32::~Inst_VOP3__V_CMP_EQ_U32() = default;

    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs == rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] == rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        result.write();
    }

    // Passes iFmt, the name "v_cmp_le_u32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMP_LE_U32::Inst_VOP3__V_CMP_LE_U32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_le_u32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMP_LE_U32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMP_LE_U32::~Inst_VOP3__V_CMP_LE_U32() = default;

    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs <= rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] <= rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        result.write();
    }

    // Passes iFmt, the name "v_cmp_gt_u32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMP_GT_U32::Inst_VOP3__V_CMP_GT_U32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_gt_u32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMP_GT_U32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMP_GT_U32::~Inst_VOP3__V_CMP_GT_U32() = default;

    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs > rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] > rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        result.write();
    }

    // Passes iFmt, the name "v_cmp_ne_u32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMP_NE_U32::Inst_VOP3__V_CMP_NE_U32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_ne_u32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMP_NE_U32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMP_NE_U32::~Inst_VOP3__V_CMP_NE_U32() = default;

    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_NE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs != rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] != rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        result.write();
    }

    // Passes iFmt, the name "v_cmp_ge_u32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMP_GE_U32::Inst_VOP3__V_CMP_GE_U32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_ge_u32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMP_GE_U32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMP_GE_U32::~Inst_VOP3__V_CMP_GE_U32() = default;

    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs >= rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] >= rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        result.write();
    }

    // Passes iFmt, the name "v_cmp_t_u32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMP_T_U32::Inst_VOP3__V_CMP_T_U32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_t_u32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMP_T_U32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMP_T_U32::~Inst_VOP3__V_CMP_T_U32() = default;

    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_T_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        // No sources are read: every lane enabled in the exec mask gets
        // its bit set to 1; disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            result.setBit(i, 1);
        }

        result.write();
    }

    // Passes iFmt, the name "v_cmpx_f_i32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMPX_F_I32::Inst_VOP3__V_CMPX_F_I32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_f_i32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMPX_F_I32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMPX_F_I32::~Inst_VOP3__V_CMPX_F_I32() = default;

    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_F_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        // No sources are read: every lane enabled in the exec mask gets
        // its bit set to 0; disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            result.setBit(i, 0);
        }

        // The mask just built replaces the wavefront's exec mask and is
        // then written out through the destination operand.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt, the name "v_cmpx_lt_i32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMPX_LT_I32::Inst_VOP3__V_CMPX_LT_I32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_lt_i32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMPX_LT_I32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMPX_LT_I32::~Inst_VOP3__V_CMPX_LT_I32() = default;

    // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_LT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs < rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] < rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        // The mask just built replaces the wavefront's exec mask and is
        // then written out through the destination operand.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt, the name "v_cmpx_eq_i32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMPX_EQ_I32::Inst_VOP3__V_CMPX_EQ_I32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_eq_i32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMPX_EQ_I32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMPX_EQ_I32::~Inst_VOP3__V_CMPX_EQ_I32() = default;

    // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_EQ_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs == rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] == rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        // The mask just built replaces the wavefront's exec mask and is
        // then written out through the destination operand.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt, the name "v_cmpx_le_i32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMPX_LE_I32::Inst_VOP3__V_CMPX_LE_I32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_le_i32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMPX_LE_I32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMPX_LE_I32::~Inst_VOP3__V_CMPX_LE_I32() = default;

    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_LE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs <= rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] <= rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        // The mask just built replaces the wavefront's exec mask and is
        // then written out through the destination operand.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt, the name "v_cmpx_gt_i32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMPX_GT_I32::Inst_VOP3__V_CMPX_GT_I32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_gt_i32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMPX_GT_I32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMPX_GT_I32::~Inst_VOP3__V_CMPX_GT_I32() = default;

    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_GT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs > rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] > rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        // The mask just built replaces the wavefront's exec mask and is
        // then written out through the destination operand.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt, the name "v_cmpx_ne_i32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMPX_NE_I32::Inst_VOP3__V_CMPX_NE_I32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ne_i32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMPX_NE_I32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMPX_NE_I32::~Inst_VOP3__V_CMPX_NE_I32() = default;

    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_NE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs != rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] != rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        // The mask just built replaces the wavefront's exec mask and is
        // then written out through the destination operand.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt, the name "v_cmpx_ge_i32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMPX_GE_I32::Inst_VOP3__V_CMPX_GE_I32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ge_i32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMPX_GE_I32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMPX_GE_I32::~Inst_VOP3__V_CMPX_GE_I32() = default;

    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_GE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs >= rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] >= rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        // The mask just built replaces the wavefront's exec mask and is
        // then written out through the destination operand.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt, the name "v_cmpx_t_i32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMPX_T_I32::Inst_VOP3__V_CMPX_T_I32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_t_i32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMPX_T_I32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMPX_T_I32::~Inst_VOP3__V_CMPX_T_I32() = default;

    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_T_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        // No sources are read: every lane enabled in the exec mask gets
        // its bit set to 1; disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            result.setBit(i, 1);
        }

        // The mask just built replaces the wavefront's exec mask and is
        // then written out through the destination operand.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt, the name "v_cmpx_f_u32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMPX_F_U32::Inst_VOP3__V_CMPX_F_U32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_f_u32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMPX_F_U32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMPX_F_U32::~Inst_VOP3__V_CMPX_F_U32() = default;

    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_F_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        // No sources are read: every lane enabled in the exec mask gets
        // its bit set to 0; disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            result.setBit(i, 0);
        }

        // The mask just built replaces the wavefront's exec mask and is
        // then written out through the destination operand.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt, the name "v_cmpx_lt_u32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMPX_LT_U32::Inst_VOP3__V_CMPX_LT_U32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_lt_u32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMPX_LT_U32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMPX_LT_U32::~Inst_VOP3__V_CMPX_LT_U32() = default;

    // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_LT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs < rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] < rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        // The mask just built replaces the wavefront's exec mask and is
        // then written out through the destination operand.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt, the name "v_cmpx_eq_u32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMPX_EQ_U32::Inst_VOP3__V_CMPX_EQ_U32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_eq_u32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMPX_EQ_U32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMPX_EQ_U32::~Inst_VOP3__V_CMPX_EQ_U32() = default;

    // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_EQ_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs == rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] == rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        // The mask just built replaces the wavefront's exec mask and is
        // then written out through the destination operand.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt, the name "v_cmpx_le_u32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMPX_LE_U32::Inst_VOP3__V_CMPX_LE_U32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_le_u32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMPX_LE_U32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMPX_LE_U32::~Inst_VOP3__V_CMPX_LE_U32() = default;

    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_LE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs <= rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] <= rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        // The mask just built replaces the wavefront's exec mask and is
        // then written out through the destination operand.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt, the name "v_cmpx_gt_u32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMPX_GT_U32::Inst_VOP3__V_CMPX_GT_U32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_gt_u32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMPX_GT_U32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMPX_GT_U32::~Inst_VOP3__V_CMPX_GT_U32() = default;

    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_GT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs > rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] > rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        // The mask just built replaces the wavefront's exec mask and is
        // then written out through the destination operand.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt, the name "v_cmpx_ne_u32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMPX_NE_U32::Inst_VOP3__V_CMPX_NE_U32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ne_u32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMPX_NE_U32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMPX_NE_U32::~Inst_VOP3__V_CMPX_NE_U32() = default;

    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_NE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs != rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] != rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        // The mask just built replaces the wavefront's exec mask and is
        // then written out through the destination operand.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt, the name "v_cmpx_ge_u32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMPX_GE_U32::Inst_VOP3__V_CMPX_GE_U32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ge_u32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMPX_GE_U32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMPX_GE_U32::~Inst_VOP3__V_CMPX_GE_U32() = default;

    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_GE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs >= rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] >= rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        // The mask just built replaces the wavefront's exec mask and is
        // then written out through the destination operand.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt, the name "v_cmpx_t_u32" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMPX_T_U32::Inst_VOP3__V_CMPX_T_U32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_t_u32", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMPX_T_U32

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMPX_T_U32::~Inst_VOP3__V_CMPX_T_U32() = default;

    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_T_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        // No sources are read: every lane enabled in the exec mask gets
        // its bit set to 1; disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            result.setBit(i, 1);
        }

        // The mask just built replaces the wavefront's exec mask and is
        // then written out through the destination operand.
        wave->execMask() = result.rawData();
        result.write();
    }

    // Passes iFmt, the name "v_cmp_f_i64" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMP_F_I64::Inst_VOP3__V_CMP_F_I64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_f_i64", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMP_F_I64

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMP_F_I64::~Inst_VOP3__V_CMP_F_I64() = default;

    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_F_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        // No sources are read: every lane enabled in the exec mask gets
        // its bit set to 0; disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            result.setBit(i, 0);
        }

        result.write();
    }

    // Passes iFmt, the name "v_cmp_lt_i64" and `true` to the Inst_VOP3
    // base constructor, then flags the instruction as ALU.
    Inst_VOP3__V_CMP_LT_I64::Inst_VOP3__V_CMP_LT_I64(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_lt_i64", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMP_LT_I64

    // The destructor performs no work of its own.
    Inst_VOP3__V_CMP_LT_I64::~Inst_VOP3__V_CMP_LT_I64() = default;

    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_LT_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers apply to FP operations only, so none of
        // their bits may be set for this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Record (lhs < rhs) for every lane enabled in the exec mask; the
        // bits of disabled lanes are not touched by this loop.
        for (int i = 0; i < NumVecElemPerVecReg; ++i) {
            if (!wave->execMask(i)) {
                continue;
            }

            const bool holds = lhs[i] < rhs[i];
            result.setBit(i, holds ? 1 : 0);
        }

        result.write();
    }

    // Sets up the VOP3 form of v_cmp_eq_i64 and flags it as an ALU op.
    Inst_VOP3__V_CMP_EQ_I64::Inst_VOP3__V_CMP_EQ_I64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_eq_i64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMP_EQ_I64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMP_EQ_I64::~Inst_VOP3__V_CMP_EQ_I64() = default;

    // Signed 64-bit equality compare (VOP3 encoding). For each lane
    // that is active in the exec mask: D.u64[lane] = (S0 == S1).
    // D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_EQ_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isEqual = lhs[laneId] == rhs[laneId];
            dstMask.setBit(laneId, isEqual ? 1 : 0);
        }

        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmp_le_i64 and flags it as an ALU op.
    Inst_VOP3__V_CMP_LE_I64::Inst_VOP3__V_CMP_LE_I64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_le_i64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMP_LE_I64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMP_LE_I64::~Inst_VOP3__V_CMP_LE_I64() = default;

    // Signed 64-bit "less than or equal" compare (VOP3 encoding). For
    // each lane that is active in the exec mask:
    // D.u64[lane] = (S0 <= S1). D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_LE_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isLessEq = lhs[laneId] <= rhs[laneId];
            dstMask.setBit(laneId, isLessEq ? 1 : 0);
        }

        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmp_gt_i64 and flags it as an ALU op.
    Inst_VOP3__V_CMP_GT_I64::Inst_VOP3__V_CMP_GT_I64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_gt_i64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMP_GT_I64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMP_GT_I64::~Inst_VOP3__V_CMP_GT_I64() = default;

    // Signed 64-bit "greater than" compare (VOP3 encoding). For each
    // lane that is active in the exec mask: D.u64[lane] = (S0 > S1).
    // D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_GT_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isGreater = lhs[laneId] > rhs[laneId];
            dstMask.setBit(laneId, isGreater ? 1 : 0);
        }

        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmp_ne_i64 and flags it as an ALU op.
    Inst_VOP3__V_CMP_NE_I64::Inst_VOP3__V_CMP_NE_I64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_ne_i64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMP_NE_I64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMP_NE_I64::~Inst_VOP3__V_CMP_NE_I64() = default;

    // Signed 64-bit inequality compare (VOP3 encoding). For each lane
    // that is active in the exec mask: D.u64[lane] = (S0 != S1).
    // D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_NE_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isDifferent = lhs[laneId] != rhs[laneId];
            dstMask.setBit(laneId, isDifferent ? 1 : 0);
        }

        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmp_ge_i64 and flags it as an ALU op.
    Inst_VOP3__V_CMP_GE_I64::Inst_VOP3__V_CMP_GE_I64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_ge_i64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMP_GE_I64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMP_GE_I64::~Inst_VOP3__V_CMP_GE_I64() = default;

    // Signed 64-bit "greater than or equal" compare (VOP3 encoding).
    // For each lane that is active in the exec mask:
    // D.u64[lane] = (S0 >= S1). D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_GE_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isGreaterEq = lhs[laneId] >= rhs[laneId];
            dstMask.setBit(laneId, isGreaterEq ? 1 : 0);
        }

        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmp_t_i64 and flags it as an ALU op.
    Inst_VOP3__V_CMP_T_I64::Inst_VOP3__V_CMP_T_I64(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_t_i64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMP_T_I64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMP_T_I64::~Inst_VOP3__V_CMP_T_I64() = default;

    // "Always true" compare (VOP3 encoding). For each lane that is
    // active in the exec mask: D.u64[lane] = 1.
    // D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_T_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            dstMask.setBit(laneId, 1);
        }

        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmp_f_u64 and flags it as an ALU op.
    Inst_VOP3__V_CMP_F_U64::Inst_VOP3__V_CMP_F_U64(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_f_u64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMP_F_U64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMP_F_U64::~Inst_VOP3__V_CMP_F_U64() = default;

    // "Always false" compare (VOP3 encoding). For each lane that is
    // active in the exec mask: D.u64[lane] = 0.
    // D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_F_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            dstMask.setBit(laneId, 0);
        }

        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmp_lt_u64 and flags it as an ALU op.
    Inst_VOP3__V_CMP_LT_U64::Inst_VOP3__V_CMP_LT_U64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_lt_u64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMP_LT_U64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMP_LT_U64::~Inst_VOP3__V_CMP_LT_U64() = default;

    // Unsigned 64-bit "less than" compare (VOP3 encoding). For each
    // lane that is active in the exec mask: D.u64[lane] = (S0 < S1).
    // D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_LT_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isLess = lhs[laneId] < rhs[laneId];
            dstMask.setBit(laneId, isLess ? 1 : 0);
        }

        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmp_eq_u64 and flags it as an ALU op.
    Inst_VOP3__V_CMP_EQ_U64::Inst_VOP3__V_CMP_EQ_U64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_eq_u64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMP_EQ_U64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMP_EQ_U64::~Inst_VOP3__V_CMP_EQ_U64() = default;

    // Unsigned 64-bit equality compare (VOP3 encoding). For each lane
    // that is active in the exec mask: D.u64[lane] = (S0 == S1).
    // D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isEqual = lhs[laneId] == rhs[laneId];
            dstMask.setBit(laneId, isEqual ? 1 : 0);
        }

        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmp_le_u64 and flags it as an ALU op.
    Inst_VOP3__V_CMP_LE_U64::Inst_VOP3__V_CMP_LE_U64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_le_u64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMP_LE_U64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMP_LE_U64::~Inst_VOP3__V_CMP_LE_U64() = default;

    // Unsigned 64-bit "less than or equal" compare (VOP3 encoding).
    // For each lane that is active in the exec mask:
    // D.u64[lane] = (S0 <= S1). D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_LE_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isLessEq = lhs[laneId] <= rhs[laneId];
            dstMask.setBit(laneId, isLessEq ? 1 : 0);
        }

        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmp_gt_u64 and flags it as an ALU op.
    Inst_VOP3__V_CMP_GT_U64::Inst_VOP3__V_CMP_GT_U64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_gt_u64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMP_GT_U64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMP_GT_U64::~Inst_VOP3__V_CMP_GT_U64() = default;

    // Unsigned 64-bit "greater than" compare (VOP3 encoding). For each
    // lane that is active in the exec mask: D.u64[lane] = (S0 > S1).
    // D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_GT_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isGreater = lhs[laneId] > rhs[laneId];
            dstMask.setBit(laneId, isGreater ? 1 : 0);
        }

        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmp_ne_u64 and flags it as an ALU op.
    Inst_VOP3__V_CMP_NE_U64::Inst_VOP3__V_CMP_NE_U64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_ne_u64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMP_NE_U64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMP_NE_U64::~Inst_VOP3__V_CMP_NE_U64() = default;

    // Unsigned 64-bit inequality compare (VOP3 encoding). For each lane
    // that is active in the exec mask: D.u64[lane] = (S0 != S1).
    // D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_NE_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isDifferent = lhs[laneId] != rhs[laneId];
            dstMask.setBit(laneId, isDifferent ? 1 : 0);
        }

        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmp_ge_u64 and flags it as an ALU op.
    Inst_VOP3__V_CMP_GE_U64::Inst_VOP3__V_CMP_GE_U64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_ge_u64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMP_GE_U64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMP_GE_U64::~Inst_VOP3__V_CMP_GE_U64() = default;

    // Unsigned 64-bit "greater than or equal" compare (VOP3 encoding).
    // For each lane that is active in the exec mask:
    // D.u64[lane] = (S0 >= S1). D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_GE_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isGreaterEq = lhs[laneId] >= rhs[laneId];
            dstMask.setBit(laneId, isGreaterEq ? 1 : 0);
        }

        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmp_t_u64 and flags it as an ALU op.
    Inst_VOP3__V_CMP_T_U64::Inst_VOP3__V_CMP_T_U64(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_t_u64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMP_T_U64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMP_T_U64::~Inst_VOP3__V_CMP_T_U64() = default;

    // "Always true" compare (VOP3 encoding). For each lane that is
    // active in the exec mask: D.u64[lane] = 1.
    // D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMP_T_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            dstMask.setBit(laneId, 1);
        }

        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmpx_f_i64 and flags it as an ALU op.
    Inst_VOP3__V_CMPX_F_I64::Inst_VOP3__V_CMPX_F_I64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_f_i64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMPX_F_I64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMPX_F_I64::~Inst_VOP3__V_CMPX_F_I64() = default;

    // "Always false" compare that also updates EXEC (VOP3 encoding).
    // For each lane that is active in the exec mask: D.u64[lane] = 0;
    // the resulting destination value then replaces the exec mask.
    // D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_F_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            dstMask.setBit(laneId, 0);
        }

        // install the computed mask as the new exec mask, then write it
        // back to the destination
        wave->execMask() = dstMask.rawData();
        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmpx_lt_i64 and flags it as an ALU op.
    Inst_VOP3__V_CMPX_LT_I64::Inst_VOP3__V_CMPX_LT_I64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_lt_i64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMPX_LT_I64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMPX_LT_I64::~Inst_VOP3__V_CMPX_LT_I64() = default;

    // Signed 64-bit "less than" compare that also updates EXEC (VOP3
    // encoding). For each lane that is active in the exec mask:
    // D.u64[lane] = (S0 < S1); the resulting destination value then
    // replaces the exec mask. D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_LT_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isLess = lhs[laneId] < rhs[laneId];
            dstMask.setBit(laneId, isLess ? 1 : 0);
        }

        // install the computed mask as the new exec mask, then write it
        // back to the destination
        wave->execMask() = dstMask.rawData();
        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmpx_eq_i64 and flags it as an ALU op.
    Inst_VOP3__V_CMPX_EQ_I64::Inst_VOP3__V_CMPX_EQ_I64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_eq_i64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMPX_EQ_I64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMPX_EQ_I64::~Inst_VOP3__V_CMPX_EQ_I64() = default;

    // Signed 64-bit equality compare that also updates EXEC (VOP3
    // encoding). For each lane that is active in the exec mask:
    // D.u64[lane] = (S0 == S1); the resulting destination value then
    // replaces the exec mask. D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_EQ_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isEqual = lhs[laneId] == rhs[laneId];
            dstMask.setBit(laneId, isEqual ? 1 : 0);
        }

        // install the computed mask as the new exec mask, then write it
        // back to the destination
        wave->execMask() = dstMask.rawData();
        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmpx_le_i64 and flags it as an ALU op.
    Inst_VOP3__V_CMPX_LE_I64::Inst_VOP3__V_CMPX_LE_I64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_le_i64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMPX_LE_I64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMPX_LE_I64::~Inst_VOP3__V_CMPX_LE_I64() = default;

    // Signed 64-bit "less than or equal" compare that also updates EXEC
    // (VOP3 encoding). For each lane that is active in the exec mask:
    // D.u64[lane] = (S0 <= S1); the resulting destination value then
    // replaces the exec mask. D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_LE_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isLessEq = lhs[laneId] <= rhs[laneId];
            dstMask.setBit(laneId, isLessEq ? 1 : 0);
        }

        // install the computed mask as the new exec mask, then write it
        // back to the destination
        wave->execMask() = dstMask.rawData();
        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmpx_gt_i64 and flags it as an ALU op.
    Inst_VOP3__V_CMPX_GT_I64::Inst_VOP3__V_CMPX_GT_I64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_gt_i64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMPX_GT_I64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMPX_GT_I64::~Inst_VOP3__V_CMPX_GT_I64() = default;

    // Signed 64-bit "greater than" compare that also updates EXEC (VOP3
    // encoding). For each lane that is active in the exec mask:
    // D.u64[lane] = (S0 > S1); the resulting destination value then
    // replaces the exec mask. D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_GT_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isGreater = lhs[laneId] > rhs[laneId];
            dstMask.setBit(laneId, isGreater ? 1 : 0);
        }

        // install the computed mask as the new exec mask, then write it
        // back to the destination
        wave->execMask() = dstMask.rawData();
        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmpx_ne_i64 and flags it as an ALU op.
    Inst_VOP3__V_CMPX_NE_I64::Inst_VOP3__V_CMPX_NE_I64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ne_i64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMPX_NE_I64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMPX_NE_I64::~Inst_VOP3__V_CMPX_NE_I64() = default;

    // Signed 64-bit inequality compare that also updates EXEC (VOP3
    // encoding). For each lane that is active in the exec mask:
    // D.u64[lane] = (S0 != S1); the resulting destination value then
    // replaces the exec mask. D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_NE_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isDifferent = lhs[laneId] != rhs[laneId];
            dstMask.setBit(laneId, isDifferent ? 1 : 0);
        }

        // install the computed mask as the new exec mask, then write it
        // back to the destination
        wave->execMask() = dstMask.rawData();
        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmpx_ge_i64 and flags it as an ALU op.
    Inst_VOP3__V_CMPX_GE_I64::Inst_VOP3__V_CMPX_GE_I64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ge_i64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMPX_GE_I64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMPX_GE_I64::~Inst_VOP3__V_CMPX_GE_I64() = default;

    // Signed 64-bit "greater than or equal" compare that also updates
    // EXEC (VOP3 encoding). For each lane that is active in the exec
    // mask: D.u64[lane] = (S0 >= S1); the resulting destination value
    // then replaces the exec mask. D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_GE_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isGreaterEq = lhs[laneId] >= rhs[laneId];
            dstMask.setBit(laneId, isGreaterEq ? 1 : 0);
        }

        // install the computed mask as the new exec mask, then write it
        // back to the destination
        wave->execMask() = dstMask.rawData();
        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmpx_t_i64 and flags it as an ALU op.
    Inst_VOP3__V_CMPX_T_I64::Inst_VOP3__V_CMPX_T_I64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_t_i64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMPX_T_I64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMPX_T_I64::~Inst_VOP3__V_CMPX_T_I64() = default;

    // "Always true" compare that also updates EXEC (VOP3 encoding).
    // For each lane that is active in the exec mask: D.u64[lane] = 1;
    // the resulting destination value then replaces the exec mask.
    // D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_T_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            dstMask.setBit(laneId, 1);
        }

        // install the computed mask as the new exec mask, then write it
        // back to the destination
        wave->execMask() = dstMask.rawData();
        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmpx_f_u64 and flags it as an ALU op.
    Inst_VOP3__V_CMPX_F_U64::Inst_VOP3__V_CMPX_F_U64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_f_u64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMPX_F_U64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMPX_F_U64::~Inst_VOP3__V_CMPX_F_U64() = default;

    // "Always false" compare that also updates EXEC (VOP3 encoding).
    // For each lane that is active in the exec mask: D.u64[lane] = 0;
    // the resulting destination value then replaces the exec mask.
    // D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_F_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            dstMask.setBit(laneId, 0);
        }

        // install the computed mask as the new exec mask, then write it
        // back to the destination
        wave->execMask() = dstMask.rawData();
        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmpx_lt_u64 and flags it as an ALU op.
    Inst_VOP3__V_CMPX_LT_U64::Inst_VOP3__V_CMPX_LT_U64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_lt_u64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMPX_LT_U64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMPX_LT_U64::~Inst_VOP3__V_CMPX_LT_U64() = default;

    // Unsigned 64-bit "less than" compare that also updates EXEC (VOP3
    // encoding). For each lane that is active in the exec mask:
    // D.u64[lane] = (S0 < S1); the resulting destination value then
    // replaces the exec mask. D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_LT_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isLess = lhs[laneId] < rhs[laneId];
            dstMask.setBit(laneId, isLess ? 1 : 0);
        }

        // install the computed mask as the new exec mask, then write it
        // back to the destination
        wave->execMask() = dstMask.rawData();
        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmpx_eq_u64 and flags it as an ALU op.
    Inst_VOP3__V_CMPX_EQ_U64::Inst_VOP3__V_CMPX_EQ_U64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_eq_u64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMPX_EQ_U64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMPX_EQ_U64::~Inst_VOP3__V_CMPX_EQ_U64() = default;

    // Unsigned 64-bit equality compare that also updates EXEC (VOP3
    // encoding). For each lane that is active in the exec mask:
    // D.u64[lane] = (S0 == S1); the resulting destination value then
    // replaces the exec mask. D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_EQ_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isEqual = lhs[laneId] == rhs[laneId];
            dstMask.setBit(laneId, isEqual ? 1 : 0);
        }

        // install the computed mask as the new exec mask, then write it
        // back to the destination
        wave->execMask() = dstMask.rawData();
        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmpx_le_u64 and flags it as an ALU op.
    Inst_VOP3__V_CMPX_LE_U64::Inst_VOP3__V_CMPX_LE_U64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_le_u64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMPX_LE_U64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMPX_LE_U64::~Inst_VOP3__V_CMPX_LE_U64() = default;

    // Unsigned 64-bit "less than or equal" compare that also updates
    // EXEC (VOP3 encoding). For each lane that is active in the exec
    // mask: D.u64[lane] = (S0 <= S1); the resulting destination value
    // then replaces the exec mask. D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_LE_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isLessEq = lhs[laneId] <= rhs[laneId];
            dstMask.setBit(laneId, isLessEq ? 1 : 0);
        }

        // install the computed mask as the new exec mask, then write it
        // back to the destination
        wave->execMask() = dstMask.rawData();
        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmpx_gt_u64 and flags it as an ALU op.
    Inst_VOP3__V_CMPX_GT_U64::Inst_VOP3__V_CMPX_GT_U64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_gt_u64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMPX_GT_U64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMPX_GT_U64::~Inst_VOP3__V_CMPX_GT_U64() = default;

    // Unsigned 64-bit "greater than" compare that also updates EXEC
    // (VOP3 encoding). For each lane that is active in the exec mask:
    // D.u64[lane] = (S0 > S1); the resulting destination value then
    // replaces the exec mask. D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_GT_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isGreater = lhs[laneId] > rhs[laneId];
            dstMask.setBit(laneId, isGreater ? 1 : 0);
        }

        // install the computed mask as the new exec mask, then write it
        // back to the destination
        wave->execMask() = dstMask.rawData();
        dstMask.write();
    }

    // Sets up the VOP3 form of v_cmpx_ne_u64 and flags it as an ALU op.
    Inst_VOP3__V_CMPX_NE_U64::Inst_VOP3__V_CMPX_NE_U64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ne_u64", true)
    {
        this->setFlag(ALU);
    } // Inst_VOP3__V_CMPX_NE_U64

    // No explicit teardown is required for this instruction.
    Inst_VOP3__V_CMPX_NE_U64::~Inst_VOP3__V_CMPX_NE_U64() = default;

    // Unsigned 64-bit inequality compare that also updates EXEC (VOP3
    // encoding). For each lane that is active in the exec mask:
    // D.u64[lane] = (S0 != S1); the resulting destination value then
    // replaces the exec mask. D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_NE_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU64 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU64 rhs(gpuDynInst, extData.SRC1);
        ScalarOperandU64 dstMask(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // ABS/NEG input modifiers exist for FP operations only, so none
        // of their bits may be set on this integer compare.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // inactive lanes never touch their result bit
            if (!wave->execMask(laneId)) {
                continue;
            }

            const bool isDifferent = lhs[laneId] != rhs[laneId];
            dstMask.setBit(laneId, isDifferent ? 1 : 0);
        }

        // install the computed mask as the new exec mask, then write it
        // back to the destination
        wave->execMask() = dstMask.rawData();
        dstMask.write();
    }

    // Constructs the VOP3-encoded v_cmpx_ge_u64 and flags it as an ALU op.
    // NOTE(review): the `true` handed to Inst_VOP3 presumably marks a
    // scalar-register destination -- confirm against Inst_VOP3.
    Inst_VOP3__V_CMPX_GE_U64::Inst_VOP3__V_CMPX_GE_U64(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ge_u64", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMPX_GE_U64

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_CMPX_GE_U64::~Inst_VOP3__V_CMPX_GE_U64()
    {
    } // ~Inst_VOP3__V_CMPX_GE_U64

    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_GE_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per-lane 64-bit unsigned "greater than or equal" compare. One
        // result bit is recorded in sdst for every lane enabled in the exec
        // mask; the packed result then replaces the wavefront's exec mask.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Integer compare: none of the ABS/NEG modifier bits may be set.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                // disabled lane: its result bit is left untouched
                continue;
            }

            const bool atLeast = src0[laneId] >= src1[laneId];
            sdst.setBit(laneId, atLeast ? 1 : 0);
        }

        // The exec mask takes the packed result before sdst is written out.
        wf->execMask() = sdst.rawData();
        sdst.write();
    }

    // Constructs the VOP3-encoded v_cmpx_t_u64 and flags it as an ALU op.
    // NOTE(review): the `true` handed to Inst_VOP3 presumably marks a
    // scalar-register destination -- confirm against Inst_VOP3.
    Inst_VOP3__V_CMPX_T_U64::Inst_VOP3__V_CMPX_T_U64(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_t_u64", true)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CMPX_T_U64

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_CMPX_T_U64::~Inst_VOP3__V_CMPX_T_U64()
    {
    } // ~Inst_VOP3__V_CMPX_T_U64

    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_T_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        // "Always true" compare: the result bit of every lane enabled in
        // the exec mask is set to 1 without reading any source operand. The
        // packed result then replaces the wavefront's exec mask.
        Wavefront *wf = gpuDynInst->wavefront();
        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

        // Integer compare: none of the ABS/NEG modifier bits may be set.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                // disabled lane: its result bit is left untouched
                continue;
            }

            sdst.setBit(laneId, 1);
        }

        // The exec mask takes the packed result before sdst is written out.
        wf->execMask() = sdst.rawData();
        sdst.write();
    }

    // Constructs the VOP3-encoded v_cndmask_b32; flagged as an ALU op that
    // reads VCC.
    // NOTE(review): the `false` handed to Inst_VOP3 presumably means the
    // destination is not a scalar register -- confirm against Inst_VOP3.
    Inst_VOP3__V_CNDMASK_B32::Inst_VOP3__V_CNDMASK_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cndmask_b32", false)
    {
        setFlag(ALU);
        setFlag(ReadsVCC);
    } // Inst_VOP3__V_CNDMASK_B32

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_CNDMASK_B32::~Inst_VOP3__V_CNDMASK_B32()
    {
    } // ~Inst_VOP3__V_CNDMASK_B32

    // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC
    // as a scalar GPR in S2.
    void
    Inst_VOP3__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Lane-wise select: for each enabled lane, the matching bit of the
        // 64-bit scalar mask read from SRC2 picks src1 (bit set) or src0
        // (bit clear) as the value stored to vdst.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        vcc.read();

        // Treated as an integer op: no ABS/NEG modifier bit may be set.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                continue;
            }

            if (bits(vcc.rawData(), laneId)) {
                vdst[laneId] = src1[laneId];
            } else {
                vdst[laneId] = src0[laneId];
            }
        }

        vdst.write();
    }

    // Constructs the VOP3-encoded v_add_f32; flagged as an ALU and F32 op.
    // NOTE(review): the `false` handed to Inst_VOP3 presumably means the
    // destination is not a scalar register -- confirm against Inst_VOP3.
    Inst_VOP3__V_ADD_F32::Inst_VOP3__V_ADD_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_add_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_ADD_F32

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_ADD_F32::~Inst_VOP3__V_ADD_F32()
    {
    } // ~Inst_VOP3__V_ADD_F32

    // D.f = S0.f + S1.f.
    void
    Inst_VOP3__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Lane-wise F32 addition: vdst = src0 + src1 for every lane enabled
        // in the wavefront's exec mask.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Modifier bit 0x1 refers to src0 and bit 0x2 to src1. On each
        // operand the ABS modifier is applied ahead of the NEG modifier.
        const int src0Sel = 0x1;
        const int src1Sel = 0x2;

        if (instData.ABS & src0Sel) {
            src0.absModifier();
        }

        if (instData.ABS & src1Sel) {
            src1.absModifier();
        }

        if (extData.NEG & src0Sel) {
            src0.negModifier();
        }

        if (extData.NEG & src1Sel) {
            src1.negModifier();
        }

        // Bit 0x4 has no matching operand in this two-source instruction,
        // so it must be clear in both modifier fields.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                continue;
            }

            vdst[laneId] = src0[laneId] + src1[laneId];
        }

        vdst.write();
    }

    // Constructs the VOP3-encoded v_sub_f32; flagged as an ALU and F32 op.
    // NOTE(review): the `false` handed to Inst_VOP3 presumably means the
    // destination is not a scalar register -- confirm against Inst_VOP3.
    Inst_VOP3__V_SUB_F32::Inst_VOP3__V_SUB_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_sub_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_SUB_F32

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_SUB_F32::~Inst_VOP3__V_SUB_F32()
    {
    } // ~Inst_VOP3__V_SUB_F32

    // D.f = S0.f - S1.f.
    void
    Inst_VOP3__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Lane-wise F32 subtraction: vdst = src0 - src1 for every lane
        // enabled in the wavefront's exec mask.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Modifier bit 0x1 refers to src0 and bit 0x2 to src1. On each
        // operand the ABS modifier is applied ahead of the NEG modifier.
        const int src0Sel = 0x1;
        const int src1Sel = 0x2;

        if (instData.ABS & src0Sel) {
            src0.absModifier();
        }

        if (instData.ABS & src1Sel) {
            src1.absModifier();
        }

        if (extData.NEG & src0Sel) {
            src0.negModifier();
        }

        if (extData.NEG & src1Sel) {
            src1.negModifier();
        }

        // Bit 0x4 has no matching operand in this two-source instruction,
        // so it must be clear in both modifier fields.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                continue;
            }

            vdst[laneId] = src0[laneId] - src1[laneId];
        }

        vdst.write();
    }

    // Constructs the VOP3-encoded v_subrev_f32; flagged as an ALU and F32
    // op.
    // NOTE(review): the `false` handed to Inst_VOP3 presumably means the
    // destination is not a scalar register -- confirm against Inst_VOP3.
    Inst_VOP3__V_SUBREV_F32::Inst_VOP3__V_SUBREV_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_subrev_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_SUBREV_F32

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_SUBREV_F32::~Inst_VOP3__V_SUBREV_F32()
    {
    } // ~Inst_VOP3__V_SUBREV_F32

    // D.f = S1.f - S0.f.
    void
    Inst_VOP3__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Lane-wise reversed F32 subtraction: vdst = src1 - src0 for every
        // lane enabled in the wavefront's exec mask.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Modifier bit 0x1 refers to src0 and bit 0x2 to src1. On each
        // operand the ABS modifier is applied ahead of the NEG modifier.
        const int src0Sel = 0x1;
        const int src1Sel = 0x2;

        if (instData.ABS & src0Sel) {
            src0.absModifier();
        }

        if (instData.ABS & src1Sel) {
            src1.absModifier();
        }

        if (extData.NEG & src0Sel) {
            src0.negModifier();
        }

        if (extData.NEG & src1Sel) {
            src1.negModifier();
        }

        // Bit 0x4 has no matching operand in this two-source instruction,
        // so it must be clear in both modifier fields.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                continue;
            }

            // operands are swapped relative to v_sub_f32
            vdst[laneId] = src1[laneId] - src0[laneId];
        }

        vdst.write();
    }

    // Constructs the VOP3-encoded v_mul_legacy_f32; flagged as an ALU and
    // F32 op.
    // NOTE(review): the `false` handed to Inst_VOP3 presumably means the
    // destination is not a scalar register -- confirm against Inst_VOP3.
    Inst_VOP3__V_MUL_LEGACY_F32::Inst_VOP3__V_MUL_LEGACY_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mul_legacy_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_MUL_LEGACY_F32

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_MUL_LEGACY_F32::~Inst_VOP3__V_MUL_LEGACY_F32()
    {
    } // ~Inst_VOP3__V_MUL_LEGACY_F32

    // D.f = S0.f * S1.f
    void
    Inst_VOP3__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Lane-wise F32 multiply with explicit special-case handling:
        //   - any NaN input                       -> NaN
        //   - src0 zero/subnormal, src1 infinite  -> NaN
        //   - src0 zero/subnormal otherwise       -> signed zero
        //   - src0 infinite, src1 zero/subnormal  -> NaN
        //   - src0 infinite otherwise             -> signed infinity
        //   - everything else                     -> src0 * src1
        // The sign of a zero/infinity result is negative exactly when the
        // two input sign bits differ.
        //
        // NOTE(review): the ISA manual describes v_mul_legacy_f32 with DX9
        // rules (0.0 * x = 0.0), whereas this body yields NaN for
        // zero * infinity, the same as v_mul_f32 -- confirm this is intended.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Modifier bit 0x1 refers to src0 and bit 0x2 to src1. On each
        // operand the ABS modifier is applied ahead of the NEG modifier.
        const int src0Sel = 0x1;
        const int src1Sel = 0x2;

        if (instData.ABS & src0Sel) {
            src0.absModifier();
        }

        if (instData.ABS & src1Sel) {
            src1.absModifier();
        }

        if (extData.NEG & src0Sel) {
            src0.negModifier();
        }

        if (extData.NEG & src1Sel) {
            src1.negModifier();
        }

        // Bit 0x4 has no matching operand in this two-source instruction,
        // so it must be clear in both modifier fields.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                continue;
            }

            const auto lhs = src0[laneId];
            const auto rhs = src1[laneId];

            const int lhsClass = std::fpclassify(lhs);
            const int rhsClass = std::fpclassify(rhs);
            const bool lhsIsTiny
                = lhsClass == FP_SUBNORMAL || lhsClass == FP_ZERO;
            const bool rhsIsTiny
                = rhsClass == FP_SUBNORMAL || rhsClass == FP_ZERO;
            const bool signsDiffer
                = std::signbit(lhs) != std::signbit(rhs);

            if (std::isnan(lhs) || std::isnan(rhs)) {
                vdst[laneId] = NAN;
            } else if (lhsIsTiny) {
                // subnormal src0 is handled exactly like zero
                if (std::isinf(rhs)) {
                    vdst[laneId] = NAN;
                } else if (signsDiffer) {
                    vdst[laneId] = -0.0;
                } else {
                    vdst[laneId] = +0.0;
                }
            } else if (std::isinf(lhs)) {
                if (rhsIsTiny) {
                    vdst[laneId] = NAN;
                } else if (signsDiffer) {
                    vdst[laneId] = -INFINITY;
                } else {
                    vdst[laneId] = +INFINITY;
                }
            } else {
                vdst[laneId] = lhs * rhs;
            }
        }

        vdst.write();
    }

    // Constructs the VOP3-encoded v_mul_f32; flagged as an ALU and F32 op.
    // NOTE(review): the `false` handed to Inst_VOP3 presumably means the
    // destination is not a scalar register -- confirm against Inst_VOP3.
    Inst_VOP3__V_MUL_F32::Inst_VOP3__V_MUL_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mul_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_MUL_F32

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_MUL_F32::~Inst_VOP3__V_MUL_F32()
    {
    } // ~Inst_VOP3__V_MUL_F32

    // D.f = S0.f * S1.f.
    void
    Inst_VOP3__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Lane-wise F32 multiply with explicit special-case handling:
        //   - any NaN input                       -> NaN
        //   - src0 zero/subnormal, src1 infinite  -> NaN
        //   - src0 zero/subnormal otherwise       -> signed zero
        //   - src0 infinite, src1 zero/subnormal  -> NaN
        //   - src0 infinite otherwise             -> signed infinity
        //   - everything else                     -> src0 * src1
        // The sign of a zero/infinity result is negative exactly when the
        // two input sign bits differ.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Modifier bit 0x1 refers to src0 and bit 0x2 to src1. On each
        // operand the ABS modifier is applied ahead of the NEG modifier.
        const int src0Sel = 0x1;
        const int src1Sel = 0x2;

        if (instData.ABS & src0Sel) {
            src0.absModifier();
        }

        if (instData.ABS & src1Sel) {
            src1.absModifier();
        }

        if (extData.NEG & src0Sel) {
            src0.negModifier();
        }

        if (extData.NEG & src1Sel) {
            src1.negModifier();
        }

        // Bit 0x4 has no matching operand in this two-source instruction,
        // so it must be clear in both modifier fields.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                continue;
            }

            const auto lhs = src0[laneId];
            const auto rhs = src1[laneId];

            const int lhsClass = std::fpclassify(lhs);
            const int rhsClass = std::fpclassify(rhs);
            const bool lhsIsTiny
                = lhsClass == FP_SUBNORMAL || lhsClass == FP_ZERO;
            const bool rhsIsTiny
                = rhsClass == FP_SUBNORMAL || rhsClass == FP_ZERO;
            const bool signsDiffer
                = std::signbit(lhs) != std::signbit(rhs);

            if (std::isnan(lhs) || std::isnan(rhs)) {
                vdst[laneId] = NAN;
            } else if (lhsIsTiny) {
                // subnormal src0 is handled exactly like zero
                if (std::isinf(rhs)) {
                    vdst[laneId] = NAN;
                } else if (signsDiffer) {
                    vdst[laneId] = -0.0;
                } else {
                    vdst[laneId] = +0.0;
                }
            } else if (std::isinf(lhs)) {
                if (rhsIsTiny) {
                    vdst[laneId] = NAN;
                } else if (signsDiffer) {
                    vdst[laneId] = -INFINITY;
                } else {
                    vdst[laneId] = +INFINITY;
                }
            } else {
                vdst[laneId] = lhs * rhs;
            }
        }

        vdst.write();
    }

    // Constructs the VOP3-encoded v_mul_i32_i24 and flags it as an ALU op.
    // NOTE(review): the `false` handed to Inst_VOP3 presumably means the
    // destination is not a scalar register -- confirm against Inst_VOP3.
    Inst_VOP3__V_MUL_I32_I24::Inst_VOP3__V_MUL_I32_I24(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mul_i32_i24", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MUL_I32_I24

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_MUL_I32_I24::~Inst_VOP3__V_MUL_I32_I24()
    {
    } // ~Inst_VOP3__V_MUL_I32_I24

    // D.i = S0.i[23:0] * S1.i[23:0].
    void
    Inst_VOP3__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst)
    {
        // Lane-wise signed 24-bit multiply: bits [23:0] of each source are
        // sign-extended, multiplied, and the product is stored to the I32
        // destination for every lane enabled in the exec mask.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        // SRC1 is fetched with readSrc(), the same way as SRC0 and as the
        // sibling VOP3 operations fetch both of their sources.
        // NOTE(review): plain read() presumably only covers the vector
        // register file -- confirm against the operand class.
        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = sext<24>(bits(src0[lane], 23, 0))
                    * sext<24>(bits(src1[lane], 23, 0));
            }
        }

        vdst.write();
    }

    // Constructs the VOP3-encoded v_mul_hi_i32_i24 and flags it as an ALU
    // op.
    // NOTE(review): the `false` handed to Inst_VOP3 presumably means the
    // destination is not a scalar register -- confirm against Inst_VOP3.
    Inst_VOP3__V_MUL_HI_I32_I24::Inst_VOP3__V_MUL_HI_I32_I24(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mul_hi_i32_i24", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MUL_HI_I32_I24

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_MUL_HI_I32_I24::~Inst_VOP3__V_MUL_HI_I32_I24()
    {
    } // ~Inst_VOP3__V_MUL_HI_I32_I24

    // D.i = (S0.i[23:0] * S1.i[23:0]) >> 32.
    void
    Inst_VOP3__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst)
    {
        // Lane-wise signed 24-bit multiply, high part: bits [23:0] of each
        // source are sign-extended and multiplied as 64-bit values; the
        // product shifted right by 32 is stored for every enabled lane.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Integer op: none of the ABS/NEG modifier bits may be set.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                continue;
            }

            const VecElemI64 lhs = static_cast<VecElemI64>(
                sext<24>(bits(src0[laneId], 23, 0)));
            const VecElemI64 rhs = static_cast<VecElemI64>(
                sext<24>(bits(src1[laneId], 23, 0)));
            const VecElemI64 product = lhs * rhs;

            vdst[laneId] = static_cast<VecElemI32>(product >> 32);
        }

        vdst.write();
    }

    // Constructs the VOP3-encoded v_mul_u32_u24 and flags it as an ALU op.
    // NOTE(review): the `false` handed to Inst_VOP3 presumably means the
    // destination is not a scalar register -- confirm against Inst_VOP3.
    Inst_VOP3__V_MUL_U32_U24::Inst_VOP3__V_MUL_U32_U24(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mul_u32_u24", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MUL_U32_U24

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_MUL_U32_U24::~Inst_VOP3__V_MUL_U32_U24()
    {
    } // ~Inst_VOP3__V_MUL_U32_U24

    // D.u = S0.u[23:0] * S1.u[23:0].
    void
    Inst_VOP3__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst)
    {
        // Lane-wise unsigned 24-bit multiply: bits [23:0] of each source
        // are multiplied and the product is stored to the U32 destination
        // for every lane enabled in the exec mask.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Integer op: none of the ABS/NEG modifier bits may be set.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                continue;
            }

            const auto lhsLow24 = bits(src0[laneId], 23, 0);
            const auto rhsLow24 = bits(src1[laneId], 23, 0);

            vdst[laneId] = lhsLow24 * rhsLow24;
        }

        vdst.write();
    }

    // Constructs the VOP3-encoded v_mul_hi_u32_u24 and flags it as an ALU
    // op.
    // NOTE(review): the `false` handed to Inst_VOP3 presumably means the
    // destination is not a scalar register -- confirm against Inst_VOP3.
    Inst_VOP3__V_MUL_HI_U32_U24::Inst_VOP3__V_MUL_HI_U32_U24(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mul_hi_u32_u24", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MUL_HI_U32_U24

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_MUL_HI_U32_U24::~Inst_VOP3__V_MUL_HI_U32_U24()
    {
    } // ~Inst_VOP3__V_MUL_HI_U32_U24

    // D.i = (S0.u[23:0] * S1.u[23:0]) >> 32.
    void
    Inst_VOP3__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst)
    {
        // Lane-wise unsigned 24-bit multiply, high part: bits [23:0] of
        // each source are widened to 64 bits and multiplied; the product
        // shifted right by 32 is stored for every enabled lane.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Integer op: none of the ABS/NEG modifier bits may be set.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                continue;
            }

            const VecElemU64 lhs
                = static_cast<VecElemU64>(bits(src0[laneId], 23, 0));
            const VecElemU64 rhs
                = static_cast<VecElemU64>(bits(src1[laneId], 23, 0));
            const VecElemU64 product = lhs * rhs;

            vdst[laneId] = static_cast<VecElemU32>(product >> 32);
        }

        vdst.write();
    }

    // Constructs the VOP3-encoded v_min_f32; flagged as an ALU and F32 op.
    // NOTE(review): the `false` handed to Inst_VOP3 presumably means the
    // destination is not a scalar register -- confirm against Inst_VOP3.
    Inst_VOP3__V_MIN_F32::Inst_VOP3__V_MIN_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_min_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_MIN_F32

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_MIN_F32::~Inst_VOP3__V_MIN_F32()
    {
    } // ~Inst_VOP3__V_MIN_F32

    // D.f = (S0.f < S1.f ? S0.f : S1.f).
    void
    Inst_VOP3__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Lane-wise F32 minimum, computed with std::fmin, for every lane
        // enabled in the wavefront's exec mask.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Modifier bit 0x1 refers to src0 and bit 0x2 to src1. On each
        // operand the ABS modifier is applied ahead of the NEG modifier.
        const int src0Sel = 0x1;
        const int src1Sel = 0x2;

        if (instData.ABS & src0Sel) {
            src0.absModifier();
        }

        if (instData.ABS & src1Sel) {
            src1.absModifier();
        }

        if (extData.NEG & src0Sel) {
            src0.negModifier();
        }

        if (extData.NEG & src1Sel) {
            src1.negModifier();
        }

        // Bit 0x4 has no matching operand in this two-source instruction,
        // so it must be clear in both modifier fields.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                continue;
            }

            const auto lhs = src0[laneId];
            const auto rhs = src1[laneId];

            vdst[laneId] = std::fmin(lhs, rhs);
        }

        vdst.write();
    }

    // Constructs the VOP3-encoded v_max_f32; flagged as an ALU and F32 op.
    // NOTE(review): the `false` handed to Inst_VOP3 presumably means the
    // destination is not a scalar register -- confirm against Inst_VOP3.
    Inst_VOP3__V_MAX_F32::Inst_VOP3__V_MAX_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_max_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_MAX_F32

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_MAX_F32::~Inst_VOP3__V_MAX_F32()
    {
    } // ~Inst_VOP3__V_MAX_F32

    // D.f = (S0.f >= S1.f ? S0.f : S1.f).
    void
    Inst_VOP3__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Lane-wise F32 maximum, computed with std::fmax, for every lane
        // enabled in the wavefront's exec mask.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Modifier bit 0x1 refers to src0 and bit 0x2 to src1. On each
        // operand the ABS modifier is applied ahead of the NEG modifier.
        const int src0Sel = 0x1;
        const int src1Sel = 0x2;

        if (instData.ABS & src0Sel) {
            src0.absModifier();
        }

        if (instData.ABS & src1Sel) {
            src1.absModifier();
        }

        if (extData.NEG & src0Sel) {
            src0.negModifier();
        }

        if (extData.NEG & src1Sel) {
            src1.negModifier();
        }

        // Bit 0x4 has no matching operand in this two-source instruction,
        // so it must be clear in both modifier fields.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                continue;
            }

            const auto lhs = src0[laneId];
            const auto rhs = src1[laneId];

            vdst[laneId] = std::fmax(lhs, rhs);
        }

        vdst.write();
    }

    // Constructs the VOP3-encoded v_min_i32 and flags it as an ALU op.
    // NOTE(review): the `false` handed to Inst_VOP3 presumably means the
    // destination is not a scalar register -- confirm against Inst_VOP3.
    Inst_VOP3__V_MIN_I32::Inst_VOP3__V_MIN_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_min_i32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MIN_I32

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_MIN_I32::~Inst_VOP3__V_MIN_I32()
    {
    } // ~Inst_VOP3__V_MIN_I32

    // D.i = min(S0.i, S1.i).
    void
    Inst_VOP3__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Lane-wise signed 32-bit minimum of src0 and src1 for every lane
        // enabled in the wavefront's exec mask.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Integer op: none of the ABS/NEG modifier bits may be set.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                continue;
            }

            const auto lhs = src0[laneId];
            const auto rhs = src1[laneId];

            vdst[laneId] = (rhs < lhs) ? rhs : lhs;
        }

        vdst.write();
    }

    // Constructs the VOP3-encoded v_max_i32 and flags it as an ALU op.
    // NOTE(review): the `false` handed to Inst_VOP3 presumably means the
    // destination is not a scalar register -- confirm against Inst_VOP3.
    Inst_VOP3__V_MAX_I32::Inst_VOP3__V_MAX_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_max_i32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MAX_I32

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_MAX_I32::~Inst_VOP3__V_MAX_I32()
    {
    } // ~Inst_VOP3__V_MAX_I32

    // D.i = max(S0.i, S1.i).
    void
    Inst_VOP3__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Lane-wise signed 32-bit maximum of src0 and src1 for every lane
        // enabled in the wavefront's exec mask.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Integer op: none of the ABS/NEG modifier bits may be set.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                continue;
            }

            const auto lhs = src0[laneId];
            const auto rhs = src1[laneId];

            vdst[laneId] = (lhs < rhs) ? rhs : lhs;
        }

        vdst.write();
    }

    // Constructs the VOP3-encoded v_min_u32 and flags it as an ALU op.
    // NOTE(review): the `false` handed to Inst_VOP3 presumably means the
    // destination is not a scalar register -- confirm against Inst_VOP3.
    Inst_VOP3__V_MIN_U32::Inst_VOP3__V_MIN_U32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_min_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MIN_U32

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_MIN_U32::~Inst_VOP3__V_MIN_U32()
    {
    } // ~Inst_VOP3__V_MIN_U32

    // D.u = min(S0.u, S1.u).
    void
    Inst_VOP3__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Lane-wise unsigned 32-bit minimum of src0 and src1 for every lane
        // enabled in the wavefront's exec mask.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Integer op: none of the ABS/NEG modifier bits may be set.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                continue;
            }

            const auto lhs = src0[laneId];
            const auto rhs = src1[laneId];

            vdst[laneId] = (rhs < lhs) ? rhs : lhs;
        }

        vdst.write();
    }

    // Constructs the VOP3-encoded v_max_u32 and flags it as an ALU op.
    // NOTE(review): the `false` handed to Inst_VOP3 presumably means the
    // destination is not a scalar register -- confirm against Inst_VOP3.
    Inst_VOP3__V_MAX_U32::Inst_VOP3__V_MAX_U32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_max_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MAX_U32

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_MAX_U32::~Inst_VOP3__V_MAX_U32()
    {
    } // ~Inst_VOP3__V_MAX_U32

    // D.u = max(S0.u, S1.u).
    void
    Inst_VOP3__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Lane-wise unsigned 32-bit maximum of src0 and src1 for every lane
        // enabled in the wavefront's exec mask.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Integer op: none of the ABS/NEG modifier bits may be set.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                continue;
            }

            const auto lhs = src0[laneId];
            const auto rhs = src1[laneId];

            vdst[laneId] = (lhs < rhs) ? rhs : lhs;
        }

        vdst.write();
    }

    // Constructs the VOP3-encoded v_lshrrev_b32 and flags it as an ALU op.
    // NOTE(review): the `false` handed to Inst_VOP3 presumably means the
    // destination is not a scalar register -- confirm against Inst_VOP3.
    Inst_VOP3__V_LSHRREV_B32::Inst_VOP3__V_LSHRREV_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_lshrrev_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_LSHRREV_B32

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_LSHRREV_B32::~Inst_VOP3__V_LSHRREV_B32()
    {
    } // ~Inst_VOP3__V_LSHRREV_B32

    // D.u = S1.u >> S0.u[4:0].
    // The vacated bits are set to zero.
    void
    Inst_VOP3__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Lane-wise right shift of the unsigned 32-bit src1 by the low five
        // bits of src0 ("rev": the shift amount comes from src0), for every
        // lane enabled in the wavefront's exec mask.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Integer op: none of the ABS/NEG modifier bits may be set.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                continue;
            }

            const auto shiftAmount = bits(src0[laneId], 4, 0);

            vdst[laneId] = src1[laneId] >> shiftAmount;
        }

        vdst.write();
    }

    // Constructs the VOP3-encoded v_ashrrev_i32 and flags it as an ALU op.
    // NOTE(review): the `false` handed to Inst_VOP3 presumably means the
    // destination is not a scalar register -- confirm against Inst_VOP3.
    Inst_VOP3__V_ASHRREV_I32::Inst_VOP3__V_ASHRREV_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_ashrrev_i32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_ASHRREV_I32

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_ASHRREV_I32::~Inst_VOP3__V_ASHRREV_I32()
    {
    } // ~Inst_VOP3__V_ASHRREV_I32

    // D.i = signext(S1.i) >> S0.i[4:0].
    // The vacated bits are set to the sign bit of the input value.
    void
    Inst_VOP3__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Lane-wise right shift of the signed 32-bit src1 by the low five
        // bits of the unsigned src0 ("rev": the shift amount comes from
        // src0), for every lane enabled in the wavefront's exec mask.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Integer op: none of the ABS/NEG modifier bits may be set.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                continue;
            }

            const auto shiftAmount = bits(src0[laneId], 4, 0);

            // src1 is a signed element type, so >> operates on a signed
            // value here
            vdst[laneId] = src1[laneId] >> shiftAmount;
        }

        vdst.write();
    }

    // Constructs the VOP3-encoded v_lshlrev_b32 and flags it as an ALU op.
    // NOTE(review): the `false` handed to Inst_VOP3 presumably means the
    // destination is not a scalar register -- confirm against Inst_VOP3.
    Inst_VOP3__V_LSHLREV_B32::Inst_VOP3__V_LSHLREV_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_lshlrev_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_LSHLREV_B32

    // Empty destructor body: no cleanup is performed by this class itself.
    Inst_VOP3__V_LSHLREV_B32::~Inst_VOP3__V_LSHLREV_B32()
    {
    } // ~Inst_VOP3__V_LSHLREV_B32

    // D.u = S1.u << S0.u[4:0].
    void
    Inst_VOP3__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Lane-wise left shift of the unsigned 32-bit src1 by the low five
        // bits of src0 ("rev": the shift amount comes from src0), for every
        // lane enabled in the wavefront's exec mask.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Integer op: none of the ABS/NEG modifier bits may be set.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!wf->execMask(laneId)) {
                continue;
            }

            const auto shiftAmount = bits(src0[laneId], 4, 0);

            vdst[laneId] = src1[laneId] << shiftAmount;
        }

        vdst.write();
    }

    /**
     * Builds the VOP3 form of v_and_b32 and flags it as an ALU
     * instruction.
     */
    Inst_VOP3__V_AND_B32::Inst_VOP3__V_AND_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_and_b32", false)
    {
        setFlag(ALU);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_AND_B32::~Inst_VOP3__V_AND_B32() = default;

    // D.u = S0.u & S1.u.
    // Input and output modifiers not supported.
    void
    Inst_VOP3__V_AND_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = SRC0 & SRC1, computed only for lanes whose
        // exec-mask bit is set.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, extData.SRC1);
        VecOperandU32 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Integer operation: every ABS/NEG input-modifier bit must be
        // clear.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            result[lane] = lhs[lane] & rhs[lane];
        }

        result.write();
    }

    /**
     * Builds the VOP3 form of v_or_b32 and flags it as an ALU
     * instruction.
     */
    Inst_VOP3__V_OR_B32::Inst_VOP3__V_OR_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_or_b32", false)
    {
        setFlag(ALU);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_OR_B32::~Inst_VOP3__V_OR_B32() = default;

    // D.u = S0.u | S1.u.
    // Input and output modifiers not supported.
    void
    Inst_VOP3__V_OR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = SRC0 | SRC1, computed only for lanes whose
        // exec-mask bit is set.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, extData.SRC1);
        VecOperandU32 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Integer operation: every ABS/NEG input-modifier bit must be
        // clear.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            result[lane] = lhs[lane] | rhs[lane];
        }

        result.write();
    }

    /**
     * Builds the VOP3 form of v_xor_b32 and flags it as an ALU
     * instruction.
     */
    Inst_VOP3__V_XOR_B32::Inst_VOP3__V_XOR_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_xor_b32", false)
    {
        setFlag(ALU);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_XOR_B32::~Inst_VOP3__V_XOR_B32() = default;

    // D.u = S0.u ^ S1.u.
    // Input and output modifiers not supported.
    void
    Inst_VOP3__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = SRC0 ^ SRC1, computed only for lanes whose
        // exec-mask bit is set.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, extData.SRC1);
        VecOperandU32 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Integer operation: every ABS/NEG input-modifier bit must be
        // clear.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            result[lane] = lhs[lane] ^ rhs[lane];
        }

        result.write();
    }

    /**
     * Builds the VOP3 form of v_mac_f32 and sets the ALU, F32 and MAC
     * flags on it.
     */
    Inst_VOP3__V_MAC_F32::Inst_VOP3__V_MAC_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mac_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(MAC);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_MAC_F32::~Inst_VOP3__V_MAC_F32() = default;

    // D.f = S0.f * S1.f + D.f.
    void
    Inst_VOP3__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = fma(SRC0, SRC1, VDST), computed only for lanes
        // whose exec-mask bit is set. VDST is both read and written.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 mulA(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 mulB(gpuDynInst, extData.SRC1);
        VecOperandF32 accum(gpuDynInst, instData.VDST);

        mulA.readSrc();
        mulB.readSrc();
        accum.read();

        // Bit 0 of ABS/NEG applies to SRC0, bit 1 to SRC1.
        if (instData.ABS & 0x1) {
            mulA.absModifier();
        }

        if (instData.ABS & 0x2) {
            mulB.absModifier();
        }

        if (extData.NEG & 0x1) {
            mulA.negModifier();
        }

        if (extData.NEG & 0x2) {
            mulB.negModifier();
        }

        // Bit 2 of ABS/NEG is never applied here and must be clear.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            accum[lane] = std::fma(mulA[lane], mulB[lane], accum[lane]);
        }

        accum.write();
    }

    /**
     * Builds the VOP3 (SDST encoding) form of v_add_u32 and sets the ALU
     * and WritesVCC flags on it.
     */
    Inst_VOP3__V_ADD_U32::Inst_VOP3__V_ADD_U32(InFmt_VOP3_SDST_ENC *iFmt)
        : Inst_VOP3_SDST_ENC(iFmt, "v_add_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_ADD_U32::~Inst_VOP3__V_ADD_U32() = default;

    // D.u = S0.u + S1.u;
    // VCC[threadId] = (S0.u + S1.u >= 0x100000000ULL ? 1 : 0) is an UNSIGNED
    // overflow or carry-out.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    void
    Inst_VOP3__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = SRC0 + SRC1, with the unsigned carry-out
        // stored in the lane's bit of the SDST scalar operand. Only lanes
        // whose exec-mask bit is set are computed.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, extData.SRC1);
        VecOperandU32 sum(gpuDynInst, instData.VDST);
        ScalarOperandU64 carryOut(gpuDynInst, instData.SDST);

        lhs.readSrc();
        rhs.readSrc();

        // Integer operation: every NEG input-modifier bit must be clear.
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            sum[lane] = lhs[lane] + rhs[lane];

            // Redo the add in 64 bits so the carry is visible.
            const VecElemU64 wideSum = (VecElemU64)lhs[lane]
                + (VecElemU64)rhs[lane];
            carryOut.setBit(lane, wideSum >= 0x100000000ULL ? 1 : 0);
        }

        sum.write();
        carryOut.write();
    }

    /**
     * Builds the VOP3 (SDST encoding) form of v_sub_u32 and sets the ALU
     * and WritesVCC flags on it.
     */
    Inst_VOP3__V_SUB_U32::Inst_VOP3__V_SUB_U32(InFmt_VOP3_SDST_ENC *iFmt)
        : Inst_VOP3_SDST_ENC(iFmt, "v_sub_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_SUB_U32::~Inst_VOP3__V_SUB_U32() = default;

    // D.u = S0.u - S1.u;
    // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or
    // carry-out.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    void
    Inst_VOP3__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = SRC0 - SRC1, with the unsigned borrow
        // (SRC1 > SRC0) stored in the lane's bit of the SDST scalar
        // operand. Only lanes whose exec-mask bit is set are computed.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 minuend(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 subtrahend(gpuDynInst, extData.SRC1);
        VecOperandU32 diff(gpuDynInst, instData.VDST);
        ScalarOperandU64 borrowOut(gpuDynInst, instData.SDST);

        minuend.readSrc();
        subtrahend.readSrc();

        // Integer operation: every NEG input-modifier bit must be clear.
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            diff[lane] = minuend[lane] - subtrahend[lane];

            const bool borrowed = subtrahend[lane] > minuend[lane];
            borrowOut.setBit(lane, borrowed ? 1 : 0);
        }

        diff.write();
        borrowOut.write();
    }

    /**
     * Builds the VOP3 (SDST encoding) form of v_subrev_u32 and sets the
     * ALU and WritesVCC flags on it.
     */
    Inst_VOP3__V_SUBREV_U32::Inst_VOP3__V_SUBREV_U32(
          InFmt_VOP3_SDST_ENC *iFmt)
        : Inst_VOP3_SDST_ENC(iFmt, "v_subrev_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_SUBREV_U32::~Inst_VOP3__V_SUBREV_U32() = default;

    // D.u = S1.u - S0.u;
    // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or
    // carry-out.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    void
    Inst_VOP3__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = SRC1 - SRC0, with the unsigned borrow
        // (SRC0 > SRC1) stored in the lane's bit of the SDST scalar
        // operand. Only lanes whose exec-mask bit is set are computed.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 minuend(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 subtrahend(gpuDynInst, extData.SRC0);
        VecOperandU32 diff(gpuDynInst, instData.VDST);
        ScalarOperandU64 borrowOut(gpuDynInst, instData.SDST);

        subtrahend.readSrc();
        minuend.readSrc();

        // Integer operation: every NEG input-modifier bit must be clear.
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            diff[lane] = minuend[lane] - subtrahend[lane];

            const bool borrowed = subtrahend[lane] > minuend[lane];
            borrowOut.setBit(lane, borrowed ? 1 : 0);
        }

        diff.write();
        borrowOut.write();
    }

    /**
     * Builds the VOP3 (SDST encoding) form of v_addc_u32 and sets the ALU,
     * WritesVCC and ReadsVCC flags on it.
     */
    Inst_VOP3__V_ADDC_U32::Inst_VOP3__V_ADDC_U32(InFmt_VOP3_SDST_ENC *iFmt)
        : Inst_VOP3_SDST_ENC(iFmt, "v_addc_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(ReadsVCC);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_ADDC_U32::~Inst_VOP3__V_ADDC_U32() = default;

    // D.u = S0.u + S1.u + VCC[threadId];
    // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x100000000ULL ? 1 : 0)
    // is an UNSIGNED overflow.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
    // source comes from the SGPR-pair at S2.u.
    void
    Inst_VOP3__V_ADDC_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = SRC0 + SRC1 + carry-in, where the carry-in is
        // the lane's bit of the SRC2 scalar operand and the carry-out is
        // stored in the lane's bit of the SDST scalar operand. Only lanes
        // whose exec-mask bit is set are computed.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 rhs(gpuDynInst, extData.SRC1);
        ConstScalarOperandU64 carryInMask(gpuDynInst, extData.SRC2);
        VecOperandU32 sum(gpuDynInst, instData.VDST);
        ScalarOperandU64 carryOut(gpuDynInst, instData.SDST);

        lhs.readSrc();
        rhs.readSrc();
        carryInMask.read();

        // Integer operation: every NEG input-modifier bit must be clear.
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const auto carryIn = bits(carryInMask.rawData(), lane);
            sum[lane] = lhs[lane] + rhs[lane] + carryIn;

            // Redo the add in 64 bits so the carry is visible.
            const VecElemU64 wideSum = (VecElemU64)lhs[lane]
                + (VecElemU64)rhs[lane] + (VecElemU64)carryIn;
            carryOut.setBit(lane, wideSum >= 0x100000000 ? 1 : 0);
        }

        sum.write();
        carryOut.write();
    }

    /**
     * Builds the VOP3 (SDST encoding) form of v_subb_u32 and sets the ALU,
     * WritesVCC and ReadsVCC flags on it.
     */
    Inst_VOP3__V_SUBB_U32::Inst_VOP3__V_SUBB_U32(InFmt_VOP3_SDST_ENC *iFmt)
        : Inst_VOP3_SDST_ENC(iFmt, "v_subb_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(ReadsVCC);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_SUBB_U32::~Inst_VOP3__V_SUBB_U32() = default;

    // D.u = S0.u - S1.u - VCC[threadId];
    // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED
    // overflow.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
    // source comes from the SGPR-pair at S2.u.
    void
    Inst_VOP3__V_SUBB_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = SRC0 - SRC1 - borrow-in, where the borrow-in is
        // the lane's bit of the SRC2 scalar operand and the borrow-out
        // (SRC1 + borrow-in > SRC0) is stored in the lane's bit of the
        // SDST scalar operand. Only lanes whose exec-mask bit is set are
        // computed.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 minuend(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 subtrahend(gpuDynInst, extData.SRC1);
        ConstScalarOperandU64 borrowInMask(gpuDynInst, extData.SRC2);
        ScalarOperandU64 borrowOut(gpuDynInst, instData.SDST);
        VecOperandU32 diff(gpuDynInst, instData.VDST);

        minuend.readSrc();
        subtrahend.readSrc();
        borrowInMask.read();

        // Integer operation: every NEG input-modifier bit must be clear.
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const auto borrowIn = bits(borrowInMask.rawData(), lane);
            diff[lane] = minuend[lane] - subtrahend[lane] - borrowIn;

            const bool borrowed =
                (subtrahend[lane] + borrowIn) > minuend[lane];
            borrowOut.setBit(lane, borrowed ? 1 : 0);
        }

        diff.write();
        borrowOut.write();
    }

    /**
     * Builds the VOP3 (SDST encoding) form of v_subbrev_u32 and sets the
     * ALU, WritesVCC and ReadsVCC flags on it.
     */
    Inst_VOP3__V_SUBBREV_U32::Inst_VOP3__V_SUBBREV_U32(
          InFmt_VOP3_SDST_ENC *iFmt)
        : Inst_VOP3_SDST_ENC(iFmt, "v_subbrev_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(ReadsVCC);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_SUBBREV_U32::~Inst_VOP3__V_SUBBREV_U32() = default;

    // D.u = S1.u - S0.u - VCC[threadId];
    // VCC[threadId] = (S0.u + VCC[threadId] > S1.u ? 1 : 0) is an UNSIGNED
    // overflow (the borrow out of the reversed subtraction).
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
    // source comes from the SGPR-pair at S2.u.
    void
    Inst_VOP3__V_SUBBREV_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        // SDST receives the borrow-out and is written; SRC2 supplies the
        // borrow-in and is only read. The const-ness of the two operands
        // therefore matches v_addc_u32 and v_subb_u32.
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        vcc.read();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] - src0[lane]
                    - bits(vcc.rawData(), lane);
                // The minuend here is S1, so a borrow occurs when the
                // amount subtracted (S0 plus the borrow-in) exceeds S1.
                // The sum is formed in 64 bits so it cannot wrap.
                sdst.setBit(lane, ((VecElemU64)src0[lane]
                    + (VecElemU64)bits(vcc.rawData(), lane))
                        > src1[lane] ? 1 : 0);
            }
        }

        vdst.write();
        sdst.write();
    }

    /**
     * Builds the VOP3 form of v_add_f16 and sets the ALU and F16 flags on
     * it.
     */
    Inst_VOP3__V_ADD_F16::Inst_VOP3__V_ADD_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_add_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_ADD_F16::~Inst_VOP3__V_ADD_F16() = default;

    /**
     * Intended semantics: D.f16 = S0.f16 + S1.f16.
     * Not modelled; execution only calls panicUnimplemented().
     */
    void
    Inst_VOP3__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Builds the VOP3 form of v_sub_f16 and sets the ALU and F16 flags on
     * it.
     */
    Inst_VOP3__V_SUB_F16::Inst_VOP3__V_SUB_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_sub_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_SUB_F16::~Inst_VOP3__V_SUB_F16() = default;

    /**
     * Intended semantics: D.f16 = S0.f16 - S1.f16.
     * Not modelled; execution only calls panicUnimplemented().
     */
    void
    Inst_VOP3__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Builds the VOP3 form of v_subrev_f16 and sets the ALU and F16 flags
     * on it.
     */
    Inst_VOP3__V_SUBREV_F16::Inst_VOP3__V_SUBREV_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_subrev_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_SUBREV_F16::~Inst_VOP3__V_SUBREV_F16() = default;

    /**
     * Intended semantics: D.f16 = S1.f16 - S0.f16.
     * Not modelled; execution only calls panicUnimplemented().
     */
    void
    Inst_VOP3__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Builds the VOP3 form of v_mul_f16 and sets the ALU and F16 flags on
     * it.
     */
    Inst_VOP3__V_MUL_F16::Inst_VOP3__V_MUL_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mul_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_MUL_F16::~Inst_VOP3__V_MUL_F16() = default;

    /**
     * Intended semantics: D.f16 = S0.f16 * S1.f16.
     * Not modelled; execution only calls panicUnimplemented().
     */
    void
    Inst_VOP3__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Builds the VOP3 form of v_mac_f16 and sets the ALU, F16 and MAC
     * flags on it.
     */
    Inst_VOP3__V_MAC_F16::Inst_VOP3__V_MAC_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mac_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(MAC);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_MAC_F16::~Inst_VOP3__V_MAC_F16() = default;

    /**
     * Intended semantics: D.f16 = S0.f16 * S1.f16 + D.f16.
     * Not modelled; execution only calls panicUnimplemented().
     */
    void
    Inst_VOP3__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Builds the VOP3 form of v_add_u16 and flags it as an ALU
     * instruction.
     */
    Inst_VOP3__V_ADD_U16::Inst_VOP3__V_ADD_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_add_u16", false)
    {
        setFlag(ALU);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_ADD_U16::~Inst_VOP3__V_ADD_U16() = default;

    // D.u16 = S0.u16 + S1.u16.
    void
    Inst_VOP3__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = SRC0 + SRC1 on 16-bit unsigned operands,
        // computed only for lanes whose exec-mask bit is set.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, extData.SRC1);
        VecOperandU16 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Integer operation: every ABS/NEG input-modifier bit must be
        // clear.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            result[lane] = lhs[lane] + rhs[lane];
        }

        result.write();
    }

    /**
     * Builds the VOP3 form of v_sub_u16 and flags it as an ALU
     * instruction.
     */
    Inst_VOP3__V_SUB_U16::Inst_VOP3__V_SUB_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_sub_u16", false)
    {
        setFlag(ALU);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_SUB_U16::~Inst_VOP3__V_SUB_U16() = default;

    // D.u16 = S0.u16 - S1.u16.
    void
    Inst_VOP3__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = SRC0 - SRC1 on 16-bit unsigned operands,
        // computed only for lanes whose exec-mask bit is set.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 minuend(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 subtrahend(gpuDynInst, extData.SRC1);
        VecOperandU16 result(gpuDynInst, instData.VDST);

        minuend.readSrc();
        subtrahend.readSrc();

        // Integer operation: every ABS/NEG input-modifier bit must be
        // clear.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            result[lane] = minuend[lane] - subtrahend[lane];
        }

        result.write();
    }

    /**
     * Builds the VOP3 form of v_subrev_u16 and flags it as an ALU
     * instruction.
     */
    Inst_VOP3__V_SUBREV_U16::Inst_VOP3__V_SUBREV_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_subrev_u16", false)
    {
        setFlag(ALU);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_SUBREV_U16::~Inst_VOP3__V_SUBREV_U16() = default;

    // D.u16 = S1.u16 - S0.u16.
    void
    Inst_VOP3__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = SRC1 - SRC0 on 16-bit unsigned operands,
        // computed only for lanes whose exec-mask bit is set.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 subtrahend(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 minuend(gpuDynInst, extData.SRC1);
        VecOperandU16 result(gpuDynInst, instData.VDST);

        subtrahend.readSrc();
        minuend.readSrc();

        // Integer operation: every ABS/NEG input-modifier bit must be
        // clear.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            result[lane] = minuend[lane] - subtrahend[lane];
        }

        result.write();
    }

    /**
     * Builds the VOP3 form of v_mul_lo_u16 and flags it as an ALU
     * instruction.
     */
    Inst_VOP3__V_MUL_LO_U16::Inst_VOP3__V_MUL_LO_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mul_lo_u16", false)
    {
        setFlag(ALU);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_MUL_LO_U16::~Inst_VOP3__V_MUL_LO_U16() = default;

    // D.u16 = S0.u16 * S1.u16.
    void
    Inst_VOP3__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = SRC0 * SRC1 on 16-bit unsigned operands,
        // computed only for lanes whose exec-mask bit is set.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 lhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 rhs(gpuDynInst, extData.SRC1);
        VecOperandU16 result(gpuDynInst, instData.VDST);

        lhs.readSrc();
        rhs.readSrc();

        // Integer operation: every ABS/NEG input-modifier bit must be
        // clear.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            result[lane] = lhs[lane] * rhs[lane];
        }

        result.write();
    }

    /**
     * Builds the VOP3 form of v_lshlrev_b16 and flags it as an ALU
     * instruction.
     */
    Inst_VOP3__V_LSHLREV_B16::Inst_VOP3__V_LSHLREV_B16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_lshlrev_b16", false)
    {
        setFlag(ALU);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_LSHLREV_B16::~Inst_VOP3__V_LSHLREV_B16() = default;

    // D.u[15:0] = S1.u[15:0] << S0.u[3:0].
    void
    Inst_VOP3__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = SRC1 << SRC0[3:0] on 16-bit operands, computed
        // only for lanes whose exec-mask bit is set.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 valueSrc(gpuDynInst, extData.SRC1);
        ConstVecOperandU16 shiftSrc(gpuDynInst, extData.SRC0);
        VecOperandU16 result(gpuDynInst, instData.VDST);

        shiftSrc.readSrc();
        valueSrc.readSrc();

        // Integer operation: every ABS/NEG input-modifier bit must be
        // clear.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            // Only the low four bits of SRC0 select the shift distance.
            const auto shiftAmount = bits(shiftSrc[lane], 3, 0);
            result[lane] = valueSrc[lane] << shiftAmount;
        }

        result.write();
    }

    /**
     * Builds the VOP3 form of v_lshrrev_b16 and flags it as an ALU
     * instruction.
     */
    Inst_VOP3__V_LSHRREV_B16::Inst_VOP3__V_LSHRREV_B16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_lshrrev_b16", false)
    {
        setFlag(ALU);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_LSHRREV_B16::~Inst_VOP3__V_LSHRREV_B16() = default;

    // D.u[15:0] = S1.u[15:0] >> S0.u[3:0].
    // The vacated bits are set to zero.
    void
    Inst_VOP3__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = SRC1 >> SRC0[3:0] on 16-bit unsigned operands,
        // computed only for lanes whose exec-mask bit is set.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only. This is
         * an integer shift, so -- like v_lshlrev_b16 and v_ashrrev_i16 --
         * the modifier bits are required to be clear rather than being
         * applied to the integer operands.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0);
            }
        }

        vdst.write();
    }

    /**
     * Builds the VOP3 form of v_ashrrev_i16 and flags it as an ALU
     * instruction.
     */
    Inst_VOP3__V_ASHRREV_I16::Inst_VOP3__V_ASHRREV_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_ashrrev_i16", false)
    {
        setFlag(ALU);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_ASHRREV_I16::~Inst_VOP3__V_ASHRREV_I16() = default;

    // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0].
    // The vacated bits are set to the sign bit of the input value.
    void
    Inst_VOP3__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = signed 16-bit SRC1 >> SRC0[3:0], computed only
        // for lanes whose exec-mask bit is set.
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU16 shiftSrc(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 valueSrc(gpuDynInst, extData.SRC1);
        VecOperandI16 result(gpuDynInst, instData.VDST);

        shiftSrc.readSrc();
        valueSrc.readSrc();

        // Integer operation: every ABS/NEG input-modifier bit must be
        // clear.
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            // Only the low four bits of SRC0 select the shift distance.
            const auto shiftAmount = bits(shiftSrc[lane], 3, 0);
            result[lane] = valueSrc[lane] >> shiftAmount;
        }

        result.write();
    }

    /**
     * Builds the VOP3 form of v_max_f16 and sets the ALU and F16 flags on
     * it.
     */
    Inst_VOP3__V_MAX_F16::Inst_VOP3__V_MAX_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_max_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_MAX_F16::~Inst_VOP3__V_MAX_F16() = default;

    /**
     * Intended semantics: D.f16 = max(S0.f16, S1.f16).
     * Not modelled; execution only calls panicUnimplemented().
     */
    void
    Inst_VOP3__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Builds the VOP3 form of v_min_f16 and sets the ALU and F16 flags on
     * it.
     */
    Inst_VOP3__V_MIN_F16::Inst_VOP3__V_MIN_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_min_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_MIN_F16::~Inst_VOP3__V_MIN_F16() = default;

    /**
     * Intended semantics: D.f16 = min(S0.f16, S1.f16).
     * Not modelled; execution only calls panicUnimplemented().
     */
    void
    Inst_VOP3__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Builds the VOP3 form of v_max_u16 and flags it as an ALU
     * instruction.
     */
    Inst_VOP3__V_MAX_U16::Inst_VOP3__V_MAX_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_max_u16", false)
    {
        setFlag(ALU);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_MAX_U16::~Inst_VOP3__V_MAX_U16() = default;

    // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]).
    void
    Inst_VOP3__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = max(SRC0, SRC1) on 16-bit unsigned operands,
        // computed only for lanes whose exec-mask bit is set.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only. This is
         * an integer operation, so -- like the other 16-bit integer ops
         * such as v_add_u16 -- the modifier bits are required to be clear
         * rather than being applied to the integer operands.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::max(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    }

    /**
     * Builds the VOP3 form of v_max_i16 and flags it as an ALU
     * instruction.
     */
    Inst_VOP3__V_MAX_I16::Inst_VOP3__V_MAX_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_max_i16", false)
    {
        setFlag(ALU);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_MAX_I16::~Inst_VOP3__V_MAX_I16() = default;

    // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]).
    void
    Inst_VOP3__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = max(SRC0, SRC1) on 16-bit signed operands,
        // computed only for lanes whose exec-mask bit is set.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
        VecOperandI16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only. This is
         * an integer operation, so -- like the other 16-bit integer ops
         * such as v_add_u16 -- the modifier bits are required to be clear
         * rather than being applied to the integer operands.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::max(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    }

    /**
     * Builds the VOP3 form of v_min_u16 and flags it as an ALU
     * instruction.
     */
    Inst_VOP3__V_MIN_U16::Inst_VOP3__V_MIN_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_min_u16", false)
    {
        setFlag(ALU);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_MIN_U16::~Inst_VOP3__V_MIN_U16() = default;

    // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]).
    void
    Inst_VOP3__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = min(SRC0, SRC1) on 16-bit unsigned operands,
        // computed only for lanes whose exec-mask bit is set.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only. This is
         * an integer operation, so -- like the other 16-bit integer ops
         * such as v_add_u16 -- the modifier bits are required to be clear
         * rather than being applied to the integer operands.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::min(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    }

    /**
     * Builds the VOP3 form of v_min_i16 and flags it as an ALU
     * instruction.
     */
    Inst_VOP3__V_MIN_I16::Inst_VOP3__V_MIN_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_min_i16", false)
    {
        setFlag(ALU);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_MIN_I16::~Inst_VOP3__V_MIN_I16() = default;

    // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]).
    void
    Inst_VOP3__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per lane: VDST = min(SRC0, SRC1) on 16-bit signed operands,
        // computed only for lanes whose exec-mask bit is set.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
        VecOperandI16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only. This is
         * an integer operation, so -- like the other 16-bit integer ops
         * such as v_add_u16 -- the modifier bits are required to be clear
         * rather than being applied to the integer operands.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::min(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    }

    /**
     * Builds the VOP3 form of v_ldexp_f16 and sets the ALU and F16 flags
     * on it.
     */
    Inst_VOP3__V_LDEXP_F16::Inst_VOP3__V_LDEXP_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_ldexp_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_LDEXP_F16::~Inst_VOP3__V_LDEXP_F16() = default;

    /**
     * Intended semantics: D.f16 = S0.f16 * (2 ** S1.i16).
     * Not modelled; execution only calls panicUnimplemented().
     */
    void
    Inst_VOP3__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Builds the VOP3 form of v_nop and sets the Nop and ALU flags on it.
     */
    Inst_VOP3__V_NOP::Inst_VOP3__V_NOP(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_nop", false)
    {
        setFlag(Nop);
        setFlag(ALU);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_NOP::~Inst_VOP3__V_NOP() = default;

    /**
     * Executing a v_nop does nothing: no operand is read or written.
     */
    void
    Inst_VOP3__V_NOP::execute(GPUDynInstPtr gpuDynInst)
    {
    }

    /**
     * Builds the VOP3 form of v_mov_b32 and flags it as an ALU
     * instruction.
     */
    Inst_VOP3__V_MOV_B32::Inst_VOP3__V_MOV_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mov_b32", false)
    {
        setFlag(ALU);
    }

    /** Destructor; performs no work of its own. */
    Inst_VOP3__V_MOV_B32::~Inst_VOP3__V_MOV_B32() = default;

    // D.u = S0.u.
    // Untyped 32-bit copy: no input or output modifiers are applied. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_MOV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 input(gpuDynInst, extData.SRC0);
        VecOperandU32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            result[idx] = input[idx];
        }

        result.write();
    }

    // Builds the VOP3 form of v_cvt_i32_f64: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F64 flags.
    Inst_VOP3__V_CVT_I32_F64::Inst_VOP3__V_CVT_I32_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_i32_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CVT_I32_F64

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CVT_I32_F64::~Inst_VOP3__V_CVT_I32_F64()
    {
    } // ~Inst_VOP3__V_CVT_I32_F64

    // D.i = (int)S0.d.
    // Out-of-range floating point values (including infinity) saturate. NaN
    // is converted to 0.
    //
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_CVT_I32_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        // 2^31 is the smallest magnitude whose truncation no longer fits in
        // a signed 32-bit integer on the positive side. The range is tested
        // by value rather than via the frexp() exponent: frexp() reports 31
        // for every magnitude in [2^30, 2^31), all of which are still
        // representable and must not saturate.
        const VecElemF64 limit = 2147483648.0;

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                const VecElemF64 val = src[lane];

                if (std::isnan(val)) {
                    vdst[lane] = 0;
                } else if (val >= limit) {
                    // covers +infinity
                    vdst[lane] = INT_MAX;
                } else if (val <= -limit) {
                    // covers -infinity; values in (-2^31 - 1, -2^31] would
                    // truncate to INT_MIN anyway
                    vdst[lane] = INT_MIN;
                } else {
                    vdst[lane] = (VecElemI32)val;
                }
            }
        }

        vdst.write();
    }

    // Builds the VOP3 form of v_cvt_f64_i32: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F64 flags.
    Inst_VOP3__V_CVT_F64_I32::Inst_VOP3__V_CVT_F64_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_f64_i32", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CVT_F64_I32

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CVT_F64_I32::~Inst_VOP3__V_CVT_F64_I32()
    {
    } // ~Inst_VOP3__V_CVT_F64_I32

    // D.d = (double)S0.i.
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_CVT_F64_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 input(gpuDynInst, extData.SRC0);
        VecOperandF64 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            result[idx] = static_cast<VecElemF64>(input[idx]);
        }

        result.write();
    }

    // Builds the VOP3 form of v_cvt_f32_i32: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_CVT_F32_I32::Inst_VOP3__V_CVT_F32_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_f32_i32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_F32_I32

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CVT_F32_I32::~Inst_VOP3__V_CVT_F32_I32()
    {
    } // ~Inst_VOP3__V_CVT_F32_I32

    // D.f = (float)S0.i.
    // The source is an integer, so no ABS/NEG input modifier may be set;
    // this is asserted below. Only lanes enabled in the wavefront's exec
    // mask are assigned.
    void
    Inst_VOP3__V_CVT_F32_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        // S0 is only read, so it uses the Const operand wrapper.
        // NOTE(review): the wrapper definitions are not visible from this
        // file; confirm ConstVecOperandI32 is the intended source type.
        ConstVecOperandI32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF32)src[lane];
            }
        }

        vdst.write();
    }

    // Builds the VOP3 form of v_cvt_f32_u32: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_CVT_F32_U32::Inst_VOP3__V_CVT_F32_U32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_f32_u32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_F32_U32

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CVT_F32_U32::~Inst_VOP3__V_CVT_F32_U32()
    {
    } // ~Inst_VOP3__V_CVT_F32_U32

    // D.f = (float)S0.u.
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_CVT_F32_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 input(gpuDynInst, extData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            result[idx] = static_cast<VecElemF32>(input[idx]);
        }

        result.write();
    }

    // Builds the VOP3 form of v_cvt_u32_f32: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_CVT_U32_F32::Inst_VOP3__V_CVT_U32_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_u32_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_U32_F32

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CVT_U32_F32::~Inst_VOP3__V_CVT_U32_F32()
    {
    } // ~Inst_VOP3__V_CVT_U32_F32

    // D.u = (unsigned)S0.f.
    // Out-of-range floating point values (including infinity) saturate. NaN
    // is converted to 0.
    //
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_CVT_U32_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        // 2^32 is the smallest value that no longer fits in an unsigned
        // 32-bit integer. The range is tested by value rather than via the
        // frexp() exponent, which reports 32 for every magnitude in
        // [2^31, 2^32) even though those values are representable.
        const VecElemF32 limit = 4294967296.0f;

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                const VecElemF32 val = src[lane];

                if (std::isnan(val) || val <= 0.0f) {
                    // NaN converts to 0; negative inputs (including
                    // -infinity) saturate at the lower bound. Handling them
                    // here also keeps out-of-range negatives away from the
                    // float-to-unsigned cast, which is undefined for them.
                    vdst[lane] = 0;
                } else if (val >= limit) {
                    // covers +infinity
                    vdst[lane] = UINT_MAX;
                } else {
                    vdst[lane] = (VecElemU32)val;
                }
            }
        }

        vdst.write();
    }

    // Builds the VOP3 form of v_cvt_i32_f32: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_CVT_I32_F32::Inst_VOP3__V_CVT_I32_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_i32_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_I32_F32

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CVT_I32_F32::~Inst_VOP3__V_CVT_I32_F32()
    {
    } // ~Inst_VOP3__V_CVT_I32_F32

    // D.i = (int)S0.f.
    // Out-of-range floating point values (including infinity) saturate. NaN
    // is converted to 0.
    //
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0; the
    // bits for the unused second and third sources are asserted clear. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_CVT_I32_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // 2^31 (exactly representable as a float) is the smallest magnitude
        // that no longer fits in a signed 32-bit integer on the positive
        // side. The range is tested by value rather than via the frexp()
        // exponent: frexp() reports 31 for every magnitude in [2^30, 2^31),
        // all of which are still representable and must not saturate.
        const VecElemF32 limit = 2147483648.0f;

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                const VecElemF32 val = src[lane];

                if (std::isnan(val)) {
                    vdst[lane] = 0;
                } else if (val >= limit) {
                    // covers +infinity
                    vdst[lane] = INT_MAX;
                } else if (val <= -limit) {
                    // covers -infinity; exactly -2^31 maps to INT_MIN too
                    vdst[lane] = INT_MIN;
                } else {
                    vdst[lane] = (VecElemI32)val;
                }
            }
        }

        vdst.write();
    }

    // Builds the VOP3 form of v_mov_fed_b32: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU flag.
    Inst_VOP3__V_MOV_FED_B32::Inst_VOP3__V_MOV_FED_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mov_fed_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MOV_FED_B32

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_MOV_FED_B32::~Inst_VOP3__V_MOV_FED_B32()
    {
    } // ~Inst_VOP3__V_MOV_FED_B32

    // D.u = S0.u;
    // Input and output modifiers not supported; this is an untyped operation.
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_VOP3__V_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the VOP3 form of v_cvt_f16_f32: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_CVT_F16_F32::Inst_VOP3__V_CVT_F16_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_f16_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_F16_F32

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CVT_F16_F32::~Inst_VOP3__V_CVT_F16_F32()
    {
    } // ~Inst_VOP3__V_CVT_F16_F32

    // D.f16 = flt32_to_flt16(S0.f).
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_VOP3__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the VOP3 form of v_cvt_f32_f16: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_CVT_F32_F16::Inst_VOP3__V_CVT_F32_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_f32_f16", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_F32_F16

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CVT_F32_F16::~Inst_VOP3__V_CVT_F32_F16()
    {
    } // ~Inst_VOP3__V_CVT_F32_F16

    // D.f = flt16_to_flt32(S0.f16).
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_VOP3__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the VOP3 form of v_cvt_rpi_i32_f32: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_CVT_RPI_I32_F32::Inst_VOP3__V_CVT_RPI_I32_F32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_rpi_i32_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_RPI_I32_F32

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CVT_RPI_I32_F32::~Inst_VOP3__V_CVT_RPI_I32_F32()
    {
    } // ~Inst_VOP3__V_CVT_RPI_I32_F32

    // D.i = (int)floor(S0.f + 0.5).
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned. No
    // saturation is applied before the integer cast.
    void
    Inst_VOP3__V_CVT_RPI_I32_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 input(gpuDynInst, extData.SRC0);
        VecOperandI32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            // the 0.5 literal is a double, so the sum and floor() are
            // evaluated in double precision
            const auto rounded = std::floor(input[idx] + 0.5);
            result[idx] = static_cast<VecElemI32>(rounded);
        }

        result.write();
    }

    // Builds the VOP3 form of v_cvt_flr_i32_f32: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_CVT_FLR_I32_F32::Inst_VOP3__V_CVT_FLR_I32_F32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_flr_i32_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_FLR_I32_F32

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CVT_FLR_I32_F32::~Inst_VOP3__V_CVT_FLR_I32_F32()
    {
    } // ~Inst_VOP3__V_CVT_FLR_I32_F32

    // D.i = (int)floor(S0.f).
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned. No
    // saturation is applied before the integer cast.
    void
    Inst_VOP3__V_CVT_FLR_I32_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 input(gpuDynInst, extData.SRC0);
        VecOperandI32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            result[idx] = static_cast<VecElemI32>(std::floor(input[idx]));
        }

        result.write();
    }

    // Builds the VOP3 form of v_cvt_off_f32_i4: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_CVT_OFF_F32_I4::Inst_VOP3__V_CVT_OFF_F32_I4(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_off_f32_i4", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_OFF_F32_I4

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CVT_OFF_F32_I4::~Inst_VOP3__V_CVT_OFF_F32_I4()
    {
    } // ~Inst_VOP3__V_CVT_OFF_F32_I4

    // 4-bit signed int to 32-bit float.
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_VOP3__V_CVT_OFF_F32_I4::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds the VOP3 form of v_cvt_f32_f64: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F64 flags.
    Inst_VOP3__V_CVT_F32_F64::Inst_VOP3__V_CVT_F32_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_f32_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CVT_F32_F64

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CVT_F32_F64::~Inst_VOP3__V_CVT_F32_F64()
    {
    } // ~Inst_VOP3__V_CVT_F32_F64

    // D.f = (float)S0.d.
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0; the
    // bits for the unused second and third sources are asserted clear. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_CVT_F32_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 input(gpuDynInst, extData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            result[idx] = static_cast<VecElemF32>(input[idx]);
        }

        result.write();
    }

    // Builds the VOP3 form of v_cvt_f64_f32: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F64 flags.
    Inst_VOP3__V_CVT_F64_F32::Inst_VOP3__V_CVT_F64_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_f64_f32", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CVT_F64_F32

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CVT_F64_F32::~Inst_VOP3__V_CVT_F64_F32()
    {
    } // ~Inst_VOP3__V_CVT_F64_F32

    // D.d = (double)S0.f.
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0; the
    // bits for the unused second and third sources are asserted clear. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_CVT_F64_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 input(gpuDynInst, extData.SRC0);
        VecOperandF64 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            result[idx] = static_cast<VecElemF64>(input[idx]);
        }

        result.write();
    }

    // Builds the VOP3 form of v_cvt_f32_ubyte0: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_CVT_F32_UBYTE0::Inst_VOP3__V_CVT_F32_UBYTE0(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_f32_ubyte0", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_F32_UBYTE0

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CVT_F32_UBYTE0::~Inst_VOP3__V_CVT_F32_UBYTE0()
    {
    } // ~Inst_VOP3__V_CVT_F32_UBYTE0

    // D.f = (float)(S0.u[7:0]).
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_CVT_F32_UBYTE0::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 input(gpuDynInst, extData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            // bit field [7:0] of the source dword
            result[idx] = static_cast<VecElemF32>(bits(input[idx], 7, 0));
        }

        result.write();
    }

    // Builds the VOP3 form of v_cvt_f32_ubyte1: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_CVT_F32_UBYTE1::Inst_VOP3__V_CVT_F32_UBYTE1(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_f32_ubyte1", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_F32_UBYTE1

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CVT_F32_UBYTE1::~Inst_VOP3__V_CVT_F32_UBYTE1()
    {
    } // ~Inst_VOP3__V_CVT_F32_UBYTE1

    // D.f = (float)(S0.u[15:8]).
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_CVT_F32_UBYTE1::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 input(gpuDynInst, extData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            // bit field [15:8] of the source dword
            result[idx] = static_cast<VecElemF32>(bits(input[idx], 15, 8));
        }

        result.write();
    }

    // Builds the VOP3 form of v_cvt_f32_ubyte2: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_CVT_F32_UBYTE2::Inst_VOP3__V_CVT_F32_UBYTE2(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_f32_ubyte2", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_F32_UBYTE2

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CVT_F32_UBYTE2::~Inst_VOP3__V_CVT_F32_UBYTE2()
    {
    } // ~Inst_VOP3__V_CVT_F32_UBYTE2

    // D.f = (float)(S0.u[23:16]).
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_CVT_F32_UBYTE2::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 input(gpuDynInst, extData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            // bit field [23:16] of the source dword
            result[idx] = static_cast<VecElemF32>(bits(input[idx], 23, 16));
        }

        result.write();
    }

    // Builds the VOP3 form of v_cvt_f32_ubyte3: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_CVT_F32_UBYTE3::Inst_VOP3__V_CVT_F32_UBYTE3(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_f32_ubyte3", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_F32_UBYTE3

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CVT_F32_UBYTE3::~Inst_VOP3__V_CVT_F32_UBYTE3()
    {
    } // ~Inst_VOP3__V_CVT_F32_UBYTE3

    // D.f = (float)(S0.u[31:24]).
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_CVT_F32_UBYTE3::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 input(gpuDynInst, extData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            // bit field [31:24] of the source dword
            result[idx] = static_cast<VecElemF32>(bits(input[idx], 31, 24));
        }

        result.write();
    }

    // Builds the VOP3 form of v_cvt_u32_f64: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F64 flags.
    Inst_VOP3__V_CVT_U32_F64::Inst_VOP3__V_CVT_U32_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_u32_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CVT_U32_F64

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CVT_U32_F64::~Inst_VOP3__V_CVT_U32_F64()
    {
    } // ~Inst_VOP3__V_CVT_U32_F64

    // D.u = (unsigned)S0.d.
    // Out-of-range floating point values (including infinity) saturate. NaN
    // is converted to 0.
    //
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_CVT_U32_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        // 2^32 is the smallest value whose truncation no longer fits in an
        // unsigned 32-bit integer. The range is tested by value rather than
        // via the frexp() exponent, which reports 32 for every magnitude in
        // [2^31, 2^32) even though those values are representable.
        const VecElemF64 limit = 4294967296.0;

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                const VecElemF64 val = src[lane];

                if (std::isnan(val) || val <= 0.0) {
                    // NaN converts to 0; negative inputs (including
                    // -infinity) saturate at the lower bound. Handling them
                    // here also keeps out-of-range negatives away from the
                    // float-to-unsigned cast, which is undefined for them.
                    vdst[lane] = 0;
                } else if (val >= limit) {
                    // covers +infinity
                    vdst[lane] = UINT_MAX;
                } else {
                    vdst[lane] = (VecElemU32)val;
                }
            }
        }

        vdst.write();
    }

    // Builds the VOP3 form of v_cvt_f64_u32: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F64 flags.
    Inst_VOP3__V_CVT_F64_U32::Inst_VOP3__V_CVT_F64_U32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_f64_u32", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CVT_F64_U32

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CVT_F64_U32::~Inst_VOP3__V_CVT_F64_U32()
    {
    } // ~Inst_VOP3__V_CVT_F64_U32

    // D.d = (double)S0.u.
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_CVT_F64_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 input(gpuDynInst, extData.SRC0);
        VecOperandF64 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            result[idx] = static_cast<VecElemF64>(input[idx]);
        }

        result.write();
    }

    // Builds the VOP3 form of v_trunc_f64: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F64 flags.
    Inst_VOP3__V_TRUNC_F64::Inst_VOP3__V_TRUNC_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_trunc_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_TRUNC_F64

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_TRUNC_F64::~Inst_VOP3__V_TRUNC_F64()
    {
    } // ~Inst_VOP3__V_TRUNC_F64

    // D.d = trunc(S0.d): keep only the integer part of S0.d.
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_TRUNC_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 input(gpuDynInst, extData.SRC0);
        VecOperandF64 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            result[idx] = std::trunc(input[idx]);
        }

        result.write();
    }

    // Builds the VOP3 form of v_ceil_f64: forwards the format and mnemonic
    // to Inst_VOP3 and sets the ALU and F64 flags.
    Inst_VOP3__V_CEIL_F64::Inst_VOP3__V_CEIL_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_ceil_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CEIL_F64

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CEIL_F64::~Inst_VOP3__V_CEIL_F64()
    {
    } // ~Inst_VOP3__V_CEIL_F64

    // D.d = ceil(S0.d);
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_CEIL_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 input(gpuDynInst, extData.SRC0);
        VecOperandF64 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            result[idx] = std::ceil(input[idx]);
        }

        result.write();
    }

    // Builds the VOP3 form of v_rndne_f64: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F64 flags.
    Inst_VOP3__V_RNDNE_F64::Inst_VOP3__V_RNDNE_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_rndne_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_RNDNE_F64

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_RNDNE_F64::~Inst_VOP3__V_RNDNE_F64()
    {
    } // ~Inst_VOP3__V_RNDNE_F64

    // D.d = round_nearest_even(S0.d).
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned; the rounding
    // itself is delegated to roundNearestEven().
    void
    Inst_VOP3__V_RNDNE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 input(gpuDynInst, extData.SRC0);
        VecOperandF64 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            result[idx] = roundNearestEven(input[idx]);
        }

        result.write();
    }

    // Builds the VOP3 form of v_floor_f64: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F64 flags.
    Inst_VOP3__V_FLOOR_F64::Inst_VOP3__V_FLOOR_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_floor_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_FLOOR_F64

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_FLOOR_F64::~Inst_VOP3__V_FLOOR_F64()
    {
    } // ~Inst_VOP3__V_FLOOR_F64

    // D.d = floor(S0.d);
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_FLOOR_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 input(gpuDynInst, extData.SRC0);
        VecOperandF64 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            result[idx] = std::floor(input[idx]);
        }

        result.write();
    }

    // Builds the VOP3 form of v_fract_f32: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_FRACT_F32::Inst_VOP3__V_FRACT_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_fract_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_FRACT_F32

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_FRACT_F32::~Inst_VOP3__V_FRACT_F32()
    {
    } // ~Inst_VOP3__V_FRACT_F32

    // D.f = modf(S0.f).
    // The destination receives the fractional part returned by std::modf();
    // the integral part is discarded. Bit 0 of ABS / NEG requests the
    // abs / neg input modifier on S0. Only lanes enabled in the wavefront's
    // exec mask are assigned.
    void
    Inst_VOP3__V_FRACT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 input(gpuDynInst, extData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            // required by modf() as an out-parameter; never read afterwards
            VecElemF32 wholePart = 0.0;
            result[idx] = std::modf(input[idx], &wholePart);
        }

        result.write();
    }

    // Builds the VOP3 form of v_trunc_f32: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_TRUNC_F32::Inst_VOP3__V_TRUNC_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_trunc_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_TRUNC_F32

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_TRUNC_F32::~Inst_VOP3__V_TRUNC_F32()
    {
    } // ~Inst_VOP3__V_TRUNC_F32

    // D.f = trunc(S0.f): keep only the integer part of S0.f.
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_TRUNC_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 input(gpuDynInst, extData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            result[idx] = std::trunc(input[idx]);
        }

        result.write();
    }

    // Builds the VOP3 form of v_ceil_f32: forwards the format and mnemonic
    // to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_CEIL_F32::Inst_VOP3__V_CEIL_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_ceil_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CEIL_F32

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_CEIL_F32::~Inst_VOP3__V_CEIL_F32()
    {
    } // ~Inst_VOP3__V_CEIL_F32

    // D.f = ceil(S0.f);
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_CEIL_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 input(gpuDynInst, extData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            result[idx] = std::ceil(input[idx]);
        }

        result.write();
    }

    // Builds the VOP3 form of v_rndne_f32: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_RNDNE_F32::Inst_VOP3__V_RNDNE_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_rndne_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_RNDNE_F32

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_RNDNE_F32::~Inst_VOP3__V_RNDNE_F32()
    {
    } // ~Inst_VOP3__V_RNDNE_F32

    // D.f = round_nearest_even(S0.f).
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned; the rounding
    // itself is delegated to roundNearestEven().
    void
    Inst_VOP3__V_RNDNE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 input(gpuDynInst, extData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            result[idx] = roundNearestEven(input[idx]);
        }

        result.write();
    }

    // Builds the VOP3 form of v_floor_f32: forwards the format and
    // mnemonic to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_FLOOR_F32::Inst_VOP3__V_FLOOR_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_floor_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_FLOOR_F32

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_FLOOR_F32::~Inst_VOP3__V_FLOOR_F32()
    {
    } // ~Inst_VOP3__V_FLOOR_F32

    // D.f = floor(S0.f);
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_FLOOR_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 input(gpuDynInst, extData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            result[idx] = std::floor(input[idx]);
        }

        result.write();
    }

    // Builds the VOP3 form of v_exp_f32: forwards the format and mnemonic
    // to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_EXP_F32::Inst_VOP3__V_EXP_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_exp_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_EXP_F32

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_EXP_F32::~Inst_VOP3__V_EXP_F32()
    {
    } // ~Inst_VOP3__V_EXP_F32

    // D.f = pow(2.0, S0.f).
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_EXP_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 input(gpuDynInst, extData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            // the 2.0 base is a double literal, so pow() is evaluated in
            // double precision before the result is stored
            result[idx] = std::pow(2.0, input[idx]);
        }

        result.write();
    }

    // Builds the VOP3 form of v_log_f32: forwards the format and mnemonic
    // to Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_LOG_F32::Inst_VOP3__V_LOG_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_log_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_LOG_F32

    // Empty destructor: this class performs no cleanup of its own.
    Inst_VOP3__V_LOG_F32::~Inst_VOP3__V_LOG_F32()
    {
    } // ~Inst_VOP3__V_LOG_F32

    // D.f = log2(S0.f).
    // Bit 0 of ABS / NEG requests the abs / neg input modifier on S0; the
    // bits for the unused second and third sources are asserted clear. Only
    // lanes enabled in the wavefront's exec mask are assigned.
    void
    Inst_VOP3__V_LOG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 input(gpuDynInst, extData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        input.readSrc();

        if ((instData.ABS & 0x1) != 0) {
            input.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            input.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int idx = 0; idx != NumVecElemPerVecReg; ++idx) {
            if (!wave->execMask(idx)) {
                continue;
            }

            result[idx] = std::log2(input[idx]);
        }

        result.write();
    }

    // VOP3 v_rcp_f32: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F32 flags.
    Inst_VOP3__V_RCP_F32::Inst_VOP3__V_RCP_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_rcp_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_RCP_F32

    // Destructor with an empty body.
    Inst_VOP3__V_RCP_F32::~Inst_VOP3__V_RCP_F32()
    {
    } // ~Inst_VOP3__V_RCP_F32

    // D.f = 1.0 / S0.f.
    void
    Inst_VOP3__V_RCP_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Reciprocal of SRC0, evaluated lane by lane.
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF32 operand(gpuDynInst, extData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        operand.readSrc();

        // Bit 0 of ABS/NEG selects the modifiers for SRC0; the absolute
        // value is applied before the negation.
        if ((instData.ABS & 0x1) != 0) {
            operand.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            operand.negModifier();
        }

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // Lanes whose exec-mask bit is clear are not assigned.
            if (!wave->execMask(laneId)) {
                continue;
            }

            // The 1.0 literal makes this a double-precision division.
            result[laneId] = 1.0 / operand[laneId];
        }

        result.write();
    }

    // VOP3 v_rcp_iflag_f32: forwards iFmt and the mnemonic to Inst_VOP3
    // and sets the ALU and F32 flags.
    Inst_VOP3__V_RCP_IFLAG_F32::Inst_VOP3__V_RCP_IFLAG_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_rcp_iflag_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_RCP_IFLAG_F32

    // Destructor with an empty body.
    Inst_VOP3__V_RCP_IFLAG_F32::~Inst_VOP3__V_RCP_IFLAG_F32()
    {
    } // ~Inst_VOP3__V_RCP_IFLAG_F32

    // D.f = 1.0 / S0.f.
    void
    Inst_VOP3__V_RCP_IFLAG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Reciprocal of SRC0, evaluated lane by lane; the arithmetic is
        // the same expression V_RCP_F32 uses.
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF32 operand(gpuDynInst, extData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        operand.readSrc();

        // Bit 0 of ABS/NEG selects the modifiers for SRC0; the absolute
        // value is applied before the negation.
        if ((instData.ABS & 0x1) != 0) {
            operand.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            operand.negModifier();
        }

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // Lanes whose exec-mask bit is clear are not assigned.
            if (!wave->execMask(laneId)) {
                continue;
            }

            result[laneId] = 1.0 / operand[laneId];
        }

        result.write();
    }

    // VOP3 v_rsq_f32: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F32 flags.
    Inst_VOP3__V_RSQ_F32::Inst_VOP3__V_RSQ_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_rsq_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_RSQ_F32

    // Destructor with an empty body.
    Inst_VOP3__V_RSQ_F32::~Inst_VOP3__V_RSQ_F32()
    {
    } // ~Inst_VOP3__V_RSQ_F32

    // D.f = 1.0 / sqrt(S0.f).
    void
    Inst_VOP3__V_RSQ_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Reciprocal square root of SRC0, evaluated lane by lane.
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF32 operand(gpuDynInst, extData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        operand.readSrc();

        // Bit 0 of ABS/NEG selects the modifiers for SRC0; the absolute
        // value is applied before the negation.
        if ((instData.ABS & 0x1) != 0) {
            operand.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            operand.negModifier();
        }

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // Lanes whose exec-mask bit is clear are not assigned.
            if (!wave->execMask(laneId)) {
                continue;
            }

            result[laneId] = 1.0 / std::sqrt(operand[laneId]);
        }

        result.write();
    }

    // VOP3 v_rcp_f64: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F64 flags.
    Inst_VOP3__V_RCP_F64::Inst_VOP3__V_RCP_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_rcp_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_RCP_F64

    // Destructor with an empty body.
    Inst_VOP3__V_RCP_F64::~Inst_VOP3__V_RCP_F64()
    {
    } // ~Inst_VOP3__V_RCP_F64

    // D.d = 1.0 / S0.d.
    void
    Inst_VOP3__V_RCP_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Double-precision reciprocal of SRC0, evaluated lane by lane,
        // with the special inputs (zero, NaN, infinity) spelled out.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // Bit 0 of ABS/NEG selects the modifiers for SRC0; the absolute
        // value is applied before the negation.
        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::fpclassify(src[lane]) == FP_ZERO) {
                    // 1 / (+-0) is an infinity that carries the sign of
                    // the zero (IEEE-754 division), so -0.0 must give
                    // -INF rather than +INF.
                    vdst[lane] = std::copysign(INFINITY, src[lane]);
                } else if (std::isnan(src[lane])) {
                    vdst[lane] = NAN;
                } else if (std::isinf(src[lane])) {
                    // 1 / (+-INF) is a zero with the same sign.
                    if (std::signbit(src[lane])) {
                        vdst[lane] = -0.0;
                    } else {
                        vdst[lane] = 0.0;
                    }
                } else {
                    vdst[lane] = 1.0 / src[lane];
                }
            }
        }

        vdst.write();
    }

    // VOP3 v_rsq_f64: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F64 flags.
    Inst_VOP3__V_RSQ_F64::Inst_VOP3__V_RSQ_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_rsq_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_RSQ_F64

    // Destructor with an empty body.
    Inst_VOP3__V_RSQ_F64::~Inst_VOP3__V_RSQ_F64()
    {
    } // ~Inst_VOP3__V_RSQ_F64

    // D.d = 1.0 / sqrt(S0.d).
    void
    Inst_VOP3__V_RSQ_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Double-precision reciprocal square root of SRC0, evaluated
        // lane by lane, with the special inputs spelled out.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // Bit 0 of ABS/NEG selects the modifiers for SRC0; the absolute
        // value is applied before the negation.
        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::fpclassify(src[lane]) == FP_ZERO) {
                    // sqrt(+-0) is +-0 and 1 / (+-0) is +-INF, so the
                    // infinity carries the sign of the zero; -0.0 must
                    // give -INF rather than +INF.
                    vdst[lane] = std::copysign(INFINITY, src[lane]);
                } else if (std::isnan(src[lane])) {
                    vdst[lane] = NAN;
                } else if (std::isinf(src[lane]) && !std::signbit(src[lane])) {
                    vdst[lane] = 0.0;
                } else if (std::signbit(src[lane])) {
                    // Negative non-zero inputs, including -INF.
                    vdst[lane] = NAN;
                } else {
                    vdst[lane] = 1.0 / std::sqrt(src[lane]);
                }
            }
        }

        vdst.write();
    }

    // VOP3 v_sqrt_f32: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F32 flags.
    Inst_VOP3__V_SQRT_F32::Inst_VOP3__V_SQRT_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_sqrt_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_SQRT_F32

    // Destructor with an empty body.
    Inst_VOP3__V_SQRT_F32::~Inst_VOP3__V_SQRT_F32()
    {
    } // ~Inst_VOP3__V_SQRT_F32

    // D.f = sqrt(S0.f).
    void
    Inst_VOP3__V_SQRT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Square root of SRC0, evaluated lane by lane.
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF32 operand(gpuDynInst, extData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        operand.readSrc();

        // Bit 0 of ABS/NEG selects the modifiers for SRC0; the absolute
        // value is applied before the negation.
        if ((instData.ABS & 0x1) != 0) {
            operand.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            operand.negModifier();
        }

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // Lanes whose exec-mask bit is clear are not assigned.
            if (!wave->execMask(laneId)) {
                continue;
            }

            result[laneId] = std::sqrt(operand[laneId]);
        }

        result.write();
    }

    // VOP3 v_sqrt_f64: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F64 flags.
    Inst_VOP3__V_SQRT_F64::Inst_VOP3__V_SQRT_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_sqrt_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_SQRT_F64

    // Destructor with an empty body.
    Inst_VOP3__V_SQRT_F64::~Inst_VOP3__V_SQRT_F64()
    {
    } // ~Inst_VOP3__V_SQRT_F64

    // D.d = sqrt(S0.d).
    void
    Inst_VOP3__V_SQRT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Double-precision square root of SRC0, evaluated lane by lane.
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF64 operand(gpuDynInst, extData.SRC0);
        VecOperandF64 result(gpuDynInst, instData.VDST);

        operand.readSrc();

        // Bit 0 of ABS/NEG selects the modifiers for SRC0; the absolute
        // value is applied before the negation.
        if ((instData.ABS & 0x1) != 0) {
            operand.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            operand.negModifier();
        }

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // Lanes whose exec-mask bit is clear are not assigned.
            if (!wave->execMask(laneId)) {
                continue;
            }

            result[laneId] = std::sqrt(operand[laneId]);
        }

        result.write();
    }

    // VOP3 v_sin_f32: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F32 flags.
    Inst_VOP3__V_SIN_F32::Inst_VOP3__V_SIN_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_sin_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_SIN_F32

    // Destructor with an empty body.
    Inst_VOP3__V_SIN_F32::~Inst_VOP3__V_SIN_F32()
    {
    } // ~Inst_VOP3__V_SIN_F32

    // D.f = sin(S0.f * 2 * PI).
    void
    Inst_VOP3__V_SIN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Sine of SRC0 scaled by 2 and the scalar constant read from
        // REG_PI, evaluated lane by lane.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        ConstScalarOperandF32 pi(gpuDynInst, REG_PI);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();
        pi.read();

        // Bit 0 of ABS/NEG selects the modifiers for SRC0; the absolute
        // value is applied before the negation.
        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // The ISA restricts the (already normalised) input to
                // [-256, +256]; anything outside that range produces
                // 0.0. NaN fails both comparisons and falls through
                // to std::sin.
                if (src[lane] < -256.0 || src[lane] > 256.0) {
                    vdst[lane] = 0.0;
                } else {
                    // NOTE(review): assumes pi.rawData() yields PI for
                    // REG_PI -- confirm against the operand model.
                    vdst[lane] = std::sin(src[lane] * 2 * pi.rawData());
                }
            }
        }

        vdst.write();
    }

    // VOP3 v_cos_f32: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F32 flags.
    Inst_VOP3__V_COS_F32::Inst_VOP3__V_COS_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cos_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_COS_F32

    // Destructor with an empty body.
    Inst_VOP3__V_COS_F32::~Inst_VOP3__V_COS_F32()
    {
    } // ~Inst_VOP3__V_COS_F32

    // D.f = cos(S0.f * 2 * PI).
    void
    Inst_VOP3__V_COS_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Cosine of SRC0 scaled by 2 and the scalar constant read from
        // REG_PI, evaluated lane by lane.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        ConstScalarOperandF32 pi(gpuDynInst, REG_PI);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();
        pi.read();

        // Bit 0 of ABS/NEG selects the modifiers for SRC0; the absolute
        // value is applied before the negation.
        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // The ISA restricts the (already normalised) input to
                // [-256, +256]; anything outside that range produces
                // 1.0. NaN fails both comparisons and falls through
                // to std::cos.
                if (src[lane] < -256.0 || src[lane] > 256.0) {
                    vdst[lane] = 1.0;
                } else {
                    // NOTE(review): assumes pi.rawData() yields PI for
                    // REG_PI -- confirm against the operand model.
                    vdst[lane] = std::cos(src[lane] * 2 * pi.rawData());
                }
            }
        }

        vdst.write();
    }

    // VOP3 v_not_b32: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU flag.
    Inst_VOP3__V_NOT_B32::Inst_VOP3__V_NOT_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_not_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_NOT_B32

    // Destructor with an empty body.
    Inst_VOP3__V_NOT_B32::~Inst_VOP3__V_NOT_B32()
    {
    } // ~Inst_VOP3__V_NOT_B32

    // D.u = ~S0.u.
    // Input and output modifiers not supported.
    void
    Inst_VOP3__V_NOT_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Bitwise complement of SRC0, evaluated lane by lane; the
        // ABS/NEG fields are not consulted.
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandU32 operand(gpuDynInst, extData.SRC0);
        VecOperandU32 result(gpuDynInst, instData.VDST);

        operand.readSrc();

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // Lanes whose exec-mask bit is clear are not assigned.
            if (!wave->execMask(laneId)) {
                continue;
            }

            result[laneId] = ~operand[laneId];
        }

        result.write();
    }

    // VOP3 v_bfrev_b32: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU flag.
    Inst_VOP3__V_BFREV_B32::Inst_VOP3__V_BFREV_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_bfrev_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_BFREV_B32

    // Destructor with an empty body.
    Inst_VOP3__V_BFREV_B32::~Inst_VOP3__V_BFREV_B32()
    {
    } // ~Inst_VOP3__V_BFREV_B32

    // D.u[31:0] = S0.u[0:31], bitfield reverse.
    // Input and output modifiers not supported.
    void
    Inst_VOP3__V_BFREV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Passes each active lane of SRC0 through reverseBits(); the
        // ABS/NEG fields are not consulted.
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandU32 operand(gpuDynInst, extData.SRC0);
        VecOperandU32 result(gpuDynInst, instData.VDST);

        operand.readSrc();

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // Lanes whose exec-mask bit is clear are not assigned.
            if (!wave->execMask(laneId)) {
                continue;
            }

            result[laneId] = reverseBits(operand[laneId]);
        }

        result.write();
    }

    // VOP3 v_ffbh_u32: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU flag.
    Inst_VOP3__V_FFBH_U32::Inst_VOP3__V_FFBH_U32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_ffbh_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_FFBH_U32

    // Destructor with an empty body.
    Inst_VOP3__V_FFBH_U32::~Inst_VOP3__V_FFBH_U32()
    {
    } // ~Inst_VOP3__V_FFBH_U32

    // D.u = position of first 1 in S0.u from MSB;
    // D.u = 0xffffffff if S0.u == 0.
    void
    Inst_VOP3__V_FFBH_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Passes each active lane of SRC0 through findFirstOneMsb().
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandU32 operand(gpuDynInst, extData.SRC0);
        VecOperandU32 result(gpuDynInst, instData.VDST);

        operand.readSrc();

        // NOTE(review): abs/neg modifiers are applied to an integer
        // operand here, while V_NOT_B32 and V_BFREV_B32 in this file
        // state that input modifiers are not supported -- confirm this
        // is intended.
        if ((instData.ABS & 0x1) != 0) {
            operand.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            operand.negModifier();
        }

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // Lanes whose exec-mask bit is clear are not assigned.
            if (!wave->execMask(laneId)) {
                continue;
            }

            result[laneId] = findFirstOneMsb(operand[laneId]);
        }

        result.write();
    }

    // VOP3 v_ffbl_b32: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU flag.
    Inst_VOP3__V_FFBL_B32::Inst_VOP3__V_FFBL_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_ffbl_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_FFBL_B32

    // Destructor with an empty body.
    Inst_VOP3__V_FFBL_B32::~Inst_VOP3__V_FFBL_B32()
    {
    } // ~Inst_VOP3__V_FFBL_B32

    // D.u = position of first 1 in S0.u from LSB;
    // D.u = 0xffffffff if S0.u == 0.
    void
    Inst_VOP3__V_FFBL_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Passes each active lane of SRC0 through findFirstOne().
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandU32 operand(gpuDynInst, extData.SRC0);
        VecOperandU32 result(gpuDynInst, instData.VDST);

        operand.readSrc();

        // NOTE(review): abs/neg modifiers are applied to an integer
        // operand here, while V_NOT_B32 and V_BFREV_B32 in this file
        // state that input modifiers are not supported -- confirm this
        // is intended.
        if ((instData.ABS & 0x1) != 0) {
            operand.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            operand.negModifier();
        }

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // Lanes whose exec-mask bit is clear are not assigned.
            if (!wave->execMask(laneId)) {
                continue;
            }

            result[laneId] = findFirstOne(operand[laneId]);
        }

        result.write();
    }

    // VOP3 v_ffbh_i32: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU flag.
    Inst_VOP3__V_FFBH_I32::Inst_VOP3__V_FFBH_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_ffbh_i32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_FFBH_I32

    // Destructor with an empty body.
    Inst_VOP3__V_FFBH_I32::~Inst_VOP3__V_FFBH_I32()
    {
    } // ~Inst_VOP3__V_FFBH_I32

    // D.u = position of first bit different from sign bit in S0.i from MSB;
    // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff.
    void
    Inst_VOP3__V_FFBH_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Passes each active lane of the signed SRC0 through
        // firstOppositeSignBit(); the destination is unsigned.
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandI32 operand(gpuDynInst, extData.SRC0);
        VecOperandU32 result(gpuDynInst, instData.VDST);

        operand.readSrc();

        // NOTE(review): abs/neg modifiers are applied to an integer
        // operand here, while V_NOT_B32 and V_BFREV_B32 in this file
        // state that input modifiers are not supported -- confirm this
        // is intended.
        if ((instData.ABS & 0x1) != 0) {
            operand.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            operand.negModifier();
        }

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // Lanes whose exec-mask bit is clear are not assigned.
            if (!wave->execMask(laneId)) {
                continue;
            }

            result[laneId] = firstOppositeSignBit(operand[laneId]);
        }

        result.write();
    }

    // VOP3 v_frexp_exp_i32_f64: forwards iFmt and the mnemonic to
    // Inst_VOP3 and sets the ALU and F64 flags.
    Inst_VOP3__V_FREXP_EXP_I32_F64::Inst_VOP3__V_FREXP_EXP_I32_F64(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_frexp_exp_i32_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_FREXP_EXP_I32_F64

    // Destructor with an empty body.
    Inst_VOP3__V_FREXP_EXP_I32_F64::~Inst_VOP3__V_FREXP_EXP_I32_F64()
    {
    } // ~Inst_VOP3__V_FREXP_EXP_I32_F64

    // See V_FREXP_EXP_I32_F32.
    void
    Inst_VOP3__V_FREXP_EXP_I32_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Exponent reported by std::frexp for each active lane of the
        // double-precision SRC0; infinities and NaNs produce 0.
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF64 operand(gpuDynInst, extData.SRC0);
        VecOperandI32 result(gpuDynInst, instData.VDST);

        operand.readSrc();

        // Bit 0 of ABS/NEG selects the modifiers for SRC0; the absolute
        // value is applied before the negation.
        if ((instData.ABS & 0x1) != 0) {
            operand.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            operand.negModifier();
        }

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // Lanes whose exec-mask bit is clear are not assigned.
            if (!wave->execMask(laneId)) {
                continue;
            }

            // Stays 0 unless the input is finite.
            VecElemI32 exponent(0);

            if (!std::isinf(operand[laneId])
                && !std::isnan(operand[laneId])) {
                std::frexp(operand[laneId], &exponent);
            }

            result[laneId] = exponent;
        }

        result.write();
    }

    // VOP3 v_frexp_mant_f64: forwards iFmt and the mnemonic to
    // Inst_VOP3 and sets the ALU and F64 flags.
    Inst_VOP3__V_FREXP_MANT_F64::Inst_VOP3__V_FREXP_MANT_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_frexp_mant_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_FREXP_MANT_F64

    // Destructor with an empty body.
    Inst_VOP3__V_FREXP_MANT_F64::~Inst_VOP3__V_FREXP_MANT_F64()
    {
    } // ~Inst_VOP3__V_FREXP_MANT_F64

    void
    Inst_VOP3__V_FREXP_MANT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Significand returned by std::frexp for each active lane of
        // the double-precision SRC0.
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF64 operand(gpuDynInst, extData.SRC0);
        VecOperandF64 result(gpuDynInst, instData.VDST);

        operand.readSrc();

        // Bit 0 of ABS/NEG selects the modifiers for SRC0; the absolute
        // value is applied before the negation.
        if ((instData.ABS & 0x1) != 0) {
            operand.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            operand.negModifier();
        }

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // Lanes whose exec-mask bit is clear are not assigned.
            if (!wave->execMask(laneId)) {
                continue;
            }

            // The exponent is required by std::frexp but discarded.
            VecElemI32 unusedExponent(0);
            result[laneId] = std::frexp(operand[laneId], &unusedExponent);
        }

        result.write();
    }

    // VOP3 v_fract_f64: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F64 flags.
    Inst_VOP3__V_FRACT_F64::Inst_VOP3__V_FRACT_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_fract_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_FRACT_F64

    // Destructor with an empty body.
    Inst_VOP3__V_FRACT_F64::~Inst_VOP3__V_FRACT_F64()
    {
    } // ~Inst_VOP3__V_FRACT_F64

    void
    Inst_VOP3__V_FRACT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Fractional part, as returned by std::modf, of each active
        // lane of the double-precision SRC0.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // Bit 0 of ABS/NEG selects the modifiers for SRC0; the absolute
        // value is applied before the negation.
        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // The integral part must be a 64-bit float: with a
                // 32-bit one std::modf resolves to its float overload
                // and the source is narrowed to single precision
                // before the split.
                VecElemF64 int_part(0.0);
                vdst[lane] = std::modf(src[lane], &int_part);
            }
        }

        vdst.write();
    }

    // VOP3 v_frexp_exp_i32_f32: forwards iFmt and the mnemonic to
    // Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_FREXP_EXP_I32_F32::Inst_VOP3__V_FREXP_EXP_I32_F32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_frexp_exp_i32_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_FREXP_EXP_I32_F32

    // Destructor with an empty body.
    Inst_VOP3__V_FREXP_EXP_I32_F32::~Inst_VOP3__V_FREXP_EXP_I32_F32()
    {
    } // ~Inst_VOP3__V_FREXP_EXP_I32_F32

    // frexp(S0.f, Exponent(S0.f))
    // if (S0.f == INF || S0.f == NAN) then D.i = 0;
    // else D.i = Exponent(S0.f)
    void
    Inst_VOP3__V_FREXP_EXP_I32_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Exponent reported by std::frexp for each active lane of
        // SRC0; infinities and NaNs produce 0.
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF32 operand(gpuDynInst, extData.SRC0);
        VecOperandI32 result(gpuDynInst, instData.VDST);

        operand.readSrc();

        // Bit 0 of ABS/NEG selects the modifiers for SRC0; the absolute
        // value is applied before the negation.
        if ((instData.ABS & 0x1) != 0) {
            operand.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            operand.negModifier();
        }

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // Lanes whose exec-mask bit is clear are not assigned.
            if (!wave->execMask(laneId)) {
                continue;
            }

            // Stays 0 unless the input is finite.
            VecElemI32 exponent(0);

            if (!std::isinf(operand[laneId])
                && !std::isnan(operand[laneId])) {
                std::frexp(operand[laneId], &exponent);
            }

            result[laneId] = exponent;
        }

        result.write();
    }

    // VOP3 v_frexp_mant_f32: forwards iFmt and the mnemonic to
    // Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_FREXP_MANT_F32::Inst_VOP3__V_FREXP_MANT_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_frexp_mant_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_FREXP_MANT_F32

    // Destructor with an empty body.
    Inst_VOP3__V_FREXP_MANT_F32::~Inst_VOP3__V_FREXP_MANT_F32()
    {
    } // ~Inst_VOP3__V_FREXP_MANT_F32

    // if (S0.f == INF || S0.f == NAN) then D.f = S0.f;
    // else D.f = Mantissa(S0.f).
    void
    Inst_VOP3__V_FREXP_MANT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Significand returned by std::frexp for each active lane of
        // SRC0; infinities and NaNs are copied through unchanged.
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF32 operand(gpuDynInst, extData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        operand.readSrc();

        // Bit 0 of ABS/NEG selects the modifiers for SRC0; the absolute
        // value is applied before the negation.
        if ((instData.ABS & 0x1) != 0) {
            operand.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            operand.negModifier();
        }

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // Lanes whose exec-mask bit is clear are not assigned.
            if (!wave->execMask(laneId)) {
                continue;
            }

            if (!std::isinf(operand[laneId])
                && !std::isnan(operand[laneId])) {
                // The exponent is required by std::frexp but discarded.
                VecElemI32 unusedExponent(0);
                result[laneId] =
                    std::frexp(operand[laneId], &unusedExponent);
            } else {
                result[laneId] = operand[laneId];
            }
        }

        result.write();
    }

    // VOP3 v_clrexcp: forwards iFmt and the mnemonic to Inst_VOP3; no
    // flags are set here.
    Inst_VOP3__V_CLREXCP::Inst_VOP3__V_CLREXCP(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_clrexcp", false)
    {
    } // Inst_VOP3__V_CLREXCP

    // Destructor with an empty body.
    Inst_VOP3__V_CLREXCP::~Inst_VOP3__V_CLREXCP()
    {
    } // ~Inst_VOP3__V_CLREXCP

    void
    Inst_VOP3__V_CLREXCP::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modelled: the only action is panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 v_cvt_f16_u16: forwards iFmt and the mnemonic to Inst_VOP3
    // and sets the ALU and F16 flags.
    Inst_VOP3__V_CVT_F16_U16::Inst_VOP3__V_CVT_F16_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_f16_u16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CVT_F16_U16

    // Destructor with an empty body.
    Inst_VOP3__V_CVT_F16_U16::~Inst_VOP3__V_CVT_F16_U16()
    {
    } // ~Inst_VOP3__V_CVT_F16_U16

    // D.f16 = uint16_to_flt16(S.u16).
    void
    Inst_VOP3__V_CVT_F16_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modelled: the only action is panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 v_cvt_f16_i16: forwards iFmt and the mnemonic to Inst_VOP3
    // and sets the ALU and F16 flags.
    Inst_VOP3__V_CVT_F16_I16::Inst_VOP3__V_CVT_F16_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_f16_i16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CVT_F16_I16

    // Destructor with an empty body.
    Inst_VOP3__V_CVT_F16_I16::~Inst_VOP3__V_CVT_F16_I16()
    {
    } // ~Inst_VOP3__V_CVT_F16_I16

    // D.f16 = int16_to_flt16(S.i16).
    void
    Inst_VOP3__V_CVT_F16_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modelled: the only action is panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 v_cvt_u16_f16: forwards iFmt and the mnemonic to Inst_VOP3
    // and sets the ALU and F16 flags.
    Inst_VOP3__V_CVT_U16_F16::Inst_VOP3__V_CVT_U16_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_u16_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CVT_U16_F16

    // Destructor with an empty body.
    Inst_VOP3__V_CVT_U16_F16::~Inst_VOP3__V_CVT_U16_F16()
    {
    } // ~Inst_VOP3__V_CVT_U16_F16

    // D.u16 = flt16_to_uint16(S.f16).
    void
    Inst_VOP3__V_CVT_U16_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modelled: the only action is panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 v_cvt_i16_f16: forwards iFmt and the mnemonic to Inst_VOP3
    // and sets the ALU and F16 flags.
    Inst_VOP3__V_CVT_I16_F16::Inst_VOP3__V_CVT_I16_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_i16_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CVT_I16_F16

    // Destructor with an empty body.
    Inst_VOP3__V_CVT_I16_F16::~Inst_VOP3__V_CVT_I16_F16()
    {
    } // ~Inst_VOP3__V_CVT_I16_F16

    // D.i16 = flt16_to_int16(S.f16).
    void
    Inst_VOP3__V_CVT_I16_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modelled: the only action is panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 v_rcp_f16: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F16 flags.
    Inst_VOP3__V_RCP_F16::Inst_VOP3__V_RCP_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_rcp_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_RCP_F16

    // Destructor with an empty body.
    Inst_VOP3__V_RCP_F16::~Inst_VOP3__V_RCP_F16()
    {
    } // ~Inst_VOP3__V_RCP_F16

    // if (S0.f16 == 1.0f)
    //     D.f16 = 1.0f;
    // else
    //     D.f16 = 1 / S0.f16.
    void
    Inst_VOP3__V_RCP_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modelled: the only action is panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 v_sqrt_f16: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F16 flags.
    Inst_VOP3__V_SQRT_F16::Inst_VOP3__V_SQRT_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_sqrt_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_SQRT_F16

    // Destructor with an empty body.
    Inst_VOP3__V_SQRT_F16::~Inst_VOP3__V_SQRT_F16()
    {
    } // ~Inst_VOP3__V_SQRT_F16

    // if (S0.f16 == 1.0f)
    //     D.f16 = 1.0f;
    // else
    //     D.f16 = sqrt(S0.f16).
    void
    Inst_VOP3__V_SQRT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modelled: the only action is panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 v_rsq_f16: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F16 flags.
    Inst_VOP3__V_RSQ_F16::Inst_VOP3__V_RSQ_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_rsq_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_RSQ_F16

    // Destructor with an empty body.
    Inst_VOP3__V_RSQ_F16::~Inst_VOP3__V_RSQ_F16()
    {
    } // ~Inst_VOP3__V_RSQ_F16

    // if (S0.f16 == 1.0f)
    //     D.f16 = 1.0f;
    // else
    //     D.f16 = 1 / sqrt(S0.f16).
    void
    Inst_VOP3__V_RSQ_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modelled: the only action is panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 v_log_f16: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F16 flags.
    Inst_VOP3__V_LOG_F16::Inst_VOP3__V_LOG_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_log_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_LOG_F16

    // Destructor with an empty body.
    Inst_VOP3__V_LOG_F16::~Inst_VOP3__V_LOG_F16()
    {
    } // ~Inst_VOP3__V_LOG_F16

    // if (S0.f16 == 1.0f)
    //     D.f16 = 0.0f;
    // else
    //     D.f16 = log2(S0.f16).
    void
    Inst_VOP3__V_LOG_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modelled: the only action is panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 v_exp_f16: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F16 flags.
    Inst_VOP3__V_EXP_F16::Inst_VOP3__V_EXP_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_exp_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_EXP_F16

    // Destructor with an empty body.
    Inst_VOP3__V_EXP_F16::~Inst_VOP3__V_EXP_F16()
    {
    } // ~Inst_VOP3__V_EXP_F16

    // if (S0.f16 == 0.0f)
    //     D.f16 = 1.0f;
    // else
    //     D.f16 = pow(2.0, S0.f16).
    void
    Inst_VOP3__V_EXP_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modelled: the only action is panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 v_frexp_mant_f16: forwards iFmt and the mnemonic to
    // Inst_VOP3 and sets the ALU and F16 flags.
    Inst_VOP3__V_FREXP_MANT_F16::Inst_VOP3__V_FREXP_MANT_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_frexp_mant_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_FREXP_MANT_F16

    // Destructor with an empty body.
    Inst_VOP3__V_FREXP_MANT_F16::~Inst_VOP3__V_FREXP_MANT_F16()
    {
    } // ~Inst_VOP3__V_FREXP_MANT_F16

    // if (S0.f16 == +-INF || S0.f16 == NAN)
    //     D.f16 = S0.f16;
    // else
    //     D.f16 = mantissa(S0.f16).
    void
    Inst_VOP3__V_FREXP_MANT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modelled: the only action is panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 v_frexp_exp_i16_f16: forwards iFmt and the mnemonic to
    // Inst_VOP3 and sets the ALU and F16 flags.
    Inst_VOP3__V_FREXP_EXP_I16_F16::Inst_VOP3__V_FREXP_EXP_I16_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_frexp_exp_i16_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_FREXP_EXP_I16_F16

    // Destructor with an empty body.
    Inst_VOP3__V_FREXP_EXP_I16_F16::~Inst_VOP3__V_FREXP_EXP_I16_F16()
    {
    } // ~Inst_VOP3__V_FREXP_EXP_I16_F16

    void
    Inst_VOP3__V_FREXP_EXP_I16_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modelled: the only action is panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 v_floor_f16: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F16 flags.
    Inst_VOP3__V_FLOOR_F16::Inst_VOP3__V_FLOOR_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_floor_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_FLOOR_F16

    // Destructor with an empty body.
    Inst_VOP3__V_FLOOR_F16::~Inst_VOP3__V_FLOOR_F16()
    {
    } // ~Inst_VOP3__V_FLOOR_F16

    // D.f16 = floor(S0.f16);
    void
    Inst_VOP3__V_FLOOR_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modelled: the only action is panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 v_ceil_f16: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F16 flags.
    Inst_VOP3__V_CEIL_F16::Inst_VOP3__V_CEIL_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_ceil_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CEIL_F16

    // Destructor with an empty body.
    Inst_VOP3__V_CEIL_F16::~Inst_VOP3__V_CEIL_F16()
    {
    } // ~Inst_VOP3__V_CEIL_F16

    // D.f16 = ceil(S0.f16);
    void
    Inst_VOP3__V_CEIL_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modelled: the only action is panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 v_trunc_f16: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F16 flags.
    Inst_VOP3__V_TRUNC_F16::Inst_VOP3__V_TRUNC_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_trunc_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_TRUNC_F16

    // Destructor with an empty body.
    Inst_VOP3__V_TRUNC_F16::~Inst_VOP3__V_TRUNC_F16()
    {
    } // ~Inst_VOP3__V_TRUNC_F16

    // D.f16 = trunc(S0.f16).
    void
    Inst_VOP3__V_TRUNC_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modelled: the only action is panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 v_rndne_f16: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F16 flags.
    Inst_VOP3__V_RNDNE_F16::Inst_VOP3__V_RNDNE_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_rndne_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_RNDNE_F16

    // Destructor with an empty body.
    Inst_VOP3__V_RNDNE_F16::~Inst_VOP3__V_RNDNE_F16()
    {
    } // ~Inst_VOP3__V_RNDNE_F16

    // D.f16 = roundNearestEven(S0.f16);
    void
    Inst_VOP3__V_RNDNE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modelled: the only action is panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 v_fract_f16: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F16 flags.
    Inst_VOP3__V_FRACT_F16::Inst_VOP3__V_FRACT_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_fract_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_FRACT_F16

    // Destructor with an empty body.
    Inst_VOP3__V_FRACT_F16::~Inst_VOP3__V_FRACT_F16()
    {
    } // ~Inst_VOP3__V_FRACT_F16

    // D.f16 = S0.f16 + -floor(S0.f16).
    void
    Inst_VOP3__V_FRACT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modelled: the only action is panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 v_sin_f16: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F16 flags.
    Inst_VOP3__V_SIN_F16::Inst_VOP3__V_SIN_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_sin_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_SIN_F16

    // Destructor with an empty body.
    Inst_VOP3__V_SIN_F16::~Inst_VOP3__V_SIN_F16()
    {
    } // ~Inst_VOP3__V_SIN_F16

    // D.f16 = sin(S0.f16 * 2 * PI).
    void
    Inst_VOP3__V_SIN_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modelled: the only action is panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 v_cos_f16: forwards iFmt and the mnemonic to Inst_VOP3 and
    // sets the ALU and F16 flags.
    Inst_VOP3__V_COS_F16::Inst_VOP3__V_COS_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cos_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_COS_F16

    // Destructor with an empty body.
    Inst_VOP3__V_COS_F16::~Inst_VOP3__V_COS_F16()
    {
    } // ~Inst_VOP3__V_COS_F16

    // D.f16 = cos(S0.f16 * 2 * PI).
    void
    Inst_VOP3__V_COS_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modelled: the only action is panicUnimplemented().
        panicUnimplemented();
    }

    // VOP3 v_exp_legacy_f32: forwards iFmt and the mnemonic to
    // Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_EXP_LEGACY_F32::Inst_VOP3__V_EXP_LEGACY_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_exp_legacy_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_EXP_LEGACY_F32

    // Destructor with an empty body.
    Inst_VOP3__V_EXP_LEGACY_F32::~Inst_VOP3__V_EXP_LEGACY_F32()
    {
    } // ~Inst_VOP3__V_EXP_LEGACY_F32

    // D.f = pow(2.0, S0.f)
    void
    Inst_VOP3__V_EXP_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Base-2 exponential of SRC0, evaluated lane by lane.
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF32 operand(gpuDynInst, extData.SRC0);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        operand.readSrc();

        // Bit 0 of ABS/NEG selects the modifiers for SRC0; the absolute
        // value is applied before the negation.
        if ((instData.ABS & 0x1) != 0) {
            operand.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            operand.negModifier();
        }

        /**
         * Only one source is used, so the modifier bits belonging to
         * the second and third sources must be clear.
         */
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // Lanes whose exec-mask bit is clear are not assigned.
            if (!wave->execMask(laneId)) {
                continue;
            }

            result[laneId] = std::pow(2.0, operand[laneId]);
        }

        result.write();
    }

    // VOP3 v_log_legacy_f32: forwards iFmt and the mnemonic to
    // Inst_VOP3 and sets the ALU and F32 flags.
    Inst_VOP3__V_LOG_LEGACY_F32::Inst_VOP3__V_LOG_LEGACY_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_log_legacy_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_LOG_LEGACY_F32

    // Destructor with an empty body.
    Inst_VOP3__V_LOG_LEGACY_F32::~Inst_VOP3__V_LOG_LEGACY_F32()
    {
    } // ~Inst_VOP3__V_LOG_LEGACY_F32

    // D.f = log2(S0.f).
    void
    Inst_VOP3__V_LOG_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Base-2 logarithm of SRC0, evaluated lane by lane.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // Honour the SRC0 input modifiers, as V_LOG_F32 and
        // V_EXP_LEGACY_F32 do: bit 0 of ABS/NEG selects them, with the
        // absolute value applied before the negation.
        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            // Lanes whose exec-mask bit is clear are not assigned.
            if (wf->execMask(lane)) {
                vdst[lane] = std::log2(src[lane]);
            }
        }

        vdst.write();
    }

    // VOP3 v_mad_legacy_f32: forwards iFmt and the mnemonic to
    // Inst_VOP3 and sets the ALU, F32 and MAD flags.
    Inst_VOP3__V_MAD_LEGACY_F32::Inst_VOP3__V_MAD_LEGACY_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mad_legacy_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(MAD);
    } // Inst_VOP3__V_MAD_LEGACY_F32

    // Destructor with an empty body.
    Inst_VOP3__V_MAD_LEGACY_F32::~Inst_VOP3__V_MAD_LEGACY_F32()
    {
    } // ~Inst_VOP3__V_MAD_LEGACY_F32

    // D.f = S0.f * S1.f + S2.f
    void
    Inst_VOP3__V_MAD_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Per-lane SRC0 * SRC1 + SRC2, computed through std::fma.
        Wavefront *const wave = gpuDynInst->wavefront();
        ConstVecOperandF32 factorA(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 factorB(gpuDynInst, extData.SRC1);
        ConstVecOperandF32 addend(gpuDynInst, extData.SRC2);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        factorA.readSrc();
        factorB.readSrc();
        addend.readSrc();

        // Bits 0, 1 and 2 of ABS/NEG select the modifiers for SRC0,
        // SRC1 and SRC2. All absolute values are applied before any
        // negation.
        if ((instData.ABS & 0x1) != 0) {
            factorA.absModifier();
        }

        if ((instData.ABS & 0x2) != 0) {
            factorB.absModifier();
        }

        if ((instData.ABS & 0x4) != 0) {
            addend.absModifier();
        }

        if ((extData.NEG & 0x1) != 0) {
            factorA.negModifier();
        }

        if ((extData.NEG & 0x2) != 0) {
            factorB.negModifier();
        }

        if ((extData.NEG & 0x4) != 0) {
            addend.negModifier();
        }

        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            // Lanes whose exec-mask bit is clear are not assigned.
            if (!wave->execMask(laneId)) {
                continue;
            }

            result[laneId] = std::fma(factorA[laneId], factorB[laneId],
                                      addend[laneId]);
        }

        result.write();
    }

    /**
     * Construct the VOP3 form of v_mad_f32. The format pointer and mnemonic
     * are forwarded to the Inst_VOP3 base, then the ALU, F32 and MAD flags
     * are set on the instruction.
     */
    Inst_VOP3__V_MAD_F32::Inst_VOP3__V_MAD_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mad_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(MAD);
    } // Inst_VOP3__V_MAD_F32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_MAD_F32::~Inst_VOP3__V_MAD_F32() = default;

    // D.f = S0.f * S1.f + S2.f.
    // Evaluated with std::fma for every lane whose execMask bit is set.
    // ABS modifiers are applied to the sources before NEG modifiers; bit i
    // of each field selects source operand i.
    void
    Inst_VOP3__V_MAD_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 factorA(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 factorB(gpuDynInst, extData.SRC1);
        ConstVecOperandF32 summand(gpuDynInst, extData.SRC2);
        VecOperandF32 dest(gpuDynInst, instData.VDST);

        factorA.readSrc();
        factorB.readSrc();
        summand.readSrc();

        // absolute-value input modifiers
        if (instData.ABS & 0x1) {
            factorA.absModifier();
        }

        if (instData.ABS & 0x2) {
            factorB.absModifier();
        }

        if (instData.ABS & 0x4) {
            summand.absModifier();
        }

        // negation input modifiers
        if (extData.NEG & 0x1) {
            factorA.negModifier();
        }

        if (extData.NEG & 0x2) {
            factorB.negModifier();
        }

        if (extData.NEG & 0x4) {
            summand.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            dest[lane] = std::fma(factorA[lane], factorB[lane], summand[lane]);
        }

        dest.write();
    }

    /**
     * Construct the VOP3 form of v_mad_i32_i24. The format pointer and
     * mnemonic are forwarded to the Inst_VOP3 base, then the ALU and MAD
     * flags are set on the instruction.
     */
    Inst_VOP3__V_MAD_I32_I24::Inst_VOP3__V_MAD_I32_I24(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mad_i32_i24", false)
    {
        setFlag(ALU);
        setFlag(MAD);
    } // Inst_VOP3__V_MAD_I32_I24

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_MAD_I32_I24::~Inst_VOP3__V_MAD_I32_I24() = default;

    // D.i = S0.i[23:0] * S1.i[23:0] + S2.i.
    // The low 24 bits of S0 and S1 are extracted and sign-extended with
    // sext<24> before being multiplied; S2 is added in full. Only lanes
    // whose execMask bit is set are written.
    void
    Inst_VOP3__V_MAD_I32_I24::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 mulLhs(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 mulRhs(gpuDynInst, extData.SRC1);
        ConstVecOperandI32 addend(gpuDynInst, extData.SRC2);
        VecOperandI32 result(gpuDynInst, instData.VDST);

        mulLhs.readSrc();
        mulRhs.readSrc();
        addend.readSrc();

        /**
         * Integer operation: the encoding must not request any ABS or NEG
         * input modifier, those exist for FP operations only.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const auto lhs24 = sext<24>(bits(mulLhs[lane], 23, 0));
            const auto rhs24 = sext<24>(bits(mulRhs[lane], 23, 0));
            result[lane] = lhs24 * rhs24 + addend[lane];
        }

        result.write();
    }

    /**
     * Construct the VOP3 form of v_mad_u32_u24. The format pointer and
     * mnemonic are forwarded to the Inst_VOP3 base, then the ALU and MAD
     * flags are set on the instruction.
     */
    Inst_VOP3__V_MAD_U32_U24::Inst_VOP3__V_MAD_U32_U24(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mad_u32_u24", false)
    {
        setFlag(ALU);
        setFlag(MAD);
    } // Inst_VOP3__V_MAD_U32_U24

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_MAD_U32_U24::~Inst_VOP3__V_MAD_U32_U24() = default;

    // D.u = S0.u[23:0] * S1.u[23:0] + S2.u.
    // The low 24 bits of S0 and S1 are multiplied and S2 is added in full.
    // Only lanes whose execMask bit is set are written.
    void
    Inst_VOP3__V_MAD_U32_U24::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 mulLhs(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 mulRhs(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 addend(gpuDynInst, extData.SRC2);
        VecOperandU32 result(gpuDynInst, instData.VDST);

        mulLhs.readSrc();
        mulRhs.readSrc();
        addend.readSrc();

        /**
         * Integer operation: the encoding must not request any ABS or NEG
         * input modifier, those exist for FP operations only.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const auto lhs24 = bits(mulLhs[lane], 23, 0);
            const auto rhs24 = bits(mulRhs[lane], 23, 0);
            result[lane] = lhs24 * rhs24 + addend[lane];
        }

        result.write();
    }

    /**
     * Construct the VOP3 form of v_cubeid_f32. The format pointer and
     * mnemonic are forwarded to the Inst_VOP3 base, then the ALU and F32
     * flags are set on the instruction.
     */
    Inst_VOP3__V_CUBEID_F32::Inst_VOP3__V_CUBEID_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cubeid_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CUBEID_F32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_CUBEID_F32::~Inst_VOP3__V_CUBEID_F32() = default;

    // v_cubeid_f32 has no functional model here: execution does nothing
    // except call panicUnimplemented(); gpuDynInst is not used.
    void
    Inst_VOP3__V_CUBEID_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Construct the VOP3 form of v_cubesc_f32. The format pointer and
     * mnemonic are forwarded to the Inst_VOP3 base, then the ALU and F32
     * flags are set on the instruction.
     */
    Inst_VOP3__V_CUBESC_F32::Inst_VOP3__V_CUBESC_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cubesc_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CUBESC_F32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_CUBESC_F32::~Inst_VOP3__V_CUBESC_F32() = default;

    // v_cubesc_f32 has no functional model here: execution does nothing
    // except call panicUnimplemented(); gpuDynInst is not used.
    void
    Inst_VOP3__V_CUBESC_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Construct the VOP3 form of v_cubetc_f32. The format pointer and
     * mnemonic are forwarded to the Inst_VOP3 base, then the ALU and F32
     * flags are set on the instruction.
     */
    Inst_VOP3__V_CUBETC_F32::Inst_VOP3__V_CUBETC_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cubetc_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CUBETC_F32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_CUBETC_F32::~Inst_VOP3__V_CUBETC_F32() = default;

    // v_cubetc_f32 has no functional model here: execution does nothing
    // except call panicUnimplemented(); gpuDynInst is not used.
    void
    Inst_VOP3__V_CUBETC_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Construct the VOP3 form of v_cubema_f32. The format pointer and
     * mnemonic are forwarded to the Inst_VOP3 base, then the ALU and F32
     * flags are set on the instruction.
     */
    Inst_VOP3__V_CUBEMA_F32::Inst_VOP3__V_CUBEMA_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cubema_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CUBEMA_F32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_CUBEMA_F32::~Inst_VOP3__V_CUBEMA_F32() = default;

    // v_cubema_f32 has no functional model here: execution does nothing
    // except call panicUnimplemented(); gpuDynInst is not used.
    void
    Inst_VOP3__V_CUBEMA_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Construct the VOP3 form of v_bfe_u32. The format pointer and mnemonic
     * are forwarded to the Inst_VOP3 base, then the ALU flag is set.
     */
    Inst_VOP3__V_BFE_U32::Inst_VOP3__V_BFE_U32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_bfe_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_BFE_U32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_BFE_U32::~Inst_VOP3__V_BFE_U32() = default;

    // D.u = (S0.u >> S1.u[4:0]) & ((1 << S2.u[4:0]) - 1).
    // Bitfield extract with S0 = data, S1 = field_offset, S2 = field_width.
    // Only lanes whose execMask bit is set are written.
    void
    Inst_VOP3__V_BFE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                const VecElemU32 field_offset = bits(src1[lane], 4, 0);
                const VecElemU32 field_width = bits(src2[lane], 4, 0);

                /**
                 * Build the mask in unsigned arithmetic. With a plain int
                 * literal a width of 31 evaluates (1 << 31) - 1, which
                 * overflows a signed int; the unsigned form yields the
                 * intended 0x7fffffff.
                 */
                const VecElemU32 field_mask
                    = (static_cast<VecElemU32>(1) << field_width) - 1;

                vdst[lane] = (src0[lane] >> field_offset) & field_mask;
            }
        }

        vdst.write();
    }

    /**
     * Construct the VOP3 form of v_bfe_i32. The format pointer and mnemonic
     * are forwarded to the Inst_VOP3 base, then the ALU flag is set.
     */
    Inst_VOP3__V_BFE_I32::Inst_VOP3__V_BFE_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_bfe_i32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_BFE_I32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_BFE_I32::~Inst_VOP3__V_BFE_I32() = default;

    // D.i = (S0.i >> S1.u[4:0]) & ((1 << S2.u[4:0]) - 1).
    // Bitfield extract with S0 = data, S1 = field_offset, S2 = field_width.
    // Only lanes whose execMask bit is set are written.
    //
    // NOTE(review): as written in the formula above, the extracted field is
    // not sign-extended. Confirm against hardware behaviour whether a
    // signed extract should replicate the field's top bit.
    void
    Inst_VOP3__V_BFE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                const VecElemU32 field_offset = bits(src1[lane], 4, 0);
                const VecElemU32 field_width = bits(src2[lane], 4, 0);

                /**
                 * Build the mask in unsigned arithmetic. With a plain int
                 * literal a width of 31 evaluates (1 << 31) - 1, which
                 * overflows a signed int. The mask never has bit 31 set,
                 * so the masked value always fits in VecElemI32.
                 */
                const VecElemU32 field_mask
                    = (static_cast<VecElemU32>(1) << field_width) - 1;

                // the shift itself stays on the signed source value
                const VecElemU32 shifted
                    = static_cast<VecElemU32>(src0[lane] >> field_offset);

                vdst[lane] = static_cast<VecElemI32>(shifted & field_mask);
            }
        }

        vdst.write();
    }

    /**
     * Construct the VOP3 form of v_bfi_b32. The format pointer and mnemonic
     * are forwarded to the Inst_VOP3 base, then the ALU flag is set.
     */
    Inst_VOP3__V_BFI_B32::Inst_VOP3__V_BFI_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_bfi_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_BFI_B32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_BFI_B32::~Inst_VOP3__V_BFI_B32() = default;

    // D.u = (S0.u & S1.u) | (~S0.u & S2.u); bitfield insert.
    // S0 acts as a per-bit selector: set bits take S1, clear bits take S2.
    // Only lanes whose execMask bit is set are written.
    void
    Inst_VOP3__V_BFI_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 selector(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 onSet(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 onClear(gpuDynInst, extData.SRC2);
        VecOperandU32 result(gpuDynInst, instData.VDST);

        selector.readSrc();
        onSet.readSrc();
        onClear.readSrc();

        /**
         * Integer operation: the encoding must not request any ABS or NEG
         * input modifier, those exist for FP operations only.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const VecElemU32 sel = selector[lane];
            result[lane] = (sel & onSet[lane]) | (~sel & onClear[lane]);
        }

        result.write();
    }

    /**
     * Construct the VOP3 form of v_fma_f32. The format pointer and mnemonic
     * are forwarded to the Inst_VOP3 base, then the ALU, F32 and FMA flags
     * are set on the instruction.
     */
    Inst_VOP3__V_FMA_F32::Inst_VOP3__V_FMA_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_fma_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(FMA);
    } // Inst_VOP3__V_FMA_F32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_FMA_F32::~Inst_VOP3__V_FMA_F32() = default;

    // D.f = S0.f * S1.f + S2.f.
    // Evaluated with std::fma for every lane whose execMask bit is set.
    // ABS modifiers are applied to the sources before NEG modifiers; bit i
    // of each field selects source operand i.
    void
    Inst_VOP3__V_FMA_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 factorA(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 factorB(gpuDynInst, extData.SRC1);
        ConstVecOperandF32 summand(gpuDynInst, extData.SRC2);
        VecOperandF32 dest(gpuDynInst, instData.VDST);

        factorA.readSrc();
        factorB.readSrc();
        summand.readSrc();

        // absolute-value input modifiers
        if (instData.ABS & 0x1) {
            factorA.absModifier();
        }

        if (instData.ABS & 0x2) {
            factorB.absModifier();
        }

        if (instData.ABS & 0x4) {
            summand.absModifier();
        }

        // negation input modifiers
        if (extData.NEG & 0x1) {
            factorA.negModifier();
        }

        if (extData.NEG & 0x2) {
            factorB.negModifier();
        }

        if (extData.NEG & 0x4) {
            summand.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            dest[lane] = std::fma(factorA[lane], factorB[lane], summand[lane]);
        }

        dest.write();
    }

    /**
     * Construct the VOP3 form of v_fma_f64. The format pointer and mnemonic
     * are forwarded to the Inst_VOP3 base, then the ALU, F64 and FMA flags
     * are set on the instruction.
     */
    Inst_VOP3__V_FMA_F64::Inst_VOP3__V_FMA_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_fma_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
        setFlag(FMA);
    } // Inst_VOP3__V_FMA_F64

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_FMA_F64::~Inst_VOP3__V_FMA_F64() = default;

    // D.d = S0.d * S1.d + S2.d.
    // Evaluated with std::fma on F64 operands for every lane whose execMask
    // bit is set. ABS modifiers are applied to the sources before NEG
    // modifiers; bit i of each field selects source operand i.
    void
    Inst_VOP3__V_FMA_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF64 factorA(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 factorB(gpuDynInst, extData.SRC1);
        ConstVecOperandF64 summand(gpuDynInst, extData.SRC2);
        VecOperandF64 dest(gpuDynInst, instData.VDST);

        factorA.readSrc();
        factorB.readSrc();
        summand.readSrc();

        // absolute-value input modifiers
        if (instData.ABS & 0x1) {
            factorA.absModifier();
        }

        if (instData.ABS & 0x2) {
            factorB.absModifier();
        }

        if (instData.ABS & 0x4) {
            summand.absModifier();
        }

        // negation input modifiers
        if (extData.NEG & 0x1) {
            factorA.negModifier();
        }

        if (extData.NEG & 0x2) {
            factorB.negModifier();
        }

        if (extData.NEG & 0x4) {
            summand.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            dest[lane] = std::fma(factorA[lane], factorB[lane], summand[lane]);
        }

        dest.write();
    }

    /**
     * Construct the VOP3 form of v_lerp_u8. The format pointer and mnemonic
     * are forwarded to the Inst_VOP3 base, then the ALU flag is set.
     */
    Inst_VOP3__V_LERP_U8::Inst_VOP3__V_LERP_U8(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_lerp_u8", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_LERP_U8

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_LERP_U8::~Inst_VOP3__V_LERP_U8() = default;

    // D.u = ((S0.u[31:24] + S1.u[31:24] + S2.u[24]) >> 1) << 24
    // D.u += ((S0.u[23:16] + S1.u[23:16] + S2.u[16]) >> 1) << 16;
    // D.u += ((S0.u[15:8] + S1.u[15:8] + S2.u[8]) >> 1) << 8;
    // D.u += ((S0.u[7:0] + S1.u[7:0] + S2.u[0]) >> 1).
    // Each byte of the result is the halved sum of the matching bytes of S0
    // and S1 plus the lowest bit of the matching byte of S2. Only lanes
    // whose execMask bit is set are written.
    void
    Inst_VOP3__V_LERP_U8::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 packedA(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 packedB(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 roundBits(gpuDynInst, extData.SRC2);
        VecOperandU32 result(gpuDynInst, instData.VDST);

        packedA.readSrc();
        packedB.readSrc();
        roundBits.readSrc();

        /**
         * Integer operation: the encoding must not request any ABS or NEG
         * input modifier, those exist for FP operations only.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const VecElemU32 a = packedA[lane];
            const VecElemU32 b = packedB[lane];
            const VecElemU32 r = roundBits[lane];

            // assemble the result from the most significant byte down
            VecElemU32 packed
                = ((bits(a, 31, 24) + bits(b, 31, 24) + bits(r, 24)) >> 1)
                    << 24;
            packed += ((bits(a, 23, 16) + bits(b, 23, 16) + bits(r, 16)) >> 1)
                << 16;
            packed += ((bits(a, 15, 8) + bits(b, 15, 8) + bits(r, 8)) >> 1)
                << 8;
            packed += ((bits(a, 7, 0) + bits(b, 7, 0) + bits(r, 0)) >> 1);

            result[lane] = packed;
        }

        result.write();
    }

    /**
     * Construct the VOP3 form of v_alignbit_b32. The format pointer and
     * mnemonic are forwarded to the Inst_VOP3 base, then the ALU flag is
     * set.
     */
    Inst_VOP3__V_ALIGNBIT_B32::Inst_VOP3__V_ALIGNBIT_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_alignbit_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_ALIGNBIT_B32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_ALIGNBIT_B32::~Inst_VOP3__V_ALIGNBIT_B32() = default;

    // D.u = ({S0, S1} >> S2.u[4:0]) & 0xffffffff.
    // S0 forms the upper and S1 the lower half of a 64-bit value, which is
    // shifted right by the low five bits of S2; the low 32 bits are kept.
    // Only lanes whose execMask bit is set are written.
    void
    Inst_VOP3__V_ALIGNBIT_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 upperWord(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 lowerWord(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 shiftSrc(gpuDynInst, extData.SRC2);
        VecOperandU32 result(gpuDynInst, instData.VDST);

        upperWord.readSrc();
        lowerWord.readSrc();
        shiftSrc.readSrc();

        /**
         * Integer operation: the encoding must not request any ABS or NEG
         * input modifier, those exist for FP operations only.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const VecElemU64 hi = (VecElemU64)upperWord[lane] << 32;
            const VecElemU64 lo = (VecElemU64)lowerWord[lane];
            const VecElemU64 shift = (VecElemU64)bits(shiftSrc[lane], 4, 0);

            result[lane] = (VecElemU32)(((hi | lo) >> shift) & 0xffffffff);
        }

        result.write();
    }

    /**
     * Construct the VOP3 form of v_alignbyte_b32. The format pointer and
     * mnemonic are forwarded to the Inst_VOP3 base, then the ALU flag is
     * set.
     */
    Inst_VOP3__V_ALIGNBYTE_B32::Inst_VOP3__V_ALIGNBYTE_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_alignbyte_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_ALIGNBYTE_B32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_ALIGNBYTE_B32::~Inst_VOP3__V_ALIGNBYTE_B32() = default;

    // D.u = ({S0, S1} >> (8 * S2.u[4:0])) & 0xffffffff.
    // S0 forms the upper and S1 the lower half of a 64-bit value, which is
    // shifted right by a whole number of bytes; the low 32 bits are kept.
    // Only lanes whose execMask bit is set are written.
    //
    // NOTE(review): this follows the formula above literally, so byte
    // counts of 8 and more produce 0. Confirm against hardware whether only
    // the low bits of S2 are meant to take part in the shift.
    void
    Inst_VOP3__V_ALIGNBYTE_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                const VecElemU64 src_0_1
                    = (static_cast<VecElemU64>(src0[lane]) << 32)
                        | static_cast<VecElemU64>(src1[lane]);
                const VecElemU64 shift_bits
                    = 8ULL * static_cast<VecElemU64>(bits(src2[lane], 4, 0));

                /**
                 * The bit count ranges up to 8 * 31 = 248, but shifting a
                 * 64-bit value by 64 or more is undefined in C++. In that
                 * case every bit of {S0, S1} has been shifted out, so the
                 * result is 0.
                 */
                if (shift_bits < 64) {
                    vdst[lane] = static_cast<VecElemU32>(
                        (src_0_1 >> shift_bits) & 0xffffffff);
                } else {
                    vdst[lane] = 0;
                }
            }
        }

        vdst.write();
    }

    /**
     * Construct the VOP3 form of v_min3_f32. The format pointer and
     * mnemonic are forwarded to the Inst_VOP3 base, then the ALU and F32
     * flags are set on the instruction.
     */
    Inst_VOP3__V_MIN3_F32::Inst_VOP3__V_MIN3_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_min3_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_MIN3_F32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_MIN3_F32::~Inst_VOP3__V_MIN3_F32() = default;

    // D.f = min(S0.f, S1.f, S2.f).
    // Reduced with two std::fmin calls (S0 with S1, then with S2) for every
    // lane whose execMask bit is set. ABS modifiers are applied to the
    // sources before NEG modifiers; bit i of each field selects source i.
    void
    Inst_VOP3__V_MIN3_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 first(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 second(gpuDynInst, extData.SRC1);
        ConstVecOperandF32 third(gpuDynInst, extData.SRC2);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        first.readSrc();
        second.readSrc();
        third.readSrc();

        // absolute-value input modifiers
        if (instData.ABS & 0x1) {
            first.absModifier();
        }

        if (instData.ABS & 0x2) {
            second.absModifier();
        }

        if (instData.ABS & 0x4) {
            third.absModifier();
        }

        // negation input modifiers
        if (extData.NEG & 0x1) {
            first.negModifier();
        }

        if (extData.NEG & 0x2) {
            second.negModifier();
        }

        if (extData.NEG & 0x4) {
            third.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const VecElemF32 lowestOfPair
                = std::fmin(first[lane], second[lane]);
            result[lane] = std::fmin(lowestOfPair, third[lane]);
        }

        result.write();
    }

    /**
     * Construct the VOP3 form of v_min3_i32. The format pointer and
     * mnemonic are forwarded to the Inst_VOP3 base, then the ALU flag is
     * set.
     */
    Inst_VOP3__V_MIN3_I32::Inst_VOP3__V_MIN3_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_min3_i32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MIN3_I32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_MIN3_I32::~Inst_VOP3__V_MIN3_I32() = default;

    // D.i = min(S0.i, S1.i, S2.i).
    // Reduced with two std::min calls (S0 with S1, then with S2) for every
    // lane whose execMask bit is set.
    void
    Inst_VOP3__V_MIN3_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 first(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 second(gpuDynInst, extData.SRC1);
        ConstVecOperandI32 third(gpuDynInst, extData.SRC2);
        VecOperandI32 result(gpuDynInst, instData.VDST);

        first.readSrc();
        second.readSrc();
        third.readSrc();

        /**
         * Integer operation: the encoding must not request any ABS or NEG
         * input modifier, those exist for FP operations only.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const VecElemI32 lowestOfPair
                = std::min(first[lane], second[lane]);
            result[lane] = std::min(lowestOfPair, third[lane]);
        }

        result.write();
    }

    /**
     * Construct the VOP3 form of v_min3_u32. The format pointer and
     * mnemonic are forwarded to the Inst_VOP3 base, then the ALU flag is
     * set.
     */
    Inst_VOP3__V_MIN3_U32::Inst_VOP3__V_MIN3_U32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_min3_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MIN3_U32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_MIN3_U32::~Inst_VOP3__V_MIN3_U32() = default;

    // D.u = min(S0.u, S1.u, S2.u).
    // Reduced with two std::min calls (S0 with S1, then with S2) for every
    // lane whose execMask bit is set.
    void
    Inst_VOP3__V_MIN3_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 first(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 second(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 third(gpuDynInst, extData.SRC2);
        VecOperandU32 result(gpuDynInst, instData.VDST);

        first.readSrc();
        second.readSrc();
        third.readSrc();

        /**
         * Integer operation: the encoding must not request any ABS or NEG
         * input modifier, those exist for FP operations only.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const VecElemU32 lowestOfPair
                = std::min(first[lane], second[lane]);
            result[lane] = std::min(lowestOfPair, third[lane]);
        }

        result.write();
    }

    /**
     * Construct the VOP3 form of v_max3_f32. The format pointer and
     * mnemonic are forwarded to the Inst_VOP3 base, then the ALU and F32
     * flags are set on the instruction.
     */
    Inst_VOP3__V_MAX3_F32::Inst_VOP3__V_MAX3_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_max3_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_MAX3_F32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_MAX3_F32::~Inst_VOP3__V_MAX3_F32() = default;

    // D.f = max(S0.f, S1.f, S2.f).
    // Reduced with two std::fmax calls (S0 with S1, then with S2) for every
    // lane whose execMask bit is set. ABS modifiers are applied to the
    // sources before NEG modifiers; bit i of each field selects source i.
    void
    Inst_VOP3__V_MAX3_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 first(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 second(gpuDynInst, extData.SRC1);
        ConstVecOperandF32 third(gpuDynInst, extData.SRC2);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        first.readSrc();
        second.readSrc();
        third.readSrc();

        // absolute-value input modifiers
        if (instData.ABS & 0x1) {
            first.absModifier();
        }

        if (instData.ABS & 0x2) {
            second.absModifier();
        }

        if (instData.ABS & 0x4) {
            third.absModifier();
        }

        // negation input modifiers
        if (extData.NEG & 0x1) {
            first.negModifier();
        }

        if (extData.NEG & 0x2) {
            second.negModifier();
        }

        if (extData.NEG & 0x4) {
            third.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const VecElemF32 highestOfPair
                = std::fmax(first[lane], second[lane]);
            result[lane] = std::fmax(highestOfPair, third[lane]);
        }

        result.write();
    }

    /**
     * Construct the VOP3 form of v_max3_i32. The format pointer and
     * mnemonic are forwarded to the Inst_VOP3 base, then the ALU flag is
     * set.
     */
    Inst_VOP3__V_MAX3_I32::Inst_VOP3__V_MAX3_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_max3_i32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MAX3_I32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_MAX3_I32::~Inst_VOP3__V_MAX3_I32() = default;

    // D.i = max(S0.i, S1.i, S2.i).
    // Reduced with two std::max calls (S0 with S1, then with S2) for every
    // lane whose execMask bit is set.
    void
    Inst_VOP3__V_MAX3_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 first(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 second(gpuDynInst, extData.SRC1);
        ConstVecOperandI32 third(gpuDynInst, extData.SRC2);
        VecOperandI32 result(gpuDynInst, instData.VDST);

        first.readSrc();
        second.readSrc();
        third.readSrc();

        /**
         * Integer operation: the encoding must not request any ABS or NEG
         * input modifier, those exist for FP operations only.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const VecElemI32 highestOfPair
                = std::max(first[lane], second[lane]);
            result[lane] = std::max(highestOfPair, third[lane]);
        }

        result.write();
    }

    /**
     * Construct the VOP3 form of v_max3_u32. The format pointer and
     * mnemonic are forwarded to the Inst_VOP3 base, then the ALU flag is
     * set.
     */
    Inst_VOP3__V_MAX3_U32::Inst_VOP3__V_MAX3_U32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_max3_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MAX3_U32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_MAX3_U32::~Inst_VOP3__V_MAX3_U32() = default;

    // D.u = max(S0.u, S1.u, S2.u).
    // Reduced with two std::max calls (S0 with S1, then with S2) for every
    // lane whose execMask bit is set.
    void
    Inst_VOP3__V_MAX3_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 first(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 second(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 third(gpuDynInst, extData.SRC2);
        VecOperandU32 result(gpuDynInst, instData.VDST);

        first.readSrc();
        second.readSrc();
        third.readSrc();

        /**
         * Integer operation: the encoding must not request any ABS or NEG
         * input modifier, those exist for FP operations only.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            const VecElemU32 highestOfPair
                = std::max(first[lane], second[lane]);
            result[lane] = std::max(highestOfPair, third[lane]);
        }

        result.write();
    }

    /**
     * Construct the VOP3 form of v_med3_f32. The format pointer and
     * mnemonic are forwarded to the Inst_VOP3 base, then the ALU and F32
     * flags are set on the instruction.
     */
    Inst_VOP3__V_MED3_F32::Inst_VOP3__V_MED3_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_med3_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_MED3_F32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_MED3_F32::~Inst_VOP3__V_MED3_F32() = default;

    // D.f = median(S0.f, S1.f, S2.f).
    // Delegates to the median() helper for every lane whose execMask bit is
    // set. ABS modifiers are applied to the sources before NEG modifiers;
    // bit i of each field selects source operand i.
    void
    Inst_VOP3__V_MED3_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandF32 first(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 second(gpuDynInst, extData.SRC1);
        ConstVecOperandF32 third(gpuDynInst, extData.SRC2);
        VecOperandF32 result(gpuDynInst, instData.VDST);

        first.readSrc();
        second.readSrc();
        third.readSrc();

        // absolute-value input modifiers
        if (instData.ABS & 0x1) {
            first.absModifier();
        }

        if (instData.ABS & 0x2) {
            second.absModifier();
        }

        if (instData.ABS & 0x4) {
            third.absModifier();
        }

        // negation input modifiers
        if (extData.NEG & 0x1) {
            first.negModifier();
        }

        if (extData.NEG & 0x2) {
            second.negModifier();
        }

        if (extData.NEG & 0x4) {
            third.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            result[lane] = median(first[lane], second[lane], third[lane]);
        }

        result.write();
    }

    /**
     * Construct the VOP3 form of v_med3_i32. The format pointer and
     * mnemonic are forwarded to the Inst_VOP3 base, then the ALU flag is
     * set.
     */
    Inst_VOP3__V_MED3_I32::Inst_VOP3__V_MED3_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_med3_i32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MED3_I32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_MED3_I32::~Inst_VOP3__V_MED3_I32() = default;

    // D.i = median(S0.i, S1.i, S2.i).
    // Delegates to the median() helper for every lane whose execMask bit is
    // set.
    void
    Inst_VOP3__V_MED3_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 first(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 second(gpuDynInst, extData.SRC1);
        ConstVecOperandI32 third(gpuDynInst, extData.SRC2);
        VecOperandI32 result(gpuDynInst, instData.VDST);

        first.readSrc();
        second.readSrc();
        third.readSrc();

        /**
         * Integer operation: the encoding must not request any ABS or NEG
         * input modifier, those exist for FP operations only.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            result[lane] = median(first[lane], second[lane], third[lane]);
        }

        result.write();
    }

    /**
     * Construct the VOP3 form of v_med3_u32. The format pointer and
     * mnemonic are forwarded to the Inst_VOP3 base, then the ALU flag is
     * set.
     */
    Inst_VOP3__V_MED3_U32::Inst_VOP3__V_MED3_U32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_med3_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MED3_U32

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_MED3_U32::~Inst_VOP3__V_MED3_U32() = default;

    // D.u = median(S0.u, S1.u, S2.u).
    // Delegates to the median() helper for every lane whose execMask bit is
    // set.
    void
    Inst_VOP3__V_MED3_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandU32 first(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 second(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 third(gpuDynInst, extData.SRC2);
        VecOperandU32 result(gpuDynInst, instData.VDST);

        first.readSrc();
        second.readSrc();
        third.readSrc();

        /**
         * Integer operation: the encoding must not request any ABS or NEG
         * input modifier, those exist for FP operations only.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            result[lane] = median(first[lane], second[lane], third[lane]);
        }

        result.write();
    }

    /**
     * Construct the VOP3 form of v_sad_u8. The format pointer and mnemonic
     * are forwarded to the Inst_VOP3 base, then the ALU flag is set.
     */
    Inst_VOP3__V_SAD_U8::Inst_VOP3__V_SAD_U8(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_sad_u8", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_SAD_U8

    // The destructor has no work of its own; take the defaulted body.
    Inst_VOP3__V_SAD_U8::~Inst_VOP3__V_SAD_U8() = default;

    // D.u = abs(S0.i[31:24] - S1.i[31:24]) + abs(S0.i[23:16] - S1.i[23:16]) +
    // abs(S0.i[15:8] - S1.i[15:8]) + abs(S0.i[7:0] - S1.i[7:0]) + S2.u.
    // Sum of absolute differences with accumulation, overflow into upper bits
    // is allowed.
    // The four byte-wise differences are added onto S2 one at a time; only
    // lanes whose execMask bit is set are written.
    void
    Inst_VOP3__V_SAD_U8::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        ConstVecOperandI32 packedA(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 packedB(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 accum(gpuDynInst, extData.SRC2);
        VecOperandU32 result(gpuDynInst, instData.VDST);

        packedA.readSrc();
        packedB.readSrc();
        accum.readSrc();

        /**
         * Integer operation: the encoding must not request any ABS or NEG
         * input modifier, those exist for FP operations only.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            // start from the accumulator and add each byte's |a - b|
            VecElemU32 total = accum[lane];
            total += std::abs(bits(packedA[lane], 31, 24)
                - bits(packedB[lane], 31, 24));
            total += std::abs(bits(packedA[lane], 23, 16)
                - bits(packedB[lane], 23, 16));
            total += std::abs(bits(packedA[lane], 15, 8)
                - bits(packedB[lane], 15, 8));
            total += std::abs(bits(packedA[lane], 7, 0)
                - bits(packedB[lane], 7, 0));

            result[lane] = total;
        }

        result.write();
    }

    // Construct the VOP3 "v_sad_hi_u8" instruction from the given InFmt_VOP3
    // encoding and flag it as an ALU operation.
    Inst_VOP3__V_SAD_HI_U8::Inst_VOP3__V_SAD_HI_U8(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_sad_hi_u8", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_SAD_HI_U8

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_SAD_HI_U8::~Inst_VOP3__V_SAD_HI_U8()
    {
    } // ~Inst_VOP3__V_SAD_HI_U8

    // D.u = (SAD_U8(S0, S1, 0) << 16) + S2.u.
    // Sum of absolute differences with accumulation, overflow is lost.
    //
    // For every active lane the four byte pairs of S0/S1 are compared, the
    // absolute differences are summed, the sum is shifted into the upper
    // half-word and S2 is added. The 32-bit unsigned arithmetic wraps, which
    // is how overflow is dropped.
    void
    Inst_VOP3__V_SAD_HI_U8::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                VecElemU32 sad = 0;

                for (int byte = 0; byte < 4; ++byte) {
                    const int lsb = 8 * byte;
                    const VecElemU32 a = bits(src0[lane], lsb + 7, lsb);
                    const VecElemU32 b = bits(src1[lane], lsb + 7, lsb);

                    // The operands are unsigned, so a plain subtraction
                    // would wrap instead of yielding |a - b|; always
                    // subtract the smaller byte from the larger one.
                    sad += (a > b) ? (a - b) : (b - a);
                }

                vdst[lane] = (sad << 16) + src2[lane];
            }
        }

        vdst.write();
    }

    // Construct the VOP3 "v_sad_u16" instruction from the given InFmt_VOP3
    // encoding and flag it as an ALU operation.
    Inst_VOP3__V_SAD_U16::Inst_VOP3__V_SAD_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_sad_u16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_SAD_U16

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_SAD_U16::~Inst_VOP3__V_SAD_U16()
    {
    } // ~Inst_VOP3__V_SAD_U16

    // v_sad_u16: word-wise sum of absolute differences with accumulation.
    // D.u = abs(S0.i[31:16] - S1.i[31:16]) + abs(S0.i[15:0] - S1.i[15:0])
    // + S2.u.
    void
    Inst_VOP3__V_SAD_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * ABS/NEG input modifiers exist for FP operations only, so none
         * of them may be set on this integer instruction.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                // lane is disabled by the execute mask
                continue;
            }

            const auto lhs = src0[lane];
            const auto rhs = src1[lane];

            // absolute difference of the upper and of the lower half-words
            const auto diffHi = std::abs(bits(lhs, 31, 16) - bits(rhs, 31, 16));
            const auto diffLo = std::abs(bits(lhs, 15, 0) - bits(rhs, 15, 0));

            vdst[lane] = diffHi + diffLo + src2[lane];
        }

        vdst.write();
    }

    // Construct the VOP3 "v_sad_u32" instruction from the given InFmt_VOP3
    // encoding and flag it as an ALU operation.
    Inst_VOP3__V_SAD_U32::Inst_VOP3__V_SAD_U32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_sad_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_SAD_U32

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_SAD_U32::~Inst_VOP3__V_SAD_U32()
    {
    } // ~Inst_VOP3__V_SAD_U32

    // v_sad_u32: dword absolute difference with accumulation.
    // D.u = abs(S0.i - S1.i) + S2.u.
    void
    Inst_VOP3__V_SAD_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * ABS/NEG input modifiers exist for FP operations only, so none
         * of them may be set on this integer instruction.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                // lane is disabled by the execute mask
                continue;
            }

            // signed difference of the two sources, then accumulate S2
            const auto delta = src0[lane] - src1[lane];
            vdst[lane] = std::abs(delta) + src2[lane];
        }

        vdst.write();
    }

    // Construct the VOP3 "v_cvt_pk_u8_f32" instruction from the given
    // InFmt_VOP3 encoding and set its ALU and F32 flags.
    Inst_VOP3__V_CVT_PK_U8_F32::Inst_VOP3__V_CVT_PK_U8_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_pk_u8_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_PK_U8_F32

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_CVT_PK_U8_F32::~Inst_VOP3__V_CVT_PK_U8_F32()
    {
    } // ~Inst_VOP3__V_CVT_PK_U8_F32

    // v_cvt_pk_u8_f32: convert S0 to an 8-bit unsigned integer and pack it
    // into byte S1[1:0] of dword S2.
    // D.u = ((flt32_to_uint8(S0.f) & 0xff) << (8 * S1.u[1:0]))
    // | (S2.u & ~(0xff << (8 * S1.u[1:0]))).
    void
    Inst_VOP3__V_CVT_PK_U8_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        // Only src0 is a floating-point input, so it is the only operand
        // that may carry ABS/NEG modifiers.
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                // lane is disabled by the execute mask
                continue;
            }

            // bit offset of the destination byte selected by S1[1:0]
            const auto shift = 8 * bits(src1[lane], 1, 0);
            const auto converted = (VecElemU8)src0[lane] & 0xff;
            const auto keepMask = ~(0xff << shift);

            vdst[lane] = (converted << shift) | (src2[lane] & keepMask);
        }

        vdst.write();
    }

    // Construct the VOP3 "v_div_fixup_f32" instruction from the given
    // InFmt_VOP3 encoding and set its ALU and F32 flags.
    Inst_VOP3__V_DIV_FIXUP_F32::Inst_VOP3__V_DIV_FIXUP_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_div_fixup_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_DIV_FIXUP_F32

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_DIV_FIXUP_F32::~Inst_VOP3__V_DIV_FIXUP_F32()
    {
    } // ~Inst_VOP3__V_DIV_FIXUP_F32

    // D.f = Divide fixup and flags -- s0.f = Quotient, s1.f = Denominator,
    // s2.f = Numerator.
    // Note that this implementation derives the result from the numerator
    // and denominator only; the quotient operand is read (and has its
    // modifiers applied) but does not feed into the value written.
    void
    Inst_VOP3__V_DIV_FIXUP_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        // absolute-value modifiers, one bit per source operand
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (instData.ABS & 0x4) {
            src2.absModifier();
        }

        // negation modifiers, one bit per source operand
        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        if (extData.NEG & 0x4) {
            src2.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                // lane is disabled by the execute mask
                continue;
            }

            const auto denom = src1[lane];
            const auto numer = src2[lane];
            const bool denomNegative = std::signbit(denom);

            if (std::fpclassify(denom) == FP_ZERO) {
                // zero denominator: infinity carrying the denominator's sign
                vdst[lane] = denomNegative ? -INFINITY : +INFINITY;
            } else if (std::isnan(numer) || std::isnan(denom)) {
                vdst[lane] = NAN;
            } else if (std::isinf(denom)) {
                // infinite denominator: infinity with the denominator's sign
                vdst[lane] = denomNegative ? -INFINITY : +INFINITY;
            } else {
                vdst[lane] = numer / denom;
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_DIV_FIXUP_F64 class methods ---

    // Construct the VOP3 "v_div_fixup_f64" instruction from the given
    // InFmt_VOP3 encoding and set its ALU and F64 flags.
    Inst_VOP3__V_DIV_FIXUP_F64::Inst_VOP3__V_DIV_FIXUP_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_div_fixup_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_DIV_FIXUP_F64

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_DIV_FIXUP_F64::~Inst_VOP3__V_DIV_FIXUP_F64()
    {
    } // ~Inst_VOP3__V_DIV_FIXUP_F64

    // D.d = Divide fixup and flags -- s0.d = Quotient, s1.d = Denominator,
    // s2.d = Numerator.
    // Special numerator/denominator combinations (NaN, 0/0, inf/inf, x/0,
    // inf/y, x/inf, 0/y) override the quotient; otherwise the quotient is
    // passed through, with its sign forced to sign(s1) ^ sign(s2) in the
    // general case.
    void
    Inst_VOP3__V_DIV_FIXUP_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        // absolute-value modifiers, one bit per source operand
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (instData.ABS & 0x4) {
            src2.absModifier();
        }

        // negation modifiers, one bit per source operand
        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        if (extData.NEG & 0x4) {
            src2.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                // lane is disabled by the execute mask
                continue;
            }

            const auto quotient = src0[lane];
            const auto denom = src1[lane];
            const auto numer = src2[lane];

            // sign of the mathematically exact quotient
            const int sign_out = std::signbit(denom) ^ std::signbit(numer);

            // binary exponents as reported by frexp
            int exp1(0);
            int exp2(0);
            std::frexp(denom, &exp1);
            std::frexp(numer, &exp2);

            const bool denomZero = std::fpclassify(denom) == FP_ZERO;
            const bool numerZero = std::fpclassify(numer) == FP_ZERO;
            const bool denomInf = std::isinf(denom);
            const bool numerInf = std::isinf(numer);

            if (std::isnan(denom) || std::isnan(numer)) {
                vdst[lane] = std::numeric_limits<VecElemF64>::quiet_NaN();
            } else if ((denomZero && numerZero) || (denomInf && numerInf)) {
                // 0/0 or inf/inf
                vdst[lane]
                    = std::numeric_limits<VecElemF64>::signaling_NaN();
            } else if (denomZero || numerInf) {
                // x/0 or inf/y
                vdst[lane] = sign_out ? -INFINITY : +INFINITY;
            } else if (denomInf || numerZero) {
                // x/inf or 0/y
                vdst[lane] = sign_out ? -0.0 : +0.0;
            } else if (exp2 - exp1 < -1075 || exp1 == 2047) {
                // both exponent-range cases hand the quotient through as-is
                vdst[lane] = quotient;
            } else {
                vdst[lane] = sign_out ? -std::fabs(quotient)
                    : std::fabs(quotient);
            }
        }

        vdst.write();
    }

    // Construct the "v_div_scale_f32" instruction from the given
    // InFmt_VOP3_SDST_ENC encoding and set its ALU, WritesVCC and F32 flags.
    Inst_VOP3__V_DIV_SCALE_F32::Inst_VOP3__V_DIV_SCALE_F32(
          InFmt_VOP3_SDST_ENC *iFmt)
        : Inst_VOP3_SDST_ENC(iFmt, "v_div_scale_f32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(F32);
    } // Inst_VOP3__V_DIV_SCALE_F32

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_DIV_SCALE_F32::~Inst_VOP3__V_DIV_SCALE_F32()
    {
    } // ~Inst_VOP3__V_DIV_SCALE_F32

    // {vcc,D.f} = Divide preop and flags -- s0.f = Quotient, s1.f =
    // Denominator, s2.f = Numerator -- s0 must equal s1 or s2. Given a
    // numerator and denominator, this opcode will appropriately scale inputs
    // for division to avoid subnormal terms during Newton-Raphson correction
    // algorithm. This opcode produces a VCC flag for post-scale of quotient.
    //
    // This implementation performs no scaling: for each active lane it
    // copies s0 to the destination and clears that lane's bit in the scalar
    // destination.
    void
    Inst_VOP3__V_DIV_SCALE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
        ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        // negation modifiers, one bit per source operand
        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        if (extData.NEG & 0x4) {
            src2.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                // lane is disabled by the execute mask
                continue;
            }

            vcc.setBit(lane, 0);
            vdst[lane] = src0[lane];
        }

        vcc.write();
        vdst.write();
    } // execute
    // --- Inst_VOP3__V_DIV_SCALE_F64 class methods ---

    // Construct the "v_div_scale_f64" instruction from the given
    // InFmt_VOP3_SDST_ENC encoding and set its ALU, WritesVCC and F64 flags.
    Inst_VOP3__V_DIV_SCALE_F64::Inst_VOP3__V_DIV_SCALE_F64(
          InFmt_VOP3_SDST_ENC *iFmt)
        : Inst_VOP3_SDST_ENC(iFmt, "v_div_scale_f64")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(F64);
    } // Inst_VOP3__V_DIV_SCALE_F64

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_DIV_SCALE_F64::~Inst_VOP3__V_DIV_SCALE_F64()
    {
    } // ~Inst_VOP3__V_DIV_SCALE_F64

    // {vcc,D.d} = Divide preop and flags -- s0.d = Quotient, s1.d =
    // Denominator, s2.d = Numerator -- s0 must equal s1 or s2. Given a
    // numerator and denominator, this opcode will appropriately scale inputs
    // for division to avoid subnormal terms during Newton-Raphson correction
    // algorithm. This opcode produces a VCC flag for post-scale of quotient.
    //
    // For every active lane the lane's bit in the scalar destination is
    // first cleared, then a chain of mutually exclusive cases decides
    // whether to set that bit and/or write a scaled copy of s0. Several
    // paths (including the implicit final "else") make no assignment to
    // vdst[lane] at all.
    void
    Inst_VOP3__V_DIV_SCALE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
        ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        // negation modifiers, one bit per source operand
        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        if (extData.NEG & 0x4) {
            src2.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // binary exponents of denominator (exp1) and numerator
                // (exp2) as reported by frexp
                int exp1(0);
                int exp2(0);
                std::frexp(src1[lane], &exp1);
                std::frexp(src2[lane], &exp2);
                // default: no post-scale flag for this lane
                vcc.setBit(lane, 0);

                if (std::fpclassify(src1[lane]) == FP_ZERO
                    || std::fpclassify(src2[lane]) == FP_ZERO) {
                    // either operand is zero
                    vdst[lane] = NAN;
                } else if (exp2 - exp1 >= 768) {
                    // numerator exponent exceeds the denominator's by at
                    // least 768: flag the lane, scale only if s0 is the
                    // denominator
                    vcc.setBit(lane, 1);
                    if (src0[lane] == src1[lane]) {
                        vdst[lane] = std::ldexp(src0[lane], 128);
                    }
                } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL) {
                    // subnormal denominator: scale s0 up by 2^128
                    vdst[lane] = std::ldexp(src0[lane], 128);
                } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL
                           && std::fpclassify(src2[lane] / src1[lane])
                           == FP_SUBNORMAL) {
                    // both the reciprocal and the quotient are subnormal
                    vcc.setBit(lane, 1);
                    if (src0[lane] == src1[lane]) {
                        vdst[lane] = std::ldexp(src0[lane], 128);
                    }
                } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL) {
                    // only the reciprocal is subnormal: scale s0 down
                    vdst[lane] = std::ldexp(src0[lane], -128);
                } else if (std::fpclassify(src2[lane] / src1[lane])
                           == FP_SUBNORMAL) {
                    // only the quotient is subnormal: flag the lane, scale
                    // only if s0 is the numerator
                    vcc.setBit(lane, 1);
                    if (src0[lane] == src2[lane]) {
                        vdst[lane] = std::ldexp(src0[lane], 128);
                    }
                } else if (exp2 <= 53) {
                    // small numerator exponent: scale s0 up by 2^128
                    vdst[lane] = std::ldexp(src0[lane], 128);
                }
            }
        }

        vcc.write();
        vdst.write();
    }

    // Construct the VOP3 "v_div_fmas_f32" instruction from the given
    // InFmt_VOP3 encoding and set its ALU, ReadsVCC, F32 and FMA flags.
    Inst_VOP3__V_DIV_FMAS_F32::Inst_VOP3__V_DIV_FMAS_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_div_fmas_f32", false)
    {
        setFlag(ALU);
        setFlag(ReadsVCC);
        setFlag(F32);
        setFlag(FMA);
    } // Inst_VOP3__V_DIV_FMAS_F32

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_DIV_FMAS_F32::~Inst_VOP3__V_DIV_FMAS_F32()
    {
    } // ~Inst_VOP3__V_DIV_FMAS_F32

    // D.f = Special case divide FMA with scale and flags(s0.f = Quotient,
    // s1.f = Denominator, s2.f = Numerator)
    //
    // Computes fma(s0, s1, s2) for every active lane and commits the
    // single-precision result to the destination VGPR.
    // NOTE(review): unlike the F64 variant, no VCC-dependent post-scale is
    // applied here; V_DIV_SCALE_F32 in this file always clears the VCC bit,
    // so the two stay consistent -- confirm before modelling real scaling.
    void
    Inst_VOP3__V_DIV_FMAS_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
        // The destination is a 32-bit float register, matching the sources.
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        // absolute-value modifiers, one bit per source operand
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (instData.ABS & 0x4) {
            src2.absModifier();
        }

        // negation modifiers, one bit per source operand
        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        if (extData.NEG & 0x4) {
            src2.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
            }
        }

        // Commit the result; without this the instruction has no effect.
        vdst.write();
    } // execute
    // --- Inst_VOP3__V_DIV_FMAS_F64 class methods ---

    // Construct the VOP3 "v_div_fmas_f64" instruction from the given
    // InFmt_VOP3 encoding and set its ALU, ReadsVCC, F64 and FMA flags.
    Inst_VOP3__V_DIV_FMAS_F64::Inst_VOP3__V_DIV_FMAS_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_div_fmas_f64", false)
    {
        setFlag(ALU);
        setFlag(ReadsVCC);
        setFlag(F64);
        setFlag(FMA);
    } // Inst_VOP3__V_DIV_FMAS_F64

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_DIV_FMAS_F64::~Inst_VOP3__V_DIV_FMAS_F64()
    {
    } // ~Inst_VOP3__V_DIV_FMAS_F64

    // D.d = Special case divide FMA with scale and flags(s0.d = Quotient,
    // s1.d = Denominator, s2.d = Numerator)
    //
    // Each active lane gets fma(s0, s1, s2); lanes whose VCC bit is set
    // additionally have that value multiplied by 2^64.
    void
    Inst_VOP3__V_DIV_FMAS_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();
        vcc.read();

        // absolute-value modifiers, one bit per source operand
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (instData.ABS & 0x4) {
            src2.absModifier();
        }

        // negation modifiers, one bit per source operand
        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        if (extData.NEG & 0x4) {
            src2.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                // lane is disabled by the execute mask
                continue;
            }

            const auto fused = std::fma(src0[lane], src1[lane], src2[lane]);

            // a set VCC bit requests the 2^64 post-scale for this lane
            vdst[lane] = bits(vcc.rawData(), lane)
                ? std::pow(2, 64) * fused : fused;
        }

        vdst.write();
    }

    // Construct the VOP3 "v_msad_u8" instruction from the given InFmt_VOP3
    // encoding and flag it as an ALU operation.
    Inst_VOP3__V_MSAD_U8::Inst_VOP3__V_MSAD_U8(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_msad_u8", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MSAD_U8

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_MSAD_U8::~Inst_VOP3__V_MSAD_U8()
    {
    } // ~Inst_VOP3__V_MSAD_U8

    // D.u = Masked Byte SAD with accum_lo(S0.u, S1.u, S2.u).
    // Not implemented: execution is handed to panicUnimplemented().
    void
    Inst_VOP3__V_MSAD_U8::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Construct the VOP3 "v_qsad_pk_u16_u8" instruction from the given
    // InFmt_VOP3 encoding and flag it as an ALU operation.
    Inst_VOP3__V_QSAD_PK_U16_U8::Inst_VOP3__V_QSAD_PK_U16_U8(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_qsad_pk_u16_u8", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_QSAD_PK_U16_U8

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_QSAD_PK_U16_U8::~Inst_VOP3__V_QSAD_PK_U16_U8()
    {
    } // ~Inst_VOP3__V_QSAD_PK_U16_U8

    // D.u = Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0],
    // S1.u[31:0], S2.u[63:0])
    // Not implemented: execution is handed to panicUnimplemented().
    void
    Inst_VOP3__V_QSAD_PK_U16_U8::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Construct the VOP3 "v_mqsad_pk_u16_u8" instruction from the given
    // InFmt_VOP3 encoding and flag it as an ALU operation.
    Inst_VOP3__V_MQSAD_PK_U16_U8::Inst_VOP3__V_MQSAD_PK_U16_U8(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mqsad_pk_u16_u8", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MQSAD_PK_U16_U8

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_MQSAD_PK_U16_U8::~Inst_VOP3__V_MQSAD_PK_U16_U8()
    {
    } // ~Inst_VOP3__V_MQSAD_PK_U16_U8

    // D.u = Masked Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0],
    // S1.u[31:0], S2.u[63:0])
    // Not implemented: execution is handed to panicUnimplemented().
    void
    Inst_VOP3__V_MQSAD_PK_U16_U8::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Construct the VOP3 "v_mqsad_u32_u8" instruction from the given
    // InFmt_VOP3 encoding and flag it as an ALU operation.
    Inst_VOP3__V_MQSAD_U32_U8::Inst_VOP3__V_MQSAD_U32_U8(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mqsad_u32_u8", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MQSAD_U32_U8

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_MQSAD_U32_U8::~Inst_VOP3__V_MQSAD_U32_U8()
    {
    } // ~Inst_VOP3__V_MQSAD_U32_U8

    // D.u128 = Masked Quad-Byte SAD with 32-bit accum_lo/hi(S0.u[63:0],
    // S1.u[31:0], S2.u[127:0])
    // Not implemented: execution is handed to panicUnimplemented().
    void
    Inst_VOP3__V_MQSAD_U32_U8::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Construct the "v_mad_u64_u32" instruction from the given
    // InFmt_VOP3_SDST_ENC encoding and set its ALU, WritesVCC and MAD flags.
    Inst_VOP3__V_MAD_U64_U32::Inst_VOP3__V_MAD_U64_U32(
          InFmt_VOP3_SDST_ENC *iFmt)
        : Inst_VOP3_SDST_ENC(iFmt, "v_mad_u64_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(MAD);
    } // Inst_VOP3__V_MAD_U64_U32

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_MAD_U64_U32::~Inst_VOP3__V_MAD_U64_U32()
    {
    } // ~Inst_VOP3__V_MAD_U64_U32

    // {vcc_out, D.u64} = S0.u32 * S1.u32 + S2.u64.
    // The muladd() helper fills in the destination element; the value it
    // returns is recorded as the lane's bit in the scalar destination.
    void
    Inst_VOP3__V_MAD_U64_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
        ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
        VecOperandU64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();
        vdst.read();

        /**
         * NEG input modifiers exist for FP operations only, so none of
         * them may be set on this integer instruction.
         */
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                // lane is disabled by the execute mask
                continue;
            }

            const auto carryOut = muladd(vdst[lane], src0[lane], src1[lane],
                src2[lane]);
            vcc.setBit(lane, carryOut);
        }

        vcc.write();
        vdst.write();
    }

    // Construct the "v_mad_i64_i32" instruction from the given
    // InFmt_VOP3_SDST_ENC encoding and set its ALU, WritesVCC and MAD flags.
    Inst_VOP3__V_MAD_I64_I32::Inst_VOP3__V_MAD_I64_I32(
          InFmt_VOP3_SDST_ENC *iFmt)
        : Inst_VOP3_SDST_ENC(iFmt, "v_mad_i64_i32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(MAD);
    } // Inst_VOP3__V_MAD_I64_I32

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_MAD_I64_I32::~Inst_VOP3__V_MAD_I64_I32()
    {
    } // ~Inst_VOP3__V_MAD_I64_I32

    // {vcc_out,D.i64} = S0.i32 * S1.i32 + S2.i64.
    // The muladd() helper fills in the destination element; the value it
    // returns is recorded as the lane's bit in the scalar destination.
    void
    Inst_VOP3__V_MAD_I64_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandI64 src2(gpuDynInst, extData.SRC2);
        ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
        VecOperandI64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * NEG input modifiers exist for FP operations only, so none of
         * them may be set on this integer instruction.
         */
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                // lane is disabled by the execute mask
                continue;
            }

            const auto carryOut = muladd(vdst[lane], src0[lane], src1[lane],
                src2[lane]);
            vcc.setBit(lane, carryOut);
        }

        vcc.write();
        vdst.write();
    }

    // Construct the VOP3 "v_mad_f16" instruction from the given InFmt_VOP3
    // encoding and set its ALU, F16 and MAD flags.
    Inst_VOP3__V_MAD_F16::Inst_VOP3__V_MAD_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mad_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(MAD);
    } // Inst_VOP3__V_MAD_F16

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_MAD_F16::~Inst_VOP3__V_MAD_F16()
    {
    } // ~Inst_VOP3__V_MAD_F16

    // D.f16 = S0.f16 * S1.f16 + S2.f16.
    // Supports round mode, exception flags, saturation.
    // Not implemented: execution is handed to panicUnimplemented().
    void
    Inst_VOP3__V_MAD_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Construct the VOP3 "v_mad_u16" instruction from the given InFmt_VOP3
    // encoding and set its ALU and MAD flags.
    Inst_VOP3__V_MAD_U16::Inst_VOP3__V_MAD_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mad_u16", false)
    {
        setFlag(ALU);
        setFlag(MAD);
    } // Inst_VOP3__V_MAD_U16

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_MAD_U16::~Inst_VOP3__V_MAD_U16()
    {
    } // ~Inst_VOP3__V_MAD_U16

    // D.u16 = S0.u16 * S1.u16 + S2.u16.
    // Supports saturation (unsigned 16-bit integer domain).
    //
    // NOTE(review): no saturation/clamp handling is performed in this
    // function; the wide result is simply narrowed when it is stored into
    // the 16-bit destination element.
    void
    Inst_VOP3__V_MAD_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU16 src2(gpuDynInst, extData.SRC2);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // 16-bit operands would otherwise promote to signed int,
                // where 0xffff * 0xffff overflows (undefined behaviour).
                // Doing the arithmetic in unsigned keeps it well defined
                // and yields the same low 16 bits.
                vdst[lane] = static_cast<unsigned>(src0[lane]) * src1[lane]
                    + src2[lane];
            }
        }

        vdst.write();
    }

    // Construct the VOP3 "v_mad_i16" instruction from the given InFmt_VOP3
    // encoding and set its ALU and MAD flags.
    Inst_VOP3__V_MAD_I16::Inst_VOP3__V_MAD_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mad_i16", false)
    {
        setFlag(ALU);
        setFlag(MAD);
    } // Inst_VOP3__V_MAD_I16

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_MAD_I16::~Inst_VOP3__V_MAD_I16()
    {
    } // ~Inst_VOP3__V_MAD_I16

    // v_mad_i16: signed 16-bit multiply-add.
    // D.i16 = S0.i16 * S1.i16 + S2.i16.
    // Supports saturation (signed 16-bit integer domain).
    void
    Inst_VOP3__V_MAD_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandI16 src2(gpuDynInst, extData.SRC2);
        VecOperandI16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * ABS/NEG input modifiers exist for FP operations only, so none
         * of them may be set on this integer instruction.
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                // lane is disabled by the execute mask
                continue;
            }

            const auto product = src0[lane] * src1[lane];
            vdst[lane] = product + src2[lane];
        }

        vdst.write();
    }

    // Construct the VOP3 "v_perm_b32" instruction from the given InFmt_VOP3
    // encoding and flag it as an ALU operation.
    Inst_VOP3__V_PERM_B32::Inst_VOP3__V_PERM_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_perm_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_PERM_B32

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_PERM_B32::~Inst_VOP3__V_PERM_B32()
    {
    } // ~Inst_VOP3__V_PERM_B32

    // D.u[31:24] = permute({S0.u, S1.u}, S2.u[31:24]);
    // D.u[23:16] = permute({S0.u, S1.u}, S2.u[23:16]);
    // D.u[15:8] = permute({S0.u, S1.u}, S2.u[15:8]);
    // D.u[7:0] = permute({S0.u, S1.u}, S2.u[7:0]);
    // byte permute(byte in[8], byte sel) {
    //     if(sel>=13) then return 0xff;
    //     elsif(sel==12) then return 0x00;
    //     elsif(sel==11) then return in[7][7] * 0xff;
    //     elsif(sel==10) then return in[5][7] * 0xff;
    //     elsif(sel==9) then return in[3][7] * 0xff;
    //     elsif(sel==8) then return in[1][7] * 0xff;
    //     else return in[sel];
    //     }
    //
    // For each active lane the 64-bit value {S0, S1} (S0 in the upper
    // dword) is built, and every byte of S2 is used as a selector to pick
    // the matching destination byte through permute().
    void
    Inst_VOP3__V_PERM_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                VecElemU64 selector = (VecElemU64)src0[lane];
                selector = (selector << 32) | (VecElemU64)src1[lane];
                vdst[lane] = 0;

                DPRINTF(GCN3, "Executing v_perm_b32 src_0 0x%08x, src_1 "
                        "0x%08x, src_2 0x%08x, vdst 0x%08x\n", src0[lane],
                        src1[lane], src2[lane], vdst[lane]);
                DPRINTF(GCN3, "Selector: 0x%08x \n", selector);

                for (int i = 0; i < 4 ; ++i) {
                    // byte i of S2 selects the source for byte i of D
                    VecElemU32 permuted_val = permute(selector, 0xFF
                        & ((VecElemU32)src2[lane] >> (8 * i)));
                    // place the selected byte in byte lane i (bit 8 * i),
                    // not at bit offset i
                    vdst[lane] |= (permuted_val << (8 * i));
                }

                DPRINTF(GCN3, "v_perm result: 0x%08x\n", vdst[lane]);
            }
        }

        vdst.write();
    }

    // Construct the VOP3 "v_fma_f16" instruction from the given InFmt_VOP3
    // encoding and set its ALU, F16 and FMA flags.
    Inst_VOP3__V_FMA_F16::Inst_VOP3__V_FMA_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_fma_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(FMA);
    } // Inst_VOP3__V_FMA_F16

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_FMA_F16::~Inst_VOP3__V_FMA_F16()
    {
    } // ~Inst_VOP3__V_FMA_F16

    // D.f16 = S0.f16 * S1.f16 + S2.f16.
    // Fused half precision multiply add.
    // Not implemented: execution is handed to panicUnimplemented().
    void
    Inst_VOP3__V_FMA_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Construct the VOP3 "v_div_fixup_f16" instruction from the given
    // InFmt_VOP3 encoding and set its ALU and F16 flags.
    Inst_VOP3__V_DIV_FIXUP_F16::Inst_VOP3__V_DIV_FIXUP_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_div_fixup_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_DIV_FIXUP_F16

    // Empty destructor: this class does no cleanup of its own.
    Inst_VOP3__V_DIV_FIXUP_F16::~Inst_VOP3__V_DIV_FIXUP_F16()
    {
    } // ~Inst_VOP3__V_DIV_FIXUP_F16

    // sign_out =  sign(S1.f16)^sign(S2.f16);
    // if (S2.f16 == NAN)
    //     D.f16 = Quiet(S2.f16);
    // else if (S1.f16 == NAN)
    //     D.f16 = Quiet(S1.f16);
    // else if (S1.f16 == S2.f16 == 0)
    //     # 0/0
    //     D.f16 = pele_nan(0xfe00);
    // else if (abs(S1.f16) == abs(S2.f16) == +-INF)
    //     # inf/inf
    //     D.f16 = pele_nan(0xfe00);
    // else if (S1.f16 ==0 || abs(S2.f16) == +-INF)
    //     # x/0, or inf/y
    //     D.f16 = sign_out ? -INF : INF;
    // else if (abs(S1.f16) == +-INF || S2.f16 == 0)
    //     # x/inf, 0/y
    //     D.f16 = sign_out ? -0 : 0;
    // else if ((exp(S2.f16) - exp(S1.f16)) < -150)
    //     D.f16 = sign_out ? -underflow : underflow;
    // else if (exp(S1.f16) == 255)
    //     D.f16 = sign_out ? -overflow : overflow;
    // else
    //     D.f16 = sign_out ? -abs(S0.f16) : abs(S0.f16).
    // Half precision division fixup.
    // S0 = Quotient, S1 = Denominator, S2 = Numerator.
    // Given a numerator, denominator, and quotient from a divide, this opcode
    // will detect and apply special case numerics, touching up the quotient if
    // necessary. This opcode also generates invalid, denorm and divide by
    // zero exceptions caused by the division.
    // Not implemented: execution is handed to panicUnimplemented().
    void
    Inst_VOP3__V_DIV_FIXUP_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs the v_cvt_pkaccum_u8_f32 VOP3 instruction and tags it with
    // the ALU and F32 flags.
    Inst_VOP3__V_CVT_PKACCUM_U8_F32::Inst_VOP3__V_CVT_PKACCUM_U8_F32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_pkaccum_u8_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_PKACCUM_U8_F32

    // Destructor; the body is empty.
    Inst_VOP3__V_CVT_PKACCUM_U8_F32::~Inst_VOP3__V_CVT_PKACCUM_U8_F32()
    {
    } // ~Inst_VOP3__V_CVT_PKACCUM_U8_F32

    // byte = S1.u[1:0]; bit = byte * 8;
    // D.u[bit + 7:bit] = flt32_to_uint8(S0.f);
    // Pack converted value of S0.f into byte S1 of the destination.
    // SQ translates to V_CVT_PK_U8_F32.
    // Note: this opcode uses src_c to pass destination in as a source.
    // Not implemented here: execute() only calls panicUnimplemented().
    void
    Inst_VOP3__V_CVT_PKACCUM_U8_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs the v_interp_p1_f32 VOP3 instruction and tags it with the
    // ALU and F32 flags.
    Inst_VOP3__V_INTERP_P1_F32::Inst_VOP3__V_INTERP_P1_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_interp_p1_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_INTERP_P1_F32

    // Destructor; the body is empty.
    Inst_VOP3__V_INTERP_P1_F32::~Inst_VOP3__V_INTERP_P1_F32()
    {
    } // ~Inst_VOP3__V_INTERP_P1_F32

    // D.f = P10 * S.f + P0;
    // Not implemented here: execute() only calls panicUnimplemented().
    void
    Inst_VOP3__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs the v_interp_p2_f32 VOP3 instruction and tags it with the
    // ALU and F32 flags.
    Inst_VOP3__V_INTERP_P2_F32::Inst_VOP3__V_INTERP_P2_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_interp_p2_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_INTERP_P2_F32

    // Destructor; the body is empty.
    Inst_VOP3__V_INTERP_P2_F32::~Inst_VOP3__V_INTERP_P2_F32()
    {
    } // ~Inst_VOP3__V_INTERP_P2_F32

    // D.f = P20 * S.f + D.f;
    // Not implemented here: execute() only calls panicUnimplemented().
    void
    Inst_VOP3__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs the v_interp_mov_f32 VOP3 instruction and tags it with the
    // ALU and F32 flags.
    Inst_VOP3__V_INTERP_MOV_F32::Inst_VOP3__V_INTERP_MOV_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_interp_mov_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_INTERP_MOV_F32

    // Destructor; the body is empty.
    Inst_VOP3__V_INTERP_MOV_F32::~Inst_VOP3__V_INTERP_MOV_F32()
    {
    } // ~Inst_VOP3__V_INTERP_MOV_F32

    // D.f = {P10,P20,P0}[S.u]; parameter load.
    // Not implemented here: execute() only calls panicUnimplemented().
    void
    Inst_VOP3__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs the v_interp_p1ll_f16 VOP3 instruction and tags it with the
    // ALU and F16 flags.
    Inst_VOP3__V_INTERP_P1LL_F16::Inst_VOP3__V_INTERP_P1LL_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_interp_p1ll_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_INTERP_P1LL_F16

    // Destructor; the body is empty.
    Inst_VOP3__V_INTERP_P1LL_F16::~Inst_VOP3__V_INTERP_P1LL_F16()
    {
    } // ~Inst_VOP3__V_INTERP_P1LL_F16

    // D.f32 = P10.f16 * S0.f32 + P0.f16.
    // Not implemented here: execute() only calls panicUnimplemented().
    void
    Inst_VOP3__V_INTERP_P1LL_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs the v_interp_p1lv_f16 VOP3 instruction and tags it with the
    // ALU and F16 flags.
    Inst_VOP3__V_INTERP_P1LV_F16::Inst_VOP3__V_INTERP_P1LV_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_interp_p1lv_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_INTERP_P1LV_F16

    // Destructor; the body is empty.
    Inst_VOP3__V_INTERP_P1LV_F16::~Inst_VOP3__V_INTERP_P1LV_F16()
    {
    } // ~Inst_VOP3__V_INTERP_P1LV_F16

    // Not implemented here: execute() only calls panicUnimplemented().
    void
    Inst_VOP3__V_INTERP_P1LV_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs the v_interp_p2_f16 VOP3 instruction and tags it with the
    // ALU and F16 flags.
    Inst_VOP3__V_INTERP_P2_F16::Inst_VOP3__V_INTERP_P2_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_interp_p2_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_INTERP_P2_F16

    // Destructor; the body is empty.
    Inst_VOP3__V_INTERP_P2_F16::~Inst_VOP3__V_INTERP_P2_F16()
    {
    } // ~Inst_VOP3__V_INTERP_P2_F16

    // D.f16 = P20.f16 * S0.f32 + S2.f32.
    // Not implemented here: execute() only calls panicUnimplemented().
    void
    Inst_VOP3__V_INTERP_P2_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs the v_add_f64 VOP3 instruction and tags it with the ALU and
    // F64 flags.
    Inst_VOP3__V_ADD_F64::Inst_VOP3__V_ADD_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_add_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_ADD_F64

    // Destructor; the body is empty.
    Inst_VOP3__V_ADD_F64::~Inst_VOP3__V_ADD_F64()
    {
    } // ~Inst_VOP3__V_ADD_F64

    // D.d = S0.d + S1.d.
    // Double precision add. The ABS and NEG modifier bits 0/1 are applied to
    // S0/S1 (ABS first, then NEG) before the per-lane computation. For each
    // lane enabled in the exec mask the result is:
    //   - NaN if either input is NaN, or if the inputs are infinities of
    //     opposite sign;
    //   - the infinite input if exactly one input is infinite (or both are
    //     infinities of the same sign);
    //   - a signed zero if both inputs are zero or denormal (-0.0 only when
    //     both inputs carry the sign bit);
    //   - the other operand if exactly one input is zero or denormal;
    //   - S0 + S1 otherwise.
    void
    Inst_VOP3__V_ADD_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        /**
         * only the src0/src1 modifier bits are handled above; bit 2 of
         * ABS and NEG must be clear
         */
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                continue;
            }

            const auto a = src0[lane];
            const auto b = src1[lane];

            // zero and denormal inputs are both handled as zero below
            const bool a_tiny = std::fpclassify(a) == FP_SUBNORMAL ||
                                std::fpclassify(a) == FP_ZERO;
            const bool b_tiny = std::fpclassify(b) == FP_SUBNORMAL ||
                                std::fpclassify(b) == FP_ZERO;

            if (std::isnan(a) || std::isnan(b)) {
                vdst[lane] = NAN;
            } else if (std::isinf(a) && std::isinf(b)) {
                // inf + inf keeps its sign, inf - inf is invalid
                if (std::signbit(a) != std::signbit(b)) {
                    vdst[lane] = NAN;
                } else {
                    vdst[lane] = a;
                }
            } else if (std::isinf(a)) {
                vdst[lane] = a;
            } else if (std::isinf(b)) {
                vdst[lane] = b;
            } else if (a_tiny && b_tiny) {
                if (std::signbit(a) && std::signbit(b)) {
                    vdst[lane] = -0.0;
                } else {
                    vdst[lane] = 0.0;
                }
            } else if (a_tiny) {
                vdst[lane] = b;
            } else if (b_tiny) {
                vdst[lane] = a;
            } else {
                vdst[lane] = a + b;
            }
        }

        vdst.write();
    }

    // Constructs the v_mul_f64 VOP3 instruction and tags it with the ALU and
    // F64 flags.
    Inst_VOP3__V_MUL_F64::Inst_VOP3__V_MUL_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mul_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_MUL_F64

    // Destructor; the body is empty.
    Inst_VOP3__V_MUL_F64::~Inst_VOP3__V_MUL_F64()
    {
    } // ~Inst_VOP3__V_MUL_F64

    // D.d = S0.d * S1.d.
    // Double precision multiply. ABS/NEG bits 0 and 1 modify S0 and S1
    // before the multiply. Per active lane:
    //   - NaN in either input gives NaN;
    //   - S0 zero/denormal: NaN if S1 is infinite, otherwise a zero whose
    //     sign is the XOR of the input signs;
    //   - S0 infinite: NaN if S1 is zero/denormal, otherwise an infinity
    //     whose sign is the XOR of the input signs;
    //   - everything else is the plain product.
    void
    Inst_VOP3__V_MUL_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // per operand: absolute value first, then negation
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }
        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }
        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        // bit 2 of either modifier field is not handled by this opcode
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                continue;
            }

            const auto lhs = src0[lane];
            const auto rhs = src1[lane];

            const bool lhs_tiny = std::fpclassify(lhs) == FP_SUBNORMAL ||
                                  std::fpclassify(lhs) == FP_ZERO;
            const bool rhs_tiny = std::fpclassify(rhs) == FP_SUBNORMAL ||
                                  std::fpclassify(rhs) == FP_ZERO;
            // result of the special cases is negative iff signs differ
            const bool negative = std::signbit(lhs) != std::signbit(rhs);

            if (std::isnan(lhs) || std::isnan(rhs)) {
                vdst[lane] = NAN;
            } else if (lhs_tiny) {
                if (std::isinf(rhs)) {
                    vdst[lane] = NAN;
                } else if (negative) {
                    vdst[lane] = -0.0;
                } else {
                    vdst[lane] = +0.0;
                }
            } else if (std::isinf(lhs)) {
                if (rhs_tiny) {
                    vdst[lane] = NAN;
                } else if (negative) {
                    vdst[lane] = -INFINITY;
                } else {
                    vdst[lane] = +INFINITY;
                }
            } else {
                vdst[lane] = lhs * rhs;
            }
        }

        vdst.write();
    }

    // Constructs the v_min_f64 VOP3 instruction and tags it with the ALU and
    // F64 flags.
    Inst_VOP3__V_MIN_F64::Inst_VOP3__V_MIN_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_min_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_MIN_F64

    // Destructor; the body is empty.
    Inst_VOP3__V_MIN_F64::~Inst_VOP3__V_MIN_F64()
    {
    } // ~Inst_VOP3__V_MIN_F64

    // D.d = min(S0.d, S1.d).
    // Per active lane, writes std::fmin of the two (optionally ABS/NEG
    // modified) double precision sources.
    void
    Inst_VOP3__V_MIN_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // per operand: absolute value first, then negation
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }
        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }
        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        // bit 2 of either modifier field is not handled by this opcode
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                continue;
            }

            const auto lhs = src0[lane];
            const auto rhs = src1[lane];
            vdst[lane] = std::fmin(lhs, rhs);
        }

        vdst.write();
    }

    // Constructs the v_max_f64 VOP3 instruction and tags it with the ALU and
    // F64 flags.
    Inst_VOP3__V_MAX_F64::Inst_VOP3__V_MAX_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_max_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_MAX_F64

    // Destructor; the body is empty.
    Inst_VOP3__V_MAX_F64::~Inst_VOP3__V_MAX_F64()
    {
    } // ~Inst_VOP3__V_MAX_F64

    // D.d = max(S0.d, S1.d).
    // Per active lane, writes std::fmax of the two (optionally ABS/NEG
    // modified) double precision sources.
    void
    Inst_VOP3__V_MAX_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // per operand: absolute value first, then negation
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }
        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }
        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        // bit 2 of either modifier field is not handled by this opcode
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                continue;
            }

            const auto lhs = src0[lane];
            const auto rhs = src1[lane];
            vdst[lane] = std::fmax(lhs, rhs);
        }

        vdst.write();
    }

    // Constructs the v_ldexp_f64 VOP3 instruction and tags it with the ALU
    // and F64 flags.
    Inst_VOP3__V_LDEXP_F64::Inst_VOP3__V_LDEXP_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_ldexp_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_LDEXP_F64

    // Destructor; the body is empty.
    Inst_VOP3__V_LDEXP_F64::~Inst_VOP3__V_LDEXP_F64()
    {
    } // ~Inst_VOP3__V_LDEXP_F64

    // D.d = S0.d * 2^S1.i[31:0], computed with std::ldexp.
    // S0 may carry the ABS/NEG input modifiers (bit 0); S1 is a signed
    // integer exponent and takes no modifiers. Per active lane:
    //   - NaN and infinite S0 values pass through unchanged;
    //   - zero or denormal S0 values produce a zero with the sign of S0;
    //   - all other values are scaled by 2^S1.
    void
    Inst_VOP3__V_LDEXP_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
        // the exponent is signed (S1.i), matching v_ldexp_f32
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                continue;
            }

            const auto value = src0[lane];
            const int value_class = std::fpclassify(value);

            if (std::isnan(value) || std::isinf(value)) {
                vdst[lane] = value;
            } else if (value_class == FP_SUBNORMAL
                       || value_class == FP_ZERO) {
                // denormals are treated as zero; only the sign survives
                if (std::signbit(value)) {
                    vdst[lane] = -0.0;
                } else {
                    vdst[lane] = +0.0;
                }
            } else {
                vdst[lane] = std::ldexp(value, src1[lane]);
            }
        }

        vdst.write();
    }

    // Constructs the v_mul_lo_u32 VOP3 instruction and tags it with the ALU
    // flag.
    Inst_VOP3__V_MUL_LO_U32::Inst_VOP3__V_MUL_LO_U32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mul_lo_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MUL_LO_U32

    // Destructor; the body is empty.
    Inst_VOP3__V_MUL_LO_U32::~Inst_VOP3__V_MUL_LO_U32()
    {
    } // ~Inst_VOP3__V_MUL_LO_U32

    // D.u = S0.u * S1.u.
    // Per active lane, writes the low 32 bits of the unsigned 64-bit
    // product of the two 32-bit sources.
    void
    Inst_VOP3__V_MUL_LO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Widen to unsigned 64 bits: the product of two u32 values
                // can exceed the signed 64-bit range, which would be
                // undefined behaviour in a signed multiply.
                const VecElemU64 s0 = static_cast<VecElemU64>(src0[lane]);
                const VecElemU64 s1 = static_cast<VecElemU64>(src1[lane]);
                vdst[lane]
                    = static_cast<VecElemU32>((s0 * s1) & 0xffffffffULL);
            }
        }

        vdst.write();
    }

    // Constructs the v_mul_hi_u32 VOP3 instruction and tags it with the ALU
    // flag.
    Inst_VOP3__V_MUL_HI_U32::Inst_VOP3__V_MUL_HI_U32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mul_hi_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MUL_HI_U32

    // Destructor; the body is empty.
    Inst_VOP3__V_MUL_HI_U32::~Inst_VOP3__V_MUL_HI_U32()
    {
    } // ~Inst_VOP3__V_MUL_HI_U32

    // D.u = (S0.u * S1.u) >> 32.
    // Per active lane, writes the high 32 bits of the unsigned 64-bit
    // product of the two 32-bit sources.
    void
    Inst_VOP3__V_MUL_HI_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Widen to unsigned 64 bits: the product of two u32 values
                // can exceed the signed 64-bit range, and an unsigned shift
                // brings the upper half down without sign extension.
                const VecElemU64 s0 = static_cast<VecElemU64>(src0[lane]);
                const VecElemU64 s1 = static_cast<VecElemU64>(src1[lane]);
                vdst[lane] = static_cast<VecElemU32>((s0 * s1) >> 32);
            }
        }

        vdst.write();
    }

    // Constructs the v_mul_hi_i32 VOP3 instruction and tags it with the ALU
    // flag.
    Inst_VOP3__V_MUL_HI_I32::Inst_VOP3__V_MUL_HI_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mul_hi_i32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MUL_HI_I32

    // Destructor; the body is empty.
    Inst_VOP3__V_MUL_HI_I32::~Inst_VOP3__V_MUL_HI_I32()
    {
    } // ~Inst_VOP3__V_MUL_HI_I32

    // D.i = (S0.i * S1.i) >> 32.
    // Per active lane, the two signed 32-bit sources are widened to 64 bits,
    // multiplied, and the upper 32 bits of the product are written back.
    void
    Inst_VOP3__V_MUL_HI_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // integer opcode: none of the ABS/NEG modifier bits may be set
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                continue;
            }

            const VecElemI64 lhs = (VecElemI64)src0[lane];
            const VecElemI64 rhs = (VecElemI64)src1[lane];
            const VecElemI64 product = lhs * rhs;
            const VecElemI64 upper = (product >> 32LL) & 0xffffffffLL;
            vdst[lane] = (VecElemI32)upper;
        }

        vdst.write();
    }

    // Constructs the v_ldexp_f32 VOP3 instruction and tags it with the ALU
    // and F32 flags.
    Inst_VOP3__V_LDEXP_F32::Inst_VOP3__V_LDEXP_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_ldexp_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_LDEXP_F32

    // Destructor; the body is empty.
    Inst_VOP3__V_LDEXP_F32::~Inst_VOP3__V_LDEXP_F32()
    {
    } // ~Inst_VOP3__V_LDEXP_F32

    // D.f = S0.f * 2^S1.i, computed with std::ldexp.
    // S0 may carry the ABS/NEG input modifiers (bit 0), applied ABS first and
    // then NEG, as in v_ldexp_f64; S1 is a signed integer exponent and takes
    // no modifiers.
    void
    Inst_VOP3__V_LDEXP_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Bit 0 of ABS/NEG is not rejected by the asserts below, so it must
        // be honoured for the floating-point operand rather than ignored.
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::ldexp(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    }

    // Constructs the v_readlane_b32 VOP3 instruction and tags it with the
    // ALU and IgnoreExec flags. Unlike the neighbouring opcodes it passes
    // true as the third Inst_VOP3 argument.
    // NOTE(review): presumably because the destination is an SGPR -- confirm
    // against the Inst_VOP3 constructor.
    Inst_VOP3__V_READLANE_B32::Inst_VOP3__V_READLANE_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_readlane_b32", true)
    {
        setFlag(ALU);
        setFlag(IgnoreExec);
    } // Inst_VOP3__V_READLANE_B32

    // Destructor; the body is empty.
    Inst_VOP3__V_READLANE_B32::~Inst_VOP3__V_READLANE_B32()
    {
    } // ~Inst_VOP3__V_READLANE_B32

    // Copies the value held by one lane of a VGPR into an SGPR.
    // D = SGPR destination, S0 = source data (VGPR# or M0(lds-direct)),
    // S1 = lane select (SGPR or M0); only the low six bits of S1 are used.
    // The exec mask is not consulted. This is an untyped operation, so no
    // input or output modifiers are accepted.
    void
    Inst_VOP3__V_READLANE_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1);
        ScalarOperandU32 sdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        // untyped opcode: none of the ABS/NEG modifier bits may be set
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // keep the selector inside the 0..63 lane range
        const auto selected_lane = src1.rawData() & 0x3f;
        sdst = src0[selected_lane];

        sdst.write();
    }

    // Constructs the v_writelane_b32 VOP3 instruction and tags it with the
    // ALU and IgnoreExec flags.
    Inst_VOP3__V_WRITELANE_B32::Inst_VOP3__V_WRITELANE_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_writelane_b32", false)
    {
        setFlag(ALU);
        setFlag(IgnoreExec);
    } // Inst_VOP3__V_WRITELANE_B32

    // Destructor; the body is empty.
    Inst_VOP3__V_WRITELANE_B32::~Inst_VOP3__V_WRITELANE_B32()
    {
    } // ~Inst_VOP3__V_WRITELANE_B32

    // Writes a scalar value into a single lane of a VGPR.
    // D = VGPR destination, S0 = source data (sgpr, m0, exec or constants),
    // S1 = lane select (SGPR or M0); only the low six bits of S1 are used.
    // The exec mask is not consulted. This is an untyped operation, so no
    // input or output modifiers are accepted.
    void
    Inst_VOP3__V_WRITELANE_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.read();
        src1.read();
        // the destination is read first because only one lane is replaced
        vdst.read();

        // untyped opcode: none of the ABS/NEG modifier bits may be set
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // keep the selector inside the 0..63 lane range
        const auto selected_lane = src1.rawData() & 0x3f;
        vdst[selected_lane] = src0.rawData();

        vdst.write();
    }

    // Constructs the v_bcnt_u32_b32 VOP3 instruction and tags it with the
    // ALU flag.
    Inst_VOP3__V_BCNT_U32_B32::Inst_VOP3__V_BCNT_U32_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_bcnt_u32_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_BCNT_U32_B32

    // Destructor; the body is empty.
    Inst_VOP3__V_BCNT_U32_B32::~Inst_VOP3__V_BCNT_U32_B32()
    {
    } // ~Inst_VOP3__V_BCNT_U32_B32

    // D.u = CountOneBits(S0.u) + S1.u. Bit count.
    // Per active lane, adds the number of set bits in S0 to S1.
    void
    Inst_VOP3__V_BCNT_U32_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // integer opcode: none of the ABS/NEG modifier bits may be set
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                continue;
            }

            const auto set_bits = popCount(src0[lane]);
            vdst[lane] = set_bits + src1[lane];
        }

        vdst.write();
    }

    // Constructs the v_mbcnt_lo_u32_b32 VOP3 instruction and tags it with
    // the ALU flag.
    Inst_VOP3__V_MBCNT_LO_U32_B32::Inst_VOP3__V_MBCNT_LO_U32_B32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mbcnt_lo_u32_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MBCNT_LO_U32_B32

    // Destructor; the body is empty.
    Inst_VOP3__V_MBCNT_LO_U32_B32::~Inst_VOP3__V_MBCNT_LO_U32_B32()
    {
    } // ~Inst_VOP3__V_MBCNT_LO_U32_B32

    // ThreadMask = (1 << ThreadPosition) - 1;
    // D.u = CountOneBits(S0.u & ThreadMask[31:0]) + S1.u.
    // Masked bit count, ThreadPosition is the position of this thread in the
    // wavefront (in 0..63).
    void
    Inst_VOP3__V_MBCNT_LO_U32_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Build the mask of all lanes below this one in unsigned
                // arithmetic: with a signed 1LL the lane-63 case
                // ((1 << 63) - 1) overflows.
                const uint64_t threadMask = (1ULL << lane) - 1ULL;
                vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) +
                             src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MBCNT_HI_U32_B32 class methods ---

    // Constructs the v_mbcnt_hi_u32_b32 VOP3 instruction and tags it with
    // the ALU flag.
    Inst_VOP3__V_MBCNT_HI_U32_B32::Inst_VOP3__V_MBCNT_HI_U32_B32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mbcnt_hi_u32_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MBCNT_HI_U32_B32

    // Destructor; the body is empty.
    Inst_VOP3__V_MBCNT_HI_U32_B32::~Inst_VOP3__V_MBCNT_HI_U32_B32()
    {
    } // ~Inst_VOP3__V_MBCNT_HI_U32_B32

    // ThreadMask = (1 << ThreadPosition) - 1;
    // D.u = CountOneBits(S0.u & ThreadMask[63:32]) + S1.u.
    // Masked bit count, ThreadPosition is the position of this thread in the
    // wavefront (in 0..63).
    void
    Inst_VOP3__V_MBCNT_HI_U32_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Build the mask of all lanes below this one in unsigned
                // arithmetic: with a signed 1LL the lane-63 case
                // ((1 << 63) - 1) overflows.
                const uint64_t threadMask = (1ULL << lane) - 1ULL;
                vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) +
                             src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_LSHLREV_B64 class methods ---

    // Constructs the v_lshlrev_b64 VOP3 instruction and tags it with the ALU
    // flag.
    Inst_VOP3__V_LSHLREV_B64::Inst_VOP3__V_LSHLREV_B64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_lshlrev_b64", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_LSHLREV_B64

    // Destructor; the body is empty.
    Inst_VOP3__V_LSHLREV_B64::~Inst_VOP3__V_LSHLREV_B64()
    {
    } // ~Inst_VOP3__V_LSHLREV_B64

    // D.u64 = S1.u64 << S0.u[5:0].
    // Per active lane, shifts the 64-bit S1 value left by the low six bits
    // of S0 (note the reversed operand roles).
    void
    Inst_VOP3__V_LSHLREV_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
        VecOperandU64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // integer opcode: none of the ABS/NEG modifier bits may be set
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                continue;
            }

            const auto shift_amount = bits(src0[lane], 5, 0);
            vdst[lane] = src1[lane] << shift_amount;
        }

        vdst.write();
    }

    // Constructs the v_lshrrev_b64 VOP3 instruction and tags it with the ALU
    // flag.
    Inst_VOP3__V_LSHRREV_B64::Inst_VOP3__V_LSHRREV_B64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_lshrrev_b64", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_LSHRREV_B64

    // Destructor; the body is empty.
    Inst_VOP3__V_LSHRREV_B64::~Inst_VOP3__V_LSHRREV_B64()
    {
    } // ~Inst_VOP3__V_LSHRREV_B64

    // D.u64 = S1.u64 >> S0.u[5:0].
    // The vacated bits are set to zero.
    // Per active lane, shifts the unsigned 64-bit S1 value right by the low
    // six bits of S0 (note the reversed operand roles).
    void
    Inst_VOP3__V_LSHRREV_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
        VecOperandU64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // integer opcode: none of the ABS/NEG modifier bits may be set
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                continue;
            }

            const auto shift_amount = bits(src0[lane], 5, 0);
            vdst[lane] = src1[lane] >> shift_amount;
        }

        vdst.write();
    }

    // Constructs the v_ashrrev_i64 VOP3 instruction and tags it with the ALU
    // flag.
    Inst_VOP3__V_ASHRREV_I64::Inst_VOP3__V_ASHRREV_I64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_ashrrev_i64", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_ASHRREV_I64

    // Destructor; the body is empty.
    Inst_VOP3__V_ASHRREV_I64::~Inst_VOP3__V_ASHRREV_I64()
    {
    } // ~Inst_VOP3__V_ASHRREV_I64

    // D.u64 = signext(S1.u64) >> S0.u[5:0].
    // The vacated bits are set to the sign bit of the input value.
    // S1 is read as a signed 64-bit operand so the right shift below acts on
    // a signed value; the shift count is the low six bits of S0.
    void
    Inst_VOP3__V_ASHRREV_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI64 src1(gpuDynInst, extData.SRC1);
        VecOperandU64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // integer opcode: none of the ABS/NEG modifier bits may be set
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wf->execMask(lane)) {
                continue;
            }

            const auto shift_amount = bits(src0[lane], 5, 0);
            vdst[lane] = src1[lane] >> shift_amount;
        }

        vdst.write();
    }

    // Constructor: passes iFmt, the mnemonic "v_trig_preop_f64" and false
    // to Inst_VOP3, then sets the ALU and F64 flags.
    Inst_VOP3__V_TRIG_PREOP_F64::Inst_VOP3__V_TRIG_PREOP_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_trig_preop_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_TRIG_PREOP_F64

    // Destructor: empty body.
    Inst_VOP3__V_TRIG_PREOP_F64::~Inst_VOP3__V_TRIG_PREOP_F64()
    {
    } // ~Inst_VOP3__V_TRIG_PREOP_F64

    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_VOP3__V_TRIG_PREOP_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt, the mnemonic "v_bfm_b32" and false to
    // Inst_VOP3, then sets the ALU flag.
    Inst_VOP3__V_BFM_B32::Inst_VOP3__V_BFM_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_bfm_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_BFM_B32

    // Destructor: empty body.
    Inst_VOP3__V_BFM_B32::~Inst_VOP3__V_BFM_B32()
    {
    } // ~Inst_VOP3__V_BFM_B32

    // D.u = ((1 << S0.u[4:0]) - 1) << S1.u[4:0];
    //
    // Bitfield mask: for every lane enabled in the execution mask, build a
    // run of S0[4:0] one-bits and shift it left by S1[4:0]. Inactive lanes
    // are not written.
    void
    Inst_VOP3__V_BFM_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                const VecElemU32 width = bits(src0[lane], 4, 0);
                const VecElemU32 offset = bits(src1[lane], 4, 0);

                // Do the whole computation in unsigned arithmetic. With a
                // plain int literal, a width of 31 evaluates
                // (1 << 31) - 1, which overflows a signed int, and the
                // following shift may push bits through the sign bit.
                // Unsigned arithmetic wraps modulo 2^32 instead.
                vdst[lane] = ((VecElemU32(1) << width) - 1) << offset;
            }
        }

        vdst.write();
    }

    // Constructor: passes iFmt, the mnemonic "v_cvt_pknorm_i16_f32" and
    // false to Inst_VOP3, then sets the ALU and F32 flags.
    Inst_VOP3__V_CVT_PKNORM_I16_F32::Inst_VOP3__V_CVT_PKNORM_I16_F32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_pknorm_i16_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_PKNORM_I16_F32

    // Destructor: empty body.
    Inst_VOP3__V_CVT_PKNORM_I16_F32::~Inst_VOP3__V_CVT_PKNORM_I16_F32()
    {
    } // ~Inst_VOP3__V_CVT_PKNORM_I16_F32

    // Intended operation:
    // D = {(snorm)S1.f, (snorm)S0.f}.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_VOP3__V_CVT_PKNORM_I16_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt, the mnemonic "v_cvt_pknorm_u16_f32" and
    // false to Inst_VOP3, then sets the ALU and F32 flags.
    Inst_VOP3__V_CVT_PKNORM_U16_F32::Inst_VOP3__V_CVT_PKNORM_U16_F32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_pknorm_u16_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_PKNORM_U16_F32

    // Destructor: empty body.
    Inst_VOP3__V_CVT_PKNORM_U16_F32::~Inst_VOP3__V_CVT_PKNORM_U16_F32()
    {
    } // ~Inst_VOP3__V_CVT_PKNORM_U16_F32

    // Intended operation:
    // D = {(unorm)S1.f, (unorm)S0.f}.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_VOP3__V_CVT_PKNORM_U16_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt, the mnemonic "v_cvt_pkrtz_f16_f32" and
    // false to Inst_VOP3, then sets the ALU and F32 flags.
    Inst_VOP3__V_CVT_PKRTZ_F16_F32::Inst_VOP3__V_CVT_PKRTZ_F16_F32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_pkrtz_f16_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_PKRTZ_F16_F32

    // Destructor: empty body.
    Inst_VOP3__V_CVT_PKRTZ_F16_F32::~Inst_VOP3__V_CVT_PKRTZ_F16_F32()
    {
    } // ~Inst_VOP3__V_CVT_PKRTZ_F16_F32

    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_VOP3__V_CVT_PKRTZ_F16_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt, the mnemonic "v_cvt_pk_u16_u32" and false
    // to Inst_VOP3, then sets the ALU flag.
    Inst_VOP3__V_CVT_PK_U16_U32::Inst_VOP3__V_CVT_PK_U16_U32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_pk_u16_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CVT_PK_U16_U32

    // Destructor: empty body.
    Inst_VOP3__V_CVT_PK_U16_U32::~Inst_VOP3__V_CVT_PK_U16_U32()
    {
    } // ~Inst_VOP3__V_CVT_PK_U16_U32

    // Intended operation:
    // D = {uint32_to_uint16(S1.u), uint32_to_uint16(S0.u)}.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_VOP3__V_CVT_PK_U16_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt, the mnemonic "v_cvt_pk_i16_i32" and false
    // to Inst_VOP3, then sets the ALU flag.
    Inst_VOP3__V_CVT_PK_I16_I32::Inst_VOP3__V_CVT_PK_I16_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_pk_i16_i32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CVT_PK_I16_I32

    // Destructor: empty body.
    Inst_VOP3__V_CVT_PK_I16_I32::~Inst_VOP3__V_CVT_PK_I16_I32()
    {
    } // ~Inst_VOP3__V_CVT_PK_I16_I32

    // Intended operation:
    // D = {int32_to_int16(S1.i), int32_to_int16(S0.i)}.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_VOP3__V_CVT_PK_I16_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_add_u32" to Inst_DS;
    // no instruction flags are set.
    Inst_DS__DS_ADD_U32::Inst_DS__DS_ADD_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_u32")
    {
    } // Inst_DS__DS_ADD_U32

    // Destructor: empty body.
    Inst_DS__DS_ADD_U32::~Inst_DS__DS_ADD_U32()
    {
    } // ~Inst_DS__DS_ADD_U32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] += DATA;
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_ADD_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_sub_u32" to Inst_DS;
    // no instruction flags are set.
    Inst_DS__DS_SUB_U32::Inst_DS__DS_SUB_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_sub_u32")
    {
    } // Inst_DS__DS_SUB_U32

    // Destructor: empty body.
    Inst_DS__DS_SUB_U32::~Inst_DS__DS_SUB_U32()
    {
    } // ~Inst_DS__DS_SUB_U32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= DATA;
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_SUB_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_rsub_u32" to Inst_DS;
    // no instruction flags are set.
    Inst_DS__DS_RSUB_U32::Inst_DS__DS_RSUB_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_rsub_u32")
    {
    } // Inst_DS__DS_RSUB_U32

    // Destructor: empty body.
    Inst_DS__DS_RSUB_U32::~Inst_DS__DS_RSUB_U32()
    {
    } // ~Inst_DS__DS_RSUB_U32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA - MEM[ADDR];
    // RETURN_DATA = tmp.
    // Subtraction with reversed operands.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_RSUB_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_inc_u32" to Inst_DS;
    // no instruction flags are set.
    Inst_DS__DS_INC_U32::Inst_DS__DS_INC_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_inc_u32")
    {
    } // Inst_DS__DS_INC_U32

    // Destructor: empty body.
    Inst_DS__DS_INC_U32::~Inst_DS__DS_INC_U32()
    {
    } // ~Inst_DS__DS_INC_U32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare);
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_INC_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_dec_u32" to Inst_DS;
    // no instruction flags are set.
    Inst_DS__DS_DEC_U32::Inst_DS__DS_DEC_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_dec_u32")
    {
    } // Inst_DS__DS_DEC_U32

    // Destructor: empty body.
    Inst_DS__DS_DEC_U32::~Inst_DS__DS_DEC_U32()
    {
    } // ~Inst_DS__DS_DEC_U32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1
    // (unsigned compare); RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_DEC_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_min_i32" to Inst_DS;
    // no instruction flags are set.
    Inst_DS__DS_MIN_I32::Inst_DS__DS_MIN_I32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_i32")
    {
    } // Inst_DS__DS_MIN_I32

    // Destructor: empty body.
    Inst_DS__DS_MIN_I32::~Inst_DS__DS_MIN_I32()
    {
    } // ~Inst_DS__DS_MIN_I32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare);
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_MIN_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_max_i32" to Inst_DS;
    // no instruction flags are set.
    Inst_DS__DS_MAX_I32::Inst_DS__DS_MAX_I32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_i32")
    {
    } // Inst_DS__DS_MAX_I32

    // Destructor: empty body.
    Inst_DS__DS_MAX_I32::~Inst_DS__DS_MAX_I32()
    {
    } // ~Inst_DS__DS_MAX_I32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare);
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_MAX_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_min_u32" to Inst_DS;
    // no instruction flags are set.
    Inst_DS__DS_MIN_U32::Inst_DS__DS_MIN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_u32")
    {
    } // Inst_DS__DS_MIN_U32

    // Destructor: empty body.
    Inst_DS__DS_MIN_U32::~Inst_DS__DS_MIN_U32()
    {
    } // ~Inst_DS__DS_MIN_U32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare);
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_MIN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_max_u32" to Inst_DS;
    // no instruction flags are set.
    Inst_DS__DS_MAX_U32::Inst_DS__DS_MAX_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_u32")
    {
    } // Inst_DS__DS_MAX_U32

    // Destructor: empty body.
    Inst_DS__DS_MAX_U32::~Inst_DS__DS_MAX_U32()
    {
    } // ~Inst_DS__DS_MAX_U32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare);
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_MAX_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_and_b32" to Inst_DS;
    // no instruction flags are set.
    Inst_DS__DS_AND_B32::Inst_DS__DS_AND_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_and_b32")
    {
    } // Inst_DS__DS_AND_B32

    // Destructor: empty body.
    Inst_DS__DS_AND_B32::~Inst_DS__DS_AND_B32()
    {
    } // ~Inst_DS__DS_AND_B32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] &= DATA;
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_AND_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_or_b32" to Inst_DS;
    // no instruction flags are set.
    Inst_DS__DS_OR_B32::Inst_DS__DS_OR_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_or_b32")
    {
    } // Inst_DS__DS_OR_B32

    // Destructor: empty body.
    Inst_DS__DS_OR_B32::~Inst_DS__DS_OR_B32()
    {
    } // ~Inst_DS__DS_OR_B32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] |= DATA;
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_OR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_xor_b32" to Inst_DS;
    // no instruction flags are set.
    Inst_DS__DS_XOR_B32::Inst_DS__DS_XOR_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_xor_b32")
    {
    } // Inst_DS__DS_XOR_B32

    // Destructor: empty body.
    Inst_DS__DS_XOR_B32::~Inst_DS__DS_XOR_B32()
    {
    } // ~Inst_DS__DS_XOR_B32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] ^= DATA;
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_XOR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_mskor_b32" to Inst_DS;
    // no instruction flags are set.
    Inst_DS__DS_MSKOR_B32::Inst_DS__DS_MSKOR_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_mskor_b32")
    {
    } // Inst_DS__DS_MSKOR_B32

    // Destructor: empty body.
    Inst_DS__DS_MSKOR_B32::~Inst_DS__DS_MSKOR_B32()
    {
    } // ~Inst_DS__DS_MSKOR_B32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2;
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_MSKOR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Create the ds_write_b32 instruction and flag it as a memory
     * reference that performs a store.
     */
    Inst_DS__DS_WRITE_B32::Inst_DS__DS_WRITE_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_write_b32")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    }

    /** Empty destructor. */
    Inst_DS__DS_WRITE_B32::~Inst_DS__DS_WRITE_B32()
    {
    }

    /**
     * MEM[ADDR] = DATA.
     * Write dword.
     *
     * Reads the address and data operands, stages one dword per active
     * lane in d_data, issues the request to the compute unit's local
     * memory pipe and updates the wavefront's request counters.
     */
    void
    Inst_DS__DS_WRITE_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        // The access is given a latency of 24 cycles.
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(
            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));

        ConstVecOperandU32 ldsAddr(gpuDynInst, extData.ADDR);
        ConstVecOperandU32 storeVal(gpuDynInst, extData.DATA0);

        ldsAddr.read();
        storeVal.read();

        calcAddr(gpuDynInst, ldsAddr);

        // d_data is viewed as an array with one dword slot per lane.
        auto *staged =
            reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }
            staged[lane] = storeVal[lane];
        }

        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);

        --wave->wrLmReqsInPipe;
        ++wave->outstandingReqsWrLm;
        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    }

    /**
     * Start the write; the offset passed on is OFFSET1 in bits [15:8]
     * combined with OFFSET0 in the low bits.
     */
    void
    Inst_DS__DS_WRITE_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        const Addr lowPart = instData.OFFSET0;
        const Addr highPart = instData.OFFSET1;

        initMemWrite<VecElemU32>(gpuDynInst, (highPart << 8) | lowPart);
    }

    /** Nothing is done on completion of the write. */
    void
    Inst_DS__DS_WRITE_B32::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    /**
     * Create the ds_write2_b32 instruction and flag it as a memory
     * reference that performs a store.
     */
    Inst_DS__DS_WRITE2_B32::Inst_DS__DS_WRITE2_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_write2_b32")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    }

    /** Empty destructor. */
    Inst_DS__DS_WRITE2_B32::~Inst_DS__DS_WRITE2_B32()
    {
    }

    /**
     * MEM[ADDR_BASE + OFFSET0 * 4] = DATA;
     * MEM[ADDR_BASE + OFFSET1 * 4] = DATA2.
     * Write 2 dwords.
     *
     * Each active lane stages its two dwords in adjacent d_data slots
     * (2 * lane and 2 * lane + 1) before the request is issued to the
     * local memory pipe.
     */
    void
    Inst_DS__DS_WRITE2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        // The access is given a latency of 24 cycles.
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(
            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));

        ConstVecOperandU32 ldsAddr(gpuDynInst, extData.ADDR);
        ConstVecOperandU32 firstVal(gpuDynInst, extData.DATA0);
        ConstVecOperandU32 secondVal(gpuDynInst, extData.DATA1);

        ldsAddr.read();
        firstVal.read();
        secondVal.read();

        calcAddr(gpuDynInst, ldsAddr);

        auto *staged =
            reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }
            const int slot = lane * 2;
            staged[slot] = firstVal[lane];
            staged[slot + 1] = secondVal[lane];
        }

        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);

        --wave->wrLmReqsInPipe;
        ++wave->outstandingReqsWrLm;
        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    }

    /**
     * Start the dual write; both offset fields are scaled by 4 before
     * being passed on.
     */
    void
    Inst_DS__DS_WRITE2_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        const Addr firstOffset = instData.OFFSET0 * 4;
        const Addr secondOffset = instData.OFFSET1 * 4;

        initDualMemWrite<VecElemU32>(gpuDynInst, firstOffset, secondOffset);
    }

    /** Nothing is done on completion of the write. */
    void
    Inst_DS__DS_WRITE2_B32::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    /**
     * Create the ds_write2st64_b32 instruction and flag it as a memory
     * reference that performs a store.
     */
    Inst_DS__DS_WRITE2ST64_B32::Inst_DS__DS_WRITE2ST64_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_write2st64_b32")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    }

    /** Empty destructor. */
    Inst_DS__DS_WRITE2ST64_B32::~Inst_DS__DS_WRITE2ST64_B32()
    {
    }

    /**
     * MEM[ADDR_BASE + OFFSET0 * 4 * 64] = DATA;
     * MEM[ADDR_BASE + OFFSET1 * 4 * 64] = DATA2;
     * Write 2 dwords.
     *
     * Each active lane stages its two dwords in adjacent d_data slots
     * (2 * lane and 2 * lane + 1) before the request is issued to the
     * local memory pipe.
     */
    void
    Inst_DS__DS_WRITE2ST64_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        // The access is given a latency of 24 cycles.
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(
            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));

        ConstVecOperandU32 ldsAddr(gpuDynInst, extData.ADDR);
        ConstVecOperandU32 firstVal(gpuDynInst, extData.DATA0);
        ConstVecOperandU32 secondVal(gpuDynInst, extData.DATA1);

        ldsAddr.read();
        firstVal.read();
        secondVal.read();

        calcAddr(gpuDynInst, ldsAddr);

        auto *staged =
            reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }
            const int slot = lane * 2;
            staged[slot] = firstVal[lane];
            staged[slot + 1] = secondVal[lane];
        }

        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);

        --wave->wrLmReqsInPipe;
        ++wave->outstandingReqsWrLm;
        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    }

    /**
     * Start the dual write; both offset fields are scaled by 4 * 64
     * before being passed on.
     */
    void
    Inst_DS__DS_WRITE2ST64_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        const Addr firstOffset = instData.OFFSET0 * 4 * 64;
        const Addr secondOffset = instData.OFFSET1 * 4 * 64;

        initDualMemWrite<VecElemU32>(gpuDynInst, firstOffset, secondOffset);
    }

    /** Nothing is done on completion of the write. */
    void
    Inst_DS__DS_WRITE2ST64_B32::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }
    // --- Inst_DS__DS_CMPST_B32 class methods ---

    // Constructor: passes iFmt and the mnemonic "ds_cmpst_b32" to Inst_DS;
    // no instruction flags are set.
    Inst_DS__DS_CMPST_B32::Inst_DS__DS_CMPST_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_cmpst_b32")
    {
    } // Inst_DS__DS_CMPST_B32

    // Destructor: empty body.
    Inst_DS__DS_CMPST_B32::~Inst_DS__DS_CMPST_B32()
    {
    } // ~Inst_DS__DS_CMPST_B32

    // Intended operation:
    // tmp = MEM[ADDR];
    // src = DATA2;
    // cmp = DATA;
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    // Compare and store.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_CMPST_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_cmpst_f32" to Inst_DS,
    // then sets the F32 flag.
    Inst_DS__DS_CMPST_F32::Inst_DS__DS_CMPST_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_cmpst_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_CMPST_F32

    // Destructor: empty body.
    Inst_DS__DS_CMPST_F32::~Inst_DS__DS_CMPST_F32()
    {
    } // ~Inst_DS__DS_CMPST_F32

    // Intended operation:
    // tmp = MEM[ADDR];
    // src = DATA2;
    // cmp = DATA;
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_CMPST_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_min_f32" to Inst_DS,
    // then sets the F32 flag.
    Inst_DS__DS_MIN_F32::Inst_DS__DS_MIN_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_MIN_F32

    // Destructor: empty body.
    Inst_DS__DS_MIN_F32::~Inst_DS__DS_MIN_F32()
    {
    } // ~Inst_DS__DS_MIN_F32

    // Intended operation:
    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (cmp < tmp) ? src : tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_MIN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_max_f32" to Inst_DS,
    // then sets the F32 flag.
    Inst_DS__DS_MAX_F32::Inst_DS__DS_MAX_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_MAX_F32

    // Destructor: empty body.
    Inst_DS__DS_MAX_F32::~Inst_DS__DS_MAX_F32()
    {
    } // ~Inst_DS__DS_MAX_F32

    // Intended operation:
    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (tmp > cmp) ? src : tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_MAX_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_nop" to Inst_DS, then
    // sets the Nop flag.
    Inst_DS__DS_NOP::Inst_DS__DS_NOP(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_nop")
    {
        setFlag(Nop);
    } // Inst_DS__DS_NOP

    // Destructor: empty body.
    Inst_DS__DS_NOP::~Inst_DS__DS_NOP()
    {
    } // ~Inst_DS__DS_NOP

    // Do nothing.
    // The empty body is the whole implementation of this instruction.
    void
    Inst_DS__DS_NOP::execute(GPUDynInstPtr gpuDynInst)
    {
    }

    // Constructor: passes iFmt and the mnemonic "ds_add_f32" to Inst_DS,
    // then sets the F32 flag.
    Inst_DS__DS_ADD_F32::Inst_DS__DS_ADD_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_ADD_F32

    // Destructor: empty body.
    Inst_DS__DS_ADD_F32::~Inst_DS__DS_ADD_F32()
    {
    } // ~Inst_DS__DS_ADD_F32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] += DATA;
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    /**
     * Create the ds_write_b8 instruction and flag it as a memory
     * reference that performs a store.
     */
    Inst_DS__DS_WRITE_B8::Inst_DS__DS_WRITE_B8(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_write_b8")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    }

    /** Empty destructor. */
    Inst_DS__DS_WRITE_B8::~Inst_DS__DS_WRITE_B8()
    {
    }

    /**
     * MEM[ADDR] = DATA[7:0].
     *
     * Reads the address and data operands, stages one byte per active
     * lane in d_data, issues the request to the compute unit's local
     * memory pipe and updates the wavefront's request counters.
     */
    void
    Inst_DS__DS_WRITE_B8::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        // The access is given a latency of 24 cycles.
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(
            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));

        ConstVecOperandU32 ldsAddr(gpuDynInst, extData.ADDR);
        ConstVecOperandU8 storeVal(gpuDynInst, extData.DATA0);

        ldsAddr.read();
        storeVal.read();

        calcAddr(gpuDynInst, ldsAddr);

        // d_data is viewed as an array with one byte slot per lane.
        auto *staged =
            reinterpret_cast<VecElemU8*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }
            staged[lane] = storeVal[lane];
        }

        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);

        --wave->wrLmReqsInPipe;
        ++wave->outstandingReqsWrLm;
        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    }

    /**
     * Start the write; the offset passed on is OFFSET1 in bits [15:8]
     * combined with OFFSET0 in the low bits.
     */
    void
    Inst_DS__DS_WRITE_B8::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        const Addr lowPart = instData.OFFSET0;
        const Addr highPart = instData.OFFSET1;

        initMemWrite<VecElemU8>(gpuDynInst, (highPart << 8) | lowPart);
    }

    /** Nothing is done on completion of the write. */
    void
    Inst_DS__DS_WRITE_B8::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }
    // --- Inst_DS__DS_WRITE_B16 class methods ---

    /**
     * Create the ds_write_b16 instruction and flag it as a memory
     * reference that performs a store.
     */
    Inst_DS__DS_WRITE_B16::Inst_DS__DS_WRITE_B16(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_write_b16")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    }

    /** Empty destructor. */
    Inst_DS__DS_WRITE_B16::~Inst_DS__DS_WRITE_B16()
    {
    }

    /**
     * MEM[ADDR] = DATA[15:0]
     *
     * Reads the address and data operands, stages one 16-bit value per
     * active lane in d_data, issues the request to the compute unit's
     * local memory pipe and updates the wavefront's request counters.
     */
    void
    Inst_DS__DS_WRITE_B16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        // The access is given a latency of 24 cycles.
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(
            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));

        ConstVecOperandU32 ldsAddr(gpuDynInst, extData.ADDR);
        ConstVecOperandU16 storeVal(gpuDynInst, extData.DATA0);

        ldsAddr.read();
        storeVal.read();

        calcAddr(gpuDynInst, ldsAddr);

        // d_data is viewed as an array with one 16-bit slot per lane.
        auto *staged =
            reinterpret_cast<VecElemU16*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }
            staged[lane] = storeVal[lane];
        }

        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);

        --wave->wrLmReqsInPipe;
        ++wave->outstandingReqsWrLm;
        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    }

    /**
     * Start the write; the offset passed on is OFFSET1 in bits [15:8]
     * combined with OFFSET0 in the low bits.
     */
    void
    Inst_DS__DS_WRITE_B16::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        const Addr lowPart = instData.OFFSET0;
        const Addr highPart = instData.OFFSET1;

        initMemWrite<VecElemU16>(gpuDynInst, (highPart << 8) | lowPart);
    }

    /** Nothing is done on completion of the write. */
    void
    Inst_DS__DS_WRITE_B16::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }
    // --- Inst_DS__DS_ADD_RTN_U32 class methods ---

    // Constructor: passes iFmt and the mnemonic "ds_add_rtn_u32" to
    // Inst_DS; no instruction flags are set.
    Inst_DS__DS_ADD_RTN_U32::Inst_DS__DS_ADD_RTN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_rtn_u32")
    {
    } // Inst_DS__DS_ADD_RTN_U32

    // Destructor: empty body.
    Inst_DS__DS_ADD_RTN_U32::~Inst_DS__DS_ADD_RTN_U32()
    {
    } // ~Inst_DS__DS_ADD_RTN_U32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] += DATA;
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_ADD_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_sub_rtn_u32" to
    // Inst_DS; no instruction flags are set.
    Inst_DS__DS_SUB_RTN_U32::Inst_DS__DS_SUB_RTN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_sub_rtn_u32")
    {
    } // Inst_DS__DS_SUB_RTN_U32

    // Destructor: empty body.
    Inst_DS__DS_SUB_RTN_U32::~Inst_DS__DS_SUB_RTN_U32()
    {
    } // ~Inst_DS__DS_SUB_RTN_U32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= DATA;
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_SUB_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_rsub_rtn_u32" to
    // Inst_DS; no instruction flags are set.
    Inst_DS__DS_RSUB_RTN_U32::Inst_DS__DS_RSUB_RTN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_rsub_rtn_u32")
    {
    } // Inst_DS__DS_RSUB_RTN_U32

    // Destructor: empty body.
    Inst_DS__DS_RSUB_RTN_U32::~Inst_DS__DS_RSUB_RTN_U32()
    {
    } // ~Inst_DS__DS_RSUB_RTN_U32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA - MEM[ADDR];
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_RSUB_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_inc_rtn_u32" to
    // Inst_DS; no instruction flags are set.
    Inst_DS__DS_INC_RTN_U32::Inst_DS__DS_INC_RTN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_inc_rtn_u32")
    {
    } // Inst_DS__DS_INC_RTN_U32

    // Destructor: empty body.
    Inst_DS__DS_INC_RTN_U32::~Inst_DS__DS_INC_RTN_U32()
    {
    } // ~Inst_DS__DS_INC_RTN_U32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare);
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_INC_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_dec_rtn_u32" to
    // Inst_DS; no instruction flags are set.
    Inst_DS__DS_DEC_RTN_U32::Inst_DS__DS_DEC_RTN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_dec_rtn_u32")
    {
    } // Inst_DS__DS_DEC_RTN_U32

    // Destructor: empty body.
    Inst_DS__DS_DEC_RTN_U32::~Inst_DS__DS_DEC_RTN_U32()
    {
    } // ~Inst_DS__DS_DEC_RTN_U32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1
    // (unsigned compare); RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_DEC_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_min_rtn_i32" to
    // Inst_DS; no instruction flags are set.
    Inst_DS__DS_MIN_RTN_I32::Inst_DS__DS_MIN_RTN_I32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_rtn_i32")
    {
    } // Inst_DS__DS_MIN_RTN_I32

    // Destructor: empty body.
    Inst_DS__DS_MIN_RTN_I32::~Inst_DS__DS_MIN_RTN_I32()
    {
    } // ~Inst_DS__DS_MIN_RTN_I32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare);
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_MIN_RTN_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_max_rtn_i32" to
    // Inst_DS; no instruction flags are set.
    Inst_DS__DS_MAX_RTN_I32::Inst_DS__DS_MAX_RTN_I32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_rtn_i32")
    {
    } // Inst_DS__DS_MAX_RTN_I32

    // Destructor: empty body.
    Inst_DS__DS_MAX_RTN_I32::~Inst_DS__DS_MAX_RTN_I32()
    {
    } // ~Inst_DS__DS_MAX_RTN_I32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare);
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_MAX_RTN_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_min_rtn_u32" to
    // Inst_DS; no instruction flags are set.
    Inst_DS__DS_MIN_RTN_U32::Inst_DS__DS_MIN_RTN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_rtn_u32")
    {
    } // Inst_DS__DS_MIN_RTN_U32

    // Destructor: empty body.
    Inst_DS__DS_MIN_RTN_U32::~Inst_DS__DS_MIN_RTN_U32()
    {
    } // ~Inst_DS__DS_MIN_RTN_U32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare);
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_MIN_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_max_rtn_u32" to
    // Inst_DS; no instruction flags are set.
    Inst_DS__DS_MAX_RTN_U32::Inst_DS__DS_MAX_RTN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_rtn_u32")
    {
    } // Inst_DS__DS_MAX_RTN_U32

    // Destructor: empty body.
    Inst_DS__DS_MAX_RTN_U32::~Inst_DS__DS_MAX_RTN_U32()
    {
    } // ~Inst_DS__DS_MAX_RTN_U32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare);
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_MAX_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_and_rtn_b32" to
    // Inst_DS; no instruction flags are set.
    Inst_DS__DS_AND_RTN_B32::Inst_DS__DS_AND_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_and_rtn_b32")
    {
    } // Inst_DS__DS_AND_RTN_B32

    // Destructor: empty body.
    Inst_DS__DS_AND_RTN_B32::~Inst_DS__DS_AND_RTN_B32()
    {
    } // ~Inst_DS__DS_AND_RTN_B32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] &= DATA;
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_AND_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_or_rtn_b32" to
    // Inst_DS; no instruction flags are set.
    Inst_DS__DS_OR_RTN_B32::Inst_DS__DS_OR_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_or_rtn_b32")
    {
    } // Inst_DS__DS_OR_RTN_B32

    // Destructor: empty body.
    Inst_DS__DS_OR_RTN_B32::~Inst_DS__DS_OR_RTN_B32()
    {
    } // ~Inst_DS__DS_OR_RTN_B32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] |= DATA;
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_OR_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_xor_rtn_b32" to
    // Inst_DS; no instruction flags are set.
    Inst_DS__DS_XOR_RTN_B32::Inst_DS__DS_XOR_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_xor_rtn_b32")
    {
    } // Inst_DS__DS_XOR_RTN_B32

    // Destructor: empty body.
    Inst_DS__DS_XOR_RTN_B32::~Inst_DS__DS_XOR_RTN_B32()
    {
    } // ~Inst_DS__DS_XOR_RTN_B32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] ^= DATA;
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_XOR_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_mskor_rtn_b32" to
    // Inst_DS; no instruction flags are set.
    Inst_DS__DS_MSKOR_RTN_B32::Inst_DS__DS_MSKOR_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_mskor_rtn_b32")
    {
    } // Inst_DS__DS_MSKOR_RTN_B32

    // Destructor: empty body.
    Inst_DS__DS_MSKOR_RTN_B32::~Inst_DS__DS_MSKOR_RTN_B32()
    {
    } // ~Inst_DS__DS_MSKOR_RTN_B32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2;
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_MSKOR_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_wrxchg_rtn_b32" to
    // Inst_DS; no instruction flags are set.
    Inst_DS__DS_WRXCHG_RTN_B32::Inst_DS__DS_WRXCHG_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrxchg_rtn_b32")
    {
    } // Inst_DS__DS_WRXCHG_RTN_B32

    // Destructor: empty body.
    Inst_DS__DS_WRXCHG_RTN_B32::~Inst_DS__DS_WRXCHG_RTN_B32()
    {
    } // ~Inst_DS__DS_WRXCHG_RTN_B32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA;
    // RETURN_DATA = tmp.
    // Write-exchange operation.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_WRXCHG_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_wrxchg2_rtn_b32" to
    // Inst_DS; no instruction flags are set.
    Inst_DS__DS_WRXCHG2_RTN_B32::Inst_DS__DS_WRXCHG2_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrxchg2_rtn_b32")
    {
    } // Inst_DS__DS_WRXCHG2_RTN_B32

    // Destructor: empty body.
    Inst_DS__DS_WRXCHG2_RTN_B32::~Inst_DS__DS_WRXCHG2_RTN_B32()
    {
    } // ~Inst_DS__DS_WRXCHG2_RTN_B32

    // Write-exchange 2 separate dwords.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_WRXCHG2_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_wrxchg2st64_rtn_b32"
    // to Inst_DS; no instruction flags are set.
    Inst_DS__DS_WRXCHG2ST64_RTN_B32::Inst_DS__DS_WRXCHG2ST64_RTN_B32(
          InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrxchg2st64_rtn_b32")
    {
    } // Inst_DS__DS_WRXCHG2ST64_RTN_B32

    // Destructor: empty body.
    Inst_DS__DS_WRXCHG2ST64_RTN_B32::~Inst_DS__DS_WRXCHG2ST64_RTN_B32()
    {
    } // ~Inst_DS__DS_WRXCHG2ST64_RTN_B32

    // Write-exchange 2 separate dwords with a stride of 64 dwords.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_WRXCHG2ST64_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_cmpst_rtn_b32" to
    // Inst_DS; no instruction flags are set.
    Inst_DS__DS_CMPST_RTN_B32::Inst_DS__DS_CMPST_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_cmpst_rtn_b32")
    {
    } // Inst_DS__DS_CMPST_RTN_B32

    // Destructor: empty body.
    Inst_DS__DS_CMPST_RTN_B32::~Inst_DS__DS_CMPST_RTN_B32()
    {
    } // ~Inst_DS__DS_CMPST_RTN_B32

    // Intended operation:
    // tmp = MEM[ADDR];
    // src = DATA2;
    // cmp = DATA;
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    // Compare and store.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_CMPST_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_cmpst_rtn_f32" to
    // Inst_DS, then sets the F32 flag.
    Inst_DS__DS_CMPST_RTN_F32::Inst_DS__DS_CMPST_RTN_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_cmpst_rtn_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_CMPST_RTN_F32

    // Destructor: empty body.
    Inst_DS__DS_CMPST_RTN_F32::~Inst_DS__DS_CMPST_RTN_F32()
    {
    } // ~Inst_DS__DS_CMPST_RTN_F32

    // Intended operation:
    // tmp = MEM[ADDR];
    // src = DATA2;
    // cmp = DATA;
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_CMPST_RTN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_min_rtn_f32" to
    // Inst_DS, then sets the F32 flag.
    Inst_DS__DS_MIN_RTN_F32::Inst_DS__DS_MIN_RTN_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_rtn_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_MIN_RTN_F32

    // Destructor: empty body.
    Inst_DS__DS_MIN_RTN_F32::~Inst_DS__DS_MIN_RTN_F32()
    {
    } // ~Inst_DS__DS_MIN_RTN_F32

    // Intended operation:
    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (cmp < tmp) ? src : tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_MIN_RTN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_max_rtn_f32" to
    // Inst_DS, then sets the F32 flag.
    Inst_DS__DS_MAX_RTN_F32::Inst_DS__DS_MAX_RTN_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_rtn_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_MAX_RTN_F32

    // Destructor: empty body.
    Inst_DS__DS_MAX_RTN_F32::~Inst_DS__DS_MAX_RTN_F32()
    {
    } // ~Inst_DS__DS_MAX_RTN_F32

    // Intended operation:
    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (tmp > cmp) ? src : tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_MAX_RTN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructor: passes iFmt and the mnemonic "ds_wrap_rtn_b32" to
    // Inst_DS; no instruction flags are set.
    Inst_DS__DS_WRAP_RTN_B32::Inst_DS__DS_WRAP_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrap_rtn_b32")
    {
    } // Inst_DS__DS_WRAP_RTN_B32

    // Destructor: empty body.
    Inst_DS__DS_WRAP_RTN_B32::~Inst_DS__DS_WRAP_RTN_B32()
    {
    } // ~Inst_DS__DS_WRAP_RTN_B32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp >= DATA) ? tmp - DATA : tmp + DATA2;
    // RETURN_DATA = tmp.
    //
    // Not implemented: execute() only calls panicUnimplemented().
    void
    Inst_DS__DS_WRAP_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds ds_add_rtn_f32: passes iFmt and the mnemonic to Inst_DS and
    // sets the F32 flag.
    Inst_DS__DS_ADD_RTN_F32::Inst_DS__DS_ADD_RTN_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_rtn_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_ADD_RTN_F32

    // Destructor; the body is empty.
    Inst_DS__DS_ADD_RTN_F32::~Inst_DS__DS_ADD_RTN_F32()
    {
    } // ~Inst_DS__DS_ADD_RTN_F32

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] += DATA;
    // RETURN_DATA = tmp.
    //
    // Not modeled. Like the other unimplemented DS operations, executing
    // this instruction calls panicUnimplemented() instead of silently
    // retiring without touching memory or the destination register.
    void
    Inst_DS__DS_ADD_RTN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds ds_read_b32: passes iFmt and the mnemonic to Inst_DS and sets
    // the MemoryRef and Load flags.
    Inst_DS__DS_READ_B32::Inst_DS__DS_READ_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_read_b32")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_DS__DS_READ_B32

    // Destructor; the body is empty.
    Inst_DS__DS_READ_B32::~Inst_DS__DS_READ_B32()
    {
    } // ~Inst_DS__DS_READ_B32

    // RETURN_DATA = MEM[ADDR].
    // Dword read.
    //
    // Issue step: copies the wavefront's execution unit id and exec mask
    // into the dynamic instruction, sets a 24-cycle latency, runs calcAddr()
    // on the ADDR operand and hands the instruction to the compute unit's
    // local memory pipe. The wavefront request counters are then updated
    // and validated.
    void
    Inst_DS__DS_READ_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        auto cu = gpuDynInst->computeUnit();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->cyclesToTicks(Cycles(24)));

        ConstVecOperandU32 lds_addr(gpuDynInst, extData.ADDR);
        lds_addr.read();
        calcAddr(gpuDynInst, lds_addr);

        cu->localMemoryPipe.issueRequest(gpuDynInst);

        // Update the wavefront's request counters, then validate them.
        --wave->rdLmReqsInPipe;
        ++wave->outstandingReqsRdLm;
        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    }

    // Starts the memory read: combines OFFSET1:OFFSET0 into one 16-bit
    // offset (OFFSET1 in the upper byte) and requests VecElemU32 data.
    void
    Inst_DS__DS_READ_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        const Addr low_byte = instData.OFFSET0;
        const Addr high_byte = instData.OFFSET1;

        initMemRead<VecElemU32>(gpuDynInst, (high_byte << 8) | low_byte);
    } // initiateAcc

    // Writeback step: for every lane enabled in the instruction's exec
    // mask, copies that lane's dword from d_data into VDST, then writes
    // VDST back.
    void
    Inst_DS__DS_READ_B32::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU32 vdst(gpuDynInst, extData.VDST);
        const VecElemU32 *lds_data
            = reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            vdst[lane] = lds_data[lane];
        }

        vdst.write();
    } // completeAcc

    // Builds ds_read2_b32: passes iFmt and the mnemonic to Inst_DS and sets
    // the MemoryRef and Load flags.
    Inst_DS__DS_READ2_B32::Inst_DS__DS_READ2_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_read2_b32")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_DS__DS_READ2_B32

    // Destructor; the body is empty.
    Inst_DS__DS_READ2_B32::~Inst_DS__DS_READ2_B32()
    {
    } // ~Inst_DS__DS_READ2_B32

    // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 4];
    // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 4].
    // Read 2 dwords.
    //
    // Issue step: copies the wavefront's execution unit id and exec mask
    // into the dynamic instruction, sets a 24-cycle latency, runs calcAddr()
    // on the ADDR operand and hands the instruction to the compute unit's
    // local memory pipe. The wavefront request counters are then updated
    // and validated.
    void
    Inst_DS__DS_READ2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        auto cu = gpuDynInst->computeUnit();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->cyclesToTicks(Cycles(24)));

        ConstVecOperandU32 lds_addr(gpuDynInst, extData.ADDR);
        lds_addr.read();
        calcAddr(gpuDynInst, lds_addr);

        cu->localMemoryPipe.issueRequest(gpuDynInst);

        // Update the wavefront's request counters, then validate them.
        --wave->rdLmReqsInPipe;
        ++wave->outstandingReqsRdLm;
        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    }

    // Starts the dual memory read: each 8-bit offset field is scaled by 4
    // and both byte offsets are passed to initDualMemRead() for VecElemU32
    // data.
    void
    Inst_DS__DS_READ2_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        const Addr first_offset = static_cast<Addr>(instData.OFFSET0) * 4;
        const Addr second_offset = static_cast<Addr>(instData.OFFSET1) * 4;

        initDualMemRead<VecElemU32>(gpuDynInst, first_offset, second_offset);
    } // initiateAcc

    // Writeback step: for every lane enabled in the instruction's exec
    // mask, element lane * 2 of d_data goes to VDST and element
    // lane * 2 + 1 goes to VDST + 1; both registers are then written back.
    void
    Inst_DS__DS_READ2_B32::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU32 vdst0(gpuDynInst, extData.VDST);
        VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1);
        const VecElemU32 *lds_data
            = reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            vdst0[lane] = lds_data[lane * 2];
            vdst1[lane] = lds_data[lane * 2 + 1];
        }

        vdst0.write();
        vdst1.write();
    } // completeAcc

    // Builds ds_read2st64_b32: passes iFmt and the mnemonic to Inst_DS and
    // sets the MemoryRef and Load flags.
    Inst_DS__DS_READ2ST64_B32::Inst_DS__DS_READ2ST64_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_read2st64_b32")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_DS__DS_READ2ST64_B32

    // Destructor; the body is empty.
    Inst_DS__DS_READ2ST64_B32::~Inst_DS__DS_READ2ST64_B32()
    {
    } // ~Inst_DS__DS_READ2ST64_B32

    // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 4 * 64];
    // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 4 * 64].
    // Read 2 dwords.
    //
    // Issue step: copies the wavefront's execution unit id and exec mask
    // into the dynamic instruction, sets a 24-cycle latency, runs calcAddr()
    // on the ADDR operand and hands the instruction to the compute unit's
    // local memory pipe. The wavefront request counters are then updated
    // and validated.
    void
    Inst_DS__DS_READ2ST64_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        auto cu = gpuDynInst->computeUnit();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->cyclesToTicks(Cycles(24)));

        ConstVecOperandU32 lds_addr(gpuDynInst, extData.ADDR);
        lds_addr.read();
        calcAddr(gpuDynInst, lds_addr);

        cu->localMemoryPipe.issueRequest(gpuDynInst);

        // Update the wavefront's request counters, then validate them.
        --wave->rdLmReqsInPipe;
        ++wave->outstandingReqsRdLm;
        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    } // execute

    // Starts the dual memory read: each 8-bit offset field is scaled by
    // 4 * 64 and both byte offsets are passed to initDualMemRead() for
    // VecElemU32 data.
    void
    Inst_DS__DS_READ2ST64_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        const Addr first_offset = instData.OFFSET0 * 4 * 64;
        const Addr second_offset = instData.OFFSET1 * 4 * 64;

        initDualMemRead<VecElemU32>(gpuDynInst, first_offset, second_offset);
    }

    // Writeback step: for every lane enabled in the instruction's exec
    // mask, element lane * 2 of d_data goes to VDST and element
    // lane * 2 + 1 goes to VDST + 1; both registers are then written back.
    void
    Inst_DS__DS_READ2ST64_B32::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU32 vdst0(gpuDynInst, extData.VDST);
        VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1);

        // initiateAcc() requests the data as VecElemU32, so the buffer must
        // be indexed with that element type (as ds_read2_b32 does). Indexing
        // it as VecElemU64 would double the stride and return the wrong
        // dwords.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                vdst0[lane] = (reinterpret_cast<VecElemU32*>(
                    gpuDynInst->d_data))[lane * 2];
                vdst1[lane] = (reinterpret_cast<VecElemU32*>(
                    gpuDynInst->d_data))[lane * 2 + 1];
            }
        }

        vdst0.write();
        vdst1.write();
    }
    // --- Inst_DS__DS_READ_I8 class methods ---

    // Builds ds_read_i8: passes iFmt and the mnemonic to Inst_DS and sets
    // the MemoryRef and Load flags.
    Inst_DS__DS_READ_I8::Inst_DS__DS_READ_I8(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_read_i8")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_DS__DS_READ_I8

    // Destructor; the body is empty.
    Inst_DS__DS_READ_I8::~Inst_DS__DS_READ_I8()
    {
    } // ~Inst_DS__DS_READ_I8

    // Intended operation:
    // RETURN_DATA = signext(MEM[ADDR][7:0]).
    // Signed byte read.
    //
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_DS__DS_READ_I8::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds ds_read_u8: passes iFmt and the mnemonic to Inst_DS and sets
    // the MemoryRef and Load flags.
    Inst_DS__DS_READ_U8::Inst_DS__DS_READ_U8(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_read_u8")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_DS__DS_READ_U8

    // Destructor; the body is empty.
    Inst_DS__DS_READ_U8::~Inst_DS__DS_READ_U8()
    {
    } // ~Inst_DS__DS_READ_U8

    // RETURN_DATA = {24'h0,MEM[ADDR][7:0]}.
    // Unsigned byte read.
    //
    // Issue step: copies the wavefront's execution unit id and exec mask
    // into the dynamic instruction, sets a 24-cycle latency, runs calcAddr()
    // on the ADDR operand and hands the instruction to the compute unit's
    // local memory pipe. The wavefront request counters are then updated
    // and validated.
    void
    Inst_DS__DS_READ_U8::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        auto cu = gpuDynInst->computeUnit();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->cyclesToTicks(Cycles(24)));

        ConstVecOperandU32 lds_addr(gpuDynInst, extData.ADDR);
        lds_addr.read();
        calcAddr(gpuDynInst, lds_addr);

        cu->localMemoryPipe.issueRequest(gpuDynInst);

        // Update the wavefront's request counters, then validate them.
        --wave->rdLmReqsInPipe;
        ++wave->outstandingReqsRdLm;
        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    } // execute

    // Starts the memory read: combines OFFSET1:OFFSET0 into one 16-bit
    // offset (OFFSET1 in the upper byte) and requests VecElemU8 data.
    void
    Inst_DS__DS_READ_U8::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        const Addr low_byte = instData.OFFSET0;
        const Addr high_byte = instData.OFFSET1;

        initMemRead<VecElemU8>(gpuDynInst, (high_byte << 8) | low_byte);
    } // initiateAcc

    // Writeback step: for every lane enabled in the instruction's exec
    // mask, converts that lane's VecElemU8 entry of d_data to VecElemU32
    // and stores it in VDST, then writes VDST back.
    void
    Inst_DS__DS_READ_U8::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU32 vdst(gpuDynInst, extData.VDST);
        const VecElemU8 *lds_bytes
            = reinterpret_cast<VecElemU8*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            vdst[lane] = static_cast<VecElemU32>(lds_bytes[lane]);
        }

        vdst.write();
    } // completeAcc
    // --- Inst_DS__DS_READ_I16 class methods ---

    // Builds ds_read_i16: passes iFmt and the mnemonic to Inst_DS and sets
    // the MemoryRef and Load flags.
    Inst_DS__DS_READ_I16::Inst_DS__DS_READ_I16(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_read_i16")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_DS__DS_READ_I16

    // Destructor; the body is empty.
    Inst_DS__DS_READ_I16::~Inst_DS__DS_READ_I16()
    {
    } // ~Inst_DS__DS_READ_I16

    // Intended operation:
    // RETURN_DATA = signext(MEM[ADDR][15:0]).
    // Signed short read.
    //
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_DS__DS_READ_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds ds_read_u16: passes iFmt and the mnemonic to Inst_DS and sets
    // the MemoryRef and Load flags.
    Inst_DS__DS_READ_U16::Inst_DS__DS_READ_U16(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_read_u16")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_DS__DS_READ_U16

    // Destructor; the body is empty.
    Inst_DS__DS_READ_U16::~Inst_DS__DS_READ_U16()
    {
    } // ~Inst_DS__DS_READ_U16

    // RETURN_DATA = {16'h0,MEM[ADDR][15:0]}.
    // Unsigned short read.
    //
    // Issue step: copies the wavefront's execution unit id and exec mask
    // into the dynamic instruction, sets a 24-cycle latency, runs calcAddr()
    // on the ADDR operand and hands the instruction to the compute unit's
    // local memory pipe. The wavefront request counters are then updated
    // and validated.
    void
    Inst_DS__DS_READ_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        auto cu = gpuDynInst->computeUnit();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->cyclesToTicks(Cycles(24)));

        ConstVecOperandU32 lds_addr(gpuDynInst, extData.ADDR);
        lds_addr.read();
        calcAddr(gpuDynInst, lds_addr);

        cu->localMemoryPipe.issueRequest(gpuDynInst);

        // Update the wavefront's request counters, then validate them.
        --wave->rdLmReqsInPipe;
        ++wave->outstandingReqsRdLm;
        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    } // execute
    // Starts the memory read: combines OFFSET1:OFFSET0 into one 16-bit
    // offset (OFFSET1 in the upper byte) and requests VecElemU16 data.
    void
    Inst_DS__DS_READ_U16::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        const Addr low_byte = instData.OFFSET0;
        const Addr high_byte = instData.OFFSET1;

        initMemRead<VecElemU16>(gpuDynInst, (high_byte << 8) | low_byte);
    } // initiateAcc

    // Writeback step: for every lane enabled in the instruction's exec
    // mask, converts that lane's VecElemU16 entry of d_data to VecElemU32
    // and stores it in VDST, then writes VDST back.
    void
    Inst_DS__DS_READ_U16::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU32 vdst(gpuDynInst, extData.VDST);
        const VecElemU16 *lds_shorts
            = reinterpret_cast<VecElemU16*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            vdst[lane] = static_cast<VecElemU32>(lds_shorts[lane]);
        }

        vdst.write();
    } // completeAcc
    // --- Inst_DS__DS_SWIZZLE_B32 class methods ---

    // Builds ds_swizzle_b32: passes iFmt and the mnemonic to Inst_DS and
    // sets only the Load flag (MemoryRef is not set for this op).
    Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_swizzle_b32")
    {
        setFlag(Load);
    } // Inst_DS__DS_SWIZZLE_B32

    // Destructor; the body is empty.
    Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32()
    {
    } // ~Inst_DS__DS_SWIZZLE_B32

    // RETURN_DATA = swizzle(vgpr_data, offset1:offset0).
    // Dword swizzle, no data is written to LDS memory.
    //
    // Reads DATA0, rearranges its lanes according to the 16-bit pattern
    // formed from OFFSET1:OFFSET0 and writes the result to VDST. An active
    // lane whose source lane is inactive receives 0. Panics if a computed
    // source index falls outside the vector register.
    //
    // The wavefront's rdLmReqsInPipe counter is decremented and validated
    // before anything else, so this also happens on the early return taken
    // when no lane is active.
    //
    // NOTE(review): unlike the neighbouring DS operations, this function
    // never assigns gpuDynInst->exec_mask from the wavefront before reading
    // it; presumably it is populated elsewhere -- confirm.
    void
    Inst_DS__DS_SWIZZLE_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        wf->rdLmReqsInPipe--;
        wf->validateRequestCounters();

        // Nothing to swizzle when every lane is masked off.
        if (gpuDynInst->exec_mask.none()) {
            return;
        }

        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()
                                ->cyclesToTicks(Cycles(24)));

        ConstVecOperandU32 data(gpuDynInst, extData.DATA0);
        VecOperandU32 vdst(gpuDynInst, extData.VDST);
        /**
         * The "DS pattern" is comprised of both offset fields. That is, the
         * swizzle pattern between lanes. Bit 15 of the DS pattern dictates
         * which swizzle mode to use. There are two different swizzle
         * patterns: 1) QDMode and 2) Bit-masks mode. If bit 15 is set use
         * QDMode else use Bit-masks mode. The remaining bits dictate how to
         * swizzle the lanes.
         *
         * QDMode:      Chunks the lanes into 4s and swizzles among them.
         *              Bits 7:6 dictate where lane 3 (of the current chunk)
         *              gets its data, 5:4 lane 2, etc.
         *
         * Bit-mask:    This mode breaks bits 14:0 into 3 equal-sized chunks.
         *              14:10 is the xor_mask, 9:5 is the or_mask, and 4:0
         *              is the and_mask. Each lane is swizzled by performing
         *              the appropriate operation using these masks.
         */
        VecElemU16 ds_pattern = ((instData.OFFSET1 << 8) | instData.OFFSET0);

        data.read();

        if (bits(ds_pattern, 15)) {
            // QDMode
            for (int lane = 0; lane < NumVecElemPerVecReg; lane += 4) {
                /**
                 * This operation allows data sharing between groups
                 * of four consecutive threads. Note the increment by
                 * 4 in the for loop.
                 */
                // Lane 0 of the group: source selected by pattern bits 1:0.
                if (gpuDynInst->exec_mask[lane]) {
                    int index0 = lane + bits(ds_pattern, 1, 0);
                    panic_if(index0 >= NumVecElemPerVecReg, "%s: index0 (%d) "
                             "is out of bounds.\n", gpuDynInst->disassemble(),
                             index0);
                    vdst[lane]
                        = gpuDynInst->exec_mask[index0] ? data[index0]: 0;
                }
                // Lane 1 of the group: source selected by pattern bits 3:2.
                if (gpuDynInst->exec_mask[lane + 1]) {
                    int index1 = lane + bits(ds_pattern, 3, 2);
                    panic_if(index1 >= NumVecElemPerVecReg, "%s: index1 (%d) "
                             "is out of bounds.\n", gpuDynInst->disassemble(),
                             index1);
                    vdst[lane + 1]
                        = gpuDynInst->exec_mask[index1] ? data[index1]: 0;
                }
                // Lane 2 of the group: source selected by pattern bits 5:4.
                if (gpuDynInst->exec_mask[lane + 2]) {
                    int index2 = lane + bits(ds_pattern, 5, 4);
                    panic_if(index2 >= NumVecElemPerVecReg, "%s: index2 (%d) "
                             "is out of bounds.\n", gpuDynInst->disassemble(),
                             index2);
                    vdst[lane + 2]
                        = gpuDynInst->exec_mask[index2] ? data[index2]: 0;
                }
                // Lane 3 of the group: source selected by pattern bits 7:6.
                if (gpuDynInst->exec_mask[lane + 3]) {
                    int index3 = lane + bits(ds_pattern, 7, 6);
                    panic_if(index3 >= NumVecElemPerVecReg, "%s: index3 (%d) "
                             "is out of bounds.\n", gpuDynInst->disassemble(),
                             index3);
                    vdst[lane + 3]
                        = gpuDynInst->exec_mask[index3] ? data[index3]: 0;
                }
            }
        } else {
            // Bit Mode
            int and_mask = bits(ds_pattern, 4, 0);
            int or_mask = bits(ds_pattern, 9, 5);
            int xor_mask = bits(ds_pattern, 14, 10);
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    // The masks are 5 bits wide, so the swizzled index is
                    // relative to a 32-lane half.
                    int index = (((lane & and_mask) | or_mask) ^ xor_mask);
                    // Adjust for the next 32 lanes.
                    if (lane > 31) {
                        index += 32;
                    }
                    panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is "
                             "out of bounds.\n", gpuDynInst->disassemble(),
                             index);
                    vdst[lane]
                        = gpuDynInst->exec_mask[index] ? data[index] : 0;
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_DS__DS_PERMUTE_B32 class methods ---

    // Builds ds_permute_b32: passes iFmt and the mnemonic to Inst_DS and
    // sets the MemoryRef and Load flags.
    Inst_DS__DS_PERMUTE_B32::Inst_DS__DS_PERMUTE_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_permute_b32")
    {
        setFlag(MemoryRef);
        /**
         * While this operation doesn't actually use DS storage we classify
         * it as a load here because it does a writeback to a VGPR, which
         * fits in better with the LDS pipeline logic.
         */
        setFlag(Load);
    } // Inst_DS__DS_PERMUTE_B32

    // Destructor; the body is empty.
    Inst_DS__DS_PERMUTE_B32::~Inst_DS__DS_PERMUTE_B32()
    {
    } // ~Inst_DS__DS_PERMUTE_B32

    // Forward permute.
    //
    // Every active lane pushes its DATA0 value to the VDST lane selected by
    // bits 7:2 of (ADDR + OFFSET0). The whole operation is done here in
    // execute(): VDST is written back directly and nothing is issued to the
    // local memory pipe.
    void
    Inst_DS__DS_PERMUTE_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        auto cu = gpuDynInst->computeUnit();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->cyclesToTicks(Cycles(24)));

        ConstVecOperandU32 lane_addr(gpuDynInst, extData.ADDR);
        ConstVecOperandU32 src(gpuDynInst, extData.DATA0);
        VecOperandU32 dst(gpuDynInst, extData.VDST);

        lane_addr.read();
        src.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            /**
             * Only OFFSET0 is used for the index; OFFSET1 is expected to
             * be zero since it is typically reserved for DS ops that
             * operate on two separate pieces of data.
             */
            assert(!instData.OFFSET1);

            /**
             * ADDR is a byte address and VGPR elements are 4 bytes wide,
             * so the lane index is the address divided by 4. The index
             * also wraps at the wavefront size (64), hence bits 7:2.
             */
            const int target = bits(lane_addr[lane] + instData.OFFSET0, 7, 2);
            panic_if(target >= NumVecElemPerVecReg, "%s: index (%d) is out "
                     "of bounds.\n", gpuDynInst->disassemble(), target);

            /**
             * The target lane receives this lane's data when the target
             * is active, and 0 when it is inactive.
             */
            dst[target] = wave->execMask(target) ? src[lane] : 0;
        }

        dst.write();

        // Update the wavefront's bookkeeping, then validate the counters.
        wave->decLGKMInstsIssued();
        --wave->rdLmReqsInPipe;
        wave->validateRequestCounters();
    } // execute
    // --- Inst_DS__DS_BPERMUTE_B32 class methods ---

    // Builds ds_bpermute_b32: passes iFmt and the mnemonic to Inst_DS and
    // sets the MemoryRef and Load flags.
    Inst_DS__DS_BPERMUTE_B32::Inst_DS__DS_BPERMUTE_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_bpermute_b32")
    {
        setFlag(MemoryRef);
        /**
         * While this operation doesn't actually use DS storage we classify
         * it as a load here because it does a writeback to a VGPR, which
         * fits in better with the LDS pipeline logic.
         */
        setFlag(Load);
    } // Inst_DS__DS_BPERMUTE_B32

    // Destructor; the body is empty.
    Inst_DS__DS_BPERMUTE_B32::~Inst_DS__DS_BPERMUTE_B32()
    {
    } // ~Inst_DS__DS_BPERMUTE_B32

    // Backward permute.
    //
    // Every active lane pulls the DATA0 value of the lane selected by bits
    // 7:2 of (ADDR + OFFSET0) into its own VDST element. The whole operation
    // is done here in execute(): VDST is written back directly and nothing
    // is issued to the local memory pipe.
    void
    Inst_DS__DS_BPERMUTE_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        auto cu = gpuDynInst->computeUnit();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->cyclesToTicks(Cycles(24)));

        ConstVecOperandU32 lane_addr(gpuDynInst, extData.ADDR);
        ConstVecOperandU32 src(gpuDynInst, extData.DATA0);
        VecOperandU32 dst(gpuDynInst, extData.VDST);

        lane_addr.read();
        src.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }

            /**
             * Only OFFSET0 is used for the index; OFFSET1 is expected to
             * be zero since it is typically reserved for DS ops that
             * operate on two separate pieces of data.
             */
            assert(!instData.OFFSET1);

            /**
             * ADDR is a byte address and VGPR elements are 4 bytes wide,
             * so the lane index is the address divided by 4. The index
             * also wraps at the wavefront size (64), hence bits 7:2.
             */
            const int source = bits(lane_addr[lane] + instData.OFFSET0, 7, 2);
            panic_if(source >= NumVecElemPerVecReg, "%s: index (%d) is out "
                     "of bounds.\n", gpuDynInst->disassemble(), source);

            /**
             * This lane receives the source lane's data when the source
             * is active, and 0 when it is inactive.
             */
            dst[lane] = wave->execMask(source) ? src[source] : 0;
        }

        dst.write();

        // Update the wavefront's bookkeeping, then validate the counters.
        wave->decLGKMInstsIssued();
        --wave->rdLmReqsInPipe;
        wave->validateRequestCounters();
    } // execute

    // --- Inst_DS__DS_ADD_U64 class methods ---

    // Builds ds_add_u64: passes iFmt and the mnemonic to Inst_DS; no flags
    // are set here.
    Inst_DS__DS_ADD_U64::Inst_DS__DS_ADD_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_u64")
    {
    } // Inst_DS__DS_ADD_U64

    // Destructor; the body is empty.
    Inst_DS__DS_ADD_U64::~Inst_DS__DS_ADD_U64()
    {
    } // ~Inst_DS__DS_ADD_U64

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] += DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    //
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_DS__DS_ADD_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds ds_sub_u64: passes iFmt and the mnemonic to Inst_DS; no flags
    // are set here.
    Inst_DS__DS_SUB_U64::Inst_DS__DS_SUB_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_sub_u64")
    {
    } // Inst_DS__DS_SUB_U64

    // Destructor; the body is empty.
    Inst_DS__DS_SUB_U64::~Inst_DS__DS_SUB_U64()
    {
    } // ~Inst_DS__DS_SUB_U64

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    //
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_DS__DS_SUB_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds ds_rsub_u64: passes iFmt and the mnemonic to Inst_DS; no flags
    // are set here.
    Inst_DS__DS_RSUB_U64::Inst_DS__DS_RSUB_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_rsub_u64")
    {
    } // Inst_DS__DS_RSUB_U64

    // Destructor; the body is empty.
    Inst_DS__DS_RSUB_U64::~Inst_DS__DS_RSUB_U64()
    {
    } // ~Inst_DS__DS_RSUB_U64

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA - MEM[ADDR];
    // RETURN_DATA = tmp.
    // Subtraction with reversed operands.
    //
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_DS__DS_RSUB_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds ds_inc_u64: passes iFmt and the mnemonic to Inst_DS; no flags
    // are set here.
    Inst_DS__DS_INC_U64::Inst_DS__DS_INC_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_inc_u64")
    {
    } // Inst_DS__DS_INC_U64

    // Destructor; the body is empty.
    Inst_DS__DS_INC_U64::~Inst_DS__DS_INC_U64()
    {
    } // ~Inst_DS__DS_INC_U64

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    //
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_DS__DS_INC_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds ds_dec_u64: passes iFmt and the mnemonic to Inst_DS; no flags
    // are set here.
    Inst_DS__DS_DEC_U64::Inst_DS__DS_DEC_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_dec_u64")
    {
    } // Inst_DS__DS_DEC_U64

    // Destructor; the body is empty.
    Inst_DS__DS_DEC_U64::~Inst_DS__DS_DEC_U64()
    {
    } // ~Inst_DS__DS_DEC_U64

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1
    // (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    //
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_DS__DS_DEC_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds ds_min_i64: passes iFmt and the mnemonic to Inst_DS; no flags
    // are set here.
    Inst_DS__DS_MIN_I64::Inst_DS__DS_MIN_I64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_i64")
    {
    } // Inst_DS__DS_MIN_I64

    // Destructor; the body is empty.
    Inst_DS__DS_MIN_I64::~Inst_DS__DS_MIN_I64()
    {
    } // ~Inst_DS__DS_MIN_I64

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare);
    // RETURN_DATA[0:1] = tmp.
    //
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_DS__DS_MIN_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds ds_max_i64: passes iFmt and the mnemonic to Inst_DS; no flags
    // are set here.
    Inst_DS__DS_MAX_I64::Inst_DS__DS_MAX_I64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_i64")
    {
    } // Inst_DS__DS_MAX_I64

    // Destructor; the body is empty.
    Inst_DS__DS_MAX_I64::~Inst_DS__DS_MAX_I64()
    {
    } // ~Inst_DS__DS_MAX_I64

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare);
    // RETURN_DATA[0:1] = tmp.
    //
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_DS__DS_MAX_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds ds_min_u64: passes iFmt and the mnemonic to Inst_DS; no flags
    // are set here.
    Inst_DS__DS_MIN_U64::Inst_DS__DS_MIN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_u64")
    {
    } // Inst_DS__DS_MIN_U64

    // Destructor; the body is empty.
    Inst_DS__DS_MIN_U64::~Inst_DS__DS_MIN_U64()
    {
    } // ~Inst_DS__DS_MIN_U64

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    //
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_DS__DS_MIN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds ds_max_u64: passes iFmt and the mnemonic to Inst_DS; no flags
    // are set here.
    Inst_DS__DS_MAX_U64::Inst_DS__DS_MAX_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_u64")
    {
    } // Inst_DS__DS_MAX_U64

    // Destructor; the body is empty.
    Inst_DS__DS_MAX_U64::~Inst_DS__DS_MAX_U64()
    {
    } // ~Inst_DS__DS_MAX_U64

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    //
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_DS__DS_MAX_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds ds_and_b64: passes iFmt and the mnemonic to Inst_DS; no flags
    // are set here.
    Inst_DS__DS_AND_B64::Inst_DS__DS_AND_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_and_b64")
    {
    } // Inst_DS__DS_AND_B64

    // Destructor; the body is empty.
    Inst_DS__DS_AND_B64::~Inst_DS__DS_AND_B64()
    {
    } // ~Inst_DS__DS_AND_B64

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] &= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    //
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_DS__DS_AND_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds ds_or_b64: passes iFmt and the mnemonic to Inst_DS; no flags
    // are set here.
    Inst_DS__DS_OR_B64::Inst_DS__DS_OR_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_or_b64")
    {
    } // Inst_DS__DS_OR_B64

    // Destructor; the body is empty.
    Inst_DS__DS_OR_B64::~Inst_DS__DS_OR_B64()
    {
    } // ~Inst_DS__DS_OR_B64

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] |= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    //
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_DS__DS_OR_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds ds_xor_b64: passes iFmt and the mnemonic to Inst_DS; no flags
    // are set here.
    Inst_DS__DS_XOR_B64::Inst_DS__DS_XOR_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_xor_b64")
    {
    } // Inst_DS__DS_XOR_B64

    // Destructor; the body is empty.
    Inst_DS__DS_XOR_B64::~Inst_DS__DS_XOR_B64()
    {
    } // ~Inst_DS__DS_XOR_B64

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] ^= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    //
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_DS__DS_XOR_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds ds_mskor_b64: passes iFmt and the mnemonic to Inst_DS; no flags
    // are set here.
    Inst_DS__DS_MSKOR_B64::Inst_DS__DS_MSKOR_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_mskor_b64")
    {
    } // Inst_DS__DS_MSKOR_B64

    // Destructor; the body is empty.
    Inst_DS__DS_MSKOR_B64::~Inst_DS__DS_MSKOR_B64()
    {
    } // ~Inst_DS__DS_MSKOR_B64

    // Intended operation:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (MEM[ADDR] & ~DATA) | DATA2;
    // RETURN_DATA = tmp.
    //
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_DS__DS_MSKOR_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds ds_write_b64: passes iFmt and the mnemonic to Inst_DS and sets
    // the MemoryRef and Store flags.
    Inst_DS__DS_WRITE_B64::Inst_DS__DS_WRITE_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_write_b64")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_DS__DS_WRITE_B64

    // Destructor; the body is empty.
    Inst_DS__DS_WRITE_B64::~Inst_DS__DS_WRITE_B64()
    {
    } // ~Inst_DS__DS_WRITE_B64

    // MEM[ADDR] = DATA.
    // Write qword.
    //
    // Issue step: copies the wavefront's execution unit id and exec mask
    // into the dynamic instruction, sets a 24-cycle latency, runs calcAddr()
    // on the ADDR operand, stages each active lane's 64-bit DATA0 value in
    // d_data and hands the instruction to the compute unit's local memory
    // pipe. The wavefront request counters are then updated and validated.
    void
    Inst_DS__DS_WRITE_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        auto cu = gpuDynInst->computeUnit();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->cyclesToTicks(Cycles(24)));

        ConstVecOperandU32 lds_addr(gpuDynInst, extData.ADDR);
        ConstVecOperandU64 src(gpuDynInst, extData.DATA0);

        lds_addr.read();
        src.read();

        calcAddr(gpuDynInst, lds_addr);

        // Stage the store data, one qword per active lane.
        VecElemU64 *staged
            = reinterpret_cast<VecElemU64*>(gpuDynInst->d_data);
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }
            staged[lane] = src[lane];
        }

        cu->localMemoryPipe.issueRequest(gpuDynInst);

        // Update the wavefront's request counters, then validate them.
        --wave->wrLmReqsInPipe;
        ++wave->outstandingReqsWrLm;
        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    }

    // Starts the memory write: combines OFFSET1:OFFSET0 into one 16-bit
    // offset (OFFSET1 in the upper byte) and writes VecElemU64 data.
    void
    Inst_DS__DS_WRITE_B64::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        const Addr low_byte = instData.OFFSET0;
        const Addr high_byte = instData.OFFSET1;

        initMemWrite<VecElemU64>(gpuDynInst, (high_byte << 8) | low_byte);
    } // initiateAcc

    // Completion step; the body is empty, so nothing is done here for this
    // store.
    void
    Inst_DS__DS_WRITE_B64::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // Builds ds_write2_b64: passes iFmt and the mnemonic to Inst_DS and sets
    // the MemoryRef and Store flags.
    Inst_DS__DS_WRITE2_B64::Inst_DS__DS_WRITE2_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_write2_b64")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_DS__DS_WRITE2_B64

    // Destructor; the body is empty.
    Inst_DS__DS_WRITE2_B64::~Inst_DS__DS_WRITE2_B64()
    {
    } // ~Inst_DS__DS_WRITE2_B64

    // MEM[ADDR_BASE + OFFSET0 * 8] = DATA;
    // MEM[ADDR_BASE + OFFSET1 * 8] = DATA2.
    // Write 2 qwords.
    //
    // Issue step: copies the wavefront's execution unit id and exec mask
    // into the dynamic instruction, sets a 24-cycle latency, runs calcAddr()
    // on the ADDR operand, stages each active lane's DATA0/DATA1 pair in
    // d_data (elements lane * 2 and lane * 2 + 1) and hands the instruction
    // to the compute unit's local memory pipe. The wavefront request
    // counters are then updated and validated.
    void
    Inst_DS__DS_WRITE2_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        auto cu = gpuDynInst->computeUnit();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->cyclesToTicks(Cycles(24)));

        ConstVecOperandU32 lds_addr(gpuDynInst, extData.ADDR);
        ConstVecOperandU64 first_src(gpuDynInst, extData.DATA0);
        ConstVecOperandU64 second_src(gpuDynInst, extData.DATA1);

        lds_addr.read();
        first_src.read();
        second_src.read();

        calcAddr(gpuDynInst, lds_addr);

        // Stage the store data, two qwords per active lane.
        VecElemU64 *staged
            = reinterpret_cast<VecElemU64*>(gpuDynInst->d_data);
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!wave->execMask(lane)) {
                continue;
            }
            staged[lane * 2] = first_src[lane];
            staged[lane * 2 + 1] = second_src[lane];
        }

        cu->localMemoryPipe.issueRequest(gpuDynInst);

        // Update the wavefront's request counters, then validate them.
        --wave->wrLmReqsInPipe;
        ++wave->outstandingReqsWrLm;
        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    }

    // Starts the dual memory write: each 8-bit offset field is scaled by 8
    // and both byte offsets are passed to initDualMemWrite() for VecElemU64
    // data.
    void
    Inst_DS__DS_WRITE2_B64::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        const Addr first_offset = static_cast<Addr>(instData.OFFSET0) * 8;
        const Addr second_offset = static_cast<Addr>(instData.OFFSET1) * 8;

        initDualMemWrite<VecElemU64>(gpuDynInst, first_offset, second_offset);
    }

    // Completion step; the body is empty, so nothing is done here for this
    // store.
    void
    Inst_DS__DS_WRITE2_B64::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    // Builds ds_write2st64_b64: passes iFmt and the mnemonic to Inst_DS and
    // sets the MemoryRef and Store flags.
    Inst_DS__DS_WRITE2ST64_B64::Inst_DS__DS_WRITE2ST64_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_write2st64_b64")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_DS__DS_WRITE2ST64_B64

    // Destructor; the body is empty.
    Inst_DS__DS_WRITE2ST64_B64::~Inst_DS__DS_WRITE2ST64_B64()
    {
    } // ~Inst_DS__DS_WRITE2ST64_B64

    // Intended operation:
    // MEM[ADDR_BASE + OFFSET0 * 8 * 64] = DATA;
    // MEM[ADDR_BASE + OFFSET1 * 8 * 64] = DATA2;
    // Write 2 qwords.
    //
    // Not modeled: the body only calls panicUnimplemented().
    void
    Inst_DS__DS_WRITE2ST64_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Builds ds_cmpst_b64: passes iFmt and the mnemonic to Inst_DS; no flags
    // are set here.
    Inst_DS__DS_CMPST_B64::Inst_DS__DS_CMPST_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_cmpst_b64")
    {
    } // Inst_DS__DS_CMPST_B64

    Inst_DS__DS_CMPST_B64::~Inst_DS__DS_CMPST_B64()
    {
    } // ~Inst_DS__DS_CMPST_B64

    // tmp = MEM[ADDR];
    // src = DATA2;
    // cmp = DATA;
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    // Compare and store.
    void
    Inst_DS__DS_CMPST_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_cmpst_f64" to Inst_DS, then sets the
    // F64 flag.
    Inst_DS__DS_CMPST_F64::Inst_DS__DS_CMPST_F64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_cmpst_f64")
    {
        setFlag(F64);
    } // Inst_DS__DS_CMPST_F64

    Inst_DS__DS_CMPST_F64::~Inst_DS__DS_CMPST_F64()
    {
    } // ~Inst_DS__DS_CMPST_F64

    // tmp = MEM[ADDR];
    // src = DATA2;
    // cmp = DATA;
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    // NOTE(review): the RETURN_DATA line presumably applies only to the
    // _rtn_ form of this instruction -- confirm against the ISA manual.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_CMPST_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_min_f64" to Inst_DS, then sets the
    // F64 flag.
    Inst_DS__DS_MIN_F64::Inst_DS__DS_MIN_F64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_f64")
    {
        setFlag(F64);
    } // Inst_DS__DS_MIN_F64

    Inst_DS__DS_MIN_F64::~Inst_DS__DS_MIN_F64()
    {
    } // ~Inst_DS__DS_MIN_F64

    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (cmp < tmp) ? src : tmp.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_MIN_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_max_f64" to Inst_DS, then sets the
    // F64 flag.
    Inst_DS__DS_MAX_F64::Inst_DS__DS_MAX_F64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_f64")
    {
        setFlag(F64);
    } // Inst_DS__DS_MAX_F64

    Inst_DS__DS_MAX_F64::~Inst_DS__DS_MAX_F64()
    {
    } // ~Inst_DS__DS_MAX_F64

    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (tmp > cmp) ? src : tmp.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_MAX_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_add_rtn_u64" to Inst_DS; sets no flags.
    Inst_DS__DS_ADD_RTN_U64::Inst_DS__DS_ADD_RTN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_rtn_u64")
    {
    } // Inst_DS__DS_ADD_RTN_U64

    Inst_DS__DS_ADD_RTN_U64::~Inst_DS__DS_ADD_RTN_U64()
    {
    } // ~Inst_DS__DS_ADD_RTN_U64

    // tmp = MEM[ADDR];
    // MEM[ADDR] += DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_ADD_RTN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_sub_rtn_u64" to Inst_DS; sets no flags.
    Inst_DS__DS_SUB_RTN_U64::Inst_DS__DS_SUB_RTN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_sub_rtn_u64")
    {
    } // Inst_DS__DS_SUB_RTN_U64

    Inst_DS__DS_SUB_RTN_U64::~Inst_DS__DS_SUB_RTN_U64()
    {
    } // ~Inst_DS__DS_SUB_RTN_U64

    // tmp = MEM[ADDR];
    // MEM[ADDR] -= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_SUB_RTN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_rsub_rtn_u64" to Inst_DS; sets no flags.
    Inst_DS__DS_RSUB_RTN_U64::Inst_DS__DS_RSUB_RTN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_rsub_rtn_u64")
    {
    } // Inst_DS__DS_RSUB_RTN_U64

    Inst_DS__DS_RSUB_RTN_U64::~Inst_DS__DS_RSUB_RTN_U64()
    {
    } // ~Inst_DS__DS_RSUB_RTN_U64

    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA - MEM[ADDR];
    // RETURN_DATA = tmp.
    // Subtraction with reversed operands.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_RSUB_RTN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_inc_rtn_u64" to Inst_DS; sets no flags.
    Inst_DS__DS_INC_RTN_U64::Inst_DS__DS_INC_RTN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_inc_rtn_u64")
    {
    } // Inst_DS__DS_INC_RTN_U64

    Inst_DS__DS_INC_RTN_U64::~Inst_DS__DS_INC_RTN_U64()
    {
    } // ~Inst_DS__DS_INC_RTN_U64

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_INC_RTN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_dec_rtn_u64" to Inst_DS; sets no flags.
    Inst_DS__DS_DEC_RTN_U64::Inst_DS__DS_DEC_RTN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_dec_rtn_u64")
    {
    } // Inst_DS__DS_DEC_RTN_U64

    Inst_DS__DS_DEC_RTN_U64::~Inst_DS__DS_DEC_RTN_U64()
    {
    } // ~Inst_DS__DS_DEC_RTN_U64

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1
    // (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_DEC_RTN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_min_rtn_i64" to Inst_DS; sets no flags.
    Inst_DS__DS_MIN_RTN_I64::Inst_DS__DS_MIN_RTN_I64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_rtn_i64")
    {
    } // Inst_DS__DS_MIN_RTN_I64

    Inst_DS__DS_MIN_RTN_I64::~Inst_DS__DS_MIN_RTN_I64()
    {
    } // ~Inst_DS__DS_MIN_RTN_I64

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare);
    // RETURN_DATA[0:1] = tmp.
    // NOTE(review): "=" assumed for the store (a min selects a value rather
    // than subtracting one) -- confirm against the GCN3 ISA manual.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_MIN_RTN_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_max_rtn_i64" to Inst_DS; sets no flags.
    Inst_DS__DS_MAX_RTN_I64::Inst_DS__DS_MAX_RTN_I64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_rtn_i64")
    {
    } // Inst_DS__DS_MAX_RTN_I64

    Inst_DS__DS_MAX_RTN_I64::~Inst_DS__DS_MAX_RTN_I64()
    {
    } // ~Inst_DS__DS_MAX_RTN_I64

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare);
    // RETURN_DATA[0:1] = tmp.
    // NOTE(review): "=" assumed for the store (a max selects a value rather
    // than subtracting one) -- confirm against the GCN3 ISA manual.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_MAX_RTN_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_min_rtn_u64" to Inst_DS; sets no flags.
    Inst_DS__DS_MIN_RTN_U64::Inst_DS__DS_MIN_RTN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_rtn_u64")
    {
    } // Inst_DS__DS_MIN_RTN_U64

    Inst_DS__DS_MIN_RTN_U64::~Inst_DS__DS_MIN_RTN_U64()
    {
    } // ~Inst_DS__DS_MIN_RTN_U64

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    // NOTE(review): "=" assumed for the store (a min selects a value rather
    // than subtracting one) -- confirm against the GCN3 ISA manual.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_MIN_RTN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_max_rtn_u64" to Inst_DS; sets no flags.
    Inst_DS__DS_MAX_RTN_U64::Inst_DS__DS_MAX_RTN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_rtn_u64")
    {
    } // Inst_DS__DS_MAX_RTN_U64

    Inst_DS__DS_MAX_RTN_U64::~Inst_DS__DS_MAX_RTN_U64()
    {
    } // ~Inst_DS__DS_MAX_RTN_U64

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    // NOTE(review): "=" assumed for the store (a max selects a value rather
    // than subtracting one) -- confirm against the GCN3 ISA manual.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_MAX_RTN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_and_rtn_b64" to Inst_DS; sets no flags.
    Inst_DS__DS_AND_RTN_B64::Inst_DS__DS_AND_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_and_rtn_b64")
    {
    } // Inst_DS__DS_AND_RTN_B64

    Inst_DS__DS_AND_RTN_B64::~Inst_DS__DS_AND_RTN_B64()
    {
    } // ~Inst_DS__DS_AND_RTN_B64

    // tmp = MEM[ADDR];
    // MEM[ADDR] &= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_AND_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_or_rtn_b64" to Inst_DS; sets no flags.
    Inst_DS__DS_OR_RTN_B64::Inst_DS__DS_OR_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_or_rtn_b64")
    {
    } // Inst_DS__DS_OR_RTN_B64

    Inst_DS__DS_OR_RTN_B64::~Inst_DS__DS_OR_RTN_B64()
    {
    } // ~Inst_DS__DS_OR_RTN_B64

    // tmp = MEM[ADDR];
    // MEM[ADDR] |= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_OR_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_xor_rtn_b64" to Inst_DS; sets no flags.
    Inst_DS__DS_XOR_RTN_B64::Inst_DS__DS_XOR_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_xor_rtn_b64")
    {
    } // Inst_DS__DS_XOR_RTN_B64

    Inst_DS__DS_XOR_RTN_B64::~Inst_DS__DS_XOR_RTN_B64()
    {
    } // ~Inst_DS__DS_XOR_RTN_B64

    // tmp = MEM[ADDR];
    // MEM[ADDR] ^= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_XOR_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_mskor_rtn_b64" to Inst_DS; sets no flags.
    Inst_DS__DS_MSKOR_RTN_B64::Inst_DS__DS_MSKOR_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_mskor_rtn_b64")
    {
    } // Inst_DS__DS_MSKOR_RTN_B64

    Inst_DS__DS_MSKOR_RTN_B64::~Inst_DS__DS_MSKOR_RTN_B64()
    {
    } // ~Inst_DS__DS_MSKOR_RTN_B64

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (MEM[ADDR] & ~DATA) | DATA2;
    // RETURN_DATA = tmp.
    // Masked dword OR, D0 contains the mask and D1 contains the new value.
    // NOTE(review): "dword" may be carried over from the 32-bit variant;
    // the _b64 suffix suggests a qword operand -- confirm with the ISA
    // manual.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_MSKOR_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_wrxchg_rtn_b64" to Inst_DS; sets no flags.
    Inst_DS__DS_WRXCHG_RTN_B64::Inst_DS__DS_WRXCHG_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrxchg_rtn_b64")
    {
    } // Inst_DS__DS_WRXCHG_RTN_B64

    Inst_DS__DS_WRXCHG_RTN_B64::~Inst_DS__DS_WRXCHG_RTN_B64()
    {
    } // ~Inst_DS__DS_WRXCHG_RTN_B64

    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA;
    // RETURN_DATA = tmp.
    // Write-exchange operation.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_WRXCHG_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_wrxchg2_rtn_b64" to Inst_DS; sets no
    // flags.
    Inst_DS__DS_WRXCHG2_RTN_B64::Inst_DS__DS_WRXCHG2_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrxchg2_rtn_b64")
    {
    } // Inst_DS__DS_WRXCHG2_RTN_B64

    Inst_DS__DS_WRXCHG2_RTN_B64::~Inst_DS__DS_WRXCHG2_RTN_B64()
    {
    } // ~Inst_DS__DS_WRXCHG2_RTN_B64

    // Write-exchange 2 separate qwords.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_WRXCHG2_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_wrxchg2st64_rtn_b64" to Inst_DS; sets no
    // flags.
    Inst_DS__DS_WRXCHG2ST64_RTN_B64::Inst_DS__DS_WRXCHG2ST64_RTN_B64(
          InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrxchg2st64_rtn_b64")
    {
    } // Inst_DS__DS_WRXCHG2ST64_RTN_B64

    Inst_DS__DS_WRXCHG2ST64_RTN_B64::~Inst_DS__DS_WRXCHG2ST64_RTN_B64()
    {
    } // ~Inst_DS__DS_WRXCHG2ST64_RTN_B64

    // Write-exchange 2 qwords with a stride of 64 qwords.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_WRXCHG2ST64_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_cmpst_rtn_b64" to Inst_DS; sets no flags.
    Inst_DS__DS_CMPST_RTN_B64::Inst_DS__DS_CMPST_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_cmpst_rtn_b64")
    {
    } // Inst_DS__DS_CMPST_RTN_B64

    Inst_DS__DS_CMPST_RTN_B64::~Inst_DS__DS_CMPST_RTN_B64()
    {
    } // ~Inst_DS__DS_CMPST_RTN_B64

    // tmp = MEM[ADDR];
    // src = DATA2;
    // cmp = DATA;
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    // Compare and store.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_CMPST_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_cmpst_rtn_f64" to Inst_DS, then sets the
    // F64 flag.
    Inst_DS__DS_CMPST_RTN_F64::Inst_DS__DS_CMPST_RTN_F64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_cmpst_rtn_f64")
    {
        setFlag(F64);
    } // Inst_DS__DS_CMPST_RTN_F64

    Inst_DS__DS_CMPST_RTN_F64::~Inst_DS__DS_CMPST_RTN_F64()
    {
    } // ~Inst_DS__DS_CMPST_RTN_F64

    // tmp = MEM[ADDR];
    // src = DATA2;
    // cmp = DATA;
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_CMPST_RTN_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_min_rtn_f64" to Inst_DS, then sets the
    // F64 flag.
    Inst_DS__DS_MIN_RTN_F64::Inst_DS__DS_MIN_RTN_F64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_rtn_f64")
    {
        setFlag(F64);
    } // Inst_DS__DS_MIN_RTN_F64

    Inst_DS__DS_MIN_RTN_F64::~Inst_DS__DS_MIN_RTN_F64()
    {
    } // ~Inst_DS__DS_MIN_RTN_F64

    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (cmp < tmp) ? src : tmp.
    // NOTE(review): no RETURN_DATA line is given for this _rtn_ form;
    // presumably it returns tmp like its siblings -- confirm.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_MIN_RTN_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_max_rtn_f64" to Inst_DS, then sets the
    // F64 flag.
    Inst_DS__DS_MAX_RTN_F64::Inst_DS__DS_MAX_RTN_F64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_rtn_f64")
    {
        setFlag(F64);
    } // Inst_DS__DS_MAX_RTN_F64

    Inst_DS__DS_MAX_RTN_F64::~Inst_DS__DS_MAX_RTN_F64()
    {
    } // ~Inst_DS__DS_MAX_RTN_F64

    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (tmp > cmp) ? src : tmp.
    // NOTE(review): no RETURN_DATA line is given for this _rtn_ form;
    // presumably it returns tmp like its siblings -- confirm.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_MAX_RTN_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_read_b64" to Inst_DS, then sets the
    // MemoryRef and Load flags.
    Inst_DS__DS_READ_B64::Inst_DS__DS_READ_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_read_b64")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_DS__DS_READ_B64

    // Empty destructor body; nothing is released here.
    Inst_DS__DS_READ_B64::~Inst_DS__DS_READ_B64()
    {
    } // ~Inst_DS__DS_READ_B64

    // RETURN_DATA = MEM[ADDR].
    // Read 1 qword.
    //
    // Copies the wavefront's execution unit ID and exec mask into the
    // dynamic instruction, sets a latency of 24 cycles (converted to
    // ticks), reads the ADDR operand and passes it to calcAddr, issues the
    // request to the local memory pipeline and finally updates the
    // wavefront's local-memory read counters.
    void
    Inst_DS__DS_READ_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        const auto latencyTicks =
            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24));
        gpuDynInst->latency.set(latencyTicks);

        ConstVecOperandU32 ldsAddr(gpuDynInst, extData.ADDR);
        ldsAddr.read();
        calcAddr(gpuDynInst, ldsAddr);

        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);

        // The request has left the pipe and is now counted as outstanding.
        --wave->rdLmReqsInPipe;
        ++wave->outstandingReqsRdLm;
        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    }

    // Starts the read for ds_read_b64. The two instruction offset fields
    // are combined into one offset, with OFFSET1 shifted left by 8 bits
    // and OR-ed with OFFSET0, and passed to initMemRead<VecElemU64>.
    void
    Inst_DS__DS_READ_B64::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        const Addr lowPart = instData.OFFSET0;
        const Addr highPart = instData.OFFSET1;

        initMemRead<VecElemU64>(gpuDynInst, (highPart << 8) | lowPart);
    } // initiateAcc

    // Finishes ds_read_b64: for every lane enabled in the instruction's
    // exec mask, element [lane] of d_data (viewed as VecElemU64) is copied
    // into VDST; VDST is then written back.
    void
    Inst_DS__DS_READ_B64::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU64 vdst(gpuDynInst, extData.VDST);
        const VecElemU64 *loaded =
            reinterpret_cast<VecElemU64*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            vdst[lane] = loaded[lane];
        }

        vdst.write();
    } // completeAcc

    // Hands iFmt and the name "ds_read2_b64" to Inst_DS, then sets the
    // MemoryRef and Load flags.
    Inst_DS__DS_READ2_B64::Inst_DS__DS_READ2_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_read2_b64")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_DS__DS_READ2_B64

    // Empty destructor body; nothing is released here.
    Inst_DS__DS_READ2_B64::~Inst_DS__DS_READ2_B64()
    {
    } // ~Inst_DS__DS_READ2_B64

    // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 8];
    // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 8].
    // Read 2 qwords.
    //
    // Copies the wavefront's execution unit ID and exec mask into the
    // dynamic instruction, sets a latency of 24 cycles (converted to
    // ticks), reads the ADDR operand and passes it to calcAddr, issues the
    // request to the local memory pipeline and finally updates the
    // wavefront's local-memory read counters.
    void
    Inst_DS__DS_READ2_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        const auto latencyTicks =
            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24));
        gpuDynInst->latency.set(latencyTicks);

        ConstVecOperandU32 ldsAddr(gpuDynInst, extData.ADDR);
        ldsAddr.read();
        calcAddr(gpuDynInst, ldsAddr);

        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);

        // The request has left the pipe and is now counted as outstanding.
        --wave->rdLmReqsInPipe;
        ++wave->outstandingReqsRdLm;
        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    }

    // Starts the dual read for ds_read2_b64: both instruction offset
    // fields are multiplied by 8 and passed, in order, to
    // initDualMemRead<VecElemU64>.
    void
    Inst_DS__DS_READ2_B64::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        const Addr firstByteOff = instData.OFFSET0 * 8;
        const Addr secondByteOff = instData.OFFSET1 * 8;
        initDualMemRead<VecElemU64>(gpuDynInst, firstByteOff,
                                    secondByteOff);
    } // initiateAcc

    // Finishes ds_read2_b64. d_data is viewed as VecElemU64 with two
    // elements per lane: element [2 * lane] goes to the operand at VDST and
    // element [2 * lane + 1] to the operand at VDST + 2. Only lanes enabled
    // in the instruction's exec mask are copied; both operands are then
    // written back.
    void
    Inst_DS__DS_READ2_B64::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU64 vdst0(gpuDynInst, extData.VDST);
        VecOperandU64 vdst1(gpuDynInst, extData.VDST + 2);
        const VecElemU64 *loaded =
            reinterpret_cast<VecElemU64*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            const int slot = lane * 2;
            vdst0[lane] = loaded[slot];
            vdst1[lane] = loaded[slot + 1];
        }

        vdst0.write();
        vdst1.write();
    } // completeAcc

    // Hands iFmt and the name "ds_read2st64_b64" to Inst_DS, then sets the
    // MemoryRef and Load flags.
    Inst_DS__DS_READ2ST64_B64::Inst_DS__DS_READ2ST64_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_read2st64_b64")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_DS__DS_READ2ST64_B64

    // Empty destructor body; nothing is released here.
    Inst_DS__DS_READ2ST64_B64::~Inst_DS__DS_READ2ST64_B64()
    {
    } // ~Inst_DS__DS_READ2ST64_B64

    // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 8 * 64];
    // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 8 * 64].
    // Read 2 qwords.
    //
    // Copies the wavefront's execution unit ID and exec mask into the
    // dynamic instruction, sets a latency of 24 cycles (converted to
    // ticks), reads the ADDR operand and passes it to calcAddr, issues the
    // request to the local memory pipeline and finally updates the
    // wavefront's local-memory read counters.
    void
    Inst_DS__DS_READ2ST64_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        const auto latencyTicks =
            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24));
        gpuDynInst->latency.set(latencyTicks);

        ConstVecOperandU32 ldsAddr(gpuDynInst, extData.ADDR);
        ldsAddr.read();
        calcAddr(gpuDynInst, ldsAddr);

        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);

        // The request has left the pipe and is now counted as outstanding.
        --wave->rdLmReqsInPipe;
        ++wave->outstandingReqsRdLm;
        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    }

    // Starts the dual read for ds_read2st64_b64: both instruction offset
    // fields are multiplied by 8 and then by 64 and passed, in order, to
    // initDualMemRead<VecElemU64>.
    void
    Inst_DS__DS_READ2ST64_B64::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        constexpr int elemBytes = 8;
        constexpr int strideElems = 64;

        const Addr firstByteOff = instData.OFFSET0 * elemBytes * strideElems;
        const Addr secondByteOff = instData.OFFSET1 * elemBytes * strideElems;

        initDualMemRead<VecElemU64>(gpuDynInst, firstByteOff,
                                    secondByteOff);
    }

    // Finishes ds_read2st64_b64. d_data is viewed as VecElemU64 with two
    // elements per lane: element [2 * lane] goes to the operand at VDST and
    // element [2 * lane + 1] to the operand at VDST + 2. Only lanes enabled
    // in the instruction's exec mask are copied; both operands are then
    // written back.
    void
    Inst_DS__DS_READ2ST64_B64::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU64 vdst0(gpuDynInst, extData.VDST);
        VecOperandU64 vdst1(gpuDynInst, extData.VDST + 2);
        const VecElemU64 *loaded =
            reinterpret_cast<VecElemU64*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            const int slot = lane * 2;
            vdst0[lane] = loaded[slot];
            vdst1[lane] = loaded[slot + 1];
        }

        vdst0.write();
        vdst1.write();
    }

    // Hands iFmt and the name "ds_condxchg32_rtn_b64" to Inst_DS; sets no
    // flags.
    Inst_DS__DS_CONDXCHG32_RTN_B64::Inst_DS__DS_CONDXCHG32_RTN_B64(
          InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_condxchg32_rtn_b64")
    {
    } // Inst_DS__DS_CONDXCHG32_RTN_B64

    Inst_DS__DS_CONDXCHG32_RTN_B64::~Inst_DS__DS_CONDXCHG32_RTN_B64()
    {
    } // ~Inst_DS__DS_CONDXCHG32_RTN_B64

    // Conditional write exchange.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_CONDXCHG32_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_add_src2_u32" to Inst_DS; sets no flags.
    Inst_DS__DS_ADD_SRC2_U32::Inst_DS__DS_ADD_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_src2_u32")
    {
    } // Inst_DS__DS_ADD_SRC2_U32

    Inst_DS__DS_ADD_SRC2_U32::~Inst_DS__DS_ADD_SRC2_U32()
    {
    } // ~Inst_DS__DS_ADD_SRC2_U32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] + MEM[B].
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_ADD_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_sub_src2_u32" to Inst_DS; sets no flags.
    Inst_DS__DS_SUB_SRC2_U32::Inst_DS__DS_SUB_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_sub_src2_u32")
    {
    } // Inst_DS__DS_SUB_SRC2_U32

    Inst_DS__DS_SUB_SRC2_U32::~Inst_DS__DS_SUB_SRC2_U32()
    {
    } // ~Inst_DS__DS_SUB_SRC2_U32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] - MEM[B].
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_SUB_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_rsub_src2_u32" to Inst_DS; sets no flags.
    Inst_DS__DS_RSUB_SRC2_U32::Inst_DS__DS_RSUB_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_rsub_src2_u32")
    {
    } // Inst_DS__DS_RSUB_SRC2_U32

    Inst_DS__DS_RSUB_SRC2_U32::~Inst_DS__DS_RSUB_SRC2_U32()
    {
    } // ~Inst_DS__DS_RSUB_SRC2_U32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[B] - MEM[A].
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_RSUB_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_inc_src2_u32" to Inst_DS; sets no flags.
    Inst_DS__DS_INC_SRC2_U32::Inst_DS__DS_INC_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_inc_src2_u32")
    {
    } // Inst_DS__DS_INC_SRC2_U32

    Inst_DS__DS_INC_SRC2_U32::~Inst_DS__DS_INC_SRC2_U32()
    {
    } // ~Inst_DS__DS_INC_SRC2_U32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[A] >= MEM[B] ? 0 : MEM[A] + 1).
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_INC_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_dec_src2_u32" to Inst_DS; sets no flags.
    Inst_DS__DS_DEC_SRC2_U32::Inst_DS__DS_DEC_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_dec_src2_u32")
    {
    } // Inst_DS__DS_DEC_SRC2_U32

    Inst_DS__DS_DEC_SRC2_U32::~Inst_DS__DS_DEC_SRC2_U32()
    {
    } // ~Inst_DS__DS_DEC_SRC2_U32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[A] == 0 || MEM[A] > MEM[B] ? MEM[B] : MEM[A] - 1).
    // Uint decrement.
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_DEC_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_min_src2_i32" to Inst_DS; sets no flags.
    Inst_DS__DS_MIN_SRC2_I32::Inst_DS__DS_MIN_SRC2_I32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_src2_i32")
    {
    } // Inst_DS__DS_MIN_SRC2_I32

    Inst_DS__DS_MIN_SRC2_I32::~Inst_DS__DS_MIN_SRC2_I32()
    {
    } // ~Inst_DS__DS_MIN_SRC2_I32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = min(MEM[A], MEM[B]).
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_MIN_SRC2_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_max_src2_i32" to Inst_DS; sets no flags.
    Inst_DS__DS_MAX_SRC2_I32::Inst_DS__DS_MAX_SRC2_I32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_src2_i32")
    {
    } // Inst_DS__DS_MAX_SRC2_I32

    Inst_DS__DS_MAX_SRC2_I32::~Inst_DS__DS_MAX_SRC2_I32()
    {
    } // ~Inst_DS__DS_MAX_SRC2_I32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = max(MEM[A], MEM[B]).
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_MAX_SRC2_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_min_src2_u32" to Inst_DS; sets no flags.
    Inst_DS__DS_MIN_SRC2_U32::Inst_DS__DS_MIN_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_src2_u32")
    {
    } // Inst_DS__DS_MIN_SRC2_U32

    Inst_DS__DS_MIN_SRC2_U32::~Inst_DS__DS_MIN_SRC2_U32()
    {
    } // ~Inst_DS__DS_MIN_SRC2_U32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = min(MEM[A], MEM[B]).
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_MIN_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_max_src2_u32" to Inst_DS; sets no flags.
    Inst_DS__DS_MAX_SRC2_U32::Inst_DS__DS_MAX_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_src2_u32")
    {
    } // Inst_DS__DS_MAX_SRC2_U32

    Inst_DS__DS_MAX_SRC2_U32::~Inst_DS__DS_MAX_SRC2_U32()
    {
    } // ~Inst_DS__DS_MAX_SRC2_U32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = max(MEM[A], MEM[B]).
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_MAX_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_and_src2_b32" to Inst_DS; sets no flags.
    Inst_DS__DS_AND_SRC2_B32::Inst_DS__DS_AND_SRC2_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_and_src2_b32")
    {
    } // Inst_DS__DS_AND_SRC2_B32

    Inst_DS__DS_AND_SRC2_B32::~Inst_DS__DS_AND_SRC2_B32()
    {
    } // ~Inst_DS__DS_AND_SRC2_B32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] & MEM[B].
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_AND_SRC2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_or_src2_b32" to Inst_DS; sets no flags.
    Inst_DS__DS_OR_SRC2_B32::Inst_DS__DS_OR_SRC2_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_or_src2_b32")
    {
    } // Inst_DS__DS_OR_SRC2_B32

    Inst_DS__DS_OR_SRC2_B32::~Inst_DS__DS_OR_SRC2_B32()
    {
    } // ~Inst_DS__DS_OR_SRC2_B32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] | MEM[B].
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_OR_SRC2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_xor_src2_b32" to Inst_DS; sets no flags.
    Inst_DS__DS_XOR_SRC2_B32::Inst_DS__DS_XOR_SRC2_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_xor_src2_b32")
    {
    } // Inst_DS__DS_XOR_SRC2_B32

    Inst_DS__DS_XOR_SRC2_B32::~Inst_DS__DS_XOR_SRC2_B32()
    {
    } // ~Inst_DS__DS_XOR_SRC2_B32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] ^ MEM[B].
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_XOR_SRC2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_write_src2_b32" to Inst_DS, then sets the
    // MemoryRef and Store flags.
    Inst_DS__DS_WRITE_SRC2_B32::Inst_DS__DS_WRITE_SRC2_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_write_src2_b32")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_DS__DS_WRITE_SRC2_B32

    Inst_DS__DS_WRITE_SRC2_B32::~Inst_DS__DS_WRITE_SRC2_B32()
    {
    } // ~Inst_DS__DS_WRITE_SRC2_B32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[B].
    // Write dword.
    // Not modeled here: execute() just calls panicUnimplemented(), even
    // though the constructor flags the instruction as a store.
    void
    Inst_DS__DS_WRITE_SRC2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_min_src2_f32" to Inst_DS, then sets the
    // F32 flag.
    Inst_DS__DS_MIN_SRC2_F32::Inst_DS__DS_MIN_SRC2_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_src2_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_MIN_SRC2_F32

    Inst_DS__DS_MIN_SRC2_F32::~Inst_DS__DS_MIN_SRC2_F32()
    {
    } // ~Inst_DS__DS_MIN_SRC2_F32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[B] < MEM[A]) ? MEM[B] : MEM[A].
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_MIN_SRC2_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_max_src2_f32" to Inst_DS, then sets the
    // F32 flag.
    Inst_DS__DS_MAX_SRC2_F32::Inst_DS__DS_MAX_SRC2_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_src2_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_MAX_SRC2_F32

    Inst_DS__DS_MAX_SRC2_F32::~Inst_DS__DS_MAX_SRC2_F32()
    {
    } // ~Inst_DS__DS_MAX_SRC2_F32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[B] > MEM[A]) ? MEM[B] : MEM[A].
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_MAX_SRC2_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_add_src2_f32" to Inst_DS, then sets the
    // F32 flag.
    Inst_DS__DS_ADD_SRC2_F32::Inst_DS__DS_ADD_SRC2_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_src2_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_ADD_SRC2_F32

    Inst_DS__DS_ADD_SRC2_F32::~Inst_DS__DS_ADD_SRC2_F32()
    {
    } // ~Inst_DS__DS_ADD_SRC2_F32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[B] + MEM[A].
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_ADD_SRC2_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_gws_sema_release_all" to Inst_DS; sets no
    // flags.
    Inst_DS__DS_GWS_SEMA_RELEASE_ALL::Inst_DS__DS_GWS_SEMA_RELEASE_ALL(
          InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_gws_sema_release_all")
    {
    } // Inst_DS__DS_GWS_SEMA_RELEASE_ALL

    Inst_DS__DS_GWS_SEMA_RELEASE_ALL::~Inst_DS__DS_GWS_SEMA_RELEASE_ALL()
    {
    } // ~Inst_DS__DS_GWS_SEMA_RELEASE_ALL

    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_GWS_SEMA_RELEASE_ALL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_gws_init" to Inst_DS; sets no flags.
    Inst_DS__DS_GWS_INIT::Inst_DS__DS_GWS_INIT(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_gws_init")
    {
    } // Inst_DS__DS_GWS_INIT

    Inst_DS__DS_GWS_INIT::~Inst_DS__DS_GWS_INIT()
    {
    } // ~Inst_DS__DS_GWS_INIT

    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_GWS_INIT::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_gws_sema_v" to Inst_DS; sets no flags.
    Inst_DS__DS_GWS_SEMA_V::Inst_DS__DS_GWS_SEMA_V(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_gws_sema_v")
    {
    } // Inst_DS__DS_GWS_SEMA_V

    Inst_DS__DS_GWS_SEMA_V::~Inst_DS__DS_GWS_SEMA_V()
    {
    } // ~Inst_DS__DS_GWS_SEMA_V

    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_GWS_SEMA_V::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_gws_sema_br" to Inst_DS; sets no flags.
    Inst_DS__DS_GWS_SEMA_BR::Inst_DS__DS_GWS_SEMA_BR(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_gws_sema_br")
    {
    } // Inst_DS__DS_GWS_SEMA_BR

    Inst_DS__DS_GWS_SEMA_BR::~Inst_DS__DS_GWS_SEMA_BR()
    {
    } // ~Inst_DS__DS_GWS_SEMA_BR

    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_GWS_SEMA_BR::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_gws_sema_p" to Inst_DS; sets no flags.
    Inst_DS__DS_GWS_SEMA_P::Inst_DS__DS_GWS_SEMA_P(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_gws_sema_p")
    {
    } // Inst_DS__DS_GWS_SEMA_P

    Inst_DS__DS_GWS_SEMA_P::~Inst_DS__DS_GWS_SEMA_P()
    {
    } // ~Inst_DS__DS_GWS_SEMA_P

    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_GWS_SEMA_P::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_gws_barrier" to Inst_DS; sets no flags.
    Inst_DS__DS_GWS_BARRIER::Inst_DS__DS_GWS_BARRIER(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_gws_barrier")
    {
    } // Inst_DS__DS_GWS_BARRIER

    Inst_DS__DS_GWS_BARRIER::~Inst_DS__DS_GWS_BARRIER()
    {
    } // ~Inst_DS__DS_GWS_BARRIER

    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_GWS_BARRIER::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_consume" to Inst_DS; sets no flags.
    Inst_DS__DS_CONSUME::Inst_DS__DS_CONSUME(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_consume")
    {
    } // Inst_DS__DS_CONSUME

    Inst_DS__DS_CONSUME::~Inst_DS__DS_CONSUME()
    {
    } // ~Inst_DS__DS_CONSUME

    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_CONSUME::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_append" to Inst_DS; sets no flags.
    Inst_DS__DS_APPEND::Inst_DS__DS_APPEND(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_append")
    {
    } // Inst_DS__DS_APPEND

    Inst_DS__DS_APPEND::~Inst_DS__DS_APPEND()
    {
    } // ~Inst_DS__DS_APPEND

    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_APPEND::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_ordered_count" to Inst_DS; sets no flags.
    Inst_DS__DS_ORDERED_COUNT::Inst_DS__DS_ORDERED_COUNT(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_ordered_count")
    {
    } // Inst_DS__DS_ORDERED_COUNT

    Inst_DS__DS_ORDERED_COUNT::~Inst_DS__DS_ORDERED_COUNT()
    {
    } // ~Inst_DS__DS_ORDERED_COUNT

    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_ORDERED_COUNT::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_add_src2_u64" to Inst_DS; sets no flags.
    Inst_DS__DS_ADD_SRC2_U64::Inst_DS__DS_ADD_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_src2_u64")
    {
    } // Inst_DS__DS_ADD_SRC2_U64

    Inst_DS__DS_ADD_SRC2_U64::~Inst_DS__DS_ADD_SRC2_U64()
    {
    } // ~Inst_DS__DS_ADD_SRC2_U64

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] + MEM[B].
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_ADD_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_sub_src2_u64" to Inst_DS; sets no flags.
    Inst_DS__DS_SUB_SRC2_U64::Inst_DS__DS_SUB_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_sub_src2_u64")
    {
    } // Inst_DS__DS_SUB_SRC2_U64

    Inst_DS__DS_SUB_SRC2_U64::~Inst_DS__DS_SUB_SRC2_U64()
    {
    } // ~Inst_DS__DS_SUB_SRC2_U64

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] - MEM[B].
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_SUB_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_rsub_src2_u64" to Inst_DS; sets no flags.
    Inst_DS__DS_RSUB_SRC2_U64::Inst_DS__DS_RSUB_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_rsub_src2_u64")
    {
    } // Inst_DS__DS_RSUB_SRC2_U64

    Inst_DS__DS_RSUB_SRC2_U64::~Inst_DS__DS_RSUB_SRC2_U64()
    {
    } // ~Inst_DS__DS_RSUB_SRC2_U64

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[B] - MEM[A].
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_RSUB_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Hands iFmt and the name "ds_inc_src2_u64" to Inst_DS; sets no flags.
    Inst_DS__DS_INC_SRC2_U64::Inst_DS__DS_INC_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_inc_src2_u64")
    {
    } // Inst_DS__DS_INC_SRC2_U64

    Inst_DS__DS_INC_SRC2_U64::~Inst_DS__DS_INC_SRC2_U64()
    {
    } // ~Inst_DS__DS_INC_SRC2_U64

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[A] >= MEM[B] ? 0 : MEM[A] + 1).
    // Not modeled here: execute() just calls panicUnimplemented().
    void
    Inst_DS__DS_INC_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_DEC_SRC2_U64::Inst_DS__DS_DEC_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_dec_src2_u64")
    {
    } // Inst_DS__DS_DEC_SRC2_U64

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_DS__DS_DEC_SRC2_U64::~Inst_DS__DS_DEC_SRC2_U64() = default;

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[A] == 0 || MEM[A] > MEM[B] ? MEM[B] : MEM[A] - 1).
    // Uint decrement.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_DS__DS_DEC_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs ds_min_src2_i64: forwards iFmt and the mnemonic to
    // Inst_DS and sets no instruction flags.
    Inst_DS__DS_MIN_SRC2_I64::Inst_DS__DS_MIN_SRC2_I64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_src2_i64")
    {
    } // Inst_DS__DS_MIN_SRC2_I64

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_DS__DS_MIN_SRC2_I64::~Inst_DS__DS_MIN_SRC2_I64() = default;

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = min(MEM[A], MEM[B]).
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_DS__DS_MIN_SRC2_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs ds_max_src2_i64: forwards iFmt and the mnemonic to
    // Inst_DS and sets no instruction flags.
    Inst_DS__DS_MAX_SRC2_I64::Inst_DS__DS_MAX_SRC2_I64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_src2_i64")
    {
    } // Inst_DS__DS_MAX_SRC2_I64

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_DS__DS_MAX_SRC2_I64::~Inst_DS__DS_MAX_SRC2_I64() = default;

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = max(MEM[A], MEM[B]).
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_DS__DS_MAX_SRC2_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs ds_min_src2_u64: forwards iFmt and the mnemonic to
    // Inst_DS and sets no instruction flags.
    Inst_DS__DS_MIN_SRC2_U64::Inst_DS__DS_MIN_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_src2_u64")
    {
    } // Inst_DS__DS_MIN_SRC2_U64

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_DS__DS_MIN_SRC2_U64::~Inst_DS__DS_MIN_SRC2_U64() = default;

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = min(MEM[A], MEM[B]).
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_DS__DS_MIN_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs ds_max_src2_u64: forwards iFmt and the mnemonic to
    // Inst_DS and sets no instruction flags.
    Inst_DS__DS_MAX_SRC2_U64::Inst_DS__DS_MAX_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_src2_u64")
    {
    } // Inst_DS__DS_MAX_SRC2_U64

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_DS__DS_MAX_SRC2_U64::~Inst_DS__DS_MAX_SRC2_U64() = default;

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = max(MEM[A], MEM[B]).
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_DS__DS_MAX_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs ds_and_src2_b64: forwards iFmt and the mnemonic to
    // Inst_DS and sets no instruction flags.
    Inst_DS__DS_AND_SRC2_B64::Inst_DS__DS_AND_SRC2_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_and_src2_b64")
    {
    } // Inst_DS__DS_AND_SRC2_B64

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_DS__DS_AND_SRC2_B64::~Inst_DS__DS_AND_SRC2_B64() = default;

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] & MEM[B].
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_DS__DS_AND_SRC2_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs ds_or_src2_b64: forwards iFmt and the mnemonic to
    // Inst_DS and sets no instruction flags.
    Inst_DS__DS_OR_SRC2_B64::Inst_DS__DS_OR_SRC2_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_or_src2_b64")
    {
    } // Inst_DS__DS_OR_SRC2_B64

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_DS__DS_OR_SRC2_B64::~Inst_DS__DS_OR_SRC2_B64() = default;

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] | MEM[B].
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_DS__DS_OR_SRC2_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs ds_xor_src2_b64: forwards iFmt and the mnemonic to
    // Inst_DS and sets no instruction flags.
    Inst_DS__DS_XOR_SRC2_B64::Inst_DS__DS_XOR_SRC2_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_xor_src2_b64")
    {
    } // Inst_DS__DS_XOR_SRC2_B64

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_DS__DS_XOR_SRC2_B64::~Inst_DS__DS_XOR_SRC2_B64() = default;

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] ^ MEM[B].
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_DS__DS_XOR_SRC2_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs ds_write_src2_b64: forwards iFmt and the mnemonic to
    // Inst_DS, then sets the MemoryRef and Store flags.
    Inst_DS__DS_WRITE_SRC2_B64::Inst_DS__DS_WRITE_SRC2_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_write_src2_b64")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_DS__DS_WRITE_SRC2_B64

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_DS__DS_WRITE_SRC2_B64::~Inst_DS__DS_WRITE_SRC2_B64() = default;

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[B].
    // Write qword.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_DS__DS_WRITE_SRC2_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs ds_min_src2_f64: forwards iFmt and the mnemonic to
    // Inst_DS, then sets the F64 flag.
    Inst_DS__DS_MIN_SRC2_F64::Inst_DS__DS_MIN_SRC2_F64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_src2_f64")
    {
        setFlag(F64);
    } // Inst_DS__DS_MIN_SRC2_F64

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_DS__DS_MIN_SRC2_F64::~Inst_DS__DS_MIN_SRC2_F64() = default;

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[B] < MEM[A]) ? MEM[B] : MEM[A].
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_DS__DS_MIN_SRC2_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs ds_max_src2_f64: forwards iFmt and the mnemonic to
    // Inst_DS, then sets the F64 flag.
    Inst_DS__DS_MAX_SRC2_F64::Inst_DS__DS_MAX_SRC2_F64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_src2_f64")
    {
        setFlag(F64);
    } // Inst_DS__DS_MAX_SRC2_F64

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_DS__DS_MAX_SRC2_F64::~Inst_DS__DS_MAX_SRC2_F64() = default;

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[B] > MEM[A]) ? MEM[B] : MEM[A].
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_DS__DS_MAX_SRC2_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs ds_write_b96: forwards iFmt and the mnemonic to Inst_DS,
    // then sets the MemoryRef and Store flags.
    Inst_DS__DS_WRITE_B96::Inst_DS__DS_WRITE_B96(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_write_b96")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_DS__DS_WRITE_B96

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_DS__DS_WRITE_B96::~Inst_DS__DS_WRITE_B96() = default;

    // {MEM[ADDR + 8], MEM[ADDR + 4], MEM[ADDR]} = DATA[95:0].
    // Tri-dword write.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_DS__DS_WRITE_B96::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs ds_write_b128: forwards iFmt and the mnemonic to Inst_DS,
    // then sets the MemoryRef and Store flags.
    Inst_DS__DS_WRITE_B128::Inst_DS__DS_WRITE_B128(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_write_b128")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_DS__DS_WRITE_B128

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_DS__DS_WRITE_B128::~Inst_DS__DS_WRITE_B128() = default;

    // {MEM[ADDR + 12], MEM[ADDR + 8], MEM[ADDR + 4], MEM[ADDR]} = DATA[127:0].
    // Quad-dword (128-bit) write.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_DS__DS_WRITE_B128::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs ds_read_b96: forwards iFmt and the mnemonic to Inst_DS,
    // then sets the MemoryRef and Load flags.
    Inst_DS__DS_READ_B96::Inst_DS__DS_READ_B96(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_read_b96")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_DS__DS_READ_B96

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_DS__DS_READ_B96::~Inst_DS__DS_READ_B96() = default;

    // Tri-dword read.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_DS__DS_READ_B96::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs ds_read_b128: forwards iFmt and the mnemonic to Inst_DS,
    // then sets the MemoryRef and Load flags.
    Inst_DS__DS_READ_B128::Inst_DS__DS_READ_B128(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_read_b128")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_DS__DS_READ_B128

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_DS__DS_READ_B128::~Inst_DS__DS_READ_B128() = default;

    // Quad-dword (128-bit) read.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_DS__DS_READ_B128::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Constructs buffer_load_format_x: forwards iFmt and the mnemonic to
    // Inst_MUBUF, then sets the MemoryRef, Load and GlobalSegment flags.
    Inst_MUBUF__BUFFER_LOAD_FORMAT_X
        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_X(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_format_x")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_X

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_LOAD_FORMAT_X
        ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_X() = default;

    // Untyped buffer load 1 dword with format conversion.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Empty stub: nothing is initiated for this unimplemented opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // Empty stub: nothing is done on completion for this unimplemented
    // opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    // Constructs buffer_load_format_xy: forwards iFmt and the mnemonic to
    // Inst_MUBUF, then sets the MemoryRef, Load and GlobalSegment flags.
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY
        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XY(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_format_xy")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XY

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY
        ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XY() = default;

    // Untyped buffer load 2 dwords with format conversion.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Empty stub: nothing is initiated for this unimplemented opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // Empty stub: nothing is done on completion for this unimplemented
    // opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    // Constructs buffer_load_format_xyz: forwards iFmt and the mnemonic to
    // Inst_MUBUF, then sets the MemoryRef, Load and GlobalSegment flags.
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ
        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_format_xyz")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ
        ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ() = default;

    // Untyped buffer load 3 dwords with format conversion.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Empty stub: nothing is initiated for this unimplemented opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // Empty stub: nothing is done on completion for this unimplemented
    // opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    // Constructs buffer_load_format_xyzw: forwards iFmt and the mnemonic
    // to Inst_MUBUF, then sets the MemoryRef, Load and GlobalSegment flags.
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW
        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_format_xyzw")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW
        ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW() = default;

    // Untyped buffer load 4 dwords with format conversion.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Empty stub: nothing is initiated for this unimplemented opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // Empty stub: nothing is done on completion for this unimplemented
    // opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    // Constructs buffer_store_format_x: forwards iFmt and the mnemonic to
    // Inst_MUBUF, then sets the MemoryRef, Store and GlobalSegment flags.
    Inst_MUBUF__BUFFER_STORE_FORMAT_X
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_X(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_x")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_X

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_STORE_FORMAT_X
        ::~Inst_MUBUF__BUFFER_STORE_FORMAT_X() = default;

    // Untyped buffer store 1 dword with format conversion.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Empty stub: nothing is initiated for this unimplemented opcode.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // Empty stub: nothing is done on completion for this unimplemented
    // opcode.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    // Constructs buffer_store_format_xy: forwards iFmt and the mnemonic to
    // Inst_MUBUF, then sets the MemoryRef, Store and GlobalSegment flags.
    Inst_MUBUF__BUFFER_STORE_FORMAT_XY
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_XY(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_xy")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_XY

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_STORE_FORMAT_XY
        ::~Inst_MUBUF__BUFFER_STORE_FORMAT_XY() = default;

    // Untyped buffer store 2 dwords with format conversion.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Empty stub: nothing is initiated for this unimplemented opcode.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // Empty stub: nothing is done on completion for this unimplemented
    // opcode.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    // Constructs buffer_store_format_xyz: forwards iFmt and the mnemonic
    // to Inst_MUBUF, then sets the MemoryRef, Store and GlobalSegment
    // flags.
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_xyz")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ
        ::~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ() = default;

    // Untyped buffer store 3 dwords with format conversion.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Empty stub: nothing is initiated for this unimplemented opcode.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // Empty stub: nothing is done on completion for this unimplemented
    // opcode.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    // Constructs buffer_store_format_xyzw: forwards iFmt and the mnemonic
    // to Inst_MUBUF, then sets the MemoryRef, Store and GlobalSegment
    // flags.
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_xyzw")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW
        ::~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW() = default;

    // Untyped buffer store 4 dwords with format conversion.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Empty stub: nothing is initiated for this unimplemented opcode.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // Empty stub: nothing is done on completion for this unimplemented
    // opcode.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    // Constructs buffer_load_format_d16_x: forwards iFmt and the mnemonic
    // to Inst_MUBUF, then sets the MemoryRef, Load and GlobalSegment flags.
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X
        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_format_d16_x")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X
        ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X() = default;

    // Untyped buffer load 1 dword with format conversion.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Empty stub: nothing is initiated for this unimplemented opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // Empty stub: nothing is done on completion for this unimplemented
    // opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    // Constructs buffer_load_format_d16_xy: forwards iFmt and the mnemonic
    // to Inst_MUBUF, then sets the MemoryRef, Load and GlobalSegment flags.
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY
        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_format_d16_xy")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY
        ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY() = default;

    // Untyped buffer load 2 dwords with format conversion.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Empty stub: nothing is initiated for this unimplemented opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // Empty stub: nothing is done on completion for this unimplemented
    // opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    }

    // Constructs buffer_load_format_d16_xyz: forwards iFmt and the
    // mnemonic to Inst_MUBUF, then sets the MemoryRef, Load and
    // GlobalSegment flags.
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ
        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_format_d16_xyz")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ
        ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ() = default;

    // Untyped buffer load 3 dwords with format conversion.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Empty stub: nothing is initiated for this unimplemented opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // Empty stub: nothing is done on completion for this unimplemented
    // opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    }

    // Constructs buffer_load_format_d16_xyzw: forwards iFmt and the
    // mnemonic to Inst_MUBUF, then sets the MemoryRef, Load and
    // GlobalSegment flags.
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW
        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_format_d16_xyzw")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW
        ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW() = default;

    // Untyped buffer load 4 dwords with format conversion.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Empty stub: nothing is initiated for this unimplemented opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // Empty stub: nothing is done on completion for this unimplemented
    // opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    }

    // Constructs buffer_store_format_d16_x: forwards iFmt and the mnemonic
    // to Inst_MUBUF, then sets the MemoryRef, Store and GlobalSegment
    // flags.
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_d16_x")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X
        ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X() = default;

    // Untyped buffer store 1 dword with format conversion.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Empty stub: nothing is initiated for this unimplemented opcode.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // Empty stub: nothing is done on completion for this unimplemented
    // opcode.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    }

    // Constructs buffer_store_format_d16_xy: forwards iFmt and the
    // mnemonic to Inst_MUBUF, then sets the MemoryRef, Store and
    // GlobalSegment flags.
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_d16_xy")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY
        ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY() = default;

    // Untyped buffer store 2 dwords with format conversion.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Empty stub: nothing is initiated for this unimplemented opcode.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // Empty stub: nothing is done on completion for this unimplemented
    // opcode.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    }

    // Constructs buffer_store_format_d16_xyz: forwards iFmt and the
    // mnemonic to Inst_MUBUF, then sets the MemoryRef, Store and
    // GlobalSegment flags.
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_d16_xyz")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ
        ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ() = default;

    // Untyped buffer store 3 dwords with format conversion.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Empty stub: nothing is initiated for this unimplemented opcode.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // Empty stub: nothing is done on completion for this unimplemented
    // opcode.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    }

    // Constructs buffer_store_format_d16_xyzw: forwards iFmt and the
    // mnemonic to Inst_MUBUF, then sets the MemoryRef, Store and
    // GlobalSegment flags.
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_d16_xyzw")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW
        ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW() = default;

    // Untyped buffer store 4 dwords with format conversion.
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Empty stub: nothing is initiated for this unimplemented opcode.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // Empty stub: nothing is done on completion for this unimplemented
    // opcode.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    }

    // Constructs buffer_load_ubyte: forwards iFmt and the mnemonic to
    // Inst_MUBUF, sets the MemoryRef and Load flags, and picks the segment
    // flag from the LDS bit of the encoding.
    Inst_MUBUF__BUFFER_LOAD_UBYTE
        ::Inst_MUBUF__BUFFER_LOAD_UBYTE(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_ubyte")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        // LDS set -> GroupSegment, otherwise GlobalSegment.
        setFlag(instData.LDS ? GroupSegment : GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_UBYTE

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_LOAD_UBYTE::~Inst_MUBUF__BUFFER_LOAD_UBYTE() = default;

    // Untyped buffer load unsigned byte (zero extend to VGPR destination).
    //
    // Copies the wavefront's execution-unit id and exec mask into the
    // dynamic instruction, sets its latency to one compute-unit clock
    // period, lets calcAddr() work out the addresses, and then issues the
    // request to the local or global memory pipeline while updating the
    // wavefront's request counters.
    void
    Inst_MUBUF__BUFFER_LOAD_UBYTE::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        ConstVecOperandU32 vaddr0(gpuDynInst, extData.VADDR);
        ConstVecOperandU32 vaddr1(gpuDynInst, extData.VADDR + 1);
        ConstScalarOperandU128 resource(gpuDynInst, extData.SRSRC * 4);
        ConstScalarOperandU32 sOffset(gpuDynInst, extData.SOFFSET);

        resource.read();
        sOffset.read();

        int immOffset = instData.OFFSET;

        const bool idxEn = instData.IDXEN;
        const bool offEn = instData.OFFEN;

        // VADDR is read when either enable bit is set; VADDR + 1 only when
        // both are set.
        if (idxEn || offEn) {
            vaddr0.read();
        }
        if (idxEn && offEn) {
            vaddr1.read();
        }

        // With IDXEN set, the two vector operands are handed to calcAddr
        // in swapped order.
        if (idxEn) {
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr1, vaddr0, resource, sOffset, immOffset);
        } else {
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr0, vaddr1, resource, sOffset, immOffset);
        }

        // Issue to the pipeline matching the instruction's memory space
        // and move the request from "in pipe" to "outstanding".
        if (isLocalMem()) {
            gpuDynInst->computeUnit()->localMemoryPipe
                .issueRequest(gpuDynInst);
            --wave->rdLmReqsInPipe;
            ++wave->outstandingReqsRdLm;
        } else {
            gpuDynInst->computeUnit()->globalMemoryPipe
                .issueRequest(gpuDynInst);
            --wave->rdGmReqsInPipe;
            ++wave->outstandingReqsRdGm;
        }

        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    }

    // Starts the memory access by delegating to initMemRead with a
    // VecElemU8 element type.
    void
    Inst_MUBUF__BUFFER_LOAD_UBYTE::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<VecElemU8>(gpuDynInst);
    } // initiateAcc

    // Writes the loaded data to the VDATA vector register: each active
    // lane gets its VecElemU8 element of d_data widened to VecElemU32, or
    // 0 when the lane is flagged in oobMask. Inactive lanes are not
    // assigned.
    void
    Inst_MUBUF__BUFFER_LOAD_UBYTE::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU32 vdst(gpuDynInst, extData.VDATA);
        const auto *loaded =
            reinterpret_cast<VecElemU8*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }

            vdst[lane] = oobMask[lane]
                ? 0 : static_cast<VecElemU32>(loaded[lane]);
        }

        vdst.write();
    }


    // Constructs buffer_load_sbyte: forwards iFmt and the mnemonic to
    // Inst_MUBUF, then sets the MemoryRef, Load and GlobalSegment flags.
    Inst_MUBUF__BUFFER_LOAD_SBYTE
        ::Inst_MUBUF__BUFFER_LOAD_SBYTE(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_sbyte")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_SBYTE

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_LOAD_SBYTE::~Inst_MUBUF__BUFFER_LOAD_SBYTE() = default;

    // Untyped buffer load signed byte (sign extend to VGPR destination).
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_LOAD_SBYTE::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Empty stub: nothing is initiated for this unimplemented opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_SBYTE::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // Empty stub: nothing is done on completion for this unimplemented
    // opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_SBYTE::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    // Constructs buffer_load_ushort: forwards iFmt and the mnemonic to
    // Inst_MUBUF, sets the MemoryRef and Load flags, and picks the segment
    // flag from the LDS bit of the encoding.
    Inst_MUBUF__BUFFER_LOAD_USHORT
        ::Inst_MUBUF__BUFFER_LOAD_USHORT(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_ushort")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        // LDS set -> GroupSegment, otherwise GlobalSegment.
        setFlag(instData.LDS ? GroupSegment : GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_USHORT

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_LOAD_USHORT
        ::~Inst_MUBUF__BUFFER_LOAD_USHORT() = default;

    // Untyped buffer load unsigned short (zero extend to VGPR destination).
    //
    // Copies the wavefront's execution-unit id and exec mask into the
    // dynamic instruction, sets its latency to one compute-unit clock
    // period, lets calcAddr() work out the addresses, and then issues the
    // request to the local or global memory pipeline while updating the
    // wavefront's request counters.
    void
    Inst_MUBUF__BUFFER_LOAD_USHORT::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        ConstVecOperandU32 vaddr0(gpuDynInst, extData.VADDR);
        ConstVecOperandU32 vaddr1(gpuDynInst, extData.VADDR + 1);
        ConstScalarOperandU128 resource(gpuDynInst, extData.SRSRC * 4);
        ConstScalarOperandU32 sOffset(gpuDynInst, extData.SOFFSET);

        resource.read();
        sOffset.read();

        int immOffset = instData.OFFSET;

        const bool idxEn = instData.IDXEN;
        const bool offEn = instData.OFFEN;

        // VADDR is read when either enable bit is set; VADDR + 1 only when
        // both are set.
        if (idxEn || offEn) {
            vaddr0.read();
        }
        if (idxEn && offEn) {
            vaddr1.read();
        }

        // With IDXEN set, the two vector operands are handed to calcAddr
        // in swapped order.
        if (idxEn) {
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr1, vaddr0, resource, sOffset, immOffset);
        } else {
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr0, vaddr1, resource, sOffset, immOffset);
        }

        // Issue to the pipeline matching the instruction's memory space
        // and move the request from "in pipe" to "outstanding".
        if (isLocalMem()) {
            gpuDynInst->computeUnit()->localMemoryPipe.
                issueRequest(gpuDynInst);
            --wave->rdLmReqsInPipe;
            ++wave->outstandingReqsRdLm;
        } else {
            gpuDynInst->computeUnit()->globalMemoryPipe.
                issueRequest(gpuDynInst);
            --wave->rdGmReqsInPipe;
            ++wave->outstandingReqsRdGm;
        }

        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    }

    // Starts the memory access by delegating to initMemRead with a
    // VecElemU16 element type.
    void
    Inst_MUBUF__BUFFER_LOAD_USHORT::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<VecElemU16>(gpuDynInst);
    } // initiateAcc

    // Writes the loaded data to the VDATA vector register: each active
    // lane gets its VecElemU16 element of d_data widened to VecElemU32, or
    // 0 when the lane is flagged in oobMask. Inactive lanes are not
    // assigned.
    void
    Inst_MUBUF__BUFFER_LOAD_USHORT::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU32 vdst(gpuDynInst, extData.VDATA);
        const auto *loaded =
            reinterpret_cast<VecElemU16*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }

            vdst[lane] = oobMask[lane]
                ? 0 : static_cast<VecElemU32>(loaded[lane]);
        }

        vdst.write();
    }


    // Constructs buffer_load_sshort: forwards iFmt and the mnemonic to
    // Inst_MUBUF, then sets the MemoryRef, Load and GlobalSegment flags.
    Inst_MUBUF__BUFFER_LOAD_SSHORT
        ::Inst_MUBUF__BUFFER_LOAD_SSHORT(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_sshort")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_SSHORT

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_LOAD_SSHORT
        ::~Inst_MUBUF__BUFFER_LOAD_SSHORT() = default;

    // Untyped buffer load signed short (sign extend to VGPR destination).
    //
    // Not implemented in this model: execution calls panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_LOAD_SSHORT::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Empty stub: nothing is initiated for this unimplemented opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_SSHORT::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // Empty stub: nothing is done on completion for this unimplemented
    // opcode.
    void
    Inst_MUBUF__BUFFER_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    // Constructs buffer_load_dword: forwards iFmt and the mnemonic to
    // Inst_MUBUF, sets the MemoryRef and Load flags, and picks the segment
    // flag from the LDS bit of the encoding.
    Inst_MUBUF__BUFFER_LOAD_DWORD
        ::Inst_MUBUF__BUFFER_LOAD_DWORD(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_dword")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        // LDS set -> GroupSegment, otherwise GlobalSegment.
        setFlag(instData.LDS ? GroupSegment : GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_DWORD

    // No cleanup of its own is needed, so the destructor is defaulted.
    Inst_MUBUF__BUFFER_LOAD_DWORD::~Inst_MUBUF__BUFFER_LOAD_DWORD() = default;

    // Untyped buffer load dword.
    //
    // Copies the wavefront's execution-unit id and exec mask into the
    // dynamic instruction, sets its latency to one compute-unit clock
    // period, lets calcAddr() work out the addresses, and then issues the
    // request to the local or global memory pipeline while updating the
    // wavefront's request counters.
    void
    Inst_MUBUF__BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        ConstVecOperandU32 vaddr0(gpuDynInst, extData.VADDR);
        ConstVecOperandU32 vaddr1(gpuDynInst, extData.VADDR + 1);
        ConstScalarOperandU128 resource(gpuDynInst, extData.SRSRC * 4);
        ConstScalarOperandU32 sOffset(gpuDynInst, extData.SOFFSET);

        resource.read();
        sOffset.read();

        int immOffset = instData.OFFSET;

        const bool idxEn = instData.IDXEN;
        const bool offEn = instData.OFFEN;

        // VADDR is read when either enable bit is set; VADDR + 1 only when
        // both are set.
        if (idxEn || offEn) {
            vaddr0.read();
        }
        if (idxEn && offEn) {
            vaddr1.read();
        }

        // With IDXEN set, the two vector operands are handed to calcAddr
        // in swapped order.
        if (idxEn) {
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr1, vaddr0, resource, sOffset, immOffset);
        } else {
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr0, vaddr1, resource, sOffset, immOffset);
        }

        // Issue to the pipeline matching the instruction's memory space
        // and move the request from "in pipe" to "outstanding".
        if (isLocalMem()) {
            gpuDynInst->computeUnit()->localMemoryPipe.
                issueRequest(gpuDynInst);
            --wave->rdLmReqsInPipe;
            ++wave->outstandingReqsRdLm;
        } else {
            gpuDynInst->computeUnit()->globalMemoryPipe.
                issueRequest(gpuDynInst);
            --wave->rdGmReqsInPipe;
            ++wave->outstandingReqsRdGm;
        }

        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    }

    // Starts the memory access for this load by delegating to initMemRead
    // instantiated with VecElemU32.
    void
    Inst_MUBUF__BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<VecElemU32>(gpuDynInst);
    } // initiateAcc

    // Writes the returned dword into VDATA for every lane enabled in the
    // exec mask. Lanes flagged in oobMask receive zero instead of the data
    // found in d_data.
    void
    Inst_MUBUF__BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU32 vdst(gpuDynInst, extData.VDATA);
        const VecElemU32 *loaded
            = reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }

            if (oobMask[lane]) {
                vdst[lane] = 0;
            } else {
                vdst[lane] = loaded[lane];
            }
        }

        vdst.write();
    } // completeAcc

    // Constructs the buffer_load_dwordx2 instruction and flags it as a
    // memory-referencing load on the group or the global segment.
    Inst_MUBUF__BUFFER_LOAD_DWORDX2
        ::Inst_MUBUF__BUFFER_LOAD_DWORDX2(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_dwordx2")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        // The LDS field of the encoding selects the segment flag.
        setFlag(instData.LDS ? GroupSegment : GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_DWORDX2

    // Destructor: no explicit cleanup is performed.
    Inst_MUBUF__BUFFER_LOAD_DWORDX2::~Inst_MUBUF__BUFFER_LOAD_DWORDX2()
        = default; // ~Inst_MUBUF__BUFFER_LOAD_DWORDX2

    // Untyped buffer load 2 dwords.
    //
    // Copies the wavefront's execution unit id and exec mask into the
    // dynamic instruction, sets its latency to one compute-unit clock
    // period, runs calcAddr on the operands selected by IDXEN/OFFEN and
    // then issues the request to the local or global memory pipeline while
    // moving the wavefront's read counters from "in pipe" to "outstanding".
    void
    Inst_MUBUF__BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        auto *cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->exec_mask = wf->execMask();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        ConstVecOperandU32 vaddr0(gpuDynInst, extData.VADDR);
        ConstVecOperandU32 vaddr1(gpuDynInst, extData.VADDR + 1);
        ConstScalarOperandU128 resource(gpuDynInst, extData.SRSRC * 4);
        ConstScalarOperandU32 sOffset(gpuDynInst, extData.SOFFSET);

        resource.read();
        sOffset.read();

        int immOffset = instData.OFFSET;

        if (instData.IDXEN) {
            // With IDXEN set, VADDR goes to calcAddr as the second vector
            // operand; VADDR + 1 is only read when OFFEN is set as well.
            vaddr0.read();
            if (instData.OFFEN) {
                vaddr1.read();
            }
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr1, vaddr0, resource, sOffset, immOffset);
        } else {
            // Without IDXEN, VADDR is the first vector operand and is only
            // read when OFFEN is set; VADDR + 1 is never read.
            if (instData.OFFEN) {
                vaddr0.read();
            }
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr0, vaddr1, resource, sOffset, immOffset);
        }

        if (isLocalMem()) {
            cu->localMemoryPipe.issueRequest(gpuDynInst);
            wf->rdLmReqsInPipe--;
            wf->outstandingReqsRdLm++;
        } else {
            cu->globalMemoryPipe.issueRequest(gpuDynInst);
            wf->rdGmReqsInPipe--;
            wf->outstandingReqsRdGm++;
        }

        wf->outstandingReqs++;
        wf->validateRequestCounters();
    } // execute

    // Starts the memory access for this load by delegating to
    // initMemRead<2>.
    // NOTE(review): the template argument is presumably the number of dwords
    // per lane (completeAcc unpacks two per lane) -- confirm in the helper.
    void
    Inst_MUBUF__BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<2>(gpuDynInst);
    } // initiateAcc

    // Unpacks d_data, read as two consecutive dwords per lane, into VDATA
    // and VDATA + 1 for every lane enabled in the exec mask. Lanes flagged
    // in oobMask receive zero in both destination registers.
    void
    Inst_MUBUF__BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU32 vdst0(gpuDynInst, extData.VDATA);
        VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1);
        const VecElemU32 *loaded
            = reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }

            if (oobMask[lane]) {
                vdst0[lane] = 0;
                vdst1[lane] = 0;
            } else {
                vdst0[lane] = loaded[lane * 2];
                vdst1[lane] = loaded[lane * 2 + 1];
            }
        }

        vdst0.write();
        vdst1.write();
    } // completeAcc

    // Constructs the buffer_load_dwordx3 instruction and flags it as a
    // memory-referencing load on the group or the global segment.
    Inst_MUBUF__BUFFER_LOAD_DWORDX3
        ::Inst_MUBUF__BUFFER_LOAD_DWORDX3(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_dwordx3")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        // The LDS field of the encoding selects the segment flag.
        setFlag(instData.LDS ? GroupSegment : GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_DWORDX3

    // Destructor: no explicit cleanup is performed.
    Inst_MUBUF__BUFFER_LOAD_DWORDX3::~Inst_MUBUF__BUFFER_LOAD_DWORDX3()
        = default; // ~Inst_MUBUF__BUFFER_LOAD_DWORDX3

    // Untyped buffer load 3 dwords.
    //
    // Copies the wavefront's execution unit id and exec mask into the
    // dynamic instruction, sets its latency to one compute-unit clock
    // period, runs calcAddr on the operands selected by IDXEN/OFFEN and
    // then issues the request to the local or global memory pipeline while
    // moving the wavefront's read counters from "in pipe" to "outstanding".
    void
    Inst_MUBUF__BUFFER_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        auto *cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->exec_mask = wf->execMask();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        ConstVecOperandU32 vaddr0(gpuDynInst, extData.VADDR);
        ConstVecOperandU32 vaddr1(gpuDynInst, extData.VADDR + 1);
        ConstScalarOperandU128 resource(gpuDynInst, extData.SRSRC * 4);
        ConstScalarOperandU32 sOffset(gpuDynInst, extData.SOFFSET);

        resource.read();
        sOffset.read();

        int immOffset = instData.OFFSET;

        if (instData.IDXEN) {
            // With IDXEN set, VADDR goes to calcAddr as the second vector
            // operand; VADDR + 1 is only read when OFFEN is set as well.
            vaddr0.read();
            if (instData.OFFEN) {
                vaddr1.read();
            }
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr1, vaddr0, resource, sOffset, immOffset);
        } else {
            // Without IDXEN, VADDR is the first vector operand and is only
            // read when OFFEN is set; VADDR + 1 is never read.
            if (instData.OFFEN) {
                vaddr0.read();
            }
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr0, vaddr1, resource, sOffset, immOffset);
        }

        if (isLocalMem()) {
            cu->localMemoryPipe.issueRequest(gpuDynInst);
            wf->rdLmReqsInPipe--;
            wf->outstandingReqsRdLm++;
        } else {
            cu->globalMemoryPipe.issueRequest(gpuDynInst);
            wf->rdGmReqsInPipe--;
            wf->outstandingReqsRdGm++;
        }

        wf->outstandingReqs++;
        wf->validateRequestCounters();
    } // execute

    // Starts the memory access for this load by delegating to
    // initMemRead<3>.
    // NOTE(review): the template argument is presumably the number of dwords
    // per lane (completeAcc unpacks three per lane) -- confirm in the helper.
    void
    Inst_MUBUF__BUFFER_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<3>(gpuDynInst);
    } // initiateAcc

    // Unpacks d_data, read as three consecutive dwords per lane, into
    // VDATA .. VDATA + 2 for every lane enabled in the exec mask. Lanes
    // flagged in oobMask receive zero in all three destination registers.
    void
    Inst_MUBUF__BUFFER_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU32 vdst0(gpuDynInst, extData.VDATA);
        VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1);
        VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2);
        const VecElemU32 *loaded
            = reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }

            if (oobMask[lane]) {
                vdst0[lane] = 0;
                vdst1[lane] = 0;
                vdst2[lane] = 0;
            } else {
                vdst0[lane] = loaded[lane * 3];
                vdst1[lane] = loaded[lane * 3 + 1];
                vdst2[lane] = loaded[lane * 3 + 2];
            }
        }

        vdst0.write();
        vdst1.write();
        vdst2.write();
    } // completeAcc

    // Constructs the buffer_load_dwordx4 instruction and flags it as a
    // memory-referencing load on the group or the global segment.
    Inst_MUBUF__BUFFER_LOAD_DWORDX4
        ::Inst_MUBUF__BUFFER_LOAD_DWORDX4(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_dwordx4")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        // The LDS field of the encoding selects the segment flag.
        setFlag(instData.LDS ? GroupSegment : GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_DWORDX4

    // Destructor: no explicit cleanup is performed.
    Inst_MUBUF__BUFFER_LOAD_DWORDX4::~Inst_MUBUF__BUFFER_LOAD_DWORDX4()
        = default; // ~Inst_MUBUF__BUFFER_LOAD_DWORDX4

    // Untyped buffer load 4 dwords.
    //
    // Copies the wavefront's execution unit id and exec mask into the
    // dynamic instruction, sets its latency to one compute-unit clock
    // period, runs calcAddr on the operands selected by IDXEN/OFFEN and
    // then issues the request to the local or global memory pipeline while
    // moving the wavefront's read counters from "in pipe" to "outstanding".
    void
    Inst_MUBUF__BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        auto *cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->exec_mask = wf->execMask();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        ConstVecOperandU32 vaddr0(gpuDynInst, extData.VADDR);
        ConstVecOperandU32 vaddr1(gpuDynInst, extData.VADDR + 1);
        ConstScalarOperandU128 resource(gpuDynInst, extData.SRSRC * 4);
        ConstScalarOperandU32 sOffset(gpuDynInst, extData.SOFFSET);

        resource.read();
        sOffset.read();

        int immOffset = instData.OFFSET;

        if (instData.IDXEN) {
            // With IDXEN set, VADDR goes to calcAddr as the second vector
            // operand; VADDR + 1 is only read when OFFEN is set as well.
            vaddr0.read();
            if (instData.OFFEN) {
                vaddr1.read();
            }
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr1, vaddr0, resource, sOffset, immOffset);
        } else {
            // Without IDXEN, VADDR is the first vector operand and is only
            // read when OFFEN is set; VADDR + 1 is never read.
            if (instData.OFFEN) {
                vaddr0.read();
            }
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr0, vaddr1, resource, sOffset, immOffset);
        }

        if (isLocalMem()) {
            cu->localMemoryPipe.issueRequest(gpuDynInst);
            wf->rdLmReqsInPipe--;
            wf->outstandingReqsRdLm++;
        } else {
            cu->globalMemoryPipe.issueRequest(gpuDynInst);
            wf->rdGmReqsInPipe--;
            wf->outstandingReqsRdGm++;
        }

        wf->outstandingReqs++;
        wf->validateRequestCounters();
    } // execute

    // Starts the memory access for this load by delegating to
    // initMemRead<4>.
    // NOTE(review): the template argument is presumably the number of dwords
    // per lane (completeAcc unpacks four per lane) -- confirm in the helper.
    void
    Inst_MUBUF__BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<4>(gpuDynInst);
    } // initiateAcc

    // Unpacks d_data, read as four consecutive dwords per lane, into
    // VDATA .. VDATA + 3 for every lane enabled in the exec mask. Lanes
    // flagged in oobMask receive zero in all four destination registers.
    void
    Inst_MUBUF__BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU32 vdst0(gpuDynInst, extData.VDATA);
        VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1);
        VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2);
        VecOperandU32 vdst3(gpuDynInst, extData.VDATA + 3);
        const VecElemU32 *loaded
            = reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }

            if (oobMask[lane]) {
                vdst0[lane] = 0;
                vdst1[lane] = 0;
                vdst2[lane] = 0;
                vdst3[lane] = 0;
            } else {
                vdst0[lane] = loaded[lane * 4];
                vdst1[lane] = loaded[lane * 4 + 1];
                vdst2[lane] = loaded[lane * 4 + 2];
                vdst3[lane] = loaded[lane * 4 + 3];
            }
        }

        vdst0.write();
        vdst1.write();
        vdst2.write();
        vdst3.write();
    } // completeAcc

    // Constructs the buffer_store_byte instruction and flags it as a
    // memory-referencing store on the group or the global segment.
    Inst_MUBUF__BUFFER_STORE_BYTE
        ::Inst_MUBUF__BUFFER_STORE_BYTE(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_byte")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        // The LDS field of the encoding selects the segment flag.
        setFlag(instData.LDS ? GroupSegment : GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_BYTE

    // Destructor: no explicit cleanup is performed.
    Inst_MUBUF__BUFFER_STORE_BYTE::~Inst_MUBUF__BUFFER_STORE_BYTE()
        = default; // ~Inst_MUBUF__BUFFER_STORE_BYTE

    // Untyped buffer store byte.
    //
    // Copies the wavefront's execution unit id and exec mask into the
    // dynamic instruction, sets its latency to one compute-unit clock
    // period, runs calcAddr on the operands selected by IDXEN/OFFEN and
    // then issues the request to the local or global memory pipeline while
    // moving the wavefront's write counters from "in pipe" to "outstanding".
    void
    Inst_MUBUF__BUFFER_STORE_BYTE::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        auto *cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->exec_mask = wf->execMask();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        ConstVecOperandU32 vaddr0(gpuDynInst, extData.VADDR);
        ConstVecOperandU32 vaddr1(gpuDynInst, extData.VADDR + 1);
        ConstScalarOperandU128 resource(gpuDynInst, extData.SRSRC * 4);
        ConstScalarOperandU32 sOffset(gpuDynInst, extData.SOFFSET);

        resource.read();
        sOffset.read();

        int immOffset = instData.OFFSET;

        if (instData.IDXEN) {
            // With IDXEN set, VADDR goes to calcAddr as the second vector
            // operand; VADDR + 1 is only read when OFFEN is set as well.
            vaddr0.read();
            if (instData.OFFEN) {
                vaddr1.read();
            }
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr1, vaddr0, resource, sOffset, immOffset);
        } else {
            // Without IDXEN, VADDR is the first vector operand and is only
            // read when OFFEN is set; VADDR + 1 is never read.
            if (instData.OFFEN) {
                vaddr0.read();
            }
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr0, vaddr1, resource, sOffset, immOffset);
        }

        if (isLocalMem()) {
            cu->localMemoryPipe.issueRequest(gpuDynInst);
            wf->wrLmReqsInPipe--;
            wf->outstandingReqsWrLm++;
        } else {
            cu->globalMemoryPipe.issueRequest(gpuDynInst);
            wf->wrGmReqsInPipe--;
            wf->outstandingReqsWrGm++;
        }

        wf->outstandingReqs++;
        wf->validateRequestCounters();
    } // execute

    // Reads VDATA as signed bytes, copies the value of every lane enabled
    // in the exec mask into d_data (one VecElemI8 per lane) and then starts
    // the write through initMemWrite.
    void
    Inst_MUBUF__BUFFER_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        ConstVecOperandI8 src(gpuDynInst, extData.VDATA);
        src.read();

        VecElemI8 *staged = reinterpret_cast<VecElemI8*>(gpuDynInst->d_data);
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            staged[lane] = src[lane];
        }

        initMemWrite<VecElemI8>(gpuDynInst);
    } // initiateAcc

    // No completion work is performed for this store; the body is empty.
    void
    Inst_MUBUF__BUFFER_STORE_BYTE::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // Constructs the buffer_store_short instruction and flags it as a
    // memory-referencing store on the group or the global segment.
    Inst_MUBUF__BUFFER_STORE_SHORT
        ::Inst_MUBUF__BUFFER_STORE_SHORT(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_short")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        // The LDS field of the encoding selects the segment flag.
        setFlag(instData.LDS ? GroupSegment : GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_SHORT

    // Destructor: no explicit cleanup is performed.
    Inst_MUBUF__BUFFER_STORE_SHORT::~Inst_MUBUF__BUFFER_STORE_SHORT()
        = default; // ~Inst_MUBUF__BUFFER_STORE_SHORT

    // Untyped buffer store short.
    //
    // Copies the wavefront's execution unit id and exec mask into the
    // dynamic instruction, sets its latency to one compute-unit clock
    // period, runs calcAddr on the operands selected by IDXEN/OFFEN and
    // then issues the request to the local or global memory pipeline while
    // moving the wavefront's write counters from "in pipe" to "outstanding".
    void
    Inst_MUBUF__BUFFER_STORE_SHORT::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        auto *cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->exec_mask = wf->execMask();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        ConstVecOperandU32 vaddr0(gpuDynInst, extData.VADDR);
        ConstVecOperandU32 vaddr1(gpuDynInst, extData.VADDR + 1);
        ConstScalarOperandU128 resource(gpuDynInst, extData.SRSRC * 4);
        ConstScalarOperandU32 sOffset(gpuDynInst, extData.SOFFSET);

        resource.read();
        sOffset.read();

        int immOffset = instData.OFFSET;

        if (instData.IDXEN) {
            // With IDXEN set, VADDR goes to calcAddr as the second vector
            // operand; VADDR + 1 is only read when OFFEN is set as well.
            vaddr0.read();
            if (instData.OFFEN) {
                vaddr1.read();
            }
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr1, vaddr0, resource, sOffset, immOffset);
        } else {
            // Without IDXEN, VADDR is the first vector operand and is only
            // read when OFFEN is set; VADDR + 1 is never read.
            if (instData.OFFEN) {
                vaddr0.read();
            }
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr0, vaddr1, resource, sOffset, immOffset);
        }

        if (isLocalMem()) {
            cu->localMemoryPipe.issueRequest(gpuDynInst);
            wf->wrLmReqsInPipe--;
            wf->outstandingReqsWrLm++;
        } else {
            cu->globalMemoryPipe.issueRequest(gpuDynInst);
            wf->wrGmReqsInPipe--;
            wf->outstandingReqsWrGm++;
        }

        wf->outstandingReqs++;
        wf->validateRequestCounters();
    } // execute

    // Reads VDATA as signed 16-bit values, copies the value of every lane
    // enabled in the exec mask into d_data (one VecElemI16 per lane) and
    // then starts the write through initMemWrite.
    void
    Inst_MUBUF__BUFFER_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        ConstVecOperandI16 src(gpuDynInst, extData.VDATA);
        src.read();

        VecElemI16 *staged
            = reinterpret_cast<VecElemI16*>(gpuDynInst->d_data);
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            staged[lane] = src[lane];
        }

        initMemWrite<VecElemI16>(gpuDynInst);
    } // initiateAcc

    // No completion work is performed for this store; the body is empty.
    void
    Inst_MUBUF__BUFFER_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // Constructs the buffer_store_dword instruction and flags it as a
    // memory-referencing store on the group or the global segment.
    Inst_MUBUF__BUFFER_STORE_DWORD::
        Inst_MUBUF__BUFFER_STORE_DWORD(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_dword")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        // The LDS field of the encoding selects the segment flag.
        setFlag(instData.LDS ? GroupSegment : GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_DWORD

    // Destructor: no explicit cleanup is performed.
    Inst_MUBUF__BUFFER_STORE_DWORD::~Inst_MUBUF__BUFFER_STORE_DWORD()
        = default; // ~Inst_MUBUF__BUFFER_STORE_DWORD

    // Untyped buffer store dword.
    //
    // Copies the wavefront's execution unit id and exec mask into the
    // dynamic instruction, sets its latency to one compute-unit clock
    // period, runs calcAddr on the operands selected by IDXEN/OFFEN and
    // then issues the request to the local or global memory pipeline while
    // moving the wavefront's write counters from "in pipe" to "outstanding".
    void
    Inst_MUBUF__BUFFER_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        auto *cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->exec_mask = wf->execMask();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        ConstVecOperandU32 vaddr0(gpuDynInst, extData.VADDR);
        ConstVecOperandU32 vaddr1(gpuDynInst, extData.VADDR + 1);
        ConstScalarOperandU128 resource(gpuDynInst, extData.SRSRC * 4);
        ConstScalarOperandU32 sOffset(gpuDynInst, extData.SOFFSET);

        resource.read();
        sOffset.read();

        int immOffset = instData.OFFSET;

        if (instData.IDXEN) {
            // With IDXEN set, VADDR goes to calcAddr as the second vector
            // operand; VADDR + 1 is only read when OFFEN is set as well.
            vaddr0.read();
            if (instData.OFFEN) {
                vaddr1.read();
            }
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr1, vaddr0, resource, sOffset, immOffset);
        } else {
            // Without IDXEN, VADDR is the first vector operand and is only
            // read when OFFEN is set; VADDR + 1 is never read.
            if (instData.OFFEN) {
                vaddr0.read();
            }
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr0, vaddr1, resource, sOffset, immOffset);
        }

        if (isLocalMem()) {
            cu->localMemoryPipe.issueRequest(gpuDynInst);
            wf->wrLmReqsInPipe--;
            wf->outstandingReqsWrLm++;
        } else {
            cu->globalMemoryPipe.issueRequest(gpuDynInst);
            wf->wrGmReqsInPipe--;
            wf->outstandingReqsWrGm++;
        }

        wf->outstandingReqs++;
        wf->validateRequestCounters();
    } // execute

    // Reads VDATA as unsigned dwords, copies the value of every lane
    // enabled in the exec mask into d_data (one VecElemU32 per lane) and
    // then starts the write through initMemWrite.
    void
    Inst_MUBUF__BUFFER_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        ConstVecOperandU32 src(gpuDynInst, extData.VDATA);
        src.read();

        VecElemU32 *staged
            = reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            staged[lane] = src[lane];
        }

        initMemWrite<VecElemU32>(gpuDynInst);
    } // initiateAcc

    // No completion work is performed for this store; the body is empty.
    void
    Inst_MUBUF__BUFFER_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // Constructs the buffer_store_dwordx2 instruction and flags it as a
    // memory-referencing store on the group or the global segment.
    Inst_MUBUF__BUFFER_STORE_DWORDX2
        ::Inst_MUBUF__BUFFER_STORE_DWORDX2(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_dwordx2")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        // The LDS field of the encoding selects the segment flag.
        setFlag(instData.LDS ? GroupSegment : GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_DWORDX2

    // Destructor: no explicit cleanup is performed.
    Inst_MUBUF__BUFFER_STORE_DWORDX2::~Inst_MUBUF__BUFFER_STORE_DWORDX2()
        = default; // ~Inst_MUBUF__BUFFER_STORE_DWORDX2

    // Untyped buffer store 2 dwords.
    //
    // Copies the wavefront's execution unit id and exec mask into the
    // dynamic instruction, sets its latency to one compute-unit clock
    // period, runs calcAddr on the operands selected by IDXEN/OFFEN, issues
    // the request to the local or global memory pipeline (moving the
    // wavefront's write counters from "in pipe" to "outstanding") and
    // stages the two source dwords of every active lane in d_data.
    void
    Inst_MUBUF__BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->exec_mask = wf->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
        ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
        ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
        ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
        ConstVecOperandU32 data0(gpuDynInst, extData.VDATA);
        ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1);

        rsrcDesc.read();
        offset.read();
        data0.read();
        data1.read();

        int inst_offset = instData.OFFSET;

        // VADDR is read only when OFFEN or IDXEN is set and VADDR + 1 only
        // when both are set. With IDXEN the two vector operands are passed
        // to calcAddr in swapped order.
        if (!instData.IDXEN && !instData.OFFEN) {
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    addr0, addr1, rsrcDesc, offset, inst_offset);
        } else if (!instData.IDXEN && instData.OFFEN) {
            addr0.read();
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    addr0, addr1, rsrcDesc, offset, inst_offset);
        } else if (instData.IDXEN && !instData.OFFEN) {
            addr0.read();
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    addr1, addr0, rsrcDesc, offset, inst_offset);
        } else {
            addr0.read();
            addr1.read();
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    addr1, addr0, rsrcDesc, offset, inst_offset);
        }

        if (isLocalMem()) {
            gpuDynInst->computeUnit()->localMemoryPipe
                .issueRequest(gpuDynInst);
            wf->wrLmReqsInPipe--;
            wf->outstandingReqsWrLm++;
        } else {
            gpuDynInst->computeUnit()->globalMemoryPipe
                .issueRequest(gpuDynInst);
            wf->wrGmReqsInPipe--;
            wf->outstandingReqsWrGm++;
        }

        // Stage the store data packed at two dwords per lane. The previous
        // stride of four dwords disagreed with initiateAcc, which issues
        // initMemWrite<2>, and with buffer_load_dwordx2, which unpacks
        // d_data at lane * 2.
        // NOTE(review): assumes initMemWrite<N> consumes N packed dwords per
        // lane from d_data -- confirm against the helper's definition.
        VecElemU32 *staged
            = reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                staged[lane * 2] = data0[lane];
                staged[lane * 2 + 1] = data1[lane];
            }
        }

        wf->outstandingReqs++;
        wf->validateRequestCounters();
    } // execute

    // Starts the memory access for this store by delegating to
    // initMemWrite<2>. The store data itself is staged in d_data by
    // execute(), not here.
    // NOTE(review): the template argument is presumably the number of dwords
    // per lane -- confirm in the helper.
    void
    Inst_MUBUF__BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemWrite<2>(gpuDynInst);
    } // initiateAcc

    // No completion work is performed for this store; the body is empty.
    void
    Inst_MUBUF__BUFFER_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // Constructs the buffer_store_dwordx3 instruction and flags it as a
    // memory-referencing store on the group or the global segment.
    Inst_MUBUF__BUFFER_STORE_DWORDX3
        ::Inst_MUBUF__BUFFER_STORE_DWORDX3(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_dwordx3")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        // The LDS field of the encoding selects the segment flag.
        setFlag(instData.LDS ? GroupSegment : GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_DWORDX3

    // Destructor: no explicit cleanup is performed.
    Inst_MUBUF__BUFFER_STORE_DWORDX3::~Inst_MUBUF__BUFFER_STORE_DWORDX3()
        = default; // ~Inst_MUBUF__BUFFER_STORE_DWORDX3

    // Untyped buffer store 3 dwords.
    //
    // Copies the wavefront's execution unit id and exec mask into the
    // dynamic instruction, sets its latency to one compute-unit clock
    // period, runs calcAddr on the operands selected by IDXEN/OFFEN, issues
    // the request to the local or global memory pipeline (moving the
    // wavefront's write counters from "in pipe" to "outstanding") and
    // stages the three source dwords of every active lane in d_data.
    void
    Inst_MUBUF__BUFFER_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->exec_mask = wf->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
        ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
        ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
        ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
        ConstVecOperandU32 data0(gpuDynInst, extData.VDATA);
        ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1);
        ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2);

        rsrcDesc.read();
        offset.read();
        data0.read();
        data1.read();
        data2.read();

        int inst_offset = instData.OFFSET;

        // VADDR is read only when OFFEN or IDXEN is set and VADDR + 1 only
        // when both are set. With IDXEN the two vector operands are passed
        // to calcAddr in swapped order.
        if (!instData.IDXEN && !instData.OFFEN) {
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    addr0, addr1, rsrcDesc, offset, inst_offset);
        } else if (!instData.IDXEN && instData.OFFEN) {
            addr0.read();
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    addr0, addr1, rsrcDesc, offset, inst_offset);
        } else if (instData.IDXEN && !instData.OFFEN) {
            addr0.read();
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    addr1, addr0, rsrcDesc, offset, inst_offset);
        } else {
            addr0.read();
            addr1.read();
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    addr1, addr0, rsrcDesc, offset, inst_offset);
        }

        if (isLocalMem()) {
            gpuDynInst->computeUnit()->localMemoryPipe
                .issueRequest(gpuDynInst);
            wf->wrLmReqsInPipe--;
            wf->outstandingReqsWrLm++;
        } else {
            gpuDynInst->computeUnit()->globalMemoryPipe
                .issueRequest(gpuDynInst);
            wf->wrGmReqsInPipe--;
            wf->outstandingReqsWrGm++;
        }

        // Stage the store data packed at three dwords per lane. The
        // previous stride of four dwords disagreed with initiateAcc, which
        // issues initMemWrite<3>, and with buffer_load_dwordx3, which
        // unpacks d_data at lane * 3.
        // NOTE(review): assumes initMemWrite<N> consumes N packed dwords per
        // lane from d_data -- confirm against the helper's definition.
        VecElemU32 *staged
            = reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                staged[lane * 3] = data0[lane];
                staged[lane * 3 + 1] = data1[lane];
                staged[lane * 3 + 2] = data2[lane];
            }
        }

        wf->outstandingReqs++;
        wf->validateRequestCounters();
    } // execute

    // Starts the memory access for this store by delegating to
    // initMemWrite<3>. The store data itself is staged in d_data by
    // execute(), not here.
    // NOTE(review): the template argument is presumably the number of dwords
    // per lane -- confirm in the helper.
    void
    Inst_MUBUF__BUFFER_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemWrite<3>(gpuDynInst);
    } // initiateAcc

    // No completion work is performed for this store; the body is empty.
    void
    Inst_MUBUF__BUFFER_STORE_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // Constructs the buffer_store_dwordx4 instruction and flags it as a
    // memory-referencing store on the group or the global segment.
    Inst_MUBUF__BUFFER_STORE_DWORDX4
        ::Inst_MUBUF__BUFFER_STORE_DWORDX4(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_dwordx4")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        // The LDS field of the encoding selects the segment flag.
        setFlag(instData.LDS ? GroupSegment : GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_DWORDX4

    // Destructor: no explicit cleanup is performed.
    Inst_MUBUF__BUFFER_STORE_DWORDX4::~Inst_MUBUF__BUFFER_STORE_DWORDX4()
        = default; // ~Inst_MUBUF__BUFFER_STORE_DWORDX4

    // Untyped buffer store 4 dwords.
    //
    // Reads the resource descriptor, scalar offset and the four VDATA
    // registers, computes per-lane addresses through calcAddr() according to
    // the IDXEN/OFFEN bits, issues the request to the local or global memory
    // pipe, and stages four dwords per active lane into d_data.
    void
    Inst_MUBUF__BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->exec_mask = wf->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        ConstVecOperandU32 vaddr0(gpuDynInst, extData.VADDR);
        ConstVecOperandU32 vaddr1(gpuDynInst, extData.VADDR + 1);
        ConstScalarOperandU128 bufRsrc(gpuDynInst, extData.SRSRC * 4);
        ConstScalarOperandU32 sOffset(gpuDynInst, extData.SOFFSET);
        ConstVecOperandU32 dword0(gpuDynInst, extData.VDATA);
        ConstVecOperandU32 dword1(gpuDynInst, extData.VDATA + 1);
        ConstVecOperandU32 dword2(gpuDynInst, extData.VDATA + 2);
        ConstVecOperandU32 dword3(gpuDynInst, extData.VDATA + 3);

        bufRsrc.read();
        sOffset.read();
        dword0.read();
        dword1.read();
        dword2.read();
        dword3.read();

        int immOffset = instData.OFFSET;

        const bool idxEn = instData.IDXEN;
        const bool offEn = instData.OFFEN;

        // The first address VGPR is read whenever either mode bit is set;
        // the second only when both are.
        if (idxEn || offEn) {
            vaddr0.read();
        }
        if (idxEn && offEn) {
            vaddr1.read();
        }

        // With IDXEN set the two address operands are passed to calcAddr()
        // in swapped order.
        if (idxEn) {
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr1, vaddr0, bufRsrc, sOffset, immOffset);
        } else {
            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
                    vaddr0, vaddr1, bufRsrc, sOffset, immOffset);
        }

        if (!isLocalMem()) {
            gpuDynInst->computeUnit()->globalMemoryPipe
                .issueRequest(gpuDynInst);
            wf->wrGmReqsInPipe--;
            wf->outstandingReqsWrGm++;
        } else {
            gpuDynInst->computeUnit()->localMemoryPipe
                .issueRequest(gpuDynInst);
            wf->wrLmReqsInPipe--;
            wf->outstandingReqsWrLm++;
        }

        // Each lane owns four consecutive VecElemU32 slots in d_data.
        VecElemU32 *const storeBuf
            = reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }

            VecElemU32 *const laneSlots = storeBuf + lane * 4;
            laneSlots[0] = dword0[lane];
            laneSlots[1] = dword1[lane];
            laneSlots[2] = dword2[lane];
            laneSlots[3] = dword3[lane];
        }

        wf->outstandingReqs++;
        wf->validateRequestCounters();
    } // execute

    void
    Inst_MUBUF__BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Delegate to the shared write helper, instantiated for 4 elements
        // (matches the four dwords per lane staged by execute()).
        initMemWrite<4>(gpuDynInst);
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // No work is performed in this hook for this instruction.
    } // completeAcc

    Inst_MUBUF__BUFFER_STORE_LDS_DWORD
        ::Inst_MUBUF__BUFFER_STORE_LDS_DWORD(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_lds_dword")
    {
        // Only GlobalSegment is set here.
        // NOTE(review): the other buffer stores in this file also set
        // MemoryRef and Store; confirm whether their omission is deliberate
        // given that execute() is unimplemented.
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_LDS_DWORD

    Inst_MUBUF__BUFFER_STORE_LDS_DWORD::~Inst_MUBUF__BUFFER_STORE_LDS_DWORD()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_STORE_LDS_DWORD

    // Store one DWORD from LDS memory to system memory without utilizing
    // VGPRs.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_STORE_LDS_DWORD::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_WBINVL1::Inst_MUBUF__BUFFER_WBINVL1(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_wbinvl1")
    {
        // Sets MemoryRef, MemSync and GlobalSegment. MemSync used to be set
        // twice (once qualified, once unqualified); a single call suffices.
        setFlag(MemoryRef);
        setFlag(MemSync);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_WBINVL1

    Inst_MUBUF__BUFFER_WBINVL1::~Inst_MUBUF__BUFFER_WBINVL1()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_WBINVL1

    // Write back and invalidate the shader L1.
    // Always returns ACK to shader.
    //
    // Issues the instruction to the global memory pipe and moves one read
    // and one write request from the "in pipe" counters to the
    // "outstanding" counters. Any execution scope other than SC_GLOBAL is a
    // fatal error.
    void
    Inst_MUBUF__BUFFER_WBINVL1::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->exec_mask = wf->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
            gpuDynInst->computeUnit()->globalMemoryPipe.
                issueRequest(gpuDynInst);
            // Both the read and write counters are adjusted for this
            // instruction.
            wf->wrGmReqsInPipe--;
            wf->rdGmReqsInPipe--;

            wf->outstandingReqsWrGm++;
            wf->outstandingReqsRdGm++;
        } else {
            // This is a MUBUF instruction, so the diagnostic names it
            // rather than referring to flat instructions.
            fatal("Non global buffer_wbinvl1 instructions not implemented "
                  "yet.\n");
        }

        wf->outstandingReqs++;
        wf->validateRequestCounters();
    } // execute

    void
    Inst_MUBUF__BUFFER_WBINVL1::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // The whole access consists of the injectGlobalMemFence() call.
        injectGlobalMemFence(gpuDynInst);
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_WBINVL1::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // No work is performed in this hook for this instruction.
    } // completeAcc

    Inst_MUBUF__BUFFER_WBINVL1_VOL
        ::Inst_MUBUF__BUFFER_WBINVL1_VOL(InFmt_MUBUF*iFmt)
        : Inst_MUBUF(iFmt, "buffer_wbinvl1_vol") {
        /**
         * This instruction is the same as the buffer_wbinvl1 instruction,
         * except that it only invalidates L1 shader lines with an MTYPE for
         * system or group coherence. Since the L1 does not differentiate
         * between its cache lines, this instruction currently behaves (and
         * is implemented) exactly like the buffer_wbinvl1 instruction.
         *
         * MemSync used to be set twice (once qualified, once unqualified);
         * a single call suffices.
         */
        setFlag(MemoryRef);
        setFlag(MemSync);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_WBINVL1_VOL

    Inst_MUBUF__BUFFER_WBINVL1_VOL::~Inst_MUBUF__BUFFER_WBINVL1_VOL()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_WBINVL1_VOL

    // Write back and invalidate the shader L1 only for lines that are marked
    // volatile. Always returns ACK to shader.
    //
    // Issues the instruction to the global memory pipe and moves one read
    // and one write request from the "in pipe" counters to the
    // "outstanding" counters. Any execution scope other than SC_GLOBAL is a
    // fatal error.
    void
    Inst_MUBUF__BUFFER_WBINVL1_VOL::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->exec_mask = wf->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
            gpuDynInst->computeUnit()->globalMemoryPipe.
                issueRequest(gpuDynInst);
            // Both the read and write counters are adjusted for this
            // instruction.
            wf->wrGmReqsInPipe--;
            wf->rdGmReqsInPipe--;

            wf->outstandingReqsWrGm++;
            wf->outstandingReqsRdGm++;
        } else {
            // This is a MUBUF instruction, so the diagnostic names it
            // rather than referring to flat instructions.
            fatal("Non global buffer_wbinvl1_vol instructions not "
                  "implemented yet.\n");
        }

        wf->outstandingReqs++;
        wf->validateRequestCounters();
    } // execute
    void
    Inst_MUBUF__BUFFER_WBINVL1_VOL::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // The whole access consists of the injectGlobalMemFence() call.
        injectGlobalMemFence(gpuDynInst);
    } // initiateAcc
    void
    Inst_MUBUF__BUFFER_WBINVL1_VOL::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // No work is performed in this hook for this instruction.
    } // completeAcc

    Inst_MUBUF__BUFFER_ATOMIC_SWAP
        ::Inst_MUBUF__BUFFER_ATOMIC_SWAP(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_swap")
    {
        // Flags: AtomicExch, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicExch);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_SWAP

    Inst_MUBUF__BUFFER_ATOMIC_SWAP::~Inst_MUBUF__BUFFER_ATOMIC_SWAP()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_SWAP

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = DATA;
    //   RETURN_DATA = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP
        ::Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_cmpswap")
    {
        // Flags: AtomicCAS, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicCAS);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP

    Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   src = DATA[0];
    //   cmp = DATA[1];
    //   MEM[ADDR] = (tmp == cmp) ? src : tmp;
    //   RETURN_DATA[0] = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_ADD
        ::Inst_MUBUF__BUFFER_ATOMIC_ADD(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_add")
    {
        // Flags: AtomicAdd, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicAdd);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_ADD

    Inst_MUBUF__BUFFER_ATOMIC_ADD::~Inst_MUBUF__BUFFER_ATOMIC_ADD()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_ADD

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] += DATA;
    //   RETURN_DATA = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_SUB
        ::Inst_MUBUF__BUFFER_ATOMIC_SUB(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_sub")
    {
        // Flags: AtomicSub, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicSub);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_SUB

    Inst_MUBUF__BUFFER_ATOMIC_SUB::~Inst_MUBUF__BUFFER_ATOMIC_SUB()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_SUB

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] -= DATA;
    //   RETURN_DATA = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_SMIN
        ::Inst_MUBUF__BUFFER_ATOMIC_SMIN(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_smin")
    {
        // Flags: AtomicMin, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicMin);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_SMIN

    Inst_MUBUF__BUFFER_ATOMIC_SMIN::~Inst_MUBUF__BUFFER_ATOMIC_SMIN()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_SMIN

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare);
    //   RETURN_DATA = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_UMIN
        ::Inst_MUBUF__BUFFER_ATOMIC_UMIN(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_umin")
    {
        // Flags: AtomicMin, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicMin);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_UMIN

    Inst_MUBUF__BUFFER_ATOMIC_UMIN::~Inst_MUBUF__BUFFER_ATOMIC_UMIN()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_UMIN

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare);
    //   RETURN_DATA = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_SMAX
        ::Inst_MUBUF__BUFFER_ATOMIC_SMAX(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_smax")
    {
        // Flags: AtomicMax, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicMax);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_SMAX

    Inst_MUBUF__BUFFER_ATOMIC_SMAX::~Inst_MUBUF__BUFFER_ATOMIC_SMAX()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_SMAX

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare);
    //   RETURN_DATA = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_UMAX
        ::Inst_MUBUF__BUFFER_ATOMIC_UMAX(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_umax")
    {
        // Flags: AtomicMax, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicMax);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_UMAX

    Inst_MUBUF__BUFFER_ATOMIC_UMAX::~Inst_MUBUF__BUFFER_ATOMIC_UMAX()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_UMAX

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare);
    //   RETURN_DATA = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_AND
        ::Inst_MUBUF__BUFFER_ATOMIC_AND(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_and")
    {
        // Flags: AtomicAnd, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicAnd);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_AND

    Inst_MUBUF__BUFFER_ATOMIC_AND::~Inst_MUBUF__BUFFER_ATOMIC_AND()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_AND

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] &= DATA;
    //   RETURN_DATA = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_OR
        ::Inst_MUBUF__BUFFER_ATOMIC_OR(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_or")
    {
        // Flags: AtomicOr, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicOr);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_OR

    Inst_MUBUF__BUFFER_ATOMIC_OR::~Inst_MUBUF__BUFFER_ATOMIC_OR()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_OR

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] |= DATA;
    //   RETURN_DATA = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_XOR
        ::Inst_MUBUF__BUFFER_ATOMIC_XOR(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_xor")
    {
        // Flags: AtomicXor, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicXor);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_XOR

    Inst_MUBUF__BUFFER_ATOMIC_XOR::~Inst_MUBUF__BUFFER_ATOMIC_XOR()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_XOR

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] ^= DATA;
    //   RETURN_DATA = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_INC
        ::Inst_MUBUF__BUFFER_ATOMIC_INC(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_inc")
    {
        // Flags: AtomicInc, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicInc);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_INC

    Inst_MUBUF__BUFFER_ATOMIC_INC::~Inst_MUBUF__BUFFER_ATOMIC_INC()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_INC

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare);
    //   RETURN_DATA = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_DEC
        ::Inst_MUBUF__BUFFER_ATOMIC_DEC(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_dec")
    {
        // Flags: AtomicDec, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicDec);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_DEC

    Inst_MUBUF__BUFFER_ATOMIC_DEC::~Inst_MUBUF__BUFFER_ATOMIC_DEC()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_DEC

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1
    //   (unsigned compare);
    //   RETURN_DATA = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2
        ::Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_swap_x2")
    {
        // Flags: AtomicExch, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicExch);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2

    Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2::~Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = DATA[0:1];
    //   RETURN_DATA[0:1] = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2
        ::Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_cmpswap_x2")
    {
        // Flags: AtomicCAS, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicCAS);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2

    Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2
        ::~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   src = DATA[0:1];
    //   cmp = DATA[2:3];
    //   MEM[ADDR] = (tmp == cmp) ? src : tmp;
    //   RETURN_DATA[0:1] = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_ADD_X2
        ::Inst_MUBUF__BUFFER_ATOMIC_ADD_X2(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_add_x2")
    {
        // Flags: AtomicAdd, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicAdd);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_ADD_X2

    Inst_MUBUF__BUFFER_ATOMIC_ADD_X2::~Inst_MUBUF__BUFFER_ATOMIC_ADD_X2()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_ADD_X2

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] += DATA[0:1];
    //   RETURN_DATA[0:1] = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_ADD_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_SUB_X2
        ::Inst_MUBUF__BUFFER_ATOMIC_SUB_X2(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_sub_x2")
    {
        // Flags: AtomicSub, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicSub);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_SUB_X2

    Inst_MUBUF__BUFFER_ATOMIC_SUB_X2::~Inst_MUBUF__BUFFER_ATOMIC_SUB_X2()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_SUB_X2

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] -= DATA[0:1];
    //   RETURN_DATA[0:1] = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2
        ::Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_smin_x2")
    {
        // Flags: AtomicMin, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicMin);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2

    Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2::~Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2

    // ISA pseudo-code for the operation (a minimum is assigned, as in the
    // 32-bit buffer_atomic_smin description):
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare);
    //   RETURN_DATA[0:1] = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2
        ::Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_umin_x2")
    {
        // Flags: AtomicMin, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicMin);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2

    Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2::~Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2

    // ISA pseudo-code for the operation (a minimum is assigned, as in the
    // 32-bit buffer_atomic_umin description):
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare);
    //   RETURN_DATA[0:1] = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2
        ::Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_smax_x2")
    {
        // Flags: AtomicMax, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicMax);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2

    Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2::~Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2

    // ISA pseudo-code for the operation (a maximum is assigned, as in the
    // 32-bit buffer_atomic_smax description):
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare);
    //   RETURN_DATA[0:1] = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2
        ::Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_umax_x2")
    {
        // Flags: AtomicMax, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicMax);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2

    Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2::~Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2

    // ISA pseudo-code for the operation (a maximum is assigned, as in the
    // 32-bit buffer_atomic_umax description):
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare);
    //   RETURN_DATA[0:1] = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_AND_X2
        ::Inst_MUBUF__BUFFER_ATOMIC_AND_X2(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_and_x2")
    {
        // Flags: AtomicAnd, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicAnd);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_AND_X2

    Inst_MUBUF__BUFFER_ATOMIC_AND_X2::~Inst_MUBUF__BUFFER_ATOMIC_AND_X2()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_AND_X2

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] &= DATA[0:1];
    //   RETURN_DATA[0:1] = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_AND_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_OR_X2
        ::Inst_MUBUF__BUFFER_ATOMIC_OR_X2(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_or_x2")
    {
        // Flags: AtomicOr, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicOr);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_OR_X2

    Inst_MUBUF__BUFFER_ATOMIC_OR_X2::~Inst_MUBUF__BUFFER_ATOMIC_OR_X2()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_OR_X2

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] |= DATA[0:1];
    //   RETURN_DATA[0:1] = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_OR_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_XOR_X2
        ::Inst_MUBUF__BUFFER_ATOMIC_XOR_X2(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_xor_x2")
    {
        // Flags: AtomicXor, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicXor);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_XOR_X2

    Inst_MUBUF__BUFFER_ATOMIC_XOR_X2::~Inst_MUBUF__BUFFER_ATOMIC_XOR_X2()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_XOR_X2

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] ^= DATA[0:1];
    //   RETURN_DATA[0:1] = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_XOR_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_INC_X2
        ::Inst_MUBUF__BUFFER_ATOMIC_INC_X2(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_inc_x2")
    {
        // Flags: AtomicInc, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicInc);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_INC_X2

    Inst_MUBUF__BUFFER_ATOMIC_INC_X2::~Inst_MUBUF__BUFFER_ATOMIC_INC_X2()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_INC_X2

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare);
    //   RETURN_DATA[0:1] = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MUBUF__BUFFER_ATOMIC_DEC_X2
        ::Inst_MUBUF__BUFFER_ATOMIC_DEC_X2(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_dec_x2")
    {
        // Flags: AtomicDec, then AtomicReturn when the GLC bit is set
        // (AtomicNoReturn otherwise), MemoryRef and GlobalSegment.
        setFlag(AtomicDec);
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_DEC_X2

    Inst_MUBUF__BUFFER_ATOMIC_DEC_X2::~Inst_MUBUF__BUFFER_ATOMIC_DEC_X2()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MUBUF__BUFFER_ATOMIC_DEC_X2

    // ISA pseudo-code for the operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1
    //   (unsigned compare);
    //   RETURN_DATA[0:1] = tmp.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MUBUF__BUFFER_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    Inst_MTBUF__TBUFFER_LOAD_FORMAT_X
        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_X(InFmt_MTBUF *iFmt)
        : Inst_MTBUF(iFmt, "tbuffer_load_format_x")
    {
        // Sets the MemoryRef, Load and GlobalSegment flags.
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_X

    Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X

    // Typed buffer load 1 dword with format conversion.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // No work is performed in this hook for this instruction.
    } // initiateAcc

    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // No work is performed in this hook for this instruction.
    } // completeAcc

    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY
        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY(InFmt_MTBUF *iFmt)
        : Inst_MTBUF(iFmt, "tbuffer_load_format_xy")
    {
        // Sets the MemoryRef, Load and GlobalSegment flags.
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY

    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY

    // Typed buffer load 2 dwords with format conversion.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // No work is performed in this hook for this instruction.
    } // initiateAcc

    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // No work is performed in this hook for this instruction.
    } // completeAcc

    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ
        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ(InFmt_MTBUF *iFmt)
        : Inst_MTBUF(iFmt, "tbuffer_load_format_xyz")
    {
        // Sets the MemoryRef, Load and GlobalSegment flags.
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ

    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ

    // Typed buffer load 3 dwords with format conversion.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // No work is performed in this hook for this instruction.
    } // initiateAcc

    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // No work is performed in this hook for this instruction.
    } // completeAcc

    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW
        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW(InFmt_MTBUF *iFmt)
        : Inst_MTBUF(iFmt, "tbuffer_load_format_xyzw")
    {
        // Sets the MemoryRef, Load and GlobalSegment flags.
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW

    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW
        ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW

    // Typed buffer load 4 dwords with format conversion.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // No work is performed in this hook for this instruction.
    } // initiateAcc

    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // No work is performed in this hook for this instruction.
    } // completeAcc

    Inst_MTBUF__TBUFFER_STORE_FORMAT_X
        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_X(InFmt_MTBUF *iFmt)
        : Inst_MTBUF(iFmt, "tbuffer_store_format_x")
    {
        // Sets the MemoryRef, Store and GlobalSegment flags.
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_X

    Inst_MTBUF__TBUFFER_STORE_FORMAT_X::~Inst_MTBUF__TBUFFER_STORE_FORMAT_X()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_X

    // Typed buffer store 1 dword with format conversion.
    // Not implemented in this model: execute() only calls
    // panicUnimplemented().
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // No work is performed in this hook for this instruction.
    } // initiateAcc

    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // No work is performed in this hook for this instruction.
    } // completeAcc

    Inst_MTBUF__TBUFFER_STORE_FORMAT_XY
        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XY(InFmt_MTBUF *iFmt)
        : Inst_MTBUF(iFmt, "tbuffer_store_format_xy")
    {
        // Sets the MemoryRef, Store and GlobalSegment flags.
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XY

    Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY()
    {
        // Empty: this class adds no cleanup of its own.
    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY

    // Typed buffer store 2 dwords with format conversion.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    // tbuffer_store_format_xyz constructor: forwards iFmt and the mnemonic
    // to Inst_MTBUF, then sets the MemoryRef, Store and GlobalSegment flags.
    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::
        Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ(InFmt_MTBUF *iFmt)
        : Inst_MTBUF(iFmt, "tbuffer_store_format_xyz")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ

    // Destructor: empty body.
    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::
        ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ()
    {
    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ

    // tbuffer_store_format_xyz: typed buffer store of 3 dwords with format
    // conversion. Not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // tbuffer_store_format_xyzw constructor: forwards iFmt and the mnemonic
    // to Inst_MTBUF, then sets the MemoryRef, Store and GlobalSegment flags.
    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::
        Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW(InFmt_MTBUF *iFmt)
        : Inst_MTBUF(iFmt, "tbuffer_store_format_xyzw")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW

    // Destructor: empty body.
    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::
        ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW()
    {
    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW

    // tbuffer_store_format_xyzw: typed buffer store of 4 dwords with format
    // conversion. Not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // tbuffer_load_format_d16_x constructor: forwards iFmt and the mnemonic
    // to Inst_MTBUF, then sets the MemoryRef, Load and GlobalSegment flags.
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::
        Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X(InFmt_MTBUF *iFmt)
        : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_x")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X

    // Destructor: empty body.
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::
        ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X()
    {
    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X

    // tbuffer_load_format_d16_x: typed buffer load of 1 dword with format
    // conversion. Not implemented; execute() only calls
    // panicUnimplemented().
    // NOTE(review): the "dword" wording matches the non-D16 description;
    // confirm the D16 data size against the ISA manual.
    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // tbuffer_load_format_d16_xy constructor: forwards iFmt and the mnemonic
    // to Inst_MTBUF, then sets the MemoryRef, Load and GlobalSegment flags.
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::
        Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY(InFmt_MTBUF *iFmt)
        : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xy")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY

    // Destructor: empty body.
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::
        ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY()
    {
    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY

    // tbuffer_load_format_d16_xy: typed buffer load of 2 dwords with format
    // conversion. Not implemented; execute() only calls
    // panicUnimplemented().
    // NOTE(review): the "dword" wording matches the non-D16 description;
    // confirm the D16 data size against the ISA manual.
    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // tbuffer_load_format_d16_xyz constructor: forwards iFmt and the
    // mnemonic to Inst_MTBUF, then sets the MemoryRef, Load and
    // GlobalSegment flags.
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::
        Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ(InFmt_MTBUF *iFmt)
        : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyz")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ

    // Destructor: empty body.
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::
        ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ()
    {
    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ

    // tbuffer_load_format_d16_xyz: typed buffer load of 3 dwords with format
    // conversion. Not implemented; execute() only calls
    // panicUnimplemented().
    // NOTE(review): the "dword" wording matches the non-D16 description;
    // confirm the D16 data size against the ISA manual.
    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // tbuffer_load_format_d16_xyzw constructor: forwards iFmt and the
    // mnemonic to Inst_MTBUF, then sets the MemoryRef, Load and
    // GlobalSegment flags.
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::
        Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW(InFmt_MTBUF *iFmt)
        : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyzw")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW

    // Destructor: empty body.
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::
        ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW()
    {
    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW

    // tbuffer_load_format_d16_xyzw: typed buffer load of 4 dwords with
    // format conversion. Not implemented; execute() only calls
    // panicUnimplemented().
    // NOTE(review): the "dword" wording matches the non-D16 description;
    // confirm the D16 data size against the ISA manual.
    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // tbuffer_store_format_d16_x constructor: forwards iFmt and the mnemonic
    // to Inst_MTBUF, then sets the MemoryRef, Store and GlobalSegment flags.
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::
        Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X(InFmt_MTBUF *iFmt)
        : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_x")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X

    // Destructor: empty body.
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::
        ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X()
    {
    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X

    // tbuffer_store_format_d16_x: typed buffer store of 1 dword with format
    // conversion. Not implemented; execute() only calls
    // panicUnimplemented().
    // NOTE(review): the "dword" wording matches the non-D16 description;
    // confirm the D16 data size against the ISA manual.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // tbuffer_store_format_d16_xy constructor: forwards iFmt and the
    // mnemonic to Inst_MTBUF, then sets the MemoryRef, Store and
    // GlobalSegment flags.
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::
        Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY(InFmt_MTBUF *iFmt)
        : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xy")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY

    // Destructor: empty body.
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::
        ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY()
    {
    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY

    // tbuffer_store_format_d16_xy: typed buffer store of 2 dwords with
    // format conversion. Not implemented; execute() only calls
    // panicUnimplemented().
    // NOTE(review): the "dword" wording matches the non-D16 description;
    // confirm the D16 data size against the ISA manual.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // tbuffer_store_format_d16_xyz constructor: forwards iFmt and the
    // mnemonic to Inst_MTBUF, then sets the MemoryRef, Store and
    // GlobalSegment flags.
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::
        Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ(InFmt_MTBUF *iFmt)
        : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyz")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ

    // Destructor: empty body.
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::
        ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ()
    {
    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ

    // tbuffer_store_format_d16_xyz: typed buffer store of 3 dwords with
    // format conversion. Not implemented; execute() only calls
    // panicUnimplemented().
    // NOTE(review): the "dword" wording matches the non-D16 description;
    // confirm the D16 data size against the ISA manual.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // tbuffer_store_format_d16_xyzw constructor: forwards iFmt and the
    // mnemonic to Inst_MTBUF, then sets the MemoryRef, Store and
    // GlobalSegment flags.
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::
        Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW(InFmt_MTBUF *iFmt)
        : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyzw")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW

    // Destructor: empty body.
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::
        ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW()
    {
    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW

    // tbuffer_store_format_d16_xyzw: typed buffer store of 4 dwords with
    // format conversion. Not implemented; execute() only calls
    // panicUnimplemented().
    // NOTE(review): the "dword" wording matches the non-D16 description;
    // confirm the D16 data size against the ISA manual.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // image_load constructor: forwards iFmt and the mnemonic to Inst_MIMG,
    // then sets the MemoryRef, Load and GlobalSegment flags.
    Inst_MIMG__IMAGE_LOAD::Inst_MIMG__IMAGE_LOAD(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_load")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_LOAD

    // Destructor: empty body.
    Inst_MIMG__IMAGE_LOAD::~Inst_MIMG__IMAGE_LOAD()
    {
    } // ~Inst_MIMG__IMAGE_LOAD

    // image_load: image memory load with format conversion specified.
    // Not implemented; execute() only calls panicUnimplemented().
    void
    Inst_MIMG__IMAGE_LOAD::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_LOAD::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_LOAD::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // image_load_mip constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets the MemoryRef, Load and GlobalSegment flags.
    Inst_MIMG__IMAGE_LOAD_MIP::Inst_MIMG__IMAGE_LOAD_MIP(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_load_mip")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_LOAD_MIP

    // Destructor: empty body.
    Inst_MIMG__IMAGE_LOAD_MIP::~Inst_MIMG__IMAGE_LOAD_MIP()
    {
    } // ~Inst_MIMG__IMAGE_LOAD_MIP

    // image_load_mip is not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_LOAD_MIP::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_LOAD_MIP::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_LOAD_MIP::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // image_load_pck constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets the MemoryRef, Load and GlobalSegment flags.
    Inst_MIMG__IMAGE_LOAD_PCK::Inst_MIMG__IMAGE_LOAD_PCK(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_load_pck")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_LOAD_PCK

    // Destructor: empty body.
    Inst_MIMG__IMAGE_LOAD_PCK::~Inst_MIMG__IMAGE_LOAD_PCK()
    {
    } // ~Inst_MIMG__IMAGE_LOAD_PCK

    // image_load_pck is not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_LOAD_PCK::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_LOAD_PCK::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_LOAD_PCK::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // image_load_pck_sgn constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets the MemoryRef, Load and GlobalSegment flags.
    Inst_MIMG__IMAGE_LOAD_PCK_SGN::
        Inst_MIMG__IMAGE_LOAD_PCK_SGN(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_load_pck_sgn")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_LOAD_PCK_SGN

    // Destructor: empty body.
    Inst_MIMG__IMAGE_LOAD_PCK_SGN::~Inst_MIMG__IMAGE_LOAD_PCK_SGN()
    {
    } // ~Inst_MIMG__IMAGE_LOAD_PCK_SGN

    // image_load_pck_sgn: image memory load with no format conversion and
    // with sign extension. Not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_LOAD_PCK_SGN::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_LOAD_PCK_SGN::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_LOAD_PCK_SGN::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // image_load_mip_pck constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets the MemoryRef, Load and GlobalSegment flags.
    Inst_MIMG__IMAGE_LOAD_MIP_PCK::
        Inst_MIMG__IMAGE_LOAD_MIP_PCK(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_load_mip_pck")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_LOAD_MIP_PCK

    // Destructor: empty body.
    Inst_MIMG__IMAGE_LOAD_MIP_PCK::~Inst_MIMG__IMAGE_LOAD_MIP_PCK()
    {
    } // ~Inst_MIMG__IMAGE_LOAD_MIP_PCK

    // image_load_mip_pck: image memory load with a user-supplied mip level
    // and no format conversion. Not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_LOAD_MIP_PCK::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_LOAD_MIP_PCK::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_LOAD_MIP_PCK::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // image_load_mip_pck_sgn constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets the MemoryRef, Load and GlobalSegment flags.
    Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::
        Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_load_mip_pck_sgn")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN

    // Destructor: empty body.
    Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::
        ~Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN()
    {
    } // ~Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN

    // image_load_mip_pck_sgn: image memory load with a user-supplied mip
    // level and no format conversion. Not implemented; execute() only
    // calls panicUnimplemented().
    // NOTE(review): the _sgn suffix presumably also implies sign extension,
    // as described for image_load_pck_sgn -- confirm against the ISA manual.
    void
    Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // image_store constructor: forwards iFmt and the mnemonic to Inst_MIMG,
    // then sets the MemoryRef, Store and GlobalSegment flags.
    Inst_MIMG__IMAGE_STORE::Inst_MIMG__IMAGE_STORE(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_store")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_STORE

    // Destructor: empty body.
    Inst_MIMG__IMAGE_STORE::~Inst_MIMG__IMAGE_STORE()
    {
    } // ~Inst_MIMG__IMAGE_STORE

    // image_store: image memory store with format conversion specified.
    // Not implemented; execute() only calls panicUnimplemented().
    void
    Inst_MIMG__IMAGE_STORE::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_STORE::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_STORE::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // image_store_mip constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets the MemoryRef, Store and GlobalSegment flags.
    Inst_MIMG__IMAGE_STORE_MIP::Inst_MIMG__IMAGE_STORE_MIP(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_store_mip")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_STORE_MIP

    // Destructor: empty body.
    Inst_MIMG__IMAGE_STORE_MIP::~Inst_MIMG__IMAGE_STORE_MIP()
    {
    } // ~Inst_MIMG__IMAGE_STORE_MIP

    // image_store_mip is not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_STORE_MIP::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_STORE_MIP::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_STORE_MIP::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // image_store_pck constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets the MemoryRef, Store and GlobalSegment flags.
    Inst_MIMG__IMAGE_STORE_PCK::Inst_MIMG__IMAGE_STORE_PCK(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_store_pck")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_STORE_PCK

    // Destructor: empty body.
    Inst_MIMG__IMAGE_STORE_PCK::~Inst_MIMG__IMAGE_STORE_PCK()
    {
    } // ~Inst_MIMG__IMAGE_STORE_PCK

    // image_store_pck: image memory store of packed data without format
    // conversion. Not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_STORE_PCK::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_STORE_PCK::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_STORE_PCK::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // image_store_mip_pck constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets the MemoryRef, Store and GlobalSegment flags.
    Inst_MIMG__IMAGE_STORE_MIP_PCK::
        Inst_MIMG__IMAGE_STORE_MIP_PCK(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_store_mip_pck")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_STORE_MIP_PCK

    // Destructor: empty body.
    Inst_MIMG__IMAGE_STORE_MIP_PCK::~Inst_MIMG__IMAGE_STORE_MIP_PCK()
    {
    } // ~Inst_MIMG__IMAGE_STORE_MIP_PCK

    // image_store_mip_pck: image memory store of packed data without format
    // conversion. Not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_STORE_MIP_PCK::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // initiateAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_STORE_MIP_PCK::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // completeAcc: empty body, performs no work.
    void
    Inst_MIMG__IMAGE_STORE_MIP_PCK::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // image_get_resinfo constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG; GlobalSegment is the only flag set here.
    Inst_MIMG__IMAGE_GET_RESINFO::
        Inst_MIMG__IMAGE_GET_RESINFO(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_get_resinfo")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GET_RESINFO

    // Destructor: empty body.
    Inst_MIMG__IMAGE_GET_RESINFO::~Inst_MIMG__IMAGE_GET_RESINFO()
    {
    } // ~Inst_MIMG__IMAGE_GET_RESINFO

    // image_get_resinfo is not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GET_RESINFO::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_atomic_swap constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets AtomicExch, one of AtomicReturn/AtomicNoReturn,
    // MemoryRef and GlobalSegment.
    Inst_MIMG__IMAGE_ATOMIC_SWAP::
        Inst_MIMG__IMAGE_ATOMIC_SWAP(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_swap")
    {
        setFlag(AtomicExch);
        // GLC set -> AtomicReturn; GLC clear -> AtomicNoReturn.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_SWAP

    // Destructor: empty body.
    Inst_MIMG__IMAGE_ATOMIC_SWAP::~Inst_MIMG__IMAGE_ATOMIC_SWAP()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_SWAP

    // Documented operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = DATA;
    //   RETURN_DATA = tmp.
    // Not implemented; execute() only calls panicUnimplemented().
    void
    Inst_MIMG__IMAGE_ATOMIC_SWAP::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_atomic_cmpswap constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets AtomicCAS, one of AtomicReturn/AtomicNoReturn,
    // MemoryRef and GlobalSegment.
    Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::
        Inst_MIMG__IMAGE_ATOMIC_CMPSWAP(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_cmpswap")
    {
        setFlag(AtomicCAS);
        // GLC set -> AtomicReturn; GLC clear -> AtomicNoReturn.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_CMPSWAP

    // Destructor: empty body.
    Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::~Inst_MIMG__IMAGE_ATOMIC_CMPSWAP()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_CMPSWAP

    // Documented operation:
    //   tmp = MEM[ADDR];
    //   src = DATA[0];
    //   cmp = DATA[1];
    //   MEM[ADDR] = (tmp == cmp) ? src : tmp;
    //   RETURN_DATA[0] = tmp.
    // Not implemented; execute() only calls panicUnimplemented().
    void
    Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_atomic_add constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets AtomicAdd, one of AtomicReturn/AtomicNoReturn,
    // MemoryRef and GlobalSegment.
    Inst_MIMG__IMAGE_ATOMIC_ADD::
        Inst_MIMG__IMAGE_ATOMIC_ADD(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_add")
    {
        setFlag(AtomicAdd);
        // GLC set -> AtomicReturn; GLC clear -> AtomicNoReturn.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_ADD

    // Destructor: empty body.
    Inst_MIMG__IMAGE_ATOMIC_ADD::~Inst_MIMG__IMAGE_ATOMIC_ADD()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_ADD

    // Documented operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] += DATA;
    //   RETURN_DATA = tmp.
    // Not implemented; execute() only calls panicUnimplemented().
    void
    Inst_MIMG__IMAGE_ATOMIC_ADD::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_atomic_sub constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets AtomicSub, one of AtomicReturn/AtomicNoReturn,
    // MemoryRef and GlobalSegment.
    Inst_MIMG__IMAGE_ATOMIC_SUB::
        Inst_MIMG__IMAGE_ATOMIC_SUB(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_sub")
    {
        setFlag(AtomicSub);
        // GLC set -> AtomicReturn; GLC clear -> AtomicNoReturn.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_SUB

    // Destructor: empty body.
    Inst_MIMG__IMAGE_ATOMIC_SUB::~Inst_MIMG__IMAGE_ATOMIC_SUB()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_SUB

    // Documented operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] -= DATA;
    //   RETURN_DATA = tmp.
    // Not implemented; execute() only calls panicUnimplemented().
    void
    Inst_MIMG__IMAGE_ATOMIC_SUB::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_atomic_smin constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets AtomicMin, one of AtomicReturn/AtomicNoReturn,
    // MemoryRef and GlobalSegment.
    Inst_MIMG__IMAGE_ATOMIC_SMIN::
        Inst_MIMG__IMAGE_ATOMIC_SMIN(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_smin")
    {
        setFlag(AtomicMin);
        // GLC set -> AtomicReturn; GLC clear -> AtomicNoReturn.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_SMIN

    // Destructor: empty body.
    Inst_MIMG__IMAGE_ATOMIC_SMIN::~Inst_MIMG__IMAGE_ATOMIC_SMIN()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_SMIN

    // Documented operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare);
    //   RETURN_DATA = tmp.
    // Not implemented; execute() only calls panicUnimplemented().
    void
    Inst_MIMG__IMAGE_ATOMIC_SMIN::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_atomic_umin constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets AtomicMin, one of AtomicReturn/AtomicNoReturn,
    // MemoryRef and GlobalSegment.
    Inst_MIMG__IMAGE_ATOMIC_UMIN::
        Inst_MIMG__IMAGE_ATOMIC_UMIN(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_umin")
    {
        setFlag(AtomicMin);
        // GLC set -> AtomicReturn; GLC clear -> AtomicNoReturn.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_UMIN

    // Destructor: empty body.
    Inst_MIMG__IMAGE_ATOMIC_UMIN::~Inst_MIMG__IMAGE_ATOMIC_UMIN()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_UMIN

    // Documented operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare);
    //   RETURN_DATA = tmp.
    // Not implemented; execute() only calls panicUnimplemented().
    void
    Inst_MIMG__IMAGE_ATOMIC_UMIN::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_atomic_smax constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets AtomicMax, one of AtomicReturn/AtomicNoReturn,
    // MemoryRef and GlobalSegment.
    Inst_MIMG__IMAGE_ATOMIC_SMAX::
        Inst_MIMG__IMAGE_ATOMIC_SMAX(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_smax")
    {
        setFlag(AtomicMax);
        // GLC set -> AtomicReturn; GLC clear -> AtomicNoReturn.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_SMAX

    // Destructor: empty body.
    Inst_MIMG__IMAGE_ATOMIC_SMAX::~Inst_MIMG__IMAGE_ATOMIC_SMAX()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_SMAX

    // Documented operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare);
    //   RETURN_DATA = tmp.
    // Not implemented; execute() only calls panicUnimplemented().
    void
    Inst_MIMG__IMAGE_ATOMIC_SMAX::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_atomic_umax constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets AtomicMax, one of AtomicReturn/AtomicNoReturn,
    // MemoryRef and GlobalSegment.
    Inst_MIMG__IMAGE_ATOMIC_UMAX::
        Inst_MIMG__IMAGE_ATOMIC_UMAX(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_umax")
    {
        setFlag(AtomicMax);
        // GLC set -> AtomicReturn; GLC clear -> AtomicNoReturn.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_UMAX

    // Destructor: empty body.
    Inst_MIMG__IMAGE_ATOMIC_UMAX::~Inst_MIMG__IMAGE_ATOMIC_UMAX()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_UMAX

    // Documented operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare);
    //   RETURN_DATA = tmp.
    // Not implemented; execute() only calls panicUnimplemented().
    void
    Inst_MIMG__IMAGE_ATOMIC_UMAX::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_atomic_and constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets AtomicAnd, one of AtomicReturn/AtomicNoReturn,
    // MemoryRef and GlobalSegment.
    Inst_MIMG__IMAGE_ATOMIC_AND::
        Inst_MIMG__IMAGE_ATOMIC_AND(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_and")
    {
        setFlag(AtomicAnd);
        // GLC set -> AtomicReturn; GLC clear -> AtomicNoReturn.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_AND

    // Destructor: empty body.
    Inst_MIMG__IMAGE_ATOMIC_AND::~Inst_MIMG__IMAGE_ATOMIC_AND()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_AND

    // Documented operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] &= DATA;
    //   RETURN_DATA = tmp.
    // Not implemented; execute() only calls panicUnimplemented().
    void
    Inst_MIMG__IMAGE_ATOMIC_AND::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_atomic_or constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets AtomicOr, one of AtomicReturn/AtomicNoReturn,
    // MemoryRef and GlobalSegment.
    Inst_MIMG__IMAGE_ATOMIC_OR::
        Inst_MIMG__IMAGE_ATOMIC_OR(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_or")
    {
        setFlag(AtomicOr);
        // GLC set -> AtomicReturn; GLC clear -> AtomicNoReturn.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_OR

    // Destructor: empty body.
    Inst_MIMG__IMAGE_ATOMIC_OR::~Inst_MIMG__IMAGE_ATOMIC_OR()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_OR

    // Documented operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] |= DATA;
    //   RETURN_DATA = tmp.
    // Not implemented; execute() only calls panicUnimplemented().
    void
    Inst_MIMG__IMAGE_ATOMIC_OR::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_atomic_xor constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets AtomicXor, one of AtomicReturn/AtomicNoReturn,
    // MemoryRef and GlobalSegment.
    Inst_MIMG__IMAGE_ATOMIC_XOR::
        Inst_MIMG__IMAGE_ATOMIC_XOR(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_xor")
    {
        setFlag(AtomicXor);
        // GLC set -> AtomicReturn; GLC clear -> AtomicNoReturn.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_XOR

    // Destructor: empty body.
    Inst_MIMG__IMAGE_ATOMIC_XOR::~Inst_MIMG__IMAGE_ATOMIC_XOR()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_XOR

    // Documented operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] ^= DATA;
    //   RETURN_DATA = tmp.
    // Not implemented; execute() only calls panicUnimplemented().
    void
    Inst_MIMG__IMAGE_ATOMIC_XOR::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_atomic_inc constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets AtomicInc, one of AtomicReturn/AtomicNoReturn,
    // MemoryRef and GlobalSegment.
    Inst_MIMG__IMAGE_ATOMIC_INC::
        Inst_MIMG__IMAGE_ATOMIC_INC(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_inc")
    {
        setFlag(AtomicInc);
        // GLC set -> AtomicReturn; GLC clear -> AtomicNoReturn.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_INC

    // Destructor: empty body.
    Inst_MIMG__IMAGE_ATOMIC_INC::~Inst_MIMG__IMAGE_ATOMIC_INC()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_INC

    // Documented operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare);
    //   RETURN_DATA = tmp.
    // Not implemented; execute() only calls panicUnimplemented().
    void
    Inst_MIMG__IMAGE_ATOMIC_INC::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_atomic_dec constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG, then sets AtomicDec, one of AtomicReturn/AtomicNoReturn,
    // MemoryRef and GlobalSegment.
    Inst_MIMG__IMAGE_ATOMIC_DEC::
        Inst_MIMG__IMAGE_ATOMIC_DEC(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_dec")
    {
        setFlag(AtomicDec);
        // GLC set -> AtomicReturn; GLC clear -> AtomicNoReturn.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_DEC

    // Destructor: empty body.
    Inst_MIMG__IMAGE_ATOMIC_DEC::~Inst_MIMG__IMAGE_ATOMIC_DEC()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_DEC

    // Documented operation:
    //   tmp = MEM[ADDR];
    //   MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1
    //   (unsigned compare);
    //   RETURN_DATA = tmp.
    // Not implemented; execute() only calls panicUnimplemented().
    void
    Inst_MIMG__IMAGE_ATOMIC_DEC::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_sample constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG; GlobalSegment is the only flag set here.
    Inst_MIMG__IMAGE_SAMPLE::Inst_MIMG__IMAGE_SAMPLE(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE

    // Destructor: empty body.
    Inst_MIMG__IMAGE_SAMPLE::~Inst_MIMG__IMAGE_SAMPLE()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE

    // image_sample is not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_sample_cl constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG; GlobalSegment is the only flag set here.
    Inst_MIMG__IMAGE_SAMPLE_CL::Inst_MIMG__IMAGE_SAMPLE_CL(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_cl")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_CL

    // Destructor: empty body.
    Inst_MIMG__IMAGE_SAMPLE_CL::~Inst_MIMG__IMAGE_SAMPLE_CL()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_CL

    // image_sample_cl is not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_CL::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_sample_d constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG; GlobalSegment is the only flag set here.
    Inst_MIMG__IMAGE_SAMPLE_D::Inst_MIMG__IMAGE_SAMPLE_D(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_d")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_D

    // Destructor: empty body.
    Inst_MIMG__IMAGE_SAMPLE_D::~Inst_MIMG__IMAGE_SAMPLE_D()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_D

    // image_sample_d is not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_D::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_sample_d_cl constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG; GlobalSegment is the only flag set here.
    Inst_MIMG__IMAGE_SAMPLE_D_CL::
        Inst_MIMG__IMAGE_SAMPLE_D_CL(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_d_cl")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_D_CL

    // Destructor: empty body.
    Inst_MIMG__IMAGE_SAMPLE_D_CL::~Inst_MIMG__IMAGE_SAMPLE_D_CL()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_D_CL

    // image_sample_d_cl is not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_D_CL::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_sample_l constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG; GlobalSegment is the only flag set here.
    Inst_MIMG__IMAGE_SAMPLE_L::Inst_MIMG__IMAGE_SAMPLE_L(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_l")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_L

    // Destructor: empty body.
    Inst_MIMG__IMAGE_SAMPLE_L::~Inst_MIMG__IMAGE_SAMPLE_L()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_L

    // image_sample_l is not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_L::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_sample_b constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG; GlobalSegment is the only flag set here.
    Inst_MIMG__IMAGE_SAMPLE_B::Inst_MIMG__IMAGE_SAMPLE_B(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_b")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_B

    // Destructor: empty body.
    Inst_MIMG__IMAGE_SAMPLE_B::~Inst_MIMG__IMAGE_SAMPLE_B()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_B

    // image_sample_b is not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_B::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_sample_b_cl constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG; GlobalSegment is the only flag set here.
    Inst_MIMG__IMAGE_SAMPLE_B_CL::
        Inst_MIMG__IMAGE_SAMPLE_B_CL(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_b_cl")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_B_CL

    // Destructor: empty body.
    Inst_MIMG__IMAGE_SAMPLE_B_CL::~Inst_MIMG__IMAGE_SAMPLE_B_CL()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_B_CL

    // image_sample_b_cl is not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_B_CL::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_sample_lz constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG; GlobalSegment is the only flag set here.
    Inst_MIMG__IMAGE_SAMPLE_LZ::Inst_MIMG__IMAGE_SAMPLE_LZ(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_lz")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_LZ

    // Destructor: empty body.
    Inst_MIMG__IMAGE_SAMPLE_LZ::~Inst_MIMG__IMAGE_SAMPLE_LZ()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_LZ

    // image_sample_lz is not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_LZ::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_sample_c constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG; GlobalSegment is the only flag set here.
    Inst_MIMG__IMAGE_SAMPLE_C::Inst_MIMG__IMAGE_SAMPLE_C(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C

    // Destructor: empty body.
    Inst_MIMG__IMAGE_SAMPLE_C::~Inst_MIMG__IMAGE_SAMPLE_C()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C

    // image_sample_c is not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_sample_c_cl constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG; GlobalSegment is the only flag set here.
    Inst_MIMG__IMAGE_SAMPLE_C_CL::
        Inst_MIMG__IMAGE_SAMPLE_C_CL(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_cl")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_CL

    // Destructor: empty body.
    Inst_MIMG__IMAGE_SAMPLE_C_CL::~Inst_MIMG__IMAGE_SAMPLE_C_CL()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_CL

    // image_sample_c_cl is not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C_CL::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_sample_c_d constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG; GlobalSegment is the only flag set here.
    Inst_MIMG__IMAGE_SAMPLE_C_D::
        Inst_MIMG__IMAGE_SAMPLE_C_D(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_d")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_D

    // Destructor: empty body.
    Inst_MIMG__IMAGE_SAMPLE_C_D::~Inst_MIMG__IMAGE_SAMPLE_C_D()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_D

    // image_sample_c_d is not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C_D::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_sample_c_d_cl constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG; GlobalSegment is the only flag set here.
    Inst_MIMG__IMAGE_SAMPLE_C_D_CL::
        Inst_MIMG__IMAGE_SAMPLE_C_D_CL(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_d_cl")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_D_CL

    // Destructor: empty body.
    Inst_MIMG__IMAGE_SAMPLE_C_D_CL::~Inst_MIMG__IMAGE_SAMPLE_C_D_CL()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_CL

    // image_sample_c_d_cl is not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C_D_CL::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_sample_c_l constructor: forwards iFmt and the mnemonic to
    // Inst_MIMG; GlobalSegment is the only flag set here.
    Inst_MIMG__IMAGE_SAMPLE_C_L::
        Inst_MIMG__IMAGE_SAMPLE_C_L(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_l")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_L

    // Destructor: empty body.
    Inst_MIMG__IMAGE_SAMPLE_C_L::~Inst_MIMG__IMAGE_SAMPLE_C_L()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_L

    // image_sample_c_l is not implemented; execute() only calls
    // panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C_L::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    // image_sample_c_b: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_C_B::Inst_MIMG__IMAGE_SAMPLE_C_B(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_b")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_B

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_C_B::~Inst_MIMG__IMAGE_SAMPLE_C_B() = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C_B::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_c_b_cl: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_C_B_CL::Inst_MIMG__IMAGE_SAMPLE_C_B_CL(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_b_cl")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_B_CL

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_C_B_CL::~Inst_MIMG__IMAGE_SAMPLE_C_B_CL()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C_B_CL::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_c_lz: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_C_LZ::Inst_MIMG__IMAGE_SAMPLE_C_LZ(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_lz")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_LZ

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_C_LZ::~Inst_MIMG__IMAGE_SAMPLE_C_LZ()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C_LZ::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_O::Inst_MIMG__IMAGE_SAMPLE_O(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_O::~Inst_MIMG__IMAGE_SAMPLE_O() = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_cl_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_CL_O::Inst_MIMG__IMAGE_SAMPLE_CL_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_cl_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_CL_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_CL_O::~Inst_MIMG__IMAGE_SAMPLE_CL_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_CL_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_d_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_D_O::Inst_MIMG__IMAGE_SAMPLE_D_O(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_d_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_D_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_D_O::~Inst_MIMG__IMAGE_SAMPLE_D_O() = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_D_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_d_cl_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_D_CL_O::Inst_MIMG__IMAGE_SAMPLE_D_CL_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_d_cl_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_D_CL_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_D_CL_O::~Inst_MIMG__IMAGE_SAMPLE_D_CL_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_D_CL_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_l_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_L_O::Inst_MIMG__IMAGE_SAMPLE_L_O(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_l_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_L_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_L_O::~Inst_MIMG__IMAGE_SAMPLE_L_O() = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_L_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_b_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_B_O::Inst_MIMG__IMAGE_SAMPLE_B_O(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_b_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_B_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_B_O::~Inst_MIMG__IMAGE_SAMPLE_B_O() = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_B_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_b_cl_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_B_CL_O::Inst_MIMG__IMAGE_SAMPLE_B_CL_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_b_cl_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_B_CL_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_B_CL_O::~Inst_MIMG__IMAGE_SAMPLE_B_CL_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_B_CL_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_lz_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_LZ_O::Inst_MIMG__IMAGE_SAMPLE_LZ_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_lz_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_LZ_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_LZ_O::~Inst_MIMG__IMAGE_SAMPLE_LZ_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_LZ_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_c_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_C_O::Inst_MIMG__IMAGE_SAMPLE_C_O(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_C_O::~Inst_MIMG__IMAGE_SAMPLE_C_O() = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_c_cl_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_C_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_CL_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_cl_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_CL_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_C_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_CL_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C_CL_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_c_d_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_C_D_O::Inst_MIMG__IMAGE_SAMPLE_C_D_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_d_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_D_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_C_D_O::~Inst_MIMG__IMAGE_SAMPLE_C_D_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C_D_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_c_d_cl_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_d_cl_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_c_l_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_C_L_O::Inst_MIMG__IMAGE_SAMPLE_C_L_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_l_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_L_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_C_L_O::~Inst_MIMG__IMAGE_SAMPLE_C_L_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C_L_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_c_b_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_C_B_O::Inst_MIMG__IMAGE_SAMPLE_C_B_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_b_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_B_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_C_B_O::~Inst_MIMG__IMAGE_SAMPLE_C_B_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C_B_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_c_b_cl_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_b_cl_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_c_lz_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::Inst_MIMG__IMAGE_SAMPLE_C_LZ_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_lz_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_LZ_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::~Inst_MIMG__IMAGE_SAMPLE_C_LZ_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4::Inst_MIMG__IMAGE_GATHER4(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4::~Inst_MIMG__IMAGE_GATHER4() = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_cl: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_CL::Inst_MIMG__IMAGE_GATHER4_CL(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_cl")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_CL

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_CL::~Inst_MIMG__IMAGE_GATHER4_CL() = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_CL::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_l: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_L::Inst_MIMG__IMAGE_GATHER4_L(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_l")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_L

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_L::~Inst_MIMG__IMAGE_GATHER4_L() = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_L::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_b: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_B::Inst_MIMG__IMAGE_GATHER4_B(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_b")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_B

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_B::~Inst_MIMG__IMAGE_GATHER4_B() = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_B::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_b_cl: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_B_CL::Inst_MIMG__IMAGE_GATHER4_B_CL(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_b_cl")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_B_CL

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_B_CL::~Inst_MIMG__IMAGE_GATHER4_B_CL()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_B_CL::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_lz: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_LZ::Inst_MIMG__IMAGE_GATHER4_LZ(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_lz")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_LZ

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_LZ::~Inst_MIMG__IMAGE_GATHER4_LZ() = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_LZ::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_c: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_C::Inst_MIMG__IMAGE_GATHER4_C(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_c")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_C

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_C::~Inst_MIMG__IMAGE_GATHER4_C() = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_C::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_c_cl: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_C_CL::Inst_MIMG__IMAGE_GATHER4_C_CL(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_c_cl")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_C_CL

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_C_CL::~Inst_MIMG__IMAGE_GATHER4_C_CL()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_C_CL::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_c_l: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_C_L::Inst_MIMG__IMAGE_GATHER4_C_L(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_c_l")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_C_L

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_C_L::~Inst_MIMG__IMAGE_GATHER4_C_L()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_C_L::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_c_b: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_C_B::Inst_MIMG__IMAGE_GATHER4_C_B(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_c_b")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_C_B

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_C_B::~Inst_MIMG__IMAGE_GATHER4_C_B()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_C_B::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_c_b_cl: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_C_B_CL::Inst_MIMG__IMAGE_GATHER4_C_B_CL(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_c_b_cl")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_C_B_CL

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_C_B_CL::~Inst_MIMG__IMAGE_GATHER4_C_B_CL()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_C_B_CL::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_c_lz: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_C_LZ::Inst_MIMG__IMAGE_GATHER4_C_LZ(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_c_lz")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_C_LZ

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_C_LZ::~Inst_MIMG__IMAGE_GATHER4_C_LZ()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_C_LZ::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_O::Inst_MIMG__IMAGE_GATHER4_O(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_O::~Inst_MIMG__IMAGE_GATHER4_O() = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_cl_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_CL_O::Inst_MIMG__IMAGE_GATHER4_CL_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_cl_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_CL_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_CL_O::~Inst_MIMG__IMAGE_GATHER4_CL_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_CL_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_l_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_L_O::Inst_MIMG__IMAGE_GATHER4_L_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_l_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_L_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_L_O::~Inst_MIMG__IMAGE_GATHER4_L_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_L_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_b_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_B_O::Inst_MIMG__IMAGE_GATHER4_B_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_b_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_B_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_B_O::~Inst_MIMG__IMAGE_GATHER4_B_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_B_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_b_cl_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_B_CL_O::Inst_MIMG__IMAGE_GATHER4_B_CL_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_b_cl_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_B_CL_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_B_CL_O::~Inst_MIMG__IMAGE_GATHER4_B_CL_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_B_CL_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_lz_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_LZ_O::Inst_MIMG__IMAGE_GATHER4_LZ_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_lz_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_LZ_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_LZ_O::~Inst_MIMG__IMAGE_GATHER4_LZ_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_LZ_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_c_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_C_O::Inst_MIMG__IMAGE_GATHER4_C_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_c_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_C_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_C_O::~Inst_MIMG__IMAGE_GATHER4_C_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_C_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_c_cl_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_C_CL_O::Inst_MIMG__IMAGE_GATHER4_C_CL_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_c_cl_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_C_CL_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_C_CL_O::~Inst_MIMG__IMAGE_GATHER4_C_CL_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_C_CL_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_c_l_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_C_L_O::Inst_MIMG__IMAGE_GATHER4_C_L_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_c_l_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_C_L_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_C_L_O::~Inst_MIMG__IMAGE_GATHER4_C_L_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_C_L_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_c_b_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_C_B_O::Inst_MIMG__IMAGE_GATHER4_C_B_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_c_b_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_C_B_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_C_B_O::~Inst_MIMG__IMAGE_GATHER4_C_B_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_C_B_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_c_b_cl_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::Inst_MIMG__IMAGE_GATHER4_C_B_CL_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_c_b_cl_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_C_B_CL_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::~Inst_MIMG__IMAGE_GATHER4_C_B_CL_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_gather4_c_lz_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GATHER4_C_LZ_O::Inst_MIMG__IMAGE_GATHER4_C_LZ_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_gather4_c_lz_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GATHER4_C_LZ_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GATHER4_C_LZ_O::~Inst_MIMG__IMAGE_GATHER4_C_LZ_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GATHER4_C_LZ_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_get_lod: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_GET_LOD::Inst_MIMG__IMAGE_GET_LOD(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_get_lod")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GET_LOD

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_GET_LOD::~Inst_MIMG__IMAGE_GET_LOD() = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_GET_LOD::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_cd: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_CD::Inst_MIMG__IMAGE_SAMPLE_CD(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_cd")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_CD

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_CD::~Inst_MIMG__IMAGE_SAMPLE_CD() = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_CD::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_cd_cl: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_CD_CL::Inst_MIMG__IMAGE_SAMPLE_CD_CL(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_cd_cl")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_CD_CL

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_CD_CL::~Inst_MIMG__IMAGE_SAMPLE_CD_CL()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_CD_CL::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_c_cd: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_C_CD::Inst_MIMG__IMAGE_SAMPLE_C_CD(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_cd")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_CD

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_C_CD::~Inst_MIMG__IMAGE_SAMPLE_C_CD()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C_CD::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_c_cd_cl: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::Inst_MIMG__IMAGE_SAMPLE_C_CD_CL(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_cd_cl")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_CD_CL

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_cd_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_CD_O::Inst_MIMG__IMAGE_SAMPLE_CD_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_cd_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_CD_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_CD_O::~Inst_MIMG__IMAGE_SAMPLE_CD_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_CD_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_cd_cl_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::Inst_MIMG__IMAGE_SAMPLE_CD_CL_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_cd_cl_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_CD_CL_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::~Inst_MIMG__IMAGE_SAMPLE_CD_CL_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_c_cd_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_C_CD_O::Inst_MIMG__IMAGE_SAMPLE_C_CD_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_cd_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_CD_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_C_CD_O::~Inst_MIMG__IMAGE_SAMPLE_C_CD_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C_CD_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // image_sample_c_cd_cl_o: construction sets the GlobalSegment flag.
    Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O(
          InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_cd_cl_o")
    {
        this->setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O

    // Destruction performs no explicit cleanup.
    Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O()
        = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // exp: construction only forwards the encoding and mnemonic to Inst_EXP;
    // no flags are set in this constructor.
    Inst_EXP__EXP::Inst_EXP__EXP(InFmt_EXP *iFmt)
        : Inst_EXP(iFmt, "exp")
    {
    } // Inst_EXP__EXP

    // Destruction performs no explicit cleanup.
    Inst_EXP__EXP::~Inst_EXP__EXP() = default;

    // execute() only forwards to panicUnimplemented().
    void
    Inst_EXP__EXP::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // flat_load_ubyte: construction sets the MemoryRef and Load flags.
    Inst_FLAT__FLAT_LOAD_UBYTE::Inst_FLAT__FLAT_LOAD_UBYTE(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_load_ubyte")
    {
        this->setFlag(MemoryRef);
        this->setFlag(Load);
    } // Inst_FLAT__FLAT_LOAD_UBYTE

    // Destruction performs no explicit cleanup.
    Inst_FLAT__FLAT_LOAD_UBYTE::~Inst_FLAT__FLAT_LOAD_UBYTE() = default;

    // Execute stage of flat_load_ubyte. With an empty EXEC mask the wavefront
    // counters are decremented and the operand write is scheduled without
    // issuing a memory request. Otherwise per-lane addresses are computed from
    // the ADDR operand and the request is issued to the global memory pipe;
    // any other segment is a fatal error.
    void
    Inst_FLAT__FLAT_LOAD_UBYTE::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        if (wave->execMask().none()) {
            // No active lanes: account for the instruction and return early.
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            wave->rdGmReqsInPipe--;
            wave->rdLmReqsInPipe--;
            gpuDynInst->exec_mask = wave->execMask();
            wave->computeUnit->vrf[wave->simdId]
                ->scheduleWriteOperandsFromLoad(wave, gpuDynInst);
            return;
        }

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        auto *cu = gpuDynInst->computeUnit();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        ConstVecOperandU64 flatAddr(gpuDynInst, extData.ADDR);
        flatAddr.read();
        calcAddr(gpuDynInst, flatAddr);

        if (gpuDynInst->executedAs() != Enums::SC_GLOBAL) {
            fatal("Non global flat instructions not implemented yet.\n");
        } else {
            cu->globalMemoryPipe.issueRequest(gpuDynInst);
            wave->rdGmReqsInPipe--;
            wave->outstandingReqsRdGm++;
        }

        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    // Starts the memory read using VecElemU8 as the per-lane element type.
    void
    Inst_FLAT__FLAT_LOAD_UBYTE::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        this->initMemRead<VecElemU8>(gpuDynInst);
    } // initiateAcc

    // For every lane enabled in exec_mask, copies the lane's VecElemU8 value
    // from d_data into VDST, converted to VecElemU32, then writes VDST back.
    void
    Inst_FLAT__FLAT_LOAD_UBYTE::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU32 vdst(gpuDynInst, extData.VDST);
        const VecElemU8 *loaded =
            reinterpret_cast<VecElemU8*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            vdst[lane] = static_cast<VecElemU32>(loaded[lane]);
        }

        vdst.write();
    } // completeAcc
    // --- Inst_FLAT__FLAT_LOAD_SBYTE class methods ---

    // flat_load_sbyte: construction sets the MemoryRef and Load flags.
    Inst_FLAT__FLAT_LOAD_SBYTE::Inst_FLAT__FLAT_LOAD_SBYTE(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_load_sbyte")
    {
        this->setFlag(MemoryRef);
        this->setFlag(Load);
    } // Inst_FLAT__FLAT_LOAD_SBYTE

    // Destruction performs no explicit cleanup.
    Inst_FLAT__FLAT_LOAD_SBYTE::~Inst_FLAT__FLAT_LOAD_SBYTE() = default;

    // Execute stage of flat_load_sbyte. With an empty EXEC mask the wavefront
    // counters are decremented and the operand write is scheduled without
    // issuing a memory request. Otherwise per-lane addresses are computed from
    // the ADDR operand and the request is issued to the global memory pipe;
    // any other segment is a fatal error.
    void
    Inst_FLAT__FLAT_LOAD_SBYTE::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        if (wave->execMask().none()) {
            // No active lanes: account for the instruction and return early.
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            wave->rdGmReqsInPipe--;
            wave->rdLmReqsInPipe--;
            gpuDynInst->exec_mask = wave->execMask();
            wave->computeUnit->vrf[wave->simdId]
                ->scheduleWriteOperandsFromLoad(wave, gpuDynInst);
            return;
        }

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        auto *cu = gpuDynInst->computeUnit();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        ConstVecOperandU64 flatAddr(gpuDynInst, extData.ADDR);
        flatAddr.read();
        calcAddr(gpuDynInst, flatAddr);

        if (gpuDynInst->executedAs() != Enums::SC_GLOBAL) {
            fatal("Non global flat instructions not implemented yet.\n");
        } else {
            cu->globalMemoryPipe.issueRequest(gpuDynInst);
            wave->rdGmReqsInPipe--;
            wave->outstandingReqsRdGm++;
        }

        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    // Starts the memory read using VecElemI8 as the per-lane element type.
    void
    Inst_FLAT__FLAT_LOAD_SBYTE::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        this->initMemRead<VecElemI8>(gpuDynInst);
    } // initiateAcc

    // For every lane enabled in exec_mask, copies the lane's VecElemI8 value
    // from d_data into VDST, converted to VecElemI32, then writes VDST back.
    void
    Inst_FLAT__FLAT_LOAD_SBYTE::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandI32 vdst(gpuDynInst, extData.VDST);
        const VecElemI8 *loaded =
            reinterpret_cast<VecElemI8*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            vdst[lane] = static_cast<VecElemI32>(loaded[lane]);
        }

        vdst.write();
    } // completeAcc

    // flat_load_ushort: construction sets the MemoryRef and Load flags.
    Inst_FLAT__FLAT_LOAD_USHORT::Inst_FLAT__FLAT_LOAD_USHORT(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_load_ushort")
    {
        this->setFlag(MemoryRef);
        this->setFlag(Load);
    } // Inst_FLAT__FLAT_LOAD_USHORT

    // Destruction performs no explicit cleanup.
    Inst_FLAT__FLAT_LOAD_USHORT::~Inst_FLAT__FLAT_LOAD_USHORT() = default;

    // Execute stage of flat_load_ushort. With an empty EXEC mask the wavefront
    // counters are decremented and the operand write is scheduled without
    // issuing a memory request. Otherwise per-lane addresses are computed from
    // the ADDR operand and the request is issued to the global memory pipe;
    // any other segment is a fatal error.
    void
    Inst_FLAT__FLAT_LOAD_USHORT::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        if (wave->execMask().none()) {
            // No active lanes: account for the instruction and return early.
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            wave->rdGmReqsInPipe--;
            wave->rdLmReqsInPipe--;
            gpuDynInst->exec_mask = wave->execMask();
            wave->computeUnit->vrf[wave->simdId]
                ->scheduleWriteOperandsFromLoad(wave, gpuDynInst);
            return;
        }

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        auto *cu = gpuDynInst->computeUnit();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        ConstVecOperandU64 flatAddr(gpuDynInst, extData.ADDR);
        flatAddr.read();
        calcAddr(gpuDynInst, flatAddr);

        if (gpuDynInst->executedAs() != Enums::SC_GLOBAL) {
            fatal("Non global flat instructions not implemented yet.\n");
        } else {
            cu->globalMemoryPipe.issueRequest(gpuDynInst);
            wave->rdGmReqsInPipe--;
            wave->outstandingReqsRdGm++;
        }

        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    // Starts the memory read using VecElemU16 as the per-lane element type.
    void
    Inst_FLAT__FLAT_LOAD_USHORT::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        this->initMemRead<VecElemU16>(gpuDynInst);
    } // initiateAcc

    // For every lane enabled in exec_mask, copies the lane's VecElemU16 value
    // from d_data into VDST, converted to VecElemU32, then writes VDST back.
    void
    Inst_FLAT__FLAT_LOAD_USHORT::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU32 vdst(gpuDynInst, extData.VDST);
        const VecElemU16 *loaded =
            reinterpret_cast<VecElemU16*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            vdst[lane] = static_cast<VecElemU32>(loaded[lane]);
        }

        vdst.write();
    } // completeAcc


    // flat_load_sshort: construction sets the MemoryRef and Load flags.
    Inst_FLAT__FLAT_LOAD_SSHORT::Inst_FLAT__FLAT_LOAD_SSHORT(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_load_sshort")
    {
        this->setFlag(MemoryRef);
        this->setFlag(Load);
    } // Inst_FLAT__FLAT_LOAD_SSHORT

    // Destruction performs no explicit cleanup.
    Inst_FLAT__FLAT_LOAD_SSHORT::~Inst_FLAT__FLAT_LOAD_SSHORT() = default;

    // flat_load_sshort (signed short load, sign extended to the VGPR
    // destination) has no execution model here: execute() only forwards to
    // panicUnimplemented().
    void
    Inst_FLAT__FLAT_LOAD_SSHORT::execute(GPUDynInstPtr gpuDynInst)
    {
        this->panicUnimplemented();
    } // execute

    // Empty body: no memory read is initiated for flat_load_sshort. Its
    // execute() only calls panicUnimplemented().
    void
    Inst_FLAT__FLAT_LOAD_SSHORT::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    // Empty body: nothing is written to the destination register for
    // flat_load_sshort. Its execute() only calls panicUnimplemented().
    void
    Inst_FLAT__FLAT_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // flat_load_dword: construction sets the MemoryRef and Load flags.
    Inst_FLAT__FLAT_LOAD_DWORD::Inst_FLAT__FLAT_LOAD_DWORD(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_load_dword")
    {
        this->setFlag(MemoryRef);
        this->setFlag(Load);
    } // Inst_FLAT__FLAT_LOAD_DWORD

    // Destruction performs no explicit cleanup.
    Inst_FLAT__FLAT_LOAD_DWORD::~Inst_FLAT__FLAT_LOAD_DWORD() = default;

    // Execute stage of flat_load_dword. With an empty EXEC mask the wavefront
    // counters are decremented and the operand write is scheduled without
    // issuing a memory request. Otherwise per-lane addresses are computed from
    // the ADDR operand and the request is issued to the global memory pipe;
    // any other segment is a fatal error.
    void
    Inst_FLAT__FLAT_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        if (wave->execMask().none()) {
            // No active lanes: account for the instruction and return early.
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            wave->rdGmReqsInPipe--;
            wave->rdLmReqsInPipe--;
            gpuDynInst->exec_mask = wave->execMask();
            wave->computeUnit->vrf[wave->simdId]
                ->scheduleWriteOperandsFromLoad(wave, gpuDynInst);
            return;
        }

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();

        auto *cu = gpuDynInst->computeUnit();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        ConstVecOperandU64 flatAddr(gpuDynInst, extData.ADDR);
        flatAddr.read();
        calcAddr(gpuDynInst, flatAddr);

        if (gpuDynInst->executedAs() != Enums::SC_GLOBAL) {
            fatal("Non global flat instructions not implemented yet.\n");
        } else {
            cu->globalMemoryPipe.issueRequest(gpuDynInst);
            wave->rdGmReqsInPipe--;
            wave->outstandingReqsRdGm++;
        }

        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    // Starts the memory access for flat_load_dword by delegating to
    // initMemRead, instantiated with the VecElemU32 element type.
    void
    Inst_FLAT__FLAT_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<VecElemU32>(gpuDynInst);
    } // initiateAcc

    // Copies the returned data into the destination VGPR. d_data is viewed
    // as an array of VecElemU32 indexed by lane; each lane set in the
    // instruction's exec_mask receives its element in VDST. Lanes that are
    // not set are not assigned.
    void
    Inst_FLAT__FLAT_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU32 vdst(gpuDynInst, extData.VDST);
        VecElemU32 *const loaded =
            reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            vdst[lane] = loaded[lane];
        }

        vdst.write();
    } // completeAcc

    // Builds the flat_load_dwordx2 instruction: forwards the FLAT encoding
    // and the mnemonic to Inst_FLAT, then sets the MemoryRef and Load flags.
    Inst_FLAT__FLAT_LOAD_DWORDX2::Inst_FLAT__FLAT_LOAD_DWORDX2(
          InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_load_dwordx2")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_FLAT__FLAT_LOAD_DWORDX2

    // Destructor; the body is empty, so nothing is released here.
    Inst_FLAT__FLAT_LOAD_DWORDX2::~Inst_FLAT__FLAT_LOAD_DWORDX2()
    {
    } // ~Inst_FLAT__FLAT_LOAD_DWORDX2

    // Untyped buffer load 2 dwords.
    //
    // Issue-side handling of the load. With an empty execution mask the
    // wavefront counters touched below are decremented, the (empty) mask is
    // copied into the instruction, the VRF is asked to schedule the
    // destination write, and nothing is sent to memory. Otherwise the
    // per-lane addresses are read from ADDR, passed to calcAddr, and the
    // instruction is handed to the compute unit's global memory pipeline.
    // Any segment other than SC_GLOBAL ends the simulation via fatal().
    void
    Inst_FLAT__FLAT_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // No lane is enabled: adjust the counters and skip the access.
        if (wave->execMask().none()) {
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            wave->rdGmReqsInPipe--;
            wave->rdLmReqsInPipe--;
            gpuDynInst->exec_mask = wave->execMask();
            wave->computeUnit->vrf[wave->simdId]->
                scheduleWriteOperandsFromLoad(wave, gpuDynInst);
            return;
        }

        // Record where and under which mask the instruction executes, and
        // give it a latency of one compute-unit clock period.
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        ConstVecOperandU64 laneAddr(gpuDynInst, extData.ADDR);
        laneAddr.read();
        calcAddr(gpuDynInst, laneAddr);

        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
            gpuDynInst->computeUnit()->globalMemoryPipe
                .issueRequest(gpuDynInst);
            // One read leaves the "in pipe" count and becomes outstanding.
            wave->rdGmReqsInPipe--;
            wave->outstandingReqsRdGm++;
        } else {
            fatal("Non global flat instructions not implemented yet.\n");
        }

        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    // Starts the memory access for flat_load_dwordx2 by delegating to
    // initMemRead, instantiated with the VecElemU64 element type.
    void
    Inst_FLAT__FLAT_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<VecElemU64>(gpuDynInst);
    } // initiateAcc

    // Copies the returned data into the 64-bit destination operand. d_data
    // is viewed as an array of VecElemU64 indexed by lane; each lane set in
    // the instruction's exec_mask receives its element in VDST. Lanes that
    // are not set are not assigned.
    void
    Inst_FLAT__FLAT_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU64 vdst(gpuDynInst, extData.VDST);
        VecElemU64 *const loaded =
            reinterpret_cast<VecElemU64*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            vdst[lane] = loaded[lane];
        }

        vdst.write();
    } // completeAcc

    // Builds the flat_load_dwordx3 instruction: forwards the FLAT encoding
    // and the mnemonic to Inst_FLAT, then sets the MemoryRef and Load flags.
    Inst_FLAT__FLAT_LOAD_DWORDX3::Inst_FLAT__FLAT_LOAD_DWORDX3(
          InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_load_dwordx3")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_FLAT__FLAT_LOAD_DWORDX3

    // Destructor; the body is empty, so nothing is released here.
    Inst_FLAT__FLAT_LOAD_DWORDX3::~Inst_FLAT__FLAT_LOAD_DWORDX3()
    {
    } // ~Inst_FLAT__FLAT_LOAD_DWORDX3

    // Untyped buffer load 3 dwords.
    //
    // Issue-side handling of the load. With an empty execution mask the
    // wavefront counters touched below are decremented, the (empty) mask is
    // copied into the instruction, the VRF is asked to schedule the
    // destination write, and nothing is sent to memory. Otherwise the
    // per-lane addresses are read from ADDR, passed to calcAddr, and the
    // instruction is handed to the compute unit's global memory pipeline.
    // Any segment other than SC_GLOBAL ends the simulation via fatal().
    void
    Inst_FLAT__FLAT_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // No lane is enabled: adjust the counters and skip the access.
        if (wave->execMask().none()) {
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            wave->rdGmReqsInPipe--;
            wave->rdLmReqsInPipe--;
            gpuDynInst->exec_mask = wave->execMask();
            wave->computeUnit->vrf[wave->simdId]->
                scheduleWriteOperandsFromLoad(wave, gpuDynInst);
            return;
        }

        // Record where and under which mask the instruction executes, and
        // give it a latency of one compute-unit clock period.
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        ConstVecOperandU64 laneAddr(gpuDynInst, extData.ADDR);
        laneAddr.read();
        calcAddr(gpuDynInst, laneAddr);

        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
            gpuDynInst->computeUnit()->globalMemoryPipe
                .issueRequest(gpuDynInst);
            // One read leaves the "in pipe" count and becomes outstanding.
            wave->rdGmReqsInPipe--;
            wave->outstandingReqsRdGm++;
        } else {
            fatal("Non global flat instructions not implemented yet.\n");
        }

        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    // Starts the memory access for flat_load_dwordx3 by delegating to
    // initMemRead with a template argument of 3 (completeAcc below unpacks
    // three VecElemU32 values per lane from d_data).
    void
    Inst_FLAT__FLAT_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<3>(gpuDynInst);
    } // initiateAcc

    // Unpacks the returned data into three consecutive destination VGPRs
    // (VDST, VDST + 1, VDST + 2). d_data is viewed as an array of
    // VecElemU32 in which lane L occupies elements 3L, 3L + 1 and 3L + 2.
    // Only lanes set in the instruction's exec_mask are assigned.
    void
    Inst_FLAT__FLAT_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU32 vdst0(gpuDynInst, extData.VDST);
        VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1);
        VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2);

        VecElemU32 *const loaded =
            reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            vdst0[lane] = loaded[lane * 3];
            vdst1[lane] = loaded[lane * 3 + 1];
            vdst2[lane] = loaded[lane * 3 + 2];
        }

        vdst0.write();
        vdst1.write();
        vdst2.write();
    } // completeAcc

    // Builds the flat_load_dwordx4 instruction: forwards the FLAT encoding
    // and the mnemonic to Inst_FLAT, then sets the MemoryRef and Load flags.
    Inst_FLAT__FLAT_LOAD_DWORDX4::Inst_FLAT__FLAT_LOAD_DWORDX4(
          InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_load_dwordx4")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_FLAT__FLAT_LOAD_DWORDX4

    // Destructor; the body is empty, so nothing is released here.
    Inst_FLAT__FLAT_LOAD_DWORDX4::~Inst_FLAT__FLAT_LOAD_DWORDX4()
    {
    } // ~Inst_FLAT__FLAT_LOAD_DWORDX4

    // Untyped buffer load 4 dwords.
    //
    // Issue-side handling of the load. With an empty execution mask the
    // wavefront counters touched below are decremented, the (empty) mask is
    // copied into the instruction, the VRF is asked to schedule the
    // destination write, and nothing is sent to memory. Otherwise the
    // per-lane addresses are read from ADDR, passed to calcAddr, and the
    // instruction is handed to the compute unit's global memory pipeline.
    // Any segment other than SC_GLOBAL ends the simulation via fatal().
    void
    Inst_FLAT__FLAT_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // No lane is enabled: adjust the counters and skip the access.
        if (wave->execMask().none()) {
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            wave->rdGmReqsInPipe--;
            wave->rdLmReqsInPipe--;
            gpuDynInst->exec_mask = wave->execMask();
            wave->computeUnit->vrf[wave->simdId]->
                scheduleWriteOperandsFromLoad(wave, gpuDynInst);
            return;
        }

        // Record where and under which mask the instruction executes, and
        // give it a latency of one compute-unit clock period.
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        ConstVecOperandU64 laneAddr(gpuDynInst, extData.ADDR);
        laneAddr.read();
        calcAddr(gpuDynInst, laneAddr);

        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
            gpuDynInst->computeUnit()->globalMemoryPipe
                .issueRequest(gpuDynInst);
            // One read leaves the "in pipe" count and becomes outstanding.
            wave->rdGmReqsInPipe--;
            wave->outstandingReqsRdGm++;
        } else {
            fatal("Non global flat instructions not implemented yet.\n");
        }

        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    // Starts the memory access for flat_load_dwordx4 by delegating to
    // initMemRead with a template argument of 4 (completeAcc below unpacks
    // four VecElemU32 values per lane from d_data).
    void
    Inst_FLAT__FLAT_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<4>(gpuDynInst);
    } // initiateAcc

    // Unpacks the returned data into four consecutive destination VGPRs
    // (VDST .. VDST + 3). d_data is viewed as an array of VecElemU32 in
    // which lane L occupies elements 4L through 4L + 3. Only lanes set in
    // the instruction's exec_mask are assigned.
    void
    Inst_FLAT__FLAT_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        VecOperandU32 vdst0(gpuDynInst, extData.VDST);
        VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1);
        VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2);
        VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3);

        VecElemU32 *const loaded =
            reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            vdst0[lane] = loaded[lane * 4];
            vdst1[lane] = loaded[lane * 4 + 1];
            vdst2[lane] = loaded[lane * 4 + 2];
            vdst3[lane] = loaded[lane * 4 + 3];
        }

        vdst0.write();
        vdst1.write();
        vdst2.write();
        vdst3.write();
    } // completeAcc

    // Builds the flat_store_byte instruction: forwards the FLAT encoding
    // and the mnemonic to Inst_FLAT, then sets the MemoryRef and Store flags.
    Inst_FLAT__FLAT_STORE_BYTE::Inst_FLAT__FLAT_STORE_BYTE(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_store_byte")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_FLAT__FLAT_STORE_BYTE

    // Destructor; the body is empty, so nothing is released here.
    Inst_FLAT__FLAT_STORE_BYTE::~Inst_FLAT__FLAT_STORE_BYTE()
    {
    } // ~Inst_FLAT__FLAT_STORE_BYTE

    // Untyped buffer store byte.
    //
    // Issue-side handling of the store. With an empty execution mask the
    // wavefront counters touched below are decremented and nothing is sent
    // to memory. Otherwise the per-lane addresses are read from ADDR,
    // passed to calcAddr, and the instruction is handed to the compute
    // unit's global memory pipeline. Any segment other than SC_GLOBAL ends
    // the simulation via fatal().
    void
    Inst_FLAT__FLAT_STORE_BYTE::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // No lane is enabled: adjust the counters and skip the access.
        if (wave->execMask().none()) {
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            wave->wrGmReqsInPipe--;
            wave->wrLmReqsInPipe--;
            return;
        }

        // Record where and under which mask the instruction executes, and
        // give it a latency of one compute-unit clock period.
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        ConstVecOperandU64 laneAddr(gpuDynInst, extData.ADDR);
        laneAddr.read();
        calcAddr(gpuDynInst, laneAddr);

        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
            gpuDynInst->computeUnit()->globalMemoryPipe
                .issueRequest(gpuDynInst);
            // One write leaves the "in pipe" count and becomes outstanding.
            wave->wrGmReqsInPipe--;
            wave->outstandingReqsWrGm++;
        } else {
            fatal("Non global flat instructions not implemented yet.\n");
        }

        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    // Stages the store data and starts the write. The DATA operand is read
    // as VecElemU8 values; for each lane set in exec_mask the lane's value
    // is copied into d_data (viewed as a VecElemU8 array indexed by lane).
    // initMemWrite<VecElemU8> is then called to begin the access.
    void
    Inst_FLAT__FLAT_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        ConstVecOperandU8 src(gpuDynInst, extData.DATA);
        src.read();

        VecElemU8 *const staged =
            reinterpret_cast<VecElemU8*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            staged[lane] = src[lane];
        }

        initMemWrite<VecElemU8>(gpuDynInst);
    } // initiateAcc

    // Empty: this store writes no destination register on completion.
    void
    Inst_FLAT__FLAT_STORE_BYTE::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    // Builds the flat_store_short instruction: forwards the FLAT encoding
    // and the mnemonic to Inst_FLAT, then sets the MemoryRef and Store flags.
    Inst_FLAT__FLAT_STORE_SHORT::Inst_FLAT__FLAT_STORE_SHORT(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_store_short")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_FLAT__FLAT_STORE_SHORT

    // Destructor; the body is empty, so nothing is released here.
    Inst_FLAT__FLAT_STORE_SHORT::~Inst_FLAT__FLAT_STORE_SHORT()
    {
    } // ~Inst_FLAT__FLAT_STORE_SHORT

    // Untyped buffer store short.
    //
    // Issue-side handling of the store. With an empty execution mask the
    // wavefront counters touched below are decremented and nothing is sent
    // to memory. Otherwise the per-lane addresses are read from ADDR,
    // passed to calcAddr, and the instruction is handed to the compute
    // unit's global memory pipeline. Any segment other than SC_GLOBAL ends
    // the simulation via fatal().
    void
    Inst_FLAT__FLAT_STORE_SHORT::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // No lane is enabled: adjust the counters and skip the access.
        if (wave->execMask().none()) {
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            wave->wrGmReqsInPipe--;
            wave->wrLmReqsInPipe--;
            return;
        }

        // Record where and under which mask the instruction executes, and
        // give it a latency of one compute-unit clock period.
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        ConstVecOperandU64 laneAddr(gpuDynInst, extData.ADDR);
        laneAddr.read();
        calcAddr(gpuDynInst, laneAddr);

        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
            gpuDynInst->computeUnit()->globalMemoryPipe
                .issueRequest(gpuDynInst);
            // One write leaves the "in pipe" count and becomes outstanding.
            wave->wrGmReqsInPipe--;
            wave->outstandingReqsWrGm++;
        } else {
            fatal("Non global flat instructions not implemented yet.\n");
        }

        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    // Stages the store data and starts the write. The DATA operand is read
    // as VecElemU16 values; for each lane set in exec_mask the lane's value
    // is copied into d_data (viewed as a VecElemU16 array indexed by lane).
    // initMemWrite<VecElemU16> is then called to begin the access.
    void
    Inst_FLAT__FLAT_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        ConstVecOperandU16 src(gpuDynInst, extData.DATA);
        src.read();

        VecElemU16 *const staged =
            reinterpret_cast<VecElemU16*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            staged[lane] = src[lane];
        }

        initMemWrite<VecElemU16>(gpuDynInst);
    } // initiateAcc

    // Empty: this store writes no destination register on completion.
    void
    Inst_FLAT__FLAT_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // Builds the flat_store_dword instruction: forwards the FLAT encoding
    // and the mnemonic to Inst_FLAT, then sets the MemoryRef and Store flags.
    Inst_FLAT__FLAT_STORE_DWORD::Inst_FLAT__FLAT_STORE_DWORD(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_store_dword")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_FLAT__FLAT_STORE_DWORD

    // Destructor; the body is empty, so nothing is released here.
    Inst_FLAT__FLAT_STORE_DWORD::~Inst_FLAT__FLAT_STORE_DWORD()
    {
    } // ~Inst_FLAT__FLAT_STORE_DWORD

    // Untyped buffer store dword.
    //
    // Issue-side handling of the store. With an empty execution mask the
    // wavefront counters touched below are decremented and nothing is sent
    // to memory. Otherwise the per-lane addresses are read from ADDR,
    // passed to calcAddr, and the instruction is handed to the compute
    // unit's global memory pipeline. Any segment other than SC_GLOBAL ends
    // the simulation via fatal().
    void
    Inst_FLAT__FLAT_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // No lane is enabled: adjust the counters and skip the access.
        if (wave->execMask().none()) {
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            wave->wrGmReqsInPipe--;
            wave->wrLmReqsInPipe--;
            return;
        }

        // Record where and under which mask the instruction executes, and
        // give it a latency of one compute-unit clock period.
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        ConstVecOperandU64 laneAddr(gpuDynInst, extData.ADDR);
        laneAddr.read();
        calcAddr(gpuDynInst, laneAddr);

        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
            gpuDynInst->computeUnit()->globalMemoryPipe
                .issueRequest(gpuDynInst);
            // One write leaves the "in pipe" count and becomes outstanding.
            wave->wrGmReqsInPipe--;
            wave->outstandingReqsWrGm++;
        } else {
            fatal("Non global flat instructions not implemented yet.\n");
        }

        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    // Stages the store data and starts the write. The DATA operand is read
    // as VecElemU32 values; for each lane set in exec_mask the lane's value
    // is copied into d_data (viewed as a VecElemU32 array indexed by lane).
    // initMemWrite<VecElemU32> is then called to begin the access.
    void
    Inst_FLAT__FLAT_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        ConstVecOperandU32 src(gpuDynInst, extData.DATA);
        src.read();

        VecElemU32 *const staged =
            reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            staged[lane] = src[lane];
        }

        initMemWrite<VecElemU32>(gpuDynInst);
    } // initiateAcc

    // Empty: this store writes no destination register on completion.
    void
    Inst_FLAT__FLAT_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // Builds the flat_store_dwordx2 instruction: forwards the FLAT encoding
    // and the mnemonic to Inst_FLAT, then sets the MemoryRef and Store flags.
    Inst_FLAT__FLAT_STORE_DWORDX2::Inst_FLAT__FLAT_STORE_DWORDX2(
          InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_store_dwordx2")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_FLAT__FLAT_STORE_DWORDX2

    // Destructor; the body is empty, so nothing is released here.
    Inst_FLAT__FLAT_STORE_DWORDX2::~Inst_FLAT__FLAT_STORE_DWORDX2()
    {
    } // ~Inst_FLAT__FLAT_STORE_DWORDX2

    // Untyped buffer store 2 dwords.
    //
    // Issue-side handling of the store. With an empty execution mask the
    // wavefront counters touched below are decremented and nothing is sent
    // to memory. Otherwise the per-lane addresses are read from ADDR,
    // passed to calcAddr, and the instruction is handed to the compute
    // unit's global memory pipeline. Any segment other than SC_GLOBAL ends
    // the simulation via fatal().
    void
    Inst_FLAT__FLAT_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // No lane is enabled: adjust the counters and skip the access.
        if (wave->execMask().none()) {
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            wave->wrGmReqsInPipe--;
            wave->wrLmReqsInPipe--;
            return;
        }

        // Record where and under which mask the instruction executes, and
        // give it a latency of one compute-unit clock period.
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        ConstVecOperandU64 laneAddr(gpuDynInst, extData.ADDR);
        laneAddr.read();
        calcAddr(gpuDynInst, laneAddr);

        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
            gpuDynInst->computeUnit()->globalMemoryPipe
                .issueRequest(gpuDynInst);
            // One write leaves the "in pipe" count and becomes outstanding.
            wave->wrGmReqsInPipe--;
            wave->outstandingReqsWrGm++;
        } else {
            fatal("Non global flat instructions not implemented yet.\n");
        }

        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    // Stages the store data and starts the write. The DATA operand is read
    // as VecElemU64 values; for each lane set in exec_mask the lane's value
    // is copied into d_data (viewed as a VecElemU64 array indexed by lane).
    // initMemWrite<VecElemU64> is then called to begin the access.
    void
    Inst_FLAT__FLAT_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        ConstVecOperandU64 src(gpuDynInst, extData.DATA);
        src.read();

        VecElemU64 *const staged =
            reinterpret_cast<VecElemU64*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            staged[lane] = src[lane];
        }

        initMemWrite<VecElemU64>(gpuDynInst);
    } // initiateAcc

    // Empty: this store writes no destination register on completion.
    void
    Inst_FLAT__FLAT_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // Builds the flat_store_dwordx3 instruction: forwards the FLAT encoding
    // and the mnemonic to Inst_FLAT, then sets the MemoryRef and Store flags.
    Inst_FLAT__FLAT_STORE_DWORDX3::Inst_FLAT__FLAT_STORE_DWORDX3(
          InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_store_dwordx3")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_FLAT__FLAT_STORE_DWORDX3

    // Destructor; the body is empty, so nothing is released here.
    Inst_FLAT__FLAT_STORE_DWORDX3::~Inst_FLAT__FLAT_STORE_DWORDX3()
    {
    } // ~Inst_FLAT__FLAT_STORE_DWORDX3

    // Untyped buffer store 3 dwords.
    //
    // Issue-side handling of the store. With an empty execution mask the
    // wavefront counters touched below are decremented and nothing is sent
    // to memory. Otherwise the per-lane addresses are read from ADDR,
    // passed to calcAddr, and the instruction is handed to the compute
    // unit's global memory pipeline. Any segment other than SC_GLOBAL ends
    // the simulation via fatal().
    void
    Inst_FLAT__FLAT_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // No lane is enabled: adjust the counters and skip the access.
        if (wave->execMask().none()) {
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            wave->wrGmReqsInPipe--;
            wave->wrLmReqsInPipe--;
            return;
        }

        // Record where and under which mask the instruction executes, and
        // give it a latency of one compute-unit clock period.
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        ConstVecOperandU64 laneAddr(gpuDynInst, extData.ADDR);
        laneAddr.read();
        calcAddr(gpuDynInst, laneAddr);

        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
            gpuDynInst->computeUnit()->globalMemoryPipe
                .issueRequest(gpuDynInst);
            // One write leaves the "in pipe" count and becomes outstanding.
            wave->wrGmReqsInPipe--;
            wave->outstandingReqsWrGm++;
        } else {
            fatal("Non global flat instructions not implemented yet.\n");
        }

        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    // Stages the store data and starts the write. Three consecutive source
    // VGPRs (DATA, DATA + 1, DATA + 2) are read; for each lane L set in
    // exec_mask their values are packed into d_data (viewed as a VecElemU32
    // array) at elements 3L, 3L + 1 and 3L + 2. initMemWrite<3> is then
    // called to begin the access.
    void
    Inst_FLAT__FLAT_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        ConstVecOperandU32 src0(gpuDynInst, extData.DATA);
        ConstVecOperandU32 src1(gpuDynInst, extData.DATA + 1);
        ConstVecOperandU32 src2(gpuDynInst, extData.DATA + 2);

        src0.read();
        src1.read();
        src2.read();

        VecElemU32 *const staged =
            reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            staged[lane * 3] = src0[lane];
            staged[lane * 3 + 1] = src1[lane];
            staged[lane * 3 + 2] = src2[lane];
        }

        initMemWrite<3>(gpuDynInst);
    } // initiateAcc

    // Empty: this store writes no destination register on completion.
    void
    Inst_FLAT__FLAT_STORE_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // Builds the flat_store_dwordx4 instruction: forwards the FLAT encoding
    // and the mnemonic to Inst_FLAT, then sets the MemoryRef and Store flags.
    Inst_FLAT__FLAT_STORE_DWORDX4::Inst_FLAT__FLAT_STORE_DWORDX4(
          InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_store_dwordx4")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_FLAT__FLAT_STORE_DWORDX4

    // Destructor; the body is empty, so nothing is released here.
    Inst_FLAT__FLAT_STORE_DWORDX4::~Inst_FLAT__FLAT_STORE_DWORDX4()
    {
    } // ~Inst_FLAT__FLAT_STORE_DWORDX4

    // Untyped buffer store 4 dwords.
    //
    // Issue-side handling of the store. With an empty execution mask the
    // wavefront counters touched below are decremented and nothing is sent
    // to memory. Otherwise the per-lane addresses are read from ADDR,
    // passed to calcAddr, and the instruction is handed to the compute
    // unit's global memory pipeline. Any segment other than SC_GLOBAL ends
    // the simulation via fatal().
    void
    Inst_FLAT__FLAT_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // No lane is enabled: adjust the counters and skip the access.
        if (wave->execMask().none()) {
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            wave->wrGmReqsInPipe--;
            wave->wrLmReqsInPipe--;
            return;
        }

        // Record where and under which mask the instruction executes, and
        // give it a latency of one compute-unit clock period.
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        ConstVecOperandU64 laneAddr(gpuDynInst, extData.ADDR);
        laneAddr.read();
        calcAddr(gpuDynInst, laneAddr);

        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
            gpuDynInst->computeUnit()->globalMemoryPipe
                .issueRequest(gpuDynInst);
            // One write leaves the "in pipe" count and becomes outstanding.
            wave->wrGmReqsInPipe--;
            wave->outstandingReqsWrGm++;
        } else {
            fatal("Non global flat instructions not implemented yet.\n");
        }

        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    // Stages the store data and starts the write. Four consecutive source
    // VGPRs (DATA .. DATA + 3) are read; for each lane L set in exec_mask
    // their values are packed into d_data (viewed as a VecElemU32 array) at
    // elements 4L through 4L + 3. initMemWrite<4> is then called to begin
    // the access.
    void
    Inst_FLAT__FLAT_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        ConstVecOperandU32 src0(gpuDynInst, extData.DATA);
        ConstVecOperandU32 src1(gpuDynInst, extData.DATA + 1);
        ConstVecOperandU32 src2(gpuDynInst, extData.DATA + 2);
        ConstVecOperandU32 src3(gpuDynInst, extData.DATA + 3);

        src0.read();
        src1.read();
        src2.read();
        src3.read();

        VecElemU32 *const staged =
            reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            staged[lane * 4] = src0[lane];
            staged[lane * 4 + 1] = src1[lane];
            staged[lane * 4 + 2] = src2[lane];
            staged[lane * 4 + 3] = src3[lane];
        }

        initMemWrite<4>(gpuDynInst);
    } // initiateAcc

    // Empty: this store writes no destination register on completion.
    void
    Inst_FLAT__FLAT_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc

    // Builds the flat_atomic_swap instruction. Flags are set in this order:
    // AtomicExch, then AtomicReturn when the encoding's GLC bit is set
    // (AtomicNoReturn otherwise), then MemoryRef.
    Inst_FLAT__FLAT_ATOMIC_SWAP::Inst_FLAT__FLAT_ATOMIC_SWAP(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_swap")
    {
        setFlag(AtomicExch);
        // GLC selects the returning or non-returning form of the atomic.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_SWAP

    // Destructor; the body is empty, so nothing is released here.
    Inst_FLAT__FLAT_ATOMIC_SWAP::~Inst_FLAT__FLAT_ATOMIC_SWAP()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_SWAP

    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA;
    // RETURN_DATA = tmp.
    //
    // Issue-side handling of the atomic swap.
    //
    // With an empty execution mask the wavefront counters touched below are
    // decremented and nothing is sent to memory; when GLC is set (the
    // returning form) the empty mask is copied into the instruction and
    // the VRF is asked to schedule the destination write.
    //
    // Otherwise the per-lane addresses (ADDR) and swap values (DATA) are
    // read, the addresses are passed to calcAddr, and the swap value of
    // every active lane is copied into a_data. The operand is staged
    // BEFORE the instruction is handed to the global memory pipeline, in
    // the same order the other FLAT atomics in this file use, so the
    // instruction is fully populated when it enters the pipeline.
    //
    // SC_GLOBAL accesses are issued to the global memory pipeline.
    // SC_PRIVATE trips the panic_if below (untested path); any other
    // segment ends the simulation via fatal().
    void
    Inst_FLAT__FLAT_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        if (wf->execMask().none()) {
            wf->decVMemInstsIssued();
            wf->decLGKMInstsIssued();
            wf->wrGmReqsInPipe--;
            wf->rdGmReqsInPipe--;
            wf->wrLmReqsInPipe--;
            wf->rdLmReqsInPipe--;
            if (instData.GLC) {
                gpuDynInst->exec_mask = wf->execMask();
                wf->computeUnit->vrf[wf->simdId]->
                    scheduleWriteOperandsFromLoad(wf, gpuDynInst);
            }
            return;
        }

        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->exec_mask = wf->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
        ConstVecOperandU32 data(gpuDynInst, extData.DATA);

        addr.read();
        data.read();

        calcAddr(gpuDynInst, addr);

        // Stage the value to be swapped in for every active lane.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane]
                    = data[lane];
            }
        }

        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL ||
            gpuDynInst->executedAs() == Enums::SC_PRIVATE) {
            // TODO: additional address computation required for scratch
            panic_if(gpuDynInst->executedAs() == Enums::SC_PRIVATE,
                     "Flats to private aperture not tested yet\n");
            gpuDynInst->computeUnit()->globalMemoryPipe.
                issueRequest(gpuDynInst);
            // The atomic is counted as both a write and a read.
            wf->wrGmReqsInPipe--;
            wf->outstandingReqsWrGm++;
            wf->rdGmReqsInPipe--;
            wf->outstandingReqsRdGm++;
        } else {
            fatal("Non global flat instructions not implemented yet.\n");
        }

        wf->outstandingReqs++;
        wf->validateRequestCounters();
    } // execute

    // Starts the atomic memory access by delegating to initAtomicAccess,
    // instantiated with the VecElemU32 element type.
    void
    Inst_FLAT__FLAT_ATOMIC_SWAP::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initAtomicAccess<VecElemU32>(gpuDynInst);
    } // initiateAcc

    // Writes the returned value to VDST, but only for the returning form
    // of the atomic (isAtomicRet()). d_data is viewed as an array of
    // VecElemU32 indexed by lane; only lanes set in exec_mask are assigned.
    void
    Inst_FLAT__FLAT_ATOMIC_SWAP::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // The non-returning form has no destination register to update.
        if (!isAtomicRet()) {
            return;
        }

        VecOperandU32 vdst(gpuDynInst, extData.VDST);
        VecElemU32 *const returned =
            reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            vdst[lane] = returned[lane];
        }

        vdst.write();
    } // completeAcc

    // --- Inst_FLAT__FLAT_ATOMIC_CMPSWAP class methods ---

    // Builds the flat_atomic_cmpswap instruction. Flags are set in this
    // order: AtomicCAS, then AtomicReturn when the encoding's GLC bit is
    // set (AtomicNoReturn otherwise), then MemoryRef.
    Inst_FLAT__FLAT_ATOMIC_CMPSWAP
        ::Inst_FLAT__FLAT_ATOMIC_CMPSWAP(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_cmpswap")
    {
        setFlag(AtomicCAS);
        // GLC selects the returning or non-returning form of the atomic.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_CMPSWAP

    // Destructor; the body is empty, so nothing is released here.
    Inst_FLAT__FLAT_ATOMIC_CMPSWAP::~Inst_FLAT__FLAT_ATOMIC_CMPSWAP()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP

    // tmp = MEM[ADDR];
    // src = DATA[0];
    // cmp = DATA[1];
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    //
    // Issue-side handling of the atomic compare-and-swap.
    //
    // With an empty execution mask the wavefront counters touched below are
    // decremented and nothing is sent to memory; when GLC is set (the
    // returning form) the empty mask is copied into the instruction and
    // the VRF is asked to schedule the destination write.
    //
    // Otherwise the addresses (ADDR), swap values (DATA) and compare values
    // (DATA + 1) are read, the addresses are passed to calcAddr, and for
    // every active lane the swap value is copied into x_data and the
    // compare value into a_data. SC_GLOBAL accesses are then issued to the
    // global memory pipeline; SC_PRIVATE trips the panic_if below; any
    // other segment ends the simulation via fatal().
    void
    Inst_FLAT__FLAT_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // No lane is enabled: adjust the counters and skip the access.
        if (wave->execMask().none()) {
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            wave->wrGmReqsInPipe--;
            wave->rdGmReqsInPipe--;
            wave->wrLmReqsInPipe--;
            wave->rdLmReqsInPipe--;
            if (instData.GLC) {
                gpuDynInst->exec_mask = wave->execMask();
                wave->computeUnit->vrf[wave->simdId]->
                    scheduleWriteOperandsFromLoad(wave, gpuDynInst);
            }
            return;
        }

        // Record where and under which mask the instruction executes, and
        // give it a latency of one compute-unit clock period.
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        ConstVecOperandU64 laneAddr(gpuDynInst, extData.ADDR);
        ConstVecOperandU32 swapVal(gpuDynInst, extData.DATA);
        ConstVecOperandU32 cmpVal(gpuDynInst, extData.DATA + 1);

        laneAddr.read();
        swapVal.read();
        cmpVal.read();

        calcAddr(gpuDynInst, laneAddr);

        VecElemU32 *const swapBuf =
            reinterpret_cast<VecElemU32*>(gpuDynInst->x_data);
        VecElemU32 *const cmpBuf =
            reinterpret_cast<VecElemU32*>(gpuDynInst->a_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            swapBuf[lane] = swapVal[lane];
            cmpBuf[lane] = cmpVal[lane];
        }

        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL ||
            gpuDynInst->executedAs() == Enums::SC_PRIVATE) {
            /**
             * TODO: If this panic fires, remove it and restart the
             * simulation. The private-aperture path is expected to work
             * because all the necessary logic is implemented; the panic
             * only warns the user that the path has never been tested.
             */
            panic_if(gpuDynInst->executedAs() == Enums::SC_PRIVATE,
                     "Flats to private aperture not tested yet\n");
            gpuDynInst->computeUnit()->globalMemoryPipe.
                issueRequest(gpuDynInst);
            // The atomic is counted as both a write and a read.
            wave->wrGmReqsInPipe--;
            wave->outstandingReqsWrGm++;
            wave->rdGmReqsInPipe--;
            wave->outstandingReqsRdGm++;
        } else {
            fatal("Non global flat instructions not implemented yet.\n");
        }

        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    // Starts the atomic memory access by delegating to initAtomicAccess,
    // instantiated with the VecElemU32 element type.
    void
    Inst_FLAT__FLAT_ATOMIC_CMPSWAP::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initAtomicAccess<VecElemU32>(gpuDynInst);
    } // initiateAcc

    // Writes the returned value to VDST, but only for the returning form
    // of the atomic (isAtomicRet()). d_data is viewed as an array of
    // VecElemU32 indexed by lane; only lanes set in exec_mask are assigned.
    void
    Inst_FLAT__FLAT_ATOMIC_CMPSWAP::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // The non-returning form has no destination register to update.
        if (!isAtomicRet()) {
            return;
        }

        VecOperandU32 vdst(gpuDynInst, extData.VDST);
        VecElemU32 *const returned =
            reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            vdst[lane] = returned[lane];
        }

        vdst.write();
    } // completeAcc

    // Builds the flat_atomic_add instruction. Flags are set in this order:
    // AtomicAdd, then AtomicReturn when the encoding's GLC bit is set
    // (AtomicNoReturn otherwise), then MemoryRef.
    Inst_FLAT__FLAT_ATOMIC_ADD::Inst_FLAT__FLAT_ATOMIC_ADD(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_add")
    {
        setFlag(AtomicAdd);
        // GLC selects the returning or non-returning form of the atomic.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_ADD

    // Destructor; the body is empty, so nothing is released here.
    Inst_FLAT__FLAT_ATOMIC_ADD::~Inst_FLAT__FLAT_ATOMIC_ADD()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_ADD

    // tmp = MEM[ADDR];
    // MEM[ADDR] += DATA;
    // RETURN_DATA = tmp.
    //
    // Issue-side handling of the atomic add.
    //
    // With an empty execution mask the wavefront counters touched below are
    // decremented and nothing is sent to memory; when GLC is set (the
    // returning form) the empty mask is copied into the instruction and
    // the VRF is asked to schedule the destination write.
    //
    // Otherwise the addresses (ADDR) and addends (DATA) are read, the
    // addresses are passed to calcAddr, and the addend of every active lane
    // is copied into a_data. SC_GLOBAL accesses are then issued to the
    // global memory pipeline; any other segment ends the simulation via
    // fatal().
    void
    Inst_FLAT__FLAT_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // No lane is enabled: adjust the counters and skip the access.
        if (wave->execMask().none()) {
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            wave->wrGmReqsInPipe--;
            wave->rdGmReqsInPipe--;
            wave->wrLmReqsInPipe--;
            wave->rdLmReqsInPipe--;
            if (instData.GLC) {
                gpuDynInst->exec_mask = wave->execMask();
                wave->computeUnit->vrf[wave->simdId]->
                    scheduleWriteOperandsFromLoad(wave, gpuDynInst);
            }
            return;
        }

        // Record where and under which mask the instruction executes, and
        // give it a latency of one compute-unit clock period.
        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        ConstVecOperandU64 laneAddr(gpuDynInst, extData.ADDR);
        ConstVecOperandU32 addend(gpuDynInst, extData.DATA);

        laneAddr.read();
        addend.read();

        calcAddr(gpuDynInst, laneAddr);

        VecElemU32 *const operandBuf =
            reinterpret_cast<VecElemU32*>(gpuDynInst->a_data);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            operandBuf[lane] = addend[lane];
        }

        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
            gpuDynInst->computeUnit()->globalMemoryPipe.
                issueRequest(gpuDynInst);
            // The atomic is counted as both a write and a read.
            wave->wrGmReqsInPipe--;
            wave->outstandingReqsWrGm++;
            wave->rdGmReqsInPipe--;
            wave->outstandingReqsRdGm++;
        } else {
            fatal("Non global flat instructions not implemented yet.\n");
        }

        wave->outstandingReqs++;
        wave->validateRequestCounters();
    } // execute

    void
    Inst_FLAT__FLAT_ATOMIC_ADD::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Delegates to the initAtomicAccess helper, instantiated for
        // VecElemU32 elements.
        initAtomicAccess<VecElemU32>(gpuDynInst);
    } // initiateAcc

    void
    Inst_FLAT__FLAT_ATOMIC_ADD::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Nothing is written back unless isAtomicRet() holds.
        if (!isAtomicRet()) {
            return;
        }

        VecOperandU32 vdst(gpuDynInst, extData.VDST);
        const VecElemU32 *retData =
            reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        // Copy d_data into VDST for the lanes enabled in exec_mask.
        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!gpuDynInst->exec_mask[laneId]) {
                continue;
            }
            vdst[laneId] = retData[laneId];
        }

        vdst.write();
    } // completeAcc

    Inst_FLAT__FLAT_ATOMIC_SUB::Inst_FLAT__FLAT_ATOMIC_SUB(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_sub")
    {
        // Operation class for this opcode.
        setFlag(AtomicSub);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_SUB

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_SUB::~Inst_FLAT__FLAT_ATOMIC_SUB()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_SUB

    // tmp = MEM[ADDR];
    // MEM[ADDR] -= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // No lane is active: decrement the issued-instruction and in-pipe
        // request counters and return without issuing a memory request.
        if (wave->execMask().none()) {
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            --wave->wrGmReqsInPipe;
            --wave->rdGmReqsInPipe;
            --wave->wrLmReqsInPipe;
            --wave->rdLmReqsInPipe;
            if (instData.GLC) {
                // With GLC set, scheduleWriteOperandsFromLoad is still
                // called on the vrf entry for this wavefront's simdId.
                gpuDynInst->exec_mask = wave->execMask();
                wave->computeUnit->vrf[wave->simdId]->
                    scheduleWriteOperandsFromLoad(wave, gpuDynInst);
            }
            return;
        }

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        // 64-bit address operand and 32-bit data operand.
        ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
        ConstVecOperandU32 data(gpuDynInst, extData.DATA);

        addr.read();
        data.read();

        calcAddr(gpuDynInst, addr);

        // Copy the data operand of every active lane into a_data.
        VecElemU32 *atomicData =
            reinterpret_cast<VecElemU32*>(gpuDynInst->a_data);
        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!gpuDynInst->exec_mask[laneId]) {
                continue;
            }
            atomicData[laneId] = data[laneId];
        }

        // Only the SC_GLOBAL case is handled; anything else ends in fatal().
        if (gpuDynInst->executedAs() != Enums::SC_GLOBAL) {
            fatal("Non global flat instructions not implemented yet.\n");
        } else {
            // Issue to the global memory pipe, then move one read and one
            // write from the in-pipe counters to the outstanding counters.
            gpuDynInst->computeUnit()->globalMemoryPipe.
                issueRequest(gpuDynInst);
            --wave->wrGmReqsInPipe;
            ++wave->outstandingReqsWrGm;
            --wave->rdGmReqsInPipe;
            ++wave->outstandingReqsRdGm;
        }

        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    } // execute
    void
    Inst_FLAT__FLAT_ATOMIC_SUB::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Delegates to the initAtomicAccess helper, instantiated for
        // VecElemU32 elements.
        initAtomicAccess<VecElemU32>(gpuDynInst);
    } // initiateAcc

    void
    Inst_FLAT__FLAT_ATOMIC_SUB::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Nothing is written back unless isAtomicRet() holds.
        if (!isAtomicRet()) {
            return;
        }

        VecOperandU32 vdst(gpuDynInst, extData.VDST);
        const VecElemU32 *retData =
            reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        // Copy d_data into VDST for the lanes enabled in exec_mask.
        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!gpuDynInst->exec_mask[laneId]) {
                continue;
            }
            vdst[laneId] = retData[laneId];
        }

        vdst.write();
    } // completeAcc

    Inst_FLAT__FLAT_ATOMIC_SMIN::Inst_FLAT__FLAT_ATOMIC_SMIN(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_smin")
    {
        // Operation class for this opcode.
        setFlag(AtomicMin);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_SMIN

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_SMIN::~Inst_FLAT__FLAT_ATOMIC_SMIN()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_SMIN

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare);
    // RETURN_DATA = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst)
    {
        // No functional model exists for this instruction yet; the only
        // action taken is the call to panicUnimplemented().
        panicUnimplemented();
    }

    Inst_FLAT__FLAT_ATOMIC_UMIN::Inst_FLAT__FLAT_ATOMIC_UMIN(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_umin")
    {
        // Operation class for this opcode.
        setFlag(AtomicMin);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_UMIN

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_UMIN::~Inst_FLAT__FLAT_ATOMIC_UMIN()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_UMIN

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare);
    // RETURN_DATA = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst)
    {
        // No functional model exists for this instruction yet; the only
        // action taken is the call to panicUnimplemented().
        panicUnimplemented();
    }

    Inst_FLAT__FLAT_ATOMIC_SMAX::Inst_FLAT__FLAT_ATOMIC_SMAX(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_smax")
    {
        // Operation class for this opcode.
        setFlag(AtomicMax);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_SMAX

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_SMAX::~Inst_FLAT__FLAT_ATOMIC_SMAX()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_SMAX

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare);
    // RETURN_DATA = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst)
    {
        // No functional model exists for this instruction yet; the only
        // action taken is the call to panicUnimplemented().
        panicUnimplemented();
    }

    Inst_FLAT__FLAT_ATOMIC_UMAX::Inst_FLAT__FLAT_ATOMIC_UMAX(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_umax")
    {
        // Operation class for this opcode.
        setFlag(AtomicMax);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_UMAX

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_UMAX::~Inst_FLAT__FLAT_ATOMIC_UMAX()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_UMAX

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare);
    // RETURN_DATA = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst)
    {
        // No functional model exists for this instruction yet; the only
        // action taken is the call to panicUnimplemented().
        panicUnimplemented();
    }

    Inst_FLAT__FLAT_ATOMIC_AND::Inst_FLAT__FLAT_ATOMIC_AND(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_and")
    {
        // Operation class for this opcode.
        setFlag(AtomicAnd);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_AND

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_AND::~Inst_FLAT__FLAT_ATOMIC_AND()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_AND

    // tmp = MEM[ADDR];
    // MEM[ADDR] &= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst)
    {
        // No functional model exists for this instruction yet; the only
        // action taken is the call to panicUnimplemented().
        panicUnimplemented();
    }

    Inst_FLAT__FLAT_ATOMIC_OR::Inst_FLAT__FLAT_ATOMIC_OR(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_or")
    {
        // Operation class for this opcode.
        setFlag(AtomicOr);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_OR

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_OR::~Inst_FLAT__FLAT_ATOMIC_OR()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_OR

    // tmp = MEM[ADDR];
    // MEM[ADDR] |= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst)
    {
        // No functional model exists for this instruction yet; the only
        // action taken is the call to panicUnimplemented().
        panicUnimplemented();
    }

    Inst_FLAT__FLAT_ATOMIC_XOR::Inst_FLAT__FLAT_ATOMIC_XOR(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_xor")
    {
        // Operation class for this opcode.
        setFlag(AtomicXor);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_XOR

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_XOR::~Inst_FLAT__FLAT_ATOMIC_XOR()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_XOR

    // tmp = MEM[ADDR];
    // MEM[ADDR] ^= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst)
    {
        // No functional model exists for this instruction yet; the only
        // action taken is the call to panicUnimplemented().
        panicUnimplemented();
    }

    Inst_FLAT__FLAT_ATOMIC_INC::Inst_FLAT__FLAT_ATOMIC_INC(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_inc")
    {
        // Operation class for this opcode.
        setFlag(AtomicInc);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_INC

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_INC::~Inst_FLAT__FLAT_ATOMIC_INC()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_INC

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare);
    // RETURN_DATA = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // No lane is active: decrement the issued-instruction and in-pipe
        // request counters and return without issuing a memory request.
        if (wave->execMask().none()) {
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            --wave->wrGmReqsInPipe;
            --wave->rdGmReqsInPipe;
            --wave->wrLmReqsInPipe;
            --wave->rdLmReqsInPipe;
            if (instData.GLC) {
                // With GLC set, scheduleWriteOperandsFromLoad is still
                // called on the vrf entry for this wavefront's simdId.
                gpuDynInst->exec_mask = wave->execMask();
                wave->computeUnit->vrf[wave->simdId]->
                    scheduleWriteOperandsFromLoad(wave, gpuDynInst);
            }
            return;
        }

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        // 64-bit address operand and 32-bit data operand.
        ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
        ConstVecOperandU32 data(gpuDynInst, extData.DATA);

        addr.read();
        data.read();

        calcAddr(gpuDynInst, addr);

        // Copy the data operand of every active lane into a_data.
        VecElemU32 *atomicData =
            reinterpret_cast<VecElemU32*>(gpuDynInst->a_data);
        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!gpuDynInst->exec_mask[laneId]) {
                continue;
            }
            atomicData[laneId] = data[laneId];
        }

        // Only the SC_GLOBAL case is handled; anything else ends in fatal().
        if (gpuDynInst->executedAs() != Enums::SC_GLOBAL) {
            fatal("Non global flat instructions not implemented yet.\n");
        } else {
            // Issue to the global memory pipe, then move one read and one
            // write from the in-pipe counters to the outstanding counters.
            gpuDynInst->computeUnit()->globalMemoryPipe.
                issueRequest(gpuDynInst);
            --wave->wrGmReqsInPipe;
            ++wave->outstandingReqsWrGm;
            --wave->rdGmReqsInPipe;
            ++wave->outstandingReqsRdGm;
        }

        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    } // execute

    void
    Inst_FLAT__FLAT_ATOMIC_INC::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Delegates to the initAtomicAccess helper, instantiated for
        // VecElemU32 elements.
        initAtomicAccess<VecElemU32>(gpuDynInst);
    } // initiateAcc

    void
    Inst_FLAT__FLAT_ATOMIC_INC::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Nothing is written back unless isAtomicRet() holds.
        if (!isAtomicRet()) {
            return;
        }

        VecOperandU32 vdst(gpuDynInst, extData.VDST);
        const VecElemU32 *retData =
            reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        // Copy d_data into VDST for the lanes enabled in exec_mask.
        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!gpuDynInst->exec_mask[laneId]) {
                continue;
            }
            vdst[laneId] = retData[laneId];
        }

        vdst.write();
    } // completeAcc

    Inst_FLAT__FLAT_ATOMIC_DEC::Inst_FLAT__FLAT_ATOMIC_DEC(InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_dec")
    {
        // Operation class for this opcode.
        setFlag(AtomicDec);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_DEC

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_DEC::~Inst_FLAT__FLAT_ATOMIC_DEC()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_DEC

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1
    // (unsigned compare); RETURN_DATA = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // No lane is active: decrement the issued-instruction and in-pipe
        // request counters and return without issuing a memory request.
        if (wave->execMask().none()) {
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            --wave->wrGmReqsInPipe;
            --wave->rdGmReqsInPipe;
            --wave->wrLmReqsInPipe;
            --wave->rdLmReqsInPipe;
            if (instData.GLC) {
                // With GLC set, scheduleWriteOperandsFromLoad is still
                // called on the vrf entry for this wavefront's simdId.
                gpuDynInst->exec_mask = wave->execMask();
                wave->computeUnit->vrf[wave->simdId]->
                    scheduleWriteOperandsFromLoad(wave, gpuDynInst);
            }
            return;
        }

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        // 64-bit address operand and 32-bit data operand.
        ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
        ConstVecOperandU32 data(gpuDynInst, extData.DATA);

        addr.read();
        data.read();

        calcAddr(gpuDynInst, addr);

        // Copy the data operand of every active lane into a_data.
        VecElemU32 *atomicData =
            reinterpret_cast<VecElemU32*>(gpuDynInst->a_data);
        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!gpuDynInst->exec_mask[laneId]) {
                continue;
            }
            atomicData[laneId] = data[laneId];
        }

        // Only the SC_GLOBAL case is handled; anything else ends in fatal().
        if (gpuDynInst->executedAs() != Enums::SC_GLOBAL) {
            fatal("Non global flat instructions not implemented yet.\n");
        } else {
            // Issue to the global memory pipe, then move one read and one
            // write from the in-pipe counters to the outstanding counters.
            gpuDynInst->computeUnit()->globalMemoryPipe.
                issueRequest(gpuDynInst);
            --wave->wrGmReqsInPipe;
            ++wave->outstandingReqsWrGm;
            --wave->rdGmReqsInPipe;
            ++wave->outstandingReqsRdGm;
        }

        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    } // execute

    void
    Inst_FLAT__FLAT_ATOMIC_DEC::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Delegates to the initAtomicAccess helper, instantiated for
        // VecElemU32 elements.
        initAtomicAccess<VecElemU32>(gpuDynInst);
    } // initiateAcc

    void
    Inst_FLAT__FLAT_ATOMIC_DEC::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Nothing is written back unless isAtomicRet() holds.
        if (!isAtomicRet()) {
            return;
        }

        VecOperandU32 vdst(gpuDynInst, extData.VDST);
        const VecElemU32 *retData =
            reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

        // Copy d_data into VDST for the lanes enabled in exec_mask.
        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!gpuDynInst->exec_mask[laneId]) {
                continue;
            }
            vdst[laneId] = retData[laneId];
        }

        vdst.write();
    } // completeAcc

    Inst_FLAT__FLAT_ATOMIC_SWAP_X2::Inst_FLAT__FLAT_ATOMIC_SWAP_X2(
          InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_swap_x2")
    {
        // Operation class for this opcode.
        setFlag(AtomicExch);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_SWAP_X2

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_SWAP_X2::~Inst_FLAT__FLAT_ATOMIC_SWAP_X2()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_SWAP_X2

    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_SWAP_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        // No functional model exists for this instruction yet; the only
        // action taken is the call to panicUnimplemented().
        panicUnimplemented();
    }

    Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2(
          InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_cmpswap_x2")
    {
        // Operation class for this opcode.
        setFlag(AtomicCAS);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::~Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2

    // tmp = MEM[ADDR];
    // src = DATA[0:1];
    // cmp = DATA[2:3];
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // No lane is active: decrement the issued-instruction and in-pipe
        // request counters and return without issuing a memory request.
        if (wave->execMask().none()) {
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            --wave->wrGmReqsInPipe;
            --wave->rdGmReqsInPipe;
            --wave->wrLmReqsInPipe;
            --wave->rdLmReqsInPipe;
            if (instData.GLC) {
                // With GLC set, scheduleWriteOperandsFromLoad is still
                // called on the vrf entry for this wavefront's simdId.
                gpuDynInst->exec_mask = wave->execMask();
                wave->computeUnit->vrf[wave->simdId]->
                    scheduleWriteOperandsFromLoad(wave, gpuDynInst);
            }
            return;
        }

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        // Address operand, swap value (DATA) and compare value (DATA + 2).
        ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
        ConstVecOperandU64 data(gpuDynInst, extData.DATA);
        ConstVecOperandU64 cmp(gpuDynInst, extData.DATA + 2);

        addr.read();
        data.read();
        cmp.read();

        calcAddr(gpuDynInst, addr);

        // For every active lane the swap value is stored in x_data and the
        // compare value in a_data.
        VecElemU64 *swapData =
            reinterpret_cast<VecElemU64*>(gpuDynInst->x_data);
        VecElemU64 *cmpData =
            reinterpret_cast<VecElemU64*>(gpuDynInst->a_data);
        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!gpuDynInst->exec_mask[laneId]) {
                continue;
            }
            swapData[laneId] = data[laneId];
            cmpData[laneId] = cmp[laneId];
        }

        if (gpuDynInst->executedAs() != Enums::SC_GLOBAL &&
            gpuDynInst->executedAs() != Enums::SC_PRIVATE) {
            fatal("Non global flat instructions not implemented yet.\n");
        } else {
            /**
             * TODO: The private-aperture path has never been tested, so
             * the panic below exists to warn the user. All the logic it
             * needs is implemented; if the panic fires, delete it and
             * restart the simulation, which should then work.
             */
            panic_if(gpuDynInst->executedAs() == Enums::SC_PRIVATE,
                     "Flats to private aperture not tested yet\n");
            // Issue to the global memory pipe, then move one read and one
            // write from the in-pipe counters to the outstanding counters.
            gpuDynInst->computeUnit()->globalMemoryPipe.
                issueRequest(gpuDynInst);
            --wave->wrGmReqsInPipe;
            ++wave->outstandingReqsWrGm;
            --wave->rdGmReqsInPipe;
            ++wave->outstandingReqsRdGm;
        }

        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    } // execute

    void
    Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Delegates to the initAtomicAccess helper, instantiated for
        // VecElemU64 elements.
        initAtomicAccess<VecElemU64>(gpuDynInst);
    } // initiateAcc

    void
    Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Nothing is written back unless isAtomicRet() holds.
        if (!isAtomicRet()) {
            return;
        }

        VecOperandU64 vdst(gpuDynInst, extData.VDST);
        const VecElemU64 *retData =
            reinterpret_cast<VecElemU64*>(gpuDynInst->d_data);

        // Copy d_data into VDST for the lanes enabled in exec_mask.
        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!gpuDynInst->exec_mask[laneId]) {
                continue;
            }
            vdst[laneId] = retData[laneId];
        }

        vdst.write();
    } // completeAcc

    Inst_FLAT__FLAT_ATOMIC_ADD_X2::Inst_FLAT__FLAT_ATOMIC_ADD_X2(
          InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_add_x2")
    {
        // Operation class for this opcode.
        setFlag(AtomicAdd);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_ADD_X2

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_ADD_X2::~Inst_FLAT__FLAT_ATOMIC_ADD_X2()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_ADD_X2

    // tmp = MEM[ADDR];
    // MEM[ADDR] += DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_ADD_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // No lane is active: decrement the issued-instruction and in-pipe
        // request counters and return without issuing a memory request.
        if (wave->execMask().none()) {
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            --wave->wrGmReqsInPipe;
            --wave->rdGmReqsInPipe;
            --wave->wrLmReqsInPipe;
            --wave->rdLmReqsInPipe;
            if (instData.GLC) {
                // With GLC set, scheduleWriteOperandsFromLoad is still
                // called on the vrf entry for this wavefront's simdId.
                gpuDynInst->exec_mask = wave->execMask();
                wave->computeUnit->vrf[wave->simdId]->
                    scheduleWriteOperandsFromLoad(wave, gpuDynInst);
            }
            return;
        }

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        // 64-bit address operand and 64-bit data operand.
        ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
        ConstVecOperandU64 data(gpuDynInst, extData.DATA);

        addr.read();
        data.read();

        calcAddr(gpuDynInst, addr);

        // Copy the data operand of every active lane into a_data.
        VecElemU64 *atomicData =
            reinterpret_cast<VecElemU64*>(gpuDynInst->a_data);
        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!gpuDynInst->exec_mask[laneId]) {
                continue;
            }
            atomicData[laneId] = data[laneId];
        }

        // Only the SC_GLOBAL case is handled; anything else ends in fatal().
        if (gpuDynInst->executedAs() != Enums::SC_GLOBAL) {
            fatal("Non global flat instructions not implemented yet.\n");
        } else {
            // Issue to the global memory pipe, then move one read and one
            // write from the in-pipe counters to the outstanding counters.
            gpuDynInst->computeUnit()->globalMemoryPipe.
                issueRequest(gpuDynInst);
            --wave->wrGmReqsInPipe;
            ++wave->outstandingReqsWrGm;
            --wave->rdGmReqsInPipe;
            ++wave->outstandingReqsRdGm;
        }

        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    } // execute

    void
    Inst_FLAT__FLAT_ATOMIC_ADD_X2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Delegates to the initAtomicAccess helper, instantiated for
        // VecElemU64 elements.
        initAtomicAccess<VecElemU64>(gpuDynInst);
    } // initiateAcc

    void
    Inst_FLAT__FLAT_ATOMIC_ADD_X2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Nothing is written back unless isAtomicRet() holds.
        if (!isAtomicRet()) {
            return;
        }

        VecOperandU64 vdst(gpuDynInst, extData.VDST);
        const VecElemU64 *retData =
            reinterpret_cast<VecElemU64*>(gpuDynInst->d_data);

        // Copy d_data into VDST for the lanes enabled in exec_mask.
        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!gpuDynInst->exec_mask[laneId]) {
                continue;
            }
            vdst[laneId] = retData[laneId];
        }

        vdst.write();
    } // completeAcc

    Inst_FLAT__FLAT_ATOMIC_SUB_X2::Inst_FLAT__FLAT_ATOMIC_SUB_X2(
          InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_sub_x2")
    {
        // Operation class for this opcode.
        setFlag(AtomicSub);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_SUB_X2

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_SUB_X2::~Inst_FLAT__FLAT_ATOMIC_SUB_X2()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_SUB_X2

    // tmp = MEM[ADDR];
    // MEM[ADDR] -= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // No lane is active: decrement the issued-instruction and in-pipe
        // request counters and return without issuing a memory request.
        if (wave->execMask().none()) {
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            --wave->wrGmReqsInPipe;
            --wave->rdGmReqsInPipe;
            --wave->wrLmReqsInPipe;
            --wave->rdLmReqsInPipe;
            if (instData.GLC) {
                // With GLC set, scheduleWriteOperandsFromLoad is still
                // called on the vrf entry for this wavefront's simdId.
                gpuDynInst->exec_mask = wave->execMask();
                wave->computeUnit->vrf[wave->simdId]->
                    scheduleWriteOperandsFromLoad(wave, gpuDynInst);
            }
            return;
        }

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        // 64-bit address operand and 64-bit data operand.
        ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
        ConstVecOperandU64 data(gpuDynInst, extData.DATA);

        addr.read();
        data.read();

        calcAddr(gpuDynInst, addr);

        // Copy the data operand of every active lane into a_data.
        VecElemU64 *atomicData =
            reinterpret_cast<VecElemU64*>(gpuDynInst->a_data);
        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!gpuDynInst->exec_mask[laneId]) {
                continue;
            }
            atomicData[laneId] = data[laneId];
        }

        // Only the SC_GLOBAL case is handled; anything else ends in fatal().
        if (gpuDynInst->executedAs() != Enums::SC_GLOBAL) {
            fatal("Non global flat instructions not implemented yet.\n");
        } else {
            // Issue to the global memory pipe, then move one read and one
            // write from the in-pipe counters to the outstanding counters.
            gpuDynInst->computeUnit()->globalMemoryPipe.
                issueRequest(gpuDynInst);
            --wave->wrGmReqsInPipe;
            ++wave->outstandingReqsWrGm;
            --wave->rdGmReqsInPipe;
            ++wave->outstandingReqsRdGm;
        }

        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    } // execute

    void
    Inst_FLAT__FLAT_ATOMIC_SUB_X2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Delegates to the initAtomicAccess helper, instantiated for
        // VecElemU64 elements.
        initAtomicAccess<VecElemU64>(gpuDynInst);
    } // initiateAcc

    void
    Inst_FLAT__FLAT_ATOMIC_SUB_X2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Nothing is written back unless isAtomicRet() holds.
        if (!isAtomicRet()) {
            return;
        }

        VecOperandU64 vdst(gpuDynInst, extData.VDST);
        const VecElemU64 *retData =
            reinterpret_cast<VecElemU64*>(gpuDynInst->d_data);

        // Copy d_data into VDST for the lanes enabled in exec_mask.
        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!gpuDynInst->exec_mask[laneId]) {
                continue;
            }
            vdst[laneId] = retData[laneId];
        }

        vdst.write();
    } // completeAcc

    Inst_FLAT__FLAT_ATOMIC_SMIN_X2::Inst_FLAT__FLAT_ATOMIC_SMIN_X2(
          InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_smin_x2")
    {
        // Operation class for this opcode.
        setFlag(AtomicMin);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_SMIN_X2

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_SMIN_X2::~Inst_FLAT__FLAT_ATOMIC_SMIN_X2()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_SMIN_X2

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_SMIN_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        // No functional model exists for this instruction yet; the only
        // action taken is the call to panicUnimplemented().
        panicUnimplemented();
    }

    Inst_FLAT__FLAT_ATOMIC_UMIN_X2::Inst_FLAT__FLAT_ATOMIC_UMIN_X2(
          InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_umin_x2")
    {
        // Operation class for this opcode.
        setFlag(AtomicMin);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_UMIN_X2

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_UMIN_X2::~Inst_FLAT__FLAT_ATOMIC_UMIN_X2()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_UMIN_X2

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_UMIN_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        // No functional model exists for this instruction yet; the only
        // action taken is the call to panicUnimplemented().
        panicUnimplemented();
    }

    Inst_FLAT__FLAT_ATOMIC_SMAX_X2::Inst_FLAT__FLAT_ATOMIC_SMAX_X2(
          InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_smax_x2")
    {
        // Operation class for this opcode.
        setFlag(AtomicMax);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_SMAX_X2

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_SMAX_X2::~Inst_FLAT__FLAT_ATOMIC_SMAX_X2()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_SMAX_X2

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_SMAX_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        // No functional model exists for this instruction yet; the only
        // action taken is the call to panicUnimplemented().
        panicUnimplemented();
    }

    Inst_FLAT__FLAT_ATOMIC_UMAX_X2::Inst_FLAT__FLAT_ATOMIC_UMAX_X2(
          InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_umax_x2")
    {
        // Operation class for this opcode.
        setFlag(AtomicMax);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_UMAX_X2

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_UMAX_X2::~Inst_FLAT__FLAT_ATOMIC_UMAX_X2()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_UMAX_X2

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_UMAX_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        // No functional model exists for this instruction yet; the only
        // action taken is the call to panicUnimplemented().
        panicUnimplemented();
    }

    Inst_FLAT__FLAT_ATOMIC_AND_X2::Inst_FLAT__FLAT_ATOMIC_AND_X2(
          InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_and_x2")
    {
        // Operation class for this opcode.
        setFlag(AtomicAnd);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_AND_X2

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_AND_X2::~Inst_FLAT__FLAT_ATOMIC_AND_X2()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_AND_X2

    // tmp = MEM[ADDR];
    // MEM[ADDR] &= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_AND_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        // No functional model exists for this instruction yet; the only
        // action taken is the call to panicUnimplemented().
        panicUnimplemented();
    }

    Inst_FLAT__FLAT_ATOMIC_OR_X2::Inst_FLAT__FLAT_ATOMIC_OR_X2(
          InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_or_x2")
    {
        // Operation class for this opcode.
        setFlag(AtomicOr);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_OR_X2

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_OR_X2::~Inst_FLAT__FLAT_ATOMIC_OR_X2()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_OR_X2

    // tmp = MEM[ADDR];
    // MEM[ADDR] |= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_OR_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        // No functional model exists for this instruction yet; the only
        // action taken is the call to panicUnimplemented().
        panicUnimplemented();
    }

    Inst_FLAT__FLAT_ATOMIC_XOR_X2::Inst_FLAT__FLAT_ATOMIC_XOR_X2(
          InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_xor_x2")
    {
        // Operation class for this opcode.
        setFlag(AtomicXor);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_XOR_X2

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_XOR_X2::~Inst_FLAT__FLAT_ATOMIC_XOR_X2()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_XOR_X2

    // tmp = MEM[ADDR];
    // MEM[ADDR] ^= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_XOR_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        // No functional model exists for this instruction yet; the only
        // action taken is the call to panicUnimplemented().
        panicUnimplemented();
    }

    Inst_FLAT__FLAT_ATOMIC_INC_X2::Inst_FLAT__FLAT_ATOMIC_INC_X2(
          InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_inc_x2")
    {
        // Operation class for this opcode.
        setFlag(AtomicInc);
        // GLC chooses between the AtomicReturn and AtomicNoReturn flags.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        // Also mark the instruction with the MemoryRef flag.
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_INC_X2

    // Destructor: the body is empty, so no explicit cleanup is done here.
    Inst_FLAT__FLAT_ATOMIC_INC_X2::~Inst_FLAT__FLAT_ATOMIC_INC_X2()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_INC_X2

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_FLAT__FLAT_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // No lane is active: decrement the issued-instruction and in-pipe
        // request counters and return without issuing a memory request.
        if (wave->execMask().none()) {
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            --wave->wrGmReqsInPipe;
            --wave->rdGmReqsInPipe;
            --wave->wrLmReqsInPipe;
            --wave->rdLmReqsInPipe;
            if (instData.GLC) {
                // With GLC set, scheduleWriteOperandsFromLoad is still
                // called on the vrf entry for this wavefront's simdId.
                gpuDynInst->exec_mask = wave->execMask();
                wave->computeUnit->vrf[wave->simdId]->
                    scheduleWriteOperandsFromLoad(wave, gpuDynInst);
            }
            return;
        }

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        // 64-bit address operand and 64-bit data operand.
        ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
        ConstVecOperandU64 data(gpuDynInst, extData.DATA);

        addr.read();
        data.read();

        calcAddr(gpuDynInst, addr);

        // Copy the data operand of every active lane into a_data.
        VecElemU64 *atomicData =
            reinterpret_cast<VecElemU64*>(gpuDynInst->a_data);
        for (int laneId = 0; laneId < NumVecElemPerVecReg; ++laneId) {
            if (!gpuDynInst->exec_mask[laneId]) {
                continue;
            }
            atomicData[laneId] = data[laneId];
        }

        // Only the SC_GLOBAL case is handled; anything else ends in fatal().
        if (gpuDynInst->executedAs() != Enums::SC_GLOBAL) {
            fatal("Non global flat instructions not implemented yet.\n");
        } else {
            // Issue to the global memory pipe, then move one read and one
            // write from the in-pipe counters to the outstanding counters.
            gpuDynInst->computeUnit()->globalMemoryPipe.
                issueRequest(gpuDynInst);
            --wave->wrGmReqsInPipe;
            ++wave->outstandingReqsWrGm;
            --wave->rdGmReqsInPipe;
            ++wave->outstandingReqsRdGm;
        }

        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    } // execute

    // Starts the memory access for this instruction by delegating to
    // initAtomicAccess, instantiated for 64-bit elements (VecElemU64).
    void
    Inst_FLAT__FLAT_ATOMIC_INC_X2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initAtomicAccess<VecElemU64>(gpuDynInst);
    } // initiateAcc

    // Finishes the access. For the returning form of the atomic
    // (isAtomicRet()), the 64-bit values found in d_data are copied into
    // VDST for every lane enabled in the instruction's exec_mask and the
    // destination operand is written back. The non-returning form does
    // nothing here.
    void
    Inst_FLAT__FLAT_ATOMIC_INC_X2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        if (!isAtomicRet()) {
            return;
        }

        VecOperandU64 vdst(gpuDynInst, extData.VDST);

        auto *returned = reinterpret_cast<VecElemU64*>(gpuDynInst->d_data);
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            vdst[lane] = returned[lane];
        }

        vdst.write();
    } // completeAcc

    // Builds the "flat_atomic_dec_x2" instruction and sets its static flags:
    // AtomicDec, then AtomicReturn or AtomicNoReturn depending on the GLC
    // bit of the encoding, then MemoryRef.
    Inst_FLAT__FLAT_ATOMIC_DEC_X2::Inst_FLAT__FLAT_ATOMIC_DEC_X2(
        InFmt_FLAT *iFmt)
        : Inst_FLAT(iFmt, "flat_atomic_dec_x2")
    {
        setFlag(AtomicDec);
        // GLC set selects the returning form of the atomic.
        setFlag(instData.GLC ? AtomicReturn : AtomicNoReturn);
        setFlag(MemoryRef);
    } // Inst_FLAT__FLAT_ATOMIC_DEC_X2

    // Destructor: the body is intentionally empty; nothing is released here.
    Inst_FLAT__FLAT_ATOMIC_DEC_X2::~Inst_FLAT__FLAT_ATOMIC_DEC_X2()
    {
    } // ~Inst_FLAT__FLAT_ATOMIC_DEC_X2

    // Architectural semantics of the instruction:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1
    // (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    //
    // execute() reads the ADDR and DATA vector operands, computes the
    // per-lane addresses, copies the DATA operand of every active lane into
    // the instruction's a_data buffer and hands the instruction to the
    // global memory pipeline. Only accesses that execute as SC_GLOBAL are
    // supported; anything else ends in fatal().
    void
    Inst_FLAT__FLAT_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wave = gpuDynInst->wavefront();

        // No lane is enabled: decrement the issued-instruction and in-pipe
        // request counters and leave without issuing a memory request.
        if (wave->execMask().none()) {
            wave->decVMemInstsIssued();
            wave->decLGKMInstsIssued();
            --wave->wrGmReqsInPipe;
            --wave->rdGmReqsInPipe;
            --wave->wrLmReqsInPipe;
            --wave->rdLmReqsInPipe;
            if (instData.GLC) {
                // Returning form: still schedule the register-file write
                // for the (empty) execution mask.
                gpuDynInst->exec_mask = wave->execMask();
                wave->computeUnit->vrf[wave->simdId]->
                    scheduleWriteOperandsFromLoad(wave, gpuDynInst);
            }
            return;
        }

        auto *cu = gpuDynInst->computeUnit();

        gpuDynInst->execUnitId = wave->execUnitId;
        gpuDynInst->exec_mask = wave->execMask();
        gpuDynInst->latency.init(cu);
        gpuDynInst->latency.set(cu->clockPeriod());

        ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
        ConstVecOperandU64 data(gpuDynInst, extData.DATA);

        addr.read();
        data.read();

        calcAddr(gpuDynInst, addr);

        // Stage the 64-bit atomic operand of each active lane in a_data.
        auto *atomicOperands =
            reinterpret_cast<VecElemU64*>(gpuDynInst->a_data);
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            atomicOperands[lane] = data[lane];
        }

        if (gpuDynInst->executedAs() != Enums::SC_GLOBAL) {
            fatal("Non global flat instructions not implemented yet.\n");
        } else {
            cu->globalMemoryPipe.issueRequest(gpuDynInst);
            // The request moves from "in pipe" to "outstanding" for both
            // the write and the read global-memory counters.
            --wave->wrGmReqsInPipe;
            ++wave->outstandingReqsWrGm;
            --wave->rdGmReqsInPipe;
            ++wave->outstandingReqsRdGm;
        }

        ++wave->outstandingReqs;
        wave->validateRequestCounters();
    }

    // Starts the memory access for this instruction by delegating to
    // initAtomicAccess, instantiated for 64-bit elements (VecElemU64).
    void
    Inst_FLAT__FLAT_ATOMIC_DEC_X2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initAtomicAccess<VecElemU64>(gpuDynInst);
    } // initiateAcc

    // Finishes the access. For the returning form of the atomic
    // (isAtomicRet()), the 64-bit values found in d_data are copied into
    // VDST for every lane enabled in the instruction's exec_mask and the
    // destination operand is written back. The non-returning form does
    // nothing here.
    void
    Inst_FLAT__FLAT_ATOMIC_DEC_X2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        if (!isAtomicRet()) {
            return;
        }

        VecOperandU64 vdst(gpuDynInst, extData.VDST);

        auto *returned = reinterpret_cast<VecElemU64*>(gpuDynInst->d_data);
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (!gpuDynInst->exec_mask[lane]) {
                continue;
            }
            vdst[lane] = returned[lane];
        }

        vdst.write();
    } // completeAcc
} // namespace Gcn3ISA
